In [1]:
import pandas as pd
import numpy as np
import pydataset
import sklearn.preprocessing
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import wrangle

In [2]:
# Load the dataset through the project-local wrangle module.
# NOTE(review): wrangle_telco() is defined outside this notebook; the preview
# below shows customer_id, tenure, monthly_charges and total_charges columns.
df = wrangle.wrangle_telco()

In [3]:
#take a look at the data to make sure it came in ok
df.head()

Unnamed: 0,customer_id,tenure,monthly_charges,total_charges
0,0013-SMEOE,71,109.7,7904.25
1,0014-BMAQU,63,84.65,5377.8
2,0016-QLJIS,65,90.45,5957.9
3,0017-DINOC,54,45.2,2460.55
4,0017-IUDMW,72,116.8,8456.75


In [4]:
df.describe()

Unnamed: 0,tenure,monthly_charges,total_charges
count,1685.0,1685.0,1685.0
mean,57.07181,60.872374,3728.933947
std,17.72913,34.71221,2571.252806
min,1.0,18.4,20.35
25%,48.0,24.05,1278.8
50%,64.0,64.45,3623.95
75%,71.0,90.55,5999.85
max,72.0,118.75,8672.45


- Data looks good; moving on to scaling.

# Our scenario continues:

As a customer analyst, I want to know who has spent the most money with us over their lifetime. I have monthly charges and tenure, so I think I will be able to use those two attributes as features to estimate total_charges. I need to do this within an average of $5.00 per customer.

Create split_scale.py that will contain the functions that follow. Each scaler function should create the object, fit and transform both train and test. They should return the scaler, train dataframe scaled, test dataframe scaled. Be sure your indices represent the original indices from train/test, as those represent the indices from the original dataframe. Be sure to set a random state where applicable for reproducibility!

# 1. split_my_data(X, y, train_pct)

In [5]:
# Features are tenure and monthly charges; the target is total charges.
# Both are kept as DataFrames (list selectors), not Series.
X = df.loc[:, ["tenure", "monthly_charges"]]
y = df.loc[:, ["total_charges"]]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [6]:
#X_train

In [7]:
#function to split any data
def split_my_data(X, y, train_pct, random_state=123):
    """Split features and target into train and test subsets.

    Thin wrapper around ``sklearn.model_selection.train_test_split``.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature data.
    y : array-like or DataFrame
        Target data, with the same number of rows as ``X``.
    train_pct : float or int
        Forwarded as ``train_size``; e.g. ``0.8`` keeps 80% of the rows for
        training.
    random_state : int, default 123
        Seed forwarded to ``train_test_split`` so the split is reproducible.
        The default matches the seed that was previously hard-coded.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test]``, in that order.
    """
    return train_test_split(X, y, train_size=train_pct, random_state=random_state)

In [8]:
#test my function
# X_train, X_test, y_train, y_test = split_my_data(X, y, .8)
# X_train

# 2. standard_scaler()

In [22]:
# Standard scaler created with its default settings; it is fitted in the next cell.
scaler = sklearn.preprocessing.StandardScaler()

In [23]:
# Fit on the training features only, then display the scaled training array.
scaler.fit(X_train).transform(X_train)

array([[ 0.72941235,  0.41960718],
       [-0.13057119, -1.16915758],
       [ 0.44275117,  1.38524228],
       ...,
       [-2.88251851, -1.18210947],
       [-1.27721591,  0.23684167],
       [-2.7105218 , -1.00222215]])

In [24]:
# transform() returns a bare array, so wrap it back into a DataFrame that reuses
# X_train's column names and index; rows then still line up with the original df.
X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns, index=X_train.index)

In [25]:
X_train_scaled

Unnamed: 0,tenure,monthly_charges
119,0.729412,0.419607
1424,-0.130571,-1.169158
385,0.442751,1.385242
1140,0.729412,1.075836
1504,0.786745,1.592472
...,...,...
1131,0.442751,-1.182109
1356,0.328087,-1.190744
1416,-2.882519,-1.182109
1399,-1.277216,0.236842


In [18]:
def standard_scaler(train, test):
    """Standard-scale train and test using statistics learned from train only.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames to scale; their ``columns`` and ``index`` are carried over to
        the scaled results.

    Returns
    -------
    tuple
        ``(scaler, train_scaled, test_scaled)`` where ``scaler`` is the fitted
        ``StandardScaler`` and the other two are new DataFrames.
    """
    # fit() returns the scaler itself, so construction and fitting can be chained.
    fitted = sklearn.preprocessing.StandardScaler().fit(train)

    def _scaled_frame(frame):
        # Rebuild a DataFrame around the scaled array, keeping the original labels.
        return pd.DataFrame(
            fitted.transform(frame),
            index=frame.index,
            columns=frame.columns,
        )

    return fitted, _scaled_frame(train), _scaled_frame(test)

In [21]:
standard_scaler(X_train, X_test)

(StandardScaler(copy=True, with_mean=True, with_std=True),
         tenure  monthly_charges
 119   0.729412         0.419607
 1424 -0.130571        -1.169158
 385   0.442751         1.385242
 1140  0.729412         1.075836
 1504  0.786745         1.592472
 ...        ...              ...
 1131  0.442751        -1.182109
 1356  0.328087        -1.190744
 1416 -2.882519        -1.182109
 1399 -1.277216         0.236842
 1544 -2.710522        -1.002222
 
 [1348 rows x 2 columns],
         tenure  monthly_charges
 305   0.844077        -1.163401
 452   0.557416         1.450002
 917   0.328087         1.389560
 1421  0.385419        -1.193622
 1557 -1.907870        -1.059786
 ...        ...              ...
 1642  0.844077         0.572152
 460  -1.965203        -1.031004
 1170  0.844077        -1.028126
 1083 -1.105219         0.577908
 1321 -1.277216        -1.172036
 
 [337 rows x 2 columns])

# 3. scale_inverse()

In [26]:
# Walk through undoing the scaling: inverse_transform maps the scaled training
# features back to their original values (displayed as an array, not a DataFrame).
scaler.inverse_transform(X_train_scaled)

array([[ 70.  ,  75.5 ],
       [ 55.  ,  20.3 ],
       [ 65.  , 109.05],
       ...,
       [  7.  ,  19.85],
       [ 35.  ,  69.15],
       [ 10.  ,  26.1 ]])

In [None]:
# Return the scaled training features to their original state, rebuilding a
# DataFrame with the same column names and index as the scaled frame.
train_unscaled = pd.DataFrame(
    scaler.inverse_transform(X_train_scaled),
    columns=X_train_scaled.columns,
    index=X_train_scaled.index,
)

# 4. uniform_scaler()

In [None]:
# Quantile-based scaler that maps each feature onto a uniform distribution.
# random_state is pinned so any internal subsampling is reproducible, as the
# exercise instructions above require.
sklearn.preprocessing.QuantileTransformer(output_distribution='uniform', random_state=123)