In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
# NOTE(review): this suppresses every warning for the rest of the session, so
# deprecation notices raised by the libraries used below will not be displayed.
warnings.filterwarnings("ignore")

In [2]:
# Directory that holds the dataset, given relative to the current working directory.
data_dir = './telco-customer-churn'

In [3]:
# Path of the raw churn CSV inside `data_dir` (a string, not a DataFrame).
# The separator is a plain '/': prefixing the file name with './' would glue a
# dot onto the directory name ('./telco-customer-churn./WA_...'), which fails
# on file systems that do not ignore a trailing dot in a directory name.
train_df = data_dir + '/WA_Fn-UseC_-Telco-Customer-Churn.csv'

In [4]:
# Load the CSV into a DataFrame; `train_df` holds the file path built above.
df = pd.read_csv(train_df)

In [5]:
# Clean TotalCharges: single-space strings become NaN, the column is cast to
# float, and the resulting gaps are filled with 0.
# The whole chain is assigned back in one step. Calling
# fillna(..., inplace=True) on a selected column is chained assignment: it is
# deprecated and does not update `df` when pandas Copy-on-Write is active,
# which would leave NaN values in the column.
df['TotalCharges'] = (
    df['TotalCharges']
    .replace(' ', np.nan)
    .astype(float)
    .fillna(0)
)

In [6]:
# Remove the customerID column from the working frame.
df = df.drop(columns=['customerID'])

In [7]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression

In [8]:
# Print the distinct values of every column that has fewer than five of them.
# NOTE(review): the label is printed with a trailing apostrophe (gender' : ...)
# because the f-string contains a closing quote without an opening one.
for col in df:
    unique_values = df[col].unique()
    if len(unique_values) >= 5:
        continue  # too many distinct values to be worth listing
    print(f"{col}' : {unique_values}")

gender' : ['Female' 'Male']
SeniorCitizen' : [0 1]
Partner' : ['Yes' 'No']
Dependents' : ['No' 'Yes']
PhoneService' : ['No' 'Yes']
MultipleLines' : ['No phone service' 'No' 'Yes']
InternetService' : ['DSL' 'Fiber optic' 'No']
OnlineSecurity' : ['No' 'Yes' 'No internet service']
OnlineBackup' : ['Yes' 'No' 'No internet service']
DeviceProtection' : ['No' 'Yes' 'No internet service']
TechSupport' : ['No' 'Yes' 'No internet service']
StreamingTV' : ['No' 'Yes' 'No internet service']
StreamingMovies' : ['No' 'Yes' 'No internet service']
Contract' : ['Month-to-month' 'One year' 'Two year']
PaperlessBilling' : ['Yes' 'No']
PaymentMethod' : ['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']
Churn' : ['No' 'Yes']


In [9]:
# Reduce the three-level service columns to 0/1 flags: only 'Yes' maps to 1,
# both 'No' and the "no service" label map to 0. Values absent from a table
# come out of .map() as NaN.
multiple_lines_codes = {'No phone service': 0, 'No': 0, 'Yes': 1}
internet_addon_codes = {'No internet service': 0, 'No': 0, 'Yes': 1}

internet_cols = [
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
]

df['MultipleLines'] = df['MultipleLines'].map(multiple_lines_codes)
df = df.assign(**{name: df[name].map(internet_addon_codes) for name in internet_cols})

# One-hot encode the remaining multi-level columns; drop_first omits the first
# level of each so it acts as the reference category.
df = pd.get_dummies(
    df,
    columns=['InternetService', 'Contract', 'PaymentMethod'],
    drop_first=True,
)

In [10]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,...,MonthlyCharges,TotalCharges,Churn,InternetService_Fiber optic,InternetService_No,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,Female,0,Yes,No,1,No,0,0,1,0,...,29.85,29.85,No,False,False,False,False,False,True,False
1,Male,0,No,No,34,Yes,0,1,0,1,...,56.95,1889.5,No,False,False,True,False,False,False,True
2,Male,0,No,No,2,Yes,0,1,1,0,...,53.85,108.15,Yes,False,False,False,False,False,False,True
3,Male,0,No,No,45,No,0,1,0,1,...,42.3,1840.75,No,False,False,True,False,False,False,False
4,Female,0,No,No,2,Yes,0,0,0,0,...,70.7,151.65,Yes,True,False,False,False,False,True,False


In [11]:
# List the distinct values of each column with fewer than five of them.
# NOTE(review): the f-string prints a stray apostrophe after the column name.
for col in df:
    unique_values = df[col].unique()
    if len(unique_values) >= 5:
        continue  # skip high-cardinality columns
    print(f"{col}' : {unique_values}")

gender' : ['Female' 'Male']
SeniorCitizen' : [0 1]
Partner' : ['Yes' 'No']
Dependents' : ['No' 'Yes']
PhoneService' : ['No' 'Yes']
MultipleLines' : [0 1]
OnlineSecurity' : [0 1]
OnlineBackup' : [1 0]
DeviceProtection' : [0 1]
TechSupport' : [0 1]
StreamingTV' : [0 1]
StreamingMovies' : [0 1]
PaperlessBilling' : ['Yes' 'No']
Churn' : ['No' 'Yes']
InternetService_Fiber optic' : [False  True]
InternetService_No' : [False  True]
Contract_One year' : [False  True]
Contract_Two year' : [False  True]
PaymentMethod_Credit card (automatic)' : [False  True]
PaymentMethod_Electronic check' : [ True False]
PaymentMethod_Mailed check' : [False  True]


In [12]:
# Show the dtype of every column.
df.dtypes

gender                                    object
SeniorCitizen                              int64
Partner                                   object
Dependents                                object
tenure                                     int64
PhoneService                              object
MultipleLines                              int64
OnlineSecurity                             int64
OnlineBackup                               int64
DeviceProtection                           int64
TechSupport                                int64
StreamingTV                                int64
StreamingMovies                            int64
PaperlessBilling                          object
MonthlyCharges                           float64
TotalCharges                             float64
Churn                                     object
InternetService_Fiber optic                 bool
InternetService_No                          bool
Contract_One year                           bool
Contract_Two year   

In [13]:
# Encoder instance for turning category labels into integer codes.
le = LabelEncoder()

In [14]:
# Names of all columns whose dtype is object or bool, as a plain list.
categorical_cols = list(df.select_dtypes(include=['object', 'bool']).columns)

In [15]:
# Print the distinct values of each selected column that has fewer than five.
# NOTE(review): the f-string prints a stray apostrophe after the column name.
for col in categorical_cols:
    unique_values = df[col].unique()
    if len(unique_values) >= 5:
        continue  # skip high-cardinality columns
    print(f"{col}' : {unique_values}")

gender' : ['Female' 'Male']
Partner' : ['Yes' 'No']
Dependents' : ['No' 'Yes']
PhoneService' : ['No' 'Yes']
PaperlessBilling' : ['Yes' 'No']
Churn' : ['No' 'Yes']
InternetService_Fiber optic' : [False  True]
InternetService_No' : [False  True]
Contract_One year' : [False  True]
Contract_Two year' : [False  True]
PaymentMethod_Credit card (automatic)' : [False  True]
PaymentMethod_Electronic check' : [ True False]
PaymentMethod_Mailed check' : [False  True]


In [16]:
# Replace each selected column with its integer label codes.
# The single encoder is re-fitted on every column in turn, so afterwards `le`
# only holds the classes of the last column processed.
df[categorical_cols] = df[categorical_cols].apply(le.fit_transform)

In [17]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,...,MonthlyCharges,TotalCharges,Churn,InternetService_Fiber optic,InternetService_No,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,0,1,0,1,0,0,0,1,0,...,29.85,29.85,0,0,0,0,0,0,1,0
1,1,0,0,0,34,1,0,1,0,1,...,56.95,1889.5,0,0,0,1,0,0,0,1
2,1,0,0,0,2,1,0,1,1,0,...,53.85,108.15,1,0,0,0,0,0,0,1
3,1,0,0,0,45,0,0,1,0,1,...,42.3,1840.75,0,0,0,1,0,0,0,0
4,0,0,0,0,2,1,0,0,0,0,...,70.7,151.65,1,1,0,0,0,0,1,0


In [18]:
# Print the distinct values of each selected column that has fewer than five.
# NOTE(review): the f-string prints a stray apostrophe after the column name.
for col in categorical_cols:
    unique_values = df[col].unique()
    if len(unique_values) >= 5:
        continue  # skip high-cardinality columns
    print(f"{col}' : {unique_values}")

gender' : [0 1]
Partner' : [1 0]
Dependents' : [0 1]
PhoneService' : [0 1]
PaperlessBilling' : [1 0]
Churn' : [0 1]
InternetService_Fiber optic' : [0 1]
InternetService_No' : [0 1]
Contract_One year' : [0 1]
Contract_Two year' : [0 1]
PaymentMethod_Credit card (automatic)' : [0 1]
PaymentMethod_Electronic check' : [1 0]
PaymentMethod_Mailed check' : [0 1]


In [19]:
# Treat a column as continuous when it has more than five distinct values
# (selection is by cardinality, not by dtype).
distinct_counts = df.nunique()
numeric_cols = distinct_counts[distinct_counts > 5].index
print(numeric_cols)

Index(['tenure', 'MonthlyCharges', 'TotalCharges'], dtype='object')


In [20]:
# Scaler instance, created with default settings.
scaler = MinMaxScaler()

In [21]:
# Fit the scaler on the selected columns of the whole frame.
# NOTE(review): `df` has not been split into train/validation/test at this
# point, so the fitted statistics also reflect rows that later end up in the
# validation and test sets. Consider fitting on the training rows only.
scaler.fit(df[numeric_cols])

In [22]:
# Overwrite the selected columns with their scaled values.
df[numeric_cols] = scaler.transform(df[numeric_cols])

In [23]:
# Display the frame after scaling.
df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,...,MonthlyCharges,TotalCharges,Churn,InternetService_Fiber optic,InternetService_No,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,0,1,0,0.013889,0,0,0,1,0,...,0.115423,0.003437,0,0,0,0,0,0,1,0
1,1,0,0,0,0.472222,1,0,1,0,1,...,0.385075,0.217564,0,0,0,1,0,0,0,1
2,1,0,0,0,0.027778,1,0,1,1,0,...,0.354229,0.012453,1,0,0,0,0,0,0,1
3,1,0,0,0,0.625000,0,0,1,0,1,...,0.239303,0.211951,0,0,0,1,0,0,0,0
4,0,0,0,0,0.027778,1,0,0,0,0,...,0.521891,0.017462,1,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,1,0,1,1,0.333333,1,1,1,0,1,...,0.662189,0.229194,0,0,0,1,0,0,0,1
7039,0,0,1,1,1.000000,1,1,0,1,1,...,0.845274,0.847792,0,1,0,1,0,1,0,0
7040,0,0,1,1,0.152778,0,0,1,0,0,...,0.112935,0.039892,0,0,0,0,0,0,1,0
7041,1,1,1,0,0.055556,1,1,0,0,0,...,0.558706,0.035303,1,1,0,0,0,0,0,1


In [24]:
# Hold out 20% of the rows for testing, then take 25% of the remaining 80%
# (i.e. 20% of all rows) for validation, leaving 60% for training.
# Both splits use a fixed random_state so they are repeatable.
# NOTE(review): no `stratify` argument is passed, so the class balance of the
# target is not guaranteed to match across the three subsets.
train_val, test = train_test_split(df, test_size = 0.2, random_state = 42)
train, val = train_test_split(train_val, test_size = 0.25, random_state = 42)

In [25]:
# `y` is the target column name; `x` is every other column of the frame.
y = 'Churn'
x = df.columns[df.columns != y]

In [26]:
# Feature matrix and target vector for the training subset.
X_train, y_train = train[x], train[y]

In [27]:
# Feature matrix and target vector for the validation subset.
X_val, y_val = val[x], val[y]

In [28]:
# Feature matrix and target vector for the test subset.
X_test, y_test = test[x], test[y]

In [29]:
!pip install pyarrow --quiet

In [30]:
# Write each feature frame to its own parquet file in the working directory.
for frame, path in (
    (X_train, 'X_train.parquet'),
    (X_val, 'X_val.parquet'),
    (X_test, 'X_test.parquet'),
):
    frame.to_parquet(path)

In [31]:
%%time
pd.DataFrame(y_train).to_parquet('y_train.parquet')
pd.DataFrame(y_val).to_parquet('y_val.parquet')
pd.DataFrame(y_test).to_parquet('y_test.parquet')

CPU times: total: 0 ns
Wall time: 17.2 ms
