In [155]:
# import standard libraries
import numpy as np
# import third-party libraries
import pandas as pd
# import local libraries


# Load data

In [156]:
# read the customer table from the CSV file at the relative path below
customers_csv = '../data/customers.csv'
df = pd.read_csv(customers_csv)

# Data cleaning

In [157]:
# Parse TotalCharges as numbers (as done in Problem Set 1). Entries that cannot be
# parsed become NaN because of errors='coerce'; every row holding any NaN is then dropped.
total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')
df = df.assign(TotalCharges=total_charges).dropna(axis='index', how='any')

In [158]:
# summarize the cleaned frame: index range, non-null count and dtype of each column
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7032 non-null   object 
 1   gender            7032 non-null   object 
 2   SeniorCitizen     7032 non-null   int64  
 3   Partner           7032 non-null   object 
 4   Dependents        7032 non-null   object 
 5   tenure            7032 non-null   int64  
 6   PhoneService      7032 non-null   object 
 7   MultipleLines     7032 non-null   object 
 8   InternetService   7032 non-null   object 
 9   OnlineSecurity    7032 non-null   object 
 10  OnlineBackup      7032 non-null   object 
 11  DeviceProtection  7032 non-null   object 
 12  TechSupport       7032 non-null   object 
 13  StreamingTV       7032 non-null   object 
 14  StreamingMovies   7032 non-null   object 
 15  Contract          7032 non-null   object 
 16  PaperlessBilling  7032 non-null   object 


# Feature encoding

In [159]:
# customerID is an identifier rather than a model feature, so it becomes the row index
df = df.set_index(keys='customerID', drop=True)

In [160]:
# print the possible values of each categorical (object-dtype) feature
for col in df.columns:
    if df[col].dtype != 'object':
        continue  # numeric column, nothing to list
    distinct_values = df[col].unique().tolist()
    print(f"{col}: {', '.join(distinct_values)}")

gender: Female, Male
Partner: Yes, No
Dependents: No, Yes
PhoneService: No, Yes
MultipleLines: No phone service, No, Yes
InternetService: DSL, Fiber optic, No
OnlineSecurity: No, Yes, No internet service
OnlineBackup: Yes, No, No internet service
DeviceProtection: No, Yes, No internet service
TechSupport: No, Yes, No internet service
StreamingTV: No, Yes, No internet service
StreamingMovies: No, Yes, No internet service
Contract: Month-to-month, One year, Two year
PaperlessBilling: Yes, No
PaymentMethod: Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic)
Churn: No, Yes


In [161]:
# customers with no internet service
customers_nointernet = df[df['InternetService'] == 'No'].index

# For every feature that uses the 'No internet service' placeholder, check that the
# customers carrying the placeholder are exactly the customers with InternetService == 'No'.
for col in df:
    if 'No internet service' in df[col].unique().tolist():
        flagged = df[df[col] == 'No internet service'].index
        # symmetric_difference reports mismatches in BOTH directions: a no-internet
        # customer missing the placeholder, and a customer carrying the placeholder
        # although InternetService != 'No'. A plain difference() only sees the former.
        error = customers_nointernet.symmetric_difference(flagged)
        assert error.empty, '{}: {} customers disagree with InternetService'.format(col, len(error))

In [162]:
# Turn InternetService into three 0/1 indicator columns and drop the original:
#   Internet   -> 1 unless the value is 'No'
#   DSL        -> 1 when the value is 'DSL'
#   FiberOptic -> 1 when the value is 'Fiber optic'
df = df.assign(
    Internet=df['InternetService'].ne('No').astype('int64'),
    DSL=df['InternetService'].eq('DSL').astype('int64'),
    FiberOptic=df['InternetService'].eq('Fiber optic').astype('int64'),
).drop(columns='InternetService')

In [163]:
# Encode every two-valued feature as 0/1. A customer without internet service cannot
# have an internet add-on, and a customer without phone service cannot have multiple
# lines, so both placeholder values are encoded as 0.
yesno = {'Yes': 1, 'No': 0, 'No internet service': 0, 'No phone service': 0}
binary_map = {'gender': {'Female': 0, 'Male': 1},
              'Partner': yesno,
              'Dependents': yesno,
              'PhoneService': yesno,
              'OnlineSecurity': yesno,
              'OnlineBackup': yesno,
              'DeviceProtection': yesno,
              'TechSupport': yesno,
              'StreamingTV': yesno,
              'StreamingMovies': yesno,
              'PaperlessBilling': yesno,
              'Churn': yesno,
              'MultipleLines': yesno}

# Map column by column instead of calling df.replace(binary_map): replace() only yields
# integer columns by implicitly downcasting the object columns, a behaviour that newer
# pandas releases deprecate. map() followed by astype() always produces int64.
encoded = {}
for col, mapping in binary_map.items():
    # fail loudly on a category the mapping does not know, rather than leaving it unencoded
    unexpected = set(df[col].unique()) - set(mapping)
    if unexpected:
        raise ValueError('unexpected values in {}: {}'.format(col, sorted(map(str, unexpected))))
    encoded[col] = df[col].map(mapping).astype('int64')

# assign() overwrites the existing columns in place, so the column order is unchanged
df = df.assign(**encoded)
# after encoding, 1 means 'Male', so name the column accordingly
df = df.rename(columns = {'gender': 'Male'})

In [164]:
# Contract and PaymentMethod have more than two levels, so they are one-hot encoded.
# dtype is pinned to an integer type because the get_dummies default differs between
# pandas versions (boolean columns in pandas >= 2.0), while the encoded frame is
# expected to contain only int/float columns.
dummies = pd.get_dummies(df[['Contract', 'PaymentMethod']], prefix = ['Contract',  'Payment'],
                         prefix_sep = '=', dtype = 'uint8')
df = pd.concat([df, dummies], axis = 1)
# the original string columns are now redundant
df = df.drop(columns = ['Contract', 'PaymentMethod'])

In [167]:
# Finished encoding: list each column's dtype to confirm that they are all either
# int or float (no object columns left)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7032 entries, 7590-VHVEG to 3186-AJIEK
Data columns (total 27 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Male                               7032 non-null   int64  
 1   SeniorCitizen                      7032 non-null   int64  
 2   Partner                            7032 non-null   int64  
 3   Dependents                         7032 non-null   int64  
 4   tenure                             7032 non-null   int64  
 5   PhoneService                       7032 non-null   int64  
 6   MultipleLines                      7032 non-null   int64  
 7   OnlineSecurity                     7032 non-null   int64  
 8   OnlineBackup                       7032 non-null   int64  
 9   DeviceProtection                   7032 non-null   int64  
 10  TechSupport                        7032 non-null   int64  
 11  StreamingTV                        7032 non-nu