In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
# Suppress every warning so cell outputs stay uncluttered.
# NOTE(review): a blanket 'ignore' filter also hides deprecation and
# chained-assignment warnings from pandas/sklearn — consider narrowing it
# to specific warning categories.
warnings.filterwarnings('ignore')

In [2]:
# Read the raw Telco churn export into a DataFrame and preview the first rows.
csv_path = 'data/Telco-Customer-Churn.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
# Print the column names, non-null counts and dtypes to spot mis-typed columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [6]:
# Peek at the first five MonthlyCharges values (and the column's dtype).
df['MonthlyCharges'].head()

0    29.85
1    56.95
2    53.85
3    42.30
4    70.70
Name: MonthlyCharges, dtype: float64

In [8]:
# Make sure both charge columns are numeric.
# MonthlyCharges is already parsed as float64 (see its preview output), so
# converting it is a harmless no-op kept for backward compatibility.
# NOTE(review): TotalCharges is presumably the column that pandas loads as a
# string object and that this cell was meant to fix — confirm with df.dtypes.
# errors='coerce' turns any value that cannot be parsed into NaN, so inspect
# df['TotalCharges'].isna().sum() and decide how to impute before modelling.
for charge_col in ('MonthlyCharges', 'TotalCharges'):
    df[charge_col] = pd.to_numeric(df[charge_col], errors='coerce')

In [10]:
# Normalise the headers: lowercase every column name and swap spaces for underscores.
normalised_names = df.columns.str.lower().str.replace(' ', '_')
df.columns = normalised_names

In [13]:
# Collect the names of every column that pandas stores with the object dtype.
is_object_dtype = df.dtypes.eq('object')
string_columns = df.columns[is_object_dtype]

In [14]:
# Apply the same normalisation used for the headers to the values of each
# object-typed column: lowercase first, then replace spaces with underscores.
for col in string_columns:
    lowered = df[col].str.lower()
    df[col] = lowered.str.replace(' ', '_')

In [16]:
# next we encode the churn column as a binary 0/1 target (1 = churned, 0 = not churned)

In [19]:
# Encode the target as integers: 1 where churn == 'yes', 0 otherwise.
# The dtype guard keeps this cell idempotent — without it, re-running the cell
# would compare the already-encoded integers with 'yes' and silently turn the
# whole target column into zeros.
if not pd.api.types.is_integer_dtype(df['churn']):
    df['churn'] = (df['churn'] == 'yes').astype('int')

In [21]:
# Hold out 20% of the rows as the test set; the remaining 80% (df_train_full)
# is later divided into training and validation parts. The fixed random_state
# makes the split reproducible.
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)

In [22]:
# Carve a validation set out of the non-test rows: 20% of df_train_full goes to
# df_val and the rest to df_train (seeded so the split is repeatable).
split_kwargs = {'test_size': 0.2, 'random_state': 11}
df_train, df_val = train_test_split(df_train_full, **split_kwargs)

In [23]:
# Keep the churn target in separate Series and remove it from the feature frames
# so it cannot leak into the model inputs.
ytrain = df_train['churn']
yval = df_val['churn']

# Rebind to new frames instead of dropping in place: df_train/df_val come out of
# train_test_split, and mutating such derived frames with inplace=True raises
# pandas' SettingWithCopyWarning.
df_train = df_train.drop(columns='churn')
df_val = df_val.drop(columns='churn')

In [24]:
# Class balance of the target in the train+validation rows (0 = stayed, 1 = churned).
churn_counts = df_train_full['churn'].value_counts()
churn_counts

0    4113
1    1521
Name: churn, dtype: int64

In [30]:
# Share of churned customers (label 1) among the train+validation rows, in percent.
n_churned = df_train_full['churn'].value_counts()[1]
churn_pct = n_churned / len(df_train_full) * 100
print(f"% of customers churned : {churn_pct:0.3f}%")

% of customers churned : 26.997%
