BT Career Challenge

1. Check the data, adjust the data, delete null values

1.1 Import needed libraries and data

In [113]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

# functions for modelling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# roc curve and auc
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

In [114]:
# load the customer dataset from the local Data folder into a DataFrame
df_customer = pd.read_csv('./Data/Data Set.csv')
# show the frame as the cell output (pandas truncates long frames when rendering)
df_customer

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Monthtomonth,Yes,Electronic check,29.85,29.85,No
1,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Monthtomonth,Yes,Mailed check,53.85,108.15,Yes
2,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Monthtomonth,Yes,Electronic check,70.70,151.65,Yes
3,9305-CDSKC,Female,0,No,No,8,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,Monthtomonth,Yes,Electronic check,99.65,820.5,Yes
4,1452-KIOVK,Male,0,No,Yes,22,Yes,Yes,Fiber optic,No,...,No,No,Yes,No,Monthtomonth,Yes,Credit card (automatic),89.10,1949.4,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,4807-IZYOZ,Female,0,No,No,51,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Bank transfer (automatic),20.65,1020.75,No
7039,9710-NJERN,Female,0,No,No,39,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.15,826,No
7040,9281-CEDRU,Female,0,Yes,No,68,Yes,No,DSL,No,...,No,Yes,Yes,No,Two year,No,Bank transfer (automatic),64.10,4326.25,No
7041,2569-WGERO,Female,0,No,No,72,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,Yes,Bank transfer (automatic),21.15,1419.4,No


In [115]:
# check each column's dtype and non-null count, plus the total number of records
df_customer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [116]:
# count customers per Churn label to see how balanced the target is
df_customer.Churn.value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

From the initial exploration, we can see that the dataset has 21 columns describing customer features. The target column, Churn, shows that 1869 of the 7043 customers left BT services, so the dataset is imbalanced. Next, the data needs to be validated and prepared:
- drop columns that are not useful for modelling, such as customerID
- delete null values
- check that customers are unique
- convert some features from strings to numeric values
- merge some categories within features

1.2 Sort out data

To fit a logistic regression, the values in the columns must first be transformed:
- Change data type
- Merge categories
- Generate dummy variable for categorical variables (One hot encoding)

In [117]:
# drop customerID: it is not used as a model feature
df_customer.drop(columns=['customerID'], inplace=True)
df_customer.sample(5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
3587,Female,0,Yes,Yes,42,Yes,Yes,Fiber optic,No,No,No,No,Yes,Yes,Monthtomonth,Yes,Electronic check,94.2,4186.3,Yes
2682,Female,0,Yes,Yes,1,Yes,No,Fiber optic,No,No,No,No,No,No,Monthtomonth,Yes,Electronic check,69.8,69.8,Yes
720,Female,0,No,No,18,Yes,Yes,Fiber optic,No,No,No,No,Yes,No,Monthtomonth,Yes,Electronic check,85.2,1553.9,Yes
5955,Male,0,Yes,Yes,72,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Electronic check,20.35,1354.4,No
5669,Female,0,Yes,Yes,45,Yes,No,DSL,Yes,No,Yes,Yes,Yes,Yes,Two year,Yes,Mailed check,81.0,3533.6,No


In [121]:
# parse TotalCharges as numbers; entries that cannot be parsed become NaN (errors='coerce')
df_customer['TotalCharges'] = pd.to_numeric(df_customer['TotalCharges'], errors='coerce')

In [122]:
# helper to inspect the distinct values of each object-typed feature
def print_unique_col_values(df):
    """Print ``<column>: <unique values>`` for every column whose dtype is 'object'.

    Columns with any other dtype are skipped. Nothing is returned; the
    values are written to standard output, one column per line.
    """
    object_columns = [name for name in df.columns if df[name].dtype == 'object']
    for name in object_columns:
        distinct_values = df[name].unique()
        print(f'{name}: {distinct_values}')

# list the distinct values of every object-typed column in the customer frame
print_unique_col_values(df_customer)

gender: ['Female' 'Male']
Partner: ['Yes' 'No']
Dependents: ['No' 'Yes']
PhoneService: ['No' 'Yes']
MultipleLines: ['No phone service' 'No' 'Yes']
InternetService: ['DSL' 'Fiber optic' 'No']
OnlineSecurity: ['No' 'Yes' 'No internet service']
OnlineBackup: ['Yes' 'No' 'No internet service']
DeviceProtection: ['No' 'Yes' 'No internet service']
TechSupport: ['No' 'Yes' 'No internet service']
StreamingTV: ['No' 'Yes' 'No internet service']
StreamingMovies: ['No' 'Yes' 'No internet service']
Contract: ['Monthtomonth' 'Two year' 'One year']
PaperlessBilling: ['Yes' 'No']
PaymentMethod: ['Electronic check' 'Mailed check' 'Credit card (automatic)'
 'Bank transfer (automatic)']
Churn: ['No' 'Yes']


In [130]:
# collapse both "no service" categories into a plain 'No' across the whole frame
df_customer.replace(['No phone service', 'No internet service'], 'No', inplace=True)
df_customer.sample(5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
3739,1,0,0,0,13,1,0,Fiber optic,0,0,0,0,0,0,Monthtomonth,1,Mailed check,70.15,886.7,0
6787,0,0,1,0,48,1,0,No,0,0,0,0,0,0,Two year,1,Mailed check,20.05,1036.0,0
6131,1,0,1,0,64,1,1,DSL,1,1,1,1,0,1,Two year,1,Mailed check,81.3,5129.3,0
231,1,0,1,1,22,1,0,Fiber optic,0,1,0,0,0,1,Monthtomonth,0,Electronic check,83.3,1845.9,1
187,0,0,1,0,63,1,1,Fiber optic,0,1,1,0,1,1,Monthtomonth,1,Electronic check,103.4,6603.0,1


In [131]:
# encode binary columns as integers for analysis: Yes -> 1, No -> 0; Female -> 1, Male -> 0
mapping_dict = {'Male': 0, 'Female': 1, 'No': 0, 'Yes': 1}

# binary columns to encode (assumes the 'No phone service' / 'No internet service'
# values were already replaced with 'No', otherwise they stay as strings)
cols_to_replace = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 
                   'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 
                   'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling', 'Churn']

# apply the mapping only to the selected columns, then preview a few random rows
df_customer[cols_to_replace] = df_customer[cols_to_replace].replace(mapping_dict)
df_customer.sample(5)


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
1703,1,0,0,0,5,1,0,DSL,1,0,0,1,0,0,Monthtomonth,0,Electronic check,54.2,308.25,1
3219,1,0,0,0,6,1,0,Fiber optic,0,1,0,0,0,1,Monthtomonth,1,Mailed check,85.95,514.6,0
866,1,0,0,0,1,1,0,Fiber optic,0,1,0,1,1,0,Monthtomonth,0,Electronic check,91.7,91.7,1
6701,0,0,0,0,72,0,0,DSL,1,0,1,1,1,1,Two year,1,Credit card (automatic),61.2,4390.25,0
3589,0,0,0,0,11,1,0,DSL,0,1,1,1,0,0,Monthtomonth,1,Electronic check,60.9,688.5,0


In [132]:
# one-hot encode the remaining multi-category columns into indicator columns
df_customer_clean = pd.get_dummies(
    data=df_customer,
    columns=['InternetService', 'Contract', 'PaymentMethod'],
)
df_customer_clean.sample(5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,...,InternetService_DSL,InternetService_Fiber optic,InternetService_No,Contract_Monthtomonth,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
4309,1,0,0,0,36,1,1,0,0,1,...,1,0,0,0,1,0,0,0,1,0
4556,0,0,1,1,61,1,0,1,1,1,...,1,0,0,0,1,0,1,0,0,0
1996,1,1,1,0,18,0,0,0,0,1,...,1,0,0,1,0,0,0,0,1,0
5128,1,0,0,0,41,1,1,0,1,1,...,1,0,0,0,1,0,0,0,1,0
5036,0,0,1,1,68,0,0,1,1,0,...,1,0,0,0,1,0,0,0,0,1


In [133]:
# Rescale the continuous columns to the [0, 1] range so that features with
# large raw values do not dominate the 0/1 indicator columns.
# MinMaxScaler is imported in the import cell at the top of the notebook.
# NOTE(review): the scaler is fit on the full dataset before the train/test
# split performed later in the notebook, so test-set min/max values influence
# the training features; consider fitting it on the training split only.
column_to_scale = ['tenure', 'MonthlyCharges', 'TotalCharges']

scaler = MinMaxScaler()
df_customer_clean[column_to_scale] = scaler.fit_transform(df_customer_clean[column_to_scale])

In [138]:
# remove every row that still contains at least one missing value
df_customer_clean.dropna(axis='index', how='any', inplace=True)
df_customer_clean.sample(5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,...,InternetService_DSL,InternetService_Fiber optic,InternetService_No,Contract_Monthtomonth,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
4709,1,0,1,1,0.930556,1,1,1,0,1,...,0,1,0,0,1,0,0,0,1,0
767,1,0,0,0,0.013889,1,0,0,0,0,...,0,0,1,1,0,0,0,0,0,1
3306,0,0,1,0,0.152778,1,0,0,0,0,...,1,0,0,1,0,0,1,0,0,0
5202,0,0,0,0,0.527778,1,0,0,0,0,...,0,0,1,0,1,0,0,0,0,1
3583,1,0,1,1,0.611111,1,0,1,0,0,...,1,0,0,1,0,0,1,0,0,0


In [139]:
# confirm the remaining row count, the dtypes, and that no column has missing values
df_customer_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 27 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   gender                                   7032 non-null   int64  
 1   SeniorCitizen                            7032 non-null   int64  
 2   Partner                                  7032 non-null   int64  
 3   Dependents                               7032 non-null   int64  
 4   tenure                                   7032 non-null   float64
 5   PhoneService                             7032 non-null   int64  
 6   MultipleLines                            7032 non-null   int64  
 7   OnlineSecurity                           7032 non-null   int64  
 8   OnlineBackup                             7032 non-null   int64  
 9   DeviceProtection                         7032 non-null   int64  
 10  TechSupport                              7032 no

2. Modelling

After preparing the data, there are several steps to build the model:
- Split data into training dataset and test dataset

2.1 Split data

In [140]:
# feature matrix: every column except the Churn target, as a new float-typed frame
X = df_customer_clean.drop(columns='Churn').astype(float)

# target vector: the Churn column as a flat 1-D float array
y = df_customer_clean['Churn'].to_numpy(dtype=float)

In [141]:
# preview a few random rows of the feature matrix
X.sample(5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,...,InternetService_DSL,InternetService_Fiber optic,InternetService_No,Contract_Monthtomonth,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
3932,0.0,0.0,0.0,0.0,0.569444,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
5756,1.0,0.0,1.0,1.0,0.875,1.0,0.0,1.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1756,1.0,1.0,1.0,0.0,0.597222,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
6889,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
5290,1.0,0.0,1.0,1.0,0.819444,1.0,1.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [142]:
# class counts of the target after rows with missing values were dropped
pd.Series(y).value_counts()

0.0    5163
1.0    1869
dtype: int64

In [143]:
# Split into training and test sets (80/20). stratify=y keeps the churn ratio
# the same in both splits; random_state fixes the shuffle for reproducibility.
# train_test_split is imported explicitly in the import cell: a bare
# `import sklearn` does not load the sklearn.model_selection submodule.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=6, stratify=y
)

In [144]:
# class counts in the training labels
pd.Series(y_train).value_counts()

0.0    4130
1.0    1495
dtype: int64

In [145]:
# class counts in the test labels
pd.Series(y_test).value_counts()

0.0    1033
1.0     374
dtype: int64

In [146]:
# (rows, columns) of the training feature matrix
X_train.shape

(5625, 26)

In [147]:
# preview a few random rows of the training features
X_train.sample(5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,...,InternetService_DSL,InternetService_Fiber optic,InternetService_No,Contract_Monthtomonth,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
2400,1.0,1.0,0.0,0.0,0.111111,1.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4827,0.0,0.0,0.0,0.0,0.361111,1.0,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1595,0.0,0.0,1.0,0.0,0.25,1.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1498,1.0,0.0,0.0,1.0,0.055556,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2080,0.0,0.0,0.0,0.0,0.208333,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


2.2 Logistic Regression

In [151]:
def log_reg(X_train, y_train, X_test, y_test, weights):
    """Fit a logistic regression and print its evaluation on the test set.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``model.fit``.
    X_test, y_test
        Held-out features and labels used for the accuracy score and the
        classification report.
    weights
        ``-1`` fits a default ``LogisticRegression()`` without class weights.
        Any other value must be indexable with ``0`` and ``1``; those entries
        become the class weights for class 0 and class 1.

    Returns
    -------
    None
        Results are printed: the test accuracy, the first five predictions
        and the classification report.
    """
    if weights == -1:
        model = LogisticRegression()
    else:
        # Fixed: this line previously read ``weight[0]`` (an undefined name),
        # so every weighted call raised NameError.
        model = LogisticRegression(class_weight={0: weights[0], 1: weights[1]})

    model.fit(X_train, y_train)
    acc = model.score(X_test, y_test)
    print('Accuracy', acc, '\n')

    y_pred = model.predict(X_test)
    print('preds', y_pred[:5], '\n')

    cl_rep = classification_report(y_test, y_pred)
    print(cl_rep)


In [152]:
# baseline run: weights = -1 makes log_reg fit a default LogisticRegression() without class weights
weights = -1
log_reg(X_train, y_train, X_test, y_test, weights)

Accuracy 0.820184790334044 

preds [1. 0. 0. 1. 0.] 

              precision    recall  f1-score   support

         0.0       0.86      0.91      0.88      1033
         1.0       0.70      0.57      0.63       374

    accuracy                           0.82      1407
   macro avg       0.78      0.74      0.76      1407
weighted avg       0.81      0.82      0.81      1407

