BT Career Challenge

1. Check the data, adjust data, delete null value

1.1 Import needed libraries and data

In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

# functions for modelling
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score



In [2]:
# use pandas import data
df_customer = pd.read_csv('./Data/Data Set.csv')
df_customer

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Monthtomonth,Yes,Electronic check,29.85,29.85,No
1,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Monthtomonth,Yes,Mailed check,53.85,108.15,Yes
2,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Monthtomonth,Yes,Electronic check,70.70,151.65,Yes
3,9305-CDSKC,Female,0,No,No,8,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,Monthtomonth,Yes,Electronic check,99.65,820.5,Yes
4,1452-KIOVK,Male,0,No,Yes,22,Yes,Yes,Fiber optic,No,...,No,No,Yes,No,Monthtomonth,Yes,Credit card (automatic),89.10,1949.4,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,4807-IZYOZ,Female,0,No,No,51,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Bank transfer (automatic),20.65,1020.75,No
7039,9710-NJERN,Female,0,No,No,39,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.15,826,No
7040,9281-CEDRU,Female,0,Yes,No,68,Yes,No,DSL,No,...,No,Yes,Yes,No,Two year,No,Bank transfer (automatic),64.10,4326.25,No
7041,2569-WGERO,Female,0,No,No,72,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,Yes,Bank transfer (automatic),21.15,1419.4,No


In [3]:
# check the datatype and the number of records
df_customer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [4]:
# check the number of churn
df_customer.Churn.value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

From the inital exploration, we can see we have 21 columns covering customers' features, and the most important feature churn has 1869 customers who left BT services (indicating it is an imbalanced dataset). It is time to validate data including:
- delete unuseful columns, like customerID
- delete null values
- check if customers is unique
- convert some features from string to numeric values
- merge some categories in features

1.2 Sort out data

To process logistic regression, the values in columns should be transformed:
- Change data type
- Merge categories
- Generate dummy variable for categorical variables (One hot encoding)

In [5]:
# drop customerID
df_customer.drop('customerID', axis = 'columns', inplace = True)
df_customer.sample(5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
1064,Female,1,Yes,Yes,22,Yes,Yes,Fiber optic,No,No,No,No,Yes,No,Monthtomonth,No,Bank transfer (automatic),85.35,1961.6,No
540,Female,0,No,No,20,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Monthtomonth,Yes,Bank transfer (automatic),19.5,413.0,No
1425,Female,0,No,No,1,Yes,No,Fiber optic,No,No,No,No,No,No,Monthtomonth,No,Electronic check,70.1,70.1,No
6488,Female,0,No,No,56,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Bank transfer (automatic),19.7,1051.9,No
3412,Male,0,Yes,No,5,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Monthtomonth,No,Mailed check,19.35,126.05,Yes


In [6]:
# convert TotalCharges from categorical values to numerical values
df_customer["TotalCharges"] = pd.to_numeric(df_customer.TotalCharges,errors = 'coerce')

In [7]:
# check the unique values of each feature
def print_unique_col_values(df):
    for column in df:
        if df[column].dtype == 'object':
            print(f'{column}: {df[column].unique()}')

print_unique_col_values(df_customer)

gender: ['Female' 'Male']
Partner: ['Yes' 'No']
Dependents: ['No' 'Yes']
PhoneService: ['No' 'Yes']
MultipleLines: ['No phone service' 'No' 'Yes']
InternetService: ['DSL' 'Fiber optic' 'No']
OnlineSecurity: ['No' 'Yes' 'No internet service']
OnlineBackup: ['Yes' 'No' 'No internet service']
DeviceProtection: ['No' 'Yes' 'No internet service']
TechSupport: ['No' 'Yes' 'No internet service']
StreamingTV: ['No' 'Yes' 'No internet service']
StreamingMovies: ['No' 'Yes' 'No internet service']
Contract: ['Monthtomonth' 'Two year' 'One year']
PaperlessBilling: ['Yes' 'No']
PaymentMethod: ['Electronic check' 'Mailed check' 'Credit card (automatic)'
 'Bank transfer (automatic)']
Churn: ['No' 'Yes']


In [8]:
# replace No internet service to No, replace No phone service to No
df_customer.replace('No phone service', 'No', inplace = True)
df_customer.replace('No internet service', 'No', inplace = True)
df_customer.sample(5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
2162,Male,0,No,No,19,Yes,No,Fiber optic,No,No,Yes,Yes,Yes,No,Monthtomonth,No,Mailed check,89.95,1682.4,No
3371,Female,0,Yes,No,25,Yes,Yes,DSL,No,Yes,No,No,No,No,Monthtomonth,No,Electronic check,54.1,1373.0,No
3760,Male,1,No,No,6,Yes,No,Fiber optic,No,No,No,No,Yes,No,Monthtomonth,Yes,Electronic check,79.7,497.6,No
6676,Male,0,Yes,No,70,Yes,Yes,No,No,No,No,No,No,No,Two year,No,Bank transfer (automatic),24.25,1724.15,No
6202,Male,0,Yes,Yes,71,Yes,No,Fiber optic,Yes,Yes,Yes,Yes,Yes,No,Two year,No,Credit card (automatic),100.2,7209.0,No


In [9]:
# convert yes to 1, no to 0 for analysis; male to 1, female to 0 for analysis
mapping_dict = {'Male': 0, 'Female': 1, 'No': 0, 'Yes': 1}

cols_to_replace = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 
                   'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 
                   'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling', 'Churn']

df_customer[cols_to_replace] = df_customer[cols_to_replace].replace(mapping_dict)
df_customer.sample(5)


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
379,1,0,0,0,1,1,0,DSL,0,0,0,1,0,0,Monthtomonth,1,Electronic check,49.9,49.9,0
5066,0,0,1,0,40,1,0,Fiber optic,0,1,1,0,0,0,One year,0,Electronic check,80.8,3132.75,0
4519,1,0,0,0,39,1,0,DSL,0,1,0,0,1,0,One year,1,Credit card (automatic),58.6,2224.5,0
6263,0,1,1,0,64,1,1,No,0,0,0,0,0,0,Two year,0,Credit card (automatic),25.65,1740.8,0
5018,1,0,0,0,26,1,0,DSL,0,1,1,0,0,0,One year,1,Mailed check,56.05,1553.2,0


In [10]:
# delete records with missing values
df_customer.dropna()
df_customer.sample(5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
3725,0,0,1,0,3,0,0,DSL,0,0,0,1,0,0,Monthtomonth,1,Electronic check,29.9,92.25,0
6676,0,0,1,0,70,1,1,No,0,0,0,0,0,0,Two year,0,Bank transfer (automatic),24.25,1724.15,0
3777,1,0,0,0,37,1,1,Fiber optic,0,1,1,0,1,1,Monthtomonth,1,Bank transfer (automatic),101.9,3545.35,1
6011,0,0,1,1,42,1,0,No,0,0,0,0,0,0,Two year,0,Mailed check,20.7,828.85,0
2279,1,0,0,0,40,1,0,Fiber optic,0,1,0,0,0,0,Monthtomonth,0,Credit card (automatic),74.8,2971.7,0


In [11]:
# generate dummy variables for categorical variables
df_customer_clean = pd.get_dummies(df_customer,columns=['InternetService','Contract','PaymentMethod'])
df_customer_clean.sample(5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,...,InternetService_DSL,InternetService_Fiber optic,InternetService_No,Contract_Monthtomonth,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
5793,0,1,1,0,72,1,1,0,1,0,...,0,1,0,0,0,1,1,0,0,0
3608,0,0,0,0,24,1,0,1,0,0,...,1,0,0,1,0,0,0,0,1,0
954,1,0,0,0,9,1,0,1,0,0,...,0,1,0,1,0,0,0,1,0,0
4463,0,0,0,1,7,1,0,0,0,0,...,0,0,1,0,1,0,0,0,0,1
4995,0,0,0,1,25,0,0,1,0,0,...,1,0,0,0,1,0,0,1,0,0


In [12]:
# scaling data to avoid bias
column_to_scale = ['tenure','MonthlyCharges', 'TotalCharges']

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df_customer_clean[column_to_scale] = scaler.fit_transform(df_customer_clean[column_to_scale])

In [24]:
# final check
df_customer_clean.sample(5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,...,InternetService_DSL,InternetService_Fiber optic,InternetService_No,Contract_Monthtomonth,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
744,1,0,0,0,0.125,1,0,0,0,0,...,0,0,1,1,0,0,0,0,1,0
3878,1,0,0,0,0.930556,1,1,1,1,1,...,0,1,0,1,0,0,0,1,0,0
3979,1,0,1,0,1.0,1,1,0,1,1,...,0,1,0,0,1,0,1,0,0,0
6725,1,0,1,1,0.666667,0,0,1,1,0,...,1,0,0,0,0,1,0,0,0,1
246,1,1,0,0,0.680556,1,1,1,0,0,...,0,1,0,1,0,0,0,1,0,0


2. Modelling

After preparing data, there are several steps to build model:
- Split data into training dataset and test dataset

2.1 Split data

In [14]:
# keep columns except churn, copy it to avoid modifying the original one, set datatype as float
X = df_customer_clean.drop(columns = ['Churn']).copy().astype(float)


y = df_customer_clean.Churn.astype(np.float32)

In [15]:
X.sample(5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,...,InternetService_DSL,InternetService_Fiber optic,InternetService_No,Contract_Monthtomonth,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
6667,1.0,1.0,0.0,0.0,0.930556,1.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
6899,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2979,0.0,0.0,0.0,0.0,0.027778,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1685,1.0,0.0,0.0,0.0,0.138889,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4988,1.0,0.0,1.0,1.0,0.805556,1.0,1.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [25]:
y.value_counts()

0.0    5174
1.0    1869
Name: Churn, dtype: int64

In [26]:
# split training and test sets
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.20, random_state=23, stratify = y)

In [27]:
y_train.value_counts()

0.0    4139
1.0    1495
Name: Churn, dtype: int64

In [28]:
y_test.value_counts()

0.0    1035
1.0     374
Name: Churn, dtype: int64

In [29]:
X_train.shape

(5634, 26)

In [30]:
X_train.sample(5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,...,InternetService_DSL,InternetService_Fiber optic,InternetService_No,Contract_Monthtomonth,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
6290,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
520,1.0,0.0,0.0,0.0,0.111111,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
5748,1.0,0.0,1.0,1.0,0.319444,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2315,0.0,1.0,0.0,0.0,0.166667,1.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1412,1.0,0.0,0.0,0.0,0.013889,1.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


2.2 Logistic Regression

In [32]:
def log_reg(X_train, X_test, y_train, y_test, weights):
    if weights == -1:
        model = LogisticRegression()
    else:
        model = LogisticRegression(class_weight={0:weights[0], 1:weights[1]})
    
    model.fit(X_train, y_train)
    acc = model.score(X_test, y_test)
    print('Accuracy', acc, '\n')

    y_pred = model.predict(X_test)
    print('preds', y_pred[:5], '\n')

    cl_rep = classification_report(y_test, y_pred)
    print(cl_rep)

In [33]:
weights = -1
log_reg(X_train, X_test, y_train, y_test, weights)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
_