In [41]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [42]:
df = pd.read_csv('telco_customer_churn.csv')

In [43]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [44]:
df.TotalCharges = pd.to_numeric(df.TotalCharges, errors='coerce')

In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [46]:
df.dropna(how='any', inplace=True)

In [47]:
df.Churn.value_counts()/len(df)*100

Churn
No     73.421502
Yes    26.578498
Name: count, dtype: float64

In [48]:
x = df.drop(['customerID', 'Churn'], axis=1)
y = df['Churn']

In [49]:
# Feature encoding
x = pd.get_dummies(x, columns=['gender', 'Partner', 'Dependents',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod'], drop_first=True, dtype=int)

In [50]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [51]:
# Apply Recursive Feature Elimination (RFE) to select 5 best features
model = LogisticRegression()
rfe = RFE(model, n_features_to_select=5)

In [52]:
# Fitting the model
rfe = rfe.fit(x_train, y_train)

In [53]:
# Get the selected features
selected_features = x_train.columns[rfe.support_]
x_train_selected = x_train[selected_features]
x_test_selected = x_test[selected_features]

In [54]:
# Build the model
original_model = LogisticRegression()
original_model.fit(x_train, y_train)
y_original_model_prediction = original_model.predict(x_test)
accuracy = accuracy_score(y_test, y_original_model_prediction)
print('Accuracy of the base model is: ', round(accuracy*100, 2))

Accuracy of the base model is:  79.24


In [57]:
# Build the RFE based model on 5 predictor
rfe_model = LogisticRegression()
rfe_model.fit(x_train_selected, y_train)
rfe_y_model_prediction = rfe_model.predict(x_test_selected)
accuracy_rfe = accuracy_score(y_test, rfe_y_model_prediction)
print('Accuracy of the RFE base model is: ', round(accuracy_rfe*100, 2))

Accuracy of the RFE base model is:  76.3
