<h1>Task - predict customer churn with telecom data.</h1>

In [239]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, make_scorer, precision_recall_fscore_support
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone


# Reading in the data
data = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
data.head(20)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes
5,9305-CDSKC,Female,0,No,No,8,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,Yes
6,1452-KIOVK,Male,0,No,Yes,22,Yes,Yes,Fiber optic,No,...,No,No,Yes,No,Month-to-month,Yes,Credit card (automatic),89.1,1949.4,No
7,6713-OKOMC,Female,0,No,No,10,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,No,Mailed check,29.75,301.9,No
8,7892-POOKP,Female,0,Yes,No,28,Yes,Yes,Fiber optic,No,...,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes
9,6388-TABGU,Male,0,No,Yes,62,Yes,No,DSL,Yes,...,No,No,No,No,One year,No,Bank transfer (automatic),56.15,3487.95,No


In [240]:
# Checking for data types and columns
print(f'Data shape: {data.shape}')
data_info = data.info()
data_describe = data.describe()

data_info
data_describe

Data shape: (7043, 21)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [241]:
# TotalCharges should be a numeric feature, but is currently of object data type
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
data.describe()

# There are missing values in TotalCharges now, we will replace them with the median
data['TotalCharges'].fillna(data['TotalCharges'].median(), inplace=True)
#Check if there are any null values left in the data
null_check = data.isnull().sum()
print(null_check)

# We should also remove the column 'customerID' from the data, because it has no importance to the analysis and prediction
data = data.drop('customerID', axis=1)

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


In [242]:
# Label encoding the data for training and prediction
label_encoder = LabelEncoder()
categorical_columns = [col for col in data.columns if data[col].dtype == 'object']

for column in categorical_columns:
    data[column] = label_encoder.fit_transform(data[column])

# Checking for data types
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   int64  
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   int64  
 3   Dependents        7043 non-null   int64  
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   int64  
 6   MultipleLines     7043 non-null   int64  
 7   InternetService   7043 non-null   int64  
 8   OnlineSecurity    7043 non-null   int64  
 9   OnlineBackup      7043 non-null   int64  
 10  DeviceProtection  7043 non-null   int64  
 11  TechSupport       7043 non-null   int64  
 12  StreamingTV       7043 non-null   int64  
 13  StreamingMovies   7043 non-null   int64  
 14  Contract          7043 non-null   int64  
 15  PaperlessBilling  7043 non-null   int64  
 16  PaymentMethod     7043 non-null   int64  


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,0,1,0,1,0,1,0,0,2,0,0,0,0,0,1,2,29.85,29.85,0
1,1,0,0,0,34,1,0,0,2,0,2,0,0,0,1,0,3,56.95,1889.5,0
2,1,0,0,0,2,1,0,0,2,2,0,0,0,0,0,1,3,53.85,108.15,1
3,1,0,0,0,45,0,1,0,2,0,2,2,0,0,1,0,0,42.3,1840.75,0
4,0,0,0,0,2,1,0,1,0,0,0,0,0,0,0,1,2,70.7,151.65,1


In [243]:
# Finding the most important features for churn prediction based on chi-squared test and dropping the least important ones
from sklearn.feature_selection import SelectKBest, chi2

target = 'Churn'
features = data.drop(columns=target).columns

# Selecting the best features
best_features = SelectKBest(score_func=chi2, k='all')
fit = best_features.fit(data[features], data[target])

# Getting the scores for each feature
featureScores = pd.DataFrame({'Feature Name':data[features].columns, 'Chi-Squared Score':fit.scores_})

# Sorting the features based on score
sorted_featureScores = featureScores.sort_values(by='Chi-Squared Score', ascending=False)

print(sorted_featureScores)

# Dropping the least important columns
data.drop(columns=['PhoneService', 'gender', 'StreamingTV', 'StreamingMovies', 'MultipleLines', 'InternetService'], inplace=True)

        Feature Name  Chi-Squared Score
18      TotalCharges      627340.305176
4             tenure       16278.923685
17    MonthlyCharges        3680.787699
14          Contract        1115.780167
8     OnlineSecurity         551.611529
11       TechSupport         523.303866
9       OnlineBackup         230.086520
10  DeviceProtection         191.303140
1      SeniorCitizen         134.351545
3         Dependents         133.036443
15  PaperlessBilling         105.680863
2            Partner          82.412083
16     PaymentMethod          58.492250
7    InternetService           9.821028
6      MultipleLines           9.746921
13   StreamingMovies           8.235399
12       StreamingTV           7.490203
0             gender           0.258699
5       PhoneService           0.097261


In [244]:
# Seperating the trainable columns and target column and splitting the data into a train and test splits
X = data.drop('Churn', axis=1)
y = data['Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [245]:
# Checking for imbalance of the column churn in the dataset:
churn_distribution = data['Churn'].value_counts(normalize=True) * 100
print(churn_distribution)

Churn
0    73.463013
1    26.536987
Name: proportion, dtype: float64


<h1>1) Prediction without cross validation:</h1>


In [246]:
# Since the data is quite unbalanced, i will use SMOTE to oversample the minority class
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [247]:
# Checking the numerical values of columns to decide what to scale
for column in X_train_smote.columns:
    unique_values = np.sort(X_train_smote[column].unique())
    print(f"{column}: {unique_values}")

SeniorCitizen: [0 1]
Partner: [0 1]
Dependents: [0 1]
tenure: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72]
OnlineSecurity: [0 1 2]
OnlineBackup: [0 1 2]
DeviceProtection: [0 1 2]
TechSupport: [0 1 2]
Contract: [0 1 2]
PaperlessBilling: [0 1]
PaymentMethod: [0 1 2 3]
MonthlyCharges: [ 18.25        18.4         18.55       ... 118.28695457 118.35
 118.6       ]
TotalCharges: [  18.8    18.85   19.   ... 8564.75 8594.4  8684.8 ]


In [248]:
# Scaling the data:
scaler = MinMaxScaler()
columns_to_scale = ['tenure', 'MonthlyCharges', 'TotalCharges']

X_test_scaled = X_test.copy()

X_train_smote[columns_to_scale] = scaler.fit_transform(X_train_smote[columns_to_scale])
X_test_scaled[columns_to_scale] = scaler.transform(X_test_scaled[columns_to_scale])

for column in X_train_smote.columns:
    unique_values = X_train_smote[column].unique() 
    print(f'{column}: {unique_values}')


SeniorCitizen: [0 1]
Partner: [0 1]
Dependents: [1 0]
tenure: [0.29166667 0.75       0.01388889 0.05555556 0.         0.09722222
 0.44444444 1.         0.26388889 0.13888889 0.625      0.55555556
 0.65277778 0.5        0.95833333 0.98611111 0.48611111 0.04166667
 0.94444444 0.58333333 0.11111111 0.63888889 0.16666667 0.36111111
 0.68055556 0.45833333 0.43055556 0.91666667 0.80555556 0.18055556
 0.79166667 0.08333333 0.81944444 0.20833333 0.375      0.47222222
 0.25       0.06944444 0.54166667 0.40277778 0.02777778 0.875
 0.27777778 0.19444444 0.77777778 0.51388889 0.33333333 0.72222222
 0.59722222 0.15277778 0.22222222 0.69444444 0.52777778 0.31944444
 0.76388889 0.66666667 0.73611111 0.97222222 0.30555556 0.38888889
 0.61111111 0.90277778 0.88888889 0.83333333 0.70833333 0.125
 0.34722222 0.84722222 0.41666667 0.23611111 0.56944444 0.93055556
 0.86111111]
OnlineSecurity: [2 0 1]
OnlineBackup: [0 2 1]
DeviceProtection: [2 0 1]
TechSupport: [0 2 1]
Contract: [1 2 0]
PaperlessBilling: [0

Logistic regression

In [249]:
# Training and testing the model with LOGISTIC REGRESSION
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_smote, y_train_smote)
predictions_lr = lr.predict(X_test_scaled)
cr_lr = classification_report(y_test, predictions_lr)

print(cr_lr)

              precision    recall  f1-score   support

           0       0.90      0.76      0.82      1036
           1       0.53      0.77      0.63       373

    accuracy                           0.76      1409
   macro avg       0.72      0.77      0.73      1409
weighted avg       0.81      0.76      0.77      1409



Random forest

In [250]:
# Training and testing the model with RANDOM FOREST
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_smote, y_train_smote)
predictions_rf = rf.predict(X_test_scaled)
cr_rf = classification_report(y_test, predictions_rf)

print(cr_rf)

              precision    recall  f1-score   support

           0       0.85      0.85      0.85      1036
           1       0.59      0.59      0.59       373

    accuracy                           0.78      1409
   macro avg       0.72      0.72      0.72      1409
weighted avg       0.78      0.78      0.78      1409



XGBoost

In [251]:
# Training and testing the model with XGBOOST
xgb = XGBClassifier(learning_rate= 0.01,max_depth = 3,n_estimators = 1000, use_label_encoder=False, eval_metric = 'logloss')
xgb.fit(X_train_smote, y_train_smote)
predictions_xgb = xgb.predict(X_test_scaled)
cr_xgb = classification_report(y_test, predictions_xgb)

print(cr_xgb)

              precision    recall  f1-score   support

           0       0.89      0.78      0.83      1036
           1       0.55      0.74      0.63       373

    accuracy                           0.77      1409
   macro avg       0.72      0.76      0.73      1409
weighted avg       0.80      0.77      0.78      1409



<h1>2) Prediction using cross validation:</h1>

Using SMOTE and scaling seperately for each model to avoid data leakage.


In [252]:
# Initializing stratified K fold and model
skf = StratifiedKFold(n_splits=5)
model = LogisticRegression(max_iter=1000)
columns_to_scale = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Creating a dictionary to store the metrics across folds
metrics = {'precision': [], 'recall': [], 'f1_score': [], 'support': []}

# Iterating over each generated fold and splitting the data into training and validation sets
for train_index, test_index in skf.split(X_train, y_train):
    X_train_fold, X_val_fold = X_train.iloc[train_index].copy(), X_train.iloc[test_index].copy()
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[test_index]

    scaler = MinMaxScaler()

    # Scaling the specified columns from before
    X_train_fold.loc[:, columns_to_scale] = scaler.fit_transform(X_train_fold[columns_to_scale])
    X_val_fold.loc[:, columns_to_scale] = scaler.transform(X_val_fold[columns_to_scale])

    # Initializing and applying SMOTE
    smote = SMOTE(random_state=42)
    X_train_fold_smote, y_train_fold_smote = smote.fit_resample(X_train_fold, y_train_fold)

    # Cloning the original model to ensure a fresh model for each fold, training and predicting
    cloned_model = clone(model)
    cloned_model.fit(X_train_fold_smote, y_train_fold_smote)
    y_pred = cloned_model.predict(X_val_fold)

    # Calculating the necessary metrics
    precision, recall, f1, support = precision_recall_fscore_support(y_val_fold, y_pred)

    # And adding them to the directory
    metrics['precision'].append(precision)
    metrics['recall'].append(recall)
    metrics['f1_score'].append(f1)
    metrics['support'].append(support)

# Converting list of arrays to a DataFrame for easier manipulation
df_metrics = pd.DataFrame({
    'Precision_0': [m[0] for m in metrics['precision']],
    'Precision_1': [m[1] for m in metrics['precision']],
    'Recall_0': [m[0] for m in metrics['recall']],
    'Recall_1': [m[1] for m in metrics['recall']],
    'F1_0': [m[0] for m in metrics['f1_score']],
    'F1_1': [m[1] for m in metrics['f1_score']],
    'Support_0': [m[0] for m in metrics['support']],
    'Support_1': [m[1] for m in metrics['support']]
})

# Calculating the average of the metrics
avg_metrics = df_metrics.mean()

print("Average Metrics for Class 0 and Class 1:")
print(avg_metrics)

Average Metrics for Class 0 and Class 1:
Precision_0      0.905019
Precision_1      0.497530
Recall_0         0.710248
Recall_1         0.793454
F1_0             0.795805
F1_1             0.611461
Support_0      827.600000
Support_1      299.200000
dtype: float64


In [253]:
# Doing the same for Random Forest model
model = RandomForestClassifier(random_state=42)
columns_to_scale = ['tenure', 'MonthlyCharges', 'TotalCharges']

metrics = {'precision': [], 'recall': [], 'f1_score': [], 'support': []}

for train_index, test_index in skf.split(X_train, y_train):
    X_train_fold, X_val_fold = X_train.iloc[train_index].copy(), X_train.iloc[test_index].copy()
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[test_index]

    scaler = MinMaxScaler()

    X_train_fold.loc[:, columns_to_scale] = scaler.fit_transform(X_train_fold[columns_to_scale])
    X_val_fold.loc[:, columns_to_scale] = scaler.transform(X_val_fold[columns_to_scale])

    smote = SMOTE(random_state=42)
    X_train_fold_smote, y_train_fold_smote = smote.fit_resample(X_train_fold, y_train_fold)

    cloned_model = clone(model)
    cloned_model.fit(X_train_fold_smote, y_train_fold_smote)
    y_pred = cloned_model.predict(X_val_fold)

    precision, recall, f1, support = precision_recall_fscore_support(y_val_fold, y_pred)

    metrics['precision'].append(precision)
    metrics['recall'].append(recall)
    metrics['f1_score'].append(f1)
    metrics['support'].append(support)

df_metrics = pd.DataFrame({
    'Precision_0': [m[0] for m in metrics['precision']],
    'Precision_1': [m[1] for m in metrics['precision']],
    'Recall_0': [m[0] for m in metrics['recall']],
    'Recall_1': [m[1] for m in metrics['recall']],
    'F1_0': [m[0] for m in metrics['f1_score']],
    'F1_1': [m[1] for m in metrics['f1_score']],
    'Support_0': [m[0] for m in metrics['support']],
    'Support_1': [m[1] for m in metrics['support']]
})

avg_metrics = df_metrics.mean()

print("Average Metrics for Class 0 and Class 1:")
print(avg_metrics)

Average Metrics for Class 0 and Class 1:
Precision_0      0.865783
Precision_1      0.530609
Recall_0         0.788060
Recall_1         0.661759
F1_0             0.824994
F1_1             0.588688
Support_0      827.600000
Support_1      299.200000
dtype: float64


In [254]:
# Doing the same for XGB model
model = XGBClassifier(learning_rate= 0.01,max_depth = 3,n_estimators = 1000, use_label_encoder=False, eval_metric = 'logloss')
columns_to_scale = ['tenure', 'MonthlyCharges', 'TotalCharges']

metrics = {'precision': [], 'recall': [], 'f1_score': [], 'support': []}

for train_index, test_index in skf.split(X_train, y_train):
    X_train_fold, X_val_fold = X_train.iloc[train_index].copy(), X_train.iloc[test_index].copy()
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[test_index]

    scaler = MinMaxScaler()

    X_train_fold.loc[:, columns_to_scale] = scaler.fit_transform(X_train_fold[columns_to_scale])
    X_val_fold.loc[:, columns_to_scale] = scaler.transform(X_val_fold[columns_to_scale])

    smote = SMOTE(random_state=42)
    X_train_fold_smote, y_train_fold_smote = smote.fit_resample(X_train_fold, y_train_fold)

    cloned_model = clone(model)
    cloned_model.fit(X_train_fold_smote, y_train_fold_smote)
    y_pred = cloned_model.predict(X_val_fold)

    precision, recall, f1, support = precision_recall_fscore_support(y_val_fold, y_pred)

    metrics['precision'].append(precision)
    metrics['recall'].append(recall)
    metrics['f1_score'].append(f1)
    metrics['support'].append(support)

df_metrics = pd.DataFrame({
    'Precision_0': [m[0] for m in metrics['precision']],
    'Precision_1': [m[1] for m in metrics['precision']],
    'Recall_0': [m[0] for m in metrics['recall']],
    'Recall_1': [m[1] for m in metrics['recall']],
    'F1_0': [m[0] for m in metrics['f1_score']],
    'F1_1': [m[1] for m in metrics['f1_score']],
    'Support_0': [m[0] for m in metrics['support']],
    'Support_1': [m[1] for m in metrics['support']]
})

avg_metrics = df_metrics.mean()

print("Average Metrics for Class 0 and Class 1:")
print(avg_metrics)

Average Metrics for Class 0 and Class 1:
Precision_0      0.903532
Precision_1      0.506935
Recall_0         0.723539
Recall_1         0.785436
F1_0             0.803346
F1_1             0.615837
Support_0      827.600000
Support_1      299.200000
dtype: float64
