In [130]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

pd.set_option("display.max_columns", None)

In [95]:
customer = pd.read_csv("telco-customer-churn.csv")

In [96]:
customer.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [97]:
customer["TotalCharges"] = pd.to_numeric(customer["TotalCharges"], errors="coerce")
customer["TotalCharges"].fillna(0, inplace=True)

In [98]:
target_map = {"No": 0, "Yes":1}
customer["Churn"] = customer["Churn"].map(target_map)

In [114]:
customer_copy = customer.copy()

customer_copy.drop(columns=["customerID"], inplace=True)
customer_copy["SeniorCitizen"] = customer_copy["SeniorCitizen"].astype("object")
customer_copy["PaymentMethod"] = customer_copy["PaymentMethod"].str.replace(" (automatic)", "")

features = customer_copy.drop("Churn", axis=1)
target  = customer_copy.Churn

features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.2, random_state=1)

In [115]:
features_train.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
1814,Male,0,Yes,Yes,12,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,19.7,258.35
5946,Female,0,No,No,42,Yes,No,DSL,Yes,Yes,Yes,Yes,No,Yes,One year,No,Credit card,73.9,3160.55
3881,Male,0,Yes,No,71,Yes,Yes,DSL,Yes,Yes,No,Yes,No,No,Two year,No,Bank transfer,65.15,4681.75
2389,Male,0,Yes,Yes,71,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,No,Electronic check,85.45,6300.85
3676,Male,0,No,No,30,Yes,No,DSL,Yes,Yes,No,Yes,Yes,No,One year,No,Electronic check,70.4,2044.75


In [123]:
cat_cols = features_train.select_dtypes(exclude=["float", "int"]).columns.to_list()
numeric_cols = features_train.select_dtypes(include=["float", "int"]).columns.to_list()

scaler = StandardScaler()
scaled_train = scaler.fit_transform(features_train[numeric_cols])
scaled_test = scaler.transform(features_test[numeric_cols])

scaled_train_df = pd.DataFrame(data=scaled_train, columns=numeric_cols)
scaled_test_df = pd.DataFrame(data=scaled_test, columns=numeric_cols)

scaled_train_df.head()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges
0,-0.825884,-1.49753,-0.890947
1,0.395961,0.302996,0.389693
2,1.577078,0.01232,1.060945
3,1.577078,0.686687,1.775397
4,-0.092777,0.186726,-0.102671


In [121]:
encoder = OneHotEncoder(sparse_output=False)

encoded_train = encoder.fit_transform(features_train[cat_cols])
encoded_test = encoder.fit_transform(features_test[cat_cols])

encoded_features = encoder.get_feature_names_out()

encoded_train_df = pd.DataFrame(encoded_train, columns=encoded_features)
encoded_test_df = pd.DataFrame(encoded_test, columns=encoded_features)

encoded_train_df.head()

Unnamed: 0,gender_Female,gender_Male,SeniorCitizen_0,SeniorCitizen_1,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,MultipleLines_No,MultipleLines_No phone service,MultipleLines_Yes,InternetService_DSL,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer,PaymentMethod_Credit card,PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
1,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
3,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [122]:
transformed_features_train = pd.concat([encoded_train_df, scaled_train_df], axis=1)
transformed_features_test = pd.concat([encoded_test_df, scaled_test_df], axis=1)

transformed_features_train.head()

Unnamed: 0,gender_Female,gender_Male,SeniorCitizen_0,SeniorCitizen_1,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,MultipleLines_No,MultipleLines_No phone service,MultipleLines_Yes,InternetService_DSL,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer,PaymentMethod_Credit card,PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure,MonthlyCharges,TotalCharges
0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,-0.825884,-1.49753,-0.890947
1,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.395961,0.302996,0.389693
2,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.577078,0.01232,1.060945
3,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.577078,0.686687,1.775397
4,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,-0.092777,0.186726,-0.102671


In [119]:
print(target_train.shape)
print(target_test.shape)

(5634,)
(1409,)


In [120]:
print(transformed_features_train.shape)
print(transformed_features_test.shape)

(5634, 46)
(1409, 46)


In [131]:
rf = RandomForestClassifier(random_state=1)
rf.fit(transformed_features_train, target_train)

rf_preds = rf.predict(transformed_features_test)

print(classification_report(target_test, rf_preds))
print(rf.score(transformed_features_test, target_test))

              precision    recall  f1-score   support

           0       0.85      0.88      0.87      1061
           1       0.60      0.54      0.57       348

    accuracy                           0.80      1409
   macro avg       0.73      0.71      0.72      1409
weighted avg       0.79      0.80      0.79      1409

0.7977288857345636


In [132]:
et = ExtraTreesClassifier(random_state=1)
et.fit(transformed_features_train, target_train)

et_preds = et.predict(transformed_features_test)

print(classification_report(target_test, et_preds))
print(et.score(transformed_features_test, target_test))

              precision    recall  f1-score   support

           0       0.84      0.87      0.85      1061
           1       0.55      0.49      0.52       348

    accuracy                           0.78      1409
   macro avg       0.69      0.68      0.69      1409
weighted avg       0.77      0.78      0.77      1409

0.7750177430801988


In [133]:
xgclf = XGBClassifier(random_state=1)
xgclf.fit(transformed_features_train, target_train)

xgclf_preds = xgclf.predict(transformed_features_test)

print(classification_report(target_test, xgclf_preds))
print(xgclf.score(transformed_features_test, target_test))

              precision    recall  f1-score   support

           0       0.86      0.87      0.86      1061
           1       0.59      0.56      0.57       348

    accuracy                           0.79      1409
   macro avg       0.72      0.71      0.72      1409
weighted avg       0.79      0.79      0.79      1409

0.7934705464868701


In [134]:
lgbmclf = LGBMClassifier(random_state=1)
lgbmclf.fit(transformed_features_train, target_train)

lgbmclf_preds = lgbmclf.predict(transformed_features_test)

print(classification_report(target_test, lgbmclf_preds))
print(lgbmclf.score(transformed_features_test, target_test))

[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001156 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785
              precision    recall  f1-score   support

           0       0.87      0.87      0.87      1061
           1       0.61      0.59      0.60       348

    accuracy                           0.80      1409
   macro avg       0.74      0.73      0.73      1409
weighted avg       0.80      0.80      0.80      1409

0.8034066713981547


In [156]:
from sklearn.model_selection import RandomizedSearchCV
import warnings
warnings.filterwarnings("ignore")

n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
max_features = ['auto', 'sqrt', 'log2', None] 
hyperparameter_grid = {'n_estimators': n_estimators, 'min_samples_leaf': min_samples_leaf, 'min_samples_split': min_samples_split, 'max_features': max_features}

rcv = RandomizedSearchCV(et, param_distributions=hyperparameter_grid, cv=5, n_iter=10, scoring = 'accuracy', n_jobs = -1, verbose = 1, random_state = 1)
rcv.fit(transformed_features_train, target_train)

best_params = rcv.best_params_
print("The best parametes for the Randomized Search CV are = {}".format(best_params))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
The best parametes for the Randomized Search CV are = {'n_estimators': 1000, 'min_samples_split': 9, 'min_samples_leaf': 8, 'max_features': 'sqrt'}


In [153]:
best_score = rcv.best_score_
print("The best score of the Randomized Search CV = {}".format(best_score))

The best score of the Randomized Search CV = 0.7930413033233991


In [159]:
features_importance = pd.Series(et.feature_importances_, index=et.feature_names_in_).sort_values(ascending=False)
features_importance

TotalCharges                            0.126810
tenure                                  0.121773
MonthlyCharges                          0.107202
Contract_Month-to-month                 0.052932
OnlineSecurity_No                       0.031081
TechSupport_No                          0.029318
PaymentMethod_Electronic check          0.027804
Contract_Two year                       0.025307
InternetService_Fiber optic             0.025090
gender_Male                             0.022150
gender_Female                           0.021540
OnlineBackup_No                         0.018078
Partner_No                              0.017530
Partner_Yes                             0.017448
DeviceProtection_No                     0.017218
PaperlessBilling_No                     0.016795
Contract_One year                       0.016706
PaymentMethod_Credit card               0.016155
MultipleLines_No                        0.016032
MultipleLines_Yes                       0.015904
PaperlessBilling_Yes