In [100]:
pip install lightgbm

Note: you may need to restart the kernel to use updated packages.


In [101]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [102]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, roc_auc_score



In [103]:
df = pd.read_csv("Customer_churn.csv")

In [104]:
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [105]:
print("Dataset info:\n")
print(df.info())
print("\n Class Distribution: \n")
print(df['Churn'].value_counts())
print("\n Sample Data:\n")
print(df.head())

Dataset info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-

In [106]:
df.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [107]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors = 'coerce')
df.fillna({'TotalCharges': df['TotalCharges'].median()}, inplace = True)

In [108]:
#Encode categorical variables
label_encoder = LabelEncoder()
for i in df.select_dtypes(include=['object']).columns:
    if i !='Churn':
        df[i] = label_encoder.fit_transform(df[i])
        
        
        

In [109]:
#Enode target variable
scaler = StandardScaler()
numerical_features = ['tenure','MonthlyCharges','TotalCharges']
df[numerical_features]= scaler.fit_transform(df[numerical_features])

In [110]:
#split dataset
x = df.drop(columns= ['Churn'])
y = df['Churn']

In [111]:
#Using train test split to split dataset
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size = 0.2, random_state =42)

In [112]:
le = LabelEncoder()
y_test_encoded = le.fit_transform(y_test)
y_train = le.fit_transform(y_train)

In [113]:
#SMOTE application
smote = SMOTE(random_state=42)
x_train_up,y_train_up = smote.fit_resample(x_train,y_train)

In [114]:
#Display sampling distribution after SMOTE
print("\n Class Distribution after SMOTE: \n")
print(pd.Series(y_train_up).value_counts())


 Class Distribution after SMOTE: 

0    4138
1    4138
Name: count, dtype: int64


In [115]:
y_train

array([0, 0, 1, ..., 0, 1, 0])

In [116]:
# Training RandomForest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(x_train_up,y_train_up)
y_score = rf_model.predict(x_test)
rf_roc_auc = roc_auc_score(y_test_encoded, y_score)


In [117]:
#Train XGBoost
xgb_model = XGBClassifier(eval_metric = 'logloss',random_state=42)
xgb_model.fit(x_train_up,y_train_up)
y_score_xgb = xgb_model.predict(x_test)
xgb_roc_auc = roc_auc_score(y_test_encoded, y_score_xgb)

In [118]:
#Train Lightgbm
lgb_model = LGBMClassifier(random_state=42)
lgb_model.fit(x_train_up,y_train_up)
y_score_lgb = lgb_model.predict(x_test)
lgb_roc_auc = roc_auc_score(y_test_encoded, y_score_lgb)

[LightGBM] [Info] Number of positive: 4138, number of negative: 4138
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000996 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1062
[LightGBM] [Info] Number of data points in the train set: 8276, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [119]:
#Classification Reports
print("Random Forest Report:\n",classification_report(y_test_encoded, y_score))
print("XGBoost Report:\n", classification_report(y_test_encoded, y_score_xgb))
print("LightGBM Report:\n", classification_report(y_test_encoded, y_score_lgb))

Random Forest Report:
               precision    recall  f1-score   support

           0       0.85      0.85      0.85      1036
           1       0.59      0.59      0.59       373

    accuracy                           0.78      1409
   macro avg       0.72      0.72      0.72      1409
weighted avg       0.78      0.78      0.78      1409

XGBoost Report:
               precision    recall  f1-score   support

           0       0.86      0.86      0.86      1036
           1       0.61      0.61      0.61       373

    accuracy                           0.79      1409
   macro avg       0.74      0.73      0.74      1409
weighted avg       0.79      0.79      0.79      1409

LightGBM Report:
               precision    recall  f1-score   support

           0       0.86      0.87      0.87      1036
           1       0.63      0.61      0.62       373

    accuracy                           0.80      1409
   macro avg       0.74      0.74      0.74      1409
weighted avg    

In [121]:
print("ROC-AUC Scores:\n")
print(f"Random Forest:{rf_roc_auc:.2f}")
print(f"XGBoost: {xgb_roc_auc:.2f}")
print(f"LightGBM: {lgb_roc_auc:.2f}")

ROC-AUC Scores:

Random Forest:0.72
XGBoost: 0.73
LightGBM: 0.74
