## Import Libraries

In [1]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN
import warnings
warnings.filterwarnings("ignore") 

## Read the dataset

In [2]:
df = pd.read_csv("cleandata.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,...,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_12 - 23,tenure_group_24 - 35,tenure_group_36 - 47,tenure_group_48 - 59,tenure_group_60 - 71,tenure_group_72 - 72
0,0,0,29.85,29.85,0,0,1,0,0,1,...,1,0,1,0,0,0,0,0,0,0
1,1,0,56.95,1889.5,0,1,0,0,1,0,...,0,0,0,1,0,1,0,0,0,0
2,2,0,53.85,108.15,1,1,0,0,1,0,...,1,0,0,1,0,0,0,0,0,0
3,3,0,42.3,1840.75,0,1,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4,4,0,70.7,151.65,1,0,0,0,1,0,...,1,0,1,0,0,0,0,0,0,0


## Drop the 'Unnamed: 0' column (not required in the analysis)

In [4]:
df.drop('Unnamed: 0', axis=1, inplace=True)

In [5]:
df.head()

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,...,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_12 - 23,tenure_group_24 - 35,tenure_group_36 - 47,tenure_group_48 - 59,tenure_group_60 - 71,tenure_group_72 - 72
0,0,29.85,29.85,0,0,1,0,0,1,0,...,1,0,1,0,0,0,0,0,0,0
1,0,56.95,1889.5,0,1,0,0,1,0,0,...,0,0,0,1,0,1,0,0,0,0
2,0,53.85,108.15,1,1,0,0,1,0,0,...,1,0,0,1,0,0,0,0,0,0
3,0,42.3,1840.75,0,1,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
4,0,70.7,151.65,1,0,0,0,1,0,0,...,1,0,1,0,0,0,0,0,0,0


## Separate features (x) and target (y)

In [6]:
x = df.drop('Churn', axis=1)
y = df['Churn']

## Split the data into training and testing sets:
* Split the data into 70% training and 30% testing sets

In [7]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3)

## Decision Tree classifier

In [8]:
model = DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

## Train the Decision Tree classifier on the training data

In [9]:
model.fit(x_train,y_train)

## Make predictions on the test set:

In [10]:
predict = model.predict(x_test)
predict

array([1, 0, 1, ..., 0, 0, 0], dtype=int64)

## Print classification report

In [11]:
# Print accuracy score
accuracy = accuracy_score(y_test, predict)
print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:\n", classification_report(y_test, predict))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, predict))

Accuracy: 0.7841930903928065
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.89      0.86      1557
           1       0.61      0.49      0.54       556

    accuracy                           0.78      2113
   macro avg       0.72      0.69      0.70      2113
weighted avg       0.77      0.78      0.78      2113

Confusion Matrix:
 [[1384  173]
 [ 283  273]]


As you can see that the accuracy is quite low, and as it's an imbalanced dataset, we shouldn't consider Accuracy as our metrics to measure the model, as Accuracy is cursed in imbalanced datasets. Hence, we need to check recall, precision & f1 score for the minority class, and it's quite evident that the precision, recall & f1 score is too low for Class 1, i.e. churned customers.

## Test Different Classifier

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

## Random Forest

In [13]:
model_rf = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=6, min_samples_leaf=8)
model_rf.fit(x_train, y_train)

## Gradient Boosting Machine

In [14]:
model_gbm = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3)
model_gbm.fit(x_train, y_train)

## K-Nearest Neighbors

In [15]:
model_knn = KNeighborsClassifier(n_neighbors=5)
model_knn.fit(x_train, y_train)

## SVM

In [16]:

model_svm = SVC(kernel='rbf', C=1.0, probability=True)
model_svm.fit(x_train, y_train)

## Evaluate Classifiers:

In [17]:
models = [model_rf, model_gbm, model_knn, model_svm]
model_names = ['Random Forest', 'GBM',  'KNN', 'SVM']

for model, name in zip(models, model_names):
    y_pred = model.predict(x_test)
    accuracy = model.score(x_test, y_test)
    print(f"Classifier: {name}")
    print(f"Accuracy: {accuracy:.2f}")
    print(metrics.classification_report(y_test, y_pred))
    print("------------")

Classifier: Random Forest
Accuracy: 0.79
              precision    recall  f1-score   support

           0       0.81      0.93      0.87      1557
           1       0.69      0.40      0.51       556

    accuracy                           0.79      2113
   macro avg       0.75      0.67      0.69      2113
weighted avg       0.78      0.79      0.77      2113

------------
Classifier: GBM
Accuracy: 0.81
              precision    recall  f1-score   support

           0       0.84      0.91      0.88      1557
           1       0.68      0.53      0.60       556

    accuracy                           0.81      2113
   macro avg       0.76      0.72      0.74      2113
weighted avg       0.80      0.81      0.80      2113

------------
Classifier: KNN
Accuracy: 0.78
              precision    recall  f1-score   support

           0       0.82      0.90      0.86      1557
           1       0.61      0.45      0.52       556

    accuracy                           0.78      2113

here, among all the classifiers random forest, gradient boosting, KNN is perfornming better. As, we already know, it is imbalanced data, so that we will balanced the dataset with upsampling technique using smooteen. and then I will see how random forest, GBM, KNN perform.

## Apply SMOTEENN

In [18]:
sm = SMOTEENN()
x_resampled, y_resampled = sm.fit_resample(x_train, y_train)

## Resample data

In [19]:
xr_train,xr_test,yr_train,yr_test=train_test_split(x_resampled, y_resampled,test_size=0.3)

## Decission tree after resampling

In [20]:
model_smote = DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)


In [21]:
model_smote.fit(xr_train,yr_train)

yr_predict = model_smote.predict(xr_test)

model_score_r = model_smote.score(xr_test, yr_test)

print(round(model_score_r, 2))
print(metrics.classification_report(yr_test, yr_predict))
print(metrics.confusion_matrix(yr_test, yr_predict))

0.92
              precision    recall  f1-score   support

           0       0.94      0.88      0.91       531
           1       0.91      0.96      0.93       668

    accuracy                           0.92      1199
   macro avg       0.93      0.92      0.92      1199
weighted avg       0.92      0.92      0.92      1199

[[469  62]
 [ 29 639]]


Now we can see quite better results, i.e. Accuracy: 92 %, and a very good recall, precision & f1 score for minority class.

as we have seen random forest did a better perform above even without smoteenn, lets's see how it performed after smoteen

## Random forest after resampling data

In [22]:
model_rf_smote = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=6, min_samples_leaf=8)

In [23]:
model_rf_smote.fit(xr_train,yr_train)

yr_predict = model_rf_smote.predict(xr_test)

model_score_r = model_rf_smote.score(xr_test, yr_test)

print(round(model_score_r, 2))
print(metrics.classification_report(yr_test, yr_predict))
print(metrics.confusion_matrix(yr_test, yr_predict))

0.93
              precision    recall  f1-score   support

           0       0.95      0.89      0.92       531
           1       0.92      0.97      0.94       668

    accuracy                           0.93      1199
   macro avg       0.94      0.93      0.93      1199
weighted avg       0.93      0.93      0.93      1199

[[473  58]
 [ 23 645]]


With RF Classifier, also we are able to get quite good results, infact better than Decision Tree.

## Gradient Boosting Classifier after resampling data

In [24]:
model_gbm_smote = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5)

In [25]:
model_gbm_smote.fit(xr_train, yr_train)

yr_predict = model_gbm_smote.predict(xr_test)

model_score_r = model_gbm_smote.score(xr_test, yr_test)

print(round(model_score_r, 2))
print(metrics.classification_report(yr_test, yr_predict))
print(metrics.confusion_matrix(yr_test, yr_predict))

0.95
              precision    recall  f1-score   support

           0       0.96      0.94      0.95       531
           1       0.95      0.97      0.96       668

    accuracy                           0.95      1199
   macro avg       0.95      0.95      0.95      1199
weighted avg       0.95      0.95      0.95      1199

[[499  32]
 [ 23 645]]


## K-Nearest Neighbors after resampling

In [26]:
model_knn_smote = KNeighborsClassifier(n_neighbors=5)

In [27]:
model_knn_smote.fit(xr_train,yr_train)

yr_predict = model_knn_smote.predict(xr_test)

model_score_r = model_knn_smote.score(xr_test, yr_test)

print(round(model_score_r, 2))
print(metrics.classification_report(yr_test, yr_predict))
print(metrics.confusion_matrix(yr_test, yr_predict))

0.96
              precision    recall  f1-score   support

           0       0.96      0.95      0.96       531
           1       0.96      0.97      0.96       668

    accuracy                           0.96      1199
   macro avg       0.96      0.96      0.96      1199
weighted avg       0.96      0.96      0.96      1199

[[507  24]
 [ 23 645]]
