In [2]:
# Import of libraries
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report as CR
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

# Display names passed to the classification reports as `target_names`;
# ordered to match the `Attrited` labels 0 (customers) and 1 (attrited).
pred_classes = ['Customers', 'Attrited']

In [4]:
# Read the cleaned/mapped churn dataset, using the client id column as the index
df = pd.read_csv(
    "../data/BankChurners_CleanedMapped.csv",
    index_col="ClientID",
)
# (rows, columns) of the loaded frame is the cell output
df.shape

(10127, 20)

In [5]:
# Inspect the dtype of every column in the loaded frame
df.dtypes

Attrited                                 int64
Age                                      int64
Gender                                   int64
DependentCount                           int64
EducationLevel                           int64
MaritalStatus                            int64
IncomeCategory                           int64
CardCategory                             int64
RelationshipPeriod_InMonths              int64
TotalProductNo                           int64
MonthsInactive_Last12Months              int64
ContactNo_Last12Months                   int64
CreditLimit                            float64
TotalRevolvingBalance_CC                 int64
OpenToBuyAve_Last12Months              float64
TransactionAmount_ChangeQ4overQ1       float64
TotalTransactionAmount_Last12Months      int64
TotalTransactionCount_Last12Months       int64
TransactionCount_ChangeQ4overQ1        float64
AveCardUtilizationRatio                float64
dtype: object

In [6]:
# Partition the frame by target label: 1 = attrited, 0 = customer
attrited_mask = df["Attrited"] == 1
customer_mask = df["Attrited"] == 0
df_attrited = df[attrited_mask]
df_customers = df[customer_mask]

# Show the size of each class subset (attrited first, then customers)
for class_frame in (df_attrited, df_customers):
    display(class_frame.shape)

(1627, 20)

(8500, 20)

In [7]:
# Separate the feature columns from the target within each class subset
X_a, y_a = df_attrited.drop(columns='Attrited'), df_attrited['Attrited']
X_c, y_c = df_customers.drop(columns='Attrited'), df_customers['Attrited']

In [8]:
# Hold out 20% of each class separately (same seed for both) so that the
# train and test sets built from these pieces keep roughly the same class ratio
split_options = {"test_size": 0.2, "random_state": 1}
X_a_train, X_a_test, y_a_train, y_a_test = train_test_split(X_a, y_a, **split_options)
X_c_train, X_c_test, y_c_train, y_c_test = train_test_split(X_c, y_c, **split_options)

In [9]:
# Stack the per-class pieces (customers first, then attrited) into the final sets
X_train, X_test = pd.concat([X_c_train, X_a_train]), pd.concat([X_c_test, X_a_test])
y_train, y_test = pd.concat([y_c_train, y_a_train]), pd.concat([y_c_test, y_a_test])

In [10]:
# Standardise the features; the scaler is fitted on the training set only,
# so no statistics from the test set are used
scaler = StandardScaler()
scaler.fit(X_train)

# Apply the same fitted transform to both splits
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)


In [11]:
# Oversample the training set only (seeded for reproducibility);
# the test set is left untouched
ros = RandomOverSampler(random_state=1)
X_train_res, y_train_res = ros.fit_resample(X_train, y_train)

# Shapes before >> after resampling, features first and then labels
for before, after in ((X_train, X_train_res), (y_train, y_train_res)):
    print(before.shape, ">>", after.shape)

(8101, 19) >> (13600, 19)
(8101,) >> (13600,)


### Logistic regression

In [12]:
# Baseline logistic regression trained on the original (not resampled) training set
from sklearn.linear_model import LogisticRegression

lrClassifier = LogisticRegression(solver='lbfgs')
lrClassifier.fit(X_train, y_train)

# Mean accuracy on each split
lr_train_accuracy = lrClassifier.score(X_train, y_train)
lr_test_accuracy = lrClassifier.score(X_test, y_test)
print("Train: ", lr_train_accuracy)
print("Test:  ", lr_test_accuracy)

# Per-class precision / recall / F1 on the held-out set
y_predict = lrClassifier.predict(X_test)
report = CR(y_test, y_predict, target_names=pred_classes)
print("\n", report)

Train:  0.9086532526848538
Test:   0.8928923988153998

               precision    recall  f1-score   support

   Customers       0.91      0.96      0.94      1700
    Attrited       0.74      0.52      0.61       326

    accuracy                           0.89      2026
   macro avg       0.83      0.74      0.77      2026
weighted avg       0.88      0.89      0.89      2026



In [13]:
# Logistic regression trained on the randomly oversampled training set
lrClassifier_RES = LogisticRegression(solver='lbfgs').fit(X_train_res, y_train_res)

# Score. The model is evaluated on both the resampled and the original training
# data, so each line is labelled with the set it refers to (previously both
# were printed as "Train: " and could not be told apart in the output).
print("Train (resampled): ", lrClassifier_RES.score(X_train_res, y_train_res))
print("Train (original):  ", lrClassifier_RES.score(X_train, y_train))
print("Test:              ", lrClassifier_RES.score(X_test, y_test))

# Predict on the untouched (not resampled) test set
y_predict_r = lrClassifier_RES.predict(X_test)

# Per-class precision / recall / F1 report
report_r = CR(y_test, y_predict_r, target_names=pred_classes)
print("\n", report_r)

Train:  0.8522794117647059
Train:  0.8503888408838415
Test:   0.8509378084896347

               precision    recall  f1-score   support

   Customers       0.96      0.86      0.91      1700
    Attrited       0.52      0.81      0.64       326

    accuracy                           0.85      2026
   macro avg       0.74      0.83      0.77      2026
weighted avg       0.89      0.85      0.86      2026



### Random Forest

In [14]:
from sklearn.ensemble import RandomForestClassifier as RFC

# Hyper-parameters shared by both forests; only the class weighting differs
rf_shared_params = {"n_estimators": 100, "max_depth": 5, "random_state": 1}

# Plain forest on the original training set
rfClassifier = RFC(**rf_shared_params)
rfClassifier.fit(X_train, y_train)

# Same forest, but with class weights set to "balanced"
rfClassifier_Weighted = RFC(bootstrap=True, class_weight="balanced", **rf_shared_params)
rfClassifier_Weighted.fit(X_train, y_train)

# Mean accuracy per split: unweighted model first, then the weighted one
rf_eval_splits = (("Train: ", X_train, y_train), ("Test:  ", X_test, y_test))

for split_label, split_X, split_y in rf_eval_splits:
    print(split_label, rfClassifier.score(split_X, split_y))

print("class_weight='balanced'")
for split_label, split_X, split_y in rf_eval_splits:
    print(split_label, rfClassifier_Weighted.score(split_X, split_y))

# Test-set predictions and per-class reports, in the same model order
y_predict = rfClassifier.predict(X_test)
y_predict_w = rfClassifier_Weighted.predict(X_test)

report = CR(y_test, y_predict, target_names=pred_classes)
print("\n", report)

report_w = CR(y_test, y_predict_w, target_names=pred_classes)
print("\n", report_w)

Train:  0.9222318232316998
Test:   0.9146100691016782
class_weight='balanced'
Train:  0.9233427971855327
Test:   0.9062191510365252

               precision    recall  f1-score   support

   Customers       0.92      0.98      0.95      1700
    Attrited       0.86      0.56      0.68       326

    accuracy                           0.91      2026
   macro avg       0.89      0.77      0.81      2026
weighted avg       0.91      0.91      0.91      2026


               precision    recall  f1-score   support

   Customers       0.97      0.91      0.94      1700
    Attrited       0.66      0.87      0.75       326

    accuracy                           0.91      2026
   macro avg       0.82      0.89      0.85      2026
weighted avg       0.92      0.91      0.91      2026



In [15]:
# Random forest trained on the randomly oversampled training set
rfClassifier_RES = RFC(
    n_estimators=100, max_depth=5, random_state=1
).fit(X_train_res, y_train_res)

# The model is scored on both the resampled and the original training data,
# so each line is labelled with the set it refers to (previously both were
# printed as "Train: " and could not be told apart in the output).
print("Train (resampled): ", rfClassifier_RES.score(X_train_res, y_train_res))
print("Train (original):  ", rfClassifier_RES.score(X_train, y_train))
print("Test:              ", rfClassifier_RES.score(X_test, y_test))

# Predict on the untouched (not resampled) test set
y_predict_r = rfClassifier_RES.predict(X_test)

# Per-class precision / recall / F1 report
report_r = CR(y_test, y_predict_r, target_names=pred_classes)
print("\n", report_r)

Train:  0.9199264705882353
Train:  0.9065547463276139
Test:   0.8923988153998026

               precision    recall  f1-score   support

   Customers       0.97      0.90      0.93      1700
    Attrited       0.62      0.88      0.72       326

    accuracy                           0.89      2026
   macro avg       0.80      0.89      0.83      2026
weighted avg       0.92      0.89      0.90      2026

