In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objs as go
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the dataset
df = pd.read_csv('../../datasets/BankChurners.csv')
df.head()

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,...,12691.0,777,11914.0,1.335,1144,42,1.625,0.061,9.3e-05,0.99991
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,...,8256.0,864,7392.0,1.541,1291,33,3.714,0.105,5.7e-05,0.99994
2,713982108,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,...,3418.0,0,3418.0,2.594,1887,20,2.333,0.0,2.1e-05,0.99998
3,769911858,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,...,3313.0,2517,796.0,1.405,1171,20,2.333,0.76,0.000134,0.99987
4,709106358,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,...,4716.0,0,4716.0,2.175,816,28,2.5,0.0,2.2e-05,0.99998


In [7]:
# Exploratory data analysis using Plotly
fig = px.histogram(df, x='Attrition_Flag', color='Gender')
fig.show()

In [8]:
# Scatter plot of credit limit vs total transaction amount
fig = px.scatter(df, x="Credit_Limit", y="Total_Trans_Amt", color="Attrition_Flag")
fig.show()


In [9]:
# Preprocess the data
X = df.drop(['CLIENTNUM', 'Attrition_Flag'], axis=1)
X = X.iloc[:,:-2]
X.head()

Unnamed: 0,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,45,M,3,High School,Married,$60K - $80K,Blue,39,5,1,3,12691.0,777,11914.0,1.335,1144,42,1.625,0.061
1,49,F,5,Graduate,Single,Less than $40K,Blue,44,6,1,2,8256.0,864,7392.0,1.541,1291,33,3.714,0.105
2,51,M,3,Graduate,Married,$80K - $120K,Blue,36,4,1,0,3418.0,0,3418.0,2.594,1887,20,2.333,0.0
3,40,F,4,High School,Unknown,Less than $40K,Blue,34,3,4,1,3313.0,2517,796.0,1.405,1171,20,2.333,0.76
4,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,5,1,0,4716.0,0,4716.0,2.175,816,28,2.5,0.0


In [10]:
#Prepare the target
y = df['Attrition_Flag'].replace({'Existing Customer': 0, 'Attrited Customer': 1})
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Attrition_Flag, dtype: int64

In [11]:
#Use get dummies
X = pd.get_dummies(X, columns=['Gender', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category'])
X.head()

Unnamed: 0,Customer_Age,Dependent_count,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,...,Income_Category_$120K +,Income_Category_$40K - $60K,Income_Category_$60K - $80K,Income_Category_$80K - $120K,Income_Category_Less than $40K,Income_Category_Unknown,Card_Category_Blue,Card_Category_Gold,Card_Category_Platinum,Card_Category_Silver
0,45,3,39,5,1,3,12691.0,777,11914.0,1.335,...,0,0,1,0,0,0,1,0,0,0
1,49,5,44,6,1,2,8256.0,864,7392.0,1.541,...,0,0,0,0,1,0,1,0,0,0
2,51,3,36,4,1,0,3418.0,0,3418.0,2.594,...,0,0,0,1,0,0,1,0,0,0
3,40,4,34,3,4,1,3313.0,2517,796.0,1.405,...,0,0,0,0,1,0,1,0,0,0
4,40,3,21,5,1,0,4716.0,0,4716.0,2.175,...,0,0,1,0,0,0,1,0,0,0


In [12]:
def gen_cm_plotly(cm):
    # calculate various counts and rates
    # compute success percentages and format as strings with percentage symbol
    total_predictions = cm.sum(axis=1)
    success_percentages = np.zeros_like(cm, dtype=float)
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            if total_predictions[i] > 0:
                success_percentages[i,j] = cm[i,j] / total_predictions[i] * 100
    success_percentages = np.char.add(success_percentages.round(1).astype(str), '%')

    # create text labels for each cell
    text = [[f"Count: {cm[j][i]:,}<br>Success: {success_percentages[j][i]}" for i in range(len(cm[j]))] for j in range(len(cm))]

    # create heatmap figure
    fig = go.Figure(
        data=go.Heatmap(
            z=cm,
            x=['No churn', 'Churn'],
            y=['No churn', 'Churn'],
            colorscale='Blues', 
            text=text,
            hovertemplate='%{text}<extra></extra>'
            #hovertemplate='True: %{y}<br>Predicted: %{x}<br>Count: %{z}<br>Percentage: %{text}',
        )
    )

    # add annotations as percentages to heatmap
    for i in range(len(cm)):
        for j in range(len(cm)):
            fig.add_annotation(x=j, y=i, text=success_percentages[i][j], showarrow=False, font=dict(color='black', size=12))

    # set layout and show figure
    fig.update_layout(
        title='Logistic Regression Confusion Matrix',
        font=dict(color='black'),
    )
    fig.update_layout(
        xaxis_title='Predicted Event',
        yaxis_title='True Event',
        font=dict(
            size=14,
            color='black'
        )
    )
    fig.show()


In [13]:
#Split the data in train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Train and evaluate models
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)
print('Logistic Regression Accuracy:', accuracy_score(y_test, lr_pred))
lr_cm = confusion_matrix(y_test, lr_pred)
gen_cm_plotly(lr_cm)

Logistic Regression Accuracy: 0.8928923988153998


In [14]:
from sklearn.naive_bayes import GaussianNB
# Instantiate Gaussian Naive Bayes model
nb = GaussianNB()

# Train the model on the training set
nb.fit(X_train, y_train)

# Predict the target variable on the test set
nb_pred = nb.predict(X_test)

# Evaluate the performance of the model
print('Naive Bayes Accuracy:', accuracy_score(y_test, nb_pred))
print('Naive Bayes F1 Score:', f1_score(y_test, nb_pred))

nb_cm = confusion_matrix(y_test, nb_pred)
gen_cm_plotly(nb_cm)


Naive Bayes Accuracy: 0.9002961500493584
Naive Bayes F1 Score: 0.6622073578595318


In [15]:
# Train and evaluate models
rf = RandomForestClassifier(n_estimators=300, max_depth=30)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
print('Random Forest Accuracy:', accuracy_score(y_test, rf_pred))
print('Random Forest F1 Score:', f1_score(y_test, rf_pred))

rf_cm = confusion_matrix(y_test, rf_pred)
gen_cm_plotly(rf_cm)


Random Forest Accuracy: 0.9526159921026653
Random Forest F1 Score: 0.8285714285714285


In [16]:
from sklearn.ensemble import GradientBoostingClassifier
# Instantiate GBM model
gbm_model = GradientBoostingClassifier(n_estimators=600, max_depth=12, learning_rate=0.1, random_state=0)

# Train the model on the training set
gbm_model.fit(X_train, y_train)

# Predict the target variable on the test set
gbm_pred = gbm_model.predict(X_test)

# Evaluate the performance of the model
print('GBM Accuracy:', accuracy_score(y_test, gbm_pred))
print('GBM F1 Score:', f1_score(y_test, gbm_pred))

gbm_cm = confusion_matrix(y_test, gbm_pred)
gen_cm_plotly(gbm_cm)

GBM Accuracy: 0.9639684106614018
GBM F1 Score: 0.8785357737104824


In [17]:
import xgboost as xgb
# Instantiate XGBoost model
xgb_model = xgb.XGBClassifier(n_estimators=300, max_depth=6, learning_rate=0.1, random_state=0)

# Train the model on the training set
xgb_model.fit(X_train, y_train)

# Predict the target variable on the test set
xgb_pred = xgb_model.predict(X_test)

# Evaluate the performance of the model
print('XGBoost Accuracy:', accuracy_score(y_test, xgb_pred))
print('XGBoost F1 Score:', f1_score(y_test, xgb_pred))

xgb_cm = confusion_matrix(y_test, xgb_pred)
gen_cm_plotly(xgb_cm)

XGBoost Accuracy: 0.9748272458045409
XGBoost F1 Score: 0.9145728643216081
