In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

rawdata = pd.read_csv('https://raw.githubusercontent.com/yuezheli/BankChurnerPred/main/BankChurners.csv', header = 0)


# label existing customers as 0
rawdata.replace('Existing Customer', 0, regex=True, inplace = True)
# label attrited customers as 1
rawdata.replace('Attrited Customer', 1, regex=True, inplace = True)


# drop unnecessary categorical variables

rawdata.drop(['CLIENTNUM', 'Education_Level', 'Marital_Status', 'Income_Category',
              'Card_Category'], axis = 1, inplace = True)

# binary encode of gender
rawdata.replace('M', 0, regex=True, inplace = True) # male = 0
rawdata.replace('F', 1, regex=True, inplace = True) # female = 0

# drop more unrelated variable
rawdata.drop(['Customer_Age', 'Gender', 'Dependent_count','Months_on_book', 
                               'Credit_Limit', 'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Ct_Chng_Q4_Q1'], axis = 1, inplace = True)


rawdata.head()



Unnamed: 0,Attrition_Flag,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Total_Revolving_Bal,Total_Trans_Amt,Total_Trans_Ct,Avg_Utilization_Ratio
0,0,5,1,3,777,1144,42,0.061
1,0,6,1,2,864,1291,33,0.105
2,0,4,1,0,0,1887,20,0.0
3,0,3,4,1,2517,1171,20,0.76
4,0,5,1,0,0,816,28,0.0


In [4]:
# prepare training and testing datasets

attrition_label = rawdata['Attrition_Flag']
attrition_data = rawdata.drop(['Attrition_Flag'], axis = 1)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(attrition_data, attrition_label, test_size=0.2,random_state=109) 

In [5]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[1658   45]
 [ 114  209]]
              precision    recall  f1-score   support

           0       0.94      0.97      0.95      1703
           1       0.82      0.65      0.72       323

    accuracy                           0.92      2026
   macro avg       0.88      0.81      0.84      2026
weighted avg       0.92      0.92      0.92      2026



In [11]:
print("Accuracy:",accuracy_score(y_test, y_pred))
print("Precision:",precision_score(y_test, y_pred)) # little falso positive
print("Recall:",recall_score(y_test, y_pred)) # too many false negative

Accuracy: 0.9215202369200395
Precision: 0.8228346456692913
Recall: 0.6470588235294118


## search for optimal neighbor number using training dataset

In [14]:
from sklearn import metrics


def ScanNeighborNum(neighbornum = 5, X_train = X_train, y_train = y_train):
    classifier = KNeighborsClassifier(n_neighbors=neighbornum)
    classifier.fit(X_train, y_train)
    
    train_pred = classifier.predict(X_train)
    
    accurary = metrics.accuracy_score(y_train, train_pred)
    precision = metrics.precision_score(y_train, train_pred)
    recall = metrics.recall_score(y_train, train_pred)
    
    return accurary, precision, recall

    

In [15]:
Accuracy=[]
Precision= []
Recall = []

for neignum in range(10):
    taccu, tprec, trecall = ScanNeighborNum(neignum + 1)
    Accuracy.append(taccu)
    Precision.append(tprec)
    Recall.append(trecall)
    
    del taccu, tprec, trecall 
    


In [22]:
combined = pd.DataFrame()
combined['neighbor #'] = np.array(range(10))
combined['Accuracy'] = Accuracy
combined['Precision'] = Precision
combined['Recall'] = Recall

combined

Unnamed: 0,neighbor #,Accuracy,Precision,Recall
0,0,1.0,1.0,1.0
1,1,0.951858,1.0,0.70092
2,2,0.951981,0.885425,0.805982
3,3,0.942229,0.934511,0.689417
4,4,0.945933,0.883186,0.765337
5,5,0.936057,0.903491,0.674847
6,6,0.937785,0.863636,0.728528
7,7,0.933465,0.889908,0.669479
8,8,0.935934,0.860422,0.718558
9,9,0.931984,0.882234,0.666411


In [24]:
classifier = KNeighborsClassifier(n_neighbors=4)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.98      0.95      1703
           1       0.85      0.56      0.68       323

    accuracy                           0.91      2026
   macro avg       0.89      0.77      0.81      2026
weighted avg       0.91      0.91      0.91      2026

