In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

rawdata = pd.read_csv('https://raw.githubusercontent.com/yuezheli/BankChurnerPred/main/BankChurners.csv', header = 0)

rawdata.iloc[0:1,:]

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,...,1,3,12691.0,777,11914.0,1.335,1144,42,1.625,0.061


In [43]:
# preprocessing catergorical data into a continuous variable

from sklearn import preprocessing

enc = preprocessing.OneHotEncoder(handle_unknown='ignore')

enc.fit(rawdata.Gender.unique().reshape(-1, 1))

print(enc.categories_)

generencode = pd.DataFrame(enc.transform(np.array(rawdata['Gender']).reshape(-1,1)).toarray(), columns = ['isFemale','isMale'])

rawdata['isFemale'] = generencode['isFemale']

rawdata['isMale'] = generencode['isMale']


rawdata.iloc[0:1,:]

[array(['F', 'M'], dtype=object)]


Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,isFemale,isMale
0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,...,12691.0,777,11914.0,1.335,1144,42,1.625,0.061,0.0,1.0


In [49]:
# label existing customers as 0
rawdata.replace('Existing Customer', 0, regex=True, inplace = True)
# label attrited customers as 1
rawdata.replace('Attrited Customer', 1, regex=True, inplace = True)


In [51]:
attrition_label = rawdata['Attrition_Flag']
print(attrition_label)

# use only the minimal set from the previous analysis for classification
attrition_data =  rawdata[['Total_Relationship_Count', 'Months_Inactive_12_mon', 'Contacts_Count_12_mon', 'Total_Revolving_Bal',
                         'Total_Trans_Amt','Total_Trans_Ct','Avg_Utilization_Ratio']]
attrition_data.head()
# attrition_data = rawdata.drop(['CLIENTNUM', 'Attrition_Flag'], axis = 1)

0        0
1        0
2        0
3        0
4        0
        ..
10122    0
10123    1
10124    1
10125    1
10126    1
Name: Attrition_Flag, Length: 10127, dtype: int64


Unnamed: 0,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Total_Revolving_Bal,Total_Trans_Amt,Total_Trans_Ct,Avg_Utilization_Ratio
0,5,1,3,777,1144,42,0.061
1,6,1,2,864,1291,33,0.105
2,4,1,0,0,1887,20,0.0
3,3,4,1,2517,1171,20,0.76
4,5,1,0,0,816,28,0.0


In [52]:
# prepare training and testing datasets

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(attrition_data, attrition_label, test_size=0.2,random_state=109) 

In [61]:
# create svm model and test it out

from sklearn import svm
from sklearn import metrics


clf = svm.SVC(kernel='linear') # Linear Kernel
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)


print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred)) # too many false positive
print("Recall:",metrics.recall_score(y_test, y_pred)) # too many false negative

Accuracy: 0.8642645607107601
Precision: 0.5736196319018405
Recall: 0.5789473684210527


In [60]:
# add gender to dataset to see whether there is an improvement in recall

addeddata = rawdata[['Total_Relationship_Count', 'Months_Inactive_12_mon', 'Contacts_Count_12_mon', 'Total_Revolving_Bal',
                         'Total_Trans_Amt','Total_Trans_Ct','Avg_Utilization_Ratio', 'isFemale', 'isMale']]

X_train2, X_test2, y_train2, y_test2 = train_test_split(addeddata, attrition_label, test_size=0.2,random_state=109) # 70% training and 30% test

clf2 = svm.SVC(kernel='linear') # Linear Kernel

clf2.fit(X_train2, y_train2)

#Predict the response for test dataset
y_pred2 = clf2.predict(X_test2)

print("Accuracy:",metrics.accuracy_score(y_test2, y_pred2))
print("Precision:",metrics.precision_score(y_test2, y_pred2))
print("Recall:",metrics.recall_score(y_test2, y_pred2)) 


Accuracy: 0.8726554787759131
Precision: 0.6038338658146964
Recall: 0.5851393188854489


In [83]:
# see whether recall can be improved by balancing 2 status in the training sample

attrited =  rawdata[rawdata['Attrition_Flag'] == 1] 
existing =  rawdata[rawdata['Attrition_Flag'] == 0]

import random
random.seed(0)
sampl_attr = random.sample(range(len(attrited)), k=int(np.floor(len(attrited) * 0.9)) )
sampl_exis = random.sample(range(len(existing)), k=int(np.floor(len(attrited) * 0.9 * 3)) )

# create a balanced training dataset

tmpa = attrited.iloc[sampl_attr]
tmpe = existing.iloc[sampl_exis]
tmpc = pd.concat([tmpa, tmpe])

X_train3 = tmpc[['Total_Relationship_Count', 'Months_Inactive_12_mon', 'Contacts_Count_12_mon', 'Total_Revolving_Bal',
                         'Total_Trans_Amt','Total_Trans_Ct','Avg_Utilization_Ratio', 'isFemale', 'isMale']]
y_train3 = tmpc['Attrition_Flag']

del tmpa, tmpe, tmpc


tmpa = attrited.iloc[~attrited.index.isin(sampl_attr)]
tmpe = existing.iloc[~existing.index.isin(sampl_exis)]
tmpc = pd.concat([tmpa, tmpe])

X_test3 = tmpc[['Total_Relationship_Count', 'Months_Inactive_12_mon', 'Contacts_Count_12_mon', 'Total_Revolving_Bal',
                         'Total_Trans_Amt','Total_Trans_Ct','Avg_Utilization_Ratio', 'isFemale', 'isMale']]
y_test3 = tmpc['Attrition_Flag']


In [84]:
clf3 = svm.SVC(kernel='linear') # Linear Kernel

clf3.fit(X_train3, y_train3)

#Predict the response for test dataset
y_pred3 = clf3.predict(X_test3)

print("Accuracy:",metrics.accuracy_score(y_test3, y_pred3))
print("Precision:",metrics.precision_score(y_test3, y_pred3)) # still lots of false positive
print("Recall:",metrics.recall_score(y_test3, y_pred3)) 


Accuracy: 0.8418953414167198
Precision: 0.6568265682656826
Recall: 0.7115256495669554


In [None]:
# linear kernal with different parameter

clf3 = svm.SVC(kernel='linear', gamma=2) 
clf3.fit(X_train3, y_train3)
y_pred3 = clf3.predict(X_test3)

print("Accuracy:",metrics.accuracy_score(y_test3, y_pred3))
print("Precision:",metrics.precision_score(y_test3, y_pred3)) # still lots of false positive
print("Recall:",metrics.recall_score(y_test3, y_pred3)) 



Accuracy: 0.8418953414167198
Precision: 0.6568265682656826
Recall: 0.7115256495669554


In [None]:
# polynomial kernel

clf4 = svm.SVC(kernel='poly', gamma=2) 
clf4.fit(X_train3, y_train3)
y_pred4 = clf4.predict(X_test3)

print("Accuracy:",metrics.accuracy_score(y_test3, y_pred4))
print("Precision:",metrics.precision_score(y_test3, y_pred4)) # still lots of false positive
print("Recall:",metrics.recall_score(y_test3, y_pred4)) 

# rbf kernel

clf5 = svm.SVC(kernel='rbf', gamma=2)
clf5.fit(X_train3, y_train3)
y_pred5 = clf5.predict(X_test3)

print("Accuracy:",metrics.accuracy_score(y_test3, y_pred5))
print("Precision:",metrics.precision_score(y_test3, y_pred5)) # still lots of false positive
print("Recall:",metrics.recall_score(y_test3, y_pred5)) 

## test different linear kernel in SVM and use training data to see difference

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

rawdata = pd.read_csv('https://raw.githubusercontent.com/yuezheli/BankChurnerPred/main/BankChurners.csv', header = 0)


# label existing customers as 0
rawdata.replace('Existing Customer', 0, regex=True, inplace = True)
# label attrited customers as 1
rawdata.replace('Attrited Customer', 1, regex=True, inplace = True)


# drop unnecessary categorical variables

rawdata.drop(['CLIENTNUM', 'Education_Level', 'Marital_Status', 'Income_Category',
              'Card_Category'], axis = 1, inplace = True)

# binary encode of gender
rawdata.replace('M', 0, regex=True, inplace = True) # male = 0
rawdata.replace('F', 1, regex=True, inplace = True) # female = 0

# drop more unrelated variable
rawdata.drop(['Customer_Age', 'Gender', 'Dependent_count','Months_on_book', 
                               'Credit_Limit', 'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Ct_Chng_Q4_Q1'], axis = 1, inplace = True)


# prepare training and testing datasets

attrition_label = rawdata['Attrition_Flag']
attrition_data = rawdata.drop(['Attrition_Flag'], axis = 1)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(attrition_data, attrition_label, test_size=0.2,random_state=109) 

In [2]:
# create svm model and test it out

from sklearn import svm
from sklearn import metrics

def GammaScan(gamma, X_train = X_train, y_train = y_train):
    clf = svm.SVC(kernel='linear', gamma = gamma) # Linear Kernel
    clf.fit(X_train, y_train)
    train_pred = clf.predict(X_train)
    
    accurary = metrics.accuracy_score(y_train, train_pred)
    precision = metrics.precision_score(y_train, train_pred)
    recall = metrics.recall_score(y_train, train_pred)
    
    return accurary, precision, recall
    


In [3]:
Accuracy=[]
Precision= []
Recall = []

for gamma in range(3):
    taccu, tprec, trecall = GammaScan(gamma + 1)
    Accuracy.append(taccu)
    Precision.append(tprec)
    Recall.append(trecall)
    
    del taccu, tprec, trecall 
    


In [4]:
combined = pd.DataFrame()
combined['gamma'] = np.array(range(3)) + 1
combined['Accuracy'] = Accuracy
combined['Precision'] = Precision
combined['Recall'] = Recall

combined

Unnamed: 0,gamma,Accuracy,Precision,Recall
0,1,0.861252,0.568079,0.57592
1,2,0.861252,0.568079,0.57592
2,3,0.861252,0.568079,0.57592
