# Bank Full Dataset

In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import pydotplus
from sklearn.externals.six import StringIO
from IPython.display import Image
import os
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

In [3]:
# Read the bank-full CSV directly from GitHub; fields are separated by ';'.
bank = pd.read_csv(
    "https://raw.githubusercontent.com/skathirmani/datasets/master/bank-full.csv",
    sep=';',
)
bank.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [4]:
# Size of the raw frame as (rows, columns).
bank.shape

(45211, 17)

In [5]:
# Distinct labels present in the target column `y` (still raw strings at this point).
bank['y'].unique()

array(['no', 'yes'], dtype=object)

In [6]:
# Encode the target column: 'yes' -> 1, anything else -> 0.
#
# The column is overwritten in place, so the cell has to be safe to re-run.
# Without the guard, a second execution would compare the already-encoded
# integers against 'yes', find no match, and silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(bank['y']):
    bank['y'] = (bank['y'] == 'yes').astype('int64')
bank.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,0
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,0


In [7]:
# Share of each target class, as a percentage of all rows.
bank['y'].value_counts().div(len(bank)).mul(100)

0    88.30152
1    11.69848
Name: y, dtype: float64

In [8]:
# One-hot encode the categorical (string) columns of `bank`; the numeric
# columns, including the 0/1 target `y`, are carried over unchanged.
bank_dummies = pd.get_dummies(bank)

In [9]:
# Preview the encoded frame to check the generated indicator columns.
bank_dummies.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,y,job_admin.,job_blue-collar,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,58,2143,5,261,1,-1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
1,44,29,5,151,1,-1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
2,33,2,5,76,1,-1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
3,47,1506,5,92,1,-1,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1
4,33,1,5,198,1,-1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


In [10]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split reproducible. `train_test_split` comes from the notebook's import cell,
# so it is not re-imported here.
train, test = train_test_split(bank_dummies, test_size=0.3, random_state=100)

In [11]:
# Separate the feature columns from the target `y` for both partitions.
train_x = train.drop(labels='y', axis='columns')
test_x = test.drop(labels='y', axis='columns')
train_y, test_y = train['y'], test['y']

# Baseline: a decision tree capped at depth 5, seeded for reproducibility.
model = DecisionTreeClassifier(max_depth=5, random_state=100)
model.fit(train_x, train_y)
pred_test = model.predict(test_x)

# With 0/1 labels the flattened 2x2 confusion matrix reads tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y_true=test_y, y_pred=pred_test).ravel()

print(tn, fp, fn, tp)
print(accuracy_score(y_true=test_y, y_pred=pred_test))
print(classification_report(y_true=test_y, y_pred=pred_test))

11676 309 1057 522
0.8992922441757594
             precision    recall  f1-score   support

          0       0.92      0.97      0.94     11985
          1       0.63      0.33      0.43      1579

avg / total       0.88      0.90      0.89     13564



In [11]:
# Random forest of 300 trees, seeded for reproducibility, fitted on the same
# train/test partitions as the decision tree.
model_rf = RandomForestClassifier(n_estimators=300, random_state=100)
model_rf.fit(train_x, train_y)
pred = model_rf.predict(test_x)

# With 0/1 labels the flattened 2x2 confusion matrix reads tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y_true=test_y, y_pred=pred).ravel()

print(tn, fp, fn, tp)
print(accuracy_score(y_true=test_y, y_pred=pred))
print(classification_report(y_true=test_y, y_pred=pred))

11636 349 934 645
0.9054113830728399
             precision    recall  f1-score   support

          0       0.93      0.97      0.95     11985
          1       0.65      0.41      0.50      1579

avg / total       0.89      0.91      0.90     13564



In [16]:
# AdaBoost with 300 boosting rounds, seeded for reproducibility, fitted on the
# same train/test partitions as the other models.
model_ab = AdaBoostClassifier(n_estimators=300, random_state=100)
model_ab.fit(train_x, train_y)
pred_ab = model_ab.predict(test_x)

# With 0/1 labels the flattened 2x2 confusion matrix reads tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y_true=test_y, y_pred=pred_ab).ravel()

print(tn, fp, fn, tp)
print(accuracy_score(y_true=test_y, y_pred=pred_ab))
print(classification_report(y_true=test_y, y_pred=pred_ab))

11597 388 961 618
0.9005455617811855
             precision    recall  f1-score   support

          0       0.92      0.97      0.95     11985
          1       0.61      0.39      0.48      1579

avg / total       0.89      0.90      0.89     13564



In [14]:
# 5-nearest-neighbours classifier fitted on the same train/test partitions.
# NOTE(review): the features are passed in as produced by get_dummies, with no
# scaling step visible in this notebook; distance-based models are usually
# sensitive to feature magnitude — consider standardising before fitting.
model_knn = KNeighborsClassifier(n_neighbors=5)
model_knn.fit(train_x, train_y)
pred_knn = model_knn.predict(test_x)

# With 0/1 labels the flattened 2x2 confusion matrix reads tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y_true=test_y, y_pred=pred_knn).ravel()

print(tn, fp, fn, tp)
print(accuracy_score(y_true=test_y, y_pred=pred_knn))
print(classification_report(y_true=test_y, y_pred=pred_knn))

11510 475 1145 434
0.8805662046593925
             precision    recall  f1-score   support

          0       0.91      0.96      0.93     11985
          1       0.48      0.27      0.35      1579

avg / total       0.86      0.88      0.87     13564



In [17]:
def _confusion_summary(y_true, y_pred):
    """Summarise binary 0/1 predictions as confusion counts and whole-percent rates.

    Parameters
    ----------
    y_true : array-like
        Ground-truth 0/1 labels.
    y_pred : array-like
        Predicted 0/1 labels, same length as ``y_true``.

    Returns
    -------
    dict
        Counts under 'True Positive', 'True Negative', 'False Positive',
        'False Negative', plus 'Accuracy', 'Sensitivity' and 'Specificity'
        rounded to whole percentages.

    Raises
    ------
    ZeroDivisionError
        If ``y_true`` contains no positives or no negatives, because the
        corresponding rate is undefined.
    """
    # labels=[0, 1] pins the matrix to 2x2 even if one class is never predicted.
    tn, fp, fn, tp = (
        int(v) for v in confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    )
    total = tn + fp + fn + tp
    return {
        'True Positive': tp,
        'True Negative': tn,
        'False Positive': fp,
        'False Negative': fn,
        'Accuracy': int(round(100.0 * (tp + tn) / total)),
        'Sensitivity': int(round(100.0 * tp / (tp + fn))),
        'Specificity': int(round(100.0 * tn / (tn + fp))),
    }


# Build the comparison table from the stored test-set predictions instead of
# hand-copying figures from the printed outputs, so the table cannot drift out
# of sync when a model, a hyper-parameter or the split changes.
predictions_by_model = {
    'Decision Tree': pred_test,
    'Random Forest': pred,
    'AdaBoost': pred_ab,
    'KNN': pred_knn,
}
model_order = ['Decision Tree', 'Random Forest', 'AdaBoost', 'KNN']
metric_order = ['True Positive', 'True Negative', 'False Positive', 'False Negative',
                'Accuracy', 'Sensitivity', 'Specificity']

df = pd.DataFrame({
    model_name: _confusion_summary(test_y, predictions_by_model[model_name])
    for model_name in model_order
})
# Fix row and column order explicitly rather than relying on dict ordering.
df = df.loc[metric_order, model_order]
df

Unnamed: 0,Decision Tree,Random Forest,AdaBoost,KNN
True Positive,522,645,618,434
True Negative,11676,11636,11597,11510
False Positive,309,349,388,475
False Negative,1057,934,961,1145
Accuracy,90,91,90,88
Sensitivity,33,41,39,27
Specificity,97,97,97,96
