In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [2]:
df = pd.read_csv('bank-additional-full.csv',sep = ';')

In [3]:
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

In [5]:
df.isnull().sum()

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

In [6]:
df.dtypes

age                 int64
job                object
marital            object
education          object
default            object
housing            object
loan               object
contact            object
month              object
day_of_week        object
duration            int64
campaign            int64
pdays               int64
previous            int64
poutcome           object
emp.var.rate      float64
cons.price.idx    float64
cons.conf.idx     float64
euribor3m         float64
nr.employed       float64
y                  object
dtype: object

In [7]:
cat_feat  = df.dtypes[df.dtypes == object].index

In [8]:
cat_feat

Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'day_of_week', 'poutcome', 'y'],
      dtype='object')

In [9]:
cat_feat = cat_feat.drop('y')

In [10]:
cat_feat

Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'day_of_week', 'poutcome'],
      dtype='object')

In [11]:
cat_feat = cat_feat.drop('month')

In [12]:
cat_feat = cat_feat.drop('day_of_week')

In [13]:
cat_feat

Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'poutcome'],
      dtype='object')

In [14]:
for i in cat_feat:
    print(df[i].unique())

['housemaid' 'services' 'admin.' 'blue-collar' 'technician' 'retired'
 'management' 'unemployed' 'self-employed' 'unknown' 'entrepreneur'
 'student']
['married' 'single' 'divorced' 'unknown']
['basic.4y' 'high.school' 'basic.6y' 'basic.9y' 'professional.course'
 'unknown' 'university.degree' 'illiterate']
['no' 'unknown' 'yes']
['no' 'yes' 'unknown']
['no' 'yes' 'unknown']
['telephone' 'cellular']
['nonexistent' 'failure' 'success']


In [15]:
df_jobs = pd.get_dummies(df['job']).drop(columns = 'unknown')

In [16]:
df.drop(columns = 'job',inplace = True)

In [17]:
df = df.join(df_jobs)

In [18]:
df.head()

Unnamed: 0,age,marital,education,default,housing,loan,contact,month,day_of_week,duration,...,blue-collar,entrepreneur,housemaid,management,retired,self-employed,services,student,technician,unemployed
0,56,married,basic.4y,no,no,no,telephone,may,mon,261,...,0,0,1,0,0,0,0,0,0,0
1,57,married,high.school,unknown,no,no,telephone,may,mon,149,...,0,0,0,0,0,0,1,0,0,0
2,37,married,high.school,no,yes,no,telephone,may,mon,226,...,0,0,0,0,0,0,1,0,0,0
3,40,married,basic.6y,no,no,no,telephone,may,mon,151,...,0,0,0,0,0,0,0,0,0,0
4,56,married,high.school,no,no,yes,telephone,may,mon,307,...,0,0,0,0,0,0,1,0,0,0


In [19]:
df_married = pd.get_dummies(df['marital'])

In [20]:
df.drop(columns = 'marital',inplace = True)

In [21]:
df.head()

Unnamed: 0,age,education,default,housing,loan,contact,month,day_of_week,duration,campaign,...,blue-collar,entrepreneur,housemaid,management,retired,self-employed,services,student,technician,unemployed
0,56,basic.4y,no,no,no,telephone,may,mon,261,1,...,0,0,1,0,0,0,0,0,0,0
1,57,high.school,unknown,no,no,telephone,may,mon,149,1,...,0,0,0,0,0,0,1,0,0,0
2,37,high.school,no,yes,no,telephone,may,mon,226,1,...,0,0,0,0,0,0,1,0,0,0
3,40,basic.6y,no,no,no,telephone,may,mon,151,1,...,0,0,0,0,0,0,0,0,0,0
4,56,high.school,no,no,yes,telephone,may,mon,307,1,...,0,0,0,0,0,0,1,0,0,0


In [22]:
df = df.join(df_married)

In [23]:
df.head()

Unnamed: 0,age,education,default,housing,loan,contact,month,day_of_week,duration,campaign,...,retired,self-employed,services,student,technician,unemployed,divorced,married,single,unknown
0,56,basic.4y,no,no,no,telephone,may,mon,261,1,...,0,0,0,0,0,0,0,1,0,0
1,57,high.school,unknown,no,no,telephone,may,mon,149,1,...,0,0,1,0,0,0,0,1,0,0
2,37,high.school,no,yes,no,telephone,may,mon,226,1,...,0,0,1,0,0,0,0,1,0,0
3,40,basic.6y,no,no,no,telephone,may,mon,151,1,...,0,0,0,0,0,0,0,1,0,0
4,56,high.school,no,no,yes,telephone,may,mon,307,1,...,0,0,1,0,0,0,0,1,0,0


In [24]:
df.drop(columns = 'unknown',inplace = True)

In [25]:
df.head()
len(df.columns)
df.shape

(41188, 33)

In [26]:
cat_feat = cat_feat.delete([0])

In [27]:
cat_feat = cat_feat.delete([0])

In [28]:
cat_feat = cat_feat.delete([0])

In [29]:
df_education = pd.get_dummies(df['education'])

In [30]:
df_education.drop(columns = 'unknown',inplace = True)

In [31]:
df = df.join(df_education)

In [32]:
df.drop(columns = 'education',inplace = True)

In [33]:
len(df.columns)

39

In [34]:
df_def = pd.get_dummies(df['default'])

In [35]:
df_def.drop(columns = 'unknown',inplace = True)

In [36]:
df.drop(columns = 'default',inplace = True)

In [37]:
df = df.join(df_def)

In [38]:
len(df.columns)

40

In [39]:
df.head()

Unnamed: 0,age,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,...,single,basic.4y,basic.6y,basic.9y,high.school,illiterate,professional.course,university.degree,no,yes
0,56,no,no,telephone,may,mon,261,1,999,0,...,0,1,0,0,0,0,0,0,1,0
1,57,no,no,telephone,may,mon,149,1,999,0,...,0,0,0,0,1,0,0,0,0,0
2,37,yes,no,telephone,may,mon,226,1,999,0,...,0,0,0,0,1,0,0,0,1,0
3,40,no,no,telephone,may,mon,151,1,999,0,...,0,0,1,0,0,0,0,0,1,0
4,56,no,yes,telephone,may,mon,307,1,999,0,...,0,0,0,0,1,0,0,0,1,0


In [40]:
df_loan = pd.get_dummies(df['loan'])

In [41]:
df_loan.drop(columns = 'unknown',inplace = True)

In [42]:
df.drop(columns = 'loan',inplace = True)

In [43]:
df = df.join(df_loan,rsuffix = '_loan')

In [44]:
df.head()

Unnamed: 0,age,housing,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,...,basic.6y,basic.9y,high.school,illiterate,professional.course,university.degree,no,yes,no_loan,yes_loan
0,56,no,telephone,may,mon,261,1,999,0,nonexistent,...,0,0,0,0,0,0,1,0,1,0
1,57,no,telephone,may,mon,149,1,999,0,nonexistent,...,0,0,1,0,0,0,0,0,1,0
2,37,yes,telephone,may,mon,226,1,999,0,nonexistent,...,0,0,1,0,0,0,1,0,1,0
3,40,no,telephone,may,mon,151,1,999,0,nonexistent,...,1,0,0,0,0,0,1,0,1,0
4,56,no,telephone,may,mon,307,1,999,0,nonexistent,...,0,0,1,0,0,0,1,0,0,1


In [45]:
len(df.columns)

41

In [46]:
df_housing = pd.get_dummies(df['housing'])

In [47]:
df_housing

Unnamed: 0,no,unknown,yes
0,1,0,0
1,1,0,0
2,0,0,1
3,1,0,0
4,1,0,0
...,...,...,...
41183,0,0,1
41184,1,0,0
41185,0,0,1
41186,1,0,0


In [48]:
df_housing.drop(columns = 'unknown',inplace = True)

In [49]:
df.drop(columns = 'housing',inplace = True)

In [50]:
df = df.join(df_housing,rsuffix = '_housing')

In [51]:
df.head()

Unnamed: 0,age,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,...,high.school,illiterate,professional.course,university.degree,no,yes,no_loan,yes_loan,no_housing,yes_housing
0,56,telephone,may,mon,261,1,999,0,nonexistent,1.1,...,0,0,0,0,1,0,1,0,1,0
1,57,telephone,may,mon,149,1,999,0,nonexistent,1.1,...,1,0,0,0,0,0,1,0,1,0
2,37,telephone,may,mon,226,1,999,0,nonexistent,1.1,...,1,0,0,0,1,0,1,0,0,1
3,40,telephone,may,mon,151,1,999,0,nonexistent,1.1,...,0,0,0,0,1,0,1,0,1,0
4,56,telephone,may,mon,307,1,999,0,nonexistent,1.1,...,1,0,0,0,1,0,0,1,1,0


In [52]:
cat_feat

Index(['default', 'housing', 'loan', 'contact', 'poutcome'], dtype='object')

In [53]:
df_cont = pd.get_dummies(df['contact'])

In [54]:
df_cont.drop(columns = 'telephone',inplace = True)

In [55]:
df.drop(columns = 'contact',inplace = True)

In [56]:
df = df.join(df_cont)

In [57]:
df.head()

Unnamed: 0,age,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,...,illiterate,professional.course,university.degree,no,yes,no_loan,yes_loan,no_housing,yes_housing,cellular
0,56,may,mon,261,1,999,0,nonexistent,1.1,93.994,...,0,0,0,1,0,1,0,1,0,0
1,57,may,mon,149,1,999,0,nonexistent,1.1,93.994,...,0,0,0,0,0,1,0,1,0,0
2,37,may,mon,226,1,999,0,nonexistent,1.1,93.994,...,0,0,0,1,0,1,0,0,1,0
3,40,may,mon,151,1,999,0,nonexistent,1.1,93.994,...,0,0,0,1,0,1,0,1,0,0
4,56,may,mon,307,1,999,0,nonexistent,1.1,93.994,...,0,0,0,1,0,0,1,1,0,0


In [58]:
cat_feat

Index(['default', 'housing', 'loan', 'contact', 'poutcome'], dtype='object')

In [59]:
df_p = pd.get_dummies(df['poutcome'])

In [60]:
df_p.drop(columns = 'failure',inplace = True)

In [61]:
df.drop(columns = 'poutcome',inplace =  True)

In [62]:
df = df.join(df_p)

In [63]:
df.head()

Unnamed: 0,age,month,day_of_week,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,...,university.degree,no,yes,no_loan,yes_loan,no_housing,yes_housing,cellular,nonexistent,success
0,56,may,mon,261,1,999,0,1.1,93.994,-36.4,...,0,1,0,1,0,1,0,0,1,0
1,57,may,mon,149,1,999,0,1.1,93.994,-36.4,...,0,0,0,1,0,1,0,0,1,0
2,37,may,mon,226,1,999,0,1.1,93.994,-36.4,...,0,1,0,1,0,0,1,0,1,0
3,40,may,mon,151,1,999,0,1.1,93.994,-36.4,...,0,1,0,1,0,1,0,0,1,0
4,56,may,mon,307,1,999,0,1.1,93.994,-36.4,...,0,1,0,0,1,1,0,0,1,0


In [64]:
df.drop(columns = ['month','day_of_week','duration'],inplace = True)

In [65]:
df.columns

Index(['age', 'campaign', 'pdays', 'previous', 'emp.var.rate',
       'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'y',
       'admin.', 'blue-collar', 'entrepreneur', 'housemaid', 'management',
       'retired', 'self-employed', 'services', 'student', 'technician',
       'unemployed', 'divorced', 'married', 'single', 'basic.4y', 'basic.6y',
       'basic.9y', 'high.school', 'illiterate', 'professional.course',
       'university.degree', 'no', 'yes', 'no_loan', 'yes_loan', 'no_housing',
       'yes_housing', 'cellular', 'nonexistent', 'success'],
      dtype='object')

In [66]:
df['pdays'].unique()

array([999,   6,   4,   3,   5,   1,   0,  10,   7,   8,   9,  11,   2,
        12,  13,  14,  15,  16,  21,  17,  18,  22,  25,  26,  19,  27,
        20], dtype=int64)

In [67]:
df.dtypes

age                      int64
campaign                 int64
pdays                    int64
previous                 int64
emp.var.rate           float64
cons.price.idx         float64
cons.conf.idx          float64
euribor3m              float64
nr.employed            float64
y                       object
admin.                   uint8
blue-collar              uint8
entrepreneur             uint8
housemaid                uint8
management               uint8
retired                  uint8
self-employed            uint8
services                 uint8
student                  uint8
technician               uint8
unemployed               uint8
divorced                 uint8
married                  uint8
single                   uint8
basic.4y                 uint8
basic.6y                 uint8
basic.9y                 uint8
high.school              uint8
illiterate               uint8
professional.course      uint8
university.degree        uint8
no                       uint8
yes     

In [68]:
df['cons.price.idx'].describe()

count    41188.000000
mean        93.575664
std          0.578840
min         92.201000
25%         93.075000
50%         93.749000
75%         93.994000
max         94.767000
Name: cons.price.idx, dtype: float64

In [69]:
df['emp.var.rate'].describe()

count    41188.000000
mean         0.081886
std          1.570960
min         -3.400000
25%         -1.800000
50%          1.100000
75%          1.400000
max          1.400000
Name: emp.var.rate, dtype: float64

In [70]:
df['cons.conf.idx'].describe()

count    41188.000000
mean       -40.502600
std          4.628198
min        -50.800000
25%        -42.700000
50%        -41.800000
75%        -36.400000
max        -26.900000
Name: cons.conf.idx, dtype: float64

In [71]:
le_y = LabelEncoder()
df['y'] = le_y.fit_transform(df['y'])

In [72]:
df['y']

0        0
1        0
2        0
3        0
4        0
        ..
41183    1
41184    0
41185    0
41186    1
41187    0
Name: y, Length: 41188, dtype: int32

In [73]:
y = df['y'].values

In [74]:
df.drop(columns = 'y',inplace = True)

In [75]:
X = df.values

In [76]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state = 0)

In [77]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
X_train1 = X_train
X_test1  = X_test
y_train1 = y_train
y_test1 = y_test

(32950, 39)
(8238, 39)
(32950,)
(8238,)


In [78]:
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

### Random Forest Classifier

In [79]:
classifier = RandomForestClassifier(n_estimators = 100,criterion = 'entropy',random_state = 0)

In [80]:
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)

In [81]:
cm1 = confusion_matrix(y_test,y_pred)
cm1

array([[7102,  217],
       [ 644,  275]], dtype=int64)

In [82]:
accuracy_vector = cross_val_score(estimator = classifier,X = X_train,y = y_train,cv = 10,n_jobs = -1)
print(accuracy_vector)
print('The mean of accuracy is:',accuracy_vector.mean())
print('The standard deviation of accuracy is:',accuracy_vector.std())

[0.89377845 0.89468892 0.88437026 0.8952959  0.89802731 0.88801214
 0.89044006 0.89468892 0.89772382 0.89468892]
The mean of accuracy is: 0.8931714719271623
The standard deviation of accuracy is: 0.004094305785502911


In [83]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.92      0.97      0.94      7319
           1       0.56      0.30      0.39       919

    accuracy                           0.90      8238
   macro avg       0.74      0.63      0.67      8238
weighted avg       0.88      0.90      0.88      8238



### XGB Classifier

In [84]:
classifier1 = XGBClassifier()
classifier1.fit(X_train1,y_train1)
y_predx = classifier1.predict(X_test1)

In [85]:
cm2 = confusion_matrix(y_test1,y_predx)
cm2

array([[7214,  105],
       [ 714,  205]], dtype=int64)

In [86]:
accuracy_vectorx = cross_val_score(estimator = classifier1,X = X_train1,y = y_train1,cv = 10,n_jobs = -1)
print(accuracy_vectorx)
print('The mean of accuracy is:',accuracy_vectorx.mean())
print('The standard deviation of accuracy is:',accuracy_vectorx.std())

[0.89742033 0.89954476 0.89195751 0.90409712 0.90440061 0.89863429
 0.89742033 0.90197269 0.89954476 0.90197269]
The mean of accuracy is: 0.8996965098634293
The standard deviation of accuracy is: 0.003511185455364537


In [87]:
print(classification_report(y_test,y_predx))

              precision    recall  f1-score   support

           0       0.91      0.99      0.95      7319
           1       0.66      0.22      0.33       919

    accuracy                           0.90      8238
   macro avg       0.79      0.60      0.64      8238
weighted avg       0.88      0.90      0.88      8238



### Naive Bayes

In [88]:
classifier2 = GaussianNB()
classifier2.fit(X_train,y_train)
y_pred2 = classifier2.predict(X_test)

In [89]:
cm3 = confusion_matrix(y_test,y_pred2)
cm3

array([[3626, 3693],
       [ 170,  749]], dtype=int64)

In [90]:
print(classification_report(y_test,y_pred2))

              precision    recall  f1-score   support

           0       0.96      0.50      0.65      7319
           1       0.17      0.82      0.28       919

    accuracy                           0.53      8238
   macro avg       0.56      0.66      0.47      8238
weighted avg       0.87      0.53      0.61      8238



### Logistic Regression

In [91]:
classifier3 = LogisticRegression(random_state = 0)
classifier3.fit(X_train,y_train)
y_predlr = classifier3.predict(X_test)

In [92]:
cm4 = confusion_matrix(y_test,y_predlr)
cm4

array([[7209,  110],
       [ 715,  204]], dtype=int64)

In [93]:
accuracy_vectorlr = cross_val_score(estimator = classifier3,X = X_train1,y = y_train1,cv = 10,n_jobs = -1)
print(accuracy_vectorlr)
print('The mean of accuracy is:',accuracy_vectorlr.mean())
print('The standard deviation of accuracy is:',accuracy_vectorlr.std())

[0.89620637 0.8983308  0.89742033 0.90349014 0.90227618 0.90015175
 0.89438543 0.90197269 0.90106222 0.89954476]
The mean of accuracy is: 0.89948406676783
The standard deviation of accuracy is: 0.002744195339716491


In [94]:
print(classification_report(y_test,y_predlr))

              precision    recall  f1-score   support

           0       0.91      0.98      0.95      7319
           1       0.65      0.22      0.33       919

    accuracy                           0.90      8238
   macro avg       0.78      0.60      0.64      8238
weighted avg       0.88      0.90      0.88      8238

