In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns 
from pylab import plot, show
%matplotlib inline

In [2]:
df = pd.read_csv('bank-additional-full.csv', sep=';')

In [3]:
df.columns

Index([u'age', u'job', u'marital', u'education', u'default', u'housing',
       u'loan', u'contact', u'month', u'day_of_week', u'duration', u'campaign',
       u'pdays', u'previous', u'poutcome', u'emp.var.rate', u'cons.price.idx',
       u'cons.conf.idx', u'euribor3m', u'nr.employed', u'y'],
      dtype='object')

In [4]:
df.shape

(41188, 21)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
age               41188 non-null int64
job               41188 non-null object
marital           41188 non-null object
education         41188 non-null object
default           41188 non-null object
housing           41188 non-null object
loan              41188 non-null object
contact           41188 non-null object
month             41188 non-null object
day_of_week       41188 non-null object
duration          41188 non-null int64
campaign          41188 non-null int64
pdays             41188 non-null int64
previous          41188 non-null int64
poutcome          41188 non-null object
emp.var.rate      41188 non-null float64
cons.price.idx    41188 non-null float64
cons.conf.idx     41188 non-null float64
euribor3m         41188 non-null float64
nr.employed       41188 non-null float64
y                 41188 non-null object
dtypes: float64(5), int64(5), object(11)
memory usa

In [6]:
df.describe()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
count,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0
mean,40.02406,258.28501,2.567593,962.475454,0.172963,0.081886,93.575664,-40.5026,3.621291,5167.035911
std,10.42125,259.279249,2.770014,186.910907,0.494901,1.57096,0.57884,4.628198,1.734447,72.251528
min,17.0,0.0,1.0,0.0,0.0,-3.4,92.201,-50.8,0.634,4963.6
25%,32.0,102.0,1.0,999.0,0.0,-1.8,93.075,-42.7,1.344,5099.1
50%,38.0,180.0,2.0,999.0,0.0,1.1,93.749,-41.8,4.857,5191.0
75%,47.0,319.0,3.0,999.0,0.0,1.4,93.994,-36.4,4.961,5228.1
max,98.0,4918.0,56.0,999.0,7.0,1.4,94.767,-26.9,5.045,5228.1


In [7]:
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [8]:
df.isna().sum()

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

In [9]:
#some graphs analyses here

In [10]:
#Categorize the column of dataset which is object type 
for col in df.columns:
    if df[col].dtype == object:
        df[col] = df[col].astype('category')

In [11]:
#Convert categorical data into numerical value
df["job"] = df["job"].cat.codes
df["education"] = df["education"].cat.codes
df['default'] = df['default'].cat.codes
df['housing'] = df['housing'].cat.codes
df['loan'] = df['loan'].cat.codes
df['month'] = df['month'].cat.codes
df['day_of_week'] = df['day_of_week'].cat.codes
df['poutcome'] = df['poutcome'].cat.codes
df['y'] = df['y'].cat.codes

In [12]:
features_columns = ['job','education', 'default', 'housing', 'loan',
           'month', 'day_of_week', 'duration', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m','y']

In [13]:
df.dtypes

age                  int64
job                   int8
marital           category
education             int8
default               int8
housing               int8
loan                  int8
contact           category
month                 int8
day_of_week           int8
duration             int64
campaign             int64
pdays                int64
previous             int64
poutcome              int8
emp.var.rate       float64
cons.price.idx     float64
cons.conf.idx      float64
euribor3m          float64
nr.employed        float64
y                     int8
dtype: object

In [14]:
df[features_columns].head()

Unnamed: 0,job,education,default,housing,loan,month,day_of_week,duration,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,y
0,3,0,0,0,0,6,1,261,999,0,1,1.1,93.994,-36.4,4.857,0
1,7,3,1,0,0,6,1,149,999,0,1,1.1,93.994,-36.4,4.857,0
2,7,3,0,2,0,6,1,226,999,0,1,1.1,93.994,-36.4,4.857,0
3,0,1,0,0,0,6,1,151,999,0,1,1.1,93.994,-36.4,4.857,0
4,7,3,0,0,2,6,1,307,999,0,1,1.1,93.994,-36.4,4.857,0


In [15]:
import pandas
import scipy
import numpy

from sklearn.preprocessing import MinMaxScaler

features_columns_df = df[features_columns]
array = features_columns_df.values

X = array[:,0:15]
Y = array[:,15]

scaler = MinMaxScaler(feature_range=(0,1))
rescaledX = scaler.fit_transform(X)

numpy.set_printoptions(precision=3)
print(rescaledX[0:5,:])

[[0.273 0.    0.    0.    0.    0.667 0.25  0.053 1.    0.    0.5   0.938
  0.699 0.603 0.957]
 [0.636 0.429 0.5   0.    0.    0.667 0.25  0.03  1.    0.    0.5   0.938
  0.699 0.603 0.957]
 [0.636 0.429 0.    1.    0.    0.667 0.25  0.046 1.    0.    0.5   0.938
  0.699 0.603 0.957]
 [0.    0.143 0.    0.    0.    0.667 0.25  0.031 1.    0.    0.5   0.938
  0.699 0.603 0.957]
 [0.636 0.429 0.    0.    1.    0.667 0.25  0.062 1.    0.    0.5   0.938
  0.699 0.603 0.957]]


In [16]:
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
X = features_columns_df.iloc[:,0:15]
Y = features_columns_df.iloc[:,15]
# split data into train and test sets
seed = 7
test_size = 0.80
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
rf = RandomForestClassifier(random_state = 42)  # random_state is the seed used by the random number generator
#fitting the model
model = rf.fit(X_train, y_train)
# Find feature importance, print it
raw_feature_importance = model.feature_importances_.tolist()
feature_importance = [round(val * 100.0, 2) for val in raw_feature_importance]
print(zip(features_columns_df.columns, feature_importance))

[('job', 7.27), ('education', 5.98), ('default', 0.87), ('housing', 2.23), ('loan', 1.74), ('month', 3.36), ('day_of_week', 5.55), ('duration', 36.33), ('pdays', 3.83), ('previous', 1.43), ('poutcome', 2.64), ('emp.var.rate', 3.27), ('cons.price.idx', 3.25), ('cons.conf.idx', 7.22), ('euribor3m', 15.02)]




In [17]:
model.score(X_test,y_test)

0.9062547418894722

In [18]:
predicted = rf.predict(X_test)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, predicted)
print cm

[[28161  1093]
 [ 1996  1701]]


In [19]:
from sklearn.metrics import classification_report
print classification_report(y_test, predicted)

              precision    recall  f1-score   support

           0       0.93      0.96      0.95     29254
           1       0.61      0.46      0.52      3697

   micro avg       0.91      0.91      0.91     32951
   macro avg       0.77      0.71      0.74     32951
weighted avg       0.90      0.91      0.90     32951



In [20]:
columns_trains = ['job','education', 'default', 'housing', 'loan',
           'month', 'day_of_week', 'duration', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m','y']
columns_target = ['y']
X = df[columns_trains]
Y = df[columns_target]

In [21]:
from sklearn.naive_bayes import GaussianNB
# create model
gnb = GaussianNB()
# train model
gnb.fit(X_train, y_train)
# print the accuracy of model
gnb.score(X_test, y_test)

0.8434645382537708

In [22]:
predicted = gnb.predict(X_test)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, predicted)
print cm

[[25902  3352]
 [ 1806  1891]]


In [23]:
from sklearn.metrics import classification_report
print classification_report(y_test, predicted)

              precision    recall  f1-score   support

           0       0.93      0.89      0.91     29254
           1       0.36      0.51      0.42      3697

   micro avg       0.84      0.84      0.84     32951
   macro avg       0.65      0.70      0.67     32951
weighted avg       0.87      0.84      0.85     32951

