In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder
from sklearn_pandas import DataFrameMapper
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB,BernoulliNB

Import Bank Data from csv to dataframe object

In [6]:
bank_data = pd.read_csv('data/bank-additional-full.csv', delimiter=';')

In [7]:
bank_data.head(10)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
5,45,services,married,basic.9y,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
6,59,admin.,married,professional.course,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
7,41,blue-collar,married,unknown,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
8,24,technician,single,professional.course,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
9,25,services,single,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


Data contains many categorical features.So we cann't feed this data to our estimator.
It needs to be pre processed. 

Let's  do that first.

In [8]:
bank_data.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')

Below lines to saparate feature data and target data

In [9]:
feature_data = bank_data.drop('y',axis=1)
target_data = bank_data.y

In [10]:
lblEnc = LabelEncoder()
target_data_trans=lblEnc.fit_transform(target_data)

In [11]:
feature_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 20 columns):
age               41188 non-null int64
job               41188 non-null object
marital           41188 non-null object
education         41188 non-null object
default           41188 non-null object
housing           41188 non-null object
loan              41188 non-null object
contact           41188 non-null object
month             41188 non-null object
day_of_week       41188 non-null object
duration          41188 non-null int64
campaign          41188 non-null int64
pdays             41188 non-null int64
previous          41188 non-null int64
poutcome          41188 non-null object
emp.var.rate      41188 non-null float64
cons.price.idx    41188 non-null float64
cons.conf.idx     41188 non-null float64
euribor3m         41188 non-null float64
nr.employed       41188 non-null float64
dtypes: float64(5), int64(5), object(10)
memory usage: 6.3+ MB


Here I am going to apply preprocessing techniques on Object(categorical data) type data  using dataframemapper.

In [12]:
no_action = list(set(feature_data.columns.tolist()) - set(['education',
                                                           'marital', 'month', 'contact', 'poutcome', 
                                                           'day_of_week', 'loan', 'job', 'default',
                                                           'housing']))

In [13]:
no_action_f = list(map(lambda x:(x,None),no_action))


In [14]:
mapper = DataFrameMapper([*no_action_f,
                        ('job',LabelEncoder()),
                        ('marital',LabelEncoder()),
                        ('education',LabelEncoder()),
                        ('default',LabelEncoder()),
                        ('housing',LabelEncoder()),
                        ('loan',LabelEncoder()),
                        ('contact',LabelEncoder()),
                        ('month',LabelEncoder()),
                        ('day_of_week',LabelEncoder()),
                        ('poutcome',LabelEncoder())
                         ])


In [15]:
pipeline = Pipeline([('mapper',mapper),
                      ('scaler',StandardScaler()),
                       ('sel',SelectKBest()),
                        ('cls',LogisticRegressionCV())
                    ])

In [16]:
Xtrain,Xtest,ytrain,ytest = train_test_split(feature_data,target_data_trans)

In [17]:
pipeline.fit(Xtrain,ytrain)

Pipeline(memory=None,
     steps=[('mapper', DataFrameMapper(default=False, df_out=False,
        features=[('emp.var.rate', None), ('euribor3m', None), ('pdays', None), ('previous', None), ('cons.price.idx', None), ('nr.employed', None), ('cons.conf.idx', None), ('campaign', None), ('duration', None), ('age', None), ('job', ...2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0))])

In [20]:
pipeline.score(Xtest,ytest)

0.90764300281635424

In [29]:
grid = GridSearchCV(pipeline,param_grid={'sel__k':[10,12,15]},cv=5,scoring='accuracy')

In [31]:
grid.fit(Xtrain,ytrain)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('mapper', DataFrameMapper(default=False, df_out=False,
        features=[('emp.var.rate', None), ('euribor3m', None), ('pdays', None), ('previous', None), ('cons.price.idx', None), ('nr.employed', None), ('cons.conf.idx', None), ('campaign', None), ('duration', None), ('age', None), ('job', ...2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'sel__k': [10, 12, 15]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring='accuracy',
       verbose=0)

In [32]:
grid.best_params_

{'sel__k': 15}

In [33]:
grid.best_score_

0.91201320773040695