In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
url = "https://raw.githubusercontent.com/ga-students/SF-DAT-20/master/Data/bank.csv"
BankData = pd.read_csv(url)
BankData.head(5)

Unnamed: 0,age,marital,education,default,balance,housing,loan,contact,duration,campaign,pdays,previous,poutcome,y
0,30,married,primary,no,1787,no,no,cellular,79,1,-1,0,unknown,no
1,33,married,secondary,no,4789,yes,yes,cellular,220,1,339,4,failure,no
2,35,single,tertiary,no,1350,yes,no,cellular,185,1,330,1,failure,no
3,30,married,tertiary,no,1476,yes,yes,unknown,199,4,-1,0,unknown,no
4,59,married,secondary,no,0,yes,no,unknown,226,1,-1,0,unknown,no


In [3]:
BankData['y'].unique()  #you can use unique if you would like to find out how many unique attributes each variable have

array(['no', 'yes'], dtype=object)

For dictionary of data please refer to https://archive.ics.uci.edu/ml/datasets/Bank+Marketing

#### Our goal is to define a model best predicts outcome y - success of the marketing campaign. This is the very first a dataset we used to introduce Logistic Regression Models. This time we know few more tricks! In this lecture we are going to use as many techniques as possible on our dataset to make the least possible error! Goal is to use, Logistic Regression, Naive Bayes, Random Forest, KNN, and using VotingClassifier() to combine the results. 

In [4]:
Default_dummy  = pd.get_dummies(BankData['default'], prefix = 'default')
del Default_dummy['default_no']

marital_dummy  = pd.get_dummies(BankData['marital'], prefix = 'marital')
del marital_dummy['marital_married']
del marital_dummy['marital_divorced']

housing_dummy  = pd.get_dummies(BankData['housing'], prefix = 'housing')
del housing_dummy['housing_no']

loan_dummy = pd.get_dummies(BankData['loan'], prefix = 'loan')
del loan_dummy['loan_no']


BankData = pd.concat([BankData,marital_dummy , Default_dummy, housing_dummy, loan_dummy], axis=1)
BankData.head()

Unnamed: 0,age,marital,education,default,balance,housing,loan,contact,duration,campaign,pdays,previous,poutcome,y,marital_single,default_yes,housing_yes,loan_yes
0,30,married,primary,no,1787,no,no,cellular,79,1,-1,0,unknown,no,0,0,0,0
1,33,married,secondary,no,4789,yes,yes,cellular,220,1,339,4,failure,no,0,0,1,1
2,35,single,tertiary,no,1350,yes,no,cellular,185,1,330,1,failure,no,1,0,1,0
3,30,married,tertiary,no,1476,yes,yes,unknown,199,4,-1,0,unknown,no,0,0,1,1
4,59,married,secondary,no,0,yes,no,unknown,226,1,-1,0,unknown,no,0,0,1,0


In [5]:
X = BankData[['age','balance','duration','campaign','pdays','previous','marital_single','default_yes','housing_yes','loan_yes']]
y = BankData['y']

In [6]:
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn import neighbors, metrics
from sklearn.cross_validation import cross_val_score
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import VotingClassifier  #VotingClassifier is part of sklearn.ensemble

In [7]:
# We first standardize our data since some of the algorithms only work with standardized values 
def Standardize(X):
    X_Max = X.max()
    X_Min = X.min()
    X_Standardized = (X-X_Min)/(X_Max - X_Min)
    return X_Standardized

In [8]:
NameOfVariables = ['age','balance','duration','campaign','pdays','previous','marital_single','default_yes','housing_yes','loan_yes']
for i in NameOfVariables:
    BankData[i] = Standardize(BankData[i])
    
X1 = BankData[NameOfVariables]   

In [9]:
clf1 = LogisticRegression()
clf2 = RandomForestClassifier(n_estimators = 1000)
clf3 = GaussianNB()
clf4 = neighbors.KNeighborsClassifier( weights='uniform')
clf5 = DecisionTreeClassifier(min_samples_leaf = 5)
eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3),('knn',clf4),('DT',clf5)], voting='hard')

In [12]:
from sklearn.grid_search import GridSearchCV
params = {'lr__C': [0.01,1,100],
          'rf__max_features':[1,3,5,7],
          'knn__n_neighbors':[2,10,20],
           'DT__max_depth':[2,10,20]}
grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5)
gridfit = grid.fit(X1, y)

In [13]:
print gridfit.best_params_

{'rf__max_features': 3, 'DT__max_depth': 10, 'knn__n_neighbors': 10, 'lr__C': 100}


In [15]:
print gridfit.best_score_

0.88896261889


In [16]:
y_hat_predict = gridfit.predict(X1)
confusion_matrix(y_hat_predict,y)

array([[3966,  311],
       [  34,  210]])