# Load  data

In [1]:
import pandas as pd
import numpy as np

# Code here
# colnames = ["age","job","marital","education", "default", "balance", "housing", "loan", "contact","day","month","duration","campaign","pdays","previous","poutcome","y"]
df = pd.read_csv("./bank-additional/bank-additional-full.csv", sep=';')

# Create a new data set with y transfored to binary data

def transform(string):
    if string=="yes":
        return 1
    else:
        return 0

y_trans = df['y'].apply(transform)
new_df = df
new_df['y'] = y_trans
new_df.head(5)

len(df.columns)

21

## Clean data

Deal with unknown data
Strategy:

- For continuous data, change it with average 
- For categorical data, change it to a new category

The result is there is no NaNs in the dataset.

In [2]:
# Check number of NaNs
for col in new_df.columns:
    print(col + ' ' + str(new_df.isnull().values.any()))

age False
job False
marital False
education False
default False
housing False
loan False
contact False
month False
day_of_week False
duration False
campaign False
pdays False
previous False
poutcome False
emp.var.rate False
cons.price.idx False
cons.conf.idx False
euribor3m False
nr.employed False
y False


### Missing data -- Pdays

pdays has 39673 of 999s --- this is missing data.

It is a continuous variable, so replace 999 with mean, while removing 999 first.

This data set also contains missing data on one numerical feature: pdays. This feature indicates the number of days that passed by after the client was last contacted from a previous campaign- it was coded as '999' if the contact never happened. Over 90% of the records show a number missing for the pdays. In order to implement the machine learning algorthms, we need to imput the missing values of this feature in a way to maximize its prediction accuracy.

In the study, we tested the following appraches with the Logistic Regression algorithm:

- Leave it as it is (999)
- Imput as the column mean
- Imput as zero
- Remove the column from data set

Since the result shows the highest prediction accuracy on the first apprach. we will keep the data as it is in the rest of study.

Reference: https://nycdatascience.com/blog/student-works/machine-learning/machine-learning-retail-bank-marketing-data/

### Method 2: Imput as the column mean

In [3]:
new_df2 = new_df.copy() # this creates a copy

In [4]:
new_df2['pdays'].replace(999, (new_df2['pdays'] != 999).mean(), inplace=True)

In [5]:
new_df = new_df2

### Method3: Imput as zero

In [6]:
new_df3 = new_df.copy() # this creates a copy

In [7]:
new_df3['pdays'].replace(999, 0, inplace=True)

### Method 4: remove the column

In [8]:
new_df4 = new_df.drop('pdays', axis = 1)

### Preprocessing 
Now do the proprocessing first

In [9]:
# examine all variables that are object, store them in a list
obj_col_list = []
for column in new_df.columns:
    if new_df.dtypes[column] == 'object':
        print('\n')
        print(new_df.groupby([column]).size())
        obj_col_list.append(column)



job
admin.           10422
blue-collar       9254
entrepreneur      1456
housemaid         1060
management        2924
retired           1720
self-employed     1421
services          3969
student            875
technician        6743
unemployed        1014
unknown            330
dtype: int64


marital
divorced     4612
married     24928
single      11568
unknown        80
dtype: int64


education
basic.4y                4176
basic.6y                2292
basic.9y                6045
high.school             9515
illiterate                18
professional.course     5243
university.degree      12168
unknown                 1731
dtype: int64


default
no         32588
unknown     8597
yes            3
dtype: int64


housing
no         18622
unknown      990
yes        21576
dtype: int64


loan
no         33950
unknown      990
yes         6248
dtype: int64


contact
cellular     26144
telephone    15044
dtype: int64


month
apr     2632
aug     6178
dec      182
jul     7174
jun     5318


### Preprocessing

1. One-hot encoding of categorical variables

In [10]:
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# a function to do one-hot-encoding for categorical variables
# input is an nparray of categorical values, ncol = 1, nrow = len(dataset)
# output is a pd dataframe of one-hot vectors. ncol = number of categoreis, nrow = len(dataset)
def one_hot_encoding(values):
    # first do the one-hot encoding. return a np array of vectors.
    ori_index = values.index
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(values) #  
    onehot_encoder = OneHotEncoder(sparse=False)
    integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
    onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
    
    # then transform the vector into pd dataframe with proper column names
    col_names = [values.name + '_' + x for x in label_encoder.classes_.tolist()]
    one_hot_encoding_df = pd.DataFrame(onehot_encoded.tolist(), columns = col_names, index = ori_index)

    return one_hot_encoding_df

In [11]:
def integer_encoding(values):
    # first do the one-hot encoding. return a np array of vectors.
    ori_index = values.index
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(values) #  
    return integer_encoded

In [12]:
obj_col_list

['job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'day_of_week',
 'poutcome']

In [13]:
# This part does the transformation of one-hot vectors
num_col_list = list(set(new_df.columns).difference(obj_col_list)) # the list of columns that are of numerical values
num_col_list
new_df_num = new_df[num_col_list]
new_df_num.shape

(41188, 11)

In [14]:
for col in obj_col_list:
    df_mini = one_hot_encoding(new_df[col])
    new_df_num = pd.concat([new_df_num, df_mini], axis=1, join_axes=[new_df_num.index])

In [15]:
new_df_num.shape

(41188, 64)

In [16]:
data_features = new_df_num.drop(labels = 'y', axis = 1)
data_target = new_df_num['y']

## Build cross validation set 

In [37]:
from sklearn.cross_validation import KFold
kfolds = KFold(train_features.shape[0], n_folds = 5)

kfolds

sklearn.cross_validation.KFold(n=32950, n_folds=5, shuffle=False, random_state=None)

In [38]:
train_features.shape

(32950, 63)

In [39]:
from sklearn.model_selection import GridSearchCV
from sklearn import linear_model

SVM_parameters = {'kernel':('sigmoid', 'rbf', 'poly'), 'C':[1, 10, 100]}
lr_parameters = dict(C = [10**i for i in range(-3, 3)],
                  penalty = ['l1', 'l2'])
knn_parameters = {'n_neighbors': [5, 10, 15, 20], 'weights':['uniform', 'distance'], 'leaf_size':[20,30,40]}
nn_parameters = dict(hidden_layer_sizes=[(30,),(50,),(70,),(100,)], activation=['relu', 'tanh', 'logistic'], learning_rate=['constant', 'adaptive'])
decision_parameters = dict(max_depth=[5, 10, 15, 20], min_samples_split=[2,6,12], min_samples_leaf=[1,10,20])

### Logistic Regression GridSearch

In [None]:
lr_grid_search = GridSearchCV(linear_model.LogisticRegression(), lr_parameters, cv = kfolds, scoring = 'roc_auc') 
lr_grid_search.fit(data_features, data_target)


#3rd, get the score of the best model and print it
best_1r_1 = lr_grid_search.best_score_
print(best_1r_1)


In [None]:
best_lr = lr_grid_search.best_estimator_
best_lr

### KNN grid search

In [23]:
from sklearn.neighbors import KNeighborsClassifier
knn_grid_search = GridSearchCV(KNeighborsClassifier(), knn_parameters, cv = kfolds, scoring = 'roc_auc') 
knn_grid_search.fit(data_features, data_target)

#3rd, get the score of the best model and print it
best_knn_1 = knn_grid_search.best_score_
best_knn_1

0.8717344008203417

In [24]:
best_knn = knn_grid_search.best_estimator_
best_knn

KNeighborsClassifier(algorithm='auto', leaf_size=20, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=20, p=2,
           weights='distance')

### Neural network grid search

In [25]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(random_state=43)
nn_grid_search = GridSearchCV(mlp, nn_parameters, cv = kfolds, scoring = 'roc_auc') 
nn_grid_search.fit(data_features, data_target)

#3rd, get the score of the best model and print it
best_nn_1 = nn_grid_search.best_score_
print(best_nn_1)

0.922314328553


In [26]:
best_nn = nn_grid_search.best_estimator_
best_nn

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(50,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=43, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

### Decision Tree grid search

In [27]:
from sklearn.tree import DecisionTreeClassifier
dt_grid_search = GridSearchCV(DecisionTreeClassifier(), decision_parameters, cv = kfolds, scoring = 'roc_auc') 
dt_grid_search.fit(data_features, data_target)

#3rd, get the score of the best model and print it
best_dt_1 = dt_grid_search.best_score_
print(best_dt_1)

0.896178304467


In [28]:
dt_grid_search.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=20,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

# SVM Grid Search

from sklearn import svm
svm_grid_search = GridSearchCV(svm.SVC(), SVM_parameters, cv = kfolds, scoring = 'roc_auc') 
svm_grid_search.fit(data_features, data_target)

#3rd, get the score of the best model and print it
best_svm_1 = svm_grid_search.best_score_
print(best_svm_1)

best_svm = svm_grid_search.best_estimator_

## Naive Bayes

In [29]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

# defines a classification tree
def NBClassifier(X_train,y_train,X_test,y_test, auc):
    clf = GaussianNB()
    clf.fit(X_train,y_train)
    if (auc==0):
        cm = confusion_matrix(clf.predict(X_test),y_test)
        return cm # (cm[0][0]+cm[1][1])/float(sum(cm))
    else:
        return roc_auc_score(y_test,clf.predict_proba(X_test)[:,1])

In [30]:
auc = NBClassifier(train_features,train_target,test_features,test_target, 1)
auc

0.82883285090516601

## Neural Net

In [31]:
from sklearn.neural_network import MLPClassifier

# defines a classification tree
def NNClassifier(X_train,y_train,X_test,y_test, auc):
    clf = MLPClassifier(hidden_layer_sizes=(30, ),learning_rate='adaptive', random_state=43)
    clf.fit(X_train,y_train)
    if (auc==0):
        cm = confusion_matrix(clf.predict(X_test),y_test)
        return cm # (cm[0][0]+cm[1][1])/float(sum(cm))
    else:
        return roc_auc_score(y_test,clf.predict_proba(X_test)[:,1])

In [32]:
auc = NNClassifier(train_features,train_target,test_features,test_target, 1)
auc

0.93890782558775565

# KNN Classifier

In [33]:
from sklearn.neighbors import KNeighborsClassifier

# defines a classification tree
def KNNClassifier(X_train,y_train,X_test,y_test, auc):
    clf = KNeighborsClassifier(n_neighbors=15)
    clf.fit(X_train,y_train)
    if (auc==0):
        cm = confusion_matrix(clf.predict(X_test),y_test)
        return cm # (cm[0][0]+cm[1][1])/float(sum(cm))
    else:
        return roc_auc_score(y_test,clf.predict_proba(X_test)[:,1])

In [34]:
auc = KNNClassifier(train_features,train_target,test_features,test_target, 1)
auc

0.91377501444355524

# SVM Classifier

In [35]:
from sklearn import svm

# defines a classification tree
def SVMClassifier(X_train,y_train,X_test,y_test, auc):
    clf = svm.SVC(C=1.0, kernel='rbf', probability=True)
    clf.fit(X_train,y_train)
    if (auc==0):
        cm = confusion_matrix(clf.predict(X_test),y_test)
        return cm # (cm[0][0]+cm[1][1])/float(sum(cm))
    else:
        return roc_auc_score(y_test,clf.decision_function(X_test))

In [36]:
auc = SVMClassifier(train_features,train_target,test_features,test_target, 1)
auc

KeyboardInterrupt: 

# XGBoost

In [None]:
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# fit model no training data
model = XGBClassifier()
model.fit(train_features, train_target)

# make predictions for test data
xgboost_pred = model.predict(test_features)
predictions = [round(value) for value in xgboost_pred]

# evaluate predictions
accuracy = accuracy_score(test_target, xgboost_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

### Logistic regression
The binary classification goal is to predict if the client will subscribe a bank term deposit (variable y).

In [None]:
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn import metrics
import matplotlib.pyplot as plt

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(train_features, train_target)

# Make predictions using the testing set
predict_y = regr.predict(test_features)


# The predict score
print("Mean accuracy Score: %.2f" % regr.score(test_features,test_target))
# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(test_target, predict_y))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(test_target, predict_y))
# The coefficients
# print('Coefficients: \n', regr.coef_)


fpr, tpr, thresholds = roc_curve(test_target, predict_y)
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, lw=1,label="ROC of lr" +"(AUC = %0.4f)" %roc_auc)

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve and AUC')
plt.legend(loc="lower right")
plt.show()


tn, fp, fn, tp = confusion_matrix(test_target, predict_y).ravel()
print(tn, fp, fn, tp)
##################################################
from sklearn.model_selection import cross_val_score
scores = cross_val_score(regr, new_df_num.drop(labels = 'y', axis = 1), new_df_num["y"], cv=50)
print(scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

### Decision Tree

In [None]:
import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

# defines a classification tree
def testTrees(X_train,y_train,X_test,y_test,dep,leaf,auc):
    clf = DecisionTreeClassifier(criterion='entropy',min_samples_leaf=leaf,max_depth=dep)
    clf = clf.fit(X_train,y_train)
    if (auc==0):
        cm = confusion_matrix(clf.predict(X_test),y_test)
        return (cm[0][0]+cm[1][1])/float(sum(cm))
    else:
        return roc_auc_score(y_test,clf.predict_proba(X_test)[:,1])

In [None]:
import matplotlib.pyplot as plt

lab='y_buy'

depths=[4,5,10,20]
leaves=np.arange(1,101)

#Run all of the options
run=1
if (run==1):
    #Initialize dictionary of results
    res=dict()
    for d in depths:
        res[d]=list()

    #Now train and get results for each option
    for d in depths:
        for l in leaves:
            res[d].append(testTrees(train_features,train_target,test_features,test_target, d, l, 1))


#Now plot            
fig = plt.figure()
ax=fig.add_subplot(111)
plt.plot(leaves,res[depths[0]],'b-',label='Depth={}'.format(depths[0]))
plt.plot(leaves,res[depths[1]],'r-',label='Depth={}'.format(depths[1]))
plt.plot(leaves,res[depths[2]],'y-',label='Depth={}'.format(depths[2]))
plt.plot(leaves,res[depths[3]],'g-',label='Depth={}'.format(depths[3]))
plt.legend(loc=4)
ax.set_xlabel('Min Leaf Size')
ax.set_ylabel('Test Set AUC')
plt.title('Holdout AUC by Hyperparameters')

In [None]:
plt.show()

### Now it's random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
# import course_utils as bd
# imp.reload(bd)

In [None]:
# We'll build a RF and compare to a DT
clf_def = DecisionTreeClassifier(criterion='entropy', min_samples_leaf = 20)
clf_def = clf_def.fit(train_features, train_target)
# dt_pred = clf_def.predict_proba(test_features)
dt_pred = clf_def.predict(test_features)

rf_def = RandomForestClassifier(criterion='entropy', n_estimators=100)
rf_def = rf_def.fit(train_features, train_target)
# rf_pred = rf_def.predict_proba(test_features)
rf_pred = rf_def.predict(test_features)

In [None]:
# A function to plot AUC, from course Git

def plotAUC(truth, pred, lab):
    fpr, tpr, thresholds = roc_curve(truth, pred)
    roc_auc = auc(fpr, tpr)
    c = (np.random.rand(), np.random.rand(), np.random.rand())
    plt.plot(fpr, tpr, color=c, label= lab+' (AUC = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.title('ROC')
    plt.legend(loc="lower right")

In [None]:
from sklearn.metrics import roc_curve, auc, roc_auc_score

plotAUC(test_target, dt_pred, 'DT')
plotAUC(test_target, rf_pred, 'RF')
plt.show()

# Evaluation

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
# lr_grid_search, knn_grid_search,nn_grid_search,dt_grid_search
# test set features and labels

test_X = test_features # test_features.drop(labels = 'y', axis = 1)
test_Y = test_target #new_df_num['y']

models = [best_lr ] #lr_grid_search,knn_grid_search,nn_grid_search,dt_grid_search]
modelsName = ["lr_grid_search" ] # ,"knn_grid_search","nn_grid_search","dt_grid_search"]

for m,n in zip(models, modelsName):
    predict_y = m.predict(test_X)
    # The mean squared error
    print("Model: " + str(n) + "-------------------------------------")
    print("Mean squared error: %.4f" % mean_squared_error(test_Y, predict_y))
    # Explained variance score: 1 is perfect prediction
    print('Variance score: %.4f' % r2_score(test_Y, predict_y))
    # The f1 score
    print("F1 score:%0.4f" % f1_score(test_Y, predict_y))
    # Precision
    print("Precision:%0.4f" % precision_score(test_Y, predict_y,average='weighted'))
    # Confusion matrix
    cnf_matrix = confusion_matrix(test_Y, predict_y)
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=['y', 'no'], normalize=True,title='Normalized confusion matrix')
    plot.show()
    

print("done")

In [None]:
# ROC for test set
for m,n in zip(models, modelsName):
    result = m.predict_proba(test_X)[:, 1]
    fpr, tpr, thresholds = roc_curve(test_Y, result)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=1,label="ROC of "+ str(n) +"(AUC = %0.4f)" %roc_auc)

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve and AUC')
plt.legend(loc="lower right")
plt.show()