# THOMAS NGO
# TEVIN VU

# 2. Use read_csv() to load and examine the training and test sets. Unlike most CSV files, the separator is actually ';' rather than ','.

In [1]:
import pandas as pd
df_train = pd.read_csv('bank-additional.csv', sep=';')
df_test = pd.read_csv('bank-additional-full.csv', sep=';')
df_train
# df_train.columns
# df_test.columns

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,30,blue-collar,married,basic.9y,no,yes,no,cellular,may,fri,...,2,999,0,nonexistent,-1.8,92.893,-46.2,1.313,5099.1,no
1,39,services,single,high.school,no,no,no,telephone,may,fri,...,4,999,0,nonexistent,1.1,93.994,-36.4,4.855,5191.0,no
2,25,services,married,high.school,no,yes,no,telephone,jun,wed,...,1,999,0,nonexistent,1.4,94.465,-41.8,4.962,5228.1,no
3,38,services,married,basic.9y,no,unknown,unknown,telephone,jun,fri,...,3,999,0,nonexistent,1.4,94.465,-41.8,4.959,5228.1,no
4,47,admin.,married,university.degree,no,yes,no,cellular,nov,mon,...,1,999,0,nonexistent,-0.1,93.200,-42.0,4.191,5195.8,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4114,30,admin.,married,basic.6y,no,yes,yes,cellular,jul,thu,...,1,999,0,nonexistent,1.4,93.918,-42.7,4.958,5228.1,no
4115,39,admin.,married,high.school,no,yes,no,telephone,jul,fri,...,1,999,0,nonexistent,1.4,93.918,-42.7,4.959,5228.1,no
4116,27,student,single,high.school,no,no,no,cellular,may,mon,...,2,999,1,failure,-1.8,92.893,-46.2,1.354,5099.1,no
4117,58,admin.,married,high.school,no,no,no,cellular,aug,fri,...,1,999,0,nonexistent,1.4,93.444,-36.1,4.966,5228.1,no


# 3. The training and test DataFrames will need some significant preprocessing before they can be used:
- Several of the features are categorical variables and will need to be turned into numbers before they can be used by ML algorithms. The simplest way to accomplish this is to use dummy coding using get_dummies().
- Some algorithms (e.g. logistic regression) have problems with collinear features. If you use one-hot encoding, one dummy variable will be a linear combination of the other dummy variables, so be sure to pass drop_first=True.
- Per bank-additional-names.txt, the feature duration “should be discarded if the intention is to have a realistic predictive model,” so removed.
- The feature y (or y_yes after dummy coding) is the target, so should be removed.
- Some algorithms (e.g. KNN and SVM) require non-categorical features to be standardized.

In [2]:
prep_data_train = pd.get_dummies(df_train, drop_first=True)
prep_data_test = pd.get_dummies(df_test, drop_first=True)
prep_data_train

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_nonexistent,poutcome_success,y_yes
0,30,487,2,999,0,-1.8,92.893,-46.2,1.313,5099.1,...,0,0,0,0,0,0,0,1,0,0
1,39,346,4,999,0,1.1,93.994,-36.4,4.855,5191.0,...,0,0,0,0,0,0,0,1,0,0
2,25,227,1,999,0,1.4,94.465,-41.8,4.962,5228.1,...,0,0,0,0,0,0,1,1,0,0
3,38,17,3,999,0,1.4,94.465,-41.8,4.959,5228.1,...,0,0,0,0,0,0,0,1,0,0
4,47,58,1,999,0,-0.1,93.200,-42.0,4.191,5195.8,...,1,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4114,30,53,1,999,0,1.4,93.918,-42.7,4.958,5228.1,...,0,0,0,0,1,0,0,1,0,0
4115,39,219,1,999,0,1.4,93.918,-42.7,4.959,5228.1,...,0,0,0,0,0,0,0,1,0,0
4116,27,64,2,999,1,-1.8,92.893,-46.2,1.354,5099.1,...,0,0,0,1,0,0,0,0,0,0
4117,58,528,1,999,0,1.4,93.444,-36.1,4.966,5228.1,...,0,0,0,0,0,0,0,1,0,0


In [3]:
train_data = prep_data_train.drop(columns=['duration', 'y_yes'])
test_data = prep_data_test.drop(columns=['duration', 'y_yes'])
# train_data.columns
# train_data

In [4]:
from sklearn.preprocessing import StandardScaler
dnames = ['train', 'test']
dataset = {
    'train': {
        'df': df_train,
        'data': train_data,
        'p_data': None,
    },
    'test': {
        'df': df_test,
        'data': test_data,
        'p_data': None,
    }
}

# for name in dnames:
#     scaler = StandardScaler()
#     dataset[name]['sdd'] = scaler.fit_transform(dataset[name]['data'])
#     dataset[name]['p_data'] = pd.DataFrame(dataset[name]['sdd'], columns=dataset[name]['data'].columns)

for name in dnames:
    dataset[name]['p_data'] = (dataset[name]['data'] - dataset[name]['data'].mean()) / dataset[name]['data'].std()


dataset['train']['p_data']
# dataset['test']['p_data']

Unnamed: 0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,job_blue-collar,...,month_may,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_nonexistent,poutcome_success
0,-0.980633,-0.209203,0.201007,-0.351313,-1.205908,-1.185304,-1.240788,-1.331546,-0.914668,1.91275,...,1.410189,-0.348421,-0.13051,-0.125615,-0.511747,-0.513635,-0.506455,-0.488990,0.411258,-0.188936
1,-0.107978,0.569565,0.201007,-0.351313,0.649363,0.715106,0.892161,0.711612,0.332822,-0.52268,...,1.410189,-0.348421,-0.13051,-0.125615,-0.511747,-0.513635,-0.506455,-0.488990,0.411258,-0.188936
2,-1.465441,-0.598587,0.201007,-0.351313,0.841287,1.528088,-0.283138,0.773334,0.836434,-0.52268,...,-0.708953,-0.348421,-0.13051,-0.125615,-0.511747,-0.513635,-0.506455,2.044533,0.411258,-0.188936
3,-0.204940,0.180181,0.201007,-0.351313,0.841287,1.528088,-0.283138,0.771603,0.836434,-0.52268,...,-0.708953,-0.348421,-0.13051,-0.125615,-0.511747,-0.513635,-0.506455,-0.488990,0.411258,-0.188936
4,0.667714,-0.598587,0.201007,-0.351313,-0.118336,-0.655398,-0.326667,0.328592,0.397979,-0.52268,...,-0.708953,2.869395,-0.13051,-0.125615,1.953616,-0.513635,-0.506455,-0.488990,0.411258,-0.188936
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4114,-0.980633,-0.598587,0.201007,-0.351313,0.841287,0.583924,-0.479021,0.771026,0.836434,-0.52268,...,-0.708953,-0.348421,-0.13051,-0.125615,-0.511747,1.946436,-0.506455,-0.488990,0.411258,-0.188936
4115,-0.107978,-0.598587,0.201007,-0.351313,0.841287,0.583924,-0.479021,0.771603,0.836434,-0.52268,...,-0.708953,-0.348421,-0.13051,-0.125615,-0.511747,-0.513635,-0.506455,-0.488990,0.411258,-0.188936
4116,-1.271518,-0.209203,0.201007,1.494426,-1.205908,-1.185304,-1.240788,-1.307895,-0.914668,-0.52268,...,1.410189,-0.348421,-0.13051,-0.125615,1.953616,-0.513635,-0.506455,-0.488990,-2.430975,-0.188936
4117,1.734292,-0.598587,0.201007,-0.351313,0.841287,-0.234236,0.957455,0.775641,0.836434,-0.52268,...,-0.708953,-0.348421,-0.13051,-0.125615,-0.511747,-0.513635,-0.506455,-0.488990,0.411258,-0.188936


# 4.Fit Naive Bayes, KNN, and SVM classifiers to the training set, then score each classifier on the test set. Which classifier has the highest accuracy?

In [5]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
# find score for Gaussian Naive Bayes classifer
gnb = GaussianNB()
gnb.fit(dataset['train']['p_data'], prep_data_train['y_yes'])
gnb_score = gnb.score(dataset['test']['p_data'], prep_data_test['y_yes'])
print('GNB classifer score = ', gnb_score)

# find score for KNN classifer
knn = KNeighborsClassifier()
knn.fit(dataset['train']['p_data'], prep_data_train['y_yes'])
knn_score = knn.score(dataset['test']['p_data'], prep_data_test['y_yes'])
print('KNN classifer score = ', knn_score)

# find score for SVM classifer
svm = SVC()
svm.fit(dataset['train']['p_data'], prep_data_train['y_yes'])
svm_score = svm.score(dataset['test']['p_data'], prep_data_test['y_yes'])
print('SVM classifer score = ', svm_score)

GNB classifer score =  0.8873458288821987
KNN classifer score =  0.8938040205885209
SVM classifer score =  0.8981985044187627


## Which classifier has the highest accuracy?
the SVM classifier has the highest accuracy

# 5. These numbers look pretty good, but let’s take another look at the data. How many values in the training set have y_yes = 0, and how many have y_yes = 1? What would be the accuracy if we simply assumed that no customer ever subscribed to the product?

In [6]:
y_yes_0 = prep_data_test[prep_data_test.y_yes == 0].shape[0]
y_yes_1 = prep_data_test[prep_data_test.y_yes == 1].shape[0]

print('total of y_yes=0 is', y_yes_0)
print('total of y_yes=1 is', y_yes_1)

total of y_yes=0 is 36548
total of y_yes=1 is 4640


## What would be the accuracy if we simply assumed that no customer ever subscribed to the product?
- The accuracy would be 36548/(36548+4640) = 0.88 or 88% if we simply assume that no customer ever subscribe to the product


# 6. Use np.zeros_like() to create a target vector representing the output of the “dumb” classifier of experiment (5), then create a confusion matrix and find its AUC

In [7]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
import numpy as np
y_pred = np.zeros_like(prep_data_test.y_yes)
y_true = prep_data_test['y_yes']
dumb_con_mat = confusion_matrix(y_true, y_pred)
print('confusion matrix\n', dumb_con_mat)
print('roc_auc_score of dumb classifer = ', roc_auc_score(y_true, y_pred))

confusion matrix
 [[36548     0]
 [ 4640     0]]
roc_auc_score of dumb classifer =  0.5


# 7. Create a confusion matrix and find the AUC for each of the classifiers of experiment (4). Is the best classifier the one with the highest accuracy?


In [8]:
# GNB
gnb_y_pred = gnb.predict(dataset['test']['p_data'])
gnb_con_mat = confusion_matrix(y_true, gnb_y_pred)
print('confusion matrix of GNB\n',gnb_con_mat)
roc_auc_score_gnb = roc_auc_score(y_true, gnb_y_pred)
print('roc_auc_score of GNB =', roc_auc_score_gnb)
print('\n')

# KNN
knn_y_pred = knn.predict(dataset['test']['p_data'])
knn_con_mat = confusion_matrix(y_true, knn_y_pred)
print('confusion matrix of KNN\n', knn_con_mat)
roc_auc_score_knn = roc_auc_score(y_true, knn_y_pred)
print('roc_auc_score of KNN =', roc_auc_score_knn)
print('\n')

# SVM
svm_y_pred = svm.predict(dataset['test']['p_data'])
svm_con_mat = confusion_matrix(y_true, svm_y_pred)
print('confusion matrix of SVM\n', svm_con_mat)
roc_auc_score_svm = roc_auc_score(y_true, svm_y_pred)
print('roc_auc_score of SVM =', roc_auc_score_svm)
print('\n')


confusion matrix of GNB
 [[36548     0]
 [ 4640     0]]
roc_auc_score of GNB = 0.5


confusion matrix of KNN
 [[35674   874]
 [ 3500  1140]]
roc_auc_score of KNN = 0.6108879489608374


confusion matrix of SVM
 [[36193   355]
 [ 3838   802]]
roc_auc_score of SVM = 0.5815657868914946




## Is the best classifier the one with the highest accuracy?
- No, the best classifier, which is KNN, is not the one with the highest accuracy

# 8. One of the easiest ways to deal with an unbalanced dataset is random oversampling. This can be done with an imblearn.over_sampling.RandomOverSampler object. Use fit_resample() to generate a balanced training set.

In [13]:
# balance dataset after standardization
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler()
X_res, y_res = ros.fit_resample(dataset['train']['p_data'], prep_data_train['y_yes'])

# X_res
# y_res


In [10]:
# test: balance dataset before standardization

# from imblearn.over_sampling import RandomOverSampler
# ros = RandomOverSampler(random_state=42)
# X_res, y_res = ros.fit_resample(df_train, df_train['y'])
# X_res = pd.get_dummies(X_res, drop_first=True)
# y_res = pd.get_dummies(y_res, drop_first=True)
# X_res = X_res.drop(columns=['duration', 'y_yes'])

# X_res = (X_res - X_res.mean()) / X_res.std()
# # X_res
# # y_res



# 9. Repeat experiments (4) and (7) on the balanced training set of experiment (8). Which classifier performs the best, and how much better is its performance?

In [11]:
# experiment (4): Fit Naive Bayes, KNN, and SVM classifiers to the training set, 
# then score each classifier on the test set. Which classifier has the highest accuracy?

# find score for Gaussian Naive Bayes classifer
gnb_ros = GaussianNB()
gnb_ros.fit(X_res, y_res)
gnb_score_ros = gnb_ros.score(X_res, y_res)
print('GNB classifer score = ', gnb_score_ros)

# find score for KNN classifer
knn_ros = KNeighborsClassifier()
knn_ros.fit(X_res, y_res)
knn_score_ros = knn_ros.score(X_res, y_res)
print('KNN classifer score = ', knn_score_ros)

# find score for SVM classifer
svm_ros = SVC()
svm_ros.fit(X_res, y_res)
svm_score_ros = svm_ros.score(X_res, y_res)
print('SVM classifer score = ', svm_score_ros)

GNB classifer score =  0.5389858233369684
KNN classifer score =  0.923391494002181
SVM classifer score =  0.8545528898582334


## Which classifier has the highest accuracy?
KNN is has the highest accuracy

In [12]:
# experiment (7): Create a confusion matrix and find the AUC for each of the classifiers of experiment (4). 
# Is the best classifier the one with the highest accuracy?

# GNB
gnb_ros_y_pred = gnb_ros.predict(dataset['test']['p_data'])
gnb_ros_con_mat = confusion_matrix(y_true, gnb_ros_y_pred)
print('confusion matrix of GNB\n',gnb_ros_con_mat)
roc_auc_score_gnb_ros = roc_auc_score(y_true, gnb_ros_y_pred)
print('roc_auc_score of GNB =', roc_auc_score_gnb_ros)
print('\n')

# KNN
knn_ros_y_pred = knn_ros.predict(dataset['test']['p_data'])
knn_ros_con_mat = confusion_matrix(y_true, knn_ros_y_pred)
print('confusion matrix of KNN\n', knn_ros_con_mat)
roc_auc_score_knn_ros = roc_auc_score(y_true, knn_ros_y_pred)
print('roc_auc_score of KNN =', roc_auc_score_knn_ros)
print('\n')

# SVM
svm_ros_y_pred = svm_ros.predict(dataset['test']['p_data'])
svm_ros_con_mat = confusion_matrix(y_true, svm_ros_y_pred)
print('confusion matrix of SVM\n', svm_ros_con_mat)
roc_auc_score_svm_ros = roc_auc_score(y_true, svm_ros_y_pred)
print('roc_auc_score of SVM =', roc_auc_score_svm_ros)
print('\n')

confusion matrix of GNB
 [[36548     0]
 [ 4640     0]]
roc_auc_score of GNB = 0.5


confusion matrix of KNN
 [[28991  7557]
 [ 1975  2665]]
roc_auc_score of KNN = 0.6837921340098803


confusion matrix of SVM
 [[32189  4359]
 [ 2102  2538]]
roc_auc_score of SVM = 0.713857473214252




## Is the best classifier the one with the highest accuracy?
No, the best classifier which is SVM is not the one with the highest accuracy

## Which classifier performs the best, and how much better is its performance?
SVM classifier performs the best, and SVM is about 21% better than GNB and about 3% better than KNN