# 2. Use read_csv() to load and examine the training and test sets. Unlike most CSV files, the separator is actually ';' rather than ','.

In [1]:
import pandas as pd
df_train = pd.read_csv('bank-additional.csv', sep=';')
df_test = pd.read_csv('bank-additional-full.csv', sep=';')
# df_train
# df_train.columns
df_test.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')

# 3. The training and test DataFrames will need some significant preprocessing before they can be used:
- Several of the features are categorical variables and will need to be turned into numbers before they can be used by ML algorithms. The simplest way to accomplish this is to use dummy coding using get_dummies().
- Some algorithms (e.g. logistic regression) have problems with collinear features. If you use one-hot encoding, one dummy variable will be a linear combination of the other dummy variables, so be sure to pass drop_first=True.
- Per bank-additional-names.txt, the feature duration “should be discarded if the intention is to have a realistic predictive model,” so removed.
- The feature y (or y_yes after dummy coding) is the target, so should be removed.
- Some algorithms (e.g. KNN and SVM) require non-categorical features to be standardized.

In [2]:
prep_data_train = pd.get_dummies(df_train, drop_first=True)
prep_data_test = pd.get_dummies(df_test, drop_first=True)
# prep_data.columns

In [3]:
train_data = prep_data_train.drop(columns=['duration', 'y_yes'])
test_data = prep_data_test.drop(columns=['duration', 'y_yes'])
# train_data.columns
# train_data

In [4]:
from sklearn.preprocessing import StandardScaler
dnames = ['train', 'test']
# separate categorical and non-categorical features
ori_col = df_train.columns
cat_col = set([x for x in train_data.columns if x not in ori_col])
non_cat_col = [x for x in train_data.columns if x not in cat_col]

# standardize non-categorical features
scaler = StandardScaler()
sdd_train_data = scaler.fit_transform(train_data[non_cat_col])
df_sdd_train_data = pd.DataFrame(sdd_train_data, columns=non_cat_col)

# combine standardize non-categorical with categorical features
p_train_data = pd.concat([df_sdd_train_data, train_data[cat_col]], axis=1)


ori_col = df_test.columns
cat_col = set([x for x in test_data.columns if x not in ori_col])
non_cat_col = [x for x in test_data.columns if x not in cat_col]

# standardize non-categorical features
scaler = StandardScaler()
sdd_test_data = scaler.fit_transform(test_data[non_cat_col])
df_sdd_test_data = pd.DataFrame(sdd_test_data, columns=non_cat_col)

# combine standardize non-categorical with categorical features
p_test_data = pd.concat([df_sdd_test_data, test_data[cat_col]], axis=1)
# p_train_data
# p_test_data

# 4.Fit Naive Bayes, KNN, and SVM classifiers to the training set, then score each classifier on the test set. Which classifier has the highest accuracy?

In [5]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
# find score for Gaussian Naive Bayes classifer
gnb = GaussianNB()
gnb.fit(p_train_data, df_train['y'])
print('GNB classifer score = ', gnb.score(p_test_data, df_test['y']))

# find score for KNN classifer
knn = KNeighborsClassifier()
knn.fit(p_train_data, df_train['y'])
print('KNN classifer score = ', knn.score(p_test_data, df_test['y']))

# find score for SVM classifer
svm = KNeighborsClassifier()
svm.fit(p_train_data, df_train['y'])
print('SVM classifer score = ', svm.score(p_test_data, df_test['y']))

GNB classifer score =  0.5962901816062931
KNN classifer score =  0.8929057006895212
SVM classifer score =  0.8929057006895212
