In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the semicolon-separated bank marketing file and preview its first rows
bank_info = pd.read_csv('bank-additional-full.csv', sep=";")
bank_info.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [3]:
# Numerical columns taken over unchanged (no encoding): age, duration,
# campaign (kept as call_time), previous (kept as prev_contact), euribor3m,
# emp.var.rate, cons.price.idx, cons.conf.idx and nr.employed
age = bank_info['age']
duration = bank_info['duration']
call_time = bank_info['campaign']
prev_contact = bank_info['previous']
euribor3m = bank_info['euribor3m']
emp_var_rate = bank_info['emp.var.rate']
cons_price_idx = bank_info['cons.price.idx']
cons_conf_idx = bank_info['cons.conf.idx']
number_employed = bank_info['nr.employed']

In [4]:
# One-hot encode the job column; the first level is dropped as the reference category
job = pd.get_dummies(bank_info['job'], prefix='job', drop_first=True)

In [5]:
# Silence pandas' chained-assignment (SettingWithCopy) warning for the regrouping cells
pd.set_option('mode.chained_assignment', None)

In [6]:
# Regroup marital status into two levels and one-hot encode it:
#   divorced / married -> married_current_or_before
#   single / unknown   -> not_married
# The regrouping is done on a derived Series rather than by assigning through an
# alias of `bank_info.marital`: that assignment silently rewrote the raw frame and
# its effect depends on pandas' Copy-on-Write mode.
marital_groups = {
    'divorced': 'married_current_or_before',
    'married': 'married_current_or_before',
    'single': 'not_married',
    'unknown': 'not_married',
}
m = bank_info['marital'].replace(marital_groups)
# The first level (married_current_or_before) is dropped as the reference category,
# leaving a single `not_married` indicator column.
marital = pd.get_dummies(m, drop_first=True)

In [7]:
# Regroup education level and one-hot encode it:
#   illiterate / basic.4y / basic.6y / basic.9y         -> high_school_below
#   university.degree / professional.course / unknown   -> high_school_above
# 'high.school' is not remapped and stays as its own level.
# The regrouping is done on a derived Series rather than by assigning through an
# alias of `bank_info.education`, so the raw frame is left untouched.
education_groups = {
    'illiterate': 'high_school_below',
    'basic.4y': 'high_school_below',
    'basic.6y': 'high_school_below',
    'basic.9y': 'high_school_below',
    'university.degree': 'high_school_above',
    'professional.course': 'high_school_above',
    'unknown': 'high_school_above',
}
e = bank_info['education'].replace(education_groups)
# The first level in sorted order is dropped as the reference category.
education = pd.get_dummies(e, drop_first=True)

In [8]:
# One-hot encode the contact communication type, dropping the first level as reference
contact = pd.get_dummies(bank_info['contact'], drop_first=True)

In [9]:
# One-hot encode the month of the last contact, dropping the first level as reference
month = pd.get_dummies(bank_info['month'], drop_first=True)

In [10]:
# One-hot encode the weekday of the last contact, then remove the first dummy column
# so that it acts as the reference level
weekday_dummies = pd.get_dummies(bank_info['day_of_week'])
day_of_week = weekday_dummies.drop(weekday_dummies.columns[0], axis=1)

In [11]:
# Encode pdays (days since the client was last contacted in a previous campaign) as a
# single indicator column named '0-27': set for every row whose pdays is not 999.
# NOTE(review): 999 looks like the "not previously contacted" sentinel — confirm against
# the data description.
# The grouping is computed on a derived Series of strings. The original wrote the string
# '0-27' into the integer pdays column in place, which mutates the raw frame and is a
# deprecated implicit upcast in recent pandas.
p = (bank_info['pdays'] != 999).map({True: '0-27', False: '999'})
# Keep the '0-27' indicator by name instead of relying on which column sorts first.
last_contact = pd.get_dummies(p)[['0-27']]

In [12]:
# One-hot encode the outcome of the previous campaign; first level is the reference
prev_outcome = pd.get_dummies(bank_info['poutcome'], prefix='prev_outcome', drop_first=True)

In [13]:
# One-hot encode the housing-loan and personal-loan columns, dropping the first
# level of each as the reference category
hou_loan = pd.get_dummies(bank_info['housing'], prefix='hou_loan', drop_first=True)
per_loan = pd.get_dummies(bank_info['loan'], prefix='per_loan', drop_first=True)

In [14]:
# One-hot encode the credit-default column after merging 'yes' into 'unknown'.
# The merge is done on a derived Series rather than by assigning through an alias of
# `bank_info.default`, so the raw frame is left untouched.
defa = bank_info['default'].replace({'yes': 'unknown'})
# The first level is dropped as the reference category.
default = pd.get_dummies(defa, prefix='default', drop_first=True)

In [15]:
# Target column: the 'yes' indicator of bank_info.y, stored under the name "y"
output = pd.get_dummies(bank_info['y'])['yes'].rename("y")

In [16]:
# Assemble the full rearranged data set: all feature blocks side by side, target last
feature_blocks = [
    age, job, marital, education, default, hou_loan, per_loan, contact, month,
    duration, call_time, day_of_week, last_contact, prev_contact, prev_outcome,
    emp_var_rate, number_employed, cons_price_idx, cons_conf_idx, euribor3m,
]
data_df = pd.concat(feature_blocks + [output], axis=1)
# Show every column whenever a frame is displayed
pd.set_option('display.max_columns', None)
data_df.head()
data_df.shape

(41188, 46)

In [17]:
# Persist the full rearranged data set (without the index) for later training runs
data_df.to_csv('../capstone_project_1/rearranged_whole_data_set.csv', index=False)

In [17]:
# Shuffle the rows and split them 70% / 20% / 10% into training, validation and
# final-test sets.
# The frame is cast to float before shuffling: when the dummy columns are boolean
# (the get_dummies default in pandas >= 2.0) the implicit DataFrame -> ndarray
# conversion produces an object array, which the estimators below may reject as a
# label type. For an all-numeric frame the values are unchanged.
np.random.seed(101)  # fixed seed so the shuffle is reproducible
data = np.random.permutation(data_df.astype(float).values)
n_rows = len(data)
train, validate, test = np.split(data, [int(.7 * n_rows), int(.9 * n_rows)])

# The last column is the target `y`; every column before it is a feature.
# Naming: X_test / Y_test hold the validation split, X_final / Y_final the final-test split.
X_train, Y_train = train[:, :-1], train[:, -1]
X_test, Y_test = validate[:, :-1], validate[:, -1]
X_final, Y_final = test[:, :-1], test[:, -1]

for split in (X_train, X_test, X_final, Y_train, Y_test, Y_final):
    print(split.shape)

(28831, 45)
(8238, 45)
(4119, 45)
(28831,)
(8238,)
(4119,)


In [18]:
# Fraction of positive labels in the training split and in the validation split
print(Y_train.mean(), Y_test.mean())

0.111234435157 0.116290361738


#### The cell above checks that the training and validation sets have about the same class ratio. It also shows that the data is imbalanced: only ~11% of the outputs are "1".

In [19]:
# Standardize the features: the scaler is fitted on the training split only and the
# same transformation is then applied to the validation, final-test and training splits
scaler = StandardScaler().fit(X_train)
X_test, X_final, X_train = (
    scaler.transform(split) for split in (X_test, X_final, X_train)
)
X_train

array([[-0.7709298 , -0.53710216, -0.19402632, ...,  0.72247186,
         0.88520024,  0.70991597],
       [ 1.62982596, -0.53710216, -0.19402632, ..., -0.22856308,
         0.95009242,  0.77112035],
       [-0.67489957, -0.53710216, -0.19402632, ...,  0.72247186,
         0.88520024,  0.70818377],
       ..., 
       [-1.92329256, -0.53710216, -0.19402632, ...,  0.59105613,
        -0.47753557,  0.76881075],
       [ 0.76555389, -0.53710216, -0.19402632, ..., -0.22856308,
         0.95009242,  0.77054295],
       [-0.96299026,  1.8618432 , -0.19402632, ...,  0.59105613,
        -0.47753557,  0.76996555]])

In [22]:
# Baseline: logistic regression with default settings (no class re-weighting)
model1 = LogisticRegression()
model1.fit(X=X_train, y=Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [23]:
# Tune C and tol for the unweighted model with a 5-fold grid search scored on accuracy.
# NOTE(review): selection here uses accuracy, while the model comparison further down
# is argued on the F1 score of the "yes" class — confirm this is intended.
parameters = dict(C=[0.1, 1, 10, 20], tol=[1e-05, 1e-04, 0.001])
clf = GridSearchCV(estimator=model1, param_grid=parameters, scoring='accuracy', cv=5)
clf.fit(X_train, Y_train)
(clf.best_estimator_,
 clf.best_params_,
 clf.cv_results_['params'],
 clf.cv_results_['mean_test_score'])

(LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=1e-05,
           verbose=0, warm_start=False),
 {'C': 10, 'tol': 1e-05},
 ({'C': 0.1, 'tol': 1e-05},
  {'C': 0.1, 'tol': 0.0001},
  {'C': 0.1, 'tol': 0.001},
  {'C': 1, 'tol': 1e-05},
  {'C': 1, 'tol': 0.0001},
  {'C': 1, 'tol': 0.001},
  {'C': 10, 'tol': 1e-05},
  {'C': 10, 'tol': 0.0001},
  {'C': 10, 'tol': 0.001},
  {'C': 20, 'tol': 1e-05},
  {'C': 20, 'tol': 0.0001},
  {'C': 20, 'tol': 0.001}),
 array([ 0.91200444,  0.91200444,  0.91200444,  0.9118657 ,  0.9118657 ,
         0.91183102,  0.91221255,  0.91221255,  0.91217786,  0.91221255,
         0.91221255,  0.91221255]))

#### From the results, when the model is trained without balanced class weights, the best C value is 10, which differs from the default. For tol I keep the default of 1e-04, since the scores barely change as tol varies.

In [20]:
# Refit the unweighted logistic regression with the C value chosen by the grid search
model1 = LogisticRegression(C=10)
model1.fit(X=X_train, y=Y_train)

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [25]:
# Logistic regression with balanced class weights, otherwise default settings
model2 = LogisticRegression(class_weight='balanced')
model2.fit(X=X_train, y=Y_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [27]:
# Tune C and tol for the class-weighted model with a 5-fold grid search scored on accuracy
parameters = dict(C=[0.01, 0.1, 1], tol=[1e-4, 0.001, 0.01])
clf = GridSearchCV(estimator=model2, param_grid=parameters, scoring='accuracy', cv=5)
clf.fit(X_train, Y_train)
(clf.best_estimator_,
 clf.best_params_,
 clf.cv_results_['params'],
 clf.cv_results_['mean_test_score'])

(LogisticRegression(C=0.1, class_weight='balanced', dual=False,
           fit_intercept=True, intercept_scaling=1, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           solver='liblinear', tol=0.001, verbose=0, warm_start=False),
 {'C': 0.1, 'tol': 0.001},
 ({'C': 0.01, 'tol': 0.0001},
  {'C': 0.01, 'tol': 0.001},
  {'C': 0.01, 'tol': 0.01},
  {'C': 0.1, 'tol': 0.0001},
  {'C': 0.1, 'tol': 0.001},
  {'C': 0.1, 'tol': 0.01},
  {'C': 1, 'tol': 0.0001},
  {'C': 1, 'tol': 0.001},
  {'C': 1, 'tol': 0.01}),
 array([ 0.85855503,  0.85855503,  0.85876314,  0.86191946,  0.86209289,
         0.8620582 ,  0.86126045,  0.86126045,  0.86126045]))

#### From the results, when the model is trained with balanced class weights, the best C value is 0.1 and the best tol value is 0.001; both differ from their default values.

In [28]:
# Refit the class-weighted logistic regression with the grid-search parameters
model2 = LogisticRegression(C=0.1, tol=0.001, class_weight='balanced')
model2.fit(X=X_train, y=Y_train)

LogisticRegression(C=0.1, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.001, verbose=0, warm_start=False)

In [21]:
# Table of feature names next to their exponentiated model1 coefficients,
# displayed in ascending order of that value
feature_frame = data_df.iloc[:, :-1]  # every column except the target
features = pd.DataFrame({'features': feature_frame.columns})
# model1.coef_ is a single row; transpose it into one row per feature
coefficients = pd.DataFrame(np.exp(model1.coef_)).T
results = pd.concat([features, coefficients], axis=1)
results.columns = ['features', 'coefficients']
results.sort_values(by='coefficients')

Unnamed: 0,features,coefficients
40,emp.var.rate,0.075691
20,telephone,0.770068
26,may,0.787324
24,jun,0.813355
31,campaign,0.880749
1,job_blue-collar,0.887037
15,default_unknown,0.892757
27,nov,0.901712
32,mon,0.947674
4,job_management,0.95721


## Compare weighted and unweighted models:

In [31]:
# Validation-split classification report: all features, class weight not balanced
model1_pred = model1.predict(X_test)
print(classification_report(Y_test, model1_pred, target_names=['no', 'yes']))

             precision    recall  f1-score   support

         no       0.93      0.97      0.95      7280
        yes       0.66      0.41      0.50       958

avg / total       0.89      0.91      0.90      8238



In [32]:
# Validation-split classification report: all features, class weight balanced
model2_pred = model2.predict(X_test)
print(classification_report(Y_test, model2_pred, target_names=['no', 'yes']))

             precision    recall  f1-score   support

         no       0.98      0.86      0.92      7280
        yes       0.45      0.87      0.59       958

avg / total       0.92      0.86      0.88      8238



#### First, since the data set is imbalanced, with far fewer 1s, the 1s ("yes") need the most attention when reading the classification report.
#### Second, for this kind of problem, precision and recall are both important: precision is the percentage of people predicted by the model to subscribe to a term deposit who actually subscribed, and recall is the percentage of people who actually subscribed whom the model predicted to subscribe. Hence the F1 score, the harmonic mean of precision and recall, is used to decide which model is better.
#### From the two classification reports above, the model with balanced class weights has a higher F1 score for the "yes" output (0.59 vs. 0.50).

## Exclude some feature columns that have relatively low exponentiated coefficients (odds ratios) in the features-and-coefficients table above, then train and compare:

In [23]:
# Build a reduced data set that leaves out the five features with the smallest values
# in the features/coefficients table above: contact type, employment variation rate,
# campaign (call_time), and the May and June month dummies.
# NOTE(review): that table holds exponentiated coefficients, so a value far below 1
# indicates a strong negative association rather than an unimportant feature — confirm
# that excluding these columns is intended.
# The month dummies are reduced into a new frame instead of being dropped in place:
# the in-place drop changed `month` for every other cell and raised a KeyError when
# this cell was run a second time.
month_reduced = month.drop(['may', 'jun'], axis=1)
data_df2 = pd.concat([age, job, marital, education, default, hou_loan, per_loan, month_reduced,
                      duration, day_of_week, last_contact, prev_contact, prev_outcome,
                      number_employed, cons_price_idx, cons_conf_idx, euribor3m, output], axis=1)
# Show every column whenever a frame is displayed
pd.set_option('display.max_columns', None)
data_df2.shape

(41188, 41)

In [24]:
# Persist the reduced data set (without the index) for later training runs
data_df2.to_csv('../capstone_project_1/rearranged_optimal_data_set.csv', index=False)

In [25]:
# Shuffle the rows of the reduced data set and split them 70% / 20% / 10% into
# training, validation and final-test sets.
# The frame is cast to float before shuffling: when the dummy columns are boolean
# (the get_dummies default in pandas >= 2.0) the implicit DataFrame -> ndarray
# conversion produces an object array, which the estimators below may reject as a
# label type. For an all-numeric frame the values are unchanged.
np.random.seed(101)  # same seed as the full-feature split, so the row order matches
data = np.random.permutation(data_df2.astype(float).values)
n_rows = len(data)
train, validate, test = np.split(data, [int(.7 * n_rows), int(.9 * n_rows)])

# The last column is the target `y`; every column before it is a feature.
# Naming: X_test / Y_test hold the validation split, X_final / Y_final the final-test split.
X_train, Y_train = train[:, :-1], train[:, -1]
X_test, Y_test = validate[:, :-1], validate[:, -1]
X_final, Y_final = test[:, :-1], test[:, -1]

for split in (X_train, X_test, X_final, Y_train, Y_test, Y_final):
    print(split.shape)

(28831, 40)
(8238, 40)
(4119, 40)
(28831,)
(8238,)
(4119,)


In [26]:
# Standardize the reduced feature set: the scaler is fitted on the training split only
# and then applied to the validation, final-test and training splits
scaler = StandardScaler().fit(X_train)
X_test, X_final, X_train = (
    scaler.transform(split) for split in (X_test, X_final, X_train)
)
X_train

array([[-0.7709298 , -0.53710216, -0.19402632, ...,  0.72247186,
         0.88520024,  0.70991597],
       [ 1.62982596, -0.53710216, -0.19402632, ..., -0.22856308,
         0.95009242,  0.77112035],
       [-0.67489957, -0.53710216, -0.19402632, ...,  0.72247186,
         0.88520024,  0.70818377],
       ..., 
       [-1.92329256, -0.53710216, -0.19402632, ...,  0.59105613,
        -0.47753557,  0.76881075],
       [ 0.76555389, -0.53710216, -0.19402632, ..., -0.22856308,
         0.95009242,  0.77054295],
       [-0.96299026,  1.8618432 , -0.19402632, ...,  0.59105613,
        -0.47753557,  0.76996555]])

In [27]:
# Class-weighted logistic regression on the reduced feature set, default settings otherwise
model3 = LogisticRegression(class_weight='balanced')
model3.fit(X=X_train, y=Y_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [29]:
# Tune C and tol for the reduced-feature model with a 5-fold grid search scored on accuracy
parameters = dict(C=[1, 10, 20], tol=[0.001, 0.01, 0.1])
clf = GridSearchCV(estimator=model3, param_grid=parameters, scoring='accuracy', cv=5)
clf.fit(X_train, Y_train)
(clf.best_estimator_,
 clf.best_params_,
 clf.cv_results_['params'],
 clf.cv_results_['mean_test_score'])

(LogisticRegression(C=10, class_weight='balanced', dual=False,
           fit_intercept=True, intercept_scaling=1, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           solver='liblinear', tol=0.01, verbose=0, warm_start=False),
 {'C': 10, 'tol': 0.01},
 ({'C': 1, 'tol': 0.001},
  {'C': 1, 'tol': 0.01},
  {'C': 1, 'tol': 0.1},
  {'C': 10, 'tol': 0.001},
  {'C': 10, 'tol': 0.01},
  {'C': 10, 'tol': 0.1},
  {'C': 20, 'tol': 0.001},
  {'C': 20, 'tol': 0.01},
  {'C': 20, 'tol': 0.1}),
 array([ 0.86028927,  0.86039333,  0.86035864,  0.86035864,  0.8604627 ,
         0.86032396,  0.86035864,  0.8604627 ,  0.86032396]))

In [30]:
# Refit the reduced-feature, class-weighted logistic regression with the grid-search parameters
model3 = LogisticRegression(C=10, tol=0.01, class_weight='balanced')
model3.fit(X=X_train, y=Y_train)

LogisticRegression(C=10, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.01, verbose=0, warm_start=False)

In [31]:
# Validation-split classification report for the reduced feature set (contact type,
# employment variation rate, campaign and the May/June month dummies excluded),
# class weight balanced
model3_pred = model3.predict(X_test)
print(classification_report(Y_test, model3_pred, target_names=['no', 'yes']))

             precision    recall  f1-score   support

         no       0.98      0.87      0.92      7280
        yes       0.45      0.85      0.59       958

avg / total       0.92      0.86      0.88      8238



#### Comparing the classification report for all features with the report for the reduced feature set, the only difference for the "yes" output is that excluding those features lowers recall slightly, from 0.87 to 0.85; precision (0.45) and F1 score (0.59) are unchanged.