In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#load data
bank_info = pd.read_csv('bank-additional-full.csv', delimiter=";")
bank_info.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [3]:
# Leaving age, duration, call_time, previous contact, euribor3m, employment variation rate, 
# consumer price index, consumer confidence index and number of employees as numerical columns
age = bank_info.age
duration = bank_info.duration
call_time = bank_info.campaign
prev_contact = bank_info.previous
euribor3m = bank_info.euribor3m
emp_var_rate = bank_info['emp.var.rate']
cons_price_idx = bank_info['cons.price.idx']
cons_conf_idx = bank_info['cons.conf.idx']
number_employed = bank_info['nr.employed']

In [4]:
# Rearrange the job column
j = bank_info.job
job = pd.get_dummies(j, prefix='job')
job.drop([job.columns[0]], axis=1, inplace=True)

In [5]:
# Force to stop showing warning for assigning grouped data the same categorical names
pd.options.mode.chained_assignment = None

In [6]:
# Rearrange the marital column, regroup by married and unknown
m = bank_info.marital
#m[(m == 'divorced')|(m == 'married')].loc[:]= 'married_current_or_before'
m[(m == 'divorced')|(m == 'married')] = 'married_current_or_before'
m[(m == 'single') | (m == 'unknown')] = 'not_married'
marital = pd.get_dummies(m)
marital.drop([marital.columns[0]], axis=1, inplace=True)

In [7]:
# Rearrange the education level column, regroup by high school below and above
e = bank_info.education
e[(e == 'illiterate')|(e == 'basic.4y')|(e == 'basic.6y')|(e == 'basic.9y')]= 'high_school_below'
e[(e == 'university.degree')|(e == 'professional.course')|(e == 'unknown')] = 'high_school_above'
education = pd.get_dummies(e)
education.drop([education.columns[0]], axis=1, inplace=True)

In [8]:
# Rearrange the contact communication type column
contact = pd.get_dummies(bank_info.contact)
contact.drop([contact.columns[0]], axis=1, inplace=True)

In [9]:
# Rearrange the column for number of employees - quarterly indicator
#ne = bank_info['nr.employed']
#ne[(ne == 4963.6)|(ne == 4991.6)|(ne == 5008.7)|(ne == 5017.5)|(ne == 5023.5)|(ne == 5076.2)] = '5099_below'
#ne[(ne == 5176.3)|(ne == 5191.0)|(ne == 5195.8)|(ne == 5228.1)] = '5100_above'
#number_employed = pd.get_dummies(ne)
#number_employed.drop([number_employed.columns[0]], axis=1, inplace=True)

In [10]:
# Rearrange the column for last contact month of year
month = pd.get_dummies(bank_info.month)
month.drop([month.columns[0]], axis=1, inplace=True)

In [11]:
# Rearrange the column for last contact day of the week
day_of_week = pd.get_dummies(bank_info.day_of_week)
day_of_week.drop([day_of_week.columns[0]], axis=1, inplace=True)

In [12]:
# Rearrange the column for number of days that passed by after the client was last contacted from a previous campaign
p = bank_info.pdays
p[p != 999] = '0-27'
last_contact = pd.get_dummies(p)
last_contact.drop([last_contact.columns[0]], axis=1, inplace=True)

In [13]:
# Rearrange the previous outcome column
prev_outcome = pd.get_dummies(bank_info.poutcome, prefix='prev_outcome')
prev_outcome.drop([prev_outcome.columns[0]], axis=1, inplace=True)

In [14]:
# Rearrange the employment variation rate, consumer price index and consumer confidence index
#emp_var_rate = pd.get_dummies(bank_info['emp.var.rate'], prefix='evr')
#emp_var_rate.drop([emp_var_rate.columns[0]], axis=1, inplace=True)
#cons_price_idx  = pd.get_dummies(bank_info['cons.price.idx'], prefix='cpi')
#cons_price_idx.drop([cons_price_idx.columns[0]], axis=1, inplace=True)
#cons_conf_idx = pd.get_dummies(bank_info['cons.conf.idx'], prefix='cci')
#cons_conf_idx.drop([cons_conf_idx.columns[0]], axis=1, inplace=True)

In [15]:
# Rearrange the two loans columns
hou_loan = pd.get_dummies(bank_info.housing, prefix='hou_loan')
hou_loan.drop([hou_loan.columns[0]], axis=1, inplace=True)
per_loan = pd.get_dummies(bank_info.loan, prefix='per_loan')
per_loan.drop([per_loan.columns[0]], axis=1, inplace=True)

In [16]:
# Rearrange the default column
defa = bank_info.default
defa[defa == 'yes'] = 'unknown'
default = pd.get_dummies(defa, prefix='default')
default.drop([default.columns[0]], axis=1, inplace=True)

In [17]:
# Save the output column
output = pd.get_dummies(bank_info.y).yes
output = output.rename("y")

In [18]:
# The entire rearranged data set
data_df = pd.concat([age, job, marital, education, default, hou_loan, per_loan, contact, month, 
                  duration, call_time, day_of_week, last_contact, prev_contact, prev_outcome, 
                  emp_var_rate, number_employed, cons_price_idx, cons_conf_idx, euribor3m, output], axis=1)
pd.set_option('display.max_columns', None)
data_df.head()
data_df.shape

(41188, 46)

In [19]:
# Save the entire rearranged data set
pd.DataFrame.to_csv(data_df, path_or_buf='../capstone_project_1/rearranged_whole_data_set.csv'
                    , index=False)

In [18]:
# Rearranged data set that excluding cons.conf.idx, tue and thur in day_of week, housing loan, personal loan, dec in month, 
# unemployed, techician and unknown in job and age
day_of_week.drop(['tue','thu'], axis=1, inplace=True)
month.drop('dec', axis=1, inplace=True)
job.drop(['job_unemployed','job_technician','job_unknown'], axis=1, inplace=True)
data_df = pd.concat([job, marital, education, default, contact, month, 
                  duration, call_time, day_of_week, last_contact, prev_contact, prev_outcome, 
                  emp_var_rate, number_employed, cons_price_idx, euribor3m, output], axis=1)
pd.set_option('display.max_columns', None)
data_df.head()
data_df.shape

(41188, 35)

In [18]:
# Rearrangeed data set that excluding age, personal loan, housing loan, marital, job, 
# Jun, Nov and Aug in month and Thursday in day_of_week
day_of_week.drop('thu', axis=1, inplace=True)
month.drop(['jun', 'nov', 'aug'], axis=1, inplace=True)
data_df = pd.concat([education, default, contact, month, duration, call_time, day_of_week, last_contact, 
                    prev_contact, prev_outcome, emp_var_rate, number_employed, cons_price_idx, cons_conf_idx, 
                     euribor3m, output], axis=1)
pd.set_option('display.max_columns', None)
data_df.head()
data_df.shape

(41188, 82)

In [19]:
# Save the optimal data set for future training
pd.DataFrame.to_csv(data_df, path_or_buf='../capstone_project_1/rearranged_optimal_data_set.csv'
                   , index=False)

In [20]:
# Standardize variables
scaler = StandardScaler()
a = data_df.drop([data_df.columns[-1]], axis=1)
scaler.fit(a)
b = pd.DataFrame(scaler.transform(a))
b['y'] = data_df.iloc[:,-1]
b.columns = data_df.columns
b.head()

Unnamed: 0,age,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown,not_married,high_school_above,high_school_below,default_unknown,hou_loan_unknown,hou_loan_yes,per_loan_unknown,per_loan_yes,telephone,aug,dec,jul,jun,mar,may,nov,oct,sep,duration,campaign,mon,thu,tue,wed,0-27,previous,prev_outcome_nonexistent,prev_outcome_success,emp.var.rate,nr.employed,cons.price.idx,cons.conf.idx,euribor3m,y
0,1.533034,-0.538317,-0.19143,6.152772,-0.276435,-0.208757,-0.189032,-0.326556,-0.147327,-0.442449,-0.158872,-0.089871,-0.627943,-0.931813,1.512246,-0.513713,-0.156933,-1.048877,-0.156933,-0.422872,1.31827,-0.420076,-0.066621,-0.459253,-0.385042,-0.115907,1.411155,-0.332532,-0.133197,-0.118462,0.010471,-0.565922,1.959,-0.514581,-0.494394,-0.496067,-0.195415,-0.349494,0.397706,-0.1857,0.648092,0.33168,0.722722,0.886447,0.71246,0
1,1.628993,-0.538317,-0.19143,-0.162528,-0.276435,-0.208757,-0.189032,3.062258,-0.147327,-0.442449,-0.158872,-0.089871,-0.627943,-0.931813,-0.661268,1.946613,-0.156933,-1.048877,-0.156933,-0.422872,1.31827,-0.420076,-0.066621,-0.459253,-0.385042,-0.115907,1.411155,-0.332532,-0.133197,-0.118462,-0.421501,-0.565922,1.959,-0.514581,-0.494394,-0.496067,-0.195415,-0.349494,0.397706,-0.1857,0.648092,0.33168,0.722722,0.886447,0.71246,0
2,-0.290186,-0.538317,-0.19143,-0.162528,-0.276435,-0.208757,-0.189032,3.062258,-0.147327,-0.442449,-0.158872,-0.089871,-0.627943,-0.931813,-0.661268,-0.513713,-0.156933,0.953401,-0.156933,-0.422872,1.31827,-0.420076,-0.066621,-0.459253,-0.385042,-0.115907,1.411155,-0.332532,-0.133197,-0.118462,-0.12452,-0.565922,1.959,-0.514581,-0.494394,-0.496067,-0.195415,-0.349494,0.397706,-0.1857,0.648092,0.33168,0.722722,0.886447,0.71246,0
3,-0.002309,-0.538317,-0.19143,-0.162528,-0.276435,-0.208757,-0.189032,-0.326556,-0.147327,-0.442449,-0.158872,-0.089871,-0.627943,-0.931813,1.512246,-0.513713,-0.156933,-1.048877,-0.156933,-0.422872,1.31827,-0.420076,-0.066621,-0.459253,-0.385042,-0.115907,1.411155,-0.332532,-0.133197,-0.118462,-0.413787,-0.565922,1.959,-0.514581,-0.494394,-0.496067,-0.195415,-0.349494,0.397706,-0.1857,0.648092,0.33168,0.722722,0.886447,0.71246,0
4,1.533034,-0.538317,-0.19143,-0.162528,-0.276435,-0.208757,-0.189032,3.062258,-0.147327,-0.442449,-0.158872,-0.089871,-0.627943,-0.931813,-0.661268,-0.513713,-0.156933,-1.048877,-0.156933,2.364781,1.31827,-0.420076,-0.066621,-0.459253,-0.385042,-0.115907,1.411155,-0.332532,-0.133197,-0.118462,0.187888,-0.565922,1.959,-0.514581,-0.494394,-0.496067,-0.195415,-0.349494,0.397706,-0.1857,0.648092,0.33168,0.722722,0.886447,0.71246,0


In [21]:
# Splitting the data to 70% training, 20% validating and 10% testing
np.random.seed(101)
data = np.random.permutation(b)
train, validate, test = np.split(data, [int(.7*len(data)), int(.9*len(data))])

X_train = train[:, :-1]
Y_train = train[:, -1]
X_test = validate[:, :-1]
Y_test = validate[:, -1]
X_final = test[:, :-1]
Y_final = test[:, -1]

print(X_train.shape)
print(X_test.shape)
print(X_final.shape)
print(Y_train.shape)
print(Y_test.shape)
print(Y_final.shape)

(28831, 45)
(8238, 45)
(4119, 45)
(28831,)
(8238,)
(4119,)


In [40]:
# Train the data using logistic regression, class not weighted
model1 = LogisticRegression()
model1.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [43]:
# Try gridsearch for tuning parameters
parameters = {'C':[0.1, 1, 10, 20], 'tol':[1e-04, 0.001, 0.01]}
clf = GridSearchCV(model1, parameters, cv=5, scoring='accuracy')
clf.fit(X_train, Y_train)
clf.best_estimator_, clf.best_params_, clf.cv_results_['params'], clf.cv_results_['mean_test_score']

(LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),
 {'C': 10, 'tol': 0.0001},
 ({'C': 0.1, 'tol': 0.0001},
  {'C': 0.1, 'tol': 0.001},
  {'C': 0.1, 'tol': 0.01},
  {'C': 1, 'tol': 0.0001},
  {'C': 1, 'tol': 0.001},
  {'C': 1, 'tol': 0.01},
  {'C': 10, 'tol': 0.0001},
  {'C': 10, 'tol': 0.001},
  {'C': 10, 'tol': 0.01},
  {'C': 20, 'tol': 0.0001},
  {'C': 20, 'tol': 0.001},
  {'C': 20, 'tol': 0.01}),
 array([ 0.91203912,  0.91200444,  0.91200444,  0.9118657 ,  0.9118657 ,
         0.91203912,  0.91221255,  0.91214318,  0.91172696,  0.91221255,
         0.91221255,  0.91179633]))

#### From the results, if the model is trained without class weight balanced, the best C value is 10, which differs from the default value, and the best tol value I would decide to be 1e-04, the same as the default, since the scores really do not vary much as tol value changes.

In [44]:
model1 = LogisticRegression(C=10)
model1.fit(X_train, Y_train)

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [47]:
# Presenting sorted features based on their corresponding coefficients
features = pd.DataFrame(a.columns, columns=['features'])[:-1]
coefficients = pd.DataFrame(abs(model1.coef_)).unstack().unstack()
results = pd.concat([features, coefficients], axis=1)
results.columns = ['features', 'coefficients']
results.sort_values(by='coefficients')

Unnamed: 0,features,coefficients
10,job_unemployed,8.5e-05
18,per_loan_unknown,0.003323
16,hou_loan_unknown,0.003323
11,job_unknown,0.004836
0,age,0.006002
3,job_housemaid,0.012412
12,not_married,0.013504
17,hou_loan_yes,0.014623
23,jul,0.017386
22,dec,0.023208


In [48]:
# Train the data using logistic regression, class weighted
model2 = LogisticRegression(class_weight='balanced')
model2.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [49]:
# Try gridsearch for tuning parameters
parameters = {'C':[0.01, 0.1, 1], 'tol':[0.001, 0.01, 0.1]}
clf = GridSearchCV(model2, parameters, cv=5, scoring='accuracy')
clf.fit(X_train, Y_train)
clf.best_estimator_, clf.best_params_, clf.cv_results_['params'], clf.cv_results_['mean_test_score']

(LogisticRegression(C=0.1, class_weight='balanced', dual=False,
           fit_intercept=True, intercept_scaling=1, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           solver='liblinear', tol=0.01, verbose=0, warm_start=False),
 {'C': 0.1, 'tol': 0.01},
 ({'C': 0.01, 'tol': 0.001},
  {'C': 0.01, 'tol': 0.01},
  {'C': 0.01, 'tol': 0.1},
  {'C': 0.1, 'tol': 0.001},
  {'C': 0.1, 'tol': 0.01},
  {'C': 0.1, 'tol': 0.1},
  {'C': 1, 'tol': 0.001},
  {'C': 1, 'tol': 0.01},
  {'C': 1, 'tol': 0.1}),
 array([ 0.85852034,  0.85872845,  0.85886719,  0.86198883,  0.8620582 ,
         0.86042801,  0.86129513,  0.86122576,  0.86060144]))

#### From the results, if the model is trained with class weight balanced, the best C value is 0.1 and the best tol value is 0.01, which both differ from the default values. 

In [52]:
model2 = LogisticRegression(class_weight='balanced', C=0.1, tol=0.01)
model2.fit(X_train, Y_train)

LogisticRegression(C=0.1, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.01, verbose=0, warm_start=False)

## Compare weighted and unweighted model:

In [46]:
# Classification report for including all the data
print(classification_report(y_true=Y_test, y_pred=model1.predict(X_test), target_names=['no', 'yes']))

             precision    recall  f1-score   support

         no       0.93      0.97      0.95      7280
        yes       0.66      0.41      0.50       958

avg / total       0.89      0.91      0.90      8238



In [53]:
# Classification report for including all the data, class weighted
print(classification_report(y_true=Y_test, y_pred=model2.predict(X_test), target_names=['no', 'yes']))

             precision    recall  f1-score   support

         no       0.98      0.86      0.92      7280
        yes       0.45      0.87      0.59       958

avg / total       0.92      0.86      0.88      8238

