# Logistic Regression Lab

### Part 1: Load  data

Import "bank-data.csv"

In [1]:
import pandas as pd
bankData = pd.read_csv('bank-data.csv', sep = ';')
bankData.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


### Part 2: Preprocess data

Preprocess the dataset as you have done before

#### 2.1 Binary encoding

Use LabelEncoder to encode the following columns:
- y
- default
- housing
- loan

In [2]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

#example
bankData['y'] = le.fit_transform(bankData['y'])
bankData.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,0
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,0
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,0
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,0
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,0


In [3]:
#Encode the remaining columns
bankData['housing'] = le.fit_transform(bankData['housing'])
bankData['default'] = le.fit_transform(bankData['default'])
bankData['loan'] = le.fit_transform(bankData['loan'])
bankData.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,0,1787,0,0,cellular,19,oct,79,1,-1,0,unknown,0
1,33,services,married,secondary,0,4789,1,1,cellular,11,may,220,1,339,4,failure,0
2,35,management,single,tertiary,0,1350,1,0,cellular,16,apr,185,1,330,1,failure,0
3,30,management,married,tertiary,0,1476,1,1,unknown,3,jun,199,4,-1,0,unknown,0
4,59,blue-collar,married,secondary,0,0,1,0,unknown,5,may,226,1,-1,0,unknown,0


#### 2.2 Convert categorical variables into dummy columns

(1) Use pd.get_dummies to convert the following categorical variales into dummy columns
- job
- maritial
- education
- contact
- month
- poutcome

(2) Drop columns that have been converted

In [4]:
#example
bankData = pd.concat([bankData,pd.get_dummies(bankData['job'],prefix='job')],axis=1)
bankData = bankData.drop(columns=['job'])
bankData.head()

Unnamed: 0,age,marital,education,default,balance,housing,loan,contact,day,month,...,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown
0,30,married,primary,0,1787,0,0,cellular,19,oct,...,0,0,0,0,0,0,0,0,1,0
1,33,married,secondary,0,4789,1,1,cellular,11,may,...,0,0,0,0,0,1,0,0,0,0
2,35,single,tertiary,0,1350,1,0,cellular,16,apr,...,0,0,1,0,0,0,0,0,0,0
3,30,married,tertiary,0,1476,1,1,unknown,3,jun,...,0,0,1,0,0,0,0,0,0,0
4,59,married,secondary,0,0,1,0,unknown,5,may,...,0,0,0,0,0,0,0,0,0,0


In [5]:
bankData = pd.concat([bankData,pd.get_dummies(bankData['marital'],prefix='marital')],axis=1)
bankData = pd.concat([bankData,pd.get_dummies(bankData['education'],prefix='education')],axis=1)
bankData = pd.concat([bankData,pd.get_dummies(bankData['contact'],prefix='contact')],axis=1)
bankData = pd.concat([bankData,pd.get_dummies(bankData['month'],prefix='month')],axis=1)
bankData = pd.concat([bankData,pd.get_dummies(bankData['poutcome'],prefix='poutcome')],axis=1)

bankData = bankData.drop(columns=['marital', 'education', 'contact', 'month', 'poutcome'])

In [6]:
bankData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 49 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   age                  4521 non-null   int64
 1   default              4521 non-null   int32
 2   balance              4521 non-null   int64
 3   housing              4521 non-null   int32
 4   loan                 4521 non-null   int32
 5   day                  4521 non-null   int64
 6   duration             4521 non-null   int64
 7   campaign             4521 non-null   int64
 8   pdays                4521 non-null   int64
 9   previous             4521 non-null   int64
 10  y                    4521 non-null   int32
 11  job_admin.           4521 non-null   uint8
 12  job_blue-collar      4521 non-null   uint8
 13  job_entrepreneur     4521 non-null   uint8
 14  job_housemaid        4521 non-null   uint8
 15  job_management       4521 non-null   uint8
 16  job_retired          452

#### 2.3 Train/Test separation

Perform hold-out method
- 60% training set
- 40% testing set

In [7]:
bankData_train = bankData.sample(frac = 0.6)
bankData_test = bankData.drop(bankData_train.index)
print(pd.crosstab(bankData_train['y'],columns = 'count'))
print(pd.crosstab(bankData_test['y'],columns = 'count'))

col_0  count
y           
0       2394
1        319
col_0  count
y           
0       1606
1        202


##### X/y separation

In [8]:
bankData_train_y = bankData_train['y']
bankData_train_X = bankData_train.copy()
del bankData_train_X['y']

bankData_test_y = bankData_test['y']
bankData_test_X = bankData_test.copy()
del bankData_test_X['y']

#### 2.4 Feature Scaling

It is always a good practice to scale the features so that all of them can be uniformly evaluated

In [9]:
from sklearn import preprocessing

standard_scaler = preprocessing.StandardScaler()
train_X_scaled_s = pd.DataFrame(standard_scaler.fit_transform(bankData_train_X), columns=bankData_train_X.columns)
test_X_scaled_s = pd.DataFrame(standard_scaler.fit_transform(bankData_test_X), columns=bankData_train_X.columns)

min_max_scaler = preprocessing.MinMaxScaler()
train_X_scaled_m = pd.DataFrame(min_max_scaler.fit_transform(bankData_train_X),columns=bankData_train_X.columns)
test_X_scaled_m = pd.DataFrame(min_max_scaler.fit_transform(bankData_test_X),columns=bankData_train_X.columns)

In [10]:
train_X_scaled_s.head()

Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,pdays,previous,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,-0.106012,-0.142507,-0.399954,0.872748,-0.421934,0.254471,-0.838051,0.395277,-0.413757,-0.332742,...,-0.35392,-0.092467,-0.675465,-0.299263,-0.135622,-0.109251,-0.3526,-0.216993,-0.178749,0.478822
1,0.085664,-0.142507,-0.50403,-1.145807,-0.421934,1.595723,-0.678485,0.395277,-0.413757,-0.332742,...,-0.35392,-0.092467,-0.675465,-0.299263,-0.135622,-0.109251,-0.3526,-0.216993,-0.178749,0.478822
2,-0.010174,-0.142507,-0.445623,0.872748,-0.421934,0.254471,-0.097212,0.069224,0.843457,2.010487,...,-0.35392,-0.092467,-0.675465,3.341544,-0.135622,-0.109251,-0.3526,4.608438,-0.178749,-2.088459
3,0.852368,-0.142507,-0.423255,0.872748,-0.421934,0.742199,-0.302367,0.395277,-0.413757,-0.332742,...,-0.35392,-0.092467,-0.675465,-0.299263,-0.135622,-0.109251,-0.3526,-0.216993,-0.178749,0.478822
4,0.085664,-0.142507,-0.575484,0.872748,-0.421934,-0.599054,-0.697481,-0.58288,2.645133,2.010487,...,-0.35392,-0.092467,1.480461,-0.299263,-0.135622,-0.109251,2.836077,-0.216993,-0.178749,-2.088459


In [11]:
train_X_scaled_m.head()

Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,pdays,previous,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,0.308824,0.0,0.046738,1.0,0.0,0.566667,0.013748,0.061224,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.338235,0.0,0.042241,0.0,0.0,0.933333,0.028944,0.061224,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.323529,0.0,0.044764,1.0,0.0,0.566667,0.084298,0.040816,0.145642,0.16,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.455882,0.0,0.045731,1.0,0.0,0.7,0.064761,0.061224,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.338235,0.0,0.039154,1.0,0.0,0.333333,0.027135,0.0,0.354358,0.16,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


### Part 3: Train a logistic regression model & Part 4: Model Evaluation

Evaluation metrics
- confusion metrix
- accuracy
- precision, recall, f1-score

In [12]:
from sklearn import linear_model

lr = linear_model.LogisticRegression()
lr.fit(bankData_train_X, bankData_train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [13]:
print(lr.coef_[0])

[-2.15074863e-02 -3.17471557e-02  9.34283134e-06 -9.97718977e-01
 -3.19148662e-01 -2.06358703e-02  4.26185784e-03 -6.89316792e-02
 -1.16943360e-04  1.19854222e-01 -1.90842485e-02 -3.68725701e-01
 -6.15328602e-02 -1.02409472e-02  6.39198120e-03  2.85272235e-01
 -4.34273515e-02 -1.47994514e-01  1.65606799e-02 -1.07839890e-01
 -3.62194628e-02  1.86312608e-02  1.02844862e-01 -3.55939834e-01
 -2.15113846e-01 -5.38283260e-02 -4.46088320e-01  4.78401353e-02
 -1.61323073e-02  1.09832206e-01  1.02798681e-01 -6.80839705e-01
  4.91484138e-02  1.67680389e-01  4.03716233e-02  9.94597515e-03
  1.63190299e-02 -1.17978741e-01 -1.58889400e-01  8.08010705e-02
 -7.82616900e-01 -5.25582035e-02  1.95448767e-01  8.41191570e-02
 -1.96102912e-01 -2.55625364e-02  4.13720405e-01 -6.60263774e-01]


In [14]:
import numpy as np

feature_importance = abs(lr.coef_[0])
feature_importance = 100.0 * (feature_importance / feature_importance.max())


lr_feature = pd.DataFrame({'feature':bankData_train_X.columns,
                             'Score':feature_importance})

lr_feature.sort_values(by = 'Score', ascending=False).head()

Unnamed: 0,feature,Score
3,housing,100.0
40,month_may,78.440615
31,contact_unknown,68.239627
47,poutcome_unknown,66.177329
26,education_secondary,44.710818


In [15]:
res = lr.predict(bankData_test_X)
pd.crosstab(bankData_test_y, res)

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

print("Accuracy:\t %.3f" %accuracy_score(bankData_test_y, res))
print(classification_report(bankData_test_y, res))

Accuracy:	 0.889
              precision    recall  f1-score   support

           0       0.91      0.97      0.94      1606
           1       0.51      0.21      0.30       202

    accuracy                           0.89      1808
   macro avg       0.71      0.59      0.62      1808
weighted avg       0.86      0.89      0.87      1808



### StandardScaler

In [16]:
lr_s = linear_model.LogisticRegression()
lr_s.fit(train_X_scaled_s, bankData_train_y)

LogisticRegression()

In [17]:
print(lr_s.coef_[0])

[-0.02874914  0.00136271  0.00374582 -0.18978783 -0.27769482  0.00623069
  1.18102055 -0.1249968  -0.01083657  0.00960449  0.04859806 -0.02849671
 -0.03592712 -0.05869838 -0.01456731  0.1798708  -0.03501729 -0.04333061
  0.0861982  -0.00579428 -0.06474118  0.0247656   0.13693885 -0.09613278
  0.00484958 -0.09143368  0.00249982  0.11683766 -0.11464035  0.21501543
  0.10913755 -0.28913618  0.09203312  0.05294783  0.08213581  0.14490329
 -0.04274653 -0.11065549  0.10600888  0.1086085  -0.23880121 -0.04815392
  0.25091375  0.15209344 -0.07475189  0.06264603  0.33845445 -0.1236528 ]


In [18]:
feature_importance_s = abs(lr_s.coef_[0])
feature_importance_s = 100.0 * (feature_importance_s / feature_importance_s.max())


lr_feature_s = pd.DataFrame({'feature':train_X_scaled_s.columns,
                             'Score':feature_importance_s})

lr_feature_s.sort_values(by = 'Score', ascending=False).head()

Unnamed: 0,feature,Score
6,duration,100.0
46,poutcome_success,28.657795
31,contact_unknown,24.481892
4,loan,23.513124
42,month_oct,21.245502


In [19]:
res_s = lr_s.predict(test_X_scaled_s)
pd.crosstab(bankData_test_y, res_s)

print("Accuracy:\t %.3f" %accuracy_score(bankData_test_y, res_s))
print(classification_report(bankData_test_y, res_s))

Accuracy:	 0.902
              precision    recall  f1-score   support

           0       0.92      0.97      0.95      1606
           1       0.61      0.33      0.43       202

    accuracy                           0.90      1808
   macro avg       0.76      0.65      0.69      1808
weighted avg       0.89      0.90      0.89      1808



### MinMaxScaler

In [20]:
lr_m = linear_model.LogisticRegression()
lr_m.fit(train_X_scaled_m, bankData_train_y)

LogisticRegression()

In [21]:
print(lr_m.coef_[0])

[ 2.69773465e-03 -4.51244164e-02  5.54477988e-04 -3.53832036e-01
 -6.34436123e-01 -5.32662478e-02  8.50295639e+00 -4.82326299e-01
  3.78155960e-02  2.22305747e-01  6.10805207e-02 -1.00734997e-01
 -1.51473951e-01 -1.90067744e-01 -6.91493653e-02  6.72096726e-01
 -1.83393712e-01 -1.64711786e-01  4.71822272e-01 -1.14596519e-01
 -3.57628990e-01  1.26211883e-01  2.97518038e-01 -2.69274976e-01
 -2.87887251e-02 -1.07774090e-01  1.23350874e-01  3.01055664e-01
 -3.17178111e-01  3.39224702e-01  2.96097766e-01 -6.35868131e-01
 -4.63502233e-02 -3.25493000e-01  5.41958406e-01  8.64021688e-02
 -5.63978656e-01 -6.53490521e-01 -1.12922109e-01  5.04714279e-01
 -8.20061045e-01 -4.97330276e-01  1.16855773e+00  7.17447582e-01
 -6.20446750e-01 -1.04705025e-01  1.41317284e+00 -6.88566729e-01]


In [22]:
feature_importance_m = abs(lr_m.coef_[0])
feature_importance_m = 100.0 * (feature_importance_m / feature_importance_m.max())


lr_feature_m = pd.DataFrame({'feature':train_X_scaled_m.columns,
                             'Score':feature_importance_m})

lr_feature_m.sort_values(by = 'Score', ascending=False).head()

Unnamed: 0,feature,Score
6,duration,100.0
46,poutcome_success,16.619782
42,month_oct,13.742958
40,month_may,9.644423
43,month_sep,8.437625


In [23]:
res_m = lr_m.predict(test_X_scaled_m)
pd.crosstab(bankData_test_y, res_m)

print("Accuracy:\t %.3f" %accuracy_score(bankData_test_y, res_m))
print(classification_report(bankData_test_y, res_m))

Accuracy:	 0.900
              precision    recall  f1-score   support

           0       0.91      0.99      0.95      1606
           1       0.66      0.21      0.32       202

    accuracy                           0.90      1808
   macro avg       0.79      0.60      0.63      1808
weighted avg       0.88      0.90      0.88      1808



### Part 5: Model tuning

#### Note:

After building the classifier, try answering the following questions.

1. What is the Accuracy Score?
2. If you change your preprosessing method, can you improve the model?
3. If you change your parameters setting, can you improve the model?

You can look at the parameters and functions of Logistic Regression at http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html