# Logistic Regression Lab

### Part 1: Load  data

Load "bank-data.csv" into a pandas DataFrame (the file is semicolon-delimited).

In [8]:
import pandas as pd

# The fields in this file are separated by ';' rather than the default ','.
bankData = pd.read_csv('bank-data.csv', delimiter=';')
bankData.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


### Part 2: Preprocess data

#### 2.1 Binary encoding

Use LabelEncoder to encode the following columns:
- y
- default
- housing
- loan

In [9]:
# Encode the target column 'y' as integer codes (0 and 1)
from sklearn.preprocessing import LabelEncoder

# A single encoder instance; it is re-fitted on each column it is applied to.
le = LabelEncoder()

# example
encoded_target = le.fit_transform(bankData['y'])
bankData['y'] = encoded_target
bankData.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,0
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,0
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,0
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,0
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,0


In [10]:
# Encode the remaining binary columns
# 'default', 'housing' and 'loan' are replaced by integer codes (0 and 1);
# the shared encoder `le` is re-fitted on each column in turn.
for binary_column in ('default', 'housing', 'loan'):
    bankData[binary_column] = le.fit_transform(bankData[binary_column])
bankData.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,0,1787,0,0,cellular,19,oct,79,1,-1,0,unknown,0
1,33,services,married,secondary,0,4789,1,1,cellular,11,may,220,1,339,4,failure,0
2,35,management,single,tertiary,0,1350,1,0,cellular,16,apr,185,1,330,1,failure,0
3,30,management,married,tertiary,0,1476,1,1,unknown,3,jun,199,4,-1,0,unknown,0
4,59,blue-collar,married,secondary,0,0,1,0,unknown,5,may,226,1,-1,0,unknown,0


#### 2.2 Convert categorical variables into dummy columns

(1) Use pd.get_dummies to convert the following categorical variables into dummy columns
- job
- marital
- education
- contact
- month
- poutcome

(2) Drop columns that have been converted

In [11]:
# example
# Build one indicator column per job category (prefixed 'job_'), append them
# to the frame, then remove the original text column.
job_dummies = pd.get_dummies(bankData['job'], prefix='job')
bankData = pd.concat([bankData, job_dummies], axis=1).drop(columns=['job'])
bankData.head()

Unnamed: 0,age,marital,education,default,balance,housing,loan,contact,day,month,...,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown
0,30,married,primary,0,1787,0,0,cellular,19,oct,...,0,0,0,0,0,0,0,0,1,0
1,33,married,secondary,0,4789,1,1,cellular,11,may,...,0,0,0,0,0,1,0,0,0,0
2,35,single,tertiary,0,1350,1,0,cellular,16,apr,...,0,0,1,0,0,0,0,0,0,0
3,30,married,tertiary,0,1476,1,1,unknown,3,jun,...,0,0,1,0,0,0,0,0,0,0
4,59,married,secondary,0,0,1,0,unknown,5,may,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Convert and drop the remaining categorical columns.
# Each column is handled in the listed order so the indicator columns are
# appended to the frame in that same order.
remaining_categoricals = ['marital', 'education', 'contact', 'month', 'poutcome']
for column_name in remaining_categoricals:
    indicator_columns = pd.get_dummies(bankData[column_name], prefix=column_name)
    bankData = pd.concat(
        [bankData.drop(columns=[column_name]), indicator_columns],
        axis=1,
    )
bankData.head()

Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,pdays,previous,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,30,0,1787,0,0,19,79,1,-1,0,...,0,0,0,0,1,0,0,0,0,1
1,33,0,4789,1,1,11,220,1,339,4,...,0,0,1,0,0,0,1,0,0,0
2,35,0,1350,1,0,16,185,1,330,1,...,0,0,0,0,0,0,1,0,0,0
3,30,0,1476,1,1,3,199,4,-1,0,...,1,0,0,0,0,0,0,0,0,1
4,59,0,0,1,0,5,226,1,-1,0,...,0,0,1,0,0,0,0,0,0,1


In [13]:
# Summarise every column of the encoded frame: name, non-null count and dtype.
bankData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 49 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   age                  4521 non-null   int64
 1   default              4521 non-null   int32
 2   balance              4521 non-null   int64
 3   housing              4521 non-null   int32
 4   loan                 4521 non-null   int32
 5   day                  4521 non-null   int64
 6   duration             4521 non-null   int64
 7   campaign             4521 non-null   int64
 8   pdays                4521 non-null   int64
 9   previous             4521 non-null   int64
 10  y                    4521 non-null   int32
 11  job_admin.           4521 non-null   uint8
 12  job_blue-collar      4521 non-null   uint8
 13  job_entrepreneur     4521 non-null   uint8
 14  job_housemaid        4521 non-null   uint8
 15  job_management       4521 non-null   uint8
 16  job_retired          452

#### 2.3 Train/Test separation

Split the data using the hold-out method:
- 60% training set
- 40% testing set

In [14]:
# Hold-out split: 60% of the rows for training, the remaining 40% for testing.
# A fixed random_state makes the split (and every metric computed from it)
# reproducible across kernel restarts; without it each run samples new rows.
bankData_train = bankData.sample(frac=0.6, random_state=42)
bankData_test = bankData.drop(bankData_train.index)

# Sanity check: the two partitions together must account for every row.
assert len(bankData_train) + len(bankData_test) == len(bankData)

# Class counts of the target in each partition.
print(pd.crosstab(bankData_train['y'], columns='count'))
print(pd.crosstab(bankData_test['y'], columns='count'))

col_0  count
y           
0       2388
1        325
col_0  count
y           
0       1612
1        196


##### X/y separation

In [15]:
# Split each partition into its label vector (y) and its feature matrix (X).
# drop() returns a new frame, so the original partitions keep their 'y' column.
bankData_train_y = bankData_train['y']
bankData_train_X = bankData_train.drop(columns=['y'])

bankData_test_y = bankData_test['y']
bankData_test_X = bankData_test.drop(columns=['y'])

### Part 3: Train a logistic regression model

In [18]:
from sklearn import linear_model

# Fit a logistic regression on the training features and labels.
# max_iter=10000 is a generous iteration cap (presumably chosen to avoid
# non-convergence on the unscaled features).
lr = linear_model.LogisticRegression(max_iter=10000).fit(
    bankData_train_X, bankData_train_y
)
# Display the fitted estimator as the cell output.
lr

LogisticRegression(max_iter=10000)

### Part 4: Model Evaluation

Evaluation metrics
- confusion matrix
- accuracy
- precision, recall, f1-score

In [19]:
# Confusion matrix: actual test labels as rows, predicted labels as columns
res = lr.predict(bankData_test_X)
pd.crosstab(index=bankData_test_y, columns=res)

col_0,0,1
y,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1582,30
1,137,59


In [20]:
from sklearn.metrics import accuracy_score, classification_report

# Overall accuracy on the test set, followed by the per-class
# precision / recall / f1-score report.
test_accuracy = accuracy_score(bankData_test_y, res)
print("Accuracy:\t %.3f" % test_accuracy)
print(classification_report(bankData_test_y, res))

Accuracy:	 0.908
              precision    recall  f1-score   support

           0       0.92      0.98      0.95      1612
           1       0.66      0.30      0.41       196

    accuracy                           0.91      1808
   macro avg       0.79      0.64      0.68      1808
weighted avg       0.89      0.91      0.89      1808



### Part 5: Model tuning

#### Note:

After building the classifier, try answering the following questions.

1. What is the Accuracy Score?
2. If you change your preprocessing method, can you improve the model?
3. If you change your parameter settings, can you improve the model?

You can look at the parameters and functions of Logistic Regression at http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html