# Logistic Regression Lab

### Part 1: Load data

Import "bank-data.csv"

In [None]:
import pandas as pd

# The data file uses ';' as its field delimiter.
bankData = pd.read_csv('bank-data.csv', delimiter=';')

# Preview the first rows to confirm the file was parsed into columns.
bankData.head()

### Part 2: Preprocess data

Preprocess the dataset as you have done before

#### 2.1 Binary encoding

Use LabelEncoder to encode the following columns:
- y
- default
- housing
- loan

In [None]:
from sklearn.preprocessing import LabelEncoder

# A single encoder instance is created here; every later fit replaces
# whatever classes it learned before.
le = LabelEncoder()

# Example: learn the distinct values of 'y', then overwrite the column
# with their integer codes.
le.fit(bankData['y'])
bankData['y'] = le.transform(bankData['y'])
bankData.head()

In [None]:
# Encode the remaining columns, refitting the shared encoder for each one
# and overwriting the column with its integer codes.
for binary_col in ['housing', 'default', 'loan']:
    bankData[binary_col] = le.fit_transform(bankData[binary_col])

bankData.head()

#### 2.2 Convert categorical variables into dummy columns

(1) Use pd.get_dummies to convert the following categorical variables into dummy columns
- job
- marital
- education
- contact
- month
- poutcome

(2) Drop columns that have been converted

In [None]:
# Example: build indicator columns for 'job' (prefixed 'job_'), append them
# to the frame, then remove the original categorical column.
job_dummies = pd.get_dummies(bankData['job'], prefix='job')
bankData = pd.concat([bankData, job_dummies], axis=1).drop(columns=['job'])
bankData.head()

In [None]:
# Categorical columns still to be converted, in the order their indicator
# columns are appended to the frame.
categorical_cols = ['marital', 'education', 'contact', 'month', 'poutcome']

# One indicator frame per column, each prefixed with the source column name.
dummy_frames = [pd.get_dummies(bankData[col], prefix=col) for col in categorical_cols]

# Append every indicator frame on the right, then drop the source columns.
bankData = pd.concat([bankData] + dummy_frames, axis=1)
bankData = bankData.drop(columns=categorical_cols)

In [None]:
# Print a summary of the frame (column names, dtypes, non-null counts) to
# check the result of the encoding steps.
bankData.info()

#### 2.3 Train/Test separation

Perform hold-out method
- 60% training set
- 40% testing set

In [None]:
# Hold-out split: a random 60% of the rows form the training set and the
# rows that were not sampled form the test set.
# The fixed random_state makes the split, and therefore every metric
# computed below, reproducible between runs so that models and
# preprocessing choices can be compared on the same data.
bankData_train = bankData.sample(frac=0.6, random_state=42)
bankData_test = bankData.drop(bankData_train.index)

# Count of each target value in the two partitions.
print(pd.crosstab(bankData_train['y'], columns='count'))
print(pd.crosstab(bankData_test['y'], columns='count'))

##### X/y separation

In [None]:
# Training partition: the target column, and a new frame holding every
# other column.
bankData_train_y = bankData_train['y']
bankData_train_X = bankData_train.drop(columns=['y'])

# Test partition: same separation.
bankData_test_y = bankData_test['y']
bankData_test_X = bankData_test.drop(columns=['y'])

#### 2.4 Feature Scaling

It is always good practice to scale the features so that all of them can be evaluated uniformly.

In [None]:
from sklearn import preprocessing

# Each scaler is fitted on the training features only; the test features are
# then transformed with those same fitted statistics. Refitting on the test
# set would scale it differently from the data the model was trained on and
# would leak test-set statistics into the evaluation.

# Standardisation (zero mean, unit variance per training column).
standard_scaler = preprocessing.StandardScaler()
train_X_scaled_s = pd.DataFrame(standard_scaler.fit_transform(bankData_train_X), columns=bankData_train_X.columns)
test_X_scaled_s = pd.DataFrame(standard_scaler.transform(bankData_test_X), columns=bankData_train_X.columns)

# Min-max scaling (training column range mapped to [0, 1]); test values
# outside the training range can fall outside [0, 1].
min_max_scaler = preprocessing.MinMaxScaler()
train_X_scaled_m = pd.DataFrame(min_max_scaler.fit_transform(bankData_train_X), columns=bankData_train_X.columns)
test_X_scaled_m = pd.DataFrame(min_max_scaler.transform(bankData_test_X), columns=bankData_train_X.columns)

In [None]:
# Preview the first rows of the standard-scaled training features.
train_X_scaled_s.head()

In [None]:
# Preview the first rows of the min-max-scaled training features.
train_X_scaled_m.head()

### Part 3: Train a logistic regression model & Part 4: Model Evaluation

Evaluation metrics
- confusion matrix
- accuracy
- precision, recall, f1-score

In [None]:
from sklearn import linear_model

# Baseline model: default-parameter logistic regression trained on the
# unscaled training features. fit() returns the estimator itself.
lr = linear_model.LogisticRegression().fit(bankData_train_X, bankData_train_y)
lr

In [None]:
# Print the first row of the fitted coefficient array of the baseline model.
print(lr.coef_[0])

In [None]:
import numpy as np

# Absolute coefficient size, rescaled so the largest coefficient scores 100.
coef_magnitude = np.abs(lr.coef_[0])
feature_importance = 100.0 * (coef_magnitude / coef_magnitude.max())

# Pair every training column name with its score.
lr_feature = pd.DataFrame({'feature': bankData_train_X.columns, 'Score': feature_importance})

# Show the highest-scoring rows.
lr_feature.sort_values(by='Score', ascending=False).head()

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Predict the target for the unscaled test features with the baseline model.
res = lr.predict(bankData_test_X)

# Confusion matrix. It has to be printed explicitly: a notebook cell only
# auto-displays its last expression, so a bare crosstab call in the middle
# of the cell would be computed and then discarded.
print(pd.crosstab(bankData_test_y, res, rownames=['actual'], colnames=['predicted']))

# Overall accuracy followed by per-class precision, recall and f1-score.
print("Accuracy:\t %.3f" %accuracy_score(bankData_test_y, res))
print(classification_report(bankData_test_y, res))

### StandardScaler

In [None]:
# Same default-parameter model, trained on the standard-scaled features.
# fit() returns the estimator itself.
lr_s = linear_model.LogisticRegression().fit(train_X_scaled_s, bankData_train_y)
lr_s

In [None]:
# Print the first row of the fitted coefficient array of the
# standard-scaled model.
print(lr_s.coef_[0])

In [None]:
# Absolute coefficient size of the standard-scaled model, expressed as a
# percentage of the largest coefficient.
abs_coef_s = abs(lr_s.coef_[0])
feature_importance_s = 100.0 * (abs_coef_s / abs_coef_s.max())

# Pair every scaled column name with its score and show the top rows.
lr_feature_s = pd.DataFrame({'feature': train_X_scaled_s.columns, 'Score': feature_importance_s})
lr_feature_s.sort_values(by='Score', ascending=False).head()

In [None]:
# Predict the target for the standard-scaled test features.
res_s = lr_s.predict(test_X_scaled_s)

# Confusion matrix. It has to be printed explicitly: a notebook cell only
# auto-displays its last expression, so a bare crosstab call in the middle
# of the cell would be computed and then discarded.
print(pd.crosstab(bankData_test_y, res_s, rownames=['actual'], colnames=['predicted']))

# Overall accuracy followed by per-class precision, recall and f1-score.
print("Accuracy:\t %.3f" %accuracy_score(bankData_test_y, res_s))
print(classification_report(bankData_test_y, res_s))

### MinMaxScaler

In [None]:
# Same default-parameter model, trained on the min-max-scaled features.
# fit() returns the estimator itself.
lr_m = linear_model.LogisticRegression().fit(train_X_scaled_m, bankData_train_y)
lr_m

In [None]:
# Print the first row of the fitted coefficient array of the
# min-max-scaled model.
print(lr_m.coef_[0])

In [None]:
# Absolute coefficient size of the min-max-scaled model, expressed as a
# percentage of the largest coefficient.
abs_coef_m = abs(lr_m.coef_[0])
feature_importance_m = 100.0 * (abs_coef_m / abs_coef_m.max())

# Pair every scaled column name with its score and show the top rows.
lr_feature_m = pd.DataFrame({'feature': train_X_scaled_m.columns, 'Score': feature_importance_m})
lr_feature_m.sort_values(by='Score', ascending=False).head()

In [None]:
# Predict the target for the min-max-scaled test features.
res_m = lr_m.predict(test_X_scaled_m)

# Confusion matrix. It has to be printed explicitly: a notebook cell only
# auto-displays its last expression, so a bare crosstab call in the middle
# of the cell would be computed and then discarded.
print(pd.crosstab(bankData_test_y, res_m, rownames=['actual'], colnames=['predicted']))

# Overall accuracy followed by per-class precision, recall and f1-score.
print("Accuracy:\t %.3f" %accuracy_score(bankData_test_y, res_m))
print(classification_report(bankData_test_y, res_m))

### Part 5: Model tuning

#### Note:

After building the classifier, try answering the following questions.

1. What is the Accuracy Score?
2. If you change your preprocessing method, can you improve the model?
3. If you change your parameters setting, can you improve the model?

You can look at the parameters and functions of Logistic Regression at http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html