# 4.1 Imports

In [None]:
# import libraries
import numpy as np
import pandas as pd
import seaborn as sns

from numpy import mean, std
from collections import Counter
import time
import pickle

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import SparsePCA
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, recall_score, roc_auc_score
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV

from imblearn.over_sampling import ADASYN
from imblearn.pipeline import make_pipeline

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# read the training CSV into a DataFrame
train = pd.read_csv(filepath_or_buffer='data/train_prep_1.csv')

In [None]:
# read the test CSV into a DataFrame
test = pd.read_csv(filepath_or_buffer='data/test_prep_1.csv')

In [None]:
# display the (rows, columns) dimensions of the training frame
train.shape

In [None]:
# display the (rows, columns) dimensions of the test frame
test.shape

In [None]:
# training features: every column of `train` except the 'target' label
X_train = train.drop('target', axis=1)
X_train.shape

In [None]:
# test features: every column of `test` except the 'target' label
X_test = test.drop('target', axis=1)
X_test.shape

In [None]:
# training labels: the 'target' column of `train`
y_train = train.loc[:, 'target']
y_train.shape

In [None]:
# test labels: the 'target' column of `test`
y_test = test.loc[:, 'target']
y_test.shape

In [None]:
# temp, for deployment
# displays the column labels of the training frame (the features plus 'target')

train.columns

In [None]:
# parallel accumulators: each model section appends one entry to all three lists
model_list, training_time_list, recall_score_list = [], [], []

# 4.2 Logistic Regression

In [None]:
# transformer applied to the numerical columns
num_transformer = StandardScaler()

# columns routed to the numerical transformer
num_features = ['age', 'education', 'hours_per_week']

In [None]:
# transformer applied to the categorical columns; handle_unknown='ignore' keeps
# categories that were not seen during fit from raising at transform time
cat_transformer = OneHotEncoder(handle_unknown='ignore')

# columns routed to the categorical transformer
cat_features = [
    'workclass', 'marital_status', 'occupation', 'relationship',
    'race', 'sex', 'capital_change', 'native_country',
]

In [None]:
# preprocessing: scale the numerical columns and one-hot encode the categorical ones
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features),
])

In [None]:
# full pipeline: shared preprocessing followed by a logistic regression classifier
logistic_regression = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(n_jobs=-1, random_state=20210510)),
])

In [None]:
# model fitting: time one fit of the pipeline on the training split

start_time = time.time()

logistic_regression.fit(X_train, y_train)

end_time = time.time()

# wall-clock seconds spent in .fit(), printed with three decimals
training_time = end_time - start_time
print(f'model training time: {training_time:.3f}s')

In [None]:
# model evaluation

def print_score(model):
    """Print the model's score on the test split.

    Calls ``model.score`` with the notebook-level ``X_test`` and ``y_test`` and
    prints the result with three decimals, followed by blank lines.
    """
    test_score = model.score(X_test, y_test)
    print('model score: %.3f' % test_score)
    print('\n')

In [None]:
# classification reports

def print_classfication_reports(model):
    """Print a classification report for the test split, then for the train split.

    For each split the model predicts on the notebook-level features
    (``X_test`` / ``X_train``) and the predictions are compared with the
    matching labels (``y_test`` / ``y_train``) via ``classification_report``.
    """
    splits = (
        ('Classification report on test data:', X_test, y_test),
        ('Classification report on train data:', X_train, y_train),
    )
    for heading, features, labels in splits:
        predicted = model.predict(features)
        print(heading)
        print(classification_report(labels, predicted))

In [None]:
# model evaluation: prints the pipeline's .score() on the test split
print_score(logistic_regression)

# classification reports: printed for the test split first, then the train split
print_classfication_reports(logistic_regression)

In [None]:
# grid search 1: tune penalty type and regularisation strength C, scored on recall
#
# The classifier in the pipeline keeps its default solver. scikit-learn's default
# 'lbfgs' solver does not implement the l1 penalty, so l1 candidates are paired
# with 'liblinear' (which does) instead of failing and scoring NaN. The l2
# candidates still run on the pipeline's own solver.

param_grid = [
    {
        'classifier__penalty': ['l2'],
        'classifier__C': [0.1, 1.0, 10, 100],
    },
    {
        'classifier__penalty': ['l1'],
        'classifier__solver': ['liblinear'],
        'classifier__C': [0.1, 1.0, 10, 100],
    },
]

grid_search = GridSearchCV(logistic_regression, param_grid, scoring='recall', verbose=1, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# best_params_ includes 'classifier__solver' when an l1 candidate wins
print("Best params:")
print(grid_search.best_params_)

In [None]:
# grid search 2: l2 penalty only, probing larger values of C, scored on recall

param_grid = dict(
    classifier__penalty=['l2'],
    classifier__C=[100, 200, 300, 400, 500],
)

grid_search = GridSearchCV(
    estimator=logistic_regression,
    param_grid=param_grid,
    scoring='recall',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

print("Best params:", grid_search.best_params_, sep='\n')

In [None]:
# whole model pipeline with best params
# NOTE(review): C=300 is hard-coded — presumably the winner of grid search 2;
# re-check it whenever the search is re-run

logistic_regression = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(C=300, random_state=20210510, n_jobs=-1))
])

# model fitting: time one fit of the pipeline on the training split

start_time = time.time()

logistic_regression.fit(X_train, y_train)

end_time = time.time()

# wall-clock seconds spent in .fit(), printed with three decimals
training_time = end_time - start_time
print(f'model training time: {training_time:.3f}s')

# model evaluation
print_score(logistic_regression)

# classification reports
print_classfication_reports(logistic_regression)

In [None]:
# best of logistic regression
# NOTE(review): the time and recall below are hard-coded rather than read from the
# fitted model — presumably copied from an earlier run; confirm after re-running

model = 'logistic regression'
training_time = 1.25
# stored as `recall` so the `recall_score` function imported from sklearn.metrics
# is not replaced by a float
recall = 0.57

model_list.append(model)
training_time_list.append(training_time)
recall_score_list.append(recall)

# 4.3 Logistic Regression with ADASYN

In [None]:
# preprocessing stays the same as in section 4.2
# only the model pipeline changes

In [None]:
# oversampler
# NOTE(review): newer imbalanced-learn releases deprecate `n_jobs` on ADASYN —
# confirm against the installed version
adasyn = ADASYN(n_jobs=-1, random_state=20210517)

In [None]:
# ADASYN is a resampler without .fit_transform, so sklearn's Pipeline cannot hold
# it as a step; the make_pipeline imported from imblearn is used instead

logistic_adasyn = make_pipeline(
    preprocessor,
    adasyn,
    LogisticRegression(n_jobs=-1, random_state=20210510),
)

In [None]:
# model fitting: time one fit of the pipeline on the training split

start_time = time.time()

logistic_adasyn.fit(X_train, y_train)

end_time = time.time()

# wall-clock seconds spent in .fit(), printed with three decimals
training_time = end_time - start_time
print(f'model training time: {training_time:.3f}s')

In [None]:
# model evaluation: prints the pipeline's .score() on the test split
print_score(logistic_adasyn)

# classification reports: printed for the test split first, then the train split
print_classfication_reports(logistic_adasyn)

In [None]:
# best of logistic regression with ADASYN
# NOTE(review): the time and recall below are hard-coded rather than read from the
# fitted model — presumably copied from an earlier run; confirm after re-running

model = 'logistic with ADASYN'
training_time = 31.71
# stored as `recall` so the `recall_score` function imported from sklearn.metrics
# is not replaced by a float
recall = 0.87

model_list.append(model)
training_time_list.append(training_time)
recall_score_list.append(recall)

# 4.4 Linear Support Vector Classification (SVC)

In [None]:
# preprocessing stays the same as in section 4.2
# only the model pipeline changes

In [None]:
# full pipeline: shared preprocessing followed by a linear support vector classifier
linear_svc = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LinearSVC(random_state=20210518)),
])

In [None]:
# model fitting: time one fit of the pipeline on the training split

start_time = time.time()

linear_svc.fit(X_train, y_train)

end_time = time.time()

# wall-clock seconds spent in .fit(), printed with three decimals
training_time = end_time - start_time
print(f'model training time: {training_time:.3f}s')

In [None]:
# model evaluation: prints the pipeline's .score() on the test split
print_score(linear_svc)

# classification reports: printed for the test split first, then the train split
print_classfication_reports(linear_svc)

In [None]:
# grid search: tune the regularisation strength C of the linear SVC, scored on recall

param_grid = dict(classifier__C=[0.1, 1.0, 10, 100])

grid_search = GridSearchCV(
    estimator=linear_svc,
    param_grid=param_grid,
    scoring='recall',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

print("Best params:", grid_search.best_params_, sep='\n')

In [None]:
# whole model pipeline with best params
# NOTE(review): C=1.0 is hard-coded — presumably the winner of the grid search;
# re-check it whenever the search is re-run

linear_svc = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LinearSVC(C=1.0, random_state=20210518))
])

# model fitting: time one fit of the pipeline on the training split

start_time = time.time()

linear_svc.fit(X_train, y_train)

end_time = time.time()

# wall-clock seconds spent in .fit(), printed with three decimals
training_time = end_time - start_time
print(f'model training time: {training_time:.3f}s')

# model evaluation
print_score(linear_svc)

# classification reports
print_classfication_reports(linear_svc)

In [None]:
# best of linear SVC
# NOTE(review): the time and recall below are hard-coded rather than read from the
# fitted model — presumably copied from an earlier run; confirm after re-running

model = 'linear SVC'
training_time = 4.48
# stored as `recall` so the `recall_score` function imported from sklearn.metrics
# is not replaced by a float
recall = 0.57

model_list.append(model)
training_time_list.append(training_time)
recall_score_list.append(recall)

# 4.5 K Nearest Neighbors Classification

In [None]:
# preprocessing stays the same as in section 4.2
# only the model pipeline changes

In [None]:
# full pipeline: shared preprocessing followed by a k-nearest-neighbours classifier
# left at its default settings
kneighbors = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier()),
])

In [None]:
# model fitting: time one fit of the pipeline on the training split

start_time = time.time()

kneighbors.fit(X_train, y_train)

end_time = time.time()

# wall-clock seconds spent in .fit(), printed with three decimals
training_time = end_time - start_time
print(f'model training time: {training_time:.3f}s')

In [None]:
# model evaluation: prints the pipeline's .score() on the test split
print_score(kneighbors)

# classification reports: printed for the test split first, then the train split
print_classfication_reports(kneighbors)

In [None]:
# grid search: tune the number of neighbours, scored on recall

param_grid = {
    'classifier__n_neighbors': [5, 10, 25, 50, 100]
}

# cv and n_jobs are spelled out so this search uses the same 5-fold, all-cores
# setup as the other grid searches in this notebook
grid_search = GridSearchCV(kneighbors, param_grid, scoring='recall', verbose=1, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best params:")
print(grid_search.best_params_)

In [None]:
# whole model pipeline with best params
# NOTE(review): n_neighbors=25 is hard-coded — presumably the winner of the grid
# search; re-check it whenever the search is re-run

kneighbors = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier(n_neighbors=25))
])

# model fitting: time one fit of the pipeline on the training split

start_time = time.time()

kneighbors.fit(X_train, y_train)

end_time = time.time()

# wall-clock seconds spent in .fit(), printed with three decimals
training_time = end_time - start_time
print(f'model training time: {training_time:.3f}s')

# model evaluation
print_score(kneighbors)

# classification reports
print_classfication_reports(kneighbors)

In [None]:
# best of K nearest neighbor classification
# NOTE(review): the time and recall below are hard-coded rather than read from the
# fitted model — presumably copied from an earlier run; confirm after re-running

model = 'K nearest neighbor'
training_time = 0.24
# stored as `recall` so the `recall_score` function imported from sklearn.metrics
# is not replaced by a float
recall = 0.60

model_list.append(model)
training_time_list.append(training_time)
recall_score_list.append(recall)

# 4.6 Decision Tree Classification

In [None]:
# preprocessing stays the same as in section 4.2
# only the model pipeline changes

In [None]:
# full pipeline: shared preprocessing followed by a class-weight-balanced decision tree
decision_tree = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(class_weight='balanced', random_state=20210521)),
])

In [None]:
# model fitting: time one fit of the pipeline on the training split

start_time = time.time()

decision_tree.fit(X_train, y_train)

end_time = time.time()

# wall-clock seconds spent in .fit(), printed with three decimals
training_time = end_time - start_time
print(f'model training time: {training_time:.3f}s')

In [None]:
# model evaluation: prints the pipeline's .score() on the test split
print_score(decision_tree)

# classification reports: printed for the test split first, then the train split
print_classfication_reports(decision_tree)

In [None]:
# grid search 1: sweep tree depth and minimum leaf size over the same candidate
# values, scored on recall

# 1..10 in steps of one, then 12..20 in steps of two
depth = [*range(1, 11), *range(12, 21, 2)]
num_leaf = list(depth)

param_grid = dict(
    classifier__max_depth=depth,
    classifier__min_samples_leaf=num_leaf,
)

grid_search = GridSearchCV(
    estimator=decision_tree,
    param_grid=param_grid,
    scoring='recall',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

print("Best params:", grid_search.best_params_, sep='\n')

In [None]:
# whole model pipeline with best params
# NOTE(review): max_depth=6 and min_samples_leaf=4 are hard-coded — presumably the
# winners of the grid search; re-check them whenever the search is re-run

decision_tree = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(max_depth=6, min_samples_leaf=4, random_state=20210521, class_weight='balanced'))
])

# model fitting: time one fit of the pipeline on the training split

start_time = time.time()

decision_tree.fit(X_train, y_train)

end_time = time.time()

# wall-clock seconds spent in .fit(), printed with three decimals
training_time = end_time - start_time
print(f'model training time: {training_time:.3f}s')

# model evaluation
print_score(decision_tree)

# classification reports

print_classfication_reports(decision_tree)

In [None]:
# best of decision tree classification
# NOTE(review): the time and recall below are hard-coded rather than read from the
# fitted model — presumably copied from an earlier run; confirm after re-running

model = 'decision tree'
training_time = 0.32
# stored as `recall` so the `recall_score` function imported from sklearn.metrics
# is not replaced by a float
recall = 0.88

model_list.append(model)
training_time_list.append(training_time)
recall_score_list.append(recall)

In [None]:
# save model: pickle the fitted decision-tree pipeline into the working directory

file_name = 'decision_tree_model.pkl'
with open(file_name, mode='wb') as model_file:
    pickle.dump(decision_tree, model_file)

In [None]:
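# for the application: reload the saved pipeline
# (only unpickle files from a trusted source — pickle.load can execute arbitrary code)

# file_name = 'decision_tree_model.pkl'
# with open(file_name, 'rb') as file:
#     model = pickle.load(file)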

# 4.7 Adaptive Boosting (Adaboost) Classification

In [None]:
# preprocessing stays the same as in section 4.2
# only the model pipeline changes

In [None]:
# full pipeline: shared preprocessing followed by AdaBoost over a balanced decision tree
# NOTE(review): the base tree uses max_depth=60 while the tuned tree in section 4.6
# uses max_depth=6 — confirm the deeper tree is intentional

adaboost = Pipeline([
    ('preprocessor', preprocessor),
    (
        'classifier',
        AdaBoostClassifier(
            DecisionTreeClassifier(class_weight='balanced', max_depth=60, random_state=20210521),
            random_state=20210526,
        ),
    ),
])

In [None]:
# model fitting: time one fit of the pipeline on the training split

start_time = time.time()

adaboost.fit(X_train, y_train)

end_time = time.time()

# wall-clock seconds spent in .fit(), printed with three decimals
training_time = end_time - start_time
print(f'model training time: {training_time:.3f}s')

In [None]:
# model evaluation: prints the pipeline's .score() on the test split
print_score(adaboost)

# classification reports: printed for the test split first, then the train split
print_classfication_reports(adaboost)

In [None]:
# best of AdaBoost classification
# NOTE(review): the time and recall below are hard-coded rather than read from the
# fitted model — presumably copied from an earlier run; confirm after re-running

model = 'AdaBoost'
training_time = 14.59
# stored as `recall` so the `recall_score` function imported from sklearn.metrics
# is not replaced by a float
recall = 0.61

model_list.append(model)
training_time_list.append(training_time)
recall_score_list.append(recall)

# 4.8 Performance Metrics Comparison

In [None]:
# one row per model, built from the three accumulator lists
df = pd.DataFrame(dict(
    model=model_list,
    training_time=training_time_list,
    recall_score=recall_score_list,
))

In [None]:
# display the assembled comparison table
df

In [None]:
# efficiency is the reciprocal of the recorded training time
df['efficiency (1/time)'] = df['training_time'].apply(lambda seconds: 1 / seconds)

In [None]:
# display the comparison table again
df

In [None]:
# comparison figure: efficiency as a line on the left axis, recall as bars on a
# twin right axis that shares the same x positions
fig, ax1 = plt.subplots(figsize=(20, 10))
ax2 = ax1.twinx()

# no `palette` here: the line has no hue variable for a palette to colour
sns.lineplot(data=df['efficiency (1/time)'], marker='o', sort=False, ax=ax1)
sns.barplot(data=df, x='model', y='recall_score', alpha=0.5, ax=ax2, palette='Reds')

# label the axes explicitly so the figure can be read on its own; the x label goes
# on ax1 because twinx() hides the twin axes' x-axis
ax1.set_xlabel('model')
ax1.set_ylabel('efficiency (1 / training time)')
ax2.set_ylabel('recall score')
ax1.set_title('Training efficiency (line) and recall (bars) per model');

In [None]:
#END