# ML with pipeline

## Import packages

In [None]:
# Install the packages that Colab does not ship by default
!pip install -q aif360
!pip install -q shap
!pip install -q feature-engine
!pip install -q boruta
!pip install -q scikit-optimize

# Packages for editing data
import pandas as pd
import numpy as np

# Packages for preprocessing the training data
from feature_engine.selection import SmartCorrelatedSelection
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Packages for ML
from imblearn.pipeline import Pipeline # use imblearn pipeline to use under/over sampler instead of sklearn pipeline
from boruta import BorutaPy
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from skopt.plots import plot_objective, plot_histogram
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedStratifiedKFold, GridSearchCV
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
## from sklearn.neighbors import KNeighborsClassifier

# Packages for ML metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import *
from aif360.sklearn.metrics import specificity_score

# Packages for visualization
import matplotlib.pyplot as plt
import shap

# Packages for save models
import pickle

# Packages for convenience
from datetime import datetime
import joblib


[K     |████████████████████████████████| 214 kB 28.4 MB/s 
[K     |████████████████████████████████| 569 kB 33.6 MB/s 
[K     |████████████████████████████████| 276 kB 14.8 MB/s 
[K     |████████████████████████████████| 56 kB 4.6 MB/s 
[K     |████████████████████████████████| 100 kB 4.5 MB/s 
[?25h

## Import data

- one-hot encoded data

In [None]:
# Mount Google Drive under /content/gdrive (Colab only); the CSV paths used
# in the following cell live below this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# All survey waves sit in the same Drive folder and share one file-name pattern.
DATA_DIR = '/content/gdrive/My Drive/공동작업폴더(연찬,성은)/ICT_ML/data'


def load_survey(year):
    """Read one pre-processed NSOK survey wave from ``DATA_DIR``.

    Parameters
    ----------
    year : int
        Survey year; the file read is ``<year>_NSOK_preprocessed.csv``.

    Returns
    -------
    pandas.DataFrame
        The CSV contents with the ``'Unnamed: 0'`` column removed.
    """
    data_path = f'{DATA_DIR}/{year}_NSOK_preprocessed.csv'
    # TODO: double-check the separator / delimiter of the CSV export.
    survey = pd.read_csv(data_path, index_col=None)
    # 'Unnamed: 0' is presumably the row index written out when the CSV was
    # saved; it is not a feature, so drop it.
    return survey.drop('Unnamed: 0', axis=1)


data_2014 = load_survey(2014)
data_2017 = load_survey(2017)
data_2020 = load_survey(2020)

In [None]:
# Report (rows, columns) of every survey wave, oldest first.
for survey in (data_2014, data_2017, data_2020):
    print(survey.shape)

In [None]:
# Columns of the 2014 wave that contain missing values, with their NA counts.
tmp = data_2014.isnull().sum()
tmp.loc[tmp.ne(0)]

In [None]:
# Columns of the 2017 wave that contain missing values, with their NA counts.
tmp = data_2017.isnull().sum()
tmp.loc[tmp.ne(0)]

In [None]:
# Columns of the 2020 wave that contain missing values, with their NA counts.
tmp = data_2020.isnull().sum()
tmp.loc[tmp.ne(0)]

## Prepare data

- drop NA
- train-test split

In [None]:
# Complete-case analysis: discard every row that has at least one missing
# value, separately for each survey wave.
data_2014, data_2017, data_2020 = (
    survey.dropna(axis=0, how='any')
    for survey in (data_2014, data_2017, data_2020)
)

In [None]:
# Pool the survey waves into one modelling set: features X and target y ('suicide').
# All three waves (2014 + 2017 + 2020) are used here; to model fewer waves,
# shorten the list, e.g. [data_2014] or [data_2014, data_2017].
survey_waves = [data_2014, data_2017, data_2020]

X = pd.concat([wave.drop('suicide', axis=1) for wave in survey_waves])
y = pd.concat([wave['suicide'] for wave in survey_waves])

len(X)

In [None]:
# Stratified 80/20 hold-out split. The seed is fixed (same value as the other
# random_state settings in this notebook) so that re-running the notebook
# reproduces exactly the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, shuffle = True, stratify = y, random_state = 1120)

In [None]:
# Class balance as "<negatives> <positives>": training split first, then test split.
for labels in (y_train, y_test):
    print(int((labels == 0).sum()), int((labels == 1).sum()))

## Modeling

- Remove one of correlated features (Logistic only / SVM???)
- Scaling
- Select top N features
- Oversampling 

In [None]:
# https://www.kaggle.com/code/jsaguiar/baseline-with-multiple-models
rf = RandomForestRegressor(n_jobs=-1, max_depth=5)
# define Boruta feature selection method
selector = BorutaPy(rf, n_estimators='auto', verbose=0,
                    random_state=1, max_iter=500)
# find all relevant features
selector.fit(X_train.values, y_train.values)
# transform values
X_filtered = selector.transform(X_train.values)
X_train = pd.DataFrame(X_filtered, columns=X_train.columns[selector.support_])
X_train.head()

Unnamed: 0,B2_3_y,B3_y,B4_1_etc_y,call_child_out,H12_1_etc_y,J3b_1_13_y,J3b_3_13_y,RES_AGE_y,E1_2_y,B2_2R_15_y,gds15_i,hospital_no,F4_y,H14_1_3_y,H16_4_y
0,2.0,2.0,1.0,4.0,0.0,4900.0,6600.0,69.0,99.0,0.0,0.0,0.0,1.0,1.0,1.0
1,3.0,2.0,3.0,3.0,0.0,292.0,632.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2.0,0.0,0.0,2.0,0.0,2500.0,2800.0,65.0,99.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3.0,3.0,1.0,2.5,0.0,850.0,1260.0,81.0,99.0,0.0,1.0,0.0,0.0,0.0,0.0
4,2.0,2.0,0.0,7.0,0.0,1028.0,1306.0,69.0,99.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
# feature_selecter = SmartCorrelatedSelection( # drops correlated
#                                             method = 'pearson', 
#                                             threshold=0.5, 
#                                             missing_values='ignore', 
#                                             selection_method='model_performance', 
#                                             estimator = LinearRegression(), 
#                                             scoring='neg_root_mean_squared_error', # RMSE 
#                                             cv=3
#                                             )

# Pipeline: min-max scaling -> ADASYN oversampling -> SVM.
# The 'model' step is a placeholder; the search spaces below swap in the
# candidate estimators.
scaler = MinMaxScaler()
augmentation = ADASYN(random_state = 1120)
svm = SVC()


pipe = Pipeline(steps=[('scaler', scaler),
                       ('augmentation', augmentation),
                       ('model', svm)])

# pipe.fit(X_train, y_train)
# cv_idx = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 1120)

# Sub-space 1: linear SVM, tuning only C on a log scale.
linsvc_search = {
    'model': [LinearSVC(max_iter=1000)],
    'model__C': (1e-6, 1e+6, 'log-uniform'),
}

# Sub-space 2: kernel SVM, tuning C, gamma, polynomial degree and kernel type.
svc_search = {
    'model': Categorical([SVC()]),
    'model__C': Real(1e-6, 1e+6, prior='log-uniform'),
    'model__gamma': Real(1e-6, 1e+1, prior='log-uniform'),
    'model__degree': Integer(1,8),
    'model__kernel': Categorical(['linear', 'poly', 'rbf']),
}

# https://scikit-optimize.github.io/stable/auto_examples/sklearn-gridsearchcv-replacement.html
# https://scikit-optimize.github.io/stable/modules/generated/skopt.BayesSearchCV.html#skopt.BayesSearchCV
# Each (space, n) pair is a sub-space with its own number of search iterations.
# random_state is fixed so that the sequence of sampled hyper-parameters is
# reproducible across runs.
opt = BayesSearchCV(estimator = pipe,
                    search_spaces = [(svc_search, 40), (linsvc_search, 16)],
                    scoring = 'f1',
                    cv = 10,
                    n_jobs = -1,
                    random_state = 1120)

opt.fit(X_train, y_train)

print("val. score: %s" % opt.best_score_)
# X_train holds only the Boruta-selected columns, so the test split has to be
# restricted to those same columns (same order) before it can be scored.
print("test score: %s" % opt.score(X_test[X_train.columns], y_test))
print("best params: %s" % str(opt.best_params_))

In [None]:
# In-sample evaluation of the best SVM pipeline.
# SVC() without probability=True and LinearSVC do not provide predict_proba,
# so the signed distance to the decision boundary is used as the ranking score.
# ROC AUC depends only on the ordering of the scores, so margins are sufficient.
y_train_score = opt.decision_function(X_train.values)
y_train_pred = opt.predict(X_train.values)

print('Accuracy: %.4f' % accuracy_score(y_train, y_train_pred))
print('AUC: %.4f' % roc_auc_score(y_train, y_train_score))
print(classification_report(y_train, y_train_pred))

In [None]:
# # Optimal Threshold for ROC Curve (maximizing f1)
# # https://machinelearningmastery.com/threshold-moving-for-imbalanced-classification/
# # https://stackoverflow.com/questions/19984957/scikit-learn-predict-default-threshold

# y_test_prob = opt.predict_proba(X_test.values)
# fpr, tpr, thresholds = roc_curve(y_test, y_test_prob[:,1])
# gmeans = np.sqrt(tpr * (1-fpr))
# ix = np.argmax(gmeans)
# print('Best Threshold=%f, G-Mean=%.3f' % (thresholds[ix], gmeans[ix]))

In [None]:
# The test split must expose the same columns, in the same order, as X_train.
cor_list = list(X_train)
X_test = X_test[cor_list]

# SVC() without probability=True and LinearSVC do not provide predict_proba,
# so the signed distance to the decision boundary is used as the ranking score.
y_test_score = opt.decision_function(X_test.values)
# Later cells read the positive-class score as y_test_prob[:, 1], so keep that
# two-column layout. NOTE: the values are decision margins, not calibrated
# probabilities; they are only valid for rank-based metrics such as ROC AUC.
y_test_prob = np.column_stack([-y_test_score, y_test_score])
y_test_pred  = opt.predict(X_test.values)

print('Accuracy: %.4f' % accuracy_score(y_test, y_test_pred))
print('AUC: %.4f' % roc_auc_score(y_test, y_test_prob[:, 1]))
print(classification_report(y_test, y_test_pred))

In [None]:
# Hold-out metrics, printed one per line with four decimals, in this order:
#   AUROC, F1, Cohen's kappa, sensitivity (recall), accuracy, specificity, precision
auroc = roc_auc_score(y_test, y_test_prob[:, 1])
f1 = f1_score(y_test, y_test_pred)                    # 2 tp / (2 tp + fp + fn)
kappa = cohen_kappa_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)            # tp / (tp + fn)
accuracy = accuracy_score(y_test, y_test_pred)
specificity = specificity_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred)      # tp / (tp + fp)

for metric_value in (auroc, f1, kappa, recall, accuracy, specificity, precision):
    print('%.4f' % metric_value)

# # Threshold
# print('%.4f' % thresholds[ix])

# # Threshold
# print('%.4f' % thresholds[ix])

### Logistic Regression

In [None]:
## iterative imputation (Missing NOT at Random - MNAR, optional), min-max scaling, PCA, modeling

## Missing values in our data will not be randomly distributed, which means there is a bias. 
## The imputation is a method for responding to this bias, and scaling is performed after the imputation because it can distort this bias.
## -> imputation - scaling 
## https://stats.stackexchange.com/questions/138203/imputation-of-missing-data-before-or-after-centering-and-scaling

## Sampling techniques require a simple model to be trained.
## These models have better performance on pre-processed datasets
## -> imputation - scaling - augmentation
## https://stats.stackexchange.com/questions/363312/normalization-standardization-should-one-do-this-before-oversampling-undersampl

## We should probably check correlations before running PCA. If some variables
## are too highly correlated, the variance-driven distortion in PCA is said to get worse:
## https://www.quora.com/Is-it-correct-to-apply-PCA-after-resampling-the-data-using-SMOTE
## Isn't it fine as long as the components are uncorrelated after PCA?
## https://towardsdatascience.com/how-do-you-apply-pca-to-logistic-regression-to-remove-multicollinearity-10b7f8e89f9b
## No. The effect of components that load on several highly correlated variables risks being exaggerated.
## https://stats.stackexchange.com/questions/50537/should-one-remove-highly-correlated-variables-before-doing-pca
## So we need a feature-selection step that removes highly correlated variables.
## For the ordering, see https://stats.stackexchange.com/questions/183961/impute-missing-data-before-or-after-feature-selection
## There are 131 covariates, which is not that many.
## -> imputation - feature selection - scaling - augmentation - pca

## Use a filter method for feature selection.
## 1. Filter methods are faster than wrapper/embedded methods (computationally less expensive).
## 2. Our goal is not to maximize performance through feature selection but to make PCA more accurate. Our models are not affected by multicollinearity.
## Not "filter out everything"; filter only as much as PCA needs to work properly.

## None of the models we use is affected by multicollinearity. We still do feature selection so that the component effects are not distorted in PCA.
## mlp: https://datascience.stackexchange.com/questions/28328/how-does-multicollinearity-affect-neural-networks
## logistic: http://spss.datasolution.kr/artyboard/mboard.asp?exec=view&strBoardID=BOARD_QNA&intPage=21&intCategory=1&strSearchCategory=%7Cs_name%7Cs_subject%7C&strSearchWord=&intSeq=5551
## tree based: https://medium.com/@manepriyanka48/multicollinearity-in-tree-based-models-b971292db140 // https://datascience.stackexchange.com/questions/12554/does-xgboost-handle-multicollinearity-by-itself
## svm: https://medium.com/@raj5287/effects-of-multi-collinearity-in-logistic-regression-svm-rf-af6766d91f1b (a kernel other than RBF must be used)


## 1. Logistic Regression

# The pipeline below needs an imputer; define it in this cell so the cell runs
# on a fresh kernel instead of relying on a later cell having been executed.
imputer = IterativeImputer(random_state = 1120)

## https://www.kaggle.com/code/solegalli/feature-selection-with-feature-engine
## https://feature-engine.readthedocs.io/en/1.0.x/selection/SmartCorrelatedSelection.html
feature_selecter = SmartCorrelatedSelection( # drops correlated
                                            method = 'pearson', 
                                            threshold=0.5, 
                                            missing_values='ignore', 
                                            selection_method='model_performance', 
                                            estimator = LinearRegression(), 
                                            scoring='neg_root_mean_squared_error', # RMSE 
                                            cv=3
                                            )
scaler = MinMaxScaler()
augmentation = ADASYN(random_state = 1120)
pca = PCA()
lr = LogisticRegression(solver = 'liblinear', 
                        multi_class = 'auto',
                        max_iter = 1000,
                        random_state = 1120)

# Step order follows the plan stated above:
# imputation - feature selection - scaling - augmentation - pca - model
pipe = Pipeline(steps=[('imputer', imputer),
                       ('feature_selecter', feature_selecter),
                       ('scaler', scaler),
                       ('augmentation', augmentation),
                       ('pca', pca),
                       ('model', lr)])

pipe.fit(X_train, y_train)

cv_idx = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 1120)

# liblinear supports both the l1 and the l2 penalty.
param_grid = {'model__penalty':['l1', 'l2'],
              'model__C':[0.01, 0.1, 1]
}

grid_model = GridSearchCV(estimator = pipe,
                          param_grid = param_grid,
                          cv = cv_idx,
                          n_jobs = -1).fit(X_train, y_train)

In [None]:
# Display the hyper-parameter combination selected by the grid search above.
grid_model.best_params_

In [None]:
# In-sample performance of the refitted best logistic-regression pipeline.
train_matrix = X_train.values
y_train_prob = grid_model.predict_proba(train_matrix)
y_train_pred = grid_model.predict(train_matrix)

train_accuracy = accuracy_score(y_train, y_train_pred)
train_auc = roc_auc_score(y_train, y_train_prob[:, 1])  # column 1 = positive class
print('Accuracy: %.4f' % train_accuracy)
print('AUC: %.4f' % train_auc)
print(classification_report(y_train, y_train_pred))

In [None]:
# Pick the ROC operating point with the largest G-mean = sqrt(TPR * (1 - FPR)),
# i.e. the geometric mean of sensitivity and specificity.
# https://machinelearningmastery.com/threshold-moving-for-imbalanced-classification/
# https://stackoverflow.com/questions/19984957/scikit-learn-predict-default-threshold
# NOTE(review): the threshold is chosen on the test split and, in the cells
# shown here, only printed - it is not applied to the predictions.

y_test_prob = grid_model.predict_proba(X_test.values)
fpr, tpr, thresholds = roc_curve(y_test, y_test_prob[:, 1])
true_negative_rate = 1 - fpr
gmeans = np.sqrt(tpr * true_negative_rate)
ix = gmeans.argmax()
print('Best Threshold=%f, G-Mean=%.3f' % (thresholds[ix], gmeans[ix]))

In [None]:
# The test split must expose the same columns, in the same order, as X_train.
cor_list = list(X_train)
X_test = X_test[cor_list]

# Hold-out performance of the refitted best logistic-regression pipeline.
test_matrix = X_test.values
y_test_prob = grid_model.predict_proba(test_matrix)
y_test_pred  = grid_model.predict(test_matrix)

test_accuracy = accuracy_score(y_test, y_test_pred)
test_auc = roc_auc_score(y_test, y_test_prob[:, 1])  # column 1 = positive class
print('Accuracy: %.4f' % test_accuracy)
print('AUC: %.4f' % test_auc)
print(classification_report(y_test, y_test_pred))

In [None]:
# Hold-out metrics, printed one per line with four decimals, in this order:
#   AUROC, F1, Cohen's kappa, sensitivity (recall), accuracy, specificity,
#   precision, and finally the G-mean-optimal ROC threshold.
auroc = roc_auc_score(y_test, y_test_prob[:, 1])
f1 = f1_score(y_test, y_test_pred)                    # 2 tp / (2 tp + fp + fn)
kappa = cohen_kappa_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)            # tp / (tp + fn)
accuracy = accuracy_score(y_test, y_test_pred)
specificity = specificity_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred)      # tp / (tp + fp)

for metric_value in (auroc, f1, kappa, recall, accuracy, specificity, precision):
    print('%.4f' % metric_value)

# Threshold (thresholds / ix come from the ROC-threshold cell)
print('%.4f' % thresholds[ix])

## Modeling with 2014 + 2017 data

In [None]:
# Modelling set built from the 2014 and 2017 waves only.
survey_waves = [data_2014, data_2017]
X = pd.concat([wave.drop('suicide', axis=1) for wave in survey_waves])
y = pd.concat([wave['suicide'] for wave in survey_waves])

In [None]:
# Stratified 70/30 hold-out split. The seed is fixed (same value as the other
# random_state settings in this notebook) so that re-running the notebook
# reproduces exactly the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.3, shuffle = True, stratify = y, random_state = 1120)

In [None]:
# Class balance as "<negatives> <positives>": training split first, then test split.
for labels in (y_train, y_test):
    print(int((labels == 0).sum()), int((labels == 1).sum()))

13112 1142
5621 489


In [None]:
## iterative imputation (Missing NOT at Random - MNAR, optional), min-max scaling, PCA, modeling

## Each model needs its own pipeline.
## With MLOps in mind: how about training on the 2014 wave alone first, then
## adding 2017, then adding 2020, and reporting the results at each step?

## 1. Logistic Regression
## impute - scaling - augmentation - pca - modeling

imputer = IterativeImputer(random_state = 1120)
scaler = MinMaxScaler()
augmentation = ADASYN(random_state = 1120)
pca = PCA()
# max_iter is raised above the default because lbfgs stopped at its iteration
# limit on this data (see the convergence warnings recorded in this cell's output).
lr = LogisticRegression(solver = 'lbfgs', multi_class = 'auto', max_iter = 1000, random_state = 1120)


pipe = Pipeline(steps=[('imputer', imputer),
                       ('scaler', scaler),
                       ('augmentation', augmentation),
                       ('pca', pca),
                       ('model', lr)])

pipe.fit(X_train, y_train)

cv_idx = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 1120)

# NOTE(review): recent scikit-learn releases spell the unpenalised option as
# None rather than 'none' - adjust to the installed version.
param_grid = {'model__penalty':['l2', 'none'],
              'model__C':[0.01, 0.1, 1],
              # The Python object None (no class re-weighting), not the string
              # 'None', which is not a valid class_weight value.
              'model__class_weight':[None]
}

grid_model = GridSearchCV(estimator = pipe,
                          param_grid = param_grid,
                          cv = cv_idx,
                          n_jobs = -1).fit(X_train, y_train)

# NOTE(review): the scores below come from `pipe` (fitted above with its own
# settings), not from the tuned `grid_model` - confirm this is intended.
# Printed labels: "train evaluation" / "test evaluation".
print('학습 평가 : ', pipe.score(X_train, y_train))
print('테스트 평가 : ', pipe.score(X_test, y_test))

lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


학습 평가 :  0.738038445348674
테스트 평가 :  0.7356792144026186


In [None]:
# Score the directly fitted pipeline on both splits
# (printed labels: "train evaluation" / "test evaluation").
train_score = pipe.score(X_train, y_train)
test_score = pipe.score(X_test, y_test)
print('학습 평가 : ', train_score)
print('테스트 평가 : ', test_score)

학습 평가 :  0.738038445348674
테스트 평가 :  0.7356792144026186


## Modeling with 2014 + 2017 + 2020 data

In [None]:
# Modelling set built from all three waves (2014, 2017 and 2020).
survey_waves = [data_2014, data_2017, data_2020]
X = pd.concat([wave.drop('suicide', axis=1) for wave in survey_waves])
y = pd.concat([wave['suicide'] for wave in survey_waves])

In [None]:
# Stratified 70/30 hold-out split. The seed is fixed (same value as the other
# random_state settings in this notebook) so that re-running the notebook
# reproduces exactly the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.3, shuffle = True, stratify = y, random_state = 1120)

In [None]:
# Class balance as "<negatives> <positives>": training split first, then test split.
for labels in (y_train, y_test):
    print(int((labels == 0).sum()), int((labels == 1).sum()))

19925 1273
8541 545


In [None]:
## iterative imputation (Missing NOT at Random - MNAR, optional), min-max scaling, PCA, modeling

## Each model needs its own pipeline.
## With MLOps in mind: how about training on the 2014 wave alone first, then
## adding 2017, then adding 2020, and reporting the results at each step?

## 1. Logistic Regression
## impute - scaling - augmentation - pca - modeling

imputer = IterativeImputer(random_state = 1120)
scaler = MinMaxScaler()
augmentation = ADASYN(random_state = 1120)
pca = PCA()
# max_iter is raised above the default because lbfgs stopped at its iteration
# limit on this data (see the convergence warnings recorded in this cell's output).
lr = LogisticRegression(solver = 'lbfgs', multi_class = 'auto', max_iter = 1000, random_state = 1120)


pipe = Pipeline(steps=[('imputer', imputer),
                       ('scaler', scaler),
                       ('augmentation', augmentation),
                       ('pca', pca),
                       ('model', lr)])

pipe.fit(X_train, y_train)

cv_idx = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 1120)

# NOTE(review): recent scikit-learn releases spell the unpenalised option as
# None rather than 'none' - adjust to the installed version.
param_grid = {'model__penalty':['l2', 'none'],
              'model__C':[0.01, 0.1, 1],
              # The Python object None (no class re-weighting), not the string
              # 'None', which is not a valid class_weight value.
              'model__class_weight':[None]
}

grid_model = GridSearchCV(estimator = pipe,
                          param_grid = param_grid,
                          cv = cv_idx,
                          n_jobs = -1).fit(X_train, y_train)

# NOTE(review): the scores below come from `pipe` (fitted above with its own
# settings), not from the tuned `grid_model` - confirm this is intended.
# Printed labels: "train evaluation" / "test evaluation".
print('학습 평가 : ', pipe.score(X_train, y_train))
print('테스트 평가 : ', pipe.score(X_test, y_test))

lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


학습 평가 :  0.7665345787338428
테스트 평가 :  0.7587497248514198


In [None]:
# Score the directly fitted pipeline on both splits
# (printed labels: "train evaluation" / "test evaluation").
train_score = pipe.score(X_train, y_train)
test_score = pipe.score(X_test, y_test)
print('학습 평가 : ', train_score)
print('테스트 평가 : ', test_score)

학습 평가 :  0.7665345787338428
테스트 평가 :  0.7587497248514198
