In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
# import seaborn as sns

%matplotlib inline

import pickle
import gzip

In [2]:
#train, test split
from sklearn.model_selection import GridSearchCV, train_test_split

# models
from lightgbm import LGBMClassifier
from lightgbm import plot_importance
import xgboost as xgb
from xgboost import plot_importance
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier

#evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# imbalanced learn
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

In [None]:
# 데이터 불러오기
with gzip.open('../preprocessed/final_data.pickle','rb') as f:
    final = pickle.load(f)

final.head()

In [4]:
# 완전 중복되는 열인지 확인
# final['desired_amount_x'].equals(final['desired_amount_y'])

True

In [5]:
# 같은 column 발견해서 여럿 drop
# final = final.drop(['desired_amount_x', 'income_type', 'employment_type'], axis=1)
final = final.drop(['desired_amount_x'], axis=1)

In [8]:
final_temp=pd.concat([final.iloc[:,0],final.iloc[:,3:]],axis=1)
len(final_temp.columns)
final=final_temp

In [9]:
final.rename(columns={'desired_amount_y': 'desired_amount'}, inplace=True)

In [10]:
[col for col in final.columns if "_x" in col]

[]

In [None]:
final.columns

In [None]:
final.isna().sum()

In [13]:
final.shape

(6833617, 39)

In [14]:
final.reset_index(inplace=True)

In [None]:
final.head()

In [16]:
X = final.drop(['is_applied', 'application_id', 'user_id'], axis = 1)
y = final['is_applied']

In [17]:
# feature별 값의 범위 스케일 해주기

# standard scaler (z-score)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # X만 본인이 가지고 있는 변수로 넣어주기 (X는 label 없이 only feature만!)

In [49]:
# scaler 피클 파일로 저장
with open('../preprocessed/scaler_final.pickle','wb') as f:
    pickle.dump(scaler, f, pickle.HIGHEST_PROTOCOL)

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.3, random_state=777, stratify=y)

In [48]:
# train set 클래스
from collections import Counter
print(sorted(Counter(y_train).items()))

[(0.0, 4535212), (1.0, 248319)]


## models with SMOTE

In [18]:
# Random Undersampling
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
for ratio in [1,0.5,0.33,0.2]:
    print('ratio : ',ratio)
    # ratio = (number of samples in the minority class) / (number of samples in the majority class)
    random_under = RandomUnderSampler(sampling_strategy=ratio, random_state=777) # 1:1 (X), 1:2 (0.5), 1:3 (0.33), 1:5 (0.2)
    X_under, y_under = random_under.fit_resample(X_train, y_train)

    print('after under sampling')
    display(pd.Series(y_under).value_counts())
    print('\nafter smote')
    # SMOTE

    
    print('smote 후 데이터 class별 개수', sorted(Counter(y_resampled).items()))

    # no tuning, base model

    # XGBoost
    xgb_model = xgb.XGBClassifier(random_state=777, tree_method='gpu_hist', gpu_id=0)
    xgb_model.fit(X_resampled, y_resampled)
    print('XGBoost')
    y_pred_xgb = xgb_model.predict(X_test)
    print(classification_report(y_test, y_pred_xgb))
    
    # LGBM
    lgbm = LGBMClassifier(random_state=777, n_jobs=-1, n_estimators=200, objective='binary', is_unbalance=True) # device='gpu', 
    lgbm.fit(X_resampled, y_resampled)
    print('LightGBM')
    y_pred_lgbm = lgbm.predict(X_test)
    print(classification_report(y_test, y_pred_lgbm)) # SMOTE까지 한 f1=0.4,

    # Logistic
    lr_base = LogisticRegression(random_state=777)
    lr_base.fit(X_resampled, y_resampled)
    print('Logistic')
    y_pred_lr = lr_base.predict(X_test)
    print(classification_report(y_test, y_pred_lr))

    # Random Forest
    rf_clf = RandomForestClassifier(n_estimators = 100, random_state=777)
    rf_clf.fit(X_resampled, y_resampled)
    print('Random Forest')
    y_preds_rf = rf_clf.predict(X_test)
    print(classification_report(y_test, y_preds_rf))

ratio :  1
after under sampling


0.0    248319
1.0    248319
Name: is_applied, dtype: int64


after smote
smote 후 데이터 class별 개수 [(0.0, 248319), (1.0, 248319)]
XGBoost
              precision    recall  f1-score   support

         0.0       0.99      0.80      0.89   1943664
         1.0       0.20      0.88      0.32    106422

    accuracy                           0.81   2050086
   macro avg       0.59      0.84      0.60   2050086
weighted avg       0.95      0.81      0.86   2050086

LightGBM
              precision    recall  f1-score   support

         0.0       0.99      0.80      0.89   1943664
         1.0       0.19      0.88      0.32    106422

    accuracy                           0.80   2050086
   macro avg       0.59      0.84      0.60   2050086
weighted avg       0.95      0.80      0.86   2050086



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic
              precision    recall  f1-score   support

         0.0       0.99      0.76      0.86   1943664
         1.0       0.16      0.82      0.26    106422

    accuracy                           0.76   2050086
   macro avg       0.57      0.79      0.56   2050086
weighted avg       0.94      0.76      0.83   2050086

Random Forest
              precision    recall  f1-score   support

         0.0       0.99      0.80      0.89   1943664
         1.0       0.20      0.88      0.32    106422

    accuracy                           0.81   2050086
   macro avg       0.59      0.84      0.60   2050086
weighted avg       0.95      0.81      0.86   2050086

ratio :  0.5
after under sampling


0.0    496638
1.0    248319
Name: is_applied, dtype: int64


after smote
smote 후 데이터 class별 개수 [(0.0, 496638), (1.0, 496638)]
XGBoost
              precision    recall  f1-score   support

         0.0       0.99      0.86      0.92   1943664
         1.0       0.24      0.79      0.37    106422

    accuracy                           0.86   2050086
   macro avg       0.61      0.82      0.64   2050086
weighted avg       0.95      0.86      0.89   2050086

LightGBM
              precision    recall  f1-score   support

         0.0       0.99      0.86      0.92   1943664
         1.0       0.24      0.79      0.37    106422

    accuracy                           0.86   2050086
   macro avg       0.61      0.83      0.64   2050086
weighted avg       0.95      0.86      0.89   2050086



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic
              precision    recall  f1-score   support

         0.0       0.99      0.76      0.86   1943664
         1.0       0.16      0.82      0.26    106422

    accuracy                           0.76   2050086
   macro avg       0.57      0.79      0.56   2050086
weighted avg       0.94      0.76      0.83   2050086

Random Forest
              precision    recall  f1-score   support

         0.0       0.99      0.86      0.92   1943664
         1.0       0.23      0.81      0.36    106422

    accuracy                           0.85   2050086
   macro avg       0.61      0.83      0.64   2050086
weighted avg       0.95      0.85      0.89   2050086

ratio :  0.33
after under sampling


0.0    752481
1.0    248319
Name: is_applied, dtype: int64


after smote
smote 후 데이터 class별 개수 [(0.0, 752481), (1.0, 752481)]
XGBoost
              precision    recall  f1-score   support

         0.0       0.98      0.90      0.94   1943664
         1.0       0.27      0.70      0.40    106422

    accuracy                           0.89   2050086
   macro avg       0.63      0.80      0.67   2050086
weighted avg       0.95      0.89      0.91   2050086

LightGBM
              precision    recall  f1-score   support

         0.0       0.98      0.90      0.94   1943664
         1.0       0.28      0.70      0.40    106422

    accuracy                           0.89   2050086
   macro avg       0.63      0.80      0.67   2050086
weighted avg       0.95      0.89      0.91   2050086



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic
              precision    recall  f1-score   support

         0.0       0.99      0.76      0.86   1943664
         1.0       0.16      0.81      0.26    106422

    accuracy                           0.76   2050086
   macro avg       0.57      0.79      0.56   2050086
weighted avg       0.94      0.76      0.83   2050086

Random Forest
              precision    recall  f1-score   support

         0.0       0.98      0.89      0.93   1943664
         1.0       0.27      0.75      0.39    106422

    accuracy                           0.88   2050086
   macro avg       0.63      0.82      0.66   2050086
weighted avg       0.95      0.88      0.91   2050086

ratio :  0.2
after under sampling


0.0    1241595
1.0     248319
Name: is_applied, dtype: int64


after smote
smote 후 데이터 class별 개수 [(0.0, 1241595), (1.0, 1241595)]
XGBoost
              precision    recall  f1-score   support

         0.0       0.98      0.93      0.95   1943664
         1.0       0.32      0.58      0.42    106422

    accuracy                           0.92   2050086
   macro avg       0.65      0.76      0.69   2050086
weighted avg       0.94      0.92      0.93   2050086

LightGBM
              precision    recall  f1-score   support

         0.0       0.98      0.94      0.96   1943664
         1.0       0.33      0.58      0.42    106422

    accuracy                           0.92   2050086
   macro avg       0.65      0.76      0.69   2050086
weighted avg       0.94      0.92      0.93   2050086



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic
              precision    recall  f1-score   support

         0.0       0.99      0.76      0.86   1943664
         1.0       0.16      0.81      0.26    106422

    accuracy                           0.76   2050086
   macro avg       0.57      0.79      0.56   2050086
weighted avg       0.94      0.76      0.83   2050086

Random Forest
              precision    recall  f1-score   support

         0.0       0.98      0.92      0.95   1943664
         1.0       0.31      0.65      0.42    106422

    accuracy                           0.91   2050086
   macro avg       0.65      0.79      0.69   2050086
weighted avg       0.95      0.91      0.92   2050086



### Random Forest ratio 변경해서 추가적으로 더 확인

In [19]:
# Random Undersampling
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
for ratio in [0.125, 0.1]:
    print('ratio : ',ratio)
    # ratio = (number of samples in the minority class) / (number of samples in the majority class)
    random_under = RandomUnderSampler(sampling_strategy=ratio, random_state=777) # 1:1 (X), 1:2 (0.5), 1:3 (0.33), 1:5 (0.2)
    X_under, y_under = random_under.fit_resample(X_train, y_train)

    print('after under sampling')
    display(pd.Series(y_under).value_counts())
    print('\nafter smote')
    # SMOTE

    smote = SMOTE(random_state=777) # SMOTE의 하이퍼파라미터는 default로 사용
    X_resampled, y_resampled=smote.fit_resample(X_under, y_under)
    print('smote 후 데이터 class별 개수', sorted(Counter(y_resampled).items()))

    # no tuning, base model
    # Random Forest
    rf_clf = RandomForestClassifier(n_estimators = 100, random_state=777)
    rf_clf.fit(X_resampled, y_resampled)
    print('Random Forest')
    y_preds_rf = rf_clf.predict(X_test)
    print(classification_report(y_test, y_preds_rf))

ratio :  0.125
after under sampling


0.0    1986552
1.0     248319
Name: is_applied, dtype: int64


after smote
smote 후 데이터 class별 개수 [(0.0, 1986552), (1.0, 1986552)]
Random Forest
              precision    recall  f1-score   support

         0.0       0.97      0.95      0.96   1943664
         1.0       0.36      0.54      0.43    106422

    accuracy                           0.93   2050086
   macro avg       0.67      0.74      0.69   2050086
weighted avg       0.94      0.93      0.93   2050086

ratio :  0.1
after under sampling


0.0    2483190
1.0     248319
Name: is_applied, dtype: int64


after smote
smote 후 데이터 class별 개수 [(0.0, 2483190), (1.0, 2483190)]
Random Forest
              precision    recall  f1-score   support

         0.0       0.97      0.96      0.96   1943664
         1.0       0.38      0.48      0.43    106422

    accuracy                           0.93   2050086
   macro avg       0.68      0.72      0.69   2050086
weighted avg       0.94      0.93      0.94   2050086



## models without SMOTE

In [None]:
# Random Undersampling
from imblearn.under_sampling import RandomUnderSampler

# undersampling 비율 별로 모델별 성능이 어떻게 되는지 확인
for ratio in [1,0.5,0.2]:
    print('ratio : ',ratio)
    # ratio = (number of samples in the minority class) / (number of samples in the majority class)
    random_under = RandomUnderSampler(sampling_strategy=ratio, random_state=777) # 1:1 (X), 1:2 (0.5), 1:3 (0.33), 1:5 (0.2)
    X_under, y_under = random_under.fit_resample(X_train, y_train)

    print('after under sampling')
    display(pd.Series(y_under).value_counts())
    
    # no tuning, base model

    # XGBoost
    xgb_model = xgb.XGBClassifier(random_state=777, tree_method='gpu_hist', gpu_id=0)
    xgb_model.fit(X_under, y_under)
    print('XGBoost')
    y_pred_xgb = xgb_model.predict(X_test)
    print(classification_report(y_test, y_pred_xgb))
    
    # LGBM
    lgbm = LGBMClassifier(random_state=777, n_jobs=-1, n_estimators=200, objective='binary', is_unbalance=True) # device='gpu', 
    lgbm.fit(X_under, y_under)
    print('LightGBM')
    y_pred_lgbm = lgbm.predict(X_test)
    print(classification_report(y_test, y_pred_lgbm)) # SMOTE까지 한 f1=0.4,

    # Logistic
    lr_base = LogisticRegression(random_state=777)
    lr_base.fit(X_under, y_under)
    print('Logistic')
    y_pred_lr = lr_base.predict(X_test)
    print(classification_report(y_test, y_pred_lr))

    # Random Forest
    rf_clf = RandomForestClassifier(n_estimators = 100, random_state=777)
    rf_clf.fit(X_under, y_under)
    print('Random Forest')
    y_preds_rf = rf_clf.predict(X_test)
    print(classification_report(y_test, y_preds_rf))

### Random Forest without SMOTE 비율 달리해서 실험

In [None]:
# Random Undersampling
from imblearn.under_sampling import RandomUnderSampler

for ratio in [0.17, 0.143, 0.125, 0.1]:
    print('ratio : ',ratio)
    # ratio = (number of samples in the minority class) / (number of samples in the majority class)
    random_under = RandomUnderSampler(sampling_strategy=ratio, random_state=777) # 1:1 (X), 1:2 (0.5), 1:3 (0.33), 1:5 (0.2)
    X_under, y_under = random_under.fit_resample(X_train, y_train)

    print('after under sampling')
    display(pd.Series(y_under).value_counts())
    
    # no tuning, base model
    # Random Forest
    rf_clf = RandomForestClassifier(n_estimators = 100, random_state=777)
    rf_clf.fit(X_under, y_under)
    print('Random Forest')
    y_preds_rf = rf_clf.predict(X_test)
    print(classification_report(y_test, y_preds_rf))

## Tuning ("without" SMOTE)

### XGBoost

In [None]:
random_under = RandomUnderSampler(sampling_strategy=0.2, random_state=777)
X_under, y_under = random_under.fit_resample(X_train, y_train)

estimators = [100, 150, 200, 250, 300, 350]
# min_child_weight = [] # default 1
max_depths = [20, 30] # default 6
subsamples = [0.7, 1.0] # default 1
gammas = [0, 5, 10]

from itertools import product as prod

for n_estimator, max_depth, subsample, gamma in prod(
    estimators, max_depths, subsamples, gammas):
    
    xgbc = xgb.XGBClassifier(random_state = 777, tree_method='gpu_hist', gpu_id=0,
            n_estimators = n_estimator,
            max_depth = max_depth,
            subsample = subsample,
            gamma = gamma,
            )
    xgbc.fit(X_under, y_under)

    y_train_pred_xgb = xgbc.predict(X_train)
    y_under_pred_xgb = xgbc.predict(X_under)
    y_test_pred_xgb = xgbc.predict(X_test)

    print(f'n_estimators : {n_estimator}, max_depth : {max_depth}, subsample : {subsample}, gamma : {gamma}')
    print(f'f1 score with train set: {f1_score(y_train, y_train_pred_xgb)}')
    print(f'f1 score with test set: {f1_score(y_test, y_test_pred_xgb)}')
    print(f'f1 score with under: {f1_score(y_under, y_under_pred_xgb)}')

In [31]:
xgb_tune = xgb.XGBClassifier(random_state = 777, tree_method='gpu_hist', gpu_id=0,
            n_estimators=100, max_depth=20, subsample=1.0, gamma=0)
xgb_tune.fit(X_under, y_under)
proba = xgb_tune.predict_proba(X_test)

In [32]:
@np.vectorize
def is_positive(x, thres):
    if x>thres:
        y = 1
    else:
        y = 0
    return y

def get_best_combination(proba, real):
    best_score = 0
    best_thres = 0
    for thres in np.arange(0.1,0.9, 0.01):
        score = f1_score(is_positive(proba, thres), real)
        if score >= best_score:
            best_score = score
            best_thres = thres
    return best_score, best_thres

In [35]:
# f1_score(is_positive(proba[:,1], 0.5), y_test)

0.4452837447425564

In [36]:
get_best_combination(proba[:,1], y_test)

(0.445409073241964, 0.47999999999999976)

### Random Forest

In [None]:
# SMOTE 사용하지 않는 random forest tuning
random_under = RandomUnderSampler(sampling_strategy=0.2, random_state=777)
X_under, y_under = random_under.fit_resample(X_train, y_train)
n_estimators = [150,250]
max_depths = [20, 30]
min_samples_splits = [8,16]
min_samples_leafs = [8,18]

for n_estimator, max_depth, min_samples_split, min_samples_leaf in prod(
    n_estimators, max_depths, min_samples_splits, min_samples_leafs):
    
    rf_clf = RandomForestClassifier(random_state = 777,
            n_estimators = n_estimator,
            max_depth = max_depth,
            min_samples_split = min_samples_split,
            min_samples_leaf=min_samples_leaf)
    rf_clf.fit(X_under, y_under)

    y_train_pred = rf_clf.predict(X_train)
    y_under_pred = rf_clf.predict(X_under)
    y_test_pred = rf_clf.predict(X_test)
    print(f'n_estimators : {n_estimator}, max_depth : {max_depth}, min_samples_split : {min_samples_split}, min_samples_leaf : {min_samples_leaf}')
    print(f'f1 score with train set: {f1_score(y_train, y_train_pred)}')
    print(f'f1 score with test set: {f1_score(y_test, y_test_pred)}')
    print(f'f1 score with under: {f1_score(y_under, y_under_pred)}')

### Logistic

In [22]:
random_under = RandomUnderSampler(sampling_strategy=0.2, random_state=777)
X_under, y_under = random_under.fit_resample(X_train, y_train)

solvers = ['lbfgs', 'sag', 'saga']
C_list = [100, 10, 1.0, 0.1, 0.01]


for solver, c_value in prod(solvers, C_list):
    
    logistic = LogisticRegression(random_state = 777,
            solver = solver,
            penalty='l2',
            C = c_value, n_jobs=-1)
    xgbc.fit(X_under, y_under)

    y_train_pred_log = xgbc.predict(X_train)
    y_under_pred_log = xgbc.predict(X_under)
    y_test_pred_log = xgbc.predict(X_test)

    print(f'solver : {solver}, C : {c_value}')
    print(f'f1 score with train set: {f1_score(y_train, y_train_pred_log)}')
    print(f'f1 score with test set: {f1_score(y_test, y_test_pred_log)}')
    print(f'f1 score with under: {f1_score(y_under, y_under_pred_log)}')

solver : lbfgs, C : 100
f1 score with train set: 0.731668701020715
f1 score with test set: 0.4362482127119604
f1 score with under: 0.9993737880987522
solver : lbfgs, C : 10
f1 score with train set: 0.731668701020715
f1 score with test set: 0.4362482127119604
f1 score with under: 0.9993737880987522
solver : lbfgs, C : 1.0
f1 score with train set: 0.731668701020715
f1 score with test set: 0.4362482127119604
f1 score with under: 0.9993737880987522
solver : lbfgs, C : 0.1
f1 score with train set: 0.731668701020715
f1 score with test set: 0.4362482127119604
f1 score with under: 0.9993737880987522
solver : lbfgs, C : 0.01
f1 score with train set: 0.731668701020715
f1 score with test set: 0.4362482127119604
f1 score with under: 0.9993737880987522
solver : sag, C : 100
f1 score with train set: 0.731668701020715
f1 score with test set: 0.4362482127119604
f1 score with under: 0.9993737880987522
solver : sag, C : 10
f1 score with train set: 0.731668701020715
f1 score with test set: 0.436248212711

## 튜닝한 파라미터들로 모델 학습 시키기

In [50]:
random_under = RandomUnderSampler(sampling_strategy=0.2, random_state=777)
X_under, y_under = random_under.fit_resample(X_train, y_train)

smote = SMOTE(random_state=777) # SMOTE의 하이퍼파라미터는 default로 사용
X_resampled, y_resampled=smote.fit_resample(X_under, y_under)

In [51]:
# XGBoost
xgb_tuned = xgb.XGBClassifier(random_state=777, n_estimators=100, 
                            max_depth=20, subsample=1.0, gamma=0, 
                            tree_method='gpu_hist', gpu_id=0)
xgb_tuned.fit(X_under, y_under)
print('xgboost done')

# LGBM
lgbm_tuned = LGBMClassifier(num_leaves=70, max_depth=25, min_child_samples=100, 
                        random_state=777, n_jobs=-1, n_estimators=200, objective='binary', 
                        is_unbalance=True)
lgbm_tuned.fit(X_resampled, y_resampled)
print('lgbm done')

# Logistic
lr_tuned = LogisticRegression(random_state=777, penalty='l2', C=1, n_jobs=-1)
lr_tuned.fit(X_under, y_under)
print('logistic done')

# Random Forest
rf_tuned = RandomForestClassifier(n_estimators=100, random_state=777)
rf_tuned.fit(X_under, y_under)
print('random forest done')

xgboost done
lgbm done


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


logistic done
random forest done


In [52]:
y_pred = rf_tuned.predict(X_test)

## 모델 앙상블: 경우의 수 확인해보기

In [53]:
@np.vectorize
def is_positive(x, thres):
    if x>thres:
        y = 1
    else:
        y = 0
    return y

def get_best_combination(proba, real):
    best_score = 0
    best_thres = 0
    for thres in np.arange(0.1,0.9, 0.01):
        score = f1_score(is_positive(proba, thres), real)
        if score >= best_score:
            best_score = score
            best_thres = thres
    return best_score, best_thres

In [55]:
from itertools import product as prod

models = [xgb_tuned, lgbm_tuned, lr_tuned, rf_tuned]
bin = [[0,1]] * len(models)
i=0
for a0,a1,a2,a3 in prod([0,1],[0,1],[0,1],[0,1]):
    if a0+a1+a2+a3==0:
        continue
    pred_models = []
    if a0 == 1:
        pred_models.append(models[0])
    if a1 == 1:
        pred_models.append(models[1])
    if a2 == 1:
        pred_models.append(models[2])
    if a3 == 1:
        pred_models.append(models[3])

    for i,model in enumerate(pred_models):
        if i==0:
            all_test_proba = model.predict_proba(X_test)
        else:
            all_test_proba += model.predict_proba(X_test)
    all_test_proba = all_test_proba/(i+1)
    best_score, best_thres = get_best_combination(all_test_proba[:,1], y_test)

    print(a0,a1,a2,a3, best_score, best_thres)
    



0 0 0 1 0.44022071433897547 0.47999999999999976
0 0 1 0 0.3397625968205511 0.3899999999999999
0 0 1 1 0.4161516184610873 0.4299999999999998
0 1 0 0 0.4280672831714793 0.5199999999999998
0 1 0 1 0.4419547912742298 0.4999999999999998
0 1 1 0 0.41012390284848815 0.44999999999999984
0 1 1 1 0.4313505916659464 0.45999999999999985
1 0 0 0 0.443500899228404 0.47999999999999976
1 0 0 1 0.4497652686534535 0.47999999999999976
1 0 1 0 0.44207843734559754 0.43999999999999984
1 0 1 1 0.44746560002396946 0.44999999999999984
1 1 0 0 0.4501996568913771 0.48999999999999977
1 1 0 1 0.4520239383433076 0.47999999999999976
1 1 1 0 0.4460009627691928 0.45999999999999985
1 1 1 1 0.44899313842482097 0.45999999999999985


## 학습데이터 통째로 다시 학습

In [19]:
random_under = RandomUnderSampler(sampling_strategy=0.2, random_state=777)
X_under_all, y_under_all = random_under.fit_resample(X_scaled, y)

smote = SMOTE(random_state=777) # SMOTE의 하이퍼파라미터는 default로 사용
X_resampled_all, y_resampled_all=smote.fit_resample(X_under_all, y_under_all)

In [20]:
# XGBoost
xgb_tuned_all = xgb.XGBClassifier(random_state=777, n_estimators=100, 
                            max_depth=20, subsample=1.0, gamma=0, 
                            tree_method='gpu_hist', gpu_id=0)
xgb_tuned_all.fit(X_under_all, y_under_all)
print('xgboost done')

# LGBM
lgbm_tuned_all = LGBMClassifier(num_leaves=70, max_depth=25, min_child_samples=100, 
                        random_state=777, n_jobs=-1, n_estimators=200, objective='binary', 
                        is_unbalance=True)
lgbm_tuned_all.fit(X_resampled_all, y_resampled_all)
print('lgbm done')

# Random Forest
rf_tuned_all = RandomForestClassifier(n_estimators=100, random_state=777)
rf_tuned_all.fit(X_under_all, y_under_all)
print('random forest done')

xgboost done
lgbm done
random forest done


In [21]:
with open('../preprocessed/xgb.pickle','wb') as fw:
    pickle.dump(xgb_tuned_all, fw)

In [22]:
with open('../preprocessed/lgbm.pickle','wb') as fw:
    pickle.dump(lgbm_tuned_all, fw)

In [24]:
with open('../preprocessed/random_forest.pickle','wb') as fw:
    pickle.dump(rf_tuned_all, fw)