In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
# import seaborn as sns

%matplotlib inline

import pickle
import gzip

In [3]:
#train, test split
from sklearn.model_selection import GridSearchCV, train_test_split
# models
from lightgbm import LGBMClassifier
from lightgbm import plot_importance
import xgboost as xgb
from xgboost import plot_importance
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
#evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [None]:
# Load the dataset from a gzip-compressed pickle file.
# NOTE(review): pickle.load can execute arbitrary code, so only open files
# that come from a trusted source.
with gzip.open('../preprocessed/final_data.pickle', mode='rb') as pickle_file:
    final = pickle.load(pickle_file)

# Preview the first rows as the cell output.
final.head()

In [5]:
# final['desired_amount_x'].equals(final['desired_amount_y'])

True

In [6]:
# A duplicated column was found, so drop one of the two copies.
final = final.drop(columns=['desired_amount_x'])

In [7]:
# Give the remaining copy its plain name (rebinding instead of mutating in place).
final = final.rename(columns={'desired_amount_y': 'desired_amount'})

In [8]:
# List every column whose name still contains "_x"; an empty list means none are left.
list(filter(lambda column_name: "_x" in column_name, final.columns))

[]

In [9]:
# Turn the current index into regular column(s) and fall back to a default integer index.
# NOTE(review): whatever the old index held now lives in a column; unless that
# column is dropped later it becomes part of the feature matrix — confirm this
# is intended (otherwise reset_index(drop=True) would discard it).
final = final.reset_index()

In [None]:
# Show the first five rows to check the frame after the index reset.
final.head(n=5)

In [11]:
# Feature matrix: every column except the label and the application_id / user_id columns.
non_feature_columns = ['is_applied', 'application_id', 'user_id']
X = final.drop(columns=non_feature_columns)
# Target vector: the is_applied label.
y = final['is_applied']

In [12]:
# Bring the value ranges of the individual features onto a common scale.

# Standard scaler (z-score).
from sklearn.preprocessing import StandardScaler
# Pass in only the feature matrix (X holds features only, without the label).
# NOTE(review): the scaler is fit on every row of X, i.e. before any
# train/test split, so held-out rows contribute to the learned mean/std —
# consider fitting on the training portion only.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [14]:
# Save for later use.
# The target file is named scaler.pickle, so persist the fitted StandardScaler
# itself: later code needs the scaler object to apply the same transform to new
# data, which the scaled training array alone cannot do.
# NOTE(review): any existing consumer that expects the scaled array in this
# file must be updated accordingly.
with gzip.open('../preprocessed/scaler.pickle', 'wb') as f:
    pickle.dump(scaler, f, pickle.HIGHEST_PROTOCOL)

In [15]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the scaled rows as a test set; stratify on y so both parts
# keep the same label proportions, and fix the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.3,
    stratify=y,
    random_state=777,
)

In [16]:
# Class counts in the training labels.
from collections import Counter
train_class_counts = Counter(y_train)
print(sorted(train_class_counts.items()))

[(0.0, 4535212), (1.0, 248319)]


In [18]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

# Step 1: random under-sampling of the training set (to a 1:5 ratio).
random_under = RandomUnderSampler(
    sampling_strategy=0.2,
    random_state=777,
)
X_under, y_under = random_under.fit_resample(X_train, y_train)

# Step 2: up-sample the under-sampled data with SMOTE
# (SMOTE hyperparameters are left at their defaults).
smote = SMOTE(random_state=777)
X_resampled, y_resampled = smote.fit_resample(X_under, y_under)

# Candidate values for each LightGBM hyperparameter under test.
num_leaves = [70, 90, 110]
max_depths = [10, 15, 20, 25]
min_child_samples = [100, 300, 500]

from itertools import product as prod

# Manual grid search: for every combination, fit a model on the resampled
# training data and report F1 on the original training set, the test set and
# the under-sampled set.
# NOTE(review): X_test is scored inside the search loop, so any hyperparameters
# picked from these numbers are tuned on the test set; a separate validation
# split would keep the final test score unbiased.
grid_records = []  # one dict per combination, so results survive beyond the printed log
for leaves, depths, min_child_sample in prod(num_leaves, max_depths, min_child_samples):
    model = LGBMClassifier(random_state=777, n_jobs=-1, n_estimators=200, objective='binary', is_unbalance=True,
                         num_leaves=leaves, max_depth=depths, min_child_samples=min_child_sample)
    model.fit(X_resampled, y_resampled)

    y_train_pred_lgbm = model.predict(X_train)
    y_under_pred_lgbm = model.predict(X_under)
    y_test_pred_lgbm = model.predict(X_test)

    # Compute each score once and reuse it for both the log line and the record.
    f1_train = f1_score(y_train, y_train_pred_lgbm)
    f1_test = f1_score(y_test, y_test_pred_lgbm)
    f1_under = f1_score(y_under, y_under_pred_lgbm)

    print(f'num_leaves : {leaves}, max_depth : {depths}, min_child_samples : {min_child_sample}')
    print(f'f1 score with train set: {f1_train}')
    print(f'f1 score with test set: {f1_test}')
    print(f'f1 score with under: {f1_under}')

    grid_records.append({
        'num_leaves': leaves,
        'max_depth': depths,
        'min_child_samples': min_child_sample,
        'f1_train': f1_train,
        'f1_test': f1_test,
        'f1_under': f1_under,
    })

# Tabular view of the whole search (built once from the records, not grown row by row).
grid_results = pd.DataFrame(grid_records)

num_leaves : 70, max_depth : 10, min_child_samples : 100
f1 score with train set: 0.4281930431039405
f1 score with test set: 0.42310255157307713
f1 score with under: 0.6190302819912537
num_leaves : 70, max_depth : 10, min_child_samples : 300
f1 score with train set: 0.42821121720829064
f1 score with test set: 0.4235359460765995
f1 score with under: 0.6187189193201217
num_leaves : 70, max_depth : 10, min_child_samples : 500
f1 score with train set: 0.4294126537625699
f1 score with test set: 0.4242635881845445
f1 score with under: 0.6199855511841885
num_leaves : 70, max_depth : 15, min_child_samples : 100
f1 score with train set: 0.43072676928039955
f1 score with test set: 0.4259419360236495
f1 score with under: 0.6172350693961411
num_leaves : 70, max_depth : 15, min_child_samples : 300
f1 score with train set: 0.4313642165305258
f1 score with test set: 0.42645316633525654
f1 score with under: 0.6175455583427409
num_leaves : 70, max_depth : 15, min_child_samples : 500
f1 score with train