In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd   
import matplotlib.pyplot as plt
import seaborn as sns
import itertools

# 모델링

In [None]:
from sklearn.ensemble import RandomForestClassifier,  GradientBoostingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

from bayes_opt import BayesianOptimization
from sklearn.metrics import log_loss
from lightgbm import LGBMClassifier

In [None]:
# Rows whose `credit` label is present form the training set; rows whose label
# is missing are the ones predictions are produced for further down.
has_label = credit.notnull()
X_train, X_test = data[has_label], data[~has_label]
y_train = credit[has_label].astype(int)

# Quick sanity check of the resulting shapes.
for part in (X_train, X_test, y_train):
    print(part.shape)

### XGBOOST

### LightGBM

In [None]:
# Search range (lower bound, upper bound) for each hyper-parameter explored by
# the Bayesian optimisation below.
bayesian_params = dict(
    max_depth=(1, 5),
    num_leaves=(24, 64),
    min_child_samples=(10, 200),
    min_child_weight=(1, 50),
    subsample=(0.5, 1.0),
    colsample_bytree=(0.5, 1.0),
    max_bin=(10, 500),
    reg_lambda=(0.001, 10),
    reg_alpha=(0.001, 10),
)

In [None]:
def lgb_roc_eval(max_depth, num_leaves, min_child_samples, min_child_weight, subsample,
                colsample_bytree,max_bin, reg_lambda, reg_alpha):
    """Objective for the Bayesian search: negative validation log-loss of a LightGBM model.

    Despite the name, no ROC metric is involved; the function returns
    ``-log_loss`` so that an optimiser which *maximises* its objective ends up
    minimising the log-loss.

    All arguments arrive as floats from the optimiser. Integer-valued
    hyper-parameters are rounded to the nearest integer and the remaining ones
    are clamped to their valid range before the model is built.

    The function reads the module-level names ``train_x``, ``train_y``,
    ``valid_x`` and ``valid_y``.
    NOTE(review): none of these is defined in the visible cells (although
    ``train_test_split`` is imported) -- confirm a train/validation split is
    created before this function is called.

    NOTE(review): LightGBM documents that ``subsample`` (bagging_fraction) only
    takes effect when ``subsample_freq`` (bagging_freq) is positive; none is
    set here, so this search dimension may have no effect -- confirm.

    Returns:
        float: negative multi-class log-loss on the validation split.
    """
    params = {
        'objective': 'multiclass',
        "n_estimators":500, "learning_rate":0.02,
        'max_depth': int(round(max_depth)),  # the optimiser passes floats; integer hyper-parameters are rounded
        'num_leaves': int(round(num_leaves)),
        'min_child_samples': int(round(min_child_samples)),
        'min_child_weight': int(round(min_child_weight)),
        'subsample': max(min(subsample, 1), 0),
        'colsample_bytree': max(min(colsample_bytree, 1), 0),
        'max_bin':  max(int(round(max_bin)),10),
        'reg_lambda': max(reg_lambda,0),
        'reg_alpha': max(reg_alpha, 0)
    }
    lgb_model = LGBMClassifier(**params)
    # Early stopping and periodic logging go through callbacks: the
    # `early_stopping_rounds=` / `verbose=` keywords of fit() were deprecated in
    # LightGBM 3.3 and removed in 4.0, where passing them raises TypeError.
    lgb_model.fit(
        train_x, train_y,
        eval_set=[(train_x, train_y), (valid_x, valid_y)],
        eval_metric='logloss',
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=100),
        ],
    )
    valid_proba = lgb_model.predict_proba(valid_x)
    # Pass the label set explicitly so log_loss lines the probability columns up
    # with the model's classes even if the validation split lacks one of them.
    logLoss = log_loss(valid_y, valid_proba, labels=lgb_model.classes_)

    return -logLoss

In [None]:
# Build the optimiser from the objective function and the per-parameter search
# bounds (fixed seed), then run it: 5 initial points followed by 25 further
# iterations, looking for the inputs that maximise the objective's return value.
lgbBO = BayesianOptimization(f=lgb_roc_eval, pbounds=bayesian_params, random_state=0)
lgbBO.maximize(n_iter=25, init_points=5)

In [None]:
# Gather the objective value ("target") stored for every optimisation step.
target_list = [step['target'] for step in lgbBO.res]
print(target_list)
# Position of the largest target within the optimisation history.
best_position = np.argmax(np.array(target_list))
print('maximum target index:', best_position)

In [None]:
# Fetch the history entry with the largest target and pull out the parameter
# values that produced it.
best_index = np.argmax(np.array(target_list))
max_dict = lgbBO.res[best_index]
max_params = max_dict['params']
for item in (max_dict['target'], max_params):
    print(item)

In [None]:
# Final LightGBM hyper-parameters taken from the best optimisation step.
# Integer-valued parameters are rounded to the nearest integer, which is the
# conversion lgb_roc_eval applies when it scores a candidate. Plain int()
# truncates (4.7 -> 4 rather than 5), so the refit model could end up with a
# different configuration from the one that achieved the best score.
params = {'colsample_bytree': round(max_params['colsample_bytree'], 3),
 'max_bin': int(round(max_params['max_bin'])),
 'max_depth': int(round(max_params['max_depth'])),
 'min_child_samples': int(round(max_params['min_child_samples'])),
 'min_child_weight': int(round(max_params['min_child_weight'])),
 'num_leaves': int(round(max_params['num_leaves'])),
 'reg_alpha': round(max_params['reg_alpha'], 3),
 'reg_lambda': round(max_params['reg_lambda'], 3),
 'subsample': round(max_params['subsample'], 3),}

In [None]:
# NOTE(review): d_train is not read by any visible cell; kept in case another
# cell depends on it.
d_train = lgb.Dataset(X_train, label=y_train)

# Refit on all labelled rows with the same boosting setup the search used
# (multiclass objective, 500 trees, learning rate 0.02). The tuned values were
# scored under that setup; leaving these at the library defaults would pair
# them with a different number of trees and learning rate. Entries in `params`
# take precedence if they repeat a key.
final_params = {
    'objective': 'multiclass',
    'n_estimators': 500,
    'learning_rate': 0.02,
    **params,
}
lgb_model = LGBMClassifier(**final_params)
# No `verbose=` keyword: it only controlled eval_set logging (there is no
# eval_set here) and fit() no longer accepts it in LightGBM >= 4.0.
lgb_model.fit(X_train, y_train)
pred_lgb = lgb_model.predict_proba(X_test)
pred_lgb

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest with default hyper-parameters (fixed seed, n_jobs=-1),
# fitted on the labelled rows; its class-probability predictions for X_test are
# kept in pred_rf.
rf = RandomForestClassifier(n_jobs=-1, random_state=0).fit(X_train, y_train)
pred_rf = rf.predict_proba(X_test)
pred_rf

from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier

# n_estimators: number of trees to build.
#
# max_features: maximum number of features considered.
# A large max_features makes the trees consider the same features, so they
# become similar and fit the data using the most prominent features.
# A small max_features makes the trees differ a lot, and each tree has to grow
# deeper to fit the data.
#
# max_depth: depth of each decision tree in the forest.
# Deeper trees split the data more finely, which usually raises accuracy but
# carries a risk of overfitting.

# Kept under its own name so this grid does not overwrite the LightGBM `params`
# dict used by the LightGBM cells.
rf_param_grid = {
    'n_estimators':[100, 300, 500]
}

# Stratified folds keep the class proportions of the target in every split.
# NOTE(review): the original remark here spoke of a "Revenue" True/False ratio
# of 8.5 : 1.5, which appears to come from a different analysis; the target in
# this notebook is `credit`.
# shuffle=True is needed for random_state to apply; scikit-learn raises
# ValueError when random_state is set while shuffle is False.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Random forest estimator to tune.
rf = RandomForestClassifier(random_state=0, n_jobs=-1)

# Grid search scored by negative log-loss ("neg_log_loss" is the scikit-learn
# scorer name; "logloss" is not a valid scorer). Values closer to 0 are better.
# refit=True: the best configuration is refit on the full training data.
# n_jobs=-1: use every CPU core.
grid_cv = GridSearchCV(rf , param_grid=rf_param_grid , cv=cv, scoring="neg_log_loss", n_jobs=-1, refit = True)

# Fit the search.
grid_cv.fit(X_train , y_train)

print('최적 하이퍼 파라미터:\n', grid_cv.best_params_)
print('최고 neg_log_loss: {0:.4f}'.format(grid_cv.best_score_))

# Predict with the refit best model. Class probabilities (not hard labels) are
# required because pred_rf is averaged with the LightGBM probabilities below.
model = grid_cv.best_estimator_
pred_rf = model.predict_proba(X_test)

# No hold-out evaluation (f1 / accuracy / precision / recall / confusion
# matrix): X_test holds the rows whose credit label is missing, so there is no
# y_test to compare against. The cross-validated score printed above is the
# available performance estimate.

# 제출

In [None]:
# Simple ensemble: equal-weight average of the LightGBM and random-forest
# predictions.
pred = 0.5 * (pred_lgb + pred_rf)

In [None]:
# Load the submission template and fill its three class columns with the
# averaged predictions.
class_columns = ['0', '1', '2']
submission = pd.read_csv('sample_submission.csv')
submission[class_columns] = pd.DataFrame(pred)
submission

In [None]:
# Write the filled-in submission to disk without the DataFrame index column.
submission.to_csv(path_or_buf='submission_3.csv', index=False)