In [None]:
!pip install catboost -q
!pip install optuna -q

In [None]:
import os
import sys
import pandas as pd
import numpy as np

# Modeling
import sklearn 
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import f1_score
import catboost
from catboost import CatBoostClassifier
import optuna
from optuna.samplers import TPESampler
import joblib

# Seed NumPy's global RNG for reproducibility. The train/validation split, the
# CatBoost model and the Optuna sampler further down each receive 777 explicitly as well.
np.random.seed(777)

In [None]:
# Read the training table, the test table and the sample submission file from ./data
# (loaded in exactly this order).
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/train_data.csv',
        './data/test_data.csv',
        "./data/sample_submission.csv",
    )
)

In [None]:
# Split the dataset: `y` is the 'target' column, `X` is every other training column,
# and `X_test` is an independent copy of the test frame.
y = train['target']
X = train.drop(columns='target')
X_test = test.copy(deep=True)

In [None]:
# Hyperparameter search with Optuna
def objective(trial):
    """Fit one CatBoost classifier with trial-sampled hyperparameters and score it.

    The notebook-level ``X`` / ``y`` are split 80/20 with a fixed seed, the model is
    trained on the 80% part with early stopping monitored on the 20% hold-out, and the
    hold-out F1 score is returned for Optuna to maximise.

    Args:
        trial: The Optuna trial used to sample each hyperparameter.

    Returns:
        ``f1_score(y_valid, predictions)`` computed with scikit-learn's default settings.
    """
    param = {"random_state": 777,
             # suggest_float(..., log=True) is the supported replacement for the
             # deprecated trial.suggest_loguniform(...); the sampled range is unchanged.
             "learning_rate": trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
             "bagging_temperature": trial.suggest_float('bagging_temperature', 0.01, 100.00, log=True),
             "n_estimators": trial.suggest_int("n_estimators", 1000, 10000),
             "max_depth": trial.suggest_int("max_depth", 4, 16),
             "random_strength": trial.suggest_int('random_strength', 0, 100),
             "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.4, 1.0),
             "l2_leaf_reg": trial.suggest_float("l2_leaf_reg",1e-8,3e-5),
             "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
             "max_bin": trial.suggest_int("max_bin", 200, 500),
             # NOTE(review): fit() below also passes early_stopping_rounds, which the
             # CatBoost docs describe as switching the detector type to 'Iter' --
             # confirm whether tuning od_type has any effect here.
             "od_type": trial.suggest_categorical('od_type', ['IncToDec', 'Iter'])}

    # Fixed seed: every trial is scored on the same hold-out rows.
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=777)
    cat = CatBoostClassifier(**param)
    # Only the hold-out split is used as eval_set: re-scoring the training rows adds
    # evaluation cost and is not a meaningful early-stopping signal.
    # Every column except the two *_rn_cnt columns is declared categorical.
    cat.fit(X_train, y_train, eval_set=(X_valid, y_valid),
            early_stopping_rounds=35, verbose=False,
            cat_features=X_train.drop(['contents_rn_cnt', 'person_rn_cnt'], axis=1).columns.tolist())
    cat_pred = cat.predict(X_valid)
    f1 = f1_score(y_valid, cat_pred)

    return f1

# Run a seeded TPE search (10 trials) that maximises the value returned by `objective`,
# then report the best score and the hyperparameters that produced it.
study = optuna.create_study(
    study_name='cat_parameter_opt',
    direction='maximize',
    sampler=TPESampler(seed=777),
)
study.optimize(objective, n_trials=10)

print("Best Score:", study.best_value)
print("Best trial", study.best_trial.params)

[32m[I 2022-01-30 07:26:17,477][0m A new study created in memory with name: cat_parameter_opt[0m
[32m[I 2022-01-30 07:42:20,317][0m Trial 0 finished with value: 0.6856973452681508 and parameters: {'learning_rate': 0.01579865438088323, 'bagging_temperature': 0.16196696296443755, 'n_estimators': 1558, 'max_depth': 9, 'random_strength': 84, 'colsample_bylevel': 0.9561982286499368, 'l2_leaf_reg': 2.1812399639728913e-05, 'min_child_samples': 78, 'max_bin': 281, 'od_type': 'IncToDec'}. Best is trial 0 with value: 0.6856973452681508.[0m
[32m[I 2022-01-30 08:20:08,661][0m Trial 1 finished with value: 0.6666345105681019 and parameters: {'learning_rate': 0.012696199189652336, 'bagging_temperature': 2.2827324567331666, 'n_estimators': 4090, 'max_depth': 16, 'random_strength': 63, 'colsample_bylevel': 0.8090675678685719, 'l2_leaf_reg': 1.6572181843977423e-05, 'min_child_samples': 30, 'max_bin': 312, 'od_type': 'IncToDec'}. Best is trial 0 with value: 0.6856973452681508.[0m
[32m[I 2022-01

Best Score: 0.6938205204275966
Best trial {'learning_rate': 0.04031140044870307, 'bagging_temperature': 47.87071560332222, 'n_estimators': 7554, 'max_depth': 7, 'random_strength': 1, 'colsample_bylevel': 0.8235932881616793, 'l2_leaf_reg': 2.323674135181441e-05, 'min_child_samples': 50, 'max_bin': 441, 'od_type': 'Iter'}


In [None]:
## Save the tuning result: the whole Optuna study object is pickled, not only the best hyperparameters
# Create the target directory first so a missing folder cannot make the dump fail
# after the (long-running) search has already finished.
os.makedirs("./model/model", exist_ok=True)
joblib.dump(study, "./model/model/opt_catboost.pkl")