In [31]:
import pandas as pd
import numpy as np

import optuna
from optuna.samplers import TPESampler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, auc
from tqdm import tnrange, tqdm_notebook

import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
import torch.nn.functional as F

import catboost
from catboost import CatBoostClassifier
from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [32]:
# Load the competition files. The train and test CSVs use their first column
# as the row index; the sample submission is read with the default index.
train_df, test_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('data/train.csv', 'data/test.csv')
)
sample_submission = pd.read_csv('data/sample_submission.csv')

In [33]:
def fill_na_mode(data):
    """Fill missing values in place and return the same DataFrame.

    Every column except ``TIPI10`` is filled with its own most frequent
    value (the first entry of ``Series.mode()``). ``TIPI10`` is filled with
    the constant 1 instead.

    Args:
        data: DataFrame to clean. It is modified in place.

    Returns:
        The same ``data`` object, with missing values filled.
    """
    for col in list(data.columns):
        if col == 'TIPI10':
            continue
        modes = data[col].mode()
        # mode() is empty when the column contains only NaN; there is nothing
        # to impute from, so leave the column as is instead of raising.
        if modes.empty:
            continue
        # Assign the filled column back instead of calling
        # fillna(inplace=True) on data[col]: the chained in-place form is
        # deprecated and does not update `data` under pandas Copy-on-Write.
        data[col] = data[col].fillna(modes.iloc[0])

    # TIPI10 uses a fixed fill value; tolerate frames that lack the column.
    if 'TIPI10' in data.columns:
        data['TIPI10'] = data['TIPI10'].fillna(1)

    return data

In [34]:
# Transform the Q (questionnaire) answers
def q_data(data):
    """Reverse-score selected Q items in place and add their row-wise mean.

    The listed items are replaced by ``6 - value``. Afterwards the mean of
    columns Q1..Q26 (using the reversed values) is stored in ``Mach_score``.

    Args:
        data: DataFrame containing columns ``Q1`` .. ``Q26``; modified in place.

    Returns:
        The same ``data`` object with the extra ``Mach_score`` column.
    """
    # Items that are reverse-scored, followed by the two "secret" ones.
    reverse_scored = ['Q3', 'Q4', 'Q7', 'Q9', 'Q10', 'Q11', 'Q16', 'Q17']
    reverse_scored += ['Q21', 'Q22']
    for item in reverse_scored:
        data[item] = data[item].rsub(6)  # rsub(6) computes 6 - value

    question_cols = [f'Q{number}' for number in range(1, 27)]
    data['Mach_score'] = data[question_cols].mean(axis=1)

    return data

In [35]:
def drop_list(data, columns=('hand',)):
    """Drop unwanted columns in place and return the same DataFrame.

    Args:
        data: DataFrame to modify in place.
        columns: Column names to remove. Defaults to ``('hand',)``, which is
            what this step has always dropped.

    Returns:
        The same ``data`` object without the dropped columns.

    Raises:
        KeyError: If any requested column is missing from ``data``.
    """
    data.drop(columns=list(columns), inplace=True)
    return data

In [36]:
def vcl_sum(data):
    """Add ``vcl_sum``: the row-wise sum of columns ``VCL1`` .. ``VCL16``.

    Args:
        data: DataFrame containing the sixteen VCL columns; modified in place.

    Returns:
        The same ``data`` object with the extra ``vcl_sum`` column.
    """
    vcl_columns = [f'VCL{k}' for k in range(1, 17)]
    data['vcl_sum'] = data.loc[:, vcl_columns].sum(axis=1)

    return data

In [37]:
def ageband(data):
    """Replace ``age`` with a string-coded ``ageBand`` column, in place.

    Bands: ``'1'`` for 12 < age <= 18, ``'2'`` for 18 < age <= 24, ``'3'``
    for 24 < age < 99999. Any other value (including missing ages) keeps the
    default ``'-1'``. The ``age`` column is dropped afterwards.

    Args:
        data: DataFrame with an ``age`` column; modified in place.

    Returns:
        The same ``data`` object.
    """
    age = data['age']
    band_masks = {
        '1': age.gt(12) & age.le(18.0),
        '2': age.gt(18.0) & age.le(24.0),
        '3': age.gt(24.0) & age.lt(99999),
    }

    data['ageBand'] = '-1'
    for label, mask in band_masks.items():
        data.loc[mask, 'ageBand'] = label

    data.drop(columns='age', inplace=True)
    return data

In [38]:
# Change the type of the nominal (categorical) variables
def type_nomial(data):
    """Cast the nominal columns to strings, in place.

    Args:
        data: DataFrame containing the nominal columns listed below;
            modified in place.

    Returns:
        The same ``data`` object.
    """
    nominal_columns = (
        'education', 'urban', 'engnat', 'married',
        'religion', 'orientation', 'ageBand',
    )
    for column in nominal_columns:
        as_text = data[column].astype('str')
        data[column] = as_text

    return data

In [39]:
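# encoding
# TODO: this cell is empty. `processing_data` below calls `encoding_label` and
# `tipi_emotion`, but neither is defined in the visible cells; restore their
# definitions here so that Restart Kernel -> Run All succeeds.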

In [40]:
def processing_data(data, idx = "train"):
    """Run the full preprocessing pipeline and return the feature frame.

    Steps, in order: fill missing values, drop unused columns, cap
    ``familysize`` at 12, transform the Q answers, apply ``tipi_emotion``,
    band the age, sum the VCL columns, apply ``encoding_label``, cast nominal
    columns to strings, and one-hot encode with ``pd.get_dummies``.

    Args:
        data: Raw DataFrame. It is left unmodified.
        idx: Unused; kept so existing calls remain valid.

    Returns:
        A new, preprocessed DataFrame.
    """
    # Work on a copy: fill_na_mode and drop_list modify their argument in
    # place, so without this the caller's raw frame loses columns such as
    # 'hand' and re-running the calling cell fails.
    data = data.copy()

    data = fill_na_mode(data)                  # fill missing values
    data = drop_list(data)                     # drop unused columns
    # Cap family sizes above 12 at 12.
    data.loc[data['familysize'] > 12, 'familysize'] = 12
    data = q_data(data)                        # Q answers
    data = tipi_emotion(data)                  # TIPI; not defined in the visible cells
    data = ageband(data)
    data = vcl_sum(data)
    data = encoding_label(data)                # not defined in the visible cells
    data = type_nomial(data)
    data = pd.get_dummies(data)

    return data

In [41]:
# Run the same preprocessing pipeline over both splits, train first.
# NOTE(review): processing_data one-hot encodes each frame on its own, so the
# two results may not share the same columns - TODO confirm before predicting.
train, test = map(processing_data, (train_df, test_df))

In [42]:
# Separate the target column from the features.
train_Y = train['nerdiness']
train_X = train.drop(columns='nerdiness')

# Renumber the rows 0..n-1 and discard the old index. reset_index(drop=True)
# does this whatever the index is called, whereas reset_index() followed by
# drop('index') raises KeyError when the index has any name other than 'index'.
train_X = train_X.reset_index(drop=True)
train_Y = train_Y.reset_index(drop=True)

In [47]:
# Every fold's AUC from every trial is appended here, across the whole study.
real_score = []


def _suggest_cb_params(trial):
    """Sample one set of CatBoost hyper-parameters from an Optuna trial."""
    # It is recommended to start with wide parameter ranges and then narrow
    # them down little by little.
    return {
        'iterations': trial.suggest_int("iterations", 51, 55),
        'learning_rate': trial.suggest_float('learning_rate', 0.1, 1),
        'depth': trial.suggest_int('depth', 14, 16),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 30),
        'reg_lambda': trial.suggest_float('reg_lambda', 1, 100),
        'subsample': trial.suggest_float('subsample', 0.3, 1),
        'random_strength': trial.suggest_float('random_strength', 10, 100),
        'od_wait': trial.suggest_int('od_wait', 10, 150),
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 15),
        # log=True replaces the deprecated suggest_loguniform.
        'bagging_temperature': trial.suggest_float('bagging_temperature', 1, 100, log=True),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0., 1.0),
        'random_state': 0,
        'verbose': 0,
    }


def cb_optimization(trial):
    """Optuna objective: mean ROC AUC of CatBoost over 10 stratified folds.

    Reads the notebook globals ``train_X`` and ``train_Y`` and appends each
    fold's AUC to the global ``real_score`` list.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        The mean AUC over the 10 folds for this trial.
    """
    # The parameters are the same for every fold, so sample them once here
    # rather than on each loop iteration.
    params = _suggest_cb_params(trial)

    score = []
    kf = StratifiedKFold(n_splits = 10, random_state = 123, shuffle = True)
    # kf.split() yields folds lazily and has no length, so tell tqdm how many
    # folds to expect.
    folds = tqdm_notebook(kf.split(train_X, train_Y), total = kf.get_n_splits(), desc = 'k_fold')
    for train_fold, test_fold in folds:
        # The fold arrays are row positions, so index both frames with iloc.
        X_train, X_test = train_X.iloc[train_fold], train_X.iloc[test_fold]
        y_train, y_test = train_Y.iloc[train_fold], train_Y.iloc[test_fold]

        model = CatBoostClassifier(**params)
        model.fit(X_train, y_train)

        # Score the held-out fold on the second predict_proba column.
        pred_proba = model.predict_proba(X_test)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, pred_proba)
        auc_score = auc(fpr, tpr)
        score.append(auc_score)
        real_score.append(auc_score)

    return np.mean(score)

In [None]:
# TPE sampler constructed with a fixed seed (42).
sampler = TPESampler(seed = 42)

# With 10 runs it did not work (literal translation; presumably refers to n_trials - TODO confirm).

# Create an in-memory study that maximizes the objective's return value.
# NOTE(review): the name `optim` rebinds the alias created by
# `import torch.optim as optim` in the import cell, so torch.optim is no longer
# reachable under that name after this cell runs. Renaming the study would
# require updating any later cell that reads `optim` - TODO confirm and rename.
optim = optuna.create_study(
    study_name="cat_parameter_opt",
    direction="maximize",
    sampler=sampler,
)


# Run 100 trials of cb_optimization, then print the best objective value found.
optim.optimize(cb_optimization, n_trials = 100) 
print("Best auc:", optim.best_value)

[32m[I 2022-08-22 09:36:36,363][0m A new study created in memory with name: cat_parameter_opt[0m


k_fold: 0it [00:00, ?it/s]

Custom logger is already specified. Specify more than one logger at same time is not thread safe.[32m[I 2022-08-22 09:40:28,711][0m Trial 0 finished with value: 0.823513887504846 and parameters: {'iterations': 52, 'learning_rate': 0.9556428757689246, 'depth': 16, 'min_data_in_leaf': 18, 'reg_lambda': 16.445845403801215, 'subsample': 0.40919616423534183, 'random_strength': 15.227525095137953, 'od_wait': 132, 'leaf_estimation_iterations': 10, 'bagging_temperature': 26.070247583707673, 'colsample_bylevel': 0.020584494295802447}. Best is trial 0 with value: 0.823513887504846.[0m


k_fold: 0it [00:00, ?it/s]

[32m[I 2022-08-22 09:46:29,393][0m Trial 1 finished with value: 0.8542704780077532 and parameters: {'iterations': 55, 'learning_rate': 0.8491983767203796, 'depth': 14, 'min_data_in_leaf': 6, 'reg_lambda': 19.15704647548995, 'subsample': 0.5129695700716763, 'random_strength': 57.228078846901404, 'od_wait': 70, 'leaf_estimation_iterations': 5, 'bagging_temperature': 16.73808578875213, 'colsample_bylevel': 0.13949386065204183}. Best is trial 1 with value: 0.8542704780077532.[0m


k_fold: 0it [00:00, ?it/s]

[32m[I 2022-08-22 09:58:03,877][0m Trial 2 finished with value: 0.8552164320878411 and parameters: {'iterations': 52, 'learning_rate': 0.4297256589643226, 'depth': 15, 'min_data_in_leaf': 24, 'reg_lambda': 20.767704433677615, 'subsample': 0.6599641068895281, 'random_strength': 63.31731119758382, 'od_wait': 16, 'leaf_estimation_iterations': 10, 'bagging_temperature': 2.1930485556643684, 'colsample_bylevel': 0.06505159298527952}. Best is trial 2 with value: 0.8552164320878411.[0m


k_fold: 0it [00:00, ?it/s]