In [133]:
import xgboost as xgb
from catboost import CatBoostRegressor

import numpy as np
import pandas as pd

from sklearn.model_selection import KFold , train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

import optuna

In [3]:
!ls dataset/

test.csv  train.csv


# Data

In [4]:
DATA_PATH = 'dataset/'
train = pd.read_csv(DATA_PATH + 'train.csv')

In [6]:
TARGET = train.columns[-1]

In [7]:
X = train.drop([TARGET,'UNC_YEAR', 'COUNTRYNM','HSCD'],axis=1)
X = X.fillna(0)
Y = train[TARGET]

In [49]:
cat_columns = [c for c, t in zip(X.dtypes.index, X.dtypes) if t == 'O'] 
num_columns = [c for c in X.dtypes.index if c not in cat_columns]

## Categorical

In [17]:
# 각 나라별 수입액에 대해 평균치 encoding

income = train.groupby(cat_columns)[TARGET].mean()
income

X['COUNTRYNM'] = X['COUNTRYNM'].map(income)


In [33]:
# # 각 나라 품목별 수입액에 대한 평균치 encoding

# code = train.groupby('HSCD')[TARGET].mean()
# code 


# X['HSCD'] = X['HSCD'].map(code)

In [20]:
nation_code = train.groupby('COUNTRYCD')[TARGET].mean()
nation_code

X['COUNTRYCD'] = X['COUNTRYCD'].map(nation_code)

## Numerical

In [51]:
scaler = StandardScaler()
X[num_columns] = scaler.fit_transform(X[num_columns])

# XGBOOST

In [74]:
def objective(trial,data=None,target=None):
    
    train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size=0.15,random_state=42)
    param = {
        'tree_method':'gpu_hist',  # this parameter means using the GPU when training our model to speedup the training process
        'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.009,0.01,0.012,0.014,0.016,0.018, 0.02]),
        'n_estimators': 4000,
        'max_depth': trial.suggest_categorical('max_depth', [5,7,9,11,13,15,17,20]),
        'random_state': trial.suggest_categorical('random_state', [24, 48,2020]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
#         'n_gpus' : 2,
    }
    model = xgb.XGBRegressor(**param)  
    
    model.fit(train_x,train_y,eval_set=[(test_x,test_y)],early_stopping_rounds=100,verbose=False)
    
    preds = model.predict(test_x)
    
    rmse = mean_squared_error(test_y, preds,squared=False)
    
    return rmse

In [75]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[32m[I 2021-06-25 11:42:03,851][0m A new study created in memory with name: no-name-20301ffd-aa3f-410c-9d43-1b9101fa12a2[0m
[32m[I 2021-06-25 11:42:04,372][0m Trial 0 finished with value: 119305657.02687868 and parameters: {'lambda': 3.641197886360409, 'alpha': 0.004117403278852017, 'colsample_bytree': 0.7, 'subsample': 1.0, 'learning_rate': 0.016, 'max_depth': 9, 'random_state': 24, 'min_child_weight': 99}. Best is trial 0 with value: 119305657.02687868.[0m
[32m[I 2021-06-25 11:42:12,782][0m Trial 1 finished with value: 115944132.90456645 and parameters: {'lambda': 0.08480244006735226, 'alpha': 2.1158010001106073, 'colsample_bytree': 0.9, 'subsample': 0.7, 'learning_rate': 0.01, 'max_depth': 20, 'random_state': 2020, 'min_child_weight': 71}. Best is trial 1 with value: 115944132.90456645.[0m
[32m[I 2021-06-25 11:42:13,734][0m Trial 2 finished with value: 129267288.27424446 and parameters: {'lambda': 0.12743625997453095, 'alpha': 9.81655900369425, 'colsample_bytree': 0.8, 's

[32m[I 2021-06-25 11:43:39,906][0m Trial 24 finished with value: 109088384.51560527 and parameters: {'lambda': 0.7075206157977179, 'alpha': 0.0014919433803230314, 'colsample_bytree': 0.6, 'subsample': 0.8, 'learning_rate': 0.014, 'max_depth': 13, 'random_state': 2020, 'min_child_weight': 32}. Best is trial 15 with value: 97951392.95413533.[0m
[32m[I 2021-06-25 11:43:45,232][0m Trial 25 finished with value: 109593679.70839928 and parameters: {'lambda': 0.0027029206076729356, 'alpha': 0.03768437149750119, 'colsample_bytree': 0.3, 'subsample': 0.8, 'learning_rate': 0.009, 'max_depth': 17, 'random_state': 2020, 'min_child_weight': 59}. Best is trial 15 with value: 97951392.95413533.[0m
[32m[I 2021-06-25 11:43:47,038][0m Trial 26 finished with value: 123111786.15775257 and parameters: {'lambda': 0.022373265119381865, 'alpha': 0.17765617927815738, 'colsample_bytree': 0.6, 'subsample': 0.6, 'learning_rate': 0.02, 'max_depth': 13, 'random_state': 2020, 'min_child_weight': 225}. Best is

[32m[I 2021-06-25 11:45:09,519][0m Trial 48 finished with value: 96623672.34039877 and parameters: {'lambda': 1.3262164104206684, 'alpha': 4.312575043315489, 'colsample_bytree': 0.6, 'subsample': 0.8, 'learning_rate': 0.01, 'max_depth': 17, 'random_state': 2020, 'min_child_weight': 1}. Best is trial 48 with value: 96623672.34039877.[0m
[32m[I 2021-06-25 11:45:12,729][0m Trial 49 finished with value: 110928080.79561675 and parameters: {'lambda': 8.751010583463508, 'alpha': 5.0845128976996214, 'colsample_bytree': 0.6, 'subsample': 0.8, 'learning_rate': 0.01, 'max_depth': 17, 'random_state': 48, 'min_child_weight': 61}. Best is trial 48 with value: 96623672.34039877.[0m


Number of finished trials: 50
Best trial: {'lambda': 1.3262164104206684, 'alpha': 4.312575043315489, 'colsample_bytree': 0.6, 'subsample': 0.8, 'learning_rate': 0.01, 'max_depth': 17, 'random_state': 2020, 'min_child_weight': 1}


In [76]:
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_alpha,params_colsample_bytree,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_random_state,params_subsample,state
0,0,119305700.0,2021-06-25 11:42:03.853228,2021-06-25 11:42:04.372442,0 days 00:00:00.519214,0.004117,0.7,3.641198,0.016,9,99,24,1.0,COMPLETE
1,1,115944100.0,2021-06-25 11:42:04.373831,2021-06-25 11:42:12.781960,0 days 00:00:08.408129,2.115801,0.9,0.084802,0.01,20,71,2020,0.7,COMPLETE
2,2,129267300.0,2021-06-25 11:42:12.783427,2021-06-25 11:42:13.733752,0 days 00:00:00.950325,9.816559,0.8,0.127436,0.008,7,247,24,0.4,COMPLETE
3,3,111904600.0,2021-06-25 11:42:13.735152,2021-06-25 11:42:16.853933,0 days 00:00:03.118781,5.678236,0.7,0.015077,0.008,17,51,2020,0.8,COMPLETE
4,4,125449600.0,2021-06-25 11:42:16.855440,2021-06-25 11:42:17.328701,0 days 00:00:00.473261,0.003799,1.0,1.151615,0.016,5,115,2020,0.4,COMPLETE
5,5,126871800.0,2021-06-25 11:42:17.330283,2021-06-25 11:42:18.213824,0 days 00:00:00.883541,0.081927,0.9,0.080033,0.02,7,284,24,0.4,COMPLETE
6,6,126810000.0,2021-06-25 11:42:18.215099,2021-06-25 11:42:19.324782,0 days 00:00:01.109683,0.30913,0.5,0.042861,0.008,13,120,48,0.5,COMPLETE
7,7,128418300.0,2021-06-25 11:42:19.326363,2021-06-25 11:42:20.904084,0 days 00:00:01.577721,0.025966,0.9,0.031998,0.012,15,270,24,0.4,COMPLETE
8,8,126021800.0,2021-06-25 11:42:20.905482,2021-06-25 11:42:21.762200,0 days 00:00:00.856718,0.303098,1.0,0.054551,0.016,15,127,48,0.4,COMPLETE
9,9,115392700.0,2021-06-25 11:42:21.763662,2021-06-25 11:42:28.944872,0 days 00:00:07.181210,3.118607,0.8,0.010782,0.016,20,71,2020,0.5,COMPLETE


In [89]:
Best_trial = {'lambda': 1.3262164104206684,
              'alpha': 4.312575043315489,
              'colsample_bytree': 0.6,
              'subsample': 0.8,
              'learning_rate': 0.01,
              'max_depth': 17,
              'random_state': 2020,
              'min_child_weight': 1}

In [109]:
train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size=0.15,random_state=42)

In [115]:
train_x, valid_x, train_y, valid_y = train_test_split(train_x, train_y, test_size=0.3,random_state=42)

In [151]:
model = xgb.XGBRegressor(**Best_trial)

In [152]:
model.fit(train_x, train_y, eval_set=[(valid_x, valid_y)], early_stopping_rounds=100, verbose=False)

XGBRegressor(alpha=4.312575043315489, base_score=0.5, booster='gbtree',
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.6,
             gamma=0, gpu_id=-1, importance_type='gain',
             interaction_constraints='', lambda=1.3262164104206684,
             learning_rate=0.01, max_delta_step=0, max_depth=17,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=2020,
             reg_alpha=4.31257486, reg_lambda=1.32621646, scale_pos_weight=1,
             subsample=0.8, tree_method='exact', validate_parameters=1,
             verbosity=None)

In [153]:
preds += model.predict(test_x)

In [156]:
mean_squared_error(test_y, preds, squared=False)

106856452.86002801

# Catboost

In [157]:
X

Unnamed: 0,COUNTRYCD,TRADE_COUNTRYCD,TRADE_HSCD,TARIFF_AVG,SNDIST,NY_GDP_MKTP_CD,NY_GDP_MKTP_CD_1Y,SP_POP_TOTL,PA_NUS_FCRF,IC_BUS_EASE_DFRN_DB,KMDIST,TRADE_HSCD_COUNTRYCD
0,-0.373230,-0.639000,0.110117,3.082953,-0.983455,-0.429411,-0.426136,-0.285195,-0.257025,-2.330420,0.493935,-0.143080
1,-0.047035,-0.245844,0.110117,-0.374765,2.313494,-0.091880,-0.104121,-0.344215,-0.275157,0.899485,0.093251,0.061400
2,-0.379136,-0.379419,0.110117,-0.374765,-0.775534,-0.357458,-0.353811,-0.399779,-0.275372,0.770036,0.083230,0.068192
3,-0.343393,0.137789,0.110117,-0.374765,-0.943005,-0.332570,-0.329197,-0.390717,-0.275372,0.159608,0.190060,0.187179
4,-0.217925,-0.413317,0.110117,1.699866,1.300063,0.121772,0.076053,0.299981,-0.274845,-1.452490,2.568571,-0.129969
...,...,...,...,...,...,...,...,...,...,...,...,...
21184,-0.222845,-0.154202,0.080663,-0.374765,-0.284450,-0.366670,-0.365639,-0.397354,-0.274765,0.635390,-0.257126,-0.048089
21185,-0.281683,-0.234294,0.080663,-0.374765,-0.886339,-0.228811,-0.208250,-0.145576,-0.274769,-0.127308,0.003967,-0.082175
21186,-0.359221,-0.595272,0.080663,-0.374765,-2.519368,-0.410315,-0.373192,-0.091642,-0.272432,-1.443791,0.134622,-0.144685
21187,-0.257049,0.644183,0.080663,-0.374765,-0.906922,0.297493,0.351933,-0.198463,-0.275244,1.194640,0.227869,0.392100


In [161]:
Y_2 = Y.values.reshape(-1,1)

In [169]:
def objective(trial, data=None, target=None):
    
    train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size=0.15, random_state=42)
    param = {
        'loss_function' : 'RMSE',
        'task_type' : 'CPU',
        'l2_leaf_reg' : trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10.0),
        'max_bin' : trial.suggest_int('max_bin', 200, 400),
        'subsample' : trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'learning_rate' : trial.suggest_uniform('learning_rate', 0.006, 0.018),
        'n_estimators' : 25000,
        'max_depth' : trial.suggest_categorical('max_Depth', [7, 10, 14, 16]),
        'random_state' : trial.suggest_categorical('random_state', [24, 48 , 2021]),
        'min_data_in_leaf' : trial.suggest_int('min_data_in_leaf', 1, 300),
    }
    
    model = CatBoostRegressor(**param)
    
    model.fit(train_x, train_y, eval_set=[(test_x,test_y,)], early_stopping_rounds=200, verbose=False)
    
    preds = model.predict(test_x)
    
    rmse = mean_squared_error(test_y, preds, squared=False)
    
    return rmse

In [170]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[32m[I 2021-06-25 20:01:13,263][0m A new study created in memory with name: no-name-fb45da4b-df00-46e2-9856-eeabdf1532e3[0m
Custom logger is already specified. Specify more than one logger at same time is not thread safe.[32m[I 2021-06-25 20:01:15,684][0m Trial 0 finished with value: 101632747.24536476 and parameters: {'l2_leaf_reg': 0.4135254146361835, 'max_bin': 204, 'bagging_fraction': 0.8238999416502923, 'learning_rate': 0.010850938484176656, 'max_Depth': 10, 'random_state': 24, 'min_data_in_leaf': 160}. Best is trial 0 with value: 101632747.24536476.[0m
[32m[I 2021-06-25 20:02:49,774][0m Trial 1 finished with value: 103881338.5798518 and parameters: {'l2_leaf_reg': 0.1732438165178731, 'max_bin': 314, 'bagging_fraction': 0.8263950176267669, 'learning_rate': 0.01739758713362053, 'max_Depth': 16, 'random_state': 48, 'min_data_in_leaf': 164}. Best is trial 0 with value: 101632747.24536476.[0m
[32m[I 2021-06-25 20:04:24,545][0m Trial 2 finished with value: 103087807.4171187 

[32m[I 2021-06-25 20:11:43,375][0m Trial 24 finished with value: 98709232.04474434 and parameters: {'l2_leaf_reg': 3.880172328273814, 'max_bin': 200, 'bagging_fraction': 0.5344365407698894, 'learning_rate': 0.014963266680975154, 'max_Depth': 7, 'random_state': 24, 'min_data_in_leaf': 124}. Best is trial 14 with value: 93853488.1319475.[0m
[32m[I 2021-06-25 20:11:44,063][0m Trial 25 finished with value: 101261372.0937687 and parameters: {'l2_leaf_reg': 1.1308947012636534, 'max_bin': 224, 'bagging_fraction': 0.5462232678449489, 'learning_rate': 0.013054654516709188, 'max_Depth': 7, 'random_state': 2021, 'min_data_in_leaf': 83}. Best is trial 14 with value: 93853488.1319475.[0m
[32m[I 2021-06-25 20:11:44,762][0m Trial 26 finished with value: 103235109.45188914 and parameters: {'l2_leaf_reg': 0.36180780711796023, 'max_bin': 282, 'bagging_fraction': 0.7148511353162751, 'learning_rate': 0.01622224932431156, 'max_Depth': 7, 'random_state': 24, 'min_data_in_leaf': 34}. Best is trial 14

[32m[I 2021-06-25 20:18:30,626][0m Trial 48 finished with value: 102614169.53490776 and parameters: {'l2_leaf_reg': 0.1558012031604308, 'max_bin': 206, 'bagging_fraction': 0.6296119214327769, 'learning_rate': 0.010955774641268252, 'max_Depth': 7, 'random_state': 48, 'min_data_in_leaf': 64}. Best is trial 14 with value: 93853488.1319475.[0m
[32m[I 2021-06-25 20:19:17,710][0m Trial 49 finished with value: 97966047.147824 and parameters: {'l2_leaf_reg': 4.1418104258484005, 'max_bin': 216, 'bagging_fraction': 0.6729622096254144, 'learning_rate': 0.014582290390335775, 'max_Depth': 14, 'random_state': 48, 'min_data_in_leaf': 16}. Best is trial 14 with value: 93853488.1319475.[0m


Number of finished trials: 50
Best trial: {'l2_leaf_reg': 9.143928615561375, 'max_bin': 203, 'bagging_fraction': 0.4139753414835906, 'learning_rate': 0.013863363556442816, 'max_Depth': 7, 'random_state': 24, 'min_data_in_leaf': 55}


In [178]:
Best_trial_cat = {'l2_leaf_reg': 9.143928615561375,
                  'max_bin': 203,
                  'subsample': 0.4139753414835906,
                  'learning_rate': 0.013863363556442816,
                  'max_depth': 7,
                  'random_state': 24,
                  'min_data_in_leaf': 55}

In [179]:
 model = CatBoostRegressor(**Best_trial_cat)

In [180]:
 model.fit(train_x, train_y, eval_set=[(test_x,test_y,)], early_stopping_rounds=200, verbose=False)
    
preds = model.predict(test_x)
    
rmse = mean_squared_error(test_y, preds, squared=False)

In [181]:
np.mean(rmse)

112949988.86507598

XGBOOST ; 106856452.86002801

CATBOOST ; 112949988.86507598