## Import & Data Load

In [12]:
# Data handling / utilities
import numpy as np
import pandas as pd
from tqdm import tqdm

# Optuna Libraries
import optuna
from optuna import Trial
from optuna.samplers import TPESampler

# SHAP
import shap
import skimage

# LGBM Regressor
from lightgbm import LGBMRegressor
from lightgbm import early_stopping

# train_test_split
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# Evaluation Score
from sklearn.metrics import mean_absolute_error

  def _pt_shuffle_rec(i, indexes, index_mask, partition_tree, M, pos):
  def delta_minimization_order(all_masks, max_swap_size=100, num_passes=2):
  def _reverse_window(order, start, length):
  def _reverse_window_score_gain(masks, order, start, length):
  def _mask_delta_score(m1, m2):
  def identity(x):
  def _identity_inverse(x):
  def logit(x):
  def _logit_inverse(x):
  def _build_fixed_single_output(averaged_outs, last_outs, outputs, batch_positions, varying_rows, num_varying_rows, link, linearizing_weights):
  def _build_fixed_multi_output(averaged_outs, last_outs, outputs, batch_positions, varying_rows, num_varying_rows, link, linearizing_weights):
  def _init_masks(cluster_matrix, M, indices_row_pos, indptr):
  def _rec_fill_masks(cluster_matrix, indices_row_pos, indptr, indices, M, ind):
  def _single_delta_mask(dind, masked_inputs, last_mask, data, x, noop_code):
  def _delta_masking(masks, x, curr_delta_inds, varying_rows_out,
  def _jit_build_partition_tree(xmin, xmax, ymi

In [2]:
# Load the train and test CSVs in one pass (same files, same order as before)
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../new_open/train_merge_new_fillna.csv',
        '../new_open/test_merge_new_fillna.csv',
    )
)

In [3]:
# List the column names of the training frame (CI_HOUR is the column later used as the target)
train.columns

Index(['ARI_CO', 'ARI_PO', 'SHIP_TYPE_CATEGORY', 'DIST', 'ID', 'BREADTH',
       'BUILT', 'DEADWEIGHT', 'DEPTH', 'DRAUGHT', 'GT', 'LENGTH',
       'SHIPMANAGER', 'FLAG', 'U_WIND', 'V_WIND', 'AIR_TEMPERATURE', 'BN',
       'ATA_LT', 'PORT_SIZE', 'year', 'month', 'day', 'hour', 'minute',
       'weekday', 'COS_ATA_LT', 'SIN_ATA_LT', '종가', 'rounded_hour',
       'month_sin', 'month_cos', 'day_sin', 'day_cos', 'weekday_sin',
       'weekday_cos', 'rounded_hour_sin', 'rounded_hour_cos', 'ship_cluster',
       'CI_HOUR'],
      dtype='object')

## Hyperparameter Tuning with Optuna

In [4]:
# Separate the regression target (CI_HOUR) from the remaining feature columns
y = train['CI_HOUR']
X = train.drop(columns=['CI_HOUR'])

In [5]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# TPE sampler (not a purely random one), seeded with 42 for repeatable sampling
sampler = TPESampler(seed=42)

# Objective function evaluated once per Optuna trial
def objective(trial):
    """Fit one LightGBM regressor with trial-sampled hyperparameters and score it.

    Reads the module-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample every tunable hyperparameter.

    Returns
    -------
    float
        Mean absolute error of the fitted model on ``(X_val, y_val)``.
    """
    lgbm_param = {
        # Fixed settings (not tuned, therefore not part of ``trial.params``)
        'objective': 'regression',
        'verbose': -1,
        'metric': 'mae',
        # Tuned settings. ``suggest_float`` replaces the deprecated
        # ``suggest_uniform`` / ``suggest_loguniform`` with identical ranges.
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=1, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        # NOTE(review): the 1e-8 lower bound is very small; the logged trials with
        # learning_rate ~1e-7 scored ~80.8 MAE versus ~52.4 for the best trial.
        # Consider raising the lower bound.
        'learning_rate': trial.suggest_float('learning_rate', 1e-8, 1e-2, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        # NOTE(review): LightGBM documents that bagging (`subsample`) only takes effect
        # when `subsample_freq` > 0; it is left at its default here, so this parameter
        # presumably has no effect -- confirm before relying on it.
        'subsample': trial.suggest_float('subsample', 0.4, 1.0, log=True),
    }

    # Generate model
    model_lgbm = LGBMRegressor(**lgbm_param)
    # Early stopping is passed as a callback: the `early_stopping_rounds` and `verbose`
    # keyword arguments of `fit` were removed in LightGBM 4.0.
    model_lgbm = model_lgbm.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[early_stopping(stopping_rounds=25, verbose=False)],
    )

    # This is the evaluation metric.
    # Swap in whichever metric you want to optimize.
    MAE = mean_absolute_error(y_val, model_lgbm.predict(X_val))
    return MAE

# Create the study; lower objective values (validation MAE) are better
optuna_lgbm = optuna.create_study(
    sampler=sampler,
    direction='minimize',
)

# n_trials sets how many trials Optuna runs while searching for hyperparameters.
# Setting it to 50 also yields meaningful values.
optuna_lgbm.optimize(
    objective,
    n_trials=100,
)

[32m[I 2023-10-28 15:14:14,150][0m A new study created in memory with name: no-name-7aec28fe-1316-4d39-bdca-3a1aa877c03b[0m
[32m[I 2023-10-28 15:14:16,063][0m Trial 0 finished with value: 80.81463777796647 and parameters: {'num_leaves': 17, 'colsample_bytree': 0.9852142919229748, 'reg_alpha': 0.7319939418114051, 'reg_lambda': 5.986584841970366, 'max_depth': 5, 'learning_rate': 8.62913219007185e-08, 'n_estimators': 268, 'min_child_samples': 88, 'subsample': 0.6938533737439828}. Best is trial 0 with value: 80.81463777796647.[0m
[32m[I 2023-10-28 15:14:20,057][0m Trial 1 finished with value: 80.81354300571407 and parameters: {'num_leaves': 152, 'colsample_bytree': 0.7061753482887407, 'reg_alpha': 0.9699098521619943, 'reg_lambda': 8.324426408004218, 'max_depth': 5, 'learning_rate': 1.2329623163659816e-07, 'n_estimators': 632, 'min_child_samples': 34, 'subsample': 0.6469661675743767}. Best is trial 1 with value: 80.81354300571407.[0m
[32m[I 2023-10-28 15:14:29,845][0m Trial 2 fin

In [7]:
# Pull the best trial out of the finished study and report its score and sampled parameters
lgbm_trial = optuna_lgbm.best_trial
lgbm_trial_params = lgbm_trial.params
print(
    'Best Trial: score {},\nparams {}'.format(
        lgbm_trial.value,
        lgbm_trial_params,
    )
)

Best Trial: score 52.41091302898551,
params {'num_leaves': 763, 'colsample_bytree': 0.9561646034233345, 'reg_alpha': 0.860129081073355, 'reg_lambda': 1.6180424591586329, 'max_depth': 15, 'learning_rate': 0.006372380680387674, 'n_estimators': 2346, 'min_child_samples': 23, 'subsample': 0.4984430729530094}


## K-Fold Model Fitting & Validation

In [8]:
# Use the entire training frame (no hold-out) for the final fit.
# NOTE(review): this rebinds X_train / y_train, which earlier held the 80% split used by
# `objective`; re-running the tuning cell after this one would train on rows that are
# also in X_val.
y_train = train['CI_HOUR']
X_train = train.drop(columns=['CI_HOUR'])

# No feature reduction happens here; the "*_reduced" names are plain aliases
X_train_reduced, X_test_reduced = X_train, test

In [9]:
# Build the final regressor from the best trial's sampled hyperparameters.
# NOTE(review): the dict only holds values sampled through the trial; the fixed
# 'objective', 'metric' and 'verbose' entries used during tuning are not carried over.
lgbm = LGBMRegressor(**lgbm_trial_params)

In [10]:
# Fit on the full training frame; unlike the tuning runs, no eval_set or early stopping is passed here
lgb_model = lgbm.fit(X_train_reduced, y_train)

In [None]:
explainer = shap.TreeExplainer(lgb_model) # explainer object used to inspect the tree model's SHAP values
# SHAP values are computed on the test features
shap_values = explainer.shap_values(X_test_reduced)

In [None]:
# Bar-style SHAP summary plot. The feature frame must be the one the SHAP values were
# computed on (X_test_reduced); the previously referenced `test_x` is never defined.
shap.summary_plot(shap_values, X_test_reduced, plot_type="bar")

## Submission

In [18]:
# Predict the test set with the model fitted on the full training frame.
# NOTE(review): `final_predictions` was not defined in any visible cell (execution counts
# jump from 10 to 18), so this cell failed on a fresh kernel. If the predictions were meant
# to come from a K-Fold ensemble, replace this line with that computation.
final_predictions = lgb_model.predict(X_test_reduced)

# Fill the sample submission's CI_HOUR column with the predictions
submit = pd.read_csv('../new_open/sample_submission.csv')
submit['CI_HOUR'] = final_predictions

In [19]:
# Force the prediction to 0 wherever the test row's DIST is exactly 0:
# build a 0/1 multiplier (0 for DIST == 0, 1 otherwise) and scale CI_HOUR by it
nonzero_dist = test['DIST'].ne(0).astype(int)
submit['CI_HOUR'] = submit['CI_HOUR'] * nonzero_dist
submit

Unnamed: 0,SAMPLE_ID,CI_HOUR
0,TEST_000000,103.720071
1,TEST_000001,24.068626
2,TEST_000002,42.500400
3,TEST_000003,129.251190
4,TEST_000004,0.000000
...,...,...
220486,TEST_220486,110.435805
220487,TEST_220487,102.844896
220488,TEST_220488,85.997301
220489,TEST_220489,0.000000


In [20]:
# Write the submission file without the DataFrame index column
submit.to_csv('../Sub/lgbm_tune_1.csv', index=False)