## Import & Data Load

In [1]:
# Optuna Libraries
import optuna
from optuna import Trial
from optuna.samplers import TPESampler
import pandas as pd
from tqdm import tqdm
import numpy as np

# LGBM Regressor
from lightgbm import LGBMRegressor

# train_test_split
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# Evaluation Score
from sklearn.metrics import mean_absolute_error

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train = pd.read_csv('../new_open/train_merge_new_fillna.csv')#.drop(columns=['SAMPLE_ID'])
test = pd.read_csv('../new_open/test_merge_new_fillna.csv')#.drop(columns=['SAMPLE_ID'])

In [3]:
train.columns

Index(['ARI_CO', 'ARI_PO', 'SHIP_TYPE_CATEGORY', 'DIST', 'ID', 'BREADTH',
       'BUILT', 'DEADWEIGHT', 'DEPTH', 'DRAUGHT', 'GT', 'LENGTH',
       'SHIPMANAGER', 'FLAG', 'U_WIND', 'V_WIND', 'AIR_TEMPERATURE', 'BN',
       'ATA_LT', 'PORT_SIZE', 'year', 'month', 'day', 'hour', 'minute',
       'weekday', 'COS_ATA_LT', 'SIN_ATA_LT', '종가', 'rounded_hour',
       'month_sin', 'month_cos', 'day_sin', 'day_cos', 'weekday_sin',
       'weekday_cos', 'rounded_hour_sin', 'rounded_hour_cos', 'ship_cluster',
       'CI_HOUR'],
      dtype='object')

# Optuna

In [4]:
X = train.drop(columns='CI_HOUR')[[]]
y = train['CI_HOUR']

In [5]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# random sampler
sampler = TPESampler(seed=42)

# define function
def objective(trial):

    lgbm_param = {
        'objective': 'regression',
        'verbose': -1,
        'metric': 'mae', 
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=1, log=True), 
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.7, 1.0),
        'reg_alpha': trial.suggest_uniform('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_uniform('reg_lambda', 0.0, 10.0),
        'max_depth': trial.suggest_int('max_depth',3, 15),
        'learning_rate': trial.suggest_loguniform("learning_rate", 1e-8, 1e-2),
        'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_loguniform('subsample', 0.4, 1),
    }

    # Generate model
    model_lgbm = LGBMRegressor(**lgbm_param)
    model_lgbm = model_lgbm.fit(X_train, y_train, eval_set=[(X_val, y_val)], 
                           verbose=0, early_stopping_rounds=25)
                           
    # * 평기 지표이다.
    # 원하는 평가 지표에 따라 사용하면 된다.                         
    MAE = mean_absolute_error(y_val, model_lgbm.predict(X_val))
    return MAE

optuna_lgbm = optuna.create_study(direction='minimize', sampler=sampler)

# * n_trials의 경우 optuna를 몇번 실행하여 hyper parameter를 찾을 것인지를 정한다.
# 50으로 설정해도 유의미한 값이 나온다.
optuna_lgbm.optimize(objective, n_trials=100)

[32m[I 2023-10-28 15:13:21,769][0m A new study created in memory with name: no-name-6835b561-964a-4239-a07a-1dc281aaddf2[0m
[32m[I 2023-10-28 15:13:23,687][0m Trial 0 finished with value: 80.81463777796647 and parameters: {'num_leaves': 17, 'colsample_bytree': 0.9852142919229748, 'reg_alpha': 0.7319939418114051, 'reg_lambda': 5.986584841970366, 'max_depth': 5, 'learning_rate': 8.62913219007185e-08, 'n_estimators': 268, 'min_child_samples': 88, 'subsample': 0.6938533737439828}. Best is trial 0 with value: 80.81463777796647.[0m
[32m[I 2023-10-28 15:13:27,713][0m Trial 1 finished with value: 80.81354300571407 and parameters: {'num_leaves': 152, 'colsample_bytree': 0.7061753482887407, 'reg_alpha': 0.9699098521619943, 'reg_lambda': 8.324426408004218, 'max_depth': 5, 'learning_rate': 1.2329623163659816e-07, 'n_estimators': 632, 'min_child_samples': 34, 'subsample': 0.6469661675743767}. Best is trial 1 with value: 80.81354300571407.[0m
[32m[I 2023-10-28 15:13:37,147][0m Trial 2 fin

In [None]:
lgbm_trial = optuna_lgbm.best_trial
lgbm_trial_params = lgbm_trial.params
print('Best Trial: score {},\nparams {}'.format(lgbm_trial.value, lgbm_trial_params))

## K-Fold Model Fitting & Validation

In [None]:
X_train = train.drop(columns='CI_HOUR')
y_train = train['CI_HOUR']
X_train_reduced = X_train
X_test_reduced = test

In [9]:
lgbm = LGBMRegressor(**lgbm_trial_params)

# 5-Fold 설정
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# 각 fold의 모델로부터의 예측을 저장할 리스트와 MAE 점수 리스트
ensemble_predictions = []
scores = []

for train_idx, val_idx in tqdm(kf.split(X_train_reduced), total=5, desc="Processing folds"):
    X_t, X_val = X_train_reduced.iloc[train_idx], X_train_reduced.iloc[val_idx]
    y_t, y_val = y_train[train_idx], y_train[val_idx]
    
    # 두 모델 모두 학습
    lgbm.fit(X_t, y_t)
    
    # 각 모델로부터 Validation set에 대한 예측을 평균내어 앙상블 예측 생성
    val_pred = lgbm.predict(X_val)
    
    # Validation set에 대한 대회 평가 산식 계산 후 저장
    scores.append(mean_absolute_error(y_val, val_pred))
    
    # test 데이터셋에 대한 예측 수행 후 저장
    lgbm_pred = lgbm.predict(X_test_reduced)
    lgbm_pred = np.where(lgbm_pred < 0, 0, lgbm_pred)
    
    ensemble_predictions.append(lgbm_pred)

# K-fold 모든 예측의 평균을 계산하여 fold별 모델들의 앙상블 예측 생성
final_predictions = np.mean(ensemble_predictions, axis=0)

# 각 fold에서의 Validation Metric Score와 전체 평균 Validation Metric Score출력
print("Validation : MAE scores for each fold:", scores)
print("Validation : MAE:", np.mean(scores))

Processing folds: 100%|██████████████████████████████████████████████████████████████████| 5/5 [07:51<00:00, 94.35s/it]

Validation : MAE scores for each fold: [52.646318010406326, 52.81864475556544, 52.5670176323291, 52.44402248562996, 53.27014935901645]
Validation : MAE: 52.749230448589444





## Submission

In [18]:
submit = pd.read_csv('../new_open/sample_submission.csv')
submit['CI_HOUR'] = final_predictions

In [19]:
submit['DIST'] = test['DIST'].apply(lambda x: 0 if x==0 else 1)
submit['CI_HOUR'] = submit['CI_HOUR']*submit['DIST']
submit.drop(['DIST'],axis=1,inplace=True)
submit

Unnamed: 0,SAMPLE_ID,CI_HOUR
0,TEST_000000,103.720071
1,TEST_000001,24.068626
2,TEST_000002,42.500400
3,TEST_000003,129.251190
4,TEST_000004,0.000000
...,...,...
220486,TEST_220486,110.435805
220487,TEST_220487,102.844896
220488,TEST_220488,85.997301
220489,TEST_220489,0.000000


In [20]:
submit.to_csv(f'../Sub/lgbm_tune_1.csv', index=False)