## Import & Data Load

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import lightgbm as lgb
import bisect
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

In [2]:
# Locations of the pre-merged, NaN-filled train / test tables.
TRAIN_CSV = '../new_open/train_merge_new_fillna.csv'
TEST_CSV = '../new_open/test_merge_new_fillna.csv'

# Read both tables with default parsing; no columns are dropped at load time.
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

In [3]:
# Show the column index of the training frame (feature columns plus the CI_HOUR target).
train.columns

Index(['ARI_CO', 'ARI_PO', 'SHIP_TYPE_CATEGORY', 'DIST', 'ID', 'BREADTH',
       'BUILT', 'DEADWEIGHT', 'DEPTH', 'DRAUGHT', 'GT', 'LENGTH',
       'SHIPMANAGER', 'FLAG', 'U_WIND', 'V_WIND', 'AIR_TEMPERATURE', 'BN',
       'ATA_LT', 'PORT_SIZE', 'year', 'month', 'day', 'hour', 'minute',
       'weekday', 'COS_ATA_LT', 'SIN_ATA_LT', '종가', 'rounded_hour',
       'month_sin', 'month_cos', 'day_sin', 'day_cos', 'weekday_sin',
       'weekday_cos', 'rounded_hour_sin', 'rounded_hour_cos', 'ship_cluster',
       'CI_HOUR'],
      dtype='object')

## Feature Selection

| Feature           | Importance | Stddev   | p_value       | n   | p99_high   | p99_low    |
|-------------------|------------|----------|---------------|-----|------------|------------|
| DIST              | 33.718465  | 1.443961 | 4.025977e-07  | 5   | 36.691601  | 30.745328  |
| AIR_TEMPERATURE   | 20.535054  | 2.346330 | 2.010168e-05  | 5   | 25.366181  | 15.703927  |
| PORT_SIZE         | 13.383557  | 0.433339 | 1.317043e-07  | 5   | 14.275809  | 12.491306  |
| month_sin         | 10.448891  | 0.869073 | 5.690231e-06  | 5   | 12.238324  | 8.659458   |
| DEADWEIGHT        | 8.050769   | 0.303832 | 2.429625e-07  | 5   | 8.676363   | 7.425176   |
| SHIP_TYPE_CATEGORY| 5.522400   | 0.605824 | 1.710481e-05  | 5   | 6.769799   | 4.275000   |
| ARI_CO            | 5.450869   | 0.344041 | 1.894331e-06  | 5   | 6.159254   | 4.742484   |
| GT                | 4.916247   | 0.041069 | 5.843499e-10  | 5   | 5.000810   | 4.831685   |
| V_WIND            | 4.194757   | 0.513256 | 2.636758e-05  | 5   | 5.251558   | 3.137956   |
| year              | 3.800577   | 0.532664 | 4.511340e-05  | 5   | 4.897339   | 2.703815   |
| ARI_PO            | 3.758550   | 0.380011 | 1.237053e-05  | 5   | 4.540998   | 2.976102   |
| ID                | 2.767858   | 0.188760 | 2.579624e-06  | 5   | 3.156517   | 2.379199   |
| SHIPMANAGER       | 2.596882   | 0.189932 | 3.409397e-06  | 5   | 2.987955   | 2.205808   |
| 종가               | 2.401468   | 0.198470 | 5.547651e-06  | 5   | 2.810121   | 1.992816   |
| weekday_sin       | 2.209348   | 0.358705 | 8.053096e-05  | 5   | 2.947927   | 1.470769   |
| weekday           | 1.803524   | 0.146227 | 5.140521e-06  | 5   | 2.104608   | 1.502441   |
| LENGTH            | 1.618994   | 0.323677 | 1.819071e-04  | 5   | 2.285450   | 0.952539   |
| BN                | 1.602821   | 0.270544 | 9.381494e-05  | 5   | 2.159875   | 1.045767   |
| weekday_cos       | 1.493677   | 0.222006 | 5.687632e-05  | 5   | 1.950791   | 1.036563   |
| FLAG              | 1.464941   | 0.245221 | 9.079881e-05  | 5   | 1.969855   | 0.960027   |
| BUILT             | 1.335723   | 0.161547 | 2.518170e-05  | 5   | 1.668350   | 1.003096   |
| month_cos         | 1.264094   | 0.193196 | 6.348254e-05  | 5   | 1.661888   | 0.866300   |
| minute            | 0.974526   | 0.103713 | 1.516398e-05  | 5   | 1.188072   | 0.760979   |
| U_WIND            | 0.950229   | 0.082772 | 6.839444e-06  | 5   | 1.120658   | 0.779801   |
| DRAUGHT           | 0.807782   | 0.093130 | 2.083045e-05  | 5   | 0.999537   | 0.616027   |
| BREADTH           | 0.795087   | 0.179288 | 2.902996e-04  | 5   | 1.164244   | 0.425929   |
| DEPTH             | 0.769400   | 0.217530 | 6.913903e-04  | 5   | 1.217297   | 0.321503   |
| ship_cluster      | 0.648050   | 0.188175 | 7.650426e-04  | 5   | 1.035506   | 0.260595   |
| day               | 0.640091   | 0.193719 | 8.946352e-04  | 5   | 1.038962   | 0.241220   |
| COS_ATA_LT        | 0.583585   | 0.070848 | 2.556124e-05  | 5   | 0.729461   | 0.437708   |
| month             | 0.568937   | 0.249898 | 3.513745e-03  | 5   | 1.083481   | 0.054394   |
| day_cos           | 0.503841   | 0.193575 | 2.169824e-03  | 5   | 0.902414   | 0.105267   |
| SIN_ATA_LT        | 0.487606   | 0.109514 | 2.858391e-04  | 5   | 0.713096   | 0.262115   |
| hour              | 0.484163   | 0.132960 | 6.189294e-04  | 5   | 0.757929   | 0.210397   |
| day_sin           | 0.477267   | 0.133768 | 6.689107e-04  | 5   | 0.752696   | 0.201837   |
| rounded_hour_cos  | 0.427113   | 0.218824 | 6.009905e-03  | 5   | 0.877674   | -0.023448  |
| rounded_hour_sin  | 0.370473   | 0.151834 | 2.742366e-03  | 5   | 0.683101   | 0.057846   |
| rounded_hour      | 0.370026   | 0.030097 | 5.206102e-06  | 5   | 0.431996   | 0.308056   |
| ATA_LT            | 0.331854   | 0.185703 | 8.092656e-03  | 5   | 0.714219   | -0.050511  |

In [25]:
# Feature subset used for this experiment.
importance_features = ['AIR_TEMPERATURE', 'V_WIND', 'U_WIND', 'BN']
# Alternative subset (disabled): ['DIST', 'PORT_SIZE', 'month', 'BDI_ADJ', 'DEADWEIGHT']

# Separate the regression target from the feature columns.
target_col = 'CI_HOUR'
y_train = train[target_col]
X_train = train.drop(columns=target_col)

# Restrict train and test to the selected features; a missing column raises KeyError.
# To train on every column instead, use X_train / test directly.
X_train_reduced = X_train.loc[:, importance_features]
X_test_reduced = test.loc[:, importance_features]

## K-Fold Model Fitting & Validation

In [26]:
# One LightGBM regressor object; it is re-fitted on the training part of every fold.
lgbm = lgb.LGBMRegressor(n_estimators=1000)

# 5-fold split, shuffled with a fixed seed so the fold assignment is reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold test-set predictions and per-fold validation MAE scores.
ensemble_predictions = []
scores = []

for train_idx, val_idx in tqdm(kf.split(X_train_reduced), total=kf.get_n_splits(), desc="Processing folds"):
    X_t, X_val = X_train_reduced.iloc[train_idx], X_train_reduced.iloc[val_idx]
    # KFold.split yields positional indices, so the target must be sliced with
    # .iloc as well; plain y_train[idx] is label-based and only works by accident
    # when the Series has a default RangeIndex.
    y_t, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # Fit the model on this fold's training part.
    lgbm.fit(X_t, y_t)

    # Predict the held-out part. Negative predictions are clamped to 0 exactly
    # as for the test predictions below, so the validation MAE measures the
    # same post-processed output that is averaged into the final predictions.
    val_pred = np.maximum(lgbm.predict(X_val), 0)

    # Store this fold's validation MAE.
    scores.append(mean_absolute_error(y_val, val_pred))

    # Predict the test set with this fold's model, clamping negatives to 0.
    lgbm_pred = np.maximum(lgbm.predict(X_test_reduced), 0)

    ensemble_predictions.append(lgbm_pred)

# Average the per-fold test predictions into the final ensemble prediction.
final_predictions = np.mean(ensemble_predictions, axis=0)

# Report the MAE of each fold and the mean MAE across folds.
print("Validation : MAE scores for each fold:", scores)
print("Validation : MAE:", np.mean(scores))

Processing folds: 100%|██████████████████████████████████████████████████████████████████| 5/5 [00:15<00:00,  3.16s/it]

Validation : MAE scores for each fold: [79.29071477870535, 79.20956991459803, 79.24291493685928, 78.50947280697433, 79.69925750019209]
Validation : MAE: 79.19038598746582





## Submission

In [6]:
# Load the submission template and fill its CI_HOUR column with the
# fold-averaged predictions (a length mismatch raises ValueError).
submission_template = pd.read_csv('../new_open/sample_submission.csv')
submit = submission_template.assign(CI_HOUR=final_predictions)

In [7]:
# Force the prediction to 0 for every test row whose DIST is exactly 0 and keep
# it unchanged otherwise (rows with a NaN DIST are kept, since NaN != 0).
# NOTE(review): presumably a zero distance implies no waiting time - confirm.
# The 0/1 mask is built with one vectorized comparison, so no temporary column
# has to be added to `submit` and dropped again.
dist_mask = (test['DIST'] != 0).astype(int)
submit['CI_HOUR'] = submit['CI_HOUR'] * dist_mask
submit

Unnamed: 0,SAMPLE_ID,CI_HOUR
0,TEST_000000,167.892796
1,TEST_000001,75.583427
2,TEST_000002,36.558741
3,TEST_000003,138.545895
4,TEST_000004,0.000000
...,...,...
220486,TEST_220486,125.409036
220487,TEST_220487,139.136735
220488,TEST_220488,68.008151
220489,TEST_220489,0.000000


In [8]:
# Write the submission without the DataFrame index column.
submission_path = '../Sub/lgbm_2.csv'
submit.to_csv(submission_path, index=False)