## Import & Data Load

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import lightgbm as lgb
import bisect
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.preprocessing import MinMaxScaler
from datetime import timedelta

train = pd.read_csv('train.csv').drop(columns=['SAMPLE_ID'])
test = pd.read_csv('test.csv').drop(columns=['SAMPLE_ID'])

## Data Pre-processing

In [5]:
#############################################################
##시간 데이터 처리
# datetime 컬럼 처리
train['ATA'] = pd.to_datetime(train['ATA'])
test['ATA'] = pd.to_datetime(test['ATA'])

# UTC와 local 간 시차 계산
train['hour'] = train['ATA'].dt.hour #ATA 시간 추출
test['hour'] = test['ATA'].dt.hour

train['TIME_DIFFERENCE'] = train['ATA_LT'] - train['hour'] #시차 계산
train['TIME_DIFFERENCE'] = train['TIME_DIFFERENCE'].apply(lambda x: x+24 if x<=-12 else x)

test['TIME_DIFFERENCE'] = test['ATA_LT'] - test['hour'] #시차 계산
test['TIME_DIFFERENCE'] = test['TIME_DIFFERENCE'].apply(lambda x: x+24 if x<=-12 else x)

# local의 datetime 생성
def add_time_difference(row):
    return row['ATA'] + timedelta(hours=row['TIME_DIFFERENCE'])

train['NEW_TIME'] = train.apply(add_time_difference, axis=1)
test['NEW_TIME'] = test.apply(add_time_difference, axis=1)

# datetime을 여러 파생 변수로 변환
for df in [train, test]:
    df['year'] = df['NEW_TIME'].dt.year
    df['month'] = df['NEW_TIME'].dt.month
    df['day'] = df['NEW_TIME'].dt.day
    df['hour'] = df['NEW_TIME'].dt.hour
    df['minute'] = df['NEW_TIME'].dt.minute
    df['weekday'] = df['NEW_TIME'].dt.weekday
#############################################################

#############################################################
## Categorical 컬럼 인코딩
categorical_features = ['ARI_CO', 'ARI_PO', 'SHIP_TYPE_CATEGORY', 'ID', 'SHIPMANAGER', 'FLAG']
encoders = {}

for feature in tqdm(categorical_features, desc="Encoding features"):
    le = LabelEncoder()
    train[feature] = le.fit_transform(train[feature].astype(str))
    le_classes_set = set(le.classes_)
    test[feature] = test[feature].map(lambda s: '-1' if s not in le_classes_set else s)
    le_classes = le.classes_.tolist()
    bisect.insort_left(le_classes, '-1')
    le.classes_ = np.array(le_classes)
    test[feature] = le.transform(test[feature].astype(str))
    encoders[feature] = le
#############################################################

#############################################################
##결측치 처리
#BREADTH, DEPTH 등 선박의 정보가 없는 356484번째 데이터 하나 삭제
train.drop(356484, axis=0, inplace=True)
train.reset_index(drop=True, inplace=True)

#########################################################
##스케일링
minmax_columns = ['GT', 'DEADWEIGHT'] #너어어무 범위 커서 스케일링........
mms = MinMaxScaler(feature_range = (0,400))
mms.fit(train[minmax_columns])
train[minmax_columns] = mms.transform(train[minmax_columns])
test[minmax_columns] = mms.transform(test[minmax_columns])
##########################################################

##피처 생성
#선박 평균 수명은 25~30년 이라고 함.
train['BUILT_old'] = np.where(train['BUILT'] > 25, 1, 0) #노선: 25년 초과면 1 아니면 0
test['BUILT_old'] = np.where(test['BUILT'] > 25, 1, 0)

#DIST 0인거 있음 (0 유무) => binary
train['DIST_CATE'] = (train['DIST'] > 0).astype(int)
test['DIST_CATE'] = (test['DIST'] > 0).astype(int)

# 1000개 넘는 선박 소유주만 select => binary
train['SHIPMANAGER_RICH'] = np.where(train['SHIPMANAGER'] > 1000, 1, 0)
test['SHIPMANAGER_RICH'] = np.where(test['SHIPMANAGER'] > 1000, 1, 0)

#
train['bn_cate'] = np.where(train['BN'] < 3, 0, np.where(train['BN'] <= 6, 1, 2))
test['bn_cate'] = np.where(test['BN'] < 3, 0, np.where(test['BN'] <= 6, 1, 2))

#폭염(섭씨 35도 이상)
train['hot'] = np.where(train['AIR_TEMPERATURE'] >= 35, 1, 0)
test['hot'] = np.where(test['AIR_TEMPERATURE'] >= 35, 1, 0)

#한파(섭씨 -15도 이하)
train['cold'] = np.where(train['AIR_TEMPERATURE'] <= -15, 1, 0)
test['cold'] = np.where(test['AIR_TEMPERATURE'] <= -15, 1, 0)

#주말 여부
train['weekend'] = np.where(train['weekday'] >= 5, 1, 0)
test['weekend'] = np.where(test['weekday'] >= 5, 1, 0)


Encoding features: 100%|██████████| 6/6 [00:01<00:00,  5.29it/s]


In [6]:
# TIME_DIFFERENCe, NEW_TIME 피처 생성된 것 확인
train.loc[:,['ATA', 'TIME_DIFFERENCE', 'NEW_TIME']]

Unnamed: 0,ATA,TIME_DIFFERENCE,NEW_TIME
0,2018-12-17 21:29:00,8,2018-12-18 05:29:00
1,2014-09-23 06:59:00,6,2014-09-23 12:59:00
2,2015-02-03 22:00:00,8,2015-02-04 06:00:00
3,2020-01-17 04:02:00,9,2020-01-17 13:02:00
4,2020-01-26 07:51:00,8,2020-01-26 15:51:00
...,...,...,...
391933,2017-06-06 05:02:00,9,2017-06-06 14:02:00
391934,2019-10-16 00:36:00,8,2019-10-16 08:36:00
391935,2021-03-23 22:35:00,-4,2021-03-23 18:35:00
391936,2015-01-08 07:15:00,8,2015-01-08 15:15:00


In [7]:
print(train[train['weekday'] == 0]['CI_HOUR'].mean())
print(train[train['weekday'] == 1]['CI_HOUR'].mean())
print(train[train['weekday'] == 2]['CI_HOUR'].mean())
print(train[train['weekday'] == 3]['CI_HOUR'].mean())
print(train[train['weekday'] == 4]['CI_HOUR'].mean())
print(train[train['weekday'] == 5]['CI_HOUR'].mean())
print(train[train['weekday'] == 6]['CI_HOUR'].mean())

57.51039407518398
48.83009308984127
49.895614996281715
50.7009322251558
55.44520602841297
117.59785645048761
120.81260596205672


In [8]:
#쓸모 없는거 drop
train.drop(columns=['ATA', 'NEW_TIME', 'ATA_LT', 'ID', 'DEPTH', 'DRAUGHT', 'minute', 'U_WIND', 'V_WIND', 'AIR_TEMPERATURE', 'BN', 'BREADTH', 'BUILT', 'SHIPMANAGER', 'FLAG'], axis=1, inplace=True)
test.drop(columns=['ATA', 'NEW_TIME', 'ATA_LT', 'ID', 'DEPTH', 'DRAUGHT', 'minute', 'U_WIND', 'V_WIND', 'AIR_TEMPERATURE', 'BN', 'BREADTH', 'BUILT', 'SHIPMANAGER', 'FLAG'], axis=1, inplace=True)

## 모델 선택 및 최적화 with autogluon


In [9]:
#train, test 데이터셋
train = TabularDataset(train)
test = TabularDataset(test)

In [11]:
#학습
predictor = TabularPredictor(label='CI_HOUR', eval_metric='mean_absolute_error').fit(train, presets='medium_quality')

No path specified. Models will be saved in: "AutogluonModels\ag-20240116_103039\"
Presets specified: ['medium_quality']
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20240116_103039\"
AutoGluon Version:  0.8.2
Python Version:     3.8.16
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22631
Disk Space Avail:   299.25 GB / 510.55 GB (58.6%)
Train Data Rows:    391938
Train Data Columns: 21
Label Column: CI_HOUR
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (2159.130556, 0.0, 61.94099, 170.80975)
	If 'regression' is not the correct problem_type, please manually specify 

[1000]	valid_set's l1: 55.746
[2000]	valid_set's l1: 55.2087
[3000]	valid_set's l1: 55.0028
[4000]	valid_set's l1: 54.9573


	-54.9228	 = Validation score   (-mean_absolute_error)
	12.68s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's l1: 54.9737


	-54.9095	 = Validation score   (-mean_absolute_error)
	4.06s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: RandomForestMSE ...
	-55.8354	 = Validation score   (-mean_absolute_error)
	67.26s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: CatBoost ...
	-55.7484	 = Validation score   (-mean_absolute_error)
	150.75s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	-55.4189	 = Validation score   (-mean_absolute_error)
	24.73s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
		Exception occured in `AgSaveModelCallback` when calling event `after_epoch`:
	module 'torch' has no attribute '_utils'
Detailed Traceback:
Traceback (most recent call last):
  File "c:\Users\yebin\anaconda3\envs\ml\lib\site-packages\autogluon\core\trainer\abstract_trainer.py", line 1733, in _train_and_save
    model = self._train_single(X, y, model, X_val, y_val, total_resources=total_resources, **model_fit_

[1000]	valid_set's l1: 53.8705


	-53.7546	 = Validation score   (-mean_absolute_error)
	8.33s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	-49.8272	 = Validation score   (-mean_absolute_error)
	0.16s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 290.98s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels\ag-20240116_103039\")


In [12]:
# 모델 리더보드 출력
predictor.leaderboard(train, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,KNeighborsDist,-0.588998,-58.890378,10.368644,0.114366,1.426626,10.368644,0.114366,1.426626,1,True,2
1,RandomForestMSE,-29.484355,-55.835415,1.765887,0.062011,67.257278,1.765887,0.062011,67.257278,1,True,5
2,ExtraTreesMSE,-33.239833,-55.418901,1.612748,0.047514,24.733392,1.612748,0.047514,24.733392,1,True,7
3,LightGBMLarge,-42.99124,-53.754648,2.903282,0.034509,8.325534,2.903282,0.034509,8.325534,1,True,9
4,WeightedEnsemble_L2,-45.292594,-49.827193,21.350579,0.2834,90.62456,0.035505,0.0,0.157093,2,True,10
5,KNeighborsUnif,-48.213185,-59.130725,10.86124,0.122989,4.832691,10.86124,0.122989,4.832691,1,True,1
6,LightGBMXT,-50.495,-54.922798,5.95523,0.063512,12.681751,5.95523,0.063512,12.681751,1,True,3
7,XGBoost,-50.753731,-51.50094,0.322032,0.009001,0.776278,0.322032,0.009001,0.776278,1,True,8
8,LightGBM,-50.798143,-54.909507,1.59803,0.01769,4.058838,1.59803,0.01769,4.058838,1,True,4
9,CatBoost,-51.66869,-55.748404,0.230105,0.013998,150.746153,0.230105,0.013998,150.746153,1,True,6


In [74]:
y_pred = predictor.predict(test)
y_pred = pd.DataFrame(y_pred, columns=['CI_HOUR'])
y_pred['CI_HOUR'] = y_pred['CI_HOUR'].apply(lambda x: max(0, x))

## Submission

In [75]:
submit = pd.read_csv('./sample_submission.csv')
submit['CI_HOUR'] = y_pred['CI_HOUR']
submit.to_csv('./change_time_autogluon.csv', index=False)