In [None]:
import sys
import pandas as pd
import numpy as np

from utils import pre_all
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor

In [None]:
# Report the interpreter and core library versions so a run can be reproduced.
version_report = (
    f"파이썬 버전 : {sys.version}",
    f"pandas 버전 : {pd.__version__}",
    f"numpy 버전 : {np.__version__}",
)
print(*version_report, sep="\n")

# 1. vanilla autogluon-ts

In [None]:
# Baseline inputs: keep only the identifier and timestamp, plus supply and price
# for the training file.
# NOTE(review): the paths are absolute and machine-specific; consider a configurable data directory.
baseline_train_columns = ['ID','timestamp','supply(kg)', 'price(원/kg)']
baseline_test_columns = ['ID','timestamp']

train_df = pd.read_csv(
    '/mnt/c/Users/wschu/OneDrive/Documents/data/jeju_specialty/open/train.csv'
)[baseline_train_columns]
test_df = pd.read_csv(
    '/mnt/c/Users/wschu/OneDrive/Documents/data/jeju_specialty/open/test.csv'
)[baseline_test_columns]

In [None]:
# The first six characters of ID are used as the per-series key (item_id).
for frame in (train_df, test_df):
    frame['item_id'] = frame['ID'].str[:6]

# Preview both frames after adding the key.
for frame in (train_df, test_df):
    print(frame.head())

In [None]:
# Wrap the training frame (without the raw ID column) as an AutoGluon time-series frame.
baseline_frame = train_df.drop(columns=['ID'])
data = TimeSeriesDataFrame(baseline_frame)

# Forecast 28 steps of the price column, scored with RMSE.
predictor = TimeSeriesPredictor(
    eval_metric="RMSE",
    target="price(원/kg)",
    prediction_length=28,
)

# Fix the seed so the fit is repeatable.
predictor.fit(
    data,
    random_seed=42,
)

In [None]:
# Refit the trained models with AutoGluon's refit_full() before forecasting.
# NOTE(review): per the AutoGluon docs this retrains on all available data
# (train + validation) — confirm for the installed version.
predictor.refit_full()

In [None]:
# Forecast the next horizon for every series in `data`, with the seed fixed.
pred = predictor.predict(
    data,
    random_seed=42,
)
pred

In [None]:
# Build the submission file from the mean forecast.
submission = pd.read_csv('./sample_submission.csv')

# Assign by position and fail loudly on a row-count mismatch: the previous
# index-aligned assignment silently left NaN answers when the lengths differed.
# NOTE(review): this assumes the forecast rows are in the same order as the rows
# of the sample submission — verify against its ID column.
mean_forecast = pred.reset_index()['mean']
if len(mean_forecast) != len(submission):
    raise ValueError(
        f"forecast has {len(mean_forecast)} rows but the submission template has {len(submission)}"
    )
submission['answer'] = mean_forecast.to_numpy()

# Clamp negative forecasts to zero before writing.
submission.loc[submission['answer'] < 0.0, 'answer'] = 0.0
submission.to_csv('./dacon_submission.csv', index=False)
submission

# 2. + static features + covariates + custom validation set + custom predictor model

In [None]:
# Reload the raw files with every column for the feature-rich experiment.
# NOTE(review): the paths are absolute and machine-specific; consider a configurable data directory.
train_df = pd.read_csv(
    '/mnt/c/Users/wschu/OneDrive/Documents/data/jeju_specialty/open/train.csv'
)
test_df = pd.read_csv(
    '/mnt/c/Users/wschu/OneDrive/Documents/data/jeju_specialty/open/test.csv'
)

In [None]:
# The first six characters of ID are used as the per-series key (item_id).
for frame in (train_df, test_df):
    frame['item_id'] = frame['ID'].str[:6]

# Preview both frames after adding the key.
for frame in (train_df, test_df):
    print(frame.head())

In [None]:
# Generate covariates for both frames.
# NOTE(review): pre_all comes from the local utils module, which is not shown here.
# Later cells presumably rely on it returning frames with a "price" target and the
# calendar/holiday columns listed as known covariates — confirm against utils.pre_all.
train, test = pre_all(train_df, test_df)

In [None]:
# Collect the per-series descriptive columns into their own de-duplicated frame,
# to be passed to AutoGluon as static features.
static_columns = ['item_id', 'item', 'corporation', 'location']
static_features_df = train[static_columns].drop_duplicates()
print(static_features_df.head())

In [None]:
# The static attributes were copied into a separate static_features_df, so remove
# them (and the raw ID) from the time-varying frames.
# Rebinding with errors='ignore' instead of inplace=True keeps this cell safe to
# re-run: the in-place version raised KeyError on a second execution because the
# columns were already gone.
train = train.drop(columns=['ID', 'item', 'corporation', 'location'], errors='ignore')
test = test.drop(columns=['ID'], errors='ignore')

In [None]:
# Preview the frames that will be handed to AutoGluon.
for model_frame in (train, test):
    print(model_frame.head())

In [None]:
# Convert the long-format training frame into a TimeSeriesDataFrame keyed by
# (item_id, timestamp), attaching the per-series static features.
train_data = TimeSeriesDataFrame.from_data_frame(
    train,
    static_features_df=static_features_df,
    timestamp_column="timestamp",
    id_column="item_id",
)

# Preview the time-varying part, then the static part.
for preview in (train_data.head(), train_data.static_features.head()):
    print(preview)

In [None]:
# Build the tuning (validation) frame passed to predictor.fit().
#
# The four yearly slices below are disjoint and contiguous, so concatenating them
# selects exactly the rows with timestamp <= 2022-03-31. pd.concat, however, stacks
# the slices year by year, which breaks the (item_id, timestamp) ordering of the
# frame and may not carry over the static-features metadata. A single boolean mask
# on train_data selects the same rows while keeping its ordering.
#
# NOTE(review): per the AutoGluon docs only the last `prediction_length` steps of
# each series in tuning_data are scored, i.e. 2022-03-04..2022-03-31 — not March of
# all four years. Those dates are also contained in train_data. If multi-window
# validation is intended, consider `num_val_windows` in fit() instead.
timestamps = train_data.index.get_level_values('timestamp')

march_2019_threshold = pd.to_datetime('2019-03-31')
march_2020_threshold = pd.to_datetime('2020-03-31')
march_2021_threshold = pd.to_datetime('2021-03-31')
march_2022_threshold = pd.to_datetime('2022-03-31')

# Yearly slices, kept under their original names for inspection.
val_set_1 = train_data[timestamps <= march_2019_threshold]
val_set_2 = train_data[(timestamps > march_2019_threshold) & (timestamps <= march_2020_threshold)]
val_set_3 = train_data[(timestamps > march_2020_threshold) & (timestamps <= march_2021_threshold)]
val_set_4 = train_data[(timestamps > march_2021_threshold) & (timestamps <= march_2022_threshold)]

# Same rows as the union of the four slices, still ordered by (item_id, timestamp).
my_validation_dataset = train_data[timestamps <= march_2022_threshold]
my_validation_dataset


In [None]:
# Covariates whose future values are known at prediction time
# (dates, holidays and features derived from them).
known_covariates = ["year", "month", "day", "week_day", "year_month", "week", "week_num", "holiday"]

# Models to train: DLinear and PatchTST are the models of interest; DeepAR and
# Theta serve as baselines.
# Specialty prices were judged to have month-level seasonality, hence the 365-day
# look-back window for PatchTST; the Transformer encoder depth is raised from 2 to 6.
# NOTE(review): per the AutoGluon docs each dict inside a list is a separate model
# configuration, so context_length=365 and num_encoder_layers=6 go to two different
# PatchTST variants (likewise the two Theta entries) — confirm this is intended.
model_hyperparameters = {
    "DLinear": {},
    "PatchTST": [
        {"context_length": 365},  # look-back window length (author notes default 96)
        {"num_encoder_layers": 6},  # author notes default 2
    ],
    "DeepAR": {},
    "Theta": [
        {"decomposition_type": "additive"},
        {"seasonal_period": 1},
    ],
}

# Configure the target and the known covariates; per the original author's note, the
# supply and x_prev_price columns are then interpreted automatically as past covariates.
predictor = TimeSeriesPredictor(
    target="price",
    prediction_length=28,
    eval_metric="RMSE",
    known_covariates_names=known_covariates,
)

predictor.fit(
    train_data,
    tuning_data=my_validation_dataset,
    hyperparameters=model_hyperparameters,
    random_seed=42,
)

In [None]:
predictor.refit_full()

In [None]:
from autogluon.timeseries.utils.forecast import get_forecast_horizon_index_ts_dataframe

# `test_data` was referenced here without ever being defined, so the cell raised
# NameError on a fresh kernel. Build it from the preprocessed `test` frame with the
# same (item_id, timestamp) keys used for train_data, so the assignments below
# align on that index.
test_data = TimeSeriesDataFrame.from_data_frame(
    test,
    id_column="item_id",
    timestamp_column="timestamp",
)

# Index of the 28 steps that follow the end of each training series.
future_index = get_forecast_horizon_index_ts_dataframe(train_data, prediction_length=28)
future_timestamps = future_index.get_level_values("timestamp")

# Fill one column per known covariate; pandas aligns test_data onto future_index.
# NOTE(review): any (item_id, timestamp) pair missing from `test` ends up as NaN —
# check the preview below.
known_covariates_pred_df = pd.DataFrame(index=future_index)
for kc in known_covariates:
    known_covariates_pred_df[kc] = test_data[kc]
known_covariates_pred_df.head()

In [None]:
# Models trained with known covariates need the future covariate values
# supplied again at prediction time.
pred = predictor.predict(
    train_data,
    random_seed=42,
    known_covariates=known_covariates_pred_df,
)
pred

In [None]:
# Show the leaderboard of the fitted models, passing train_data as the evaluation data.
# NOTE(review): the models were trained on this same data, so the reported scores
# are presumably optimistic — confirm before comparing models on them.
predictor.leaderboard(train_data)

In [None]:
import matplotlib.pyplot as plt

# Plot the last 200 observed prices of one series next to its forecast.
item_id = "RD_F_J"
y_past = train_data.loc[item_id]["price"]
y_pred = pred.loc[item_id]

fig, ax = plt.subplots(figsize=(20, 3))
ax.plot(y_past[-200:], label="Past time series values")
ax.plot(y_pred["mean"], label="Mean forecast")

# Shade the band between the 0.1 and 0.9 forecast quantiles.
ax.fill_between(
    y_pred.index,
    y_pred["0.1"],
    y_pred["0.9"],
    color="red",
    alpha=0.1,
    label=f"10%-90% confidence interval",
)
ax.legend();

In [None]:
# Build the submission file from the mean forecast.
submission = pd.read_csv('/mnt/c/Users/wschu/OneDrive/Documents/data/jeju_specialty/open/sample_submission.csv')

# Assign by position and fail loudly on a row-count mismatch: the previous
# index-aligned assignment silently left NaN answers when the lengths differed.
# NOTE(review): this assumes the forecast rows are in the same order as the rows
# of the sample submission — verify against its ID column.
mean_forecast = pred.reset_index()['mean']
if len(mean_forecast) != len(submission):
    raise ValueError(
        f"forecast has {len(mean_forecast)} rows but the submission template has {len(submission)}"
    )
submission['answer'] = mean_forecast.to_numpy()

# Clamp negative forecasts to zero before writing.
submission.loc[submission['answer'] < 0.0, 'answer'] = 0.0
submission.to_csv('/mnt/c/Users/wschu/OneDrive/Documents/data/jeju_specialty/open/dacon_submission.csv', index=False)
submission