In [None]:
import torch
from math import floor
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectKBest, f_regression

from AnalyzeTools.models import autoregressive_integrated_moving_average, linear_regression, support_vector_regression, random_forest, gradient_boosting
from AnalyzeTools.prepare import data_split, model_eval, pathForSavingModels
from AnalyzeTools.preprocess import preprocessData
from AnalyzeTools.superModels import DEEPAR, TFT, RNN

# Base directory handed to the hyper-parameter search helpers as `base_dir`
# (it is placed into ml_searchCV_params further down).
params_path = './Models'

In [2]:
# Read data
# ---------
# Identifiers for this experiment; they are reused later when building the
# search-parameter dict and the model save paths.
raw_file_name = '경략가격집계 - 소,돼지'
product = "pork-hot"
product_attribute = "경락가격"

_output = 'MAX_COST_AMT' # MIN_COST_AMT
default_exclude_cols = ['JUDGE_GUBN', 'JUDGE_BREED', 'JUDGE_SEX', 'SABLE_GUBN', 'ABATT_CODE']

# The CSV path is derived from raw_file_name so the file name is declared once.
# Pipeline: keep JUDGE_KIND == 2 rows, drop the excluded columns, then average
# every remaining column per STD_DATE.
df = (
    pd.read_csv(f'../Data/beef/{raw_file_name}.csv', encoding='euc_kr', engine='python')
    .query("JUDGE_KIND == 2")
    .drop(default_exclude_cols, axis=1)
    .groupby(['STD_DATE'])
    .mean()
    .reset_index()
)

# Turn the "/"-separated STD_DATE strings into "-"-separated ones prefixed with "20"
# (presumably the raw values carry a two-digit year — TODO confirm against the CSV).
df['STD_DATE'] = "20" + df['STD_DATE'].str.replace("/", "-", regex=False)
df = df.rename(columns={'STD_DATE': 'date'})

# preprocessData returns the processed frame and the list of selected input columns.
df, _input = preprocessData(df, 'date', _output)

# Dates from the 80% mark onwards, used as the x axis when plotting predictions.
# The 0.8 here must stay in sync with train_size in data_split and with training_cutoff.
predictions_x_axis = df['date'].iloc[floor(len(df) * 0.8):].values


-->Feature scores:
                 Features        Scores
6  DEFECT_MAX_COST_AMT  6.811574e+10
1         MIN_COST_AMT  1.222592e+04
5  DEFECT_MIN_COST_AMT  1.222443e+04
2         SUM_COST_AMT  4.538033e+02
7  DEFECT_SUM_COST_AMT  4.537454e+02
8    DEFECT_SUM_WEIGHT  2.279391e+01
3           SUM_WEIGHT  2.277102e+01
4           DEFECT_CNT  1.084715e+01
0                  CNT  1.083094e+01

-->TOP K features:
   ['MIN_COST_AMT', 'SUM_COST_AMT', 'DEFECT_MIN_COST_AMT', 'DEFECT_MAX_COST_AMT']

-->Final features:
  ['MIN_COST_AMT', 'SUM_COST_AMT', 'DEFECT_MAX_COST_AMT']


In [3]:
# Prepare the dataset for the statistical and machine-learning models.
ml_split_params = dict(Model='ML', Future=1)
(
    X_train, X_test,
    y_train, y_test,
    input_scaler, output_scaler,
) = data_split(
    df,
    input_cols=_input,
    output=_output,
    train_size=0.8,
    scaling=True,
    **ml_split_params,
)

# Feed the data into the models and evaluate their results.
# Keyword arguments shared by the model helpers that run a parameter search.
ml_searchCV_params = dict(
    base_dir=params_path,
    product=product,
    attribute=product_attribute,
    raw=raw_file_name,
    save=True,
)

# Flags forwarded to model_eval in the cells below.
stdout = True
vis = True

X_train: (2037, 4) y_train: (2037,) X_test: (510, 4) y_test: (510,)


# Statistics

In [4]:
# ARIMA baseline — currently disabled: every statement in this cell is commented out,
# so running the cell does nothing. Uncomment the three lines below to evaluate it.
# print("\nARIMA")
# arima_predictions = autoregressive_integrated_moving_average(y_train, y_test)
# model_eval(y_test, arima_predictions, stdout=stdout, vis=vis)

In [5]:
from sklearn.metrics import mean_absolute_percentage_error as mape
from sklearn.metrics import r2_score as R2
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
# Make "plotly_white" the default template for plotly figures.
pio.templates.default = "plotly_white"

# Layout wrapper that sets the global font and both axis-title fonts to size 18.
plot_template = {
    'layout': go.Layout(
        dict.fromkeys(
            ('font_size', 'xaxis_title_font_size', 'yaxis_title_font_size'),
            18,
        )
    ),
}

In [6]:
print("\nLinear Regression")
# The helper returns a pair; only the fitted model (first element) is used here.
lr, _ = linear_regression(X_train, y_train)
lr_predictions = lr.predict(X_test)
# The output scaler is forwarded to model_eval together with the scaled targets.
model_eval(
    y_test,
    lr_predictions,
    predictions_x_axis,
    stdout=stdout,
    vis=vis,
    scaler=output_scaler,
)


Linear Regression
MAPE: 0.04591038814470225 R square: 0.7887017123984879


# Machine learning

In [7]:
print("\nSupport Vector Regression")
# search=True plus the shared search settings (base_dir, product, attribute, raw, save).
svr, _ = support_vector_regression(
    X_train,
    y_train,
    search=True,
    **ml_searchCV_params,
)
svr_predictions = svr.predict(X_test)
# The output scaler is forwarded to model_eval together with the scaled targets.
model_eval(
    y_test,
    svr_predictions,
    predictions_x_axis,
    stdout=stdout,
    vis=vis,
    scaler=output_scaler,
)


Support Vector Regression
--> Use the existed best parameters!

Best parameter for SVR is:
  {'C': 100, 'gamma': 0.01, 'epsilon': 0.05, 'kernel': 'rbf'}
MAPE: 0.04662573916071147 R square: 0.7695773976347209


In [8]:
print("\nRandom Forest")
# NOTE(review): this is the only search helper that is also given `scaler`;
# the SVR and Gradient Boosting calls omit it — confirm random_forest needs it.
rf, _ = random_forest(
    X_train,
    y_train,
    search=True,
    **ml_searchCV_params,
    scaler=output_scaler,
)
rf_predictions = rf.predict(X_test)
# The output scaler is forwarded to model_eval together with the scaled targets.
model_eval(
    y_test,
    rf_predictions,
    predictions_x_axis,
    stdout=stdout,
    vis=vis,
    scaler=output_scaler,
)


Random Forest
--> Use the existed best parameters!

Best parameter for Random forest is:
  {'n_estimators': 50, 'max_depth': 9, 'max_features': 1, 'min_samples_leaf': 7, 'min_samples_split': 9}
MAPE: 0.05605317739899718 R square: 0.6958585167312588


In [9]:
print("\nGradient Boosting")
# search=True plus the shared search settings (base_dir, product, attribute, raw, save).
gb, _ = gradient_boosting(
    X_train,
    y_train,
    search=True,
    **ml_searchCV_params,
)
gb_predictions = gb.predict(X_test)
# The output scaler is forwarded to model_eval together with the scaled targets.
model_eval(
    y_test,
    gb_predictions,
    predictions_x_axis,
    stdout=stdout,
    vis=vis,
    scaler=output_scaler,
)


Gradient Boosting
--> Use the existed best parameters!

Best parameter for Gradient Boosting is:
  {'n_estimators': 70, 'max_depth': 3, 'max_features': 3, 'min_samples_leaf': 7, 'min_samples_split': 9}
MAPE: 0.056641378295991286 R square: 0.6803308471467395


# Deep learning

In [10]:
# Copy of df with two extra columns: a running integer index (time_idx) and a
# constant series identifier (group) set to the product name.
data = df.assign(time_idx=range(len(df)), group=product)

# Row position separating training rows from the rest (first 80% of the frame).
training_cutoff = floor(len(data) * 0.8)

# Window and batching settings passed to every deep model below.
max_encoder_length = 30 # 7, 14, 30, 60, 120
max_prediction_length = 1
batch_size = 64

# Column roles handed to the deep-model helpers.
group = ['group']
time_varying_known_reals = ['time_idx']
time_varying_unknown_reals = [*_input, _output]
# 'month' and 'week' are presumably added by preprocessData — TODO confirm.
time_varying_known_categoricals = ['month', 'week']
time_varying_unknown_categoricals = []

In [11]:
print("\nLSTM")
# RNN(...) hands back the model together with a validation dataloader;
# the trailing 'LSTM' argument selects the cell type.
lstm, val_dataloader = RNN(
    data, training_cutoff, _output, group,
    max_encoder_length, max_prediction_length,
    time_varying_known_categoricals, time_varying_unknown_categoricals,
    time_varying_known_reals,
    batch_size,
    pathForSavingModels(product, product_attribute, raw_file_name, 'LSTM'),
    'LSTM',
)

# Concatenate the first element of each batch's y over the whole validation loader
# (presumably y is a (target, weight) pair — TODO confirm).
actuals = torch.cat([batch_y[0] for _batch_x, batch_y in val_dataloader])
lstm_predictions = lstm.predict(val_dataloader)
# NOTE(review): predictions_x_axis was cut at the 80% mark of df; confirm it has
# the same length as the validation samples produced by the dataloader.
model_eval(
    actuals,
    lstm_predictions,
    predictions_x_axis,
    stdout=True,
    vis=True,
)


LSTM
MAPE: 0.04118090867996216 R square: 0.8623359213900043


In [12]:
print("\nGRU")
# Same helper as the LSTM cell; the trailing 'GRU' argument selects the cell type.
gru, val_dataloader = RNN(
    data, training_cutoff, _output, group,
    max_encoder_length, max_prediction_length,
    time_varying_known_categoricals, time_varying_unknown_categoricals,
    time_varying_known_reals,
    batch_size,
    pathForSavingModels(product, product_attribute, raw_file_name, 'GRU'),
    'GRU',
)

# Concatenate the first element of each batch's y over the whole validation loader.
actuals = torch.cat([batch_y[0] for _batch_x, batch_y in val_dataloader])
gru_predictions = gru.predict(val_dataloader)

model_eval(
    actuals,
    gru_predictions,
    predictions_x_axis,
    stdout=True,
    vis=True,
)


GRU
MAPE: 0.038563765585422516 R square: 0.870616266615487


In [13]:
print("\nDeepAR")
# DEEPAR(...) hands back the model together with a validation dataloader.
deep_ar, val_dataloader = DEEPAR(
    data, training_cutoff, _output, group,
    max_encoder_length, max_prediction_length,
    time_varying_known_categoricals, time_varying_unknown_categoricals,
    time_varying_known_reals,
    batch_size,
    pathForSavingModels(product, product_attribute, raw_file_name, 'DEEPAR'),
)

# Concatenate the first element of each batch's y over the whole validation loader.
actuals = torch.cat([batch_y[0] for _batch_x, batch_y in val_dataloader])
deepar_predictions = deep_ar.predict(val_dataloader)

model_eval(
    actuals,
    deepar_predictions,
    predictions_x_axis,
    stdout=True,
    vis=True,
)


DeepAR
MAPE: 0.05845688283443451 R square: 0.760482361814296


# Transformer

In [14]:
print("\nTFT")
# NOTE(review): the unknown categoricals are passed BEFORE the known ones here,
# the reverse of the RNN/DEEPAR calls — confirm this matches TFT's parameter order.
tft, val_dataloader = TFT(
    data, training_cutoff, _output, group,
    max_encoder_length, max_prediction_length,
    time_varying_unknown_categoricals, time_varying_known_categoricals,
    time_varying_known_reals, time_varying_unknown_reals,
    batch_size,
    pathForSavingModels(product, product_attribute, raw_file_name, 'TFT'),
)

# Concatenate the first element of each batch's y over the whole validation loader.
actuals = torch.cat([batch_y[0] for _batch_x, batch_y in val_dataloader])
tft_predictions = tft.predict(val_dataloader)

model_eval(
    actuals,
    tft_predictions,
    predictions_x_axis,
    stdout=True,
    vis=True,
)


TFT
MAPE: 0.05099772661924362 R square: 0.8139380063890236
