In [2]:
import os
import sys
PROJECT_PATH = os.path.dirname(os.path.dirname(os.path.realpath("__file__")))
sys.path.append(PROJECT_PATH)

import json
import pickle

import numpy as np
import pandas as pd
import torch
from torchsummary import summary

from src.data.downloader import Downloader
from src.model.baseline_model import BaselineModel
from src.model.linear_model import LinearModel
from src.model.mlp_model import MLPModel

In [2]:
%%capture
gdrive_link = "https://drive.google.com/uc?export=download&id="
Downloader(gdrive_id="1XMqFFSc65UVE3EYh_tgPHtVRnonshjEO", file_name="all_analysis_for_paper.json") 
Downloader(files=json.load(open(os.path.join(PROJECT_PATH, "data", "all_analysis_for_paper.json"))))

# Data loading, preprocessing

In [4]:
data_training = pd.read_csv("../data/data_training.csv")
data_validation = pd.read_csv('../data/data_validation.csv')

data = pd.concat([data_training, data_validation])
data["Date"] = pd.to_datetime(data["Date"])
data = data.set_index('Date')
meta = pd.read_csv("../data/meta.csv")
data =  data.loc[data.index >= '2005']

data


Unnamed: 0_level_0,1515,1516,1518,1521,1719,1720,1722,1723,2040,2046,...,1732,1734,2049,2741,2742,2751,2545,744624,210888,210900
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005-01-01,-80.0,45.0,9.0,387.0,513.0,460.0,274.0,322.0,489.0,320,...,-47.0,35.0,142.0,86.0,229.0,244.0,222.0,82.696130,307.000000,288.000000
2005-01-02,-111.0,5.0,-36.0,362.0,505.0,462.0,243.0,310.0,482.0,299,...,-46.0,30.0,139.0,85.0,227.0,241.0,198.0,69.923412,309.000000,306.000000
2005-01-03,-123.0,-9.0,-67.0,341.0,499.0,465.0,214.0,270.0,466.0,274,...,-47.0,27.0,137.0,75.0,225.0,242.0,198.0,72.505006,304.000000,306.000000
2005-01-04,-132.0,-11.0,-72.0,336.0,498.0,464.0,203.0,260.0,464.0,226,...,-50.0,27.0,134.0,109.0,246.0,241.0,198.0,63.899692,293.000000,293.000000
2005-01-05,-127.0,-6.0,-76.0,331.0,496.0,466.0,193.0,258.0,458.0,210,...,-38.0,27.0,133.0,149.0,246.0,303.0,202.0,63.039160,287.000000,282.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-12-26,-27.0,75.0,-69.0,309.0,478.0,460.0,138.0,204.0,451.0,-78,...,-70.0,52.0,136.0,74.0,194.0,198.0,-18.0,31.000000,227.059349,212.846617
2020-12-27,-94.0,21.0,-50.0,333.0,488.0,460.0,206.0,252.0,464.0,20,...,-88.0,42.0,133.0,82.0,194.0,216.0,-17.0,19.000000,235.703288,221.319022
2020-12-28,-137.0,-20.0,-92.0,311.0,481.0,459.0,201.0,250.0,461.0,120,...,-86.0,40.0,133.0,83.0,197.0,215.0,-22.0,24.000000,256.980675,242.174172
2020-12-29,-140.0,-31.0,-118.0,303.0,481.0,462.0,182.0,239.0,457.0,132,...,0.0,53.0,133.0,85.0,197.0,203.0,-24.0,32.000000,273.603634,258.467258


In [8]:
filename = os.path.join(PROJECT_PATH, "data", "scaler.pickle")
with open(filename, 'rb') as f:
    scaler = pickle.load(f)
scaler_dict = {k: v["scaler"] for k, v in scaler.items()}
scaler_dict

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


{'1516': MinMaxScaler(),
 '1719': MinMaxScaler(),
 '1722': MinMaxScaler(),
 '210888': MinMaxScaler(),
 '2271': MinMaxScaler(),
 '2272': MinMaxScaler(),
 '2274': MinMaxScaler(),
 '2275': MinMaxScaler(),
 '2278': MinMaxScaler(),
 '2543': MinMaxScaler(),
 '2753': MinMaxScaler(),
 '2756': MinMaxScaler()}

In [10]:
s_data = data.copy()
for st_id, sclr in scaler_dict.items():
    s_data[st_id] = sclr.transform(data[[st_id]])

Unnamed: 0_level_0,1515,1516,1518,1521,1719,1720,1722,1723,2040,2046,...,1732,1734,2049,2741,2742,2751,2545,744624,210888,210900
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005-01-01,-80.0,0.238908,9.0,387.0,0.612160,460.0,0.466246,322.0,489.0,320,...,-47.0,35.0,142.0,86.0,229.0,244.0,222.0,82.696130,0.456522,288.000000
2005-01-02,-111.0,0.204778,-36.0,362.0,0.604757,462.0,0.435196,310.0,482.0,299,...,-46.0,30.0,139.0,85.0,227.0,241.0,198.0,69.923412,0.458333,306.000000
2005-01-03,-123.0,0.192833,-67.0,341.0,0.599204,465.0,0.406150,270.0,466.0,274,...,-47.0,27.0,137.0,75.0,225.0,242.0,198.0,72.505006,0.453804,306.000000
2005-01-04,-132.0,0.191126,-72.0,336.0,0.598279,464.0,0.395132,260.0,464.0,226,...,-50.0,27.0,134.0,109.0,246.0,241.0,198.0,63.899692,0.443841,293.000000
2005-01-05,-127.0,0.195392,-76.0,331.0,0.596428,466.0,0.385116,258.0,458.0,210,...,-38.0,27.0,133.0,149.0,246.0,303.0,202.0,63.039160,0.438406,282.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-12-26,-27.0,0.264505,-69.0,309.0,0.579770,460.0,0.330028,204.0,451.0,-78,...,-70.0,52.0,136.0,74.0,194.0,198.0,-18.0,31.000000,0.384112,212.846617
2020-12-27,-94.0,0.218430,-50.0,333.0,0.589025,460.0,0.398137,252.0,464.0,20,...,-88.0,42.0,133.0,82.0,194.0,216.0,-17.0,19.000000,0.391941,221.319022
2020-12-28,-137.0,0.183447,-92.0,311.0,0.582547,459.0,0.393129,250.0,461.0,120,...,-86.0,40.0,133.0,83.0,197.0,215.0,-22.0,24.000000,0.411214,242.174172
2020-12-29,-140.0,0.174061,-118.0,303.0,0.582547,462.0,0.374099,239.0,457.0,132,...,0.0,53.0,133.0,85.0,197.0,203.0,-24.0,32.000000,0.426271,258.467258


In [11]:
windowed_data = np.lib.stride_tricks.sliding_window_view(
    s_data, (15, 12)
).reshape((-1, 15, 12))
win_data_torch = torch.tensor(windowed_data, dtype=torch.float32)
win_data_torch.shape

torch.Size([215673, 15, 12])

In [12]:
y_true = pd.read_csv(os.path.join(PROJECT_PATH, "data", "y_true_2005-01-01_2019-12-24.csv"), 
                     header=0, index_col=0)
y_true.index = pd.to_datetime(y_true.index)
y_true

Unnamed: 0,1day,2day,3day,4day,5day,6day,7day
2005-01-01,281.0,290.0,283.0,270.0,252.0,254.0,257.0
2005-01-02,290.0,283.0,270.0,252.0,254.0,257.0,258.0
2005-01-03,283.0,270.0,252.0,254.0,257.0,258.0,267.0
2005-01-04,270.0,252.0,254.0,257.0,258.0,267.0,277.0
2005-01-05,252.0,254.0,257.0,258.0,267.0,277.0,282.0
...,...,...,...,...,...,...,...
2019-12-20,123.0,124.0,117.0,117.0,119.0,133.0,157.0
2019-12-21,124.0,117.0,117.0,119.0,133.0,157.0,186.0
2019-12-22,117.0,117.0,119.0,133.0,157.0,186.0,203.0
2019-12-23,117.0,119.0,133.0,157.0,186.0,203.0,215.0


# Running baseline models

In [9]:
szeged_idx = np.arange(
    len(data.columns)
)[np.array(data.columns) == "2275"][0]
baseline = BaselineModel(horizon=7, target_idx=szeged_idx)
baseline(win_data_torch).shape

torch.Size([5471, 7])

In [10]:
device = 'cpu'  # "cuda:0" if torch.cuda.is_available() else "cpu"

In [11]:
linear = LinearModel(past_window=15, n_features=12, horizon=7)
linear.model
l_linear = torch.load(os.path.join(PROJECT_PATH, "data", "linear_model.pth"))

In [12]:
summary(linear, (15, 12), device=device)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
           Flatten-1                  [-1, 180]               0
            Linear-2                    [-1, 7]           1,267
Total params: 1,267
Trainable params: 1,267
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.01
----------------------------------------------------------------


In [13]:
linear.model.to("cpu")
linear.model.load_state_dict(l_linear.state_dict())
linear(win_data_torch).shape

torch.Size([5471, 7])

In [14]:
mlp = MLPModel(past_window=15, n_features=12, horizon=7)
l_mlp = torch.load(os.path.join(PROJECT_PATH, "data", "mlp_model.pth"))

In [15]:
summary(mlp, (15, 12), device=device)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
           Flatten-1                  [-1, 180]               0
            Linear-2                  [-1, 256]          46,336
              ReLU-3                  [-1, 256]               0
            Linear-4                  [-1, 128]          32,896
              ReLU-5                  [-1, 128]               0
            Linear-6                    [-1, 7]             903
Total params: 80,135
Trainable params: 80,135
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.01
Params size (MB): 0.31
Estimated Total Size (MB): 0.31
----------------------------------------------------------------


In [16]:
mlp.model.load_state_dict(l_mlp.state_dict())
mlp(win_data_torch).shape

torch.Size([5471, 7])

# Get predictions

In [24]:
def get_predictions(model, st_id = "2275"):
    pred = pd.DataFrame(
        data=model(win_data_torch).detach().numpy(),
        index=y_true.index,
        columns=y_true.columns)

    for col in pred.columns:
        pred[col] = scaler_dict[st_id].inverse_transform(pred[[col]])
    return pred

In [25]:
get_predictions(model=baseline)

Unnamed: 0,1day,2day,3day,4day,5day,6day,7day
2005-01-01,251.000000,251.000000,251.000000,251.000000,251.000000,251.000000,251.000000
2005-01-02,281.000000,281.000000,281.000000,281.000000,281.000000,281.000000,281.000000
2005-01-03,290.000000,290.000000,290.000000,290.000000,290.000000,290.000000,290.000000
2005-01-04,283.000000,283.000000,283.000000,283.000000,283.000000,283.000000,283.000000
2005-01-05,270.000000,270.000000,270.000000,270.000000,270.000000,270.000000,270.000000
...,...,...,...,...,...,...,...
2019-12-20,118.000000,118.000000,118.000000,118.000000,118.000000,118.000000,118.000000
2019-12-21,122.999992,122.999992,122.999992,122.999992,122.999992,122.999992,122.999992
2019-12-22,123.999992,123.999992,123.999992,123.999992,123.999992,123.999992,123.999992
2019-12-23,116.999992,116.999992,116.999992,116.999992,116.999992,116.999992,116.999992


In [26]:
get_predictions(model=linear)

Unnamed: 0,1day,2day,3day,4day,5day,6day,7day
2005-01-01,283.279938,290.000610,302.234314,291.485107,293.573059,289.969269,284.621185
2005-01-02,299.273804,285.124298,287.191498,276.713104,273.201721,258.846252,256.347137
2005-01-03,290.987885,278.913300,267.998901,255.359329,239.473923,243.977158,255.437943
2005-01-04,279.119293,254.222382,240.689133,245.658737,234.614212,238.752792,241.292892
2005-01-05,243.177017,227.446075,231.924957,226.781708,222.826294,226.452240,241.546600
...,...,...,...,...,...,...,...
2019-12-20,117.218193,117.482330,118.721832,123.736847,127.621651,126.254417,137.911301
2019-12-21,125.835815,115.553963,119.570305,113.764359,123.820084,122.564011,135.495331
2019-12-22,116.240257,112.532196,107.928871,107.734535,115.231689,123.158607,127.177452
2019-12-23,111.046043,104.793930,110.087883,107.359940,121.677528,121.138115,127.906380


In [27]:
get_predictions(model=mlp)

Unnamed: 0,1day,2day,3day,4day,5day,6day,7day
2005-01-01,266.176025,286.165833,303.614899,315.135773,321.390045,320.636078,320.830658
2005-01-02,280.764832,278.486877,272.853058,270.835144,264.144348,262.644562,262.746399
2005-01-03,278.915710,265.665924,252.556137,244.399063,235.686432,237.241302,238.306732
2005-01-04,265.337402,250.639542,238.935349,230.464462,225.454391,228.665451,234.037888
2005-01-05,242.479721,235.259964,233.362305,230.349747,235.108231,240.555374,246.867493
...,...,...,...,...,...,...,...
2019-12-20,117.328789,119.007515,125.856972,130.000275,136.356964,143.036484,148.404724
2019-12-21,116.089928,116.225555,122.056183,125.897514,133.662994,139.887405,145.222580
2019-12-22,116.192131,112.824257,115.741524,117.333801,122.897568,129.452820,134.288803
2019-12-23,113.119843,109.804390,113.774834,115.258972,121.619537,128.773712,133.195816
