Define the Model

- prediction_length(이 경우 48시간): 정보 제공자의 디코더가 예측을 학습할 기간입니다
- context_length: context_length가 지정되지 않은 경우, 모델은 context_length(인코더의 입력)를 prediction_length와 동일하게 설정합니다
- lags for a given frequency: 이는 효율적인 "되돌아보기" 메커니즘을 지정하며, 과거 값과 현재 값을 추가 기능으로 연결합니다(예: 일일 빈도의 경우 [1, 7, 30, ...] 또는 분 데이터의 경우 [1, 30, 60, 60*24, ...] 등을 고려할 수 있습니다)
- the number of time features: 이 경우에는 시간, 요일, ... 및 연령 기능을 추가할 것이므로 5개가 됩니다(아래 참조).

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

from datetime import timedelta
import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader, Dataset

from tqdm import tqdm

# model을 import
from transformers import InformerModel
from gluonts.time_feature import get_lags_for_frequency, time_features_from_frequency_str

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Util function

In [104]:
class StandardScaler():
    def __init__(self):
        self.mean = 0.
        self.std = 1.
    
    def fit(self, data):
        self.mean = data.mean(0)
        self.std = data.std(0)

    def transform(self, data):
        mean = torch.from_numpy(self.mean).type_as(data).to(data.device) if torch.is_tensor(data) else self.mean
        std = torch.from_numpy(self.std).type_as(data).to(data.device) if torch.is_tensor(data) else self.std
        return (data - mean) / std

    def inverse_transform(self, data):
        mean = torch.from_numpy(self.mean).type_as(data).to(data.device) if torch.is_tensor(data) else self.mean
        std = torch.from_numpy(self.std).type_as(data).to(data.device) if torch.is_tensor(data) else self.std
        return (data * std) + mean

# 시간 특징을 freq에 따라 추출
def time_features(dates, freq='1s'):
    timestamp_as_index = pd.DatetimeIndex(dates)
    time_features = time_features_from_frequency_str(freq)
    additional_features = [
        (time_feature.__name__, time_feature(timestamp_as_index))
        for time_feature in time_features
    ]
        
    time_features = pd.DataFrame(dict(additional_features))  
    
    return time_features.values

# 한번의 batch를 실행하는 코드
def _process_one_batch(batch_x, batch_y, batch_x_mark, batch_y_mark):
    batch_x = batch_x.float().to(device)
    batch_y = batch_y.float()
    batch_x_mark = batch_x_mark.float().to(device)
    batch_y_mark = batch_y_mark.float().to(device)
    dec_inp = torch.zeros([batch_y.shape[0], pred_len, batch_y.shape[-1]]).float()
    dec_inp = torch.cat([batch_y[:,:label_len,:], dec_inp], dim=1).float().to(device)
    outputs = model(batch_x, batch_x_mark, dec_inp, batch_y_mark)
    batch_y = batch_y[:,-pred_len:,0:].to(device)
    return outputs, batch_y

# Make Datase

In [124]:
class Dataset_Pred(Dataset):
    def __init__(self, dataframe, size=None, scale=True):
        self.fisrt_dt = None
        self.seq_len = size[0]
        self.label_len = size[1]
        self.pred_len = size[2]
        self.dataframe = dataframe
        
        self.scale = scale
        self.__read_data__()

    def __read_data__(self):
        self.scaler = StandardScaler()
        df_raw = self.dataframe
        # df_raw["STCK_CNTG_HOUR"] = pd.to_datetime(df_raw["STCK_CNTG_HOUR"])
        df_raw["AMOUNT"] = df_raw["STCK_PRPR"].values * df_raw["CNTG_VOL"].values
        delta = df_raw["STCK_CNTG_HOUR"].iloc[1] - df_raw["STCK_CNTG_HOUR"].iloc[0]
        self.first_time = df_raw["STCK_CNTG_HOUR"].values[0]
        
        if delta>=timedelta(hours=1):
            self.freq='h'
        else:
            self.freq='t'

        border1 = 0
        border2 = len(df_raw)
        input_columns = list(df_raw.columns.difference(["MKSC_SHRN_ISCD", "day_of_year", "STCK_CNTG_HOUR", "STCK_PRPR", "CCLD_DVSN"]))
        df_data = df_raw[input_columns]

        if self.scale:
            self.scaler.fit(df_data.values)
            data = self.scaler.transform(df_data.values)
        else:
            data = df_data.values
        
        df_stamp = df_raw["STCK_CNTG_HOUR"]

        data_stamp = time_features(df_stamp)

        self.data_x = data[border1:border2]
        self.data_y = data[border1:border2]
        self.data_stamp = data_stamp
    #######################################################04:15 취침 (수정 필요) ####################################################
    def __getitem__(self, index):
        s_begin = self.first_time + pd.Timedelta(seconds=index)
        s_end = s_begin + pd.Timedelta(seconds=self.seq_len) # self.seq_len # pd.Timedelta(seconds=self.seq_len)
        
        r_begin = s_end - pd.Timedelta(seconds=self.label_len) # pd.Timedelta(seconds=self.label_len)
        r_end = r_begin + pd.Timedelta(seconds=self.label_len) + pd.Timedelta(seconds=self.pred_len) # pd.Timedelta(seconds=self.label_len + self.pred_len)
        print(self.data_x)
 
        seq_x = self.data_x.loc[s_begin:s_end, "STCK_CNTG_HOUR"]
        seq_y = self.data_y.loc[r_begin:r_end, "STCK_CNTG_HOUR"]
        seq_x_mark = self.data_stamp.loc[s_begin:s_end, "STCK_CNTG_HOUR"]
        seq_y_mark = self.data_stamp.loc[r_begin:r_end, "STCK_CNTG_HOUR"]
        
        return seq_x, seq_y, seq_x_mark, seq_y_mark
####################################################################################################################################
    def __len__(self):
        return len(self.data_x) - self.seq_len- self.pred_len + 1

In [125]:
from torch.utils.data import DataLoader
from glob import glob
import os, sys

paths = sorted(glob(os.path.join(os.path.dirname(os.path.abspath(os.getcwd())),"data/raw/*.parquet")))
dataset = Dataset_Pred(pd.read_parquet(paths[0]), scale=True, size = (60, 30, 30))
dl = DataLoader(Dataset_Pred(pd.read_parquet(paths[0]), size=[24, 24, 24]), batch_size=1, shuffle=False)

In [126]:
for seq_x, seq_y, seq_x_mark, seq_y_mark in dl:

    break

[[-0.06141017 -0.06248157]
 [-0.06141017 -0.06248157]
 [ 0.02997654  0.02685745]
 ...
 [-0.03694056 -0.03475567]
 [-0.06179934 -0.06248157]
 [-0.05351308 -0.05323961]]


AttributeError: 'numpy.ndarray' object has no attribute 'loc'

In [61]:
from glob import glob
import os, sys
# sys.path.append(os.path.dirname(os.path.abspath(os.getcwd())))
paths = sorted(glob(os.path.join(os.path.dirname(os.path.abspath(os.getcwd())),"data/raw/*.parquet")))

In [71]:
pd.read_parquet(paths[0])

Unnamed: 0,MKSC_SHRN_ISCD,STCK_CNTG_HOUR,STCK_PRPR,CNTG_VOL,CCLD_DVSN
0,005490,2023-09-11 08:30:00,583000.0,1.0,3.0
1,005490,2023-09-11 08:30:00,583000.0,1.0,3.0
2,005490,2023-09-11 08:30:00,583000.0,30.0,3.0
3,005490,2023-09-11 08:30:00,583000.0,3.0,3.0
4,005490,2023-09-11 08:30:00,583000.0,3.0,3.0
...,...,...,...,...,...
714727,005490,2023-10-04 15:56:55,511000.0,5.0,5.0
714728,005490,2023-10-04 15:57:42,511000.0,3.0,5.0
714729,005490,2023-10-04 15:57:55,511000.0,10.0,5.0
714730,005490,2023-10-04 15:58:39,511000.0,1.0,5.0


In [None]:
from glob import glob
dataset = DS(data_paths=paths[:2], window_size=(3, 0), target_size=(1, 0))
# dataset

progress = tqdm(DataLoader(dataset, batch_size=2, shuffle=False, collate_fn=collate_func(True)), total=len(dataset))
for idx, boxcox_lambda, x, y in progress:
    progress.n = idx
    progress.refresh()
    print(x)
    print(y)
    break