In [1]:
!pip install torchmetrics

Collecting torchmetrics
  Downloading torchmetrics-1.2.1-py3-none-any.whl (806 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m806.1/806.1 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.10.0-py3-none-any.whl (24 kB)
Installing collected packages: lightning-utilities, torchmetrics
Successfully installed lightning-utilities-0.10.0 torchmetrics-1.2.1


In [2]:
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from datetime import datetime
import torch
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import pandas as pd
import os
import matplotlib.pyplot as plt
from typing import(List, Dict, Union, Optional)
from torchmetrics.functional.regression import spearman_corrcoef
from torchmetrics.regression import PearsonCorrCoef

import warnings
warnings.filterwarnings('ignore')
device = torch.device('cuda:0')

from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
def read_wsdm23_file(file_path: str,) -> Dict[str, pd.DataFrame]:
  wsdm23_dict = np.load(file_path, allow_pickle=True).item()
  for df in wsdm23_dict.values():
    df.index.name = 'timestamp'
  return wsdm23_dict

def load_dataset():
  feature_names = None
  label_names = None

  file_path = '/content/gdrive/MyDrive/Colab Notebooks/인공지능프로젝트2/baseline_data_sp500.npy'
  df = read_wsdm23_file(file_path)
  return df

df = load_dataset()

In [4]:
dataset = df.copy()
df_list = []
for key in dataset.keys():
  df_list.append(pd.DataFrame({'timestamp':dataset[key].index, 'corporation':[key for _ in range(len(dataset[key].index))],\
                               'close':dataset[key]['close'].values, 'open':dataset[key]['open'].values, 'high':dataset[key]['high'].values,\
                               'low':dataset[key]['low'].values, 'volume':dataset[key]['volume'].values, 'adjclose':dataset[key]['adjclose'].values}))
dataset = pd.concat(df_list, ignore_index=True)
dataset['timestamp'] = pd.to_datetime(dataset['timestamp'])
dataset

Unnamed: 0,timestamp,corporation,close,open,high,low,volume,adjclose
0,2012-05-14,POOL,36.299999,36.360001,36.840000,35.979999,230900.0,32.237789
1,2012-05-15,POOL,36.750000,36.139999,37.119999,36.139999,328500.0,32.637432
2,2012-05-16,POOL,36.619999,36.950001,37.080002,36.549999,220300.0,32.521984
3,2012-05-17,POOL,34.979999,36.540001,36.540001,34.830002,418400.0,31.065506
4,2012-05-18,POOL,35.139999,34.900002,35.660000,34.779999,200300.0,31.207592
...,...,...,...,...,...,...,...,...
1197319,2022-05-19,RTX,90.250000,91.379997,92.459999,89.540001,5234700.0,90.250000
1197320,2022-05-20,RTX,90.080002,90.839996,91.169998,88.430000,6585500.0,90.080002
1197321,2022-05-23,RTX,91.830002,90.599998,92.019997,90.000000,4701300.0,91.830002
1197322,2022-05-24,RTX,93.209999,91.199997,93.419998,90.470001,5932000.0,93.209999


In [5]:
print(dataset['corporation'].unique())
print('The number of corporations: ', len(dataset['corporation'].unique()))

['POOL' 'MKC' 'AOS' 'AXP' 'CME' 'EQR' 'BKR' 'FDX' 'NCLH' 'MET' 'MRK' 'MAA'
 'ISRG' 'GOOGL' 'PEAK' 'CMCSA' 'TT' 'JPM' 'REGN' 'DHI' 'HUM' 'MDT' 'DIS'
 'DE' 'F' 'C' 'EXC' 'SCHW' 'WYNN' 'PLD' 'AAL' 'DVN' 'AEE' 'IQV' 'AMZN'
 'TDY' 'PENN' 'IP' 'OMC' 'VFC' 'MSCI' 'A' 'USB' 'UNH' 'IT' 'ETR' 'DVA'
 'HSY' 'NEE' 'FFIV' 'BIIB' 'KO' 'MSI' 'KMB' 'FTNT' 'EL' 'ABC' 'CF' 'LKQ'
 'TMUS' 'ABT' 'PSX' 'LMT' 'ESS' 'APD' 'PSA' 'AMAT' 'HCA' 'ORLY' 'KEY'
 'KLAC' 'GL' 'PCAR' 'CBOE' 'PRU' 'FE' 'LEG' 'PHM' 'NOC' 'BBWI' 'MCK' 'CPB'
 'KR' 'DGX' 'TJX' 'GRMN' 'PKG' 'FISV' 'OXY' 'FITB' 'CBRE' 'WAB' 'KMX'
 'RSG' 'ADM' 'PEP' 'IRM' 'LOW' 'CL' 'TXT' 'MCO' 'DUK' 'NKE' 'PAYX' 'IFF'
 'MSFT' 'CTRA' 'MA' 'NWS' 'PVH' 'TSN' 'DXC' 'VTRS' 'TWTR' 'MTD' 'MLM' 'SO'
 'ETN' 'STT' 'GD' 'CMS' 'SRE' 'MMC' 'AIZ' 'NOW' 'TGT' 'BRK-B' 'MRO' 'NXPI'
 'WFC' 'EXR' 'NTAP' 'BDX' 'AJG' 'WY' 'EOG' 'XYL' 'PH' 'IPG' 'DLTR' 'TDG'
 'LUV' 'AFL' 'AVGO' 'LLY' 'CAG' 'ODFL' 'ZBH' 'KMI' 'LUMN' 'PGR' 'ROL'
 'PNC' 'DRE' 'EQIX' 'LRCX' 'VMC' 'TRMB' 'HPQ' 'ADP' 'ARE

In [6]:
train_start = datetime.strptime('2012-05-14', '%Y-%m-%d')
valid_start = datetime.strptime('2020-05-22', '%Y-%m-%d')
test_start = datetime.strptime('2021-05-22', '%Y-%m-%d')
test_end = datetime.strptime('2022-05-25', '%Y-%m-%d')

train_dataset = dataset[(dataset['timestamp'] >= train_start) & (dataset['timestamp'] < valid_start)]
valid_dataset = dataset[(dataset['timestamp'] >= valid_start) & (dataset['timestamp'] < test_start)]
test_dataset = dataset[(dataset['timestamp'] >= test_start) & (dataset['timestamp'] <= test_end)]

In [7]:
scaler = StandardScaler()

train_dfx = train_dataset[['open', 'close', 'high', 'low', 'volume']]
train_dfx = scaler.fit_transform(train_dfx)
train_dfy = train_dataset[['open']]
train_dfy = scaler.fit_transform(train_dfy)

val_dfx = valid_dataset[['open', 'close', 'high', 'low', 'volume']]
val_dfx = scaler.fit_transform(val_dfx)
val_dfy = valid_dataset[['open']]
val_dfy = scaler.fit_transform(val_dfy)

test_dfx = test_dataset[['open', 'close', 'high', 'low', 'volume']]
test_dfx = scaler.fit_transform(test_dfx)
test_dfy = test_dataset[['open']]
test_dfy = scaler.fit_transform(test_dfy)

seq_len = 96
pred_len = 5

train_x, train_y = [], []
for i in range(len(train_dfx) - seq_len-pred_len+1):
  train_x.append(train_dfx[i:i+seq_len])
  train_y.append(train_dfy[i+seq_len:i+seq_len+pred_len])
train_x = np.array(train_x)
train_y = np.array(train_y)

val_x, val_y = [], []
for i in range(len(val_dfx) - seq_len-pred_len+1):
  val_x.append(val_dfx[i:i+seq_len])
  val_y.append(val_dfy[i+seq_len:i+seq_len+pred_len])
val_x = np.array(val_x)
val_y = np.array(val_y)

test_x, test_y = [], []
for i in range(len(test_dfx) - seq_len-pred_len+1):
  test_x.append(test_dfx[i:i+seq_len])
  test_y.append(test_dfy[i+seq_len:i+seq_len+pred_len])
test_x = np.array(test_x)
test_y = np.array(test_y)

train_x = torch.Tensor(train_x).to(device)
train_y = torch.Tensor(train_y).to(device)
val_x = torch.Tensor(val_x).to(device)
val_y = torch.Tensor(val_y).to(device)
test_x = torch.Tensor(test_x).to(device)
test_y = torch.Tensor(test_y).to(device)

In [8]:
class moving_avg(nn.Module):
    """
    Moving average block to highlight the trend of time series
    """
    def __init__(self, kernel_size, stride):
        super(moving_avg, self).__init__()
        self.kernel_size = kernel_size
        self.avg = nn.AvgPool1d(kernel_size=kernel_size, stride=stride, padding=0)

    def forward(self, x):
        # padding on the both ends of time series
        front = x[:, 0:1, :].repeat(1, (self.kernel_size - 1) // 2, 1)
        end = x[:, -1:, :].repeat(1, (self.kernel_size - 1) // 2, 1)
        x = torch.cat([front, x, end], dim=1)
        x = self.avg(x.permute(0, 2, 1))
        x = x.permute(0, 2, 1)
        return x


class series_decomp(nn.Module):
    """
    Series decomposition block
    """
    def __init__(self, kernel_size):
        super(series_decomp, self).__init__()
        self.moving_avg = moving_avg(kernel_size, stride=1)

    def forward(self, x):
        moving_mean = self.moving_avg(x)
        res = x - moving_mean
        return res, moving_mean

class Model(nn.Module):
    """
    Decomposition-Linear
    """
    def __init__(self, configs):
        super(Model, self).__init__()
        self.seq_len = configs.seq_len
        self.pred_len = configs.pred_len

        # Decompsition Kernel Size
        kernel_size = 25
        self.decompsition = series_decomp(kernel_size)
        self.individual = configs.individual
        self.channels = configs.enc_in

        if self.individual:
            self.Linear_Seasonal = nn.ModuleList()
            self.Linear_Trend = nn.ModuleList()

            for i in range(self.channels):
                self.Linear_Seasonal.append(nn.Linear(self.seq_len,self.pred_len))
                self.Linear_Trend.append(nn.Linear(self.seq_len,self.pred_len))

                # Use this two lines if you want to visualize the weights
                # self.Linear_Seasonal[i].weight = nn.Parameter((1/self.seq_len)*torch.ones([self.pred_len,self.seq_len]))
                # self.Linear_Trend[i].weight = nn.Parameter((1/self.seq_len)*torch.ones([self.pred_len,self.seq_len]))
        else:
            self.Linear_Seasonal = nn.Linear(self.seq_len,self.pred_len)
            self.Linear_Trend = nn.Linear(self.seq_len,self.pred_len)

            # Use this two lines if you want to visualize the weights
            # self.Linear_Seasonal.weight = nn.Parameter((1/self.seq_len)*torch.ones([self.pred_len,self.seq_len]))
            # self.Linear_Trend.weight = nn.Parameter((1/self.seq_len)*torch.ones([self.pred_len,self.seq_len]))

    def forward(self, x):
        # x: [Batch, Input length, Channel]
        seasonal_init, trend_init = self.decompsition(x)
        seasonal_init, trend_init = seasonal_init.permute(0,2,1), trend_init.permute(0,2,1)
        if self.individual:
            seasonal_output = torch.zeros([seasonal_init.size(0),seasonal_init.size(1),self.pred_len],dtype=seasonal_init.dtype).to(seasonal_init.device)
            trend_output = torch.zeros([trend_init.size(0),trend_init.size(1),self.pred_len],dtype=trend_init.dtype).to(trend_init.device)
            for i in range(self.channels):
                seasonal_output[:,i,:] = self.Linear_Seasonal[i](seasonal_init[:,i,:])
                trend_output[:,i,:] = self.Linear_Trend[i](trend_init[:,i,:])

        else:
            seasonal_output = self.Linear_Seasonal(seasonal_init)
            trend_output = self.Linear_Trend(trend_init)

        x = seasonal_output + trend_output
        x = x[:, -5:, :]
        return x

In [9]:
import argparse
import sys

parser = argparse.ArgumentParser()
parser.add_argument('--seq_len', type=int, default=96, help='시퀀스 길이')
parser.add_argument('--pred_len', type=int, default=5, help='예측 길이')
parser.add_argument('--individual', type=bool, default=False, help='개별 채널 예측 여부')
parser.add_argument('--enc_in', type=int, default=6, help='입력 채널 수')

if 'google.colab' in sys.modules:
    sys.argv = sys.argv[:1]

args = parser.parse_args()

In [10]:
model = Model(args).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 100

for epoch in range(epochs):
  model.train()
  train_loss = 0
  train_total = 0

  optimizer.zero_grad()
  outputs = model(train_x).to(device)
  loss = criterion(outputs, train_y)
  loss.backward()
  optimizer.step()
  train_loss = loss.item()

  model.eval()
  with torch.no_grad():
    val_outputs = model(val_x).to(device)
    val_loss = criterion(val_outputs, val_y).item()

  if (epoch%10 == 0):
    print(f'Epoch [{epoch+1}/{epochs}], Train Loss: {train_loss}, Validaiton Loss: {val_loss}')

Epoch [1/100], Train Loss: 1.4671756029129028, Validaiton Loss: 1.3157129287719727
Epoch [11/100], Train Loss: 0.46796733140945435, Validaiton Loss: 0.6430683135986328
Epoch [21/100], Train Loss: 0.48548173904418945, Validaiton Loss: 0.6936966776847839
Epoch [31/100], Train Loss: 0.4341546595096588, Validaiton Loss: 0.6173332333564758
Epoch [41/100], Train Loss: 0.4206000566482544, Validaiton Loss: 0.5831505656242371
Epoch [51/100], Train Loss: 0.4127720892429352, Validaiton Loss: 0.5680655837059021
Epoch [61/100], Train Loss: 0.4063766896724701, Validaiton Loss: 0.5554686784744263
Epoch [71/100], Train Loss: 0.40231505036354065, Validaiton Loss: 0.538784384727478
Epoch [81/100], Train Loss: 0.39868852496147156, Validaiton Loss: 0.523898720741272
Epoch [91/100], Train Loss: 0.39560797810554504, Validaiton Loss: 0.5118358135223389


In [11]:
model.eval()
with torch.no_grad():
  test_outputs = model(test_x).to(device)
  test_loss = criterion(test_outputs, test_y)
  print(f'Test Loss: {test_loss.item()}')

Test Loss: 0.4846578538417816


In [12]:
predictions = model(test_x).detach().cpu().numpy()
actual = test_y.cpu().numpy()

# MSE 계산
mse = np.mean((predictions - actual) ** 2)

# MAE 계산
mae = np.mean(np.abs(predictions - actual))

# RMSE 계산
rmse = np.sqrt(mse)

print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')

MAE: 0.2503601610660553
MSE: 0.48465782403945923
RMSE: 0.6961737275123596


In [13]:
test_outputs = test_outputs[:,:,0].reshape((120770,5,1)).reshape((120770*5,1)).cpu()
test_y = test_y.reshape((120770,5,1)).reshape((120770*5,1)).cpu()

rankIC = spearman_corrcoef(test_outputs, test_y)
print(f"RankIC: {rankIC}")

rankIR = PearsonCorrCoef()
rankIR(test_outputs, test_y)
print(f"RankIR: {rankIR.compute():.4f}")

RankIC: tensor([0.5890])
RankIR: 0.6563
