In [1]:
import sys
sys.path.append('..')
from data.process import StockDataProcessor
import data.config as dataconf

In [2]:
import numpy as np

In [3]:
data_processor = StockDataProcessor()

In [4]:
# Relative path to the raw price CSV, one directory above this notebook.
storage_path = '../data/raw.csv'
# Load the CSV for the tickers configured in dataconf.tickers; presumably this fills
# data_processor.df, which the next cell displays.
data_processor.load_raw_to_df(storage_path, dataconf.tickers)

In [5]:
data_processor.df

Unnamed: 0,Open,High,Low,Close,Volume,Ticker,Collect Date
0,16.100000,16.396667,15.942000,16.312668,92439000.0,TSLA,2019-10-01
1,16.219334,16.309999,15.962000,16.208668,84471000.0,TSLA,2019-10-02
2,15.457333,15.632000,14.952000,15.535333,226267504.0,TSLA,2019-10-03
3,15.440667,15.652000,15.204667,15.428667,119925000.0,TSLA,2019-10-04
4,15.320000,15.904000,15.236667,15.848000,120963000.0,TSLA,2019-10-07
...,...,...,...,...,...,...,...
65560,2894.000000,2931.500000,2863.000000,2882.500000,24980800.0,7203.T,2025-01-24
65561,2913.500000,2941.500000,2910.500000,2922.000000,18257200.0,7203.T,2025-01-27
65562,2900.000000,2936.000000,2889.500000,2889.500000,18314000.0,7203.T,2025-01-28
65563,2917.000000,2936.500000,2898.500000,2930.000000,17997800.0,7203.T,2025-01-29


In [6]:
# Model inputs: closing price and traded volume.
features = ['Close', 'Volume']
# The three calls below keep no return value, so they presumably mutate data_processor
# in place; the cell is therefore order-dependent and should be run once per fresh processor.
data_processor.handle_missing_data(dataconf.start_date, dataconf.end_date, features=features)
# NOTE(review): scale() runs before the train/val/test split further down. If the scaler is
# fitted on the whole frame, validation/test statistics leak into training — confirm in
# StockDataProcessor.
data_processor.scale(features=features)
data_processor.select_feature(features=features)

In [7]:
data_processor.df

Unnamed: 0,Close,Volume,Collect Date,Ticker
67,-0.361416,7.191827,2020-01-01,TSLA
68,-0.358438,6.637111,2020-01-02,TSLA
69,-0.355254,12.707194,2020-01-03,TSLA
70,-0.353125,7.079427,2020-01-06,TSLA
71,-0.348751,12.783453,2020-01-07,TSLA
...,...,...,...,...
65538,10.636154,2.325211,2024-12-25,7203.T
65539,11.300889,2.760062,2024-12-26,7203.T
65540,11.473159,2.040668,2024-12-27,7203.T
65541,11.315869,0.809383,2024-12-30,7203.T


In [8]:
# Build windowed samples: 40 input steps of the selected features per sample and
# 5 target steps of 'Close', with 10% of the data held out for validation and 10% for test.
(X_train, y_train,
 X_val, y_val,
 X_test, y_test) = data_processor.split_train_val_test(
    window_for_x=40,
    window_for_y=5,
    val_size=0.1,
    test_size=0.1,
    features=features,
    target_col='Close',
)

In [9]:
X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

((47094, 40, 2),
 (47094, 5),
 (4042, 40, 2),
 (4042, 5),
 (4042, 40, 2),
 (4042, 5))

## Baseline

### Last Day

In [10]:
from baseline import BaselineLastDayModel

In [11]:
# Last-day baseline: produce a 5-day forecast for every window of each split.
model = BaselineLastDayModel()
y_train_pred, y_val_pred, y_test_pred = (
    model.predict(inputs, ndays=5) for inputs in (X_train, X_val, X_test)
)
# Show the prediction shapes so they can be compared with the target shapes above.
y_train_pred.shape, y_val_pred.shape, y_test_pred.shape

((47094, 5), (4042, 5), (4042, 5))

In [12]:
from common import mse, mae

In [13]:
model.evaluate_model(y_true=y_train, y_pred=y_train_pred, metric=mse)

(array([0.00857969, 0.01713958, 0.02712367, 0.03552309, 0.04417002],
       dtype=float32),
 0.02650721)

In [14]:
model.evaluate_model(y_true=y_val, y_pred=y_val_pred, metric=mse)

(array([0.01045789, 0.02028568, 0.03077288, 0.04323151, 0.05812694],
       dtype=float32),
 0.032574978)

In [15]:
y_test

array([[ 9.905915  ,  9.836011  ,  9.98318   ,  9.898558  ,  9.964784  ],
       [ 9.836011  ,  9.98318   ,  9.898558  ,  9.964784  ,  9.652048  ],
       [ 9.98318   ,  9.898558  ,  9.964784  ,  9.652048  ,  9.49752   ],
       ...,
       [-0.1274993 , -0.11877347, -0.11877347, -0.11836152, -0.12259334],
       [-0.11877347, -0.11877347, -0.11836152, -0.12259334, -0.12667538],
       [-0.11877347, -0.11836152, -0.12259334, -0.12667538, -0.1274993 ]],
      dtype=float32)

In [16]:
def inverse_y(y, feature='Close', processor=None):
    """Map a block of scaled multi-day values back through the processor's inverse transform.

    Each column of ``y`` (one forecast day) is passed separately to
    ``processor.inverse_transform(column, feature)``, and the first column of
    the 2-D result it returns is stored in the output.

    Args:
        y: 2-D array indexed as ``[sample, day]``.
        feature: Name of the scaled column whose transform is undone. Defaults
            to ``'Close'``, the value that used to be hard-coded.
        processor: Object exposing ``inverse_transform(values, feature)``.
            Defaults to the notebook-level ``data_processor``.

    Returns:
        A new float64 array with the same shape as ``y``.
    """
    if processor is None:
        # Resolved at call time so the notebook's current processor is used.
        processor = data_processor
    restored = np.zeros(y.shape)
    for day in range(y.shape[1]):
        restored[:, day] = processor.inverse_transform(y[:, day], feature)[:, 0]
    return restored

In [17]:
# Despite the "_scaled" suffix, inverse_y sends the values back through the 'Close'
# inverse transform, so both arrays hold prices in their original units.
y_test_scaled = inverse_y(y_test)
y_test_pred_scaled = inverse_y(y_test_pred)
# Per-day and mean MSE of the last-day baseline on the test split, in price units.
model.evaluate_model(y_test_scaled, y_test_pred_scaled, mse)

(array([1302.00112662, 2284.88861635, 3279.70038298, 3923.86798261,
        4452.61393196]),
 3048.614408103344)

### Moving Average

In [18]:
from baseline import BaselineMAModel

In [19]:
# Moving-average baseline: produce a 5-day forecast for every window of each split.
model = BaselineMAModel()
y_train_pred, y_val_pred, y_test_pred = (
    model.predict(inputs, ndays=5) for inputs in (X_train, X_val, X_test)
)
# Show the prediction shapes for the three splits.
y_train_pred.shape, y_val_pred.shape, y_test_pred.shape

((47094, 5), (4042, 5), (4042, 5))

In [20]:
model.evaluate_model(y_true=y_train, y_pred=y_train_pred, metric=mse)

(array([0.03336056, 0.0394291 , 0.04570048, 0.05191565, 0.05828492]),
 0.04573814135792863)

In [21]:
model.evaluate_model(y_true=y_val, y_pred=y_val_pred, metric=mse)

(array([0.04430977, 0.05216096, 0.06133201, 0.07132456, 0.08231013]),
 0.06228748496916357)

In [22]:
# inverse_y undoes the 'Close' scaling, so these "_scaled" arrays are actually in
# original price units (the names are kept because other cells reuse them).
y_test_scaled = inverse_y(y_test)
y_test_pred_scaled = inverse_y(y_test_pred)
# Per-day and mean MSE of the moving-average baseline on the test split, in price units.
model.evaluate_model(y_test_scaled, y_test_pred_scaled, mse)

(array([2688.92233386, 3142.65854376, 3560.8380593 , 3928.23160023,
        4266.91285109]),
 3517.5126776461357)

## Deep Learning

### LSTM

In [23]:
from lstm import LSTMStockModel

In [24]:
import torch.nn as nn
import torch.optim as optim

In [25]:
# model = LSTMStockModel(optimizer_class=optim.Adam, optimizer_params=None,
 #                      input_dim=2, hidden_dim=64, fc_dim=32, output_dim=5)
# model.train_model(X_train, y_train, loss_fn=nn.MSELoss(), num_epochs=14, lr=1e-4, batch_size=16, X_val=X_val, y_val=y_val)

In [26]:
# model.train_model(X_train, y_train, loss_fn=nn.MSELoss(), num_epochs=6, lr=5e-5, batch_size=16, X_val=X_val, y_val=y_val)

In [27]:
# y_train_pred = model.predict(X_train)
# y_val_pred = model.predict(X_val)
# y_test_pred = model.predict(X_test)
# y_train_pred.shape, y_val_pred.shape, y_test_pred.shape

In [28]:
# model.evaluate_model(y_true=y_train, y_pred=y_train_pred, metric=mse)

In [29]:
# model.evaluate_model(y_true=y_val, y_pred=y_val_pred, metric=mse)

In [30]:
# y_test_scaled = inverse_y(y_test)
# y_test_pred_scaled = inverse_y(y_test_pred)
# model.evaluate_model(y_test_scaled, y_test_pred_scaled, mse)

In [31]:
# model.save_model(file_path='lstm_model_checkpoint.pth')

### Transformer

In [32]:
from transformer import StandardTransformerModel

In [36]:
# Fix the RNG state before the weights are created so that initialisation (and any
# torch- or numpy-driven shuffling during training) is repeatable on Restart & Run All.
import torch

SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# input_dim=2 matches the two selected features (Close, Volume); ndays=5 matches the
# 5-step target window. d_model=64 is shared across nhead=8 attention heads.
# max_len=70 is presumably the positional-encoding capacity (the input window is 40) — TODO confirm.
model = StandardTransformerModel(optimizer_class=optim.Adam, optimizer_params={"betas": (0.9, 0.999), "eps": 1e-8},
                                 input_dim=2, d_model=64, nhead=8,
                                 num_encoder_layers=4, dim_ff=256, dropout=0.1,
                                 max_len=70, output_dim=1, ndays=5)

In [37]:
model

StandardTransformerModel(
  (input_linear): Linear(in_features=2, out_features=64, bias=True)
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attention): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
        )
        (linear1): Linear(in_features=64, out_features=256, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=256, out_features=64, bias=True)
        (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
        (activation): ReLU()
      )
    )
    (norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
  )
  (fc): Linear(in_feat

In [38]:
# Train for 30 epochs with MSE loss; the log below reports train and validation loss per epoch.
# Schedule arithmetic: 5 warm-up epochs + cosine T_max=25 add up to the 30 epochs requested
# (how train_model chains the two phases lives in the model class — TODO confirm).
model.train_model(X=X_train, y=y_train, loss_fn=nn.MSELoss(), num_epochs=30, lr=5e-4, batch_size=32, 
                   X_val=X_val, y_val=y_val, use_warmup=True, warmup_epochs=5, 
                   scheduler_type="cosine", scheduler_params={"T_max": 25})

Epoch 1/30, Train Loss: 8.8903, Val Loss: 16.3198
Epoch 2/30, Train Loss: 3.7751, Val Loss: 5.6567
Epoch 3/30, Train Loss: 0.8137, Val Loss: 2.0381
Epoch 4/30, Train Loss: 0.5411, Val Loss: 1.2625
Epoch 5/30, Train Loss: 0.4841, Val Loss: 1.0606
Epoch 6/30, Train Loss: 0.4662, Val Loss: 0.9988
Epoch 7/30, Train Loss: 0.4072, Val Loss: 3.4214
Epoch 8/30, Train Loss: 0.3619, Val Loss: 0.5538
Epoch 9/30, Train Loss: 0.3557, Val Loss: 0.6814
Epoch 10/30, Train Loss: 0.2895, Val Loss: 0.5817
Epoch 11/30, Train Loss: 0.2829, Val Loss: 0.4861
Epoch 12/30, Train Loss: 0.2576, Val Loss: 0.2524
Epoch 13/30, Train Loss: 0.2586, Val Loss: 0.7558
Epoch 14/30, Train Loss: 0.1956, Val Loss: 0.2292
Epoch 15/30, Train Loss: 0.1895, Val Loss: 0.7159
Epoch 16/30, Train Loss: 0.1866, Val Loss: 0.2451
Epoch 17/30, Train Loss: 0.1717, Val Loss: 0.1889
Epoch 18/30, Train Loss: 0.1632, Val Loss: 0.2530
Epoch 19/30, Train Loss: 0.1702, Val Loss: 0.3018
Epoch 20/30, Train Loss: 0.1431, Val Loss: 0.2377
Epoch 21

In [39]:
model.save_model(file_path = 'std_transformer_model_checkpoint2.pth')

In [40]:
# Run the trained transformer over every split, in train/val/test order.
y_train_pred, y_val_pred, y_test_pred = (
    model.predict(inputs) for inputs in (X_train, X_val, X_test)
)
# Show the prediction shapes for the three splits.
y_train_pred.shape, y_val_pred.shape, y_test_pred.shape

((47094, 5), (4042, 5), (4042, 5))

In [45]:
def evaluate_model(y_true, y_pred, metric):
    """Score multi-day forecasts one horizon column at a time.

    Args:
        y_true: 2-D array of targets indexed as ``[sample, day]``.
        y_pred: 2-D array of predictions with the same layout.
        metric: Callable ``metric(true_column, pred_column)`` returning a score.

    Returns:
        A tuple ``(per_day_scores, mean_score)`` where ``per_day_scores`` is an
        array with one entry per column of ``y_true``.
    """
    scores = []
    for day in range(y_true.shape[1]):
        scores.append(metric(y_true[:, day], y_pred[:, day]))
    per_day = np.array(scores)
    return per_day, per_day.mean()


In [46]:
evaluate_model(y_true=y_train, y_pred=y_train_pred, metric=mse)

(array([0.0856821 , 0.09238195, 0.10008991, 0.10774764, 0.11288454],
       dtype=float32),
 0.09975723)

In [47]:
evaluate_model(y_true=y_val, y_pred=y_val_pred, metric=mse)

(array([0.18937111, 0.1960636 , 0.20420927, 0.21525136, 0.2238119 ],
       dtype=float32),
 0.20574144)

In [48]:
# inverse_y undoes the 'Close' scaling: despite the "_scaled" suffix these arrays are in
# original price units. The names are kept because the display cells below read them.
y_test_scaled = inverse_y(y_test)
y_test_pred_scaled = inverse_y(y_test_pred)
# NOTE(review): the test windows mix tickers at very different price levels (roughly 2800
# vs 90 in the arrays displayed below), so this aggregate MSE is dominated by the
# high-priced series — consider a per-ticker or relative metric when comparing models.
evaluate_model(y_test_scaled, y_test_pred_scaled, mse)

(array([12067.29438019, 11988.21781021, 11714.52501283, 11376.79288705,
        11124.99082332]),
 11654.364182718384)

In [49]:
y_test_scaled

array([[2769.5090332 , 2750.84277344, 2790.14038086, 2767.54418945,
        2785.22827148],
       [2750.84277344, 2790.14038086, 2767.54418945, 2785.22827148,
        2701.72045898],
       [2790.14038086, 2767.54418945, 2785.22827148, 2701.72045898,
        2660.45776367],
       ...,
       [  90.34999847,   92.68000031,   92.68000031,   92.79000092,
          91.66000366],
       [  92.68000031,   92.68000031,   92.79000092,   91.66000366,
          90.56999969],
       [  92.68000031,   92.79000092,   91.66000366,   90.56999969,
          90.34999847]])

In [50]:
y_test_pred_scaled

array([[2814.45874023, 2824.73510742, 2818.99926758, 2819.80078125,
        2825.9387207 ],
       [2821.53198242, 2832.78466797, 2826.58764648, 2827.56884766,
        2834.58178711],
       [2813.88061523, 2824.76464844, 2817.74291992, 2818.93847656,
        2826.34228516],
       ...,
       [  67.301651  ,   67.67636871,   68.08278656,   70.71026611,
          70.17713928],
       [  64.62723541,   65.60940552,   65.54699707,   67.16470337,
          67.42545319],
       [  61.63510132,   62.5256958 ,   62.5632515 ,   63.13911819,
          63.89462662]])