In [1]:
import sys
sys.path.append('..')
from data.process import StockDataProcessor
import data.config as dataconf

In [2]:
import numpy as np

In [3]:
# Single processor instance shared by every later cell; the load, clean, scale
# and split calls below all mutate or read this object's state.
data_processor = StockDataProcessor()

In [4]:
# Path is relative to the notebook's directory (same '..' root that was
# appended to sys.path for the project imports).
storage_path = '../data/raw.csv'
# Populates data_processor.df for the tickers listed in the data config.
data_processor.load_raw_to_df(storage_path, dataconf.tickers)

In [5]:
# Inspect the raw frame right after loading (OHLCV columns plus Ticker and
# Collect Date, one row per ticker per day, as shown in the output).
data_processor.df

Unnamed: 0,Open,High,Low,Close,Volume,Ticker,Collect Date
0,16.100000,16.396667,15.942000,16.312668,92439000.0,TSLA,2019-10-01
1,16.219334,16.309999,15.962000,16.208668,84471000.0,TSLA,2019-10-02
2,15.457333,15.632000,14.952000,15.535333,226267504.0,TSLA,2019-10-03
3,15.440667,15.652000,15.204667,15.428667,119925000.0,TSLA,2019-10-04
4,15.320000,15.904000,15.236667,15.848000,120963000.0,TSLA,2019-10-07
...,...,...,...,...,...,...,...
65560,2894.000000,2931.500000,2863.000000,2882.500000,24980800.0,7203.T,2025-01-24
65561,2913.500000,2941.500000,2910.500000,2922.000000,18257200.0,7203.T,2025-01-27
65562,2900.000000,2936.000000,2889.500000,2889.500000,18314000.0,7203.T,2025-01-28
65563,2917.000000,2936.500000,2898.500000,2930.000000,17997800.0,7203.T,2025-01-29


In [6]:
# Columns used as model inputs; 'Close' is also the prediction target later on.
features = ['Close', 'Volume']
# The three calls below update data_processor.df in place and are order
# dependent: restrict/fill to the configured date range, scale, then keep only
# the selected feature columns. Re-running this cell without re-loading the raw
# data would apply them a second time to already-processed data.
data_processor.handle_missing_data(dataconf.start_date, dataconf.end_date, features=features)
# NOTE(review): scaling happens before the train/val/test split; if scale()
# fits its statistics on the whole date range, val/test information leaks into
# training -- confirm against StockDataProcessor.scale.
data_processor.scale(features=features)
data_processor.select_feature(features=features)

In [7]:
# Same attribute as displayed earlier, but now after the in-place preprocessing
# above: only Close/Volume (scaled) plus Collect Date and Ticker remain.
data_processor.df

Unnamed: 0,Close,Volume,Collect Date,Ticker
67,-0.361416,7.191827,2020-01-01,TSLA
68,-0.358438,6.637111,2020-01-02,TSLA
69,-0.355254,12.707194,2020-01-03,TSLA
70,-0.353125,7.079427,2020-01-06,TSLA
71,-0.348751,12.783453,2020-01-07,TSLA
...,...,...,...,...
65538,10.636154,2.325211,2024-12-25,7203.T
65539,11.300889,2.760062,2024-12-26,7203.T
65540,11.473159,2.040668,2024-12-27,7203.T
65541,11.315869,0.809383,2024-12-30,7203.T


In [8]:
# Turn the processed frame into supervised windows: 40 input steps of the two
# features, 5 target steps of 'Close', with 10% each requested for validation
# and test.
(
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
) = data_processor.split_train_val_test(
    window_for_x=40,
    window_for_y=5,
    val_size=0.1,
    test_size=0.1,
    features=features,
    target_col='Close',
)

In [9]:
# Sanity check: report the shape of every split array in (X, y) order for
# train, validation and test.
tuple(
    split.shape
    for split in (X_train, y_train, X_val, y_val, X_test, y_test)
)

((47094, 40, 2),
 (47094, 5),
 (4042, 40, 2),
 (4042, 5),
 (4042, 40, 2),
 (4042, 5))

## Baseline

### Last Day

In [10]:
from baseline import BaselineLastDayModel

In [11]:
# The last-day baseline is used without any fitting step: build it and ask for
# a 5-day forecast on each split, in train / val / test order.
model = BaselineLastDayModel()
y_train_pred, y_val_pred, y_test_pred = (
    model.predict(inputs, ndays=5) for inputs in (X_train, X_val, X_test)
)
# Display the prediction shapes so they can be compared with the y_* targets.
tuple(pred.shape for pred in (y_train_pred, y_val_pred, y_test_pred))

((47094, 5), (4042, 5), (4042, 5))

In [12]:
from common import mse, mae

In [13]:
# Last-day baseline, training split, MSE on the scaled targets. The output is
# a pair: one error per forecast day (5 values) and a scalar that equals the
# mean of those five.
model.evaluate_model(y_true=y_train, y_pred=y_train_pred, metric=mse)

(array([0.00857969, 0.01713958, 0.02712367, 0.03552309, 0.04417002],
       dtype=float32),
 0.02650721)

In [14]:
# Same evaluation on the validation split (still in scaled units).
model.evaluate_model(y_true=y_val, y_pred=y_val_pred, metric=mse)

(array([0.01045789, 0.02028568, 0.03077288, 0.04323151, 0.05812694],
       dtype=float32),
 0.032574978)

In [15]:
# Peek at the scaled test targets. Consecutive rows are overlapping 5-step
# windows shifted by one step, as the repeated values in the output show.
y_test

array([[ 9.905915  ,  9.836011  ,  9.98318   ,  9.898558  ,  9.964784  ],
       [ 9.836011  ,  9.98318   ,  9.898558  ,  9.964784  ,  9.652048  ],
       [ 9.98318   ,  9.898558  ,  9.964784  ,  9.652048  ,  9.49752   ],
       ...,
       [-0.1274993 , -0.11877347, -0.11877347, -0.11836152, -0.12259334],
       [-0.11877347, -0.11877347, -0.11836152, -0.12259334, -0.12667538],
       [-0.11877347, -0.11836152, -0.12259334, -0.12667538, -0.1274993 ]],
      dtype=float32)

In [16]:
def inverse_y(y, target_col='Close', processor=None):
    """Undo the processor's scaling on a 2-D array of target windows.

    Each column of ``y`` (one forecast step) is passed separately through
    ``processor.inverse_transform(column, target_col)``, and the first column
    of whatever that call returns is written back into the result.

    Parameters
    ----------
    y : array-like with a 2-D ``shape``
        Scaled values laid out as (samples, forecast steps).
    target_col : str, default 'Close'
        Column name forwarded to ``inverse_transform``. The default keeps the
        original behaviour of always inverting the 'Close' scaling.
    processor : object with ``inverse_transform(values, col)``, optional
        Processor to use. When omitted, the notebook-level ``data_processor``
        is used, exactly as before.

    Returns
    -------
    numpy.ndarray
        Float64 array with the same shape as ``y``.

    Raises
    ------
    ValueError
        If ``y`` is not 2-D.
    """
    if processor is None:
        # Fall back to the notebook-global processor so existing calls of the
        # form inverse_y(y) keep working unchanged.
        processor = data_processor
    if len(y.shape) != 2:
        raise ValueError(
            f"inverse_y expects a 2-D array (samples, steps), got shape {tuple(y.shape)}"
        )

    ndays = y.shape[1]
    y_new = np.zeros(y.shape)
    for i in range(ndays):
        # inverse_transform returns a 2-D result; only its first column is kept.
        y_new[:, i] = processor.inverse_transform(y[:, i], target_col)[:, 0]
    return y_new

In [17]:
# Test-set score for the last-day baseline. Note that, despite the `_scaled`
# suffix, both arrays are the outputs of inverse_y, i.e. values that have been
# passed back through data_processor.inverse_transform for 'Close'.
y_test_scaled, y_test_pred_scaled = map(inverse_y, (y_test, y_test_pred))
model.evaluate_model(y_test_scaled, y_test_pred_scaled, mse)

(array([1302.00112662, 2284.88861635, 3279.70038298, 3923.86798261,
        4452.61393196]),
 3048.614408103344)

### Moving Average

In [18]:
from baseline import BaselineMAModel

In [19]:
# Rebinds `model` to the moving-average baseline (the last-day baseline object
# is no longer reachable after this cell) and forecasts 5 days for each split.
model = BaselineMAModel()
y_train_pred, y_val_pred, y_test_pred = [
    model.predict(inputs, ndays=5) for inputs in (X_train, X_val, X_test)
]
# Show the three prediction shapes for a quick comparison with the targets.
tuple(pred.shape for pred in (y_train_pred, y_val_pred, y_test_pred))

((47094, 5), (4042, 5), (4042, 5))

In [20]:
# Moving-average baseline, training split, MSE in scaled units
# (per-forecast-day errors followed by their mean).
model.evaluate_model(y_true=y_train, y_pred=y_train_pred, metric=mse)

(array([0.03336056, 0.0394291 , 0.04570048, 0.05191565, 0.05828492]),
 0.04573814135792863)

In [21]:
# Moving-average baseline on the validation split (scaled units).
model.evaluate_model(y_true=y_val, y_pred=y_val_pred, metric=mse)

(array([0.04430977, 0.05216096, 0.06133201, 0.07132456, 0.08231013]),
 0.06228748496916357)

In [22]:
# Moving-average baseline on the test split. The `_scaled` names actually hold
# inverse_y outputs (scaling undone for 'Close'), so this MSE is not comparable
# with the scaled-unit train/val numbers above.
y_test_scaled, y_test_pred_scaled = map(inverse_y, (y_test, y_test_pred))
model.evaluate_model(y_test_scaled, y_test_pred_scaled, mse)

(array([2688.92233386, 3142.65854376, 3560.8380593 , 3928.23160023,
        4266.91285109]),
 3517.5126776461357)

## Deep Learning

### LSTM

In [23]:
from lstm import LSTMStockModel

In [24]:
import torch.nn as nn

In [25]:
# Seed the RNGs before the model is built so that weight initialisation (and
# presumably any batch shuffling inside train_model) repeats on a fresh
# Restart & Run All. Some GPU kernels can still be non-deterministic.
import torch

np.random.seed(42)
torch.manual_seed(42)

# input_dim matches the two features (Close, Volume); output_dim matches the
# 5-step target window produced by the split above.
model = LSTMStockModel(input_dim=2, hidden_dim=64, fc_dim=32, output_dim=5)
model.train_model(
    X_train,
    y_train,
    loss_fn=nn.MSELoss(),
    num_epochs=14,
    lr=1e-4,
    batch_size=16,
    X_val=X_val,
    y_val=y_val,
)

Epoch 1/14, Train Loss: 4.0808, Val Loss: 1.5243
Epoch 2/14, Train Loss: 0.1423, Val Loss: 0.1939
Epoch 3/14, Train Loss: 0.0479, Val Loss: 0.1040
Epoch 4/14, Train Loss: 0.0375, Val Loss: 0.1242
Epoch 5/14, Train Loss: 0.0343, Val Loss: 0.0722
Epoch 6/14, Train Loss: 0.0324, Val Loss: 0.0651
Epoch 7/14, Train Loss: 0.0321, Val Loss: 0.0583
Epoch 8/14, Train Loss: 0.0307, Val Loss: 0.0548
Epoch 9/14, Train Loss: 0.0302, Val Loss: 0.0562
Epoch 10/14, Train Loss: 0.0310, Val Loss: 0.0524
Epoch 11/14, Train Loss: 0.0297, Val Loss: 0.0414
Epoch 12/14, Train Loss: 0.0292, Val Loss: 0.0573
Epoch 13/14, Train Loss: 0.0300, Val Loss: 0.0496
Epoch 14/14, Train Loss: 0.0290, Val Loss: 0.0407


In [27]:
# Forecast every split with the trained LSTM (train / val / test order); the
# horizon is fixed by the model's output size, so no ndays argument is passed.
y_train_pred, y_val_pred, y_test_pred = (
    model.predict(inputs) for inputs in (X_train, X_val, X_test)
)
tuple(pred.shape for pred in (y_train_pred, y_val_pred, y_test_pred))

((47094, 5), (4042, 5), (4042, 5))

In [28]:
# LSTM (hidden_dim=64), training split, MSE in scaled units -- directly
# comparable with the baseline train numbers computed the same way above.
model.evaluate_model(y_true=y_train, y_pred=y_train_pred, metric=mse)

(array([0.00995759, 0.0181122 , 0.02698571, 0.03549696, 0.04302435],
       dtype=float32),
 0.026715362)

In [29]:
# LSTM (hidden_dim=64) on the validation split (scaled units).
model.evaluate_model(y_true=y_val, y_pred=y_val_pred, metric=mse)

(array([0.01732958, 0.02678571, 0.03872032, 0.05230585, 0.06820323],
       dtype=float32),
 0.040668942)

In [30]:
# Test-set score for the first LSTM. As in the baseline cells, the `_scaled`
# variables hold inverse_y outputs, so the error is measured after the 'Close'
# scaling has been undone.
y_test_scaled, y_test_pred_scaled = map(inverse_y, (y_test, y_test_pred))
model.evaluate_model(y_test_scaled, y_test_pred_scaled, mse)

(array([1572.0989663 , 2264.01842375, 2978.48641041, 3575.78891908,
        3880.001606  ]),
 2854.07886510783)

In [31]:
# Re-seed so this second architecture starts from a repeatable state and does
# not depend on how much randomness earlier cells consumed. Some GPU kernels
# can still be non-deterministic.
import torch

np.random.seed(42)
torch.manual_seed(42)

# hidden_dim is given as a list here -- presumably one entry per stacked LSTM
# layer; confirm against LSTMStockModel. All other settings match the
# single-hidden_dim run so the two are comparable.
model = LSTMStockModel(input_dim=2, hidden_dim=[64, 64], fc_dim=32, output_dim=5)
model.train_model(
    X_train,
    y_train,
    loss_fn=nn.MSELoss(),
    num_epochs=14,
    lr=1e-4,
    batch_size=16,
    X_val=X_val,
    y_val=y_val,
)

Epoch 1/14, Train Loss: 4.1689, Val Loss: 2.5132
Epoch 2/14, Train Loss: 0.2043, Val Loss: 0.2881
Epoch 3/14, Train Loss: 0.0557, Val Loss: 0.0882
Epoch 4/14, Train Loss: 0.0438, Val Loss: 0.0599
Epoch 5/14, Train Loss: 0.0389, Val Loss: 0.0626
Epoch 6/14, Train Loss: 0.0363, Val Loss: 0.0622
Epoch 7/14, Train Loss: 0.0346, Val Loss: 0.0604
Epoch 8/14, Train Loss: 0.0331, Val Loss: 0.0556
Epoch 9/14, Train Loss: 0.0327, Val Loss: 0.0487
Epoch 10/14, Train Loss: 0.0318, Val Loss: 0.0626
Epoch 11/14, Train Loss: 0.0310, Val Loss: 0.0420
Epoch 12/14, Train Loss: 0.0313, Val Loss: 0.0430
Epoch 13/14, Train Loss: 0.0309, Val Loss: 0.0472
Epoch 14/14, Train Loss: 0.0303, Val Loss: 0.0419


In [32]:
# Forecast every split with the [64, 64] LSTM, overwriting the prediction
# arrays left by the previous model.
y_train_pred, y_val_pred, y_test_pred = [
    model.predict(inputs) for inputs in (X_train, X_val, X_test)
]
tuple(pred.shape for pred in (y_train_pred, y_val_pred, y_test_pred))

((47094, 5), (4042, 5), (4042, 5))

In [33]:
# LSTM (hidden_dim=[64, 64]), training split, MSE in scaled units.
model.evaluate_model(y_true=y_train, y_pred=y_train_pred, metric=mse)

(array([0.01010536, 0.01773989, 0.02682295, 0.03426706, 0.04239224],
       dtype=float32),
 0.026265498)

In [34]:
# LSTM (hidden_dim=[64, 64]) on the validation split (scaled units).
model.evaluate_model(y_true=y_val, y_pred=y_val_pred, metric=mse)

(array([0.01561623, 0.02528864, 0.0399519 , 0.05403223, 0.07466817],
       dtype=float32),
 0.04191143)

In [35]:
# Test-set score for the [64, 64] LSTM, computed on inverse_y outputs (the
# `_scaled` suffix is historical; the 'Close' scaling has been undone here).
y_test_scaled, y_test_pred_scaled = map(inverse_y, (y_test, y_test_pred))
model.evaluate_model(y_test_scaled, y_test_pred_scaled, mse)

(array([1373.47339079, 2277.33316374, 3180.96642062, 3801.5330759 ,
        4509.4346674 ]),
 3028.5481436910713)