In [7]:
import numpy as np, pandas as pd
from sklearn.preprocessing import StandardScaler
from ISLP import load_data
import torch
from torch import nn
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import r2_score

In [8]:
NYSE = load_data('NYSE')
cols = ['DJ_return', 'log_volume', 'log_volatility']
X = pd.DataFrame(StandardScaler(
                     with_mean=True,
                     with_std=True).fit_transform(NYSE[cols]),
                 columns=NYSE[cols].columns,
                 index=NYSE.index)
for lag in range(1, 6):
    for col in cols:
        newcol = np.zeros(X.shape[0]) * np.nan
        newcol[lag:] = X[col].values[:-lag]
        X.insert(len(X.columns), "{0}_{1}".format(col, lag), newcol)
X.insert(len(X.columns), 'train', NYSE['train'])
X = X.dropna()
Y, train = X['log_volume'], X['train']
X = X.drop(columns=['train'] + cols)
X_day = pd.concat([X, 
                  pd.get_dummies(NYSE['day_of_week'])],
                  axis=1).dropna()
X_day.columns


Index(['DJ_return_1', 'log_volume_1', 'log_volatility_1', 'DJ_return_2',
       'log_volume_2', 'log_volatility_2', 'DJ_return_3', 'log_volume_3',
       'log_volatility_3', 'DJ_return_4', 'log_volume_4', 'log_volatility_4',
       'DJ_return_5', 'log_volume_5', 'log_volatility_5', 'mon', 'tues', 'wed',
       'thur', 'fri'],
      dtype='object')

In [9]:
train_idx = X_day.index[NYSE.loc[X_day.index, 'train'] == 1]
test_idx = X_day.index[NYSE.loc[X_day.index, 'train'] == 0]

X_train, X_test = X_day.loc[train_idx], X_day.loc[test_idx]
Y_train, Y_test = Y.loc[train_idx], Y.loc[test_idx]

num_lags = 5
num_numeric_cols = 3
numeric_lag_cols = [f'{col}_{lag}' for lag in range(1, num_lags+1) for col in ['DJ_return', 'log_volume', 'log_volatility']]

X_train_numeric = X_train[numeric_lag_cols].values.reshape(-1, num_lags, num_numeric_cols)
X_test_numeric = X_test[numeric_lag_cols].values.reshape(-1, num_lags, num_numeric_cols)

X_train_cat = np.repeat(X_train[['mon','tues','wed','thur','fri']].values[:, np.newaxis, :], num_lags, axis=1)
X_test_cat = np.repeat(X_test[['mon','tues','wed','thur','fri']].values[:, np.newaxis, :], num_lags, axis=1)

X_train_rnn = np.concatenate([X_train_numeric, X_train_cat], axis=2)
X_test_rnn = np.concatenate([X_test_numeric, X_test_cat], axis=2)

X_train_tensor = torch.tensor(X_train_rnn, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_rnn, dtype=torch.float32)
Y_train_tensor = torch.tensor(Y_train.values, dtype=torch.float32).unsqueeze(1)
Y_test_tensor = torch.tensor(Y_test.values, dtype=torch.float32).unsqueeze(1)

train_dataset = TensorDataset(X_train_tensor, Y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size=32):
        super(RNNModel, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)
    
    def forward(self, x):
        out, _ = self.rnn(x)
        out = out[:, -1, :]
        out = self.fc(out)
        return out

input_size = X_train_rnn.shape[2]
model = RNNModel(input_size)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

epochs = 50
for epoch in range(epochs):
    for xb, yb in train_loader:
        optimizer.zero_grad()
        preds = model(xb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()

model.eval()
with torch.no_grad():
    Y_pred = model(X_test_tensor).numpy()

r2 = r2_score(Y_test, Y_pred)
print("Test R^2:", r2)


Test R^2: 0.44066112086986786
