In [2]:
import pandas as pd

In [14]:
df = pd.read_csv("../data/full_sentiment_dataset_cleaned.csv")
df.head()

Unnamed: 0,date,ticker,positive,neutral,negative,return_1d,return_2d,return_3d,ticker_encoded
0,2017-01-31,21CF,0.000167,0.999619,0.000214,-0.000319,-0.002868,-0.002868,0
1,2017-02-01,21CF,0.0,0.0,0.0,,,,0
2,2017-02-02,21CF,0.0,0.0,0.0,,,,0
3,2017-02-03,21CF,0.0,0.0,0.0,,,,0
4,2017-02-04,21CF,0.0,0.0,0.0,,,,0


In [4]:
import numpy as np

In [63]:
window = 3
seq = []

for _, group in df.groupby('ticker'):
    group = group.reset_index(drop=True)
    for i in range(len(group)):
        end = min(i + window, len(group))
        sentiment = group.loc[i:end - 1, ['positive', 'neutral', 'negative']].values 
        
        if len(sentiment) < window:
            pad = window - len(sentiment)
            fill = np.zeros((pad, 3))
            sentiment = np.vstack((sentiment, fill))
            
        ticker_id = group.loc[i, 'ticker_encoded']
        ticker_feature = np.full((window, 1), ticker_id)
        features = np.hstack([sentiment, ticker_feature])
            
        if end < len(group):
            target = group.loc[end, ['return_1d', 'return_2d']].values
            
            if not np.any(pd.isnull(target)) and not np.all(features == 0) and not np.all(target == 0):
                seq.append((features, target))

In [64]:
X = np.array([x[0] for x in seq], dtype = np.float32)
y = np.array([x[1] for x in seq], dtype = np.float32)

In [65]:
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader

In [66]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42)

In [67]:
class SentimentDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, id):
        return self.X[id], self.y[id]

In [68]:
train_dataset = SentimentDataset(X_train, y_train)
test_dataset = SentimentDataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [69]:
import torch.nn as nn

In [70]:
class LSTM(nn.Module):
    """
    input_size: number of features (positive, negative, neutral, ticker_id)
    hidden_size: features in the hidden state
    num_layers: number of stacked LSTM layers
    """
    def __init__(self, input_size = 4, hidden_size = 128, num_layers = 2, output_size = 2, dropout = 0.3):
        super(LSTM, self).__init__()
        self.lstm = nn.LSTM(input_size = input_size,
                            hidden_size = hidden_size,
                            num_layers = num_layers,
                            batch_first = True,
                            dropout = dropout if num_layers > 1 else 0)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, x):
        _, (hn, _) = self.lstm(x)
        out = self.fc(hn[-1]) 
        return out

In [71]:
from itertools import product

param_grid = {
    'hidden_size': [32, 64],
    'num_layers': [1, 2],
    'lr': [0.001, 0.0005],
    'dropout': [0.0, 0.3]
}

param_combos = list(product(*param_grid.values()))

In [74]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

best_model = None
best_score = float('-inf')

for combo in param_combos:
    hidden_size, num_layers, lr, dropout = combo
    print(f"Trying combo: hidden={hidden_size}, layers={num_layers}, lr={lr}, dropout={dropout}")
    
    model = LSTM(input_size=4, hidden_size=hidden_size, num_layers=num_layers,
                      output_size=2, dropout=dropout).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr = lr)
    loss_fn = nn.MSELoss()
    
    for epoch in range(10):
        model.train()
    
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        
            optimizer.zero_grad()
            output = model(X_batch)
            loss = loss_fn(output, y_batch)
            loss.backward()
            optimizer.step()
            
    model.eval()
    all_preds, all_targets = [], []

    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch = X_batch.to(device)
            preds = model(X_batch).cpu().numpy()
            all_preds.extend(preds)
            all_targets.extend(y_batch.numpy())
            
    r2 = r2_score(all_targets, all_preds)
    print(f"→ R2 Score: {r2:.4f}")
    
    if r2 > best_score:
        best_score = r2
        best_model = model
        

Trying combo: hidden=32, layers=1, lr=0.001, dropout=0.0
→ R2 Score: -0.0839
Trying combo: hidden=32, layers=1, lr=0.001, dropout=0.3
→ R2 Score: -0.0507
Trying combo: hidden=32, layers=1, lr=0.0005, dropout=0.0
→ R2 Score: -0.0395
Trying combo: hidden=32, layers=1, lr=0.0005, dropout=0.3
→ R2 Score: -0.0656
Trying combo: hidden=32, layers=2, lr=0.001, dropout=0.0
→ R2 Score: -0.0130
Trying combo: hidden=32, layers=2, lr=0.001, dropout=0.3
→ R2 Score: -0.1131
Trying combo: hidden=32, layers=2, lr=0.0005, dropout=0.0
→ R2 Score: -0.0121
Trying combo: hidden=32, layers=2, lr=0.0005, dropout=0.3
→ R2 Score: -0.0901
Trying combo: hidden=64, layers=1, lr=0.001, dropout=0.0
→ R2 Score: -0.0361
Trying combo: hidden=64, layers=1, lr=0.001, dropout=0.3
→ R2 Score: -0.0503
Trying combo: hidden=64, layers=1, lr=0.0005, dropout=0.0
→ R2 Score: -0.0419
Trying combo: hidden=64, layers=1, lr=0.0005, dropout=0.3
→ R2 Score: -0.0293
Trying combo: hidden=64, layers=2, lr=0.001, dropout=0.0
→ R2 Score: -

In [75]:
print(best_score)

-0.008847231126532962
