In [22]:
import pandas as pd
import numpy as np

In [23]:
df = pd.read_csv("../data/full_sentiment_dataset_cleaned.csv")
df.head()

Unnamed: 0,ticker,date,text,positive,neutral,negative,price,volume,volatility_10d,volatility_30d,return_1d,return_2d,return_3d,ticker_encoded,price_volatility_ratio,volume_volatility_ratio,volatility_diff,day_of_week
0,21CF,2017-01-31,['RT @21CF: 21CF internal memo from Executive ...,0.000167,0.999619,0.000214,31.38,5170587.0,16.864,14.768,-0.031867,-0.286807,-0.002868,0,2.124865,350121.004875,2.096,1
1,21CF,2017-04-29,['RT @21CF: Read what @Gotham star @ben_mckenz...,0.000117,0.999737,0.000147,30.54,6681951.0,17.751,16.189,0.0,0.065488,0.006549,0,1.886466,412746.370993,1.562,5
2,ASOS,2017-01-31,['RT @n76seary: RT @StudentBunker: #FreebieFri...,0.000176,0.999649,0.000175,5266.0,342823.0,32.807,28.367,-1.253323,0.835549,0.008355,1,185.638242,12085.275144,4.44,1
3,ASOS,2017-02-01,['ASOS SALON Pretty Floral Soft Midi with Embe...,0.000231,0.999595,0.000175,5267.0,301346.0,26.819,28.35,-0.018986,-1.272071,0.008164,1,185.784832,10629.488536,-1.531,2
4,ASOS,2017-02-28,"['GUADALUPE PASS AMOS,TX (GDP) ASOS reports gu...",0.222588,0.626411,0.151001,5432.0,608408.0,10.72,21.31,-0.257732,-1.896171,-0.018962,1,254.903801,28550.351947,-10.59,1


In [63]:
seq = []

for _, group in df.groupby('ticker'):
    group = group.reset_index(drop = True)
    for i in range(len(group)):
        required_cols = ['positive', 'neutral', 'negative', 'price', 'volume', 'volatility_10d', 'volatility_30d']
        if not set(required_cols).issubset(group.columns):
            continue
    
        sentiment = group.loc[i, ['positive', 'negative', 'neutral']].values
        ticker_id = group.loc[i, 'ticker_encoded']
        market_data = group.loc[i, ['price', 'volume', 'volatility_10d', 'volatility_30d', "price_volatility_ratio", "volume_volatility_ratio",	"volatility_diff", "day_of_week"]].values
        
        if pd.isnull(sentiment).any() or pd.isnull(market_data).any():
            continue

        final_features = np.concatenate([sentiment, [ticker_id], market_data])
        
        if i + 1 < len(group):
            target = group.loc[i + 1, ['return_1d']].values
            if not np.any(pd.isnull(target)):
                seq.append((final_features, target))

In [64]:
X = np.array([x[0] for x in seq], dtype=np.float32)
y = np.array([x[1] for x in seq], dtype=np.float32)


In [65]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [66]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [67]:
X_scale = StandardScaler()
y_scale = StandardScaler()

X_train_scale = X_scale.fit_transform(X_train)
X_test_scale = X_scale.transform(X_test)

y_train_scale = y_scale.fit_transform(y_train)
y_test_scale = y_scale.transform(y_test)

In [68]:
import torch
from torch.utils.data import Dataset, DataLoader

In [69]:
class SentimentDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32).unsqueeze(1)
        self.y = torch.tensor(y, dtype=torch.float32)
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, id):
        return self.X[id], self.y[id]

In [70]:
train_dataset = SentimentDataset(X_train_scale, y_train_scale)
test_dataset = SentimentDataset(X_test_scale, y_test_scale)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [71]:
import torch.nn as nn

In [72]:
class GRU(nn.Module):
    def __init__(self, input = 12, hidden = 64, num_layers = 1, output = 1, dropout = 0.2):
        super(GRU, self).__init__()
        self.gru = nn.GRU(input, hidden, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0)
        self.fc = nn.Linear(hidden, output)
        
    def forward(self, x):
        _, hn = self.gru(x)
        out = self.fc(hn[-1])
        return out

In [73]:
from itertools import product

In [74]:
param_grid = {
    'hidden_size': [64, 32],
    'num_layers': [1, 2, 3],
    'lr': [0.001],
    'dropout': [0.0, 0.3]
}

param_combos = list(product(*param_grid.values()))

In [75]:
from sklearn.metrics import r2_score
import gc

In [77]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
best_model = None
best_score = float('-inf')

for hidden_size, num_layers, lr, dropout in param_combos:
    print(f"Trying: hidden={hidden_size}, layers={num_layers}, lr={lr}, dropout={dropout}")
    model = GRU(input = 12, hidden=hidden_size, num_layers=num_layers, dropout=dropout).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.MSELoss()
    
    for epoch in range(30):
        model.train()
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            pred = model(X_batch)
            loss = loss_fn(pred, y_batch)
            loss.backward()
            optimizer.step()
            
    model.eval()
    preds, target = [], []
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch = X_batch.to(device)
            pred = model(X_batch).cpu().numpy()
            preds.extend(pred)
            target.extend(y_batch.numpy())
            
    preds = np.array(preds)
    target = np.array(target)
    preds_orig = y_scale.inverse_transform(preds)
    targets_orig = y_scale.inverse_transform(target)
    
    r2 = r2_score(target, preds)
    print(f"→ R²: {r2:.4f}")
    if r2 > best_score:
        best_score = r2
        best_model = model
    del model
    gc.collect()
    torch.cuda.empty_cache()

Trying: hidden=64, layers=1, lr=0.001, dropout=0.0
→ R²: -0.0024
Trying: hidden=64, layers=1, lr=0.001, dropout=0.3
→ R²: -0.0016
Trying: hidden=64, layers=2, lr=0.001, dropout=0.0
→ R²: -0.0019
Trying: hidden=64, layers=2, lr=0.001, dropout=0.3
→ R²: -0.0011
Trying: hidden=64, layers=3, lr=0.001, dropout=0.0
→ R²: -0.0132
Trying: hidden=64, layers=3, lr=0.001, dropout=0.3
→ R²: -0.0025
Trying: hidden=32, layers=1, lr=0.001, dropout=0.0
→ R²: -0.0026
Trying: hidden=32, layers=1, lr=0.001, dropout=0.3
→ R²: -0.0028
Trying: hidden=32, layers=2, lr=0.001, dropout=0.0
→ R²: -0.0027
Trying: hidden=32, layers=2, lr=0.001, dropout=0.3
→ R²: 0.0004
Trying: hidden=32, layers=3, lr=0.001, dropout=0.0
→ R²: -0.0047
Trying: hidden=32, layers=3, lr=0.001, dropout=0.3
→ R²: -0.0006


In [78]:
preds = np.array(preds)
target = np.array(target)

preds_orig = y_scale.inverse_transform(preds)
targets_orig = y_scale.inverse_transform(target)

print("R² (orig):", r2_score(targets_orig, preds_orig))

R² (orig): -0.0005710124969482422
