In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd

Data preprocessing

In [2]:
path = "data/returns.csv"
returns = pd.read_csv(path, index_col=0, parse_dates=True)
returns.head()

Unnamed: 0_level_0,0,A,AA,AAL,AAP,AAPL,AAS,ABBV,ABC,ABMD,...,XOM,XRAY,XYL,YUM,ZBH,ZBRA,ZION,ZMH,ZMX,ZTS
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2001-01-02,0.0,-0.070776,-0.037313,0.0,0.0,0.0,-0.011139,0.0,0.0,-0.072165,...,0.025162,-0.003195,0.0,-0.032197,0.0,0.038682,-0.04004,0.0,0.0,0.0
2001-01-03,0.0,0.103194,0.013566,0.0,0.0,0.10084,0.020025,0.0,0.0,0.094444,...,-0.043478,-0.051282,0.0,0.056751,0.0,0.10767,0.033368,0.0,0.0,0.0
2001-01-04,0.0,0.037862,0.032505,0.0,0.0,0.041985,-0.082209,0.0,0.0,-0.081218,...,-0.027859,-0.064189,0.0,0.001852,0.0,-0.003995,-0.020182,0.0,-0.011628,0.0
2001-01-05,0.0,-0.054721,-0.018519,0.0,0.0,-0.040293,-0.034759,0.0,0.0,-0.104972,...,0.004525,0.01083,0.0,-0.031423,0.0,-0.029412,-0.011329,0.0,0.0,0.0
2001-01-08,0.0,-0.032917,0.015094,0.0,0.0,0.01145,0.060942,0.0,0.0,-0.185185,...,-0.004505,0.030357,0.0,-0.005725,0.0,-0.015152,0.002083,0.0,0.017647,0.0


additional cleaning

In [3]:
def mask_pre_ipo(df):
    # For each stock, treat leading zeros as NA
    df2 = df.copy()
    for col in df2:
        s = df2[col]
        first_nonzero = s.ne(0).idxmax()  # first non-zero return
        df2.loc[:first_nonzero, col] = np.nan
    return df2

def safe_rolling_zscore(df, window):
    rolling_mean = df.rolling(window).mean()
    rolling_std = df.rolling(window).std()

    # If std == 0 → return 0 instead of NaN or inf
    z = (df - rolling_mean) / rolling_std.replace(0, np.nan)
    z = z.fillna(0)

    return z


# 1. Mask pre-IPO zeros  
returns = mask_pre_ipo(returns)

# 2. Compute stable rolling z  
features = safe_rolling_zscore(returns, window=60)

# 3. Replace infinities / residual NaN  
features = features.replace([np.inf, -np.inf], 0).fillna(0)


In [6]:

returns = features

# Assume `returns` is a (T, N) pandas DataFrame, indexed by date
X_all = torch.tensor(returns.shift(1).iloc[1:].values, dtype=torch.float32)  # r_t
y_all = torch.tensor(returns.iloc[1:].values, dtype=torch.float32)           # r_{t+1}
T, N = X_all.shape

# --------------- Train/Test Split (preserve order) ---------------
split_idx = int(T * 0.8)
X_train, X_test = X_all[:split_idx], X_all[split_idx:]
y_train, y_test = y_all[:split_idx], y_all[split_idx:]

# --------------- Normalize using training set only ---------------

# mean = X_train.mean(0, keepdim=True)
# std = X_train.std(0, keepdim=True) + 1e-6
# X_train = (X_train - mean) / std
# y_train = (y_train - mean) / std
# X_test = (X_test - mean) / std
# y_test = (y_test - mean) / std

# --------------- Neural Network Definition ---------------
class ReturnPredictor(nn.Module):
    def __init__(self, N, hidden_dim=256, depth=3, dropout=0.1):
        super().__init__()
        layers = []
        in_dim = N
        for _ in range(depth):
            layers += [
                nn.Linear(in_dim, hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
                nn.BatchNorm1d(hidden_dim)
            ]
            in_dim = hidden_dim
        layers.append(nn.Linear(in_dim, N))  # output dimension N
        self.net = nn.Sequential(*layers)
    def forward(self, x):
        return self.net(x)

model = ReturnPredictor(N, hidden_dim=3, depth=2)
# Custom loss combining MSE and hit rate
def combined_loss(y_pred, y_true, mse_weight=0.5, hit_weight=100):
    # MSE component
    mse = torch.mean((y_pred - y_true) ** 2)
    
    # Hit rate component (penalize incorrect sign predictions)
    # We want to maximize hit rate, so we minimize (1 - hit_rate)
    correct_signs = (torch.sign(y_pred) == torch.sign(y_true)).float()
    hit_rate = torch.mean(correct_signs)
    hit_loss = 1 - hit_rate
    
    # Normalize MSE to be on similar scale as hit_loss (which is in [0, 1])
    # Use detach to avoid affecting gradients of the normalization factor
    mse_normalized = mse / (mse.detach() + 1e-8)
    
    # Combined loss with normalized components
    return mse_weight * mse_normalized + hit_weight * hit_loss

criterion = combined_loss
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# --------------- Training Loop ---------------
n_epochs = 100
batch_size = 32
T_train = len(X_train)

for epoch in range(n_epochs):
    total_loss = 0
    for i in range(0, T_train, batch_size):
        Xb = X_train[i:i+batch_size]
        yb = y_train[i:i+batch_size]
        optimizer.zero_grad()
        y_pred = model(Xb)
        loss = criterion(y_pred, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * len(Xb)
    if (epoch+1) % 10 == 0:
        avg_loss = total_loss / T_train
        print(f"Epoch {epoch+1:03d} | Train Loss {avg_loss:.6f}")

# --------------- Evaluation ---------------
with torch.no_grad():
    y_pred_train = model(X_train)
    y_pred_test = model(X_test)

def r2_score(y_true, y_pred):
    ss_res = ((y_true - y_pred)**2).sum().item()
    ss_tot = ((y_true)**2).sum().item()
    return 1 - ss_res / ss_tot

r2_train = r2_score(y_train, y_pred_train)
r2_test  = r2_score(y_test,  y_pred_test)

print(f"\nTrain R² = {r2_train:.4f}")
print(f" Test R² = {r2_test:.4f}")

# --------------- Convert predictions back to DataFrame ---------------
# y_hat_test = (y_pred_test * std + mean).numpy()
# pred_df = pd.DataFrame(
#     y_hat_test,
#     index=returns.index[1:][split_idx:],
#     columns=returns.columns
# )

Epoch 010 | Train Loss 61.328948
Epoch 020 | Train Loss 57.995537
Epoch 030 | Train Loss 56.978979
Epoch 040 | Train Loss 56.403842
Epoch 050 | Train Loss 55.906245
Epoch 060 | Train Loss 55.699430
Epoch 070 | Train Loss 55.802527
Epoch 080 | Train Loss 55.739613
Epoch 090 | Train Loss 55.585155
Epoch 100 | Train Loss 56.064026

Train R² = 0.0734
 Test R² = -0.0649


In [7]:
# --------------- Hit Rate Calculation ---------------
# Hit rate: percentage of times the predicted sign matches the actual sign
def hit_rate(y_true, y_pred):
    correct = ((y_true * y_pred) > 0).sum().item()
    total = y_true.numel()
    return correct / total

hit_rate_train = hit_rate(y_train, y_pred_train)
hit_rate_test = hit_rate(y_test, y_pred_test)

print(f"\nTrain Hit Rate = {hit_rate_train:.4f}")
print(f" Test Hit Rate = {hit_rate_test:.4f}")



Train Hit Rate = 0.4429
 Test Hit Rate = 0.3851
