<a href="https://colab.research.google.com/github/ucsd-cse-spis-2025/SPIS25-Lauren-Diana-Final_Project/blob/main/SPIS_Stock_MLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [109]:
import kagglehub
# Download latest version
path = kagglehub.dataset_download("camnugent/sandp500")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/sandp500


In [110]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

In [111]:
df = pd.read_csv("/kaggle/input/sandp500/all_stocks_5yr.csv")
df = df[df["Name"] == "AAPL"]

In [112]:
splitted = df['date'].str.split('-', expand=True)

df['day'] = splitted[2].astype('int')
df['month'] = splitted[1].astype('int')
df['year'] = splitted[0].astype('int')

df.head()

Unnamed: 0,date,open,high,low,close,volume,Name,day,month,year
1259,2013-02-08,67.7142,68.4014,66.8928,67.8542,158168416,AAPL,8,2,2013
1260,2013-02-11,68.0714,69.2771,67.6071,68.5614,129029425,AAPL,11,2,2013
1261,2013-02-12,68.5014,68.9114,66.8205,66.8428,151829363,AAPL,12,2,2013
1262,2013-02-13,66.7442,67.6628,66.1742,66.7156,118721995,AAPL,13,2,2013
1263,2013-02-14,66.3599,67.3771,66.2885,66.6556,88809154,AAPL,14,2,2013


In [113]:
# Feature engineering
df['open-close'] = df['open'] - df['close']
df['low-high'] = df['low'] - df['high']
df['daily_return'] = df['close'].pct_change()
df['is_quarter_end'] = np.where(df['month'] % 3 == 0, 1, 0)

In [114]:
#!pip install ta
import ta  # technical analysis library

df['rsi'] = ta.momentum.RSIIndicator(df['close'], window=14).rsi()
df['macd'] = ta.trend.MACD(df['close']).macd_diff()
df['bollinger_h'] = ta.volatility.BollingerBands(df['close']).bollinger_hband()
df['bollinger_l'] = ta.volatility.BollingerBands(df['close']).bollinger_lband()
df['ema_12'] = ta.trend.EMAIndicator(df['close'], window=12).ema_indicator()
df['ema_26'] = ta.trend.EMAIndicator(df['close'], window=26).ema_indicator()

In [115]:
# Drop NA from pct_change
df = df.dropna()

In [116]:
df.head()

Unnamed: 0,date,open,high,low,close,volume,Name,day,month,year,open-close,low-high,daily_return,is_quarter_end,rsi,macd,bollinger_h,bollinger_l,ema_12,ema_26
1292,2013-03-28,64.2599,64.5455,63.0885,63.2371,110698007,AAPL,28,3,2013,1.0228,-1.457,-0.020837,1,44.833238,0.228919,66.95703,59.28319,64.315191,64.223744
1293,2013-04-01,63.1285,63.3854,61.1057,61.2728,97294421,AAPL,1,4,2013,1.8557,-2.2797,-0.031062,0,37.318998,-0.016442,66.965937,59.251993,63.847131,64.005156
1294,2013-04-02,61.0857,62.5914,60.9142,61.3988,132350022,AAPL,2,4,2013,-0.3131,-1.6772,0.002056,0,38.036411,-0.160036,66.855163,59.501937,63.470464,63.812092
1295,2013-04-03,61.6242,62.4685,61.4728,61.7128,90719482,AAPL,3,4,2013,-0.0886,-0.9957,0.005114,0,39.883015,-0.219954,66.851137,59.518053,63.200054,63.656589
1296,2013-04-04,61.9657,62.1428,60.7499,61.1028,89589332,AAPL,4,4,2013,0.8629,-1.3929,-0.009884,0,37.542328,-0.282751,66.829799,59.568791,62.8774,63.46742


In [117]:
# Target: 1 if next day close is higher
df['target'] = np.where(df['close'].shift(-1) > df['close'], 1, 0)
df = df.dropna()

# Define feature columns
feature_cols = ['open-close', 'low-high', 'daily_return', 'volume', 'is_quarter_end']
X = df[feature_cols]
y = df['target']

In [118]:
# Scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [119]:
# Train/Test Split
train_size = int(len(X_scaled) * 0.8)
X_train, X_valid = X_scaled[:train_size], X_scaled[train_size:]
y_train, y_valid = y[:train_size], y[train_size:]

In [120]:
# Convert to tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)
X_valid_tensor = torch.tensor(X_valid, dtype=torch.float32)
y_valid_tensor = torch.tensor(y_valid.values, dtype=torch.float32).unsqueeze(1)

In [121]:
class StrongerMLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, x):
        return self.net(x)

model = StrongerMLP(input_dim=X_train.shape[1])


In [122]:
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)


In [140]:
epochs = 10
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

    with torch.no_grad():
        train_preds = torch.sigmoid(outputs)
        train_preds_cls = (train_preds > 0.5).float()
        train_acc = accuracy_score(y_train_tensor.numpy(), train_preds_cls.numpy())

    # Validation
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_valid_tensor)
        val_preds = torch.sigmoid(val_outputs)
        val_preds_cls = (val_preds > 0.5).float()
        val_acc = accuracy_score(y_valid_tensor, val_preds_cls)

    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}, Train Acc: {train_acc:.4f}, Val Accuracy: {val_acc:.4f}")


Epoch 1/10, Loss: 0.6716, Train Acc: 0.5796, Val Accuracy: 0.4837
Epoch 2/10, Loss: 0.6767, Train Acc: 0.5796, Val Accuracy: 0.4837
Epoch 3/10, Loss: 0.6676, Train Acc: 0.5990, Val Accuracy: 0.4797
Epoch 4/10, Loss: 0.6768, Train Acc: 0.5827, Val Accuracy: 0.4797
Epoch 5/10, Loss: 0.6752, Train Acc: 0.5816, Val Accuracy: 0.4797
Epoch 6/10, Loss: 0.6760, Train Acc: 0.5786, Val Accuracy: 0.4797
Epoch 7/10, Loss: 0.6716, Train Acc: 0.5704, Val Accuracy: 0.4837
Epoch 8/10, Loss: 0.6752, Train Acc: 0.5786, Val Accuracy: 0.4837
Epoch 9/10, Loss: 0.6707, Train Acc: 0.5898, Val Accuracy: 0.4837
Epoch 10/10, Loss: 0.6726, Train Acc: 0.5857, Val Accuracy: 0.4837


In [124]:
model.eval()
with torch.no_grad():
    preds = model(X_valid_tensor)
    preds_prob = torch.sigmoid(preds)
    preds_cls = (preds_prob > 0.5).float()

print("Classification Report:")
print(classification_report(y_valid_tensor, preds_cls))


Classification Report:
              precision    recall  f1-score   support

         0.0       0.48      0.10      0.17       118
         1.0       0.52      0.90      0.66       128

    accuracy                           0.52       246
   macro avg       0.50      0.50      0.41       246
weighted avg       0.50      0.52      0.42       246

