In [1]:
# STEP 1: Mount Google Drive
import yfinance as yf
import os
import pandas as pd
import numpy as np

sp500_table = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
sp500 = sp500_table[0]  # The first table on the page
sp500tickers = sp500['Symbol'].tolist()

# Download grouped data
data = yf.download(
    tickers=sp500tickers,
    start="2020-01-01",
    end="2025-01-01",
    interval="1d",
    group_by="ticker",
    auto_adjust=False,
    progress=True
)

# Create directory to save files
save_path = "/content/drive/MyDrive/StockData"
os.makedirs(save_path, exist_ok=True)

df = data.stack(level=0).reset_index()
df.columns = ['Date','Ticker','Open','High','Low','Close','Adj Close','Volume']

output_path = os.path.join(save_path, "sp500_data.csv")
df.to_csv(output_path, index=False)

print(f"Saved to {output_path}")

[*********************100%***********************]  503 of 503 completed
ERROR:yfinance:
2 Failed downloads:
ERROR:yfinance:['BF.B']: YFPricesMissingError('possibly delisted; no price data found  (1d 2020-01-01 -> 2025-01-01)')
ERROR:yfinance:['BRK.B']: YFTzMissingError('possibly delisted; no timezone found')
  df = data.stack(level=0).reset_index()


Saved to /content/drive/MyDrive/StockData/sp500_data.csv


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from fastai.learner import Learner
from fastai.data.core import DataLoaders
from fastai.metrics import mse
from fastai.callback.all import *

torch.manual_seed(42)
np.random.seed(42)

df_cleaned = df.fillna(0).copy()
df_cleaned['Date'] = pd.to_datetime(df_cleaned['Date'])
df_cleaned = df_cleaned.sort_values(['Ticker', 'Date']).reset_index(drop=True)
df_cleaned['return_14d'] = df_cleaned.groupby('Ticker')['Close'].shift(-14) / df_cleaned['Close'] - 1

In [3]:
df_cleaned.head(40)

Unnamed: 0,Date,Ticker,Open,High,Low,Close,Adj Close,Volume,return_14d
0,2020-01-02,A,85.900002,86.349998,85.199997,85.949997,82.885254,1410500.0,0.0363
1,2020-01-03,A,84.669998,85.330002,84.5,84.57,81.554451,1118300.0,0.043633
2,2020-01-06,A,84.0,84.82,83.599998,84.82,81.795555,1993200.0,0.020514
3,2020-01-07,A,83.959999,85.260002,83.940002,85.080002,82.04628,1684700.0,0.024683
4,2020-01-08,A,85.959999,86.470001,85.199997,85.919998,82.856323,1847600.0,0.015363
5,2020-01-09,A,86.459999,87.699997,86.169998,87.269997,84.158188,1912700.0,-0.033116
6,2020-01-10,A,87.720001,88.239998,87.32,87.589996,84.466782,1417000.0,-0.057427
7,2020-01-13,A,87.809998,88.32,86.739998,87.459999,84.341408,1630200.0,-0.060713
8,2020-01-14,A,87.269997,88.209999,86.699997,87.989998,84.852509,1675200.0,-0.050801
9,2020-01-15,A,87.629997,89.110001,87.550003,88.620003,85.46006,1630400.0,-0.041638


In [4]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

def scale_single_ticker(df, ticker, numerical_cols):
    """Scales numerical features for a single ticker."""
    scaler = MinMaxScaler()
    cols_to_scale = [col for col in numerical_cols if col in df.columns]
    if cols_to_scale:
        df.loc[:, cols_to_scale] = scaler.fit_transform(df[cols_to_scale])
    return df

if 'df_cleaned' in locals():
    unique_tickers = df_cleaned['Ticker'].unique()
    scaled_dfs = []
    numerical_cols = ['Open', 'High', 'Low', 'Close', 'Adj Close','Volume', 'rsi','stochastic','signal-stochastic','cci', 'williams%R','roc','awesome-oscillator','ultimate-oscillator','trix','KAMA','stochrsi','sma','macd','ema','adx','psar-up','psar-down', 'ichimoku_a','ichimoku_b','supertrend','KST','plus_di','minus_di','VI-dff','VI+','VI-','atr','bb_bbm',
              'donchianwidth','keltnerwidth','ulcer','chaikin_volatility','historical_volatility','upper_band','lower_band']

    if df_cleaned.empty:
        print("df_cleaned is empty. Check your data cleaning steps or data source.")
    else:
      for ticker in unique_tickers:
        ticker_df = df_cleaned[df_cleaned['Ticker'] == ticker].copy()
        scaled_ticker_df = scale_single_ticker(ticker_df, ticker, numerical_cols)
        scaled_dfs.append(scaled_ticker_df)

    df_beforedrop = pd.concat(scaled_dfs).reset_index(drop=True)

In [5]:
from google.colab import drive
import shutil
import os


save_directory = '/content/drive/My Drive/StockData'
os.makedirs(save_directory, exist_ok=True)

df_beforedrop['key'] = list(zip(df_beforedrop['Ticker'], df_beforedrop['Date']))

split_date = '2024-06-01'
train_df = df_beforedrop[df_beforedrop['Date'] < split_date].copy()
unclean_test_df = df_beforedrop[df_beforedrop['Date'] >= split_date].copy()

train_keys = set(train_df['key'])
test_df = unclean_test_df[~unclean_test_df['key'].isin(train_keys)].copy()

train_df = train_df.drop(columns='key')
test_df = test_df.drop(columns='key')

train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

test_path = os.path.join(save_directory, 'test_set.csv')
validtrain_path = os.path.join(save_directory, 'validtest_set.csv')

train_df.to_csv(validtrain_path, index=False)
test_df.to_csv(test_path, index=False)

print(f"✅ Cleaned test size: {len(test_df)} rows (from {len(unclean_test_df)})")
print(f"Test set saved to: {test_path}")

✅ Cleaned test size: 73624 rows (from 73624)
Test set saved to: /content/drive/My Drive/StockData/test_set.csv
