<h2>Things done to reproduce code</h2>

1. Literature Review & Objective Definition

    - Understood the original study’s goals: to compare model performance on intraday return prediction.
      
    - Focused on reproducing the Random Forest component using public tools and data.

2. Code Migration and Environment Setup

    - Adapted original codebase (which relied on proprietary Bloomberg data and an SPX constituents file) to use public data from Yahoo Finance (yfinance).

    - Implemented Random Forest pipeline for training and prediction.

3. Data Acquisition and Preparation

    - Attempted to replace SPXconst.csv with approximated static ticker lists.

    - Used yfinance to download historical Open and Adjusted Close prices.

    - Created training and test datasets using rolling 3-year windows.

4. Model Execution and Evaluation

    - Trained Random Forest models for each year (2015–2019).

    - Simulated a simple long-short trading strategy to evaluate predictions.

    - Measured output: daily return averages.

5. Code Adjustments for Compatibility

    - Rewrote deprecated or missing functions.

    - Replaced unavailable Statistics class with placeholder metrics.

    - Handled missing and incomplete ticker data.

    - Improved error handling and modularization.

<h5>ChatGPT helped me with attempting to recreate code and get results</h5>

In [2]:
import pandas as pd
import numpy as np
import random
import time
import pickle
from sklearn.ensemble import RandomForestClassifier
import yfinance as yf
import os

SEED = 9
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)

In [3]:
def get_sp500_tickers():
    return ['AAPL', 'MSFT', 'JPM', 'UNH', 'XOM']

In [4]:
all_companies = get_sp500_tickers()

In [5]:
constituents = {}
dates = pd.date_range(start='1990-01-01', end='2019-12-31', freq='M')
for date in dates:
    constituents[date.strftime('%Y-%m')] = set(all_companies)

constituents_train = {}
for test_year in range(1993, 2016):
    months = [f"{t}-{m:02d}" for t in range(test_year-3, test_year) for m in range(1, 13)]
    all_stocks = [list(constituents[m]) for m in months if m in constituents]
    constituents_train[test_year] = set([i for sublist in all_stocks for i in sublist])

  dates = pd.date_range(start='1990-01-01', end='2019-12-31', freq='M')


In [6]:
def trainer(train_data, test_data):
    random.seed(SEED)
    np.random.seed(SEED)
    train_x, train_y = train_data[:, 2:-2], train_data[:, -1].astype(int)
    print('Started training')
    clf = RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=SEED, n_jobs=-1)
    clf.fit(train_x, train_y)
    print('Completed', clf.score(train_x, train_y))

    dates = list(set(test_data[:, 0]))
    predictions = {}
    for day in dates:
        test_d = test_data[test_data[:, 0] == day][:, 2:-2]
        predictions[day] = clf.predict_proba(test_d)[:, 1]
    return predictions

In [7]:
def simulate(test_data, predictions):
    rets = pd.DataFrame([], columns=['Long', 'Short'])
    k = 10
    for day in sorted(predictions.keys()):
        preds = predictions[day]
        test_returns = test_data[test_data[:, 0] == day][:, -2]
        top_preds = predictions[day].argsort()[-k:][::-1]
        trans_long = test_returns[top_preds]
        worst_preds = predictions[day].argsort()[:k][::-1]
        trans_short = -test_returns[worst_preds]
        rets.loc[day] = [np.mean(trans_long), np.mean(trans_short)]
    return rets

In [8]:
def create_label(df_open, df_close, perc=[0.5, 0.5]):
    if not np.all(df_close['Date'] == df_open['Date']):
        print('Date Index issue')
        return pd.DataFrame()

    if df_open.shape[1] <= 2 or df_close.shape[1] <= 2:
        print("Too few tickers with valid data for label creation.")
        return pd.DataFrame()

    perc = [0.] + list(np.cumsum(perc))
    try:
        label = (df_close.iloc[:, 1:] / df_open.iloc[:, 1:] - 1).apply(
            lambda x: pd.qcut(x.rank(method='first'), perc, labels=False, duplicates='drop'), axis=1)
    except Exception as e:
        print("Label creation failed:", e)
        return pd.DataFrame()

    return label

In [9]:
def create_stock_data(df_close, df_open, st, label, test_year):
    st_data = pd.DataFrame()
    st_data['Date'] = df_close['Date']
    st_data['Name'] = st
    daily_change = df_close[st] / df_open[st] - 1

    m = list(range(1, 20)) + list(range(20, 241, 20))
    for k in m:
        st_data[f'IntraR{k}'] = daily_change.shift(k)
        st_data[f'CloseR{k}'] = df_close[st].pct_change(k).shift(1)
        st_data[f'OverNR{k}'] = df_open[st] / df_close[st].shift(k) - 1

    st_data['R-future'] = daily_change
    st_data['label'] = label[st]
    st_data['Month'] = df_close['Date'].str[:7]
    st_data.dropna(inplace=True)

    trade_year = st_data['Month'].str[:4]
    st_data.drop(columns=['Month'], inplace=True)
    st_train_data = st_data[trade_year < str(test_year)]
    st_test_data = st_data[trade_year == str(test_year)]
    return np.array(st_train_data), np.array(st_test_data)

In [10]:
def download_data(tickers, start, end):
    df_open = pd.DataFrame()
    df_close = pd.DataFrame()
    df_open['Date'] = pd.date_range(start=start, end=end, freq='B')
    df_close['Date'] = df_open['Date']
    valid_tickers = []
    print("Attempting to download:", tickers)

    for ticker in tickers:
        try:
            df = yf.download(ticker, start=start, end=end, auto_adjust=False, progress=False)
            if df.empty or df['Open'].isnull().all() or df['Adj Close'].isnull().all():
                continue
            df = df.reset_index()
            df['Date'] = pd.to_datetime(df['Date'])
            df_open[ticker] = df.set_index('Date').reindex(df_open['Date'])['Open'].values
            df_close[ticker] = df.set_index('Date').reindex(df_close['Date'])['Adj Close'].values
            valid_tickers.append(ticker)
        except Exception as e:
            print(f"{ticker} failed: {e}")
            continue

    df_open['Date'] = df_open['Date'].dt.strftime('%Y-%m-%d')
    df_close['Date'] = df_close['Date'].dt.strftime('%Y-%m-%d')

    print("Valid tickers with data:", valid_tickers)
    print("Dropped:", set(tickers) - set(valid_tickers))
    return df_open, df_close, valid_tickers

In [11]:
result_folder = 'results-Intraday-240-3-RF'
os.makedirs(result_folder, exist_ok=True)

for test_year in range(2018, 2020):
    print('-'*40)
    print(test_year)
    print('-'*40)

    tickers = sorted(list(constituents[f'{test_year-1}-12']))[:10]
    df_open, df_close, valid_tickers = download_data(tickers, f'{test_year-3}-01-01', f'{test_year}-01-01')

    if len(valid_tickers) < 3:
        print(f"Skipping {test_year} — only {len(valid_tickers)} tickers with data.")
        continue

    df_open.insert(0, 'Date', df_open.pop('Date'))
    df_close.insert(0, 'Date', df_close.pop('Date'))

    label = create_label(df_open, df_close)
    if label.empty:
        print(f"Skipping {test_year} due to label creation issue.")
        continue

    train_data, test_data = [], []
    start = time.time()
    for st in valid_tickers:
        try:
            st_train, st_test = create_stock_data(df_close, df_open, st, label, test_year)
            train_data.append(st_train)
            test_data.append(st_test)
        except Exception as e:
            print(f"{st} failed: {e}")

    if not train_data or not test_data:
        print(f"Skipping {test_year} — no usable training/testing data.")
        continue

    train_data = np.concatenate(train_data)
    test_data = np.concatenate(test_data)

    print('Created:', train_data.shape, test_data.shape, time.time()-start)
    predictions = trainer(train_data, test_data)
    returns = simulate(test_data, predictions)

    with open(f"{result_folder}/predictions-{test_year}.pickle", 'wb') as handle:
        pickle.dump(predictions, handle, protocol=pickle.HIGHEST_PROTOCOL)

    returns.to_csv(f"{result_folder}/avg_daily_rets-{test_year}.csv")

----------------------------------------
2018
----------------------------------------
Attempting to download: ['AAPL', 'JPM', 'MSFT', 'UNH', 'XOM']
AAPL failed: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
JPM failed: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
MSFT failed: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
UNH failed: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
XOM failed: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
Valid tickers with data: []
Dropped: {'AAPL', 'UNH', 'JPM', 'XOM', 'MSFT'}
Skipping 2018 — only 0 tickers with data.
----------------------------------------
2019
----------------------------------------
Attempting to download: ['AAPL', 'JPM', 'MSFT', 'UNH', 'XOM']
AAPL failed: The truth value of a Series is ambi