In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# --------------------------------------------
# 1. Data Preparation & Splitting
# --------------------------------------------
data_path = r'C:\Zeel\UChicago\Winter\Real Time Intelligence Systems\Project Phase 1\data\historical_stock_data_5min_6months.csv'
df = pd.read_csv(data_path, parse_dates=['timestamp'])
df = df.sort_values(['symbol', 'timestamp'])

In [2]:
# Split the data by symbol (70% training, 30% testing)
train_list = []
test_list = []
for symbol, group in df.groupby('symbol'):
    group = group.sort_values('timestamp')
    split_idx = int(0.7 * len(group))
    train_list.append(group.iloc[:split_idx])
    test_list.append(group.iloc[split_idx:])
train_data = pd.concat(train_list).reset_index(drop=True)
test_data  = pd.concat(test_list).reset_index(drop=True)

In [3]:
# --------------------------------------------
# 2. Technical Indicator Calculation & Feature Engineering
# --------------------------------------------
def compute_indicators(data):
    # 200-period EMA
    data['200_EMA'] = data.groupby('symbol')['close'].transform(
        lambda x: x.ewm(span=200, min_periods=200).mean()
    )
    
    # 9-period and 26-period EMAs and their crossover
    data['9_EMA'] = data.groupby('symbol')['close'].transform(lambda x: x.ewm(span=9, min_periods=9).mean())
    data['26_EMA'] = data.groupby('symbol')['close'].transform(lambda x: x.ewm(span=26, min_periods=26).mean())
    data['ema_crossover'] = np.where(data['9_EMA'] > data['26_EMA'], 1, -1)
    
    # Compute typical price and CCI
    data['typical_price'] = (data['high'] + data['low'] + data['close']) / 3
    def calc_cci(group):
        tp = group['typical_price']
        sma = tp.rolling(20).mean()
        mad = tp.rolling(20).apply(lambda x: np.mean(np.abs(x - x.mean())), raw=True)
        return (tp - sma) / (0.015 * mad)
    data['CCI'] = data.groupby('symbol', group_keys=False).apply(calc_cci)
    
    # Create CCI signal: 1 if CCI > 150, -1 if CCI < -150, else 0.
    data['cci_signal'] = np.where(data['CCI'] > 150, 1, np.where(data['CCI'] < -150, -1, 0))
    
    # 200_EMA signal: 1 if price is above the 200_EMA, else -1.
    data['200_EMA_signal'] = np.where(data['close'] > data['200_EMA'], 1, -1)
    
    # VWAP signal: assumes the column 'vwap' exists; otherwise, compute or substitute appropriately.
    data["volume_20_avg"] = data.groupby("symbol")["volume"].transform(lambda x: x.rolling(20).mean())
    data["vwap_signal"] = np.where(
        (data["close"] > data["vwap"]) & (data["volume"] > 0.8 * data["volume_20_avg"]), 1,
        np.where((data["close"] < data["vwap"]) & (data["volume"] > 0.8 * data["volume_20_avg"]), -1, 0)
    )
    
    return data

train_data = compute_indicators(train_data)
test_data = compute_indicators(test_data)

  data['CCI'] = data.groupby('symbol', group_keys=False).apply(calc_cci)
  data['CCI'] = data.groupby('symbol', group_keys=False).apply(calc_cci)


In [4]:
# --------------------------------------------
# 3. Define the Target & Feature Set for ML
# --------------------------------------------
# Target: 1 if next candle's close > current candle's close, else 0.
train_data['target'] = (train_data['close'].shift(-1) > train_data['close']).astype(int)
test_data['target'] = (test_data['close'].shift(-1) > test_data['close']).astype(int)

# We'll use the indicator signals as our features.
features = ['200_EMA_signal', 'vwap_signal', 'cci_signal', 'ema_crossover']
train_ml = train_data.dropna(subset=features + ['target']).copy()
test_ml  = test_data.dropna(subset=features + ['target']).copy()

X_train = train_ml[features]
y_train = train_ml['target']
X_test  = test_ml[features]
y_test  = test_ml['target']


In [5]:
# --------------------------------------------
# 4. Model Training with Hyperparameter Tuning
# --------------------------------------------
param_grid = {
    'n_estimators': [100, 150, 200],
    'max_depth': [5, 7, 10, None],
    'min_samples_split': [2, 5, 10]
}
rf = RandomForestClassifier(random_state=42)
clf = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
clf.fit(X_train, y_train)

print("Best Parameters:", clf.best_params_)
print("Best CV Accuracy:", clf.best_score_)
print("Training Accuracy:", accuracy_score(y_train, clf.predict(X_train)))
print("Test Accuracy:", accuracy_score(y_test, clf.predict(X_test)))


Best Parameters: {'max_depth': 7, 'min_samples_split': 2, 'n_estimators': 150}
Best CV Accuracy: 0.5323864742477463
Training Accuracy: 0.5330263659063016
Test Accuracy: 0.5258027953192816


In [6]:
# --------------------------------------------
# 5. Generate Trading Signals & Simulate Trades on Test Data
# --------------------------------------------
# Use model probabilities to define signals:
# For example, if probability > 0.55, signal = 1 (buy); if < 0.45, signal = -1 (sell); else 0.
test_ml['pred_prob'] = clf.predict_proba(X_test)[:, 1]
def get_signal(prob, thresh_buy=0.55, thresh_sell=0.45):
    if prob > thresh_buy:
        return 1
    elif prob < thresh_sell:
        return -1
    else:
        return 0
test_ml['signal'] = test_ml['pred_prob'].apply(get_signal)

# Trade simulation function:
def simulate_trade(df, idx, signal, risk_pct=0.01):
    if idx + 1 >= len(df):
        return None, len(df)
    entry = df.iloc[idx + 1]['open']
    risk = risk_pct * entry
    profit_target = 2 * risk
    exit_price, exit_idx = None, None
    for i in range(idx + 1, len(df)):
        row = df.iloc[i]
        if signal == 1:  # Long trade
            if row['high'] >= entry + profit_target:
                exit_price, exit_idx = entry + profit_target, i
                break
            if row['low'] <= entry - risk:
                exit_price, exit_idx = entry - risk, i
                break
        elif signal == -1:  # Short trade
            if row['low'] <= entry - profit_target:
                exit_price, exit_idx = entry - profit_target, i
                break
            if row['high'] >= entry + risk:
                exit_price, exit_idx = entry + risk, i
                break
    if exit_price is None:
        exit_price, exit_idx = df.iloc[-1]['close'], len(df)-1
    profit = exit_price - entry if signal == 1 else entry - exit_price
    return profit, exit_idx

In [7]:
# Simulate trades on a per-symbol basis:
trade_results = {}
risk_pct = 0.01
for symbol in test_ml['symbol'].unique():
    sym_df = test_ml[test_ml['symbol'] == symbol].sort_values('timestamp').reset_index(drop=True)
    trades = []
    i = 0
    while i < len(sym_df):
        if sym_df.loc[i, 'signal'] != 0:
            s = sym_df.loc[i, 'signal']
            p, exit_idx = simulate_trade(sym_df, i, s, risk_pct=risk_pct)
            if p is not None:
                trades.append(p)
                i = exit_idx + 1  # Jump to next trade to avoid overlap
                continue
        i += 1
    trade_results[symbol] = trades
    if trades:
        avg_profit = np.mean(trades)
        win_rate = np.mean([1 if p > 0 else 0 for p in trades])
        print(f"{symbol}: trades = {len(trades)}, avg profit = {avg_profit:.4f}, win rate = {win_rate:.1%}")
    else:
        print(f"{symbol}: No trades executed")

AAPL: trades = 34, avg profit = 0.0428, win rate = 32.4%
ABT: trades = 29, avg profit = -0.0416, win rate = 31.0%
ACN: trades = 23, avg profit = -0.2006, win rate = 30.4%
ADBE: trades = 23, avg profit = 0.1508, win rate = 34.8%
ADP: trades = 17, avg profit = 0.7685, win rate = 41.2%
AMZN: trades = 37, avg profit = 0.3750, win rate = 37.8%
AVGO: trades = 65, avg profit = 0.0981, win rate = 33.8%
BAC: trades = 20, avg profit = -0.0895, win rate = 25.0%
CMCSA: trades = 28, avg profit = -0.1235, win rate = 21.4%
COST: trades = 21, avg profit = -0.0459, win rate = 33.3%
CRM: trades = 39, avg profit = 0.6800, win rate = 41.0%
CVX: trades = 31, avg profit = -0.4366, win rate = 22.6%
DHR: trades = 29, avg profit = 0.0469, win rate = 34.5%
DIA: trades = 13, avg profit = -0.9394, win rate = 30.8%
DIS: trades = 22, avg profit = 0.1033, win rate = 36.4%
GOOGL: trades = 35, avg profit = -0.3837, win rate = 28.6%
HD: trades = 26, avg profit = 0.8296, win rate = 42.3%
HON: trades = 30, avg profit = 0

In [8]:
# Aggregate overall results:
all_trades = [p for trades in trade_results.values() for p in trades]
if all_trades:
    overall_profit = np.sum(all_trades)
    overall_win_rate = np.mean([1 if p > 0 else 0 for p in all_trades])
    print(f"Overall: total profit = {overall_profit:.4f}, win rate = {overall_win_rate:.1%}")
else:
    print("No trades executed overall.")

Overall: total profit = -307.5182, win rate = 30.4%
