In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# ---------------------------------------------------
# 1. Data Loading, Validation, and Train/Test Splitting
# ---------------------------------------------------

data_path = r'C:\Zeel\UChicago\Winter\Real Time Intelligence Systems\Project Phase 1\data\historical_stock_data_5min_6months.csv'
df = pd.read_csv(data_path, parse_dates=['timestamp'])

print("Raw data shape:", df.shape)
print("First 3 rows:")
print(df.head(3))
print("Total missing values:", df.isnull().sum().sum())
print("Duplicates (symbol, timestamp):", df.duplicated(subset=['symbol', 'timestamp']).sum())

# Split by symbol into training (70%) and testing (30%) sets
train_data_list = []
test_data_list = []
for symbol, group in df.groupby("symbol"):
    group = group.sort_values("timestamp")
    split_idx = int(0.7 * len(group))
    train_data_list.append(group.iloc[:split_idx])
    test_data_list.append(group.iloc[split_idx:])
    
train_data = pd.concat(train_data_list).reset_index(drop=True)
test_data = pd.concat(test_data_list).reset_index(drop=True)

print("\nTraining size:", len(train_data))
print("Testing size:", len(test_data))


Raw data shape: (843934, 9)
First 3 rows:
  symbol                 timestamp    open    high     low   close  volume  \
0   AAPL 2024-08-26 08:00:00+00:00  226.25  226.41  226.25  226.41  1965.0   
1   AMZN 2024-08-26 08:00:00+00:00  177.00  177.60  177.00  177.30  6356.0   
2   AVGO 2024-08-26 08:00:00+00:00  165.75  165.75  165.75  165.75   359.0   

   trade_count        vwap  
0        219.0  226.365909  
1        150.0  177.363163  
2         13.0  165.750000  
Total missing values: 0
Duplicates (symbol, timestamp): 0

Training size: 590725
Testing size: 253209


In [3]:
# ---------------------------------------------------
# 2. Compute Technical Indicators
# ---------------------------------------------------
def calculate_indicators(df):
    # Typical Price for CCI
    df["typical_price"] = (df["high"] + df["low"] + df["close"]) / 3

    # 200-period EMA on close
    df["200_EMA"] = df.groupby("symbol")["close"].transform(
        lambda x: x.ewm(span=200, min_periods=200).mean()
    )
    
    # VWAP Signal with volume filter (assuming 'vwap' column exists in your data)
    df["volume_20_avg"] = df.groupby("symbol")["volume"].transform(lambda x: x.rolling(20).mean())
    df["vwap_signal"] = np.where(
        (df["close"] > df["vwap"]) & (df["volume"] > 0.8 * df["volume_20_avg"]), 1,
        np.where((df["close"] < df["vwap"]) & (df["volume"] > 0.8 * df["volume_20_avg"]), -1, 0)
    )
    
    # CCI Calculation
    def calculate_cci(group):
        tp = group["typical_price"]
        sma = tp.rolling(20).mean()
        mad_vals = tp.rolling(20).apply(lambda x: np.abs(x - x.mean()).mean(), raw=True)
        cci = (tp - sma) / (0.015 * mad_vals)
        return cci

    df["CCI"] = df.groupby("symbol", group_keys=False).apply(calculate_cci)
    df["cci_signal"] = np.where(df["CCI"] > 150, 1, np.where(df["CCI"] < -150, -1, 0))

    # EMA Crossover: 9-period and 26-period EMAs
    df["9_EMA"] = df.groupby("symbol")["close"].transform(lambda x: x.ewm(span=9, min_periods=9).mean())
    df["26_EMA"] = df.groupby("symbol")["close"].transform(lambda x: x.ewm(span=26, min_periods=26).mean())
    df["ema_crossover"] = np.where(df["9_EMA"] > df["26_EMA"], 1, -1)
    
    # 200_EMA signal (if close > 200_EMA then 1, else -1)
    df['200_EMA_signal'] = np.where(df['close'] > df['200_EMA'], 1, -1)
    
    return df

train_data = calculate_indicators(train_data)
test_data = calculate_indicators(test_data)


  df["CCI"] = df.groupby("symbol", group_keys=False).apply(calculate_cci)
  df["CCI"] = df.groupby("symbol", group_keys=False).apply(calculate_cci)


In [4]:
# ---------------------------------------------------
# 3. Create a Target Variable and Feature Set for ML
# ---------------------------------------------------
# Define target: 1 if next candle's close > current candle's close, else 0
train_data['target'] = (train_data['close'].shift(-1) > train_data['close']).astype(int)
test_data['target']  = (test_data['close'].shift(-1)  > test_data['close']).astype(int)

# Choose technical indicator signals as features
features = ['200_EMA_signal', 'vwap_signal', 'cci_signal', 'ema_crossover']

# Drop rows with NaN values (e.g. at the end of each symbol's data) for training
train_ml = train_data.dropna(subset=features + ['target']).copy()
test_ml  = test_data.dropna(subset=features + ['target']).copy()

X_train = train_ml[features]
y_train = train_ml['target']

In [5]:
# ---------------------------------------------------
# 4. Train a Machine Learning Classifier
# ---------------------------------------------------
clf = LogisticRegression()
clf.fit(X_train, y_train)
print("\nTraining Accuracy:", accuracy_score(y_train, clf.predict(X_train)))

# Predict probabilities on test set
X_test = test_ml[features]
test_ml['predicted_prob'] = clf.predict_proba(X_test)[:,1]

# Create a predicted signal:
# Here we define thresholds: if probability > 0.55, signal = 1 (buy); if < 0.45, signal = -1 (sell); otherwise, 0 (no trade)
def get_signal(p):
    if p > 0.55:
        return 1
    elif p < 0.45:
        return -1
    else:
        return 0

test_ml['predicted_signal'] = test_ml['predicted_prob'].apply(get_signal)


Training Accuracy: 0.5300740615345549


In [7]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Assuming that the data has already been loaded, indicators computed, and split into train_data and test_data,
# and that 'target' has been created as:
# target = 1 if next candle's close > current candle's close, else 0.
# Also assuming that the indicator signals are stored in:
# ['200_EMA_signal', 'vwap_signal', 'cci_signal', 'ema_crossover']

# Drop NaNs that might occur due to indicator calculation or shifting target values.
train_ml = train_data.dropna(subset=['200_EMA_signal', 'vwap_signal', 'cci_signal', 'ema_crossover', 'target']).copy()
test_ml  = test_data.dropna(subset=['200_EMA_signal', 'vwap_signal', 'cci_signal', 'ema_crossover', 'target']).copy()

# Define the feature set and target variable.
features = ['200_EMA_signal', 'vwap_signal', 'cci_signal', 'ema_crossover']
X_train = train_ml[features]
y_train = train_ml['target']
X_test = test_ml[features]
y_test = test_ml['target']

# Set up a grid of parameters to search over.
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 5, 7, None],
    'min_samples_split': [2, 5, 10]
}

# Initialize RandomForestClassifier and perform grid search with 5-fold cross validation.
rf = RandomForestClassifier(random_state=42)
clf_rf = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
clf_rf.fit(X_train, y_train)

# Print out the best parameters and cross-validation score.
print("Best Parameters:", clf_rf.best_params_)
print("Best Cross-Validation Accuracy:", clf_rf.best_score_)

# Evaluate on the training and test sets.
train_pred = clf_rf.predict(X_train)
test_pred = clf_rf.predict(X_test)

print("Training Accuracy:", accuracy_score(y_train, train_pred))
print("Test Accuracy:", accuracy_score(y_test, test_pred))


Best Parameters: {'max_depth': 7, 'min_samples_split': 2, 'n_estimators': 150}
Best Cross-Validation Accuracy: 0.5323864742477463
Training Accuracy: 0.5330263659063016
Test Accuracy: 0.5258027953192816


In [8]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# ----------------------------
# Assume train_data and test_data have been computed
# and include the following columns:
#   - '200_EMA_signal', 'vwap_signal', 'cci_signal', 'ema_crossover'
#   - 'CCI', '200_EMA', '9_EMA', '26_EMA', 'close', 'volume'
#   - 'target' : 1 if next candle's close > current candle's close, else 0
# ----------------------------

# Drop rows with NaNs for the columns we'll use.
required_cols = ['200_EMA_signal', 'vwap_signal', 'cci_signal', 'ema_crossover', 
                 'CCI', '200_EMA', '9_EMA', '26_EMA', 'close', 'volume', 'target']

train_ml_ext = train_data.dropna(subset=required_cols).copy()
test_ml_ext  = test_data.dropna(subset=required_cols).copy()

# ----------------------------
# Create additional features
# ----------------------------
# 1. Difference between the close and the 200 EMA.
train_ml_ext['close_200ema_diff'] = train_ml_ext['close'] - train_ml_ext['200_EMA']
test_ml_ext['close_200ema_diff']  = test_ml_ext['close'] - test_ml_ext['200_EMA']

# 2. Difference between 9 EMA and 26 EMA.
train_ml_ext['ema_diff'] = train_ml_ext['9_EMA'] - train_ml_ext['26_EMA']
test_ml_ext['ema_diff']  = test_ml_ext['9_EMA'] - test_ml_ext['26_EMA']

# 3. Percentage change in volume.
train_ml_ext['vol_change'] = train_ml_ext['volume'].pct_change().fillna(0)
test_ml_ext['vol_change']  = test_ml_ext['volume'].pct_change().fillna(0)

# ----------------------------
# Define the extended feature set.
# ----------------------------
features_ext = ['200_EMA_signal', 'vwap_signal', 'cci_signal', 'ema_crossover', 
                'close_200ema_diff', 'ema_diff', 'CCI', 'vol_change']

X_train_ext = train_ml_ext[features_ext]
y_train_ext = train_ml_ext['target']
X_test_ext  = test_ml_ext[features_ext]
y_test_ext  = test_ml_ext['target']

# ----------------------------
# Grid search with RandomForest using the extended feature set
# ----------------------------
param_grid_ext = {
    'n_estimators': [100, 150, 200],
    'max_depth': [5, 7, 10, None],
    'min_samples_split': [2, 5, 10]
}

rf_ext = RandomForestClassifier(random_state=42)
clf_rf_ext = GridSearchCV(rf_ext, param_grid_ext, cv=5, scoring='accuracy', n_jobs=-1)
clf_rf_ext.fit(X_train_ext, y_train_ext)

print("Extended Best Parameters:", clf_rf_ext.best_params_)
print("Extended Best Cross-Validation Accuracy:", clf_rf_ext.best_score_)

train_pred_ext = clf_rf_ext.predict(X_train_ext)
test_pred_ext  = clf_rf_ext.predict(X_test_ext)

print("Extended Training Accuracy:", accuracy_score(y_train_ext, train_pred_ext))
print("Extended Test Accuracy:", accuracy_score(y_test_ext, test_pred_ext))


KeyboardInterrupt: 

In [6]:
# ---------------------------------------------------
# 5. Trade Simulation on Test Set Using ML-Based Signals
# ---------------------------------------------------
def simulate_trade(df, start_index, signal, risk_pct=0.01):
    """
    Simulate a trade for a given symbol's DataFrame (sorted by timestamp) using the ML signal.
    The trade uses the next candle's open as entry, with a profit target of 2×risk and stop loss of 1×risk.
    """
    if start_index + 1 >= len(df):
        return None, len(df)
    
    entry = df.iloc[start_index + 1]["open"]
    risk = risk_pct * entry
    profit_target = 2 * risk  # 2:1 reward to risk

    exit_price = None
    exit_index = None

    for i in range(start_index + 1, len(df)):
        row = df.iloc[i]
        if signal == 1:  # Long trade
            if row["high"] >= entry + profit_target:  # Profit target reached
                exit_price = entry + profit_target
                exit_index = i
                break
            if row["low"] <= entry - risk:  # Stop loss reached
                exit_price = entry - risk
                exit_index = i
                break
        elif signal == -1:  # Short trade
            if row["low"] <= entry - profit_target:  # Profit target reached for short
                exit_price = entry - profit_target
                exit_index = i
                break
            if row["high"] >= entry + risk:  # Stop loss reached for short
                exit_price = entry + risk
                exit_index = i
                break

    if exit_price is None:
        exit_price = df.iloc[-1]["close"]
        exit_index = len(df) - 1

    if signal == 1:
        profit = exit_price - entry
    else:
        profit = entry - exit_price

    return profit, exit_index

# Run trade simulation for each symbol in the test set based on ML predictions
trade_results_ml = {}
risk_pct = 0.01  # Risk percentage

# Process each symbol separately
for symbol in test_ml["symbol"].unique():
    symbol_df = test_ml[test_ml["symbol"] == symbol].sort_values("timestamp").reset_index(drop=True)
    trades = []
    i = 0
    while i < len(symbol_df):
        if symbol_df.loc[i, "predicted_signal"] != 0:
            signal = symbol_df.loc[i, "predicted_signal"]
            trade_profit, exit_idx = simulate_trade(symbol_df, i, signal, risk_pct=risk_pct)
            if trade_profit is not None:
                trades.append(trade_profit)
                # Jump to the candle after the trade exit to avoid overlapping trades
                i = exit_idx + 1
                continue
        i += 1
    trade_results_ml[symbol] = trades
    if trades:
        avg_profit = np.mean(trades)
        win_rate = np.mean([1 if p > 0 else 0 for p in trades])
        print(f"{symbol}: Number of trades = {len(trades)}, Avg profit = {avg_profit:.4f}, Win rate = {win_rate:.1%}")
    else:
        print(f"{symbol}: No trades executed")

# Overall performance across symbols
all_trades_ml = [p for trades in trade_results_ml.values() for p in trades]
if all_trades_ml:
    overall_profit_ml = np.sum(all_trades_ml)
    overall_win_rate_ml = np.mean([1 if p > 0 else 0 for p in all_trades_ml])
    print(f"\nOverall: Total profit = {overall_profit_ml:.4f}, Overall win rate = {overall_win_rate_ml:.1%}")
else:
    print("No trades executed overall.")

AAPL: Number of trades = 50, Avg profit = 0.1132, Win rate = 34.0%
ABT: Number of trades = 30, Avg profit = -0.3237, Win rate = 23.3%
ACN: Number of trades = 25, Avg profit = -0.1222, Win rate = 32.0%
ADBE: Number of trades = 35, Avg profit = -0.2689, Win rate = 31.4%
ADP: Number of trades = 18, Avg profit = 0.5552, Win rate = 38.9%
AMZN: Number of trades = 51, Avg profit = -0.0841, Win rate = 31.4%
AVGO: Number of trades = 125, Avg profit = 0.1696, Win rate = 35.2%
BAC: Number of trades = 31, Avg profit = -0.0856, Win rate = 25.8%
CMCSA: Number of trades = 39, Avg profit = -0.0113, Win rate = 33.3%
COST: Number of trades = 27, Avg profit = -1.8685, Win rate = 25.9%
CRM: Number of trades = 46, Avg profit = 1.0225, Win rate = 43.5%
CVX: Number of trades = 31, Avg profit = -0.4287, Win rate = 22.6%
DHR: Number of trades = 43, Avg profit = 0.2411, Win rate = 37.2%
DIA: Number of trades = 14, Avg profit = -1.0299, Win rate = 28.6%
DIS: Number of trades = 34, Avg profit = -0.0091, Win rate 