In [1]:
import pandas as pd
import numpy as np
from ta import add_all_ta_features  # A convenience function from the "ta" library
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score

In [2]:
# ----------------------------
# 1. Load Data
# ----------------------------
data_path = r'C:\Zeel\UChicago\Winter\Real Time Intelligence Systems\Project Phase 1\data\historical_stock_data_5min_6months.csv'
df = pd.read_csv(data_path, parse_dates=['timestamp'])

# Sort by symbol and time
df.sort_values(['symbol', 'timestamp'], inplace=True)

In [3]:
# ----------------------------
# 2. Generate a Large Set of Candidate Features
# ----------------------------
# The `add_all_ta_features` function automatically computes many technical indicators.
# It adds features based on open, high, low, close, and volume.
# You can adjust the parameters or select specific indicators if desired.
df = add_all_ta_features(
    df, open="open", high="high", low="low", close="close", volume="volume",
    fillna=True
)

# Now df contains many new columns (typically hundreds) like RSI, MACD, Bollinger Bands, etc.

In [4]:
# ----------------------------
# 3. Define the Target Variable
# ----------------------------
# For example, define target = 1 if next candle's close is higher than current, else 0.
df['target'] = (df['close'].shift(-1) > df['close']).astype(int)

# Remove rows with NaNs (often appear at the start/end due to indicator calculation)
df.dropna(inplace=True)

In [5]:
# ----------------------------
# 4. Split Data into Training and Testing Sets
# ----------------------------
# It’s important to split by time and symbol. For simplicity, here we do a random split.
# In a realistic scenario, you may want to split by date for each symbol.
X = df.drop(columns=['timestamp', 'symbol', 'target'])  # All candidate indicator columns
y = df['target']

# For demonstration, we use a random split; in production, use a time-based split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# ----------------------------
# 5. Automated Feature Selection Using RandomForest
# ----------------------------
# Use a RandomForest to rank features
rf_selector = RandomForestClassifier(random_state=42, n_estimators=100)
rf_selector.fit(X_train, y_train)

# Select features with importance greater than the median importance.
selector = SelectFromModel(rf_selector, threshold="median", prefit=True)
X_train_selected = selector.transform(X_train)
X_test_selected  = selector.transform(X_test) 

print("Selected Features:", X_train.columns[selector.get_support()])




Selected Features: Index(['volume', 'trade_count', 'volume_adi', 'volume_obv', 'volume_cmf',
       'volume_fi', 'volume_em', 'volume_sma_em', 'volume_vpt', 'volume_mfi',
       'volume_nvi', 'volatility_bbw', 'volatility_bbp', 'volatility_kcw',
       'volatility_kcp', 'volatility_dcw', 'volatility_dcp', 'volatility_atr',
       'volatility_ui', 'trend_macd_diff', 'trend_vortex_ind_pos',
       'trend_vortex_ind_neg', 'trend_vortex_ind_diff', 'trend_mass_index',
       'trend_dpo', 'trend_kst_diff', 'trend_stc', 'trend_adx',
       'trend_adx_pos', 'trend_adx_neg', 'trend_cci', 'momentum_rsi',
       'momentum_stoch_rsi_k', 'momentum_stoch_rsi_d', 'momentum_tsi',
       'momentum_uo', 'momentum_stoch', 'momentum_stoch_signal', 'momentum_wr',
       'momentum_ao', 'momentum_roc', 'momentum_ppo_hist', 'momentum_pvo',
       'momentum_pvo_signal', 'momentum_pvo_hist', 'others_dr', 'others_dlr'],
      dtype='object')


In [7]:
# ----------------------------
# 6. Train a Model on the Selected Features
# ----------------------------
param_grid = {
    'n_estimators': [100, 150, 200],
    'max_depth': [5, 7, 10, None],
    'min_samples_split': [2, 5, 10]
}
rf = RandomForestClassifier(random_state=42)
clf = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
clf.fit(X_train_selected, y_train)

print("Best Parameters:", clf.best_params_)
print("Best CV Accuracy:", clf.best_score_)

train_acc = accuracy_score(y_train, clf.predict(X_train_selected))
test_acc  = accuracy_score(y_test, clf.predict(X_test_selected))
print("Training Accuracy:", train_acc)
print("Test Accuracy:", test_acc)

Best Parameters: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 200}
Best CV Accuracy: 0.5425448516746144
Training Accuracy: 0.5703940563992057
Test Accuracy: 0.5421141396866274


In [8]:
# ----------------------------
# 7. Use the Model for Trading Signal Generation & Backtesting
# ----------------------------
# For example, generate predicted probabilities and then define a signal.
y_test_prob = clf.predict_proba(X_test_selected)[:, 1]

# Define signals: if probability > 0.55, signal = 1 (buy); if < 0.45, signal = -1 (sell); else 0.
def get_signal(prob, thresh_buy=0.55, thresh_sell=0.45):
    if prob > thresh_buy:
        return 1
    elif prob < thresh_sell:
        return -1
    else:
        return 0

signals = np.array([get_signal(p) for p in y_test_prob])

In [10]:
# ----------------------------
# 8. Map Predicted Signals to Test Data
# ----------------------------
# Retrieve the original test data rows using the indices from the X_test split.
test_data_with_signals = df.loc[X_test.index].copy()

# Assume you have already computed predicted probabilities and converted them to signals:
# For example:
# y_test_prob = clf.predict_proba(X_test_selected)[:, 1]
# signals = np.array([get_signal(p) for p in y_test_prob])
# Now add these signals to the test data.
test_data_with_signals['signal'] = signals

# Ensure the test data is sorted by symbol and time.
test_data_with_signals.sort_values(['symbol', 'timestamp'], inplace=True)


In [11]:
# ----------------------------
# 9. Define the Trade Simulation Function
# ----------------------------
def simulate_trade(df, start_index, signal, risk_pct=0.01):
    """
    Simulate a trade for a given symbol's DataFrame (sorted by timestamp).
    Entry is taken from the next candle's open.
    The trade exits when either a profit target (2× risk) or stop-loss (1× risk) is hit.
    
    Parameters:
      df (DataFrame): Data for one symbol.
      start_index (int): Index where the signal is generated.
      signal (int): Trade direction (1 for long, -1 for short).
      risk_pct (float): Risk percentage (defines the risk amount relative to the entry price).
    
    Returns:
      profit (float): Profit (or loss) from the trade.
      exit_index (int): The index at which the trade is exited.
    """
    if start_index + 1 >= len(df):
        return None, len(df)
    
    # Use next candle's open price as entry
    entry = df.iloc[start_index + 1]['open']
    risk = risk_pct * entry
    profit_target = 2 * risk

    exit_price = None
    exit_index = None

    # Loop through subsequent candles to simulate trade exit
    for i in range(start_index + 1, len(df)):
        row = df.iloc[i]
        if signal == 1:  # Long trade
            if row['high'] >= entry + profit_target:
                exit_price = entry + profit_target
                exit_index = i
                break
            if row['low'] <= entry - risk:
                exit_price = entry - risk
                exit_index = i
                break
        elif signal == -1:  # Short trade
            if row['low'] <= entry - profit_target:
                exit_price = entry - profit_target
                exit_index = i
                break
            if row['high'] >= entry + risk:
                exit_price = entry + risk
                exit_index = i
                break

    # If no exit condition was met, exit at the last available close price.
    if exit_price is None:
        exit_price = df.iloc[-1]['close']
        exit_index = len(df) - 1

    # Calculate profit: for long, profit = exit - entry; for short, profit = entry - exit.
    profit = exit_price - entry if signal == 1 else entry - exit_price
    return profit, exit_index

In [12]:

# ----------------------------
# 10. Simulate Trades & Evaluate Performance
# ----------------------------
trade_results = {}
risk_pct = 0.01  # Adjust this value as needed

# Process each symbol separately
for symbol in test_data_with_signals['symbol'].unique():
    symbol_df = test_data_with_signals[test_data_with_signals['symbol'] == symbol].sort_values('timestamp').reset_index(drop=True)
    trades = []
    i = 0
    while i < len(symbol_df):
        if symbol_df.loc[i, 'signal'] != 0:
            current_signal = symbol_df.loc[i, 'signal']
            profit, exit_idx = simulate_trade(symbol_df, i, current_signal, risk_pct=risk_pct)
            if profit is not None:
                trades.append(profit)
                # Skip ahead to avoid overlapping trades
                i = exit_idx + 1
                continue
        i += 1
    trade_results[symbol] = trades
    if trades:
        avg_profit = np.mean(trades)
        win_rate = np.mean([1 if p > 0 else 0 for p in trades])
        print(f"{symbol}: {len(trades)} trades, Avg Profit = {avg_profit:.4f}, Win Rate = {win_rate:.1%}")
    else:
        print(f"{symbol}: No trades executed")

# Aggregate overall performance across all symbols
all_trades = [p for trades in trade_results.values() for p in trades]
if all_trades:
    overall_profit = np.sum(all_trades)
    overall_win_rate = np.mean([1 if p > 0 else 0 for p in all_trades])
    print(f"\nOverall: Total Profit = {overall_profit:.4f}, Overall Win Rate = {overall_win_rate:.1%}")
else:
    print("No trades executed overall.")

AAPL: 79 trades, Avg Profit = -0.5945, Win Rate = 25.3%
ABT: 49 trades, Avg Profit = -0.0043, Win Rate = 32.7%
ACN: 60 trades, Avg Profit = 0.3015, Win Rate = 36.7%
ADBE: 77 trades, Avg Profit = 0.6448, Win Rate = 37.7%
ADP: 47 trades, Avg Profit = -0.6622, Win Rate = 25.5%
AMZN: 87 trades, Avg Profit = -0.0113, Win Rate = 34.5%
AVGO: 135 trades, Avg Profit = 0.1287, Win Rate = 35.6%
BAC: 70 trades, Avg Profit = 0.0581, Win Rate = 38.6%
CMCSA: 84 trades, Avg Profit = -0.0046, Win Rate = 33.3%
COST: 61 trades, Avg Profit = -2.0270, Win Rate = 26.2%
CRM: 103 trades, Avg Profit = 0.4867, Win Rate = 37.9%
CVX: 76 trades, Avg Profit = -0.2640, Win Rate = 27.6%
DHR: 63 trades, Avg Profit = 0.2787, Win Rate = 38.1%
DIA: 31 trades, Avg Profit = -0.3252, Win Rate = 32.3%
DIS: 62 trades, Avg Profit = -0.1109, Win Rate = 29.0%
GOOGL: 89 trades, Avg Profit = -0.3108, Win Rate = 27.0%
HD: 61 trades, Avg Profit = 0.5126, Win Rate = 37.7%
HON: 65 trades, Avg Profit = -0.4417, Win Rate = 27.7%
IBM: 83