In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE

In [2]:
sp500 = yf.Ticker("^GSPC").history(period="max")

In [3]:
# Preprocessing
sp500['Tomorrow'] = sp500['Close'].shift(-1)
sp500['Target'] = (sp500['Tomorrow'] > sp500['Close']).astype(int)
sp500 = sp500.loc["1990-01-01":].drop(['Dividends', 'Stock Splits'], axis=1).dropna()

In [4]:
def feature_engineering(data):
    # Simple Moving Averages for different time windows
    for window in [10, 20, 30, 60, 120, 250]:  # Added 20 to the list
        data[f"MA_{window}"] = data['Close'].rolling(window=window).mean()

    # Exponential Moving Averages
    for window in [10, 30, 60, 120, 250]:
        data[f"EMA_{window}"] = data['Close'].ewm(span=window, adjust=False).mean()

    # Money Flow Index (MFI)
    typical_price = (data['Close'] + data['Low'] + data['High']) / 3
    raw_money_flow = typical_price * data['Volume']
    up_flow = raw_money_flow.copy()
    down_flow = raw_money_flow.copy()
    
    up_flow[typical_price <= typical_price.shift(1)] = 0
    down_flow[typical_price > typical_price.shift(1)] = 0

    up_flow_sum = up_flow.rolling(window=14).sum()
    down_flow_sum = down_flow.rolling(window=14).sum()

    money_flow_ratio = up_flow_sum / down_flow_sum
    data['MFI'] = 100 - (100 / (1 + money_flow_ratio))

    # RSI - Relative Strength Index
    delta = data['Close'].diff(1)
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / loss
    data['RSI'] = 100 - (100 / (1 + rs))

    # Bollinger Bands
    data['Bollinger_Upper'] = data['MA_20'] + (data['Close'].rolling(20).std() * 2)
    data['Bollinger_Lower'] = data['MA_20'] - (data['Close'].rolling(20).std() * 2)

    # MACD - Moving Average Convergence Divergence
    short_ema = data['Close'].ewm(span=12, adjust=False).mean()
    long_ema = data['Close'].ewm(span=26, adjust=False).mean()
    data['MACD'] = short_ema - long_ema
    data['Signal_Line'] = data['MACD'].ewm(span=9, adjust=False).mean()

    # Drop NaN values generated by rolling/ewm functions
    data.dropna(inplace=True)
    
    # New Feature: Average True Range (ATR)
    high_low = data['High'] - data['Low']
    high_close = np.abs(data['High'] - data['Close'].shift())
    low_close = np.abs(data['Low'] - data['Close'].shift())
    tr = high_low.combine(high_close, max).combine(low_close, max)
    data['ATR'] = tr.rolling(window=14).mean()

    # Lag Features (Example: 5-day lagged Close)
    data['Lagged_Close'] = data['Close'].shift(5)

    # Normalize Features
    scaler = StandardScaler()
    feature_cols = [col for col in data.columns if col not in ['Tomorrow', 'Target']]
    data[feature_cols] = scaler.fit_transform(data[feature_cols])

    data.dropna(inplace=True)
    return data

In [5]:
sp500 = feature_engineering(sp500)

In [6]:
sp500

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Tomorrow,Target,MA_10,MA_20,MA_30,...,EMA_120,EMA_250,MFI,RSI,Bollinger_Upper,Bollinger_Lower,MACD,Signal_Line,ATR,Lagged_Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1991-01-15 00:00:00-05:00,-1.208230,-1.209448,-1.206245,-1.207210,-1.318952,316.170013,1,-1.204863,-1.199170,-1.197900,...,-1.199737,-1.193537,-2.009373,-1.903459,-1.192896,-1.204168,-0.297560,-0.231377,-0.891780,-1.207066
1991-01-16 00:00:00-05:00,-1.207084,-1.206497,-1.205222,-1.204955,-1.305554,327.970001,1,-1.205815,-1.199628,-1.198146,...,-1.199860,-1.193660,-1.864496,-1.692356,-1.193113,-1.204882,-0.293797,-0.247883,-0.883707,-1.210224
1991-01-17 00:00:00-05:00,-1.204755,-1.196358,-1.202146,-1.194052,-1.204896,332.230011,1,-1.205254,-1.199724,-1.198095,...,-1.199795,-1.193691,-0.978071,-0.335824,-1.193372,-1.204804,-0.245761,-0.250767,-0.850721,-1.207409
1991-01-18 00:00:00-05:00,-1.193960,-1.192442,-1.192079,-1.190115,-1.255252,331.059998,0,-1.204214,-1.199630,-1.198024,...,-1.199665,-1.193688,-0.390262,-0.075129,-1.193063,-1.204940,-0.191462,-0.241409,-0.837256,-1.206761
1991-01-21 00:00:00-05:00,-1.189986,-1.192442,-1.190416,-1.191196,-1.304610,328.309998,0,-1.202768,-1.199586,-1.197962,...,-1.199556,-1.193695,-0.352516,-0.255107,-1.192929,-1.204993,-0.152954,-0.225649,-0.834944,-1.209298
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-19 00:00:00-05:00,2.887104,2.885786,2.913106,2.909103,0.817809,4698.350098,0,2.822878,2.782875,2.740673,...,2.677182,2.674405,0.814834,1.898914,2.776830,2.785358,3.571371,3.320732,0.857646,2.801856
2023-12-20 00:00:00-05:00,2.906521,2.894353,2.870443,2.844401,0.912920,4746.750000,1,2.836678,2.790308,2.750594,...,2.681580,2.677391,0.886309,0.819802,2.784171,2.792877,3.406582,3.388795,1.032079,2.860561
2023-12-21 00:00:00-05:00,2.869146,2.867419,2.880231,2.889125,0.492798,4754.629883,1,2.851602,2.799131,2.761879,...,2.686663,2.680728,0.856713,0.932644,2.798647,2.795638,3.415716,3.445207,1.052919,2.872100
2023-12-22 00:00:00-05:00,2.896530,2.889693,2.906646,2.896406,0.283097,4774.750000,1,2.865517,2.808194,2.774506,...,2.691786,2.684100,1.281956,1.270545,2.812283,2.799793,3.411193,3.489365,1.009843,2.871767


In [7]:
# Split data into train and test sets
train_data, test_data = train_test_split(sp500, test_size=0.2, random_state=42)

In [8]:
# Selecting Features and Target
features = [col for col in sp500.columns if col not in ['Tomorrow', 'Target']]
target = 'Target'

In [9]:
# Model Training
model = RandomForestClassifier(n_estimators=100, min_samples_split=100, random_state=42)
model.fit(train_data[features], train_data[target])

In [10]:
# Making Predictions for the RandomForestClassifier as benchmark
predictions = model.predict(test_data[features])
precision = precision_score(test_data[target], predictions)
print(f"Precision Score: {precision}")

Precision Score: 0.531951640759931


In [11]:
def train_and_tune_model(train_data, features, target):
    # Define models
    rf_model = RandomForestClassifier(random_state=42)
    gb_model = GradientBoostingClassifier(random_state=42)
    svc_model = SVC(kernel='rbf', probability=True, random_state=42)

    # Hyperparameter tuning (example)
    # param_grid = {'n_estimators': [100, 200], 'max_depth': [5, 10]}
    # grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3)
    # grid_search.fit(train_data[features], train_data[target])
    # rf_model = grid_search.best_estimator_

    # Feature selection using RFE
    selector = RFE(rf_model, n_features_to_select=10, step=1)
    selector = selector.fit(train_data[features], train_data[target])
    selected_features = [f for f, s in zip(features, selector.support_) if s]

    # Create the ensemble model
    ensemble_model = VotingClassifier(
        estimators=[('rf', rf_model), ('gb', gb_model), ('svc', svc_model)],
        voting='soft'
    )
    ensemble_model.fit(train_data[selected_features], train_data[target])

    return ensemble_model, selected_features

In [None]:
ensemble_model, selected_features = train_and_tune_model(train_data, features, target)

In [12]:
# Example of a backtesting function (simplified version)
def backtest(data, model, features):
    predictions = []
    for i in range(250, len(data), 250):
        train = data.iloc[:i]
        test = data.iloc[i:i + 250]
        model.fit(train[features], train[target])
        preds = model.predict(test[features])
        predictions.extend(preds)
    return predictions

In [13]:
svc_model = SVC(kernel='rbf')  # Using RBF kernel for higher dimensions
svc_model.fit(train_data[features], train_data[target])

In [14]:
# Implementing Kelly Criterion for position sizing
def kelly_criterion(prob_win, payoff_win):
    return prob_win - (1 - prob_win) / payoff_win


In [15]:
def enhanced_backtest(data, model, features, target, time_stop=10):
    predictions = []
    trade_durations = []
    pnl = []

    for i in range(250, len(data), 250):
        # Training phase
        train = data.iloc[:i]
        test = data.iloc[i:i + 250]

        # Train the model and make predictions
        model.fit(train[features], train[target])
        preds = model.predict_proba(test[features])[:, 1]

        # Threshold for binary classification
        final_preds = (preds > 0.6).astype(int)

        for j, pred in enumerate(final_preds):
            # Trade execution logic
            entry_price = test.iloc[j]['Close']
            exit_price = test.iloc[min(j + time_stop, len(final_preds) - 1)]['Close']
            duration = min(j + time_stop, len(final_preds) - 1) - j

            # Calculate P&L
            if pred == 1:  # Long position
                trade_pnl = exit_price - entry_price
            else:  # Short position
                trade_pnl = entry_price - exit_price

            pnl.append(trade_pnl)
            trade_durations.append(duration)
            predictions.append(pred)

    # Kelly Criterion for position sizing (example)
    win_rate = sum(p > 0 for p in pnl) / len(pnl)
    win_loss_ratio = sum(p for p in pnl if p > 0) / -sum(p for p in pnl if p < 0)
    kelly_fraction = win_rate - (1 - win_rate) / win_loss_ratio

    # Align predictions with the dataset
    aligned_predictions = [None] * len(data)
    prediction_index = 0
    for i in range(250, len(data), 250):
        for j in range(i, min(i + 250, len(data))):
            if prediction_index < len(predictions):
                aligned_predictions[j] = predictions[prediction_index]
                prediction_index += 1

    return aligned_predictions, pnl, trade_durations, kelly_fraction

In [None]:
# Running the backtest
backtest_results = enhanced_backtest(sp500, ensemble_model, features, target)
backtest_predictions, pnl, trade_durations, kelly_fraction = backtest_results

# Removing None values from predictions for precision score calculation
valid_predictions = [pred for pred in backtest_predictions if pred is not None]
valid_targets = sp500[target].iloc[len(sp500[target]) - len(valid_predictions):]

# Analyzing the results
print(f"Backtest Precision: {precision_score(valid_targets, valid_predictions)}")
print(f"Average P&L per trade: {np.mean(pnl)}")
print(f"Average trade duration: {np.mean(trade_durations)} days")
print(f"Kelly Fraction: {kelly_fraction}")