### Environment Information
This notebook was originally run using:
- **pandas**: 1.4.4
- **numpy**: 1.24.2
- **scikit-learn**: 1.0.2

In [1]:
import pandas as pd
import numpy as np
# random forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the pre-computed feature table from disk; every later cell reads
# from this `data` frame.
data = pd.read_csv(filepath_or_buffer="feature_data_holiday_20251112.csv")

### Random Forest

In [7]:
### No Grid Search ###
# Walk-forward evaluation of a random forest with fixed hyperparameters.
# For each test year the model is fit on the five preceding years
# (3 "train" years + 2 "validation" years, pooled) and scored on the test year.
np.random.seed(643)

# predictor columns are the ones whose name starts with "char_"
features = [col for col in data.columns if col.startswith("char_")]

# per-test-year result stores, keyed by test year
predictions_by_year = {}
accuracy_by_year = {}
auc_by_year = {}
feature_importance_by_year = {}

min_year = data['year'].min()
max_year = data['year'].max()

# forest settings shared by every window
rf_params = dict(
    criterion="entropy",  # gini v. entropy
    n_estimators=2000,  # number of trees in the forest
    max_depth=4,  # maximum tree depth
    min_samples_leaf=1,
    random_state=643,
    n_jobs=-1,
    max_features="sqrt",  # square root of the number of predictors
)

counter = 1
# the first usable test year is min_year + 5, because five prior years are needed
for test_year in range(min_year + 5, max_year + 1):
    train_years = range(test_year - 5, test_year - 2)
    valid_years = range(test_year - 2, test_year)

    print(f"Iteration {counter}: Train Years = {list(train_years)}, Validation Years = {list(valid_years)}, Test Year = {test_year}")
    counter += 1

    year_column = data['year']
    train_data = data[year_column.isin(train_years)]
    valid_data = data[year_column.isin(valid_years)]
    test_data = data[year_column == test_year]

    # skip windows where any of the three slices has no rows
    if any(frame.empty for frame in (train_data, valid_data, test_data)):
        continue

    # The forest is fit on train + validation together: no separate tuning step
    # happens in this cell, so the validation years simply extend the fit sample.
    # Train rows are stacked before validation rows.
    train_valid_data = pd.concat([train_data, valid_data])

    X_train_valid, y_train_valid = train_valid_data[features], train_valid_data['target']
    X_test, y_test = test_data[features], test_data['target']

    # fit the forest for this window
    model = RandomForestClassifier(**rf_params)
    model.fit(X_train_valid, y_train_valid)

    # impurity-based importances, labelled by feature name
    feature_importance_by_year[test_year] = pd.Series(model.feature_importances_, index=features)

    # probability of the second class in model.classes_, then a hard label
    test_probs = model.predict_proba(X_test)[:, 1]
    # from prior analysis, the target outcomes are nearly 1:1, making 0.5 an appropriate threshold
    test_preds = (test_probs >= 0.5).astype(int)

    # keep the raw probabilities and the two summary metrics
    predictions_by_year[test_year] = test_probs.tolist()
    accuracy_by_year[test_year] = accuracy_score(y_test, test_preds)
    auc_by_year[test_year] = roc_auc_score(y_test, test_probs)

# report accuracy and AUC for each evaluated year, in chronological order
for year in sorted(accuracy_by_year):
    print(f"Year {year}: Accuracy = {accuracy_by_year[year]:.2%}, AUC = {auc_by_year[year]:.3f}")

Iteration 1: Train Years = [2004, 2005, 2006], Validation Years = [2007, 2008], Test Year = 2009
Iteration 2: Train Years = [2005, 2006, 2007], Validation Years = [2008, 2009], Test Year = 2010
Iteration 3: Train Years = [2006, 2007, 2008], Validation Years = [2009, 2010], Test Year = 2011
Iteration 4: Train Years = [2007, 2008, 2009], Validation Years = [2010, 2011], Test Year = 2012
Iteration 5: Train Years = [2008, 2009, 2010], Validation Years = [2011, 2012], Test Year = 2013
Iteration 6: Train Years = [2009, 2010, 2011], Validation Years = [2012, 2013], Test Year = 2014
Iteration 7: Train Years = [2010, 2011, 2012], Validation Years = [2013, 2014], Test Year = 2015
Iteration 8: Train Years = [2011, 2012, 2013], Validation Years = [2014, 2015], Test Year = 2016
Iteration 9: Train Years = [2012, 2013, 2014], Validation Years = [2015, 2016], Test Year = 2017
Iteration 10: Train Years = [2013, 2014, 2015], Validation Years = [2016, 2017], Test Year = 2018
Iteration 11: Train Years = [

In [None]:
### Grid Search Method ###
# Walk-forward evaluation of a random forest whose hyperparameters are re-tuned
# for every window with GridSearchCV. For each test year the search runs on the
# five preceding years (3 "train" + 2 "validation", pooled) and the selected
# model is scored on the test year.
np.random.seed(643)
# Features and initialization
features = [col for col in data.columns if col.startswith("char_")]
predictions_by_year = {}
accuracy_by_year = {}
auc_by_year = {}
feature_importance_by_year = {}

min_year = data['year'].min()
max_year = data['year'].max()

# Candidate hyperparameters; identical for every window, so defined once.
param_grid = {
    'n_estimators': [250, 500, 1000],
    'max_depth': [5, 7, 9],
    'min_samples_leaf': [1, 3, 5],
    'max_features': ['sqrt']
}

counter = 1
# Rolling loop
for test_year in range(min_year + 5, max_year + 1):
    train_years = range(test_year - 5, test_year - 2)
    valid_years = range(test_year - 2, test_year)

    print(f"Iteration {counter}: Train Years = {list(train_years)}, Validation Years = {list(valid_years)}, Test Year = {test_year}")
    counter += 1

    train_data = data[data['year'].isin(train_years)]
    valid_data = data[data['year'].isin(valid_years)]
    test_data = data[data['year'] == test_year]

    # Skip windows where any slice has no rows
    if train_data.empty or valid_data.empty or test_data.empty:
        continue

    X_train = train_data[features]
    y_train = train_data['target']
    X_valid = valid_data[features]
    y_valid = valid_data['target']
    X_test = test_data[features]
    y_test = test_data['target']
    # Pool train and validation rows (train first) for the search and the final fit
    X_train_valid = pd.concat([X_train, X_valid])
    y_train_valid = pd.concat([y_train, y_valid])

    # Grid search on train + valid.
    # NOTE(review): with cv=3 the folds are drawn from the pooled rows and do not
    # follow the train-years / validation-years split defined above. If tuning is
    # meant to be out-of-time, consider a predefined split -- confirm intent.
    grid_search = GridSearchCV(
        estimator=RandomForestClassifier(random_state=643, n_jobs=-1),
        param_grid=param_grid,
        cv=3,
        scoring='roc_auc',
        verbose=3
    )

    grid_search.fit(X_train_valid, y_train_valid)
    best_params = grid_search.best_params_
    # GridSearchCV refits by default: best_estimator_ is a clone of the base
    # estimator (same random_state / n_jobs) with *all* of best_params applied,
    # fit on the full X_train_valid. Using it honours the tuned min_samples_leaf
    # (previously overridden with a hard-coded 1) and avoids fitting the same
    # forest a second time.
    model = grid_search.best_estimator_
    # Feature importance
    feature_importance_by_year[test_year] = pd.Series(model.feature_importances_, index=features)
    # Predictions: probability of the second class in model.classes_, thresholded at 0.5
    test_probs = model.predict_proba(X_test)[:, 1]
    test_preds = (test_probs >= 0.5).astype(int)
    # Store results
    predictions_by_year[test_year] = test_probs.tolist()
    accuracy_by_year[test_year] = accuracy_score(y_test, test_preds)
    auc_by_year[test_year] = roc_auc_score(y_test, test_probs)
# Print accuracy and AUC by year
for year in sorted(accuracy_by_year.keys()):
    print(f"Year {year}: Accuracy = {accuracy_by_year[year]:.2%}, AUC = {auc_by_year[year]:.3f}")

In [8]:
# Attach each test year's predicted probabilities to its own rows and export.
# NOTE(review): `predictions_by_year` holds the output of whichever modelling
# cell ran last; the output filename below says "entropy" -- confirm the
# intended model cell was the last one executed before running this cell.

# all rows from years that can be test years (five prior years available)
test_data = data[data['year'] >= (min_year + 5)].copy()

# Assign predictions year by year through a mask instead of concatenating all
# years and assigning positionally: positional assignment is only correct when
# `data` is already sorted by year, and would otherwise pass the length check
# while attaching probabilities to the wrong rows. Within a year, the stored
# list follows the row order of `data`, which the mask preserves.
test_data['prediction'] = np.nan
for year in sorted(predictions_by_year):
    year_mask = test_data['year'] == year
    year_probs = predictions_by_year[year]
    n_rows = int(year_mask.sum())
    if len(year_probs) != n_rows:
        raise ValueError(f"Length mismatch for year {year}: {len(year_probs)} predictions vs {n_rows} test rows")
    test_data.loc[year_mask, 'prediction'] = year_probs

# every selected row must belong to a year that produced predictions
covered = test_data['year'].isin(list(predictions_by_year))
if not covered.all():
    raise ValueError(f"Length mismatch: {int(covered.sum())} predictions vs {len(test_data)} test rows")

rf_results_df = test_data[['datadate', 'year', 'tic', 'holiday_return', 'spy_holiday_return', 'target', 'prediction']]
# save as csv file
rf_results_df.to_csv("rf_holiday_results_20251120v3_entropy.csv", index=False)