In [1]:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split,cross_val_score,StratifiedKFold
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,classification_report,precision_score, recall_score, f1_score, accuracy_score
from bayes_opt import BayesianOptimization
import time

ModuleNotFoundError: No module named 'timw'

In [None]:
"""
This method takes a fitted Random Forest and the data source, then displays the first decision tree of that forest
"""
def tree_plot(tree,data_source):
    """
    Draw the first decision tree of a fitted ensemble.

    Parameters
    ----------
    tree : fitted ensemble exposing ``estimators_`` (e.g. RandomForestClassifier)
        Only ``tree.estimators_[0]`` is drawn, not the whole forest.
    data_source : object with a ``columns`` attribute (e.g. pandas.DataFrame)
        Its column names are used to label the split features.
    """
    # Read the estimator first so an unfitted model fails before an empty figure is opened.
    first_estimator = tree.estimators_[0]

    plt.figure(figsize=(20, 20))
    plot_tree(first_estimator,
              # Pass a plain list: some scikit-learn releases validate this argument
              # as a list and reject a pandas Index.
              feature_names=list(data_source.columns),
              filled=True,
              rounded=True)

    plt.title("Random Forest")
    plt.show()

In [None]:
# Load the two source CSV files: the feature table into X, the action labels into y.
X, y = (pd.read_csv(csv_path) for csv_path in ('SPY_Daily.csv', 'Actions.csv'))

View the head of the source data

In [None]:
# Preview the first five rows of the feature table.
X.head(n=5)

Drop `Date Time`, since the rows are already separated by day.
Drop `Change` and `Close`, as we are predicting the strategy and will not know the close at prediction time.

In [None]:
# Remove the columns that must not be used as features.
X = X.drop(columns=['Date Time', 'Change', 'Close'])
# Keep only the label column as the target.
y = y.loc[:, 'Act']
# Relative frequency of each label.
y.value_counts(normalize=True)

In [None]:
# Hold out 25% of the rows for testing. A fixed random_state makes the shuffle,
# and therefore every score reported further down, repeatable after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

In [None]:
# Show the feature columns the model will be trained on.
X_train.columns

In [None]:
# Baseline model: default hyperparameters, seeded so the fitted forest
# (and the scores/plot derived from it) are the same on every run.
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train, y_train)

In [None]:
# Mean accuracy on the training data itself (not a held-out estimate).
rf.score(X_train, y_train)

In [None]:
# Classification report of the baseline forest on the held-out split
print(classification_report(y_true=y_test, y_pred=rf.predict(X_test)))

In [None]:
# Draw the first tree of the baseline forest, labelled with the feature column names.
tree_plot(tree=rf, data_source=X_test)

The model is to be improved using Bayesian Optimisation. A useful resource is [https://www.kaggle.com/code/neerajmohan/randomforest-model-with-bayesian-optimization]; the implementation below is based on it, expanded to cover all of the hyperparameters.
The hyperparameters that will be tuned are:
n_estimators
max_features
max_depth
max_samples
*criterion -- categorical input, see below

This has been expanded to handle a categorical hyperparameter within BO. By design BO searches continuous values only, so the category is defined as a map from integers to values, and the continuous value proposed by BO is rounded to an integer to select the category. For the random forest this trebles the number of possible combinations and includes the different split criteria in an automated fashion. Otherwise we would end up running BO for one category, e.g. 'gini', and discounting the others.

In [None]:

# Integer code -> split criterion name, so the optimiser can search a numeric range.
criterion_mapping = dict(enumerate(('gini', 'entropy', 'log_loss')))

def stratified_kfold_score(clf,X,y,n_fold):
    """
    Mean accuracy of ``clf`` over a shuffled, stratified k-fold split.

    Parameters
    ----------
    clf : estimator with ``fit`` and ``predict``
        Re-fitted in place on the training part of every fold.
    X : pandas DataFrame or NumPy array, one row per sample
    y : pandas Series or NumPy array of labels, aligned with ``X``
    n_fold : int
        Number of folds.

    Returns
    -------
    The mean of the per-fold accuracies.
    """
    # np.asarray accepts pandas objects and plain arrays alike, so callers are
    # no longer required to pass objects that expose ``.values``.
    features, labels = np.asarray(X), np.asarray(y)
    # Fixed seed: every candidate model is scored on the same folds.
    splitter = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=1)

    fold_accuracies = []
    for train_idx, test_idx in splitter.split(features, labels):
        clf.fit(features[train_idx], labels[train_idx])
        fold_preds = clf.predict(features[test_idx])
        # accuracy_score is documented as (y_true, y_pred), in that order.
        fold_accuracies.append(accuracy_score(labels[test_idx], fold_preds))

    return np.mean(fold_accuracies)

def bo_params_rf(max_samples,n_estimators,max_features,max_depth,criterion):
    """
    Objective for the Bayesian optimiser: 5-fold stratified CV accuracy of a
    random forest built from the proposed hyperparameters.

    The optimiser proposes every value as a float, so they are decoded here:
    ``n_estimators`` and ``max_depth`` are truncated to int, and ``criterion``
    is rounded to an integer code and looked up in the module-level
    ``criterion_mapping``. ``max_samples`` and ``max_features`` are passed
    through unchanged.

    Reads the module-level ``X_train`` and ``y_train`` as the data to score on.

    Returns
    -------
    The score returned by ``stratified_kfold_score``.
    """
    clf = RandomForestClassifier(
        criterion=criterion_mapping[int(round(criterion))],
        max_samples=max_samples,
        max_depth=int(max_depth),
        max_features=max_features,
        n_estimators=int(n_estimators),
        # Seeded so the same hyperparameters always give the same score;
        # otherwise the optimiser is fitting a noisy objective.
        random_state=1,
    )
    return stratified_kfold_score(clf, X_train, y_train, 5)

# Optimiser over the (lower, upper) search bounds of each hyperparameter.
rf_bo = BayesianOptimization(
    f=bo_params_rf,
    pbounds={
        'criterion': (0, 2),         # using mapping as BO does not work with categorical values
        'max_samples': (0.5, 1),
        'max_features': (0.5, 1),
        'n_estimators': (100, 200),
        'max_depth': (10, 50),
    },
    random_state=1,  # seed the optimiser's own sampling so a run can be repeated
)

In [None]:
# This records the time taken by the optimisation, in minutes.
# perf_counter is a monotonic clock, so the measurement is not affected by
# system clock adjustments during the (long) run.
start_time = time.perf_counter()
# maximize() does not return a result, so nothing is bound here; the best point
# found is read afterwards from rf_bo.max.
rf_bo.maximize(n_iter=600, init_points=60)  # Was 200,20 but with 3X the number of inputs this was trebled.
end_time = time.perf_counter()
total_time = (end_time - start_time) / 60


In [None]:
# Report the elapsed optimisation time (total_time is in minutes).
print("Execution time took ",total_time, " mins")

In [None]:
# Take the best point found by the optimiser and decode it the same way the
# objective does: truncate the integer-valued parameters and map the rounded
# criterion code back to its name.
params = dict(rf_bo.max['params'])
params.update(
    n_estimators=int(params['n_estimators']),
    criterion=criterion_mapping[int(round(params['criterion']))],
    max_depth=int(params['max_depth']),
)
print(params)

We now have the ideal values to use in our Random Forest. The next step is to create an ideal forest and report on its performance.
Output of print(params) from a previous run:
{'criterion': 'log_loss', 'max_depth': 35, 'max_features': 0.6161733644459485, 'max_samples': 0.940087494327472, 'n_estimators': 106}

In [None]:
# Set up and train the random forest with the tuned values.
rf_ideal = RandomForestClassifier(
    criterion=params['criterion'],
    max_samples=params['max_samples'],
    max_depth=int(params['max_depth']),
    max_features=params['max_features'],
    n_estimators=params['n_estimators'],
    # Seeded so the final report and tree plot are the same on every run.
    random_state=1,
)
rf_ideal.fit(X_train, y_train)

In [None]:
# Classification report of the tuned forest on the held-out split
print(classification_report(y_true=y_test, y_pred=rf_ideal.predict(X_test)))

From a previous run using the defaults
              precision    recall  f1-score   support

         BUY       0.99      0.96      0.97        99
        SELL       0.90      0.97      0.94        37

    accuracy                           0.96       136
   macro avg       0.94      0.97      0.95       136
weighted avg       0.97      0.96      0.96       136


In [None]:
# Draw the first tree of the tuned forest, labelled with the feature column names.
tree_plot(tree=rf_ideal, data_source=X_test)

This final tree is much simpler than the original one built with the defaults. Since there are fewer comparators, the result will be faster execution.