In [1]:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split,cross_val_score,StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,classification_report,precision_score, recall_score, f1_score, accuracy_score
from bayes_opt import BayesianOptimization

In [2]:
X = pd.read_csv('SPY_Daily.csv')
y = pd.read_csv('Actions.csv')

View the head of the source data

In [4]:
X.head()

Unnamed: 0,Date Time,Open,High,Low,Close,Change,200 Moving Average,100 Moving Average,50 Moving Average,Bollinger-Upper,Bollinger-Mid,Bollinger-Lower,RSI,RSI-Average,MACD,MACD-Signal,MACD-Histogram,Volume
0,02/05/2022,412.07,415.92,405.02,414.48,2.48,448.06405,447.4625,436.7374,461.656903,435.8995,410.142097,37.173459,36.455153,-11.623462,-7.360043,-4.263419,158312500
1,03/05/2022,415.01,418.93,413.36,416.38,1.9,448.0211,446.9311,436.3804,459.096391,433.8785,408.660609,38.708659,37.642485,-12.01109,-8.516318,-3.494772,100028100
2,04/05/2022,417.08,429.66,413.7099,429.06,12.68,448.0111,446.5582,436.3702,456.799243,432.78,408.760757,47.864633,40.43383,-11.720897,-9.416019,-2.304879,144247800
3,05/05/2022,424.55,425.0,409.44,413.81,-15.25,447.9074,445.9889,436.2074,455.648362,431.1445,406.640638,40.105029,39.811642,-12.487244,-10.21042,-2.276823,172929100
4,06/05/2022,411.1,414.8,405.73,411.34,-2.47,447.7868,445.4366,435.8682,453.823788,429.273,404.722212,39.002157,40.570787,-13.446538,-10.998326,-2.448212,151770800


Drop date time as daily seperation.
Drop change and Close as we are predicting the strategy and we will not know the close.

In [6]:
X=X.drop(['Date Time','Change','Close'],axis=1)
y = y['Act']
y.value_counts(normalize=True)

Act
BUY     0.720074
SELL    0.279926
Name: proportion, dtype: float64

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [8]:
X_train.columns

Index(['Open', 'High', 'Low', '200 Moving Average', '100 Moving Average',
       '50 Moving Average', 'Bollinger-Upper', 'Bollinger-Mid',
       'Bollinger-Lower', 'RSI', 'RSI-Average', 'MACD', 'MACD-Signal',
       'MACD-Histogram', 'Volume'],
      dtype='object')

In [9]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

In [10]:
rf.score(X_train, y_train)

1.0

In [11]:

print(classification_report(y_test, rf.predict(X_test)))

              precision    recall  f1-score   support

         BUY       0.99      0.96      0.97        99
        SELL       0.90      0.97      0.94        37

    accuracy                           0.96       136
   macro avg       0.94      0.97      0.95       136
weighted avg       0.97      0.96      0.96       136



The model is to be improved using Bayesian Optimisation. A useful resource was here [https://www.kaggle.com/code/neerajmohan/randomforest-model-with-bayesian-optimization] below is an implementation from here, expanding for all hyper parameters.
The hyper parameters that will be tweaked are:-
n_estimators
Max_features
max_depth
max_sample


In [13]:

# Define categorical mapping
criterion_mapping = {0: 'gini', 1: 'entropy', 2: 'log_loss'}

def stratified_kfold_score(clf,X,y,n_fold):
    X,y = X.values,y.values
    strat_kfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=1)
    accuracy_list = []

    for train_index, test_index in strat_kfold.split(X, y):
        x_train_fold, x_test_fold = X[train_index], X[test_index]
        y_train_fold, y_test_fold = y[train_index], y[test_index]
        clf.fit(x_train_fold, y_train_fold)
        preds = clf.predict(x_test_fold)
        accuracy_test = accuracy_score(preds,y_test_fold)
        accuracy_list.append(accuracy_test)

    return np.array(accuracy_list).mean()

def bo_params_rf(max_samples,n_estimators,max_features,max_depth,criterion):
    
    params = {
        'max_samples': max_samples,
        'max_depth': max_depth,
        'max_features':max_features,
        'n_estimators':int(n_estimators),
        'criterion': criterion
    }
    clf = RandomForestClassifier(criterion = criterion_mapping[int(round(criterion))],
                                 max_samples=params['max_samples'],
                                 max_depth=int(params['max_depth']),
                                 max_features=params['max_features'],
                                 n_estimators=params['n_estimators'])
    score = stratified_kfold_score(clf,X_train, y_train,5)
    return score

rf_bo = BayesianOptimization(bo_params_rf, {
                                            'criterion': (0,2),          #using mapping as BO does not work with categorical values
                                              'max_samples':(0.5,1),
                                                'max_features':(0.5,1),
                                              'n_estimators':(100,200),
                                                'max_depth':(10,50)
                                             })

In [14]:
results = rf_bo.maximize(n_iter=250, init_points=30)

|   iter    |  target   | criterion | max_depth | max_fe... | max_sa... | n_esti... |
-------------------------------------------------------------------------------------
| [39m1        [39m | [39m0.941    [39m | [39m0.641    [39m | [39m31.73    [39m | [39m0.559    [39m | [39m0.7827   [39m | [39m153.5    [39m |
| [35m2        [39m | [35m0.946    [39m | [35m1.917    [39m | [35m47.93    [39m | [35m0.5046   [39m | [35m0.9242   [39m | [35m166.3    [39m |
| [39m3        [39m | [39m0.9361   [39m | [39m0.6469   [39m | [39m24.94    [39m | [39m0.7937   [39m | [39m0.6781   [39m | [39m101.1    [39m |
| [39m4        [39m | [39m0.9312   [39m | [39m0.2429   [39m | [39m32.2     [39m | [39m0.5789   [39m | [39m0.5532   [39m | [39m153.7    [39m |
| [39m5        [39m | [39m0.9361   [39m | [39m1.303    [39m | [39m11.08    [39m | [39m0.9476   [39m | [39m0.5095   [39m | [39m198.3    [39m |
| [39m6        [39m | [39m0.9435   [39m | [

In [37]:
params = rf_bo.max['params']
params['n_estimators']= int(params['n_estimators'])
params['criterion'] = criterion_mapping[int(round(params['criterion']))]
params['max_depth']=int(params['max_depth'])
print(params)

{'criterion': 'log_loss', 'max_depth': 35, 'max_features': 0.6161733644459485, 'max_samples': 0.940087494327472, 'n_estimators': 106}


We now have the ideal values to use in our Random Forest. The next step is to create an ideal forest and report
ams)
{'criterion': 'log_loss', 'max_depth': 35, 'max_features': 0.6161733644459485, 'max_samples': 0.940087494327472, 'n_estimators': 106}