In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, roc_auc_score, f1_score

In [None]:
# Load the combined dataset that the cells below train and cross-validate on.
merged_df3 = pd.read_csv(filepath_or_buffer='final_combined_df.csv')

In [None]:
# Print the column labels, then leave the column count as the cell's value.
print(merged_df3.columns)
merged_df3.shape[1]

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'RSI_14',
       'macd', 'macd_signal', 'BB_upper', 'BB_middle', 'BB_lower',
       'Close-Open', 'High-Low', 'OBV', 'BB_height',
       'next_day_close_increased', 'sentiment_polarity',
       'sentiment_subjectivity', 'Interest Rate', 'SP_Open', 'SP_High',
       'SP_Low', 'SP_Close', 'SP_AdjClose', 'SP_Volume'],
      dtype='object')


27

**K-fold=10**

In [None]:
#time series k fold cross validation split where k=10
import math

def k_fold_time_series_blocking(k=10, n_samples=None):
    """Yield blocked time-series cross-validation splits as positional indices.

    The rows are cut into ``k`` consecutive, non-overlapping blocks of
    ``n_samples // k`` rows each. Inside every block the first
    ``floor(0.8 * block_size)`` rows are the training indices and the
    following ``floor(0.2 * block_size)`` rows are the test indices, so the
    test rows of a block always come after its training rows.

    Parameters
    ----------
    k : int, default 10
        Number of blocks (folds) to generate.
    n_samples : int or None, default None
        Number of rows to split. When None, the length of the module-level
        ``merged_df3`` frame is used (its index is reset in place first, as
        the original implementation did).

    Yields
    ------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``(train_indices, test_indices)`` for one block, both integer arrays.
    """
    if n_samples is None:
        merged_df3.reset_index(drop=True, inplace=True)
        n_samples = len(merged_df3)

    fold_size = n_samples // k
    train_size = math.floor(0.8 * fold_size)
    test_size = math.floor(0.2 * fold_size)

    for i in range(k):
        start_train = i * fold_size
        start_test = start_train + train_size
        stop_test = start_test + test_size
        # np.arange excludes its stop value, so the stops are one past the last
        # row wanted. The previous code passed inclusive end positions here and
        # silently dropped the last train row and the last test row of each fold.
        yield (np.arange(start_train, start_test, dtype=int),
               np.arange(start_test, stop_test, dtype=int))

We automate hyperparameter tuning (with GridSearchCV) and feature selection (with SelectKBest).



**For each of the model types below:**

1. Select the n best features, ranked by `mutual_info_regression`.

2. Pass these n features into the model and run a grid search with cross-validation, selecting the best hyperparameters by the F1 score aggregated over the k folds.

3. For the best estimator, print its hyperparameters, the selected features, and its scores.


4. Repeat for each n, from 3 up to the number of feature columns in the dataset.

In [None]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
def train_pipeline(n, random_state=None):
  """Grid-search a SelectKBest + RandomForest pipeline and return the best forest.

  Features are every column of the module-level ``merged_df3`` except
  ``'Date'`` and ``'next_day_close_increased'``; the latter is the target.
  The pipeline keeps the ``n`` top-ranked features, fits a
  RandomForestClassifier on them, and is tuned with GridSearchCV over the
  splits produced by ``k_fold_time_series_blocking()``. The winning
  configuration is the one with the best cross-validated F1 (``refit='f1'``);
  ROC AUC is recorded alongside it.

  Parameters
  ----------
  n : int
      Number of features SelectKBest keeps.
  random_state : int or None, default None
      Optional seed forwarded to the feature scorer and the random forest.
      With the default None nothing is seeded, as before.

  Returns
  -------
  tuple
      ``(rf_model, best_f1, selected_feature_names)``: the fitted
      RandomForestClassifier step of the refit best pipeline, its
      cross-validated F1 score, and the names of the selected columns in
      dataset column order.
  """
  # NOTE(review): the target looks like a binary label, yet features are ranked
  # with mutual_info_regression; mutual_info_classif may be the intended scorer
  # - confirm before switching, since it changes which features are selected.
  if random_state is None:
    score_func = mutual_info_regression
  else:
    # Local import keeps this cell self-contained; partial only forwards the seed.
    from functools import partial
    score_func = partial(mutual_info_regression, random_state=random_state)

  pipeline = Pipeline([
      ('feature_selection', SelectKBest(score_func, k=n)),
      ('RF', RandomForestClassifier(random_state=random_state))
  ])

  # Hyperparameter grid explored by GridSearchCV
  param_grid = {
      'RF__n_estimators': [50, 100, 200],
      'RF__max_depth': [None, 10, 20],
      'RF__min_samples_split': [2, 5, 10],
      'RF__min_samples_leaf': [1, 2, 4]
  }

  # ROC AUC has to be computed from scores/probabilities. The previous
  # make_scorer(roc_auc_score) fed it hard predict() labels, which collapses
  # the curve to a single operating point. The built-in 'roc_auc' scorer uses
  # the classifier's probability output instead.
  scoring = {'f1': make_scorer(f1_score), 'roc_auc': 'roc_auc'}

  # Named features/target rather than input/out so the builtin input() is not shadowed.
  features = merged_df3.drop(columns=['Date', 'next_day_close_increased'])
  target = merged_df3['next_day_close_increased']

  # Grid search with blocked time-series cross-validation; refit on best F1
  grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=k_fold_time_series_blocking(),
                             scoring=scoring, refit='f1', verbose=0)
  grid_result = grid_search.fit(features, target)

  best_estimator = grid_result.best_estimator_

  # get_support() is a boolean mask aligned with the feature columns
  support_mask = best_estimator.named_steps['feature_selection'].get_support()
  selected_feature_names = features.columns[support_mask].tolist()

  print("Selected Features:", selected_feature_names)

  print("Best F1 Score:", grid_result.best_score_)
  print("Best Parameters:", grid_result.best_params_)

  # Only the forest step is returned; callers must feed it the selected columns.
  rf_model = best_estimator.named_steps['RF']

  return rf_model, grid_result.best_score_, selected_feature_names

In [None]:
# Sweep the number of selected features, keeping every fitted forest and its
# feature list, and track which feature count scored the best F1.
all_weights, all_selected_features = {}, {}
max_f1_index, max_f1 = 0, 0

for n_features in range(3, 26):
  print("i: ", n_features)
  fitted_rf, fold_f1, chosen_cols = train_pipeline(n_features)  # n_features columns are kept

  all_weights[n_features] = fitted_rf
  all_selected_features[n_features] = chosen_cols

  # On a tie the later (larger) feature count wins.
  max_f1 = max(max_f1, fold_f1)
  if fold_f1 == max_f1:
    max_f1_index = n_features


print("index:", max_f1_index)
print("max_f1:", max_f1)

i:  3
Selected Features: ['macd', 'BB_middle', 'sentiment_subjectivity']
Best F1 Score: 0.5784126984126985
Best Parameters: {'RF__max_depth': 10, 'RF__min_samples_leaf': 1, 'RF__min_samples_split': 10, 'RF__n_estimators': 50}
i:  4
Selected Features: ['Open', 'macd', 'BB_middle', 'sentiment_subjectivity']
Best F1 Score: 0.5166666666666667
Best Parameters: {'RF__max_depth': None, 'RF__min_samples_leaf': 1, 'RF__min_samples_split': 2, 'RF__n_estimators': 50}
i:  5
Selected Features: ['Open', 'macd', 'BB_middle', 'High-Low', 'sentiment_subjectivity']
Best F1 Score: 0.56
Best Parameters: {'RF__max_depth': 20, 'RF__min_samples_leaf': 2, 'RF__min_samples_split': 5, 'RF__n_estimators': 50}
i:  6
Selected Features: ['Open', 'macd', 'BB_middle', 'High-Low', 'OBV', 'sentiment_subjectivity']
Best F1 Score: 0.5714285714285714
Best Parameters: {'RF__max_depth': 10, 'RF__min_samples_leaf': 2, 'RF__min_samples_split': 2, 'RF__n_estimators': 100}
i:  7
Selected Features: ['Open', 'macd', 'BB_middle', 

**export weights**

In [None]:
import pickle
# Persist the forest belonging to the best-scoring feature count.
# Only the classifier is stored; the selected feature names stay in memory.
rf_model = all_weights[max_f1_index]
with open('random_forest_model_k10.pkl', mode='wb') as model_file:
    pickle.dump(rf_model, model_file)



**backtest**

In [None]:
# Use the loaded model to make predictions on new data

from sklearn.metrics import classification_report

# Backtest dataset
df_backtest = pd.read_csv(filepath_or_buffer='All-merged-backtest.csv')

# Reload the forest written by the export cell.
# NOTE: pickle.load can run arbitrary code - only open files this notebook wrote.
with open('random_forest_model_k10.pkl', mode='rb') as model_file:
    trained_model = pickle.load(model_file)

# Feed the model the columns selected for the best-scoring feature count,
# as a plain array.
selected_feats = all_selected_features[max_f1_index]
X_test = df_backtest.loc[:, selected_feats]
y_true = df_backtest['next_day_close_increased']
predictions = trained_model.predict(X_test.to_numpy())

# Build and show the per-class precision/recall/F1 summary
report = classification_report(y_true, y_pred=predictions)
print(report)

              precision    recall  f1-score   support

           0       0.78      0.47      0.58        15
           1       0.33      0.67      0.44         6

    accuracy                           0.52        21
   macro avg       0.56      0.57      0.51        21
weighted avg       0.65      0.52      0.54        21



**K-fold=5**

In [None]:
#time series k fold cross validation split where k=5
import math

def k_fold_time_series_blocking(k=5, n_samples=None):
    """Yield blocked time-series cross-validation splits as positional indices.

    Redefinition of the splitter with a default of five folds. The rows are
    cut into ``k`` consecutive, non-overlapping blocks of ``n_samples // k``
    rows each. Inside every block the first ``floor(0.8 * block_size)`` rows
    are the training indices and the following ``floor(0.2 * block_size)``
    rows are the test indices.

    Parameters
    ----------
    k : int, default 5
        Number of blocks (folds) to generate.
    n_samples : int or None, default None
        Number of rows to split. When None, the length of the module-level
        ``merged_df3`` frame is used (its index is reset in place first, as
        the original implementation did).

    Yields
    ------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``(train_indices, test_indices)`` for one block, both integer arrays.
    """
    if n_samples is None:
        merged_df3.reset_index(drop=True, inplace=True)
        n_samples = len(merged_df3)

    fold_size = n_samples // k
    train_size = math.floor(0.8 * fold_size)
    test_size = math.floor(0.2 * fold_size)

    for i in range(k):
        start_train = i * fold_size
        start_test = start_train + train_size
        stop_test = start_test + test_size
        # np.arange excludes its stop value, so the stops are one past the last
        # row wanted. The previous code passed inclusive end positions here and
        # silently dropped the last train row and the last test row of each fold.
        yield (np.arange(start_train, start_test, dtype=int),
               np.arange(start_test, stop_test, dtype=int))

**train and CV**

In [None]:
# Repeat the feature-count sweep with the 5-fold splitter now in scope,
# starting from fresh result containers.
all_weights, all_selected_features = {}, {}
max_f1_index, max_f1 = 0, 0

for n_features in range(3, 26):
  print("i: ", n_features)
  fitted_rf, fold_f1, chosen_cols = train_pipeline(n_features)  # n_features columns are kept

  all_weights[n_features] = fitted_rf
  all_selected_features[n_features] = chosen_cols

  # On a tie the later (larger) feature count wins.
  max_f1 = max(max_f1, fold_f1)
  if fold_f1 == max_f1:
    max_f1_index = n_features


print("index:", max_f1_index)
print("max_f1:", max_f1)

i:  3
Selected Features: ['Open', 'macd', 'BB_middle']
Best F1 Score: 0.5871345029239766
Best Parameters: {'RF__max_depth': 10, 'RF__min_samples_leaf': 4, 'RF__min_samples_split': 5, 'RF__n_estimators': 100}
i:  4
Selected Features: ['Open', 'macd', 'BB_middle', 'sentiment_subjectivity']
Best F1 Score: 0.5409806567701304
Best Parameters: {'RF__max_depth': 10, 'RF__min_samples_leaf': 4, 'RF__min_samples_split': 5, 'RF__n_estimators': 50}
i:  5
Selected Features: ['Open', 'macd', 'BB_middle', 'High-Low', 'sentiment_subjectivity']
Best F1 Score: 0.5529824561403508
Best Parameters: {'RF__max_depth': None, 'RF__min_samples_leaf': 4, 'RF__min_samples_split': 10, 'RF__n_estimators': 100}
i:  6
Selected Features: ['Open', 'macd', 'BB_middle', 'High-Low', 'OBV', 'sentiment_subjectivity']
Best F1 Score: 0.4982905982905983
Best Parameters: {'RF__max_depth': None, 'RF__min_samples_leaf': 1, 'RF__min_samples_split': 2, 'RF__n_estimators': 100}
i:  7
Selected Features: ['Open', 'Adj Close', 'macd', 

**export weights**

In [None]:
import pickle
# Persist the forest belonging to the best-scoring feature count of the 5-fold run.
# Only the classifier is stored; the selected feature names stay in memory.
rf_model = all_weights[max_f1_index]
with open('random_forest_model_k5.pkl', mode='wb') as model_file:
    pickle.dump(rf_model, model_file)

**backtest**

In [None]:
# Use the loaded model to make predictions on new data

from sklearn.metrics import classification_report

# Backtest dataset
df_backtest = pd.read_csv(filepath_or_buffer='All-merged-backtest.csv')

# Reload the forest written by the export cell.
# NOTE: pickle.load can run arbitrary code - only open files this notebook wrote.
with open('random_forest_model_k5.pkl', mode='rb') as model_file:
    trained_model = pickle.load(model_file)

# Feed the model the columns selected for the best-scoring feature count,
# as a plain array.
selected_feats = all_selected_features[max_f1_index]
X_test = df_backtest.loc[:, selected_feats]
y_true = df_backtest['next_day_close_increased']
predictions = trained_model.predict(X_test.to_numpy())

# Build and show the per-class precision/recall/F1 summary
report = classification_report(y_true, y_pred=predictions)
print(report)

              precision    recall  f1-score   support

           0       0.75      0.60      0.67        15
           1       0.33      0.50      0.40         6

    accuracy                           0.57        21
   macro avg       0.54      0.55      0.53        21
weighted avg       0.63      0.57      0.59        21

