**K-fold=5**

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, roc_auc_score, f1_score
# Training data shared by the CV splitter and train_pipeline defined below.
merged_df3 = pd.read_csv('final_combined_df.csv')

# Blocked time-series k-fold cross-validation split (default k=5)
import math

def k_fold_time_series_blocking(k=5, n_samples=None):
    """Yield blocked time-series cross-validation splits.

    Rows ``0 .. n_samples - 1`` are cut into ``k`` consecutive blocks of
    ``n_samples // k`` rows each.  Inside every block the first
    ``floor(0.8 * block_size)`` rows are the training indices and the
    following ``floor(0.2 * block_size)`` rows are the test indices, so the
    test rows of a block always come after its training rows.

    Parameters
    ----------
    k : int, default 5
        Number of blocks (folds).
    n_samples : int, optional
        Number of rows to split.  When omitted, the length of the
        notebook-level ``merged_df3`` frame is used.

    Yields
    ------
    (train_idx, test_idx) : tuple of numpy.ndarray
        Positional integer indices of the training and test rows of one block.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1, or if the blocks are too small to hold at
        least one training row and one test row.  Because this is a
        generator, the error surfaces when iteration starts.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    if n_samples is None:
        # Backward-compatible default: split the notebook's global frame.
        # The index reset is kept from the original implementation so the
        # frame's labels line up with the positions yielded below.
        merged_df3.reset_index(drop=True, inplace=True)
        n_samples = len(merged_df3)

    fold_size = n_samples // k
    train_size = math.floor(0.8 * fold_size)
    test_size = math.floor(0.2 * fold_size)
    if train_size < 1 or test_size < 1:
        raise ValueError(
            f"{n_samples} samples split into {k} folds leaves {fold_size} rows "
            "per fold, which is too few for an 80/20 train/test split"
        )

    for fold in range(k):
        train_start = fold * fold_size
        test_start = train_start + train_size
        test_stop = test_start + test_size
        # np.arange excludes its stop value, so the bounds are exclusive ends;
        # subtracting 1 here would drop the last row of each index array.
        yield (np.arange(train_start, test_start, dtype=int),
               np.arange(test_start, test_stop, dtype=int))

**For the model below:**

1. Select the n best features by mutual information with the target (`SelectKBest`).

2. Pass these n features into the model and run a grid search with cross-validation; select the best hyperparameters based on the F1 score aggregated over the k folds.

3. For the best estimator, print its hyperparameters, the selected features, and its scores.


4. Repeat for different n (from 3 up to the maximum number of feature columns in the dataset).

In [None]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier
def train_pipeline(n, random_state=None):
  """Grid-search an AdaBoost classifier on the ``n`` best features.

  A ``SelectKBest -> AdaBoostClassifier`` pipeline is tuned with
  ``GridSearchCV`` on the notebook-level ``merged_df3`` frame, using the folds
  produced by ``k_fold_time_series_blocking()``.  The winning configuration is
  the one with the best cross-validated F1 score; ROC AUC is recorded as a
  secondary metric in ``cv_results_``.

  Parameters
  ----------
  n : int
      Number of features kept by ``SelectKBest``.
  random_state : int or None, default None
      Seed forwarded to the mutual-information estimator and to AdaBoost.
      ``None`` keeps the original, unseeded behaviour.

  Returns
  -------
  ada_model : AdaBoostClassifier
      The fitted AdaBoost step of the refitted best pipeline.  It was trained
      on the selected columns only, so callers must pass exactly those
      columns, in the order given by ``selected_feature_names``.
  best_f1 : float
      Best cross-validated F1 score found by the grid search.
  selected_feature_names : list of str
      Names of the columns kept by the feature-selection step.
  """
  # Local imports so this cell does not depend on edits to the import cell.
  from functools import partial
  from sklearn.feature_selection import mutual_info_classif

  # The target is a binary class label, so features are ranked with the
  # classification variant of mutual information rather than the regression
  # one, which treats the target as a continuous variable.
  score_func = partial(mutual_info_classif, random_state=random_state)
  pipeline = Pipeline([
      ('feature_selection', SelectKBest(score_func, k=n)),
      ('adaboost', AdaBoostClassifier(random_state=random_state)),
  ])

  # Hyperparameter grid explored by GridSearchCV.
  param_grid = {
      'adaboost__n_estimators': [50, 100, 150],   # number of weak learners
      'adaboost__learning_rate': [0.1, 0.5, 1.0], # learning rate
  }

  # The built-in 'roc_auc' scorer ranks by the classifier's continuous
  # scores; wrapping roc_auc_score in make_scorer would feed it hard 0/1
  # predictions instead.
  scoring = {
      'f1': make_scorer(f1_score),
      'roc_auc': 'roc_auc',
  }

  features = merged_df3.drop(columns=['Date', 'next_day_close_increased'])
  target = merged_df3['next_day_close_increased']

  # Grid search with blocked time-series CV; refit on the best-F1 setting.
  grid_search = GridSearchCV(
      pipeline,
      param_grid=param_grid,
      cv=k_fold_time_series_blocking(),
      scoring=scoring,
      refit='f1',
      verbose=0,
  )
  grid_result = grid_search.fit(features, target)

  best_estimator = grid_result.best_estimator_

  # Map the boolean support mask back to column names.
  support_mask = best_estimator.named_steps['feature_selection'].get_support()
  selected_feature_names = features.columns[support_mask].tolist()

  print("Selected Features:", selected_feature_names)
  print("Best F1 Score:", grid_result.best_score_)
  print("Best Parameters:", grid_result.best_params_)

  # Pull the fitted AdaBoostClassifier out of the best pipeline.
  ada_model = best_estimator.named_steps['adaboost']

  return ada_model, grid_result.best_score_, selected_feature_names

In [None]:
# Sweep the number of selected features and remember, per feature count,
# the fitted model and the chosen columns.
all_weights, all_selected_features = {}, {}
max_f1_index, max_f1 = 0, 0

for i in range(3, 26):
  print("i: ",i)
  weights, best_f1, selected_features = train_pipeline(i)  # i is number of features

  all_weights[i], all_selected_features[i] = weights, selected_features

  # Track the best F1 so far; on a tie the later (larger) feature count wins.
  if best_f1 > max_f1:
    max_f1 = best_f1
  if best_f1 == max_f1:
    max_f1_index = i


print("index:",max_f1_index)
print("max_f1:",max_f1)

i:  3
Selected Features: ['Open', 'macd', 'BB_middle']
Best F1 Score: 0.5686618507051944
Best Parameters: {'adaboost__learning_rate': 0.5, 'adaboost__n_estimators': 50}
i:  4
Selected Features: ['Open', 'macd', 'BB_middle', 'sentiment_subjectivity']
Best F1 Score: 0.5442557442557442
Best Parameters: {'adaboost__learning_rate': 0.1, 'adaboost__n_estimators': 100}
i:  5
Selected Features: ['Open', 'macd', 'BB_middle', 'sentiment_polarity', 'sentiment_subjectivity']
Best F1 Score: 0.5405128205128205
Best Parameters: {'adaboost__learning_rate': 0.1, 'adaboost__n_estimators': 100}
i:  6
Selected Features: ['Open', 'macd', 'BB_middle', 'High-Low', 'OBV', 'sentiment_subjectivity']
Best F1 Score: 0.5513986013986013
Best Parameters: {'adaboost__learning_rate': 1.0, 'adaboost__n_estimators': 50}
i:  7
Selected Features: ['Open', 'Adj Close', 'macd', 'BB_middle', 'High-Low', 'OBV', 'sentiment_subjectivity']
Best F1 Score: 0.5340481740481741
Best Parameters: {'adaboost__learning_rate': 0.5, 'adabo

In [None]:
import pickle
# Persist the model belonging to the best-scoring feature count.
ada_model = all_weights[max_f1_index]
with open('ada_model_k5.pkl', 'wb') as f:
    f.write(pickle.dumps(ada_model))

In [None]:
# Use the loaded model to make predictions on new data

from sklearn.metrics import classification_report

# Backtest dataset.
df_backtest = pd.read_csv('All-merged-backtest.csv')

# Restore the model written to 'ada_model_k5.pkl'.
with open('ada_model_k5.pkl', 'rb') as f:
    trained_model = pickle.loads(f.read())

# Predict using only the columns chosen for the winning feature count.
selected_feats = all_selected_features[max_f1_index]
y_true = df_backtest['next_day_close_increased']
X_test = df_backtest[selected_feats]
predictions = trained_model.predict(X_test.values)

# Build and show the per-class precision / recall / F1 summary.
report = classification_report(y_true, predictions)
print(report)

              precision    recall  f1-score   support

           0       0.86      0.40      0.55        15
           1       0.36      0.83      0.50         6

    accuracy                           0.52        21
   macro avg       0.61      0.62      0.52        21
weighted avg       0.71      0.52      0.53        21



**k-fold=10**

In [None]:
def k_fold_time_series_blocking(k=10, n_samples=None):
    """Yield blocked time-series cross-validation splits (default ``k=10``).

    Rows ``0 .. n_samples - 1`` are cut into ``k`` consecutive blocks of
    ``n_samples // k`` rows each.  Inside every block the first
    ``floor(0.8 * block_size)`` rows are the training indices and the
    following ``floor(0.2 * block_size)`` rows are the test indices, so the
    test rows of a block always come after its training rows.

    Parameters
    ----------
    k : int, default 10
        Number of blocks (folds).
    n_samples : int, optional
        Number of rows to split.  When omitted, the length of the
        notebook-level ``merged_df3`` frame is used.

    Yields
    ------
    (train_idx, test_idx) : tuple of numpy.ndarray
        Positional integer indices of the training and test rows of one block.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1, or if the blocks are too small to hold at
        least one training row and one test row.  Because this is a
        generator, the error surfaces when iteration starts.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    if n_samples is None:
        # Backward-compatible default: split the notebook's global frame.
        # The index reset is kept from the original implementation so the
        # frame's labels line up with the positions yielded below.
        merged_df3.reset_index(drop=True, inplace=True)
        n_samples = len(merged_df3)

    fold_size = n_samples // k
    train_size = math.floor(0.8 * fold_size)
    test_size = math.floor(0.2 * fold_size)
    if train_size < 1 or test_size < 1:
        raise ValueError(
            f"{n_samples} samples split into {k} folds leaves {fold_size} rows "
            "per fold, which is too few for an 80/20 train/test split"
        )

    for fold in range(k):
        train_start = fold * fold_size
        test_start = train_start + train_size
        test_stop = test_start + test_size
        # np.arange excludes its stop value, so the bounds are exclusive ends;
        # subtracting 1 here would drop the last row of each index array.
        yield (np.arange(train_start, test_start, dtype=int),
               np.arange(test_start, test_stop, dtype=int))

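**Reuse the pipeline above** (`train_pipeline` calls `k_fold_time_series_blocking()` with no arguments, so it now picks up the k=10 default defined in the previous cell)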

In [None]:
# Sweep the number of selected features and remember, per feature count,
# the fitted model and the chosen columns.
all_weights, all_selected_features = {}, {}
max_f1_index, max_f1 = 0, 0

for i in range(3, 26):
  print("i: ",i)
  weights, best_f1, selected_features = train_pipeline(i)  # i is number of features

  all_weights[i], all_selected_features[i] = weights, selected_features

  # Track the best F1 so far; on a tie the later (larger) feature count wins.
  if best_f1 > max_f1:
    max_f1 = best_f1
  if best_f1 == max_f1:
    max_f1_index = i


print("index:",max_f1_index)
print("max_f1:",max_f1)

i:  3
Selected Features: ['macd', 'BB_middle', 'sentiment_subjectivity']
Best F1 Score: 0.5273809523809524
Best Parameters: {'adaboost__learning_rate': 0.1, 'adaboost__n_estimators': 150}
i:  4
Selected Features: ['Open', 'macd', 'BB_middle', 'sentiment_subjectivity']
Best F1 Score: 0.5035714285714286
Best Parameters: {'adaboost__learning_rate': 0.1, 'adaboost__n_estimators': 150}
i:  5
Selected Features: ['Open', 'macd', 'BB_middle', 'sentiment_subjectivity', 'Interest Rate']
Best F1 Score: 0.469047619047619
Best Parameters: {'adaboost__learning_rate': 0.5, 'adaboost__n_estimators': 150}
i:  6
Selected Features: ['Open', 'macd', 'BB_middle', 'High-Low', 'OBV', 'sentiment_subjectivity']
Best F1 Score: 0.46428571428571425
Best Parameters: {'adaboost__learning_rate': 0.5, 'adaboost__n_estimators': 100}
i:  7
Selected Features: ['Open', 'Adj Close', 'macd', 'BB_middle', 'High-Low', 'OBV', 'sentiment_subjectivity']
Best F1 Score: 0.4776190476190476
Best Parameters: {'adaboost__learning_rat

In [None]:
import pickle
# Persist the model belonging to the best-scoring feature count.
ada_model = all_weights[max_f1_index]
with open('ada_model_k10.pkl', 'wb') as f:
    f.write(pickle.dumps(ada_model))

In [None]:
# Use the loaded model to make predictions on new data

from sklearn.metrics import classification_report

# Backtest dataset.
df_backtest = pd.read_csv('All-merged-backtest.csv')

# Restore the model written to 'ada_model_k10.pkl'.
with open('ada_model_k10.pkl', 'rb') as f:
    trained_model = pickle.loads(f.read())

# Predict using only the columns chosen for the winning feature count.
selected_feats = all_selected_features[max_f1_index]
y_true = df_backtest['next_day_close_increased']
X_test = df_backtest[selected_feats]
predictions = trained_model.predict(X_test.values)

# Build and show the per-class precision / recall / F1 summary.
report = classification_report(y_true, predictions)
print(report)

              precision    recall  f1-score   support

           0       1.00      0.40      0.57        15
           1       0.40      1.00      0.57         6

    accuracy                           0.57        21
   macro avg       0.70      0.70      0.57        21
weighted avg       0.83      0.57      0.57        21

