**K-fold=5**

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, roc_auc_score, f1_score
merged_df3=pd.read_csv('final_combined_df.csv')

#time series k fold cross validation split where k=5
import math

def k_fold_time_series_blocking(k=5):
    merged_df3.reset_index(drop=True, inplace=True)
    n=len(merged_df3)
    fold_size=n//k
    train_size= math.floor(0.8 * fold_size)
    test_size=math.floor(0.2*fold_size)

    indexes=[]

    for i in range(k):
        start_train=i*fold_size
        end_train=start_train+train_size-1
        start_test=end_train+1
        end_test=start_test+test_size-1
        yield np.arange(start_train, end_train, dtype=int),np.arange(start_test, end_test, dtype=int)

We will be automating the hyperparameter tuning (using GridSearch) and feature selection process (using SelectKbest)


for each of the type of models below:

1.Select n best features based on mutual_info_regression

2.pass these n features into the model and conduct grid search with cross-validation , select the best hyperparameters based on aggregated f1 score over k folds

3.for the best esimator , print its hyperparameters ,features selected as well as scores

4.repeat for different n ( 3 to max_number of feature columns in dataset)

**kernel=poly**

In [2]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict
def train_pipeline(n):
  pipeline = Pipeline([
      ('feature_selection', SelectKBest(mutual_info_regression,k=n)),
      ('svm', SVC(kernel='poly'))
  ])

  # 4. Define parameter grid for GridSearchCV
  param_grid = {

      'svm__C': [0.1, 1, 10],                # Regularization parameter
      'svm__degree': [2, 3, 4],              # Degree of the polynomial kernel
      'svm__coef0': [0.0, 1.0, 2.0]          # Independent term in the polynomial kernel
  }
  f1_scorer = make_scorer(f1_score)
  roc_auc_scorer = make_scorer(roc_auc_score)

  input=merged_df3.drop(columns=['Date','next_day_close_increased'])
  out=merged_df3['next_day_close_increased']

  # 5. Perform grid search with cross-validation
  grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=k_fold_time_series_blocking(),scoring={'f1': f1_scorer, 'roc_auc': roc_auc_scorer}, refit='f1', verbose=0)
  grid_result = grid_search.fit(input, out)

  selected_feature_names=[]
  selected_features_bool=grid_result.best_estimator_.named_steps['feature_selection'].get_support()
  for i in range(len(selected_features_bool)):
    if(selected_features_bool[i]):
      selected_feature_names.append(input.columns[i])

  print("Selected Features:", selected_feature_names)

  print("Best F1 Score:", grid_result.best_score_)
  print("Best Parameters:", grid_result.best_params_)
  #get best weights
  best_estimator = grid_result.best_estimator_

  # Access the RandomForestClassifier from the best estimator
  svm_model = best_estimator.named_steps['svm']


  return svm_model,grid_result.best_score_,selected_feature_names


In [3]:
all_weights={}
all_selected_features={}
max_f1_index=0
max_f1=0

for i in range(3,26):
  print("i: ",i)
  weights,best_f1,selected_features=train_pipeline(i) #i is number of features

  max_f1=max(max_f1,best_f1)
  if max_f1==best_f1:
    max_f1_index=i

  all_weights[i]=weights
  all_selected_features[i]=selected_features


print("index:",max_f1_index)
print("max_f1:",max_f1)

i:  3
Selected Features: ['macd', 'BB_middle', 'sentiment_subjectivity']
Best F1 Score: 0.5466666666666666
Best Parameters: {'svm__C': 0.1, 'svm__coef0': 0.0, 'svm__degree': 4}
i:  4
Selected Features: ['Open', 'macd', 'BB_middle', 'sentiment_subjectivity']
Best F1 Score: 0.5463398692810457
Best Parameters: {'svm__C': 0.1, 'svm__coef0': 0.0, 'svm__degree': 4}
i:  5
Selected Features: ['Open', 'macd', 'BB_middle', 'High-Low', 'Interest Rate']
Best F1 Score: 0.4833333333333333
Best Parameters: {'svm__C': 1, 'svm__coef0': 0.0, 'svm__degree': 2}
i:  6
Selected Features: ['Open', 'macd', 'BB_middle', 'High-Low', 'OBV', 'sentiment_polarity']
Best F1 Score: 0.5047619047619047
Best Parameters: {'svm__C': 0.1, 'svm__coef0': 2.0, 'svm__degree': 3}
i:  7
Selected Features: ['Open', 'macd', 'BB_middle', 'High-Low', 'OBV', 'sentiment_subjectivity', 'Interest Rate']
Best F1 Score: 0.5065359477124183
Best Parameters: {'svm__C': 1, 'svm__coef0': 0.0, 'svm__degree': 2}
i:  8
Selected Features: ['Open',

In [4]:
import pickle
poly_model=all_weights[max_f1_index]
with open('poly_svm_model_k5.pkl', 'wb') as f:
    pickle.dump(poly_model, f)


In [5]:
# Use the loaded model to make predictions on new data

from sklearn.metrics import classification_report

df_backtest=pd.read_csv('All-merged-backtest.csv')#backtest dataset

with open('poly_svm_model_k5.pkl', 'rb') as f:
    trained_model = pickle.load(f)


selected_feats=all_selected_features[max_f1_index]
X_test = df_backtest[selected_feats]
y_true=df_backtest['next_day_close_increased']
predictions = trained_model.predict(X_test.values)

# Generate the classification report
report = classification_report(y_true, predictions)

# Print the classification report
print(report)

              precision    recall  f1-score   support

           0       0.77      0.67      0.71        15
           1       0.38      0.50      0.43         6

    accuracy                           0.62        21
   macro avg       0.57      0.58      0.57        21
weighted avg       0.66      0.62      0.63        21



**kernel=rbf**

In [6]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.pipeline import Pipeline
def train_pipeline(n):
  pipeline = Pipeline([
      ('feature_selection', SelectKBest(mutual_info_regression,k=n)),
      ('svm', SVC(kernel='rbf'))
  ])
  param_grid = {

      'svm__C': [0.1, 1, 10],                # Regularization parameter
      'svm__gamma': [0.1, 1, 10, 100],

  }

  f1_scorer = make_scorer(f1_score)
  roc_auc_scorer = make_scorer(roc_auc_score)

  input=merged_df3.drop(columns=['Date','next_day_close_increased'])
  out=merged_df3['next_day_close_increased']

  # 5. Perform grid search with cross-validation
  grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=k_fold_time_series_blocking(),scoring={'f1': f1_scorer, 'roc_auc': roc_auc_scorer}, refit='f1', verbose=0)
  grid_result = grid_search.fit(input, out)

  selected_feature_names=[]
  selected_features_bool=grid_result.best_estimator_.named_steps['feature_selection'].get_support()
  for i in range(len(selected_features_bool)):
    if(selected_features_bool[i]):
      selected_feature_names.append(input.columns[i])

  print("Selected Features:", selected_feature_names)

  print("Best F1 Score:", grid_result.best_score_)
  print("Best Parameters:", grid_result.best_params_)
      #get best weights
  best_estimator = grid_result.best_estimator_

  # Access the RandomForestClassifier from the best estimator
  svm_model = best_estimator.named_steps['svm']


  return svm_model,grid_result.best_score_,selected_feature_names

In [7]:
all_weights={}
all_selected_features={}
max_f1_index=0
max_f1=0

for i in range(3,26):
  print("i: ",i)
  weights,best_f1,selected_features=train_pipeline(i) #i is number of features

  max_f1=max(max_f1,best_f1)
  if max_f1==best_f1:
    max_f1_index=i

  all_weights[i]=weights
  all_selected_features[i]=selected_features


print("index:",max_f1_index)
print("max_f1:",max_f1)

i:  3
Selected Features: ['macd', 'BB_middle', 'sentiment_subjectivity']
Best F1 Score: 0.3861111111111111
Best Parameters: {'svm__C': 10, 'svm__gamma': 100}
i:  4
Selected Features: ['macd', 'BB_middle', 'sentiment_subjectivity', 'Interest Rate']
Best F1 Score: 0.37777777777777777
Best Parameters: {'svm__C': 10, 'svm__gamma': 10}
i:  5
Selected Features: ['Open', 'macd', 'BB_middle', 'High-Low', 'Interest Rate']
Best F1 Score: 0.4224577280304834
Best Parameters: {'svm__C': 10, 'svm__gamma': 10}
i:  6
Selected Features: ['Open', 'Adj Close', 'macd', 'BB_middle', 'High-Low', 'OBV']
Best F1 Score: 0.4027777777777778
Best Parameters: {'svm__C': 10, 'svm__gamma': 100}
i:  7
Selected Features: ['Open', 'Adj Close', 'macd', 'BB_middle', 'High-Low', 'OBV', 'sentiment_subjectivity']
Best F1 Score: 0.42020202020202024
Best Parameters: {'svm__C': 10, 'svm__gamma': 10}
i:  8
Selected Features: ['Open', 'Close', 'Adj Close', 'macd', 'BB_middle', 'High-Low', 'OBV', 'sentiment_subjectivity']
Best F1

In [8]:
import pickle
rbf_model=all_weights[max_f1_index]
with open('rbf_svm_model_k5.pkl', 'wb') as f:
    pickle.dump(rbf_model, f)

In [10]:
# Use the loaded model to make predictions on new data

from sklearn.metrics import classification_report

df_backtest=pd.read_csv('All-merged-backtest.csv')#backtest dataset

with open('rbf_svm_model_k5.pkl', 'rb') as f:
    trained_model = pickle.load(f)


selected_feats=all_selected_features[max_f1_index]
X_test = df_backtest[selected_feats]
y_true=df_backtest['next_day_close_increased']
predictions = trained_model.predict(X_test.values)

# Generate the classification report
report = classification_report(y_true, predictions)

# Print the classification report
print(report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        15
           1       0.29      1.00      0.44         6

    accuracy                           0.29        21
   macro avg       0.14      0.50      0.22        21
weighted avg       0.08      0.29      0.13        21



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
