**K=5**

initialization of k-fold function

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, roc_auc_score, f1_score
merged_df3=pd.read_csv('btc_only_df.csv')

#time series k fold cross validation split where k=10
import math

def k_fold_time_series_blocking(k=5):
    merged_df3.reset_index(drop=True, inplace=True)
    n=len(merged_df3)
    fold_size=n//k
    train_size= math.floor(0.8 * fold_size)
    test_size=math.floor(0.2*fold_size)

    indexes=[]

    for i in range(k):
        start_train=i*fold_size
        end_train=start_train+train_size-1
        start_test=end_train+1
        end_test=start_test+test_size-1
        yield np.arange(start_train, end_train, dtype=int),np.arange(start_test, end_test, dtype=int)

In [None]:
max_features=len(merged_df3.drop(columns=['Date','Close2','Close2-Close1','next_day_close_increased']).columns)


In [None]:
print(max_features)

16


**General structure for each type of models**

We will be automating the hyperparameter tuning (using GridSearch) and feature selection process (using SelectKbest)


for each of the type of models below:

1.Select n best features based on mutual_info_regression

2.pass these n features into the model and conduct grid search with cross-validation , select the best hyperparameters based on aggregated f1 score over k folds

3.for the best esimator , print its hyperparameters ,features selected as well as scores

4.repeat for different n ( 3 to max_number of feature columns in dataset)

**SVM-poly kernel**




In [None]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.pipeline import Pipeline
def svmPolypipeline(n):


  pipeline = Pipeline([
      ('feature_selection', SelectKBest(mutual_info_regression,k=n)),
      ('svm', SVC(kernel='poly'))
  ])

  #  Define parameter grid for GridSearchCV
  param_grid = {

      'svm__C': [0.1, 1, 10],                # Regularization parameter
      'svm__degree': [2, 3, 4],              # Degree of the polynomial kernel
      'svm__coef0': [0.0, 1.0, 2.0]          # Independent term in the polynomial kernel
  }
  f1_scorer = make_scorer(f1_score)
  roc_auc_scorer = make_scorer(roc_auc_score)

  input=merged_df3.drop(columns=['Date','Close2','Close2-Close1','next_day_close_increased'])
  out=merged_df3['next_day_close_increased']

  # Perform grid search with cross-validation
  #select best estimator using aggregated f1 score over all k fold as defined by refit='f1'
  grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=k_fold_time_series_blocking(),scoring={'f1': f1_scorer, 'roc_auc': roc_auc_scorer}, refit='f1', verbose=0)
  grid_result = grid_search.fit(input, out)

  #retrieving and printing the selected features and hyperparameters
  selected_feature_names=[]
  selected_features_bool=grid_result.best_estimator_.named_steps['feature_selection'].get_support()
  for i in range(len(selected_features_bool)):
    if(selected_features_bool[i]):
      selected_feature_names.append(input.columns[i])

  print("Selected Features:", selected_feature_names)

  print("Best F1 Score:", grid_result.best_score_) #print best score
  print("Best Parameters:", grid_result.best_params_) #print best hyperparameters

  #get best weights
  best_estimator = grid_result.best_estimator_

  # Access the RandomForestClassifier from the best estimator
  svm_model = best_estimator.named_steps['svm']


  return svm_model,grid_result.best_score_,selected_feature_names


In [None]:
all_weights={}
all_selected_features={}
max_f1_index=0
max_f1=0

for i in range(3,16):
  print("i: ",i)
  weights,best_f1,selected_features=svmPolypipeline(i) #i is number of features

  max_f1=max(max_f1,best_f1)
  if max_f1==best_f1:
    max_f1_index=i

  all_weights[i]=weights
  all_selected_features[i]=selected_features


print("index:",max_f1_index)
print("max_f1:",max_f1)

i:  3
Selected Features: ['Volume', 'macd', 'OBV']
Best F1 Score: 0.4501329131077415
Best Parameters: {'svm__C': 1, 'svm__coef0': 0.0, 'svm__degree': 4}
i:  4
Selected Features: ['Volume', 'macd', 'macd_signal', 'OBV']
Best F1 Score: 0.4118695729222045
Best Parameters: {'svm__C': 10, 'svm__coef0': 2.0, 'svm__degree': 4}
i:  5
Selected Features: ['Volume', 'macd', 'macd_signal', 'High-Low', 'OBV']
Best F1 Score: 0.4584864433811802
Best Parameters: {'svm__C': 10, 'svm__coef0': 2.0, 'svm__degree': 4}
i:  6
Selected Features: ['High', 'Volume', 'macd', 'macd_signal', 'High-Low', 'OBV']
Best F1 Score: 0.5128283481224657
Best Parameters: {'svm__C': 10, 'svm__coef0': 0.0, 'svm__degree': 4}
i:  7
Selected Features: ['High', 'Adj Close', 'Volume', 'macd', 'macd_signal', 'High-Low', 'OBV']
Best F1 Score: 0.5217997846974517
Best Parameters: {'svm__C': 10, 'svm__coef0': 0.0, 'svm__degree': 4}
i:  8
Selected Features: ['High', 'Close', 'Adj Close', 'Volume', 'macd', 'macd_signal', 'High-Low', 'OBV'

In [None]:
import pickle
poly_model=all_weights[max_f1_index]
with open('poly_svm_model_k5_BTC_only.pkl', 'wb') as f:
    pickle.dump(poly_model, f)

In [None]:
# Use the loaded model to make predictions on new data

from sklearn.metrics import classification_report

df_backtest=pd.read_csv('BTC-USD-backtest-cleaned-final.csv')#backtest dataset

with open('poly_svm_model_k5_BTC_only.pkl', 'rb') as f:
    trained_model = pickle.load(f)


selected_feats=all_selected_features[max_f1_index]
X_test = df_backtest[selected_feats]
y_true=df_backtest['next_day_close_increased']
predictions = trained_model.predict(X_test.values)

# Generate the classification report
report = classification_report(y_true, predictions)

# Print the classification report
print(report)

              precision    recall  f1-score   support

           0       0.74      0.78      0.76        18
           1       0.67      0.62      0.64        13

    accuracy                           0.71        31
   macro avg       0.70      0.70      0.70        31
weighted avg       0.71      0.71      0.71        31



**SVM-RBF**

In [None]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.pipeline import Pipeline
def svmRBFpipeline(n):
  pipeline = Pipeline([
      ('feature_selection', SelectKBest(mutual_info_regression,k=n)),
      ('svm', SVC(kernel='rbf'))
  ])
  param_grid = {

      'svm__C': [0.1, 1, 10],                # Regularization parameter
      'svm__gamma': [0.1, 1, 10, 100],

  }

  f1_scorer = make_scorer(f1_score)
  roc_auc_scorer = make_scorer(roc_auc_score)

  input=merged_df3.drop(columns=['Date','Close2','Close2-Close1','next_day_close_increased'])
  out=merged_df3['next_day_close_increased']

  # 5. Perform grid search with cross-validation
  grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=k_fold_time_series_blocking(),scoring={'f1': f1_scorer, 'roc_auc': roc_auc_scorer}, refit='f1', verbose=0)
  grid_result = grid_search.fit(input, out)

  selected_feature_names=[]
  selected_features_bool=grid_result.best_estimator_.named_steps['feature_selection'].get_support()
  for i in range(len(selected_features_bool)):
    if(selected_features_bool[i]):
      selected_feature_names.append(input.columns[i])

  print("Selected Features:", selected_feature_names)

  print("Best F1 Score:", grid_result.best_score_)
  print("Best Parameters:", grid_result.best_params_)
  #get best weights
  best_estimator = grid_result.best_estimator_

  # Access the RandomForestClassifier from the best estimator
  svm_model = best_estimator.named_steps['svm']


  return svm_model,grid_result.best_score_,selected_feature_names

In [None]:
all_weights={}
all_selected_features={}
max_f1_index=0
max_f1=0

for i in range(3,16):
  print("i: ",i)
  weights,best_f1,selected_features=svmRBFpipeline(i) #i is number of features

  max_f1=max(max_f1,best_f1)
  if max_f1==best_f1:
    max_f1_index=i

  all_weights[i]=weights
  all_selected_features[i]=selected_features


print("index:",max_f1_index)
print("max_f1:",max_f1)

i:  3
Selected Features: ['Volume', 'macd', 'OBV']
Best F1 Score: 0.4360793229758747
Best Parameters: {'svm__C': 10, 'svm__gamma': 100}
i:  4
Selected Features: ['Volume', 'macd', 'macd_signal', 'OBV']
Best F1 Score: 0.5296363636363636
Best Parameters: {'svm__C': 10, 'svm__gamma': 100}
i:  5
Selected Features: ['Volume', 'macd', 'macd_signal', 'High-Low', 'OBV']
Best F1 Score: 0.523253994412531
Best Parameters: {'svm__C': 10, 'svm__gamma': 100}
i:  6
Selected Features: ['High', 'Volume', 'macd', 'macd_signal', 'High-Low', 'OBV']
Best F1 Score: 0.4148004434589801
Best Parameters: {'svm__C': 10, 'svm__gamma': 100}
i:  7
Selected Features: ['High', 'Adj Close', 'Volume', 'macd', 'macd_signal', 'High-Low', 'OBV']
Best F1 Score: 0.42209564483762874
Best Parameters: {'svm__C': 10, 'svm__gamma': 10}
i:  8
Selected Features: ['High', 'Close', 'Adj Close', 'Volume', 'macd', 'macd_signal', 'High-Low', 'OBV']
Best F1 Score: 0.4275654691309825
Best Parameters: {'svm__C': 10, 'svm__gamma': 10}
i:  

In [None]:
import pickle
rbf_model=all_weights[max_f1_index]
with open('rbf_svm_model_k5_BTC_only.pkl', 'wb') as f:
    pickle.dump(rbf_model, f)

In [None]:
# Use the loaded model to make predictions on new data

from sklearn.metrics import classification_report

df_backtest=pd.read_csv('BTC-USD-backtest-cleaned-final.csv')#backtest dataset

with open('rbf_svm_model_k5_BTC_only.pkl', 'rb') as f:
    trained_model = pickle.load(f)


selected_feats=all_selected_features[max_f1_index]
X_test = df_backtest[selected_feats]
y_true=df_backtest['next_day_close_increased']
predictions = trained_model.predict(X_test.values)

# Generate the classification report
report = classification_report(y_true, predictions)

# Print the classification report
print(report)

              precision    recall  f1-score   support

           0       0.50      0.50      0.50        18
           1       0.31      0.31      0.31        13

    accuracy                           0.42        31
   macro avg       0.40      0.40      0.40        31
weighted avg       0.42      0.42      0.42        31



**RF**

In [None]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
def RFpipeline(n):

  pipeline = Pipeline([
      ('feature_selection', SelectKBest(mutual_info_regression,k=n)),
      ('RF', RandomForestClassifier())
  ])

  #  Define parameter grid for GridSearchCV
  param_grid = {
      'RF__n_estimators': [50, 100, 200],
      'RF__max_depth': [None, 10, 20],
      'RF__min_samples_split': [2, 5, 10],
      'RF__min_samples_leaf': [1, 2, 4]
  }
  f1_scorer = make_scorer(f1_score)
  roc_auc_scorer = make_scorer(roc_auc_score)

  input=merged_df3.drop(columns=['Date','Close2','Close2-Close1','next_day_close_increased'])
  out=merged_df3['next_day_close_increased']

  # 5. Perform grid search with cross-validation
  grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=k_fold_time_series_blocking(),scoring={'f1': f1_scorer, 'roc_auc': roc_auc_scorer}, refit='f1', verbose=0)
  grid_result = grid_search.fit(input, out)

  selected_feature_names=[]
  selected_features_bool=grid_result.best_estimator_.named_steps['feature_selection'].get_support()
  for i in range(len(selected_features_bool)):
    if(selected_features_bool[i]):
      selected_feature_names.append(input.columns[i])

  print("Selected Features:", selected_feature_names)

  print("Best F1 Score:", grid_result.best_score_)
  print("Best Parameters:", grid_result.best_params_)
  #get best weights
  best_estimator = grid_result.best_estimator_

  # Access the RandomForestClassifier from the best estimator
  rf_model = best_estimator.named_steps['RF']


  return rf_model,grid_result.best_score_,selected_feature_names

In [None]:
all_weights={}
all_selected_features={}
max_f1_index=0
max_f1=0

for i in range(3,16):
  print("i: ",i)
  weights,best_f1,selected_features=RFpipeline(i) #i is number of features

  max_f1=max(max_f1,best_f1)
  if max_f1==best_f1:
    max_f1_index=i

  all_weights[i]=weights
  all_selected_features[i]=selected_features


print("index:",max_f1_index)
print("max_f1:",max_f1)

i:  3
Selected Features: ['Volume', 'macd', 'OBV']
Best F1 Score: 0.3465767116441779
Best Parameters: {'RF__max_depth': 20, 'RF__min_samples_leaf': 1, 'RF__min_samples_split': 2, 'RF__n_estimators': 100}
i:  4
Selected Features: ['Volume', 'macd', 'macd_signal', 'OBV']
Best F1 Score: 0.30425609478241056
Best Parameters: {'RF__max_depth': 10, 'RF__min_samples_leaf': 1, 'RF__min_samples_split': 2, 'RF__n_estimators': 50}
i:  5
Selected Features: ['Volume', 'macd', 'macd_signal', 'High-Low', 'OBV']
Best F1 Score: 0.2992196209587514
Best Parameters: {'RF__max_depth': 10, 'RF__min_samples_leaf': 4, 'RF__min_samples_split': 2, 'RF__n_estimators': 50}
i:  6
Selected Features: ['High', 'Volume', 'macd', 'macd_signal', 'High-Low', 'OBV']
Best F1 Score: 0.25419321419321417
Best Parameters: {'RF__max_depth': 10, 'RF__min_samples_leaf': 1, 'RF__min_samples_split': 2, 'RF__n_estimators': 50}
i:  7
Selected Features: ['High', 'Adj Close', 'Volume', 'macd', 'macd_signal', 'High-Low', 'OBV']
Best F1 S

In [None]:
import pickle
rf_model=all_weights[max_f1_index]
with open('random_forest_model_k5_BTC_only.pkl', 'wb') as f:
    pickle.dump(rf_model, f)

In [None]:
# Use the loaded model to make predictions on new data

from sklearn.metrics import classification_report

df_backtest=pd.read_csv('BTC-USD-backtest-cleaned-final.csv')#backtest dataset

with open('random_forest_model_k5_BTC_only.pkl', 'rb') as f:
    trained_model = pickle.load(f)


selected_feats=all_selected_features[max_f1_index]
X_test = df_backtest[selected_feats]
y_true=df_backtest['next_day_close_increased']
predictions = trained_model.predict(X_test.values)

# Generate the classification report
report = classification_report(y_true, predictions)

# Print the classification report
print(report)

              precision    recall  f1-score   support

           0       0.70      0.39      0.50        18
           1       0.48      0.77      0.59        13

    accuracy                           0.55        31
   macro avg       0.59      0.58      0.54        31
weighted avg       0.61      0.55      0.54        31



**Adaboost**

In [None]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier
def adaboostpipeline(n):
  pipeline =  Pipeline([
      ('feature_selection', SelectKBest(mutual_info_regression,k=n)),
      ('adaboost', AdaBoostClassifier()) ])

  # 4. Define parameter grid for GridSearchCV
  param_grid = {

      'adaboost__n_estimators': [50, 100, 150], # Number of weak learners
      'adaboost__learning_rate': [0.1, 0.5, 1.0] # Learning rate
  }
  f1_scorer = make_scorer(f1_score)
  roc_auc_scorer = make_scorer(roc_auc_score)

  input=merged_df3.drop(columns=['Date','Close2','Close2-Close1','next_day_close_increased'])
  out=merged_df3['next_day_close_increased']

  # 5. Perform grid search with cross-validation
  grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=k_fold_time_series_blocking(),scoring={'f1': f1_scorer, 'roc_auc': roc_auc_scorer}, refit='f1', verbose=0)
  grid_result = grid_search.fit(input, out)

  selected_feature_names=[]
  selected_features_bool=grid_result.best_estimator_.named_steps['feature_selection'].get_support()
  for i in range(len(selected_features_bool)):
    if(selected_features_bool[i]):
      selected_feature_names.append(input.columns[i])

  print("Selected Features:", selected_feature_names)

  print("Best F1 Score:", grid_result.best_score_)
  print("Best Parameters:", grid_result.best_params_)
    #get best weights
  best_estimator = grid_result.best_estimator_

  # Access the RandomForestClassifier from the best estimator
  ada_model = best_estimator.named_steps['adaboost']


  return ada_model,grid_result.best_score_,selected_feature_names

In [None]:
all_weights={}
all_selected_features={}
max_f1_index=0
max_f1=0

for i in range(3,16):
  print("i: ",i)
  weights,best_f1,selected_features=adaboostpipeline(i) #i is number of features

  max_f1=max(max_f1,best_f1)
  if max_f1==best_f1:
    max_f1_index=i

  all_weights[i]=weights
  all_selected_features[i]=selected_features


print("index:",max_f1_index)
print("max_f1:",max_f1)

i:  3
Selected Features: ['Volume', 'macd', 'OBV']
Best F1 Score: 0.352770823379519
Best Parameters: {'adaboost__learning_rate': 0.5, 'adaboost__n_estimators': 150}
i:  4
Selected Features: ['Volume', 'macd', 'macd_signal', 'OBV']
Best F1 Score: 0.4186279461279462
Best Parameters: {'adaboost__learning_rate': 1.0, 'adaboost__n_estimators': 50}
i:  5
Selected Features: ['Volume', 'macd', 'macd_signal', 'High-Low', 'OBV']
Best F1 Score: 0.4337121212121212
Best Parameters: {'adaboost__learning_rate': 1.0, 'adaboost__n_estimators': 150}
i:  6
Selected Features: ['High', 'Volume', 'macd', 'macd_signal', 'High-Low', 'OBV']
Best F1 Score: 0.41392857142857153
Best Parameters: {'adaboost__learning_rate': 0.5, 'adaboost__n_estimators': 100}
i:  7
Selected Features: ['High', 'Adj Close', 'Volume', 'macd', 'macd_signal', 'High-Low', 'OBV']
Best F1 Score: 0.460700996027083
Best Parameters: {'adaboost__learning_rate': 1.0, 'adaboost__n_estimators': 150}
i:  8
Selected Features: ['High', 'Close', 'Adj

In [None]:
import pickle
ada_model=all_weights[max_f1_index]
with open('ada_model_k5_BTC_only.pkl', 'wb') as f:
    pickle.dump(ada_model, f)

In [None]:
# Use the loaded model to make predictions on new data

from sklearn.metrics import classification_report

df_backtest=pd.read_csv('BTC-USD-backtest-cleaned-final.csv')#backtest dataset

with open('ada_model_k5_BTC_only.pkl', 'rb') as f:
    trained_model = pickle.load(f)


selected_feats=all_selected_features[max_f1_index]
X_test = df_backtest[selected_feats]
y_true=df_backtest['next_day_close_increased']
predictions = trained_model.predict(X_test.values)

# Generate the classification report
report = classification_report(y_true, predictions)

# Print the classification report
print(report)

              precision    recall  f1-score   support

           0       0.70      0.39      0.50        18
           1       0.48      0.77      0.59        13

    accuracy                           0.55        31
   macro avg       0.59      0.58      0.54        31
weighted avg       0.61      0.55      0.54        31

