## Imports

In [None]:
import pandas as pd
#import pyspark as spark
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns

## Data Preprocessing

In [None]:

df_BTC_USD = pd.read_csv('BTC-USD.csv')
df_BTC_USD.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2021-10-12,57526.832031,57627.878906,54477.972656,56041.058594,56041.058594,41083758949
1,2021-10-13,56038.257813,57688.660156,54370.972656,57401.097656,57401.097656,41684252783
2,2021-10-14,57372.832031,58478.734375,56957.074219,57321.523438,57321.523438,36615791366
3,2021-10-15,57345.902344,62757.128906,56868.144531,61593.949219,61593.949219,51780081801
4,2021-10-16,61609.527344,62274.476563,60206.121094,60892.179688,60892.179688,34250964237


In [None]:
df_SP = pd.read_csv('SP500.csv')
df_SP.head()

Unnamed: 0,Date,Open,High,Low,Close*,Adj Close**,Volume
0,12/10/2021,4368.31,4374.89,4342.09,4350.65,4350.65,3558450000
1,13/10/2021,4358.01,4372.87,4329.92,4363.8,4363.8,3620070000
2,14/10/2021,4386.75,4439.73,4386.75,4438.26,4438.26,3598280000
3,15/10/2021,4447.69,4475.82,4447.69,4471.37,4471.37,3819380000
4,18/10/2021,4463.72,4488.75,4447.47,4486.46,4486.46,3662010000


In [None]:
df_InterestRate = pd.read_csv('Interest Rate.csv')
df_InterestRate.head()

Unnamed: 0,Date,Interest Rate
0,12/10/2021,0.08
1,13/10/2021,0.08
2,14/10/2021,0.08
3,15/10/2021,0.08
4,16/10/2021,0.08


In [None]:
#Reformat date column to make sure the date is consistent for df_SP
df_SP['Date'] = pd.to_datetime(df_SP['Date'], format='%d/%m/%Y')
df_SP['Date'] = df_SP['Date'].dt.strftime('%Y-%m-%d')

df_SP.head()

Unnamed: 0,Date,Open,High,Low,Close*,Adj Close**,Volume
0,2021-10-12,4368.31,4374.89,4342.09,4350.65,4350.65,3558450000
1,2021-10-13,4358.01,4372.87,4329.92,4363.8,4363.8,3620070000
2,2021-10-14,4386.75,4439.73,4386.75,4438.26,4438.26,3598280000
3,2021-10-15,4447.69,4475.82,4447.69,4471.37,4471.37,3819380000
4,2021-10-18,4463.72,4488.75,4447.47,4486.46,4486.46,3662010000


In [None]:
#Reformat date column to make sure the date is consistent for df_InterestRate
df_InterestRate['Date'] = pd.to_datetime(df_InterestRate['Date'], format='%d/%m/%Y')
df_InterestRate['Date'] = df_InterestRate['Date'].dt.strftime('%Y-%m-%d')
df_InterestRate.head()

Unnamed: 0,Date,Interest Rate
0,2021-10-12,0.08
1,2021-10-13,0.08
2,2021-10-14,0.08
3,2021-10-15,0.08
4,2021-10-16,0.08


In [None]:
df_BTC_USD = df_BTC_USD.rename(columns={'Volume': 'BTC-USD Volume', 'Open': 'BTC-USD Open', 'High': 'BTC-USD High', 'Low': 'BTC-USD Low', 'Adj Close':'BTC-USD Adj Close'})
df_SP = df_SP.rename(columns={'Volume': 'S&P500 Volume', 'Open': 'S&P500 Open', 'High': 'S&P500 High', 'Low': 'S&P500 Low', 'Adj Close**':'S&P500 Adj Close'})


# Merge df_BTC_USD with df_SP on the 'Date' column
merged_df = pd.merge(df_BTC_USD[['Date', 'Close', 'BTC-USD Volume', 'BTC-USD Open', 'BTC-USD High', 'BTC-USD Low', 'BTC-USD Adj Close']], df_SP[['Date', 'Close*','S&P500 Volume','S&P500 Open','S&P500 High','S&P500 Low', 'S&P500 Adj Close']], on='Date')

# Merge the result with df_InterestRate on the 'Date' column
merged_df = pd.merge(merged_df, df_InterestRate[['Date', 'Interest Rate']], on='Date')

merged_df.head()

Unnamed: 0,Date,Close,BTC-USD Volume,BTC-USD Open,BTC-USD High,BTC-USD Low,BTC-USD Adj Close,Close*,S&P500 Volume,S&P500 Open,S&P500 High,S&P500 Low,S&P500 Adj Close,Interest Rate
0,2021-10-12,56041.058594,41083758949,57526.832031,57627.878906,54477.972656,56041.058594,4350.65,3558450000,4368.31,4374.89,4342.09,4350.65,0.08
1,2021-10-13,57401.097656,41684252783,56038.257813,57688.660156,54370.972656,57401.097656,4363.8,3620070000,4358.01,4372.87,4329.92,4363.8,0.08
2,2021-10-14,57321.523438,36615791366,57372.832031,58478.734375,56957.074219,57321.523438,4438.26,3598280000,4386.75,4439.73,4386.75,4438.26,0.08
3,2021-10-15,61593.949219,51780081801,57345.902344,62757.128906,56868.144531,61593.949219,4471.37,3819380000,4447.69,4475.82,4447.69,4471.37,0.08
4,2021-10-18,62026.078125,38055562075,61548.804688,62614.660156,60012.757813,62026.078125,4486.46,3662010000,4463.72,4488.75,4447.47,4486.46,0.08


In [None]:
#Create a new column for the next day's BTC-USD closing price
# Shift the 'Close' column by one row to get the next day's Close value
merged_df['BTC-USD Next Day Close'] = merged_df['Close'].shift(-1)
merged_df

Unnamed: 0,Date,Close,BTC-USD Volume,BTC-USD Open,BTC-USD High,BTC-USD Low,BTC-USD Adj Close,Close*,S&P500 Volume,S&P500 Open,S&P500 High,S&P500 Low,S&P500 Adj Close,Interest Rate,BTC-USD Next Day Close
0,2021-10-12,56041.058594,41083758949,57526.832031,57627.878906,54477.972656,56041.058594,4350.65,3558450000,4368.31,4374.89,4342.09,4350.65,0.08,57401.097656
1,2021-10-13,57401.097656,41684252783,56038.257813,57688.660156,54370.972656,57401.097656,4363.80,3620070000,4358.01,4372.87,4329.92,4363.80,0.08,57321.523438
2,2021-10-14,57321.523438,36615791366,57372.832031,58478.734375,56957.074219,57321.523438,4438.26,3598280000,4386.75,4439.73,4386.75,4438.26,0.08,61593.949219
3,2021-10-15,61593.949219,51780081801,57345.902344,62757.128906,56868.144531,61593.949219,4471.37,3819380000,4447.69,4475.82,4447.69,4471.37,0.08,62026.078125
4,2021-10-18,62026.078125,38055562075,61548.804688,62614.660156,60012.757813,62026.078125,4486.46,3662010000,4463.72,4488.75,4447.47,4486.46,0.08,64261.992188
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
546,2023-12-13,42890.742188,26797884674,41468.464844,43429.781250,40676.867188,42890.742188,4707.09,5063650000,4646.20,4709.69,4643.23,4707.09,5.33,43023.972656
547,2023-12-14,43023.972656,25578530178,42884.261719,43390.859375,41767.089844,43023.972656,4719.55,6314040000,4721.04,4738.57,4694.34,4719.55,5.33,41929.757813
548,2023-12-15,41929.757813,19639442462,43028.250000,43087.824219,41692.968750,41929.757813,4719.19,8218980000,4714.23,4725.53,4704.69,4719.19,5.33,42623.539063
549,2023-12-18,42623.539063,25224642008,41348.203125,42720.296875,40530.257813,42623.539063,4740.56,4060340000,4725.58,4749.52,4725.58,4740.56,5.33,42270.527344


In [None]:
# Replace the NaN value in the last row of 'Next Day Close' column with 43,652.25
merged_df.loc[merged_df['Date'] == '2023-12-19', 'BTC-USD Next Day Close'] = 43652.25
merged_df

Unnamed: 0,Date,Close,BTC-USD Volume,BTC-USD Open,BTC-USD High,BTC-USD Low,BTC-USD Adj Close,Close*,S&P500 Volume,S&P500 Open,S&P500 High,S&P500 Low,S&P500 Adj Close,Interest Rate,BTC-USD Next Day Close
0,2021-10-12,56041.058594,41083758949,57526.832031,57627.878906,54477.972656,56041.058594,4350.65,3558450000,4368.31,4374.89,4342.09,4350.65,0.08,57401.097656
1,2021-10-13,57401.097656,41684252783,56038.257813,57688.660156,54370.972656,57401.097656,4363.80,3620070000,4358.01,4372.87,4329.92,4363.80,0.08,57321.523438
2,2021-10-14,57321.523438,36615791366,57372.832031,58478.734375,56957.074219,57321.523438,4438.26,3598280000,4386.75,4439.73,4386.75,4438.26,0.08,61593.949219
3,2021-10-15,61593.949219,51780081801,57345.902344,62757.128906,56868.144531,61593.949219,4471.37,3819380000,4447.69,4475.82,4447.69,4471.37,0.08,62026.078125
4,2021-10-18,62026.078125,38055562075,61548.804688,62614.660156,60012.757813,62026.078125,4486.46,3662010000,4463.72,4488.75,4447.47,4486.46,0.08,64261.992188
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
546,2023-12-13,42890.742188,26797884674,41468.464844,43429.781250,40676.867188,42890.742188,4707.09,5063650000,4646.20,4709.69,4643.23,4707.09,5.33,43023.972656
547,2023-12-14,43023.972656,25578530178,42884.261719,43390.859375,41767.089844,43023.972656,4719.55,6314040000,4721.04,4738.57,4694.34,4719.55,5.33,41929.757813
548,2023-12-15,41929.757813,19639442462,43028.250000,43087.824219,41692.968750,41929.757813,4719.19,8218980000,4714.23,4725.53,4704.69,4719.19,5.33,42623.539063
549,2023-12-18,42623.539063,25224642008,41348.203125,42720.296875,40530.257813,42623.539063,4740.56,4060340000,4725.58,4749.52,4725.58,4740.56,5.33,42270.527344


In [None]:
# Create the 'Price Change Indicator' column based on the difference between 'Next Day Close' and 'Close'
merged_df['Price Change Indicator'] = pd.cut(merged_df['BTC-USD Next Day Close'] - merged_df['Close'], bins=[float('-inf'), 0, float('inf')], labels=['decrease', 'increase'])
merged_df

Unnamed: 0,Date,Close,BTC-USD Volume,BTC-USD Open,BTC-USD High,BTC-USD Low,BTC-USD Adj Close,Close*,S&P500 Volume,S&P500 Open,S&P500 High,S&P500 Low,S&P500 Adj Close,Interest Rate,BTC-USD Next Day Close,Price Change Indicator
0,2021-10-12,56041.058594,41083758949,57526.832031,57627.878906,54477.972656,56041.058594,4350.65,3558450000,4368.31,4374.89,4342.09,4350.65,0.08,57401.097656,increase
1,2021-10-13,57401.097656,41684252783,56038.257813,57688.660156,54370.972656,57401.097656,4363.80,3620070000,4358.01,4372.87,4329.92,4363.80,0.08,57321.523438,decrease
2,2021-10-14,57321.523438,36615791366,57372.832031,58478.734375,56957.074219,57321.523438,4438.26,3598280000,4386.75,4439.73,4386.75,4438.26,0.08,61593.949219,increase
3,2021-10-15,61593.949219,51780081801,57345.902344,62757.128906,56868.144531,61593.949219,4471.37,3819380000,4447.69,4475.82,4447.69,4471.37,0.08,62026.078125,increase
4,2021-10-18,62026.078125,38055562075,61548.804688,62614.660156,60012.757813,62026.078125,4486.46,3662010000,4463.72,4488.75,4447.47,4486.46,0.08,64261.992188,increase
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
546,2023-12-13,42890.742188,26797884674,41468.464844,43429.781250,40676.867188,42890.742188,4707.09,5063650000,4646.20,4709.69,4643.23,4707.09,5.33,43023.972656,increase
547,2023-12-14,43023.972656,25578530178,42884.261719,43390.859375,41767.089844,43023.972656,4719.55,6314040000,4721.04,4738.57,4694.34,4719.55,5.33,41929.757813,decrease
548,2023-12-15,41929.757813,19639442462,43028.250000,43087.824219,41692.968750,41929.757813,4719.19,8218980000,4714.23,4725.53,4704.69,4719.19,5.33,42623.539063,increase
549,2023-12-18,42623.539063,25224642008,41348.203125,42720.296875,40530.257813,42623.539063,4740.56,4060340000,4725.58,4749.52,4725.58,4740.56,5.33,42270.527344,decrease


In [None]:
# Rename the 'Close' and 'Close*' columns
merged_df = merged_df.rename(columns={'Close': 'BTC-USD Close', 'Close*': 'S&P500 Close'})
merged_df

Unnamed: 0,Date,BTC-USD Close,BTC-USD Volume,BTC-USD Open,BTC-USD High,BTC-USD Low,BTC-USD Adj Close,S&P500 Close,S&P500 Volume,S&P500 Open,S&P500 High,S&P500 Low,S&P500 Adj Close,Interest Rate,BTC-USD Next Day Close,Price Change Indicator
0,2021-10-12,56041.058594,41083758949,57526.832031,57627.878906,54477.972656,56041.058594,4350.65,3558450000,4368.31,4374.89,4342.09,4350.65,0.08,57401.097656,increase
1,2021-10-13,57401.097656,41684252783,56038.257813,57688.660156,54370.972656,57401.097656,4363.80,3620070000,4358.01,4372.87,4329.92,4363.80,0.08,57321.523438,decrease
2,2021-10-14,57321.523438,36615791366,57372.832031,58478.734375,56957.074219,57321.523438,4438.26,3598280000,4386.75,4439.73,4386.75,4438.26,0.08,61593.949219,increase
3,2021-10-15,61593.949219,51780081801,57345.902344,62757.128906,56868.144531,61593.949219,4471.37,3819380000,4447.69,4475.82,4447.69,4471.37,0.08,62026.078125,increase
4,2021-10-18,62026.078125,38055562075,61548.804688,62614.660156,60012.757813,62026.078125,4486.46,3662010000,4463.72,4488.75,4447.47,4486.46,0.08,64261.992188,increase
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
546,2023-12-13,42890.742188,26797884674,41468.464844,43429.781250,40676.867188,42890.742188,4707.09,5063650000,4646.20,4709.69,4643.23,4707.09,5.33,43023.972656,increase
547,2023-12-14,43023.972656,25578530178,42884.261719,43390.859375,41767.089844,43023.972656,4719.55,6314040000,4721.04,4738.57,4694.34,4719.55,5.33,41929.757813,decrease
548,2023-12-15,41929.757813,19639442462,43028.250000,43087.824219,41692.968750,41929.757813,4719.19,8218980000,4714.23,4725.53,4704.69,4719.19,5.33,42623.539063,increase
549,2023-12-18,42623.539063,25224642008,41348.203125,42720.296875,40530.257813,42623.539063,4740.56,4060340000,4725.58,4749.52,4725.58,4740.56,5.33,42270.527344,decrease


In [None]:
#Min-Max Normalization
# Extract the columns you want to normalize
#columns_to_normalize = ['BTC-USD Close', 'S&P500 Close', 'Interest Rate', 'BTC-USD Next Day Close']
columns_to_normalize = ['BTC-USD Close', 'S&P500 Close', 'BTC-USD Next Day Close', 'BTC-USD Volume', 'BTC-USD Open', 'BTC-USD High', 'BTC-USD Low', 'BTC-USD Adj Close', 'S&P500 Volume','S&P500 Open','S&P500 High','S&P500 Low', 'S&P500 Adj Close']

# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Fit the scaler to the selected columns
scaler.fit(merged_df[columns_to_normalize])

# Perform Min-Max normalization
merged_df[columns_to_normalize] = scaler.transform(merged_df[columns_to_normalize])

# Convert 'Date' column to datetime type if needed
merged_df['Date'] = pd.to_datetime(merged_df['Date'])

# Convert 'Price Change Indicator' column to binary format
merged_df['Price Change Indicator'] = merged_df['Price Change Indicator'].apply(lambda x: 1 if str(x) == 'increase' else 0)
#merged_df.drop(columns=['BTC-USD Next Day Close'], inplace=True)

merged_df

Unnamed: 0,Date,BTC-USD Close,BTC-USD Volume,BTC-USD Open,BTC-USD High,BTC-USD Low,BTC-USD Adj Close,S&P500 Close,S&P500 Volume,S&P500 Open,S&P500 High,S&P500 Low,S&P500 Adj Close,Interest Rate,BTC-USD Next Day Close,Price Change Indicator
0,2021-10-12,0.777407,0.289151,0.806386,0.787543,0.765589,0.777407,0.634359,0.248737,0.660317,0.633366,0.660098,0.634359,0.08,0.803673,1
1,2021-10-13,0.803673,0.294630,0.777631,0.788700,0.763482,0.803673,0.645142,0.256724,0.652296,0.631697,0.650653,0.645142,0.08,0.802136,0
2,2021-10-14,0.802136,0.248385,0.803411,0.803739,0.814407,0.802136,0.706198,0.253900,0.674677,0.686940,0.694760,0.706198,0.08,0.884648,1
3,2021-10-15,0.884648,0.386746,0.802891,0.885175,0.812656,0.884648,0.733348,0.282559,0.722133,0.716760,0.742056,0.733348,0.08,0.892993,1
4,2021-10-18,0.892993,0.261522,0.884079,0.882464,0.874578,0.892993,0.745722,0.262160,0.734616,0.727443,0.741886,0.745722,0.08,0.936175,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
546,2023-12-13,0.523439,0.158805,0.496184,0.517292,0.493823,0.523439,0.926636,0.443843,0.876719,0.909996,0.893819,0.926636,5.33,0.526013,1
547,2023-12-14,0.526013,0.147680,0.523533,0.516551,0.515291,0.526013,0.936853,0.605920,0.934999,0.933858,0.933486,0.936853,5.33,0.504880,0
548,2023-12-15,0.504880,0.093491,0.526314,0.510783,0.513832,0.504880,0.936558,0.852841,0.929696,0.923084,0.941519,0.936558,5.33,0.518279,1
549,2023-12-18,0.518279,0.144451,0.493861,0.503787,0.490936,0.518279,0.954081,0.313792,0.938535,0.942906,0.957732,0.954081,5.33,0.511461,0


In [None]:
merged_df.drop(columns=['BTC-USD Next Day Close'], inplace=True)
merged_df

Unnamed: 0,Date,BTC-USD Close,BTC-USD Volume,BTC-USD Open,BTC-USD High,BTC-USD Low,BTC-USD Adj Close,S&P500 Close,S&P500 Volume,S&P500 Open,S&P500 High,S&P500 Low,S&P500 Adj Close,Interest Rate,Price Change Indicator
0,2021-10-12,0.777407,0.289151,0.806386,0.787543,0.765589,0.777407,0.634359,0.248737,0.660317,0.633366,0.660098,0.634359,0.08,1
1,2021-10-13,0.803673,0.294630,0.777631,0.788700,0.763482,0.803673,0.645142,0.256724,0.652296,0.631697,0.650653,0.645142,0.08,0
2,2021-10-14,0.802136,0.248385,0.803411,0.803739,0.814407,0.802136,0.706198,0.253900,0.674677,0.686940,0.694760,0.706198,0.08,1
3,2021-10-15,0.884648,0.386746,0.802891,0.885175,0.812656,0.884648,0.733348,0.282559,0.722133,0.716760,0.742056,0.733348,0.08,1
4,2021-10-18,0.892993,0.261522,0.884079,0.882464,0.874578,0.892993,0.745722,0.262160,0.734616,0.727443,0.741886,0.745722,0.08,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
546,2023-12-13,0.523439,0.158805,0.496184,0.517292,0.493823,0.523439,0.926636,0.443843,0.876719,0.909996,0.893819,0.926636,5.33,1
547,2023-12-14,0.526013,0.147680,0.523533,0.516551,0.515291,0.526013,0.936853,0.605920,0.934999,0.933858,0.933486,0.936853,5.33,0
548,2023-12-15,0.504880,0.093491,0.526314,0.510783,0.513832,0.504880,0.936558,0.852841,0.929696,0.923084,0.941519,0.936558,5.33,1
549,2023-12-18,0.518279,0.144451,0.493861,0.503787,0.490936,0.518279,0.954081,0.313792,0.938535,0.942906,0.957732,0.954081,5.33,0


# K-Fold

### K = 10

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, roc_auc_score, f1_score


#time series k fold cross validation split where k=10
import math

def k_fold_time_series_blocking(k=10):
    merged_df.reset_index(drop=True, inplace=True)
    n=len(merged_df)
    fold_size=n//k
    train_size= math.floor(0.8 * fold_size)
    test_size=math.floor(0.2*fold_size)

    indexes=[]

    for i in range(k):
        start_train=i*fold_size
        end_train=start_train+train_size-1
        start_test=end_train+1
        end_test=start_test+test_size-1
        yield np.arange(start_train, end_train, dtype=int),np.arange(start_test, end_test, dtype=int)

In [None]:
max_features=len(merged_df.drop(columns=['Date','S&P500 Close','Interest Rate','BTC-USD Close','Price Change Indicator']).columns)

In [None]:
print(max_features)

10


### SVM - poly kernel

In [None]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict

def train_pipeline(n):


  pipeline = Pipeline([
      ('feature_selection', SelectKBest(mutual_info_regression,k=n)),
      ('svm', SVC(kernel='poly'))
  ])

  #  Define parameter grid for GridSearchCV
  param_grid = {

      'svm__C': [0.1, 1, 10],                # Regularization parameter
      'svm__degree': [2, 3, 4],              # Degree of the polynomial kernel
      'svm__coef0': [0.0, 1.0, 2.0]          # Independent term in the polynomial kernel
  }
  f1_scorer = make_scorer(f1_score)
  roc_auc_scorer = make_scorer(roc_auc_score)

  input=merged_df.drop(columns=['Date','Price Change Indicator'])
  out=merged_df['Price Change Indicator']

  # Perform grid search with cross-validation
  #select best estimator using aggregated f1 score over all k fold as defined by refit='f1'
  grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=k_fold_time_series_blocking(),scoring={'f1': f1_scorer, 'roc_auc': roc_auc_scorer}, refit='f1', verbose=0)
  grid_result = grid_search.fit(input, out)

  #retrieving and printing the selected features and hyperparameters
  selected_feature_names=[]
  selected_features_bool=grid_result.best_estimator_.named_steps['feature_selection'].get_support()
  for i in range(len(selected_features_bool)):
    if(selected_features_bool[i]):
      selected_feature_names.append(input.columns[i])

  print("Selected Features:", selected_feature_names)

  print("Best F1 Score:", grid_result.best_score_) #print best score
  print("Best Parameters:", grid_result.best_params_) #print best hyperparameters

#get best weights
  best_estimator = grid_result.best_estimator_

  # Access the RandomForestClassifier from the best estimator
  svm_model = best_estimator.named_steps['svm']


  return svm_model,grid_result.best_score_,selected_feature_names

In [None]:
all_weights={}
all_selected_features={}
max_f1_index=0
max_f1=0

for i in range(3,10):
  print("i: ",i)
  weights,best_f1,selected_features=train_pipeline(i) #i is number of features

  max_f1=max(max_f1,best_f1)
  if max_f1==best_f1:
    max_f1_index=i

  all_weights[i]=weights
  all_selected_features[i]=selected_features


print("index:",max_f1_index)
print("max_f1:",max_f1)

# all_weights = {}
# all_selected_features = {}
# all_f1_scores = {}  # Store F1 scores for each value of i

# for i in range(3, 10):
#     print("i: ", i)
#     weights, best_f1, selected_features = train_pipeline(i)  # i is number of features

#     # Store the weights and selected features
#     all_weights[i] = weights
#     all_selected_features[i] = selected_features
    
#     # Store the best F1 score
#     all_f1_scores[i] = best_f1

# # Find the maximum F1 score and its corresponding index
# max_f1_index = max(all_f1_scores, key=all_f1_scores.get)
# max_f1 = all_f1_scores[max_f1_index]

# print("index:", max_f1_index)
# print("max_f1:", max_f1)


i:  3
Selected Features: ['BTC-USD Open', 'BTC-USD High', 'Interest Rate']
Best F1 Score: 0.5114718614718614
Best Parameters: {'svm__C': 0.1, 'svm__coef0': 2.0, 'svm__degree': 4}
i:  4
Selected Features: ['BTC-USD Open', 'BTC-USD High', 'S&P500 Close', 'S&P500 Adj Close']
Best F1 Score: 0.49041154923507857
Best Parameters: {'svm__C': 10, 'svm__coef0': 2.0, 'svm__degree': 4}
i:  5
Selected Features: ['BTC-USD Open', 'BTC-USD High', 'S&P500 Close', 'S&P500 Adj Close', 'Interest Rate']
Best F1 Score: 0.49777751660104597
Best Parameters: {'svm__C': 10, 'svm__coef0': 2.0, 'svm__degree': 4}
i:  6
Selected Features: ['BTC-USD Open', 'BTC-USD High', 'S&P500 Close', 'S&P500 Low', 'S&P500 Adj Close', 'Interest Rate']
Best F1 Score: 0.45671191553544493
Best Parameters: {'svm__C': 1, 'svm__coef0': 2.0, 'svm__degree': 3}
i:  7
Selected Features: ['BTC-USD Open', 'BTC-USD High', 'S&P500 Close', 'S&P500 High', 'S&P500 Low', 'S&P500 Adj Close', 'Interest Rate']
Best F1 Score: 0.45231631113984055
Best 

## Save Weights

In [None]:
import pickle
poly_model=all_weights[max_f1_index]
with open('s&p_poly_svm_model_k10.pkl', 'wb') as f:
    pickle.dump(poly_model, f)

## Backtesting

In [None]:
# Use the loaded model to make predictions on new data

from sklearn.metrics import classification_report

df_backtest=pd.read_csv('sp_backtest.csv')#backtest dataset
df_backtest = df_backtest.rename(columns={'Volume': 'BTC-USD Volume', 'Open': 'BTC-USD Open', 'High': 'BTC-USD High', 'Low': 'BTC-USD Low', 'Close': 'BTC-USD Close', 'Adj Close':'BTC-USD Adj Close', 'SP_Open': 'S&P500 Open', 'SP_High': 'S&P500 High', 'SP_Low': 'S&P500 Low', 'SP_Close': 'S&P500 Close', 'SP_AdjClose': 'S&P500 Adj Close', 'SP_Volume': 'S&P500 Volume'})
df_backtest['Date'] = pd.to_datetime(df_backtest['Date'], format='%d/%m/%Y')
df_backtest['Date'] = df_backtest['Date'].dt.strftime('%Y-%m-%d')
df_backtest['BTC-USD Next Day Close'] = df_backtest['BTC-USD Close'].shift(-1)
df_backtest.loc[df_backtest['Date'] == '2020-07-02', 'BTC-USD Next Day Close'] = 9087.30
df_backtest['Price Change Indicator'] = pd.cut(df_backtest['BTC-USD Next Day Close'] - df_backtest['BTC-USD Close'], bins=[float('-inf'), 0, float('inf')], labels=['decrease', 'increase'])
df_backtest['Price Change Indicator'] = df_backtest['Price Change Indicator'].apply(lambda x: 1 if str(x) == 'increase' else 0)

with open('s&p_poly_svm_model_k10.pkl', 'rb') as f:
    trained_model = pickle.load(f)




selected_feats=all_selected_features[max_f1_index]
X_test = df_backtest[selected_feats]
df_backtest
y_true=df_backtest['Price Change Indicator']
predictions = trained_model.predict(X_test.values)

# Generate the classification report
report = classification_report(y_true, predictions)

# Print the classification report
print(report)


# for i in range(3, 10):

#     selected_feats = all_selected_features[i]
#     X_test = df_backtest[selected_feats]
#     y_true = df_backtest['Price Change Indicator']
#     predictions = all_weights[i].predict(X_test.values)

#     # Generate the classification report (optional)
#     report = classification_report(y_true, predictions)
#     print(f"Classification Report for i={i}:")
#     print(report)




              precision    recall  f1-score   support

           0       0.57      1.00      0.73        12
           1       0.00      0.00      0.00         9

    accuracy                           0.57        21
   macro avg       0.29      0.50      0.36        21
weighted avg       0.33      0.57      0.42        21

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### SVM - rbf kernel

In [None]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.pipeline import Pipeline
def svmRBFpipeline(n):
  pipeline = Pipeline([
      ('feature_selection', SelectKBest(mutual_info_regression,k=n)),
      ('svm', SVC(kernel='rbf'))
  ])
  param_grid = {

      'svm__C': [0.1, 1, 10],                # Regularization parameter
      'svm__gamma': [0.1, 1, 10, 100],

  }

  f1_scorer = make_scorer(f1_score)
  roc_auc_scorer = make_scorer(roc_auc_score)

  input=merged_df.drop(columns=['Date','S&P500 Close','Interest Rate','BTC-USD Close','Price Change Indicator'])
  out=merged_df['Price Change Indicator']

  # 5. Perform grid search with cross-validation
  grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=k_fold_time_series_blocking(),scoring={'f1': f1_scorer, 'roc_auc': roc_auc_scorer}, refit='f1', verbose=0)
  grid_result = grid_search.fit(input, out)

  selected_feature_names=[]
  selected_features_bool=grid_result.best_estimator_.named_steps['feature_selection'].get_support()
  for i in range(len(selected_features_bool)):
    if(selected_features_bool[i]):
      selected_feature_names.append(input.columns[i])

  print("Selected Features:", selected_feature_names)

  print("Best F1 Score:", grid_result.best_score_)
  print("Best Parameters:", grid_result.best_params_)
        #get best weights
  best_estimator = grid_result.best_estimator_

  # Access the RandomForestClassifier from the best estimator
  svm_model = best_estimator.named_steps['svm']


  return svm_model,grid_result.best_score_,selected_feature_names

In [None]:
all_weights={}
all_selected_features={}
max_f1_index=0
max_f1=0

for i in range(3,10):
  print("i: ",i)
  weights,best_f1,selected_features=train_pipeline(i) #i is number of features

  max_f1=max(max_f1,best_f1)
  if max_f1==best_f1:
    max_f1_index=i

  all_weights[i]=weights
  all_selected_features[i]=selected_features


print("index:",max_f1_index)
print("max_f1:",max_f1)



i:  3
Selected Features: ['BTC-USD Open', 'BTC-USD High', 'Interest Rate']
Best F1 Score: 0.5023809523809524
Best Parameters: {'svm__C': 0.1, 'svm__coef0': 2.0, 'svm__degree': 4}
i:  4
Selected Features: ['BTC-USD Open', 'BTC-USD High', 'S&P500 Adj Close', 'Interest Rate']
Best F1 Score: 0.523803157626687
Best Parameters: {'svm__C': 10, 'svm__coef0': 2.0, 'svm__degree': 4}
i:  5
Selected Features: ['BTC-USD Open', 'BTC-USD High', 'S&P500 Close', 'S&P500 Adj Close', 'Interest Rate']
Best F1 Score: 0.5182903371138665
Best Parameters: {'svm__C': 10, 'svm__coef0': 2.0, 'svm__degree': 4}
i:  6
Selected Features: ['BTC-USD Open', 'BTC-USD High', 'S&P500 Close', 'S&P500 Low', 'S&P500 Adj Close', 'Interest Rate']
Best F1 Score: 0.44718810601163533
Best Parameters: {'svm__C': 1, 'svm__coef0': 1.0, 'svm__degree': 3}
i:  7
Selected Features: ['BTC-USD Open', 'BTC-USD High', 'S&P500 Close', 'S&P500 High', 'S&P500 Low', 'S&P500 Adj Close', 'Interest Rate']
Best F1 Score: 0.45231631113984055
Best Pa

## Save Weights

In [None]:
import pickle
poly_model=all_weights[max_f1_index]
with open('s&p_rbf_svm_model_k10.pkl', 'wb') as f:
    pickle.dump(poly_model, f)

## Backtesting

In [None]:
# Use the loaded model to make predictions on new data

from sklearn.metrics import classification_report

df_backtest=pd.read_csv('sp_backtest.csv')#backtest dataset
df_backtest = df_backtest.rename(columns={'Volume': 'BTC-USD Volume', 'Open': 'BTC-USD Open', 'High': 'BTC-USD High', 'Low': 'BTC-USD Low', 'Close': 'BTC-USD Close', 'Adj Close':'BTC-USD Adj Close', 'SP_Open': 'S&P500 Open', 'SP_High': 'S&P500 High', 'SP_Low': 'S&P500 Low', 'SP_Close': 'S&P500 Close', 'SP_AdjClose': 'S&P500 Adj Close', 'SP_Volume': 'S&P500 Volume'})
df_backtest['Date'] = pd.to_datetime(df_backtest['Date'], format='%d/%m/%Y')
df_backtest['Date'] = df_backtest['Date'].dt.strftime('%Y-%m-%d')
df_backtest['BTC-USD Next Day Close'] = df_backtest['BTC-USD Close'].shift(-1)
df_backtest.loc[df_backtest['Date'] == '2020-07-02', 'BTC-USD Next Day Close'] = 9087.30
df_backtest['Price Change Indicator'] = pd.cut(df_backtest['BTC-USD Next Day Close'] - df_backtest['BTC-USD Close'], bins=[float('-inf'), 0, float('inf')], labels=['decrease', 'increase'])
df_backtest['Price Change Indicator'] = df_backtest['Price Change Indicator'].apply(lambda x: 1 if str(x) == 'increase' else 0)

with open('s&p_rbf_svm_model_k10.pkl', 'rb') as f:
    trained_model = pickle.load(f)

selected_feats=all_selected_features[max_f1_index]
X_test = df_backtest[selected_feats]
y_true=df_backtest['Price Change Indicator']
predictions = trained_model.predict(X_test.values)

# Generate the classification report
report = classification_report(y_true, predictions)

# Print the classification report
print(report)

              precision    recall  f1-score   support

           0       0.53      0.67      0.59        12
           1       0.33      0.22      0.27         9

    accuracy                           0.48        21
   macro avg       0.43      0.44      0.43        21
weighted avg       0.45      0.48      0.45        21



<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=84adc6f1-a70e-4d8d-9cc6-0672f4fd6bc6' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>