In [None]:
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import r2_score as r2
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Open the pickle file
df = pickle.load(open('../../data/processed/merged_data_finance.pkl', 'rb'))

In [None]:
sentiment_cols = [i for i in df.columns if any(x in i for x in ['Article Count', 'Tone', 'llm'])] + [i for i in df.columns if i.startswith('c') or i.startswith('v')]
time_cols = ['hour_of_day_10','hour_of_day_11','hour_of_day_12','hour_of_day_13','hour_of_day_14','hour_of_day_15','hour_of_day_9','is_close','is_open','month_of_year_1','month_of_year_10','month_of_year_11','month_of_year_12','month_of_year_2','month_of_year_3','month_of_year_4','month_of_year_5','month_of_year_6','month_of_year_7','month_of_year_8','month_of_year_9','day_of_week_0','day_of_week_1','day_of_week_2','day_of_week_3','day_of_week_4']
self_finance_vars = [i for i in df.columns if 'lag' in i and i not in sentiment_cols and all(x not in i for x in ['BNO','JETS','IYT','ITA'])]
oil_vars          = [i for i in df.columns if 'lag' in i and 'BNO' in i]
etf_finance_vars  = [i for i in df.columns if 'lag' in i and i not in sentiment_cols and any(x in i for x in ['JETS','IYT','ITA'])]
finance_vars = self_finance_vars + oil_vars + etf_finance_vars

In [6]:
# Define the different sets of features to try
feature_sets = {
    'time_only': time_cols,
    'sentiment_only': sentiment_cols,
    'self_finance_only': self_finance_vars,
    'finance_only': finance_vars,
    'finance_time': finance_vars + time_cols,
    'all': sentiment_cols + finance_vars + time_cols 
}

In [7]:
y_cols = ['Volume']
y = df[y_cols]

In [8]:
# Train/test splitting
split_val  = round(0.8 * len(y))
split_test = round(0.9 * len(y))

y_train = y[:split_val]
y_val   = y[split_val:split_test]
y_test  = y[split_test:]

In [9]:
for feature_set in feature_sets:
    print(f'Processing feature set: {feature_set}')

    x_cols = feature_sets[feature_set]
    x = df[x_cols]
    x_train = x[:split_val]
    x_val   = x[split_val:split_test]
    x_test  = x[split_test:]

    # Normalize the features to [0,1]
    sc2 = MinMaxScaler(feature_range=(0, 1))

    x_train = sc2.fit_transform(x_train)
    x_val   = sc2.transform(x_val)
    x_test  = sc2.transform(x_test)

    ols = LinearRegression()
    ols.fit(x_train, y_train)
    pickle.dump(ols, open(f'../../output/models/ols/ols_{feature_set}.pkl', 'wb'))
    print(f"OLS: {ols.score(x_val, y_val)}, {ols.score(x_test, y_test)}")

    lasso = Lasso(
        alpha=1,
        selection='random',
    )

    lasso.fit(x_train, y_train)
    pickle.dump(lasso, open(f'../../output/models/lasso/lasso_{feature_set}.pkl', 'wb'))
    print(f"LASSO: {ols.score(x_val, y_val)}, {ols.score(x_test, y_test)}")

Processing feature set: time_only
OLS: 0.11967365776891914, 0.08185475787619945
LASSO: 0.11967365776891914, 0.08185475787619945
Processing feature set: sentiment_only
OLS: -0.04133655403918257, 0.03247526155527736
LASSO: -0.04133655403918257, 0.03247526155527736
Processing feature set: self_finance_only
OLS: 0.7122210391585042, 0.5872579468370565
LASSO: 0.7122210391585042, 0.5872579468370565
Processing feature set: finance_only
OLS: 0.7142655327917684, 0.5910419981329451
LASSO: 0.7142655327917684, 0.5910419981329451
Processing feature set: finance_time
OLS: 0.7219595449984467, 0.5992451301424944
LASSO: 0.7219595449984467, 0.5992451301424944
Processing feature set: all
OLS: 0.7209179190336554, 0.6055574313519705
LASSO: 0.7209179190336554, 0.6055574313519705
