In [43]:
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score as r2 
import shap, datetime,warnings
warnings.filterwarnings('ignore')

In [44]:
# Load the merged dataset from its pickle file.
# The context manager closes the file handle as soon as the object is loaded.
# NOTE: unpickling can execute arbitrary code, so only point this at a trusted file.
with open('../../data/processed/merged_data_finance.pkl', 'rb') as data_file:
    df = pickle.load(data_file)

In [45]:
# Column groups used to assemble the candidate feature sets.

# Sentiment group: columns whose name contains one of the markers below,
# followed by every column whose name starts with 'c' or 'v'.
sentiment_cols = (
    [col for col in df.columns if any(marker in col for marker in ['Article Count', 'Tone', 'llm'])]
    + [col for col in df.columns if col.startswith(('c', 'v'))]
)

# Calendar / session indicator columns, in the exact order the models expect.
time_cols = [
    'hour_of_day_10', 'hour_of_day_11', 'hour_of_day_12', 'hour_of_day_13',
    'hour_of_day_14', 'hour_of_day_15', 'hour_of_day_9',
    'is_close', 'is_open',
    'month_of_year_1', 'month_of_year_10', 'month_of_year_11', 'month_of_year_12',
    'month_of_year_2', 'month_of_year_3', 'month_of_year_4', 'month_of_year_5',
    'month_of_year_6', 'month_of_year_7', 'month_of_year_8', 'month_of_year_9',
    'day_of_week_0', 'day_of_week_1', 'day_of_week_2', 'day_of_week_3', 'day_of_week_4',
]

# Lagged columns, bucketed in a single pass over the frame's columns:
#   oil_vars          - any lag column mentioning 'BNO' (sentiment columns are not excluded here)
#   etf_finance_vars  - non-sentiment lag columns mentioning JETS / IYT / ITA
#   self_finance_vars - non-sentiment lag columns mentioning none of the four tickers
self_finance_vars, oil_vars, etf_finance_vars = [], [], []
for col in df.columns:
    if 'lag' not in col:
        continue
    mentions_oil = 'BNO' in col
    if mentions_oil:
        oil_vars.append(col)
    if col in sentiment_cols:
        continue
    if any(ticker in col for ticker in ['JETS', 'IYT', 'ITA']):
        etf_finance_vars.append(col)
    elif not mentions_oil:
        self_finance_vars.append(col)

finance_vars = self_finance_vars + oil_vars + etf_finance_vars

In [46]:
# Define the different sets of features to try
feature_sets = {
    'time_only': time_cols,
    'sentiment_only': sentiment_cols,
    'self_finance_only': self_finance_vars,
    'finance_only': finance_vars,
    'finance_time': finance_vars + time_cols,
    'all': sentiment_cols + finance_vars + time_cols 
}

In [47]:
# Target: the 'Volume' column, kept as a one-column DataFrame.
y_cols = ['Volume']
y = df.loc[:, y_cols]

In [48]:
# Train / validation / test split by row position:
# first 80% of rows -> train, next 10% -> validation, final 10% -> test.
n_rows = len(y)
split_val = round(n_rows * 0.8)
split_test = round(n_rows * 0.9)

y_train = y.iloc[:split_val]
y_val = y.iloc[split_val:split_test]
y_test = y.iloc[split_test:]

In [49]:
# Model families to evaluate, in the order their columns are reported.
models = [
    'OLS',
    'LASSO',
    'NeuralNet',
    'LightGBM',
]

In [50]:
# Frame that collects the actual target and every model's predictions for the
# rows from split_val onward (validation followed by test).
# The explicit .copy() makes it independent of `df`, so the prediction columns
# added later are not written to a slice of another frame; rename() returns a
# new frame rather than mutating in place.
predictions = (
    df[['date', 'ticker', 'Volume'] + time_cols]
    .iloc[split_val:, :]
    .copy()
    .rename(columns={'Volume': 'Actual'})
)

In [51]:
def _load_pickle(path):
    """Return the object unpickled from `path`, closing the file afterwards.

    NOTE: unpickling can execute arbitrary code; only load trusted files.
    """
    with open(path, 'rb') as fh:
        return pickle.load(fh)


def _scaled_splits(x, split_val, split_test):
    """Min-max scale `x` to [0, 1] and return the scaler with the three row splits.

    The scaler is fitted on rows [:split_val] only and then applied unchanged
    to rows [split_val:split_test] and [split_test:].

    Returns:
        (scaler, x_train, x_val, x_test), where the three splits are the
        scaler's transformed outputs.
    """
    scaler = MinMaxScaler(feature_range=(0, 1))
    x_train = scaler.fit_transform(x[:split_val])
    x_val = scaler.transform(x[split_val:split_test])
    x_test = scaler.transform(x[split_test:])
    return scaler, x_train, x_val, x_test


# One prediction column per (model, feature set) pair.
for m in models:
    for feature_set in feature_sets:
        model = _load_pickle(f'../../output/models/{m}/{m}_{feature_set}.pkl')

        x_cols = feature_sets[feature_set]
        x = df[x_cols]

        # Normalize the features to [0,1]
        sc2, x_train, x_val, x_test = _scaled_splits(x, split_val, split_test)

        # Validation rows first, then test rows: the same row order as `predictions`.
        p1 = model.predict(x_val)
        p2 = model.predict(x_test)
        p = np.concatenate((p1, p2), axis=0)

        predictions[f'Predicted ({m}) ({feature_set})'] = p

# Add the tuned and retrained models as well.
# Both use the 'all' feature set, so the scaled splits are computed once
# outside the loop instead of once per model.
x_cols = feature_sets['all']
x = df[x_cols]

# Normalize the features to [0,1]
sc2, x_train, x_val, x_test = _scaled_splits(x, split_val, split_test)

for desc in ['all_tuned', 'all_tuned_retrained']:
    model = _load_pickle(f'../../output/models/LightGBM/lightgbm_{desc}.pkl')

    p1 = model.predict(x_val)
    p2 = model.predict(x_test)
    p = np.concatenate((p1, p2), axis=0)

    predictions[f'Predicted (LightGBM) ({desc})'] = p

In [52]:
# Display the assembled frame: actual values, time indicator columns and one
# prediction column per model / feature-set combination.
predictions

Unnamed: 0,date,ticker,Actual,hour_of_day_10,hour_of_day_11,hour_of_day_12,hour_of_day_13,hour_of_day_14,hour_of_day_15,hour_of_day_9,...,Predicted (NeuralNet) (finance_time),Predicted (NeuralNet) (all),Predicted (LightGBM) (time_only),Predicted (LightGBM) (sentiment_only),Predicted (LightGBM) (self_finance_only),Predicted (LightGBM) (finance_only),Predicted (LightGBM) (finance_time),Predicted (LightGBM) (all),Predicted (LightGBM) (all_tuned),Predicted (LightGBM) (all_tuned_retrained)
270230,2023-12-04 12:45:00,AAL,1293240,0,0,1,0,0,0,0,...,1.249752e+06,1.062676e+06,2.166145e+05,853039.208070,1.466402e+06,1.467144e+06,1.444167e+06,1.398818e+06,1.273039e+06,1.385569e+06
270231,2023-12-04 12:45:00,ALGT,1447,0,0,1,0,0,0,0,...,3.557939e+03,9.692740e+03,2.166145e+05,117560.899842,3.084344e+03,2.405521e+03,5.587949e+03,3.406106e+03,1.775199e+03,3.620900e+03
270232,2023-12-04 12:45:00,ALK,587333,0,0,1,0,0,0,0,...,1.897455e+05,1.615639e+05,2.166145e+05,657044.943388,3.758347e+05,4.132884e+05,3.735391e+05,3.653477e+05,3.965052e+05,3.878293e+05
270233,2023-12-04 12:45:00,DAL,94989,0,0,1,0,0,0,0,...,1.144308e+05,1.294410e+05,2.166145e+05,276890.669042,1.747686e+05,1.862419e+05,1.748006e+05,1.693648e+05,1.612089e+05,1.758026e+05
270234,2023-12-04 12:45:00,JBLU,273608,0,0,1,0,0,0,0,...,4.161604e+05,3.706713e+05,2.166145e+05,830606.119024,5.119678e+05,5.256118e+05,4.896124e+05,4.796478e+05,4.580689e+05,4.958197e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
337783,2025-05-30 15:45:00,ALK,440444,0,0,0,0,0,1,0,...,6.316739e+05,7.893551e+05,1.063072e+06,205925.448478,3.283563e+05,3.981022e+05,5.539017e+05,5.802983e+05,5.299487e+05,4.925179e+05
337784,2025-05-30 15:45:00,DAL,1245698,0,0,0,0,0,1,0,...,1.020272e+06,9.146906e+05,1.063072e+06,376416.982429,1.007004e+06,1.023123e+06,1.061786e+06,1.058074e+06,1.242038e+06,1.076072e+06
337785,2025-05-30 15:45:00,JBLU,8400782,0,0,0,0,0,1,0,...,3.370909e+06,4.472642e+06,1.063072e+06,285688.984920,3.415540e+06,3.235173e+06,3.590649e+06,3.451630e+06,3.598861e+06,3.577430e+06
337786,2025-05-30 15:45:00,LUV,2026886,0,0,0,0,0,1,0,...,1.532227e+06,2.007167e+06,1.063072e+06,316754.581946,1.256809e+06,1.268855e+06,1.217692e+06,1.229052e+06,1.449506e+06,1.282050e+06


In [70]:
# Build the R^2 comparison table: one row per feature set, one column per model.
# `predictions` holds the validation and test rows together, so every score
# below is computed over that combined span.
score_rows = []
for feature_set in feature_sets:
    row = {'Dataset': feature_set}
    for m in models:
        pred_col = f'Predicted ({m}) ({feature_set})'
        # Combinations without a prediction column are left out and become NaN.
        if pred_col in predictions.columns:
            row[m] = r2(predictions['Actual'], predictions[pred_col])
    score_rows.append(row)

# The tuned LightGBM variants only have a LightGBM score. Keying each row by
# column name (rather than by a hard-coded row number and a positional list)
# keeps the table correct if feature sets or models are added or reordered.
for desc, label in [('all_tuned', 'All (Tuned)'),
                    ('all_tuned_retrained', 'All (Tuned, Retrained)')]:
    score_rows.append({
        'Dataset': label,
        'LightGBM': r2(predictions['Actual'], predictions[f'Predicted (LightGBM) ({desc})']),
    })

# 'Dataset' first, then the models in their listed order; keys missing from a
# row become NaN, so the score columns are numeric rather than object dtype.
r2_table = pd.DataFrame(score_rows, columns=['Dataset'] + models)

# clean Dataset column
r2_table['Dataset'] = (
    r2_table['Dataset']
    .str.replace('_', ' ')
    .str.title()
    .replace({'Self Finance Only': 'Self-Finance Only',
              'Finance Time': 'Finance + Time'})
)

r2_table = r2_table.rename(columns={'NeuralNet': 'Neural Network'})

r2_table

Unnamed: 0,Dataset,OLS,LASSO,Neural Network,LightGBM
0,Time Only,0.097313,0.09731,0.091623,0.09784
1,Sentiment Only,0.007938,0.031426,-0.041789,0.076203
2,Self-Finance Only,0.632985,0.632493,0.591168,0.681706
3,Finance Only,0.636136,0.636271,0.570163,0.686544
4,Finance + Time,0.64414,0.64437,0.638835,0.689313
5,All,0.647798,0.649229,0.638858,0.695218
6,All (Tuned),,,,0.700003
7,"All (Tuned, Retrained)",,,,0.700718


In [63]:
# NOTE(review): re-display of r2_table. This cell's execution count (63) is
# lower than that of the cell that builds the table (70), and its saved output
# still shows the raw column and dataset names, so the output looks stale.
# Re-run or remove this cell.
r2_table

Unnamed: 0,OLS,LASSO,NeuralNet,LightGBM,Dataset
0,0.097313,0.09731,0.091623,0.09784,time_only
1,0.007938,0.031426,-0.041789,0.076203,sentiment_only
2,0.632985,0.632493,0.591168,0.681706,self_finance_only
3,0.636136,0.636271,0.570163,0.686544,finance_only
4,0.64414,0.64437,0.638835,0.689313,finance_time
5,0.647798,0.649229,0.638858,0.695218,all
