In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, QuantileTransformer
from sklearn.ensemble import HistGradientBoostingRegressor as hgbr
from sklearn.metrics import r2_score as r2 
import shap, datetime,warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the merged feature frame and the baseline volume predictions.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load artifacts produced by this project's own pipeline.
# The context managers close each file handle as soon as it has been read.
with open('../../data/processed/merged_data_finance.pkl', 'rb') as data_file:
    df = pickle.load(data_file)
with open('../../output/models/baseline predictions.pkl', 'rb') as baseline_file:
    baseline = pickle.load(baseline_file)

# Discard the stored index so `baseline` carries a fresh 0..n-1 index.
baseline = baseline.reset_index(drop=True)

In [3]:
# Attach the baseline predictions, then store Volume minus baseline as
# 'Volume_Shock'. Values that cannot be parsed as numbers become NaN.
df['baseline'] = baseline
df['Volume_Shock'] = pd.to_numeric(df['Volume'] - df['baseline'], errors='coerce')

In [4]:
# Columns whose name begins with a lowercase 'c' or 'v'; appended to both sentiment lists.
prefixed_cols = [col for col in df.columns if col.startswith(('c', 'v'))]

# Sentiment features: name contains one of the markers, plus the prefixed columns.
sentiment_cols = [
    col for col in df.columns
    if any(tag in col for tag in ('Article Count', 'Tone', 'llm'))
] + prefixed_cols
# Same selection without the 'llm' marker.
sentiment_cols_no_llm = [
    col for col in df.columns
    if any(tag in col for tag in ('Article Count', 'Tone'))
] + prefixed_cols

# Calendar / time-of-day indicator columns (order kept exactly as listed).
time_cols = [
    'hour_of_day_10', 'hour_of_day_11', 'hour_of_day_12', 'hour_of_day_13',
    'hour_of_day_14', 'hour_of_day_15', 'hour_of_day_9',
    'is_close', 'is_open',
    'month_of_year_1', 'month_of_year_10', 'month_of_year_11', 'month_of_year_12',
    'month_of_year_2', 'month_of_year_3', 'month_of_year_4', 'month_of_year_5',
    'month_of_year_6', 'month_of_year_7', 'month_of_year_8', 'month_of_year_9',
    'day_of_week_0', 'day_of_week_1', 'day_of_week_2', 'day_of_week_3', 'day_of_week_4',
]

# Every column with 'lag' in its name, and the subset that is not a sentiment column.
lagged_cols = [col for col in df.columns if 'lag' in col]
lagged_non_sentiment = [col for col in lagged_cols if col not in sentiment_cols]

# Lagged non-sentiment columns that mention none of the listed tickers.
self_finance_vars = [
    col for col in lagged_non_sentiment
    if not any(tkr in col for tkr in ('BNO', 'JETS', 'IYT', 'ITA'))
]
# Lagged columns mentioning 'BNO' (no sentiment exclusion is applied here).
oil_vars = [col for col in lagged_cols if 'BNO' in col]
# Lagged non-sentiment columns mentioning one of 'JETS', 'IYT', 'ITA'.
etf_finance_vars = [
    col for col in lagged_non_sentiment
    if any(tkr in col for tkr in ('JETS', 'IYT', 'ITA'))
]
finance_vars = self_finance_vars + oil_vars + etf_finance_vars

In [5]:
# Named feature combinations; the training loop fits one model per entry.
feature_sets = dict(
    time_only=time_cols,
    sentiment_only=sentiment_cols,
    self_finance_only=self_finance_vars,
    finance_only=finance_vars,
    finance_time=finance_vars + time_cols,
    all=sentiment_cols + finance_vars + time_cols,
)

# Same as the 'all' set but built from the sentiment list without the 'llm' marker.
all_no_llm = sentiment_cols_no_llm + finance_vars + time_cols

In [6]:
# Target: the 'Volume' column, kept as a single-column DataFrame.
y_cols = ['Volume']
y = df.loc[:, y_cols]

In [7]:
# Positional train / validation / test split: the first 80% of rows train,
# the next 10% validate, the final 10% test.
n_rows = len(y)
split_val, split_test = round(0.8 * n_rows), round(0.9 * n_rows)

y_train, y_val, y_test = y[:split_val], y[split_val:split_test], y[split_test:]

In [14]:
# Fit one gradient-boosting model per named feature set, persist it, and
# print its validation and test scores.
for feature_set, x_cols in feature_sets.items():
    print(f'Processing feature set: {feature_set}')

    x = df[x_cols]
    x_train = x[:split_val]
    x_val   = x[split_val:split_test]
    x_test  = x[split_test:]

    # Normalize the features to [0,1]. The scaler is fitted on the training
    # rows only and then applied unchanged to the validation and test rows.
    sc2 = MinMaxScaler(feature_range=(0, 1))

    x_train = sc2.fit_transform(x_train)
    x_val   = sc2.transform(x_val)
    x_test  = sc2.transform(x_test)

    model = hgbr(
        learning_rate=0.01,
        min_samples_leaf=200,
        l2_regularization=0,
        max_features=1.0,
        max_leaf_nodes=None,
        max_depth=None,
        early_stopping=True,
        scoring='r2',
        n_iter_no_change=10,
        verbose=2,
        max_iter = 2000,
        random_state=42
    )

    # The explicit validation split is handed to fit() alongside the training data.
    model.fit(x_train, y_train, X_val=x_val, y_val=y_val)

    # Persist the fitted model. The context manager closes the handle; the
    # previous bare open() left one unclosed write handle per feature set.
    # NOTE(review): the output directory is named 'lightgbm' although the
    # estimator comes from sklearn; path left unchanged for downstream readers.
    with open(f'../../output/models/lightgbm/lightgbm_{feature_set}.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)

    # Printed as: validation score, test score.
    print(f"{model.score(x_val, y_val)}, {model.score(x_test, y_test)}")

Processing feature set: time_only
Binning 0.056 GB of training data: 0.081 s
Binning 0.007 GB of validation data: 0.002 s
Fitting gradient boosted rounds:
[1/2000] 1 tree, 402 leaves, max depth = 22, train score: 0.00165, val score: 0.00096, in 0.051s
[2/2000] 1 tree, 404 leaves, max depth = 22, train score: 0.00330, val score: 0.00304, in 0.058s
[3/2000] 1 tree, 404 leaves, max depth = 22, train score: 0.00493, val score: 0.00508, in 0.058s
[4/2000] 1 tree, 404 leaves, max depth = 22, train score: 0.00652, val score: 0.00708, in 0.052s
[5/2000] 1 tree, 404 leaves, max depth = 22, train score: 0.00809, val score: 0.00904, in 0.051s
[6/2000] 1 tree, 404 leaves, max depth = 22, train score: 0.00962, val score: 0.01097, in 0.052s
[7/2000] 1 tree, 404 leaves, max depth = 22, train score: 0.01113, val score: 0.01286, in 0.050s
[8/2000] 1 tree, 404 leaves, max depth = 22, train score: 0.01260, val score: 0.01471, in 0.054s
[9/2000] 1 tree, 404 leaves, max depth = 22, train score: 0.01405, va

In [8]:
# Fit the model on every feature group except the 'llm' sentiment columns.
feature_set = all_no_llm

x_cols = feature_set
x = df[x_cols]
x_train = x[:split_val]
x_val   = x[split_val:split_test]
x_test  = x[split_test:]

# Normalize the features to [0,1]; the scaler is fitted on the training rows only.
sc2 = MinMaxScaler(feature_range=(0, 1))

x_train = sc2.fit_transform(x_train)
x_val   = sc2.transform(x_val)
x_test  = sc2.transform(x_test)

model = hgbr(
    learning_rate=0.01,
    min_samples_leaf=200,
    l2_regularization=0,
    max_features=1.0,
    max_leaf_nodes=None,
    max_depth=None,
    early_stopping=True,
    scoring='r2',
    n_iter_no_change=10,
    verbose=2,
    max_iter = 2000,
    random_state=42
)

model.fit(x_train, y_train, X_val=x_val, y_val=y_val)

# Persist the fitted model; the context manager closes the file handle that
# the previous bare open() left dangling.
with open('../../output/models/lightgbm/lightgbm_all_no_llm.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Printed as: validation score, test score.
print(f"{model.score(x_val, y_val)}, {model.score(x_test, y_test)}")

Binning 3.061 GB of training data: 6.801 s
Binning 0.383 GB of validation data: 0.338 s
Fitting gradient boosted rounds:
[1/2000] 1 tree, 1034 leaves, max depth = 21, train score: 0.01567, val score: 0.01307, in 4.200s
[2/2000] 1 tree, 1033 leaves, max depth = 21, train score: 0.03103, val score: 0.02709, in 4.076s
[3/2000] 1 tree, 1043 leaves, max depth = 20, train score: 0.04608, val score: 0.04083, in 4.072s
[4/2000] 1 tree, 1036 leaves, max depth = 21, train score: 0.06080, val score: 0.05422, in 4.247s
[5/2000] 1 tree, 1046 leaves, max depth = 21, train score: 0.07530, val score: 0.06736, in 4.183s
[6/2000] 1 tree, 1039 leaves, max depth = 23, train score: 0.08933, val score: 0.08002, in 4.148s
[7/2000] 1 tree, 1027 leaves, max depth = 21, train score: 0.10320, val score: 0.09273, in 4.091s
[8/2000] 1 tree, 1042 leaves, max depth = 21, train score: 0.11680, val score: 0.10488, in 4.098s
[9/2000] 1 tree, 1021 leaves, max depth = 21, train score: 0.13009, val score: 0.11690, in 3.99

In [None]:
# Sensitivity: current-time sentiment
feature_set = sentiment_cols

# Lag the y data so that it is the same time as the features: each row's
# target becomes the previous row's Volume within the same ticker.
df_new = df.copy()
# Select 'Volume' before shifting so only that column is shifted, instead of
# shifting every column of the frame and discarding all but one.
df_new['Volume_shifted'] = df_new.groupby('ticker')['Volume'].shift(1)
# Rows without a previous value (e.g. the first row of each ticker) are dropped.
df_new = df_new.dropna(subset=['Volume_shifted'])

# NOTE(review): split_val / split_test were computed from the row count before
# the dropna above, so the split proportions on df_new differ slightly from
# 80/10/10 — confirm this is acceptable.
y_cols = ['Volume_shifted']
y = df_new[y_cols]
y_train = y[:split_val]
y_val   = y[split_val:split_test]
y_test  = y[split_test:]

x_cols = feature_set
x = df_new[x_cols]
x_train = x[:split_val]
x_val   = x[split_val:split_test]
x_test  = x[split_test:]

# Normalize the features to [0,1]; the scaler is fitted on the training rows only.
sc2 = MinMaxScaler(feature_range=(0, 1))

x_train = sc2.fit_transform(x_train)
x_val   = sc2.transform(x_val)
x_test  = sc2.transform(x_test)

# Validation and test rows stacked into one evaluation set.
x_eval = np.concatenate((x_val, x_test), axis=0)
y_eval = np.concatenate((y_val, y_test), axis=0)

In [13]:
# Fit the model on the current-time sentiment arrays (x_train, y_train,
# x_val, y_val, x_eval, y_eval) that must already exist in the kernel.
model = hgbr(
    learning_rate=0.01,
    min_samples_leaf=200,
    l2_regularization=0,
    max_features=1.0,
    max_leaf_nodes=None,
    max_depth=None,
    early_stopping=True,
    scoring='r2',
    n_iter_no_change=10,
    verbose=2,
    max_iter = 2000,
    random_state=42
)

model.fit(x_train, y_train, X_val=x_val, y_val=y_val)

# Persist the fitted model; the context manager closes the file handle that
# the previous bare open() left dangling.
with open('../../output/models/lightgbm/lightgbm_current_time_sentiment.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Score on the stacked validation + test rows.
print(model.score(x_eval, y_eval))

Binning 1.621 GB of training data: 2.829 s
Binning 0.203 GB of validation data: 0.168 s
Fitting gradient boosted rounds:
[1/2000] 1 tree, 979 leaves, max depth = 32, train score: 0.00893, val score: 0.00160, in 1.622s
[2/2000] 1 tree, 976 leaves, max depth = 32, train score: 0.01790, val score: 0.00431, in 1.429s
[3/2000] 1 tree, 979 leaves, max depth = 31, train score: 0.02657, val score: 0.00707, in 1.463s
[4/2000] 1 tree, 980 leaves, max depth = 31, train score: 0.03526, val score: 0.00963, in 1.564s
[5/2000] 1 tree, 969 leaves, max depth = 30, train score: 0.04372, val score: 0.01197, in 1.447s
[6/2000] 1 tree, 988 leaves, max depth = 31, train score: 0.05203, val score: 0.01464, in 1.452s
[7/2000] 1 tree, 977 leaves, max depth = 27, train score: 0.06008, val score: 0.01687, in 1.557s
[8/2000] 1 tree, 978 leaves, max depth = 27, train score: 0.06821, val score: 0.01927, in 1.387s
[9/2000] 1 tree, 993 leaves, max depth = 30, train score: 0.07601, val score: 0.02150, in 1.406s
[10/20

In [102]:
# Sensitivity: volume shock
feature_set = feature_sets['all']

df_new = df.copy()

# Target is the 'Volume_Shock' column, cut at the shared positional boundaries.
y_cols = ['Volume_Shock']
y = df_new[y_cols]
y_train, y_val, y_test = y[:split_val], y[split_val:split_test], y[split_test:]

x_cols = feature_set
x = df_new[x_cols]
x_train, x_val, x_test = x[:split_val], x[split_val:split_test], x[split_test:]

# Scale features into [0,1]; fit on the training rows, reuse for the others.
sc2 = MinMaxScaler(feature_range=(0, 1))
x_train = sc2.fit_transform(x_train)
x_val = sc2.transform(x_val)
x_test = sc2.transform(x_test)

# Estimator settings for this run (learning rate 0.05).
hgbr_params = dict(
    learning_rate=0.05,
    min_samples_leaf=200,
    l2_regularization=0,
    max_features=1.0,
    max_leaf_nodes=None,
    max_depth=None,
    early_stopping=True,
    scoring='r2',
    n_iter_no_change=10,
    verbose=2,
    max_iter=2000,
    random_state=42,
)
model = hgbr(**hgbr_params)

model.fit(x_train, y_train, X_val=x_val, y_val=y_val)
# This model is not written to disk.

# Printed as: validation score, test score.
val_score = model.score(x_val, y_val)
test_score = model.score(x_test, y_test)
print(f"{val_score}, {test_score}")

Binning 3.407 GB of training data: 8.347 s
Binning 0.426 GB of validation data: 0.402 s
Fitting gradient boosted rounds:
[1/2000] 1 tree, 1050 leaves, max depth = 50, train score: 0.03950, val score: 0.02795, in 4.780s
[2/2000] 1 tree, 1042 leaves, max depth = 55, train score: 0.07507, val score: 0.05467, in 4.645s
[3/2000] 1 tree, 1057 leaves, max depth = 54, train score: 0.10505, val score: 0.07913, in 4.769s
[4/2000] 1 tree, 1040 leaves, max depth = 48, train score: 0.13559, val score: 0.10172, in 5.141s
[5/2000] 1 tree, 1042 leaves, max depth = 51, train score: 0.16037, val score: 0.12442, in 4.668s
[6/2000] 1 tree, 1049 leaves, max depth = 51, train score: 0.18620, val score: 0.14268, in 4.829s
[7/2000] 1 tree, 1060 leaves, max depth = 47, train score: 0.20832, val score: 0.15982, in 5.036s
[8/2000] 1 tree, 1034 leaves, max depth = 50, train score: 0.23003, val score: 0.17481, in 5.109s
[9/2000] 1 tree, 1036 leaves, max depth = 56, train score: 0.24976, val score: 0.19026, in 4.81

In [103]:
# Predicted volume shock for the test rows, and the baseline for those rows.
predict = model.predict(x_test)
avg = df['baseline'].iloc[split_test:]

# convert back to Volume (shock + baseline)
predict_vol = predict + avg

# Select the four needed columns and the test rows first, then copy only that
# small frame. This avoids duplicating the whole feature frame and avoids
# assigning a new column onto a slice of another frame.
results_df = df[['date', 'ticker', 'Volume', 'baseline']].iloc[split_test:].copy()
results_df['Volume_Predict'] = predict_vol

In [111]:
# Score of the reconstructed volume predictions against the observed volume.
r2(y_true=results_df['Volume'], y_pred=results_df['Volume_Predict'])

0.6449844542178512

In [112]:
# Sensitivity: sentiment only, predicting volume shock
feature_set = sentiment_cols

df_new = df.copy()

y_cols = ['Volume_Shock']
y = df_new[y_cols]
y_train = y[:split_val]
y_val   = y[split_val:split_test]
y_test  = y[split_test:]

x_cols = feature_set
x = df_new[x_cols]
x_train = x[:split_val]
x_val   = x[split_val:split_test]
x_test  = x[split_test:]

# Normalize the features to [0,1]; the scaler is fitted on the training rows only.
sc2 = MinMaxScaler(feature_range=(0, 1))

x_train = sc2.fit_transform(x_train)
x_val   = sc2.transform(x_val)
x_test  = sc2.transform(x_test)

model = hgbr(
    learning_rate=0.1,
    min_samples_leaf=200,
    l2_regularization=0,
    max_features=1.0,
    max_leaf_nodes=None,
    max_depth=None,
    early_stopping=True,
    scoring='r2',
    n_iter_no_change=10,
    verbose=2,
    max_iter = 2000,
    random_state=42
)

model.fit(x_train, y_train, X_val=x_val, y_val=y_val)
# This model is not written to disk.

# Printed as: validation score, test score (both on the shock target).
print(f"{model.score(x_val, y_val)}, {model.score(x_test, y_test)}")

# Predicted shock for the test rows, and the baseline for those rows.
predict = model.predict(x_test)
avg = df['baseline'].iloc[split_test:]

# convert back to Volume (shock + baseline)
predict_vol = predict + avg

# Select the four needed columns and the test rows first, then copy only that
# small frame instead of duplicating the whole feature frame; this also avoids
# assigning a new column onto a slice of another frame.
results_df = df[['date', 'ticker', 'Volume', 'baseline']].iloc[split_test:].copy()
results_df['Volume_Predict'] = predict_vol

# Score of the reconstructed volume against the observed volume.
print(r2(results_df['Volume'], results_df['Volume_Predict']))

Binning 1.621 GB of training data: 2.925 s
Binning 0.203 GB of validation data: 0.166 s
Fitting gradient boosted rounds:
[1/2000] 1 tree, 980 leaves, max depth = 73, train score: 0.02667, val score: 0.00819, in 1.450s
[2/2000] 1 tree, 981 leaves, max depth = 78, train score: 0.05242, val score: 0.01286, in 1.851s
[3/2000] 1 tree, 988 leaves, max depth = 90, train score: 0.07787, val score: 0.02019, in 1.528s
[4/2000] 1 tree, 985 leaves, max depth = 89, train score: 0.09820, val score: 0.02417, in 1.523s
[5/2000] 1 tree, 997 leaves, max depth = 105, train score: 0.11605, val score: 0.02598, in 1.484s
[6/2000] 1 tree, 984 leaves, max depth = 114, train score: 0.13549, val score: 0.02844, in 1.497s
[7/2000] 1 tree, 981 leaves, max depth = 91, train score: 0.15615, val score: 0.03072, in 1.509s
[8/2000] 1 tree, 981 leaves, max depth = 77, train score: 0.17218, val score: 0.03231, in 1.589s
[9/2000] 1 tree, 982 leaves, max depth = 102, train score: 0.18804, val score: 0.03254, in 1.493s
[10

In [114]:
# Sensitivity: time features only, predicting volume shock
feature_set = time_cols

df_new = df.copy()

y_cols = ['Volume_Shock']
y = df_new[y_cols]
y_train = y[:split_val]
y_val   = y[split_val:split_test]
y_test  = y[split_test:]

x_cols = feature_set
x = df_new[x_cols]
x_train = x[:split_val]
x_val   = x[split_val:split_test]
x_test  = x[split_test:]

# Normalize the features to [0,1]; the scaler is fitted on the training rows only.
sc2 = MinMaxScaler(feature_range=(0, 1))

x_train = sc2.fit_transform(x_train)
x_val   = sc2.transform(x_val)
x_test  = sc2.transform(x_test)

model = hgbr(
    learning_rate=0.01,
    min_samples_leaf=200,
    l2_regularization=0,
    max_features=1.0,
    max_leaf_nodes=None,
    max_depth=None,
    early_stopping=True,
    scoring='r2',
    n_iter_no_change=10,
    verbose=2,
    max_iter = 2000,
    random_state=42
)

model.fit(x_train, y_train, X_val=x_val, y_val=y_val)
# This model is not written to disk.

# Printed as: validation score, test score (both on the shock target).
print(f"{model.score(x_val, y_val)}, {model.score(x_test, y_test)}")

# Predicted shock for the test rows, and the baseline for those rows.
predict = model.predict(x_test)
avg = df['baseline'].iloc[split_test:]

# convert back to Volume (shock + baseline)
predict_vol = predict + avg

# Select the four needed columns and the test rows first, then copy only that
# small frame instead of duplicating the whole feature frame; this also avoids
# assigning a new column onto a slice of another frame.
results_df = df[['date', 'ticker', 'Volume', 'baseline']].iloc[split_test:].copy()
results_df['Volume_Predict'] = predict_vol

# Score of the reconstructed volume against the observed volume.
print(r2(results_df['Volume'], results_df['Volume_Predict']))

Binning 0.056 GB of training data: 0.109 s
Binning 0.007 GB of validation data: 0.002 s
Fitting gradient boosted rounds:
[1/2000] 1 tree, 423 leaves, max depth = 22, train score: 0.00014, val score: 0.00005, in 0.070s
[2/2000] 1 tree, 423 leaves, max depth = 22, train score: 0.00035, val score: 0.00016, in 0.057s
[3/2000] 1 tree, 423 leaves, max depth = 22, train score: 0.00055, val score: 0.00027, in 0.055s
[4/2000] 1 tree, 423 leaves, max depth = 22, train score: 0.00074, val score: 0.00037, in 0.056s
[5/2000] 1 tree, 423 leaves, max depth = 22, train score: 0.00093, val score: 0.00047, in 0.051s
[6/2000] 1 tree, 425 leaves, max depth = 22, train score: 0.00113, val score: 0.00056, in 0.054s
[7/2000] 1 tree, 427 leaves, max depth = 22, train score: 0.00131, val score: 0.00064, in 0.054s
[8/2000] 1 tree, 418 leaves, max depth = 23, train score: 0.00148, val score: 0.00073, in 0.053s
[9/2000] 1 tree, 418 leaves, max depth = 23, train score: 0.00165, val score: 0.00082, in 0.055s
[10/20