In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, QuantileTransformer
from sklearn.ensemble import HistGradientBoostingRegressor as hgbr
from sklearn.metrics import r2_score as r2 
import shap, datetime,warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pickled object stored at the path below into `df`.
# A context manager is used so the file handle is closed as soon as the load
# finishes instead of lingering until garbage collection.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load artefacts that come from a trusted source.
with open('../../data/processed/merged_data_finance.pkl', 'rb') as pkl_file:
    df = pickle.load(pkl_file)

In [3]:
# Column groups are derived from substrings / prefixes of the column names.
# Every list keeps the column order of `df.columns`.
_all_columns = list(df.columns)


def _mentions_any(name, needles):
    """Return True when at least one of `needles` is a substring of `name`."""
    return any(needle in name for needle in needles)


# Columns whose name starts with 'c' or 'v' are appended after the
# substring matches; a column matching both rules therefore appears twice.
_prefix_matches = [name for name in _all_columns if name.startswith(('c', 'v'))]

sentiment_cols = (
    [name for name in _all_columns if _mentions_any(name, ('Article Count', 'Tone', 'llm'))]
    + _prefix_matches
)
sentiment_cols_no_llm = (
    [name for name in _all_columns if _mentions_any(name, ('Article Count', 'Tone'))]
    + _prefix_matches
)

# Calendar / session dummy columns, listed explicitly.
time_cols = [
    'hour_of_day_10', 'hour_of_day_11', 'hour_of_day_12', 'hour_of_day_13',
    'hour_of_day_14', 'hour_of_day_15', 'hour_of_day_9',
    'is_close', 'is_open',
    'month_of_year_1', 'month_of_year_10', 'month_of_year_11', 'month_of_year_12',
    'month_of_year_2', 'month_of_year_3', 'month_of_year_4', 'month_of_year_5',
    'month_of_year_6', 'month_of_year_7', 'month_of_year_8', 'month_of_year_9',
    'day_of_week_0', 'day_of_week_1', 'day_of_week_2', 'day_of_week_3', 'day_of_week_4',
]

# Lagged columns, split by which ticker substring (if any) the name contains.
_etf_tickers = ('JETS', 'IYT', 'ITA')
_sentiment_lookup = set(sentiment_cols)
_lagged = [name for name in _all_columns if 'lag' in name]
_lagged_non_sentiment = [name for name in _lagged if name not in _sentiment_lookup]

self_finance_vars = [
    name for name in _lagged_non_sentiment
    if 'BNO' not in name and not _mentions_any(name, _etf_tickers)
]
# Unlike the other two groups, the BNO group is not filtered against sentiment_cols.
oil_vars = [name for name in _lagged if 'BNO' in name]
etf_finance_vars = [
    name for name in _lagged_non_sentiment if _mentions_any(name, _etf_tickers)
]
finance_vars = self_finance_vars + oil_vars + etf_finance_vars

In [4]:
# Candidate feature sets, keyed by the label used in the saved model filename.
# Insertion order is the order in which the sets are later iterated.
_finance_and_time = finance_vars + time_cols

feature_sets = dict([
    ('time_only', time_cols),
    ('sentiment_only', sentiment_cols),
    ('self_finance_only', self_finance_vars),
    ('finance_only', finance_vars),
    ('finance_time', _finance_and_time),
    ('all', sentiment_cols + _finance_and_time),
])

# Same composition as the 'all' set, but built from the sentiment list
# that leaves out the 'llm' columns.
all_no_llm = sentiment_cols_no_llm + _finance_and_time

In [5]:
# Regression target. Selecting with a list keeps `y` as a one-column
# DataFrame rather than a Series.
y_cols = ['Volume']
y = df.loc[:, y_cols]

In [6]:
# Contiguous positional split of the target rows:
# first 80% -> train, next 10% -> validation, final 10% -> test.
_n_rows = len(y)
split_val = round(0.8 * _n_rows)
split_test = round(0.9 * _n_rows)

y_train, y_val, y_test = (
    y[:split_val],
    y[split_val:split_test],
    y[split_test:],
)

In [14]:
# Fit one HistGradientBoostingRegressor per entry of `feature_sets`, pickle
# each fitted model, and print its validation and test scores.

# Hyperparameters are identical for every feature set, so they are defined
# once outside the loop.
hgbr_params = dict(
    learning_rate=0.01,
    min_samples_leaf=200,
    l2_regularization=0,
    max_features=1.0,
    max_leaf_nodes=None,
    max_depth=None,
    early_stopping=True,
    scoring='r2',
    n_iter_no_change=10,
    verbose=2,
    max_iter=2000,
    random_state=42,
)

for feature_set, x_cols in feature_sets.items():
    print(f'Processing feature set: {feature_set}')

    # Same positional split boundaries as the target.
    x = df[x_cols]
    x_train = x[:split_val]
    x_val   = x[split_val:split_test]
    x_test  = x[split_test:]

    # Scale with min/max learned on the training slice only; validation and
    # test rows can therefore fall outside [0, 1].
    sc2 = MinMaxScaler(feature_range=(0, 1))

    x_train = sc2.fit_transform(x_train)
    x_val   = sc2.transform(x_val)
    x_test  = sc2.transform(x_test)

    model = hgbr(**hgbr_params)

    # An explicit validation split is handed to fit().
    # NOTE(review): the X_val / y_val keywords appear to need scikit-learn >= 1.7 - confirm.
    model.fit(x_train, y_train, X_val=x_val, y_val=y_val)

    # Write inside a context manager so the handle is flushed and closed
    # before the next iteration (the bare open() previously leaked it).
    # NOTE(review): only the model is saved; the fitted scaler `sc2` is not.
    with open(f'../../output/models/lightgbm/lightgbm_{feature_set}.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)

    print(f"{model.score(x_val, y_val)}, {model.score(x_test, y_test)}")

Processing feature set: time_only
Binning 0.056 GB of training data: 0.081 s
Binning 0.007 GB of validation data: 0.002 s
Fitting gradient boosted rounds:
[1/2000] 1 tree, 402 leaves, max depth = 22, train score: 0.00165, val score: 0.00096, in 0.051s
[2/2000] 1 tree, 404 leaves, max depth = 22, train score: 0.00330, val score: 0.00304, in 0.058s
[3/2000] 1 tree, 404 leaves, max depth = 22, train score: 0.00493, val score: 0.00508, in 0.058s
[4/2000] 1 tree, 404 leaves, max depth = 22, train score: 0.00652, val score: 0.00708, in 0.052s
[5/2000] 1 tree, 404 leaves, max depth = 22, train score: 0.00809, val score: 0.00904, in 0.051s
[6/2000] 1 tree, 404 leaves, max depth = 22, train score: 0.00962, val score: 0.01097, in 0.052s
[7/2000] 1 tree, 404 leaves, max depth = 22, train score: 0.01113, val score: 0.01286, in 0.050s
[8/2000] 1 tree, 404 leaves, max depth = 22, train score: 0.01260, val score: 0.01471, in 0.054s
[9/2000] 1 tree, 404 leaves, max depth = 22, train score: 0.01405, va

In [7]:
# Display the column names in the no-LLM feature list (the full list is echoed).
all_no_llm

['Article Count_cum04_lag01',
 'Article Count_cum16_lag01',
 'Article Count_cum48_lag01',
 'Article Count_cum96_lag01',
 'Article Count_lag01',
 'Tone_cum04_lag01',
 'Tone_cum16_lag01',
 'Tone_cum48_lag01',
 'Tone_cum96_lag01',
 'Tone_lag01',
 'c16.60; WORDCOUNT; finance_cum04_lag01',
 'c16.60; WORDCOUNT; finance_cum16_lag01',
 'c16.60; WORDCOUNT; finance_cum48_lag01',
 'c16.60; WORDCOUNT; finance_cum96_lag01',
 'c16.60; WORDCOUNT; finance_lag01',
 'c18.121; WORDCOUNT; UNSAFE_WORK_ENVIRONMENT_cum04_lag01',
 'c18.121; WORDCOUNT; UNSAFE_WORK_ENVIRONMENT_cum16_lag01',
 'c18.121; WORDCOUNT; UNSAFE_WORK_ENVIRONMENT_cum48_lag01',
 'c18.121; WORDCOUNT; UNSAFE_WORK_ENVIRONMENT_cum96_lag01',
 'c18.121; WORDCOUNT; UNSAFE_WORK_ENVIRONMENT_lag01',
 'c18.137; WORDCOUNT; TRIAL_cum04_lag01',
 'c18.137; WORDCOUNT; TRIAL_cum16_lag01',
 'c18.137; WORDCOUNT; TRIAL_cum48_lag01',
 'c18.137; WORDCOUNT; TRIAL_cum96_lag01',
 'c18.137; WORDCOUNT; TRIAL_lag01',
 'c18.154; WORDCOUNT; ECON_MONOPOLY_cum04_lag01',


In [8]:
# Number of entries in the no-LLM feature list.
len(all_no_llm)

1416

In [9]:
# Train, save and score a single model on the `all_no_llm` feature list,
# using the same split boundaries, scaling and hyperparameters as the
# per-feature-set runs.
# NOTE: `feature_set` is bound to the list of column names here, not to a label.
feature_set = all_no_llm

x_cols = feature_set
x = df[x_cols]
x_train = x[:split_val]
x_val   = x[split_val:split_test]
x_test  = x[split_test:]

# Scale with min/max learned on the training slice only; validation and
# test rows can therefore fall outside [0, 1].
sc2 = MinMaxScaler(feature_range=(0, 1))

x_train = sc2.fit_transform(x_train)
x_val   = sc2.transform(x_val)
x_test  = sc2.transform(x_test)

model = hgbr(
    learning_rate=0.01,
    min_samples_leaf=200,
    l2_regularization=0,
    max_features=1.0,
    max_leaf_nodes=None,
    max_depth=None,
    early_stopping=True,
    scoring='r2',
    n_iter_no_change=10,
    verbose=2,
    max_iter=2000,
    random_state=42
)

# An explicit validation split is handed to fit().
# NOTE(review): the X_val / y_val keywords appear to need scikit-learn >= 1.7 - confirm.
model.fit(x_train, y_train, X_val=x_val, y_val=y_val)

# Write inside a context manager so the handle is flushed and closed
# deterministically (the bare open() previously leaked it).
# NOTE(review): only the model is saved; the fitted scaler `sc2` is not.
with open('../../output/models/lightgbm/lightgbm_all_no_llm.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

print(f"{model.score(x_val, y_val)}, {model.score(x_test, y_test)}")

Binning 3.061 GB of training data: 7.331 s
Binning 0.383 GB of validation data: 0.348 s
Fitting gradient boosted rounds:
[1/2000] 1 tree, 1034 leaves, max depth = 21, train score: 0.01567, val score: 0.01307, in 4.076s
[2/2000] 1 tree, 1033 leaves, max depth = 21, train score: 0.03103, val score: 0.02709, in 4.066s
[3/2000] 1 tree, 1043 leaves, max depth = 20, train score: 0.04608, val score: 0.04083, in 4.235s
[4/2000] 1 tree, 1036 leaves, max depth = 21, train score: 0.06080, val score: 0.05422, in 4.592s
[5/2000] 1 tree, 1046 leaves, max depth = 21, train score: 0.07530, val score: 0.06736, in 4.273s
[6/2000] 1 tree, 1039 leaves, max depth = 23, train score: 0.08933, val score: 0.08002, in 4.231s
[7/2000] 1 tree, 1027 leaves, max depth = 21, train score: 0.10320, val score: 0.09273, in 4.124s
[8/2000] 1 tree, 1042 leaves, max depth = 21, train score: 0.11680, val score: 0.10488, in 4.176s
[9/2000] 1 tree, 1021 leaves, max depth = 21, train score: 0.13009, val score: 0.11690, in 4.14