In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, QuantileTransformer
from sklearn.neural_network import MLPRegressor as nn
from sklearn.metrics import r2_score as r2 
import shap, datetime,warnings
# Install a global 'ignore' filter: every warning category is suppressed for
# the rest of the session, including any warnings raised by later cells.
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the merged finance dataset from its pickle file.
# The context manager guarantees the file handle is closed once loading ends.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load files from a trusted source.
with open('../../data/processed/merged_data_finance.pkl', 'rb') as pkl_file:
    df = pickle.load(pkl_file)

In [3]:
# Sentiment features: first the columns whose name contains one of the keyword
# tags, then the columns whose name starts with 'c' or 'v'.
sentiment_cols = [
    col for col in df.columns
    if 'Article Count' in col or 'Tone' in col or 'llm' in col
]
sentiment_cols += [col for col in df.columns if col.startswith(('c', 'v'))]

# Calendar / session indicator columns, listed explicitly.
time_cols = [
    'hour_of_day_10', 'hour_of_day_11', 'hour_of_day_12', 'hour_of_day_13',
    'hour_of_day_14', 'hour_of_day_15', 'hour_of_day_9',
    'is_close', 'is_open',
    'month_of_year_1', 'month_of_year_10', 'month_of_year_11', 'month_of_year_12',
    'month_of_year_2', 'month_of_year_3', 'month_of_year_4', 'month_of_year_5',
    'month_of_year_6', 'month_of_year_7', 'month_of_year_8', 'month_of_year_9',
    'day_of_week_0', 'day_of_week_1', 'day_of_week_2', 'day_of_week_3', 'day_of_week_4',
]

# Lagged non-sentiment columns that mention none of the BNO/JETS/IYT/ITA tickers.
self_finance_vars = [
    col for col in df.columns
    if 'lag' in col
    and col not in sentiment_cols
    and not any(ticker in col for ticker in ('BNO', 'JETS', 'IYT', 'ITA'))
]

# Lagged columns that mention BNO (no sentiment exclusion is applied here).
oil_vars = [col for col in df.columns if 'BNO' in col and 'lag' in col]

# Lagged non-sentiment columns that mention one of the JETS/IYT/ITA tickers.
etf_finance_vars = [
    col for col in df.columns
    if 'lag' in col
    and col not in sentiment_cols
    and any(ticker in col for ticker in ('JETS', 'IYT', 'ITA'))
]

# Combined finance group: own lags, then oil, then ETFs.
finance_vars = self_finance_vars + oil_vars + etf_finance_vars

In [4]:
# Candidate input-feature groupings, keyed by experiment name.
# Each value is the list of column names used as model inputs for that run.
feature_sets = dict(
    time_only=time_cols,
    sentiment_only=sentiment_cols,
    self_finance_only=self_finance_vars,
    finance_only=finance_vars,
    finance_time=finance_vars + time_cols,
    all=sentiment_cols + finance_vars + time_cols,
)

In [5]:
# Report the number of columns in each feature set, one line per set.
for set_name, set_columns in feature_sets.items():
    print(len(set_columns), set_name)

26 time_only
750 sentiment_only
160 self_finance_only
800 finance_only
826 finance_time
1576 all


In [6]:
# Prediction target: the frame obtained by selecting the 'Volume' column list.
y_cols = ['Volume']
y = df.loc[:, y_cols]

In [7]:
# Train / validation / test splitting by row order:
# rows before the 80% mark train, rows between 80% and 90% validate,
# and the remaining rows are held out for testing.
split_val = round(len(y) * 0.8)
split_test = round(len(y) * 0.9)

y_train, y_val, y_test = y[:split_val], y[split_val:split_test], y[split_test:]

In [8]:
# Train one MLP regressor per feature set, persist it, and report its scores.
for feature_set in feature_sets:
    print(f'Processing feature set: {feature_set}')

    # Select this run's input columns and slice them with the same row
    # boundaries used for the target.
    x_cols = feature_sets[feature_set]
    x = df[x_cols]
    x_train = x[:split_val]
    x_val   = x[split_val:split_test]
    x_test  = x[split_test:]

    # Normalize the features to [0,1]. The scaler is fitted on the training
    # rows only and then applied unchanged to the validation and test rows.
    # NOTE(review): the fitted scaler is not saved with the model below;
    # reusing a pickled model later requires re-fitting an identical scaler.
    sc2 = MinMaxScaler(feature_range=(0, 1))

    x_train = sc2.fit_transform(x_train)
    x_val   = sc2.transform(x_val)
    x_test  = sc2.transform(x_test)

    # Two hidden layers of 20 units. With early_stopping=True, training stops
    # after n_iter_no_change epochs without improvement on the internally
    # held-out validation_fraction of the training data (see scikit-learn docs).
    # NOTE(review): the target is passed unscaled — confirm this is intended,
    # since the reported training loss is very large.
    model = nn(
        hidden_layer_sizes=(20, 20),
        batch_size=200,
        early_stopping=True,
        n_iter_no_change=10,
        verbose=True,
        max_iter = 1000,
        random_state=42,
        validation_fraction=0.1,
        learning_rate_init = 0.01
    )

    model.fit(x_train, y_train)

    # Persist the fitted model. The context manager flushes and closes the
    # file deterministically instead of leaving the handle to the garbage
    # collector.
    with open(f'../../output/models/neuralnet/neuralnet_{feature_set}.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)

    # Scores on the validation slice and on the test slice, in that order.
    print(f"{model.score(x_val, y_val)}, {model.score(x_test, y_test)}")

Processing feature set: time_only
Iteration 1, loss = 303584267452.90911865
Validation score: 0.044778
Iteration 2, loss = 277699685192.38507080
Validation score: 0.059669
Iteration 3, loss = 275408917749.46673584
Validation score: 0.063362
Iteration 4, loss = 274660561926.17529297
Validation score: 0.064376
Iteration 5, loss = 274403223831.64285278
Validation score: 0.064366
Iteration 6, loss = 274288049479.90344238
Validation score: 0.064319
Iteration 7, loss = 274250595187.24658203
Validation score: 0.064228
Iteration 8, loss = 274226731111.18859863
Validation score: 0.063975
Iteration 9, loss = 274234254678.61810303
Validation score: 0.064134
Iteration 10, loss = 274237616339.04782104
Validation score: 0.064042
Iteration 11, loss = 274221236934.57812500
Validation score: 0.063992
Iteration 12, loss = 274225531837.78900146
Validation score: 0.064106
Iteration 13, loss = 274223395190.39309692
Validation score: 0.063731
Iteration 14, loss = 274235093322.76260376
Validation score: 0.06