In [29]:

# Imports
import sys
import importlib

import pandas as pd
from textblob import TextBlob

import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

sys.path.append("..")

import ds_helpers as dh
import helper_funcs as hf
import ml_models as ml

In [None]:
importlib.reload(dh)
importlib.reload(hf)
importlib.reload(ml)

# Fetch all datasets
news_path = "/../../news_data/"
stock_path = "/../../stock_data/"

news_df, stock_meta = dh.fetch_datasets(news_path, stock_path)
stock_dfs = dh.fetch_stock_data(stock_meta, stock_path)

# Prepare datasets
stock_dfs = dh.prep_stock_data(stock_dfs)
news_df = dh.prep_news_data(news_df, stock_dfs)

# Sentiment Analysis
daily_sent = ml.sentiment_analysis(news_df)

Error retrieving stock: AGM$A
Error retrieving stock: CARR.V
Error retrieving stock: UTX.V

Dropping VST: invalid 1 day return

Number of Stocks before processing: 	5881
Number of Stocks after processing: 	3284



In [64]:
importlib.reload(dh)
importlib.reload(hf)
importlib.reload(ml)

big_df = ml.dataframe_union(stock_dfs, daily_sent)

2012-01-30 00:00:00
           Date       Open      Close Ticker      r_0d      r_1d      r_7d  \
0    2012-02-01   9.000000   9.000000   ALRS  0.000000 -0.009259  0.000000   
1    2012-02-02   9.000000   8.916667   ALRS -0.009259  0.000000 -0.007477   
2    2012-02-03   8.916667   8.916667   ALRS  0.000000  0.000000 -0.016822   
3    2012-02-06   8.916667   8.916667   ALRS  0.000000  0.000000 -0.016822   
4    2012-02-07   8.916667   8.916667   ALRS  0.000000  0.000000 -0.016822   
...         ...        ...        ...    ...       ...       ...       ...   
2020 2020-02-12  21.809999  21.780001   ALRS -0.001375 -0.004591 -0.015611   
2021 2020-02-13  21.730000  21.680000   ALRS -0.002301  0.002306 -0.027675   
2022 2020-02-14  21.500000  21.730000   ALRS  0.010698 -0.001841 -0.030833   
2023 2020-02-18  21.730000  21.690001   ALRS -0.001841  0.002305 -0.045182   
2024 2020-02-19  21.740000  21.740000   ALRS  0.000000 -0.004140 -0.058878   

         r_30d  
0     0.046296  
1     0.0

IndexError: index 0 is out of bounds for axis 0 with size 0

In [50]:
print(big_df.nlargest(20, 'r_30d')[['Date','Ticker','Open','r_30d']])

              Date Ticker   Open         r_30d
1951143 2014-04-15    PJT  0.075  75891.899260
1965187 2014-04-22    PJT  0.105  75891.899260
1954654 2014-04-16    PJT  0.120  62499.001397
1958165 2014-04-17    PJT  0.120  61754.917721
1961676 2014-04-21    PJT  0.120  61754.917721
2267133 2014-08-22    PJT  0.255  32155.863346
2253089 2014-08-18    PJT  0.255  31763.706476
2256600 2014-08-19    PJT  0.255  31763.706476
2260111 2014-08-20    PJT  0.255  31763.706476
2263622 2014-08-21    PJT  0.255  31763.706476
2270644 2014-08-25    PJT  0.270  30739.739519
2274155 2014-08-26    PJT  0.270  30739.739519
2277666 2014-08-27    PJT  0.270  30739.739519
2281177 2014-08-28    PJT  0.270  30739.739519
1972209 2014-04-24    PJT  0.250  30356.160156
1975720 2014-04-25    PJT  0.250  30356.160156
1979231 2014-04-28    PJT  0.250  30356.160156
1982742 2014-04-29    PJT  0.250  29999.000000
1968698 2014-04-23    PJT  0.145  29284.720703
1986253 2014-04-30    PJT  0.250  29284.720703


In [45]:
print(big_df.columns)

Index(['Date', 'Ticker', 'Open', 'r_0d', 'r_1d', 'r_7d', 'r_30d', 'SPORTS',
       'SCIENCE', 'COMEDY', 'PARENTING', 'STYLE & BEAUTY', 'WELLNESS',
       'TRAVEL', 'HOME & LIVING', 'QUEER VOICES', 'TECHNOLOGY',
       'CULTURE & ARTS', 'ENTERTAINMENT', 'BUSINESS', 'IMPACT', 'ENVIRONMENT',
       'WEDDINGS', 'BLACK VOICES', 'FOOD & DRINK', 'DIVORCE', 'CRIME', 'MONEY',
       'POLITICS', 'RELIGION', 'ARTS', 'GREEN', 'WORLDPOST', 'STYLE', 'TASTE',
       'PARENTS', 'COLLEGE', 'HEALTHY LIVING', 'WOMEN', 'FIFTY', 'MEDIA',
       'EDUCATION', 'WEIRD NEWS', 'LATINO VOICES', 'GOOD NEWS',
       'ARTS & CULTURE', 'THE WORLDPOST', 'WORLD NEWS', 'U.S. NEWS'],
      dtype='object')


In [32]:
# Experiment with 30 day return
x_features = ['Open'] + list(news_df['category'].unique())

X = big_df[x_features]
y_30d = big_df['r_30d']

X_train, X_test, y_train, y_test = train_test_split(X, y_30d, test_size=0.2, random_state=42)

N_EST = 1000
LAMBDA = 0.05
MAX_DEPTH = 6
SUB_SAMP = 0.8
OBJECTIVE = "reg:squarederror"
METHOD = "hist"

model_30d = xgb.XGBRegressor(
    n_estimators = N_EST,
    learning_rate = LAMBDA,
    max_depth = MAX_DEPTH,
    subsample = SUB_SAMP,
    objective = OBJECTIVE,
    tree_method = METHOD
)

model_30d.fit(
    X_train, 
    y_train,
    eval_set=[(X_test,y_test)],
    verbose=50
)

predictions = model_30d.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, predictions))
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print("\n=== MODEL PERFORMANCE ===")
print(f"RMSE: {rmse:,.6f}")
print(f"MAE:  {mae:,.6f}")
print(f"R²:   {r2:,.6f}")

[0]	validation_0-rmse:103.35308
[50]	validation_0-rmse:105.49588
[100]	validation_0-rmse:107.06929
[150]	validation_0-rmse:107.74489
[200]	validation_0-rmse:107.73827
[250]	validation_0-rmse:108.15342
[300]	validation_0-rmse:108.52524
[350]	validation_0-rmse:108.64887
[400]	validation_0-rmse:108.63951
[450]	validation_0-rmse:108.57467
[500]	validation_0-rmse:108.70326
[550]	validation_0-rmse:108.84966
[600]	validation_0-rmse:109.07496
[650]	validation_0-rmse:109.09244
[700]	validation_0-rmse:109.08670
[750]	validation_0-rmse:108.82382
[800]	validation_0-rmse:109.10325
[850]	validation_0-rmse:109.03448
[900]	validation_0-rmse:109.03069
[950]	validation_0-rmse:108.49241
[999]	validation_0-rmse:108.79907

=== MODEL PERFORMANCE ===
RMSE: 108.799070
MAE:  1.161226
R²:   -0.106633
