In [22]:

# Imports
import sys
import importlib

import pandas as pd
from textblob import TextBlob

import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

sys.path.append("..")

import ds_helpers as dh
import helper_funcs as hf
import ml_models as ml

In [6]:
importlib.reload(dh)
importlib.reload(hf)
importlib.reload(ml)

# Fetch all datasets
news_path = "/../../news_data/"
stock_path = "/../../stock_data/"

news_df, stock_meta = dh.fetch_datasets(news_path, stock_path)
stock_dfs = dh.fetch_stock_data(stock_meta, stock_path)

# Prepare datasets
stock_dfs = dh.prep_stock_data(stock_dfs)
news_df = dh.prep_news_data(news_df, stock_dfs)

# Sentiment Analysis
daily_sent = ml.sentiment_analysis(news_df)

Error with stock: AGM$A
Error with stock: CARR.V
Error with stock: UTX.V

Number of Stocks before processing: 	5881
Number of Stocks after processing: 	3511



In [None]:
big_df = ml.dataframe_union(stock_dfs, daily_sent)

              Date Ticker       Open      r_0d      r_1d      r_7d     r_30d  \
0       2012-03-13      A  31.444921  0.026160 -0.003104 -0.031035 -0.087120   
1       2012-03-13     AA  24.030001  0.031000 -0.002910 -0.029098 -0.047527   
2       2012-03-13    AAL   7.000000 -0.005714  0.018678  0.053161  0.379310   
3       2012-03-13   AAME   2.900000 -0.017241  0.010526 -0.003509  0.028070   
4       2012-03-13    AAN  26.270000  0.011420 -0.013925 -0.029356 -0.042529   
...            ...    ...        ...       ...       ...       ...       ...   
7011462 2020-02-19   ZNGA   7.100000 -0.001408  0.012694 -0.053597 -0.046544   
7011463 2020-02-19    ZNH  30.379999  0.010204 -0.028674 -0.127403 -0.335614   
7011464 2020-02-19    ZTR  11.790000  0.004241  0.009291 -0.179054 -0.430743   
7011465 2020-02-19   ZUMZ  32.490002 -0.015082  0.009063 -0.170937 -0.534687   
7011466 2020-02-19   ZYXI  10.210000  0.028404  0.045714  0.240000  0.040000   

           SPORTS   SCIENCE    COMEDY  

In [25]:
# Experiment with 30 day return
x_features = ['Open'] + list(news_df['category'].unique())

X = big_df[x_features]
y_30d = big_df['r_30d']

X_train, X_test, y_train, y_test = train_test_split(X, y_30d, test_size=0.2, random_state=42)

N_EST = 1000
LAMBDA = 0.05
MAX_DEPTH = 6
SUB_SAMP = 0.8
OBJECTIVE = "reg:squarederror"
METHOD = "hist"

model_30d = xgb.XGBRegressor(
    n_estimators = N_EST,
    learning_rate = LAMBDA,
    max_depth = MAX_DEPTH,
    subsample = SUB_SAMP,
    objective = OBJECTIVE,
    tree_method = METHOD
)

model_30d.fit(
    X_train, 
    y_train,
    eval_set=[(X_test,y_test)],
    verbose=50
)

predictions = model_30d.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, predictions))
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print("\n=== MODEL PERFORMANCE ===")
print(f"RMSE: {rmse:,.6f}")
print(f"MAE:  {mae:,.6f}")
print(f"R²:   {r2:,.6f}")

[0]	validation_0-rmse:103.12270
[50]	validation_0-rmse:109.69303
[100]	validation_0-rmse:111.31208
[150]	validation_0-rmse:112.71639
[200]	validation_0-rmse:113.13400
[250]	validation_0-rmse:114.61607
[300]	validation_0-rmse:116.61246
[350]	validation_0-rmse:115.50596
[400]	validation_0-rmse:114.56095
[450]	validation_0-rmse:116.58673
[500]	validation_0-rmse:115.04970
[550]	validation_0-rmse:114.91033
[600]	validation_0-rmse:116.23943
[650]	validation_0-rmse:114.18118
[700]	validation_0-rmse:115.57354
[750]	validation_0-rmse:113.75887
[800]	validation_0-rmse:114.38235
[850]	validation_0-rmse:117.42115
[900]	validation_0-rmse:118.59006
[950]	validation_0-rmse:116.83502
[999]	validation_0-rmse:116.81810

=== MODEL PERFORMANCE ===
RMSE: 116.818106
MAE:  1.135918
R²:   -0.282495


# XGBoost Modeling

In [1]:
"""
    At this point:
        - stock_dfs: dictionary with ticker as key, DF as value
            - Gather list of tickers being used through stock_dfs.keys()
            - stock_dfs[...]
                - Date, Open, Close, Ticker, r_0d, r_1d, r_7d, r_30d
        - daily_sentiment: DF holding sentiment scores
            - date, category (42 total), avg_sentiment, article_count
    
    The plan:
        - For each time horizon create an XGBoost Model
            - Each model gets all 3511 stock openings and the scores of that day
                - Target is the designated time horizon of that modelSo 
"""
all_rows = []
all_dates = stock_dfs['A']['Date']

cols = ['Date', 'Ticker', 'Open', 'r_0d', 'r_1d', 'r_7d', 'r_30d']
cols.extend(all_cats)

for d in all_dates:
    # Get dictionary of daily sentiments
    sent_df = daily_sentiment[daily_sentiment['date'] == d]
    daily_sent_scores = [sent_df[sent_df['category'] == cat]['avg_sentiment'].item() for cat in all_cats]
    
    # Iterate over tickers
    for ticker, df in stock_dfs.items():
        date, open, close, tick, r0, r1, r7, r30 = df[df['Date'] == d].values[0]
        
        row = [d, tick, open, r0, r1, r7, r30] # Get the date and stock ticker
        row.extend(daily_sent_scores)
        
        all_rows.append(row)

big_df = pd.DataFrame(data=all_rows, columns=cols)

NameError: name 'stock_dfs' is not defined

In [None]:
feature_cols = ['Open'] + all_cats
X = big_df[feature_cols]

y_0d = big_df['r_0d']
y_1d = big_df['r_1d']
y_7d = big_df['r_7d']
y_30d = big_df['r_30d']

In [None]:
N_EST = 1000
LAMBDA = 0.05
MAX_DEPTH = 6
EVAL_MET = "mse"

model_0d = xgb.XGBRegressor(
    n_estimators = N_EST,
    learning_rate = LAMBDA,
    max_depth= MAX_DEPTH,
    eval_metric= EVAL_MET
)

model_1d = xgb.XGBRegressor(
    n_estimators = N_EST,
    learning_rate = LAMBDA,
    max_depth= MAX_DEPTH,
    eval_metric= EVAL_MET
)

model_7d = xgb.XGBRegressor(
    n_estimators = N_EST,
    learning_rate = LAMBDA,
    max_depth= MAX_DEPTH,
    eval_metric= EVAL_MET
)

model_30d = xgb.XGBRegressor(
    n_estimators = N_EST,
    learning_rate = LAMBDA,
    max_depth= MAX_DEPTH,
    eval_metric= EVAL_MET
)