In [None]:
# Imports
import os
import pandas as pd
from textblob import TextBlob

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Step 1: Data Preprocessing

In [26]:
"""
    Local Dataset Importing
        The Kaggle API was tried first but quickly ran into rate limiting,
        so the datasets are read from local downloads instead. This speeds
        up training at the cost of repeatability for others viewing the project.

        Expected layout, in the parent folder of this github repository:
            news_data/  -> only the news dataset .json file
            stock_data/ -> a .csv with the metadata, plus two subfolders
                           (ETFs and stocks) holding the ticker histories
"""
# Both paths keep their trailing slash: file names are appended directly below
news_path = f"{os.getcwd()}/../news_data/"
stock_path = f"{os.getcwd()}/../stock_data/"

# The news file is read with lines=True, i.e. one JSON record per line
news_df = pd.read_json(news_path + 'News_Category_Dataset_v3.json', lines=True)
stock_meta = pd.read_csv(stock_path + 'symbols_valid_meta.csv')

In [27]:
"""
    Capture all Data Frames
        Maintain a dictionary with keys as stock ticker symbols and values
        as the DataFrames captured from reading the stock's .csv
"""
# Remove ETFs. A boolean mask selects the symbols in one vectorized step
# instead of walking the metadata row by row with iterrows().
stock_tickers = stock_meta.loc[stock_meta['ETF'] == 'N', 'Symbol'].tolist()


stock_dfs = {}
for s in stock_tickers:
    # stock_path already ends with a separator; os.path.join avoids the
    # doubled "/" that plain string formatting produced.
    csv_path = os.path.join(stock_path, "stocks", f"{s}.csv")
    try:
        stock_dfs[s] = pd.read_csv(csv_path)
    except Exception:
        # Deliberate best effort: report tickers whose history cannot be
        # loaded and carry on with the rest.
        print(f"Error with stock: {s}")


Error with stock: AGM$A
Error with stock: CARR.V
Error with stock: UTX.V


In [29]:
"""
    Dataset Columns
        Show the column names of the news frame and of one sample stock frame
"""
news_columns = news_df.columns.values
print(f"News Columns: {news_columns}")

# Ticker 'A' serves as the sample stock frame
sample_stock_columns = stock_dfs['A'].columns.values
print(f"Stock Columns: {sample_stock_columns}")

News Columns: ['link' 'headline' 'category' 'short_description' 'authors' 'date']
Stock Columns: ['Date' 'Open' 'High' 'Low' 'Close' 'Adj Close' 'Volume']


In [30]:
"""
    Helper Functions
"""

def next_trading_day(cur_day, trading_days):
    """Return the first trading day strictly after ``cur_day``.

    Weekends and holidays are skipped implicitly, because only dates that
    appear in ``trading_days`` can be returned.

    Inputs
        cur_day      -> The current date (pandas Timestamp / datetime)
        trading_days -> All open dates in the range: a DatetimeIndex, or any
                        list-like that pandas can convert to one
    Returns
        The next available date. When no trading day falls after ``cur_day``,
        the latest trading day is returned instead.
    """
    # Coerce first so a plain Python list works too: the boolean-mask
    # selection below is only defined for index/array types.
    trading_days = pd.DatetimeIndex(trading_days)
    days_left = trading_days[trading_days > cur_day]
    return days_left.min() if len(days_left) else trading_days.max()

In [31]:
"""
    Prepare Stock Datasets
        Drop irrelevant columns --> High, Low, Volume, Adj Close
        Drop information prior to 2012-01-28
        Create stock return time horizon features
            r_0d              --> same-day return, open to close
            r_1d, r_7d, r_30d --> close-to-close return over the next 1/7/30
                                  rows (rows are trading days, not calendar days)

        NOTE: the last line overwrites stock_dfs with the processed frames, so
        this cell is not re-runnable on its own; re-run the loading cell first.
"""
START_DATE = pd.Timestamp("2012-01-28")
EXPECTED_ROWS = 2057  # 2057 dates between 2012-01-28 and 2020-04-01
RETURN_HORIZONS = {'r_1d': 1, 'r_7d': 7, 'r_30d': 30}  # column name -> rows ahead

processed_stocks = {}
tickers = stock_dfs.keys()

for s in tickers:
    df = stock_dfs[s].copy()

    df['Ticker'] = s

    df['Date'] = pd.to_datetime(df['Date'])
    df = df[df['Date'] >= START_DATE].reset_index(drop=True)
    df = df.drop(['High', 'Low', 'Adj Close', 'Volume'], axis=1)

    if df.shape[0] != EXPECTED_ROWS: # cut stocks without a complete history
        continue

    df['r_0d'] = (df['Close'] - df['Open'])/df['Open']

    for col, horizon in RETURN_HORIZONS.items():
        # Forward return: Close[t + h] / Close[t] - 1.
        # pct_change first, shift second: only the LAST h rows become NaN.
        # The reverse order (shift, then pct_change) yields the same values but
        # also blanks the FIRST h rows, so dropna() below would needlessly
        # discard the first 30 trading days of every stock.
        df[col] = df['Close'].pct_change(periods=horizon, fill_method=None).shift(-horizon)

    processed_stocks[s] = df

print(f"Number of Stocks before processing: \t{len(stock_dfs)}")
print(f"Number of Stocks after processing: \t{len(processed_stocks)}")

# Rows without a full 30-row look-ahead have NaN targets and are removed here
stock_dfs = {k: v.dropna().reset_index(drop=True) for k, v in processed_stocks.items()}

Number of Stocks before processing: 	5881
Number of Stocks after processing: 	3511


In [None]:
"""
    Prepare News Dataset
        The raw news frame runs from 2022-09-23 back to 2012-01-28 --> order it oldest first
        Keep only the necessary columns --> Category, Headline, Date
        Discard articles dated after 2020-04-01
        Align dates with trading days, skipping weekends and holidays till the next open day
"""
unused_columns = ['link', 'short_description', 'authors']
news_df = (
    news_df
    .sort_values(by='date', ascending=True)
    .drop(columns=unused_columns)
)
within_range = news_df['date'] <= pd.Timestamp("2020-04-01")
news_df = news_df[within_range].reset_index(drop=True)

# Open trading days are taken from ticker 'A'
trading_days = pd.to_datetime(stock_dfs['A']['Date'].unique())
news_df['effective_date'] = news_df['date'].map(
    lambda published: next_trading_day(published, trading_days)
)

# A stock ticker is also named TECH, which causes issues down the line,
# so the news category is renamed
news_df['category'] = news_df['category'].replace({'TECH': 'TECHNOLOGY'})

# Step 2: Sentiment Analysis

In [82]:
# Score each headline with the polarity value of TextBlob's sentiment analysis
news_df['sentiment'] = news_df['headline'].map(
    lambda headline: TextBlob(headline).sentiment.polarity
)

In [83]:
# Mean headline sentiment and number of articles per (effective date, category)
sentiment_groups = news_df.groupby(['effective_date', 'category'])
daily_sentiment = sentiment_groups.agg(
    avg_sentiment=('sentiment', 'mean'),
    article_count=('headline', 'count'),
).reset_index()

In [84]:
# Every calendar day between the first and last effective date, and every category
first_day = news_df['effective_date'].min()
last_day = news_df['effective_date'].max()
all_dates = pd.date_range(first_day, last_day)
all_cats = news_df['category'].unique()

# Complete (date, category) grid
all_cats_per_date = pd.MultiIndex.from_product([all_dates, all_cats], names=['date', 'category'])

# Align the aggregated sentiment onto the grid; combinations without any
# article get a sentiment of 0 and an article count of 0
indexed_sentiment = daily_sentiment.set_index(['effective_date', 'category'])
gridded_sentiment = indexed_sentiment.reindex(all_cats_per_date)
daily_sentiment = gridded_sentiment.fillna({'avg_sentiment': 0, 'article_count': 0}).reset_index()

# Step 3: XGBoost

In [None]:
"""
    At this point:
        - stock_dfs: dictionary with ticker as key, DF as value
            - Gather list of tickers being used through stock_dfs.keys()
            - stock_dfs[...]
                - Date, Open, Close, Ticker, r_0d, r_1d, r_7d, r_30d
        - daily_sentiment: DF holding sentiment scores
            - date, category (42 total), avg_sentiment, article_count

    The plan:
        - For each time horizon create an XGBoost Model
            - Each model gets all 3511 stock openings and the sentiment scores of that day
                - Target is the designated time horizon of that model

    This cell builds big_df with one row per (trading day, ticker):
        Date, Ticker, Open, r_0d, r_1d, r_7d, r_30d, <one avg_sentiment column per category>
    Rows are ordered by date, then by ticker in stock_dfs order.
"""
all_dates = stock_dfs['A']['Date']

cols = ['Date', 'Ticker', 'Open', 'r_0d', 'r_1d', 'r_7d', 'r_30d']
cols.extend(all_cats)

# Wide sentiment table: one row per date, one avg_sentiment column per category
sentiment_wide = (
    daily_sentiment
    .pivot(index='date', columns='category', values='avg_sentiment')
    .reindex(columns=list(all_cats))
    .rename_axis(index='Date', columns=None)
    .reset_index()
)

# Fail loudly when a trading day has no sentiment row, rather than training on
# silently missing features
missing_dates = all_dates[~all_dates.isin(sentiment_wide['Date'])]
if len(missing_dates):
    raise ValueError(
        f"No sentiment rows for {len(missing_dates)} trading day(s), "
        f"first missing: {missing_dates.iloc[0]}"
    )

# Stack every ticker once and join the sentiment columns by date. This replaces
# a per-(date, ticker) boolean-mask lookup, which scanned a frame millions of times.
stock_long = pd.concat(list(stock_dfs.values()), ignore_index=True)
stock_long = stock_long[stock_long['Date'].isin(all_dates)]

big_df = (
    stock_long
    .merge(sentiment_wide, on='Date', how='left', validate='many_to_one')
    # The stable sort keeps the tickers in stock_dfs order within each date
    .sort_values('Date', kind='mergesort')
    .reset_index(drop=True)
    .loc[:, cols]
)

In [None]:
# Model inputs: the opening price plus one sentiment column per news category.
# all_cats is the array returned by Series.unique(), not a list, so it is
# converted first: `['Open'] + all_cats` would be treated as element-wise array
# addition instead of list concatenation and would not yield valid column names.
feature_cols = ['Open'] + list(all_cats)
X = big_df[feature_cols]

# One regression target per return horizon
y_0d = big_df['r_0d']
y_1d = big_df['r_1d']
y_7d = big_df['r_7d']
y_30d = big_df['r_30d']

In [None]:
# Hyperparameters shared by every horizon model
N_EST = 1000
LAMBDA = 0.05  # passed to XGBoost as the learning rate
MAX_DEPTH = 6
# "mse" is not an evaluation metric XGBoost recognises; "rmse" is the supported
# squared-error metric and ranks models identically.
EVAL_MET = "rmse"


def make_return_model():
    """Create an unfitted XGBRegressor configured with the shared hyperparameters."""
    return xgb.XGBRegressor(
        n_estimators=N_EST,
        learning_rate=LAMBDA,
        max_depth=MAX_DEPTH,
        eval_metric=EVAL_MET,
    )


# One independent model per return horizon
model_0d = make_return_model()
model_1d = make_return_model()
model_7d = make_return_model()
model_30d = make_return_model()