In [1]:
# Imports
import os, sys
import pandas as pd

# Step 1: Data Preprocessing

In [32]:
"""
    Local Dataset Importing
        Originally tried to use the Kaggle API however quickly faced
        issues with rate limiting. Local downloading will speed up
        training at the cost of repeatability for others viewing project.
        
        Solution, I pull from downloaded datasets in parent folder to this
        github repository. news_data contains only the news dataset .json file,
        stock data contains a .csv with the metadata as well as 2 subfolders
        ETF and Stocks that contain their respective ticker histories
"""
news_path = os.getcwd() + "/../news_data/"
stock_path = os.getcwd() + "/../stock_data/"

news_df = pd.read_json(f'{news_path}News_Category_Dataset_v3.json', lines=True)
stock_meta = pd.read_csv(f'{stock_path}symbols_valid_meta.csv')

In [91]:
"""
    Capture all Data Frames
        Maintain a dictionary with keys as stock ticker symbols and values
        as the DataFrames captured from reading the stock's .csv
"""
stock_tickers = [x['Symbol'] for _, x in stock_meta.iterrows() if x['ETF'] == 'N'] # Remove ETFs


stock_dfs = {}
for s in stock_tickers:
    try:
        stock_dfs[s] = pd.read_csv(f"{stock_path}/stocks/{s}.csv")
    except Exception:
        print(f"Error with stock: {s}")


Error with stock: AGM$A
Error with stock: CARR.V
Error with stock: UTX.V


In [81]:
"""
    Dataset Columns
"""
print(f"News Columns: {news_df.columns.values}")
print(f"Stock Columns: {stock_dfs['A'].columns.values}")

News Columns: ['headline' 'category' 'date' 'effective_date']
Stock Columns: ['Date' 'Open' 'High' 'Low' 'Close' 'Adj Close' 'Volume']


In [82]:
"""
    Helper Functions
"""

# Return the next trading day, avoiding holidays and weekends
#   Inputs
#       cur_day      -> The current date
#       trading_days -> List of all open dates in the range
#   Returns:
#       The next available date (datetime object)
def next_trading_day(cur_day, trading_days):
    days_left = trading_days[trading_days > cur_day]
    return days_left.min() if len(days_left) else trading_days.max()

In [None]:
"""
    Prepare Stock Datasets
        Drop irrelevant columns --> High, Low, Volume, adj_close
        Drop information prior to 2012-01-28
        Create stock return time horizon features
"""
processed_stocks = {}
tickers = stock_dfs.keys()

for s in tickers:
    df = stock_dfs[s].copy()

    df['Ticker'] = s

    df['Date'] = pd.to_datetime(df['Date'])
    df = df[df['Date'] >= pd.Timestamp("2012-01-28")].reset_index(drop=True)
    df = df.drop(['High', 'Low', 'Adj Close', 'Volume'], axis=1)

    if df.shape[0] != 2057: # 2057 dates 2012-01-28 and 2020-04-01, cut unfilled stocks
        continue

    df['r_0d'] = (df['Close'] - df['Open'])/df['Open']
    df['r_1d'] = df['Close'].shift(-1).pct_change(periods=1, fill_method=None)
    df['r_7d'] = df['Close'].shift(-7).pct_change(periods=7, fill_method=None)
    df['r_30d'] = df['Close'].shift(-30).pct_change(periods=30, fill_method=None)

    processed_stocks[s] = df

print(f"Number of Stocks before processing: \t{len(stock_dfs)}")
print(f"Number of Stocks after processing: \t{len(processed_stocks)}")

stock_dfs = {k: v for k, v in processed_stocks.items()}

In [89]:
print(stock_dfs['A'])

KeyError: 'A'

In [None]:
"""
    Prepare News Dataset
        News DF starts 2022-09-23, ends 2012-01-28 --> First must reverse dataset
        Drop categories to only necessary --> Category, Headline, Date
        Shift dates to align with trading days, skipping weekends and holidays till next open day
"""
news_df = news_df.sort_values(by='date', ascending=True).reset_index(drop=True)
news_df = news_df.drop(['link', 'short_description', 'authors'], axis=1)
news_df = news_df[news_df['date'] <= pd.Timestamp("2020-04-01")].reset_index(drop=True)

trading_days = pd.to_datetime(stock_dfs['A']['Date'].unique())
news_df['effective_date'] = news_df['date'].apply(lambda d: next_trading_day(d, trading_days))