In [1]:
import pandas as pd
import yfinance as yf
from sklearn.model_selection import train_test_split

from feature_engineering import macro_feature_engineer, reddit_feature_engineer, nyt_feature_engineer, compute_lagged_values

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
## Target column ##
# Read the labelled decisions, index them by "Date" and keep only the label column.
target = pd.read_excel("data/target_df.xlsx")
target = target.set_index("Date")[['decision']]
# Encode the string labels as integers: BUY -> -1, HOLD -> 0, SELL -> 1.
target = target.replace(to_replace={'BUY':-1, 'HOLD':0, 'SELL':1})

In [3]:
## Feature: Index Price ##
# Download the ^GSPC history for the study window and keep only the adjusted close.
prices = yf.download("^GSPC", start="2015-12-20", end="2022-09-02")
prices = prices[['Adj Close']]
# Lagged aggregate from the project helper, called with 3 and "mean"
# (presumably a 3-period mean of prior values; confirm in feature_engineering).
prices = compute_lagged_values(prices, 3, "mean")
# Keep only the dates that carry a target label.
prices = prices.loc[prices.index.isin(target.index)]

[*********************100%***********************]  1 of 1 completed


In [4]:
## Feature: Reddit Posts ##
# Full recompute from the cleaned posts (WARNING: may take up to an hour to run):
# reddit_posts = pd.read_excel("data/cleaned/reddit_2016_2022_cleaned_1710.xlsx")
# reddit_scores = reddit_feature_engineer(reddit_posts)

# Default path: load the precomputed reddit sentiment scores, indexed by 'date'.
reddit_scores = pd.read_excel("data/sentiments/reddit_2016_2022_sentiment_scores.xlsx").set_index('date')

# Compute lagged values, then keep only the rows whose date has a target value.
reddit_scores = compute_lagged_values(reddit_scores, 3, "mean")
reddit_scores = reddit_scores.loc[reddit_scores.index.isin(target.index)]

# Select the pos/neg/neu/compound columns for one weighting scheme.
weight_type = "both" # or "comments", "upvotes"
reddit_scores = reddit_scores[
    [f'{kind}_score_weighted_{weight_type}' for kind in ('pos', 'neg', 'neu', 'compound')]
]

In [5]:
## Feature: NYT Articles ###
# Full recompute from the cleaned articles (WARNING: may take up to an hour to run):
# nyt_posts = pd.read_excel("data/cleaned/nyt_2016_2022_cleaned_1710.xlsx")
# spweights = pd.read_excel("data/S&P500 Top 30.xlsx")
# nyt_scores = nyt_feature_engineer(nyt_posts, spweights)

# Default path: load the precomputed nyt sentiment scores, indexed by 'date'.
nyt_scores = pd.read_excel("data/sentiments/nyt_2016_2022_sentiment_scores.xlsx").set_index('date')

# Compute lagged values, then keep only the rows whose date has a target value.
nyt_scores = compute_lagged_values(nyt_scores, 3, "mean")
nyt_scores = nyt_scores.loc[nyt_scores.index.isin(target.index)]

In [5]:
## Feature: NYT Articles (optional full recompute) ##
# WARNING: rebuilding the sentiment scores from the cleaned articles may take up to an hour.
# Run unconditionally, this cell overwrote the lagged, target-filtered `nyt_scores`
# loaded from the precomputed workbook with raw scores on every Restart & Run All,
# so the recompute is opt-in and applies the same post-processing as the cached path.
RECOMPUTE_NYT_SCORES = False  # set to True to rebuild the scores from scratch

if RECOMPUTE_NYT_SCORES:
    nyt_posts = pd.read_excel("data/cleaned/nyt_2016_2022_cleaned_1710.xlsx")
    spweights = pd.read_excel("data/S&P500 Top 30.xlsx")
    nyt_scores = nyt_feature_engineer(nyt_posts, spweights)
    # NOTE(review): the precomputed workbook exposes a 'date' column that is used as the
    # index; whether nyt_feature_engineer returns it as a column or as the index is not
    # visible here, so both layouts are accepted. Confirm against feature_engineering.
    if 'date' in nyt_scores.columns:
        nyt_scores = nyt_scores.set_index('date')
    # Same lagging and target-date filtering as the cached-scores path.
    nyt_scores = compute_lagged_values(nyt_scores, 3, "mean")
    nyt_scores = nyt_scores[nyt_scores.index.isin(target.index)]

In [6]:
## Feature: Macroeconomic Data ##
# Load the raw macro workbook and run the project feature engineering on it
# (unnormalised, data_type="actual").
macro_data = macro_feature_engineer(
    pd.read_excel("data/raw/Macro_Data_2016_to_2022.xlsx"),
    normalize=False,
    data_type="actual",
)
# Align rows to the target dates; dates missing from the macro data become NaN rows.
macro_data = macro_data.reindex(target.index)
# After the reindex every label is already in target.index, so this filter keeps all rows.
macro_data = macro_data.loc[macro_data.index.isin(target.index)]

In [7]:
# Combine features and target
# Column-wise concatenation aligned on the index; the label frame goes last.
data = pd.concat(
    [
        prices,
        reddit_scores,
        nyt_scores,
        macro_data,
        target,
    ],
    axis="columns",
)

In [8]:
# Train-test split
# Separate the label column from the feature columns.
X, y = data.drop(columns=['decision']), data[['decision']]

# Drop the columns that are not used as model inputs. The reddit compound column name
# follows `weight_type` (set in the Reddit cell) instead of a hard-coded "both", so
# changing the weighting scheme no longer raises a KeyError here.
# NOTE(review): 'pos_score'/'neg_score'/'neu_score' are presumably the unweighted NYT
# scores; confirm against the nyt sentiment workbook.
X = X.drop(columns=['pos_score', 'neg_score', 'neu_score', f'compound_score_weighted_{weight_type}'])

# Give every feature a short snake_case name. The reddit keys are derived from
# `weight_type`; with the default "both" the resulting names are unchanged.
X = X.rename(columns={
    f'pos_score_weighted_{weight_type}': f'reddit_pos_{weight_type}',
    f'neg_score_weighted_{weight_type}': f'reddit_neg_{weight_type}',
    f'neu_score_weighted_{weight_type}': f'reddit_neu_{weight_type}',
    'pos_weighted': 'nyt_pos',
    'neg_weighted': 'nyt_neg',
    'neu_weighted': 'nyt_neu',
    'Adj Close': 'adj_close',
    'Quarterly GDP (Actual)': 'quarterly_gdp_actual',
    'Monthly CPI (Actual)': 'monthly_cpi_actual',
    'Monthly Short Term Interest Rates (Actual)': 'monthly_st_ir_actual',
    'Monthly Unemployment Rate (Actual)': 'monthly_unemployment_actual',
})

# Chronological split: the test set is every row dated 2022 onwards, and shuffle=False
# keeps those rows at the end. Assumes `data` has a sorted DatetimeIndex.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=len(data.loc['2022':]), shuffle=False
)

In [9]:
# Show the (rows, columns) shape of each split as a quick sanity check.
tuple(split.shape for split in (X_train, X_test, y_train, y_test))

((1511, 21), (168, 21), (1511, 1), (168, 1))

In [None]:
# Write each split to its own Excel workbook under data/model_inputs/.
for _path, _frame in (
    ('data/model_inputs/x_train.xlsx', X_train),
    ('data/model_inputs/x_test.xlsx', X_test),
    ('data/model_inputs/y_train.xlsx', y_train),
    ('data/model_inputs/y_test.xlsx', y_test),
):
    _frame.to_excel(_path)