In [1]:
import pandas as pd
import yfinance as yf
from sklearn.model_selection import train_test_split

from feature_engineering import *

In [2]:
## Target column ##
target_5perc = create_target(0.05)
target_5perc = target_5perc.replace({'BUY':-1, 'HOLD':0, 'SELL':1})

target_3perc = create_target(0.03)
target_3perc = target_3perc.replace({'BUY':-1, 'HOLD':0, 'SELL':1})

target_1perc = create_target(0.01)
target_1perc = target_1perc.replace({'BUY':-1, 'HOLD':0, 'SELL':1})

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [3]:
target_3perc['decision'].value_counts(), target_5perc['decision'].value_counts(), target_1perc['decision'].value_counts()

( 0    1635
  1      29
 -1      15
 Name: decision, dtype: int64,
  0    1669
  1       5
 -1       5
 Name: decision, dtype: int64,
  0    1278
 -1     221
  1     180
 Name: decision, dtype: int64)

In [4]:
target = target_3perc.copy(deep=True)
target = target.drop(['Adj Close'], axis=1)

In [5]:
## Feature: Index Price ##
prices = yf.download("^GSPC", start="2015-12-20", end="2022-09-02")[['Adj Close']]
prices = compute_lagged_values(prices, 3, "mean")
prices = prices.reset_index()
prices['Date'] = prices['Date'].apply(lambda x: x.date())

prices = prices.set_index('Date')
prices.index = pd.DatetimeIndex(prices.index)

prices = prices[prices.index.isin(target.index)]

[*********************100%***********************]  1 of 1 completed


In [6]:
## Feature: Reddit Posts ##
# WARNING: This line of code may take up to an hour to run. 
# reddit_posts = pd.read_excel("data/cleaned/reddit_2016_2022_cleaned_1710.xlsx")
# reddit_scores = reddit_feature_engineer(reddit_posts)

# Instead, run this to directly retrieve reddit sentiment scores.
reddit_scores = pd.read_excel("data/sentiments/reddit_2016_2022_sentiment_scores.xlsx")
reddit_scores = reddit_scores.set_index('date')

# Then, compute lagged values and filter rows with target value
reddit_scores = compute_lagged_values(reddit_scores, 3, "mean")
reddit_scores = reddit_scores[reddit_scores.index.isin(target.index)]

weight_type = "both" # or "comments", "upvotes"
reddit_scores = reddit_scores[[f'pos_score_weighted_{weight_type}',f'neg_score_weighted_{weight_type}',f'neu_score_weighted_{weight_type}',f'compound_score_weighted_{weight_type}']]

In [7]:
## Feature: NYT Articles ###
# WARNING: This line of code may take up to an hour to run. 
# nyt_posts = pd.read_excel("data/cleaned/nyt_2016_2022_cleaned_1710.xlsx")
# spweights = pd.read_excel("data/nyt_2016_2022_cleaned_1710")
# nyt_scores = nyt_feature_engineer(nyt_posts, spweights)
# nyt_scores = nyt_scores.set_index('date')

# Instead, run this to directly retrieve nyt sentiment scores.
nyt_scores = pd.read_excel("data/sentiments/nyt_2016_2022_sentiment_scores.xlsx")
nyt_scores = nyt_scores.set_index('date')

# Then, compute lagged values and filter rows with target value
nyt_scores = compute_lagged_values(nyt_scores, 3, "mean")
nyt_scores = nyt_scores[nyt_scores.index.isin(target.index)]

In [8]:
## Feature: Macroeconomic Data ##
macro_data = pd.read_excel("data/raw/Macro_Data_2016_to_2022.xlsx")
macro_data = macro_feature_engineer(macro_data, normalize=False, data_type="actual") ### TO DELETE NORMALIZE CODE
macro_data = macro_data.reindex(target.index)
macro_data = macro_data[macro_data.index.isin(target.index)]

In [10]:
# Combine features and target
data = pd.concat([prices, reddit_scores, nyt_scores, macro_data, target], axis=1)

TypeError: Cannot join tz-naive with tz-aware DatetimeIndex

In [10]:
# Train-test split
X, y = data.drop(columns={'decision'}), data[['decision']]
X = X.drop(['Unnamed: 0', 'pos_score', 'neg_score','neu_score','compound_score_weighted_both'], axis = 1)
X.rename({'pos_score_weighted_both': 'reddit_pos_both', 'neg_score_weighted_both': 'reddit_neg_both', 'neu_score_weighted_both': 'reddit_neu_both', 'pos_weighted':'nyt_pos','neg_weighted':'nyt_neg','neu_weighted':'nyt_neu','Adj Close':'adj_close','Quarterly GDP (Actual)':'quarterly_gdp_actual','Monthly CPI (Actual)':'monthly_cpi_actual','Monthly Short Term Interest Rates (Actual)':'monthly_st_ir_actual','Monthly Unemployment Rate (Actual)':'monthly_unemployment_actual'}, axis = 1, inplace = True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=len(data['2022':]), shuffle=False)

In [11]:
scaler = StandardScaler() 
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
X_train_scaled = X_train_scaled.set_index(X_train.index)

X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)
X_test_scaled = X_test_scaled.set_index(X_test.index)

In [12]:
X_train_scaled.shape, X_test_scaled.shape, y_train.shape, y_test.shape

((1511, 11), (168, 11), (1511, 1), (168, 1))

In [13]:
X_train_scaled.isna().any()

adj_close                      False
reddit_pos_both                False
reddit_neg_both                False
reddit_neu_both                False
nyt_pos                        False
nyt_neg                        False
nyt_neu                        False
quarterly_gdp_actual           False
monthly_cpi_actual             False
monthly_st_ir_actual           False
monthly_unemployment_actual    False
dtype: bool

In [14]:
X_train_scaled.to_excel('data/model_inputs/x_train_3perc.xlsx')
X_test_scaled.to_excel('data/model_inputs/x_test_3perc.xlsx')
y_train.to_excel('data/model_inputs/y_train_3perc.xlsx')
y_test.to_excel('data/model_inputs/y_test_3perc.xlsx')