In [1]:
import pandas as pd
import yfinance as yf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from feature_engineering import *



In [2]:
# Generate X_train, y_train, X_test, y_test for lag={3, 7} and perc={3perc, 5perc}.
# For every (lag, perc) combination the cell assembles price, reddit, news and macro
# features next to the target, splits chronologically (test set = rows from 2022
# onwards), standardises the features with statistics fitted on the training rows
# only, and writes the four frames to data/model_inputs/.
perc_dict = {'5perc': 0.05, '3perc': 0.03}

# Which weighting of the reddit sentiment scores is used as features.
# Every column name below is derived from this value, so switching it works end to end.
weight_type = "both" # or "comments", "upvotes"
reddit_columns = [
    f'pos_score_weighted_{weight_type}',
    f'neg_score_weighted_{weight_type}',
    f'neu_score_weighted_{weight_type}',
    f'compound_score_weighted_{weight_type}',
]

# Final feature names written to the model-input files.
feature_names = {
    f'pos_score_weighted_{weight_type}': f'reddit_pos_{weight_type}',
    f'neg_score_weighted_{weight_type}': f'reddit_neg_{weight_type}',
    f'neu_score_weighted_{weight_type}': f'reddit_neu_{weight_type}',
    'pos_weighted': 'nyt_pos',
    'neg_weighted': 'nyt_neg',
    'neu_weighted': 'nyt_neu',
    'Adj Close': 'adj_close',
    'Quarterly GDP (Actual)': 'quarterly_gdp_actual',
    'Monthly CPI (Actual)': 'monthly_cpi_actual',
    'Monthly Short Term Interest Rates (Actual)': 'monthly_st_ir_actual',
    'Monthly Unemployment Rate (Actual)': 'monthly_unemployment_actual',
    'Quarterly GDP (Growth)': 'quarterly_gdp_growth',
    'Monthly CPI (Growth)': 'monthly_cpi_growth',
    'Monthly Short Term Interest Rates (Growth)': 'monthly_st_ir_growth',
    'Monthly Unemployment Rate (Growth)': 'monthly_unemployment_growth',
}

# ---------------------------------------------------------------------------
# Loop-invariant inputs: none of these depend on lag or perc, so they are
# downloaded / read exactly once instead of once per combination.
# ---------------------------------------------------------------------------

# feature: index price
prices_raw = yf.download("^GSPC", start="2015-12-01", end="2022-09-02")[['Adj Close']]

# feature: reddit scores
# WARNING: This line of code may take hours to run.
# reddit_posts = pd.read_excel("data/cleaned/reddit_2016_2022_cleaned_1710.xlsx")
# reddit_scores = reddit_feature_engineer(reddit_posts)
# Instead, run below code to retrieve previously obtained sentiment scores.
reddit_scores_raw = pd.read_excel("data/sentiments/reddit_2016_2022_sentiment_scores.xlsx")
reddit_scores_raw = reddit_scores_raw.set_index('date')

# feature: news scores
# WARNING: This line of code may take up to an hour to run.
# nyt_posts = pd.read_excel("data/cleaned/nyt_2016_2022_cleaned_1710.xlsx")
# spweights = pd.read_excel("data/nyt_2016_2022_cleaned_1710")
# nyt_scores = nyt_feature_engineer(nyt_posts, spweights)
# nyt_scores = nyt_scores.set_index('date')
# Instead, run below code to retrieve previously obtained sentiment scores.
nyt_scores_raw = pd.read_excel("data/sentiments/nyt_2016_2022_sentiment_scores.xlsx")
nyt_scores_raw = nyt_scores_raw.set_index('date')

# feature: macro data
macro_features = macro_feature_engineer(
    pd.read_excel("data/raw/Macro_Data_2016_to_2022.xlsx"), data_type="both"
)

for lag in [3, 7]:
    # Lagged features depend on `lag` only, so they are computed once per lag.
    # Copies are passed in case compute_lagged_values modifies its argument in place.
    prices_lagged = compute_lagged_values(prices_raw.copy(), lag, "mean")
    # Strip the time-of-day component so the price index lines up with the other frames.
    prices_lagged = prices_lagged.reset_index()
    prices_lagged['Date'] = prices_lagged['Date'].apply(lambda x: x.date())
    prices_lagged = prices_lagged.set_index('Date')
    prices_lagged.index = pd.DatetimeIndex(prices_lagged.index)

    reddit_lagged = compute_lagged_values(reddit_scores_raw.copy(), lag, "mean")[reddit_columns]
    nyt_lagged = compute_lagged_values(nyt_scores_raw.copy(), lag, "mean")

    for perc in ['5perc', '3perc']:
        # target column: encode the decision labels numerically and drop the helper price column
        target = create_target(perc_dict[perc])
        target = target.replace({'BUY':1, 'HOLD':0, 'SELL':-1})
        target = target.drop(['Adj Close'], axis=1)

        # Keep only the dates for which a target exists.
        prices = prices_lagged[prices_lagged.index.isin(target.index)]
        reddit_scores = reddit_lagged[reddit_lagged.index.isin(target.index)]
        nyt_scores = nyt_lagged[nyt_lagged.index.isin(target.index)]
        # reindex already yields exactly target.index (missing dates become NaN rows),
        # so no additional isin() filter is needed afterwards.
        macro_data = macro_features.reindex(target.index)

        # Combine features and target
        data = pd.concat([prices, reddit_scores, nyt_scores, macro_data, target], axis=1)

        # Train-test split
        X, y = data.drop(columns=['decision']), data[['decision']]
        X = X.drop(
            columns=['Unnamed: 0', 'pos_score', 'neg_score', 'neu_score',
                     f'compound_score_weighted_{weight_type}']
        )
        X = X.rename(columns=feature_names)
        # Chronological split (shuffle=False): the last rows, i.e. everything from 2022
        # onwards, form the test set.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=len(data.loc['2022':]), shuffle=False
        )

        # Fit the scaler on the training rows only and reuse it for the test rows,
        # so no test-set statistics leak into the features.
        scaler = StandardScaler()
        X_train_scaled = pd.DataFrame(
            scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
        )
        X_test_scaled = pd.DataFrame(
            scaler.transform(X_test), columns=X_test.columns, index=X_test.index
        )

        X_train_scaled.to_excel(f'data/model_inputs/x_train_{perc}_lag{lag}.xlsx')
        X_test_scaled.to_excel(f'data/model_inputs/x_test_{perc}_lag{lag}.xlsx')
        y_train.to_excel(f'data/model_inputs/y_train_{perc}_lag{lag}.xlsx')
        y_test.to_excel(f'data/model_inputs/y_test_{perc}_lag{lag}.xlsx')

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
