In [1]:
import pandas as pd
import numpy as np
import yfinance as yf
import itertools
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import ADASYN
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from feature_engineering import *
import warnings
warnings.filterwarnings('ignore')



Import data

In [2]:
X_train_3perc_lag3 = pd.read_excel("data/model_inputs/x_train_3perc_lag3.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date').dropna()
X_test_3perc_lag3 = pd.read_excel("data/model_inputs/x_test_3perc_lag3.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date')
y_train_3perc_lag3 = pd.read_excel("data/model_inputs/y_train_3perc_lag3.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date').reindex(X_train_3perc_lag3.index)
y_test_3perc_lag3 = pd.read_excel("data/model_inputs/y_test_3perc_lag3.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date')

X_train_3perc_lag7 = pd.read_excel("data/model_inputs/x_train_3perc_lag7.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date').dropna()
X_test_3perc_lag7 = pd.read_excel("data/model_inputs/x_test_3perc_lag7.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date')
y_train_3perc_lag7 = pd.read_excel("data/model_inputs/y_train_3perc_lag7.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date').reindex(X_train_3perc_lag7.index)
y_test_3perc_lag7 = pd.read_excel("data/model_inputs/y_test_3perc_lag7.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date')

X_train_5perc_lag3 = pd.read_excel("data/model_inputs/x_train_5perc_lag3.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date').dropna()
X_test_5perc_lag3 = pd.read_excel("data/model_inputs/x_test_5perc_lag3.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date')
y_train_5perc_lag3 = pd.read_excel("data/model_inputs/y_train_5perc_lag3.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date').reindex(X_train_5perc_lag3.index)
y_test_5perc_lag3 = pd.read_excel("data/model_inputs/y_test_5perc_lag3.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date')

X_train_5perc_lag7 = pd.read_excel("data/model_inputs/x_train_5perc_lag7.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date').dropna()
X_test_5perc_lag7 = pd.read_excel("data/model_inputs/x_test_5perc_lag7.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date')
y_train_5perc_lag7 = pd.read_excel("data/model_inputs/y_train_5perc_lag7.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date').reindex(X_train_5perc_lag7.index)
y_test_5perc_lag7 = pd.read_excel("data/model_inputs/y_test_5perc_lag7.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date')

Generate config and feature combinations to test

In [3]:
included_columns = ['adj_close', 'reddit_pos_both', 'reddit_neg_both', 'reddit_neu_both', 'nyt_pos', 'nyt_neg', 'nyt_neu']
gdp = ['quarterly_gdp_actual','quarterly_gdp_growth']
cpi = ['monthly_cpi_actual','monthly_cpi_growth']
ir = ['monthly_st_ir_actual','monthly_st_ir_growth']
unemployment = ['monthly_unemployment_actual', 'monthly_unemployment_growth']
macro_combinations = list(itertools.product(gdp,cpi,ir,unemployment))
macro_combinations = [list(x) for x in macro_combinations]

In [4]:
feature_combinations = []
for x in macro_combinations:
    feature_combinations.append(list(np.append(x, included_columns)))

In [5]:
def smote(X_train, y_train):
    sm = SMOTE(sampling_strategy='not majority')
    X_train_oversampled, y_train_oversampled = sm.fit_resample(X_train, y_train)
    return X_train_oversampled, y_train_oversampled

def random_oversampler(X_train, y_train):
    oversample = RandomOverSampler(sampling_strategy='not majority')
    X_over, y_over = oversample.fit_resample(X_train, y_train)
    return X_over, y_over

def adasyn(X_train, y_train):
    ada = ADASYN(sampling_strategy = 'not majority')
    X_resampled, y_resampled = ADASYN().fit_resample(X_train, y_train)
    return X_resampled, y_resampled

Test configs and features

In [6]:
perc_col, lag_col, oversampling_method, used_features, accuracy, precision, recall, f1, pred_count = [], [], [], [], [], [], [], [], []
for perc in ['3perc', '5perc']:
    for lag in [3, 7]:
        for oversampling in ['smote', 'random_oversampler', 'adasyn']:
            for features in feature_combinations:
                X_train = eval(f'X_train_{perc}_lag{lag}')
                X_test = eval(f'X_test_{perc}_lag{lag}')
                y_train = eval(f'y_train_{perc}_lag{lag}')
                y_test = eval(f'y_test_{perc}_lag{lag}')

                # oversampling
                if oversampling == 'smote':
                    X_train_oversampled, y_train_oversampled = smote(X_train, y_train)
                elif oversampling == 'random_oversampler':
                    X_train_oversampled, y_train_oversampled = random_oversampler(X_train, y_train)
                else:
                    X_train_oversampled, y_train_oversampled = adasyn(X_train, y_train)

                # fit and predict
                lr = LogisticRegression()
                pred = lr.fit(X_train_oversampled, y_train_oversampled).predict(X_test)
                
                # update columns
                perc_col.append(perc)
                lag_col.append(lag)
                oversampling_method.append(oversampling)
                used_features.append(features)
                accuracy.append(accuracy_score(y_test, pred))
                precision.append(precision_score(y_test, pred, average='weighted'))
                recall.append(recall_score(y_test, pred, average='weighted'))
                f1.append(f1_score(y_test, pred, average='weighted'))
                pred_count.append(dict(pd.Series(pred).value_counts().sort_index()))

base_results = pd.DataFrame({
    'model': "Logistic Regression",
    'perc_threshold': perc_col,
    'lag': lag_col,
    'oversampling': oversampling_method,
    'features': used_features,
    'accuracy': accuracy,
    'precision': precision,
    'recall': recall,
    'f1': f1,
    'pred_count': pred_count
})

Results

In [7]:
base_results.sort_values(by="f1", ascending=False)

Unnamed: 0,model,perc_threshold,lag,oversampling,features,accuracy,precision,recall,f1,pred_count
127,Logistic Regression,5perc,3,random_oversampler,"[quarterly_gdp_growth, monthly_cpi_growth, mon...",0.666667,0.644260,0.666667,0.654514,"{-1: 1, 0: 141, 1: 26}"
114,Logistic Regression,5perc,3,random_oversampler,"[quarterly_gdp_actual, monthly_cpi_actual, mon...",0.654762,0.650794,0.654762,0.651786,"{-1: 1, 0: 137, 1: 30}"
125,Logistic Regression,5perc,3,random_oversampler,"[quarterly_gdp_growth, monthly_cpi_growth, mon...",0.654762,0.646161,0.654762,0.649503,"{-1: 1, 0: 138, 1: 29}"
126,Logistic Regression,5perc,3,random_oversampler,"[quarterly_gdp_growth, monthly_cpi_growth, mon...",0.654762,0.646092,0.654762,0.649426,"{0: 138, 1: 30}"
113,Logistic Regression,5perc,3,random_oversampler,"[quarterly_gdp_actual, monthly_cpi_actual, mon...",0.636905,0.651629,0.636905,0.642945,"{0: 133, 1: 35}"
...,...,...,...,...,...,...,...,...,...,...
70,Logistic Regression,3perc,7,random_oversampler,"[quarterly_gdp_actual, monthly_cpi_growth, mon...",0.333333,0.677112,0.333333,0.269107,"{-1: 12, 0: 11, 1: 145}"
15,Logistic Regression,3perc,3,smote,"[quarterly_gdp_growth, monthly_cpi_growth, mon...",0.327381,0.599741,0.327381,0.263626,"{-1: 20, 0: 10, 1: 138}"
12,Logistic Regression,3perc,3,smote,"[quarterly_gdp_growth, monthly_cpi_growth, mon...",0.321429,0.524772,0.321429,0.263370,"{-1: 23, 0: 12, 1: 133}"
0,Logistic Regression,3perc,3,smote,"[quarterly_gdp_actual, monthly_cpi_actual, mon...",0.321429,0.512057,0.321429,0.261338,"{-1: 26, 0: 12, 1: 130}"


In [8]:
base_results.sort_values(by="f1", ascending=False).iloc[0]

model                                           Logistic Regression
perc_threshold                                                5perc
lag                                                               3
oversampling                                     random_oversampler
features          [quarterly_gdp_growth, monthly_cpi_growth, mon...
accuracy                                                   0.666667
precision                                                   0.64426
recall                                                     0.666667
f1                                                         0.654514
pred_count                                   {-1: 1, 0: 141, 1: 26}
Name: 127, dtype: object

In [9]:
base_results.sort_values(by="f1", ascending=False).iloc[0]['features']

['quarterly_gdp_growth',
 'monthly_cpi_growth',
 'monthly_st_ir_growth',
 'monthly_unemployment_growth',
 'adj_close',
 'reddit_pos_both',
 'reddit_neg_both',
 'reddit_neu_both',
 'nyt_pos',
 'nyt_neg',
 'nyt_neu']

Generate X_train, y_train, X_test, y_test based on best config and features

In [10]:
best_perc, best_lag = 0.05, 3

# target column
target = create_target(best_perc)
target = target.replace({'BUY':1, 'HOLD':0, 'SELL':-1})
target = target.drop(['Adj Close'], axis=1)

# feature: index price
prices = yf.download("^GSPC", start="2015-12-01", end="2022-09-02")[['Adj Close']]
prices = compute_lagged_values(prices, best_lag, "mean")
prices = prices.reset_index()
prices['Date'] = prices['Date'].apply(lambda x: x.date())
prices = prices.set_index('Date')
prices.index = pd.DatetimeIndex(prices.index)
prices = prices[prices.index.isin(target.index)]

# feature: reddit scores
reddit_scores = pd.read_excel("data/sentiments/reddit_2016_2022_sentiment_scores.xlsx")
reddit_scores = reddit_scores.set_index('date')
reddit_scores = compute_lagged_values(reddit_scores, best_lag, "mean")
reddit_scores = reddit_scores[reddit_scores.index.isin(target.index)]
weight_type = "both" # or "comments", "upvotes"
reddit_scores = reddit_scores[[f'pos_score_weighted_{weight_type}',f'neg_score_weighted_{weight_type}',f'neu_score_weighted_{weight_type}',f'compound_score_weighted_{weight_type}']]

# feature: news scores
nyt_scores = pd.read_excel("data/sentiments/nyt_2016_2022_sentiment_scores.xlsx")
nyt_scores = nyt_scores.set_index('date')
nyt_scores = compute_lagged_values(nyt_scores, best_lag, "mean")
nyt_scores = nyt_scores[nyt_scores.index.isin(target.index)]

# feature: macro data
macro_data = pd.read_excel("data/raw/Macro_Data_2016_to_2022.xlsx")
macro_data = macro_feature_engineer(macro_data, data_type="growth") # best results uses all growth for macro features
macro_data = macro_data.reindex(target.index)
macro_data = macro_data[macro_data.index.isin(target.index)]

# Combine features and target
data = pd.concat([prices, reddit_scores, nyt_scores, macro_data, target], axis=1)

# Train-test split
X, y = data.drop(columns={'decision'}), data[['decision']]
X = X.drop(['Unnamed: 0', 'pos_score', 'neg_score','neu_score','compound_score_weighted_both'], axis = 1)
X.rename({'pos_score_weighted_both': 'reddit_pos_both', 'neg_score_weighted_both': 'reddit_neg_both', 'neu_score_weighted_both': 'reddit_neu_both', 'pos_weighted':'nyt_pos','neg_weighted':'nyt_neg','neu_weighted':'nyt_neu','Adj Close':'adj_close','Quarterly GDP (Growth)':'quarterly_gdp_growth','Monthly CPI (Growth)':'monthly_cpi_growth','Monthly Short Term Interest Rates (Growth)':'monthly_st_ir_growth','Monthly Unemployment Rate (Growth)':'monthly_unemployment_growth'}, axis = 1, inplace = True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=len(data['2022':]), shuffle=False)

scaler = StandardScaler() 
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
X_train_scaled = X_train_scaled.set_index(X_train.index)

X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)
X_test_scaled = X_test_scaled.set_index(X_test.index)

X_train_scaled.to_excel(f'data/model_inputs/X_train.xlsx')
X_test_scaled.to_excel(f'data/model_inputs/X_test.xlsx')
y_train.to_excel(f'data/model_inputs/y_train.xlsx')
y_test.to_excel(f'data/model_inputs/y_test.xlsx')

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
