In [1]:
import pandas as pd
import numpy as np
import yfinance as yf
import itertools
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import ADASYN

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore')



In [2]:
X_train_3perc_lag3 = pd.read_excel("data/model_inputs2/x_train_3perc_lag3.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date').dropna()
X_test_3perc_lag3 = pd.read_excel("data/model_inputs2/x_test_3perc_lag3.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date')
y_train_3perc_lag3 = pd.read_excel("data/model_inputs2/y_train_3perc_lag3.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date').reindex(X_train_3perc_lag3.index)
y_test_3perc_lag3 = pd.read_excel("data/model_inputs2/y_test_3perc_lag3.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date')

X_train_3perc_lag7 = pd.read_excel("data/model_inputs2/x_train_3perc_lag7.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date').dropna()
X_test_3perc_lag7 = pd.read_excel("data/model_inputs2/x_test_3perc_lag7.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date')
y_train_3perc_lag7 = pd.read_excel("data/model_inputs2/y_train_3perc_lag7.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date').reindex(X_train_3perc_lag7.index)
y_test_3perc_lag7 = pd.read_excel("data/model_inputs2/y_test_3perc_lag7.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date')

X_train_5perc_lag3 = pd.read_excel("data/model_inputs2/x_train_5perc_lag3.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date').dropna()
X_test_5perc_lag3 = pd.read_excel("data/model_inputs2/x_test_5perc_lag3.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date')
y_train_5perc_lag3 = pd.read_excel("data/model_inputs2/y_train_5perc_lag3.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date').reindex(X_train_5perc_lag3.index)
y_test_5perc_lag3 = pd.read_excel("data/model_inputs2/y_test_5perc_lag3.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date')

X_train_5perc_lag7 = pd.read_excel("data/model_inputs2/x_train_5perc_lag7.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date').dropna()
X_test_5perc_lag7 = pd.read_excel("data/model_inputs2/x_test_5perc_lag7.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date')
y_train_5perc_lag7 = pd.read_excel("data/model_inputs2/y_train_5perc_lag7.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date').reindex(X_train_5perc_lag7.index)
y_test_5perc_lag7 = pd.read_excel("data/model_inputs2/y_test_5perc_lag7.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date')

In [3]:
included_columns = ['adj_close', 'reddit_pos_both', 'reddit_neg_both', 'reddit_neu_both', 'nyt_pos', 'nyt_neg', 'nyt_neu']
gdp = ['quarterly_gdp_actual','quarterly_gdp_growth']
cpi = ['monthly_cpi_actual','monthly_cpi_growth']
ir = ['monthly_st_ir_actual','monthly_st_ir_growth']
unemployment = ['monthly_unemployment_actual', 'monthly_unemployment_growth']
macro_combinations = list(itertools.product(gdp,cpi,ir,unemployment))
macro_combinations = [list(x) for x in macro_combinations]

In [4]:
feature_combinations = []
for x in macro_combinations:
    feature_combinations.append(list(np.append(x, included_columns)))

In [5]:
def smote(X_train, y_train):
    sm = SMOTE(sampling_strategy='not majority')
    X_train_oversampled, y_train_oversampled = sm.fit_resample(X_train, y_train)
    return X_train_oversampled, y_train_oversampled

def random_oversampler(X_train, y_train):
    oversample = RandomOverSampler(sampling_strategy='not majority')
    X_over, y_over = oversample.fit_resample(X_train, y_train)
    return X_over, y_over

def adasyn(X_train, y_train):
    ada = ADASYN(sampling_strategy = 'not majority')
    X_resampled, y_resampled = ADASYN().fit_resample(X_train, y_train)
    return X_resampled, y_resampled

In [6]:
perc_col, lag_col, oversampling_method, used_features, accuracy, precision, recall, f1, pred_count = [], [], [], [], [], [], [], [], []
for perc in ['3perc', '5perc']:
    for lag in [3, 7]:
        for oversampling in ['smote', 'random_oversampler', 'adasyn']:
            for features in feature_combinations:
                X_train = eval(f'X_train_{perc}_lag{lag}')
                X_test = eval(f'X_test_{perc}_lag{lag}')
                y_train = eval(f'y_train_{perc}_lag{lag}')
                y_test = eval(f'y_test_{perc}_lag{lag}')

                # oversampling
                if oversampling == 'smote':
                    X_train_oversampled, y_train_oversampled = smote(X_train, y_train)
                elif oversampling == 'random_oversampler':
                    X_train_oversampled, y_train_oversampled = random_oversampler(X_train, y_train)
                else:
                    X_train_oversampled, y_train_oversampled = adasyn(X_train, y_train)

                # fit and predict
                lr = LogisticRegression()
                pred = lr.fit(X_train_oversampled, y_train_oversampled).predict(X_test)
                
                # update columns
                perc_col.append(perc)
                lag_col.append(lag)
                oversampling_method.append(oversampling)
                used_features.append(features)
                accuracy.append(accuracy_score(y_test, pred))
                precision.append(precision_score(y_test, pred, average='weighted'))
                recall.append(recall_score(y_test, pred, average='weighted'))
                f1.append(f1_score(y_test, pred, average='weighted'))
                pred_count.append(dict(pd.Series(pred).value_counts().sort_index()))

base_results = pd.DataFrame({
    'model': "Logistic Regression",
    'perc_threshold': perc_col,
    'lag': lag_col,
    'oversampling': oversampling_method,
    'features': used_features,
    'accuracy': accuracy,
    'precision': precision,
    'recall': recall,
    'f1': f1,
    'pred_count': pred_count
})

In [7]:
base_results.sort_values(by="f1", ascending=False)

Unnamed: 0,model,perc_threshold,lag,oversampling,features,accuracy,precision,recall,f1,pred_count
127,Logistic Regression,5perc,3,random_oversampler,"[quarterly_gdp_growth, monthly_cpi_growth, mon...",0.678571,0.651316,0.678571,0.663966,"{-1: 1, 0: 142, 1: 25}"
124,Logistic Regression,5perc,3,random_oversampler,"[quarterly_gdp_growth, monthly_cpi_growth, mon...",0.660714,0.647467,0.660714,0.653150,"{-1: 1, 0: 139, 1: 28}"
112,Logistic Regression,5perc,3,random_oversampler,"[quarterly_gdp_actual, monthly_cpi_actual, mon...",0.654762,0.646161,0.654762,0.649503,"{-1: 1, 0: 138, 1: 29}"
125,Logistic Regression,5perc,3,random_oversampler,"[quarterly_gdp_growth, monthly_cpi_growth, mon...",0.654762,0.646161,0.654762,0.649503,"{-1: 1, 0: 138, 1: 29}"
113,Logistic Regression,5perc,3,random_oversampler,"[quarterly_gdp_actual, monthly_cpi_actual, mon...",0.648810,0.649504,0.648810,0.648116,"{-1: 1, 0: 136, 1: 31}"
...,...,...,...,...,...,...,...,...,...,...
146,Logistic Regression,5perc,7,smote,"[quarterly_gdp_actual, monthly_cpi_actual, mon...",0.220238,0.692435,0.220238,0.268720,"{0: 32, 1: 136}"
5,Logistic Regression,3perc,3,smote,"[quarterly_gdp_actual, monthly_cpi_growth, mon...",0.321429,0.571647,0.321429,0.268135,"{-1: 20, 0: 12, 1: 136}"
3,Logistic Regression,3perc,3,smote,"[quarterly_gdp_actual, monthly_cpi_actual, mon...",0.327381,0.639150,0.327381,0.263255,"{-1: 21, 0: 9, 1: 138}"
9,Logistic Regression,3perc,3,smote,"[quarterly_gdp_growth, monthly_cpi_actual, mon...",0.321429,0.536983,0.321429,0.261109,"{-1: 19, 0: 12, 1: 137}"


In [8]:
base_results.sort_values(by="f1", ascending=False).iloc[0]

model                                           Logistic Regression
perc_threshold                                                5perc
lag                                                               3
oversampling                                     random_oversampler
features          [quarterly_gdp_growth, monthly_cpi_growth, mon...
accuracy                                                   0.678571
precision                                                  0.651316
recall                                                     0.678571
f1                                                         0.663966
pred_count                                   {-1: 1, 0: 142, 1: 25}
Name: 127, dtype: object

In [9]:
base_results.sort_values(by="f1", ascending=False).iloc[0]['features']

['quarterly_gdp_growth',
 'monthly_cpi_growth',
 'monthly_st_ir_growth',
 'monthly_unemployment_growth',
 'adj_close',
 'reddit_pos_both',
 'reddit_neg_both',
 'reddit_neu_both',
 'nyt_pos',
 'nyt_neg',
 'nyt_neu']