In [1]:
import pandas as pd
import numpy as np
import yfinance as yf
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import RandomOverSampler
from scipy.stats import loguniform, uniform, randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore')



Import data

In [2]:
X_train = pd.read_excel("data/model_inputs/X_train.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date')
X_test = pd.read_excel("data/model_inputs/X_test.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date')
y_train = pd.read_excel("data/model_inputs/y_train.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date')
y_test = pd.read_excel("data/model_inputs/y_test.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date')

Tuning

In [4]:
def random_oversampler(X_train, y_train):
    oversample = RandomOverSampler(sampling_strategy='not majority')
    X_over, y_over = oversample.fit_resample(X_train, y_train)
    return X_over, y_over
    
nb_distribution = {'alpha':loguniform(0.001, 100)}

In [5]:
param, accuracy, precision, recall, f1 = [], [], [], [], []

# best oversampling: random_oversampling
X_train_oversampled, y_train_oversampled = random_oversampler(X_train, y_train)

# scaling for Naive Bayes
scaler = MinMaxScaler()
X_train_oversampled = scaler.fit_transform(X_train_oversampled)
X_test = pd.DataFrame(data=scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

# tuning
nb = MultinomialNB()
nb_clf = RandomizedSearchCV(nb, nb_distribution, n_iter=200, scoring=['accuracy', 'recall_weighted', 'precision_weighted', 'f1_weighted'], refit='f1_weighted', random_state=42)
nb_search = nb_clf.fit(X_train_oversampled, y_train_oversampled)

# update columns
param.append(nb_search.cv_results_['params'])
accuracy.append(nb_search.cv_results_['mean_test_accuracy'])
precision.append(nb_search.cv_results_['mean_test_precision_weighted'])
recall.append(nb_search.cv_results_['mean_test_recall_weighted'])
f1.append(nb_search.cv_results_['mean_test_f1_weighted'])

# update columns
param.append(nb_search.cv_results_['params'])
accuracy.append(nb_search.cv_results_['mean_test_accuracy'])
precision.append(nb_search.cv_results_['mean_test_precision_weighted'])
recall.append(nb_search.cv_results_['mean_test_recall_weighted'])
f1.append(nb_search.cv_results_['mean_test_f1_weighted'])

In [6]:
results = pd.DataFrame({
    'model': "Naive Bayes",
    'parameters': np.array(param).flatten(),
    'accuracy': np.array(accuracy).flatten(),
    'precision': np.array(precision).flatten(),
    'recall': np.array(recall).flatten(),
    'f1': np.array(f1).flatten(),
}).sort_values(by="f1", ascending=False).reset_index(drop=True)
results.head(10)

Unnamed: 0,model,parameters,accuracy,precision,recall,f1
0,Naive Bayes,{'alpha': 30.588015371390597},0.709812,0.771191,0.709812,0.706574
1,Naive Bayes,{'alpha': 30.625189859246774},0.709812,0.771191,0.709812,0.706574
2,Naive Bayes,{'alpha': 30.588015371390597},0.709812,0.771191,0.709812,0.706574
3,Naive Bayes,{'alpha': 30.625189859246774},0.709812,0.771191,0.709812,0.706574
4,Naive Bayes,{'alpha': 30.23127748796298},0.709571,0.77094,0.709571,0.706342
5,Naive Bayes,{'alpha': 30.23127748796298},0.709571,0.77094,0.709571,0.706342
6,Naive Bayes,{'alpha': 29.794544625913627},0.70933,0.770691,0.70933,0.70611
7,Naive Bayes,{'alpha': 29.794544625913627},0.70933,0.770691,0.70933,0.70611
8,Naive Bayes,{'alpha': 29.026521418263922},0.70933,0.770691,0.70933,0.70611
9,Naive Bayes,{'alpha': 29.026521418263922},0.70933,0.770691,0.70933,0.70611


Best Model

In [7]:
best_params = results.iloc[0]['parameters']
best_model = MultinomialNB(**best_params)
best_model.fit(X_train_oversampled, y_train_oversampled)
best_model

MultinomialNB(alpha=30.588015371390597)

In [8]:
def print_results(actual, predictions):
    print(f"accuracy: {accuracy_score(actual, predictions)}")
    print(f"precision: {precision_score(actual, predictions, average='weighted')}")
    print(f"recall: {recall_score(actual, predictions, average='weighted')}")
    print(f"f1: {f1_score(actual, predictions, average='weighted')}")
    print(f"confusion matrix:\n{confusion_matrix(actual, predictions)}")

Predict on test set

In [9]:
test_pred = best_model.predict(X_test)
print_results(y_test, test_pred)

accuracy: 0.7738095238095238
precision: 0.7296030556150497
recall: 0.7738095238095238
f1: 0.751043873384299
confusion matrix:
[[  8  13   0]
 [ 15 122   0]
 [  0  10   0]]


Predict on full data

In [10]:
X_full = np.vstack([X_train, X_test])
y_full = pd.concat([y_train, y_test])

full_pred = best_model.predict(X_full)
print_results(y_full, full_pred)

accuracy: 0.5444993819530284
precision: 0.9089252404204361
recall: 0.5444993819530284
f1: 0.6579933972735262
confusion matrix:
[[ 43  17   0]
 [461 820 239]
 [  5  15  18]]


Business Metric

In [11]:
whole_df_x = pd.concat([X_train, X_test])
whole_df_x.shape

(1618, 11)

In [12]:
whole_df_x['year'] = whole_df_x.index.year

In [13]:
business_metric_results = pd.DataFrame(columns=['exp_strategy_annual_return', 'exp_benchmark_annual_return'])

for year in [2016, 2017, 2018, 2019, 2020, 2021, 2022]:
    year_data = whole_df_x[whole_df_x['year'] == year]
    year_data = year_data.drop(['year'], axis = 1)
    predict_x = best_model.predict(np.array(year_data))
    predictions = predict_x
    df_pred = pd.DataFrame({'prediction':predictions}, index=year_data.index)
    dates = df_pred.index

    if year == 2022:
        end_date = "2022-09-02"
    else:
        end_date = str(year+1) + "-01-01"
    df_prices = yf.download("^GSPC", start=dates[0], end=end_date)[['Adj Close']]
    df_prices = df_prices.reindex(df_pred.index)

    # create positions column
    positions = []
    prev = 0
    for i in range(len(df_pred)):
        if df_pred.iloc[i]['prediction'] == 0:
            positions.append(prev)
        else:
            prev = df_pred.iloc[i]['prediction']
            positions.append(prev)

    df_business = pd.DataFrame()
    df_business['stock_daily_log_return'] = np.log(df_prices /df_prices.shift(1))['Adj Close']
    df_business['prediction'] = df_pred['prediction']
    df_business['position'] = positions
    df_business['benchmark'] = 1 # long and hold strategy
    df_business["strategy_Returns"] = df_business["stock_daily_log_return"] * df_business["position"].shift(1)
    df_business["benchmark_Returns"] = df_business["stock_daily_log_return"] * df_business["benchmark"].shift(1)

    # Annual Mean Returns or Expected returns
    expected_strategy_annual_return = np.exp(df_business['strategy_Returns'].mean() * 252) - 1 
    expected_benchmark_annual_return = np.exp(df_business['benchmark_Returns'].mean() * 252) - 1 
    business_metric_results.loc[year] = [expected_strategy_annual_return, expected_benchmark_annual_return]

business_metric_results

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Unnamed: 0,exp_strategy_annual_return,exp_benchmark_annual_return
2016,-0.128447,0.107616
2017,-0.065058,0.185753
2018,-0.026594,-0.070634
2019,-0.016214,0.288443
2020,0.206166,0.152929
2021,-0.15133,0.28923
2022,0.279334,-0.249185


Best and Worst Predictors

In [68]:
target_dict = {-1:'SELL', 0:'HOLD', 1:'BUY'}
columns = {'feature': X_train.columns}
for i in range(3):
    columns[target_dict[best_model.classes_[i]]] = best_model.coef_[i]
df_predictors = pd.DataFrame(columns)

for target in ['BUY', 'HOLD', 'SELL']:
    print(f'== {target} ==')
    print(f'Best predictor:', df_predictors.loc[df_predictors[target].idxmax(), 'feature'])
    print('Worst predictor:', df_predictors.loc[df_predictors[target].idxmin(), 'feature'])
    print("\n")

== BUY ==
Best predictor: reddit_neu_both
Worst predictor: nyt_neg


== HOLD ==
Best predictor: monthly_st_ir_growth
Worst predictor: monthly_unemployment_growth


== SELL ==
Best predictor: reddit_neg_both
Worst predictor: monthly_cpi_growth


