In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import yfinance as yf
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from lightgbm import LGBMClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [2]:
X_train = pd.read_excel("data/model_inputs/x_train.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date')
X_test = pd.read_excel("data/model_inputs/x_test.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date')
y_train = pd.read_excel("data/model_inputs/y_train.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date')
y_test = pd.read_excel("data/model_inputs/y_test.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date')

Without resampling

In [3]:
y_train.value_counts()

decision
 0          1436
 1            45
-1            30
dtype: int64

In [4]:
logreg_model = LogisticRegression()
logreg_model.fit(X_train, y_train.values.ravel())
y_pred_logreg = logreg_model.predict(X_test)

print("======== Logistic Regression ========")
print(f"accuracy: {accuracy_score(y_test, y_pred_logreg)}")
print(f"precision: {precision_score(y_test, y_pred_logreg, average='weighted')}")
print(f"recall: {recall_score(y_test, y_pred_logreg, average='weighted')}")
print(f"roc_auc: {f1_score(y_test, y_pred_logreg, average='weighted')}")
print(f"confusion matrix:\n{confusion_matrix(y_test, y_pred_logreg)}")

accuracy: 0.8154761904761905
precision: 0.6650014172335601
recall: 0.8154761904761905
roc_auc: 0.7325917252146761
confusion matrix:
[[  0  10   0]
 [  0 137   0]
 [  0  21   0]]


In [5]:
lgb_model = LGBMClassifier()
lgb_model.fit(X_train, y_train.values.ravel())
y_pred_lgb = lgb_model.predict(X_test)

print("======== LGBM ========")
print(f"accuracy: {accuracy_score(y_test, y_pred_lgb)}")
print(f"precision: {precision_score(y_test, y_pred_lgb, average='weighted')}")
print(f"recall: {recall_score(y_test, y_pred_lgb, average='weighted')}")
print(f"roc_auc: {f1_score(y_test, y_pred_lgb, average='weighted')}")
print(f"confusion matrix:\n{confusion_matrix(y_test, y_pred_lgb)}")

accuracy: 0.8154761904761905
precision: 0.6650014172335601
recall: 0.8154761904761905
roc_auc: 0.7325917252146761
confusion matrix:
[[  0  10   0]
 [  0 137   0]
 [  0  21   0]]


In [6]:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

nb_model = MultinomialNB()
nb_model.fit(X_train_scaled, y_train.values.ravel())
y_pred_nb = nb_model.predict(X_test_scaled)

print("======== Naive Bayes ========")
print(f"accuracy: {accuracy_score(y_test, y_pred_nb)}")
print(f"precision: {precision_score(y_test, y_pred_nb, average='weighted')}")
print(f"recall: {recall_score(y_test, y_pred_nb, average='weighted')}")
print(f"roc_auc: {f1_score(y_test, y_pred_nb, average='weighted')}")
print(f"confusion matrix:\n{confusion_matrix(y_test, y_pred_nb)}")

accuracy: 0.8154761904761905
precision: 0.6650014172335601
recall: 0.8154761904761905
roc_auc: 0.7325917252146761
confusion matrix:
[[  0  10   0]
 [  0 137   0]
 [  0  21   0]]


With sklearn resampling

In [7]:
df_train = pd.concat([X_train, y_train], axis=1)
df_train_hold = df_train[(df_train['decision']==0)] 
df_train_buy = df_train[(df_train['decision']==-1)] 
df_train_sell = df_train[(df_train['decision']==1)] 

# upsample buy and sell class
df_train_buy_upsampled = resample(df_train_buy, 
                                 replace=True,    # sample with replacement
                                 n_samples= len(df_train_hold)) # to match majority class

df_train_sell_upsampled = resample(df_train_sell, 
                                 replace=True,    # sample with replacement
                                 n_samples= len(df_train_hold)) # to match majority class

# Combine majority class with upsampled minority classes
df_train_upsampled = pd.concat([df_train_buy_upsampled, df_train_sell_upsampled, df_train_hold])
X_train_upsampled, y_train_upsampled = df_train_upsampled.drop(columns=['decision']), df_train_upsampled['decision']
y_train_upsampled.value_counts()

 0    1436
 1    1436
-1    1436
Name: decision, dtype: int64

In [8]:
logreg_model = LogisticRegression()
logreg_model.fit(X_train_upsampled, y_train_upsampled.values.ravel())
y_pred_logreg_resampling = logreg_model.predict(X_test)

print("======== Logistic Regression ========")
print(f"accuracy: {accuracy_score(y_test, y_pred_logreg_resampling)}")
print(f"precision: {precision_score(y_test, y_pred_logreg_resampling, average='weighted')}")
print(f"recall: {recall_score(y_test, y_pred_logreg_resampling, average='weighted')}")
print(f"roc_auc: {f1_score(y_test, y_pred_logreg_resampling, average='weighted')}")
print(f"confusion matrix:\n{confusion_matrix(y_test, y_pred_logreg_resampling)}")

accuracy: 0.10714285714285714
precision: 0.5474664224664224
recall: 0.10714285714285714
roc_auc: 0.0947394563873904
confusion matrix:
[[ 10   0   0]
 [129   8   0]
 [ 17   4   0]]


In [9]:
lgb_model = LGBMClassifier()
lgb_model.fit(X_train_upsampled, y_train_upsampled.values.ravel())
y_pred_lgb_resampling = lgb_model.predict(X_test)

print("======== LGBM ========")
print(f"accuracy: {accuracy_score(y_test, y_pred_lgb_resampling)}")
print(f"precision: {precision_score(y_test, y_pred_lgb_resampling, average='weighted')}")
print(f"recall: {recall_score(y_test, y_pred_lgb_resampling, average='weighted')}")
print(f"roc_auc: {f1_score(y_test, y_pred_lgb_resampling, average='weighted')}")
print(f"confusion matrix:\n{confusion_matrix(y_test, y_pred_lgb_resampling)}")

accuracy: 0.8154761904761905
precision: 0.6650014172335601
recall: 0.8154761904761905
roc_auc: 0.7325917252146761
confusion matrix:
[[  0  10   0]
 [  0 137   0]
 [  0  21   0]]


In [10]:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_upsampled)
X_test_scaled = scaler.transform(X_test)

nb_model = MultinomialNB()
nb_model.fit(X_train_scaled, y_train_upsampled.values.ravel())
y_pred_nb_resampling = nb_model.predict(X_test_scaled)

print("======== Naive Bayes ========")
print(f"accuracy: {accuracy_score(y_test, y_pred_nb_resampling)}")
print(f"precision: {precision_score(y_test, y_pred_nb_resampling, average='weighted')}")
print(f"recall: {recall_score(y_test, y_pred_nb_resampling, average='weighted')}")
print(f"roc_auc: {f1_score(y_test, y_pred_nb_resampling, average='weighted')}")
print(f"confusion matrix:\n{confusion_matrix(y_test, y_pred_nb_resampling)}")

accuracy: 0.7440476190476191
precision: 0.6917068854568855
recall: 0.7440476190476191
roc_auc: 0.7168296758155551
confusion matrix:
[[  0  10   0]
 [  0 121  16]
 [  0  17   4]]


With SMOTE resampling

In [11]:
sm = SMOTE(sampling_strategy='not majority')
X_train_oversampled, y_train_oversampled = sm.fit_resample(X_train, y_train)
y_train_oversampled.value_counts()

decision
-1          1436
 0          1436
 1          1436
dtype: int64

In [12]:
logreg_model = LogisticRegression()
logreg_model.fit(X_train_oversampled, y_train_oversampled.values.ravel())
y_pred_logreg_smote = logreg_model.predict(X_test)

print("======== Logistic Regression ========")
print(f"accuracy: {accuracy_score(y_test, y_pred_logreg_smote)}")
print(f"precision: {precision_score(y_test, y_pred_logreg_smote, average='weighted')}")
print(f"recall: {recall_score(y_test, y_pred_logreg_smote, average='weighted')}")
print(f"roc_auc: {f1_score(y_test, y_pred_logreg_smote, average='weighted')}")
print(f"confusion matrix:\n{confusion_matrix(y_test, y_pred_logreg_smote)}")

accuracy: 0.06547619047619048
precision: 0.4113238668961561
recall: 0.06547619047619048
roc_auc: 0.018497539630633154
confusion matrix:
[[ 10   0   0]
 [136   1   0]
 [ 20   1   0]]


In [13]:
lgb_model = LGBMClassifier()
lgb_model.fit(X_train_oversampled, y_train_oversampled.values.ravel())
y_pred_lgb_smote = lgb_model.predict(X_test)

print("======== LGBM ========")
print(f"accuracy: {accuracy_score(y_test, y_pred_lgb_smote)}")
print(f"precision: {precision_score(y_test, y_pred_lgb_smote, average='weighted')}")
print(f"recall: {recall_score(y_test, y_pred_lgb_smote, average='weighted')}")
print(f"roc_auc: {f1_score(y_test, y_pred_lgb_smote, average='weighted')}")
print(f"confusion matrix:\n{confusion_matrix(y_test, y_pred_lgb_smote)}")

accuracy: 0.8154761904761905
precision: 0.6650014172335601
recall: 0.8154761904761905
roc_auc: 0.7325917252146761
confusion matrix:
[[  0  10   0]
 [  0 137   0]
 [  0  21   0]]


In [14]:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_oversampled)
X_test_scaled = scaler.transform(X_test)

nb_model = MultinomialNB()
nb_model.fit(X_train_scaled, y_train_oversampled.values.ravel())
y_pred_nb_smote = nb_model.predict(X_test_scaled)

print("======== Naive Bayes ========")
print(f"accuracy: {accuracy_score(y_test, y_pred_nb_smote)}")
print(f"precision: {precision_score(y_test, y_pred_nb_smote, average='weighted')}")
print(f"recall: {recall_score(y_test, y_pred_nb_smote, average='weighted')}")
print(f"roc_auc: {f1_score(y_test, y_pred_nb_smote, average='weighted')}")
print(f"confusion matrix:\n{confusion_matrix(y_test, y_pred_nb_smote)}")

accuracy: 0.7380952380952381
precision: 0.6895043731778426
recall: 0.7380952380952381
roc_auc: 0.7129443326626425
confusion matrix:
[[  0  10   0]
 [  0 120  17]
 [  0  17   4]]


In [15]:
pd.DataFrame({
    'model': ['logreg', 'logreg_resampling', 'logreg_smote', 'lgbm', 'lgbm_resampling', 'lgbm_smote', 'nb', 'nb_resampling', 'nbg_smote'],
    'accuracy': [accuracy_score(y_test, y_pred_logreg), accuracy_score(y_test, y_pred_logreg_resampling), accuracy_score(y_test, y_pred_logreg_smote),
                accuracy_score(y_test, y_pred_lgb), accuracy_score(y_test, y_pred_lgb_resampling), accuracy_score(y_test, y_pred_lgb_smote),
                accuracy_score(y_test, y_pred_nb), accuracy_score(y_test, y_pred_nb_resampling), accuracy_score(y_test, y_pred_nb_smote)],
    'recall': [recall_score(y_test, y_pred_logreg, average='weighted'), recall_score(y_test, y_pred_logreg_resampling, average='weighted'), recall_score(y_test, y_pred_logreg_smote, average='weighted'),
                recall_score(y_test, y_pred_lgb, average='weighted'), recall_score(y_test, y_pred_lgb_resampling, average='weighted'), recall_score(y_test, y_pred_lgb_smote, average='weighted'),
                recall_score(y_test, y_pred_nb, average='weighted'), recall_score(y_test, y_pred_nb_resampling, average='weighted'), recall_score(y_test, y_pred_nb_smote, average='weighted')],
    'precision': [precision_score(y_test, y_pred_logreg, average='weighted'), precision_score(y_test, y_pred_logreg_resampling, average='weighted'), precision_score(y_test, y_pred_logreg_smote, average='weighted'),
                precision_score(y_test, y_pred_lgb, average='weighted'), precision_score(y_test, y_pred_lgb_resampling, average='weighted'), precision_score(y_test, y_pred_lgb_smote, average='weighted'),
                precision_score(y_test, y_pred_nb, average='weighted'), precision_score(y_test, y_pred_nb_resampling, average='weighted'), precision_score(y_test, y_pred_nb_smote, average='weighted')],
})

Unnamed: 0,model,accuracy,recall,precision
0,logreg,0.815476,0.815476,0.665001
1,logreg_resampling,0.107143,0.107143,0.547466
2,logreg_smote,0.065476,0.065476,0.411324
3,lgbm,0.815476,0.815476,0.665001
4,lgbm_resampling,0.815476,0.815476,0.665001
5,lgbm_smote,0.815476,0.815476,0.665001
6,nb,0.815476,0.815476,0.665001
7,nb_resampling,0.744048,0.744048,0.691707
8,nbg_smote,0.738095,0.738095,0.689504
