In [1]:
import datetime
import pandas as pd
import numpy as np
import yfinance as yf
from lightgbm import LGBMClassifier

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import ADASYN

from scipy.stats import loguniform, uniform, randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore')



In [2]:
lgbm_distribution = {
    'learning_rate': loguniform(0.01, 0.99),
    'max_depth': randint(1, 10),
    'min_child_weight': randint(1, 10),
    'boosting':  ['gbdt', 'dart'],
    'min_gain_to_split': uniform(0, 10),
    'num_leaves': randint(10, 300, 10),
    'subsample': uniform(0.5, 0.4),
    'colsample_bytree': uniform(0.5, 0.4),
    'colsample_bynode': uniform(0.5, 0.4),
    'reg_lambda': uniform(0, 1),
    'reg_alpha': randint(0, 100)
}

Import Data

In [3]:
X_train_3perc_lag3 = pd.read_excel("data/model_inputs2/x_train_3perc_lag3.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date')
X_test_3perc_lag3 = pd.read_excel("data/model_inputs2/x_test_3perc_lag3.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date')
y_train_3perc_lag3 = pd.read_excel("data/model_inputs2/y_train_3perc_lag3.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date')
y_test_3perc_lag3 = pd.read_excel("data/model_inputs2/y_test_3perc_lag3.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date')

X_train_3perc_lag7 = pd.read_excel("data/model_inputs2/x_train_3perc_lag7.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date')
X_test_3perc_lag7 = pd.read_excel("data/model_inputs2/x_test_3perc_lag7.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date')
y_train_3perc_lag7 = pd.read_excel("data/model_inputs2/y_train_3perc_lag7.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date')
y_test_3perc_lag7 = pd.read_excel("data/model_inputs2/y_test_3perc_lag7.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date')

X_train_5perc_lag3 = pd.read_excel("data/model_inputs2/x_train_5perc_lag3.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date')
X_test_5perc_lag3 = pd.read_excel("data/model_inputs2/x_test_5perc_lag3.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date')
y_train_5perc_lag3 = pd.read_excel("data/model_inputs2/y_train_5perc_lag3.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date')
y_test_5perc_lag3 = pd.read_excel("data/model_inputs2/y_test_5perc_lag3.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date')

X_train_5perc_lag7 = pd.read_excel("data/model_inputs2/x_train_5perc_lag7.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date')
X_test_5perc_lag7 = pd.read_excel("data/model_inputs2/x_test_5perc_lag7.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date')
y_train_5perc_lag7 = pd.read_excel("data/model_inputs2/y_train_5perc_lag7.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date')
y_test_5perc_lag7 = pd.read_excel("data/model_inputs2/y_test_5perc_lag7.xlsx").rename(columns={'Unnamed: 0':'date'}).set_index('date')

Oversampling Methods

In [4]:
def smote(X_train, y_train):
    sm = SMOTE(sampling_strategy='not majority')
    X_train_oversampled, y_train_oversampled = sm.fit_resample(X_train, y_train)
    return X_train_oversampled, y_train_oversampled

def random_oversampler(X_train, y_train):
    oversample = RandomOverSampler(sampling_strategy='not majority')
    X_over, y_over = oversample.fit_resample(X_train, y_train)
    return X_over, y_over

def adasyn(X_train, y_train):
    ada = ADASYN(sampling_strategy = 'not majority')
    X_resampled, y_resampled = ADASYN().fit_resample(X_train, y_train)
    return X_resampled, y_resampled

12 combinations without tuning

In [5]:
perc_col, lag_col, oversampling_method, accuracy, precision, recall, f1, pred_count = [], [], [], [], [], [], [], []

for perc in ['3perc', '5perc']:
    for lag in [3, 7]:
        for oversampling in ['smote', 'random_oversampler', 'adasyn']:
            X_train = eval(f'X_train_{perc}_lag{lag}')
            X_test = eval(f'X_test_{perc}_lag{lag}')
            y_train = eval(f'y_train_{perc}_lag{lag}')
            y_test = eval(f'y_test_{perc}_lag{lag}')

            # oversampling
            if oversampling == 'smote':
                X_train_oversampled, y_train_oversampled = smote(X_train, y_train)
            elif oversampling == 'random_oversampler':
                X_train_oversampled, y_train_oversampled = random_oversampler(X_train, y_train)
            else:
                X_train_oversampled, y_train_oversampled = adasyn(X_train, y_train)

            # fit and predict
            lgbm = LGBMClassifier()
            pred = lgbm.fit(X_train_oversampled, y_train_oversampled).predict(X_test)

            # update columns
            perc_col.append(perc)
            lag_col.append(lag)
            oversampling_method.append(oversampling)
            accuracy.append(accuracy_score(y_test, pred))
            precision.append(precision_score(y_test, pred, average='weighted'))
            recall.append(recall_score(y_test, pred, average='weighted'))
            f1.append(f1_score(y_test, pred, average='weighted'))
            pred_count.append(dict(pd.Series(pred).value_counts().sort_index()))

results_no_tuning = pd.DataFrame({
    'model': "LGBM",
    'perc_threshold': perc_col,
    'lag': lag_col,
    'oversampling': oversampling_method,
    'accuracy': accuracy,
    'precision': precision,
    'recall': recall,
    'f1': f1,
    'pred_count': pred_count
})
results_no_tuning

Unnamed: 0,model,perc_threshold,lag,oversampling,accuracy,precision,recall,f1,pred_count
0,LGBM,3perc,3,smote,0.52381,0.396568,0.52381,0.382244,"{-1: 6, 0: 162}"
1,LGBM,3perc,3,random_oversampler,0.505952,0.326183,0.505952,0.354892,"{-1: 4, 0: 164}"
2,LGBM,3perc,3,adasyn,0.5,0.368571,0.5,0.391333,"{-1: 15, 0: 150, 1: 3}"
3,LGBM,3perc,7,smote,0.5,0.368846,0.5,0.367822,"{-1: 7, 0: 160, 1: 1}"
4,LGBM,3perc,7,random_oversampler,0.52381,0.405332,0.52381,0.40809,"{-1: 13, 0: 155}"
5,LGBM,3perc,7,adasyn,0.511905,0.455865,0.511905,0.376708,"{-1: 4, 0: 161, 1: 3}"
6,LGBM,5perc,3,smote,0.815476,0.665001,0.815476,0.732592,{0: 168}
7,LGBM,5perc,3,random_oversampler,0.815476,0.665001,0.815476,0.732592,{0: 168}
8,LGBM,5perc,3,adasyn,0.815476,0.665001,0.815476,0.732592,{0: 168}
9,LGBM,5perc,7,smote,0.815476,0.665001,0.815476,0.732592,{0: 168}


Tuning for best combination

In [15]:
perc_col, lag_col, oversampling_method, param, accuracy, precision, recall, f1 = [], [], [], [], [], [], [], []

perc = '3perc'
lag = 3
oversampling = 'adasyn'
X_train = eval(f'X_train_{perc}_lag{lag}')
X_test = eval(f'X_test_{perc}_lag{lag}')
y_train = eval(f'y_train_{perc}_lag{lag}')
y_test = eval(f'y_test_{perc}_lag{lag}')

# oversampling
if oversampling == 'smote':
    X_train_oversampled, y_train_oversampled = smote(X_train, y_train)
elif oversampling == 'random_oversampler':
    X_train_oversampled, y_train_oversampled = random_oversampler(X_train, y_train)
else:
    X_train_oversampled, y_train_oversampled = adasyn(X_train, y_train)

# tuning
lgbm = LGBMClassifier()
lgbm_clf = RandomizedSearchCV(lgbm, lgbm_distribution, n_iter=100, scoring=['accuracy', 'recall_weighted', 'precision_weighted', 'f1_weighted'], refit='f1_weighted', random_state=42)
lgbm_search = lgbm_clf.fit(X_train_oversampled, y_train_oversampled)

# update columns
perc_col.append([perc]*100)
lag_col.append([lag]*100)
oversampling_method.append([oversampling]*100)
param.append(lgbm_search.cv_results_['params'])
accuracy.append(lgbm_search.cv_results_['mean_test_accuracy'])
precision.append(lgbm_search.cv_results_['mean_test_precision_weighted'])
recall.append(lgbm_search.cv_results_['mean_test_recall_weighted'])
f1.append(lgbm_search.cv_results_['mean_test_f1_weighted'])



In [16]:
results = pd.DataFrame({
    'model': "LGBM",
    'perc_threshold': np.array(perc_col).flatten(),
    'lag': np.array(lag_col).flatten(),
    'oversampling': np.array(oversampling_method).flatten(),
    'parameters': np.array(param).flatten(),
    'accuracy': np.array(accuracy).flatten(),
    'precision': np.array(precision).flatten(),
    'recall': np.array(recall).flatten(),
    'f1': np.array(f1).flatten(),
}).sort_values(by="f1", ascending=False).reset_index(drop=True)

In [17]:
results.head(10)

Unnamed: 0,model,perc_threshold,lag,oversampling,parameters,accuracy,precision,recall,f1
0,LGBM,3perc,3,adasyn,"{'boosting': 'dart', 'colsample_bynode': 0.798...",0.571472,0.520569,0.571472,0.501369
1,LGBM,3perc,3,adasyn,"{'boosting': 'dart', 'colsample_bynode': 0.872...",0.543514,0.516259,0.543514,0.474393
2,LGBM,3perc,3,adasyn,"{'boosting': 'dart', 'colsample_bynode': 0.858...",0.526499,0.489024,0.526499,0.462336
3,LGBM,3perc,3,adasyn,"{'boosting': 'gbdt', 'colsample_bynode': 0.898...",0.505428,0.539001,0.505428,0.462215
4,LGBM,3perc,3,adasyn,"{'boosting': 'dart', 'colsample_bynode': 0.807...",0.522226,0.520544,0.522226,0.459099
5,LGBM,3perc,3,adasyn,"{'boosting': 'dart', 'colsample_bynode': 0.566...",0.515013,0.55884,0.515013,0.45717
6,LGBM,3perc,3,adasyn,"{'boosting': 'gbdt', 'colsample_bynode': 0.872...",0.519285,0.54462,0.519285,0.457085
7,LGBM,3perc,3,adasyn,"{'boosting': 'gbdt', 'colsample_bynode': 0.885...",0.520348,0.533502,0.520348,0.456158
8,LGBM,3perc,3,adasyn,"{'boosting': 'gbdt', 'colsample_bynode': 0.732...",0.500107,0.516107,0.500107,0.452773
9,LGBM,3perc,3,adasyn,"{'boosting': 'gbdt', 'colsample_bynode': 0.775...",0.507864,0.497251,0.507864,0.451909


Best Model

In [18]:
best_params, best_perc, best_lag, best_oversampling = results.iloc[0]['parameters'], results.iloc[0]['perc_threshold'], results.iloc[0]['lag'], results.iloc[0]['oversampling']
best_model = LGBMClassifier(**best_params)
X_train, y_train, x_test, y_test = eval(f'X_train_{best_perc}_lag{best_lag}'), eval(f'y_train_{best_perc}_lag{best_lag}'), eval(f'X_test_{best_perc}_lag{best_lag}'), eval(f'y_test_{best_perc}_lag{best_lag}')

if best_oversampling == 'smote':
    X_train_oversampled, y_train_oversampled = smote(X_train, y_train)
elif best_oversampling == 'random_oversampler':
    X_train_oversampled, y_train_oversampled = random_oversampler(X_train, y_train)
else:
    X_train_oversampled, y_train_oversampled = adasyn(X_train, y_train)

best_model.fit(X_train_oversampled, y_train_oversampled)
best_model

LGBMClassifier(boosting='dart', colsample_bynode=0.7983898102194873,
               colsample_bytree=0.6634073171743667,
               learning_rate=0.7274514955731546, max_depth=9,
               min_child_weight=2, min_gain_to_split=1.8123306616566015,
               num_leaves=54, reg_alpha=0, reg_lambda=0.741120649290059,
               subsample=0.7297892452719648)

In [19]:
def print_results(actual, predictions):
    print(f"accuracy: {accuracy_score(actual, predictions)}")
    print(f"precision: {precision_score(actual, predictions, average='weighted')}")
    print(f"recall: {recall_score(actual, predictions, average='weighted')}")
    print(f"f1: {f1_score(actual, predictions, average='weighted')}")
    print(f"confusion matrix:\n{confusion_matrix(actual, predictions)}")

Predict on test set

In [20]:
test_pred = best_model.predict(X_test)
print_results(y_test, test_pred)

accuracy: 0.47619047619047616
precision: 0.3524386724386725
recall: 0.47619047619047616
f1: 0.36762024332645804
confusion matrix:
[[ 4 36  3]
 [ 6 76  4]
 [ 1 38  0]]


Predict on full data

In [21]:
X_full = pd.concat([X_train, X_test])
y_full = pd.concat([y_train, y_test])

full_pred = best_model.predict(X_full)
print_results(y_full, full_pred)

accuracy: 0.8826682549136391
precision: 0.8852326648367429
recall: 0.8826682549136391
f1: 0.8836647653103599
confusion matrix:
[[  99   40    4]
 [  34 1248   68]
 [   2   49  135]]


Business metric

In [22]:
whole_df_x = pd.concat([X_train, X_test])
whole_df_x['year'] = whole_df_x.index.year

In [23]:
business_metric_results = pd.DataFrame(columns=['exp_strategy_annual_return', 'exp_benchmark_annual_return'])

for year in [2016, 2017, 2018, 2019, 2020, 2021, 2022]:
    year_data = whole_df_x[whole_df_x['year'] == year]
    # year_data = year_data.set_index('index')
    year_data = year_data.drop(['year'], axis = 1)
    predict_x = best_model.predict(np.array(year_data)) 
    # predictions = np.argmax(predict_x,axis=1)
    predictions = predict_x

    df_pred = pd.DataFrame({'prediction':predictions}, index=year_data.index)
    df_pred = df_pred.replace({-1:1, 1:-1}) # convert classes to buy hold sell
    dates = df_pred.index

    if year == 2022:
        end_date = "2022-09-02"
    else:
        end_date = str(year+1) + "-01-01"
    df_prices = yf.download("^GSPC", start=dates[0], end=end_date)[['Adj Close']]

    # create positions column
    positions = []
    prev = 0
    for i in range(len(df_pred)):
        if df_pred.iloc[i]['prediction'] == 0:
            positions.append(prev)
        else:
            prev = df_pred.iloc[i]['prediction']
            positions.append(prev)

    df_business = pd.DataFrame()
    df_business['stock_daily_log_return'] = np.log(df_prices /df_prices.shift(1))['Adj Close']
    df_business['prediction'] = df_pred['prediction']
    df_business['position'] = positions
    df_business['benchmark'] = 1 # long and hold strategy
    df_business["strategy_Returns"] = df_business["stock_daily_log_return"] * df_business["position"].shift(1)
    df_business["benchmark_Returns"] = df_business["stock_daily_log_return"] * df_business["benchmark"].shift(1)

    # Annual Mean Returns or Expected returns
    expected_strategy_annual_return = np.exp(df_business['strategy_Returns'].mean() * 252) - 1 
    expected_benchmark_annual_return = np.exp(df_business['benchmark_Returns'].mean() * 252) - 1 
    business_metric_results.loc[year] = [expected_strategy_annual_return, expected_benchmark_annual_return]
    # print(f'Expected Annual Returns: Strategy: {round(expected_strategy_annual_return*100, 2)}%  |  Stock: {round(expected_benchmark_annual_return*100, 2)}%')

business_metric_results

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Unnamed: 0,exp_strategy_annual_return,exp_benchmark_annual_return
2016,-0.096594,0.112846
2017,0.0,0.185753
2018,-0.013877,-0.070634
2019,0.053949,0.288443
2020,-0.393371,0.152929
2021,0.019595,0.28923
2022,-0.061623,-0.249185
