In [13]:
import warnings
warnings.simplefilter(action='ignore', category=UserWarning)

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import plotly.graph_objs as go
import plotly.express as px
from plotly.subplots import make_subplots
import gzip
from datetime import datetime, timedelta
from statistics import mean, median
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt

import tensorflow
import tensorflow.keras as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, LSTM, Conv1D, Conv2D, Dropout
from tensorflow.keras.activations import sigmoid, tanh
from tensorflow.keras.utils import to_categorical

from sklearn.preprocessing import MinMaxScaler

from tqdm import tqdm
import csv
import random

from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import precision_score as precision
from sklearn.metrics import recall_score as recall
from sklearn.metrics import f1_score as f1

In [28]:
def retrieve_data(varname, filename):
    if "combined" in filename:
        df = pd.read_csv(filename)
    else:
        df = pd.read_csv(filename, index_col=0)
    df["Date"] = pd.to_datetime(df["Date"])
    return df

def create_classification_data(df, lookback, column):
    rows = []
    columns = ['Date', column] # Date and SP500_relative_change_perc_1 from t-0 are added first as target variables 
    # create column names based on original with the addition of t-i where i is lookback
    for i in range(1, lookback + 1): # starts at 1 since we do not want t-0 variables apart from 'Date' and 'SP500_relative_change_perc_1'
        new_columns = df.columns.tolist() # starts at 1 to exclude 'Date' column
        for x in range(len(new_columns)):
            new_columns[x] = new_columns[x] + "_t-" + str(i)
        columns = columns + new_columns
    
    # create lookback data
    for i, row in enumerate(df.iterrows()):
        if i > lookback: # lookback cannot be determined for earlier rows
            new_row = [row[1]['Date'], row[1][column]]
#             new_row = [row[1][0], row[1][1]] # add target 'Date' and 'SP500_relative_change_perc_1 '
            for x in range(1, lookback + 1): # starts at 1 since we do not want t-0 variables apart from 'Date' and 'SP500_relative_change_perc_1'
                add_row = df.iloc[i - x].tolist() # starts at 1 to exclude 'Date' column
                new_row = new_row + add_row
            rows.append(new_row)
    
    df2 = pd.DataFrame(rows)
    df2.columns = columns
                       
    for col in columns:
        if col[:4] == "Date" and col != "Date":
            df2 = df2.drop([col], axis=1)
    return df2

def create_train_val_test(df, year_val, year_test, perc_train=None):
    if perc_train == None:
        # assumes years_train < year_val < year_test
        df["Date"] = pd.to_datetime(df["Date"])
        
        val = df[df['Date'].dt.year == year_val]
        test = df[df['Date'].dt.year == year_test]
        train = df[df['Date'].dt.year < year_val]
    else:
        train = df.head(round(len(df) * perc_train))
        val = df.tail(len(df) - len(train))
        test = val.tail(round(0.5 * len(val)))
        val = val.head(len(val) - len(test))
    y_train = train['SP500_relative_change_perc_1']
    x_train = train.drop(['SP500_relative_change_perc_1'], axis=1)
    
    y_val = val['SP500_relative_change_perc_1']
    x_val = val.drop(['SP500_relative_change_perc_1'], axis=1)
    
    y_test = test['SP500_relative_change_perc_1']
    x_test = test.drop(['SP500_relative_change_perc_1'], axis=1)
    
    return x_train, y_train, x_val, y_val, x_test, y_test

def scale_data(x):
    standard_scaler = MinMaxScaler()
    x_scaled = pd.DataFrame(standard_scaler.fit_transform(x), columns=x.columns)
    return x_scaled

In [36]:
val_year = 2018
test_year = 2019


mode = 'reduced'
if mode == 'reduced':
    files = {
        # varname: filename
        "S&P500": "Dataset v3/SP500_reduced_data_20220425.csv",
    #     "S&P500": "Dataset v3/SP500_combined_data_20220422.csv"
    }
else:
    files = {
        # varname: filename
        "S&P500": "Dataset v3/SP500_combined_data_20220422.csv",
    #     "S&P500": "Dataset v3/SP500_combined_data_20220422.csv"
    }

for file in files:
    df = retrieve_data(file, files[file])
    
lookback = 3
if mode == 'reduced': # REDUCED DATA
    if lookback == 3:
        features = ['SP500_stochastic_D_5_5_t-3', 'SP500_momentum_16_t-1', 'SP500_williams_R_10_t-1', 'SP500_williams_R_20_t-1', 'SP500_AD_MACD_12_26_t-3', 'SP500_stochastic_D_5_5_t-1', 'SP500_stochastic_K_50_t-3', 'SP500_F_relative_change_perc_1_t-3', 'SP500_stochastic_K_5_t-3', 'SP500_williams_R_5_t-1', 'Gold_F_relative_change_perc_1_t-2', 'SP500_stochastic_K_10_t-1', 'SP500_AD_oscillator_t-1', 'SP500_williams_R_10_t-2', 'SP500_momentum_8_t-1', 'SP500_stochastic_D_5_5_t-2']
    elif lookback == 5:
        features = ['SP500_momentum_16_t-1', 'SP500_williams_R_5_t-5', 'SP500_stochastic_K_50_t-5', 'SP500_williams_R_20_t-1', 'SP500_relative_change_perc_1_t-4', 'SP500_williams_R_50_t-3', 'SP500_AD_oscillator_t-4', 'SP500_stochastic_K_5_t-4', 'SP500_F_relative_change_perc_1_t-3', 'SP500_AD_oscillator_t-3', 'Silver_F_relative_change_perc_1_t-5', 'Gold_F_relative_change_perc_1_t-2', 'SP500_stochastic_K_50_t-4', 'SP500_williams_R_50_t-4', 'SP500_stochastic_K_10_t-1', 'Silver_F_relative_change_perc_1_t-4', 'SP500_AD_oscillator_t-1', 'SP500_williams_R_10_t-2', 'SP500_stochastic_K_5_t-2', 'SP500_stochastic_D_5_5_t-2', 'SP500_williams_R_20_t-3', 'Copper_F_relative_change_perc_1_t-4']
    elif lookback == 10:
        features = ['SP500_stochastic_K_50_t-5', 'SP500_williams_R_10_t-3', 'SP500_relative_change_perc_1_t-4', 'SP500_williams_R_50_t-3', 'Copper_F_relative_change_perc_1_t-7', 'SP500_stochastic_K_20_t-4', 'Gold_F_relative_change_perc_1_t-10', 'SP500_stochastic_K_5_t-4', 'SP500_momentum_16_t-7', 'SP500_stochastic_K_10_t-10', 'Silver_F_relative_change_perc_1_t-5', 'Gold_F_relative_change_perc_1_t-2', 'SP500_stochastic_K_50_t-4', 'SP500_williams_R_50_t-4', 'SP500_stochastic_K_10_t-1', 'SP500_stochastic_K_10_t-7', 'SP500_AD_oscillator_t-9', 'SP500_AD_oscillator_t-1', 'SP500_stochastic_K_5_t-2', 'Silver_F_relative_change_perc_1_t-10', 'SP500_stochastic_D_5_5_t-5', 'SP500_williams_R_10_t-7']
    elif lookback == 20:
        features = ['Silver_F_relative_change_perc_1_t-16', 'Copper_F_relative_change_perc_1_t-19', 'Gold_F_relative_change_perc_1_t-14', 'SP500_stochastic_K_50_t-13', 'Silver_F_relative_change_perc_1_t-5', 'SP500_AD_oscillator_t-6', 'SP500_F_relative_change_perc_1_t-13', 'SP500_AD_oscillator_t-17', 'Silver_F_relative_change_perc_1_t-8', 'SP500_stochastic_K_10_t-1', 'SP500_F_relative_change_perc_1_t-12', 'SP500_relative_change_perc_1_t-18', 'SP500_stochastic_K_5_t-6', 'SP500_stochastic_K_50_t-11', 'SP500_williams_R_5_t-5', 'SP500_williams_R_10_t-8', 'SP500_AD_oscillator_t-4', 'Copper_F_relative_change_perc_1_t-4', 'SP500_stochastic_K_10_t-10', 'SP500_williams_R_50_t-11', 'SP500_williams_R_50_t-4', 'SP500_momentum_8_t-20', 'SP500_momentum_8_t-4', 'SP500_stochastic_K_5_t-20', 'Silver_F_relative_change_perc_1_t-19', 'SP500_F_relative_change_perc_1_t-17', 'SP500_williams_R_5_t-18', 'SP500_F_relative_change_perc_1_t-10', 'SP500_momentum_8_t-14', 'Gold_F_relative_change_perc_1_t-10', 'SP500_stochastic_K_5_t-4', 'SP500_AD_oscillator_t-14', 'SP500_stochastic_K_50_t-4', 'Gold_F_relative_change_perc_1_t-8', 'SP500_stochastic_K_10_t-7', 'Silver_F_relative_change_perc_1_t-4', 'Silver_F_relative_change_perc_1_t-1', 'SP500_AD_oscillator_t-1', 'Copper_F_relative_change_perc_1_t-11', 'SP500_AD_oscillator_t-15', 'SP500_williams_R_10_t-11', 'SP500_stochastic_D_5_5_t-5', 'SP500_momentum_16_t-5', 'SP500_relative_change_perc_1_t-16', 'SP500_stochastic_K_50_t-5', 'SP500_stochastic_K_5_t-16', 'SP500_AD_MACD_12_26_t-8', 'SP500_momentum_16_t-7', 'Copper_F_relative_change_perc_1_t-6', 'SP500_stochastic_D_5_5_t-19', 'SP500_AD_oscillator_t-20', 'SP500_relative_change_perc_1_t-2', 'SP500_williams_R_50_t-14', 'SP500_stochastic_K_50_t-14', 'SP500_momentum_16_t-14', 'SP500_momentum_16_t-20', 'Gold_F_relative_change_perc_1_t-13', 'SP500_williams_R_5_t-6']
else: # COMBINED DATA
    if lookback == 3:
        features = ['SP500_momentum_8_t-1', 'SP500_momentum_16_t-1', 'SP500_stochastic_K_10_t-1', 'SP500_AD_MACD_12_26_t-3', 'SP500_stochastic_K_50_t-3', 'Gold_F_relative_change_perc_1_t-2', 'SP500_williams_R_5_t-1', 'SP500_williams_R_10_t-2', 'SP500_stochastic_D_5_5_t-2', 'SP500_williams_R_20_t-1', 'SP500_relative_change_perc_1_t-3', 'SP500_williams_R_10_t-1', 'SP500_stochastic_D_5_5_t-1', 'SP500_AD_oscillator_t-1', 'SP500_stochastic_D_5_5_t-3', 'SP500_stochastic_K_5_t-3']
    elif lookback == 5:
        features = ['Silver_F_relative_change_perc_1_t-5', 'SP500_momentum_16_t-1', 'Silver_F_relative_change_perc_1_t-4', 'SP500_stochastic_K_5_t-2', 'SP500_stochastic_K_10_t-1', 'SP500_stochastic_K_5_t-4', 'Copper_F_relative_change_perc_1_t-4', 'SP500_williams_R_50_t-4', 'SP500_AD_oscillator_t-4', 'SP500_williams_R_50_t-3', 'SP500_AD_oscillator_t-3', 'SP500_stochastic_K_50_t-4', 'SP500_stochastic_K_50_t-5', 'SP500_relative_change_perc_1_t-4', 'SP500_williams_R_5_t-5', 'Gold_F_relative_change_perc_1_t-2', 'SP500_stochastic_D_5_5_t-2', 'SP500_williams_R_20_t-1', 'SP500_relative_change_perc_1_t-3', 'SP500_AD_oscillator_t-1']
    elif lookback == 10:
        features = ['Silver_F_relative_change_perc_1_t-5', 'Copper_F_relative_change_perc_1_t-7', 'SP500_stochastic_K_5_t-2', 'SP500_williams_R_10_t-3', 'SP500_stochastic_K_10_t-1', 'SP500_stochastic_D_5_5_t-5', 'SP500_AD_oscillator_t-9', 'SP500_stochastic_K_5_t-4', 'SP500_williams_R_50_t-4', 'SP500_williams_R_50_t-3', 'SP500_stochastic_K_50_t-4', 'SP500_stochastic_K_10_t-7', 'SP500_stochastic_K_50_t-5', 'SP500_stochastic_K_10_t-10', 'SP500_relative_change_perc_1_t-4', 'SP500_momentum_16_t-7', 'SP500_williams_R_10_t-7', 'Gold_F_relative_change_perc_1_t-2', 'SP500_stochastic_K_20_t-4', 'SP500_AD_oscillator_t-1', 'Gold_F_relative_change_perc_1_t-10', 'Silver_F_relative_change_perc_1_t-10']
    elif lookback == 20:
        features = ['Silver_F_relative_change_perc_1_t-5', 'SP500_stochastic_K_50_t-17', 'Gold_F_relative_change_perc_1_t-13', 'SP500_stochastic_K_10_t-1', 'SP500_momentum_8_t-20', 'SP500_F_relative_change_perc_1_t-12', 'SP500_williams_R_50_t-11', 'SP500_AD_oscillator_t-6', 'SP500_stochastic_K_50_t-5', 'SP500_AD_oscillator_t-17', 'SP500_AD_oscillator_t-14', 'Silver_F_relative_change_perc_1_t-1', 'SP500_stochastic_K_10_t-10', 'SP500_williams_R_10_t-8', 'SP500_williams_R_5_t-5', 'Copper_F_relative_change_perc_1_t-11', 'SP500_AD_MACD_12_26_t-8', 'Gold_F_relative_change_perc_1_t-10', 'SP500_AD_oscillator_t-15', 'SP500_stochastic_K_50_t-14', 'SP500_stochastic_K_50_t-9', 'SP500_relative_change_perc_1_t-16', 'Silver_F_relative_change_perc_1_t-8', 'SP500_williams_R_5_t-6', 'SP500_momentum_16_t-5', 'SP500_momentum_16_t-7', 'SP500_williams_R_10_t-13', 'SP500_momentum_16_t-20', 'Silver_F_relative_change_perc_1_t-14', 'SP500_stochastic_K_10_t-13', 'SP500_stochastic_K_5_t-4', 'SP500_stochastic_K_50_t-13', 'SP500_stochastic_D_5_5_t-5', 'Gold_F_relative_change_perc_1_t-8', 'Copper_F_relative_change_perc_1_t-4', 'SP500_AD_oscillator_t-19', 'SP500_williams_R_50_t-4', 'Gold_F_relative_change_perc_1_t-14', 'SP500_stochastic_K_50_t-4', 'SP500_stochastic_K_10_t-7', 'SP500_momentum_8_t-4', 'SP500_F_relative_change_perc_1_t-17', 'Silver_F_relative_change_perc_1_t-16', 'SP500_williams_R_10_t-9', 'SP500_F_relative_change_perc_1_t-3', 'SP500_stochastic_D_5_5_t-1', 'SP500_AD_oscillator_t-1', 'SP500_momentum_16_t-14', 'SP500_relative_change_perc_1_t-2', 'SP500_stochastic_K_5_t-6', 'Silver_F_relative_change_perc_1_t-4', 'SP500_AD_oscillator_t-20', 'SP500_momentum_8_t-16', 'Copper_F_relative_change_perc_1_t-19', 'SP500_stochastic_K_5_t-20', 'SP500_stochastic_K_50_t-11', 'SP500_momentum_16_t-19', 'SP500_stochastic_K_5_t-10']

# lookback = 10
# features = ['SP500_stochastic_K_50_t-5', 'SP500_williams_R_10_t-3', 'SP500_relative_change_perc_1_t-4', 'SP500_williams_R_50_t-3', 'Copper_F_relative_change_perc_1_t-7', 'SP500_stochastic_K_20_t-4', 'Gold_F_relative_change_perc_1_t-10', 'SP500_stochastic_K_5_t-4', 'SP500_momentum_16_t-7', 'SP500_stochastic_K_10_t-10', 'Silver_F_relative_change_perc_1_t-5', 'Gold_F_relative_change_perc_1_t-2', 'SP500_stochastic_K_50_t-4', 'SP500_williams_R_50_t-4', 'SP500_stochastic_K_10_t-1', 'SP500_stochastic_K_10_t-7', 'SP500_AD_oscillator_t-9', 'SP500_AD_oscillator_t-1', 'SP500_stochastic_K_5_t-2', 'Silver_F_relative_change_perc_1_t-10', 'SP500_stochastic_D_5_5_t-5', 'SP500_williams_R_10_t-7']
df = create_classification_data(df, lookback, 'SP500_relative_change_perc_1')

features.insert(0, 'SP500_relative_change_perc_1')
features.insert(0, 'Date')

df = df[features].copy()

y = df['SP500_relative_change_perc_1']
x = df.drop(['SP500_relative_change_perc_1'], axis=1)

# x_train, y_train, x_val, y_val, x_test, y_test = create_train_val_test(df, val_year, test_year)

In [37]:
def label_data_dir(y):
    y = list(y)    
    labels = []
    for dev in y:
        if dev >= 0:
            labels.append(1)
        else:
            labels.append(0)
    return labels

def label_data_mag(y):
    positives = []
    negatives = []
    y = list(y)
    for dev in y:
        if dev >= 0:
            positives.append(dev)
        else:
            negatives.append(dev)
            
    med_pos = median(positives)
    med_neg = median(negatives)
    
    labels = []
    for dev in y:
        if dev >= med_pos or dev <= med_neg:
            labels.append(1)
        else:
            labels.append(0)
    return labels
# 
y = label_data_mag(y)
# y_val = label_data_mag(y_val)
# y_test = label_data_mag(y_test)

In [32]:
def random_baseline(y):
    counts = [0, 0]
    for i in y:
        if i == 0:
            counts[0] = counts[0] + 1
        elif i == 1:
            counts[1] = counts[1] + 1
    print(f"\tDistribution: {counts}")
    print(f"\tRandom baseline accuracy (majority class): {counts[np.argmax(np.asarray(counts))]/ len(y)}")
    
# print("Random baseline training set")
# random_baseline(y_train)
# print("Random baseline validation set")
# random_baseline(y_val)
# print("Random baseline test set")
# random_baseline(y_test)

Random baseline training set
	Distribution: [1068, 1069]
	Random baseline accuracy (majority class): 0.5002339728591484
Random baseline validation set
	Distribution: [125, 126]
	Random baseline accuracy (majority class): 0.50199203187251
Random baseline test set
	Distribution: [125, 127]
	Random baseline accuracy (majority class): 0.503968253968254


In [33]:
def previous_day_pred(y):
#     pred = []
#     for i in range(len(y)):
#         if i > 0:
#             pred.append(y[i])
    return y[:-1]

In [34]:
# y_train_pred = previous_day_pred(y_train)
# y_val_pred = previous_day_pred(y_val)
# y_test_pred = previous_day_pred(y_test)

y_pred = previous_day_pred(y)
# y_val_pred = previous_day_pred(y_val)
# y_test_pred = previous_day_pred(y_test)

In [35]:
def evaluate(y, y_pred):
    acc = accuracy(y, y_pred)
    print(f"\tAccuracy: {acc}")
    
    prec = precision(y, y_pred)
    print(f"\tPrecision: {prec}")
    
    rec = recall(y, y_pred)
    print(f"\tRecall: {rec}")
    
    f = f1(y, y_pred)
    print(f"\tF1-score: {f}")
    
evaluate(y_train, y_train_pred)
evaluate(y_val, y_val_pred)
evaluate(y_test, y_test_pred)

	Accuracy: 0.5538389513108615
	Precision: 0.5543071161048689
	Recall: 0.5537885874649204
	F1-score: 0.5540477304632663
	Accuracy: 0.524
	Precision: 0.5238095238095238
	Recall: 0.528
	F1-score: 0.5258964143426296
	Accuracy: 0.5219123505976095
	Precision: 0.5238095238095238
	Recall: 0.5238095238095238
	F1-score: 0.5238095238095238
