In [1]:
import warnings
warnings.simplefilter(action='ignore', category=UserWarning)

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import plotly.graph_objs as go
import plotly.express as px
from plotly.subplots import make_subplots
import gzip
from datetime import datetime, timedelta
from statistics import mean, median
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt

import tensorflow
import tensorflow.keras as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, LeakyReLU, BatchNormalization, ReLU, LSTM, Conv1D, Conv2D
from tensorflow.keras.activations import sigmoid, tanh
from tensorflow.keras.utils import to_categorical

from sklearn.preprocessing import MinMaxScaler

from tqdm import tqdm
import csv
import random
import math

from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import precision_score as precision
from sklearn.metrics import recall_score as recall
from sklearn.metrics import f1_score as f1

import pickle

In [2]:
def extract_model(filename):
    model = tf.models.load_model(filename)
    return model
    
def retrieve_data(filename):
    if "combined" in filename:
        df = pd.read_csv(filename)
    else:
        df = pd.read_csv(filename, index_col=0)
    df["Date"] = pd.to_datetime(df["Date"])
    return df

def create_classification_data(df, lookback, column):
    rows = []
    columns = ['Date', column] # Date and SP500_relative_change_perc_1 from t-0 are added first as target variables 
    # create column names based on original with the addition of t-i where i is lookback
    for i in range(1, lookback + 1): # starts at 1 since we do not want t-0 variables apart from 'Date' and 'SP500_relative_change_perc_1'
        new_columns = df.columns.tolist() # starts at 1 to exclude 'Date' column
        for x in range(len(new_columns)):
            new_columns[x] = new_columns[x] + "_t-" + str(i)
        columns = columns + new_columns
    
    # create lookback data
    for i, row in enumerate(df.iterrows()):
        if i > lookback: # lookback cannot be determined for earlier rows
            new_row = [row[1]['Date'], row[1][column]]
#             new_row = [row[1][0], row[1][1]] # add target 'Date' and 'SP500_relative_change_perc_1 '
            for x in range(1, lookback + 1): # starts at 1 since we do not want t-0 variables apart from 'Date' and 'SP500_relative_change_perc_1'
                add_row = df.iloc[i - x].tolist() # starts at 1 to exclude 'Date' column
                new_row = new_row + add_row
            rows.append(new_row)
    
    df2 = pd.DataFrame(rows)
    df2.columns = columns
                       
    for col in columns:
        if col[:4] == "Date" and col != "Date":
            df2 = df2.drop([col], axis=1)
    return df2

In [10]:
features_mag_sp500 = ['SP500_week_high_1_t-2', 'SP500_stochastic_K_50_t-1', 'SP500_week_low_1_t-1']
features_mag_us30 = ['US30_stochastic_K_20_t-1', 'US30_week_high_1_t-3', 'US30_ATR5_t-2']
features_mag_nasdaq = ['NASDAQ_F_Volume_t-1', 'NASDAQ_stochastic_K_50_t-3', 'HS50_F_relative_change_perc_50_t-1', 'CAC40_relative_change_perc_50_t-1', 'NASDAQ_stochastic_K_50_t-1', 'NASDAQ_week_low_1_t-1', 'USDCHF_relative_change_perc_5_t-1', 'NASDAQ_stochastic_K_10_t-1', 'EURUSD_relative_change_perc_1_t-3', 'US30_relative_change_perc_5_t-3']
features_mag_sp500.insert(0, 'SP500_relative_change_perc_1')
features_mag_sp500.insert(0, 'Date')
features_mag_us30.insert(0, 'US30_relative_change_perc_1')
features_mag_us30.insert(0, 'Date')
features_mag_nasdaq.insert(0, 'NASDAQ_relative_change_perc_1')
features_mag_nasdaq.insert(0, 'Date')

data_mag_sp500 = create_classification_data(retrieve_data("Dataset v3/SP500_combined_data_20220422.csv"), 3, 'SP500_relative_change_perc_1')
data_mag_us30 = create_classification_data(retrieve_data("Dataset v3/US30_combined_data_20220422.csv"), 3, 'US30_relative_change_perc_1')
data_mag_nasdaq = create_classification_data(retrieve_data("Dataset v3/nasdaq_combined_data_20220422.csv"), 3, 'NASDAQ_relative_change_perc_1')

data_mag_sp500 = data_mag_sp500[features_mag_sp500]
data_mag_us30 = data_mag_us30[features_mag_us30]
data_mag_nasdaq = data_mag_nasdaq[features_mag_nasdaq]

In [11]:
def label_data(y):
    positives = []
    negatives = []
    y = list(y)
    for dev in y:
        if dev >= 0:
            positives.append(dev)
        else:
            negatives.append(dev)
    med_pos = median(positives)
    med_neg = median(negatives)
    
    labels = []
    for dev in y:
        if dev >= med_pos or dev <= med_neg:
            labels.append(1)
        else:
            labels.append(0)
    return labels
# 
data_mag_sp500['label'] = label_data(data_mag_sp500['SP500_relative_change_perc_1'])
data_mag_us30['label'] = label_data(data_mag_us30['US30_relative_change_perc_1'])
data_mag_nasdaq['label'] = label_data(data_mag_nasdaq['NASDAQ_relative_change_perc_1'])

In [13]:
data_mag_sp500['pred'] = data_mag_sp500['label'].shift(1)
data_mag_sp500 = data_mag_sp500[1:]
data_mag_sp500['pred'] = data_mag_sp500['pred'].astype(int)

data_mag_us30['pred'] = data_mag_us30['label'].shift(1)
data_mag_us30 = data_mag_us30[1:]
data_mag_us30['pred'] = data_mag_us30['pred'].astype(int)

data_mag_nasdaq['pred'] = data_mag_nasdaq['label'].shift(1)
data_mag_nasdaq = data_mag_nasdaq[1:]
data_mag_nasdaq['pred'] = data_mag_nasdaq['pred'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [14]:
def train_val_test_acc(df):
    df["Date"] = pd.to_datetime(df["Date"])

    val = df[df['Date'].dt.year == 2018]
    test = df[df['Date'].dt.year == 2019]
    train = df[df['Date'].dt.year < 2018]
    
    train_acc = accuracy(train['label'], train['pred'])
    val_acc = accuracy(val['label'], val['pred'])
    test_acc = accuracy(test['label'], test['pred'])
    print(train_acc)
    print(val_acc)
    print(test_acc)
    print("\n")
train_val_test_acc(data_mag_sp500)
train_val_test_acc(data_mag_us30)
train_val_test_acc(data_mag_nasdaq)

0.5517564402810304
0.5179282868525896
0.5753968253968254


0.5585205992509363
0.5378486055776892
0.5476190476190477


0.5543071161048689
0.549800796812749
0.5119047619047619




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
