In [5]:
%load_ext autoreload
%autoreload 2

import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from data_pipeline import run_pipeline
from preprocessing import preprocess
from fill_nans import fill_nans

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
def get_data_by_feature(data, feature):
    """Return the columns of ``data`` that belong to a single feature.

    A column is selected when the part of its name before the first "."
    is exactly ``feature`` (for example "Open.XYZ" matches "Open").

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose column labels are strings.
        Presumably yfinance-style price data, per the original note -- confirm.
    feature : str
        Prefix to match against each column name.

    Returns
    -------
    pandas.DataFrame
        The matching columns, in their original order (no columns when
        nothing matches).
    """
    matching = [name for name in data.columns if name.split(".")[0] == feature]
    return data[matching]

In [7]:
def normalize(x):
    """Min-max scale ``x`` linearly so its minimum maps to -1 and its maximum to 1.

    Parameters
    ----------
    x : array-like
        Numeric values. The minimum and maximum are taken with ``np.min`` /
        ``np.max`` over ``x``.

    Returns
    -------
    Same kind of object that ``x - np.min(x)`` produces (an ndarray for
    ndarray input), holding the scaled values.

    Notes
    -----
    When the input is constant its range is zero; dividing by it would give
    NaN everywhere. In that case a unit range is used instead, so a constant
    input maps to -1 (the lower bound). NaNs already present in ``x`` still
    propagate to the result.
    """
    lowest = np.min(x)
    value_range = np.max(x) - lowest
    # Swap a zero range for 1 so constant data does not trigger 0/0 -> NaN.
    safe_range = np.where(value_range == 0, 1, value_range)
    return (2 * (x - lowest) / safe_range) - 1

In [8]:
def feature_target_split(price_data, lag, prediction_interval=0):
    """Slice a 1-D series into lagged feature windows and binary targets.

    For every sample ``k``:

    * ``features[k]`` is ``price_data[k : k + lag]`` (the ``lag`` values
      immediately before the prediction point), and
    * ``target[k]`` is 1.0 when the sum of
      ``price_data[k + lag : k + lag + prediction_interval + 1]`` is
      strictly positive, otherwise 0.0.

    Parameters
    ----------
    price_data : numpy.ndarray
        1-D series of values (the caller in this notebook passes per-step
        price changes).
    lag : int
        Number of past values in each feature window.
    prediction_interval : int, optional
        Number of extra steps beyond the first one that are summed to form
        the target. ``0`` means only the next value is used.

    Returns
    -------
    features : numpy.ndarray, shape (n_samples, lag)
    target : numpy.ndarray, shape (n_samples,)
        ``n_samples = len(price_data) - lag - prediction_interval``. When the
        series is too short to form a single window, both arrays are empty
        (shapes ``(0, lag)`` and ``(0,)``) instead of raising on a negative
        array dimension.
    """
    n_samples = price_data.shape[0] - lag - prediction_interval
    if n_samples <= 0:
        # Not enough data for even one (window, target) pair.
        return np.zeros((0, lag)), np.zeros(0)

    features = np.zeros((n_samples, lag))
    target = np.zeros(n_samples)

    for row in range(n_samples):
        split = row + lag  # index of the first value that belongs to the target
        features[row] = price_data[row:split]
        horizon = price_data[split:split + prediction_interval + 1]
        target[row] = float(np.sum(horizon) > 0)

    return features, target

In [9]:
def prep_data(df, lag=25):
    """Turn a price DataFrame into model-ready feature and target arrays.

    Steps, per stock column:

    1. Take the "Open.*" and "Close.*" columns of ``df``.
    2. Scale the open series and the close series independently with
       ``normalize`` (each uses its own min/max over the whole column) and
       subtract them to get a per-step price-change series.
    3. Cut that series into ``lag``-long windows with binary targets via
       ``feature_target_split``.

    The samples of all stocks are then stacked into one data set.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with columns named "Open.<symbol>" and "Close.<symbol>".
    lag : int, optional
        Window length passed to ``feature_target_split``.

    Returns
    -------
    x : numpy.ndarray, shape (n_total_samples, lag)
    y : numpy.ndarray, shape (n_total_samples, 1)

    Raises
    ------
    ValueError
        If ``df`` contains no "Open.*" columns.
    """
    df_open = get_data_by_feature(df, "Open")
    df_close = get_data_by_feature(df, "Close")

    n_stocks = len(df_open.columns)
    if n_stocks == 0:
        raise ValueError('df has no "Open.*" columns to build samples from')

    x_parts = []
    y_parts = []
    # NOTE(review): Open and Close columns are paired by position, which
    # assumes both groups list the symbols in the same order -- confirm
    # against the pipeline output.
    for i in range(n_stocks):
        open_prices = df_open.iloc[:, i].values
        close_prices = df_close.iloc[:, i].values

        # derive price change from the independently scaled close and open
        price_change = normalize(close_prices) - normalize(open_prices)

        stock_x, stock_y = feature_target_split(price_change, lag)
        x_parts.append(stock_x)
        y_parts.append(stock_y)

    # stack the per-stock samples; targets become a column vector
    x = np.concatenate(x_parts)
    y = np.concatenate(y_parts).reshape(-1, 1)

    return x, y

In [None]:
# Location and file names of the raw price CSVs.
dir_path = "C:/Users/voyno/Desktop/data/finance/"
training_files = ["1wk1m_1.csv", "1wk1m_2.csv", "1wk1m_3.csv"]
testing_files = ["1wk1m_10.csv"]

# Time-series data for a subset of Russell 3000 stocks.
# Training: every training CSV is handed to the pipeline as one list of paths.
train_df = run_pipeline(
    [dir_path + file_name for file_name in training_files]
)
# Testing: only the first test file is used, passed as a single path string.
test_df = run_pipeline(dir_path + testing_files[0])

In [11]:
# Build lagged feature windows and binary targets for the training frame.
x_train, y_train = prep_data(train_df)
print(
    "Training data preparation complete\nx_train.shape =",
    x_train.shape,
    "\ny_train.shape =",
    y_train.shape,
)

# Same preparation for the held-out frame.
x_test, y_test = prep_data(test_df)
print(
    "Testing data preparation complete\nx_test.shape =",
    x_test.shape,
    "\ny_test.shape =",
    y_test.shape,
)

# prep_data returns targets as (n, 1) columns; collapse them to 1-D vectors.
y_train, y_test = y_train.flatten(), y_test.flatten()

Training data preparation complete
x_train.shape = (3972650, 25) 
y_train.shape = (3972650, 1)
Testing data preparation complete
x_test.shape = (1709400, 25) 
y_test.shape = (1709400, 1)


In [15]:
# --- Hyperparameters -------------------------------------------------------
# One input per lagged value in a feature window (second axis of x_train).
input_shape=(x_train.shape[1],)
node_num = 64          # units in every hidden Dense layer
dropout_prob = 0.25    # dropout rate applied after each hidden layer
batch_size=16384
epochs=10
verbose=1

# --- Architecture ----------------------------------------------------------
# Three ReLU hidden layers, each followed by dropout, then a single sigmoid
# unit that outputs the probability of the positive class.
layers = [
    Dense(node_num, activation="relu", input_shape=input_shape),
    Dropout(dropout_prob),
    Dense(node_num, activation="relu"),
    Dropout(dropout_prob),
    Dense(node_num, activation="relu"),
    Dropout(dropout_prob),
    Dense(1, activation="sigmoid")]

# Binary classifier: cross-entropy loss, Adam optimizer, accuracy tracked
# under the name 'acc' (so the validation series is read back as "val_acc").
model = Sequential(layers)
model.compile(
    loss="binary_crossentropy", 
    optimizer="adam", 
    metrics=['acc'])

# NOTE(review): the test arrays are passed as validation data, so the
# validation accuracy below is measured on the same data any later "test"
# evaluation would use -- confirm this is intended.
# NOTE(review): no random seed is set in the visible cells, so results are
# presumably not reproducible run-to-run -- confirm.
history = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_test, y_test),
    verbose=verbose)

# This is the average of the per-epoch validation accuracies across all
# epochs, not the accuracy of the final model.
print("Mean validation accuracy:", np.mean(history.history["val_acc"]))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Mean validation accuracy: 0.6477760016918183
