In [1]:
import math
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler

%matplotlib inline

# functions to calculate WAMP, VWAP, etc. and display plots
# (the module itself is imported too, because later cells call `functions.<name>`)
import functions
from functions import *
from Datagenerator import DataGenerator

Using TensorFlow backend.


In [2]:
# Notebook-wide defaults.

# pandas: remove the row/column display limits
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

# matplotlib: default figure size
plt.rcParams["figure.figsize"] = [16, 9]

# Seed NumPy's global RNG so that the shuffle further down is repeatable.
np.random.seed(1234)
# set_random_seed(1234)

# Data layout.
data_root_dir = "data/kraken/"   # root directory that is walked for update/snapshot files
n_asset_pairs = 25               # number of asset pairs the file list is divided into
n_ss_per_file = 1440             # snapshots reserved per file when the dataset array is allocated
lob_depth = 100                  # presumably order-book levels per side -- TODO confirm
ss_shape = (lob_depth * 2, 3)    # shape of one snapshot: (rows, 3 values per row)

In [None]:
def read_file_list(data_root_dir, verbose=False):
    """Recursively collect the update and snapshot files below ``data_root_dir``.

    A file is classified by the first character of its name: ``'u'`` marks an
    update file, ``'s'`` a snapshot file. Every other file (for example
    ``.DS_Store``) is ignored.

    Args:
        data_root_dir: Directory that is walked recursively with ``os.walk``.
            A non-existent directory yields two empty lists.
        verbose: When true, print how many files of each kind were found.

    Returns:
        A tuple ``(updates_file_list, snapshots_file_list)``; each element is
        a lexicographically sorted list of file paths.
    """
    updates_file_list = []
    snapshots_file_list = []

    for dirpath, _dirnames, filenames in os.walk(data_root_dir):
        for filename in filenames:
            # os.path.join avoids a doubled separator when dirpath already ends with one
            path = os.path.join(dirpath, filename)
            if filename.startswith('u'):
                updates_file_list.append(path)
            elif filename.startswith('s'):
                snapshots_file_list.append(path)

    updates_file_list.sort()
    snapshots_file_list.sort()

    if verbose:
        # each count is reported next to the kind of file it actually describes
        print(len(updates_file_list), "update files read.")
        print(len(snapshots_file_list), "snapshot files read.")

    return updates_file_list, snapshots_file_list

In [None]:
# Discover the data files below the configured root directory.
updates_file_list, snapshots_file_list = read_file_list(data_root_dir, verbose=True)

# Assumes every asset pair contributes the same number of snapshot files (one per day).
n_files = len(snapshots_file_list)
n_days = n_files // n_asset_pairs

print(n_days, "days of data is collected.")

# Logistic Regression

In [None]:
# Labelling parameters used when the dataset is built.
# alpha: relative threshold -- in the 3-class case a move only counts as up/down
#        when the future mean WAMP differs from the current WAMP by more than alpha * WAMP
alpha = 0.002
# delay: number of following snapshots whose WAMP is averaged to label a snapshot
delay = 100
# n_labels: number of target classes, 2 (-1 / 1) or 3 (-1 / 0 / 1)
n_labels = 2

In [None]:
# Build the labelled dataset: one record per order-book snapshot.
#
# For every snapshot the raw snapshot, its prices relative to the WAMP, the
# timestamp and the WAMP are stored. A snapshot is labelled once `delay` later
# snapshots of the same asset pair are available, by comparing the mean WAMP of
# those `delay` snapshots with the snapshot's own WAMP.

# Fail fast on an unsupported class count instead of deep inside the loading loop.
if n_labels not in (2, 3):
    raise ValueError("n_labels must be 2 or 3, got {!r}".format(n_labels))

# Scalar fields are declared without a shape: the `(name, type, 1)` form is
# deprecated in NumPy and is slated to become a shape-(1,) sub-array.
# NOTE(review): float32 cannot represent epoch-second timestamps exactly --
# confirm whether the 'timestamp' field needs float64.
dataset = np.zeros(n_files * n_ss_per_file,
                   dtype=[('snapshot', np.float32, ss_shape),
                          ('rel_prices', np.float32, ss_shape[0]),
                          ('timestamp', np.float32),
                          ('wamp', np.float32),
                          ('label', int)])

global_cnt = 0

for i in range(n_asset_pairs):
    asset_pair_cnt = 0

    for day in range(n_days):
        # ndmin=2 keeps a file with a single snapshot two-dimensional
        data = np.loadtxt(snapshots_file_list[i*n_days + day], delimiter=',', ndmin=2)

        for ss in data:
            # the last column is the timestamp, the rest are triples of snapshot values
            snapshot = ss[:-1].reshape(-1, 3)
            timestamp = ss[-1]
            wamp = functions.calc_WAMP(ss)

            dataset["snapshot"][global_cnt] = snapshot
            dataset["rel_prices"][global_cnt] = snapshot[:, 0] / wamp
            dataset["timestamp"][global_cnt] = timestamp
            dataset["wamp"][global_cnt] = wamp

            # label the snapshot that now has `delay` successors within this asset pair
            if asset_pair_cnt >= delay:
                target = global_cnt - delay
                mean = np.mean(dataset["wamp"][target+1:global_cnt+1])
                act_wamp = dataset["wamp"][target]

                if n_labels == 3:
                    if mean - act_wamp < -(act_wamp*alpha):
                        label = -1
                    elif mean - act_wamp > act_wamp*alpha:
                        label = 1
                    else:
                        label = 0
                else:
                    label = -1 if mean < act_wamp else 1

                dataset["label"][target] = label

            asset_pair_cnt += 1
            global_cnt += 1

    # Rewind so the next asset pair overwrites the trailing snapshots that could
    # not be labelled; never rewind past the start of this asset pair.
    global_cnt -= min(delay, asset_pair_cnt)

# drop the unused tail of the pre-allocated array
dataset = dataset[:global_cnt]

In [None]:
# number of labelled records
dataset.shape[0]

In [None]:
# Visual check of the labelling: WAMP of the first 1500 records, coloured by label.
sample = dataset[:1500]
sample_idx = np.arange(len(sample))
sample_labels = sample["label"]

fig, ax = plt.subplots()

# One plot call per class instead of one per point keeps the figure at three artists.
is_down = sample_labels == -1
is_up = sample_labels == 1
is_other = ~(is_down | is_up)

for mask, fmt, name in [(is_down, 'ro', 'label -1'),
                        (is_up, 'go', 'label 1'),
                        (is_other, 'y.', 'other label')]:
    ax.plot(sample_idx[mask], sample["wamp"][mask], fmt, label=name)

ax.set_xlabel("record index")
ax.set_ylabel("WAMP")
ax.set_title("WAMP of the first {} records, coloured by label".format(len(sample)))
ax.legend()

plt.show()

In [None]:
# Number of records labelled -1, counted in one vectorised pass
# (no Python-level loop, and the globals `data` and `i` are left alone).
cnt = int(np.count_nonzero(dataset["label"] == -1))
print(cnt)

In [None]:
# fraction of the records held out for validation
valid_split = 0.2

# total number of records available for the split
nb_samples = dataset.shape[0]

In [None]:
# Shuffle the records in place using NumPy's global RNG (seeded with np.random.seed above).
# The original ordering is lost, and re-running this cell shuffles the array again.
# NOTE(review): each label is derived from the `delay` following snapshots, so neighbouring
# records are presumably strongly correlated; shuffling before the split may leak
# information between train and validation -- confirm whether a chronological split is intended.
np.random.shuffle(dataset)

In [None]:
# Train / validation split: the first (1 - valid_split) share of the records is
# used for training, the remainder for validation.
split_idx = int(nb_samples*(1-valid_split))

# inputs: the relative prices only
X_train = dataset['rel_prices'][:split_idx]
X_valid = dataset['rel_prices'][split_idx:]

# targets: the class labels
Y_train = dataset['label'][:split_idx]
Y_valid = dataset['label'][split_idx:]

In [None]:
# Standardise the features. The scaler is fitted on the training split only and
# the same transformation is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_valid = scaler.transform(X_train), scaler.transform(X_valid)

In [None]:
# (number of training records, number of features)
np.shape(X_train)

In [None]:
# Inverse regularisation strength handed to LogisticRegression.
C = 1e-5

# Binary labels use the lbfgs solver; otherwise a multinomial model is fitted with sag.
# `n_labels` is the class count defined with the labelling parameters
# (the name `labels` used here before is never defined).
if n_labels == 2:
    logreg = linear_model.LogisticRegression(C=C, solver='lbfgs')
else:
    logreg = linear_model.LogisticRegression(C=C, solver='sag', multi_class='multinomial')

# fit() returns the estimator itself, so `model` and `logreg` are the same object
model = logreg.fit(X_train, Y_train)

In [None]:
# mean accuracy of the fitted classifier on the validation split
logreg.score(X=X_valid, y=Y_valid)

In [None]:
# predicted class label for every validation record
predictions = logreg.predict(X=X_valid)

In [None]:
# Class names shown on the confusion-matrix axes. The class count is `n_labels`
# (the name `labels` used here before is never defined); any other count
# yields an empty list.
class_names = {2: [-1, 1], 3: [-1, 0, 1]}.get(n_labels, [])

np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
functions.plot_confusion_matrix(Y_valid, predictions, classes=class_names,
                      title='Confusion matrix, without normalization')

# Plot normalized confusion matrix
functions.plot_confusion_matrix(Y_valid, predictions, classes=class_names, normalize=True,
                      title='Normalized confusion matrix')

In [3]:
# Instantiate the project-local DataGenerator on the same data root directory.
# NOTE(review): its constructor is not visible here; presumably it indexes the
# snapshot files below the root -- confirm against Datagenerator.py.
g = DataGenerator(data_root_dir)

In [9]:
# Show the "path" entry of the generator's statistics (the recorded output lists snapshot file paths).
# NOTE(review): this prints every path; consider slicing (e.g. [:10]) to keep the output short.
g.statistics["path"]

array(['data/kraken/ada_eur/ss_2019-03-15.csv.gz',
       'data/kraken/ada_eur/ss_2019-03-16.csv.gz',
       'data/kraken/ada_eur/ss_2019-03-17.csv.gz',
       'data/kraken/ada_eur/ss_2019-03-18.csv.gz',
       'data/kraken/ada_eur/ss_2019-03-19.csv.gz',
       'data/kraken/ada_eur/ss_2019-03-20.csv.gz',
       'data/kraken/ada_eur/ss_2019-03-21.csv.gz',
       'data/kraken/ada_eur/ss_2019-03-22.csv.gz',
       'data/kraken/ada_eur/ss_2019-03-23.csv.gz',
       'data/kraken/ada_eur/ss_2019-03-24.csv.gz',
       'data/kraken/ada_usd/ss_2019-03-15.csv.gz',
       'data/kraken/ada_usd/ss_2019-03-16.csv.gz',
       'data/kraken/ada_usd/ss_2019-03-17.csv.gz',
       'data/kraken/ada_usd/ss_2019-03-18.csv.gz',
       'data/kraken/ada_usd/ss_2019-03-19.csv.gz',
       'data/kraken/ada_usd/ss_2019-03-20.csv.gz',
       'data/kraken/ada_usd/ss_2019-03-21.csv.gz',
       'data/kraken/ada_usd/ss_2019-03-22.csv.gz',
       'data/kraken/ada_usd/ss_2019-03-23.csv.gz',
       'data/kraken/ada_usd/ss_