In [1]:
import numpy as np
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt
from numpy import random as rand
from scipy import *
import time as T
import gtda
import plotly
from plotly.offline import init_notebook_mode, iplot
from plotly.graph_objs import *
init_notebook_mode(connected=True)
from gtda.plotting import plot_point_cloud
from gtda.plotting import plot_diagram
from gtda.time_series import TakensEmbedding, SlidingWindow, takens_embedding_optimal_parameters
from gtda.homology import VietorisRipsPersistence

pd.set_option('display.max_rows', 100)

In [2]:
path_to_data = "../train.csv"
ALL_DATA = pd.read_csv(path_to_data)

# Proof of concept

Here, we'll try a proof of conept using persistent homology and time series prediction.   
- We'll use that data at hand to try and predict the last value of the Target time series (obviously without using Target itself, since we'll need that
- We'll do this by treating every date and stock as different inputs (i.e. same stock on different day and different stock on same days are equally different inputs)
- We will perform sliding-window-delay-embedding (SWDE) on each input (time series) and use that to solve a **classification problem** on the final point: split the real line into bins, which bin does the point end up in?
- Then, armed with bin probabilities of the final point, we use those probabilities plus LSTM or Regressor of the final minute or 2 (6 to 12 data points $\times$ columns $\times$ inputs) to find a precise final answer
- Finally, we can try and assess which columns are most important by removing columns and seeing how much the answer changes

In [3]:
def preprocess_PoC(df):
    """
    Absolute bare minimal preprocessing for the Optiver dataset
    
    stolen from Jay ;)  (modified a bit)
    """

    # Fill NaNs in near_price and far_price with reference_price
#     df['near_price'] = df['near_price'].fillna(df['reference_price'])
#     df['far_price'] = df['far_price'].fillna(df['reference_price'])
#     df['wap'] = df['wap'].fillna(df['reference_price'])

    # Instead of the above, after a good look, there are only 4 stocks on 4 particular days missing:
    # (stock,date) -- (131,35); (101,328); (158,388); (19,438)
    # and for some reason stock 101 and 19 have columns filled in for 'Target', 'bid_size', and 'ask_size'
    # but notthing else.. strange but we'll treat that as not enough information for now
    
    # !! So, in that case we will actually just DROP ALL 'NaN'  !!
    # But, first we need to drop near and far price columns, we won't use these in this part of the training
    # only later during the precision part
        
    df = df.drop(columns=['near_price','far_price']).dropna()
    
    # I think it is also good to re-index in a more intuitive way.. right now the indexing from slow to fast is:
    # (  date_id, seconds_in_bucket, stock_id  ), but since we are doing time series, I think it makes sense
    # to swap the speed of seconds and stock, so that i.loc[55*(x-1):55*x] indexes a stock on a given day
    df = (df
          .set_index(['date_id','stock_id','seconds_in_bucket'])
          .sort_index(level=['date_id','stock_id','seconds_in_bucket'], sort_remaining=False))
    
    # Drop row_id and time_id, not needed for training
    df = df.drop(['row_id', 'time_id'], axis=1)
    # And, in this case, we make sure to only select on the last data point of each time series input
    targets = df[['target']].loc[pd.IndexSlice[:, :, 540]]
    df = df.drop(['target'], axis=1)
    
    return df, targets

### Process data and split training and validation

In [4]:
X_processed, y_processed = preprocess_PoC(ALL_DATA)
training_size_from_date = 450 # let's train on first 450 dates, validate on the rest

X_train = X_processed.loc[pd.IndexSlice[0:training_size_from_date, :, :]]
y_train_unbinned = y_processed.loc[pd.IndexSlice[0:training_size_from_date, :, :]]

X_valid = X_processed.loc[pd.IndexSlice[training_size_from_date:, :, :]]
y_valid_unbinned = y_processed.loc[pd.IndexSlice[training_size_from_date:, :, :]]

### Remove 'ALL_ASSETS' from memory since we won't need it anymore (Jupyter will crash otherwise)

In [5]:
%reset_selective ALL_ASSETS

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


### Perform sliding window and embeding on each time series input   
- We will have three sliding windows, one before the last 5 minutes, one containing the last 5 minutes, one in the middle which overlaps
- We will decide on the delay, tau, and dimension, dim, by whatever is best for the WAP, since this is the most closely related time series input to the target
- This will give us: 3 point clouds per input time series, each containing information from all data columns

In [6]:
NUM_OF_INPUTS = X_processed.shape[0]//55

time_series_split = np.array_split(X_processed,NUM_OF_INPUTS)

# SlidingWindow params:
# W_size is size of window
# S is stride, distance between next window start and last window start
W_size = 31
S = 12

In [None]:
np.asarray(time_series_split).shape

In [10]:
point_clouds = np.asarray(time_series_split)

In [15]:
for chunk in time_series_split:
    SW = SlidingWindow(size=W_size, stride=S)
    windows = SW.fit_transform(chunk.to_numpy())
#     windows = chunk.to_numpy()[np.newaxis,:,:]
    # TakensEmbedding expects a specific order:
    X = np.transpose(windows, (0, 2, 1))   # shape=(n_windows, num of data_columns, W_size)
    TE = TakensEmbedding(time_delay=1, dimension=3, stride=1, flatten=True)
    Xt = TE.fit_transform(X)
    break

In [None]:
plot_point_cloud(Xt[0])

In [None]:
plot_point_cloud(point_clouds[1000])

In [16]:
Xt.shape

(3, 29, 27)

In [21]:
VR = VietorisRipsPersistence(homology_dimensions=[0, 1, 2])
diagrams_test_delay = VR.fit_transform(Xt)
diagrams_test_delay.shape
plot_diagram(diagrams_test_delay[0])

In [70]:
normalized_split = []
for df in time_series_split:
    new_df = df
    pair_im_ma = df.xs(0, level='seconds_in_bucket')[['imbalance_size','matched_size']].to_numpy()
    pair_bid_ask = df.xs(0, level='seconds_in_bucket')[['bid_size','ask_size']].to_numpy()
    maxim1 , maxim2 = np.amax(pair_im_ma), np.amax(pair_bid_ask)
    
    new_df[['imbalance_size','matched_size']] = new_df[['imbalance_size','matched_size']]/np.amax(pair_im_ma)
    new_df[['bid_size','ask_size']] = new_df[['bid_size','ask_size']]/np.amax(pair_bid_ask)
    normalized_split.append(new_df)

normalized_split[1000]

KeyboardInterrupt: 

In [71]:
cols1 = ['imbalance_size','matched_size']
cols2 = ['bid_size','ask_size']

# Add a group id per (stock,day)
big = pd.concat(time_series_split, keys=range(len(time_series_split)), names=['gid'])

# ensure “first” = earliest time in group
big = big.sort_index(level=['gid','seconds_in_bucket'])

g = big.groupby(level='gid')

firsts1 = g[cols1].transform('first')
den1    = firsts1.max(axis=1)
big[cols1] = big[cols1].div(den1.where(den1 != 0, np.nan), axis=0)

firsts2 = g[cols2].transform('first')
den2    = firsts2.max(axis=1)
big[cols2] = big[cols2].div(den2.where(den2 != 0, np.nan), axis=0)

# if you still need a list back:
normalized_split = [sub.droplevel('gid') for _, sub in big.groupby(level='gid')]


In [73]:
time_series_split[1000]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,bid_price,bid_size,ask_price,ask_size,wap
date_id,stock_id,seconds_in_bucket,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
5,44,0,0.122151,1,1.000026,1.0,0.999835,1.0,1.000074,0.450558,1.0
5,44,10,0.0,0,1.000026,1.122151,0.99974,0.027024,0.999883,0.450472,0.999748
5,44,20,0.0,0,0.999883,1.136545,0.999597,0.009007,0.999883,2.018114,0.999598
5,44,30,0.0,0,0.999931,1.136545,0.99974,1.909727,0.999931,0.900987,0.99987
5,44,40,0.0,0,0.999692,1.136545,0.999501,0.018012,0.999788,3.369208,0.999503
5,44,50,0.0,0,0.999835,1.136545,0.999549,3.233307,0.999883,0.117123,0.999871
5,44,60,0.0,0,0.999597,1.148372,0.999405,0.045026,0.99974,0.027024,0.999614
5,44,70,0.0,0,0.999979,1.154704,0.999788,2.243136,0.999979,0.90103,0.999924
5,44,80,0.0,0,0.999883,1.154704,0.999644,0.53143,0.999931,1.801974,0.99971
5,44,90,0.0,0,0.999835,1.154704,0.999644,1.89153,0.999835,0.900901,0.999774


In [72]:
normalized_split[1000]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,bid_price,bid_size,ask_price,ask_size,wap
date_id,stock_id,seconds_in_bucket,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
5,44,0,0.122151,1,1.000026,1.0,0.999835,1.0,1.000074,0.450558,1.0
5,44,10,0.0,0,1.000026,1.122151,0.99974,0.027024,0.999883,0.450472,0.999748
5,44,20,0.0,0,0.999883,1.136545,0.999597,0.009007,0.999883,2.018114,0.999598
5,44,30,0.0,0,0.999931,1.136545,0.99974,1.909727,0.999931,0.900987,0.99987
5,44,40,0.0,0,0.999692,1.136545,0.999501,0.018012,0.999788,3.369208,0.999503
5,44,50,0.0,0,0.999835,1.136545,0.999549,3.233307,0.999883,0.117123,0.999871
5,44,60,0.0,0,0.999597,1.148372,0.999405,0.045026,0.99974,0.027024,0.999614
5,44,70,0.0,0,0.999979,1.154704,0.999788,2.243136,0.999979,0.90103,0.999924
5,44,80,0.0,0,0.999883,1.154704,0.999644,0.53143,0.999931,1.801974,0.99971
5,44,90,0.0,0,0.999835,1.154704,0.999644,1.89153,0.999835,0.900901,0.999774
