In [39]:
import numpy as np
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt
from numpy import random as rand
from scipy import *
import time as T
import gtda
import plotly
from plotly.offline import init_notebook_mode, iplot
from plotly.graph_objs import *
init_notebook_mode(connected=True)
from gtda.plotting import plot_point_cloud
from gtda.plotting import plot_diagram
from gtda.time_series import TakensEmbedding
from gtda.time_series import takens_embedding_optimal_parameters

# Let pandas render up to 100 rows before truncating a displayed frame.
pd.options.display.max_rows = 100

In [2]:
# Location of the training CSV, relative to the notebook's working directory.
path_to_data = "../train.csv"

# Read the whole file once; later cells derive their frames from ALL_DATA.
ALL_DATA = pd.read_csv(filepath_or_buffer=path_to_data)

# Proof of concept

Here, we'll try a proof of concept using persistent homology and time series prediction.   
- We'll use the data at hand to try to predict the last value of the Target time series (obviously without using Target itself as an input, since that is the value we need to predict)
- We'll do this by treating every date and stock as different inputs (i.e. the same stock on different days and different stocks on the same day are equally distinct inputs)
- We will perform sliding-window-delay-embedding (SWDE) on each input (time series) and use that to solve a **classification problem** on the final point: split the real line into bins, which bin does the point end up in?
- Then, armed with bin probabilities of the final point, we use those probabilities plus LSTM or Regressor of the final minute or 2 (6 to 12 data points $\times$ columns $\times$ inputs) to find a precise final answer
- Finally, we can try and assess which columns are most important by removing columns and seeing how much the answer changes

In [55]:
def preprocess_PoC(df, final_second=540):
    """
    Absolute bare minimal preprocessing for the Optiver dataset.

    Adapted from Jay's preprocessing (modified a bit).

    Steps, in order:
      1. Drop 'near_price' and 'far_price', then drop every row that still
         contains a NaN.
      2. Index by (date_id, stock_id, seconds_in_bucket) and sort on those levels.
      3. Drop 'row_id' and 'time_id' (not needed for training).
      4. Split off 'target', keeping only the rows at `final_second`.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data with at least the columns 'date_id', 'stock_id',
        'seconds_in_bucket', 'near_price', 'far_price', 'row_id', 'time_id'
        and 'target'. The frame passed in is not modified.
    final_second : int, default 540
        Value of 'seconds_in_bucket' whose 'target' is kept as the label of
        each (date_id, stock_id) series.

    Returns
    -------
    features : pandas.DataFrame
        Remaining columns (without 'target'), indexed by
        (date_id, stock_id, seconds_in_bucket).
    targets : pandas.DataFrame
        Single column 'target', indexed by (date_id, stock_id); the
        'seconds_in_bucket' level is dropped by the cross-section.

    Raises
    ------
    KeyError
        If a required column is missing, or if `final_second` does not occur
        in 'seconds_in_bucket' after the NaN rows were dropped.
    """
    index_levels = ['date_id', 'stock_id', 'seconds_in_bucket']

    # near_price / far_price are deliberately NOT imputed here: they are dropped
    # for this stage and only come back later, during the precision part.
    #
    # Original author's data inspection: once those two columns are gone, only
    # 4 (stock, date) pairs still have missing values --
    # (131, 35); (101, 328); (158, 388); (19, 438) -- and stocks 101 and 19 have
    # 'target', 'bid_size' and 'ask_size' filled in but nothing else. That is
    # treated as not enough information for now, so every row with a NaN is dropped.
    cleaned = df.drop(columns=['near_price', 'far_price']).dropna()

    # The raw ordering varies (date_id, seconds_in_bucket, stock_id) from slow to
    # fast. For time-series work the last two are swapped, so that after sorting
    # each (date_id, stock_id) series occupies one contiguous run of rows.
    indexed = (cleaned
               .set_index(index_levels)
               .sort_index(level=index_levels, sort_remaining=False)
               .drop(columns=['row_id', 'time_id']))

    # Explicit cross-section on the time level. This replaces a bare 3-element
    # tuple passed to `.loc` without a column axis, which pandas has to
    # disambiguate between "three index levels" and "rows, columns, ...".
    targets = indexed[['target']].xs(final_second, level='seconds_in_bucket')
    features = indexed.drop(columns=['target'])

    return features, targets

### Process data and split training and validation

In [56]:
X_processed, y_processed = preprocess_PoC(ALL_DATA)
training_size_from_date = 450 # let's train on first 450 dates (date_id 0..449), validate on the rest

# `.loc` label slices include BOTH endpoints, so slicing train as 0:450 and
# validation as 450: would put date_id 450 in both sets (leakage) and give
# training 451 dates. Training therefore stops at the last date_id before the cut.
# A plain slice on a MultiIndex selects on the first level (date_id), so the same
# slicer works for X (3 index levels) and y (2 index levels).
train_dates = slice(0, training_size_from_date - 1)
valid_dates = slice(training_size_from_date, None)

X_train = X_processed.loc[train_dates]
y_train_unbinned = y_processed.loc[train_dates]

X_valid = X_processed.loc[valid_dates]
y_valid_unbinned = y_processed.loc[valid_dates]

# Guard against leakage: no date_id may appear on both sides of the split.
train_date_ids = X_train.index.get_level_values('date_id').unique()
valid_date_ids = X_valid.index.get_level_values('date_id').unique()
assert train_date_ids.intersection(valid_date_ids).empty, "train/validation date_ids overlap"

In [None]:
# Call giotto-tda's `takens_embedding_optimal_parameters` (imported at the top from
# gtda.time_series) on a single column converted to a NumPy array, with
# max_time_delay=100 and max_dimension=10, and unpack the two returned values.
# NOTE(review): `df` and `col` are not assigned in any cell visible above, and this
# cell has never been executed (`In [None]`); as written it would fail with a
# NameError on a fresh Restart & Run All unless they are defined elsewhere
# -- TODO pick the frame and column to embed.
# presumably `tau` is the chosen time delay and `dim` the embedding dimension --
# confirm the return order against the giotto-tda documentation.
tau, dim = takens_embedding_optimal_parameters(
    df[col].to_numpy(), max_time_delay=100, max_dimension=10
)