Main paper: https://arxiv.org/pdf/1705.03233.pdf

Report with dataloader:
- https://cs230.stanford.edu/projects_fall_2018/reports/12445636.pdf
- https://github.com/dunetz/CS230Final

In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('..')

In [8]:
import numpy as np
import pandas as pd
from io import StringIO
from zipfile import ZipFile

class DataLoader():
    def __init__(self,path):
        self.path=path
        self.zipfiles=['Day%i.zip' %i for i in range(1,2)]
        self.files=['Test_Dst_NoAuction_ZScore_CF_%i.txt' % i for i in range(1,2)]
        self.files.insert(0,'Train_Dst_NoAuction_ZScore_CF_1.txt')
        self.index=np.array([ 
                        [0, 3454,  9772, 14694, 25413, 39512],
                        [0, 5079, 11201, 17166, 23878, 38397],
                        [0, 3903,  9314, 13341, 18527, 28535],
                        [0, 2806,  9798, 15140, 25959, 37023],
                        [0, 3030,  9758, 15704, 22082, 34785],
                        [0, 2263,  8506, 14113, 21120, 39152],
                        [0, 2801,  9861, 16601, 24455, 37346],
                        [0, 2647, 11309, 19900, 33129, 55478],
                        [0, 1873, 11144, 21180, 34060, 52172],
                        [0, 1888,  7016, 12738, 18559, 31937]
                            ]) # index into each stock for each day
        self.diff=np.diff(self.index)

    
    # return dataframe for one day (rows are values,columns are times)
    #        
    def get_one_day(self,day):
        # days are indexed from 0 to 9
        z=self.path+self.zipfiles[day]
        zf=ZipFile(z,'r')
        data = StringIO(zf.read(self.files[day]).decode('utf_8'))
        df=pd.read_csv(data,sep='\s{2,}',engine='python',header=None)
        s=np.repeat([0,1,2,3,4],self.diff[day])
        return df
    
    # return index of stocks for list of days
    #
    def get_stock_index(self,days):
        s=[np.repeat([0,1,2,3,4],self.diff[day]) for day in days]
        return s
    
    #return  1) dataframe for multiple days - transpose so that columns are values and rows are time
    #        
    def get_days(self,days):
        df=[]
        for day in days:
            d=self.get_one_day(day)
            d=pd.DataFrame(d.values.T)
            df.append(d)
        df=pd.concat(df)
        return df


In [10]:
path='../data/HFT/'
d=DataLoader(path)
s=d.get_stock_index(range(10))
for i in range(1):
    df=d.get_one_day(i)
    print(i,len(s[i]),df.shape)
  

0 39512 (149, 39512)


In [13]:
colx=range(40)# columns to look at in dataset - 0-40 are LOB
coly=147 # target 144-148 are 1,2,3,5,10

In [16]:
# LOB data
df.iloc[:, colx]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,0.408275,0.408275,0.407254,0.407254,0.407254,0.407254,0.407254,0.410317,0.410317,0.410317,...,0.410317,0.412358,0.411338,0.411338,0.411338,0.410317,0.410317,0.410317,0.411338,0.411338
1,-0.509405,-0.604272,-0.663731,-0.530115,-0.530115,-0.530115,-0.737220,-0.131941,-0.198749,-0.198749,...,-0.609617,-0.703148,-0.613625,-0.613625,-0.613625,-0.592915,-0.592915,-0.592915,-0.544813,-0.544813
2,0.403577,0.403577,0.403577,0.403577,0.403577,0.404600,0.404600,0.408691,0.408691,0.409714,...,0.405623,0.405623,0.405623,0.405623,0.405623,0.405623,0.405623,0.405623,0.405623,0.405623
3,-0.479331,-0.479331,-0.479331,-0.479331,-0.479331,-0.531439,-0.442366,-0.446374,-0.446374,-0.401837,...,-0.401837,-0.401837,-0.401837,-0.401837,-0.535448,-0.535448,-0.535448,-0.535448,-0.535448,-0.535448
4,0.409877,0.410898,0.406815,0.408857,0.408857,0.408857,0.407836,0.410898,0.409877,0.409877,...,0.410898,0.411919,0.410898,0.410898,0.410898,0.409877,0.409877,0.409877,0.414981,0.410898
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144,2.000000,2.000000,3.000000,2.000000,1.000000,1.000000,1.000000,1.000000,2.000000,1.000000,...,1.000000,1.000000,3.000000,3.000000,1.000000,3.000000,3.000000,1.000000,1.000000,2.000000
145,2.000000,2.000000,3.000000,2.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,3.000000,3.000000,2.000000,3.000000,2.000000,1.000000,1.000000,1.000000
146,2.000000,2.000000,2.000000,3.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,1.000000,3.000000,3.000000,3.000000,3.000000,1.000000,1.000000,1.000000
147,2.000000,2.000000,2.000000,2.000000,2.000000,3.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,3.000000,3.000000,2.000000,3.000000,1.000000


In [20]:
df.nunique().sort_values()

35147    135
38304    136
24218    136
38980    136
38979    136
        ... 
27780    139
27781    139
27788    139
27771    139
39511    139
Length: 39512, dtype: int64

In [17]:
df.iloc[:, coly]

0      0.418484
1     -0.703148
2      0.417896
3     -0.323452
4      0.419064
         ...   
144    2.000000
145    2.000000
146    2.000000
147    2.000000
148    3.000000
Name: 147, Length: 149, dtype: float64