In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime

orderbook = "../datasets/PTP/GOOGL_2018-07-02_34200000_57600000_orderbook_10.csv"
message = "../datasets/PTP/GOOGL_2018-07-02_34200000_57600000_message_10.csv"
num_levels = 10

header_list = []
for i in range(num_levels):
    header_list = header_list + ["Pa%d"%(i+1),"Va%d"%(i+1),"Pb%d"%(i+1),"Vb%d"%(i+1)]
df_orderbook = pd.read_csv(orderbook,header=None,names=header_list)

df_message = pd.read_csv(message,usecols = [0,1,3,4,5], names=['time', 'type','size','price','direction'])
df_message.index = pd.Timestamp(datetime.date.today()) + pd.TimedeltaIndex(df_message.time, unit='s')
df_orderbook.index = df_message.index

# Spreads and mid-prices
def feature_v2(num_levels,df):
    for i in range(1,num_levels+1):
        df["spread%d"%(i)] = df["Pa%d"%(i)] - df["Pb%d"%(i)]
        df["midprice%d"%(i)] = (df["Pa%d"%(i)] + df["Pb%d"%(i)])/2
    return df

def feature_v3(num_levels,df):
    for i in range(1, num_levels):
        df["PA_diff%d"%(i)] = df["Pa%d"%(i+1)] - df["Pa%d"%(i)]
        df["PB_diff%d"%(i)] = df["Pb%d"%(i)] - df["Pb%d"%(i+1)]
    return df

def feature_v4(num_levels,df):
    lst = ["Pa%d"%(i+1) for i in range(num_levels)]
    df["Pa_mean"] = df[df.columns.intersection(lst)].sum(axis=1)    
    
    lst = ["Pb%d"%(i+1) for i in range(num_levels)]
    df["Pb_mean"] = df[df.columns.intersection(lst)].sum(axis=1)
    
    lst = ["Va%d"%(i+1) for i in range(num_levels)]
    df["Va_mean"] = df[df.columns.intersection(lst)].sum(axis=1)
    
    lst = ["Vb%d"%(i+1) for i in range(num_levels)]
    df["Vb_mean"] = df[df.columns.intersection(lst)].sum(axis=1)
    return df

def feature_v5(num_levels,df): # accum differences
    for i in range(num_levels):
        df["pri_accum_diff%d"%(i+1)] = 0
        df["vol_accum_diff%d"%(i+1)] = 0
        for k in range(i):
            df["pri_accum_diff%d"%(i+1)] += (df["Pa%d"%(k+1)] - df["Pb%d"%(k+1)])
            df["vol_accum_diff%d"%(i+1)] += (df["Va%d"%(i+1)] - df["Vb%d"%(i+1)])
    return df

In [2]:
df_orderbook = feature_v2(num_levels,df_orderbook)
df_orderbook = feature_v3(num_levels,df_orderbook)
df_orderbook = feature_v4(num_levels,df_orderbook)
df_orderbook = feature_v5(num_levels,df_orderbook)

In [3]:
df_orderbook.head()

Unnamed: 0,Pa1,Va1,Pb1,Vb1,Pa2,Va2,Pb2,Vb2,Pa3,Va3,...,pri_accum_diff6,vol_accum_diff6,pri_accum_diff7,vol_accum_diff7,pri_accum_diff8,vol_accum_diff8,pri_accum_diff9,vol_accum_diff9,pri_accum_diff10,vol_accum_diff10
2019-12-10 09:30:00.001513,11177800,44,11165500,128,11227100,100,11144200,66,11280000,2,...,750200,495,1142100,0,1793100,0,2575000,1600,3438800,0
2019-12-10 09:30:00.006005,11177800,44,11165500,127,11227100,100,11144200,66,11280000,2,...,750200,495,1142100,0,1793100,0,2575000,1600,3438800,0
2019-12-10 09:30:00.007099,11177800,44,11165500,127,11227100,100,11158600,5,11280000,2,...,693200,205,1073500,594,1512300,0,2226300,1600,3021400,0
2019-12-10 09:30:00.008490,11177800,44,11165500,127,11227100,100,11158600,5,11280000,2,...,693200,205,1073500,594,1512300,0,2161900,1600,2889100,0
2019-12-10 09:30:00.009083,11177800,44,11165500,127,11227100,100,11158600,5,11280000,2,...,685400,-270,993800,594,1385700,0,1972300,0,2686300,1800


In [None]:
window = 1000
df_orderbook['midprice_win%d'%window] = df_orderbook.rolling(window).agg({'midprice1':'mean'})
df_orderbook['midprice_win%d'%window].plot()

In [None]:
window = '1MIN'
df_orderbook['midprice_average'+window] = df_orderbook.rolling('1MIN').agg({'midprice1':'mean'})
df_orderbook['midprice_average'+window].plot()