# Data Preprocessing: Data Cleaning and Feature Generation 

In [None]:
import numpy as np
import pandas as pd
import os, datetime
import warnings
warnings.filterwarnings('ignore')


## ===== Hyper-parameters =====

In [None]:
# ======= price_features
# Column names of the bid/ask prices at book levels 1 through 5.
bid_price_features = [f'Bid{level}' for level in range(1, 6)]
ask_price_features = [f'Ask{level}' for level in range(1, 6)]


# ======= size_features
# Each size column is named after its price column plus a 'Size' suffix.
bid_size_features = [f'{price_col}Size' for price_col in bid_price_features]
ask_size_features = [f'{price_col}Size' for price_col in ask_price_features]

                      
# ======= Data Path
# Folder that receives the generated CSV files; it is created (and the
# creation reported) only when it does not exist yet.
processed_data_path = 'features/'
needs_folder = not os.path.isdir(processed_data_path)
if needs_folder:
    os.makedirs(processed_data_path)
    print("made folder:", processed_data_path)

## ===== Load data =====

In [None]:
# Read the order-book and trade tables from their parquet locations
# (book first, then trade).
df_book, df_trade = (pd.read_parquet(source) for source in ('./book', './trade'))

In [None]:
# Put the book rows in ascending TimeStamp order, then display the table.
df_book = df_book.sort_values('TimeStamp')
df_book

In [None]:
# Show summary statistics for df_book as the cell output.
df_book.describe()

### Generate the nextBid1 and nextAsk1 columns

In [None]:
# generate nextBid1 and nextAsk1 columns
# possible regularizers in some models, not features
#
# shift(-1) moves every value up by one row, so each row receives the Bid1 /
# Ask1 of the row that follows it in the current row order. The last row has
# no successor and is filled with NaN.
df_book = df_book.assign(
    nextBid1=df_book['Bid1'].shift(-1),
    nextAsk1=df_book['Ask1'].shift(-1),
)

In [None]:
# df_trade

## ===== Clean Data =====

### Book data

In [None]:
# Treat +/-inf as missing so that the NaN counts and the dropna() applied to
# df_book afterwards also catch infinite values.
# The global 'use_inf_as_na' option is deprecated in pandas (2.1+), whose
# deprecation notice recommends converting inf to NaN explicitly instead.
# NOTE: only the values present in df_book at this point are converted;
# infinities produced by later computations are left as they are.
df_book = df_book.replace([np.inf, -np.inf], np.nan)

In [None]:
# ----- check null, na, nan
# Per-column count of missing values in df_book, printed once per label.
# (isnull() is an alias of isna(), so every label reports the same count.)

for report_label in ('df_book.isnull().sum()',
                     'df_book.isnan().sum()',
                     "df_book.isna().sum()"):
    print(report_label, df_book.isna().sum())

In [None]:
# Remove every row of df_book that has a missing value in any column.
# Note: the nextAsk1 and nextBid1 in the next day will be dropped here too
df_book = df_book.dropna(axis=0, how='any')

In [None]:
# check null, na, nan
# Per-column count of missing values in df_book, printed once per label.
# (isnull() is an alias of isna(), so every label reports the same count.)

for report_label in ('df_book.isnull().sum()',
                     'df_book.isnan().sum()',
                     "df_book.isna().sum()"):
    print(report_label, df_book.isna().sum())

In [None]:
### Trade data

In [None]:
# print('df_trade.isnull().sum()', df_trade.isnull().sum())
# print('df_trade.isnan().sum()', df_trade.isna().sum())
# print("df_trade.isna().sum()", df_trade.isna().sum())

In [None]:
# # Drop all rows with NaN values
# df_trade.dropna(how='any', axis=0, inplace=True) 

In [None]:
# # check null, na, nan after MA

# print('df_trade.isnull().sum()', df_trade.isnull().sum())
# print('df_trade.isnan().sum()', df_trade.isna().sum())
# print("df_trade.isna().sum()", df_trade.isna().sum())

## ====== Generate Features ======

### ----- Price Statistical Features ----- 

In [None]:
# Row-wise sum of the per-level size columns, ask side first, then bid side.
for side, size_columns in (('Ask', ask_size_features), ('Bid', bid_size_features)):
    df_book[f'{side}_Total_Size'] = df_book[size_columns].sum(axis=1)

In [None]:
# ----- weighted avg. (mean) of Bid and Ask
# For each side: sum(price * size) over levels 1-5, divided by that side's
# total size column.

for side in ('Bid', 'Ask'):
    # Accumulate price * size one level at a time, from level 1 up to level 5.
    weighted_sum = df_book[f'{side}1'] * df_book[f'{side}1Size']
    for level in range(2, 6):
        weighted_sum = weighted_sum + df_book[f'{side}{level}'] * df_book[f'{side}{level}Size']
    df_book[f'{side}_Mean'] = weighted_sum / df_book[f'{side}_Total_Size']

# Display the resulting ask-side column as the cell output.
df_book['Ask_Mean']

In [None]:
# ----- microprice
# Size-weighted mid of the best bid and best ask. Each price is weighted by
# the size quoted on the OPPOSITE side, so a larger bid queue moves the value
# toward the ask and a larger ask queue moves it toward the bid.
# (Weighting each price by its own side's size moves the value the other way.)

best_level_size = df_book['Ask1Size'] + df_book['Bid1Size']
df_book['MicroPrice'] = (
    df_book['Ask1'] * df_book['Bid1Size'] + df_book['Bid1'] * df_book['Ask1Size']
) / best_level_size

### ----- Price Distance Features -----

In [None]:
# ----- spread features
# Ask-minus-bid difference at each level, divided by the 'midpt' column.

for level in range(1, 6):
    level_gap = df_book[f'Ask{level}'] - df_book[f'Bid{level}']
    df_book[f'Spread{level}'] = level_gap / df_book['midpt']

# Same ratio, computed from the two weighted-mean price columns.
mean_gap = df_book['Ask_Mean'] - df_book['Bid_Mean']
df_book['SpreadMean'] = mean_gap / df_book['midpt']

### ----- Size Ratio Features -----

In [None]:
# ----- size ratio features
# Bid size divided by ask size, level by level, then for the side totals.

for level in range(1, 6):
    df_book[f'BidAskRatio{level}'] = df_book[f'Bid{level}Size'] / df_book[f'Ask{level}Size']

df_book['BidAskRatioTotal'] = df_book['Bid_Total_Size'] / df_book['Ask_Total_Size']

In [None]:
# ----- Queue Imbalance features
# (bid size - ask size) / (bid size + ask size), computed for levels 1-5.

for level in range(1, 6):
    bid_size = df_book[f'Bid{level}Size']
    ask_size = df_book[f'Ask{level}Size']
    df_book[f'Q_ImB{level}'] = (bid_size - ask_size) / (bid_size + ask_size)

### ----- Size Distribution Features -----

In [None]:
# Ask/Bid size distribution at time t
# Each level's size as a fraction of its own side's total size; the result
# is stored in a '<size column>Prop' column. Ask columns are added first.

print(df_book)
for size_columns, total_column in (
    (ask_size_features, 'Ask_Total_Size'),
    (bid_size_features, 'Bid_Total_Size'),
):
    for size_col in size_columns:
        df_book[size_col + 'Prop'] = df_book[size_col] / df_book[total_column]

In [None]:
# Display df_book as the cell output.
df_book

## Save processed_data

In [None]:
# Write both tables as CSV files (without the index) into the output folder.
# os.path.join builds the file paths so the result does not depend on
# processed_data_path ending with a path separator.
df_book.to_csv(os.path.join(processed_data_path, 'book.csv'), index=False)
df_trade.to_csv(os.path.join(processed_data_path, 'trade.csv'), index=False)

## ======= Data pre-processing finished =======

In [None]:
# Read both parquet datasets again; this rebinds df_book and df_trade to the
# freshly loaded tables, replacing whatever they referred to before.
df_book, df_trade = (pd.read_parquet(source) for source in ('./book', './trade'))
df_trade

In [None]:
# Display df_book as the cell output.
df_book

In [None]:
# Add a coarser time key to both tables: TimeStamp floor-divided by 1,000,000.
# (presumably microseconds -> seconds; TODO confirm the TimeStamp unit)
ticks_per_bucket = 1000000
df_book['TimeStamp_s'] = df_book['TimeStamp'].floordiv(ticks_per_bucket)
df_trade['TimeStamp_s'] = df_trade['TimeStamp'].floordiv(ticks_per_bucket)

# Keep only the time key and the buy/sell volume and price columns of the trades.
trade_columns = ['TimeStamp_s', 'BuyVolume', 'BuyPrice', 'SellVolume', 'SellPrice']
df_tmp = df_trade[trade_columns]

In [None]:
# Show summary statistics for the selected trade columns as the cell output.
df_tmp.describe()

In [None]:
# Collapse the trades to one row per TimeStamp_s value:
# volumes are summed, prices are reduced to their maximum.
per_bucket_rules = {
    'BuyVolume': 'sum',
    'BuyPrice': 'max',
    'SellVolume': 'sum',
    'SellPrice': 'max',
}
trades_by_bucket = df_tmp.groupby('TimeStamp_s')
df_tmp = trades_by_bucket.agg(per_bucket_rules)
df_tmp

In [None]:
# Left join: every book row is kept and receives the aggregated trade values
# that share its TimeStamp_s key (missing where no trades match).
df_join = pd.merge(left=df_book, right=df_tmp, on='TimeStamp_s', how='left')

In [None]:
# Display the joined table as the cell output.
df_join

In [None]:
# Per-column count of missing values in df_join, printed once per label.
# (isnull() is an alias of isna(), so every label reports the same count.)
for report_label in ('df_join.isnull().sum()',
                     'df_join.isnan().sum()',
                     "df_join.isna().sum()"):
    print(report_label, df_join.isna().sum())