In [1]:
# https://www.kaggle.com/tpmeli/insights-correlation-analysis-of-192-features
import random
import os
import glob
from joblib import Parallel, delayed
import pandas as pd
import numpy as np
import scipy as sc
from scipy.stats import skew, kurtosis, entropy, median_absolute_deviation
from sklearn.model_selection import KFold
import lightgbm as lgb
import warnings
# Silence library warnings so they do not drown out the cell outputs
warnings.filterwarnings('ignore')
# Use the fully-qualified option name: the abbreviated 'max_columns' pattern can match
# more than one registered option in newer pandas releases, which makes set_option fail
pd.set_option('display.max_columns', 300)

import seaborn as sns

from sklearn.preprocessing import PolynomialFeatures, StandardScaler, Normalizer, MinMaxScaler
import sklearn as sk
# Root folder holding train.csv / test.csv and the book/trade parquet folders.
# Paths are built by plain string concatenation, so the trailing slash is required.
data_dir = '../../../data/'

In [2]:
# Function to calculate first WAP
def calc_wap1(df):
    """Return the level-1 weighted average price for every row of ``df``.

    Each price is weighted by the size on the opposite side of the book:
    ``(bid_price1 * ask_size1 + ask_price1 * bid_size1) / (bid_size1 + ask_size1)``.
    """
    bid_px, ask_px = df['bid_price1'], df['ask_price1']
    bid_sz, ask_sz = df['bid_size1'], df['ask_size1']
    cross_weighted = bid_px * ask_sz + ask_px * bid_sz
    return cross_weighted / (bid_sz + ask_sz)

# Function to calculate second WAP
def calc_wap2(df):
    """Return the level-2 weighted average price for every row of ``df``.

    Same formula as the level-1 WAP but on the second book level:
    ``(bid_price2 * ask_size2 + ask_price2 * bid_size2) / (bid_size2 + ask_size2)``.
    """
    bid_px, ask_px = df['bid_price2'], df['ask_price2']
    bid_sz, ask_sz = df['bid_size2'], df['ask_size2']
    cross_weighted = bid_px * ask_sz + ask_px * bid_sz
    return cross_weighted / (bid_sz + ask_sz)

# Function to calculate the log of the return
# Remember that logb(x / y) = logb(x) - logb(y)
def log_return(series):
    """Return the difference between consecutive natural logs of ``series``.

    The first element has no predecessor, so its result is NaN.
    """
    log_values = np.log(series)
    return log_values.diff()

# Calculate the realized volatility
def realized_volatility(series):
    """Return the square root of the sum of squared values in ``series``."""
    sum_of_squares = np.sum(series ** 2)
    return np.sqrt(sum_of_squares)

# Function to count unique elements of a series
def count_unique(series):
    """Return how many distinct values ``series`` contains."""
    distinct_values = np.unique(series)
    return len(distinct_values)

# Function to read our base train and test set
def read_train_test():
    """Load ``train.csv`` and ``test.csv`` from ``data_dir`` and add a ``row_id`` key.

    ``row_id`` is ``"<stock_id>-<time_id>"`` and is the key used to merge with the
    book and trade features. Prints the number of training rows.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``.
    """
    loaded = []
    for csv_name in ('train.csv', 'test.csv'):
        frame = pd.read_csv(data_dir + csv_name)
        # Create a key to merge with book and trade data
        frame['row_id'] = frame['stock_id'].astype(str) + '-' + frame['time_id'].astype(str)
        loaded.append(frame)
    train, test = loaded
    print(f'Our training set has {train.shape[0]} rows')
    return train, test

# Function to preprocess book data (for each stock id)
def book_preprocessor(file_path):
    """Build per-``time_id`` order-book features for a single stock.

    Reads the parquet data at ``file_path``, derives WAP, log-return, spread and
    volume columns, and aggregates them per ``time_id`` over the whole bucket and
    over the sub-windows ``seconds_in_bucket >= 450``, ``>= 300`` and ``>= 150``.

    Parameters
    ----------
    file_path : str
        Path of one stock's book data. The stock id is the text after the last
        ``=`` in the path (e.g. ``.../stock_id=0`` -> ``"0"``).

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id``. Feature columns are named
        ``<column>_<statistic>``, with a ``_<window>`` suffix for the sub-window
        statistics, plus a ``row_id`` column (``"<stock_id>-<time_id>"``).
        Sub-window columns are NaN for a ``time_id`` with no rows in that window.

    Raises
    ------
    IndexError
        If ``file_path`` contains no ``=``.
    """
    df = pd.read_parquet(file_path)
    # Calculate Wap
    df['wap1'] = calc_wap1(df)
    df['wap2'] = calc_wap2(df)
    # Calculate log returns within each time_id.
    # transform() always returns a result aligned with df's own index, whereas
    # assigning the output of groupby.apply() depends on the pandas version
    # (newer releases prepend the group key to the index).
    df['log_return1'] = df.groupby(['time_id'])['wap1'].transform(log_return).fillna(0)
    df['log_return2'] = df.groupby(['time_id'])['wap2'].transform(log_return).fillna(0)
    # Calculate wap balance
    df['wap_balance'] = abs(df['wap1'] - df['wap2'])
    # Calculate spread
    df['price_spread'] = (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1']) / 2)
    df['bid_spread'] = df['bid_price1'] - df['bid_price2']
    # NOTE(review): level-1 minus level-2 ask, so this is presumably non-positive
    # for an ordered book -- confirm the sign is intended
    df['ask_spread'] = df['ask_price1'] - df['ask_price2']
    df['total_volume'] = (df['ask_size1'] + df['ask_size2']) + (df['bid_size1'] + df['bid_size2'])
    df['volume_imbalance'] = abs((df['ask_size1'] + df['ask_size2']) - (df['bid_size1'] + df['bid_size2']))

    # Statistics shared by most columns; each entry below gets its own copy
    base_stats = [np.min, np.max, np.mean, np.median, np.std, np.sum, skew, median_absolute_deviation]

    # Dict for aggregations (key order and list order define the output column order)
    create_feature_dict = {
        'bid_price1': list(base_stats),
        'ask_price1': list(base_stats),
        'bid_price2': list(base_stats),
        'ask_price2': list(base_stats),
        'wap1': list(base_stats),
        'wap2': list(base_stats),
        'log_return1': [realized_volatility] + base_stats,
        'log_return2': [np.sum, realized_volatility, np.mean, np.std, skew, kurtosis, median_absolute_deviation],
        'wap_balance': list(base_stats),
        'price_spread': list(base_stats),
        'bid_spread': list(base_stats),
        'ask_spread': list(base_stats),
        'total_volume': list(base_stats),
        'volume_imbalance': list(base_stats),
    }

    # Function to get group stats for different windows (seconds in bucket)
    def get_stats_window(seconds_in_bucket, add_suffix = False):
        # Keep only the rows at or after the window start, then aggregate per time_id
        in_window = df[df['seconds_in_bucket'] >= seconds_in_bucket]
        df_feature = in_window.groupby(['time_id']).agg(create_feature_dict).reset_index()
        # Flatten the (column, statistic) pairs; the key column becomes 'time_id_'
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]
        # Add a suffix to differentiate windows
        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(seconds_in_bucket))
        return df_feature

    # Full-bucket stats, then left-merge each sub-window in the order 450, 300, 150
    df_feature = get_stats_window(seconds_in_bucket = 0, add_suffix = False)
    for window_start in (450, 300, 150):
        df_window = get_stats_window(seconds_in_bucket = window_start, add_suffix = True)
        window_key = f'time_id__{window_start}'
        df_feature = df_feature.merge(df_window, how = 'left', left_on = 'time_id_', right_on = window_key)
        # The suffixed key duplicates 'time_id_' and is not a feature
        df_feature = df_feature.drop(columns = [window_key])

    # Create row_id so we can merge.
    # Split on the last '=' so an '=' earlier in the path cannot be mistaken for the stock id.
    stock_id = file_path.rsplit('=', 1)[1]
    df_feature['row_id'] = df_feature['time_id_'].apply(lambda x: f'{stock_id}-{x}')
    df_feature = df_feature.drop(columns = ['time_id_'])
    return df_feature

# Function to preprocess trade data (for each stock id)
def trade_preprocessor(file_path):
    """Build per-``time_id`` trade features for a single stock.

    Reads the parquet data at ``file_path``, computes the log return of ``price``
    within each ``time_id``, and aggregates ``log_return``, ``seconds_in_bucket``,
    ``size`` and ``order_count`` per ``time_id`` over the whole bucket and over
    the sub-windows ``seconds_in_bucket >= 450``, ``>= 300`` and ``>= 150``.

    Parameters
    ----------
    file_path : str
        Path of one stock's trade data. The stock id is the text after the last
        ``=`` in the path (e.g. ``.../stock_id=0`` -> ``"0"``).

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id``. Feature columns are named
        ``trade_<column>_<statistic>``, with a ``_<window>`` suffix for the
        sub-window statistics, plus an unprefixed ``row_id`` column
        (``"<stock_id>-<time_id>"``). Sub-window columns are NaN for a
        ``time_id`` with no trades in that window.

    Raises
    ------
    IndexError
        If ``file_path`` contains no ``=``.
    """
    df = pd.read_parquet(file_path)
    # transform() always returns a result aligned with df's own index, whereas
    # assigning the output of groupby.apply() depends on the pandas version
    # (newer releases prepend the group key to the index).
    df['log_return'] = df.groupby(['time_id'])['price'].transform(log_return).fillna(0)

    # Statistics shared by most columns; each entry below gets its own copy
    base_stats = [np.min, np.max, np.mean, np.median, np.std, np.sum, skew, median_absolute_deviation]

    # Dict for aggregations (key order and list order define the output column order)
    create_feature_dict = {
        'log_return': [realized_volatility] + base_stats,
        'seconds_in_bucket': [count_unique],
        'size': list(base_stats),
        'order_count': list(base_stats),
    }

    # Function to get group stats for different windows (seconds in bucket)
    def get_stats_window(seconds_in_bucket, add_suffix = False):
        # Keep only the rows at or after the window start, then aggregate per time_id
        in_window = df[df['seconds_in_bucket'] >= seconds_in_bucket]
        df_feature = in_window.groupby(['time_id']).agg(create_feature_dict).reset_index()
        # Flatten the (column, statistic) pairs; the key column becomes 'time_id_'
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]
        # Add a suffix to differentiate windows
        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(seconds_in_bucket))
        return df_feature

    # Full-bucket stats, then left-merge each sub-window in the order 450, 300, 150
    df_feature = get_stats_window(seconds_in_bucket = 0, add_suffix = False)
    for window_start in (450, 300, 150):
        df_window = get_stats_window(seconds_in_bucket = window_start, add_suffix = True)
        window_key = f'time_id__{window_start}'
        df_feature = df_feature.merge(df_window, how = 'left', left_on = 'time_id_', right_on = window_key)
        # The suffixed key duplicates 'time_id_' and is not a feature
        df_feature = df_feature.drop(columns = [window_key])

    # Prefix everything so trade features cannot collide with book features
    df_feature = df_feature.add_prefix('trade_')
    # Split on the last '=' so an '=' earlier in the path cannot be mistaken for the stock id.
    stock_id = file_path.rsplit('=', 1)[1]
    df_feature['row_id'] = df_feature['trade_time_id_'].apply(lambda x: f'{stock_id}-{x}')
    df_feature = df_feature.drop(columns = ['trade_time_id_'])
    return df_feature

# Function to get group stats for the stock_id and time_id
# def get_time_stock(df):
#     # Get realized volatility columns
# #     vol_cols = ['log_return1_realized_volatility', 'log_return2_realized_volatility', 'log_return1_realized_volatility_450', 'log_return2_realized_volatility_450', 
# #                 'log_return1_realized_volatility_300', 'log_return2_realized_volatility_300', 'log_return1_realized_volatility_150', 'log_return2_realized_volatility_150', 
# #                 'trade_log_return_realized_volatility', 'trade_log_return_realized_volatility_450', 'trade_log_return_realized_volatility_300', 'trade_log_return_realized_volatility_150']
#     vol_cols = ['log_return1_realized_volatility', 'log_return2_realized_volatility', 'trade_log_return_realized_volatility']

#     # Group by the stock id
# #     df_stock_id = df.groupby(['stock_id'])[vol_cols].agg(['mean', 'std', 'max', 'min', ]).reset_index()
# #     # Rename columns joining suffix
# #     df_stock_id.columns = ['_'.join(col) for col in df_stock_id.columns]
# #     df_stock_id = df_stock_id.add_suffix('_' + 'stock')

#     # Group by the time id
#     df_time_id = df.groupby(['time_id'])[vol_cols].agg(['mean', 'std', 'max', 'min', ]).reset_index()
#     # Rename columns joining suffix
#     df_time_id.columns = ['_'.join(col) for col in df_time_id.columns]
#     df_time_id = df_time_id.add_suffix('_' + 'time')
    
#     # Merge with original dataframe
#     #df = df.merge(df_stock_id, how = 'left', left_on = ['stock_id'], right_on = ['stock_id__stock'])
#     df = df.merge(df_time_id, how = 'left', left_on = ['time_id'], right_on = ['time_id__time'])
#     df.drop(['time_id__time'], axis = 1, inplace = True)
#     return df
    
# Function to run the preprocessing functions in parallel (for each stock id)
def preprocessor(list_stock_ids, is_train = True):
    """Compute book and trade features for every stock id and stack the results.

    Parameters
    ----------
    list_stock_ids : iterable
        Stock ids to process; each is converted with ``str`` to build the paths.
    is_train : bool, default True
        Read from the ``*_train.parquet`` folders when true, otherwise from the
        ``*_test.parquet`` folders (both under ``data_dir``).

    Returns
    -------
    pandas.DataFrame
        Book features left-joined with trade features on ``row_id`` for each
        stock, concatenated over all stocks with a fresh integer index.
    """
    # The folders depend only on the train/test flag, so resolve them once
    if is_train:
        book_prefix = data_dir + "book_train.parquet/stock_id="
        trade_prefix = data_dir + "trade_train.parquet/stock_id="
    else:
        book_prefix = data_dir + "book_test.parquet/stock_id="
        trade_prefix = data_dir + "trade_test.parquet/stock_id="

    # Unit of work executed by each joblib worker
    def build_stock_features(stock_id):
        book_features = book_preprocessor(book_prefix + str(stock_id))
        trade_features = trade_preprocessor(trade_prefix + str(stock_id))
        # Keep every book row; trade columns stay NaN where no trade row matches
        return pd.merge(book_features, trade_features, on = 'row_id', how = 'left')

    # Schedule one task per stock id on all available cores
    tasks = (delayed(build_stock_features)(stock_id) for stock_id in list_stock_ids)
    per_stock_frames = Parallel(n_jobs = -1, verbose = 1)(tasks)
    # Concatenate all the dataframes returned by Parallel
    return pd.concat(per_stock_frames, ignore_index = True)


# Read train and test (both frames get a 'row_id' merge key)
train, test = read_train_test()

# Get unique stock ids 
train_stock_ids = train['stock_id'].unique()
# Preprocess them using Parallel and our single stock id functions
train_ = preprocessor(train_stock_ids, is_train = True)
# Attach the features to the targets. NOTE: dropna() removes every row that has at
# least one missing value, so rows without a complete feature set are silently discarded.
train = train.merge(train_, on = ['row_id'], how = 'left').dropna()


# Get group stats of time_id and stock_id
# (disabled: get_time_stock is commented out above)
# train = get_time_stock(train)

# Preview the first rows of the merged training frame
train.head()

Our training set has 428932 rows


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done 112 out of 112 | elapsed: 29.7min finished


Unnamed: 0,stock_id,time_id,target,row_id,bid_price1_amin,bid_price1_amax,bid_price1_mean,bid_price1_median,bid_price1_std,bid_price1_sum,bid_price1_skew,bid_price1_median_absolute_deviation,ask_price1_amin,ask_price1_amax,ask_price1_mean,ask_price1_median,ask_price1_std,ask_price1_sum,ask_price1_skew,ask_price1_median_absolute_deviation,bid_price2_amin,bid_price2_amax,bid_price2_mean,bid_price2_median,bid_price2_std,bid_price2_sum,bid_price2_skew,bid_price2_median_absolute_deviation,ask_price2_amin,ask_price2_amax,ask_price2_mean,ask_price2_median,ask_price2_std,ask_price2_sum,ask_price2_skew,ask_price2_median_absolute_deviation,wap1_amin,wap1_amax,wap1_mean,wap1_median,wap1_std,wap1_sum,wap1_skew,wap1_median_absolute_deviation,wap2_amin,wap2_amax,wap2_mean,wap2_median,wap2_std,wap2_sum,wap2_skew,wap2_median_absolute_deviation,log_return1_realized_volatility,log_return1_amin,log_return1_amax,log_return1_mean,log_return1_median,log_return1_std,log_return1_sum,log_return1_skew,log_return1_median_absolute_deviation,log_return2_sum,log_return2_realized_volatility,log_return2_mean,log_return2_std,log_return2_skew,log_return2_kurtosis,log_return2_median_absolute_deviation,wap_balance_amin,wap_balance_amax,wap_balance_mean,wap_balance_median,wap_balance_std,wap_balance_sum,wap_balance_skew,wap_balance_median_absolute_deviation,price_spread_amin,price_spread_amax,price_spread_mean,price_spread_median,price_spread_std,price_spread_sum,price_spread_skew,price_spread_median_absolute_deviation,bid_spread_amin,bid_spread_amax,bid_spread_mean,bid_spread_median,bid_spread_std,bid_spread_sum,bid_spread_skew,bid_spread_median_absolute_deviation,ask_spread_amin,ask_spread_amax,ask_spread_mean,ask_spread_median,ask_spread_std,ask_spread_sum,ask_spread_skew,ask_spread_median_absolute_deviation,total_volume_amin,total_volume_amax,total_volume_mean,total_volume_median,total_volume_std,total_volume_sum,total_volume_skew,total_volume_median_absolute_deviation,volume_imbalance_amin,volume_
imbalance_amax,volume_imbalance_mean,volume_imbalance_median,volume_imbalance_std,volume_imbalance_sum,volume_imbalance_skew,volume_imbalance_median_absolute_deviation,bid_price1_amin_450,bid_price1_amax_450,bid_price1_mean_450,bid_price1_median_450,bid_price1_std_450,bid_price1_sum_450,bid_price1_skew_450,bid_price1_median_absolute_deviation_450,ask_price1_amin_450,ask_price1_amax_450,ask_price1_mean_450,ask_price1_median_450,ask_price1_std_450,ask_price1_sum_450,ask_price1_skew_450,ask_price1_median_absolute_deviation_450,bid_price2_amin_450,bid_price2_amax_450,bid_price2_mean_450,bid_price2_median_450,bid_price2_std_450,bid_price2_sum_450,bid_price2_skew_450,bid_price2_median_absolute_deviation_450,ask_price2_amin_450,ask_price2_amax_450,ask_price2_mean_450,ask_price2_median_450,ask_price2_std_450,ask_price2_sum_450,ask_price2_skew_450,ask_price2_median_absolute_deviation_450,wap1_amin_450,wap1_amax_450,...,wap_balance_mean_150,wap_balance_median_150,wap_balance_std_150,wap_balance_sum_150,wap_balance_skew_150,wap_balance_median_absolute_deviation_150,price_spread_amin_150,price_spread_amax_150,price_spread_mean_150,price_spread_median_150,price_spread_std_150,price_spread_sum_150,price_spread_skew_150,price_spread_median_absolute_deviation_150,bid_spread_amin_150,bid_spread_amax_150,bid_spread_mean_150,bid_spread_median_150,bid_spread_std_150,bid_spread_sum_150,bid_spread_skew_150,bid_spread_median_absolute_deviation_150,ask_spread_amin_150,ask_spread_amax_150,ask_spread_mean_150,ask_spread_median_150,ask_spread_std_150,ask_spread_sum_150,ask_spread_skew_150,ask_spread_median_absolute_deviation_150,total_volume_amin_150,total_volume_amax_150,total_volume_mean_150,total_volume_median_150,total_volume_std_150,total_volume_sum_150,total_volume_skew_150,total_volume_median_absolute_deviation_150,volume_imbalance_amin_150,volume_imbalance_amax_150,volume_imbalance_mean_150,volume_imbalance_median_150,volume_imbalance_std_150,volume_imbalance_sum_150,volume_imbalance_
skew_150,volume_imbalance_median_absolute_deviation_150,trade_log_return_realized_volatility,trade_log_return_amin,trade_log_return_amax,trade_log_return_mean,trade_log_return_median,trade_log_return_std,trade_log_return_sum,trade_log_return_skew,trade_log_return_median_absolute_deviation,trade_seconds_in_bucket_count_unique,trade_size_amin,trade_size_amax,trade_size_mean,trade_size_median,trade_size_std,trade_size_sum,trade_size_skew,trade_size_median_absolute_deviation,trade_order_count_amin,trade_order_count_amax,trade_order_count_mean,trade_order_count_median,trade_order_count_std,trade_order_count_sum,trade_order_count_skew,trade_order_count_median_absolute_deviation,trade_log_return_realized_volatility_450,trade_log_return_amin_450,trade_log_return_amax_450,trade_log_return_mean_450,trade_log_return_median_450,trade_log_return_std_450,trade_log_return_sum_450,trade_log_return_skew_450,trade_log_return_median_absolute_deviation_450,trade_seconds_in_bucket_count_unique_450,trade_size_amin_450,trade_size_amax_450,trade_size_mean_450,trade_size_median_450,trade_size_std_450,trade_size_sum_450,trade_size_skew_450,trade_size_median_absolute_deviation_450,trade_order_count_amin_450,trade_order_count_amax_450,trade_order_count_mean_450,trade_order_count_median_450,trade_order_count_std_450,trade_order_count_sum_450,trade_order_count_skew_450,trade_order_count_median_absolute_deviation_450,trade_log_return_realized_volatility_300,trade_log_return_amin_300,trade_log_return_amax_300,trade_log_return_mean_300,trade_log_return_median_300,trade_log_return_std_300,trade_log_return_sum_300,trade_log_return_skew_300,trade_log_return_median_absolute_deviation_300,trade_seconds_in_bucket_count_unique_300,trade_size_amin_300,trade_size_amax_300,trade_size_mean_300,trade_size_median_300,trade_size_std_300,trade_size_sum_300,trade_size_skew_300,trade_size_median_absolute_deviation_300,trade_order_count_amin_300,trade_order_count_amax_300,trade_order_count_mean_300,trade_order_count
_median_300,trade_order_count_std_300,trade_order_count_sum_300,trade_order_count_skew_300,trade_order_count_median_absolute_deviation_300,trade_log_return_realized_volatility_150,trade_log_return_amin_150,trade_log_return_amax_150,trade_log_return_mean_150,trade_log_return_median_150,trade_log_return_std_150,trade_log_return_sum_150,trade_log_return_skew_150,trade_log_return_median_absolute_deviation_150,trade_seconds_in_bucket_count_unique_150,trade_size_amin_150,trade_size_amax_150,trade_size_mean_150,trade_size_median_150,trade_size_std_150,trade_size_sum_150,trade_size_skew_150,trade_size_median_absolute_deviation_150,trade_order_count_amin_150,trade_order_count_amax_150,trade_order_count_mean_150,trade_order_count_median_150,trade_order_count_std_150,trade_order_count_sum_150,trade_order_count_skew_150,trade_order_count_median_absolute_deviation_150
0,0,5,0.004136,0-5,1.001422,1.004267,1.003314,1.003439,0.000596,303.000854,-1.75244,0.00023,1.002301,1.004939,1.004169,1.004267,0.000601,303.259064,-1.549424,0.000383,1.00137,1.004215,1.003139,1.003336,0.000573,302.947845,-1.754613,0.000192,1.002353,1.005146,1.00432,1.004525,0.000604,303.304626,-1.664484,0.000383,1.001434,1.00492,1.003725,1.003923,0.000693,303.125061,-1.793836,0.000351,1.00139,1.005124,1.003661,1.003821,0.000781,303.10553,-1.28259,0.000548,0.004499,-0.000896,0.001049,7.588486e-06,0.0,0.000259,0.002292,0.332077,5.1e-05,0.002325,0.006999,8e-06,0.000403,0.301112,4.468462,7.8e-05,1.192093e-07,0.001414,0.000388,0.000315,0.000295,0.117051,0.791968,0.000314,0.000361,0.001393,0.000852,0.000876,0.000211,0.257255,0.140243,0.000152,5.2e-05,0.000672,0.000176,0.000103,0.000162,0.053006,1.237735,7.652829e-05,-0.000569,-5.2e-05,-0.000151,-0.000103,0.000126,-0.045557,-1.477414,7.670503e-05,34,762,323.496689,314.5,138.101214,97696,0.489687,131.2101,0,518,134.89404,107.0,107.260583,40738,0.9685,107.4885,1.002043,1.003646,1.003107,1.003129,0.000445,68.211258,-1.024004,0.000537,1.002715,1.004422,1.003892,1.004008,0.000438,68.264679,-0.544144,0.000537,1.001991,1.003594,1.002845,1.003025,0.000405,68.193466,-0.763396,0.00023,1.003129,1.00468,1.004058,1.00406,0.000483,68.275955,-0.299255,0.00046,1.002053,1.004004,...,0.000397,0.000345,0.000281,0.091996,0.595002,0.000311,0.000361,0.001391,0.000858,0.000876,0.000221,0.199058,0.102357,0.000153,5.2e-05,0.000672,0.000188,0.000103,0.000165,0.043697,1.144688,7.670503e-05,-0.000517,-5.2e-05,-0.000147,-0.000103,0.00012,-0.034024,-1.131454,7.670503e-05,34,727,327.431034,325.5,142.761068,75964,0.365209,139.3644,0,418,123.586207,99.0,103.533216,28672,1.030882,97.8516,0.002006,-0.000721,0.00061,3.4e-05,7.7e-05,0.000319,0.001341,-0.29063,0.000344,40.0,1.0,499.0,79.475,19.5,118.375107,3179.0,2.036925,27.4281,1.0,12.0,2.75,2.0,2.467741,110.0,1.8986,1.4826,0.00106,-0.000464,0.000475,-1.8e-05,5.1e-05,0.000293,-0.000258,-0.065085,0.000326,14
.0,1.0,499.0,74.428571,10.5,137.880502,1042.0,2.315182,14.0847,1.0,7.0,2.642857,2.0,2.023217,37.0,1.028004,1.4826,0.001308,-0.000464,0.000475,-3.4e-05,-4e-06,0.00029,-0.000721,-0.012553,0.000387,21.0,1.0,499.0,75.571429,6.0,141.675888,1587.0,2.135433,7.413,1.0,9.0,2.571429,2.0,2.292846,54.0,1.56349,1.4826,0.001701,-0.000721,0.00061,-1.1e-05,2.6e-05,0.000316,-0.000316,-0.219891,0.000365,30.0,1.0,499.0,68.966667,7.0,122.25256,2069.0,2.421965,8.8956,1.0,9.0,2.433333,2.0,2.062528,73.0,1.679583,1.4826
1,0,11,0.001445,0-11,0.999473,1.000627,1.000011,1.000025,0.000284,200.002258,0.293284,0.000223,0.999975,1.000878,1.000406,1.000376,0.000219,200.081116,0.508002,0.000223,0.999423,1.000477,0.99987,0.999975,0.000245,199.973907,-0.251671,0.000223,1.000176,1.001179,1.000541,1.000527,0.000217,200.108109,0.675041,0.000223,0.9997,1.000834,1.000239,1.000232,0.000262,200.047775,0.452154,0.000176,0.999575,1.001067,1.000206,1.000192,0.000272,200.041168,0.286531,0.000248,0.001204,-0.000476,0.000396,1.801324e-06,0.0,8.5e-05,0.00036,-1.098433,4e-06,0.000801,0.002476,4e-06,0.000175,0.292108,3.893411,2.1e-05,4.768372e-07,0.000639,0.000212,0.000212,0.000155,0.04231,0.590079,0.000196,0.000151,0.000903,0.000394,0.000351,0.000157,0.078836,1.105518,0.000149,5e-05,0.000652,0.000142,5e-05,0.000148,0.028358,2.322646,8.836984e-08,-0.000351,-5e-05,-0.000135,-0.000151,6.5e-05,-0.027001,-0.646372,7.440741e-05,46,876,411.45,408.0,172.263581,82290,0.150233,187.5489,1,481,142.05,114.0,102.139758,28410,0.746431,100.8168,0.999975,1.000627,1.000293,1.000226,0.000253,54.015812,0.337272,0.000298,1.000326,1.000878,1.000641,1.000527,0.000207,54.034634,-0.196819,0.000298,0.999573,1.000477,1.000059,1.000025,0.000171,54.003212,0.194116,7.4e-05,1.000477,1.001179,1.000785,1.000728,0.0002,54.042362,0.107939,0.000223,1.000158,1.000834,...,0.000205,0.000199,0.000158,0.035451,0.697107,0.000189,0.000151,0.000753,0.000353,0.000351,0.000112,0.061017,0.899066,0.000149,5e-05,0.000652,0.000141,5e-05,0.000154,0.024394,2.387894,8.836984e-08,-0.000301,-5e-05,-0.000127,-0.000151,5.8e-05,-0.022032,-0.3582,7.440741e-05,46,876,419.277457,433.0,178.652395,72535,0.080263,182.3598,1,481,151.566474,120.0,104.576846,26221,0.623652,102.2994,0.000901,-0.000251,0.000402,2.7e-05,0.0,0.000165,0.000803,0.351172,0.000149,30.0,1.0,280.0,42.966667,3.0,77.815203,1289.0,1.888894,2.9652,1.0,6.0,1.9,1.0,1.446756,57.0,1.70531,0.0,0.000501,-0.000204,0.000269,3.5e-05,4.1e-05,0.000163,0.000351,-0.068993,0.000149,10.0,1.0,280.0,82.8,19.0,107.246134
,828.0,0.792523,26.6868,1.0,6.0,2.2,1.0,2.097618,22.0,1.245638,0.0,0.000587,-0.000204,0.000269,1.6e-05,1.6e-05,0.000151,0.000251,0.184259,0.000155,16.0,1.0,280.0,56.25,12.5,90.504144,900.0,1.503318,17.0499,1.0,6.0,2.25,1.0,1.807392,36.0,1.154519,0.0,0.000813,-0.000251,0.000402,2.9e-05,1.6e-05,0.000167,0.000702,0.316074,0.000155,24.0,1.0,280.0,48.875,5.5,83.807913,1173.0,1.703029,6.6717,1.0,6.0,2.041667,1.0,1.573674,49.0,1.434568,0.0
2,0,16,0.002168,0-16,0.997008,1.00012,0.999204,0.999497,0.000711,187.850311,-1.171563,0.000497,0.997678,1.000886,0.999929,1.000263,0.000788,187.986603,-1.16187,0.000497,0.996721,0.999928,0.999007,0.99921,0.000728,187.813354,-1.467128,0.000426,0.997966,1.000981,1.000127,1.000407,0.000784,188.023834,-1.157013,0.000639,0.997224,1.000878,0.999542,0.999818,0.000864,187.913849,-0.905676,0.000772,0.996897,1.000876,0.99968,0.999751,0.000862,187.939819,-1.01296,0.000722,0.002369,-0.000783,0.000799,-1.103269e-05,0.0,0.000173,-0.002074,-0.192328,1.9e-05,-0.001493,0.004801,-8e-06,0.000351,-0.449537,3.646116,0.000157,4.887581e-06,0.001135,0.000331,0.000252,0.000246,0.06223,1.237335,0.000186,0.000384,0.001149,0.000725,0.000718,0.000164,0.13633,0.202463,0.000178,4.8e-05,0.00067,0.000197,9.6e-05,0.00017,0.036955,0.909445,7.104936e-05,-0.000718,-4.8e-05,-0.000198,-9.6e-05,0.000171,-0.037243,-1.408969,7.096099e-05,108,758,416.351064,411.0,138.433034,78274,0.084062,143.8122,1,579,141.414894,111.0,108.891243,26586,0.831645,114.9015,0.997008,0.999067,0.998115,0.998157,0.00054,43.917046,0.121114,0.000497,0.997678,0.999737,0.998718,0.998779,0.000577,43.943611,0.205975,0.000639,0.996721,0.999019,0.997929,0.998013,0.000688,43.908859,-0.193573,0.000781,0.997966,1.000407,0.998926,0.998875,0.000602,43.952755,0.672419,0.000532,0.997224,0.999084,...,0.000373,0.000328,0.000276,0.044348,0.946803,0.000272,0.000384,0.001149,0.000679,0.000671,0.000163,0.080811,0.564635,0.000143,4.8e-05,0.00067,0.000161,9.6e-05,0.000155,0.0191,1.754655,7.096099e-05,-0.000718,-4.8e-05,-0.000241,-0.000191,0.000195,-0.028626,-0.95781,0.0001420103,108,758,428.537815,417.0,135.376048,50996,0.035931,127.5036,1,579,132.084034,100.0,114.924631,15718,1.158035,112.6776,0.001961,-0.000642,0.000862,-0.000102,-0.000151,0.000387,-0.00254,1.048432,0.000328,25.0,1.0,391.0,86.44,14.0,113.587,2161.0,1.220954,19.2738,1.0,8.0,2.72,2.0,2.300725,68.0,1.27196,1.4826,0.001048,-0.000642,0.000288,-0.000202,-0.000151,0.000302,-0.001822,0.126854
,0.000366,9.0,5.0,391.0,120.555556,13.0,143.461764,1085.0,0.711283,11.8608,1.0,8.0,3.666667,3.0,2.915476,33.0,0.606339,2.9652,0.001137,-0.000642,0.000288,-0.000196,-0.000123,0.000275,-0.002348,0.032183,0.000347,12.0,1.0,391.0,99.083333,12.0,130.679663,1189.0,1.049955,15.5673,1.0,8.0,3.166667,2.5,2.691175,38.0,0.955384,2.2239,0.001621,-0.000642,0.000862,-0.000139,-0.000171,0.000344,-0.002779,1.257622,0.000277,20.0,1.0,391.0,100.5,29.5,120.976509,2010.0,0.985998,41.5128,1.0,8.0,2.95,2.0,2.459675,59.0,1.082243,1.4826
3,0,31,0.002195,0-31,0.997363,0.999815,0.998445,0.99808,0.000689,119.813408,0.930791,0.000103,0.998519,1.000509,0.999304,0.999213,0.000568,119.916534,0.799826,0.000412,0.997178,0.999769,0.998255,0.997964,0.000731,119.790642,0.971031,0.00048,0.998566,1.000601,0.999413,0.999283,0.000568,119.929535,0.76766,0.000377,0.997447,1.000412,0.998832,0.998657,0.000757,119.859779,0.746726,0.000598,0.99743,1.000116,0.998633,0.998573,0.000656,119.835945,0.469768,0.000596,0.002574,-0.001296,0.000757,-2.356919e-05,0.0,0.000235,-0.002828,-1.950045,1.2e-05,-0.002053,0.003637,-1.7e-05,0.000333,-1.350842,5.661873,0.000129,1.507998e-05,0.001082,0.00038,0.00036,0.000248,0.04561,0.368336,0.000322,0.000324,0.001622,0.00086,0.000788,0.00028,0.103252,0.335148,0.000343,4.6e-05,0.000694,0.00019,9.3e-05,0.000199,0.022764,1.247198,6.866337e-05,-0.000463,-4.6e-05,-0.000108,-4.6e-05,9.1e-05,-0.013001,-1.549214,8.836984e-08,140,912,435.266667,426.0,156.120334,52232,0.66811,149.7426,2,576,146.216667,115.0,121.533215,17546,1.123457,105.2646,0.997363,0.998011,0.997831,0.998011,0.000299,17.960951,-0.992209,0.0,0.998519,0.999075,0.998887,0.998982,0.000237,17.979967,-0.918251,0.000137,0.997178,0.997964,0.997715,0.997918,0.000343,17.95887,-0.988111,0.0,0.998566,0.999121,0.998936,0.999028,0.000239,17.980846,-0.913937,0.000137,0.997447,0.9986,...,0.000362,0.00035,0.000247,0.029322,0.512297,0.000302,0.000324,0.001622,0.00092,0.001019,0.000296,0.074552,-0.058949,0.000343,4.6e-05,0.000648,0.00017,9.3e-05,0.000191,0.013789,1.55486,6.8575e-05,-0.000324,-4.6e-05,-0.000108,-4.6e-05,8.5e-05,-0.008745,-1.156439,8.836984e-08,140,912,424.234568,407.0,156.628404,34363,0.776949,131.9514,2,576,151.765432,115.0,124.293028,12293,1.218865,102.2994,0.001561,-0.00088,0.000546,-0.000151,-3.8e-05,0.000387,-0.002261,-0.224084,0.000382,15.0,5.0,450.0,130.8,70.0,144.828569,1962.0,1.309124,81.543,1.0,15.0,3.933333,2.0,4.043808,59.0,1.559521,1.4826,0.000802,-0.000732,-3.8e-05,-0.000365,-0.000324,0.000349,-0.001095,-0.210467,0.000424,
3.0,5.0,450.0,171.333333,59.0,242.83808,514.0,0.667974,80.0604,1.0,8.0,3.666667,2.0,3.785939,11.0,0.652012,1.4826,0.001089,-0.000732,0.000546,-5.1e-05,-3.8e-05,0.000381,-0.000455,-0.203817,0.000382,9.0,5.0,450.0,172.888889,116.0,167.747909,1556.0,0.834892,100.8168,1.0,15.0,5.111111,4.0,4.702245,46.0,1.081381,4.4478,0.001401,-0.00088,0.000546,-0.000117,-3.8e-05,0.000426,-0.001289,-0.393842,0.000382,11.0,5.0,450.0,148.272727,70.0,160.381477,1631.0,1.0948,81.543,1.0,15.0,4.545455,3.0,4.412791,50.0,1.344523,2.9652
4,0,62,0.001747,0-62,0.999044,0.99979,0.999407,0.999324,0.000218,175.895645,0.357271,0.000138,0.999464,1.00021,0.999804,0.999837,0.000191,175.965515,0.12478,0.000207,0.998858,0.99965,0.999216,0.999091,0.000229,175.862076,0.515789,0.000207,0.999557,1.000303,0.999913,0.999883,0.000196,175.984711,-0.21631,0.000276,0.999231,1.000159,0.999619,0.999586,0.000258,175.932861,0.21661,0.000355,0.999102,1.000249,0.999626,0.999598,0.000317,175.93425,-0.045549,0.000426,0.001894,-0.00075,0.000912,-1.015935e-08,0.0,0.000143,-2e-06,0.451765,1e-05,-0.000281,0.003257,-2e-06,0.000246,0.117547,2.519043,8.9e-05,3.278255e-06,0.000724,0.000254,0.000222,0.000188,0.044783,0.530648,0.000211,9.3e-05,0.000793,0.000397,0.000373,0.00013,0.069901,0.603802,0.000104,4.7e-05,0.000466,0.000191,0.000187,8.3e-05,0.033565,0.904491,6.906103e-05,-0.00042,-4.7e-05,-0.000109,-7e-05,7.6e-05,-0.019206,-0.962715,3.45968e-05,16,738,343.221591,321.0,158.054066,60407,0.532468,143.8122,0,424,123.846591,98.5,102.407501,21797,0.653022,128.2449,0.999231,0.999744,0.999293,0.999231,0.000151,35.974552,2.234963,0.0,0.999557,1.000163,0.999812,0.999837,0.000155,35.99324,0.500277,0.000138,0.999044,0.999511,0.999098,0.999044,0.000132,35.96751,2.407073,0.0,0.99965,1.00021,0.999948,0.999883,0.000137,35.998138,0.051566,0.000173,0.999234,1.000159,...,0.000242,0.000198,0.000193,0.032718,0.682531,0.000206,9.3e-05,0.000793,0.000395,0.000373,0.000137,0.053347,0.592488,0.000138,4.7e-05,0.000466,0.000187,0.000187,8.8e-05,0.02522,1.17496,6.901685e-05,-0.00042,-4.7e-05,-0.000117,-9.3e-05,8e-05,-0.015757,-0.768504,6.910522e-05,16,738,371.266667,349.0,162.610706,50121,0.373945,173.4642,0,424,131.474074,101.0,109.275622,17749,0.457668,136.3992,0.000871,-0.00056,0.000332,-1.2e-05,-9e-06,0.00019,-0.000274,-0.703133,0.000135,22.0,1.0,341.0,81.409091,5.5,117.914682,1791.0,1.190959,6.6717,1.0,17.0,4.045455,2.0,4.099678,89.0,1.694005,1.4826,0.00036,-9.3e-05,0.000332,2.5e-05,-7e-05,0.000206,9.9e-05,1.116419,3.5e-05,4.0,1.0,34.0,10.75,4.0,15.585784
,43.0,1.11683,2.9652,1.0,10.0,3.5,1.5,4.358899,14.0,1.115396,0.7413,0.000453,-0.000176,0.000332,1e-05,-3.9e-05,0.000143,0.000107,0.948151,8.1e-05,11.0,1.0,341.0,110.818182,34.0,140.061285,1219.0,0.803901,48.9258,1.0,17.0,4.909091,2.0,4.887833,54.0,1.512628,1.4826,0.00055,-0.000195,0.000332,1.5e-05,-1.9e-05,0.000141,0.000239,0.51876,0.000135,16.0,1.0,341.0,98.125,20.0,127.212093,1570.0,0.950559,28.1694,1.0,17.0,4.5,2.0,4.412105,72.0,1.577651,1.4826


In [3]:
# Total number of missing cells left in the training frame
print(train.isna().sum().sum())
# Columns that still contain missing values.
# NOTE: this is not the last expression of the cell, so its value is never displayed.
train.columns[train.isna().sum() > 0]
# Preview the first rows
train.head()

0


Unnamed: 0,stock_id,time_id,target,row_id,bid_price1_amin,bid_price1_amax,bid_price1_mean,bid_price1_median,bid_price1_std,bid_price1_sum,bid_price1_skew,bid_price1_median_absolute_deviation,ask_price1_amin,ask_price1_amax,ask_price1_mean,ask_price1_median,ask_price1_std,ask_price1_sum,ask_price1_skew,ask_price1_median_absolute_deviation,bid_price2_amin,bid_price2_amax,bid_price2_mean,bid_price2_median,bid_price2_std,bid_price2_sum,bid_price2_skew,bid_price2_median_absolute_deviation,ask_price2_amin,ask_price2_amax,ask_price2_mean,ask_price2_median,ask_price2_std,ask_price2_sum,ask_price2_skew,ask_price2_median_absolute_deviation,wap1_amin,wap1_amax,wap1_mean,wap1_median,wap1_std,wap1_sum,wap1_skew,wap1_median_absolute_deviation,wap2_amin,wap2_amax,wap2_mean,wap2_median,wap2_std,wap2_sum,wap2_skew,wap2_median_absolute_deviation,log_return1_realized_volatility,log_return1_amin,log_return1_amax,log_return1_mean,log_return1_median,log_return1_std,log_return1_sum,log_return1_skew,log_return1_median_absolute_deviation,log_return2_sum,log_return2_realized_volatility,log_return2_mean,log_return2_std,log_return2_skew,log_return2_kurtosis,log_return2_median_absolute_deviation,wap_balance_amin,wap_balance_amax,wap_balance_mean,wap_balance_median,wap_balance_std,wap_balance_sum,wap_balance_skew,wap_balance_median_absolute_deviation,price_spread_amin,price_spread_amax,price_spread_mean,price_spread_median,price_spread_std,price_spread_sum,price_spread_skew,price_spread_median_absolute_deviation,bid_spread_amin,bid_spread_amax,bid_spread_mean,bid_spread_median,bid_spread_std,bid_spread_sum,bid_spread_skew,bid_spread_median_absolute_deviation,ask_spread_amin,ask_spread_amax,ask_spread_mean,ask_spread_median,ask_spread_std,ask_spread_sum,ask_spread_skew,ask_spread_median_absolute_deviation,total_volume_amin,total_volume_amax,total_volume_mean,total_volume_median,total_volume_std,total_volume_sum,total_volume_skew,total_volume_median_absolute_deviation,volume_imbalance_amin,volume_
imbalance_amax,volume_imbalance_mean,volume_imbalance_median,volume_imbalance_std,volume_imbalance_sum,volume_imbalance_skew,volume_imbalance_median_absolute_deviation,bid_price1_amin_450,bid_price1_amax_450,bid_price1_mean_450,bid_price1_median_450,bid_price1_std_450,bid_price1_sum_450,bid_price1_skew_450,bid_price1_median_absolute_deviation_450,ask_price1_amin_450,ask_price1_amax_450,ask_price1_mean_450,ask_price1_median_450,ask_price1_std_450,ask_price1_sum_450,ask_price1_skew_450,ask_price1_median_absolute_deviation_450,bid_price2_amin_450,bid_price2_amax_450,bid_price2_mean_450,bid_price2_median_450,bid_price2_std_450,bid_price2_sum_450,bid_price2_skew_450,bid_price2_median_absolute_deviation_450,ask_price2_amin_450,ask_price2_amax_450,ask_price2_mean_450,ask_price2_median_450,ask_price2_std_450,ask_price2_sum_450,ask_price2_skew_450,ask_price2_median_absolute_deviation_450,wap1_amin_450,wap1_amax_450,...,wap_balance_mean_150,wap_balance_median_150,wap_balance_std_150,wap_balance_sum_150,wap_balance_skew_150,wap_balance_median_absolute_deviation_150,price_spread_amin_150,price_spread_amax_150,price_spread_mean_150,price_spread_median_150,price_spread_std_150,price_spread_sum_150,price_spread_skew_150,price_spread_median_absolute_deviation_150,bid_spread_amin_150,bid_spread_amax_150,bid_spread_mean_150,bid_spread_median_150,bid_spread_std_150,bid_spread_sum_150,bid_spread_skew_150,bid_spread_median_absolute_deviation_150,ask_spread_amin_150,ask_spread_amax_150,ask_spread_mean_150,ask_spread_median_150,ask_spread_std_150,ask_spread_sum_150,ask_spread_skew_150,ask_spread_median_absolute_deviation_150,total_volume_amin_150,total_volume_amax_150,total_volume_mean_150,total_volume_median_150,total_volume_std_150,total_volume_sum_150,total_volume_skew_150,total_volume_median_absolute_deviation_150,volume_imbalance_amin_150,volume_imbalance_amax_150,volume_imbalance_mean_150,volume_imbalance_median_150,volume_imbalance_std_150,volume_imbalance_sum_150,volume_imbalance_
skew_150,volume_imbalance_median_absolute_deviation_150,trade_log_return_realized_volatility,trade_log_return_amin,trade_log_return_amax,trade_log_return_mean,trade_log_return_median,trade_log_return_std,trade_log_return_sum,trade_log_return_skew,trade_log_return_median_absolute_deviation,trade_seconds_in_bucket_count_unique,trade_size_amin,trade_size_amax,trade_size_mean,trade_size_median,trade_size_std,trade_size_sum,trade_size_skew,trade_size_median_absolute_deviation,trade_order_count_amin,trade_order_count_amax,trade_order_count_mean,trade_order_count_median,trade_order_count_std,trade_order_count_sum,trade_order_count_skew,trade_order_count_median_absolute_deviation,trade_log_return_realized_volatility_450,trade_log_return_amin_450,trade_log_return_amax_450,trade_log_return_mean_450,trade_log_return_median_450,trade_log_return_std_450,trade_log_return_sum_450,trade_log_return_skew_450,trade_log_return_median_absolute_deviation_450,trade_seconds_in_bucket_count_unique_450,trade_size_amin_450,trade_size_amax_450,trade_size_mean_450,trade_size_median_450,trade_size_std_450,trade_size_sum_450,trade_size_skew_450,trade_size_median_absolute_deviation_450,trade_order_count_amin_450,trade_order_count_amax_450,trade_order_count_mean_450,trade_order_count_median_450,trade_order_count_std_450,trade_order_count_sum_450,trade_order_count_skew_450,trade_order_count_median_absolute_deviation_450,trade_log_return_realized_volatility_300,trade_log_return_amin_300,trade_log_return_amax_300,trade_log_return_mean_300,trade_log_return_median_300,trade_log_return_std_300,trade_log_return_sum_300,trade_log_return_skew_300,trade_log_return_median_absolute_deviation_300,trade_seconds_in_bucket_count_unique_300,trade_size_amin_300,trade_size_amax_300,trade_size_mean_300,trade_size_median_300,trade_size_std_300,trade_size_sum_300,trade_size_skew_300,trade_size_median_absolute_deviation_300,trade_order_count_amin_300,trade_order_count_amax_300,trade_order_count_mean_300,trade_order_count
_median_300,trade_order_count_std_300,trade_order_count_sum_300,trade_order_count_skew_300,trade_order_count_median_absolute_deviation_300,trade_log_return_realized_volatility_150,trade_log_return_amin_150,trade_log_return_amax_150,trade_log_return_mean_150,trade_log_return_median_150,trade_log_return_std_150,trade_log_return_sum_150,trade_log_return_skew_150,trade_log_return_median_absolute_deviation_150,trade_seconds_in_bucket_count_unique_150,trade_size_amin_150,trade_size_amax_150,trade_size_mean_150,trade_size_median_150,trade_size_std_150,trade_size_sum_150,trade_size_skew_150,trade_size_median_absolute_deviation_150,trade_order_count_amin_150,trade_order_count_amax_150,trade_order_count_mean_150,trade_order_count_median_150,trade_order_count_std_150,trade_order_count_sum_150,trade_order_count_skew_150,trade_order_count_median_absolute_deviation_150
0,0,5,0.004136,0-5,1.001422,1.004267,1.003314,1.003439,0.000596,303.000854,-1.75244,0.00023,1.002301,1.004939,1.004169,1.004267,0.000601,303.259064,-1.549424,0.000383,1.00137,1.004215,1.003139,1.003336,0.000573,302.947845,-1.754613,0.000192,1.002353,1.005146,1.00432,1.004525,0.000604,303.304626,-1.664484,0.000383,1.001434,1.00492,1.003725,1.003923,0.000693,303.125061,-1.793836,0.000351,1.00139,1.005124,1.003661,1.003821,0.000781,303.10553,-1.28259,0.000548,0.004499,-0.000896,0.001049,7.588486e-06,0.0,0.000259,0.002292,0.332077,5.1e-05,0.002325,0.006999,8e-06,0.000403,0.301112,4.468462,7.8e-05,1.192093e-07,0.001414,0.000388,0.000315,0.000295,0.117051,0.791968,0.000314,0.000361,0.001393,0.000852,0.000876,0.000211,0.257255,0.140243,0.000152,5.2e-05,0.000672,0.000176,0.000103,0.000162,0.053006,1.237735,7.652829e-05,-0.000569,-5.2e-05,-0.000151,-0.000103,0.000126,-0.045557,-1.477414,7.670503e-05,34,762,323.496689,314.5,138.101214,97696,0.489687,131.2101,0,518,134.89404,107.0,107.260583,40738,0.9685,107.4885,1.002043,1.003646,1.003107,1.003129,0.000445,68.211258,-1.024004,0.000537,1.002715,1.004422,1.003892,1.004008,0.000438,68.264679,-0.544144,0.000537,1.001991,1.003594,1.002845,1.003025,0.000405,68.193466,-0.763396,0.00023,1.003129,1.00468,1.004058,1.00406,0.000483,68.275955,-0.299255,0.00046,1.002053,1.004004,...,0.000397,0.000345,0.000281,0.091996,0.595002,0.000311,0.000361,0.001391,0.000858,0.000876,0.000221,0.199058,0.102357,0.000153,5.2e-05,0.000672,0.000188,0.000103,0.000165,0.043697,1.144688,7.670503e-05,-0.000517,-5.2e-05,-0.000147,-0.000103,0.00012,-0.034024,-1.131454,7.670503e-05,34,727,327.431034,325.5,142.761068,75964,0.365209,139.3644,0,418,123.586207,99.0,103.533216,28672,1.030882,97.8516,0.002006,-0.000721,0.00061,3.4e-05,7.7e-05,0.000319,0.001341,-0.29063,0.000344,40.0,1.0,499.0,79.475,19.5,118.375107,3179.0,2.036925,27.4281,1.0,12.0,2.75,2.0,2.467741,110.0,1.8986,1.4826,0.00106,-0.000464,0.000475,-1.8e-05,5.1e-05,0.000293,-0.000258,-0.065085,0.000326,14
.0,1.0,499.0,74.428571,10.5,137.880502,1042.0,2.315182,14.0847,1.0,7.0,2.642857,2.0,2.023217,37.0,1.028004,1.4826,0.001308,-0.000464,0.000475,-3.4e-05,-4e-06,0.00029,-0.000721,-0.012553,0.000387,21.0,1.0,499.0,75.571429,6.0,141.675888,1587.0,2.135433,7.413,1.0,9.0,2.571429,2.0,2.292846,54.0,1.56349,1.4826,0.001701,-0.000721,0.00061,-1.1e-05,2.6e-05,0.000316,-0.000316,-0.219891,0.000365,30.0,1.0,499.0,68.966667,7.0,122.25256,2069.0,2.421965,8.8956,1.0,9.0,2.433333,2.0,2.062528,73.0,1.679583,1.4826
1,0,11,0.001445,0-11,0.999473,1.000627,1.000011,1.000025,0.000284,200.002258,0.293284,0.000223,0.999975,1.000878,1.000406,1.000376,0.000219,200.081116,0.508002,0.000223,0.999423,1.000477,0.99987,0.999975,0.000245,199.973907,-0.251671,0.000223,1.000176,1.001179,1.000541,1.000527,0.000217,200.108109,0.675041,0.000223,0.9997,1.000834,1.000239,1.000232,0.000262,200.047775,0.452154,0.000176,0.999575,1.001067,1.000206,1.000192,0.000272,200.041168,0.286531,0.000248,0.001204,-0.000476,0.000396,1.801324e-06,0.0,8.5e-05,0.00036,-1.098433,4e-06,0.000801,0.002476,4e-06,0.000175,0.292108,3.893411,2.1e-05,4.768372e-07,0.000639,0.000212,0.000212,0.000155,0.04231,0.590079,0.000196,0.000151,0.000903,0.000394,0.000351,0.000157,0.078836,1.105518,0.000149,5e-05,0.000652,0.000142,5e-05,0.000148,0.028358,2.322646,8.836984e-08,-0.000351,-5e-05,-0.000135,-0.000151,6.5e-05,-0.027001,-0.646372,7.440741e-05,46,876,411.45,408.0,172.263581,82290,0.150233,187.5489,1,481,142.05,114.0,102.139758,28410,0.746431,100.8168,0.999975,1.000627,1.000293,1.000226,0.000253,54.015812,0.337272,0.000298,1.000326,1.000878,1.000641,1.000527,0.000207,54.034634,-0.196819,0.000298,0.999573,1.000477,1.000059,1.000025,0.000171,54.003212,0.194116,7.4e-05,1.000477,1.001179,1.000785,1.000728,0.0002,54.042362,0.107939,0.000223,1.000158,1.000834,...,0.000205,0.000199,0.000158,0.035451,0.697107,0.000189,0.000151,0.000753,0.000353,0.000351,0.000112,0.061017,0.899066,0.000149,5e-05,0.000652,0.000141,5e-05,0.000154,0.024394,2.387894,8.836984e-08,-0.000301,-5e-05,-0.000127,-0.000151,5.8e-05,-0.022032,-0.3582,7.440741e-05,46,876,419.277457,433.0,178.652395,72535,0.080263,182.3598,1,481,151.566474,120.0,104.576846,26221,0.623652,102.2994,0.000901,-0.000251,0.000402,2.7e-05,0.0,0.000165,0.000803,0.351172,0.000149,30.0,1.0,280.0,42.966667,3.0,77.815203,1289.0,1.888894,2.9652,1.0,6.0,1.9,1.0,1.446756,57.0,1.70531,0.0,0.000501,-0.000204,0.000269,3.5e-05,4.1e-05,0.000163,0.000351,-0.068993,0.000149,10.0,1.0,280.0,82.8,19.0,107.246134
,828.0,0.792523,26.6868,1.0,6.0,2.2,1.0,2.097618,22.0,1.245638,0.0,0.000587,-0.000204,0.000269,1.6e-05,1.6e-05,0.000151,0.000251,0.184259,0.000155,16.0,1.0,280.0,56.25,12.5,90.504144,900.0,1.503318,17.0499,1.0,6.0,2.25,1.0,1.807392,36.0,1.154519,0.0,0.000813,-0.000251,0.000402,2.9e-05,1.6e-05,0.000167,0.000702,0.316074,0.000155,24.0,1.0,280.0,48.875,5.5,83.807913,1173.0,1.703029,6.6717,1.0,6.0,2.041667,1.0,1.573674,49.0,1.434568,0.0
2,0,16,0.002168,0-16,0.997008,1.00012,0.999204,0.999497,0.000711,187.850311,-1.171563,0.000497,0.997678,1.000886,0.999929,1.000263,0.000788,187.986603,-1.16187,0.000497,0.996721,0.999928,0.999007,0.99921,0.000728,187.813354,-1.467128,0.000426,0.997966,1.000981,1.000127,1.000407,0.000784,188.023834,-1.157013,0.000639,0.997224,1.000878,0.999542,0.999818,0.000864,187.913849,-0.905676,0.000772,0.996897,1.000876,0.99968,0.999751,0.000862,187.939819,-1.01296,0.000722,0.002369,-0.000783,0.000799,-1.103269e-05,0.0,0.000173,-0.002074,-0.192328,1.9e-05,-0.001493,0.004801,-8e-06,0.000351,-0.449537,3.646116,0.000157,4.887581e-06,0.001135,0.000331,0.000252,0.000246,0.06223,1.237335,0.000186,0.000384,0.001149,0.000725,0.000718,0.000164,0.13633,0.202463,0.000178,4.8e-05,0.00067,0.000197,9.6e-05,0.00017,0.036955,0.909445,7.104936e-05,-0.000718,-4.8e-05,-0.000198,-9.6e-05,0.000171,-0.037243,-1.408969,7.096099e-05,108,758,416.351064,411.0,138.433034,78274,0.084062,143.8122,1,579,141.414894,111.0,108.891243,26586,0.831645,114.9015,0.997008,0.999067,0.998115,0.998157,0.00054,43.917046,0.121114,0.000497,0.997678,0.999737,0.998718,0.998779,0.000577,43.943611,0.205975,0.000639,0.996721,0.999019,0.997929,0.998013,0.000688,43.908859,-0.193573,0.000781,0.997966,1.000407,0.998926,0.998875,0.000602,43.952755,0.672419,0.000532,0.997224,0.999084,...,0.000373,0.000328,0.000276,0.044348,0.946803,0.000272,0.000384,0.001149,0.000679,0.000671,0.000163,0.080811,0.564635,0.000143,4.8e-05,0.00067,0.000161,9.6e-05,0.000155,0.0191,1.754655,7.096099e-05,-0.000718,-4.8e-05,-0.000241,-0.000191,0.000195,-0.028626,-0.95781,0.0001420103,108,758,428.537815,417.0,135.376048,50996,0.035931,127.5036,1,579,132.084034,100.0,114.924631,15718,1.158035,112.6776,0.001961,-0.000642,0.000862,-0.000102,-0.000151,0.000387,-0.00254,1.048432,0.000328,25.0,1.0,391.0,86.44,14.0,113.587,2161.0,1.220954,19.2738,1.0,8.0,2.72,2.0,2.300725,68.0,1.27196,1.4826,0.001048,-0.000642,0.000288,-0.000202,-0.000151,0.000302,-0.001822,0.126854
,0.000366,9.0,5.0,391.0,120.555556,13.0,143.461764,1085.0,0.711283,11.8608,1.0,8.0,3.666667,3.0,2.915476,33.0,0.606339,2.9652,0.001137,-0.000642,0.000288,-0.000196,-0.000123,0.000275,-0.002348,0.032183,0.000347,12.0,1.0,391.0,99.083333,12.0,130.679663,1189.0,1.049955,15.5673,1.0,8.0,3.166667,2.5,2.691175,38.0,0.955384,2.2239,0.001621,-0.000642,0.000862,-0.000139,-0.000171,0.000344,-0.002779,1.257622,0.000277,20.0,1.0,391.0,100.5,29.5,120.976509,2010.0,0.985998,41.5128,1.0,8.0,2.95,2.0,2.459675,59.0,1.082243,1.4826
3,0,31,0.002195,0-31,0.997363,0.999815,0.998445,0.99808,0.000689,119.813408,0.930791,0.000103,0.998519,1.000509,0.999304,0.999213,0.000568,119.916534,0.799826,0.000412,0.997178,0.999769,0.998255,0.997964,0.000731,119.790642,0.971031,0.00048,0.998566,1.000601,0.999413,0.999283,0.000568,119.929535,0.76766,0.000377,0.997447,1.000412,0.998832,0.998657,0.000757,119.859779,0.746726,0.000598,0.99743,1.000116,0.998633,0.998573,0.000656,119.835945,0.469768,0.000596,0.002574,-0.001296,0.000757,-2.356919e-05,0.0,0.000235,-0.002828,-1.950045,1.2e-05,-0.002053,0.003637,-1.7e-05,0.000333,-1.350842,5.661873,0.000129,1.507998e-05,0.001082,0.00038,0.00036,0.000248,0.04561,0.368336,0.000322,0.000324,0.001622,0.00086,0.000788,0.00028,0.103252,0.335148,0.000343,4.6e-05,0.000694,0.00019,9.3e-05,0.000199,0.022764,1.247198,6.866337e-05,-0.000463,-4.6e-05,-0.000108,-4.6e-05,9.1e-05,-0.013001,-1.549214,8.836984e-08,140,912,435.266667,426.0,156.120334,52232,0.66811,149.7426,2,576,146.216667,115.0,121.533215,17546,1.123457,105.2646,0.997363,0.998011,0.997831,0.998011,0.000299,17.960951,-0.992209,0.0,0.998519,0.999075,0.998887,0.998982,0.000237,17.979967,-0.918251,0.000137,0.997178,0.997964,0.997715,0.997918,0.000343,17.95887,-0.988111,0.0,0.998566,0.999121,0.998936,0.999028,0.000239,17.980846,-0.913937,0.000137,0.997447,0.9986,...,0.000362,0.00035,0.000247,0.029322,0.512297,0.000302,0.000324,0.001622,0.00092,0.001019,0.000296,0.074552,-0.058949,0.000343,4.6e-05,0.000648,0.00017,9.3e-05,0.000191,0.013789,1.55486,6.8575e-05,-0.000324,-4.6e-05,-0.000108,-4.6e-05,8.5e-05,-0.008745,-1.156439,8.836984e-08,140,912,424.234568,407.0,156.628404,34363,0.776949,131.9514,2,576,151.765432,115.0,124.293028,12293,1.218865,102.2994,0.001561,-0.00088,0.000546,-0.000151,-3.8e-05,0.000387,-0.002261,-0.224084,0.000382,15.0,5.0,450.0,130.8,70.0,144.828569,1962.0,1.309124,81.543,1.0,15.0,3.933333,2.0,4.043808,59.0,1.559521,1.4826,0.000802,-0.000732,-3.8e-05,-0.000365,-0.000324,0.000349,-0.001095,-0.210467,0.000424,
3.0,5.0,450.0,171.333333,59.0,242.83808,514.0,0.667974,80.0604,1.0,8.0,3.666667,2.0,3.785939,11.0,0.652012,1.4826,0.001089,-0.000732,0.000546,-5.1e-05,-3.8e-05,0.000381,-0.000455,-0.203817,0.000382,9.0,5.0,450.0,172.888889,116.0,167.747909,1556.0,0.834892,100.8168,1.0,15.0,5.111111,4.0,4.702245,46.0,1.081381,4.4478,0.001401,-0.00088,0.000546,-0.000117,-3.8e-05,0.000426,-0.001289,-0.393842,0.000382,11.0,5.0,450.0,148.272727,70.0,160.381477,1631.0,1.0948,81.543,1.0,15.0,4.545455,3.0,4.412791,50.0,1.344523,2.9652
4,0,62,0.001747,0-62,0.999044,0.99979,0.999407,0.999324,0.000218,175.895645,0.357271,0.000138,0.999464,1.00021,0.999804,0.999837,0.000191,175.965515,0.12478,0.000207,0.998858,0.99965,0.999216,0.999091,0.000229,175.862076,0.515789,0.000207,0.999557,1.000303,0.999913,0.999883,0.000196,175.984711,-0.21631,0.000276,0.999231,1.000159,0.999619,0.999586,0.000258,175.932861,0.21661,0.000355,0.999102,1.000249,0.999626,0.999598,0.000317,175.93425,-0.045549,0.000426,0.001894,-0.00075,0.000912,-1.015935e-08,0.0,0.000143,-2e-06,0.451765,1e-05,-0.000281,0.003257,-2e-06,0.000246,0.117547,2.519043,8.9e-05,3.278255e-06,0.000724,0.000254,0.000222,0.000188,0.044783,0.530648,0.000211,9.3e-05,0.000793,0.000397,0.000373,0.00013,0.069901,0.603802,0.000104,4.7e-05,0.000466,0.000191,0.000187,8.3e-05,0.033565,0.904491,6.906103e-05,-0.00042,-4.7e-05,-0.000109,-7e-05,7.6e-05,-0.019206,-0.962715,3.45968e-05,16,738,343.221591,321.0,158.054066,60407,0.532468,143.8122,0,424,123.846591,98.5,102.407501,21797,0.653022,128.2449,0.999231,0.999744,0.999293,0.999231,0.000151,35.974552,2.234963,0.0,0.999557,1.000163,0.999812,0.999837,0.000155,35.99324,0.500277,0.000138,0.999044,0.999511,0.999098,0.999044,0.000132,35.96751,2.407073,0.0,0.99965,1.00021,0.999948,0.999883,0.000137,35.998138,0.051566,0.000173,0.999234,1.000159,...,0.000242,0.000198,0.000193,0.032718,0.682531,0.000206,9.3e-05,0.000793,0.000395,0.000373,0.000137,0.053347,0.592488,0.000138,4.7e-05,0.000466,0.000187,0.000187,8.8e-05,0.02522,1.17496,6.901685e-05,-0.00042,-4.7e-05,-0.000117,-9.3e-05,8e-05,-0.015757,-0.768504,6.910522e-05,16,738,371.266667,349.0,162.610706,50121,0.373945,173.4642,0,424,131.474074,101.0,109.275622,17749,0.457668,136.3992,0.000871,-0.00056,0.000332,-1.2e-05,-9e-06,0.00019,-0.000274,-0.703133,0.000135,22.0,1.0,341.0,81.409091,5.5,117.914682,1791.0,1.190959,6.6717,1.0,17.0,4.045455,2.0,4.099678,89.0,1.694005,1.4826,0.00036,-9.3e-05,0.000332,2.5e-05,-7e-05,0.000206,9.9e-05,1.116419,3.5e-05,4.0,1.0,34.0,10.75,4.0,15.585784
,43.0,1.11683,2.9652,1.0,10.0,3.5,1.5,4.358899,14.0,1.115396,0.7413,0.000453,-0.000176,0.000332,1e-05,-3.9e-05,0.000143,0.000107,0.948151,8.1e-05,11.0,1.0,341.0,110.818182,34.0,140.061285,1219.0,0.803901,48.9258,1.0,17.0,4.909091,2.0,4.887833,54.0,1.512628,1.4826,0.00055,-0.000195,0.000332,1.5e-05,-1.9e-05,0.000141,0.000239,0.51876,0.000135,16.0,1.0,341.0,98.125,20.0,127.212093,1570.0,0.950559,28.1694,1.0,17.0,4.5,2.0,4.412105,72.0,1.577651,1.4826


In [91]:
# Discard every row that has a missing value in any column (spelled-out defaults)
train = train.dropna(axis=0, how='any')

In [86]:
# First-level weighted average price (WAP)
def calc_wap1(df):
    """Return the level-1 WAP for each row of ``df``.

    Each price is weighted by the size quoted on the opposite side of the
    book, and the result is normalised by the combined level-1 size.
    Reads the columns ``bid_price1``, ``ask_price1``, ``bid_size1`` and
    ``ask_size1``.
    """
    bid_px, ask_px = df['bid_price1'], df['ask_price1']
    bid_sz, ask_sz = df['bid_size1'], df['ask_size1']
    cross_weighted = bid_px * ask_sz + ask_px * bid_sz
    return cross_weighted / (bid_sz + ask_sz)

# Second-level weighted average price (WAP)
def calc_wap2(df):
    """Return the level-2 WAP for each row of ``df``.

    Each price is weighted by the size quoted on the opposite side of the
    book, and the result is normalised by the combined level-2 size.
    Reads the columns ``bid_price2``, ``ask_price2``, ``bid_size2`` and
    ``ask_size2``.
    """
    bid_px, ask_px = df['bid_price2'], df['ask_price2']
    bid_sz, ask_sz = df['bid_size2'], df['ask_size2']
    cross_weighted = bid_px * ask_sz + ask_px * bid_sz
    return cross_weighted / (bid_sz + ask_sz)

# Log return between consecutive observations
# (log(x / y) == log(x) - log(y), so differencing the logs is enough)
def log_return(series):
    """Return the first difference of the natural log of ``series``.

    The first element of the result has no predecessor and is NaN.
    """
    logged = np.log(series)
    return logged.diff()

# Realized volatility of a series of returns
def realized_volatility(series):
    """Return the square root of the sum of squared values in ``series``."""
    squared = series**2
    total = np.sum(squared)
    return np.sqrt(total)

# Number of distinct values in a series
def count_unique(series):
    """Return how many distinct elements ``series`` contains."""
    distinct = np.unique(series)
    return distinct.size

# Load the base train and test tables
def read_train_test():
    """Read ``train.csv`` and ``test.csv`` from ``data_dir`` and key them.

    Both frames get a ``row_id`` column of the form ``"<stock_id>-<time_id>"``
    that serves as the merge key for book and trade data. The number of
    training rows is printed.

    Returns
    -------
    tuple
        ``(train, test)`` DataFrames.
    """
    train, test = (pd.read_csv(data_dir + csv_name) for csv_name in ('train.csv', 'test.csv'))
    # Build the merge key on each frame in turn
    for frame in (train, test):
        stock_part = frame['stock_id'].astype(str)
        time_part = frame['time_id'].astype(str)
        frame['row_id'] = stock_part + '-' + time_part
    print(f'Our training set has {train.shape[0]} rows')
    return train, test

# Load the book data of one stock id and report files containing missing values
def book_preprocessor(file_path):
    """Read the parquet data at ``file_path`` and return it unchanged.

    If any cell of the loaded frame is missing, ``file_path`` is printed so
    the offending file can be identified.
    """
    book = pd.read_parquet(file_path)
    missing_cells = book.isna().sum().sum()
    if missing_cells > 0:
        print(file_path)
    return book

# Load the trade data of one stock id and report files containing missing values
def trade_preprocessor(file_path):
    """Read the parquet data at ``file_path`` and return it unchanged.

    If any cell of the loaded frame is missing, ``file_path`` is printed so
    the offending file can be identified.
    """
    trades = pd.read_parquet(file_path)
    missing_cells = trades.isna().sum().sum()
    if missing_cells > 0:
        print(file_path)
    return trades

    
# Function to run the per-stock preprocessing functions over a list of stock ids
def preprocessor(list_stock_ids, is_train = True):
    """Run the book and trade preprocessors once for every stock id.

    Parameters
    ----------
    list_stock_ids : iterable
        Stock ids to process. Each id is converted with ``str`` to build the
        ``stock_id=<id>`` partition path under ``data_dir``.
    is_train : bool, default True
        When true the ``book_train.parquet`` / ``trade_train.parquet``
        directories are used, otherwise the ``*_test.parquet`` ones.

    Returns
    -------
    list
        One entry per stock id, in input order, holding the value returned by
        the per-stock worker (currently the placeholder string ``"Hello"``).
        An empty input yields an empty list.

    Notes
    -----
    The previous version ended with ``return df`` although ``df`` was never
    assigned inside the function, so it only ran when a stale global ``df``
    happened to exist in the kernel and raised ``NameError`` otherwise. The
    per-stock results are now collected and returned explicitly.
    """

    # Per-stock worker (kept as a closure so it can be handed to joblib later)
    def for_joblib(stock_id):
        # Train
        if is_train:
            file_path_book = data_dir + "book_train.parquet/stock_id=" + str(stock_id)
            file_path_trade = data_dir + "trade_train.parquet/stock_id=" + str(stock_id)
        # Test
        else:
            file_path_book = data_dir + "book_test.parquet/stock_id=" + str(stock_id)
            file_path_trade = data_dir + "trade_test.parquet/stock_id=" + str(stock_id)

        # Run both preprocessors; their return values are not merged here
        book_preprocessor(file_path_book)
        trade_preprocessor(file_path_trade)

        # Placeholder result for this stock id
        return "Hello"

    # Sequential loop. To parallelise, swap in:
    #   Parallel(n_jobs = -1, verbose = 1)(delayed(for_joblib)(stock_id) for stock_id in list_stock_ids)
    results = [for_joblib(stock_id) for stock_id in list_stock_ids]
    return results


# Load the base train / test tables (prints the training row count)
train, test = read_train_test()

# Distinct stock ids that occur in the training table
train_stock_ids = train['stock_id'].unique()
# Run the per-stock preprocessing over every training stock id
train_ = preprocessor(list_stock_ids = train_stock_ids, is_train = True)

Our training set has 428932 rows


In [55]:
# Per-time_id count of non-null values in each column, keeping only rows
# whose seconds_in_bucket is at or after the cutoff
(
    df.loc[df['seconds_in_bucket'] >= seconds_in_bucket]
      .groupby(['time_id'])
      .count()
)

Unnamed: 0_level_0,seconds_in_bucket,price,size,order_count,log_return
time_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5,14,14,14,14,14
11,10,10,10,10,10
16,9,9,9,9,9
31,3,3,3,3,3
62,4,4,4,4,4
...,...,...,...,...,...
32751,19,19,19,19,19
32753,5,5,5,5,5
32758,8,8,8,8,8
32763,12,12,12,12,12


In [11]:
# Number of columns in the frame
df.shape[1]

449

In [12]:
# Total number of missing cells across the whole frame
df.isnull().sum().sum()

0