In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import glob
from joblib import Parallel, delayed
import scipy as sc
from scipy.stats import skew, kurtosis, median_absolute_deviation
from sklearn.model_selection import KFold
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')
import os

# Root data directory (relative path, trailing slash required because paths are
# built by string concatenation). The notebook reads train.csv / test.csv and the
# book_*.parquet / trade_*.parquet folders from here.
data_dir = '../../../data/'

In [9]:
def calc_wap1(df):
    """Weighted average price of the first book level.

    Each side's price is weighted by the *opposite* side's size:
    (bid_price1 * ask_size1 + ask_price1 * bid_size1) / (bid_size1 + ask_size1).
    """
    weighted_prices = df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1']
    total_size = df['bid_size1'] + df['ask_size1']
    return weighted_prices / total_size
def calc_wap2(df):
    """Weighted average price of the second book level.

    Same formula as the level-1 WAP, applied to the ``*2`` columns:
    (bid_price2 * ask_size2 + ask_price2 * bid_size2) / (bid_size2 + ask_size2).
    """
    weighted_prices = df['bid_price2'] * df['ask_size2'] + df['ask_price2'] * df['bid_size2']
    total_size = df['bid_size2'] + df['ask_size2']
    return weighted_prices / total_size

def calc_wap3(df):
    """Weighted average price pooled over both book levels.

    Numerator: every bid price times the ask size of its level plus every ask
    price times the bid size of its level; denominator: the sum of all four sizes.
    """
    weighted_prices = (
        df['bid_price1'] * df['ask_size1']
        + df['ask_price1'] * df['bid_size1']
        + df['bid_price2'] * df['ask_size2']
        + df['ask_price2'] * df['bid_size2']
    )
    total_size = df['ask_size1'] + df['bid_size1'] + df['ask_size2'] + df['bid_size2']
    return weighted_prices / total_size

def log_return(list_stock_prices):
    """Return the first difference of the natural log of the given prices.

    The first element of the result is NaN because it has no predecessor.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def realized_volatility(series):
    """Square root of the sum of squared values (applied here to log returns)."""
    squared = series ** 2
    return np.sqrt(np.sum(squared))

def count_unique(series):
    """Number of distinct values in ``series``."""
    distinct_values = np.unique(series)
    return distinct_values.shape[0]


def preprocessor_book(file_path):
    """Build per-``time_id`` order-book features for one stock.

    Parameters
    ----------
    file_path : str
        Path to one stock's book parquet partition. The stock id is taken from
        the text after the first ``'='`` in the path (e.g. ``.../stock_id=0``).

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id`` with columns named ``<column>_<statistic>``
        plus a ``row_id`` column formatted ``"<stock_id>-<time_id>"``.
    """
    # Read the partition once (it was previously loaded twice back to back).
    df = pd.read_parquet(file_path)

    # Weighted average prices for level 1, level 2 and both levels pooled.
    df['wap1'] = calc_wap1(df)
    df['wap2'] = calc_wap2(df)
    df['wap3'] = calc_wap3(df)

    # Log returns are computed inside each time_id so that no return spans two
    # buckets; the leading NaN of every bucket is replaced by 0.
    df['log_return1'] = df.groupby(['time_id'])['wap1'].apply(log_return).fillna(0)
    df['log_return2'] = df.groupby(['time_id'])['wap2'].apply(log_return).fillna(0)
    df['log_return3'] = df.groupby(['time_id'])['wap3'].apply(log_return).fillna(0)

    df['total_volume'] = (df['ask_size1'] + df['ask_size2']) + (df['bid_size1'] + df['bid_size2'])

    # Statistics shared by every aggregated column; the log returns additionally
    # get the realized volatility. Key order defines the output column order.
    base_stats = [np.sum, np.mean, np.std, np.max]
    return_stats = base_stats + [realized_volatility]
    create_feature_dict = {
        'wap1': base_stats,
        'wap2': base_stats,
        'wap3': base_stats,
        'log_return1': return_stats,
        'log_return2': return_stats,
        'log_return3': return_stats,
        'ask_price1': base_stats,
        'ask_price2': base_stats,
        'bid_price1': base_stats,
        'bid_price2': base_stats,
        'total_volume': base_stats,
    }

    # Aggregate over all seconds of each time_id.
    df_feature = df.groupby(['time_id']).agg(create_feature_dict).reset_index()

    # Flatten the two-level column index; ('time_id', '') becomes 'time_id_'.
    df_feature.columns = ['_'.join(col) for col in df_feature.columns]

    # Build the row_id key used to join with train.csv / test.csv.
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = df_feature['time_id_'].apply(lambda x:f'{stock_id}-{x}')
    df_feature = df_feature.drop(['time_id_'],axis=1)

    return df_feature


def preprocessor_trade(file_path):
    """Build per-``time_id`` trade features for one stock.

    Reads the trade parquet partition at ``file_path``, adds the within-bucket
    log return of ``price`` and aggregates per ``time_id``. Every feature column
    is prefixed with ``trade_``; the result also carries a ``row_id`` column
    formatted ``"<stock_id>-<time_id>"``, where the stock id is the text after
    the first ``'='`` in ``file_path``.
    """
    trades = pd.read_parquet(file_path)

    # Leading NaN of each time_id bucket is replaced by 0.
    trades['log_return'] = trades.groupby('time_id')['price'].apply(log_return).fillna(0)

    # Function order matters: it determines the order of the output columns.
    stats_by_column = {
        'log_return': [np.sum, realized_volatility, np.mean, np.std, np.max,
                       median_absolute_deviation, skew, kurtosis],
        'price': [np.sum, np.mean, np.std, np.max,
                  median_absolute_deviation, skew, kurtosis],
        'seconds_in_bucket': [count_unique],
        'size': [np.mean, np.sum, np.std, np.max],
        'order_count': [np.mean, np.sum, np.std, np.max],
    }

    features = trades.groupby('time_id').agg(stats_by_column).reset_index()

    # Flatten the two-level columns and prefix them in one pass;
    # ('time_id', '') therefore becomes 'trade_time_id_'.
    features.columns = ['trade_' + '_'.join(col) for col in features.columns]

    stock_id = file_path.split('=')[1]
    features['row_id'] = features['trade_time_id_'].apply(lambda x:f'{stock_id}-{x}')

    return features.drop(['trade_time_id_'], axis=1)

def preprocessor(list_stock_ids, is_train = True, n_jobs = 4):
    """Compute book + trade features for many stocks in parallel.

    Parameters
    ----------
    list_stock_ids : iterable
        Stock ids whose parquet partitions should be processed.
    is_train : bool, default True
        Read from ``book_train.parquet`` / ``trade_train.parquet`` when True,
        otherwise from the ``*_test.parquet`` folders. Paths are built from the
        module-level ``data_dir``.
    n_jobs : int, default 4
        Number of joblib workers (previously hard-coded to 4).

    Returns
    -------
    pandas.DataFrame
        Features of all requested stocks stacked row-wise with a fresh
        0..n-1 index. Trade features are left-joined onto book features on
        ``row_id``, so rows without trades keep NaN in the trade columns.
    """
    split_name = 'train' if is_train else 'test'

    def for_joblib(stock_id):
        file_path_book = data_dir + "book_" + split_name + ".parquet/stock_id=" + str(stock_id)
        file_path_trade = data_dir + "trade_" + split_name + ".parquet/stock_id=" + str(stock_id)

        # Return the per-stock frame directly; it used to be concatenated onto
        # an empty DataFrame captured from the enclosing scope, which added
        # nothing and depended on that variable not having been rebound yet.
        return pd.merge(preprocessor_book(file_path_book),
                        preprocessor_trade(file_path_trade),
                        on='row_id', how='left')

    per_stock_features = Parallel(n_jobs=n_jobs, verbose=1)(
        delayed(for_joblib)(stock_id) for stock_id in list_stock_ids
        )

    return pd.concat(per_stock_features, ignore_index = True)

# Labels: one row per (stock_id, time_id) with its target.
train = pd.read_csv(data_dir + 'train.csv')
train_ids = train['stock_id'].unique()

# Engineered features for every training stock; missing values become -999.
df_train = preprocessor(train_ids, is_train=True).fillna(-999)

# Build the same "<stock_id>-<time_id>" key the feature frames use, then
# left-join the features onto the labels.
train['row_id'] = train['stock_id'].astype(str).str.cat(train['time_id'].astype(str), sep='-')
train = train[['row_id', 'target']]
df_train = pd.merge(train, df_train, on='row_id', how='left')
df_train.head()

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  2.3min
[Parallel(n_jobs=4)]: Done 112 out of 112 | elapsed:  5.9min finished


Unnamed: 0,row_id,target,wap1_sum,wap1_mean,wap1_std,wap1_amax,wap2_sum,wap2_mean,wap2_std,wap2_amax,...,trade_price_kurtosis,trade_seconds_in_bucket_count_unique,trade_size_mean,trade_size_sum,trade_size_std,trade_size_amax,trade_order_count_mean,trade_order_count_sum,trade_order_count_std,trade_order_count_amax
0,0-5,0.004136,303.125061,1.003725,0.000693,1.00492,303.10553,1.003661,0.000781,1.005124,...,-0.23169,40.0,79.475,3179.0,118.375107,499.0,2.75,110.0,2.467741,12.0
1,0-11,0.001445,200.047775,1.000239,0.000262,1.000834,200.041168,1.000206,0.000272,1.001067,...,-0.33866,30.0,42.966667,1289.0,77.815203,280.0,1.9,57.0,1.446756,6.0
2,0-16,0.002168,187.913849,0.999542,0.000864,1.000878,187.939819,0.99968,0.000862,1.000876,...,-0.930625,25.0,86.44,2161.0,113.587,391.0,2.72,68.0,2.300725,8.0
3,0-31,0.002195,119.859779,0.998832,0.000757,1.000412,119.835945,0.998633,0.000656,1.000116,...,-0.979349,15.0,130.8,1962.0,144.828569,450.0,3.933333,59.0,4.043808,15.0
4,0-62,0.001747,175.932861,0.999619,0.000258,1.000159,175.93425,0.999626,0.000317,1.000249,...,-0.065389,22.0,81.409091,1791.0,117.914682,341.0,4.045455,89.0,4.099678,17.0


In [10]:
test = pd.read_csv(data_dir + 'test.csv')
test_ids = test.stock_id.unique()
# Fill missing features with -999 exactly as is done for the training features,
# so that a missing value is encoded identically at training and inference time.
df_test = preprocessor(list_stock_ids= test_ids, is_train = False).fillna(-999)
df_test = test.merge(df_test, on = ['row_id'], how = 'left')

# stock_id target encoding
# row_id is "<stock_id>-<time_id>", so the part before '-' is the stock id (as str).
df_train['stock_id'] = df_train['row_id'].apply(lambda x:x.split('-')[0])
df_test['stock_id'] = df_test['row_id'].apply(lambda x:x.split('-')[0])

# Test rows: encode with the per-stock target mean over the full training set.
stock_id_target_mean = df_train.groupby('stock_id')['target'].mean()
df_test['stock_id_target_enc'] = df_test['stock_id'].map(stock_id_target_mean) # test_set

# Training rows: out-of-fold encoding. Each row's value is the per-stock mean
# computed on the other folds only, so a row's own target never leaks into it.
train_target_enc = np.full(df_train.shape[0], np.nan)
kf = KFold(n_splits = 10, shuffle=True,random_state = 19911109)
for fit_idx, enc_idx in kf.split(df_train):
    target_mean = df_train.iloc[fit_idx].groupby('stock_id')['target'].mean()
    train_target_enc[enc_idx] = df_train['stock_id'].iloc[enc_idx].map(target_mean)
df_train['stock_id_target_enc'] = train_target_enc

# Feature importance is only computed when the test frame has exactly 3 rows.
DO_FEAT_IMP = len(df_test) == 3
    
# ref https://www.kaggle.com/corochann/permutation-importance-for-feature-selection-part1
def calc_model_importance(model, feature_names=None, importance_type='gain'):
    """Return the model's feature importances as a one-column DataFrame.

    ``model.feature_importance(importance_type=...)`` supplies the values,
    ``feature_names`` becomes the index, and the single ``importance`` column
    is sorted in ascending order.
    """
    importance_values = model.feature_importance(importance_type=importance_type)
    importance_frame = pd.DataFrame(importance_values,
                                    index=feature_names,
                                    columns=['importance'])
    return importance_frame.sort_values('importance')


def plot_importance(importance_df, title='',
                    save_filepath=None, figsize=(8, 12)):
    """Draw ``importance_df`` as a horizontal bar chart.

    The figure is shown when ``save_filepath`` is None and written to that path
    otherwise; in both cases it is closed afterwards. An empty ``title`` leaves
    the chart untitled.
    """
    fig, ax = plt.subplots(figsize=figsize)
    importance_df.plot.barh(ax=ax)

    if title:
        plt.title(title)
    plt.tight_layout()

    if save_filepath is not None:
        plt.savefig(save_filepath)
    else:
        plt.show()

    plt.close()
    
# stock_id was extracted from row_id as a string; cast it to int in both frames.
df_train['stock_id'] = df_train['stock_id'].astype(int)
df_test['stock_id'] = df_test['stock_id'].astype(int)

# Model inputs: everything except the join key and the label.
X = df_train.drop(columns=['row_id', 'target'])
y = df_train['target']

def rmspe(y_true, y_pred):
    """Root mean squared percentage error: sqrt(mean(((y_true - y_pred) / y_true)^2))."""
    relative_error = (y_true - y_pred) / y_true
    return np.sqrt(np.mean(np.square(relative_error)))

def feval_RMSPE(preds, lgbm_train):
    """Custom evaluation function passed to ``lgb.train`` via ``feval``.

    Returns the tuple ``('RMSPE', value, False)`` where ``value`` is the RMSPE
    of ``preds`` against the dataset's labels rounded to 5 decimals; the
    trailing ``False`` marks the metric as lower-is-better.
    """
    score = rmspe(y_true = lgbm_train.get_label(), y_pred = preds)
    return 'RMSPE', round(score, 5), False

# LightGBM hyper-parameters. Training minimises (weighted) RMSE; RMSPE is
# reported additionally through the custom feval.
params = {
    'objective': 'rmse',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'early_stopping_rounds': 100,
    'learning_rate': 0.1,
    'lambda_l1': 0.4311,
    'lambda_l2': 0.3897,
    'num_leaves': 77,
    'feature_fraction': 0.45,
    'bagging_fraction': 0.793,
    'bagging_freq': 1,
    'min_child_samples': 96,
  }

kf = KFold(n_splits=10, random_state=19901028, shuffle=True)
n_folds = kf.get_n_splits()          # used to average the fold scores
oof_pred = np.zeros(len(X))          # out-of-fold prediction for every training row
models = []                          # one fitted model per fold
scores = 0.0                         # mean of the per-fold validation RMSPE

gain_importance_list = []
split_importance_list = []

for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y)):

    print("Fold :", fold+1)

    # KFold yields positional indices, so select rows by position (iloc)
    # instead of relying on X / y having a default 0..n-1 index.
    X_train, y_train = X.iloc[trn_idx], y.iloc[trn_idx]
    X_valid, y_valid = X.iloc[val_idx], y.iloc[val_idx]

    # Weighting each row by 1 / target^2 turns the squared error into the
    # squared percentage error, i.e. the RMSE objective optimises RMSPE.
    train_weights = 1/np.square(y_train)
    lgbm_train = lgb.Dataset(X_train,y_train,weight = train_weights)

    valid_weights = 1/np.square(y_valid)
    lgbm_valid = lgb.Dataset(X_valid,y_valid,reference = lgbm_train,weight = valid_weights)

    # model
    model = lgb.train(params=params,
                      train_set=lgbm_train,
                      valid_sets=[lgbm_train, lgbm_valid],
                      num_boost_round=5000,
                      feval=feval_RMSPE,
                      verbose_eval=100,
                      categorical_feature = ['stock_id']
                     )

    # validation: predict the held-out fold and store it as out-of-fold output
    y_pred = model.predict(X_valid, num_iteration=model.best_iteration)
    oof_pred[val_idx] = y_pred

    RMSPE = round(rmspe(y_true = y_valid, y_pred = y_pred),3)
    print(f'Performance of the　prediction: , RMSPE: {RMSPE}')

    # keep scores and models
    scores += RMSPE / n_folds
    models.append(model)
    print("*" * 100)

    # --- calc model feature importance ---
    if DO_FEAT_IMP:
        feature_names = X_train.columns.values.tolist()
        gain_importance_df = calc_model_importance(
            model, feature_names=feature_names, importance_type='gain')
        gain_importance_list.append(gain_importance_df)

        split_importance_df = calc_model_importance(
            model, feature_names=feature_names, importance_type='split')
        split_importance_list.append(split_importance_df)

# Out-of-fold result: every training row predicted by the model that did not
# see it. `rmspe_score` was previously printed without ever being defined,
# which raised NameError only after all folds had finished training.
oof = pd.DataFrame({'target': y.values, 'pred': oof_pred})
rmspe_score = round(rmspe(y_true = oof['target'], y_pred = oof['pred']), 5)
print(f'Our out of folds RMSPE is {rmspe_score}')

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 out of   1 | elapsed:    0.1s finished


Fold : 1
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18472
[LightGBM] [Info] Number of data points in the train set: 386038, number of used features: 73
[LightGBM] [Info] Start training from score 0.001803
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.000481755	training's RMSPE: 0.22273	valid_1's rmse: 0.000500173	valid_1's RMSPE: 0.23361
[200]	training's rmse: 0.000467321	training's RMSPE: 0.21606	valid_1's rmse: 0.000500889	valid_1's RMSPE: 0.23394
Early stopping, best iteration is:
[120]	training's rmse: 0.000478285	training's RMSPE: 0.22113	valid_1's rmse: 0.000499496	valid_1's RMSPE: 0.23329
Performance of the　prediction: , RMSPE: 0.233
****************************************************************************************************
Fold : 2
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18472
[LightGBM] [Info] Number of data points in the train set: 38603

In [11]:
scores

0.2343