In [None]:
import pandas as pd
import numpy as np
import os
from datetime import date, datetime, timedelta
from pathlib import Path

import copy
import pyarrow.parquet as pq

## define data paths

In [None]:
# Root folder for the input files; the book/trade parquet paths below are derived from it.
data_path = Path('../data')
# Shown as the cell output: True when the folder is present.
data_path.exists()

In [None]:
# Training order-book data, stored as a directory with one sub-directory per partition.
train_book_path = data_path.joinpath('book_train.parquet')
# os.walk yields nothing for a missing directory, so next() needs a default;
# without it StopIteration is raised before the exists() check below can report the problem.
book_train_sub = next(os.walk(train_book_path), (None, [], []))[1]
print(len(book_train_sub), train_book_path.exists())

In [None]:
# Test order-book data; the sub-directory names are matched against f'stock_id={...}' further down.
test_book_path = data_path.joinpath('book_test.parquet')
# os.walk yields nothing for a missing directory, so next() needs a default;
# without it StopIteration is raised before the exists() check below can report the problem.
book_test_sub = next(os.walk(test_book_path), (None, [], []))[1]
print(len(book_test_sub), test_book_path.exists())

In [None]:
# Training trade data, stored as a directory with one sub-directory per partition.
train_trade_path = data_path.joinpath('trade_train.parquet')
# os.walk yields nothing for a missing directory, so next() needs a default;
# without it StopIteration is raised before the exists() check below can report the problem.
trade_train_sub = next(os.walk(train_trade_path), (None, [], []))[1]
print(len(trade_train_sub), train_trade_path.exists())

In [None]:
# Test trade data; the sub-directory names are matched against f'stock_id={...}' further down.
test_trade_path = data_path.joinpath('trade_test.parquet')
# os.walk yields nothing for a missing directory, so next() needs a default;
# without it StopIteration is raised before the exists() check below can report the problem.
trade_test_sub = next(os.walk(test_trade_path), (None, [], []))[1]
print(len(trade_test_sub), test_trade_path.exists())

## define functions

In [None]:
# Setup: feature names, their short aliases, and two stock-id lists.

# Names of the ten features, in the order cal_features emits them
# (five computed on 'wap', then five computed on trade 'price').
final_feats = [
    'wap__absolute_sum_of_changes',
    'wap__fft_aggregated__aggtype_"variance"',
    'wap__spkt_welch_density__coeff_5',
    'wap__spkt_welch_density__coeff_8',
    'wap__change_quantiles__f_agg_"var"__isabs_True__qh_1.0__ql_0.2',
    'price__variation_coefficient',
    'price__mean_abs_change',
    'price__change_quantiles__f_agg_"mean"__isabs_True__qh_0.8__ql_0.6',
    'price__change_quantiles__f_agg_"var"__isabs_True__qh_1.0__ql_0.6',
    'price__absolute_sum_of_changes',
]

# Short aliases feat1..feat10, assigned in the same order as final_feats.
rename_dict = {
    long_name: f'feat{position}'
    for position, long_name in enumerate(final_feats, start=1)
}

# Stock-id lists; only low_corr_list is used by the training loop in this notebook.
high_corr_list = [
    10, 11, 13, 14, 15, 23, 29, 32, 35, 42, 43, 50, 52, 62,
    69, 70, 72, 73, 76, 87, 93, 95, 101, 108, 109, 119, 122, 126,
]
low_corr_list = [
    2, 13, 19, 20, 21, 29, 30, 34, 39, 41, 43, 46,
    47, 53, 58, 64, 67, 68, 69, 80, 81, 99, 110,
]


In [None]:
# Weighted price computed over the full order-book data frame.
def cal_wap(df_raw):
    """Return the level-1 weighted price of every row as a NumPy array.

    Each side's price is weighted by the size quoted on the opposite side:
    (bid_price1 * ask_size1 + ask_price1 * bid_size1) / (bid_size1 + ask_size1).
    """
    bid_px, ask_px = df_raw['bid_price1'], df_raw['ask_price1']
    bid_sz, ask_sz = df_raw['bid_size1'], df_raw['ask_size1']

    cross_weighted = bid_px * ask_sz + ask_px * bid_sz
    total_size = bid_sz + ask_sz
    return (cross_weighted / total_size).values

In [None]:
#get tsfresh features
# NOTE(review): `feature_calculators` is not imported in any visible cell; presumably it is
# `tsfresh.feature_extraction.feature_calculators` -- confirm it is imported before this cell runs.

def _values_by_time_id(df, column):
    """Group `column` by time_id once: {time_id: ndarray of values in original row order}."""
    grouped = df.groupby('time_id', sort=False)[column]
    return {time_id: series.values for time_id, series in grouped}


def _wap_features(values):
    """Return [f1..f5] computed on one time_id's wap values, or five NaNs when there are none."""
    if values is None or len(values) == 0:
        return [np.nan] * 5
    f1 = feature_calculators.absolute_sum_of_changes(values)
    # These calculators return an iterable of pairs; the wanted number sits at [0][1].
    f2 = list(feature_calculators.fft_aggregated(values, param=[{"aggtype": "variance"}]))[0][1]
    f3 = list(feature_calculators.spkt_welch_density(values, param=[{"coeff": 5}]))[0][1]
    f4 = list(feature_calculators.spkt_welch_density(values, param=[{"coeff": 8}]))[0][1]
    f5 = feature_calculators.change_quantiles(values, f_agg='var', isabs=True, qh=1.0, ql=0.2)
    return [f1, f2, f3, f4, f5]


def _price_features(values):
    """Return [f6..f10] computed on one time_id's trade prices, or five NaNs when there are none."""
    if values is None or len(values) == 0:
        return [np.nan] * 5
    f6 = feature_calculators.variation_coefficient(values)
    f7 = feature_calculators.mean_abs_change(values)
    f8 = feature_calculators.change_quantiles(values, f_agg='mean', isabs=True, qh=0.8, ql=0.6)
    f9 = feature_calculators.change_quantiles(values, f_agg='var', isabs=True, qh=1.0, ql=0.6)
    f10 = feature_calculators.absolute_sum_of_changes(values)
    return [f6, f7, f8, f9, f10]


def cal_features(stock_id, book_path, trade_path):
    """Compute the ten features for every time_id of one stock.

    Parameters
    ----------
    stock_id : value used to build the partition name f'stock_id={stock_id}'.
    book_path, trade_path : pathlib.Path-like roots containing that partition.

    Returns
    -------
    list of rows [stock_id, time_id, f1, ..., f10], one per time_id found in the
    book data, in order of first appearance. f1-f5 come from the book 'wap'
    series, f6-f10 from the trade 'price' series; a time_id with no trades gets
    NaN for f6-f10.
    """
    #book features
    df_book = pq.read_table(book_path.joinpath(f'stock_id={stock_id}')).to_pandas()
    df_book['wap'] = cal_wap(df_book)

    df_trade = pq.read_table(trade_path.joinpath(f'stock_id={stock_id}')).to_pandas()

    # Split both frames by time_id a single time instead of re-scanning the whole
    # frame with a boolean mask for every time_id.
    wap_by_time = _values_by_time_id(df_book, 'wap')
    price_by_time = _values_by_time_id(df_trade, 'price')

    val_list = []
    for time_id in df_book['time_id'].unique().tolist():
        row = [stock_id, time_id]
        row.extend(_wap_features(wap_by_time.get(time_id)))
        row.extend(_price_features(price_by_time.get(time_id)))
        val_list.append(row)

    return val_list

## prepare train data

In [None]:
# Gather the feature rows of every stock in low_corr_list into one flat list.
all_list = []
for stock_id in low_corr_list:
    all_list += cal_features(stock_id, train_book_path, train_trade_path)

In [None]:
# One row per (stock_id, time_id); the remaining columns follow the order of final_feats.
df_X = pd.DataFrame(all_list, columns=['stock_id', 'time_id', *final_feats])

In [None]:
# Derive the location from data_path instead of repeating the '../data' prefix,
# so changing the data root in one place moves every input.
df_train = pd.read_csv(data_path.joinpath('train.csv'))

In [None]:
# Attach the computed features to the rows of train.csv; the shapes are printed
# before the join, after it, and after discarding incomplete rows.
print(df_train.shape, df_X.shape)
df_train = df_train.merge(df_X, how='inner', on=['stock_id', 'time_id'])
print(df_train.shape)
# Drop every row that still has at least one missing value.
df_train = df_train.dropna(how='any')
print(df_train.shape)

## prepare test data

In [None]:
# Derive the location from data_path instead of repeating the '../data' prefix,
# so changing the data root in one place moves every input.
df_test = pd.read_csv(data_path.joinpath('test.csv'))

In [None]:
# Preview only the first rows rather than dumping the whole frame into the output.
df_test.head()

In [None]:
# Compute features only for test stocks that have both a trade and a book partition.
test_list = []
for stock_id in df_test['stock_id'].unique().tolist():
    partition = f'stock_id={stock_id}'
    if partition in trade_test_sub and partition in book_test_sub:
        test_list += cal_features(stock_id, test_book_path, test_trade_path)

X_test = pd.DataFrame(test_list, columns=['stock_id', 'time_id', *final_feats])

In [None]:
print(df_test.shape)
# Left join: keep every row of test.csv even when no features could be computed for it
# (missing partition or time_id). Those rows get NaN features, which impute_test_data
# fills afterwards; an inner join would silently drop them from the submission.
df_test = df_test.merge(X_test, on=['stock_id', 'time_id'], how='left')
print(df_test.shape)

## preprocessing data

In [None]:
def impute_test_data(X_train, X_test, features):
    """Fill missing test values with the training mean of the same feature.

    Parameters
    ----------
    X_train : DataFrame supplying the per-feature means.
    X_test : DataFrame to fill; it is modified in place and also returned.
    features : iterable of column names present in both frames.

    Returns
    -------
    The same X_test object, with NaNs in `features` replaced.
    """
    for feat in features:
        if X_test[feat].isna().any():
            # Assign the filled column back instead of calling fillna(inplace=True) on
            # X_test[feat]: that is chained assignment, which warns on recent pandas and
            # leaves X_test unchanged once copy-on-write is enabled.
            X_test[feat] = X_test[feat].fillna(X_train[feat].mean())
    return X_test

In [None]:
def scale_data(X_train, X_test, features, scaler=5):
    """Clip each feature to mean +/- scaler * std of the training column.

    Despite the name, nothing is rescaled: values beyond the band are replaced
    by the band edge. The band is computed from X_train before any clipping and
    applied to both frames, which are modified in place and returned.
    """
    for col in features:
        center = X_train[col].mean()
        spread = X_train[col].std()
        upper = center + scaler * spread
        lower = center - scaler * spread

        # Train first, then test; the upper edge is applied before the lower one.
        for frame in (X_train, X_test):
            frame.loc[frame[col] > upper, col] = upper
            frame.loc[frame[col] < lower, col] = lower

    return X_train, X_test

In [None]:
# Move the (stock_id, time_id) key into the index of both frames.
# Not re-runnable: a second execution fails because the key columns are gone.
df_train = df_train.set_index(keys=['stock_id', 'time_id'])
df_test = df_test.set_index(keys=['stock_id', 'time_id'])

In [None]:
# Fill gaps in the test features from the training means; a deep copy is passed
# so df_test itself is not modified.
X_test = impute_test_data(
    df_train[final_feats],
    df_test[final_feats].copy(),
    final_feats,
)

In [None]:
# scale_data assigns into the frames it receives, so hand it explicit copies:
# a bare column-subset selection of df_train can raise SettingWithCopyWarning there.
X_train, X_test = scale_data(
    df_train[final_feats].copy(),
    X_test[final_feats].copy(),
    final_feats,
    scaler=5,
)

In [None]:
# Swap the long feature names for their short aliases (feat1..feat10).
X_train = X_train.rename(columns=rename_dict)
X_test = X_test.rename(columns=rename_dict)

In [None]:
# The model is fit on the natural log of the target (a one-column DataFrame);
# predictions are mapped back with np.exp before the submission is written.
y_train = np.log(df_train[['target']])

## make models

In [None]:
import lightgbm as lgb
def make_lgb_preds(X_train, y_train, X_test, num_round=100, params=None, verbose=False):
    """Train a LightGBM model and predict on X_test.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to lgb.Dataset.
    X_test : features to predict on.
    num_round : number of boosting rounds.
    params : dict of LightGBM parameters; None means an empty dict.
    verbose : kept for backward compatibility and otherwise unused. No validation
        set is given, so there is nothing to log, and the `verbose_eval` argument
        it used to feed was removed from lgb.train in LightGBM 4.0.

    Returns
    -------
    (y_preds, df_scores, tree_model): the predictions, a DataFrame with one row
    per training column ('feature', 'gain'), and the trained booster.
    """
    # Avoid a mutable default argument shared between calls.
    if params is None:
        params = {}

    dtrain = lgb.Dataset(X_train, y_train)
    tree_model = lgb.train(params,
                dtrain,
                num_boost_round=num_round)

    y_preds = tree_model.predict(X_test, num_iteration=tree_model.best_iteration)
    scores = tree_model.feature_importance(importance_type='gain', iteration=tree_model.best_iteration)
    df_scores = pd.DataFrame({'feature':list(X_train.columns), 'gain': list(scores)})

    return y_preds, df_scores, tree_model

In [None]:
# Candidate LightGBM configurations; 'num_boost_round' is split off before training.
params_list = [
    {
        'bagging_fraction': 0.78,
        'bagging_freq': 56,
        'boosting': 'gbdt',
        'feature_fraction': 0.69,
        'lambda_l1': 6,
        'lambda_l2': 6,
        'learning_rate': 0.01,
        'max_bin': 260,
        'max_depth': 11,
        'max_leaves': 160,
        'metric': 'mae',
        'min_data_in_bin': 73,
        'min_data_in_leaf': 61,
        'nthread': 4,
        'num_boost_round': 744,
        'objective': 'regression_l2',
        'seed': 1234,
    },
]

In [None]:
# Work on a deep copy so the edits made to `params` below leave params_list untouched.
params = copy.deepcopy(params_list[0])

In [None]:
# Take the round count out of the dict; it is passed to make_lgb_preds separately.
num_boost_round = params.pop('num_boost_round')
params['verbose'] = -1

In [None]:
# Fit on the training features and predict for the test rows.
y_preds, df_scores, tree_model = make_lgb_preds(
    X_train,
    y_train,
    X_test,
    num_round=num_boost_round,
    params=params,
    verbose=False,
)

In [None]:
# Raw model output; it is on the log scale because the model was fit on np.log(target).
df_test['pred']=y_preds

In [None]:
# Undo the log transform that was applied to the training target.
df_test['target']=np.exp(df_test['pred'])

In [None]:
# Write only the row identifier and the prediction, without the index.
df_test.to_csv('submission.csv', columns=['row_id', 'target'], index=False)