In [1]:
import sys
import os
import csv
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
from copy import deepcopy
from datetime import datetime as dt

from sklearn.decomposition import PCA, FastICA

#metrics
from sklearn.metrics import roc_auc_score, accuracy_score

import time

def timeit(func):
    def timed(*args, **kwargs):
        ts = time.time()
        print('Function', func.__name__, 'running...')

        result = func(*args, **kwargs)
        te = time.time()
        shape = result[0].shape
        print('Completed. Dataframe shape: ', shape, 'Time elapsed:', round((te -ts),1), 's')
        print()
        return result
    return timed

sns.set_style('darkgrid')
pd.set_option('display.float_format', lambda x: '%.3f' % x)

SEED = 26
np.random.seed(SEED)

### Read In Data

In [2]:
%%time
# dtype_dict = {}
# for f in range(0,130):
#     dtype_dict[f'feature_{f}'] = 'float32'
train = pd.read_csv('../inputs/train.csv')

CPU times: user 54.6 s, sys: 3.17 s, total: 57.8 s
Wall time: 57.8 s


In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2390491 entries, 0 to 2390490
Columns: 138 entries, date to ts_id
dtypes: float64(135), int64(3)
memory usage: 2.5 GB


In [4]:
%%time
train_fold_ind = pd.read_csv('../preprocessed/train_fold_ind.csv')

CPU times: user 248 ms, sys: 8 ms, total: 256 ms
Wall time: 257 ms


In [5]:
df = pd.concat([train,train_fold_ind[['fold']]],axis=1)
del train

### Create Feature Functions

In [6]:
@timeit
def create_daily_return_lags(df, lags, na_method):
    
    df_daily = df[['date','fold','resp']].groupby(['date','fold']).mean().reset_index()
    
    features = []
    for lag in tqdm(lags):
        col_name = '_'.join(['resp_daily_lag',str(lag)])
        features.append(col_name)
        df_daily[col_name] = df_daily.groupby(['fold'])['resp'].transform(lambda x: x.shift(lag))
        
    df_daily = df_daily.drop(['resp','fold'], axis=1)
    df = pd.merge(df, df_daily, on=['date'], how='left')
    
    if na_method == -1:
        df[features] = df[features].fillna(-1)
    elif na_method == 'drop':
        df[features] = df[features].dropna()
    
    return df, features

In [7]:
@timeit
def create_last_trade_return_lags(df, lags, na_method):
    
    features = []
    for lag in tqdm(lags):
        col_name = '_'.join(['resp_lag',str(lag)])
        features.append(col_name)
        df[col_name] = df.groupby(['fold'])['resp'].transform(lambda x: x.shift(lag))
    
    if na_method == -1:
        df[features] = df[features].fillna(-1)
    elif na_method == 'drop':
        df[features] = df[features].dropna()
    
    return df, features

In [8]:
@timeit
def running_daily_total_trades(df):
            
    df['running_total_trades'] = (df[['date','fold','ts_id']]
                                  .sort_values('ts_id')
                                  .groupby(['date','fold'])
                                  .transform(lambda x: x.expanding().count())
                                  .astype('int64')
                                  .values)
    
    return df, ['running_total_trades']

In [9]:
@timeit
def create_day_features(df):
    '''
    create day of week features assuming starting monday and 5 day trading week
    '''
    df['dow'] = df['ts_id']%5
    return df, ['dow']

In [10]:
@timeit
def create_target(df, threshold=0):
    df['target'] = df['resp'].apply(lambda x: 1 if x > threshold else 0)
    return df, 'target'

In [11]:
@timeit
def create_pca_features(df, columns, n_components, col_prefix):
    
    df_train = df[df['fold']=='train_fold'][columns]
    df_train = df_train.fillna(0)
    
    pca = PCA(n_components=n_components)
    pca.fit(df_train.values)
    
    features = ["_".join([col_prefix,str(i)]) for i in range(1,n_components+1)]
    
    pca_df = pd.DataFrame(pca.transform(df[columns].fillna(0).values), 
                          columns=features)
    
    df = pd.concat([df, pca_df],axis=1)
    
    return df, features

### Run Pipeline

In [12]:
%%time

original_features = [f"feature_{f}" for f in range(0,130)]

df, daily_lag_features = create_daily_return_lags(df, 
                                                  lags=np.arange(1,30,1),
                                                  na_method=-1)

df, lag_features = create_last_trade_return_lags(df, 
                                                 lags=np.arange(1,30,1),
                                                 na_method=-1)

df, orig_pca_features = create_pca_features(df,
                                            columns=original_features,
                                            n_components=15,
                                            col_prefix='orginal_pca')

df, lag_pca_features = create_pca_features(df,
                                           columns=lag_features+daily_lag_features,
                                           n_components=15,
                                           col_prefix='lag_pca')

df, dow_features = create_day_features(df)
df, running_total_trades_features = running_daily_total_trades(df)
df, target_col = create_target(df, threshold=0)

Function create_daily_return_lags running...


HBox(children=(FloatProgress(value=0.0, max=29.0), HTML(value='')))


Completed. Dataframe shape:  (2390491, 168) Time elapsed: 7.8 s

Function create_last_trade_return_lags running...


HBox(children=(FloatProgress(value=0.0, max=29.0), HTML(value='')))


Completed. Dataframe shape:  (2390491, 197) Time elapsed: 14.0 s

Function create_pca_features running...
Completed. Dataframe shape:  (2390491, 212) Time elapsed: 22.0 s

Function create_pca_features running...
Completed. Dataframe shape:  (2390491, 227) Time elapsed: 17.2 s

Function create_day_features running...
Completed. Dataframe shape:  (2390491, 228) Time elapsed: 0.0 s

Function running_daily_total_trades running...
Completed. Dataframe shape:  (2390491, 229) Time elapsed: 4.6 s

Function create_target running...
Completed. Dataframe shape:  (2390491, 230) Time elapsed: 0.7 s

CPU times: user 1min 29s, sys: 21.9 s, total: 1min 51s
Wall time: 1min 6s


In [13]:
# fill main features missing values
df[original_features] = df[original_features].fillna(0) # okay as scaled to 0 mean

In [14]:
%%time
df = df.drop(['resp_1','resp_2','resp_3','resp_4'], axis=1)
df = df.set_index(['date','ts_id','weight','resp'])

CPU times: user 5.21 s, sys: 3.46 s, total: 8.67 s
Wall time: 8.66 s


In [15]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,lag_pca_9,lag_pca_10,lag_pca_11,lag_pca_12,lag_pca_13,lag_pca_14,lag_pca_15,dow,running_total_trades,target
date,ts_id,weight,resp,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
0,0,0.0,0.006,1,-1.873,-2.191,-0.474,-0.323,0.015,-0.002,0.0,0.0,-0.99,...,0.074,-0.108,-0.119,-0.256,-0.052,0.81,-0.038,0,1,1
0,1,16.674,-0.01,-1,-1.35,-1.705,0.068,0.028,0.194,0.138,0.0,0.0,-0.152,...,0.073,-0.113,-0.118,-0.243,-0.048,0.793,-0.023,1,2,0
0,2,0.0,0.024,-1,0.813,-0.256,0.806,0.4,-0.614,-0.355,0.0,0.0,5.448,...,0.073,-0.119,-0.117,-0.232,-0.045,0.777,-0.006,2,3,1
0,3,0.0,-0.003,-1,1.174,0.345,0.067,0.009,-1.006,-0.676,0.0,0.0,4.508,...,0.072,-0.125,-0.117,-0.22,-0.039,0.761,0.012,3,4,0
0,4,0.139,-0.003,1,-3.172,-3.093,-0.162,-0.128,-0.195,-0.144,0.0,0.0,2.683,...,0.071,-0.131,-0.116,-0.21,-0.033,0.748,0.025,4,5,0


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2390491 entries, (0, 0, 0.0, 0.006270362237518486) to (499, 2390490, 0.0, -0.0019046214579785984)
Columns: 222 entries, feature_0 to target
dtypes: float64(217), int64(4), object(1)
memory usage: 4.3+ GB


In [17]:
date = str(dt.today().date()).replace("-","_")
df.to_parquet(f"../preprocessed/all_features_{date}.csv",
              engine='fastparquet',
              compression='gzip')