In [1]:
import sys
import os
import csv
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm_notebook as tqdm
from copy import deepcopy

from sklearn.decomposition import PCA

#metrics
from sklearn.metrics import roc_auc_score, accuracy_score

sns.set_style('darkgrid')
pd.set_option('display.float_format', lambda x: '%.3f' % x)

SEED = 26
np.random.seed(SEED)

### Read In Data

In [2]:
%%time
dtype_dict = {}
for f in range(0,130):
    dtype_dict[f'feature_{f}'] = 'float32'
train = pd.read_csv('../inputs/train.csv')

CPU times: user 1min 16s, sys: 6.29 s, total: 1min 23s
Wall time: 2min 59s


In [3]:
%%time
train_fold_ind = pd.read_csv('../preprocessed/train_fold_ind.csv')

CPU times: user 701 ms, sys: 8.18 ms, total: 709 ms
Wall time: 1.35 s


In [4]:
df = pd.concat([train,train_fold_ind[['fold']]],axis=1)
del train

In [5]:
df.head()

Unnamed: 0,date,weight,resp_1,resp_2,resp_3,resp_4,resp,feature_0,feature_1,feature_2,...,feature_122,feature_123,feature_124,feature_125,feature_126,feature_127,feature_128,feature_129,ts_id,fold
0,0,0.0,0.01,0.014,0.009,0.001,0.006,1,-1.873,-2.191,...,1.168,8.314,1.782,14.018,2.653,12.6,2.301,11.446,0,train_fold
1,0,16.674,-0.003,-0.003,-0.007,-0.011,-0.01,-1,-1.35,-1.705,...,-1.179,1.777,-0.915,2.832,-1.417,2.297,-1.305,1.899,1,train_fold
2,0,0.0,0.025,0.028,0.033,0.034,0.024,-1,0.813,-0.256,...,6.116,9.668,5.543,11.672,7.282,10.06,6.638,9.427,2,train_fold
3,0,0.0,-0.005,-0.003,-0.0,-0.0,-0.003,-1,1.174,0.345,...,2.839,0.499,3.034,1.513,4.398,1.266,3.856,1.013,3,train_fold
4,0,0.139,0.001,0.002,-0.001,-0.006,-0.003,1,-3.172,-3.093,...,0.345,4.101,0.614,6.623,0.8,5.233,0.363,3.927,4,train_fold


### Create Features

In [6]:
original_features = [f"feature_{f}" for f in range(0,130)]

In [7]:
def create_daily_return_lags(df, lags):
    df_daily = df[['date','resp']].groupby(['date']).mean().reset_index()
    
    print("Creating daily lag features...")
    for lag in tqdm(lags):
        col_name = '_'.join(['resp_daily_lag',str(lag)])
        df_daily[col_name] = df_daily['resp'].shift(lag)
        
    df = pd.merge(df, df_daily, on=['date'], how='left')
    
    return df

In [8]:
%%time
df = create_daily_return_lags(df, lags=[1,2,3,4,5,6,7])

Creating daily lag features...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for lag in tqdm(lags):


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=7.0), HTML(value='')))


CPU times: user 11.2 s, sys: 10.3 s, total: 21.6 s
Wall time: 22.2 s


In [12]:
df.tail()

Unnamed: 0,date,weight,resp_1,resp_2,resp_3,resp_4,resp_x,feature_0,feature_1,feature_2,...,ts_id,fold,resp_y,resp_daily_lag_1,resp_daily_lag_2,resp_daily_lag_3,resp_daily_lag_4,resp_daily_lag_5,resp_daily_lag_6,resp_daily_lag_7
2390486,499,0.0,0.0,0.0,0.006,0.02,0.015,1,-1.649,-1.17,...,2390486,fold_4,0.0,0.002,0.003,0.001,0.002,0.001,-0.001,-0.002
2390487,499,0.0,0.0,0.0,-0.001,-0.006,-0.005,1,2.433,5.285,...,2390487,fold_4,0.0,0.002,0.003,0.001,0.002,0.001,-0.001,-0.002
2390488,499,0.0,0.0,0.0,0.008,0.025,0.017,1,-0.622,-0.964,...,2390488,fold_4,0.0,0.002,0.003,0.001,0.002,0.001,-0.001,-0.002
2390489,499,0.283,-0.0,-0.0,-0.001,-0.004,-0.002,-1,-1.464,-1.107,...,2390489,fold_4,0.0,0.002,0.003,0.001,0.002,0.001,-0.001,-0.002
2390490,499,0.0,-0.002,-0.002,-0.001,-0.001,-0.002,-1,-1.817,-1.132,...,2390490,fold_4,0.0,0.002,0.003,0.001,0.002,0.001,-0.001,-0.002


In [None]:
def find_null_features(df, threshold, fill_val):
    
    return columns

def find_top_corr_features(df, target, threshold):
    
    return columns

def find_top_skew_features():
    return columns

def create_day_features():
    '''
    create day of week features assuming starting monday and 5 day trading week
    '''
    return df