In [105]:
import tscv
import pandas as pd
import talib as ta
import numpy as np

In [71]:
df = pd.read_feather('../data/df_btc_eth_with_features.feather')
cols_to_drop = ['open_time', 'close_time', 'ignore',
                'create_time', 'symbol', 'returns', 'returns_5m',
                'open', 'high', 'low', 'close', 'target_15m', 'token']

In [72]:
df.head()

Unnamed: 0,open_time,open,high,low,close,volume,close_time,quote_asset_volume,number_of_trades,taker_buy_base_asset_volume,...,mom_roc_30,mom_roc_60,mom_roc_120,mom_roc_240,sin_hour,cos_hour,Day_sin,Day_cos,month_sin,month_cos
0,2020-01-01 00:00:00,7189.43,7190.52,7177.0,7182.44,246.092,2020-01-01 00:00:59.999000064,1767430.0,336.0,46.63,...,,,,,0.0,1.0,0.201299,0.97953,0.5,0.866025
1,2020-01-01 00:00:00,129.12,129.12,128.91,128.97,289.04,2020-01-01 00:00:59.999000064,37296.67,18.0,155.107,...,,,,,0.0,1.0,0.201299,0.97953,0.5,0.866025
2,2020-01-01 00:01:00,7182.43,7182.44,7178.75,7179.01,70.909,2020-01-01 00:01:59.999000064,509145.8,140.0,32.597,...,,,,,0.004363,0.99999,0.201299,0.97953,0.5,0.866025
3,2020-01-01 00:01:00,128.95,129.04,128.93,128.94,695.566,2020-01-01 00:01:59.999000064,89717.06,49.0,284.536,...,,,,,0.004363,0.99999,0.201299,0.97953,0.5,0.866025
4,2020-01-01 00:02:00,7179.01,7179.01,7175.25,7177.93,99.42,2020-01-01 00:02:59.999000064,713539.6,148.0,16.311,...,,,,,0.008727,0.999962,0.201299,0.97953,0.5,0.866025


In [73]:
train_features = [x for x in df.columns if (x not in cols_to_drop)]

In [80]:
df_oi = ['sum_open_interest', 'sum_open_interest_value', 'count_toptrader_long_short_ratio',
         'sum_toptrader_long_short_ratio', 'count_long_short_ratio',
         'sum_taker_long_short_vol_ratio', 'taker_buy_base_asset_volume',
         'taker_buy_quote_asset_volume']

In [83]:
object_cols = df[train_features].select_dtypes(include=object).columns
df[object_cols] = df[object_cols].astype(float)

In [97]:
# test with low corr
df['hilbert_transform'] = ta.HT_DCPERIOD(df['close'])
df['HT_DCPHASE'] = ta.HT_DCPHASE(df['close'])
inphase, quadrature = ta.HT_PHASOR(df['close'])
df['inphase'] = inphase
df['quadrature'] = quadrature
sine, leadsine = ta.HT_SINE(df['close'])
df['sine'] = sine
df['leadsine'] = quadrature
df['trend_mode'] = ta.HT_TRENDMODE(df['close'])
df['trend_line'] = ta.HT_TRENDLINE(df['close'])

In [98]:
train_features += ['hilbert_transform', 'HT_DCPHASE', 'inphase',
                   'quadrature', 'sine', 'leadsine', 'trend_mode', 'trend_line']

In [102]:
def normalize_float_columns(df, features):
  float_cols = df[features].select_dtypes(include = [float]).columns
  grouped_df = df.groupby(['token'])
  for col in float_cols:
      df[col] = grouped_df[col].transform(lambda x: (x - x.mean()) / (x.std()))
  return df

In [87]:
lags = [5, 15, 30, 60, 240, 480]
grouped_df = df.groupby(['token'])
for col in df_oi:
    for lag in lags:
        col_name = f'{col}_change_{lag}'
        train_features.append(col_name)
        df[f'{col}_change_{lag}'] = grouped_df[col].transform(lambda x: x.pct_change(lag))

In [106]:
df_train = normalize_float_columns(df, train_features)
df_train[train_features] = df_train[train_features].replace([np.inf, -np.inf], np.nan).fillna(0.0)

In [107]:
df_corr = df.groupby(['token'])[train_features + ['target_15m']].corr()
sample = df_corr['target_15m']
sample

token                                
BTCUSDT  volume                          0.008885
         quote_asset_volume              0.010604
         number_of_trades                0.011652
         taker_buy_base_asset_volume     0.006385
         taker_buy_quote_asset_volume    0.007630
                                           ...   
ETHUSDT  sine                            0.000695
         leadsine                        0.000091
         trend_mode                     -0.000181
         trend_line                     -0.006606
         target_15m                      1.000000
Name: target_15m, Length: 398, dtype: float64