In [2]:
import pandas as pd
import pathlib
from factor_util import *
from joblib import dump, load
from fredapi import Fred

def hullMA(x, n = 50):
    sma1 = x.rolling(n,  min_periods = 1).mean()
    sma2 = x.rolling(int(n/2),  min_periods = 1).mean()
    out = (2 * sma1 - sma2).rolling(int(np.sqrt(n)), min_periods = 1).mean()
    return x - out

def calculate_corr(df, ta_features = None, columns = None, groupby = True):
    if ta_features is None:
        skip_features = ['returns_5m', 'open_time', 'close_time', 'target_15m', 'ignore', 'token']
        features = [x for x in df.columns if x not in skip_features]
        ta_features = [x for x in df.columns if x not in skip_features and x not in columns]
    if groupby:
        tgt_corr = df.groupby(['token'])[ta_features + ['target_15m']].corr()
    else:
        tgt_corr = df[ta_features + ['target_15m']].corr()
    return tgt_corr

def calculate_vol_price_corr(df, windows = [5, 15, 30, 60, 120]):
    for window in windows:
        df[f'vol_price_corr_{window}'] = df['close'].rolling(window, min_periods = 1).corr(df['volume'])
    return df

def get_cols_for_corr(df, str_idx):
    return df.columns[df.columns.str.startswith(str_idx)].tolist()

def transform_time(df):
    day = 24 * 60
    hour_float = df['open_time'].dt.hour + df['open_time'].dt.minute/60
    df['sin_hour'] = np.sin(2.0 * np.pi * hour_float/24)
    df['cos_hour'] = np.cos(2.0 * np.pi * hour_float/24)
    df['Day_sin'] = np.sin(df['open_time'].dt.day * (2 * np.pi / 31))
    df['Day_cos'] = np.cos(df['open_time'].dt.day * (2 * np.pi / 31))
    df['month_sin'] = np.sin(df['open_time'].dt.month * (2 * np.pi / 12))
    df['month_cos'] = np.cos(df['open_time'].dt.month * (2 * np.pi / 12))
    return df

def calc_sma_diff_test(close, timeperiod_short, timeperiod_long):
    res_short = close.rolling(window = timeperiod_short, min_periods = 1).mean()
    res_long = close.rolling(window = timeperiod_long, min_periods = 1).mean()
    res = (res_long - res_short) / res_long
    return res

def load_metrics_data(ticker):
    df_metrics = pd.read_feather(f'../data/processed_metrics/{ticker}_1m.feather')
    df_metrics['create_time'] = pd.to_datetime(df_metrics['create_time'], format = 'mixed')
    return df_metrics

def gen_cross_features(x, lag = 60):
    """
    calculate cross features with other assets if any
    :param x:
    :param lag:
    :return:
    """
    lag_arr = np.ones(lag)
    conv_arr = np.convolve(x, lag_arr / lag, mode = 'valid')
    app_arr = np.append(conv_arr, np.ones(lag - 1))
    roll_arr = np.roll(app_arr, lag - 1)
    div_arr = np.log(x / roll_arr)
    return div_arr

def log_return_np(x):
    return np.log(x / x.shift(60)).fillna(0)

In [3]:
directory = '../data/processed_data/'
files = pathlib.Path(directory).glob('*.feather')
dfs = {}
universe = ['BTCUSDT']

df_btc = pd.read_feather('../data/processed_data/BTCUSDT_5m_spot.feather')
df_btc = df_btc.sort_values(by = ['open_time'], ignore_index = True)

In [40]:
lag = 60
df = df_btc.copy()
df[f'log_close/mean_{lag}'] = gen_cross_features(df['close'], lag = lag)
df[f'log_return_{lag}'] = log_return_np(df['close'])
df['mid_diff'] = (df['close'] - df['open']) / ((df['high'] - df['low']) + 0.001)

sma_lags = [5, 15, 30, 60, 120, 240, 800]
for sma_lag in sma_lags:
    df[f'sma{sma_lag}'] = (df['close'].rolling(sma_lag, min_periods = 1).mean())
    df[f'sma{sma_lag}'] = (df[f'sma{sma_lag}'] / df['close']) - 1
    df[f'return{sma_lag}'] = df['close'].pct_change(sma_lag)
    df[f'volume_change_{sma_lag}'] = df['volume'].pct_change(sma_lag)

hull_lags = [76, 240, 800]
for hull_lag in hull_lags:
    df[f'hull_{hull_lag}'] = hullMA(df['close'], hull_lag)

fibo_list = [55, 210, 340, 890, 3750]
for num in fibo_list:
    df[f'log_return_{num}'] = np.log(df['close']).diff().rolling(num, min_periods = 1).mean().ffill().bfill()

momentum_windows = [15, 30, 60, 120, 240]
for window in momentum_windows:
    df[f'mom_roc_{window}'] = df['close'].shift(-window)

df = transform_time(df)
sma_diff_windows = [(12 * np.power(4, i), 24 * np.power(4,i)) for i in range(1, 6)]
for short_win, long_win in sma_diff_windows:
    df[f'sma_diff_{short_win}'] = calc_sma_diff_test(df['close'], short_win, long_win)

df[f'sma_diff_vol_{12*4*4}'] = calc_sma_diff_test(df['volume'], 12*4*4, 24*4*4)

In [None]:
target_intervals = [30, 60, 90, 120, 150, 180, 210, 240,
                    270, 300, 330, 360, 390, 420, 450, 480]
interval = 5
corr = {}
df = df.sort_values(by = ['open_time'], ignore_index = True)
skip_features = ['returns_5m', 'open_time', 'quote_asset_volume', 'number_of_trades',
                 'close_time', 'target_15m', 'ignore', 'token',
                 'taker_buy_base_volume', 'taker_buy_quote_asset_volume']

features = [x for x in df.columns if (x not in skip_features)]
features = [x for x in features if ('target' not in x)]


for target_interval in target_intervals:
    nperiod = target_interval // interval
    label = f'target_{target_interval}m'
    df[f'close_{nperiod}lag'] = df['close'].shift(nperiod).fillna(0.0)
    df[f'target_{target_interval}m'] = df['close'].pct_change(-nperiod)
    corr[label] = df[features + [label]].corr()


In [34]:
for k, v in corr.items():
    v = v[k].abs().sort_values(ascending = False).iloc[1:]
    v.to_csv(f'../output/feature_corr/{k}_feature.csv')

In [37]:
df.to_feather('../data/df_btc_with_features_5m_spot.feather')

In [9]:
import pandas as pd
import pathlib
import numpy as np
# from factor_util import *
import pandas as pd
import pathlib
# from factor_util import *
from joblib import dump, load

def hullMA(x, n = 50):
    sma1 = x.rolling(n,  min_periods = 1).mean()
    sma2 = x.rolling(int(n/2),  min_periods = 1).mean()
    out = (2 * sma1 - sma2).rolling(int(np.sqrt(n)), min_periods = 1).mean()
    return x - out

def calculate_corr(df, ta_features = None, columns = None, groupby = True):
    if ta_features is None:
        skip_features = ['returns_5m', 'open_time', 'close_time', 'target_15m', 'ignore', 'token']
        features = [x for x in df.columns if x not in skip_features]
        ta_features = [x for x in df.columns if x not in skip_features and x not in columns]
    if groupby:
        tgt_corr = df.groupby(['token'])[ta_features + ['target_15m']].corr()
    else:
        tgt_corr = df[ta_features + ['target_15m']].corr()
    return tgt_corr

def calculate_vol_price_corr(df, windows = [5, 15, 30, 60, 120]):
    for window in windows:
        df[f'vol_price_corr_{window}'] = df['close'].rolling(window).corr(df['volume'])
    return df

def get_cols_for_corr(df, str_idx):
    return df.columns[df.columns.str.startswith(str_idx)].tolist()

def transform_time(df):
    day = 24 * 60
    hour_float = df['open_time'].dt.hour + df['open_time'].dt.minute/60
    df['sin_hour'] = np.sin(2.0 * np.pi * hour_float/24)
    df['cos_hour'] = np.cos(2.0 * np.pi * hour_float/24)
    df['Day_sin'] = np.sin(df['open_time'].dt.day * (2 * np.pi / 31))
    df['Day_cos'] = np.cos(df['open_time'].dt.day * (2 * np.pi / 31))
    df['month_sin'] = np.sin(df['open_time'].dt.month * (2 * np.pi / 12))
    df['month_cos'] = np.cos(df['open_time'].dt.month * (2 * np.pi / 12))
    return df

def calc_sma_diff_test(close, timeperiod_short, timeperiod_long):
    res_short = close.rolling(window = timeperiod_short).mean()
    res_long = close.rolling(window = timeperiod_long).mean()
    res = (res_long - res_short) / res_long
    return res

def load_metrics_data(ticker):
    df_metrics = pd.read_feather(f'../data/processed_metrics/{ticker}_1m.feather')
    df_metrics['create_time'] = pd.to_datetime(df_metrics['create_time'], format = 'mixed')
    return df_metrics

def gen_cross_features(x, lag = 60):
    """
    calculate cross features with other assets if any
    :param x:
    :param lag:
    :return:
    """
    lag_arr = np.ones(lag)
    conv_arr = np.convolve(x, lag_arr / lag, mode = 'valid')
    app_arr = np.append(conv_arr, np.ones(lag - 1))
    roll_arr = np.roll(app_arr, lag - 1)
    div_arr = np.log(x / roll_arr)
    return div_arr

def log_return_np(x):
    return np.log(x / x.shift(60)).fillna(0)

def generate_features(df):
    directory = '../data/processed_data/'

    # df = pd.concat([df_btc, df_eth], axis = 0, ignore_index = True)
    df = df.sort_values(by = ['open_time'], ignore_index = True)
    # calculate next 15min returns (ie: current open_time is 2020-01-01 00:00:00,
    # then return is from 2020-01-01 00:01:00 - 2020-01-01 00:16:00

    sma_lags = [5, 10, 15, 30, 60, 90, 120, 150, 180, 210, 240,
                270, 300, 330, 360, 390, 420, 450, 480, 510, 540, 570, 600, 660, 720]
    for lag in sma_lags:
        df[f'log_close/mean_{lag}'] = gen_cross_features(df['close'], lag=lag)
        df[f'log_return_{lag}'] = log_return_np(df['close'])

    df['mid_diff'] = (df['close'] - df['open']) / ((df['high'] - df['low']) + 0.001)



    sma_lags = [5, 10, 15, 30, 60, 90, 120, 150, 180, 210, 240,
                270, 300, 330, 360, 390, 420, 450, 480, 510, 540, 570, 600, 660, 720]
    for sma_lag in sma_lags:
        df[f'sma{sma_lag}'] = (df['close'].rolling(sma_lag).mean())
        df[f'sma{sma_lag}'] = (df[f'sma{sma_lag}'] / df['close']) - 1
        df[f'return{sma_lag}'] = df['close'].pct_change(sma_lag)
        df[f'volume_change_{sma_lag}'] = df['volume'].pct_change(sma_lag)

    hull_lags = [15, 30, 60, 90, 120, 150, 180, 210, 240,
                270, 300, 330, 360, 390, 420, 450, 480, 510, 540, 570, 600, 660, 720]
    for hull_lag in hull_lags:
        df[f'hull_{hull_lag}'] = hullMA(df['close'], hull_lag)

    fibo_list = [15, 30, 60, 90, 120, 150, 180, 210, 240,
                270, 300, 330, 360, 390, 420, 450, 480, 510, 540, 570, 600, 660, 720]
    for num in fibo_list:
        df[f'log_return_{num}'] = np.log(df['close']).diff().rolling(num).mean().ffill().bfill()


    # momentum_windows = [15, 30, 60, 90, 120, 150, 180, 210, 240,
    #                     270, 300, 330, 360, 390, 420, 450, 480,
    #                     510, 540, 570, 600, 660, 720]
    # for window in momentum_windows:
    #     df[f'mom_roc_{window}'] = df['close'].pct_change(window)

    # momentum_windows = [15, 30, 60, 120, 240]
    # for window in momentum_windows:
    #     df_btc[f'mom_adx_{window}'] = talib.ADX(df_btc['high'], df_btc['low'], df_btc['close'], timeperiod = window)
    #     df_btc[f'mom_adxr_{window}'] = talib.ADXR(df_btc['high'], df_btc['low'], df_btc['close'], timeperiod=window)

    df = transform_time(df)
    sma_diff_windows = [(12 * np.power(4, i), 24 * np.power(4, i)) for i in range(1, 10)]
    for short_win, long_win in sma_diff_windows:
        df[f'sma_diff_{short_win}'] = calc_sma_diff_test(df['close'], int(short_win), int(long_win))

    df[f'sma_diff_vol_{12 * 4 * 4}'] = calc_sma_diff_test(df['volume'], 12 * 4 * 4, 24 * 4 * 4)

    return df

# Generate Features using more lags

In [3]:
df = pd.read_feather('../data/df_btc_with_features_5m_spot.feather')

In [4]:
VOL_THRESHOLD = 5  # multiple to winsorise by
HALFLIFE_WINSORISE = 252

df_asset = df.copy()
df_asset = df_asset[
    ~df_asset["close"].isna()
    | ~df_asset["close"].isnull()
    | (df_asset["close"] > 1e-8)  # price is zero
].copy()

In [5]:
def calc_returns(srs: pd.Series, day_offset: int = 1) -> pd.Series:
    returns = srs / srs.shift(day_offset) - 1.0
    return returns

VOL_LOOKBACK = 60  # for ex-ante volatility
VOL_TARGET = 0.15  # 15% volatility target

def calc_daily_vol(daily_returns):
    return (
        daily_returns.ewm(span=VOL_LOOKBACK, min_periods=VOL_LOOKBACK)
        .std()
        .fillna(method="bfill")
    )

def calc_normalised_returns(day_offset):
    return (
        calc_returns(df_asset["srs"], day_offset)
        / df_asset["vol_5m"]
        / np.sqrt(day_offset)
    )

def calc_macd_signal(srs: pd.Series, short_timescale: int, long_timescale: int) -> float:
    def _calc_halflife(timescale):
        return np.log(0.5) / np.log(1 - 1 / timescale)

    macd = (
        srs.ewm(halflife=_calc_halflife(short_timescale)).mean()
        - srs.ewm(halflife=_calc_halflife(long_timescale)).mean()
    )
    q = macd / srs.rolling(63).std().fillna(method="bfill")
    return q / q.rolling(252).std().fillna(method="bfill")

In [6]:
# winsorize using rolling 5X standard deviations to remove outliers
df_asset["srs"] = df_asset["close"]
ewm = df_asset["srs"].ewm(halflife=HALFLIFE_WINSORISE)
means = ewm.mean()
stds = ewm.std()
df_asset["srs"] = np.minimum(df_asset["srs"], means + VOL_THRESHOLD * stds)
df_asset["srs"] = np.maximum(df_asset["srs"], means - VOL_THRESHOLD * stds)

In [7]:
df_asset["returns_5m"] = calc_returns(df_asset["srs"])
df_asset["vol_5m"] = calc_daily_vol(df_asset["returns_5m"])


times = [(2, '10m'), (3, '15m'), (6, '30m'), (12, '60m'), (24, '120m'),
         (48, '240m'), (96, '480m'), (192, '720m')]

for x, y in times:
    df_asset[f"norm_return_{y}"] = calc_normalised_returns(x)

trend_combinations = [(8, 24), (16, 48), (32, 96)]
for short_window, long_window in trend_combinations:
    df_asset[f"macd_{short_window}_{long_window}"] = calc_macd_signal(
        df_asset["srs"], short_window, long_window
    )

  daily_returns.ewm(span=VOL_LOOKBACK, min_periods=VOL_LOOKBACK)
  q = macd / srs.rolling(63).std().fillna(method="bfill")
  return q / q.rolling(252).std().fillna(method="bfill")


In [None]:
import mom_trans.changepoint_detection as cpd
#
# cpd.run_module(
#     data, lookback_window_length, output_file_path, start_date, end_date, USE_KM_HYP_TO_INITIALISE_KC
# ) , (12, '60m'), (24, '120m'), (48, '240m')
cpd_time = [(6, '30m')]
df_asset.index = df_asset['open_time']
df_asset['daily_returns'] = df_asset['returns_5m'].copy()

output = cpd.run_module(df_asset, 6, output_csv_file_path = '../data/sample')

2023-11-30 01:05:25.923683: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
