In [596]:
import numpy as np
import pandas as pd

log_pr_file = './log_price.df'
volu_usd_file = './volume_usd.df'

log_pr = pd.read_pickle(log_pr_file)
volu = pd.read_pickle(volu_usd_file)

daylen = 10

def interpolate(log_pr, volu, window=30):
    log_pr.columns = ['log_pr_%d'%i for i in range(10)]
    volu.columns = ['volu_%d'%i for i in range(10)]

    open_ = log_pr[::window].reindex(log_pr.index).ffill()
    open_.columns = ['open_%d'%i for i in range(10)]
    close_ = log_pr[window-1::window].reindex(log_pr.index).bfill()
    close_.columns = ['close_%d'%i for i in range(10)]
    high_ = log_pr.groupby(np.arange(len(log_pr))//window) \
            .max().set_index(np.arange(0, len(log_pr), window)) \
            .reindex(np.arange(len(log_pr))).ffill().set_index(log_pr.index)
    high_.columns = ['high_%d'%i for i in range(10)]
    low_ = log_pr.groupby(np.arange(len(log_pr))//window) \
            .min().set_index(np.arange(0, len(log_pr), window)) \
            .reindex(np.arange(len(log_pr))).ffill().set_index(log_pr.index)
    low_.columns = ['low_%d'%i for i in range(10)]
    return pd.concat([log_pr, volu, open_, close_, high_, low_], axis=1)

data = interpolate(log_pr, volu, daylen)

In [597]:
# Simple Moving Average
def SMA(x, window):
    return x.rolling(window).mean()

# exponential moving average
def EMA(x, window):
    return x.ewm(com=1/window, adjust=True, min_periods=window).mean()

# Average True Range
def ATR(x, window, daylen):
    low = x[['low_%d'%i for i in range(10)]].iloc[::daylen].copy()
    high = x[['high_%d'%i for i in range(10)]].iloc[::daylen].copy()
    close = x[['close_%d'%i for i in range(10)]].iloc[::daylen].copy()
    
    high_low = high.values - low.values
    high_close = np.abs(high.values - close.shift().values)
    low_close = np.abs(low.values - close.shift().values)

    ranges = np.stack([high_low, high_close, low_close], axis=0)
    true_range = np.max(ranges, axis=0)
    true_range = pd.DataFrame(true_range, 
                              index=close.index, columns=['atr_%d'%i for i in range(10)])
    atr = EMA(true_range, window)
    atr = atr.reindex(x.index).ffill()
    return atr

# TODO
# Average Directional Movement Index
def ADX(x, window, daylen):
    low = x[['low_%d'%i for i in range(10)]].iloc[::daylen].copy()
    high = x[['high_%d'%i for i in range(10)]].iloc[::daylen].copy()
    close = x[['close_%d'%i for i in range(10)]].iloc[::daylen].copy()
    
    plus_dm = high.diff()
    minus_dm = low.diff()
    plus_dm[plus_dm < 0] = 0
    minus_dm[minus_dm > 0] = 0
    
    atr = ATR(x, window, daylen).iloc[::daylen]
#     print(atr)
    
    plus_di = (100 * EMA(plus_dm, window) / atr.values).values
    minus_di = abs(100 * EMA(minus_dm, window) / atr.values).values
    
    adx = (abs(plus_di - minus_di) / abs(plus_di + minus_di)) * 100
    adx = pd.DataFrame(adx, index=close.index, columns=['adx_%d'%i for i in range(10)])
    adx = ((adx.shift() * (window - 1)) + adx) / window
    adx_smooth = EMA(adx, window)
    adx_smooth = adx_smooth.reindex(x.index).ffill()
    return adx_smooth

# Commodity Channel Index
def CCI(x, window, daylen):
    low = x[['low_%d'%i for i in range(10)]].iloc[::daylen].copy()
    high = x[['high_%d'%i for i in range(10)]].iloc[::daylen].copy()
    close = x[['close_%d'%i for i in range(10)]].iloc[::daylen].copy()
    
    m = (high.values + low.values + close)/3
#     return m
    sma = SMA(m, window)
#     return sma
    mad_ = m.rolling(window).apply(lambda x: pd.Series(x).mad())
    cci = pd.DataFrame((m.values - sma.values)/(0.015*mad_.values), 
                       index=close.index, columns=['cci_%d'%i for i in range(10)])
    cci = cci.reindex(x.index).ffill()
    return cci

# Price Rate of Change
def ROC(x, window, daylen):
    close = x[['close_%d'%i for i in range(10)]].iloc[::daylen].copy()
    roc = close.pct_change(window)
    roc.columns = ['roc_%d'%i for i in range(10)]
    roc = roc.reindex(x.index).ffill()
    return roc

# Relative Strength Index
def RSI(x, window, daylen, ema=True):
    close = x[['close_%d'%i for i in range(10)]].iloc[::daylen].copy()
    
    # Make two series: one for lower closes and one for higher closes
    up = close.diff().clip(lower=0)
    down = -1 * close.diff().clip(upper=0)
    
    if ema == True:
        # Use exponential moving average
        ma_up = EMA(up, window)
        ma_down = EMA(down, window)
    else:
        # Use simple moving average
        ma_up = SMA(up, window)
        ma_down = SMA(down, window)
        
    rsi = ma_up.values / ma_down.values
    rsi = 100 - (100/(1 + rsi))
    rsi = pd.DataFrame(rsi, index=close.index, columns=['rsi_%d'%i for i in range(10)])
    rsi = rsi.reindex(x.index).ffill()
    return rsi

# William's %R oscillator
def WR(x, window):
    hn = x[['log_pr_%d'%i for i in range(10)]].rolling(window).max()
    ln = x[['log_pr_%d'%i for i in range(10)]].rolling(window).min()
    wr = 100*(hn.values - x[['close_%d'%i for i in range(10)]].values)/(hn.values - ln.values)
    return pd.DataFrame(wr, index=x.index, columns=['wr_%d'%i for i in range(10)])

# Stochastic K
def SK(x, window):
    hhn = x[['high_%d'%i for i in range(10)]].rolling(window).max()
    lln = x[['low_%d'%i for i in range(10)]].rolling(window).min()
    sk = 100*(x[['close_%d'%i for i in range(10)]].values - lln.values)/(hhn.values - lln.values)
    return pd.DataFrame(sk, index=x.index, columns=['sk_%d'%i for i in range(10)])

# Stochastic D
def SD(x, window):
    sd = EMA(SK(x, window), 3)
    sd.columns = ['sd_%d'%i for i in range(10)]
    return sd
    

In [598]:
# feature generation pipline
def generate_features(data, window, daylen):
    pr = data.drop(labels=['volu_%d'%i for i in range(10)], axis=1)
    sma = SMA(pr[['log_pr_%d'%i for i in range(10)]], window)
    sma.columns = ['sma_%d'%i for i in range(10)]
    print(sma.shape)
    ema = EMA(pr[['log_pr_%d'%i for i in range(10)]], window)
    ema.columns = ['ema_%d'%i for i in range(10)]
    print(ema.shape)
    atr = ATR(pr, window, daylen)
    print(atr.shape)
    adx = ADX(pr, window, daylen)
    print(adx.shape)
    cci = CCI(pr, window, daylen)
    print(cci.shape)
    roc = ROC(pr, window, daylen)
    print(roc.shape)
    rsi = RSI(pr, window, daylen)
    print(rsi.shape)
    wr = WR(pr, window)
    print(wr.shape)
    sk = SK(pr, window)
    print(sk.shape)
    sd = SD(pr, window)
    print(sd.shape)
    return pd.concat([sma, ema, atr, adx, cci, 
                      roc, rsi, wr, sk, sd], axis=1)


In [606]:
# combined pipeline
from sklearn.preprocessing import StandardScaler
import pickle

def data_preprocess(log_pr, volu, window, daylen, scaler_file = None):
    data = interpolate(log_pr, volu, window)
    features = generate_features(data, window, daylen)
    if isinstance(scaler_file, type(None)):
        scaler = StandardScaler()
        features = scaler.fit_transform(features)
        with open('scaler.pkl', 'wb') as f:
            pickle.dump(scaler, f)
    else:
        with open(scaler_file, 'rb') as f:
            scaler = pickle.load(f)
        features = scaler.transform(features)
    return features, scaler_file

In [599]:
features = generate_features(data, 30, daylen)
features.columns, features.shape, features.dropna().shape

(264960, 10)
(264960, 10)
(264960, 10)
(264960, 10)
(264960, 10)
(264960, 10)
(264960, 10)
(264960, 10)
(264960, 10)
(264960, 10)


(Index(['sma_0', 'sma_1', 'sma_2', 'sma_3', 'sma_4', 'sma_5', 'sma_6', 'sma_7',
        'sma_8', 'sma_9', 'ema_0', 'ema_1', 'ema_2', 'ema_3', 'ema_4', 'ema_5',
        'ema_6', 'ema_7', 'ema_8', 'ema_9', 'atr_0', 'atr_1', 'atr_2', 'atr_3',
        'atr_4', 'atr_5', 'atr_6', 'atr_7', 'atr_8', 'atr_9', 'adx_0', 'adx_1',
        'adx_2', 'adx_3', 'adx_4', 'adx_5', 'adx_6', 'adx_7', 'adx_8', 'adx_9',
        'cci_0', 'cci_1', 'cci_2', 'cci_3', 'cci_4', 'cci_5', 'cci_6', 'cci_7',
        'cci_8', 'cci_9', 'roc_0', 'roc_1', 'roc_2', 'roc_3', 'roc_4', 'roc_5',
        'roc_6', 'roc_7', 'roc_8', 'roc_9', 'rsi_0', 'rsi_1', 'rsi_2', 'rsi_3',
        'rsi_4', 'rsi_5', 'rsi_6', 'rsi_7', 'rsi_8', 'rsi_9', 'wr_0', 'wr_1',
        'wr_2', 'wr_3', 'wr_4', 'wr_5', 'wr_6', 'wr_7', 'wr_8', 'wr_9', 'sk_0',
        'sk_1', 'sk_2', 'sk_3', 'sk_4', 'sk_5', 'sk_6', 'sk_7', 'sk_8', 'sk_9',
        'sd_0', 'sd_1', 'sd_2', 'sd_3', 'sd_4', 'sd_5', 'sd_6', 'sd_7', 'sd_8',
        'sd_9'],
       dtype='object'),
 

In [611]:
# split data into test set and training set
N = len(features)
train_idx = np.arange(1440 * 30, 1440 * 120)
np.random.shuffle(train_idx)
label_idx = train_idx + 30

train_features = features.iloc[train_idx]
train_labels = data[['log_pr_%d'%i for i in range(10)]].iloc[label_idx]
train_features.shape, train_labels.shape

((129600, 100), (129600, 10))

In [609]:
# validation set
test_idx = np.arange(1440*120, 1440*150)
test_label_idx = test_idx + 30

test_features = features.iloc[test_idx]
test_labels = data[['log_pr_%d'%i for i in range(10)]].iloc[test_label_idx]
test_features.shape, test_labels.shape

((43200, 100), (43200, 10))

In [620]:
# prepare to train a model
import lightgbm as lgb

def eval(y_true, y_pred):
    return ('corr', np.corrcoef(y_true.squeeze(), y_pred.squeeze()), True)

gbm = lgb.LGBMRegressor(num_leaves=25, learning_rate=0.1, n_estimators=20)
gbm.fit(train_features.iloc[:,np.arange(0,100,10)], train_labels['log_pr_0'],
        eval_set=[(test_features.iloc[:,np.arange(0,100,10)], test_labels['log_pr_0'])],
        eval_metric=eval,
        callbacks=[lgb.early_stopping(10)])

ValueError: too many values to unpack (expected 3)

In [540]:
datay = interpolate(log_pr.iloc[:1440], volu.iloc[:1440])

In [614]:
train_labels

Unnamed: 0_level_0,log_pr_0,log_pr_1,log_pr_2,log_pr_3,log_pr_4,log_pr_5,log_pr_6,log_pr_7,log_pr_8,log_pr_9
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2021-08-15 15:30:00,-0.263723,-0.213141,0.069161,-0.103135,-0.005828,0.115408,0.287895,-0.074333,0.004548,-0.023020
2021-09-03 11:18:00,-0.510720,-0.504586,0.797183,-0.001151,-0.295806,0.231925,0.029799,-0.158106,0.036123,-0.148730
2021-08-11 00:41:00,-0.346334,-0.238884,0.133264,-0.028064,-0.094549,-0.020857,0.022979,-0.013209,0.094028,0.109587
2021-09-28 20:02:00,-0.542036,-0.674963,1.411684,0.095377,-0.407051,0.116689,-0.068215,-0.211189,-0.051907,-0.187903
2021-10-27 19:40:00,-0.239044,-0.656815,1.127582,0.279280,-0.322847,-0.153801,-0.149903,-0.170882,0.125202,-0.019150
...,...,...,...,...,...,...,...,...,...,...
2021-08-24 13:57:00,-0.372062,-0.447888,0.973951,-0.101325,-0.215073,0.235021,0.042642,-0.165953,0.016188,-0.221636
2021-09-27 15:20:00,-0.548915,-0.698642,1.437192,0.098774,-0.407928,0.121842,-0.077013,-0.209414,-0.020917,-0.204201
2021-10-17 14:24:00,-0.418993,-0.684360,1.043145,0.304310,-0.199684,-0.040132,-0.041972,-0.249846,0.085263,-0.042088
2021-08-16 20:33:00,-0.294636,-0.204453,0.142880,-0.055042,-0.070102,0.058564,0.166818,-0.042348,-0.010078,-0.093555


In [544]:
f = generate_features(datay, 30, daylen)

(1440, 10)
(1440, 10)
(1440, 10)
(1440, 10)
(1440, 10)
(1440, 10)
(1440, 10)
(1440, 10)
(1440, 10)
(1440, 10)


In [545]:
f.dropna().shape

(0, 100)

In [592]:
i = np.where(features.iloc[1440 + 30*12].isnull())[0]

In [593]:
features.columns[i]

Index([], dtype='object')

In [595]:
1440/30 + 2

50.0