In [3]:
import numpy as np
import pandas as pd

log_pr_file = './log_price.df'
volu_usd_file = './volume_usd.df'

In [4]:
log_pr = pd.read_pickle(log_pr_file)
volu = pd.read_pickle(volu_usd_file)
log_pr.columns = ['log_pr_%d'%i for i in range(10)]
volu.columns = ['volu_%d'%i for i in range(10)]

data = pd.concat([log_pr, volu], axis=1)

#### Dataset manipulation

In [201]:
def _split_data(data:pd.DataFrame, test_pct:float):
    assert test_pct > 0 and test_pct < 1
    test_size = int(len(data) * test_pct)
    return  data[test_size:], data[:test_size]

def split_data(log_pr:pd.DataFrame, volu:pd.DataFrame, test_pct:float):
    return *_split_data(log_pr, test_pct), *_split_data(volu, test_pct)

In [214]:
def formulize_data(data:pd.DataFrame, log_pr:pd.DataFrame, window_size=1440, step=10) -> np.array:
    N = len(data)
    assert N == len(log_pr)
    train_index = np.arange(0, window_size)[np.newaxis, :] + step * np.arange(0, (N - window_size - 30) // step)[:, np.newaxis]
    return_index = step * np.arange(0, (N - window_size - 30) // step)[:, np.newaxis] + window_size + 30 - 1
#     print(train_index, return_index)
    return data.values[train_index], log_pr.values[return_index]

#### Featue generator functions

In [104]:
def rate_of_change(data:pd.DataFrame, periods):
    return data.pct_change(periods)

In [109]:
def moving_average(data:pd.DataFrame, window_size):
    return data.rolling(window_size).mean()

In [110]:
def exp_moving_avg(data:pd.DataFrame, window_size):
    return data.ewm(com = window_size - 1, adjust=True, min_periods = window_size).mean()

In [111]:
def z_score(data:pd.DataFrame, window_size):
    assert window_size > 1
    return (data - data.rolling(window=window_size).mean()) / \
            data.rolling(window=window_size).std()

In [112]:
def moving_sum(data:pd.DataFrame, window_size):
    return data.rolling(window_size).sum()

In [113]:
def sign(data:pd.DataFrame):
    return np.sign(data)

In [114]:
def binning(data:pd.DataFrame, n_bins):
    bin_fn = lambda y: pd.qcut(y, q=n_bins, labels=range(1, n_bins+1))
    return data.apply(bin_fn)

In [128]:
def RSI(data:pd.DataFrame, window_size, ema=True):
    """
    Returns a pd.Series with the relative strength index.
    """
    # Make two series: one for lower closes and one for higher closes
    up = data.clip(lower=0)
    down = -1 * data.clip(upper=0)
    
    if ema == True:
        # Use exponential moving average
        ma_up = up.ewm(com = window_size - 1, adjust=True, min_periods = window_size).mean()
        ma_down = down.ewm(com = window_size - 1, adjust=True, min_periods = window_size).mean()
    else:
        # Use simple moving average
        ma_up = up.rolling(window = window_size, adjust=False).mean()
        ma_down = down.rolling(window = window_size, adjust=False).mean()
        
    rsi = ma_up / ma_down
    rsi = 100 - (100/(1 + rsi))
    return rsi.dropna()

In [131]:
def MACD(data:pd.DataFrame, window_slow=26, window_fast=12, window_signal=9):
    ema_slow = exp_moving_avg(data, window_slow)
    ema_fast = exp_moving_avg(data, window_fast)
    ema_signal = exp_moving_avg(data, window_signal)
    macd = ema_fast - ema_slow
    return macd, macd - ema_signal

In [117]:
def high_low(data:pd.DataFrame, window=52):
    high = data.rolling(window=window).max()
    low = data.rolling(window=window).min()
    return high, low

In [118]:
def RHP(data:pd.DataFrame, window=10, horizon=52):
    high, low = high_low(data, horizon)
    newhighs = (high == data).sum(axis=1)
    newlows = (low == data).sum(axis=1)
    rhp = newhighs / (newhighs + newlows + 1e-4)
    return rhp.rolling(window=window).mean()

In [119]:
def BollingerBands(data:pd.DataFrame, window_size):
    ma = data.rolling(window=window_size).mean()
    sd = data.rolling(window=window_size).std()
    up = ma + 2*sd
    down = ma - 2*sd
    return up, down

In [120]:
def TRIN(log_pr:pd.DataFrame, volu:pd.DataFrame, window=1):
    num_stocks = log_pr.shape[1]
    pr_adv = (log_pr.diff(window) > 0).sum(axis=1)
    v_adv = (volu.diff(window) > 0).sum(axis=1)
    trin = (pr_adv / (num_stocks - pr_adv + 1e-4)) / (v_adv / (num_stocks - v_adv + 1e-4) + 1e-4)
    return trin

In [121]:
def ADL(data:pd.DataFrame, window=1):
    num = data.shape[1]
    adv = (data.diff(window) > 0).sum(axis=1)
    return adv - (num - adv)

#### Feature generator pipeline

In [242]:
# construct features
def generate_features(log_pr:pd.DataFrame, volu:pd.DataFrame):
    # 30 min negative returns
    n_ret = -(log_pr - log_pr.shift(30))
    n_ret.columns = ['30rt%d'%i for i in range(10)]
    
    # log volumes
    log_volu = np.log(volu + 1)
    log_volu.columns = ['lgv%d'%i for i in range(10)]
    
    # price diff
    pr_diff = []
    for d in [1, 10, 30]:
        pr_diff.append(log_pr.diff(d))
        pr_diff[-1].columns = ['prdif%d%d'%(d, i) for i in range(10)]
        
    # volume diff
    volu_diff = []
    for d in [1, 10, 30]:
        volu_diff.append(volu.diff(d))
        volu_diff[-1].columns = ['vdif%d%d'%(d, i) for i in range(10)]
        
    # price rate of change
    pr_roc = []
    for window in [1, 10, 30, 40, 60, 120, 1440]:
        pr_roc.append(rate_of_change(log_pr, window))
        pr_roc[-1].columns = ['prroc%d%d'%(window, i) for i in range(10)]
        
    # volume rate of change
    v_roc = []
    for window in [1, 10, 30, 40, 60, 120, 1440]:
        v_roc.append(rate_of_change(volu, window))
        v_roc[-1].columns = ['vroc%d%d'%(window, i) for i in range(10)]
        
    # price moving average
    pr_ma =[]
    for window in [1, 10, 30, 40, 60, 120, 1440]:
        pr_ma.append(moving_average(log_pr, window))
        pr_ma[-1].columns = ['prma%d%d'%(window, i) for i in range(10)]
        
    # volume moving average
    v_ma = []
    for window in [1, 10, 30, 40, 60, 120, 1440]:
        v_ma.append(moving_average(volu, window))
        v_ma[-1].columns = ['vma%d%d'%(window, i) for i in range(10)]
        
    # price exp moving average
    pr_ema =[]
    for window in [1, 10, 30, 40, 60, 120, 1440]:
        pr_ema.append(exp_moving_avg(log_pr, window))
        pr_ema[-1].columns = ['prema%d%d'%(window, i) for i in range(10)]
        
    # volume exp moving average
    v_ema = []
    for window in [1, 10, 30, 40, 60, 120, 1440]:
        v_ema.append(exp_moving_avg(volu, window))
        v_ema[-1].columns = ['vema%d%d'%(window, i) for i in range(10)]
        
    # z score
    pr_z = []
    for window in [2, 10, 30, 40, 60, 120, 1440]:
        pr_z.append(z_score(log_pr, window))
        pr_z[-1].columns = ['prz%d%d'%(window, i) for i in range(10)]
        
    # volume z score
    v_z = []
    for window in [2, 10, 30, 40, 60, 120, 1440]:
        v_z.append(z_score(volu, window))
        v_z[-1].columns = ['vz%d%d'%(window, i) for i in range(10)]
        
    # RSI indicators
    rsi = RSI(log_pr, 14)
    rsi.columns = ['rsi%d'%i for i in range(10)]
    
    # MACD indicators
    macd, macd_r = MACD(log_pr)
    macd.columns = ['macd%d'%i for i in range(10)]
    macd_r.columns = ['macd%d'%i for i in range(10)]
    
    # high low indicators
    high, low = high_low(log_pr)
    high.columns = ['high%d'%i for i in range(10)]
    low.columns = ['low%d'%i for i in range(10)]
    
    # high low ratio
    rhp = RHP(log_pr)
    rhp = pd.DataFrame(rhp, columns=['rhp'])
    
    # BollingerBands
    upbb, downbb = BollingerBands(log_pr, 10)
    upbb.columns = ['upbb%d'%i for i in range(10)]
    downbb.columns = ['downbb%d'%i for i in range(10)]
    
    # TRIN
    trin = TRIN(log_pr, volu)
    trin = pd.DataFrame(trin, columns=['trin'])
    
    # ADL
    adl = ADL(log_pr)
    adl = pd.DataFrame(adl, columns=['adl'])
    
    import itertools
    features = pd.concat([log_pr, volu, 
                          n_ret, log_volu, *pr_diff, *volu_diff, 
                          *pr_roc, *v_roc, *pr_ma, *v_ma, 
                          *pr_ema, *v_ema, *pr_z, *v_z,
                          rsi, macd, high, low, rhp, 
                          upbb, downbb, trin, adl], axis=1)
    return features
    
    

In [243]:
features = generate_features(log_pr, volu)

In [244]:
features.columns

Index(['log_pr_0', 'log_pr_1', 'log_pr_2', 'log_pr_3', 'log_pr_4', 'log_pr_5',
       'log_pr_6', 'log_pr_7', 'log_pr_8', 'log_pr_9',
       ...
       'downbb2', 'downbb3', 'downbb4', 'downbb5', 'downbb6', 'downbb7',
       'downbb8', 'downbb9', 'trin', 'adl'],
      dtype='object', length=723)

In [179]:
features.to_pickle('./features.pkl')

#### Forge training set and test set

In [202]:
log_pr_tr, log_pr_tst, volu_tr,  volu_tst = split_data(log_pr, volu, 0.3)

In [203]:
log_pr_tr.shape, log_pr_tst.shape, volu_tr.shape, volu_tst.shape

((185472, 10), (79488, 10), (185472, 10), (79488, 10))

In [204]:
features_tr = generate_features(log_pr_tr, volu_tr)

In [205]:
# features_tr.to_pickle('./features_tr.pkl')

In [241]:
'trin' in features_tr.columns

False

In [225]:
features_trf, labels_trf = formulize_data(features_tr, log_pr_tr.loc[features_tr.index], window_size=3)

In [217]:
np.save('./features_tr_f.pkl', features_trf)
np.save('./labels_tr_f.pkl', labels_trf)

In [240]:
features_tr

Unnamed: 0_level_0,log_pr_0,log_pr_1,log_pr_2,log_pr_3,log_pr_4,log_pr_5,log_pr_6,log_pr_7,log_pr_8,log_pr_9,...,downbb2,downbb3,downbb4,downbb5,downbb6,downbb7,downbb8,downbb9,0,adl
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-08-26 04:48:00,-0.346692,-0.435694,0.896019,-0.093906,-0.202054,0.246745,0.061087,-0.172276,0.028698,-0.194407,...,0.895347,-0.096026,-0.202866,0.245893,0.060423,-0.173027,0.027693,-0.195406,3.998371,0
2021-08-26 04:49:00,-0.345551,-0.434301,0.881972,-0.092821,-0.200816,0.246935,0.062729,-0.171782,0.028574,-0.192796,...,0.885849,-0.096356,-0.203337,0.245936,0.060063,-0.173055,0.027995,-0.195728,35.966230,6
2021-08-26 04:50:00,-0.346730,-0.435517,0.884784,-0.092905,-0.200948,0.247199,0.061673,-0.172245,0.028294,-0.193138,...,0.881654,-0.096490,-0.203443,0.245944,0.060220,-0.173027,0.028148,-0.195836,0.107140,-6
2021-08-26 04:51:00,-0.346271,-0.434632,0.887894,-0.093619,-0.201877,0.246898,0.061977,-0.172134,0.028831,-0.192897,...,0.879567,-0.096384,-0.203384,0.246027,0.060210,-0.172838,0.028159,-0.195948,5.443071,4
2021-08-26 04:52:00,-0.346362,-0.435158,0.887970,-0.093767,-0.202067,0.247758,0.061822,-0.171826,0.028714,-0.192943,...,0.878008,-0.096212,-0.203343,0.245928,0.060253,-0.172563,0.028269,-0.195932,3.853662,-4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-31 23:55:00,0.202473,-0.953688,1.822158,-0.014819,-0.455037,-0.433889,-0.248988,-0.444827,0.051673,-0.097036,...,1.819987,-0.015269,-0.456611,-0.434366,-0.254772,-0.445614,0.050735,-0.098215,0.999933,2
2021-12-31 23:56:00,0.202848,-0.953987,1.822320,-0.014359,-0.455589,-0.433965,-0.249405,-0.444581,0.051369,-0.097438,...,1.819938,-0.015145,-0.456442,-0.434264,-0.254055,-0.445614,0.050738,-0.098214,2.665589,-2
2021-12-31 23:57:00,0.203726,-0.954003,1.822973,-0.014348,-0.455433,-0.434181,-0.250450,-0.444742,0.051289,-0.097625,...,1.820181,-0.015141,-0.456302,-0.434343,-0.253364,-0.445296,0.050734,-0.098200,0.666602,-2
2021-12-31 23:58:00,0.203586,-0.954238,1.824952,-0.015121,-0.456122,-0.433732,-0.251122,-0.444934,0.051144,-0.098005,...,1.821085,-0.015170,-0.456526,-0.434317,-0.253374,-0.445225,0.050736,-0.098059,0.107140,-6


In [226]:
N = features_trf.shape[0]
features_trf.shape, labels_trf.shape

((18543, 3, 723), (18543, 1, 10))

In [227]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(features_trf.reshape(N, -1), labels_trf.squeeze())
model.score(features_trf.reshape(N, -1), labels_trf.squeeze())

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').