In [1]:
import numpy as np
import pandas as pd

log_pr_file = './log_price.df'
volu_usd_file = './volume_usd.df'

In [2]:
log_pr = pd.read_pickle(log_pr_file)
volu = pd.read_pickle(volu_usd_file)
log_pr.columns = ['log_pr_%d'%i for i in range(10)]
volu.columns = ['volu_%d'%i for i in range(10)]

data = pd.concat([log_pr, volu], axis=1)

#### Dataset manipulation

In [3]:
def _split_data(data:pd.DataFrame, test_pct:float):
    assert test_pct > 0 and test_pct < 1
    test_size = int(len(data) * test_pct)
    return  data[:-test_size], data[-test_size:]

def split_data(log_pr:pd.DataFrame, volu:pd.DataFrame, test_pct:float):
    return *_split_data(log_pr, test_pct), *_split_data(volu, test_pct)

In [179]:
def formulize_data(data:pd.DataFrame, log_pr:pd.DataFrame, window_size=1440, step=10) -> np.array:
    N = len(data)
    assert N == len(log_pr)
    train_index = np.arange(0, window_size)[np.newaxis, :] + np.arange(0, N - window_size - 30, step)[:, np.newaxis]
    return_index = np.arange(0, N - window_size - 30, step)[:, np.newaxis] + window_size + 30 - 1
#     print(train_index, return_index)
    return data.values[train_index], log_pr.values[return_index]

#### Featue generator functions

In [5]:
def rate_of_change(data:pd.DataFrame, periods):
    return data.pct_change(periods)

In [6]:
def moving_average(data:pd.DataFrame, window_size):
    return data.rolling(window_size).mean()

In [7]:
def exp_moving_avg(data:pd.DataFrame, window_size):
    return data.ewm(com = window_size - 1, adjust=True, min_periods = window_size).mean()

In [8]:
def z_score(data:pd.DataFrame, window_size):
    assert window_size > 1
    return (data - data.rolling(window=window_size).mean()) / \
            data.rolling(window=window_size).std()

In [9]:
def moving_sum(data:pd.DataFrame, window_size):
    return data.rolling(window_size).sum()

In [10]:
def sign(data:pd.DataFrame):
    return np.sign(data)

In [11]:
def binning(data:pd.DataFrame, n_bins):
    bin_fn = lambda y: pd.qcut(y, q=n_bins, labels=range(1, n_bins+1))
    return data.apply(bin_fn)

In [12]:
def RSI(data:pd.DataFrame, window_size, ema=True):
    """
    Returns a pd.Series with the relative strength index.
    """
    # Make two series: one for lower closes and one for higher closes
    up = data.clip(lower=0)
    down = -1 * data.clip(upper=0)
    
    if ema == True:
        # Use exponential moving average
        ma_up = up.ewm(com = window_size - 1, adjust=True, min_periods = window_size).mean()
        ma_down = down.ewm(com = window_size - 1, adjust=True, min_periods = window_size).mean()
    else:
        # Use simple moving average
        ma_up = up.rolling(window = window_size, adjust=False).mean()
        ma_down = down.rolling(window = window_size, adjust=False).mean()
        
    rsi = ma_up / ma_down
    rsi = 100 - (100/(1 + rsi))
    return rsi.dropna()

In [13]:
def MACD(data:pd.DataFrame, window_slow=26, window_fast=12, window_signal=9):
    ema_slow = exp_moving_avg(data, window_slow)
    ema_fast = exp_moving_avg(data, window_fast)
    ema_signal = exp_moving_avg(data, window_signal)
    macd = ema_fast - ema_slow
    return macd, macd - ema_signal

In [14]:
def high_low(data:pd.DataFrame, window=52):
    high = data.rolling(window=window).max()
    low = data.rolling(window=window).min()
    return high, low

In [15]:
def RHP(data:pd.DataFrame, window=10, horizon=52):
    high, low = high_low(data, horizon)
    newhighs = (high == data).sum(axis=1)
    newlows = (low == data).sum(axis=1)
    rhp = newhighs / (newhighs + newlows + 1e-4)
    return rhp.rolling(window=window).mean()

In [16]:
def BollingerBands(data:pd.DataFrame, window_size):
    ma = data.rolling(window=window_size).mean()
    sd = data.rolling(window=window_size).std()
    up = ma + 2*sd
    down = ma - 2*sd
    return up, down

In [17]:
def TRIN(log_pr:pd.DataFrame, volu:pd.DataFrame, window=1):
    num_stocks = log_pr.shape[1]
    pr_adv = (log_pr.diff(window) > 0).sum(axis=1)
    v_adv = (volu.diff(window) > 0).sum(axis=1)
    trin = (pr_adv / (num_stocks - pr_adv + 1e-4)) / (v_adv / (num_stocks - v_adv + 1e-4) + 1e-4)
    return trin

In [18]:
def ADL(data:pd.DataFrame, window=1):
    num = data.shape[1]
    adv = (data.diff(window) > 0).sum(axis=1)
    return adv - (num - adv)

#### Feature generator pipeline

In [197]:
# construct features
def generate_features(log_pr:pd.DataFrame, volu:pd.DataFrame):
    # 30 min negative returns
    n_ret = -(log_pr - log_pr.shift(30))
    n_ret.columns = ['30rt%d'%i for i in range(10)]
    
    # log volumes
    log_volu = np.log(volu + 1)
    log_volu.columns = ['lgv%d'%i for i in range(10)]
    
    # price diff
    pr_diff = []
    for d in [1, 10, 30]:
        pr_diff.append(log_pr.diff(d))
        pr_diff[-1].columns = ['prdif%d%d'%(d, i) for i in range(10)]
        
    # volume diff
    volu_diff = []
    for d in [1, 10, 30]:
        volu_diff.append(volu.diff(d))
        volu_diff[-1].columns = ['vdif%d%d'%(d, i) for i in range(10)]
        
    # price rate of change
    pr_roc = []
    for window in [1, 10, 30, 40, 60, 120, 1440 - 60]:
        pr_roc.append(rate_of_change(log_pr, window))
        pr_roc[-1].columns = ['prroc%d%d'%(window, i) for i in range(10)]
        
    # volume rate of change
    v_roc = []
    for window in [10, 30, 40, 60, 120, 1440 - 60]:
        v_roc.append(rate_of_change(volu + 1, window))
        v_roc[-1].columns = ['vroc%d%d'%(window, i) for i in range(10)]
        
    # price moving average
    pr_ma =[]
    for window in [1, 10, 30, 40, 60, 120, 1440 - 60]:
        pr_ma.append(moving_average(log_pr, window))
        pr_ma[-1].columns = ['prma%d%d'%(window, i) for i in range(10)]
        
    # volume moving average
    v_ma = []
    for window in [1, 10, 30, 40, 60, 120, 1440 - 60]:
        v_ma.append(moving_average(volu, window))
        v_ma[-1].columns = ['vma%d%d'%(window, i) for i in range(10)]
        
    # price exp moving average
    pr_ema =[]
    for window in [1, 10, 30, 40, 60, 120, 1440 - 60]:
        pr_ema.append(exp_moving_avg(log_pr, window))
        pr_ema[-1].columns = ['prema%d%d'%(window, i) for i in range(10)]
        
    # volume exp moving average
    v_ema = []
    for window in [1, 10, 30, 40, 60, 120, 1440 - 60]:
        v_ema.append(exp_moving_avg(volu, window))
        v_ema[-1].columns = ['vema%d%d'%(window, i) for i in range(10)]
        
    # z score
    pr_z = []
    for window in [10, 30, 40, 60, 120, 1440 - 60]:
        pr_z.append(z_score(log_pr, window))
        pr_z[-1].columns = ['prz%d%d'%(window, i) for i in range(10)]
        
    # volume z score
    v_z = []
    for window in [10, 30, 40, 60, 120, 1440 - 60]:
        v_z.append(z_score(volu, window))
        v_z[-1].columns = ['vz%d%d'%(window, i) for i in range(10)]
        
    # RSI indicators
    rsi = RSI(log_pr, 14)
    rsi.columns = ['rsi%d'%i for i in range(10)]
    
    # MACD indicators
    macd, macd_r = MACD(log_pr)
    macd.columns = ['macd%d'%i for i in range(10)]
    macd_r.columns = ['macd%d'%i for i in range(10)]
    
    # high low indicators
    high, low = high_low(log_pr)
    high.columns = ['high%d'%i for i in range(10)]
    low.columns = ['low%d'%i for i in range(10)]
    
    # high low ratio
    rhp = RHP(log_pr)
    rhp = pd.DataFrame(rhp, columns=['rhp'])
    
    # BollingerBands
    upbb, downbb = BollingerBands(log_pr, 10)
    upbb.columns = ['upbb%d'%i for i in range(10)]
    downbb.columns = ['downbb%d'%i for i in range(10)]
    
    # TRIN
    trin = TRIN(log_pr, volu)
    trin = pd.DataFrame(trin, columns=['trin'])
    
    # ADL
    adl = ADL(log_pr)
    adl = pd.DataFrame(adl, columns=['adl'])
    
    import itertools
    features = pd.concat([log_pr, volu, 
                          n_ret, log_volu, *pr_diff, *volu_diff, 
                          *pr_roc, *v_roc, *pr_ma, *v_ma, 
                          *pr_ema, *v_ema, *pr_z, *v_z,
                          rsi, macd, high, low, rhp, 
                          upbb, downbb, trin, adl], axis=1)
    return features
    
    

In [198]:
features = generate_features(log_pr, volu)

In [199]:
features.columns

Index(['log_pr_0', 'log_pr_1', 'log_pr_2', 'log_pr_3', 'log_pr_4', 'log_pr_5',
       'log_pr_6', 'log_pr_7', 'log_pr_8', 'log_pr_9',
       ...
       'downbb2', 'downbb3', 'downbb4', 'downbb5', 'downbb6', 'downbb7',
       'downbb8', 'downbb9', 'trin', 'adl'],
      dtype='object', length=693)

In [200]:
features.to_pickle('./features.pkl')

#### Forge training set and test set

In [201]:
log_pr_tr, log_pr_tst, volu_tr,  volu_tst = split_data(log_pr, volu, 0.3)

In [202]:
log_pr_tr.shape, log_pr_tst.shape, volu_tr.shape, volu_tst.shape

((185472, 10), (79488, 10), (185472, 10), (79488, 10))

In [203]:
features_tr = generate_features(log_pr_tr, volu_tr)

In [204]:
features_tr.shape, features.columns, features_tr.dropna().shape

((185472, 693),
 Index(['log_pr_0', 'log_pr_1', 'log_pr_2', 'log_pr_3', 'log_pr_4', 'log_pr_5',
        'log_pr_6', 'log_pr_7', 'log_pr_8', 'log_pr_9',
        ...
        'downbb2', 'downbb3', 'downbb4', 'downbb5', 'downbb6', 'downbb7',
        'downbb8', 'downbb9', 'trin', 'adl'],
       dtype='object', length=693),
 (184092, 693))

In [205]:
features_tr.to_pickle('./features_tr.pkl')

In [206]:
features_tr_f, label_tr_f = formulize_data(features_tr.dropna(), log_pr_tr.loc[features_tr.dropna().index], window_size=3)
features_tr_f.shape, label_tr_f.shape

((18406, 3, 693), (18406, 1, 10))

In [207]:
np.save('./features_tr_f.pkl', features_tr_f)
np.save('./labels_tr_f.pkl', label_tr_f)

In [208]:
from sklearn.linear_model import LinearRegression
N = len(features_tr_f)
model = LinearRegression()
model.fit(features_tr_f.reshape(N, -1), label_tr_f.squeeze())
model.score(features_tr_f.reshape(N, -1), label_tr_f.squeeze())

0.9988108652516947

In [220]:
from critic import Critic
cr = Critic()
def get_r_hat(A, B):
#     print(A.shape, B.shape)
    features = generate_features(A, B).dropna()
    features_tr = features[-3:]
    print(features_tr.shape)
    pred = model.predict(features_tr.reshape(1, -1))
    return pred - A.values[-1]
    
cr.submit(get_r_hat, log_pr_tst, volu_tst)

(3, 693)


KeyError: -1