In [355]:
import os
import datetime
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb


In [375]:
# Load the data frames

log_pr = pd.read_pickle("./log_price.df")
volu = pd.read_pickle("./volume_usd.df")

In [376]:
def construct_dataset(window_size, dataset):
    """
    window: look-back window size for constructing X (in minutes)
    """
    window_dt = datetime.timedelta(minutes=window_size)
    predict_dt = datetime.timedelta(minutes=30)

    window_X = []
    window_y = []

    for t in dataset.index[window_size:-window_size:10]: # compute the predictions every 10 minutes
        window_X.append(dataset.loc[(t - window_dt):t])
        window_y.append(dataset.loc[t + predict_dt] - dataset.loc[t])
        
    return np.array(window_X), np.array(window_y)

In [377]:
def construct_pr_vol_datasets(window_size):
    """
    window: look-back window size for constructing X (in minutes)
    """
    return construct_dataset(window_size, log_pr), construct_dataset(window_size, volu)

In [395]:
window_size = 30 # in minutes
log_pr_ds, volu_ds = construct_pr_vol_datasets(window_size)
log_pr_X, log_pr_y = log_pr_ds
volu_X, volu_y = volu_ds

print(log_pr_X.shape)
print(log_pr_y.shape)

num_test = 5000

X_train = np.array(log_pr_X[:-num_test, :, :])
y_train = np.array(log_pr_y[:-num_test, :])

X_test = np.array(log_pr_X[-num_test:, :, :])
y_test = np.array(log_pr_y[-num_test:, :])

print(X_train.shape)
print(y_train.shape)


(26490, 31, 10)
(26490, 10)
(21490, 31, 10)
(21490, 10)


In [408]:
class GBoost:
    def __init__(self, num_assets=10):
        self.models = []
        self.num_assets = num_assets

    def fit(self, X, y):
        for asset_index in range(10):
            asset_X = np.array(X[:, :, asset_index])
            asset_y = np.array(y[:, asset_index])

            # X_train = np.array(one_asset_X[:-5000, :])
            # y_train = np.array(one_asset_y[:-5000])

            model = lgb.LGBMRegressor()
            model.fit(asset_X, asset_y)

            self.models.append(model)

    def predict_one(self, X): #given data for just one sequence.
        final_predictions = []
        for asset_index in range(10):
            features = np.expand_dims(X[-31:, asset_index], 0)
            pred = self.models[asset_index].predict(features)
            final_predictions.append(-1*pred)
        final_predictions = np.array(final_predictions).squeeze()
        return final_predictions

    def predict(self, X): #multiple sequences:
        final_predictions = []
        for i in range(len(X)):
            tmp_predictions = []
            for asset_index in range(10):
                features = np.array(np.expand_dims(X[i, -31:, asset_index], 0))
                pred = self.models[asset_index].predict(features)[0]
                tmp_predictions.append(-1*pred)
            final_predictions.append(tmp_predictions)
            
        return np.array(final_predictions)

boost_model = GBoost()
boost_model.fit(X_train, y_train)

In [409]:
# Use the negative 30-minutes backward log-returns to predict the 30-minutes forward log-returns
#predict the log price, and then do correlation

def get_r_hat_baseline(A, B):
    return -(A.iloc[-1] - A.iloc[-30]).values 

In [410]:
def get_r_hat_gboost(A, B):
    preds = boost_model.predict_one(A.to_numpy())
    return preds

In [417]:
# An example of get_r_hat

ACTIVE_R_HAT = "baseline"

r_hat_implementations = {
    "baseline": get_r_hat_baseline, # 0.040118694541047606
    "gboost": get_r_hat_gboost
}

def get_r_hat(A, B): 
    """
        A: 1440-by-10 dataframe of log prices with columns log_pr_0, ... , log_pr_9
        B: 1440-by-10 dataframe of trading volumes with columns volu_0, ... , volu_9    
        return: a numpy array of length 10, corresponding to the predictions for the forward 30-minutes returns of assets 0, 1, 2, ..., 9
    """
    print(A.shape)
    return r_hat_implementations[ACTIVE_R_HAT](A, B)

In [418]:
def get_model_corr(input):
    t0 = time.time()
    dt = datetime.timedelta(days=1)
    r_hat = pd.DataFrame(index=input.index[30::10], columns=np.arange(10), dtype=np.float64)
    for t in input.index[30::10]: # compute the predictions every 10 minutes
        r_hat.loc[t, :] = get_r_hat(input.loc[(t - dt):t], volu.loc[(t - dt):t])
    t_used = time.time() - t0
    
    r_fwd = (input.shift(-30) - input).iloc[30::10].rename(columns={f"input_{i}": i for i in range(10)})
    r_fwd.corrwith(r_hat)
    
    r_fwd_all = r_fwd.iloc[:-3].values.ravel() # the final 3 rows are NaNs. 
    r_hat_all = r_hat.iloc[:-3].values.ravel()
    
    return np.corrcoef(r_fwd_all, r_hat_all)[0, 1]
# def get_model_accuracy():
#     t0 = time.time()
#     dt = datetime.timedelta(days=1)
#     r_hat = pd.DataFrame(index=log_pr.index[30::10], columns=np.arange(10), dtype=np.float64)
#     for t in log_pr.index[30::10]: # compute the predictions every 10 minutes
#         r_hat.loc[t, :] = get_r_hat(log_pr.loc[(t - dt):t], volu.loc[(t - dt):t])
#     t_used = time.time() - t0
    
#     r_fwd = (log_pr.shift(-30) - log_pr).iloc[30::10].rename(columns={f"log_pr_{i}": i for i in range(10)})
#     r_fwd.corrwith(r_hat)
    
#     r_fwd_all = r_fwd.iloc[:-3].values.ravel() # the final 3 rows are NaNs. 
#     print(r_fwd_all.shape)
#     r_hat_all = r_hat.iloc[:-3].values.ravel()
    
#     return np.corrcoef(r_fwd_all, r_hat_all)[0, 1]

In [419]:
log_pr_test = log_pr.iloc[-num_test:, :]
get_model_corr(log_pr_test)

(31, 10)
(41, 10)
(51, 10)
(61, 10)
(71, 10)
(81, 10)
(91, 10)
(101, 10)
(111, 10)
(121, 10)
(131, 10)
(141, 10)
(151, 10)
(161, 10)
(171, 10)
(181, 10)
(191, 10)
(201, 10)
(211, 10)
(221, 10)
(231, 10)
(241, 10)
(251, 10)
(261, 10)
(271, 10)
(281, 10)
(291, 10)
(301, 10)
(311, 10)
(321, 10)
(331, 10)
(341, 10)
(351, 10)
(361, 10)
(371, 10)
(381, 10)
(391, 10)
(401, 10)
(411, 10)
(421, 10)
(431, 10)
(441, 10)
(451, 10)
(461, 10)
(471, 10)
(481, 10)
(491, 10)
(501, 10)
(511, 10)
(521, 10)
(531, 10)
(541, 10)
(551, 10)
(561, 10)
(571, 10)
(581, 10)
(591, 10)
(601, 10)
(611, 10)
(621, 10)
(631, 10)
(641, 10)
(651, 10)
(661, 10)
(671, 10)
(681, 10)
(691, 10)
(701, 10)
(711, 10)
(721, 10)
(731, 10)
(741, 10)
(751, 10)
(761, 10)
(771, 10)
(781, 10)
(791, 10)
(801, 10)
(811, 10)
(821, 10)
(831, 10)
(841, 10)
(851, 10)
(861, 10)
(871, 10)
(881, 10)
(891, 10)
(901, 10)
(911, 10)
(921, 10)
(931, 10)
(941, 10)
(951, 10)
(961, 10)
(971, 10)
(981, 10)
(991, 10)
(1001, 10)
(1011, 10)
(1021, 10)
(103

0.05733390255969615