In [13]:
import os
import datetime
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [14]:
# Load the data frames
# NOTE(review): pickle files can execute arbitrary code while being loaded;
# only read these from a trusted source.
# Presumably log_pr holds log prices and volu holds USD trading volumes
# (going by the file names) -- confirm against the data description.

log_pr = pd.read_pickle("./log_price.df")
volu = pd.read_pickle("./volume_usd.df")

In [15]:
# Use the negative 30-minute backward log-return to predict the 30-minute forward log-return

def get_r_hat_baseline(A, B):
    """Mean-reversion baseline: predict the negated recent change of each column.

    A: dataframe of log prices (one column per asset); needs at least 30 rows.
    B: dataframe of trading volumes; unused by this baseline.
    return: numpy array with one prediction per column of A.
    """
    latest_row = A.iloc[-1]
    # NOTE(review): iloc[-30] is 29 rows before the last row, not 30.
    reference_row = A.iloc[-30]
    backward_change = latest_row - reference_row
    return -backward_change.values

In [16]:
# An example of get_r_hat

# Name of the predictor that get_r_hat looks up at call time.
ACTIVE_R_HAT = "baseline"

# Registry of available predictors, keyed by name.
r_hat_implementations = dict(baseline=get_r_hat_baseline)

def get_r_hat(A, B):
    """
    Predict the forward 30-minute returns by delegating to the implementation
    registered under ACTIVE_R_HAT in r_hat_implementations.

        A: 1440-by-10 dataframe of log prices with columns log_pr_0, ... , log_pr_9
        B: 1440-by-10 dataframe of trading volumes with columns volu_0, ... , volu_9
        return: a numpy array of length 10, corresponding to the predictions for
                the forward 30-minute returns of assets 0, 1, 2, ..., 9
    """
    predictor = r_hat_implementations[ACTIVE_R_HAT]
    return predictor(A, B)

In [17]:
# Overall correlation (the ranking is based on this metric, evaluated on the test dataset)
# BASELINE: 0.040118694541047606

def get_model_accuracy():
    """
    Score get_r_hat against realised forward returns on the global log_pr / volu frames.

    A prediction is made at every 10th row of log_pr starting from row 30. Each call
    to get_r_hat receives the rows of log_pr and volu whose labels fall within the
    trailing one-day window ending at that timestamp (label slicing, both ends
    inclusive). The target is the change in log_pr over the next 30 rows.

    return: the Pearson correlation (np.corrcoef) between all predictions and all
            targets, pooled over timestamps and assets. Pairs where either value is
            not finite (e.g. the trailing rows with no 30-row-ahead price) are
            excluded.
    """
    dt = datetime.timedelta(days=1)
    eval_times = log_pr.index[30::10]  # compute the predictions every 10 minutes
    n_assets = log_pr.shape[1]

    # Pre-allocate the prediction table so rows are filled in place.
    r_hat = pd.DataFrame(index=eval_times, columns=np.arange(n_assets), dtype=np.float64)
    for t in eval_times:
        r_hat.loc[t, :] = get_r_hat(log_pr.loc[(t - dt):t], volu.loc[(t - dt):t])

    # Realised forward returns on the same rows; pairing with r_hat is positional.
    r_fwd = (log_pr.shift(-30) - log_pr).iloc[30::10]

    r_fwd_all = r_fwd.values.ravel()
    r_hat_all = r_hat.values.ravel()

    # Mask instead of a hard-coded "drop the last 3 rows": the shift leaves NaNs
    # at the tail, and any other missing value would otherwise turn the score into NaN.
    valid = np.isfinite(r_fwd_all) & np.isfinite(r_hat_all)

    return np.corrcoef(r_fwd_all[valid], r_hat_all[valid])[0, 1]

In [18]:
def construct_dataset(window_size, dataset, predict_minutes=30, step=10):
    """
    Build look-back windows (X) and look-ahead targets (y) from a time-indexed frame.

    window_size: look-back window size for constructing X (in minutes); also the
                 positional offset of the first anchor row
    dataset: pandas object indexed by timestamps; the code assumes the label
             `t + predict_minutes` exists for every anchor t (e.g. a regular
             1-minute index)
    predict_minutes: how far ahead of each anchor the target row is taken (in minutes)
    step: take one anchor every `step` rows
    return: (X, y) numpy arrays; X[i] holds the rows labelled within
            [t_i - window_size minutes, t_i] (both ends inclusive) and y[i] is the
            row labelled t_i + predict_minutes

    Anchors are limited by the prediction horizon rather than by window_size, so
    windows shorter than the horizon no longer raise KeyError at the end of the
    data, window_size=0 no longer yields an empty dataset, and windows longer than
    the horizon keep every anchor that still has a target.
    """
    window_dt = datetime.timedelta(minutes=window_size)
    predict_dt = datetime.timedelta(minutes=predict_minutes)

    anchors = dataset.index[window_size::step]
    if len(anchors) > 0:
        # Keep only anchors whose target timestamp is still inside the data.
        last_usable = dataset.index[-1] - predict_dt
        anchors = anchors[anchors <= last_usable]

    window_X = []
    window_y = []

    for t in anchors:
        window_X.append(np.asarray(dataset.loc[(t - window_dt):t]))
        window_y.append(np.asarray(dataset.loc[t + predict_dt]))

    return np.array(window_X), np.array(window_y)

In [19]:
def construct_pr_vol_datasets(window_size):
    """
    Run construct_dataset on the global log_pr and volu frames.

    window_size: look-back window size for constructing X (in minutes)
    return: a pair ((price_X, price_y), (volume_X, volume_y))
    """
    price_xy = construct_dataset(window_size, log_pr)
    volume_xy = construct_dataset(window_size, volu)
    return price_xy, volume_xy

In [20]:
# Look-back window length, in minutes.
window_size = 30
log_pr_ds, volu_ds = construct_pr_vol_datasets(window_size)
# Unpack each (X, y) pair into its own pair of names.
(log_pr_X, log_pr_y), (volu_X, volu_y) = log_pr_ds, volu_ds

In [21]:
# Score the currently active predictor; the cell displays the returned correlation.
get_model_accuracy()

0.040118694541047606