### Train and save the model

In [1]:
import numpy as np
import pandas as pd

Read from file and parse into training and test set

In [2]:
# read data file
log_pr = pd.read_pickle("./log_price.df")
volume = pd.read_pickle("./volume_usd.df")

In [3]:
import datetime
import time

t0 = time.time()
dt = datetime.timedelta(days=60)
cur_t = log_pr.index[-1]
log_pr_test = log_pr.loc[(cur_t - dt):]
log_pr = log_pr[:(cur_t - dt)]
volume_test = volume.loc[(cur_t - dt):]
volume = volume[:(cur_t - dt)]

In [4]:
log_pr_test.shape, volume_test.shape, log_pr.shape, volume.shape

((86401, 10), (86401, 10), (178560, 10), (178560, 10))

In [3]:
# divide and parse data into training set and label set
data = pd.concat([log_pr, np.log(volume + 1)], axis=1)
data_np = data.values
train_np = data_np.reshape(-1, 1440 * 20)
train_np = np.delete(train_np, -1, axis=0)
label_np = data_np[1440+29::1440,:10]  # future log price in 30 min
train_np.shape, label_np.shape

((183, 28800), (183, 10))

In [4]:
# standardize each feature
def standardize(train_np):
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler()
    return scaler.fit_transform(train_np)

train_np_standardize = standardize(train_np)

In [5]:
# normalize each feature
def normalize(train_np):
    from sklearn.preprocessing import Normalizer

    norm = Normalizer()
    return norm.fit_transform(train_np)

train_np_normalize = normalize(train_np)
train_np_normalize.min(), train_np_normalize.max()

(-0.0006284209797935348, 0.01311364501707388)

In [6]:
# extract principal components
def pca_after_normalize(train_np):
    from sklearn.decomposition import PCA 

    pca = PCA(n_components=64, svd_solver="full")
    pca.fit(standardize(train_np))
    return pca.transform(train_np), pca

train_np_pca, pca = pca_after_normalize(train_np)
np.cumsum(pca.explained_variance_ratio_)

array([0.30707126, 0.47496257, 0.54138948, 0.6035855 , 0.6237112 ,
       0.63905665, 0.65100007, 0.6618374 , 0.67221408, 0.68074472,
       0.68908342, 0.69588428, 0.70245134, 0.7083728 , 0.71418379,
       0.7198574 , 0.72502773, 0.72995567, 0.73462558, 0.73895161,
       0.74319612, 0.74735328, 0.75131267, 0.75499127, 0.75859269,
       0.76201976, 0.76534245, 0.76853719, 0.77165096, 0.77470539,
       0.77765917, 0.78052295, 0.78334896, 0.78613007, 0.78875611,
       0.79137791, 0.79396715, 0.79648294, 0.79895304, 0.8014019 ,
       0.80378863, 0.80613203, 0.80845572, 0.81073905, 0.81301542,
       0.81525388, 0.81744664, 0.81959648, 0.82171636, 0.82382083,
       0.82589077, 0.82794408, 0.8299785 , 0.83199218, 0.83396154,
       0.83591345, 0.83785891, 0.8397871 , 0.84168492, 0.84356368,
       0.84542494, 0.84727367, 0.84911079, 0.85092497])

Cross validation to train the model and return out-of-sample correlation.

In [7]:
from sklearn.model_selection import ShuffleSplit
from sklearn.linear_model import LinearRegression
import critic

train_actual = train_np_normalize
print(train_actual.shape)

cv = ShuffleSplit(n_splits=10, test_size=0.25)
reg = LinearRegression()
validation_score = []
for train_idx, test_idx in cv.split(train_actual, label_np):
    X = np.take(train_actual, train_idx, axis=0)
    Y = np.take(label_np, train_idx, axis=0)
    X_test = np.take(train_actual, test_idx, axis=0)
    Y_test = np.take(label_np, test_idx, axis=0)
    reg.fit(X, Y)
    
    # calculate validation score
    Y_hat = reg.predict(X_test)
    score = critic.overall_corr(Y_hat, Y_test)
    validation_score.append(score)

print(validation_score)
print("avg=", np.mean(validation_score))


(183, 28800)
[0.9935358183521849, 0.9927509619370319, 0.9886858325213485, 0.9928644276983495, 0.991944139851397, 0.9923916198449841, 0.9937421503207934, 0.9938208153389676, 0.9944544820070387, 0.9911745209345547]
avg= 0.992536476880665


In [8]:
# save regressor weights
params = np.hstack([reg.intercept_[:,np.newaxis], reg.coef_]).T
print(params.shape)
np.savetxt("./linreg_norm.txt", params.astype(np.single), delimiter=',')

(28801, 10)


In [9]:
# save pca components
print(pca.components_.shape)
np.savetxt("./pca.txt", pca.components_.astype(np.single), delimiter=',')

(64, 28800)


### Training Summary

| Feature | Preprocess | Cross Validation Score | Weights Order |
|---------|------------|------------------------|---------------|
| log price + volume (n_samples, 10 * 1440 * 2) | log over volumes | avg = 0.9925 | e-11 |
| log price + volume (n_samples, 10 * 1440 * 2) | log over volumes and standardize | avg = 0.9983 | e-11 |
| log price + volume (n_samples, 10 * 1440 * 2) | log over volumes and normalize | avg = 0.9928 | e-7 |
| log price + volume (n_samples, 10 * 1440 * 2) | log over volumes, normalize, and pca 128 | a,vg = 0.9312 | e-6 |
| log price + volume (n_samples, 10 * 1440 * 2) | log over volumes, standardize, and pca 64 | a,vg = 0.9976 | e-6 / e-10 for pca |

### Load the model and run

### Submission function

In [None]:
# original
import numpy as np
import pandas as pd

def get_r_hat(A, B): 
    """
        A: 1440-by-10 dataframe of log prices with columns log_pr_0, ... , log_pr_9
        B: 1440-by-10 dataframe of trading volumes with columns volu_0, ... , volu_9    
        return: a numpy array of length 10, corresponding to the predictions for the forward 30-minutes returns of assets 0, 1, 2, ..., 9
    """
    
    data = pd.concat([A, B], axis=1).values.ravel()[np.newaxis, :]
    weights = np.loadtxt("./linreg.txt", delimiter=',')
    pred = weights[0] + (data @ weights[1:]).squeeze()
    return A.iloc[-1].values - pred # Use the negative 30-minutes backward log-returns to predict the 30-minutes forward log-returns
    

In [34]:
# standardize + pca
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

def get_r_hat(A, B):
    """
        A: 1440-by-10 dataframe of log prices with columns log_pr_0, ... , log_pr_9
        B: 1440-by-10 dataframe of trading volumes with columns volu_0, ... , volu_9    
        return: a numpy array of length 10, corresponding to the predictions for the forward 30-minutes returns of assets 0, 1, 2, ..., 9
    """
    A_np, B_np = A.values, B.values
    data = np.concatenate([A_np, np.log(B_np)], axis=0).reshape(1, -1)
    reg_weights = np.loadtxt("./linreg.txt", delimiter=',')
    pca_weights = np.loadtxt("./pca.txt", delimiter=',')
    data = StandardScaler().fit_transform(data)
    data = data @ pca_weights.T
    predict = (data @ reg_weights[1:]).squeeze() + reg_weights[0]
    return predict

get_r_hat(log_pr.iloc[-1440:], volume.iloc[-1440:])

array([ 0.09780466, -0.19130926,  0.44984528,  0.21735089, -0.07590324,
       -0.40594572, -0.24346416, -0.08908863, -0.23921368, -0.27030188])

In [10]:
# normalize
import numpy as np
import pandas as pd
from sklearn.preprocessing import Normalizer

def get_r_hat(A, B):
    """
        A: 1440-by-10 dataframe of log prices with columns log_pr_0, ... , log_pr_9
        B: 1440-by-10 dataframe of trading volumes with columns volu_0, ... , volu_9    
        return: a numpy array of length 10, corresponding to the predictions for the forward 30-minutes returns of assets 0, 1, 2, ..., 9
    """
    A_np, B_np = A.values, B.values
    data = np.concatenate([A_np, np.log(B_np)], axis=0).reshape(1, -1)
    data = Normalizer().fit_transform(data)
    reg_weights = np.loadtxt("./linreg_norm.txt", delimiter=',')
    predict = (data @ reg_weights[1:]).squeeze() + reg_weights[0]
    return predict

get_r_hat(log_pr.iloc[-1440:], volume.iloc[-1440:])

array([ 0.76917557, -0.5449981 ,  1.92371925,  0.32897562,  0.01220115,
       -0.92214199,  0.06372874, -0.1868889 ,  0.4834458 ,  0.51131822])

### Simulate OJ

In [3]:
import main
import time
import datetime

t0 = time.time()
dt = datetime.timedelta(days=1) - datetime.timedelta(minutes=1)
r_hat = pd.DataFrame(index=log_pr.index[1440::10], columns=np.arange(10), dtype=np.float64)
# print(r_hat.index)
for t in log_pr.index[1440::10]:  # compute the predictions every 10 minutes
    # print(log_pr_test.loc[(t - dt):t].shape, volume_test.loc[(t - dt):t].shape)
    r_hat.loc[t, :] = main.get_r_hat(log_pr.loc[(t - dt):t], volume.loc[(t - dt):t])
t_used = time.time() - t0
# print(t_used)

r_fwd = (log_pr.shift(-30) - log_pr).iloc[1440::10].rename(columns={f"log_pr_{i}": i for i in range(10)})
# print(r_fwd, r_hat)
r_fwd.corrwith(r_hat)

0    0.006556
1   -0.010191
2   -0.006406
3   -0.009474
4   -0.006198
5    0.001535
6   -0.002478
7   -0.006178
8    0.000166
9    0.004295
dtype: float64