### Train and save the model

In [1]:
import numpy as np
import pandas as pd

In [2]:
import datetime
import time
from dataset import DataSet

dataset = DataSet()
log_pr_train, volu_train = dataset.train_set
log_pr_test, volu_test = dataset.test_set

In [3]:
train_np, label_np = dataset.train_set_form
label_np -= train_np[:,0,-1,:]
train_np = train_np[:,0,-50:,:]
train_np = np.concatenate([train_np, train_np[:,1:,:] - train_np[:,:-1,:]], axis=1)

train_np.shape, label_np.shape

((17709, 99, 10), (17709, 10))

In [None]:
# standardize each feature
def standardize(train_np):
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler()
    return scaler.fit_transform(train_np)

train_np_standardize = standardize(train_np)

In [None]:
# normalize each feature
def normalize(train_np):
    from sklearn.preprocessing import Normalizer

    norm = Normalizer()
    return norm.fit_transform(train_np)

train_np_normalize = normalize(train_np)
train_np_normalize.min(), train_np_normalize.max()

(-0.00046333403184483756, 0.01358961048728508)

In [None]:
# extract principal components
def pca_after_normalize(train_np):
    from sklearn.decomposition import PCA 

    pca = PCA(n_components=16, svd_solver="full")
    pca.fit(standardize(train_np))
    return pca.transform(train_np), pca

train_np_pca, pca = pca_after_normalize(train_np)
np.cumsum(pca.explained_variance_ratio_)

Cross validation to train the model and return out-of-sample correlation.

In [None]:
from sklearn.model_selection import ShuffleSplit
from sklearn.linear_model import LinearRegression
import critic
from sklearn.kernel_ridge import KernelRidge

train_actual = train_np
print(train_actual.shape)

cv = ShuffleSplit(n_splits=10, test_size=0.4)
reg = LinearRegression()
# reg = KernelRidge(kernel="poly", gamma=0.5, degree=2, alpha=15)
validation_score = []
for train_idx, test_idx in cv.split(train_actual, label_np):
    X = np.take(train_actual, train_idx, axis=0)
    Y = np.take(label_np, train_idx, axis=0)
    X_test = np.take(train_actual, test_idx, axis=0)
    Y_test = np.take(label_np, test_idx, axis=0)
    reg.fit(X, Y)
    
    # calculate validation score
    Y_hat = reg.predict(X_test)
    score = critic.overall_corr(Y_hat, Y_test)
    validation_score.append(score)

print(validation_score)
print("avg=", np.mean(validation_score))


(17709, 14390)


In [6]:
from sklearn.kernel_ridge import KernelRidge

reg = KernelRidge(kernel="poly", degree=2, alpha=1)
reg.fit(train_np.reshape(len(train_np), -1), label_np)

KernelRidge(degree=2, kernel='poly')

In [8]:
import critic
critic.overall_corr(label_np, reg.predict(train_np.reshape(len(train_np), -1)))

0.06724478607186044

In [9]:
# save regressor weights
import pickle
with open("./kreg.pkl", "wb") as f:
    pickle.dump(reg, f)

In [None]:
# save pca components
with open("./pca.pkl", "wb") as f:
    pickle.dump(pca, f)

### Training Summary

| Feature | Preprocess | Cross Validation Score | Weights Order |
|---------|------------|------------------------|---------------|
| log price + volume (n_samples, 10 * 1440 * 2) | log over volumes | avg = 0.9925 | e-11 |
| log price + volume (n_samples, 10 * 1440 * 2) | log over volumes and standardize | avg = 0.9983 | e-11 |
| log price + volume (n_samples, 10 * 1440 * 2) | log over volumes and normalize | avg = 0.9928 | e-7 |
| log price + volume (n_samples, 10 * 1440 * 2) | log over volumes, normalize, and pca 128 | a,vg = 0.9312 | e-6 |
| log price + volume (n_samples, 10 * 1440 * 2) | log over volumes, standardize, and pca 64 | a,vg = 0.9976 | e-6 / e-10 for pca |

### Load the model and run

### Submission function

In [None]:
# original
import numpy as np
import pandas as pd

def get_r_hat(A, B): 
    """
        A: 1440-by-10 dataframe of log prices with columns log_pr_0, ... , log_pr_9
        B: 1440-by-10 dataframe of trading volumes with columns volu_0, ... , volu_9    
        return: a numpy array of length 10, corresponding to the predictions for the forward 30-minutes returns of assets 0, 1, 2, ..., 9
    """
    
    data = pd.concat([A, B], axis=1).values.ravel()[np.newaxis, :]
    weights = np.loadtxt("./linreg.txt", delimiter=',')
    pred = weights[0] + (data @ weights[1:]).squeeze()
    return A.iloc[-1].values - pred # Use the negative 30-minutes backward log-returns to predict the 30-minutes forward log-returns
    

In [None]:
# standardize + pca
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

def get_r_hat(A, B):
    """
        A: 1440-by-10 dataframe of log prices with columns log_pr_0, ... , log_pr_9
        B: 1440-by-10 dataframe of trading volumes with columns volu_0, ... , volu_9    
        return: a numpy array of length 10, corresponding to the predictions for the forward 30-minutes returns of assets 0, 1, 2, ..., 9
    """
    A_np, B_np = A.values, B.values
    data = np.concatenate([A_np, np.log(B_np)], axis=0).reshape(1, -1)
    reg_weights = np.loadtxt("./linreg.txt", delimiter=',')
    pca_weights = np.loadtxt("./pca.txt", delimiter=',')
    data = StandardScaler().fit_transform(data)
    data = data @ pca_weights.T
    predict = (data @ reg_weights[1:]).squeeze() + reg_weights[0]
    return predict

get_r_hat(log_pr.iloc[-1440:], volume.iloc[-1440:])

In [None]:
# normalize
import numpy as np
import pandas as pd
from sklearn.preprocessing import Normalizer

reg_weights = np.loadtxt("./linreg_norm.txt", delimiter=',')

def get_r_hat(A, B):
    """
        A: 1440-by-10 dataframe of log prices with columns log_pr_0, ... , log_pr_9
        B: 1440-by-10 dataframe of trading volumes with columns volu_0, ... , volu_9    
        return: a numpy array of length 10, corresponding to the predictions for the forward 30-minutes returns of assets 0, 1, 2, ..., 9
    """
    A_np, B_np = A.values, B.values
    data = np.concatenate([A_np, np.log(B_np + 1)], axis=0).reshape(1, -1)
    data = Normalizer().fit_transform(data)
    predict = (data @ reg_weights[1:]).squeeze() + reg_weights[0]
    return predict


In [None]:
# pca regression
import pandas as pd
import numpy as np
import pickle

with open("./pca.pkl", "rb") as f:
    pca = pickle.load(f)

with open("./pcareg.pkl", "rb") as f:
    model = pickle.load(f)

def get_r_hat(A, B):
    A_np, B_np = A.values, B.values
    data = np.concatenate([A_np, np.log(B_np + 1)], axis=0).reshape(1, -1)
    data = pca.transform(data)
    predict = model.predict(data)
    return predict - A.values[-1]

In [None]:
# kernel regression
import numpy as np
import pandas as pd
import pickle

with open("./kreg.pkl", "rb") as f:
    reg = pickle.load(f)

def get_r_hat(A, B):
    A_np, B_np = A.values, B.values
    data = np.concatenate([A_np, np.log(B_np + 1)], axis=0).reshape(1, -1)
    predict = reg.predict(data)
    return predict - A.values[-1]

### Simulate OJ

In [None]:
# linreg without subtraction
import main 
import time
import datetime
import pandas as pd
import critic
from dataset import DataSet

dataset = DataSet()
log_pr_train, volu_train = dataset.train_set
log_pr_test, volu_test = dataset.test_set

t, p, o = critic.corr_score(main.get_r_hat, log_pr_test, volu_test)
print(t, p, o)
print(p.mean())

5.619262218475342 0    0.009722
1   -0.016240
2   -0.023683
3   -0.014074
4   -0.015898
5   -0.045118
6   -0.012909
7   -0.015110
8    0.001181
9    0.005255
dtype: float64 0.007725527309844091
-0.012687476911780512


In [7]:
# linreg with subtraction
import main
import time
import datetime
import pandas as pd
import critic
from dataset import DataSet

dataset = DataSet()
log_pr_train, volu_train = dataset.train_set
log_pr_test, volu_test = dataset.test_set

t, p, o = critic.corr_score(main.get_r_hat, log_pr_test, volu_test)
print(t, p, o)
print(p.mean())

2.735478401184082 0    0.001788
1   -0.006726
2    0.016865
3    0.017322
4    0.033277
5   -0.002937
6   -0.002388
7    0.012442
8    0.013796
9    0.012024
dtype: float64 -0.00611569740869358
0.0095463289686462


In [10]:
# kernel regression
import main
import time
import datetime
import pandas as pd
import critic
from dataset import DataSet

dataset = DataSet()
log_pr_train, volu_train = dataset.train_set
log_pr_test, volu_test = dataset.test_set

t, p, o = critic.corr_score(main.get_r_hat, log_pr_test, volu_test)
print(t, p, o)
print(p.mean())

187.53665018081665 0   -0.005227
1    0.022567
2    0.035439
3    0.020911
4    0.021831
5    0.004640
6    0.018181
7    0.025009
8    0.025577
9    0.029023
dtype: float64 -0.00713382036470401
0.019795031887557265


In [7]:
# kernel regression
import main
import time
import datetime
import pandas as pd
import critic
from dataset import DataSet

dataset = DataSet()
log_pr_train, volu_train = dataset.train_set
log_pr_test, volu_test = dataset.test_set

t, p, o = critic.corr_score(main.get_r_hat, log_pr_test, volu_test)
print(t, p, o)
print(p.mean())

2.994023084640503 0    0.001788
1   -0.006726
2    0.016865
3    0.017322
4    0.033277
5   -0.002937
6   -0.002388
7    0.012442
8    0.013796
9    0.012024
dtype: float64 -0.00611569740869358
0.0095463289686462


In [None]:
# pca regression
import main
import time
import datetime
import pandas as pd
import critic
from dataset import DataSet

dataset = DataSet()
log_pr_train, volu_train = dataset.train_set
log_pr_test, volu_test = dataset.test_set

t, p, o = critic.corr_score(main.get_r_hat, log_pr_test, volu_test)
print(t, p, o)
print(p.mean())

5.30824875831604 0    0.005728
1    0.000533
2   -0.011091
3   -0.015957
4    0.000167
5   -0.038348
6    0.006341
7    0.018579
8   -0.009087
9   -0.011001
dtype: float64 -0.012074797529834466
-0.005413531429876319


### Some feature engineering

In [None]:
import numpy as np

# test correlation between volume and price
log_pr_train, volu_train = dataset.train_set
np.corrcoef(log_pr_train.values.ravel(), volu_train.shift(-1).values.ravel()), log_pr_train.corrwith(volu_train)

(array([[ 1., nan],
        [nan, nan]]),
 0    0.094592
 1    0.088388
 2    0.406681
 3    0.171382
 4    0.036516
 5    0.328695
 6    0.339551
 7    0.013728
 8    0.070581
 9    0.076392
 dtype: float64)

In [None]:
X, y = dataset.train_set_form

In [None]:
X = X.reshape(len(X), -1, 10)

In [None]:
X.shape

(17709, 2880, 10)

In [None]:
for a in range(10):
    for f in range(2880):
        cor = np.corrcoef(X[:,f,a], y[:,5])[0,1]
        if cor > 0.5:
            print("feature %d, asset %d: cor = %.4f" % (f, a, cor))

feature 0, asset 5: cor = 0.9698
feature 1, asset 5: cor = 0.9699
feature 2, asset 5: cor = 0.9699
feature 3, asset 5: cor = 0.9699
feature 4, asset 5: cor = 0.9699
feature 5, asset 5: cor = 0.9700
feature 6, asset 5: cor = 0.9700
feature 7, asset 5: cor = 0.9700
feature 8, asset 5: cor = 0.9700
feature 9, asset 5: cor = 0.9700
feature 10, asset 5: cor = 0.9701
feature 11, asset 5: cor = 0.9701
feature 12, asset 5: cor = 0.9701
feature 13, asset 5: cor = 0.9701
feature 14, asset 5: cor = 0.9702
feature 15, asset 5: cor = 0.9702
feature 16, asset 5: cor = 0.9702
feature 17, asset 5: cor = 0.9702
feature 18, asset 5: cor = 0.9702
feature 19, asset 5: cor = 0.9703
feature 20, asset 5: cor = 0.9703
feature 21, asset 5: cor = 0.9703
feature 22, asset 5: cor = 0.9703
feature 23, asset 5: cor = 0.9703
feature 24, asset 5: cor = 0.9704
feature 25, asset 5: cor = 0.9704
feature 26, asset 5: cor = 0.9704
feature 27, asset 5: cor = 0.9705
feature 28, asset 5: cor = 0.9704
feature 29, asset 5: cor

In [None]:
y.shape

(17709, 10)

In [2]:
import numpy as np
a = np.random.rand(10,3)
b = np.random.rand(10,3)
a, b

(array([[0.25749826, 0.54229204, 0.10030242],
        [0.37251887, 0.74790041, 0.32971172],
        [0.15829173, 0.35297618, 0.52887478],
        [0.34529629, 0.71582873, 0.37678417],
        [0.52631867, 0.7965577 , 0.84675771],
        [0.0059341 , 0.79860863, 0.32063279],
        [0.27183965, 0.56173354, 0.84828254],
        [0.99602569, 0.4784562 , 0.96333257],
        [0.3610556 , 0.36814544, 0.93253799],
        [0.12916841, 0.53997391, 0.62649916]]),
 array([[0.538391  , 0.05611216, 0.22882594],
        [0.29462557, 0.68127254, 0.47308179],
        [0.54187467, 0.27219685, 0.11506879],
        [0.15649079, 0.46605557, 0.49083987],
        [0.28743797, 0.84162562, 0.88319107],
        [0.19620918, 0.69022949, 0.54364517],
        [0.25406651, 0.69827133, 0.26678178],
        [0.95568198, 0.81093281, 0.81173416],
        [0.494931  , 0.89365039, 0.15733942],
        [0.58644325, 0.19594696, 0.51618281]]))

In [1]:
import ojsim
import main
sim = ojsim.OJSimulator()
sim.submit(main.get_r_hat)

100%|██████████| 8496/8496 [04:08<00:00, 34.15it/s]

Total time used: 248.791s
Pairwise correlation:
	asset 0 = -0.00130
	asset 1 = 0.06496
	asset 2 = -0.02842
	asset 3 = 0.02031
	asset 4 = 0.04824
	asset 5 = 0.05920
	asset 6 = 0.00168
	asset 7 = 0.02035
	asset 8 = 0.05620
	asset 9 = 0.01902
	mean correlation = 0.02602
Overall correlation: 0.00232
Fail to outperform Ziwei's method, whose pairwise average
and overall correlations are (0.02840, 0.01536)



