In [1]:
import ojsim
sim = ojsim.OJSimulator()
X, Y = sim.formulized_train

In [2]:
import numpy as np
import pandas as pd

In [3]:
# construct the huge dim input dataset
def transform(X):
    prices = X[:,0,-30:,:]
    price_diff = (X[:,0,1:,:] - X[:,0,:-1,:])[:,-30:,:]
    volu = np.log(X[:,1,:,:] + 1)[:,-30:,:]
    volu_diff = (np.log(X[:,1,1:,:] + 1) - np.log(X[:,1,:-1,:] + 1))[:,-30:,:]
    X_train = np.concatenate([prices, price_diff, volu, volu_diff], axis=1)
    return X_train

X_train = transform(X)
print(X_train.shape)
N, _, _ = X_train.shape

(17709, 120, 10)


In [4]:
y = Y - X[:,0,-1,:]

In [27]:
from sklearn.feature_selection import mutual_info_regression, SelectKBest
# SELECT with mutual information
def select_feature_mi(X_train, y):
    X_train_fs_mi = []
    support = []
    for i in range(10):
        fs = SelectKBest(score_func=mutual_info_regression, k='all')
        fs.fit(X_train, y[:,i])
#         X_train_fs_mi.append(fs.transform(X_train[:,:,i]))
        scores = fs.scores_
        support.append(np.where(scores > scores.max()*0.6)[0])
#         support.append(fs.get_support(True))
    return X_train_fs_mi, support

X_train_fs_mi, support_fs_mi = select_feature_mi(X_train.reshape(N,-1), y)

In [28]:
for i in range(10):
    print(len(support_fs_mi[i]))

91
91
224
81
164
156
71
143
141
211


In [29]:
support_fs_mi

[array([  0,   9,  10,  15,  19,  20,  29,  30,  39,  40,  49,  50,  59,
         60,  67,  69,  70,  75,  79,  80,  89,  90,  99, 100, 109, 110,
        119, 120, 125, 129, 130, 139, 140, 149, 150, 155, 159, 160, 165,
        169, 170, 173, 179, 180, 185, 189, 190, 199, 200, 209, 210, 219,
        220, 229, 230, 235, 239, 240, 249, 250, 259, 260, 265, 269, 270,
        275, 279, 280, 285, 289, 290, 295, 299, 600, 620, 630, 680, 730,
        740, 750, 760, 780, 790, 820, 830, 840, 850, 860, 870, 880, 890],
       dtype=int64),
 array([  1,   2,   5,  12,  13,  15,  22,  32,  42,  45,  52,  53,  55,
         61,  62,  65,  72,  75,  81,  82,  85,  92,  93,  95, 101, 102,
        112, 122, 131, 132, 142, 152, 162, 171, 172, 181, 182, 183, 192,
        202, 203, 212, 213, 221, 231, 232, 233, 241, 243, 251, 252, 261,
        262, 271, 272, 281, 282, 285, 291, 292, 293, 295, 601, 621, 631,
        641, 651, 661, 671, 681, 691, 701, 711, 721, 731, 741, 751, 761,
        771, 781, 791, 801, 8

In [16]:
X_train_mi = []
for i in range(10):
    X_train_mi.append(X_train.reshape(N,-1)[:,support_fs_mi[i]])

In [17]:
from sklearn.linear_model import LinearRegression

In [18]:
model = []
for i in range(10):
    model.append(LinearRegression().fit(X_train_mi[i], y[:,1]))

In [19]:
for i in range(10):
    print(model[i].score(X_train_mi[i], y[:,i]))

-0.06641847368299247
0.04564438725398867
-0.028311121785702165
-0.07941915076674166
-0.10097990038088556
-0.12185504401360037
-0.10025770742181583
-0.16632727832835714
-0.27149989241041395
-0.24592610265318426


In [25]:
def get_r_hat(A, B):
        X = np.stack([A.values, B.values], axis=0)
        X_test = transform(X[np.newaxis,:])
        X_test_list = []
        for i in range(10):
            X_test_list.append(X_test.reshape(1,-1)[:,support_fs_mi[i]])
        pred = [model[i].predict(X_test_list[i]) for i in range(10)]

        return pred

In [26]:
sim.submit(get_r_hat)

Total time used: 9.189s
Pairwise correlation:
	asset 0 = 0.01583
	asset 1 = 0.01372
	asset 2 = -0.01378
	asset 3 = 0.01667
	asset 4 = -0.02284
	asset 5 = 0.01818
	asset 6 = 0.02600
	asset 7 = -0.00319
	asset 8 = -0.07827
	asset 9 = -0.04128
	mean correlation = -0.00690
Overall correlation: -0.00140
Fail to outperform Ziwei's method, whose pairwise average
and overall correlations are (0.02840, 0.01536)


(9.189293146133423,
 0    0.015832
 1    0.013716
 2   -0.013783
 3    0.016675
 4   -0.022840
 5    0.018182
 6    0.025995
 7   -0.003187
 8   -0.078273
 9   -0.041284
 dtype: float64,
 -0.001398935071040119)