In [2]:
%matplotlib inline

from IPython.core.display import display, HTML
from scipy import stats
import pandas as pd
import numpy as np
from datetime import datetime as dt

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from sklearn.datasets import *

# Number of principal components requested from the PCA model fitted below.
N_FACTORS=5

In [4]:
%run TrainValidTestDataPrep.ipynb
print("Train Set Normality Test")
for c in data_train.columns:
    _stats = stats.jarque_bera(data_train)
    if np.abs(_stats[1] > 0.01): print("%s,%s"%(c, _stats))
print("Valid Set Normality Test")
for c in data_valid.columns:
    _stats = stats.jarque_bera(data_valid[c])
    if np.abs(_stats[1] > 0.01): print("%s,%s"%(c, _stats))
print("Test  Set Normality Test")
for c in data_test.columns:
    _stats = stats.jarque_bera(data_test[c])
    if np.abs(_stats[1] > 0.01): print("%s,%s"%(c, _stats))


Train Set Normality Test
Valid Set Normality Test
0001.HK,(6.064224749785349, 0.048213685104578796)
0857.HK,(8.452645800870712, 0.014605999639945466)
Test  Set Normality Test
0011.HK,(3.2554994473743633, 0.19637096624740646)
0083.HK,(6.814479085838839, 0.03313253519634285)
0386.HK,(1.5497081805806185, 0.4607710070582528)
1928.HK,(4.337978009576364, 0.11429310830561656)
3328.HK,(1.1835132218824163, 0.5533544020994401)
3988.HK,(0.18692085297707353, 0.9107740592407177)
Train Set Normality Test
Valid Set Normality Test
0001.HK,(6.064224749785349, 0.048213685104578796)
0857.HK,(8.452645800870712, 0.014605999639945466)
Test  Set Normality Test
0011.HK,(3.2554994473743633, 0.19637096624740646)
0083.HK,(6.814479085838839, 0.03313253519634285)
0386.HK,(1.5497081805806185, 0.4607710070582528)
1928.HK,(4.337978009576364, 0.11429310830561656)
3328.HK,(1.1835132218824163, 0.5533544020994401)
3988.HK,(0.18692085297707353, 0.9107740592407177)


# SciKitLearn's PCA at prototype stage

In the future this will be hand-coded with numpy, since that gives us more control.

In [5]:
# Fit a PCA with N_FACTORS components on the training set. The trailing
# expression evaluates to the fitted estimator, whose repr is the cell output.
model0 = PCA(n_components=N_FACTORS)
model0.fit(data_train)

PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [23]:
# Sanity check of the fitted model: the number of components requested and the
# shape of the covariance matrix implied by the model.
print("n_components: " + str(model0.n_components))
print("model covar: " + str(model0.get_covariance().shape))

n_components: 5
model covar: (50, 50)


# Change benchmark: encoding accuracy to encoding forecast accuracy

Assume we can use any data up to the test_data cutoff(s). I'm using data_valid just because it's the closest to data_test in time.

Although I am interested in the sparse encoding of the return space to describe what has happened over the last hour, days, etc. (and can use this as input into the forecast), at the end of the day I really care about the future.

We are assuming in this PCA model case that we can model the whole market as 5 iid gaussian shocks + 50 individual shocks. I want to test that the covar matrix we produce is the best forecast of the forward covar matrix. And since we can split that into two separate forecasts: the correlations and the marginals, I'm really most interested in the accuracy and stability of the correlation matrix. 

In [39]:
# Project the validation set onto the principal components of model0.
# NOTE(review): despite its name, data_train_encoded holds the encoding of
# data_valid; the name is kept because later cells reference it.
data_train_encoded = pd.DataFrame(model0.transform(data_valid))
# Report the shape of the frame created above. The previous version printed
# data_test_encoded, which is not assigned anywhere before this cell.
print("encoded shape: " + str(data_train_encoded.shape))

encoded shape: (1141, 5)


In [40]:
# Sample correlation and covariance of the encoded factor series.
# The correlation matrix is left as the last expression so it is displayed.
corr_factor0 = data_train_encoded.corr()
cov_factor0 = data_train_encoded.cov()
corr_factor0

Unnamed: 0,0,1,2,3,4
0,1.0,0.115503,0.190654,0.165122,-0.203549
1,0.115503,1.0,-0.101404,0.122249,0.118201
2,0.190654,-0.101404,1.0,-0.111239,-0.069077
3,0.165122,0.122249,-0.111239,1.0,0.014165
4,-0.203549,0.118201,-0.069077,0.014165,1.0


In [95]:
# Covariance attributable to the common factors, written out explicitly as
# exposures^T * factor covariance * exposures for illustration.
factor_exposures0 = model0.components_
cov_from_common_factors0 = pd.DataFrame(
    np.dot(np.dot(factor_exposures0.T, cov_factor0), factor_exposures0))

# Cross-check: the covariance of the series reconstructed from the factors
# should match the matrix above. The comparison now uses
# cov_from_common_factors0 (the name defined in this cell); the previous
# version referenced cov_from_common_factors, which this cell never assigns.
np.isclose(
    pd.DataFrame(model0.inverse_transform(data_train_encoded)).cov(),
    cov_from_common_factors0,
).all()

True

In [120]:
# Correlation comes from cross-sectional (common factor) effects only, so one
# way to build it is to rescale the common-factor covariance by the marginal
# standard deviations of the validation sample.
var_marginals0 = np.diagonal(data_valid.cov())
stdev_marginals0 = np.sqrt(var_marginals0)
corr0 = cov_from_common_factors0 / np.outer(stdev_marginals0,stdev_marginals0)

# Force a unit diagonal on an explicit copy and rebuild the frame. Writing
# through `corr0.values` only works when pandas happens to hand back a
# writable view, and fails once `.values` is read-only (Copy-on-Write).
_corr0_values = corr0.values.copy()
np.fill_diagonal(_corr0_values, 1.0)
corr0 = pd.DataFrame(_corr0_values, index=corr0.index, columns=corr0.columns)

In [128]:
# Specific variance: the part of each validation-sample variance that is left
# after removing the diagonal of the common-factor covariance. Kept for later use.
_valid_var = np.diagonal(data_valid.cov())
_common_var = np.diagonal(cov_from_common_factors0)
var_specific0 = _valid_var - _common_var

# Per-column breakdown, indexed by the validation set's column labels.
var_breakdown = pd.DataFrame({
    'marginal': var_marginals0,
    'specific': var_specific0,
    'pct': var_specific0 / var_marginals0,
})
var_breakdown = var_breakdown.set_index(data_valid.columns)

# Show the rows whose specific variance comes out negative.
_negative_specific = var_breakdown['pct'] < 0.0
var_breakdown.loc[_negative_specific]

Unnamed: 0,marginal,specific,pct
2018.HK,7e-05,-5e-06,-0.076968


# Testing

todo: this needs a lot of work. I'm low on time so just mindlessly banging this out for now.

The end goal is to be able to forecast the covariance matrix (for optimization, simulation, etc.), so I am sure there are better metrics than the ones I have below. For example, the likelihood of this covariance structure producing the data?

In [142]:
# Sample covariance, marginal variances and correlation of the test set.
covar_test = data_test.cov()
var_marginals_test = np.diagonal(covar_test)
corr_test = data_test.corr()

# Rough error scores for the PCA-based estimates: the SUM of absolute
# differences against the test-sample values. The labels now say what is
# actually computed (they previously read "mse"); the numbers are unchanged.
# TODO: replace with a better-founded metric.
print("sum abs err corr: {}".format(np.abs(corr_test.values - corr0.values).sum()))
print("sum abs err marginals: {}".format(np.abs(var_marginals_test - var_marginals0).sum()))

mse corr: 394.6182294790748
mse marginals: 0.000454458558671855


In [144]:
# Baseline for comparison: the plain sample correlation / variances of the
# validation set. This should really be a separate model.
corr_baseline = data_valid.corr()
var_marginals_baseline = np.diagonal(data_valid.cov()) # in this case same as pca

# Same rough score as used for the PCA estimates: the SUM of absolute
# differences against the test-sample values. The labels now say what is
# actually computed (they previously read "mse"); the numbers are unchanged.
print("sum abs err corr: {}".format(np.abs(corr_test.values - corr_baseline.values).sum()))
print("sum abs err marginals: {}".format(np.abs(var_marginals_test - var_marginals_baseline).sum()))

mse corr: 318.079674650662
mse marginals: 0.000454458558671855


In [164]:
# Inspection: compare the '0001.HK' column of the test-sample, baseline and
# PCA-based correlation matrices side by side.

# Relabel the PCA-based matrix with the same labels as corr_test so that its
# columns can be selected by ticker.
df_corr0 = pd.DataFrame(corr0).set_index(corr_test.index)
df_corr0.columns = df_corr0.index

pd.DataFrame({'test sample corr':corr_test['0001.HK'],'baseline':corr_baseline['0001.HK'],'pca0':df_corr0['0001.HK']})

Unnamed: 0,test sample corr,baseline,pca0
0001.HK,1.0,1.0,1.0
0002.HK,0.217412,0.160217,0.262418
0003.HK,0.146921,0.299431,0.351168
0005.HK,0.143474,0.46961,0.403459
0006.HK,0.261169,0.352192,0.337022
0011.HK,0.227127,0.472645,0.426893
0012.HK,0.277419,0.495431,0.401665
0016.HK,0.286999,0.464531,0.45396
0017.HK,0.300431,0.507741,0.508973
0019.HK,0.165106,0.441108,0.397571


# More todo: 

Let's come back to the encoding reconstruction test. I need to think about this a little more.

It is certainly useful for comparing the models, but, as I am explaining as we go along, I'm really most interested in which of these will give us the most useful correlation and var model for the future, rather than which better describes the future (after it has happened).


In [7]:
# Encode the test set with the fitted PCA, then map the factors back to the
# original return space.
# NOTE(review): data_test_encoded was used here without being assigned anywhere
# in the visible notebook; presumably it was model0.transform(data_test) --
# confirm.
data_test_encoded = model0.transform(data_test)
data_test_reconst = model0.inverse_transform(data_test_encoded)
# TODO: reconstruction error, e.g.
#   error_p0 = mean_squared_error(data_test_reconst, data_test)

array([[ 0.00667647,  0.01836518,  0.00308754, -0.00454558, -0.0062883 ],
       [ 0.0113555 ,  0.01830825, -0.00388114, -0.00300052, -0.00108511],
       [ 0.01897593,  0.01470508, -0.00817796, -0.00025326, -0.00533718],
       ...,
       [ 0.00673279,  0.00253529,  0.00045948, -0.00736422,  0.00071269],
       [ 0.02088822, -0.00491727,  0.00835285, -0.00333194, -0.00184507],
       [ 0.0248835 , -0.00156089,  0.00964574, -0.00334135, -0.01034274]])

In [6]:
# Covariance and correlation of the encoded test-set factors.
# NOTE(review): corr0 is also the name of the model-implied correlation matrix
# built in an earlier cell; running this cell replaces it.
_encoded_test_frame = pd.DataFrame(data_test_encoded)
cov0 = _encoded_test_frame.cov()
corr0 = _encoded_test_frame.corr()
for _matrix in (cov0, corr0):
    print(_matrix)

          0         1         2         3         4
0  0.000679 -0.000072  0.000075  0.000007 -0.000079
1 -0.000072  0.000085 -0.000030 -0.000012  0.000015
2  0.000075 -0.000030  0.000085  0.000002 -0.000012
3  0.000007 -0.000012  0.000002  0.000049 -0.000003
4 -0.000079  0.000015 -0.000012 -0.000003  0.000042
          0         1         2         3         4
0  1.000000 -0.300542  0.312821  0.037054 -0.465941
1 -0.300542  1.000000 -0.351472 -0.189661  0.247041
2  0.312821 -0.351472  1.000000  0.023385 -0.198348
3  0.037054 -0.189661  0.023385  1.000000 -0.057644
4 -0.465941  0.247041 -0.198348 -0.057644  1.000000
