In [1]:
import numpy as np
import pandas as pd
import gc
from copy import copy
from tqdm import trange
from datetime import datetime
import warnings
from sklearn.metrics.pairwise import pairwise_kernels
from AssetPricing import factor_model, backtest, kernel_methods

warnings.filterwarnings('ignore', message = 'Unused variable')

In [2]:
# Load the raw long-format panel; CSV column 0 becomes the frame index, which the
# following cells use as the per-date key (`pd.unique(data.index)`, `data.loc[key]`).
# NOTE(review): absolute, machine-specific path -- the notebook only runs where
# D:/cha1.csv exists; consider a configurable data directory.
data = pd.read_csv('D:/cha1.csv',index_col=0)
# Alternative source kept from an earlier run (note it used column 1 as the index):
# data = pd.read_csv('D:/project/data/data.csv', index_col = 1)

In [3]:
# Distinct index labels of `data`, in ascending order (one entry per date key).
date = sorted(pd.unique(data.index))

# Wide return panel: rows ordered like `date`, one column per PERMNO,
# cells with no observation filled with 0.
ret = (
    data
    .pivot(columns = 'PERMNO', values = 'RET')
    .fillna(0)
    .loc[date]
)

In [4]:
# Build {date key -> cross-section of characteristics}, one frame per date.
characteristics = {}
for t in trange(len(date)):
    key = date[t]
    # Rows for this date, re-indexed by PERMNO; the first remaining column is dropped.
    raw = data.loc[key].set_index('PERMNO').iloc[:, 1:]
    # Column-wise rank scaled by the non-missing count and shifted: 2 * rank / count - 1.
    ranked = 2 * raw.rank() / raw.count() - 1
    # Rows whose return on this date is exactly 0 (the fill value used when `ret`
    # was pivoted) are blanked out and then removed if nothing is left in the row.
    day_ret = ret.loc[key, ranked.index]
    zero_ret_ids = day_ret[day_ret == 0].index
    ranked.loc[zero_ret_ids] = np.nan

    characteristics[key] = ranked.dropna(how = 'all')

100%|████████████████████████████████████████████████████████████████████████████████| 623/623 [00:25<00:00, 24.59it/s]


In [5]:
def saving_lag_characteristics(characteristics, ret, lag, saving_path = 'D:/cha'):
    """Pickle one characteristics frame per date, optionally joined with lagged frames.

    Parameters
    ----------
    characteristics : dict
        Maps a date key to a DataFrame of characteristics indexed by asset id.
        Dates are processed in the dict's iteration order.
    ret : DataFrame
        Returns indexed by date key with one column per asset id. A return that
        is exactly 0 on a date marks the asset's row for removal on that date.
    lag : sequence of int, or None
        Offsets (in positions of the date list) whose frames are merged onto the
        current one. ``None`` and an empty sequence both mean "no lags".
    saving_path : str
        Directory that receives ``<date key>.pkl`` files. It must already exist.

    Returns
    -------
    list
        The date keys that were written, i.e. all keys from position
        ``max(lag)`` (or 0 when there are no lags) onwards.
    """
    # Normalise once. The original test `len(lag) == 0 or lag is None == 0` called
    # len() before the None check (TypeError for None) and its chained comparison
    # `lag is None == 0` could never be True.
    lags = [] if lag is None else list(lag)
    date = list(characteristics.keys())
    starting_date = max(lags) if lags else 0

    for t in trange(starting_date, len(date)):
        key = date[t]
        # Work on a copy so the NaN masking below never writes into the caller's frames.
        cha = characteristics[key].copy()
        for s in lags:
            lagged = characteristics[date[t - s]]
            # how='right' keeps the asset ids of the lagged frame.
            # NOTE(review): overlapping column names get pandas' default merge
            # suffixes; with three or more lags those suffixed names may collide.
            cha = cha.merge(lagged, how = 'right', left_index = True, right_index = True)
        rt = ret.loc[key, cha.index]
        cha.loc[rt[rt == 0].index] = np.nan
        cha = cha.dropna(how = 'all')

        cha.to_pickle(saving_path + '/{}.pkl'.format(key))

    return date[starting_date:]

In [6]:
# Write every date's characteristics frame to disk with no lagged columns ([]);
# the returned list holds the date keys that were saved.
available_date = saving_lag_characteristics(characteristics, ret, [])

100%|███████████████████████████████████████████████████████████████████████████████| 623/623 [00:01<00:00, 396.06it/s]


In [7]:
# Exact zeros (the fill value from the pivot) are treated as missing so they do not
# enter the row statistics; each row is then z-scored and the gaps are set back to 0.
ret = ret.mask(ret == 0)
ret = (
    ret
    .sub(ret.mean(axis = 1), axis = 0)
    .div(ret.std(axis = 1), axis = 0)
    .fillna(0)
)

In [53]:
class kernel_ic_maximizer:
    """Cross-sectional return prediction with a centred-kernel ridge solve.

    Characteristics are read per date from ``<reading_path>/<date key>.pkl`` and
    each prediction panel is pickled into ``prediction_saving_path``. Both
    directories are fixed in ``__init__`` and must already exist.
    """

    def __init__(self, date, ret, starting_date):
        """Store the inputs.

        date          : list of date keys that have a pickled characteristics frame.
        ret           : DataFrame of returns, rows = date keys, columns = asset ids.
        starting_date : first date key (must be in `date`) to predict for.
        """
        self.date = date
        self.ret = ret
        self.starting_date = starting_date
        self.reading_path = 'D:/cha'
        self.prediction_saving_path = 'D:/kernel_prediction'

    @staticmethod
    def _double_center(gram_matrix):
        """Subtract column means and row means of a Gram matrix and add back its grand mean."""
        n_rows = gram_matrix.shape[0]
        return (
            gram_matrix
            - gram_matrix.mean(axis = 0)
            - gram_matrix.mean(axis = 1).reshape(n_rows, 1)
            + gram_matrix.mean()
        )

    def ic_maximizer(self, exog, endog, new_exog, lamb = 1e-3, metric = 'linear', gamma = None, degree = None, coef0 = None):
        """Fit on (exog, endog) and return predictions for new_exog.

        Computes ``K2 (K1 + lamb * N1 * I)^-1 endog`` where K1 is the centred Gram
        matrix of `exog` with itself, K2 the centred Gram matrix of `new_exog`
        against `exog`, and N1 the number of rows of `exog`. Each Gram matrix is
        centred with its own row/column means.

        `metric`, `gamma`, `degree` and `coef0` are forwarded to
        ``sklearn.metrics.pairwise.pairwise_kernels`` with ``filter_params=True``.

        Returns a 1-D array with one prediction per row of `new_exog`.
        """
        N1 = exog.shape[0]
        kernel_kwargs = dict(
            metric = metric, filter_params = True,
            gamma = gamma, degree = degree, coef0 = coef0
        )
        gram_matrix1 = self._double_center(pairwise_kernels(exog, exog, **kernel_kwargs))
        gram_matrix2 = self._double_center(pairwise_kernels(new_exog, exog, **kernel_kwargs))

        # Solve the regularised system for the dual weights instead of forming the
        # explicit inverse: same result up to rounding, cheaper and better conditioned.
        dual_weights = np.linalg.solve(
            gram_matrix1 + lamb * N1 * np.eye(N1),
            np.asarray(endog, dtype = float)
        )
        pred = gram_matrix2.dot(dual_weights)

        # The Gram matrices can be large; release them before the next date is loaded.
        del gram_matrix1, gram_matrix2
        gc.collect()

        return pred

    def get_prediction_of_lag_s(self, s = 1, lamb = 1e-3, metric = 'linear', gamma = None, degree = None, coef0 = None):
        """Predict every out-of-sample date from the cross-section `s` positions earlier.

        For each date position t from ``max(s, index of starting_date)`` onwards the
        model is fitted on the characteristics and returns of ``date[t - s]`` and
        applied to the characteristics of ``date[t]``. Cells that never receive a
        prediction stay 0 in the saved panel.

        Prints the mean row-wise correlation between predictions and ``self.ret``
        (zero cells excluded) and pickles the prediction panel under a file name
        built from the hyper-parameters. Returns None.
        """
        prediction = 0 * self.ret

        oos_starting_date = self.date.index(self.starting_date)
        for t in trange(max(s, oos_starting_date), len(self.date)):
            key1 = self.date[t]
            key2 = self.date[t - s]
            new_exog = pd.read_pickle(self.reading_path + '/{}.pkl'.format(key1))
            new_available_list = list(new_exog.index)
            exog = pd.read_pickle(self.reading_path + '/{}.pkl'.format(key2))
            available_list = list(exog.index)
            endog = self.ret.loc[key2, available_list]
            prediction.loc[key1, new_available_list] = self.ic_maximizer(
                exog, endog, new_exog, lamb, metric, gamma, degree, coef0
            )

        # Zeros are the "no prediction" placeholder; drop them from the IC.
        pre = prediction.mask(prediction == 0)
        # Use the returns held by this instance: the previous code read a
        # notebook-global `ret`, which only worked by accident of kernel state.
        ic = pre.corrwith(self.ret.loc[pre.index, pre.columns], axis = 1).mean()
        print(ic)

        saving_name = 's = {}, lamb = {}, metric = {}'.format(s, lamb, metric)
        if gamma is not None:
            saving_name = saving_name + ', gamma = {}'.format(gamma)
        if degree is not None:
            saving_name = saving_name + ', degree = {}'.format(degree)
        if coef0 is not None:
            saving_name = saving_name + ', coef0 = {}'.format(coef0)
        saving_name = saving_name + '.pkl'
        prediction.to_pickle(self.prediction_saving_path + '/' + saving_name)

In [54]:
# Predictor over all saved dates, starting out-of-sample at the 121st date (index 120).
# NOTE(review): the instance is bound to the name `self`, which shadows the usual
# method-receiver name and makes it easy for method bodies pasted into cells to pick
# up notebook globals by accident; a descriptive name would be safer.
self = kernel_ic_maximizer(
    date = available_date,
    ret = ret,
    starting_date = date[120]
)

In [10]:
# Single run: positional arguments are s = 1, lamb = 1e-4, metric = 'rbf', gamma = 1e-6.
# Prints the mean IC and pickles the prediction panel.
self.get_prediction_of_lag_s(1, 1e-4, 'rbf', 1e-6)

100%|████████████████████████████████████████████████████████████████████████████████| 503/503 [05:34<00:00,  1.51it/s]


0.01777039114237008


In [50]:
# Sweep the lag s = 1..11 with lamb = 1e-3 and an RBF kernel (gamma = 1e-4).
# Each call prints its mean IC and saves one prediction file, which the next
# cell reads back by file name.
for s in range(1, 12):
    print('s = {}'.format(s))
    self.get_prediction_of_lag_s(s, 1e-3, 'rbf', 1e-4)

s = 1


100%|████████████████████████████████████████████████████████████████████████████████| 503/503 [05:53<00:00,  1.42it/s]


0.016940740076247345
s = 2


100%|████████████████████████████████████████████████████████████████████████████████| 503/503 [05:52<00:00,  1.43it/s]


-0.02391770091043785
s = 3


100%|████████████████████████████████████████████████████████████████████████████████| 503/503 [05:52<00:00,  1.43it/s]


0.019577132048731103
s = 4


100%|████████████████████████████████████████████████████████████████████████████████| 503/503 [05:52<00:00,  1.43it/s]


0.022268380249086894
s = 5


100%|████████████████████████████████████████████████████████████████████████████████| 503/503 [05:52<00:00,  1.43it/s]


0.013349688289982828
s = 6


100%|████████████████████████████████████████████████████████████████████████████████| 503/503 [05:52<00:00,  1.43it/s]


-0.020278216387728046
s = 7


100%|████████████████████████████████████████████████████████████████████████████████| 503/503 [05:52<00:00,  1.43it/s]


0.00338556368853025
s = 8


100%|████████████████████████████████████████████████████████████████████████████████| 503/503 [05:52<00:00,  1.43it/s]


0.02025332430476163
s = 9


100%|████████████████████████████████████████████████████████████████████████████████| 503/503 [05:52<00:00,  1.43it/s]


0.021849587665853962
s = 10


100%|████████████████████████████████████████████████████████████████████████████████| 503/503 [05:52<00:00,  1.43it/s]


-0.01730361314496968
s = 11


100%|████████████████████████████████████████████████████████████████████████████████| 503/503 [05:53<00:00,  1.42it/s]


-0.014773885928850023


In [51]:
# Add up the saved prediction panels for lags 1..11 (lamb = 0.001, rbf, gamma = 0.0001)
# into a single combined panel; sum() starts from 0 exactly like the explicit loop did.
prediction = sum(
    pd.read_pickle('D:/kernel_prediction/s = {}, lamb = 0.001, metric = rbf, gamma = 0.0001.pkl'.format(i))
    for i in trange(1, 12)
)

100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 25.50it/s]


In [52]:
# Cells that are exactly 0 in the combined panel are excluded from the correlation;
# the displayed value is the mean over dates of the row-wise correlation with `ret`.
prediction = prediction.mask(prediction == 0)
prediction.corrwith(ret.loc[prediction.index, prediction.columns], axis = 1).dropna().mean()

0.02218853678659473

# linear ridge

In [41]:
def ic_oos(mod, starting_date, characteristics, ret, date_list, estimating_interval=10000, horizon=1):
    """Out-of-sample predictions and mean information coefficient for a fitted factor model.

    Parameters
    ----------
    mod : object
        Fitted model exposing ``factor`` (DataFrame indexed by date key) and
        ``predict(characteristics_frame, mu_hat)``.
    starting_date : date key
        First out-of-sample date; must be an element of `date_list`.
    characteristics : dict
        Maps a date key to a DataFrame of characteristics indexed by asset id.
    ret : DataFrame
        Returns, rows = date keys, columns = asset ids. Rows are sliced by
        position, so they are assumed to follow the order of `date_list`
        (NOTE(review): confirm against the caller).
    date_list : list
        Ordered date keys.
    estimating_interval : int
        Maximum number of dates in the trailing window used to average the factors.
    horizon : int
        Gap between the end of the estimation window and the predicted date; the
        last `horizon` dates are also left out of the IC and of the returned frame.

    Returns
    -------
    DataFrame
        Predictions for positions ``[index(starting_date), len(date_list) - horizon)``.
        Cells without a prediction are 0.

    The mean row-wise correlation between predictions and returns is printed.
    NOTE(review): unlike the kernel pipeline, zero placeholders are *not* masked
    before the correlation here, so the two printed ICs are not directly comparable.
    """
    oos_starting_date = date_list.index(starting_date)
    oos_end = len(date_list) - horizon
    # The set of dates with fitted factors does not change inside the loop.
    fitted_dates = set(mod.factor.index)

    prediction = ret*0
    for t in trange(oos_starting_date, len(date_list)):
        key = date_list[t]
        # Clamp both bounds at 0: a negative start would otherwise wrap around to
        # the end of the list and silently produce an empty window (NaN mu_hat).
        window_stop = max(0, t - horizon)
        window_start = max(0, window_stop - estimating_interval)
        available_date = sorted(set(date_list[window_start:window_stop]) & fitted_dates)
        mu_hat = mod.factor.loc[available_date].mean().values
        prediction.loc[key, characteristics[key].index] = mod.predict(characteristics[key], mu_hat)

    oos_prediction = prediction.iloc[oos_starting_date:oos_end]
    ic = oos_prediction.corrwith(ret.iloc[oos_starting_date:oos_end], axis=1).mean()

    print(ic)

    return oos_prediction

In [42]:
# Linear ridge model from the project's AssetPricing.factor_model module, built on
# the same characteristics and (standardised) returns used above.
# presumably the argument of fit() is the ridge penalty -- TODO confirm in factor_model.
mod1 = factor_model.ridge(characteristics, ret)
mod1.fit(2e-3)

100%|██████████████████████████████████████████████████████████████████████████████| 611/611 [00:00<00:00, 5099.84it/s]


In [43]:
# Out-of-sample evaluation of the ridge model from the same start date (index 120)
# as the kernel runs. The final positional argument (12) is `estimating_interval`;
# `horizon` keeps its default of 1. Prints the mean IC and keeps the predictions.
pre1 = ic_oos(
    mod1,
    date[120],
    characteristics,
    ret,
    date,
    12
)

100%|███████████████████████████████████████████████████████████████████████████████| 491/491 [00:01<00:00, 365.91it/s]


0.05948817832402594
