In [21]:
import multiprocessing as mp
import pickle

import numpy as np
import pandas as pd
from scipy.linalg import inv

from ipca import InstrumentedPCA
from ipca_utils import impute_w_median, IPCA_factor

In [2]:
# Load the panel from disk.
df_ipca = pd.read_pickle("data/kelly_data_without_nanocap.p")
print("====== Finished loading data ======")

# Every column from position 10 onward is treated as a characteristic.
characteristics = df_ipca.columns[10:]

# Order rows chronologically and renumber the index; the sorted frame is
# rebound to the same name instead of being mutated in place.
df_ipca = df_ipca.sort_values(by='eom', ignore_index=True)

window_size = 240  # number of consecutive dates in one estimation window
K = 6  # number of principal components (latent factors)
unique_dates = sorted(df_ipca['eom'].unique())  # distinct dates, ascending
T = len(unique_dates)



In [3]:
# Position in unique_dates of the date to predict; the next cell slices the
# window_size dates that precede this position to form the estimation window.
t = 450

In [4]:
# A negative slice start would wrap around the end of unique_dates and
# silently yield a wrong (usually empty) window, so fail loudly instead.
if t < window_size:
    raise ValueError(
        f"t={t} leaves fewer than window_size={window_size} dates of history"
    )

# The window is the window_size dates immediately before position t;
# the date at position t itself is the one being predicted.
window_dates = unique_dates[t - window_size:t]
mask = df_ipca['eom'].isin(window_dates)
window_data = df_ipca[mask]
date_to_predict = unique_dates[t]
date_to_predict

datetime.date(1999, 7, 31)

In [5]:
# Split the window: rows on the most recent date are held out (indexed by id),
# rows on every earlier date form the training set.
last_date = max(window_data['eom'].values)
# set_index returns a new frame, so the boolean-mask slice of window_data is
# never modified in place.
last_win_data = window_data[window_data['eom'] == last_date].set_index('id')
train_data = window_data[window_data['eom'] != last_date]

# Keep only characteristics whose share of missing values in the training
# rows does not exceed nan_threshold.
nan_threshold = 0.05
chars_to_keep = [
    char for char in characteristics
    if train_data[char].isna().mean() <= nan_threshold
]
r_t = last_win_data['ret_local_lead1m']
excess_r_t = last_win_data['ret_exc_lead1m']
X_last = last_win_data[chars_to_keep]

# Drop training rows whose label (lead 1m return) is missing, then index by
# (id, eom) -- the panel layout the IPCA module expects, per the original
# author's note.
train_data = (
    train_data
    .dropna(subset=["ret_local_lead1m"])
    .set_index(['id', 'eom'])
)
y = train_data['ret_local_lead1m']  # lead return
X = train_data[chars_to_keep]

# Impute missing characteristics with per-column medians computed once from
# the training rows only. The held-out date is filled with those same training
# medians, so nothing from the held-out cross-section enters the imputation.
train_medians = X.median()
X = X.fillna(train_medians)
X_last = X_last.fillna(train_medians)

# Fit IPCA with K factors and no intercept, then pull out the loading matrix
# (Gamma) and the factor realizations (Factors) as labelled frames.
regr = InstrumentedPCA(n_factors=K, intercept=False, max_iter=400, iter_tol=1e-4)
regr = regr.fit(X=X, y=y, quiet=True)
Gamma, Factors = regr.get_factors(label_ind=True)

[=====                                                                   ]   7%

The panel dimensions are:
n_samples: 9186 , L: 9 , T: 239




In [32]:
# V_t = (Gamma' X' X Gamma)^{-1} Gamma' X' with X = X_last. Writing
# Z = X Gamma, this is the least-squares operator (Z'Z)^{-1} Z'.
# Z is formed once, and the small square system is solved directly rather than
# multiplying by an explicit inverse, which is the numerically safer route.
Z_last = X_last @ Gamma
V_t = pd.DataFrame(
    np.linalg.solve((Z_last.T @ Z_last).to_numpy(), Z_last.T.to_numpy()),
    columns=Z_last.index,  # one column per id on the held-out date
)
V_t

id,106441201,106303001,106670801,102870101,100846301,106441501,101531601,101295001,101016301,101692601,...,100947301,106160101,100280301,102195901,102921101,101189601,102533401,106315701,102960601,111262401
0,0.0001170714,-0.000194,0.000553,-0.000232,-5.8e-05,0.000506,-0.0002650491,0.000215,9e-06,-0.000104,...,0.000335,0.0003447976,-0.000254,0.000873,5.2e-05,-0.000129,0.000289,0.000531,0.000386,0.001216
1,-3.269866e-06,-0.000143,-0.000864,0.000314,-0.000172,-0.000447,-0.0002941751,-0.000494,0.000284,0.000222,...,-7e-06,-0.0002550521,0.000156,-0.00065,1.2e-05,0.000176,-0.000301,-0.000642,-0.000262,1.1e-05
2,-0.0001468776,0.000512,0.000817,-0.000253,0.000388,0.000133,0.0008810652,0.000607,-0.000533,-0.000264,...,-0.00043,2.777777e-05,5.9e-05,2.5e-05,-8.7e-05,-0.00015,0.000167,0.000454,-3.7e-05,-0.001632
3,-4.101348e-05,0.000146,0.000325,-0.000125,9.7e-05,0.000113,0.0002284175,0.000192,-0.000109,-7.3e-05,...,-9e-05,-7.527318e-05,-2.9e-05,5.9e-05,-2.8e-05,-4.3e-05,4.2e-05,0.000188,2.1e-05,-0.00034
4,-5.451729e-06,-6e-06,-4.3e-05,-3.5e-05,-3.3e-05,-2.5e-05,-5.568086e-07,6e-06,-4.6e-05,0.000104,...,-2.8e-05,-1.547969e-05,-1.6e-05,-3.7e-05,7.3e-05,5.2e-05,-2e-06,2.6e-05,2e-05,-2.4e-05
5,-9.686929e-07,-2e-06,6e-06,-2.1e-05,-1e-05,2e-06,-2.124788e-06,2e-06,-2e-05,3e-06,...,-8e-06,2.539535e-07,-1.6e-05,4e-06,4e-06,-3e-06,2e-06,1e-05,4e-06,3e-06


In [22]:
# Bundle the prediction date, the fitted factor frame and the V_t matrix into
# one list so they can be serialized together.
to_save = [date_to_predict, Factors, V_t]

In [25]:
# Serialize the to_save bundle to temp.pickle in the working directory
# (an existing file of that name is overwritten).
with open('temp.pickle', 'wb') as handle:
    pickle.dump(to_save, handle)

In [26]:
# Read temp.pickle back into `b` -- presumably a round-trip check of the dump.
# pickle.load can execute arbitrary code, so only use it on files this
# notebook wrote itself.
with open('temp.pickle', 'rb') as handle:
    b = pickle.load(handle)

In [174]:
# Display the date currently held in date_to_predict.
# NOTE(review): the recorded output here differs from the one recorded in the
# cell that assigns date_to_predict, so the cells were presumably executed
# with different state -- confirm with Restart Kernel -> Run All.
date_to_predict

datetime.date(2012, 1, 31)

In [31]:
# Number of CPUs reported for this machine (multiprocessing is imported with
# the other modules in the notebook's import cell).
cpu_count = mp.cpu_count()
cpu_count

20

In [15]:
# Load the same pickle that df_ipca was read from, as a separate frame;
# unlike df_ipca, this copy is not re-sorted by 'eom'.
data = pd.read_pickle("data/kelly_data_without_nanocap.p")

In [16]:
# Preview the loaded panel; the display is truncated to the first and last
# rows and columns.
data

Unnamed: 0,id,eom,isin,cusip,sedol,excntry,ret_exc_lead1m,ret_local_lead1m,ret_local,ret_exc,...,rmax5_rvol_21d,ni_be,ocf_at,ocf_at_chg1,mispricing_perf,mispricing_mgmt,qmj,qmj_prof,qmj_growth,qmj_safety
0,100100001,1970-09-30,,,,USA,0.059230,0.063830,,,...,,0.179708,-0.388130,-0.166415,,,,,,
1,100100001,1970-10-31,,,,USA,-0.284600,-0.280000,0.063830,0.059230,...,,0.179708,-0.388130,-0.166415,0.557225,0.007569,-0.131257,-0.242119,1.131001,-1.038514
2,100100001,1970-11-30,,,,USA,0.106911,0.111111,-0.280000,-0.284600,...,,0.179708,-0.388130,-0.166415,0.560355,0.007408,-0.116091,-0.236741,1.141193,-1.028642
3,100100001,1970-12-31,,,,USA,-0.103800,-0.100000,0.111111,0.106911,...,,0.179708,-0.388130,-0.166415,0.560992,0.006983,-0.125102,-0.246816,1.134773,-1.030821
4,100100001,1971-01-31,,,,USA,0.232811,0.236111,-0.100000,-0.103800,...,,0.179708,-0.388130,-0.166415,0.564567,0.006521,-0.088173,-0.228488,1.159477,-1.024155
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2472336,135226201,2023-10-31,USG2415A1138,G2415A113,,USA,-0.041766,-0.037066,-0.064982,-0.069682,...,1.017932,0.265316,0.096348,-0.021851,0.593672,0.095065,,1.132598,,-0.943145
2472337,135226201,2023-11-30,USG2415A1138,G2415A113,,USA,0.051181,0.055581,-0.037066,-0.041466,...,0.900174,0.265316,0.096348,-0.021851,0.592866,0.094784,,1.128163,,-0.945400
2472338,135226201,2023-12-31,USG2415A1138,G2415A113,,USA,-0.067193,-0.062893,0.055581,0.051281,...,1.205799,0.265316,0.096348,-0.021851,0.593251,0.094770,,1.127290,,-0.946629
2472339,135226201,2024-01-31,USG2415A1138,G2415A113,,USA,-0.048324,-0.043624,-0.062893,-0.067593,...,0.689607,0.263314,0.100292,-0.021851,0.586455,0.098558,,1.156739,,-0.894204


In [3]:
# Element at index 3 of a 10-point grid that is evenly spaced in log space
# between 1e-6 and 5 (both endpoints included).
np.exp(np.linspace( np.log(1e-6),np.log(5),10))[3]

0.0001709975946676699