In [1]:
from __future__ import annotations

%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path


from wufam.data.prepare_data import read_kf_data
from wufam.dataset import Dataset

PATH = Path("../data/kf_data")
START = "1970-01-01"
END = "2024-12-31"
DATASET = Dataset.BM_25_M
FACTORS_DATASET = Dataset.FACTORS_M
WEIGHTING = "value_weighted"
FACTOR_ANNUALIZE = 12

In [3]:
portfolios_total_r, portfolios_xs_r, factors_df, rf = read_kf_data(
    portfolios_filename=PATH / DATASET,
    factors_filename=PATH / FACTORS_DATASET,
    start_date=START,
    end_date=END,
    weighting=WEIGHTING,
)

In [4]:
assert (
    portfolios_total_r.shape[0]
    == portfolios_xs_r.shape[0]
    == factors_df.shape[0]
    == rf.shape[0]
)

In [5]:
import pandas as pd

mkt_caps = pd.read_csv(
    PATH / DATASET,
    skiprows=3_795,
    skipfooter=8881 - 4_984,
    index_col=0,
    engine="python",
)
mkt_caps.index = pd.to_datetime(mkt_caps.index, format="%Y%m")
mkt_caps = mkt_caps.loc[START:END]
mkt_caps.shape

(660, 25)

In [6]:
bms = pd.read_csv(
    PATH / DATASET,
    skiprows=4_991,
    skipfooter=8881 - 6_181,
    index_col=0,
    engine="python",
)
bms.index = pd.to_datetime(bms.index, format="%Y%m")
bms = bms.loc[START:END]
bms.shape

(660, 25)

In [7]:
ops = pd.read_csv(
    PATH / DATASET,
    skiprows=7_382,
    skipfooter=8881 - 8_128,
    index_col=0,
    engine="python",
)
ops.index = pd.to_datetime(ops.index, format="%Y%m")
ops = ops.loc[START:END]
ops.shape

(660, 25)

In [8]:
invs = pd.read_csv(
    PATH / DATASET,
    skiprows=8_133,
    skipfooter=8881 - 8_878,
    index_col=0,
    engine="python",
)
invs.index = pd.to_datetime(invs.index, format="%Y%m")
invs = invs.loc[START:END]
invs.shape

(660, 25)

In [9]:
stds = portfolios_xs_r.rolling(window=21, min_periods=1).std().fillna(0)
stds.shape

(660, 25)

In [10]:
skews = portfolios_xs_r.rolling(window=21, min_periods=1).skew().fillna(0)
skews.shape

(660, 25)

In [11]:
dfs_dict = {
    "Market_Caps": mkt_caps,
    "Book_to_Market": bms,
    "Operating_Profitability": ops,
    "Investment": invs,
    "Volatility": stds,
    "Skewness": skews,
    "ret": portfolios_xs_r,
}

# Stack each DataFrame to convert columns to index level, then concatenate
stacked_dfs = []
for name, df in dfs_dict.items():
    stacked = df.iloc[2:].stack()
    stacked.name = name
    stacked_dfs.append(stacked)

# Concatenate all stacked series along axis=1 (columns)
multi_df = pd.concat(stacked_dfs, axis=1)

# Set proper index names
multi_df.index.names = ["date", "portfolio"]
multi_df.shape

(16450, 7)

In [12]:
multi_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Market_Caps,Book_to_Market,Operating_Profitability,Investment,Volatility,Skewness,ret
date,portfolio,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1970-03-01,SMALL LoBM,22.04,0.1663,0.3579,0.3172,0.044992,1.432121,-0.069971
1970-03-01,ME1 BM2,20.44,0.3484,0.2411,0.2095,0.045281,1.535252,-0.054491
1970-03-01,ME1 BM3,20.13,0.4780,0.2466,0.1410,0.045593,1.635091,-0.036625
1970-03-01,ME1 BM4,21.12,0.6326,0.2082,0.1251,0.028915,-0.573111,0.002985
1970-03-01,SMALL HiBM,21.55,1.0348,0.1390,0.0839,0.028915,1.304995,-0.018686
...,...,...,...,...,...,...,...,...
2024-12-01,BIG LoBM,214780.60,0.0819,1.0612,0.0731,0.038783,-0.131955,-0.000434
2024-12-01,ME5 BM2,101259.92,0.2535,0.2909,0.0721,0.037065,-0.438801,-0.051943
2024-12-01,ME5 BM3,71025.51,0.4756,0.2402,0.0578,0.043960,-0.510547,-0.093240
2024-12-01,ME5 BM4,107488.01,0.6716,0.2309,0.0637,0.048796,-0.354187,-0.071656


In [13]:
multi_df["ret"]

date        portfolio 
1970-03-01  SMALL LoBM   -0.069971
            ME1 BM2      -0.054491
            ME1 BM3      -0.036625
            ME1 BM4       0.002985
            SMALL HiBM   -0.018686
                            ...   
2024-12-01  BIG LoBM     -0.000434
            ME5 BM2      -0.051943
            ME5 BM3      -0.093240
            ME5 BM4      -0.071656
            BIG HiBM     -0.061114
Name: ret, Length: 16450, dtype: float64

In [14]:
rank = (
    multi_df[multi_df.columns.difference(["ret"])]
    .groupby(level="date")
    .rank(ascending=False, method="max")
)
rank = rank.div(rank.groupby(level="date").count()) - 0.5
rank = rank.fillna(0)

In [15]:
from ipca import InstrumentedPCA

regr = InstrumentedPCA(n_factors=4, intercept=True, max_iter=50)
regr = regr.fit(X=rank, y=multi_df["ret"])
Gamma, Factors = regr.get_factors(label_ind=True)



The panel dimensions are:
n_samples: 658 , L: 6 , T: 25





Step 1 - Aggregate Update: 1.2847604010423386
Step 2 - Aggregate Update: 1.36514200118227
Step 3 - Aggregate Update: 0.21907709322944346
Step 4 - Aggregate Update: 1.366458824226379
Step 5 - Aggregate Update: 0.09880291092363291
Step 6 - Aggregate Update: 0.07511242001883511
Step 7 - Aggregate Update: 0.07118658992799765
Step 8 - Aggregate Update: 0.057724841047955666
Step 9 - Aggregate Update: 0.03534980091807044
Step 10 - Aggregate Update: 0.026754696817294185
Step 11 - Aggregate Update: 0.02461605594176744
Step 12 - Aggregate Update: 0.022463640286327927
Step 13 - Aggregate Update: 1.7404326972868307
Step 14 - Aggregate Update: 0.018858982162414772
Step 15 - Aggregate Update: 0.01747994266951533
Step 16 - Aggregate Update: 0.01635784671516466
Step 17 - Aggregate Update: 0.015458241497643604
Step 18 - Aggregate Update: 0.014746706882474259
Step 19 - Aggregate Update: 0.014192070731775391
Step 20 - Aggregate Update: 0.01376712708548461
Step 21 - Aggregate Update: 1.6634135710294
Step 

In [16]:
Gamma.shape, Factors.shape

((6, 5), (5, 25))

In [17]:
rank.shape

(16450, 6)

In [18]:
pred = regr.predict(rank)
pred = pd.DataFrame(pred, index=rank.index, columns=["pred"])
pred

Unnamed: 0_level_0,Unnamed: 1_level_0,pred
date,portfolio,Unnamed: 2_level_1
1970-03-01,SMALL LoBM,0.017713
1970-03-01,ME1 BM2,0.017270
1970-03-01,ME1 BM3,0.007028
1970-03-01,ME1 BM4,0.006030
1970-03-01,SMALL HiBM,0.010823
...,...,...
2024-12-01,BIG LoBM,0.006349
2024-12-01,ME5 BM2,0.001357
2024-12-01,ME5 BM3,0.009417
2024-12-01,ME5 BM4,0.006350


In [19]:
pred = pd.pivot_table(pred, index="date", columns="portfolio", values="pred")
pred

portfolio,BIG HiBM,BIG LoBM,ME1 BM2,ME1 BM3,ME1 BM4,ME2 BM1,ME2 BM2,ME2 BM3,ME2 BM4,ME2 BM5,...,ME4 BM1,ME4 BM2,ME4 BM3,ME4 BM4,ME4 BM5,ME5 BM2,ME5 BM3,ME5 BM4,SMALL HiBM,SMALL LoBM
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1970-03-01,0.001243,0.003379,0.017270,0.007028,0.006030,0.011882,0.014626,0.003270,0.007356,0.012834,...,0.011460,-0.000156,0.004460,0.006706,0.004687,0.006328,0.001021,0.002154,0.010823,0.017713
1970-04-01,-0.003427,0.002322,0.010786,0.004093,0.003416,0.001268,-0.000207,-0.002423,0.008490,0.010474,...,0.010966,0.000727,0.006957,0.006454,0.007438,0.007489,0.001487,0.003915,0.004818,0.000938
1970-05-01,-0.004816,0.001186,0.010835,0.003230,0.004598,-0.008176,0.004657,-0.002360,0.006524,0.010790,...,0.011154,-0.000053,0.008100,0.006196,0.006801,0.007356,0.001453,0.003698,0.006500,-0.003325
1970-06-01,-0.003257,0.001186,0.010885,0.003888,0.004497,-0.008120,0.010015,-0.002267,0.005755,0.011220,...,0.008019,-0.000637,0.008993,0.005687,0.007362,0.007213,0.001901,0.003810,0.006383,-0.003325
1970-07-01,0.004339,0.005609,0.014523,0.002402,0.007168,-0.005824,0.012092,0.007926,0.007181,0.014678,...,0.005460,0.007496,0.010106,0.011083,0.008981,0.008221,0.000282,0.006475,0.000400,-0.007908
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-08-01,0.008803,0.005856,0.007925,0.013457,0.007585,0.009508,0.004359,0.011609,0.006302,0.005655,...,0.009538,-0.001861,0.009378,0.001571,0.004041,0.003645,0.009766,0.005263,0.014548,0.021703
2024-09-01,0.008919,0.006670,0.007542,0.011400,0.007573,0.008449,0.008938,0.016316,0.008888,0.006115,...,0.009264,-0.001926,0.009428,0.000530,0.003013,0.002081,0.009117,0.004692,0.014171,0.016242
2024-10-01,0.010032,0.006670,0.007587,0.012060,0.008260,0.009453,0.011037,0.013139,0.007907,0.004524,...,0.011218,-0.001481,0.008682,0.000578,0.004041,0.001587,0.009327,0.005097,0.014907,0.018552
2024-11-01,0.010122,0.007806,0.008892,0.008837,0.008260,0.008449,0.007668,0.014054,0.006880,0.006053,...,0.009029,-0.001416,0.007692,0.000578,0.003345,0.001617,0.009327,0.005502,0.015275,0.018552


In [20]:
from wufam.ap.ipca_factor_model import IPCAFactorModel

ipca = IPCAFactorModel(n_factors=4, fit_alpha=True)
ipca.fit(test_assets_xs_r=multi_df["ret"], ranks=rank)
ipca.predict(rank)



The panel dimensions are:
n_samples: 658 , L: 6 , T: 25
Step 1 - Aggregate Update: 1.2847604010423386
Step 2 - Aggregate Update: 1.36514200118227
Step 3 - Aggregate Update: 0.21907709322944346
Step 4 - Aggregate Update: 1.366458824226379
Step 5 - Aggregate Update: 0.09880291092363291
Step 6 - Aggregate Update: 0.07511242001883511
Step 7 - Aggregate Update: 0.07118658992799765
Step 8 - Aggregate Update: 0.057724841047955666
Step 9 - Aggregate Update: 0.03534980091807044
Step 10 - Aggregate Update: 0.026754696817294185
Step 11 - Aggregate Update: 0.02461605594176744
Step 12 - Aggregate Update: 0.022463640286327927
Step 13 - Aggregate Update: 1.7404326972868307
Step 14 - Aggregate Update: 0.018858982162414772
Step 15 - Aggregate Update: 0.01747994266951533
Step 16 - Aggregate Update: 0.01635784671516466
Step 17 - Aggregate Update: 0.015458241497643604
Step 18 - Aggregate Update: 0.014746706882474259
Step 19 - Aggregate Update: 0.014192070731775391
Step 20 - Aggregate Update: 0.01376712708




portfolio,BIG HiBM,BIG LoBM,ME1 BM2,ME1 BM3,ME1 BM4,ME2 BM1,ME2 BM2,ME2 BM3,ME2 BM4,ME2 BM5,...,ME4 BM1,ME4 BM2,ME4 BM3,ME4 BM4,ME4 BM5,ME5 BM2,ME5 BM3,ME5 BM4,SMALL HiBM,SMALL LoBM
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1970-03-01,0.001243,0.003379,0.017270,0.007028,0.006030,0.011882,0.014626,0.003270,0.007356,0.012834,...,0.011460,-0.000156,0.004460,0.006706,0.004687,0.006328,0.001021,0.002154,0.010823,0.017713
1970-04-01,-0.003427,0.002322,0.010786,0.004093,0.003416,0.001268,-0.000207,-0.002423,0.008490,0.010474,...,0.010966,0.000727,0.006957,0.006454,0.007438,0.007489,0.001487,0.003915,0.004818,0.000938
1970-05-01,-0.004816,0.001186,0.010835,0.003230,0.004598,-0.008176,0.004657,-0.002360,0.006524,0.010790,...,0.011154,-0.000053,0.008100,0.006196,0.006801,0.007356,0.001453,0.003698,0.006500,-0.003325
1970-06-01,-0.003257,0.001186,0.010885,0.003888,0.004497,-0.008120,0.010015,-0.002267,0.005755,0.011220,...,0.008019,-0.000637,0.008993,0.005687,0.007362,0.007213,0.001901,0.003810,0.006383,-0.003325
1970-07-01,0.004339,0.005609,0.014523,0.002402,0.007168,-0.005824,0.012092,0.007926,0.007181,0.014678,...,0.005460,0.007496,0.010106,0.011083,0.008981,0.008221,0.000282,0.006475,0.000400,-0.007908
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-08-01,0.008803,0.005856,0.007925,0.013457,0.007585,0.009508,0.004359,0.011609,0.006302,0.005655,...,0.009538,-0.001861,0.009378,0.001571,0.004041,0.003645,0.009766,0.005263,0.014548,0.021703
2024-09-01,0.008919,0.006670,0.007542,0.011400,0.007573,0.008449,0.008938,0.016316,0.008888,0.006115,...,0.009264,-0.001926,0.009428,0.000530,0.003013,0.002081,0.009117,0.004692,0.014171,0.016242
2024-10-01,0.010032,0.006670,0.007587,0.012060,0.008260,0.009453,0.011037,0.013139,0.007907,0.004524,...,0.011218,-0.001481,0.008682,0.000578,0.004041,0.001587,0.009327,0.005097,0.014907,0.018552
2024-11-01,0.010122,0.007806,0.008892,0.008837,0.008260,0.008449,0.007668,0.014054,0.006880,0.006053,...,0.009029,-0.001416,0.007692,0.000578,0.003345,0.001617,0.009327,0.005502,0.015275,0.018552


In [21]:
ret_true = pd.pivot_table(
    multi_df[["ret"]], index="date", columns="portfolio", values="ret"
)
ret_true.shape

(658, 25)

In [22]:
ipca.r2_score(ret_true, rank)

np.float64(0.9182703336342888)

In [23]:
split_date = portfolios_xs_r.index[len(portfolios_xs_r) // 2]

In [24]:
ipca = IPCAFactorModel(n_factors=3, fit_alpha=False)
ipca.fit(
    test_assets_xs_r=multi_df["ret"].loc[:split_date, :],
    ranks=rank.loc[:split_date, :, :],
)
ret_true = pd.pivot_table(
    multi_df[["ret"]].loc[split_date:, :],
    index="date",
    columns="portfolio",
    values="ret",
)
ipca.r2_score(ret_true, rank.loc[split_date:, :])



The panel dimensions are:
n_samples: 329 , L: 6 , T: 25





Step 1 - Aggregate Update: 1.095359530447005
Step 2 - Aggregate Update: 0.28980346190252215
Step 3 - Aggregate Update: 1.3358570868321888
Step 4 - Aggregate Update: 0.09903500584279235
Step 5 - Aggregate Update: 0.05689990391999439
Step 6 - Aggregate Update: 0.036658580602998814
Step 7 - Aggregate Update: 0.027505289447774167
Step 8 - Aggregate Update: 0.0242133581807078
Step 9 - Aggregate Update: 0.022577111027432428
Step 10 - Aggregate Update: 0.020301507627301596
Step 11 - Aggregate Update: 0.017709627171586917
Step 12 - Aggregate Update: 0.015133671116374314
Step 13 - Aggregate Update: 0.012803817328389644
Step 14 - Aggregate Update: 0.010828848834165461
Step 15 - Aggregate Update: 0.009227228591187908
Step 16 - Aggregate Update: 0.007966828375264412
Step 17 - Aggregate Update: 0.006994868603832749
Step 18 - Aggregate Update: 0.006255518782886882
Step 19 - Aggregate Update: 0.005698337093698214
Step 20 - Aggregate Update: 0.00528125446762151
Step 21 - Aggregate Update: 0.0049707759

np.float64(-17.961536510765416)

In [46]:
ipca_total_r = ipca.get_mv_weights(ret_true, rank.loc[split_date:, :])

In [47]:
from wufam.metrics.metrics import calc_sharpe

calc_sharpe(
    strategy_total_r=ipca_total_r,
    rf_rate=rf,
    factor_annualize=FACTOR_ANNUALIZE,
)

-45.46230770109122

In [58]:
# from tqdm import tqdm
#
# r2s = []
# for date in portfolios_xs_r.resample("ME").last().index:
#     if date >= pd.Timestamp("2000-01-01"):
#         ipca = IPCAFactorModel(n_factors=3, fit_alpha=True)
#         ipca.fit(test_assets_xs_r=multi_df["ret"].loc[:date, :], ranks=rank.loc[:date, :, :])
#         ret_true = pd.pivot_table(multi_df[["ret"]].loc[:date, :], index="date", columns="portfolio", values="ret")
#         r2s.append(ipca.r2_score(ret_true.loc[date:], rank.loc[date:, :]))