In [1]:
from __future__ import annotations

%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path


from wufam.data.prepare_data import read_kf_data
from wufam.dataset import Dataset

PATH = Path("../data/kf_data")
START = "1970-01-01"
END = "2024-12-31"
DATASET = Dataset.BM_25_M
FACTORS_DATASET = Dataset.FACTORS_M
WEIGHTING = "value_weighted"
FACTOR_ANNUALIZE = 12

## 1.1

In [3]:
portfolios_total_r, portfolios_xs_r, factors_df, rf = read_kf_data(
    portfolios_filename=PATH / DATASET,
    factors_filename=PATH / FACTORS_DATASET,
    start_date=START,
    end_date=END,
    weighting=WEIGHTING,
)

In [4]:
assert (
    portfolios_total_r.shape[0]
    == portfolios_xs_r.shape[0]
    == factors_df.shape[0]
    == rf.shape[0]
)

## 1.4

In [5]:
from wufam.ap.uncond_factor_model import UncondFactorModel

capm = UncondFactorModel()
capm.fit(
    test_assets_xs_r=portfolios_xs_r,
    factors_df=factors_df[["Mkt-RF"]],
)

In [6]:
capm.grs_stat, capm.p_value

(np.float64(3.9398895265603717), np.float64(8.56728799231513e-10))

Try monthly

In [7]:
ff_model = UncondFactorModel()
ff_model.fit(
    test_assets_xs_r=portfolios_xs_r,
    factors_df=factors_df,
)

In [8]:
ff_model.grs_stat, ff_model.p_value

(np.float64(3.6219917502952184), np.float64(1.2021402021456464e-08))

## 1.5

In [9]:
split_date = portfolios_xs_r.index[len(portfolios_xs_r) // 2]

In [10]:
capm.fit(
    test_assets_xs_r=portfolios_xs_r.loc[:split_date],
    factors_df=factors_df[["Mkt-RF"]].loc[:split_date],
)

In [11]:
capm.rmse_score(
    test_assets_xs_r=portfolios_xs_r.loc[split_date:],
    factors=factors_df[["Mkt-RF"]].loc[split_date:],
)

np.float64(0.0019269915645699978)

In [12]:
capm.r2_score(
    test_assets_xs_r=portfolios_xs_r.loc[split_date:],
    factors=factors_df[["Mkt-RF"]].loc[split_date:],
)

np.float64(-0.729104748466787)

In [13]:
capm.r2_gls_score(
    test_assets_xs_r=portfolios_xs_r.loc[split_date:],
    factors=factors_df[["Mkt-RF"]].loc[split_date:],
)

np.float64(-0.13291418254746712)

In [14]:
ff_model.fit(
    test_assets_xs_r=portfolios_xs_r.loc[:split_date],
    factors_df=factors_df.loc[:split_date],
)

In [15]:
ff_model.rmse_score(
    test_assets_xs_r=portfolios_xs_r.loc[split_date:],
    factors=factors_df.loc[split_date:],
)

np.float64(0.0014985342183559322)

In [16]:
ff_model.r2_score(
    test_assets_xs_r=portfolios_xs_r.loc[split_date:],
    factors=factors_df.loc[split_date:],
)

np.float64(-0.045670858654170754)

In [17]:
ff_model.r2_gls_score(
    test_assets_xs_r=portfolios_xs_r.loc[split_date:],
    factors=factors_df.loc[split_date:],
)

np.float64(-0.12759056892034804)

In [18]:
from sklearn.decomposition import PCA

pca = PCA(n_components=min(portfolios_total_r.shape))
factors = pca.fit_transform(portfolios_total_r)

  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T


In [19]:
factors

array([[-0.33226989,  0.07158516, -0.12901853, ..., -0.01498556,
         0.02075657, -0.01996714],
       [ 0.20068421, -0.10591044,  0.00635917, ...,  0.011145  ,
         0.01001852,  0.02416404],
       [-0.11250434, -0.09943168, -0.03212523, ..., -0.01781561,
        -0.01095796, -0.01695359],
       ...,
       [-0.06587436,  0.0131887 , -0.01671571, ...,  0.00642891,
        -0.00529448,  0.01540042],
       [ 0.44948118, -0.02336834, -0.01710596, ...,  0.02368674,
        -0.01136357,  0.01304012],
       [-0.3891975 ,  0.07035177,  0.00925721, ..., -0.00266637,
        -0.01338434,  0.01805824]], shape=(660, 25))

In [20]:
import numpy as np

signs = np.sign(factors.mean(axis=0))
factors = signs * factors

factors_means = factors.mean(axis=0)
factors_means

array([9.75650537e-18, 1.05134756e-19, 2.16840434e-18, 9.46212805e-20,
       5.83497896e-19, 2.69144976e-18, 6.25551799e-19, 3.72177037e-18,
       3.55881149e-18, 3.04890793e-19, 2.80446962e-18, 5.41443994e-19,
       5.78241159e-20, 2.57580152e-19, 1.57702134e-19, 3.44316326e-19,
       6.20295061e-19, 5.30930518e-19, 7.91139040e-19, 9.63954295e-19,
       5.11217752e-19, 6.87318468e-19, 2.47920897e-18, 9.59354650e-19,
       1.23007665e-18])

In [21]:
np.argsort(factors_means)[::-1]

array([ 0,  7,  8, 10,  5, 22,  2, 24, 19, 23, 18, 21,  6, 16,  4, 11, 17,
       20, 15,  9, 13, 14,  1,  3, 12])

In [22]:
ranked_factors = factors[:, np.argsort(factors_means)[::-1]]
ranked_factors

array([[ 0.33226989, -0.00232503,  0.00319596, ..., -0.07158516,
        -0.02048537, -0.00266963],
       [-0.20068421,  0.01598224,  0.01893698, ...,  0.10591044,
        -0.01290957,  0.01777841],
       [ 0.11250434, -0.00365294, -0.00106442, ...,  0.09943168,
         0.01047337,  0.01308905],
       ...,
       [ 0.06587436, -0.01573703,  0.01501737, ..., -0.0131887 ,
        -0.09152511,  0.01051185],
       [-0.44948118,  0.01482121, -0.00905798, ...,  0.02336834,
        -0.0039906 ,  0.01292477],
       [ 0.3891975 , -0.01823682, -0.01227795, ..., -0.07035177,
        -0.0497618 , -0.02718614]], shape=(660, 25))

In [23]:
factor_vars = factors.var(axis=0)

In [24]:
l1 = 2e-19
l2 = 1e-19
w = (factors_means - l1) / (factor_vars + l2)
w = np.where(factors_means >= l1, w, 0)

In [25]:
selected_factors = factors[:, factors_means >= l1]

In [26]:
selected_factors

array([[ 0.33226989,  0.12901853, -0.01908226, ..., -0.01498556,
         0.02075657, -0.01996714],
       [-0.20068421, -0.00635917,  0.0115841 , ...,  0.011145  ,
         0.01001852,  0.02416404],
       [ 0.11250434,  0.03212523,  0.00191811, ..., -0.01781561,
        -0.01095796, -0.01695359],
       ...,
       [ 0.06587436,  0.01671571, -0.01499943, ...,  0.00642891,
        -0.00529448,  0.01540042],
       [-0.44948118,  0.01710596, -0.04879438, ...,  0.02368674,
        -0.01136357,  0.01304012],
       [ 0.3891975 , -0.00925721,  0.01347987, ..., -0.00266637,
        -0.01338434,  0.01805824]], shape=(660, 21))

In [27]:
from wufam.ap.kns_factor_model import KNSFactorModel

kns_model = KNSFactorModel(l1_penalty=l1, l2_penalty=l2)
kns_model.fit(test_assets_xs_r=portfolios_xs_r)

In [28]:
kns_model.predict()

SMALL LoBM    0.001620
ME1 BM2       0.007076
ME1 BM3       0.007152
ME1 BM4       0.009134
SMALL HiBM    0.010548
ME2 BM1       0.004966
ME2 BM2       0.007720
ME2 BM3       0.008102
ME2 BM4       0.008793
ME2 BM5       0.009545
ME3 BM1       0.005202
ME3 BM2       0.007883
ME3 BM3       0.007283
ME3 BM4       0.008331
ME3 BM5       0.009780
ME4 BM1       0.006522
ME4 BM2       0.006872
ME4 BM3       0.007368
ME4 BM4       0.008327
ME4 BM5       0.008576
BIG LoBM      0.006163
ME5 BM2       0.006402
ME5 BM3       0.006173
ME5 BM4       0.005577
BIG HiBM      0.007438
dtype: float64

In [29]:
kns_model.rmse_score(
    test_assets_xs_r=portfolios_xs_r,
    factors=factors,
)

np.float64(1.1940723679341168e-17)

In [30]:
kns_model.r2_score(
    test_assets_xs_r=portfolios_xs_r,
    factors=factors,
)

np.float64(1.0)

In [31]:
kns_model.r2_gls_score(
    test_assets_xs_r=portfolios_xs_r,
    factors=factors,
)

np.float64(1.0)

In [32]:
kns_model.fit(
    test_assets_xs_r=portfolios_xs_r.loc[:split_date],
)

In [33]:
kns_model.rmse_score(
    test_assets_xs_r=portfolios_xs_r.loc[split_date:],
    factors=factors,
)

np.float64(0.002086667080374555)

In [34]:
kns_model.r2_score(
    test_assets_xs_r=portfolios_xs_r.loc[split_date:],
    factors=factors,
)

np.float64(-1.0275333459162477)

In [35]:
kns_model.r2_gls_score(
    test_assets_xs_r=portfolios_xs_r.loc[split_date:],
    factors=factors,
)

np.float64(0.20885347987640723)

In [36]:
kns_model._betas

Unnamed: 0,Factor_1,Factor_2,Factor_3,Factor_4,Factor_5,Factor_6,Factor_7,Factor_8,Factor_9,Factor_10,...,Factor_16,Factor_17,Factor_18,Factor_19,Factor_20,Factor_21,Factor_22,Factor_23,Factor_24,Factor_25
SMALL LoBM,0.014814,-0.019636,-0.013859,-0.019606,-0.015753,0.002503,-0.014936,-0.017465,-0.001855,-0.006201,...,-0.001289,-0.021825,-0.003785,-0.002471,0.005462,0.013215,-0.028054,-0.033633,0.00201,-0.010891
ME1 BM2,0.013198,-0.017374,-0.003036,-0.01196,-0.010176,0.003674,-0.006708,-0.00257,-0.007439,-0.002476,...,-0.001393,-0.007958,-0.006688,-0.017425,0.002054,0.00651,0.037965,0.016207,0.03116,0.006787
ME1 BM3,0.012271,-0.015125,0.001677,-0.005317,-0.009588,0.007193,0.000969,0.005289,-0.007307,-0.006934,...,-0.005931,0.009406,-0.003165,-0.004379,0.001182,-0.006066,-0.033432,0.039015,-0.000994,0.002662
ME1 BM4,0.011296,-0.013406,0.005709,-0.002542,-0.0093,0.009999,0.002799,0.007677,-0.004307,-0.004392,...,0.010122,0.004868,0.006552,0.00623,-0.006423,-0.005,0.001021,-0.019617,-0.007902,0.03747
SMALL HiBM,0.011892,-0.015635,0.01211,-0.003991,-0.008053,0.016962,0.008854,0.007636,0.001688,0.004627,...,0.014123,0.019636,0.009665,0.00422,-0.007392,-0.006124,0.011725,-0.006611,-0.007944,-0.037401
ME2 BM1,0.014607,-0.010211,-0.021343,-0.00865,0.00166,-0.015075,-0.012204,-0.010517,0.00461,-0.000699,...,-0.01361,-0.004051,0.012651,0.001755,0.009106,-0.013261,0.022522,0.014541,-0.049307,0.003165
ME2 BM2,0.012895,-0.009513,-0.006191,0.001199,-0.000201,-0.006999,-0.00267,0.00359,-0.003634,0.005903,...,-0.015428,-0.004679,-0.019294,0.014679,-0.034252,0.024564,0.000879,-0.000423,-0.005576,-0.007999
ME2 BM3,0.011482,-0.006903,0.001533,0.0058,0.002042,-0.006052,0.004477,0.005949,-4.6e-05,-0.003899,...,-0.00931,0.004424,-0.003003,0.033343,0.0214,-0.003692,0.007262,-0.001336,0.010466,-0.00116
ME2 BM4,0.010572,-0.00417,0.006552,0.011322,-0.00141,0.002826,0.005103,0.005411,0.000775,-0.004778,...,-0.00683,-0.019969,0.022363,-0.00955,0.014188,0.00599,-0.005051,-0.001262,0.003366,-0.007586
ME2 BM5,0.011869,-0.007251,0.012768,0.002973,0.001303,0.004229,0.007743,0.003889,0.006061,0.022735,...,-0.001131,-0.007524,-0.015582,-0.003049,0.012315,-0.00266,0.001219,-5.1e-05,-0.00055,0.008055


In [37]:
kns_model._selection

Index(['Factor_1', 'Factor_2', 'Factor_3', 'Factor_4', 'Factor_5', 'Factor_6',
       'Factor_7', 'Factor_8', 'Factor_9', 'Factor_10', 'Factor_11',
       'Factor_12', 'Factor_13', 'Factor_14', 'Factor_15', 'Factor_16',
       'Factor_17', 'Factor_18', 'Factor_19', 'Factor_20', 'Factor_21',
       'Factor_22', 'Factor_23', 'Factor_24', 'Factor_25'],
      dtype='object')

In [38]:
l1_grid = np.logspace(-4, -1, 20)
for l1 in l1_grid:
    print(l1)
    kns_model = KNSFactorModel(l1_penalty=l1, l2_penalty=0)
    kns_model.fit(test_assets_xs_r=portfolios_xs_r)
    print(len(kns_model._selection))

0.0001
25
0.0001438449888287663
25
0.00020691380811147902
25
0.00029763514416313193
25
0.00042813323987193956
25
0.0006158482110660267
24
0.0008858667904100823
24
0.0012742749857031334
23
0.0018329807108324356
21
0.0026366508987303583
21
0.00379269019073225
21
0.005455594781168515
21
0.007847599703514606
18
0.011288378916846883
16
0.01623776739188721
13
0.023357214690901212
10
0.03359818286283781
6
0.04832930238571752
4
0.06951927961775606
2
0.1
1


In [39]:
from tqdm import tqdm
from itertools import product
from sklearn.model_selection import TimeSeriesSplit

cv = TimeSeriesSplit(n_splits=5)
l1_grid = np.logspace(-4, -1, 20)
l2_grid = np.logspace(-3, 3, 20)
best_l1 = 0
best_l2 = 0
best_score = np.inf

for (l1, l2) in (pbar := tqdm(product(l1_grid, l2_grid), total=len(l1_grid) * len(l2_grid))):
    scores = []
    for train, test in cv.split(portfolios_xs_r.loc[:split_date]):
        kns_model = KNSFactorModel(l1_penalty=l1, l2_penalty=l2)
        kns_model.fit(test_assets_xs_r=portfolios_xs_r.loc[:split_date].iloc[train])
        scores.append(kns_model.rmse_score(test_assets_xs_r=portfolios_xs_r.loc[:split_date].iloc[test], factors=factors))

    score = np.mean(scores)
    pbar.set_description(f"L1: {l1}, L2: {l2}, Score: {score}")
    if score < best_score:
        best_l1 = l1
        best_l2 = l2
        best_score = score

L1: 0.1, L2: 1000.0, Score: 0.009413168176733317: 100%|██████████| 400/400 [01:30<00:00,  4.41it/s]                                


In [40]:
best_l1, best_l2, best_score

(np.float64(0.1), np.float64(0.001), np.float64(0.009413168176733317))

In [41]:
kns_model = KNSFactorModel(l1_penalty=best_l1, l2_penalty=best_l2)
kns_model.fit(test_assets_xs_r=portfolios_xs_r.loc[:split_date])
kns_model.r2_score(portfolios_xs_r.loc[split_date:], factors=factors)

np.float64(-1.403083549662059)

In [42]:
len(kns_model._selection)

2