In [None]:
%pip install scikit-learn
%pip install numpy
%pip install pandas

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [None]:
import time
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import logging
logging.basicConfig(level=logging.INFO)

### List of functions
----------------------------
1. rank_cross_section_features
2. sharpe_ratio
3. random_fourier_features
4. ranked_fourier_features

In [116]:
def rank_cross_section_features(X: pd.DataFrame, date_ids: np.ndarray):
    """Simple Cross-Section Ranking [-0.5, 0.5]"""
    assert isinstance(X, pd.DataFrame)
    columns = X.columns
    X["date"] = date_ids
    def _rank(df: pd.DataFrame) -> pd.DataFrame:
        ranked_df = df[columns].rank(pct=True) - 0.5
        return ranked_df
    X = X.groupby("date").apply(_rank, include_groups=False)
    return X

In [125]:
def sharpe_ratio(returns: np.ndarray):
    return returns.mean() / returns.std() * np.sqrt(12)

In [93]:
def random_fourier_features(
    x_train: pd.DataFrame, 
    x_test: pd.DataFrame, 
    number_features: int, 
    seed: int
):
    """Clean Random Fourier Feature, see Rahimi 2007."""
    np.random.seed(seed=int((seed+1)*1e3))
    weights = np.random.normal(loc=0, scale=1, size=[x_train.shape[1], int(number_features/2)])
  
    # np.__version__ == 1.26.4
    # XXX np.arange(0.5, 1.1, step=0.1) bugged :(
    gammas = np.random.choice(
        [0.5, 0.6, 0.7, 0.8, 0.9, 1], 
        size=[x_train.shape[1], int(number_features/2)], 
        replace=True
    )
    weights = weights * gammas
    x_train_rff = pd.concat(
        [getattr(np, activation)(x_train @ weights) for activation in ["cos", "sin"]],
        axis=1
    )
    
    x_test_rff = pd.concat(
        [getattr(np, activation)(x_test @ weights) for activation in ["cos", "sin"]],
        axis=1
    )
    return x_train_rff, x_test_rff

In [143]:
def random_monotonic_features(
    x_train: pd.DataFrame, 
    x_test: pd.DataFrame, 
    number_features: int, 
    seed: int
):
    """Random Monotonic Features"""
    np.random.seed(seed=int((seed+1)*1e3))
    weights = np.exp(
        np.random.normal(loc=0, scale=1, size=[x_train.shape[1],  number_features])
    )
  
    x_train_rff = x_train @ weights
    x_test_rff = x_test @ weights

    # ReLU
    x_train_rff = x_train_rff * (x_train_rff > 0)
    x_test_rff = x_test_rff * (x_test_rff > 0)

    return x_train_rff, x_test_rff

In [180]:
def ranked_random_monotonic_features(    
    x_train: pd.DataFrame, 
    x_test: pd.DataFrame, 
    number_features: int, 
    seed: int,
    train_dates: np.ndarray
):
    if isinstance(train_dates, pd.Series):
        train_dates = train_dates.values
    x_train_rff, x_test_rff = random_monotonic_features(
        x_train=x_train,
        x_test=x_test,
        number_features=number_features,
        seed=seed
    )
    x_train_rff.columns = np.arange(0, number_features, step=1)
    
    x_train_rff = rank_cross_section_features(
        X=x_train_rff,
        date_ids=train_dates
    )

    x_test_rff = x_test_rff.rank(pct=True) - 0.5
    return x_train_rff, x_test_rff

In [156]:
def factor_sign(train: pd.DataFrame, test: pd.DataFrame, train_dates: np.ndarray, y_train: pd.Series):
    train = train.multiply(y_train, axis="index")
    train.index = train_dates
    factor_signs = np.sign(train.reset_index().groupby("date").sum().mean())
    return train * factor_signs, test * factor_signs

In [107]:
def ranked_random_fourier_features(
    x_train: pd.DataFrame, 
    x_test: pd.DataFrame, 
    number_features: int, 
    seed: int,
    train_dates: np.ndarray
):
    """Applies Ranking After RFF"""

    x_train_rff, x_test_rff = random_fourier_features(
        x_train=x_train,
        x_test=x_test,
        number_features=number_features,
        seed=seed,
    )
    x_train_rff.columns = np.arange(0, number_features, step=1)
    x_train_rff = rank_cross_section_features(
        x_train_rff,
        date_ids=train_dates
    )
    # XXX Monthly re-training
    x_test_rff = x_test_rff.rank(pct=True) - 0.5
    return x_train_rff, x_test_rff

In [38]:
def linear_rolling_window_training(df: pd.DataFrame, rolling_window: int, long_only: bool) -> pd.DataFrame:
    dates = df.date.unique().tolist()
    dates.sort()
    
    start_window = 0
    end_window = rolling_window
    portfolio_returns = []
    while end_window < len(dates):
        start_time = time.monotonic()
 
        train_dates = dates[start_window:end_window]
        test_date = dates[end_window]

        train = df[df.date.isin(train_dates)].copy()
        test: pd.DataFrame = df[df.date == test_date].copy()

        train.set_index("date", inplace=True)
        test.set_index("date", inplace=True )
        
        y_train = train.pop("r_1")
        y_test = test.pop("r_1")

        train = train.multiply(y_train, axis="index")
        F_train = train.reset_index().groupby("date").sum()
        assert F_train.shape[0] == rolling_window
        model = LinearRegression(fit_intercept=False, positive=long_only)
        model.fit(F_train.values, np.ones(F_train.shape[0])) 

        prediction = test @ model.coef_
        returns = prediction.values.reshape(1,-1) @ y_test
        portfolio_returns.append({
            "date": test_date,
            "returns": returns[0]
        })
        start_window += 1
        end_window += 1
        end_time = time.monotonic()
        logging.info(f"[{end_window}/{len(dates)}]\tTraining Time:{end_time-start_time:.2f}s\tPF Return:{returns[0]:3f}")
    return pd.DataFrame(portfolio_returns)

In [171]:
def random_features_rolling_window_training(
        df: pd.DataFrame, 
        rolling_window: int,
        number_features: int,
        seed: int,
        long_only: int,
        projection_type: str
) -> pd.DataFrame:
    dates = df.date.unique().tolist()
    dates.sort()

    start_window = 0
    end_window = rolling_window
    portfolio_returns = []
    while end_window < len(dates):
        
        start_time = time.monotonic()
        train_dates = dates[start_window:end_window]
        test_date = dates[end_window]

        train = df[df.date.isin(train_dates)].copy()
        test: pd.DataFrame = df[df.date == test_date].copy()

        y_train = train.pop("r_1")
        y_test = test.pop("r_1")

        train_dates = train.pop("date")
        test_dates = test.pop("date")
        
        if projection_type == "RFF":
            x_train_rff, x_test_rff = ranked_random_fourier_features(
                x_train=train,
                x_test=test,
                number_features=number_features, 
                seed=seed,
                train_dates=train_dates
            )
        elif projection_type == "ReLU":
            train, test = factor_sign(train, test, train_dates, y_train)
            x_train_rff, x_test_rff = random_monotonic_features(
                x_train=train,
                x_test=test,
                number_features=number_features, 
                seed=seed,
            )

        elif projection_type == "RankedReLU":
            train, test = factor_sign(train, test, train_dates, y_train)
            train = train.reset_index(drop=True)
            x_train_rff, x_test_rff = ranked_random_monotonic_features(
                x_train=train,
                x_test=test,
                number_features=number_features, 
                seed=seed,
                train_dates=train_dates
            )
        
        x_train_rff = pd.DataFrame(x_train_rff.values * y_train.values.reshape(-1,1))
        x_train_rff.index = train_dates
        F_train = x_train_rff.reset_index().groupby("date").sum()
        assert F_train.shape[0] == rolling_window
        model = LinearRegression(fit_intercept=False, positive=long_only)
        model.fit(X=F_train, y=np.ones(F_train.shape[0]))
        prediction = x_test_rff @ model.coef_
        returns = prediction.values.reshape(1,-1) @ y_test
        portfolio_returns.append({
            "date": test_date,
            "returns": returns[0]
        })
        start_window += 1
        end_window += 1
        end_time = time.monotonic()
        logging.info(f"[{end_window}/{len(dates)}]\tTraining Time:{end_time-start_time:.2f}s\tPF Return:{returns[0]:.3f}")
    return pd.DataFrame(portfolio_returns)

## Empirics
Get the data from [Jensen, T. I., Kelly, B., & Pedersen, L. H. (2023).](https://jkpfactors.com/?country=usa&factor=all_factors&weighting=ew)<br>
For simplicity, we will restrict the analysis to large and mega cap stocks only.

In [9]:
jkp = pd.read_pickle("jkp_ranked_large_mega.pickle")
jkp.pop("size_grp")
jkp.pop("id")

0          10006
2          10102
3          10145
4          10153
5          10161
           ...  
2486994    93356
2486996    93369
2487000    93374
2487004    93427
2487006    93436
Name: id, Length: 671743, dtype: int64

In [10]:
jkp.describe()


Unnamed: 0,date,cowc_gr1a,oaccruals_at,oaccruals_ni,taccruals_at,taccruals_ni,debt_gr3,fnl_gr1a,ncol_gr1a,nfna_gr1a,...,div12m_me,ebitda_mev,eq_dur,eqnpo_12m,eqnpo_me,eqpo_me,ni_me,ocf_me,sale_me,r_1
count,671743,671743.0,671743.0,671743.0,671743.0,671743.0,671743.0,671743.0,671743.0,671743.0,...,671743.0,671743.0,671743.0,671743.0,671743.0,671743.0,671743.0,671743.0,671743.0,671743.0
mean,1996-09-13 06:06:20.749185408,-0.019091,0.002287,-0.019457,-0.002163,-0.020176,0.014176,0.020228,0.053547,-0.011736,...,-0.006512,0.023517,0.019616,0.072738,0.091148,0.093303,0.055811,0.050632,-0.056205,0.006538
min,1963-01-31 00:00:00,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,...,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.998298
25%,1983-06-30 00:00:00,-0.191082,-0.17793,-0.222597,-0.1926,-0.225728,-0.156537,-0.212499,-0.168622,-0.231179,...,-0.326073,-0.162813,-0.173141,-0.158307,-0.036066,-0.003119,-0.124022,-0.124103,-0.271667,-0.047461
50%,1998-02-28 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.039233,0.071793,-0.023029,...,0.04947,0.013073,0.032006,0.124471,0.092892,0.067191,0.059565,0.043767,-0.074891,0.005307
75%,2010-07-31 00:00:00,0.139831,0.178554,0.175735,0.187959,0.176757,0.195017,0.246354,0.290834,0.211918,...,0.265673,0.209884,0.222395,0.315263,0.298246,0.284684,0.24779,0.241699,0.143129,0.058897
max,2022-11-30 00:00:00,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.499656,2.997363
std,,0.230908,0.238614,0.254723,0.245843,0.256605,0.241784,0.269545,0.278295,0.264254,...,0.329512,0.236675,0.249492,0.282324,0.245779,0.229699,0.239658,0.234916,0.253979,0.106385


In [123]:
rolling_window = 60
number_random_features = 600 # Complexity 10
seed = 1234

In [122]:
msrr_ols = linear_rolling_window_training(df=jkp, rolling_window=rolling_window, long_only=False)
msrr_nn_ols = linear_rolling_window_training(df=jkp, rolling_window=rolling_window, long_only=True)

INFO:root:[61/719]	Training Time:0.04s	PF Return:0.886068
INFO:root:[62/719]	Training Time:0.03s	PF Return:1.527426
INFO:root:[63/719]	Training Time:0.02s	PF Return:2.366944
INFO:root:[64/719]	Training Time:0.02s	PF Return:-0.264275
INFO:root:[65/719]	Training Time:0.02s	PF Return:1.395934
INFO:root:[66/719]	Training Time:0.02s	PF Return:1.329966
INFO:root:[67/719]	Training Time:0.03s	PF Return:2.665559
INFO:root:[68/719]	Training Time:0.03s	PF Return:3.395289
INFO:root:[69/719]	Training Time:0.02s	PF Return:0.489050
INFO:root:[70/719]	Training Time:0.02s	PF Return:0.082437
INFO:root:[71/719]	Training Time:0.02s	PF Return:1.712941
INFO:root:[72/719]	Training Time:0.02s	PF Return:2.188697
INFO:root:[73/719]	Training Time:0.02s	PF Return:0.899630
INFO:root:[74/719]	Training Time:0.02s	PF Return:-2.474180
INFO:root:[75/719]	Training Time:0.02s	PF Return:2.594974
INFO:root:[76/719]	Training Time:0.02s	PF Return:0.629536
INFO:root:[77/719]	Training Time:0.02s	PF Return:0.301490
INFO:root:[7

In [124]:
msrr_rff = random_features_rolling_window_training(
        df=jkp, 
        rolling_window=rolling_window,
        number_features=number_random_features,
        seed=seed,
        long_only=False,
        projection_type="RFF"
)

INFO:root:[61/719]	Training Time:1.43s	PF Return:0.718
INFO:root:[62/719]	Training Time:1.50s	PF Return:-0.067
INFO:root:[63/719]	Training Time:1.57s	PF Return:-0.902
INFO:root:[64/719]	Training Time:1.55s	PF Return:0.803
INFO:root:[65/719]	Training Time:1.54s	PF Return:0.244
INFO:root:[66/719]	Training Time:1.59s	PF Return:-0.543
INFO:root:[67/719]	Training Time:1.55s	PF Return:0.704
INFO:root:[68/719]	Training Time:1.55s	PF Return:0.663
INFO:root:[69/719]	Training Time:1.57s	PF Return:0.728
INFO:root:[70/719]	Training Time:1.60s	PF Return:-0.181
INFO:root:[71/719]	Training Time:1.61s	PF Return:0.083
INFO:root:[72/719]	Training Time:1.63s	PF Return:-0.559
INFO:root:[73/719]	Training Time:1.59s	PF Return:-0.129
INFO:root:[74/719]	Training Time:1.67s	PF Return:0.420
INFO:root:[75/719]	Training Time:1.66s	PF Return:-0.304
INFO:root:[76/719]	Training Time:1.60s	PF Return:1.264
INFO:root:[77/719]	Training Time:1.61s	PF Return:0.243
INFO:root:[78/719]	Training Time:1.63s	PF Return:0.157
INF

In [129]:
sharpe_ratio(msrr_nn_ols.returns)

0.5976136768748428

In [136]:
sharpe_ratio(msrr_rff.returns)

1.217637305114043

In [138]:
sharpe_ratio(msrr_ols.returns)

0.9189345190307636

In [134]:
msrr_nn_ols['ols'] = msrr_ols.returns
rhs = msrr_nn_ols.copy()

rhs = rhs.rename(columns={"returns": "nn_ols"})
model = LinearRegression(fit_intercept=True)
model.fit(X=rhs[["ols", "nn_ols"]], y=msrr_rff.returns)

In [135]:
model.intercept_

0.1481544145855217

In [161]:
msrr_relu = random_features_rolling_window_training(
        df=jkp, 
        rolling_window=rolling_window,
        number_features=number_random_features,
        seed=seed,
        long_only=True,
        projection_type="ReLU"
)

INFO:root:[61/719]	Training Time:0.21s	PF Return:-8.414
INFO:root:[62/719]	Training Time:0.19s	PF Return:1.675
INFO:root:[63/719]	Training Time:0.20s	PF Return:17.636
INFO:root:[64/719]	Training Time:0.21s	PF Return:4.423
INFO:root:[65/719]	Training Time:0.21s	PF Return:-0.629
INFO:root:[66/719]	Training Time:0.20s	PF Return:-6.596
INFO:root:[67/719]	Training Time:0.19s	PF Return:2.400
INFO:root:[68/719]	Training Time:0.20s	PF Return:6.919
INFO:root:[69/719]	Training Time:0.20s	PF Return:0.790
INFO:root:[70/719]	Training Time:0.19s	PF Return:8.107
INFO:root:[71/719]	Training Time:0.20s	PF Return:-4.625
INFO:root:[72/719]	Training Time:0.20s	PF Return:-2.580
INFO:root:[73/719]	Training Time:0.20s	PF Return:-8.465
INFO:root:[74/719]	Training Time:0.19s	PF Return:3.788
INFO:root:[75/719]	Training Time:0.21s	PF Return:2.965
INFO:root:[76/719]	Training Time:0.20s	PF Return:0.870
INFO:root:[77/719]	Training Time:0.20s	PF Return:-10.846
INFO:root:[78/719]	Training Time:0.28s	PF Return:-9.224


In [162]:
sharpe_ratio(msrr_relu.returns)

0.44325347830861334

In [164]:
msrr_relu.returns.std()

5.647088262869035

In [181]:
msrr_ranked_relu = random_features_rolling_window_training(
        df=jkp, 
        rolling_window=rolling_window,
        number_features=number_random_features,
        seed=seed,
        long_only=True,
        projection_type="RankedReLU"
)

INFO:root:[61/719]	Training Time:1.11s	PF Return:-0.363
INFO:root:[62/719]	Training Time:1.07s	PF Return:0.580
INFO:root:[63/719]	Training Time:1.11s	PF Return:0.551
INFO:root:[64/719]	Training Time:1.10s	PF Return:0.185
INFO:root:[65/719]	Training Time:1.10s	PF Return:-0.590
INFO:root:[66/719]	Training Time:1.13s	PF Return:-0.341
INFO:root:[67/719]	Training Time:1.07s	PF Return:0.074
INFO:root:[68/719]	Training Time:1.09s	PF Return:0.050
INFO:root:[69/719]	Training Time:1.10s	PF Return:-0.076
INFO:root:[70/719]	Training Time:1.06s	PF Return:-0.239
INFO:root:[71/719]	Training Time:1.06s	PF Return:-0.021
INFO:root:[72/719]	Training Time:1.09s	PF Return:-0.206
INFO:root:[73/719]	Training Time:1.15s	PF Return:-0.087
INFO:root:[74/719]	Training Time:1.08s	PF Return:0.500
INFO:root:[75/719]	Training Time:1.09s	PF Return:0.347
INFO:root:[76/719]	Training Time:1.12s	PF Return:0.305
INFO:root:[77/719]	Training Time:1.12s	PF Return:0.256
INFO:root:[78/719]	Training Time:1.09s	PF Return:0.132
IN

In [187]:
msrr_ranked_relu.returns.mean() / msrr_ranked_relu.returns.std() * np.sqrt(12)

0.5418585299394815