In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler
from sklearn.dummy import DummyRegressor
from sklearn.utils.fixes import loguniform

import scipy.stats as stats

from ts_utils import OOSR2, modOOSR2

Train/Test split

In [6]:
# Load the series, using the "Date" column as a datetime index, and round
# every value to 4 decimals.
df = pd.read_csv("../.data/timeseries.csv", index_col="Date")
df.index = pd.to_datetime(df.index)
df = df.round(4)

# Positional split: the first `n_train` rows are the training sample, the
# remainder is the hold-out sample.
# NOTE(review): assumes the CSV rows are already in chronological order — confirm.
target_col = "EXCESS_RETURN_T+1"
n_train = 350
train, test = df.iloc[:n_train], df.iloc[n_train:]

# Features are every column except the target; both are converted to NumPy arrays.
X_train = train.drop(columns=[target_col]).values
y_train = train[target_col].values
X_test = test.drop(columns=[target_col]).values
y_test = test[target_col].values

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((350, 49), (350,), (165, 49), (165,))

Mean model

In [7]:
# Baseline: a regressor that always predicts the mean of the training target.
naive_mean = DummyRegressor(strategy="mean").fit(X_train, y_train)

# Score the baseline on the hold-out sample with the project's modOOSR2 metric
# (defined in ts_utils; see that module for the exact formula).
naive_mean_pred = naive_mean.predict(X_test)
modOOSR2(y_test, naive_mean_pred)

0.023305483729676668

Principal component regression

In [8]:
# NOTE(review): these imports would normally live in the notebook's top import
# cell; they are kept here so this cell stays runnable on its own.
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA

# Principal component regression: optional feature scaling -> PCA -> OLS.
# The scaler step starts as "passthrough" and is swapped by the grid search.
pipeline = Pipeline([
    ('scaler', "passthrough"),
    ("pca", PCA()),
    ('regressor', LinearRegression())
])

# Time-ordered CV: every validation fold comes after its training fold.
tscv = TimeSeriesSplit(n_splits=3)

# PCA cannot extract more components than min(n_samples, n_features) of the
# data it is fitted on, so the candidate range is derived from the feature
# count and the smallest CV training fold instead of a hard-coded 50.
# For the current data (350 rows, 49 features) this is still 1..49.
smallest_train_fold = min(len(train_idx) for train_idx, _ in tscv.split(X_train))
max_components = min(X_train.shape[1], smallest_train_fold)

param_grid = {
    'scaler': ["passthrough", MinMaxScaler(), StandardScaler(), MaxAbsScaler()],
    "pca__n_components": list(range(1, max_components + 1)),
}

# Exhaustive search over scaler x n_components, scored with modOOSR2
# (make_scorer defaults to greater-is-better); the best candidate is refit
# on the full training sample.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring={"modOOSR2": make_scorer(modOOSR2)},
    refit="modOOSR2",
    cv=tscv,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train)

In [9]:
# Collect the per-candidate cross-validation results of the grid search in a
# DataFrame so they can be inspected or sorted.
results = pd.DataFrame(grid_search.cv_results_)
# Display is intentionally disabled; uncomment the next line to view the table.
#results

In [10]:
# Hyper-parameter combination selected by the grid search (ranked on modOOSR2).
grid_search.best_params_

{'pca__n_components': 2, 'scaler': MaxAbsScaler()}

In [11]:
# Hold-out modOOSR2 of the refit best pipeline.
pcr_test_pred = grid_search.predict(X_test)
modOOSR2(y_test, pcr_test_pred)

0.017591246518283477

In [12]:
# Compare the PCR predictions against the mean-model predictions on the hold-out
# sample via OOSR2 (defined in ts_utils).
# NOTE(review): the third argument is presumably the benchmark forecast — confirm.
pcr_pred = grid_search.predict(X_test)
benchmark_pred = naive_mean.predict(X_test)
OOSR2(y_test, pcr_pred, benchmark_pred)

-0.005850587994713052

In [13]:
# Pipeline refit with the winning hyper-parameters.
best = grid_search.best_estimator_

# Coefficients (one per retained principal component) and intercept of the
# final linear regression step.
fitted_regressor = best.named_steps["regressor"]
fitted_regressor.coef_, fitted_regressor.intercept_

(array([ 0.0016912 , -0.00914507]), 0.0027119999999999996)

In [14]:
# Loadings of the retained principal components: one row per component,
# one column per input feature.
best.named_steps["pca"].components_

array([[-0.09172542, -0.00168717, -0.17154993, -0.11470841, -0.11677874,
        -0.15045335, -0.22750329, -0.24284482, -0.08401874, -0.22736964,
        -0.07608659, -0.03357989, -0.11430825,  0.0180259 , -0.06804722,
        -0.2263055 , -0.2513084 , -0.37091129,  0.05614972, -0.4623142 ,
        -0.23356032, -0.07233026, -0.06630113,  0.00753462, -0.26977572,
        -0.09603608, -0.26659179,  0.02480186,  0.01981104,  0.02502201,
         0.00451026, -0.02581203,  0.04669859, -0.15748713, -0.00962626,
        -0.00820555,  0.00766154,  0.0133362 ,  0.01477447,  0.00776268,
         0.01205382,  0.00612613,  0.01317882, -0.03170976, -0.02342022,
        -0.01586725,  0.00255876,  0.00068844, -0.00297229],
       [-0.18488629,  0.02914833,  0.08958436, -0.23878473, -0.00714364,
         0.40114035,  0.13329372, -0.23902848,  0.06331209,  0.19595052,
        -0.00340528,  0.03443347, -0.00347643,  0.01962222,  0.03689798,
         0.12154963,  0.09860075,  0.11491684,  0.08188482, -0.

In [15]:
# Share of the (scaled) feature variance captured by each retained component.
best.named_steps["pca"].explained_variance_ratio_

array([0.40904805, 0.14920612])