In [62]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import scipy.stats as stats

from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler
from sklearn.dummy import DummyRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.utils.fixes import loguniform

from ts_utils import OOSR2, modOOSR2

Train/Test split

In [63]:
# Chronological hold-out split: the first N_TRAIN rows are used for fitting
# and tuning, the remaining rows are kept aside for the final evaluation.
# NOTE(review): this assumes the CSV rows are already in date order — confirm.
TARGET_COL = "EXCESS_RETURN_T+1"  # column the models predict
N_TRAIN = 350                     # number of leading rows used for training

df = pd.read_csv("../.data/timeseries.csv", index_col="Date")
df.index = pd.to_datetime(df.index)
df = df.round(4)

train, test = df.iloc[:N_TRAIN], df.iloc[N_TRAIN:]

# Features are every column except the target; both are handed to
# scikit-learn as plain NumPy arrays.
X_train, y_train = train.drop(columns=[TARGET_COL]).values, train[TARGET_COL].values
X_test, y_test = test.drop(columns=[TARGET_COL]).values, test[TARGET_COL].values

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((350, 49), (350,), (165, 49), (165,))

Mean model

In [64]:
# Baseline: always predict the mean of the training target.
# `fit` returns the estimator itself, so construction and fitting are chained.
naive_mean = DummyRegressor(strategy="mean").fit(X_train, y_train)

# Score the constant forecast on the held-out rows.
modOOSR2(y_test, naive_mean.predict(X_test))

0.023305483729676668

Partial least squares (PLS) regression

In [65]:
# Tune a partial least squares (PLS) regression with time-ordered
# cross-validation. The "scaler" step is itself searched over:
# "passthrough" means no scaling, the other candidates are fitted inside
# each CV training fold as part of the pipeline.
pipeline = Pipeline([
    ("scaler", "passthrough"),
    ("regressor", PLSRegression()),
])

param_grid = {
    "scaler": ["passthrough", MinMaxScaler(), StandardScaler(), MaxAbsScaler()],
    # Try every component count from 1 up to the number of features. The
    # bound is read from the data rather than hard-coded so the grid stays
    # valid if columns are added or removed.
    "regressor__n_components": list(range(1, X_train.shape[1] + 1)),
    # NOTE(review): with a single target column the NIPALS power iteration
    # appears to stop after its first pass, so these values probably all
    # tie and only multiply the number of fits by six — confirm against the
    # scikit-learn implementation before dropping this entry.
    "regressor__max_iter": [1000, 900, 800, 700, 600, 500],
}

# Expanding-window splits: each validation fold lies strictly after the
# rows it is trained on.
tscv = TimeSeriesSplit(n_splits=3)

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring={"modOOSR2": make_scorer(modOOSR2)},
    refit="modOOSR2",  # refit the best candidate on all of X_train
    cv=tscv,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train)

In [66]:
# Tabulate the per-candidate cross-validation results (one row per
# parameter combination) for later inspection; nothing is displayed here.
results = pd.DataFrame(data=grid_search.cv_results_)

In [67]:
# Parameter combination that ranked first on the refit metric ("modOOSR2").
grid_search.best_params_

{'regressor__max_iter': 1000,
 'regressor__n_components': 1,
 'scaler': 'passthrough'}

In [68]:
# Score the refit best pipeline on the held-out test rows with modOOSR2
# (imported from ts_utils; presumably a modified out-of-sample R^2 —
# confirm the definition there).
modOOSR2(y_test, grid_search.predict(X_test))

-0.08473732884324403

In [69]:
# OOSR2 receives a third argument here: the mean model's test predictions,
# presumably used as the benchmark forecast — verify against ts_utils.
OOSR2(y_test, grid_search.predict(X_test), naive_mean.predict(X_test))

-0.11062088582774154

In [70]:
# Pull the refit winning pipeline and show the fitted coefficients and
# intercept of its "regressor" step (steps are reachable as attributes of
# `named_steps`).
best = grid_search.best_estimator_
best.named_steps.regressor.coef_, best.named_steps.regressor.intercept_



(array([[ 7.55699163e-04],
        [-3.18936437e-04],
        [-6.47966725e-04],
        [ 2.79010285e-04],
        [-6.37690184e-04],
        [-1.50464604e-03],
        [-5.97714566e-04],
        [ 2.37153193e-04],
        [-5.19285442e-04],
        [-1.07481334e-03],
        [-8.71929269e-05],
        [ 1.25068717e-04],
        [ 1.15665889e-05],
        [-9.10022082e-04],
        [-1.84752447e-04],
        [-1.07683055e-03],
        [-8.55319855e-04],
        [-5.92931510e-05],
        [-7.79933600e-04],
        [ 1.14956766e-04],
        [-1.00702862e-03],
        [ 7.01783883e-04],
        [ 5.64109672e-04],
        [-4.94196411e-04],
        [-5.07097830e-04],
        [ 7.41597577e-04],
        [-3.06326073e-04],
        [-9.43115335e-04],
        [-5.51457508e-04],
        [-7.95118022e-04],
        [-1.11474392e-03],
        [ 7.47254652e-04],
        [-5.60180756e-04],
        [ 6.39826347e-04],
        [ 5.32734636e-04],
        [ 5.34463267e-04],
        [-2.05644006e-04],
 