Initial imports

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline

from ts_utils import OOSR2, modOOSR2

Train/Test split

In [2]:
# Column holding the prediction target; every other column is used as a feature.
TARGET_COL = "EXCESS_RETURN_T+1"
# Number of leading rows used for fitting; all remaining rows form the test set.
N_TRAIN = 350

df = pd.read_csv("../.data/timeseries.csv", index_col="Date")
df.index = pd.to_datetime(df.index)
df = df.round(4)

# Positional split with no shuffling.
# NOTE(review): assumes the CSV rows are already in chronological order — confirm.
train = df.iloc[:N_TRAIN]
test = df.iloc[N_TRAIN:]

X_train, y_train = train.drop(columns=[TARGET_COL]).values, train[TARGET_COL].values
X_test, y_test = test.drop(columns=[TARGET_COL]).values, test[TARGET_COL].values

# Sanity check: display the shapes of the four arrays.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((350, 49), (350,), (165, 49), (165,))

Mean model

In [3]:
# Baseline: DummyRegressor with strategy="mean" ignores the features and always
# predicts the mean of the training targets.
naive_mean = DummyRegressor(strategy="mean").fit(X_train, y_train)

# Score the constant forecast on the held-out rows.
# modOOSR2 comes from ts_utils; its exact definition is not shown in this notebook.
baseline_pred = naive_mean.predict(X_test)
modOOSR2(y_test, baseline_pred)

0.023305483729676668

Simple linear regression

In [4]:
# Linear regression on the unscaled features. The "scaler" step is a
# "passthrough" placeholder, so the pipeline currently applies no preprocessing.
pipeline = Pipeline([("scaler", "passthrough"), ("regressor", LinearRegression())])

# Single-candidate grid: GridSearchCV is used here only to obtain
# cross-validated scores for the one configuration.
param_grid = {
    "scaler": [
        "passthrough"
    ]
}

# Time-ordered folds: each validation fold comes after its training rows.
tscv = TimeSeriesSplit(n_splits=3)

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    # make_scorer defaults to greater_is_better=True, so higher modOOSR2 ranks better.
    scoring={"modOOSR2": make_scorer(modOOSR2)},
    # With a dict of scorers, refit must name the metric used to pick the best model.
    refit="modOOSR2",
    cv=tscv,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train)

In [5]:
# Tabulate the cross-validation results: one row per parameter candidate,
# with per-split scores as columns.
results = pd.DataFrame.from_dict(grid_search.cv_results_)
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_scaler,params,split0_test_modOOSR2,split1_test_modOOSR2,split2_test_modOOSR2,mean_test_modOOSR2,std_test_modOOSR2,rank_test_modOOSR2
0,0.003744,0.001251,0.0,0.0,passthrough,{'scaler': 'passthrough'},-3.849039,-2.740087,-0.213285,-2.26747,1.521447,1


In [6]:
# Held-out score of the refit pipeline; grid_search.predict delegates to the
# best estimator found during the search.
linear_pred = grid_search.predict(X_test)
modOOSR2(y_test, linear_pred)

-16.15941116030504

In [7]:
# Compare the linear forecasts against the mean-model forecasts on the test rows.
# NOTE(review): the third argument is presumably the benchmark forecast —
# confirm against ts_utils.OOSR2.
linear_pred = grid_search.predict(X_test)
benchmark_pred = naive_mean.predict(X_test)
OOSR2(y_test, linear_pred, benchmark_pred)

-16.56886198750374

In [8]:
# Inspect the fitted linear model: pull the regressor step out of the refit
# pipeline and display its coefficients and intercept.
best = grid_search.best_estimator_
ols_step = best.named_steps["regressor"]
ols_step.coef_, ols_step.intercept_

(array([ 1.65150573e-01,  1.92093619e-04,  1.76096068e+01, -1.76126680e+01,
        -2.10731977e-03,  2.01802512e-04,  4.43590642e-04,  6.27535047e-04,
         1.66321580e-04, -4.04122364e-04, -1.76943303e-04,  9.11791348e-13,
         9.86737389e-05, -1.21581256e-04,  6.53106867e-03, -4.89328875e-04,
         1.67803678e-04,  1.12390574e-02,  8.60379161e-03, -1.02930013e-04,
        -9.15543945e-05,  3.28690575e-04, -4.35383837e-04, -4.69567836e-03,
        -3.38424269e-05, -1.21807731e+00,  8.10716945e-03,  6.95135428e+00,
        -7.38318211e+00, -7.78745795e+00, -7.19038837e-01, -3.66880838e-01,
        -1.76125964e+01,  7.37215976e-01, -8.66584853e-03,  1.33225144e-02,
         5.73888440e-04, -1.09061869e-02, -1.84840516e-02, -8.25758845e-03,
        -9.23404103e-03, -6.73520201e-03,  3.31274953e-03,  4.86921394e-02,
        -7.75914978e-04, -1.75522255e-03,  1.85670727e+00,  4.47109523e-03,
        -7.26384640e+00]),
 0.1554578453128041)