In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler
from sklearn.dummy import DummyRegressor
from sklearn.utils.fixes import loguniform

import scipy.stats as stats

from ts_utils import OOSR2, modOOSR2

In [3]:
df = pd.read_csv("../_data/timeseries.csv", index_col="Date")
df.index = pd.to_datetime(df.index)
df = df.round(4)

train = df.iloc[:350]
test = df.iloc[350:]

X_train, y_train = train.drop(["EXCESS_RETURN_T+1"], axis=1).values, train["EXCESS_RETURN_T+1"].values
X_test, y_test = test.drop(["EXCESS_RETURN_T+1"], axis=1).values, test["EXCESS_RETURN_T+1"].values

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((350, 49), (350,), (165, 49), (165,))

In [4]:
naive_mean = DummyRegressor(strategy="mean")
naive_mean.fit(X_train, y_train)
modOOSR2(y_test, naive_mean.predict(X_test))

0.023305483729676668

In [29]:
pipeline = Pipeline([
    ('scaler', "passthrough"),
    ('regressor', GradientBoostingRegressor())
])

param_grid = {
    'scaler': ["passthrough"],
    "regressor__n_estimators": [i for i in np.arange(2,50)],
    "regressor__learning_rate": loguniform(0.05,0.25),
    "regressor__subsample": [i for i in np.linspace(0.3,1,1000)],
    "regressor__max_depth": [i for i in np.arange(1,4)],
    "regressor__max_features": [i for i in np.arange(5,20)]
    
}

tscv = TimeSeriesSplit(n_splits=3)

grid_search = RandomizedSearchCV(
    estimator=pipeline, 
    param_distributions= param_grid,
    scoring={"modOOSR2": make_scorer(modOOSR2)},
    refit="modOOSR2", 
    cv=tscv,
    n_iter=5000, 
    n_jobs=-1,
    verbose=2
    )

grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 5000 candidates, totalling 15000 fits


RandomizedSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                   estimator=Pipeline(steps=[('scaler', 'passthrough'),
                                             ('regressor',
                                              GradientBoostingRegressor())]),
                   n_iter=5000, n_jobs=-1,
                   param_distributions={'regressor__learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7feecc5a9c70>,
                                        'regressor__max_depth': [1,...
                                                                 0.3112112112112112,
                                                                 0.3119119119119119,
                                                                 0.3126126126126126,
                                                                 0.3133133133133133,
                                                                 0.314014014014014,
                  

In [30]:
results = pd.DataFrame(grid_search.cv_results_)
results[results["rank_test_modOOSR2"]<=5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_regressor__learning_rate,param_regressor__max_depth,param_regressor__max_features,param_regressor__n_estimators,param_regressor__subsample,param_scaler,params,split0_test_modOOSR2,split1_test_modOOSR2,split2_test_modOOSR2,mean_test_modOOSR2,std_test_modOOSR2,rank_test_modOOSR2
302,0.001998,0.000263,0.000168,2e-06,0.195977,1,18,4,0.901902,passthrough,{'regressor__learning_rate': 0.195976760691564...,0.028454,0.037348,-0.017754,0.016016,0.024153,2
379,0.001878,0.000162,0.000164,2e-06,0.156551,1,9,5,0.721121,passthrough,{'regressor__learning_rate': 0.156550857817251...,0.022766,0.029516,-0.011836,0.013482,0.018113,4
481,0.001677,0.000334,0.000169,1e-06,0.129976,3,16,2,0.849349,passthrough,{'regressor__learning_rate': 0.129975694885041...,-0.005322,0.041302,0.004043,0.013341,0.020138,5
728,0.002442,0.000155,0.000167,3e-06,0.084429,1,9,8,0.345546,passthrough,{'regressor__learning_rate': 0.084428992531871...,0.039658,0.015909,0.038858,0.031475,0.011011,1
3930,0.005006,0.000723,0.00017,6e-06,0.054526,1,11,17,0.566266,passthrough,{'regressor__learning_rate': 0.054525940045638...,0.008123,0.039813,-0.00385,0.014696,0.018421,3


In [31]:
grid_search.best_params_

{'regressor__learning_rate': 0.08442899253187137,
 'regressor__max_depth': 1,
 'regressor__max_features': 9,
 'regressor__n_estimators': 8,
 'regressor__subsample': 0.3455455455455455,
 'scaler': 'passthrough'}

In [32]:
modOOSR2(y_test, grid_search.predict(X_test))

0.0112539723476337

In [33]:
best = grid_search.best_estimator_
best.named_steps["regressor"].feature_importances_

array([0.        , 0.10665767, 0.        , 0.        , 0.        ,
       0.16540748, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.29649258, 0.        , 0.        ,
       0.        , 0.13238816, 0.        , 0.        , 0.13470677,
       0.        , 0.        , 0.0551972 , 0.        , 0.        ,
       0.10915014, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        ])