In [48]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler
from sklearn.dummy import DummyRegressor
from sklearn.utils.fixes import loguniform

import scipy.stats as stats

from ts_utils import OOSR2, modOOSR2

In [49]:
df = pd.read_csv("../_data/timeseries.csv", index_col="Date")
df.index = pd.to_datetime(df.index)
df = df.round(4)

train = df.iloc[:350]
test = df.iloc[350:]

X_train, y_train = train.drop(["EXCESS_RETURN_T+1"], axis=1).values, train["EXCESS_RETURN_T+1"].values
X_test, y_test = test.drop(["EXCESS_RETURN_T+1"], axis=1).values, test["EXCESS_RETURN_T+1"].values

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((350, 49), (350,), (165, 49), (165,))

In [50]:
naive_mean = DummyRegressor(strategy="mean")
naive_mean.fit(X_train, y_train)
modOOSR2(y_test, naive_mean.predict(X_test))

0.023305483729676668

In [81]:
estimator = AdaBoostRegressor(base_estimator=DecisionTreeRegressor())

pipeline = Pipeline([
    ('scaler', "passthrough"),
    ('regressor', estimator)
])

param_grid = {
    "regressor__n_estimators": [i for i in np.arange(2,10)],
    "regressor__learning_rate": [i for i in np.linspace(0.1,1,1000)],
    'regressor__base_estimator__max_depth': [i for i in np.arange(1,5)], 
    #'regressor__base_estimator__n_estimators': [i for i in np.arange(2,100)],
    "regressor__base_estimator__max_features": [i for i in np.arange(1,30)],
    "regressor__base_estimator__min_samples_split": [i for i in np.arange(2,20)],
    "regressor__base_estimator__min_samples_leaf": [i for i in np.arange(1,20)]
}

tscv = TimeSeriesSplit(n_splits=3)

grid_search = RandomizedSearchCV(
    estimator=pipeline, 
    param_distributions= param_grid,
    scoring={"modOOSR2": make_scorer(modOOSR2)},
    refit="modOOSR2", 
    cv=tscv,
    n_iter=10000, 
    n_jobs=-1,
    verbose=2
    )

grid_search.fit(X_train, y_train)

#pipeline.get_params()

Fitting 3 folds for each of 10000 candidates, totalling 30000 fits


RandomizedSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                   estimator=Pipeline(steps=[('scaler', 'passthrough'),
                                             ('regressor',
                                              AdaBoostRegressor(base_estimator=DecisionTreeRegressor()))]),
                   n_iter=10000, n_jobs=-1,
                   param_distributions={'regressor__base_estimator__max_depth': [1,
                                                                                 2,
                                                                                 3,
                                                                                 4],
                                        'regressor__base_estimator__max_features'...
                                                                     0.11621621621621622,
                                                                     0.11711711711711711,
                    

In [82]:
results = pd.DataFrame(grid_search.cv_results_)
results[results["rank_test_modOOSR2"] <= 5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_regressor__n_estimators,param_regressor__learning_rate,param_regressor__base_estimator__min_samples_split,param_regressor__base_estimator__min_samples_leaf,param_regressor__base_estimator__max_features,param_regressor__base_estimator__max_depth,params,split0_test_modOOSR2,split1_test_modOOSR2,split2_test_modOOSR2,mean_test_modOOSR2,std_test_modOOSR2,rank_test_modOOSR2
2736,0.006012,0.000263,0.000559,2.557076e-05,6,0.767568,15,15,11,1,"{'regressor__n_estimators': 6, 'regressor__lea...",0.121424,0.000705,0.008537,0.043555,0.055154,3
4819,0.009241,0.000772,0.000637,3.371748e-07,9,0.212613,10,9,19,1,"{'regressor__n_estimators': 9, 'regressor__lea...",0.080796,0.046039,-0.005129,0.040569,0.035291,5
6562,0.007126,0.002411,0.000677,0.0002000062,4,0.585586,10,19,15,1,"{'regressor__n_estimators': 4, 'regressor__lea...",0.080486,0.036149,0.025795,0.047477,0.023721,1
6651,0.008286,0.000771,0.000606,2.186038e-05,8,0.159459,19,14,20,1,"{'regressor__n_estimators': 8, 'regressor__lea...",0.107575,0.000666,0.017046,0.041763,0.047015,4
7372,0.008675,0.000531,0.000666,2.409168e-05,9,0.372973,15,5,10,1,"{'regressor__n_estimators': 9, 'regressor__lea...",0.038735,0.072782,0.022162,0.04456,0.021072,2


In [83]:
grid_search.best_params_

{'regressor__n_estimators': 4,
 'regressor__learning_rate': 0.5855855855855856,
 'regressor__base_estimator__min_samples_split': 10,
 'regressor__base_estimator__min_samples_leaf': 19,
 'regressor__base_estimator__max_features': 15,
 'regressor__base_estimator__max_depth': 1}

In [84]:
modOOSR2(y_test, grid_search.predict(X_test))

0.0328114129910857

In [85]:
best = grid_search.best_estimator_
best.named_steps["regressor"].feature_importances_

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.31243582,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.48057485, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.20698933, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        ])