In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler
from sklearn.dummy import DummyRegressor
from sklearn.utils.fixes import loguniform

import scipy.stats as stats

from ts_utils import OOSR2, modOOSR2

In [2]:
df = pd.read_csv("../_data/timeseries.csv", index_col="Date")
df.index = pd.to_datetime(df.index)
df = df.round(4)

train = df.iloc[:350]
test = df.iloc[350:]

X_train, y_train = train.drop(["EXCESS_RETURN_T+1"], axis=1).values, train["EXCESS_RETURN_T+1"].values
X_test, y_test = test.drop(["EXCESS_RETURN_T+1"], axis=1).values, test["EXCESS_RETURN_T+1"].values

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((350, 49), (350,), (165, 49), (165,))

In [3]:
naive_mean = DummyRegressor(strategy="mean")
naive_mean.fit(X_train, y_train)
modOOSR2(y_test, naive_mean.predict(X_test))

0.023305483729676668

In [4]:
pipeline = Pipeline([
    ('scaler', "passthrough"),
    ('regressor', RandomForestRegressor(n_jobs=-1))
])

param_grid = {
    'scaler': ["passthrough"],
    "regressor__n_estimators": [i for i in np.arange(10,100)],
    "regressor__max_depth": [i for i in np.arange(1,10)],
    "regressor__max_features": [i for i in np.arange(1,50)],
    "regressor__min_samples_split": [i for i in np.arange(5,10)],
    "regressor__min_samples_leaf": [i for i in np.arange(5,20)]
}

tscv = TimeSeriesSplit(n_splits=3)

grid_search = RandomizedSearchCV(
    estimator=pipeline, 
    param_distributions= param_grid,
    scoring={"modOOSR2": make_scorer(modOOSR2)},
    refit="modOOSR2", 
    cv=tscv,
    n_iter=100, 
    n_jobs=-1,
    verbose=2
    )

grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                   estimator=Pipeline(steps=[('scaler', 'passthrough'),
                                             ('regressor',
                                              RandomForestRegressor(n_jobs=-1))]),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'regressor__max_depth': [1, 2, 3, 4, 5,
                                                                 6, 7, 8, 9],
                                        'regressor__max_features': [1, 2, 3, 4,
                                                                    5, 6, 7, 8,
                                                                    9, 10, 11,
                                                                    12, 13, 14,
                                                                    15, 16, 17,
                                                                    18, 19...
              

In [5]:
results = pd.DataFrame(grid_search.cv_results_)
results[results["rank_test_modOOSR2"]<=5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_scaler,param_regressor__n_estimators,param_regressor__min_samples_split,param_regressor__min_samples_leaf,param_regressor__max_features,param_regressor__max_depth,params,split0_test_modOOSR2,split1_test_modOOSR2,split2_test_modOOSR2,mean_test_modOOSR2,std_test_modOOSR2,rank_test_modOOSR2
33,0.300844,0.003229,0.035113,0.019673,passthrough,58,6,11,6,1,"{'scaler': 'passthrough', 'regressor__n_estima...",0.010874,0.012042,-0.05976,-0.012282,0.033576,4
35,0.490892,0.01294,0.074494,0.004324,passthrough,92,9,12,10,1,"{'scaler': 'passthrough', 'regressor__n_estima...",0.020814,0.020326,-0.056461,-0.005107,0.036314,1
57,0.42044,0.02498,0.041655,0.014891,passthrough,95,7,18,1,1,"{'scaler': 'passthrough', 'regressor__n_estima...",0.008422,0.021405,-0.046837,-0.00567,0.029588,2
65,0.174937,0.027402,0.031957,0.024947,passthrough,25,8,10,27,1,"{'scaler': 'passthrough', 'regressor__n_estima...",0.009929,0.011744,-0.072784,-0.017037,0.039426,5
98,0.125545,0.024307,0.011177,0.00027,passthrough,16,9,14,6,1,"{'scaler': 'passthrough', 'regressor__n_estima...",0.004979,-0.003634,-0.037158,-0.011938,0.018177,3


In [6]:
grid_search.best_params_

{'scaler': 'passthrough',
 'regressor__n_estimators': 92,
 'regressor__min_samples_split': 9,
 'regressor__min_samples_leaf': 12,
 'regressor__max_features': 10,
 'regressor__max_depth': 1}

In [7]:
modOOSR2(y_test, grid_search.predict(X_test))

0.037303033337860225

In [23]:
best = grid_search.best_estimator_
best.named_steps["regressor"].feature_importances_

array([0.01666667, 0.02666667, 0.02666667, 0.00666667, 0.02333333,
       0.05666667, 0.01666667, 0.00666667, 0.00333333, 0.06      ,
       0.00333333, 0.02      , 0.00333333, 0.02666667, 0.        ,
       0.05      , 0.01      , 0.        , 0.02      , 0.01666667,
       0.02      , 0.01666667, 0.00666667, 0.05      , 0.        ,
       0.03333333, 0.01666667, 0.06333333, 0.03      , 0.03      ,
       0.04666667, 0.07333333, 0.01333333, 0.03      , 0.01      ,
       0.01333333, 0.01      , 0.01333333, 0.01333333, 0.00666667,
       0.01666667, 0.        , 0.01666667, 0.00666667, 0.01666667,
       0.00333333, 0.02      , 0.01666667, 0.01333333])