In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime

from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

from panel_utils import *

import scipy.stats as stats

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from sklearn.dummy import DummyRegressor
from sklearn.utils.fixes import loguniform

In [2]:
df = pd.read_csv("../_data/panel_bigcap.csv", index_col="Date")

In [2]:
#scale_interact_sort(df=df, micro_cols=micro_cols, macro_cols=macro_cols)
%store -r df

In [3]:
train = df.loc[:"2013-12-31"]
test = df.loc["2014-01-31":]

X_train, y_train = train.drop(["EXCESS_RETURN_T+1"], axis=1), train["EXCESS_RETURN_T+1"]
X_test, y_test = test.drop(["EXCESS_RETURN_T+1"], axis=1), test["EXCESS_RETURN_T+1"]

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((294327, 55), (294327,), (169245, 55), (169245,))

In [4]:
naive_mean = DummyRegressor(strategy="mean")
naive_mean.fit(X_train.values, y_train.values)
modOOSR2(y_test.values, naive_mean.predict(X_test.values))

0.0003301052587154629

In [6]:
pipeline = Pipeline([
    ('scaler', "passthrough"),
    ('regressor', GradientBoostingRegressor())
])

param_grid = {
    'scaler': ["passthrough"],
    "regressor__n_estimators": [5,10,25,50,100],
    "regressor__learning_rate": loguniform(0.0001,10),
    "regressor__subsample": [0.8,0.9,1],
    "regressor__max_depth": [1,3,5,10],
    "regressor__max_features": [20,40,49]
    
}

grid_search = RandomizedSearchCV(
    estimator=pipeline, 
    param_distributions= param_grid,
    scoring={"modOOSR2": make_scorer(modOOSR2)},
    refit="modOOSR2", 
    cv=holdout_cv(X_train, n_test = 3),
    n_iter=60, 
    n_jobs=-1,
    pre_dispatch =8,
    error_score= "raise",
    verbose=2
    )

grid_search.fit(X_train.values, y_train.values)

Fitting 1 folds for each of 60 candidates, totalling 60 fits


RandomizedSearchCV(cv=<generator object holdout_cv at 0x7fb0c95d1c80>,
                   error_score='raise',
                   estimator=Pipeline(steps=[('scaler', 'passthrough'),
                                             ('regressor',
                                              GradientBoostingRegressor())]),
                   n_iter=60, n_jobs=-1,
                   param_distributions={'regressor__learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fb0d592cca0>,
                                        'regressor__max_depth': [1, 3, 5, 10],
                                        'regressor__max_features': [20, 40, 49],
                                        'regressor__n_estimators': [5, 10, 25,
                                                                    50, 100],
                                        'regressor__subsample': [0.8, 0.9, 1],
                                        'scaler': ['passthrough']},
                   pre_dispatch=8, 

In [7]:
results = pd.DataFrame(grid_search.cv_results_)
results[results["rank_test_modOOSR2"]<=5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_regressor__learning_rate,param_regressor__max_depth,param_regressor__max_features,param_regressor__n_estimators,param_regressor__subsample,param_scaler,params,split0_test_modOOSR2,mean_test_modOOSR2,std_test_modOOSR2,rank_test_modOOSR2
23,2.48788,0.0,0.006789,0.0,0.256706,1,20,5,1.0,passthrough,{'regressor__learning_rate': 0.256705853020614...,0.050567,0.050567,0.0,1
30,41.918709,0.0,0.016476,0.0,0.130991,1,40,50,0.9,passthrough,{'regressor__learning_rate': 0.130990601061071...,0.049299,0.049299,0.0,2
40,70.109964,0.0,0.02514,0.0,0.049873,3,20,50,1.0,passthrough,{'regressor__learning_rate': 0.049873036797158...,0.041214,0.041214,0.0,5
46,21.599878,0.0,0.017287,0.0,0.250226,1,20,50,0.9,passthrough,{'regressor__learning_rate': 0.250225619175599...,0.041737,0.041737,0.0,4
48,5.187519,0.0,0.006663,0.0,0.128398,1,49,5,0.9,passthrough,{'regressor__learning_rate': 0.128398369390534...,0.046667,0.046667,0.0,3


In [8]:
grid_search.best_params_

{'regressor__learning_rate': 0.25670585302061466,
 'regressor__max_depth': 1,
 'regressor__max_features': 20,
 'regressor__n_estimators': 5,
 'regressor__subsample': 1,
 'scaler': 'passthrough'}

In [10]:
modOOSR2(y_test.values, grid_search.predict(X_test.values))

-0.00045075690618556763

In [43]:
best = grid_search.best_estimator_
best.named_steps["regressor"].feature_importances_

array([1.15634837e-02, 9.24230703e-03, 2.81988277e-02, 3.39041370e-02,
       1.30781241e-02, 8.37632593e-02, 3.42711992e-03, 5.77033961e-03,
       4.39398685e-03, 2.22212804e-03, 1.96496201e-02, 8.01056142e-03,
       2.56867927e-02, 6.31502718e-02, 0.00000000e+00, 7.56253213e-05,
       3.90678409e-04, 2.55075415e-04, 1.02803192e-02, 3.46378409e-03,
       1.21296712e-02, 1.29104016e-02, 3.49566634e-03, 4.90203115e-03,
       1.03390448e-02, 1.27002914e-02, 1.01909039e-02, 1.27585715e-02,
       3.01244805e-02, 3.60500012e-02, 4.14059664e-02, 8.87643704e-02,
       3.34041559e-02, 7.73414832e-03, 1.46543149e-02, 3.12338000e-02,
       2.24799717e-02, 6.38623020e-03, 3.89205617e-02, 1.58276938e-02,
       8.19324396e-02, 6.00770333e-04, 2.40144133e-02, 2.20317646e-02,
       3.47399527e-02, 1.67882302e-02, 2.28482039e-02, 1.48399579e-02,
       9.26554904e-03])