In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime

from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

from panel_utils import *

import scipy.stats as stats

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from sklearn.dummy import DummyRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler
from sklearn.utils.fixes import loguniform

In [3]:
df = pd.read_csv("../_data/panel_bigcap.csv", index_col="Date")

In [4]:
scale_interact_sort(df=df, micro_cols=micro_cols, macro_cols=macro_cols, interact=False)

'Done.'

In [2]:
%store -r df

In [3]:
train = df.loc[:"2013-12-31"]
test = df.loc["2014-01-31":]

X_train, y_train = train.drop(["EXCESS_RETURN_T+1"], axis=1), train["EXCESS_RETURN_T+1"]
X_test, y_test = test.drop(["EXCESS_RETURN_T+1"], axis=1), test["EXCESS_RETURN_T+1"]

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((294327, 55), (294327,), (169245, 55), (169245,))

In [4]:
naive_mean = DummyRegressor(strategy="mean")
naive_mean.fit(X_train.values, y_train.values)
modOOSR2(y_test.values, naive_mean.predict(X_test.values))

0.0003301052587154629

In [22]:
pipeline = Pipeline([
    ('scaler', "passthrough"),
    ('regressor', RandomForestRegressor(n_jobs=6))
])

param_grid = {
    'scaler': ["passthrough"],
    "regressor__n_estimators": [i for i in np.arange(5,100)],
    "regressor__max_depth": [i for i in np.arange(1,10)],
    "regressor__max_features": [10,20,30,40,50,55],
}

grid_search = RandomizedSearchCV(
    estimator=pipeline, 
    param_distributions= param_grid,
    scoring={"modOOSR2": make_scorer(modOOSR2)},
    refit="modOOSR2", 
    cv=holdout_cv(X_train, n_test = 3),
    n_iter=100, 
    n_jobs=2,
    #pre_dispatch =4,
    error_score= "raise",
    verbose=2
    )

grid_search.fit(X_train.values, y_train.values)

Fitting 1 folds for each of 100 candidates, totalling 100 fits


RandomizedSearchCV(cv=<generator object holdout_cv at 0x7fc8bcb34040>,
                   error_score='raise',
                   estimator=Pipeline(steps=[('scaler', 'passthrough'),
                                             ('regressor',
                                              RandomForestRegressor(n_jobs=6))]),
                   n_iter=100, n_jobs=2,
                   param_distributions={'regressor__max_depth': [1, 2, 3, 4, 5,
                                                                 6, 7, 8, 9],
                                        'regressor__max_features': [10, 20, 30,
                                                                    40, 50,
                                                                    55],
                                        'regressor__n_estimators': [5, 6, 7, 8,
                                                                    9, 10, 11,
                                                                    12, 13, 14,
         

In [23]:
results = pd.DataFrame(grid_search.cv_results_)
results[results["rank_test_modOOSR2"]<=5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_scaler,param_regressor__n_estimators,param_regressor__max_features,param_regressor__max_depth,params,split0_test_modOOSR2,mean_test_modOOSR2,std_test_modOOSR2,rank_test_modOOSR2
14,13.644228,0.0,0.049466,0.0,passthrough,76,20,2,"{'scaler': 'passthrough', 'regressor__n_estima...",0.066754,0.066754,0.0,3
17,17.545683,0.0,0.041568,0.0,passthrough,63,30,2,"{'scaler': 'passthrough', 'regressor__n_estima...",0.065156,0.065156,0.0,5
86,1.990952,0.0,0.017496,0.0,passthrough,9,40,1,"{'scaler': 'passthrough', 'regressor__n_estima...",0.065252,0.065252,0.0,4
91,3.370381,0.0,0.023288,0.0,passthrough,12,20,3,"{'scaler': 'passthrough', 'regressor__n_estima...",0.070295,0.070295,0.0,2
97,8.845248,0.0,0.057105,0.0,passthrough,93,10,2,"{'scaler': 'passthrough', 'regressor__n_estima...",0.071735,0.071735,0.0,1


In [24]:
grid_search.best_estimator_

Pipeline(steps=[('scaler', 'passthrough'),
                ('regressor',
                 RandomForestRegressor(max_depth=2, max_features=10,
                                       n_estimators=93, n_jobs=6))])

In [25]:
modOOSR2(y_test.values, grid_search.predict(X_test.values))

-0.000391518150405501

In [26]:
OOSR2(y_test.values, grid_search.predict(X_test.values), naive_mean.predict(X_test.values))

-0.0007218616994641991

In [27]:
best = grid_search.best_estimator_
best.named_steps["regressor"].feature_importances_

array([0.00000000e+00, 6.75217589e-04, 8.47110297e-04, 0.00000000e+00,
       0.00000000e+00, 1.45062685e-03, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 3.89043214e-02, 1.15175366e-02,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       1.42253259e-03, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.95350402e-04,
       0.00000000e+00, 0.00000000e+00, 2.53669682e-03, 1.03073072e-02,
       0.00000000e+00, 3.15981421e-03, 3.46225443e-03, 1.16838218e-03,
       4.92910288e-02, 1.19866584e-02, 1.10437463e-01, 5.21174410e-02,
       1.67689702e-02, 1.05615659e-01, 8.59335494e-02, 8.51722340e-03,
       1.64581628e-02, 2.80476239e-03, 1.21855466e-02, 7.51558880e-03,
       1.46454204e-02, 1.19010948e-01, 2.02580861e-01, 7.05219185e-02,
       1.94835681e-02, 3.68105722e-03, 1.47970231e-02])