In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime

from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

from panel_utils import *

import scipy.stats as stats

from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from sklearn.dummy import DummyRegressor
from sklearn.utils.fixes import loguniform

In [2]:
df = pd.read_csv("../_data/panel_bigcap.csv", index_col="Date")

In [2]:
#scale_interact_sort(df=df, micro_cols=micro_cols, macro_cols=macro_cols)
%store -r df

In [3]:
train = df.loc[:"2013-12-31"]
test = df.loc["2014-01-31":]

X_train, y_train = train.drop(["EXCESS_RETURN_T+1"], axis=1), train["EXCESS_RETURN_T+1"]
X_test, y_test = test.drop(["EXCESS_RETURN_T+1"], axis=1), test["EXCESS_RETURN_T+1"]

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((294327, 55), (294327,), (169245, 55), (169245,))

In [4]:
naive_mean = DummyRegressor(strategy="mean")
naive_mean.fit(X_train.values, y_train.values)
modOOSR2(y_test.values, naive_mean.predict(X_test.values))

0.0003301052587154629

In [5]:
pipeline = Pipeline([
    ('scaler', "passthrough"),
    ('regressor', MLPRegressor(
                    hidden_layer_sizes="passthrough",
                    activation='relu',
                    random_state=20))
])

param_grid = {
    'scaler': ["passthrough"],
    'regressor__hidden_layer_sizes': [(100,100), (50,50,50)],
    'regressor__alpha': loguniform(0.001,1)
}


grid_search = RandomizedSearchCV(
    estimator=pipeline, 
    param_distributions= param_grid,
    scoring={"modOOSR2": make_scorer(modOOSR2)},
    refit="modOOSR2", 
    cv=holdout_cv(X_train, n_test = 3),
    n_iter=20, 
    n_jobs=-1,
    pre_dispatch =8,
    error_score= "raise",
    verbose=2
    )

grid_search.fit(X_train.values, y_train.values)

Fitting 1 folds for each of 20 candidates, totalling 20 fits


RandomizedSearchCV(cv=<generator object holdout_cv at 0x7f9c72d8bf20>,
                   error_score='raise',
                   estimator=Pipeline(steps=[('scaler', 'passthrough'),
                                             ('regressor',
                                              MLPRegressor(hidden_layer_sizes='passthrough',
                                                           random_state=20))]),
                   n_iter=20, n_jobs=-1,
                   param_distributions={'regressor__alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f9c66255dc0>,
                                        'regressor__hidden_layer_sizes': [(100,
                                                                           100),
                                                                          (50,
                                                                           50,
                                                                           50)],
            

In [6]:
results = pd.DataFrame(grid_search.cv_results_)
results[results["rank_test_modOOSR2"] <= 5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_regressor__alpha,param_regressor__hidden_layer_sizes,param_scaler,params,split0_test_modOOSR2,mean_test_modOOSR2,std_test_modOOSR2,rank_test_modOOSR2
0,31.871954,0.0,0.630455,0.0,0.161954,"(50, 50, 50)",passthrough,"{'regressor__alpha': 0.1619542283650893, 'regr...",0.02477,0.02477,0.0,3
3,58.350364,0.0,0.738951,0.0,0.019487,"(50, 50, 50)",passthrough,"{'regressor__alpha': 0.019486521256755224, 're...",0.034306,0.034306,0.0,2
7,42.465545,0.0,0.733107,0.0,0.082439,"(50, 50, 50)",passthrough,"{'regressor__alpha': 0.08243898359738895, 'reg...",0.024727,0.024727,0.0,5
13,73.764418,0.0,1.350279,0.0,0.034999,"(100, 100)",passthrough,"{'regressor__alpha': 0.03499869512425072, 'reg...",0.048912,0.048912,0.0,1
18,31.86388,0.0,0.620688,0.0,0.130643,"(50, 50, 50)",passthrough,"{'regressor__alpha': 0.13064343270242756, 'reg...",0.024753,0.024753,0.0,4


In [7]:
grid_search.best_estimator_

Pipeline(steps=[('scaler', 'passthrough'),
                ('regressor',
                 MLPRegressor(alpha=0.03499869512425072,
                              hidden_layer_sizes=(100, 100),
                              random_state=20))])

In [8]:
modOOSR2(y_test.values, grid_search.predict(X_test.values))

-0.001221654622627355

In [18]:
OOSR2(y_test.values, grid_search.predict(X_test.values), naive_mean.predict(X_test.values))

-2.5068820671769743e-05