In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime

from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

from panel_utils import *

import scipy.stats as stats

from sklearn.linear_model import ElasticNet
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from sklearn.dummy import DummyRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler
from sklearn.utils.fixes import loguniform

In [3]:
df = pd.read_csv("../_data/panel_bigcap.csv", index_col="Date")

In [4]:
scale_interact_sort(df=df, micro_cols=micro_cols, macro_cols=macro_cols, interact=False)

'Done.'

In [2]:
%store -r df

In [3]:
train = df.loc[:"2013-12-31"]
test = df.loc["2014-01-31":]

X_train, y_train = train.drop(["EXCESS_RETURN_T+1"], axis=1), train["EXCESS_RETURN_T+1"]
X_test, y_test = test.drop(["EXCESS_RETURN_T+1"], axis=1), test["EXCESS_RETURN_T+1"]

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((294327, 55), (294327,), (169245, 55), (169245,))

In [4]:
naive_mean = DummyRegressor(strategy="mean")
naive_mean.fit(X_train.values, y_train.values)
modOOSR2(y_test.values, naive_mean.predict(X_test.values))

0.0003301052587154629

In [5]:
pipeline = Pipeline([
    ('scaler', "passthrough"),
    ('regressor', ElasticNet())
])

param_grid = {
    'scaler': ["passthrough"],
    'regressor__alpha': loguniform(0.00075,10),
    "regressor__l1_ratio": stats.uniform(0,1)  
}


grid_search = RandomizedSearchCV(
    estimator=pipeline, 
    param_distributions= param_grid,
    scoring={"modOOSR2": make_scorer(modOOSR2)},
    refit="modOOSR2", 
    cv=holdout_cv(X_train, n_test = 3),
    n_iter=7500, 
    n_jobs=-1,
    pre_dispatch =8,
    error_score= "raise"
    )

grid_search.fit(X_train.values, y_train.values)

RandomizedSearchCV(cv=<generator object holdout_cv at 0x7efd8d1f5900>,
                   error_score='raise',
                   estimator=Pipeline(steps=[('scaler', 'passthrough'),
                                             ('regressor', ElasticNet())]),
                   n_iter=7500, n_jobs=-1,
                   param_distributions={'regressor__alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7efd8c98a400>,
                                        'regressor__l1_ratio': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7efd8c9974c0>,
                                        'scaler': ['passthrough']},
                   pre_dispatch=8, refit='modOOSR2',
                   scoring={'modOOSR2': make_scorer(modOOSR2)})

In [6]:
results = pd.DataFrame(grid_search.cv_results_)
results[results["rank_test_modOOSR2"]<=5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_regressor__alpha,param_regressor__l1_ratio,param_scaler,params,split0_test_modOOSR2,mean_test_modOOSR2,std_test_modOOSR2,rank_test_modOOSR2
677,0.415491,0.0,0.004289,0.0,0.001931,0.647395,passthrough,"{'regressor__alpha': 0.0019310647602558994, 'r...",0.028279,0.028279,0.0,3
5506,0.402627,0.0,0.007609,0.0,0.002065,0.609766,passthrough,"{'regressor__alpha': 0.0020645365816346116, 'r...",0.028277,0.028277,0.0,5
6240,0.396312,0.0,0.004353,0.0,0.00157,0.801711,passthrough,"{'regressor__alpha': 0.0015699644419484011, 'r...",0.028288,0.028288,0.0,2
7102,0.366938,0.0,0.004412,0.0,0.001429,0.879127,passthrough,"{'regressor__alpha': 0.0014292193940235508, 'r...",0.028297,0.028297,0.0,1
7320,0.482345,0.0,0.004464,0.0,0.002245,0.55999,passthrough,"{'regressor__alpha': 0.0022453741889436572, 'r...",0.028277,0.028277,0.0,4


In [7]:
grid_search.best_estimator_

Pipeline(steps=[('scaler', 'passthrough'),
                ('regressor',
                 ElasticNet(alpha=0.0014292193940235508,
                            l1_ratio=0.8791267396726554))])

In [8]:
modOOSR2(y_test.values, grid_search.predict(X_test.values))

0.00030520728620131443

In [9]:
OOSR2(y_test.values, grid_search.predict(X_test.values), naive_mean.predict(X_test.values))

-2.490619417994111e-05

In [10]:
best = grid_search.best_estimator_
best.named_steps["regressor"].intercept_

0.01761730963384324

In [11]:
best.named_steps["regressor"].coef_

array([-0.        ,  0.        , -0.        , -0.        , -0.        ,
       -0.00842496, -0.        , -0.        , -0.        , -0.        ,
        0.        , -0.        , -0.        , -0.        , -0.        ,
       -0.        ,  0.        ,  0.        , -0.        , -0.        ,
       -0.        , -0.        , -0.        , -0.        , -0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        , -0.        , -0.        ,
       -0.        , -0.        ,  0.        , -0.        ,  0.        ,
        0.        , -0.        ,  0.        ,  0.        , -0.        ,
       -0.        , -0.        , -0.        ,  0.        ,  0.        ,
        0.        , -0.        ,  0.        ,  0.        , -0.        ])