In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime

from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

from panel_utils import *

import scipy.stats as stats

from sklearn.linear_model import ElasticNet
from sklearn.decomposition import PCA
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from sklearn.dummy import DummyRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler
from sklearn.utils.fixes import loguniform

In [3]:
df = pd.read_csv("../_data/panel_bigcap.csv", index_col="Date")

In [4]:
scale_interact_sort(df=df, micro_cols=micro_cols, macro_cols=macro_cols, interact=False)

'Done.'

In [2]:
%store -r df

In [3]:
train = df.loc[:"2013-12-31"]
test = df.loc["2014-01-31":]

X_train, y_train = train.drop(["EXCESS_RETURN_T+1"], axis=1), train["EXCESS_RETURN_T+1"]
X_test, y_test = test.drop(["EXCESS_RETURN_T+1"], axis=1), test["EXCESS_RETURN_T+1"]

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((294327, 55), (294327,), (169245, 55), (169245,))

In [4]:
naive_mean = DummyRegressor(strategy="mean")
naive_mean.fit(X_train.values, y_train.values)
modOOSR2(y_test.values, naive_mean.predict(X_test.values))

0.0003301052587154629

In [7]:
pipeline = Pipeline([
    ('scaler', "passthrough"),
    ("pca", PCA()),
    ('regressor', ElasticNet())
])

param_grid = {
    'scaler': ["passthrough"],
    "pca__n_components": [i for i in range(1,21)],
    'regressor__alpha': loguniform(0.001,10),
    "regressor__l1_ratio": stats.uniform(0,1)
}

grid_search = RandomizedSearchCV(
    estimator=pipeline, 
    param_distributions= param_grid,
    scoring={"modOOSR2": make_scorer(modOOSR2)},
    refit="modOOSR2", 
    cv=holdout_cv(X_train, n_test = 3),
    n_iter=250, 
    n_jobs=-1,
    pre_dispatch =8,
    error_score= "raise"
    )

grid_search.fit(X_train.values, y_train.values)

RandomizedSearchCV(cv=<generator object holdout_cv at 0x7fdd41cafc10>,
                   error_score='raise',
                   estimator=Pipeline(steps=[('scaler', 'passthrough'),
                                             ('pca', PCA()),
                                             ('regressor', ElasticNet())]),
                   n_iter=250, n_jobs=-1,
                   param_distributions={'pca__n_components': [1, 2, 3, 4, 5, 6,
                                                              7, 8, 9, 10, 11,
                                                              12, 13, 14, 15,
                                                              16, 17, 18, 19,
                                                              20],
                                        'regressor__alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fdd603d2640>,
                                        'regressor__l1_ratio': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fdd

In [8]:
results = pd.DataFrame(grid_search.cv_results_)
results[results["rank_test_modOOSR2"]<=5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_pca__n_components,param_regressor__alpha,param_regressor__l1_ratio,param_scaler,params,split0_test_modOOSR2,mean_test_modOOSR2,std_test_modOOSR2,rank_test_modOOSR2
5,1.371178,0.0,0.018561,0.0,10,0.006574,0.204267,passthrough,"{'pca__n_components': 10, 'regressor__alpha': ...",0.027705,0.027705,0.0,5
33,1.472645,0.0,0.016812,0.0,11,0.00185,0.805049,passthrough,"{'pca__n_components': 11, 'regressor__alpha': ...",0.027866,0.027866,0.0,3
35,1.198307,0.0,0.008574,0.0,1,0.00246,0.482845,passthrough,"{'pca__n_components': 1, 'regressor__alpha': 0...",0.028015,0.028015,0.0,2
55,1.222568,0.0,0.010699,0.0,7,0.00483,0.166017,passthrough,"{'pca__n_components': 7, 'regressor__alpha': 0...",0.027725,0.027725,0.0,4
214,1.238598,0.0,0.008778,0.0,1,0.001038,0.595005,passthrough,"{'pca__n_components': 1, 'regressor__alpha': 0...",0.02871,0.02871,0.0,1


In [9]:
grid_search.best_estimator_

Pipeline(steps=[('scaler', 'passthrough'), ('pca', PCA(n_components=1)),
                ('regressor',
                 ElasticNet(alpha=0.0010377793860502193,
                            l1_ratio=0.5950050366551096))])

In [10]:
modOOSR2(y_test.values, grid_search.predict(X_test.values))

0.00038013267813630236

In [11]:
OOSR2(y_test.values, grid_search.predict(X_test.values), naive_mean.predict(X_test.values))

5.004393918839334e-05

In [12]:
best = grid_search.best_estimator_
best.named_steps["regressor"].intercept_

0.013261964413730306

In [13]:
best.named_steps["regressor"].coef_

array([0.00507427])

In [14]:
best.named_steps["pca"].components_

array([[-4.71160658e-02, -4.70330396e-03, -6.39596515e-02,
        -4.55053944e-02, -1.76902948e-01, -3.29501955e-01,
        -3.44011301e-01, -1.23817920e-01, -1.42812611e-01,
        -3.65047745e-01,  1.09353535e-01,  3.23651944e-02,
        -1.71648167e-01, -8.23732129e-04, -2.23750571e-01,
        -3.64326336e-01,  6.93703580e-02, -2.40066732e-01,
        -1.72588161e-01, -2.61666543e-02,  1.55941512e-02,
        -8.48931579e-03, -4.57233268e-01, -4.48067492e-02,
        -1.55202972e-01,  1.91874260e-03,  2.42434547e-03,
         2.89100724e-03,  6.91319467e-04,  4.60209054e-03,
         5.30744960e-03, -1.21652646e-03, -1.44620757e-02,
        -2.86953427e-02,  2.16089463e-02, -5.86589316e-02,
        -5.75482230e-02, -2.09442885e-02,  3.86238275e-03,
        -4.20752903e-02,  2.02310038e-03, -2.07229717e-03,
         1.55504847e-03, -7.75966351e-03,  1.62536247e-03,
         1.43122459e-03,  2.34699599e-03, -4.14722143e-04,
         3.87432726e-03, -4.79633762e-03, -8.43370047e-0

In [15]:
best.named_steps["pca"].explained_variance_ratio_

array([0.30221361])