In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime

from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

from panel_utils import *

import scipy.stats as stats

from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from sklearn.dummy import DummyRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler
from sklearn.utils.fixes import loguniform

In [3]:
df = pd.read_csv("../_data/panel_bigcap.csv", index_col="Date")

In [4]:
scale_interact_sort(df=df, micro_cols=micro_cols, macro_cols=macro_cols, interact=False)

'Done.'

In [3]:
%store -r df

In [4]:
train = df.loc[:"2013-12-31"]
test = df.loc["2014-01-31":]

X_train, y_train = train.drop(["EXCESS_RETURN_T+1"], axis=1), train["EXCESS_RETURN_T+1"]
X_test, y_test = test.drop(["EXCESS_RETURN_T+1"], axis=1), test["EXCESS_RETURN_T+1"]

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((294327, 55), (294327,), (169245, 55), (169245,))

In [5]:
naive_mean = DummyRegressor(strategy="mean")
naive_mean.fit(X_train.values, y_train.values)
modOOSR2(y_test.values, naive_mean.predict(X_test.values))

0.0003301052587154629

In [10]:
pipeline = Pipeline([
    ('scaler', "passthrough"),
    ("pca", PCA()),
    ('regressor', LinearRegression())
])

param_grid = {
    'scaler': ["passthrough"],
    "pca__n_components": [i for i in range(1,20)] 
}

grid_search = GridSearchCV(
    estimator=pipeline, 
    param_grid = param_grid,
    scoring={"modOOSR2": make_scorer(modOOSR2)},
    refit="modOOSR2", 
    cv=holdout_cv(X_train, n_test = 3),
    n_jobs=-1,
    pre_dispatch=8, 
    error_score= "raise"
    )

grid_search.fit(X_train.values, y_train.values)

GridSearchCV(cv=<generator object holdout_cv at 0x7f7f3957a3c0>,
             error_score='raise',
             estimator=Pipeline(steps=[('scaler', 'passthrough'),
                                       ('pca', PCA()),
                                       ('regressor', LinearRegression())]),
             n_jobs=-1,
             param_grid={'pca__n_components': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                                               11, 12, 13, 14, 15, 16, 17, 18,
                                               19],
                         'scaler': ['passthrough']},
             pre_dispatch=8, refit='modOOSR2',
             scoring={'modOOSR2': make_scorer(modOOSR2)})

In [11]:
results = pd.DataFrame(grid_search.cv_results_)
results[results["rank_test_modOOSR2"]<=5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_pca__n_components,param_scaler,params,split0_test_modOOSR2,mean_test_modOOSR2,std_test_modOOSR2,rank_test_modOOSR2
0,1.180017,0.0,0.008543,0.0,1,passthrough,"{'pca__n_components': 1, 'scaler': 'passthrough'}",0.029337,0.029337,0.0,1
1,1.204533,0.0,0.008798,0.0,2,passthrough,"{'pca__n_components': 2, 'scaler': 'passthrough'}",0.026305,0.026305,0.0,4
2,1.345911,0.0,0.009712,0.0,3,passthrough,"{'pca__n_components': 3, 'scaler': 'passthrough'}",0.026536,0.026536,0.0,3
4,1.56771,0.0,0.031158,0.0,5,passthrough,"{'pca__n_components': 5, 'scaler': 'passthrough'}",0.02759,0.02759,0.0,2
5,1.167068,0.0,0.010013,0.0,6,passthrough,"{'pca__n_components': 6, 'scaler': 'passthrough'}",0.025773,0.025773,0.0,5


In [12]:
grid_search.best_estimator_

Pipeline(steps=[('scaler', 'passthrough'), ('pca', PCA(n_components=1)),
                ('regressor', LinearRegression())])

In [13]:
modOOSR2(y_test.values, grid_search.predict(X_test.values))

0.00038378421292395437

In [14]:
OOSR2(y_test.values, grid_search.predict(X_test.values), naive_mean.predict(X_test.values))

5.369667976484749e-05

In [15]:
best = grid_search.best_estimator_
best.named_steps["regressor"].intercept_

0.013261964413730306

In [16]:
best.named_steps["regressor"].coef_

array([0.00610542])

In [17]:
best.named_steps["pca"].components_

array([[-4.71160658e-02, -4.70330389e-03, -6.39596516e-02,
        -4.55053945e-02, -1.76902948e-01, -3.29501955e-01,
        -3.44011301e-01, -1.23817920e-01, -1.42812611e-01,
        -3.65047745e-01,  1.09353535e-01,  3.23651944e-02,
        -1.71648167e-01, -8.23732134e-04, -2.23750571e-01,
        -3.64326336e-01,  6.93703580e-02, -2.40066732e-01,
        -1.72588161e-01, -2.61666543e-02,  1.55941512e-02,
        -8.48931571e-03, -4.57233268e-01, -4.48067492e-02,
        -1.55202972e-01,  1.91874246e-03,  2.42434538e-03,
         2.89100725e-03,  6.91319447e-04,  4.60209064e-03,
         5.30744965e-03, -1.21652627e-03, -1.44620756e-02,
        -2.86953428e-02,  2.16089463e-02, -5.86589316e-02,
        -5.75482231e-02, -2.09442886e-02,  3.86238289e-03,
        -4.20752903e-02,  2.02310034e-03, -2.07229737e-03,
         1.55504872e-03, -7.75966373e-03,  1.62536245e-03,
         1.43122456e-03,  2.34699596e-03, -4.14722091e-04,
         3.87432730e-03, -4.79633770e-03, -8.43370139e-0

In [18]:
best.named_steps["pca"].explained_variance_ratio_

array([0.30221361])