In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime

# Silence pandas' PerformanceWarning for everything that runs after this call.
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

# NOTE(review): the wildcard import hides where names come from. The names used
# further down without a visible definition (scale_interact_sort, micro_cols,
# macro_cols, modOOSR2, OOSR2, holdout_cv) presumably come from panel_utils —
# confirm, and consider importing them explicitly.
from panel_utils import *

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from sklearn.dummy import DummyRegressor

In [3]:
# Read the big-cap panel CSV, using its "Date" column as the row index.
df = pd.read_csv(
    filepath_or_buffer="../_data/panel_bigcap.csv",
    index_col="Date",
)

In [4]:
# Run the project's preprocessing helper on the panel, called with interact=False.
# NOTE(review): the return value is not assigned, so `df` is presumably modified
# in place — confirm against the helper's definition.
scale_interact_sort(
    df=df,
    micro_cols=micro_cols,
    macro_cols=macro_cols,
    interact=False,
)

'Done.'

In [5]:
# Split by index label: everything up to and including "2013-12-31" is used for
# training, everything from "2014-01-31" onwards for testing.
# NOTE(review): any row whose label falls strictly between those two bounds ends
# up in neither split — confirm the panel contains no such dates.
train = df.loc[:"2013-12-31"]
test = df.loc["2014-01-31":]

# Separate the target column from the remaining (feature) columns.
X_train = train.drop(columns=["EXCESS_RETURN_T+1"])
y_train = train["EXCESS_RETURN_T+1"]
X_test = test.drop(columns=["EXCESS_RETURN_T+1"])
y_test = test["EXCESS_RETURN_T+1"]

# Show the four shapes as a sanity check.
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((294327, 55), (294327,), (169245, 55), (169245,))

In [7]:
# Baseline model: DummyRegressor with the "mean" strategy, fitted on the training arrays.
naive_mean = DummyRegressor(strategy="mean").fit(X_train.values, y_train.values)

# Score the baseline's predictions on the test period.
modOOSR2(
    y_test.values,
    naive_mean.predict(X_test.values),
)

0.0003301052587154629

In [8]:
# Plain linear regression wrapped in a pipeline. The "scaler" step is a
# pass-through placeholder, so the data reaches the regressor unchanged.
pipeline = Pipeline([
    ("scaler", "passthrough"),
    ("regressor", LinearRegression()),
])

# Only one candidate is searched: keep the "scaler" step as a pass-through.
param_grid = {
    "scaler": ["passthrough"],
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    # Candidates are scored with the project metric and the best one is refit on it.
    scoring={"modOOSR2": make_scorer(modOOSR2)},
    refit="modOOSR2",
    # holdout_cv hands back a generator, which can be iterated only once.
    # Materialising the splits keeps the search object re-fittable and clonable;
    # the splits used by the first fit are the same either way.
    cv=list(holdout_cv(X_train, n_test=3)),
    n_jobs=-1,
)

# Fit on plain NumPy arrays; the fitted search object is the cell's output.
grid_search.fit(X_train.values, y_train.values)

GridSearchCV(cv=<generator object holdout_cv at 0x7fc1a41c9c10>,
             estimator=Pipeline(steps=[('scaler', 'passthrough'),
                                       ('regressor', LinearRegression())]),
             n_jobs=-1, param_grid={'scaler': ['passthrough']},
             refit='modOOSR2', scoring={'modOOSR2': make_scorer(modOOSR2)})

In [9]:
# Tabulate the search diagnostics (timings and validation scores) as a DataFrame.
results = pd.DataFrame.from_dict(grid_search.cv_results_)
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_scaler,params,split0_test_modOOSR2,mean_test_modOOSR2,std_test_modOOSR2,rank_test_modOOSR2
0,0.920769,0.0,0.003955,0.0,passthrough,{'scaler': 'passthrough'},-0.009477,-0.009477,0.0,1


In [10]:
# Score the refit pipeline's predictions on the test period with the project metric.
modOOSR2(
    y_test.values,
    grid_search.predict(X_test.values),
)

-0.0010654491519284104

In [11]:
# Compare the pipeline's test-period predictions with those of the naive-mean
# baseline (third argument — presumably the benchmark forecast; confirm in panel_utils).
# The baseline was fitted on plain arrays, so it is asked to predict on
# X_test.values as well, matching every other predict call in this notebook.
OOSR2(
    y_test.values,
    grid_search.predict(X_test.values),
    naive_mean.predict(X_test.values),
)

-0.0013960152426166772

In [12]:
# Keep a handle on the refit pipeline and show the intercept of its regression step.
best = grid_search.best_estimator_
best.named_steps.regressor.intercept_

0.10737538057249138

In [13]:
# Fitted coefficients of the regression step in the refit pipeline.
best.named_steps.regressor.coef_

array([ 0.05604896,  0.00738249, -0.02568091, -0.06730417, -0.09410128,
        0.00953252,  0.02965718,  0.07239039,  0.00971814, -0.01760572,
        0.01669102, -0.00329789,  0.01175747, -0.01420714,  0.0354925 ,
       -0.02817834,  0.02597632, -0.00573317, -0.00913397, -0.01222652,
       -0.0027195 , -0.01152957,  0.00215695, -0.0765434 , -0.01357393,
        0.00050511, -0.00538462, -0.00113585,  0.0090941 , -0.07983725,
        0.07036217,  0.01286296,  0.02166948, -0.02465627, -0.08984607,
        0.06686913, -0.08831921, -0.0819499 , -0.0343548 ,  0.16941289,
        0.04373004,  0.13088745,  0.12972712, -0.01175223, -0.08994273,
       -0.16150707,  0.03613928, -0.04585292,  0.08039465,  0.01559979,
        0.03327381,  0.02392419,  0.03836143, -0.02321838, -0.13538383])

In [14]:
# Persist `df` (as it stands at this point in the session) with IPython's
# %store magic so another notebook or kernel can restore it via `%store -r df`.
%store df

Stored 'df' (DataFrame)
