In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime

from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

from panel_utils import *

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from sklearn.dummy import DummyRegressor

In [2]:
df = pd.read_csv("panel_bigcap.csv", index_col="Date")

In [3]:
scale_interact_sort(df=df, micro_cols=micro_cols, macro_cols=macro_cols)

'Done.'

In [4]:
train = df.loc[:"2013-12-31"]
test = df.loc["2014-01-31":]

X_train, y_train = train.drop(["EXCESS_RETURN_T+1"], axis=1), train["EXCESS_RETURN_T+1"]
X_test, y_test = test.drop(["EXCESS_RETURN_T+1"], axis=1), test["EXCESS_RETURN_T+1"]

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((294327, 739), (294327,), (169245, 739), (169245,))

In [5]:
naive_mean = DummyRegressor(strategy="mean")
naive_mean.fit(X_train.values, y_train.values)
modOOSR2(y_test.values, naive_mean.predict(X_test.values))

# uses combined mean

0.0003301052587154629

In [6]:
from sklearn.linear_model import LinearRegression

pipeline = Pipeline([("scaler", "passthrough"), ("regressor", LinearRegression())])

param_grid = {
    "scaler": [
        "passthrough"
    ]
}

grid_search = GridSearchCV(
    estimator = pipeline,
    param_grid = param_grid,
    scoring={"modOOSR2": make_scorer(modOOSR2)},
    refit="modOOSR2",
    cv=holdout_cv(X_train, n_test = 3),
    n_jobs=-1,
)

grid_search.fit(X_train.values, y_train.values)

GridSearchCV(cv=<generator object holdout_cv at 0x7fda0ac24270>,
             estimator=Pipeline(steps=[('scaler', 'passthrough'),
                                       ('regressor', LinearRegression())]),
             n_jobs=-1, param_grid={'scaler': ['passthrough']},
             refit='modOOSR2', scoring={'modOOSR2': make_scorer(modOOSR2)})

In [7]:
results = pd.DataFrame(grid_search.cv_results_)
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_scaler,params,split0_test_modOOSR2,mean_test_modOOSR2,std_test_modOOSR2,rank_test_modOOSR2
0,18.338603,0.0,0.117771,0.0,passthrough,{'scaler': 'passthrough'},-0.298582,-0.298582,0.0,1


In [8]:
modOOSR2(y_test.values, grid_search.predict(X_test.values))

-0.010323065372606965

In [9]:
OOSR2(y_test.values, grid_search.predict(X_test.values), naive_mean.predict(X_test))

-0.010656688460223496

In [10]:
# best = grid_search.best_estimator_
# best.named_steps["regressor"].coef_, best.named_steps["regressor"].intercept_

In [21]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler
from sklearn.linear_model import Ridge
from sklearn.utils.fixes import loguniform
from sklearn.model_selection import RandomizedSearchCV

pipeline = Pipeline([
    ('scaler', "passthrough"),
    ('regressor', Ridge())
])

param_grid = {
    'scaler': ["passthrough"],
    'regressor__alpha': loguniform(0.001,10)
}

grid_search = RandomizedSearchCV(
    estimator=pipeline, 
    param_distributions= param_grid,
    scoring={"modOOSR2": make_scorer(modOOSR2)},
    refit="modOOSR2", 
    cv=holdout_cv(X_train, n_test = 3),
    n_iter=100, 
    n_jobs=-1,
    pre_dispatch =2,
    error_score= "raise"
    )

grid_search.fit(X_train.values, y_train.values)

RandomizedSearchCV(cv=<generator object holdout_cv at 0x7fde9553c580>,
                   error_score='raise',
                   estimator=Pipeline(steps=[('scaler', 'passthrough'),
                                             ('regressor', Ridge())]),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'regressor__alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fde953d8e80>,
                                        'scaler': ['passthrough']},
                   pre_dispatch=2, refit='modOOSR2',
                   scoring={'modOOSR2': make_scorer(modOOSR2)})

In [22]:
grid_search.best_estimator_

Pipeline(steps=[('scaler', 'passthrough'),
                ('regressor', Ridge(alpha=7.691678046431957))])

In [23]:
modOOSR2(y_test.values, grid_search.predict(X_test.values))

-0.0028320742316279013

In [24]:
OOSR2(y_test.values, grid_search.predict(X_test.values), naive_mean.predict(X_test.values))

-0.003163223687116856

In [26]:
from sklearn.linear_model import Lasso

pipeline = Pipeline([
    ('scaler', "passthrough"),
    ('regressor', Lasso())
])

param_grid = {
    'regressor__alpha': loguniform(0.001,10)  
}

grid_search = RandomizedSearchCV(
    estimator=pipeline, 
    param_distributions= param_grid,
    scoring={"modOOSR2": make_scorer(modOOSR2)},
    refit="modOOSR2", 
    cv=holdout_cv(X_train, n_test = 3),
    n_iter=100, 
    n_jobs=-1,
    pre_dispatch =2,
    error_score= "raise"
    )

grid_search.fit(X_train.values, y_train.values)

RandomizedSearchCV(cv=<generator object holdout_cv at 0x7fde9553cc80>,
                   error_score='raise',
                   estimator=Pipeline(steps=[('scaler', 'passthrough'),
                                             ('regressor', Lasso())]),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'regressor__alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fde953eeeb0>},
                   pre_dispatch=2, refit='modOOSR2',
                   scoring={'modOOSR2': make_scorer(modOOSR2)})

In [27]:
grid_search.best_estimator_

Pipeline(steps=[('scaler', 'passthrough'),
                ('regressor', Lasso(alpha=0.001250195254146877))])

In [28]:
modOOSR2(y_test.values, grid_search.predict(X_test.values))

0.00030492292704986657