In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime

from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.decomposition import PCA
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyRegressor
from sklearn.utils.fixes import loguniform
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

import scipy.stats as stats

from panel_utils import *


In [None]:
# Read the big-cap panel keyed by its "Date" column and keep only the rows
# whose index label compares <= the string "2019.12.31".
df = pd.read_csv("../_data/panelbigcap.csv", index_col="Date").loc[
    lambda panel: panel.index <= "2019.12.31"
]

# micro_cols / macro_cols are not defined in this notebook; presumably they are
# provided by the star-import of panel_utils -- TODO confirm.
# The helper's return value is discarded here, so whatever it does to the
# panel has to happen in place on `df`.
scale_interact_sort(
    df=df,
    interact=False,
    micro_cols=micro_cols,
    macro_cols=macro_cols,
)

# Round every numeric value to four decimal places.
df = df.round(decimals=4)


In [None]:
# Reload `df` from IPython's %store database. This rebinds `df`, so the frame
# built by the CSV-loading cell above is replaced by whatever was stored earlier.
# NOTE(review): this only works if `%store df` was run in some earlier session;
# confirm the stored frame is the one the rest of the notebook expects.
%store -r df

In [None]:
# Static split on the index label: rows up to and including "2009.12.31" form
# the training sample, every later row belongs to the test sample.
train = df[df.index <= "2009.12.31"]
test = df[df.index > "2009.12.31"]

# Display both shapes as the cell output.
(train.shape, test.shape)


In [None]:
# Expanding-window backtest.
# For every training end-year a bagged PCA -> linear-regression model is tuned
# on all rows up to (and including) that year and then used to predict the
# following calendar year. A mean-forecast DummyRegressor fitted on the same
# window serves as the naive benchmark.
bestparams = []  # best_params_ of each yearly grid search
predictions = []  # model forecasts, concatenated window by window
naive_predictions = []  # benchmark forecasts, same order as `predictions`
train_end_dates = []  # last index label of each training window ("%Y-%m-%d")

naive_mean = DummyRegressor(strategy="mean")

# The calendar year of each row does not change between iterations, so the
# index is parsed once here instead of three times per loop pass.
index_years = pd.to_datetime(df.index).year

for i in range(2009, 2019):
    # Training window: everything up to the end of year i.
    train_realtime = df.loc[index_years <= i]
    # Test window: exactly the calendar year that follows the training window.
    test_realtime = df.loc[index_years == i + 1]

    X_train, y_train = (
        train_realtime.drop(["EXCESS_RETURN_T+1"], axis=1),
        train_realtime["EXCESS_RETURN_T+1"],
    )
    X_test, y_test = (
        test_realtime.drop(["EXCESS_RETURN_T+1"], axis=1),
        test_realtime["EXCESS_RETURN_T+1"],
    )

    # Base learner: principal-component regression.
    est = Pipeline([("pca", PCA()), ("regressor", LinearRegression())])

    # Bagging draws random bootstrap samples; a fixed random_state makes the
    # selected hyperparameters and the forecasts reproducible across runs.
    # NOTE: `base_estimator` is the pre-1.2 scikit-learn spelling (renamed to
    # `estimator` later); it is kept to match the version this notebook targets.
    pipeline = Pipeline(
        [
            (
                "regressor",
                BaggingRegressor(base_estimator=est, n_jobs=-1, random_state=0),
            )
        ]
    )

    # Only the number of principal components is actually searched; the other
    # two entries pin a single value.
    param_grid = {
        "regressor__n_estimators": [10],
        "regressor__base_estimator__pca__n_components": [2, 4, 6],
        "regressor__base_estimator__regressor__positive": [True],
    }

    # holdout_cv is not defined in this notebook (presumably from panel_utils);
    # it supplies the validation split(s) for the search -- TODO confirm semantics.
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring="neg_mean_squared_error",
        refit="neg_mean_squared_error",
        cv=holdout_cv(X_train, n_test=4),
        n_jobs=8,
        pre_dispatch=4,
    )

    grid_search.fit(X_train, y_train)
    naive_mean.fit(X_train, y_train)

    predictions.extend(grid_search.predict(X_test).tolist())
    naive_predictions.extend(naive_mean.predict(X_test).tolist())
    bestparams.append(grid_search.best_params_)
    train_end_dates.append(
        pd.to_datetime(train_realtime.index[-1]).strftime("%Y-%m-%d")
    )

    print("train end", i)
    # best_score_ is the (negated) MSE of the winning candidate on validation.
    print("val MSE", grid_search.best_score_)


In [None]:
# Out-of-sample evaluation of the concatenated yearly forecasts against the
# realised target in `test`.
y_true = test["EXCESS_RETURN_T+1"].values
y_pred = np.array(predictions)
y_naive = np.array(naive_predictions)

# The forecasts were accumulated window by window and are matched to `test`
# purely by position. Fail loudly if the lengths disagree (e.g. `test` covers
# dates the backtest loop never predicted) instead of scoring misaligned data.
if not (len(y_true) == len(y_pred) == len(y_naive)):
    raise ValueError(
        "length mismatch between test targets and forecasts: "
        f"{len(y_true)} targets, {len(y_pred)} predictions, "
        f"{len(y_naive)} naive predictions"
    )

# modOOSR2 / OOSR2 are not defined in this notebook (presumably panel_utils).
# Judging by the labels, the first benchmarks against a zero forecast and the
# second against the naive mean forecast -- TODO confirm.
print("zeroOOSR2", modOOSR2(y_true, y_pred))
print("meanOOSR2: ", OOSR2(y_true, y_pred, y_naive))


In [None]:
# Plot the hyperparameter selected by each yearly grid search against the end
# date of the corresponding training window.
# The number of PCA components is the only grid entry with more than one
# candidate; "regressor__n_estimators" is pinned to a single value, so plotting
# it would always give a flat line.
nlist = [
    paramdict["regressor__base_estimator__pca__n_components"]
    for paramdict in bestparams
]

fig, ax = plt.subplots(figsize=(9, 3))
ax.plot(train_end_dates, nlist)
ax.set_xlabel("date")
ax.set_ylabel("param")

# Render the figure; a bare plt.plot() draws nothing and only echoes "[]".
plt.show()
