In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import plotly.express as px
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression, RANSACRegressor
from sklearn.utils import all_estimators
from sklearn.ensemble import (
    RandomForestRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
    GradientBoostingRegressor,
)
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor, KNeighborsRegressor
import sklearn.datasets as ds
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error
import seaborn as sns
import plotly.figure_factory as ff


In [None]:
# Load the California housing dataset as pandas objects:
# `X` holds the features and `y` the regression target.
X, y = ds.fetch_california_housing(return_X_y=True, as_frame=True)

In [None]:
# Reserve 30% of the rows as a hold-out test set. Shuffling is disabled, so the
# split follows the dataset's original row order (first 70% train, last 30% test).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=False,
)

In [None]:
# Display the training features produced by the hold-out split.
X_train

In [None]:
# Display the training targets produced by the hold-out split.
y_train

# K-fold cross-validation (KFold)


In [None]:
from collections import defaultdict


# Every regressor class that scikit-learn exposes, as (name, class) pairs.
MODELS = all_estimators(type_filter="regressor")

# Regressors that are skipped by the benchmark below. The notebook does not
# record why; presumably they need constructor arguments or multi-output
# targets, or are too slow / fail on this dataset -- TODO confirm.
# Kept as a module-level frozenset so it is built once rather than once per
# model per fold, and so membership checks are O(1).
EXCLUDED_MODELS = frozenset(
    {
        "CCA",
        "IsotonicRegression",
        "KernelRidge",
        "LinearSVR",
        "GaussianProcessRegressor",
        "HuberRegressor",
        "MultiOutputRegressor",
        "MultiTaskElasticNet",
        "MultiTaskElasticNetCV",
        "MultiTaskLasso",
        "MultiTaskLassoCV",
        "PLSCanonical",
        "PoissonRegressor",
        "QuantileRegressor",
        "RadiusNeighborsRegressor",
        "RegressorChain",
        "StackingRegressor",
        "VotingRegressor",
    }
)

# Seed given to every estimator that exposes a `random_state` parameter, so the
# scores of stochastic models are reproducible across Restart & Run All.
RANDOM_STATE = 0

# Maps model name -> list of mean squared errors, one entry per fold.
eval_rs = defaultdict(list)

# 10 consecutive, unshuffled folds over the training portion only; the hold-out
# test set is never touched here.
kfold = KFold(n_splits=10)
for i, (train_index, test_index) in enumerate(kfold.split(X_train, y_train)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={test_index}")

    # Slice the fold once; every model in the inner loop sees the same rows.
    X_fold_train = X_train.iloc[train_index]
    y_fold_train = y_train.iloc[train_index]
    X_fold_test = X_train.iloc[test_index]
    y_fold_test = y_train.iloc[test_index]

    for key, model_cls in MODELS:
        if key in EXCLUDED_MODELS:
            continue
        print(f"Fitting {key}.", end=" ")

        # Default hyper-parameters, except for a fixed seed where one is supported.
        model = model_cls()
        if "random_state" in model.get_params(deep=False):
            model.set_params(random_state=RANDOM_STATE)

        # The scaler is fitted on the fold's training rows only and then applied
        # to the validation rows, so no validation information leaks into it.
        scaler = MinMaxScaler()
        model.fit(scaler.fit_transform(X_fold_train), y_fold_train)

        test_pred = model.predict(scaler.transform(X_fold_test))
        result = mean_squared_error(y_fold_test, test_pred)
        print(f"{result=}")
        eval_rs[key].append(result)

In [None]:
# Tabulate the cross-validation results: one column per model name in
# `eval_rs`, one row per fold, each value being that fold's MSE.
eval_df = pd.DataFrame.from_dict(eval_rs, orient="columns")
eval_df


In [None]:
# Descriptive statistics of the fold MSEs, transposed so each model is a row,
# then ordered from the lowest to the highest mean error.
fold_stats = eval_df.describe()
eval_summary = fold_stats.transpose().sort_values(by="mean")
eval_summary


In [None]:
# Show the per-model summary (sorted by mean fold MSE).
# The cell used to contain only the unfinished identifier `eval_`, which is not
# defined anywhere and raised NameError on a fresh Restart & Run All.
eval_summary

In [None]:
# Display the per-fold MSE table (one column per model).
eval_df

In [None]:
# Reshape to long form: the model name goes to "variable" and the fold MSE to
# "value", giving one row per (model, fold) pair.
eval_df_long = eval_df.melt()

# Only results with an MSE below 4 are plotted (presumably to keep extreme
# outliers from stretching the shared axis -- the notebook does not say).
below_cutoff = eval_df_long["value"] < 4

# One histogram facet per model, laid out two per row in a very tall figure.
px.histogram(
    eval_df_long.loc[below_cutoff],
    nbins=100,
    text_auto=True,
    facet_col="variable",
    facet_col_wrap=2,
    facet_row_spacing=0.002,
    height=10000,
)

# Monte Carlo cross-validation (MCCV)


In [None]:
# Result container for the repeated random-split runs; missing keys start as an
# empty list (presumably keyed by model name like `eval_rs` -- TODO confirm).
mccv_eval_rs = defaultdict(lambda: [])
for i in range(100):
    # Random 90% / 10% split of the current training data.
    # NOTE(review): the result is assigned back to X_train / y_train, so each
    # iteration splits the 90% kept by the previous one and the training data
    # shrinks on every round; X_test / y_test from the earlier hold-out split
    # are overwritten as well. No random_state is passed, so the splits differ
    # between runs. Splitting into fresh names would avoid both effects --
    # TODO confirm the intended behaviour.
    X_train, X_test, y_train, y_test = train_test_split(
        X_train, y_train, test_size=1 / 10
    )
