In [28]:
import pandas as pd
import numpy as np

from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomTreesEmbedding, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.model_selection import train_test_split, cross_validate, RepeatedKFold 
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline

In [29]:
# Load the California housing data: X is the feature DataFrame and y the
# target Series (as_frame=True asks for pandas objects instead of arrays).
X, y = fetch_california_housing(
    as_frame=True,
    return_X_y=True,
)

## Only Linear Regression

In [30]:
# Baseline: ordinary least squares on the raw features, scored with R^2 over
# 5-fold cross-validation repeated 10 times (50 fits in total).
scores = cross_validate(
    estimator=LinearRegression(),
    X=X,
    y=y,
    scoring=["r2"],
    # Fixed seed so the fold assignment -- and therefore the summary table --
    # is identical on every Restart & Run All.
    cv=RepeatedKFold(n_splits=5, n_repeats=10, random_state=0),
    n_jobs=-1,
    verbose=1,
    return_train_score=True)

# Summary statistics of fit/score times and train/test R^2 across all folds.
pd.DataFrame(scores).describe().round(4)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    1.0s finished


Unnamed: 0,fit_time,score_time,test_r2,train_r2
count,50.0,50.0,50.0,50.0
mean,0.0103,0.0038,0.589,0.6066
std,0.0026,0.0018,0.0983,0.0033
min,0.0081,0.0022,-0.0843,0.6007
25%,0.0087,0.0029,0.5909,0.6042
50%,0.0092,0.0031,0.6046,0.6063
75%,0.0104,0.0038,0.6131,0.6092
max,0.0194,0.0114,0.6269,0.6127


## RandomTreesEmbedding + Linear Regression

In [32]:
# Two-stage model: an unsupervised RandomTreesEmbedding turns each sample into
# a sparse one-hot encoding of the leaves it falls into, and a linear
# regression is then fitted on that encoding.
regressor = Pipeline(
    [
     # max_depth=None lets every tree grow until its stopping criteria are met;
     # random_state pins the randomly drawn splits so reruns give the same
     # embedding.
     ("embedding", RandomTreesEmbedding(max_depth=None, random_state=0)),
     ("regressor", LinearRegression())
], verbose=True)

scores = cross_validate(
    estimator=regressor,
    X=X,
    y=y,
    scoring=["r2"],
    # Seeded splitter: the same 5 folds are used on every run.
    cv=RepeatedKFold(n_splits=5, n_repeats=1, random_state=0),
    n_jobs=-1,
    verbose=1,
    return_train_score=True)

# Summary statistics of fit/score times and train/test R^2 across the folds.
pd.DataFrame(scores).describe().round(4)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   22.6s finished


Unnamed: 0,fit_time,score_time,test_r2,train_r2
count,5.0,5.0,5.0,5.0
mean,5.1605,0.4794,0.7891,1.0
std,1.7243,0.1285,0.0096,0.0
min,2.6351,0.2722,0.7784,1.0
25%,4.5769,0.4455,0.7836,1.0
50%,5.096,0.5213,0.7852,1.0
75%,6.4879,0.5646,0.7979,1.0
max,7.0068,0.5933,0.8006,1.0


## Only Random Forest

In [33]:
# Reference model: a plain random forest regressor on the raw features.
scores = cross_validate(
    # random_state fixes the bootstrap samples and feature sub-sampling so the
    # reported scores are reproducible.
    estimator=RandomForestRegressor(random_state=0),
    X=X,
    y=y,
    scoring=["r2"],
    # Seeded splitter: the same 5 folds are used on every run.
    cv=RepeatedKFold(n_splits=5, n_repeats=1, random_state=0),
    n_jobs=-1,
    verbose=1,
    return_train_score=True)

# Summary statistics of fit/score times and train/test R^2 across the folds.
pd.DataFrame(scores).describe().round(4)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   47.2s finished


Unnamed: 0,fit_time,score_time,test_r2,train_r2
count,5.0,5.0,5.0,5.0
mean,15.9327,0.1558,0.8097,0.9731
std,2.5264,0.0184,0.0114,0.0003
min,11.4181,0.1234,0.7941,0.9728
25%,16.913,0.1586,0.8047,0.9728
50%,17.0052,0.1646,0.8095,0.973
75%,17.1038,0.1658,0.816,0.9733
max,17.2233,0.1664,0.8243,0.9734


## Only GBDT

In [34]:
# Reference model: gradient-boosted decision trees with default settings.
scores = cross_validate(
    # random_state makes the boosting run deterministic across reruns.
    estimator=GradientBoostingRegressor(random_state=0),
    X=X,
    y=y,
    scoring=["r2"],
    # Seeded splitter: the same 5 folds are used on every run.
    cv=RepeatedKFold(n_splits=5, n_repeats=1, random_state=0),
    n_jobs=-1,
    verbose=1,
    return_train_score=True)

# Summary statistics of fit/score times and train/test R^2 across the folds.
pd.DataFrame(scores).describe().round(4)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   15.2s finished


Unnamed: 0,fit_time,score_time,test_r2,train_r2
count,5.0,5.0,5.0,5.0
mean,5.2792,0.0096,0.7878,0.8065
std,0.755,0.0016,0.0128,0.0044
min,3.9299,0.0069,0.7722,0.7996
25%,5.5643,0.01,0.7779,0.8062
50%,5.6097,0.0102,0.7905,0.8066
75%,5.6414,0.0104,0.7944,0.8082
max,5.6505,0.0107,0.8039,0.8117


## RandomForest.apply & Linear Regression

In [39]:
from sklearn.base import BaseEstimator, RegressorMixin

from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score

In [40]:
class RERF(BaseEstimator, RegressorMixin):
    """Linear regression fitted on one-hot encoded random-forest leaf indices.

    ``fit`` trains a ``RandomForestRegressor`` on ``(X, y)``, maps every
    training sample to the leaf it reaches in each tree (``apply``), one-hot
    encodes those leaf indices and fits a ``LinearRegression`` on the encoded
    matrix. ``predict`` pushes new samples through the same forest and encoder
    before calling the linear model.

    ``score`` (R^2) and ``get_params`` / ``set_params`` are inherited from
    ``RegressorMixin`` and ``BaseEstimator``; they are deliberately not
    overridden so that cloning and parameter search see the constructor
    arguments.

    Parameters
    ----------
    n_estimators : int, default=100
        Number of trees passed to the inner ``RandomForestRegressor``.
    random_state : int, RandomState instance or None, default=None
        Seed passed to the inner ``RandomForestRegressor``. ``None`` keeps the
        forest unseeded.

    Attributes
    ----------
    random_forest_ : RandomForestRegressor
        Forest fitted in ``fit``.
    encoder_ : OneHotEncoder
        Encoder fitted on the training leaf indices.
    linear_regressor_ : LinearRegression
        Linear model fitted on the encoded leaves.
    """

    def __init__(self, n_estimators=100, random_state=None):
        # Store constructor arguments unmodified, as BaseEstimator.get_params
        # and sklearn.base.clone read them back by attribute name.
        self.n_estimators = n_estimators
        self.random_state = random_state

    def fit(self, X, y):
        """Fit the forest, the leaf encoder and the linear model.

        Returns
        -------
        self : RERF
            The fitted estimator, so calls can be chained.
        """
        self.random_forest_ = RandomForestRegressor(
            n_estimators=self.n_estimators,
            random_state=self.random_state,
        )
        self.random_forest_.fit(X, y)

        # One column per tree, holding the index of the leaf each sample hits.
        leaves = self.random_forest_.apply(X)

        self.encoder_ = OneHotEncoder()
        encoded_leaves = self.encoder_.fit_transform(leaves)

        self.linear_regressor_ = LinearRegression()
        self.linear_regressor_.fit(encoded_leaves, y)

        return self

    def predict(self, X):
        """Predict targets for ``X`` using the fitted forest, encoder and linear model."""
        leaves = self.random_forest_.apply(X)
        encoded_leaves = self.encoder_.transform(leaves)

        return self.linear_regressor_.predict(encoded_leaves)

In [42]:
# Cross-validate the custom RERF estimator (random-forest leaf encoding
# followed by a linear regression), constructed with its defaults.
scores = cross_validate(
    estimator=RERF(),
    X=X,
    y=y,
    scoring=["r2"],
    # Seeded splitter: the same 5 folds are used on every run.
    cv=RepeatedKFold(n_splits=5, n_repeats=1, random_state=0),
    n_jobs=-1,
    verbose=1,
    return_train_score=True)

# Summary statistics of fit/score times and train/test R^2 across the folds.
pd.DataFrame(scores).describe().round(4)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.2min finished


Unnamed: 0,fit_time,score_time,test_r2,train_r2
count,5.0,5.0,5.0,5.0
mean,24.7943,0.2912,0.8077,1.0
std,4.381,0.0438,0.0105,0.0
min,17.1042,0.223,0.7911,1.0
25%,25.7875,0.2831,0.8035,1.0
50%,26.4226,0.2909,0.8124,1.0
75%,26.5697,0.327,0.8157,1.0
max,28.0876,0.3322,0.8158,1.0
