In [48]:
from pathlib import Path
import pandas as pd
import tarfile
import urllib.request

from sklearn.preprocessing import OneHotEncoder

def load_housing_data():
    tarball_path = Path("datasets/housing.tgz")
    if not tarball_path.is_file():
        Path("datasets").mkdir(parents=True, exist_ok=True)
        url = "https://github.com/ageron/data/raw/main/housing.tgz"
        urllib.request.urlretrieve(url, tarball_path)
    with tarfile.open(tarball_path) as housing_tarball:
            housing_tarball.extractall(path="datasets")
    return pd.read_csv(Path("datasets/housing/housing.csv"))

df = load_housing_data()
df = df.dropna(subset=["total_bedrooms"])#.drop('ocean_proximity', axis=1)
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [None]:
import numpy as np
import pandas as pd
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

subset = df[:3000]
X = subset.drop(columns=["median_house_value"])
y = subset['median_house_value']

num_cols = X.select_dtypes(include=['float64', 'int64']).columns.tolist()
default_num_pipeline = make_pipeline(SimpleImputer(strategy="median"),
                                    StandardScaler())

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), make_column_selector(dtype_include=object))],
    remainder=default_num_pipeline)
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', SVR())
])

param_grid = {
    'model__C': np.logspace(-2, 2, 5),
    'model__gamma': np.logspace(-3, 1, 5)
}

gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1
)

gs.fit(X, y)



In [50]:
print(f"Best Score (MSE): {-gs.best_score_}")
print(f"Best Params: {gs.best_params_}")
best_estimator = gs.best_estimator_
best_estimator

Best Score (MSE): 13206031497.020035
Best Params: {'model__C': np.float64(100.0), 'model__gamma': np.float64(0.1)}


In [51]:
from sklearn.model_selection import RandomizedSearchCV
rs = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1
)

rs.fit(X, y)

In [52]:
print(f"Best Score (MSE): {-rs.best_score_}")
print(f"Best Params: {rs.best_params_}")
best_estimator_rs = rs.best_estimator_
best_estimator_rs

Best Score (MSE): 13206031497.020035
Best Params: {'model__gamma': np.float64(0.1), 'model__C': np.float64(100.0)}


In [58]:
gs.best_estimator_.named_steps['model']

In [59]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor

pipe_sfm = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('selector', SelectFromModel(RandomForestRegressor(random_state=42,),
                                     threshold=0.05)),
        ('model', gs.best_estimator_.named_steps['model'])
    ]
)

pipe_sfm.fit(X, y)

In [60]:
from sklearn.model_selection import cross_val_score

rmse = -cross_val_score(pipe_sfm, X, y, scoring='neg_root_mean_squared_error')
rmse

array([105692.35262732, 117852.54560463, 109002.54486748, 117531.68882488,
       113650.49559553])

In [61]:
pd.Series(rmse).describe()

count         5.000000
mean     112745.925504
std        5328.347196
min      105692.352627
25%      109002.544867
50%      113650.495596
75%      117531.688825
max      117852.545605
dtype: float64

In [125]:
from sklearn.base import BaseEstimator, TransformerMixin, clone, MetaEstimatorMixin
from sklearn.utils.validation import check_array, check_is_fitted
from sklearn.utils.estimator_checks import check_estimator
from sklearn.neighbors import KNeighborsRegressor

class FeatureFromRegressor(MetaEstimatorMixin, TransformerMixin, BaseEstimator):
    def __init__(self, estimator):
        self.estimator = estimator
        
    def fit(self, X, y=None):
        check_array(X)
        self.estimator_ = clone(self.estimator)
        self.estimator_.fit(X, y)
        self.n_features_in_ = self.estimator_.n_features_in_
        if hasattr(self.estimator_, 'feature_names_in_'):
            self.feature_names_in_ = self.estimator_.feature_names_in_
        return self
    
    def transform(self, X):
        check_is_fitted(self)
        ypred = self.estimator_.predict(X)
        if ypred.ndim == 1:
            ypred = ypred.reshape(-1, 1)
        return ypred
    
    def get_features_names_out(self, names=None):
        check_is_fitted(self)
        n_outputs = getattr(self.estimator_, 'n_outputs_', 1)
        estimator_class_name = self.estimator_.__class__.__name__
        estiamtor_shor_name = estimator_class_name.lower().replace('_', '')
        return [f'{estiamtor_shor_name}_prediction_{i}' for i in range(n_outputs)]
    
check_estimator(FeatureFromRegressor(KNeighborsRegressor()))

[{'estimator': FeatureFromRegressor(estimator=KNeighborsRegressor()),
  'check_name': 'check_estimator_cloneable',
  'exception': None,
  'status': 'passed',
  'expected_to_fail': False,
  'expected_to_fail_reason': 'Check is not expected to fail'},
 {'estimator': FeatureFromRegressor(estimator=KNeighborsRegressor()),
  'check_name': 'check_estimator_cloneable',
  'exception': None,
  'status': 'passed',
  'expected_to_fail': False,
  'expected_to_fail_reason': 'Check is not expected to fail'},
 {'estimator': FeatureFromRegressor(estimator=KNeighborsRegressor()),
  'check_name': 'check_estimator_tags_renamed',
  'exception': None,
  'status': 'passed',
  'expected_to_fail': False,
  'expected_to_fail_reason': 'Check is not expected to fail'},
 {'estimator': FeatureFromRegressor(estimator=KNeighborsRegressor()),
  'check_name': 'check_valid_tag_types',
  'exception': None,
  'status': 'passed',
  'expected_to_fail': False,
  'expected_to_fail_reason': 'Check is not expected to fail'},
 

In [126]:
knn_reg = KNeighborsRegressor(n_neighbors=3, weights="distance")
knn_transformer = FeatureFromRegressor(knn_reg)
geo_features = df[["latitude", "longitude"]]
knn_transformer.fit_transform(geo_features, df['median_house_value'])

KeyError: "None of [Index(['latitude', 'longitude'], dtype='object')] are in the [columns]"

In [None]:
knn_transformer.get_features_names_out()

['kneighborsregressor_prediction_0']

In [127]:
from sklearn.utils.validation import validate_data
class StandardScalerClone(TransformerMixin, BaseEstimator):
    def fit(self, X, y=None):
        X = validate_data(self, X, ensure_2d=True)
        self.n_features_in_ = X.shape[1]
        self.mean_ = np.mean(X, axis=0)
        self.std_ = np.std(X, axis=0, ddof=0)
        self.std_[self.std_ == 0] = 1
        return self
        
    def transform(self, X):
        check_is_fitted(self)
        X = validate_data(self, X, ensure_2d = True, reset=False)
        return  (X - self.mean_) / self.std_
    
    def inverse_transform(self, X):
        check_is_fitted(self)
        validate_data(self, X, ensure_2d=True, reset=False)
        return X * self.std_ + self.means_
    
    def get_feature_names_out(self, input_features=None):
        if input_features is not None:
            assert len(input_features) == self.n_features_in_
            if hasattr(self, 'feature_names_in_') and not np.all(
                self.features_names_in_ == input_features
            ):
                raise ValueError('The input_names and features_names_in_ are different')
            return input_features
        return getattr(self, 'feature_names_in_',
                       [f'x{i}' for i in range(self.n_features_in_)])
check_estimator(StandardScalerClone())

[{'estimator': StandardScalerClone(),
  'check_name': 'check_estimator_cloneable',
  'exception': None,
  'status': 'passed',
  'expected_to_fail': False,
  'expected_to_fail_reason': 'Check is not expected to fail'},
 {'estimator': StandardScalerClone(),
  'check_name': 'check_estimator_cloneable',
  'exception': None,
  'status': 'passed',
  'expected_to_fail': False,
  'expected_to_fail_reason': 'Check is not expected to fail'},
 {'estimator': StandardScalerClone(),
  'check_name': 'check_estimator_tags_renamed',
  'exception': None,
  'status': 'passed',
  'expected_to_fail': False,
  'expected_to_fail_reason': 'Check is not expected to fail'},
 {'estimator': StandardScalerClone(),
  'check_name': 'check_valid_tag_types',
  'exception': None,
  'status': 'passed',
  'expected_to_fail': False,
  'expected_to_fail_reason': 'Check is not expected to fail'},
 {'estimator': StandardScalerClone(),
  'check_name': 'check_estimator_repr',
  'exception': None,
  'status': 'passed',
  'expec

In [128]:
np.random.seed(42)
X = np.random.rand(1000, 3)

scaler = StandardScalerClone()
X_scaled = scaler.fit_transform(X)

assert np.allclose(X_scaled, (X - X.mean(axis=0)) / X.std(axis=0))

In [129]:
assert np.all(scaler.get_feature_names_out() == ["x0", "x1", "x2"])
assert np.all(scaler.get_feature_names_out(["a", "b", "c"]) == ["a", "b", "c"])

In [130]:
df = pd.DataFrame({"a": np.random.rand(100), "b": np.random.rand(100)})
scaler = StandardScalerClone()
X_scaled = scaler.fit_transform(df)

assert np.all(scaler.feature_names_in_ == ["a", "b"])
assert np.all(scaler.get_feature_names_out() == ["a", "b"])