In [2]:
import numpy as np
import pandas as pd

In [3]:
import os

# Location of the housing CSV, relative to the notebook's working directory.
housing_path = os.path.join(
    "..",
    "datasets",
    "housing",
    "housing.csv",
)
# Read the whole file into a DataFrame; later cells add a column to it and split it.
housing_data = pd.read_csv(filepath_or_buffer=housing_path)

In [4]:
# Bucket median_income into five ordered categories (labels 1-5). The last
# bin is open-ended (upper edge is +inf) so every income above 6.0 lands in 5.
income_bin_edges = [0.0, 1.5, 3.0, 4.5, 6.0, np.inf]
income_bin_labels = [1, 2, 3, 4, 5]
housing_data["income_cat"] = pd.cut(
    housing_data["median_income"], bins=income_bin_edges, labels=income_bin_labels
)

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows. Stratifying on income_cat keeps the income
# category proportions the same in both splits; random_state pins the shuffle
# so the split is reproducible.
train_set, test_set = train_test_split(
    housing_data, test_size=0.2, random_state=42, stratify=housing_data["income_cat"]
)

# income_cat was only needed for stratification. Rebind each split to a new
# frame without it, rather than looping with a variable named `set` (which
# shadowed the builtin for the rest of the session) and dropping in place.
train_set = train_set.drop(columns="income_cat")
test_set = test_set.drop(columns="income_cat")

# Separate predictors from the target for the training split.
housing: pd.DataFrame = train_set.drop(columns="median_house_value")
housing_labels = train_set["median_house_value"].copy()

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin

# Positional column indices read by CombineAttributeAdder.transform.
# NOTE(review): these are positions in the array handed to transform, not
# column names; confirm they match the column order of the input data.
rooms_idx = 3
bedrooms_idx = 4
population_idx = 5
households_idx = 6


class CombineAttributeAdder(BaseEstimator, TransformerMixin):
    """Append ratio columns to a 2-D array.

    Two columns are always appended: rooms / households and
    population / households. When ``add_bedrooms_per_room`` is true a third
    column, bedrooms / rooms, is appended after them. The source columns are
    located through the module-level ``*_idx`` constants.
    """

    def __init__(self, add_bedrooms_per_room=True):
        # Stored under the same name as the constructor argument.
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Return the transformer itself; there is nothing to learn from X."""
        return self

    def transform(self, X):
        """Return X with the ratio columns appended on the right.

        X is indexed as ``X[:, idx]``, so it must support 2-D positional
        indexing.
        """
        households = X[:, households_idx]
        rooms = X[:, rooms_idx]

        extra_columns = [rooms / households, X[:, population_idx] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_idx] / rooms)

        # np.c_ turns each 1-D ratio into a column and stacks it next to X.
        return np.c_[(X, *extra_columns)]


# Try the transformer on its own: feed it the raw values of the training
# predictors and keep the widened array for inspection.
attr_adder = CombineAttributeAdder(add_bedrooms_per_room=True)
housing_extra_attribs = attr_adder.fit_transform(housing.to_numpy())

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Numeric preprocessing, applied in order:
#   1. fill missing values with the column mean,
#   2. append the ratio columns from CombineAttributeAdder,
#   3. standardize every column.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("attr_adder", CombineAttributeAdder(add_bedrooms_per_room=True)),
        ("std_scaler", StandardScaler()),
    ]
)

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Split the predictors by dtype: numeric columns go through num_pipeline,
# everything else is one-hot encoded.
housing_num = housing.select_dtypes(include=[np.number])
num_attr = housing_num.columns.tolist()

housing_cat = housing.select_dtypes(exclude=[np.number])
cat_attr = housing_cat.columns.tolist()

full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_attr),
        ("cat", OneHotEncoder(), cat_attr),
    ]
)
# Fit every transformer on the training predictors and transform them in one go.
housing_prepared = full_pipeline.fit_transform(housing)

In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

forest_reg = RandomForestRegressor(n_estimators=50, max_features=6, random_state=42)
forest_reg.fit(housing_prepared, housing_labels)

# RMSE on the same rows the forest was fitted on, i.e. training error.
# Compare it with the cross-validated scores computed in a later cell before
# drawing conclusions. Arguments follow the documented (y_true, y_pred) order.
forest_rmse_score = np.sqrt(
    mean_squared_error(housing_labels, forest_reg.predict(housing_prepared))
)
forest_rmse_score

18701.967618164916

In [10]:
def display_scores(scores: np.ndarray):
    """Print a set of scores followed by their mean and standard deviation.

    Args:
        scores: array of scores, e.g. one value per cross-validation fold.
            Must provide ``.mean()`` and ``.std()``.

    Returns:
        None. The three lines are written to standard output.
    """
    print("Scores", scores)
    print(f"Mean: {scores.mean()}")
    # The value printed is scores.std(), so label it as a standard deviation.
    print(f"Standard deviation: {scores.std()}")

In [11]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the forest on the prepared training data.
scores = cross_val_score(
    estimator=forest_reg,
    X=housing_prepared,
    y=housing_labels,
    scoring="neg_mean_squared_error",
    cv=5,
)

# The scorer reports negated MSE; flip the sign and take the root to show
# one RMSE per fold.
display_scores(np.sqrt(np.negative(scores)))

Scores [49265.041665   48682.56414208 48462.33179472 50042.7388339
 51092.79748033]
Mean: 49509.09478320574
Derivate: 962.4258709141989


In [None]:
features_importance = forest_reg.feature_importances_

# housing_prepared is the array returned by ColumnTransformer.fit_transform,
# so it has no `.columns`. Rebuild the column names in the order the pipeline
# emits them: numeric columns, then the ratio columns appended by
# CombineAttributeAdder, then one column per one-hot category.
fitted_num_pipeline = full_pipeline.named_transformers_["num"]
extra_attr = ["rooms_per_household", "population_per_household"]
if fitted_num_pipeline.named_steps["attr_adder"].add_bedrooms_per_room:
    extra_attr.append("bedrooms_per_room")

cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attr = [
    f"{column}_{category}"
    for column, categories in zip(cat_attr, cat_encoder.categories_)
    for category in categories
]

feature_names = num_attr + extra_attr + cat_one_hot_attr
# pd.Series raises if the number of names and importances differ, which
# guards against the name list drifting out of sync with the pipeline.
pd.Series(features_importance, index=feature_names).sort_values(ascending=False)
