In [10]:
import joblib
import numpy as np
import pandas as pd
import work.prepare as wp

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

In [31]:
housing = pd.read_hdf("prepare-housing.h5", "housing")
housing_labels = pd.read_hdf("prepare-housing_labels.h5", "housing_labels")
housing_prepared = joblib.load("housing_prepared.pkl")

## Train

In [32]:
from sklearn.linear_model import LinearRegression

lin_reg = make_pipeline(preprocessing, LinearRegression())
lin_reg.fit(housing, housing_labels)

lin_rmses = -cross_val_score(lin_reg, housing, housing_labels, scoring="neg_root_mean_squared_error", cv=10)
pd.Series(lin_rmses).describe()

count       10.000000
mean     70002.480641
std       4202.377716
min      65650.934624
25%      68040.750870
50%      68804.768762
75%      70166.227430
max      81031.237747
dtype: float64

In [16]:
tree_reg = make_pipeline(preprocessing, DecisionTreeRegressor(random_state=42))
tree_reg.fit(housing, housing_labels)

tree_rmses = -cross_val_score(tree_reg, housing, housing_labels, scoring="neg_root_mean_squared_error", cv=10)
pd.Series(tree_rmses).describe()

count       10.000000
mean     65816.756399
std       2348.788056
min      62416.417515
25%      63556.252750
50%      66613.324866
75%      67431.986888
max      69389.591858
dtype: float64

In [17]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = make_pipeline(preprocessing, RandomForestRegressor(random_state=42))
forest_rmses = -cross_val_score(forest_reg, housing, housing_labels, scoring="neg_root_mean_squared_error", cv=10)
pd.Series(tree_rmses).describe()

count       10.000000
mean     65816.756399
std       2348.788056
min      62416.417515
25%      63556.252750
50%      66613.324866
75%      67431.986888
max      69389.591858
dtype: float64

## GridCV

In [19]:
from sklearn.model_selection import GridSearchCV

full_pipeline = Pipeline([
    ("preprocessing", preprocessing),
    ("random_forest", RandomForestRegressor(random_state=42)),
])

param_grid = [
    {
        'preprocessing__geo__n_clusters': [5, 8, 10],
        'random_forest__max_features': [4, 6, 8]
    },
    {
        'preprocessing__geo__n_clusters': [10, 15],
        'random_forest__max_features': [6, 8, 10]
    }
]

grid_search = GridSearchCV(full_pipeline, param_grid, cv=3, scoring="neg_root_mean_squared_error")
grid_search.fit(housing, housing_labels)

In [20]:
grid_search.best_params_

{'preprocessing__geo__n_clusters': 15, 'random_forest__max_features': 6}

In [22]:
cv_res = pd.DataFrame(grid_search.cv_results_)
cv_res.sort_values(by='mean_test_score', ascending=False, inplace=True)
cv_res.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_preprocessing__geo__n_clusters,param_random_forest__max_features,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
12,4.206895,0.031569,0.12547,0.005643,15,6,"{'preprocessing__geo__n_clusters': 15, 'random...",-43196.68698,-43820.296924,-44659.619487,-43892.201131,599.400001,1
13,5.368326,0.043259,0.12368,0.001744,15,8,"{'preprocessing__geo__n_clusters': 15, 'random...",-43667.667642,-44248.886946,-44636.447655,-44184.334081,398.128106,2
14,6.419287,0.036192,0.121929,0.001479,15,10,"{'preprocessing__geo__n_clusters': 15, 'random...",-44140.926711,-44693.596776,-45164.180511,-44666.234666,418.18943,3
7,4.143294,0.03754,0.131405,0.007055,10,6,"{'preprocessing__geo__n_clusters': 10, 'random...",-44271.859364,-44879.596282,-45663.063322,-44938.172989,569.464975,4
9,4.101072,0.025908,0.128124,0.004378,10,6,"{'preprocessing__geo__n_clusters': 10, 'random...",-44271.859364,-44879.596282,-45663.063322,-44938.172989,569.464975,4


### RandomizedSearchCV

In [24]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {
    'preprocessing__geo__n_clusters': randint(low=3, high=50),
    'random_forest__max_features': randint(low=2, high=20)
}

rnd_search = RandomizedSearchCV(
    full_pipeline, param_distributions=param_distribs, n_iter=10, cv=3, 
    scoring="neg_root_mean_squared_error", random_state=42
)

rnd_search.fit(housing, housing_labels)

In [29]:
final_model = rnd_search.best_estimator_
feature_importances = final_model["random_forest"].feature_importances_
sorted(zip(feature_importances, final_model["preprocessing"].get_feature_names_out()), reverse=True)

[(0.18265784948896358, 'log__median_income'),
 (0.07288575804294134, 'cat__ocean_proximity_INLAND'),
 (0.0702267693276188, 'bedrooms__ratio'),
 (0.054488620110953234, 'rooms_per_house__ratio'),
 (0.04822250899476408, 'people_per_house__ratio'),
 (0.03598903799761899, 'geo__Cluster 6 similarity'),
 (0.029301633458942662, 'geo__Cluster 40 similarity'),
 (0.021831822890732027, 'geo__Cluster 18 similarity'),
 (0.02176559175908292, 'geo__Cluster 41 similarity'),
 (0.02054830776591185, 'geo__Cluster 35 similarity'),
 (0.017570879813687178, 'geo__Cluster 13 similarity'),
 (0.016699872484664355, 'geo__Cluster 33 similarity'),
 (0.015219649903534792, 'geo__Cluster 37 similarity'),
 (0.015017912642662239, 'geo__Cluster 17 similarity'),
 (0.01426522670196346, 'geo__Cluster 44 similarity'),
 (0.013882344281963057, 'geo__Cluster 42 similarity'),
 (0.012865097998115924, 'geo__Cluster 29 similarity'),
 (0.01278433977528383, 'geo__Cluster 32 similarity'),
 (0.012729266983414061, 'geo__Cluster 38 simil