In [None]:
#Necessário executar no Google Colab ou caso não tenha a biblioteca instalada localmente
!pip install scikit-optimize

In [None]:
import time

import numpy as np
import pandas as pd
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from skopt import gp_minimize
from skopt.space import Categorical, Real, Integer
from skopt.utils import use_named_args

In [None]:
# Read the pre-split train and test partitions from CSV files that are
# expected to sit in the current working directory.
train, test = (
    pd.read_csv("california_housing_train.csv"),
    pd.read_csv("california_housing_test.csv"),
)

In [None]:
# Sanity check: (rows, columns) of each partition.
train.shape, test.shape

((17000, 9), (3000, 9))

In [None]:
# Preview the first rows of the training partition.
train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


In [None]:
# Shared 10-fold splitter (shuffled, fixed seed) used by the cross-validation cells.
cv = KFold(n_splits=10, shuffle=True, random_state=42)

# Separate the regression target from the feature columns of each partition.
X_train, y_train = train.drop(columns="median_house_value"), train["median_house_value"]
X_test, y_test = test.drop(columns="median_house_value"), test["median_house_value"]

# Dummy

In [None]:
# Naive reference model: always predicts the mean of the training target.
# DummyRegressor is provided by sklearn.dummy (see the imports cell).
dummy_regressor = DummyRegressor(strategy="mean").fit(X_train, y_train)

yhat = dummy_regressor.predict(X_test)

# RMSE on the held-out test set; squaring penalizes larger errors more heavily.
dummy_score = np.sqrt(mean_squared_error(y_test, yhat))

print(f"Dummy: {round(dummy_score, 2)}")

Dummy: 113110.19


# Baseline

In [None]:
# Validation: cross-validated score of an untuned random forest on the shared folds.
# The scorer returns negated RMSE values, one per fold.
baseline = RandomForestRegressor(n_jobs=-1, random_state=42)

start = time.time()
validate_score = cross_val_score(
    estimator=baseline,
    X=X_train,
    y=y_train,
    scoring="neg_root_mean_squared_error",
    cv=cv,
    n_jobs=-1,
)
end = time.time()

print(f"Tempo de execução: {end - start}")

Tempo de execução: 107.40469312667847


In [None]:
# Train and test: fit the baseline on the full training set, then compute
# its RMSE on the training data and on the held-out test data.
start = time.time()

baseline.fit(X_train, y_train)
yhat_train, yhat_test = baseline.predict(X_train), baseline.predict(X_test)

train_score, test_score = (
    np.sqrt(mean_squared_error(y_true, y_pred))
    for y_true, y_pred in ((y_train, yhat_train), (y_test, yhat_test))
)

end = time.time()

print(f"Tempo de execução: {end - start}")

Tempo de execução: 13.272844314575195


In [None]:
# Baseline RMSE on train, cross-validation (mean ± std over folds) and test.
# The fold scores are negated RMSE values, hence the abs() around their mean.
print(
    f"Train: {round(train_score, 2)}",
    f"Validation: {round(abs(np.mean(validate_score)), 2)} ± {round(np.std(validate_score), 2)}",
    f"Test: {round(test_score, 2)}",
    sep="\n",
)

Train: 18201.21
Validation: 48654.7 ± 1568.86
Test: 49457.73


# Grid Search

In [None]:
# Exhaustive grid search over random-forest hyperparameters, scored by
# cross-validated negated RMSE (higher is better) on the shared `cv` folds.
regressor_grid = RandomForestRegressor(n_jobs=-1, random_state=42)

# 3 * 4 * 2 * 4 * 2 = 192 candidate configurations, each fitted once per fold.
grid = {
    "n_estimators": [100, 200, 300],
    "min_samples_leaf": [2, 5, 20, 50],
    # None = consider all features, which is what "auto" used to mean for
    # RandomForestRegressor; recent scikit-learn releases reject "auto".
    "max_features": ["sqrt", None],
    "max_depth": [2, 5, 20, 50],
    "bootstrap": [True, False],
}

model_grid = GridSearchCV(
    regressor_grid,
    param_grid=grid,
    scoring="neg_root_mean_squared_error",
    cv=cv,
    n_jobs=-1,
)

start = time.time()
model_grid.fit(X_train, y_train)
end = time.time()

# Round the elapsed time itself: rounding each timestamp before subtracting
# leaves floating-point noise in the difference.
print(f"Tempo de execução: {round(end - start, 2)}")



Tempo de execução: 11537.870000123978


In [None]:
# Re-display the elapsed time from whatever `start` and `end` currently hold.
print(f"Tempo de execução: {end - start}")

In [None]:
# Best mean cross-validated score found by the grid search. The scorer is a
# negated RMSE, so abs() turns it back into a positive RMSE.
validate_score_grid = abs(model_grid.best_score_)
validate_score_grid

48375.58613163082

In [None]:
# Hyperparameter combination with the best cross-validated score.
model_grid.best_params_

{'bootstrap': True,
 'max_depth': 50,
 'max_features': 'auto',
 'min_samples_leaf': 2,
 'n_estimators': 300}

In [None]:
# Refit a forest with the winning grid-search configuration and compute its
# RMSE on the training data and on the held-out test data.
# The hyperparameters are taken directly from the search result instead of
# being retyped by hand, and the seed is fixed so the scores are reproducible.
regressor_grid_tunned = RandomForestRegressor(
    **model_grid.best_params_, n_jobs=-1, random_state=42
)

start = time.time()
regressor_grid_tunned.fit(X_train, y_train)

yhat_train = regressor_grid_tunned.predict(X_train)
yhat_test = regressor_grid_tunned.predict(X_test)

train_score = np.sqrt(mean_squared_error(y_train, yhat_train))
test_score = np.sqrt(mean_squared_error(y_test, yhat_test))

end = time.time()

print(f"Tempo de execução: {end - start}")



Tempo de execução: 35.77161931991577


In [None]:
# Tuned-model RMSE on train, cross-validation (mean ± std over folds) and test.
# The validation figures must come from the grid search, not from the baseline's
# `validate_score`; the std is the fold-to-fold spread of the best candidate.
best_cv_std = model_grid.cv_results_["std_test_score"][model_grid.best_index_]

print(f"Train: {round(train_score, 2)}")
print(f"Validation: {round(validate_score_grid, 2)} ± {round(best_cv_std, 2)}")
print(f"Test: {round(test_score, 2)}")

Train: 23091.78
Validation: 48375.59 ± 0.0
Test: 49270.13


# Scikit-Optimize

In [None]:
# Estimator whose hyperparameters are overwritten by `objective` on each evaluation.
skopt_regressor = RandomForestRegressor(n_jobs=-1, random_state=42)

In [None]:
# Search space for the optimizer; each dimension is named after the
# RandomForestRegressor parameter it controls.
space = [
    Integer(100, 300, name="n_estimators"),
    # None = consider all features, which is what "auto" used to mean for
    # RandomForestRegressor; recent scikit-learn releases reject "auto".
    Categorical(["sqrt", None], name="max_features"),
    Integer(2, 50, name="min_samples_leaf"),
    Integer(2, 50, name="max_depth"),
    Categorical([True, False], name="bootstrap"),
]

In [None]:
@use_named_args(space)
def objective(**params):
    """Return the mean cross-validated RMSE of the forest for one hyperparameter point.

    The keyword arguments are applied to the shared `skopt_regressor`, which is
    then cross-validated on the training data with the shared `cv` splitter.
    The scorer yields negated RMSE values, so the mean is negated to obtain a
    quantity to minimize.
    """
    skopt_regressor.set_params(**params)

    fold_scores = cross_val_score(
        skopt_regressor,
        X_train,
        y_train,
        scoring="neg_root_mean_squared_error",
        cv=cv,
        n_jobs=-1,
    )
    return -np.mean(fold_scores)

In [None]:
# Minimize `objective` over `space` with gp_minimize (50 evaluations, fixed seed)
# and report the wall-clock time of the whole search.
st = time.time()
res_gp = gp_minimize(
    func=objective,
    dimensions=space,
    n_calls=50,
    random_state=0,
    n_jobs=-1,
)
end = time.time()
print(f"Tempo de execução: {end - st}")



Tempo de execução: 8507.327837705612


In [None]:
# Re-display the elapsed time from whatever `st` and `end` currently hold.
print(f"Tempo de execução: {end - st}")

Tempo de execução: 8507.327837705612


In [None]:
# Objective value at the best point found, i.e. mean cross-validated RMSE.
np.abs(res_gp.fun)

48375.58613163082

In [None]:
# Best hyperparameter values found, listed in the same order as `space`.
res_gp.x

[300, 'auto', 2, 50, True]