In [None]:
# Importação das bibliotecas necessárias
import sklearn
from sklearn.datasets import fetch_california_housing  # Para carregar o conjunto de dados
from sklearn.model_selection import train_test_split   # Para dividir os dados em treino, validação e teste
from sklearn.neural_network import MLPRegressor        # Modelo de regressão com MLP
from sklearn.tree import DecisionTreeRegressor         # Modelo de regressão com Árvore de decisão
from sklearn.neighbors import KNeighborsRegressor         # Modelo de Regressão com Vizinhos mais próximos
from sklearn.pipeline import make_pipeline             # Para encadear transformações e modelo
from sklearn.preprocessing import StandardScaler       # Para normalização z-score dos dados
from sklearn.metrics import root_mean_squared_error    # Para avaliação da performance (métrica RMSE)
from sklearn.model_selection import GridSearchCV       # Para busca exaustiva de hiperparâmetros
import pandas as pd                                    # Para manipulação e visualização de dados

# Enable the graphical (HTML diagram) representation of pipelines.
# `display` must be passed by keyword: the first positional parameter of
# `sklearn.set_config` is `assume_finite`, so `set_config('diagram')` would
# silently turn off finiteness validation instead of selecting the display mode.
sklearn.set_config(display='diagram')

# Importação dos dados

In [None]:
# Load the California Housing dataset; as_frame=True asks for pandas objects
# (the cells below read its 'data' and 'target' entries).
housing = fetch_california_housing(as_frame=True)

In [None]:
# Input features (X) and regression target (y)
X, y = housing['data'], housing['target']

# First split: (train + validation) portion vs. held-out test set
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, random_state=42
)

# Second split: carve a validation subset out of the training portion.
# NOTE(review): X_valid / y_valid are not used by the cells visible here,
# since GridSearchCV does its own cross-validation on X_train — confirm
# whether this extra hold-out is still needed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full, random_state=42
)

# Rede Neural Artificial

In [None]:
# Standardize the inputs, then fit a multi-layer perceptron regressor.
# A fixed random_state makes weight initialization and batch shuffling
# repeatable, so the candidate ranking (whose top entries are separated by
# only a few thousandths of RMSE) is the same on every Restart & Run All.
mlpregressor = MLPRegressor(random_state=42)
pipeline = make_pipeline(StandardScaler(), mlpregressor)

# Hyperparameter grid. Keys are "<step name>__<parameter>", where the step
# name is the one make_pipeline generates (lower-cased class name).
param_grid = {
    'mlpregressor__activation': ['relu', 'tanh', 'logistic'],  # activation function
    'mlpregressor__solver':     ['sgd', 'adam'],               # optimization algorithm
}

# Exhaustive search with 5-fold cross-validation, scored by negated RMSE
# (higher is better); refit=True retrains the best candidate on all of X_train.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='neg_root_mean_squared_error',
    refit=True,
    cv=5,
    verbose=3,
)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits




[CV 1/5] END mlpregressor__activation=relu, mlpregressor__solver=sgd;, score=-0.576 total time=   4.0s




[CV 2/5] END mlpregressor__activation=relu, mlpregressor__solver=sgd;, score=-0.619 total time=   3.5s




[CV 3/5] END mlpregressor__activation=relu, mlpregressor__solver=sgd;, score=-0.602 total time=   3.5s




[CV 4/5] END mlpregressor__activation=relu, mlpregressor__solver=sgd;, score=-0.549 total time=   3.9s




[CV 5/5] END mlpregressor__activation=relu, mlpregressor__solver=sgd;, score=-0.611 total time=   3.6s
[CV 1/5] END mlpregressor__activation=relu, mlpregressor__solver=adam;, score=-0.549 total time=   1.7s
[CV 2/5] END mlpregressor__activation=relu, mlpregressor__solver=adam;, score=-0.583 total time=   2.3s




[CV 3/5] END mlpregressor__activation=relu, mlpregressor__solver=adam;, score=-0.598 total time=   4.5s
[CV 4/5] END mlpregressor__activation=relu, mlpregressor__solver=adam;, score=-0.522 total time=   2.1s
[CV 5/5] END mlpregressor__activation=relu, mlpregressor__solver=adam;, score=-0.563 total time=   2.8s




[CV 1/5] END mlpregressor__activation=tanh, mlpregressor__solver=sgd;, score=-0.626 total time=   5.8s




[CV 2/5] END mlpregressor__activation=tanh, mlpregressor__solver=sgd;, score=-0.670 total time=   5.3s




[CV 3/5] END mlpregressor__activation=tanh, mlpregressor__solver=sgd;, score=-0.640 total time=   5.6s




[CV 4/5] END mlpregressor__activation=tanh, mlpregressor__solver=sgd;, score=-0.607 total time=   5.6s




[CV 5/5] END mlpregressor__activation=tanh, mlpregressor__solver=sgd;, score=-0.667 total time=   5.4s




[CV 1/5] END mlpregressor__activation=tanh, mlpregressor__solver=adam;, score=-0.547 total time=   6.0s




[CV 2/5] END mlpregressor__activation=tanh, mlpregressor__solver=adam;, score=-0.588 total time=   5.7s




[CV 3/5] END mlpregressor__activation=tanh, mlpregressor__solver=adam;, score=-0.567 total time=   5.8s




[CV 4/5] END mlpregressor__activation=tanh, mlpregressor__solver=adam;, score=-0.529 total time=   5.6s




[CV 5/5] END mlpregressor__activation=tanh, mlpregressor__solver=adam;, score=-0.572 total time=   5.7s
[CV 1/5] END mlpregressor__activation=logistic, mlpregressor__solver=sgd;, score=-0.670 total time=   3.8s




[CV 2/5] END mlpregressor__activation=logistic, mlpregressor__solver=sgd;, score=-0.734 total time=   4.6s




[CV 3/5] END mlpregressor__activation=logistic, mlpregressor__solver=sgd;, score=-0.689 total time=   4.0s




[CV 4/5] END mlpregressor__activation=logistic, mlpregressor__solver=sgd;, score=-0.646 total time=   4.1s




[CV 5/5] END mlpregressor__activation=logistic, mlpregressor__solver=sgd;, score=-0.703 total time=   4.6s




[CV 1/5] END mlpregressor__activation=logistic, mlpregressor__solver=adam;, score=-0.574 total time=   4.3s




[CV 2/5] END mlpregressor__activation=logistic, mlpregressor__solver=adam;, score=-0.619 total time=   4.9s




[CV 3/5] END mlpregressor__activation=logistic, mlpregressor__solver=adam;, score=-0.603 total time=   4.3s




[CV 4/5] END mlpregressor__activation=logistic, mlpregressor__solver=adam;, score=-0.557 total time=   4.3s




[CV 5/5] END mlpregressor__activation=logistic, mlpregressor__solver=adam;, score=-0.605 total time=   4.8s




In [None]:
# Report the hyperparameter combination selected by the grid search
print("Melhores parâmetros encontrados:", grid_search.best_params_, sep="\n")

Melhores parâmetros encontrados:
{'mlpregressor__activation': 'tanh', 'mlpregressor__solver': 'adam'}


In [None]:
# Show the pipeline refitted with the best hyperparameters
print("\nMelhor estimador:", grid_search.best_estimator_, sep="\n")


Melhor estimador:
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('mlpregressor', MLPRegressor(activation='tanh'))])


In [None]:
# Tabulate the cross-validation results, best-ranked candidate first
cv_res = pd.DataFrame(grid_search.cv_results_).sort_values(by="rank_test_score")
display(cv_res)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_mlpregressor__activation,param_mlpregressor__solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
3,5.745708,0.155005,0.010868,0.005718,tanh,adam,"{'mlpregressor__activation': 'tanh', 'mlpregre...",-0.547461,-0.588258,-0.567339,-0.529217,-0.571502,-0.560755,0.020429,1
1,2.693582,0.963764,0.003221,0.000247,relu,adam,"{'mlpregressor__activation': 'relu', 'mlpregre...",-0.548647,-0.58257,-0.597755,-0.521575,-0.563235,-0.562757,0.026506,2
0,3.695599,0.206024,0.003896,0.000689,relu,sgd,"{'mlpregressor__activation': 'relu', 'mlpregre...",-0.576461,-0.618579,-0.601764,-0.548913,-0.611373,-0.591418,0.025587,3
5,4.527017,0.264723,0.006271,0.000425,logistic,adam,"{'mlpregressor__activation': 'logistic', 'mlpr...",-0.573567,-0.618993,-0.60288,-0.556527,-0.605212,-0.591436,0.022882,4
2,5.509211,0.17178,0.011493,0.005728,tanh,sgd,"{'mlpregressor__activation': 'tanh', 'mlpregre...",-0.626311,-0.669648,-0.640253,-0.607389,-0.666836,-0.642088,0.023784,5
4,4.177188,0.330956,0.006207,0.000269,logistic,sgd,"{'mlpregressor__activation': 'logistic', 'mlpr...",-0.670178,-0.734422,-0.688948,-0.645767,-0.703483,-0.68856,0.029984,6


In [None]:
# Evaluate the refitted best pipeline on the held-out test set. The search was
# configured with scoring='neg_root_mean_squared_error', so the value shown is
# the negated RMSE (multiply by -1 to read it as an RMSE).
grid_search.score(X_test, y_test)

-0.5535768979746848

# Árvore de Decisão

In [None]:
# Standardize the inputs, then fit a decision-tree regressor.
# A fixed random_state is required for repeatable results: the grid below
# includes splitter="random", which draws split thresholds at random.
decisiontreeregressor = DecisionTreeRegressor(random_state=42)
pipeline = make_pipeline(StandardScaler(), decisiontreeregressor)

# Hyperparameter grid. Keys are "<step name>__<parameter>", where the step
# name is the one make_pipeline generates (lower-cased class name).
param_grid = {
    "decisiontreeregressor__max_depth": [2, 5, 7],           # maximum tree depth
    "decisiontreeregressor__splitter":  ["best", "random"],  # split selection strategy
}

# Exhaustive search with 5-fold cross-validation, scored by negated RMSE
# (higher is better); refit=True retrains the best candidate on all of X_train.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='neg_root_mean_squared_error',
    refit=True,
    cv=5,
    verbose=3,
)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END decisiontreeregressor__max_depth=2, decisiontreeregressor__splitter=best;, score=-0.842 total time=   0.0s
[CV 2/5] END decisiontreeregressor__max_depth=2, decisiontreeregressor__splitter=best;, score=-0.883 total time=   0.0s
[CV 3/5] END decisiontreeregressor__max_depth=2, decisiontreeregressor__splitter=best;, score=-0.867 total time=   0.0s
[CV 4/5] END decisiontreeregressor__max_depth=2, decisiontreeregressor__splitter=best;, score=-0.831 total time=   0.0s
[CV 5/5] END decisiontreeregressor__max_depth=2, decisiontreeregressor__splitter=best;, score=-0.889 total time=   0.0s
[CV 1/5] END decisiontreeregressor__max_depth=2, decisiontreeregressor__splitter=random;, score=-0.992 total time=   0.0s
[CV 2/5] END decisiontreeregressor__max_depth=2, decisiontreeregressor__splitter=random;, score=-1.109 total time=   0.0s
[CV 3/5] END decisiontreeregressor__max_depth=2, decisiontreeregressor__splitter=random;, score=

In [None]:
# Report the hyperparameter combination selected by the grid search
print("Melhores parâmetros encontrados:", grid_search.best_params_, sep="\n")

Melhores parâmetros encontrados:
{'decisiontreeregressor__max_depth': 7, 'decisiontreeregressor__splitter': 'best'}


In [None]:
# Show the pipeline refitted with the best hyperparameters
print("\nMelhor estimador:", grid_search.best_estimator_, sep="\n")



Melhor estimador:
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('decisiontreeregressor', DecisionTreeRegressor(max_depth=7))])


In [None]:
# Tabulate the cross-validation results, best-ranked candidate first
cv_res = pd.DataFrame(grid_search.cv_results_).sort_values(by="rank_test_score")
display(cv_res)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_decisiontreeregressor__max_depth,param_decisiontreeregressor__splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
4,0.069691,0.00224,0.002898,0.000316,7,best,"{'decisiontreeregressor__max_depth': 7, 'decis...",-0.645218,-0.705812,-0.698084,-0.669651,-0.693806,-0.682514,0.022222,1
2,0.051183,0.00744,0.003575,0.00052,5,best,"{'decisiontreeregressor__max_depth': 5, 'decis...",-0.708207,-0.746762,-0.737847,-0.690318,-0.742723,-0.725171,0.022079,2
5,0.007519,0.000224,0.002128,8.1e-05,7,random,"{'decisiontreeregressor__max_depth': 7, 'decis...",-0.80441,-0.80885,-0.807364,-0.801756,-0.752864,-0.795049,0.021234,3
3,0.00714,0.000522,0.002548,0.000312,5,random,"{'decisiontreeregressor__max_depth': 5, 'decis...",-0.825899,-0.859146,-0.915721,-0.810319,-0.890709,-0.860359,0.039189,4
0,0.023322,0.003115,0.002873,0.000304,2,best,"{'decisiontreeregressor__max_depth': 2, 'decis...",-0.84188,-0.882809,-0.866874,-0.830608,-0.889308,-0.862296,0.02277,5
1,0.004212,0.000387,0.001753,0.000212,2,random,"{'decisiontreeregressor__max_depth': 2, 'decis...",-0.992484,-1.109164,-0.962677,-0.883064,-1.079591,-1.005396,0.081522,6


In [None]:
# Evaluate the refitted best pipeline on the held-out test set. The search was
# configured with scoring='neg_root_mean_squared_error', so the value shown is
# the negated RMSE (multiply by -1 to read it as an RMSE).
grid_search.score(X_test, y_test)

-0.6793915429083779

# K-NN

In [None]:
# Standardize the inputs, then fit a k-nearest-neighbours regressor
kneighborsregressor = KNeighborsRegressor()
pipeline = make_pipeline(StandardScaler(), kneighborsregressor)

# Hyperparameter grid; each keyword is "<step name>__<parameter>"
param_grid = dict(
    kneighborsregressor__n_neighbors=[4, 5, 6],
    kneighborsregressor__metric=["cityblock", "minkowski", "cosine"],
)

# Exhaustive search with 5-fold cross-validation, scored by negated RMSE;
# the best candidate is refitted on all of X_train afterwards.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    refit=True,
    verbose=3,
)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END kneighborsregressor__metric=cityblock, kneighborsregressor__n_neighbors=4;, score=-0.595 total time=   0.3s
[CV 2/5] END kneighborsregressor__metric=cityblock, kneighborsregressor__n_neighbors=4;, score=-0.640 total time=   0.3s
[CV 3/5] END kneighborsregressor__metric=cityblock, kneighborsregressor__n_neighbors=4;, score=-0.612 total time=   0.1s
[CV 4/5] END kneighborsregressor__metric=cityblock, kneighborsregressor__n_neighbors=4;, score=-0.575 total time=   0.2s
[CV 5/5] END kneighborsregressor__metric=cityblock, kneighborsregressor__n_neighbors=4;, score=-0.599 total time=   0.1s
[CV 1/5] END kneighborsregressor__metric=cityblock, kneighborsregressor__n_neighbors=5;, score=-0.586 total time=   0.2s
[CV 2/5] END kneighborsregressor__metric=cityblock, kneighborsregressor__n_neighbors=5;, score=-0.634 total time=   0.2s
[CV 3/5] END kneighborsregressor__metric=cityblock, kneighborsregressor__n_neighbors=5;, scor

In [None]:
# Report the hyperparameter combination selected by the grid search
print("Melhores parâmetros encontrados:", grid_search.best_params_, sep="\n")

Melhores parâmetros encontrados:
{'kneighborsregressor__metric': 'cityblock', 'kneighborsregressor__n_neighbors': 6}


In [None]:
# Show the pipeline refitted with the best hyperparameters
print("\nMelhor estimador:", grid_search.best_estimator_, sep="\n")


Melhor estimador:
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('kneighborsregressor',
                 KNeighborsRegressor(metric='cityblock', n_neighbors=6))])


In [None]:
# Tabulate the cross-validation results, best-ranked candidate first
cv_res = pd.DataFrame(grid_search.cv_results_).sort_values(by="rank_test_score")
display(cv_res)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kneighborsregressor__metric,param_kneighborsregressor__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
2,0.012033,0.001881,0.160968,0.01457,cityblock,6,"{'kneighborsregressor__metric': 'cityblock', '...",-0.579255,-0.635571,-0.597233,-0.561739,-0.594341,-0.593628,0.024479,1
1,0.010343,0.000283,0.149137,0.006867,cityblock,5,"{'kneighborsregressor__metric': 'cityblock', '...",-0.585664,-0.634082,-0.600394,-0.566698,-0.590635,-0.595494,0.022192,2
0,0.013328,0.003182,0.176624,0.053174,cityblock,4,"{'kneighborsregressor__metric': 'cityblock', '...",-0.594678,-0.640372,-0.611721,-0.575473,-0.59867,-0.604183,0.021501,3
5,0.010623,0.00038,0.080837,0.006573,minkowski,6,"{'kneighborsregressor__metric': 'minkowski', '...",-0.602145,-0.656313,-0.619196,-0.586919,-0.6183,-0.616574,0.023137,4
4,0.010898,0.000296,0.084684,0.013373,minkowski,5,"{'kneighborsregressor__metric': 'minkowski', '...",-0.605252,-0.6563,-0.622527,-0.588459,-0.620619,-0.618631,0.022483,5
3,0.011291,0.000317,0.074749,0.007423,minkowski,4,"{'kneighborsregressor__metric': 'minkowski', '...",-0.60924,-0.667891,-0.62718,-0.593918,-0.629471,-0.62554,0.024809,6
8,0.003782,0.000527,0.221854,0.002983,cosine,6,"{'kneighborsregressor__metric': 'cosine', 'kne...",-0.603173,-0.666609,-0.630017,-0.602372,-0.629888,-0.626412,0.02349,7
7,0.003349,4.5e-05,0.219899,0.00401,cosine,5,"{'kneighborsregressor__metric': 'cosine', 'kne...",-0.606991,-0.676452,-0.633641,-0.602972,-0.633996,-0.63081,0.026244,8
6,0.003686,0.000643,0.218339,0.001773,cosine,4,"{'kneighborsregressor__metric': 'cosine', 'kne...",-0.61573,-0.686337,-0.634729,-0.609474,-0.639777,-0.637209,0.027044,9


In [None]:
# Evaluate the refitted best pipeline on the held-out test set. The search was
# configured with scoring='neg_root_mean_squared_error', so the value shown is
# the negated RMSE (multiply by -1 to read it as an RMSE).
grid_search.score(X_test, y_test)

-0.5720925072557249

# Tarefa

1. Utilize o [GridSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#) para comparar a performance de 3 modelos de regressão no conjunto de dados **California Housing**:

   - **Rede Neural Artificial (RNA)** – [`MLPRegressor`](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html)
   - **Árvore de Decisão (AD)** – [`DecisionTreeRegressor`](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html)
   - **k-Vizinhos Mais Próximos (k-NN)** – [`KNeighborsRegressor`](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html)

2. Para cada modelo, **defina uma grade (`param_grid`) com pelo menos 2 hiperparâmetros distintos** a serem testados via busca em grade.

3. Utilize o **`GridSearchCV`** com:
   - Validação cruzada (`cv=5`);
   - Métrica de avaliação baseada na **raiz do erro quadrático médio (RMSE)**;
   - Reajuste (`refit=True`) com os melhores hiperparâmetros.

4. **Avalie o desempenho final** dos três modelos:
   - Registre o melhor conjunto de hiperparâmetros obtido;
   - Reporte o **RMSE médio na validação cruzada** e o **RMSE no conjunto de teste**;
   - Compare os resultados entre os modelos.


---

Dica

Para acessar os nomes automáticos das etapas dentro de um `Pipeline`, utilize:

```python
pipeline.named_steps
```

ou nomeie manualmente as etapas para facilitar o uso no `param_grid`, por exemplo:

```python
from sklearn.pipeline import Pipeline

pipeline = Pipeline(steps=[
    ('escala', StandardScaler()),
    ('modelo', MLPRegressor(hidden_layer_sizes=[40, 30],
                            activation='relu',
                            max_iter=300,
                            solver='adam',
                            learning_rate_init=0.01,
                            batch_size=256,
                            random_state=42))
])
```

Nesse caso, as chaves do `param_grid` usam o nome dado à etapa, por exemplo `modelo__activation`.