# Busca de Hiperparâmetros Promissores

## Título:

**Predição de preço de imóveis**

## Membros:

*   Adrisson Rogério Samersla
*   Nickolas Batista Mendonça Machado
*   Thayna Pires Baldão



# SETUP

In [2]:
# Importando os pacotes necessários para a análise

import time

import pandas   as pd
import numpy    as np

# Pandas display options: show every column and never wrap wide frames across
# several lines; the column-width option is reset to its default value.
pd.options.display.max_columns = None
pd.options.display.expand_frame_repr = False
pd.reset_option('max_colwidth')

# Global training settings
SEED = 21      # random_state handed to the cross-validation splitters
NUM_JOBS = 8   # n_jobs handed to the estimators and the grid searches

In [3]:
# Load the preprocessed dataset and report its dimensions

df = pd.read_csv("../dataset/preprocessed.csv")
num_examples, num_attributes = df.shape  # rows, columns

print("Formato dos dados: ", df.shape)
print("#Exemplos: {}".format(num_examples))
print("#Atributos: {}".format(num_attributes))

Formato dos dados:  (449633, 12)
#Exemplos: 449633
#Atributos: 12


In [4]:
# Quick overview of the dataset: column dtypes, null counts and the first rows

attribute_types = df.dtypes
null_counts = df.isnull().sum()  # number of missing values per column

print('Tipo dos atributos:\n')
print(attribute_types)

print('\n#Valores nulos:\n')
print(null_counts)

print('\nAlguns exemplo:\n')
# Last expression of the cell, so the notebook renders it as a table.
df.head()

Tipo dos atributos:

created_on               float64
lat                      float64
lon                      float64
surface_total_in_m2      float64
surface_covered_in_m2    float64
rooms                    float64
expenses                 float64
PH                       float64
apartment                float64
house                    float64
store                    float64
price                    float64
dtype: object

#Valores nulos:

created_on               0
lat                      0
lon                      0
surface_total_in_m2      0
surface_covered_in_m2    0
rooms                    0
expenses                 0
PH                       0
apartment                0
house                    0
store                    0
price                    0
dtype: int64

Alguns exemplo:



Unnamed: 0,created_on,lat,lon,surface_total_in_m2,surface_covered_in_m2,rooms,expenses,PH,apartment,house,store,price
0,981.0,-23.616325,-46.67662,80.0,80.0,2.0,1065.0,0.0,1.0,0.0,0.0,900000.0
1,1008.0,-30.05175,-51.182068,91.0,91.0,2.0,0.0,0.0,1.0,0.0,0.0,750000.0
2,568.0,-25.413214,-49.271294,168.0,168.0,2.0,0.0,0.0,1.0,0.0,0.0,880000.0
3,755.0,-23.56229,-46.521248,145.0,145.0,2.0,0.0,0.0,0.0,1.0,0.0,580000.0
4,764.0,-21.8096,-46.544624,138.0,260.0,3.0,0.0,0.0,0.0,1.0,0.0,420000.0


# k-Nearest Neighbors (kNN)

In [5]:
# Importando os pacotes necessários para o treinamento

from sklearn.neighbors import KNeighborsRegressor

from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import GridSearchCV, KFold, train_test_split, cross_validate
from sklearn.metrics import make_scorer, mean_squared_log_error, mean_squared_error

## Aquecimento

In [6]:
# Hold-out split: 33% of the rows are set aside for testing.
# `price` is the regression target; every other column is used as a feature.

# Keyword form: passing `axis` positionally to DataFrame.drop was removed in pandas 2.0.
X = df.drop(columns="price")
y = df["price"]

# Fixed random_state so the split, and every metric computed from it, is
# reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=SEED)

In [7]:
# Fit and evaluate a baseline model.
#   Mainly useful to gauge how long one run takes.

model = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor(n_jobs=NUM_JOBS, weights='distance')),
])

# The timed span covers both fitting on the training split and predicting
# the test split.
start = time.time()
model.fit(X_train, y_train)
y_predicted = model.predict(X_test)
end = time.time()

elapsed_seconds = end - start
print('Took {:.2f} seconds to train!'.format(elapsed_seconds))

Took 76.95 seconds to train!


In [8]:
# Resulting error on the hold-out split: RMSLE
# (square root of the mean squared logarithmic error)

holdout_msle = mean_squared_log_error(y_test, y_predicted)
np.sqrt(holdout_msle)

0.3692169419834162

## Buscando hiperparâmetros

In [9]:
# Model, hyperparameter grid, scorer and CV splitter for the grid search

model = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor(n_jobs=NUM_JOBS, weights='distance')),
])

# Keys address parameters of the pipeline's 'knn' step (`<step>__<parameter>`).
grid_params = {
    'knn__n_neighbors': [3, 7, 11, 15],
    'knn__p': [1, 2],
}

# greater_is_better=False: the scorer reports the negated MSLE, so the grid
# search can maximise it.
scorer = make_scorer(mean_squared_log_error, greater_is_better=False)
cv = KFold(n_splits=5, shuffle=True, random_state=SEED)

gs = GridSearchCV(
    estimator=model,
    param_grid=grid_params,
    scoring=scorer,
    cv=cv,
    n_jobs=NUM_JOBS,
)

In [10]:
# Run the experiment: fit the grid search on the training split and time it

start = time.time()
gs_results = gs.fit(X_train, y_train)
end = time.time()

total_seconds = end - start
print("Total Time: {:.2f} s\n".format(total_seconds))

Total Time: 1325.16 s



In [11]:
# Inspect the results: best mean cross-validated score.
# The scorer was built with greater_is_better=False, so this is the negated MSLE.

best_neg_msle = gs_results.best_score_
best_neg_msle

-0.12975486938501435

## Conjunto promissor de hiperparâmetros

In [12]:
# Inspect the best pipeline found by the grid search

best_pipeline = gs_results.best_estimator_
best_pipeline

Pipeline(steps=[('scaler', StandardScaler()),
                ('knn',
                 KNeighborsRegressor(n_jobs=8, n_neighbors=7, p=1,
                                     weights='distance'))])

In [16]:
# Refine the search: p is fixed at 1 and only n_neighbors varies,
# now evaluated with 10-fold cross-validation.

refined_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor(p=1, weights='distance', n_jobs=NUM_JOBS)),
])
refined_cv = KFold(n_splits=10, shuffle=True, random_state=SEED)

model = GridSearchCV(
    estimator=refined_pipeline,
    param_grid={
        'knn__n_neighbors': [7, 11, 15],
    },
    scoring=scorer,
    cv=refined_cv,
    n_jobs=NUM_JOBS,
)

# NOTE(review): this search is fitted on the full X, y rather than on the
# training split only, so X_test is no longer unseen data for the selected
# model. Confirm that this is intended.
start = time.time()
model.fit(X, y)
end = time.time()

refined_seconds = end - start
print("Total Time: {:.2f} s\n".format(refined_seconds))

Total Time: 2522.26 s



In [17]:
# Resulting error: RMSLE
# best_score_ comes from a scorer built with greater_is_better=False (negated
# MSLE), so the sign is flipped before taking the square root.

best_cv_msle = -model.best_score_
np.sqrt(best_cv_msle)

0.3662264222341633

In [18]:
# Inspect the best hyperparameters found by the refined search

best_hyperparameters = model.best_params_
best_hyperparameters

{'knn__n_neighbors': 11}