# Geração dos Modelos de AM

## Título:

**Predição de preço de imóveis**

## Membros:

*   Adrisson Rogério Samersla
*   Nickolas Batista Mendonça Machado
*   Thayna Pires Baldão

# SETUP

In [1]:
# Importando os pacotes necessários para a análise

import time

import pandas   as pd
import numpy    as np

# Pandas display options: do not wrap wide frames over several lines,
# show every column, and put the column width back to its default.
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_columns', None)
pd.reset_option('max_colwidth')

# Global training settings.
# SEED is passed as `random_state` and NUM_JOBS as `n_jobs` in the cells below.
SEED = 21
NUM_JOBS = 8

In [2]:
# Load the preprocessed dataset and report its dimensions

df = pd.read_csv('./dataset/preprocessed.csv')

# df.shape is (rows, columns): rows are examples, columns are attributes.
n_examples, n_attributes = df.shape
print("Formato dos dados: ", df.shape)
print("#Exemplos: {}".format(n_examples))
print("#Atributos: {}".format(n_attributes))

Formato dos dados:  (449633, 12)
#Exemplos: 449633
#Atributos: 12


In [3]:
# Quick overview of the dataset: column dtypes, null counts and first rows

dtype_by_column = df.dtypes
nulls_by_column = df.isnull().sum()

print('Tipo dos atributos:\n')
print(dtype_by_column)

print('\n#Valores nulos:\n')
print(nulls_by_column)

print('\nAlguns exemplo:\n')
# Last expression of the cell, so the frame is rendered with the rich display.
df.head()

created_on               float64
lat                      float64
lon                      float64
surface_total_in_m2      float64
surface_covered_in_m2    float64
rooms                    float64
expenses                 float64
PH                       float64
apartment                float64
house                    float64
store                    float64
price                    float64
dtype: object
created_on               0
lat                      0
lon                      0
surface_total_in_m2      0
surface_covered_in_m2    0
rooms                    0
expenses                 0
PH                       0
apartment                0
house                    0
store                    0
price                    0
dtype: int64


Unnamed: 0,created_on,lat,lon,surface_total_in_m2,surface_covered_in_m2,rooms,expenses,PH,apartment,house,store,price
0,981.0,-23.616325,-46.67662,80.0,80.0,2.0,1065.0,0.0,1.0,0.0,0.0,900000.0
1,1008.0,-30.05175,-51.182068,91.0,91.0,2.0,0.0,0.0,1.0,0.0,0.0,750000.0
2,568.0,-25.413214,-49.271294,168.0,168.0,2.0,0.0,0.0,1.0,0.0,0.0,880000.0
3,755.0,-23.56229,-46.521248,145.0,145.0,2.0,0.0,0.0,0.0,1.0,0.0,580000.0
4,764.0,-21.8096,-46.544624,138.0,260.0,3.0,0.0,0.0,0.0,1.0,0.0,420000.0


# Geração dos Modelos de AM

In [4]:
# Importando os pacotes necessários para o treinamento

from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import GridSearchCV, KFold, cross_validate
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import make_scorer

In [5]:
# Split the dataset into the feature matrix (X) and the regression target (y).
#
# The target column is dropped by keyword: passing the axis positionally
# (`df.drop("price", 1)`) was deprecated in pandas 1.3 and raises a
# TypeError from pandas 2.0 onwards.

X = df.drop(columns="price")
y = df["price"]

In [6]:
# Definition of the models and of the hyperparameter grids to evaluate

# Mean squared log error wrapped as a scorer. `greater_is_better=False`
# makes the scorer return the negated error, so model selection can still
# maximise the score.
scorer = make_scorer(mean_squared_log_error, greater_is_better=False)

# Inner 3-fold split, used by every GridSearchCV for hyperparameter selection.
gscv = KFold(n_splits=3, shuffle=True, random_state=SEED)
# Outer 10-fold split, used to evaluate each tuned model.
cv = KFold(n_splits=10, shuffle=True, random_state=SEED)

# One grid search per algorithm; each one standardises the features inside
# the pipeline so the scaler is fitted on the training folds only.
algorithms = {
    'rf': GridSearchCV(
        Pipeline([
            ('scaler', StandardScaler()),
            ('rf', RandomForestRegressor(
                random_state=SEED,
                n_estimators=100,
                max_depth=16,
                # 1.0 (= all features) is what 'auto' meant for regressors;
                # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3.
                max_features=1.0,
                min_samples_leaf=1,
                n_jobs=NUM_JOBS))]),
        param_grid={
            'rf__min_samples_split': [2, 5, 10],
        },
        scoring=scorer,
        cv=gscv,
        n_jobs=NUM_JOBS,
        verbose=4),
    'knn': GridSearchCV(
        Pipeline([
            ('scaler', StandardScaler()),
            ('knn', KNeighborsRegressor(
                weights='distance',
                p=1,  # Manhattan distance
                n_jobs=NUM_JOBS))]),
        param_grid={
            'knn__n_neighbors': [7, 11, 15],
        },
        scoring=scorer,
        cv=gscv,
        n_jobs=NUM_JOBS,
        verbose=4),
    'svr': GridSearchCV(
        Pipeline([
            ('scaler', StandardScaler()),
            ('svr', LinearSVR(
                random_state=SEED,
                # Must be an integer: the float literal 1E+6 is rejected by
                # scikit-learn's parameter validation in recent releases.
                max_iter=1_000_000))]),
        param_grid={
            # NOTE(review): negative epsilon values have no obvious meaning for
            # an epsilon-insensitive loss - confirm they are intended.
            'svr__epsilon': [-1.5, -1.0, -0.5, 0.0, 0.5, 1.0, 10.0, 50.0, 75.0, 90.0],
            'svr__C': [0.01, 0.1, 1.0, 5.0],
        },
        scoring=scorer,
        cv=gscv,
        n_jobs=NUM_JOBS,
        verbose=4),
}

In [7]:
# Running the experiment: every grid search is evaluated on the outer folds

results = {}
best_algs = {}
for alg, search in algorithms.items():
    print(f"Algorithm {alg}: starting...")
    scores = cross_validate(
        search, X, y,
        cv=cv,
        n_jobs=NUM_JOBS,
        pre_dispatch=1,
        return_estimator=True,
        verbose=4,
    )
    print(f"Algorithm {alg}: DONE!")

    # Average per-fold wall-clock times reported by cross_validate.
    mean_fit_time = np.mean(scores['fit_time'])
    mean_score_time = np.mean(scores['score_time'])
    print("Train set: {:.2f} s".format(mean_fit_time))
    print("Test set : {:.2f} s".format(mean_score_time))
    print("Total    : {:.2f} s".format(mean_fit_time + mean_score_time))
    print()

    # The scorer returns the negated mean squared log error, so flipping the
    # sign and taking the square root gives the root of that error per fold.
    results[alg] = np.sqrt(-scores['test_score'])
    # Keep the estimators fitted on each outer fold.
    best_algs[alg] = scores['estimator']

# One column per algorithm, one row per outer fold.
results = pd.DataFrame(results)

Algorithm rf: starting...
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 77.1min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Algorithm rf: DONE!
Train set: 460.81 s
Test set : 0.26 s
Total    : 461.06 s

Algorithm knn: starting...
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 184.2min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Algorithm knn: DONE!
Train set: 1083.32 s
Test set : 21.31 s
Total    : 1104.63 s

Algorithm svr: starting...
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  2.9min finished
Algorithm svr: DONE!
Train set: 16.65 s
Test set : 0.01 s
Total    : 16.65 s



In [8]:
# Inspecting the results: one column per algorithm, one row per outer fold,
# each value being sqrt(-test_score) as computed in the experiment cell.

results

Unnamed: 0,rf,knn,svr
0,0.306643,0.368359,0.802882
1,0.308204,0.373111,0.807845
2,0.304374,0.37261,0.812891
3,0.299823,0.309161,0.809532
4,0.305305,0.369953,0.803771
5,0.309946,0.372822,0.809968
6,0.306582,0.373051,0.804598
7,0.306919,0.376341,0.802869
8,0.31227,0.371901,0.800743
9,0.306453,0.37075,0.80518
