# Pré-Processamento dos Dados

## Título:

**Predição de preço de imóveis**

## Membros:

*   Adrisson Rogério Samersla
*   Nickolas Batista Mendonça Machado
*   Thayna Pires Baldão



# Setup

In [109]:
# Importando os pacotes necessários para a análise

import os

import pandas            as pd
import geopandas         as gpd
import numpy             as np
import scipy             as sc
import matplotlib.pyplot as plt
import seaborn           as sns

from google_drive_downloader import GoogleDriveDownloader as gdd

import mapclassify

# Show every column when a DataFrame is displayed (None removes the column limit)
pd.set_option('display.max_columns', None)  
# Do not wrap the repr of wide DataFrames across several blocks of lines
pd.set_option('display.expand_frame_repr', False)
# Put the maximum column width option back to its default value
pd.reset_option('max_colwidth')

In [110]:
# Download the dataset (skipped when the dataset folder is already present)

dataset_dir = "../dataset"
has_dataset_dir = os.path.isdir(dataset_dir)

if not has_dataset_dir:
    # The archive is shared on Google Drive:
    # https://drive.google.com/file/d/1S4rBgtuogAGr_WIcF-FIPaULfGlB9MRs/view?usp=sharing
    gdd.download_file_from_google_drive(
        file_id='1S4rBgtuogAGr_WIcF-FIPaULfGlB9MRs',
        dest_path='../dataset.zip',
        showsize=True,
        unzip=True,
    )

In [111]:
# Load the raw dataset and report its dimensions

df = pd.read_csv(f"{dataset_dir}/dataset.csv")

print("Formato dos dados: ", df.shape)
print(f"#Exemplos: {df.shape[0]}")
print(f"#Atributos: {df.shape[1]}")

Formato dos dados:  (1000000, 24)
#Exemplos: 1000000
#Atributos: 24


In [112]:
# Inspect the dtype pandas assigned to each column of the raw frame
df.dtypes

id                           int64
property_id                 object
created_on                  object
operation                   object
property_type               object
place_name                  object
place_with_parent_names     object
country_name                object
state_name                  object
geonames_id                float64
lat_lon                     object
lat                        float64
lon                        float64
currency                    object
surface_total_in_m2        float64
surface_covered_in_m2      float64
floor                      float64
rooms                      float64
expenses                   float64
description                 object
title                       object
image_thumbnail             object
collected_on                object
price                      float64
dtype: object

In [113]:
# Preview the first rows of the raw frame
df.head()

Unnamed: 0,id,property_id,created_on,operation,property_type,place_name,place_with_parent_names,country_name,state_name,geonames_id,lat_lon,lat,lon,currency,surface_total_in_m2,surface_covered_in_m2,floor,rooms,expenses,description,title,image_thumbnail,collected_on,price
0,9641098,9e065715d8b6c69bc930c185901aaf71e3be3d54,2017-04-28,sell,apartment,Campo Belo,|Brasil|São Paulo|São Paulo|Campo Belo|,Brasil,São Paulo,,"-23.616325,-46.67662",-23.616325,-46.67662,BRL,,80.0,,,1065.0,Atualizado em 05/06/2017. O projeto arquitetôn...,"Apartamento com 2 quartos e 2 Suites, São Paul...",https://thumbs4.properati.com/2/tQXG8R8vA-PsM4...,2017-08,900000.0
1,7965368,f6b640522dd15dd3667850718f52aa02516b04ff,2017-05-25,sell,apartment,Porto Alegre,|Brasil|Rio Grande do Sul|Porto Alegre|,Brasil,Rio Grande do Sul,,"-30.05175,-51.182068",-30.05175,-51.182068,BRL,91.0,91.0,,,,"APARTAMENTO com 3 dormitórios , sendo 1 suíte,...",Apartamento em Petrópolis,https://thumbs4.properati.com/6/gy-6GDbU5Ccx_I...,2017-07,750000.0
2,6779668,39a954f5d56137f6fe547af707626eafa6a3869c,2015-06-26,sell,apartment,Bessa,|Brasil|Paraíba|João Pessoa|Bessa|,Brasil,Paraíba,,,,,BRL,,,,,,Edifcio: 10(dez) apartamentos; 01(uma) vaga ga...,Venda Apartamento Joo Pessoa/Paraba: 3 quartos...,https://thumbs4.properati.com/8/DL8fMA8z1oRVgv...,2015-12,235000.0
3,10187959,ae0282c0e20210cfaf030e57edf85b9d5d6c13f4,2016-02-22,sell,apartment,São Lourenço,|Brasil|Paraná|Curitiba|São Lourenço|,Brasil,Paraná,,"-25.413214,-49.271294",-25.413214,-49.271294,BRL,,168.0,,,,Linda cobertura no bairro São Lourenço com uma...,"Linda Cobertura duplex - 168 m² - 2 suítes, 2 ...",https://thumbs4.properati.com/2/J6USW1fALKsXov...,2016-06,880000.0
4,1147630,f83c204dd1baf29c6460826ee829be00f751f676,2016-09-09,sell,house,Vila Formosa,|Brasil|São Paulo|São Paulo|Vila Formosa|,Brasil,São Paulo,,"-23.56229,-46.521248",-23.56229,-46.521248,BRL,,145.0,,2.0,,"Lindo sobrado, muito bem construído com ótimo ...",Sobrado novo para venda na Vila Formosa.,https://thumbs4.properati.com/0/bXqmIc-TKt1wua...,2017-03,580000.0


# Seleção Manual de Atributos

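Atributos a serem eliminados (os mesmos listados em `ignore_columns` na célula abaixo):

* id
* property_id
* operation
* place_with_parent_names
* country_name
* geonames_id
* lat_lon
* currency
* floor
* description
* title
* image_thumbnail
* collected_on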

In [114]:
# Columns removed from the dataset by manual_selection()
ignore_columns = [
    'id', 'property_id', 'operation',
    'place_with_parent_names', 'country_name', 'geonames_id',
    'lat_lon', 'currency', 'floor',
    'description', 'title', 'image_thumbnail',
    'collected_on',
]


def manual_selection(dataset):
    """Return a copy of ``dataset`` without the columns in ``ignore_columns``.

    The input frame is left untouched. Every name in ``ignore_columns`` must
    exist in ``dataset``; otherwise pandas raises ``KeyError``.
    """
    return dataset.drop(labels=ignore_columns, axis='columns')


# Optional sanity check (disabled):
# assert not set(ignore_columns) & set(manual_selection(df).columns)

# Eliminando Inconsistências

## Moedas Estrangeiras

In [115]:
# Count how many listings use each currency
# (cast to str first, presumably so missing values would be counted as well)
df['currency'].astype('str').value_counts()

BRL    999806
MXN       103
USD        87
COP         4
Name: currency, dtype: int64

In [116]:
def filter_currency(dataset):
    """Keep only the rows whose ``currency`` value is exactly ``'BRL'``.

    Rows with any other currency, or with a missing one, are discarded.
    The original index labels of the kept rows are preserved.
    """
    is_brl = dataset['currency'] == 'BRL'
    return dataset.loc[is_brl]


# Optional check (disabled):
# filter_currency(df)['currency'].astype('str').value_counts()

## Pontos Fora do Brasil

In [117]:
# Count listings per state; the "Outros países" label marks rows to be filtered out later
df['state_name'].astype('str').value_counts()

São Paulo              704145
Rio Grande do Sul       90626
Rio de Janeiro          71693
Santa Catarina          27613
Minas Gerais            27401
Paraná                  18083
Ceará                   11472
Espírito Santo          11056
Paraíba                 10599
Bahia                    8578
Rio Grande do Norte      4981
Goiás                    2929
Pernambuco               2645
Distrito Federal         2300
Pará                     1346
Mato Grosso               899
Amazonas                  831
Tocantins                 714
Alagoas                   610
Maranhão                  568
Sergipe                   314
Mato Grosso do Sul        190
Rondônia                  139
Piauí                     123
Amapá                     123
Acre                       16
Outros países               4
Roraima                     2
Name: state_name, dtype: int64

In [118]:
def filter_latlon(dataset, lat_min=-40.0, lat_max=10.0, lon_min=-80.0, lon_max=-30.0):
    """Keep only the rows whose coordinates fall inside a bounding box.

    A row is kept when all of the following hold:

    * ``lat`` and ``lon`` are both present (not NaN);
    * ``state_name`` is different from ``"Outros países"``;
    * ``lat_min < lat < lat_max`` and ``lon_min < lon < lon_max``
      (all four limits are exclusive).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the ``lat``, ``lon`` and ``state_name`` columns.
    lat_min, lat_max, lon_min, lon_max : float, optional
        Exclusive limits of the box. The defaults are the values this
        notebook has always used, a loose rectangle meant to discard
        points outside Brazil.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with the original index labels and columns.

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``dataset``.
    """
    # NOTE: the box is a cheap approximation. A stricter filter would test
    # whether each point lies inside the country polygon (e.g. with geopandas).
    lat = dataset['lat']
    lon = dataset['lon']

    # Kept explicit for readability: a NaN coordinate would also fail the
    # range comparisons below, since comparisons with NaN are False.
    has_coords = lat.notna() & lon.notna()
    known_state = dataset['state_name'] != "Outros países"
    lat_in_box = (lat > lat_min) & (lat < lat_max)
    lon_in_box = (lon > lon_min) & (lon < lon_max)

    return dataset.loc[has_coords & known_state & lat_in_box & lon_in_box]


# Optional check (disabled):
# filter_latlon(df).state_name.astype('str').value_counts()

# Aplicando o Pré-Processamento

In [119]:
# The row filters run before manual_selection because filter_currency reads the
# `currency` column, which manual_selection drops.
# NOTE(review): this cell rebinds `df`, so running it a second time raises a
# KeyError on `currency`; re-run the loading cell first.
df = (
    df
    .pipe(filter_currency)
    .pipe(filter_latlon)
    .pipe(manual_selection)
)

print("Formato dos dados: ", df.shape)
print(f"#Exemplos: {df.shape[0]}")
print(f"#Atributos: {df.shape[1]}")
print()

df.dtypes

Formato dos dados:  (495836, 11)
#Exemplos: 495836
#Atributos: 11



created_on                object
property_type             object
place_name                object
state_name                object
lat                      float64
lon                      float64
surface_total_in_m2      float64
surface_covered_in_m2    float64
rooms                    float64
expenses                 float64
price                    float64
dtype: object

In [120]:
# Write the preprocessed frame next to the raw dataset, without the index column
df.to_csv(f"{dataset_dir}/preprocessed.csv", index=False)