In [1]:
# Algoritmos a serem utilizados
# k-Nearest Neighbors (kNN), Random Forests, XGBoost, Support Vector Regression (SVR) e Neural Networks.

In [12]:
# Importando os pacotes necessários para a análise

import os

import pandas            as pd
# import geopandas         as gpd
import numpy             as np
import scipy             as sc
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_log_error,make_scorer
# import seaborn           as sns

# from google_drive_downloader import GoogleDriveDownloader as gdd
# from sklearn.preprocessing import OneHotEncoder
# from sklearn.preprocessing import OrdinalEncoder

# import mapclassify

# Notebook display settings: show every column, keep wide frames on one
# line instead of wrapping, and restore the default column width.
pd.set_option(
    'display.max_columns', None,
    'display.expand_frame_repr', False,
)
pd.reset_option('max_colwidth')

In [8]:
# Read the dataset and discard the 'PH' column.

df = pd.read_csv('dataset.csv')
# Name the column via `columns=`: passing `axis` positionally (`drop('PH', 1)`)
# was deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 onward.
df = df.drop(columns='PH')
print("Formato dos dados: ", df.shape)
print("#Exemplos: {}".format(df.shape[0]))
print("#Atributos: {}".format(df.shape[1]))

Formato dos dados:  (449633, 11)
#Exemplos: 449633
#Atributos: 11


In [9]:
# Show the dtype of every column left in the frame.
df.dtypes

created_on               float64
lat                      float64
lon                      float64
surface_total_in_m2      float64
surface_covered_in_m2    float64
rooms                    float64
expenses                 float64
apartment                float64
house                    float64
store                    float64
price                    float64
dtype: object

In [10]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,created_on,lat,lon,surface_total_in_m2,surface_covered_in_m2,rooms,expenses,apartment,house,store,price
0,0.786688,0.311355,0.664401,-0.001542,-0.001534,0.2,0.005237,1.0,0.0,0.0,900000.0
1,0.80834,0.124457,0.537113,-0.001539,-0.001531,0.2,-0.02641,1.0,0.0,0.0,750000.0
2,0.455493,0.25917,0.591096,-0.001515,-0.001507,0.2,-0.02641,0.0,1.0,0.0,880000.0
3,0.605453,0.312925,0.668791,-0.001522,-0.001514,0.2,-0.02641,0.0,1.0,0.0,580000.0
4,0.61267,0.363826,0.66813,-0.001524,-0.001478,0.4,-0.02641,1.0,0.0,0.0,420000.0


# k-Nearest Neighbors (kNN)

In [12]:
from sklearn.neighbors import KNeighborsRegressor

In [30]:
# NOTE(review): n_neighbors is not referenced by the visible cells; the model
# cell below hard-codes its own neighbour count.
n_neighbors = 5
# Features are every column except the target. `columns=` replaces the
# positional axis argument, which pandas 2.0 no longer accepts.
X = df.drop(columns="price")
y = df.price
# A fixed seed makes the train/test partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [31]:
# Distance-weighted kNN regressor with Minkowski exponent p=1.
model = KNeighborsRegressor(
    n_neighbors=11,
    weights='distance',
    p=1,
    algorithm='auto',
    n_jobs=-1,
)
model.fit(X_train, y_train)
y_predicted = model.predict(X_test)

In [32]:
# Root mean squared logarithmic error of the kNN predictions on the test split.
np.sqrt(mean_squared_log_error(y_true=y_test, y_pred=y_predicted))

0.4934027111616297

In [15]:
# Search space: neighbour counts crossed with Minkowski exponents.
grid_params = dict(
    n_neighbors=[3, 5, 11, 19],
    p=[1, 2, 3, 4],
)

# greater_is_better=False makes the scorer report the negated MSLE, so that
# a larger score still means a better model for GridSearchCV.
scorer = make_scorer(mean_squared_log_error, greater_is_better=False)

gs = GridSearchCV(
    KNeighborsRegressor(algorithm='auto', weights='distance'),
    grid_params,
    scoring=scorer,
    n_jobs=-1,
    verbose=4,
    return_train_score=True,
)

In [16]:
# Run the cross-validated search on the training split only.
gs_results = gs.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [17]:
# Best mean cross-validated score; negative because the scorer negates MSLE.
gs_results.best_score_

-0.2613317576746265

In [18]:
# Estimator configured with the best parameter combination found.
gs_results.best_estimator_

KNeighborsRegressor(n_neighbors=11, p=1, weights='distance')

In [19]:
# Parameter combination that achieved the best score.
gs_results.best_params_

{'n_neighbors': 11, 'p': 1}

In [21]:
# Per-candidate timing and score arrays collected during the search.
gs_results.cv_results_

{'mean_fit_time': array([155.65409689, 138.71263227, 125.54477477,  92.60155334,
        123.56682987, 108.43741112, 100.94873357,  81.83847842,
         98.41120105, 105.65081134,  93.66024442,  70.88209281,
         86.04334865,  95.97713847,  92.76323152,  71.78684978]),
 'std_fit_time': array([69.51261602, 77.48664065, 47.71108339, 51.62037566, 60.51779879,
        64.73771596, 57.9755436 , 40.74859141, 68.91742898, 62.58419161,
        55.77248816, 43.83014909, 55.25336807, 63.14066005, 57.27041544,
        44.82143194]),
 'mean_score_time': array([ 77.44248667,  79.09174151, 216.5306376 , 220.66417241,
         79.57625108,  79.70739946, 243.84156795, 260.24075799,
         86.22830782,  85.7026288 , 271.26244512, 256.54212604,
         92.70610199,  94.61576734, 305.41920619, 296.17921791]),
 'std_score_time': array([21.66035565, 23.03979471, 54.53025379, 57.16198608, 20.5400987 ,
        21.8697517 , 65.58642716, 75.80449118, 22.63495634, 22.12637591,
        68.52555229, 66.14

# Random Forest

In [82]:
from sklearn.ensemble import RandomForestRegressor

In [81]:
# Features are every column except the target. `columns=` replaces the
# positional axis argument, which pandas 2.0 no longer accepts.
X = df.drop(columns="price")
y = df.price
# A fixed seed makes the train/test partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [83]:
# Random forest with default hyper-parameters; n_jobs=-1 enables parallel training.
model = RandomForestRegressor(n_jobs=-1)
model.fit(X_train, y_train)
y_predicted = model.predict(X_test)

In [84]:
# Root mean squared logarithmic error of the random-forest predictions.
np.sqrt(mean_squared_log_error(y_true=y_test, y_pred=y_predicted))

0.27724964768158555

# Support Vector Regression (SVR)

In [39]:
from sklearn.svm import SVR

In [40]:
# Features are every column except the target. `columns=` replaces the
# positional axis argument, which pandas 2.0 no longer accepts.
X = df.drop(columns="price")
y = df.price
# A fixed seed makes the train/test partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [42]:
# Support vector regressor with the library's default hyper-parameters.
model = SVR()
model.fit(X_train, y_train)
y_predicted = model.predict(X_test)

In [None]:
# Root mean squared logarithmic error of the SVR predictions.
np.sqrt(mean_squared_log_error(y_true=y_test, y_pred=y_predicted))

# XGBoost

In [52]:
from xgboost import XGBRegressor

In [77]:
# Features are every column except the target. `columns=` replaces the
# positional axis argument, which pandas 2.0 no longer accepts.
X = df.drop(columns="price")
y = df.price
# A fixed seed makes the train/test partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [78]:
# Gradient-boosted trees with default hyper-parameters; n_jobs=-1 enables parallel training.
model = XGBRegressor(n_jobs=-1)
model.fit(X_train, y_train)
y_predicted = model.predict(X_test)

In [75]:
# Count negative targets with a vectorised reduction; the builtin sum()
# would iterate over the Series one element at a time in Python.
int((y_test < 0).sum())

0

In [76]:
# Count negative predictions with a vectorised reduction; the builtin sum()
# would iterate over the array one element at a time in Python.
int((y_predicted < 0).sum())

103

In [80]:
# mean_squared_log_error raises ValueError when any value is negative, and the
# boosted model can emit negative prices, so floor the predictions at zero
# before computing the root mean squared logarithmic error.
np.sqrt(mean_squared_log_error(y_test, np.maximum(y_predicted, 0)))

0.3828730620423367

# Neural Networks
## Will use the same NN as shown in https://www.kaggle.com/luckeciano/nn-with-tabular-data-baseline

In [35]:
#Import Keras 
from keras.layers import Input, Dense, BatchNormalization
from keras.models import Model, Sequential
from keras.layers import LeakyReLU
from keras.optimizers import Adam
from keras import backend as K
from keras.utils.vis_utils import plot_model
from keras.callbacks import LearningRateScheduler, TensorBoard
from sklearn.metrics import mean_squared_error
from keras.losses import MeanSquaredLogarithmicError
from keras.metrics import mean_squared_logarithmic_error

In [43]:
# Hyper-parameters for the neural-network experiments.
# Activation passed to the hidden Dense layers of the first Sequential model.
ACTIVATION = 'relu'
# NOTE(review): EPOCHS, BATCH_SIZE, LEARNING_RATE, BETA_1, BETA_2 and EPSILON
# are referenced only by the commented-out compile/fit cell in this notebook.
EPOCHS = 2
BATCH_SIZE = 128
LEARNING_RATE= 0.01
# NOTE(review): LOSS_FUNCTION is not referenced by any visible cell.
LOSS_FUNCTION = 'mean_squared_error'
BETA_1 = 0.9
BETA_2 = 0.999
EPSILON = 1e-08
# NOTE(review): EPOCHS_DROP and DROP are not referenced by any visible cell;
# presumably intended for a learning-rate schedule — confirm or remove.
EPOCHS_DROP = 20
DROP = 0.8

In [13]:
# Features are every column except the target. `columns=` replaces the
# positional axis argument, which pandas 2.0 no longer accepts.
X = df.drop(columns="price")
y = df.price
# A fixed seed makes the train/test partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [18]:
# Fully connected regressor: hidden layers of 128, 128 and 64 units feeding
# a single output unit with no activation.
model = Sequential([
    Dense(128, input_dim=10, activation=ACTIVATION),
    Dense(128, activation=ACTIVATION),
    Dense(64, activation=ACTIVATION),
    Dense(1),
])

In [27]:
# Evaluation metric helper
def root_mean_squared_error(y_true, y_pred):
    """Return the square root of the mean squared logarithmic error.

    Despite its name, this computes RMSLE (not plain RMSE): it delegates to
    ``mean_squared_log_error`` and takes the square root of the result.

    NOTE(review): it works through NumPy/scikit-learn, so it is presumably
    meant for evaluating arrays rather than as a Keras loss — confirm.
    """
    msle = mean_squared_log_error(y_true, y_pred)
    return np.sqrt(msle)

In [44]:
# adam = Adam(lr=LEARNING_RATE, beta_1=BETA_1, beta_2=BETA_2, epsilon=EPSILON)
# model.compile (loss = MeanSquaredLogarithmicError(), optimizer = adam)
# model.fit (X_train, y_train, epochs = EPOCHS, batch_size = BATCH_SIZE, verbose = 1, validation_split=0.1)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x1c4745f8af0>

In [46]:
# Deeper network: a 128-unit first layer, three 256-unit hidden layers and a
# single linear output, all using the 'normal' kernel initializer.
model = Sequential([
    # Input layer
    Dense(128, kernel_initializer='normal', input_dim=10, activation='relu'),
    # Hidden layers
    *[Dense(256, kernel_initializer='normal', activation='relu') for _ in range(3)],
    # Output layer
    Dense(1, kernel_initializer='normal', activation='linear'),
])

# MSLE is the optimised loss; plain MSE is tracked as an extra metric.
model.compile(
    loss=MeanSquaredLogarithmicError(),
    optimizer='adam',
    metrics=['mean_squared_error'],
)
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 128)               1408      
_________________________________________________________________
dense_13 (Dense)             (None, 256)               33024     
_________________________________________________________________
dense_14 (Dense)             (None, 256)               65792     
_________________________________________________________________
dense_15 (Dense)             (None, 256)               65792     
_________________________________________________________________
dense_16 (Dense)             (None, 1)                 257       
Total params: 166,273
Trainable params: 166,273
Non-trainable params: 0
_________________________________________________________________


In [50]:
# Train for 15 epochs in mini-batches of 32, holding out 20% of the training
# split for validation.
model.fit(
    X_train,
    y_train,
    epochs=15,
    batch_size=32,
    validation_split=0.2,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x1c4793ded90>

In [51]:
# Score the trained network on the test split with RMSLE.
y_predicted = model.predict(X_test)
np.sqrt(mean_squared_log_error(y_true=y_test, y_pred=y_predicted))

0.5995757353337279