In [1]:
# Algoritmos a serem utilizados
# k-Nearest Neighbors (kNN), Random Fo-rests, XGBoost, Support Vector Regression (SVR) e Neural Networks.

In [32]:
# Importando os pacotes necessários para a análise

import os

import pandas            as pd
import numpy             as np
import scipy             as sc
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_log_error,make_scorer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.reset_option('max_colwidth')

In [2]:
# Lendo a base de dados
df = pd.read_csv("dataset/preprocessed.csv")
print("Formato dos dados: ", df.shape)
print("#Exemplos: {}".format(df.shape[0]))
print("#Atributos: {}".format(df.shape[1]))

Formato dos dados:  (449633, 12)
#Exemplos: 449633
#Atributos: 12


In [5]:
df.head()

Unnamed: 0,created_on,lat,lon,surface_total_in_m2,surface_covered_in_m2,rooms,expenses,PH,apartment,house,store,price
0,0.786688,0.311355,0.664401,-0.001542,-0.001534,0.2,0.005237,0.0,1.0,0.0,0.0,900000.0
1,0.80834,0.124457,0.537113,-0.001539,-0.001531,0.2,-0.02641,0.0,1.0,0.0,0.0,750000.0
2,0.455493,0.25917,0.591096,-0.001515,-0.001507,0.2,-0.02641,0.0,1.0,0.0,0.0,880000.0
3,0.605453,0.312925,0.668791,-0.001522,-0.001514,0.2,-0.02641,0.0,0.0,1.0,0.0,580000.0
4,0.61267,0.363826,0.66813,-0.001524,-0.001478,0.4,-0.02641,0.0,0.0,1.0,0.0,420000.0


# k-Nearest Neighbors (kNN)

In [24]:
from sklearn.neighbors import KNeighborsRegressor

In [25]:
train, test = train_test_split(df, test_size=0.25)

x_train = train.drop('price', axis=1)
y_train = train['price']

x_test = test.drop('price', axis=1)
y_test = test['price']

In [23]:
grid_params = {
'n_neighbors': [5,9,11,13],
'p': [1,2],

}

scorer = make_scorer(mean_squared_log_error, greater_is_better=False)

model = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor(algorithm='auto', weights='distance'))])

gs = GridSearchCV(model, grid_params, verbose=4, n_jobs=-1, scoring=scorer, return_train_score=True)

In [24]:
gs_results = gs.fit(x_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [25]:
gs_results.best_score_

-0.18013264645552116

In [26]:
gs_results.best_estimator_

KNeighborsRegressor(n_neighbors=11, p=1, weights='distance')

In [27]:
gs_results.best_params_

{'n_neighbors': 11, 'p': 1}

In [26]:
model = KNeighborsRegressor(n_neighbors=11,p=1,weights='distance')
y_predicted = model.fit(x_train, y_train).predict(x_test)

In [27]:
np.sqrt(mean_squared_log_error(y_test, y_predicted))

0.3862371701271055

# Random Forest

In [80]:
from sklearn.ensemble import RandomForestRegressor
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV

In [81]:
train, test = train_test_split(df, test_size=0.25)

x_train = train.drop('price', axis=1)
y_train = train['price']

x_test = test.drop('price', axis=1)
y_test = test['price']

In [7]:
# max number of levels in tree
max_depth = [10, 20, 50]
max_depth.append(None)

# minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15]

# minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]

random_grid_params = {
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
}

pprint(random_grid_params)

{'max_depth': [10, 20, 50, None],
 'max_features': ['auto', 'sqrt', 'log2'],
 'min_samples_leaf': [1, 2, 5, 10],
 'min_samples_split': [2, 5, 10, 15],
 'n_estimators': [100, 200, 500]}


In [8]:
# create the base model to tune
model = Pipeline([
    ('scaler', StandardScaler()),
    ('rfr', RandomForestRegressor())])

# use the random grid to search for best hyperparameters,
# using 5 fold cross validation, earch across 20 different combinations, and use all available cores
rfr_random = RandomizedSearchCV(estimator=model, param_distributions=random_grid_params, n_iter=20, cv=3, verbose=2, random_state=42, n_jobs=-1)

In [9]:
# fit the random search model
rfr_random_results = rfr_random.fit(x_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits




In [10]:
rfr_random_results.best_score_

0.6869087933350606

In [11]:
rfr_random_results.best_params_

{'n_estimators': 100,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_features': 'log2',
 'max_depth': None}

In [18]:
model = RandomForestRegressor(n_jobs=-1, n_estimators=100, min_samples_split=2, min_samples_leaf=1, max_features='auto', max_depth=None)
y_predicted = model.fit(x_train, y_train).predict(x_test)

In [19]:
np.sqrt(mean_squared_log_error(y_test, y_predicted))

0.2561500180491964

# Support Vector Regression (SVR)

In [39]:
from sklearn.svm import SVR

In [40]:
X = df.drop("price",1)
y = df.price
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [42]:
model = SVR()
y_predicted = model.fit(X_train, y_train).predict(X_test)

In [None]:
np.sqrt(mean_squared_log_error(y_test,y_predicted))