In [1]:
import re
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score
from sklearn.inspection import permutation_importance
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from joblib import Parallel, delayed
import tensorflow as tf
from tensorflow.keras import metrics
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, Input, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense


In [2]:
dataframe_final = pd.read_parquet('dados_webscraping/dataframe_imoveis_venda_pronto_para_predicao_v2.parquet')

print(dataframe_final.shape)
display(dataframe_final.head(2))
print("Porcentagem de informações faltantes: ")
display(round(dataframe_final.isna().sum()/dataframe_final.shape[0] * 100, 2).astype(str) + " %")


(1139, 26)


Unnamed: 0,url,endereco,preco,area,quartos,vagas_de_carro,mobiliado,piscina,condominio,elevador,...,latitude,longitude,geometry,nome,distancia_metro,distancia_unidade_saude,indic_rend,indic_lixo,indic_agua,preco_log
7,https://www.zapimoveis.com.br/imovel/venda-apa...,"Rua Barão, 450 - Praça Seca, Rio de Janeiro - RJ",195000.0,56.0,2.0,1,False,True,True,True,...,-22.89567935,-43.35641545,b'\x01\x01\x00\x00\x00C\xc8~\x05\x9f\xadE\xc0\...,Praça Seca,10138.418568,1850.721339,3.681449,99.660238,98.861081,12.18076
8,https://www.zapimoveis.com.br/imovel/venda-apa...,"Avenida dos Mananciais, 534 - Taquara, Rio de ...",400000.0,66.0,3.0,1,False,True,True,False,...,-22.9178813,-43.395858700000005,b'\x01\x01\x00\x00\x00n.u\x7f\xab\xb2E\xc0\x0c...,Taquara,16915.620046,2813.306033,4.134871,99.837475,99.326261,12.899222


Porcentagem de informações faltantes: 


Unnamed: 0,0
url,0.0 %
endereco,0.0 %
preco,0.0 %
area,0.0 %
quartos,0.0 %
vagas_de_carro,0.0 %
mobiliado,0.0 %
piscina,0.0 %
condominio,0.0 %
elevador,0.0 %


In [3]:
colunas_analise = ['preco', 'area', 'quartos', 'vagas_de_carro', "mobiliado", "tipo", "distancia_metro", 'distancia_unidade_saude',
                   'indic_rend', 'indic_lixo', 'indic_agua']

display(dataframe_final[colunas_analise].info())

X = dataframe_final[colunas_analise].drop('preco', axis=1)  # Remover a coluna 'preco' de X
y = dataframe_final['preco']

<class 'pandas.core.frame.DataFrame'>
Index: 1139 entries, 7 to 3859
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   preco                    1139 non-null   float64
 1   area                     1139 non-null   float64
 2   quartos                  1139 non-null   float64
 3   vagas_de_carro           1139 non-null   int64  
 4   mobiliado                1139 non-null   bool   
 5   tipo                     1139 non-null   object 
 6   distancia_metro          1139 non-null   float64
 7   distancia_unidade_saude  1139 non-null   float64
 8   indic_rend               1139 non-null   float64
 9   indic_lixo               1139 non-null   float64
 10  indic_agua               1139 non-null   float64
dtypes: bool(1), float64(8), int64(1), object(1)
memory usage: 99.0+ KB


None

In [4]:
# Separar colunas numéricas, categóricas e booleanas
colunas_numericas = X.select_dtypes(include=[np.number]).columns.tolist()
colunas_categoricas = X.select_dtypes(include=[object]).columns.tolist()
colunas_booleanas = X.select_dtypes(include=[bool]).columns.tolist()

# Criar o pré-processador com StandardScaler para colunas numéricas e OneHotEncoder para colunas categóricas e booleanas
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), colunas_numericas),
        ('cat', OneHotEncoder(handle_unknown='ignore'), colunas_categoricas + colunas_booleanas)
    ])

# Aplicar o pré-processador nos dados de treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)

# Converter `y` para numpy array (necessário para Keras)
y_train = np.array(y_train)
y_test = np.array(y_test)

In [5]:
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=preprocessor.get_feature_names_out())
pd.set_option('display.max_columns', None)
print(X_train_scaled_df.shape)
display(X_train_scaled_df.head(2))

(911, 12)


Unnamed: 0,num__area,num__quartos,num__vagas_de_carro,num__distancia_metro,num__distancia_unidade_saude,num__indic_rend,num__indic_lixo,num__indic_agua,cat__tipo_apartamento,cat__tipo_casa,cat__mobiliado_False,cat__mobiliado_True
0,1.212384,0.911176,0.0,1.511739,2.63116,-0.50613,-1.642497,-1.97249,1.0,0.0,1.0,0.0
1,-0.732407,-0.598063,0.0,-0.717427,-1.0589,0.63045,0.773204,0.748429,1.0,0.0,1.0,0.0


In [6]:
# Função para criar e treinar o modelo de rede neural
def criar_rede_neural(input_shape, learning_rate=0.001, hidden_layers=2, neurons=64, dropout_rate=0.2, activation='relu',
                      output_activation='linear', optimizer='adam', l2_reg=0.01):

    # Verificar parâmetros
    if hidden_layers < 1:
        raise ValueError("O número de camadas ocultas deve ser pelo menos 1.")
    if neurons < 1:
        raise ValueError("O número de neurônios deve ser pelo menos 1.")
    if not 0 <= dropout_rate < 1:
        raise ValueError("A taxa de dropout deve estar entre 0 e 1.")

    model = Sequential()
    model.add(Input(shape=(input_shape,)))
    model.add(Dense(neurons, activation=activation, kernel_regularizer=l2(l2_reg)))

    # Adicionar camadas ocultas
    for _ in range(hidden_layers - 1):
        model.add(Dense(neurons, activation=activation, kernel_regularizer=l2(l2_reg)))
        model.add(Dropout(dropout_rate))

    # Camada de saída (1 neurônio para regressão)
    model.add(Dense(1, activation='linear'))

    # Escolher otimizador
    if optimizer.lower() == 'adam':
        optimizer = Adam(learning_rate=learning_rate)
    elif optimizer.lower() == 'sgd':
        optimizer = SGD(learning_rate=learning_rate)
    else:
        raise ValueError("Otimizador não suportado. Use 'adam' ou 'sgd'.")

    # Compilar o modelo
    model.compile(
        optimizer=optimizer,
        loss='mse',
        metrics=['mae'])

    return model

def testar_modelo(lr, hidden_layers, neurons, activation, dropout, X_train, y_train, X_test, y_test):

    def learning_rate_scheduler(epoch, lr):
        return float(lr * tf.math.exp(-0.09))

    # Criar o modelo
    model = criar_rede_neural(
        input_shape=X_train.shape[1],
        learning_rate=lr,
        hidden_layers=hidden_layers,
        neurons=neurons,
        dropout_rate=dropout,
        activation=activation
    )

    # Treinar o modelo
    model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=0, callbacks=[tf.keras.callbacks.LearningRateScheduler(learning_rate_scheduler)])

    # Fazer previsões
    y_pred = model.predict(X_test)

    # Calcular métricas
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100  # MAPE

    # Retornar resultados e o modelo
    return mae, rmse, r2, mape, model

def testar_varios_modelos_paralelo(X_train, X_test, y_train, y_test):

    # Parâmetros para testar
    learning_rates = [0.001, 0.01, 0.1]
    hidden_layer_options = [1, 2, 3]
    neurons_options = [32, 64, 128]
    activation_functions = ['relu', 'tanh', 'sigmoid']
    dropout_rates = [0.2, 0.3, 0.5]

    param_combinations = list(itertools.product(learning_rates, hidden_layer_options, neurons_options, activation_functions, dropout_rates))

    # Executar os testes em paralelo
    results = Parallel(n_jobs=-1)(
        delayed(testar_modelo)(lr, hidden_layers, neurons, activation, dropout, X_train, y_train, X_test, y_test)
        for lr, hidden_layers, neurons, activation, dropout in tqdm(param_combinations, desc="Testando combinações")
    )

    # Identificar o melhor modelo
    best_score = float('inf')
    best_model = None

    for mae, rmse, r2, mape, model in results:
        print(f"MAE: {mae:.4f} | RMSE: {rmse:.4f} | R²: {r2:.4f} | MAPE: {mape:.2f}%")
        if mae < best_score:
            best_score = mae
            best_model = model

    return best_model, best_score, results


best_model, best_score, results = testar_varios_modelos_paralelo(X_train_scaled, X_test_scaled, y_train, y_test)

Testando combinações: 100%|██████████| 243/243 [39:01<00:00,  9.64s/it]


MAE: 814318.0192 | RMSE: 1009717.9090 | R²: -1.8605 | MAPE: 100.00%
MAE: 814312.4989 | RMSE: 1009711.7021 | R²: -1.8604 | MAPE: 100.00%
MAE: 814314.0104 | RMSE: 1009714.1408 | R²: -1.8604 | MAPE: 100.00%
MAE: 814318.4459 | RMSE: 1009719.2365 | R²: -1.8605 | MAPE: 100.00%
MAE: 814318.6664 | RMSE: 1009719.3678 | R²: -1.8605 | MAPE: 100.00%
MAE: 814318.6172 | RMSE: 1009719.3200 | R²: -1.8605 | MAPE: 100.00%
MAE: 814322.3526 | RMSE: 1009723.2346 | R²: -1.8605 | MAPE: 100.00%
MAE: 814322.4336 | RMSE: 1009723.2601 | R²: -1.8605 | MAPE: 100.00%
MAE: 814321.6414 | RMSE: 1009722.5185 | R²: -1.8605 | MAPE: 100.00%
MAE: 814299.7159 | RMSE: 1009698.0283 | R²: -1.8604 | MAPE: 99.99%
MAE: 814305.7936 | RMSE: 1009704.3604 | R²: -1.8604 | MAPE: 100.00%
MAE: 814306.0111 | RMSE: 1009704.1819 | R²: -1.8604 | MAPE: 100.00%
MAE: 814308.0090 | RMSE: 1009709.7849 | R²: -1.8604 | MAPE: 100.00%
MAE: 814307.6797 | RMSE: 1009709.6226 | R²: -1.8604 | MAPE: 100.00%
MAE: 814309.6510 | RMSE: 1009711.1305 | R²: -1.86

In [7]:
best_model.save('melhor_modelo_venda.keras')
# best_model = load_model(f'models/melhor_modelo_aluguel.keras')

In [8]:
# Treinamento com Early Stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

history = best_model.fit(
    X_train_scaled, y_train,
    validation_data=(X_test_scaled, y_test),
    epochs=100,
    batch_size=32,
    callbacks=[early_stopping],
    verbose=1
)

# Previsão dos valores de teste
y_pred = best_model.predict(X_test_scaled).flatten()#

Epoch 1/100
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - loss: 63347933184.0000 - mae: 160857.4844 - val_loss: 67826724864.0000 - val_mae: 173693.9688
Epoch 2/100
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 61321621504.0000 - mae: 164072.9219 - val_loss: 67833368576.0000 - val_mae: 173698.0156
Epoch 3/100
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 50749767680.0000 - mae: 152252.9375 - val_loss: 67830734848.0000 - val_mae: 173694.0625
Epoch 4/100
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 59922456576.0000 - mae: 159871.6406 - val_loss: 67825975296.0000 - val_mae: 173691.0625
Epoch 5/100
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 54258728960.0000 - mae: 157209.4062 - val_loss: 67820179456.0000 - val_mae: 173687.5938
Epoch 6/100
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 6

In [9]:
# Calculando as métricas
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

# Exibindo as métricas de forma mais bonita
print(f"{'Métricas de Avaliação':^40}")
print(f"{'-'*40}")
print(f"{'MAE':<10}: {mae:.4f}")
print(f"{'RMSE':<10}: {rmse:.4f}")
print(f"{'R²':<10}: {r2:.4f}")
print(f"{'MAPE':<10}: {mape:.4f}%")
print(f"{'-'*40}")

#         Métricas de Avaliação
#----------------------------------------
# MAE       : 1354.7818
# RMSE      : 2117.7866
# R²        : 0.8369
#----------------------------------------

#          Métricas de Avaliação
# ----------------------------------------
# MAE       : 173658.7656
# RMSE      : 260350.2725
# R²        : 0.8098
# MAPE      : 22.8972%
# ----------------------------------------

         Métricas de Avaliação          
----------------------------------------
MAE       : 173658.7656
RMSE      : 260350.2725
R²        : 0.8098
MAPE      : 22.8972%
----------------------------------------


In [10]:
results_dict = {
    "Algoritmo": [],
    "MAE": [],
    "RMSE": [],
    "R2": [],
    "MAPE": [],
}

# Preenchendo o dicionário
for mae, rmse, r2, mape, model in results:
    results_dict["Algoritmo"].append("RedeNeural")
    results_dict["MAE"].append(mae)
    results_dict["RMSE"].append(rmse)
    results_dict["R2"].append(r2)
    results_dict["MAPE"].append(mape)

df = pd.DataFrame(results_dict)
df_melted = df.melt(id_vars=["Algoritmo"],
                    var_name="Métrica",
                    value_name="Valor")




In [11]:
df_melted.to_csv("resultados_da_rn_venda.csv")