In [2]:
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score
from sklearn.inspection import permutation_importance
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping


In [4]:
dataframe_final = pd.read_parquet('dados_webscraping/dataframe_imoveis_aluguel_pronto_para_predicao.parquet')

print(dataframe_final.shape)
display(dataframe_final.head(2))
print("Porcentagem de informações faltantes: ")
display(round(dataframe_final.isna().sum()/dataframe_final.shape[0] * 100, 2).astype(str) + " %")


(518, 29)


Unnamed: 0,url,endereco,preco,area,quartos,vagas_de_carro,valor_condominio,iptu,mobiliado,piscina,...,longitude,geometry,nome,distancia_metro,distancia_escola,distancia_unidade_saude,indic_rend,indic_lixo,indic_esgo,indic_agua
0,https://www.zapimoveis.com.br/imovel/aluguel-a...,"Avenida Epitácio Pessoa, 4344 - Lagoa, Rio de ...",12500.0,137.0,3.0,2,1982.0,470.0,False,False,...,-43.199374,b'\x01\x01\x00\x00\x00\x97\xef\xd4\x19\x85\x99...,Lagoa,948.06301,437.351027,767.352721,20.550754,99.987981,99.915865,99.987981
2,https://www.zapimoveis.com.br/imovel/aluguel-a...,"Rua Pinto Teles, 660 - Praça Seca, Rio de Jane...",1100.0,60.0,2.0,1,409.0,162.0,False,False,...,-43.345604,b'\x01\x01\x00\x00\x00\\1(\xc2<\xacE\xc0{\xa2\...,Praça Seca,8011.304617,133.018579,182.882683,3.681449,99.660238,94.377183,98.861081


Porcentagem de informações faltantes: 


url                        0.0 %
endereco                   0.0 %
preco                      0.0 %
area                       0.0 %
quartos                    0.0 %
vagas_de_carro             0.0 %
valor_condominio           0.0 %
iptu                       0.0 %
mobiliado                  0.0 %
piscina                    0.0 %
condominio                 0.0 %
elevador                   0.0 %
jardim                     0.0 %
quadra_esportiva           0.0 %
academia                   0.0 %
finalidade                 0.0 %
tipo                       0.0 %
localizacao                0.0 %
latitude                   0.0 %
longitude                  0.0 %
geometry                   0.0 %
nome                       0.0 %
distancia_metro            0.0 %
distancia_escola           0.0 %
distancia_unidade_saude    0.0 %
indic_rend                 0.0 %
indic_lixo                 0.0 %
indic_esgo                 0.0 %
indic_agua                 0.0 %
dtype: object

In [5]:
colunas_analise = ['preco', 'area', 'quartos', 'vagas_de_carro', 'valor_condominio',
                   'iptu', "mobiliado", "tipo", "distancia_metro", 'distancia_escola', 'distancia_unidade_saude',
                   'indic_rend', 'indic_lixo', 'indic_esgo', 'indic_agua']

display(dataframe_final[colunas_analise].info())

X = dataframe_final[colunas_analise].drop('preco', axis=1)  # Remover a coluna 'preco' de X
y = dataframe_final['preco']

<class 'pandas.core.frame.DataFrame'>
Index: 518 entries, 0 to 1606
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   preco                    518 non-null    float64
 1   area                     518 non-null    float64
 2   quartos                  518 non-null    float64
 3   vagas_de_carro           518 non-null    int64  
 4   valor_condominio         518 non-null    float64
 5   iptu                     518 non-null    float64
 6   mobiliado                518 non-null    bool   
 7   tipo                     518 non-null    object 
 8   distancia_metro          518 non-null    float64
 9   distancia_escola         518 non-null    float64
 10  distancia_unidade_saude  518 non-null    float64
 11  indic_rend               518 non-null    float64
 12  indic_lixo               518 non-null    float64
 13  indic_esgo               518 non-null    float64
 14  indic_agua               518 n

None

In [9]:
# Separar colunas numéricas, categóricas e booleanas
colunas_numericas = X.select_dtypes(include=[np.number]).columns.tolist()
colunas_categoricas = X.select_dtypes(include=[object]).columns.tolist()
colunas_booleanas = X.select_dtypes(include=[bool]).columns.tolist()

# Criar o pré-processador com StandardScaler para colunas numéricas e OneHotEncoder para colunas categóricas e booleanas
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), colunas_numericas),
        ('cat', OneHotEncoder(handle_unknown='ignore'), colunas_categoricas + colunas_booleanas)
    ])

# Aplicar o pré-processador nos dados de treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)

# Converter `y` para numpy array (necessário para Keras)
y_train = np.array(y_train)
y_test = np.array(y_test)

In [7]:
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=preprocessor.get_feature_names_out())
pd.set_option('display.max_columns', None)
print(X_train_scaled_df.shape)
display(X_train_scaled_df.head(2))


(414, 16)


Unnamed: 0,num__area,num__quartos,num__vagas_de_carro,num__valor_condominio,num__iptu,num__distancia_metro,num__distancia_escola,num__distancia_unidade_saude,num__indic_rend,num__indic_lixo,num__indic_esgo,num__indic_agua,cat__tipo_apartamento,cat__tipo_casa,cat__mobiliado_False,cat__mobiliado_True
0,0.235705,0.54945,-0.031817,0.504608,-0.162708,-0.666419,-0.110196,-0.234587,-0.093234,0.609032,0.766037,0.72821,1.0,0.0,1.0,0.0
1,0.041474,-0.538935,-0.031817,-0.978665,-0.6816,-0.508302,-0.266945,-0.430312,-1.092935,0.254257,0.69483,0.619705,0.0,1.0,1.0,0.0


In [24]:


# Função para criar e treinar o modelo de rede neural
def criar_rede_neural(input_shape, learning_rate=0.001, hidden_layers=2, neurons=64, dropout_rate=0.2, activation='relu'):
    model = Sequential()
    model.add(Dense(neurons, activation=activation, input_shape=(input_shape,)))

    # Adicionar camadas ocultas
    for _ in range(hidden_layers - 1):
        model.add(Dense(neurons, activation=activation))
        model.add(Dropout(dropout_rate))

    # Camada de saída (1 neurônio para regressão)
    model.add(Dense(1, activation='linear'))

    # Compilar o modelo
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse', metrics=['mae'])
    return model

def testar_varios_modelos(X, y):
    # Parâmetros para testar
    learning_rates = [0.001, 0.01, 0.1]
    hidden_layer_options = [1, 2, 3]
    neurons_options = [32, 64, 128]
    activation_functions = ['relu', 'tanh', 'sigmoid']
    dropout_rates = [0.2, 0.3, 0.5]

    best_model = None
    best_score = float('inf')
    
    # Dividir os dados em treino e teste
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Testar várias combinações de parâmetros
    for lr in learning_rates:
        for hidden_layers in hidden_layer_options:
            for neurons in neurons_options:
                for activation in activation_functions:
                    for dropout in dropout_rates:
                        print(f'Testando modelo com: lr={lr}, hidden_layers={hidden_layers}, neurons={neurons}, activation={activation}, dropout={dropout}')

                        # Criar e treinar o modelo
                        model = criar_rede_neural(X_train.shape[1], learning_rate=lr, hidden_layers=hidden_layers, neurons=neurons, dropout_rate=dropout, activation=activation)
                        model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=0)

                        # Fazer previsões
                        y_pred = model.predict(X_test)

                        # Calcular as métricas
                        mae = mean_absolute_error(y_test, y_pred)
                        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
                        r2 = r2_score(y_test, y_pred)

                        # Exibir as métricas
                        print(f'MAE: {mae:.4f} | RMSE: {rmse:.4f} | R²: {r2:.4f}')

                        # Armazenar o melhor modelo
                        if mae < best_score:
                            best_score = mae
                            best_model = model

    return best_model, best_score

# Configurações da rede
input_shape = X_train_scaled.shape[1]
best_model, best_score = testar_varios_modelos(X_train_scaled, y_train)

In [22]:
# Treinamento com Early Stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

history = best_model.fit(
    X_train_scaled, y_train,
    validation_data=(X_test_scaled, y_test),
    epochs=100,
    batch_size=32,
    callbacks=[early_stopping],
    verbose=1
)

# Previsão dos valores de teste
y_pred = best_model.predict(X_test_scaled).flatten()

Epoch 1/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 8832711.0000 - mae: 1695.9581 - val_loss: 6606573.0000 - val_mae: 1556.7722
Epoch 2/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 4674332.0000 - mae: 1445.6469 - val_loss: 7218651.5000 - val_mae: 1558.3087
Epoch 3/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 5805894.0000 - mae: 1556.3635 - val_loss: 6850326.0000 - val_mae: 1514.2290
Epoch 4/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 6841322.5000 - mae: 1540.1132 - val_loss: 6215168.5000 - val_mae: 1672.7902
Epoch 5/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 8660764.0000 - mae: 1806.8920 - val_loss: 11812334.0000 - val_mae: 2085.1094
Epoch 6/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 8937084.0000 - mae: 1812.4314 - val_loss: 6237386.5000 - val

In [23]:
# Calculando as métricas
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

# Exibindo as métricas de forma mais bonita
print(f"{'Métricas de Avaliação':^40}")
print(f"{'-'*40}")
print(f"{'MAE':<10}: {mae:.4f}")
print(f"{'RMSE':<10}: {rmse:.4f}")
print(f"{'R²':<10}: {r2:.4f}")
print(f"{'-'*40}")

         Métricas de Avaliação          
----------------------------------------
MAE       : 1354.7818
RMSE      : 2117.7866
R²        : 0.8369
----------------------------------------
