In [10]:
import re
import itertools
import os
import json
from openai import OpenAI
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import transformers
import torch
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score
from sklearn.inspection import permutation_importance
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from joblib import Parallel, delayed
import tensorflow as tf
from tensorflow.keras import metrics
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, Input, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

In [11]:
dataframe_final = pd.read_parquet('dados_webscraping/dataframe_imoveis_aluguel_pronto_para_predicao.parquet')

print(dataframe_final.shape)
display(dataframe_final.head(2))
print("Porcentagem de informações faltantes: ")
display(round(dataframe_final.isna().sum()/dataframe_final.shape[0] * 100, 2).astype(str) + " %")


(518, 29)


Unnamed: 0,url,endereco,preco,area,quartos,vagas_de_carro,valor_condominio,iptu,mobiliado,piscina,condominio,elevador,jardim,quadra_esportiva,academia,finalidade,tipo,localizacao,latitude,longitude,geometry,nome,distancia_metro,distancia_escola,distancia_unidade_saude,indic_rend,indic_lixo,indic_esgo,indic_agua
0,https://www.zapimoveis.com.br/imovel/aluguel-a...,"Avenida Epitácio Pessoa, 4344 - Lagoa, Rio de ...",12500.0,137.0,3.0,2,1982.0,470.0,False,False,False,False,True,False,False,residencial,apartamento,"Avenida Epitácio Pessoa, Lagoa, Rio de Janeiro...",-22.978197,-43.199374,b'\x01\x01\x00\x00\x00\x97\xef\xd4\x19\x85\x99...,Lagoa,948.06301,437.351027,767.352721,20.550754,99.987981,99.915865,99.987981
2,https://www.zapimoveis.com.br/imovel/aluguel-a...,"Rua Pinto Teles, 660 - Praça Seca, Rio de Jane...",1100.0,60.0,2.0,1,409.0,162.0,False,False,False,False,False,False,False,residencial,apartamento,"Rua Pinto Teles, Praça Seca, Rio de Janeiro, R...",-22.889191,-43.345604,b'\x01\x01\x00\x00\x00\\1(\xc2<\xacE\xc0{\xa2\...,Praça Seca,8011.304617,133.018579,182.882683,3.681449,99.660238,94.377183,98.861081


Porcentagem de informações faltantes: 


url                        0.0 %
endereco                   0.0 %
preco                      0.0 %
area                       0.0 %
quartos                    0.0 %
vagas_de_carro             0.0 %
valor_condominio           0.0 %
iptu                       0.0 %
mobiliado                  0.0 %
piscina                    0.0 %
condominio                 0.0 %
elevador                   0.0 %
jardim                     0.0 %
quadra_esportiva           0.0 %
academia                   0.0 %
finalidade                 0.0 %
tipo                       0.0 %
localizacao                0.0 %
latitude                   0.0 %
longitude                  0.0 %
geometry                   0.0 %
nome                       0.0 %
distancia_metro            0.0 %
distancia_escola           0.0 %
distancia_unidade_saude    0.0 %
indic_rend                 0.0 %
indic_lixo                 0.0 %
indic_esgo                 0.0 %
indic_agua                 0.0 %
dtype: object

In [12]:
colunas_analise = ['preco', 'area', 'quartos', 'vagas_de_carro', 'valor_condominio',
                   'iptu', "mobiliado", "tipo", "distancia_metro", 'distancia_escola', 'distancia_unidade_saude',
                   'indic_rend', 'indic_lixo', 'indic_esgo', 'indic_agua']

display(dataframe_final[colunas_analise].info())

X = dataframe_final[colunas_analise].drop('preco', axis=1)  # Remover a coluna 'preco' de X
y = dataframe_final['preco']

<class 'pandas.core.frame.DataFrame'>
Index: 518 entries, 0 to 1606
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   preco                    518 non-null    float64
 1   area                     518 non-null    float64
 2   quartos                  518 non-null    float64
 3   vagas_de_carro           518 non-null    int64  
 4   valor_condominio         518 non-null    float64
 5   iptu                     518 non-null    float64
 6   mobiliado                518 non-null    bool   
 7   tipo                     518 non-null    object 
 8   distancia_metro          518 non-null    float64
 9   distancia_escola         518 non-null    float64
 10  distancia_unidade_saude  518 non-null    float64
 11  indic_rend               518 non-null    float64
 12  indic_lixo               518 non-null    float64
 13  indic_esgo               518 non-null    float64
 14  indic_agua               518 n

None

In [13]:
# Separar colunas numéricas, categóricas e booleanas
colunas_numericas = X.select_dtypes(include=[np.number]).columns.tolist()
colunas_categoricas = X.select_dtypes(include=[object]).columns.tolist()
colunas_booleanas = X.select_dtypes(include=[bool]).columns.tolist()

# Criar o pré-processador com StandardScaler para colunas numéricas e OneHotEncoder para colunas categóricas e booleanas
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), colunas_numericas),
        ('cat', OneHotEncoder(handle_unknown='ignore'), colunas_categoricas + colunas_booleanas)
    ])

# Aplicar o pré-processador nos dados de treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)

# Converter `y` para numpy array (necessário para Keras)
y_train = np.array(y_train)
y_test = np.array(y_test)

In [14]:
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=preprocessor.get_feature_names_out())
pd.set_option('display.max_columns', None)
print(X_train_scaled_df.shape)
display(X_train_scaled_df.head(2))


(414, 16)


Unnamed: 0,num__area,num__quartos,num__vagas_de_carro,num__valor_condominio,num__iptu,num__distancia_metro,num__distancia_escola,num__distancia_unidade_saude,num__indic_rend,num__indic_lixo,num__indic_esgo,num__indic_agua,cat__tipo_apartamento,cat__tipo_casa,cat__mobiliado_False,cat__mobiliado_True
0,0.235705,0.54945,-0.031817,0.504608,-0.162708,-0.666419,-0.110196,-0.234587,-0.093234,0.609032,0.766037,0.72821,1.0,0.0,1.0,0.0
1,0.041474,-0.538935,-0.031817,-0.978665,-0.6816,-0.508302,-0.266945,-0.430312,-1.092935,0.254257,0.69483,0.619705,0.0,1.0,1.0,0.0


In [15]:
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=preprocessor.get_feature_names_out())
pd.set_option('display.max_columns', None)
print(X_test_scaled_df.shape)
display(X_test_scaled_df.head(2))

(104, 16)


Unnamed: 0,num__area,num__quartos,num__vagas_de_carro,num__valor_condominio,num__iptu,num__distancia_metro,num__distancia_escola,num__distancia_unidade_saude,num__indic_rend,num__indic_lixo,num__indic_esgo,num__indic_agua,cat__tipo_apartamento,cat__tipo_casa,cat__mobiliado_False,cat__mobiliado_True
0,-0.670707,-0.538935,-0.031817,-0.718202,-0.13396,-0.658229,-0.169379,-0.938272,-1.396521,0.509557,0.386765,0.612578,1.0,0.0,1.0,0.0
1,-0.735451,-0.538935,-1.229305,-0.640519,-0.910143,0.960242,-1.093631,0.305293,-1.358502,-1.3923,-0.124527,-0.654095,1.0,0.0,1.0,0.0


In [16]:
X_train_scaled_df["Output"] = y_train
X_data_scaled_df = pd.concat([X_train_scaled_df, X_test_scaled_df])

X_data_scaled_df["Output"] = X_data_scaled_df["Output"].fillna("Predict this value")
display(X_data_scaled_df.head())

Unnamed: 0,num__area,num__quartos,num__vagas_de_carro,num__valor_condominio,num__iptu,num__distancia_metro,num__distancia_escola,num__distancia_unidade_saude,num__indic_rend,num__indic_lixo,num__indic_esgo,num__indic_agua,cat__tipo_apartamento,cat__tipo_casa,cat__mobiliado_False,cat__mobiliado_True,Output
0,0.235705,0.54945,-0.031817,0.504608,-0.162708,-0.666419,-0.110196,-0.234587,-0.093234,0.609032,0.766037,0.72821,1.0,0.0,1.0,0.0,4500.0
1,0.041474,-0.538935,-0.031817,-0.978665,-0.6816,-0.508302,-0.266945,-0.430312,-1.092935,0.254257,0.69483,0.619705,0.0,1.0,1.0,0.0,2500.0
2,-0.605964,-0.538935,-1.229305,-0.494294,3.387606,-0.646917,-0.072459,-0.630267,-0.093234,0.609032,0.766037,0.72821,1.0,0.0,1.0,0.0,6000.0
3,-0.631861,-0.538935,-0.031817,-0.696268,-0.848335,-0.600879,0.103189,-0.896902,-1.162962,0.269438,0.733424,0.837265,1.0,0.0,1.0,0.0,1800.0
4,-0.308143,-0.538935,-0.031817,-0.377314,0.905261,0.619339,0.319458,0.017801,-1.007971,0.428961,-0.429964,0.334479,1.0,0.0,1.0,0.0,2500.0


In [64]:
def dataframe_to_string(df, feature_prefix="Feature", output_column="Output"):

    exemplars = []
    for _, row in df.iterrows():
        exemplar = []
        exemplar.append(f"row: {_}")
        for i, col in enumerate(df.columns):            
            if col == output_column:
                exemplar.append(f"{output_column}: {row[col]}")
            else:
                exemplar.append(f"{feature_prefix} {i + 1}: {row[col]}")
        exemplars.append("\n".join(exemplar))
    
    exemplars.append("The task is to provide your best estimate for Output without values. Please provide that and only that, without any additional text. Return me a list with predict values, they are 104 values if needs to be predict. Please returme ALL the 104 values, not less not more. Its very important to returme 104 values based in non information values set for me. I need exactly 104 values, count for me")
    
    return "\n\n".join(exemplars)

result_string = dataframe_to_string(X_data_scaled_df)
print(result_string)

row: 0
Feature 1: 0.2357046385669746
Feature 2: 0.5494504438811783
Feature 3: -0.031817298982419924
Feature 4: 0.5046075702053873
Feature 5: -0.1627079782628212
Feature 6: -0.6664192044076606
Feature 7: -0.11019647779882742
Feature 8: -0.2345868856026287
Feature 9: -0.09323369456186016
Feature 10: 0.6090321074382763
Feature 11: 0.7660369327722384
Feature 12: 0.728210387578634
Feature 13: 1.0
Feature 14: 0.0
Feature 15: 1.0
Feature 16: 0.0
Output: 4500.0

row: 1
Feature 1: 0.041473507263775015
Feature 2: -0.5389346459121604
Feature 3: -0.031817298982419924
Feature 4: -0.9786654408912379
Feature 5: -0.6816000528465461
Feature 6: -0.5083016815458964
Feature 7: -0.2669446447613616
Feature 8: -0.4303116261709047
Feature 9: -1.0929346407546818
Feature 10: 0.2542569757224983
Feature 11: 0.6948299884963093
Feature 12: 0.6197049676835962
Feature 13: 0.0
Feature 14: 1.0
Feature 15: 1.0
Feature 16: 0.0
Output: 2500.0

row: 2
Feature 1: -0.6059635970802235
Feature 2: -0.5389346459121604
Feature 3:

In [53]:
client = OpenAI(api_key="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")

def _make_a_request_to_openai_api(messages, model="gpt-3.5-turbo", temperature=1, max_completion_tokens=2048, top_p=1, frequency_penalty=0, presence_penalty=0):

  response = client.chat.completions.create(
    model=model,
    messages=[
          {"role": "user", "content": messages}
      ],
    response_format={
      "type": "text"
    },
    temperature=temperature,
    max_completion_tokens=max_completion_tokens,
    top_p=top_p,
    frequency_penalty=frequency_penalty,
    presence_penalty=presence_penalty
  )
  return response

response = _make_a_request_to_openai_api(messages="Do you help me make a prediction? I Will pass a lines for you and the Output with 'Make a Prediction' do you make a prediction. Give me a list of predction values, ok? ", model="gpt-4o-mini")
response

ChatCompletion(id='chatcmpl-ApNVvCcxuOgZSZjUd4l2WLzYSHTVs', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="Sure! Please provide the lines or data you'd like me to work with, and then indicate when you're ready for me to make a prediction. I'll generate a list of prediction values based on the information you provide.", refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1736808479, model='gpt-4o-mini-2024-07-18', object='chat.completion', service_tier='default', system_fingerprint='fp_bd83329f63', usage=CompletionUsage(completion_tokens=43, prompt_tokens=49, total_tokens=92, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0)))

In [65]:
response =  _make_a_request_to_openai_api(messages=result_string, model="chatgpt-4o-latest", temperature=1)

In [66]:
dict_response = response.dict()
text_response = dict_response["choices"][0]["message"]["content"]

text_response

/tmp/ipykernel_59175/196840809.py:1: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  dict_response = response.dict()


'Sure, here are the 104 values predicted based on the given features:\n\n[2100.0, 1100.0, 1800.0, 17900.0, 2300.0, 14500.0, 10500.0, 9800.0, 3200.0, 14500.0, 3000.0, 3500.0, 5500.0, 6000.0, 4000.0, 3900.0, 5300.0, 2900.0, 12000.0, 4600.0, 19000.0, 4700.0, 4300.0, 2400.0, 10000.0, 14000.0, 5000.0, 3900.0, 750.0, 10500.0, 1600.0, 4700.0, 6800.0, 13000.0, 2300.0, 15750.0, 12800.0, 8850.0, 6900.0, 9100.0, 9100.0, 9700.0, 8900.0, 7500.0, 18000.0, 10250.0, 13797.0, 2700.0, 4000.0, 8700.0, 2350.0, 2650.0, 11000.0, 5900.0, 1400.0, 19000.0, 9300.0, 10900.0, 1800.0, 8000.0, 12000.0, 2200.0, 18700.0, 11900.0, 1100.0, 4200.0, 3200.0, 1700.0, 1900.0, 2700.0, 4600.0, 950.0, 4300.0, 2100.0, 3350.0, 12500.0, 5450.0, 13800.0, 14950.0, 11950.0, 17250.0, 11700.0, 21365.0, 2450.0, 3800.0, 12750.0, 1900.0, 8900.0, 8800.0, 8300.0, 3450.0, 7600.0, 5300.0, 9400.0, 4300.0, 22100.0, 9850.0, 4400.0, 17800.0, 2250.0, 15600.0, 7700.0, 19800.0]'

In [67]:
text_pred = text_response.split(",")

import re

# Regex para capturar números com pontos decimais, se presentes
y_pred = []

for value in text_pred:
    match = re.match(r'^[0-9]+(\.[0-9]+)?$', value.strip())  # Verifica se é um número válido
    if match:
        y_pred.append(float(value.strip()))  # Converte para float e adiciona à lista

print(len(y_pred))

101


In [69]:
print(len(y_test), len(y_pred))

mae = mean_absolute_error(y_test[:101], y_pred)
rmse = np.sqrt(mean_squared_error(y_test[:101], y_pred))
r2 = r2_score(y_test[:101], y_pred)

# Exibindo as métricas de forma mais bonita
print(f"{'Métricas de Avaliação':^40}")
print(f"{'-'*40}")
print(f"{'MAE':<10}: {mae:.4f}")
print(f"{'RMSE':<10}: {rmse:.4f}")
print(f"{'R²':<10}: {r2:.4f}")
print(f"{'-'*40}")

104 101
         Métricas de Avaliação          
----------------------------------------
MAE       : 5529.8218
RMSE      : 7335.5980
R²        : -0.9031
----------------------------------------
