In [None]:
%load_ext autoreload
%autoreload 2

import datetime
import hashlib
import os
import re
import time
from typing import Optional

import chromadb
import requests
from chromadb import Documents, EmbeddingFunction, Embeddings
from google import genai
from google.api_core import retry
from google.genai import types
from pydantic import BaseModel
from pydantic import HttpUrl

from utils import (
    check_crawl_permission,
    fetch_html,
    filter_unique_ids,
    fix_json,
    geocode_address,
    get_price_point,
    get_public_transport_stations,
    get_secret,
    is_retriable,
    offer_to_text,
    preprocess_html,
    remove_url_parameters,
    extract_adresse_urls,
    chromadb_check_if_document_exists,
    fetch_and_preprocess
)


In [None]:

# Runtime configuration: environment-driven settings, secrets, and the Gemini client.
#
# bot page
# https://api.telegram.org/bot{telegram_token}/getUpdates

# Both settings can be overridden through environment variables; the second
# argument to os.getenv is the fallback used when the variable is unset.
profile_name = os.getenv("AWS_PROFILE", "priv")
chromadb_ip = os.getenv("CHROMADB_IP", "3.124.214.10")

# Secrets are looked up through the project helper `get_secret`.
# The previous chained assignments (`telegram_token = api_key = ...`) also bound a
# generic `api_key` name that ended up holding the Telegram chat id; that
# accidental alias has been dropped so each secret lives under exactly one name.
telegram_token = get_secret(
    secret_id="telegram-274181059559", key="TOKEN", profile_name=profile_name
)

telegram_chat_id = get_secret(
    secret_id="telegram-274181059559", key="CHAT_ID", profile_name=profile_name
)

genai_api_key = get_secret(
    secret_id="gemini-274181059559", key="GOOGLE_API_KEY", profile_name=profile_name
)

# Shared Gemini client, reused by the cells below.
client = genai.Client(api_key=genai_api_key)

In [None]:
from main import setup_vector_database


In [None]:
# Build the vector-store handle via the project's `setup_vector_database`,
# passing the configured ChromaDB address and the Gemini client.
# NOTE(review): presumably returns a ChromaDB collection — confirm in main.py.
collection = setup_vector_database(
    ip=chromadb_ip,
    client=client,
)

In [None]:
# Inspection only: show the type of the object returned by setup_vector_database.
type(collection)

In [None]:

# Instruction prompt for extracting structured listing data from cleaned HTML.
# The only placeholder is `{html_content}`, which receives the listing text.
# NOTE(review): the numbered instructions do not cover the `floor` field that the
# `Offers` schema and EXAMPLE_TEXT both include — confirm whether that is intended.
PROMPT_TEMPLATE = """
You are an expert in extracting apartment listings from cleaned HTML text. Your task is to extract key structured information and present it in **valid JSON format**.

Please follow these instructions **precisely**:

1. **Translate all text to English**, except for the **Address**, which must remain in its original language.
2. **Create a detailed apartment description** based on the listing, covering:
   - Natural light: Is it bright, which directions (e.g., east-facing)?
   - Condition: Is it newly built, recently renovated, or older but well-maintained?
   - View: What can be seen from the apartment? (e.g., courtyard, street, green area)
   - Neighborhood: What is mentioned about the area? Is it calm, central, well-connected, or popular?
3. **Address**: Extract in this format: `Street Name Number, PostalCode City, Country`  
   - Do NOT include unit/floor/apartment numbers in the address
4. **Price**: Extract as an integer, no commas or currency signs (e.g., `3250000`). If missing, use `null`.
5. **Area (m2)**: Extract as an integer (e.g., `87`). If missing, use `null`.
6. **Number of Rooms**: Extract total number of rooms as an integer. If missing, use `null`.
7. **Year Built**: Extract the year the building was constructed (e.g., `2006`). If missing, use `null`.
8. **Energy Label**: Extract as a single uppercase letter (`A`, `B`, etc.). If not available, use `null`.
9. **Balcony**: Return `true` if a balcony or terrace is mentioned; otherwise, `false`.
10. **URL**: Extract the full link to the listing.

Ensure the output is **JSON only**, with no explanation or additional text.

Cleaned HTMLs:

{html_content}

JSON output:
"""


# Schema for ONE extracted listing (the class name is plural for historical reasons).
# Comments are used instead of a class docstring on purpose: pydantic copies a
# model's docstring into its JSON schema description, which would alter the schema.
class Offers(BaseModel):
    address: str
    description: str
    # PROMPT_TEMPLATE has no explicit instruction for this field; left unchanged.
    floor: str
    # PROMPT_TEMPLATE tells the model to emit `null` when any of the following
    # values is missing, so the schema has to accept None for them. Previously
    # they were plain `int`/`str`, which rejects a `null` value at validation time.
    price: Optional[int]
    area_m2: Optional[int]
    number_of_rooms: Optional[int]
    year_built: Optional[int]
    energy_label: Optional[str]
    # Kept as a string to match EXAMPLE_TEXT, which shows "true" in quotes.
    # NOTE(review): the prompt text asks for bare true/false — confirm which is wanted.
    balcony: str
    url: str


# Wrapper model holding a list of `Offers` entries under the `offers` key.
class ListOfOffers(BaseModel):
    offers: list[Offers]


# Example of the expected extraction output: a fenced JSON array with one listing.
# The Markdown fence opened by "```json" was never closed in the earlier version;
# the closing fence is now present so the example is well-formed.
EXAMPLE_TEXT = """
```json
[
    {
        "address": "Engholmene, 2450 København SV, Denmark",
        "description": "Apartment boasts abundant natural light and a spacious west-facing balcony overlooking the canal and marina. The contemporary interior is move-in ready, featuring high-quality materials. The neighborhood offers plenty of greenery, cafés, promenades, and convenient metro access.",
        "floor": "5",
        "price": 6195000,
        "area_m2": 91,
        "number_of_rooms": 2,
        "year_built": 2019,
        "energy_label": "A",
        "balcony": "true",
        "url": "https://www.boligsiden.dk/adresse/engholmene-2450-koebenhavn-sv-eksempel"
    }
]
```
"""


# Search-results URL template for boligsiden.dk. The path selects
# `villa,ejerlejlighed`; the query string carries an ascending sort flag, map
# bounds, a maximum price of 7000000 and a closed polygon of coordinate pairs.
# `{page}` is the only placeholder and selects the results page.
# NOTE(review): the polygon looks like lon,lat pairs around Copenhagen — confirm.
BASE_URL = (
    "https://www.boligsiden.dk/tilsalg/villa,ejerlejlighed?sortAscending=true"
    "&mapBounds=7.780294,54.501948,15.330305,57.896401&priceMax=7000000"
    "&polygon=12.555001,55.714439|12.544964,55.711152|12.535566,55.708713|12.523383,55.700403|"
    "12.513564,55.690885|12.507604,55.674192|12.508089,55.656840|12.521769,55.648585|"
    "12.534702,55.642731|12.564876,55.614388|12.591917,55.614270|12.599055,55.649692|"
    "12.605518,55.649361|12.615303,55.649093|12.628699,55.649335|12.641590,55.649906|"
    "12.636977,55.665739|12.626008,55.676732|12.636641,55.686489|12.654036,55.720127|"
    "12.602392,55.730897|12.555001,55.714439&page={page}"
)


In [None]:
# Fetch page 1 of the search results described by BASE_URL.
first_page_url = BASE_URL.format(page=1)
html_content = fetch_html(first_page_url)

In [None]:
urls = extract_adresse_urls(html_content)

# Pair every listing URL with a short id: 8 bytes (16 hex characters) of
# SHAKE-128 computed over the URL's string form.
urls_hash = []
for candidate in urls:
    digest = hashlib.shake_128(str(candidate).encode()).hexdigest(8)
    urls_hash.append((candidate, digest))

# Keep only the URLs whose id is not reported as existing in the collection.
new_urls: list = []
for url, hash_id in urls_hash:
    if chromadb_check_if_document_exists(hash_id, collection):
        continue
    new_urls.append(url)


In [None]:
# Collect the preprocessed text of each new listing, remembering its URL.
offers_source: list[dict] = []

for url in new_urls:
    text = fetch_and_preprocess(url, mode="two_requests")
    if not text:
        # Nothing usable came back for this listing; leave it out.
        continue
    offers_source.append({"url": url, "text": text})

In [None]:
# Per-offer section layout: a separator line, a 1-based offer number, the
# listing URL and the preprocessed listing text.
SOURCE_TEMPLATE = """
---------------------
Offer #{i}
URL: {url}  
SOURCE:
{text}
"""

# Render one section per collected offer and concatenate them in order.
SOURCE = "".join(
    SOURCE_TEMPLATE.format(i=position, **offer)
    for position, offer in enumerate(offers_source, start=1)
)



In [None]:
# Inspection only: display the assembled SOURCE text.
SOURCE