In [None]:
# %cd ..

In [None]:

import datetime
import hashlib
import os
import time
from typing import Any

import chromadb
import requests
from chromadb import Documents, EmbeddingFunction, Embeddings
from google import genai
from google.api_core import retry
from google.genai import types
from pydantic import BaseModel
from utils import (
    add_offers_to_db,
    check_crawl_permission,
    chromadb_check_if_document_exists,
    create_offer_text,
    extract_adresse_urls,
    fetch_and_preprocess,
    fetch_html,
    fix_json,
    geocode_address,
    get_price_point,
    get_public_transport_stations,
    get_secret,
    is_retriable,
    offer_to_text,
    remove_url_parameters,
)

In [None]:

# Runtime configuration. The AWS profile and the Chroma host can be overridden
# through the AWS_PROFILE / CHROMADB_IP environment variables.
profile_name = os.getenv("AWS_PROFILE", "priv")
chromadb_ip = os.getenv("CHROMADB_IP", "3.124.214.10")

# Telegram credentials, read through get_secret (imported from utils).
# The previous chained assignments (`telegram_token = api_key = ...`) also bound
# a stray `api_key` global that ended up holding the Telegram chat id; that
# copy-paste leftover is removed so nobody mistakes it for the Gemini key.
telegram_token = get_secret(
    secret_id="telegram-274181059559", key="TOKEN", profile_name=profile_name
)
telegram_chat_id = get_secret(
    secret_id="telegram-274181059559", key="CHAT_ID", profile_name=profile_name
)

# Gemini API key and the client built from it, used for embeddings below.
genai_api_key = get_secret(
    secret_id="gemini-274181059559", key="GOOGLE_API_KEY", profile_name=profile_name
)
client = genai.Client(api_key=genai_api_key)


In [None]:

class GeminiEmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function that delegates to a Gemini client.

    The instance keeps the client it was given and a ``document_mode`` flag
    that selects the task type sent with each embedding request.
    """

    def __init__(self, client: genai.Client, *args, **kwargs):
        """Store the Gemini client and start in document mode.

        Any extra positional/keyword arguments are forwarded unchanged to the
        base class initializer.
        """
        self.client = client
        # True -> "retrieval_document", False -> "retrieval_query".
        self.document_mode = True
        super().__init__(*args, **kwargs)

    @retry.Retry(predicate=is_retriable)
    def __call__(self, input: Documents) -> Embeddings:
        """Embed ``input`` and return one vector per embedding in the response.

        The call is wrapped in ``retry.Retry`` and is retried for errors that
        ``is_retriable`` accepts.
        """
        if self.document_mode:
            task_type = "retrieval_document"
        else:
            task_type = "retrieval_query"

        embed_config = types.EmbedContentConfig(task_type=task_type)
        result = self.client.models.embed_content(
            model="models/text-embedding-004",
            contents=input,
            config=embed_config,
        )

        vectors = []
        for embedding in result.embeddings:
            vectors.append(embedding.values)
        return vectors

# Name of the Chroma collection this notebook works against.
DB_NAME = "real-estate-offers-v2"

# Build the Gemini client and an embedder around it; document mode is set
# explicitly so the embedder uses the "retrieval_document" task type.
client = genai.Client(api_key=genai_api_key)
embed_fn = GeminiEmbeddingFunction(client)
embed_fn.document_mode = True

# Connect to the remote Chroma server and open (or create) the collection,
# attaching the embedder so Chroma can embed texts itself.
chroma_client = chromadb.HttpClient(host=chromadb_ip)
collection = chroma_client.get_or_create_collection(
    embedding_function=embed_fn,
    name=DB_NAME,
)

In [None]:
# Number of records in the collection currently bound to `collection`.
collection.count()

In [None]:
# chroma_client.delete_collection(DB_NAME)

# OLD CODE

In [None]:
%load_ext autoreload
%autoreload 2

from main import EXAMPLE_TEXT, PROMPT_TEMPLATE, system_instruction_template, BASE_URL


In [None]:
# Rebind `collection` to whatever setup_vector_database returns for the Chroma
# host in `chromadb_ip`, passing the Gemini client along.
# NOTE(review): setup_vector_database is neither imported nor defined in any
# visible cell, so this raises NameError on a fresh kernel — confirm where it
# lives (presumably main or utils) and import it.
collection = setup_vector_database(
    ip=chromadb_ip,
    client=client,
)
print("Vector database initialized")

In [None]:
# Scraper settings.
MAX_RETRIES = 3  # Number of times to retry a page
# Overridable through the NUMBER_OF_PAGES_TO_OPEN environment variable.
NUMBER_OF_PAGES_TO_OPEN = int(os.getenv("NUMBER_OF_PAGES_TO_OPEN", "2"))
GET_OFFERS_FROM_X_LAST_MIN = 5

# Accumulator for scraped offers; starts empty.
all_results = []

In [None]:
# Fill the `{page}` placeholder of BASE_URL with page 1 and pass the resulting
# URL, the prompt template, the example text and the Gemini client to
# summarize_webpage.
# NOTE(review): summarize_webpage is neither imported nor defined in any
# visible cell — confirm its origin and import it explicitly.
page_url = BASE_URL.format(page=1)

offers = summarize_webpage(
    page_url, PROMPT_TEMPLATE, EXAMPLE_TEXT, client
)

In [None]:
# Number of items summarize_webpage returned for the page.
len(offers)

In [None]:
# Peek at the first returned item (raises IndexError if `offers` is empty).
offers[0]

In [None]:
import datetime
import hashlib
import json
import re
import statistics
from urllib.parse import urljoin, urlparse
from urllib.request import urlopen

import boto3
import chromadb
import requests
from chromadb import Documents, EmbeddingFunction, Embeddings
from google import genai
from google.api_core import retry
from google.genai import types
from osmnx import features_from_point
from pydantic import BaseModel

def get_secret(secret_id, key=None, profile_name=None):
    """Read a JSON secret from AWS Secrets Manager (region eu-central-1).

    Parameters:
        secret_id: Identifier passed as ``SecretId`` to ``get_secret_value``.
        key: Optional key to look up in the decoded secret. When falsy, the
            whole decoded dictionary is returned.
        profile_name: Optional AWS profile. When given, the boto3 default
            session is (re)configured with it before the client is created.

    Returns:
        The value stored under ``key``, or the full decoded secret.

    Raises:
        KeyError: If ``key`` is given but absent from the secret.
    """
    if profile_name:
        # This changes boto3's process-wide default session.
        boto3.setup_default_session(profile_name=profile_name)

    manager = boto3.client("secretsmanager", region_name="eu-central-1")
    raw_secret = manager.get_secret_value(SecretId=secret_id)["SecretString"]
    payload = json.loads(raw_secret)
    return payload[key] if key else payload


# Rebind the global `client` to a Gemini client built from the key returned by
# the get_secret defined just above.
# NOTE(review): the profile is hard-coded to "priv" here, whereas the
# configuration cell reads it from AWS_PROFILE — confirm this is intended.
client = genai.Client(
    api_key=get_secret(
        secret_id="gemini-274181059559", key="GOOGLE_API_KEY", profile_name="priv"
    )
)

In [None]:

def is_retriable(e: Exception) -> bool:
    """
    Tell whether an exception is worth retrying.

    Parameters:
        e (Exception): The exception raised by the wrapped call.

    Returns:
        bool: True only for a ``genai.errors.APIError`` whose ``code`` is
        429 or 503; False for everything else.
    """
    if not isinstance(e, genai.errors.APIError):
        return False

    retriable_codes = {429, 503}
    return e.code in retriable_codes


class GeminiEmbeddingFunction(EmbeddingFunction):
    """
    Embedding function that produces text embeddings through the Gemini API.

    Unlike the variant defined earlier in this notebook, this one takes no
    constructor argument and uses the module-level ``client``.
    """

    # True -> "retrieval_document" task type, False -> "retrieval_query".
    document_mode: bool = True

    @retry.Retry(predicate=is_retriable)
    def __call__(self, input: Documents) -> Embeddings:
        """
        Embed the given documents or queries.

        Parameters:
            input (Documents): Texts to embed.

        Returns:
            Embeddings: One vector per embedding in the API response.
        """
        if self.document_mode:
            task_type = "retrieval_document"
        else:
            task_type = "retrieval_query"

        embed_config = types.EmbedContentConfig(task_type=task_type)
        result = client.models.embed_content(
            model="models/text-embedding-004",
            contents=input,
            config=embed_config,
        )

        vectors = []
        for embedding in result.embeddings:
            vectors.append(embedding.values)
        return vectors


In [None]:
# Rebinds DB_NAME, embed_fn, chroma_client and collection to the older
# "real-estate-offers" collection (the cell near the top uses
# "real-estate-offers-v2").
# The no-argument constructor only matches the GeminiEmbeddingFunction variant
# that reads the global `client`; the variant taking a client would fail here.
# NOTE(review): the host is hard-coded although `chromadb_ip` holds the same
# default and can be overridden via CHROMADB_IP — confirm which is wanted.
DB_NAME = "real-estate-offers"
embed_fn = GeminiEmbeddingFunction()
embed_fn.document_mode = True
chroma_client = chromadb.HttpClient(host="3.124.214.10", port=8000)
collection = chroma_client.get_or_create_collection(
    name=DB_NAME, embedding_function=embed_fn
)

In [None]:
# Show the collections known to the connected Chroma server.
chroma_client.list_collections()

In [None]:
# Record count of the collection currently bound to `collection`.
collection.count()

In [None]:
# chroma_client.delete_collection("real-estate-offers-v2")

In [None]:
now = datetime.datetime.now()

# Lower bound for `create_date`: 350 minutes before `now`, as a Unix timestamp.
cutoff_ts = (now - datetime.timedelta(minutes=350)).timestamp()

# Metadata filter: created after the cutoff, flagged with subways, and at
# least three rooms.
offer_filter = {
    "$and": [
        {"create_date": {"$gt": cutoff_ts}},
        {"subways": {"$eq": True}},
        {"number_of_rooms": {"$gte": 3}},
    ]
}

matching = collection.get(include=["metadatas"], where=offer_filter)

# Keep at most the first ten metadata dicts, in the order Chroma returned them.
newest_results = matching["metadatas"][:10]

In [None]:
# Display the filtered offer metadata.
newest_results

In [None]:
%load_ext autoreload
%autoreload 2

from app import get_price_point

# Calculate price point for each new offer
# Each metadata dict is mutated in place: the value returned by
# get_price_point(offer, collection) is stored under "price_point".
for offer in newest_results:
    offer["price_point"] = get_price_point(offer, collection)

# Sort in place, ascending by price point; a missing key counts as 0.
newest_results.sort(key=lambda x: x.get("price_point", 0))


In [None]:
# Take the entry at index 1 of newest_results as the sample offer and display
# it; raises IndexError when fewer than two offers are present.
offer_dict = newest_results[1]
offer_dict

In [None]:

import re

def create_offer_text(offer_dict) -> str:
    """
    Render a property offer as a multi-line text block.

    When ``offer_dict['subways']`` is truthy, every ``subways:<name>;`` entry
    found in ``offer_dict['public_transport_text']`` is collected and the names
    are joined with ", ". Otherwise the subway line is left empty.

    Args:
        offer_dict: Mapping that supplies the template fields ``address``,
            ``area_m2``, ``number_of_rooms``, ``year_built``, ``energy_label``,
            ``price``, ``price_point``, ``url`` and ``public_transport_text``.

    Returns:
        The formatted text. It starts with a newline and ends with a newline
        followed by four spaces, mirroring the template layout.

    Raises:
        KeyError: If a field used by the template is missing.
    """
    subway_names = []
    if offer_dict.get('subways', False):
        # Capture the text between "subways:" and the next ";".
        subway_names = re.findall(
            r'(?<=subways:)([^;]+)(?=;)',
            offer_dict['public_transport_text'],
        )

    # The empty first element and the four-space last element reproduce the
    # leading newline and the trailing indentation of the rendered text.
    template = "\n".join(
        [
            "",
            "Address: {address}",
            "Size: {area_m2} m2, Rooms: {number_of_rooms}, Year: {year_built}, Energy: {energy_label}",
            "Price: {price:,} DKK ({price_point:.2%})",
            "Subway(s): {subways_txt}",
            "Url: {url}",
            "Public transport: {public_transport_text}",
            "    ",
        ]
    )

    return template.format(**offer_dict, subways_txt=", ".join(subway_names))

# Print the rendered sample offer to check the layout.
print(create_offer_text(offer_dict))

In [None]:
# Text of the sample offer, used as the Telegram message body in the next cell.
message = create_offer_text(offer_dict)

In [None]:
# Send the formatted offer to Telegram through the Bot API `sendMessage` method.
#
# The bot token is taken from the secret loaded in the configuration cell
# instead of being hard-coded in the notebook.
# NOTE(review): the token that used to be committed in this cell must be
# treated as leaked — revoke it and rotate the secret.
TOKEN = telegram_token

url = f"https://api.telegram.org/bot{TOKEN}/sendMessage"

# The fields are passed via `params` so that requests URL-encodes them. The
# message contains newlines and a listing URL; when it was spliced directly
# into the query string, any `&` or `#` in it cut the text short.
# NOTE(review): `telegram_chat_id` is presumably the same chat as the id that
# was hard-coded here before — confirm against the secret.
requests.get(
    url,
    params={"chat_id": telegram_chat_id, "text": message},
    timeout=10,  # do not let a stalled connection hang the kernel
).json()

In [None]:
# Add historical listings to vector database
pprint(f"Adding {len(all_results)} historical listings to vector database")
add_offers_to_db(collection, all_results)

# Fetch newest offers from page 1
pprint("Fetching newest property listings")
newest_results = []
page = 1
page_url = BASE_URL.format(page=page)
offers = summarize_webpage(page_url, PROMPT_TEMPLATE, EXAMPLE_TEXT)
newest_results.extend(offers)

# Calculate price point for each new offer
pprint("Calculating price points for new listings")


In [None]:
# Ad-hoc similarity search for "balcony", limited to records whose metadata has
# subways == True and whose document text contains "Runddel".
collection.query(
    query_texts=["balcony"], # Chroma will embed this for you
    n_results=2, # how many results to return
    where = {"subways":  {"$eq": True}},
    where_document={"$contains": "Runddel"}
)

In [None]:
# Fetch a single record (skipping the first one) together with its metadata,
# document text and embedding, to inspect what is stored.
collection.get(
        include=["metadatas", "documents", "embeddings"],
        limit=1,
        offset=1)

In [None]:
# Same inspection as above, but against `col`.
# NOTE(review): `col` is only assigned in a later cell (the one that creates
# the "test" collection), so this fails with NameError when the notebook is
# run top to bottom.
col.get(
        include=["metadatas", "documents", "embeddings"],
        limit=1,
        offset=1)

In [None]:
# Remove the "test" collection from the Chroma server.
# NOTE(review): that collection is created in a later cell — confirm the
# intended execution order.
chroma_client.delete_collection("test")

In [None]:
# Preview the first two metadata dicts built from `dates`.
# NOTE(review): `dates` is only defined in a later cell, so this fails with
# NameError on a top-to-bottom run.
[{"create_date": date} for date in dates][:2]

In [None]:
    # Listing-search URL template on boligsiden.dk for the property types
    # "villa,ejerlejlighed", with priceMax=7000000, fixed map bounds and a
    # polygon filter. `{page}` is filled in with str.format(page=...).
    # This rebinds the BASE_URL name imported from main in an earlier cell.
    # NOTE(review): the cell is indented as if copied from inside a function;
    # plain Python rejects a leading indent at top level — consider dedenting.
    BASE_URL = (
        "https://www.boligsiden.dk/tilsalg/villa,ejerlejlighed?sortAscending=true"
        "&mapBounds=7.780294,54.501948,15.330305,57.896401&priceMax=7000000"
        "&polygon=12.555001,55.714439|12.544964,55.711152|12.535566,55.708713|12.523383,55.700403|"
        "12.513564,55.690885|12.507604,55.674192|12.508089,55.656840|12.521769,55.648585|"
        "12.534702,55.642731|12.564876,55.614388|12.591917,55.614270|12.599055,55.649692|"
        "12.605518,55.649361|12.615303,55.649093|12.628699,55.649335|12.641590,55.649906|"
        "12.636977,55.665739|12.626008,55.676732|12.636641,55.686489|12.654036,55.720127|"
        "12.602392,55.730897|12.555001,55.714439&page={page}"
    )

In [None]:
import datetime

def iso_date(epoch_seconds):
    """Return the ISO 8601 string for a Unix timestamp, in local time.

    The timestamp is converted with ``datetime.fromtimestamp`` without a
    timezone, so the result is a naive local-time representation.
    """
    moment = datetime.datetime.fromtimestamp(epoch_seconds)
    return moment.isoformat()


In [None]:
# Current local date/time as a Unix timestamp (float seconds).
datetime.datetime.today().timestamp()

In [None]:
import uuid
import chromadb

import datetime
import random

# Reference points for the synthetic data: the current time and 14 days earlier.
now = datetime.datetime.now()
two_weeks_ago = now - datetime.timedelta(days=14)

# 100 Unix timestamps (whole seconds), each a random whole number of days
# (0-14 inclusive) after `two_weeks_ago`. No seed is set, so values differ
# between runs.
dates = [
    int((two_weeks_ago + datetime.timedelta(days=random.randint(0, 14))).timestamp())
    for _ in range(100)
]

# convert epoch seconds to iso format

def iso_date(epoch_seconds):
    """Return the ISO 8601 string for a Unix timestamp, in local time.

    Redefines the identical helper from an earlier cell so that this cell can
    be run on its own.
    """
    moment = datetime.datetime.fromtimestamp(epoch_seconds)
    return moment.isoformat()

# Scratch experiment: check that numeric `create_date` metadata can be filtered
# with "$gt".
# Open (or create) the "test" collection; no embedding function is passed here.
col = chroma_client.get_or_create_collection("test")

# Insert 100 placeholder documents with random UUID ids, each tagged with one of
# the synthetic timestamps from `dates`. Re-running the cell adds another 100.
col.add(ids=[f"{uuid.uuid4()}" for _ in range(100)], documents=[
    f"document {i}" for i in range(100)], metadatas=[{"create_date": date} for date in dates])

# Fetch only the records whose create_date is later than seven days before `now`.
res = col.get(where={"create_date": {"$gt": (now - datetime.timedelta(days=7)).timestamp()}})

# Print the matching timestamps in ISO format for a visual check.
for i in res['metadatas']:
    print(iso_date(i['create_date']))