In [None]:
import datetime
import hashlib
import json
import re
import statistics
from urllib.parse import urljoin, urlparse
from urllib.request import urlopen

import boto3
import chromadb
import requests
from chromadb import Documents, EmbeddingFunction, Embeddings
from google import genai
from google.api_core import retry
from google.genai import types
from osmnx import features_from_point
from pydantic import BaseModel

def get_secret(secret_id, key=None, profile_name=None, region_name="eu-central-1"):
    """
    Read a JSON-encoded secret from AWS Secrets Manager.

    Parameters:
        secret_id (str): Identifier of the secret, passed to ``get_secret_value``
            as ``SecretId``.
        key (str, optional): Entry to pick out of the decoded secret. When it is
            omitted (or otherwise falsy) the whole decoded secret is returned.
        profile_name (str, optional): AWS profile to install as boto3's default
            session before the client is created. When falsy, whatever default
            session boto3 already has is used.
        region_name (str): Region of the Secrets Manager endpoint. Defaults to
            "eu-central-1", the value that used to be hard-coded here.

    Returns:
        The value stored under ``key`` when ``key`` is given, otherwise the
        result of ``json.loads`` on the secret's ``SecretString``.

    Raises:
        KeyError: If the response has no ``SecretString`` entry, or if ``key`` is
            not present in the decoded secret (assuming it decodes to a JSON
            object). Errors raised by boto3 itself propagate unchanged.
    """
    if profile_name:
        # This replaces boto3's module-level default session, so it also
        # affects boto3 calls made after this function returns.
        boto3.setup_default_session(profile_name=profile_name)

    secrets_client = boto3.client("secretsmanager", region_name=region_name)
    response = secrets_client.get_secret_value(SecretId=secret_id)
    secret_dict = json.loads(response["SecretString"])

    return secret_dict[key] if key else secret_dict


# Notebook-wide Gemini API client. The API key is not stored in the notebook:
# it is read at run time from the "GOOGLE_API_KEY" entry of the
# "gemini-274181059559" secret via get_secret, using the "priv" AWS profile.
client = genai.Client(
    api_key=get_secret(
        secret_id="gemini-274181059559", key="GOOGLE_API_KEY", profile_name="priv"
    )
)

In [None]:

def is_retriable(e: Exception) -> bool:
    """
    Retry predicate: tell whether a failed call is worth repeating.

    Parameters:
        e (Exception): The exception raised by the failed call.

    Returns:
        bool: True only when ``e`` is a ``genai.errors.APIError`` whose ``code``
        is 429 or 503; False for every other exception or code.
    """
    # Anything that is not a Gemini API error is never retried.
    if not isinstance(e, genai.errors.APIError):
        return False

    retriable_codes = {429, 503}
    return e.code in retriable_codes


class GeminiEmbeddingFunction(EmbeddingFunction):
    """
    Embedding function that turns text into vectors with the Gemini API.

    Each call sends its input to ``client.models.embed_content`` using the
    "models/text-embedding-004" model. The ``document_mode`` flag selects the
    task type sent with the request.
    """

    # True  -> request "retrieval_document" embeddings.
    # False -> request "retrieval_query" embeddings.
    document_mode: bool = True

    @retry.Retry(predicate=is_retriable)
    def __call__(self, input: Documents) -> Embeddings:
        """
        Embed the given texts with the Gemini embedding model.

        The call is wrapped in ``retry.Retry`` and is repeated when
        ``is_retriable`` accepts the raised exception.

        Parameters:
            input (Documents): The texts to embed (documents or queries,
                depending on ``document_mode``).

        Returns:
            Embeddings: The ``values`` of each embedding in the API response,
            in response order.
        """
        if self.document_mode:
            task_type = "retrieval_document"
        else:
            task_type = "retrieval_query"

        embed_config = types.EmbedContentConfig(task_type=task_type)
        response = client.models.embed_content(
            model="models/text-embedding-004",
            contents=input,
            config=embed_config,
        )

        vectors = []
        for embedding in response.embeddings:
            vectors.append(embedding.values)
        return vectors


In [None]:
# Name of the Chroma collection used by this notebook.
DB_NAME = "real-estate-offers"

# Embedding function handed to the collection; document mode is set explicitly.
embed_fn = GeminiEmbeddingFunction()
embed_fn.document_mode = True

# The Chroma server host is read from the "IP" entry of the
# "chrome-db-274181059559" secret; the server is reached on port 8000.
chroma_client = chromadb.HttpClient(
    host=get_secret(
        secret_id="chrome-db-274181059559", key="IP", profile_name="priv"
    ),
    port=8000,
)

# Open the collection, creating it on first use.
collection = chroma_client.get_or_create_collection(
    name=DB_NAME,
    embedding_function=embed_fn,
)

In [None]:
# Display the collections available on the connected Chroma server.
chroma_client.list_collections()

In [None]:
# Display how many records the collection currently holds.
collection.count()

In [None]:
# Destructive: uncomment the next line to delete the entire collection.
# chroma_client.delete_collection(DB_NAME)

In [None]:
# Peek at up to 10 stored records, including metadata, document text and vectors.
# NOTE(review): offset=1 skips the first record — confirm this is intended.
collection.get(
    include=["metadatas", "documents", "embeddings"],
    limit=10,
    offset=1,
)