In [None]:
from azure.ai.vision.imageanalysis import ImageAnalysisClient
from azure.ai.vision.imageanalysis.models import VisualFeatures
from azure.core.credentials import AzureKeyCredential
import requests
import json

from dotenv import dotenv_values

# Load the key/value settings from the env file into a plain dict.
config = dict(dotenv_values("./envs/aisearch.env"))


In [None]:
# Connection settings for Azure AI Vision, read from the loaded env config.
# A missing key raises KeyError here rather than later inside the client.
vision_endpoint = config["AZURE_VISION_ENDPOINT"]
vision_key = config["AZURE_VISION_KEY"]

# Module-level client shared by the analysis helper defined in this cell.
vision_client = ImageAnalysisClient(
    credential=AzureKeyCredential(vision_key),
    endpoint=vision_endpoint,
)

def get_multimodal_embedding(image_url, conf_threshold=0.6, timeout=30):
    """Download an image and analyze it with the shared ``vision_client``.

    The image is fetched over HTTP, sent to ``vision_client.analyze`` with the
    DENSE_CAPTIONS, TAGS and CAPTION features, and the result is condensed
    into a small dict.

    Args:
        image_url: URL of the image to download and analyze.
        conf_threshold: Minimum confidence a tag or dense caption must have
            to be kept. The main caption is NOT filtered by this threshold.
        timeout: Seconds to wait for the image download before giving up.

    Returns:
        dict with the keys:
            "embedding": ``result.embedding.vector`` if the result carries an
                embedding, otherwise None.
            "tags": names of the tags with confidence >= ``conf_threshold``.
            "dens_captions": texts of the dense captions with
                confidence >= ``conf_threshold``.
            "caption": text of the main caption, or "" if there is none.

    Raises:
        requests.HTTPError: if the image download returns an error status.
        requests.Timeout: if the download does not respond within ``timeout``.
    """
    # Without a timeout requests can block forever on a stalled server, and
    # without the status check an HTML error page would be sent to the
    # analyzer as if it were image bytes.
    response = requests.get(image_url, timeout=timeout)
    response.raise_for_status()

    result = vision_client.analyze(
        image_data=response.content,
        visual_features=[
            VisualFeatures.DENSE_CAPTIONS,
            VisualFeatures.TAGS,
            VisualFeatures.CAPTION,
        ],
    )
    # To inspect the raw response: print(json.dumps(result.as_dict(), indent=4))

    # NOTE(review): the sample run recorded in this notebook returns
    # 'embedding': None, so the analyze result does not seem to carry an
    # embedding for these features -- confirm whether a separate
    # vectorization call is needed to actually obtain one.
    embedding = getattr(result, "embedding", None)
    embedding_vector = embedding.vector if embedding is not None else None

    # Keep only tags / dense captions the service is sufficiently confident in.
    tags = (
        [tag.name for tag in result.tags.list if tag.confidence >= conf_threshold]
        if result.tags
        else []
    )
    dens_captions = (
        [
            caption.text
            for caption in result.dense_captions.list
            if caption.confidence >= conf_threshold
        ]
        if result.dense_captions
        else []
    )

    # The main caption is returned as-is; its confidence is not checked or output.
    caption = result.caption.text if result.caption else ""

    return {
        "embedding": embedding_vector,
        "tags": tags,
        "dens_captions": dens_captions,
        "caption": caption,
    }

In [74]:
# Sample photo used to exercise the analysis helper.
image_url = "https://images.pexels.com/photos/1661179/pexels-photo-1661179.jpeg?cs=srgb&dl=pexels-roshan-kamath-1661179.jpg&fm=jpg"

# Use a stricter threshold than the helper's 0.6 default so that only
# high-confidence tags and dense captions are kept.
result = get_multimodal_embedding(image_url, conf_threshold=0.8)

# Bare last expression: let the notebook render the returned dict.
result

{'embedding': None,
 'tags': ['bird',
  'animal',
  'parrot',
  'parakeet',
  'budgie',
  'perched',
  'feather',
  'green'],
 'dens_captions': ['a close up of a bird',
  "a close up of a parrot's beak",
  "a close-up of a bird's foot",
  "a close up of a bird's face"],
 'caption': 'a green bird with red beak'}

In [None]:
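# Raw analysis response dumped as JSON (debug aid).
# NOTE: `result` from the previous cell is the plain dict returned by
# get_multimodal_embedding() and has no as_dict(); the line below only works
# on the SDK response object, i.e. presumably when run inside that function.
# import json
# print(json.dumps(result.as_dict(), indent=4))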

{
    "modelVersion": "2023-10-01",
    "captionResult": {
        "text": "a green bird with red beak",
        "confidence": 0.7682477831840515
    },
    "denseCaptionsResult": {
        "values": [
            {
                "text": "a green bird with red beak",
                "confidence": 0.7682477831840515,
                "boundingBox": {
                    "x": 0,
                    "y": 0,
                    "w": 2250,
                    "h": 2625
                }
            },
            {
                "text": "a green bird with red beak",
                "confidence": 0.7334362268447876,
                "boundingBox": {
                    "x": 632,
                    "y": 507,
                    "w": 1410,
                    "h": 1860
                }
            },
            {
                "text": "a bird's foot on a leaf",
                "confidence": 0.8004575371742249,
                "boundingBox": {
                    "x": 0,
                