# Get captions, dense captions and tags from Azure Computer Vision

In [None]:
!pip install azure-ai-vision --user

In [1]:
# imports
import requests
import os
from dotenv import load_dotenv
import pandas as pd
import azure.ai.vision as sdk
import time

In [2]:
# Load envs
load_dotenv('.env')

# Azure Computer Vision
key = os.getenv("azure_cv_key")
endpoint = os.getenv("azure_cv_endpoint")

# Images to process
image_dir = './val2017/'

# Get Azure Computer Vision client
service_options = sdk.VisionServiceOptions(endpoint, key)

In [3]:
def analyze_image(imagefile):
    """
    Analyze an image using Azure Computer Vision API
    Implement retry logic due to rate limits (denseCaptions in preview offered just 20 requests per minute)
    """
    query = "?api-version=2023-02-01-preview&features=caption,denseCaptions"
    url = endpoint + "/computervision/imageanalysis:analyze" + query
    headers = {
        "Content-type": "application/octet-stream",
        "Ocp-Apim-Subscription-Key": key,
    }

    # Read the image file
    with open(imagefile, "rb") as f:
        data = f.read()

    # Sending the requests
    for retries in range(10):
        r = requests.post(url, data=data, headers=headers)
        if r.status_code == 200:
            results = r.json()
            break
        else:
            print(f"Error: {r.status_code}, will retry")
            time.sleep(61)
    else:
        raise Exception("Maximum number of retries reached")    
    
    return results

In [None]:
df = pd.DataFrame()
num_files = len(os.listdir(image_dir))

# Iterate over images
for i, filename in enumerate(os.listdir(image_dir)):
    print(f"{i} out of {num_files} - {filename}")
    result = analyze_image(image_dir + filename)

    print(result)
        
    # Parse results
    caption = result["captionResult"]["text"]
    object_captions = ", ".join(set(object["text"] for object in result["denseCaptionsResult"]["values"]))
    all_captions = f"{caption}, {object_captions}"

    # Store results
    result = { "filename": filename, "caption": caption, "all_captions": all_captions }
    row = pd.DataFrame(result, index=[0])
    df = pd.concat([df, row], axis=0)
    print(f"---> {caption}")

In [20]:
df.sample(20)

Unnamed: 0,filename,caption,all_captions
0,000000187585.jpg,a man doing a trick on a skateboard,"a man doing a trick on a skateboard, a man sit..."
0,000000293324.jpg,a group of airplanes parked at an airport,"a group of airplanes parked at an airport, a g..."
0,000000454661.jpg,cars on a street with traffic lights,"cars on a street with traffic lights, a white ..."
0,000000087742.jpg,a vase with peacock feathers in it,"a vase with peacock feathers in it, a window w..."
0,000000360943.jpg,a cat sitting on a window ledge,"a cat sitting on a window ledge, a cat sitting..."
0,000000566923.jpg,a man with blood on his face holding a fire hy...,a man with blood on his face holding a fire hy...
0,000000044068.jpg,a teddy bear sitting on a chair,"a teddy bear sitting on a chair, a teddy bear ..."
0,000000532855.jpg,a person is skating on a rail,"a person is skating on a rail, a person is ska..."
0,000000577539.jpg,a bowl of ice cream with marshmallows and banana,a bowl of ice cream with marshmallows and bana...
0,000000453860.jpg,a pair of black luggage,"a pair of black luggage, a black suitcase with..."


In [21]:
# Save results
df.to_parquet("azurecv_image_captions.parquet")