# Get captions, dense captions and tags from Azure Computer Vision

In [None]:
!pip install azure-ai-vision --user

In [2]:
# imports
import requests
import os
from dotenv import load_dotenv
import pandas as pd
import azure.ai.vision as sdk
import time

In [3]:
# Load envs
load_dotenv('.env')

# Azure Computer Vision
key = os.getenv("azure_cv_key")
endpoint = os.getenv("azure_cv_endpoint")

# Images to process
image_dir = './val2017/'

# Get Azure Computer Vision client
service_options = sdk.VisionServiceOptions(endpoint, key)

In [4]:
def analyze_image(imagefile):
    # Set source file
    vision_source = sdk.VisionSource(filename=imagefile)

    # Set analysis options (enable features)
    analysis_options = sdk.ImageAnalysisOptions()

    analysis_options.features = (
        sdk.ImageAnalysisFeature.CAPTION |
        # sdk.ImageAnalysisFeature.DENSE_CAPTIONS |
        sdk.ImageAnalysisFeature.TAGS
    )

    # Analyze the image
    image_analyzer = sdk.ImageAnalyzer(service_options, vision_source, analysis_options)
    result = image_analyzer.analyze()

    return result

In [5]:
df = pd.DataFrame()
num_files = len(os.listdir(image_dir))

# Iterate over images
for i, filename in enumerate(os.listdir(image_dir)):
    print(f"{i} out of {num_files} - {filename}")

    # Analyze image with retries
    for retries in range(10):
        result = analyze_image(image_dir + filename)
        if result.reason == sdk.ImageAnalysisResultReason.ANALYZED:
            break # Success
        print(f"Will retry {filename} for {retries+1} time")
        if retries == 9:
            raise Exception(f"Failed to analyze {filename} after 10 retries")
        time.sleep(63)
    
    # Parse results
    caption = result.caption.content
    # object_captions = ", ".join(set(object.content for object in result.dense_captions))
    tags = ", ".join(set(object.name for object in result.tags))

    # Store results
    result = { "filename": filename, "caption": caption, "tags": tags }
    # result = { "filename": filename, "caption": caption, "object_captions": object_captions, "tags": tags }
    row = pd.DataFrame(result, index=[0])
    df = pd.concat([df, row], axis=0)
    print(f"---> {caption}")

0 out of 5000 - 000000000139.jpg
---> a woman standing in a kitchen
1 out of 5000 - 000000000285.jpg
---> a bear sitting in the grass
2 out of 5000 - 000000000632.jpg
---> a bedroom with a large window
3 out of 5000 - 000000000724.jpg
---> a stop sign on a pole
4 out of 5000 - 000000000776.jpg
---> a group of teddy bears
5 out of 5000 - 000000000785.jpg
---> a woman skiing on the snow
6 out of 5000 - 000000000802.jpg
---> a kitchen with white appliances
7 out of 5000 - 000000000872.jpg
---> a baseball player running to base
8 out of 5000 - 000000000885.jpg
---> a man holding a tennis racket
9 out of 5000 - 000000001000.jpg
---> a group of people posing for a photo
10 out of 5000 - 000000001268.jpg
---> a couple of people taking a picture of a swan
11 out of 5000 - 000000001296.jpg
---> a woman looking at a cell phone
12 out of 5000 - 000000001353.jpg
---> a group of children riding a train
13 out of 5000 - 000000001425.jpg
---> a sandwich on a plate
14 out of 5000 - 000000001490.jpg
--

: 

: 

In [31]:
df.sample(20)

Unnamed: 0,filename,caption,object_captions,tags
0,000000001425.jpg,a sandwich on a plate,"a close-up of a sandwich, close-up of a white ...","fast food, sandwich, indoor, black, food, coff..."
0,000000001490.jpg,a person on a surfboard in the water,"a person on a surfboard in the water, a person...","surfing, water, surfing equipment, surfboard, ..."
0,000000000285.jpg,a bear sitting in the grass,a bear sitting in the grass,"brown bear, grass, brown, kodiak bear, terrest..."
0,000000000139.jpg,a woman standing in a kitchen,"a chair with legs and a red cushion, a person ...","room, hardwood, couch, television, table, scen..."
0,000000001761.jpg,an airplane flying over a bridge,"a plane flying in the sky, an airplane flying ...","flight, vehicle, water, bridge, outdoor, plane..."
0,000000001000.jpg,a group of people posing for a photo,"a person holding a trophy, a person wearing a ...","tennis, group, child, adult, court, person, ou..."
0,000000001503.jpg,a computer on a desk,"a blurry image of a grey couch, a laptop on a ...","computer terminal, personal computer hardware,..."
0,000000001296.jpg,a woman looking at a cell phone,"a close-up of a phone, a woman looking at a ce...","nail, watch, black hair, street, phone, clothi..."
0,000000000632.jpg,a bedroom with a large window,"a blue blanket on a bed, a plant in a pot, a b...","room, bed sheet, shelving, bookcase, scene, ca..."
0,000000000724.jpg,a stop sign on a pole,"a tree next to a road, a stop sign on a pole, ...","road, tree, street, stop, intersection, signag..."


In [68]:
# Save results
df.to_parquet("azurecv_image_analyzes.parquet")