# Get image embeddings from Azure Computer Vision

In [13]:
# imports
import requests
from tenacity import retry, stop_after_attempt, wait_random_exponential
import os
from dotenv import load_dotenv
import pandas as pd

In [9]:
# Read settings from the local .env file into the process environment
load_dotenv(dotenv_path='.env')

# Azure Computer Vision endpoint and subscription key
# (either value is None when the variable is not set)
endpoint = os.environ.get("azure_cv_endpoint")
key = os.environ.get("azure_cv_key")

# Directory holding the images to embed
image_dir = './val2017/'

In [8]:
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6)) # automatic retry (random exponential backoff, up to 6 attempts) in case of a failing API call
def get_embedding(imagefile, *, timeout=30):
    """
    Get embedding from an image using Azure Computer Vision 4

    The raw bytes of the image are posted to the
    ``retrieval:vectorizeImage`` route of the module-level ``endpoint``,
    authenticated with the module-level ``key``.

    Args:
        imagefile: Path of the image file to vectorize.
        timeout: Seconds to wait for the service before the attempt is
            abandoned. Without a timeout a stalled connection would block
            forever and the retry decorator would never get a chance to run.

    Returns:
        The value of the ``vector`` field of the JSON response.

    Raises:
        Any exception raised inside an attempt (file errors, request errors,
        HTTP error statuses, a response without ``vector``) triggers a retry;
        once the attempts are exhausted the decorator raises its own error
        (tenacity's ``RetryError`` unless configured otherwise).
    """
    # settings
    # Strip a trailing slash so a configured "https://host/" does not
    # produce "https://host//computervision/...".
    url = endpoint.rstrip("/") + "/computervision/retrieval:vectorizeImage"
    params = {
        "api-version": "2023-02-01-preview",
        "modelVersion": "latest",
    }
    headers = {
        "Content-type": "application/octet-stream",
        "Ocp-Apim-Subscription-Key": key,
    }

    # Read the image file
    with open(imagefile, "rb") as f:
        data = f.read()

    # Sending the request
    r = requests.post(
        url, params=params, data=data, headers=headers, timeout=timeout
    )
    # Surface HTTP errors (bad key, throttling, unsupported image, ...) as
    # such instead of as an opaque KeyError on the missing 'vector' field.
    r.raise_for_status()

    return r.json()['vector']

In [None]:
# Embed every file found in image_dir and collect one record per image.
# The directory is listed once so the reported total and the files that are
# actually processed cannot disagree.
filenames = os.listdir(image_dir)
num_files = len(filenames)

# Accumulate plain records and build the DataFrame once at the end:
# concatenating inside the loop copies the whole frame on every iteration.
rows = []
for i, filename in enumerate(filenames):
    print(f"{i} out of {num_files} - {filename}")
    embedding = get_embedding(os.path.join(image_dir, filename))
    rows.append({"filename": filename, "embedding": embedding})

# Explicit columns keep the schema stable even when the directory is empty.
# The resulting index runs 0..n-1 (the per-row concat gave every row index 0).
df = pd.DataFrame(rows, columns=["filename", "embedding"])

In [None]:
# Preview up to 20 random rows; the size is clamped because sampling more
# rows than the frame holds (without replacement) raises ValueError.
df.sample(n=min(20, len(df)))

In [58]:
# Persist the filename/embedding table as a parquet file in the working directory
df.to_parquet(path="azurecv_image_embeddings.parquet")