# OpenAI embeddings from captions

In [92]:
# Imports
from dotenv import load_dotenv
import pandas as pd
import openai
import os
import time

In [17]:
# Configuration
deployment = "text-embedding-ada-002"    # Deployed model name, passed to the API as deployment_id

# Load environment variables (API endpoint and key) from the local .env file
load_dotenv('.env')

# Fail fast when a required setting is absent. os.getenv returns None for an
# unset variable, which would otherwise only surface later as request
# failures inside the embedding retry loop.
_missing_settings = [
    name for name in ("OPENAI_API_URL", "OPENAI_API_KEY") if not os.getenv(name)
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_settings)
    )

# Configure the OpenAI client to talk to the Azure endpoint
openai.api_type = "azure"
openai.api_base = os.getenv("OPENAI_API_URL")
openai.api_version = "2022-12-01"
openai.api_key = os.getenv("OPENAI_API_KEY")

In [97]:
# Read the image-caption table produced earlier (one row per image file)
df = pd.read_parquet(path="azurecv_image_captions.parquet")

In [99]:
# Show five random rows of the loaded captions
df.sample(n=5)

Unnamed: 0,filename,caption,all_captions
0,000000261161.jpg,a dog sitting on a leash,"a dog sitting on a leash, a dog sitting on a l..."
0,000000328430.jpg,a woman swinging a tennis racket,"a woman swinging a tennis racket, a woman swin..."
0,000000421455.jpg,a dog taking a selfie in a car,"a dog taking a selfie in a car, a tree with ma..."
0,000000035062.jpg,a bed with a blanket and a pillow,"a bed with a blanket and a pillow, a bed with ..."
0,000000334417.jpg,a man eating a slice of pizza,"a man eating a slice of pizza, a man eating a ..."


In [None]:
def get_embedding(text, deployment, max_retries=5, retry_delay=10):
    """
    Call OpenAI and return the embedding for ``text``.

    Newlines in ``text`` are replaced with spaces before the request is sent.
    Failed requests are retried because the service is rate limited.

    Parameters
    ----------
    text : str
        Text to embed.
    deployment : str
        Deployment name, passed to the API as ``deployment_id``.
    max_retries : int, optional
        Total number of attempts before giving up (default 5).
    retry_delay : float, optional
        Seconds to wait between two attempts (default 10).

    Returns
    -------
    The ``embedding`` entry of the first item in the response's ``data``.

    Raises
    ------
    Exception
        With the message "All retries failed" once every attempt has failed.
        The error raised by the last attempt is attached as ``__cause__``.
    """
    text = text.replace("\n", " ")
    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            response = openai.Embedding.create(input=[text], deployment_id=deployment)
            return response['data'][0]['embedding']
        except Exception as error:
            # Catch Exception rather than using a bare `except`, so that
            # KeyboardInterrupt / SystemExit still stop a long-running cell.
            last_error = error
            if attempt < max_retries:
                # Only announce a retry and wait when another attempt follows.
                print(f"Error ({error!r}), retrying...")
                time.sleep(retry_delay)
    # Chain the last failure so the root cause shows up in the traceback.
    raise Exception("All retries failed") from last_error
 
# Embed each caption column in turn and store the vectors in "<column>_embeddings"
for source_column in ("caption", "all_captions"):
    df[f"{source_column}_embeddings"] = df[source_column].apply(
        lambda caption_text: get_embedding(text=caption_text, deployment=deployment)
    )

In [104]:
# Show ten random rows, now including the two embedding columns
df.sample(n=10)

Unnamed: 0,filename,caption,all_captions,caption_embeddings,all_captions_embeddings
0,000000108253.jpg,a plate of food on a table,"a plate of food on a table, a hand holding a g...","[-0.007507555186748505, -0.020446108654141426,...","[-0.01935538463294506, -0.0046701449900865555,..."
0,000000099242.jpg,a man skiing down a snowy hill,"a man skiing down a snowy hill, a man skiing o...","[-0.0005496425437740982, -0.03327050432562828,...","[-0.0004652086063288152, -0.02731732651591301,..."
0,000000103585.jpg,a bathroom with two sinks and mirrors,"a bathroom with two sinks and mirrors, a close...","[0.0028143979143351316, -0.012378406710922718,...","[-0.004762659315019846, -0.012305901385843754,..."
0,000000407403.jpg,a yellow tulips in a white vase,"a yellow tulips in a white vase, a white pitch...","[-0.0057285199873149395, -0.022930365055799484...","[-0.01124101784080267, -0.01874319277703762, 0..."
0,000000023899.jpg,a group of people sitting on a couch playing v...,a group of people sitting on a couch playing v...,"[-0.006704707629978657, -0.0019035928416997194...","[-0.00419843103736639, 0.0003370441263541579, ..."
0,000000032081.jpg,a woman serving a tennis ball,"a woman serving a tennis ball, a woman serving...","[-0.01671607792377472, -0.0054929014295339584,...","[-0.013333135284483433, -0.005588213913142681,..."
0,000000279714.jpg,a rocket is placed in the street,"a rocket is placed in the street, a blurry ima...","[0.00848266389220953, -0.033384762704372406, 0...","[0.009605936706066132, -0.02814723737537861, 0..."
0,000000492968.jpg,a person skiing down the snow,"a person skiing down the snow, a close up of a...","[0.012711617164313793, -0.020609555765986443, ...","[0.0152762895449996, -0.014003928750753403, 0...."
0,000000239318.jpg,a laptop with a keyboard on a table,"a laptop with a keyboard on a table, a laptop ...","[-0.011851301416754723, -0.014643753878772259,...","[-0.01137156318873167, -0.015151129104197025, ..."
0,000000259854.jpg,a sign on the side of a road,"a sign on the side of a road, a sign on the si...","[0.0159668680280447, -0.02149546891450882, 0.0...","[0.00954810343682766, -0.022342396900057793, 0..."


In [105]:
# Persist the captions together with their embeddings
df.to_parquet(path="azurecv_image_captions_openai_embeddings.parquet")