# Vector embeddings with OpenAI

## Set up the OpenAI API

In [11]:
import os

# import azure.identity
import dotenv
import openai

# Azure OpenAI resource name and embedding deployment used by the cells below.
AZURE_OPENAI_SERVICE = "ragsearchpocopenai"
AZURE_OPENAI_ADA_DEPLOYMENT = "text-embedding-ada-002"

# Load environment variables via python-dotenv, then read the API key from them.
# The key is None when AZURE_OPENAI_KEY is not set.
dotenv.load_dotenv()
azure_openai_key = os.environ.get("AZURE_OPENAI_KEY")

# Key-based authentication is used here; the azure.identity credential /
# bearer-token-provider alternative is intentionally not wired in.
openai_client = openai.AzureOpenAI(
    azure_endpoint=f"https://{AZURE_OPENAI_SERVICE}.openai.azure.com",
    api_key=azure_openai_key,
    api_version="2024-06-01",
)


## Vector representations

In [12]:
# Embed one example sentence and keep the resulting vector for inspection.
sentence = "A dog just walked past my house and yipped yipped like a Martian"

response = openai_client.embeddings.create(
    input=sentence,
    model=AZURE_OPENAI_ADA_DEPLOYMENT,
)

# A single input was sent, so the embedding is read from the first data item.
vector = response.data[0].embedding

In [13]:
# Display the embedding computed for `sentence` (a list of floats).
vector

[-0.014314107596874237,
 -0.007131262682378292,
 -0.027183908969163895,
 0.012360425665974617,
 -0.0037332740612328053,
 0.022077254951000214,
 0.009684589691460133,
 -0.018040934577584267,
 -0.0006282571703195572,
 -0.0249787624925375,
 0.01599053479731083,
 -0.002740313299000263,
 0.010606625117361546,
 -0.005548328626900911,
 0.011573794297873974,
 0.01163182407617569,
 0.02394711598753929,
 0.010219757445156574,
 0.016725583001971245,
 0.026951787993311882,
 -0.010980596765875816,
 0.022489912807941437,
 0.019227327778935432,
 -0.023521561175584793,
 -0.015436024405062199,
 -0.0029756580479443073,
 0.021316414698958397,
 -0.015049156732857227,
 0.0009236467885784805,
 -0.009536290541291237,
 0.014081986621022224,
 -0.005935196299105883,
 -0.03868677839636803,
 0.005409701261669397,
 -0.02166459523141384,
 -0.020839277654886246,
 0.016480566933751106,
 -0.010735580697655678,
 0.009304169565439224,
 -0.023224962875247,
 -0.005067967809736729,
 0.007453652564436197,
 0.005738538689911

In [23]:
# Embed a single word this time. Note that this rebinds `response` and `vector`.
_input = "dog"

response = openai_client.embeddings.create(
    input=_input,
    model=AZURE_OPENAI_ADA_DEPLOYMENT,
)
vector = response.data[0].embedding

# Last expression of the cell: show the embedding.
vector

[-0.003476932644844055,
 -0.01781758852303028,
 -0.01627529226243496,
 -0.017506422474980354,
 -0.018128754571080208,
 0.021970966830849648,
 -0.012466900050640106,
 -0.022755645215511322,
 -0.021524513140320778,
 -0.017979934811592102,
 0.012169264256954193,
 0.038882117718458176,
 0.0015473703388124704,
 -0.006980923004448414,
 -0.013806263916194439,
 0.024243826046586037,
 0.03982914239168167,
 0.0012759463861584663,
 0.009490537457168102,
 -0.011979859322309494,
 -0.020036332309246063,
 0.006010223180055618,
 0.011195181868970394,
 -0.025583188980817795,
 -0.007542373146861792,
 0.010241392999887466,
 0.00988287664949894,
 -0.008374402299523354,
 -0.005611119791865349,
 -0.009571711532771587,
 0.007508550770580769,
 -0.009172608144581318,
 -0.02528555318713188,
 -0.02124040573835373,
 -0.005841111298650503,
 -0.019008133560419083,
 -0.007467964198440313,
 -0.01618058979511261,
 -0.011641636490821838,
 -0.021010413765907288,
 0.004522041883319616,
 0.01105312816798687,
 0.0117836901

In [24]:
# Number of dimensions in the embedding (the recorded output below is 1536).
len(vector)

1536

### Document similarity modeled as cosine similarity

In [21]:
import numpy as np
import pandas as pd


def cosine_similarity(a, b):
    """Return the cosine similarity of two vectors.

    Computed as ``dot(a, b) / (||a|| * ||b||)``.

    Args:
        a: First vector (any sequence or array accepted by NumPy).
        b: Second vector, same length as ``a``.

    Returns:
        The similarity score. If either vector has zero norm the cosine is
        undefined; 0.0 is returned in that case instead of dividing by zero
        (which would produce ``nan`` and a RuntimeWarning).
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # Zero-length vector: no direction, so report "no similarity".
        return 0.0
    return np.dot(a, b) / norm_product

# Left-hand side of every pair: the same reference sentence, repeated three times.
sentences1 = ["The new movie is awesome"] * 3

# Right-hand side: an identical sentence, a differently worded sentence,
# and a string of random characters.
sentences2 = [
    "The new movie is awesome",
    "This recent movie is so good",
    "djkshsjdkhfsjdfkhsd",
]

def get_embeddings(sentences):
    """Embed a batch of sentences with a single embeddings request.

    Args:
        sentences: The texts to embed, passed as-is as the request input.

    Returns:
        A list holding the ``embedding`` of each item in the response data,
        in the order the response provides them.
    """
    result = openai_client.embeddings.create(
        input=sentences,
        model=AZURE_OPENAI_ADA_DEPLOYMENT,
    )
    vectors = []
    for item in result.data:
        vectors.append(item.embedding)
    return vectors

# Embed each list (one request per list), first sentences1 and then sentences2.
embeddings1, embeddings2 = get_embeddings(sentences1), get_embeddings(sentences2)

# Build one record per sentence pair, matched by position; the score is
# rounded to two decimals for display.
data = [
    {
        "Sentence 1": sentences1[i],
        "Sentence 2": sentences2[i],
        "Score": round(cosine_similarity(embeddings1[i], embeddings2[i]), 2),
    }
    for i in range(len(sentences1))
]

# Show the scores as a table (last expression of the cell is rendered).
df = pd.DataFrame(data)
df

Unnamed: 0,Sentence 1,Sentence 2,Score
0,The new movie is awesome,The new movie is awesome,1.0
1,The new movie is awesome,This recent movie is so good,0.92
2,The new movie is awesome,djkshsjdkhfsjdfkhsd,0.75
