In [6]:
import os
from decouple import config
from dotenv import load_dotenv

from openai import OpenAI

In [1]:
phrase_1 = "The dog ate my homework"
phrase_2 = "The homework ate my dog"

In [2]:
phrase_1_as_list = sorted([x.lower() for x in phrase_1.split(" ")])
phrase_2_as_list = sorted([x.lower() for x in phrase_2.split(" ")])
print(phrase_1_as_list)
print(phrase_2_as_list)

['ate', 'dog', 'homework', 'my', 'the']
['ate', 'dog', 'homework', 'my', 'the']


In [3]:
phrase_1_as_list == phrase_2_as_list

True

In [7]:
# Load environment variables from .env file
load_dotenv()

# Retrieve the API key
OPENAI_API_KEY = config('OPENAI_API_KEY')

client = OpenAI(
    api_key=OPENAI_API_KEY )

In [8]:
def get_embedding(text, model="text-embedding-3-small"):
   text = text.replace("\n", " ")
   return client.embeddings.create(input = [text], model=model).data[0].embedding

In [19]:
documents = [
    "The cat jumped over the dog",
    "The rat jumped over the dog",
    "The turkey ran in circles",
]

embeddings = [get_embedding(x) for x in documents]

embeddings[0]

[-0.0018775435164570808,
 -0.04121466726064682,
 -0.009573342278599739,
 -0.01355969812721014,
 0.002230533864349127,
 0.026169974356889725,
 -0.012099049054086208,
 0.0033199351746588945,
 0.005547426175326109,
 -0.01973094418644905,
 0.011484358459711075,
 0.017880788072943687,
 -0.036857061088085175,
 0.024405023083090782,
 -0.021861057728528976,
 0.03140396997332573,
 -0.029529469087719917,
 -0.03464174270629883,
 -0.08067655563354492,
 0.007881422527134418,
 -0.024745840579271317,
 0.031257905066013336,
 0.00640251487493515,
 -0.030454548075795174,
 -0.005614372435957193,
 0.011606079526245594,
 0.028872177004814148,
 0.013523181900382042,
 0.014789078384637833,
 0.002477018628269434,
 0.004859703592956066,
 -0.026802923530340195,
 -0.0014674965059384704,
 -0.050927989184856415,
 -0.024003343656659126,
 0.00998719222843647,
 -0.026121286675333977,
 -0.023431256413459778,
 -0.013133675791323185,
 0.0010848367819562554,
 -0.030186761170625687,
 -0.024684980511665344,
 0.015215101651

In [20]:
import numpy as np 

np.array(embeddings[0]).shape

(1536,)

In [21]:
def calculate_cosine_metrics(v1, v2):
    dot_product = np.dot(v1, v2)
    magnitude1 = np.linalg.norm(v1)
    magnitude2 = np.linalg.norm(v2)
    cosine_similarity = dot_product / (magnitude1 * magnitude2)
    cosine_distance = 1 - cosine_similarity
    return int(cosine_similarity * 100), int(cosine_distance * 100)

In [22]:
print(calculate_cosine_metrics(embeddings[0], embeddings[0]))
print(calculate_cosine_metrics(embeddings[0], embeddings[1]))
print(calculate_cosine_metrics(embeddings[0], embeddings[2]))

(100, 0)
(85, 14)
(31, 68)


In [25]:
query_str = "The moose sat by the turkey"
query_embedding = get_embedding(query_str)

for embedding in embeddings:
    print(calculate_cosine_metrics(query_embedding, embedding))

(28, 71)
(27, 72)
(56, 43)


In [26]:
phrase_1_embedding = get_embedding("The dog ate my homework and then burped it up")
phrase_2_embedding = get_embedding("The green dog ate my homework and then burped it up")

In [27]:
calculate_cosine_metrics(phrase_1_embedding, phrase_2_embedding)

(91, 8)