# Embeddings

This notebook demonstrates semantic closeness using cosine similarity.
Words with similar meanings should have higher similarity scores than unrelated ones.


In [None]:
from dotenv import load_dotenv

# Load environment variables from a .env file before any API client is created.
# NOTE(review): override=True presumably lets values from .env replace variables
# that are already set in the process environment — confirm against python-dotenv docs.
load_dotenv(override=True)

## Getting embeddings

In [None]:
from openai import OpenAI
# Shared client reused by every later cell.
# NOTE(review): no key is passed here, so it presumably reads its credentials
# from the environment populated by load_dotenv above — confirm.
client = OpenAI()

# Request an embedding for a single input string.
response = client.embeddings.create(
    input="Your text string goes here",
    model="text-embedding-3-small"
)

# One string was sent, so the vector of interest is the first entry of response.data.
print(response.data[0].embedding)

In [None]:
# Dimension: number of values in the embedding vector held by the current `response`.
len(response.data[0].embedding)

In [None]:
# Same request as before, but passing dimensions=100 to ask for a shorter vector.
# This rebinds the notebook-level `response`.
response = client.embeddings.create(
    input="Your text string goes here",
    model="text-embedding-3-small",
    dimensions=100
)

print(response.data[0].embedding)
# Print the length as well to check the size of the vector that came back.
print(len(response.data[0].embedding))

In [None]:
def cosine_similarity(a, b):
    """Return the cosine similarity of two numeric vectors.

    Args:
        a: Sequence of numbers. It is iterated more than once, so it must be
            a re-iterable sequence (e.g. a list), not a one-shot iterator.
        b: Sequence of numbers, expected to have the same length as ``a``.

    Returns:
        The dot product of ``a`` and ``b`` divided by the product of their
        Euclidean norms, or ``0.0`` when either norm is zero (which avoids a
        division by zero).

    Note:
        ``zip`` stops at the shorter input, so for vectors of unequal length
        the dot product covers only the common prefix while each norm still
        uses its full vector.
    """
    # Imported locally so the function works without relying on some earlier
    # cell having imported ``math``.
    import math

    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(x * x for x in b))
    return dot / (norm_a * norm_b) if norm_a and norm_b else 0.0

# Vocabulary to embed. Each row groups words on a similar theme so that
# related and unrelated pairs can be compared later.
words = [
    "cat", "dog", "kitten", "puppy",
    "car", "automobile",
    "banana", "apple",
    "king", "queen", "man", "woman",
    "Paris", "France", "Tokyo", "Japan", "Singapore"
]

# Embed the whole list in a single request by passing the list as `input`.
response = client.embeddings.create(
    model="text-embedding-3-small",
    input=words,
)

# Uncomment to inspect the raw response object.
# print(response)

# Key each returned vector by its `index` attribute, then look every word up by
# its own position in `words`.
# NOTE(review): assumes item.index is the position of the matching input in
# `words` — confirm against the API reference.
embeddings = {item.index: item.embedding for item in response.data}
embedding_by_word = {word: embeddings[i] for i, word in enumerate(words)}

# Uncomment to inspect the word -> vector mapping.
# print(embedding_by_word)

def score_pairs(pairs):
    """Compute the cosine similarity for each pair of words.

    Args:
        pairs: Iterable of ``(word_a, word_b)`` tuples. Both words are looked
            up in the notebook-level ``embedding_by_word`` mapping, so a word
            missing from it raises ``KeyError``.

    Returns:
        A list of ``(word_a, word_b, similarity)`` tuples in the same order
        as ``pairs``.
    """
    # Results keep the input order. To rank from most to least similar, wrap
    # the list in sorted(..., key=lambda x: x[2], reverse=True).
    return [
        (
            first,
            second,
            cosine_similarity(embedding_by_word[first], embedding_by_word[second]),
        )
        for first, second in pairs
    ]

# Pairs to score. Every word used here also appears in `words` above, which
# score_pairs needs for its embedding lookup. The list mixes closely related
# pairs with (towards the end) apparently unrelated ones for contrast.
word_pairs = [
    ("cat", "dog"),
    ("kitten", "puppy"),
    ("car", "automobile"),
    ("king", "queen"),
    ("man", "woman"),
    ("Paris", "France"),
    ("Paris", "Singapore"),
    ("Tokyo", "Japan"),
    ("banana", "apple"),
    ("cat", "banana"),
    ("car", "king"),
    ("Paris", "puppy"),
    ("queen", "automobile"),
]

print("Word pairs (higher is closer):")
# Each word is left-aligned in a 10-character column; the score is shown to
# three decimal places. Rows are printed in the order listed in word_pairs.
for a, b, score in score_pairs(word_pairs):
    print(f"  {a:<10} {b:<10} {score:.3f}")
