In [2]:
# Environment setup: find_dotenv() locates a .env file and load_dotenv() reads it.
# The `True` displayed under this cell is load_dotenv's return value.
# NOTE(review): presumably this is where the API credentials used by later cells
# come from — confirm against the project's .env.
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())


True

In [14]:
from openai import OpenAI
from typing import List
import numpy as np
# Shared API client; the embedding helper functions in this notebook call
# client.embeddings.create on it.
# NOTE(review): constructed with no arguments, so it presumably reads its
# credentials from the environment prepared by the dotenv cell — confirm.
client = OpenAI()

In [23]:
def get_embedding(
    text: str,
    model: str = "text-embedding-3-large",
    dimensions: int = 1024,
) -> List[float]:
    """
    Get the embedding for a single text.

    Args:
        text (str): The input text to generate embeddings for
        model (str): Name of the embedding model to call. Defaults to
            "text-embedding-3-large".
        dimensions (int): Requested length of the returned vector, forwarded
            as the API's ``dimensions`` parameter. Defaults to 1024. Per the
            OpenAI embeddings documentation this parameter is supported by
            the text-embedding-3 family and may be any size up to the model's
            native output size (3072 for text-embedding-3-large).

    Returns:
        List[float]: The embedding vector
    """
    response = client.embeddings.create(
        model=model,
        input=text,
        dimensions=dimensions,
    )
    # A single string input produces exactly one result entry.
    return response.data[0].embedding

def get_batch_embeddings(
    texts: List[str],
    model: str = "text-embedding-3-large",
    dimensions: int = 1024,
) -> List[List[float]]:
    """
    Get embeddings for multiple texts in a single API call.

    Args:
        texts (List[str]): List of input texts to generate embeddings for
        model (str): Name of the embedding model to call. Defaults to
            "text-embedding-3-large".
        dimensions (int): Requested length of each returned vector, forwarded
            as the API's ``dimensions`` parameter. Defaults to 1024.

    Returns:
        List[List[float]]: One embedding vector per input text, in the same
            order as ``texts``. An empty ``texts`` yields an empty list and
            no API request is made.
    """
    # Nothing to embed: skip the network round trip instead of sending an
    # empty input array to the API.
    if len(texts) == 0:
        return []

    response = client.embeddings.create(
        model=model,
        input=texts,
        dimensions=dimensions,
    )
    # Each result carries the index of the input it belongs to; sort on it so
    # the output order never depends on the order of the response payload.
    ordered = sorted(response.data, key=lambda item: item.index)
    return [item.embedding for item in ordered]

In [24]:
def calculate_similarity(embedding1: List[float], embedding2: List[float]) -> float:
    """
    Calculate cosine similarity between two embeddings.

    Args:
        embedding1, embedding2 (List[float]): The embedding vectors to
            compare; both must have the same length

    Returns:
        float: Cosine similarity score between -1 and 1 (1 = same direction,
            0 = orthogonal, -1 = opposite direction). Returns 0.0 when either
            vector has zero magnitude, since the angle is undefined there.

    Raises:
        ValueError: If the two vectors differ in length (raised by np.dot)
    """
    # Convert to float arrays for vectorized computation
    vec1 = np.asarray(embedding1, dtype=float)
    vec2 = np.asarray(embedding2, dtype=float)

    # Compute the dot product first so mismatched lengths fail loudly
    dot_product = np.dot(vec1, vec2)

    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        # Avoid a divide-by-zero warning and a NaN result
        return 0.0

    # Floating-point rounding can push the ratio slightly outside [-1, 1];
    # clip it back and return a built-in float to match the annotation.
    return float(np.clip(dot_product / norm_product, -1.0, 1.0))


In [25]:
# Demo: embed one sentence and report the length of the resulting vector
text = "The quick brown fox jumps over the lazy dog"
embedding = get_embedding(text)
print(f"Single embedding length: {len(embedding)}")

Single embedding length: 1024


In [26]:
# Demo: embed several sentences with one request, then report how many
# vectors came back and how long each one is
texts = [
    "The quick brown fox jumps over the lazy dog",
    "Pack my box with five dozen liquor jugs",
    "How vexingly quick daft zebras jump",
]
embeddings = get_batch_embeddings(texts)
print(f"Number of embeddings: {len(embeddings)}")
print(f"Each embedding length: {len(embeddings[0])}")


Number of embeddings: 3
Each embedding length: 1024


In [27]:
# Demo: cosine similarity of the first pair of vectors from the batch
similarity = calculate_similarity(
    embeddings[0],
    embeddings[1],
)
print(f"Similarity between first two texts: {similarity:.4f}")

Similarity between first two texts: 0.2615
