In [None]:

from openai import OpenAI

# No arguments are passed, so the client is built with its default configuration.
client = OpenAI()

# Request an embedding for a single short string.
# Model options noted for this cell:
#   text-embedding-3-small -> 1536 dimensions
#   text-embedding-3-large -> 3072 dimensions
response = client.embeddings.create(
    model="text-embedding-3-small",
    input="Hello world!",
)

# Only one input was sent, so the vector of interest is the first data item.
first_vector = response.data[0].embedding

# Compact summary: model name, tokens consumed, and just the first five
# components of the vector (showing the whole vector would flood the output).
result = {
    "model": response.model,
    "tokens": response.usage.total_tokens,
    "embedding-head": first_vector[:5],
}
display(result)

{'model': 'text-embedding-3-small',
 'tokens': 3,
 'embedding-head': [-0.010008599609136581,
  -0.0436151884496212,
  0.00032354597351513803,
  0.03149333596229553,
  -0.05746873468160629]}

#### Pay attention to the maximum input token limit when using APIs for embeddings

In [None]:
import tiktoken
from openai import OpenAI

client = OpenAI()

# Build a deliberately oversized input by repeating a short string 10,000 times.
_input = "Hello world!" * 10_000  # Example input that may exceed token limit

# Count tokens locally with the tokenizer tiktoken associates with the model,
# so the size of the request is visible before it is sent.
encoding = tiktoken.encoding_for_model("text-embedding-3-small")
token_ids = encoding.encode(_input)
num_tokens = len(token_ids)
print(f"Number of tokens in input: {num_tokens}")

# The request is expected to be rejected here; the exception is caught and
# printed because the error message itself is what this cell demonstrates.
try:
    response = client.embeddings.create(
        model="text-embedding-3-small",  # max input 8192 tokens
        input=_input,
    )
except Exception as err:
    print(f"Error: {err}")

Number of tokens in input: 30000
Error: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens, however you requested 30000 tokens (30000 in your prompt; 0 for the completion). Please reduce your prompt; or completion length.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


In [21]:
from sklearn.metrics.pairwise import cosine_similarity
from openai import OpenAI

client = OpenAI()

# Pairs of sentences whose semantic closeness is compared below.
sentence_pairs = [
    ("A man is playing guitar", "A person is playing a musical instrument"),
    ("A dog is running in the park", "The cat is sleeping on the couch"),
    ("The weather is sunny today", "It's a bright and clear day outside")
]

# Flatten the pairs so that every sentence is embedded in ONE request instead
# of issuing a separate network round-trip for each pair.
# Layout: pair k occupies flat positions 2*k and 2*k + 1.
sentences = [sentence for pair in sentence_pairs for sentence in pair]

response = client.embeddings.create(
    input=sentences,
    model="text-embedding-3-small"
)

# Key each vector by the `index` reported on the returned item, so the pairing
# below does not depend on the order of `response.data`.
embeddings = {item.index: item.embedding for item in response.data}

for pair_idx, (s1, s2) in enumerate(sentence_pairs):
    # cosine_similarity takes 2-D inputs, so each vector is wrapped in a list;
    # the result is a 1x1 matrix whose single entry is read with [0][0].
    similarity = cosine_similarity(
        [embeddings[2 * pair_idx]],
        [embeddings[2 * pair_idx + 1]]
    )
    print(f"Sentence 1: {s1}")
    print(f"Sentence 2: {s2}")
    print(f"Similarity: {similarity[0][0]:.4f}\n")

Sentence 1: A man is playing guitar
Sentence 2: A person is playing a musical instrument
Similarity: 0.6146

Sentence 1: A dog is running in the park
Sentence 2: The cat is sleeping on the couch
Similarity: 0.1948

Sentence 1: The weather is sunny today
Sentence 2: It's a bright and clear day outside
Similarity: 0.6751

