### OpenAI Embeddings

In [4]:
import os
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment
# (presumably including OPENAI_API_KEY, which the next cell reads).
load_dotenv()


True

In [5]:
# Fail fast with a readable message when the key is missing.
# The previous `os.environ[...] = os.getenv(...)` raised an opaque
# "TypeError: str expected, not NoneType" in that case, and was a no-op
# otherwise because os.getenv() reads from os.environ itself.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )

In [6]:
from langchain_openai import OpenAIEmbeddings
# Embeddings client used by the cells below; the model name selects the OpenAI embedding model.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [10]:
# Bare expression: the notebook displays the client's repr so its settings can be inspected.
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x0000018DA6BD5D30>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x0000018DA6BD6660>, model='text-embedding-3-small', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [12]:
## Single text embedding: embed_query turns one string into one vector
single_text = "Langchain and RAG are amazing frameworks and projects to work on"
single_embeddings = embeddings.embed_query(single_text)
# Vector length first, then the full vector.
# NOTE(review): this prints every value of the vector; a slice such as
# single_embeddings[:5] would keep the notebook output short.
print(len(single_embeddings), single_embeddings, sep="\n")

1536
[-0.04906528443098068, -0.0221567340195179, 0.012010772712528706, -0.004295974969863892, 0.018454736098647118, -0.032351039350032806, 0.00018583373457659036, 0.0010714037343859673, -0.012231787666678429, -0.03246154636144638, 0.015788745135068893, 0.011409888975322247, -0.03464406728744507, 0.04337415471673012, 0.008281148038804531, -0.011327007785439491, 0.0005667820805683732, -0.05823739990592003, 0.026217879727482796, 0.07779720425605774, -0.012666909955441952, -0.005425222683697939, -0.006164240650832653, 0.039423514157533646, -0.019573623314499855, -0.012259414419531822, -0.006257481407374144, 0.06884610652923584, 0.011582556180655956, -0.02764066308736801, -0.0018354587955400348, -0.03569388762116432, 2.679048884601798e-05, 0.0219357181340456, 0.006972325965762138, 0.027019059285521507, -0.005121327005326748, -0.006979232653975487, -0.009952572174370289, 0.026991432532668114, 0.022612575441598892, 0.03909199312329292, 0.00781839806586504, 0.008060133084654808, 0.017349662259

In [13]:
# Human-readable summary of the single-text embedding: input, vector length, first five values.
print(
    "📝 Single Text Embedding:",
    f"Input: {single_text}",
    f"Output: Vector of {len(single_embeddings)} dimensions",
    f"Sample values: {single_embeddings[:5]}",
    sep="\n",
)

📝 Single Text Embedding:
Input: Langchain and RAG are amazing frameworks and projects to work on
Output: Vector of 1536 dimensions
Sample values: [-0.04906528443098068, -0.0221567340195179, 0.012010772712528706, -0.004295974969863892, 0.018454736098647118]


In [14]:
# Example 2: a small batch of texts that will be embedded together
multiple_texts = [
    "Python is a programming language",
    "LangChain is a framework for LLM applications",
    "Embeddings convert text to numbers",
    "Vectors can be compared for similarity",
]

In [15]:
# Embed the whole batch in one call; the summary cell below checks that
# the result has one vector per input text.
multiple_embeddings = embeddings.embed_documents(multiple_texts)

In [16]:
# Summary of the batch embedding: input count, output count, and per-vector length.
print(
    "\n📚 Multiple Text Embeddings:",
    f"Number of texts: {len(multiple_texts)}",
    f"Number of embeddings: {len(multiple_embeddings)}",
    f"Each embedding size: {len(multiple_embeddings[0])}",
    sep="\n",
)


📚 Multiple Text Embeddings:
Number of texts: 4
Number of embeddings: 4
Each embedding size: 1536


In [21]:
# Print the full vector for the first text.
# NOTE(review): long output; multiple_embeddings[0][:5] would show a short preview instead.
print(multiple_embeddings[0])

[-0.011004673317074776, -0.020408110693097115, 0.018817074596881866, -0.0028302103746682405, 0.015716591849923134, -0.026639673858880997, 0.0005226965295150876, 0.03720579296350479, -0.0017197990091517568, 0.012993469834327698, 0.021540194749832153, -0.0247222688049078, -0.009428935125470161, 0.0018638592446222901, 0.003916399087756872, 0.015502413734793663, -0.03296302631497383, 0.029780952259898186, -0.027210814878344536, 0.010372338816523552, -0.001478848629631102, -0.009913384914398193, -0.05385048687458038, 0.01543102040886879, 0.03685902804136276, -0.04291720688343048, 0.005415645893663168, 0.03622669354081154, -0.01945960894227028, 0.0011199729051440954, 0.01297307200729847, -0.032351087778806686, -0.03653265908360481, 0.05123955383896828, -0.03118840791285038, -0.04507938772439957, 0.04585450515151024, -0.010464129038155079, 0.06837379932403564, -0.015074057504534721, 0.004041336476802826, -0.039163991808891296, 0.03133119270205498, -0.00040030901436693966, -0.00212010811083018

### Cosine Similarity with OpenAI Embeddings

In [2]:
# Example 1: sentences to compare pairwise when looking for similar meanings
sentences = [
    "The cat sat on the mat",
    "A feline rested on the rug",
    "The dog played in the yard",
    "I love programming in Python",
    "Python is my favorite programming language",
]

In [1]:
import numpy as np
def cosine_similarity(vec1, vec2):
    """
    Cosine similarity measures the angle between two vectors.
    - Result close to 1: very similar
    - Result close to 0: not related
    - Result close to -1: opposite meanings

    Args:
        vec1: First vector (list or array of numbers).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The dot product divided by the product of the two norms, or 0.0 when
        either vector has zero length (the angle is undefined there, and the
        plain division would produce ``nan`` with a RuntimeWarning).
    """
    a = np.asarray(vec1)
    b = np.asarray(vec2)

    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # At least one all-zero vector: report "not related" instead of nan.
        return 0.0
    return np.dot(a, b) / denominator

In [3]:
from langchain_openai import OpenAIEmbeddings
# NOTE(review): this repeats the client creation from the first section with the
# same model name; presumably so this section can run in a fresh kernel — confirm.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [None]:
# Embed every example sentence in one call; the result is indexed in the same
# order as `sentences` by the pairwise comparison below.
sentence_embeddings = embeddings.embed_documents(sentences)
# Print the full vector for the first sentence (long output).
print(sentence_embeddings[0])

[-0.03074316307902336, -0.04954070597887039, -0.005032286047935486, -0.0014980505220592022, 0.036250557750463486, -0.0020749696996062994, -0.008868717588484287, 0.027200847864151, 0.007110487669706345, -0.011906835250556469, 0.04160281643271446, -0.0013857370940968394, 0.0451192781329155, 0.05274689197540283, 0.03206183388829231, 0.03244968131184578, -0.012417497113347054, 0.003046197583898902, -0.06603703647851944, 0.047446344047784805, 0.025869246572256088, -0.04540369659662247, -0.003451818600296974, 0.014621748588979244, 0.009101424366235733, 0.01482859905809164, -0.011208713985979557, -0.012049044482409954, 0.010762692429125309, 0.01282473374158144, 0.012288215570151806, -0.036069564521312714, -0.02650272659957409, -0.04535198211669922, -0.034595753997564316, 0.00479957927018404, -0.019844723865389824, -0.011745233088731766, -0.0420682318508625, -0.02290869876742363, -0.03637984022498131, -0.005058142356574535, 0.013277219608426094, 0.004010961391031742, 0.01919831708073616, -0.01

In [9]:
### Compare every unordered pair of sentences exactly once (j always > i)
for i, left in enumerate(sentences):
    for j, right in enumerate(sentences[i + 1:], start=i + 1):
        similarity = cosine_similarity(sentence_embeddings[i], sentence_embeddings[j])
        # One print per pair: the pair line, the score line, then a blank separator line.
        print(f"'{left}' vs '{right}'\nsimilarity:{similarity:.3f}\n")

'The cat sat on the mat' vs 'A feline rested on the rug'
similarity:0.655

'The cat sat on the mat' vs 'The dog played in the yard'
similarity:0.324

'The cat sat on the mat' vs 'I love programming in Python'
similarity:0.089

'The cat sat on the mat' vs 'Python is my favorite programming language'
similarity:0.120

'A feline rested on the rug' vs 'The dog played in the yard'
similarity:0.296

'A feline rested on the rug' vs 'I love programming in Python'
similarity:0.055

'A feline rested on the rug' vs 'Python is my favorite programming language'
similarity:0.103

'The dog played in the yard' vs 'I love programming in Python'
similarity:0.126

'The dog played in the yard' vs 'Python is my favorite programming language'
similarity:0.085

'I love programming in Python' vs 'Python is my favorite programming language'
similarity:0.708



In [10]:
### Example - Semantic search: retrieve the documents most similar to a query
# Small in-memory corpus and the query used to test semantic search
documents = [
    "LangChain is a framework for developing applications powered by language models",
    "Python is a high-level programming language",
    "Machine learning is a subset of artificial intelligence",
    "Embeddings convert text into numerical vectors",
    "The weather today is sunny and warm",
]
query = "What is Langchain?"

In [11]:
def semantic_search(query, documents, embeddings_models, top_k=3):
    """Rank ``documents`` by cosine similarity to ``query``.

    Args:
        query: Text to search for.
        documents: Iterable of documents to rank.
        embeddings_models: Embedding client exposing ``embed_query(text)`` and
            ``embed_documents(texts)`` (for example an ``OpenAIEmbeddings``
            instance).
        top_k: Maximum number of results to return.

    Returns:
        A list of ``(similarity, document)`` tuples, best match first, with at
        most ``top_k`` entries. Empty when ``documents`` is empty.
    """
    documents = list(documents)
    if not documents:
        # Nothing to rank; skip the embedding calls entirely.
        return []

    # Use the client that was passed in. The previous body ignored this
    # parameter and silently relied on a notebook-level `embeddings` global.
    query_embedding = embeddings_models.embed_query(query)
    doc_embeddings = embeddings_models.embed_documents(documents)

    # Pair each similarity score with the document it belongs to.
    similarities = [
        (cosine_similarity(query_embedding, doc_emb), doc)
        for doc_emb, doc in zip(doc_embeddings, documents)
    ]

    # Sort on the score only, so tied scores never fall back to comparing
    # the documents themselves (ties keep their input order).
    similarities.sort(key=lambda pair: pair[0], reverse=True)
    return similarities[:top_k]



In [13]:
# Run the search for the example query with the default top_k.
result = semantic_search(query, documents, embeddings)

# Report each hit as "Score | document"; result holds (score, document) tuples.
print(f"Semantic Search Result for : '{query}'")
for score, doc in result:
    print(f"Score: {score:.3f} | {doc}")

Semantic Search Result for : 'What is Langchain?'
Score: 0.676 | LangChain is a framework for developing applications powered by language models
Score: 0.130 | Python is a high-level programming language
Score: 0.101 | Embeddings convert text into numerical vectors
