In [2]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [11]:
# Sentence-embedding model used to compare neighbouring sentences.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Demo corpus, one sentence per line. The string starts and ends with a
# newline; blank lines are discarded when the text is split later on.
text = (
    "\n"
    "Machine learning is a subset of artificial intelligence.\n"
    "It focuses on building systems that learn from data.\n"
    "These systems improve their performance over time.\n"
    "Neural networks are a popular machine learning technique.\n"
    "They are inspired by the human brain's structure.\n"
    "Deep learning uses multiple layers of neural networks.\n"
    "Convolutional neural networks excel at image processing.\n"
    "They can identify objects in photographs.\n"
    "Natural language processing deals with text and speech.\n"
    "It enables computers to understand human language.\n"
    "Chatbots use NLP to communicate with users.\n"
    "Sentiment analysis determines emotions in text.\n"
    "Reinforcement learning trains agents through rewards.\n"
    "The agent learns by trial and error.\n"
    "It maximizes cumulative rewards over time.\n"
)

In [14]:
# One sentence per non-blank line, with surrounding whitespace removed.
stripped_lines = (line.strip() for line in text.split("\n"))
sentences = [line for line in stripped_lines if line]

# One embedding per sentence, in the same order as `sentences`.
embeddings = model.encode(sentences)

# Minimum cosine similarity for a sentence to stay in the current chunk.
threshold = 0.4
chunks = []
current_chunk = [sentences[0]]

# Compare each sentence with the one right before it; a similarity below the
# threshold closes the running chunk and starts a new one.
for i, sentence in enumerate(sentences[1:], start=1):
    previous_vec, current_vec = embeddings[i - 1], embeddings[i]
    sim = cosine_similarity([previous_vec], [current_vec])[0][0]
    print(f"Sentence {i}: similarity with previous = {sim:.4f}")

    if sim >= threshold:
        current_chunk.append(sentence)
    else:
        chunks.append(" ".join(current_chunk))
        current_chunk = [sentence]

# The last running chunk is never closed inside the loop, so flush it here.
chunks.append(" ".join(current_chunk))

print("\n Semantic chunks: ")
for number, chunk in enumerate(chunks, start=1):
    print(f"\nChunk {number}: \n{chunk}")

Sentence 1: similarity with previous = 0.4330
Sentence 2: similarity with previous = 0.3824
Sentence 3: similarity with previous = 0.3049
Sentence 4: similarity with previous = 0.4064
Sentence 5: similarity with previous = 0.3178
Sentence 6: similarity with previous = 0.5285
Sentence 7: similarity with previous = 0.3915
Sentence 8: similarity with previous = 0.1868
Sentence 9: similarity with previous = 0.4806
Sentence 10: similarity with previous = 0.4714
Sentence 11: similarity with previous = 0.3549
Sentence 12: similarity with previous = 0.1047
Sentence 13: similarity with previous = 0.6324
Sentence 14: similarity with previous = 0.2210

 Semantic chunks: 

Chunk 1: 
Machine learning is a subset of artificial intelligence. It focuses on building systems that learn from data.

Chunk 2: 
These systems improve their performance over time.

Chunk 3: 
Neural networks are a popular machine learning technique. They are inspired by the human brain's structure.

Chunk 4: 
Deep learning uses

#### Modular RAG Pipeline

In [30]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from langchain_core.documents import Document
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chat_models import init_chat_model
from langchain_core.runnables import RunnableLambda, RunnableMap
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
import os
# The Groq chat model used later needs GROQ_API_KEY in the environment.
# Re-assigning os.environ from os.getenv() is a no-op when the variable is set
# and raises an opaque TypeError (environment values must be str) when it is
# not, so check for the key explicitly and warn with an actionable message.
if not os.getenv("GROQ_API_KEY"):
    import warnings

    warnings.warn(
        "GROQ_API_KEY is not set; export it before starting the kernel, "
        "otherwise the Groq chat model cannot authenticate.",
        stacklevel=2,
    )


In [31]:
### Custom Semantic Chunker With Threshold

class ThresholdSematicChunker:
    """Group consecutive sentences into chunks by embedding similarity.

    Each sentence is compared with the sentence immediately before it. While
    the cosine similarity of their embeddings stays at or above ``threshold``
    the sentences are kept in the same chunk; a lower similarity starts a new
    chunk.

    Limitation: sentences are found by splitting on the literal ``'.'``
    character, so decimals and abbreviations are split as well, and every
    emitted chunk is terminated with a single ``'.'``.
    """

    def __init__(self,model_name="all-MiniLM-L6-v2",threshold=0.7):
        """Load the embedding model and store the similarity threshold.

        Args:
            model_name: Name passed to ``SentenceTransformer``.
            threshold: Minimum cosine similarity between neighbouring
                sentences for them to share a chunk.
        """
        self.model = SentenceTransformer(model_name)
        self.threshold = threshold

    def split(self, text: str):
        """Split ``text`` into a list of chunk strings.

        Args:
            text: Raw text whose sentences are separated by ``'.'``.

        Returns:
            A list of chunks, each a ``". "``-joined run of sentences ending
            with ``"."``. Returns an empty list when ``text`` contains no
            sentences (empty or whitespace/period-only input).
        """
        sentences = [s.strip() for s in text.split('.') if s.strip()]
        if not sentences:
            # Nothing to embed; previously this fell through to sentences[0]
            # and raised IndexError.
            return []

        embeddings = self.model.encode(sentences)
        chunks = []
        current_chunk = [sentences[0]]

        for i in range(1, len(sentences)):
            sim = cosine_similarity([embeddings[i - 1]], [embeddings[i]])[0][0]
            if sim >= self.threshold:
                current_chunk.append(sentences[i])
            else:
                chunks.append(". ".join(current_chunk) + ".")
                current_chunk = [sentences[i]]

        # Flush the chunk that was still open when the loop ended.
        chunks.append(". ".join(current_chunk) + ".")
        return chunks

    def split_documents(self,docs):
        """Chunk every document and wrap each chunk in a new ``Document``.

        Args:
            docs: Iterable of objects exposing ``page_content`` and
                ``metadata`` attributes.

        Returns:
            A flat list of ``Document`` objects, in input order. Each chunk
            receives its own shallow copy of the source metadata so that
            editing one chunk's metadata does not affect its siblings or the
            source document.
        """
        result = []
        for doc in docs:
            for chunk in self.split(doc.page_content):
                result.append(
                    Document(page_content=chunk, metadata=dict(doc.metadata))
                )

        return result


# Correctly spelled alias; the original (misspelled) name is kept so existing
# callers continue to work.
ThresholdSemanticChunker = ThresholdSematicChunker

    

In [32]:
# Sample text: three sentences about LangChain followed by two unrelated
# sentences about France. The value begins and ends with a newline.
sample_text = (
    "\n"
    "LangChain is a framework for building applications with LLMs.\n"
    "Langchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.\n"
    "You can create chains, agents, memory, and retrievers.\n"
    "The Eiffel Tower is located in Paris.\n"
    "France is a popular tourist destination.\n"
)

# Wrap the raw text in a Document (no metadata supplied) and display it.
doc = Document(page_content=sample_text)
doc

Document(metadata={}, page_content='\nLangChain is a framework for building applications with LLMs.\nLangchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.\nYou can create chains, agents, memory, and retrievers.\nThe Eiffel Tower is located in Paris.\nFrance is a popular tourist destination.\n')

In [33]:
### Chunking
# Split the sample document; neighbouring sentences share a chunk only when
# their similarity reaches the 0.7 threshold.
chunker = ThresholdSematicChunker(threshold=0.7)
chunks = chunker.split_documents([doc])
chunks

[Document(metadata={}, page_content='LangChain is a framework for building applications with LLMs. Langchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.'),
 Document(metadata={}, page_content='You can create chains, agents, memory, and retrievers.'),
 Document(metadata={}, page_content='The Eiffel Tower is located in Paris.'),
 Document(metadata={}, page_content='France is a popular tourist destination.')]

In [34]:
## Prompt Template

# The template has two placeholders, filled in by the chain at run time:
# {context} and {question}.
template = (
    "Answer the question based on the following context:\n"
    "\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
)

prompt = PromptTemplate.from_template(template)
prompt

PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='Answer the question based on the following context:\n\n{context}\n\nQuestion: {question}\n')

In [36]:
### VectorStore
import os

# Assigning os.getenv("GROQ_API_KEY") back into os.environ does nothing when
# the key is set and raises TypeError when it is missing (environment values
# must be str). Check for the key explicitly and warn instead.
if not os.getenv("GROQ_API_KEY"):
    import warnings

    warnings.warn(
        "GROQ_API_KEY is not set; export it before starting the kernel, "
        "otherwise the Groq chat model cannot authenticate.",
        stacklevel=2,
    )

# No model name is passed, so the embedding class falls back to its own
# default model.
embedding = HuggingFaceEmbeddings()

# Index the semantic chunks in FAISS and expose the index as a retriever
# (default retriever settings).
vectorstore = FAISS.from_documents(chunks, embedding)
retriever = vectorstore.as_retriever()



In [38]:
## LLM
llm = init_chat_model(model="groq:llama-3.3-70b-versatile", temperature=0.4)


def _format_docs(docs):
    """Join the text of the retrieved documents, separated by blank lines."""
    return "\n\n".join(doc.page_content for doc in docs)


### LCEL Chain With retrieval

# The retriever returns a list of Document objects. Passing that list straight
# into the prompt would insert its Python repr (metadata, quoting and all), so
# the documents are flattened to plain text before they reach {context}.
rag_chain = (
    RunnableMap(
        {
            "context": lambda x: _format_docs(retriever.invoke(x["question"])),
            "question": lambda x: x["question"],
        }
    )
    | prompt
    | llm
    | StrOutputParser()
)

# --- 8. Run Query ---
query = {"question": "What is LangChain used for?"}
result = rag_chain.invoke(query)

print(result)

LangChain is a framework used for building applications with Large Language Models (LLMs). It provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.
