In [36]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
import torch
import re

# Step 1: Load PDF (using LangChain's PyPDFLoader)


In [49]:

def load_pdf_langchain(file_path):
    """Read a PDF through PyPDFLoader and return its text as a single string.

    Args:
        file_path: Path of the PDF handed to ``PyPDFLoader``.

    Returns:
        The ``page_content`` of every loaded document, each followed by a
        newline, concatenated in the order the loader returned them.
    """
    pages = PyPDFLoader(file_path).load()
    # Every document contributes its content plus one trailing newline
    # (the last document included).
    full_text = "".join(page.page_content + "\n" for page in pages)
    print("✅ PDF loaded successfully using Langchain. Characters:", len(full_text))
    return full_text

# Step 2: Split into chunks (using Langchain's RecursiveCharacterTextSplitter)

In [50]:
def split_text_langchain(text, chunk_size=500, overlap=100):
    """Cut ``text`` into chunks with RecursiveCharacterTextSplitter.

    Args:
        text: The string to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The value returned by the splitter's ``split_text`` for ``text``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": overlap}
    pieces = RecursiveCharacterTextSplitter(**splitter_options).split_text(text)
    print(f"Generated {len(pieces)} text chunks using Langchain splitter")
    return pieces

# Step 3: Embeddings (using HuggingFaceEmbeddings)

In [51]:
def get_embeddings_model(model_name="paraphrase-multilingual-mpnet-base-v2"):
    """Create the HuggingFaceEmbeddings object used to embed text chunks.

    Args:
        model_name: Identifier passed to ``HuggingFaceEmbeddings`` as
            ``model_name``. Defaults to the multilingual model this notebook
            was written against, so existing zero-argument calls behave
            exactly as before.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with ``model_name``.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

# Step 4: Build FAISS index (using Langchain's FAISS wrapper)

In [52]:

def build_faiss_index(chunks, embeddings_model):
    """Embed ``chunks`` and store them in a FAISS vector store.

    Args:
        chunks: Text chunks to index; must contain at least one item.
        embeddings_model: Embeddings object handed to ``FAISS.from_texts``.

    Returns:
        The vector store returned by ``FAISS.from_texts``.

    Raises:
        ValueError: If ``chunks`` is empty.
    """
    # An empty chunk list (e.g. a PDF with no extractable text) would
    # otherwise surface as an obscure failure inside FAISS.from_texts;
    # fail early with an actionable message instead.
    if not chunks:
        raise ValueError(
            "Cannot build a FAISS index from zero text chunks; "
            "check that the source document contains extractable text."
        )
    vectorstore = FAISS.from_texts(chunks, embeddings_model)
    print("✅ FAISS index built successfully.")
    return vectorstore

# Step 5: Initialize the QA pipeline


In [53]:
def get_qa_pipeline():
    """Build the question-answering pipeline used to extract answers.

    Loads the tokenizer and the question-answering model from the same
    checkpoint and wraps them in a ``"question-answering"`` pipeline.

    Returns:
        The object returned by ``pipeline(...)``.
    """
    checkpoint = "sagorsarker/mbert-bengali-tydiqa-qa"  # Specialized for Bengali QA
    qa_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    qa_model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)

    # Device index 0 when CUDA is available, otherwise -1.
    if torch.cuda.is_available():
        device_index = 0
    else:
        device_index = -1

    pipeline_options = {
        "model": qa_model,
        "tokenizer": qa_tokenizer,
        "device": device_index,
        "max_seq_len": 512,  # Explicitly set max sequence length
        "max_question_len": 64,
        "doc_stride": 128,
    }
    return pipeline("question-answering", **pipeline_options)

# Step 6: Retrieve relevant documents using the vectorstore

In [54]:
def retrieve_documents(vectorstore, query, k=15):
    """Run a similarity search for ``query`` against ``vectorstore``.

    Args:
        vectorstore: Object exposing ``similarity_search(query, k=...)``.
        query: The search text.
        k: Forwarded to ``similarity_search`` as ``k``.

    Returns:
        Whatever ``vectorstore.similarity_search`` returns.
    """
    matches = vectorstore.similarity_search(query, k=k)
    return matches

# Step 7: Generate answer using the QA pipeline with explicit context


In [55]:
def generate_answer_with_pipeline(qa_pipeline, documents, question):
    """Answer ``question`` from the combined text of ``documents``.

    Args:
        qa_pipeline: Callable invoked with ``question=`` and ``context=``
            keyword arguments; its result is indexed with ``'answer'``.
        documents: Iterable of objects carrying a ``page_content`` string.
        question: The question text.

    Returns:
        The ``'answer'`` entry of the pipeline result.
    """
    # Merge all passages into one context, separated by a blank line.
    passages = (doc.page_content for doc in documents)
    merged_context = "\n\n".join(passages)

    # The pipeline takes the question and the context as keyword arguments.
    prediction = qa_pipeline(question=question, context=merged_context)
    return prediction['answer']

# ---- MAIN WORKFLOW ----


In [34]:
%pip install pypdf

Collecting pypdf
  Downloading pypdf-5.8.0-py3-none-any.whl.metadata (7.1 kB)
Downloading pypdf-5.8.0-py3-none-any.whl (309 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/309.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m307.2/309.7 kB[0m [31m13.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.7/309.7 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-5.8.0


In [57]:
if __name__ == "__main__":
    # End-to-end run: PDF -> text chunks -> FAISS index -> retrieval -> extractive QA.
    # Every name assigned below lives at module (notebook) scope.

    # Load and prepare PDF
    # Relative path: resolved against the current working directory.
    pdf_path = "Updated_HSC26-Bangla1st-Paper.pdf"
    text = load_pdf_langchain(pdf_path)
    # Uses the splitter defaults (chunk_size=500, overlap=100).
    chunks = split_text_langchain(text)

    # Get embeddings and build index
    embeddings_model = get_embeddings_model()
    vectorstore = build_faiss_index(chunks, embeddings_model)

    # Initialize QA pipeline
    qa_pipeline = get_qa_pipeline()


    # Ask question in Bangla
    question = "বিয়ের সময় কল্যাণীর প্রকৃত বয়স কত ছিল?"

    # Retrieve relevant documents
    # Uses the default k=15 of retrieve_documents.
    retrieved_docs = retrieve_documents(vectorstore, question)

    # Generate answer using the pipeline
    # All retrieved chunks are merged into a single context for the QA model.
    answer = generate_answer_with_pipeline(qa_pipeline, retrieved_docs, question)

    print(f"\n❓ Question: {question}")
    print(f"🟢 Answer: {answer}")

✅ PDF loaded successfully using Langchain. Characters: 88602
Generated 224 text chunks using Langchain splitter
✅ FAISS index built successfully.


Device set to use cpu



❓ Question: বিয়ের সময় কল্যাণীর প্রকৃত বয়স কত ছিল?
🟢 Answer: বিল? 
(ক) ২১ ব্িি
