In [29]:
# ── standard library & env ─────────────────────────────────────────────────────
import os                            # read API keys / file paths
from dotenv import load_dotenv       # load OPENAI_API_KEY (and others) from .env

# ── document loaders (ingest your files) ───────────────────────────────────────
# Use only what you need; these come from langchain_community
from langchain_community.document_loaders import (
    PyPDFLoader,                     # load PDFs page-wise
    DirectoryLoader,                 # load a whole folder of files
    Docx2txtLoader,                  # .docx support (optional)
    CSVLoader,                       # .csv support (optional)
    UnstructuredMarkdownLoader,      # .md support (optional; requires unstructured)
)

# ── core data types ────────────────────────────────────────────────────────────
from langchain_core.documents import Document  # LangChain's Document wrapper

# ── text splitting / chunking ─────────────────────────────────────────────────
from langchain_text_splitters import RecursiveCharacterTextSplitter
# smart splitter that respects boundaries and target chunk sizes

# ── embeddings & LLMs (OpenAI stack; swap if you prefer HF, etc.) ─────────────
from langchain_openai import OpenAIEmbeddings   # create dense vectors for chunks
from langchain_openai import ChatOpenAI         # chat LLM used to answer with context

# ── vector store (where chunks+embeddings live) ────────────────────────────────
from langchain_community.vectorstores import FAISS
# in-memory/on-disk ANN index; easy to start with, fast on a single machine

# ── prompts & chains (wire retriever + LLM together) ───────────────────────────
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
# builds a “stuff” chain that injects retrieved docs into a prompt

from langchain.chains import create_retrieval_chain
# connects your retriever (FAISS) with the document-combining LLM chain

# ── (optional) runnable & parsing utilities for custom pipelines ──────────────
from langchain_core.runnables import RunnablePassthrough  # for custom LCEL graphs
from langchain_core.output_parsers import StrOutputParser # parse LLM output to text

# ── (optional) typing & simple logging ─────────────────────────────────────────
from typing import List, Optional
import logging


from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever


In [6]:
# Location of the PDF to index. Defaults to the original local file; set the
# PDF_QA_PATH environment variable (before running this cell) to point the
# notebook at a different PDF without editing the code.
path = os.environ.get("PDF_QA_PATH") or (
    r"D:\LANGCHAIN\Projects\PDF_QA\Explainable_Video_Topics_for_Content_Taxonomy_A_Multimodal_Retrieval_Approach_to_Industry-Compliant_Contextual_Advertising 1 (1).pdf"
)

In [16]:
def ingest_documents(path: str):
    """
    Read the PDF located at ``path`` and hand back the Documents the loader produces.

    Only PDF input is supported, because the file is read with PyPDFLoader.
    """
    return PyPDFLoader(path).load()

In [17]:
# Smoke-test the loader on the PDF at `path`. As the cell's last expression,
# the returned list of Documents is echoed in full as the cell output.
ingest_documents(path)

[Document(metadata={'producer': 'pdfTeX-1.40.24; modified using iText® Core 7.2.4 (AGPL version) ©2000-2022 iText Group NV', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-02-18T14:00:40+05:30', 'moddate': '2025-02-19T10:27:27-05:00', 'ieee article id': '10890950', 'trapped': 'False', 'ieee issue id': '10820123', 'subject': 'IEEE Access;2025;13; ;10.1109/ACCESS.2025.3542562', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.24 (TeX Live 2022) kpathsea version 6.3.4', 'ieee publication id': '6287639', 'title': 'Explainable Video Topics for Content Taxonomy: A Multimodal Retrieval Approach to Industry-Compliant Contextual Advertising', 'source': 'D:\\LANGCHAIN\\Projects\\PDF_QA\\Explainable_Video_Topics_for_Content_Taxonomy_A_Multimodal_Retrieval_Approach_to_Industry-Compliant_Contextual_Advertising 1 (1).pdf', 'total_pages': 16, 'page': 0, 'page_label': '30597'}, page_content='Received 1 February 2025, accepted 11 February 2025, date of publication 14 February 2

In [18]:
def split_documents(docs, chunk_size=1000, chunk_overlap=200):
    """
    Break ingested Documents into smaller, overlapping chunks for embedding & retrieval.

    Args:
        docs: Documents returned by the ingestion step.
        chunk_size: target size of each chunk, in characters.
        chunk_overlap: characters shared by neighbouring chunks so that context
            is not lost at chunk boundaries.

    Returns:
        The chunked Documents.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(docs)

In [30]:
# def build_retriever(chunks, k: int = 4):
#     # requires OPENAI_API_KEY in your env
#     embeddings = OpenAIEmbeddings()
#     vectordb = FAISS.from_documents(chunks, embeddings)
#     return vectordb.as_retriever(search_kwargs={"k": k})


def build_hybrid_retriever(chunks, k: int = 6, alpha: float = 0.6):
    """
    Hybrid = BM25 (keywords) + FAISS (dense), fused by an EnsembleRetriever.

    Args:
        chunks: chunked Documents to index. Any iterable is accepted; it must
            yield at least one chunk.
        k: number of results requested from each underlying retriever (>= 1).
        alpha: weight for the dense retriever, in [0, 1]; BM25 gets (1 - alpha).

    Returns:
        An EnsembleRetriever over the BM25 and dense retrievers, weighted
        [1 - alpha, alpha].

    Raises:
        ValueError: if ``chunks`` is empty, ``k`` is below 1, or ``alpha`` is
            outside [0, 1].
    """
    # Validate up front, before either index is built, so bad arguments fail
    # with a clear message instead of an obscure error from the index libraries
    # (or, for alpha, a silently negative ensemble weight).
    chunks = list(chunks)  # materialise once: both retrievers consume the chunks
    if not chunks:
        raise ValueError("chunks must contain at least one document")
    if k < 1:
        raise ValueError(f"k must be >= 1, got {k!r}")
    if not 0.0 <= alpha <= 1.0:
        raise ValueError(f"alpha must be within [0, 1], got {alpha!r}")

    # Keyword retriever (great for 'DOI', dates, author names)
    bm25 = BM25Retriever.from_documents(chunks)
    bm25.k = k

    # Dense retriever (semantic); OpenAIEmbeddings requires OPENAI_API_KEY in the env
    embeddings = OpenAIEmbeddings()
    vectordb = FAISS.from_documents(chunks, embeddings)
    dense = vectordb.as_retriever(search_kwargs={"k": k})

    # Fuse with reciprocal-rank style; weights follow the order of `retrievers`
    return EnsembleRetriever(retrievers=[bm25, dense],
                             weights=[1.0 - alpha, alpha])


In [None]:
def make_qa_chain(retriever, model: str = "gpt-4o-mini"):
    """
    Assemble a retrieval QA chain around ``retriever``.

    A ChatOpenAI model (``model``, temperature 0) answers a prompt that tells it
    to use only the supplied context; the prompt takes the question as
    ``{input}`` and the documents as ``{context}``.

    Returns:
        The chain produced by ``create_retrieval_chain``.
    """
    chat_model = ChatOpenAI(model=model, temperature=0)
    qa_prompt = ChatPromptTemplate.from_template(
        "Use ONLY the context to answer. If not found, say you don't know.\n\n"
        "Question: {input}\n\nContext:\n{context}"
    )
    return create_retrieval_chain(
        retriever,
        create_stuff_documents_chain(chat_model, qa_prompt),
    )

In [32]:
# def ask(question: str, chunks):
#     retriever = build_retriever(chunks, k=4)
#     qa = make_qa_chain(retriever)
#     resp = qa.invoke({"input": question})

#     # pretty sources (filename + 1-based page if available)
#     sources = []
#     for d in resp.get("context", []):
#         meta = d.metadata or {}
#         page = meta.get("page")
#         sources.append({
#             "file": meta.get("filename") or meta.get("source"),
#             "page": (page + 1) if isinstance(page, int) else page
#         })
#     return resp.get("answer"), sources


def ask(question: str, retriever):
    """
    Answer ``question`` with a QA chain built on ``retriever`` and list the sources used.

    Returns:
        An ``(answer, sources)`` tuple. ``sources`` holds one dict per document in
        the response's "context", with its "file" (metadata "filename", falling
        back to "source") and "page" (shifted by +1 when the stored page is an int).
    """
    result = make_qa_chain(retriever).invoke({"input": question})

    def describe(doc):
        # Metadata may be missing/empty; treat that as "no fields available".
        info = doc.metadata or {}
        page_no = info.get("page")
        return {
            "file": info.get("filename") or info.get("source"),
            "page": page_no + 1 if isinstance(page_no, int) else page_no,
        }

    cited = [describe(doc) for doc in result.get("context", [])]
    return result.get("answer"), cited



In [36]:
load_dotenv()  # load OPENAI_API_KEY (and any other settings) from .env into the environment

# load_dotenv() only loads what the .env file contains; it does not guarantee the
# key exists. Fail fast with a clear message rather than erroring later while the
# embeddings / chat model are being created.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it or add it to your .env file."
    )

docs   = ingest_documents(path)
chunks = split_documents(docs, chunk_size=1200, chunk_overlap=200)

# Use HYBRID for exact fields like DOI
retriever = build_hybrid_retriever(chunks, k=6, alpha=0.4)  # alpha=0.4 -> dense 0.4, BM25 0.6 (slightly BM25-heavy)

answer, sources = ask("what is authors email adress?", retriever)
print("Answer:", answer)
print("Sources:", sources)


Answer: The author's email address is anil.fernando@strath.ac.uk.
Sources: [{'file': 'D:\\LANGCHAIN\\Projects\\PDF_QA\\Explainable_Video_Topics_for_Content_Taxonomy_A_Multimodal_Retrieval_Approach_to_Industry-Compliant_Contextual_Advertising 1 (1).pdf', 'page': 14}, {'file': 'D:\\LANGCHAIN\\Projects\\PDF_QA\\Explainable_Video_Topics_for_Content_Taxonomy_A_Multimodal_Retrieval_Approach_to_Industry-Compliant_Contextual_Advertising 1 (1).pdf', 'page': 10}, {'file': 'D:\\LANGCHAIN\\Projects\\PDF_QA\\Explainable_Video_Topics_for_Content_Taxonomy_A_Multimodal_Retrieval_Approach_to_Industry-Compliant_Contextual_Advertising 1 (1).pdf', 'page': 6}, {'file': 'D:\\LANGCHAIN\\Projects\\PDF_QA\\Explainable_Video_Topics_for_Content_Taxonomy_A_Multimodal_Retrieval_Approach_to_Industry-Compliant_Contextual_Advertising 1 (1).pdf', 'page': 2}, {'file': 'D:\\LANGCHAIN\\Projects\\PDF_QA\\Explainable_Video_Topics_for_Content_Taxonomy_A_Multimodal_Retrieval_Approach_to_Industry-Compliant_Contextual_Advertis

In [None]:
# Repeat the query with "authors" misspelled as "authers", reusing the same
# retriever. The output recorded for this cell is "I don't know." — presumably
# a check of how a typo in the key term affects retrieval (confirm intent).
answer, sources = ask("what is authers email adress?", retriever)
print("Answer:", answer)
print("Sources:", sources)


Answer: I don't know.
Sources: [{'file': 'D:\\LANGCHAIN\\Projects\\PDF_QA\\Explainable_Video_Topics_for_Content_Taxonomy_A_Multimodal_Retrieval_Approach_to_Industry-Compliant_Contextual_Advertising 1 (1).pdf', 'page': 10}, {'file': 'D:\\LANGCHAIN\\Projects\\PDF_QA\\Explainable_Video_Topics_for_Content_Taxonomy_A_Multimodal_Retrieval_Approach_to_Industry-Compliant_Contextual_Advertising 1 (1).pdf', 'page': 6}, {'file': 'D:\\LANGCHAIN\\Projects\\PDF_QA\\Explainable_Video_Topics_for_Content_Taxonomy_A_Multimodal_Retrieval_Approach_to_Industry-Compliant_Contextual_Advertising 1 (1).pdf', 'page': 2}, {'file': 'D:\\LANGCHAIN\\Projects\\PDF_QA\\Explainable_Video_Topics_for_Content_Taxonomy_A_Multimodal_Retrieval_Approach_to_Industry-Compliant_Contextual_Advertising 1 (1).pdf', 'page': 9}, {'file': 'D:\\LANGCHAIN\\Projects\\PDF_QA\\Explainable_Video_Topics_for_Content_Taxonomy_A_Multimodal_Retrieval_Approach_to_Industry-Compliant_Contextual_Advertising 1 (1).pdf', 'page': 9}, {'file': 'D:\\LAN