In [2]:
!pip uninstall -y langchain langchain-core langchain-community langchain-google-genai
!pip install -U langchain-core langchain-community langchain-google-genai google-generativeai


Found existing installation: langchain-core 1.0.4
Uninstalling langchain-core-1.0.4:
  Successfully uninstalled langchain-core-1.0.4
Found existing installation: langchain-community 0.4.1
Uninstalling langchain-community-0.4.1:
  Successfully uninstalled langchain-community-0.4.1
Found existing installation: langchain-google-genai 0.0.1
Uninstalling langchain-google-genai-0.0.1:
  Successfully uninstalled langchain-google-genai-0.0.1




Collecting langchain-core
  Using cached langchain_core-1.0.4-py3-none-any.whl.metadata (3.5 kB)
Collecting langchain-community
  Using cached langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-google-genai
  Using cached langchain_google_genai-3.0.2-py3-none-any.whl.metadata (2.7 kB)
Collecting google-generativeai
  Using cached google_generativeai-0.8.5-py3-none-any.whl.metadata (3.9 kB)
Collecting google-ai-generativelanguage<1.0.0,>=0.7.0 (from langchain-google-genai)
  Using cached google_ai_generativelanguage-0.9.0-py3-none-any.whl.metadata (10 kB)
INFO: pip is looking at multiple versions of google-generativeai to determine which version is compatible with other requirements. This could take a while.
Collecting google-generativeai
  Using cached google_generativeai-0.8.4-py3-none-any.whl.metadata (4.2 kB)
  Using cached google_generativeai-0.8.3-py3-none-any.whl.metadata (3.9 kB)
  Using cached google_generativeai-0.8.2-py3-none-any.whl.metadata (3

In [1]:
import os
import json
import numpy as np
from dotenv import load_dotenv

# LangChain Core
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# PDF Tools
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Google/FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy

In [None]:



# 1Ô∏è‚É£ Set environment variables so all later cells can use them
os.environ["HUGGINGFACEHUB_API_TOKEN"] = HF_TOKEN
os.environ["GEMINI_API_KEY"] = GEMINI_KEY


In [3]:
import os

# If GOOGLE_API_KEY is missing but GEMINI_API_KEY exists, map it
if "GOOGLE_API_KEY" not in os.environ and "GEMINI_API_KEY" in os.environ:
    os.environ["GOOGLE_API_KEY"] = os.environ["GEMINI_API_KEY"]

# Sanity check
print("GOOGLE_API_KEY set:", "GOOGLE_API_KEY" in os.environ)


GOOGLE_API_KEY set: True


In [5]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI

embeddings = GoogleGenerativeAIEmbeddings(model="text-embedding-004")
# Use the stable versioned alias
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash-001", temperature=0.2, max_output_tokens=512) 

# Quick tests
print("Embed dim:", len(embeddings.embed_query("hello world")))
print("LLM:", llm.invoke("In one line, what is RAG?").content)

Embed dim: 768
LLM: RAG (Retrieval-Augmented Generation) is a technique that enhances LLMs by retrieving relevant information from external knowledge sources and incorporating it into the generation process.


In [8]:
from langchain_community.document_loaders import PyMuPDFLoader
pdf_path = "./sample.pdf"  # Replace with your PDF path
loader = PyMuPDFLoader(pdf_path)
docs = loader.load()  # Returns list of Document objects (each page typically)

print(f"Loaded {len(docs)} documents (pages). Example page content head:\n", docs[0].page_content[:800])

Loaded 15 documents (pages). Example page content head:
 Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalistic or
scholarly works.
Attention Is All You Need
Ashish Vaswani‚àó
Google Brain
avaswani@google.com
Noam Shazeer‚àó
Google Brain
noam@google.com
Niki Parmar‚àó
Google Research
nikip@google.com
Jakob Uszkoreit‚àó
Google Research
usz@google.com
Llion Jones‚àó
Google Research
llion@google.com
Aidan N. Gomez‚àó‚Ä†
University of Toronto
aidan@cs.toronto.edu
≈Åukasz Kaiser‚àó
Google Brain
lukaszkaiser@google.com
Illia Polosukhin‚àó‚Ä°
illia.polosukhin@gmail.com
Abstract
The dominant sequence transduction models are based on complex recurrent or
convolutional neural networks that include an encoder and a decoder. The best
performing models also connect the encoder and 


In [9]:
# ============================
# Cell ‚Äî Deep clean PDF pages with enriched metadata
# ============================
import re
import unicodedata
from langchain_core.documents import Document

def improved_clean(documents, file_name):
    cleaned_docs = []
    for doc in documents:
        text = doc.page_content
        page_number = doc.metadata.get("page")  # Retrieve page number from metadata
        
        # Dummy metadata - replace with actual extraction logic (can be improved)
        title = "Sample Research Paper"  # This should be extracted from the PDF title or document header
        authors = "John Doe, Jane Smith"  # Extract authors from metadata or first pages
        publication_date = "2024-01-01"  # Extract publication date if available
        source_link = "https://arxiv.org/abs/123456"  # If available in the document metadata

        # 1) Unicode normalize (fix ligatures / odd widths)
        text = unicodedata.normalize("NFKC", text)

        # 2) Repair hyphenation across line breaks
        text = re.sub(r'(\w+)-\s*\n\s*(\w+)', r'\1-\2', text)

        # 3) Preserve paragraph breaks
        text = re.sub(r'\n{2,}', '<PAR>', text)  # mark paragraphs
        text = re.sub(r'[\r\n]+', ' ', text)     # flatten single newlines

        # 4) Remove bracketed numeric citations like [12]
        text = re.sub(r'\[\s*\d+\s*\]', '', text)

        # 5) Remove inline trailing citation digits glued to words (e.g., intelligence1.)
        text = re.sub(r'(?<=\w)(\d{1,3})(?=[\s\.,;:])', '', text)

        # 6) Remove long repeated digit runs (e.g., 1111, 1515151)
        text = re.sub(r'(\d)\1{3,}', '', text)

        # 7) Remove "Page 12" style markers
        text = re.sub(r'\bPage\s*\d+\b', '', text, flags=re.IGNORECASE)

        # 8) Strip control chars
        text = ''.join(ch for ch in text if unicodedata.category(ch)[0] != 'C')

        # 9) Normalize whitespace and restore paragraph breaks
        text = re.sub(r'[ \t\f\v]+', ' ', text)  # collapse horizontal whitespace
        text = text.replace('<PAR>', '\n\n')     # restore paragraphs
        text = re.sub(r' {2,}', ' ', text).strip()

        # 10) Targeted glyph fixes (extend if you see more)
        replacements = {
            'Trade-o∆Ø': 'Trade-off',
            'Tradeo∆Ø': 'Trade-off',
            'oe∆Ø': 'oeff',
            'coe∆Ø': 'coeff',
            '∆Ø': 'f',  # keep last: broadest
        }
        for k, v in replacements.items():
            text = text.replace(k, v)

        # Assign metadata (file_name, page number, title, authors, date, source_link)
        doc.metadata["source"] = file_name
        doc.metadata["page"] = page_number
        doc.metadata["title"] = title
        doc.metadata["authors"] = authors
        doc.metadata["publication_date"] = publication_date
        doc.metadata["source_link"] = source_link

        # Determine section heading
        if page_number in ["0", "1"]:
            doc.metadata["section_heading"] = "authors"
        elif "references" in text.lower() or "bibliography" in text.lower():
            doc.metadata["section_heading"] = "references"
        else:
            doc.metadata["section_heading"] = "body"

        # Append the cleaned document with updated metadata
        cleaned_docs.append(Document(page_content=text, metadata=doc.metadata))

    return cleaned_docs

# Apply to 'docs' (output of PyMuPDFLoader.load())
docs_deeper_cleaned = improved_clean(docs, "sample.pdf")  # Pass the file name for metadata

# Safe previews for first 2 cleaned docs
for i, d in enumerate(docs_deeper_cleaned[:2]):
    print(f"\n--- Deeply cleaned doc {i+1} (first 800 chars) ---\n")
    print(d.page_content[:800])
    print("\n--- End preview ---\n")

print(f"‚úÖ Cleaned pages: {len(docs_deeper_cleaned)}")


--- Deeply cleaned doc 1 (first 800 chars) ---

Provided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works. Attention Is All You Need Ashish Vaswani‚àó Google Brain avaswani@google.com Noam Shazeer‚àó Google Brain noam@google.com Niki Parmar‚àó Google Research nikip@google.com Jakob Uszkoreit‚àó Google Research usz@google.com Llion Jones‚àó Google Research llion@google.com Aidan N. Gomez‚àó‚Ä† University of Toronto aidan@cs.toronto.edu ≈Åukasz Kaiser‚àó Google Brain lukaszkaiser@google.com Illia Polosukhin‚àó‚Ä° illia.polosukhin@gmail.com Abstract The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and 

--- End preview ---


--- Deeply cleaned doc 2 (first 800 chars) ---

1 Introduction Recurrent neural networks, long short-term 

In [58]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=30,
    add_start_index=True,
    separators=("\n\n", "\n", ". ", " ", "")   # real newlines, not escaped
)

# Split pages -> chunks (metadata is auto-propagated; don‚Äôt remap it)
chunked_docs = splitter.split_documents(docs_deeper_cleaned)

# Per-chunk traceability; do NOT reassign parent metadata via modulo
for i, doc in enumerate(chunked_docs):
    start = doc.metadata.get("start_index")
    if start is not None:
        doc.metadata["char_start"] = start
        doc.metadata["char_end"] = start + len(doc.page_content)
        # (optional) drop the raw start_index if you don‚Äôt need it
        # del doc.metadata["start_index"]

    doc.metadata["chunk_id"] = i
    # (optional) ensure keys exist without breaking provenance
    doc.metadata.setdefault("title", "Unknown Title")
    doc.metadata.setdefault("authors", "Unknown Authors")
    doc.metadata.setdefault("publication_date", "Unknown Date")
    doc.metadata.setdefault("section_heading", "Unknown Section")

print(f"‚úÖ Total chunks created: {len(chunked_docs)}")
print("üìÑ Example chunk preview:\n")
print(chunked_docs[0].page_content[:800] if chunked_docs else "‚ö†Ô∏è No chunks produced.")


‚úÖ Total chunks created: 95
üìÑ Example chunk preview:

Provided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works. Attention Is All You Need Ashish Vaswani‚àó Google Brain avaswani@google.com Noam Shazeer‚àó Google Brain noam@google.com Niki Parmar‚àó Google Research nikip@google.com Jakob Uszkoreit‚àó Google Research usz@google.com Llion Jones‚àó Google Research llion@google.com Aidan N


In [59]:
import json
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=30,                 # match the creator cell (optional but tidy)
    add_start_index=True,
    separators=("\n\n", "\n", ". ", " ", ""),
)

chunked_docs = splitter.split_documents(docs_deeper_cleaned)

for i, doc in enumerate(chunked_docs):
    doc.metadata["chunk_id"] = i
    start = doc.metadata.get("start_index")
    if start is not None:
        doc.metadata["char_start"] = start
        doc.metadata["char_end"] = start + len(doc.page_content)
        # del doc.metadata["start_index"]  # optional

chunks_data = [{"text": d.page_content, "metadata": d.metadata} for d in chunked_docs]

output_path = "chunks.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(chunks_data, f, ensure_ascii=False, indent=2)

print(f"‚úÖ Saved {len(chunks_data)} chunks to {output_path}")
print("üìÑ Example chunk preview:")
print(chunks_data[0]["text"][:400] if chunks_data else "No chunks.")


‚úÖ Saved 95 chunks to chunks.json
üìÑ Example chunk preview:
Provided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works. Attention Is All You Need Ashish Vaswani‚àó Google Brain avaswani@google.com Noam Shazeer‚àó Google Brain noam@google.com Niki Parmar‚àó Google Research nikip@google.com Jakob Uszkoreit‚àó Google Research usz@google.com Llion Jones‚àó G


In [60]:
# 1) Embeddings
from langchain_google_genai import GoogleGenerativeAIEmbeddings
embeddings = GoogleGenerativeAIEmbeddings(model="text-embedding-004")

# 2) Vectorstore (FAISS)
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS

# Assume `docs` is a list[Document] (your chunked docs with metadata)
vectorstore = FAISS.from_documents(chunked_docs, embeddings)  # default IP; cosine-equivalent with unit vectors

# 3) Persist
vectorstore.save_local("faiss_index")

# 4) Reload (per docs)
reloaded = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)

# 5a) Retriever (top-k)
retriever = reloaded.as_retriever(search_type="similarity", search_kwargs={"k": 4})

# 5b) Retriever (thresholded; no manual scoring)
retriever = reloaded.as_retriever(search_type="similarity", search_kwargs={"k": 4})






In [61]:
query = "what is machine learning?"
retrieved_docs = retriever.invoke(query)

print("Retrieved Documents:")
for i, d in enumerate(retrieved_docs, 1):
    print(f"{i}. Page {d.metadata.get('page', '?')} ‚Äî {d.metadata.get('title', 'Untitled')}")
    print(d.page_content[:300], "...\n")


Retrieved Documents:
1. Page 10 ‚Äî Sample Research Paper
. Neural GPUs learn algorithms. In International Conference on Learning Representations (ICLR), 2. Nal Kalchbrenner, Lasse Espeholt, Karen Simonyan, Aaron van den Oord, Alex Graves, and Ko-ray Kavukcuoglu. Neural machine translation in linear time. arXiv preprint arXiv:1.10099v, 2. Yoon Kim, Carl De ...

2. Page 1 ‚Äî Sample Research Paper
. Here, the encoder maps an input sequence of symbol representations (x, ..., xn) to a sequence of continuous representations z = (z, ..., zn). Given z, the decoder then generates an output sequence (y, ..., ym) of symbols one element at a time. At each step the model is auto-regressive , consuming  ...

3. Page 9 ‚Äî Sample Research Paper
. CoRR, abs/1.0, 2. Denny Britz, Anna Goldie, Minh-Thang Luong, and Quoc V. Le. Massive exploration of neural machine translation architectures. CoRR, abs/1.03, 2. Jianpeng Cheng, Li Dong, and Mirella Lapata. Long short-term memory-networks for machine readi

In [103]:
# ‚úÖ RAG chain with strict citations (Gemini 2.5 Flash)
from pathlib import Path
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash")

def fmt_docs(docs):
    out = []
    for d in docs:
        src = Path(d.metadata.get("source","")).name or "document.pdf"
        pg = d.metadata.get("page","?")
        out.append(f"[source: {src}, p. {pg}]\n{d.page_content}")
    return "\n\n".join(out)

prompt = ChatPromptTemplate.from_messages([
   ("system",
     # 1. High-Value Persona and Core Directive
     "You are an **Expert Research Scientist** specializing in technical analysis of scientific papers. Your sole purpose is to provide highly accurate, factually grounded answers to the user's question.\n\n"
     
     # 2. Strict Grounding and Citation Rules
     "**CRITICAL INSTRUCTIONS:**\n"
     "1.  **STRICTLY GROUNDED:** Answer the question ONLY using the facts, findings, and figures present in the provided [CONTEXT] chunks.\n"
     "2.  **MANDATORY CITATION:** You **MUST** append an in-line citation (e.g., [source X]) immediately after every distinct fact or sentence derived from the context.\n"
     "3.  **SYNTHESIS:** Synthesize findings coherently and fluently. Do not simply list sentences. Structure your response using paragraphs and bullet points for clarity.\n"
     
     # 3. Flexible Escape Hatch
     "4. **FLEXIBLE ANSWER ESCAPE:** If the [CONTEXT] does not contain enough information to answer the question fully, you may respond with one of the following:\n"
     "  - If there is partial information: 'The paper provides some insights, but additional details may be needed to fully answer this question.'\n"
     "  - If no relevant information is found: 'The provided paper does not contain sufficient information to answer this question.'\n"
     "  - Always **indicate** the limitations in the answer to provide clarity about what information is available from the paper."
    ),
    ("human", "Question: {question}\n\nContext:\n{context}\n\nAnswer with strict citations.")
])

rag = (
    {"context": retriever | fmt_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
)


In [109]:
answer = rag.invoke("What did Lukasz contribute to the Attention Is All You Need paper?")
print(answer.content)

≈Åukasz Kaiser is listed as an author of the "Attention Is All You Need" paper, affiliated with Google Brain [source: sample.pdf, p. 0]. The provided context indicates his authorship, but it does not specify the particular aspects or sections of the paper he contributed to [source: sample.pdf, p. 0]. The paper provides some insights, but additional details may be needed to fully answer this question.
