In [6]:
!pip install pymupdf

Collecting pymupdf
  Downloading pymupdf-1.26.6-cp310-abi3-win_amd64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.6-cp310-abi3-win_amd64.whl (18.4 MB)
   ---------------------------------------- 0.0/18.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/18.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/18.4 MB ? eta -:--:--
    --------------------------------------- 0.3/18.4 MB ? eta -:--:--
   - -------------------------------------- 0.8/18.4 MB 2.2 MB/s eta 0:00:08
   -- ------------------------------------- 1.0/18.4 MB 1.7 MB/s eta 0:00:10
   ---- ----------------------------------- 2.1/18.4 MB 3.0 MB/s eta 0:00:06
   ------ --------------------------------- 2.9/18.4 MB 3.2 MB/s eta 0:00:05
   ------- -------------------------------- 3.4/18.4 MB 3.2 MB/s eta 0:00:05
   -------- ------------------------------- 3.9/18.4 MB 3.1 MB/s eta 0:00:05
   --------- ------------------------------ 4.5/18.4 MB 2.8 MB/s eta 0:00:05
   --------- ---------------

In [None]:
# ==== RAG Pipeline for LangChain v1.x (LCEL) ====
# langchain==1.0.x / core==1.0.x / community==0.4.x
import os
from pathlib import Path
from typing import List, Dict

# community integrations
from langchain_community.document_loaders import PyPDFLoader
from typing import List, Tuple
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS, DistanceStrategy
from langchain_community.llms import HuggingFaceHub  # swap if you prefer another LLM

import math
# LCEL primitives
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.documents import Document

In [None]:
# --- set your keys (use environment vars for safety) ---
# export or set them here before running:
# os.environ["HUGGINGFACEHUB_API_TOKEN"] = "hf_xxx"
# os.environ["GEMINI_API_KEY"] = "your_gemini_key"

HF_TOKEN = ""
GEMINI_KEY = ""


# 1Ô∏è‚É£ Set environment variables so all later cells can use them
os.environ["HUGGINGFACEHUB_API_TOKEN"] = HF_TOKEN
os.environ["GEMINI_API_KEY"] = GEMINI_KEY


# 2Ô∏è‚É£ Basic validation prints
print("HUGGINGFACEHUB_API_TOKEN set?", bool(os.getenv("HUGGINGFACEHUB_API_TOKEN")))
print("GEMINI_API_KEY set?", bool(os.getenv("GEMINI_API_KEY")))

# 3Ô∏è‚É£ Configure Gemini SDK if key provided
if GEMINI_KEY and not GEMINI_KEY.startswith("AIzaSyAl"):
    genai.configure(api_key=GEMINI_KEY)
    print("‚úÖ Gemini API configured")
else:
    print("‚ö†Ô∏è Gemini key appears default or missing ‚Äî skipping configuration")


HUGGINGFACEHUB_API_TOKEN set? True
GEMINI_API_KEY set? True
‚ö†Ô∏è Gemini key appears default or missing ‚Äî skipping configuration


In [9]:
# interactive upload, will be useful in streamlit


# from IPython.display import display
# import ipywidgets as widgets
# import shutil

# upload = widgets.FileUpload(accept='.pdf', multiple=False)
# display(upload)

# # after uploading
# def save_uploaded_pdf(upload_widget):
#     for filename, file_info in upload_widget.value.items():
#         with open(filename, 'wb') as f:
#             f.write(file_info['content'])
#         return filename

# pdf_path = save_uploaded_pdf(upload)
# print("Saved PDF:", pdf_path)

# Replace 'sample.pdf' with your PDF path (you can iterate a list of files)
from langchain_community.document_loaders import PyMuPDFLoader
pdf_path = "./sample.pdf"  # Replace with your PDF path
loader = PyMuPDFLoader(pdf_path)
docs = loader.load()  # Returns list of Document objects (each page typically)

print(f"Loaded {len(docs)} documents (pages). Example page content head:\n", docs[0].page_content[:800])




Loaded 15 documents (pages). Example page content head:
 Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalistic or
scholarly works.
Attention Is All You Need
Ashish Vaswani‚àó
Google Brain
avaswani@google.com
Noam Shazeer‚àó
Google Brain
noam@google.com
Niki Parmar‚àó
Google Research
nikip@google.com
Jakob Uszkoreit‚àó
Google Research
usz@google.com
Llion Jones‚àó
Google Research
llion@google.com
Aidan N. Gomez‚àó‚Ä†
University of Toronto
aidan@cs.toronto.edu
≈Åukasz Kaiser‚àó
Google Brain
lukaszkaiser@google.com
Illia Polosukhin‚àó‚Ä°
illia.polosukhin@gmail.com
Abstract
The dominant sequence transduction models are based on complex recurrent or
convolutional neural networks that include an encoder and a decoder. The best
performing models also connect the encoder and 


In [10]:
# ============================
# Cell ‚Äî Deep clean PDF pages with enriched metadata
# ============================
import re
import unicodedata
from langchain_core.documents import Document

def improved_clean(documents, file_name):
    cleaned_docs = []
    for doc in documents:
        text = doc.page_content
        page_number = doc.metadata.get("page")  # Retrieve page number from metadata
        
        # Dummy metadata - replace with actual extraction logic (can be improved)
        title = "Sample Research Paper"  # This should be extracted from the PDF title or document header
        authors = "John Doe, Jane Smith"  # Extract authors from metadata or first pages
        publication_date = "2024-01-01"  # Extract publication date if available
        source_link = "https://arxiv.org/abs/123456"  # If available in the document metadata

        # 1) Unicode normalize (fix ligatures / odd widths)
        text = unicodedata.normalize("NFKC", text)

        # 2) Repair hyphenation across line breaks
        text = re.sub(r'(\w+)-\s*\n\s*(\w+)', r'\1-\2', text)

        # 3) Preserve paragraph breaks
        text = re.sub(r'\n{2,}', '<PAR>', text)  # mark paragraphs
        text = re.sub(r'[\r\n]+', ' ', text)     # flatten single newlines

        # 4) Remove bracketed numeric citations like [12]
        text = re.sub(r'\[\s*\d+\s*\]', '', text)

        # 5) Remove inline trailing citation digits glued to words (e.g., intelligence1.)
        text = re.sub(r'(?<=\w)(\d{1,3})(?=[\s\.,;:])', '', text)

        # 6) Remove long repeated digit runs (e.g., 1111, 1515151)
        text = re.sub(r'(\d)\1{3,}', '', text)

        # 7) Remove "Page 12" style markers
        text = re.sub(r'\bPage\s*\d+\b', '', text, flags=re.IGNORECASE)

        # 8) Strip control chars
        text = ''.join(ch for ch in text if unicodedata.category(ch)[0] != 'C')

        # 9) Normalize whitespace and restore paragraph breaks
        text = re.sub(r'[ \t\f\v]+', ' ', text)  # collapse horizontal whitespace
        text = text.replace('<PAR>', '\n\n')     # restore paragraphs
        text = re.sub(r' {2,}', ' ', text).strip()

        # 10) Targeted glyph fixes (extend if you see more)
        replacements = {
            'Trade-o∆Ø': 'Trade-off',
            'Tradeo∆Ø': 'Trade-off',
            'oe∆Ø': 'oeff',
            'coe∆Ø': 'coeff',
            '∆Ø': 'f',  # keep last: broadest
        }
        for k, v in replacements.items():
            text = text.replace(k, v)

        # Assign metadata (file_name, page number, title, authors, date, source_link)
        doc.metadata["source"] = file_name
        doc.metadata["page"] = page_number
        doc.metadata["title"] = title
        doc.metadata["authors"] = authors
        doc.metadata["publication_date"] = publication_date
        doc.metadata["source_link"] = source_link

        # Determine section heading
        if page_number in ["0", "1"]:
            doc.metadata["section_heading"] = "authors"
        elif "references" in text.lower() or "bibliography" in text.lower():
            doc.metadata["section_heading"] = "references"
        else:
            doc.metadata["section_heading"] = "body"

        # Append the cleaned document with updated metadata
        cleaned_docs.append(Document(page_content=text, metadata=doc.metadata))

    return cleaned_docs

# Apply to 'docs' (output of PyMuPDFLoader.load())
docs_deeper_cleaned = improved_clean(docs, "sample.pdf")  # Pass the file name for metadata

# Safe previews for first 2 cleaned docs
for i, d in enumerate(docs_deeper_cleaned[:2]):
    print(f"\n--- Deeply cleaned doc {i+1} (first 800 chars) ---\n")
    print(d.page_content[:800])
    print("\n--- End preview ---\n")

print(f"‚úÖ Cleaned pages: {len(docs_deeper_cleaned)}")


--- Deeply cleaned doc 1 (first 800 chars) ---

Provided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works. Attention Is All You Need Ashish Vaswani‚àó Google Brain avaswani@google.com Noam Shazeer‚àó Google Brain noam@google.com Niki Parmar‚àó Google Research nikip@google.com Jakob Uszkoreit‚àó Google Research usz@google.com Llion Jones‚àó Google Research llion@google.com Aidan N. Gomez‚àó‚Ä† University of Toronto aidan@cs.toronto.edu ≈Åukasz Kaiser‚àó Google Brain lukaszkaiser@google.com Illia Polosukhin‚àó‚Ä° illia.polosukhin@gmail.com Abstract The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and 

--- End preview ---


--- Deeply cleaned doc 2 (first 800 chars) ---

1 Introduction Recurrent neural networks, long short-term 

In [11]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

# Initialize your text splitter (tune separators if your PDF lost newlines)
splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=30,
    add_start_index=True,  # Keep character offsets for traceability
    separators=("\\n\\n", "\\n", ". ", " ", "")  # Optional: control where splits happen
)

# Preferred: Split documents directly (preserves metadata)
chunked_docs = splitter.split_documents(docs_deeper_cleaned)

# Add per-chunk metadata for traceability
for i, doc in enumerate(chunked_docs):
    # Retrieve start index from original document's metadata
    start = doc.metadata.get("start_index")
    if start is not None:
        doc.metadata["char_start"] = start
        doc.metadata["char_end"] = start + len(doc.page_content)
        del doc.metadata["start_index"]

    # Add chunk-specific metadata
    doc.metadata["chunk_id"] = i
    # Copy other relevant metadata from the original document
    doc.metadata["title"] = docs_deeper_cleaned[i % len(docs_deeper_cleaned)].metadata.get("title", "Unknown Title")
    doc.metadata["authors"] = docs_deeper_cleaned[i % len(docs_deeper_cleaned)].metadata.get("authors", "Unknown Authors")
    doc.metadata["publication_date"] = docs_deeper_cleaned[i % len(docs_deeper_cleaned)].metadata.get("publication_date", "Unknown Date")
    doc.metadata["section_heading"] = docs_deeper_cleaned[i % len(docs_deeper_cleaned)].metadata.get("section_heading", "Unknown Section")

print(f"‚úÖ Total chunks created: {len(chunked_docs)}")
if chunked_docs:
    print("üìÑ Example chunk preview:\n")
    print(chunked_docs[0].page_content[:500])
else:
    print("‚ö†Ô∏è No chunks produced. Check upstream cleaning or splitter settings.")


‚úÖ Total chunks created: 95
üìÑ Example chunk preview:

Provided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works. Attention Is All You Need Ashish Vaswani‚àó Google Brain avaswani@google.com Noam Shazeer‚àó Google Brain noam@google.com Niki Parmar‚àó Google Research nikip@google.com Jakob Uszkoreit‚àó Google Research usz@google.com Llion Jones‚àó Google Research llion@google.com Aidan N


In [13]:
import json  # ‚úÖ add this import
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

# --- STEP 1: Split documents ---
splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=30,
    add_start_index=True,
    separators=("\n\n", "\n", ". ", " ", ""),   # safer: use actual newlines, not escaped
)

chunked_docs = splitter.split_documents(docs_deeper_cleaned)

# --- STEP 2: Add metadata per chunk ---
for i, doc in enumerate(chunked_docs):
    start = doc.metadata.get("start_index")
    if start is not None:
        doc.metadata["char_start"] = start
        doc.metadata["char_end"] = start + len(doc.page_content)
        del doc.metadata["start_index"]
    doc.metadata["chunk_id"] = i

# --- STEP 3: Store chunks in JSON file ---
chunks_data = [
    {
        "text": doc.page_content,
        "metadata": doc.metadata
    }
    for doc in chunked_docs
]

output_path = "chunks.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(chunks_data, f, ensure_ascii=False, indent=2)

print(f"‚úÖ Saved {len(chunks_data)} chunks to {output_path}")
print("üìÑ Example chunk preview:")
print(chunks_data[0]["text"][:400])


‚úÖ Saved 95 chunks to chunks.json
üìÑ Example chunk preview:
Provided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works. Attention Is All You Need Ashish Vaswani‚àó Google Brain avaswani@google.com Noam Shazeer‚àó Google Brain noam@google.com Niki Parmar‚àó Google Research nikip@google.com Jakob Uszkoreit‚àó Google Research usz@google.com Llion Jones‚àó G


In [40]:
# ‚úÖ FAISS setup with Safe Embeddings and Cosine Distance (LangChain v1)
import json
import numpy as np
import faiss
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS, DistanceStrategy
# from langchain_huggingface import HuggingFaceEmbeddings
# from langchain_community.vectorstores import FAISS, DistanceStrategy
# import faiss
# import numpy as np

with open("chunks.json", "r", encoding="utf-8") as f:
    chunks_data = json.load(f)

# Convert each JSON entry back into a LangChain Document
chunked_docs = [
    Document(page_content=chunk["text"], metadata=chunk["metadata"])
    for chunk in chunks_data
]

print(f"‚úÖ Loaded {len(chunked_docs)} chunks from chunks.json")

# --- Safe embedding wrapper to prevent AttributeError on dict inputs ---
class SafeHuggingFaceEmbeddings(HuggingFaceEmbeddings):
    def embed_documents(self, texts):
        # Coerce all inputs to strings before encoding
        clean_texts = [str(t) if not isinstance(t, str) else t for t in texts]
        return super().embed_documents(clean_texts)

    def embed_query(self, text):
        if not isinstance(text, str):
            text = str(text)
        return super().embed_query(text)

# Step 1: Initialize normalized embeddings
embedding_model = SafeHuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    encode_kwargs={"normalize_embeddings": True},  # ensures unit-length vectors
)

# Step 2: Generate embeddings for the documents
# --- STEP 3: Generate embeddings manually (optional diagnostic) ---
document_embeddings = embedding_model.embed_documents(
    [doc.page_content for doc in chunked_docs]
)
# # Convert to NumPy array (from list of lists) for compatibility with faiss
# document_embeddings = np.array(document_embeddings, dtype="float32")

# # Normalize the embeddings (force L2 normalization, in case it's not done automatically)
# faiss.normalize_L2(document_embeddings)  # This ensures embeddings are unit-normalized

# # Step 3: Create FAISS index with Inner Product (cosine similarity)
# embedding_dim = document_embeddings.shape[1]
# index = faiss.IndexFlatIP(embedding_dim)  # IndexFlatIP for cosine similarity via inner product

# # Add the normalized embeddings to the index
# index.add(document_embeddings)


# # Step 4: Build the FAISS vectorstore (without embedding_model parameter)
# vectorstore = FAISS.from_documents(
#     chunked_docs,  # Your documents
#     embedding_model,
#     distance_strategy=DistanceStrategy.COSINE  # Ensures cosine similarity (IP)
# )

# # Confirm the FAISS index type and metric type
# print(f"FAISS Index Type: {type(index)}")  # Should be <class 'faiss.IndexFlatIP'>
# print(f"FAISS Metric Type: {index.metric_type}")  # Should print '1' f4

# --- STEP 4: Check the norms of the embeddings ---
norms = np.linalg.norm(E, axis=1)
print(f"Min norm: {norms.min()}, Max norm: {norms.max()}, Mean norm: {norms.mean()}")
# If the max norm is greater than 1.0, it means the embeddings are not normalized properly.

# --- STEP 5: Normalize embeddings (if they were not already unit-normalized) ---
faiss.normalize_L2(E)  # Normalize each vector to unit length

# --- STEP 6: Create the FAISS index for Inner Product (Cosine Similarity) ---
embedding_dim = E.shape[1]
ip_index = faiss.IndexFlatIP(embedding_dim)  # Inner product for cosine similarity

# Add the normalized embeddings to the index
ip_index.add(E)

# --- STEP 7: Confirm the FAISS index type and metric type ---
print(f"FAISS Index Type: {type(ip_index)}")  # Should print <class 'faiss.IndexFlatIP'>
print(f"FAISS Metric Type: {ip_index.metric_type}")  # Should print '1' for inner product (cosine similarity)

# Optional: Check how many embeddings were created and added
print(f"‚úÖ Total embeddings created: {E.shape[0]}")

# --- STEP 8: Build the FAISS vectorstore ---
vectorstore = FAISS.from_documents(
    chunked_docs,  # Your documents
    embedding_model,
    distance_strategy=DistanceStrategy.COSINE  # Ensures cosine similarity (IP)
)



‚úÖ Loaded 95 chunks from chunks.json
Min norm: 1.0, Max norm: 1.0, Mean norm: 1.0
FAISS Index Type: <class 'faiss.swigfaiss_avx2.IndexFlatIP'>
FAISS Metric Type: 0
‚úÖ Total embeddings created: 10


In [33]:
# # Create FAISS vectorstore
# vectorstore = FAISS.from_documents(
#     chunked_docs,  # Your documents
#     embedding_model,
#     distance_strategy=DistanceStrategy.COSINE  # Ensures cosine similarity (IP)
# )

# Confirm the FAISS index type and metric type
print(f"FAISS Index Type: {type(index)}")  # Should be <class 'faiss.IndexFlatIP'>
print(f"FAISS Metric Type: {index.metric_type}")  # Should print '1' for IP (cosine)

FAISS Index Type: <class 'faiss.swigfaiss_avx2.IndexFlatIP'>
FAISS Metric Type: 0


In [34]:
import numpy as np
import faiss

# Assume `embedding_model` has already been used to create embeddings for your documents
raw_embeddings = embedding_model.embed_documents([doc.page_content for doc in chunked_docs])

# Convert the list of embeddings to a NumPy array (for compatibility with FAISS)
raw_embeddings = np.array(raw_embeddings, dtype="float32")

# Check the norm of raw embeddings (L2 norm)
raw_norms = np.linalg.norm(raw_embeddings, axis=1)
print(f"Raw embeddings norms: Min={raw_norms.min()}, Max={raw_norms.max()}, Mean={raw_norms.mean()}")

# Normalize the embeddings
faiss.normalize_L2(raw_embeddings)

# Check the norm of normalized embeddings (L2 norm should be 1 for all embeddings)
normalized_norms = np.linalg.norm(raw_embeddings, axis=1)
print(f"Normalized embeddings norms: Min={normalized_norms.min()}, Max={normalized_norms.max()}, Mean={normalized_norms.mean()}")

# Optional: You can also print out some raw vs normalized embedding values to compare
print(f"Raw embedding sample: {raw_embeddings[0][:5]}")  # Show first 5 values of the first embedding
print(f"Normalized embedding sample: {raw_embeddings[0][:5]}")  # Show first 5 values of the first embedding


Raw embeddings norms: Min=0.9999999403953552, Max=1.0000001192092896, Mean=1.0
Normalized embeddings norms: Min=0.9999999403953552, Max=1.0000001192092896, Mean=1.0
Raw embedding sample: [ 0.07793531 -0.00717798  0.00692454  0.03471662  0.00192955]
Normalized embedding sample: [ 0.07793531 -0.00717798  0.00692454  0.03471662  0.00192955]


In [43]:
# ============================
# Retriever + calibration (v1) ‚Äî using LangChain DistanceStrategy
# ============================

# ---- 1) Safe cosine scorer + clamp to [0,1] ----
from langchain_community.vectorstores import DistanceStrategy
from langchain_core.documents import Document
from typing import List, Tuple

# We will remove the custom scorer and use LangChain's built-in cosine scorer
# LangChain will handle the mapping of cosine similarity to [0, 1]

# # Ensure the store is COSINE end-to-end
# vectorstore.distance_strategy = DistanceStrategy.COSINE

# ---- 2) Manual thresholding on the mapped [0,1] scale ----
def search_with_scores(
    q: str, 
    k: int = 6, 
    score_threshold: float = 0.050
) -> List[Tuple[Document, float]]:
    """
    Returns (Document, relevance_0_1) pairs with cosine-based relevance in [0,1],
    filtered by score_threshold.
    """
    results = vectorstore.similarity_search_with_score(q, k=k)  # raw cosine/IP
    out: List[Tuple[Document, float]] = []
    for doc, raw_ip in results:
        rel = (float(raw_ip) + 1.0) / 2.0  # Mapping raw cosine similarity [-1, 1] to [0, 1]
        if rel >= score_threshold:
            out.append((doc, rel))
    return out


# ---- 3) Retriever for chains (no built-in thresholding here) ----
retriever = vectorstore.as_retriever(
    search_type="similarity",          # keep it simple; threshold in chain logic
    search_kwargs={"k": 4}             # tune k as you like; gating is above
)
print("‚úÖ Retriever (cosine) created. k = 4, threshold applied in search_with_scores().")




‚úÖ Retriever (cosine) created. k = 4, threshold applied in search_with_scores().


In [47]:
# ============================
# Retriever + calibration (v1) ‚Äî using LangChain DistanceStrategy
# ============================

# ---- 1) Safe cosine scorer + clamp to [0,1] ----
from langchain_community.vectorstores import DistanceStrategy
from langchain_core.documents import Document
from typing import List, Tuple

# We will remove the custom scorer and use LangChain's built-in cosine scorer
# LangChain will handle the mapping of cosine similarity to [0, 1]

# Ensure the store is COSINE end-to-end
vectorstore.distance_strategy = DistanceStrategy.COSINE

# ---- 2) Manual thresholding on the mapped [0,1] scale ----
def search_with_scores(
    q: str, 
    k: int = 6, 
    score_threshold: float = 0.050
) -> List[Tuple[Document, float]]:
    """
    Returns (Document, relevance_0_1) pairs with cosine-based relevance in [0,1],
    filtered by score_threshold.
    """
    results = vectorstore.similarity_search_with_score(q, k=k)  # raw cosine/IP
    out: List[Tuple[Document, float]] = []
    for doc, raw_ip in results:
        rel = (float(raw_ip) + 1.0) / 2.0  # Mapping raw cosine similarity [-1, 1] to [0, 1]
        if rel >= score_threshold:
            out.append((doc, rel))
    return out




In [51]:
# ---- 3) Retriever for chains (no built-in thresholding here) ----
# Use LangChain's default `as_retriever` method to get the retriever
retriever = vectorstore.as_retriever(
    search_type="similarity",          # Use the default similarity-based search
    search_kwargs={"k": 4}             # Number of top results to return
)
print("‚úÖ Retriever (cosine) created. k = 4, threshold applied in search_with_scores().")

# Test the retriever with a sample query
query = "What is the impact of AI on healthcare?"

# Use the default `similarity_search` method provided by LangChain's retriever
retrieved_docs = retriever.invoke(query)  # Retrieve the top k documents

# Print the titles of the retrieved documents
print("Retrieved Documents:")
for doc in retrieved_docs:
    print(f"Title: {doc.metadata.get('title', 'No title')}")


‚úÖ Retriever (cosine) created. k = 4, threshold applied in search_with_scores().
Retrieved Documents:
Title: Sample Research Paper
Title: Sample Research Paper
Title: Sample Research Paper
Title: Sample Research Paper


In [55]:
print("Retrieved Documents:")
for doc in retrieved_docs:
    title = doc.metadata.get('title', 'No title')
    content_preview = doc.page_content[:500]  
    print(f"Title: {title}")
    print(f"Content Preview: {content_preview}\n")

Retrieved Documents:
Title: Sample Research Paper
Content Preview: . In Proceedings of the 2 Conference on Empirical Methods in Natural Language Processing, pages 832‚Äì8. ACL, August 2. Rafal Jozefowicz, Oriol Vinyals, Mike Schuster, Noam Shazeer, and Yonghui Wu. Exploring the limits of language modeling. arXiv preprint arXiv:1.02, 2. ≈Åukasz Kaiser and Samy Bengio. Can active memory replace attention? In Advances in Neural Information Processing Systems, (NIPS), 2. ≈Åukasz Kaiser and Ilya Sutskever. Neural GPUs learn algorithms

Title: Sample Research Paper
Content Preview: Provided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works. Attention Is All You Need Ashish Vaswani‚àó Google Brain avaswani@google.com Noam Shazeer‚àó Google Brain noam@google.com Niki Parmar‚àó Google Research nikip@google.com Jakob Uszkoreit‚àó Google Research usz@google.com Llion Jones‚àó Google R

In [56]:
# Simple query to check the chunks
query = "What is machine learning?"

# Retrieve the relevant documents (chunks) using the retriever
retrieved_docs = retriever.invoke(query)  # Use the default `invoke()` for retrieval

# Print out the results with chunk IDs and content preview
print("Retrieved Documents:")
for doc in retrieved_docs:
    title = doc.metadata.get('title', 'No title')
    chunk_id = doc.metadata.get('chunk_id', 'No chunk ID')  # Assuming you added chunk_id to metadata
    content_preview = doc.page_content[:500]  # Preview the first 500 characters of content
    print(f"Chunk ID: {chunk_id} | Title: {title}")
    print(f"Content Preview: {content_preview}\n")


Retrieved Documents:
Chunk ID: 79 | Title: Sample Research Paper
Content Preview: . Neural GPUs learn algorithms. In International Conference on Learning Representations (ICLR), 2. Nal Kalchbrenner, Lasse Espeholt, Karen Simonyan, Aaron van den Oord, Alex Graves, and Ko-ray Kavukcuoglu. Neural machine translation in linear time. arXiv preprint arXiv:1.10099v, 2. Yoon Kim, Carl Denton, Luong Hoang, and Alexander M. Rush. Structured attention networks. In International Conference on Learning Representations, 2. Diederik Kingma and Jimmy Ba. Adam: A method for stochastic optimiz

Chunk ID: 85 | Title: Sample Research Paper
Content Preview: . arXiv preprint arXiv:1.06, 2. Nitish Srivastava, Geoffrey E Hinton, Alex Krizhevsky, Ilya Sutskever, and Ruslan Salakhutdi-nov. Dropout: a simple way to prevent neural networks from overfitting. Journal of Machine Learning Research, 15(1):1929‚Äì1, 2. Sainbayar Sukhbaatar, Arthur Szlam, Jason Weston, and Rob Fergus. End-to-end memory networks. In C. C

In [57]:
# ============================
# Cell 3 ‚Äî HF Chat endpoint (v1)
# ============================
import os
from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace

HF_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
hf_model_id = "Mistralai/Mistral-7B-Instruct-v0.2"

# Base endpoint configured for the conversational task
_base = HuggingFaceEndpoint(
    repo_id=hf_model_id,
    task="conversational",          # provider supports this task for this model
    huggingfacehub_api_token=HF_TOKEN,
    temperature=0.1,
    max_new_tokens=300,
    top_p=0.9,
    return_full_text=False,
)

# Wrap as a ChatModel so LangChain uses chat flow (not text_generation)
llm = ChatHuggingFace(llm=_base)

print("‚úÖ Using ChatHuggingFace over HuggingFaceEndpoint (conversational)")
print(f"Model loaded: {hf_model_id}")


‚úÖ Using ChatHuggingFace over HuggingFaceEndpoint (conversational)
Model loaded: Mistralai/Mistral-7B-Instruct-v0.2


In [58]:
# =======================================================
# Cell 4 ‚Äî Retrieval pipeline (v1 LCEL) with strict gate
# =======================================================
from operator import itemgetter
from typing import List
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough, RunnableBranch

# Safety checks
if llm is None:
    raise RuntimeError("LLM not created. Run Cell 3 first.")
if retriever is None:
    raise RuntimeError("Retriever not created. Build/load FAISS first.")

# ---- Strict, PDF-bounded system rules (use the agreed refusal text) ----
SYSTEM_PROMPT = """
You are a QA assistant for research papers.
You must answer strictly and only from the provided context chunks.
If the answer is not contained in the context, reply: 'I don‚Äôt know based on the provided document.'
Cite page numbers if present in metadata (e.g., 'p. 12').
Do not use external knowledge. Do not speculate.
Always provide the answer in a concise, factual style, directly quoting or paraphrasing from the context.
If multiple chunks contain relevant information, combine them but clearly cite page numbers for each part.
"""

prompt = ChatPromptTemplate.from_messages([
    ("system", SYSTEM_PROMPT),
    ("human", "Question: {question}\n\nContext:\n{context}\n\nAnswer:")
])

# ---- Context formatter with file + page for later citation in the answer ----
def format_context(docs: List[Document]) -> str:
    blocks = []
    for d in docs:
        src = d.metadata.get("source", "")
        page = d.metadata.get("page", "")
        head = f"[source: {src}, p. {page}]" if page != "" else f"[source: {src}]"
        blocks.append(head + "\n" + d.page_content)
    return "\n\n---\n\n".join(blocks)

# ---- Normalizer (keeps you robust across different LLM client return shapes) ----
def normalize_llm_output(x):
    if hasattr(x, "content"):              # AIMessage
        return x.content
    if isinstance(x, dict):                # {'generated_text': ...} / {'text': ...}
        for k in ("generated_text", "text", "answer", "content"):
            if k in x and isinstance(x[k], str):
                return x[k]
        if "generated_text" in x and isinstance(x["generated_text"], list):
            first = x["generated_text"][0]
            if isinstance(first, dict) and "content" in first:
                return first["content"]
        return str(x)
    if isinstance(x, list) and x:
        first = x[0]
        if hasattr(first, "text"):
            return first.text
        if isinstance(first, str):
            return first
    return str(x)

# ---- Gate config (manual thresholding via search_with_scores) ----
K = 8
SCORE_THRESHOLD = 0.50  # adjust after calibration

def pack(inputs):
    q = inputs["question"]
    # Use manual thresholding on correctly mapped cosine scores
    picked = search_with_scores(q, k=K, score_threshold=SCORE_THRESHOLD)  # List[(Document, rel_0_1)]
    docs_only = [d for d, rel in picked]
    refuse = (len(docs_only) == 0) or all(rel < 0.50 for _, rel in picked)  # strict: no qualified context -> refuse
    ctx = format_context(docs_only) if not refuse else ""
    return {"question": q, "context": ctx, "refuse": refuse}

def refuse_text(_):
    return "I don‚Äôt know based on the provided document."

# ---- LCEL chain: branch BEFORE calling the LLM ----
qa_chain = (
    {"question": RunnablePassthrough()}
    | RunnableLambda(pack)
    | RunnableBranch(
        (lambda x: x["refuse"], RunnableLambda(refuse_text)),
        (prompt | llm | RunnableLambda(normalize_llm_output))
    )
)

print("‚úÖ Strict RAG chain ready (cosine, manual threshold). If nothing qualifies, it returns ‚ÄúI don‚Äôt know based on the provided document.‚Äù")


‚úÖ Strict RAG chain ready (cosine, manual threshold). If nothing qualifies, it returns ‚ÄúI don‚Äôt know based on the provided document.‚Äù


In [63]:
response = qa_chain.invoke({"question": "who are the authors of this paper?"})
print(response)


 The authors of the paper are: Mitchell P Marcus, Mary Ann Marcinkiewicz, Beatrice Santorini, David McClosky, Eugene Charniak, Mark Johnson, Ankur Parikh, Oscar T√§ckstr√∂m, Dipanjan Das, Jakob Uszkoreit, Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N, Nal Kalchbrenner, Lasse Espeholt, Karen Simonyan, Aaron van den Oord, Alex Graves, Ko-ray Kavukcuoglu, Romain Paulus, Caiming Xiong, Richard Socher, Slav Petrov, Leon Barrett, Dan Klein, Ofir Press, Lior Wolf, Denny Britz, Anna Goldie, Minh-Thang Luong, Quoc V. Le, Jianpeng Cheng, Li Dong, and Mirella Lapata. (Citation needed for each author and their respective works)

I don‚Äôt know who the authors are for the references numbered 4, 2, and 22 mentioned on p. 1.


In [62]:
test_q = "What is the capital of France?"   # not in your PDF
print("üß© Irrelevant question test:")
print(qa_chain.invoke({"question": test_q}))


üß© Irrelevant question test:
 I don‚Äôt know based on the provided document. The context does not mention the capital city of France.
