In [1]:
print("hello world !!!!")

hello world !!!!


In [2]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [3]:
!pip install -q faiss-cpu

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m61.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
!pip install -q rank-bm25==0.2.2

In [5]:
!pip install -q sentence-transformers

In [6]:
import pandas as pd
import numpy as np
import json, re, unicodedata, sys
from pathlib import Path
from PyPDF2 import PdfReader
from typing import List, Dict, Tuple, Literal
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import hashlib
from sentence_transformers import SentenceTransformer
import faiss
from rank_bm25 import BM25Okapi
import os
from huggingface_hub import snapshot_download
import math
import time
import pickle




Loading the model for future use

In [7]:
MODEL_ID = "Qwen/Qwen2.5-3B-Instruct"

LOCAL_DIR = Path(os.environ.get("HF_LOCAL_MODEL_DIR", "/content/drive/MyDrive/RAG_Ceadar/model"))
LOCAL_DIR.mkdir(parents=True, exist_ok=True)


USE_LOCAL = True


if not any(LOCAL_DIR.iterdir()):
    snapshot_download(
        repo_id=MODEL_ID,
        local_dir=str(LOCAL_DIR),
        local_dir_use_symlinks=False,
        resume_download=True,
    )

if USE_LOCAL:
    os.environ["TRANSFORMERS_OFFLINE"] = "1"

In [8]:
def _model_source() -> str:
    return str(LOCAL_DIR) if 'USE_LOCAL' in globals() and USE_LOCAL else MODEL_ID

### Loading the pdf file and cleaning it for further preprocessing steps


In [9]:
pdf_paths = [
    Path("/content/drive/MyDrive/RAG_Ceadar/Attention_is_all_you_need.pdf"),
    Path("/content/drive/MyDrive/RAG_Ceadar/Deepseek-r1.pdf"),
]

output_dir = Path("/content/drive/MyDrive/RAG_Ceadar/output")

In [10]:
""" Initital Cleaning of the text, Before chuking
Goal is to fix common PDF artifacts but KEEP original wording/casing/punctuation for accurate citations """

def clean_text(s: str) -> str:
    s = unicodedata.normalize("NFKC", s or "")
    s = s.replace("\u00ad", "")
    s = s.replace("\r\n", "\n").replace("\r", "\n")
    s = re.sub(r"(\w)-\n(\w)", r"\1\2", s)
    placeholder = "<<<P>>>"
    s = re.sub(r"\n{2,}", placeholder, s)
    s = s.replace("\n", " ")
    s = re.sub(r"\s+", " ", s).strip()
    s = s.replace(placeholder, "\n\n")
    return s


In [11]:
"""Extracting text page-by-page so we can cite pages later.
Storing in a dict with page number and text of the page
easy to parse later"""

def extract_pages(pdf_path: Path) -> List[Dict]:
    reader = PdfReader(str(pdf_path))
    pages = []
    for i, page in enumerate(reader.pages, start=1):
        try:
            raw = page.extract_text() or ""
        except Exception:
            raw = ""
        pages.append({"page": i, "text": clean_text(raw)})
    return pages


In [12]:
""" For each pdf, saving the pages list of dictionary to json, with page no and text
Also creating full document text for quick go through

- Save <name>.pages.json : [{"page": 1, "text": "..."}, ...]
- Save <name>.txt        : full document text (just for quick reading/check) """

def save_outputs(pdf_path: Path, pages: List[Dict], out_dir: Path = None):

    out_dir.mkdir(parents=True, exist_ok=True)
    out_json = out_dir / f"{pdf_path.stem}.pages.json"
    out_txt  = out_dir / f"{pdf_path.stem}.txt"

    with out_json.open("w", encoding="utf-8") as f:
        json.dump(pages, f, ensure_ascii=False, indent=2)

    joined = "\n\n===== PAGE BREAK =====\n\n".join(p["text"] for p in pages)
    out_txt.write_text(joined, encoding="utf-8")

    return out_json, out_txt

results = []
for p in pdf_paths:
    if not p.exists():
        print(f"[skip] not found: {p}")
        continue
    pages = extract_pages(p)
    out_json, out_txt = save_outputs(p, pages, output_dir)
    results.append({"pdf": str(p), "pages_json": str(out_json), "full_txt": str(out_txt), "num_pages": len(pages)})
    print(f"[ok] {p.name} → {out_json.name}, {out_txt.name} (pages: {len(pages)})")

results


[ok] Attention_is_all_you_need.pdf → Attention_is_all_you_need.pages.json, Attention_is_all_you_need.txt (pages: 15)
[ok] Deepseek-r1.pdf → Deepseek-r1.pages.json, Deepseek-r1.txt (pages: 22)


[{'pdf': '/content/drive/MyDrive/RAG_Ceadar/Attention_is_all_you_need.pdf',
  'pages_json': '/content/drive/MyDrive/RAG_Ceadar/output/Attention_is_all_you_need.pages.json',
  'full_txt': '/content/drive/MyDrive/RAG_Ceadar/output/Attention_is_all_you_need.txt',
  'num_pages': 15},
 {'pdf': '/content/drive/MyDrive/RAG_Ceadar/Deepseek-r1.pdf',
  'pages_json': '/content/drive/MyDrive/RAG_Ceadar/output/Deepseek-r1.pages.json',
  'full_txt': '/content/drive/MyDrive/RAG_Ceadar/output/Deepseek-r1.txt',
  'num_pages': 22}]

In [13]:
# Here are the first 600 chars of page 1 for each doc (just to verify extraction looks sane).
for r in results:
    with open(r["pages_json"], "r", encoding="utf-8") as f:
        pages = json.load(f)
    head = pages[0]["text"][:600] if pages else ""
    print(f"\n=== {Path(r['pdf']).name} | pages: {r['num_pages']} ===")
    print(head + ("..." if len(pages[0]['text']) > 600 else ""))



=== Attention_is_all_you_need.pdf | pages: 15 ===
Provided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works. Attention Is All You Need Ashish Vaswani∗ Google Brain avaswani@google.comNoam Shazeer∗ Google Brain noam@google.comNiki Parmar∗ Google Research nikip@google.comJakob Uszkoreit∗ Google Research usz@google.com Llion Jones∗ Google Research llion@google.comAidan N. Gomez∗ † University of Toronto aidan@cs.toronto.eduŁukasz Kaiser∗ Google Brain lukaszkaiser@google.com Illia Polosukhin∗ ‡ illia.polosukhin@gmail.com Abstract The ...

=== Deepseek-r1.pdf | pages: 22 ===
DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning DeepSeek-AI research@deepseek.com Abstract We introduce our first-generation reasoning models, DeepSeek-R1-Zero and DeepSeek-R1. DeepSeek-R1-Zero, a model trained via large-scale reinforcement learning (RL) without supervised

### Next step is the chunking

Here I am creating a custom text splitter to create chunks. text splitter involves checking for tables or heading, and it keeps the paragraphs intact where ever necessary. For tokenisation I have used Autotokenizer from transformers and mistral 7b v0.1 pretrained model, as it is good for exact counting. Using the model as to tokenize it the exact way latter the LLM will see.

In [14]:
BASE_DIR = Path("/content/drive/MyDrive/RAG_Ceadar/output")
OUT_DIR = BASE_DIR / "chunks"
OUT_DIR.mkdir(parents=True, exist_ok=True)

pages_json_files = sorted(BASE_DIR.glob("*.pages.json"))

""" Make short human-friendly labels (A, B, C, …) for clean citations like [A: p.12]
using automation rather than hardcoding labels as for the case if PDFs are increased """

LABELS = {}
alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
for i, fp in enumerate(pages_json_files):
    stem = fp.stem.replace(".pages", "")
    LABELS[stem] = alphabet[i] if i < len(alphabet) else f"D{i+1}"
pages_json_files, LABELS


([PosixPath('/content/drive/MyDrive/RAG_Ceadar/output/Attention_is_all_you_need.pages.json'),
  PosixPath('/content/drive/MyDrive/RAG_Ceadar/output/Deepseek-r1.pages.json')],
 {'Attention_is_all_you_need': 'A', 'Deepseek-r1': 'B'})

In [15]:
hf_tokenizer = AutoTokenizer.from_pretrained(
    _model_source(),
    local_files_only=bool('USE_LOCAL' in globals() and USE_LOCAL))

def token_len(text: str) -> int:
    return len(hf_tokenizer.encode(text))                   # Returning the exact number of tokens for this text


Checking for Heading and Tables int he paragraph so that later it could help in chunking

In [16]:
# opening the json file and reading the pages
def load_pages_json(pages_json_path: Path) -> List[Dict]:
    with pages_json_path.open("r", encoding="utf-8") as f:
        return json.load(f)

def split_into_paragraphs(text: str) -> List[str]:
    parts = [p.strip() for p in text.split("\n\n")]         # We had preserved paragraph breaks as double newlines in the clean text function
    return [p for p in parts if p]


_HEADING_CHARS = re.compile(r"^[A-Za-z0-9][A-Za-z0-9\-\.:/() \t]{0,120}$")
_SECTION_PREFIX = re.compile(r"^\s*(\d+(\.\d+)*)\s+")

def is_heading(paragraph: str) -> bool:
    p = (paragraph or "").strip()
    if not p:
        return False

    if p.endswith("."):
        return False

    p_no_num = _SECTION_PREFIX.sub("", p)

    if not _HEADING_CHARS.match(p_no_num):
       return False

    words = p_no_num.split()
    n = len(words)

    if n == 0 or n > 20:
        return False

    """ Checking for heading on caps status
      - ALL CAPS (INTRODUCTION)
      - Title Case (Training, Background) """

    def looks_heading_word(w: str) -> bool:
        return w.isupper() or (w[:1].isupper() and w[1:].islower())


    cap_like = sum(1 for w in words if looks_heading_word(w))
    ratio = cap_like / n


    """ single words could be tricky as normal words might get considered as heading
    handling this anomaly """

    if n == 1:
        w = words[0]
        if not looks_heading_word(w):
            return False
        if len(w) > 40:
            return False
        return True
    return ratio >= 0.5



def is_table_like(paragraph: str) -> bool:

    if len(paragraph) < 40:
      return False

    digits = sum(ch.isdigit() for ch in paragraph)
    digit_ratio = digits / len(paragraph)
    seps = paragraph.count("|") + paragraph.count(",") + paragraph.count(";") + paragraph.count("\t")
    multi_spaces = len(re.findall(r" {2,}", paragraph))

    return digit_ratio > 0.18 or seps >= 8 or multi_spaces >= 4


In [17]:
"""This function will create the list of chunks and the related meta data with it,
such as table or not, doc id, doc lable for citation """

def pack_chunks_exact(
    para_stream: List[Tuple[int, str]],
    target_tokens: int,
    overlap_tokens: int,
    doc_id: str,
    doc_label: str
) -> List[Dict]:


    chunks = []

    cur_parts: List[str] = []

    # few meta data collectors
    cur_pages: List[int] = []
    current_heading = None

    cur_text = ""
    cur_tok = 0


    """ This function finalizes the chunka and and prepare the ovelap,
    it is called when we are done with a chunck or start a new one """

    def flush():

        nonlocal cur_parts, cur_pages, cur_text, cur_tok

        if not cur_parts:
            return

        text = "\n\n".join(cur_parts).strip()
        if not text:
            cur_parts, cur_pages, cur_text, cur_tok = [], [], "", 0
            return
        p_start, p_end = min(cur_pages), max(cur_pages)
        chunks.append({
            "doc_id": doc_id,
            "doc_label": doc_label,
            "chunk_id": f"{doc_id}::p{p_start}-{p_end}::idx{len(chunks)}",
            "page_start": p_start,
            "page_end": p_end,
            "heading": current_heading,
            "content_type": "text",
            "text": text,
            "tokens": token_len(text),
        })

        """ Building the overlap tail
        keeping the last 100 tokens as the start of the next chunk
        we need the best K index in the word tail of last chunk which has the most words but still is in range of max tokens allowed """

        if overlap_tokens > 0:

            words = text.split()

            lo, hi = 0, len(words)
            best = len(words)

            while lo <= hi:
                mid = (lo + hi) // 2
                tail = " ".join(words[mid:])

                if token_len(tail) <= overlap_tokens:
                    best = mid
                    hi = mid - 1
                else:
                    lo = mid + 1

            tail_text = " ".join(words[best:])


            cur_parts = [tail_text] if tail_text else []
            cur_pages = [p_end] if tail_text else []
            cur_text  = tail_text
            cur_tok   = token_len(tail_text) if tail_text else 0

        else:
            cur_parts, cur_pages, cur_text, cur_tok = [], [], "", 0



    for page_no, para in para_stream:


        if is_heading(para):
            current_heading = para

            if cur_tok + token_len(para) <= target_tokens:
                cur_parts.append(para)
                cur_pages.append(page_no)
                cur_text = "\n\n".join(cur_parts)
                cur_tok = token_len(cur_text)
            continue


        if is_table_like(para):
            flush()
            table_text = para
            chunks.append({
                "doc_id": doc_id,
                "doc_label": doc_label,
                "chunk_id": f"{doc_id}::p{page_no}-{page_no}::idx{len(chunks)}",
                "page_start": page_no,
                "page_end": page_no,
                "heading": current_heading,
                "content_type": "table",
                "text": table_text,
                "tokens": token_len(table_text),
            })

            cur_parts, cur_pages, cur_text, cur_tok = [], [], "", 0
            continue

        # Normal paragraph: add it if we still fit, otherwise flush then start a new chunk
        p_tok = token_len(para)

        if cur_tok == 0:
            cur_parts = [para]
            cur_pages = [page_no]
            cur_text  = para
            cur_tok   = p_tok
            if cur_tok >= target_tokens:
                flush()
            continue

        # If adding this paragraph stays within the target, add it
        if cur_tok + p_tok <= target_tokens:
            cur_parts.append(para)
            cur_pages.append(page_no)
            cur_text = "\n\n".join(cur_parts)
            cur_tok  = token_len(cur_text)
            if cur_tok >= target_tokens:
                flush()
        else:
            # Would overflow → close current chunk, then start a new chunk with this paragraph
            flush()
            cur_parts = [para]
            cur_pages = [page_no]
            cur_text  = para
            cur_tok   = p_tok
            if cur_tok >= target_tokens:
                flush()

    flush()
    return chunks


In [18]:
def chunk_file_exact(pages_json_path: Path, target_tokens=800, overlap_tokens=100):
    doc_id = pages_json_path.stem.replace(".pages", "")
    doc_label = LABELS.get(doc_id, doc_id[:1].upper())
    pages = load_pages_json(pages_json_path)


    para_stream = []
    for page in pages:
        for para in split_into_paragraphs(page["text"]):
            para_stream.append((page["page"], para))

    chunks = pack_chunks_exact(
        para_stream=para_stream,
        target_tokens=target_tokens,
        overlap_tokens=overlap_tokens,
        doc_id=doc_id,
        doc_label=doc_label
    )

    out_path = OUT_DIR / f"{doc_id}.chunks.jsonl"
    with out_path.open("w", encoding="utf-8") as f:
        for ch in chunks:
            f.write(json.dumps(ch, ensure_ascii=False) + "\n")

    return {"doc_id": doc_id, "label": doc_label, "file": str(out_path), "num_chunks": len(chunks)}


summary = []
for pj in pages_json_files:
    info = chunk_file_exact(pj, target_tokens=800, overlap_tokens=100)  # approx 12–13% overlap
    summary.append(info)
    print(f"[ok] {info['doc_id']} ({info['label']}) → {info['file']} (chunks: {info['num_chunks']})")

summary


[ok] Attention_is_all_you_need (A) → /content/drive/MyDrive/RAG_Ceadar/output/chunks/Attention_is_all_you_need.chunks.jsonl (chunks: 15)
[ok] Deepseek-r1 (B) → /content/drive/MyDrive/RAG_Ceadar/output/chunks/Deepseek-r1.chunks.jsonl (chunks: 20)


[{'doc_id': 'Attention_is_all_you_need',
  'label': 'A',
  'file': '/content/drive/MyDrive/RAG_Ceadar/output/chunks/Attention_is_all_you_need.chunks.jsonl',
  'num_chunks': 15},
 {'doc_id': 'Deepseek-r1',
  'label': 'B',
  'file': '/content/drive/MyDrive/RAG_Ceadar/output/chunks/Deepseek-r1.chunks.jsonl',
  'num_chunks': 20}]

In [19]:
def head_jsonl(path: Path, n=2):
    rows = []
    with path.open("r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i >= n: break
            rows.append(json.loads(line))
    return rows

previews = {}
for row in summary:
    previews[Path(row["file"]).name] = head_jsonl(Path(row["file"]), n=2)

previews


{'Attention_is_all_you_need.chunks.jsonl': [{'doc_id': 'Attention_is_all_you_need',
   'doc_label': 'A',
   'chunk_id': 'Attention_is_all_you_need::p1-1::idx0',
   'page_start': 1,
   'page_end': 1,
   'heading': None,
   'content_type': 'table',
   'text': 'Provided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works. Attention Is All You Need Ashish Vaswani∗ Google Brain avaswani@google.comNoam Shazeer∗ Google Brain noam@google.comNiki Parmar∗ Google Research nikip@google.comJakob Uszkoreit∗ Google Research usz@google.com Llion Jones∗ Google Research llion@google.comAidan N. Gomez∗ † University of Toronto aidan@cs.toronto.eduŁukasz Kaiser∗ Google Brain lukaszkaiser@google.com Illia Polosukhin∗ ‡ illia.polosukhin@gmail.com Abstract The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. 

Embedding

In [20]:

BASE_DIR = Path("/content/drive/MyDrive/RAG_Ceadar/output")
CHUNK_DIR = BASE_DIR / "chunks"
ARTIFACTS_DIR = BASE_DIR / "embeddings_meta"
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)

def read_jsonl(path: Path):
    rows = []
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            rows.append(json.loads(line))
    return rows


files = sorted(CHUNK_DIR.glob("*.chunks.jsonl"))
print("Found chunk files:", [f.name for f in files])


all_rows = []
for f in files:
    all_rows.extend(read_jsonl(f))

df = pd.DataFrame(all_rows)
print("Total chunks loaded:", len(df))
df.head(2)


Found chunk files: ['Attention_is_all_you_need.chunks.jsonl', 'Deepseek-r1.chunks.jsonl']
Total chunks loaded: 35


Unnamed: 0,doc_id,doc_label,chunk_id,page_start,page_end,heading,content_type,text,tokens
0,Attention_is_all_you_need,A,Attention_is_all_you_need::p1-1::idx0,1,1,,table,"Provided proper attribution is provided, Googl...",615
1,Attention_is_all_you_need,A,Attention_is_all_you_need::p2-2::idx1,2,2,,table,"1 Introduction Recurrent neural networks, long...",828


In [21]:
""" Tiny helper that produces a compact citation header like:
[A: p.1-2 | table] Introduction """
def build_cite_line(row):
    ps, pe = int(row["page_start"]), int(row["page_end"])
    pages = f"p.{ps}" if ps == pe else f"p.{ps}-{pe}"
    kind = "table" if row.get("content_type") == "table" else "text"
    head = (row.get("heading") or "").strip()
    head_part = f" | {head}" if head else ""
    return f"[{row['doc_label']}: {pages} | {kind}]{head_part}"

df["cite_line"] = df.apply(build_cite_line, axis=1)
df["embedding_text"] = (df["cite_line"].astype(str) + "\n" + df["text"].astype(str).str.strip()).str.strip()

df.head(2)

Unnamed: 0,doc_id,doc_label,chunk_id,page_start,page_end,heading,content_type,text,tokens,cite_line,embedding_text
0,Attention_is_all_you_need,A,Attention_is_all_you_need::p1-1::idx0,1,1,,table,"Provided proper attribution is provided, Googl...",615,[A: p.1 | table],[A: p.1 | table]\nProvided proper attribution ...
1,Attention_is_all_you_need,A,Attention_is_all_you_need::p2-2::idx1,2,2,,table,"1 Introduction Recurrent neural networks, long...",828,[A: p.2 | table],[A: p.2 | table]\n1 Introduction Recurrent neu...


In [22]:

MIN_CHARS = 40
df["char_len"] = df["embedding_text"].str.len().fillna(0).astype(int)
df = df[df["char_len"] >= MIN_CHARS].reset_index(drop=True)

def fingerprint(s: str) -> str:
    norm = re.sub(r"\s+", " ", (s or "").strip().lower())
    return hashlib.md5(norm.encode("utf-8")).hexdigest()

before = len(df)
df["fp"] = df["embedding_text"].map(fingerprint)
df = df.drop_duplicates(subset=["fp"]).reset_index(drop=True)
after = len(df)

print(f"Removed {before - after} exact duplicates. Kept {after} rows.")
df.drop(columns=["fp"], inplace=True)
len(df)


Removed 0 exact duplicates. Kept 35 rows.


35

In [23]:
""" We’ll use an instruction-tuned retriever for better RAG performance.
E5-small-v2 is fast, 384-dim, and strong for "passage vs query" use. """

EMBED_MODEL_NAME = "intfloat/e5-small-v2"
embedder = SentenceTransformer(EMBED_MODEL_NAME)


corpus_texts = ("passage: " + df["embedding_text"].astype(str)).tolist()


embeddings = embedder.encode(
    corpus_texts,
    batch_size=64,
    convert_to_numpy=True,
    normalize_embeddings=True,
    show_progress_bar=True
)

embeddings.shape


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

(35, 384)

In [24]:

EMB_PATH = ARTIFACTS_DIR / "embeddings_e5.npy"
np.save(EMB_PATH, embeddings)

META_PATH = ARTIFACTS_DIR / "meta.parquet"
df.to_parquet(META_PATH, index=False)

print("Saved:")
print(" -", EMB_PATH)
print(" -", META_PATH)



df.head(3)

Saved:
 - /content/drive/MyDrive/RAG_Ceadar/output/embeddings_meta/embeddings_e5.npy
 - /content/drive/MyDrive/RAG_Ceadar/output/embeddings_meta/meta.parquet


Unnamed: 0,doc_id,doc_label,chunk_id,page_start,page_end,heading,content_type,text,tokens,cite_line,embedding_text,char_len
0,Attention_is_all_you_need,A,Attention_is_all_you_need::p1-1::idx0,1,1,,table,"Provided proper attribution is provided, Googl...",615,[A: p.1 | table],[A: p.1 | table]\nProvided proper attribution ...,2866
1,Attention_is_all_you_need,A,Attention_is_all_you_need::p2-2::idx1,2,2,,table,"1 Introduction Recurrent neural networks, long...",828,[A: p.2 | table],[A: p.2 | table]\n1 Introduction Recurrent neu...,4271
2,Attention_is_all_you_need,A,Attention_is_all_you_need::p3-3::idx2,3,3,,table,Figure 1: The Transformer - model architecture...,388,[A: p.3 | table],[A: p.3 | table]\nFigure 1: The Transformer - ...,1841


In [25]:
BASE_DIR = Path("/content/drive/MyDrive/RAG_Ceadar/output")
ARTIFACTS_DIR = BASE_DIR / "embeddings_meta"

EMB_PATH  = ARTIFACTS_DIR / "embeddings_e5.npy"
META_PATH = ARTIFACTS_DIR / "meta.parquet"

embeddings = np.load(EMB_PATH)
df = pd.read_parquet(META_PATH)

embeddings.shape, len(df)


((35, 384), 35)

In [26]:
dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(embeddings)

FAISS_PATH = ARTIFACTS_DIR / "faiss.index"
faiss.write_index(index, str(FAISS_PATH))

print("FAISS index built and saved:", FAISS_PATH)


FAISS index built and saved: /content/drive/MyDrive/RAG_Ceadar/output/embeddings_meta/faiss.index


In [27]:

EMBED_MODEL_NAME = "intfloat/e5-small-v2"
query_encoder = SentenceTransformer(EMBED_MODEL_NAME)

def embed_query(q: str) -> np.ndarray:
    """
    Turn a user question into a 1xD vector.
    E5 expects a 'query: ' prefix for questions.
    """
    q_text = "query: " + q.strip()
    q_vec = query_encoder.encode(
        [q_text],
        convert_to_numpy=True,
        normalize_embeddings=True
    )
    return q_vec


In [28]:
def cite_str(row) -> str:
    """Make a short citation like [A: p.12-13] (includes content_type if useful)."""
    ps, pe = int(row["page_start"]), int(row["page_end"])
    pages = f"p.{ps}" if ps == pe else f"p.{ps}-{pe}"
    kind = "table" if row.get("content_type") == "table" else "text"
    return f"[{row['doc_label']}: {pages} | {kind}]"

def search_dense(query: str, k: int = 5) -> pd.DataFrame:
    """
    Dense (semantic) top-k search.
    Returns a small DataFrame with scores, citations, and text.
    """
    qv = embed_query(query)
    D, I = index.search(qv, k)
    hits = df.iloc[I[0]].copy()
    hits["score_dense"] = D[0]
    hits["citation"] = hits.apply(cite_str, axis=1)

    cols = ["score_dense", "citation", "heading", "content_type", "text",
            "doc_label", "page_start", "page_end", "chunk_id"]
    return hits[cols]


search_dense("What is multi-head attention and why is it useful?", k=5)


Unnamed: 0,score_dense,citation,heading,content_type,text,doc_label,page_start,page_end,chunk_id
4,0.85801,[A: p.5 | table],,table,output values. These are concatenated and once...,A,5,5,Attention_is_all_you_need::p5-5::idx4
13,0.844355,[A: p.14 | table],,table,Input-Input Layer5 The Law will never be perfe...,A,14,14,Attention_is_all_you_need::p14-14::idx13
14,0.838872,[A: p.15 | table],,table,Input-Input Layer5 The Law will never be perfe...,A,15,15,Attention_is_all_you_need::p15-15::idx14
3,0.834543,[A: p.4 | table],,table,Scaled Dot-Product Attention Multi-Head Attent...,A,4,4,Attention_is_all_you_need::p4-4::idx3
2,0.83146,[A: p.3 | table],,table,Figure 1: The Transformer - model architecture...,A,3,3,Attention_is_all_you_need::p3-3::idx2


In [29]:
def simple_tokens(t: str):
    return re.findall(r"[A-Za-z0-9_]+", (t or "").lower())

corpus_tokens = df["text"].astype(str).map(simple_tokens).tolist()


bm25 = BM25Okapi(corpus_tokens)

bm25_path = Path("/content/drive/MyDrive/RAG_Ceadar/output/embeddings_meta/bm25.pkl")
bm25_path.parent.mkdir(parents=True, exist_ok=True)


with bm25_path.open("wb") as f:
    pickle.dump(bm25, f, protocol=pickle.HIGHEST_PROTOCOL)

print("Saved BM25 to:", bm25_path)

def looks_numeric_query(q: str) -> bool:
    return bool(re.search(r"\d", q)) or any(sym in q for sym in ["BLEU", "ROUGE", "%"])

def search_hybrid(query: str, k_dense=10, k_bm25=30, k_final=5,
                  w_dense=0.6, w_bm25=0.4, boost_tables=0.1):
    """
    1) Get dense top-k candidates
    2) Get BM25 top-k candidates
    3) Merge, min-max normalize each score
    4) Weighted sum → final top-k
    5) Small bonus for tables if the query looks numeric
    """
    # Dense
    qv = embed_query(query)
    D, I = index.search(qv, k_dense)
    ddf = pd.DataFrame({"row_id": I[0], "score_dense": D[0]})

    # BM25
    q_toks = simple_tokens(query)
    scores_bm25 = bm25.get_scores(q_toks)
    bm25_idx = np.argsort(scores_bm25)[::-1][:k_bm25]
    bdf = pd.DataFrame({"row_id": bm25_idx, "score_bm25": scores_bm25[bm25_idx]})

    # Merge candidates
    cand = pd.merge(ddf, bdf, on="row_id", how="outer").fillna(0.0)

    # Min–max normalize each score column into [0,1]
    for col in ["score_dense", "score_bm25"]:
        cmin, cmax = cand[col].min(), cand[col].max()
        cand[col] = (cand[col] - cmin) / (cmax - cmin) if cmax > cmin else 0.0

    # table boost if the query looks numeric
    if looks_numeric_query(query):
        table_mask = df.iloc[cand["row_id"]]["content_type"].eq("table").values
        cand.loc[table_mask, "score_bm25"] += boost_tables

    cand["score_hybrid"] = w_dense * cand["score_dense"] + w_bm25 * cand["score_bm25"]
    cand = cand.sort_values("score_hybrid", ascending=False).head(k_final)

    out = df.iloc[cand["row_id"]].copy()
    out["score_dense"]  = cand["score_dense"].values
    out["score_bm25"]   = cand["score_bm25"].values
    out["score_hybrid"] = cand["score_hybrid"].values
    out["citation"]     = out.apply(cite_str, axis=1)

    cols = ["score_hybrid", "score_dense", "score_bm25", "citation",
            "heading", "content_type", "text", "doc_label", "page_start", "page_end", "chunk_id"]
    return out[cols]


search_hybrid("Scaled dot-product attention formula", k_final=5)


Saved BM25 to: /content/drive/MyDrive/RAG_Ceadar/output/embeddings_meta/bm25.pkl


Unnamed: 0,score_hybrid,score_dense,score_bm25,citation,heading,content_type,text,doc_label,page_start,page_end,chunk_id
3,1.0,1.0,1.0,[A: p.4 | table],,table,Scaled Dot-Product Attention Multi-Head Attent...,A,4,4,Attention_is_all_you_need::p4-4::idx3
4,0.753661,0.937912,0.477285,[A: p.5 | table],,table,output values. These are concatenated and once...,A,5,5,Attention_is_all_you_need::p5-5::idx4
0,0.751299,0.908537,0.515442,[A: p.1 | table],,table,"Provided proper attribution is provided, Googl...",A,1,1,Attention_is_all_you_need::p1-1::idx0
8,0.681147,0.930158,0.30763,[A: p.9 | table],,table,Table 3: Variations on the Transformer archite...,A,9,9,Attention_is_all_you_need::p9-9::idx8
2,0.57703,0.921462,0.060383,[A: p.3 | table],,table,Figure 1: The Transformer - model architecture...,A,3,3,Attention_is_all_you_need::p3-3::idx2


In [30]:
_HEADER_LINE = re.compile(
    r"^\s*\[[A-Z]:\s*p\.\d+(?:-\d+)?(?:\s*\|\s*(?:text|table))?\]\s*\n",
    re.IGNORECASE,
)

def assemble_context(rows: pd.DataFrame, max_ctx_tokens: int = 2200) -> str:
    parts = []
    used  = 0
    seen  = set()

    for _, r in rows.iterrows():

        key = r.get("chunk_id")
        if key and key in seen:
            continue
        if key:
            seen.add(key)
        head = r.get("citation") or cite_str(r)
        body = (r.get("text") or "").strip()

        if not body:
            et = str(r.get("embedding_text") or "")
            body = _HEADER_LINE.sub("", et, count=1).strip()

        block = f"{head}\n{body}\n"
        need  = token_len(block)

        if used + need > max_ctx_tokens:
            if not parts:
                parts.append(block)
            break

        parts.append(block)
        used += need
    return "\n".join(parts).strip()

### Retrieval : Hybrid (Dense + BM25) + MMR Re-ranking

Here in this part I am implementing following things:

- Loading metadata, embeddings, and FAISS index from disk
- Building a BM25 corpus over the same chunk texts
- Encoding queries with E5 (instruction-tuned) embeddings
- Retrieving a candidate pool from Dense (FAISS) and BM25
- Normalizing scores, blending them (hybrid), and using a optional table boost
- Appling MMR to select a diverse top-k final result set
- Returning a tidy DataFrame with scores + clean citations

In [31]:
BASE_DIR = Path("/content/drive/MyDrive/RAG_Ceadar/output")
ART_DIR  = BASE_DIR / "embeddings_meta"

EMB_PATH  = ART_DIR / ("embeddings.npy" if (ART_DIR / "embeddings.npy").exists() else "embeddings_e5.npy")
META_PATH = ART_DIR / "meta.parquet"
FAISS_PATH = ART_DIR / "faiss.index"

print("Embeddings:", EMB_PATH.name, "| Meta:", META_PATH.name, "| FAISS:", FAISS_PATH.name)


df = pd.read_parquet(META_PATH)
embeddings = np.load(EMB_PATH)


assert len(df) == embeddings.shape[0], f"Row mismatch: df={len(df)} vs emb={embeddings.shape[0]}"


norms = np.linalg.norm(embeddings, axis=1, keepdims=True)


safe_norms = np.where(norms == 0, 1.0, norms)
embeddings = embeddings / safe_norms


index = faiss.read_index(str(FAISS_PATH))
dim = embeddings.shape[1]
assert index.d == dim, f"FAISS dim={index.d} != emb dim={dim}"

def _simple_tokens(s: str):
    return re.findall(r"[A-Za-z0-9_]+", (s or "").lower())

corpus_tokens = df["text"].astype(str).map(_simple_tokens).tolist()
bm25 = BM25Okapi(corpus_tokens)


Embeddings: embeddings_e5.npy | Meta: meta.parquet | FAISS: faiss.index


Query Encoder (E5 small-v2), Hybrid Score, MMR Selector

In [32]:
_E5_MODEL = "intfloat/e5-small-v2"
_query_encoder = SentenceTransformer(_E5_MODEL)

def embed_query_e5(query: str) -> np.ndarray:
    q_text = "query: " + (query or "").strip()
    vec = _query_encoder.encode([q_text], convert_to_numpy=True, normalize_embeddings=True)
    return vec  # shape: (1, D)


def _looks_numeric_query(q: str) -> bool:
    q = q or ""
    return bool(re.search(r"\d", q)) or any(kw in q for kw in ["bleu", "rouge", "%", "wmt", "table"])


def _minmax_norm(arr: np.ndarray) -> np.ndarray:
    """
    Min-max normalize an array into [0, 1]. Returns zeros if constant.
    """
    if arr.size == 0:
        return arr
    a_min, a_max = float(arr.min()), float(arr.max())
    if a_max > a_min:
        return (arr - a_min) / (a_max - a_min)
    return np.zeros_like(arr, dtype=float)


def _mk_citation(row: pd.Series) -> str:
    """
    Compact citation string: [A: p.12-13 | table] or [B: p.7 | text]
    """
    ps, pe = int(row["page_start"]), int(row["page_end"])
    pages = f"p.{ps}" if ps == pe else f"p.{ps}-{pe}"
    kind = "table" if row.get("content_type") == "table" else "text"
    return f"[{row['doc_label']}: {pages} | {kind}]"


def mmr_select(q_vec: np.ndarray,
               candidate_idx: np.ndarray,
               cand_vecs: np.ndarray,
               k: int = 5,
               lambda_mult: float = 0.7) -> list:
    """
    Maximal Marginal Relevance (MMR) selection over a candidate pool.
    Assumes q_vec and cand_vecs are L2-normalized so dot == cosine.
    Returns: list of selected dataset row indices (subset of candidate_idx).
    """
    M = cand_vecs.shape[0]
    if M == 0 or k <= 0:
        return []


    rel = (q_vec @ cand_vecs.T).ravel()

    selected_local = []
    redundancy = np.zeros(M, dtype=np.float32)

    for _ in range(min(k, M)):

        scores = lambda_mult * rel - (1.0 - lambda_mult) * redundancy
        if selected_local:
            scores[selected_local] = -1e9

        j = int(np.argmax(scores))
        selected_local.append(j)

        sim_to_j = (cand_vecs @ cand_vecs[j].reshape(-1, 1)).ravel()
        redundancy = np.maximum(redundancy, sim_to_j)

    return candidate_idx[selected_local].tolist()



MMR Diversification Parameters:
- pool_dense: of dense candidates taken from FAISS
- pool_bm25 : of BM25 candidates
- w_dense, w_bm25: blending weights after min-max normalization
- boost_tables: small BM25 bump for table chunks when query looks numeric
- lambda_mult: MMR relevance/diversity balance (0.7 is a good default)


This is the entire retrival pipeline, that I am following.
- Dense candidate pool via FAISS (cosine on normalized vectors)
-  BM25 candidate pool on tokenized corpus
-  Merging pools, min-max normalize and blend scores (hybrid)
-  Optional table boost for numeric queries
-  MMR selection to produce final top-k diverse results
-  Returns DataFrame with scores + clear and clean citations
    

In [33]:
def search_hybrid_mmr(query: str,
                      k: int = 5,
                      pool_dense: int = 40,
                      pool_bm25: int = 80,
                      w_dense: float = 0.6,
                      w_bm25: float = 0.4,
                      boost_tables: float = 0.10,
                      lambda_mult: float = 0.7) -> pd.DataFrame:


    qv = embed_query_e5(query)

    # Dense pool from FAISS
    D, I = index.search(qv, pool_dense)
    dense_idx = I[0]
    dense_score = D[0]

    # BM25 pool
    q_toks = _simple_tokens(query)
    bm25_scores_all = bm25.get_scores(q_toks)
    bm25_idx = np.argsort(bm25_scores_all)[::-1][:pool_bm25]
    bm25_score = bm25_scores_all[bm25_idx]

    # Merge pools, outer-join style on row_id
    ddf = pd.DataFrame({"row_id": dense_idx, "score_dense": dense_score})
    bdf = pd.DataFrame({"row_id": bm25_idx,  "score_bm25": bm25_score})
    cand = pd.merge(ddf, bdf, on="row_id", how="outer").fillna(0.0)

    # Normalize each score column into [0,1] for fair blending
    cand["dense_norm"] = _minmax_norm(cand["score_dense"].to_numpy())
    cand["bm25_norm"]  = _minmax_norm(cand["score_bm25"].to_numpy())

    # Optional table boost for numeric-oriented queries
    if _looks_numeric_query(query):
        is_table = df.iloc[cand["row_id"]]["content_type"].eq("table").to_numpy(dtype=bool)
        cand.loc[is_table, "bm25_norm"] = np.minimum(1.0, cand.loc[is_table, "bm25_norm"] + boost_tables)

    cand["score_hybrid"] = w_dense * cand["dense_norm"] + w_bm25 * cand["bm25_norm"]

    pool_idx = cand.sort_values("score_hybrid", ascending=False)["row_id"].to_numpy()
    pool_vecs = embeddings[pool_idx]   # normalized (N_pool, D)
    picked = mmr_select(qv, pool_idx, pool_vecs, k=k, lambda_mult=lambda_mult)


    out = df.iloc[picked].copy()
    out["score_dense"]  = (qv @ embeddings[picked].T).ravel()
    out["score_bm25"]   = bm25_scores_all[picked]
    out["dense_norm"]   = _minmax_norm(out["score_dense"].to_numpy())
    out["bm25_norm"]    = _minmax_norm(out["score_bm25"].to_numpy())
    out["score_hybrid"] = 0.6 * out["dense_norm"] + 0.4 * out["bm25_norm"]
    out["citation"]     = out.apply(_mk_citation, axis=1)

    cols = [
        "score_hybrid", "dense_norm", "bm25_norm", "score_dense", "score_bm25",
        "citation", "heading", "content_type", "text",
        "doc_label", "page_start", "page_end", "chunk_id"
    ]
    return out[cols]


In [34]:
hits = search_hybrid_mmr("What is multi-head attention and why is it useful?", k=5)
hits

Unnamed: 0,score_hybrid,dense_norm,bm25_norm,score_dense,score_bm25,citation,heading,content_type,text,doc_label,page_start,page_end,chunk_id
4,0.871608,1.0,0.679019,0.85801,8.268602,[A: p.5 | table],,table,output values. These are concatenated and once...,A,5,5,Attention_is_all_you_need::p5-5::idx4
13,0.870171,0.783618,1.0,0.844355,11.859326,[A: p.14 | table],,table,Input-Input Layer5 The Law will never be perfe...,A,14,14,Attention_is_all_you_need::p14-14::idx13
34,0.0,0.0,0.0,0.794904,0.672597,[B: p.20-22 | text],,text,Appendix A. Contributions and Acknowledgments ...,B,20,22,Deepseek-r1::p20-22::idx19
12,0.479637,0.570167,0.343841,0.830885,4.519052,[A: p.13 | text],,text,Attention Visualizations Input-Input Layer5 It...,A,13,13,Attention_is_all_you_need::p13-13::idx12
0,0.519974,0.47434,0.588424,0.824838,7.255139,[A: p.1 | table],,table,"Provided proper attribution is provided, Googl...",A,1,1,Attention_is_all_you_need::p1-1::idx0


In [35]:
hits = search_hybrid_mmr("Report the BLEU score on WMT 2014 English-to-German.", k=5)
hits.text.tolist()[0]

'Table 2: The Transformer achieves better BLEU scores than previous state-of-the-art models on the English-to-German and English-to-French newstest2014 tests at a fraction of the training cost. ModelBLEU Training Cost (FLOPs) EN-DE EN-FR EN-DE EN-FR ByteNet [18] 23.75 Deep-Att + PosUnk [39] 39.2 1.0·1020 GNMT + RL [38] 24.6 39.92 2.3·10191.4·1020 ConvS2S [9] 25.16 40.46 9.6·10181.5·1020 MoE [32] 26.03 40.56 2.0·10191.2·1020 Deep-Att + PosUnk Ensemble [39] 40.4 8.0·1020 GNMT + RL Ensemble [38] 26.30 41.16 1.8·10201.1·1021 ConvS2S Ensemble [9] 26.36 41.29 7.7·10191.2·1021 Transformer (base model) 27.3 38.1 3.3·1018 Transformer (big) 28.4 41.8 2.3·1019 Residual Dropout We apply dropout [ 33] to the output of each sub-layer, before it is added to the sub-layer input and normalized. In addition, we apply dropout to the sums of the embeddings and the positional encodings in both the encoder and decoder stacks. For the base model, we use a rate of Pdrop= 0.1. Label Smoothing During training, 

### Generation from LLM with the Context


The pipeline for the generation consist of following:
- build_prompt(...), strict grounding + citation rules
- generate_answer(... ), provider-agnostic adapter
- ensure_citations(...), light post-check for bracket cites
- answer_question(...), end-to-end: retrieve → context → prompt → LLM


In [36]:
""" Prompt builder
System instruction to constrain the LLM to grounded, citable outputs
Promting is very important here as LLMs generally hallucinates a lot if not clearly addressed """

SYSTEM_PROMPT = (
    "You are a careful scientific assistant. Answer ONLY using the provided context.\n"
    "If the answer is not in the context, say \"I don't find this in the provided papers.\".\n"
    "Cite every claim with bracket citations like [A: p.12] or [B: p.7-8].\n"
    "Prefer quoting numbers from tables verbatim; do not guess."
)

def build_prompt(question: str, context_block: str) -> str:
    return (
        f"{SYSTEM_PROMPT}\n\n"
        f"Question:\n{question.strip()}\n\n"
        f"Context:\n{context_block}\n\n"
        f"Answer (with citations):"
    )


In [37]:
def generate_answer_hf(prompt: str, model_id: str = "TinyLlama/TinyLlama-1.1B-Chat-v1.0", max_new_tokens: int = 400) -> str:
    src = _model_source()
    try:
        from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
        tok = AutoTokenizer.from_pretrained(
            src,
            local_files_only=bool('USE_LOCAL' in globals() and USE_LOCAL))

        mdl = AutoModelForCausalLM.from_pretrained(
            src,
            local_files_only=bool('USE_LOCAL' in globals() and USE_LOCAL),
            device_map="auto",
            torch_dtype="auto"
        )
        pipe = pipeline("text-generation", model=mdl, tokenizer=tok)
        out = pipe(
            prompt,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            temperature=0.0,
            pad_token_id=tok.eos_token_id,
        )
        return out[0]["generated_text"].split("Answer (with citations):")[-1].strip()
    except Exception as e:
        return f"[LLM error] {e}"




In [38]:
# Minimal citation guardrail
_CITE_PATTERN = re.compile(r"\[[A-Z]:\s*p\.\d+(?:-\d+)?(?:\s*\|\s*(?:text|table))?\]")

def ensure_citations(answer: str) -> str:
    text = (answer or "").strip()
    if not _CITE_PATTERN.search(text):
        return text + "\n\n[Note: No in-text citations detected, verify against the provided context.]"
    return text


In [39]:
# End-to-end answer function
def answer_question(
    question: str,
    k: int = 5,
    provider: str = "hf",
    max_ctx_chars: int = 2200,
):
    # Retrieving top-k diversified chunks using the retriever build above
    hits = search_hybrid_mmr(question, k=k)

    if "citation" not in hits.columns:
        hits["citation"] = hits.apply(cite_str, axis=1)

    context = assemble_context(hits, max_ctx_tokens=max_ctx_chars)

    prompt = build_prompt(question, context)

    raw = generate_answer_hf(prompt, model_id="Qwen/Qwen2.5-3B-Instruct", max_new_tokens=400)
    final = ensure_citations(raw)

    return {
        "answer": final,
        "context": context,
        "hits": hits,
        "prompt": prompt,
    }


In [40]:
res = answer_question(
    "What is tranformer and why is it useful?",
    k=5,
    provider="hf",
    max_ctx_chars=6000
)



`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [41]:
print(res["answer"])
print("\n--- CONTEXT ---\n")
print(res["context"])
res["hits"][["citation","heading","content_type","doc_label","page_start","page_end"]].head()

The Transformer is a model architecture proposed in this work that eschews recurrence and instead relies entirely on an attention mechanism to draw global dependencies between input and output. It allows for significantly more parallelization and can reach a new state of the art in translation quality after being trained for as little as twelve hours on eight P100 GPUs. The Transformer achieves better BLEU scores than previous state-of-the-art models on the English-to-German and English-to-French newstest2014 tests at a fraction of the training cost. It was found that the Transformer (big) model outperforms the best previously reported models (including ensembles) by more than 2.0 BLEU, establishing a new state-of-the-art BLEU score of 28.4 on the WMT 2014 English-to-German translation task. The Transformer (big) model trained for English-to-French used dropout rate Pdrop= 0.1, instead of 0.3. The Transformer model is advantageous because it reduces the sequential computation required 

Unnamed: 0,citation,heading,content_type,doc_label,page_start,page_end
1,[A: p.2 | table],,table,A,2,2
16,[B: p.2 | text],,text,B,2,2
31,[B: p.17 | table],,table,B,17,17
7,[A: p.8 | table],,table,A,8,8
13,[A: p.14 | table],,table,A,14,14


In [49]:
# for gpu cache clear
import torch, gc
gc.collect()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()

### Evaluation

- Implements a custom evaluation framework to measure both retrieval and generation quality.
- Calculates standard IR metrics like Hit@k, MRR@k, and nDCG@k to assess ranking performance.
- Checks answer quality using phrase coverage and hallucination detection based on citation presence.
- Logs every evaluation run with timestamps, retrieved documents, and generated answers for later analysis.
- Integrates seamlessly with the hybrid retriever and generator, allowing end-to-end benchmarking on custom queries.

In [43]:

EVAL_LOG = Path("/content/drive/MyDrive/RAG_Ceadar/output/eval_logs/eval_runs.jsonl")

def dcg_at_k(rels, k):
    score = 0.0
    for i, r in enumerate(rels[:k], start=1):
        score += (2**r - 1) / math.log2(i + 1)
    return score

def ndcg_at_k(rels, k):
    dcg = dcg_at_k(rels, k)
    ideal = dcg_at_k(sorted(rels, reverse=True), k)
    return dcg / ideal if ideal > 0 else 0.0

def hit_at_k(labels_ranked, expected_labels, k):
    topk = labels_ranked[:k]
    return 1.0 if any(lbl in expected_labels for lbl in topk) else 0.0

def mrr_at_k(labels_ranked, expected_labels, k):
    for i, lbl in enumerate(labels_ranked[:k], start=1):
        if lbl in expected_labels:
            return 1.0 / i
    return 0.0

def eval_retrieval(hits_df: pd.DataFrame, expected_labels: list, k: int = 5):
    labels_ranked = hits_df["doc_label"].astype(str).tolist()
    rels = [1 if lbl in expected_labels else 0 for lbl in labels_ranked]
    return {
        "Hit@k": hit_at_k(labels_ranked, expected_labels, k),
        "MRR@k": mrr_at_k(labels_ranked, expected_labels, k),
        "nDCG@k": ndcg_at_k(rels, k),
    }

def eval_answer_simple(answer: str, expected_phrases: list[str]):
    a = (answer or "").lower()
    found = sum(1 for p in expected_phrases if p.lower() in a)
    total = max(1, len(expected_phrases))
    return {
        "phrase_coverage": found / total,
        "missing_phrases": [p for p in expected_phrases if p.lower() not in a],
    }

def eval_hallucination_simple(answer: str):
    pat = re.compile(r"\[[A-Z]:\s*p\.\d+(?:-\d+)?(?:\s*\|\s*(?:text|table))?\]")
    has_cite = bool(pat.search(answer or ""))
    return {"has_citation": 1.0 if has_cite else 0.0}

def run_eval_case(question: str, expected_labels: list[str], expected_phrases: list[str], k: int = 5,
                  max_ctx_tokens: int = 2200, max_new_tokens: int = 300, log: bool = True):
    hits = search_hybrid_mmr(question, k=k)
    if "citation" not in hits.columns:
        hits["citation"] = hits.apply(cite_str, axis=1)
    context = assemble_context(hits, max_ctx_tokens=max_ctx_tokens)

    prompt = build_prompt(question, context)
    raw = generate_answer_hf(prompt, model_id="Qwen/Qwen2.5-3B-Instruct", max_new_tokens=max_new_tokens)
    final = ensure_citations(raw)

    rmetrics = eval_retrieval(hits, expected_labels, k=k)
    ametrics = eval_answer_simple(final, expected_phrases)
    hmetrics = eval_hallucination_simple(final)
    record = {
        "ts": time.strftime("%Y-%m-%d %H:%M:%S"),
        "question": question,
        "expected_labels": expected_labels,
        "expected_phrases": expected_phrases,
        "retrieval_metrics": rmetrics,
        "answer_metrics": ametrics,
        "hallucination_metrics": hmetrics,
        "answer": final,
        "top_hits": hits[["citation","doc_label","page_start","page_end","heading"]].head(k).to_dict(orient="records"),
    }


    if log:
        with EVAL_LOG.open("a", encoding="utf-8") as f:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")

    return record


Gold set of queries

In [44]:
GOLD = [
    {
        "q": "What is multi-head attention and why is it useful?",
        "labels": ["A"],
        "phrases": ["multi-head attention", "parallel heads", "subspaces"],
    },
    {
        "q": "Report the BLEU result for WMT14 En-De from the Transformer paper.",
        "labels": ["A"],
        "phrases": ["BLEU", "WMT 2014", "English-to-German"],
    },
    {
        "q": "Summarize how DeepSeek improves reasoning in brief.",
        "labels": ["B"],
        "phrases": ["self-reflection", "deliberate", "reasoning steps"],
    },
]

In [45]:
# Running the evaluation
results = []
for g in GOLD:
    rec = run_eval_case(
        question=g["q"],
        expected_labels=g["labels"],
        expected_phrases=g["phrases"],
        k=5,
        max_ctx_tokens=1600,
        max_new_tokens=220
    )
    results.append(rec)

pd.DataFrame([
    {
        "q": r["question"],
        **r["retrieval_metrics"],
        "phrase_cov": r["answer_metrics"]["phrase_coverage"],
        "has_cite": r["hallucination_metrics"]["has_citation"],
        "answer": r["answer"][:100] + "..." if len(r["answer"]) > 100 else r["answer"],
    }
    for r in results
])

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Unnamed: 0,q,Hit@k,MRR@k,nDCG@k,phrase_cov,has_cite,answer
0,What is multi-head attention and why is it use...,1.0,1.0,0.95583,0.333333,1.0,Multi-head attention is a mechanism used in tr...
1,Report the BLEU result for WMT14 En-De from th...,1.0,1.0,1.0,0.666667,1.0,[A: p.8 | table]\nThe BLEU result for WMT14 En...
2,Summarize how DeepSeek improves reasoning in b...,1.0,1.0,1.0,0.0,1.0,[B: p.2]\nDeepSeek-R1 improves reasoning by in...


In [46]:
for r in results:
  a = r["answer"][:1000] + "..." if len(r["answer"]) > 1000 else r["answer"]

In [47]:
a

"[B: p.2]\nDeepSeek-R1 improves reasoning by incorporating multi-stage training and cold-start data before reinforcement learning. This approach addresses the limitations of DeepSeek-R1-Zero, such as poor readability and language mixing, leading to enhanced reasoning performance. The multi-stage training involves cold start data, which helps in refining the model's understanding and improving its ability to reason effectively across various scenarios. Additionally, the use of rejection sampling and supervised fine-tuning during reinforcement learning ensures that the model learns more efficiently and avoids biases that could arise from unsupervised learning alone. [B: p.2] DeepSeek-R1 improves reasoning through multi-stage training and cold-start data before reinforcement learning, addressing issues like poor readability and language mixing. Multi-stage training includes cold start data to refine the model's reasoning abilities, while reinforcement learning with rejection sampling and 

In [48]:
for r in results:
    cov = r["answer_metrics"]["phrase_coverage"]
    if cov < 1.0:
        print("\n---")
        print("Q:", r["question"])
        print("Expected phrases:", r["expected_phrases"])
        print("Missing:", r["answer_metrics"]["missing_phrases"])
        print("Top hits:", [h["citation"] for h in r["top_hits"]])
        print("\nAnswer:\n", r["answer"])


---
Q: What is multi-head attention and why is it useful?
Expected phrases: ['multi-head attention', 'parallel heads', 'subspaces']
Missing: ['parallel heads', 'subspaces']
Top hits: ['[A: p.5 | table]', '[A: p.14 | table]', '[B: p.20-22 | text]', '[A: p.13 | text]', '[A: p.1 | table]']

Answer:
 Multi-head attention is a mechanism used in transformer models to enable joint attention to multiple representations from different positions. It is useful because it allows each position in the decoder to attend to all positions in the input sequence, which is crucial for tasks like machine translation. The paper mentions that with a single attention head, averaging would inhibit this ability, hence the use of multiple heads. The model employs 8 parallel attention layers, each with 64 dimensions, to achieve this functionality. The paper also illustrates how multi-head attention works through examples and figures, showing how it can focus on specific parts of the input sequence, such as the w