# Simple RAG Assignment (Jupyter, Python Kernel)

This notebook implements:
1. Dataset loading + preprocessing + chunking
2. TF-IDF embeddings + FAISS index
3. Query retrieval
4. GPT-2 response generation grounded in the retrieved context
5. Qualitative + lightweight quantitative evaluation


In [None]:
!pip install -q datasets faiss-cpu scikit-learn transformers torch nltk

In [None]:
import re
import numpy as np
import torch
import faiss
import nltk
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

# Fetch the NLTK 'punkt' resource without printing download progress.
# NOTE(review): no NLTK tokenizer call is visible in this notebook — confirm
# this download is still needed.
nltk.download('punkt', quiet=True)

# Seed NumPy and PyTorch so the sampling in GPT-2 generation (do_sample=True)
# starts from a fixed RNG state on every run.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

## Step 1: Load, clean, and chunk dataset

In [None]:
# Load a small streaming subset of Wikipedia
# With streaming=True the split is iterated lazily; take(300) + list() then
# materializes only the first 300 records in memory.
# NOTE(review): "wikipedia" is the legacy script-based loader; newer `datasets`
# releases may need the "wikimedia/wikipedia" repo instead — confirm against
# the installed version.
stream_ds = load_dataset("wikipedia", "20220301.en", split="train", streaming=True)
subset = list(stream_ds.take(300))  # keep it lightweight for notebook runtime


def clean_text(text: str) -> str:
    """Lowercase *text*, drop unsupported characters, and collapse whitespace.

    Only lowercase ASCII letters, digits, spaces, and the punctuation
    ``. , ; : ! ? ( ) ' " -`` are kept. Every run of other characters
    (including tabs and newlines) becomes a single space.

    Args:
        text: Raw input text.

    Returns:
        The normalized text with no leading/trailing whitespace and no
        repeated spaces.
    """
    lowered = text.lower()
    # The hyphen is escaped (`\-`) so it is a literal, not a range operator.
    kept = re.sub(r"[^a-z0-9.,;:!?()'\"\- ]+", " ", lowered)
    # Collapse whitespace last so removed characters cannot leave double spaces.
    return re.sub(r"\s+", " ", kept).strip()


def chunk_text_by_words(text: str, chunk_size: int = 120, overlap: int = 20, min_words: int = 25):
    """Split cleaned *text* into overlapping word windows.

    The text is normalized with ``clean_text`` and split on whitespace.
    Windows of ``chunk_size`` words are taken every ``chunk_size - overlap``
    words (at least 1).

    Args:
        text: Raw document text.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks.
        min_words: Windows shorter than this are discarded as tiny fragments.

    Returns:
        A list of chunk strings (possibly empty).

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        # A non-positive size would make the slice below use negative indexing.
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    words = clean_text(text).split()
    if not words:
        return []

    step = max(1, chunk_size - overlap)
    chunks = []
    for start in range(0, len(words), step):
        window = words[start:start + chunk_size]
        if len(window) >= min_words:
            chunks.append(" ".join(window))
        # Once a window reaches the end of the text, every later window is a
        # strict suffix of it and would only add duplicate content.
        if start + chunk_size >= len(words):
            break
    return chunks

# Chunk every loaded article and pool the pieces into one flat list.
all_chunks = [
    piece
    for article in subset
    for piece in chunk_text_by_words(article.get("text", ""), chunk_size=120, overlap=20)
]

# Quick sanity report on what was loaded and produced.
print(f"Articles loaded: {len(subset)}")
print(f"Total chunks: {len(all_chunks)}")
print("Sample chunk:\n", all_chunks[0][:400])

## Step 2: Build retrieval with TF-IDF + FAISS

In [None]:
# TF-IDF embeddings
# Vocabulary is capped at 5000 unigram/bigram features; the sparse matrix is
# densified to float32 before being handed to FAISS.
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
tfidf_matrix = vectorizer.fit_transform(all_chunks)
chunk_embeddings = tfidf_matrix.toarray().astype(np.float32)

# Normalize vectors and build cosine-sim FAISS via inner product
# normalize_L2 modifies chunk_embeddings in place; on unit-length rows the
# inner product used by IndexFlatIP equals cosine similarity.
faiss.normalize_L2(chunk_embeddings)
index = faiss.IndexFlatIP(chunk_embeddings.shape[1])
index.add(chunk_embeddings)

# Report the (n_chunks, n_features) shape and the number of indexed vectors.
print("Embedding shape:", chunk_embeddings.shape)
print("FAISS index size:", index.ntotal)

## Step 3: Query and retrieve top-k chunks

In [None]:
def retrieve_chunks(query: str, k: int = 3):
    """Return up to *k* chunks most similar to *query*.

    The query is cleaned with ``clean_text``, embedded with the fitted
    module-level ``vectorizer``, L2-normalized, and searched against the
    module-level FAISS ``index``.

    Args:
        query: Free-text user query.
        k: Maximum number of hits to return.

    Returns:
        A list of dicts with keys ``"score"`` (inner-product similarity),
        ``"chunk"`` (chunk text) and ``"chunk_id"`` (position in
        ``all_chunks``), in the order FAISS returns them. The list is empty
        when ``k <= 0`` or the index holds no vectors.
    """
    if k <= 0 or index.ntotal == 0:
        return []

    q_vec = vectorizer.transform([clean_text(query)]).toarray().astype(np.float32)
    faiss.normalize_L2(q_vec)

    # Never ask for more neighbours than the index contains.
    scores, ids = index.search(q_vec, min(k, index.ntotal))

    return [
        {"score": float(score), "chunk": all_chunks[idx], "chunk_id": int(idx)}
        for score, idx in zip(scores[0], ids[0])
        # FAISS pads missing neighbours with id -1; indexing all_chunks with
        # -1 would silently return the last chunk.
        if idx >= 0
    ]

# Demo: retrieve the three best-matching chunks for a sample question.
query = "What is the capital of France?"
retrieved = retrieve_chunks(query, k=3)

# Show each hit's rank, similarity score, and a 300-character preview.
for rank, hit in enumerate(retrieved, start=1):
    preview = hit['chunk'][:300]
    print(f"Top {rank} | score={hit['score']:.4f}")
    print(preview, "\n")

## Step 4: Generate response using GPT-2 with retrieved context

In [None]:
# Load the pretrained "gpt2" tokenizer and language-model head; both are
# shared by the generation and perplexity cells below.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# GPT-2 has no pad token by default; map it to eos
# The model config is updated too so the model and tokenizer agree on the pad id.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.eos_token_id


def generate_answer_from_chunks(user_query: str, retrieved_chunks, max_new_tokens: int = 120):
    """Generate a GPT-2 answer to *user_query* conditioned on retrieved chunks.

    The prompt is ``instruction + question + context + "Answer:"``. When the
    prompt is too long, only the context is shortened, so the instruction,
    the question, and the trailing ``Answer:`` cue are kept.

    Args:
        user_query: The question to answer.
        retrieved_chunks: Iterable of hits; each item is either a dict with a
            ``"chunk"`` key or a plain string.
        max_new_tokens: Maximum number of tokens to sample after the prompt.

    Returns:
        The newly generated continuation only (prompt excluded), stripped.

    Raises:
        ValueError: If ``max_new_tokens`` leaves no room for a prompt within
            the model's position limit.
    """
    context = "\n".join(
        f"- {item['chunk']}" if isinstance(item, dict) else f"- {item}"
        for item in retrieved_chunks
    )
    header = (
        "You are a helpful assistant. Use ONLY the context to answer the question.\n\n"
        f"Question: {user_query}\n\n"
        "Context:\n"
    )
    footer = "\n\nAnswer:"

    # Keep prompt + generated tokens within the model's position limit while
    # preserving the original 900-token cap on the prompt.
    max_positions = getattr(model.config, "n_positions", 1024)
    prompt_budget = min(900, max_positions - max_new_tokens)
    if prompt_budget <= 0:
        raise ValueError(
            f"max_new_tokens={max_new_tokens} leaves no room for a prompt "
            f"(model limit is {max_positions} positions)"
        )

    header_ids = tokenizer(header)["input_ids"]
    footer_ids = tokenizer(footer)["input_ids"]

    # Truncate only the context so the "Answer:" cue is never cut off.
    room = prompt_budget - len(header_ids) - len(footer_ids)
    if room > 0 and context:
        context_ids = tokenizer(context, truncation=True, max_length=room)["input_ids"]
    else:
        context_ids = []

    # Safety net for an extremely long question: trim before the footer.
    body_ids = (header_ids + context_ids)[: max(0, prompt_budget - len(footer_ids))]
    prompt_ids = body_ids + footer_ids

    input_ids = torch.tensor([prompt_ids], dtype=torch.long).to(model.device)
    attention_mask = torch.ones_like(input_ids)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            no_repeat_ngram_size=2,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the tokens produced after the prompt. Splitting the full
    # text on "Answer:" breaks when the model emits that marker again.
    new_tokens = outputs[0][input_ids.shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

# Generate an answer for the sample query from the chunks retrieved above,
# then print the question next to the model output.
generated_answer = generate_answer_from_chunks(query, retrieved)
print("Query:", query)
print("Generated answer:\n", generated_answer)

## Step 5: Basic evaluation (relevance, completeness, coherence)

The cell below uses the required example corpus/query/response format and computes:
- **Relevance**: cosine similarity between query and response (TF-IDF)
- **Completeness**: proportion of important query keywords covered in response
- **Coherence**: perplexity of response under GPT-2 (lower is better)

In [None]:
# Fixed example corpus for the evaluation section.
# NOTE(review): `corpus` is not referenced by any metric computed in the code
# visible here — confirm whether it should feed retrieval or be removed.
corpus = [
    "Distributed Data Parallel (DDP) allows multi-GPU training.",
    "DDP synchronizes gradients across GPUs.",
    "Using multiple GPUs increases training efficiency.",
    "Parallelization across GPUs is useful for large models."
]

# Rebinds the notebook-level `query` (previously the France question) to the
# evaluation question; `response` is a hand-written answer to score.
query = "What is Distributed Data Parallel (DDP) in PyTorch, and how is it useful in multi-GPU setups?"
response = (
    "Distributed Data Parallel (DDP) in PyTorch is a module that enables parallel "
    "training across multiple GPUs by distributing model replicas and splitting data "
    "across them."
)

# 1) Relevance via cosine similarity
# The TF-IDF vocabulary is fitted on just these two texts (English stop words
# removed); row 0 is the query and row 1 is the response.
rel_vectorizer = TfidfVectorizer(stop_words="english")
qr_mat = rel_vectorizer.fit_transform([query, response])
relevance_score = cosine_similarity(qr_mat[0:1], qr_mat[1:2])[0, 0]

# 2) Completeness via keyword coverage
def extract_keywords(text: str):
    """Return the sorted, de-duplicated content words found in *text*.

    Words are maximal runs of ASCII letters taken from the lowercased text.
    A word is kept only if it is longer than two characters and is not in
    the small built-in stop list.
    """
    stop = frozenset((
        "what", "is", "in", "and", "how", "it", "the", "a", "an",
        "of", "to", "for", "on", "by", "with",
    ))
    keywords = set()
    for word in re.findall(r"[a-zA-Z]+", text.lower()):
        if len(word) <= 2 or word in stop:
            continue
        keywords.add(word)
    return sorted(keywords)

# Completeness = fraction of query keywords that appear verbatim among the
# response's lowercase alphabetic tokens (exact match, so "gpu" does not
# match "gpus").
query_keywords = extract_keywords(query)
response_tokens = set(re.findall(r"[a-zA-Z]+", response.lower()))
covered = [kw for kw in query_keywords if kw in response_tokens]
# max(1, ...) avoids division by zero when the query yields no keywords.
completeness_score = len(covered) / max(1, len(query_keywords))

# 3) Coherence via perplexity (lower is better)
def perplexity_gpt2(text: str, max_length: int = 512):
    """Return the GPT-2 perplexity of *text* (lower means more fluent).

    Perplexity is ``exp`` of the language-model loss that the module-level
    ``model`` reports when the input ids are also passed as ``labels``.

    Args:
        text: Text to score.
        max_length: Token limit; longer inputs are truncated before scoring.

    Returns:
        The perplexity as a float, or ``nan`` when *text* has fewer than two
        tokens and there is nothing to predict.
    """
    if not text:
        return float("nan")

    enc = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length)
    # With a single token there are no next-token targets, so the loss is
    # undefined; report that explicitly instead of failing inside the model.
    if enc["input_ids"].shape[1] < 2:
        return float("nan")

    with torch.no_grad():
        out = model(**enc, labels=enc["input_ids"])
    return float(torch.exp(out.loss))

# Score the hand-written response with GPT-2, then print all three metrics
# together with the keyword lists behind the completeness score.
coherence_perplexity = perplexity_gpt2(response)

print(f"Relevance (cosine similarity): {relevance_score:.4f}")
print(f"Completeness (keyword coverage): {completeness_score:.4f}")
print(f"Coherence (GPT-2 perplexity, lower better): {coherence_perplexity:.4f}")
print("\nQuery keywords:", query_keywords)
print("Covered keywords:", covered)

## Optional experiment: chunk size comparison

Try different chunk sizes to observe retrieval-quality changes.

In [None]:
def build_index_with_chunk_size(articles, chunk_size=120, overlap=20):
    """Chunk *articles* and build a fresh TF-IDF + FAISS index over the chunks.

    Each article's "text" field (empty string if missing) is split with
    ``chunk_text_by_words``. The chunks are embedded with a new TF-IDF
    vectorizer, L2-normalized, and added to an inner-product FAISS index.

    Returns:
        A ``(chunks, vectorizer, index)`` tuple.
    """
    chunks = [
        piece
        for a in articles
        for piece in chunk_text_by_words(a.get("text", ""), chunk_size=chunk_size, overlap=overlap)
    ]

    vec = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
    sparse = vec.fit_transform(chunks)
    mat = sparse.toarray().astype(np.float32)

    # Unit-length rows make the inner product equal to cosine similarity.
    faiss.normalize_L2(mat)
    idx = faiss.IndexFlatIP(mat.shape[1])
    idx.add(mat)
    return chunks, vec, idx

# For each chunk size, rebuild the index over the first 120 articles
# (overlap is one sixth of the chunk size) and report the best similarity
# score for a fixed test question, plus the number of chunks produced.
for size in [80, 120, 180]:
    chunks_s, vec_s, idx_s = build_index_with_chunk_size(subset[:120], chunk_size=size, overlap=size//6)
    # Embed the query with this run's own vectorizer, since each run fits its own vocabulary.
    q_vec = vec_s.transform([clean_text("what is france's capital city?")]).toarray().astype(np.float32)
    faiss.normalize_L2(q_vec)
    scores, ids = idx_s.search(q_vec, 3)
    print(f"Chunk size={size}, top score={scores[0][0]:.4f}, chunks={len(chunks_s)}")

## Qualitative observation template (for report/PPT)

- **Relevant example:** Retrieved chunks explicitly mention terms from the query.
- **Coherence strength:** GPT-2 output is usually fluent when context is clear.
- **Common issue:** GPT-2 may add unsupported details if context is weak.
- **Improvement ideas:** Better reranking, larger embedding model, prompt constraints, and answer post-validation.