# Assignment: Simple RAG System (Jupyter + Python)

This notebook gives you **clean, copy-ready code** for:
1. Data preprocessing and chunking
2. FAISS retrieval with TF-IDF embeddings
3. GPT-2 response generation from retrieved context
4. Basic evaluation (relevance, completeness, coherence)


## 0) Install libraries (run once)

In [None]:
!pip install -q datasets faiss-cpu scikit-learn transformers torch

## 1) Imports + configuration

In [None]:
import re
import numpy as np
import torch
import faiss
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

# Seed NumPy and PyTorch with the same value so reruns are repeatable.
SEED = 42
for seed_fn in (np.random.seed, torch.manual_seed):
    seed_fn(SEED)

# Run on the GPU when PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print("Using device:", DEVICE)

## 2) Load a small custom dataset (Wikipedia subset)

In [None]:
# Streaming keeps memory usage low: only the requested articles are fetched.
#
# NOTE: the original script-based "wikipedia" dataset ("20220301.en") is
# deprecated on the Hugging Face Hub, and recent `datasets` releases no longer
# execute dataset loading scripts. The maintained Parquet mirror
# "wikimedia/wikipedia" is used instead; its records are expected to expose the
# same `title` / `text` fields read below (snapshot date differs: 2023-11-01).
NUM_ARTICLES = 250
dataset_stream = load_dataset(
    "wikimedia/wikipedia", "20231101.en", split="train", streaming=True
)
articles = list(dataset_stream.take(NUM_ARTICLES))
print("Articles loaded:", len(articles))
# Guard the preview so an empty stream does not raise IndexError here.
if articles:
    print("Example title:", articles[0].get("title", "N/A"))

## 3) Preprocess + chunk text

In [None]:
def clean_text(text: str) -> str:
    """Normalise raw text into lowercase words separated by single spaces.

    Every character other than ``a-z``, ``0-9`` and whitespace is replaced by
    a space (so punctuation and non-ASCII letters act as word separators).
    Runs of whitespace are then collapsed and the ends are trimmed.
    """
    lowered = text.lower()
    alnum_only = re.sub(r"[^a-z0-9\s]", " ", lowered)
    # Splitting on whitespace and re-joining collapses runs and trims the ends.
    return " ".join(alnum_only.split())


def chunk_text(text: str, chunk_size: int = 120, overlap: int = 20, min_words: int = 25):
    """Split text into overlapping word chunks for better retrieval.

    The text is normalised with ``clean_text`` and split on whitespace; a
    window of ``chunk_size`` words then slides forward by
    ``chunk_size - overlap`` words (at least one word per step).

    Args:
        text: Raw document text.
        chunk_size: Maximum number of words per chunk. Non-positive values
            yield no chunks.
        overlap: Number of words shared by consecutive chunks.
        min_words: Chunks shorter than this are discarded as too small to be
            useful. The limit is capped at ``chunk_size`` so that small chunk
            sizes still produce output.

    Returns:
        A list of chunk strings (possibly empty).
    """
    if chunk_size <= 0:
        # A non-positive window cannot hold any words.
        return []

    words = clean_text(text).split()
    if not words:
        return []

    step = max(1, chunk_size - overlap)
    # Without the cap, chunk_size < min_words would filter out every chunk.
    threshold = min(min_words, chunk_size)

    chunks = []
    for start in range(0, len(words), step):
        window = words[start:start + chunk_size]
        if len(window) >= threshold:  # filter tiny chunks
            chunks.append(" ".join(window))
        if start + chunk_size >= len(words):
            # This window already reached the end of the text; any later
            # window would lie entirely inside it and only add a duplicate.
            break
    return chunks


# Chunk every loaded article and pool the chunks into one retrieval corpus.
all_chunks = []
for item in articles:
    all_chunks.extend(chunk_text(item.get("text", ""), chunk_size=120, overlap=20))

print("Total chunks created:", len(all_chunks))
# The newline must be written as an escape; a literal line break inside the
# quotes is a syntax error. Guard the preview in case no chunk was produced.
if all_chunks:
    print("Sample chunk:\n", all_chunks[0][:350])

## 4) Build retrieval index (TF-IDF + FAISS)

In [None]:
# Represent every chunk as a TF-IDF row over unigrams and bigrams
# (vocabulary limited to 5000 features), then densify to float32 for FAISS.
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X = vectorizer.fit_transform(all_chunks)
chunk_vectors = np.asarray(X.toarray(), dtype=np.float32)

# After in-place L2 normalisation, inner product equals cosine similarity.
faiss.normalize_L2(chunk_vectors)

# Exact (brute-force) inner-product index over all chunk vectors.
n_features = chunk_vectors.shape[1]
index = faiss.IndexFlatIP(n_features)
index.add(chunk_vectors)

print("TF-IDF shape:", chunk_vectors.shape)
print("Indexed vectors:", index.ntotal)

## 5) Query and retrieve relevant chunks

In [None]:
def retrieve(query: str, k: int = 3):
    """Return the chunks most similar to *query*, best match first.

    The query is cleaned with ``clean_text``, projected with the fitted
    TF-IDF ``vectorizer``, L2-normalised and searched against ``index``.

    Args:
        query: Free-text user question.
        k: Maximum number of chunks to return. Values larger than the number
            of indexed vectors are clamped; non-positive values return ``[]``.

    Returns:
        A list of dicts with keys ``chunk_id`` (position in ``all_chunks``),
        ``score`` (inner product of the normalised vectors) and ``text``.
    """
    # FAISS pads the result with id -1 when fewer than k vectors exist, and
    # all_chunks[-1] would then silently return the last chunk.
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    q_vec = vectorizer.transform([clean_text(query)]).toarray().astype(np.float32)
    faiss.normalize_L2(q_vec)
    scores, ids = index.search(q_vec, k)

    return [
        {
            "chunk_id": int(idx),
            "score": float(score),
            "text": all_chunks[int(idx)],
        }
        for score, idx in zip(scores[0], ids[0])
        if idx >= 0  # defensive: skip "no result" placeholders
    ]


# Run one example query and preview the top-ranked chunks.
user_query = "What is the capital of France?"
retrieved_docs = retrieve(user_query, k=3)

for i, doc in enumerate(retrieved_docs, start=1):
    print(f"Top {i} | score={doc['score']:.4f}")
    # Trailing "\n" leaves a blank line between results; it must be an
    # escape sequence, since a raw line break inside the quotes does not parse.
    print(doc['text'][:300], "\n")

## 6) Generate answer with GPT-2 using retrieved context

In [None]:
# Load the pretrained GPT-2 tokenizer and language model from the same
# checkpoint, then move the model to the selected device.
GPT2_CHECKPOINT = "gpt2"
tokenizer = GPT2TokenizerFast.from_pretrained(GPT2_CHECKPOINT)
model = GPT2LMHeadModel.from_pretrained(GPT2_CHECKPOINT)
model = model.to(DEVICE)

# When the tokenizer has no padding token, reuse end-of-sequence for padding,
# and record the resulting pad id on the model config.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id


def generate_answer(query: str, retrieved_docs, max_new_tokens: int = 120):
    """Generate a GPT-2 answer to *query* conditioned on the retrieved chunks.

    The prompt has three parts: an instruction plus the question, a bulleted
    list of the retrieved chunk texts, and a final ``Answer:`` cue. Only the
    context part is truncated when the prompt is too long, so the question
    and the cue always reach the model.

    Args:
        query: The user question.
        retrieved_docs: Iterable of dicts that each carry a ``"text"`` entry,
            as returned by ``retrieve``.
        max_new_tokens: Maximum number of tokens to sample after the prompt.

    Returns:
        The sampled continuation (prompt excluded), stripped of surrounding
        whitespace.
    """
    context = "\n".join(f"- {d['text']}" for d in retrieved_docs)

    head = (
        "You are a helpful assistant. Answer the question using only the context.\n\n"
        f"Question: {query}\n\n"
        "Context:\n"
    )
    tail = "\n\nAnswer:"

    head_ids = tokenizer(head)["input_ids"]
    tail_ids = tokenizer(tail)["input_ids"]

    # Keep prompt + generated tokens inside the model's position window,
    # never exceeding the 900-token prompt cap.
    window = getattr(model.config, "n_positions", 1024)
    prompt_budget = min(900, window - max_new_tokens)
    context_budget = max(0, prompt_budget - len(head_ids) - len(tail_ids))

    # Truncate only the context; cutting the whole prompt from the right
    # would drop the "Answer:" cue first.
    if context_budget > 0:
        context_ids = tokenizer(
            context, truncation=True, max_length=context_budget
        )["input_ids"]
    else:
        context_ids = []

    input_ids = torch.tensor(
        [head_ids + context_ids + tail_ids], dtype=torch.long, device=DEVICE
    )
    attention_mask = torch.ones_like(input_ids)

    with torch.no_grad():
        output_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            no_repeat_ngram_size=2,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens. Splitting the decoded text on
    # "Answer:" breaks when the model repeats that marker in its own output.
    new_tokens = output_ids[0, input_ids.shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()


# Generate and display an answer for the example query.
answer = generate_answer(user_query, retrieved_docs)
print("Query:", user_query)
# "\n" is written as an escape; a raw line break inside the quotes is a
# syntax error.
print("Generated Answer:\n", answer)

## 7) Evaluation (required metrics)

Given corpus/query/response, compute:
- **Relevance** = cosine similarity between the TF-IDF vectors of the query and the response
- **Completeness** = proportion of query keywords found in response
- **Coherence** = GPT-2 perplexity (lower is better)

In [None]:
# Fixed evaluation inputs: reference corpus, user query, and the response
# being scored.
corpus = [
    "Distributed Data Parallel (DDP) allows multi-GPU training.",
    "DDP synchronizes gradients across GPUs.",
    "Using multiple GPUs increases training efficiency.",
    "Parallelization across GPUs is useful for large models.",
]

query = "What is Distributed Data Parallel (DDP) in PyTorch, and how is it useful in multi-GPU setups?"
response = "Distributed Data Parallel (DDP) in PyTorch is a module that enables parallel training across multiple GPUs by distributing model replicas and splitting data across them."

# Relevance: fit TF-IDF (English stop words removed) on the query/response
# pair, then take the cosine similarity between the two resulting rows.
rel_vec = TfidfVectorizer(stop_words="english")
qr = rel_vec.fit_transform([query, response])
query_row = qr[:1]
response_row = qr[1:2]
relevance = cosine_similarity(query_row, response_row)[0, 0]

# Completeness: proportion of important query keywords found in response
def keywords(text: str):
    """Return the sorted, de-duplicated content words of *text*.

    Words are lowercase runs of ASCII letters; stop words and words of two
    letters or fewer are dropped.
    """
    stop = {"what","is","in","and","how","it","the","a","an","of","to","for","on","by","with"}
    unique = set()
    for tok in re.findall(r"[a-zA-Z]+", text.lower()):
        if len(tok) > 2 and tok not in stop:
            unique.add(tok)
    return sorted(unique)

# Share of query keywords that occur verbatim (case-insensitively) among the
# response's alphabetic tokens.
q_keywords = keywords(query)
r_tokens = {m.group(0) for m in re.finditer(r"[a-zA-Z]+", response.lower())}
covered = list(filter(r_tokens.__contains__, q_keywords))
# `or 1` avoids dividing by zero when the query yields no keywords.
completeness = len(covered) / (len(q_keywords) or 1)

# Coherence: perplexity under GPT-2
def perplexity(text: str):
    """Return the GPT-2 perplexity of *text* (lower means more fluent).

    The text is tokenised (truncated to 512 tokens) and scored with the
    language-model loss, using the input ids as labels; perplexity is the
    exponential of that mean loss.

    Args:
        text: The text to score.

    Returns:
        The perplexity as a float, or ``nan`` when the text has fewer than
        two tokens: next-token loss needs at least one (context, target)
        pair, so perplexity is undefined for shorter inputs.
    """
    ids = tokenizer(text, truncation=True, max_length=512)["input_ids"]
    if len(ids) < 2:
        # Empty input would crash the model's forward pass, and a single
        # token has no target to predict.
        return float("nan")

    input_ids = torch.tensor([ids], dtype=torch.long, device=DEVICE)
    with torch.no_grad():
        out = model(input_ids=input_ids, labels=input_ids)
    return float(torch.exp(out.loss))

# Score the response's fluency, then report all three metrics followed by
# the keyword lists behind the completeness figure.
coherence_ppl = perplexity(response)

metric_lines = (
    f"Relevance (cosine similarity): {relevance:.4f}",
    f"Completeness (keyword coverage): {completeness:.4f}",
    f"Coherence (perplexity, lower is better): {coherence_ppl:.4f}",
)
for metric_line in metric_lines:
    print(metric_line)
print("Query keywords:", q_keywords)
print("Covered keywords:", covered)

## 8) Optional experiment: compare chunk sizes

In [None]:
def evaluate_chunk_size(sample_articles, query_text, chunk_size):
    """Measure how well one chunk size retrieves for a single query.

    The articles are re-chunked with ``chunk_size`` words per chunk (overlap
    is one sixth of the chunk size, at least 5 words). A temporary TF-IDF +
    FAISS inner-product index is built over the chunks and the query's best
    cosine score is recorded.

    Args:
        sample_articles: Iterable of article dicts; the ``"text"`` entry is
            read and a missing entry is treated as empty text.
        query_text: Query used to probe the temporary index.
        chunk_size: Words per chunk for this run.

    Returns:
        A dict with ``chunk_size``, ``num_chunks`` and ``top_score``.
        ``top_score`` is ``nan`` when no chunks were produced.
    """
    overlap = max(5, chunk_size // 6)
    temp_chunks = [
        chunk
        for art in sample_articles
        for chunk in chunk_text(art.get("text", ""), chunk_size=chunk_size, overlap=overlap)
    ]

    if not temp_chunks:
        # TfidfVectorizer raises an opaque "empty vocabulary" error on an
        # empty corpus; report "no data" explicitly instead.
        return {
            "chunk_size": chunk_size,
            "num_chunks": 0,
            "top_score": float("nan"),
        }

    temp_vec = TfidfVectorizer(max_features=4000, ngram_range=(1, 2))
    temp_matrix = temp_vec.fit_transform(temp_chunks).toarray().astype(np.float32)
    # Unit-length rows make inner product equal to cosine similarity.
    faiss.normalize_L2(temp_matrix)

    temp_index = faiss.IndexFlatIP(temp_matrix.shape[1])
    temp_index.add(temp_matrix)

    q = temp_vec.transform([clean_text(query_text)]).toarray().astype(np.float32)
    faiss.normalize_L2(q)
    # Only the best score is reported, so a single neighbour is enough.
    scores, _ = temp_index.search(q, 1)

    return {
        "chunk_size": chunk_size,
        "num_chunks": len(temp_chunks),
        "top_score": float(scores[0][0]),
    }

# Probe three chunk sizes on the first 120 articles with one fixed query.
sample_articles = articles[:120]
for cs in (80, 120, 180):
    print(evaluate_chunk_size(sample_articles, "what is france capital city", cs))

## 9) Qualitative observations for report/PPT/video

Use this structure in your submission:
- **Relevance:** Were top chunks directly related to the query?
- **Coherence:** Was the generated answer clear and logically written?
- **Completeness:** Did it fully answer the query?
- **Challenges:** Retrieval misses, GPT-2 hallucination, limited context length.
- **Improvements:** Better embedding model, reranker, stronger prompt constraints, answer citation checks.