# RAG Summarization - Complete Pipeline

## 📋 Project Workflow
1. ✅ Set up retrieval (BM25 & FAISS dense)
2. ✅ Implement RAG pipeline
3. ✅ Ablation studies (retriever type, top-k)
4. ✅ Evaluate QA & summarisation, compute readability metrics
5. ✅ Log and compare ablation results

## 📊 Evaluation Metrics
- ROUGE-1, ROUGE-2, ROUGE-L
- F1 BERTScore
- Avg Flesch-Kincaid Grade
- Individual FK grades (in CSV)

## 🔧 Installation (Run Once)

In [None]:
!pip install -q rank-bm25 sentence-transformers faiss-cpu transformers torch rouge-score bert-score textstat pandas numpy tqdm openpyxl

## 📦 Imports

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import json
import os
from typing import List, Dict, Tuple

# Retrieval
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
import faiss

# Generation
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Evaluation
from rouge_score import rouge_scorer
from bert_score import score as bert_score
import textstat

print("‚úì All imports successful")

## 1️⃣ Data Loading

In [None]:
# Read the spreadsheet into `df`, report its shape and column names,
# then show the first rows as the cell output
df = pd.read_excel(io="rag_dataset.xlsx")
print(
    f"Dataset shape: {df.shape}",
    f"Columns: {df.columns.tolist()}",
    sep="\n",
)
df.head()

In [None]:
# Build the retrieval corpus: one row per distinct passage_id, then pull the
# passage texts and their ids out as two parallel lists
unique_passages = df.drop_duplicates(subset=["passage_id"])
corpus, passage_ids = (
    unique_passages[column].tolist() for column in ("passage", "passage_id")
)

print(f"Total unique passages: {len(corpus)}")

## 2️⃣ Retriever Classes

In [None]:
# BM25 Retriever
class BM25Retriever:
    """Lexical retriever that ranks passages with ``BM25Okapi`` scores.

    Passages and queries are tokenised the same way: lower-cased and split
    on whitespace.
    """

    @staticmethod
    def _tokenize(text: str) -> List[str]:
        """Lower-case ``text`` and split it on whitespace."""
        return text.lower().split()

    def __init__(self, corpus: List[str]):
        """Build the BM25 index over ``corpus`` and keep the raw passages."""
        print("Initializing BM25...")
        tokenized_passages = [self._tokenize(passage) for passage in corpus]
        self.bm25 = BM25Okapi(tokenized_passages)
        self.corpus = corpus
        print("‚úì BM25 ready")

    def retrieve(self, query: str, k: int = 3) -> List[str]:
        """Return the ``k`` passages with the highest BM25 score for ``query``.

        Results are ordered best-first; fewer than ``k`` passages come back
        when the corpus is smaller than ``k``.
        """
        passage_scores = self.bm25.get_scores(self._tokenize(query))
        # argsort is ascending, so reverse it to rank best-first before slicing
        best_first = np.argsort(passage_scores)[::-1]
        return [self.corpus[idx] for idx in best_first[:k]]

In [None]:
# FAISS Dense Retriever
class FAISSRetriever:
    """Dense retriever: sentence embeddings searched with a flat FAISS index.

    Passage and query vectors are L2-normalised and compared by inner
    product, which makes the score a cosine similarity.
    """

    def __init__(self, corpus: List[str], model_name: str = 'all-MiniLM-L6-v2'):
        """Embed every passage in ``corpus`` and add the vectors to the index.

        Args:
            corpus: Passage texts; position in this list is the id FAISS
                returns from a search.
            model_name: SentenceTransformer checkpoint used for both passages
                and queries.
        """
        print(f"Initializing FAISS with {model_name}...")
        self.embed_model = SentenceTransformer(model_name)
        self.corpus = corpus

        # Create embeddings
        print("Creating passage embeddings...")
        self.passage_embeddings = self.embed_model.encode(
            corpus,
            show_progress_bar=True,
            convert_to_numpy=True
        )

        # Build FAISS index
        dimension = self.passage_embeddings.shape[1]
        self.index = faiss.IndexFlatIP(dimension)  # Inner Product

        # Normalize (in place) for cosine similarity
        faiss.normalize_L2(self.passage_embeddings)
        self.index.add(self.passage_embeddings)
        print(f"‚úì FAISS index ready with {self.index.ntotal} vectors")

    def retrieve(self, query: str, k: int = 3) -> List[str]:
        """Return up to ``k`` passages most similar to ``query``, best first.

        ``k`` is capped at the number of indexed vectors. Without the cap,
        FAISS pads the result with ``-1`` ids when fewer than ``k`` vectors
        exist, and ``self.corpus[-1]`` would silently return the last passage
        as a bogus hit. A non-positive ``k`` yields an empty list.
        """
        k = min(k, self.index.ntotal)
        if k <= 0:
            return []

        query_vec = self.embed_model.encode([query], convert_to_numpy=True)
        faiss.normalize_L2(query_vec)
        _, indices = self.index.search(query_vec, k)
        # Defensive: skip any -1 padding that may still be returned
        return [self.corpus[i] for i in indices[0] if i >= 0]

## 3️⃣ Initialize Retrievers

In [None]:
# Build the BM25 retriever over the passage corpus
bm25_retriever = BM25Retriever(corpus=corpus)

In [None]:
# Build the dense retriever over the same corpus (embedding every passage
# may take a few minutes)
faiss_retriever = FAISSRetriever(corpus=corpus)

## 4️⃣ Summarization Model

In [None]:
class SummarizationModel:
    """Seq2seq generator that answers a question from retrieved passages."""

    # Tokens held back from the context budget. Tokenising the head, context
    # and tail separately can give a slightly different count than tokenising
    # the joined prompt, so a small margin keeps the final prompt under the cap.
    _BOUNDARY_SLACK = 8

    def __init__(self, model_name: str = "google/flan-t5-base"):
        """Load the tokenizer and model, placing the model on GPU if available."""
        print(f"Loading {model_name}...")
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(self.device)
        print(f"‚úì Model loaded on {self.device}")

    def generate(
        self,
        question: str,
        context: List[str],
        max_length: int = 150,
        max_input_tokens: int = 512,
    ) -> str:
        """Generate a summary answer for ``question`` from ``context``.

        The prompt is ``Question / Context / instruction``. Truncating the
        whole prompt to ``max_input_tokens`` cuts the instruction, which comes
        last, as soon as the joined passages are too long (assuming the
        tokenizer truncates from the end, its usual default). To avoid that,
        only the context is shortened so that the question and the
        instruction always fit.

        Args:
            question: The question to answer.
            context: Retrieved passages, joined with single spaces.
            max_length: Maximum length passed to ``model.generate``.
            max_input_tokens: Token cap for the encoded prompt.

        Returns:
            The decoded generation with special tokens removed.
        """
        prompt_head = f"Question: {question}\n\nContext: "
        prompt_tail = "\n\nSummarize the answer based on the context:"
        context_str = " ".join(context)

        # Tokens used by everything except the context
        overhead = len(self.tokenizer(prompt_head + prompt_tail)["input_ids"])
        budget = max_input_tokens - overhead - self._BOUNDARY_SLACK
        if budget > 0:
            context_ids = self.tokenizer(
                context_str,
                add_special_tokens=False,
                truncation=True,
                max_length=budget,
            )["input_ids"]
            # Only swap in the decoded text when the budget was actually hit,
            # so contexts that fit reach the model unchanged.
            if len(context_ids) >= budget:
                context_str = self.tokenizer.decode(context_ids, skip_special_tokens=True)
        # budget <= 0: the question alone fills the window; fall through to
        # plain whole-prompt truncation below.

        prompt = f"{prompt_head}{context_str}{prompt_tail}"

        # Final truncation remains as a safety net
        inputs = self.tokenizer(
            prompt, return_tensors="pt", truncation=True, max_length=max_input_tokens
        ).to(self.device)

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_length=max_length,
                num_beams=4,
                early_stopping=True
            )

        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

# Create the generator instance used by every later cell
summarizer = SummarizationModel(model_name="google/flan-t5-base")

## 5️⃣ Quick Test (小样本测试)

In [None]:
# Smoke test: push the first unique question through both retrievers
test_row = df.drop_duplicates(subset=["question_id"]).iloc[0]

print("Question:", test_row["question"])
print("\nTrue answer:", test_row["answer"])


def _quick_test(label, retriever, question, k=3):
    """Retrieve ``k`` passages, summarise them, print the summary and its FK grade."""
    print("\n" + "="*60)
    print(f"{label} Retrieval (k={k}):")
    passages = retriever.retrieve(question, k=k)
    summary = summarizer.generate(question, passages)
    print("Summary:", summary)
    print("FK Grade:", textstat.flesch_kincaid_grade(summary))
    return passages, summary


# BM25
bm25_ctx, bm25_summary = _quick_test("BM25", bm25_retriever, test_row["question"])

# FAISS
faiss_ctx, faiss_summary = _quick_test("FAISS", faiss_retriever, test_row["question"])

## 6️⃣ Evaluation Functions

In [None]:
class RAGEvaluator:
    """Scores generated summaries with ROUGE, BERTScore and Flesch-Kincaid."""

    def __init__(self):
        """Create the ROUGE scorer (ROUGE-1/2/L with stemming enabled)."""
        rouge_variants = ['rouge1', 'rouge2', 'rougeL']
        self.rouge_scorer = rouge_scorer.RougeScorer(rouge_variants, use_stemmer=True)

    def evaluate_batch(self, predictions: List[str], references: List[str]) -> Tuple[Dict, List]:
        """Compute corpus-level metrics for paired predictions and references.

        Returns:
            A ``(metrics, fk_grades)`` tuple. ``metrics`` holds the mean
            ROUGE-1/2/L F-measures, the mean BERTScore F1 and the mean
            Flesch-Kincaid grade. ``fk_grades`` is the per-prediction
            Flesch-Kincaid grade, in input order.
        """
        # ROUGE: the reference is the target and the prediction is scored against it
        pair_scores = [
            self.rouge_scorer.score(ref, pred)
            for pred, ref in zip(predictions, references)
        ]
        rouge_f = {
            variant: [pair[variant].fmeasure for pair in pair_scores]
            for variant in ('rouge1', 'rouge2', 'rougeL')
        }

        # BERTScore: only the F1 tensor is used
        _, _, bert_f1 = bert_score(predictions, references, lang="en", verbose=False)

        # Flesch-Kincaid grade of each prediction
        fk_grades = list(map(textstat.flesch_kincaid_grade, predictions))

        metrics = {
            "ROUGE-1": np.mean(rouge_f['rouge1']),
            "ROUGE-2": np.mean(rouge_f['rouge2']),
            "ROUGE-L": np.mean(rouge_f['rougeL']),
            "BERTScore_F1": bert_f1.mean().item(),
            "Avg_FK_Grade": np.mean(fk_grades),
        }
        return metrics, fk_grades

# Single evaluator instance shared by the ablation and full-evaluation cells
evaluator = RAGEvaluator()
print("‚úì Evaluator ready")

## 7️⃣ Ablation Study (消融研究)

In [None]:
# Sample test set for ablation
TEST_SIZE = 500  # Adjust based on your computational resources
_ablation_pool = df.drop_duplicates(subset=["question_id"])
# DataFrame.sample (without replacement) raises ValueError when n is larger
# than the number of rows, so cap the request at the number of unique questions.
test_df = _ablation_pool.sample(
    n=min(TEST_SIZE, len(_ablation_pool)),
    random_state=42,  # fixed seed keeps the sample reproducible across runs
)

print(f"Test set size: {len(test_df)}")

In [None]:
# Ablation grid: each retriever crossed with each top-k value
configs = [
    {"name": f"{label}_k{top_k}", "retriever": retriever, "k": top_k}
    for label, retriever in (("BM25", bm25_retriever), ("FAISS", faiss_retriever))
    for top_k in (3, 5, 10)
]

ablation_results = []

for config in configs:
    run_name = config['name']
    rule = '=' * 60
    print(f"\n{rule}")
    print(f"Running: {run_name}")
    print(rule)

    # Generate one summary per sampled question with this retriever and k
    predictions, references = [], []
    for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc=run_name):
        passages = config['retriever'].retrieve(row['question'], k=config['k'])
        predictions.append(summarizer.generate(row['question'], passages))
        references.append(row['answer'])

    # Score the whole batch; the per-sample FK grades are not needed here
    metrics, _ = evaluator.evaluate_batch(predictions, references)

    ablation_results.append({
        "config_name": run_name,
        "retriever_type": run_name.split('_')[0],
        "k": config['k'],
        **metrics,
    })

    print("\nResults:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value:.4f}")

# Persist one row per configuration
os.makedirs("results", exist_ok=True)
ablation_df = pd.DataFrame(ablation_results)
ablation_df.to_csv("results/ablation_study.csv", index=False)

closing_rule = "=" * 80
print("\n" + closing_rule)
print("ABLATION STUDY COMPLETE")
print(closing_rule)
ablation_df

## 8️⃣ Full Evaluation (完整评估)

使用 ablation study 中表现最好的配置 (use the best-performing configuration from the ablation study)

In [None]:
# Pick the ablation row with the highest BERTScore F1
_best_idx = ablation_df['BERTScore_F1'].idxmax()
best_config = ablation_df.loc[_best_idx]
print("Best configuration:")
print(best_config)

# Map the winning row back to the live retriever object and its top-k
if "FAISS" in best_config['config_name']:
    BEST_RETRIEVER = faiss_retriever
else:
    BEST_RETRIEVER = bm25_retriever
BEST_K = int(best_config['k'])

In [None]:
# Run the selected configuration over every unique question in the dataset
print("Running full evaluation...")
print(f"Using: {best_config['config_name']}")

unique_df = df.drop_duplicates(subset=["question_id"]).reset_index(drop=True)

predictions, references = [], []
for _, row in tqdm(unique_df.iterrows(), total=len(unique_df), desc="Full evaluation"):
    passages = BEST_RETRIEVER.retrieve(row['question'], k=BEST_K)
    predictions.append(summarizer.generate(row['question'], passages))
    references.append(row['answer'])

# Attach the generated summaries, then the per-sample readability grades
unique_df['rag_summary'] = predictions
final_metrics, fk_grades = evaluator.evaluate_batch(predictions, references)
unique_df['FK_grade'] = fk_grades

# Write per-question outputs as CSV and the aggregate metrics as JSON
unique_df.to_csv("results/rag_full_outputs.csv", index=False)
with open("results/rag_full_metrics.json", "w") as f:
    f.write(json.dumps(final_metrics, indent=2))

rule = "=" * 80
print("\n" + rule)
print("FINAL METRICS")
print(rule)
for metric, value in final_metrics.items():
    print(f"{metric}: {value:.4f}")

print("\n‚úì Saved to results/")

## 9️⃣ Extract Required Passages (提取特定段落)

In [None]:
# Keep only the rows whose passage_id is one of the requested passages
required_passages = [16771, 12220, 29568]

_is_required = unique_df['passage_id'].isin(required_passages)
subset_df = unique_df.loc[_is_required].copy()
subset_df.to_csv("results/rag_required_passages.csv", index=False)

print("Required passages extracted:")
subset_df[['passage_id', 'question', 'rag_summary', 'FK_grade']]

## 🎯 Summary

### Generated Files:
1. `results/ablation_study.csv` - Comparison of different retrievers and k values
2. `results/rag_full_outputs.csv` - All predictions with individual FK grades
3. `results/rag_full_metrics.json` - Overall metrics (ROUGE, BERTScore, Avg FK)
4. `results/rag_required_passages.csv` - Specific passages [16771, 12220, 29568]

### Metrics in JSON:
- ROUGE-1
- ROUGE-2
- ROUGE-L
- BERTScore_F1
- Avg_FK_Grade ← **Average across all samples**

### Individual FK Grades:
- Stored in CSV files (column: `FK_grade`)