In [1]:
#@title 🚀 Install dependencies
!pip -q install faiss-cpu sentence-transformers pypdf beautifulsoup4 lxml requests gradio pillow reportlab
!pip -q install transformers accelerate --upgrade
# Optional clients; installed but not required unless you use them
!pip -q install groq openai


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.5/310.5 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.4/131.4 kB[0m [31m893.4 kB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
#@title 🔧 Imports & helpers
import os, io, json, math, time, textwrap, shutil, string, uuid
from dataclasses import dataclass
from typing import List, Dict, Tuple
from pathlib import Path

import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from pypdf import PdfReader
import requests
from bs4 import BeautifulSoup

from PIL import Image, ImageDraw, ImageFont
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
from reportlab.lib.utils import ImageReader

import gradio as gr

# Hugging Face Transformers (for local, small model)
from transformers import pipeline

WORKDIR = Path("/content/rag_colab")
WORKDIR.mkdir(parents=True, exist_ok=True)
ASSETS = WORKDIR / "assets"
ASSETS.mkdir(exist_ok=True)

def slug(s, n=50):
    safe = "".join(ch for ch in s.lower() if ch.isalnum() or ch in "-_ ")
    safe = "-".join(safe.split())
    return safe[:n] or str(uuid.uuid4())

def chunk_text(text:str, max_tokens:int=220, overlap:int=40) -> List[str]:
    """
    Simple word-based chunker approximating token sizes.
    """
    words = text.split()
    chunks = []
    i = 0
    while i < len(words):
        chunk = words[i: i + max_tokens]
        chunks.append(" ".join(chunk))
        i += max_tokens - overlap
        if i <= 0: break
    return chunks

def read_pdf_text(pdf_path: str) -> str:
    reader = PdfReader(pdf_path)
    pages = []
    for p in reader.pages:
        try:
            pages.append(p.extract_text() or "")
        except:
            pages.append("")
    return "\n".join(pages)

def scrape_url(url: str) -> str:
    r = requests.get(url, timeout=15)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "lxml")
    # Remove script/style
    for tag in soup(["script","style","noscript"]): tag.extract()
    text = soup.get_text(separator=" ")
    text = " ".join(text.split())
    return text

def normalize(vecs: np.ndarray) -> np.ndarray:
    norms = np.linalg.norm(vecs, axis=1, keepdims=True) + 1e-12
    return vecs / norms

def wrap_for_image(s, width=80):
    return "\n".join(textwrap.wrap(s, width=width, replace_whitespace=False))

def text_to_image(text:str, out_path:Path, title:str=None, width=1200, padding=30):
    """
    Render long text to a PNG for use as 'screenshot' in the report.
    """
    font = ImageFont.load_default()
    lines = (title + "\n\n" if title else "") + wrap_for_image(text, width=110)
    dummy = Image.new("RGB", (width, 100), "white")
    draw = ImageDraw.Draw(dummy)
    w, h = draw.multiline_textsize(lines, font=font)
    img = Image.new("RGB", (width, h + padding*2), "white")
    draw = ImageDraw.Draw(img)
    draw.multiline_text((padding, padding), lines, fill="black", font=font)
    img.save(out_path)


In [3]:
#@title ⚙️ Configuration
# Choose one: "local_flan_t5" (no API needed), "groq", "openai", "hf_inference"
LLM_BACKEND = "groq"  #@param ["local_flan_t5", "groq", "openai", "hf_inference"]

# API keys (only needed if using groq/openai/hf_inference)
GROQ_API_KEY = "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"  #@param {type:"string"}
OPENAI_API_KEY = ""  #@param {type:"string"}
HF_INFERENCE_TOKEN = ""  #@param {type:"string"}

# Models
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2" # small, fast, free
# Local lightweight model (works on CPU in Colab; concise answers)
LOCAL_GENERATION_MODEL = "google/flan-t5-base"  # small, reliable demo
# Groq model example (fast, free tier exists, needs key)
GROQ_MODEL = "llama-3.1-8b-instant"
# OpenAI example
OPENAI_MODEL = "gpt-4o-mini"
# HF Inference example
HF_INFERENCE_MODEL = "meta-llama/Meta-Llama-3.1-8B-Instruct"

TOP_K = 4  # retrieval size
MAX_CHUNK_TOKENS = 220
CHUNK_OVERLAP = 40

# Output paths
DB_META_JSON = WORKDIR / "db_meta.json"
FAISS_INDEX_PATH = str(WORKDIR / "faiss_index.index")
REPORT_PDF = str(WORKDIR / "RAG_Project_Report.pdf")

print(f"LLM backend: {LLM_BACKEND}")
print(f"Workdir: {WORKDIR}")


LLM backend: local_flan_t5
Workdir: /content/rag_colab


In [None]:
!pip install python-docx

from docx import Document
from docx.shared import Inches
from pathlib import Path

# Save path
REPORT_DOCX = "/content/RAG_Project_Report.docx"

# Create document
doc = Document()

# ---------------- Cover Page ----------------
doc.add_heading("Retrieval-Augmented Generation (RAG) Project", 0)
doc.add_paragraph("Final Project Report\n\n")
doc.add_paragraph("Prepared by: Your Name\nSubmission Date: 20th August 2025")
doc.add_page_break()

# ---------------- Tools & Libraries ----------------
doc.add_heading("Tools & Libraries", level=1)
doc.add_paragraph("""
- LLMs: Hugging Face Transformers (Flan-T5), Groq API, OpenAI API, Hugging Face Inference API
- Vector Database: FAISS
- Embeddings: Sentence-Transformers (all-MiniLM-L6-v2)
- Document Handling: pypdf (for PDFs), BeautifulSoup + requests (for scraping)
- UI: Gradio
- Reporting: Pillow, ReportLab, python-docx
""")

# ---------------- Step-by-Step Process ----------------
doc.add_heading("Step-by-Step Process", level=1)
doc.add_paragraph("""
1. Data Loading:
   Loaded data from custom text, PDFs, and optional scraped content.
2. Chunking:
   Split text into ~220-token chunks with overlap.
3. Embeddings:
   Generated embeddings using Sentence-Transformers.
4. Vector Database:
   Stored embeddings inside FAISS for efficient similarity search.
5. Retrieval:
   Queried Top-K most relevant chunks using cosine similarity.
6. Prompt Construction:
   Built augmented prompt with retrieved chunks.
7. Generation:
   Passed prompt to chosen LLM backend (Flan-T5, Groq, OpenAI, or HF API).
8. UI:
   Provided Gradio interface for user interaction.
9. Report:
   Auto-generated project report with explanations and screenshots.
""")

# ---------------- Screenshots ----------------
assets = Path("/content/rag_colab/assets")
for name, title in [
    ("db_creation.png", "Database Creation & Insertion"),
    ("retrieval.png", "Retrieval Results"),
    ("final_answer.png", "Final Answer"),
]:
    img_path = assets / name
    if img_path.exists():
        doc.add_heading(title, level=2)
        doc.add_picture(str(img_path), width=Inches(5.5))
        doc.add_paragraph(f"Figure: {title}")

# Save file
doc.save(REPORT_DOCX)
print("✅ Word Report generated at:", REPORT_DOCX)


In [4]:
#@title 📥 Load your data (edit as you like)

# Option A: Raw text samples (you can paste your own here)
raw_texts = [
    """Taaza Paidawar Markaz is a fictional e-commerce idea focused on fresh produce delivery across Pakistan.
    It aims to connect farmers directly to consumers with transparent pricing and quality guarantees.""",
    """RAG (Retrieval Augmented Generation) improves LLM answers by retrieving relevant context from a vector database
    and passing it as prompt context to the generator model.""",
]

# Option B: Upload PDFs via Colab file picker (uncomment to use)
# from google.colab import files
# uploaded = files.upload()
# for fn in uploaded:
#     pdf_text = read_pdf_text(fn)
#     raw_texts.append(pdf_text)

# Option C: Scrape a URL (keep it small & public; or add multiple)
# try:
#     raw_texts.append(scrape_url("https://www.python.org/about/"))
# except Exception as e:
#     print("Scrape failed (skipping):", e)

print(f"Loaded {len(raw_texts)} source documents.")


Loaded 2 source documents.


In [5]:
#@title 🔎 Retrieval function (top-K by cosine similarity)
# Reload (useful if you come back later in the notebook)
index = faiss.read_index(FAISS_INDEX_PATH)
with open(DB_META_JSON) as f:
    meta = json.load(f)
docs = meta["docs"]

def retrieve(query: str, top_k=TOP_K) -> List[Dict]:
    q_emb = embedder.encode([query], convert_to_numpy=True, normalize_embeddings=True)
    scores, idxs = index.search(q_emb, top_k)
    results = []
    for score, idx in zip(scores[0], idxs[0]):
        d = docs[idx].copy()
        d["score"] = float(score)
        results.append(d)
    return results


RuntimeError: Error in faiss::FileIOReader::FileIOReader(const char*) at /project/third-party/faiss/faiss/impl/io.cpp:69: Error: 'f' failed: could not open /content/rag_colab/faiss_index.index for reading: No such file or directory

In [None]:
#@title 🤖 LLM wrappers
from typing import Optional

# Local pipeline (no key needed)
_local_pipe = None
def local_generate(prompt: str, max_new_tokens=256) -> str:
    global _local_pipe
    if _local_pipe is None:
        _local_pipe = pipeline("text2text-generation", model=LOCAL_GENERATION_MODEL, device_map="auto")
    out = _local_pipe(prompt, max_new_tokens=max_new_tokens, do_sample=False)[0]["generated_text"]
    return out.strip()

# Groq
def groq_generate(prompt: str, max_tokens=400) -> str:
    if not os.getenv("GROQ_API_KEY") and not GROQ_API_KEY:
        raise RuntimeError("Set GROQ_API_KEY to use Groq.")
    from groq import Groq
    client = Groq(api_key=GROQ_API_KEY or os.getenv("GROQ_API_KEY"))
    resp = client.chat.completions.create(
        model=GROQ_MODEL,
        messages=[{"role":"system","content":"You are a helpful assistant."},
                  {"role":"user","content":prompt}],
        temperature=0.2,
        max_tokens=max_tokens,
    )
    return resp.choices[0].message.content.strip()

# OpenAI
def openai_generate(prompt: str, max_tokens=400) -> str:
    if not os.getenv("OPENAI_API_KEY") and not OPENAI_API_KEY:
        raise RuntimeError("Set OPENAI_API_KEY to use OpenAI.")
    from openai import OpenAI
    client = OpenAI(api_key=OPENAI_API_KEY or os.getenv("OPENAI_API_KEY"))
    resp = client.chat.completions.create(
        model=OPENAI_MODEL,
        messages=[{"role":"system","content":"You are a helpful assistant."},
                  {"role":"user","content":prompt}],
        temperature=0.2,
        max_tokens=max_tokens,
    )
    return resp.choices[0].message.content.strip()

# Hugging Face Inference API
def hf_inference_generate(prompt: str, max_new_tokens=400) -> str:
    if not os.getenv("HF_INFERENCE_TOKEN") and not HF_INFERENCE_TOKEN:
        raise RuntimeError("Set HF_INFERENCE_TOKEN to use HF Inference API.")
    url = f"https://api-inference.huggingface.co/models/{HF_INFERENCE_MODEL}"
    headers = {"Authorization": f"Bearer {HF_INFERENCE_TOKEN or os.getenv('HF_INFERENCE_TOKEN')}"}
    payload = {
        "inputs": prompt,
        "parameters": {"max_new_tokens": max_new_tokens, "temperature": 0.2, "return_full_text": False}
    }
    r = requests.post(url, headers=headers, json=payload, timeout=60)
    r.raise_for_status()
    data = r.json()
    # Response format can vary slightly; handle common cases
    if isinstance(data, list) and data and "generated_text" in data[0]:
        return data[0]["generated_text"].strip()
    if isinstance(data, dict) and "generated_text" in data:
        return data["generated_text"].strip()
    # Fallback
    return str(data)


In [None]:
#@title 🧩 RAG: Retrieve → Augment Prompt → Generate
SYSTEM_INSTRUCTIONS = """You answer strictly based on the provided context.
If the answer isn't in the context, say you don't know. Be concise and accurate.
"""

def build_prompt(question:str, contexts:List[Dict]) -> str:
    context_block = "\n\n---\n".join(
        [f"[Chunk {i+1} | score={c['score']:.3f} | {c['source']}] {c['text']}" for i,c in enumerate(contexts)]
    )
    prompt = f"""{SYSTEM_INSTRUCTIONS}

Context:
{context_block}

User question: {question}

Answer:"""
    return prompt

def generate_answer(question:str) -> Tuple[str, List[Dict], str]:
    rets = retrieve(question, top_k=TOP_K)
    prompt = build_prompt(question, rets)

    if LLM_BACKEND == "local_flan_t5":
        answer = local_generate(prompt)
    elif LLM_BACKEND == "groq":
        answer = groq_generate(prompt)
    elif LLM_BACKEND == "openai":
        answer = openai_generate(prompt)
    elif LLM_BACKEND == "hf_inference":
        answer = hf_inference_generate(prompt)
    else:
        raise ValueError("Unknown LLM backend")

    return answer, rets, prompt


In [None]:
#@title 🧪 Test once and capture artifacts for the report
test_question = "What is Retrieval Augmented Generation and how does it help here?"
answer, rets, prompt = generate_answer(test_question)

print("QUESTION:\n", test_question, "\n")
print("RETRIEVAL RESULTS (top-k):")
for r in rets:
    print(f"- id={r['id']}, score={r['score']:.3f}, source={r['source']}")
print("\nANSWER:\n", answer)

# --- Create 'screenshots' as PNGs ---
# 1) DB creation summary
db_summary = f"FAISS index created with {len(docs)} chunks using {EMBEDDING_MODEL}.\nIndex path: {FAISS_INDEX_PATH}\nMetadata path: {DB_META_JSON}"
png_db = ASSETS / "db_creation.png"
text_to_image(db_summary, png_db, title="Database Creation & Insertion")

# 2) Retrieval results
retrieval_text = "Top-K Retrieval Results:\n\n" + "\n\n".join(
    [f"[{i+1}] id={r['id']} score={r['score']:.3f} source={r['source']}\n{wrap_for_image(r['text'], 100)}"
     for i, r in enumerate(rets)]
)
png_retrieval = ASSETS / "retrieval.png"
text_to_image(retrieval_text, png_retrieval, title="Retrieval Results")

# 3) Final answer
png_answer = ASSETS / "final_answer.png"
text_to_image(f"Question: {test_question}\n\nAnswer:\n{answer}", png_answer, title="Final Answer")

print("\nSaved images for report:")
print(png_db)
print(png_retrieval)
print(png_answer)


In [None]:
#@title 🖥️ Gradio app: Ask questions over your data
def ui_answer(q):
    if not q or not q.strip():
        return "", ""
    ans, rets, _ = generate_answer(q)
    retrieved_str = "\n\n".join(
        [f"[{i+1}] score={r['score']:.3f} ({r['source']})\n{r['text']}" for i, r in enumerate(rets)]
    )
    return ans, retrieved_str

with gr.Blocks() as demo:
    gr.Markdown("# 🔎 RAG Demo (FAISS + Sentence-Transformers)")
    gr.Markdown("Ask questions based on your loaded data (text/PDF/URL).")
    with gr.Row():
        inp = gr.Textbox(label="Your question", placeholder="Ask something...")
    with gr.Row():
        out = gr.Textbox(label="Generated answer")
        ctx = gr.Textbox(label="Retrieved chunks (for transparency)")
    btn = gr.Button("Ask")
    btn.click(fn=ui_answer, inputs=inp, outputs=[out, ctx])

demo.launch(debug=False, share=False)


In [None]:
#@title 📄 Build a PDF report (with screenshots)
tools_text = f"""
Tools, Libraries & APIs Used
- Embeddings: {EMBEDDING_MODEL} (Sentence-Transformers)
- Vector DB: FAISS (cosine similarity via inner product on normalized vectors)
- LLM Backend: {LLM_BACKEND}
  - local_flan_t5 -> Transformers pipeline: {LOCAL_GENERATION_MODEL}
  - groq -> {GROQ_MODEL}
  - openai -> {OPENAI_MODEL}
  - hf_inference -> {HF_INFERENCE_MODEL}
- Parsing: pypdf for PDFs, BeautifulSoup+requests for scraping
- UI: Gradio
- Reporting: Pillow + ReportLab
"""

steps_text = """
Step-by-Step Process
1) Data Loading: Accept raw text, PDF(s) (via upload), or small scraped pages (public URL).
2) Chunking: Simple word-based chunking (~220 tokens with 40 overlap) to keep semantic units.
3) Embeddings: Encode chunks with sentence-transformers 'all-MiniLM-L6-v2' (fast, free).
4) Vector DB: Store embeddings in FAISS (IndexFlatIP) with normalized vectors ⇒ cosine similarity.
5) Retrieval: For each query, embed the question and fetch Top-K similar chunks.
6) Prompt Construction: Build a context block with retrieved chunks and a strict instruction.
7) Generation: Send the prompt to the selected LLM backend; default local FLAN-T5 for no-API demo.
8) UI: Gradio app to accept user questions, show retrieved chunks, and display the final answer.
9) Report: Save images of DB creation summary, retrieval results, and final answer; compile to PDF.
"""

# Make PNG from steps & tools for inclusion
png_tools = ASSETS / "tools.png"
text_to_image(tools_text, png_tools, title="Tools, Libraries & APIs")
png_steps = ASSETS / "steps.png"
text_to_image(steps_text, png_steps, title="Step-by-Step Explanation")

# Create PDF
c = canvas.Canvas(REPORT_PDF, pagesize=A4)
W, H = A4

def add_img(path, title=None, y_offset=60):
    c.setFont("Helvetica-Bold", 14)
    if title:
        c.drawString(40, H-40, title)
    img = ImageReader(str(path))
    # Fit image to page width with margins
    iw, ih = Image.open(path).size
    max_w = W - 80
    scale = min(max_w/iw, (H-120)/ih)
    w, h = iw*scale, ih*scale
    c.drawImage(img, 40, (H - 80 - h), width=w, height=h)
    c.showPage()

add_img(png_tools, "Tools & Libraries")
add_img(png_steps, "Process")
add_img(ASSETS / "db_creation.png", "Database Creation & Insertion")
add_img(ASSETS / "retrieval.png", "Retrieval Results")
add_img(ASSETS / "final_answer.png", "Final Answer")

c.save()
print("Report saved ->", REPORT_PDF)


In [None]:
#@title 🔁 Switch LLM backend on the fly (optional)
def set_backend(name:str):
    global LLM_BACKEND
    LLM_BACKEND = name
    print("LLM backend set to:", LLM_BACKEND)

# Example:
# set_backend("groq")         # requires GROQ_API_KEY
# set_backend("openai")       # requires OPENAI_API_KEY
# set_backend("hf_inference") # requires HF_INFERENCE_TOKEN
# set_backend("local_flan_t5")
