# Visual RAG Pipeline (Colab Version)

**Project:** Visual RAG for German Portfolio Evaluation

## Overview
This notebook implements a **Visual Retrieval-Augmented Generation (RAG)** pipeline for evaluating German student portfolios. This system treats PDF pages as **images** to preserve spatial context (charts, tables, layouts).

## Architecture
```
PDF Document → ColPali (Visual Retrieval) → Top-K Pages → Llama Vision (Groq API) → Answer
```

## Components
| Component | Model | Purpose |
|-----------|-------|---------|
| Retriever | vidore/colpali-v1.3 | Visual document retrieval with MaxSim scoring |
| Generator | Llama 4 Scout (Groq) | Multimodal answer generation via API |

## Methodology
1. **Ingest:** Convert PDF pages into screenshots
2. **Index:** Create visual embeddings using ColPali
3. **Retrieve:** Find the most relevant page images
4. **Generate:** Pass page images + query to Llama Vision via Groq API

> **Note:** For local/cluster deployment with Qwen2-VL, see `run_visual_rag.py`

In [None]:
# @title ⚙️ Step 1: Install Dependencies
import subprocess
import sys
import os

def install_packages():
    """Install the system and Python dependencies used by this notebook.

    Runs ``apt-get install poppler-utils`` first, then pip-installs each
    Python package one at a time with the current interpreter
    (``sys.executable``) so packages land in the running kernel's environment.

    Raises:
        subprocess.CalledProcessError: If apt-get or any pip invocation exits
            with a non-zero status.
    """
    packages = [
        "byaldi==0.0.5",
        "pdf2image",
        "openai",
        # The pipeline imports `from groq import Groq`; without this entry a
        # fresh runtime fails on that import even after a successful setup.
        "groq",
        "overrides",
        "ipython",
    ]

    subprocess.run(["sudo", "apt-get", "install", "-y", "poppler-utils"], check=True)

    for package in packages:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-qU", package])

    print("‚úÖ Setup complete. Please RESTART SESSION if this is the first run.")


# Only run the (slow) installation when a required package is missing.
# Both third-party packages imported by the pipeline are probed, so a runtime
# that has byaldi but lacks groq still triggers the install.
try:
    import byaldi
    import groq
except ImportError:
    install_packages()

In [None]:
# @title 🧠 Step 2: Define Visual RAG System
import base64
import requests
import os
from typing import List, Any
from byaldi import RAGMultiModalModel
from groq import Groq
from IPython.display import Image, display
from google.colab import userdata

# Configuration
# Checkpoint name passed to RAGMultiModalModel.from_pretrained() for retrieval.
RETRIEVER_MODEL = "vidore/colpali-v1.3"
# Model identifier sent as `model=` in the Groq chat-completion request.
LLM_MODEL = "meta-llama/llama-4-scout-17b-16e-instruct"
# Name under which the index is created and reloaded; ingest_pdf() checks for
# an existing copy on disk at ".byaldi/<INDEX_NAME>".
INDEX_NAME = "visual_doc_index"

class MultimodalRAG:
    """
    Implements a Visual RAG pipeline: PDF -> Page Images -> Visual Embeddings -> Retrieval -> VLM Answer.

    Typical call order: ``authenticate()`` -> ``ingest_pdf()`` -> ``search()``
    -> ``generate_answer()``. The retriever model is loaded lazily on the
    first call that needs it.
    """

    # Seconds to wait on the PDF download before giving up instead of hanging.
    _DOWNLOAD_TIMEOUT_S = 60
    # Leading bytes of every PNG file; used to label the image sent to the LLM.
    _PNG_SIGNATURE = b"\x89PNG\r\n\x1a\n"

    def __init__(self):
        self.rag_engine = None      # retriever, created by _load_retriever()
        self.groq_client = None     # Groq client, set by authenticate()
        self.index_loaded = False   # True once an index was built or loaded

    def _load_retriever(self):
        """Lazy loads the ColPali model to save resources until needed."""
        if self.rag_engine is None:
            print(f"üîÑ Loading Retriever ({RETRIEVER_MODEL})...")
            self.rag_engine = RAGMultiModalModel.from_pretrained(RETRIEVER_MODEL)

    @staticmethod
    def _is_remote_url(source: str) -> bool:
        """Return True when *source* is an http(s) URL rather than a file path."""
        return source.lower().startswith(("http://", "https://"))

    @staticmethod
    def _image_mime_type(image_bytes: bytes) -> str:
        """Return the MIME type for *image_bytes*: PNG if signed as such, else JPEG."""
        if image_bytes.startswith(MultimodalRAG._PNG_SIGNATURE):
            return "image/png"
        return "image/jpeg"

    def authenticate(self):
        """Retrieves 'GROQ_API_KEY' from Colab secrets.

        Failures are reported on stdout rather than raised; in that case
        ``groq_client`` stays ``None`` and ``generate_answer()`` will refuse
        to run.
        """
        try:
            key = userdata.get('GROQ_API_KEY')
            if not key or not key.startswith("gsk_"):
                raise ValueError("Invalid Key format")
            self.groq_client = Groq(api_key=key)
            print("‚úÖ Authenticated.")
        except Exception as e:
            print(f"‚ùå Authentication Failed: {e}")

    def ingest_pdf(self, pdf_url: str, force_reindex: bool = False):
        """Creates visual embeddings for all pages of a PDF.

        Args:
            pdf_url: Either an http(s) URL, which is downloaded to
                ``input.pdf``, or a path to a PDF already on local disk.
            force_reindex: When False and an index directory already exists,
                that index is reused and ``pdf_url`` is not read at all. Pass
                True after switching to a different document.

        Raises:
            FileNotFoundError: ``pdf_url`` is not a URL and no such file exists.
            requests.HTTPError: The download returned an error status.
        """
        self._load_retriever()

        index_path = f".byaldi/{INDEX_NAME}"
        if os.path.exists(index_path) and not force_reindex:
            try:
                self.rag_engine.load_index(INDEX_NAME)
                self.index_loaded = True
                print(f"‚úÖ Loaded existing index: {INDEX_NAME}")
                return
            except Exception:
                # Best effort: fall through and rebuild the index from the PDF.
                print("‚ö†Ô∏è Index corrupted, re-indexing...")

        if self._is_remote_url(pdf_url):
            print(f"‚¨áÔ∏è Downloading PDF...")
            response = requests.get(pdf_url, timeout=self._DOWNLOAD_TIMEOUT_S)
            # Without this check an HTML error page would be saved as the PDF
            # and only fail later, obscurely, during indexing.
            response.raise_for_status()
            input_path = "input.pdf"
            with open(input_path, "wb") as f:
                f.write(response.content)
        else:
            # Local files are indexed in place; requests cannot open bare paths.
            if not os.path.isfile(pdf_url):
                raise FileNotFoundError(f"PDF not found: {pdf_url}")
            input_path = pdf_url
            print(f"Using local PDF: {pdf_url}")

        print("üëÄ Indexing document...")
        self.rag_engine.index(
            input_path=input_path,
            index_name=INDEX_NAME,
            store_collection_with_index=True,
            overwrite=True
        )
        self.index_loaded = True
        print("‚úÖ Indexing complete.")

    def search(self, query: str, k: int = 1) -> List[Any]:
        """Retrieves top-k page results based on visual similarity.

        Raises:
            RuntimeError: No index is loaded and none could be loaded from
                disk; the underlying error is attached as ``__cause__``.
        """
        self._load_retriever()
        if not self.index_loaded:
            # Attempt recovery if index exists but wasn't explicitly loaded
            try:
                self.rag_engine.load_index(INDEX_NAME)
            except Exception as err:
                # `Exception`, not a bare except, so Ctrl-C still interrupts.
                raise RuntimeError("No index found. Run ingest_pdf() first.") from err
            self.index_loaded = True

        return self.rag_engine.search(query, k=k)

    def generate_answer(self, query: str, result: Any) -> str:
        """Generates an answer using Llama Vision based on the retrieved page image.

        Args:
            query: The user's question.
            result: One retrieval result; its ``base64`` attribute must hold
                the base64-encoded page image and ``page_num`` the page number.

        Returns:
            The model's answer, or an error string if the API call fails.

        Raises:
            RuntimeError: ``authenticate()`` has not succeeded yet.
        """
        if not self.groq_client:
            raise RuntimeError("Client not authenticated.")

        image_data = base64.b64decode(result.base64)
        # Label the data URL from the actual bytes instead of assuming JPEG.
        mime_type = self._image_mime_type(image_data)

        print(f"\nüìÑ Context Found on Page {result.page_num}:")
        display(Image(data=image_data, width=500))

        try:
            chat_completion = self.groq_client.chat.completions.create(
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": f"Context is provided in the image. Question: {query}"},
                            {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{result.base64}"}},
                        ],
                    }
                ],
                model=LLM_MODEL,
                temperature=0.1,  # Low temperature for factual consistency
            )
            return chat_completion.choices[0].message.content
        except Exception as e:
            # Deliberate: API failures are shown to the user, not raised.
            return f"‚ùå GenAI Error: {e}"

In [None]:
# @title 🚀 Step 3: Run Pipeline
# Build the pipeline object and load the Groq API key from Colab secrets.
rag_system = MultimodalRAG()
rag_system.authenticate()

# Ingest Target Document
# NOTE(review): despite the name, this is a local filesystem path, not a URL.
# Confirm the file exists in this runtime and that ingest_pdf() accepts local
# paths, or replace it with an http(s) link to the PDF.
PDF_URL = "/Users/ningning/workspace/VisualRagPipeline/doc/handbuch_portfolio.pdf"
rag_system.ingest_pdf(PDF_URL)

print("\n" + "="*50)
print("ü§ñ Visual RAG Assistant Ready. Type 'exit' to quit.")
print("="*50)

# Interactive question loop: retrieve the best-matching page, then ask the LLM.
while True:
    try:
        user_query = input("\n‚ùì Ask a question: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Ctrl-C / closed stdin ends the session cleanly instead of a traceback.
        break

    if not user_query:
        # Nothing to ask; avoid a pointless retrieval and API call.
        continue

    if user_query.lower() in ('exit', 'quit'):
        break

    results = rag_system.search(user_query, k=1)

    if results:
        answer = rag_system.generate_answer(user_query, results[0])
        print(f"\n‚ú® Answer:\n{answer}\n" + "-"*50)
    else:
        print("‚ùå No relevant information found.")