<a href="https://colab.research.google.com/github/zamanmiraz/DSandML-Notebooks/blob/main/RAG/05_reverse_hyde.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fetch the advanced-rag repository and install the packages listed in its requirements file.
!git clone https://github.com/guyernest/advanced-rag.git
# Move the notebook's working directory into the clone so the relative requirements.txt path resolves.
%cd advanced-rag
# --upgrade pulls the newest release of every listed requirement that its specifier allows.
!pip install --upgrade -r requirements.txt

In [None]:
# Pin torchvision to 0.18.0 — presumably to match the torch build in the runtime; TODO confirm.
!pip install torchvision==0.18.0
# Quietly install/upgrade the Gemini SDK (imported later as google.generativeai).
!pip install -q -U google-generativeai

In [None]:
from rich.console import Console
from rich.style import Style
import pathlib
from rich_theme_manager import Theme, ThemeManager

# Style map for the dark theme: rich style names/objects keyed by the element they colour.
_DARK_STYLES = {
    "repr.own": Style(color="#e87d3e", bold=True),          # class names
    "repr.tag_name": "dim cyan",                            # tag names
    "repr.call": "bright_yellow",                           # function calls and other symbols
    "repr.str": "bright_green",                             # string representation
    "repr.number": "bright_red",                            # numbers
    "repr.none": "dim white",                               # None
    "repr.attrib_name": Style(color="#e87d3e", bold=True),  # attribute names
    "repr.attrib_value": "bright_blue",                     # attribute values
    "default": "bright_white on black",                     # default text and background
}

# Style map for the light theme: every entry is an explicit Style object.
_LIGHT_STYLES = {
    "repr.own": Style(color="#22863a", bold=True),           # class names
    "repr.tag_name": Style(color="#00bfff", bold=True),      # tag names
    "repr.call": Style(color="#ffff00", bold=True),          # function calls and other symbols
    "repr.str": Style(color="#008080", bold=True),           # string representation
    "repr.number": Style(color="#ff6347", bold=True),        # numbers
    "repr.none": Style(color="#808080", bold=True),          # None
    "repr.attrib_name": Style(color="#ffff00", bold=True),   # attribute names
    "repr.attrib_value": Style(color="#008080", bold=True),  # attribute values
    "default": Style(color="#000000", bgcolor="#ffffff"),    # default text and background
}

# The two themes handed to the ThemeManager; only the dark one carries a tag.
THEMES = [
    Theme(
        name="dark",
        description="Dark mode theme",
        tags=["dark"],
        styles=_DARK_STYLES,
    ),
    Theme(
        name="light",
        description="Light mode theme",
        styles=_LIGHT_STYLES,
    ),
]

# Directory (relative to the current working directory) handed to the theme manager.
theme_dir = pathlib.Path("themes").expanduser()
theme_dir.mkdir(parents=True, exist_ok=True)

# Register both themes with the manager and list what it knows about.
theme_manager = ThemeManager(theme_dir=theme_dir, themes=THEMES)
theme_manager.list_themes()

# Preview the dark theme, but use the light one for the console below.
dark = theme_manager.get("dark")
theme_manager.preview_theme(dark)
light = theme_manager.get("light")

# Shared console used by the rest of the notebook for rich output.
console = Console(theme=light)

In [None]:
import warnings

# Suppress warnings
warnings.filterwarnings('ignore')

In [None]:
import google.generativeai as genai
from semantic_chunkers import StatisticalChunker
from google.colab import userdata
import logging

# Disable logs
# Passing CRITICAL to logging.disable suppresses every record at that level and below.
logging.disable(logging.CRITICAL)

# Configure API
# The key is read from Colab's userdata store instead of being written into the notebook.
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

In [None]:
from typing import List, Dict
import google.generativeai as genai

class ReverseHyde:
    """Reverse-HyDE helper: ask a Gemini model which questions a text chunk answers.

    The generated questions can be embedded and indexed in place of (or next to)
    the chunk itself, so that user questions are matched against questions.
    """

    def __init__(self, api_key: str, model_name="models/text-embedding-004", gen_model="gemini-2.0-flash"):
        """Configure the Gemini SDK and create the generative model handle.

        :param api_key: Key passed to ``genai.configure``.
        :param model_name: Embedding model id used by :meth:`get_embedding`.
        :param gen_model: Generative model id used to write the questions.
        """
        # Configure Gemini API
        genai.configure(api_key=api_key)
        self.embed_model = model_name
        self.gen_model = gen_model
        self.generator = genai.GenerativeModel(self.gen_model)

    def get_embedding(self, text: str) -> List[float]:
        """Generate embedding for a text chunk using Gemini embedding model.

        :param text: Text to embed.
        :return: The ``"embedding"`` entry of the ``genai.embed_content`` response.
        """
        response = genai.embed_content(model=self.embed_model, content=text)
        return response["embedding"]

    @staticmethod
    def _parse_numbered_questions(text: str) -> List[str]:
        """Extract the items of a numbered list ("1. ...", "2) ...") from ``text``.

        Only lines that *start* with a number followed by "." or ")" count as
        questions, so introductory or closing sentences that merely contain ". "
        are ignored. Surrounding whitespace of each question is removed.
        """
        import re  # local import keeps the class self-contained within this cell

        questions = []
        for line in text.splitlines():
            match = re.match(r"\s*\d+[.)]\s+(.*\S)", line)
            if match:
                questions.append(match.group(1))
        return questions

    def generate_reverse_hyde(self, chunk: str, n: int = 3) -> List[str]:
        """Generate N questions that the given chunk could answer.

        :param chunk: Text the questions should be answerable from.
        :param n: Number of questions requested in the prompt. The model's reply
            is not truncated or padded, so the returned list may differ in length.
        :return: The questions found in the model's numbered reply, in order;
            an empty list when the reply contains no numbered lines.
        """
        prompt = f"""
Given the following text chunk, generate {n} different questions that this chunk would be a good answer to:

Chunk:
{chunk}

Questions (enumerate the questions with 1., 2., etc.):
"""

        response = self.generator.generate_content(prompt)

        # Extract text and keep only the enumerated lines
        return self._parse_numbered_questions(response.text.strip())

    def process_chunks(self, chunks: List[str], n: int = 3) -> Dict[str, List[str]]:
        """Process multiple chunks and generate questions for each.

        :param chunks: Texts to process, one model call per element.
        :param n: Number of questions requested per chunk.
        :return: Mapping of chunk text to its generated questions. Because the
            chunk text is the key, identical chunks share one entry (last call wins).
        """
        return {chunk: self.generate_reverse_hyde(chunk, n) for chunk in chunks}


In [None]:
import os
# Usage example
from google.colab import userdata
# Same Colab-secret lookup as in the configuration cell; the key is passed to ReverseHyde explicitly.
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
reverse_hyde = ReverseHyde(GOOGLE_API_KEY)

# Demo corpus: three short passages on unrelated topics (cell biology, Python, the American Civil War).
chunks = [
    "A mitochondrion (pl. mitochondria) is an organelle found in the cells of most eukaryotes, such as animals, plants and fungi. Mitochondria have a double membrane structure and use aerobic respiration to generate adenosine triphosphate (ATP), which is used throughout the cell as a source of chemical energy. They were discovered by Albert von Kölliker in 1857 in the voluntary muscles of insects. Meaning a thread-like granule, the term mitochondrion was coined by Carl Benda in 1898. The mitochondrion is popularly nicknamed the \"powerhouse of the cell\", a phrase popularized by Philip Siekevitz in a 1957 Scientific American article of the same name.",
    "Python is a high-level, general-purpose programming language. Its design philosophy emphasizes code readability with the use of significant indentation. Python is dynamically typed and garbage-collected. It supports multiple programming paradigms, including structured (particularly procedural), object-oriented and functional programming. It is often described as a \"batteries included\" language due to its comprehensive standard library.",
    "The American Civil War (from April 12, 1861 to May 26, 1865) was a civil war in the United States between the Union (\"the North\") and the Confederacy (\"the South\"), which was formed in 1861 by states that had seceded from the Union. The central conflict leading to war was a dispute over whether slavery should be permitted to expand into the western territories, leading to more slave states, or be prohibited from doing so, which many believed would place slavery on a course of ultimate extinction."
]

# Request 5 questions per chunk; the result maps each chunk's text to its list of generated questions.
processed_chunks = reverse_hyde.process_chunks(chunks, n=5)

In [None]:
# Pretty-print the chunk -> generated-questions mapping on the themed rich console.
console.print(processed_chunks)

In [None]:
# Test question that is searched against both the reverse-HyDE and the documents-only collections below.
query = "What generates energy in a cell?"

In [None]:
from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer

# create the vector database client
# Module-level client shared by every collection created and searched in the cells below.
qdrant = QdrantClient(":memory:") # Create in-memory Qdrant instance

# Create the embedding encoder
# NOTE(review): all vectors in this notebook come from this sentence-transformers model;
# ReverseHyde.get_embedding (Gemini embeddings) is not called anywhere in the visible cells.
encoder = SentenceTransformer('all-MiniLM-L6-v2') # Model to create embeddings

In [None]:
# Collection holding one vector per generated question (reverse HyDE).
hyde_collection_name = "reverse_hyde"

# `recreate_collection` is deprecated in qdrant-client. Dropping any previous
# copy and creating the collection again is the equivalent explicit form and
# keeps this cell safe to re-run.
qdrant.delete_collection(collection_name=hyde_collection_name)
qdrant.create_collection(
    collection_name=hyde_collection_name,
    vectors_config=models.VectorParams(
        size=encoder.get_sentence_embedding_dimension(),  # Vector size is defined by used model
        distance=models.Distance.COSINE,
    ),
)

In [None]:
import uuid
# Index every generated question: the vector is the question's embedding, the
# payload carries the source chunk and its position, and the point id is a
# deterministic UUID derived from "<chunk index>-<question index>".
qdrant.upload_points(
    collection_name=hyde_collection_name,
    points=[
        models.PointStruct(
            id=uuid.uuid5(uuid.NAMESPACE_URL, f"{doc_no}-{question_no}").hex,
            vector=encoder.encode(question_text).tolist(),
            payload={"document": chunk_text, "doc_id": doc_no},
        )
        for doc_no, (chunk_text, chunk_questions) in enumerate(processed_chunks.items())
        for question_no, question_text in enumerate(chunk_questions)
    ],
)

In [None]:
# Display what Qdrant reports about the reverse-HyDE collection.
console.print(qdrant.get_collection(collection_name=hyde_collection_name))

In [None]:
from rich.panel import Panel
from rich.table import Table

def search_collection(collection_name: str, query: str, limit: int = 1):
    """
    Search a Qdrant collection for the chunks closest to ``query`` and print them.

    The query is embedded with the module-level ``encoder`` and looked up through
    the module-level ``qdrant`` client. The hits are rendered as a rich table
    (query, chunk preview, similarity score) inside a panel on ``console``.

    :param collection_name: The name of the collection to search.
    :param query: The query to search for.
    :param limit: The maximum number of results to return. Default is 1.
    :return: None; the result is only printed.
    """
    preview_chars = 80  # longest chunk excerpt shown in the table
    query_vector = encoder.encode(query).tolist()

    # `QdrantClient.search` is deprecated in favour of `query_points`; prefer the
    # new API and fall back only for client versions that predate it.
    if hasattr(qdrant, "query_points"):
        hits = qdrant.query_points(
            collection_name=collection_name,
            query=query_vector,
            limit=limit,
        ).points
    else:
        hits = qdrant.search(
            collection_name=collection_name,
            query_vector=query_vector,
            limit=limit,
        )

    # Create a table for both query and best match
    table = Table(show_header=True, header_style="bold yellow")
    table.add_column("Query", style="bright_cyan", width=30)
    table.add_column("Best Matching Chunk", style="bright_yellow", width=50)
    table.add_column("Score", style="bright_green")
    for hit in hits:
        document = hit.payload['document']
        # Only mark the excerpt with an ellipsis when text was actually cut off.
        if len(document) > preview_chars:
            preview = f"{document[:preview_chars]}..."
        else:
            preview = document
        table.add_row(query, preview, "{:.4f}".format(hit.score))

    # Create a panel for the table
    panel = Panel(
        table,
        title=f"[bold]Query and Best Match in {collection_name}",
        border_style="white",
        expand=False
    )

    # Print the panel
    console.print(panel)

In [None]:
# Look the test query up among the generated questions.
search_collection(collection_name=hyde_collection_name, query=query)


In [None]:
# Baseline collection that stores the raw document chunks only (no generated questions).
docs_collection_name = "documents_only"

# `recreate_collection` is deprecated in qdrant-client. Dropping any previous
# copy and creating the collection again is the equivalent explicit form and
# keeps this cell safe to re-run.
qdrant.delete_collection(collection_name=docs_collection_name)
qdrant.create_collection(
    collection_name=docs_collection_name,
    vectors_config=models.VectorParams(
        size=encoder.get_sentence_embedding_dimension(),  # Vector size is defined by used model
        distance=models.Distance.COSINE,
    ),
)

In [None]:
# Index each chunk by the embedding of its own text; the chunk's position in
# processed_chunks serves as the point id.
qdrant.upload_points(
    collection_name=docs_collection_name,
    points=[
        models.PointStruct(
            id=chunk_no,
            vector=encoder.encode(chunk_text).tolist(),
            payload={"document": chunk_text},
        )
        for chunk_no, chunk_text in enumerate(processed_chunks)
    ],
)

In [None]:

# Run the same query against the documents-only collection for comparison.
search_collection(collection_name=docs_collection_name, query=query)