In [None]:
import logging
import sys

# Notebook-wide logger: INFO and above are written to stdout by one dedicated handler.
logger = logging.getLogger(__name__)

# Remove handlers attached by earlier runs of this cell and stop records from
# also reaching the root logger, so each message is emitted exactly once.
logger.propagate = False
logger.handlers.clear()
logger.setLevel(logging.INFO)

formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(formatter)
handler.setLevel(logging.INFO)
logger.addHandler(handler)

# Smoke test: the line below should show up once in the cell output.
logger.info("Logger is working!")

2025-10-17 22:24:35,064 - INFO - Logger is working!


In [None]:
!pip install transformers==4.48.0

Collecting transformers==4.48.0
  Downloading transformers-4.48.0-py3-none-any.whl.metadata (44 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.22,>=0.21 (from transformers==4.48.0)
  Downloading tokenizers-0.21.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.48.0-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m51.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.21.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m122.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    F

In [None]:
!pip install chromadb pymupdf
!pip install flash-attn --no-build-isolatio # Check

Collecting chromadb
  Downloading chromadb-1.1.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting pymupdf
  Downloading pymupdf-1.26.5-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.23.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.38.0-py3-none-any.whl.metadata (2.4 kB)
Collecting pypika>=0.48.9 (from chromadb)
  Downloading PyPika-0.48.9.tar.gz (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32

### **Fetch PDF as bytes**

Download and save the PDF locally to verify proper formatting.

In [None]:
pdf_url = "https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf"

In [None]:
import requests
from typing import Optional

In [None]:
def fetch_pdf_from_url(url: str, save_path: Optional[str] = None, timeout: float = 30.0) -> bytes:
    """
    Fetch a PDF from a URL and optionally save it locally.

    Parameters:
    -----------
    url: str
        URL of the PDF
    save_path: Optional[str]
        Optional local path to save the PDF
    timeout: float
        Seconds passed to ``requests.get`` as its timeout, so a stalled
        server cannot block the cell forever

    Returns:
    --------
    PDF content as bytes

    Raises:
    -------
    Whatever ``requests.get`` raises on connection problems or timeouts, and
    whatever ``response.raise_for_status()`` raises on an error status.
    """
    logger.info(f"Fetching PDF from: {url}")
    response = requests.get(
        url,
        # Browser-like User-Agent, kept from the original request.
        headers={"User-Agent": "Mozilla/5.0"},
        timeout=timeout,
    )
    response.raise_for_status()

    pdf_content = response.content
    logger.info(f"PDF downloaded: {len(pdf_content)} bytes")

    if save_path:
        with open(save_path, "wb") as f:
            f.write(pdf_content)
        logger.info(f"PDF saved to: {save_path}")

    return pdf_content

In [None]:
pdf_content = fetch_pdf_from_url(pdf_url, "file.pdf")

2025-10-17 22:25:43,463 - INFO - Fetching PDF from: https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf
2025-10-17 22:25:45,504 - INFO - PDF downloaded: 569417 bytes
2025-10-17 22:25:45,505 - INFO - PDF saved to: file.pdf


### **Extract text and images from PDF**

In [None]:
import io
import pymupdf
from PIL import Image

from typing import List, Tuple

In [None]:
def extract_content_from_pdf(pdf_content: bytes) -> Tuple[List[dict], List[dict]]:
    """
    Extract the page texts and embedded images from a PDF.

    Text and images are handled independently: a page whose text cannot be
    used still has its images extracted, and one unreadable image does not
    stop the remaining images from being processed.

    Parameters:
    -----------
    pdf_content: bytes
        PDF file as bytes

    Returns:
    --------
    Tuple ``(text_blocks, images)``:
      - text_blocks: dicts with keys "page" (1-based), "text", "type" ("text")
      - images: dicts with keys "page" (1-based), "image" (result of
        ``Image.open``), "type" ("image"), "index" (position in the page's
        image list)
    """
    text_blocks = []
    images = []

    pdf_document = pymupdf.open(stream=pdf_content, filetype="pdf")
    try:
        for page_index in range(len(pdf_document)):
            page_number = page_index + 1
            page = pdf_document[page_index]

            # Text: one stripped block per page, skipped when empty or not a string.
            try:
                text = page.get_text()
                if isinstance(text, str) and text.strip():
                    text_blocks.append(
                        {"page": page_number, "text": text.strip(), "type": "text"}
                    )
            except Exception as e:
                logger.warning(f"Could not process text on page {page_number}: {e}")

            # Images: each one is extracted and opened inside its own try so a
            # single bad xref only costs that image.
            for img_index, img_info in enumerate(page.get_images(full=True)):
                try:
                    xref = img_info[0]
                    image_bytes = pdf_document.extract_image(xref)["image"]
                    image = Image.open(io.BytesIO(image_bytes))
                except Exception as e:
                    logger.warning(f"Could not process image on page {page_number}: {e}")
                    continue

                images.append(
                    {
                        "page": page_number,
                        "image": image,
                        "type": "image",
                        "index": img_index,
                    }
                )
    finally:
        # Release the document even when an unexpected error escapes the loop.
        pdf_document.close()

    logger.info(f"Extracted {len(text_blocks)} text blocks and {len(images)} images.")

    return text_blocks, images

In [None]:
text_blocks, images = extract_content_from_pdf(pdf_content)

2025-10-17 22:25:45,846 - INFO - Extracted 11 text blocks and 3 images.


In [None]:
text_blocks[:2]

[{'page': 1,
  'text': 'Attention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗†\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz Kaiser∗\nGoogle Brain\nlukaszkaiser@google.com\nIllia Polosukhin∗‡\nillia.polosukhin@gmail.com\nAbstract\nThe dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks that include an encoder and a decoder. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the Transformer,\nbased solely on attention mechanisms, dispensing with recurrence and convolutions\nentirely. Experiments on two machine translation tasks show these models to\nbe superior in quality while being more parallelizable and

In [None]:
images

[{'page': 3,
  'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=1520x2239>,
  'type': 'image',
  'index': 0},
 {'page': 4,
  'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=835x1282>,
  'type': 'image',
  'index': 0},
 {'page': 4,
  'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=445x884>,
  'type': 'image',
  'index': 1}]

### **Embedding text and images**

`jina-clip-v1` is a state-of-the-art English **multimodal (text-image) embedding model**. [Read more](https://huggingface.co/jinaai/jina-clip-v1).

In [None]:
from transformers import AutoModel
import numpy as np

In [None]:
def generate_content_embeddings(model, text_blocks: List[dict], images: List[dict]):
    """
    Embed the extracted text blocks and images with the given model.

    Parameters:
    -----------
    model: object exposing ``encode_text`` and ``encode_image``
    text_blocks: List of dictionaries; the "text" entry of each is encoded
    images: List of dictionaries; the "image" entry of each is encoded

    Returns:
    --------
    Tuple ``(text_embeddings, images_embeddings)``; an element is None when
    the corresponding input list is empty or None
    """
    # Text first, then images, mirroring the order of the log output.
    if not text_blocks:
        text_embeddings = None
    else:
        logger.info(f"Encoding {len(text_blocks)} text blocks...")
        texts = []
        for entry in text_blocks:
            texts.append(entry["text"])
        text_embeddings = model.encode_text(texts)
        logger.info(f"Text embeddings shape {text_embeddings.shape}")

    if not images:
        images_embeddings = None
    else:
        logger.info(f"Encoding {len(images)} images...")
        pictures = []
        for entry in images:
            pictures.append(entry["image"])
        images_embeddings = model.encode_image(pictures)
        logger.info(f"Image embeddings shape {images_embeddings.shape}")

    return text_embeddings, images_embeddings

In [None]:
# trust_remote_code=True lets transformers run the model code shipped with the
# Hub repository (see the download notices in this cell's output).
jina_model = AutoModel.from_pretrained(
    "jinaai/jina-clip-v1",
    trust_remote_code=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

configuration_clip.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-clip-implementation:
- configuration_clip.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_clip.py: 0.00B [00:00, ?B/s]

hf_model.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-clip-implementation:
- hf_model.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


rope_embeddings.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-clip-implementation:
- rope_embeddings.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


transform.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-clip-implementation:
- transform.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


eva_model.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-clip-implementation:
- eva_model.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-clip-implementation:
- modeling_clip.py
- hf_model.py
- rope_embeddings.py
- transform.py
- eva_model.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/891M [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

configuration_bert.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-bert-flash-implementation:
- configuration_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_bert.py: 0.00B [00:00, ?B/s]

block.py: 0.00B [00:00, ?B/s]

mlp.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-bert-flash-implementation:
- mlp.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


mha.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-bert-flash-implementation:
- mha.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-bert-flash-implementation:
- block.py
- mlp.py
- mha.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


bert_padding.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-bert-flash-implementation:
- bert_padding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


embedding.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-bert-flash-implementation:
- embedding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-bert-flash-implementation:
- modeling_bert.py
- block.py
- bert_padding.py
- embedding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


In [None]:
text_embeddings, image_embeddings = generate_content_embeddings(jina_model, text_blocks, images)

2025-10-17 22:26:11,825 - INFO - Encoding 11 text blocks...


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

2025-10-17 22:26:17,815 - INFO - Text embeddings shape (11, 768)
2025-10-17 22:26:17,816 - INFO - Encoding 3 images...


preprocessor_config.json:   0%|          | 0.00/527 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


processing_clip.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-clip-implementation:
- processing_clip.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


2025-10-17 22:26:18,785 - INFO - Image embeddings shape (3, 768)


In [None]:
text_embeddings.shape

(11, 768)

In [None]:
image_embeddings.shape

(3, 768)

### **Retrieval database**

Chroma is an open-source AI application database that simplifies building LLM applications by making knowledge, facts, and skills pluggable for LLMs. [Getting started](https://docs.trychroma.com/docs/overview/getting-started).

In [None]:
import chromadb

In [None]:
# Client that owns the retrieval store used in the rest of the notebook.
chroma_client = chromadb.Client()

# Collection "rag", configured with "hnsw:space" = "cosine".
# get_or_create=True: presumably returns the existing collection when this
# cell is re-run against the same client — confirm against the Chroma docs.
collection = chroma_client.create_collection(
    name="rag",
    get_or_create=True,
    metadata={"hnsw:space": "cosine"},
)

collection

Collection(name=rag)

In [None]:
from typing import Any

import torch
import numpy as np

In [None]:
def generate_id(prefix: Any, index: Any):
    """
    Generate a deterministic ID of the form ``{prefix}_{index}_{checksum}``.

    The checksum is derived from a SHA-256 digest of ``str(index) + str(prefix)``
    reduced modulo 1,000,000. Unlike the built-in ``hash()``, whose value for
    strings changes between interpreter runs, this yields the same ID for the
    same inputs in every session.

    Parameters:
    -----------
    prefix: Any
        Leading part of the ID (converted with ``str``)
    index: Any
        Position or key of the item (converted with ``str``)

    Returns:
    --------
    The ID string
    """
    import hashlib  # local import: only this helper needs it

    digest = hashlib.sha256(f"{index}{prefix}".encode("utf-8")).hexdigest()
    return f"{prefix}_{index}_{int(digest, 16) % 1000000}"

In [None]:
generate_id("prefix", "index")

'prefix_index_362238'

In [None]:
def store_text_embeddings(collection, text_blocks, embeddings, source):
    """
    Store text embeddings in ChromaDB.

    Parameters:
    -----------
    collection: ClientAPI
        ChromaDB collection
    text_blocks: List[dict]
        List of text block dictionaries (keys "text" and "page" are read)
    embeddings: Text embeddings
        One vector per text block; torch tensors, numpy arrays and plain
        sequences are accepted
    source: str
        Source identifier (URL or path)

    Returns:
    --------
    List of stored document IDs (empty when there is nothing to store)

    Raises:
    -------
    ValueError
        If the number of embeddings differs from the number of text blocks
    """
    if embeddings is None or len(text_blocks) == 0:
        logger.info("No text embeddings to store")
        return []

    # zip() would silently truncate; a mismatch means the inputs are out of sync.
    if len(embeddings) != len(text_blocks):
        raise ValueError(
            f"Got {len(embeddings)} embeddings for {len(text_blocks)} text blocks"
        )

    logger.info(f"Storing {len(text_blocks)} text embeddings...")

    ids = []
    embedding_list = []
    documents = []
    metadatas = []

    for idx, (block, embedding) in enumerate(zip(text_blocks, embeddings)):
        ids.append(generate_id("text", idx))

        if torch.is_tensor(embedding):
            embedding_list.append(embedding.detach().cpu().numpy().tolist())
        else:
            # numpy rows, lists and tuples all end up as plain Python lists, so
            # every ID always has a matching vector.
            embedding_list.append(np.asarray(embedding).tolist())

        documents.append(block["text"])
        metadatas.append({"type": "text", "page": block["page"], "source": source})

    collection.add(
        ids=ids, embeddings=embedding_list, documents=documents, metadatas=metadatas
    )

    logger.info(f"Stored {len(ids)} text embeddings")
    return ids

In [None]:
text_ids = store_text_embeddings(collection, text_blocks, text_embeddings, pdf_url)

2025-10-17 22:26:21,144 - INFO - Storing 11 text embeddings...
2025-10-17 22:26:21,168 - INFO - Stored 11 text embeddings


In [None]:
text_ids

['text_0_314840',
 'text_1_355765',
 'text_2_222606',
 'text_3_678200',
 'text_4_911187',
 'text_5_353922',
 'text_6_33269',
 'text_7_578329',
 'text_8_853131',
 'text_9_235545',
 'text_10_734797']

In [None]:
def store_image_embeddings(collection, images, embeddings, source):
    """
    Store image embeddings in ChromaDB.

    Parameters:
    -----------
    collection: ChromaDB collection
    images: List of image dictionaries (keys "page" and "index" are read)
    embeddings: Image embeddings, one vector per image; torch tensors, numpy
        arrays and plain sequences are accepted
    source: Source identifier (URL or path)

    Returns:
    --------
    List of stored document IDs (empty when there is nothing to store)

    Raises:
    -------
    ValueError
        If the number of embeddings differs from the number of images
    """
    if embeddings is None or len(images) == 0:
        logger.info("No image embeddings to store")
        return []

    # zip() would silently truncate; a mismatch means the inputs are out of sync.
    if len(embeddings) != len(images):
        raise ValueError(
            f"Got {len(embeddings)} embeddings for {len(images)} images"
        )

    logger.info(f"Storing {len(images)} image embeddings...")

    ids = []
    embedding_list = []
    documents = []
    metadatas = []

    for idx, (img, embedding) in enumerate(zip(images, embeddings)):
        ids.append(generate_id("image", idx))

        if torch.is_tensor(embedding):
            embedding_list.append(embedding.detach().cpu().numpy().tolist())
        else:
            # numpy rows, lists and tuples all end up as plain Python lists, so
            # every ID always has a matching vector.
            embedding_list.append(np.asarray(embedding).tolist())

        # The stored document is a textual placeholder describing the image.
        documents.append(f"Image from page {img['page']}, index {img['index']}")
        metadatas.append(
            {
                "type": "image",
                "page": img["page"],
                "image_index": img["index"],
                "source": source,
            }
        )

    collection.add(
        ids=ids, embeddings=embedding_list, documents=documents, metadatas=metadatas
    )

    logger.info(f"Stored {len(ids)} image embeddings")
    return ids

In [None]:
image_ids = store_image_embeddings(collection, images, image_embeddings, pdf_url)

2025-10-17 22:26:21,185 - INFO - Storing 3 image embeddings...
2025-10-17 22:26:21,192 - INFO - Stored 3 image embeddings


In [None]:
image_ids

['image_0_79277', 'image_1_715025', 'image_2_973616']

In [None]:
collection.count()

14

### **Retrieve data from embeddings database**

Transform a natural language query into an embedding, then perform similarity search on the Chroma database to retrieve relevant content.

In [None]:
def query_with_text(model, collection, query_text, n_results=5, filter_type=None):
    """
    Query the database using text.

    Parameters:
    -----------
    model: Jina-CLIP model (its ``encode_text`` is called with ``[query_text]``)
    collection: ChromaDB collection
    query_text: Query string
    n_results: Number of results to return
    filter_type: Optional filter ('text' or 'image') applied to the "type"
        metadata field

    Returns:
    --------
    Query results dictionary as returned by ``collection.query``
    """
    logger.info(f"Querying with text: '{query_text}'")

    query_embedding = model.encode_text([query_text])[0]

    if torch.is_tensor(query_embedding):
        query_embedding_list = query_embedding.detach().cpu().numpy().tolist()
    else:
        # Covers numpy arrays as well as plain lists/tuples, so the variable is
        # always bound whatever sequence type the model returns.
        query_embedding_list = np.asarray(query_embedding).tolist()

    where_filter = {"type": filter_type} if filter_type else None

    results = collection.query(
        query_embeddings=[query_embedding_list], n_results=n_results, where=where_filter
    )

    logger.info(f"Found {len(results['ids'][0])} results")
    return results

In [None]:
query_results = query_with_text(
    model=jina_model,
    collection=collection,
    query_text="attention mechanism",
    n_results=5
)

2025-10-17 22:26:21,218 - INFO - Querying with text: 'attention mechanism'
2025-10-17 22:26:21,267 - INFO - Found 5 results


In [None]:
query_results

{'ids': [['text_3_678200',
   'text_0_314840',
   'text_4_911187',
   'text_1_355765',
   'text_10_734797']],
 'embeddings': None,
 'documents': [['Scaled Dot-Product Attention\nMulti-Head Attention\nFigure 2: (left) Scaled Dot-Product Attention. (right) Multi-Head Attention consists of several\nattention layers running in parallel.\nquery with all keys, divide each by √dk, and apply a softmax function to obtain the weights on the\nvalues.\nIn practice, we compute the attention function on a set of queries simultaneously, packed together\ninto a matrix Q. The keys and values are also packed together into matrices K and V . We compute\nthe matrix of outputs as:\nAttention(Q, K, V ) = softmax(QKT\n√dk\n)V\n(1)\nThe two most commonly used attention functions are additive attention [2], and dot-product (multi-\nplicative) attention. Dot-product attention is identical to our algorithm, except for the scaling factor\nof\n1\n√dk . Additive attention computes the compatibility function using a

In [None]:
query_results.keys()

dict_keys(['ids', 'embeddings', 'documents', 'uris', 'included', 'data', 'metadatas', 'distances'])

In [None]:
query_results['documents'][0]

['Scaled Dot-Product Attention\nMulti-Head Attention\nFigure 2: (left) Scaled Dot-Product Attention. (right) Multi-Head Attention consists of several\nattention layers running in parallel.\nquery with all keys, divide each by √dk, and apply a softmax function to obtain the weights on the\nvalues.\nIn practice, we compute the attention function on a set of queries simultaneously, packed together\ninto a matrix Q. The keys and values are also packed together into matrices K and V . We compute\nthe matrix of outputs as:\nAttention(Q, K, V ) = softmax(QKT\n√dk\n)V\n(1)\nThe two most commonly used attention functions are additive attention [2], and dot-product (multi-\nplicative) attention. Dot-product attention is identical to our algorithm, except for the scaling factor\nof\n1\n√dk . Additive attention computes the compatibility function using a feed-forward network with\na single hidden layer. While the two are similar in theoretical complexity, dot-product attention is\nmuch faster and 

### **Contextualized generative step**

This section integrates the retrieval system with content generation capabilities based on user queries.

### **Context formatting**

Format the context following examples from the official [Phi-3-vision-128k-instruct](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) documentation.

In [None]:
def format_context_for_phi3(results, max_context_length=None):
    """
    Format retrieved results into a context string for PHI-3.

    Each hit becomes a piece ``[<Text|Image> from page P, relevance: R]`` followed
    by the stored document, where R is ``1 - distance``. Pieces are joined with
    a ``---`` separator line.

    Parameters:
    -----------
    results: Results from ChromaDB query (the first entry of "ids",
        "documents", "metadatas" and "distances" is read)
    max_context_length: Maximum length, in characters, of the returned string
        (separators included); None means no limit. Pieces are added in order
        until the next one would not fit.

    Returns:
    --------
    Formatted context string, or "No relevant context found." when there are
    no hits
    """
    if not results or not results.get("ids") or not results["ids"][0]:
        return "No relevant context found."

    separator = "\n---\n"
    documents = results["documents"][0]
    metadatas = results["metadatas"][0]
    distances = results["distances"][0]

    context_parts = []
    current_length = 0

    for doc, metadata, distance in zip(documents, metadatas, distances):
        label = "Text" if metadata["type"] == "text" else "Image"
        context_piece = f"[{label} from page {metadata['page']}, relevance: {1-distance:.2f}]\n{doc}\n"

        # Every piece after the first also costs one separator in the output.
        added_length = len(context_piece) + (len(separator) if context_parts else 0)
        if max_context_length is not None and current_length + added_length > max_context_length:
            break

        context_parts.append(context_piece)
        current_length += added_length

    return separator.join(context_parts)

In [None]:
context = format_context_for_phi3(query_results)
print(context[:1000])

[Text from page 4, relevance: 0.50]
Scaled Dot-Product Attention
Multi-Head Attention
Figure 2: (left) Scaled Dot-Product Attention. (right) Multi-Head Attention consists of several
attention layers running in parallel.
query with all keys, divide each by √dk, and apply a softmax function to obtain the weights on the
values.
In practice, we compute the attention function on a set of queries simultaneously, packed together
into a matrix Q. The keys and values are also packed together into matrices K and V . We compute
the matrix of outputs as:
Attention(Q, K, V ) = softmax(QKT
√dk
)V
(1)
The two most commonly used attention functions are additive attention [2], and dot-product (multi-
plicative) attention. Dot-product attention is identical to our algorithm, except for the scaling factor
of
1
√dk . Additive attention computes the compatibility function using a feed-forward network with
a single hidden layer. While the two are similar in theoretical complexity, dot-product attention is
m

In [None]:
print(context[100])

t


### **Language model instance**

Initialize the PHI-3 vision model for text generation.

In [None]:
from transformers import AutoModelForCausalLM, AutoProcessor

In [None]:
language_model_id = "microsoft/Phi-3-vision-128k-instruct"

In [None]:
# Generation model, loaded with device_map="cuda" and the "flash_attention_2"
# attention implementation.
# NOTE(review): presumably this needs a CUDA GPU and a working flash-attn
# install — confirm before running on other hardware.
phi3_model = AutoModelForCausalLM.from_pretrained(
    language_model_id,
    device_map="cuda",
    trust_remote_code=True,
    torch_dtype="auto",
    _attn_implementation="flash_attention_2",
)

# Matching processor; its tokenizer is used for the chat template and decoding.
phi3_processor = AutoProcessor.from_pretrained(
    language_model_id,
    trust_remote_code=True,
)


config.json: 0.00B [00:00, ?B/s]

configuration_phi3_v.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct:
- configuration_phi3_v.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3_v.py: 0.00B [00:00, ?B/s]

image_embedding_phi3_v.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct:
- image_embedding_phi3_v.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct:
- modeling_phi3_v.py
- image_embedding_phi3_v.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.35G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

preprocessor_config.json:   0%|          | 0.00/464 [00:00<?, ?B/s]

processing_phi3_v.py: 0.00B [00:00, ?B/s]

image_processing_phi3_v.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct:
- image_processing_phi3_v.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct:
- processing_phi3_v.py
- image_processing_phi3_v.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/670 [00:00<?, ?B/s]

In [None]:
phi3_model

Phi3VForCausalLM(
  (model): Phi3VModel(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (vision_embed_tokens): Phi3ImageEmbedding(
      (drop): Dropout(p=0.0, inplace=False)
      (wte): Embedding(32064, 3072, padding_idx=32000)
      (img_processor): CLIPVisionModel(
        (vision_model): CLIPVisionTransformer(
          (embeddings): CLIPVisionEmbeddings(
            (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
            (position_embedding): Embedding(577, 1024)
          )
          (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (encoder): CLIPEncoder(
            (layers): ModuleList(
              (0-23): 24 x CLIPEncoderLayer(
                (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
                (mlp): CLIPMLP(
                  (activation_fn): QuickGELUActivation()
                  (fc1): Linear(i

### **Model generation**

Generate responses with the language model, supplying the previously retrieved context directly in the prompt.

In [None]:
def generate_response_with_phi3(
    phi3_model, phi3_processor, prompt, context,
    images=None, max_new_tokens=512, temperature=0.7, top_p=0.9):
    """
    Generate a response using PHI-3 Vision model with RAG context.

    The context is placed in a system message, the prompt in a user message,
    and the chat template of the processor's tokenizer builds the final text.

    Parameters:
    -----------
    phi3_model: PHI-3 Vision model
    phi3_processor: PHI-3 processor
    prompt: User prompt/question
    context: Retrieved context from RAG
    images: Optional list of PIL Images; forwarded to the processor when non-empty
    max_new_tokens: Maximum number of tokens to generate
    temperature: Sampling temperature; a value <= 0 (or None) disables sampling
    top_p: Nucleus sampling threshold, used only when sampling is enabled

    Returns:
    --------
    Generated text response (the newly generated tokens only, decoded)
    """
    messages = [
        {"role": "system", "content": f"You are a helpful AI assistant. Use the following context from documents to answer the user's question.\nIf the context doesn't contain relevant information, say so clearly.\n\nContext:\n{context}"},
        {"role": "user", "content": prompt}
    ]

    prompt_text = phi3_processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Only pass `images` when there is something to pass.
    processor_kwargs = {"text": prompt_text, "return_tensors": "pt"}
    if images and len(images) > 0:
        processor_kwargs["images"] = images
    inputs = phi3_processor(**processor_kwargs)

    # Move tensor inputs to wherever the model's parameters live.
    device = next(phi3_model.parameters()).device
    inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v
              for k, v in inputs.items()}

    # Sampling knobs are sent only when sampling is actually enabled.
    do_sample = temperature is not None and temperature > 0
    generation_args = {"max_new_tokens": max_new_tokens, "do_sample": do_sample}
    if do_sample:
        generation_args["temperature"] = temperature
        generation_args["top_p"] = top_p

    logger.info("Generating response...")
    with torch.no_grad():
        generate_ids = phi3_model.generate(
            **inputs,
            eos_token_id=phi3_processor.tokenizer.eos_token_id,
            **generation_args
        )

    # remove input tokens
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = phi3_processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

    return response

In [None]:
response = generate_response_with_phi3(
      phi3_model,
      phi3_processor,
      "what is the attention mechanism",
      context,
      max_new_tokens=3
  )

2025-10-17 22:45:40,074 - INFO - Generating response...


In [None]:
response

'The attention mechanism'

In [None]:
query = "how does the attention mechanism works"

In [None]:
# NOTE(review): rag_query_and_generate is only defined in a later cell of this
# notebook, so on a fresh kernel (Restart & Run All) this call raises NameError.
result = rag_query_and_generate(
  jina_model,
  phi3_model,
  phi3_processor,
  collection,
  query,
  n_results=5
)

2025-10-17 22:45:40,470 - INFO - 
2025-10-17 22:45:40,471 - INFO - Processing query: how does the attention mechanism works

2025-10-17 22:45:40,472 - INFO - Step 1: Retrieving relevant context...
2025-10-17 22:45:40,473 - INFO - Querying with text: 'how does the attention mechanism works'
2025-10-17 22:45:40,519 - INFO - Found 5 results
2025-10-17 22:45:40,519 - INFO - 
Step 2: Formatting context...
2025-10-17 22:45:40,520 - INFO - Context length: 15345 characters
2025-10-17 22:45:40,521 - INFO - 
Step 3: Generating response with PHI-3...
2025-10-17 22:45:40,532 - INFO - Generating response...
2025-10-17 22:46:12,845 - INFO - 
2025-10-17 22:46:12,846 - INFO - COMPLETE



In [None]:
print(result["response"])

The attention mechanism is a method that allows a model to focus on different parts of the input sequence when generating the output sequence. It does this by assigning different weights to different parts of the input sequence, allowing the model to focus on the most relevant information for each step of the output sequence. In the context of the Transformer model, the attention mechanism is used in three ways:

1. Encoder-decoder attention: In this case, the queries come from the previous decoder layer, and the memory keys and values come from the output of the encoder. This allows every position in the decoder to attend over all positions in the input sequence, mimicking the typical encoder-decoder attention mechanisms found in sequence-to-sequence models.
2. Self-attention: In a self-attention layer, all of the keys, values, and queries come from the same place, in this case, the output of the previous layer in the encoder. Each position in the encoder can attend to all positions i

### **Model generation with RAG context**

Generate responses using the language model enhanced with retrieved context from the RAG system.

In [None]:
def rag_query_and_generate(
  jina_model, phi3_model, phi3_processor, collection,
  query_text, n_results=5, filter_type=None,
  max_new_tokens=512):
  """
  Complete RAG pipeline: retrieve relevant context, format it, and generate a response.

  The three stages are delegated to `query_with_text`, `format_context_for_phi3`
  and `generate_response_with_phi3`; this function only wires them together and
  logs progress through the notebook-level `logger`.

  Parameters:
  -----------
  jina_model: Jina-CLIP model for retrieval
  phi3_model: PHI-3 Vision model for generation
  phi3_processor: PHI-3 processor
  collection: ChromaDB collection
  query_text: User query
  n_results: Number of results to retrieve
  filter_type: Optional filter for retrieval
  max_new_tokens: Maximum tokens to generate

  Returns:
  --------
  Dictionary with the keys:
    "query": the `query_text` that was passed in
    "retrieved_results": the value returned by `query_with_text`
    "context": the value returned by `format_context_for_phi3`
    "response": the value returned by `generate_response_with_phi3`
  """
  # Build the banner outside the log call: writing f"{"="*80}" (same quote type
  # nested inside the f-string) is a SyntaxError on Python versions before 3.12.
  banner = "=" * 80

  # Deferred %-style arguments produce the same text as the former f-strings.
  logger.info("\n%s", banner)
  logger.info("Processing query: %s", query_text)
  logger.info("%s\n", banner)

  logger.info("Step 1: Retrieving relevant context...")
  results = query_with_text(
      jina_model, collection, query_text,
      n_results=n_results, filter_type=filter_type)

  logger.info("\nStep 2: Formatting context...")
  context = format_context_for_phi3(results)
  logger.info("Context length: %s characters", len(context))

  logger.info("\nStep 3: Generating response with PHI-3...")
  response = generate_response_with_phi3(
      phi3_model,
      phi3_processor,
      query_text,
      context,
      max_new_tokens=max_new_tokens
  )

  logger.info("\n%s", banner)
  logger.info("COMPLETE")
  logger.info("%s\n", banner)

  return {
      "query": query_text,
      "retrieved_results": results,
      "context": context,
      "response": response
  }

In [None]:
# Question passed to the RAG pipeline in the next cell.
# NOTE(review): the phrasing is ungrammatical ("how do ... works"); it is kept verbatim
# because the recorded outputs below quote it — confirm whether that is intentional.
query = "how do the attention mechanism works"

In [None]:
# Run retrieval, context formatting and generation for `query`, requesting 5 results.
# `max_new_tokens` is not passed, so the function's default (512) applies.
result = rag_query_and_generate(
  jina_model,
  phi3_model,
  phi3_processor,
  collection,
  query,
  n_results=5
)

2025-10-17 22:46:12,881 - INFO - 
2025-10-17 22:46:12,882 - INFO - Processing query: how do the attention mechanism works

2025-10-17 22:46:12,883 - INFO - Step 1: Retrieving relevant context...
2025-10-17 22:46:12,884 - INFO - Querying with text: 'how do the attention mechanism works'
2025-10-17 22:46:12,946 - INFO - Found 5 results
2025-10-17 22:46:12,946 - INFO - 
Step 2: Formatting context...
2025-10-17 22:46:12,947 - INFO - Context length: 15345 characters
2025-10-17 22:46:12,948 - INFO - 
Step 3: Generating response with PHI-3...
2025-10-17 22:46:12,960 - INFO - Generating response...
2025-10-17 22:46:25,826 - INFO - 
2025-10-17 22:46:25,827 - INFO - COMPLETE



In [None]:
# List the fields of the dict returned by the pipeline.
result.keys()

dict_keys(['query', 'retrieved_results', 'context', 'response'])

In [None]:
# The result stores the original question under "query".
result["query"]

'how do the attention mechanism works'

In [None]:
# Print just the generated answer from the pipeline result.
print(result["response"])

The attention mechanism in the context of Transformer models allows the model to focus on different parts of the input sequence when generating each part of the output sequence. It does this by computing a weighted sum of the values (from the input sequence) corresponding to each position in the output sequence. This weighted sum is then used to compute a representation for the current position in the output sequence. The weights are computed by applying a softmax function to the dot product of the query with all the keys. The query and keys are derived from the encoder and decoder stacks, respectively, and are linearly transformed using learned weight matrices. The resulting values are then used as input for the next layer in the decoder. This process is repeated for each position in the output sequence. The attention mechanism enables the model to learn which parts of the input sequence are more relevant for each part of the output sequence, allowing for more efficient and effective 

### **Interactive RAG Chat**

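Ask a sequence of questions through the RAG pipeline. Each question is retrieved and answered independently; `chat_history` only records the turns and is not passed back to the language model.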

In [None]:
chat_history = []

def chat_with_rag(query, chat_history=None, n_results=3):
    """
    Answer one chat turn with the RAG pipeline and record it in a history list.

    The question is sent to `rag_query_and_generate` using the notebook-level
    `jina_model`, `phi3_model`, `phi3_processor` and `collection`. The history is
    only appended to; it is not passed to the pipeline, so each question is
    answered independently of earlier turns.

    Parameters:
    -----------
    query: User question for this turn
    chat_history: Optional list that receives two entries per call — a
        {"role": "user", ...} dict and a {"role": "assistant", ...} dict.
        When omitted, a new list local to this call is used.
    n_results: Number of results to retrieve for this turn

    Returns:
    --------
    String combining the retrieved context and the generated answer
    (the same text stored as the assistant entry in `chat_history`).
    """
    # A `[]` default is evaluated once at definition time and would be shared by
    # every call that omits `chat_history`, silently accumulating unrelated turns.
    if chat_history is None:
        chat_history = []

    chat_history.append({"role": "user", "content": query})

    rag_response = rag_query_and_generate(
        jina_model,
        phi3_model,
        phi3_processor,
        collection,
        query,
        n_results=n_results
    )

    full_response = f"Retrieved Context:\n---\n{rag_response['context']}\n---\n\nGenerated Answer:\n{rag_response['response']}"

    chat_history.append({"role": "assistant", "content": full_response})

    # Only the answer is printed; the context is available in the return value.
    print("\nGenerated Answer:")
    print(rag_response['response'])

    return full_response

In [None]:
# Turn 1: the question and the combined context+answer are appended to `chat_history`.
# The call is the cell's last expression, so its return value is also echoed as output.
chat_with_rag("What is the main architecture proposed in the 'Attention is All You Need' paper?", chat_history)

2025-10-17 22:52:32,715 - INFO - 
2025-10-17 22:52:32,715 - INFO - Processing query: What is the main architecture proposed in the 'Attention is All You Need' paper?

2025-10-17 22:52:32,716 - INFO - Step 1: Retrieving relevant context...
2025-10-17 22:52:32,717 - INFO - Querying with text: 'What is the main architecture proposed in the 'Attention is All You Need' paper?'
2025-10-17 22:52:32,789 - INFO - Found 3 results
2025-10-17 22:52:32,790 - INFO - 
Step 2: Formatting context...
2025-10-17 22:52:32,790 - INFO - Context length: 8682 characters
2025-10-17 22:52:32,791 - INFO - 
Step 3: Generating response with PHI-3...
2025-10-17 22:52:32,800 - INFO - Generating response...
2025-10-17 22:52:35,698 - INFO - 
2025-10-17 22:52:35,699 - INFO - COMPLETE


Generated Answer:
The main architecture proposed in the 'Attention is All You Need' paper is the Transformer model, which is based solely on attention mechanisms, dispensing with recurrence and convolutions entirely.


'Retrieved Context:\n---\n[Text from page 1, relevance: 0.44]\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗†\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz Kaiser∗\nGoogle Brain\nlukaszkaiser@google.com\nIllia Polosukhin∗‡\nillia.polosukhin@gmail.com\nAbstract\nThe dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks that include an encoder and a decoder. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the Transformer,\nbased solely on attention mechanisms, dispensing with recurrence and convolutions\nentirely. Experiments on two machine translation tasks show these models to\nbe superior in qual

In [None]:
# Turn 2: appended to the same `chat_history` list as the previous turn.
chat_with_rag("Explain the multi-head attention mechanism.", chat_history)

2025-10-17 22:52:35,705 - INFO - 
2025-10-17 22:52:35,706 - INFO - Processing query: Explain the multi-head attention mechanism.

2025-10-17 22:52:35,707 - INFO - Step 1: Retrieving relevant context...
2025-10-17 22:52:35,707 - INFO - Querying with text: 'Explain the multi-head attention mechanism.'
2025-10-17 22:52:35,774 - INFO - Found 3 results
2025-10-17 22:52:35,774 - INFO - 
Step 2: Formatting context...
2025-10-17 22:52:35,775 - INFO - Context length: 10028 characters
2025-10-17 22:52:35,775 - INFO - 
Step 3: Generating response with PHI-3...
2025-10-17 22:52:35,784 - INFO - Generating response...
2025-10-17 22:52:53,346 - INFO - 
2025-10-17 22:52:53,347 - INFO - COMPLETE


Generated Answer:
The multi-head attention mechanism is a component of the Transformer model that allows the model to jointly attend to information from different representation subspaces at different positions. In a single attention head, the queries, keys, and values are linearly projected to dk, dk, and dv

'Retrieved Context:\n---\n[Text from page 5, relevance: 0.70]\nMultiHead(Q, K, V ) = Concat(head1, ..., headh)W O\nwhere headi = Attention(QW Q\ni , KW K\ni , V W V\ni )\nWhere the projections are parameter matrices W Q\ni\n∈Rdmodel×dk, W K\ni\n∈Rdmodel×dk, W V\ni\n∈Rdmodel×dv\nand W O ∈Rhdv×dmodel.\nIn this work we employ h = 8 parallel attention layers, or heads. For each of these we use\ndk = dv = dmodel/h = 64. Due to the reduced dimension of each head, the total computational cost\nis similar to that of single-head attention with full dimensionality.\n3.2.3\nApplications of Attention in our Model\nThe Transformer uses multi-head attention in three different ways:\n• In "encoder-decoder attention" layers, the queries come from the previous decoder layer,\nand the memory keys and values come from the output of the encoder. This allows every\nposition in the decoder to attend over all positions in the input sequence. This mimics the\ntypical encoder-decoder attention mechanisms in se

In [None]:
# Turn 3: asks about the architecture diagram; appended to `chat_history`.
chat_with_rag("Describe the model architecture diagram shown in the paper. What are its main components?", chat_history)

2025-10-17 22:52:53,355 - INFO - 
2025-10-17 22:52:53,356 - INFO - Processing query: Describe the model architecture diagram shown in the paper. What are its main components?

2025-10-17 22:52:53,357 - INFO - Step 1: Retrieving relevant context...
2025-10-17 22:52:53,357 - INFO - Querying with text: 'Describe the model architecture diagram shown in the paper. What are its main components?'
2025-10-17 22:52:53,450 - INFO - Found 3 results
2025-10-17 22:52:53,451 - INFO - 
Step 2: Formatting context...
2025-10-17 22:52:53,452 - INFO - Context length: 8746 characters
2025-10-17 22:52:53,453 - INFO - 
Step 3: Generating response with PHI-3...
2025-10-17 22:52:53,460 - INFO - Generating response...
2025-10-17 22:53:03,667 - INFO - 
2025-10-17 22:53:03,668 - INFO - COMPLETE


Generated Answer:
The model architecture diagram shown in the paper consists of two main components: the encoder and the decoder. Both components are composed of stacked layers. The encoder includes a stack of N = 6 ide

'Retrieved Context:\n---\n[Text from page 3, relevance: 0.37]\nFigure 1: The Transformer - model architecture.\nwise fully connected feed-forward network. We employ a residual connection [10] around each of\nthe two sub-layers, followed by layer normalization [1]. That is, the output of each sub-layer is\nLayerNorm(x + Sublayer(x)), where Sublayer(x) is the function implemented by the sub-layer\nitself. To facilitate these residual connections, all sub-layers in the model, as well as the embedding\nlayers, produce outputs of dimension dmodel = 512.\nDecoder:\nThe decoder is also composed of a stack of N = 6 identical layers. In addition to the two\nsub-layers in each encoder layer, the decoder inserts a third sub-layer, which performs multi-head\nattention over the output of the encoder stack. Similar to the encoder, we employ residual connections\naround each of the sub-layers, followed by layer normalization. We also modify the self-attention\nsub-layer in the decoder stack to preven

In [None]:
# Turn 4: comparison question; appended to `chat_history`.
chat_with_rag("How does the Transformer model compare to RNN and CNN architectures according to the paper?", chat_history)

2025-10-17 22:53:03,675 - INFO - 
2025-10-17 22:53:03,676 - INFO - Processing query: How does the Transformer model compare to RNN and CNN architectures according to the paper?

2025-10-17 22:53:03,678 - INFO - Step 1: Retrieving relevant context...
2025-10-17 22:53:03,678 - INFO - Querying with text: 'How does the Transformer model compare to RNN and CNN architectures according to the paper?'
2025-10-17 22:53:03,775 - INFO - Found 3 results
2025-10-17 22:53:03,776 - INFO - 
Step 2: Formatting context...
2025-10-17 22:53:03,776 - INFO - Context length: 7583 characters
2025-10-17 22:53:03,777 - INFO - 
Step 3: Generating response with PHI-3...
2025-10-17 22:53:03,784 - INFO - Generating response...
2025-10-17 22:53:13,994 - INFO - 
2025-10-17 22:53:13,995 - INFO - COMPLETE


Generated Answer:
According to the paper, the Transformer model outperforms architectures based on recurrent or convolutional layers, such as RNN and CNN, on translation tasks. It can be trained significantly faster

'Retrieved Context:\n---\n[Text from page 9, relevance: 0.57]\nTable 3: Variations on the Transformer architecture. Unlisted values are identical to those of the base\nmodel. All metrics are on the English-to-German translation development set, newstest2013. Listed\nperplexities are per-wordpiece, according to our byte-pair encoding, and should not be compared to\nper-word perplexities.\nN\ndmodel\ndff\nh\ndk\ndv\nPdrop\nϵls\ntrain\nPPL\nBLEU\nparams\nsteps\n(dev)\n(dev)\n×106\nbase\n6\n512\n2048\n8\n64\n64\n0.1\n0.1\n100K\n4.92\n25.8\n65\n(A)\n1\n512\n512\n5.29\n24.9\n4\n128\n128\n5.00\n25.5\n16\n32\n32\n4.91\n25.8\n32\n16\n16\n5.01\n25.4\n(B)\n16\n5.16\n25.1\n58\n32\n5.01\n25.4\n60\n(C)\n2\n6.11\n23.7\n36\n4\n5.19\n25.3\n50\n8\n4.88\n25.5\n80\n256\n32\n32\n5.75\n24.5\n28\n1024\n128\n128\n4.66\n26.0\n168\n1024\n5.12\n25.4\n53\n4096\n4.75\n26.2\n90\n(D)\n0.0\n5.77\n24.6\n0.2\n4.95\n25.5\n0.0\n4.67\n25.3\n0.2\n5.47\n25.7\n(E)\npositional embedding instead of sinusoids\n4.92\n25.7\nbig\n

In [None]:
# Turn 5: formula question; appended to `chat_history`.
chat_with_rag("What is the formula for scaled dot-product attention and what does each component represent?", chat_history)

2025-10-17 22:53:14,001 - INFO - 
2025-10-17 22:53:14,002 - INFO - Processing query: What is the formula for scaled dot-product attention and what does each component represent?

2025-10-17 22:53:14,003 - INFO - Step 1: Retrieving relevant context...
2025-10-17 22:53:14,004 - INFO - Querying with text: 'What is the formula for scaled dot-product attention and what does each component represent?'
2025-10-17 22:53:14,096 - INFO - Found 3 results
2025-10-17 22:53:14,097 - INFO - 
Step 2: Formatting context...
2025-10-17 22:53:14,097 - INFO - Context length: 9319 characters
2025-10-17 22:53:14,098 - INFO - 
Step 3: Generating response with PHI-3...
2025-10-17 22:53:14,106 - INFO - Generating response...
2025-10-17 22:53:27,267 - INFO - 
2025-10-17 22:53:27,268 - INFO - COMPLETE


Generated Answer:
The formula for scaled dot-product attention is as follows:

Attention(Q, K, V ) = softmax(QKT √dk )V

Here's what each component represents:

- Q: Query matrix, which is a matrix of shape (batch

'Retrieved Context:\n---\n[Text from page 4, relevance: 0.73]\nScaled Dot-Product Attention\nMulti-Head Attention\nFigure 2: (left) Scaled Dot-Product Attention. (right) Multi-Head Attention consists of several\nattention layers running in parallel.\nquery with all keys, divide each by √dk, and apply a softmax function to obtain the weights on the\nvalues.\nIn practice, we compute the attention function on a set of queries simultaneously, packed together\ninto a matrix Q. The keys and values are also packed together into matrices K and V . We compute\nthe matrix of outputs as:\nAttention(Q, K, V ) = softmax(QKT\n√dk\n)V\n(1)\nThe two most commonly used attention functions are additive attention [2], and dot-product (multi-\nplicative) attention. Dot-product attention is identical to our algorithm, except for the scaling factor\nof\n1\n√dk . Additive attention computes the compatibility function using a feed-forward network with\na single hidden layer. While the two are similar in theor

### **Examples without Chat History**

#### *What is the main architecture proposed in the 'Attention is All You Need' paper?*

In [None]:
# No history list is passed, so `chat_with_rag` falls back to its default `chat_history` argument.
chat_with_rag("What is the main architecture proposed in the 'Attention is All You Need' paper?")

2025-10-17 22:53:27,278 - INFO - 
2025-10-17 22:53:27,279 - INFO - Processing query: What is the main architecture proposed in the 'Attention is All You Need' paper?

2025-10-17 22:53:27,280 - INFO - Step 1: Retrieving relevant context...
2025-10-17 22:53:27,281 - INFO - Querying with text: 'What is the main architecture proposed in the 'Attention is All You Need' paper?'
2025-10-17 22:53:27,377 - INFO - Found 3 results
2025-10-17 22:53:27,378 - INFO - 
Step 2: Formatting context...
2025-10-17 22:53:27,378 - INFO - Context length: 8682 characters
2025-10-17 22:53:27,379 - INFO - 
Step 3: Generating response with PHI-3...
2025-10-17 22:53:27,386 - INFO - Generating response...
2025-10-17 22:53:30,252 - INFO - 
2025-10-17 22:53:30,253 - INFO - COMPLETE


Generated Answer:
The main architecture proposed in the 'Attention is All You Need' paper is the Transformer model, which is based solely on attention mechanisms, dispensing with recurrence and convolutions entirely.


'Retrieved Context:\n---\n[Text from page 1, relevance: 0.44]\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗†\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz Kaiser∗\nGoogle Brain\nlukaszkaiser@google.com\nIllia Polosukhin∗‡\nillia.polosukhin@gmail.com\nAbstract\nThe dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks that include an encoder and a decoder. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the Transformer,\nbased solely on attention mechanisms, dispensing with recurrence and convolutions\nentirely. Experiments on two machine translation tasks show these models to\nbe superior in qual

#### *Explain the multi-head attention mechanism.*

In [None]:
# Called without a history list; the default `chat_history` argument is used.
chat_with_rag("Explain the multi-head attention mechanism.")

2025-10-17 22:53:30,261 - INFO - 
2025-10-17 22:53:30,262 - INFO - Processing query: Explain the multi-head attention mechanism.

2025-10-17 22:53:30,263 - INFO - Step 1: Retrieving relevant context...
2025-10-17 22:53:30,264 - INFO - Querying with text: 'Explain the multi-head attention mechanism.'
2025-10-17 22:53:30,330 - INFO - Found 3 results
2025-10-17 22:53:30,331 - INFO - 
Step 2: Formatting context...
2025-10-17 22:53:30,331 - INFO - Context length: 10028 characters
2025-10-17 22:53:30,332 - INFO - 
Step 3: Generating response with PHI-3...
2025-10-17 22:53:30,340 - INFO - Generating response...
2025-10-17 22:53:43,302 - INFO - 
2025-10-17 22:53:43,303 - INFO - COMPLETE


Generated Answer:
The multi-head attention mechanism is a technique used in Transformer models that allows the model to jointly attend to information from different representation subspaces at different positions. It involves linearly projecting the queries, keys, and values h times with different, learned li

'Retrieved Context:\n---\n[Text from page 5, relevance: 0.70]\nMultiHead(Q, K, V ) = Concat(head1, ..., headh)W O\nwhere headi = Attention(QW Q\ni , KW K\ni , V W V\ni )\nWhere the projections are parameter matrices W Q\ni\n∈Rdmodel×dk, W K\ni\n∈Rdmodel×dk, W V\ni\n∈Rdmodel×dv\nand W O ∈Rhdv×dmodel.\nIn this work we employ h = 8 parallel attention layers, or heads. For each of these we use\ndk = dv = dmodel/h = 64. Due to the reduced dimension of each head, the total computational cost\nis similar to that of single-head attention with full dimensionality.\n3.2.3\nApplications of Attention in our Model\nThe Transformer uses multi-head attention in three different ways:\n• In "encoder-decoder attention" layers, the queries come from the previous decoder layer,\nand the memory keys and values come from the output of the encoder. This allows every\nposition in the decoder to attend over all positions in the input sequence. This mimics the\ntypical encoder-decoder attention mechanisms in se

#### *Describe the model architecture diagram shown in the paper. What are its main components?*

In [None]:
# Called without a history list; the default `chat_history` argument is used.
chat_with_rag("Describe the model architecture diagram shown in the paper. What are its main components?")

2025-10-17 22:53:43,309 - INFO - 
2025-10-17 22:53:43,310 - INFO - Processing query: Describe the model architecture diagram shown in the paper. What are its main components?

2025-10-17 22:53:43,311 - INFO - Step 1: Retrieving relevant context...
2025-10-17 22:53:43,312 - INFO - Querying with text: 'Describe the model architecture diagram shown in the paper. What are its main components?'
2025-10-17 22:53:43,376 - INFO - Found 3 results
2025-10-17 22:53:43,377 - INFO - 
Step 2: Formatting context...
2025-10-17 22:53:43,378 - INFO - Context length: 8746 characters
2025-10-17 22:53:43,378 - INFO - 
Step 3: Generating response with PHI-3...
2025-10-17 22:53:43,386 - INFO - Generating response...
2025-10-17 22:53:54,255 - INFO - 
2025-10-17 22:53:54,256 - INFO - COMPLETE


Generated Answer:
The model architecture diagram shown in the paper consists of an encoder and a decoder. The encoder is composed of a stack of N=6 identical layers. Each layer has two sub-layers, a multi-head self-atte

'Retrieved Context:\n---\n[Text from page 3, relevance: 0.37]\nFigure 1: The Transformer - model architecture.\nwise fully connected feed-forward network. We employ a residual connection [10] around each of\nthe two sub-layers, followed by layer normalization [1]. That is, the output of each sub-layer is\nLayerNorm(x + Sublayer(x)), where Sublayer(x) is the function implemented by the sub-layer\nitself. To facilitate these residual connections, all sub-layers in the model, as well as the embedding\nlayers, produce outputs of dimension dmodel = 512.\nDecoder:\nThe decoder is also composed of a stack of N = 6 identical layers. In addition to the two\nsub-layers in each encoder layer, the decoder inserts a third sub-layer, which performs multi-head\nattention over the output of the encoder stack. Similar to the encoder, we employ residual connections\naround each of the sub-layers, followed by layer normalization. We also modify the self-attention\nsub-layer in the decoder stack to preven

#### *How does the Transformer model compare to RNN and CNN architectures according to the paper?*

In [None]:
# Called without a history list; the default `chat_history` argument is used.
chat_with_rag("How does the Transformer model compare to RNN and CNN architectures according to the paper?")

2025-10-17 22:53:54,263 - INFO - 
2025-10-17 22:53:54,264 - INFO - Processing query: How does the Transformer model compare to RNN and CNN architectures according to the paper?

2025-10-17 22:53:54,265 - INFO - Step 1: Retrieving relevant context...
2025-10-17 22:53:54,266 - INFO - Querying with text: 'How does the Transformer model compare to RNN and CNN architectures according to the paper?'
2025-10-17 22:53:54,331 - INFO - Found 3 results
2025-10-17 22:53:54,332 - INFO - 
Step 2: Formatting context...
2025-10-17 22:53:54,332 - INFO - Context length: 7583 characters
2025-10-17 22:53:54,333 - INFO - 
Step 3: Generating response with PHI-3...
2025-10-17 22:53:54,340 - INFO - Generating response...
2025-10-17 22:54:01,937 - INFO - 
2025-10-17 22:54:01,938 - INFO - COMPLETE


Generated Answer:
According to the paper, the Transformer model compares favorably to RNN and CNN architectures for translation tasks. The paper states that the Transformer can be trained significantly faster than a

'Retrieved Context:\n---\n[Text from page 9, relevance: 0.57]\nTable 3: Variations on the Transformer architecture. Unlisted values are identical to those of the base\nmodel. All metrics are on the English-to-German translation development set, newstest2013. Listed\nperplexities are per-wordpiece, according to our byte-pair encoding, and should not be compared to\nper-word perplexities.\nN\ndmodel\ndff\nh\ndk\ndv\nPdrop\nϵls\ntrain\nPPL\nBLEU\nparams\nsteps\n(dev)\n(dev)\n×106\nbase\n6\n512\n2048\n8\n64\n64\n0.1\n0.1\n100K\n4.92\n25.8\n65\n(A)\n1\n512\n512\n5.29\n24.9\n4\n128\n128\n5.00\n25.5\n16\n32\n32\n4.91\n25.8\n32\n16\n16\n5.01\n25.4\n(B)\n16\n5.16\n25.1\n58\n32\n5.01\n25.4\n60\n(C)\n2\n6.11\n23.7\n36\n4\n5.19\n25.3\n50\n8\n4.88\n25.5\n80\n256\n32\n32\n5.75\n24.5\n28\n1024\n128\n128\n4.66\n26.0\n168\n1024\n5.12\n25.4\n53\n4096\n4.75\n26.2\n90\n(D)\n0.0\n5.77\n24.6\n0.2\n4.95\n25.5\n0.0\n4.67\n25.3\n0.2\n5.47\n25.7\n(E)\npositional embedding instead of sinusoids\n4.92\n25.7\nbig\n

#### *What is the formula for scaled dot-product attention and what does each component represent?*

In [None]:
# Called without a history list; the default `chat_history` argument is used.
chat_with_rag("What is the formula for scaled dot-product attention and what does each component represent?")

2025-10-17 22:54:01,945 - INFO - 
2025-10-17 22:54:01,945 - INFO - Processing query: What is the formula for scaled dot-product attention and what does each component represent?

2025-10-17 22:54:01,947 - INFO - Step 1: Retrieving relevant context...
2025-10-17 22:54:01,947 - INFO - Querying with text: 'What is the formula for scaled dot-product attention and what does each component represent?'
2025-10-17 22:54:02,020 - INFO - Found 3 results
2025-10-17 22:54:02,021 - INFO - 
Step 2: Formatting context...
2025-10-17 22:54:02,022 - INFO - Context length: 9319 characters
2025-10-17 22:54:02,022 - INFO - 
Step 3: Generating response with PHI-3...
2025-10-17 22:54:02,030 - INFO - Generating response...
2025-10-17 22:54:18,070 - INFO - 
2025-10-17 22:54:18,071 - INFO - COMPLETE


Generated Answer:
The formula for scaled dot-product attention is as follows:

Attention(Q, K, V ) = softmax(QKT√dk)V

Here is what each component represents:

- Q, K, and V represent the query, key, and value mat

'Retrieved Context:\n---\n[Text from page 4, relevance: 0.73]\nScaled Dot-Product Attention\nMulti-Head Attention\nFigure 2: (left) Scaled Dot-Product Attention. (right) Multi-Head Attention consists of several\nattention layers running in parallel.\nquery with all keys, divide each by √dk, and apply a softmax function to obtain the weights on the\nvalues.\nIn practice, we compute the attention function on a set of queries simultaneously, packed together\ninto a matrix Q. The keys and values are also packed together into matrices K and V . We compute\nthe matrix of outputs as:\nAttention(Q, K, V ) = softmax(QKT\n√dk\n)V\n(1)\nThe two most commonly used attention functions are additive attention [2], and dot-product (multi-\nplicative) attention. Dot-product attention is identical to our algorithm, except for the scaling factor\nof\n1\n√dk . Additive attention computes the compatibility function using a feed-forward network with\na single hidden layer. While the two are similar in theor