In [1]:

!pip uninstall tabula tabula-py
!pip install tabula-py

[0mFound existing installation: tabula-py 2.10.0
Uninstalling tabula-py-2.10.0:
  Would remove:
    /usr/local/lib/python3.11/dist-packages/tabula/*
    /usr/local/lib/python3.11/dist-packages/tabula_py-2.10.0.dist-info/*
Proceed (Y/n)? y
  Successfully uninstalled tabula-py-2.10.0
Collecting tabula-py
  Using cached tabula_py-2.10.0-py3-none-any.whl.metadata (7.6 kB)
Using cached tabula_py-2.10.0-py3-none-any.whl (12.0 MB)
Installing collected packages: tabula-py
Successfully installed tabula-py-2.10.0


In [2]:
!pip install -r requirements.txt



In [3]:
import boto3
import tabula
import faiss
import json
import base64
import pymupdf
import requests
import os
import logging
import numpy as np
import warnings
from tqdm import tqdm
from botocore.exceptions import ClientError
from langchain_text_splitters import RecursiveCharacterTextSplitter
from IPython import display


logger = logging.getLogger(__name__)
logger.setLevel(logging.ERROR)

In [4]:
# Downloading the dataset - URL of the "Attention Is All You Need" paper (Replace it with the URL of the PDF file/dataset you want to download)
url = "https://arxiv.org/pdf/1706.03762.pdf"

# Set the filename and filepath
filename = "attention_paper.pdf"
filepath = os.path.join("data", filename)

# Create the data directory if it doesn't exist
os.makedirs("data", exist_ok=True)

# Download the file
response = requests.get(url)
if response.status_code == 200:
    with open(filepath, 'wb') as file:
        file.write(response.content)
    print(f"File downloaded successfully: {filepath}")
else:
    print(f"Failed to download the file. Status code: {response.status_code}")

File downloaded successfully: data/attention_paper.pdf


In [5]:
# Create the directories
def create_directories(base_dir):
    directories = ["images", "text", "tables", "page_images"]
    for dir in directories:
        os.makedirs(os.path.join(base_dir, dir), exist_ok=True)

# Process tables
def process_tables(doc, page_num, base_dir, items):
    try:
        tables = tabula.read_pdf(filepath, pages=page_num + 1, multiple_tables=True)
        if not tables:
            return
        for table_idx, table in enumerate(tables):
            table_text = "\n".join([" | ".join(map(str, row)) for row in table.values])
            table_file_name = f"{base_dir}/tables/{os.path.basename(filepath)}_table_{page_num}_{table_idx}.txt"
            with open(table_file_name, 'w') as f:
                f.write(table_text)
            items.append({"page": page_num, "type": "table", "text": table_text, "path": table_file_name})
    except Exception as e:
        print(f"Error extracting tables from page {page_num}: {str(e)}")

# Process text chunks
def process_text_chunks(text, text_splitter, page_num, base_dir, items):
    chunks = text_splitter.split_text(text)
    for i, chunk in enumerate(chunks):
        text_file_name = f"{base_dir}/text/{os.path.basename(filepath)}_text_{page_num}_{i}.txt"
        with open(text_file_name, 'w') as f:
            f.write(chunk)
        items.append({"page": page_num, "type": "text", "text": chunk, "path": text_file_name})

# Process images
def process_images(page, page_num, base_dir, items):
    images = page.get_images()
    for idx, image in enumerate(images):
        xref = image[0]
        pix = pymupdf.Pixmap(doc, xref)
        image_name = f"{base_dir}/images/{os.path.basename(filepath)}_image_{page_num}_{idx}_{xref}.png"
        pix.save(image_name)
        with open(image_name, 'rb') as f:
            encoded_image = base64.b64encode(f.read()).decode('utf8')
        items.append({"page": page_num, "type": "image", "path": image_name, "image": encoded_image})

# Process page images
def process_page_images(page, page_num, base_dir, items):
    pix = page.get_pixmap()
    page_path = os.path.join(base_dir, f"page_images/page_{page_num:03d}.png")
    pix.save(page_path)
    with open(page_path, 'rb') as f:
        page_image = base64.b64encode(f.read()).decode('utf8')
    items.append({"page": page_num, "type": "page", "path": page_path, "image": page_image})

In [6]:
doc = pymupdf.open(filepath)
num_pages = len(doc)
base_dir = "data"

# Creating the directories
create_directories(base_dir)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=200, length_function=len)
items = []

# Process each page of the PDF
for page_num in tqdm(range(num_pages), desc="Processing PDF pages"):
    page = doc[page_num]
    text = page.get_text()
    process_tables(doc, page_num, base_dir, items)
    process_text_chunks(text, text_splitter, page_num, base_dir, items)
    process_images(page, page_num, base_dir, items)
    process_page_images(page, page_num, base_dir, items)

Jul 01, 2025 6:45:52 PM org.apache.pdfbox.pdmodel.font.FileSystemFontProvider <init>
Jul 01, 2025 6:45:53 PM org.apache.pdfbox.pdmodel.font.FileSystemFontProvider <init>
Jul 01, 2025 6:45:53 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Jul 01, 2025 6:45:53 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>

Jul 01, 2025 6:45:58 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>

Jul 01, 2025 6:46:06 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>

Jul 01, 2025 6:46:18 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Jul 01, 2025 6:46:18 PM org.apache.pdfbox.pdmodel.font.PDSimpleFont toUnicode

Jul 01, 2025 6:46:24 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>

Jul 01, 2025 6:46:28 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>

Jul 01, 2025 6:46:31 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>

Jul 01, 2025 6:46:35 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>

Jul 01, 2025 6:46:40 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>

Jul

In [7]:
# Looking at the first text item
[i for i in items if i['type'] == 'text'][0]

{'page': 0,
 'type': 'text',
 'text': 'Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗†\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz Kaiser∗\nGoogle Brain\nlukaszkaiser@google.com\nIllia Polosukhin∗‡\nillia.polosukhin@gmail.com\nAbstract\nThe dominant sequence transduction models are based on complex recurrent or',
 'path': 'data/text/attention_paper.pdf_text_0_0.txt'}

In [8]:
# Looking at the first table item
[i for i in items if i['type'] == 'table'][0]

{'page': 5,
 'type': 'table',
 'text': 'nan | nan | Operations | nan\nSelf-Attention | O(n2 · d) | O(1) | O(1)\nRecurrent | O(n · d2) | O(n) | O(n)\nConvolutional | O(k · n · d2) | O(1) | O(logk(n))\nSelf-Attention (restricted) | O(r · n · d) | O(1) | O(n/r)',
 'path': 'data/tables/attention_paper.pdf_table_5_0.txt'}

In [9]:
# Generating Multimodal Embeddings using Amazon Titan Multimodal Embeddings model
import getpass

# Prompt for credentials securely using getpass
aws_access_key_id = getpass.getpass("Enter your AWS Access Key ID: ")
aws_secret_access_key = getpass.getpass("Enter your AWS Secret Access Key: ")
aws_region = input("Enter your AWS Region (e.g., us-east-1): ")

# Set the credentials as environment variables for the current session
os.environ["AWS_ACCESS_KEY_ID"] = aws_access_key_id
os.environ["AWS_SECRET_ACCESS_KEY"] = aws_secret_access_key
os.environ["AWS_REGION"] = aws_region

def generate_multimodal_embeddings(prompt=None, image=None, output_embedding_length=384):
    """
    Invoke the Amazon Titan Multimodal Embeddings model using Amazon Bedrock runtime.

    Args:
        prompt (str): The text prompt to provide to the model.
        image (str): A base64-encoded image data.
    Returns:
        str: The model's response embedding.
    """
    if not prompt and not image:
        raise ValueError("Please provide either a text prompt, base64 image, or both as input")

    # Initialize the Amazon Bedrock runtime client
    client = boto3.client(service_name="bedrock-runtime", region_name="ap-south-1")
    model_id = "amazon.titan-embed-image-v1"

    body = {"embeddingConfig": {"outputEmbeddingLength": output_embedding_length}}

    if prompt:
        body["inputText"] = prompt
    if image:
        body["inputImage"] = image

    try:
        response = client.invoke_model(
            modelId=model_id,
            body=json.dumps(body),
            accept="application/json",
            contentType="application/json"
        )

        # Process and return the response
        result = json.loads(response.get("body").read())
        return result.get("embedding")

    except ClientError as err:
        print(f"Couldn't invoke Titan embedding model. Error: {err.response['Error']['Message']}")
        return None

Enter your AWS Access Key ID: ··········
Enter your AWS Secret Access Key: ··········
Enter your AWS Region (e.g., us-east-1): ap-south-1


In [10]:
# Set embedding vector dimension
embedding_vector_dimension = 384

# Count the number of each type of item
item_counts = {
    'text': sum(1 for item in items if item['type'] == 'text'),
    'table': sum(1 for item in items if item['type'] == 'table'),
    'image': sum(1 for item in items if item['type'] == 'image'),
    'page': sum(1 for item in items if item['type'] == 'page')
}

# Initialize counters
counters = dict.fromkeys(item_counts.keys(), 0)

# Generate embeddings for all items
with tqdm(
    total=len(items),
    desc="Generating embeddings",
    bar_format=(
        "{l_bar}{bar}| {n_fmt}/{total_fmt} "
        "[{elapsed}<{remaining}, {rate_fmt}{postfix}]"
    )
) as pbar:

    for item in items:
        item_type = item['type']
        counters[item_type] += 1

        if item_type in ['text', 'table']:
            # For text or table, use the formatted text representation
            item['embedding'] = generate_multimodal_embeddings(prompt=item['text'],output_embedding_length=embedding_vector_dimension)
        else:
            # For images, use the base64-encoded image data
            item['embedding'] = generate_multimodal_embeddings(image=item['image'], output_embedding_length=embedding_vector_dimension)

        # Update the progress bar
        pbar.set_postfix_str(f"Text: {counters['text']}/{item_counts['text']}, Table: {counters['table']}/{item_counts['table']}, Image: {counters['image']}/{item_counts['image']}")
        pbar.update(1)

Generating embeddings: 100%|██████████| 108/108 [01:40<00:00,  1.08it/s, Text: 83/83, Table: 7/7, Image: 3/3]


In [11]:
# All the embeddings
all_embeddings = np.array([item['embedding'] for item in items])

print("A single embedding:", items[0]['embedding'])

# Create FAISS Index
index = faiss.IndexFlatL2(embedding_vector_dimension)

# Clear any pre-existing index
index.reset()

# Add embeddings to the index
index.add(np.array(all_embeddings, dtype=np.float32))

A single embedding: [0.003961269, 0.011774781, -0.012865039, -0.0016172152, 0.08373178, -0.032271624, 0.03169015, 0.029800372, 0.06715986, -0.019043164, -0.07617266, -0.020932944, 0.091872364, 0.0058873906, -0.06047295, 0.047680594, -0.06803207, -0.031399414, -0.025730077, 0.08140589, -0.055239715, 0.02238662, -0.019043164, -0.03808633, 0.010248421, -0.0014445912, -0.01271967, -0.040993683, -0.051460154, 0.0012810526, -0.00872206, 0.07908001, -0.012501619, 0.09536119, -0.016063128, 0.00795888, 0.017008018, -0.0242764, -0.013955296, -0.027183754, -0.021514414, -0.053204566, -0.018461693, -0.047099125, 0.0121382, 0.045354713, -0.020932944, 0.019188533, -0.08373178, 0.011265994, -0.13199385, 0.003470653, -0.0633803, -0.012428936, -0.051750887, 0.004906159, -0.0023803955, -0.09768707, -0.06425251, -0.02180515, -0.0034524822, 0.036487285, -0.018534377, -0.05058795, 0.080242954, -0.021950517, -0.04244736, -0.049715742, 0.0054512876, 0.03241699, -0.029509636, 0.08838354, -0.034452137, 0.06250

In [12]:
!pip install -U langchain-google-genai Pillow

Collecting langchain-google-genai
  Downloading langchain_google_genai-2.1.6-py3-none-any.whl.metadata (7.0 kB)
Collecting Pillow
  Downloading pillow-11.3.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (9.0 kB)
Collecting filetype<2.0.0,>=1.2.0 (from langchain-google-genai)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting google-ai-generativelanguage<0.7.0,>=0.6.18 (from langchain-google-genai)
  Downloading google_ai_generativelanguage-0.6.18-py3-none-any.whl.metadata (9.8 kB)
Downloading langchain_google_genai-2.1.6-py3-none-any.whl (47 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.4/47.4 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pillow-11.3.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (6.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m56.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading filetype-1.2.0-py2.py3-none-any.whl (19 kB

In [13]:
import getpass
import os
import base64
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage

def invoke_rag_model(prompt, matched_items):
    """
    Invokes the Google Gemini multimodal model for a RAG task.
    """
    # Securely get your Google API key if it's not already set
    if "GOOGLE_API_KEY" not in os.environ:
        os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google API Key: ")

    # 1. Initialize the ChatGoogleGenerativeAI client with a multimodal model
    llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash")

    # 2. Construct the message content for multimodal input
    # The format is a list of dictionaries, one for each part (text or image).
    message_content = [
        {"type": "text", "text": "Context: You are a helpful assistant for question answering. The following text and images are relevant information that has been retrieved to help you answer the user's question."}
    ]

    # Add the retrieved context (text and images)
    for item in matched_items:
        if item.get('type') in ['text', 'table']:
            message_content.append({"type": "text", "text": item.get('text', '')})
        elif item.get('type') == 'image':
            # Gemini can take raw image bytes, so no base64 encoding is needed
            message_content.append({
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{base64.b64encode(item.get('image')).decode('utf-8')}"}
            })

    # Add the final user prompt
    message_content.append({"type": "text", "text": f"User Question: {prompt}"})

    # 3. Create a single HumanMessage with the multimodal content
    messages = [
        HumanMessage(content=message_content)
    ]

    # 4. Invoke the model
    response = llm.invoke(messages)

    return response.content

In [14]:
# User Query
query = "How is it different from prior art. Describe its novelty, use cases.. in detail in layman terms"

# Generate embeddings for the query
query_embedding = generate_multimodal_embeddings(prompt=query,output_embedding_length=embedding_vector_dimension)

# Search for the nearest neighbors in the vector database
distances, result = index.search(np.array(query_embedding, dtype=np.float32).reshape(1,-1), k=5)

In [15]:
# Check the result (matched chunks)
result.flatten()
distances

array([[0.9509854 , 0.95326436, 0.9546226 , 0.9989011 , 1.0007919 ]],
      dtype=float32)

In [16]:
# Retrieve the matched items
matched_items = [{k: v for k, v in items[index].items() if k != 'embedding'} for index in result.flatten()]

# Generate RAG response with Amazon Nova
response = invoke_rag_model(query, matched_items)

display.Markdown(response)

Enter your Google API Key: ··········


The Transformer, unlike previous neural sequence transduction models, is the first to rely *entirely* on self-attention to process input and output sequences.  Prior models typically used recurrent neural networks (RNNs) or convolutions, which process data sequentially (one element after another).  This sequential processing limits the ability to parallelize computations, making training slower.

**Novelty:**

The Transformer's core novelty is its use of **self-attention**.  Imagine you're summarizing a sentence.  Instead of reading the words one by one, self-attention lets the model consider *all* words simultaneously to understand their relationships and importance.  It calculates how much each word "pays attention" to every other word in the sentence, determining the context and meaning of each. This allows the model to capture long-range dependencies between words much more effectively than RNNs, which struggle with very long sentences.  This is a key advantage.

Another novelty is the model's architecture.  It's an encoder-decoder structure, like many before it, but the *implementation* of the encoder and decoder is entirely based on self-attention, discarding the sequential processing of RNNs or convolutions.  The paper also introduces multi-head attention (looking at relationships from different perspectives) and a parameter-free positional encoding (telling the model the order of words without needing extra parameters).

**Use Cases:**

Because of its speed and ability to handle long sequences, the Transformer architecture has revolutionized many natural language processing tasks.  Examples include:

* **Machine Translation:**  Translating text from one language to another more accurately and efficiently.
* **Text Summarization:**  Generating concise summaries of longer texts.
* **Question Answering:**  Answering questions based on given text passages.
* **Text Generation:**  Creating coherent and contextually relevant text, such as writing stories or articles.
* **Chatbots:** Building more natural and engaging conversational AI systems.


In layman's terms, the Transformer is like having a super-powered brain for understanding language.  Instead of reading a sentence word-by-word, it looks at all the words at once, instantly grasping the relationships between them. This makes it much faster and better at understanding complex sentences and generating human-quality text.  This has led to significant improvements in many applications that involve processing and generating human language.