In [None]:
from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title
import json

file_path = "./content/attention.pdf"

# Step 1: Extract all elements without chunking
elements = partition_pdf(
    filename=file_path,
    strategy="hi_res",
    infer_table_structure=True,
    extract_images_in_pdf=True,
    extract_image_block_types=["Image"],
    extract_image_block_to_payload=True
)

# Step 2: Separate tables/images from text elements
image_chunks = [el for el in elements if el.category in [ "Image"]]
table_chunks = [el for el in elements if el.category in ["Table"]]
text_elements = [el for el in elements if el.category not in ["Table", "Image"]]

# Step 3: Apply chunking only to the text elements
chunked_text = chunk_by_title(
    text_elements,
    max_characters=10000,
    combine_text_under_n_chars=2000,
    new_after_n_chars=6000
)              

In [2]:

def get_images_base64(image_chunks):
    images_b64 = []
    for image_chunk in image_chunks:
        images_b64.append(image_chunk.metadata.image_base64)
    return images_b64

b64_images = get_images_base64(image_chunks)
# print(b64_images)

# print(images[0].metadata.image_base64)

In [3]:
from IPython.display import HTML, Markdown, display

In [4]:
import htmltabletomd


def table_html_to_markdown(table_chunks):
    md_tables = []
    for table_chunk in table_chunks:
        md_table = htmltabletomd.convert_table(table_chunk.metadata.text_as_html)
        md_tables.append(md_table)
    return md_tables

md_tables = table_html_to_markdown(table_chunks)


In [None]:
%pip install python-dotenv

In [5]:
from langchain_openai import AzureChatOpenAI
from dotenv import load_dotenv
import os

load_dotenv()

endpoint = os.getenv("ENDPOINT_URL")  
deployment = os.getenv("DEPLOYMENT_NAME")  
subscription_key = os.getenv("AZURE_OPENAI_API_KEY")  

# Initialize Azure OpenAI Service client with key-based authentication    
model = AzureChatOpenAI(  
    azure_endpoint=endpoint,  
    api_key=subscription_key,  
    api_version="2024-05-01-preview",
    model=deployment
)

In [6]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

# Prompt
prompt_text = """
You are an assistant tasked with summarizing tables and text particularly for semantic retrieval.
These summaries will be embedded and used to retrieve the raw text or table elements
Give a detailed summary of the table or text below that is well optimized for retrieval.
For any tables also add in a one line description of what the table is about besides the summary.
Do not add additional words like Summary: etc.
Table or text chunk:
{element}
"""
prompt = ChatPromptTemplate.from_template(prompt_text)

# Summary chain
summarize_chain = (
    {"element": lambda x: x}  
    | prompt
    | model
    | StrOutputParser()  # extracts the response as text and returns it as a string
)

try:
    table_summaries = summarize_chain.batch(md_tables, {"max_concurrency": 5})
    text_summaries = summarize_chain.batch(chunked_text,{"max_concurrency":5})
except Exception as e:
    print(f"Batch processing error: {e}")

In [7]:
from langchain_openai import ChatOpenAI

prompt_template = """Describe the image in detail. For context,
                  the image is part of a research paper explaining the transformers
                  architecture. Be specific about graphs, such as bar plots."""
messages = [
    (
        "user",
        [
            {"type": "text", "text": prompt_template},
            {
                "type": "image_url",
                "image_url": {"url": "data:image/jpeg;base64,{image}"},
            },
        ],
    )
]

prompt = ChatPromptTemplate.from_messages(messages)

chain = prompt | model | StrOutputParser()


image_summaries = chain.batch(b64_images)

In [None]:
import base64
from IPython.display import Image, display

def display_base64_image(base64_image):
    base64_decode = base64.b64decode(base64_image)

    display(Image(data=base64_decode))

display_base64_image(b64_images[0])

In [None]:
print(image_summaries[0])

In [None]:
%pip install uuid

In [8]:
from langchain_openai import AzureOpenAIEmbeddings
from dotenv import load_dotenv
import os

load_dotenv()

endpoint = os.getenv("ENDPOINT_URL")
model_name = os.getenv("EMBEDDING_DEPLOYMENT")
subscription_key=os.getenv("AZURE_OPENAI_API_KEY")

api_version = "2024-12-01-preview"



openai_embed_model = AzureOpenAIEmbeddings(
    model="text-embedding-3-large",
    azure_endpoint=endpoint,
    api_key=subscription_key,
    api_version="2024-05-01-preview",)

In [9]:
from langchain.embeddings import AzureOpenAIEmbeddings

# Sample text to generate embeddings for
test_text = "Hello, world! Testing Azure OpenAI Embeddings."

# Generate the embeddings
try:
    embedding = openai_embed_model.embed_query(test_text)
    print("Embedding generated successfully:")
    # print(embedding[:10])  # Print only the first 10 values for brevity
except Exception as e:
    print(f"Error generating embedding: {e}")


Embedding generated successfully:


In [None]:
%pip install faiss-cpu

In [10]:
import uuid
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain_community.vectorstores import FAISS
from langchain.storage import InMemoryStore
from langchain_core.documents import Document


# Initialize in-memory document store
memory_store = InMemoryStore()

def create_multi_vector_retriever(
    docstore, text_summaries, texts, table_summaries, tables, image_summaries, 
    images
):
    """
    Create retriever that indexes summaries, but returns raw images or texts
    Assumes everything is kept in memory for local testing
    """
    id_key = "doc_id"
    
    # Prepare all summaries for initial FAISS creation
    all_summaries = []
    all_metadata = []
    
    # Helper function to prepare documents
    def prepare_documents(doc_summaries, doc_contents, doc_store):
        doc_ids = [str(uuid.uuid4()) for _ in doc_contents]
        summaries = []
        metadata = []
        
        for i, s in enumerate(doc_summaries):
            summaries.append(s)
            metadata.append({id_key: doc_ids[i]})
            
        # Store the actual documents
        doc_store.mset(list(zip(doc_ids, doc_contents)))
        
        return summaries, metadata
    
    # Prepare all document types if they exist
    if text_summaries:
        text_summaries_list, text_metadata = prepare_documents(text_summaries, texts, docstore)
        all_summaries.extend(text_summaries_list)
        all_metadata.extend(text_metadata)
        
    if table_summaries:
        table_summaries_list, table_metadata = prepare_documents(table_summaries, tables, docstore)
        all_summaries.extend(table_summaries_list)
        all_metadata.extend(table_metadata)
        
    if image_summaries:
        image_summaries_list, image_metadata = prepare_documents(image_summaries, images, docstore)
        all_summaries.extend(image_summaries_list)
        all_metadata.extend(image_metadata)
    
    # Create FAISS index from all prepared summaries
    if all_summaries:
        vectorstore = FAISS.from_texts(
            texts=all_summaries,
            embedding=openai_embed_model,
            metadatas=all_metadata
        )
    else:
        # Create empty FAISS index if no documents
        vectorstore = FAISS.from_texts(
            texts=["placeholder"],
            embedding=openai_embed_model,
            metadatas=[{id_key: "placeholder"}]
        )
    
    # Create the multi-vector retriever
    retriever = MultiVectorRetriever(
        vectorstore=vectorstore,
        docstore=docstore,
        id_key=id_key,
    )
    
    return retriever

# Create the in-memory retriever
retriever_multi_vector = create_multi_vector_retriever(
    memory_store,
    text_summaries,
    chunked_text,
    table_summaries,
    md_tables,
    image_summaries,
    b64_images, 
)

In [11]:
query = "Maximum path lengths, per-layer complexity and minimum number of sequential operations for different layer types."
docs = retriever_multi_vector.invoke(query, limit=5)

# We get 3 docs
len(docs)

4

In [None]:
docs[0]

In [12]:
import re
import base64

def looks_like_base64(sb):
    """Check if the string looks like base64"""
    return re.match("^[A-Za-z0-9+/]+[=]{0,2}$", sb) is not None


def is_image_data(b64data):
    """
    Check if the base64 data is an image by looking at the start of the data
    """
    image_signatures = {
        b"\xff\xd8\xff": "jpg",
        b"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a": "png",
        b"\x47\x49\x46\x38": "gif",
        b"\x52\x49\x46\x46": "webp",
    }
    try:
        header = base64.b64decode(b64data)[:8]  # Decode and get the first 8 bytes
        for sig, format in image_signatures.items():
            if header.startswith(sig):
                return True
        return False
    except Exception:
        return False

In [13]:
# is_image_data(docs[0])
looks_like_base64(docs[0])

False

In [14]:
from IPython.display import Markdown
from unstructured.documents.elements import CompositeElement


retrieved_tables = []
retrieved_images = []
retrieved_texts = []

for doc in docs:
    if type(doc) == CompositeElement:
        retrieved_texts.append(str(doc))
    elif is_image_data(doc) and looks_like_base64(doc):
        retrieved_images.append(doc)
    else:
        retrieved_tables.append(doc) 


In [90]:
print(len(retrieved_texts))

2


In [15]:
for retrieved_text in retrieved_texts:
    display(Markdown(retrieved_text))
for retrieved_table in retrieved_tables:
    display(Markdown(retrieved_table))
for retrieved_image in retrieved_images:
    display_base64_image(retrieved_image)

3.5 Positional Encoding

Since our model contains no recurrence and no convolution, in order for the model to make use of the order of the sequence, we must inject some information about the relative or absolute position of the tokens in the sequence. To this end, we add "positional encodings" to the input embeddings at the

5

Table 1: Maximum path lengths, per-layer complexity and minimum number of sequential operations for different layer types. n is the sequence length, d is the representation dimension, k is the kernel size of convolutions and r the size of the neighborhood in restricted self-attention.

bottoms of the encoder and decoder stacks. The positional encodings have the same dimension dmodel as the embeddings, so that the two can be summed. There are many choices of positional encodings, learned and ﬁxed [8].

In this work, we use sine and cosine functions of different frequencies:

PE(pos,2i) = sin(pos/100002i/dmodel) PE(pos,2i+1) = cos(pos/100002i/dmodel)

where pos is the position and i is the dimension. That is, each dimension of the positional encoding corresponds to a sinusoid. The wavelengths form a geometric progression from 2π to 10000 · 2π. We chose this function because we hypothesized it would allow the model to easily learn to attend by relative positions, since for any ﬁxed offset k, PEpos+k can be represented as a linear function of PEpos.

We also experimented with using learned positional embeddings [8] instead, and found that the two versions produced nearly identical results (see Table 3 row (E)). We chose the sinusoidal version because it may allow the model to extrapolate to sequence lengths longer than the ones encountered during training.

4 Why Self-Attention

In this section we compare various aspects of self-attention layers to the recurrent and convolu- tional layers commonly used for mapping one variable-length sequence of symbol representations (x1,...,xn) to another sequence of equal length (z1,...,zn), with xi,zi ∈ Rd, such as a hidden layer in a typical sequence transduction encoder or decoder. Motivating our use of self-attention we consider three desiderata.

One is the total computational complexity per layer. Another is the amount of computation that can be parallelized, as measured by the minimum number of sequential operations required.

The third is the path length between long-range dependencies in the network. Learning long-range dependencies is a key challenge in many sequence transduction tasks. One key factor affecting the ability to learn such dependencies is the length of the paths forward and backward signals have to traverse in the network. The shorter these paths between any combination of positions in the input and output sequences, the easier it is to learn long-range dependencies [11]. Hence we also compare the maximum path length between any two input and output positions in networks composed of the different layer types.

As noted in Table 1, a self-attention layer connects all positions with a constant number of sequentially executed operations, whereas a recurrent layer requires O(n) sequential operations. In terms of computational complexity, self-attention layers are faster than recurrent layers when the sequence length n is smaller than the representation dimensionality d, which is most often the case with sentence representations used by state-of-the-art models in machine translations, such as word-piece [31] and byte-pair [25] representations. To improve computational performance for tasks involving very long sequences, self-attention could be restricted to considering only a neighborhood of size r in

6

the input sequence centered around the respective output position. This would increase the maximum path length to O(n/r). We plan to investigate this approach further in future work.

A single convolutional layer with kernel width k < n does not connect all pairs of input and output positions. Doing so requires a stack of O(n/k) convolutional layers in the case of contiguous kernels, or O(logk(n)) in the case of dilated convolutions [15], increasing the length of the longest paths between any two positions in the network. Convolutional layers are generally more expensive than recurrent layers, by a factor of k. Separable convolutions [6], however, decrease the complexity considerably, to O(k · n · d + n · d2). Even with k = n, however, the complexity of a separable convolution is equal to the combination of a self-attention layer and a point-wise feed-forward layer, the approach we take in our model.

As side beneﬁt, self-attention could yield more interpretable models. We inspect attention distributions from our models and present and discuss examples in the appendix. Not only do individual attention heads clearly learn to perform different tasks, many appear to exhibit behavior related to the syntactic and semantic structure of the sentences.

2 Background

The goal of reducing sequential computation also forms the foundation of the Extended Neural GPU [20], ByteNet [15] and ConvS2S [8], all of which use convolutional neural networks as basic building block, computing hidden representations in parallel for all input and output positions. In these models, the number of operations required to relate signals from two arbitrary input or output positions grows in the distance between positions, linearly for ConvS2S and logarithmically for ByteNet. This makes it more difﬁcult to learn dependencies between distant positions [11]. In the Transformer this is reduced to a constant number of operations, albeit at the cost of reduced effective resolution due to averaging attention-weighted positions, an effect we counteract with Multi-Head Attention as described in section 3.2.

Self-attention, sometimes called intra-attention is an attention mechanism relating different positions of a single sequence in order to compute a representation of the sequence. Self-attention has been used successfully in a variety of tasks including reading comprehension, abstractive summarization, textual entailment and learning task-independent sentence representations [4, 22, 23, 19].

End-to-end memory networks are based on a recurrent attention mechanism instead of sequence- aligned recurrence and have been shown to perform well on simple-language question answering and language modeling tasks [28].

To the best of our knowledge, however, the Transformer is the ﬁrst transduction model relying entirely on self-attention to compute representations of its input and output without using sequence- aligned RNNs or convolution. In the following sections, we will describe the Transformer, motivate self-attention and discuss its advantages over models such as [14, 15] and [8].

3 Model Architecture

Most competitive neural sequence transduction models have an encoder-decoder structure [5, 2, 29]. Here, the encoder maps an input sequence of symbol representations (x1,...,xn) to a sequence of continuous representations z = (z1,...,zn). Given z, the decoder then generates an output sequence (y1,...,ym) of symbols one element at a time. At each step the model is auto-regressive [9], consuming the previously generated symbols as additional input when generating the next.

The Transformer follows this overall architecture using stacked self-attention and point-wise, fully connected layers for both the encoder and decoder, shown in the left and right halves of Figure 1, respectively.

| Layer Type | Complexity per Layer | Sequential Operations | Maximum Path Length |
| :--- | :--- | :--- | :--- |
| Self-Attention | O(n? - d) | O(1) | O(1) |
| Recurrent | O(n-d?) | O(n) | O(n) |
| Convolutional | O(k-n-d?) | olny | O(logx(n)) |
| Self-Attention (restricted) | O(r-n-d) | ol) | O(n/r) |


| Model | EN-DE | BLEU EN-FR | Training EN-DE | Cost (FLOPs) EN-FR |
| :--- | :--- | :--- | :--- | :--- |
| ByteNet 23.75 |
| Deep-Att + PosUnk |  | 39.2 |  | 1.0 - 107° |
| GNMT + RL Bi] | 24.6 | 39.92 | 2.3-10!9 | 1.4-1070 |
| ConvS2S | 25.16 | 40.46 | 9.6-10'% | 1.5-1070 |
| MoE | 26.03 | 40.56 | 2.0-10'9 | 1.2. 1079 |
| Deep-Att + PosUnk Ensemble |  | 40.4 |  | 8.0 - 107° |
| GNMT + RL Ensemble (BI | 26.30 | 41.16 | 1.8-1079 | 1.1- 1074 |
| ConvS2S Ensemble [8] | 26.36 | 41.29 | 7.7-10!9 | 1.2.10?! |
| Transformer (base model) | 27.3 | 38.1 | 3.3- | 1018 |
| Transformer (big) | 28.4 | 41.0 | 2.3. | 1019 |
