In [6]:
import sys 
import base64 
import os 
import uuid 
import re

from typing import List, Tuple

from IPython.display import display, Image, Markdown

from langchain.prompts import PromptTemplate
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore

from langchain_community.vectorstores import Chroma

from langchain_core.documents import Document
from langchain_core.runnables import RunnableLambda, RunnablePassthrough

from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.output_parsers import StrOutputParser

from langchain_text_splitters import CharacterTextSplitter

from langchain_google_vertexai import (
    VertexAI,
    ChatVertexAI,
    VertexAIEmbeddings,
    VectorSearchVectorStore,
)

from unstructured.partition.pdf import partition_pdf
import pprint 

In [7]:
PROJECT_ID = "evolution-u"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

# For Vector Search Staging
GCS_BUCKET = "udays_experiments_evolu_ai"  # @param {type:"string"}
GCS_BUCKET_URI = f"gs://{GCS_BUCKET}"

In [30]:
GCS_BUCKET_SINGLE = "udays_experiments_evolu_ai_central" 
GCS_BUCKET_SINGLE_URI = f"gs://{GCS_BUCKET_SINGLE}"

In [8]:
# Initialize the vertex Ai SDK
from google.cloud import aiplatform

aiplatform.init(project=PROJECT_ID, location=LOCATION, staging_bucket=GCS_BUCKET_URI)


In [9]:
# Copy the test data used for this learning 
# Download documents and images used in this notebook
!gsutil -m rsync -r gs://github-repo/rag/intro_multimodal_rag/ .
print("Download completed")

Building synchronization state...
Starting synchronization...
Download completed


In [10]:
!gsutil ls gs://github-repo/rag/intro_multimodal_rag/data

gs://github-repo/rag/intro_multimodal_rag/data/
gs://github-repo/rag/intro_multimodal_rag/data/Google Cloud TPU blog.pdf
gs://github-repo/rag/intro_multimodal_rag/data/gemini_v1_5_report_technical.pdf
gs://github-repo/rag/intro_multimodal_rag/data/gemma_technical_paper.pdf
gs://github-repo/rag/intro_multimodal_rag/data/med_gemini.pdf


In [11]:
# Partition PDF, Tables , text and images 
# The documents used are Google10k - Provides overview of company's financial performance  - Alphabet 
# Currently used = Google10k.pdf  but only 14 pages -  Truncated

In [12]:
pdf_folder_path = "data/"
pdf_file_name = "google-10k-sample-14pages.pdf"

In [13]:
# Extract tables, images and chunkk text from the PDF 
raw_pdf_elements = partition_pdf(
    filename=pdf_folder_path + pdf_file_name,
    extract_images_in_pdf=False,
    infer_table_structure=True,
    chunking_strategy='by_title',
    max_characters = 4000,
    new_after_n_chars = 3800,
    combine_text_under_n_chars = 2000,
    image_output_dir_path = pdf_folder_path,
)

# Categorize extracted elements from PDF --> Tables and texts
tables = []
texts = []
for element in raw_pdf_elements:
    if "unstructured.documents.elements.Table" in str(type(element)):
        tables.append(str(element))
    elif "unstructured.documents.elements.CompositeElement" in str(type(element)):
        texts.append(str(element))

# Print tables and texts
print("Length of tables: ", len(tables))    
print("Length of texts: ", len(texts))

# Optional : Enforce a specific token size for texts      # Tiktoken encoder  and hugging face token encoder
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=10000, chunk_overlap=0)  

joined_texts = " ".join(texts)
texts_4k_token = text_splitter.split_text(joined_texts)

2024-07-21 21:51:48.691492: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-21 21:51:49.641537: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.

Length of tables:  2
Length of texts:  8


In [14]:
print("Length of texts after enforcing token size: ", len(texts_4k_token[0]))

Length of texts after enforcing token size:  22053


In [15]:
MODEL_NAME = "gemini-pro-vision" #"gemini-1.0-pro-vision"

In [16]:
# Generate summaries of text elements
def generate_text_summaries(
    texts:List[str], tables: List[str], summarize_texts: bool= False
) -> Tuple[List, List]:
    """Summarize the text elements

    Args:
        texts (List[str]): _description_
        tables (List[str]): _description_
        summarize_texts (bool, optional): _description_. Defaults to False.

    Returns:
        Tuple[List, List]: _description_
    """
    
    # Prompt Text 
    prompt_text = """You are an assistant tasked with summarizing tables and text for retrieval. \
    These summaries will be embedded and used to retrieve the raw text or table elements. \
    Give a concise summary of the table or text that is well optimized for retrieval. Table or text: {element} """
    
    prompt = PromptTemplate.from_template(prompt_text)
    empty_response = RunnableLambda( lambda x: AIMessage(content="Error Processing Document"))
    
    # Text summary chain 
    model = VertexAI(temperature=0, model_name=MODEL_NAME, max_output_tokens=1024
    ).with_fallbacks([empty_response])
    
    summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()
    
    # Initialize Empty summary 
    text_summaries = []
    table_summaries = []
    
    # Apply to text if texts are provided and summarization is requested
    if texts:
        if summarize_texts:
            text_summaries = summarize_chain.batch(texts, {"max_concurrency": 1})
        else:
            text_summaries = texts
            
    # Apply to tables if tables are provided
    if tables:
        table_summaries = summarize_chain.batch(tables, {"max_concurrency": 1})
    
    return text_summaries, table_summaries 

# Get text, table summaries

text_summaries, table_summaries = generate_text_summaries(
    texts_4k_token, tables, summarize_texts=True
)



In [17]:
print(table_summaries)

['Error Processing Document', 'Error Processing Document']


In [18]:
def encode_image(image_path):
    ''' Getting the base64 string'''
    with open(image_path, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read()).decode("utf-8")    
        return encoded_string

# Summarize images 
def image_summarize(img_base64,prompt):
    """ Make Summary of the image"""
    model = ChatVertexAI(model_name = "gemini-pro-vision", max_output_tokens=1024)
    
    msg = model (
        [
            HumanMessage(
                content = [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png; base64;{img_base64}"}, 
                    },
                ]
            )
        ]
    )
    
    return msg.content 

def generate_img_summaries(path):
    """Generate summaries and base64 encoded strings for images 
    path: Path to list of .jpg files extracted by Unstructured

    Args:
        path (_type_): _description_
    """
    
    # Store base64 encoded images 
    img_base64_list = []
    
    # Store image summaries
    image_summaries = []
    
    # Prompt for image summary 
    prompt = """ You are an assistant tasked with summarizing images for retrieval. \
        These summaries will be embedded and used to retrieve the raw image elements. \
        Give a concise summary of the image that is well optimized for retrieval. \
        If it's a table, extract all elements of the table and also summarize them.
        If it's a Graph, explain the findings in the graph.
        Do not  include any numbers that are not mentioned in the image """
    
    # Apply to images
    for img_file in sorted(os.listdir(path)):
        if img_file.endswith(".jpg") or img_file.endswith(".png"):
            img_path = os.path.join(path, img_file)
            base64_image = encode_image(img_path)
            img_base64_list.append(base64_image)
            image_summaries.append(image_summarize(base64_image, prompt))
        
    return img_base64_list, image_summaries

# Image Summaries
img_base64_list, image_summaries = generate_img_summaries(".")


### Vector Search Index & Endpoint Setup 

In [21]:
DIMENSIONS = 768
 
index = aiplatform.MatchingEngineIndex.create_tree_ah_index(display_name="mm_rag_langchain_index",
                                                            dimensions=DIMENSIONS,
                                                            approximate_neighbors_count=150,
                                                            leaf_node_embedding_count=100,
                                                            leaf_nodes_to_search_percent=7,
                                                            description="Multimodal RAG Index - Langchain ")

Creating MatchingEngineIndex
Create MatchingEngineIndex backing LRO: projects/1055298358924/locations/us-central1/indexes/7460366714367115264/operations/1407317468430467072
MatchingEngineIndex created. Resource name: projects/1055298358924/locations/us-central1/indexes/7460366714367115264
To use this MatchingEngineIndex in another session:
index = aiplatform.MatchingEngineIndex('projects/1055298358924/locations/us-central1/indexes/7460366714367115264')


In [22]:
DEPLOYED_INDEX_ID = "mm_rag_langchain_index_endpoint"

index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name=DEPLOYED_INDEX_ID,
    description="Multimodal RAG Index Endpoint - Langchain",
    public_endpoint_enabled=True)

Creating MatchingEngineIndexEndpoint
Create MatchingEngineIndexEndpoint backing LRO: projects/1055298358924/locations/us-central1/indexEndpoints/5276261632580780032/operations/1129220191440338944
MatchingEngineIndexEndpoint created. Resource name: projects/1055298358924/locations/us-central1/indexEndpoints/5276261632580780032
To use this MatchingEngineIndexEndpoint in another session:
index_endpoint = aiplatform.MatchingEngineIndexEndpoint('projects/1055298358924/locations/us-central1/indexEndpoints/5276261632580780032')


In [23]:
# Deploy index to Index endpoint

index_endpoint = index_endpoint.deploy_index(index = index, deployed_index_id= "mm_rag_langchain_index_endpoint")

index_endpoint.deployed_indexes

Deploying index MatchingEngineIndexEndpoint index_endpoint: projects/1055298358924/locations/us-central1/indexEndpoints/5276261632580780032
Deploy index MatchingEngineIndexEndpoint index_endpoint backing LRO: projects/1055298358924/locations/us-central1/indexEndpoints/5276261632580780032/operations/496464443794784256
MatchingEngineIndexEndpoint index_endpoint Deployed index. Resource name: projects/1055298358924/locations/us-central1/indexEndpoints/5276261632580780032


[id: "mm_rag_langchain_index_endpoint"
index: "projects/1055298358924/locations/us-central1/indexes/7460366714367115264"
create_time {
  seconds: 1721613517
  nanos: 753197000
}
index_sync_time {
  seconds: 1721614505
  nanos: 668128000
}
automatic_resources {
  min_replica_count: 2
  max_replica_count: 2
}
deployment_group: "default"
]

In [31]:
## Create retriever & load documents
### VectorSearchVectorStore with Vector Search Index ID and Endpoint ID 
### textembedding --> Gecko model

vectorstore = VectorSearchVectorStore.from_components(
    project_id = PROJECT_ID,
    region = LOCATION,
    gcs_bucket_name = GCS_BUCKET_SINGLE,
    index_id = index.name,
    endpoint_id = index_endpoint.name,
    embedding = VertexAIEmbeddings(model_name="textembedding-gecko@003"),
)

In [32]:
### Create Multi-Vector Retriever using the vector store 
### Since vector stores  -> Contain the embedding and an ID, Also need to create a document store indexed by ID --> Get original source documents after searching for embeddings

docstore = InMemoryStore()

id_key = "doc_id"

# Create the multi vector retriever
retriever_multi_vector_img = MultiVectorRetriever(vectorstore=vectorstore, docstore=docstore, id_key=id_key)

In [33]:
# Load data --> Documents and vector stores 

doc_contents = texts + tables  + img_base64_list

doc_ids = [str(uuid.uuid4()) for _ in range(len(doc_contents))]

summary_docs = [
    Document(page_content=s, metadata={id_key: doc_ids[i]})
    for i, s in enumerate(text_summaries + table_summaries + image_summaries)
]

retriever_multi_vector_img.docstore.mset(list(zip(doc_ids, doc_contents)))

retriever_multi_vector_img.vectorstore.add_documents(summary_docs)

Updating MatchingEngineIndex index: projects/1055298358924/locations/us-central1/indexes/7460366714367115264
Update MatchingEngineIndex index backing LRO: projects/1055298358924/locations/us-central1/indexes/7460366714367115264/operations/8513997680421109760
MatchingEngineIndex index Updated. Resource name: projects/1055298358924/locations/us-central1/indexes/7460366714367115264


['f0056e7a-3541-4d72-9ada-e817e63d79cd',
 '6bbb1a62-788c-4291-8ba0-0efacffe07ac',
 'a33542fc-d82f-4931-a8a6-4fc334a0ae7f']

## Create Chain with Retriever and Gemini LLM


In [34]:
def looks_like_base64(sb):
    """Check if the string looks like base64"""
    return re.match("^[A-Za-z0-9+/]+[=]{0,2}$", sb) is not None

In [35]:
def is_image_data(b64data):
    """
    Check if the base64 data is an image by looking at the start of the data
    """
    image_signatures = {
        b"\xFF\xD8\xFF": "jpg",
        b"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A": "png",
        b"\x47\x49\x46\x38": "gif",
        b"\x52\x49\x46\x46": "webp",
    }
    try:
        header = base64.b64decode(b64data)[:8]  # Decode and get the first 8 bytes
        for sig, format in image_signatures.items():
            if header.startswith(sig):
                return True
        return False
    except Exception:
        return False

In [36]:
def split_image_text_types(docs):
    """
    Split base64-encoded images and texts
    """
    b64_images = []
    texts = []
    for doc in docs:
        # Check if the document is of type Document and extract page_content if so 
        if isinstance(doc, Document):
            doc = doc.page_content
        if looks_like_base64(doc) and is_image_data(doc):
            b64_images.append(doc)
        else:
            texts.append(doc)
    return {"images": b64_images, "texts": texts}



In [59]:
import pprint

In [66]:
# def img_prompt_func(data_dict):
#     """
#     Join the  context into a single string
#     """
#     formatted_texts = "\n".join(data_dict["context"]["texts"])
    
#     messages = [
#         {
#             "type": "text",
#             "text": (
#                 "You are financial analyst tasking with providing evidence",
#                 "You will be given a  mix of text , tables and image(s) usually of charts or graphs \n ",
#                 "Use this information to provide investment advice related to the user's question \n",
#                 f"User-provided question: {data_dict['question']}\n",
#                 "Text and /or tables :\n",
#                 f"{formatted_texts}\n",
#             )
#         }
#     ]
    
#     # Adding images to messages if present 
#     if data_dict["context"]["images"]:
#         for image in data_dict["context"]["images"]:
#             messages.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image}"}, 
#                              })
            
#         return [HumanMessage(content=messages)]
    
    
def img_prompt_func(data_dict):
    """
    Create a structured prompt from the context and question for the multimodal model.
    """
    formatted_texts = "\n".join(data_dict["context"]["texts"])
    
    # Construct the main message content
    content = (
        "You are a financial analyst tasked with providing evidence-based investment advice. "
        "You will be given a mix of text, tables, and images (usually of charts or graphs).\n\n"
        f"User-provided question: {data_dict['question']}\n\n"
        "Text and/or tables:\n"
        f"{formatted_texts}\n"
    )
    
    # Initialize the messages list with the main text message
    messages = [HumanMessage(content=content)]
    
    # Add images to messages if present
    if data_dict["context"]["images"]:
        for image in data_dict["context"]["images"]:
            messages.append(HumanMessage(content=f"![chart](data:image/jpeg;base64,{image})"))
    
    # Debug print to check the output
    pprint.pprint(messages)
    
    return messages

In [67]:
# Create RAG Chain 

chain_multimodal_rag = (
    {
        "context": retriever_multi_vector_img | RunnableLambda(split_image_text_types),
        "question": RunnablePassthrough(),
    }
    | RunnableLambda(lambda x: (print(f"Data before img_prompt_func: {x}"), img_prompt_func(x))[1])
    | ChatVertexAI(temperature=0, model_name="gemini-pro-vision", max_output_tokens=1024)  # Multi-modal LLM - Gemini Pro vision
    | RunnableLambda(lambda x: (print(f"Response from model: {x}"), x)[1])
    | StrOutputParser()
)

## Process user query


In [68]:
query = """
 - What are the critical difference between various graphs for Class A Share?
 - Which index best matches Class A share performance closely where Google is not already a part? Explain the reasoning.
 - Identify key chart patterns for Google Class A shares.
 - What is cost of revenues, operating expenses and net income for 2020. Do mention the percentage change
 - What was the effect of Covid in the 2020 financial year?
 - What are the total revenues for APAC and USA for 2021?
 - What is deferred income taxes?
 - How do you compute net income per share?
 - What drove percentage change in the consolidated revenue and cost of revenue for the year 2021 and was there any effect of Covid?
 - What is the cause of 41% increase in revenue from 2020 to 2021 and how much is dollar change?
"""

In [69]:
query_ques = {"question": query}

In [70]:
# List of source documents
docs = retriever_multi_vector_img.get_relevant_documents(query, limit=10)

source_docs = split_image_text_types(docs)

print(source_docs["texts"])

for i in source_docs["images"]:
    display(Image(base64.b64decode(i)))

["Stock Performance Graphs\n\nThe graph below matches Alphabet Inc. Class A's cumulative 5-year total stockholder return on common stock with the cumulative total returns of the S&P 500 index, the NASDAQ Composite index, and the RDG Internet Composite index. The graph tracks the performance of a $100 investment in our common stock and in each index (with the reinvestment of all dividends) from December 31, 2016 to December 31, 2021. The returns shown are based on historical results and are not intended to suggest future performance.\n\nCOMPARISON OF CUMULATIVE 5-YEAR TOTAL RETURN* ALPHABET INC. CLASS A COMMON STOCK Among Alphabet Inc., the S&P 500 Index, the NASDAQ Composite Index, and the RDG Internet Composite Index $350 $250 $150 $100 BP FSH gg g% gy oF p% gh J gph HK KML LC LS ——— Alphabet Inc. Class A ——— S&P 500 ——— RDG Internet Composite NASDAQ Composite *$100 invested on December 31, 2016 in stock or index, including reinvestment of dividends. Fiscal year ending December 31. Co

In [71]:
# Get generative responses

result = chain_multimodal_rag.invoke(query)

Markdown(result)

Data before img_prompt_func: {'context': {'images': [], 'texts': ["Stock Performance Graphs\n\nThe graph below matches Alphabet Inc. Class A's cumulative 5-year total stockholder return on common stock with the cumulative total returns of the S&P 500 index, the NASDAQ Composite index, and the RDG Internet Composite index. The graph tracks the performance of a $100 investment in our common stock and in each index (with the reinvestment of all dividends) from December 31, 2016 to December 31, 2021. The returns shown are based on historical results and are not intended to suggest future performance.\n\nCOMPARISON OF CUMULATIVE 5-YEAR TOTAL RETURN* ALPHABET INC. CLASS A COMMON STOCK Among Alphabet Inc., the S&P 500 Index, the NASDAQ Composite Index, and the RDG Internet Composite Index $350 $250 $150 $100 BP FSH gg g% gy oF p% gh J gph HK KML LC LS ——— Alphabet Inc. Class A ——— S&P 500 ——— RDG Internet Composite NASDAQ Composite *$100 invested on December 31, 2016 in stock or index, includ

**1. What are the critical difference between various graphs for Class A Share?**

The provided text does not contain any information on the critical differences between various graphs for Class A shares.

**2. Which index best matches Class A share performance closely where Google is not already a part? Explain the reasoning.**

The provided text does not contain any information on which index best matches Class A share performance closely where Google is not already a part.

**3. Identify key chart patterns for Google Class A shares.**

The provided text does not contain any information on key chart patterns for Google Class A shares.

**4. What is cost of revenues, operating expenses and net income for 2020. Do mention the percentage change**

- Cost of revenues: $84,732 million
- Operating expenses: $56,571 million
- Net income: $40,269 million

**5. What was the effect of Covid in the 2020 financial year?**

The adverse effect of COVID-19 on 2020 advertising revenues contributed to the year-over-year growth in revenues in 2021.

**6. What are the total revenues for APAC and USA for 2021?**

The provided text does not contain information on the total revenues for APAC and USA for 2021.

**7. What is deferred income taxes?**

The provided text does not contain any information on deferred income taxes.

**8. How do you compute net income per share?**

Net income per share is calculated by dividing the net income by the number of outstanding shares.

**9. What drove percentage change in the consolidated revenue and cost of revenue for the year 2021 and was there any effect of Covid?**

- Consolidated revenue increased by 41% in 2021, primarily driven by Google Services and Google Cloud. The adverse effect of COVID-19 on 2020 advertising revenues also contributed to the year-over-year growth.
- Cost of revenues increased by 31% in 2021, primarily driven by increases in TAC and content acquisition costs.

**10. What is the cause of 41% increase in revenue from 2020 to 2021 and how much is dollar change?**

The 41% increase in revenue from 2020 to 2021 was primarily driven by Google Services and Google Cloud. The dollar change in revenue was $75,110 million.