In [1]:
import requests
import base64

import litellm

from google import auth

from langchain_core.messages import HumanMessage
from langchain_google_vertexai import ChatVertexAI

### Setting for examples

In [2]:
PDF = 'https://arxiv.org/pdf/1706.03762'
HTML = 'https://arxiv.org/html/1706.03762'
GS_PDF = f'gs://{auth.default()[1]}/1706.03762.pdf'

YT_VIDEO = 'https://www.youtube.com/watch?v=iDulhoQ2pro'

MODEL = 'gemini-1.5-flash-002'

In [3]:
lang_llm = ChatVertexAI(model=MODEL)

## PDF file - in prompt

In [4]:
pdf_content = base64.b64encode(requests.get(PDF).content).decode("utf-8")

In [5]:
# Langchain model - each message is tuple of role, content

lang_llm.invoke([
    ("user", [{"type": "text", "text": "What is title and who is author of the paper?"}, 
              {"type": "image_url", "image_url": {"url": f"data:application/pdf;base64,{pdf_content}"}}])
])

AIMessage(content="Here's the answer to your question:\n\n**Title:** Attention Is All You Need\n\n**Authors:** Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, and Illia Polosukhin", additional_kwargs={}, response_metadata={'is_blocked': False, 'safety_ratings': [], 'citation_metadata': {'citations': [{'start_index': 48, 'end_index': 190, 'uri': 'https://clarifai.com/clarifai/main/models/moderation-multilingual-text-classification', 'title': '', 'license_': ''}]}, 'usage_metadata': {'prompt_token_count': 3881, 'candidates_token_count': 62, 'total_token_count': 3943, 'cached_content_token_count': 0}, 'finish_reason': 'STOP', 'avg_logprobs': -0.06951698949260096}, id='run-66fdbf2c-0f9b-44b6-a62e-0bbdfa1e444d-0', usage_metadata={'input_tokens': 3881, 'output_tokens': 62, 'total_tokens': 3943})

In [6]:
# Langchain model - each message is subclass of BaseMassage (HumanMessage in this case)

lang_llm.invoke([
    HumanMessage(
        content = [
            {"type": "text", "text": "What is title and who is author of the paper?"}, 
            {"type": "image_url", "image_url": {"url": f"data:application/pdf;base64,{pdf_content}"}}
        ]
    )
])

AIMessage(content='The title of the paper is **Attention Is All You Need**.\n\nThe authors are Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, and Illia Polosukhin.', additional_kwargs={}, response_metadata={'is_blocked': False, 'safety_ratings': [], 'citation_metadata': {'citations': [{'start_index': 70, 'end_index': 197, 'uri': 'https://debuggercafe.com/transformer-neural-network/', 'title': '', 'license_': ''}]}, 'usage_metadata': {'prompt_token_count': 3881, 'candidates_token_count': 58, 'total_token_count': 3939, 'cached_content_token_count': 0}, 'finish_reason': 'STOP', 'avg_logprobs': -0.029677619194162303}, id='run-e5d7eb49-8f4f-4a9d-901e-3e036b87254c-0', usage_metadata={'input_tokens': 3881, 'output_tokens': 58, 'total_tokens': 3939})

In [7]:
# Litellm - list of dictionaries
litellm.completion(
    model=MODEL,
    messages = [
        { "content": [
            {"type": "text", "text": "What is title and who is author of the paper?"}, 
            {"type": "image_url", "image_url": {"url": f"data:application/pdf;base64,{pdf_content}"}}
        ],
         "role": "user"}
    ]
)

ModelResponse(id='chatcmpl-5d484a84-c788-4afc-8468-190e0b91c718', created=1735350009, model='gemini-1.5-flash-002', object='chat.completion', system_fingerprint=None, choices=[Choices(finish_reason='stop', index=0, message=Message(content='The title of the paper is **Attention is All You Need**.\n\nThe authors are Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, and Illia Polosukhin.', role='assistant', tool_calls=None, function_call=None))], usage=Usage(completion_tokens=58, prompt_tokens=3881, total_tokens=3939, completion_tokens_details=None, prompt_tokens_details=None), vertex_ai_grounding_metadata=[], vertex_ai_safety_results=[], vertex_ai_citation_metadata=[{'citations': [{'startIndex': 70, 'endIndex': 197, 'uri': 'https://debuggercafe.com/transformer-neural-network/'}]}])

## PDF - stored in Google Storage Bucket

In [8]:
# Langchain model - each message is tuple of role, content

lang_llm.invoke([
    ("user", [{"type": "text", "text": "What is title and who is author of the paper?"}, 
              {"type": "media", "mime_type": "application/pdf", "file_uri": GS_PDF}])
])

AIMessage(content='The title of the paper is "Attention is All You Need".  The authors are Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, and Illia Polosukhin.', additional_kwargs={}, response_metadata={'is_blocked': False, 'safety_ratings': [], 'citation_metadata': {'citations': [{'start_index': 68, 'end_index': 195, 'uri': 'https://debuggercafe.com/transformer-neural-network/', 'title': '', 'license_': ''}]}, 'usage_metadata': {'prompt_token_count': 3881, 'candidates_token_count': 58, 'total_token_count': 3939, 'cached_content_token_count': 0}, 'finish_reason': 'STOP', 'avg_logprobs': -0.036004362435176455}, id='run-f73446e0-006e-47b1-8623-e9928acffeee-0', usage_metadata={'input_tokens': 3881, 'output_tokens': 58, 'total_tokens': 3939})

In [9]:
# Langchain model - each message is subclass of BaseMassage (HumanMessage in this case)

lang_llm.invoke([
    HumanMessage(
        content = [
            {"type": "text", "text": "What is title and who is author of the paper?"}, 
            {"type": "media", "mime_type": "application/pdf", "file_uri": GS_PDF}
        ]
    )
])

AIMessage(content="Here's the answer to your question:\n\n**Title:** Attention Is All You Need\n\n**Authors:** Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, and Illia Polosukhin", additional_kwargs={}, response_metadata={'is_blocked': False, 'safety_ratings': [], 'citation_metadata': {'citations': [{'start_index': 48, 'end_index': 190, 'uri': 'https://clarifai.com/clarifai/main/models/moderation-multilingual-text-classification', 'title': '', 'license_': ''}]}, 'usage_metadata': {'prompt_token_count': 3881, 'candidates_token_count': 62, 'total_token_count': 3943, 'cached_content_token_count': 0}, 'finish_reason': 'STOP', 'avg_logprobs': -0.07204825647415654}, id='run-aea4ee2c-920f-4d55-8b73-807c7893c164-0', usage_metadata={'input_tokens': 3881, 'output_tokens': 62, 'total_tokens': 3943})

In [10]:
# Litellm - list of dictionaries
# Litellm use file extenstion to guess media_type

litellm.completion(
    model=MODEL,
    messages = [
        { "content": [
            {"type": "text", "text": "What is title and who is author of the paper?"}, 
            {"type": "image_url", "image_url":{"url": GS_PDF}}
        ],
         "role": "user"}
    ]
)

ModelResponse(id='chatcmpl-40f62a45-3d44-4b5c-aca7-446ede529b47', created=1735350019, model='gemini-1.5-flash-002', object='chat.completion', system_fingerprint=None, choices=[Choices(finish_reason='stop', index=0, message=Message(content='The title of the paper is "Attention is All You Need".  The authors are Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, and Illia Polosukhin.', role='assistant', tool_calls=None, function_call=None))], usage=Usage(completion_tokens=58, prompt_tokens=3881, total_tokens=3939, completion_tokens_details=None, prompt_tokens_details=None), vertex_ai_grounding_metadata=[], vertex_ai_safety_results=[], vertex_ai_citation_metadata=[{'citations': [{'startIndex': 68, 'endIndex': 195, 'uri': 'https://debuggercafe.com/transformer-neural-network/'}]}])

## PDF directly from website

Vertex AI can access data from publicly avialable sites (it does follow robots.txt limitations)

In [11]:
# Langchain model - each message is tuple of role, content

lang_llm.invoke([
    ("user", [{"type": "text", "text": "What is title and who is author of the paper?"}, 
              {"type": "media", "mime_type": "application/pdf", "file_uri": PDF}])
])

AIMessage(content="Here's the information you requested from the provided text:\n\n**Title:** Attention Is All You Need\n\n**Authors:** Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, and Illia Polosukhin", additional_kwargs={}, response_metadata={'is_blocked': False, 'safety_ratings': [], 'citation_metadata': {'citations': [{'start_index': 73, 'end_index': 215, 'uri': 'https://clarifai.com/clarifai/main/models/moderation-multilingual-text-classification', 'title': '', 'license_': ''}]}, 'usage_metadata': {'prompt_token_count': 3881, 'candidates_token_count': 65, 'total_token_count': 3946, 'cached_content_token_count': 0}, 'finish_reason': 'STOP', 'avg_logprobs': -0.08089208602905273}, id='run-0dfd8a9c-a9d8-4005-a81b-64aacb08204c-0', usage_metadata={'input_tokens': 3881, 'output_tokens': 65, 'total_tokens': 3946})

In [12]:
# Langchain model - each message is subclass of BaseMassage (HumanMessage in this case)

lang_llm.invoke([
    HumanMessage(
        content = [
            {"type": "text", "text": "What is title and who is author of the paper?"}, 
            {"type": "media", "mime_type": "application/pdf", "file_uri": PDF}
        ]
    )
])

AIMessage(content='The title of the paper is **Attention is All You Need**.\n\nThe authors are Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, and Illia Polosukhin.', additional_kwargs={}, response_metadata={'is_blocked': False, 'safety_ratings': [], 'citation_metadata': {'citations': [{'start_index': 70, 'end_index': 197, 'uri': 'https://debuggercafe.com/transformer-neural-network/', 'title': '', 'license_': ''}]}, 'usage_metadata': {'prompt_token_count': 3881, 'candidates_token_count': 58, 'total_token_count': 3939, 'cached_content_token_count': 0}, 'finish_reason': 'STOP', 'avg_logprobs': -0.03923832548075709}, id='run-089b9840-975d-4fcc-b24d-a9b06df6252d-0', usage_metadata={'input_tokens': 3881, 'output_tokens': 58, 'total_tokens': 3939})

In [13]:
# Litellm - list of dictionaries
# Litellm use file extenstion to guess media_type

litellm.completion(
    model=MODEL,
    messages = [
        { "content": [
            {"type": "text", "text": "What is title and who is author of the paper?"}, 
            {"type": "image_url", "image_url":{"url": PDF}}
        ],
         "role": "user"}
    ]
)

ModelResponse(id='chatcmpl-80efeedd-58b6-4f4e-ada2-259039bd8ff6', created=1735350036, model='gemini-1.5-flash-002', object='chat.completion', system_fingerprint=None, choices=[Choices(finish_reason='stop', index=0, message=Message(content='The title of the paper is **Attention is All You Need**.\n\nThe authors are Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, and Illia Polosukhin.', role='assistant', tool_calls=None, function_call=None))], usage=Usage(completion_tokens=58, prompt_tokens=3881, total_tokens=3939, completion_tokens_details=None, prompt_tokens_details=None), vertex_ai_grounding_metadata=[], vertex_ai_safety_results=[], vertex_ai_citation_metadata=[{'citations': [{'startIndex': 70, 'endIndex': 197, 'uri': 'https://debuggercafe.com/transformer-neural-network/'}]}])

## HTML from public websites

The same applies to HTML (no need for scraping).

In [14]:
# Langchain model - each message is tuple of role, content

lang_llm.invoke([
    ("user", [{"type": "text", "text": "What is title and who is author of the paper?"}, 
              {"type": "media", "mime_type": "text/html", "file_uri": HTML}])
])

AIMessage(content='The title of the paper is **Attention Is All You Need**.  The authors are Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, and Illia Polosukhin.\n', additional_kwargs={}, response_metadata={'is_blocked': False, 'safety_ratings': [], 'citation_metadata': {'citations': [{'start_index': 70, 'end_index': 197, 'uri': 'https://debuggercafe.com/transformer-neural-network/', 'title': '', 'license_': ''}]}, 'usage_metadata': {'prompt_token_count': 209639, 'candidates_token_count': 59, 'total_token_count': 209698, 'cached_content_token_count': 0}, 'finish_reason': 'STOP', 'avg_logprobs': -0.023754556300276418}, id='run-692b7f1d-f30f-4fee-af03-fc741ad6f1f0-0', usage_metadata={'input_tokens': 209639, 'output_tokens': 59, 'total_tokens': 209698})

In [15]:
# Langchain model - each message is subclass of BaseMassage (HumanMessage in this case)

lang_llm.invoke([
    HumanMessage(
        content = [
            {"type": "text", "text": "What is title and who is author of the paper?"}, 
            {"type": "media", "mime_type": "text/html", "file_uri": HTML}
        ]
    )
])

AIMessage(content='The title of the paper is "Attention Is All You Need".  The authors are Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, and Illia Polosukhin.  The footnote indicates that all authors contributed equally, and the order is random.\n', additional_kwargs={}, response_metadata={'is_blocked': False, 'safety_ratings': [], 'citation_metadata': {'citations': [{'start_index': 72, 'end_index': 201, 'uri': 'https://www.atmosera.com/ai/understanding-chatgpt/', 'title': '', 'license_': ''}]}, 'usage_metadata': {'prompt_token_count': 209639, 'candidates_token_count': 75, 'total_token_count': 209714, 'cached_content_token_count': 0}, 'finish_reason': 'STOP', 'avg_logprobs': -0.0865724245707194}, id='run-ef48c76d-2911-487d-8166-863313e8312b-0', usage_metadata={'input_tokens': 209639, 'output_tokens': 75, 'total_tokens': 209714})

In [16]:
# Litellm - list of dictionaries
# Litellm use file extenstion to guess media_type

litellm.completion(
    model=MODEL,
    messages = [
        { "content": [
            {"type": "text", "text": "What is title and who is author of the paper?"}, 
            {"type": "image_url", "image_url":{"url": HTML}}
        ],
         "role": "user"}
    ]
)

ModelResponse(id='chatcmpl-dd2e0c87-4d99-445e-8de7-35d0842011bf', created=1735350060, model='gemini-1.5-flash-002', object='chat.completion', system_fingerprint=None, choices=[Choices(finish_reason='stop', index=0, message=Message(content='The title of the paper is "Attention Is All You Need".  The authors are Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, and Illia Polosukhin.  The footnote indicates equal contribution and that the listing order is random.\n', role='assistant', tool_calls=None, function_call=None))], usage=Usage(completion_tokens=73, prompt_tokens=11, total_tokens=84, completion_tokens_details=None, prompt_tokens_details=None), vertex_ai_grounding_metadata=[], vertex_ai_safety_results=[], vertex_ai_citation_metadata=[{'citations': [{'startIndex': 72, 'endIndex': 201, 'uri': 'https://www.atmosera.com/ai/understanding-chatgpt/'}]}])

## Video from Youtube

In [17]:
# Langchain model - each message is tuple of role, content

lang_llm.invoke([
    ("user", [{"type": "text", "text": "What is title and who is author of the paper?"}, 
              {"type": "media", "mime_type": "video/vnd.youtube.yt", "file_uri": YT_VIDEO}])
])

AIMessage(content='The title of the paper shown is “Attention is All You Need.” The authors are Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, and Illia Polosukhin.', additional_kwargs={}, response_metadata={'is_blocked': False, 'safety_ratings': [], 'citation_metadata': {'citations': [{'start_index': 73, 'end_index': 200, 'uri': 'https://debuggercafe.com/transformer-neural-network/', 'title': '', 'license_': ''}]}, 'usage_metadata': {'prompt_token_count': 479681, 'candidates_token_count': 58, 'total_token_count': 479739, 'cached_content_token_count': 0}, 'finish_reason': 'STOP', 'avg_logprobs': -0.08717664356889396}, id='run-7ce99689-926f-438f-a404-29a9b2e4d568-0', usage_metadata={'input_tokens': 479681, 'output_tokens': 58, 'total_tokens': 479739})

In [18]:
# Langchain model - each message is subclass of BaseMassage (HumanMessage in this case)

lang_llm.invoke([
    HumanMessage(
        content = [
            {"type": "text", "text": "What is title and who is author of the paper?"}, 
            {"type": "media", "mime_type": "video/vnd.youtube.yt", "file_uri": YT_VIDEO}
        ]
    )
])

AIMessage(content='The title of the paper is “Attention is All You Need”.  The authors are Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, and Illia Polosukhin.', additional_kwargs={}, response_metadata={'is_blocked': False, 'safety_ratings': [], 'citation_metadata': {'citations': [{'start_index': 68, 'end_index': 195, 'uri': 'https://debuggercafe.com/transformer-neural-network/', 'title': '', 'license_': ''}]}, 'usage_metadata': {'prompt_token_count': 479681, 'candidates_token_count': 58, 'total_token_count': 479739, 'cached_content_token_count': 0}, 'finish_reason': 'STOP', 'avg_logprobs': -0.042078630677584945}, id='run-2e9d7096-fb12-4d6d-b404-319b18034a18-0', usage_metadata={'input_tokens': 479681, 'output_tokens': 58, 'total_tokens': 479739})

In [19]:
# Litellm - list of dictionaries

litellm.completion(
    model=MODEL,
    messages = [
        { "content": [
            {"type": "text", "text": "What is title and who is author of the paper?"}, 
            {"type": "image_url", "image_url":{"url": YT_VIDEO}}
        ],
         "role": "user"}
    ]
)

ModelResponse(id='chatcmpl-87f281da-7ef4-44a9-84ed-174d0d0bdaa8', created=1735350153, model='gemini-1.5-flash-002', object='chat.completion', system_fingerprint=None, choices=[Choices(finish_reason='stop', index=0, message=Message(content='The title of the YouTube video embedded in the provided HTML is **Attention Is All You Need**.  The author of the YouTube video is **Yannic Kilcher**.  Note that the video\'s description mentions the original paper "Attention is All You Need" by Ashish Vaswani et al., but Yannic Kilcher is the creator of *this specific YouTube video* explaining the paper.\n', role='assistant', tool_calls=None, function_call=None))], usage=Usage(completion_tokens=80, prompt_tokens=11, total_tokens=91, completion_tokens_details=None, prompt_tokens_details=None), vertex_ai_grounding_metadata=[], vertex_ai_safety_results=[], vertex_ai_citation_metadata=[])