In [1]:
import requests
import base64

import litellm

from google import auth

from langchain_core.messages import HumanMessage
from langchain_google_vertexai import ChatVertexAI

### Settings for examples

In [2]:
# Public URL of the paper used by the "PDF in prompt" examples.
PDF = 'https://arxiv.org/pdf/1706.03762'

# google.auth.default() returns (credentials, project_id). project_id is None when no
# default project can be determined, which would silently produce "gs://None/...".
# Fail fast here instead of letting a later model call fail with an obscure error.
PROJECT_ID = auth.default()[1]
if not PROJECT_ID:
    raise RuntimeError(
        'Could not determine a default Google Cloud project; '
        'configure one (e.g. `gcloud config set project <id>`) and re-run this cell.'
    )

# NOTE(review): assumes a bucket named after the project already contains
# 1706.03762.pdf - nothing in this notebook uploads it; confirm before running.
GS_PDF = f'gs://{PROJECT_ID}/1706.03762.pdf'

# Video passed by URL in the YouTube examples.
YT_VIDEO = 'https://www.youtube.com/watch?v=iDulhoQ2pro'

# Model identifier shared by the LangChain and LiteLLM calls.
MODEL = 'gemini-1.5-flash-002'

In [3]:
# LangChain chat model bound to MODEL; reused by every `lang_llm.invoke(...)` example.
# NOTE(review): presumably relies on application-default credentials/project - confirm.
lang_llm = ChatVertexAI(model=MODEL)

## PDF file - in prompt

In [4]:
# Download the paper and base64-encode it so it can be embedded in a data URL.
# The timeout keeps the cell from hanging forever; raise_for_status() ensures an
# HTTP error page is never encoded and sent to the model as if it were the PDF.
pdf_response = requests.get(PDF, timeout=30)
pdf_response.raise_for_status()
pdf_content = base64.b64encode(pdf_response.content).decode("utf-8")

In [5]:
# LangChain, tuple form: each message is a (role, content) pair.
# The PDF travels inline as a base64 data URL inside an "image_url" content part.

lang_llm.invoke(
    [
        (
            "user",
            [
                {"type": "text", "text": "What is title and who is author of the paper?"},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:application/pdf;base64,{pdf_content}"},
                },
            ],
        )
    ]
)

AIMessage(content='The title of the paper is "Attention is All You Need".  The authors are Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, and Illia Polosukhin.', additional_kwargs={}, response_metadata={'is_blocked': False, 'safety_ratings': [], 'citation_metadata': {'citations': [{'start_index': 68, 'end_index': 195, 'uri': 'https://debuggercafe.com/transformer-neural-network/', 'title': '', 'license_': ''}]}, 'usage_metadata': {'prompt_token_count': 3881, 'candidates_token_count': 58, 'total_token_count': 3939, 'cached_content_token_count': 0}, 'finish_reason': 'STOP', 'avg_logprobs': -0.03335425977049203}, id='run-1d4ef3e2-4bb7-42c6-a28b-0488a09d4475-0', usage_metadata={'input_tokens': 3881, 'output_tokens': 58, 'total_tokens': 3939})

In [6]:
# LangChain, message-object form: each message is a BaseMessage subclass
# (HumanMessage here). The PDF is inlined as a base64 data URL.

lang_llm.invoke(
    [
        HumanMessage(
            content=[
                {"type": "text", "text": "What is title and who is author of the paper?"},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:application/pdf;base64,{pdf_content}"},
                },
            ]
        )
    ]
)

AIMessage(content='The title of the paper is **Attention is All You Need**.\n\nThe authors are Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, and Illia Polosukhin.', additional_kwargs={}, response_metadata={'is_blocked': False, 'safety_ratings': [], 'citation_metadata': {'citations': [{'start_index': 70, 'end_index': 197, 'uri': 'https://debuggercafe.com/transformer-neural-network/', 'title': '', 'license_': ''}]}, 'usage_metadata': {'prompt_token_count': 3881, 'candidates_token_count': 58, 'total_token_count': 3939, 'cached_content_token_count': 0}, 'finish_reason': 'STOP', 'avg_logprobs': -0.03665543013605578}, id='run-364d4ba3-fb77-4343-bfb7-c70ca16700e9-0', usage_metadata={'input_tokens': 3881, 'output_tokens': 58, 'total_tokens': 3939})

In [7]:
# LiteLLM: messages are a list of plain dictionaries ("content" + "role").
# The PDF is inlined as a base64 data URL inside an "image_url" content part.
litellm.completion(
    model=MODEL,
    messages=[
        {
            "content": [
                {"type": "text", "text": "What is title and who is author of the paper?"},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:application/pdf;base64,{pdf_content}"},
                },
            ],
            "role": "user",
        }
    ],
)

ModelResponse(id='chatcmpl-4b120dec-3a48-46df-a8d0-d45c955ef037', created=1734647317, model='gemini-1.5-flash-002', object='chat.completion', system_fingerprint=None, choices=[Choices(finish_reason='stop', index=0, message=Message(content='The title of the paper is "Attention is All You Need".  The authors are Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, and Illia Polosukhin.', role='assistant', tool_calls=None, function_call=None))], usage=Usage(completion_tokens=58, prompt_tokens=3881, total_tokens=3939, completion_tokens_details=None, prompt_tokens_details=None), vertex_ai_grounding_metadata=[], vertex_ai_safety_results=[], vertex_ai_citation_metadata=[{'citations': [{'startIndex': 68, 'endIndex': 195, 'uri': 'https://debuggercafe.com/transformer-neural-network/'}]}])

## PDF - stored in a Google Cloud Storage bucket

In [8]:
# LangChain, tuple form: each message is a (role, content) pair.
# The PDF is referenced by its gs:// URI through a "media" part with an explicit MIME type.

lang_llm.invoke(
    [
        (
            "user",
            [
                {"type": "text", "text": "What is title and who is author of the paper?"},
                {
                    "type": "media",
                    "mime_type": "application/pdf",
                    "file_uri": GS_PDF,
                },
            ],
        )
    ]
)

AIMessage(content='The title of the paper is **Attention Is All You Need**.\n\nThe authors are Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, and Illia Polosukhin.', additional_kwargs={}, response_metadata={'is_blocked': False, 'safety_ratings': [], 'citation_metadata': {'citations': [{'start_index': 70, 'end_index': 197, 'uri': 'https://debuggercafe.com/transformer-neural-network/', 'title': '', 'license_': ''}]}, 'usage_metadata': {'prompt_token_count': 3881, 'candidates_token_count': 58, 'total_token_count': 3939, 'cached_content_token_count': 0}, 'finish_reason': 'STOP', 'avg_logprobs': -0.03426221937968813}, id='run-6fe63c5c-9456-45f4-9e91-69915f32085d-0', usage_metadata={'input_tokens': 3881, 'output_tokens': 58, 'total_tokens': 3939})

In [9]:
# LangChain, message-object form: each message is a BaseMessage subclass
# (HumanMessage here). The PDF is referenced by its gs:// URI with an explicit MIME type.

lang_llm.invoke(
    [
        HumanMessage(
            content=[
                {"type": "text", "text": "What is title and who is author of the paper?"},
                {
                    "type": "media",
                    "mime_type": "application/pdf",
                    "file_uri": GS_PDF,
                },
            ]
        )
    ]
)

AIMessage(content='The title of the paper is "Attention Is All You Need".  The authors are Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, and Illia Polosukhin.', additional_kwargs={}, response_metadata={'is_blocked': False, 'safety_ratings': [], 'citation_metadata': {'citations': [{'start_index': 68, 'end_index': 195, 'uri': 'https://debuggercafe.com/transformer-neural-network/', 'title': '', 'license_': ''}]}, 'usage_metadata': {'prompt_token_count': 3881, 'candidates_token_count': 58, 'total_token_count': 3939, 'cached_content_token_count': 0}, 'finish_reason': 'STOP', 'avg_logprobs': -0.01916517471445018}, id='run-d0f0041c-d348-4553-a9fa-d07b2cfe89e8-0', usage_metadata={'input_tokens': 3881, 'output_tokens': 58, 'total_tokens': 3939})

In [10]:
# LiteLLM: messages are a list of plain dictionaries ("content" + "role").
# LiteLLM uses the file extension of the URI to guess the media type,
# so no MIME type is passed alongside the gs:// URI.

litellm.completion(
    model=MODEL,
    messages=[
        {
            "content": [
                {"type": "text", "text": "What is title and who is author of the paper?"},
                {"type": "image_url", "image_url": {"url": GS_PDF}},
            ],
            "role": "user",
        }
    ],
)

ModelResponse(id='chatcmpl-67e7795c-4131-4225-ba8f-bb641f07f493', created=1734647327, model='gemini-1.5-flash-002', object='chat.completion', system_fingerprint=None, choices=[Choices(finish_reason='stop', index=0, message=Message(content='Here is the title and author information from the provided text:\n\n**Title:** Attention Is All You Need\n\n**Authors:** Ashish Vaswani*, Noam Shazeer*, Niki Parmar*, Jakob Uszkoreit*, Llion Jones*, Aidan N. Gomez*, Łukasz Kaiser*, and Illia Polosukhin*\n\n\n* indicates equal contribution.', role='assistant', tool_calls=None, function_call=None))], usage=Usage(completion_tokens=72, prompt_tokens=3881, total_tokens=3953, completion_tokens_details=None, prompt_tokens_details=None), vertex_ai_grounding_metadata=[], vertex_ai_safety_results=[], vertex_ai_citation_metadata=[{'citations': [{'startIndex': 77, 'endIndex': 225, 'uri': 'https://clarifai.com/clarifai/main/models/moderation-multilingual-text-classification'}, {'startIndex': 117, 'endIndex': 256,

## Video from YouTube

In [11]:
# LangChain, tuple form: each message is a (role, content) pair.
# The video is referenced by URL through a "media" part tagged with the YouTube MIME type.
# The prompt text is the same one used in the PDF examples.

lang_llm.invoke(
    [
        (
            "user",
            [
                {"type": "text", "text": "What is title and who is author of the paper?"},
                {
                    "type": "media",
                    "mime_type": "video/vnd.youtube.yt",
                    "file_uri": YT_VIDEO,
                },
            ],
        )
    ]
)

AIMessage(content='The title of the paper is “Attention is All You Need,” and the authors are Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, and Illia Polosukhin.', additional_kwargs={}, response_metadata={'is_blocked': False, 'safety_ratings': [], 'citation_metadata': {'citations': [{'start_index': 71, 'end_index': 198, 'uri': 'https://debuggercafe.com/transformer-neural-network/', 'title': '', 'license_': ''}]}, 'usage_metadata': {'prompt_token_count': 479681, 'candidates_token_count': 58, 'total_token_count': 479739, 'cached_content_token_count': 0}, 'finish_reason': 'STOP', 'avg_logprobs': -0.03462911885360192}, id='run-f61d0250-8946-45d8-b65b-65a3d14c3afc-0', usage_metadata={'input_tokens': 479681, 'output_tokens': 58, 'total_tokens': 479739})

In [12]:
# LangChain, message-object form: each message is a BaseMessage subclass
# (HumanMessage here). The video is referenced by URL with the YouTube MIME type.

lang_llm.invoke(
    [
        HumanMessage(
            content=[
                {"type": "text", "text": "What is title and who is author of the paper?"},
                {
                    "type": "media",
                    "mime_type": "video/vnd.youtube.yt",
                    "file_uri": YT_VIDEO,
                },
            ]
        )
    ]
)

AIMessage(content='The title of the paper is “Attention is All You Need.”  The authors are Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, and Illia Polosukhin.', additional_kwargs={}, response_metadata={'is_blocked': False, 'safety_ratings': [], 'citation_metadata': {'citations': [{'start_index': 68, 'end_index': 195, 'uri': 'https://debuggercafe.com/transformer-neural-network/', 'title': '', 'license_': ''}]}, 'usage_metadata': {'prompt_token_count': 479681, 'candidates_token_count': 58, 'total_token_count': 479739, 'cached_content_token_count': 0}, 'finish_reason': 'STOP', 'avg_logprobs': -0.03850662297215955}, id='run-74fa9654-20ac-443a-8bac-d822cd469211-0', usage_metadata={'input_tokens': 479681, 'output_tokens': 58, 'total_tokens': 479739})

In [13]:
# LiteLLM: messages are a list of plain dictionaries ("content" + "role").
# The YouTube URL is passed as an "image_url" part without any MIME type.
# NOTE(review): in the run recorded in this notebook this call reports prompt_tokens=11,
# while the LangChain calls on the same video report 479681 input tokens - the video
# was likely not ingested here. Verify how LiteLLM handles a YouTube URL in image_url.

litellm.completion(
    model=MODEL,
    messages=[
        {
            "content": [
                {"type": "text", "text": "What is title and who is author of the paper?"},
                {"type": "image_url", "image_url": {"url": YT_VIDEO}},
            ],
            "role": "user",
        }
    ],
)

ModelResponse(id='chatcmpl-040113bf-eee4-43fe-abed-0f791abadfa2', created=1734647401, model='gemini-1.5-flash-002', object='chat.completion', system_fingerprint=None, choices=[Choices(finish_reason='stop', index=0, message=Message(content="The title of the YouTube video is **Attention Is All You Need**.  The author is **Yannic Kilcher**.  Note that this information is extracted from the metadata within the HTML provided; it's not a research paper in the traditional sense, but rather a video about a research paper.\n", role='assistant', tool_calls=None, function_call=None))], usage=Usage(completion_tokens=61, prompt_tokens=11, total_tokens=72, completion_tokens_details=None, prompt_tokens_details=None), vertex_ai_grounding_metadata=[], vertex_ai_safety_results=[], vertex_ai_citation_metadata=[])