<a target="_blank" href="https://colab.research.google.com/github/urcraft/llm_lecture_notebooks/blob/main/05_Multimodal_Comparison_Image_Tagging_and_PDF_Extraction.ipynb">  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Multimodal Gemini Demo: Image Tagging and PDF Extraction

## What you will learn
- Use Gemini for image tagging and captioning.
- Use Gemini structured output for invoice field extraction.

- Expected runtime: 10-20 minutes
- Expected cost: none on the Gemini free tier, if it is available to you.


In [1]:
%pip install google-genai pandas pillow requests




In [2]:
import json
import requests
import pandas as pd
from PIL import Image

# Public sample inputs: two images for tagging and one invoice PDF for extraction.
IMAGE_URLS = [
    'https://storage.googleapis.com/generativeai-downloads/images/scones.jpg',
    'https://codelabs.developers.google.com/static/codelabs/gemini-java-developers/img/af00516493ec9ade_856.png',
]
PDF_URL = 'https://storage.googleapis.com/generativeai-downloads/data/pdf_structured_outputs/invoice.pdf'


def _fetch_to_file(source_url, dest_path):
    """Download `source_url` and write the response body to `dest_path`.

    Raises the `requests` HTTP error if the server answers with an error status.
    """
    response = requests.get(source_url, timeout=60)
    response.raise_for_status()
    with open(dest_path, 'wb') as out_file:
        out_file.write(response.content)


# Every image is stored as image_<n>.jpg (numbered from 1), whatever the
# extension in its URL; later cells refer to the files by these names.
for index, image_url in enumerate(IMAGE_URLS, start=1):
    _fetch_to_file(image_url, f'image_{index}.jpg')

_fetch_to_file(PDF_URL, 'invoice.pdf')

print('Downloaded sample files.')


Downloaded sample files.


In [3]:
# Model name shared by the Gemini calls in this notebook; change it here only.
GEMINI_MODEL = 'gemini-3-flash-preview'
print(f'Using Gemini model: {GEMINI_MODEL}')


Using Gemini model: gemini-3-flash-preview


In [4]:
import os

# Reset all three names on every run. Without resetting `client`, re-running
# this cell after the key was removed would leave a client from an earlier
# successful run in the kernel, and a fresh failed run would leave it undefined.
GEMINI_AVAILABLE = False
GEMINI_ERROR = None
client = None

try:
    # Imported inside the try so that a missing google-genai install is
    # reported through GEMINI_ERROR instead of stopping the notebook.
    from google import genai

    # Prefer the environment variable; fall back to a Colab secret.
    api_key = os.getenv('GOOGLE_API_KEY')
    if not api_key:
        try:
            from google.colab import userdata
            api_key = userdata.get('GOOGLE_API_KEY')
        except Exception:
            # Deliberately best-effort: any failure here (for example not
            # running in Colab) just means no key was found this way.
            api_key = None

    if not api_key:
        raise ValueError('Set GOOGLE_API_KEY environment variable (or Colab secret GOOGLE_API_KEY).')

    client = genai.Client(api_key=api_key)
    GEMINI_AVAILABLE = True
except Exception as e:
    # Keep the notebook runnable without Gemini; downstream helpers check
    # GEMINI_AVAILABLE before touching `client`.
    GEMINI_ERROR = str(e)
    print('Gemini unavailable:', GEMINI_ERROR)


In [5]:
def tag_caption_image_gemini(image_path: str):
    """Ask Gemini for five tags and a one-sentence caption for one image.

    Args:
        image_path: Path of a local image file that PIL can open.

    Returns:
        A dict with three keys:
        - 'model': the value of GEMINI_MODEL.
        - 'output': the response text, or None if the call did not succeed.
        - 'error': None on success, otherwise a message string.

    No exception escapes this function; failures are reported through 'error'.
    """
    if not GEMINI_AVAILABLE:
        return {'model': GEMINI_MODEL, 'output': None, 'error': 'Gemini unavailable'}

    try:
        # The context manager closes the image's file handle once the request
        # has been made, instead of leaving it open for the rest of the session.
        with Image.open(image_path) as img:
            prompt = 'Provide exactly 5 tags and a 1-sentence caption for this image.'
            resp = client.models.generate_content(model=GEMINI_MODEL, contents=[prompt, img])
        return {'model': GEMINI_MODEL, 'output': resp.text, 'error': None}
    except Exception as e:
        # Broad on purpose: file, network and API errors all become a table entry.
        return {'model': GEMINI_MODEL, 'output': None, 'error': str(e)}


In [6]:
def _image_row(path):
    """Tag one image and flatten the helper's result into a table row."""
    tagged = tag_caption_image_gemini(path)
    return {
        'task': path,
        'model': tagged['model'],
        'output': tagged['output'],
        'error': tagged['error'],
    }


# One row per downloaded sample image.
image_rows = [_image_row(path) for path in ('image_1.jpg', 'image_2.jpg')]

image_df = pd.DataFrame(image_rows)
image_df


Unnamed: 0,task,model,output,error
0,image_1.jpg,gemini-3-flash-preview,"Tags: blueberry scones, coffee, peonies, break...",
1,image_2.jpg,gemini-3-flash-preview,"Tags: cat, snow, tabby, winter, outdoor\n\nCap...",


In [7]:
def extract_pdf_structured_gemini(file_path: str):
    """Upload a PDF to Gemini and extract four invoice fields as JSON.

    Args:
        file_path: Path of the local PDF file to upload.

    Returns:
        A dict with three keys:
        - 'model': the value of GEMINI_MODEL.
        - 'data': the parsed JSON response; {'raw_text': <response text>} if
          the response could not be parsed as JSON; {} if the call failed.
        - 'error': None on success, otherwise a message string.

    No exception escapes this function; failures are reported through 'error'.
    """
    if not GEMINI_AVAILABLE:
        return {
            'model': GEMINI_MODEL,
            'data': {},
            'error': 'Gemini unavailable'
        }

    uploaded = None
    try:
        uploaded = client.files.upload(file=file_path)
        prompt = 'Extract invoice fields exactly as JSON with keys: seller_name, invoice_number, invoice_date, total_amount.'
        resp = client.models.generate_content(
            model=GEMINI_MODEL,
            contents=[prompt, uploaded],
            config={'response_mime_type': 'application/json'}
        )
        try:
            data = json.loads(resp.text)
        except (TypeError, ValueError):
            # ValueError covers json.JSONDecodeError; TypeError covers a
            # response whose text is None. Keep the raw text for inspection.
            data = {'raw_text': resp.text}
        return {'model': GEMINI_MODEL, 'data': data, 'error': None}
    except Exception as e:
        # Broad on purpose: upload, network and API errors all become a table entry.
        return {'model': GEMINI_MODEL, 'data': {}, 'error': str(e)}
    finally:
        # Do not leave the document behind on the service once we are done.
        if uploaded is not None:
            try:
                client.files.delete(name=uploaded.name)
            except Exception:
                # Best-effort cleanup: a failed delete must not replace the
                # extraction result that is already being returned.
                pass


In [8]:
gem_pdf = extract_pdf_structured_gemini('invoice.pdf')

# Single-row table with the columns task, model, output, error.
pdf_row = {
    'task': 'invoice.pdf',
    'model': gem_pdf['model'],
    'output': gem_pdf['data'],
    'error': gem_pdf['error'],
}
pdf_df = pd.DataFrame([pdf_row])
pdf_df


Unnamed: 0,task,model,output,error
0,invoice.pdf,gemini-3-flash-preview,"{'seller_name': 'Williams LLC', 'invoice_numbe...",


In [9]:
# Blank scoring sheet: one row per task type, with the score and notes
# columns left as empty strings to be filled in by hand.
final_score = pd.DataFrame(
    [
        {
            'task_type': task_type,
            'model': 'Gemini',
            'quality_score_1_to_5': '',
            'hallucination_risk_1_to_5': '',
            'notes': '',
        }
        for task_type in ('image_tag_caption', 'pdf_structured_output')
    ]
)
final_score


Unnamed: 0,task_type,model,quality_score_1_to_5,hallucination_risk_1_to_5,notes
0,image_tag_caption,Gemini,,,
1,pdf_structured_output,Gemini,,,


## Checkpoint
- Fill in the final score table.
- Write one sentence on Gemini strengths for image understanding.
- Write one sentence on Gemini strengths/limits for document extraction.

## Reflection
- If your organization has strict privacy requirements, what deployment constraints would you add?

## Troubleshooting
- If Gemini returns a temporary 5xx error (for example, 503 Service Unavailable), rerun the cell after a short wait.
- If API key setup fails, verify that `GOOGLE_API_KEY` is set as an environment variable or as a Colab secret.
