<a href="https://colab.research.google.com/github/wesslen/llm-myths/blob/main/notebooks/invoice_gemini_structured_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Based on [post by Phil Schmid](https://www.philschmid.de/gemini-pdf-to-data).

In [1]:
!uv pip install --system "google-genai>=1"

[2mUsing Python 3.11.11 environment at /usr[0m
[2mAudited [1m1 package[0m [2min 301ms[0m[0m


In [2]:
from google import genai
from google.colab import userdata
# Create a client
api_key = userdata.get('GOOGLE_API_KEY')
client = genai.Client(api_key=api_key)

# Define the model you are going to use
model_id =  "gemini-2.0-flash"

![](https://storage.googleapis.com/generativeai-downloads/data/pdf_structured_outputs/invoice.png)

In [3]:
!wget https://storage.googleapis.com/generativeai-downloads/data/pdf_structured_outputs/invoice.pdf

--2025-02-10 02:23:03--  https://storage.googleapis.com/generativeai-downloads/data/pdf_structured_outputs/invoice.pdf
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.31.207, 142.251.111.207, 142.251.16.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.31.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 560986 (548K) [application/pdf]
Saving to: ‘invoice.pdf.3’


2025-02-10 02:23:03 (80.0 MB/s) - ‘invoice.pdf.3’ saved [560986/560986]



In [4]:
invoice_pdf = client.files.upload(file="invoice.pdf", config={'display_name': 'invoice'})

In [5]:
file_size = client.models.count_tokens(model=model_id,contents=invoice_pdf)
print(f'File: {invoice_pdf.display_name} equals to {file_size.total_tokens} tokens')
# File: invoice equals to 821 tokens

File: invoice equals to 821 tokens


In [6]:
from pydantic import BaseModel, Field

def extract_structured_data(file_path: str, model: BaseModel):
    # Upload the file to the File API
    file = client.files.upload(file=file_path, config={'display_name': file_path.split('/')[-1].split('.')[0]})
    # Generate a structured response using the Gemini API
    prompt = f"Extract the structured data from the following PDF file"
    response = client.models.generate_content(model=model_id, contents=[prompt, file], config={'response_mime_type': 'application/json', 'response_schema': model})
    # Convert the response to the pydantic model and return it
    return response.parsed

In [7]:
from pydantic import BaseModel, Field

class Item(BaseModel):
    description: str = Field(description="The description of the item")
    quantity: float = Field(description="The Qty of the item")
    gross_worth: float = Field(description="The gross worth of the item")

class Invoice(BaseModel):
    """Extract the invoice number, date and all list items with description, quantity and gross worth and the total gross worth."""
    invoice_number: str = Field(description="The invoice number e.g. 1234567890")
    date: str = Field(description="The date of the invoice e.g. 2024-01-01")
    items: list[Item] = Field(description="The list of items with description, quantity and gross worth")
    total_gross_worth: float = Field(description="The total gross worth of the invoice")


result = extract_structured_data("invoice.pdf", Invoice)
print(type(result))
print(f"Extracted Invoice: {result.invoice_number} on {result.date} with total gross worth {result.total_gross_worth}")
for item in result.items:
    print(f"Item: {item.description} with quantity {item.quantity} and gross worth {item.gross_worth}")

<class '__main__.Invoice'>
Extracted Invoice: 27301261 on 10/09/2012 with total gross worth 544.46
Item: Lilly Pulitzer dress Size 2 with quantity 5.0 and gross worth 247.5
Item: New ERIN Erin Fertherston Straight Dress White Sequence Lining Sleeveless SZ 10 with quantity 1.0 and gross worth 65.99
Item: Sequence dress Size Small with quantity 3.0 and gross worth 115.5
Item: fire los angeles dress Medium with quantity 3.0 and gross worth 21.45
Item: Eileen Fisher Women's Long Sleeve Fleece Lined Front Pockets Dress XS Gray with quantity 3.0 and gross worth 52.77
Item: Lularoe Nicole Dress Size Small Light Solid Grey/ White Ringer Tee Trim with quantity 2.0 and gross worth 8.25
Item: J.Crew Collection Black & White sweater Dress sz S with quantity 1.0 and gross worth 33.0


In [8]:
result.model_dump()

{'invoice_number': '27301261',
 'date': '10/09/2012',
 'items': [{'description': 'Lilly Pulitzer dress Size 2',
   'quantity': 5.0,
   'gross_worth': 247.5},
  {'description': 'New ERIN Erin Fertherston Straight Dress White Sequence Lining Sleeveless SZ 10',
   'quantity': 1.0,
   'gross_worth': 65.99},
  {'description': 'Sequence dress Size Small',
   'quantity': 3.0,
   'gross_worth': 115.5},
  {'description': 'fire los angeles dress Medium',
   'quantity': 3.0,
   'gross_worth': 21.45},
  {'description': "Eileen Fisher Women's Long Sleeve Fleece Lined Front Pockets Dress XS Gray",
   'quantity': 3.0,
   'gross_worth': 52.77},
  {'description': 'Lularoe Nicole Dress Size Small Light Solid Grey/ White Ringer Tee Trim',
   'quantity': 2.0,
   'gross_worth': 8.25},
  {'description': 'J.Crew Collection Black & White sweater Dress sz S',
   'quantity': 1.0,
   'gross_worth': 33.0}],
 'total_gross_worth': 544.46}