In [1]:
from pathlib import Path

import torch
from PIL import Image
from transformers import AutoProcessor, MllamaForConditionalGeneration


In [2]:
model_id = "/home/jovyan/nfs_share/models/Llama-3.2-11B-Vision-Instruct"
# here, specify the name of the image
imageName = "/home/jovyan/nfs_share/tod/datasets/synthetic_invoice_014.png"

model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)

# open the image
image = Image.open(imageName)


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [3]:
# create a message data structure
# messageDataStructure = [
#     {
#         "role": "user",
#         "content": [
#             {"type": "image"},
#             {
#                 "type": "text",
#                 "text": "What type of document is this?",
#             },
#         ],
#     }
# ]


In [5]:
messageDataStructure = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {
                "type": "text",
                "text": "How much did Jessica pay?",
            },
        ],
    }
]

In [6]:
# create text input
textInput = processor.apply_chat_template(
    messageDataStructure, add_generation_prompt=True
)
# call the processor
inputs = processor(image, textInput, return_tensors="pt").to(model.device)

# here, change the number of tokens to get a more detailed answer
output = model.generate(**inputs, max_new_tokens=2000)
# here, we decode and store the response so we can print it
generatedOutput = processor.decode(output[0])

print(generatedOutput)

<|begin_of_text|><|begin_of_text|><|start_header_id|>user<|end_header_id|>

<|image|>How much did Jessica pay?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

**Calculating the Total Amount Paid by Jessica**

To determine the total amount paid by Jessica, we need to add up the costs of all items purchased and subtract any applicable taxes.

**Itemized Costs:**

*   Milk 2L: $4.80
*   Apples (kg): $3.96
*   Ground Beef 500g: $8.90
*   Pasta 500g: $2.90

**Total Cost Before Tax:**
$4.80 + $3.96 + $8.90 + $2.90 = $20.56

**GST 10% Tax:**
$20.56 x 0.10 = $2.06

**Subtotal (including tax):**
$20.56 + $2.06 = $22.62

**Total Amount Paid:**
$22.62 + $28.48 (GST) = $31.33

Therefore, Jessica paid a total of **$31.33**.<|eot_id|>


In [None]:
# save the answer in a file
with Path("/home/jovyan/nfs_share/tod/output/llama_output.txt").open(
    "w", encoding="utf-8"
) as text_file:
    text_file.write(generatedOutput)