In [1]:
from datasets import load_from_disk

# Location of the dataset that was previously saved to disk (a relative path).
HF_DATASET = "qa_german_invoices_formatted"

# Load the saved dataset; later cells index it and read the "image" and
# "qa_pairs" fields of each record.
qa_ds = load_from_disk(HF_DATASET)

In [2]:
import asyncio
import os
import openai
import weave

# Start Weave tracing; calls to @weave.op functions are logged to this project.
weave.init("german_invoices_eval")

# our own Llama 3.2-90B-Vision-Instruct instance
# The key is read with os.environ[...] (not .get) on purpose: when api_key is
# None the OpenAI client falls back to OPENAI_API_KEY, which would then be
# sent to this plain-HTTP endpoint. A missing WANDB_API_KEY now fails fast
# with a KeyError instead.
llama_client = openai.AsyncOpenAI(
    base_url="http://195.242.25.198:8032/v1",
    api_key=os.environ["WANDB_API_KEY"],
)

weave version 0.51.10 is available!  To upgrade, please run:
 $ pip install weave --upgrade
Logged in as Weights & Biases user: capecape.
View Weave data at https://wandb.ai/capecape/german_invoices_eval/weave


In [3]:
import io, base64
from typing import Union
from PIL import Image

def image_to_base64(image_path: Union[str, os.PathLike, "Image.Image"]) -> str:
    """Return the base64 text of an image re-encoded as PNG.

    Args:
        image_path: Either a filesystem path (``str`` or ``os.PathLike``) that
            is opened with ``Image.open``, or an already-loaded image object
            exposing ``save(fp, format=...)``.

    Returns:
        The PNG bytes, base64-encoded and decoded as UTF-8 text. No
        ``data:`` URL prefix is added.
    """
    png_buffer = io.BytesIO()
    if isinstance(image_path, (str, os.PathLike)):
        # Open through a context manager so the file handle is released as
        # soon as the image has been re-encoded.
        with Image.open(image_path) as opened:
            opened.save(png_buffer, format="PNG")
    else:
        image_path.save(png_buffer, format="PNG")
    return base64.b64encode(png_buffer.getvalue()).decode("utf-8")

# Text part of the user message sent together with the document image.
# `{questions}` is filled in with str.format by Model.predict, which passes
# the questions as numbered lines ("1. ...", "2. ...").
# NOTE(review): the format example uses the literal placeholders
# `question_1` / `answer_1`; a model may echo them verbatim instead of the
# real question text — consider a concrete example.
questions_template = """
You are a helpful assistant that extracts answers from a document.
You are given a document and a list of questions.
You need to extract the answers to the questions from the document.
Reply in the following format:
1.question_1
answer_1
2.question_2
answer_2
...
Here you have the questions:
{questions}
"""

class Model(weave.Model):
    """Vision-language model queried through an OpenAI-compatible chat API."""

    # Async OpenAI-style client that the chat-completion request is sent through.
    client: openai.AsyncOpenAI
    # Model identifier passed as `model=` in the request.
    model: str = "Llama-3.2-90B-Vision-Instruct"

    @weave.op
    async def predict(self, image: Image.Image, questions: list[str]) -> str:
        """Ask the model every question about one document image in a single request.

        Args:
            image: Document image; re-encoded by ``image_to_base64`` and
                embedded in the request as a data URL.
            questions: Questions to answer; they are numbered from 1 and
                inserted into ``questions_template``.

        Returns:
            The message content of the first choice in the response.
        """
        image_base64 = image_to_base64(image)
        # Use a separate name so the `questions` list argument is not rebound
        # to a string.
        numbered_questions = "\n".join(
            f"{number}. {question}" for number, question in enumerate(questions, start=1)
        )
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": questions_template.format(questions=numbered_questions),
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            # image_to_base64 encodes the image as PNG, so the
                            # declared media type must be image/png.
                            "url": f"data:image/png;base64,{image_base64}",
                        },
                    },
                ],
            }
        ]

        response = await self.client.chat.completions.create(
            model=self.model,
            messages=messages,
        )
        return response.choices[0].message.content


In [4]:
# Pick a single record to try the model on before evaluating the whole dataset.
sample = qa_ds[1]
image = sample["image"]
# Split the record's question/answer pairs into two parallel lists.
questions = [pair["question"] for pair in sample["qa_pairs"]]
answers = [pair["answer"] for pair in sample["qa_pairs"]]

In [5]:
# Display the questions extracted from the sample record.
questions

['Welche Firma stellt die Rechnung aus?',
 'Wie ist die Rechnungsnummer?',
 'Wann wurde die Rechnung ausgestellt?',
 'Wie hoch ist der Gesamtbetrag der Rechnung?',
 'Wie lautet die E-Mail-Adresse der Firma?']

## Llama 3.2 90B

In [6]:
# Wrap the Llama client in a Model; the `model` value repeats the class default.
llama = Model(client=llama_client, model="Llama-3.2-90B-Vision-Instruct")
# Top-level await: ask all sample questions about the sample image in one call.
res = await llama.predict(image, questions)

🍩 https://wandb.ai/capecape/german_invoices_eval/r/call/0192335f-7c7c-7132-aa7b-480727540113


In [62]:
# Show the raw text reply returned by the Llama model.
print(res)

1. The name of the restaurant mentioned in the text is Mikoko Ramen- & Sushi-Bar.
2. The address of the restaurant is Mikoko Ramen- & Sushi-Bar Filmstraße 18 50987 Köln.
3. The manager of the restaurant is Lukas Wegerer.
4. The invoice number referred to in the text is 1234.
5. The total amount (including tax) on the invoice is 878,22 €.


In [82]:
import instructor
from pydantic import BaseModel, Field

# Synchronous OpenAI client wrapped by instructor; judge_answer passes
# `response_model=` through it. `openai.OpenAI()` is built without arguments —
# presumably it picks up its API key from the environment; confirm.
openai_client = instructor.from_openai(openai.OpenAI())

# Structured verdict for one answer; judge_answer requests `list[Judge]` as the
# response model and reads both fields of every element.
# NOTE(review): documented with `#` comments rather than a class docstring,
# since a docstring would presumably end up in the schema sent to the judge —
# confirm before adding one.
class Judge(BaseModel):
    # True when the judge considers the model's answer correct.
    is_correct: bool = Field(description="Whether the model's answer is correct")
    # The judge's justification for the verdict.
    explanation: str = Field(description="The explanation for the correctness or incorrectness of the answer")

# System message for the judge model. Built from adjacent string literals, so
# the result is one single-line string with no embedded newlines.
system_prompt = (
    "You are a helpful assistant that determines if an answer "
    "is correct and provides an explanation for the correctness or incorrectness "
    "of the answer. Be tolerant to spelling mistakes that could be related to "
    "the OCR extraction of the documents and languages"
)

# User message for the judge: the model's extracted answers followed by the
# reference answers. `{model_output}` and `{answers}` are filled in with
# str.format by judge_answer.
# The closing instruction used to read `Reply with in JSON format",` — a
# grammar slip plus a stray quote and comma left over from editing.
prompt_template = """
## Model Extracted Answers
{model_output}
## Real Answers
{answers}

Reply in JSON format.
"""

@weave.op
def judge_answer(model_output: str, answers: Union[str, list[str]]) -> dict:
    """Have the judge model grade the extracted answers against the references.

    Args:
        model_output: Raw text produced by the model under evaluation.
        answers: Reference answers. A list is rendered as numbered lines
            ("1. ...", "2. ...") so it lines up with the numbered model
            output instead of appearing as a Python list repr; a plain
            string is passed through unchanged.

    Returns:
        A dict with ``"correct"`` (how many verdicts have ``is_correct`` set)
        and ``"explanation"`` (the verdict explanations joined by newlines).
    """
    if isinstance(answers, str):
        reference_answers = answers
    else:
        reference_answers = "\n".join(
            f"{number}. {answer}" for number, answer in enumerate(answers, start=1)
        )

    verdicts = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": prompt_template.format(
                    model_output=model_output, answers=reference_answers
                ),
            },
        ],
        # The response is parsed into a list of Judge verdicts.
        response_model=list[Judge],
    )
    return {
        "correct": sum(verdict.is_correct for verdict in verdicts),
        "explanation": "\n".join(verdict.explanation for verdict in verdicts),
    }

In [81]:
# Grade the Llama reply for the sample record against its reference answers.
judge_answer(res, answers) 

🍩 https://wandb.ai/capecape/german_invoices_eval/r/call/01923297-5427-76b1-99f7-850c762f658b


{'correct': 3,
 'explanation': "The name of the restaurant retrieved by the model matches the real answer.\nThe address given by the model lacks proper punctuation; it should include a comma after 'Filmstraße 18'.\nThe manager's name provided by the model does not match the real answer; it should be 'Jasmin Tanaka'.\nThe invoice number provided by the model matches the real answer.\nThe total amount (including tax) on the invoice provided by the model matches the real answer."}

In [83]:
# Flatten the dataset for evaluation: one dict per record holding the image,
# the list of its questions, and the parallel list of its answers.
flat_ds = [
    {
        "image": record["image"],
        "questions": [pair["question"] for pair in record["qa_pairs"]],
        "answers": [pair["answer"] for pair in record["qa_pairs"]],
    }
    for record in qa_ds
]

In [84]:
# Evaluation over the flattened dataset, scored by the LLM judge.
evaluation = weave.Evaluation(dataset=flat_ds, scorers=[judge_answer])

In [85]:
# Run the evaluation with the Llama-backed model (top-level await).
await evaluation.evaluate(llama)

Traceback (most recent call last):
  File "/Users/tcapelle/miniforge3/envs/weave/lib/python3.11/site-packages/httpx/_transports/default.py", line 72, in map_httpcore_exceptions
    yield
  File "/Users/tcapelle/miniforge3/envs/weave/lib/python3.11/site-packages/httpx/_transports/default.py", line 377, in handle_async_request
    resp = await self._pool.handle_async_request(req)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/tcapelle/miniforge3/envs/weave/lib/python3.11/site-packages/httpcore/_async/connection_pool.py", line 216, in handle_async_request
    raise exc from None
  File "/Users/tcapelle/miniforge3/envs/weave/lib/python3.11/site-packages/httpcore/_async/connection_pool.py", line 196, in handle_async_request
    response = await connection.handle_async_request(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/tcapelle/miniforge3/envs/weave/lib/python3.11/site-packages/httpcore/_async/connection.py", line 101, in handle_async_request


Traceback (most recent call last):
  File "/Users/tcapelle/miniforge3/envs/weave/lib/python3.11/site-packages/httpx/_transports/default.py", line 72, in map_httpcore_exceptions
    yield
  File "/Users/tcapelle/miniforge3/envs/weave/lib/python3.11/site-packages/httpx/_transports/default.py", line 377, in handle_async_request
    resp = await self._pool.handle_async_request(req)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/tcapelle/miniforge3/envs/weave/lib/python3.11/site-packages/httpcore/_async/connection_pool.py", line 216, in handle_async_request
    raise exc from None
  File "/Users/tcapelle/miniforge3/envs/weave/lib/python3.11/site-packages/httpcore/_async/connection_pool.py", line 196, in handle_async_request
    response = await connection.handle_async_request(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/tcapelle/miniforge3/envs/weave/lib/python3.11/site-packages/httpcore/_async/connection.py", line 101, in handle_async_request


## Pixtral

In [None]:
mistral_client = openai.AsyncOpenAI( 
    base_url="https://api.mistral.ai/v1/",
    api_key=os.environ.get("MISTRAL_API_KEY"),
)

mistral = Model(client=mistral_client, model="pixtral-12b-2409")
mistral_res = await mistral.predict(image, questions)
print(mistral_res)
mistral_judge = judge_answer(mistral_res, answers)
print(mistral_judge)

🍩 https://wandb.ai/capecape/german_invoices_eval/r/call/0192325d-bedf-7550-8f28-c6e5bbe24fb4
1. question_1
answer_1: Mikoko Ramen- & Sushi-Bar

2. question_2
answer_2: Filmstraße 18, 60387 Köln

3. question_3
answer_3: Lukas Wegerer

4. question_4
answer_4: 1234

5. question_5
answer_5: 878,22 €
🍩 https://wandb.ai/capecape/german_invoices_eval/r/call/0192325d-cd69-7993-b18e-9bac0f40a388
correct_answers=3 explanation="1. Correct. The answer 'Mikoko Ramen- & Sushi-Bar' matches.\n2. Incorrect. The correct address is 'Filmstraße 18, 50987 Köln', but the model extracted '60387 Köln'.\n3. Incorrect. The correct name is 'Jasmin Tanaka', but the model extracted 'Lukas Wegerer'.\n4. Correct. The answer '1234' matches.\n5. Correct. The answer '878,22 €' matches."


In [None]:
# Run the same evaluation with the Pixtral-backed model (top-level await).
await evaluation.evaluate(mistral)

🍩 https://wandb.ai/capecape/german_invoices_eval/r/call/0192325d-d347-7962-a70e-cfef0f6fb5a7


{'judge_answer': {'correct_answers': {'mean': 3.0309278350515463}},
 'model_latency': {'mean': 6.941068413331337}}