In [1]:
import weave

# Attach this kernel to the Weave project used throughout the notebook.
weave.init("german_invoices_eval")

# Fully-qualified Weave reference of the QA dataset object.
weave_ref = "weave:///capecape/german_invoices_eval/object/qa_german_invoices:vKjWelZ7XSrdFwWxKHaAMgGXKfLrXGkPphMJxP340EQ"

# Resolve the reference first, then take a full slice of its rows.
qa_dataset = weave.ref(weave_ref).get()
qa_ds = qa_dataset.rows[:]

weave version 0.51.10 is available!  To upgrade, please run:
 $ pip install weave --upgrade
Logged in as Weights & Biases user: capecape.
View Weave data at https://wandb.ai/capecape/german_invoices_eval/weave


In [2]:
import asyncio
import os
import openai
import weave

weave.init("german_invoices_eval")

# Endpoint of our own Llama 3.2-90B-Vision-Instruct instance.
LLAMA_BASE_URL = "http://195.242.25.198:8032/v1"

# Async client pointed at that endpoint; the key is read from the
# environment rather than written into the notebook.
client = openai.AsyncOpenAI(
    base_url=LLAMA_BASE_URL,
    api_key=os.environ.get("WANDB_API_KEY"),
)

In [3]:
import io, base64
from typing import Union
from PIL import Image

def image_to_base64(image_path: Union[str, os.PathLike, Image.Image]) -> str:
    """Encode an image as a base64 string of its PNG serialisation.

    Args:
        image_path: Either a filesystem path (``str`` or ``os.PathLike``) to
            an image file, or an object exposing ``save(fp, format=...)``
            such as an already-loaded ``PIL.Image.Image``.

    Returns:
        The PNG bytes of the image, base64-encoded and decoded to a UTF-8
        ``str`` (no ``data:`` prefix is added).
    """
    byte_arr = io.BytesIO()
    if isinstance(image_path, (str, os.PathLike)):
        # We opened the file ourselves, so close it deterministically.
        with Image.open(image_path) as image:
            image.save(byte_arr, format="PNG")
    else:
        # The caller owns an in-memory image; do not close it on their behalf.
        image_path.save(byte_arr, format="PNG")
    return base64.b64encode(byte_arr.getvalue()).decode("utf-8")

@weave.op
async def call_llama(image: Image.Image, questions: list[str]) -> str:
    """Ask the vision model all ``questions`` about ``image`` in one request.

    Args:
        image: The document image to send to the model.
        questions: The questions to ask; they are numbered from 1 and sent
            as a single text prompt.

    Returns:
        The content of the first choice in the chat completion response.
    """
    image_base64 = image_to_base64(image)
    # Keep the `questions` argument untouched; build the prompt text separately.
    numbered_questions = "\n".join(
        f"{number}. {question}" for number, question in enumerate(questions, start=1)
    )
    base64_messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": f"Reply to all the questions with the info from the document: \n{numbered_questions}",
                },
                {
                    "type": "image_url",
                    "image_url": {
                        # image_to_base64 serialises as PNG, so the data URL
                        # must declare image/png (it previously said jpeg).
                        "url": f"data:image/png;base64,{image_base64}",
                    },
                },
            ],
        }
    ]

    response = await client.chat.completions.create(
        model="Llama-3.2-90B-Vision-Instruct",
        messages=base64_messages,
    )
    return response.choices[0].message.content


In [13]:
# Pick one dataset row and split it into the pieces used by the cells below.
sample = qa_ds[1]
image = sample["image"]
qa_pairs = sample["qa_pairs"]
questions = [pair["question"] for pair in qa_pairs]
answers = [pair["answer"] for pair in qa_pairs]

In [14]:
# Display the questions extracted for the selected sample.
questions

['What is the name of the restaurant mentioned in the text?',
 'What is the address of the restaurant?',
 'Who is the manager of the restaurant?',
 'What is the invoice number referred to in the text?',
 'What is the total amount (including tax) on the invoice?']

In [15]:
# Send the sample image and all of its questions in a single model call,
# then display the raw reply.
res = await call_llama(image, questions)
res

🍩 https://wandb.ai/capecape/german_invoices_eval/r/call/01923023-cdee-77b0-8830-3388831c3365


'The name of the restaurant is Mikoko Ramen- & Sushi-Bar. The address of the restaurant is Filmstraße 18, 50987 Köln. The manager of the restaurant is Lukas Wegerer. The invoice number is 1234. The total amount (including tax) on the invoice is 878,22 €.'

In [16]:
# print() shows the reply as plain text rather than as a quoted repr.
print(res)

The name of the restaurant is Mikoko Ramen- & Sushi-Bar. The address of the restaurant is Filmstraße 18, 50987 Köln. The manager of the restaurant is Lukas Wegerer. The invoice number is 1234. The total amount (including tax) on the invoice is 878,22 €.


In [17]:
import instructor
from pydantic import BaseModel, Field
from typing import List

# Synchronous OpenAI client wrapped by instructor; judge_answer calls it with
# a `response_model` to get a structured result back.
openai_client = instructor.from_openai(openai.OpenAI())


class Judge(BaseModel):
    """Structured verdict the judge model must return for one model output."""

    # NOTE(review): `explanation` is declared first so the response schema
    # lists the reasoning before the count. A sample run with the previous
    # order returned number_of_correct=2 next to an explanation that counted
    # 4 of 5 correct; confirm on a re-run that this ordering keeps them in sync.
    explanation: str = Field(description="The explanation for the number of questions that the model answered correctly")
    number_of_correct: int = Field(description="The number of questions that the model answered correctly")

@weave.op
def judge_answer(model_output: str, answers: list[str]) -> Judge:
    """Ask gpt-4o-mini how many reference answers the model output got right.

    Args:
        model_output: The free-text reply produced by the model under test.
        answers: The reference answers for the same document; they are
            interpolated into the prompt as-is.

    Returns:
        The ``Judge`` instance parsed by instructor from the judge's reply
        (requested via ``response_model=Judge``).
    """
    verdict = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant that determines if an answer is correct and provides an explanation for the correctness or incorrectness of the answer. Be tolerant to spelling mistakes that could be related to the OCR extraction of the documents and languages.",
            },
            {
                "role": "user",
                "content": f"## Model Output \n{model_output}\n\n## Real Answer\n{answers}\nReply with a JSON",
            },
        ],
        response_model=Judge,
    )
    return verdict

In [18]:
# Score the single reply obtained above against that sample's reference answers.
judge_answer(res, answers) 

🍩 https://wandb.ai/capecape/german_invoices_eval/r/call/01923024-218f-7262-80df-b2b99d574a62


Judge(number_of_correct=2, explanation="The model correctly identified the restaurant name 'Mikoko Ramen- & Sushi-Bar', the address 'Filmstraße 18, 50987 Köln', the invoice number '1234', and the total amount '878,22 €'. However, the manager's name provided by the model is incorrect; the correct manager's name is 'Jasmin Tanaka', while the model stated 'Lukas Wegerer'. Thus, it answered correctly for 4 out of 5 pieces of information.")

In [19]:
def _flatten_sample(row) -> dict:
    """Turn one dataset row into a flat record with stacked questions and answers."""
    pairs = row["qa_pairs"]
    return {
        "image": row["image"],
        "questions": [pair["question"] for pair in pairs],
        "answers": [pair["answer"] for pair in pairs],
    }


# Flat version of the dataset: per row, the image plus its questions and
# answers as two parallel lists of strings.
flat_ds = [_flatten_sample(row) for row in qa_ds]

In [20]:
# Evaluation over the flattened dataset, with judge_answer as the only scorer.
evaluation = weave.Evaluation(dataset=flat_ds, scorers=[judge_answer])

In [21]:
# Run the evaluation with call_llama as the model under test; the awaited
# result is displayed as the cell output.
await evaluation.evaluate(call_llama)

🍩 https://wandb.ai/capecape/german_invoices_eval/r/call/01923025-6257-74f2-a976-6cdbc046de0c


{'judge_answer': {'number_of_correct': {'mean': 3.288659793814433}},
 'model_latency': {'mean': 13.159386509472561}}