In [18]:
import weave

# Initialise Weave with the "german_invoices_eval" project name.
weave.init("german_invoices_eval")

# Reference URI of the `qa_german_invoices` object
# (the suffix after ':' looks like a version digest — TODO confirm).
weave_ref = "weave:///capecape/german_invoices_eval/object/qa_german_invoices:vKjWelZ7XSrdFwWxKHaAMgGXKfLrXGkPphMJxP340EQ"
# Resolve the reference and take a full slice of the object's rows.
qa_ds = weave.ref(weave_ref).get().rows[:]

In [34]:
import asyncio
import os
import openai
import weave

weave.init("german_invoices_eval")

# our own Llama 3.2-90B-Vision-Instruct instance
# The endpoint can be overridden with the LLAMA_BASE_URL environment variable;
# when it is unset, the original address is used.
# NOTE(review): the default endpoint is plain HTTP, so the API key below is
# sent unencrypted — confirm this is acceptable.
LLAMA_BASE_URL = os.environ.get("LLAMA_BASE_URL", "http://195.242.25.198:8032/v1")

client = openai.AsyncOpenAI(
    base_url=LLAMA_BASE_URL,
    api_key=os.environ.get("WANDB_API_KEY"),
)

In [64]:
import io, base64
from typing import Union
from PIL import Image

def image_to_base64(image_path: Union[str, Image.Image]) -> str:
    """Return the base64 text of an image re-encoded as PNG.

    Args:
        image_path: Either a path string, which is opened with ``Image.open``,
            or an already-loaded PIL image, which is used as-is.

    Returns:
        The PNG bytes of the image, base64-encoded and decoded as UTF-8 text.
    """
    if isinstance(image_path, str):
        picture = Image.open(image_path)
    else:
        picture = image_path

    # Serialise into an in-memory buffer; nothing is written to disk.
    buffer = io.BytesIO()
    picture.save(buffer, format="PNG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")

@weave.op
async def call_llama(img: Image.Image, questions: list[str]) -> str:
    """Ask the vision model to answer a batch of questions about one document image.

    Args:
        img: The document image; it is passed to ``image_to_base64`` for encoding.
        questions: The questions to ask. They are numbered from 1 and joined
            into a single prompt, one per line.

    Returns:
        The message content of the first completion choice.
    """
    image_base64 = image_to_base64(img)
    # Keep the parameter a list and build the prompt text under its own name.
    numbered_questions = "\n".join(
        f"{number}. {question}" for number, question in enumerate(questions, start=1)
    )
    base64_messages = [{
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": f"Reply to all the questions with the info from the document: \n{numbered_questions}",
            },
            {
                "type": "image_url",
                "image_url": {
                    # image_to_base64 always encodes as PNG, so the data URL must
                    # declare image/png (it previously claimed image/jpeg).
                    "url": f"data:image/png;base64,{image_base64}",
                },
            },
        ],
    }]

    response = await client.chat.completions.create(
        model="Llama-3.2-90B-Vision-Instruct",
        messages=base64_messages,
    )
    return response.choices[0].message.content


In [74]:
# Pull the first document out of the dataset and split its QA pairs into
# two parallel lists: the question strings and the expected answer strings.
sample = qa_ds[0]
image = sample["image"]
qa_pairs = sample["qa_pairs"]
questions = [pair["question"] for pair in qa_pairs]
answers = [pair["answer"] for pair in qa_pairs]

In [75]:
# Display the questions extracted for the sample document.
questions

['What is the name of the architect office mentioned in the text?',
 'What is the offer number provided in the text?',
 'What is the total gross amount quoted in the offer?',
 'Who is the offer addressed to?',
 'What is the date of the offer?']

In [67]:
# Query the model once with the sample image and its questions, then show the raw reply.
res = await call_llama(image, questions)
res

🍩 https://wandb.ai/capecape/german_invoices_eval/r/call/0192301a-2c65-7e03-94d8-595ed2fe0d66


'**Answer:**\n\n1. The architect office mentioned in the text is **Architekturbüro Eickhölzer**.\n2. The offer number provided in the text is **1234**.\n3. The total gross amount quoted in the offer is **7.735,00 €**.\n4. The offer is addressed to **Frau Mia Hobner**.\n5. The date of the offer is **29.07.2030**.'

In [70]:
# Print the reply so its embedded newlines are rendered as line breaks.
print(res)

**Answer:**

1. The architect office mentioned in the text is **Architekturbüro Eickhölzer**.
2. The offer number provided in the text is **1234**.
3. The total gross amount quoted in the offer is **7.735,00 €**.
4. The offer is addressed to **Frau Mia Hobner**.
5. The date of the offer is **29.07.2030**.


In [84]:
import instructor
from pydantic import BaseModel, Field
from typing import List

# Wrap an async OpenAI client with instructor; judge_answer calls it with a `response_model`.
openai_client = instructor.from_openai(openai.AsyncOpenAI())


class Judge(BaseModel):
    # Structured verdict requested from the judge model.
    # Documented with comments rather than a class docstring so that the
    # schema generated from this model stays exactly as it was.
    number_of_correct: int = Field(
        description="The number of questions that the model answered correctly",
    )
    explanation: str = Field(
        description="The explanation for the number of questions that the model answered correctly",
    )

@weave.op
async def judge_answer(model_output: str, answers: list[str]) -> Judge:
    """Grade the model's reply against the reference answers using an LLM judge.

    ``openai_client`` wraps ``openai.AsyncOpenAI``, so the completion call is a
    coroutine. It is awaited here so that this function returns the parsed
    ``Judge`` result rather than an un-awaited coroutine.

    Args:
        model_output: The free-text reply produced by the model under evaluation.
        answers: The reference answers; they are interpolated into the prompt as-is.

    Returns:
        The ``Judge`` instance parsed from the judge model's response.
    """
    res = await openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", 
            "content": "You are a helpful assistant that determines if an answer is correct and provides an explanation for the correctness or incorrectness of the answer. Be tolerant to spelling mistakes that oculd be related to the OCR extraction of the documents and languages."},
            {"role": "user", 
             "content": f"## Model Output \n{model_output}\n\n## Real Answer\n{answers}\nReply with a JSON",
            }
        ],
        # instructor validates the reply against this pydantic model.
        response_model=Judge
    )
    return res

In [77]:
# Spot-check the judge on the single sample reply and its reference answers.
await judge_answer(res, answers) 

🍩 https://wandb.ai/capecape/german_invoices_eval/r/call/0192301b-f111-7b60-a09c-1fcbe1649503
🍩 https://wandb.ai/capecape/german_invoices_eval/r/call/0192301b-f116-79c1-9783-8859d16988c0


Judge(number_of_correct=5, explanation="The model correctly identified all details from the text: the architect office name, offer number, total gross amount, recipient's name, and the date of the offer. Minor spelling variations due to OCR ('Eickhölzer' vs 'Eickholzer') do not affect the accuracy of the content.")

In [80]:
# Flatten the dataset: one record per document, holding its image together
# with parallel lists of question strings and answer strings.
flat_ds = [
    {
        "image": row["image"],
        "questions": [pair["question"] for pair in row["qa_pairs"]],
        "answers": [pair["answer"] for pair in row["qa_pairs"]],
    }
    for row in qa_ds
]

In [83]:
# Build an evaluation over the flattened dataset, with judge_answer as the only scorer.
evaluation = weave.Evaluation(dataset=flat_ds, scorers=[judge_answer])

In [86]:
# Run the evaluation with call_llama as the model under test.
await evaluation.evaluate(call_llama)