In [15]:
import json
from datasets import load_dataset

# Load the invoice dataset with `datasets.load_dataset` and keep only its "train" entry.
ds = load_dataset("Aoschu/German_invoices_dataset_for_donut")["train"]

In [39]:
def load_sample(sample):
    """Split one dataset row into its image and its transcription.

    Args:
        sample: Mapping with an "image" entry and a "ground_truth" entry.
            "ground_truth" must be a JSON string shaped like
            ``{"gt_parse": {"transcription": ...}}``.

    Returns:
        A tuple ``(image, transcription)``: the row's "image" value as-is and
        the decoded ``gt_parse.transcription`` value.

    Raises:
        KeyError: If any of the expected keys is missing.
        json.JSONDecodeError: If "ground_truth" is not valid JSON.
    """
    # Look up the image first so key errors surface in the same order as a
    # left-to-right tuple expression would raise them.
    image = sample["image"]
    annotation = json.loads(sample["ground_truth"])
    parsed_fields = annotation["gt_parse"]
    return image, parsed_fields["transcription"]

# Try the helper on the first dataset row to get one image/transcription pair.
img, transcriptions = load_sample(ds[0])

In [42]:
import asyncio
import openai
import instructor
from pydantic import BaseModel
import weave

# Initialise Weave with the project name "german_invoices_eval".
weave.init("german_invoices_eval")

# Wrap an async OpenAI client with instructor. This wrapped client is the one
# later called with `response_model=`. No credentials are passed explicitly here.
client = instructor.from_openai(openai.AsyncOpenAI())

# One generated question together with its answer.
# Used as the element type of the `response_model` requested from the client.
# Documentation is kept in comments (not a docstring) so the model definition
# itself stays exactly as it was.
class QuestionAnswer(BaseModel):
    question: str  # the question text
    answer: str  # the answer text for that question

@weave.op
async def create_question_answer_pairs(transcriptions: list[str]) -> list[QuestionAnswer]:
    """Ask the chat model for question/answer pairs about one invoice.

    Args:
        transcriptions: The text pieces of one invoice. They are interpolated
            into the user prompt using the list's default string formatting.

    Returns:
        The structured result of the chat completion, requested as
        ``list[QuestionAnswer]`` through ``response_model``.
    """
    system_prompt = "You are a helpful assistant that creates question answer pairs from a given text. The text is a list of OCR output from a german invoice data."
    user_prompt = f"Create 5 question answer pairs from the following text: {transcriptions}"

    # `client` is the instructor-wrapped async client defined at module level;
    # the awaited result is handed back to the caller unchanged.
    return await client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        response_model=list[QuestionAnswer],
    )

In [45]:
from tqdm.asyncio import tqdm_asyncio

async def process_sample(sample):
    """Generate question/answer pairs for a single dataset row.

    Args:
        sample: A dataset row accepted by ``load_sample``.

    Returns:
        A dict with the row's image under "image" and the generated pairs
        under "qa_pairs".
    """
    image, ocr_text = load_sample(sample)
    generated_pairs = await create_question_answer_pairs(ocr_text)
    record = {"image": image}
    record["qa_pairs"] = generated_pairs
    return record

async def process_all_samples():
    """Run ``process_sample`` over every row of ``ds`` and collect the results.

    Returns:
        Whatever ``tqdm_asyncio.gather`` yields for the per-row coroutines,
        gathered under a "Processing samples" progress bar.
    """
    # One coroutine per row is created up front and all are passed to gather
    # together; no concurrency limit is applied in this function.
    pending = []
    for row in ds:
        pending.append(process_sample(row))
    return await tqdm_asyncio.gather(*pending, desc="Processing samples")

# Run the generation over the whole dataset; the coroutine is awaited directly at cell level.
qa_ds = await process_all_samples()

Processing samples: 100%|██████████| 97/97 [00:10<00:00,  9.26it/s]


In [46]:
# Show the first generated record as a spot check.
qa_ds[0]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1414x2000>,
 'qa_pairs': [QuestionAnswer(question='What is the name of the architect office mentioned in the text?', answer='Architekturburo Eickholzer'),
  QuestionAnswer(question='What is the offer number provided in the text?', answer='1234'),
  QuestionAnswer(question='What is the total gross amount quoted in the offer?', answer='7.735,00€'),
  QuestionAnswer(question='Who is the offer addressed to?', answer='Frau Mia Hobner'),
  QuestionAnswer(question='What is the date of the offer?', answer='29.07.2030')]}

In [62]:
# Flatten every QuestionAnswer object into a plain dict, so each row holds the
# image plus a list of {"question", "answer"} dicts.
qa_ds_list = [
    dict(
        image=row["image"],
        qa_pairs=[
            dict(question=pair.question, answer=pair.answer)
            for pair in row["qa_pairs"]
        ],
    )
    for row in qa_ds
]

In [63]:
# Wrap the plain-dict rows in a named Weave dataset object.
weave_ds = weave.Dataset(
    name="qa_german_invoices",
    description="Question answer pairs created from a german invoice dataset",
    rows=qa_ds_list,
)

In [64]:
# Publish the dataset object through Weave (the project was set by the earlier weave.init call).
weave.publish(weave_ds)

📦 Published to https://wandb.ai/capecape/german_invoices_eval/weave/objects/qa_german_invoices/versions/vKjWelZ7XSrdFwWxKHaAMgGXKfLrXGkPphMJxP340EQ


ObjectRef(entity='capecape', project='german_invoices_eval', name='qa_german_invoices', digest='vKjWelZ7XSrdFwWxKHaAMgGXKfLrXGkPphMJxP340EQ', extra=())