In [1]:
from datasets import load_dataset
from transformers import AutoProcessor
from PIL import Image
import io
from json2xml import json2xml
from json2xml.utils import readfromurl, readfromstring, readfromjson
import json
from lxml import etree

  from .autonotebook import tqdm as notebook_tqdm


In [30]:
# Checkpoint whose processor (and therefore chat template) is used by the cells below.
# The Qwen2-VL id is kept commented out as an alternative to switch to.
#model_id = "Qwen/Qwen2-VL-7B-Instruct"
model_id = "HuggingFaceTB/SmolVLM-Instruct"

# Processor for the selected checkpoint; text_generator calls its apply_chat_template.
processor = AutoProcessor.from_pretrained(model_id)

In [23]:

# Load only the "test" split. Passing the split name as a string makes
# load_dataset return that split directly, instead of a one-element list that
# then has to be indexed with [0].
test_dataset = load_dataset("nanonets/key_information_extraction", split="test")
                                                
# System prompt that format_data places in the first ("system") turn of every
# conversation. The space before each line break is part of the string.
system_message = """You are a highly advanced Vision Language Model (VLM), specialized in analyzing, describing, and interpreting visual data. 
Your task is to process and extract meaningful insights from images, videos, and visual patterns, 
leveraging multimodal understanding to provide accurate and contextually relevant information."""

In [14]:
def format_data(sample: dict):
    """Convert one dataset record into a three-turn chat conversation.

    Args:
        sample: Record providing ``"image"`` (a bytes-like object that is
            wrapped in ``io.BytesIO`` and opened with PIL) and
            ``"annotations"`` (passed through unchanged as the assistant's
            text content).

    Returns:
        A list of three message dicts, in order: the system turn carrying
        ``system_message``, the user turn carrying the opened image followed
        by the extraction instruction, and the assistant turn carrying the
        annotations.
    """
    pil_image = Image.open(io.BytesIO(sample["image"]))

    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": system_message}],
    }
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": pil_image},
            {"type": "text", "text": "Perform key information extraction"},
        ],
    }
    assistant_turn = {
        "role": "assistant",
        "content": [{"type": "text", "text": sample["annotations"]}],
    }

    return [system_turn, user_turn, assistant_turn]

In [24]:
# Replace every raw record with its chat-message form.
# NOTE(review): the name is rebound in place, so this cell is not idempotent —
# running it a second time without reloading the dataset feeds already
# formatted conversations (lists) back into format_data, which indexes its
# argument with string keys. Consider a distinct name for the formatted data.
test_dataset = list(map(format_data, test_dataset))

In [16]:
# Spot-check the second formatted conversation (system / user / assistant turns).
test_dataset[1]

[{'role': 'system',
  'content': [{'type': 'text',
    'text': 'You are a highly advanced Vision Language Model (VLM), specialized in analyzing, describing, and interpreting visual data. \nYour task is to process and extract meaningful insights from images, videos, and visual patterns, \nleveraging multimodal understanding to provide accurate and contextually relevant information.'}]},
 {'role': 'user',
  'content': [{'type': 'image',
    'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=928x2110>},
   {'type': 'text', 'text': 'Perform key information extraction'}]},
 {'role': 'assistant',
  'content': [{'type': 'text',
    'text': {'date': '20/04/2018',
     'doc_no_receipt_no': 'CS00012013',
     'seller_address': 'LOT 276 JALAN BANTING \n43800 DENGKIL , SELANGOR .',
     'seller_gst_id': '000781500416',
     'seller_name': 'KEDAI PAPAN YEW CHUAN',
     'seller_phone': '03-87686092 ',
     'total_amount': '87.45',
     'total_tax': '4.95'}}]}]

In [None]:
def text_generator(sample_data):
    """Build the generation prompt for one formatted conversation.

    Args:
        sample_data: A conversation laid out as ``format_data`` returns it:
            index 0 is the system turn, index 1 the user turn, and index 2
            the assistant turn whose first content item holds the annotation
            mapping under ``"text"``.

    Returns:
        The prompt produced by ``processor.apply_chat_template`` for the
        system and user turns only, untokenized and with the generation
        prompt appended.

    Side effects:
        Prints the annotations rendered as a single XML document wrapped in
        ``<key-information-extraction>``, followed by a line of 30 ``#``.

    Note:
        No image preprocessing happens here. When model tensors are needed,
        call ``processor(text=[prompt], images=..., return_tensors="pt")``
        separately; building them in this function and discarding the result
        only slowed the call down.
    """
    # Only the system and user turns go into the prompt: the assistant turn is
    # the expected answer and must not be part of what the model is shown.
    prompt = processor.apply_chat_template(
        sample_data[0:2], tokenize=False, add_generation_prompt=True
    )

    # json to xml: serialise the annotation mapping to JSON text and let
    # json2xml's string reader parse it back before conversion.
    annotations = sample_data[2]["content"][0]["text"]
    raw_xml = json2xml.Json2xml(
        data=readfromstring(json.dumps(annotations)),
        wrapper="key-information-extraction",
        pretty=False,
        attr_type=False,
    ).to_xml()
    # Round-trip through lxml to get a str (encoding="unicode") without pretty
    # printing — presumably also to drop the XML declaration; TODO confirm.
    xml_target = etree.tostring(
        etree.fromstring(raw_xml),
        encoding="unicode",
        pretty_print=False,
    )

    print(xml_target)
    print("#" * 30)

    return prompt

In [26]:
# Show the first formatted conversation — the sample passed to text_generator below.
test_dataset[0]

[{'role': 'system',
  'content': [{'type': 'text',
    'text': 'You are a highly advanced Vision Language Model (VLM), specialized in analyzing, describing, and interpreting visual data. \nYour task is to process and extract meaningful insights from images, videos, and visual patterns, \nleveraging multimodal understanding to provide accurate and contextually relevant information.'}]},
 {'role': 'user',
  'content': [{'type': 'image',
    'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=928x2143>},
   {'type': 'text', 'text': 'Perform key information extraction'}]},
 {'role': 'assistant',
  'content': [{'type': 'text',
    'text': {'date': '26/05/2018',
     'doc_no_receipt_no': 'CS00013125',
     'seller_address': 'LOT 276 JALAN BANTING \n43800 DENGKIL , SELANGOR .',
     'seller_gst_id': '000781500416',
     'seller_name': 'KEDAI PAPAN YEW CHUAN',
     'seller_phone': '03-87686092',
     'total_amount': '121.90',
     'total_tax': '6.90'}}]}]

In [29]:
# Print the prompt text_generator returns for the first sample; print() keeps
# the template's line breaks readable.
# NOTE(review): this cell's recorded execution count (29) precedes the
# processor cell's (30), so the saved output may come from a different
# checkpoint than the one currently configured — confirm with Restart & Run All.
print(text_generator(test_dataset[0]))

<key-information-extraction><date>26/05/2018</date><doc_no_receipt_no>CS00013125</doc_no_receipt_no><seller_address>LOT 276 JALAN BANTING 
43800 DENGKIL , SELANGOR .</seller_address><seller_gst_id>000781500416</seller_gst_id><seller_name>KEDAI PAPAN YEW CHUAN</seller_name><seller_phone>03-87686092</seller_phone><total_amount>121.90</total_amount><total_tax>6.90</total_tax></key-information-extraction>
##############################
<class 'transformers.feature_extraction_utils.BatchFeature'>
<|im_start|>system
You are a highly advanced Vision Language Model (VLM), specialized in analyzing, describing, and interpreting visual data. 
Your task is to process and extract meaningful insights from images, videos, and visual patterns, 
leveraging multimodal understanding to provide accurate and contextually relevant information.<|im_end|>
<|im_start|>user
<|vision_start|><|image_pad|><|vision_end|>Perform key information extraction<|im_end|>
<|im_start|>assistant



In [31]:
# Same call as the previous cell, executed again after the processor cell was
# re-run — presumably to compare the chat template of another checkpoint.
# NOTE(review): duplicate cell; consider keeping a single call.
print(text_generator(test_dataset[0]))

<key-information-extraction><date>26/05/2018</date><doc_no_receipt_no>CS00013125</doc_no_receipt_no><seller_address>LOT 276 JALAN BANTING 
43800 DENGKIL , SELANGOR .</seller_address><seller_gst_id>000781500416</seller_gst_id><seller_name>KEDAI PAPAN YEW CHUAN</seller_name><seller_phone>03-87686092</seller_phone><total_amount>121.90</total_amount><total_tax>6.90</total_tax></key-information-extraction>
##############################
<class 'transformers.feature_extraction_utils.BatchFeature'>
<|im_start|>System: You are a highly advanced Vision Language Model (VLM), specialized in analyzing, describing, and interpreting visual data. 
Your task is to process and extract meaningful insights from images, videos, and visual patterns, 
leveraging multimodal understanding to provide accurate and contextually relevant information.<end_of_utterance>
User:<image>Perform key information extraction<end_of_utterance>
Assistant:
