In [41]:
from datasets import load_dataset
from transformers import AutoProcessor
from PIL import Image
import io

In [68]:
# Hugging Face checkpoint whose processor is loaded below.
# Alternative checkpoint, currently disabled:
#model_id = "Qwen/Qwen2-VL-7B-Instruct"
model_id = "HuggingFaceTB/SmolVLM-Instruct"

# Processor for model_id; later cells use it to apply the chat template
# and to build model inputs from text and images.
processor = AutoProcessor.from_pretrained(model_id)

In [35]:

# Load only the first 1% of the test split. Passing the split as a plain
# string returns the dataset object directly, which is the same object the
# one-element list form yields at index 0.
test_dataset = load_dataset(
    "nanonets/key_information_extraction",
    split="test[:1%]",
)

# System prompt placed in the first turn of every formatted conversation.
# The line breaks and the single space preceding each of them are written out
# explicitly so that whitespace-stripping editors cannot silently change the
# prompt text.
system_message = (
    "You are a highly advanced Vision Language Model (VLM), specialized in analyzing, describing, and interpreting visual data. \n"
    "Your task is to process and extract meaningful insights from images, videos, and visual patterns, \n"
    "leveraging multimodal understanding to provide accurate and contextually relevant information."
)

In [48]:
def format_data(sample: dict):
    """Turn one dataset row into a three-turn chat conversation.

    Args:
        sample: Mapping with an "image" entry and an "annotations" entry.
            The image entry is wrapped in ``io.BytesIO``, so it must be a
            bytes-like object holding an encoded image.

    Returns:
        A list of three message dicts, in order: the system turn carrying
        ``system_message``, the user turn carrying the decoded PIL image
        followed by the extraction instruction, and the assistant turn
        carrying ``sample["annotations"]``.
    """
    # Decode the raw bytes into a PIL image backed by an in-memory buffer.
    pil_image = Image.open(io.BytesIO(sample["image"]))

    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": system_message}],
    }

    # The image comes first in the user content; text_generator reads it
    # from that position.
    image_part = {"type": "image", "image": pil_image}
    instruction_part = {
        "type": "text",
        "text": "Perform key information extraction",
    }
    user_turn = {"role": "user", "content": [image_part, instruction_part]}

    # NOTE(review): the annotations object is stored as-is under "text"; it
    # is not converted to a string here -- confirm that whatever consumes the
    # assistant turn accepts a non-string value.
    assistant_turn = {
        "role": "assistant",
        "content": [{"type": "text", "text": sample["annotations"]}],
    }

    return [system_turn, user_turn, assistant_turn]

In [50]:
# Convert every raw row into a chat conversation.
# NOTE: this rebinds test_dataset from the raw rows to the formatted
# conversations, so running this cell a second time without reloading the
# dataset would feed already-formatted lists into format_data and fail.
test_dataset = list(map(format_data, test_dataset))

In [51]:
# Display the first formatted conversation (system, user and assistant turns).
test_dataset[0]

[{'role': 'system',
  'content': [{'type': 'text',
    'text': 'You are a highly advanced Vision Language Model (VLM), specialized in analyzing, describing, and interpreting visual data. \nYour task is to process and extract meaningful insights from images, videos, and visual patterns, \nleveraging multimodal understanding to provide accurate and contextually relevant information.'}]},
 {'role': 'user',
  'content': [{'type': 'image',
    'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=928x2143>},
   {'type': 'text', 'text': 'Perform key information extraction'}]},
 {'role': 'assistant',
  'content': [{'type': 'text',
    'text': {'date': '26/05/2018',
     'doc_no_receipt_no': 'CS00013125',
     'seller_address': 'LOT 276 JALAN BANTING \n43800 DENGKIL , SELANGOR .',
     'seller_gst_id': '000781500416',
     'seller_name': 'KEDAI PAPAN YEW CHUAN',
     'seller_phone': '03-87686092',
     'total_amount': '121.90',
     'total_tax': '6.90'}}]}]

In [70]:
def text_generator(sample_data):
    """Render the prompt text for one formatted conversation.

    Only the first two turns of ``sample_data`` (system and user) are passed
    through the processor's chat template, with the generation prompt
    appended and tokenization disabled, so the result is a plain string.

    Args:
        sample_data: A conversation list whose element at index 1 is the
            user turn, with the image stored under the "image" key of its
            first content item.

    Returns:
        The templated prompt string.
    """
    prompt_turns = sample_data[:2]
    prompt = processor.apply_chat_template(
        prompt_turns,
        add_generation_prompt=True,
        tokenize=False,
    )

    user_turn = sample_data[1]
    image = user_turn["content"][0]["image"]

    # The processor is still run on the prompt and image, but its output is
    # not returned or used anywhere in this function.
    _model_inputs = processor(
        text=[prompt],
        images=image,
        return_tensors="pt",
    )

    return prompt

In [72]:
# Display the first formatted conversation.
test_dataset[0]

[{'role': 'system',
  'content': [{'type': 'text',
    'text': 'You are a highly advanced Vision Language Model (VLM), specialized in analyzing, describing, and interpreting visual data. \nYour task is to process and extract meaningful insights from images, videos, and visual patterns, \nleveraging multimodal understanding to provide accurate and contextually relevant information.'}]},
 {'role': 'user',
  'content': [{'type': 'image',
    'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=928x2143>},
   {'type': 'text', 'text': 'Perform key information extraction'}]},
 {'role': 'assistant',
  'content': [{'type': 'text',
    'text': {'date': '26/05/2018',
     'doc_no_receipt_no': 'CS00013125',
     'seller_address': 'LOT 276 JALAN BANTING \n43800 DENGKIL , SELANGOR .',
     'seller_gst_id': '000781500416',
     'seller_name': 'KEDAI PAPAN YEW CHUAN',
     'seller_phone': '03-87686092',
     'total_amount': '121.90',
     'total_tax': '6.90'}}]}]

In [71]:
# Print the rendered prompt for the first conversation; print() is used so
# the line breaks inside the prompt appear as real newlines.
print(text_generator(test_dataset[0]))

<|im_start|>System: You are a highly advanced Vision Language Model (VLM), specialized in analyzing, describing, and interpreting visual data. 
Your task is to process and extract meaningful insights from images, videos, and visual patterns, 
leveraging multimodal understanding to provide accurate and contextually relevant information.<end_of_utterance>
User:<image>Perform key information extraction<end_of_utterance>
Assistant:
