### Import libraries

In [1]:
import os
import sys
sys.path.append(os.path.dirname(os.getcwd()))

from dataset import *
from PIL import Image
import json

  from .autonotebook import tqdm as notebook_tqdm


### Prepare Training Data

In [None]:
# System prompt used as the "system" turn of the conversations built by
# `format_data` (the "normal" and "xml" layouts).
system_message = """You are a highly advanced Vision Language Model (VLM), specialized in extracting visual data.
Your task is to process and extract meaningful insights from images that are asked in the prompt."""

In [None]:
from json2xml import json2xml
from json2xml.utils import readfromstring
from lxml import etree
import base64


def format_data(sample, train_type: str):
    """Turn one dataset sample into a chat-style conversation for fine-tuning.

    Args:
        sample: Dataset item exposing ``image_path``, ``entities`` (each with a
            ``label`` attribute) and ``to_json("kie")``.
        train_type: Conversation layout to build:
            ``"normal"``    - system turn, image + JSON prompt, JSON answer;
            ``"no-prompt"`` - image only, JSON answer;
            ``"xml"``       - system turn, image + XML prompt, XML answer.

    Returns:
        A list of ``{"role": ..., "content": [...]}`` message dicts whose last
        entry is the assistant turn holding the ground-truth answer.

    Raises:
        ValueError: If ``train_type`` is not one of the supported values.
    """
    if train_type not in ("normal", "no-prompt", "xml"):
        # Validate first so a typo is reported before any file is opened.
        raise ValueError(f"{train_type} value error")

    pil_image = Image.open(sample.image_path)
    kie_json = json.dumps(sample.to_json("kie"))

    user_content = [{"type": "image", "image": pil_image}]
    if train_type != "no-prompt":
        # De-duplicate while keeping first-seen order. A plain set() iterates
        # strings in an order that changes between interpreter runs (string
        # hashing is randomized per process), which made the prompt text differ
        # between a training session and a later inference session.
        field_names = list(dict.fromkeys(entity.label for entity in sample.entities))

        if train_type == "xml":
            format_name = "XML"
            xml_fields = "".join(f"<{field}>..</{field}>" for field in field_names)
            output_format = f"<kie>{xml_fields}</kie>"
        else:
            format_name = "JSON"
            output_format = {field: ".." for field in field_names}

        prompt = (
            "Extract the following {fields} from the above document. "
            "If a field is not present, return ''. "
            "Return the output in a valid {format_name} format like {output_format}"
        ).format(fields=field_names, format_name=format_name, output_format=output_format)
        user_content.append({"type": "text", "text": prompt})

    if train_type == "xml":
        # Convert the JSON answer to XML wrapped in <kie>, then re-serialize it
        # through lxml as a unicode string without pretty-printing.
        answer = json2xml.Json2xml(
            data=readfromstring(kie_json),
            wrapper="kie",
            pretty=False,
            attr_type=False
        ).to_xml()
        answer = etree.tostring(
            etree.fromstring(answer),
            encoding="unicode",
            pretty_print=False
        )
    else:
        answer = kie_json

    conversation = []
    if train_type != "no-prompt":
        conversation.append(
            {"role": "system", "content": [{"type": "text", "text": system_message}]}
        )
    conversation.append({"role": "user", "content": user_content})
    conversation.append(
        {"role": "assistant", "content": [{"type": "text", "text": answer}]}
    )
    return conversation

In [4]:
# Conversation layout handed to `format_data`, which accepts "normal",
# "no-prompt" or "xml".
train_type = "normal"

# Build the train and test conversation lists in one pass over the split names.
train_dataset, test_dataset = (
    [format_data(sample, train_type) for sample in SROIE(tasks=["kie"], split=split_name)]
    for split_name in ("train", "test")
)

# Show the first formatted training conversation as a sanity check.
train_dataset[0]

[{'role': 'system',
  'content': [{'type': 'text',
    'text': 'You are a highly advanced Vision Language Model (VLM), specialized in extracting visual data.\nYour task is to process and extract meaningful insights from images, leveraging multimodal understanding\nto provide accurate and contextually relevant information.'}]},
 {'role': 'user',
  'content': [{'type': 'image',
    'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=463x1013>},
   {'type': 'text',
    'text': "Extract the following ['total', 'company', 'address', 'date'] from the above document. If a field is not present, return ''. Return the output in a valid JSON format like {'total': '..', 'company': '..', 'address': '..', 'date': '..'}"}]},
 {'role': 'assistant',
  'content': [{'type': 'text',
    'text': '{"company": "BOOK TA .K (TAMAN DAYA) SDN BHD", "date": "25/12/2018", "address": "NO.53 55,57 & 59, JALAN SAGU 18, TAMAN DAYA, 81100 JOHOR BAHRU, JOHOR.", "total": "9.00"}'}]}]

### Training Pipeline

In [5]:
import torch
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from transformers import AutoProcessor, BitsAndBytesConfig, AutoModelForImageTextToText

In [6]:
import gc
import time


def clear_memory():
    """Drop the notebook's heavyweight globals and release cached CUDA memory.

    Removes ``inputs``, ``model``, ``processor``, ``trainer``, ``peft_model`` and
    ``bnb_config`` from the module globals when they exist, then alternates
    garbage collection and CUDA cache clearing with two-second pauses, and
    finally prints the allocated and reserved GPU memory in GB.
    """
    namespace = globals()
    for name in ("inputs", "model", "processor", "trainer", "peft_model", "bnb_config"):
        # pop() with a default removes the name only when it is defined.
        namespace.pop(name, None)
    time.sleep(2)

    # First collection pass, then hand cached blocks back to the CUDA driver.
    gc.collect()
    time.sleep(2)
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    time.sleep(2)
    gc.collect()
    time.sleep(2)

    allocated_gb = torch.cuda.memory_allocated() / 1024**3
    print(f"GPU allocated memory: {allocated_gb:.2f} GB")
    reserved_gb = torch.cuda.memory_reserved() / 1024**3
    print(f"GPU reserved memory: {reserved_gb:.2f} GB")

In [None]:
# Checkpoint identifier passed to `from_pretrained` for both the model and the processor.
model_id = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"

In [None]:
# Quantization settings for training: 4-bit NF4 weights with double
# quantization, computing in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

# Load the base checkpoint with the quantization config above, eager attention
# and automatic device placement.
model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    _attn_implementation="eager",
    device_map="auto"
    # torch_dtype is intentionally left unset here (bfloat16 was tried and disabled):
    #torch_dtype=torch.bfloat16
)

# Processor for the same checkpoint; later cells use its chat template and tokenizer.
processor = AutoProcessor.from_pretrained(model_id)

You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.


In [9]:
# Adapter mode switches. Only USE_QLORA is read in this cell.
USE_LORA = False
USE_QLORA = True

# Rank-8 LoRA on the attention and MLP projections; DoRA is enabled only when
# QLoRA is switched off.
lora_config = LoraConfig(
    r=8,
    lora_alpha=8,
    lora_dropout=0.1,
    target_modules=[
        'down_proj',
        'o_proj',
        'k_proj',
        'q_proj',
        'gate_proj',
        'up_proj',
        'v_proj',
    ],
    use_dora=not USE_QLORA,
    init_lora_weights="gaussian",
)

In [10]:
if train_type == "xml":
    # Register an opening/closing tag pair per field, followed by the wrapper pair.
    # NOTE(review): these are `<doc_*>` tags, while `format_data` prompts for and
    # emits plain `<field>` tags - confirm which naming is intended.
    tokenizer = processor.tokenizer
    for field in ('company', 'date', 'address', 'total'):
        tokenizer.add_tokens([f"<doc_{field}>", f"</doc_{field}>"])
    tokenizer.add_tokens(["<kie>", "</kie>"])

    # Grow the embedding matrix to cover the enlarged vocabulary.
    model.resize_token_embeddings(len(tokenizer))

    # Use the wrapper tags as generation start / stop markers.
    model.config.pad_token_id = tokenizer.pad_token_id
    model.config.decoder_start_token_id = tokenizer.convert_tokens_to_ids("<kie>")
    model.config.eos_token_id = tokenizer.convert_tokens_to_ids("</kie>")

In [11]:
# Id of the "<image>" placeholder token: find its position in the tokenizer's
# additional special tokens and read the id stored at the same position.
# `collate_fn` uses it to mask image positions in the labels.
image_token_id = processor.tokenizer.additional_special_tokens_ids[
    processor.tokenizer.additional_special_tokens.index("<image>")
]


def collate_fn(examples):
    """Batch chat conversations into model inputs with masked labels.

    Args:
        examples: Conversations in the layout produced by ``format_data``:
            lists of message dicts whose ``content`` is a list of typed parts.

    Returns:
        The processor output for the batch with an added ``labels`` entry: a
        copy of ``input_ids`` in which padding tokens and image tokens are
        replaced by -100.

    Raises:
        ValueError: If a conversation contains no image part.
    """
    texts = [processor.apply_chat_template(example, tokenize=False) for example in examples]

    image_inputs = []
    for example in examples:
        # Locate images by part type instead of by position. "no-prompt"
        # conversations have no system turn, so the user message sits at index 0
        # and `example[1]` is the assistant turn, which has no "image" key.
        images = []
        for message in example:
            content = message.get("content")
            if not isinstance(content, (list, tuple)):
                continue
            for part in content:
                if isinstance(part, dict) and part.get("type") == "image":
                    image = part["image"]
                    if image.mode != "RGB":
                        image = image.convert("RGB")
                    images.append(image)
        if not images:
            raise ValueError("conversation contains no image content")
        image_inputs.append(images)

    batch = processor(text=texts, images=image_inputs, return_tensors="pt", padding=True)
    labels = batch["input_ids"].clone()
    labels[labels == processor.tokenizer.pad_token_id] = -100  # Mask padding tokens in labels
    labels[labels == image_token_id] = -100  # Mask image token IDs in labels

    batch["labels"] = labels

    return batch

In [None]:
from trl import SFTConfig


training_args = SFTConfig(
    # Schedule and batch sizes
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    #max_steps=30,
    # Optimisation
    learning_rate=1e-4,
    logging_steps=1,
    save_steps=5,
    optim="adamw_torch_fused",
    weight_decay=0.01,
    output_dir=f"result/smolvlm2-{train_type}",
    fp16=True,
    remove_unused_columns=False,
    gradient_checkpointing=True,
    # The conversations are already formatted and batched by the custom
    # collator, so TRL's own dataset preparation is skipped.
    dataset_text_field="",
    dataset_kwargs={"skip_prepare_dataset": True},
    report_to="none",
    # Evaluation and checkpoint selection
    eval_strategy="steps",
    # Without an explicit value, eval_steps falls back to logging_steps (1),
    # which evaluates the whole test split after every optimizer step.
    eval_steps=5,
    save_strategy="best",
    metric_for_best_model="Accuracy",
    greater_is_better=True,  # the higher the acc metric the better
    label_names=["labels"]
)

In [None]:
from transformers.trainer_utils import EvalPrediction
import editdistance


def compute_metrics(processor):
    """Create the evaluation callback that scores decoded answers field by field.

    Args:
        processor: Object whose ``tokenizer`` provides ``pad_token_id`` and
            ``decode``.

    Returns:
        A function taking an ``EvalPrediction`` and returning
        ``{"Accuracy": float}``: the mean, over samples, of the average
        per-field normalized edit similarity between the predicted and the
        reference JSON answers. Samples whose text cannot be parsed score 0.
    """
    tokenizer = processor.tokenizer

    def _unpack(eval_pred):
        """Return ``(pred_ids, labels)`` for the supported prediction layouts."""
        predictions = eval_pred.predictions
        labels = getattr(eval_pred, "label_ids", None)
        if isinstance(predictions, (tuple, list)):
            if len(predictions) == 2 and getattr(predictions[0], "ndim", 2) != 3:
                # Layout produced by a logits-preprocessing hook: (pred_ids, labels).
                pred_ids, labels = predictions
            else:
                # Raw model outputs: the first entry holds the logits.
                pred_ids = predictions[0]
        else:
            pred_ids = predictions
        if getattr(pred_ids, "ndim", None) == 3:
            # (batch, sequence, vocab) logits -> most likely token per position.
            pred_ids = pred_ids.argmax(axis=-1)
        return pred_ids, labels

    def _decode(token_ids):
        """Decode ids to text, ignoring -100 placeholders and padding."""
        ignored = (-100, tokenizer.pad_token_id)
        kept = [token_id for token_id in token_ids if token_id not in ignored]
        return tokenizer.decode(kept, skip_special_tokens=True)

    def _sample_similarity(pred_text, label_text):
        """Average per-field similarity; raises if either answer is unparseable."""
        pred = json.loads(pred_text.split('Assistant: ')[1])
        label = json.loads(label_text.split('Assistant: ')[1])
        if not label:
            return 0.0

        total = 0.0
        for key, expected in label.items():
            if key not in pred:
                continue  # a missing field contributes 0
            expected_text, pred_text_value = str(expected), str(pred[key])
            # Normalise by the longer string so the score stays within [0, 1].
            # Dividing by the label length alone went negative whenever the
            # prediction was much longer than the reference.
            longest = max(len(expected_text), len(pred_text_value), 1)
            total += 1 - editdistance.eval(pred_text_value, expected_text) / longest
        return total / len(label)

    def inner_compute_metrics(eval_pred: EvalPrediction):
        # NOTE(review): if pred_ids are a per-position argmax of causal-LM logits
        # they are offset by one token relative to the labels - confirm that the
        # text comparison below tolerates this.
        pred_ids, labels = _unpack(eval_pred)

        similarities = []
        for pred_row, label_row in zip(pred_ids.tolist(), labels.tolist()):
            pred_text = _decode(pred_row)
            label_text = _decode(label_row)
            try:
                score = _sample_similarity(pred_text, label_text)
            except (IndexError, ValueError, TypeError, AttributeError):
                # No "Assistant: " marker, invalid JSON, or JSON that is not an
                # object: count the sample as a miss instead of aborting.
                score = 0.0
            similarities.append(score)

        return {
            "Accuracy": sum(similarities) / len(similarities) if similarities else 0.0
        }

    return inner_compute_metrics

In [None]:
from trl import SFTTrainer

def preprocess_logits_for_metrics(logits, labels):
    """Reduce evaluation logits to token ids before the trainer accumulates them.

    The metric callback unpacks ``(pred_ids, labels)`` from
    ``EvalPrediction.predictions``, so that pair is returned here. Keeping ids
    instead of full-vocabulary logits also shrinks what is held across
    evaluation steps.
    """
    if isinstance(logits, (tuple, list)):
        # Some models return extra outputs next to the logits; the logits come first.
        logits = logits[0]
    return logits.argmax(dim=-1), labels


trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=collate_fn,
    peft_config=lora_config,
    processing_class=processor.tokenizer,
    compute_metrics=compute_metrics(processor),
    preprocess_logits_for_metrics=preprocess_logits_for_metrics
)
# Force the trainer to treat the model as loss-returning.
# NOTE(review): presumably needed because the PEFT wrapper hides the base
# model's `labels` argument (see the label_names warning) - confirm.
trainer.can_return_loss = True

No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [15]:
# Run fine-tuning with the trainer configured in the previous cell.
trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 4.31 GiB. GPU 0 has a total capacity of 5.61 GiB of which 1.29 GiB is free. Process 2486 has 67.80 MiB memory in use. Including non-PyTorch memory, this process has 4.02 GiB memory in use. Of the allocated memory 3.29 GiB is allocated by PyTorch, and 649.18 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Save the trained model under training/…/final; the inference cells load the
# adapter from this same path.
trainer.save_model(output_dir=f"training/smolvlm2-{train_type}/final")

In [None]:
# Drop the training objects and free cached GPU memory before the model is
# reloaded for inference.
clear_memory()

In [None]:
# Quantization settings for inference: same 4-bit NF4 / double-quantization
# setup as the training load, but with bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Reload the base checkpoint (the adapter is attached in the next cell).
model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    _attn_implementation="eager",
    device_map="auto",
    torch_dtype=torch.bfloat16
)

# Fresh processor for the base checkpoint.
# NOTE(review): tokens added for train_type == "xml" are not re-added here - confirm.
processor = AutoProcessor.from_pretrained(model_id)

In [None]:
# Attach the adapter from the directory written by `trainer.save_model`.
model.load_adapter(f"training/smolvlm2-{train_type}/final")

In [None]:
# Maps each test image file name to the generated text and its generation time.
results = {}

# The results file lives in result/; create the directory so a fresh checkout works.
os.makedirs("result", exist_ok=True)

# NOTE(review): pairing by position assumes the dataset yields test samples in
# sorted file-name order (and has one sample per file) - confirm against SROIE.
test_images = sorted(os.listdir("data/sroie/test/img"))
for data, fn in zip(test_dataset, test_images):
    # `test_dataset` conversations end with the ground-truth assistant turn.
    # Drop it, otherwise the reference answer is part of the prompt and the
    # model is asked to continue after already seeing it.
    prompt_messages = [message for message in data if message["role"] != "assistant"]

    inputs = processor.apply_chat_template(
        prompt_messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to("cuda", dtype=torch.bfloat16)

    # Time only the generation call.
    start = time.time()
    generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=1000)
    end = time.time()

    generated_texts = processor.batch_decode(
        generated_ids,
        skip_special_tokens=True,
    )
    results[fn] = dict(
        response = generated_texts[0],
        inference_time = end - start
    )

    # Rewrite the file after every sample so partial results survive an interruption.
    with open(f"result/smolvlm2-{train_type}.json", "w") as f:
        json.dump(results, f, indent=4)