### Import libraries

In [None]:
from PIL import Image
import os
import json

In [2]:
system_message = """You are a highly advanced Vision Language Model (VLM), specialized in extracting visual data. 
Your task is to process and extract meaningful insights from images, leveraging multimodal understanding
to provide accurate and contextually relevant information."""

### Read and format data

In [3]:
def read_sroie_data(split: str):
    root_path = "../../data/sroie"
    imgs_folder_path = f"{root_path}/{split}/img"
    labels_folder_path = f"../../data/sroie/{split}/entities"

    data = []
    for label_fn in os.listdir(labels_folder_path):
        with open(os.path.join(labels_folder_path, label_fn), "r") as f:
            json_f: dict = json.load(f)

        data.append(dict(
            img_path = os.path.join(imgs_folder_path, label_fn.replace(".txt", ".jpg")),
            label = json_f
        ))

    return data

In [4]:
train_dataset = read_sroie_data("train")
test_dataset = read_sroie_data("test")

train_dataset[0]

{'img_path': '../../data/sroie/train/img/X51006387847.jpg',
 'label': {'company': 'BEMED (SP) SDN. BHD.',
  'date': '15/APR/2017',
  'address': 'NO.49, JALAN DINAR G U3/G, SUBANG PERDANA, 40150 SHAH ALAM, SELANGOR D.E.',
  'total': '635.00'}}

In [5]:
def format_data(sample: dict):
    pil_image = Image.open(sample["img_path"])

    field_names = list(sample["label"].keys())
    output_format = {field: ".." for field in field_names}

    prompt = "Extract the following {fields} from the above document. If a field is not present, return ''. Return the output in a valid JSON format like {output_format}" \
        .format(
            fields = field_names,
            output_format = output_format
        )

    return [
        {
            "role": "system",
            "content": [{"type": "text", "text": system_message}]
        },
        {
            "role": "user",
            "content": [
                { "type": "image", "image": pil_image },
                { "type": "text", "text": prompt }
            ]
        },
        {
            "role": "assistant",
            "content": [{ "type": "text", "text": json.dumps(sample["label"])}]
        }
    ]

In [6]:
train_dataset = [format_data(sample) for sample in train_dataset]
test_dataset = [format_data(sample) for sample in test_dataset]

In [7]:
train_dataset[0]

[{'role': 'system',
  'content': [{'type': 'text',
    'text': 'You are a highly advanced Vision Language Model (VLM), specialized in extracting visual data. \nYour task is to process and extract meaningful insights from images, leveraging multimodal understanding\nto provide accurate and contextually relevant information.'}]},
 {'role': 'user',
  'content': [{'type': 'image',
    'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=707x1688>},
   {'type': 'text',
    'text': "Extract the following ['company', 'date', 'address', 'total'] from the above document. If a field is not present, return ''. Return the output in a valid JSON format like {'company': '..', 'date': '..', 'address': '..', 'total': '..'}"}]},
 {'role': 'assistant',
  'content': [{'type': 'text',
    'text': '{"company": "BEMED (SP) SDN. BHD.", "date": "15/APR/2017", "address": "NO.49, JALAN DINAR G U3/G, SUBANG PERDANA, 40150 SHAH ALAM, SELANGOR D.E.", "total": "635.00"}'}]}]

### Training pipeline

In [None]:
# define model

import torch
from transformers import Idefics3ForConditionalGeneration, AutoProcessor
from transformers import BitsAndBytesConfig

model_id = "HuggingFaceTB/SmolVLM-Instruct"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

model = Idefics3ForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config
)

processor = AutoProcessor.from_pretrained(model_id)

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
from peft import LoraConfig, get_peft_model

peft_config = LoraConfig(
    r=8,
    lora_alpha=8,
    lora_dropout=0.1,
    target_modules=["down_proj", "o_proj", "k_proj", "q_proj", "gate_proj", "up_proj", "v_proj"],
    use_dora=True,
    init_lora_weights="gaussian",
)

peft_model = get_peft_model(model, peft_config)

peft_model.print_trainable_parameters()

trainable params: 11,269,248 || all params: 2,257,542,128 || trainable%: 0.4992


In [10]:
image_token_id = processor.tokenizer.additional_special_tokens_ids[
    processor.tokenizer.additional_special_tokens.index("<image>")
]


def collate_fn(examples):
    texts = [processor.apply_chat_template(example, tokenize=False) for example in examples]

    image_inputs = []
    for example in examples:
        image = example[1]["content"][0]["image"]
        if image.mode != "RGB":
            image = image.convert("RGB")
        image_inputs.append([image])

    batch = processor(text=texts, images=image_inputs, return_tensors="pt", padding=True)
    labels = batch["input_ids"].clone()
    labels[labels == processor.tokenizer.pad_token_id] = -100  # Mask padding tokens in labels
    labels[labels == image_token_id] = -100  # Mask image token IDs in labels

    batch["labels"] = labels

    return batch

In [None]:
from trl import SFTConfig

# Configure training arguments using SFTConfig
training_args = SFTConfig(
    output_dir="training/single_test/smolvlm-sroie",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    warmup_steps=50,
    learning_rate=1e-4,
    weight_decay=0.01,
    logging_steps=5,
    save_strategy="steps",
    save_steps=20,
    save_total_limit=1,
    optim="adamw_torch_fused",
    bf16=True,
    remove_unused_columns=False,
    gradient_checkpointing=True,
    dataset_text_field="",
    dataset_kwargs={"skip_prepare_dataset": True},
)

In [None]:
from trl import SFTTrainer

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=collate_fn,
    peft_config=peft_config,
    processing_class=processor.tokenizer
)

No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [13]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
5,1.6221
10,1.5684
15,1.4297
20,1.2449
25,1.0269
30,0.8185
35,0.5858
40,0.3718
45,0.2352
50,0.1451


TrainOutput(global_step=120, training_loss=0.40667652860283854, metrics={'train_runtime': 1697.907, 'train_samples_per_second': 1.106, 'train_steps_per_second': 0.071, 'total_flos': 3.347114271466656e+16, 'train_loss': 0.40667652860283854})

In [None]:
trainer.save_model(output_dir="training/single_test/smolvlm-sroie/final")

In [15]:
import gc
import time


def clear_memory():
    # Delete variables if they exist in the current global scope
    if "inputs" in globals():
        del globals()["inputs"]
    if "model" in globals():
        del globals()["model"]
    if "processor" in globals():
        del globals()["processor"]
    if "trainer" in globals():
        del globals()["trainer"]
    if "peft_model" in globals():
        del globals()["peft_model"]
    if "bnb_config" in globals():
        del globals()["bnb_config"]
    time.sleep(2)

    # Garbage collection and clearing CUDA memory
    gc.collect()
    time.sleep(2)
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    time.sleep(2)
    gc.collect()
    time.sleep(2)

    print(f"GPU allocated memory: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
    print(f"GPU reserved memory: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")


clear_memory()

GPU allocated memory: 0.02 GB
GPU reserved memory: 2.47 GB
