# Fine-Tuning and Re-evaluating the Llama 3.2 Vision Model on Receipt Recognition Tasks

## Load the model and set up fine-tuning configurations

In [1]:
from huggingface_hub import login

# Must be a Hugging Face access token with write access
login(token="hf_...")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
from transformers import AutoModelForVision2Seq, AutoProcessor, BitsAndBytesConfig

import os
os.environ["LD_LIBRARY_PATH"] = os.environ.get("LD_LIBRARY_PATH", "") + ":/sw/external/nvhpc/22.11/Linux_x86_64/22.11/REDIST/cuda/10.2/lib64"

# Hugging Face model id
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"

# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

# Load model and tokenizer
model = AutoModelForVision2Seq.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config
)

processor = AutoProcessor.from_pretrained(model_id)

Downloading shards: 100%|██████████| 5/5 [00:00<00:00, 19.73it/s]
The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.
Loading checkpoint shards: 100%|██████████| 5/5 [00:32<00:00,  6.49s/it]


In [3]:
from peft import LoraConfig

# LoRA configuration based on QLoRA paper
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=8,
    bias="none",
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM"
)

In [4]:
from trl import SFTConfig


args = SFTConfig(
    output_dir="fine-tuned-visionllama",    # directory to save and repository id
    num_train_epochs=5,                     # number of training epochs
    per_device_train_batch_size=4,          # batch size per device during training
    gradient_accumulation_steps=8,          # number of steps before performing a backward/update pass
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    optim="adamw_torch_fused",              # use fused adamw optimizer
    logging_steps=5,                        # log every 10 steps
    save_strategy="epoch",                  # save checkpoint every epoch
    learning_rate=2e-4,                     # learning rate, based on QLoRA paper
    bf16=True,                              # use bfloat16 precision
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant",           # use constant learning rate scheduler
    push_to_hub=True,                       # push model to hub
    gradient_checkpointing_kwargs = {"use_reentrant": False},    # use reentrant checkpointing
    dataset_text_field="",                  # need a dummy field for collator
    dataset_kwargs = {"skip_prepare_dataset": True}    # important for collator
)

args.remove_unused_columns=False

## Load in Q&A data 

In [5]:
import json

with open("qa_dataset.json", "r") as f:
    qa_data = json.load(f)

### Checking the data format

In [6]:
qa_data[0]

{'question': 'What is the total service charge on the receipt?',
 'answer': '100,950'}

### Cleaning and correcting the data

The Q&A pairs are generated by Llama3.1 model based on ground truth annotations. Besides cleaning the formatting in the Q&A pairs, we also found there were some calculation errors or hallucinations the LLM made that led to wrong answers. We manually fixed the ones we found, and saved it back to the QA dataset JSON file.

** All code are commented out because the cleaned version is already save so there is no need to run them again.

In [7]:
# The questions were intially in the format of '**Q:** What is the total service charge on the receipt?\n**'
# So we would like to strip the '**Q:**' in the beginning, '**' in the end, and the extra space or new lines on both side

# for qa in qa_data:
#     qa['question'] = qa['question'].lstrip('**Q:**').rstrip('**').strip()

In [66]:
# The answer for the question 1 : 'What is the total price of all the Tahu items on the receipt?' was 132,000 which is wrong
# qa_data[1]['answer'] = '116,000'

# The answer for the question 4: 'What is the total price of all the Ceker Ayam items?' was 180,000 which is wrong
# It's probably because the LLM thought the unit price for Ceker Ayam is 60000, and there are indeed 3 Ceker Ayam
# But actually 60000 is the total price of all 3 Ceker Ayam
# qa_data[4]['answer'] = '60,000'

# The answer for the 980th question: 'What is the total amount paid at the checkout?' was '250.000' instead of '0'
# It might be that the LLM thought the word 'checkout' need to be specified in order to count it as 'paid at the checkout'
# qa_data[980]['answer'] = '250.000'

# The answer for the 1021st question: 'What is the total tax amount paid on the receipt?' was ',.,,.' because of a previous parsing problem
# it wasn't actually wrong because those will be handles as 0 during evaluation, but it's better to clean it up
# qa_data[1021]['answer'] = '0'

# The answer for the 1048th question: 'What is the total amount of the purchase?' was empty
# qa_data[1048]['answer'] = '28.000'

# The answer for the 1364th question: 'What is the total amount of the transaction?' was empty
# qa_data[1364]['answer'] = '15000'

# The answer for the 1390th question: 'What is the total amount of the purchase?' was empty
# qa_data[1390]['answer'] = '20,000'

# The answer for the 1393rd question: 'What is the service charge added to the order?' was ',,.' because of a previous parsing problem
# it wasn't actually wrong because those will be handles as 0 during evaluation, but it's better to clean it up
# qa_data[1393]['answer'] = '0'

# All answers that are empty should be zero
# for qa in qa_data:
#     if qa['answer'].strip() == '':
#         qa['answer'] = '0'

In [68]:
# with open('qa_dataset.json', 'w') as file:
#     json.dump(qa_data, file)

### Transform the data into the format the vision model needed

In [10]:
def format_data(image, question, answer=None, train=False):
    messages = [
        {
            "role": "user", 
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {
                    "type": "text", 
                    "text": f"{question}\n\nPlease return only the answer in the exact format of a single numeric value : '<number>'. Do not include steps, explanations, or intermediate calculations."
                }
            ]
        }
    ]
    
    if train:
        messages.append(
            {
                "role": "assistant",
                "content": [
                    {
                        "type": "text", 
                        "text": answer
                    }
                ],
            }
        )
    
    return messages

### Split the data into train and evaluation set

In [11]:
from datasets import load_dataset

# Format and store Q&A pairs
dataset_hf = load_dataset("naver-clova-ix/cord-v1", split="train")
train_dataset = []
eval_dataset = []

# Take first 50% percent of the Q&A pairs as training dataset, 
train_split_end = len(qa_data) // 2

idx = 0
for data in dataset_hf:
    
    # There are two questions for each image in the dataset
    for _ in range(2):
        if idx < train_split_end:    
            # Prepare the training dataset   
            train_dataset.append(format_data(image=data['image'], question=qa_data[idx]['question'], answer=qa_data[idx]['answer'], train=True))
        else:    
            # Prepare the evaluation dataset    
            eval_dataset.append(format_data(image=data['image'], question=qa_data[idx]['question'], train=False))

        idx += 1

In [12]:
len(qa_data)

1600

In [13]:
len(train_dataset)

800

In [14]:
train_dataset[0]

[{'role': 'user',
  'content': [{'type': 'image',
    'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=864x1296>},
   {'type': 'text',
    'text': "What is the total service charge on the receipt?\n\nPlease return only the answer in the exact format of a single numeric value : '<number>'. Do not include steps, explanations, or intermediate calculations."}]},
 {'role': 'assistant', 'content': [{'type': 'text', 'text': '100,950'}]}]

In [15]:
len(eval_dataset)

800

In [16]:
eval_dataset[0]

[{'role': 'user',
  'content': [{'type': 'image',
    'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=576x864>},
   {'type': 'text',
    'text': "What is the total tax amount added to the subtotal of the receipt?\n\nPlease return only the answer in the exact format of a single numeric value : '<number>'. Do not include steps, explanations, or intermediate calculations."}]}]

### Fine-tune the vision model on the training set

In [17]:
from qwen_vl_utils import process_vision_info

# Define a custom collator function
def collate_fn(examples):
    texts = [processor.apply_chat_template(example, tokenize=False) for example in examples]
    image_inputs = [process_vision_info(example)[0] for example in examples]

    batch = processor(text=texts, images=image_inputs, return_tensors="pt", padding=True)
    labels = batch["input_ids"].clone()
    labels[labels == processor.tokenizer.pad_token_id] = -100

    # Mask image tokens in the labels
    image_tokens = [processor.tokenizer.convert_tokens_to_ids(processor.image_token)]
    for image_token_id in image_tokens:
        labels[labels == image_token_id] = -100
    batch["labels"] = labels

    return batch

from trl import SFTTrainer

# Trainer setup
trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    data_collator=collate_fn,
    tokenizer=processor.tokenizer,
    peft_config=peft_config
)

# Start training
trainer.train()

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
5,3.437
10,2.2764
15,1.1451
20,0.702
25,0.6466
30,0.5836
35,0.528
40,0.4384
45,0.4227
50,0.3898


TrainOutput(global_step=125, training_loss=0.5543112316131592, metrics={'train_runtime': 6224.3025, 'train_samples_per_second': 0.643, 'train_steps_per_second': 0.02, 'total_flos': 1.4666944228684464e+16, 'train_loss': 0.5543112316131592, 'epoch': 5.0})

### Evaluate the fine-tuned vision model 

In [None]:
# Load in the fine-tuned model if needed

# model = AutoModelForCausalLM.from_pretrained("./fine-tuned-visionllama").to("cuda")  # or "cpu" if no GPU available

In [41]:
import re
import math

# Function to parse the predicted answer
def parse_answer(answer):
    """
    Extracts the numeric portion of the predicted answer, handling spaces, punctuation, and tokens.
    """
    if answer:
        # Extract the numeric value
        match = re.search(r"\b\d+([,]\d{3})*(\.\d+)?\b", answer, re.IGNORECASE)
        if match:
            result = match.group()
            return result
    print(f"No numeric value found in the predicted answer: {answer}.")
    return None


# Function to process the true answer
def process_true_answer(answer):
    """
    Extracts the numeric portion of the true answer, handling spaces and punctuation.
    """
    if answer:
        # Extract the numeric value
        match = re.search(r"\b\d+([,]\d{3})*(\.\d+)?\b", answer, re.IGNORECASE)
        if match:
            result = match.group()
            return result
    print(f"No numeric value found in the true answer: {answer}.")
    return None


# Function to compare answers
def answers_match(predicted_value, true_value):
    """
    Compares the given strings that can be potentially transformed into numeric values.
    If the strings can be succesfully transformed into float numbers, then compare if they represents the same value.
    If the strings are not numeric numbers or they do not represent the same value, return false.
    """
    try:
        if predicted_value:
            pred = float(predicted_value.replace(',', ''))
        else: 
            pred = 0
        if true_value:
            answer = float(true_value.replace(',', ''))
        else:
            answer = 0
            
    except ValueError:
        print(f"Either the predicted value {predicted_value} or the answer {true_value} is not a float!")
        return False
    
    return math.isclose(pred, answer)

In [69]:
from torch.cuda.amp import autocast

# Initialize variables for evaluation
total_questions = 0
correct_answers = 0

model.eval()  # Set model to evaluation mode
with torch.no_grad():
    # Main loop for getting the evaluation result
    for message, data in zip(eval_dataset, qa_data[-len(eval_dataset):]):
        inputs_text = processor.apply_chat_template(message, add_generation_prompt=True)
        processed_image = process_vision_info(message)[0]

        # Combine processed text and image into a model-compatible format
        input = processor(
            text=[inputs_text],        # Single example requires wrapping in a list
            images=[processed_image],  # Single example requires wrapping in a list
            return_tensors="pt",
            padding=True
        ).to(model.device)
        
        # Generate output
        with autocast(dtype=torch.bfloat16):
            output = model.generate(**input, max_new_tokens=50)
        predicted_answer = processor.decode(output[0])
        match = re.search(r"\s*<\|eot_id\|><\|start_header_id\|>assistant<\|end_header_id\|>\s*(.*?)\s*<\|eot_id\|>\s*", predicted_answer, re.IGNORECASE)
        if match:
            predicted_answer = match.group(1)
        true_answer = data['answer']

        # Parse both predicted and true answers
        parsed_predicted = parse_answer(predicted_answer)
        parsed_true = process_true_answer(true_answer)
        
        print(f"Q{total_questions}:")
        
        # Compare the parsed answers, considering both direct match and fallback
        if answers_match(parsed_predicted, parsed_true):
            correct_answers += 1

        total_questions += 1
        
        print(f"Processed Question: {data['question']}")
        print(f"Model Output: {predicted_answer} -> Parsed: {parsed_predicted}")
        print(f"True Answer: {true_answer} -> Parsed: {parsed_true}")
        print(f"Correct Answers So Far: {correct_answers}/{total_questions}\n")

    # Compute final accuracy
    accuracy = (correct_answers / total_questions) * 100
    print(f"Final Accuracy: {accuracy:.2f}%")

  with autocast(dtype=torch.bfloat16):


Q0:
Processed Question: What is the total tax amount added to the subtotal of the receipt?
Model Output: 10,000 -> Parsed: 10,000
True Answer: 10,000 -> Parsed: 10,000
Correct Answers So Far: 1/1

Q1:
Processed Question: What is the total service charge amount added to the subtotal of the receipt?
Model Output: 0,003 -> Parsed: 0,003
True Answer: 0 -> Parsed: 0
Correct Answers So Far: 1/2

Q2:
Processed Question: What is the service charge for the items on this receipt?
Model Output: 0 -> Parsed: 0
True Answer: 0 -> Parsed: 0
Correct Answers So Far: 2/3

Q3:
Processed Question: What is the subtotal amount before tax on this receipt?
Model Output: 36.001 -> Parsed: 36.001
True Answer: 32.728 -> Parsed: 32.728
Correct Answers So Far: 2/4

Q4:
Processed Question: What is the amount of tax charged on the subtotal?
Model Output: 22.500 -> Parsed: 22.500
True Answer: 22,500 -> Parsed: 22,500
Correct Answers So Far: 2/5

Q5:
Processed Question: What is the subtotal of the purchase?
Model Outp