We need to first load the dataset, the processor and the model.

In [None]:
import os

from huggingface_hub import login

# Never commit access tokens to source control: read the token from the
# HF_TOKEN environment variable instead. When the variable is unset, `token`
# is None and login() falls back to its interactive prompt.
# NOTE: the token that used to be hard-coded here must be considered leaked
# and should be revoked at https://huggingface.co/settings/tokens.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
# https://huggingface.co/docs/transformers/en/model_doc/mllama

import os

# Select the visible GPU *before* torch / transformers are imported, so the
# setting is guaranteed to be in place before any CUDA initialisation happens.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

import torch
from transformers import AutoProcessor, AutoTokenizer, MllamaForConditionalGeneration


model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
# The processor is used below both to render/tokenize the chat text and to
# preprocess the images (see the data collator).
processor = AutoProcessor.from_pretrained(model_id)
print(processor)


In [None]:
# Load the stand-alone tokenizer for the same checkpoint and print it for inspection.
tokenizer = AutoTokenizer.from_pretrained(model_id)
print(tokenizer)

In [None]:
# Instead of loading the plain pretrained model and moving it to the device
# ourselves, we fine-tune with a PEFT strategy (LoRA / QLoRA).
# https://huggingface.co/docs/peft/en/index

from transformers import BitsAndBytesConfig

from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

USE_LORA = True
USE_QLORA = True

## Load model

# Three options for training, from the lowest precision training to the highest precision training:
# - QLora
# - Standard Lora
# - Full fine-tuning
#
# For full fine-tuning, Flash Attention could additionally be used to speed up
# the model on supported devices (it is not enabled here), see
# https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#installation-and-features

# The training arguments further down use bf16=True, so the weights and the
# 4-bit compute dtype are loaded as bfloat16 as well. Mixing float16 weights
# with bf16 training is inconsistent and risks fp16 under/overflow.
MODEL_DTYPE = torch.bfloat16

# Only QLoRA quantizes the base model; LoRA and full fine-tuning load the
# unquantized weights (quantization_config=None is the library default).
bnb_config = None
if USE_QLORA:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=MODEL_DTYPE
    )

model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=MODEL_DTYPE,
    quantization_config=bnb_config,
    low_cpu_mem_usage=True,
)




def find_all_linear_names(model):
    """Return the leaf names of the ``torch.nn.Linear`` layers to adapt with LoRA.

    Every module whose qualified name contains ``multi_modal_projector`` or
    ``vision_model`` is skipped, and ``lm_head`` is never reported.

    Args:
        model: Any object exposing ``named_modules()`` (a ``torch.nn.Module``).

    Returns:
        A list of unique leaf module names (the last dotted component of each
        qualified name), in no particular order.
    """
    skipped_markers = ('multi_modal_projector', 'vision_model')

    leaf_names = {
        qualified_name.rsplit('.', 1)[-1]
        for qualified_name, layer in model.named_modules()
        if isinstance(layer, torch.nn.Linear)
        and not any(marker in qualified_name for marker in skipped_markers)
    }

    # The output head is excluded from the targets (needed for 16-bit).
    leaf_names.discard('lm_head')
    return list(leaf_names)

# QLoRA is LoRA on top of a 4-bit quantized base model, so adapters must be
# attached whenever either flag is set. Checking only USE_LORA would leave a
# quantized model without any trainable parameters when USE_QLORA is set alone.
if USE_LORA or USE_QLORA:

    lora_config = LoraConfig(
        r=8,
        lora_alpha=8,
        lora_dropout=0.1,
        # Adapt every language-model linear layer (vision tower, projector and
        # lm_head are excluded by find_all_linear_names).
        target_modules=find_all_linear_names(model),
        init_lora_weights="gaussian",
    )
    if USE_QLORA:
        # Prepare the quantized model for training before wrapping it (see the
        # PEFT documentation for what this helper changes).
        model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, lora_config)

    # Sanity check: report how many parameters are actually trainable.
    model.print_trainable_parameters()

In [None]:
from datasets import load_dataset

# Load the "train" split of the dataset identified by dataset_id and print it.
dataset_id = "philschmid/amazon-product-descriptions-vlm"
dataset = load_dataset(dataset_id, split="train")
print(dataset)


In [None]:
# Take the first sample of the dataset and print all of its fields.
example = dataset[0]
print(example)


In [None]:
# Inspect the "image" field of the first sample.
example["image"]

In [None]:
# Inspect the "description" field of the first sample (used as the training target in format_data).
example["description"]

In [None]:
# Inspect the "Product Name" field of the first sample (inserted into the prompt by format_data).
example["Product Name"]

In [10]:
# The dataset has to be converted into the chat-message format used with the TRL trainer.
# https://huggingface.co/docs/trl/en/sft_trainer
# https://blog.futuresmart.ai/fine-tune-llama-32-vision-language-model-on-custom-datasets
# https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct/discussions/31

prompt = """Create a Short Product description based on the provided ##PRODUCT NAME## and ##CATEGORY## and image.
Only return description. The description should be SEO optimized and for a better mobile search experience.

##PRODUCT NAME##: {product_name}
##CATEGORY##: {category}"""


def format_data(sample):
    """Convert one raw dataset row into a chat-style training sample.

    Args:
        sample: Mapping with the keys "Product Name", "Category",
            "description" and "image".

    Returns:
        A dict with two entries: "messages", a user turn (the filled-in prompt
        text followed by an image placeholder) and an assistant turn holding
        the description; and "images", a one-element list with the row's image.
    """
    instruction = prompt.format(
        product_name=sample["Product Name"],
        category=sample["Category"],
    )

    user_turn = {
        'content': [
            {'text': instruction, 'type': 'text'},
            # Placeholder entry; the actual image travels in "images".
            {'text': None, 'type': 'image'},
        ],
        'role': 'user',
    }
    assistant_turn = {
        'content': [{'text': sample["description"], 'type': 'text'}],
        'role': 'assistant',
    }

    return {
        "messages": [user_turn, assistant_turn],
        "images": [sample["image"]],
    }




In [11]:
# Materialise every dataset row in chat format (messages + images) as a plain list.
formatted_dataset = list(map(format_data, dataset))

In [12]:
# from qwen_vl_utils import process_vision_info

class DataCollator:
    """Collate chat-formatted samples into one padded text+image training batch.

    Each sample is expected to be a dict with a "messages" list (chat turns)
    and an "images" list whose first entry is the sample's image.
    """

    def __init__(self, processor):
        """Store the processor used to render, tokenize and preprocess samples.

        Args:
            processor: Object providing ``apply_chat_template``, a callable
                interface for text/images, a ``tokenizer`` and ``image_token``.
        """
        self.processor = processor

    def __call__(self, examples):
        """Encode the text/image pairs and build the loss labels.

        Args:
            examples: List of samples as described in the class docstring.

        Returns:
            The processor output extended with a "labels" entry: a copy of
            "input_ids" in which padding and image tokens are set to -100 so
            they are ignored by the loss.
        """
        texts = []
        images = []
        for example in examples:
            texts.append(
                self.processor.apply_chat_template(example["messages"], tokenize=False)
            )
            # One inner list per sample: a flat list of images would be read by
            # the Mllama processor as a single sample owning all the images,
            # which breaks as soon as the batch holds more than one sample.
            images.append([example["images"][0]])

        batch = self.processor(text=texts, images=images, return_tensors="pt", padding=True)

        # The labels are the input_ids, and we mask the padding tokens in the loss computation
        labels = batch["input_ids"].clone()
        tokenizer = self.processor.tokenizer
        if tokenizer.pad_token_id is not None:
            labels[labels == tokenizer.pad_token_id] = -100

        # Ignore the image token index in the loss computation (model specific).
        # Use the processor held by this instance, not a module-level global.
        image_token_id = tokenizer.convert_tokens_to_ids(self.processor.image_token)
        labels[labels == image_token_id] = -100
        batch["labels"] = labels

        return batch
    

# Collator instance bound to the processor loaded above; passed to the trainer below.
data_collator = DataCollator(processor)

In [None]:
# Inspect the first formatted sample (its "messages" and "images" entries).
formatted_dataset[0]

In [None]:
# Print the chat template that apply_chat_template uses to render the messages.
print(processor.chat_template)

In [None]:
# Smoke-test the collator on a batch containing only the first formatted sample.
data_collator([formatted_dataset[0]])

In [None]:
# Load the "train" split of a second dataset and look at its first sample.
# NOTE(review): presumably used as a reference for the expected
# messages/images layout; it is not used for training in this notebook.
standard_dataset = load_dataset("HuggingFaceH4/llava-instruct-mix-vsft", split="train")


standard_dataset[0]

In [None]:
from trl import (
    ModelConfig,
    SFTConfig,
    SFTTrainer,
    TrlParser,
    get_kbit_device_map,
    get_peft_config,
    get_quantization_config,
)

from transformers import TrainingArguments

# SFTConfig is TRL's training-arguments class for SFTTrainer. The SFT-specific
# options (dataset_text_field, dataset_kwargs) are set on it rather than being
# passed to SFTTrainer directly, which is deprecated.
training_args = SFTConfig(
    output_dir="./llama3.2_vision_instruct_output",
    learning_rate=1.4e-5,
    per_device_train_batch_size=4,
    num_train_epochs=2,
    bf16=True,
    # Keep the "messages"/"images" entries: the custom collator needs them.
    remove_unused_columns=False,
    push_to_hub=False,
    dataset_text_field="text",  # need a dummy field
    # The samples are already formatted and are encoded by the custom
    # collator, so TRL must not try to tokenize the dataset itself.
    dataset_kwargs={"skip_prepare_dataset": True},
    # gradient_checkpointing=False,
    # gradient_accumulation_steps=8,
)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=formatted_dataset,
    eval_dataset=None,
    # NOTE(review): newer TRL releases rename this argument to
    # `processing_class`; adjust if the installed version rejects `tokenizer`.
    tokenizer=processor.tokenizer,
    data_collator=data_collator,
)

trainer.train()



In [None]:
# Save the trained model to the configured output directory and, only when
# push_to_hub is enabled in the training arguments, upload it to the Hub.
trainer.save_model(training_args.output_dir)
if training_args.push_to_hub:
    trainer.push_to_hub()
    # Push the processor from the main process only, so that it is uploaded
    # once when several processes are running.
    # NOTE(review): this passes training_args.hub_model_id as the repo id;
    # confirm it is set before enabling push_to_hub.
    if trainer.accelerator.is_main_process:
        processor.push_to_hub(training_args.hub_model_id)