In [None]:
# !pip list

In [None]:
# !pip install tensorboard

In [None]:
# !pip uninstall -y transformers

In [None]:
!pip install  -U -q git+https://github.com/huggingface/transformers.git git+https://github.com/huggingface/trl.git datasets bitsandbytes peft qwen-vl-utils accelerate

In [None]:
!pip show transformers

In [None]:
 !pip install -q torch==2.4.1+cu121 torchvision==0.19.1+cu121 torchaudio==2.4.1+cu121 --extra-index-url https://download.pytorch.org/whl/cu121

In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# System prompt for the ChartQA conversations; format_data() places it in the
# "system" turn of every formatted sample.
system_message = """You are a Vision Language Model specialized in interpreting visual data from chart images.
Your task is to analyze the provided chart image and respond to queries with concise answers, usually a single word, number, or short phrase.
The charts include a variety of types (e.g., line charts, bar charts) and contain colors, labels, and text.
Focus on delivering accurate, succinct answers based on the visual information. Avoid additional explanation unless absolutely necessary."""

In [None]:
def format_data(sample):
    """Convert one dataset record into a three-turn chat conversation.

    Args:
        sample: Mapping with "image", "query" and "label" entries. Only the
            first element of "label" is used as the answer text.

    Returns:
        A list holding the system, user and assistant message dicts, in that
        order. The user turn carries the image followed by the query text.
    """
    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": system_message}],
    }

    image_part = {"type": "image", "image": sample["image"]}
    question_part = {"type": "text", "text": sample["query"]}
    user_turn = {"role": "user", "content": [image_part, question_part]}

    answer_part = {"type": "text", "text": sample["label"][0]}
    assistant_turn = {"role": "assistant", "content": [answer_part]}

    return [system_turn, user_turn, assistant_turn]

In [None]:
from datasets import load_dataset

# Pull only the first 1% of each ChartQA split to keep the run small.
dataset_id = "HuggingFaceM4/ChartQA"
train_dataset, eval_dataset, test_dataset = load_dataset(
    dataset_id,
    split=["train[:1%]", "val[:1%]", "test[:1%]"],
)

In [None]:
# Display the raw training split (features and row count) before formatting.
train_dataset

# Dataset({
#     features: ['image', 'query', 'label', 'human_or_machine'],
#     num_rows: 283
# })

In [None]:
# Rebind each split to a plain list of chat-formatted conversations.
train_dataset = list(map(format_data, train_dataset))
eval_dataset = list(map(format_data, eval_dataset))
test_dataset = list(map(format_data, test_dataset))

In [None]:
# Inspect one formatted conversation (system, user and assistant turns).
train_dataset[10]

# [{'role': 'system',
#   'content': [{'type': 'text',
#     'text': 'You are a Vision Language Model specialized in interpreting visual data from chart images.\nYour task is to analyze the provided chart image and respond to queries with concise answers, usually a single word, number, or short phrase.\nThe charts include a variety of types (e.g., line charts, bar charts) and contain colors, labels, and text.\nFocus on delivering accurate, succinct answers based on the visual information. Avoid additional explanation unless absolutely necessary.'}]},
#  {'role': 'user',
#   'content': [{'type': 'image',
#     'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=307x429>},
#    {'type': 'text', 'text': "What's the rightmost value dark brown graph?"}]},
#  {'role': 'assistant', 'content': [{'type': 'text', 'text': '47'}]}]

In [None]:
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor

# Checkpoint used for both the baseline and the fine-tuned model.
model_id = "Qwen/Qwen2.5-VL-3B-Instruct"

In [None]:
# Load the baseline checkpoint in bfloat16; device_map="auto" leaves weight
# placement to the loader.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
)
processor = AutoProcessor.from_pretrained(model_id)

In [None]:
# Inspect the first formatted conversation; it is reused below as the sanity-check sample.
train_dataset[0]

# [{'role': 'system',
#   'content': [{'type': 'text',
#     'text': 'You are a Vision Language Model specialized in interpreting visual data from chart images.\nYour task is to analyze the provided chart image and respond to queries with concise answers, usually a single word, number, or short phrase.\nThe charts include a variety of types (e.g., line charts, bar charts) and contain colors, labels, and text.\nFocus on delivering accurate, succinct answers based on the visual information. Avoid additional explanation unless absolutely necessary.'}]},
#  {'role': 'user',
#   'content': [{'type': 'image',
#     'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=422x359>},
#    {'type': 'text', 'text': 'Is the value of Favorable 38 in 2015?'}]},
#  {'role': 'assistant', 'content': [{'type': 'text', 'text': 'Yes'}]}]

In [None]:
# The slice [1:2] keeps only the user turn (image + query), dropping the
# system message and the reference answer.
train_dataset[0][1:2]

# [{'role': 'user',
#   'content': [{'type': 'image',
#     'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=422x359>},
#    {'type': 'text', 'text': 'Is the value of Favorable 38 in 2015?'}]}]

In [None]:
# Show the image attached to the first sample's user turn.
train_dataset[0][1]["content"][0]["image"]

In [None]:
from qwen_vl_utils import process_vision_info

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

def generate_text_from_sample(model, processor, sample, max_new_tokens=1024, device=device):
    """Generate the model's answer for a single formatted conversation.

    Args:
        model: Model exposing ``generate``.
        processor: Processor providing ``apply_chat_template``, tokenization
            and ``batch_decode``.
        sample: Conversation list as produced by ``format_data``. Only
            ``sample[1:2]`` (the user turn) goes into the prompt, so the
            system message and the reference answer are excluded.
        max_new_tokens: Upper bound on the number of generated tokens.
        device: Device the encoded inputs are moved to before generation.

    Returns:
        The decoded text of the first generated sequence, with the prompt
        tokens removed.
    """
    # Build the prompt from the user turn only and append the generation prompt.
    user_turn = sample[1:2]
    prompt = processor.apply_chat_template(user_turn, tokenize=False, add_generation_prompt=True)

    # Extract the image inputs from the full sample; the second return value is unused.
    images, _ = process_vision_info(sample)

    # Encode text and images together, then move the tensors to the target device.
    encoded = processor(text=[prompt], images=images, return_tensors="pt")
    encoded = encoded.to(device)

    output_ids = model.generate(**encoded, max_new_tokens=max_new_tokens)

    # generate() returns prompt + completion; keep only the completion part.
    completions = []
    for prompt_ids, full_ids in zip(encoded.input_ids, output_ids):
        completions.append(full_ids[len(prompt_ids):])

    decoded = processor.batch_decode(
        completions,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return decoded[0]

In [None]:
# Baseline check: generate an answer for the first training sample with the
# model loaded above, before any fine-tuning.
output = generate_text_from_sample(model, processor, train_dataset[0])
output

# 'Yes, the value of "Favorable" is 38 in 2015 according to the provided data.'

In [None]:
import gc
import time


def clear_memory():
    """Drop the notebook's large globals and release cached GPU memory.

    Removes ``inputs``, ``model``, ``processor``, ``trainer``, ``peft_model``
    and ``bnb_config`` from the module globals when they exist, then runs the
    garbage collector. When CUDA is available it also empties the CUDA cache,
    synchronizes, and prints the allocated and reserved GPU memory.

    The CUDA calls are guarded because this notebook also supports running on
    CPU, where ``torch.cuda.synchronize()`` raises instead of being a no-op.
    """
    cuda_available = torch.cuda.is_available()

    # Remove the references so the objects become collectable.
    for name in ("inputs", "model", "processor", "trainer", "peft_model", "bnb_config"):
        globals().pop(name, None)
    time.sleep(2)

    # Garbage collection and clearing CUDA memory
    gc.collect()
    time.sleep(2)
    if cuda_available:
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
    time.sleep(2)
    gc.collect()
    time.sleep(2)

    if cuda_available:
        print(f"GPU allocated memory: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
        print(f"GPU reserved memory: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")
    else:
        print("CUDA not available; skipped GPU cache clearing.")


# Drop the baseline model and processor before the next load.
clear_memory()

In [None]:
from transformers import BitsAndBytesConfig


# Load the model used for training. On CUDA it is loaded through a 4-bit NF4
# bitsandbytes config; on CPU it is loaded in bfloat16 without quantization.
if device == "cuda":
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto",
        use_cache=True,
    )
else:
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        use_cache=True,
    )

# The processor is loaded the same way on both paths.
processor = AutoProcessor.from_pretrained(model_id)

In [None]:
# Standalone tokenizer configured to pad on the right.
# NOTE(review): `tok` is not used by any active code in the visible cells —
# confirm whether it is still needed.
tok = AutoTokenizer.from_pretrained(model_id)
tok.padding_side = "right"

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA adapter settings: rank-8 adapters on the q_proj and v_proj modules.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "v_proj"],
)

# Wrap the loaded model with the adapters and report the trainable share.
peft_model = get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()

# trainable params: 1,843,200 || all params: 3,756,466,176 || trainable%: 0.0491

In [None]:
from trl import SFTConfig

# Training configuration for the supervised fine-tuning run.
training_args = SFTConfig(
    # --- Output and reporting ---
    output_dir="qwen2.5-3b-instruct-trl-sft-ChartQA",  # Checkpoints and final model go here
    push_to_hub=False,  # Keep everything local
    report_to="tensorboard",  # Metrics are written for TensorBoard
    # --- Epochs and batch sizing ---
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,  # Accumulate 4 batches per optimizer step
    # --- Optimizer and schedule ---
    optim="adamw_torch_fused",
    learning_rate=2e-4,
    lr_scheduler_type="constant",
    warmup_ratio=0,  # No warmup
    max_grad_norm=0.3,  # Gradient clipping threshold
    # --- Precision and memory ---
    bf16=True,  # Train in bfloat16
    tf32=False,  # TF32 is explicitly turned off
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    # --- Logging, evaluation and checkpoints (all every 10 steps) ---
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=10,
    save_strategy="steps",
    save_steps=10,
    load_best_model_at_end=True,  # Restore the best checkpoint after training
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # Lower eval_loss is better
    # --- Dataset handling: samples are prepared by the custom collator ---
    dataset_text_field="",
    dataset_kwargs={"skip_prepare_dataset": True},
    # max_seq_length=1024  # Maximum sequence length for input
)

# Set after construction: keep every column so the collator receives the full samples.
training_args.remove_unused_columns = False

In [None]:
%tensorboard --logdir qwen2.5-3b-instruct-trl-sft-ChartQA

In [None]:
# Marker tokens that Qwen2-VL / Qwen2.5-VL place around the image placeholder
# tokens (ids 151652 and 151653 in those tokenizers).
_VISION_MARKER_TOKENS = ("<|vision_start|>", "<|vision_end|>")


def _image_token_ids(processor):
    """Return the ids of the image-related tokens to exclude from the loss."""
    tokenizer = processor.tokenizer
    unk_id = getattr(tokenizer, "unk_token_id", None)

    # The image placeholder token is always masked.
    token_ids = [tokenizer.convert_tokens_to_ids(processor.image_token)]

    for token in _VISION_MARKER_TOKENS:
        token_id = tokenizer.convert_tokens_to_ids(token)
        # A tokenizer without these markers yields None or its unknown-token id.
        if token_id is not None and token_id != unk_id and token_id not in token_ids:
            token_ids.append(token_id)
    return token_ids


# Create a data collator to encode text and image pairs
def collate_fn(examples):
    """Encode a list of chat conversations into a padded training batch.

    Args:
        examples: List of conversations as produced by ``format_data``.

    Returns:
        The processor output with an added ``"labels"`` entry: a copy of
        ``input_ids`` where padding tokens and image-related tokens are set
        to -100 so they are ignored by the loss.
    """
    # Render each conversation to text with the chat template.
    texts = [processor.apply_chat_template(example, tokenize=False) for example in examples]
    # Extract the image inputs of each conversation.
    image_inputs = [process_vision_info(example)[0] for example in examples]

    # Tokenize the texts and process the images into padded tensors.
    batch = processor(text=texts, images=image_inputs, return_tensors="pt", padding=True)

    # Labels start as the input ids; padding positions are ignored by the loss.
    labels = batch["input_ids"].clone()
    labels[labels == processor.tokenizer.pad_token_id] = -100

    # Mask the image placeholder and the vision start/end markers. The previous
    # `isinstance(processor, AutoTokenizer)` test could never be true, so the
    # marker tokens were never masked; the ids are now looked up by token string.
    for image_token_id in _image_token_ids(processor):
        labels[labels == image_token_id] = -100

    batch["labels"] = labels
    return batch

In [None]:
from trl import SFTTrainer

# `peft_model` was already wrapped with the LoRA adapters by get_peft_model(),
# so `peft_config` is not passed again: combining an existing PeftModel with a
# peft_config is redundant and may be rejected by the trainer.
trainer = SFTTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=collate_fn,
    processing_class=processor,
)

In [None]:
# Run fine-tuning; logging, evaluation and checkpointing follow the step
# intervals configured in training_args.
trainer.train()

# TrainOutput(global_step=35, training_loss=2.625527776990618, metrics={'train_runtime': 1476.0926, 
# 'train_samples_per_second': 0.192, 'train_steps_per_second': 0.024, 'total_flos': 2509185336508416.0, 'train_loss': 2.625527776990618})

# Step	Training Loss	Validation Loss
# 10	3.826300	3.099709
# 20	2.762000	2.329909
# 30	1.952900	1.452009

In [None]:
# Save the trained model into the run's output directory.
trainer.save_model(training_args.output_dir)

In [None]:
# Release the training-time model, trainer and processor before reloading for inference.
clear_memory()

In [None]:
# Reload from the directory trainer.save_model() wrote to. Using
# training_args.output_dir instead of a hard-coded "/kaggle/working/..." path
# keeps this cell working when the notebook runs outside Kaggle.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    training_args.output_dir,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

processor = AutoProcessor.from_pretrained(model_id)

In [None]:
# Repeat the earlier sanity check on the same sample with the reloaded model,
# so the answer can be compared with the pre-training output.
output = generate_text_from_sample(model, processor, train_dataset[0])
output

# 'Yes, the value of "Favorable" is 38 in 2015 according to the provided data.'