Description
Describe the bug
Hi,
I am working on fine-tuning deepseek-vl2 but have been facing a lot of issues.
This is the code, adapted from this blog: https://huggingface.co/learn/cookbook/en/fine_tuning_vlm_trl
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, TrainingArguments
from deepseek_vl2.models import DeepseekVLV2Processor, DeepseekVLV2ForCausalLM
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model
from deepseek_vl2.utils.io import load_pil_images
from typing import Dict, Tuple, List, Literal, Optional
# Load model and processor
model_path = "deepseek-ai/deepseek-vl2"
vl_chat_processor: DeepseekVLV2Processor = DeepseekVLV2Processor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer
vl_gpt: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
model = vl_gpt.to(torch.bfloat16).cuda()
# Define PEFT configuration (LoRA)
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=8,
    bias="none",
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, peft_config)
# System prompt (customize accordingly)
system_message = """You are a Vision Language Model specialized in interpreting visual data from chart images.
Your task is to analyze the provided chart image and respond to queries with concise answers, usually a single word, number, or short phrase.
The charts include a variety of types (e.g., line charts, bar charts) and contain colors, labels, and text.
Focus on delivering accurate, succinct answers based on the visual information. Avoid additional explanation unless absolutely necessary."""
# Fix format_data function
def format_data(sample: Dict[str, str]) -> List[Dict[str, str]]:
    # Convert one ChartQA row into the two-turn conversation the processor expects
    formatted_sample: List[Dict[str, str]] = [{
        "role": "<|User|>",
        "content": f"{sample['query']} <image>",
        "images": sample['image'],
    }, {
        "role": "<|Assistant|>",
        "content": f"{sample['label'][0]}",
    }]
    return formatted_sample
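For reference, a quick way to see the structure format_data produces, using a blank PIL image in place of a real chart (illustrative only; the field names match ChartQA):

from PIL import Image

dummy = {"image": Image.new("RGB", (8, 8)), "query": "What was the peak value?", "label": ["42"]}
print(format_data(dummy))
# [{'role': '<|User|>', 'content': 'What was the peak value? <image>', 'images': <PIL.Image ...>},
#  {'role': '<|Assistant|>', 'content': '42'}]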
# Load dataset
dataset_id = "HuggingFaceM4/ChartQA"
train_dataset, eval_dataset, test_dataset = load_dataset(dataset_id, split=["train[:10%]", "val[:10%]", "test[:10%]"])
# Apply data formatting
# format_data never returns None, so no filtering is needed
train_dataset = [format_data(sample) for sample in train_dataset]
eval_dataset = [format_data(sample) for sample in eval_dataset]
test_dataset = [format_data(sample) for sample in test_dataset]
print(f"First training sample after processing: {train_dataset[0]}")
from trl import SFTConfig
# Configure training arguments
training_args = SFTConfig(
    output_dir="deepseekvl2_finetuned",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    evaluation_strategy="steps",
    eval_steps=20,
    save_strategy="steps",
    save_steps=50,
    logging_steps=10,
    learning_rate=2e-5,
    warmup_ratio=0.03,
    gradient_checkpointing=False,
    bf16=True,
    tf32=True,
    push_to_hub=False,
    remove_unused_columns=False,
    dataset_text_field=None,  # explicitly set to None
    dataset_kwargs={"skip_prepare_dataset": True},  # skip TRL's default dataset preparation
    report_to="wandb",
)
# Define collate_fn to process batches
def collate_fn(examples):
    batch_conversations = []
    batch_images = []
    for example in examples:
        if example is None:
            print("empty sample")
            continue
        # Each example is already formatted by format_data:
        # [user message (with image), assistant message]
        batch_conversations.append({
            "role": example[0]["role"],
            "content": example[0]["content"],
        })
        batch_conversations.append({
            "role": example[1]["role"],
            "content": example[1]["content"],
        })
        # The user message carries the image; make sure it is RGB
        batch_images.append(example[0]["images"].convert('RGB'))
    if not batch_conversations or not batch_images:
        print("Warning: Empty conversations or images batch!")
    # Pass both conversations and images to the processor
    inputs = vl_chat_processor(
        conversations=batch_conversations,
        images=batch_images,
        return_tensors="pt",
        padding=True,
        force_batchify=True,
        system_prompt=system_message,
    )
    # Create labels, masking padding tokens so they are ignored by the loss
    labels = inputs["input_ids"].clone()
    labels[labels == tokenizer.pad_token_id] = -100
    # Also mask the special tokens
    special_token_ids = [
        tokenizer.convert_tokens_to_ids(tok) for tok in ['<image>', '<|User|>', '<|Assistant|>']
    ]
    for token_id in special_token_ids:
        labels[labels == token_id] = -100
    inputs["labels"] = labels
    return inputs
# Setup trainer
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=collate_fn,
    peft_config=peft_config,
)
# Start training
trainer.train()
# Save trained model
trainer.save_model(training_args.output_dir)
# Clear memory after training (optional)
import gc
gc.collect()
torch.cuda.empty_cache()
print("Training completed and model saved.")
This is the error I am facing:
Traceback (most recent call last):
  File "/data-mount/debugging_hf/finetuning_deepseek_vl2.py", line 155, in <module>
    trainer.train()
  File "/home/anaconda3/envs/hf_debugging/lib/python3.10/site-packages/transformers/trainer.py", line 2164, in train
    return inner_training_loop(
  File "/home/anaconda3/envs/hf_debugging/lib/python3.10/site-packages/transformers/trainer.py", line 2473, in _inner_training_loop
    batch_samples, num_items_in_batch = self.get_batch_samples(epoch_iterator, num_batches)
  File "/home/anaconda3/envs/hf_debugging/lib/python3.10/site-packages/transformers/trainer.py", line 5134, in get_batch_samples
    if len(batch_samples) > 0 and "labels" in batch_samples[0]:
  File "/data-mount/debugging_hf/DeepSeek-VL2/deepseek_vl2/models/processing_deepseek_vl_v2.py", line 60, in __getitem__
    return self.__dict__[item]
KeyError: 0
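My reading of the traceback, in case it helps: the processor's batched output object implements __getitem__ only as a key lookup into self.__dict__, and it appears to define neither __contains__ nor __iter__, so the "labels" in batch_samples[0] check falls back to Python's legacy sequence protocol, calls __getitem__(0), and raises KeyError: 0. One workaround I am considering is to make the collator hand the trainer a plain dict (a minimal, untested sketch, assuming the output object stores its tensors as instance attributes, as self.__dict__[item] in the traceback suggests):

def collate_fn_as_dict(examples):
    # Build the batch with the existing collator, then expose it as a plain
    # dict so that `"labels" in batch` uses normal dict membership.
    batch = collate_fn(examples)
    # vars(batch) reads the same attribute dict the processor indexes into;
    # drop any fields the processor left as None.
    return {k: v for k, v in vars(batch).items() if v is not None}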
System Info
- huggingface_hub version: 0.29.2
- Platform: Linux-6.8.0-1021-gcp-x86_64-with-glibc2.35
- Python version: 3.10.16
- Running in iPython ?: No
- Running in notebook ?: No
- Running in Google Colab ?: No
- Running in Google Colab Enterprise ?: No
- Token path ?: /data-mount/debugging_hf/token
- Has saved token ?: True
- Who am I ?: keertika
- Configured git credential helpers:
- FastAI: N/A
- Tensorflow: 2.18.0
- Torch: 2.0.1
- Jinja2: 3.1.6
- Graphviz: N/A
- keras: 3.7.0
- Pydot: N/A
- Pillow: 11.1.0
- hf_transfer: N/A
- gradio: N/A
- tensorboard: N/A
- numpy: 1.26.4
- pydantic: 2.10.6
- aiohttp: 3.11.13
- ENDPOINT: https://huggingface.co
- HF_HUB_CACHE: /data-mount/debugging_hf/hub
- HF_ASSETS_CACHE: /data-mount/debugging_hf/assets
- HF_TOKEN_PATH: /data-mount/debugging_hf/token
- HF_STORED_TOKENS_PATH: /data-mount/debugging_hf/stored_tokens
- HF_HUB_OFFLINE: False
- HF_HUB_DISABLE_TELEMETRY: False
- HF_HUB_DISABLE_PROGRESS_BARS: None
- HF_HUB_DISABLE_SYMLINKS_WARNING: False
- HF_HUB_DISABLE_EXPERIMENTAL_WARNING: False
- HF_HUB_DISABLE_IMPLICIT_TOKEN: False
- HF_HUB_ENABLE_HF_TRANSFER: False
- HF_HUB_ETAG_TIMEOUT: 10
- HF_HUB_DOWNLOAD_TIMEOUT: 10
Who can help?
Information
- The official example scripts
- My own modified scripts
Tasks
- An officially supported task in the examples folder (such as GLUE/SQuAD, ...)
- My own task or dataset (give details below)
Reproduction
The script is provided above.
Expected behavior
I was expecting to be able to fine-tune this model.