Description
Describe the bug
Hi,
I am working on fine-tuning deepseek-vl2 but have been facing a lot of issues.
This is the code, adapted from this blog: https://huggingface.co/learn/cookbook/en/fine_tuning_vlm_trl
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, TrainingArguments
from deepseek_vl2.models import DeepseekVLV2Processor, DeepseekVLV2ForCausalLM
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model
from deepseek_vl2.utils.io import load_pil_images
from typing import Dict, Tuple, List, Literal, Optional
# Load model and processor
model_path = "deepseek-ai/deepseek-vl2"
vl_chat_processor: DeepseekVLV2Processor = DeepseekVLV2Processor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer
vl_gpt: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
model = vl_gpt.to(torch.bfloat16).cuda()
# Define PEFT configuration (LoRA)
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=8,
    bias="none",
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, peft_config)
# System prompt (customize accordingly)
system_message = """You are a Vision Language Model specialized in interpreting visual data from chart images.
Your task is to analyze the provided chart image and respond to queries with concise answers, usually a single word, number, or short phrase.
The charts include a variety of types (e.g., line charts, bar charts) and contain colors, labels, and text.
Focus on delivering accurate, succinct answers based on the visual information. Avoid additional explanation unless absolutely necessary."""
# Fix format_data function
def format_data(sample: Dict[str, str]) -> List[Dict[str, str]]:
    # Convert one ChartQA row into the two-turn conversation the processor expects
    formatted_sample: List[Dict[str, str]] = [{
        "role": "<|User|>",
        "content": f"{sample['query']} <image>",
        "images": sample['image'],
    }, {
        "role": "<|Assistant|>",
        "content": f"{sample['label'][0]}",
    }]
    return formatted_sample
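For reference, a quick way to see the structure format_data produces, using a blank PIL image in place of a real chart (illustrative only; the field names match ChartQA):

from PIL import Image

dummy = {"image": Image.new("RGB", (8, 8)), "query": "What was the peak value?", "label": ["42"]}
print(format_data(dummy))
# [{'role': '<|User|>', 'content': 'What was the peak value? <image>', 'images': <PIL.Image ...>},
#  {'role': '<|Assistant|>', 'content': '42'}]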
# Load dataset
dataset_id = "HuggingFaceM4/ChartQA"
train_dataset, eval_dataset, test_dataset = load_dataset(dataset_id, split=["train[:10%]", "val[:10%]", "test[:10%]"])
# Apply data formatting
# format_data never returns None, so no filtering is needed
train_dataset = [format_data(sample) for sample in train_dataset]
eval_dataset = [format_data(sample) for sample in eval_dataset]
test_dataset = [format_data(sample) for sample in test_dataset]
print(f"First training sample after processing: {train_dataset[0]}")
from trl import SFTConfig
# Configure training arguments
training_args = SFTConfig(
    output_dir="deepseekvl2_finetuned",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    evaluation_strategy="steps",
    eval_steps=20,
    save_strategy="steps",
    save_steps=50,
    logging_steps=10,
    learning_rate=2e-5,
    warmup_ratio=0.03,
    gradient_checkpointing=False,
    bf16=True,
    tf32=True,
    push_to_hub=False,
    remove_unused_columns=False,
    dataset_text_field=None,  # explicitly set to None
    dataset_kwargs={"skip_prepare_dataset": True},  # skip TRL's default dataset preparation
    report_to="wandb",
)
# Define collate_fn to process batches
def collate_fn(examples):
    batch_conversations = []
    batch_images = []
    for example in examples:
        if example is None:
            print("empty sample")
            continue
        # Each example is already formatted by format_data:
        # [user message (with image), assistant message]
        batch_conversations.append({
            "role": example[0]["role"],
            "content": example[0]["content"],
        })
        batch_conversations.append({
            "role": example[1]["role"],
            "content": example[1]["content"],
        })
        # The user message carries the image; make sure it is RGB
        batch_images.append(example[0]["images"].convert('RGB'))
    if not batch_conversations or not batch_images:
        print("Warning: Empty conversations or images batch!")
    # Pass both conversations and images to the processor
    inputs = vl_chat_processor(
        conversations=batch_conversations,
        images=batch_images,
        return_tensors="pt",
        padding=True,
        force_batchify=True,
        system_prompt=system_message,
    )
    # Create labels, masking padding tokens so they are ignored by the loss
    labels = inputs["input_ids"].clone()
    labels[labels == tokenizer.pad_token_id] = -100
    # Also mask the special tokens
    special_token_ids = [
        tokenizer.convert_tokens_to_ids(tok) for tok in ['<image>', '<|User|>', '<|Assistant|>']
    ]
    for token_id in special_token_ids:
        labels[labels == token_id] = -100
    inputs["labels"] = labels
    return inputs
# Setup trainer
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=collate_fn,
    peft_config=peft_config,
)
# Start training
trainer.train()
# Save trained model
trainer.save_model(training_args.output_dir)
# Clear memory after training (optional)
import gc
gc.collect()
torch.cuda.empty_cache()
print("Training completed and model saved.")
This is the error I am facing:
Traceback (most recent call last):
  File "/data-mount/debugging_hf/finetuning_deepseek_vl2.py", line 155, in <module>
    trainer.train()
  File "/home/anaconda3/envs/hf_debugging/lib/python3.10/site-packages/transformers/trainer.py", line 2164, in train
    return inner_training_loop(
  File "/home/anaconda3/envs/hf_debugging/lib/python3.10/site-packages/transformers/trainer.py", line 2473, in _inner_training_loop
    batch_samples, num_items_in_batch = self.get_batch_samples(epoch_iterator, num_batches)
  File "/home/anaconda3/envs/hf_debugging/lib/python3.10/site-packages/transformers/trainer.py", line 5134, in get_batch_samples
    if len(batch_samples) > 0 and "labels" in batch_samples[0]:
  File "/data-mount/debugging_hf/DeepSeek-VL2/deepseek_vl2/models/processing_deepseek_vl_v2.py", line 60, in __getitem__
    return self.__dict__[item]
KeyError: 0
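My reading of the traceback, in case it helps: the processor's batched output object implements __getitem__ only as a key lookup into self.__dict__, and it appears to define neither __contains__ nor __iter__, so the "labels" in batch_samples[0] check falls back to Python's legacy sequence protocol, calls __getitem__(0), and raises KeyError: 0. One workaround I am considering is to make the collator hand the trainer a plain dict (a minimal, untested sketch, assuming the output object stores its tensors as instance attributes, as self.__dict__[item] in the traceback suggests):

def collate_fn_as_dict(examples):
    # Build the batch with the existing collator, then expose it as a plain
    # dict so that `"labels" in batch` uses normal dict membership.
    batch = collate_fn(examples)
    # vars(batch) reads the same attribute dict the processor indexes into;
    # drop any fields the processor left as None.
    return {k: v for k, v in vars(batch).items() if v is not None}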
System Info
- huggingface_hub version: 0.29.2
- Platform: Linux-6.8.0-1021-gcp-x86_64-with-glibc2.35
- Python version: 3.10.16
- Running in iPython ?: No
- Running in notebook ?: No
- Running in Google Colab ?: No
- Running in Google Colab Enterprise ?: No
- Token path ?: /data-mount/debugging_hf/token
- Has saved token ?: True
- Who am I ?: keertika
- Configured git credential helpers:
- FastAI: N/A
- Tensorflow: 2.18.0
- Torch: 2.0.1
- Jinja2: 3.1.6
- Graphviz: N/A
- keras: 3.7.0
- Pydot: N/A
- Pillow: 11.1.0
- hf_transfer: N/A
- gradio: N/A
- tensorboard: N/A
- numpy: 1.26.4
- pydantic: 2.10.6
- aiohttp: 3.11.13
- ENDPOINT: https://huggingface.co
- HF_HUB_CACHE: /data-mount/debugging_hf/hub
- HF_ASSETS_CACHE: /data-mount/debugging_hf/assets
- HF_TOKEN_PATH: /data-mount/debugging_hf/token
- HF_STORED_TOKENS_PATH: /data-mount/debugging_hf/stored_tokens
- HF_HUB_OFFLINE: False
- HF_HUB_DISABLE_TELEMETRY: False
- HF_HUB_DISABLE_PROGRESS_BARS: None
- HF_HUB_DISABLE_SYMLINKS_WARNING: False
- HF_HUB_DISABLE_EXPERIMENTAL_WARNING: False
- HF_HUB_DISABLE_IMPLICIT_TOKEN: False
- HF_HUB_ENABLE_HF_TRANSFER: False
- HF_HUB_ETAG_TIMEOUT: 10
- HF_HUB_DOWNLOAD_TIMEOUT: 10
Who can help?
Information
- The official example scripts
- My own modified scripts
Tasks
- An officially supported task in the examples folder (such as GLUE/SQuAD, ...)
- My own task or dataset (give details below)
Reproduction
The script is provided above.
Expected behavior
I was expecting to be able to fine-tune this model.