In [None]:
import os

from huggingface_hub import login

# Authenticate against the Hugging Face Hub with the token stored in the
# HF_TOKEN environment variable. `os` is imported here so this cell works
# when run first; a missing variable raises KeyError rather than logging in
# anonymously.
login(token=os.environ["HF_TOKEN"])

In [None]:
import os

# Restrict this process to GPUs 3, 4 and 5.
# NOTE(review): presumably this has to run before CUDA is initialised,
# which is why the cell precedes the first torch import — confirm.
os.environ.update({"CUDA_VISIBLE_DEVICES": "3,4,5"})

In [None]:
import torch

# Report how many CUDA devices this process can see, then name each one.
gpu_count = torch.cuda.device_count()
print(f"Visible GPUs: {gpu_count}")
for gpu_index in range(gpu_count):
    gpu_name = torch.cuda.get_device_name(gpu_index)
    print(f"  {gpu_index}: {gpu_name}")

In [None]:
from trl import SFTConfig, SFTTrainer
from transformers import AutoTokenizer, AutoProcessor, LlavaForConditionalGeneration, TrainingArguments, Trainer
from datasets import load_dataset
import torch

# Hub identifiers for the base checkpoint and the training data.
model_id = "llava-hf/llava-1.5-7b-hf"
dataset_name = "HuggingFaceH4/llava-instruct-mix-vsft"  # swap in your own dataset here

training_args = TrainingArguments(
    # Where checkpoints go and how often progress is logged / saved.
    output_dir="./sft_llava_output",
    logging_steps=10,
    save_steps=500,
    # Schedule and optimisation.
    num_train_epochs=1,
    learning_rate=2e-6,
    warmup_ratio=0.1,
    max_grad_norm=1.0,
    # Batch of 4 per device (down from 8) with 4 accumulation steps, i.e.
    # 16 samples per device for every optimizer step.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    # Precision.
    bf16=True,
    # Data loading.
    dataloader_num_workers=4,
    dataloader_pin_memory=True,
    # Keep the raw "messages" / "images" columns: the collator reads them.
    remove_unused_columns=False,
)

In [None]:
# Jinja chat template, split into adjacent string literals for readability.
# The pieces concatenate to exactly the original single-line template.
LLAVA_CHAT_TEMPLATE = (
    # Fixed system preamble.
    "A chat between a curious user and an artificial intelligence assistant. "
    "The assistant gives helpful, detailed, and polite answers to the user's questions. "
    "{% for message in messages %}"
    # Role prefix for the turn.
    "{% if message['role'] == 'user' %}USER: {% else %}ASSISTANT: {% endif %}"
    # Turn content: text items verbatim, image items as the <image> placeholder.
    "{% for item in message['content'] %}"
    "{% if item['type'] == 'text' %}{{ item['text'] }}"
    "{% elif item['type'] == 'image' %}<image>{% endif %}"
    "{% endfor %}"
    # User turns end with a space, assistant turns end with the EOS token.
    "{% if message['role'] == 'user' %} {% else %}{{eos_token}}{% endif %}"
    "{% endfor %}"
)

In [None]:
# Load the tokenizer, install the custom chat template on it, and make the
# processor use that same tokenizer so the collator sees the template.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.chat_template = LLAVA_CHAT_TEMPLATE
processor = AutoProcessor.from_pretrained(model_id)
processor.tokenizer = tokenizer

# Load the weights in bfloat16 so they match `bf16=True` in training_args.
# The previous float16 load meant the optimizer updated raw fp16 weights
# under bf16 autocast with no loss scaling, which is prone to overflow/NaN.
# NOTE(review): device_map="auto" spreads layers across the visible GPUs;
# confirm that is the intended multi-GPU strategy for Trainer.
model = LlavaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

In [None]:
class LLavaDataCollator:
    """Collate LLaVA chat examples into a padded batch with assistant-only labels.

    Each example must provide ``"messages"`` (rendered with the tokenizer's
    chat template) and ``"images"`` (only the first image is used).

    Labels are a copy of ``input_ids`` in which every position that is not
    part of an assistant response is set to ``IGNORE_INDEX``. An assistant
    response is the run of tokens that follows an ``"ASSISTANT:"`` marker, up
    to and including the next EOS token (or the end of the sequence when
    there is none). All assistant turns of a multi-turn conversation are
    kept, and user turns between them are masked.

    If a sequence contains no marker at all, its labels are left as the full
    sequence (padding still masked).
    """

    IGNORE_INDEX = -100
    ASSISTANT_MARKER = "ASSISTANT:"

    def __init__(self, processor):
        """Store the processor used for templating, tokenizing and image prep."""
        self.processor = processor

    @staticmethod
    def _assistant_spans(token_ids, marker_ids, eos_token_id):
        """Return ``(start, stop)`` index pairs covering each assistant response.

        ``start`` is the first token after a marker occurrence; ``stop`` is
        one past the closing EOS token, or ``len(token_ids)`` if no EOS
        follows. When ``eos_token_id`` is None the span runs to the end.
        """
        spans = []
        marker_len = len(marker_ids)
        if marker_len == 0:
            return spans

        total = len(token_ids)
        pos = 0
        while pos + marker_len <= total:
            if token_ids[pos:pos + marker_len] != marker_ids:
                pos += 1
                continue
            start = pos + marker_len
            end = start
            while end < total and token_ids[end] != eos_token_id:
                end += 1
            stop = min(end + 1, total)  # keep EOS so the model learns to stop
            spans.append((start, stop))
            pos = stop
        return spans

    def __call__(self, examples):
        """Build the model batch (inputs plus ``labels``) for a list of examples."""
        tokenizer = self.processor.tokenizer

        texts = [
            tokenizer.apply_chat_template(
                example["messages"], tokenize=False, add_generation_prompt=False
            )
            for example in examples
        ]
        images = [example["images"][0] for example in examples]

        batch = self.processor(
            text=texts,
            images=images,
            return_tensors="pt",
            padding=True,
        )

        input_ids = batch["input_ids"]
        labels = input_ids.clone()

        # Keep only assistant responses as training targets.
        marker_ids = tokenizer.encode(self.ASSISTANT_MARKER, add_special_tokens=False)
        eos_token_id = getattr(tokenizer, "eos_token_id", None)
        for row, token_ids in enumerate(input_ids.tolist()):
            spans = self._assistant_spans(token_ids, marker_ids, eos_token_id)
            if not spans:
                # No marker found: leave this row's labels as the full sequence.
                continue
            labels[row, :] = self.IGNORE_INDEX
            for start, stop in spans:
                labels[row, start:stop] = input_ids[row, start:stop]

        # Mask padding. Prefer the attention mask: masking by pad-token id
        # would also hide real EOS labels when pad and EOS share an id.
        if "attention_mask" in batch:
            labels[batch["attention_mask"] == 0] = self.IGNORE_INDEX
        elif tokenizer.pad_token_id is not None:
            labels[labels == tokenizer.pad_token_id] = self.IGNORE_INDEX

        batch["labels"] = labels
        return batch

# Collator instance built around the processor (whose tokenizer carries the
# custom chat template); it is later passed to the Trainer as `data_collator`.
data_collator = LLavaDataCollator(processor)

In [None]:
# Smoke-test the collator on the first training example.
dataset = load_dataset(dataset_name)
sample_batch = data_collator([dataset["train"][0]])

sample_labels = sample_batch["labels"]
print("Input IDs shape:", sample_batch["input_ids"].shape)
print("Labels shape:", sample_labels.shape)
print("Non-masked tokens:", (sample_labels != -100).sum().item())
print("Total tokens:", sample_labels.numel())

# Decode only the positions that contribute to the loss (label != -100) to
# see the text the model is actually trained to produce.
first_row = sample_labels[0]
valid_labels = first_row[first_row != -100]
print("Model learns to predict:", processor.tokenizer.decode(valid_labels))

In [None]:
# Wire the model, hyper-parameters, training split and custom collator
# into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=dataset["train"],
)

# Run fine-tuning; left as the final expression of the cell.
trainer.train()

In [None]:
# Save the final model into the configured output directory, then upload
# it to the Hub.
final_output_dir = training_args.output_dir
trainer.save_model(final_output_dir)
trainer.push_to_hub()

This training failed catastrophically. The pattern shows:

- Steps 10-120: Normal training, loss decreasing from 2.5 → 0.65
- Steps 130-1010: Increasing instability, loss spiking up to 4.0
- Step 1020: Gradient explosion - loss spikes to 50.9
- Step 1030+: Model collapse - loss = 0.0 forever (model is broken)
- Any checkpoint saved after step 1020 (with `save_steps=500`, that is checkpoint-1500 onward) is corrupted and unusable, and so is the final saved model.

This happened because:

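- Numerical instability from inconsistent precision settings (the weights were loaded as float16 via `torch_dtype=torch.float16` while training ran with `bf16=True`), combined with the model being split across GPUs by `device_map="auto"` (naive model parallelism rather than true pipeline parallelism)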
- Activations transferred between GPUs accumulate precision errors
- Once the gradients explode, the weights become NaN/inf; from then on no meaningful loss can be computed and the logged loss stays at 0.0