In [None]:
from huggingface_hub import login
login(token=os.environ["HF_TOKEN"])

In [30]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
os.environ["WORLD_SIZE"] = "1"
os.environ["RANK"] = "0"
os.environ["LOCAL_RANK"] = "0"
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "9994"

In [31]:
import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration, TrainingArguments, Trainer
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

model_id = "llava-hf/llava-1.5-7b-hf"

model = LlavaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.float32,
    device_map={"": 0},
)

# Prepare for training and apply LoRA
model = prepare_model_for_kbit_training(model)

peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

processor = AutoProcessor.from_pretrained(model_id)
processor.tokenizer.pad_token = processor.tokenizer.eos_token

LLAVA_CHAT_TEMPLATE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. {% for message in messages %}{% if message['role'] == 'user' %}USER: {% else %}ASSISTANT: {% endif %}{% for item in message['content'] %}{% if item['type'] == 'text' %}{{ item['text'] }}{% elif item['type'] == 'image' %}<image>{% endif %}{% endfor %}{% if message['role'] == 'user' %} {% else %}{{eos_token}}{% endif %}{% endfor %}"""
processor.tokenizer.chat_template = LLAVA_CHAT_TEMPLATE

class LLavaDataCollator:
    def __init__(self, processor):
        self.processor = processor

    def __call__(self, examples):
        texts = []
        images = []
        for example in examples:
            text = self.processor.tokenizer.apply_chat_template(
                example["messages"], tokenize=False, add_generation_prompt=False
            )
            texts.append(text)
            images.append(example["images"][0])

        batch = self.processor(
            text=texts,
            images=images,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=1024,
        )

        labels = batch["input_ids"].clone()
        labels[labels == self.processor.tokenizer.pad_token_id] = -100

        assistant_ids = self.processor.tokenizer.encode("ASSISTANT:", add_special_tokens=False)
        for i in range(len(labels)):
            ids = batch["input_ids"][i].tolist()
            for j in range(len(ids) - len(assistant_ids) + 1):
                if ids[j:j + len(assistant_ids)] == assistant_ids:
                    labels[i, :j + len(assistant_ids)] = -100
                    break

        batch["labels"] = labels
        return batch

data_collator = LLavaDataCollator(processor)

dataset = load_dataset("HuggingFaceH4/llava-instruct-mix-vsft")
train_dataset = dataset["train"].select(range(10000))

training_args = TrainingArguments(
    output_dir="./sft_llava_lora",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=1e-4,
    logging_steps=10,
    save_steps=500,
    fp16=False,
    max_grad_norm=0.3,
    warmup_steps=100,
    lr_scheduler_type="cosine",
    remove_unused_columns=False,
    dataloader_num_workers=0,  # Avoid multiprocessing issues
    gradient_checkpointing=False,  # Disable - can cause NCCL issues with PEFT
    ddp_find_unused_parameters=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

trainer.train()

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/686 [00:00<?, ?it/s]

trainable params: 19,136,512 || all params: 7,082,563,584 || trainable%: 0.2702


Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/23 [00:00<?, ?it/s]

Step,Training Loss
10,2.804641
20,2.529527
30,2.304009
40,1.66114
50,0.98497
60,0.774927
70,0.677329
80,0.645327
90,0.605836
100,0.616876


TrainOutput(global_step=625, training_loss=0.687084672164917, metrics={'train_runtime': 5914.5199, 'train_samples_per_second': 1.691, 'train_steps_per_second': 0.106, 'total_flos': 3.448413933084344e+17, 'train_loss': 0.687084672164917, 'epoch': 1.0})

In [2]:
trainer.save_model(training_args.output_dir)
trainer.push_to_hub()

NameError: name 'trainer' is not defined

In [1]:
from PIL import Image
import requests
import matplotlib.pyplot as plt
from textwrap import wrap

def evaluate_and_display(test_images):
    for url, question in test_images:
        try:
            headers = {"User-Agent": "Mozilla/5.0"}
            response = requests.get(url, stream=True, headers=headers)
            image = Image.open(response.raw).convert("RGB")
            
            prompt = f"USER: <image>\n{question}\nASSISTANT:"
            inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
            
            with torch.no_grad():
                output = model.generate(**inputs, max_new_tokens=256, do_sample=False)
            
            result = processor.tokenizer.decode(output[0], skip_special_tokens=True)
            if "ASSISTANT:" in result:
                result = result.split("ASSISTANT:")[-1].strip()
            
            # Display
            fig, ax = plt.subplots(1, 1, figsize=(8, 6))
            ax.imshow(image)
            ax.axis("off")
            plt.show()
            
            print(f"Question: {question}")
            print(f"Response: {result}")
            print("=" * 60)
            print()
            
        except Exception as e:
            print(f"Error with {url}: {e}")

test_images = [
    ("https://images.unsplash.com/photo-1514888286974-6c03e2ca1dba?w=400",
     "What animal is in this image?"),
    ("https://images.unsplash.com/photo-1518791841217-8f162f1e1131?w=400", 
     "Describe this image in detail."),
]

evaluate_and_display(test_images)

Error with https://images.unsplash.com/photo-1514888286974-6c03e2ca1dba?w=400: name 'processor' is not defined
Error with https://images.unsplash.com/photo-1518791841217-8f162f1e1131?w=400: name 'processor' is not defined
