In [1]:
# The version specifier must be quoted: unquoted, the shell parses `>=0.39.0`
# as an output redirection into a file named `=0.39.0`, so the minimum-version
# constraint is silently dropped.
!pip install -q -U "bitsandbytes>=0.39.0"
!pip install -q -U transformers



In [None]:


import os
import json
import torch
from datasets import load_dataset
from transformers import (
    # AutoModelForCausalLM, # This was causing the error for LLaVA
    AutoProcessor,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    AutoModelForVision2Seq # Correct class for multimodal LLaVA models
)
from peft import LoraConfig, get_peft_model, PeftModel
from PIL import Image

# --- 1. Configuration ---

# Hugging Face Hub identifier of the base checkpoint; used for both the model
# and its processor.
BASE_MODEL_NAME: str = "llava-hf/llava-1.5-7b-hf"

# Destination for training output and the saved adapters/processor.
OUTPUT_DIR: str = "./llava-finetuned-adapters-multimodal"

# Training data: the directory holding the images, and the JSON annotations
# file whose entries reference them.
IMAGE_DIR: str = "images/"
ANNOTATIONS_PATH: str = "annotations.json"

# --- 2. Load Model and Processor with Quantization ---

def load_model_and_processor():
    """Load the LLaVA base model in 4-bit precision together with its processor.

    The checkpoint named by ``BASE_MODEL_NAME`` is loaded through
    ``AutoModelForVision2Seq`` with an NF4 bitsandbytes quantization config
    (float16 compute dtype) and ``device_map="auto"``.

    Returns:
        tuple: ``(model, processor)``. The processor's tokenizer is guaranteed
        to have a pad token afterwards.
    """
    print("Loading model and processor...")

    # Configure quantization to reduce memory usage
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )

    # Load the base model using AutoModelForVision2Seq
    model = AutoModelForVision2Seq.from_pretrained(
        BASE_MODEL_NAME,
        quantization_config=quant_config,
        device_map="auto",
    )

    # Load the LLaVA processor (handles both image and text)
    processor = AutoProcessor.from_pretrained(BASE_MODEL_NAME)

    # Fall back to EOS as the pad token only when the tokenizer has none.
    # Aliasing pad to EOS unconditionally makes the two ids indistinguishable,
    # so any code that masks "padding" by id (labels, attention mask) also
    # hides the real end-of-sequence token and the model is never trained to
    # stop generating.
    tokenizer = processor.tokenizer
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print("Model and processor loaded successfully.")
    return model, processor

# --- 3. Configure PEFT/LoRA ---

def get_peft_config(model):
    """
    Build the LoRA configuration for ``model``.

    Every ``torch.nn.Linear`` submodule whose qualified name does not contain
    "lm_head" contributes the last component of its name as a LoRA target.
    Because matching is by leaf name, this covers whatever linear layers the
    loaded model exposes (attention and feed-forward projections, and
    presumably the vision-language projector as well; check the printed list).

    Returns the resulting ``LoraConfig``.
    """
    print("Configuring LoRA...")

    # A set comprehension collects each leaf name once; sorting gives a
    # stable, readable order for the log line and the config.
    linear_leaf_names = {
        qualified_name.split('.')[-1]
        for qualified_name, submodule in model.named_modules()
        if isinstance(submodule, torch.nn.Linear) and "lm_head" not in qualified_name
    }
    target_modules = sorted(linear_leaf_names)

    # Printed so the selected layers can be checked by eye
    print(f"LoRA Target Modules: {target_modules}")

    lora_settings = dict(
        r=64,
        lora_alpha=16,
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=target_modules,
    )
    peft_config = LoraConfig(**lora_settings)
    print("LoRA configured.")
    return peft_config

# --- 4. Dataset Preprocessing ---

class MultimodalSarcasmDataset(torch.utils.data.Dataset):
    """
    Map-style dataset that pairs each annotation with its image.

    The annotations JSON is read once at construction and indexed by
    position. Each entry is read through its 'statement', 'label',
    'explanation' and 'image' keys. Items are returned un-padded; batching
    and padding are left to the data collator.
    """

    def __init__(self, annotations_path, image_dir, processor):
        """
        Args:
            annotations_path: Path to the JSON annotations file.
            image_dir: Directory that the entries' 'image' names are relative to.
            processor: Multimodal processor used to tokenize text and
                preprocess images; must expose ``.tokenizer``.
        """
        print(f"Loading annotations from {annotations_path}")
        # Explicit encoding so non-ASCII statements/explanations load the same
        # way regardless of the platform's default locale.
        with open(annotations_path, 'r', encoding='utf-8') as f:
            self.annotations = json.load(f)
        self.image_dir = image_dir
        self.processor = processor

    def __len__(self):
        return len(self.annotations)

    @staticmethod
    def _mask_prompt_labels(input_ids, prompt_len):
        """
        Return a copy of ``input_ids`` with the first ``prompt_len`` positions
        set to -100 (the ignore_index of the cross-entropy loss).

        Labels stay aligned one-to-one with ``input_ids``: Hugging Face
        causal-LM heads shift the labels internally when computing the loss,
        so no manual offset is applied here.
        """
        labels = input_ids.clone()
        labels[:prompt_len] = -100
        return labels

    def __getitem__(self, idx):
        """
        Build one training example.

        Returns a flat dict of un-batched tensors from the processor plus a
        'labels' tensor in which only the assistant response (including the
        trailing EOS token) is supervised.
        """
        item = self.annotations[idx]

        # 1. Format the text prompt
        statement = item['statement']

        # This is the target output the model must learn
        response = f"Label: {item['label']}\nExplanation: {item['explanation']}"

        # Prompt prefix up to and including "ASSISTANT:"; the full training
        # text is this prefix followed by the response and the EOS token.
        prompt_user_part = f"USER: <image>\n{statement}\nASSISTANT:"
        prompt = f"{prompt_user_part}{response}{self.processor.tokenizer.eos_token}"

        # 2. Load the image
        image_path = os.path.join(self.image_dir, item['image'])
        try:
            image = Image.open(image_path).convert('RGB')
        except FileNotFoundError:
            # Deliberate best-effort: keep training going with a placeholder.
            print(f"Warning: Image not found at {image_path}. Using a blank image.")
            image = Image.new('RGB', (512, 512), (255, 255, 255)) # Blank white image

        # 3. Process image and text (no padding/truncation at item level)
        inputs = self.processor(text=prompt, images=image, return_tensors="pt")

        # The processor returns batched tensors of batch size 1; flatten them.
        inputs = {k: v.squeeze(0) for k, v in inputs.items()}

        # Measure the prompt prefix with the same processor and image so that
        # any processor-side handling of "<image>" is reflected identically in
        # both token sequences.
        # NOTE(review): assumes the prefix tokenizes to the same ids when it is
        # followed by the response - confirm for the tokenizer in use.
        user_inputs = self.processor(text=prompt_user_part, images=image, return_tensors="pt")
        user_input_len = len(user_inputs['input_ids'][0])

        # Supervise only the assistant response. The whole prefix is masked:
        # masking one token fewer would also train the model to emit the last
        # prompt token. No pad-token masking is done here because nothing is
        # padded at item level, and masking by pad id would also hide the EOS
        # label whenever the pad token is aliased to EOS.
        inputs['labels'] = self._mask_prompt_labels(inputs['input_ids'], user_input_len)

        return inputs

def get_data_collator(processor):
    """
    Return a collate function that batches un-padded multimodal examples.

    Args:
        processor: Object exposing ``tokenizer.pad_token_id``, used as the
            fill value when padding ``input_ids``.

    Returns:
        A callable taking a list of feature dicts (each with 'input_ids',
        'labels' and 'pixel_values' tensors) and returning a dict with
        right-padded 'input_ids', 'attention_mask' and 'labels', plus stacked
        'pixel_values'.
    """
    def data_collator(features):
        sequences = [f['input_ids'] for f in features]

        # Derive the attention mask from the true sequence lengths rather than
        # from `input_ids != pad_token_id`: when the pad token shares its id
        # with EOS, an id comparison would also mask out genuine EOS tokens.
        attention_mask = torch.nn.utils.rnn.pad_sequence(
            [torch.ones_like(seq, dtype=torch.long) for seq in sequences],
            batch_first=True,
            padding_value=0,
        )

        # Pad text inputs on the right with the tokenizer's pad id
        input_ids = torch.nn.utils.rnn.pad_sequence(
            sequences, batch_first=True, padding_value=processor.tokenizer.pad_token_id
        )

        # Pad labels with -100 so padded positions are ignored by the loss
        labels = torch.nn.utils.rnn.pad_sequence(
            [f['labels'] for f in features], batch_first=True, padding_value=-100
        )

        # Stack pixel values (requires every example to have the same shape)
        pixel_values = torch.stack([f['pixel_values'] for f in features])

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'pixel_values': pixel_values,
            'labels': labels,
        }
    return data_collator


# --- 5. Main Training Function ---

def main():
    """Run the whole fine-tuning pipeline.

    Loads the quantized model and processor, wraps the model with LoRA
    adapters, builds the dataset and collator, trains with the Hugging Face
    ``Trainer``, and finally saves the adapters and the processor to
    ``OUTPUT_DIR``.
    """

    # Step 1: Load model and processor
    model, processor = load_model_and_processor()

    # Step 2: Configure LoRA
    peft_config = get_peft_config(model)
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

    # Step 3: Load and prepare the dataset
    dataset = MultimodalSarcasmDataset(
        annotations_path=ANNOTATIONS_PATH,
        image_dir=IMAGE_DIR,
        processor=processor
    )

    # Step 4: Get data collator
    data_collator = get_data_collator(processor)

    # Step 5: Set up training arguments
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=3,              # More epochs are often needed for this
        per_device_train_batch_size=2,   # LLaVA is memory-hungry! Start small.
        gradient_accumulation_steps=8,   # Effective batch size = 16
        optim="paged_adamw_32bit",
        save_steps=100,
        logging_steps=10,
        learning_rate=1e-4,              # A slightly lower LR can be stable
        weight_decay=0.001,
        fp16=True,
        max_grad_norm=0.3,
        warmup_ratio=0.03,
        # The plain "constant" schedule has no warmup phase, so warmup_ratio
        # above would be silently ignored; "constant_with_warmup" honours it.
        lr_scheduler_type="constant_with_warmup",
        remove_unused_columns=False, # We need 'pixel_values'
        report_to="none", # Disable wandb/tensorboard logging for simplicity
    )

    # Step 6: Initialize the Trainer
    print("Initializing trainer...")
    trainer = Trainer(
        model=model,
        train_dataset=dataset,
        data_collator=data_collator,
        args=training_args,
    )

    # Step 7: Start training
    print("Starting training...")
    trainer.train()
    print("Training complete.")

    # Step 8: Save the fine-tuned model adapters
    print(f"Saving fine-tuned model adapters to {OUTPUT_DIR}...")
    trainer.save_model(OUTPUT_DIR)
    processor.save_pretrained(OUTPUT_DIR) # Save the processor too
    print("Model saved successfully.")

# Run the pipeline only when executed as the main module, not on import.
if __name__ == "__main__":
    main()

Loading model and processor...




Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [1]:
import requests
import os

# Hugging Face Hub slug of the model the request is routed to.
HF_MODEL = "Salesforce/blip2-flan-t5-base"

# API token, taken from the HF_KEY environment variable (None when unset).
# Keep it in the environment; never hardcode it in the notebook.
HF_TOKEN = os.getenv("HF_KEY")

def get_multimodal_inference_response_hf_router(image_path: str, text_prompt: str, timeout: int = 60) -> str:
    """
    Send an image and a text prompt to the HF Router inference endpoint.

    Request shape:
        POST https://router.huggingface.co/hf-inference/{model}
        multipart/form-data: file + inputs (prompt)

    Args:
        image_path: Path of the image file to upload.
        text_prompt: Prompt sent in the ``inputs`` form field.
        timeout: Request timeout in seconds, passed to ``requests.post``.

    Returns:
        The generated text when the response has a recognised shape;
        otherwise the raw response text or a stringified JSON payload.
        Failures (missing token, unreadable file, network error, non-200
        status) are reported as a descriptive string rather than raised.
    """
    import mimetypes  # local import: only needed to label the uploaded part

    if not HF_TOKEN:
        # The token is read from the HF_KEY environment variable, so that is
        # the name the user has to set.
        return "Error: HF_KEY environment variable not set."

    # router endpoint (note 'hf-inference' path)
    url = f"https://router.huggingface.co/hf-inference/{HF_MODEL}"
    headers = {"Authorization": f"Bearer {HF_TOKEN}"}

    try:
        with open(image_path, "rb") as f:
            img_bytes = f.read()
    except FileNotFoundError:
        return f"Error: image file not found: {image_path}"
    except (OSError, ValueError) as e:
        # Permission problems, directories, invalid path strings, ...
        return f"Error reading image file: {e}"

    # Label the upload with the file's real name and image type instead of
    # always claiming JPEG; fall back to the previous defaults when the type
    # cannot be determined from the extension.
    guessed_type = mimetypes.guess_type(image_path)[0]
    if not guessed_type or not guessed_type.startswith("image/"):
        guessed_type = "image/jpeg"
    upload_name = os.path.basename(image_path) or "image.jpg"

    # multipart POST: file + inputs (the prompt)
    files = {
        "file": (upload_name, img_bytes, guessed_type),
        "inputs": (None, text_prompt),
    }

    try:
        resp = requests.post(url, headers=headers, files=files, timeout=timeout)
    except requests.RequestException as e:
        return f"Network error while calling HF Router: {e}"

    # handle non-200
    if resp.status_code != 200:
        return f"Hugging Face Router error {resp.status_code}: {resp.text}"

    try:
        data = resp.json()
    except ValueError:
        # sometimes the response is plain text
        return resp.text

    # Common shapes:
    # 1) [{"generated_text": "..."}]
    # 2) {"generated_text": "..."}
    # 3) other JSON shapes or ID-structured responses
    if isinstance(data, list) and data and isinstance(data[0], dict) and "generated_text" in data[0]:
        return data[0]["generated_text"]
    if isinstance(data, dict) and "generated_text" in data:
        return data["generated_text"]

    # fallback: return the whole payload as a string
    return str(data)

# Example usage: executed only when this code runs as the main module.
if __name__ == "__main__":
    p = "images/ironic_scene.jpg"
    prompt = "Describe the scene and indicate whether the caption is sarcastic."
    # The helper reports failures (e.g. a missing image file) as a returned
    # string instead of raising, so this prints either the model's text or an
    # error description.
    print(get_multimodal_inference_response_hf_router(p, prompt))


Error: image file not found: images/ironic_scene.jpg
