In [None]:
!pip install -U transformers torchinfo jedi datasets


## Local Inference on GPU
Model page: https://huggingface.co/llava-hf/llava-1.5-7b-hf

‚ö†Ô∏è If the generated code snippets do not work, please open an issue on either the [model repo](https://huggingface.co/llava-hf/llava-1.5-7b-hf)
			and/or on [huggingface.js](https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/src/model-libraries-snippets.ts) üôè

In [2]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("image-text-to-text", model="llava-hf/llava-1.5-7b-hf", use_fast=True)
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "/teamspace/studios/this_studio/Hko2q4tVGjfzUZDUNH2RsA_b.jpg"},
            {"type": "text", "text": "which animal is this?"}
        ]
    },
]

pipe(text=messages)

print(pipe.device) 


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Device set to use cuda:0


cuda:0


In [3]:
# Load model directly
from transformers import AutoProcessor, AutoModelForVision2Seq


processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf", use_fast=True)
model = AutoModelForVision2Seq.from_pretrained("llava-hf/llava-1.5-7b-hf")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "/teamspace/studios/this_studio/Hko2q4tVGjfzUZDUNH2RsA_b.jpg"},
            {"type": "text", "text": "which animal is this?"}
        ]
    },
]
inputs = processor.apply_chat_template(
	messages,
	add_generation_prompt=True,
	tokenize=True,
	return_dict=True,
	return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(processor.decode(outputs[0][inputs["input_ids"].shape[-1]:]))



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

This is a dog.</s>


- print the last layer 
- freeze all the layers first
- unfreeze the last layer 
- add another layer, duplicate of the last layer 
- check a compatible model data set 
- train the model using that dataset
- congrats you transfer learning completed using LLM

In [4]:
# Model Architecture
with open('model.txt', 'a') as f:
    f.write(str(model)) 
    f.write(str(model.__dir__()))


In [5]:
from torchinfo import summary

with open("model_summary_brefore_training.txt", 'w') as f:
    f.write(str(summary(model, depth=10, verbose=0)))

In [6]:
from transformers import AutoTokenizer, AutoImageProcessor, LlavaForConditionalGeneration

# Use separate components for granular control
tokenizer = AutoTokenizer.from_pretrained("llava-hf/llava-1.5-7b-hf", use_fast=True)
image_processor = AutoImageProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
tokenizer.pad_token = tokenizer.eos_token # Critical Fix


In [7]:
# Load base model
model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

##### step A: Freeze Vision, Unfreeze Projector

In [8]:
# 1. Start by freezing EVERYTHING (Base state)
# This locks the Vision Encoder (layer 2-1) and the LlamaModel (layer 2-3)
for param in model.parameters():
    param.requires_grad = False

# 2. Unfreeze ONLY the Projector (Layer 2-2)
# We iterate through parameters and unlock those belonging to the projector.
# In most Hugging Face LLaVA implementations, these layers contain the keyword "projector" 
# (often named 'mm_projector' or 'multi_modal_projector').

print("Unfreezing Projector layers...")
for name, param in model.named_parameters():
    if "projector" in name:
        param.requires_grad = True
        print(f"  -> Unfrozen: {name}")

# 3. Verification
# Count how many parameters are actually trainable now.
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())

print(f"\nSummary:")
print(f"Total Parameters: {total_params:,}")
print(f"Trainable Parameters: {trainable_params:,}") 
print(f"Percentage Trainable: {100 * trainable_params / total_params:.2f}%")

Unfreezing Projector layers...
  -> Unfrozen: model.multi_modal_projector.linear_1.weight
  -> Unfrozen: model.multi_modal_projector.linear_1.bias
  -> Unfrozen: model.multi_modal_projector.linear_2.weight
  -> Unfrozen: model.multi_modal_projector.linear_2.bias

Summary:
Total Parameters: 7,063,427,072
Trainable Parameters: 20,979,712
Percentage Trainable: 0.30%


##### Step B: Applying LoRA and unfreezing the multi modal projector

In [9]:
try: 
    from peft import LoraConfig, get_peft_model
except:
    !pip install peft
    from peft import LoraConfig, get_peft_model

# 1. Define the LoRA Configuration
# We target the attention mechanisms of the LLM backbone.
lora_config = LoraConfig(
    r=16,                       # Rank: Higher = more params, "smarter" but slower. 16 is standard.
    lora_alpha=32,              # Scaling factor: Usually set to 2x the rank.
    target_modules=[            # The specific layers to inject adapters into
        "q_proj",               # Query projection in LlamaAttention
        "v_proj"                # Value projection in LlamaAttention
    ],
    lora_dropout=0.05,          # Reduces overfitting
    bias="none",
    task_type="CAUSAL_LM",      # LLaVA behaves like a standard Causal LLM during training
    modules_to_save=['multi_modal_projector']
)

# 2. Apply LoRA to the Model
# This wraps the frozen layers with the new trainable LoRA adapters.
print("Injecting LoRA adapters and projector...")
model = get_peft_model(model, lora_config)

# 3. Verify the Trainable Parameters
# You should see a slight increase in trainable params compared to Step A,
# but it will still be < 2% of the total model.
model.print_trainable_parameters()

Injecting LoRA adapters and projector...
trainable params: 30,941,184 || all params: 7,094,368,256 || trainable%: 0.4361


### Training 
#### loading Dataset 

In [10]:
from torch.utils.data import Dataset
from datasets import load_dataset

class ScienceQADataset(Dataset):
    def __init__(self, split, tokenizer, image_processor):
        # 1. Load the dataset from Hugging Face
        print(f"Loading ScienceQA [{split}]...")
        self.data = load_dataset("derek-thomas/ScienceQA", split=split)
        
        # 2. Filter for rows WITH images only (Critical for Vision Transfer Learning)
        # We assume you want to train the vision encoder/projector.
        self.data = self.data.filter(lambda x: x['image'] is not None)
        print(f"Filtered to {len(self.data)} multimodal examples.")

        self.tokenizer = tokenizer
        self.image_processor = image_processor
        self.options = ["(A)", "(B)", "(C)", "(D)", "(E)"]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        
        # --- A. Prepare Image ---
        # The 'image' field is already a PIL object in this dataset
        image = item['image'].convert('RGB')
        image_tensor = self.image_processor(image, return_tensors='pt')['pixel_values'].squeeze(0)

        # --- B. Prepare Text (Question & Options) ---
        question = item['question']
        choices = item['choices']
        
        # Format choices like: "(A) Choice 1  (B) Choice 2"
        choice_str = " ".join([f"{self.options[i]} {c}" for i, c in enumerate(choices)])
        
        # Create the USER prompt
        # Standard LLaVA format: USER: <image>\n<prompt> ASSISTANT:
        human_input = f"<image>\n{question}\nOptions: {choice_str}\nAnswer with the option letter."

        # --- C. Prepare Answer ---
        # Convert integer answer (e.g., 0) to Letter (e.g., "A") and add the explanation
        answer_idx = item['answer']
        answer_letter = self.options[answer_idx]
        solution = item['solution'] if item['solution'] else ""
        
        gpt_response = f"{answer_letter}. {solution}"

        # --- D. Tokenize ---
        # Combine into full conversation for training
        formatted_prompt = f"USER: {human_input} ASSISTANT: {gpt_response}</s>"

        tokenized = self.tokenizer(
            formatted_prompt,
            padding="max_length",
            truncation=True,
            max_length=512, # Reduce to 256 if you hit OOM errors
            return_tensors="pt"
        )
        
        input_ids = tokenized.input_ids.squeeze(0)
        attention_mask = tokenized.attention_mask.squeeze(0)
        labels = input_ids.clone()
        
        # Mask out padding tokens so they don't affect loss
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "pixel_values": image_tensor,
            "labels": labels
        }

# --- Usage ---
# Assuming you already have 'tokenizer' and 'image_processor' from previous steps
train_dataset = ScienceQADataset("train", tokenizer, image_processor)

In [None]:
from transformers import TrainingArguments, Trainer

# 1. Define Training Arguments
training_args = TrainingArguments(
    output_dir="./llava-scienceqa-finetune",
    per_device_train_batch_size=4,  # Start low (e.g., 2 or 4) for 7B models
    gradient_accumulation_steps=4,  # Accumulate gradients to simulate larger batch size
    learning_rate=2e-4,             # Standard LoRA learning rate
    num_train_epochs=1,
    logging_steps=10,
    save_strategy="epoch",
    fp16=True,                      # Use mixed precision (save VRAM)
    remove_unused_columns=False     # Important for custom LLaVA datasets
)

# 2. Initialize Trainer
trainer = Trainer(
    model=model,                    # Your model with LoRA + Unfrozen Projector
    args=training_args,
    train_dataset=train_dataset,    # The ScienceQA dataset we just built
    # No "eval_dataset" needed for a quick test run
)

# 3. Start Training
# trainer.train()