#VLLM Fine-tuning and Testing

In [1]:
#This notebook was run on a Google Colab environment using a NVIDIA A100 GPU
!pip install --upgrade pip
!pip install -q -U bitsandbytes
!pip install rouge-score sacrebleu
!pip install -q -U google-generativeai

from transformers import AutoProcessor, BitsAndBytesConfig, LlavaForConditionalGeneration, TrainingArguments, Trainer, EarlyStoppingCallback
from PIL import Image
import torch
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, PeftModel
from torch.utils.data import Dataset
from typing import Any, Dict
import random
import requests
import json
import os
from datasets import Dataset, load_dataset
from rouge_score import rouge_scorer
from sacrebleu.metrics import BLEU
import google.generativeai as genai
from google.colab import userdata

Collecting pip
  Downloading pip-25.2-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.2-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m74.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.2
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portal

##Data Preparation

In [14]:
# Load your apod_data.json
with open('/content/drive/MyDrive/APOD_DATA/apod_data.json', 'r') as f:
    apod_data = json.load(f)

# Format data for Hugging Face Dataset
formatted_data = []
image_dir = '/content/drive/MyDrive/APOD_DATA/IMAGES'
for item in apod_data:
    date = item.get('date')
    summary = item.get('explanation')
    if date and summary:
        image_path = os.path.join(image_dir, f"{date}.jpg")
        if os.path.exists(image_path):
            # The prompt structure LLaVA expects
            prompt = "USER: <image>\nDescribe this astronomy image and explain it's content in scientific detail.\nASSISTANT:"
            formatted_data.append({
                "image": image_path,
                "prompt": prompt,
                "summary": summary
            })

# Create the Hugging Face Dataset
dataset = Dataset.from_list(formatted_data)
print("Dataset prepared:")
print(dataset[0])

# Create a train/validation split (e.g., 80% train, 20% validation)
train_val_split = dataset.train_test_split(test_size=0.2)
train_dataset = train_val_split['train']
val_dataset = train_val_split['test']

Dataset prepared:
{'image': '/content/drive/MyDrive/APOD_DATA/IMAGES/2020-01-01.jpg', 'prompt': "USER: <image>\nDescribe this astronomy image and explain it's content in scientific detail.\nASSISTANT:", 'summary': "Why is Betelgeuse fading?  No one knows.  Betelgeuse, one of the brightest and most recognized stars in the night sky, is only half as bright as it used to be only five months ago.  Such variability is likely just  normal behavior for this famously variable supergiant, but the recent dimming has rekindled discussion on how long it may be before Betelgeuse does go supernova.  Known for its red color, Betelgeuse is one of the few stars to be resolved by modern telescopes, although only barely.  The featured artist's illustration imagines how Betelgeuse might look up close. Betelgeuse is thought to have a complex and tumultuous surface that frequently throws impressive flares.  Were it to replace the Sun (not recommended), its surface would extend out near the orbit of Jupiter,

##VLLM Fine-tuning

In [15]:
# --- Load the Model and Processor ---
print("Loading the LLaVA model and processor...")
model_id = "llava-hf/llava-1.5-7b-hf"
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

model = LlavaForConditionalGeneration.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map="auto"
)
processor = AutoProcessor.from_pretrained(model_id)
processor.tokenizer.pad_token = processor.tokenizer.eos_token

# --- Configure PEFT for LoRA Fine-Tuning ---
lora_config = LoraConfig(
    r=16, lora_alpha=32, lora_dropout=0.05,
    bias="none", target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# --- Create a Custom Dataset and Data Collator ---
class LLaVADataset(torch.utils.data.Dataset):
    def __init__(self, hf_dataset):
        self.dataset = hf_dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        image = Image.open(item["image"])
        full_text = item["prompt"] + " " + item["summary"]
        return {"image": image, "text": full_text}

class LLaVADataCollator:
    def __init__(self, processor):
        self.processor = processor

    def __call__(self, features):
        images = [f["image"] for f in features]
        texts = [f["text"] for f in features]

        # The batch should be returned as CPU tensors.
        # The Trainer will automatically move it to the GPU.
        batch = self.processor(
            text=texts,
            images=images,
            return_tensors="pt",
            padding=True
        )

        batch["labels"] = batch["input_ids"]
        return batch

# --- Set up and Run the Trainer ---
print("Configuring and starting the training...")
training_args = TrainingArguments(
    output_dir="llava_apod_finetuned",
    learning_rate=2.0e-5,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    logging_steps=1,
    save_total_limit=1,
    bf16=True,
    dataloader_num_workers=4,
    remove_unused_columns=False,
    eval_strategy="steps",
    save_strategy="steps",
    eval_steps = 5,
    save_steps = 5,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=LLaVADataset(train_dataset),
    eval_dataset=LLaVADataset(val_dataset),
    data_collator=LLaVADataCollator(processor),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.01)]
)

# Start the fine-tuning process
trainer.train()

print("Fine-tuning complete!")

Loading the LLaVA model and processor...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 19,136,512 || all params: 7,082,563,584 || trainable%: 0.2702
Configuring and starting the training...


Step,Training Loss,Validation Loss
5,10.8618,10.798047
10,10.1934,9.734252
15,8.5073,8.551641
20,8.345,8.396683
25,8.1232,8.333095
30,8.0539,8.174588
35,7.9433,8.051946
40,7.7503,7.934303
45,7.6357,7.76921
50,7.3061,7.591517


Fine-tuning complete!


In [16]:
# Save the fine-tuned adapter weights to a directory
SAVED_MODEL_PATH = "/content/llava-finetuned-apod"
print(f"Saving model to {SAVED_MODEL_PATH}...")
trainer.save_model(SAVED_MODEL_PATH)

# Save the processor
processor.save_pretrained(SAVED_MODEL_PATH)

print("Model and processor saved successfully.")

Saving model to /content/llava-finetuned-apod...
Model and processor saved successfully.


##VLLM Inference Testing

In [24]:
# --- Configuration ---
# Path to the base model on Hugging Face
BASE_MODEL_ID = "llava-hf/llava-1.5-7b-hf"
# Path to the folder where you saved your fine-tuned model
FINETUNED_MODEL_PATH = "/content/llava-finetuned-apod"

# --- Load the Base Model and Processor ---
print("Loading base model and processor...")
# Load the original processor
processor = AutoProcessor.from_pretrained(BASE_MODEL_ID)

# Load the base model in 4-bit for efficiency
model = LlavaForConditionalGeneration.from_pretrained(
    BASE_MODEL_ID,
    load_in_4bit=True,
    device_map="auto"
)

# --- Load and Apply Your Fine-Tuned Weights ---
print(f"Applying fine-tuned LoRA weights from {FINETUNED_MODEL_PATH}...")
# Load the LoRA adapter and merge it into the base model
model = PeftModel.from_pretrained(model, FINETUNED_MODEL_PATH)

# --- Run Inference ---
print("\n--- Running Inference with Fine-Tuned Model ---")
# Example image and prompt
image_url = "https://apod.nasa.gov/apod/image/2509/IMAP-IG2-001.JPG"
image = Image.open(requests.get(image_url, stream=True).raw)
prompt = "USER: <image>\nDescribe this astronomy image and explain it's content in scientific detail.\nASSISTANT:"
# Process inputs and generate a response
inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda")
output = model.generate(**inputs, max_new_tokens=200)

# Decode and print the result
response_text = processor.decode(output[0], skip_special_tokens=True)
print("\nModel Response:")
# The response will include your original prompt, so we clean it up
assistant_response = response_text.split("ASSISTANT:")[-1].strip()
print(assistant_response)

Loading base model and processor...


Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Applying fine-tuned LoRA weights from /content/llava-finetuned-apod...

--- Running Inference with Fine-Tuned Model ---

Model Response:
The image features a rocket launching into space, as it ascends into the sky. The rocket is captured in mid-flight, with its trail of smoke visible behind it. The scene is set against a backdrop of the sun, which is located towards the top left corner of the image. The sun's rays illuminate the rocket and its trail, creating a visually striking scene.


###Quantative Grading

In [25]:
# Assume you have run your query engine and have these variables:
generated_answer = assistant_response
reference_answer = "On the morning of September 24 a rocket crosses the bright solar disk in this long range telescopic snapshot captured from Orlando, Florida. That's about 50 miles west of its Kennedy Space Center launch site. This rocket carried three new space weather missions to space. Signals have now been successfully acquired from all three - NASA's Interstellar Mapping and Acceleration Probe, NASA’s Carruthers Geocorona Observatory, and the National Oceanic and Atmospheric Administration (NOAA) Space Weather Follow-On Lagrange 1 (SWFO-L1) - as they begin their journey to L1, an Earth-Sun lagrange point. L1 is about 1.5 million kilometers in the sunward direction from planet Earth. Appropriately, major space weather influencers, aka dark sunspots in active regions across the Sun, are posing with the transiting rocket. In fact, large active region AR4225 is just right of the rocket's nose."

# --- ROUGE Score ---
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
rouge_scores = scorer.score(reference_answer, generated_answer)
print(f"ROUGE-L F1-Score: {rouge_scores['rougeL'].fmeasure:.4f}")


# --- BLEU Score ---
bleu = BLEU(effective_order=True)
bleu_score = bleu.sentence_score(generated_answer, [reference_answer])
print(f"BLEU Score: {bleu_score.score:.2f}")

ROUGE-L F1-Score: 0.1442
BLEU Score: 0.71


##LLM Response Grading

In [27]:
# --- Configuration ---
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

def get_gemini_judge_score(query, image, answer):
    """
    Uses the Gemini model as a judge to evaluate the quality of a generated answer.
    """
    print("\n--- Sending to LLM-as-a-Judge (Gemini) ---")

    # Configure the Gemini model
    model = genai.GenerativeModel('gemini-2.5-pro')

    # Create the prompt for the judge
    prompt = f"""
    You are an impartial and expert evaluator for an astronomy image explanation system.
    Your task is to evaluate a generated answer based on a given image.

    **Query:** "{query}"

    **Retrieved Image Link:** "{image}"

    **Generated Answer:** "{answer}"

    **Instructions:**
    1. Read the Query, Context, and Answer carefully.
    2. Assess the answer's correctness and relevance based *only* on the image.
    3. Provide a brief reasoning for your score.
    4. On a new line, provide a final score from 0 to 5, where 0 is "Not Relevant/Incorrect" and 5 is "Highly Relevant and Correct."

    Format your response as:
    Reasoning: [Your reasoning here]
    Score: [0-5]
    """

    try:
        response = model.generate_content(prompt)

        # Parse the response to extract the score
        score_text = response.text.strip().split('\n')[-1]
        score = int(score_text.split(':')[-1].strip())
        reasoning = response.text.strip().split('\n')[0].split(':')[-1].strip()

        print(f"Judge's Reasoning: {reasoning}")
        print(f"Judge's Score: {score}/5")
        return score, reasoning

    except Exception as e:
        print(f"An error occurred while querying the judge model: {e}")
        return None, None


if __name__ == "__main__":
    query = "USER: <image>\nDescribe this astronomy image and explain it's content in detail.\nASSISTANT:"
    image = "https://apod.nasa.gov/apod/image/2509/IMAP-IG2-001.JPG"
    generated_answer = assistant_response

    get_gemini_judge_score(query, image, generated_answer)


--- Sending to LLM-as-a-Judge (Gemini) ---
Judge's Reasoning: The generated answer accurately describes the key elements in the image. It correctly identifies the subject as a rocket launch, notes the trail of smoke (exhaust plume), and points out the presence and location of the sun, which is illuminating the scene. The description is factually correct and directly relevant to the visual content of the image.
Judge's Score: 5/5
