#VLLM Fine-tuning and Testing

In [1]:
#This notebook was run on a Google Colab environment using a NVIDIA A100 GPU
!pip install --upgrade pip
!pip install -q -U bitsandbytes
!pip install rouge-score sacrebleu
!pip install -q -U google-generativeai

from transformers import AutoProcessor, BitsAndBytesConfig, LlavaForConditionalGeneration, TrainingArguments, Trainer, EarlyStoppingCallback
from PIL import Image
import torch
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, PeftModel
from torch.utils.data import Dataset
from typing import Any, Dict
import random
import requests
import json
import os
from datasets import Dataset, load_dataset
from rouge_score import rouge_scorer
from sacrebleu.metrics import BLEU
import google.generativeai as genai
from google.colab import userdata

Collecting pip
  Downloading pip-25.2-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.2-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.2
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portal

##Data Preparation

In [None]:
# Load your apod_data.json
with open('/content/drive/MyDrive/APOD/DATA/apod_data.json', 'r') as f:
    apod_data = json.load(f)

# NEW: Create a variety of prompts
prompt_templates = [
    "USER: <image>\nDescribe this image in detail, focusing on the scientific phenomena present.\nASSISTANT:",
    "USER: <image>\nWhat are the primary astronomical objects in this image? Explain their significance.\nASSISTANT:",
    "USER: <image>\nProvide a scientific explanation for what is happening in this image.\nASSISTANT:",
    "USER: <image>\nExplain this image in the style of NASA's Astronomy Picture of the Day.\nASSISTANT:",
]

# Format data for Hugging Face Dataset
formatted_data = []
image_dir = '/content/drive/MyDrive/APOD/DATA/IMAGES'
for item in apod_data:
    date = item.get('date')
    summary = item.get('explanation')
    if date and summary:
        image_path = os.path.join(image_dir, f"{date}.jpg")
        if os.path.exists(image_path):
            # The prompt structure LLaVA expects
            prompt = random.choice(prompt_templates)
            formatted_data.append({
                "image": image_path,
                "prompt": prompt,
                "summary": summary
            })

# Create the Hugging Face Dataset
dataset = Dataset.from_list(formatted_data)
print("Dataset prepared:")
print(dataset[0:4])

# Create a train/validation split (e.g., 80% train, 20% validation)
train_val_split = dataset.train_test_split(test_size=0.2)
train_dataset = train_val_split['train']
val_dataset = train_val_split['test']

Dataset prepared:
{'image': ['/content/drive/MyDrive/APOD/DATA/IMAGES/2020-01-01.jpg', '/content/drive/MyDrive/APOD/DATA/IMAGES/2020-01-02.jpg', '/content/drive/MyDrive/APOD/DATA/IMAGES/2020-01-03.jpg', '/content/drive/MyDrive/APOD/DATA/IMAGES/2020-01-04.jpg'], 'prompt': ['USER: <image>\nProvide a scientific explanation for what is happening in this image.\nASSISTANT:', "USER: <image>\nExplain this image in the style of NASA's Astronomy Picture of the Day.\nASSISTANT:", 'USER: <image>\nProvide a scientific explanation for what is happening in this image.\nASSISTANT:', 'USER: <image>\nProvide a scientific explanation for what is happening in this image.\nASSISTANT:'], 'summary': ["Why is Betelgeuse fading?  No one knows.  Betelgeuse, one of the brightest and most recognized stars in the night sky, is only half as bright as it used to be only five months ago.  Such variability is likely just  normal behavior for this famously variable supergiant, but the recent dimming has rekindled disc

##VLLM Fine-tuning

In [None]:
# --- Load the Model and Processor ---
print("Loading the LLaVA model and processor...")
model_id = "llava-hf/llava-1.5-7b-hf"
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

model = LlavaForConditionalGeneration.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map="auto"
)
processor = AutoProcessor.from_pretrained(model_id)
processor.tokenizer.pad_token = processor.tokenizer.eos_token

# --- Configure PEFT for LoRA Fine-Tuning ---
lora_config = LoraConfig(
    r=128, lora_alpha=256, lora_dropout=0.05,
    bias="none", target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj", "multi_modal_projector.linear_1",
        "multi_modal_projector.linear_2",]
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# --- Create a Custom Dataset and Data Collator ---
class LLaVADataset(torch.utils.data.Dataset):
    def __init__(self, hf_dataset):
        self.dataset = hf_dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        image = Image.open(item["image"])
        full_text = item["prompt"] + " " + item["summary"]
        return {"image": image, "text": full_text}

class LLaVADataCollator:
    def __init__(self, processor):
        self.processor = processor

    def __call__(self, features):
        images = [f["image"] for f in features]
        texts = [f["text"] for f in features]

        # The batch should be returned as CPU tensors.
        # The Trainer will automatically move it to the GPU.
        batch = self.processor(
            text=texts,
            images=images,
            return_tensors="pt",
            padding=True
        )

        batch["labels"] = batch["input_ids"]
        return batch

# --- Set up and Run the Trainer ---
print("Configuring and starting the training...")
training_args = TrainingArguments(
    output_dir="llava_apod_finetuned",
    learning_rate=2.0e-5,
    num_train_epochs=5,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    logging_steps=1,
    save_total_limit=1,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    bf16=True,
    dataloader_num_workers=4,
    remove_unused_columns=False,
    eval_strategy="steps",
    save_strategy="steps",
    eval_steps = 25,
    save_steps = 25,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=LLaVADataset(train_dataset),
    eval_dataset=LLaVADataset(val_dataset),
    data_collator=LLaVADataCollator(processor),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

# Start the fine-tuning process
trainer.train()

print("Fine-tuning complete!")

Loading the LLaVA model and processor...


config.json:   0%|          | 0.00/950 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.18G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/674 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/41.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

trainable params: 340,393,984 || all params: 7,403,821,056 || trainable%: 4.5975
Configuring and starting the training...


  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mthomaspowell052202[0m ([33mthomaspowell052202-university-of-new-haven[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
25,6.8121,6.253835
50,3.3289,3.282465
75,3.2721,3.072163
100,3.1883,3.008347
125,3.1941,2.985574
150,3.0813,2.976583
175,3.0424,2.96882
200,3.0937,2.966207
225,3.1394,2.958984
250,3.0113,2.954875


Fine-tuning complete!


In [None]:
# Save the fine-tuned adapter weights to a directory
SAVED_MODEL_PATH = "/content/llava-finetuned-apod"
print(f"Saving model to {SAVED_MODEL_PATH}...")
trainer.save_model(SAVED_MODEL_PATH)

# Save the processor
processor.save_pretrained(SAVED_MODEL_PATH)

print("Model and processor saved successfully.")

Saving model to /content/llava-finetuned-apod...
Model and processor saved successfully.


##VLLM Inference Testing

In [None]:
# --- Configuration ---
# Path to the base model on Hugging Face
BASE_MODEL_ID = "llava-hf/llava-1.5-7b-hf"
# Path to the folder where you saved your fine-tuned model
FINETUNED_MODEL_PATH = "/content/llava-finetuned-apod"
MERGED_MODEL_PATH = "/content/llava-apod-merged2"

# --- Load the Base Model and Processor ---
print("Loading base model and processor...")
# Load the original processor
processor = AutoProcessor.from_pretrained(BASE_MODEL_ID)

# Load the base model in 4-bit for efficiency
model = LlavaForConditionalGeneration.from_pretrained(
    BASE_MODEL_ID,
    load_in_4bit=True,
    device_map="auto"
)

# --- Load and Apply Your Fine-Tuned Weights ---
print(f"Applying fine-tuned LoRA weights from {FINETUNED_MODEL_PATH}...")
# Load the LoRA adapter and merge it into the base model
model = PeftModel.from_pretrained(model, FINETUNED_MODEL_PATH)
model = model.merge_and_unload()
print("Merge complete.")

# --- Save the Final, Merged Model ---
print(f"Saving merged model to {MERGED_MODEL_PATH}...")
model.save_pretrained(MERGED_MODEL_PATH)
processor.save_pretrained(MERGED_MODEL_PATH)

print(f"Success! Your final model is ready in the '{MERGED_MODEL_PATH}' folder.")

# --- Run Inference ---
print("\n--- Running Inference with Fine-Tuned Model ---")
# Example image and prompt
image_url = "https://apod.nasa.gov/apod/image/2509/TwoComets_Perrot_960.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
prompt = "USER: <image>\nDescribe this image in detail, focusing on the scientific phenomena present.\nASSISTANT:"
# Process inputs and generate a response
inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda")
output = model.generate(**inputs, max_new_tokens=200)

# Decode and print the result
response_text = processor.decode(output[0], skip_special_tokens=True)
print("\nModel Response:")
# The response will include your original prompt, so we clean it up
assistant_response = response_text.split("ASSISTANT:")[-1].strip()
print(assistant_response)

Loading base model and processor...


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Applying fine-tuned LoRA weights from /content/llava-finetuned-apod...




Merge complete.
Saving merged model to /content/llava-apod-merged2...
Success! Your final model is ready in the '/content/llava-apod-merged2' folder.

--- Running Inference with Fine-Tuned Model ---

Model Response:
The image captures a nighttime scene with a green comet streaking across the sky. The comet is visible in the upper part of the image, with its tail extending towards the bottom. The sky is filled with stars, creating a beautiful backdrop for the comet. The image also features a meteor shower, with multiple shooting stars scattered throughout the scene. The combination of the comet, meteor shower, and the starry sky creates a captivating astronomical display.


In [None]:
!cp -r /content/llava-apod-merged2 /content/drive/MyDrive/APOD

In [11]:
# Testing against non-finetuned model

# --- Configuration ---
# Path to the base model on Hugging Face
BASE_MODEL_ID = "llava-hf/llava-1.5-7b-hf"

# --- Load the Base Model and Processor ---
print("Loading base model and processor...")
# Load the original processor
processor = AutoProcessor.from_pretrained(BASE_MODEL_ID)

# Load the base model in 4-bit for efficiency
base_model = LlavaForConditionalGeneration.from_pretrained(
    BASE_MODEL_ID,
    load_in_4bit=True,
    device_map="auto"
)

# --- Run Inference ---
print("\n--- Running Inference with Models ---")
# Example image and prompt
image = Image.open("/content/Screenshot 2025-10-15 at 15.58.13.png")
prompt = "USER: <image>\nWhat is the average orbital speed of Neptune?\nASSISTANT:"
# Process inputs and generate a response
inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda")
output = base_model.generate(**inputs, max_new_tokens=200)

# Decode and print the result
response_text = processor.decode(output[0], skip_special_tokens=True)
print("\nModel Response:")
# The response will include your original prompt, so we clean it up
assistant_response = response_text.split("ASSISTANT:")[-1].strip()
print("Base Model:", assistant_response, "\n")


# --- Load and Apply Your Fine-Tuned Weights ---
FINETUNED_MODEL_PATH = "/content/drive/MyDrive/APOD/llava-apod-merged"
# Load the base model in 4-bit for efficiency
finetuned_model = LlavaForConditionalGeneration.from_pretrained(
    FINETUNED_MODEL_PATH,
    device_map="auto"
)
finetuned_output = finetuned_model.generate(**inputs, max_new_tokens=200)

# Decode and print the result
response_text = processor.decode(finetuned_output[0], skip_special_tokens=True)
# The response will include your original prompt, so we clean it up
assistant_response = response_text.split("ASSISTANT:")[-1].strip()
print("\nFine-tuned Model:", assistant_response)

Loading base model and processor...


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]


--- Running Inference with Models ---

Model Response:
Base Model: The average orbital speed of Neptune is approximately 29.78 kilometers per second. This is the speed at which Neptune completes one orbit around the Sun. The exact speed can vary slightly due to the gravitational influence of other planets and the Sun's own motion. 


Fine-tuned Model: The average orbital speed of Neptune is about 29.78 km/s (18.51 miles per second). This is the speed at which Neptune completes one orbit around the Sun. However, it is important to note that the actual speed of Neptune can vary slightly due to the gravitational influence of other planets in the Solar System.


###Quantative Grading

In [4]:
# Assume you have run your query engine and have these variables:
generated_answer = " The image features the bright supergiant star Betelgeuse with a blast of light exploding from the surface, causing a massive reaction in the orbit of Jupiter. It is an artistic illustration and not a real-life event. Additionally, there are stars surrounding Betelgeuse within the context of the artistic representation."
reference_answer = "Why is Betelgeuse fading?  No one knows.  Betelgeuse, one of the brightest and most recognized stars in the night sky, is only half as bright as it used to be only five months ago.  Such variability is likely just  normal behavior for this famously variable supergiant, but the recent dimming has rekindled discussion on how long it may be before Betelgeuse does go supernova.  Known for its red color, Betelgeuse is one of the few stars to be resolved by modern telescopes, although only barely.  The featured artist's illustration imagines how Betelgeuse might look up close. Betelgeuse is thought to have a complex and tumultuous surface that frequently throws impressive flares.  Were it to replace the Sun (not recommended), its surface would extend out near the orbit of Jupiter, while gas plumes would bubble out past Neptune.  Since Betelgeuse is about 700 light years away, its eventual supernova will not endanger life on Earth even though its brightness may rival that of a full Moon.  Astronomers -- both amateur and professional -- will surely continue to monitor Betelgeuse as this new decade unfolds.    Free Presentation: APOD Editor to show best astronomy images of 2019 -- and the decade -- in NYC on January 3"


# --- ROUGE Score ---
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
rouge_scores = scorer.score(reference_answer, generated_answer)
print(f"ROUGE-L F1-Score: {rouge_scores['rougeL'].fmeasure:.4f}")


# --- BLEU Score ---
bleu = BLEU(effective_order=True)
bleu_score = bleu.sentence_score(generated_answer, [reference_answer])
print(f"BLEU Score: {bleu_score.score:.2f}")

ROUGE-L F1-Score: 0.1514
BLEU Score: 0.33


##LLM Response Grading

In [None]:
# --- Configuration ---
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

def get_gemini_judge_score(query, image, answer):
    """
    Uses the Gemini model as a judge to evaluate the quality of a generated answer.
    """
    print("\n--- Sending to LLM-as-a-Judge (Gemini) ---")

    # Configure the Gemini model
    model = genai.GenerativeModel('gemini-2.5-pro')

    # Create the prompt for the judge
    prompt = f"""
    You are an impartial and expert evaluator for an astronomy image explanation system.
    Your task is to evaluate a generated answer based on a given image.

    **Query:** "{query}"

    **Retrieved Image Link:** "{image}"

    **Generated Answer:** "{answer}"

    **Instructions:**
    1. Read the Query, Context, and Answer carefully.
    2. Assess the answer's correctness and relevance based *only* on the image.
    3. Provide a brief reasoning for your score.
    4. On a new line, provide a final score from 0 to 5, where 0 is "Not Relevant/Incorrect" and 5 is "Highly Relevant and Correct."

    Format your response as:
    Reasoning: [Your reasoning here]
    Score: [0-5]
    """

    try:
        response = model.generate_content(prompt)

        # Parse the response to extract the score
        score_text = response.text.strip().split('\n')[-1]
        score = int(score_text.split(':')[-1].strip())
        reasoning = response.text.strip().split('\n')[0].split(':')[-1].strip()

        print(f"Judge's Reasoning: {reasoning}")
        print(f"Judge's Score: {score}/5")
        return score, reasoning

    except Exception as e:
        print(f"An error occurred while querying the judge model: {e}")
        return None, None


if __name__ == "__main__":
    query = "USER: <image>\nDescribe this astronomy image and explain it's astronomical content in detail.\nASSISTANT:"
    image = "https://apod.nasa.gov/apod/image/2509/IMAP-IG2-001.JPG"
    generated_answer = assistant_response

    get_gemini_judge_score(query, image, generated_answer)


--- Sending to LLM-as-a-Judge (Gemini) ---
Judge's Reasoning: The generated answer accurately describes the visual elements present in the image, correctly identifying the rocket launch, its trail, and the Sun. However, the query asks for a *detailed* explanation of the astronomical content, which the answer does not provide. It mentions the Sun but offers no further explanation or context, making the response a good description but a very superficial explanation.
Judge's Score: 4/5
