# VCR + YOLO + LLaVA Pipeline

This notebook demonstrates an end-to-end pipeline for reducing hallucinations in image descriptions by combining object detection (YOLO) with a vision-language model (LLaVA-1.5-7B).

**Steps:**
1. Load VCR annotation data  
2. Run YOLO object detection on images  
3. Construct prompts for LLaVA using detected objects  
4. Generate grounded image descriptions with LLaVA  
5. (Optional) Compare with baseline LLaVA outputs  

In [None]:
# Install required packages (uncomment if running for the first time)
!pip install ultralytics transformers torch pillow jsonlines --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m126.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m96.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m56.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
!pip install torch --quiet
!pip install -U bitsandbytes --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Mount Google Drive: the annotation files, images and results file configured
# below all live under /content/drive.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


## 1. Load VCR Annotation Data
We'll randomly sample a few annotations from each VCR split (train, val, and test) and keep the ones whose image file exists. Adjust the paths as needed.

In [None]:
import jsonlines
import os
import json
from ultralytics import YOLO
import random

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


In [None]:
# Paths to VCR annotation files (one JSON-lines file per split)
vcr_ann_paths = [
    '/content/drive/MyDrive/UCLA/cs 263/final_proj/vcr1annots/train.jsonl',
    '/content/drive/MyDrive/UCLA/cs 263/final_proj/vcr1annots/val.jsonl',
    '/content/drive/MyDrive/UCLA/cs 263/final_proj/vcr1annots/test.jsonl',
]

vcr_img_root = '/content/drive/MyDrive/UCLA/cs 263/final_proj/vcr1images/'

results_file = '/content/drive/MyDrive/UCLA/cs 263/final_proj/yolo_results.json'
num_examples = 10  # images to draw from each split
sample_seed = 0    # fixed so the same images are drawn after a kernel restart

# Dedicated generator: reproducible sampling without touching the global
# `random` state that other cells or libraries may rely on.
sample_rng = random.Random(sample_seed)

# Select images from the train, test, and val splits
selected_images = []
for vcr_ann_path in vcr_ann_paths:
    with jsonlines.open(vcr_ann_path) as reader:
        items = list(reader)
    sample_rng.shuffle(items)

    # Walk the shuffled annotations until `num_examples` distinct images that
    # actually exist on disk have been collected. Taking a fixed-size slice
    # first would silently shrink the sample whenever a file is missing or
    # when several annotations point at the same image.
    split_images = []
    for item in items:
        if len(split_images) >= num_examples:
            break
        img_path = os.path.join(vcr_img_root, item['img_fn'])
        if img_path in split_images or img_path in selected_images:
            continue
        if os.path.exists(img_path):
            split_images.append(img_path)

    if len(split_images) < num_examples:
        print(f"Warning: only {len(split_images)} of {num_examples} images found for {vcr_ann_path}")
    selected_images.extend(split_images)


In [None]:
# Detection pass with the nano checkpoint (the smallest YOLOv8 variant).
yolo_model = YOLO('yolov8n.pt')
class_names = yolo_model.model.names  # indexed by integer class id below
images_dict_small = {}

# Map each selected image path to the labels of every box YOLO reports for it.
# `results` and `detected_objects` stay at cell scope on purpose: a later cell
# in this notebook still reads `detected_objects`.
for img_path in selected_images:
    results = yolo_model(img_path)
    first_result = results[0]
    # One label per detected box, so repeated classes appear repeatedly.
    detected_objects = [class_names[int(class_id)] for class_id in first_result.boxes.cls]
    images_dict_small[img_path] = detected_objects

Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8n.pt to 'yolov8n.pt'...


100%|██████████| 6.25M/6.25M [00:00<00:00, 78.1MB/s]



image 1/1 /content/drive/MyDrive/UCLA/cs 263/final_proj/vcr1images/movieclips_The_Taste_of_Others/dYQpJJySnwU@0.jpg: 384x640 2 persons, 1 couch, 46.3ms
Speed: 12.3ms preprocess, 46.3ms inference, 352.0ms postprocess per image at shape (1, 3, 384, 640)

image 1/1 /content/drive/MyDrive/UCLA/cs 263/final_proj/vcr1images/lsmdc_3081_THOR/3081_THOR_00.32.12.562-00.32.19.069@2.jpg: 288x640 3 persons, 43.6ms
Speed: 3.4ms preprocess, 43.6ms inference, 1.5ms postprocess per image at shape (1, 3, 288, 640)

image 1/1 /content/drive/MyDrive/UCLA/cs 263/final_proj/vcr1images/lsmdc_1052_Harry_Potter_and_the_order_of_phoenix/1052_Harry_Potter_and_the_order_of_phoenix_01.36.49.424-01.36.54.299@1.jpg: 288x640 11 persons, 1 chair, 2 dining tables, 8.2ms
Speed: 1.9ms preprocess, 8.2ms inference, 1.7ms postprocess per image at shape (1, 3, 288, 640)

image 1/1 /content/drive/MyDrive/UCLA/cs 263/final_proj/vcr1images/lsmdc_3050_MR_POPPERS_PENGUINS/3050_MR_POPPERS_PENGUINS_00.43.50.265-00.43.52.454@0.jpg:

In [None]:
# Detection pass with the extra-large checkpoint (the largest YOLOv8 variant).
yolo_model = YOLO('yolov8x.pt')
class_names = yolo_model.model.names  # indexed by integer class id below
images_dict_large = {}

# Map each selected image path to the labels of every box YOLO reports for it.
# `results` and `detected_objects` stay at cell scope on purpose: a later cell
# in this notebook still reads `detected_objects`.
for img_path in selected_images:
    results = yolo_model(img_path)
    first_result = results[0]
    # One label per detected box, so repeated classes appear repeatedly.
    detected_objects = [class_names[int(class_id)] for class_id in first_result.boxes.cls]
    images_dict_large[img_path] = detected_objects

Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8x.pt to 'yolov8x.pt'...


100%|██████████| 131M/131M [00:00<00:00, 226MB/s]



image 1/1 /content/drive/MyDrive/UCLA/cs 263/final_proj/vcr1images/movieclips_The_Taste_of_Others/dYQpJJySnwU@0.jpg: 384x640 2 persons, 1 chair, 2 couchs, 62.5ms
Speed: 1.7ms preprocess, 62.5ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

image 1/1 /content/drive/MyDrive/UCLA/cs 263/final_proj/vcr1images/lsmdc_3081_THOR/3081_THOR_00.32.12.562-00.32.19.069@2.jpg: 288x640 3 persons, 1 bottle, 1 microwave, 1 clock, 45.1ms
Speed: 1.6ms preprocess, 45.1ms inference, 1.4ms postprocess per image at shape (1, 3, 288, 640)

image 1/1 /content/drive/MyDrive/UCLA/cs 263/final_proj/vcr1images/lsmdc_1052_Harry_Potter_and_the_order_of_phoenix/1052_Harry_Potter_and_the_order_of_phoenix_01.36.49.424-01.36.54.299@1.jpg: 288x640 10 persons, 1 chair, 5 dining tables, 39.1ms
Speed: 1.9ms preprocess, 39.1ms inference, 1.6ms postprocess per image at shape (1, 3, 288, 640)

image 1/1 /content/drive/MyDrive/UCLA/cs 263/final_proj/vcr1images/lsmdc_3050_MR_POPPERS_PENGUINS/3050_MR_POPPERS_

## 2. Run YOLO Object Detection
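We use two YOLOv8 checkpoints (the small `yolov8n` and the large `yolov8x`, run in the cells above) to detect objects in each selected image; the cell below saves the detections to disk.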

In [None]:
#images_dict_large

In [None]:
# Persist the detections from both YOLO runs so they can be reused without
# re-running inference. Keys are the checkpoint names; each value maps an image
# path to its list of detected labels.
# NOTE(review): `results_dict` was not defined anywhere in the visible notebook,
# so it is assembled here; adjust the layout if a downstream consumer expects a
# different schema.
results_dict = {
    'yolov8n': images_dict_small,
    'yolov8x': images_dict_large,
}
with open(results_file, 'w') as f:
    json.dump(results_dict, f, indent=2)

## 3. Run LLaVA-1.5-7B-HF
We load the Hugging Face `llava-hf/llava-1.5-7b-hf` checkpoint with 4-bit quantization (via bitsandbytes) and use it to generate image descriptions. (Requires a CUDA GPU with sufficient VRAM!)

In [None]:
import torch
from transformers import AutoModelForVision2Seq, AutoProcessor, BitsAndBytesConfig

model_id = "llava-hf/llava-1.5-7b-hf"

# Processor for the checkpoint; the generation cell uses it both to build
# model inputs and to decode generated ids.
processor = AutoProcessor.from_pretrained(model_id)

# Load the weights in 4-bit with float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

llava_load_kwargs = {
    "quantization_config": bnb_config,
    "device_map": "cuda",
}
model = AutoModelForVision2Seq.from_pretrained(model_id, **llava_load_kwargs)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


processor_config.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.62M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/41.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/950 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/70.1k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

In [None]:
from PIL import Image

In [None]:
# Inspect the sampled image paths (the cell's last expression is displayed).
selected_images

['/content/drive/MyDrive/UCLA/cs 263/final_proj/vcr1images/lsmdc_3067_THE_ART_OF_GETTING_BY/3067_THE_ART_OF_GETTING_BY_01.16.32.496-01.16.37.176@0.jpg',
 '/content/drive/MyDrive/UCLA/cs 263/final_proj/vcr1images/lsmdc_3084_TOOTH_FAIRY/3084_TOOTH_FAIRY_01.32.27.868-01.32.32.320@1.jpg',
 '/content/drive/MyDrive/UCLA/cs 263/final_proj/vcr1images/movieclips_Wild_Wild_West/zV3AZFuaJVQ@17.jpg',
 '/content/drive/MyDrive/UCLA/cs 263/final_proj/vcr1images/movieclips_An_Unfinished_Life/-AXjzZskE9U@4.jpg',
 '/content/drive/MyDrive/UCLA/cs 263/final_proj/vcr1images/movieclips_Command_Performance/301qydVqZzM@13.jpg',
 '/content/drive/MyDrive/UCLA/cs 263/final_proj/vcr1images/lsmdc_1062_Day_the_Earth_stood_still/1062_Day_the_Earth_stood_still_01.30.09.000-01.30.13.041@0.jpg',
 '/content/drive/MyDrive/UCLA/cs 263/final_proj/vcr1images/movieclips_UHF/tHe6ar-X2cQ@17.jpg',
 '/content/drive/MyDrive/UCLA/cs 263/final_proj/vcr1images/movieclips_Bled/MUFqS9iKzHw@0.jpg',
 '/content/drive/MyDrive/UCLA/cs 263/

In [None]:
# Preview the de-duplicated large-model labels for the last selected image.
# Computed directly from `images_dict_large` so the cell works on a fresh
# kernel instead of relying on a variable that is only assigned inside the
# generation loop further down.
', '.join(sorted(set(images_dict_large[selected_images[-1]]))) if selected_images else ''

'bottle, dining table, person, wine glass, chair, cell phone'

In [None]:
# output with small YOLO model
for i in range(len(selected_images)):
  img_fn = selected_images[i]

  # Open image
  image = Image.open(img_fn).convert('RGB')

  ###### Prompt 1: Just describe the image ######
  prompt1 = "<image>\nUSER: Describe what is happening in this image.\nASSISTANT:"

  inputs1 = processor(image, prompt1, return_tensors='pt').to('cuda')
  with torch.no_grad():
      output1 = model.generate(**inputs1, max_new_tokens=100)
  description1 = processor.decode(output1[0], skip_special_tokens=True)

  detected_objects_small = ', '.join(set(images_dict_small[img_fn]))
  ###### Prompt 2: Include detected objects (small model) ######
  object_list = ', '.join(set(detected_objects))
  prompt2 = f"<image>\nUSER: The image contains: {detected_objects_small}. Describe what is happening in this image.\nASSISTANT:"

  inputs2 = processor(image, prompt2, return_tensors='pt').to('cuda')
  with torch.no_grad():
      output2 = model.generate(**inputs2, max_new_tokens=100)
  description2 = processor.decode(output2[0], skip_special_tokens=True)

  detected_objects_large = ', '.join(set(images_dict_large[img_fn]))
  ###### Prompt 3: Include detected objects (large model) ######
  object_list = ', '.join(set(detected_objects))
  prompt3 = f"<image>\nUSER: The image contains: {detected_objects_large}. Describe what is happening in this image.\nASSISTANT:"

  inputs3 = processor(image, prompt3, return_tensors='pt').to('cuda')
  with torch.no_grad():
      output3 = model.generate(**inputs3, max_new_tokens=100)
  description3 = processor.decode(output3[0], skip_special_tokens=True)

  detected_objects_large_extra = ', '.join(set(images_dict_large[img_fn])) + ", dinosaur, meteor"
  ###### Prompt 4: Include detected objects + extra objects (large model) ######
  object_list = ', '.join(set(detected_objects))
  prompt4 = f"<image>\nUSER: The image contains: {detected_objects_large_extra}. Describe what is happening in this image.\nASSISTANT:"

  inputs4 = processor(image, prompt4, return_tensors='pt').to('cuda')
  with torch.no_grad():
      output4 = model.generate(**inputs4, max_new_tokens=100)
  description4 = processor.decode(output4[0], skip_special_tokens=True)

  ###### Compare outputs ######
  print(f"Image: {img_fn}")
  print("Prompt 1 Output:", description1)
  print("Prompt 2 Output (small object list):", description2)
  print("Prompt 3 Output (large object list):", description3)
  print("Prompt 4 Output (large object list + extra objects):", description4)
  print('-' * 60)

Image: /content/drive/MyDrive/UCLA/cs 263/final_proj/vcr1images/movieclips_The_Taste_of_Others/dYQpJJySnwU@0.jpg
Prompt 1 Output: 
USER: Describe what is happening in this image.
ASSISTANT: In the image, there are two people standing in a room. One person is wearing a suit and tie, while the other person is wearing a black dress. They appear to be engaged in a conversation or discussing something. The room also features a couch and a chair, providing a comfortable setting for their interaction.
Prompt 2 Output (small object list): 
USER: The image contains: person, couch. Describe what is happening in this image.
ASSISTANT: In the image, there is a woman wearing a black dress, standing in a room with a couch. She appears to be looking at something, possibly a man who is also present in the room. The woman is holding a handbag, and the scene seems to be set in a living room or a similar indoor space.
Prompt 3 Output (large object list): 
USER: The image contains: chair, person, couch. D