In [None]:
# ===========================================================
# Install dependencies
# ===========================================================
# `%pip` (rather than `!pip`) installs into the environment of the running
# kernel, so the packages are importable from this notebook afterwards.
%pip install git+https://github.com/facebookresearch/sam2.git
%pip install supervision diffusers transformers accelerate scipy safetensors
%pip install opencv-python matplotlib tqdm


In [None]:
# Install ultralytics into the running kernel's environment (`%pip`, not `!pip`).
%pip install ultralytics

In [None]:
# Packages for the 8-bit captioning cells below; `%pip` targets the
# running kernel's environment.
%pip install transformers accelerate bitsandbytes torch pillow tqdm

In [None]:
# ====================================
# BLIP2 (InstructBLIP) Captioning
# ====================================

# === Dependencies are installed by the pip cells above (Colab/Notebook) ===


import os
import json
from PIL import Image
from transformers import BitsAndBytesConfig, InstructBlipProcessor, InstructBlipForConditionalGeneration

# =============================
# Paths
# NOTE: point both values at the SDXL / SD2 data when captioning those sets.
# =============================
# Folder of SAM segments: one sub-folder per image, holding segment PNGs.
base_path = "/content/drive/MyDrive/SAM-FLUXDEV-COCO"
# JSON file the captions are written to (and resumed from).
output_json = "/content/drive/MyDrive/BLIP2_FLUXDEV_SEGMENTS_CAPS.json"

# =============================
# BLIP2 Prompt
# =============================
# Instruction sent with every segment; the pieces are joined with single spaces.
prompt = " ".join(
    [
        "Describe the image with a focus on the intricate details of the object,",
        "including their color, shape, and number. Include any physical aspects that",
        "appear unusual or incorrect according to general knowledge.",
    ]
)

# =============================
# Load BLIP2 Instruct Model (8-bit)
# =============================
# Processor and model are loaded from the same checkpoint id.
_INSTRUCTBLIP_CHECKPOINT = "Salesforce/instructblip-vicuna-7b"

# Quantization settings: request 8-bit loading.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

blip_processor = InstructBlipProcessor.from_pretrained(_INSTRUCTBLIP_CHECKPOINT)
blip_model = InstructBlipForConditionalGeneration.from_pretrained(
    _INSTRUCTBLIP_CHECKPOINT,
    quantization_config=bnb_config,
    device_map="auto",  # device placement is left to the library
)

In [None]:


# =============================
# Function to caption a single image
# =============================
def get_blip2_caption(img_path):
    """Generate a caption for a single image with the loaded InstructBLIP model.

    Uses the module-level ``blip_processor``, ``blip_model`` and ``prompt``
    defined in the earlier cells.

    Args:
        img_path: Path of the image file to caption.

    Returns:
        The decoded caption string. If the decoded text starts with the
        instruction prompt, the prompt is removed and the rest is stripped.

    Raises:
        Any exception raised while opening the image or running the model is
        propagated to the caller.
    """
    # convert() returns a new, fully loaded image, so the file handle can be
    # closed right away instead of being left to the garbage collector.
    with Image.open(img_path) as raw_img:
        img = raw_img.convert("RGB")

    inputs = blip_processor(images=img, text=prompt, return_tensors="pt").to(blip_model.device)

    # Budget the *generated* tokens only. `max_length` also counts the input
    # tokens, which left little room for the caption and cut it off
    # mid-sentence; `max_new_tokens` keeps the full 100-token caption budget.
    out = blip_model.generate(**inputs, max_new_tokens=100, do_sample=False)
    caption = blip_processor.decode(out[0], skip_special_tokens=True)

    # Some decodes echo the instruction first; keep only the answer part.
    if caption.startswith(prompt):
        caption = caption[len(prompt):].strip()
    return caption

# =============================
# Load existing captions (if any)
# =============================
# `all_captions` maps a folder name to that folder's captions. A previous
# run's file is reloaded so already-captioned folders can be skipped.
if not os.path.exists(output_json):
    all_captions = {}
else:
    with open(output_json, "r") as previous_run:
        all_captions = json.load(previous_run)
    print(f"Resuming from existing file: {output_json}")

# =============================
# Loop over all segment folders
# =============================
def _save_captions(captions, path):
    """Write ``captions`` as JSON to ``path`` without exposing a partial file.

    The data is written to a sibling temporary file first and then moved over
    ``path`` with ``os.replace``. An interruption during the write therefore
    leaves the previous progress file intact instead of a truncated JSON that
    could not be reloaded on resume.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w") as tmp_file:
        json.dump(captions, tmp_file, indent=2)
    os.replace(tmp_path, path)


folders = sorted(os.listdir(base_path))
for folder_name in folders:
    folder_path = os.path.join(base_path, folder_name)
    if not os.path.isdir(folder_path):
        continue

    if folder_name in all_captions:  # skip already processed
        print(f"Skipping folder {folder_name} (already done)")
        continue

    print(f"\nProcessing folder: {folder_name}")
    segment_captions = {}  # segment file name -> caption (or error text)

    seg_files = sorted([f for f in os.listdir(folder_path) if f.endswith(".png")])
    for seg_file in seg_files:
        seg_path = os.path.join(folder_path, seg_file)
        print(f"  Segment: {seg_file}")
        try:
            caption = get_blip2_caption(seg_path)
        except Exception as e:
            # Best effort: one bad segment must not abort the whole run.
            # NOTE: the error text is stored as the caption and the folder is
            # still recorded as done, so this segment is not retried on resume.
            caption = f"ERROR: {e}"
        print(f"    Caption: {caption}")
        segment_captions[seg_file] = caption

    # Folders without any .png segment are not recorded.
    if segment_captions:
        all_captions[folder_name] = segment_captions
        # Save progress after each folder
        _save_captions(all_captions, output_json)
        print(f"  Saved captions for folder {folder_name}")

print(f"\nAll captions saved to {output_json}")



Processing folder: Flux-Dev-Pregen-COCO_train2014_000000002902.jpg
  Segment: segment_1.png
    Caption: The image features two bananas and an orange arranged in a unique and intricate way on a yellow background. The bananas are cur
  Segment: segment_2.png
    Caption: The image features two bananas and an orange, all of which are arranged in an intricate and unusual way on a yellow background.
  Segment: segment_3.png
    Caption: The image features a close-up shot of a yellow banana, which is intricately detailed and has a unique shape. The
  Saved captions for folder Flux-Dev-Pregen-COCO_train2014_000000002902.jpg

Processing folder: Flux-Dev-Pregen-COCO_train2014_000000004421.jpg
  Segment: segment_1.png
    Caption: The image features a close-up of a woman's hand holding a glass of pink liquid. The glass is a unique shape
  Segment: segment_10.png
    Caption: The image features a close-up of a woman's hand wearing a pair of pink gloves with a unique design.
  Segment: segment_1