In [None]:
# Environment setup for the notebook: clone the two model repositories and
# install them together with the remaining Python packages.

# Clone Segment Anything and install it from the local checkout (-e = editable).
# NOTE(review): no cell visible in this notebook imports it — confirm it is still needed.
!git clone https://github.com/facebookresearch/segment-anything.git
!pip install -e segment-anything

# Clone GroundingDINO and install it the same way. Later cells additionally
# append /content/GroundingDINO to sys.path before importing from it.
!git clone https://github.com/IDEA-Research/GroundingDINO.git
!pip install -e GroundingDINO

# Other dependencies
# NOTE(review): versions are not pinned, so a fresh runtime may resolve
# different releases than the ones this notebook was originally run with.
!pip install supervision diffusers transformers accelerate scipy safetensors
!pip install huggingface_hub tqdm pandas



In [None]:
# -----------------------------------------------------------
# Imports
# Note: the author reports running this notebook on CPU only; the device is
# nevertheless picked at runtime further down (CUDA if available, else CPU).
# -----------------------------------------------------------
import os
import torch
import pandas as pd
from tqdm import tqdm
import sys
# Put the cloned repository on the import path; this must happen before the
# GroundingDINO imports below.
sys.path.append("/content/GroundingDINO")

# Restrict CUDA to GPU index 0.
# NOTE(review): presumably a no-op on a CPU-only runtime — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Grounding DINO: config loader, model factory, checkpoint helper and the
# image-loading / inference helpers used by the detection loop.
from GroundingDINO.groundingdino.util.slconfig import SLConfig
from GroundingDINO.groundingdino.models import build_model
from GroundingDINO.groundingdino.util.utils import clean_state_dict
from GroundingDINO.groundingdino.util.inference import load_image, predict
# Downloads (and caches) the config and checkpoint files from the Hugging Face Hub.
from huggingface_hub import hf_hub_download

# -----------------------------------------------------------
# Load GroundingDINO model
# -----------------------------------------------------------
def load_model_hf(repo_id, filename, ckpt_config_filename, device='cpu'):
    """Build a GroundingDINO model from a config and checkpoint on the Hugging Face Hub.

    Args:
        repo_id: Hub repository that hosts both files.
        filename: Name of the checkpoint file inside the repository.
        ckpt_config_filename: Name of the model config file inside the repository.
        device: Target device (string or ``torch.device``) recorded on the
            config before the model is built.

    Returns:
        The model in eval mode. The checkpoint is deserialized onto the CPU
        and the model is not moved to ``device`` by this function.
    """
    cache_config_file = hf_hub_download(repo_id=repo_id, filename=ckpt_config_filename)
    args = SLConfig.fromfile(cache_config_file)
    # Record the device on the config *before* build_model() consumes it;
    # assigning it after the build comes too late for the builder to see it.
    # Stored as a plain string so both 'cpu' and torch.device(...) inputs work.
    args.device = str(device)
    model = build_model(args)

    cache_file = hf_hub_download(repo_id=repo_id, filename=filename)
    # NOTE(review): torch.load unpickles the downloaded file; only use
    # checkpoints from a repository you trust.
    checkpoint = torch.load(cache_file, map_location='cpu')
    # strict=False tolerates key mismatches; the returned report is printed so
    # missing/unexpected keys stay visible.
    log = model.load_state_dict(clean_state_dict(checkpoint['model']), strict=False)
    print(f"Model loaded from {cache_file} => {log}")
    model.eval()
    return model

# -----------------------------------------------------------
# Settings
# -----------------------------------------------------------
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Hub repository plus the checkpoint / config file names to download from it.
ckpt_repo_id = "ShilongLiu/GroundingDINO"
ckpt_filename = "groundingdino_swinb_cogcoor.pth"
ckpt_config_filename = "GroundingDINO_SwinB.cfg.py"

groundingdino_model = load_model_hf(
    repo_id=ckpt_repo_id,
    filename=ckpt_filename,
    ckpt_config_filename=ckpt_config_filename,
    device=DEVICE,
)

# -----------------------------------------------------------
# Parameters
# -----------------------------------------------------------
# Caption passed to the detector for every image.
TEXT_PROMPT = "detect all distinct objects in this image"
# Thresholds forwarded to predict() as box_threshold / text_threshold.
BOX_THRESHOLD = 0.3
TEXT_THRESHOLD = 0.25

# Root directory on the mounted Drive and the image folders below it.
base_path = "/content/drive/MyDrive"
# Swap in SDXL_1.0 / SD2 here to process those folders instead.
folders = ["Flux-Dev"]

# -----------------------------------------------------------
# Run GroundingDINO on dataset
# -----------------------------------------------------------
# Column order of the per-folder CSV. Passing it to the DataFrame constructor
# guarantees a header row even when a folder yields no detections, so later
# pd.read_csv() calls on the file do not fail with EmptyDataError.
RESULT_COLUMNS = ["image", "box_xc", "box_yc", "box_w", "box_h", "logit", "phrase"]
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png")

for folder in folders:
    folder_path = os.path.join(base_path, folder)
    results = []
    failed_files = []

    print(f"\nProcessing folder: {folder}")
    # os.listdir() returns entries in arbitrary order; sort the image files so
    # repeated runs produce the rows in the same order.
    image_files = sorted(
        name for name in os.listdir(folder_path)
        if name.lower().endswith(IMAGE_EXTENSIONS)
    )

    for file in tqdm(image_files):
        image_path = os.path.join(folder_path, file)

        try:
            # load_image returns a pair; only the second element is passed
            # to predict().
            _image_source, image = load_image(image_path)

            # Run prediction
            boxes, logits, phrases = predict(
                model=groundingdino_model,
                image=image,
                caption=TEXT_PROMPT,
                box_threshold=BOX_THRESHOLD,
                text_threshold=TEXT_THRESHOLD,
                device=DEVICE,
            )

            # One row per detected box. An image for which predict() returns
            # no boxes contributes no rows, so in the CSV it cannot be told
            # apart from an image that failed.
            for b, l, p in zip(boxes.tolist(), logits.tolist(), phrases):
                results.append({
                    "image": file,
                    "box_xc": b[0],   # center x
                    "box_yc": b[1],   # center y
                    "box_w": b[2],    # width
                    "box_h": b[3],    # height
                    "logit": l,
                    "phrase": p,      # object name
                })
        except Exception as e:
            # Best effort: report the failure and continue with the next image.
            failed_files.append(file)
            print(f"Error with {file}: {e}")

    if failed_files:
        print(f"{len(failed_files)} image(s) failed in {folder}")

    # Save CSV for this folder
    df = pd.DataFrame(results, columns=RESULT_COLUMNS)
    out_csv = os.path.join(base_path, f"{folder}_groundingdino.csv")
    df.to_csv(out_csv, index=False)
    print(f" Saved results (with phrases) to {out_csv}")


In [None]:
# This cell finds images that have no rows in the results CSV and reruns
# GroundingDINO on them.
#
# It imports everything it uses (including os, torch, pandas and tqdm) so it
# can run on a fresh kernel without executing the first-pass cell first.

import os
import sys

import pandas as pd
import torch
from tqdm import tqdm

# Make the cloned repository importable; skip the append when the path is
# already present so re-running the cell does not pile up duplicates.
if "/content/GroundingDINO" not in sys.path:
    sys.path.append("/content/GroundingDINO")

# Set GPU device (default = 0)
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Grounding DINO
from GroundingDINO.groundingdino.util.slconfig import SLConfig
from GroundingDINO.groundingdino.models import build_model
from GroundingDINO.groundingdino.util.utils import clean_state_dict
from GroundingDINO.groundingdino.util.inference import load_image, predict
from huggingface_hub import hf_hub_download

# -----------------------------------------------------------
# Load GroundingDINO model
# -----------------------------------------------------------
def load_model_hf(repo_id, filename, ckpt_config_filename, device='cpu'):
    """Build a GroundingDINO model from a config and checkpoint on the Hugging Face Hub.

    Args:
        repo_id: Hub repository that hosts both files.
        filename: Name of the checkpoint file inside the repository.
        ckpt_config_filename: Name of the model config file inside the repository.
        device: Target device (string or ``torch.device``) recorded on the
            config before the model is built.

    Returns:
        The model in eval mode. The checkpoint is deserialized onto the CPU
        and the model is not moved to ``device`` by this function.
    """
    cache_config_file = hf_hub_download(repo_id=repo_id, filename=ckpt_config_filename)
    args = SLConfig.fromfile(cache_config_file)
    # Record the device on the config *before* build_model() consumes it;
    # assigning it after the build comes too late for the builder to see it.
    # Stored as a plain string so both 'cpu' and torch.device(...) inputs work.
    args.device = str(device)
    model = build_model(args)

    cache_file = hf_hub_download(repo_id=repo_id, filename=filename)
    # NOTE(review): torch.load unpickles the downloaded file; only use
    # checkpoints from a repository you trust.
    checkpoint = torch.load(cache_file, map_location='cpu')
    # strict=False tolerates key mismatches; the returned report is printed so
    # missing/unexpected keys stay visible.
    log = model.load_state_dict(clean_state_dict(checkpoint['model']), strict=False)
    print(f"Model loaded from {cache_file} => {log}")
    model.eval()
    return model

# -----------------------------------------------------------
# Settings
# -----------------------------------------------------------
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hub repository plus the checkpoint / config file names to download from it.
ckpt_repo_id = "ShilongLiu/GroundingDINO"
ckpt_filename = "groundingdino_swinb_cogcoor.pth"
ckpt_config_filename = "GroundingDINO_SwinB.cfg.py"

groundingdino_model = load_model_hf(
    ckpt_repo_id, ckpt_filename, ckpt_config_filename, device=DEVICE
)

# -----------------------------------------------------------
# Parameters
# -----------------------------------------------------------
# Box confidence threshold handed to retry_missing_images() in the usage
# section of this cell.
BOX_THRESHOLD = 0.3

base_path = "/content/drive/MyDrive"
# The folder name must match the Drive directory exactly: "Flux-Dev", the
# spelling used by the first-pass cell and by the hard-coded paths in the
# usage section (it was previously misspelled "FLux-Dev").
folders = ["Flux-Dev"]

# -----------------------------------------------------------
# Utility: Get missing images
# -----------------------------------------------------------
def get_missing_images(folder_path, existing_csv):
    """List image files in ``folder_path`` that have no row in ``existing_csv``.

    Args:
        folder_path: Directory to scan; files ending in .jpg/.jpeg/.png
            (case-insensitive) count as images.
        existing_csv: Path to a results CSV with an ``image`` column. A file
            that does not exist, is empty, or lacks that column is treated as
            "nothing processed yet".

    Returns:
        Sorted list of image file names that do not appear in the CSV.
    """
    image_extensions = (".jpg", ".jpeg", ".png")
    all_images = {
        name for name in os.listdir(folder_path)
        if name.lower().endswith(image_extensions)
    }

    # Already processed images (from the CSV, when it is usable).
    done_images = set()
    if os.path.exists(existing_csv):
        try:
            done_df = pd.read_csv(existing_csv)
        except pd.errors.EmptyDataError:
            # A results file written from zero rows has nothing to parse.
            done_df = pd.DataFrame()
        if "image" in done_df.columns:
            done_images = set(done_df["image"].dropna().unique())

    # Missing = all - done
    missing = sorted(all_images - done_images)
    print(f"Total images: {len(all_images)} | Done: {len(done_images)} | Missing: {len(missing)}")
    return missing

# -----------------------------------------------------------
# Retry only missing images
# -----------------------------------------------------------
def retry_missing_images(folder_path, existing_csv, model, device, box_threshold=0.3,
                         text_threshold=0.25, out_csv="/content/groundingdino_missing.csv"):
    """Rerun detection on images that have no rows in ``existing_csv``.

    Args:
        folder_path: Directory containing the images.
        existing_csv: Results CSV from the first pass (see get_missing_images).
        model: Loaded GroundingDINO model passed through to predict().
        device: Device passed through to predict().
        box_threshold: Box threshold passed to predict().
        text_threshold: Text threshold passed to predict().
        out_csv: Where the retry results are written.

    Returns:
        ``out_csv`` after writing it, or ``None`` (and no file is written)
        when no image is missing.

    Note:
        An image for which predict() returns no boxes produces no rows, so it
        will be reported as missing again on the next call.
    """
    missing_files = get_missing_images(folder_path, existing_csv)

    if not missing_files:
        print(" No missing images, everything is processed already.")
        return None

    # Fixed column order; also yields a header row when nothing was detected,
    # so the file stays readable with pd.read_csv().
    columns = ["image", "box_xc", "box_yc", "box_w", "box_h", "logit"]
    results = []

    for file in tqdm(missing_files, desc="Retrying Missing Images"):
        image_path = os.path.join(folder_path, file)
        try:
            _image_source, image = load_image(image_path)

            # predict() requires a caption; a generic one is used and the
            # returned phrases are deliberately not recorded.
            boxes, logits, _phrases = predict(
                model=model,
                image=image,
                caption="object",
                box_threshold=box_threshold,
                text_threshold=text_threshold,
                device=device,
            )

            for box, logit in zip(boxes.tolist(), logits.tolist()):
                results.append({
                    "image": file,
                    "box_xc": box[0],   # center x
                    "box_yc": box[1],   # center y
                    "box_w": box[2],
                    "box_h": box[3],
                    "logit": logit,
                })
        except Exception as e:
            # Best effort: report the failure and continue with the next image.
            print(f" Error with {file}: {e}")

    # Save results as a new CSV
    df_missing = pd.DataFrame(results, columns=columns)
    df_missing.to_csv(out_csv, index=False)
    print(f" Saved missing results to {out_csv}")

    return out_csv




# -----------------------------------------------------------
# Example usage
# -----------------------------------------------------------
# Image directory to scan and the first-pass results it is checked against.
folder_path = "/content/drive/MyDrive/Flux-Dev"
existing_csv = "/content/drive/MyDrive/Flux-Dev_groundingdino.csv"

# The return value is left as the cell's last expression so the notebook
# displays it.
retry_missing_images(
    folder_path=folder_path,
    existing_csv=existing_csv,
    model=groundingdino_model,
    device=DEVICE,
    box_threshold=BOX_THRESHOLD,
)



In [14]:

# Paths
original_csv = "/content/drive/MyDrive/Flux-Dev_groundingdino.csv"
missing_csv  = "/content/groundingdino_missing.csv"
combined_csv = "/content/drive/MyDrive/Flux-Dev_groundingdino_final.csv"

# Load CSVs
df_original = pd.read_csv(original_csv)

# retry_missing_images() writes no file when nothing was missing, and a file
# written from zero rows may have nothing to parse. Neither case should stop
# the final CSV from being produced.
try:
    df_missing = pd.read_csv(missing_csv)
except (FileNotFoundError, pd.errors.EmptyDataError):
    print(f" No retry results found at {missing_csv}; using the original results only.")
    df_missing = pd.DataFrame(columns=df_original.columns)

# Concatenate (keep everything, no deduplication). An empty retry frame is
# left out so it cannot influence the column dtypes of the result.
frames = [df_original] if df_missing.empty else [df_original, df_missing]
df_combined = pd.concat(frames, ignore_index=True)

# Save combined file
df_combined.to_csv(combined_csv, index=False)
print(f" Final combined CSV saved to: {combined_csv}")
print(f"Total rows: {len(df_combined)}")


 Final combined CSV saved to: /content/drive/MyDrive/Flux-Dev_groundingdino_final.csv
Total rows: 293


In [15]:


# Path to final CSV
final_csv = "/content/drive/MyDrive/Flux-Dev_groundingdino_final.csv"

# Load
df_final = pd.read_csv(final_csv)

# Show basic info
print(" Final CSV loaded")
print("Total rows:", len(df_final))
print("Unique images:", df_final["image"].nunique())
print("Columns:", list(df_final.columns))

# Show first 5 rows
print("\nSample rows:")
print(df_final.head())

# Rows whose logit is empty. The detection cells in this notebook only write
# one row per detected box, so this count stays 0 unless the CSV came from
# somewhere else.
empty_images = df_final.loc[df_final["logit"].isna(), "image"].unique()
print("\nImages with no detections:", len(empty_images))
if len(empty_images) > 0:
    print(empty_images[:10])  # show first 10 such images

# Images that produced no box never get a row, so the reliable check is to
# compare the image folder against the names present in the CSV.
image_dir = "/content/drive/MyDrive/Flux-Dev"
if os.path.isdir(image_dir):
    images_on_disk = {
        name for name in os.listdir(image_dir)
        if name.lower().endswith((".jpg", ".jpeg", ".png"))
    }
    images_without_rows = sorted(images_on_disk - set(df_final["image"].dropna()))
    print("\nImages in folder with no rows in CSV:", len(images_without_rows))
    if images_without_rows:
        print(images_without_rows[:10])  # show first 10 such images
else:
    print(f"\nImage folder not found, skipped folder check: {image_dir}")


 Final CSV loaded
Total rows: 293
Unique images: 198
Columns: ['image', 'box_xc', 'box_yc', 'box_w', 'box_h', 'logit']

Sample rows:
                                               image    box_xc    box_yc  \
0  Flux-Dev-Pregen-COCO_train2014_000000222970.jp...  0.467087  0.601826   
1  Flux-Dev-Pregen-COCO_train2014_000000527557.jp...  0.705490  0.852973   
2  Flux-Dev-Pregen-COCO_train2014_000000527557.jp...  0.512593  0.427007   
3  Flux-Dev-Pregen-COCO_train2014_000000538776.jp...  0.564300  0.766977   
4  Flux-Dev-Pregen-COCO_train2014_000000538776.jp...  0.498302  0.499078   

      box_w     box_h     logit  
0  0.377371  0.353525  0.400746  
1  0.207826  0.154093  0.303093  
2  0.848355  0.452285  0.313313  
3  0.298227  0.368137  0.305515  
4  0.988309  0.920158  0.329653  

Images with no detections: 0


In [24]:

import json

# Source table: one row per detected box.
final_csv = "/content/drive/MyDrive/Flux-Dev_groundingdino_final.csv"
df = pd.read_csv(final_csv)

# One record per image, each holding the list of that image's boxes in the
# order they appear in the CSV.
json_list = []

for img_name, group in df.groupby("image"):
    boxes = [
        {"box": [xc, yc, w, h], "logit": logit}
        for xc, yc, w, h, logit in zip(
            group["box_xc"],
            group["box_yc"],
            group["box_w"],
            group["box_h"],
            group["logit"],
        )
    ]
    json_list.append({"image": img_name, "boxes": boxes})

# Write the records as indented JSON.
output_json = "/content/drive/MyDrive/Flux-Dev_groundingdino.json"
with open(output_json, "w") as f:
    json.dump(json_list, f, indent=4)

print(f" JSON saved at: {output_json}")


 JSON saved at: /content/drive/MyDrive/Flux-Dev_groundingdino.json


In [None]:

# The JSON export is written to Drive, so read and rewrite it there. A bare
# file name would be resolved against the current working directory instead.
json_path = "/content/drive/MyDrive/Flux-Dev_groundingdino.json"

# Load JSON
with open(json_path, "r") as f:
    data = json.load(f)

seen_images = set()
cleaned_data = []

for item in data:
    # Names that differ only by a " (1)" marker are treated as one image.
    # The first record encountered is kept unchanged, including its original
    # "image" value.
    image_name = item["image"].replace(" (1)", "")

    if image_name not in seen_images:
        cleaned_data.append(item)
        seen_images.add(image_name)

# Save cleaned JSON (overwrites the file that was loaded)
with open(json_path, "w") as f:
    json.dump(cleaned_data, f, indent=4)

print(f" Cleaned JSON saved with {len(cleaned_data)} unique images.")
