![Grounded SAM Inpainting Demo](https://github.com/IDEA-Research/Grounded-Segment-Anything/raw/main/assets/grounded_sam_inpainting_demo.png)

## Why this project?

- [Segment Anything](https://github.com/facebookresearch/segment-anything) is a strong segmentation model, but it needs prompts (such as boxes or points) to generate masks.
- [Grounding DINO](https://github.com/IDEA-Research/GroundingDINO) is a strong zero-shot detector that can generate high-quality boxes and labels from free-form text.
- The combination of the two models makes it possible **to detect and segment everything** with text inputs!



**Troubleshooting:** if you hit `NameError: name '_C' is not defined`, refer to this solution:

https://github.com/IDEA-Research/GroundingDINO/issues/8#issuecomment-1541892708

## Install

In [None]:
# Work from /content and fetch the Grounded-Segment-Anything sources;
# later cells change into the cloned directory.
%cd /content

!git clone https://github.com/IDEA-Research/Grounded-Segment-Anything

/content
Cloning into 'Grounded-Segment-Anything'...
remote: Enumerating objects: 1705, done.[K
remote: Counting objects: 100% (390/390), done.[K
remote: Compressing objects: 100% (73/73), done.[K
remote: Total 1705 (delta 329), reused 323 (delta 317), pack-reused 1315[K
Receiving objects: 100% (1705/1705), 124.77 MiB | 31.26 MiB/s, done.
Resolving deltas: 100% (802/802), done.


In [None]:
%cd /content/Grounded-Segment-Anything
!pip install -q -r requirements.txt
%cd /content/Grounded-Segment-Anything/GroundingDINO
!export CUDA_HOME=/user/local/cuda-11.3
!pip install -q .
%cd /content/Grounded-Segment-Anything/segment_anything
!pip install -q .
%cd /content/Grounded-Segment-Anything

/content/Grounded-Segment-Anything
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.7/16.7 MB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.4/6.4 MB[0m [31m43.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.7/86.7 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m64.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m254.7/254.7 kB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m266.3/266.3 kB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  In

## Imports

In [None]:
import os, sys

sys.path.append(os.path.join(os.getcwd(), "GroundingDINO"))

import argparse
import copy

from IPython.display import display
from PIL import Image, ImageDraw, ImageFont
from torchvision.ops import box_convert

# Grounding DINO
import GroundingDINO.groundingdino.datasets.transforms as T
from GroundingDINO.groundingdino.models import build_model
from GroundingDINO.groundingdino.util import box_ops
from GroundingDINO.groundingdino.util.slconfig import SLConfig
from GroundingDINO.groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
from GroundingDINO.groundingdino.util.inference import annotate, load_image, predict

import supervision as sv

# segment anything
from segment_anything import build_sam, SamPredictor
import cv2
import numpy as np
import matplotlib.pyplot as plt


# diffusers
import PIL
import requests
import torch
from io import BytesIO
from diffusers import StableDiffusionInpaintPipeline


from huggingface_hub import hf_hub_download

from google.colab import files



## Load models

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

### Grounding DINO model

In [None]:
def load_model_hf(repo_id, filename, ckpt_config_filename, device='cpu'):
    """Build a Grounding DINO model from files hosted on the Hugging Face Hub.

    Args:
        repo_id: Hub repository holding both the config and the checkpoint.
        filename: Checkpoint file name inside the repository.
        ckpt_config_filename: Config file name inside the repository.
        device: Stored on the parsed config as ``device`` and used as
            ``map_location`` when the checkpoint is read.

    Returns:
        The model produced by ``build_model`` with the checkpoint weights
        loaded, switched to eval mode.
    """
    # Fetch and parse the config first; the model is built from it before the
    # (much larger) checkpoint is downloaded.
    config_path = hf_hub_download(repo_id=repo_id, filename=ckpt_config_filename)
    model_args = SLConfig.fromfile(config_path)
    model_args.device = device
    model = build_model(model_args)

    weights_path = hf_hub_download(repo_id=repo_id, filename=filename)
    state = torch.load(weights_path, map_location=device)
    # strict=False: key mismatches are reported in the printed result instead
    # of raising.
    load_report = model.load_state_dict(clean_state_dict(state['model']), strict=False)
    print("Model loaded from {} \n => {}".format(weights_path, load_report))

    model.eval()
    return model

In [None]:
# Hub repository and file names of the Grounding DINO checkpoint and its config.
ckpt_repo_id = "ShilongLiu/GroundingDINO"
ckpt_filenmae = "groundingdino_swinb_cogcoor.pth"  # variable name kept as spelled in the notebook
ckpt_config_filename = "GroundingDINO_SwinB.cfg.py"

# Download (or reuse from the hub cache) and instantiate the detector on `device`.
groundingdino_model = load_model_hf(
    repo_id=ckpt_repo_id,
    filename=ckpt_filenmae,
    ckpt_config_filename=ckpt_config_filename,
    device=device,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


GroundingDINO_SwinB.cfg.py:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


final text_encoder_type: bert-base-uncased


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

groundingdino_swinb_cogcoor.pth:   0%|          | 0.00/938M [00:00<?, ?B/s]

Model loaded from /root/.cache/huggingface/hub/models--ShilongLiu--GroundingDINO/snapshots/a94c9b567a2a374598f05c584e96798a170c56fb/groundingdino_swinb_cogcoor.pth 
 => _IncompatibleKeys(missing_keys=[], unexpected_keys=['label_enc.weight', 'bert.embeddings.position_ids'])


### SAM

In [None]:
! wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth

sam_checkpoint = 'sam_vit_h_4b8939.pth'

sam_predictor = SamPredictor(build_sam(checkpoint=sam_checkpoint).to(device))

--2024-01-31 16:45:19--  https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 3.162.163.34, 3.162.163.51, 3.162.163.11, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|3.162.163.34|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2564550879 (2.4G) [binary/octet-stream]
Saving to: ‘sam_vit_h_4b8939.pth’


2024-01-31 16:45:33 (172 MB/s) - ‘sam_vit_h_4b8939.pth’ saved [2564550879/2564550879]



### Stable Diffusion (Inpainting)

In [None]:
# Half precision is only requested when running on CUDA. On a CPU-only runtime
# `device` is 'cpu', where float16 inference is presumably unsupported or very
# slow for this pipeline (TODO confirm), so fall back to float32 there.
sd_dtype = torch.float16 if device.type == 'cuda' else torch.float32

sd_pipe = StableDiffusionInpaintPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-inpainting",
    torch_dtype=sd_dtype,
).to(device)

model_index.json:   0%|          | 0.00/544 [00:00<?, ?B/s]

Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

text_encoder/config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

tokenizer/vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

(…)ature_extractor/preprocessor_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

unet/config.json:   0%|          | 0.00/914 [00:00<?, ?B/s]

scheduler/scheduler_config.json:   0%|          | 0.00/308 [00:00<?, ?B/s]

tokenizer/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer/tokenizer_config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

tokenizer/special_tokens_map.json:   0%|          | 0.00/460 [00:00<?, ?B/s]

vae/config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.36G [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/3.46G [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

Cannot initialize model with low cpu memory usage because `accelerate` was not found in the environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install `accelerate` for faster and less memory-intense model loading. You can do so with: 
```
pip install accelerate
```
.


Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]

## Grounding DINO for detection

In [None]:
# detect object using grounding DINO
def detect(image, text_prompt, model, box_threshold = 0.3, text_threshold = 0.25, image_source = None):
  """Run Grounding DINO on one image and draw the detections.

  Args:
    image: Model input passed to ``predict`` as ``image``.
    text_prompt: Caption passed to ``predict`` as ``caption``.
    model: Grounding DINO model passed to ``predict``.
    box_threshold: Forwarded to ``predict``.
    text_threshold: Forwarded to ``predict``.
    image_source: Image handed to ``annotate`` for drawing (in this notebook,
      the first value returned by ``load_image``). When omitted, the
      notebook-level variable ``image_source`` is used, which is what this
      function silently relied on before the parameter existed.

  Returns:
    ``(annotated_frame, boxes)``: the annotated image with its last axis
    reversed (BGR to RGB), and the boxes returned by ``predict``.

  Raises:
    ValueError: if ``image_source`` is neither passed nor defined globally.
  """
  if image_source is None:
    # Backward-compatible fallback for callers that do not pass the argument.
    image_source = globals().get("image_source")
    if image_source is None:
      raise ValueError(
          "detect() needs the original image: pass image_source=... "
          "(the first value returned by load_image)."
      )

  boxes, logits, phrases = predict(
      model=model,
      image=image,
      caption=text_prompt,
      box_threshold=box_threshold,
      text_threshold=text_threshold
  )

  annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)
  annotated_frame = annotated_frame[...,::-1] # BGR to RGB
  return annotated_frame, boxes

## SAM for segmentation

In [None]:
def segment(image, sam_model, boxes, max_box_height=600):
  """Predict one SAM mask per detected box, skipping overly tall boxes.

  Args:
    image: Image array of shape (H, W, channels); it is handed to
      ``sam_model.set_image`` and its shape is used to scale the boxes.
    sam_model: Predictor exposing ``set_image``, ``transform.apply_boxes_torch``
      and ``predict_torch`` (a ``SamPredictor`` in this notebook).
    boxes: Boxes in cxcywh format; they are converted to xyxy and multiplied
      by (W, H, W, H) -- presumably normalized to [0, 1], verify against the
      detector output.
    max_box_height: Boxes whose height is not strictly below this value are
      dropped. The height is measured on the boxes returned by
      ``apply_boxes_torch``, not on the original pixel grid. Pass ``None`` to
      keep every box. Defaults to 600, the previously hard-coded limit.

  Returns:
    The masks returned by ``predict_torch``, moved to the CPU.
  """
  sam_model.set_image(image)
  H, W, _ = image.shape
  boxes_xyxy = box_ops.box_cxcywh_to_xyxy(boxes) * torch.Tensor([W, H, W, H])

  transformed_boxes = sam_model.transform.apply_boxes_torch(boxes_xyxy.to(device), image.shape[:2])

  if max_box_height is None:
    filtered_boxes = transformed_boxes
  else:
    # get rid of the big boxes (height = y_max - y_min)
    valid_mask = transformed_boxes[:, 3] - transformed_boxes[:, 1] < max_box_height
    filtered_boxes = transformed_boxes[valid_mask]

  # Box prompts only; no point prompts are supplied.
  masks, _, _ = sam_model.predict_torch(
      point_coords = None,
      point_labels = None,
      boxes = filtered_boxes,
      multimask_output = False,
      )
  return masks.cpu()


def draw_masks(masks, image, random_color=True):
  """Overlay each mask on ``image`` as a translucent colored layer.

  Args:
    masks: Iterable of masks; each one is reshaped to its last two dimensions
      (height, width) before being tinted.
    image: Array accepted by ``Image.fromarray``; used as the base layer.
    random_color: When true, every mask gets a random RGB tint at 0.8 alpha;
      otherwise a fixed color at 0.6 alpha is used.

  Returns:
    The composited RGBA image as a NumPy array.
  """
  canvas = Image.fromarray(image).convert("RGBA")

  for mask in masks:
    # RGBA tint with components in the 0..1 range.
    rgba = (
        np.concatenate([np.random.random(3), np.array([0.8])], axis=0)
        if random_color
        else np.array([30/255, 144/255, 255/255, 0.6])
    )
    height, width = mask.shape[-2:]
    tinted = mask.reshape(height, width, 1) * rgba.reshape(1, 1, -1)
    overlay = Image.fromarray((tinted.cpu().numpy() * 255).astype(np.uint8)).convert("RGBA")
    canvas = Image.alpha_composite(canvas, overlay)

  return np.array(canvas)


## Loading the images, detection, and segmentation


In [None]:
# Load images from local folder in a batch
def load_images_from_folder(folder_path):
    """List the image files found directly inside ``folder_path``.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        Sorted list of file names (not full paths) whose extension is .png,
        .jpg or .jpeg, compared case-insensitively. Sorting makes the
        processing order reproducible, since ``os.listdir`` returns entries in
        arbitrary order. Directories that merely look like images by name are
        skipped.
    """
    image_extensions = ('.png', '.jpg', '.jpeg')
    return sorted(
        name
        for name in os.listdir(folder_path)
        if name.lower().endswith(image_extensions)
        and os.path.isfile(os.path.join(folder_path, name))
    )

# Input folder (relative to the current working directory) holding the images to process.
local_folder_path = "assets/apple2images"
if not os.path.exists(local_folder_path):
    os.makedirs(local_folder_path)
    print(f"New folder '{local_folder_path}' created.")
else:
    print(f"The folder '{local_folder_path}' already exists.")

# Optional: upload images through the Colab file picker instead of copying
# them into the folder by hand. Uncomment to use.
# # Prompt the user to upload images
# print("Please upload image files to the new folder.")
# uploaded_files = files.upload()

# # Move the uploaded files to the new folder
# for filename, content in uploaded_files.items():
#     file_path = os.path.join(local_folder_path, filename)
#     with open(file_path, 'wb') as f:
#         f.write(content)
#         print(f"File '{filename}' uploaded to '{local_folder_path}'.")

# # List the contents of the new folder
# print(f"Contents of '{local_folder_path}':")
# print(os.listdir(local_folder_path))

# Mount Google Drive BEFORE creating anything under /content/drive. Creating
# the output folder first leaves files inside the mount point, after which
# drive.mount() fails with "Mountpoint must not already contain files" and the
# results never reach Drive.
# NOTE(review): drive.mount is expected to be a no-op when Drive is already
# mounted -- confirm on the target runtime.
from google.colab import drive
drive.mount('/content/drive')

# create output folder
output_folder = "/content/drive/MyDrive/apple2masks"
if not os.path.exists(output_folder):
    os.makedirs(output_folder)
    print(f"New folder '{output_folder}' created.")
else:
    print(f"The folder '{output_folder}' already exists.")


# For every input image: detect boxes, segment them with SAM, and write three
# PNGs to `output_folder`: <stem>_boxes.png, <stem>_colored_masks.png and
# <stem>_masks.png (white masks on a black background).
image_files = load_images_from_folder(local_folder_path)

for image_file in image_files:
    image_stem = os.path.splitext(image_file)[0]
    image_path = os.path.join(local_folder_path, image_file)
    image_source, image = load_image(image_path)

    # detection
    annotated_frame, detected_boxes = detect(image, text_prompt="fruit", model=groundingdino_model)
    output_filename = f"{image_stem}_boxes.png"
    output_path = os.path.join(output_folder, output_filename)
    Image.fromarray(annotated_frame).save(output_path)

    # Nothing to segment: skip instead of failing further down on an empty
    # mask tensor.
    if len(detected_boxes) == 0:
        print(f"No boxes detected in '{image_file}'; skipping segmentation.")
        continue

    # segmentation
    segmented_frame_masks = segment(image_source, sam_predictor, boxes=detected_boxes)
    # segment() drops oversized boxes, so the result can be empty even when
    # boxes were detected.
    if len(segmented_frame_masks) == 0:
        print(f"No masks produced for '{image_file}'; skipping mask export.")
        continue

    annotated_frame_with_masks = draw_masks(segmented_frame_masks, annotated_frame)
    output_filename = f"{image_stem}_colored_masks.png"
    output_path = os.path.join(output_folder, output_filename)
    Image.fromarray(annotated_frame_with_masks).save(output_path)

    # masks
    image_source_pil = Image.fromarray(image_source)  # not used below; kept in case later cells read it

    # white background
    # background_image = np.ones_like(segmented_frame_masks[0][0].cpu().numpy(), dtype=np.uint8)*255

    # black background, same height/width as a single mask
    background_image = np.zeros_like(segmented_frame_masks[0][0].cpu().numpy(), dtype=np.uint8)

    masks_image_pil = Image.fromarray(background_image).convert("RGBA")

    for i in range(len(segmented_frame_masks)):
        mask = segmented_frame_masks[i][0].cpu().numpy()

        # White layer that is opaque inside the mask and fully transparent outside it
        alpha_channel = (mask != 0).astype(np.uint8) * 255
        white_foreground = np.ones_like(mask, dtype=np.uint8) * 255
        image_mask_pil = Image.fromarray(np.dstack([white_foreground, white_foreground, white_foreground, alpha_channel]), "RGBA")

        # Blend the mask onto the image without accumulating the black background
        masks_image_pil = Image.alpha_composite(masks_image_pil, image_mask_pil)

    output_filename = f"{image_stem}_masks.png"
    output_path = os.path.join(output_folder, output_filename)
    masks_image_pil.save(output_path)



The folder 'assets/apple2images' already exists.
The folder '/content/drive/MyDrive/apple2masks' already exists.




NameError: name '_C' is not defined

In [None]:
# Return to the repository root; earlier cells use paths relative to it (e.g. "assets/apple2images").
%cd /content/Grounded-Segment-Anything/

/content/Grounded-Segment-Anything


In [None]:
# prompt: connect to drive

# Mount Google Drive at /content/drive.
# NOTE(review): the recorded output of this cell is "ValueError: Mountpoint must
# not already contain files" -- presumably because an earlier cell already
# created /content/drive/MyDrive/apple2masks before Drive was mounted; verify.
from google.colab import drive
drive.mount('/content/drive')


ValueError: Mountpoint must not already contain files

In [None]:
# shutil is used by the folder-copy cell that follows; the mount call repeats
# the one in the previous cell.
import shutil
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')


In [None]:
# Specify the source folder in Colab
source_folder_colab = '/content/Grounded-Segment-Anything/assets/masks'

# Specify the destination folder in Google Drive
destination_folder_drive = '/content/drive/MyDrive/masks'

# Create the source folder in Colab (for demonstration purposes)
os.makedirs(source_folder_colab, exist_ok=True)

# Copy the entire folder to Google Drive.
# dirs_exist_ok=True (Python 3.8+) lets the cell be re-run: without it,
# copytree raises FileExistsError as soon as the destination already exists.
# Files with the same name in the destination are overwritten.
shutil.copytree(source_folder_colab, destination_folder_drive, dirs_exist_ok=True)

print(f"Folder '{source_folder_colab}' copied to '{destination_folder_drive}'.")