## Combine GroundingDINO with SegmentAnythingModel
- Use [Grounding DINO](https://github.com/IDEA-Research/GroundingDINO) for zero-shot object detection (bboxes)
- Use Segment Anything Model ([SAM](https://github.com/facebookresearch/segment-anything)) for converting detections into segmentations

***
## Imports and Installs

In [14]:
import os
from glob import glob

import cv2
import matplotlib.pyplot as plt

import torch

***
### Install GroundingDINO & Download Weights 

In [2]:
%%capture

!git clone https://github.com/IDEA-Research/GroundingDINO.git
%cd GroundingDINO/
%pip install -qqq -e .

# download weights
%mkdir weights
%cd weights
!wget -q https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth
%cd ..

***
### Install SAM & Load Weights

In [None]:
%pip install -qqq git+https://github.com/facebookresearch/segment-anything.git

In [4]:
from segment_anything import sam_model_registry, SamAutomaticMaskGenerator, SamPredictor

sam_checkpoint = "/kaggle/input/segment-anything/pytorch/vit-b/1/model.pth"
model_type = "vit_b"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)

***
### Load Models

In [9]:
from groundingdino.util.inference import load_model, load_image, predict, annotate

dino_model = load_model("groundingdino/config/GroundingDINO_SwinT_OGC.py", "weights/groundingdino_swint_ogc.pth")

final text_encoder_type: bert-base-uncased


In [11]:
dino_model.to(device)

GroundingDINO(
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-5): 6 x DeformableTransformerEncoderLayer(
          (self_attn): MultiScaleDeformableAttention(
            (sampling_offsets): Linear(in_features=256, out_features=256, bias=True)
            (attention_weights): Linear(in_features=256, out_features=128, bias=True)
            (value_proj): Linear(in_features=256, out_features=256, bias=True)
            (output_proj): Linear(in_features=256, out_features=256, bias=True)
          )
          (dropout1): Dropout(p=0.0, inplace=False)
          (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (linear1): Linear(in_features=256, out_features=2048, bias=True)
          (dropout2): Dropout(p=0.0, inplace=False)
          (linear2): Linear(in_features=2048, out_features=256, bias=True)
          (dropout3): Dropout(p=0.0, inplace=False)
          (norm2): LayerNorm((256,), eps=1e-05, elementwise_aff

In [13]:
# send SAM to GPU
sam.to(device)

mask_generator = SamAutomaticMaskGenerator(sam,
                                           points_per_side=32,
                                           pred_iou_thresh=0.86,
                                           stability_score_thresh=0.92,
                                           crop_n_layers=1,
                                           crop_n_points_downscale_factor=2,
                                           min_mask_region_area=100, )# points_per_batch=16)

In [8]:
device

device(type='cuda')