In [1]:
import torch
from PIL import Image
from transformers import AutoProcessor, GroundingDinoProcessor, GroundingDinoImageProcessor, GroundingDinoForObjectDetection, infer_device

model_id = "IDEA-Research/grounding-dino-tiny"
device = infer_device()

processor = GroundingDinoProcessor.from_pretrained(model_id)
model = GroundingDinoForObjectDetection.from_pretrained(model_id).to(device)

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

In [2]:
help(processor.__call__)

Help on method __call__ in module transformers.models.grounding_dino.processing_grounding_dino:

__call__(images: Union[ForwardRef('PIL.Image.Image'), numpy.ndarray, ForwardRef('torch.Tensor'), list['PIL.Image.Image'], list[numpy.ndarray], list['torch.Tensor']] = None, text: Union[str, list[str], list[list[str]]] = None, audio=None, videos=None, **kwargs: typing_extensions.Unpack[transformers.models.grounding_dino.processing_grounding_dino.GroundingDinoProcessorKwargs]) -> transformers.tokenization_utils_base.BatchEncoding method of transformers.models.grounding_dino.processing_grounding_dino.GroundingDinoProcessor instance
    This method uses [`GroundingDinoImageProcessor.__call__`] method to prepare image(s) for the model, and
    [`BertTokenizerFast.__call__`] to prepare text for the model.
    
    Args:
        images (`ImageInput`, `list[ImageInput]`, *optional*):
            The image or batch of images to be processed. The image might be either PIL image, numpy array or a torc

In [None]:
image_path = "/mnt/e/text-image-product-search/images/000000039769.jpg"
image = Image.open(image_path).convert("RGB")

In [None]:
# text_labels = [["plant pot", "book", "table", "chair", "carpet", "rug"]] # text khong duoc co '.'
text_labels = [["a cat", "a remote control"]]
# text_labels = ["a cat.", "a remote control."]

In [15]:
inputs = processor(images=image, text=text_labels, return_tensors="pt", truncation=True, padding=True).to(model.device)

In [16]:
with torch.inference_mode():
    outputs = model(**inputs)

In [17]:
results = processor.post_process_grounded_object_detection(
    outputs=outputs,
    input_ids=inputs.input_ids,
    threshold=0.4,
    text_threshold=0.3,
    target_sizes=[image.size[::-1]]
)

In [65]:
result = results[0]
for box, score, labels in zip(result["boxes"], result["scores"], result["text_labels"]):
    # box = [round(x, 2) for x in box.tolist()]
    # print(f"Detected {labels} with confidence {round(score.item(), 3)} at location {box}")
    print(box.is_cuda)

True
True
True


In [52]:
list(zip(result["boxes"], result["scores"], result["text_labels"]))[0]

(tensor([344.6931,  23.1090, 637.1847, 374.2747], device='cuda:0'),
 tensor(0.4785, device='cuda:0'),
 'a cat')

In [33]:
results[0]["boxes"].detach().cpu().tolist()

[[344.69305419921875, 23.109039306640625, 637.1846923828125, 374.274658203125],
 [12.26496696472168, 51.915008544921875, 316.8591003417969, 472.4386901855469],
 [38.58332061767578, 70.0059585571289, 176.77804565429688, 118.17623901367188]]

In [35]:
results[0]

{'scores': tensor([0.4785, 0.4381, 0.4759], device='cuda:0'),
 'boxes': tensor([[344.6931,  23.1090, 637.1847, 374.2747],
         [ 12.2650,  51.9150, 316.8591, 472.4387],
         [ 38.5833,  70.0060, 176.7780, 118.1762]], device='cuda:0'),
 'text_labels': ['a cat', 'a cat', 'a remote control'],
 'labels': ['a cat', 'a cat', 'a remote control']}

In [45]:
help(processor.__call__)

Help on method __call__ in module transformers.models.grounding_dino.processing_grounding_dino:

__call__(images: Union[ForwardRef('PIL.Image.Image'), numpy.ndarray, ForwardRef('torch.Tensor'), list['PIL.Image.Image'], list[numpy.ndarray], list['torch.Tensor']] = None, text: Union[str, list[str], list[list[str]]] = None, audio=None, videos=None, **kwargs: typing_extensions.Unpack[transformers.models.grounding_dino.processing_grounding_dino.GroundingDinoProcessorKwargs]) -> transformers.tokenization_utils_base.BatchEncoding method of transformers.models.grounding_dino.processing_grounding_dino.GroundingDinoProcessor instance
    This method uses [`GroundingDinoImageProcessor.__call__`] method to prepare image(s) for the model, and
    [`BertTokenizerFast.__call__`] to prepare text for the model.
    
    Args:
        images (`ImageInput`, `list[ImageInput]`, *optional*):
            The image or batch of images to be processed. The image might be either PIL image, numpy array or a torc

---

In [62]:
from PIL import Image
import requests
from transformers import SamModel, SamProcessor

model = SamModel.from_pretrained("facebook/sam-vit-base")
processor = SamProcessor.from_pretrained("facebook/sam-vit-base")


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

In [67]:
help(processor.__call__)

Help on method __call__ in module transformers.models.sam.processing_sam:

__call__(images: Union[ForwardRef('PIL.Image.Image'), numpy.ndarray, ForwardRef('torch.Tensor'), list['PIL.Image.Image'], list[numpy.ndarray], list['torch.Tensor'], NoneType] = None, text: Union[str, list[str], list[list[str]], NoneType] = None, audio: Union[ForwardRef('np.ndarray'), ForwardRef('torch.Tensor'), list['np.ndarray'], list['torch.Tensor'], NoneType] = None, video: Union[list['PIL.Image.Image'], ForwardRef('np.ndarray'), ForwardRef('torch.Tensor'), list['np.ndarray'], list['torch.Tensor'], list[list['PIL.Image.Image']], list[list['np.ndarrray']], list[list['torch.Tensor']], transformers.video_utils.URL, list[transformers.video_utils.URL], list[list[transformers.video_utils.URL]], transformers.video_utils.Path, list[transformers.video_utils.Path], list[list[transformers.video_utils.Path]], NoneType] = None, **kwargs) -> transformers.tokenization_utils_base.BatchEncoding method of transformers.models.s

In [82]:
import os
from PIL import Image
from datetime import datetime, timezone
import cv2
import numpy as np
import torch 

def get_image_metadata(file_path):
    stat_info = os.stat(file_path)
    size = stat_info.st_size   # (bytes)
    uploaded_at = datetime.fromtimestamp(stat_info.st_mtime, tz=timezone.utc)
    
    with Image.open(file_path) as img:
        width, height = img.size
        format = img.format.lower()  # jpg, png, webp,...

    return {
        "file_name": os.path.basename(file_path),
        "url": os.path.abspath(file_path),
        "uploaded_at": uploaded_at.isoformat(),
        "size": size,
        "format": format,
        "width": width,
        "height": height
    }
    
    
def mask_to_polygon(mask: np.ndarray) -> list[list[int]]:
    # Find contours in the binary mask
    contours, _ = cv2.findContours(mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Find the contour with the largest area
    largest_contour = max(contours, key=cv2.contourArea)

    # Extract the vertices of the contour
    polygon = largest_contour.reshape(-1, 2).tolist()

    return polygon

def polygon_to_mask(polygon: list[tuple[int, int]], image_shape: tuple[int, int]) -> np.ndarray:
    """
    Convert a polygon to a segmentation mask.

    Args:
    - polygon (list): List of (x, y) coordinates representing the vertices of the polygon.
    - image_shape (tuple): Shape of the image (height, width) for the mask.

    Returns:
    - np.ndarray: Segmentation mask with the polygon filled.
    """
    # Create an empty mask
    mask = np.zeros(image_shape, dtype=np.uint8)

    # Convert polygon to an array of points
    pts = np.array(polygon, dtype=np.int32)

    # Fill the polygon with white color (255)
    cv2.fillPoly(mask, [pts], color=(255,))

    return mask

def get_boxes(results: list[dict]) -> list[list[list[float]]]:
    # `results` are the detection results from GroundingDino. This is a list of dictionaries, with each dictionary containing the following keys:
    #   "scores: The confidence scores for each predicted box on the image.
    #   "labels: Indexes of the classes predicted by the model on the image.
    #   "boxes: Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
    
    list_boxes = []
    for result in results:
        boxes = result["boxes"].detach().cpu().tolist()
        list_boxes.append(boxes)
        
    return list_boxes

def refine_masks(masks : torch.Tensor, polygon_refinement: bool = False):
    masks = masks.detach().cpu().float()
    if masks.ndim == 4:   # [B, C, H, W]
        masks = masks.permute(0, 2, 3, 1).mean(dim=-1)  # [B, H, W]
    else:
        raise ValueError(f"Unexpected mask shape: {masks.shape}")
    masks = masks.mean(axis=-1)
    masks = (masks > 0).int()
    masks = masks.numpy().astype(np.uint8)
    masks = list(masks)

    if polygon_refinement:
        for idx, mask in enumerate(masks):
            shape = mask.shape
            polygon = mask_to_polygon(mask)
            mask = polygon_to_mask(polygon, shape)
            masks[idx] = mask

    return masks

In [83]:
from transformers import GroundingDinoProcessor, GroundingDinoForObjectDetection, infer_device
from PIL import Image
import string
import torch

class GroundingDinoDetector:
    def __init__(self, model_id: str="IDEA-Research/grounding-dino-tiny", device=None):
        self.device = device or infer_device() 
        self.processor = GroundingDinoProcessor.from_pretrained(model_id)
        self.model = GroundingDinoForObjectDetection.from_pretrained(model_id).to(self.device)
    
    def detect(self, text:str | list[str], image: str | list[str] | Image.Image | list[Image.Image], threshold: float = 0.4, text_threshold: float = 0.3):
        """
        Detect objects in the image based on the text prompt.
        Args:
            text (str or list of str): Text prompt(s) for object detection.
            image (str or list of str or Image or list of Image): Path(s) to image file(s) or PIL Image(s)
            threshold (float): Threshold to keep object detection predictions based on confidence score.
            text_threshold (float): Score threshold to keep text detection predictions.
        Returns:
            list[Dict]: A list of dictionaries, each dictionary containing the
                scores: tensor of confidence scores for detected objects
                boxes: tensor of bounding boxes in [x0, y0, x1, y1] format
                labels: list of text labels for each detected object (will be replaced with integer ids in v4.51.0)
                text_labels: list of text labels for detected objects
        """
        
        if text is None or image is None:
            raise ValueError("Both `text` and `image` must be provided")
        
        if not isinstance(image, list):
            image = [image]
            
        pil_images = []
        for img in image:
            if isinstance(img, str):
                with Image.open(img) as im:
                    pil_images.append(im.convert("RGB").copy())
            elif isinstance(img, Image.Image):
                pil_images.append(img if img.mode == "RGB" else img.convert("RGB"))
            else:
                raise ValueError("`image` must be a file path, a PIL Image, or a list of either")
        
        # remove punctuation from text
        if isinstance(text, list):
            text_labels = [label.translate(str.maketrans('', '', string.punctuation)) for label in text]
        else:
            text_labels = [text.translate(str.maketrans('', '', string.punctuation))]
        
        # Preproces
        inputs = self.processor(images=pil_images, text=text_labels, return_tensors="pt").to(self.model.device)
        
        with torch.inference_mode():
            outputs = self.model(**inputs)
        
        # Post-process results
        results = self.processor.post_process_grounded_object_detection(
            outputs=outputs,
            input_ids=inputs.input_ids,
            threshold=threshold,
            text_threshold=text_threshold,
            target_sizes=[pil_images[0].size[::-1]]
        )
        
        return results
    
from transformers import SamModel, SamProcessor, infer_device
from PIL import Image
import torch
import numpy as np

class SamSegmentator:
    def __init__(self, model_id="facebook/sam-vit-base", device=None):
        self.device = device or infer_device()
        self.processor = SamProcessor.from_pretrained(model_id)
        self.model = SamModel.from_pretrained(model_id).to(self.device)
    
    def segment(self, 
                image: str | list[str] | Image.Image | list[Image.Image], 
                detection_results: list[dict],
                polygon_refinement: bool = False
    ):
        
        if image is None or detection_results is None:
            raise ValueError("Both `image` and `detection_results` must be provided")
        
        if not isinstance(image, list):
            image = [image]
        
        pil_images = []
        for img in image:
            if isinstance(img, str):
                with Image.open(img) as im:
                    pil_images.append(im.convert("RGB").copy())
            elif isinstance(img, Image.Image):
                pil_images.append(img if img.mode == "RGB" else img.convert("RGB"))
            else:
                raise ValueError("`image` must be a file path, a PIL Image, or a list of either")
            
        boxes = get_boxes(results=detection_results)
        inputs = self.processor(images=pil_images, input_boxes=boxes, return_tensors="pt").to(self.device)
        with torch.inference_mode():
            outputs = self.model(**inputs)
            
        masks = self.processor.post_process_masks(
            masks=outputs.pred_masks,
            original_sizes=inputs.original_sizes,
            reshaped_input_sizes=inputs.reshaped_input_sizes
        )[0]
        
        masks = refine_masks(masks, polygon_refinement)
        # for detection_result, mask in zip(detection_results, masks):
        #     detection_result.mask = mask

        # return detection_results 
        return detection_results, masks

In [84]:
detector = GroundingDinoDetector()
segmentator = SamSegmentator()

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

In [85]:
labels = ["a cat.", "a remote control."]
img_path = "/mnt/e/text-image-product-search/images/000000039769.jpg"
# result = detector.detect(text=labels, image_path=img_path)
detection_results = detector.detect(text=labels, image=img_path)
detection_results_1, masks = segmentator.segment(image=image_path, detection_results=detection_results)

In [90]:
detection_results[0]



{'scores': tensor([0.4785, 0.4381, 0.4759], device='cuda:0'),
 'boxes': tensor([[344.6931,  23.1090, 637.1847, 374.2747],
         [ 12.2650,  51.9150, 316.8591, 472.4387],
         [ 38.5833,  70.0060, 176.7780, 118.1762]], device='cuda:0'),
 'text_labels': ['a cat', 'a cat', 'a remote control'],
 'labels': ['a cat', 'a cat', 'a remote control']}

In [94]:
masks[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [59]:
from transformers import AutoModelForMaskGeneration, AutoProcessor
segmenter_id = None
segmenter_id = segmenter_id if segmenter_id is not None else "facebook/sam-vit-base"

segmentator = AutoModelForMaskGeneration.from_pretrained(segmenter_id).to(device)
processor = AutoProcessor.from_pretrained(segmenter_id)

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

In [62]:
type(segmentator), type(processor)

(transformers.models.sam.modeling_sam.SamModel,
 transformers.models.sam.processing_sam.SamProcessor)

In [63]:
help(processor.__call__)

Help on method __call__ in module transformers.models.sam.processing_sam:

__call__(images: Union[ForwardRef('PIL.Image.Image'), numpy.ndarray, ForwardRef('torch.Tensor'), list['PIL.Image.Image'], list[numpy.ndarray], list['torch.Tensor'], NoneType] = None, text: Union[str, list[str], list[list[str]], NoneType] = None, audio: Union[ForwardRef('np.ndarray'), ForwardRef('torch.Tensor'), list['np.ndarray'], list['torch.Tensor'], NoneType] = None, video: Union[list['PIL.Image.Image'], ForwardRef('np.ndarray'), ForwardRef('torch.Tensor'), list['np.ndarray'], list['torch.Tensor'], list[list['PIL.Image.Image']], list[list['np.ndarrray']], list[list['torch.Tensor']], transformers.video_utils.URL, list[transformers.video_utils.URL], list[list[transformers.video_utils.URL]], transformers.video_utils.Path, list[transformers.video_utils.Path], list[list[transformers.video_utils.Path]], NoneType] = None, **kwargs) -> transformers.tokenization_utils_base.BatchEncoding method of transformers.models.s

In [114]:
import cv2
import numpy as np

# Đọc ảnh
img = cv2.imread("/mnt/e/text-image-product-search/images/canny-edges.png")
if img is None:
    raise ValueError("file not found or not an image")

# Chuyển sang grayscale
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

# Threshold để tạo ảnh nhị phân
_, thresh = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY)

# Tìm contour
contours, hierarchy = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

# print(f"Found {len(contours)} contours")

# # Vẽ contour lên ảnh copy
# img_contour = img.copy()
# cv2.drawContours(img_contour, contours, -1, (0, 255, 0), 2)

# # Hiển thị kết quả
# cv2.imshow("Original", img)
# cv2.imshow("Threshold", thresh)
# cv2.imshow("Contours", img_contour)

# cv2.waitKey(0)
# cv2.destroyAllWindows()

largest_contour = max(contours, key=cv2.contourArea)

    # Extract the vertices of the contour
polygon = largest_contour.reshape(-1, 2).tolist()
print(polygon)


[[19, 11], [18, 12], [18, 180], [19, 181], [228, 181], [229, 180], [229, 12], [227, 12], [226, 11]]


In [110]:
len(contours), contours[0].shape, contours[0][:5]  # first 5 points of the first contour

(6,
 (12, 1, 2),
 array([[[ 34, 277]],
 
        [[ 34, 279]],
 
        [[ 36, 281]],
 
        [[ 37, 280]],
 
        [[ 38, 281]]], dtype=int32))