# Auto-Generating COCO Annotations for Instance Segmentation using FastSAM

Instance segmentation requires high-quality annotations, but manual annotation is time-consuming and expensive. This notebook automates the annotation process by leveraging FastSAM, a lightweight and efficient segmentation model, to generate masks from images.

The key steps in this pipeline include:



1.   Mask Generation with FastSAM - Detects object masks quickly.
2.   Post-processing - Reduces errors, removes false detections, and refines results.
3.   COCO JSON Conversion - Converts masks into COCO format for training deep learning models.

## Import required libraries and setup

In [None]:
import os
# Working directory at notebook start; used by the %cd commands below.
HOME = os.getcwd()

%cd {HOME}

# Clone the FastSAM repo and install the required libraries.
!git clone https://github.com/CASIA-IVA-Lab/FastSAM.git
!pip -q install -r FastSAM/requirements.txt
# CLIP is installed directly from its GitHub repository.
!pip -q install git+https://github.com/openai/CLIP.git

# Switch into the cloned repository; relative paths used by later cells
# (e.g. "weights/") resolve against this directory.
%cd {HOME}/FastSAM

In [None]:
import os
from typing import Union
import numpy as np
import pandas as pd
import torch
from scipy import ndimage
import cv2
import skimage
from fastsam import FastSAM, FastSAMPrompt
import matplotlib.pyplot as plt


# Use the first CUDA GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda:0')
else:
    DEVICE = torch.device('cpu')
print(f"DEVICE = {DEVICE}")

In [None]:
#@title Utils

# Region properties requested from skimage.measure.regionprops_table in
# extract_properties(); their quotient becomes the "axis_ratio" feature.
_PROPERTIES = (
    "major_axis_length",
    "minor_axis_length",
)


def masks_to_bool(masks: Union[np.ndarray, torch.Tensor]) -> np.ndarray:
  """Convert masks to boolean format.

  Args:
      masks: Input masks, either as a NumPy array (including ndarray
        subclasses) or a PyTorch tensor on any device.

  Returns:
      A new plain NumPy boolean array in which every non-zero value is True.
  """
  # isinstance (rather than an exact type comparison) also routes ndarray
  # subclasses here; they have no `.cpu()` method and would otherwise crash.
  if isinstance(masks, np.ndarray):
    # np.array copies and drops the subclass, so callers always get a fresh
    # base-class ndarray.
    return np.array(masks, dtype=bool)
  # detach() is a no-op for inference outputs but lets tensors that still
  # track gradients be converted; cpu() moves GPU tensors to host memory.
  return masks.detach().cpu().numpy().astype(bool)


def plot_boolean_masks(masks: np.ndarray, masks_per_row: int = 5):
    """Plots boolean masks in a grid format with a fixed number of masks per row.

    Each mask is drawn in its own subplot titled "Mask <n>" (1-based). Unused
    cells in the last row are left blank. When there are no masks, a short
    message is printed and no figure is created.

    Args:
        masks: Boolean masks, indexed along the first axis.
        masks_per_row: Number of masks to display per row.
    """
    num_masks = len(masks)
    if num_masks == 0:
        # A grid with zero rows cannot be created; report and stop instead.
        print("No masks to plot")
        return

    # Ceiling division: number of rows needed to fit all masks.
    num_rows = (num_masks + masks_per_row - 1) // masks_per_row

    # squeeze=False guarantees a 2-D array of Axes even for a 1x1 grid, where
    # plt.subplots would otherwise return a single Axes object.
    _, axes = plt.subplots(
        num_rows,
        masks_per_row,
        figsize=(masks_per_row * 3, num_rows * 3),
        squeeze=False,
    )

    for i, ax in enumerate(axes.ravel()):
        ax.axis("off")  # Hide axis decorations on used and unused cells alike
        if i < num_masks:
            ax.imshow(masks[i])
            ax.set_title(f"Mask {i+1}")

    plt.tight_layout()
    plt.show()


def extract_properties(masks: np.ndarray) -> pd.DataFrame:
    """Extracts properties of masks and computes additional ratio features.

    Every mask is measured as a single labelled region, so the result has
    exactly one row per input mask, in input order. A mask with no foreground
    pixels has no region to measure and gets a row of NaN, which keeps the
    rows aligned with the masks for callers that zip the two together.

    Args:
        masks: Boolean masks, indexed along the first axis.

    Returns:
        A DataFrame with one column per entry of `_PROPERTIES` plus
        "axis_ratio" (major axis length divided by minor axis length).
        It has zero rows when `masks` is empty.
    """
    dataframes = []

    for mask in masks:
        if not np.any(mask):
            # No foreground: regionprops_table would yield zero rows and shift
            # every following row out of step with its mask.
            dataframes.append(
                pd.DataFrame({name: [np.nan] for name in _PROPERTIES})
            )
            continue

        # Label image with a single region (label 1) covering the whole mask.
        binary_mask = np.where(mask, 1, 0)
        dataframes.append(
            pd.DataFrame(
                skimage.measure.regionprops_table(
                    binary_mask, properties=_PROPERTIES
                )
            )
        )

    if not dataframes:
        # pd.concat raises on an empty list; return an empty, typed frame.
        return pd.DataFrame(columns=[*_PROPERTIES, "axis_ratio"], dtype=float)

    features = pd.concat(dataframes, ignore_index=True)
    features["axis_ratio"] = (
        features["major_axis_length"] / features["minor_axis_length"]
    )
    return features

def _is_contained(mask1: np.ndarray, mask2: np.ndarray):
  """Check if mask1 is entirely contained within mask2.

  Args:
    mask1: The first mask.
    mask2: The second mask.

  Returns:
    True if mask1 is entirely contained within mask2, False otherwise.
  """
  return np.array_equal(np.logical_and(mask1, mask2), mask1)


def _calculate_iou(mask1: np.ndarray, mask2: np.ndarray) -> float:
  """Calculate the intersection over union (IoU) between two masks.

  Args:
    mask1: The first mask.
    mask2: The second mask.

  Returns:
    The intersection over union (IoU) between the two masks.
  """
  intersection = np.logical_and(mask1, mask2).sum()
  union = np.logical_or(mask1, mask2).sum()
  return intersection / union if union != 0 else 0


def filter_masks(masks: np.ndarray, iou_threshold: float = 0.8) -> np.ndarray:
  """Filter the overlapping masks.

  Masks are visited from largest to smallest area. A mask is dropped when,
  compared with any larger mask (whether or not that larger mask was itself
  kept), its IoU exceeds `iou_threshold` or it is entirely contained in it.

  Args:
    masks: The masks to filter, indexed along the first axis. A list of
      equally shaped masks is accepted as well.
    iou_threshold: The threshold for the intersection over union (IoU) between
      two masks.

  Returns:
    The surviving masks ordered by decreasing area. Empty input is returned
    unchanged, so an array of shape (0, H, W) keeps that shape.
  """
  masks = np.asarray(masks)
  if len(masks) == 0:
    return masks

  # Visit masks from the largest area to the smallest.
  areas = np.array([np.sum(mask) for mask in masks])
  sorted_indices = np.argsort(areas)[::-1]
  sorted_masks = masks[sorted_indices]

  kept_positions = []
  for i, mask in enumerate(sorted_masks):
    is_redundant = any(
        _calculate_iou(mask, larger) > iou_threshold
        or _is_contained(mask, larger)
        for larger in sorted_masks[:i]
    )
    if not is_redundant:
      kept_positions.append(i)

  # The largest mask is always kept, so the index list is never empty here.
  return sorted_masks[kept_positions]


def keep_largest_component(masks: np.ndarray) -> np.ndarray:
    """Keeps only the largest connected component in each binary mask.

    Components are found with 8-connectivity, and holes inside the retained
    component are filled. A mask with no foreground pixels has no component
    and is returned as an all-False mask of the same shape.

    Args:
        masks: Binary masks, indexed along the first axis.

    Returns:
        Boolean masks with only the largest component retained, one per input
        mask and in the same order.
    """
    largest_component_masks = []

    for mask in masks:
        foreground = np.asarray(mask).astype(bool)
        if not foreground.any():
            # Only the background label exists, so there is nothing to pick
            # and np.argmax over the empty stats slice would raise.
            largest_component_masks.append(np.zeros(foreground.shape, dtype=bool))
            continue

        # Find connected components
        _, labels, stats, _ = cv2.connectedComponentsWithStats(
            foreground.astype(np.uint8) * 255,
            connectivity=8
        )

        # Find the largest component, excluding the background (label 0)
        largest_label = 1 + np.argmax(stats[1:, cv2.CC_STAT_AREA])

        # Boolean mask of that component, with interior holes filled
        largest_component_mask = ndimage.binary_fill_holes(labels == largest_label)
        largest_component_masks.append(largest_component_mask)

    if not largest_component_masks:
        # Preserve the input's shape, e.g. (0, H, W), for empty input.
        return np.zeros(np.shape(masks), dtype=bool)
    return np.array(largest_component_masks)


def create_coco_annotation_for_single_image(
    binary_masks: np.ndarray,
    labels: list[str],
    image_name: str,
    image_height: int,
    image_width: int
    ):
    """Creates a COCO annotation JSON.

    Create an annotation dictionary for instance segmentation from binary
    masks and corresponding labels for a single image.

    Each mask becomes one annotation whose segmentation is the single external
    contour with the most points; area and bounding box are computed from the
    whole mask. A mask without any contour of at least three points is skipped,
    because it cannot be expressed as a polygon.

    Args:
      binary_masks: List of binary mask arrays corresponding to objects.
      labels: List of labels corresponding to each mask in the image. Labels
        may repeat; each distinct label becomes one category.
      image_name: Image name or path; only its base name is stored.
      image_height: Image height.
      image_width: Image width.

    Returns:
      COCO-style annotation JSON as a Python dictionary with the keys
      "images", "annotations" and "categories".

    Raises:
      ValueError: If the number of masks differs from the number of labels.
    """
    if len(binary_masks) != len(labels):
        raise ValueError(
            f"Got {len(binary_masks)} masks but {len(labels)} labels; "
            "each mask needs exactly one label."
        )

    # dict.fromkeys de-duplicates while keeping first-seen order, so category
    # ids are the same on every run (iteration order of a set of strings is not).
    label_to_id = {
        label: idx + 1 for idx, label in enumerate(dict.fromkeys(labels))
    }
    categories = [
        {"id": category_id, "name": label, "supercategory": "object"}
        for label, category_id in label_to_id.items()
    ]

    img_id = 1  # Single image, so a fixed image ID is sufficient
    images = [{
        "id": img_id,
        "width": image_width,
        "height": image_height,
        "file_name": os.path.basename(image_name)
    }]

    annotations = []
    for mask, label in zip(binary_masks, labels):
        # Any non-zero pixel is foreground; OpenCV expects an 8-bit image.
        mask_u8 = np.asarray(mask).astype(bool).astype(np.uint8)

        contours, _ = cv2.findContours(
            mask_u8,
            cv2.RETR_EXTERNAL,
            cv2.CHAIN_APPROX_SIMPLE
        )
        # A valid polygon needs at least 3 points (6 coordinates)
        polygons = [
            contour.flatten().tolist()
            for contour in contours
            if contour.size >= 6
        ]
        if not polygons:
            continue

        x, y, w, h = cv2.boundingRect(mask_u8)

        annotations.append({
            "id": len(annotations) + 1,  # Consecutive ids starting at 1
            "image_id": img_id,
            "category_id": label_to_id[label],
            "segmentation": [max(polygons, key=len)],  # Polygon format
            "area": int(mask_u8.sum()),
            "bbox": [x, y, w, h],
            "iscrowd": 0
        })

    return {
        "images": images,
        "annotations": annotations,
        "categories": categories
    }

## Download FastSAM weights

In [None]:
!mkdir weights
!wget -P weights -q https://huggingface.co/spaces/An-619/FastSAM/resolve/main/weights/FastSAM.pt
!ls -lh weights

## Load the model

In [None]:
# Checkpoint location, relative to the current working directory.
FAST_SAM_CHECKPOINT_PATH = "weights/FastSAM.pt"
# Build the FastSAM model from the checkpoint file.
fast_sam = FastSAM(FAST_SAM_CHECKPOINT_PATH)

## Inference

Fast SAM parameters:



*   `retina_masks=True` asks the model for high-resolution masks that match the original image size.
*   `imgsz=1024` sets the input image size to 1024x1024 pixels for processing by the model.
*   `conf=0.5` sets the minimum confidence threshold for object detection.
*   `iou=0.1` sets the intersection over union threshold for non-maximum suppression: a detection that overlaps a higher-scoring one by more than this is discarded as a duplicate.






In [None]:
# Import an image.
url = (
    "https://raw.githubusercontent.com/tensorflow/models/master/official/"
    "projects/waste_identification_ml/pre_processing/config/sample_images/"
    "sample_image_fastsam.jpeg"
)
!curl -O {url}

In [None]:
IMAGE_PATH = "sample_image_fastsam.jpeg"
# Use the GPU when one is available, but fall back to the CPU so the notebook
# also runs on CPU-only runtimes.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Run FastSAM on the image and collect a mask for every detected object.
results = fast_sam(
    source=IMAGE_PATH,
    device=DEVICE,
    retina_masks=True,
    imgsz=1024,
    conf=0.5,
    iou=0.1)
prompt_process = FastSAMPrompt(IMAGE_PATH, results, device=DEVICE)
masks = prompt_process.everything_prompt()

if len(masks) == 0:
  # Stop with a clear message: there is nothing to convert or post-process,
  # and the conversion below would fail on an empty result.
  raise RuntimeError(f"No masks detected in {IMAGE_PATH}")
masks = masks_to_bool(masks)
print(masks.shape)

In [None]:
# Show the masks returned by the model, before any post-processing.
plot_boolean_masks(masks)

## Postprocessing masks

Notice that Mask 5 and Mask 6 are false positives that need to be removed. We will use several techniques to get rid of such detections.

In [None]:
image = cv2.imread(IMAGE_PATH)
if image is None:
  # cv2.imread returns None instead of raising when the file is missing or
  # cannot be decoded.
  raise FileNotFoundError(f"Could not read image: {IMAGE_PATH}")
image_height, image_width = image.shape[:2]

# Keep only masks whose pixel area is strictly between 4000 pixels and 30% of
# the image area; smaller or larger masks are discarded.
HIGHER_THRESHOLD = 0.3 * image_height * image_width
LOWER_THRESHOLD = 4000
masks = np.array([mask for mask in masks if LOWER_THRESHOLD < np.sum(mask) < HIGHER_THRESHOLD])
print(masks.shape)

In [None]:
# Removes masks whose major to minor axis ratio is bigger than 5.
features = extract_properties(masks)

RATIO_THRESHOLD = 5
masks = np.array([mask for mask,ratio in zip(masks, features["axis_ratio"]) if ratio < RATIO_THRESHOLD])
print(masks.shape)

In [None]:
# Keep only the largest connected component of each mask.
# The result is assigned back to `masks` because the following cells read
# `masks`; storing it under another name would silently skip this step.
masks = keep_largest_component(masks)
print(masks.shape)

In [None]:
# Remove overlapped smaller masks and keep the biggest one using IoU.
masks = filter_masks(masks)
print(masks.shape)

In [None]:
# Show the masks that remain after post-processing.
plot_boolean_masks(masks)

## Create COCO JSON annotation file

In [None]:
# Get the class name of each corresponding mask.
labels = ['non-bottle']*len(masks)
labels

In [None]:
# Create a COCO JSON format file.
coco_json = create_coco_annotation_for_single_image(
    masks,
    labels,
    os.path.basename(IMAGE_PATH),
    image_height,
    image_width
)

In [None]:
# Top-level sections of the annotation dictionary.
coco_json.keys()

In [None]:
# Image entry: id, width, height and file name.
coco_json['images']

In [None]:
# Inspect the fields of the first annotation. Index 0 is used so the cell
# also works when only a single mask survived post-processing.
coco_json['annotations'][0].keys()

In [None]:
# Print one summary line per annotation. Fields are separated by ten spaces,
# written explicitly here rather than hidden in line-continuation indentation.
FIELD_SEPARATOR = " " * 10
for annotation in coco_json['annotations']:
  print(FIELD_SEPARATOR.join(
      f"{key}:{annotation[key]}"
      for key in ("id", "image_id", "category_id", "area", "bbox")
  ))