In [None]:
# Copyright (c) Meta Platforms, Inc. and affiliates.

# Segment Anything Video (SA-V) Dataset

This notebook presents an example showing how to load and visualize the SA-V annotations.
To run this notebook, you will need to have the following installed:
- jupyter
- matplotlib
- opencv-python
- numpy
- pycocoevalcap
- pandas

## Import libraries

In [None]:
import json

import pandas as pd
from utils.sav_utils import SAVDataset

## Example video and annotations from SA-V

In [None]:
# Open the example directory and load one video together with its manual
# and automatic annotation records.
EXAMPLE_VIDEO_ID = "sav_000001"

sav_dataset = SAVDataset(sav_dir="example/")
frames, manual_annot, auto_annot = sav_dataset.get_frames_and_annotations(
    EXAMPLE_VIDEO_ID
)

### Show the SA-V annotations in frame 0 - auto + manual

In [None]:
# Annotated frame 0; show_auto / show_manual are left at their defaults.
sav_dataset.visualize_annotation(
    frames,
    manual_annot,
    auto_annot,
    annotated_frame_id=0,
)

### Show the SA-V annotations in another frame - auto + manual

In [None]:
# Same call as for frame 0, but for annotated frame 30;
# show_auto / show_manual are left at their defaults.
sav_dataset.visualize_annotation(
    frames,
    manual_annot,
    auto_annot,
    annotated_frame_id=30,
)

### Show the SA-V annotations in frame 0 - manual only

In [None]:
# Annotated frame 0 with the automatic annotations switched off.
sav_dataset.visualize_annotation(
    frames,
    manual_annot,
    auto_annot,
    annotated_frame_id=0,
    show_auto=False,
)

### Show the SA-V annotations in frame 0 - auto only

In [None]:
# Annotated frame 0 with the manual annotations switched off.
sav_dataset.visualize_annotation(
    frames,
    manual_annot,
    auto_annot,
    annotated_frame_id=0,
    show_manual=False,
)

### Masklet annotations and Metadata

#### Manual annotations and metadata

In [None]:
# Wrap the manual annotation record in a one-element list so it renders as a
# single-row table (one column per top-level key).
pd.DataFrame(data=[manual_annot])

#### Auto annotations and metadata

In [None]:
# Wrap the auto annotation record in a one-element list so it renders as a
# single-row table (one column per top-level key).
pd.DataFrame(data=[auto_annot])

#### Video info

In [None]:
# Read the video-level metadata fields from the manual annotation record.
video_id, video_duration, video_frame_count = (
    manual_annot["video_id"],
    manual_annot["video_duration"],
    manual_annot["video_frame_count"],
)
H, W = manual_annot["video_height"], manual_annot["video_width"]
environment = manual_annot["video_environment"]

# Note: the resolution is reported as height x width (H first, then W).
summary_msg = f"{video_id} is {video_duration} seconds long with {video_frame_count} frames. The video resolution is {H} x {W}."
print(summary_msg)

environment_msg = f"This video is captured in {environment} environment."
print(environment_msg)

#### Masklet info

In [None]:
# Report how many masklets each annotation source contributes.
masklet_count_msg = f"There are {manual_annot['masklet_num']} manually labeled masklets and {auto_annot['masklet_num']} automatically generated masklets."
print(masklet_count_msg)

# The annotated-frame count is read from the first entry of the manual
# record's `masklet_frame_count` list.
annotated_frames_msg = f"In SA-V, videos are annotated every 4 frames. Therefore, there are {manual_annot['masklet_frame_count'][0]} frames being annotated."
print(annotated_frames_msg)

In [None]:
# Look up the RLE of the manual masklet with masklet_id=0 in annotated frame 0.
# Indexing is frame first, then masklet.
masklet_id = 0
annotated_frame_id = 0
manual_frame_rles = manual_annot["masklet"][annotated_frame_id]
manual_frame_rles[masklet_id]
# Decode the rle using `mask_util.decode(rle)>0` to get the binary segmentation mask.
# NOTE(review): `mask_util` is not imported in this notebook — presumably
# `pycocotools.mask`; confirm before running the decode step.

In [None]:
# Look up the RLE of the auto masklet with masklet_id=5 in annotated frame 100.
# Indexing is frame first, then masklet.
# NOTE(review): the index is into the list of annotated frames; with annotation
# every 4 frames this likely corresponds to video frame 400 — confirm.
masklet_id = 5
annotated_frame_id = 100
auto_frame_rles = auto_annot["masklet"][annotated_frame_id]
auto_frame_rles[masklet_id]
# Decode the rle using `mask_util.decode(rle)>0` to get the binary segmentation mask.
# NOTE(review): `mask_util` is not imported in this notebook — presumably
# `pycocotools.mask`; confirm before running the decode step.