In [None]:
# Get polygon and Visualize the mask for SAM2
import json
import matplotlib.pyplot as plt
import numpy as np

# Load the file
file_path = 'mask_data_box(1).json'  # Replace with your actual file path
with open(file_path, 'r') as f:
    data = json.load(f)

for frame in data:
  # Extract polygons from the data
  polygons = data[frame][0]['polygons']
  # print(polygons)

  # Adjust the coordinates to make the mask horizontal
  fig, ax = plt.subplots(figsize=(8.54, 4.8))  # Adjust figure size to match the image dimensions

  for polygon in polygons:
      polygon = np.array(polygon)
      # Swap the x and y values to make the mask horizontal
      polygon = polygon[:, [1, 0]]
      ax.fill(polygon[:, 0], polygon[:, 1], edgecolor='black', fill="Green")  # Fill with no color and black edges

  ax.set_aspect('equal', adjustable='box')
  ax.set_xlim(0, 854)  # Set x-axis limit
  ax.set_ylim(0, 480)  # Set y-axis limit
  plt.title('Polygons Visualization')
  plt.gca().invert_yaxis()  # Invert the y-axis to match the image coordinate system
  plt.show()


For merging 


In [None]:
import pandas as pd

# Load the CSV files
hand_masks_df = pd.read_csv('/home/totargaming/workspace/sam2/aria-handtracking/hand_masks_bboxes_dot.csv')
final_df = pd.read_csv('/home/totargaming/workspace/sam2/aria-handtracking/final.csv')

# Rename columns for clarity
hand_masks_df.rename(columns={'point_x': 'hand_point_x', 'point_y': 'hand_point_y'}, inplace=True)
final_df.rename(columns={'point_x': 'obj_point_x', 'point_y': 'obj_point_y'}, inplace=True)

# Calculate bottom right coordinates for hand_masks_df
hand_masks_df['hand_box_x2'] = hand_masks_df['box_x1'] + hand_masks_df['width']
hand_masks_df['hand_box_y2'] = hand_masks_df['box_y1'] + hand_masks_df['height']

# Calculate bottom right coordinates for final_df
final_df['obj_box_x2'] = final_df['box_x1'] + final_df['width']
final_df['obj_box_y2'] = final_df['box_y1'] + final_df['height']

# Merge the dataframes on folder_name
merged_df = pd.merge(final_df, hand_masks_df, on='folder_name')

# Calculate the combined bounding box coordinates
merged_df['box_x1'] = merged_df[['box_x1_x', 'box_x1_y']].min(axis=1)
merged_df['box_y1'] = merged_df[['box_y1_x', 'box_y1_y']].min(axis=1)
merged_df['box_x2'] = merged_df[['obj_box_x2', 'hand_box_x2']].max(axis=1)
merged_df['box_y2'] = merged_df[['obj_box_y2', 'hand_box_y2']].max(axis=1)

# Select and rename the required columns
output_df = merged_df[['folder_name', 'hand_point_x', 'hand_point_y', 'obj_point_x', 'obj_point_y', 'box_x1', 'box_y1', 'box_x2', 'box_y2', 'frame']]
output_df.rename(columns={'box_x2': 'box_x2', 'box_y2': 'box_y2'}, inplace=True)

# Save the result to a new CSV file
output_df.to_csv('/home/totargaming/workspace/sam2/aria-handtracking/merged_output.csv', index=False)

In [None]:
# Get polygon and visualize a particular video for GT mask
import json
import os
import numpy as np
import cv2
import matplotlib.pyplot as plt
from pycocotools import mask as mask_utils

# Load the JSON file
json_path = "./HOIST/valid.json"
with open(json_path, "r") as f:
    data = json.load(f)

# Create mappings
video_map = {video["id"]: video["file_names"] for video in data["videos"]}
video_dim_map = {video["id"]: (video["height"], video["width"]) for video in data["videos"]}

# Output directory
output_folder = "output_masks"
os.makedirs(output_folder, exist_ok=True)

# Iterate through all videos
for video_id, frame_files in video_map.items():
  if video_id == 50:
    frame_height, frame_width = video_dim_map.get(video_id, (720, 1280))  # Default dimensions if not found

    # Find all annotations related to this video
    video_annotations = [ann for ann in data["annotations"] if ann["video_id"] == video_id]

    # Iterate through each frame
    for frame_idx, frame_name in enumerate(frame_files):
        # Create a blank image
        img = np.zeros((frame_height, frame_width, 3), dtype=np.uint8)

        # Apply segmentation masks for this frame
        for annotation in video_annotations:
            segmentation = annotation["segmentations"]

            if isinstance(segmentation, list) and len(segmentation) > frame_idx:
                seg = segmentation[frame_idx]  # Get segmentation for the current frame
                if isinstance(seg, dict) and "counts" in seg:
                    mask = mask_utils.decode(mask_utils.frPyObjects(seg, frame_height, frame_width))
                    img[mask > 0] = (0, 255, 0)  # Overlay mask in green

        # Display the masked frame
        plt.figure(figsize=(10, 5))
        plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        plt.title(f"Video {video_id} | Frame {frame_idx}")
        plt.axis("off")
        plt.show()

        # Save the masked frame
        output_path = os.path.join(output_folder, frame_name.replace("/", "_"))
        cv2.imwrite(output_path, img)

print(f"All frames processed and saved in '{output_folder}'")


In [None]:
# Compute AP and AP 50 (GT vs SAM2)
import json
import numpy as np
import cv2
import matplotlib.pyplot as plt
from pycocotools import mask as mask_utils
from sklearn.metrics import average_precision_score
import os, sys

# Load the JSON file for GT data
json_path = "valid.json"  # Replace with your actual path
with open(json_path, "r") as f:
    data = json.load(f)

def compute_ap(gt_masks, pred_masks, iou_threshold=0.5):
    # Compute Average Precision (AP) using sklearn
    ap = average_precision_score(gt_masks.flatten() > 0, pred_masks.flatten() > 0)
    return ap

def compute_iou(gt_mask, pred_mask):
    intersection = np.logical_and(gt_mask, pred_mask)
    union = np.logical_or(gt_mask, pred_mask)

    # Handle division by zero if union is zero
    if np.sum(union) == 0:
        return 0.0  # No overlap, return IoU as 0

    return np.sum(intersection) / np.sum(union)

def compute_ap50(gt_masks, pred_masks, iou_threshold=0.5):
    ground_truth = []
    predictions = []

    for gt_mask, pred_mask in zip(gt_masks, pred_masks):
        # Compute IoU for each pair of ground truth and predicted masks
        iou = compute_iou(gt_mask, pred_mask)

        # If IoU is greater than or equal to the threshold, it's a true positive
        if iou >= iou_threshold:
            ground_truth.append(1)  # True positive
        else:
            ground_truth.append(0)  # False positive

        predictions.append(iou)  # Use IoU as the predicted confidence score

    # Compute Average Precision (AP) using sklearn
    ap = average_precision_score(ground_truth, predictions)
    return ap

input= "input_dot"
path = f'/content/{input}'
dirs = os.listdir( path )
ap_list=[]
ap50_list=[]
from_0_25_list=[]
from_25_50_list=[]
from_50_75_list=[]
from_75_100_list=[]

ap50_from_0_25_list=[]
ap50_from_25_50_list=[]
ap50_from_50_75_list=[]
ap50_from_75_100_list=[]

ap50_70_to_80=[]
ap50_80_to_90=[]

# Print all the files and directories
for file in dirs:
  file_name=file.split('.')[0]
  print(file_name)
  sam2_file_path = f'./{input}/{file_name}.json'  # Replace with your actual SAM2 output file path
  if file_name!= "":
    with open(sam2_file_path, 'r') as f:
      sam2_data = json.load(f)

    # Search for the video ID based on the file name
    video_id = None
    frame_height=None
    frame_width=None
    for video in data['videos']:
        if any(file_name in fname for fname in video['file_names']):
            video_id = video['id']
            frame_height = video['height']
            frame_width = video['width']

            break

    # print("running file: ", file_name)
    # print("id is: ", video_id)
    # Iterate through video 50 frames
    video_annotations = [ann for ann in data["annotations"] if ann["video_id"] == video_id]

    # print("frame_files: ", frame_files)
    # print("height is: ", frame_height)
    # print("width is: ", frame_width)

    # Initialize lists for GT and predicted masks
    gt_masks = []
    pred_masks = []
    ap=[]
    ap50=[]
    # Iterate through each frame
    for frame_idx, frame_name in enumerate(frame_files):
        # Create a blank image for GT mask
        gt_img = np.zeros((frame_height, frame_width), dtype=np.uint8)

        # Apply segmentation masks for this frame from GT data
        for annotation in video_annotations:
            segmentation = annotation["segmentations"]

            if isinstance(segmentation, list) and len(segmentation) > frame_idx:
                seg = segmentation[frame_idx]  # Get segmentation for the current frame
                if isinstance(seg, dict) and "counts" in seg:
                    mask = mask_utils.decode(mask_utils.frPyObjects(seg, frame_height, frame_width))
                    gt_img[mask > 0] = 1  # Set ground truth pixels to 1

        # Append the GT mask to the list
        gt_masks.append(gt_img)

        # Get the corresponding polygons from SAM2 output
        sam2_polygons = sam2_data[str(frame_idx)][0]['polygons']

        # Create a blank image for predicted mask from SAM2 output
        pred_img = np.zeros((frame_height, frame_width), dtype=np.uint8)

        # Draw the polygons for SAM2 output on the pred_img
        for polygon in sam2_polygons:
            polygon = np.array(polygon, dtype=np.int32)  # Ensure coordinates are of integer type (CV_32S)
            polygon = polygon[:, [1, 0]]  # Swap to (x, y) format
            cv2.fillPoly(pred_img, [polygon], 1)  # Fill the polygon region with 1

        # Append the predicted mask to the list
        pred_masks.append(pred_img)

        # # Plot GT and predicted masks side by side
        # plt.figure(figsize=(10, 5))

        # # Ground Truth mask
        # plt.subplot(1, 2, 1)
        # plt.imshow(gt_img, cmap='gray')
        # plt.title(f'Ground Truth - Frame {frame_idx}')
        # plt.axis('off')

        # # SAM2 predicted mask
        # plt.subplot(1, 2, 2)
        # plt.imshow(pred_img, cmap='gray')
        # plt.title(f'SAM2 Prediction - Frame {frame_idx}')
        # plt.axis('off')

        # plt.show()
        ap.append(compute_ap(gt_img, pred_img))
        ap50.append(compute_ap50(gt_img, pred_img))

    # Compute the Average Precision (AP) for this video
    # print("ap list: ", ap)
    # print("ap50 list: ", ap50)
    ap = np.mean(ap)
    ap_list.append(ap)
    ap50 = np.mean(ap50)
    ap50_list.append(ap50)
    if ap > 0 and ap < 0.25 :
      from_0_25_list.append(file_name)
    elif ap > 0.25 and ap < 0.5:
      from_25_50_list.append(file_name)
    elif ap > 0.5 and ap < 0.75:
      from_50_75_list.append(file_name)
    elif ap > 0.75:
      from_75_100_list.append(file_name)

    if ap50 > 0 and ap50 < 0.25 :
      ap50_from_0_25_list.append(file_name)
    elif ap50 > 0.25 and ap50 < 0.5:
      ap50_from_25_50_list.append(file_name)
    elif ap50 > 0.5 and ap50 < 0.75:
      ap50_from_50_75_list.append(file_name)
    elif ap50 > 0.75:
      ap50_from_75_100_list.append(file_name)

    if ap50 >= 0.70 and ap50 < 0.80:
      ap50_70_to_80.append(file_name)
    elif ap50 >= 0.80 and ap50 < 0.90:
      ap50_80_to_90.append(file_name)
    # print(f"Average Precision for Video {video_id}: {ap}")
    # print(f"Average Precision 50 for Video {video_id}: {ap50}")

plt.hist(ap_list, alpha=0.5, label='ap')
plt.hist(ap50_list, alpha=0.5, label='ap50')
plt.legend(loc='upper right')
plt.show()
print("AP from 0 to 25: ", from_0_25_list)
print("AP from 25 to 50: ", from_25_50_list)
print("AP from 50 to 75: ", from_50_75_list)
print("AP from 75 to 100: ", from_75_100_list)
print("Mean AP of 40 videos: ", np.mean(ap_list))

print('\n')
print("AP50 from 0 to 25: ", ap50_from_0_25_list)
print("AP50 from 25 to 50: ", ap50_from_25_50_list)
print("AP50 from 50 to 75: ", ap50_from_50_75_list)
print("AP50 from 75 to 100: ", ap50_from_75_100_list)
print("Mean AP50 of 40 videos: ", np.mean(ap50_list))

print('\n')
print('ap50 in range 70 to 80: ', ap50_70_to_80)
print('ap50 in range 80 to 90: ', ap50_80_to_90)

In [None]:
# Compute AP and AP 50 (GT vs SAM2) of 1 folder
import json
import numpy as np
import cv2
import matplotlib.pyplot as plt
from pycocotools import mask as mask_utils
from sklearn.metrics import average_precision_score
import os, sys

# Load the JSON file for GT data
json_path = "valid.json"  # Replace with your actual path
with open(json_path, "r") as f:
    data = json.load(f)

def compute_ap(gt_masks, pred_masks, iou_threshold=0.5):
    # Compute Average Precision (AP) using sklearn
    ap = average_precision_score(gt_masks.flatten() > 0, pred_masks.flatten() > 0)
    return ap

def compute_iou(gt_mask, pred_mask):
    intersection = np.logical_and(gt_mask, pred_mask)
    union = np.logical_or(gt_mask, pred_mask)

    # Handle division by zero if union is zero
    if np.sum(union) == 0:
        return 0.0  # No overlap, return IoU as 0

    return np.sum(intersection) / np.sum(union)

def compute_ap50(gt_masks, pred_masks, iou_threshold=0.5):
    ground_truth = []
    predictions = []

    for gt_mask, pred_mask in zip(gt_masks, pred_masks):
        # Compute IoU for each pair of ground truth and predicted masks
        iou = compute_iou(gt_mask, pred_mask)

        # If IoU is greater than or equal to the threshold, it's a true positive
        if iou >= iou_threshold:
            ground_truth.append(1)  # True positive
        else:
            ground_truth.append(0)  # False positive

        predictions.append(iou)  # Use IoU as the predicted confidence score

    # Compute Average Precision (AP) using sklearn
    ap = average_precision_score(ground_truth, predictions)
    return ap

# Print all the files and directories

sam2_file_path = './input_box/yBun7uGa6r8_65.json'
if file_name!= "":
  with open(sam2_file_path, 'r') as f:
    sam2_data = json.load(f)

  # Search for the video ID based on the file name
  video_id = None
  frame_height=None
  frame_width=None
  for video in data['videos']:
      if any(file_name in fname for fname in video['file_names']):
          video_id = video['id']
          frame_height = video['height']
          frame_width = video['width']
          break

  video_annotations = [ann for ann in data["annotations"] if ann["video_id"] == video_id]

  # Initialize lists for GT and predicted masks
  gt_masks = []
  pred_masks = []
  ap=[]
  ap50=[]
  # Iterate through each frame
  for frame_idx, frame_name in enumerate(frame_files):
      # Create a blank image for GT mask
      gt_img = np.zeros((frame_height, frame_width), dtype=np.uint8)

      # Apply segmentation masks for this frame from GT data
      for annotation in video_annotations:
          segmentation = annotation["segmentations"]

          if isinstance(segmentation, list) and len(segmentation) > frame_idx:
              seg = segmentation[frame_idx]  # Get segmentation for the current frame
              if isinstance(seg, dict) and "counts" in seg:
                  mask = mask_utils.decode(mask_utils.frPyObjects(seg, frame_height, frame_width))
                  gt_img[mask > 0] = 1  # Set ground truth pixels to 1

      # Append the GT mask to the list
      gt_masks.append(gt_img)

      # Get the corresponding polygons from SAM2 output
      sam2_polygons = sam2_data[str(frame_idx)][0]['polygons']

      # Create a blank image for predicted mask from SAM2 output
      pred_img = np.zeros((frame_height, frame_width), dtype=np.uint8)

      # Draw the polygons for SAM2 output on the pred_img
      for polygon in sam2_polygons:
          polygon = np.array(polygon, dtype=np.int32)  # Ensure coordinates are of integer type (CV_32S)
          polygon = polygon[:, [1, 0]]  # Swap to (x, y) format
          cv2.fillPoly(pred_img, [polygon], 1)  # Fill the polygon region with 1

      # Append the predicted mask to the list
      pred_masks.append(pred_img)

      # Plot GT and predicted masks side by side
      plt.figure(figsize=(10, 5))

      # Ground Truth mask
      plt.subplot(1, 2, 1)
      plt.imshow(gt_img, cmap='gray')
      plt.title(f'Ground Truth - Frame {frame_idx}')
      plt.axis('off')

      # SAM2 predicted mask
      plt.subplot(1, 2, 2)
      plt.imshow(pred_img, cmap='gray')
      plt.title(f'SAM2 Prediction - Frame {frame_idx}')
      plt.axis('off')

      # plt.show()
      ap.append(compute_ap(gt_img, pred_img))
      ap50.append(compute_ap50(gt_img, pred_img))

  # Compute the Average Precision (AP) for this video
  print("ap list: ", ap)
  print("ap50 list: ", ap50)
  ap = np.mean(ap)
  print("ap is: ", ap)
  ap50 = np.mean(ap50)
  print("ap50 is: ", ap50)




In [None]:
import os, sys

# Open a file
path = "/content/input"
dirs = os.listdir( path )

# Print all the files and directories
for file in dirs:
   print(file.split('.')[0])



In [None]:
from __future__ import annotations
import pandas as pd
import json

# Path to your CSV file
csv_file_path = 'bbox.csv'
json_file_path = './HOIST/valid.json'

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(csv_file_path)

with open(json_file_path, "r") as f:
    data = json.load(f)

# Initialize lists to store the new data
box_x1 = []
box_y1 = []
box_x2 = []
box_y2 = []
frame_numbers = []

for file_name in df['folder_name']:
    video_id = None
    for video in data['videos']:
        if any(file_name in fname for fname in video['file_names']):
            video_id = video['id']
            break

    if video_id is not None:
        for annotation in data['annotations']:
            if annotation['video_id'] == video_id:
                for frame_number, bbox in enumerate(annotation['bboxes']):
                    if bbox is not None:
                        box_x1.append(bbox[0])
                        box_y1.append(bbox[1])
                        box_x2.append(bbox[2])
                        box_y2.append(bbox[3])
                        frame_numbers.append(frame_number)
                        break
                else:
                    box_x1.append(None)
                    box_y1.append(None)
                    box_x2.append(None)
                    box_y2.append(None)
                    frame_numbers.append(None)
                break
    else:
        box_x1.append(None)
        box_y1.append(None)
        box_x2.append(None)
        box_y2.append(None)
        frame_numbers.append(None)

# Add the new data to the DataFrame
df['box_x1'] = box_x1
df['box_y1'] = box_y1
df['box_x2'] = box_x2
df['box_y2'] = box_y2
df['frame'] = frame_numbers

# Save the updated DataFrame back to the CSV file
df.to_csv(csv_file_path, index=False)

print("CSV file updated successfully.")

In [None]:
import pandas as pd

# File paths
polygon_centers_path = '/home/totargaming/workspace/sam2/aria-handtracking/polygon_centers.csv'
bbox_path = '/home/totargaming/workspace/sam2/aria-handtracking/bbox.csv'
output_path = '/home/totargaming/workspace/sam2/aria-handtracking/merged_bbox.csv'

# Read the CSV files into DataFrames
polygon_centers_df = pd.read_csv(polygon_centers_path)
bbox_df = pd.read_csv(bbox_path)

# Merge the DataFrames on the 'folder_name' column
merged_df = pd.merge(bbox_df, polygon_centers_df, on='folder_name', how='left')

# Fill in the 'point_x' and 'point_y' columns in bbox_df with the values from polygon_centers_df
merged_df['point_x'] = merged_df['point_x_y']
merged_df['point_y'] = merged_df['point_y_y']

# Drop the redundant columns
merged_df.drop(columns=['point_x_y', 'point_y_y'], inplace=True)

# Remove duplicates based on 'folder_name' and keep the first occurrence
merged_df = merged_df.drop_duplicates(subset='folder_name', keep='first')

# Write the updated DataFrame to a new CSV file
merged_df.to_csv(output_path, index=False)

print(f"Merged CSV saved to {output_path}")

In [None]:
from __future__ import annotations
import pandas as pd
import json
import numpy as np
from pycocotools import mask as mask_utils

# Path to your CSV file
csv_file_path = 'bbox2.csv'
json_file_path = './HOIST/valid.json'

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(csv_file_path)

with open(json_file_path, "r") as f:
    data = json.load(f)

# Initialize lists to store the new data
box_x1 = []
box_y1 = []
box_x2 = []
box_y2 = []
frame_numbers = []
x_centers = []
y_centers = []

for file_name in df['folder_name']:
    video_id = None
    for video in data['videos']:
        if any(file_name in fname for fname in video['file_names']):
            video_id = video['id']
            break

    if video_id is not None:
        frame_height, frame_width = video_dim_map.get(video_id, (720, 1280))  # Default dimensions
        for annotation in data['annotations']:
            if annotation['video_id'] == video_id:
                for frame_number, bbox in enumerate(annotation['bboxes']):
                    if bbox is not None:
                        box_x1_val, box_y1_val, box_x2_val, box_y2_val = bbox
                        box_x1.append(box_x1_val)
                        box_y1.append(box_y1_val)
                        box_x2.append(box_x2_val)
                        box_y2.append(box_y2_val)
                        frame_numbers.append(frame_number)

                        # Find the corresponding segmentation for this frame
                        segmentations = annotation['segmentations']
                        if isinstance(segmentations, list) and len(segmentations) > frame_number:
                            seg = segmentations[frame_number]
                            if isinstance(seg, dict) and 'counts' in seg:
                                mask = mask_utils.decode(mask_utils.frPyObjects(seg, frame_height, frame_width))

                                # Get the coordinates of the mask pixels
                                y_coords, x_coords = np.where(mask > 0)

                                if len(x_coords) > 0 and len(y_coords) > 0:
                                    # Compute the center as the mean of coordinates
                                    center_x = int(np.mean(x_coords))
                                    center_y = int(np.mean(y_coords))

                                    x_centers.append(center_x)
                                    y_centers.append(center_y)
                                else:
                                    x_centers.append(None)
                                    y_centers.append(None)
                            else:
                                x_centers.append(None)
                                y_centers.append(None)
                        else:
                            x_centers.append(None)
                            y_centers.append(None)
                        break
                else:
                    box_x1.append(None)
                    box_y1.append(None)
                    box_x2.append(None)
                    box_y2.append(None)
                    frame_numbers.append(None)
                    x_centers.append(None)
                    y_centers.append(None)
                break
    else:
        box_x1.append(None)
        box_y1.append(None)
        box_x2.append(None)
        box_y2.append(None)
        frame_numbers.append(None)
        x_centers.append(None)
        y_centers.append(None)

# Add the new data to the DataFrame
df['box_x1'] = box_x1
df['box_y1'] = box_y1
df['width'] = box_x2
df['height'] = box_y2
df['frame'] = frame_numbers
df['point_x'] = x_centers
df['point_y'] = y_centers

# Save the updated DataFrame back to the CSV file
df.to_csv(csv_file_path, index=False)

print("CSV file updated successfully.")

In [7]:
import os
import numpy as np
import torch
import matplotlib.pyplot as plt
from PIL import Image
import cv2
from sam2.build_sam import build_sam2_video_predictor
import gc
import time
import csv
import json
from skimage.measure import find_contours
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas

# Set device for computation
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
print(f"using device: {device}")

if device.type == "cuda":
    torch.autocast("cuda", dtype=torch.bfloat16).__enter__()
    if torch.cuda.get_device_properties(0).major >= 8:
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
elif device.type == "mps":
    print("\nSupport for MPS devices is preliminary. SAM 2 is trained with CUDA and might give numerically different outputs and sometimes degraded performance on MPS. See e.g. https://github.com/pytorch/pytorch/issues/84936 for a discussion.")

# Initialize predictor
sam2_checkpoint = "../checkpoints/sam2.1_hiera_large.pt"
model_cfg = "configs/sam2.1/sam2.1_hiera_l.yaml"
predictor = build_sam2_video_predictor(model_cfg, sam2_checkpoint, device=device)

def show_mask(mask, ax, obj_id=None, random_color=False):
    color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0) if random_color else np.array([*plt.get_cmap("tab10")(0 if obj_id is None else obj_id)[:3], 0.6])
    ax.imshow(mask.reshape(*mask.shape[-2:], 1) * color.reshape(1, 1, -1))

# Load coordinates from CSV file
coordinates = []
with open('ver.csv', 'r') as f:
    reader = csv.DictReader(f)
    for row in reader:
        coordinates.append({
            'folder_name': row['folder_name'],
            'point': [float(row['hand_point_x']), float(row['hand_point_y'])],
            'box': [float(row['box_x1']), float(row['box_y1']), float(row['box_x2']), float(row['box_y2'])],
            'frame': int(row['frame'])
        })

for coord in coordinates:
    folder_name = coord['folder_name']
    point = np.array(coord['point'], dtype=np.float32)
    box = np.array(coord['box'], dtype=np.float32)
    frame_index = coord['frame']

    frame_dir = f"./HOIST/valid/JPEGImages/{folder_name}"
    output_dir_box = f"./TEST/{folder_name}_output_box"
    output_dir_dot = f"./TEST/{folder_name}_output_dot"
    os.makedirs(output_dir_box, exist_ok=True)
    os.makedirs(output_dir_dot, exist_ok=True)

    frame_names = sorted([p for p in os.listdir(frame_dir) if os.path.splitext(p)[-1].lower() in [".jpg", ".jpeg"]], key=lambda p: int(os.path.splitext(p)[0].split('_')[-1]))

    # Box prompt
    inference_state = predictor.init_state(video_path=frame_dir)
    predictor.reset_state(inference_state)
    ann_frame_idx = frame_index  # the frame index we interact with
    ann_obj_id = 1  # give a unique id to each object we interact with (it can be any integers)
    _, out_obj_ids, out_mask_logits = predictor.add_new_points_or_box(inference_state, frame_idx=ann_frame_idx, obj_id=ann_obj_id, box=box)

    video_segments_box = {out_frame_idx: {out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy() for i, out_obj_id in enumerate(out_obj_ids)} for out_frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(inference_state)}

    # Dot prompt
    inference_state = predictor.init_state(video_path=frame_dir)
    predictor.reset_state(inference_state)
    points, labels = point.reshape(1, -1), np.array([1], np.int32)
    _, out_obj_ids, out_mask_logits = predictor.add_new_points_or_box(inference_state, frame_idx=frame_index, obj_id=1, points=points, labels=labels)

    video_segments_dot = {out_frame_idx: {out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy() for i, out_obj_id in enumerate(out_obj_ids)} for out_frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(inference_state)}

    # Start timing
    start_time = time.time()

    mask_data_box = {}
    mask_data_dot = {}

    def mask_to_polygon(mask):
        # Convert mask to binary
        mask_binary = mask.astype(bool)
        print(f"mask shape: {mask.shape}, mask_binary shape: {mask_binary.shape}")
        print(f"mask_binary unique values: {np.unique(mask_binary)}")
        
        # Check if mask is empty
        if mask_binary.size == 0 or np.all(mask_binary == 0):
            return []
        
        # Ensure mask is 2D
        if mask_binary.ndim > 2:
            mask_binary = mask_binary[0]
        
        # Find contours
        contours = find_contours(mask_binary, level=0.5)
        # Convert contours to polygons
        polygons = [contour.tolist() for contour in contours]
        return polygons

    for frame_idx in range(len(frame_names)):
        frame_masks_box = []
        frame_masks_dot = []
        frame = Image.open(os.path.join(frame_dir, frame_names[frame_idx]))
        
        # Box prompt
        for obj_id, mask in video_segments_box.get(frame_idx, {}).items():
            polygons = mask_to_polygon(mask)
            frame_masks_box.append({
                "obj_id": obj_id,
                "polygons": polygons
            })
        mask_data_box[frame_idx] = frame_masks_box
        
        # Dot prompt
        for obj_id, mask in video_segments_dot.get(frame_idx, {}).items():
            polygons = mask_to_polygon(mask)
            frame_masks_dot.append({
                "obj_id": obj_id,
                "polygons": polygons
            })
        mask_data_dot[frame_idx] = frame_masks_dot
        
        # Save images
        fig, ax = plt.subplots(figsize=(frame.width / 100, frame.height / 100), dpi=100)
        ax.axis('off')
        ax.imshow(frame)
        
        # Box prompt mask
        for obj_id, mask in video_segments_box.get(frame_idx, {}).items():
            show_mask(mask, ax, obj_id=obj_id)
        canvas = FigureCanvas(fig)
        canvas.draw()
        img = np.frombuffer(canvas.tostring_argb(), dtype='uint8').reshape(fig.canvas.get_width_height()[::-1] + (4,))
        img = img[:, :, [1, 2, 3]]  # Convert ARGB to RGB
        output_frame_path = os.path.join(output_dir_box, f"{frame_idx:05d}.jpg")
        cv2.imwrite(output_frame_path, cv2.cvtColor(img, cv2.COLOR_RGB2BGR))
        plt.close(fig)
        
        # Dot prompt mask
        fig, ax = plt.subplots(figsize=(frame.width / 100, frame.height / 100), dpi=100)
        ax.axis('off')
        ax.imshow(frame)
        for obj_id, mask in video_segments_dot.get(frame_idx, {}).items():
            show_mask(mask, ax, obj_id=obj_id)
        canvas = FigureCanvas(fig)
        canvas.draw()
        img = np.frombuffer(canvas.tostring_argb(), dtype='uint8').reshape(fig.canvas.get_width_height()[::-1] + (4,))
        img = img[:, :, [1, 2, 3]]  # Convert ARGB to RGB
        output_frame_path = os.path.join(output_dir_dot, f"{frame_idx:05d}.jpg")
        cv2.imwrite(output_frame_path, cv2.cvtColor(img, cv2.COLOR_RGB2BGR))
        plt.close(fig)
        
        del frame, fig, ax, canvas, img  # Free up memory
        gc.collect()  # Force garbage collection
        print(f"Processed frame {frame_idx + 1}/{len(frame_names)}")

    # Save mask data to JSON
    with open(os.path.join(output_dir_box, "mask_data_box.json"), "w") as json_file:
        json.dump(mask_data_box, json_file, indent=4)

    with open(os.path.join(output_dir_dot, "mask_data_dot.json"), "w") as json_file:
        json.dump(mask_data_dot, json_file, indent=4)

    # End timing
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Total time taken: {elapsed_time:.2f} seconds")

using device: cuda


frame loading (JPEG): 100%|██████████| 18/18 [00:00<00:00, 52.25it/s]
propagate in video: 100%|██████████| 18/18 [00:00<00:00, 34.22it/s]
frame loading (JPEG): 100%|██████████| 18/18 [00:00<00:00, 53.70it/s]
propagate in video: 100%|██████████| 18/18 [00:00<00:00, 34.61it/s]


mask shape: (1, 480, 854), mask_binary shape: (1, 480, 854)
mask_binary unique values: [False  True]
mask shape: (1, 480, 854), mask_binary shape: (1, 480, 854)
mask_binary unique values: [False  True]
Processed frame 1/18
mask shape: (1, 480, 854), mask_binary shape: (1, 480, 854)
mask_binary unique values: [False  True]
mask shape: (1, 480, 854), mask_binary shape: (1, 480, 854)
mask_binary unique values: [False  True]
Processed frame 2/18
mask shape: (1, 480, 854), mask_binary shape: (1, 480, 854)
mask_binary unique values: [False  True]
mask shape: (1, 480, 854), mask_binary shape: (1, 480, 854)
mask_binary unique values: [False  True]
Processed frame 3/18
mask shape: (1, 480, 854), mask_binary shape: (1, 480, 854)
mask_binary unique values: [False  True]
mask shape: (1, 480, 854), mask_binary shape: (1, 480, 854)
mask_binary unique values: [False  True]
Processed frame 4/18
mask shape: (1, 480, 854), mask_binary shape: (1, 480, 854)
mask_binary unique values: [False  True]
mask sh

frame loading (JPEG): 100%|██████████| 18/18 [00:00<00:00, 52.18it/s]
propagate in video: 100%|██████████| 18/18 [00:00<00:00, 34.00it/s]
frame loading (JPEG): 100%|██████████| 18/18 [00:00<00:00, 52.15it/s]
propagate in video: 100%|██████████| 18/18 [00:00<00:00, 34.57it/s]


mask shape: (1, 480, 854), mask_binary shape: (1, 480, 854)
mask_binary unique values: [False  True]
mask shape: (1, 480, 854), mask_binary shape: (1, 480, 854)
mask_binary unique values: [False  True]
Processed frame 1/18
mask shape: (1, 480, 854), mask_binary shape: (1, 480, 854)
mask_binary unique values: [False  True]
mask shape: (1, 480, 854), mask_binary shape: (1, 480, 854)
mask_binary unique values: [False  True]
Processed frame 2/18
mask shape: (1, 480, 854), mask_binary shape: (1, 480, 854)
mask_binary unique values: [False  True]
mask shape: (1, 480, 854), mask_binary shape: (1, 480, 854)
mask_binary unique values: [False  True]
Processed frame 3/18
mask shape: (1, 480, 854), mask_binary shape: (1, 480, 854)
mask_binary unique values: [False  True]
mask shape: (1, 480, 854), mask_binary shape: (1, 480, 854)
mask_binary unique values: [False  True]
Processed frame 4/18
mask shape: (1, 480, 854), mask_binary shape: (1, 480, 854)
mask_binary unique values: [False  True]
mask sh

frame loading (JPEG): 100%|██████████| 18/18 [00:00<00:00, 52.76it/s]
propagate in video: 100%|██████████| 18/18 [00:00<00:00, 34.28it/s]
frame loading (JPEG): 100%|██████████| 18/18 [00:00<00:00, 51.42it/s]
propagate in video: 100%|██████████| 18/18 [00:00<00:00, 34.56it/s]


mask shape: (1, 480, 862), mask_binary shape: (1, 480, 862)
mask_binary unique values: [False  True]
mask shape: (1, 480, 862), mask_binary shape: (1, 480, 862)
mask_binary unique values: [False  True]
Processed frame 1/18
mask shape: (1, 480, 862), mask_binary shape: (1, 480, 862)
mask_binary unique values: [False  True]
mask shape: (1, 480, 862), mask_binary shape: (1, 480, 862)
mask_binary unique values: [False  True]
Processed frame 2/18
mask shape: (1, 480, 862), mask_binary shape: (1, 480, 862)
mask_binary unique values: [False  True]
mask shape: (1, 480, 862), mask_binary shape: (1, 480, 862)
mask_binary unique values: [False  True]
Processed frame 3/18
mask shape: (1, 480, 862), mask_binary shape: (1, 480, 862)
mask_binary unique values: [False  True]
mask shape: (1, 480, 862), mask_binary shape: (1, 480, 862)
mask_binary unique values: [False  True]
Processed frame 4/18
mask shape: (1, 480, 862), mask_binary shape: (1, 480, 862)
mask_binary unique values: [False  True]
mask sh

frame loading (JPEG): 100%|██████████| 18/18 [00:00<00:00, 52.31it/s]
propagate in video: 100%|██████████| 18/18 [00:00<00:00, 34.22it/s]
frame loading (JPEG): 100%|██████████| 18/18 [00:00<00:00, 54.01it/s]
propagate in video: 100%|██████████| 18/18 [00:00<00:00, 34.60it/s]


mask shape: (1, 480, 640), mask_binary shape: (1, 480, 640)
mask_binary unique values: [False  True]
mask shape: (1, 480, 640), mask_binary shape: (1, 480, 640)
mask_binary unique values: [False  True]
Processed frame 1/18
mask shape: (1, 480, 640), mask_binary shape: (1, 480, 640)
mask_binary unique values: [False  True]
mask shape: (1, 480, 640), mask_binary shape: (1, 480, 640)
mask_binary unique values: [False  True]
Processed frame 2/18
mask shape: (1, 480, 640), mask_binary shape: (1, 480, 640)
mask_binary unique values: [False  True]
mask shape: (1, 480, 640), mask_binary shape: (1, 480, 640)
mask_binary unique values: [False  True]
Processed frame 3/18
mask shape: (1, 480, 640), mask_binary shape: (1, 480, 640)
mask_binary unique values: [False  True]
mask shape: (1, 480, 640), mask_binary shape: (1, 480, 640)
mask_binary unique values: [False  True]
Processed frame 4/18
mask shape: (1, 480, 640), mask_binary shape: (1, 480, 640)
mask_binary unique values: [False  True]
mask sh

frame loading (JPEG): 100%|██████████| 18/18 [00:00<00:00, 50.18it/s]
propagate in video: 100%|██████████| 18/18 [00:00<00:00, 33.99it/s]
frame loading (JPEG): 100%|██████████| 18/18 [00:00<00:00, 52.09it/s]
propagate in video: 100%|██████████| 18/18 [00:00<00:00, 34.60it/s]


mask shape: (1, 480, 854), mask_binary shape: (1, 480, 854)
mask_binary unique values: [False  True]
mask shape: (1, 480, 854), mask_binary shape: (1, 480, 854)
mask_binary unique values: [False  True]
Processed frame 1/18
mask shape: (1, 480, 854), mask_binary shape: (1, 480, 854)
mask_binary unique values: [False  True]
mask shape: (1, 480, 854), mask_binary shape: (1, 480, 854)
mask_binary unique values: [False  True]
Processed frame 2/18
mask shape: (1, 480, 854), mask_binary shape: (1, 480, 854)
mask_binary unique values: [False  True]
mask shape: (1, 480, 854), mask_binary shape: (1, 480, 854)
mask_binary unique values: [False  True]
Processed frame 3/18
mask shape: (1, 480, 854), mask_binary shape: (1, 480, 854)
mask_binary unique values: [False  True]
mask shape: (1, 480, 854), mask_binary shape: (1, 480, 854)
mask_binary unique values: [False  True]
Processed frame 4/18
mask shape: (1, 480, 854), mask_binary shape: (1, 480, 854)
mask_binary unique values: [False  True]
mask sh

frame loading (JPEG): 100%|██████████| 18/18 [00:00<00:00, 52.98it/s]
propagate in video: 100%|██████████| 18/18 [00:00<00:00, 33.89it/s]
frame loading (JPEG): 100%|██████████| 18/18 [00:00<00:00, 53.45it/s]
propagate in video: 100%|██████████| 18/18 [00:00<00:00, 34.61it/s]


mask shape: (1, 480, 720), mask_binary shape: (1, 480, 720)
mask_binary unique values: [False  True]
mask shape: (1, 480, 720), mask_binary shape: (1, 480, 720)
mask_binary unique values: [False  True]
Processed frame 1/18
mask shape: (1, 480, 720), mask_binary shape: (1, 480, 720)
mask_binary unique values: [False  True]
mask shape: (1, 480, 720), mask_binary shape: (1, 480, 720)
mask_binary unique values: [False  True]
Processed frame 2/18
mask shape: (1, 480, 720), mask_binary shape: (1, 480, 720)
mask_binary unique values: [False  True]
mask shape: (1, 480, 720), mask_binary shape: (1, 480, 720)
mask_binary unique values: [False  True]
Processed frame 3/18
mask shape: (1, 480, 720), mask_binary shape: (1, 480, 720)
mask_binary unique values: [False  True]
mask shape: (1, 480, 720), mask_binary shape: (1, 480, 720)
mask_binary unique values: [False  True]
Processed frame 4/18
mask shape: (1, 480, 720), mask_binary shape: (1, 480, 720)
mask_binary unique values: [False  True]
mask sh

In [None]:
import os
import numpy as np
import cv2
from PIL import Image
import csv

# Load coordinates from CSV file
coordinates = []
with open('final.csv', 'r') as f:
    reader = csv.DictReader(f)
    for row in reader:
        coordinates.append({
            'folder_name': row['folder_name'],
            'box': [float(row['box_x1']), float(row['box_y1']), float(row['box_x1']) + float(row['width']), float(row['box_y1']) + float(row['height'])],
            'frame': int(row['frame']),
            'point_x': float(row['point_x']) if row['point_x'] else None,
            'point_y': float(row['point_y']) if row['point_y'] else None
        })

for coord in coordinates:
    folder_name = coord['folder_name']
    box = np.array(coord['box'], dtype=np.float32)
    frame_index = coord['frame']
    point_x = coord['point_x']
    point_y = coord['point_y']

    frame_dir = f"./HOIST/valid/JPEGImages/{folder_name}"
    output_dir = f"./TEST/ver2"
    os.makedirs(output_dir, exist_ok=True)

    frame_names = sorted([p for p in os.listdir(frame_dir) if os.path.splitext(p)[-1].lower() in [".jpg", ".jpeg"]], key=lambda p: int(os.path.splitext(p)[0].split('_')[-1]))

    if frame_names and frame_index < len(frame_names):
        frame_path = os.path.join(frame_dir, frame_names[frame_index])
        frame = cv2.imread(frame_path)

        # Draw the box on the specified frame
        x1, y1, x2, y2 = map(int, box)
        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

        # Draw the center point on the specified frame
        if point_x is not None and point_y is not None:
            cv2.circle(frame, (int(point_x), int(point_y)), 5, (0, 0, 255), -1)

        # Save the image with the box and point
        output_frame_path = os.path.join(output_dir, f"{folder_name}_frame_{frame_index}_with_box_and_point.jpg")
        cv2.imwrite(output_frame_path, frame)

        print(f"Processed folder {folder_name}, frame {frame_index}")

print("Verification complete.")

In [None]:
import os
import numpy as np
import torch
from PIL import Image
import gc
import time
import csv
import json
from skimage.measure import find_contours
from sam2.build_sam import build_sam2_video_predictor

# Set device for computation
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
print(f"using device: {device}")

if device.type == "cuda":
    torch.autocast("cuda", dtype=torch.bfloat16).__enter__()
    if torch.cuda.get_device_properties(0).major >= 8:
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
elif device.type == "mps":
    print("\nSupport for MPS devices is preliminary. SAM 2 is trained with CUDA and might give numerically different outputs and sometimes degraded performance on MPS. See e.g. https://github.com/pytorch/pytorch/issues/84936 for a discussion.")

# Initialize predictor
sam2_checkpoint = "../checkpoints/sam2.1_hiera_large.pt"
model_cfg = "configs/sam2.1/sam2.1_hiera_l.yaml"
predictor = build_sam2_video_predictor(model_cfg, sam2_checkpoint, device=device)

# Load coordinates from CSV file
coordinates = []
with open('ver.csv', 'r') as f:
    reader = csv.DictReader(f)
    for row in reader:
        coordinates.append({
            'folder_name': row['folder_name'],
            'point': [float(row['hand_point_x']), float(row['hand_point_y'])],
            'box': [float(row['box_x1']), float(row['box_y1']), float(row['box_x2']) , float(row['box_y1'])],
            'frame': int(row['frame'])
        })

for coord in coordinates:
    folder_name = coord['folder_name']
    point = np.array(coord['point'], dtype=np.float32)
    box = np.array(coord['box'], dtype=np.float32)
    frame_index = coord['frame']

    frame_dir = f"./HOIST/valid/JPEGImages/{folder_name}"
    output_dir_box = f"./collected/box"
    output_dir_dot = f"./collected/dot"
    os.makedirs(output_dir_box, exist_ok=True)
    os.makedirs(output_dir_dot, exist_ok=True)

    frame_names = sorted([p for p in os.listdir(frame_dir) if os.path.splitext(p)[-1].lower() in [".jpg", ".jpeg"]], key=lambda p: int(os.path.splitext(p)[0].split('_')[-1]))

    # Box prompt
    inference_state = predictor.init_state(video_path=frame_dir)
    predictor.reset_state(inference_state)
    ann_frame_idx = frame_index  # the frame index we interact with
    ann_obj_id = 1  # give a unique id to each object we interact with (it can be any integers)
    _, out_obj_ids, out_mask_logits = predictor.add_new_points_or_box(inference_state, frame_idx=ann_frame_idx, obj_id=ann_obj_id, box=box)

    video_segments_box = {out_frame_idx: {out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy() for i, out_obj_id in enumerate(out_obj_ids)} for out_frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(inference_state)}

    # Dot prompt
    inference_state = predictor.init_state(video_path=frame_dir)
    predictor.reset_state(inference_state)
    points, labels = point.reshape(1, -1), np.array([1], np.int32)
    _, out_obj_ids, out_mask_logits = predictor.add_new_points_or_box(inference_state, frame_idx=frame_index, obj_id=1, points=points, labels=labels)

    video_segments_dot = {out_frame_idx: {out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy() for i, out_obj_id in enumerate(out_obj_ids)} for out_frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(inference_state)}

    # Start timing
    start_time = time.time()

    mask_data_box = {}
    mask_data_dot = {}

    def mask_to_polygon(mask):
        # Convert mask to binary
        mask_binary = mask.astype(bool)
        print(f"mask shape: {mask.shape}, mask_binary shape: {mask_binary.shape}")
        print(f"mask_binary unique values: {np.unique(mask_binary)}")
        
        # Check if mask is empty
        if mask_binary.size == 0 or np.all(mask_binary == 0):
            return []
        
        # Ensure mask is 2D
        if mask_binary.ndim > 2:
            mask_binary = mask_binary[0]
        
        # Find contours
        contours = find_contours(mask_binary, level=0.5)
        # Convert contours to polygons
        polygons = [contour.tolist() for contour in contours]
        return polygons

    for frame_idx in range(len(frame_names)):
        frame_masks_box = []
        frame_masks_dot = []
        
        # Box prompt
        for obj_id, mask in video_segments_box.get(frame_idx, {}).items():
            polygons = mask_to_polygon(mask)
            frame_masks_box.append({
                "obj_id": obj_id,
                "polygons": polygons
            })
        mask_data_box[frame_idx] = frame_masks_box
        
        # Dot prompt
        for obj_id, mask in video_segments_dot.get(frame_idx, {}).items():
            polygons = mask_to_polygon(mask)
            frame_masks_dot.append({
                "obj_id": obj_id,
                "polygons": polygons
            })
        mask_data_dot[frame_idx] = frame_masks_dot
        
        gc.collect()  # Force garbage collection
        print(f"Processed frame {frame_idx + 1}/{len(frame_names)}")

    # Save mask data to JSON
    with open(os.path.join(output_dir_box, f"{folder_name}.json"), "w") as json_file:
        json.dump(mask_data_box, json_file, indent=4)

    with open(os.path.join(output_dir_dot, f"{folder_name}.json"), "w") as json_file:
        json.dump(mask_data_dot, json_file, indent=4)

    # End timing
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Total time taken: {elapsed_time:.2f} seconds")

For verifier

In [2]:
import os
import numpy as np
import cv2
import csv

# Load coordinates from CSV file
coordinates = []
with open('/home/totargaming/workspace/sam2/aria-handtracking/ver.csv', 'r') as f:
    reader = csv.DictReader(f)
    for row in reader:
        coordinates.append({
            'folder_name': row['folder_name'],
            'hand_point_x': float(row['hand_point_x']),
            'hand_point_y': float(row['hand_point_y']),
            'obj_point_x': float(row['obj_point_x']),
            'obj_point_y': float(row['obj_point_y']),
            'box_x1': float(row['box_x1']),
            'box_y1': float(row['box_y1']),
            'box_x2': float(row['box_x2']),
            'box_y2': float(row['box_y2']),
            'frame': int(row['frame'])
        })

for coord in coordinates:
    folder_name = coord['folder_name']
    hand_point_x = coord['hand_point_x']
    hand_point_y = coord['hand_point_y']
    obj_point_x = coord['obj_point_x']
    obj_point_y = coord['obj_point_y']
    box_x1 = coord['box_x1']
    box_y1 = coord['box_y1']
    box_x2 = coord['box_x2']
    box_y2 = coord['box_y2']
    frame_index = coord['frame']

    frame_dir = f"./HOIST/valid/JPEGImages/{folder_name}"
    output_dir = f"./TEST2/verifier"
    os.makedirs(output_dir, exist_ok=True)

    frame_names = sorted([p for p in os.listdir(frame_dir) if os.path.splitext(p)[-1].lower() in [".jpg", ".jpeg"]], key=lambda p: int(os.path.splitext(p)[0].split('_')[-1]))

    if frame_names and frame_index < len(frame_names):
        frame_path = os.path.join(frame_dir, frame_names[frame_index])
        frame = cv2.imread(frame_path)

        # Draw the combined bounding box on the specified frame
        cv2.rectangle(frame, (int(box_x1), int(box_y1)), (int(box_x2), int(box_y2)), (0, 255, 0), 2)

        # Draw the hand point on the specified frame
        cv2.circle(frame, (int(hand_point_x), int(hand_point_y)), 5, (0, 0, 255), -1)

        # Draw the object point on the specified frame
        cv2.circle(frame, (int(obj_point_x), int(obj_point_y)), 5, (255, 0, 0), -1)

        # Save the image with the box and points
        output_frame_path = os.path.join(output_dir, f"{folder_name}_frame_{frame_index}_with_box_and_points.jpg")
        cv2.imwrite(output_frame_path, frame)

        print(f"Processed folder {folder_name}, frame {frame_index}")

print("Verification complete.")

Processed folder GQ9DjpYb8kI_453, frame 0
Verification complete.


In [3]:
import os
import numpy as np
import torch
import matplotlib.pyplot as plt
from PIL import Image
import cv2
from sam2.build_sam import build_sam2_video_predictor
import gc
import time
import csv
import json
from skimage.measure import find_contours
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas

# Set device for computation
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
print(f"using device: {device}")

if device.type == "cuda":
    torch.autocast("cuda", dtype=torch.bfloat16).__enter__()
    if torch.cuda.get_device_properties(0).major >= 8:
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
elif device.type == "mps":
    print("\nSupport for MPS devices is preliminary. SAM 2 is trained with CUDA and might give numerically different outputs and sometimes degraded performance on MPS. See e.g. https://github.com/pytorch/pytorch/issues/84936 for a discussion.")

# Initialize predictor
sam2_checkpoint = "../checkpoints/sam2.1_hiera_large.pt"
model_cfg = "configs/sam2.1/sam2.1_hiera_l.yaml"
predictor = build_sam2_video_predictor(model_cfg, sam2_checkpoint, device=device)

def show_mask(mask, ax, obj_id=None, random_color=False):
    color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0) if random_color else np.array([*plt.get_cmap("tab10")(0 if obj_id is None else obj_id)[:3], 0.6])
    ax.imshow(mask.reshape(*mask.shape[-2:], 1) * color.reshape(1, 1, -1))

# Load coordinates from CSV file
coordinates = []
with open('/home/totargaming/workspace/sam2/aria-handtracking/ver.csv', 'r') as f:
    reader = csv.DictReader(f)
    for row in reader:
        coordinates.append({
            'folder_name': row['folder_name'],
            'hand_point_x': float(row['hand_point_x']),
            'hand_point_y': float(row['hand_point_y']),
            'obj_point_x': float(row['obj_point_x']),
            'obj_point_y': float(row['obj_point_y']),
            'box_x1': float(row['box_x1']),
            'box_y1': float(row['box_y1']),
            'box_x2': float(row['box_x2']),
            'box_y2': float(row['box_y2']),
            'frame': int(row['frame'])
        })

for coord in coordinates:
    folder_name = coord['folder_name']
    hand_point_x = coord['hand_point_x']
    hand_point_y = coord['hand_point_y']
    obj_point_x = coord['obj_point_x']
    obj_point_y = coord['obj_point_y']
    box_x1 = coord['box_x1']
    box_y1 = coord['box_y1']
    box_x2 = coord['box_x2']
    box_y2 = coord['box_y2']
    frame_index = coord['frame']

    frame_dir = f"./HOIST/valid/JPEGImages/{folder_name}"
    output_dir = f"./TEST2/{folder_name}_output"
    os.makedirs(output_dir, exist_ok=True)

    frame_names = sorted([p for p in os.listdir(frame_dir) if os.path.splitext(p)[-1].lower() in [".jpg", ".jpeg"]], key=lambda p: int(os.path.splitext(p)[0].split('_')[-1]))

    if frame_names and frame_index < len(frame_names):
        frame_path = os.path.join(frame_dir, frame_names[frame_index])
        frame = cv2.imread(frame_path)

        # Initialize inference state
        inference_state = predictor.init_state(video_path=frame_dir)
        predictor.reset_state(inference_state)

        # Add bounding box for the object
        obj_box = np.array([box_x1, box_y1, box_x2, box_y2], dtype=np.float32)
        _, out_obj_ids, out_mask_logits = predictor.add_new_points_or_box(inference_state, frame_idx=frame_index, obj_id=1, box=obj_box)

        # Add dot for the hand as a negative prompt
        points = np.array([[hand_point_x, hand_point_y]], dtype=np.float32)
        labels = np.array([0], np.int32)  # 0 indicates a negative prompt
        _, out_obj_ids, out_mask_logits = predictor.add_new_points_or_box(inference_state, frame_idx=frame_index, obj_id=1, points=points, labels=labels)

        # Propagate the prompts to get the masklet across the video
        video_segments = {out_frame_idx: {out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy() for i, out_obj_id in enumerate(out_obj_ids)} for out_frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(inference_state)}

        # Save images
        for frame_idx in range(len(frame_names)):
            frame = Image.open(os.path.join(frame_dir, frame_names[frame_idx]))
            fig, ax = plt.subplots(figsize=(frame.width / 100, frame.height / 100), dpi=100)
            ax.axis('off')
            ax.imshow(frame)

            for obj_id, mask in video_segments.get(frame_idx, {}).items():
                show_mask(mask, ax, obj_id=obj_id)

            canvas = FigureCanvas(fig)
            canvas.draw()
            img = np.frombuffer(canvas.tostring_argb(), dtype='uint8').reshape(fig.canvas.get_width_height()[::-1] + (4,))
            img = img[:, :, [1, 2, 3]]  # Convert ARGB to RGB
            output_frame_path = os.path.join(output_dir, f"{frame_idx:05d}.jpg")
            cv2.imwrite(output_frame_path, cv2.cvtColor(img, cv2.COLOR_RGB2BGR))
            plt.close(fig)

            del frame, fig, ax, canvas, img  # Free up memory
            gc.collect()  # Force garbage collection
            print(f"Processed frame {frame_idx + 1}/{len(frame_names)}")

        # Save mask data to JSON
        mask_data = {frame_idx: {obj_id: mask.tolist() for obj_id, mask in masks.items()} for frame_idx, masks in video_segments.items()}
        with open(os.path.join(output_dir, "mask_data.json"), "w") as json_file:
            json.dump(mask_data, json_file, indent=4)

        print(f"Total time taken: {time.time() - start_time:.2f} seconds")

using device: cuda


frame loading (JPEG): 100%|██████████| 18/18 [00:00<00:00, 52.49it/s]
propagate in video: 100%|██████████| 18/18 [00:00<00:00, 33.40it/s]


Processed frame 1/18
Processed frame 2/18
Processed frame 3/18
Processed frame 4/18
Processed frame 5/18
Processed frame 6/18
Processed frame 7/18
Processed frame 8/18
Processed frame 9/18
Processed frame 10/18
Processed frame 11/18
Processed frame 12/18
Processed frame 13/18
Processed frame 14/18
Processed frame 15/18
Processed frame 16/18
Processed frame 17/18
Processed frame 18/18


NameError: name 'start_time' is not defined

In [8]:
import os
import numpy as np
import torch
import matplotlib.pyplot as plt
from PIL import Image
import cv2
from sam2.build_sam import build_sam2_video_predictor
import gc
import time
import csv
import json
from skimage.measure import find_contours
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas

# Set device for computation
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
print(f"using device: {device}")

if device.type == "cuda":
    torch.autocast("cuda", dtype=torch.bfloat16).__enter__()
    if torch.cuda.get_device_properties(0).major >= 8:
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
elif device.type == "mps":
    print("\nSupport for MPS devices is preliminary. SAM 2 is trained with CUDA and might give numerically different outputs and sometimes degraded performance on MPS. See e.g. https://github.com/pytorch/pytorch/issues/84936 for a discussion.")

# Initialize predictor
sam2_checkpoint = "../checkpoints/sam2.1_hiera_large.pt"
model_cfg = "configs/sam2.1/sam2.1_hiera_l.yaml"
predictor = build_sam2_video_predictor(model_cfg, sam2_checkpoint, device=device)

def show_mask(mask, ax, obj_id=None, random_color=False):
    color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0) if random_color else np.array([*plt.get_cmap("tab10")(0 if obj_id is None else obj_id)[:3], 0.6])
    ax.imshow(mask.reshape(*mask.shape[-2:], 1) * color.reshape(1, 1, -1))

# Load coordinates from CSV file
coordinates = []
with open('/home/totargaming/workspace/sam2/aria-handtracking/ver.csv', 'r') as f:
    reader = csv.DictReader(f)
    for row in reader:
        coordinates.append({
            'folder_name': row['folder_name'],
            'hand_point_x': float(row['hand_point_x']),
            'hand_point_y': float(row['hand_point_y']),
            'obj_point_x': float(row['obj_point_x']),
            'obj_point_y': float(row['obj_point_y']),
            'box_x1': float(row['box_x1']),
            'box_y1': float(row['box_y1']),
            'box_x2': float(row['box_x2']),
            'box_y2': float(row['box_y2']),
            'frame': int(row['frame'])
        })

for coord in coordinates:
    folder_name = coord['folder_name']
    hand_point_x = coord['hand_point_x']
    hand_point_y = coord['hand_point_y']
    obj_point_x = coord['obj_point_x']
    obj_point_y = coord['obj_point_y']
    box_x1 = coord['box_x1']
    box_y1 = coord['box_y1']
    box_x2 = coord['box_x2']
    box_y2 = coord['box_y2']
    frame_index = coord['frame']

    frame_dir = f"./HOIST/valid/JPEGImages/{folder_name}"
    output_dir = f"./TEST2/{folder_name}_output_dot"
    os.makedirs(output_dir, exist_ok=True)

    frame_names = sorted([p for p in os.listdir(frame_dir) if os.path.splitext(p)[-1].lower() in [".jpg", ".jpeg"]], key=lambda p: int(os.path.splitext(p)[0].split('_')[-1]))

    if frame_names and frame_index < len(frame_names):
        frame_path = os.path.join(frame_dir, frame_names[frame_index])
        frame = cv2.imread(frame_path)

        # Initialize inference state
        inference_state = predictor.init_state(video_path=frame_dir)
        predictor.reset_state(inference_state)

        # Add positive prompts for the hand and the object
        points = np.array([[hand_point_x, hand_point_y], [obj_point_x, obj_point_y]], dtype=np.float32)
        labels = np.array([1, 1], np.int32)  # 1 indicates a positive prompt
        _, out_obj_ids, out_mask_logits = predictor.add_new_points_or_box(inference_state, frame_idx=frame_index, obj_id=1, points=points, labels=labels)

        # Add the hand point as a negative prompt
        points = np.array([[hand_point_x, hand_point_y]], dtype=np.float32)
        labels = np.array([0], np.int32)  # 0 indicates a negative prompt
        _, out_obj_ids, out_mask_logits = predictor.add_new_points_or_box(inference_state, frame_idx=frame_index, obj_id=1, points=points, labels=labels)

        # Propagate the prompts to get the masklet across the video
        video_segments = {out_frame_idx: {out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy() for i, out_obj_id in enumerate(out_obj_ids)} for out_frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(inference_state)}

        # Save images
        for frame_idx in range(len(frame_names)):
            frame = Image.open(os.path.join(frame_dir, frame_names[frame_idx]))
            fig, ax = plt.subplots(figsize=(frame.width / 100, frame.height / 100), dpi=100)
            ax.axis('off')
            ax.imshow(frame)

            for obj_id, mask in video_segments.get(frame_idx, {}).items():
                show_mask(mask, ax, obj_id=obj_id)

            canvas = FigureCanvas(fig)
            canvas.draw()
            img = np.frombuffer(canvas.tostring_argb(), dtype='uint8').reshape(fig.canvas.get_width_height()[::-1] + (4,))
            img = img[:, :, [1, 2, 3]]  # Convert ARGB to RGB
            output_frame_path = os.path.join(output_dir, f"{frame_idx:05d}.jpg")
            cv2.imwrite(output_frame_path, cv2.cvtColor(img, cv2.COLOR_RGB2BGR))
            plt.close(fig)

            del frame, fig, ax, canvas, img  # Free up memory
            gc.collect()  # Force garbage collection
            print(f"Processed frame {frame_idx + 1}/{len(frame_names)}")

        # Save mask data to JSON
        mask_data = {frame_idx: {obj_id: mask.tolist() for obj_id, mask in masks.items()} for frame_idx, masks in video_segments.items()}
        with open(os.path.join(output_dir, "mask_data.json"), "w") as json_file:
            json.dump(mask_data, json_file, indent=4)



using device: cuda


frame loading (JPEG): 100%|██████████| 18/18 [00:00<00:00, 49.19it/s]
propagate in video: 100%|██████████| 18/18 [00:00<00:00, 33.82it/s]


Processed frame 1/18
Processed frame 2/18
Processed frame 3/18
Processed frame 4/18
Processed frame 5/18
Processed frame 6/18
Processed frame 7/18
Processed frame 8/18
Processed frame 9/18
Processed frame 10/18
Processed frame 11/18
Processed frame 12/18
Processed frame 13/18
Processed frame 14/18
Processed frame 15/18
Processed frame 16/18
Processed frame 17/18
Processed frame 18/18


frame loading (JPEG): 100%|██████████| 67/67 [00:01<00:00, 48.35it/s]
propagate in video: 100%|██████████| 67/67 [00:02<00:00, 32.19it/s]


Processed frame 1/67
Processed frame 2/67
Processed frame 3/67
Processed frame 4/67
Processed frame 5/67
Processed frame 6/67
Processed frame 7/67
Processed frame 8/67
Processed frame 9/67
Processed frame 10/67
Processed frame 11/67
Processed frame 12/67
Processed frame 13/67
Processed frame 14/67
Processed frame 15/67
Processed frame 16/67
Processed frame 17/67
Processed frame 18/67
Processed frame 19/67
Processed frame 20/67
Processed frame 21/67
Processed frame 22/67
Processed frame 23/67
Processed frame 24/67
Processed frame 25/67
Processed frame 26/67
Processed frame 27/67
Processed frame 28/67
Processed frame 29/67
Processed frame 30/67
Processed frame 31/67
Processed frame 32/67
Processed frame 33/67
Processed frame 34/67
Processed frame 35/67
Processed frame 36/67
Processed frame 37/67
Processed frame 38/67
Processed frame 39/67
Processed frame 40/67
Processed frame 41/67
Processed frame 42/67
Processed frame 43/67
Processed frame 44/67
Processed frame 45/67
Processed frame 46/

frame loading (JPEG): 100%|██████████| 18/18 [00:00<00:00, 49.47it/s]
propagate in video: 100%|██████████| 18/18 [00:00<00:00, 33.80it/s]


Processed frame 1/18
Processed frame 2/18
Processed frame 3/18
Processed frame 4/18
Processed frame 5/18
Processed frame 6/18
Processed frame 7/18
Processed frame 8/18
Processed frame 9/18
Processed frame 10/18
Processed frame 11/18
Processed frame 12/18
Processed frame 13/18
Processed frame 14/18
Processed frame 15/18
Processed frame 16/18
Processed frame 17/18
Processed frame 18/18


frame loading (JPEG): 100%|██████████| 18/18 [00:00<00:00, 49.37it/s]
propagate in video: 100%|██████████| 18/18 [00:00<00:00, 33.72it/s]


Processed frame 1/18
Processed frame 2/18
Processed frame 3/18
Processed frame 4/18
Processed frame 5/18
Processed frame 6/18
Processed frame 7/18
Processed frame 8/18
Processed frame 9/18
Processed frame 10/18
Processed frame 11/18
Processed frame 12/18
Processed frame 13/18
Processed frame 14/18
Processed frame 15/18
Processed frame 16/18
Processed frame 17/18
Processed frame 18/18


frame loading (JPEG): 100%|██████████| 18/18 [00:00<00:00, 50.16it/s]
propagate in video: 100%|██████████| 18/18 [00:00<00:00, 33.35it/s]


Processed frame 1/18
Processed frame 2/18
Processed frame 3/18
Processed frame 4/18
Processed frame 5/18
Processed frame 6/18
Processed frame 7/18
Processed frame 8/18
Processed frame 9/18
Processed frame 10/18
Processed frame 11/18
Processed frame 12/18
Processed frame 13/18
Processed frame 14/18
Processed frame 15/18
Processed frame 16/18
Processed frame 17/18
Processed frame 18/18


In [None]:
import os
import numpy as np
import torch
from PIL import Image
import gc
import time
import csv
import json
from skimage.measure import find_contours
from sam2.build_sam import build_sam2_video_predictor

# Set device for computation
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
print(f"using device: {device}")

if device.type == "cuda":
    torch.autocast("cuda", dtype=torch.bfloat16).__enter__()
    if torch.cuda.get_device_properties(0).major >= 8:
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
elif device.type == "mps":
    print("\nSupport for MPS devices is preliminary. SAM 2 is trained with CUDA and might give numerically different outputs and sometimes degraded performance on MPS. See e.g. https://github.com/pytorch/pytorch/issues/84936 for a discussion.")

# Initialize predictor
sam2_checkpoint = "../checkpoints/sam2.1_hiera_large.pt"
model_cfg = "configs/sam2.1/sam2.1_hiera_l.yaml"
predictor = build_sam2_video_predictor(model_cfg, sam2_checkpoint, device=device)

# Load coordinates from CSV file
coordinates = []
with open('/home/totargaming/workspace/sam2/aria-handtracking/merged_output.csv', 'r') as f:
    reader = csv.DictReader(f)
    for row in reader:
        coordinates.append({
            'folder_name': row['folder_name'],
            'hand_point_x': float(row['hand_point_x']),
            'hand_point_y': float(row['hand_point_y']),
            'obj_point_x': float(row['obj_point_x']),
            'obj_point_y': float(row['obj_point_y']),
            'box_x1': float(row['box_x1']),
            'box_y1': float(row['box_y1']),
            'box_x2': float(row['box_x2']),
            'box_y2': float(row['box_y2']),
            'frame': int(row['frame'])
        })

for coord in coordinates:
    folder_name = coord['folder_name']
    hand_point_x = coord['hand_point_x']
    hand_point_y = coord['hand_point_y']
    obj_point_x = coord['obj_point_x']
    obj_point_y = coord['obj_point_y']
    box_x1 = coord['box_x1']
    box_y1 = coord['box_y1']
    box_x2 = coord['box_x2']
    box_y2 = coord['box_y2']
    frame_index = coord['frame']

    frame_dir = f"./HOIST/valid/JPEGImages/{folder_name}"
    output_dir = f"./collected_new"
    os.makedirs(output_dir, exist_ok=True)

    frame_names = sorted([p for p in os.listdir(frame_dir) if os.path.splitext(p)[-1].lower() in [".jpg", ".jpeg"]], key=lambda p: int(os.path.splitext(p)[0].split('_')[-1]))

    if frame_names and frame_index < len(frame_names):
        frame_path = os.path.join(frame_dir, frame_names[frame_index])
        frame = Image.open(frame_path)

        # Initialize inference state
        inference_state = predictor.init_state(video_path=frame_dir)
        predictor.reset_state(inference_state)

        # Add positive prompts for the hand and the object
        points = np.array([[hand_point_x, hand_point_y], [obj_point_x, obj_point_y]], dtype=np.float32)
        labels = np.array([1, 1], np.int32)  # 1 indicates a positive prompt
        _, out_obj_ids, out_mask_logits = predictor.add_new_points_or_box(inference_state, frame_idx=frame_index, obj_id=1, points=points, labels=labels)

        # Add the hand point as a negative prompt
        points = np.array([[hand_point_x, hand_point_y]], dtype=np.float32)
        labels = np.array([0], np.int32)  # 0 indicates a negative prompt
        _, out_obj_ids, out_mask_logits = predictor.add_new_points_or_box(inference_state, frame_idx=frame_index, obj_id=1, points=points, labels=labels)

        # Propagate the prompts to get the masklet across the video
        video_segments = {out_frame_idx: {out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy() for i, out_obj_id in enumerate(out_obj_ids)} for out_frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(inference_state)}

        # Start timing
        start_time = time.time()

        mask_data = {}

        def mask_to_polygon(mask):
            # Convert mask to binary
            mask_binary = mask.astype(bool)
            print(f"mask shape: {mask.shape}, mask_binary shape: {mask_binary.shape}")
            print(f"mask_binary unique values: {np.unique(mask_binary)}")
            
            # Check if mask is empty
            if mask_binary.size == 0 or np.all(mask_binary == 0):
                return []
            
            # Ensure mask is 2D
            if mask_binary.ndim > 2:
                mask_binary = mask_binary[0]
            
            # Find contours
            contours = find_contours(mask_binary, level=0.5)
            # Convert contours to polygons
            polygons = [contour.tolist() for contour in contours]
            return polygons

        for frame_idx in range(len(frame_names)):
            frame_masks = []
            
            # Process masks
            for obj_id, mask in video_segments.get(frame_idx, {}).items():
                polygons = mask_to_polygon(mask)
                frame_masks.append({
                    "obj_id": obj_id,
                    "polygons": polygons
                })
            mask_data[frame_idx] = frame_masks
            
            gc.collect()  # Force garbage collection
            print(f"Processed frame {frame_idx + 1}/{len(frame_names)}")

        # Save mask data to JSON
        with open(os.path.join(output_dir, f"{folder_name}.json"), "w") as json_file:
            json.dump(mask_data, json_file, indent=4)

        # End timing
        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"Total time taken: {elapsed_time:.2f} seconds")