In [None]:
# !pip install "numpy<2.0" --upgrade
# !pip install torchreid
# !pip install torch==2.0.1 torchvision==0.15.2
# !pip install gdown
# !pip uninstall torchreid -y
# !pip install git+https://github.com/KaiyangZhou/deep-person-reid.git

In [1]:
!pip install roboflow
!pip install torch torchvision
!pip install git+https://github.com/KaiyangZhou/deep-person-reid.git
!pip install gdown
!pip install opencv-python-headless
!pip install scikit-learn

Collecting roboflow
  Downloading roboflow-1.2.7-py3-none-any.whl.metadata (9.7 kB)
Collecting idna==3.7 (from roboflow)
  Downloading idna-3.7-py3-none-any.whl.metadata (9.9 kB)
Collecting opencv-python-headless==4.10.0.84 (from roboflow)
  Downloading opencv_python_headless-4.10.0.84-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Collecting pi-heif<2 (from roboflow)
  Downloading pi_heif-1.1.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (6.5 kB)
Collecting pillow-avif-plugin<2 (from roboflow)
  Downloading pillow_avif_plugin-1.5.2-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting filetype (from roboflow)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Downloading roboflow-1.2.7-py3-none-any.whl (88 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.6/88.6 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading idna-3.7-py3-none-any.whl (66 kB)
[2K   [90m━━━━━━━━━━━━━━━

In [2]:
# from roboflow import Roboflow

# rf = Roboflow(api_key="OFJsbzSXtei8j554tCdF")
# project = rf.workspace().project("walking-staff-detection-ms3uf")
# model = project.version(2).model

loading Roboflow workspace...
loading Roboflow project...


In [3]:
import torch
from torchreid.utils import FeatureExtractor

device = "cuda" if torch.cuda.is_available() else "cpu"

extractor = FeatureExtractor(
    model_name='osnet_x1_0',
    model_path=None,
    device=device
)

print("FeatureExtractor done initialization")

Downloading...
From: https://drive.google.com/uc?id=1LaG1EJpHrxdAxKnSCJ_i0u-nbxSAeiFY
To: /root/.cache/torch/checkpoints/osnet_x1_0_imagenet.pth
100%|██████████| 10.9M/10.9M [00:00<00:00, 25.2MB/s]


Successfully loaded imagenet pretrained weights from "/root/.cache/torch/checkpoints/osnet_x1_0_imagenet.pth"
** The following layers are discarded due to unmatched keys or layer size: ['classifier.weight', 'classifier.bias']
Model: osnet_x1_0
- params: 2,193,616
- flops: 978,878,352
FeatureExtractor done initialization


In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

staff_db = []

def get_embedding(image):
    """
    image: numpy array (H, W, C), BGR or RGB
    return: 512-dim embedded vector
    """
    return extractor(image)[0].cpu().numpy()

def is_staff(feature, threshold=0.85):
    """
    If the feature is similar to any feature in staff_db, return True
    feature: 512-dim embedded vector
    """
    if not staff_db:
        return False
    sims = cosine_similarity([feature], staff_db)[0]
    return np.max(sims) > threshold


In [None]:

def tag_inside_person(tag_pos, person_box):
    """
    tag_pos = (x, y)
    box = [x1, y1, x2, y2]  # left-top corner (x1,y1), right-bottom corner (x2,y2)
    """
    px, py = tag_pos
    return (person_box[0] <= px <= person_box[2]) and (person_box[1] <= py <= person_box[3])

In [None]:
import cv2
import json

with open("data.json", "r") as f:
    results = json.load(f)

video_path = "sample.mp4"
output_path = "output_staff.mp4"

cap = cv2.VideoCapture(video_path)
fps = int(cap.get(cv2.CAP_PROP_FPS))
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

out = cv2.VideoWriter(output_path,
                      cv2.VideoWriter_fourcc(*'mp4v'),
                      fps, (width, height))

frame_count = -1
max_staff_feature = 25
staff_db = []
project = "walking-staff-detection-ms3uf"
staff_occur={}

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    frame_count += 1
    # Get detections for the current frame
    detections = results[project][frame_count]["predictions"]

    detected_class_names_set = set()

    # Check all detected objects
    for prediction in detections:
        class_name = prediction['class']
        if prediction["confidence"] > 0.4:
            detected_class_names_set.add(class_name)

    # Get people and tags with confidence threshold
    people = [obj for obj in detections if obj['class'] == 'people' and obj['confidence'] > 0.6 ]
    tags = [obj for obj in detections if obj['class'] == 'tag' and obj['confidence'] > 0.4 ]

    for p in people:
        x, y, w, h = int(p['x']), int(p['y']), int(p['width']), int(p['height'])
        x1, y1, x2, y2 = x-w//2, y-h//2, x+w//2, y+h//2
        person_box = [x1, y1, x2, y2]

        # Crop the person and get embedding
        person_crop = frame[max(0,y1):max(0,y2), max(0,x1):max(0,x2)]
        feature = get_embedding(person_crop)

        # Check if this person is staff
        staff_flag = is_staff(feature)

        # If already known staff, add to db
        if staff_flag:
            staff_db.append(feature)
            if(len(staff_db)>max_staff_feature):
                staff_db.pop(0)
        # Check if any tag is inside this person box
        if tags and not staff_flag:
            for t in tags:
                tx, ty, tw, th = int(t['x']), int(t['y']), int(t['width']), int(t['height'])
                tag_center = (tx, ty)
                if tag_inside_person(tag_center, person_box):
                    staff_flag = True
                    staff_db.append(feature)
                    if len(staff_db) > max_staff_feature:
                        staff_db.pop(0)
                    break

        # Draw staff/non-staff box
        color = (0, 255, 0) if staff_flag else (0, 0, 255)
        label = "STAFF" if staff_flag else "NON-STAFF"
        if staff_flag :
            staff_occur[frame_count] = (x,y)
        cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
        cv2.putText(frame, label, (x1, y1-10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.9, color, 2)

    # Draw tags
    for t in tags:
        tx, ty, tw, th = int(t['x']), int(t['y']), int(t['width']), int(t['height'])
        tx1, ty1, tx2, ty2 = tx-tw//2, ty-th//2, tx+tw//2, ty+th//2
        cv2.rectangle(frame, (tx1, ty1), (tx2, ty2), (255, 0, 0), 2)
        cv2.circle(frame, (tx, ty), 3, (255, 0, 0), -1)
        cv2.putText(frame, "TAG", (tx1, ty1-10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 0), 2)

    out.write(frame)
cap.release()
out.release()
print(f"Staff DB size: {len(staff_db)}")
print(f"Staff Occur: {len(staff_occur)}")

Frame: 00347 | Detected Classes: people, tag | Staff: 0
Frame: 00378 | Detected Classes: people, tag | Staff: 1
Frame: 00380 | Detected Classes: people, tag | Staff: 3
Frame: 00384 | Detected Classes: people, tag | Staff: 5
Frame: 00385 | Detected Classes: people, tag | Staff: 6
Frame: 00386 | Detected Classes: people, tag | Staff: 7
Frame: 00430 | Detected Classes: people, tag | Staff: 8
Frame: 00432 | Detected Classes: people, tag | Staff: 10
Frame: 00434 | Detected Classes: people, tag | Staff: 12
Frame: 00458 | Detected Classes: people, tag | Staff: 13
Frame: 00464 | Detected Classes: people, tag | Staff: 18
Frame: 00465 | Detected Classes: people, tag | Staff: 19
Frame: 00468 | Detected Classes: people, tag | Staff: 21
Frame: 00469 | Detected Classes: people, tag | Staff: 22
Frame: 00470 | Detected Classes: people, tag | Staff: 23
Frame: 00509 | Detected Classes: people, tag | Staff: 25
Frame: 00510 | Detected Classes: people, tag | Staff: 25
Frame: 00557 | Detected Classes: peopl

In [37]:
for frame in sorted(staff_occur.keys()):
    x, y = staff_occur[frame]
    print(f"Frame: {frame} | Position: ({x}, {y})")

Frame: 347 | Position: (390, 588)
Frame: 378 | Position: (511, 86)
Frame: 379 | Position: (521, 85)
Frame: 380 | Position: (525, 85)
Frame: 381 | Position: (534, 87)
Frame: 384 | Position: (534, 130)
Frame: 385 | Position: (535, 136)
Frame: 386 | Position: (537, 140)
Frame: 430 | Position: (524, 606)
Frame: 431 | Position: (525, 600)
Frame: 432 | Position: (533, 592)
Frame: 433 | Position: (545, 560)
Frame: 434 | Position: (562, 535)
Frame: 458 | Position: (447, 84)
Frame: 459 | Position: (453, 83)
Frame: 461 | Position: (451, 82)
Frame: 462 | Position: (457, 79)
Frame: 463 | Position: (474, 80)
Frame: 464 | Position: (489, 82)
Frame: 465 | Position: (496, 96)
Frame: 466 | Position: (500, 100)
Frame: 468 | Position: (512, 119)
Frame: 469 | Position: (519, 127)
Frame: 470 | Position: (523, 152)
Frame: 471 | Position: (526, 164)
Frame: 506 | Position: (517, 632)
Frame: 508 | Position: (533, 582)
Frame: 509 | Position: (538, 574)
Frame: 510 | Position: (546, 564)
Frame: 511 | Position: (5