# Video Demo
Here, we provide a demo on how to use our model to process a video and visualize the tracking results.

We selected a publicly available hip-hop dance video from the internet for this demonstration. You can also use your own custom videos. **Please note that it is crucial to select the appropriate trained MOTIP weights and configuration for different tracking scenarios.**

We process the video on an NVIDIA RTX 3080 Ti, achieving nearly real-time tracking.

### System Environment
1. Modify the root path to the project path.
2. Make sure you have a CUDA device available.

In [22]:
import os
import sys
import torch


# Absolute path of the demo directory; its parent directory is used as the
# project root. NOTE: this path is machine-specific -- edit it for your checkout.
current_file_path = os.path.abspath("/media/RTCIN9TBA/Interns/RDT2/yai3kor/workspace/MOTIP/demo")
parent_dir = os.path.dirname(current_file_path)

# Make the project root importable. The membership check keeps this cell
# idempotent: re-running it no longer appends duplicate sys.path entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Work from the project root so that relative paths resolve against it.
os.chdir(parent_dir)
print(f"Current root path is set to {parent_dir}")

torch_version = torch.__version__
cuda_available = torch.cuda.is_available()

# Fail fast: the model and the input tensors are moved to the GPU with .cuda().
if not cuda_available:
    raise RuntimeError("CUDA is not available")

print(f"Hello! Welcome to use the video process demo. Your torch version is {torch_version} and CUDA is available.")

Current root path is set to /media/RTCIN9TBA/Interns/RDT2/yai3kor/workspace/MOTIP
Hello! Welcome to use the video process demo. Your torch version is 2.3.0 and CUDA is available.


### Prepare your video (.mp4 for example):

In [23]:
# Location of the input video file.
video_path = os.path.join(
    "/media/RTCIN9TBA/Interns/RDT2/yai3kor/workspace/sam2/notebooks/videos",
    "bedroom.mp4",
)
# Directory that receives the output of this demo.
output_path = "/media/RTCIN9TBA/Interns/RDT2/yai3kor/workspace/output/motip/bosch1_hungarian"
# Create the output directory up front; an existing directory is not an error.
os.makedirs(output_path, exist_ok=True)



### Build our model

In [24]:
# Project-local helpers used to build and run the model.
from utils.misc import yaml_to_dict
from models.motip import build as build_model
from models.misc import load_checkpoint
from models.runtime_tracker import RuntimeTracker


# Paths to the YAML configuration and to the checkpoint file.
config_path = "configs/r50_deformable_detr_motip_dancetrack.yaml"
checkpoint_path = "/media/RTCIN9TBA/Interns/RDT2/yai3kor/workspace/r50_deformable_detr_motip_dancetrack.pth"
# Inference precision: torch.float32 or torch.float16 (float16 is picked for faster inference).
dtype = torch.float16

# Parse the configuration, build the model from it, then load the weights into it.
config = yaml_to_dict(config_path)
model, _ = build_model(config)
load_checkpoint(model, checkpoint_path)

# Inference setup: eval mode, move to the GPU, and cast to half precision when requested.
model.eval()
model = model.cuda()
if dtype == torch.float16:
    model.half()

print("Model built successfully.")

Model built successfully.


In [25]:
# Sanity check: print the current working directory via the shell.
!pwd


/media/RTCIN9TBA/Interns/RDT2/yai3kor/workspace/MOTIP


In [26]:
# Report the dtype of the model's first parameter.
print(next(model.parameters()).dtype)


torch.float16


### Process the video

In [27]:
import os
import cv2
import torch
from tqdm import tqdm
from torchvision.transforms import functional as F
from utils.nested_tensor import nested_tensor_from_tensor_list
from demo.colormap import get_color


def simple_transform(image, max_shorter, max_longer, image_dtype):
    """Convert one frame into a normalized tensor on the CUDA device.

    Args:
        image: Frame in any form accepted by ``F.to_tensor``.
            NOTE(review): the mean/std used below look like the usual ImageNet
            RGB statistics, so RGB channel order is presumably expected --
            confirm against the caller.
        max_shorter: Forwarded to ``F.resize`` as ``size``.
        max_longer: Forwarded to ``F.resize`` as ``max_size``.
        image_dtype: Desired dtype of the result; a cast is applied only when
            this is not ``torch.float32``.

    Returns:
        The resized, normalized tensor after ``.cuda()``.
    """
    tensor = F.to_tensor(image)
    tensor = F.resize(tensor, size=max_shorter, max_size=max_longer)
    tensor = F.normalize(
        tensor,
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    )
    # Normalization runs before the cast; the dtype change happens last, on the CPU.
    needs_cast = image_dtype != torch.float32
    return (tensor.to(image_dtype) if needs_cast else tensor).cuda()


# === INPUT CONFIGURATION ===
frames_dir = "/media/RTCIN9TBA/Interns/RDT2/yai3kor/workspace/data/BOSCH/LB-UH_103_20181116_145538_005"         # directory with frames
# Precision of the input tensors (torch.float32 or torch.float16). It should
# match the precision the model was cast to, since inputs are cast to this dtype.
dtype = torch.float16

# === FRAME COLLECTION ===
# Frames are processed in lexicographic file-name order, so the names are
# assumed to sort in temporal order (e.g. zero-padded frame indices).
frame_files = sorted(
    os.path.join(frames_dir, f)
    for f in os.listdir(frames_dir)
    if f.lower().endswith((".jpg", ".png"))
)

if not frame_files:
    raise RuntimeError(f"No frames found in: {frames_dir}")

# === INIT TRACKER ===
# The first frame is only read to obtain the sequence resolution.
sample_frame = cv2.imread(frame_files[0])
if sample_frame is None:
    # cv2.imread signals an unreadable file by returning None instead of
    # raising; fail with a clear message rather than an AttributeError below.
    raise RuntimeError(f"Could not read frame: {frame_files[0]}")
height, width = sample_frame.shape[:2]

runtime_tracker = RuntimeTracker(
    model=model,
    sequence_hw=(height, width),
    assignment_protocol="hungarian",
    miss_tolerance=30,
    det_thresh=0.4,
    newborn_thresh=0.5,
    id_thresh=0.6,
    dtype=dtype,
)

# === PROCESS FRAMES ===
for frame_path in tqdm(frame_files, desc="Tracking objects", unit="frame"):
    frame = cv2.imread(frame_path)
    if frame is None:
        # Skip unreadable frames instead of aborting the whole sequence.
        print(f"Could not read frame: {frame_path}")
        continue

    # Prepare tensor input.
    # OpenCV decodes images in BGR order, while simple_transform normalizes
    # with RGB channel statistics. Convert a copy for the model and keep the
    # original BGR frame for drawing and saving with OpenCV.
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    frame_tensor = simple_transform(frame_rgb, max_shorter=800, max_longer=1440, image_dtype=dtype)
    frame_tensor = nested_tensor_from_tensor_list([frame_tensor])

    # Run tracker. Both the update (which receives the input tensor) and the
    # result query run with gradient tracking disabled; this is inference only.
    with torch.no_grad():
        runtime_tracker.update(frame_tensor)
        track_results = runtime_tracker.get_track_results()

    # Annotate the frame in memory: each box is unpacked as (x, y, w, h) and
    # drawn from its top-left corner, with the track ID as a label above it.
    for bbox, obj_id in zip(track_results["bbox"], track_results["id"]):
        x, y, w, h = map(int, bbox)
        color = get_color(obj_id, rgb=False, use_int=True)
        cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
        cv2.putText(frame, f"ID: {obj_id}", (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

    # Save the annotated frame under its original file name in the output directory.
    frame_name = os.path.basename(frame_path)
    save_path = os.path.join(output_path, frame_name)
    cv2.imwrite(save_path, frame)
print("Done processing all frames.")


Tracking objects: 100%|██████████| 927/927 [06:36<00:00,  2.34frame/s]

Done processing all frames.





In [28]:
# Show the sequence resolution (height, then width) read from the first frame.
print(f"{height} {width}")

2160 3840
