# Tracking using YOLO & ByteTrack

- import packages and set up YOLO

In [1]:
from collections import defaultdict
import cv2
import numpy as np

from ultralytics import YOLO

!yolo checks

[2K
[2K


Ultralytics YOLOv8.0.200  Python-3.9.7 torch-1.11.0 CUDA:0 (NVIDIA GeForce RTX 3060, 12288MiB)
Setup complete  (12 CPUs, 15.8 GB RAM, 391.2/452.6 GB disk)

OS                  Windows-10-10.0.22621-SP0
Environment         Windows
Python              3.9.7
Install             git
RAM                 15.82 GB
CPU                 12th Gen Intel Core(TM) i5-12400F
CUDA                11.3

matplotlib           3.4.3>=3.3.0
numpy                1.22.4>=1.22.2
opencv-python        4.8.0.76>=4.6.0
pillow               8.4.0>=7.1.2
pyyaml               6.0>=5.3.1
requests             2.26.0>=2.23.0
scipy                1.7.1>=1.4.1
torch                1.11.0>=1.8.0
torchvision          0.12.0>=0.9.0
tqdm                 4.66.1>=4.64.0
pandas               1.3.4>=1.1.4
seaborn              0.11.2>=0.11.0
psutil               5.8.0
py-cpuinfo           9.0.0
thop                 0.1.1-2209072238>=0.1.1


## 트래킹 함수 정의 및 실행

- Ultralytics의 [track plotting](https://docs.ultralytics.com/modes/track/#plotting-tracks-over-time) 코드를 참고하였음
- 출력된 추론 결과는 12th Gen Intel Core(TM) i5-12400F, NVIDIA GeForce RTX 3060 (12,288MiB)의 환경에서 진행된 결과임.

In [2]:
def tracking(weight: str, video: str, plot_bbox: bool = False, save: bool = False,
             save_as: str = 'output.avi', mot16: bool = False) -> None:
    """Run YOLO-based ByteTrack tracking on a video and draw object trajectories.

    Every processed frame is shown in a display window titled
    "YOLOv8 Tracking"; pressing 'q' stops processing early. Optionally the
    annotated video and/or a MOT16-style annotation text file are written.

    Args:
        weight: Path to the YOLO weight file loaded with ``YOLO(weight)``.
            (Original author's note: the weights must correspond to YOLOv8n.)
        video: Path of the input video to detect and track on.
        plot_bbox: Whether bounding boxes are drawn on the annotated frame
            (forwarded to ``Results.plot(boxes=...)``).
        save: Whether to write the annotated video to ``save_as``.
        save_as: Output video path used when ``save`` is true. The video is
            encoded with the 'DIVX' fourcc; the original author requires an
            ``.avi`` extension.
        mot16: Whether to write per-frame tracking results to
            ``mot16_tracking_results.txt`` in the current directory, one line
            per tracked box:
            ``frame,track_id,left,top,width,height,conf,class,-1,-1``.

    Returns:
        None. All results are delivered through the display window and the
        optional output files.
    """
    # Load the YOLOv8 model
    model = YOLO(weight)

    # Open the video file
    cap = cv2.VideoCapture(video)

    # Optional outputs; they stay None unless the matching flag is set, so the
    # cleanup in ``finally`` knows what was actually opened.
    out = None
    mot16_file = None

    # Divisor applied to the width/height of class-1 boxes before they are
    # written to the MOT16 file.
    # NOTE(review): per the original comment this mirrors an expansion factor
    # defined in the 'update' method of the BYTETracker class
    # (byte_tracker.py) -- confirm the two values stay in sync.
    expansion_factor = 3

    # Per-track history of the points drawn as a trajectory
    track_history = defaultdict(list)

    # 1-based frame number used in the MOT16 annotation file
    fnum = 1

    try:
        # Video writer, only when 'save' is true
        if save:
            width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            fps = cap.get(cv2.CAP_PROP_FPS)

            fourcc = cv2.VideoWriter_fourcc(*'DIVX')
            out = cv2.VideoWriter(save_as, fourcc, fps, (width, height))

        # MOT16 annotation file, only when 'mot16' is true
        if mot16:
            mot16_file_path = 'mot16_tracking_results.txt'
            mot16_file = open(mot16_file_path, 'w')

        # Loop through the video frames
        while cap.isOpened():
            success, frame = cap.read()
            if not success:
                # End of the video (or a read failure)
                break

            # Run YOLOv8 tracking on the frame, persisting tracks between frames
            results = model.track(frame, persist=True, tracker='bytetrack.yaml')
            detections = results[0].boxes

            if detections.id is None:
                # No tracked object in this frame: there are no track IDs, so
                # nothing is written or drawn, but the frame is still
                # shown/saved and counted.
                boxes, track_ids, confs, clss = [], [], [], []
            else:
                boxes = detections.xywh.cpu()
                track_ids = detections.id.int().cpu().tolist()
                confs = detections.conf.cpu()
                clss = detections.cls.cpu()

            # Visualize the results on the frame
            # https://docs.ultralytics.com/reference/engine/results/#ultralytics.engine.results.Results.numpy
            annotated_frame = results[0].plot(boxes=plot_bbox)

            for box, track_id, conf, cls in zip(boxes, track_ids, confs, clss):
                x, y, w, h = box  # box center x/y, width, height

                # Write one MOT16-format annotation line for this box
                if mot16_file is not None:
                    mot_w, mot_h = w, h
                    if cls == 1:
                        mot_w, mot_h = (w / expansion_factor), (h / expansion_factor)
                    # Convert the center point to the top-left corner
                    left, top = x - (mot_w / 2), y - (mot_h / 2)
                    mot16_file.write(
                        f"{fnum},{track_id},{left},{top},{mot_w},{mot_h},{conf},{int(cls)},-1,-1\n"
                    )

                track = track_history[track_id]

                if cls == 0:
                    # Players: trail of points shifted down from the box
                    # center towards the feet.
                    track.append((float(x), float(y + 0.4*h)))
                    if len(track) > 50:  # keep only the last 50 points
                        track.pop(0)
                    for i, (x_, y_) in enumerate(track):
                        cv2.ellipse(annotated_frame, center=(int(x_), int(y_)),
                                    axes=(int(0.1*w), int(0.05*w)),
                                    angle=0, startAngle=0, endAngle=360,
                                    color=(255 - 2*i, 200 - 5*i, 0), thickness=int(i/5))
                else:
                    # Any other class (the tennis ball): trail of box centers
                    track.append((float(x), float(y)))
                    if len(track) > 20:  # keep only the last 20 points
                        track.pop(0)
                    for i, (x_, y_) in enumerate(track):
                        cv2.circle(annotated_frame, center=(int(x_), int(y_)), radius=int(i/3),
                                   color=(255 - 10*i, 250, 250), thickness=2)

            # Video write
            if out is not None:
                out.write(annotated_frame)

            # Display the annotated frame
            cv2.imshow("YOLOv8 Tracking", annotated_frame)

            fnum += 1

            # Break the loop if 'q' is pressed
            if cv2.waitKey(1) & 0xFF == ord("q"):
                break
    finally:
        # Always release every resource, even if processing raised part-way.
        cap.release()
        cv2.destroyAllWindows()

        if out is not None:
            out.release()

        if mot16_file is not None:
            mot16_file.close()

In [3]:
# Run tracking on the TENNIS-HARD test sequence and save the annotated video.
# plot_bbox and mot16 keep their defaults (False), so no bounding boxes are
# drawn and no MOT16 annotation file is written.
tracking(
    weight='weights/best.pt',
    video='evaluation/test_sequence/TENNIS-HARD.mp4',
    save=True,
    save_as='TENNIS-HARD_trajectory.avi',
)


0: 384x640 2 players, 12.0ms
Speed: 3.0ms preprocess, 12.0ms inference, 4.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 8.5ms
Speed: 2.5ms preprocess, 8.5ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 8.0ms
Speed: 1.0ms preprocess, 8.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 1 tennis_ball, 9.0ms
Speed: 1.0ms preprocess, 9.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 1 tennis_ball, 8.0ms
Speed: 2.0ms preprocess, 8.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 1 tennis_ball, 9.0ms
Speed: 2.0ms preprocess, 9.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 1 tennis_ball, 8.0ms
Speed: 2.0ms preprocess, 8.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 1 tennis_ball, 8.0ms
Speed: 1.0ms prepro

Speed: 1.0ms preprocess, 9.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 1 tennis_ball, 10.0ms
Speed: 1.0ms preprocess, 10.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 1 tennis_ball, 9.0ms
Speed: 1.0ms preprocess, 9.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 1 tennis_ball, 8.0ms
Speed: 1.0ms preprocess, 8.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 1 tennis_ball, 8.0ms
Speed: 1.0ms preprocess, 8.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 1 tennis_ball, 9.0ms
Speed: 1.0ms preprocess, 9.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 1 tennis_ball, 11.0ms
Speed: 2.0ms preprocess, 11.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 1 tennis_ball, 8.0ms
Speed: 1.0ms pre


0: 384x640 2 players, 1 tennis_ball, 8.0ms
Speed: 1.0ms preprocess, 8.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 1 tennis_ball, 7.0ms
Speed: 2.0ms preprocess, 7.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 1 tennis_ball, 8.0ms
Speed: 1.0ms preprocess, 8.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 1 tennis_ball, 7.0ms
Speed: 2.0ms preprocess, 7.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 1 tennis_ball, 8.0ms
Speed: 1.0ms preprocess, 8.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 1 tennis_ball, 7.0ms
Speed: 2.0ms preprocess, 7.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 1 tennis_ball, 7.0ms
Speed: 2.0ms preprocess, 7.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 player


0: 384x640 2 players, 1 tennis_ball, 9.0ms
Speed: 2.0ms preprocess, 9.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 1 tennis_ball, 8.5ms
Speed: 1.0ms preprocess, 8.5ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 1 tennis_ball, 9.0ms
Speed: 2.0ms preprocess, 9.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 1 tennis_ball, 9.0ms
Speed: 1.0ms preprocess, 9.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 1 tennis_ball, 9.0ms
Speed: 2.0ms preprocess, 9.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 8.0ms
Speed: 2.0ms preprocess, 8.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 1 tennis_ball, 9.0ms
Speed: 1.0ms preprocess, 9.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 1 tennis_bal


0: 384x640 2 players, 1 tennis_ball, 10.1ms
Speed: 3.0ms preprocess, 10.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 1 tennis_ball, 10.0ms
Speed: 2.0ms preprocess, 10.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 1 tennis_ball, 11.0ms
Speed: 1.0ms preprocess, 11.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 1 tennis_ball, 10.5ms
Speed: 2.0ms preprocess, 10.5ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 1 tennis_ball, 10.0ms
Speed: 1.0ms preprocess, 10.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 1 tennis_ball, 10.0ms
Speed: 1.0ms preprocess, 10.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 1 tennis_ball, 10.0ms
Speed: 2.0ms preprocess, 10.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 38

Speed: 1.0ms preprocess, 11.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 1 tennis_ball, 11.5ms
Speed: 2.0ms preprocess, 11.5ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 1 tennis_ball, 11.0ms
Speed: 1.0ms preprocess, 11.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 1 tennis_ball, 11.0ms
Speed: 1.0ms preprocess, 11.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 1 tennis_ball, 11.0ms
Speed: 2.0ms preprocess, 11.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 1 tennis_ball, 11.0ms
Speed: 2.0ms preprocess, 11.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 1 tennis_ball, 11.0ms
Speed: 3.0ms preprocess, 11.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 1 tennis_ball, 11.0ms
Speed:


0: 384x640 3 players, 1 tennis_ball, 21.0ms
Speed: 2.0ms preprocess, 21.0ms inference, 3.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 players, 1 tennis_ball, 19.0ms
Speed: 4.0ms preprocess, 19.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 1 tennis_ball, 18.0ms
Speed: 2.0ms preprocess, 18.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 1 tennis_ball, 16.0ms
Speed: 5.0ms preprocess, 16.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)
