In [46]:
import os
import cv2
import numpy as np
import pandas as pd
import torch
import math
from ultralytics import YOLO
from torch.utils.data import Dataset, DataLoader
from deep_sort_realtime.deepsort_tracker import DeepSort

In [47]:
# References: 
# https://learnopencv.com/real-time-deep-sort-with-torchvision-detectors/#Real-Time-Deep-SORT-Setup
# https://pypi.org/project/deep-sort-realtime/

In [48]:
# All three locations live under the same thesis "Code" directory, so the
# shared prefix is spelled out once and the sub-paths are appended to it.
# NOTE(review): these are absolute, machine-specific Windows paths.
project_dir = r'C:\Users\janni\OneDrive\Dokumente\Privat\Bildung\M. Sc. Social and Economic Data Science\4. Semester\Master Thesis\Code'

# Folder containing the raw input videos.
raw_video_folder = project_dir + r'\data\raw\videos'
# Weights file of the custom YOLO model.
yolo_path = project_dir + r'\models\costumized_yolo\costumized_yolo\costumized_yolo.pt'
# Folder that receives the processed tensors.
output_folder = project_dir + r'\data\processed'

In [None]:
def get_swarm_data(frame, model, tracker, frame_idx):
    """Detect agents in one frame, update the tracker and return one row per confirmed track.

    Parameters
    ----------
    frame : image passed unchanged to ``model`` and to ``tracker.update_tracks``.
    model : callable detector; ``model(frame)[0].boxes`` must expose ``xyxy``,
        ``conf`` and ``cls`` tensors, and ``model.names`` maps class index -> label.
    tracker : object with ``update_tracks(detections, frame=...)`` returning tracks
        that provide ``is_confirmed()``, ``track_id``, ``mean``, ``det_class``
        and ``det_conf``.
    frame_idx : index of the frame, stored in the ``frame`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``frame, track_id, label, conf, x, y, vx, vy, speed, angle``.
        The columns are always present, even when no track is confirmed, so
        downstream ``groupby("track_id")`` calls do not fail on empty frames.
    """
    columns = ["frame", "track_id", "label", "conf", "x", "y", "vx", "vy", "speed", "angle"]

    # YOLO inference; only the first result is used.
    boxes = model(frame)[0].boxes
    detections = []

    for box, score, cls in zip(boxes.xyxy.cpu().numpy(), boxes.conf.cpu().numpy(), boxes.cls.cpu().numpy()):
        label = model.names[int(cls)]

        # Only these two classes are forwarded to the tracker.
        if label not in ("Prey", "Predator Head"):
            continue

        # Convert corner format (x1, y1, x2, y2) to (left, top, width, height).
        x1, y1, x2, y2 = box
        detections.append(([x1, y1, x2 - x1, y2 - y1], float(score), label))

    tracks = tracker.update_tracks(detections, frame=frame)

    records = []
    for t in tracks:
        if not t.is_confirmed():
            continue

        # NOTE(review): assumes the track state vector stores the centre at
        # indices 0/1 and its velocity at indices 4/5 - confirm against the tracker.
        cx, cy = t.mean[0], t.mean[1]
        vx, vy = float(t.mean[4]), float(t.mean[5])

        # det_conf can be None (presumably when the track was not matched to a
        # detection in this frame); store NaN so the column stays numeric.
        conf = t.det_conf
        conf = float(conf) if conf is not None else float("nan")

        records.append({
            "frame":    int(frame_idx),
            "track_id": int(t.track_id),
            "label":    str(t.det_class),
            "conf":     conf,
            "x":        float(cx),
            "y":        float(cy),
            "vx":       vx,
            "vy":       vy,
            "speed":    float(np.hypot(vx, vy)),
            # Heading in degrees, derived from the velocity vector.
            "angle":    float(np.degrees(np.arctan2(vy, vx))),
        })

    return pd.DataFrame(records, columns=columns)

In [None]:
def get_quality_score(df, n=10):
    """Summarise tracking quality for one clip and print the three metrics.

    Parameters
    ----------
    df : DataFrame with at least the columns ``track_id``, ``frame`` and ``conf``.
    n : number of distinct frames a track must appear in to count as "full".

    Returns
    -------
    tuple
        ``(mean_track_visibility, num_full_tracks, mean_confidence)`` where the
        first is the mean number of distinct frames per track, the second the
        number of tracks seen in exactly ``n`` distinct frames, and the third
        the mean of the ``conf`` column.
    """
    # Distinct frames in which each track appears.
    frames_per_track = df.groupby("track_id")["frame"].nunique()

    scores = (
        frames_per_track.mean(),
        frames_per_track.eq(n).sum(),
        df["conf"].mean(),
    )
    mean_track_visibility, num_full_tracks, mean_confidence = scores

    print(f"Mean track visibility: {mean_track_visibility}")
    print(f"Number of full tracks: {num_full_tracks}")
    print(f"Mean confidence: {mean_confidence}")

    return scores

In [51]:
def get_full_tracks(df, min_frames):
    """Keep only the rows of tracks that appear in exactly ``min_frames`` distinct frames.

    Parameters
    ----------
    df : DataFrame with the columns ``track_id`` and ``frame``.
    min_frames : required number of distinct frames per track (exact match).

    Returns
    -------
    DataFrame
        The rows of ``df`` whose ``track_id`` meets the requirement, with the
        original index preserved.
    """
    frames_per_track = df.groupby("track_id")["frame"].nunique()
    is_complete = (frames_per_track == min_frames).to_numpy()
    complete_ids = frames_per_track.index[is_complete]

    keep_rows = df["track_id"].isin(complete_ids)
    return df[keep_rows]

In [None]:
def get_tensor(df):
    """Build a tensor of pairwise relative-motion features for one clip.

    For every frame and every focal track ``i``, each other track ``j``
    contributes the 4 features ``[dx, dy, vx, vy]``:

    * ``(dx, dy)`` is the position of ``j`` minus the position of ``i``
      (not rotated),
    * ``(vx, vy)`` is the velocity of ``j`` rotated by minus the focal
      track's ``angle`` (given in degrees).

    Parameters
    ----------
    df : DataFrame with the columns ``frame``, ``track_id``, ``x``, ``y``,
        ``vx``, ``vy`` and ``angle``. Every track must be present in every frame.

    Returns
    -------
    torch.Tensor
        ``float32`` tensor of shape ``[T, N, N-1, 4]`` with ``T`` distinct
        frames and ``N`` distinct tracks (both in sorted order). For input
        without any track an empty tensor of shape ``[T, 0, 0, 4]`` is returned.

    Raises
    ------
    IndexError
        If a track has no row in one of the frames.
    """
    # Collect the unique frame indices and track ids.
    frame_indices = sorted(df["frame"].unique())
    track_ids = sorted(df["track_id"].unique())
    T = len(frame_indices)
    N = len(track_ids)

    # Without tracks, N-1 would be negative and torch.zeros would fail.
    if N == 0:
        return torch.zeros((T, 0, 0, 4), dtype=torch.float32)

    # Result tensor: [T, N, N-1, 4] holding [dx, dy, vx, vy].
    relative_motion_tensor = torch.zeros((T, N, N - 1, 4), dtype=torch.float32)

    for t_idx, frame_id in enumerate(frame_indices):
        frame_df = df[df["frame"] == frame_id].set_index("track_id")

        missing = [tid for tid in track_ids if tid not in frame_df.index]
        if missing:
            raise IndexError(f"frame {frame_id} has no row for track_id(s) {missing}")

        # One indexed lookup puts all tracks into the sorted track order,
        # instead of one boolean-mask scan per track.
        ordered = frame_df.reindex(track_ids)
        positions = ordered[["x", "y"]].to_numpy(dtype=np.float64)       # (N, 2)
        velocities = ordered[["vx", "vy"]].to_numpy(dtype=np.float64)    # (N, 2)

        # Heading of each track in radians.
        angles_rad = np.deg2rad(ordered["angle"].to_numpy(dtype=np.float64))

        for i in range(N):
            # Relative position = neighbour position - focal position.
            delta_positions = np.delete(positions, i, axis=0) - positions[i]  # (N-1, 2)

            # Raw velocity vectors of all other tracks.
            neighbor_velocities = np.delete(velocities, i, axis=0)            # (N-1, 2)

            # Rotation by -angle maps into the focal track's coordinate frame.
            cos_theta = np.cos(-angles_rad[i])
            sin_theta = np.sin(-angles_rad[i])
            rotation_matrix = np.array([[cos_theta, -sin_theta],
                                        [sin_theta,  cos_theta]])

            # Neighbour velocities expressed in the focal coordinate frame.
            rotated_neighbor_velocities = neighbor_velocities @ rotation_matrix.T  # (N-1, 2)

            # 4-dim feature per neighbour: [dx, dy, vx, vy].
            edge_features = np.hstack([delta_positions, rotated_neighbor_velocities])  # (N-1, 4)

            relative_motion_tensor[t_idx, i] = torch.from_numpy(edge_features.astype(np.float32))

    return relative_motion_tensor


In [53]:
class ClipDataset(Dataset):
    """Dataset wrapper around an in-memory sequence of clips."""

    def __init__(self, clips_list):
        """Store ``clips_list`` as-is; it is not copied."""
        self.clips = clips_list

    def __len__(self):
        """Return the number of stored clips."""
        clip_count = len(self.clips)
        return clip_count

    def __getitem__(self, idx):
        """Return the clip stored at position ``idx``."""
        clip = self.clips[idx]
        return clip

In [54]:
video = "video_5s"  # 58 min (original author note; meaning not documented - TODO confirm)
# os.path.join instead of manual "\\" concatenation so the separator is
# always the one the running platform expects.
video_path = os.path.join(raw_video_folder, video + ".mp4")
cap = cv2.VideoCapture(video_path)

# Fail early with a clear message; otherwise an unreadable file only shows
# up as a ZeroDivisionError in the duration computation below.
if not cap.isOpened():
    cap.release()
    raise FileNotFoundError(f"Could not open video: {video_path}")

fps = cap.get(cv2.CAP_PROP_FPS)
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
# Guard against a reported frame rate of 0.
duration = total_frames / fps if fps > 0 else float("nan")

print(f"FPS: {fps}")
print(f"Total frames: {total_frames}")
print(f"Duration (s): {duration:.2f}")
# Agent counts are hard-coded for this video, not derived from the data.
print("Prey Count: 32")
print("Predator Count: 1")

FPS: 30.0
Total frames: 153
Duration (s): 5.10
Prey Count: 32
Predator Count: 1


In [55]:
# NOTE(review): `device` is computed here but not referenced again in this cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = YOLO(yolo_path)
tracker = DeepSort(max_age=5)

num_agents = 33  # 32 Prey + 1 Predator
# A clip is kept only if at least 90% of the agents are tracked in every frame.
threshold = math.floor(num_agents * 0.9)
video_size = 10  # number of frames per clip

video_idx = 0
frame_idx = 0
tensor_data = []

# try/finally guarantees the capture is released even if detection or
# tracking raises part-way through the video.
try:
    while True:
        # Read the next chunk of up to `video_size` frames.
        frames = []
        for i in range(video_size):
            success, frame = cap.read()
            if not success:
                break
            frames.append(frame)

        if not frames:
            break

        results = []

        for frame in frames:
            df = get_swarm_data(frame, model, tracker, frame_idx)
            results.append(df)
            frame_idx += 1

        combined_df = pd.concat(results, ignore_index=True)
        video_idx += 1

        # No confirmed track in the whole chunk: nothing to score, and the
        # quality metrics would fail on a frame without the expected columns.
        if combined_df.empty:
            continue

        mean_track_visibility, num_full_tracks, mean_confidence = get_quality_score(combined_df, n=video_size)

        if num_full_tracks >= threshold:
            full_tracks_df = get_full_tracks(combined_df, min_frames=video_size)

            if not full_tracks_df.empty:
                tensor_data.append(get_tensor(full_tracks_df))
finally:
    cap.release()


0: 736x736 1 Predator, 1 Predator Head, 33 Preys, 827.7ms
Speed: 21.1ms preprocess, 827.7ms inference, 2.1ms postprocess per image at shape (1, 3, 736, 736)

0: 736x736 1 Predator, 1 Predator Head, 33 Preys, 662.3ms
Speed: 21.5ms preprocess, 662.3ms inference, 2.1ms postprocess per image at shape (1, 3, 736, 736)

0: 736x736 1 Predator, 1 Predator Head, 34 Preys, 678.4ms
Speed: 15.0ms preprocess, 678.4ms inference, 1.7ms postprocess per image at shape (1, 3, 736, 736)

0: 736x736 1 Predator, 1 Predator Head, 35 Preys, 643.6ms
Speed: 17.0ms preprocess, 643.6ms inference, 1.3ms postprocess per image at shape (1, 3, 736, 736)

0: 736x736 1 Predator, 1 Predator Head, 35 Preys, 615.4ms
Speed: 16.4ms preprocess, 615.4ms inference, 2.0ms postprocess per image at shape (1, 3, 736, 736)

0: 736x736 1 Predator, 1 Predator Head, 34 Preys, 609.0ms
Speed: 15.3ms preprocess, 609.0ms inference, 2.4ms postprocess per image at shape (1, 3, 736, 736)

0: 736x736 1 Predator, 1 Predator Head, 34 Preys, 6

In [60]:
# Make sure the target directory exists before writing the tensors.
os.makedirs(output_folder, exist_ok=True)
torch.save(tensor_data, os.path.join(output_folder, f"tensor_data_{video}.pt"))

dataset = ClipDataset(tensor_data)

# NOTE(review): each clip tensor has shape [T, N, N-1, 4] and N depends on how
# many full tracks the clip contains. Clips with different N cannot be stacked
# by the default collate function - confirm N is constant or pass a collate_fn.
dataloader = DataLoader(
    dataset,
    batch_size=8,
    shuffle=True,
    drop_last=True,  # full batches only
    pin_memory=torch.cuda.is_available(),  # pinning is only useful when training on a GPU
)