In [7]:
#%%
import os

from glob import glob
from zipfile import ZipFile
import xmltodict
import numpy as np
import pandas as pd
import torch.nn as nn
from torchvision.io import read_video
import torch

# Root folders: annotation archives (.zip) and the raw video recordings.
ANNO_DIR = '/workspace/pvc-meteor/downloads/Video XML Annotations/'
VID_DIR = '/workspace/pvc-meteor/Raw_Videos/'

# A single recording used to try out the helpers below.
TEST_NAME = 'REC_1970_01_01_07_40_16_F.MP4'

TEST_VID = os.path.join(VID_DIR, TEST_NAME)
# The annotation archive carries the video's base name with a .zip extension.
TEST_ANNO = os.path.join(ANNO_DIR, os.path.splitext(TEST_NAME)[0] + '.zip')


In [11]:
def get_bbx_coordinates(zip_path):
    """
    Read the per-frame bounding boxes of every annotated agent from an
    annotation archive.

    The archive must contain a member named 'annotations.xml'. Each 'track'
    element in that file describes one agent and produces one tensor.

    Args:
        zip_path: path to the zip archive that holds 'annotations.xml'.

    Returns:
        list of tensors, one per track, each of shape (stop_frame + 1, 4).
        Row f holds (xtl, ytl, xbr, ybr) of the box annotated for frame f.
        Rows of frames without a box stay all-zero, which downstream code
        treats as "no bounding box, hence no mask". The list is empty when
        the file contains no track.

    Raises:
        KeyError: if 'annotations.xml' is missing from the archive or the
            expected meta/task/stop_frame entry is absent.
        IndexError: if a box references a frame beyond stop_frame.

    NOTE(review): the layout looks like CVAT's video XML, where a box can
    carry an 'outside' flag; such boxes are not filtered here -- confirm
    whether they should be.
    """
    # Context manager so the archive handle is closed even if parsing fails.
    with ZipFile(zip_path) as zip_file:
        xml_file = xmltodict.parse(zip_file.read('annotations.xml'))['annotations']

    # stop_frame is the index of the last frame, hence "+ 1" rows below.
    last_frame = int(xml_file['meta']['task']['stop_frame'])

    # xmltodict yields a dict for a single child, a list for repeated
    # children and no key at all when the child is absent; normalize to a list.
    tracks = xml_file.get('track') or []
    if not isinstance(tracks, list):
        tracks = [tracks]

    out_list = []
    for track in tracks:
        track_tensor = torch.zeros((last_frame + 1, 4))

        # Same single-child normalization for the boxes of this track.
        boxes = track.get('box') or []
        if not isinstance(boxes, list):
            boxes = [boxes]

        for box in boxes:
            track_tensor[int(box['@frame'])] = torch.tensor(
                [float(box[key]) for key in ('@xtl', '@ytl', '@xbr', '@ybr')]
            )

        out_list.append(track_tensor)
    return out_list

# Try the parser on the test recording's annotation archive; the result is a
# list with one (nr_frames, 4) tensor per annotated track.
test_bbx_coordinates = get_bbx_coordinates(TEST_ANNO)

In [None]:
def apply_pos_masks(vid_name, bbx_coordinates, nr_frames=64, start_frame=0):
    """
    Black out the bounding box of each track in the video ("positive" mask:
    the area inside the box is removed, i.e. set to 0).

    Args:
        vid_name: path of the video, passed to torchvision's read_video.
        bbx_coordinates: iterable of tracks; each track holds one
            (xtl, ytl, xbr, ybr) row per frame. An all-zero row means
            "no box" and leaves that frame untouched.
        nr_frames: currently unused; kept for interface compatibility.
        start_frame: currently unused; kept for interface compatibility.

    Returns:
        list with one masked copy of the video per track, each shaped like
        the decoded video (T, C, H, W). The decoded frames themselves are
        not modified.

    Coordinates are truncated to integers with int() and negative values are
    clamped to 0 so they cannot wrap around in the slice. Only frames present
    in both the track and the decoded video are processed.
    """
    frames, _, _ = read_video(vid_name, pts_unit='sec', output_format='TCHW')
    nr_video_frames = frames.shape[0]

    out_list = []

    for track in bbx_coordinates:
        # Zeroing regions in a copy gives the same result as multiplying by a
        # 0/1 mask, without allocating a separate full-size mask.
        video_masked = frames.clone()

        # The annotation may cover more frames than were decoded; walking only
        # the overlap avoids indexing past the end of the video.
        boxes = torch.as_tensor(track)[:nr_video_frames].tolist()

        for t, box in enumerate(boxes):
            xtl, ytl, xbr, ybr = (max(int(coordinate), 0) for coordinate in box)

            # An all-zero box yields an empty slice, so nothing is removed.
            video_masked[t, :, ytl:ybr, xtl:xbr] = 0

        out_list.append(video_masked)

    return out_list

# Try the masking on the test recording: one masked video per annotated track.
# NOTE(review): 'tet_pos_mask' looks like a typo for 'test_pos_mask'; the name
# is kept because later cells may refer to it.
tet_pos_mask = apply_pos_masks(TEST_VID, test_bbx_coordinates)

In [17]:
# Toy check of the masking idea: multiplying element-wise by a 0/1 array
# zeroes exactly the entries where that array is 0.
b = np.ones((3, 3))
a = np.ones_like(b)
a[1, 1] = 0
a * b

array([[1., 1., 1.],
       [1., 0., 1.],
       [1., 1., 1.]])

In [None]:
def apply_pos_masks(vid_name, bbx_coordinates, nr_frames=64, start_frame=0):
    """
    Keep only the bounding box region of each track; everything outside the
    box is set to 0.

    NOTE(review): this definition reuses the name of the earlier
    apply_pos_masks and shadows it once this cell runs, although it builds the
    inverse mask (zeros outside, ones inside the box). Consider giving it its
    own name.

    Args:
        vid_name: path of the video, passed to torchvision's read_video.
        bbx_coordinates: iterable of tracks; each track holds one
            (xtl, ytl, xbr, ybr) row per frame. An all-zero row means
            "no box", so that frame ends up entirely 0.
        nr_frames: currently unused; kept for interface compatibility.
        start_frame: currently unused; kept for interface compatibility.

    Returns:
        list with one masked video per track, each shaped like the decoded
        video (T, C, H, W). The decoded frames themselves are not modified.

    Coordinates are truncated to integers with int() and negative values are
    clamped to 0 so they cannot wrap around in the slice. Only frames present
    in both the track and the decoded video are processed; frames without an
    annotation row stay 0.
    """
    frames, _, _ = read_video(vid_name, pts_unit='sec', output_format='TCHW')
    nr_video_frames = frames.shape[0]

    out_list = []

    for track in bbx_coordinates:
        # Start from an all-zero video and copy the box contents in; this is
        # the same as multiplying by a mask that is 1 inside the box only.
        video_masked = torch.zeros_like(frames)

        # Iterate over the rows of this track (one per frame), limited to the
        # frames that were actually decoded.
        boxes = torch.as_tensor(track)[:nr_video_frames].tolist()

        for t, box in enumerate(boxes):
            xtl, ytl, xbr, ybr = (max(int(coordinate), 0) for coordinate in box)

            # Keep the area inside the bounding box.
            video_masked[t, :, ytl:ybr, xtl:xbr] = frames[t, :, ytl:ybr, xtl:xbr]

        out_list.append(video_masked)

    return out_list
# %%
