# **Study Project:** *Transformer model for prediction of grasping movements*

## Import packages

In [1]:
import os
import sys
import zipfile
import cv2
import numpy as np
import tensorflow as tf
import IPython.display as ipd
#!pip install ultralytics
#ipd.clear_output()
import ultralytics

2023-09-07 11:38:58.967927: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Helper functions

In [2]:
def GetBoundingBoxes(videopath):
    """
    Extract bounding boxes from video.

    Args:
    videopath -- path to video to extract bounding boxes from.
    """
    
    # Use object tracker to get the bounding boxes and classIDs in a 'results' object
    bbs = np.zeros((1, 7))
    # It is also possible to pass the whole folder as path,
    # but we still want the flexibility to access single videos
    results = model.track(source=videopath, tracker="bytetrack.yaml")
    
    # Get class names
    classes = results[0].names
    
    # Iterate through each frame of a video to get all bounding boxes for a frame
    for frame in range(len(results)):
    
        # x_center, y_center, bbwidth, bbheight of bbs of this frame
        xywh = results[frame].boxes.xywh.detach().cpu().numpy()
        n = len(xywh) # number of bounding boxes
        cls = results[frame].boxes.cls.detach().cpu().numpy().reshape((n,1))
        # if the object tracker is currently tracking at least one object, save the trackingID for that object, else fill with -1 placeholder
        trackingID = results[frame].boxes.id.detach().cpu().numpy().reshape((n,1)) if results[frame].boxes.is_track else np.repeat(-1, n).reshape((n,1))
        frame_count = np.repeat(frame, n).reshape((n,1))
        
        # bind the data together for one frame
        data = np.concatenate((frame_count, cls, xywh, trackingID), axis=1)
        # add all data of this frame to all data of this video
        bbs = np.concatenate((bbs, data), axis=0)
    
    return bbs, classes

## Create input X

In [3]:
# Load model
model = ultralytics.YOLO('yolov8n.pt')

# List videos
folderpath = './Trials/'
labelpath = './Labels/'
folder = [f for f in os.listdir(labelpath) if os.path.isfile(os.path.join(labelpath, f))]

# Example video
#ipd.Video(labelpath+'banana02_centered.mp4', width=1920/1.7, height=1120/1.7)

### Extract bounding box information

In [4]:
# instantiate empty numpy arrays for data
# video_count, frame, x_center_t, y_center_t, bbwidth_t, bbheight_t, x_center_h, y_center_h, bbwidth_h, bbheight_h (raw)
input = np.zeros((0, 10)) 
input[:] = np.nan
# filename, frame, angle/sos/eos
labels = np.zeros((0, 3))
labels[:] = np.nan


# Iterate through every video file in the folder
for i, video in enumerate(folder):

    print(f"Video {i+1}/{len(folder)}: {video}\n")

    # Get length of input array (number of frames)
    cap = cv2.VideoCapture(labelpath+video)
    frameCount = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    cap.release()
    
    
    # 5. Get all bbs of this video
    # (x_center, y_center, bbwidth, bbheight, class, trackerID, frame, filename)
    bbs_corrected, classes = GetBoundingBoxes(labelpath+video)
    video_count = np.repeat(i, len(bbs_corrected)).reshape((len(bbs_corrected),1))
    bbs_corrected = np.concatenate((video_count, bbs_corrected), axis=1)
    
    
    # 6. Filter for hand by class
    if '.mp4' in video:
        # returns the string name without '00_centered.mp4' ending (could add try-catch)
        target_class = video[:-15]
    
    # Get class number from lookup table
    target_class = list(classes.keys())[list(classes.values()).index(target_class)]
    hand_class = list(classes.keys())[list(classes.values()).index('person')]
    
    target_bbs_corrected = bbs_corrected[(bbs_corrected[:, 2] == target_class)]
    target_bbs_corrected = target_bbs_corrected[:, [0,1,3,4,5,6]] # video_count, frame_count, NO class, xywh
    hand_bbs_corrected = bbs_corrected[(bbs_corrected[:, 2] == hand_class)]
    hand_bbs_corrected = hand_bbs_corrected[:, [0,1,3,4,5,6]]
    
    
    # 7. Interpolation: take last known bounding box positions
    # (x_center_target, y_center_target, bbwidth_target, bbheight_target, class, trackerID, frame, filename, x_center_hand, y_center_hand, bbwidth_hand, bbheight_hand)
    vid_input = np.zeros((frameCount, 10))
    vid_input[:] = np.nan

    # get video frame information (bbs)
    for frame in range(len(vid_input)):
        # get information on objects
        target_information = target_bbs_corrected[(target_bbs_corrected[:, 1] == frame), :]
        hand_information = hand_bbs_corrected[(hand_bbs_corrected[:, 1] == frame), 2:]
        
        
        # fill information for target object
        if (frame in target_bbs_corrected[:, 1]) & (target_information.shape[0] == 1): # col 1 is frames
            vid_input[frame, :6] = target_information
        else:
            # if we have a frame without information on the object (object was not detected)
            # we take the information from the last row if it was not the first frame
            if frame == 0:
                vid_input[frame, 1] = frame
                vid_input[frame, 0] = i
                continue
            else:
                # interpolation method: last known information
                # (should be replaced by positional interpolation)
                vid_input[frame, :6] = vid_input[frame-1, :6]
            
        
        # fill information for hand
        if (frame in hand_bbs_corrected[:, 1]) & (hand_information.shape[0] == 1): # col 1 is frames
            """
            Fails in the line below if we have more than one detection for the same object/class per frame.
            current workaround: take last known information (like current interpolation method)
            -> better option: compare to last known information and take closer one
            """
            vid_input[frame, 6:] = hand_information
        else:
            # if we have a frame without information on the object (object was not detected)
            # we take the information from the last row if it was not the first frame
            if frame == 0:
                vid_input[frame, 1] = frame
                vid_input[frame, 0] = i
                continue
            else:
                vid_input[frame, 6:] = vid_input[frame-1, 6:]
    
        vid_input[frame, 1] = frame
        vid_input[frame, 0] = i
    
    
    # 8. Bind input X together and bind labels Y together
    input = np.concatenate((input, vid_input), axis=0)

# Remove rows with NaN values (only give vibration commands if there is information on both hand and target object, at least for training)
#input = input[~np.isnan(input).any(axis=1)]

Video 1/6: orange05_centered.mp4





    causing potential out-of-memory errors for large sources or long-running streams/videos.

    Usage:
        results = model(source=..., stream=True)  # generator of Results objects
        for r in results:
            boxes = r.boxes  # Boxes object for bbox outputs
            masks = r.masks  # Masks object for segment masks outputs
            probs = r.probs  # Class probabilities for classification outputs

video 1/1 (1/126) /Users/florian/Documents/Studium/Master/Semester 2/Study Project/Local/Labels/orange05_centered.mp4: 320x640 1 person, 1 orange, 1 remote, 1 cell phone, 345.1ms
video 1/1 (2/126) /Users/florian/Documents/Studium/Master/Semester 2/Study Project/Local/Labels/orange05_centered.mp4: 320x640 1 person, 1 orange, 1 remote, 1 cell phone, 179.2ms
video 1/1 (3/126) /Users/florian/Documents/Studium/Master/Semester 2/Study Project/Local/Labels/orange05_centered.mp4: 320x640 1 person, 1 remote, 1 cell phone, 217.5ms
video 1/1 (4/126) /Users/florian/Documents/Studiu

Video 2/6: banana02_centered.mp4



video 1/1 (1/116) /Users/florian/Documents/Studium/Master/Semester 2/Study Project/Local/Labels/banana02_centered.mp4: 320x640 (no detections), 188.8ms
video 1/1 (2/116) /Users/florian/Documents/Studium/Master/Semester 2/Study Project/Local/Labels/banana02_centered.mp4: 320x640 (no detections), 182.9ms
video 1/1 (3/116) /Users/florian/Documents/Studium/Master/Semester 2/Study Project/Local/Labels/banana02_centered.mp4: 320x640 (no detections), 174.9ms
video 1/1 (4/116) /Users/florian/Documents/Studium/Master/Semester 2/Study Project/Local/Labels/banana02_centered.mp4: 320x640 (no detections), 165.6ms
video 1/1 (5/116) /Users/florian/Documents/Studium/Master/Semester 2/Study Project/Local/Labels/banana02_centered.mp4: 320x640 (no detections), 174.3ms
video 1/1 (6/116) /Users/florian/Documents/Studium/Master/Semester 2/Study Project/Local/Labels/banana02_centered.mp4: 320x640 (no detections), 163.6ms
video 1/1 (7/116) /Users/florian/Documents/Studium/Master/Semester 2/Study Project/Local

Video 3/6: banana04_centered.mp4



video 1/1 (1/116) /Users/florian/Documents/Studium/Master/Semester 2/Study Project/Local/Labels/banana04_centered.mp4: 320x640 (no detections), 187.5ms
video 1/1 (2/116) /Users/florian/Documents/Studium/Master/Semester 2/Study Project/Local/Labels/banana04_centered.mp4: 320x640 (no detections), 163.1ms
video 1/1 (3/116) /Users/florian/Documents/Studium/Master/Semester 2/Study Project/Local/Labels/banana04_centered.mp4: 320x640 (no detections), 164.2ms
video 1/1 (4/116) /Users/florian/Documents/Studium/Master/Semester 2/Study Project/Local/Labels/banana04_centered.mp4: 320x640 (no detections), 167.7ms
video 1/1 (5/116) /Users/florian/Documents/Studium/Master/Semester 2/Study Project/Local/Labels/banana04_centered.mp4: 320x640 (no detections), 168.1ms
video 1/1 (6/116) /Users/florian/Documents/Studium/Master/Semester 2/Study Project/Local/Labels/banana04_centered.mp4: 320x640 (no detections), 179.6ms
video 1/1 (7/116) /Users/florian/Documents/Studium/Master/Semester 2/Study Project/Local

Video 4/6: banana03_centered.mp4



video 1/1 (1/116) /Users/florian/Documents/Studium/Master/Semester 2/Study Project/Local/Labels/banana03_centered.mp4: 320x640 (no detections), 184.6ms
video 1/1 (2/116) /Users/florian/Documents/Studium/Master/Semester 2/Study Project/Local/Labels/banana03_centered.mp4: 320x640 (no detections), 176.0ms
video 1/1 (3/116) /Users/florian/Documents/Studium/Master/Semester 2/Study Project/Local/Labels/banana03_centered.mp4: 320x640 (no detections), 177.4ms
video 1/1 (4/116) /Users/florian/Documents/Studium/Master/Semester 2/Study Project/Local/Labels/banana03_centered.mp4: 320x640 (no detections), 170.0ms
video 1/1 (5/116) /Users/florian/Documents/Studium/Master/Semester 2/Study Project/Local/Labels/banana03_centered.mp4: 320x640 (no detections), 169.9ms
video 1/1 (6/116) /Users/florian/Documents/Studium/Master/Semester 2/Study Project/Local/Labels/banana03_centered.mp4: 320x640 (no detections), 164.2ms
video 1/1 (7/116) /Users/florian/Documents/Studium/Master/Semester 2/Study Project/Local

Video 5/6: orange07_centered.mp4



video 1/1 (1/126) /Users/florian/Documents/Studium/Master/Semester 2/Study Project/Local/Labels/orange07_centered.mp4: 320x640 1 person, 1 orange, 1 remote, 1 cell phone, 173.4ms
video 1/1 (2/126) /Users/florian/Documents/Studium/Master/Semester 2/Study Project/Local/Labels/orange07_centered.mp4: 320x640 1 person, 1 orange, 1 remote, 1 cell phone, 172.9ms
video 1/1 (3/126) /Users/florian/Documents/Studium/Master/Semester 2/Study Project/Local/Labels/orange07_centered.mp4: 320x640 1 person, 1 remote, 1 cell phone, 179.0ms
video 1/1 (4/126) /Users/florian/Documents/Studium/Master/Semester 2/Study Project/Local/Labels/orange07_centered.mp4: 320x640 1 person, 2 remotes, 1 cell phone, 166.4ms
video 1/1 (5/126) /Users/florian/Documents/Studium/Master/Semester 2/Study Project/Local/Labels/orange07_centered.mp4: 320x640 1 person, 2 remotes, 1 keyboard, 1 cell phone, 232.6ms
video 1/1 (6/126) /Users/florian/Documents/Studium/Master/Semester 2/Study Project/Local/Labels/orange07_centered.mp4: 32

Video 6/6: orange06_centered.mp4



video 1/1 (1/126) /Users/florian/Documents/Studium/Master/Semester 2/Study Project/Local/Labels/orange06_centered.mp4: 320x640 1 person, 1 orange, 1 remote, 1 cell phone, 197.1ms
video 1/1 (2/126) /Users/florian/Documents/Studium/Master/Semester 2/Study Project/Local/Labels/orange06_centered.mp4: 320x640 1 person, 1 orange, 1 remote, 1 cell phone, 158.1ms
video 1/1 (3/126) /Users/florian/Documents/Studium/Master/Semester 2/Study Project/Local/Labels/orange06_centered.mp4: 320x640 1 person, 1 remote, 1 cell phone, 168.9ms
video 1/1 (4/126) /Users/florian/Documents/Studium/Master/Semester 2/Study Project/Local/Labels/orange06_centered.mp4: 320x640 1 person, 2 remotes, 1 cell phone, 182.6ms
video 1/1 (5/126) /Users/florian/Documents/Studium/Master/Semester 2/Study Project/Local/Labels/orange06_centered.mp4: 320x640 1 person, 2 remotes, 1 keyboard, 1 cell phone, 218.5ms
video 1/1 (6/126) /Users/florian/Documents/Studium/Master/Semester 2/Study Project/Local/Labels/orange06_centered.mp4: 32

In [14]:
# Debugging: check input
with np.printoptions(threshold=sys.maxsize, suppress=True):
    print(input)

[[   0.            1.         1837.39025879  883.23632812  132.90625     116.28308105 1285.94189453 1671.93115234  684.50848389  119.26062012]
 [   0.            2.         1837.39025879  883.23632812  132.90625     116.28308105 1286.39746094 1672.08239746  684.06524658  118.37963867]
 [   0.            3.         1837.39025879  883.23632812  132.90625     116.28308105 1286.03894043 1673.19946289  688.83886719  116.22485352]
 [   0.            4.         1837.39025879  883.23632812  132.90625     116.28308105 1291.44555664 1669.90893555  685.31756592  113.49645996]
 [   0.            5.         1837.39025879  883.23632812  132.90625     116.28308105 1291.58605957 1669.92993164  685.27056885  113.49719238]
 [   0.            6.         1837.39025879  883.23632812  132.90625     116.28308105 1295.72094727 1669.2734375   690.46362305  113.91906738]
 [   0.            7.         1837.39025879  883.23632812  132.90625     116.28308105 1295.43225098 1669.25708008  691.32617188  114.0177002 ]

### Tokenization

In [6]:
# define tokens; has to be unusual in sequences (0 to max dim of video res) as well as labels (radiands: -2pi to 2pi)
start_token = -333 # for seqs and labels
end_token = -666 # for labels only (sequences do not need an end token, as they are all separated by start tokens)
padding_token = -999 # for seqs only (labels are generated from padded sequences)

# Replace nan's with PAD
#input = 

# Slice data into single videos
uniques = np.unique(input[:,0])
sliced_seqs = []
sliced_labels = []

for vid in uniques:
    # subset input per video and append to listin which each array is one video (sequence)
    subset = input[input[:,0] == vid,:]
    # prepend start token to sequence
    subset = np.vstack([np.full((1, subset.shape[1]), start_token), subset])
    with np.printoptions(threshold=sys.maxsize, suppress=True):
        print(subset)
    sliced_seqs.append(subset)

[[-333.         -333.         -333.         -333.         -333.         -333.         -333.         -333.         -333.         -333.        ]
 [   0.            1.         1837.39025879  883.23632812  132.90625     116.28308105 1285.94189453 1671.93115234  684.50848389  119.26062012]
 [   0.            2.         1837.39025879  883.23632812  132.90625     116.28308105 1286.39746094 1672.08239746  684.06524658  118.37963867]
 [   0.            3.         1837.39025879  883.23632812  132.90625     116.28308105 1286.03894043 1673.19946289  688.83886719  116.22485352]
 [   0.            4.         1837.39025879  883.23632812  132.90625     116.28308105 1291.44555664 1669.90893555  685.31756592  113.49645996]
 [   0.            5.         1837.39025879  883.23632812  132.90625     116.28308105 1291.58605957 1669.92993164  685.27056885  113.49719238]
 [   0.            6.         1837.39025879  883.23632812  132.90625     116.28308105 1295.72094727 1669.2734375   690.46362305  113.91906738]

### Batching 

In [7]:
# create list that contains lists of size batch_size, each one containing single videos (sequences) as arrays
BATCH_SIZE = 3
seq_batches = [sliced_seqs[i:i+BATCH_SIZE] for i in range(0, len(sliced_seqs), BATCH_SIZE)]

### Padding

In [8]:
# Pad sequences to the maximum sequence length within each batch
padded_seq_batches = []
for batch in seq_batches:
    max_length = max(seq.shape[0] for seq in batch)
    # [:,2:] slices the sequence to remove columns for video count and frame count
    padded_batch = [tf.pad(seq[:,2:], paddings=[[max_length - seq.shape[0], 0], [0, 0]], mode="CONSTANT", constant_values=padding_token) for seq in batch]
    with np.printoptions(threshold=sys.maxsize, suppress=True):
        print(padded_batch)
    padded_seq_batches.append(tf.stack(padded_batch))

[<tf.Tensor: shape=(126, 8), dtype=float64, numpy=
array([[-333.        , -333.        , -333.        , -333.        , -333.        , -333.        , -333.        , -333.        ],
       [1837.39025879,  883.23632812,  132.90625   ,  116.28308105, 1285.94189453, 1671.93115234,  684.50848389,  119.26062012],
       [1837.39025879,  883.23632812,  132.90625   ,  116.28308105, 1286.39746094, 1672.08239746,  684.06524658,  118.37963867],
       [1837.39025879,  883.23632812,  132.90625   ,  116.28308105, 1286.03894043, 1673.19946289,  688.83886719,  116.22485352],
       [1837.39025879,  883.23632812,  132.90625   ,  116.28308105, 1291.44555664, 1669.90893555,  685.31756592,  113.49645996],
       [1837.39025879,  883.23632812,  132.90625   ,  116.28308105, 1291.58605957, 1669.92993164,  685.27056885,  113.49719238],
       [1837.39025879,  883.23632812,  132.90625   ,  116.28308105, 1295.72094727, 1669.2734375 ,  690.46362305,  113.91906738],
       [1837.39025879,  883.23632812,  132.906

## Create labels Y

In [9]:
padded_lbl_batches = []
for batch in padded_seq_batches:
    labels = []
    
    for seq in batch:
        seq = seq.numpy()
        lbl = np.zeros((seq.shape[0], 1))
        
        for row in range(lbl.shape[0]):

            # sos tokens
            if np.all(seq[row] == start_token):
                lbl[row] = start_token
                continue

            # padding tokens
            if np.all(seq[row] == padding_token):
                lbl[row] = padding_token
                continue
                
            
            # EOS token if hand overlaps with target (with a little wiggle room epsilon)
            EPSILON = 100 # what is a good range?
            object_loc = np.array(seq[row, 0], seq[row, 1])
            hand_loc = np.array(seq[row, 4], seq[row, 5])
            
            # this implementation entails that after an EOS token, there do not necessarily have to follow EOS tokens until the end 
            # -> good or bad for training/ online application?
            if np.linalg.norm(hand_loc - object_loc) < EPSILON or row == len(seq):
                lbl[row] = end_token
                continue
                
            # else neither start- nor end-token -> calculate angle
            lbl[row] = np.arctan2(seq[row,7] - seq[row+1,7], seq[row,6] - seq[row+1,6]) # angle between current and next frame

        labels.append(lbl)

    padded_lbl_batches.append(tf.stack(labels))


In [15]:
# Debugging: check padded sequences and corresponding labels
with np.printoptions(threshold=sys.maxsize, suppress=True):
    print(padded_seq_batches)
    print()
    print(padded_lbl_batches)

[<tf.Tensor: shape=(3, 126, 8), dtype=float64, numpy=
array([[[-333.        , -333.        , -333.        , -333.        , -333.        , -333.        , -333.        , -333.        ],
        [1837.39025879,  883.23632812,  132.90625   ,  116.28308105, 1285.94189453, 1671.93115234,  684.50848389,  119.26062012],
        [1837.39025879,  883.23632812,  132.90625   ,  116.28308105, 1286.39746094, 1672.08239746,  684.06524658,  118.37963867],
        [1837.39025879,  883.23632812,  132.90625   ,  116.28308105, 1286.03894043, 1673.19946289,  688.83886719,  116.22485352],
        [1837.39025879,  883.23632812,  132.90625   ,  116.28308105, 1291.44555664, 1669.90893555,  685.31756592,  113.49645996],
        [1837.39025879,  883.23632812,  132.90625   ,  116.28308105, 1291.58605957, 1669.92993164,  685.27056885,  113.49719238],
        [1837.39025879,  883.23632812,  132.90625   ,  116.28308105, 1295.72094727, 1669.2734375 ,  690.46362305,  113.91906738],
        [1837.39025879,  883.2363281

## Create dataset

In [11]:
# Define a generator function to yield each sequence batch
def seq_generator():
    for batch in padded_seq_batches:
        yield batch

# Define a generator function to yield each label batch
def lbl_generator():
    for batch in padded_lbl_batches:
        yield batch

# Create a tf.data.Dataset from the generator
X = tf.data.Dataset.from_generator(seq_generator, output_signature=tf.TensorSpec(shape=(BATCH_SIZE, None, 8), dtype=tf.float32))
Y = tf.data.Dataset.from_generator(lbl_generator, output_signature=tf.TensorSpec(shape=(BATCH_SIZE, None, 1), dtype=tf.float32))
train_ds = tf.data.Dataset.zip((X, Y))

In [12]:
# Debugging
"""
# Print at most the first 5 examples from the train dataset
for i, (train_data, label_data) in enumerate(train_ds):
    if i < 5:
        print(f"Training pair: {i}")
        print("Train Data:")
        print(train_data.shape)
        print(train_data.numpy())
        print()
        print("Label Data:")
        print(label_data.shape)
        print(label_data.numpy())
        print()
"""

'\n# Print at most the first 5 examples from the train dataset\nfor i, (train_data, label_data) in enumerate(train_ds):\n    if i < 5:\n        print(f"Training pair: {i}")\n        print("Train Data:")\n        print(train_data.shape)\n        print(train_data.numpy())\n        print()\n        print("Label Data:")\n        print(label_data.shape)\n        print(label_data.numpy())\n        print()\n'

### Export dataset

In [13]:
# Define a file path for saving the zipped dataset
export_path = 'train_ds.zip'

# Create a TFRecordWriter to save the zipped dataset
options = tf.io.TFRecordOptions(compression_type='GZIP')
with tf.io.TFRecordWriter(export_path, options=options) as writer:
    for example_x, example_y in train_ds:
        tf_example = tf.train.Example(features=tf.train.Features(
            feature={
                'x': tf.train.Feature(
                    bytes_list=tf.train.BytesList(value=[tf.io.serialize_tensor(example_x).numpy()])
                ),
                'y': tf.train.Feature(
                    bytes_list=tf.train.BytesList(value=[tf.io.serialize_tensor(example_y).numpy()])
                )
            }
        ))
        writer.write(tf_example.SerializeToString())

2023-09-07 11:42:27.247103: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_3' with dtype int32
	 [[{{node Placeholder/_3}}]]
