# **Study Project:** *Transformer model for prediction of grasping movements*

## Import packages

In [1]:
!pip install ultralytics

Collecting ultralytics
  Downloading ultralytics-8.0.202-py3-none-any.whl (644 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m644.8/644.8 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Collecting thop>=0.1.1 (from ultralytics)
  Downloading thop-0.1.1.post2209072238-py3-none-any.whl (15 kB)
Installing collected packages: thop, ultralytics
Successfully installed thop-0.1.1.post2209072238 ultralytics-8.0.202


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
import sys
import zipfile
import cv2
import numpy as np
import tensorflow as tf
import IPython.display as ipd
#!pip install ultralytics
#ipd.clear_output()
import ultralytics

## Helper functions

In [4]:
def GetBoundingBoxes(videopath):
    """
    Extract bounding boxes from video.

    Args:
    videopath -- path to video to extract bounding boxes from.
    """

    # Use object tracker to get the bounding boxes and classIDs in a 'results' object
    bbs = np.zeros((1, 7))
    # It is also possible to pass the whole folder as path,
    # but we still want the flexibility to access single videos
    results = model.track(source=videopath, tracker="bytetrack.yaml")

    # Get class names
    classes = results[0].names

    # Iterate through each frame of a video to get all bounding boxes for a frame
    for frame in range(len(results)):

        # x_center, y_center, bbwidth, bbheight of bbs of this frame
        xywh = results[frame].boxes.xywh.detach().cpu().numpy()
        n = len(xywh) # number of bounding boxes
        cls = results[frame].boxes.cls.detach().cpu().numpy().reshape((n,1))
        # if the object tracker is currently tracking at least one object, save the trackingID for that object, else fill with -1 placeholder
        trackingID = results[frame].boxes.id.detach().cpu().numpy().reshape((n,1)) if results[frame].boxes.is_track else np.repeat(-1, n).reshape((n,1))
        frame_count = np.repeat(frame, n).reshape((n,1))

        # bind the data together for one frame
        data = np.concatenate((frame_count, cls, xywh, trackingID), axis=1)
        # add all data of this frame to all data of this video
        bbs = np.concatenate((bbs, data), axis=0)

    return bbs, classes

## Create input X

In [5]:
# Load model
model = ultralytics.YOLO('yolov8n.pt')

# List videos
folderpath = '/content/drive/MyDrive/Study Project: Grasping/Transformer/Data/Trials2/'
labelpath = '/content/drive/MyDrive/Study Project: Grasping/Transformer/Data/Labels2/'
folder = [f for f in os.listdir(labelpath) if os.path.isfile(os.path.join(labelpath, f))]
bbs_hand_path = '/content/drive/MyDrive/Study Project: Grasping/Transformer/Data/Centering_Data/Post_Centering_BBs_Hand/'
bbs_target_path = '/content/drive/MyDrive/Study Project: Grasping/Transformer/Data/Centering_Data/Post_Centering_BBs_Target/'

# Example video
#ipd.Video(labelpath+'banana02_centered.mp4', width=1920/1.7, height=1120/1.7)

Downloading https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8n.pt to 'yolov8n.pt'...
100%|██████████| 6.23M/6.23M [00:00<00:00, 25.1MB/s]


### Extract bounding box information

In [6]:
# instantiate empty numpy arrays for data
# video_count, frame, x_center_t, y_center_t, bbwidth_t, bbheight_t, x_center_h, y_center_h, bbwidth_h, bbheight_h (raw)
input = np.zeros((0, 10))
input[:] = np.nan
# filename, frame, angle/sos/eos
labels = np.zeros((0, 3))
labels[:] = np.nan


# Iterate through every video file in the folder
for i, video in enumerate(folder):

    print(f"Video {i+1}/{len(folder)}: {video}\n")

    #Load bounding boxes for video from drive
    bbs_hand_file = 'Post_Centering_BBs_Hand_Video_{}.npz'.format(i)
    bbs_target_file = 'Post_Centering_BBs_Target_Video_{}.npz'.format(i)

    bbs_hand_npz = np.load(os.path.join(bbs_hand_path,bbs_hand_file))
    bbs_target_npz = np.load(os.path.join(bbs_target_path,bbs_target_file))

    bbs_hand = bbs_hand_npz["array"]
    bbs_target = bbs_target_npz["array"]

    frameCount = len(bbs_hand)
    if (len(bbs_hand) != len(bbs_target)):
        raise ValueError('Error in interpolation result, different amount of frames for hand and target')

    # (x_center_target, y_center_target, bbwidth_target, bbheight_target, class, trackerID, frame, filename, x_center_hand, y_center_hand, bbwidth_hand, bbheight_hand)
    vid_input = np.zeros((frameCount, 10))
    vid_input[:] = np.nan

    #Fill frame and video indices for new array
    #frame indices
    vid_input[:,1] = range(frameCount)

    #video indices
    vid_input[:,0] = i

    #Fill Target information: already interpolated
    #get target values
    x_centers_targets = np.asarray([frame_value[0] for frame_value in bbs_target])
    y_centers_targets = np.asarray([frame_value[1] for frame_value in bbs_target])
    bbwidths_targets = np.asarray([frame_value[2] for frame_value in bbs_target])
    bbheights_targets = np.asarray([frame_value[3] for frame_value in bbs_target])

    #put target values in vid_input
    vid_input[:,2] = x_centers_targets
    vid_input[:,3] = y_centers_targets
    vid_input[:,4] = bbwidths_targets
    vid_input[:,5] = bbheights_targets


    #Fill hand information: already interpolated
    #get the values for successful detections
    x_centers_hands = np.asarray([frame_value[0] for frame_value in bbs_hand])
    y_centers_hands = np.asarray([frame_value[1] for frame_value in bbs_hand])
    bbwidths_hands = np.asarray([frame_value[2] for frame_value in bbs_hand])
    bbheights_hands = np.asarray([frame_value[3] for frame_value in bbs_hand])

    #put target values in vid_input
    vid_input[:,6] = x_centers_hands
    vid_input[:,7] = y_centers_hands
    vid_input[:,8] = bbwidths_hands
    vid_input[:,9] = bbheights_hands


    # 8. Bind input X together and bind labels Y together
    input = np.concatenate((input, vid_input), axis=0)

# Remove rows with NaN values (only give vibration commands if there is information on both hand and target object, at least for training)
input = input[~np.isnan(input).any(axis=1)]

Video 1/1: banana02_centered.mp4



In [16]:
# Debugging: check input

with np.printoptions(threshold=sys.maxsize, suppress=True):
    print(input)


[[   0.            0.         1528.85791016  600.90942383  225.45043945  167.29650879  284.           20.            0.            0.        ]
 [   0.            1.         1528.85791016  600.90942383  225.45043945  167.29650879 1510.95019531 1083.32666016  470.2857666    83.98937988]
 [   0.            2.         1528.85791016  600.90942383  225.45043945  167.29650879 1511.59460449 1083.99261475  469.45433044   82.04003906]
 [   0.            3.         1528.85791016  600.90942383  225.45043945  167.29650879 1512.23901367 1084.65856934  468.62289429   80.09069824]
 [   0.            4.         1528.85791016  600.90942383  225.45043945  167.29650879 1512.88342285 1085.32452393  467.79145813   78.14135742]
 [   0.            5.         1528.85791016  600.90942383  225.45043945  167.29650879 1513.52783203 1085.99047852  466.96002197   76.1920166 ]
 [   0.            6.         1528.85791016  600.90942383  225.45043945  167.29650879 1514.17224121 1086.65643311  466.12858582   74.24267578]

### Tokenization

In [8]:
# define tokens; has to be unusual in sequences (0 to max dim of video res) as well as labels (radiands: -2pi to 2pi)
start_token = -333 # for seqs and labels
end_token = -666 # for labels only (sequences do not need an end token, as they are all separated by start tokens)
padding_token = -999 # for seqs only (labels are generated from padded sequences)

# Slice data into single videos
uniques = np.unique(input[:,0])
sliced_seqs = []
sliced_labels = []

for vid in uniques:
    # subset input per video and append to listin which each array is one video (sequence)
    subset = input[input[:,0] == vid,:]
    # prepend start token to sequence
    subset = np.vstack([np.full((1, subset.shape[1]), start_token), subset])
    with np.printoptions(threshold=sys.maxsize, suppress=True):
        print(subset)
    sliced_seqs.append(subset)

[[-333.         -333.         -333.         -333.         -333.         -333.         -333.         -333.         -333.         -333.        ]
 [   0.            0.         1528.85791016  600.90942383  225.45043945  167.29650879  284.           20.            0.            0.        ]
 [   0.            1.         1528.85791016  600.90942383  225.45043945  167.29650879 1510.95019531 1083.32666016  470.2857666    83.98937988]
 [   0.            2.         1528.85791016  600.90942383  225.45043945  167.29650879 1511.59460449 1083.99261475  469.45433044   82.04003906]
 [   0.            3.         1528.85791016  600.90942383  225.45043945  167.29650879 1512.23901367 1084.65856934  468.62289429   80.09069824]
 [   0.            4.         1528.85791016  600.90942383  225.45043945  167.29650879 1512.88342285 1085.32452393  467.79145813   78.14135742]
 [   0.            5.         1528.85791016  600.90942383  225.45043945  167.29650879 1513.52783203 1085.99047852  466.96002197   76.1920166 ]

### Batching

In [9]:
# create list that contains lists of size batch_size, each one containing single videos (sequences) as arrays
BATCH_SIZE = 2
seq_batches = [sliced_seqs[i:i+BATCH_SIZE] for i in range(0, len(sliced_seqs), BATCH_SIZE)]

### Padding

In [10]:
# Pad sequences to the maximum sequence length within each batch
padded_seq_batches = []
for batch in seq_batches:
    max_length = max(seq.shape[0] for seq in batch)
    # [:,2:] slices the sequence to remove columns for video count and frame count
    padded_batch = [tf.pad(seq[:,2:], paddings=[[max_length - seq.shape[0], 0], [0, 0]], mode="CONSTANT", constant_values=padding_token) for seq in batch]
    with np.printoptions(threshold=sys.maxsize, suppress=True):
        print(padded_batch)
    padded_seq_batches.append(tf.stack(padded_batch))

[<tf.Tensor: shape=(117, 8), dtype=float64, numpy=
array([[-333.        , -333.        , -333.        , -333.        , -333.        , -333.        , -333.        , -333.        ],
       [1528.85791016,  600.90942383,  225.45043945,  167.29650879,  284.        ,   20.        ,    0.        ,    0.        ],
       [1528.85791016,  600.90942383,  225.45043945,  167.29650879, 1510.95019531, 1083.32666016,  470.2857666 ,   83.98937988],
       [1528.85791016,  600.90942383,  225.45043945,  167.29650879, 1511.59460449, 1083.99261475,  469.45433044,   82.04003906],
       [1528.85791016,  600.90942383,  225.45043945,  167.29650879, 1512.23901367, 1084.65856934,  468.62289429,   80.09069824],
       [1528.85791016,  600.90942383,  225.45043945,  167.29650879, 1512.88342285, 1085.32452393,  467.79145813,   78.14135742],
       [1528.85791016,  600.90942383,  225.45043945,  167.29650879, 1513.52783203, 1085.99047852,  466.96002197,   76.1920166 ],
       [1528.85791016,  600.90942383,  225.450

## Create labels Y

In [11]:
padded_lbl_batches = []
for batch in padded_seq_batches:
    labels = []

    for seq in batch:
        seq = seq.numpy()
        lbl = np.zeros((seq.shape[0], 1))

        for row in range(lbl.shape[0]):

            # sos tokens
            if np.all(seq[row] == start_token):
                lbl[row] = start_token
                continue

            # padding tokens
            if np.all(seq[row] == padding_token):
                lbl[row] = padding_token
                continue


            # EOS token if hand overlaps with target (with a little wiggle room epsilon)
            EPSILON = 100 # what is a good range?
            object_loc = np.array(seq[row, 0], seq[row, 1])
            hand_loc = np.array(seq[row, 4], seq[row, 5])

            # this implementation entails that after an EOS token, there do not necessarily have to follow EOS tokens until the end
            # -> good or bad for training/ online application?
            if np.linalg.norm(hand_loc - object_loc) < EPSILON or row == len(seq):
                lbl[row] = end_token
                continue

            # else neither start- nor end-token -> calculate angle
            lbl[row] = np.arctan2(seq[row,7] - seq[row+1,7], seq[row,6] - seq[row+1,6]) # angle between current and next frame

        labels.append(lbl)

    padded_lbl_batches.append(tf.stack(labels))


In [12]:
# Debugging: check padded sequences and corresponding labels
"""
with np.printoptions(threshold=sys.maxsize, suppress=True):
    print(padded_seq_batches)
    print()
    print(padded_lbl_batches)
"""

'\nwith np.printoptions(threshold=sys.maxsize, suppress=True):\n    print(padded_seq_batches)\n    print()\n    print(padded_lbl_batches)\n'

## Create Dataset

In [13]:
# Define a generator function to yield each sequence batch
def seq_generator():
    for batch in padded_seq_batches:
        yield batch

# Define a generator function to yield each label batch
def lbl_generator():
    for batch in padded_lbl_batches:
        yield batch

# Create a tf.data.Dataset from the generator
X = tf.data.Dataset.from_generator(seq_generator, output_signature=tf.TensorSpec(shape=(BATCH_SIZE, None, 8), dtype=tf.float32))
Y = tf.data.Dataset.from_generator(lbl_generator, output_signature=tf.TensorSpec(shape=(BATCH_SIZE, None, 1), dtype=tf.float32))
train_ds = tf.data.Dataset.zip((X, Y))

In [14]:
# Debugging
"""
# Print at most the first 5 examples from the train dataset
for i, (train_data, label_data) in enumerate(train_ds):
    if i < 5:
        print(f"Training pair: {i}")
        print("Train Data:")
        print(train_data.shape)
        print(train_data.numpy())
        print()
        print("Label Data:")
        print(label_data.shape)
        print(label_data.numpy())
        print()
"""

'\n# Print at most the first 5 examples from the train dataset\nfor i, (train_data, label_data) in enumerate(train_ds):\n    if i < 5:\n        print(f"Training pair: {i}")\n        print("Train Data:")\n        print(train_data.shape)\n        print(train_data.numpy())\n        print()\n        print("Label Data:")\n        print(label_data.shape)\n        print(label_data.numpy())\n        print()\n'

### Export dataset

In [15]:
# Define a file path for saving the zipped dataset
export_path = 'train_ds.zip'

# Create a TFRecordWriter to save the zipped dataset
options = tf.io.TFRecordOptions(compression_type='GZIP')
with tf.io.TFRecordWriter(export_path, options=options) as writer:
    for example_x, example_y in train_ds:
        tf_example = tf.train.Example(features=tf.train.Features(
            feature={
                'x': tf.train.Feature(
                    bytes_list=tf.train.BytesList(value=[tf.io.serialize_tensor(example_x).numpy()])
                ),
                'y': tf.train.Feature(
                    bytes_list=tf.train.BytesList(value=[tf.io.serialize_tensor(example_y).numpy()])
                )
            }
        ))
        writer.write(tf_example.SerializeToString())

InvalidArgumentError: ignored