# **Study Project:** *Transformer model for prediction of grasping movements*

## Import packages

In [1]:
!pip install ultralytics

Collecting ultralytics
  Downloading ultralytics-8.0.199-py3-none-any.whl (644 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m644.5/644.5 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
Collecting thop>=0.1.1 (from ultralytics)
  Downloading thop-0.1.1.post2209072238-py3-none-any.whl (15 kB)
Installing collected packages: thop, ultralytics
Successfully installed thop-0.1.1.post2209072238 ultralytics-8.0.199


In [None]:
import os
import cv2
import numpy as np
import IPython.display as ipd
from scipy import interpolate
#!pip install ultralytics
#ipd.clear_output()
import ultralytics

In [None]:
#Debugging
import sys

## Helper functions

In [None]:
def GetBoundingBoxes(videopath):
    """
    Extract bounding boxes from video.

    Runs the module-level ``model`` with the ByteTrack tracker configuration on
    the video and collects one row per detected bounding box.

    Args:
    videopath -- path to video to extract bounding boxes from.

    Returns:
    bbs -- array with one row per bounding box and the 7 columns
           (frame, class, x_center, y_center, bbwidth, bbheight, trackingID).
           trackingID is -1 for boxes of frames where the tracker reports no IDs.
           Only real detections are returned (no placeholder row).
    classes -- the ``names`` lookup (class number -> class name) of the first result.
    """
    # Use object tracker to get the bounding boxes and classIDs in a 'results' object.
    # It is also possible to pass the whole folder as path,
    # but we still want the flexibility to access single videos
    results = model.track(source=videopath, tracker="bytetrack.yaml")

    # Get class names
    classes = results[0].names

    # Collect the rows of every frame and concatenate once at the end, instead of
    # growing the array frame by frame (which copies all previous rows each time).
    per_frame = []
    for frame, result in enumerate(results):
        boxes = result.boxes

        # x_center, y_center, bbwidth, bbheight of bbs of this frame
        xywh = boxes.xywh.detach().cpu().numpy()
        n = len(xywh)  # number of bounding boxes
        cls = boxes.cls.detach().cpu().numpy().reshape((n, 1))

        # if the object tracker is currently tracking at least one object, save the
        # trackingID for that object, else fill with -1 placeholder
        if boxes.is_track:
            trackingID = boxes.id.detach().cpu().numpy().reshape((n, 1))
        else:
            trackingID = np.full((n, 1), -1)
        frame_count = np.full((n, 1), frame)

        # bind the data together for one frame
        per_frame.append(np.concatenate((frame_count, cls, xywh, trackingID), axis=1))

    # Start from an empty (0, 7) array: a zero-filled seed row would look like a
    # real detection of class 0 in frame 0 to every caller.
    bbs = np.concatenate([np.empty((0, 7))] + per_frame, axis=0)

    return bbs, classes

In [None]:
def Center(videopath, to_path, bounding_boxes, width=1920, height=1120):

    """
    Create a video where a target object is always centered.

    Every frame is pasted into a larger black canvas, shifted so that the center
    of its bounding box lands on the canvas center. The canvas is ``width`` x
    ``height`` enlarged by twice the largest bounding-box-to-center distance, so
    every shifted frame fits. Frames without bounding box information (NaN) stay
    black. The result is written to ``<to_path without extension>_centered.mp4``.

    Args:
    videopath -- path to video to center
    to_path -- location to export centered video to
    bounding_boxes -- bbs of object to center on, one row per frame; columns 0 and 1
                      are used as x_center and y_center, a row containing NaN means
                      'no information'
    width -- frame width the centering is computed for, default=1920
    height -- frame height the centering is computed for, default=1120

    Note: each frame is pasted into a ``height`` x ``width`` region, so these
    must match the frame size of the source video.
    """

    # 1. Read video
    cap = cv2.VideoCapture(videopath)
    try:
        frameCount = max(int(cap.get(cv2.CAP_PROP_FRAME_COUNT)), 0)
        frameWidth = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frameHeight = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        # Keep the frame rate as reported (e.g. 29.97); truncating it to an int
        # would change the playback speed of the exported video.
        FPS = cap.get(cv2.CAP_PROP_FPS)

        video = np.zeros((frameCount, frameHeight, frameWidth, 3), np.uint8)
        n_frames = 0
        while n_frames < frameCount:
            ret, frame = cap.read()
            # The frame count is only metadata; stop at the first failed read
            # instead of trying to store the missing frame.
            if not ret:
                break
            video[n_frames] = frame
            n_frames += 1
    finally:
        cap.release()

    # Keep only the frames that could actually be read
    video = video[:n_frames]


    # 2. Correct video
    bbs = np.asarray(bounding_boxes, dtype=float)
    center_x = (width/2)
    center_y = (height/2)

    if bbs.size:
        # Rows with NaN carry no information and contribute a distance of 0.
        # If all rows were NaN, 0 would be the max, so the corrected video
        # would have the size of the original video.
        has_box = ~np.isnan(bbs).any(axis=1)
        max_x_dist = np.where(has_box, np.abs(center_x - bbs[:, 0]), 0).max()
        max_y_dist = np.where(has_box, np.abs(center_y - bbs[:, 1]), 0).max()
    else:
        has_box = np.zeros(0, dtype=bool)
        max_x_dist = 0
        max_y_dist = 0

    height_new = int(height + max_y_dist*2)
    width_new = int(width + max_x_dist*2)

    # uint8 from the start: the frames are uint8 and the writer needs uint8
    video_corrected = np.zeros((n_frames, height_new, width_new, 3), dtype=np.uint8)


    # 3. Center video
    # top-left corner at which an unshifted frame sits in the middle of the canvas
    start_row = (height_new - height) // 2
    start_col = (width_new - width) // 2

    # Frames beyond the last bounding box row are treated like 'no information'
    for idx in range(min(n_frames, len(bbs))):

        # If there is no information on fruit, leave the frame black
        if not has_box[idx]:
            continue

        # get offset of center
        # pos if bounding box is to right of center, else negative
        x_offset = int(bbs[idx, 0] - center_x)
        # pos if bounding box is below center, else negative
        y_offset = int(bbs[idx, 1] - center_y)

        # coordinates until here are for the old video;
        # get fitting indices for the new video
        fixed_start_row = start_row - y_offset
        fixed_start_col = start_col - x_offset

        fixed_end_row = fixed_start_row + height
        fixed_end_col = fixed_start_col + width

        # Checkup: a negative index would wrap around when slicing
        if min(fixed_start_row, fixed_start_col, fixed_end_row, fixed_end_col) < 0:
            print("Negative Index!")

        # Save centered + corrected in new video
        video_corrected[idx, fixed_start_row:fixed_end_row,
                        fixed_start_col:fixed_end_col] = video[idx]


    # 4. Save video
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(os.path.splitext(to_path)[0]+'_centered.mp4', fourcc, FPS, (width_new, height_new), True)
    try:
        for frame in video_corrected:
            out.write(frame)
    finally:
        out.release()


## Video centering

In [None]:
# Colab only: mount Google Drive so that the data folders below
# '/content/drive' used in the following cells are accessible.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load model (read as module-level `model` by GetBoundingBoxes)
model = ultralytics.YOLO('yolov8n.pt')

# Data locations
# Alternative data set copy:
#folderpath = '/content/drive/MyDrive/Study_Project_Grasping_Copy/Transformer/Trials/'
#labelpath = '/content/drive/MyDrive/Study_Project_Grasping_Copy/Transformer/Labels/'
folderpath = '/content/drive/MyDrive/Study Project: Grasping/Transformer/Data/Trials2/'
labelpath = '/content/drive/MyDrive/Study Project: Grasping/Transformer/Data/Labels2/'

# List videos. os.listdir returns the entries in arbitrary order, so sort them
# to give every video the same index on every run.
folder = sorted(f for f in os.listdir(folderpath) if os.path.isfile(os.path.join(folderpath, f)))

# Example video
#ipd.Video(folderpath+'banana02.mp4', width=1920/1.7, height=1120/1.7)

**Procedure:**
1. Get the bounding boxes of all objects in each raw video.
2. Filter for hand and target object bounding boxes.
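3. Target centering: interpolate the target object's bounding box for frames without a detection, then export a video in which the target object stays fixed at the center (`<video>_centered.mp4`).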

In [None]:
# Iterate through every video file in the folder
for i, video in enumerate(folder):

    print(f"Video {i+1}/{len(folder)}: {video}\n")

    # Only '.mov' / '.mp4' files are trials; anything else in the folder is skipped
    # instead of being processed with a missing (or stale) target class.
    video_stem, video_ext = os.path.splitext(video)
    if video_ext.lower() not in ('.mov', '.mp4'):
        print(f"Skipping {video}: not a .mov/.mp4 file\n")
        continue

    # File names look like 'banana02.mp4': the target class name followed by a
    # trial number. Strip the number to get the class name.
    target_name = video_stem.rstrip('0123456789')

    # Get length of input array (number of frames)
    cap = cv2.VideoCapture(folderpath+video)
    frameCount = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    cap.release()


    # 1. Get all bbs of this video
    # columns after adding the video index:
    # (video, frame, class, x_center, y_center, bbwidth, bbheight, trackerID)
    bbs, classes = GetBoundingBoxes(folderpath+video)
    video_count = np.full((len(bbs), 1), i)
    bbs = np.concatenate((video_count, bbs), axis=1)


    # 2. Filter for hand and target fruit by class
    # Get class number from lookup table (class number -> class name)
    class_numbers = {name: number for number, name in classes.items()}
    if target_name not in class_numbers:
        print(f"Skipping {video}: '{target_name}' is not a class known to the model\n")
        continue
    target_class = class_numbers[target_name]
    hand_class = class_numbers['person']

    # Filter data (column 2 holds the class number)
    target_bbs = bbs[(bbs[:, 2] == target_class)]
    hand_bbs = bbs[(bbs[:, 2] == hand_class)]

    # To Do: Use trackerID to track the same object
    # -> could be added later on, as we only use one object per trial?
    # -> but if we have detection of more than one target fruit, it breaks something
    # because it will try to calculate positions from more than one bb per frame.
    # problem so far: if detection ends for one frame, a new trackerID is assigned
    # -> for this approach to work we would have to compare the position of the
    # last tracked trackerID to each object with newly assigned trackerID and
    # take the closest one, and interpolate position if there are no new trackerIDs detected.
    # for now, we leave it out: remove class and trackingID
    # remaining columns: (video, frame, x_center, y_center, bbwidth, bbheight)
    target_bbs = target_bbs[:, [0,1,3,4,5,6]]
    hand_bbs = hand_bbs[:, [0,1,3,4,5,6]]

    # Without a single detection of the target there is nothing to interpolate from
    if len(target_bbs) == 0:
        print(f"Skipping {video}: no '{target_name}' detected\n")
        continue

    # 3. Interpolation: fill the frames without a detection of the target
    # columns: (video, frame, x_center_fruit, y_center_fruit, bbwidth_fruit, bbheight_fruit,
    #           4 further columns that are left as NaN here)
    vid_input = np.zeros((frameCount, 10))
    vid_input[:] = np.nan # NaN marks 'no information'

    # add video and frame indices
    vid_input[:, 0] = i
    vid_input[:, 1] = np.arange(frameCount)

    # frames for which detection was successful
    x_detects = target_bbs[:, 1].astype(int)
    # frames for which interpolation has to be performed
    x_to_interp = np.setdiff1d(np.arange(frameCount), x_detects)

    # target_bbs and vid_input share the layout of columns 2-5
    # (x_center, y_center, bbwidth, bbheight), so handle them in one loop
    for col in range(2, 6):
        detected_values = target_bbs[:, col]
        # values of successful detections
        vid_input[x_detects, col] = detected_values
        # numpy.interp(x, xp, fp): linear interpolation between detections; frames
        # before the first / after the last detection get that detection's value
        vid_input[x_to_interp, col] = np.interp(x_to_interp, x_detects, detected_values)

    # get first frame with information of target (used for input later)
    # (own index name, so that the video index `i` is not overwritten)
    start_frame = 0
    for row_idx, row in enumerate(vid_input[:, :6]):
        if np.isnan(row).any():
            start_frame = row_idx+1
        else:
            break


    # 4. Center: Center on the correct bounding boxes of the target fruit
    Center(folderpath+video, labelpath+video, vid_input[:, 2:6])
