### Visualize EK (EPIC-Kitchens) predictions and annotations

In [1]:
from PIL import Image, ImageDraw
import numpy as np
import pose as P
from PIL import Image, ImageDraw, ImageEnhance

def draw_poses_on_image(img_path, all_poses_np, skeleton="hand", fading_scale = 25, return_trans=False, radius = 6, line_width = 5,dark_og=False, color='red'):
    """Draw a sequence of 2D poses over an image, fading them by pose index.

    All poses are drawn on one transparent RGBA layer which is then
    alpha-composited over the image. Poses are drawn from the last index to
    the first, so pose 0 ends up on top.

    Args:
        img_path: Path (or anything ``Image.open`` accepts) of the background image.
        all_poses_np: Array-like indexed as ``[pose, keypoint, (x, y)]``. The
            coordinates are multiplied by the image width/height, so they are
            presumably normalised to [0, 1].
        skeleton: ``"hand"``, ``"2hands"`` or ``"body"``, or a dict whose
            ``"edges"`` entry is a list of keypoint-index chains; consecutive
            indices in a chain are joined by a line.
        fading_scale: Alpha removed per pose index; pose ``i`` is drawn with
            alpha ``300 - i * fading_scale`` clamped to [0, 255].
        return_trans: If true, return a dict with the composited image and the
            bare overlay layer instead of a single RGB image.
        radius: Keypoint circle radius in pixels.
        line_width: Edge line width in pixels.
        dark_og: If true, halve the brightness of the background image first.
        color: ``'red'``, ``'blue'`` or an ``(r, g, b)`` tuple.

    Returns:
        An RGB image when ``return_trans`` is false; otherwise
        ``{"on_img": <RGBA composite>, "on_trans": <RGBA overlay layer>}``.

    Raises:
        ValueError: If ``skeleton`` is an unknown preset name or ``color`` is
            not a supported name / 3-tuple.
    """
    # Starting above 255 keeps the first poses fully opaque once the alpha is clamped.
    pose_init_alpha = 300

    if isinstance(skeleton, str):
        hand_labels = [
            "WRIST",
            "THUMB_CMC", "THUMB_MCP", "THUMB_IP", "THUMB_TIP",
            "INDEX_FINGER_MCP", "INDEX_FINGER_PIP", "INDEX_FINGER_DIP", "INDEX_FINGER_TIP",
            "MIDDLE_FINGER_MCP", "MIDDLE_FINGER_PIP", "MIDDLE_FINGER_DIP", "MIDDLE_FINGER_TIP",
            "RING_FINGER_MCP", "RING_FINGER_PIP", "RING_FINGER_DIP", "RING_FINGER_TIP",
            "PINKY_MCP", "PINKY_PIP", "PINKY_DIP", "PINKY_TIP",
        ]
        hand_edges = [[0, 1, 2, 3, 4], [0, 5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16], [0, 17, 18, 19, 20], [5, 9, 13, 17]]
        if skeleton == "hand":
            skeleton = {"labels": hand_labels, "edges": hand_edges}
        elif skeleton == "2hands":
            # The second hand reuses the same topology, shifted past the first hand's keypoints.
            offset = len(hand_labels)
            second_hand_edges = [[joint + offset for joint in chain] for chain in hand_edges]
            skeleton = {"labels": hand_labels * 2, "edges": hand_edges + second_hand_edges}
        elif skeleton == "body":
            skeleton = {
                "labels": ["nose", "right shoulder", "left shoulder", "right elbow", "left elbow", "right wrist", "left wrist", "right hip", "left hip", "right knee", "left knee", "right ankle", "left ankle"],
                "edges": [[5, 3, 1, 2, 4, 6], [11, 9, 7, 8, 10, 12]],
            }
        else:
            raise ValueError(f"unknown skeleton preset {skeleton!r}; expected 'hand', '2hands' or 'body'")

    named_rgb = {"red": (255, 0, 0), "blue": (0, 0, 255)}
    if isinstance(color, str):
        if color not in named_rgb:
            raise ValueError(f"unsupported color {color!r}; expected one of {sorted(named_rgb)} or an (r, g, b) tuple")
        rgb = named_rgb[color]
    else:
        rgb = tuple(int(channel) for channel in color)
        if len(rgb) != 3:
            raise ValueError(f"color tuple must have exactly 3 channels, got {len(rgb)}")

    def faded_fill(pose_idx):
        # Clamp explicitly so the fill is always a valid 8-bit RGBA value.
        alpha = int(pose_init_alpha - pose_idx * fading_scale)
        return rgb + (min(255, max(0, alpha)),)

    img = Image.open(img_path)
    if dark_og:
        img = ImageEnhance.Brightness(img).enhance(0.5)

    # All drawing happens on a fully transparent layer of the same size as the image.
    transparent_layer = Image.new("RGBA", img.size, (0, 0, 0, 0))
    draw = ImageDraw.Draw(transparent_layer)

    # Scale every keypoint to pixel coordinates once instead of once per edge.
    pixel_poses = np.asarray(all_poses_np, dtype=float) * [img.width, img.height]
    latest_first = range(len(pixel_poses) - 1, -1, -1)

    # Edges first, so the keypoint circles are painted over the lines.
    for chain in skeleton['edges']:
        for joint_a, joint_b in zip(chain, chain[1:]):
            for pose_idx in latest_first:
                xa, ya = pixel_poses[pose_idx, joint_a]
                xb, yb = pixel_poses[pose_idx, joint_b]
                draw.line((xa, ya, xb, yb), fill=faded_fill(pose_idx), width=line_width)

    for pose_idx in latest_first:
        fill = faded_fill(pose_idx)
        for x, y in pixel_poses[pose_idx]:
            draw.ellipse((x - radius, y - radius, x + radius, y + radius), fill=fill)

    # Blend the overlay with the (possibly darkened) image.
    img_with_drawn_poses = Image.alpha_composite(img.convert("RGBA"), transparent_layer)

    if not return_trans:
        return img_with_drawn_poses.convert("RGB")
    return {"on_img": img_with_drawn_poses, "on_trans": transparent_layer}

def interpolate_poses(start_pose, end_pose, N, noise_intensity = 0.005, scale_variation = 0.002):
    """Build a jittered linear interpolation between two poses.

    Args:
        start_pose: 3-D array; axes 1 and 2 give the size of one pose, and the
            leading axis is the one the result is concatenated along. Each
            blended pose is written into a single ``(shape[1], shape[2])``
            slot, so the leading axis is expected to have length 1.
        end_pose: Array with the same layout as ``start_pose``.
        N: Number of interpolated poses to generate. The blend factor runs
            from 0 (first) to 1 (last); with ``N == 1`` the single pose uses
            factor 0, and with ``N == 0`` nothing is inserted.
        noise_intensity: Standard deviation of the Gaussian noise added to
            every coordinate of every interpolated pose.
        scale_variation: Width of the uniform interval around 1 from which a
            per-pose scale factor is drawn.

    Returns:
        ``start_pose``, the ``N`` interpolated poses and ``end_pose``
        concatenated along axis 0.
    """
    interpolated_poses = np.zeros((N, start_pose.shape[1], start_pose.shape[2]))

    for i in range(N):
        # A single interpolated pose has no span to divide by; pin it to the start.
        t = i / (N - 1) if N > 1 else 0.0
        noise = np.random.randn(*start_pose.shape[1:]) * noise_intensity
        scale = 1 + (np.random.rand() - 0.5) * scale_variation
        interpolated_poses[i] = ((1 - t) * start_pose + t * end_pose + noise) * scale

    return np.concatenate((start_pose, interpolated_poses, end_pose), axis=0)

In [2]:
import torch
import os
import pandas as pd

# EPIC-KITCHENS-100 training annotation table; the visualisation loop below
# filters it on video_id / start_frame / stop_frame and reads the narration column.
ann = pd.read_csv("/z/dat/EpicKitchens/epic-kitchens-100-annotations/EPIC_100_train.csv")
display(ann)

# Experiment folder holding the saved validation outputs to visualise.
exp_folder = "/z/exp/lgpf/cog/z/home/yayuanli/Research/darpa_ptg/darpa_ptg_yayuan/ptg_research/exp12/outputs/experiments/exp_0059_202307031823"
# Both files are used as dicts below; the loop indexes `labels` with the keys of `predictions`.
predictions = torch.load(os.path.join(exp_folder, "outputs/best_predictions_val.pt"))
labels = torch.load(os.path.join(exp_folder, "outputs/labels_val.pt"))
print(labels.keys())
print(predictions.keys())


Unnamed: 0,narration_id,participant_id,video_id,narration_timestamp,start_timestamp,stop_timestamp,start_frame,stop_frame,narration,verb,verb_class,noun,noun_class,all_nouns,all_noun_classes
0,P01_01_0,P01,P01_01,00:00:01.089,00:00:00.14,00:00:03.37,8,202,open door,open,3,door,3,['door'],[3]
1,P01_01_1,P01,P01_01,00:00:02.629,00:00:04.37,00:00:06.17,262,370,turn on light,turn-on,6,light,114,['light'],[114]
2,P01_01_10,P01,P01_01,00:00:23.340,00:00:24.97,00:00:26.20,1498,1572,open drawer,open,3,drawer,8,['drawer'],[8]
3,P01_01_100,P01,P01_01,00:07:57.919,00:07:59.75,00:08:00.88,28785,28852,take cup,take,0,cup,13,['cup'],[13]
4,P01_01_101,P01,P01_01,00:08:00.020,00:08:01.47,00:08:02.21,28888,28932,open cupboard,open,3,cupboard,3,['cupboard'],[3]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67212,P37_103_71,P37,P37_103,00:06:16.903,00:06:17.17,00:06:17.67,18858,18883,turn off tap,turn-off,8,tap,0,['tap'],[0]
67213,P37_103_72,P37,P37_103,00:06:22.154,00:06:17.86,00:06:23.77,18893,19188,take pan,take,0,pan,5,['pan'],[5]
67214,P37_103_73,P37,P37_103,00:06:26.404,00:06:23.45,00:06:32.66,19172,19633,pour out boiled water,pour-out,9,water:boiled,27,['water:boiled'],[27]
67215,P37_103_8,P37,P37_103,00:00:41.151,00:00:40.57,00:00:44.19,2028,2209,debone chicken thighs,debone,30,thigh:chicken,57,['thigh:chicken'],[57]


dict_keys(['/z/dat/EpicKitchens50/EpicKitchens50_media_v000/videos/train/P02/P02_04/0000000304/full_scale.jpg', '/z/dat/EpicKitchens50/EpicKitchens50_media_v000/videos/train/P02/P02_04/0000000426/full_scale.jpg', '/z/dat/EpicKitchens50/EpicKitchens50_media_v000/videos/train/P02/P02_04/0000002265/full_scale.jpg', '/z/dat/EpicKitchens50/EpicKitchens50_media_v000/videos/train/P02/P02_04/0000014875/full_scale.jpg', '/z/dat/EpicKitchens50/EpicKitchens50_media_v000/videos/train/P02/P02_04/0000015110/full_scale.jpg', '/z/dat/EpicKitchens50/EpicKitchens50_media_v000/videos/train/P02/P02_04/0000015307/full_scale.jpg', '/z/dat/EpicKitchens50/EpicKitchens50_media_v000/videos/train/P02/P02_04/0000015439/full_scale.jpg', '/z/dat/EpicKitchens50/EpicKitchens50_media_v000/videos/train/P02/P02_04/0000015542/full_scale.jpg', '/z/dat/EpicKitchens50/EpicKitchens50_media_v000/videos/train/P02/P02_04/0000015590/full_scale.jpg', '/z/dat/EpicKitchens50/EpicKitchens50_media_v000/videos/train/P02/P02_04/0000015

In [None]:
import re

def replace_pattern(s):
    """Rewrite ``.../<digits>/full_scale.jpg`` into ``.../frame_<digits>.jpg``.

    Every occurrence of the pattern in ``s`` is rewritten; a string without
    the pattern is returned unchanged.
    """
    pattern = r'/(\d+)/full_scale\.jpg'
    # Keep the leading slash so the frame file stays inside its parent directory
    # instead of being glued onto the directory name.
    replacement = r'/frame_\1.jpg'

    return re.sub(pattern, replacement, s)


import time
import random
# Visit the prediction keys in random order.
k_list = list(predictions.keys())
random.shuffle(k_list)
# Path components (minus the last two) of the most recent sample that passed the skip check below.
prev_img_path = None
for key in k_list:
    prediction = predictions[key]
    label = labels[key]
    
    # get img_path from key and replace the pattern (they are on another machine now)
    img_path = key.replace("/z/dat/EpicKitchens50/EpicKitchens50_media_v000/videos", "/z/dat/EpicKitchens/EpicKitchens50_og/3h91syskeag572hl6tvuovwv4d/frames_rgb_flow/rgb")
    img_path = re.sub(r'/(\d+)/full_scale\.jpg', r'/frame_\1.jpg', img_path)
    
    # Skip a sample whose path prefix equals that of the previous one.
    # NOTE(review): after the rewrite above the path ends in <video_id>/frame_<n>.jpg,
    # so [:-2] also drops the video directory and this compares participant-level
    # prefixes -- confirm that is intended.
    if prev_img_path == img_path.split("/")[:-2]:
        continue
    prev_img_path = img_path.split("/")[:-2]
    
    # img_path = "/z/dat/EpicKitchens/EpicKitchens50_og/3h91syskeag572hl6tvuovwv4d/frames_rgb_flow/rgb/train/P07/P07_10/frame_0000000001.jpg"
    # Component 10 of the rewritten absolute path is the video directory
    # (P07_10 in the example above); this depends on the depth of the dataset root.
    video_id = img_path.split('/')[10]
    frame_number = int(re.search(r'frame_(\d+)\.jpg', img_path).group(1))
    # Annotation rows of this video whose [start_frame, stop_frame] range contains the frame.
    matching_rows = ann[(ann['video_id'] == video_id) & (ann['stop_frame'] >= frame_number) & (ann['start_frame'] <= frame_number)]
    
    # There is no good way to tell which narration the model was run on, so skip
    # frames that match zero or several annotation rows.
    if matching_rows.shape[0] != 1:
        continue
    narration = list(matching_rows["narration"])[0]
    # display(matching_rows)
    print(f"{img_path} {narration}")
    
    # time.sleep(1)
    # continue

    # img_path = "/z/dat/PennAction/Penn_Action_media_v000/frames/0638/0000000011/full_scale.jpg"
    # "/z/dat/EpicKitchens/EpicKitchens50_og/3h91syskeag572hl6tvuovwv4d/frames_rgb_flow/rgb/train/P07/P07_10"
    img = Image.open(img_path)
    img.show()
    # Give the frame an alpha channel (value 128) in place so it can be
    # alpha-composited with the RGBA pose layers below.
    img.putalpha(128)

    # Each tensor is reshaped to (30, keypoints, 2); only the transparent pose layer is kept.
    label_pose = draw_poses_on_image(img_path, label.numpy().reshape(30, -1, 2), skeleton="2hands", fading_scale = 13, radius = 1, line_width = 1, return_trans=True, color='red')["on_trans"]
    pred_pose = draw_poses_on_image(img_path, prediction.numpy().reshape(30, -1, 2), skeleton="2hands", fading_scale = 13, radius = 1, line_width = 1, return_trans=True, color='blue')["on_trans"]
    # Labels (red) go over predictions (blue), and both go over the frame.
    vis = Image.alpha_composite(img, Image.alpha_composite(pred_pose, label_pose))
    vis.show()
    # vis.save("../tmp/results/1_output.png")
    time.sleep(1)

## Penn Action

In [10]:
import torch
import os

# Experiment folder holding the saved Penn Action validation outputs.
exp_folder = "/z/exp/lgpf/workspace/ptg_research/exp15/outputs/experiments/exp_0010_202308092211"
# Both files are used as dicts keyed by image path in the loop below.
predictions = torch.load(os.path.join(exp_folder, "outputs/best_predictions_val.pt"))
labels = torch.load(os.path.join(exp_folder, "outputs/labels_val.pt"))
print(labels.keys())
import pandas as pd
# Per-video annotation table; the loop below reads its raw_narration column.
ann_pd = pd.read_csv(f"/z/dat/PennAction/Penn_Action_media_v000/vid_ann.csv")
# Last expression of the cell, so the notebook renders the table.
ann_pd

In [None]:
import time
import random
# Visit the prediction keys (image paths) in random order.
key_list = list(predictions.keys())
random.shuffle(key_list)
# Path components (minus the last two) of the most recently shown sample.
prev_img_path = None
for img_path in key_list:
    # Keys look like .../frames/<video>/<frame>/full_scale.jpg, so [:-2] is the
    # video directory: skip a sample from the same video as the previous one.
    if prev_img_path == img_path.split("/")[:-2]:
        continue
    prev_img_path = img_path.split("/")[:-2]
    # The video directory name is used as a 1-based number and mapped to row label <number> - 1 of ann_pd.
    print(f"{img_path} {ann_pd.loc[int(img_path.split('/')[-3])-1]['raw_narration']}")
    
    # img_path = "/z/dat/PennAction/Penn_Action_media_v000/frames/0638/0000000011/full_scale.jpg"
    img = Image.open(img_path)
    img.show()
    # Give the frame an alpha channel (value 128) in place so it can be
    # alpha-composited with the RGBA pose layers below.
    img.putalpha(128)


    # Each tensor is reshaped to (30, keypoints, 2); only the transparent pose layer is kept.
    label_pose = draw_poses_on_image(img_path, labels[img_path].numpy().reshape(30, -1, 2), skeleton="body", fading_scale = 13, radius = 2, line_width = 2, return_trans=True, color='red')["on_trans"]
    pred_pose = draw_poses_on_image(img_path, predictions[img_path].numpy().reshape(30, -1, 2), skeleton="body", fading_scale = 13, radius = 2, line_width = 2, return_trans=True, color='blue')["on_trans"]
    # Labels (red) go over predictions (blue), and both go over the frame.
    vis = Image.alpha_composite(img, Image.alpha_composite(pred_pose, label_pose))
    vis.show()
    # vis.save("../tmp/results/1_output.png")
    time.sleep(1)

## poses as a "video"

In [6]:
def draw_poses_on_trans(img_path, all_poses_np, skeleton="hand", fading_scale = 25, return_trans=True, radius = 6, line_width = 5, color='red'):
    """Draw every pose of a sequence on its own transparent layer.

    Unlike a single overlay, each pose gets a separate RGBA layer the size of
    the input image, so the layers can later be arranged as a "film strip".
    The input image itself is returned without anything drawn on it.

    Args:
        img_path: Path (or anything ``Image.open`` accepts) of the input image;
            it only provides the layer size and the returned ``"on_img"``.
        all_poses_np: Array-like indexed as ``[pose, keypoint, (x, y)]``. The
            coordinates are multiplied by the image width/height, so they are
            presumably normalised to [0, 1].
        skeleton: ``"hand"``, ``"2hands"`` or ``"body"``, or a dict whose
            ``"edges"`` entry is a list of keypoint-index chains; consecutive
            indices in a chain are joined by a line.
        fading_scale: Alpha removed per pose index; pose ``i`` is drawn with
            alpha ``300 - i * fading_scale`` clamped to [0, 255].
        return_trans: Accepted for signature compatibility; not used.
        radius: Keypoint circle radius in pixels.
        line_width: Edge line width in pixels.
        color: ``'red'``, ``'blue'`` or an ``(r, g, b)`` tuple.

    Returns:
        ``{"on_img": <input image as RGBA>, "on_trans": [<RGBA layer per pose>]}``.

    Raises:
        ValueError: If ``skeleton`` is an unknown preset name or ``color`` is
            not a supported name / 3-tuple.
    """
    # Starting above 255 keeps the first poses fully opaque once the alpha is clamped.
    pose_init_alpha = 300

    if isinstance(skeleton, str):
        hand_labels = [
            "WRIST",
            "THUMB_CMC", "THUMB_MCP", "THUMB_IP", "THUMB_TIP",
            "INDEX_FINGER_MCP", "INDEX_FINGER_PIP", "INDEX_FINGER_DIP", "INDEX_FINGER_TIP",
            "MIDDLE_FINGER_MCP", "MIDDLE_FINGER_PIP", "MIDDLE_FINGER_DIP", "MIDDLE_FINGER_TIP",
            "RING_FINGER_MCP", "RING_FINGER_PIP", "RING_FINGER_DIP", "RING_FINGER_TIP",
            "PINKY_MCP", "PINKY_PIP", "PINKY_DIP", "PINKY_TIP",
        ]
        hand_edges = [[0, 1, 2, 3, 4], [0, 5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16], [0, 17, 18, 19, 20], [5, 9, 13, 17]]
        if skeleton == "hand":
            # The single-hand preset carries one extra label after the 21 joints.
            skeleton = {"labels": hand_labels + ["MEAN_ALL"], "edges": hand_edges}
        elif skeleton == "2hands":
            # The second hand reuses the same topology, shifted past the first hand's keypoints.
            offset = len(hand_labels)
            second_hand_edges = [[joint + offset for joint in chain] for chain in hand_edges]
            skeleton = {"labels": hand_labels * 2, "edges": hand_edges + second_hand_edges}
        elif skeleton == "body":
            skeleton = {
                "labels": ["nose", "right shoulder", "left shoulder", "right elbow", "left elbow", "right wrist", "left wrist", "right hip", "left hip", "right knee", "left knee", "right ankle", "left ankle"],
                "edges": [[5, 3, 1, 2, 4, 6], [11, 9, 7, 8, 10, 12]],
            }
        else:
            raise ValueError(f"unknown skeleton preset {skeleton!r}; expected 'hand', '2hands' or 'body'")

    named_rgb = {"red": (255, 0, 0), "blue": (0, 0, 255)}
    if isinstance(color, str):
        if color not in named_rgb:
            raise ValueError(f"unsupported color {color!r}; expected one of {sorted(named_rgb)} or an (r, g, b) tuple")
        rgb = named_rgb[color]
    else:
        rgb = tuple(int(channel) for channel in color)
        if len(rgb) != 3:
            raise ValueError(f"color tuple must have exactly 3 channels, got {len(rgb)}")

    img = Image.open(img_path)

    # Scale every keypoint to pixel coordinates once.
    pixel_poses = np.asarray(all_poses_np, dtype=float) * [img.width, img.height]

    transparent_layer_list = []
    for pose_idx, pose_px in enumerate(pixel_poses):
        layer = Image.new("RGBA", img.size, (0, 0, 0, 0))
        pen = ImageDraw.Draw(layer)

        # Clamp explicitly so the fill is always a valid 8-bit RGBA value.
        alpha = int(pose_init_alpha - pose_idx * fading_scale)
        fill = rgb + (min(255, max(0, alpha)),)

        # Edges first, so the keypoint circles are painted over the lines.
        for chain in skeleton['edges']:
            for joint_a, joint_b in zip(chain, chain[1:]):
                xa, ya = pose_px[joint_a]
                xb, yb = pose_px[joint_b]
                pen.line((xa, ya, xb, yb), fill=fill, width=line_width)

        for x, y in pose_px:
            pen.ellipse((x - radius, y - radius, x + radius, y + radius), fill=fill)

        transparent_layer_list.append(layer)

    # The input frame is returned as-is (converted to RGBA); no pose is drawn on it.
    img_with_drawn_poses = img.convert("RGBA")
    return {"on_img": img_with_drawn_poses, "on_trans": transparent_layer_list}

def create_composite_image(images, overlap_rate):
    """Lay images out left to right on one transparent canvas, overlapping horizontally.

    Each image starts ``width - int(overlap_rate * width)`` pixels to the right
    of the previous one, where ``width`` is the previous image's width. The
    canvas is sized from the actual paste positions so the last image is never
    clipped by rounding.

    Args:
        images: Non-empty iterable of images. Each image is used as its own
            paste mask, so it needs an alpha channel (e.g. mode RGBA).
        overlap_rate: Fraction of an image's width covered by the next image
            (0.0 = no overlap).

    Returns:
        A new RGBA image containing all input images.

    Raises:
        ValueError: If ``images`` is empty.
    """
    images = list(images)
    if not images:
        raise ValueError("create_composite_image needs at least one image")

    # Left edge of every image, using the same per-image rounding as the paste step.
    offsets = []
    x_offset = 0
    for image in images:
        offsets.append(x_offset)
        x_offset += image.width - int(overlap_rate * image.width)

    # Size the canvas from the right-most edge that will actually be pasted.
    total_width = max(left + image.width for left, image in zip(offsets, images))
    max_height = max(image.height for image in images)

    composite_image = Image.new('RGBA', (total_width, max_height), (0,0,0,0))

    # Later images are pasted over earlier ones where they overlap.
    for left, image in zip(offsets, images):
        composite_image.paste(image, (left, 0), image)

    return composite_image

def input2movie(image1, image2):
    """Place two images side by side on a new RGBA canvas, ``image1`` on the left.

    The canvas is as wide as both images together and as tall as the taller
    one, so neither image is clipped when the heights differ. Both images are
    pasted at the top edge without a mask.
    """
    combined_width = image1.width + image2.width
    combined_height = max(image1.height, image2.height)
    new_image = Image.new('RGBA', (combined_width, combined_height))
    new_image.paste(image1, (0, 0))
    new_image.paste(image2, (image1.width, 0))
    
    return new_image

### Penn Action

In [27]:
import torch
import os

# Experiment folder holding the saved Penn Action validation outputs.
exp_folder = "/z/exp/lgpf/workspace/ptg_research/exp15/outputs/experiments/exp_0010_202308092211"
# Both files are used as dicts keyed by image path in the loop below.
predictions = torch.load(os.path.join(exp_folder, "outputs/best_predictions_val.pt"))
labels = torch.load(os.path.join(exp_folder, "outputs/labels_val.pt"))
print(labels.keys())
import pandas as pd
# Per-video annotation table; the loop below reads its raw_narration column.
ann_pd = pd.read_csv(f"/z/dat/PennAction/Penn_Action_media_v000/vid_ann.csv")
# Last expression of the cell, so the notebook renders the table.
ann_pd

dict_keys(['/z/dat/PennAction/Penn_Action_media_v000/frames/0010/0000000001/full_scale.jpg', '/z/dat/PennAction/Penn_Action_media_v000/frames/0010/0000000002/full_scale.jpg', '/z/dat/PennAction/Penn_Action_media_v000/frames/0010/0000000003/full_scale.jpg', '/z/dat/PennAction/Penn_Action_media_v000/frames/0010/0000000004/full_scale.jpg', '/z/dat/PennAction/Penn_Action_media_v000/frames/0010/0000000005/full_scale.jpg', '/z/dat/PennAction/Penn_Action_media_v000/frames/0010/0000000006/full_scale.jpg', '/z/dat/PennAction/Penn_Action_media_v000/frames/0010/0000000007/full_scale.jpg', '/z/dat/PennAction/Penn_Action_media_v000/frames/0010/0000000008/full_scale.jpg', '/z/dat/PennAction/Penn_Action_media_v000/frames/0010/0000000009/full_scale.jpg', '/z/dat/PennAction/Penn_Action_media_v000/frames/0010/0000000010/full_scale.jpg', '/z/dat/PennAction/Penn_Action_media_v000/frames/0010/0000000011/full_scale.jpg', '/z/dat/PennAction/Penn_Action_media_v000/frames/0010/0000000012/full_scale.jpg', '/z/d

Unnamed: 0.1,Unnamed: 0,total_frame_count,raw_narration,pose,video_dimensions,frames
0,1,151,baseball_pitch,back,"[360, 480, 151]",{}
1,2,80,baseball_pitch,back,"[270, 480, 80]",{}
2,3,85,baseball_pitch,right,"[270, 480, 85]",{}
3,4,82,baseball_pitch,front,"[270, 480, 82]",{}
4,5,48,baseball_pitch,front,"[270, 480, 48]",{}
...,...,...,...,...,...,...
2321,2322,80,tennis_serve,back,"[270, 480, 80]",{}
2322,2323,50,tennis_serve,front,"[270, 480, 50]",{}
2323,2324,71,tennis_serve,left,"[270, 480, 71]",{}
2324,2325,56,tennis_serve,right,"[270, 480, 56]",{}


In [None]:
import random
import glob
import pose as P
import numpy as np
import time
from PIL import Image, ImageDraw, ImageEnhance


# Visit the prediction keys (image paths) in random order.
key_list = list(predictions.keys())
random.shuffle(key_list)
# Path components (minus the last two) of the most recently shown sample.
prev_img_path = None
for img_path in key_list:
    # Keys look like .../frames/<video>/<frame>/full_scale.jpg, so [:-2] is the
    # video directory: skip a sample from the same video as the previous one.
    if prev_img_path == img_path.split("/")[:-2]:
        continue
    prev_img_path = img_path.split("/")[:-2]
    # The video directory name is used as a 1-based number and mapped to row label <number> - 1 of ann_pd.
    narration = ann_pd.loc[int(img_path.split('/')[-3])-1]['raw_narration']
    print(f"{img_path} {narration}")
    
    # Ground-truth poses (red): one transparent layer per pose, no fading (fading_scale = 0).
    drawn_img_return = draw_poses_on_trans(img_path, labels[img_path].numpy().reshape(30, -1, 2), skeleton="body", return_trans=True, fading_scale = 0, radius = 3, line_width = 3, color='red')
    images = drawn_img_return["on_trans"][0::1] # start at index 0, step 1: keeps every pose layer (raise the step to subsample)
    # Overlap rate (0.0 to 1.0, where 0.2 means 20% overlap)
    overlap_rate = 0.8 # change overlap rate w.r.t. the original image size
    # Build the film-strip composite (nothing is saved to disk here)
    composite_label = create_composite_image(images, overlap_rate)

    # Predicted poses (blue), laid out the same way.
    drawn_img_return = draw_poses_on_trans(img_path, predictions[img_path].numpy().reshape(30, -1, 2), skeleton="body", return_trans=True, fading_scale = 0, radius = 3, line_width = 3, color='blue')
    images = drawn_img_return["on_trans"][0::1] # start at index 0, step 1: keeps every pose layer (raise the step to subsample)
    # Overlap rate (0.0 to 1.0, where 0.2 means 20% overlap)
    overlap_rate = 0.8 # change overlap rate w.r.t. the original image size
    # Build the film-strip composite (nothing is saved to disk here)
    composite_pred = create_composite_image(images, overlap_rate)

    # Predictions are composited over the labels.
    composite = Image.alpha_composite(composite_label, composite_pred)

    # show input and poses movie together
    input_and_poses = input2movie(drawn_img_return["on_img"], composite)
    input_and_poses.show()
    
    time.sleep(1)

### F-PHAB

In [42]:
import torch
import os

# Experiment folder holding the saved F-PHAB validation outputs.
exp_folder = "/z/exp/lgpf/workspace/ptg_research/exp15/outputs/experiments/exp_0099_202308310404"
predictions = torch.load(os.path.join(exp_folder, "outputs/best_predictions_val.pt"))
labels = torch.load(os.path.join(exp_folder, "outputs/labels_val.pt"))
print(labels.keys())
# Materialise (key, value) pairs so the loop below can address samples by position.
predictions_list = list(predictions.items())
labels_list = list(labels.items())

dict_keys([('/z/dat/F-PHAB/F-PHAB_media_v000/Video_files/Subject_1/flip_pages/2/color/0000000001/full_scale.jpg', 'flip_pages', tensor([0.6213, 0.9536, 0.6320, 0.9246, 0.5693, 0.7838, 0.5358, 0.7086, 0.5164,
        0.6396, 0.6239, 0.7026, 0.5568, 0.6572, 0.5135, 0.6279, 0.4826, 0.6227,
        0.6080, 0.7481, 0.5392, 0.6904, 0.4938, 0.6722, 0.4579, 0.6610, 0.5966,
        0.7978, 0.5373, 0.7692, 0.4931, 0.7594, 0.4533, 0.7503, 0.5863, 0.8341,
        0.5527, 0.7978, 0.5336, 0.7772, 0.5041, 0.7529])), ('/z/dat/F-PHAB/F-PHAB_media_v000/Video_files/Subject_1/flip_pages/2/color/0000000002/full_scale.jpg', 'flip_pages', tensor([0.6210, 0.9542, 0.6318, 0.9250, 0.5693, 0.7843, 0.5359, 0.7091, 0.5165,
        0.6399, 0.6235, 0.7033, 0.5566, 0.6578, 0.5133, 0.6283, 0.4824, 0.6232,
        0.6076, 0.7489, 0.5393, 0.6904, 0.4936, 0.6732, 0.4577, 0.6618, 0.5961,
        0.7986, 0.5365, 0.7720, 0.4923, 0.7619, 0.4525, 0.7525, 0.5859, 0.8350,
        0.5527, 0.7979, 0.5340, 0.7769, 0.5045, 0.7524])

In [None]:
import time
import random
# Visit the samples in random order, addressing both item lists by position.
index_list = list(range(len(predictions)))
random.shuffle(index_list)
# Path components (minus the last two) of the most recently shown sample.
prev_img_path = None
for kv_index in index_list:
    key, prediction = predictions_list[kv_index]
    # NOTE(review): `key` is overwritten by the label key here; this assumes both
    # dicts list their items in the same order -- confirm.
    key, label = labels_list[kv_index]
    
    # Keys are tuples; element 0 is used as the image path and element 1 as the narration text.
    img_path = key[0]
    # Skip a sample whose path prefix (everything but the last two components)
    # equals that of the previous one.
    if prev_img_path == img_path.split("/")[:-2]:
        continue
    prev_img_path = img_path.split("/")[:-2]
    narration = key[1]
    print(f"{img_path} {narration}")
    
    # Reshape the flat tensors to (30, keypoints, 2).
    pose_label = label.numpy().reshape(30, -1, 2)
    pose_pred = prediction.numpy().reshape(30, -1, 2)

    # Ground-truth poses (red): one transparent layer per pose, no fading (fading_scale = 0).
    # NOTE(review): skeleton="body" is used although the section is F-PHAB and the
    # printed keys carry 42-value tensors, which looks like hand data -- confirm
    # the skeleton preset (and joint ordering) is the intended one.
    drawn_img_return = draw_poses_on_trans(img_path, pose_label, skeleton="body", return_trans=True, fading_scale = 0, radius = 3, line_width = 3, color='red')
    images = drawn_img_return["on_trans"][0::1] # start at index 0, step 1: keeps every pose layer (raise the step to subsample)
    # Overlap rate (0.0 to 1.0, where 0.2 means 20% overlap)
    overlap_rate = 0.8 # change overlap rate w.r.t. the original image size
    # Build the film-strip composite (nothing is saved to disk here)
    composite_label = create_composite_image(images, overlap_rate)

    # Predicted poses (blue), laid out the same way.
    drawn_img_return = draw_poses_on_trans(img_path, pose_pred, skeleton="body", return_trans=True, fading_scale = 0, radius = 3, line_width = 3, color='blue')
    images = drawn_img_return["on_trans"][0::1] # start at index 0, step 1: keeps every pose layer (raise the step to subsample)
    # Overlap rate (0.0 to 1.0, where 0.2 means 20% overlap)
    overlap_rate = 0.8 # change overlap rate w.r.t. the original image size
    # Build the film-strip composite (nothing is saved to disk here)
    composite_pred = create_composite_image(images, overlap_rate)

    # Predictions are composited over the labels.
    composite = Image.alpha_composite(composite_label, composite_pred)

    # show input and poses movie together
    input_and_poses = input2movie(drawn_img_return["on_img"], composite)
    input_and_poses.show()
    
    time.sleep(1)

### EK

In [3]:
import torch
import os
import pandas as pd

# EPIC-KITCHENS-100 training annotation table; the visualisation loop below
# filters it on video_id / start_frame / stop_frame and reads the narration column.
ann = pd.read_csv("/z/dat/EpicKitchens/epic-kitchens-100-annotations/EPIC_100_train.csv")
display(ann)

# Experiment folder holding the saved validation outputs to visualise.
exp_folder = "/z/exp/lgpf/cog/z/home/yayuanli/Research/darpa_ptg/darpa_ptg_yayuan/ptg_research/exp12/outputs/experiments/exp_0059_202307031823"
# Both files are used as dicts below; the loop indexes `labels` with the keys of `predictions`.
predictions = torch.load(os.path.join(exp_folder, "outputs/best_predictions_val.pt"))
labels = torch.load(os.path.join(exp_folder, "outputs/labels_val.pt"))
print(labels.keys())
print(predictions.keys())


Unnamed: 0,narration_id,participant_id,video_id,narration_timestamp,start_timestamp,stop_timestamp,start_frame,stop_frame,narration,verb,verb_class,noun,noun_class,all_nouns,all_noun_classes
0,P01_01_0,P01,P01_01,00:00:01.089,00:00:00.14,00:00:03.37,8,202,open door,open,3,door,3,['door'],[3]
1,P01_01_1,P01,P01_01,00:00:02.629,00:00:04.37,00:00:06.17,262,370,turn on light,turn-on,6,light,114,['light'],[114]
2,P01_01_10,P01,P01_01,00:00:23.340,00:00:24.97,00:00:26.20,1498,1572,open drawer,open,3,drawer,8,['drawer'],[8]
3,P01_01_100,P01,P01_01,00:07:57.919,00:07:59.75,00:08:00.88,28785,28852,take cup,take,0,cup,13,['cup'],[13]
4,P01_01_101,P01,P01_01,00:08:00.020,00:08:01.47,00:08:02.21,28888,28932,open cupboard,open,3,cupboard,3,['cupboard'],[3]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67212,P37_103_71,P37,P37_103,00:06:16.903,00:06:17.17,00:06:17.67,18858,18883,turn off tap,turn-off,8,tap,0,['tap'],[0]
67213,P37_103_72,P37,P37_103,00:06:22.154,00:06:17.86,00:06:23.77,18893,19188,take pan,take,0,pan,5,['pan'],[5]
67214,P37_103_73,P37,P37_103,00:06:26.404,00:06:23.45,00:06:32.66,19172,19633,pour out boiled water,pour-out,9,water:boiled,27,['water:boiled'],[27]
67215,P37_103_8,P37,P37_103,00:00:41.151,00:00:40.57,00:00:44.19,2028,2209,debone chicken thighs,debone,30,thigh:chicken,57,['thigh:chicken'],[57]


dict_keys(['/z/dat/EpicKitchens50/EpicKitchens50_media_v000/videos/train/P02/P02_04/0000000304/full_scale.jpg', '/z/dat/EpicKitchens50/EpicKitchens50_media_v000/videos/train/P02/P02_04/0000000426/full_scale.jpg', '/z/dat/EpicKitchens50/EpicKitchens50_media_v000/videos/train/P02/P02_04/0000002265/full_scale.jpg', '/z/dat/EpicKitchens50/EpicKitchens50_media_v000/videos/train/P02/P02_04/0000014875/full_scale.jpg', '/z/dat/EpicKitchens50/EpicKitchens50_media_v000/videos/train/P02/P02_04/0000015110/full_scale.jpg', '/z/dat/EpicKitchens50/EpicKitchens50_media_v000/videos/train/P02/P02_04/0000015307/full_scale.jpg', '/z/dat/EpicKitchens50/EpicKitchens50_media_v000/videos/train/P02/P02_04/0000015439/full_scale.jpg', '/z/dat/EpicKitchens50/EpicKitchens50_media_v000/videos/train/P02/P02_04/0000015542/full_scale.jpg', '/z/dat/EpicKitchens50/EpicKitchens50_media_v000/videos/train/P02/P02_04/0000015590/full_scale.jpg', '/z/dat/EpicKitchens50/EpicKitchens50_media_v000/videos/train/P02/P02_04/0000015

In [None]:
import re

def replace_pattern(s):
    """Rewrite ``.../<digits>/full_scale.jpg`` into ``.../frame_<digits>.jpg``.

    Every occurrence of the pattern in ``s`` is rewritten; a string without
    the pattern is returned unchanged.
    """
    pattern = r'/(\d+)/full_scale\.jpg'
    # Keep the leading slash so the frame file stays inside its parent directory
    # instead of being glued onto the directory name.
    replacement = r'/frame_\1.jpg'

    return re.sub(pattern, replacement, s)


import time
import random
# Visit the prediction keys in random order.
k_list = list(predictions.keys())
random.shuffle(k_list)
# Path components (minus the last two) of the most recent sample that passed the skip check below.
prev_img_path = None
for key in k_list:
    prediction = predictions[key]
    label = labels[key]
    
    # get img_path from key and replace the pattern (they are on another machine now)
    img_path = key.replace("/z/dat/EpicKitchens50/EpicKitchens50_media_v000/videos", "/z/dat/EpicKitchens/EpicKitchens50_og/3h91syskeag572hl6tvuovwv4d/frames_rgb_flow/rgb")
    img_path = re.sub(r'/(\d+)/full_scale\.jpg', r'/frame_\1.jpg', img_path)
    
    # Skip a sample whose path prefix equals that of the previous one.
    # NOTE(review): after the rewrite above the path ends in <video_id>/frame_<n>.jpg,
    # so [:-2] also drops the video directory and this compares participant-level
    # prefixes -- confirm that is intended.
    if prev_img_path == img_path.split("/")[:-2]:
        continue
    prev_img_path = img_path.split("/")[:-2]
    
    # img_path = "/z/dat/EpicKitchens/EpicKitchens50_og/3h91syskeag572hl6tvuovwv4d/frames_rgb_flow/rgb/train/P07/P07_10/frame_0000000001.jpg"
    # Component 10 of the rewritten absolute path is the video directory
    # (P07_10 in the example above); this depends on the depth of the dataset root.
    video_id = img_path.split('/')[10]
    frame_number = int(re.search(r'frame_(\d+)\.jpg', img_path).group(1))
    # Annotation rows of this video whose [start_frame, stop_frame] range contains the frame.
    matching_rows = ann[(ann['video_id'] == video_id) & (ann['stop_frame'] >= frame_number) & (ann['start_frame'] <= frame_number)]
    
    # There is no good way to tell which narration the model was run on, so skip
    # frames that match zero or several annotation rows.
    if matching_rows.shape[0] != 1:
        continue
    narration = list(matching_rows["narration"])[0]
    # display(matching_rows)
    print(f"{img_path} {narration}")
    
    # Reshape the flat tensors to (30, keypoints, 2).
    pose_label = label.numpy().reshape(30, -1, 2)
    pose_pred = prediction.numpy().reshape(30, -1, 2)
    
    # Overlap rate (0.0 to 1.0, where 0.2 means 20% overlap)
    overlap_rate = 0.4 # change overlap rate w.r.t. the original image size

    # Draw one type of figure: the pose sequence as a "movie" (film strip).
    # Ground-truth poses (red): one transparent layer per pose, no fading (fading_scale = 0).
    drawn_img_return = draw_poses_on_trans(img_path, pose_label, skeleton="2hands", return_trans=True, fading_scale = 0, radius = 1, line_width = 1, color='red')
    images = drawn_img_return["on_trans"][0::1] # start at index 0, step 1: keeps every pose layer (raise the step to subsample)
    # Build the film-strip composite (nothing is saved to disk here)
    composite_label = create_composite_image(images, overlap_rate)

    # Predicted poses (blue), laid out the same way.
    drawn_img_return = draw_poses_on_trans(img_path, pose_pred, skeleton="2hands", return_trans=True, fading_scale = 0, radius = 1, line_width = 1, color='blue')
    images = drawn_img_return["on_trans"][0::1] # start at index 0, step 1: keeps every pose layer (raise the step to subsample)
    # Build the film-strip composite (nothing is saved to disk here)
    composite_pred = create_composite_image(images, overlap_rate)

    # Predictions are composited over the labels.
    composite = Image.alpha_composite(composite_label, composite_pred)

    # show input and poses movie together
    input_and_poses = input2movie(drawn_img_return["on_img"], composite)
    input_and_poses.show()
    
    time.sleep(1)    