In [12]:
import torch
import numpy as np
import sys
sys.path.append('../')
import os
from tqdm.notebook import tqdm
from datasets import FineGym
from torchvision import datasets, transforms
from utils import augmentation
from utils.utils import denorm
from losses import compute_mask
from models import Model
from torch.utils import data
import torchvision
from IPython.display import display, Image

import os
import matplotlib.pyplot as plt 
import PIL
device_ids = [0, 1]
os.environ["CUDA_VISIBLE_DEVICES"]="4,5"
%matplotlib inline

In [13]:
import numbers
from torchvision.transforms import _transforms_video as transforms_video
from torchvision.transforms import _functional_video as F

class BottomCrop:
    def __init__(self, size, consistent=True):
        if isinstance(size, numbers.Number):
            self.size = (int(size), int(size))
        else:
            self.size = size

    def __call__(self, imgmap):
        img1 = imgmap[0]
        w, h = img1[0].shape
        th, tw = self.size
        x1 = int(round(w - tw) -30)
        y1 = int(round((h - th) / 2. + 20))
        return F.crop(imgmap, x1, y1, tw, th)

In [14]:
# transform = transforms.Compose([
#             BottomCrop(size=180, consistent=True),
#             augmentation.Scale([128, 128]),
#             augmentation.ToTensor(),
#             augmentation.Normalize()
#         ])

In [15]:
transform = transforms.Compose([
            augmentation.CenterCrop(size=128, consistent=True),
            augmentation.ToTensor(),
            augmentation.Normalize()
        ])

In [16]:
class Namespace:
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)
args = Namespace(hyperbolic=True, \
                 hyperbolic_version=1, \
                 network_feature='resnet18', \
                 distance='squared', \
                 early_action=True, \
                 early_action_self=True, \
                 dataset='finegym', \
                 pred_step=1, \
                 seq_len=5, \
                 num_seq=6, \
                 ds=3, \
                 img_dim=128, \
                 batch_size=32, \
                 fp16=True, \
                 fp64_hyper=True, \
                 use_labels=False, \
                 n_classes=307, \
                 linear_input='features', \
                 hierarchical_labels=False, \
                 action_level_gt=True, \
                 num_workers=16, \
                 cross_gpu_score=True, \
                 feature_dim=256, \
                 viz=True, \
                 not_track_running_stats=True, \
                 final_2dim=False, \
                 no_spatial=False
                 )

In [17]:
dataset = FineGym(mode='test',
                 path_dataset='/proj/vondrick/datasets/FineGym',
                 transform=transform,
                 seq_len=5,  # given duration distribution, we should aim for ~1.5 seconds (around 7-8 frames at 5 fps)
                 num_seq=6,
                 unit_test=False,
                 return_label=False,
                  hierarchical_label=False,
                  action_level_gt=False, return_idx=True)
dataloader = data.DataLoader(dataset,
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers,
                             pin_memory=True,
                             drop_last=True)

In [18]:
dataset[0][0]['vpath'], dataset[0][0]['idx_block']

TypeError: new(): invalid data type 'str'

In [19]:
model_path = '/proj/vondrick/didac/code/DPC/logs/log_earlyaction_linear_finegym_kinetics_fromfinetune_lr2/20201101_192504/model/model_best_epoch91.pth.tar'
# model_path = '/proj/vondrick/didac/code/DPC/logs/log_earlyaction_linear_finegym_64d_v2/20201112_113024/model/model_best_epoch94.pth.tar'

In [20]:
model = Model(args).cuda()
checkpoint = torch.load(model_path, map_location=torch.device('cpu'))
model.load_state_dict(checkpoint['state_dict'], strict=False)
model = torch.nn.DataParallel(model, device_ids=device_ids)
model.eval()

DataParallel(
  (module): Model(
    (backbone): ResNet2d3d_full(
      (conv1): Conv3d(3, 64, kernel_size=(1, 7, 7), stride=(1, 2, 2), padding=(0, 3, 3), bias=False)
      (bn1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1), dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): BasicBlock2d(
          (conv1): Conv3d(64, 64, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1), bias=False)
          (bn1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)
          (relu): ReLU(inplace=True)
          (conv2): Conv3d(64, 64, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1), bias=False)
          (bn2): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)
        )
        (1): BasicBlock2d(
          (conv1): Conv3d(64, 64, kernel_size=(1, 3, 

In [21]:
# video = 'rrrgsW--AE8'
video = 'Z2T9B4qExzk'
idx_list = []
for idx in range(len(dataset.idx2clipidx)):
    if video in dataset.idx2clipidx[idx]:
        idx_list.append(idx)

In [22]:
all_video = []
all_pred = []
all_feature = []
all_vpath = []
all_idx_block = []

In [23]:
with torch.no_grad():
    for idx, (input_dict, label) in tqdm(enumerate(dataloader), total=len(dataloader)):
        input_seq = input_dict['t_seq'].cuda()
        pred, feature, size = model(input_seq)
        pred = pred.reshape(args.batch_size, args.num_seq - args.pred_step, 16, args.feature_dim)[:, :, 5, :].detach().cpu()
        feature = feature[:, :, :, 1, 1].detach().cpu()
        all_pred.append(pred)
        all_feature.append(feature)
        all_video.append(input_seq.cpu())
        all_vpath.append(input_dict['vpath'])
#         all_idx_block.append(input_dict['idx_block'])
        del input_seq, pred, feature, size

HBox(children=(FloatProgress(value=0.0, max=104.0), HTML(value='')))




ValueError: too many values to unpack (expected 2)

In [None]:
pred = torch.stack(all_pred).reshape(-1, args.feature_dim)
feature = torch.stack(all_feature).reshape(-1, args.feature_dim)
videos = torch.stack(all_video)

In [None]:
pred = torch.cat(all_pred)

In [None]:
pred.shape

In [None]:
pred_norm = torch.norm(pred, dim=2)
uncertain_rank = pred_norm[:, -1].argsort(descending=False)

In [None]:
all_vpath_processed[0]

In [None]:
# permute axis in returned vpath [num_videos, num_seq]
all_vpath_processed = []
for batch_ind in range(len(all_vpath)):
    batch_list = []
    for vpath_ind in range(len(all_vpath[0][0])):
        step_list = []
        for step_ind in range(len(all_vpath[0])):
            step_list.append(all_vpath[batch_ind][step_ind][vpath_ind])
        batch_list.append(step_list)
    all_vpath_processed.extend(batch_list)

In [None]:
def plot_finegym(vpath, rank, index):
    images = []
    path_list = vpath[index]
    for i in range(len(path_list)):
        video, audio, info = torchvision.io.read_video(path_list[i], start_pts=0, end_pts=None, pts_unit='sec')
        for frame in video:
            if i == len(path_list) - 1:
                im = frame.numpy()
                im[:, :10] = np.array([100, 100, 255])
                im[:10, :] = np.array([100, 100, 255])
                im[:, -10:] = np.array([100, 100, 255])
                im[-10:, :] = np.array([100, 100, 255])
                images.append(PIL.Image.fromarray(im))
        else:
            for frame in video:
                images.append(PIL.Image.fromarray(frame.numpy()))
        gif_path = os.path.join('uncertain_gif', 'finegym', 'uncertain_rank_%d_index_%d.gif' % (rank, index))
        images[0].save(gif_path, format='GIF', append_images=images[1:], save_all=True, duration=50, loop=0)
    return images, gif_path

In [None]:
def display_gif(gif_path):
    with open(gif_path,'rb') as f:
        display(Image(width = 600, height = 400, data=f.read(), format='png'))

# save gifs

In [None]:
for i in tqdm(range(100)):
    rank = i
    images, gif_path = plot_finegym(all_vpath_processed, rank, uncertain_rank[rank])
#     display_gif(gif_path)
    print('rank: %d' % rank)
    print('index: %d' % uncertain_rank[rank])
    print('radius: %f' % pred_norm[uncertain_rank[rank], -1].item())

In [None]:
for i in tqdm(range(100)):
    rank = len(uncertain_rank) - i - 1
    images, gif_path = plot_finegym(all_vpath_processed, uncertain_rank[rank], rank)
#     display_gif(gif_path)
    print('rank: %d' % rank)
    print('index: %d' % uncertain_rank[rank])
    print('radius: %f' % pred_norm[uncertain_rank[rank], -1].item())