# Dataset demonstration

This notebook shows you how to read the data using GulpIO and visualise the frames (both RGB and optical flow U/V pairs).

In [15]:
from gulpio2 import GulpDirectory
from pathlib import Path
from moviepy.editor import ImageSequenceClip, clips_array

from collections import defaultdict

import pickle
import pandas as pd
from pathlib import Path

import torch as t
import numpy as np
from omegaconf import OmegaConf
from typing import Any, Dict, List, Sequence, Union

from systems import EpicActionRecognitionSystem

from utils.metrics import compute_metrics
from utils.actions import action_id_from_verb_noun
from scipy.special import softmax

In [16]:
# SETUP TORCH VARIABLES
device = t.device("cuda:0" if t.cuda.is_available() else "cpu")
dtype = t.float

In [17]:
def generate_model():
    # LOAD IN SAVED CHECKPOINT
    ckpt = t.load('../models/trn_rgb.ckpt', map_location='cpu')

    # CREATE CONFIG FROM CHECKPOINT
    cfg = OmegaConf.create(ckpt['hyper_parameters'])
    OmegaConf.set_struct(cfg, False)

    # SET GULP DIRECTORY
    cfg.data._root_gulp_dir = '/home/ts/C1-Action-Recognition-TSN-TRN-TSM/datasets/epic/gulp/rgb'

    # CREATE MODEL
    model = EpicActionRecognitionSystem(cfg)
    model.load_state_dict(ckpt['state_dict'])
    return model.model.to(device)

In [18]:
# SET GULP ROOT FOR gulpio2
gulp_root = Path.home()

# MODEL
model = generate_model()

# LABELS
labels_path = '../datasets/epic/labels/'
verb_classes = pd.read_csv(labels_path+'EPIC_100_verb_classes.csv', usecols=['key']).to_dict('index')
noun_classes = pd.read_csv(labels_path+'EPIC_100_noun_classes.csv', usecols=['key']).to_dict('index')

Multi-Scale Temporal Relation Network Module in use ['8-frame relation', '7-frame relation', '6-frame relation', '5-frame relation', '4-frame relation', '3-frame relation', '2-frame relation']


In [20]:
def narration_from_id(rgb_frames, rgb_meta, top_n=5):

    # RESHAPE INPUT DATA FOR RUNNING THROUGH MODEL AND MOVE TO GPU IF AVAIL
    rgb_frames = np.array(rgb_frames).transpose(0,3,1,2)
    rgb_frames = t.tensor(rgb_frames, device=device, dtype=dtype)
    rgb_frames = rgb_frames.unsqueeze(0)

    # RUN FRAMES THROUGH MODEL
    with t.no_grad():
        result = model(rgb_frames)
    
    scores = {
        'verb': result[:,:97].cpu().numpy(),
        'noun': result[:,97:].cpu().numpy(),
        'narration_id': rgb_meta['narration_id']
    }
    
    verb_top_n = scores['verb'][0].argsort()[::-1][:top_n]
    noun_top_n = scores['noun'][0].argsort()[::-1][:top_n]
    
    out = {
        rgb_meta['verb']: [verb_classes[key]['key'] for key in verb_top_n],
        rgb_meta['noun']: [noun_classes[key]['key'] for key in noun_top_n],
    }
    
    return out

In [21]:
# SELECT SINGLE CLIP TO PROCESS (uncomment for full list of ids)
# rgb_train.merged_meta_dict.keys()
clip_id = 'P01_01_96'

In [22]:
# LOAD GULPED DATA
rgb_train = GulpDirectory('../datasets/epic/gulp/rgb/rgb_test/')
# flow_train = GulpDirectory('../datasets/epic/gulp/flow_train/')#GulpDirectory('/home/will/P01_flow_gulp/')

In [23]:
# LOAD FRAMES AND META FOR SELECTED VIDEO
rgb_frames, rgb_meta = rgb_train[clip_id]
# flow_frames, flow_meta = flow_train[clip_id]

In [24]:
with t.no_grad():
    out = model.model.features(rgb_frames)
    out = model.model.new_fc(out)
    out.shape

ModuleAttributeError: 'MTRN' object has no attribute 'model'

In [25]:
# RGB META
rgb_meta

{'narration_id': 'P01_01_96',
 'participant_id': 'P01',
 'video_id': 'P01_01',
 'narration_timestamp': '00:07:46.520',
 'start_timestamp': '00:07:49.14',
 'stop_timestamp': '00:07:50.32',
 'start_frame': 28148,
 'stop_frame': 28219,
 'narration': 'take glass',
 'verb': 'take',
 'verb_class': 0,
 'noun': 'glass',
 'noun_class': 10,
 'all_nouns': ['glass'],
 'all_noun_classes': [10],
 'frame_size': [256, 456, 3],
 'num_frames': 72}

In [12]:
def display_rgb(rgb_frames, fps=50):
    return ImageSequenceClip(rgb_frames, fps=fps)

In [None]:
def display_flow(flow_frames, fps=50):
    u_frames = flow_frames[::2]
    v_frames = flow_frames[1::2]
    
    def flow_to_clip(flow):
        # Convert optical flow magnitude to greyscale RGB
        return ImageSequenceClip(list(np.stack([flow] * 3, axis=-1)), fps=fps)
    
    u_clip = flow_to_clip(u_frames) 
    v_clip = flow_to_clip(v_frames) 
    return clips_array([[u_clip, v_clip]])

In [13]:
# clips_array([[display_rgb(rgb_frames), display_flow(flow_frames)]]).ipython_display()
display_rgb(rgb_frames).ipython_display()

                                                   

Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4

Moviepy - Done !
Moviepy - video ready __temp__.mp4




In [26]:
narration_from_id(rgb_frames, rgb_meta, top_n=10)

{'take': ['take',
  'wash',
  'put',
  'move',
  'turn-on',
  'open',
  'close',
  'insert',
  'throw',
  'turn-off'],
 'glass': ['plate',
  'bin',
  'sponge',
  'tap',
  'top',
  'board:chopping',
  'colander',
  'brush',
  'pan',
  'tray']}

In [51]:
clip_id = 'P01_01_90'

In [52]:
rgb_frames, rgb_meta = rgb_train[clip_id]

narration_from_id(rgb_frames, rgb_meta, top_n=10)

{'turn-off': ['put',
  'take',
  'wash',
  'pour',
  'insert',
  'throw',
  'open',
  'move',
  'close',
  'turn-on'],
 'tap': ['lid',
  'bowl',
  'top',
  'rubbish',
  'cupboard',
  'bag',
  'bottle',
  'plate',
  'food',
  'board:chopping']}

In [None]:
flow_frames, flow_meta = flow_train[clip_id]

In [None]:
clips_array([[display_rgb(rgb_frames, fps=60), display_flow(flow_frames, fps=30)]]).ipython_display()

In [None]:
rgb_meta