# Video Model Inference on Jetson

In [1]:
import mxnet as mx
import gluoncv as gcv
import decord
# Report the installed version of every library this notebook relies on,
# so the captured output documents the environment it was run in.
for label, module in (('mxnet:', mx), ('gluoncv:', gcv), ('decord:', decord)):
    print(label, module.__version__)

mxnet: 1.6.0
gluoncv: 0.7.0
decord: 0.3.9


In [2]:
import matplotlib.pyplot as plt
import numpy as np
from mxnet import gluon, nd, image
from mxnet.gluon.data.vision import transforms
from gluoncv.data.transforms import video
from gluoncv import utils
from gluoncv.model_zoo import get_model

## Preprocessing

In [3]:
# Fetch the sample clip and open it with decord.
url = 'https://github.com/bryanyzhu/tiny-ucf101/raw/master/abseiling_k400.mp4'
video_fname = utils.download(url)
vr = decord.VideoReader(video_fname)

# Frame sampling over the first CLIP_SPAN frames of the video:
#   - a dense list (every FAST_STRIDE-th frame), followed by
#   - a sparse list (every SLOW_STRIDE-th frame).
# Both lists are concatenated into a single clip, dense frames first.
CLIP_SPAN = 64
FAST_STRIDE = 2
SLOW_STRIDE = 16
fast_frame_id_list = range(0, CLIP_SPAN, FAST_STRIDE)
slow_frame_id_list = range(0, CLIP_SPAN, SLOW_STRIDE)
frame_id_list = list(fast_frame_id_list) + list(slow_frame_id_list)
print('frame list:', frame_id_list)

# Fail early, with a readable message, when the video is too short for the
# requested frame indices instead of relying on the decoder's own error.
last_frame_id = max(frame_id_list)
if len(vr) <= last_frame_id:
    raise ValueError('video %s has only %d frames, but frame index %d is required'
                     % (video_fname, len(vr), last_frame_id))

# Decode all requested frames in one call; the first axis indexes the frames.
video_data = vr.get_batch(frame_id_list).asnumpy()
# Split the batch along its first axis into one array per sampled frame.
clip_input = list(video_data)
print('clip shape:', clip_input[0].shape, 'clip_length:', len(clip_input))

frame list: [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 0, 16, 32, 48]
clip shape: (256, 454, 3) clip_length: 36


In [4]:
# Spatial size handed to the validation transform; also used below to check
# the shape of the array that is fed to the network.
INPUT_SIZE = 224
transform_fn = video.VideoGroupValTransform(size=INPUT_SIZE, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

# Start from the untouched `video_data` (one entry per sampled frame) instead of
# feeding `clip_input` back into the transform: the cell then produces the same
# result every time it is executed, even after `clip_input` has been overwritten.
frames = transform_fn(list(video_data))

# Stack the per-frame arrays, add a leading batch axis, then swap axes 1 and 2
# so the frame axis comes after the channel axis.
clip_input = np.expand_dims(np.stack(frames, axis=0), axis=0)
clip_input = np.transpose(clip_input, (0, 2, 1, 3, 4))

# The frame count is taken from the data rather than hard-coded; the remaining
# dimensions are verified explicitly so a wrongly sized frame fails here.
expected_shape = (1, 3, len(video_data), INPUT_SIZE, INPUT_SIZE)
assert clip_input.shape == expected_shape, \
    'unexpected clip shape %s, expected %s' % (clip_input.shape, expected_shape)
print('Video data is downloaded and preprocessed.')

Video data is downloaded and preprocessed.


## Load model

In [5]:
# Look up the network by name in the GluonCV model zoo, requesting pretrained
# weights for 400 classes and placing the parameters on GPU 0.
model_name = 'slowfast_4x16_resnet50_kinetics400'
model_kwargs = dict(nclass=400, pretrained=True, ctx=mx.gpu(0))
net = get_model(model_name, **model_kwargs)
print('%s model is successfully loaded.' % model_name)

slowfast_4x16_resnet50_kinetics400 model is successfully loaded.


## Inference

In [6]:
# Forward pass: copy the preprocessed clip to GPU 0 and run the network on it.
pred = net(nd.array(clip_input, ctx=mx.gpu(0)))

classes = net.classes
topK = 5
# Indices of the topK highest scores for the first (only) clip in the batch.
ind = nd.topk(pred, k=topK)[0].astype('int')

# Compute the softmax once and copy both results to host memory in a single
# transfer each, instead of re-running softmax and synchronising with the
# device on every loop iteration.
probabilities = nd.softmax(pred)[0].asnumpy()
top_class_ids = ind.asnumpy()

print('The input video clip is classified to be')
for class_id in top_class_ids:
    print('\t[%s], with probability %.3f.'%
          (classes[class_id], probabilities[class_id]))

The input video clip is classified to be
	[abseiling], with probability 0.996.
	[rock_climbing], with probability 0.004.
	[ice_climbing], with probability 0.000.
	[paragliding], with probability 0.000.
	[climbing_a_rope], with probability 0.000.
