In [1]:
import numpy as np
import tensorflow as tf
import pickle
import cv2
import os.path as path

import i3d

## Constants

In [42]:
_BATCH_SIZE = 2
_IMAGE_SIZE = 224
_VIDEO_FRAMES = 64
_SAMPLE_VIDEOS_PATH = '../../data/10vid'
_SAMPLE_CAPS_PATH = '../../data/msvd_video_caps.pkl'
_OUTPUT_PATH = '../../data/i3d'
_LABEL_MAP_PATH = 'data/label_map.txt'
_LABEL_MAP_PATH_600 = 'data/label_map_600.txt'

_CHECKPOINT_PATHS = {
    'rgb': 'data/checkpoints/rgb_scratch/model.ckpt',
    'rgb600': 'data/checkpoints/rgb_scratch_kin600/model.ckpt',
    'flow': 'data/checkpoints/flow_scratch/model.ckpt',
    'rgb_imagenet': 'data/checkpoints/rgb_imagenet/model.ckpt',
    'flow_imagenet': 'data/checkpoints/flow_imagenet/model.ckpt',
}

## Arguments

In [None]:
tf.flags.DEFINE_string('eval_type', 'joint', 'rgb, rgb600, flow, or joint')
tf.flags.DEFINE_boolean('imagenet_pretrained', True, '')
tf.logging.set_verbosity(tf.logging.INFO)

In [3]:
eval_type = 'rgb'
imagenet_pretrained = True

In [4]:
if eval_type == 'rgb600':
    kinetics_classes = [x.strip() for x in open(_LABEL_MAP_PATH_600)]
else:
    kinetics_classes = [x.strip() for x in open(_LABEL_MAP_PATH)]
    
NUM_CLASSES = 400
if eval_type == 'rgb600':
    NUM_CLASSES = 600

## Read video caps

In [None]:
with open(_SAMPLE_CAPS_PATH, 'rb') as f:
    np_data = pickle.load(f)
    filenames = np_data[:,0]
    filenames = np.unique(filenames)

In [9]:
filenames = np.array(['_UqnTFs1BLc_23_27.avi', 
                       '3zgEl-OLFKE_12_15.avi',
#                       '5L5MoemWC6g_10_13.avi', '-7KMZQEsJW4_205_208.avi',
#                       'BgoOihBb78w_38_40.avi', 'GWQTAe64m-0_91_94.avi',
#                       'nTasT5h0LEg_12_14.avi', 'QT8iCDc7NGU_18_23.avi',
#                       'SaOqf2d-y30_22_27.avi', 'UnWgz-mVMXU_2_6.avi'
                      ])

In [44]:
def _transform_frame(bgr_frame):
    rgb_frame = cv2.cvtColor(bgr_frame, cv2.COLOR_BGR2RGB)
    resize_rgb = cv2.resize(rgb_frame, dsize=(_IMAGE_SIZE, _IMAGE_SIZE), interpolation=cv2.INTER_LINEAR)
    # normalize to [0,1]
    norm_rgb = resize_rgb / 255.0
    # transform data using (mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])
    norm_rgb = (norm_rgb - [0.485, 0.456, 0.406]) / [0.229, 0.224, 0.225]
    return norm_rgb

def _read_video_function(filename):
    filename = filename.decode()
    file_path = path.join(_SAMPLE_VIDEOS_PATH, filename)
    frames = list()

    cap = cv2.VideoCapture(file_path)
    assert cap.isOpened(), 'Cannot open file {0}'.format(filename)

    # get total frames
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    
    # if total frames is less than desired, loop the video
    if total_frames <= _VIDEO_FRAMES:
        for i in range(_VIDEO_FRAMES):
            read_ok, bgr_frame = cap.read()
            if not read_ok:
                cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
                read_ok, bgr_frame = cap.read()

            transformed = _transform_frame(bgr_frame)
            frames.append(transformed)
    else: # randomly sample from video
        chosen_frames = np.random.choice(total_frames, _VIDEO_FRAMES, replace=False)
        chosen_frames.sort()
        for i in range(_VIDEO_FRAMES):
            cap.set(cv2.CAP_PROP_POS_FRAMES, chosen_frames[i])
            read_ok, bgr_frame = cap.read()

            assert read_ok, "cannot read frame {0}".format(chosen_frames[i])

            transformed = _transform_frame(bgr_frame)
            frames.append(transformed)

    cap.release()
        
    return np.array(frames, dtype=np.float32), filename[:-4]

In [48]:
tf.reset_default_graph()

filenames_placeholder = tf.placeholder(filenames.dtype, filenames.shape)
dataset = tf.data.Dataset.from_tensor_slices(filenames_placeholder)
dataset = dataset.map(lambda filename: tuple(tf.py_func(_read_video_function, 
                                                   [filename],
                                                   [tf.float32, filename.dtype])))
dataset = dataset.batch(_BATCH_SIZE)
iterator = dataset.make_initializable_iterator()
rgb_input, video_ids = iterator.get_next()
rgb_input.set_shape([_BATCH_SIZE, _SAMPLE_VIDEO_FRAMES, _IMAGE_SIZE, _IMAGE_SIZE, 3])

## Predicting model

In [46]:
with tf.variable_scope('RGB'):
    rgb_model = i3d.InceptionI3d(
      NUM_CLASSES, spatial_squeeze=True, final_endpoint='Logits')
    rgb_logits, _ = rgb_model(
      rgb_input, is_training=False, dropout_keep_prob=1.0)


rgb_variable_map = {}
for variable in tf.global_variables():
    if variable.name.split('/')[0] == 'RGB':
        if eval_type == 'rgb600':
            rgb_variable_map[variable.name.replace(':0', '')[len('RGB/inception_i3d/'):]] = variable
        else:
            rgb_variable_map[variable.name.replace(':0', '')] = variable

rgb_saver = tf.train.Saver(var_list=rgb_variable_map, reshape=True)

In [26]:
model_logits = rgb_logits
model_predictions = tf.nn.softmax(model_logits)

In [30]:
with tf.Session() as sess:
    feed_dict = {}
    if imagenet_pretrained:
        rgb_saver.restore(sess, _CHECKPOINT_PATHS['rgb_imagenet'])
    else:
        rgb_saver.restore(sess, _CHECKPOINT_PATHS[eval_type])
    tf.logging.info('RGB checkpoint restored')
    
    sess.run(iterator.initializer, feed_dict={filenames_placeholder: filenames})
    tf.logging.info('RGB data loaded')
    
    while True:
        try:
            out_logits, out_predictions, out_ids = sess.run(
                [model_logits, model_predictions, video_ids])
        except tf.errors.OutOfRangeError:
            break

        out_logits = out_logits[0]
        out_predictions = out_predictions[0]
        sorted_indices = np.argsort(out_predictions)[::-1]

        print('Norm of logits: %f' % np.linalg.norm(out_logits))
        print('\nTop classes and probabilities')
        for index in sorted_indices[:20]:
            print(out_predictions[index], out_logits[index], kinetics_classes[index])

INFO:tensorflow:Restoring parameters from data/checkpoints/rgb_imagenet/model.ckpt
INFO:tensorflow:RGB checkpoint restored
INFO:tensorflow:RGB data loaded
Norm of logits: 72.048782

Top classes and probabilities
0.7462608 13.621449 brushing teeth
0.062778056 11.14598 gargling
0.03862266 10.660213 crying
0.03817852 10.648647 balloon blowing
0.022206897 10.106777 sneezing
0.017703548 9.880139 baby waking up
0.011328001 9.433652 drinking
0.009080447 9.212498 brushing hair
0.0075212307 9.024104 yawning
0.0070843603 8.964264 eating spaghetti
0.006490159 8.876661 kissing
0.0038292082 8.349032 blowing nose
0.0031691713 8.159844 sniffing
0.0030865364 8.133424 laughing
0.0023274824 7.8511615 eating carrots
0.0020293007 7.7140656 washing hair
0.0019155912 7.6564007 eating cake
0.0017570484 7.5700097 taking a shower
0.001413717 7.3525968 carving pumpkin
0.0010630438 7.0675106 playing harmonica
Norm of logits: 59.631428

Top classes and probabilities
0.3584418 9.413069 cartwheeling
0.09580738 8.09

## Feature extraction model

In [49]:
with tf.variable_scope('RGB'):
    rgb_model = i3d.InceptionI3d(final_endpoint='Mixed_5c')
    rgb_features, _ = rgb_model(rgb_input, is_training=False)
    rgb_features = tf.nn.avg_pool3d(rgb_features, ksize=[1, 2, 7, 7, 1],
                             strides=[1, 1, 1, 1, 1], padding='VALID')
    rgb_features = tf.reduce_mean(rgb_features, axis=1)
    rgb_features = tf.squeeze(rgb_features, axis=[1, 2])


rgb_variable_map = {}
for variable in tf.global_variables():
    if variable.name.split('/')[0] == 'RGB':
        if eval_type == 'rgb600':
            rgb_variable_map[variable.name.replace(':0', '')[len('RGB/inception_i3d/'):]] = variable
        else:
            rgb_variable_map[variable.name.replace(':0', '')] = variable

rgb_saver = tf.train.Saver(var_list=rgb_variable_map, reshape=True)

In [50]:
n_files = len(filenames)
n_processed = 0
with tf.Session() as sess:
    if imagenet_pretrained:
        rgb_saver.restore(sess, _CHECKPOINT_PATHS['rgb_imagenet'])
    else:
        rgb_saver.restore(sess, _CHECKPOINT_PATHS[eval_type])
    tf.logging.info('RGB checkpoint restored')
    
    sess.run(iterator.initializer, feed_dict={filenames_placeholder: filenames})    
    while True:
        try:
            out_features, out_ids = sess.run([rgb_features, video_ids])
        except tf.errors.OutOfRangeError:
            break
            
        for i in range(len(out_ids)):
            np.save(path.join(_OUTPUT_PATH, out_ids[i].decode()), out_features[i], allow_pickle=False)
            
        n_processed += len(out_ids)
        tf.logging.info("processed {n_processed:d}/{n_files:d}".format(n_files=n_files, n_processed=n_processed))
        tf.logging.info("--> {0}".format(b', '.join(out_ids)))

INFO:tensorflow:Restoring parameters from data/checkpoints/rgb_imagenet/model.ckpt
INFO:tensorflow:RGB checkpoint restored
INFO:tensorflow:processed 2/2
INFO:tensorflow:--> b'_UqnTFs1BLc_23_27, 3zgEl-OLFKE_12_15'


In [27]:
a = np.random.choice(10, 10, replace=False)

In [28]:
a.sort()

In [29]:
a

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])