In [None]:
import os, sys, time, argparse, logging, math, gc, json

import numpy as np
import mxnet as mx
from mxnet import nd
from mxnet.gluon.data.vision import transforms
from gluoncv.data.transforms import video
from gluoncv.model_zoo import get_model
from gluoncv.data import VideoClsCustom
from gluoncv.utils.filesystem import try_import_decord

In [None]:
# Project-local helpers: experiment bootstrap and video utilities.
import utils.setup as setup
from utils.vid import viewer, gerador_clips

# Options of the single --experiment argument, kept together for readability.
experiment_option = dict(
    type=str,
    default='cfg/xdviol0.yml',
    help='relative path to experiment .yml',
)

# An explicit empty argv is parsed so that the kernel's own command line is
# ignored and the default experiment file is used.
parser = argparse.ArgumentParser(description='fext')
parser.add_argument('--experiment', **experiment_option)
args = parser.parse_args(args=[])

# `cfg` is the object every later cell reads its settings from.
cfg = setup.init(args)

# The logger helper is imported only after setup.init(args) has run
# (same order as before).
from utils.log import get_logger
logger = get_logger(__name__)

In [None]:
import glob, os

# Report every "*_copy" sub-directory of the dataset root together with the
# number of .mp4 files it contains.
# `data_dir` is used instead of `dir` so the builtin dir() is not shadowed.
data_dir = cfg.DATA_DIR

# Sorted so the listing is stable between runs (glob order is arbitrary).
sub_dirs = sorted(glob.glob(os.path.join(data_dir, "*_copy")))
print(sub_dirs)

for sub_dir in sub_dirs:
    # basename() works regardless of the platform's path separator.
    print(os.path.basename(sub_dir))
    vpaths = glob.glob(os.path.join(sub_dir, "*.mp4"))
    print(len(vpaths))

In [None]:
# gpu
# Adjust the garbage-collector thresholds for the three generations.
gc.set_threshold(100, 5, 5)

# set env
# A leading -1 in cfg.GPUID selects the CPU; otherwise one GPU context is
# created for every id listed in cfg.GPUID (so `context` is then a list).
if cfg.GPUID[0] != -1:
    context = [mx.gpu(gpu) for gpu in cfg.GPUID]
else:
    context = mx.cpu()
print("context", str(context))

In [None]:
# get data preprocess
# Per-channel statistics passed to the normalisation transform.
image_norm_mean = [0.485, 0.456, 0.406]
image_norm_std = [0.229, 0.224, 0.225]

# Choose the cropping strategy; ten-crop takes precedence over three-crop.
if cfg.TRANSFORM.TEN_CROP:
    crop_cls, crop_count = video.VideoTenCrop, 10
elif cfg.TRANSFORM.THREE_CROP:
    crop_cls, crop_count = video.VideoThreeCrop, 3
else:
    crop_cls, crop_count = None, 1

if crop_cls is not None:
    # Multi-crop evaluation: crop, convert to tensor layout, normalise.
    transform_test = transforms.Compose([
        crop_cls(cfg.TRANSFORM.INPUT_SIZE),
        video.VideoToTensor(),
        video.VideoNormalize(image_norm_mean, image_norm_std),
    ])
else:
    # Single-crop evaluation uses the bundled validation transform.
    transform_test = video.VideoGroupValTransform(
        size=cfg.TRANSFORM.INPUT_SIZE, mean=image_norm_mean, std=image_norm_std)

# Later cells read the number of crops from the config object.
cfg.DATA.NUM_CROP = crop_count

In [None]:
# get model
# When pretrained weights are requested and a hashtag is configured, the
# hashtag itself is what gets passed as `pretrained` to get_model.
if cfg.MODEL.USE_PRETRAINED and len(cfg.MODEL.HASHTAG) > 0:
    cfg.MODEL.USE_PRETRAINED = cfg.MODEL.HASHTAG
classes = cfg.MODEL.NUM_CLASSES
model_name = cfg.MODEL.NAME
net = get_model(name=model_name, nclass=classes, pretrained=cfg.MODEL.USE_PRETRAINED,
                feat_ext=True, num_segments=cfg.DATA.NUM_SEGMENTS, num_crop=cfg.DATA.NUM_CROP)
net.cast(cfg.MODEL.DTYPE)

# Put the parameters on the device(s) chosen earlier in `context`.
# (The previous code referenced `devices`, a name that is never defined.)
if cfg.MODEL.USE_PRETRAINED:
    # Weights are already loaded by get_model: only move them. Calling
    # initialize(force_reinit=True) here would replace them with fresh values.
    net.collect_params().reset_ctx(context)
else:
    # No pretrained weights: allocate the parameters so they can be filled by
    # load_parameters below.
    net.collect_params().initialize(force_reinit=True, ctx=context)

if cfg.MODEL.MODE == 'hybrid':
    net.hybridize(static_alloc=True, static_shape=True)
if cfg.MODEL.RESUME_PARAMS != '' and not cfg.MODEL.USE_PRETRAINED:
    net.load_parameters(cfg.MODEL.RESUME_PARAMS, ctx=context)
    logger.info('Pre-trained model %s is successfully loaded.' % (cfg.MODEL.RESUME_PARAMS))
else: logger.info('Pre-trained model is successfully loaded from the model zoo.')

logger.info("Successfully built model {}".format(model_name))


In [None]:
# get data
# Read the list file (one entry per line); the context manager guarantees the
# file handle is closed, which the previous bare open() never did.
with open(cfg.DATA_LIST, 'r') as f:
    data_list = f.readlines()
logger.info('Load %d video samples.' % len(data_list))

# build a pseudo dataset instance to use its children class methods
# (lazy_init=True is passed, and only its sampling/loader methods are called later)
video_utils = VideoClsCustom(root=cfg.DATA_DIR,
                            setting=cfg.DATA_LIST,
                            num_segments=cfg.DATA.NUM_SEGMENTS,
                            num_crop=cfg.DATA.NUM_CROP,
                            new_length=cfg.DATA.NEW_LENGTH,
                            new_step=cfg.DATA.NEW_STEP,
                            new_width=cfg.DATA.NEW_WIDTH,
                            new_height=cfg.DATA.NEW_HEIGHT,
                            video_loader=True,
                            use_decord=True,
                            slowfast=cfg.DATA.SLOWFAST,
                            slow_temporal_stride=cfg.DATA.SLOW_TEMPORAL_STRIDE,
                            fast_temporal_stride=cfg.DATA.FAST_TEMPORAL_STRIDE,
                            data_aug=cfg.DATA.DATA_AUG,
                            lazy_init=True)
'''    
root : str, required.
    Path to the root folder storing the dataset.
setting : str, required.
    A text file describing the dataset, one line per video sample. Each line has three items: (1) video path; (2) video length and (3) video label.
train : bool, default True.
    Whether to load the training or validation set.
test_mode : bool, default False.
    Whether to perform evaluation on the test set. Usually a three-crop or ten-crop evaluation strategy is involved.
name_pattern : str, default None.
    The naming pattern of the decoded video frames. For example, img_00012.jpg.
video_ext : str, default 'mp4'.
    If video_loader is set to True, please specify the video format accordingly.
is_color : bool, default True.
    Whether the loaded image is color or grayscale.
modality : str, default 'rgb'.
    Input modalities, we support only rgb video frames for now. Will add support for rgb difference image and optical flow image later.
num_segments : int, default 1.
    Number of segments to evenly divide the video into clips. A useful technique to obtain global video-level information.
    Limin Wang et al., Temporal Segment Networks: Towards Good Practices for Deep Action Recognition, ECCV 2016.
num_crop : int, default 1.
    Number of crops for each image. default is 1. Common choices are three crops and ten crops during evaluation.
new_length : int, default 1.
    The length of input video clip. Default is a single image, but it can be multiple video frames. For example, new_length=16 means we will extract a video clip of consecutive 16 frames.
new_step : int, default 1.
    Temporal sampling rate. For example, new_step=1 means we will extract a video clip of consecutive frames. new_step=2 means we will extract a video clip of every other frame.
new_width : int, default 340.
    Scale the width of loaded image to 'new_width' for later multiscale cropping and resizing.
new_height : int, default 256.
    Scale the height of loaded image to 'new_height' for later multiscale cropping and resizing.
target_width : int, default 224.
    Scale the width of transformed image to the same 'target_width' for batch forwarding.
target_height : int, default 224.
    Scale the height of transformed image to the same 'target_height' for batch forwarding.
temporal_jitter : bool, default False.
    Whether to temporally jitter if new_step > 1.
video_loader : bool, default False.
    Whether to use video loader to load data.
use_decord : bool, default True.
    Whether to use Decord video loader to load data. Otherwise use mmcv video loader.
transform : function, default None.
    A function that takes data and label and transforms them.
slowfast : bool, default False.
    If set to True, use the data loader designed for the SlowFast network. Christoph Feichtenhofer et al., SlowFast Networks for Video Recognition, ICCV 2019.
slow_temporal_stride : int, default 16.
    The temporal stride for sparse sampling of video frames in slow branch of a SlowFast network.
fast_temporal_stride : int, default 2.
    The temporal stride for sparse sampling of video frames in fast branch of a SlowFast network.
data_aug : str, default 'v1'.
    Selects the type of data augmentation. Supports v1, v2, v3 and v4.
lazy_init : bool, default False.
    If set to True, build a dataset instance without loading any dataset.
'''

In [None]:
def _video_TSN_decord_batch_loader(directory, video_reader, duration, indices, skip_offsets):
    """Collect the frame ids of every sampled segment and read them in one batch.

    Parameters
    ----------
    directory
        Identifier of the video; only used in the error message.
    video_reader
        Object providing ``get_batch(ids)`` whose result has ``asnumpy()``.
    duration
        Frame count used to keep the computed positions inside the video.
    indices
        Start position of each segment; each value is converted with ``int()``
        and 1 is subtracted when turning it into a frame id.
    skip_offsets
        Extra offset for each frame position inside a clip.

    Returns
    -------
    list
        One array per requested frame, in request order.

    Raises
    ------
    RuntimeError
        If reading the batch fails; the underlying error is chained as cause.
    """
    step = cfg.DATA.NEW_STEP
    skip_length = cfg.DATA.NEW_LENGTH * step  ## needed (frames)
    # Number of frames taken per segment (one per `step` inside skip_length).
    frames_per_clip = len(range(0, skip_length, step))

    frame_id_list = []
    for seg_ind in indices:
        offset = int(seg_ind)
        for i in range(frames_per_clip):
            if offset + skip_offsets[i] <= duration:
                frame_id = offset + skip_offsets[i] - 1
            else:
                # Offset would run past the end: fall back to the base position.
                frame_id = offset - 1
            frame_id_list.append(frame_id)
            # Advance only while the next position stays inside the video.
            if offset + step < duration:
                offset += step

    try:
        video_data = video_reader.get_batch(frame_id_list).asnumpy()
        sampled_list = [video_data[k, :, :, :] for k in range(len(frame_id_list))]
    except Exception as exc:
        # `Exception` (not a bare except) so KeyboardInterrupt/SystemExit are
        # not converted; `from exc` keeps the original failure visible.
        raise RuntimeError('Error occured in reading frames {} from video {} of duration {}.'.format(frame_id_list, directory, duration)) from exc
    return sampled_list

In [None]:
def _sample_test_indices(num_frames):
    """Compute segment start positions and per-frame skip offsets for testing.

    Reads NEW_LENGTH, NEW_STEP and NUM_SEGMENTS from the global ``cfg.DATA``.

    Parameters
    ----------
    num_frames
        Number of frames available in the video.

    Returns
    -------
    tuple
        ``(offsets + 1, skip_offsets)``: the segment positions shifted by one,
        and an array with one entry per frame of a clip (all zeros while
        temporal jitter is disabled, which is hard-coded here).
    """
    step = cfg.DATA.NEW_STEP
    n_segments = cfg.DATA.NUM_SEGMENTS
    clip_span = cfg.DATA.NEW_LENGTH * step
    jitter_enabled = False

    if num_frames > clip_span - 1:
        # Spread the segments evenly, each one centred inside its tick.
        tick = (num_frames - clip_span + 1) / float(n_segments)
        print('num_frames: {}, tick: {}'.format(num_frames, tick))
        starts = np.array([int(tick / 2.0 + tick * seg) for seg in range(n_segments)])
    else:
        # Video shorter than one clip: every segment starts at the beginning.
        starts = np.zeros((n_segments,))

    frames_per_clip = clip_span // step
    if jitter_enabled:
        skip_offsets = np.random.randint(step, size=frames_per_clip)
    else:
        skip_offsets = np.zeros(frames_per_clip, dtype=int)

    return starts + 1, skip_offsets


In [None]:
def read_data(cfg, video_name, transform, video_utils):
    """Decode one video, sample its test clips and return them as an NDArray.

    Parameters
    ----------
    cfg
        Configuration object; its DATA and TRANSFORM sections are read.
    video_name
        Path handed to the decord ``VideoReader``.
    transform
        Callable applied to the list of sampled frames.
    video_utils
        Object providing ``_sample_test_indices`` and the decord loader methods.

    Returns
    -------
    ``nd.array`` built from the clips after reshaping to
    ``(-1, frames, 3, size, size)`` and swapping axes 1 and 2; axis 2 is
    squeezed when ``cfg.DATA.NEW_LENGTH == 1``.

    Notes
    -----
    Prints intermediate shapes and calls ``viewer`` on the decoded frames.
    """
    decord = try_import_decord()
    reader = decord.VideoReader(video_name, width=cfg.DATA.NEW_WIDTH, height=cfg.DATA.NEW_HEIGHT)
    duration = len(reader)

    segment_indices, skip_offsets = video_utils._sample_test_indices(duration)
    print("segment_indices", segment_indices, "skip_offsets", skip_offsets, "duration", duration)

    # Sanity check: the notebook's own sampler must agree with the library one.
    local_indices, _ = _sample_test_indices(duration)
    assert np.allclose(local_indices, segment_indices)

    use_slowfast = cfg.DATA.SLOWFAST
    load_frames = (video_utils._video_TSN_decord_slowfast_loader if use_slowfast
                   else video_utils._video_TSN_decord_batch_loader)
    clip_input = load_frames(video_name, reader, duration, segment_indices, skip_offsets)
    print("decord out", np.shape(clip_input), type(clip_input))

    viewer(clip_input)

    clip_input = transform(clip_input)
    print("after transform", np.shape(clip_input))

    # Frames per clip: derived from the transformed sample count for SlowFast,
    # otherwise taken directly from the configuration.
    if use_slowfast:
        frames_per_clip = len(clip_input) // (cfg.DATA.NUM_SEGMENTS * cfg.DATA.NUM_CROP)
    else:
        frames_per_clip = cfg.DATA.NEW_LENGTH
    side = cfg.TRANSFORM.INPUT_SIZE

    stacked = np.stack(clip_input, axis=0)
    grouped = stacked.reshape((-1, frames_per_clip, 3, side, side))
    clip_input = np.transpose(grouped, (0, 2, 1, 3, 4))

    print("after reshape", np.shape(clip_input))

    if cfg.DATA.NEW_LENGTH == 1:
        # this is for 2D input case
        clip_input = np.squeeze(clip_input, axis=2)

    print("end", np.shape(clip_input))
    return nd.array(clip_input)

In [None]:
# Scratch list: multiples of 16 from 0 up to (but excluding) 1400.
t = [*range(0, 1400, 16)]

In [None]:
# Feature-extraction loop over the entries of `data_list`.
# NDArray.as_in_context expects one device, but `context` is a list when GPUs
# are configured; inference is therefore pinned to the first configured device.
infer_ctx = context[0] if isinstance(context, (list, tuple)) else context

start_time = time.time()
for vid, vline in enumerate(data_list):
    #if vid < 100 : continue

    # The first whitespace-separated field of a list entry is the video path.
    video_path = vline.split()[0]
    video_name = video_path.split('/')[-1]
    video_data = read_data(cfg, video_path, transform_test, video_utils)
    video_input = video_data.as_in_context(infer_ctx)
    video_feat = net(video_input.astype(cfg.MODEL.DTYPE, copy=False))

    if cfg.DEBUG:
        print(vid,'video_path', video_path)
        print('video_data', video_data.shape)
        print('video_input', video_input.shape)
        print('video_feat', video_feat.shape,"\n                                        ")

    #feat_file = '%s_%s_feat.npy' % (model_name, video_name)
    #np.save(os.path.join(cfg.SAVE_DIR, feat_file), video_feat.asnumpy())

    if vid > 0 and vid % cfg.LOG_INTERVAL == 0:
        logger.info('%04d/%04d is done' % (vid, len(data_list)))
    # Only the first two entries are processed (stop after index 1).
    if vid == 1 : break

end_time = time.time()
logger.info('Total feature extraction time is %4.2f minutes' % ((end_time - start_time) / 60))


#### SELF

In [None]:
# Imports and module-level objects used by the local `gerador_clips` definition below.
# NOTE(review): `gerador_clips` is also imported from `utils.vid` in other cells of
# this notebook; confirm which definition is meant to be the authoritative one.
import cv2 , numpy as np , os
from gluoncv.utils.filesystem import try_import_decord
from  utils.log import get_logger
logger = get_logger(__name__)

# Object returned by try_import_decord(); used below as `decord.VideoReader(...)`.
decord = try_import_decord()
def gerador_clips(video_path, cfg, transform, use_decord=True):
    '''
    Generator that walks a video and yields fixed-length clips.

    Every clip holds ``cfg.DATA.NEW_LENGTH`` frames. The frames are passed
    through ``transform``, stacked, reshaped to
    ``(-1, NEW_LENGTH, 3, INPUT_SIZE, INPUT_SIZE)`` and axes 1 and 2 are swapped.

    Parameters
    ----------
    video_path : path of the video to read.
    cfg : configuration object; DATA.NEW_LENGTH, DATA.NEW_STEP, DATA.NEW_WIDTH,
        DATA.NEW_HEIGHT and TRANSFORM.INPUT_SIZE are read.
    transform : callable applied to the frames of one clip.
    use_decord : read with decord when True, with cv2.VideoCapture otherwise.

    Yields
    ------
    decord branch : first the integer number of clips, then one array per clip.
        Frames are taken every ``cfg.DATA.NEW_STEP`` frames.
    cv2 branch : one array per clip, built from consecutive frames (BGR frames
        are converted to RGB). No clip count is yielded first.

    In both branches trailing frames that do not fill a whole clip are dropped.
    '''
    clip_length = cfg.DATA.NEW_LENGTH
    side = cfg.TRANSFORM.INPUT_SIZE

    def _to_clip_batch(frames):
        # Transform, stack and reorder the frames of one clip.
        frames = transform(frames)
        frames = np.stack(frames, axis=0)
        frames = frames.reshape((-1, clip_length, 3, side, side))
        return np.transpose(frames, (0, 2, 1, 3, 4))

    if not use_decord:
        vid = cv2.VideoCapture(video_path)
        try:
            fps = int(vid.get(cv2.CAP_PROP_FPS))
            tframes = int(vid.get(cv2.CAP_PROP_FRAME_COUNT))
            logger.info(f'{video_path}\n   {str(tframes)} frames | {str(fps)} fps')

            frames = []
            while True:
                success, frame = vid.read()
                if not success:
                    break
                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

                if len(frames) >= clip_length:
                    yield _to_clip_batch(frames)
                    frames = []
        finally:
            # Release the capture even if the consumer stops iterating early
            # or an error is raised while decoding/transforming.
            vid.release()

    else:
        vid = decord.VideoReader(video_path, width=cfg.DATA.NEW_WIDTH, height=cfg.DATA.NEW_HEIGHT)
        duration = len(vid)

        idx_vid = list(range(0, duration, cfg.DATA.NEW_STEP))
        segments = len(idx_vid) // clip_length
        logger.info(f'yielding {segments} segments from {len(idx_vid)} frames ({str(int(duration/cfg.DATA.NEW_STEP))})')
        yield segments

        for segment in range(segments):
            # Slice by the configured clip length; a hard-coded 32 here only
            # matched the segment count and reshape when NEW_LENGTH was 32.
            begin = segment * clip_length
            idx_batch = idx_vid[begin:begin + clip_length]
            yield _to_clip_batch(vid.get_batch(idx_batch).asnumpy())


In [None]:
# Use the project's clip generator and list helper.
import utils.list
from utils.vid import gerador_clips

# Re-read the data list through the project helper.
data_list = utils.list.get(cfg.DATA_LIST)

# The first whitespace-separated field of the first entry is the video path.
vp = data_list[0].split(maxsplit=1)[0]

# Create the clip generator for that video (last argument: use decord).
clips = gerador_clips(vp, cfg, transform_test, True)