In [1]:
# %load reader.py
import numpy as np
import skvideo.io
import skimage.transform
import csv
import collections
import os

def readShortVideo(video_path, video_category, video_name, downsample_factor=12, rescale_factor=1):
    '''
    Read one video from disk and return a temporally sub-sampled stack of frames.

    @param video_path: video directory
    @param video_category: video category (see csv files); used as a sub-directory of video_path
    @param video_name: video name (unique, see csv files); matched as a file-name prefix
    @param downsample_factor: number of frames between each sampled frame (e.g., downsample_factor = 12 equals 2fps)
    @param rescale_factor: float of scale factor (rescale the image if you want to reduce computations)

    @return: (T, H, W, 3) ndarray of uint8, T indicates total sampled frames, H and W is heights and widths
    @raise FileNotFoundError: if no file in the category directory starts with video_name
    '''

    filepath = os.path.join(video_path, video_category)
    # Sort so the chosen file does not depend on the directory listing order
    # when several entries share the same prefix.
    matches = sorted(entry for entry in os.listdir(filepath) if entry.startswith(video_name))
    if not matches:
        raise FileNotFoundError(
            "no video starting with %r found in %r" % (video_name, filepath))
    video = os.path.join(filepath, matches[0])

    frames = []
    for frameIdx, frame in enumerate(skvideo.io.vreader(video)):
        # keep only every downsample_factor-th frame
        if frameIdx % downsample_factor != 0:
            continue
        # NOTE(review): rescale is called without an explicit channel argument;
        # how the colour axis is handled may depend on the installed
        # scikit-image version -- confirm against the version in use.
        frame = skimage.transform.rescale(frame, rescale_factor, mode='constant', preserve_range=True).astype(np.uint8)
        frames.append(frame)

    return np.array(frames).astype(np.uint8)


def getVideoList(data_path):
    '''
    Read a ground-truth csv file into per-column lists.

    @param data_path: ground-truth file path (csv files)

    @return: ordered dictionary mapping each csv column name to the list of its
             string values in row order, with the columns sorted by name
             (expected columns: 'Action_labels', 'Nouns', 'End_times',
             'Start_times', 'Video_category', 'Video_index', 'Video_name')
    '''
    columns = {}

    # newline='' is what the csv module expects: it leaves newline handling to
    # the reader so line breaks inside quoted fields are preserved.
    with open(data_path, newline='') as f:
        for row in csv.DictReader(f):
            for column, value in row.items():
                columns.setdefault(column, []).append(value)

    return collections.OrderedDict(sorted(columns.items()))


In [2]:
import numpy as np
import pandas as pd

In [76]:
import os
import time
import numpy as np
import tensorflow as tf

class Extractor:
    """VGG-16 convolutional feature extractor on the TensorFlow 1.x graph API.

    ``build`` wires conv1_1 ... pool5 with weights looked up by layer name in
    ``self.data_dict`` and exposes the flattened pool5 activations as
    ``self.output``.

    Attributes set by ``build``:
        rgb_input: float32 placeholder of shape [None, H, W, C] for RGB frames.
        x:         BGR, mean-subtracted tensor derived from ``rgb_input``.
                   Feeding ``x`` directly replaces that derived value, i.e. it
                   skips the channel swap and mean subtraction.
        is_train:  bool placeholder consumed by ``dropout_layer``.
        data_dict: layer name -> [weights, biases] of the pre-trained network.
        output:    flattened pool5 activations.
    """

    def __init__(self, shape=(224, 224, 3)):
        """
        :param shape: (height, width, channels) of the input frames
        """
        # input information
        self.H, self.W, self.C = shape

        # parameter dictionary: layer name -> [filter variable, bias variable]
        self.para_dict = dict()

    def build(self, vgg16_npy_path):
        """
        load pre-trained weights from path and build the graph up to pool5
        :param vgg16_npy_path: file path of vgg16 pre-trained weights, or an
                               already loaded dict of those weights
        """

        # input placeholder (kept on the instance so callers can feed RGB
        # frames and still get the preprocessing below)
        rgb_input = tf.placeholder(tf.float32, [None, self.H, self.W, self.C])
        self.rgb_input = rgb_input
        self.is_train = tf.placeholder(tf.bool)

        # Convert RGB to BGR and subtract the per-channel means
        red, green, blue = tf.split(axis=3, num_or_size_splits=3, value=rgb_input)
        self.x = tf.concat(axis=3,
                           values=[blue - 103.939,
                                   green - 116.779,
                                   red - 123.68,])

        assert self.x.get_shape().as_list()[1:] == [self.H, self.W, self.C]

        # load pre-trained weights
        if isinstance(vgg16_npy_path, dict):
            self.data_dict = vgg16_npy_path
            print("parameters loaded")
        else:
            # The file stores a pickled dict, which recent numpy versions only
            # load with allow_pickle=True. Unpickling can execute arbitrary
            # code: only load weight files from a trusted source.
            self.data_dict = np.load(vgg16_npy_path, encoding='latin1', allow_pickle=True).item()
            print("npy file loaded")

        ### pre-trained VGG-16 start ###
        conv1_1 = self.conv_layer( self.x, "conv1_1")
        conv1_2 = self.conv_layer(conv1_1, "conv1_2")
        pool1 = self.max_pool_layer(conv1_2, "pool1")

        conv2_1 = self.conv_layer(  pool1, "conv2_1")
        conv2_2 = self.conv_layer(conv2_1, "conv2_2")
        pool2 = self.max_pool_layer(conv2_2, "pool2")

        conv3_1 = self.conv_layer(  pool2, "conv3_1")
        conv3_2 = self.conv_layer(conv3_1, "conv3_2")
        conv3_3 = self.conv_layer(conv3_2, "conv3_3")
        pool3 = self.max_pool_layer(conv3_3, "pool3")

        conv4_1 = self.conv_layer(  pool3, "conv4_1")
        conv4_2 = self.conv_layer(conv4_1, "conv4_2")
        conv4_3 = self.conv_layer(conv4_2, "conv4_3")
        pool4   = self.max_pool_layer(conv4_3, "pool4")

        conv5_1 = self.conv_layer(  pool4, "conv5_1")
        conv5_2 = self.conv_layer(conv5_1, "conv5_2")
        conv5_3 = self.conv_layer(conv5_2, "conv5_3")
        pool5 = self.max_pool_layer(conv5_3, "pool5")
        ### pre-trained VGG-16 end ###

        flatten = self.flatten_layer(pool5)
        self.output = flatten

    def avg_pool_layer(self, bottom, name):
        """2x2 average pooling with stride 2 and SAME padding."""
        return tf.nn.avg_pool(bottom, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME', name=name)

    def max_pool_layer(self, bottom, name):
        """2x2 max pooling with stride 2 and SAME padding."""
        return tf.nn.max_pool(bottom, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME', name=name)

    def dropout_layer(self, bottom, keep_prob):
        """Apply dropout only when the ``is_train`` placeholder is fed True.

        ``is_train`` is a graph tensor, so the branch has to be chosen inside
        the graph; a Python ``==`` against True is decided once at build time
        and never selects the dropout branch.
        """
        return tf.cond(self.is_train,
                       lambda: tf.nn.dropout(bottom, keep_prob=keep_prob),
                       lambda: bottom)

    def trans_conv_layer(self, bottom, output_shape, stride, name=None, shape=None):
        """Transposed convolution followed by a bias add.

        :param bottom: input tensor
        :param output_shape: shape passed to tf.nn.conv2d_transpose
        :param stride: spatial stride used for both height and width
        :param name: layer name; selects pre-trained weights when shape is None
        :param shape: filter shape for a freshly initialised layer
        :raises ValueError: if shape is None and name is not a pre-trained layer
        """
        if shape is None and name not in self.data_dict:
            raise ValueError("no shape given and no pre-trained layer called %s" % name)

        with tf.variable_scope("VGG16", reuse=tf.AUTO_REUSE):
            if shape is not None:
                conv_filter = self.get_conv_filter(shape=shape, name=name, with_bn=False)
                conv_bias = self.get_bias(shape=shape[2], name=name)
            else:
                conv_filter = self.get_conv_filter(name=name, with_bn=False)
                conv_bias = self.get_bias(name=name)

        self.para_dict[name] = [conv_filter, conv_bias]

        conv = tf.nn.conv2d_transpose(bottom, conv_filter, output_shape, strides=[1, stride, stride, 1], padding="SAME")
        conv = tf.nn.bias_add(conv, conv_bias)

        return conv

    def conv_layer(self, bottom, name=None, shape=None):
        """Stride-1 SAME convolution + bias + ReLU using pre-trained weights.

        :param bottom: input tensor
        :param name: name of the pre-trained layer in ``data_dict``
        :param shape: unused; kept so the signature stays compatible
        :raises ValueError: if name is not a pre-trained layer
        """
        # Fail early with a clear message instead of hitting unbound locals.
        if name not in self.data_dict:
            raise ValueError("no pre-trained layer called %s" % name)

        with tf.variable_scope("VGG16", reuse=tf.AUTO_REUSE):
            conv_filter = self.get_conv_filter(name=name)
            conv_bias = self.get_bias(name=name)

        self.para_dict[name] = [conv_filter, conv_bias]

        conv = tf.nn.conv2d(bottom, conv_filter, [1, 1, 1, 1], padding='SAME')
        conv = tf.nn.bias_add(conv, conv_bias)
        relu = tf.nn.relu(conv)
        return relu

    def flatten_layer(self, bottom):
        """Reshape ``bottom`` to [-1, product of its non-batch dimensions]."""
        shape = bottom.get_shape().as_list()
        dim = 1
        for d in shape[1:]:
            dim *= d
        x = tf.reshape(bottom, [-1, dim])
        return x

    def dense_layer(self, bottom, name=None):
        """Fully connected layer using pre-trained weights.

        :param bottom: 2-D input tensor
        :param name: name of the pre-trained layer in ``data_dict``
        :raises ValueError: if name is not a pre-trained layer
        """
        if name not in self.data_dict:
            raise ValueError("no pre-trained layer called %s" % name)

        # AUTO_REUSE (as in conv_layer) so the variables can be created on the
        # first call; the helpers append the "_W" / "_b" suffixes themselves.
        with tf.variable_scope("VGG16", reuse=tf.AUTO_REUSE):
            weights = self.get_fc_weight(name=name)
            biases = self.get_bias(name=name)

        # Fully connected layer: matmul followed by a bias add.
        fc = tf.nn.bias_add(tf.matmul(bottom, weights), biases)
        return fc

    def get_conv_filter(self, shape=None, name=None, with_bn=True):
        """Return the filter variable ``name + "_W"``.

        A given shape creates a truncated-normal initialised variable;
        otherwise the pre-trained filter is used. Prints a message and returns
        None when neither is available. ``with_bn`` is unused.
        """
        if shape is not None:
            conv_filter = tf.get_variable(shape=shape, initializer=tf.truncated_normal_initializer(mean=0, stddev=0.1), name=name+"_W", dtype=tf.float32)
            return conv_filter
        elif name in self.data_dict:
            conv_filter = tf.get_variable(initializer=self.data_dict[name][0], name=name+"_W")
            return conv_filter
        else:
            print("Neither give a shape nor lack a pre-trained layer called %s" % name)
            return None

    def get_bias(self, name=None, shape=None):
        """Return the bias variable ``name + "_b"`` (same rules as get_conv_filter)."""
        if shape is not None:
            return tf.get_variable(shape=shape, initializer=tf.truncated_normal_initializer(mean=0, stddev=0.1), name=name+"_b", dtype=tf.float32)
        elif name in self.data_dict:
            return tf.get_variable(initializer=self.data_dict[name][1], name=name+"_b")
        else:
            print("(get_bias) neither give a shape nor lack a pre-trained layer called %s" % name)
            return None

    def get_fc_weight(self, name, shape=None):
        """Return the dense weight variable ``name + "_W"`` (same rules as get_conv_filter)."""
        if shape is not None:
            return tf.get_variable(shape=shape, initializer=tf.truncated_normal_initializer(mean=0, stddev=0.1), name=name+"_W", dtype=tf.float32)
        elif name in self.data_dict:
            return tf.get_variable(initializer=self.data_dict[name][0], name=name+"_W")
        else:
            print("(get weight) neither give a shape nor lack a pre-trained layer called %s" % name)
            return None

In [None]:
def one_hot_encoding(arr, num_classes):
    """Turn integer class labels into a one-hot matrix.

    :param arr: array-like of integer labels; it is flattened, so an (N, 1)
                column is treated like a length-N vector
    :param num_classes: number of columns of the result
    :return: float ndarray of shape (number of labels, num_classes) with a
             single 1 per row
    :raises IndexError: if a label is outside [0, num_classes)
    """
    labels = np.asarray(arr).ravel()
    res = np.zeros((labels.size, num_classes))
    if labels.size == 0:
        return res
    # Negative indices would silently wrap around to the last classes.
    if labels.min() < 0 or labels.max() >= num_classes:
        raise IndexError("labels must lie in [0, %d)" % num_classes)
    res[np.arange(labels.size), labels] = 1
    return res

In [6]:
# Root of the trimmed videos, plus one sub-directory per data split.
video_path = "HW5_data/TrimmedVideos/video/"
train_video_path, valid_video_path = (video_path + split for split in ("train/", "valid/"))

In [117]:
# Ground-truth tables (column name -> list of values) for both splits.
train_list, valid_list = map(
    getVideoList,
    ("HW5_data/TrimmedVideos/label/gt_train.csv",
     "HW5_data/TrimmedVideos/label/gt_valid.csv"),
)

In [118]:
# One DataFrame per split, one column per ground-truth field.
dtrain, dvalid = (pd.DataFrame.from_dict(table) for table in (train_list, valid_list))

In [125]:
# Frames are fed at 120x160x3, so the extractor is built for that input shape.
vgg16 = Extractor((120, 160, 3))
vgg16.build("../hw3/keras-vgg16.npy")

npy file loaded


In [130]:
# Decode every training video into a sub-sampled, half-resolution frame stack.
features = []
for i, (category, name) in enumerate(zip(dtrain['Video_category'], dtrain['Video_name'])):
    print(i, end="\r")
    features.append(
        readShortVideo(video_path=train_video_path,
                       video_category=category,
                       video_name=name,
                       downsample_factor=12,
                       rescale_factor=0.5)
    )

3235

In [140]:
# Total number of sampled frames across all videos, and the per-frame shape.
np.concatenate(features, axis=0).shape

(29751, 120, 160, 3)

In [141]:
import pickle

In [145]:
# Run every clip through the VGG-16 graph and cache the per-frame codes on disk.
#
# vgg16.x is the BGR, mean-subtracted tensor that Extractor.build derives from
# its RGB placeholder; feeding vgg16.x replaces that derived value, so the same
# preprocessing has to be applied to the frames before they are fed.
vgg_bgr_mean = np.array([103.939, 116.779, 123.68], dtype=np.float32)

codes = list()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(len(features)):
        print(i, end='\r')
        # assumes the decoded frames are RGB with channels last -- TODO confirm
        bgr_frames = features[i][..., ::-1].astype(np.float32) - vgg_bgr_mean
        tmp = sess.run(vgg16.output, feed_dict={vgg16.x: bgr_frames})
        codes.append(tmp)

with open('codes.pkl', 'wb') as f:
    pickle.dump(codes, f)

3235

In [172]:
# Reload the cached codes. The context manager closes the file handle, which a
# bare open() inside the call leaves to the garbage collector.
# NOTE: pickle.load can execute arbitrary code; only load files you produced.
with open("codes.pkl", "rb") as f:
    codes = pickle.load(f)

In [156]:
# One row per video: average the per-frame codes over the frame axis.
X = np.array([np.mean(video_codes, axis=0) for video_codes in codes])

In [157]:
# (number of videos, length of the averaged code)
print(np.shape(X))

(3236, 10240)

In [165]:
# Labels must come from dtrain, the frame whose rows produced X; the name `d`
# is not defined anywhere in the notebook and fails on a fresh kernel.
Y = np.array(dtrain["Action_labels"]).astype('int32')

In [170]:
# Overwrites the integer labels in Y with their one-hot form (11 classes);
# run this cell once per kernel, since it takes Y as its own input.
Y = one_hot_encoding(arr=Y, num_classes=11)