In [12]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os
import sys
import ipdb
import time
import cv2
from keras.preprocessing import sequence
import matplotlib.pyplot as plt

# Clear the global default TensorFlow graph so that re-running the cells below
# starts from an empty graph instead of adding to one built by an earlier run.
tf.reset_default_graph()

In [13]:
class Video_Caption_Generator() :
    """Two stacked LSTMs that encode a sequence of frame features and decode a caption.

    The first LSTM reads the (linearly projected) frame features; the second
    LSTM reads the first LSTM's output concatenated with either zero padding
    (while encoding) or the embedding of the previous word (while decoding).
    All trainable variables live under the ``generator`` variable scope.
    """

    def __init__(self, dim_image, n_words, dim_hidden, batch_size, n_lstm_steps, n_video_lstm_step, n_caption_lstm_step, bias_init_vector=None):
        """Create the model's variables and LSTM cells.

        Args:
            dim_image: size of one frame feature vector.
            n_words: vocabulary size (rows of the embedding, width of the logits).
            dim_hidden: LSTM hidden size; also the word-embedding size.
            batch_size: batch size baked into the training graph.
            n_lstm_steps: stored for backward compatibility only; the graphs
                are sized by ``n_video_lstm_step``.
            n_video_lstm_step: number of frames fed to the encoder per video.
            n_caption_lstm_step: number of decoding steps.
            bias_init_vector: optional NumPy array of length ``n_words`` used
                to initialise the output bias; zeros are used when ``None``.
        """
        self.dim_image = dim_image
        self.n_words = n_words
        self.dim_hidden = dim_hidden
        self.batch_size = batch_size
        self.n_lstm_steps = n_lstm_steps
        self.n_video_lstm_step=n_video_lstm_step
        self.n_caption_lstm_step=n_caption_lstm_step

        with tf.variable_scope('generator', reuse=tf.AUTO_REUSE):
            # Word embedding table, pinned to the CPU.
            with tf.device("/cpu:0"):
                self.Wemb = tf.Variable( tf.random_uniform([n_words, dim_hidden], -0.1, 0.1), name = 'Wemb')

            # state_is_tuple=False: each cell carries (c, h) as one concatenated tensor.
            self.lstm1 = tf.nn.rnn_cell.BasicLSTMCell(dim_hidden, state_is_tuple=False,name = 'basic_lstm_cell_1')
            self.lstm2 = tf.nn.rnn_cell.BasicLSTMCell(dim_hidden, state_is_tuple=False,name= 'basic_lstm_cell_2')

            # Linear projection of a frame feature into the hidden space.
            self.encode_image_W = tf.Variable( tf.random_uniform([dim_image, dim_hidden], -0.1, 0.1),  name='encode_image_W')
            self.encode_image_b = tf.Variable( tf.zeros([dim_hidden]), name='encode_image_b')

            # Output projection from the second LSTM's output to vocabulary logits.
            self.embed_word_W = tf.Variable(tf.random_uniform([dim_hidden, n_words], -0.1,0.1), name='embed_word_W')

            if bias_init_vector is not None:
                self.embed_word_b = tf.Variable(bias_init_vector.astype(np.float32), name='embed_word_b')
            else:
                self.embed_word_b = tf.Variable(tf.zeros([n_words]), name='embed_word_b')


    def build_model(self) :
        """Build the training graph.

        Returns:
            A tuple ``(loss, video, video_mask, caption, caption_mask, probs)``:
            the scalar loss tensor, the four input placeholders, and the list of
            per-step logit tensors (unnormalised, despite the name ``probs``).
            ``video_mask`` is not consumed by the graph; it is returned so that
            callers can keep feeding it.
        """
        video = tf.placeholder(tf.float32, [self.batch_size, self.n_video_lstm_step, self.dim_image])
        video_mask = tf.placeholder(tf.float32, [self.batch_size, self.n_video_lstm_step])

        # One extra column so that step i can be trained to predict token i+1.
        caption = tf.placeholder(tf.int32, [self.batch_size, self.n_caption_lstm_step+1])
        caption_mask = tf.placeholder(tf.float32, [self.batch_size, self.n_caption_lstm_step+1])

        video_flat = tf.reshape(video, [-1, self.dim_image])

        image_emb = tf.nn.xw_plus_b( video_flat, self.encode_image_W, self.encode_image_b ) # (batch_size*n_video_lstm_step, dim_hidden)
        # Reshape with the same step count the placeholder was declared with
        # (previously n_lstm_steps, which only worked when both values matched).
        image_emb = tf.reshape(image_emb, [self.batch_size, self.n_video_lstm_step, self.dim_hidden])

        state1 = tf.zeros([self.batch_size, self.lstm1.state_size])
        state2 = tf.zeros([self.batch_size, self.lstm2.state_size])
        padding = tf.zeros([self.batch_size, self.dim_hidden])

        probs = []
        loss = 0.0

        ##############################  Encoding Stage ##################################
        # AUTO_REUSE already lets every step share the cells' weights, so no
        # explicit reuse_variables() call is needed.
        for i in range(0, self.n_video_lstm_step):
            with tf.variable_scope('generator', reuse=tf.AUTO_REUSE):
                output1, state1 = self.lstm1(image_emb[:,i,:], state1)
                # No word is available while encoding: feed zeros in its place.
                output2, state2 = self.lstm2(tf.concat([padding, output1],1), state2)

        ############################# Decoding Stage ######################################
        for i in range(0, self.n_caption_lstm_step): ## Phase 2 => only generate captions

            with tf.variable_scope('generator', reuse=tf.AUTO_REUSE):
                with tf.device("/cpu:0"):
                    current_embed = tf.nn.embedding_lookup(self.Wemb, caption[:, i])

                # No frame is available while decoding: feed zeros to the first LSTM.
                output1, state1 = self.lstm1(padding, state1)
                output2, state2 = self.lstm2(tf.concat([current_embed, output1],1), state2)

            logit_words = tf.nn.xw_plus_b(output2, self.embed_word_W, self.embed_word_b)
            # Sparse cross-entropy against the next token; equivalent to the
            # one-hot formulation without materialising a [batch, n_words] tensor.
            cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=caption[:, i+1], logits=logit_words)
            cross_entropy = cross_entropy * caption_mask[:,i]
            probs.append(logit_words)

            current_loss = tf.reduce_sum(cross_entropy)/self.batch_size
            loss = loss + current_loss

        return loss, video, video_mask, caption, caption_mask, probs


    def build_generator(self):
        """Build the greedy-decoding inference graph for a single video.

        Returns:
            A tuple ``(video, video_mask, generated_words, probs, embeds)``: the
            two input placeholders, the list of per-step argmax word-index
            tensors, the list of per-step logit tensors, and the list of the
            embeddings fed to the following step. ``video_mask`` is not
            consumed by the graph.
        """
        video = tf.placeholder(tf.float32, [1, self.n_video_lstm_step,  self.dim_image])
        video_mask = tf.placeholder(tf.float32, [1, self.n_video_lstm_step])

        video_flat = tf.reshape(video, [-1, self.dim_image])

        image_emb = tf.nn.xw_plus_b(video_flat, self.encode_image_W, self.encode_image_b)
        image_emb = tf.reshape(image_emb, [1, self.n_video_lstm_step, self.dim_hidden])

        state1 = tf.zeros([1, self.lstm1.state_size])
        state2 = tf.zeros([1, self.lstm2.state_size])
        padding = tf.zeros([1, self.dim_hidden])

        generated_words = []

        probs = []
        embeds = []

        for i in range(0, self.n_video_lstm_step):
            with tf.variable_scope('generator', reuse=tf.AUTO_REUSE):
                output1, state1 = self.lstm1(image_emb[:,i,:], state1)
                output2, state2 = self.lstm2(tf.concat([padding, output1],1), state2)

        # NOTE: the root variable scope is deliberately left untouched here.
        # Calling reuse_variables() on it would switch reuse on globally for
        # everything built afterwards, and AUTO_REUSE makes it unnecessary.
        for  i in range(0, self.n_caption_lstm_step):
            with tf.variable_scope('generator', reuse=tf.AUTO_REUSE):
                if i == 0:
                    # Decoding starts from vocabulary index 1.
                    with tf.device('/cpu:0'):
                        current_embed = tf.nn.embedding_lookup(self.Wemb, tf.ones([1], dtype=tf.int64))

                output1, state1 = self.lstm1(padding, state1)

                output2, state2 = self.lstm2(tf.concat([current_embed, output1], 1), state2)

                logit_words = tf.nn.xw_plus_b(output2, self.embed_word_W, self.embed_word_b)
                # Greedy choice: the highest-scoring word becomes the next input.
                max_prob_index = tf.argmax(logit_words, 1)[0]
                generated_words.append(max_prob_index)
                probs.append(logit_words)

                with tf.device("/cpu:0"):
                    current_embed = tf.nn.embedding_lookup(self.Wemb, max_prob_index)
                    # Restore the leading batch dimension dropped by the scalar lookup.
                    current_embed = tf.expand_dims(current_embed, 0)

                embeds.append(current_embed)

        return video, video_mask, generated_words, probs, embeds

In [14]:
#=====================================================================================
# Global Parameters
#=====================================================================================
# Raw training videos.
video_path = './data/training_data/video'

# Pre-extracted per-video feature arrays (one .npy file per video id).
video_train_feat_path, video_test_feat_path = (
    './data/training_data/feat',
    './data/testing_data/feat',
)

# Caption label files.
video_train_data_label_path, video_test_data_label_path = (
    './data/training_label.json',
    './data/testing_public_label.json',
)

# Directory that checkpoints are written to and restored from.
model_path = './models'

#=======================================================================================
# Train Parameters
#=======================================================================================
# Feature size of one frame and hidden size of the LSTMs.
dim_image, dim_hidden = 4096, 1000

# Encoder length (frames per video) and decoder length (caption steps).
n_video_lstm_step = n_frame_step = 80
n_caption_lstm_step = 20

# Optimisation settings.
n_epochs = 1000
batch_size = 50
learning_rate = 1e-4


In [15]:
def get_data(video_feat_path, video_label_path) :
    """Load caption labels and pair every caption with its video feature file.

    The label file is read with ``pd.read_json`` and must provide an ``id`` and
    a ``caption`` field per entry, where ``caption`` is a list of strings (a
    single string is also accepted). Each caption becomes one output row.

    Args:
        video_feat_path: directory holding one ``<id>.npy`` feature file per video.
        video_label_path: path of the JSON label file.

    Returns:
        A DataFrame with columns ``caption``, ``id`` and ``video_path`` and a
        fresh ``0..n-1`` index. Rows whose feature file does not exist, or
        whose caption is not a ``str``, are dropped. The frame is empty (rather
        than an exception being raised) when the label file has no entries.
    """
    labels = pd.read_json(video_label_path,orient='values',encoding='utf-8',lines=False)

    # Expand "one row per video with a list of captions" into "one row per caption".
    records = []
    for _, row in labels.iterrows():
        captions = row['caption']
        if isinstance(captions, str):
            captions = [captions]
        for caption in captions:
            records.append((caption, row['id']))

    data = pd.DataFrame(records, columns=['caption', 'id'])
    data['video_path'] = [os.path.join(video_feat_path, str(vid) + '.npy') for vid in data['id']]

    # astype(bool) keeps the masks boolean even when the frame is empty.
    has_features = data['video_path'].map(os.path.exists).astype(bool)
    is_text = data['caption'].map(lambda caption: isinstance(caption, str)).astype(bool)

    # Reset the index so positional (.iloc) and label-based access agree after
    # rows have been filtered out.
    return data[has_features & is_text].reset_index(drop=True)

In [16]:
def preProBuildWordVocab(sentence_iterator, word_count_threshold=5):
    """Build the vocabulary mappings and an output-bias initialiser from sentences.

    Adapted from NeuralTalk. Sentences are lower-cased and split on single
    spaces; words seen at least ``word_count_threshold`` times are kept.
    Indices 0-3 are reserved for ``<pad>``, ``<bos>``, ``<eos>`` and ``<unk>``.

    Args:
        sentence_iterator: iterable of sentence strings (consumed once).
        word_count_threshold: minimum occurrence count for a word to be kept.

    Returns:
        ``(wordtoix, ixtoword, bias_init_vector)`` where the bias vector holds
        the log relative frequency of every vocabulary entry, shifted so that
        its maximum is 0.
    """
    print('preprocessing word counts and creating vocab based on word count threshold %d' % (word_count_threshold))

    special_tokens = ['<pad>', '<bos>', '<eos>', '<unk>']

    # Single pass over the sentences: count them and tally every token.
    tally = {}
    n_sentences = 0
    for sentence in sentence_iterator:
        n_sentences += 1
        for token in sentence.lower().split(' '):
            if token in tally:
                tally[token] += 1
            else:
                tally[token] = 1

    kept_words = [token for token, count in tally.items() if count >= word_count_threshold]
    print('filtered words from %d to %d' % (len(tally), len(kept_words)))

    # Special tokens take the first four indices, kept words follow in order.
    ixtoword = {}
    wordtoix = {}
    for position, token in enumerate(special_tokens + kept_words):
        ixtoword[position] = token
        wordtoix[token] = position

    # Each special token is counted as occurring once per sentence.
    for token in special_tokens:
        tally[token] = n_sentences

    counts = np.array([1.0 * tally[ixtoword[i]] for i in ixtoword])
    frequencies = counts / np.sum(counts)       # normalize to frequencies
    log_frequencies = np.log(frequencies)
    bias_init_vector = log_frequencies - np.max(log_frequencies)  # shift to nice numeric range

    return wordtoix, ixtoword, bias_init_vector

In [17]:
# Characters removed from every caption before it is tokenised.
_CAPTION_NOISE_CHARS = ('.', ',', '"', '\n', '?', '!', '\\', '/')


def _clean_caption(caption):
    """Return `caption` with the punctuation in _CAPTION_NOISE_CHARS removed."""
    for char in _CAPTION_NOISE_CHARS:
        caption = caption.replace(char, '')
    return caption


def _load_video_batch(video_paths, n_rows, n_steps, feat_dim):
    """Load feature files into a zero-padded (n_rows, n_steps, feat_dim) batch plus frame mask."""
    feats = np.zeros((n_rows, n_steps, feat_dim))
    masks = np.zeros((n_rows, n_steps))
    for row, path in enumerate(video_paths):
        # Videos longer than the encoder are truncated instead of raising.
        feat = np.load(path)[:n_steps]
        feats[row, :len(feat)] = feat
        masks[row, :len(feat)] = 1
    return feats, masks


def _encode_caption_batch(raw_captions, wordtoix, n_steps):
    """Convert raw captions into a padded (n, n_steps + 1) index matrix and its loss mask."""
    unk = wordtoix['<unk>']
    encoded = []
    for caption in raw_captions:
        tokens = _clean_caption('<bos> ' + caption).lower().split(' ')
        # Keep at most n_steps - 1 tokens so that '<eos>' always fits.
        tokens = tokens[:n_steps - 1] + ['<eos>']
        encoded.append([wordtoix.get(token, unk) for token in tokens])

    matrix = sequence.pad_sequences(encoded, padding='post', maxlen=n_steps)
    # Extra zero column: decoding step i is trained to predict token i + 1.
    matrix = np.hstack([matrix, np.zeros([len(matrix), 1])]).astype(int)

    masks = np.zeros(matrix.shape)
    for row, indices in enumerate(matrix):
        # All non-pad tokens plus one trailing pad position contribute to the loss.
        masks[row, :(indices != 0).sum() + 1] = 1
    return matrix, masks


def train():
    """Train the caption model, logging the loss and checkpointing periodically.

    Side effects:
        * writes ``./data/wordtoix.npy``, ``./data/ixtoword.npy`` and
          ``./data/bias_init_vector.npy``;
        * restores the latest checkpoint found in ``model_path`` (if any) and
          saves a new one every 10 epochs;
        * rewrites ``loss.txt`` with one line per batch and saves one
          loss-curve image per epoch under ``./loss_imgs``.

    NOTE(review): after restoring a checkpoint the epoch counter still starts
    at 0, so checkpoint numbers do not continue from the restored one.
    """
    train_data = get_data(video_train_feat_path, video_train_data_label_path)
    test_data = get_data(video_test_feat_path, video_test_data_label_path)

    # The vocabulary is built from the training AND the test captions.
    all_captions = list(train_data['caption'].values) + list(test_data['caption'].values)
    captions = [_clean_caption(caption) for caption in all_captions]

    wordtoix, ixtoword, bias_init_vector = preProBuildWordVocab(captions, word_count_threshold=0)

    np.save("./data/wordtoix", wordtoix)
    np.save('./data/ixtoword', ixtoword)
    np.save("./data/bias_init_vector", bias_init_vector)

    model = Video_Caption_Generator(
            dim_image=dim_image,
            n_words=len(wordtoix),
            dim_hidden=dim_hidden,
            batch_size=batch_size,
            n_lstm_steps=n_frame_step,
            n_video_lstm_step=n_video_lstm_step,
            n_caption_lstm_step=n_caption_lstm_step,
            bias_init_vector=bias_init_vector)

    tf_loss, tf_video, tf_video_mask, tf_caption, tf_caption_mask, _tf_probs = model.build_model()

    # Make sure the output directories exist before anything is written to them.
    plt_save_dir = "./loss_imgs"
    os.makedirs(plt_save_dir, exist_ok=True)
    os.makedirs(model_path, exist_ok=True)

    with tf.Session() as sess:

        train_op = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss)
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        # The initializer must be created after minimize(): the Adam optimizer
        # creates variables of its own, and they need initialising too.
        init = tf.global_variables_initializer()
        sess.run(init)

        # my tensorflow version is 0.12.1, I write the saver with version 1.0
        saver = tf.train.Saver(max_to_keep=100, write_version=1)

        # Resume from the most recent checkpoint, if there is one.
        ckpt = tf.train.get_checkpoint_state(model_path)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess,ckpt.model_checkpoint_path)

        loss_to_draw = []

        try:
            with open('loss.txt', 'w') as loss_fd:
                for epoch in range(0, n_epochs):
                    loss_to_draw_epoch = []

                    # Pick one random caption per video for this epoch ...
                    epoch_data = train_data.groupby('video_path').apply(lambda x: x.iloc[np.random.choice(len(x))])
                    epoch_data = epoch_data.reset_index(drop=True)
                    # ... then shuffle the rows: groupby returns the videos
                    # sorted by path, which would otherwise give identical
                    # batches in every epoch.
                    epoch_data = epoch_data.iloc[np.random.permutation(len(epoch_data))].reset_index(drop=True)

                    # Full batches only; a trailing partial batch is dropped.
                    for start in range(0, len(epoch_data) - batch_size + 1, batch_size):
                        end = start + batch_size
                        start_time = time.time()

                        current_batch = epoch_data[start:end]

                        current_feats, current_video_masks = _load_video_batch(
                                current_batch['video_path'].values,
                                batch_size, n_video_lstm_step, dim_image)

                        current_caption_matrix, current_caption_masks = _encode_caption_batch(
                                current_batch['caption'].values, wordtoix, n_caption_lstm_step)

                        _, loss_val = sess.run(
                                [train_op, tf_loss],
                                feed_dict={
                                    tf_video: current_feats,
                                    tf_video_mask : current_video_masks,
                                    tf_caption: current_caption_matrix,
                                    tf_caption_mask: current_caption_masks
                                    })
                        loss_to_draw_epoch.append(loss_val)

                        print('idx: ', start, " Epoch: ", epoch, " loss: ", loss_val, ' Elapsed time: ', str((time.time() - start_time)))
                        loss_fd.write('epoch ' + str(epoch) + ' loss ' + str(loss_val) + '\n')

                    # draw loss curve every epoch
                    loss_to_draw.append(np.mean(loss_to_draw_epoch))
                    fig, ax = plt.subplots()
                    ax.plot(range(len(loss_to_draw)), loss_to_draw, color='g')
                    ax.grid(True)
                    fig.savefig(os.path.join(plt_save_dir, str(epoch) + '.png'))
                    # Close the figure so they do not pile up over the epochs.
                    plt.close(fig)

                    if np.mod(epoch, 10) == 0:
                        print ("Epoch ", epoch, " is done. Saving the model ...")
                        saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch)
        finally:
            coord.request_stop()
            coord.join(threads)

In [18]:
def test(model_path='./models/model-70'):
    """Generate a caption for every test video and write them to ``S2VT_results.txt``.

    Args:
        model_path: checkpoint prefix to restore the model weights from.

    Only videos whose feature file holds exactly ``n_frame_step`` frames are
    captioned; all others are skipped. For every captioned video the feature
    path and the generated sentence are printed and appended to the results
    file.
    """
    test_data = get_data(video_test_feat_path, video_test_data_label_path)
    test_videos = test_data['video_path'].unique()

    # The mapping was written with np.save(dict), i.e. as a pickled object
    # array, which NumPy >= 1.16.3 refuses to load unless allow_pickle=True.
    # This file is produced by train(); do not point it at untrusted data.
    ixtoword = pd.Series(np.load('./data/ixtoword.npy', allow_pickle=True).tolist())

    bias_init_vector = np.load('./data/bias_init_vector.npy')

    model = Video_Caption_Generator(
            dim_image=dim_image,
            n_words=len(ixtoword),
            dim_hidden=dim_hidden,
            batch_size=batch_size,
            n_lstm_steps=n_frame_step,
            n_video_lstm_step=n_video_lstm_step,
            n_caption_lstm_step=n_caption_lstm_step,
            bias_init_vector=bias_init_vector)

    video_tf, video_mask_tf, caption_tf, probs_tf, last_embed_tf = model.build_generator()

    with tf.Session() as sess:

        saver = tf.train.Saver()
        saver.restore(sess, model_path)

        with open('S2VT_results.txt', 'w') as test_output_txt_fd:

            for idx, video_feat_path in enumerate(test_videos):
                print (idx, video_feat_path)

                # Add the leading batch dimension expected by the placeholder.
                video_feat = np.load(video_feat_path)[None,...]
                if video_feat.shape[1] != n_frame_step:
                    # The graph is built for a fixed number of frames.
                    continue
                video_mask = np.ones((video_feat.shape[0], video_feat.shape[1]))

                generated_word_index = sess.run(caption_tf, feed_dict={video_tf:video_feat, video_mask_tf:video_mask})
                generated_words = list(ixtoword[generated_word_index])

                # Cut after the first '<eos>'. When none was generated, keep the
                # whole sequence (np.argmax on an all-False mask returned 0 and
                # used to truncate the caption to its first word).
                if '<eos>' in generated_words:
                    generated_words = generated_words[:generated_words.index('<eos>') + 1]

                generated_sentence = ' '.join(generated_words)
                generated_sentence = generated_sentence.replace('<bos> ', '')
                generated_sentence = generated_sentence.replace(' <eos>', '')
                print (generated_sentence,'\n')
                test_output_txt_fd.write(video_feat_path + '\n')
                test_output_txt_fd.write(generated_sentence + '\n\n')

In [None]:
# Run the full training loop; it prints one progress line per batch (log below).
train()

preprocessing word counts and creating vocab based on word count threshold 0
filtered words from 6248 to 6248
INFO:tensorflow:Restoring parameters from ./models/model-40
idx:  0  Epoch:  0  loss:  7.1822343  Elapsed time:  52.59547400474548
idx:  50  Epoch:  0  loss:  7.574959  Elapsed time:  34.726829051971436
idx:  100  Epoch:  0  loss:  7.220011  Elapsed time:  33.78416204452515
idx:  150  Epoch:  0  loss:  9.711597  Elapsed time:  32.51974010467529
idx:  200  Epoch:  0  loss:  7.5677266  Elapsed time:  33.9223108291626
idx:  250  Epoch:  0  loss:  7.852043  Elapsed time:  32.764440059661865
idx:  300  Epoch:  0  loss:  7.9782925  Elapsed time:  33.83790326118469
idx:  350  Epoch:  0  loss:  8.271207  Elapsed time:  31.78542709350586
idx:  400  Epoch:  0  loss:  9.144431  Elapsed time:  35.24722194671631
idx:  450  Epoch:  0  loss:  8.228762  Elapsed time:  34.957348108291626
idx:  500  Epoch:  0  loss:  8.028235  Elapsed time:  32.36221170425415
idx:  550  Epoch:  0  loss:  7.69772

idx:  650  Epoch:  3  loss:  8.975519  Elapsed time:  34.351688861846924
idx:  700  Epoch:  3  loss:  6.916613  Elapsed time:  30.739910125732422
idx:  750  Epoch:  3  loss:  6.6218576  Elapsed time:  30.172439098358154
idx:  800  Epoch:  3  loss:  8.558196  Elapsed time:  33.496437072753906
idx:  850  Epoch:  3  loss:  8.727559  Elapsed time:  31.3855881690979
idx:  900  Epoch:  3  loss:  7.2127423  Elapsed time:  30.37177300453186
idx:  950  Epoch:  3  loss:  7.6856217  Elapsed time:  30.384530305862427
idx:  1000  Epoch:  3  loss:  7.021962  Elapsed time:  31.286675930023193
idx:  1050  Epoch:  3  loss:  8.890959  Elapsed time:  33.934288024902344
idx:  1100  Epoch:  3  loss:  6.473008  Elapsed time:  30.942083835601807
idx:  1150  Epoch:  3  loss:  7.3903465  Elapsed time:  33.84851288795471
idx:  1200  Epoch:  3  loss:  6.7005205  Elapsed time:  29.71462106704712
idx:  1250  Epoch:  3  loss:  8.127621  Elapsed time:  32.96289300918579
idx:  1300  Epoch:  3  loss:  8.871499  Elapse

idx:  500  Epoch:  7  loss:  7.207582  Elapsed time:  30.1325740814209
idx:  550  Epoch:  7  loss:  8.670147  Elapsed time:  33.646416902542114
idx:  600  Epoch:  7  loss:  8.501601  Elapsed time:  32.864098072052
idx:  650  Epoch:  7  loss:  7.414221  Elapsed time:  31.01235604286194
idx:  700  Epoch:  7  loss:  6.824092  Elapsed time:  32.30096387863159
idx:  750  Epoch:  7  loss:  6.820722  Elapsed time:  31.32965588569641
idx:  800  Epoch:  7  loss:  8.748499  Elapsed time:  32.315638065338135
idx:  850  Epoch:  7  loss:  8.559488  Elapsed time:  32.708718061447144
idx:  900  Epoch:  7  loss:  7.047517  Elapsed time:  30.83052897453308
idx:  950  Epoch:  7  loss:  6.393893  Elapsed time:  33.00684905052185
idx:  1000  Epoch:  7  loss:  7.203106  Elapsed time:  30.359230756759644
idx:  1050  Epoch:  7  loss:  9.578203  Elapsed time:  34.28330588340759
idx:  1100  Epoch:  7  loss:  6.70433  Elapsed time:  31.584022045135498
idx:  1150  Epoch:  7  loss:  7.140863  Elapsed time:  31.21

idx:  0  Epoch:  11  loss:  7.7471995  Elapsed time:  32.008408069610596
idx:  50  Epoch:  11  loss:  7.6586657  Elapsed time:  31.120347261428833
idx:  100  Epoch:  11  loss:  8.3970995  Elapsed time:  31.129838705062866
idx:  150  Epoch:  11  loss:  7.5172195  Elapsed time:  33.653038024902344
idx:  200  Epoch:  11  loss:  7.2890296  Elapsed time:  34.43378710746765
idx:  250  Epoch:  11  loss:  9.103787  Elapsed time:  31.912364959716797
idx:  300  Epoch:  11  loss:  8.845978  Elapsed time:  33.246716022491455
idx:  350  Epoch:  11  loss:  6.8557606  Elapsed time:  30.878857135772705
idx:  400  Epoch:  11  loss:  7.0008264  Elapsed time:  31.1319797039032
idx:  450  Epoch:  11  loss:  7.2927895  Elapsed time:  33.172422885894775
idx:  500  Epoch:  11  loss:  7.5699086  Elapsed time:  34.41175103187561
idx:  550  Epoch:  11  loss:  6.796979  Elapsed time:  29.942752838134766
idx:  600  Epoch:  11  loss:  7.195347  Elapsed time:  32.37998080253601
idx:  650  Epoch:  11  loss:  8.18323

idx:  1200  Epoch:  14  loss:  6.679107  Elapsed time:  34.36536002159119
idx:  1250  Epoch:  14  loss:  7.0071044  Elapsed time:  34.00171232223511
idx:  1300  Epoch:  14  loss:  9.381673  Elapsed time:  32.23774695396423
idx:  1350  Epoch:  14  loss:  8.064628  Elapsed time:  33.87096571922302
idx:  1400  Epoch:  14  loss:  7.1509275  Elapsed time:  30.9001202583313
idx:  0  Epoch:  15  loss:  7.660335  Elapsed time:  32.477020025253296
idx:  50  Epoch:  15  loss:  6.7647347  Elapsed time:  30.564335107803345
idx:  100  Epoch:  15  loss:  7.708552  Elapsed time:  32.51543593406677
idx:  150  Epoch:  15  loss:  8.009706  Elapsed time:  32.91438102722168
idx:  200  Epoch:  15  loss:  8.114998  Elapsed time:  32.089268922805786
idx:  250  Epoch:  15  loss:  8.062027  Elapsed time:  32.3061089515686
idx:  300  Epoch:  15  loss:  7.2986617  Elapsed time:  30.26489806175232
idx:  350  Epoch:  15  loss:  8.45309  Elapsed time:  31.440994024276733
idx:  400  Epoch:  15  loss:  7.7345805  Ela

idx:  950  Epoch:  18  loss:  6.9447002  Elapsed time:  33.464012145996094
idx:  1000  Epoch:  18  loss:  7.686181  Elapsed time:  33.910857915878296
idx:  1050  Epoch:  18  loss:  8.553793  Elapsed time:  33.942583084106445
idx:  1100  Epoch:  18  loss:  6.6203265  Elapsed time:  30.793764114379883
idx:  1150  Epoch:  18  loss:  6.4376335  Elapsed time:  33.96365284919739
idx:  1200  Epoch:  18  loss:  6.923964  Elapsed time:  33.69456505775452
idx:  1250  Epoch:  18  loss:  7.429233  Elapsed time:  31.324567079544067
idx:  1300  Epoch:  18  loss:  7.7298183  Elapsed time:  31.15085220336914
idx:  1350  Epoch:  18  loss:  7.7373877  Elapsed time:  33.10217905044556
idx:  1400  Epoch:  18  loss:  7.4545774  Elapsed time:  34.653257846832275
idx:  0  Epoch:  19  loss:  7.5163713  Elapsed time:  31.735219955444336
idx:  50  Epoch:  19  loss:  7.04613  Elapsed time:  31.544394969940186
idx:  100  Epoch:  19  loss:  7.585515  Elapsed time:  33.125625133514404
idx:  150  Epoch:  19  loss:  

idx:  400  Epoch:  22  loss:  7.5990534  Elapsed time:  31.685203313827515
idx:  450  Epoch:  22  loss:  6.946617  Elapsed time:  30.319593906402588
idx:  500  Epoch:  22  loss:  8.646877  Elapsed time:  30.569660902023315
idx:  550  Epoch:  22  loss:  8.070743  Elapsed time:  31.359747648239136
idx:  600  Epoch:  22  loss:  9.241554  Elapsed time:  31.20979928970337
idx:  650  Epoch:  22  loss:  7.0707364  Elapsed time:  32.30641722679138
idx:  700  Epoch:  22  loss:  8.697308  Elapsed time:  30.776237726211548
idx:  750  Epoch:  22  loss:  5.96469  Elapsed time:  34.068639039993286
idx:  800  Epoch:  22  loss:  6.4633403  Elapsed time:  30.47490692138672
idx:  850  Epoch:  22  loss:  7.4818206  Elapsed time:  33.79642295837402
idx:  900  Epoch:  22  loss:  7.157665  Elapsed time:  33.012816190719604
idx:  950  Epoch:  22  loss:  7.352772  Elapsed time:  30.676756858825684
idx:  1000  Epoch:  22  loss:  6.526046  Elapsed time:  30.622395992279053
idx:  1050  Epoch:  22  loss:  7.12660

idx:  150  Epoch:  26  loss:  8.53339  Elapsed time:  31.22018814086914
idx:  200  Epoch:  26  loss:  8.181883  Elapsed time:  31.271936893463135
idx:  250  Epoch:  26  loss:  6.7111373  Elapsed time:  30.409875869750977
idx:  300  Epoch:  26  loss:  7.8640676  Elapsed time:  31.50172472000122
idx:  350  Epoch:  26  loss:  7.1391516  Elapsed time:  34.76497483253479
idx:  400  Epoch:  26  loss:  7.110788  Elapsed time:  32.57852005958557
idx:  450  Epoch:  26  loss:  7.5992246  Elapsed time:  31.12383222579956
idx:  500  Epoch:  26  loss:  7.290251  Elapsed time:  30.528772830963135
idx:  550  Epoch:  26  loss:  7.168387  Elapsed time:  33.149543046951294
idx:  600  Epoch:  26  loss:  6.3864098  Elapsed time:  30.38488221168518
idx:  650  Epoch:  26  loss:  6.294554  Elapsed time:  32.09540677070618
idx:  700  Epoch:  26  loss:  6.505466  Elapsed time:  34.329522132873535
idx:  750  Epoch:  26  loss:  5.7856126  Elapsed time:  32.45860505104065
idx:  800  Epoch:  26  loss:  7.1504602  

idx:  1350  Epoch:  29  loss:  7.9155593  Elapsed time:  33.21609711647034
idx:  1400  Epoch:  29  loss:  7.3329844  Elapsed time:  33.603830099105835
idx:  0  Epoch:  30  loss:  6.827377  Elapsed time:  31.346051931381226
idx:  50  Epoch:  30  loss:  8.563107  Elapsed time:  30.435919761657715
idx:  100  Epoch:  30  loss:  6.8419623  Elapsed time:  31.78278088569641
idx:  150  Epoch:  30  loss:  8.693716  Elapsed time:  33.78078317642212
idx:  200  Epoch:  30  loss:  8.078719  Elapsed time:  29.797932863235474
idx:  250  Epoch:  30  loss:  7.2320943  Elapsed time:  30.05919599533081
idx:  300  Epoch:  30  loss:  8.083773  Elapsed time:  34.67992806434631
idx:  350  Epoch:  30  loss:  7.135626  Elapsed time:  32.91603398323059
idx:  400  Epoch:  30  loss:  7.157203  Elapsed time:  33.80247092247009
idx:  450  Epoch:  30  loss:  7.465534  Elapsed time:  32.195523738861084
idx:  500  Epoch:  30  loss:  7.194622  Elapsed time:  32.5435528755188
idx:  550  Epoch:  30  loss:  7.133412  Elap

idx:  800  Epoch:  33  loss:  7.783546  Elapsed time:  30.888098001480103
idx:  850  Epoch:  33  loss:  6.925057  Elapsed time:  30.932841300964355
idx:  900  Epoch:  33  loss:  6.3231363  Elapsed time:  34.18598031997681
idx:  950  Epoch:  33  loss:  7.743784  Elapsed time:  33.46364688873291
idx:  1000  Epoch:  33  loss:  6.6062574  Elapsed time:  33.204254150390625
idx:  1050  Epoch:  33  loss:  8.116552  Elapsed time:  31.378398180007935
idx:  1100  Epoch:  33  loss:  6.091733  Elapsed time:  33.21420121192932
idx:  1150  Epoch:  33  loss:  7.039265  Elapsed time:  34.161595821380615
idx:  1200  Epoch:  33  loss:  7.703246  Elapsed time:  32.270628929138184
idx:  1250  Epoch:  33  loss:  7.567116  Elapsed time:  31.442254066467285
idx:  1300  Epoch:  33  loss:  6.696766  Elapsed time:  32.49578857421875
idx:  1350  Epoch:  33  loss:  7.6543956  Elapsed time:  34.76596474647522
idx:  1400  Epoch:  33  loss:  7.389071  Elapsed time:  33.3417751789093
idx:  0  Epoch:  34  loss:  6.481

idx:  550  Epoch:  37  loss:  6.2290077  Elapsed time:  30.300830364227295
idx:  600  Epoch:  37  loss:  6.5787826  Elapsed time:  32.08358812332153
idx:  650  Epoch:  37  loss:  7.2904677  Elapsed time:  32.13324213027954
idx:  700  Epoch:  37  loss:  7.2692976  Elapsed time:  31.64572501182556
idx:  750  Epoch:  37  loss:  6.4014096  Elapsed time:  31.343147039413452
idx:  800  Epoch:  37  loss:  7.0294156  Elapsed time:  31.147650241851807
idx:  850  Epoch:  37  loss:  7.4375577  Elapsed time:  32.94705820083618
idx:  900  Epoch:  37  loss:  8.453251  Elapsed time:  33.95557713508606
idx:  950  Epoch:  37  loss:  7.117253  Elapsed time:  33.774139165878296
idx:  1000  Epoch:  37  loss:  6.8239737  Elapsed time:  31.89946484565735
idx:  1050  Epoch:  37  loss:  8.262327  Elapsed time:  31.01736092567444
idx:  1100  Epoch:  37  loss:  7.4684935  Elapsed time:  29.452664136886597
idx:  1150  Epoch:  37  loss:  6.822472  Elapsed time:  33.412354946136475
idx:  1200  Epoch:  37  loss:  6

idx:  0  Epoch:  41  loss:  7.1003003  Elapsed time:  34.0467050075531
idx:  50  Epoch:  41  loss:  5.623437  Elapsed time:  42.717607736587524
idx:  100  Epoch:  41  loss:  7.7638636  Elapsed time:  50.56665086746216
idx:  150  Epoch:  41  loss:  6.7312512  Elapsed time:  68.47702407836914
idx:  200  Epoch:  41  loss:  6.732919  Elapsed time:  66.5414137840271
idx:  250  Epoch:  41  loss:  7.1916986  Elapsed time:  66.19903898239136
idx:  300  Epoch:  41  loss:  6.622371  Elapsed time:  72.27585625648499
idx:  350  Epoch:  41  loss:  5.832455  Elapsed time:  108.48181986808777
idx:  400  Epoch:  41  loss:  7.0965867  Elapsed time:  103.18992567062378
idx:  450  Epoch:  41  loss:  7.36413  Elapsed time:  97.4323239326477
idx:  500  Epoch:  41  loss:  7.147402  Elapsed time:  98.00578093528748
idx:  550  Epoch:  41  loss:  7.718274  Elapsed time:  99.39488506317139
idx:  600  Epoch:  41  loss:  7.593905  Elapsed time:  98.8389139175415
idx:  650  Epoch:  41  loss:  6.4316688  Elapsed ti

idx:  1250  Epoch:  44  loss:  6.495513  Elapsed time:  35.82964468002319
idx:  1300  Epoch:  44  loss:  6.767736  Elapsed time:  33.83223509788513
idx:  1350  Epoch:  44  loss:  7.3623457  Elapsed time:  34.708595991134644
idx:  1400  Epoch:  44  loss:  6.193482  Elapsed time:  31.72783088684082
idx:  0  Epoch:  45  loss:  7.2070723  Elapsed time:  34.00641703605652
idx:  50  Epoch:  45  loss:  6.5352926  Elapsed time:  33.50560998916626
idx:  100  Epoch:  45  loss:  6.986552  Elapsed time:  35.78262972831726
idx:  150  Epoch:  45  loss:  7.2872753  Elapsed time:  34.170320987701416
idx:  200  Epoch:  45  loss:  6.2916846  Elapsed time:  34.5517361164093
idx:  250  Epoch:  45  loss:  5.40392  Elapsed time:  34.30377006530762
idx:  300  Epoch:  45  loss:  6.9039536  Elapsed time:  35.78252601623535
idx:  350  Epoch:  45  loss:  7.1886916  Elapsed time:  34.010966062545776
idx:  400  Epoch:  45  loss:  6.830749  Elapsed time:  36.36634111404419
idx:  450  Epoch:  45  loss:  7.3785973  E

idx:  1050  Epoch:  48  loss:  6.661149  Elapsed time:  33.09788489341736
idx:  1100  Epoch:  48  loss:  6.6176486  Elapsed time:  35.04351997375488
idx:  1150  Epoch:  48  loss:  7.9410667  Elapsed time:  35.04402995109558
idx:  1200  Epoch:  48  loss:  6.208637  Elapsed time:  32.853431940078735
idx:  1250  Epoch:  48  loss:  6.4390182  Elapsed time:  35.127262115478516
idx:  1300  Epoch:  48  loss:  7.600699  Elapsed time:  34.16447615623474
idx:  1350  Epoch:  48  loss:  6.2889853  Elapsed time:  33.681994915008545
idx:  1400  Epoch:  48  loss:  6.5070124  Elapsed time:  34.90817999839783
idx:  0  Epoch:  49  loss:  7.1867447  Elapsed time:  35.098814964294434
idx:  50  Epoch:  49  loss:  5.901233  Elapsed time:  34.87979292869568
idx:  100  Epoch:  49  loss:  5.8779073  Elapsed time:  35.51928496360779
idx:  150  Epoch:  49  loss:  7.331081  Elapsed time:  33.26769208908081
idx:  200  Epoch:  49  loss:  6.505862  Elapsed time:  34.5626437664032
idx:  250  Epoch:  49  loss:  6.8889

idx:  500  Epoch:  52  loss:  6.7278724  Elapsed time:  33.64378595352173
idx:  550  Epoch:  52  loss:  6.5704336  Elapsed time:  33.127121925354004
idx:  600  Epoch:  52  loss:  6.947096  Elapsed time:  33.521838903427124
idx:  650  Epoch:  52  loss:  6.3417726  Elapsed time:  34.96752715110779
idx:  700  Epoch:  52  loss:  7.191027  Elapsed time:  35.257712841033936
idx:  750  Epoch:  52  loss:  5.98839  Elapsed time:  31.558250188827515
idx:  800  Epoch:  52  loss:  7.0211344  Elapsed time:  33.838478088378906
idx:  850  Epoch:  52  loss:  6.9741883  Elapsed time:  33.82939314842224
idx:  900  Epoch:  52  loss:  5.706912  Elapsed time:  34.3756000995636
idx:  950  Epoch:  52  loss:  6.4682603  Elapsed time:  36.08769607543945
idx:  1000  Epoch:  52  loss:  7.110865  Elapsed time:  34.46661615371704
idx:  1050  Epoch:  52  loss:  6.528047  Elapsed time:  34.586769104003906
idx:  1100  Epoch:  52  loss:  7.2511754  Elapsed time:  32.925079107284546
idx:  1150  Epoch:  52  loss:  6.939

idx:  250  Epoch:  56  loss:  6.133661  Elapsed time:  34.240641593933105
idx:  300  Epoch:  56  loss:  8.045548  Elapsed time:  35.23266410827637
idx:  350  Epoch:  56  loss:  6.4579635  Elapsed time:  35.884592056274414
idx:  400  Epoch:  56  loss:  6.255757  Elapsed time:  34.67267990112305
idx:  450  Epoch:  56  loss:  7.0559983  Elapsed time:  34.441685914993286
idx:  500  Epoch:  56  loss:  6.6680117  Elapsed time:  34.269675970077515
idx:  550  Epoch:  56  loss:  5.864451  Elapsed time:  34.24297499656677
idx:  600  Epoch:  56  loss:  7.7641397  Elapsed time:  31.965338230133057
idx:  650  Epoch:  56  loss:  6.5493717  Elapsed time:  33.46974301338196
idx:  700  Epoch:  56  loss:  6.1083326  Elapsed time:  35.06490087509155
idx:  750  Epoch:  56  loss:  5.8581476  Elapsed time:  35.793964862823486
idx:  800  Epoch:  56  loss:  6.8229685  Elapsed time:  35.416709899902344
idx:  850  Epoch:  56  loss:  8.274655  Elapsed time:  33.36923384666443
idx:  900  Epoch:  56  loss:  6.0699

idx:  0  Epoch:  60  loss:  6.4501967  Elapsed time:  33.30796217918396
idx:  50  Epoch:  60  loss:  6.1962833  Elapsed time:  33.04367017745972
idx:  100  Epoch:  60  loss:  6.143671  Elapsed time:  33.690205097198486
idx:  150  Epoch:  60  loss:  7.440846  Elapsed time:  34.19418907165527
idx:  200  Epoch:  60  loss:  6.5026946  Elapsed time:  34.94851112365723
idx:  250  Epoch:  60  loss:  5.963957  Elapsed time:  34.26695513725281
idx:  300  Epoch:  60  loss:  7.1238704  Elapsed time:  35.01620292663574
idx:  350  Epoch:  60  loss:  6.637897  Elapsed time:  33.869121074676514
idx:  400  Epoch:  60  loss:  6.206536  Elapsed time:  33.64252686500549
idx:  450  Epoch:  60  loss:  6.8264346  Elapsed time:  32.592897176742554
idx:  500  Epoch:  60  loss:  7.505416  Elapsed time:  33.58396077156067
idx:  550  Epoch:  60  loss:  7.4454675  Elapsed time:  34.44843888282776
idx:  600  Epoch:  60  loss:  6.8989077  Elapsed time:  34.11693096160889
idx:  650  Epoch:  60  loss:  6.2253513  Ela

idx:  900  Epoch:  63  loss:  5.6665635  Elapsed time:  33.77094602584839
idx:  950  Epoch:  63  loss:  5.956678  Elapsed time:  34.615824699401855
idx:  1000  Epoch:  63  loss:  6.814145  Elapsed time:  34.40640902519226
idx:  1050  Epoch:  63  loss:  7.4394274  Elapsed time:  35.483978033065796
idx:  1100  Epoch:  63  loss:  5.6109395  Elapsed time:  33.05728602409363
idx:  1150  Epoch:  63  loss:  6.646125  Elapsed time:  33.576059341430664
idx:  1200  Epoch:  63  loss:  6.6962185  Elapsed time:  34.519081115722656
idx:  1250  Epoch:  63  loss:  6.6625576  Elapsed time:  34.96235108375549
idx:  1300  Epoch:  63  loss:  6.7227054  Elapsed time:  34.62621593475342
idx:  1350  Epoch:  63  loss:  6.0628495  Elapsed time:  34.96719288825989
idx:  1400  Epoch:  63  loss:  7.25183  Elapsed time:  34.372459173202515
idx:  0  Epoch:  64  loss:  5.5740557  Elapsed time:  35.925798177719116
idx:  50  Epoch:  64  loss:  5.120302  Elapsed time:  33.52098608016968
idx:  100  Epoch:  64  loss:  6.

idx:  650  Epoch:  67  loss:  6.138926  Elapsed time:  35.14009094238281
idx:  700  Epoch:  67  loss:  5.588706  Elapsed time:  34.96748900413513
idx:  750  Epoch:  67  loss:  5.4168496  Elapsed time:  34.86040115356445
idx:  800  Epoch:  67  loss:  7.110188  Elapsed time:  34.12842392921448
idx:  850  Epoch:  67  loss:  7.5742245  Elapsed time:  32.00515079498291
idx:  900  Epoch:  67  loss:  6.477352  Elapsed time:  34.80262804031372
idx:  950  Epoch:  67  loss:  7.2134066  Elapsed time:  35.65200209617615
idx:  1000  Epoch:  67  loss:  6.930642  Elapsed time:  34.38363718986511
idx:  1050  Epoch:  67  loss:  7.059993  Elapsed time:  33.174290895462036
idx:  1100  Epoch:  67  loss:  7.44833  Elapsed time:  34.82588219642639
idx:  1150  Epoch:  67  loss:  6.822374  Elapsed time:  34.377108097076416
idx:  1200  Epoch:  67  loss:  6.8331294  Elapsed time:  35.82673716545105
idx:  1250  Epoch:  67  loss:  6.57009  Elapsed time:  35.24380803108215
idx:  1300  Epoch:  67  loss:  6.7689342 

idx:  100  Epoch:  71  loss:  6.4917636  Elapsed time:  33.648979902267456
idx:  150  Epoch:  71  loss:  6.497846  Elapsed time:  33.6611750125885
idx:  200  Epoch:  71  loss:  8.194492  Elapsed time:  32.08941173553467
idx:  250  Epoch:  71  loss:  6.825038  Elapsed time:  32.09963798522949
idx:  300  Epoch:  71  loss:  5.973546  Elapsed time:  35.100630044937134
idx:  350  Epoch:  71  loss:  5.987597  Elapsed time:  33.579784870147705
idx:  400  Epoch:  71  loss:  6.418503  Elapsed time:  34.16338133811951
idx:  450  Epoch:  71  loss:  6.974238  Elapsed time:  33.72610282897949
idx:  500  Epoch:  71  loss:  7.642157  Elapsed time:  34.15856719017029
idx:  550  Epoch:  71  loss:  5.738751  Elapsed time:  34.615533113479614
idx:  600  Epoch:  71  loss:  7.058396  Elapsed time:  31.540287971496582
idx:  650  Epoch:  71  loss:  6.5017233  Elapsed time:  34.6642210483551
idx:  700  Epoch:  71  loss:  5.683186  Elapsed time:  32.48692297935486
idx:  750  Epoch:  71  loss:  5.842177  Elapse

idx:  1300  Epoch:  74  loss:  6.678636  Elapsed time:  34.651777029037476
idx:  1350  Epoch:  74  loss:  6.092548  Elapsed time:  36.61844301223755
idx:  1400  Epoch:  74  loss:  6.770268  Elapsed time:  33.97962498664856
idx:  0  Epoch:  75  loss:  6.456373  Elapsed time:  34.75300097465515
idx:  50  Epoch:  75  loss:  6.4944963  Elapsed time:  35.06884407997131
idx:  100  Epoch:  75  loss:  6.4228005  Elapsed time:  33.43140506744385
idx:  150  Epoch:  75  loss:  6.1284833  Elapsed time:  35.102439165115356
idx:  200  Epoch:  75  loss:  6.1562834  Elapsed time:  35.6652729511261
idx:  250  Epoch:  75  loss:  5.744483  Elapsed time:  34.357070207595825
idx:  300  Epoch:  75  loss:  5.6473784  Elapsed time:  35.63056707382202
idx:  350  Epoch:  75  loss:  6.586508  Elapsed time:  36.52735090255737
idx:  400  Epoch:  75  loss:  5.864354  Elapsed time:  33.901227951049805
idx:  450  Epoch:  75  loss:  6.9453106  Elapsed time:  31.922008991241455
idx:  500  Epoch:  75  loss:  6.418118  E

In [19]:
# Run the notebook's test routine (defined in another cell, not visible here).
# Per the output recorded below, this run restored the checkpoint
# ./models/model-70 and printed, for each test feature file (.npy), its index,
# its path, and a generated caption.
# NOTE(review): which checkpoint is loaded and which files are read is decided
# inside test() — confirm against its definition before relying on this.
test()

INFO:tensorflow:Restoring parameters from ./models/model-70
0 ./data/testing_data/feat/ScdUht-pM6s_53_63.avi.npy
a man is putting a fish on a large bottle 

1 ./data/testing_data/feat/wkgGxsuNVSg_34_41.avi.npy
a woman is standing in a field 

2 ./data/testing_data/feat/BtQtRGI0F2Q_15_20.avi.npy
a man is doing a stunt on a wall 

3 ./data/testing_data/feat/k06Ge9ANKM8_5_16.avi.npy
a woman is playing with a ball 

4 ./data/testing_data/feat/sZf3VDsdDPM_107_114.avi.npy
a man is putting a woman in a hand 

5 ./data/testing_data/feat/shPymuahrsc_5_12.avi.npy
a cat is meowing 

6 ./data/testing_data/feat/XOAgUVVwKEA_8_20.avi.npy
a baby is putting on a baby 

7 ./data/testing_data/feat/ufFT2BWh3BQ_0_8.avi.npy
a panda is climbing on a ground 

8 ./data/testing_data/feat/5YJaS2Eswg0_22_26.avi.npy
a man is falling on a road 

9 ./data/testing_data/feat/lw7pTwpx0K0_38_48.avi.npy
a man is placing a food in a pan 

10 ./data/testing_data/feat/UbmZAe5u5FI_132_141.avi.npy
a person is cutting a shrimp