In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

import math, copy, time
import matplotlib.pyplot as plt
import seaborn
seaborn.set_context(context="talk")
%matplotlib inline
import pickle
import numpy as np
import pandas as pd
import os
from os.path import join
import glob
from glob import glob

import nltk
import pickle
import argparse
from collections import Counter
from pycocotools.coco import COCO

In [2]:
# Run on the first CUDA GPU when one is available, otherwise fall back to CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Input files read by DataLoader: the caption CSV, the pickled vocabulary,
# the pickled captions and the .npz archive holding the per-video features.
video_descriptions_csv = "../video-summarization/video_description.csv"
vocab_file = "vocab_10_sentence.pickle"
video_descriptions_file = "video_descriptions_10_sentence.pickle"
video_features_file = "../video-summarization/features_video_pca.npz"

# Number of videos in the training split; DataLoader serves the first
# `num_train_set` video ids for train=True and the remaining ids otherwise.
num_train_set = 1300
# Samples per batch produced by DataLoader.
batch_size = 64
# Epoch count (presumably consumed by a training cell not shown here).
num_epochs = 100

# Show which device was selected.
print(device)

cuda:0


In [3]:
class Vocabulary(object):
    """Two-way mapping between word strings and consecutive integer ids."""

    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        # Id that the next newly added word will receive.
        self.idx = 0

    def add_word(self, word):
        """Register `word` under the next free id; already-known words are ignored."""
        if word in self.word2idx:
            return
        new_id = self.idx
        self.word2idx[word] = new_id
        self.idx2word[new_id] = word
        self.idx = new_id + 1

    def __call__(self, word):
        """Return the id of `word`, or the id of '<unk>' when `word` is unknown."""
        lookup_key = word if word in self.word2idx else '<unk>'
        return self.word2idx[lookup_key]

    def __len__(self):
        """Number of distinct words registered so far."""
        return len(self.word2idx)

def build_vocab(json, threshold):
    """Build a Vocabulary from the captions of a COCO annotation file.

    Args:
        json: value handed to the `COCO` constructor (the annotation file).
        threshold: minimum number of occurrences a token needs to be kept.

    Returns:
        A Vocabulary holding '<pad>', '<start>', '<end>', '<unk>' (in that
        order) followed by every token seen at least `threshold` times.
    """
    coco = COCO(json)
    ann_ids = coco.anns.keys()

    # Count lower-cased NLTK tokens over every caption.
    word_counts = Counter()
    for position, ann_id in enumerate(ann_ids, start=1):
        text = str(coco.anns[ann_id]['caption']).lower()
        word_counts.update(nltk.tokenize.word_tokenize(text))

        if position % 1000 == 0:
            print("[{}/{}] Tokenized the captions.".format(position, len(ann_ids)))

    vocab = Vocabulary()

    # Special tokens come first so that they receive ids 0..3.
    for special in ('<pad>', '<start>', '<end>', '<unk>'):
        vocab.add_word(special)

    # Tokens rarer than `threshold` are dropped.
    for word, count in word_counts.items():
        if count >= threshold:
            vocab.add_word(word)
    return vocab

In [19]:
class DataLoader:
    """Batches pre-extracted video features with one-hot caption targets.

    The constructor copies the notebook-level settings (`device`, the file
    paths, `batch_size`, `num_train_set`) onto the instance and loads the
    captions, the vocabulary and the feature archive.

    Attributes:
        video_descriptions: mapping video id -> list of caption strings,
            unpickled from `video_descriptions_file`.
        vocab: unpickled vocabulary object; this class uses its `word2idx`
            and `idx2word` mappings and its `len()`.
        names: video ids served by this loader (the first `num_train_set`
            ids when `train` is true, the remaining ids otherwise).
        video_features: mapping video id -> 2-D array (rows, feature_dim),
            as returned by `np.load` on `video_features_file`.
        max_sentence_num: cap on captions kept per video when rebuilding the
            captions from the CSV; 0 keeps all of them.
        max_words_num: number of word positions in every caption target.
    """

    def __init__(self, train=True):
        print("Loading Data Loader instance for Train = {}...".format(train))

        self.device = device
        self.video_descriptions_csv = video_descriptions_csv
        self.video_descriptions_file = video_descriptions_file
        self.vocab_file = vocab_file
        self.video_features_file = video_features_file
        self.batch_size = batch_size

        # max sentences to include in the dataset. '0' to include all sentences
        self.max_sentence_num = 10
        self.max_words_num = 25

        self.num_train_set = num_train_set

        print("Loading video descriptions...")
        self.video_descriptions = self.load_descriptions()

        print("Loading vocabulary...")
        self.vocab = self.load_vocabulary()

        # The split relies on the key order of the unpickled dict.
        train_test = list(self.video_descriptions.keys())

        if train:
            self.names = train_test[:self.num_train_set]
        else:
            self.names = train_test[self.num_train_set:]

        print("Loading video features...")
        self.video_features = np.load(self.video_features_file)

        print("Data Loader initialized")

    def load_descriptions(self):
        """Unpickle and return the captions stored in `video_descriptions_file`."""
        # NOTE: unpickling runs arbitrary code; only load files you trust.
        with open(self.video_descriptions_file, 'rb') as handle:
            return pickle.load(handle)

    def load_vocabulary(self):
        """Unpickle and return the vocabulary stored in `vocab_file`."""
        # NOTE: unpickling runs arbitrary code; only load files you trust.
        with open(self.vocab_file, 'rb') as handle:
            return pickle.load(handle)

    def create_descriptions_from_csv(self):
        """Rebuild the caption dictionary from the raw CSV.

        Only rows whose 'Language' is 'English' are used. The key of a clip is
        '<VideoID>_<Start>_<End>'; clips without a matching entry under
        "../video-summarization/data/" are skipped. At most
        `max_sentence_num` captions are kept per clip (0 keeps all of them).

        Returns:
            dict mapping clip key -> list of caption strings.
        """
        desc = pd.read_csv(self.video_descriptions_csv)
        desc = desc[(desc['Language'] == 'English')]
        desc = desc[['VideoID', 'Start', 'End', 'Description']]
        desc_dict = {}

        for video_id, start, end, description in desc.itertuples(index=False, name=None):
            key = str(video_id) + '_' + str(start) + '_' + str(end)
            if not os.path.exists("../video-summarization/data/" + key):
                continue

            sentences = desc_dict.setdefault(key, [])
            # The first caption is always kept; afterwards honour the cap,
            # where a cap of 0 means "no limit".
            if (not sentences
                    or self.max_sentence_num == 0
                    or len(sentences) < self.max_sentence_num):
                sentences.append(str(description))

        return desc_dict

    @staticmethod
    def _normalize_word(word):
        """Lower-case `word` and keep only the text before its first '.'."""
        return word.lower().split('.')[0]

    def _word_index(self, word):
        """Vocabulary id of the normalized `word`, or the '<unk>' id if unknown."""
        token = self._normalize_word(word)
        word2idx = self.vocab.word2idx
        if token in word2idx:
            return word2idx[token]
        return word2idx['<unk>']

    def create_full_vocab(self):
        """Build a Vocabulary from every word of the loaded captions.

        The special tokens '<pad>', '<start>', '<end>', '<unk>' are added
        first, followed by each normalized caption word in order of first
        appearance.
        """
        vocab = Vocabulary()
        for special in ('<pad>', '<start>', '<end>', '<unk>'):
            vocab.add_word(special)

        for key in self.video_descriptions:
            sentences = ' '.join(self.video_descriptions[key])
            for word in sentences.split(' '):
                # add_word ignores words that are already registered.
                vocab.add_word(self._normalize_word(word))

        return vocab

    def get_words_from_index(self, tensor):
        """Map a 1-D tensor of vocabulary ids back to the list of words."""
        return [self.vocab.idx2word[idx] for idx in tensor.detach().cpu().tolist()]

    def get_one_hot_encoded_all(self, video_id):
        """Stack bag-of-words vectors for all captions of `video_id`.

        For each caption a vector of vocabulary size is created with a 1 at
        the id of every word of the caption.

        NOTE(review): the vector is appended once per *word*, so the result
        has one (identical) row per word of a caption rather than one row per
        caption - confirm whether one row per caption was intended.
        """
        target = []

        for sentence in self.video_descriptions[video_id]:
            x = torch.zeros(len(self.vocab), dtype=torch.float32)
            for word in sentence.split(' '):
                x[self._word_index(word)] = 1
                target.append(x)

        return torch.stack(target)

    def get_one_hot_encoded(self, video_id):
        """One-hot encode one randomly chosen caption of `video_id`.

        Returns:
            Float tensor of shape (max_words_num, len(vocab)). Row `p` has a
            single 1 at the id of the caption's p-th word; rows past the end
            of the caption stay all-zero. Words beyond `max_words_num` are
            dropped and unknown words map to '<unk>'.
        """
        descriptions = self.video_descriptions[video_id]
        index = np.random.randint(low=len(descriptions))

        target = torch.zeros(self.max_words_num, len(self.vocab), dtype=torch.float32)
        words = descriptions[index].split(' ')[:self.max_words_num]
        for position, word in enumerate(words):
            target[position, self._word_index(word)] = 1

        return target

    def _make_batch(self, indexes):
        """Assemble one (features, targets) batch for positions in `self.names`.

        Returns:
            x: (len(indexes), longest_rows, feature_dim) tensor, zero-padded
               after each video's own rows.
            y: (len(indexes), max_words_num, len(vocab)) one-hot targets.
            Both tensors are moved to `self.device`.
        """
        video_ids = [self.names[idx] for idx in indexes]
        # Fetch every array exactly once; an .npz archive may re-read the
        # array from disk on each lookup.
        features = [self.video_features[video_id] for video_id in video_ids]

        max_rows = max(feat.shape[0] for feat in features)
        feature_dim = features[0].shape[-1]

        x = torch.zeros(len(video_ids), max_rows, feature_dim)
        y = []
        for row, (video_id, feat) in enumerate(zip(video_ids, features)):
            x[row, :feat.shape[0], :] = torch.from_numpy(feat)
            y.append(self.get_one_hot_encoded(video_id))

        return x.to(self.device), torch.stack(y).to(self.device)

    def data_generator(self):
        """Endlessly yield batches of `batch_size` videos sampled with replacement."""
        while True:
            indexes = np.random.choice(a=np.arange(len(self.names)), size=self.batch_size)
            yield self._make_batch(indexes)

    def batch_data_generator(self):
        """Yield one pass over `self.names` in consecutive chunks of `batch_size`.

        The videos inside a chunk are shuffled; the last chunk may be smaller.
        """
        total = len(self.names)
        for start in range(0, total, self.batch_size):
            stop = min(start + self.batch_size, total)
            indexes = np.random.permutation(np.arange(start, stop))
            yield self._make_batch(indexes)

In [20]:
# Build one loader per split: train=True serves the first `num_train_set`
# video ids, train=False serves the remaining ones.
train_loader = DataLoader(train=True)
test_loader = DataLoader(train=False)

Loading Data Loader instance for Train = True...
Loading video descriptions...
Loading vocabulary...
Loading video features...
Data Loader initialized
Loading Data Loader instance for Train = False...
Loading video descriptions...
Loading vocabulary...
Loading video features...
Data Loader initialized


In [21]:
# Smoke test: make one pass over the training batches and print, for each
# batch, its index and the sizes of the feature and target tensors.
for i, (x, y) in enumerate(train_loader.batch_data_generator()):
    print(i, x.size(), y.size())

0 torch.Size([64, 31, 512]) torch.Size([64, 25, 5883])
1 torch.Size([64, 31, 512]) torch.Size([64, 25, 5883])
2 torch.Size([64, 31, 512]) torch.Size([64, 25, 5883])
3 torch.Size([64, 31, 512]) torch.Size([64, 25, 5883])
4 torch.Size([64, 31, 512]) torch.Size([64, 25, 5883])
5 torch.Size([64, 31, 512]) torch.Size([64, 25, 5883])
6 torch.Size([64, 31, 512]) torch.Size([64, 25, 5883])
7 torch.Size([64, 31, 512]) torch.Size([64, 25, 5883])
8 torch.Size([64, 31, 512]) torch.Size([64, 25, 5883])
9 torch.Size([64, 31, 512]) torch.Size([64, 25, 5883])
10 torch.Size([64, 31, 512]) torch.Size([64, 25, 5883])
11 torch.Size([64, 31, 512]) torch.Size([64, 25, 5883])
12 torch.Size([64, 31, 512]) torch.Size([64, 25, 5883])
13 torch.Size([64, 31, 512]) torch.Size([64, 25, 5883])
14 torch.Size([64, 31, 512]) torch.Size([64, 25, 5883])
15 torch.Size([64, 31, 512]) torch.Size([64, 25, 5883])
16 torch.Size([64, 31, 512]) torch.Size([64, 25, 5883])
17 torch.Size([64, 31, 512]) torch.Size([64, 25, 5883])
18

In [None]:
class EncoderDecoder(nn.Module):
    """
    Generic encoder-decoder wrapper.

    Holds an encoder, a decoder, the source and target embedding callables
    and a generator. The generator is stored but not applied by `forward`.
    """
    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator

    def forward(self, src, tgt, src_mask, tgt_mask):
        "Encode the masked source, then decode the masked target against it."
        memory = self.encode(src, src_mask)
        return self.decode(memory, src_mask, tgt, tgt_mask)

    def encode(self, src, src_mask):
        "Embed `src` and run it through the encoder."
        embedded_src = self.src_embed(src)
        return self.encoder(embedded_src, src_mask)

    def decode(self, memory, src_mask, tgt, tgt_mask):
        "Embed `tgt` and run the decoder over it and the encoder `memory`."
        embedded_tgt = self.tgt_embed(tgt)
        return self.decoder(embedded_tgt, memory, src_mask, tgt_mask)

In [None]:
class Generator(nn.Module):
    "Output head: a linear projection to `vocab` scores followed by log-softmax."
    def __init__(self, d_model, vocab):
        super().__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        "Return log-probabilities over the last dimension of the projected input."
        logits = self.proj(x)
        return F.log_softmax(logits, dim=-1)

In [None]:
def clones(module, N):
    "Return a ModuleList holding N independent deep copies of `module`."
    copies = nn.ModuleList()
    for _ in range(N):
        copies.append(copy.deepcopy(module))
    return copies

In [None]:
class Encoder(nn.Module):
    "Stack of N copies of `layer`, followed by a final LayerNorm."
    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        "Feed `x` (with the same `mask`) through every layer, then normalize."
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)

In [None]:
class LayerNorm(nn.Module):
    "Layer normalization over the last dimension with learnable gain and bias."
    def __init__(self, features, eps=1e-6):
        super().__init__()
        # a_2 is the gain (initialized to ones), b_2 the bias (initialized to zeros).
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        "Center by the mean, divide by (std + eps), then apply gain and bias."
        centered = x - x.mean(-1, keepdim=True)
        spread = x.std(-1, keepdim=True) + self.eps
        return self.a_2 * centered / spread + self.b_2

In [None]:
class SublayerConnection(nn.Module):
    """
    Residual wrapper around a sublayer.
    The input is normalized before the sublayer runs (pre-norm), the
    sublayer output goes through dropout and is added back to the input.
    """
    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Return x + dropout(sublayer(norm(x))); `sublayer` must keep the size of x."
        normed = self.norm(x)
        update = self.dropout(sublayer(normed))
        return x + update

In [None]:
class EncoderLayer(nn.Module):
    "One encoder block: self-attention then feed-forward, each in a residual wrapper."
    def __init__(self, size, self_attn, feed_forward, dropout):
        super().__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x, mask):
        "Apply masked self-attention, then the feed-forward network."
        attn_residual, ff_residual = self.sublayer

        def self_attend(normed):
            # Query, key and value are all the (normalized) layer input.
            return self.self_attn(normed, normed, normed, mask)

        x = attn_residual(x, self_attend)
        return ff_residual(x, self.feed_forward)

In [None]:
class Decoder(nn.Module):
    "Stack of N copies of a decoder layer, followed by a final LayerNorm."
    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, memory, src_mask, tgt_mask):
        "Run `x` through every layer with the same memory and masks, then normalize."
        hidden = x
        for block in self.layers:
            hidden = block(hidden, memory, src_mask, tgt_mask)
        return self.norm(hidden)

In [None]:
class DecoderLayer(nn.Module):
    "One decoder block: self-attention, attention over the memory, feed-forward."
    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super().__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 3)

    def forward(self, x, memory, src_mask, tgt_mask):
        "Apply the three sublayers in order, each inside its residual wrapper."
        self_residual, src_residual, ff_residual = self.sublayer

        def attend_to_self(normed):
            # Masked attention of the target over itself.
            return self.self_attn(normed, normed, normed, tgt_mask)

        def attend_to_memory(normed):
            # The target queries the encoder output (keys and values).
            return self.src_attn(normed, memory, memory, src_mask)

        x = self_residual(x, attend_to_self)
        x = src_residual(x, attend_to_memory)
        return ff_residual(x, self.feed_forward)

In [None]:
def subsequent_mask(size):
    """Boolean mask of shape (1, size, size) that hides later positions.

    Entry [0, i, j] is True when j <= i (position i may look at j) and
    False when j > i.
    """
    # Ones strictly above the diagonal mark the positions to hide.
    hidden = torch.triu(torch.ones(1, size, size), diagonal=1)
    return hidden == 0

In [None]:
def attention(query, key, value, mask=None, dropout=None):
    """Scaled dot-product attention.

    Scores are query @ key^T divided by sqrt(d_k), where d_k is the last
    dimension of `query`. Positions where `mask == 0` get a score of -1e9
    before the softmax. `dropout`, when given, is applied to the attention
    weights.

    Returns:
        (weights @ value, weights)
    """
    scale = math.sqrt(query.size(-1))
    scores = torch.matmul(query, key.transpose(-2, -1)) / scale
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
    weights = F.softmax(scores, dim=-1)
    if dropout is not None:
        weights = dropout(weights)
    attended = torch.matmul(weights, value)
    return attended, weights

In [None]:
class MultiHeadedAttention(nn.Module):
    "Multi-head attention with h heads of width d_model // h."
    def __init__(self, h, d_model, dropout=0.1):
        "Take in the number of heads and the model size."
        super().__init__()
        assert d_model % h == 0
        # The value width is taken to be the same as the key width.
        self.d_k = d_model // h
        self.h = h
        # Three input projections (query, key, value) plus the output projection.
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        # Attention weights of the most recent forward pass.
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        "Project the inputs, attend per head, merge the heads and project out."
        nbatches = query.size(0)
        if mask is not None:
            # Extra dimension so the same mask broadcasts over all h heads.
            mask = mask.unsqueeze(1)

        def split_heads(linear, tensor):
            # (batch, seq, d_model) -> (batch, h, seq, d_k)
            projected = linear(tensor)
            return projected.view(nbatches, -1, self.h, self.d_k).transpose(1, 2)

        q_proj, k_proj, v_proj, out_proj = self.linears
        query = split_heads(q_proj, query)
        key = split_heads(k_proj, key)
        value = split_heads(v_proj, value)

        x, self.attn = attention(query, key, value, mask=mask,
                                 dropout=self.dropout)

        # (batch, h, seq, d_k) -> (batch, seq, h * d_k)
        merged = x.transpose(1, 2).contiguous()
        merged = merged.view(nbatches, -1, self.h * self.d_k)
        return out_proj(merged)

In [None]:
class PositionwiseFeedForward(nn.Module):
    "Two-layer feed-forward network: d_model -> d_ff -> d_model with ReLU and dropout."
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        "Compute w_2(dropout(relu(w_1(x))))."
        hidden = F.relu(self.w_1(x))
        hidden = self.dropout(hidden)
        return self.w_2(hidden)

In [None]:
class Embeddings(nn.Module):
    "Embedding lookup whose output is scaled by sqrt(d_model)."
    def __init__(self, d_model, vocab):
        super().__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        "Look up the embeddings of the ids in `x` and multiply by sqrt(d_model)."
        scale = math.sqrt(self.d_model)
        return self.lut(x) * scale

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to the input, then apply dropout.

    Args:
        d_model: size of the last input dimension (even or odd).
        dropout: dropout probability applied after the encodings are added.
        max_len: number of positions precomputed; inputs must not be longer.
    """
    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0.0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0.0, d_model, 2) * -(math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so the cosine part uses only the first d_model // 2 frequencies.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        pe = pe.unsqueeze(0)
        # Registered as a buffer: moves with the module and is saved in its
        # state dict, but is not a trainable parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        "Add the encodings of the first x.size(1) positions to `x`."
        # The buffer carries no gradient, so no wrapper is needed here.
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

In [None]:
def make_model(tgt_vocab, N=6, 
               d_model=512, d_ff=2048, h=8, dropout=0.1):
    """Helper: construct an encoder-decoder model from hyperparameters.

    Args:
        tgt_vocab: size of the target vocabulary (embedding table and
            generator output size).
        N: number of encoder layers and of decoder layers.
        d_model: model width.
        d_ff: hidden width of the position-wise feed-forward networks.
        h: number of attention heads.
        dropout: dropout probability used by the attention, feed-forward,
            positional-encoding and residual modules.

    Returns:
        An EncoderDecoder whose parameters with more than one dimension are
        Xavier-uniform initialized.
    """
    c = copy.deepcopy
    # Pass `dropout` on explicitly so the attention modules follow the same
    # setting as the rest of the model instead of their own default.
    attn = MultiHeadedAttention(h, d_model, dropout)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), 
                             c(ff), dropout), N),
        # The source side has no embedding table: only the positional
        # encoding is applied to `src` (assumes src already has d_model
        # features per position - TODO confirm against the data loader).
        nn.Sequential(c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab))
    
    # Initialize parameters with Glorot / fan_avg; vectors (biases, norm
    # gains) keep their default initialization.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model

In [None]:
class NoamOpt:
    """Optimizer wrapper that sets the learning rate before every step.

    The rate grows linearly for `warmup` steps and then decays with the
    inverse square root of the step number:

        rate = factor * model_size**-0.5 * min(step**-0.5, step * warmup**-1.5)
    """
    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0
        
    def step(self):
        "Advance the step counter, write the new rate to every param group, then step."
        self._step += 1
        rate = self.rate()
        for group in self.optimizer.param_groups:
            group['lr'] = rate
        self._rate = rate
        self.optimizer.step()
        
    def rate(self, step = None):
        """Learning rate for `step` (defaults to the current step count).

        Returns 0.0 for step <= 0: that is the value of the warm-up ramp at
        step 0, and it avoids raising zero to a negative power when the rate
        is queried before the first update.
        """
        if step is None:
            step = self._step
        if step <= 0:
            return 0.0
        return self.factor * \
            (self.model_size ** (-0.5) *
            min(step ** (-0.5), step * self.warmup ** (-1.5)))


def _infer_model_size(model):
    "Return the `d_model` of the first embedding layer that exposes one."
    # src_embed is checked first; when its layers carry no `d_model`
    # attribute the target embedding is used instead.
    for embed in (model.src_embed, model.tgt_embed):
        for layer in embed:
            size = getattr(layer, 'd_model', None)
            if size is not None:
                return size
    raise AttributeError("no layer of src_embed or tgt_embed defines d_model")


def get_std_opt(model):
    "Build a NoamOpt (factor 2, 4000 warm-up steps) around Adam for `model`."
    return NoamOpt(_infer_model_size(model), 2, 4000,
            torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))

In [None]:
class LabelSmoothing(nn.Module):
    """KL-divergence loss against a label-smoothed target distribution.

    Args:
        size: number of classes (must equal x.size(1) in `forward`).
        padding_idx: class id of the padding token; it never receives
            probability mass and rows whose target is padding contribute no loss.
        smoothing: total probability mass spread over the non-target,
            non-padding classes; the target class gets 1 - smoothing.
    """
    def __init__(self, size, padding_idx, smoothing=0.0):
        super(LabelSmoothing, self).__init__()
        # Summed (not averaged) loss; `reduction='sum'` is the current
        # spelling of the deprecated `size_average=False`.
        self.criterion = nn.KLDivLoss(reduction='sum')
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.size = size
        # Smoothed target distribution of the most recent forward pass.
        self.true_dist = None
        
    def forward(self, x, target):
        """Return the summed KL divergence between the smoothed targets and `x`.

        Args:
            x: (rows, size) tensor of log-probabilities.
            target: (rows,) tensor of class ids.
        """
        assert x.size(1) == self.size
        # size - 2 because neither the target class nor the padding class
        # shares the smoothing mass.
        true_dist = torch.full_like(x, self.smoothing / (self.size - 2)).detach()
        true_dist.scatter_(1, target.unsqueeze(1), self.confidence)
        true_dist[:, self.padding_idx] = 0
        # Zero every row whose target is padding. A boolean row mask works
        # for zero, one or many padding rows alike.
        padding_rows = (target == self.padding_idx).unsqueeze(1)
        true_dist.masked_fill_(padding_rows, 0.0)
        self.true_dist = true_dist
        return self.criterion(x, true_dist)

In [None]:
def run_epoch(data_iter, model, loss_compute):
    """Run the model over every batch of `data_iter` and log progress.

    For each batch the model output is passed to `loss_compute` together with
    `batch.trg_y` and `batch.ntokens`. A progress line is printed whenever the
    batch index modulo 50 equals 1.

    Returns:
        The accumulated loss divided by the accumulated token count.
    """
    window_start = time.time()
    loss_sum = 0
    token_sum = 0
    # Tokens seen since the last progress line (for the tokens/sec figure).
    window_tokens = 0
    for step, batch in enumerate(data_iter):
        out = model.forward(batch.src, batch.trg,
                            batch.src_mask, batch.trg_mask)
        loss = loss_compute(out, batch.trg_y, batch.ntokens)
        loss_sum += loss
        token_sum += batch.ntokens
        window_tokens += batch.ntokens
        if step % 50 != 1:
            continue
        elapsed = time.time() - window_start
        print("Epoch Step: %d Loss: %f Tokens per Sec: %f" %
                (step, loss / batch.ntokens, window_tokens / elapsed))
        window_start = time.time()
        window_tokens = 0
    return loss_sum / token_sum