In [1]:
import math

from lib.utilities import *
from lib.experiments.an4_speech_encoder_decoder import *

In [2]:
# Experiment configuration. Every value is read back as an attribute of H
# (e.g. H.LR, H.USE_CUDA) by the cells below and is also handed to
# create_data_pipelines(H).
# NOTE(review): the RNN_*, BIDIRECTIONAL, MOMENTUM, NESTEROV and
# TEACHER_FORCING_RATIO entries are not referenced by any cell shown in this
# notebook — presumably kept for other experiments; confirm before removing.
# NOTE(review): ROOT_DIR and NOISE_BG_DIR are absolute, machine-specific paths.
H = HYPERPARAMETERS({
    'EXPERIMENT': 'AN4',
    'DESCRIPTION': 'Transformer model',
    'TIMESTAMP': HYPERPARAMETERS.create_timestamp(),

    'MODEL_NAME': 'AN4_CNN_TRANSFORMER',

    'PRELOAD_MODEL_PATH': None, #'AN4_CNN_TRANSFORMER.tar',

    'ROOT_DIR': '/Volumes/SSD1',
    'MANIFESTS': ['manifest.json'],  # , 'sts_manifest_pseudo.json'],

    'TARGET_ENCODING': 'sts',  # alternative noted by the author: 'ctc'

    'BATCH_SIZE': 20,
    'NUM_WORKERS': 8,

    'RNN_HIDDEN_SIZE': 256,
    'RNN_NUM_LAYERS': 2,
    'RNN_DROPOUT': 0.5,
    'CNN_DROPOUT': 0.5,
    'BIDIRECTIONAL': True,

    'LR': 0.0003,
    # Multiplicative LR factor per epoch: 0.78 ** floor((epoch + 1) / 200), never below 0.01.
    'LR_LAMBDA': lambda epoch: max(math.pow(0.78, math.floor((1 + epoch) / 200.0)), 0.01),
    'WEIGHT_DECAY': 0,
    'MOMENTUM': 0.9,
    'NESTEROV': True,

    'TEACHER_FORCING_RATIO': 0.5,

    'LABEL_SMOOTHING' : 0.2,

    'MAX_GRAD_NORM': 400,

    'MAX_EPOCHS': 200,

    'STOPPING_PATIENCE': 80,

    'CHECKPOINT_INTERVAL': 10,
    'CHECKPOINT_RESTORE': False,

    'USE_CUDA': torch.cuda.is_available(),

    'SEED': 123456,

    'DATASET_MEAN_STD': (0.060487103, 0.16884679),

    'NORMALIZE_DB': -40,
    'NORMALIZE_MAX_GAIN': 300,

    'MIN_MAX_AUDIO_DURATION': None,  # (1, 15),
    'MIN_MAX_TRANSCRIPT_LEN': None,  # (0, 15),
    'MIN_TRANSCRIPT_CONFIDENCE': None,  # 0.95,

    'AUDIO_SAMPLE_RATE': 16000,

    'SPECT_WINDOW_SIZE': 0.02,
    'SPECT_WINDOW_STRIDE': 0.01,
    'SPECT_WINDOW': 'hamming',

    'AUGMENTATION_PROBABILITY': 0.0,

    'NOISE_BG_PROBABILITY': 0.4,
    'NOISE_BG_LEVELS': (0.0, 0.5),
    'NOISE_BG_DIR': '/Volumes/SSD1/BACKGROUND_NOISE',

    'AUDIO_PITCH_PROBABILITY': 0.4,
    'AUDIO_PITCH_PM': 4,

    'AUDIO_SPEED_PROBABILITY': 0.4,
    'AUDIO_SPEED_LOW_HIGH': (0.9, 1.1),

    'AUDIO_DYNAMIC_PROBABILITY': 0.4,
    'AUDIO_DYNAMIC_LOW_HIGH': (0.5, 1.1),

    'AUDIO_SHIFT_PROBABILITY': 0.4,
    'AUDIO_SHIFT_MIN_MAX': (-5, 5),

    'AUDIO_NOISE_PROBABILITY': 0.4,
    'AUDIO_NOISE_LEVELS': (0.0, 0.5),
    'AUDIO_NOISE_COLORS': ['white', 'pink', 'blue', 'brown', 'violet'],
})

In [3]:
# Build the training/validation loaders and the vocabulary from the hyperparameters above.
# `vocab` is later used both as a callable (token <-> index lookup) and with len().
train_loader, valid_loader, vocab = create_data_pipelines(H)

In [4]:
# Grab only the first training batch (the loop stops immediately) so the model
# cells below can be smoke-tested on CPU tensors; the fifth batch element is ignored.
for inputs_cpu, labels_cpu, input_sizes_cpu, label_sizes_cpu, _ in train_loader:
    break

# Display the shapes of the probe batch.
inputs_cpu.shape, labels_cpu.shape, input_sizes_cpu.shape, label_sizes_cpu.shape

(torch.Size([20, 1, 161, 81]),
 torch.Size([20, 7]),
 torch.Size([20]),
 torch.Size([20]))

In [5]:
class MaskModule(nn.Module):
    """Apply the children of ``seq_module`` one by one, zeroing padded time steps after each.

    The time axis is the last axis of a ``B x C x D x T`` input. After every
    child module the activations at time indices at or beyond each sample's
    (post-network) length are set to zero, so results do not depend on how
    much padding the batch carries.
    """

    def __init__(self, seq_module):
        """
        Adds padding to the output of the module based on the given lengths. This is to ensure that the
        results of the model do not change when batch sizes change during inference.
        Input needs to be in the shape of (BxCxDxT)

        :param seq_module: iterable container of modules (e.g. ``nn.Sequential``)
        """
        super(MaskModule, self).__init__()
        self.seq_module = seq_module

    def forward(self, x, x_lengths):
        """
        Input of size BxCxDxT

        :param x: 4-D input batch
        :param x_lengths: 1-D integer tensor with the valid length of every sample
        :return: ``(masked output, output lengths)``; the lengths are a CPU int32 tensor
        """
        lengths = self.get_seq_lens(x_lengths)
        # B x 1 x 1 x 1 so that it broadcasts against the time index below.
        valid = lengths.to(x.device).long().view(-1, 1, 1, 1)
        for module in self.seq_module:
            x = module(x)
            # One vectorised comparison replaces the per-sample narrow()/fill_() loop
            # and yields the mask dtype that masked_fill expects.
            steps = torch.arange(x.size(3), device=x.device).view(1, 1, 1, -1)
            x = x.masked_fill(steps >= valid, 0)
        return x, lengths

    @staticmethod
    def _time_axis(value):
        """Return the time-axis (second) entry of a conv/pool hyperparameter.

        Pooling layers keep the argument exactly as it was passed, so it may be a
        plain int instead of an ``(height, width)`` pair.
        """
        if isinstance(value, (tuple, list)):
            return value[1]
        return value

    def get_seq_lens(self, input_lengths):
        """
        Given a 1D Tensor or Variable containing integer sequence lengths, return a 1D tensor or variable
        containing the size sequences that will be output by the network.

        Only ``Conv2d``, ``MaxPool2d`` and ``AvgPool2d`` layers (exact type match,
        searched recursively) change the length; every other module is ignored.
        """
        seq_len = input_lengths.cpu().long()
        for m in self.seq_module.modules():
            layer_type = type(m)
            if layer_type is nn.Conv2d or layer_type is nn.MaxPool2d:
                dilation = self._time_axis(m.dilation)
            elif layer_type is nn.AvgPool2d:
                dilation = 1  # average pooling has no dilation
            else:
                continue
            padding = self._time_axis(m.padding)
            kernel = self._time_axis(m.kernel_size)
            stride = self._time_axis(m.stride)
            numerator = seq_len + 2 * padding - dilation * (kernel - 1) - 1
            # Explicit floor division: "/" on integer tensors is true division in
            # current PyTorch and would turn the lengths into floats.
            seq_len = torch.div(numerator, stride, rounding_mode='floor') + 1

        return seq_len.int()

In [6]:
def conv3x3(in_planes, out_planes, stride=1):
    """Return a bias-free 3x3 ``nn.Conv2d`` with padding 1 and the given stride."""
    conv_options = dict(kernel_size=3, stride=stride, padding=1, bias=False)
    return nn.Conv2d(in_planes, out_planes, **conv_options)

class BasicBlock(nn.Module):
    """Residual block: conv3x3 -> BN -> ReLU -> conv3x3 -> BN, plus a shortcut, then ReLU.

    ``downsample`` (optional module) is applied to the block input to produce the
    shortcut when the main branch changes resolution or channel count.
    """
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        # Sub-modules are created in the same order as before so that random
        # weight initialisation consumes the RNG identically.
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        # Main branch.
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        # Shortcut branch: identity unless a projection module was supplied.
        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)

class CNN(nn.Module):
    """Convolutional front-end: two wide conv stems, four residual stages and a final conv.

    Every stage is wrapped in ``MaskModule`` so padded time steps are zeroed and
    the per-sample lengths are tracked through the network.
    """

    def __init__(self, dropout=0.5, initialize=None):
        """
        :param dropout: dropout probability used in the two conv stems
        :param initialize: optional callable invoked with this module once all layers exist
        """
        super(CNN, self).__init__()
        self.dropout = dropout
        self.initialize = initialize
        # Running channel count; _make_layer reads and updates it.
        self.inplanes = 32

        stem1 = [
            nn.Conv2d(1, 32, kernel_size=(41, 11), stride=(2, 2), padding=(20, 5), bias=False),
            nn.BatchNorm2d(32),
            nn.Hardtanh(0, 20, inplace=True),
            nn.Dropout(dropout),
        ]
        self.conv1 = MaskModule(nn.Sequential(*stem1))

        stem2 = [
            nn.Conv2d(32, 32, kernel_size=(21, 11), stride=(2, 1), padding=(10, 5), bias=False),
            nn.BatchNorm2d(32),
            nn.Hardtanh(0, 20, inplace=True),
            nn.Dropout(dropout),
        ]
        self.conv2 = MaskModule(nn.Sequential(*stem2))

        # Four residual stages of two blocks each; stride (2, 1) halves only the first spatial axis.
        self.bb1 = MaskModule(self._make_layer(BasicBlock, 64, 2, stride=(2, 1)))
        self.bb2 = MaskModule(self._make_layer(BasicBlock, 128, 2, stride=(2, 1)))
        self.bb3 = MaskModule(self._make_layer(BasicBlock, 256, 2, stride=(2, 1)))
        self.bb4 = MaskModule(self._make_layer(BasicBlock, 256, 2, stride=(2, 1)))

        head = [
            nn.Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 1), padding=(0, 0), bias=False),
            nn.BatchNorm2d(256, affine=True),
            nn.ReLU(inplace=True),
        ]
        self.conv3 = MaskModule(nn.Sequential(*head))

        if self.initialize is not None:
            self.initialize(self)

    def forward(self, inputs, input_sizes):
        """Run all stages in order and return ``(features, lengths)``.

        The output drops axis 2 (``squeeze(2)``) and swaps the remaining two
        non-batch axes, i.e. it is laid out as batch x time x channels.
        """
        stages = (self.conv1, self.conv2, self.bb1, self.bb2, self.bb3, self.bb4, self.conv3)

        outputs, output_sizes = inputs, input_sizes
        for stage in stages:
            outputs, output_sizes = stage(outputs, output_sizes)

        return outputs.squeeze(2).transpose(1, 2), output_sizes

    def _make_layer(self, block, planes, blocks, stride=1):
        """Stack ``blocks`` instances of ``block``; only the first one uses ``stride``."""
        out_planes = planes * block.expansion

        # A 1x1 projection is needed on the shortcut whenever the first block
        # changes the resolution or the number of channels.
        downsample = None
        if stride != 1 or self.inplanes != out_planes:
            projection = nn.Conv2d(self.inplanes, out_planes, kernel_size=1, stride=stride, bias=False)
            downsample = nn.Sequential(projection, nn.BatchNorm2d(out_planes))

        layers = [block(self.inplanes, planes, stride, downsample)]
        self.inplanes = out_planes
        layers.extend(block(self.inplanes, planes) for _ in range(blocks - 1))

        return nn.Sequential(*layers)

In [7]:
# class CNN(nn.Module):
#     def __init__(self, dropout=0.5, initialize=None):
#         super(CNN, self).__init__()
#         self.initialize = initialize

#         self.conv = MaskModule(nn.Sequential(
#             nn.Conv2d(1, 32, kernel_size=(41, 11), stride=(2, 2), padding=(20, 5), bias=False),
#             nn.BatchNorm2d(32),
#             nn.Hardtanh(0, 20, inplace=True),
#             nn.Dropout(dropout),
#             nn.Conv2d(32, 32, kernel_size=(21, 11), stride=(2, 1), padding=(10, 5), bias=False),
#             nn.BatchNorm2d(32),
#             nn.Hardtanh(0, 20, inplace=True),
#             nn.Dropout(dropout)
#         ))

#         if self.initialize is not None:
#             self.initialize(self)

#     def forward(self, x, lengths):

#         output_lengths = self.get_seq_lens(lengths)

#         x, _ = self.conv(x, output_lengths)

#         sizes = x.size()
#         x = x.view(sizes[0], sizes[1] * sizes[2], sizes[3])  # Collapse feature dimension
#         x = x.transpose(1, 2).transpose(0, 1).contiguous()  # TxNxH

#         return x, output_lengths

#     def get_seq_lens(self, input_length):
#         """
#         Given a 1D Tensor or Variable containing integer sequence lengths, return a 1D tensor or variable
#         containing the size sequences that will be output by the network.
#         :param input_length: 1D Tensor
#         :return: 1D Tensor scaled by model
#         """
#         seq_len = input_length.cpu().int()
#         for m in self.conv.modules():
#             if type(m) == nn.modules.conv.Conv2d:
#                 seq_len = ((seq_len + 2 * m.padding[1] - m.dilation[1] * (m.kernel_size[1] - 1) - 1) / m.stride[1] + 1)
#         return seq_len.int()

In [8]:
# Smoke test: run the CNN front-end on the probe batch (CPU) and compare shapes.
cnn_cpu = CNN(dropout=H.CNN_DROPOUT, initialize=torch_weight_init)

cnn_outputs_cpu, cnn_output_sizes_cpu = cnn_cpu(inputs_cpu, input_sizes_cpu)

# Input batch / input lengths versus CNN features / CNN output lengths.
inputs_cpu.shape, input_sizes_cpu.shape, cnn_outputs_cpu.shape, cnn_output_sizes_cpu.shape

(torch.Size([20, 1, 161, 81]),
 torch.Size([20]),
 torch.Size([20, 39, 256]),
 torch.Size([20]))

In [9]:
import math

import numpy as np
import torch
import torch.nn as nn

#torch.Size([20, 269, 256]) torch.Size([256, 256]) 269
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a ``batch x length x d_model`` tensor.

    The table holds ``len_max`` rows; each row is the concatenation of
    ``d_model // 2`` sine values followed by ``d_model // 2`` cosine values
    (concatenated, not interleaved). Dropout is applied to the sum.

    :param d_model: feature size of the inputs; must be even and at least 2
    :param droput: dropout probability (parameter name kept as spelled for callers)
    :param len_max: longest sequence the table can serve
    :raises ValueError: if ``d_model`` is odd or smaller than 2
    """

    def __init__(self, d_model, droput, len_max=512):
        super(PositionalEncoding, self).__init__()
        if d_model < 2 or d_model % 2 != 0:
            # An odd size would produce a table with d_model - 1 columns that
            # cannot be added to the inputs.
            raise ValueError("d_model must be an even number >= 2, got {}".format(d_model))

        self.d_model = d_model
        self.droput = droput
        self.len_max = len_max

        position = torch.arange(0.0, self.len_max)
        num_timescales = self.d_model // 2
        # max(..., 1) keeps d_model == 2 (a single timescale) from dividing by zero.
        log_timescale_increment = math.log(10000) / max(num_timescales - 1, 1)
        inv_timescales = torch.exp(torch.arange(0.0, num_timescales) * -log_timescale_increment)
        scaled_time = position.unsqueeze(1) * inv_timescales.unsqueeze(0)
        pos_emb = torch.cat((torch.sin(scaled_time), torch.cos(scaled_time)), 1)

        # wrap in a buffer so that model can be moved to GPU
        self.register_buffer('pos_emb', pos_emb)

        self.drop = nn.Dropout(self.droput)

    def forward(self, word_emb):
        """Return ``dropout(word_emb + positions)`` for a ``batch x length x d_model`` input.

        :raises RuntimeError: if the sequence is longer than ``len_max``
        """
        len_seq = word_emb.size(1)
        if len_seq > self.len_max:
            # Without this check the addition fails with an opaque broadcasting
            # error (or silently broadcasts when len_max == 1).
            raise RuntimeError(
                "sequence length {} exceeds the positional table size len_max={}".format(len_seq, self.len_max))
        return self.drop(word_emb + self.pos_emb[:len_seq, :])

In [10]:
class MultiHeadAttention(nn.Module):
    """Scaled dot-product attention over ``num_heads`` heads with residual connection and LayerNorm.

    ``forward`` returns the normalised output together with the attention
    weights averaged over the heads (called "coverage" in this notebook).
    """

    def __init__(self, num_heads, d_model, droput):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        self.droput = droput

        self.d_head = d_model // self.num_heads
        projected = self.num_heads * self.d_head

        # Bias-free input projections for queries, keys and values.
        self.fc_query = nn.Linear(self.d_model, projected, bias=False)
        self.fc_key = nn.Linear(self.d_model, projected, bias=False)
        self.fc_value = nn.Linear(self.d_model, projected, bias=False)

        # Output projection applied to the concatenated heads.
        self.fc_concat = nn.Linear(projected, self.d_model, bias=False)

        # Softmax over the key axis of a (rows x len_key) view.
        self.softmax = nn.Softmax(dim=1)

        self.attn_dropout = nn.Dropout(self.droput)
        self.dropout = nn.Dropout(self.droput)

        self.norm = nn.LayerNorm(self.d_model)

    def _prepare_proj(self, x):
        """Split the last axis into heads and fold the heads into the batch axis.

        ``batch x length x (heads * d_head)`` -> ``(batch * heads) x length x d_head``
        """
        batch, length, _ = x.size()
        per_head = x.view(batch, length, self.num_heads, self.d_head).transpose(1, 2)
        return per_head.contiguous().view(batch * self.num_heads, length, self.d_head)

    def forward(self, query, key, value, mask):
        """
        :param query: batch x len_query x d_model
        :param key: batch x len_key x d_model
        :param value: batch x len_key x d_model
        :param mask: broadcastable to batch x len_query x len_key; truthy entries are blocked
        :return: ``(output, coverage)`` with output batch x len_query x d_model and
                 coverage batch x len_query x len_key
        """
        heads = self.num_heads
        batch = query.size(0)
        n_query, n_key = query.size(1), key.size(1)

        # Project, then reshape to (batch * heads) x length x d_head.
        q = self._prepare_proj(self.fc_query(query))
        k = self._prepare_proj(self.fc_key(key))
        v = self._prepare_proj(self.fc_value(value))

        # Scaled dot-product scores; blocked positions get -inf before the softmax.
        scores = torch.bmm(q, k.transpose(1, 2)) / math.sqrt(self.d_head)
        scores = scores.view(batch, heads, n_query, n_key)
        scores = scores.masked_fill_(mask.unsqueeze(1), -float('inf'))
        weights = self.softmax(scores.view(-1, n_key))

        # Head-averaged attention, taken before attention dropout.
        coverage = weights.view(batch, heads, n_query, n_key).mean(dim=1)

        weights = self.attn_dropout(weights).view(batch * heads, n_query, n_key)

        # Weighted sum of the values, heads concatenated back on the feature axis.
        mixed = torch.bmm(weights, v)
        mixed = mixed.view(batch, heads, n_query, self.d_head).transpose(1, 2).contiguous()
        out = self.fc_concat(mixed.view(batch, n_query, heads * self.d_head))

        # Residual connection followed by layer normalisation.
        out = self.dropout(out)
        out.add_(query)
        return self.norm(out), coverage


class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward network with dropout, residual connection and LayerNorm.

    Computes ``LayerNorm(dropout(Linear(dropout(ReLU(Linear(x))))) + x)``.
    """

    def __init__(self, d_model, d_ff, dropout):
        super(PositionwiseFeedForward, self).__init__()
        self.d_model = d_model
        self.d_ff = d_ff
        self.dropout = dropout

        # Expand to d_ff, apply the non-linearity, project back to d_model.
        stack = [
            nn.Linear(d_model, d_ff),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model),
        ]
        self.fc = nn.Sequential(*stack)
        self.drop = nn.Dropout(self.dropout)
        self.norm = nn.LayerNorm(d_model)

    def forward(self, inputs):
        """Return the normalised residual output; same shape as ``inputs``."""
        transformed = self.drop(self.fc(inputs))
        transformed.add_(inputs)  # residual connection
        return self.norm(transformed)


class EncoderLayer(nn.Module):
    """One encoder layer: multi-head attention followed by the position-wise feed-forward block."""

    def __init__(self, num_heads, d_model, dropout, d_ff):
        super(EncoderLayer, self).__init__()
        self.num_heads, self.d_model = num_heads, d_model
        self.d_ff, self.dropout = d_ff, dropout

        self.attention = MultiHeadAttention(num_heads, d_model, dropout)
        self.ff = PositionwiseFeedForward(d_model, d_ff, dropout)

    def forward(self, query, key, value, mask):
        """Attend from ``query`` over ``key``/``value`` under ``mask``; the attention weights are discarded."""
        attended = self.attention(query, key, value, mask)[0]
        return self.ff(attended)


class DecoderLayer(nn.Module):
    """One decoder layer: target self-attention, attention over the encoder context, feed-forward."""

    def __init__(self, num_heads, d_model, dropout, d_ff):
        super(DecoderLayer, self).__init__()
        self.num_heads, self.d_model = num_heads, d_model
        self.d_ff, self.dropout = d_ff, dropout

        # Creation order is kept: target attention, source attention, feed-forward.
        self.attention_tgt = MultiHeadAttention(num_heads, d_model, dropout)
        self.attention_src = MultiHeadAttention(num_heads, d_model, dropout)
        self.ff = PositionwiseFeedForward(d_model, d_ff, dropout)

    def forward(self, query, key, value, context, mask_tgt, mask_src):
        """
        :return: ``(output, coverage)`` where coverage is the head-averaged attention
                 of the target positions over the encoder context
        """
        self_attended = self.attention_tgt(query, key, value, mask_tgt)[0]
        cross_attended, coverage = self.attention_src(self_attended, context, context, mask_src)
        return self.ff(cross_attended), coverage

In [11]:
class Encoder(nn.Module):
    """CNN front-end followed by positional encoding and a stack of self-attention layers.

    :param num_heads: attention heads per layer
    :param d_model: feature size; must match the channel count produced by ``CNN``
    :param dropout: dropout probability for the positional encoding and the layers
    :param d_ff: hidden size of the feed-forward blocks
    :param num_layers: number of ``EncoderLayer`` instances
    :param padding_idx: stored on the instance; not used by the encoder itself
    """

    def __init__(self, num_heads, d_model, dropout, d_ff, num_layers=6, padding_idx=1):
        super(Encoder, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        self.padding_idx = padding_idx
        self.num_layers = num_layers
        self.d_ff = d_ff
        self.dropout = dropout

        # The CNN dropout is fixed at 0.5 here, independent of `dropout`.
        self.cnn = CNN(dropout=0.5)

        self.pos_emb = PositionalEncoding(self.d_model, self.dropout, len_max=1024)

        self.layers = nn.ModuleList(
            [EncoderLayer(self.num_heads, self.d_model, self.dropout, self.d_ff) for _ in range(self.num_layers)]
        )

    def forward(self, inputs, input_sizes):
        """
        :param inputs: batch of input features accepted by ``CNN``
        :param input_sizes: 1-D integer tensor with the valid length of every sample
        :return: ``(context, mask_src)``: context is batch_size x len_src x d_model and
                 mask_src is batch_size x 1 x len_src with truthy entries on padded frames
        """
        context, context_size = self.cnn(inputs, input_sizes)  # batch_size x len_src x d_model

        # Size the mask from the actual context so it always lines up with the
        # attention scores, even if no sample in the batch spans the full width.
        mask_src = self.get_mask(context_size, max_len=context.size(1)).unsqueeze(1)
        mask_src = mask_src.to(context.device)
        self.mask = mask_src

        context = self.pos_emb(context)

        for layer in self.layers:
            context = layer(context, context, context, mask_src)  # batch_size x len_src x d_model
        return context, mask_src

    @staticmethod
    def get_mask(lengths, max_len=None):
        """Build a padding mask: entry ``[b, t]`` is truthy when ``t >= lengths[b]``.

        :param lengths: 1-D integer tensor of valid lengths
        :param max_len: mask width; defaults to ``lengths.max()``
        :return: batch x max_len mask
        """
        if max_len is None:
            max_len = int(lengths.max())
        positions = torch.arange(0, max_len).type_as(lengths).unsqueeze(0)
        # ">=" rather than ">": index == length is already the first padded frame.
        return positions.ge(lengths.unsqueeze(1))

In [12]:
# Smoke test: build an encoder on CPU and push the probe batch through it.
encode_cpu = Encoder(num_heads=8, d_model=256, dropout=0.1, d_ff=1024, num_layers=6, 
                     padding_idx=0)
                     
context_cpu, mask_src_cpu = encode_cpu(inputs_cpu, input_sizes_cpu)

# Encoded context and the source padding mask returned alongside it.
context_cpu.shape, mask_src_cpu.shape

(torch.Size([20, 39, 256]), torch.Size([20, 1, 39]))

In [13]:
class Decoder(nn.Module):
    """Autoregressive decoder: token embedding, positional encoding, decoder layers and a tied output layer.

    :param vocab_size: number of target tokens
    :param num_heads: attention heads per layer
    :param d_model: embedding / feature size
    :param dropout: dropout probability
    :param d_ff: hidden size of the feed-forward blocks
    :param num_layers: number of ``DecoderLayer`` instances
    :param padding_idx: token index treated as padding (embedding and self-attention mask)
    :param len_max: longest target sequence supported; defaults to ``d_model``, which is
                    the size that used to be hard-wired for the positional table and the causal mask
    """

    def __init__(self, vocab_size, num_heads, d_model, dropout, d_ff, num_layers=6, padding_idx=0, len_max=None):
        super(Decoder, self).__init__()
        self.vocab_size = vocab_size
        self.num_heads = num_heads
        self.d_model = d_model
        self.padding_idx = padding_idx
        self.num_layers = num_layers
        self.d_ff = d_ff
        self.dropout = dropout
        self.len_max = d_model if len_max is None else len_max

        self.embedding = nn.Embedding(self.vocab_size, self.d_model, padding_idx=self.padding_idx)

        self.pos_emb = PositionalEncoding(self.d_model, self.dropout, len_max=self.len_max)

        self.layers = nn.ModuleList(
            [DecoderLayer(self.num_heads, self.d_model, self.dropout, self.d_ff) for _ in range(self.num_layers)]
        )

        self.fc = nn.Linear(self.d_model, self.vocab_size, bias=True)

        # tie weight between word embedding and generator
        self.fc.weight = self.embedding.weight

        self.logsoftmax = nn.LogSoftmax(dim=1)

        # pre-save a mask to avoid future information in self-attentions in decoder
        # save as a buffer, otherwise will need to recreate it and move to GPU during every call
        # (strictly upper-triangular ones, uint8 as before so existing checkpoints still match)
        mask = torch.triu(torch.ones(self.len_max, self.len_max, dtype=torch.uint8), diagonal=1)
        self.register_buffer('mask', mask)

    def forward(self, tgt, context, mask_src):
        """
        :param tgt: batch_size x len_tgt tensor of token indices
        :param context: encoder output, batch_size x len_src x d_model
        :param mask_src: source padding mask passed through to the context attention
        :return: ``(log_probs, coverage)``: log_probs is (batch_size * len_tgt) x vocab_size,
                 coverage is the context attention of the last layer
        :raises RuntimeError: if ``len_tgt`` exceeds ``len_max``
        """
        len_tgt = tgt.size(1)
        if len_tgt > self.len_max:
            # The pre-built causal mask cannot cover a longer target.
            raise RuntimeError(
                "target length {} exceeds the decoder limit len_max={}".format(len_tgt, self.len_max))

        out = self.embedding(tgt)  # batch_size x len_tgt x d_model

        out = self.pos_emb(out)

        # A position is blocked when the key is padding or lies in the future.
        pad_mask = tgt.eq(self.padding_idx).unsqueeze(1)  # batch_size x 1 x len_tgt
        mask_tgt = torch.gt(pad_mask + self.mask[:len_tgt, :len_tgt], 0)  # batch_size x len_tgt x len_tgt
        for layer in self.layers:
            out, coverage = layer(out, out, out, context, mask_tgt, mask_src)  # batch_size x len_tgt x d_model

        out = self.fc(out)  # batch_size x len_tgt x bpe_size

        out = self.logsoftmax(out.view(-1, self.vocab_size))
        return out, coverage

In [14]:
class Transformer(nn.Module):
    """Encoder/decoder speech recogniser with teacher-forced, greedy and beam-search decoding.

    :param tgt_vocab: vocabulary object; ``len()`` gives its size and calling it with
                      ``'<SOS>'`` / ``'<EOS>'`` returns the corresponding token index
    :param num_heads: attention heads per layer
    :param d_model: model feature size
    :param dropout: dropout probability
    :param d_ff: hidden size of the feed-forward blocks
    :param num_layers: layers in both the encoder and the decoder
    :param padding_idx: padding token index handed to encoder and decoder
    """

    def __init__(self, tgt_vocab, num_heads, d_model, dropout, d_ff, num_layers=6, padding_idx=1):
        super(Transformer, self).__init__()
        self.tgt_vocab = tgt_vocab
        self.tgt_vocab_size = len(tgt_vocab)
        self.num_heads = num_heads
        self.d_model = d_model
        self.d_ff = d_ff
        self.num_layers = num_layers
        self.dropout = dropout
        self.padding_idx = padding_idx

        self.encode = Encoder(self.num_heads, self.d_model, self.dropout, self.d_ff,
                              self.num_layers, self.padding_idx)
        self.decode = Decoder(self.tgt_vocab_size, self.num_heads, self.d_model, self.dropout, self.d_ff,
                              self.num_layers, self.padding_idx)

    def forward(self, inputs, input_sizes, tgt):
        """Teacher-forced pass.

        :param inputs: input feature batch
        :param input_sizes: valid length of every input sample
        :param tgt: batch x len_tgt token indices fed to the decoder
        :return: ``(probas, proba_sizes)``: probas is batch x len_tgt x vocab decoder output,
                 proba_sizes counts the steps up to and including the first position whose
                 arg-max is ``<EOS>`` (the last position counts as the end when there is none)
        """
        context, mask_src = self.encode(inputs, input_sizes)

        outputs, _ = self.decode(tgt, context, mask_src)

        probas = outputs.view(inputs.size(0), -1, self.tgt_vocab_size)
        _, max_indices = torch.max(probas, 2)
        is_eos = max_indices.eq(self.tgt_vocab('<EOS>'))
        is_eos[:, -1] = 1  # guarantee an end marker in every row
        # Index of the first end marker (+1 to turn it into a length).
        proba_sizes = torch.max(is_eos.long(), dim=1)[1] + 1

        return probas, proba_sizes

    def decode_greedy(self, inputs, input_sizes, labels=None, max_seq_length=50):
        """Greedy autoregressive decoding starting from ``<SOS>``.

        When ``labels`` is given exactly ``labels.size(1)`` steps are produced;
        otherwise decoding stops once every sequence has emitted ``<EOS>`` or after
        ``max_seq_length + 1`` steps.

        :return: ``(dec_outputs, dec_output_sizes)``: dec_outputs is batch x steps x vocab;
                 a size is the step index at which ``<EOS>`` was predicted, or
                 ``max_seq_length`` if it never was, capped at the number of produced steps
        """
        device = inputs.device

        idx_sos, idx_eos = self.tgt_vocab('<SOS>'), self.tgt_vocab('<EOS>')

        context, mask_src = self.encode(inputs, input_sizes)

        batch_size = inputs.size(0)
        decode_input = torch.full((batch_size, 1), idx_sos, dtype=torch.long, device=device)
        dec_output_sizes = torch.full((batch_size,), max_seq_length, dtype=torch.long, device=device)

        max_steps = labels.size(1) if labels is not None else max_seq_length + 1

        dec_outputs = []
        for step in range(max_steps):
            # The whole prefix is re-decoded at every step; only the newest position is kept.
            outputs, _ = self.decode(decode_input, context, mask_src)
            outputs = outputs.view(batch_size, -1, self.tgt_vocab_size)

            dec_outputs.append(outputs[:, step, :].unsqueeze(1))

            preds = torch.max(outputs[:, -1, :], dim=1)[1]

            # Record the step of the first <EOS> per sequence.
            newly_finished = preds.eq(idx_eos) & dec_output_sizes.gt(step)
            dec_output_sizes[newly_finished] = step
            if labels is None and dec_output_sizes.le(step + 1).all():
                break

            decode_input = torch.cat([decode_input, preds.unsqueeze(1)], dim=1)

        dec_outputs = torch.cat(dec_outputs, dim=1)

        # A sequence that never finished keeps the max_seq_length placeholder, which can be
        # larger than the number of steps actually produced; never report more than exists.
        dec_output_sizes = dec_output_sizes.clamp(max=dec_outputs.size(1))

        return dec_outputs, dec_output_sizes

    def decode_beam(self, inputs, labels=None, max_seq_length=50, beam_size=64, alpha=0.1, beta=0.3,
                    input_sizes=None):
        """Beam-search decoding, one utterance at a time.

        :param input_sizes: valid length of every input sample; when omitted every sample is
                            assumed to span the full last axis of ``inputs``
        :return: list with one decoded token list per sample
        """
        if input_sizes is None:
            # The encoder requires the lengths; fall back to "no padding".
            input_sizes = torch.full((inputs.size(0),), inputs.size(-1), dtype=torch.long)

        context, mask_src = self.encode(inputs, input_sizes)

        max_seq_len = labels.size(1) if labels is not None else max_seq_length

        dec_outputs = []
        for idx in range(context.size(0)):
            target, _ = beam_search(self, self.tgt_vocab, context[idx].unsqueeze(0), mask_src[idx].unsqueeze(0),
                                    beam_size=beam_size, alpha=alpha, beta=beta, max_seq_len=max_seq_len)
            dec_outputs.append(target)

        return dec_outputs


def beam_search(model, vocab, context, mask_src, beam_size=64, alpha=0.1, beta=0.3, max_seq_len=64):
    """Beam search for a single utterance.

    :param model: object whose ``decode(tokens, context, mask_src)`` returns
                  ``(log_probs, coverage)`` with log_probs of shape (beams * length) x vocab
                  and coverage of shape beams x length x len_src
    :param vocab: vocabulary; ``len()`` gives its size, ``vocab('<SOS>')`` / ``vocab('<EOS>')``
                  the special token indices. Index 0 is treated as padding.
    :param context: encoder output for one utterance, 1 x len_src x d_model
    :param mask_src: source padding mask for that utterance (broadcast over the beams)
    :param beam_size: maximum number of hypotheses; capped at the vocabulary size because the
                      first expansion cannot yield more distinct hypotheses than that
    :param alpha: exponent of the length penalty ``len(tokens) ** alpha``
    :param beta: weight of the coverage penalty (computed but currently not part of the score)
    :param max_seq_len: number of expansion steps after the first one
    :return: ``(tokens, coverage)`` of the best hypothesis; tokens include the leading ``<SOS>``
    :raises ValueError: if ``beam_size`` is smaller than 1
    """
    if beam_size < 1:
        raise ValueError("beam_size must be >= 1, got {}".format(beam_size))

    vocab_size = len(vocab)
    idx_sos, idx_eos, idx_pad = vocab('<SOS>'), vocab('<EOS>'), 0
    device = context.device

    done_scores = []          # cumulative log-probability of every finished hypothesis
    done_tokens = []          # its token indices
    done_coverage = []        # its context attention
    coverage_penalties = []   # kept for the alternative scoring formula below

    # Start from a single <SOS> hypothesis with score 0; the first pass through the loop
    # fans it out to the full beam.
    remaining_beams = min(beam_size, vocab_size)
    decode_inputs = torch.full((1, 1), idx_sos, dtype=torch.long, device=device)
    scores = torch.zeros(1, device=device)
    coverage = None

    for _ in range(max(max_seq_len, 0) + 1):
        decode_outputs, coverage = model.decode(decode_inputs, context, mask_src)

        # Only the distribution after the newest token of every beam matters.
        decode_outputs = decode_outputs.view(decode_inputs.size(0), -1, vocab_size)[:, -1, :]
        candidates = (scores.unsqueeze(1) + decode_outputs).view(-1)
        scores, scores_idx = candidates.topk(remaining_beams)

        # Flat index -> (parent beam, token). Explicit floor division: "/" would give floats.
        beam_idx = torch.div(scores_idx, vocab_size, rounding_mode='floor')
        pred_idx = scores_idx - beam_idx * vocab_size

        # Reorder everything that is per-beam so row i always describes hypothesis i.
        decode_inputs = torch.cat((decode_inputs[beam_idx], pred_idx.unsqueeze(1)), 1)
        context = context[beam_idx]
        coverage = coverage[beam_idx]

        last_token = decode_inputs[:, -1]
        finished = last_token.eq(idx_eos) | last_token.eq(idx_pad)

        for idx in finished.nonzero().flatten().tolist():
            done_scores.append(scores[idx].item())
            done_tokens.append(decode_inputs[idx, :].tolist())
            done_coverage.append(coverage[idx, :, :])

            atten_prob = torch.sum(coverage[idx, :, :], dim=0)
            coverage_penalty = torch.log(atten_prob.masked_select(atten_prob.le(1)))
            coverage_penalties.append(beta * torch.sum(coverage_penalty).item())

            remaining_beams -= 1

        if remaining_beams <= 0:
            break

        # Carry only the unfinished hypotheses into the next step.
        continue_idx = (~finished).nonzero().flatten()
        scores = scores[continue_idx]
        decode_inputs = decode_inputs[continue_idx]
        context = context[continue_idx]
        coverage = coverage[continue_idx]

    if not done_tokens:
        # Step budget exhausted before any hypothesis ended: rank the live beams instead
        # of failing on an empty candidate list.
        done_scores = scores.tolist()
        done_tokens = decode_inputs.tolist()
        done_coverage = list(coverage)

    len_penalties = [math.pow(len(tokens), alpha) for tokens in done_tokens]
    #     final_scores = [done_scores[i] / len_penalties[i] + coverage_penalties[i] for i in range(len(done_tokens))]
    final_scores = [score / penalty for score, penalty in zip(done_scores, len_penalties)]

    # First hypothesis with the highest final score.
    best_beam = max(range(len(final_scores)), key=final_scores.__getitem__)

    return done_tokens[best_beam], done_coverage[best_beam]

In [15]:
# Smoke test of the full model on the CPU probe batch.
model_cpu = Transformer(vocab, num_heads=8, d_model=256, 
                        dropout=0.1, d_ff=1024, num_layers=6, padding_idx=0)

# Teacher-forced pass with the labels as decoder input.
outputs_cpu, output_sizes_cpu = model_cpu(inputs_cpu, input_sizes_cpu, labels_cpu)

# Replace the decoder embedding with a frozen identity-like matrix (ones on the main
# diagonal of a len(vocab) x 256 matrix). This happens after the forward pass above,
# so only the greedy decode below sees it.
fix_embedding = torch.from_numpy(np.eye(len(vocab), 256).astype(np.float32))
model_cpu.decode.embedding.weight = nn.Parameter(fix_embedding)
model_cpu.decode.embedding.weight.requires_grad = False

print(outputs_cpu.shape, output_sizes_cpu.shape)

# Greedy decoding; passing the labels fixes the number of steps to labels_cpu.size(1).
outputs_cpu, output_sizes_cpu = model_cpu.decode_greedy(inputs_cpu, input_sizes_cpu, labels_cpu)

# print(outputs_cpu.shape, output_sizes_cpu.shape)

# outputs_cpu, output_sizes_cpu = model_cpu.decode_beam(inputs_cpu, labels_cpu)

print(outputs_cpu.shape, output_sizes_cpu.shape)

torch.Size([20, 7, 32]) torch.Size([20])
torch.Size([20, 7, 32]) torch.Size([20])


In [19]:
# Training setup: metrics, model, loss, optimizer, LR schedule, early stopping,
# logging and checkpointing. Helper classes come from the star imports at the top.

# Tracked metrics given as (name, initial value) pairs.
m = Metric([('train_loss', np.inf), ('train_score', np.inf), ('valid_loss', np.inf), ('valid_score', 0),
            ('train_lr', 0), ('valid_cer', np.inf)])

model = Transformer(vocab, num_heads=8, d_model=256, 
                        dropout=0.1, d_ff=1024, num_layers=6, padding_idx=0)

# Re-initialise every parameter with more than one dimension (weight matrices / kernels).
for p in model.parameters():
    if p.dim() > 1:
        torch_weight_init(p)
        
# Frozen decoder embedding: ones on the main diagonal of a len(vocab) x 256 matrix.
# NOTE(review): Decoder.__init__ ties fc.weight to embedding.weight; assigning a new
# Parameter here replaces only the embedding's, so the output layer keeps the previous
# (trainable) tensor and the two are no longer shared — confirm this is intended.
fix_embedding = torch.from_numpy(np.eye(len(vocab), 256).astype(np.float32))
model.decode.embedding.weight = nn.Parameter(fix_embedding)
model.decode.embedding.weight.requires_grad = False

if H.USE_CUDA:
    model.cuda()

# Optionally start from previously saved weights (disabled while PRELOAD_MODEL_PATH is None).
if H.PRELOAD_MODEL_PATH:
    path = os.path.join(H.EXPERIMENT, H.PRELOAD_MODEL_PATH)
    state = torch.load(path)
    model.load_state_dict(state)
    print("Preloaded model: {}".format(path))

# torch_weight_init(model.dec) 
    
criterion = LabelSmoothingLoss(padding_idx=0, label_smoothing=H.LABEL_SMOOTHING)

sts_decoder = STSDecoder(vocab)

scorer = Scorer()

# Only parameters that still require gradients are optimised (the frozen embedding is skipped).
optimizer = optim.Adam(list(filter(lambda p: p.requires_grad, model.parameters())),
                       amsgrad=False,
                       betas=(0.9, 0.999),
                       eps=1e-08,
                       lr=H.LR,
                       weight_decay=H.WEIGHT_DECAY)

# optimizer = optim.SGD(list(filter(lambda p: p.requires_grad, model.parameters())),
#                       lr=H.LR, weight_decay=H.WEIGHT_DECAY, momentum=H.MOMENTUM, nesterov=H.NESTEROV)

stopping = Stopping(model, patience=H.STOPPING_PATIENCE)

# Per-epoch multiplicative LR factor taken from H.LR_LAMBDA.
scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=[H.LR_LAMBDA])

# scheduler = NoamLRScheduler(optimizer, H.RNN_NUM_LAYERS, 10, 0.003)

tlogger = TensorboardLogger(root_dir=H.EXPERIMENT, experiment_dir=H.TIMESTAMP)  # PytorchLogger()

checkpoint = Checkpoint(model, optimizer, stopping, m,
                        root_dir=H.EXPERIMENT, experiment_dir=H.TIMESTAMP, restore_from=-1,
                        interval=H.CHECKPOINT_INTERVAL, verbose=0)

# trainer = Trainer(model, train_loader, optimizer, scheduler, criterion, sts_decoder, scorer, H.MAX_GRAD_NORM)

# evaluator = Evaluator(model, valid_loader, criterion, sts_decoder, scorer)

# evaluator = Evaluator(model, valid_loader, criterion, sts_decoder, scorer)

In [23]:
# Load the weights written at the end of a previous training run
# (<EXPERIMENT>/<MODEL_NAME>.tar) into the freshly built model.
state = torch.load(os.path.join(H.EXPERIMENT, H.MODEL_NAME + '.tar'))
model.load_state_dict(state)


In [35]:
epoch_start = 1
if H.CHECKPOINT_RESTORE:
    epoch_start = checkpoint.restore() + 1
#     train_loader.batch_sampler.shuffle(epoch_start)

epoch = epoch_start
try:
    epoch_itr = tlogger.set_itr(range(epoch_start, H.MAX_EPOCHS + 1))

    for epoch in epoch_itr:
        
#         with DelayedKeyboardInterrupt():

        model.train(True)

        scheduler.step()
    
        train_lr = [float(param_group['lr']) for param_group in optimizer.param_groups][0]

        total_size, total_loss, total_score = 0, 0.0, 0.0
        for inputs, labels, input_sizes, label_sizes, _ in train_loader:
            if next(model.parameters()).is_cuda:
                inputs, labels = inputs.cuda(), labels.cuda()

            probas, proba_sizes = model(inputs, input_sizes, labels)

            loss = criterion(probas, proba_sizes, labels.contiguous(), label_sizes)
            total_loss += loss.item()      
            
            preds_seq, label_seq = sts_decoder(probas, proba_sizes, labels.contiguous(), label_sizes)
            total_score += scorer(preds_seq, label_seq)
            total_size += inputs.size(0)
            1/0
            optimizer.zero_grad()
            loss.backward()
            
            if H.MAX_GRAD_NORM is not None:
                torch.nn.utils.clip_grad_norm_(model.parameters(), H.MAX_GRAD_NORM)
            optimizer.step()

            del probas
            del loss
            
        m.train_loss = total_loss / total_size
        m.train_score = 1.0 - min(1.0, total_score / total_size)
        m.train_lr = train_lr
    
        #-----------------------------------------------------------
        
#         model.eval()
        
#         with torch.no_grad():

#             hypotheses = []
#             references = []
#             total_size, total_loss, total_score = 0, 0.0, 0.0
#             for inputs, labels, input_sizes, label_sizes, _ in valid_loader:
#                 if next(model.parameters()).is_cuda:
#                     inputs, labels = inputs.cuda(), labels.cuda()

#                 probas, proba_sizes = model.decode_greedy(inputs, input_sizes, labels[:,1:], train_loader.dataset.max_seq_length)
                
#                 loss = criterion(probas, proba_sizes, labels[:,1:].contiguous(), label_sizes-1)
#                 total_loss += loss.item()      

#                 preds_seq, label_seq = sts_decoder(probas, proba_sizes, labels[:,1:].contiguous(), label_sizes-1)
#                 total_score += scorer(preds_seq, label_seq)

#                 total_size += inputs.size(0)
                
#             del probas
#             del loss

#         m.valid_loss = total_loss / total_size
#         m.valid_score = 1.0 - min(1.0, total_score / total_size)

        if checkpoint:
            checkpoint.step(epoch)

        stopping_flag = stopping.step(epoch, m.valid_loss, m.valid_score)

        epoch_itr.log_values(m.train_loss, m.train_score, m.train_lr, m.valid_loss, m.valid_score,
                             stopping.best_score_epoch, stopping.best_score)

        if stopping_flag:
            logger.info(
                "Early stopping at epoch: %d, score %f" % (stopping.best_score_epoch, stopping.best_score))
            break

#             train_loader.batch_sampler.shuffle(epoch)

except KeyboardInterrupt:
    logger.info("Training interrupted at: {}".format(epoch))
    pass

# Finalise the run: write a last checkpoint, restore the best-scoring weights
# tracked by `stopping`, save them under H.EXPERIMENT and log the run summary.

# The epoch loop treats `checkpoint` as optional (`if checkpoint:`), so apply
# the same guard here instead of failing on a falsy/None checkpoint object.
if checkpoint:
    checkpoint.create(epoch)

# Persist the weights from the best-scoring epoch rather than the last one.
model.load_state_dict(stopping.best_score_state)
torch.save(model.state_dict(), os.path.join(H.EXPERIMENT, H.MODEL_NAME + '.tar'))

logger.info(repr(tlogger))
logger.info(repr(stopping))
logger.info(repr(checkpoint))

logger.info("Training end.")

ZeroDivisionError: division by zero

In [36]:
# Debug inspection: maximum (values and indices) along dim 2 of `probas`.
# `probas` is not defined in this cell; it must already exist in the kernel
# namespace from an earlier cell.
probas.max(2)

(tensor([[-0.1286, -0.0753, -0.6658, -0.1011, -0.4612, -0.1878, -0.0884],
         [-0.9921, -1.0244, -0.3576, -0.0325, -0.9927, -1.2661, -0.2546],
         [-0.3076, -0.8505, -0.7674, -0.9873, -0.1992, -1.0855, -0.1261],
         [-0.8583, -0.2498, -0.0986, -0.0377, -0.5966, -0.0851, -0.4258],
         [-0.3866, -0.7174, -0.3374, -0.9027, -0.9997, -1.2534, -1.2911],
         [-0.1329, -0.0479, -0.4300, -0.5842, -0.7648, -0.0212, -0.5437],
         [-1.0931, -0.1382, -1.6085, -0.4251, -0.8895, -0.1107, -0.0232],
         [-0.0431, -0.0330, -0.6706, -0.1687, -0.2601, -0.3927, -0.9095],
         [-0.0317, -0.6563, -0.8508, -0.2675, -0.6752, -0.8011, -0.4727],
         [-0.1601, -0.0345, -0.1775, -0.3584, -0.0904, -0.7756, -1.3098],
         [-0.0606, -0.0485, -0.1103, -0.6938, -0.5940, -0.5738, -1.3479],
         [-0.6818, -1.0915, -0.2128, -1.4559, -0.3404, -0.4271, -0.0306],
         [-0.5821, -0.0062, -0.1605, -0.2786, -0.3102, -0.3347, -0.3245],
         [-0.3602, -0.0861, -0.2116, -

In [37]:
# Debug lookup: call the vocabulary object with 28 (presumably a token id ->
# symbol mapping; confirm against the vocab class). The recorded output is a
# single space character.
vocab(28)

' '

In [None]:
# Intentional failure: raises ZeroDivisionError so execution stops at this cell.
# NOTE(review): presumably a guard to keep "Run All" from continuing into the
# evaluation cells below - confirm, and consider an explicit, self-describing stop.
1/0

In [29]:
# Audio transforms applied to every test recording, in this order:
# AudioNormalizeDB -> AudioSpectrogram -> AudioNormalize -> FromNumpyToTensor.
# No augmentation transform is part of this pipeline.
audio_transform = transforms.Compose([
    AudioNormalizeDB(db=H.NORMALIZE_DB, max_gain_db=H.NORMALIZE_MAX_GAIN),
    AudioSpectrogram(
        sample_rate=H.AUDIO_SAMPLE_RATE,
        window_size=H.SPECT_WINDOW_SIZE,
        window_stride=H.SPECT_WINDOW_STRIDE,
        window=H.SPECT_WINDOW,
    ),
    AudioNormalize(),
    FromNumpyToTensor(tensor_type=torch.FloatTensor),
])

# Transcript transforms: encode with the shared vocabulary, then convert to a LongTensor.
label_transform = transforms.Compose([
    TranscriptEncodeSTS(vocab),
    FromNumpyToTensor(tensor_type=torch.LongTensor),
])

# "test" split from the manifests listed in H.MANIFESTS, rooted at
# <ROOT_DIR>/<EXPERIMENT>, with no cap on the number of samples.
test_dataset = AudioDataset(
    os.path.join(H.ROOT_DIR, H.EXPERIMENT),
    manifests_files=H.MANIFESTS,
    datasets="test",
    transform=audio_transform,
    label_transform=label_transform,
    max_data_size=None,
    sorted_by='recording_duration',
)

# shuffle=False keeps the dataset's own ordering when batching.
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=H.BATCH_SIZE,
    num_workers=H.NUM_WORKERS,
    shuffle=False,
    collate_fn=collate_fn,
    pin_memory=True,
)

logger.info(test_loader.dataset)

In [30]:
# Build a fresh Transformer and restore the weights stored at
# <EXPERIMENT>/<MODEL_NAME>.tar (the path the training cell saves to).
# NOTE(review): the architecture arguments below must match the ones used when
# the checkpoint was produced - confirm against the training cell.
model_pre = Transformer(vocab, num_heads=8, d_model=256,
                        dropout=0.1, d_ff=1024, num_layers=6, padding_idx=0)
if H.USE_CUDA:
    model_pre.cuda()

# map_location='cpu' lets a checkpoint saved from a GPU run load on a CPU-only
# host; load_state_dict then copies the tensors onto whatever device the model
# parameters already live on, so the GPU path is unaffected.
state = torch.load(os.path.join(H.EXPERIMENT, H.MODEL_NAME + '.tar'), map_location='cpu')
model_pre.load_state_dict(state)

# Decoder that turns model outputs and encoded labels into sequences for scoring.
sts_decoder = STSDecoder(vocab)

In [33]:
%%time 

model_pre.eval()
with torch.no_grad():

    hypotheses = []
    references = []
    for inputs, labels, input_sizes, label_sizes, _ in test_loader:
        if next(model_pre.parameters()).is_cuda:
            inputs, labels = inputs.cuda(), labels.cuda()        
 
        probas, proba_sizes = model_pre.decode_greedy( inputs, input_sizes.cuda(), labels[:,1:], test_loader.dataset.max_seq_length)
        
        preds_seq, label_seq = sts_decoder(probas, proba_sizes, labels.contiguous(), label_sizes)

        hypotheses.extend(preds_seq)
        references.extend(label_seq)
        

CPU times: user 4.28 s, sys: 154 ms, total: 4.44 s
Wall time: 4.46 s


In [34]:
# Display every decoded hypothesis collected by the decoding loop.
# NOTE(review): in the recorded output every entry is an empty string, i.e. the
# decode produced no text - check the loaded weights / decoding path before
# trusting any metric computed from these.
hypotheses

['',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '']

In [None]:
from lib.scorer import Scorer

# Score the decoded hypotheses against the reference transcripts.
bleu = Scorer.get_moses_multi_bleu(hypotheses, references, lowercase=False)
wer, cer = Scorer.get_wer_cer(hypotheses, references)
acc = Scorer.get_acc(hypotheses, references)

# WER, CER and ACC are multiplied by 100 for display; BLEU is printed as returned.
summary_template = ('Test Summary \n'
                    'Bleu: {bleu:.3f}\n'
                    'WER:  {wer:.3f}\n'
                    'CER:  {cer:.3f}\n'
                    'ACC:  {acc:.3f}')
print(summary_template.format(bleu=bleu, wer=wer * 100, cer=cer * 100, acc=acc * 100))

In [None]:
import matplotlib.mlab as mlab

# Histogram of the lengths of the first element of every `bad_lines` entry.
# `bad_lines` and `plt` are not defined in this cell; they must already exist
# in the kernel namespace.
line_len = sorted([len(l[0]) for l in bad_lines], reverse=True)

# The list is sorted in descending order, so its first item is the maximum.
# Falling back to 0 for an empty list avoids the ValueError that max([]) raises.
longest = line_len[0] if line_len else 0

if longest:
    fig = plt.figure(figsize=(15, 6))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_xticks(range(1, longest + 1))

    # A sequence passed as `bins` is read as bin edges. Edges 1..longest+1 give
    # one unit-wide bin per length and keep the longest lines in the plot
    # (edges stopping at `longest - 1` silently dropped them and produced an
    # empty edge list when the longest length was 1). Zero-length entries stay
    # outside the first edge, as before.
    ax.hist(line_len, bins=range(1, longest + 2))

    plt.xlabel('Length')
    plt.ylabel('Frequency')
    plt.title("Bad Line Length Distribution")
    plt.xticks(rotation=90)
    plt.grid(True);