In [0]:
# for uploading big files to google drive

from googleapiclient.http import MediaFileUpload
from googleapiclient.discovery import build
from google.colab import auth

# Authenticate this Colab session, then build the Google Drive v3 API client.
# `drive_service` is the module-level client that save_file_to_drive() uses.
auth.authenticate_user()
drive_service = build('drive', 'v3')

def save_file_to_drive(name, path):
  """
  Upload a local file to Google Drive as a generic binary blob.
  :param name: File name to give the uploaded object on Drive.
  :param path: Local path of the file to upload.
  :return: The response of the Drive ``files().create`` request, restricted
           to the ``id`` field.
  """
  mime = 'application/octet-stream'
  # resumable=True is requested because this helper is meant for big files.
  upload = MediaFileUpload(path, mimetype=mime, resumable=True)
  request = drive_service.files().create(
      body={'name': name, 'mimeType': mime},
      media_body=upload,
      fields='id',
  )
  return request.execute()

In [0]:
# for downloading files from google drive
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# RNN-T loss (pinned to the release this notebook was last run against)
!pip install warp-rnnt==0.1.0

Collecting warp-rnnt
  Downloading https://files.pythonhosted.org/packages/86/6d/1389db3abbeaaed0279516878275b7c8f536c70323376bfd46a6e95e4040/warp_rnnt-0.1.0.tar.gz
Collecting pybind11
[?25l  Downloading https://files.pythonhosted.org/packages/4b/4d/ae1c4d8e8b139afa9682054dd42df3b0e3b5c1731287933021b9fd7e9cc4/pybind11-2.4.3-py2.py3-none-any.whl (150kB)
[K     |████████████████████████████████| 153kB 7.0MB/s 
Building wheels for collected packages: warp-rnnt
  Building wheel for warp-rnnt (setup.py) ... [?25l[?25hdone
  Created wheel for warp-rnnt: filename=warp_rnnt-0.1.0-cp36-cp36m-linux_x86_64.whl size=1403375 sha256=d9d4e34c25cf2bad55f6404b4ace8ad7a872c6287a877c6f2323f2cedaa74058
  Stored in directory: /root/.cache/pip/wheels/b4/4d/6b/004a7f35a7c506bb6f82900efb961d345d87ae9bfa06e72bb9
Successfully built warp-rnnt
Installing collected packages: pybind11, warp-rnnt
Successfully installed pybind11-2.4.3 warp-rnnt-0.1.0


In [0]:
# Pinned to the release this notebook was last run against.
!pip3 install python_speech_features==0.6

Collecting python_speech_features
  Downloading https://files.pythonhosted.org/packages/ff/d1/94c59e20a2631985fbd2124c45177abaa9e0a4eee8ba8a305aa26fc02a8e/python_speech_features-0.6.tar.gz
Building wheels for collected packages: python-speech-features
  Building wheel for python-speech-features (setup.py) ... [?25l[?25hdone
  Created wheel for python-speech-features: filename=python_speech_features-0.6-cp36-none-any.whl size=5889 sha256=8b549a2c0d58b0309080138f72e638fa56377fffd9b88b97e7fc1b2bb9b0a3ec
  Stored in directory: /root/.cache/pip/wheels/3c/42/7c/f60e9d1b40015cd69b213ad90f7c18a9264cd745b9888134be
Successfully built python-speech-features
Installing collected packages: python-speech-features
Successfully installed python-speech-features-0.6


In [0]:
!mkdir -p data/datasets

In [0]:
# validation descriptions
!cp -Rv "/content/drive/My Drive/val.txt" ./

In [0]:
# validation mfcc features
!cp -Rv "/content/drive/My Drive/val.tar.gz" ./

In [0]:
# train mfcc features
!cp -Rv "/content/drive/My Drive/train_part_8a.tar.gz" ./

In [0]:
!tar -xf  val.tar.gz -C /

In [0]:
!tar -xf  /content/train_part_8a.tar.gz -C ./

In [0]:
# !cp -R '/content/drive/My Drive/check/asr' /content/

In [0]:
# %cd /content/asr/

In [0]:
# train annotations in different sizes

!cp -Rv "/content/drive/My Drive/train.txt" /content/

In [0]:
!sed -n '1,100000p' train.txt > train_100000.txt

In [0]:
# Absolute paths: the subset is written next to /content/train.txt whatever the current directory is.
!sed -n '1,100000p' /content/train.txt > /content/train_100000.txt

In [0]:
# Absolute paths: the subset is written next to /content/train.txt whatever the current directory is.
!sed -n '1,200000p' /content/train.txt > /content/train_200000.txt

In [0]:
# Absolute paths: the subset is written next to /content/train.txt whatever the current directory is.
!sed -n '1,400000p' /content/train.txt > /content/train_400000.txt

In [0]:
# Absolute paths: the subset is written next to /content/train.txt whatever the current directory is.
!sed -n '1,50000p' /content/train.txt > /content/train_50000.txt

In [0]:
# loading model from previous iteration

!cp -Rv "/content/drive/My Drive/model_lm2_4e5_samp_0_ep" ../

'/content/drive/My Drive/model_lm2_4e5_samp_0_ep' -> '../model_lm2_4e5_samp_0_ep'


In [0]:
import torch
import torch.nn as nn
from torch.nn.functional import log_softmax
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


def decrease_dim(x, layer, dim=1):
    """
    Apply the Conv2d output-size formula to a size along one spatial axis.
    :param x: Input size; anything supporting integer arithmetic (an int or a
              tensor of lengths).
    :param layer: Candidate layer. Only exact ``nn.Conv2d`` instances change
                  the size; every other layer type leaves it untouched.
    :param dim: Which of the layer's two spatial axes to use (0 or 1).
    :return: The size after the layer.
    """
    if type(layer) == nn.modules.conv.Conv2d:
        pad, dilation = layer.padding[dim], layer.dilation[dim]
        kernel, stride = layer.kernel_size[dim], layer.stride[dim]
        # The effective kernel span is dilation * (kernel - 1) + 1.
        return (x + 2 * pad - dilation * (kernel - 1) - 1) // stride + 1
    return x


def is_time_decrease(layer):
    """
    Tell whether a layer changes the length of axis 1 (the axis ``decrease_dim``
    uses by default), judged by pushing a probe length of 100 through it.
    :param layer: Layer to inspect.
    :return: True when the probe length comes back different from 100.
    """
    probe = 100
    return decrease_dim(probe, layer) != probe


class BatchNorm1d(nn.BatchNorm1d):
    """Batch normalisation over the last axis of an input with any number of leading axes."""

    def forward(self, x):
        """
        :param x: Tensor whose last axis holds ``num_features`` entries.
        :return: Normalised tensor shaped as the leading axes of ``x`` followed
                 by ``num_features``.
        """
        leading = list(x.size())[:-1]
        # Collapse every leading axis into one batch axis so the stock 2-D path runs.
        flat = x.view(-1, self.num_features)
        normed = super().forward(flat)
        return normed.view(leading + [self.num_features])


class MaskConv(nn.Module):
    """
    Wrapper around a conv stack that erases the padding of the output based on
    the given lengths. Input needs to be in the shape of (NxCxDxT).
    """

    def __init__(self, layers):
        """
        :param layers: The sequential module containing the conv stack.
        """
        super().__init__()
        self.layers = layers

    def output_time(self, x):
        """
        :param x: Number of time steps fed to the stack.
        :return: Number of time steps the stack emits.
        """
        for module in self.layers:
            x = decrease_dim(x, module, dim=1)
        return x

    def output_dim(self, dim):
        """
        :param dim: Size of the D axis fed to the stack.
        :return: Flattened feature size per time step: the D size after the
                 stack times the ``out_channels`` of the last Conv2d
                 (0 when the stack has no Conv2d).
        """
        out_channels = 0
        for module in self.layers:
            dim = decrease_dim(dim, module, dim=0)
            if type(module) == nn.modules.conv.Conv2d:
                out_channels = module.out_channels
        return dim * out_channels

    def forward(self, x, lengths):
        """
        :param x: The input of size NxCxDxT
        :param lengths: The actual length of each sequence in the batch
        :return: Masked output reshaped to T x N x (C*D), and the lengths
                 after the stack
        """
        pad_mask = None

        for module in self.layers:
            x = module(x)

            if is_time_decrease(module):
                # The time axis shrank: recompute valid lengths and rebuild the mask.
                lengths = decrease_dim(lengths, module)
                batch, _, _, time = x.size()
                pad_mask = torch.zeros((batch, 1, 1, time), dtype=torch.bool, device=x.device)

                for row, valid in enumerate(lengths):
                    first_pad = valid.item()
                    n_pad = time - first_pad
                    if n_pad > 0:
                        # Flag every frame at or beyond the valid length.
                        pad_mask[row].narrow(2, first_pad, n_pad).fill_(1)

            # Once a mask exists it is re-applied after every following layer.
            if pad_mask is not None:
                x = x.masked_fill(pad_mask, 0)

        n, c, d, t = x.size()
        # Fold channels and features together, then reorder to T x N x H.
        x = x.view(n, c * d, t).permute(2, 0, 1).contiguous()

        return x, lengths


class AcousticModel(nn.Module):
    """
    Acoustic encoder: masked 2-D conv front-end, bidirectional GRU and a linear
    projection, plus a stand-alone classification head (``fc``).
    """

    def __init__(self, input_size, hidden_size, prj_size, output_size, n_layers=1, dropout=0, checkpoint=''):
        """
        :param input_size: Size of the D (feature) axis of the NxCxDxT input.
        :param hidden_size: Hidden size of each GRU direction.
        :param prj_size: Output size of the projection layer.
        :param output_size: Number of classes emitted by ``fc``.
        :param n_layers: Number of stacked GRU layers.
        :param dropout: Dropout probability used in the conv stack, between GRU
                        layers (only when ``n_layers > 1``) and before the projection.
        :param checkpoint: Optional path of a state dict to load; empty to skip.
        """
        super(AcousticModel, self).__init__()
        self.conv = MaskConv(nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=(21, 11), stride=(2, 2), padding=(10, 5), bias=False),
            nn.BatchNorm2d(32), nn.ReLU(inplace=True), nn.Dropout(dropout),
            nn.Conv2d(32, 32, kernel_size=(11, 11), stride=(2, 1), padding=(5, 5), bias=False),
            nn.BatchNorm2d(32), nn.ReLU(inplace=True), nn.Dropout(dropout)
        ))
        # The GRU consumes the flattened (channels * features) conv output.
        input_size = self.conv.output_dim(input_size)
        self.rnn = nn.GRU(input_size, hidden_size, n_layers,
                          dropout=dropout if n_layers > 1 else 0,
                          bidirectional=True)
        self.prj = nn.Sequential(nn.Dropout(dropout),
                                 nn.Linear(hidden_size, prj_size, bias=False))
        self.fc = nn.Sequential(BatchNorm1d(prj_size), nn.ReLU(inplace=True),
                                nn.Linear(prj_size, output_size))
        if checkpoint:
            print(checkpoint)
            self.load_state_dict(torch.load(checkpoint, map_location='cpu'))

    def features(self, x, lengths):
        """
        :param x: Input of size NxCxDxT.
        :param lengths: Tensor with the valid number of time steps per sequence.
                        ``pack_padded_sequence`` is called with its default
                        ``enforce_sorted=True``, so the batch must be sorted by
                        decreasing length.
        :return: Projected features (T x N x prj_size) and the lengths after
                 the conv stack (kept on the device they arrived on).
        """
        # Apply 2d convolutions
        x, lengths = self.conv(x, lengths)
        # Pack padded batch of sequences for RNN module.
        # pack_padded_sequence needs the lengths on the CPU; only the copy handed
        # to it is moved, so callers still get `lengths` on their own device.
        x = pack_padded_sequence(x, lengths.cpu())
        # Forward pass through GRU
        x, _ = self.rnn(x)
        # Unpack padding
        x, _ = pad_packed_sequence(x)
        # Sum bidirectional GRU outputs
        x = x[:, :, :self.rnn.hidden_size] + x[:, :, self.rnn.hidden_size:]
        x = self.prj(x)
        return x, lengths

    def forward(self, x, lengths):
        """
        :param x: Input of size NxCxDxT.
        :param lengths: Tensor with the valid number of time steps per sequence.
        :return: Output of the ``fc`` head (T x N x output_size) and the lengths
                 after the conv stack.
        """
        x, lengths = self.features(x, lengths)
        x = self.fc(x)  # T x N x H
        return x, lengths


class LanguageModel(nn.Module):
    """
    Label-sequence model: embedding, LSTM and a linear projection, plus a
    stand-alone classification head (``fc``).
    """

    def __init__(self, emb_size, hidden_size, prj_size, vocab_size, n_layers=1, dropout=0, blank=0, checkpoint=''):
        """
        :param emb_size: Size of the label embedding.
        :param hidden_size: Hidden size of the LSTM.
        :param prj_size: Output size of the projection layer.
        :param vocab_size: Number of labels (embedding rows and ``fc`` outputs).
        :param n_layers: Number of stacked LSTM layers.
        :param dropout: Dropout probability between LSTM layers (only when
                        ``n_layers > 1``) and before the projection.
        :param blank: Label index used as the embedding's ``padding_idx``.
        :param checkpoint: Optional path of a state dict to load; empty to skip.
        """
        super(LanguageModel, self).__init__()
        # The gradient for blank input is always zero.
        self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=blank)
        self.rnn = nn.LSTM(emb_size, hidden_size, num_layers=n_layers,
                           dropout=dropout if n_layers > 1 else 0)
        self.prj = nn.Sequential(nn.Dropout(dropout),
                                 nn.Linear(hidden_size, prj_size, bias=False))
        self.fc = nn.Sequential(BatchNorm1d(prj_size), nn.ReLU(inplace=True),
                                nn.Linear(prj_size, vocab_size))
        if checkpoint:
            print(checkpoint)
            self.load_state_dict(torch.load(checkpoint, map_location='cpu'))

    def features(self, x, lengths):
        """
        :param x: Label indices with the batch on axis 1 (one column per sequence).
        :param lengths: Tensor with the number of labels per sequence, not
                        counting the start token prepended here.
        :return: Projected features with one extra leading step for the start token.
        """
        # Prepend an all-zero row as start token.
        # NOTE(review): index 0 is hard-coded; it equals `blank` only for the default blank=0.
        init = torch.zeros((1, x.shape[1]), device=x.device).long()
        x = torch.cat([init, x.long()])
        x = self.emb(x)
        # pack_padded_sequence needs the lengths on the CPU; +1 accounts for the start token.
        x = pack_padded_sequence(x, (lengths + 1).cpu(), enforce_sorted=False)
        x, _ = self.rnn(x)
        x, _ = pad_packed_sequence(x)
        x = self.prj(x)
        return x

    def forward(self, x, lengths):
        """
        :param x: Label indices with the batch on axis 1.
        :param lengths: Tensor with the number of labels per sequence.
        :return: Output of the ``fc`` head over the vocabulary.
        """
        x = self.features(x, lengths)
        x = self.fc(x)  # T x N x H
        return x

    def step_features(self, x, h=None):
        """
        Run the embedding, LSTM and projection on already-shaped input, carrying
        the recurrent state explicitly (used for step-by-step decoding).
        :param x: Label indices.
        :param h: LSTM state tuple, or None for a zero state.
        :return: Projected features and the new LSTM state.
        """
        x = self.emb(x)
        x, h = self.rnn(x, h)
        x = self.prj(x)
        return x, h

    def step_forward(self, x, h=None):
        """
        Same as ``step_features`` followed by the ``fc`` head.
        :return: ``fc`` output and the new LSTM state.
        """
        x, h = self.step_features(x, h)
        x = self.fc(x)  # T x N x H
        return x, h

    def step_init(self, batch_size):
        """
        :param batch_size: Number of sequences decoded in parallel.
        :return: Zero ``(h, c)`` LSTM state of shape
                 (num_layers, batch_size, hidden_size) with the dtype and device
                 of the LSTM weights.
        """
        weight = next(self.rnn.parameters())
        return (weight.new_zeros(self.rnn.num_layers, batch_size, self.rnn.hidden_size),
                weight.new_zeros(self.rnn.num_layers, batch_size, self.rnn.hidden_size))


class Transducer(nn.Module):
    """
    Transducer model: the projected features of an acoustic encoder and a label
    decoder are combined by a joint network (tanh of the sum, then a linear
    layer and log-softmax over the vocabulary).
    """

    def __init__(self, emb_size, vocab_size, hidden_size, prj_size, am_layers=3, lm_layers=2, dropout=0, blank=0,
                 am_checkpoint='', lm_checkpoint=''):
        """
        :param emb_size: Size of the decoder's label embedding.
        :param vocab_size: Number of labels predicted by the joint network.
        :param hidden_size: Hidden size of the encoder GRU and decoder LSTM.
        :param prj_size: Size of the projection both sub-models emit.
        :param am_layers: Number of GRU layers in the encoder.
        :param lm_layers: Number of LSTM layers in the decoder.
        :param dropout: Dropout probability forwarded to both sub-models.
        :param blank: Index of the blank label.
        :param am_checkpoint: Optional state-dict path for the encoder.
        :param lm_checkpoint: Optional state-dict path for the decoder.
        """
        super(Transducer, self).__init__()

        self.blank = blank

        # The encoder input feature size is fixed at 40.
        self.encoder = AcousticModel(40, hidden_size, prj_size, vocab_size, n_layers=am_layers, dropout=dropout,
                                     checkpoint=am_checkpoint)
        self.decoder = LanguageModel(emb_size, hidden_size, prj_size, vocab_size, n_layers=lm_layers, dropout=dropout, blank=blank,
                                     checkpoint=lm_checkpoint)

        # Only `.features()` / `.step_features()` of the sub-models are used
        # here, so their stand-alone `fc` heads are frozen. The tensor attribute
        # is `requires_grad`; the previous `requires_grads` spelling merely
        # created an unused attribute and froze nothing.
        for p in self.encoder.fc.parameters():
            p.requires_grad = False
        for p in self.decoder.fc.parameters():
            p.requires_grad = False

        self.fc = nn.Linear(prj_size, vocab_size)

    def joint(self, x, y):
        """
        :param x: Encoder features (last axis of size prj_size).
        :param y: Decoder features, broadcastable against ``x``.
        :return: Log-probabilities over the vocabulary on the last axis.
        """
        z = torch.tanh(x + y)
        z = self.fc(z)
        z = log_softmax(z, dim=-1)
        return z

    def forward(self, xs, ys, xn, yn):
        """
        :param xs: Acoustic input for the encoder (NxCxDxT).
        :param ys: Label indices for the decoder, batch on axis 1.
        :param xn: Tensor with the valid number of frames per utterance.
        :param yn: Tensor with the number of labels per utterance.
        :return: Joint log-probabilities of size N x T x U x vocab, the encoder
                 features (N x T x prj_size) and the frame lengths after the
                 encoder's conv stack.
        """
        # encoder
        xs, xn = self.encoder.features(xs, xn)
        xs = xs.transpose(0, 1)
        # decoder
        ys = self.decoder.features(ys, yn)
        ys = ys.transpose(0, 1)
        # align: every encoder frame is paired with every decoder step
        n, t, x_h = xs.size()
        n, u, y_h = ys.size()
        x = xs.unsqueeze(dim=2).expand(torch.Size([n, t, u, x_h]))
        y = ys.unsqueeze(dim=1).expand(torch.Size([n, t, u, y_h]))
        # predict
        zs = self.joint(x, y)
        return zs, xs, xn

    def greedy_decode(self, xs):
        """
        Frame-synchronous greedy decoding: one label (possibly blank) is taken
        per encoder frame, and the decoder state only advances for the
        sequences whose label was not blank.
        :param xs: Encoder features of size N x T x prj_size (as returned by ``forward``).
        :return: CPU int tensor of size N x T with the chosen label per frame.
        """

        n, t, h = xs.size()

        # Start every sequence from label index 0.
        c = torch.zeros((1, n), device=xs.device).long()
        yd, (hd, cd) = self.decoder.step_features(c)

        s = torch.zeros((n, t), dtype=torch.int)

        for i in range(t):

            z = self.joint(xs[:, i], yd[0])

            c = torch.argmax(z, dim=-1).view(1, n)

            s[:, i] = c.cpu().view(n)

            # 1 x N x 1 so it broadcasts over both features and LSTM states.
            mask = c == self.blank
            mask = mask.unsqueeze(-1)

            yd_next, (hd_next, cd_next) = self.decoder.step_features(c, (hd, cd))

            # Keep the previous output/state wherever blank was emitted.
            yd = torch.where(mask, yd, yd_next)
            hd = torch.where(mask, hd, hd_next)
            cd = torch.where(mask, cd, cd_next)

        return s


In [0]:
%cd /content/asr/

In [0]:
import sys
import torch
import torch.nn as nn

import numpy as np

from data import Labels, AudioDataset, DataLoader, collate_fn_rnnt, BucketingSampler

from tqdm import tqdm_notebook, tqdm

# from model import Transducer
from utils import AverageMeter, entropy

import decoder

from warp_rnnt import rnnt_loss

# Reproducibility / speed knobs.
torch.backends.cudnn.benchmark = True
torch.manual_seed(0)
np.random.seed(0)

labels = Labels()

model = Transducer(128, len(labels), 256, 256, am_layers=3, lm_layers=2, dropout=0.3)

# Resume from the checkpoint copied from Drive. Loading onto the CPU first
# (as the sub-model checkpoint loaders do) avoids a throw-away GPU copy;
# the model is moved to the GPU below.
model.load_state_dict(torch.load("../model_lm2_4e5_samp_0_ep", map_location='cpu'))

train = AudioDataset('/content/train.txt', labels)
test = AudioDataset('/content/val.txt', labels)

# NOTE(review): the filter_* helpers live in the project's `data` module;
# each prints one "filter <count> <percent>" line in the cell output.
train.filter_by_conv(model.encoder.conv)
train.filter_by_length(5000)

test.filter_by_conv(model.encoder.conv)
test.filter_by_length(10000)

# Move the model to the GPU before constructing the optimizer, as the
# torch.optim documentation asks, so the optimizer is built on the final parameters.
model.cuda()

optimizer = torch.optim.Adam(model.parameters(), lr=3e-4, weight_decay=1e-5)

sampler = BucketingSampler(train, 32)

# From here on `train` / `test` are DataLoaders, not datasets.
train = DataLoader(train, pin_memory=True, num_workers=2, collate_fn=collate_fn_rnnt, batch_sampler=sampler)
test = DataLoader(test, pin_memory=True, num_workers=2, collate_fn=collate_fn_rnnt, batch_size=16)

# Metric histories filled by the training cell.
train_err = []
train_grad = []

val_err = []
val_wer = []
val_cer = []
val_ent = []

filter     765    0.05%
filter       0    0.00%
filter       0    0.00%
filter       0    0.00%


In [0]:
print("Start\n")
# new_epoch = epoch + 1
# Index of the first epoch to run; 1 because the "..._0_ep" checkpoint was loaded.
new_epoch = 1

for epoch in range(new_epoch, 100):

    sampler.shuffle(epoch)

    # ---- training pass ----
    model.train()

    err = AverageMeter('loss')
    grd = AverageMeter('gradient')

    progress = train
    for xs, ys, xn, yn in progress:

        optimizer.zero_grad()

        xs = xs.cuda(non_blocking=True)
        ys = ys.cuda(non_blocking=True)
        xn = xn.cuda(non_blocking=True)
        yn = yn.cuda(non_blocking=True)

        # The model returns the frame lengths after the conv stack; the loss needs those.
        zs, xs, xn = model(xs, ys, xn, yn)

        # The model takes labels with the batch on axis 1; the loss gets them batch-first.
        ys = ys.t().contiguous()

        loss = rnnt_loss(zs, ys, xn, yn, average_frames=False, reduction="mean")
        loss.backward()

        grad_norm = nn.utils.clip_grad_norm_(model.parameters(), 100)

        optimizer.step()

        err.update(loss.item())
        # clip_grad_norm_ may return a tensor; store a plain float in the meter.
        grd.update(float(grad_norm))

        # progress.set_description('epoch %d %s %s' % (epoch + 1, err, grd))

    # One history entry per epoch. Appending inside the batch loop only stored
    # the same meter object once per batch.
    train_err.append(err)
    train_grad.append(grd)

    print('epoch %d %s %s' % (epoch + 1, err, grd))

    # Save the checkpoint locally, then upload the same file to Drive.
    checkpoint_name = "model_lm2_h256_full_samp_{}_ep".format(epoch)
    checkpoint_path = "../" + checkpoint_name
    torch.save(model.state_dict(), checkpoint_path)
    save_file_to_drive(checkpoint_name, checkpoint_path)

    # ---- validation pass ----
    model.eval()

    err = AverageMeter('loss')
    cer = AverageMeter('cer')
    wer = AverageMeter('wer')
    ent = AverageMeter('ent')

    with torch.no_grad():
        progress = test
        for xs, ys, xn, yn in progress:

            xs = xs.cuda(non_blocking=True)
            ys = ys.cuda(non_blocking=True)
            xn = xn.cuda(non_blocking=True)
            yn = yn.cuda(non_blocking=True)

            zs, xs, xn = model(xs, ys, xn, yn)

            ys = ys.t().contiguous()

            loss = rnnt_loss(zs, ys, xn, yn, average_frames=False, reduction="mean")

            # From here `xs` holds the greedily decoded label per frame.
            xs = model.greedy_decode(xs)

            err.update(loss.item())
            ent.update(entropy(xs))

            hypothesis = decoder.unpad(xs, xn, labels)
            references = decoder.unpad(ys, yn, labels)

            # Always true with modulus 1; raise the modulus to score CER/WER less often.
            if (epoch % 1) == 0:
                for h, r in zip(hypothesis, references):
                    cer.update(decoder.cer(h, r))
                    wer.update(decoder.wer(h, r))

            # progress.set_description('epoch %d %s %s %s %s' % (epoch + 1, err, cer, wer, ent))

        # One history entry per epoch (see the training pass).
        val_err.append(err)
        val_wer.append(wer)
        val_cer.append(cer)
        val_ent.append(ent)

        print('epoch %d %s %s %s %s' % (epoch + 1, err, cer, wer, ent))
        sys.stderr.write('\n')

Start

epoch 2 loss 22.5197±9.15 gradient 24.7021±7.04
epoch 2 loss 17.0966±9.15 cer 0.5015±0.29 wer 0.7573±0.30 ent 1.0626±0.17



