[example](https://github.com/LearnedVector/Wav2Letter/blob/master/Google%20Speech%20Command%20Example.ipynb)

In [13]:
# python train.py
# $DIR_FOR_PREPROCESSED_DATA
# --save-dir $MODEL_PATH
# --max-epoch 80
# --task speech_recognition
# --arch vggtransformer_2
# --optimizer adadelta
# --lr 1.0
# --adadelta-eps 1e-8
# --adadelta-rho 0.95
# --clip-norm 10.0
# --max-tokens 5000
# --log-format json
# --log-interval 1
# --criterion cross_entropy_acc
# --user-dir examples/speech_recognition/

In [14]:
import torch
import torchaudio
from torchaudio.datasets import LIBRISPEECH, SPEECHCOMMANDS


In [15]:
torchaudio.set_audio_backend("soundfile")

In [16]:
from torchaudio.datasets import LIBRISPEECH
# waveform, sample_rate, utterance, speaker_id, chapter_id, utterance_id

class SAFE_LIBRISPEECH(LIBRISPEECH):

    def __getitem__(self, n):
        try:
            return super().__getitem__(n)
        except (FileNotFoundError, RuntimeError):
            return None
        
    def __next__(self):
        try:
            return super().__next__()
        except (FileNotFoundError, RuntimeError):
            return self.__next__()

        
def datasets():

    folder_in_archive = 'LibriSpeech'
    download = True
    root = "./"

    print("train")
    dataset1 = SAFE_LIBRISPEECH(root, url='train-clean-100', folder_in_archive=folder_in_archive, download=download)
    # print(dataset1[0])
    dataset2 = SAFE_LIBRISPEECH(root, url='train-clean-360', folder_in_archive=folder_in_archive, download=download)
    # dataset3 = SAFE_LIBRISPEECH(root, url='train-other-500', folder_in_archive=folder_in_archive, download=download)
    # train = torch.utils.data.ConcatDataset([dataset1, dataset2, dataset3])
    train = torch.utils.data.ConcatDataset([dataset1, dataset2])
    # print(train[0])

    print("valid")
    dataset1 = SAFE_LIBRISPEECH(root, url='dev-clean', folder_in_archive=folder_in_archive, download=download)
    dataset2 = SAFE_LIBRISPEECH(root, url='dev-other', folder_in_archive=folder_in_archive, download=download)
    valid = torch.utils.data.ConcatDataset([dataset1, dataset2])

    print("test")
    dataset1 = SAFE_LIBRISPEECH(root, url='test-other', folder_in_archive=folder_in_archive, download=download)
    dataset2 = SAFE_LIBRISPEECH(root, url='test-clean', folder_in_archive=folder_in_archive, download=download)
    test = torch.utils.data.ConcatDataset([dataset1, dataset2])

    return train, valid, test

In [17]:
#     waveform, sample_rate, label, speaker_id, utterance_number

def datasets():

    download = True
    root = "./"

    print("train")
    dataset1 = SPEECHCOMMANDS(root, download=download)

    return dataset1, None, None

In [18]:
train, valid, test = datasets()

train


In [19]:
# spm_train 
# --input=data/lang_char/input.txt
# --vocab_size=${nbpe}
# --model_type=${bpemode}
# --model_prefix=${bpemodel}
# --input_sentence_size=100000000
# --unk_id  =3
# --eos_id=2
# --pad_id=1
# --bos_id=-1
# --character_coverage=1

In [20]:
import sentencepiece as spm
sp = spm.SentencePieceProcessor()
# sp.Load(bpemodel + ".model")
sp.Load("/Users/vincentqb/spm.model")

token = sp.encode_as_pieces("This is a test")
token = " ".join(token)

print(token)

token = sp.encode_as_ids("This is a test")
# token = " ".join(str(token))

print(token)

▁T his ▁ is ▁ a ▁ test
[640, 3, 394, 3, 394, 3, 394, 3]


In [21]:
next(iter(train))

(tensor([[ 0.0000e+00,  0.0000e+00, -3.0518e-05,  ..., -6.1035e-05,
          -6.1035e-05, -6.1035e-05]]),
 16000,
 'right',
 '8e523821',
 2)

In [22]:
labels = [
    '-', '*', 'right', 'eight', 'cat', 'tree', 'bed', 'happy', 'go', 'dog', 'no', 
    'wow', 'nine', 'left', 'stop', 'three', 'sheila', 'one', 'bird', 'zero',
    'seven', 'up', 'marvin', 'two', 'house', 'down', 'six', 'yes', 'on', 
    'five', 'off', 'four'
]

import collections


def build_mapping(labels):
    labels = list(collections.OrderedDict.fromkeys(list("".join(labels))))
    enumerated = list(enumerate(labels))
    flipped = [(sub[1], sub[0]) for sub in enumerated]

    d1 = collections.OrderedDict(enumerated)
    d2 = collections.OrderedDict(flipped)
    return {**d1, **d2}

def padding(l, max_length, fillwith):
    return l  + [fillwith] * (max_length-len(l))

def map_with_dict(mapping, l):
    return [mapping[t] for t in l]

def apply_with_padding(l, mapping, max_length, fillwith):
    l = map_with_dict(mapping, l)
    l = padding(l, max_length, mapping["*"])
    return l


test = "house"
max_length = max(map(len, labels))
vocab_size = len(labels) + 2

mapping = build_mapping(labels)

# test = apply(mapping, test)
# test = padding(test, max_length, mapping["*"])

encode = lambda l: apply_with_padding(l, mapping, max_length, mapping["*"])
decode = lambda l: apply_with_padding(l, mapping, max_length, mapping[1])

decode(encode(test))

['h', 'o', 'u', 's', 'e', '*']

In [23]:
from torch.utils.data import DataLoader
from random import randint

from torchaudio.transforms import MFCC

num_features = 13

melkwargs = {
    'n_fft': 512,
    'n_mels': 20,
    'hop_length': 80,
}

mfcc = MFCC(sample_rate=16000, n_mfcc=num_features, melkwargs=melkwargs)

# audio, self.sr, window_stride=(160, 80),
# fft_size=512, num_filt=20, num_coeffs=13

def process_waveform(waveform):
    # pick first channel, apply mfcc, tranpose for pad_sequence
    return mfcc(waveform)[0, ...].transpose(0, -1)

def process_target(target):

    # targets = []
    # for b in batch:
    #     if b:
    #         token = sp.encode_as_pieces(b[2])
    #         print(len(token))
    #         token = " ".join(token)
    #         targets.append(token)

    # return " ".join(sp.encode_as_ids(target))
    
    # return torch.IntTensor(sp.encode_as_ids(target))
    return torch.IntTensor(encode(target))

def collate_fn(batch):

    tensors = [process_waveform(b[0]) for b in batch if b]
    targets = [process_target(b[2]) for b in batch if b]

    # truncate tensor list
    # length = 2**10
    # a = max(0, min([tensor.shape[-1] for tensor in tensors]) - length)
    # m = randint(0, a)
    # n = m + length
    # tensors = [t[..., m:n] for t in tensors]
    
    input_lengths = [t.shape[0] for t in tensors]
    target_lengths = [len(t) for t in targets]

    if tensors:    
        targets = torch.nn.utils.rnn.pad_sequence(targets, batch_first=True)
        tensors = torch.nn.utils.rnn.pad_sequence(tensors, batch_first=True)
        tensors = tensors.transpose(1, -1)
        return tensors, targets, input_lengths, target_lengths
    else:
        return None, None, None, None

max_tokens = 5000  # max number of tokens per batch
# vocab_size = max_tokens
batch_size = 3  # max number of sentences per batch
loader_train = DataLoader(train, batch_size=batch_size, collate_fn=collate_fn)

In [24]:
for a, b, c, d in loader_train:
    print(a.shape)
    print(b.shape)
    print(c)
    print(d)
    break

torch.Size([3, 13, 201])
torch.Size([3, 6])
[201, 201, 201]
[6, 6, 6]


In [25]:
input_feat_per_channel = 80
vggblock_enc_config = "[(64, 3, 2, 2, True), (128, 3, 2, 2, True)]"
transformer_enc_config = "((1024, 16, 4096, True, 0.15, 0.15, 0.15),) * 16"
enc_output_dim = 1024
tgt_embed_dim = 512
conv_dec_config = "((256, 3, True),) * 4"
transformer_dec_config = "((1024, 16, 4096, True, 0.15, 0.15, 0.15),) * 6"

In [26]:
from torch import nn


class PrintLayer(nn.Module):
    def __init__(self):
        super(PrintLayer, self).__init__()
    
    def forward(self, x):
        # Do your print / debug stuff here
        print(x)
        return x
    
    

class Wav2Letter(nn.Module):
    """Wav2Letter Speech Recognition model
        Architecture is based off of Facebooks AI Research paper
        https://arxiv.org/pdf/1609.03193.pdf
        This specific architecture accepts mfcc or
        power spectrums speech signals
        TODO: use cuda if available
        Args:
            num_features (int): number of mfcc features
            num_classes (int): number of unique grapheme class labels
    """

    def __init__(self, num_features, num_classes):
        super(Wav2Letter, self).__init__()

        # Conv1d(in_channels, out_channels, kernel_size, stride)
        self.layers = nn.Sequential(
            PrintLayer(),
            nn.Conv1d(num_features, 250, 48, 2),
            nn.ReLU(),
            nn.Conv1d(250, 250, 7),
            nn.ReLU(),
            nn.Conv1d(250, 250, 7),
            nn.ReLU(),
            nn.Conv1d(250, 250, 7),
            nn.ReLU(),
            nn.Conv1d(250, 250, 7),
            nn.ReLU(),
            nn.Conv1d(250, 250, 7),
            nn.ReLU(),
            nn.Conv1d(250, 250, 7),
            nn.ReLU(),
            nn.Conv1d(250, 250, 7),
            nn.ReLU(),
            nn.Conv1d(250, 2000, 32),
            nn.ReLU(),
            nn.Conv1d(2000, 2000, 1),
            nn.ReLU(),
            nn.Conv1d(2000, num_classes, 1),
        )

    def forward(self, batch):
        """Forward pass through Wav2Letter network than 
            takes log probability of output
        Args:
            batch (int): mini batch of data
             shape (batch, num_features, frame_len)
        Returns:
            log_probs (torch.Tensor):
                shape  (batch_size, num_classes, output_len)
        """
        # y_pred shape (batch_size, num_classes, output_len)
        y_pred = self.layers(batch)

        # compute log softmax probability on graphemes
        log_probs = nn.functional.log_softmax(y_pred, dim=1)

        return log_probs

model = Wav2Letter(num_features, vocab_size)

In [27]:
import torchaudio
from torch.optim import Adadelta

optimizer_params = {
    "lr": 1.0,
    "eps": 1e-8,
    "rho": 0.95,
}
optimizer = Adadelta(model.parameters(), **optimizer_params)

max_epoch = 2 # 80
clip_norm = 10.

criterion = torch.nn.CTCLoss()

max_files = 10
for epoch in range(max_epoch):
    # print(epoch)
    
    i_files = 0
    for inputs, targets, _, target_lengths in loader_train:
        if i_files > max_files:
            break

        print(i_files, max_files)

        if inputs is None or targets is None:
            continue

        print("input", inputs.shape)
        outputs = model(inputs)
        # (input length, batch size, number of classes)
        # input_lengths = [len(o) for o in outputs]

        outputs = outputs.transpose(1, 2).transpose(0, 1)

        # print(inputs.shape)
        # print(outputs.shape)
        # print(targets.shape)
        # print(len(targets))
        # print(targets.shape)
        # print(input_lengths)
        # input_lengths = [len(o) for o in outputs]
        # print(len(input_lengths))
        # target_lengths = [len(t) for t in targets]
        # print(target_lengths)
        # ctc_loss(input, target, input_lengths, target_lengths)

        input_lengths = [outputs.shape[0]] * outputs.shape[1]
        loss = criterion(outputs, targets, input_lengths, target_lengths)

        # print("stepping")
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip_norm)
        optimizer.step()
        
        i_files += 1

0 10
input torch.Size([3, 13, 201])
tensor([[[-2.0551e+02, -2.0546e+02, -2.0544e+02,  ..., -2.0606e+02,
          -2.0593e+02, -2.0594e+02],
         [ 9.6541e-01,  1.0404e+00,  1.0608e+00,  ...,  2.0251e-01,
           3.7340e-01,  3.5829e-01],
         [ 8.8461e-01,  9.5334e-01,  9.7204e-01,  ...,  1.5497e-01,
           2.8577e-01,  2.7420e-01],
         ...,
         [-7.0201e-01, -7.5656e-01, -7.7140e-01,  ..., -1.5495e-01,
          -2.8575e-01, -2.7418e-01],
         [-8.4657e-01, -9.1234e-01, -9.3024e-01,  ..., -8.3917e-02,
          -1.5470e-01, -1.4844e-01],
         [-9.4423e-01, -1.0176e+00, -1.0376e+00,  ...,  1.5817e-05,
           1.5817e-05,  1.5817e-05]],

        [[-1.7214e+02, -1.7196e+02, -1.7240e+02,  ..., -1.3082e+02,
          -1.2717e+02, -1.2653e+02],
         [ 2.7126e+00,  3.5427e+00,  3.9221e+00,  ..., -2.9898e+01,
          -3.6009e+01, -3.8999e+01],
         [ 4.2158e+00,  2.3428e+00, -3.2983e-01,  ...,  2.7697e+01,
           2.7758e+01,  2.7359e+01],
   

5 10
input torch.Size([3, 13, 201])
tensor([[[-7.4427e+01, -7.3534e+01, -7.1631e+01,  ..., -3.9981e+01,
          -4.4656e+01, -4.7321e+01],
         [ 3.8314e+01,  3.8035e+01,  3.8432e+01,  ...,  7.3542e+00,
           1.1236e+01,  1.1830e+01],
         [ 1.1105e+01,  1.0009e+01,  7.5111e+00,  ...,  1.2808e+01,
           9.9535e+00,  7.6007e+00],
         ...,
         [-1.8385e+00, -1.7110e+00, -1.3273e+00,  ..., -2.3206e+00,
          -1.8984e+00, -1.8091e+00],
         [ 2.1342e+00,  1.9628e+00,  2.0726e+00,  ..., -4.1257e+00,
          -1.6141e+00, -3.8569e-01],
         [ 8.1677e-01,  9.5586e-01,  1.5412e+00,  ...,  1.9819e+00,
           1.1785e+00,  4.4839e-01]],

        [[-1.0867e+02, -1.0352e+02, -9.7942e+01,  ..., -1.0955e+02,
          -1.1136e+02, -1.1340e+02],
         [ 1.8459e+01,  2.1616e+01,  2.7248e+01,  ...,  1.9371e+01,
           1.8505e+01,  1.7755e+01],
         [ 8.2544e+00,  8.3072e+00,  1.0231e+01,  ...,  4.3698e+00,
           5.9766e+00,  7.1244e+00],
   

10 10
input torch.Size([3, 13, 201])
tensor([[[-1.2022e+02, -1.2040e+02, -1.2251e+02,  ..., -1.2420e+02,
          -1.2491e+02, -1.2586e+02],
         [ 2.9343e+01,  3.1349e+01,  3.3140e+01,  ...,  3.1463e+01,
           3.1577e+01,  3.1315e+01],
         [ 1.8287e+00,  1.9291e+00,  2.8441e+00,  ..., -1.0590e+00,
           9.2993e-01,  1.9012e+00],
         ...,
         [ 1.0818e+00,  2.2400e+00,  4.4768e+00,  ...,  1.2157e+00,
           1.9395e+00,  2.4451e+00],
         [-6.8659e-01, -7.7150e-01, -6.7902e-01,  ..., -1.1237e+00,
          -2.9272e+00, -3.3809e+00],
         [ 3.4166e-02, -7.3930e-01, -1.5789e+00,  ...,  1.1690e+00,
          -1.8214e+00, -3.9950e+00]],

        [[-1.4607e+02, -1.2724e+02, -1.1399e+02,  ..., -1.6818e+02,
          -1.6890e+02, -1.6998e+02],
         [ 1.4048e+01,  1.2821e+01,  1.4831e+01,  ...,  1.9576e+01,
           2.0768e+01,  2.0787e+01],
         [-9.7058e+00, -2.0076e+01, -2.2919e+01,  ...,  1.1486e+00,
           1.3188e+00,  7.3683e-01],
  

4 10
input torch.Size([3, 13, 201])
tensor([[[-9.7900e+01, -9.6968e+01, -9.5840e+01,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 5.3090e+00,  4.4287e+00,  2.7453e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 1.3191e+01,  1.1909e+01,  8.4542e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         ...,
         [-5.0433e+00, -3.9648e+00, -1.7216e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [-3.2600e-01,  1.5826e-02,  1.3113e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 1.4879e-01,  6.4955e-02,  6.2535e-01,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00]],

        [[-9.5062e+01, -9.5150e+01, -9.6207e+01,  ..., -9.9450e+01,
          -1.0089e+02, -1.0274e+02],
         [ 3.3071e+01,  3.3387e+01,  3.4256e+01,  ...,  3.2012e+01,
           3.4316e+01,  3.4627e+01],
         [ 1.3673e+01,  1.2882e+01,  1.2115e+01,  ...,  1.2593e+01,
           1.2478e+01,  1.2077e+01],
   

9 10
input torch.Size([3, 13, 201])
tensor([[[-5.7988e+01, -5.8524e+01, -6.1024e+01,  ..., -1.0545e+02,
          -1.0666e+02, -1.0866e+02],
         [ 8.2055e+00,  5.1719e+00, -4.7202e-01,  ...,  3.5545e+01,
           3.3466e+01,  3.1439e+01],
         [-1.1816e+01, -1.2359e+01, -1.4618e+01,  ...,  1.0594e+01,
           9.9183e+00,  1.0035e+01],
         ...,
         [-2.7682e+00, -2.8580e+00, -3.7649e+00,  ..., -2.7832e+00,
          -2.1657e+00, -1.8911e+00],
         [-8.2529e-02,  7.9089e-01,  2.0275e+00,  ...,  4.8093e+00,
           4.9230e+00,  5.3454e+00],
         [ 7.4543e-01,  5.6692e-01,  8.5081e-01,  ...,  1.6353e+00,
           1.9119e+00,  2.4003e+00]],

        [[-2.0483e+02, -2.0221e+02, -2.0006e+02,  ..., -1.7670e+02,
          -1.7992e+02, -1.8157e+02],
         [ 8.5228e+00,  1.0710e+01,  1.3293e+01,  ...,  1.1935e+01,
           1.1353e+01,  9.9699e+00],
         [-6.8912e-01,  2.0648e-01, -2.6089e-01,  ...,  6.3458e-01,
           9.9312e-01,  8.3993e-01],
   

In [41]:
from torch import topk

def GreedyDecoder(ctc_matrix, blank_label=0):
    """Greedy Decoder. Returns highest probability of
        class labels for each timestep
        # TODO: collapse blank labels
    Args:
        ctc_matrix (torch.Tensor): 
            shape (1, num_classes, output_len)
        blank_label (int): blank labels to collapse
    
    Returns:
        torch.Tensor: class labels per time step.
         shape (ctc timesteps)
    """
    _, indices = topk(ctc_matrix, k=1, dim=1)
    return indices[:, 0, :]

In [46]:
sample = inputs[0].unsqueeze(0)
target = targets[0]

output = model(sample)
print(output)
output = GreedyDecoder(output)

print(output.shape)

tensor([[[nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
        

In [44]:
ctc_matrix

NameError: name 'ctc_matrix' is not defined