[example](https://github.com/LearnedVector/Wav2Letter/blob/master/Google%20Speech%20Command%20Example.ipynb)

In [70]:
# python train.py
# $DIR_FOR_PREPROCESSED_DATA
# --save-dir $MODEL_PATH
# --max-epoch 80
# --task speech_recognition
# --arch vggtransformer_2
# --optimizer adadelta
# --lr 1.0
# --adadelta-eps 1e-8
# --adadelta-rho 0.95
# --clip-norm 10.0
# --max-tokens 5000
# --log-format json
# --log-interval 1
# --criterion cross_entropy_acc
# --user-dir examples/speech_recognition/

In [71]:
import torch
import torchaudio
from torchaudio.datasets import LIBRISPEECH, SPEECHCOMMANDS


In [72]:
torchaudio.set_audio_backend("soundfile")

In [73]:
from torchaudio.datasets import LIBRISPEECH
# waveform, sample_rate, utterance, speaker_id, chapter_id, utterance_id

class SAFE_LIBRISPEECH(LIBRISPEECH):

    def __getitem__(self, n):
        try:
            return super().__getitem__(n)
        except (FileNotFoundError, RuntimeError):
            return None
        
    def __next__(self):
        try:
            return super().__next__()
        except (FileNotFoundError, RuntimeError):
            return self.__next__()

        
def datasets():

    folder_in_archive = 'LibriSpeech'
    download = True
    root = "./"

    print("train")
    dataset1 = SAFE_LIBRISPEECH(root, url='train-clean-100', folder_in_archive=folder_in_archive, download=download)
    # print(dataset1[0])
    dataset2 = SAFE_LIBRISPEECH(root, url='train-clean-360', folder_in_archive=folder_in_archive, download=download)
    # dataset3 = SAFE_LIBRISPEECH(root, url='train-other-500', folder_in_archive=folder_in_archive, download=download)
    # train = torch.utils.data.ConcatDataset([dataset1, dataset2, dataset3])
    train = torch.utils.data.ConcatDataset([dataset1, dataset2])
    # print(train[0])

    print("valid")
    dataset1 = SAFE_LIBRISPEECH(root, url='dev-clean', folder_in_archive=folder_in_archive, download=download)
    dataset2 = SAFE_LIBRISPEECH(root, url='dev-other', folder_in_archive=folder_in_archive, download=download)
    valid = torch.utils.data.ConcatDataset([dataset1, dataset2])

    print("test")
    dataset1 = SAFE_LIBRISPEECH(root, url='test-other', folder_in_archive=folder_in_archive, download=download)
    dataset2 = SAFE_LIBRISPEECH(root, url='test-clean', folder_in_archive=folder_in_archive, download=download)
    test = torch.utils.data.ConcatDataset([dataset1, dataset2])

    return train, valid, test

In [None]:
class PROCESSED_SPEECHCOMMANDS(SPEECHCOMMANDS):
    def __getitem__(self, n):
        return super().__getitem__(n)

    def __next__(self):
        return super().__next__()

In [None]:
class MemoryCache(Dataset):
    """
    Wrap a dataset so that, whenever a new item is returned, it is saved to disk.
    """
                                                      
    def __init__(self, dataset):
        self.dataset = dataset                                                   
        self.location = location
                                            
        self._id = id(self)                         
        self._cache = [None] * len(dataset)
                                  
    def __getitem__(self, n):      
        if self._cache[n]:                             
            return self._cache[n]          
                               
        item = self.dataset[n]          
        self._cache[n] = item             
                                          
        return item                       
                                  
    def __len__(self):                 
        return len(self.dataset) 

In [74]:
#     waveform, sample_rate, label, speaker_id, utterance_number

def datasets():

    download = True
    root = "./"

    print("train")
    dataset1 = SPEECHCOMMANDS(root, download=download)
    dataset1 = MemoryCache(dataset1)

    return dataset1, None, None

In [75]:
train, valid, test = datasets()

train


In [76]:
# spm_train 
# --input=data/lang_char/input.txt
# --vocab_size=${nbpe}
# --model_type=${bpemode}
# --model_prefix=${bpemodel}
# --input_sentence_size=100000000
# --unk_id  =3
# --eos_id=2
# --pad_id=1
# --bos_id=-1
# --character_coverage=1

In [77]:
import sentencepiece as spm
sp = spm.SentencePieceProcessor()
# sp.Load(bpemodel + ".model")
sp.Load("/Users/vincentqb/spm.model")

token = sp.encode_as_pieces("This is a test")
token = " ".join(token)

print(token)

token = sp.encode_as_ids("This is a test")
# token = " ".join(str(token))

print(token)

▁T his ▁ is ▁ a ▁ test
[640, 3, 394, 3, 394, 3, 394, 3]


In [79]:
labels = [
    '-', '*', 'right', 'eight', 'cat', 'tree', 'bed', 'happy', 'go', 'dog', 'no', 
    'wow', 'nine', 'left', 'stop', 'three', 'sheila', 'one', 'bird', 'zero',
    'seven', 'up', 'marvin', 'two', 'house', 'down', 'six', 'yes', 'on', 
    'five', 'off', 'four',
]

labels = [
        '-', '*',
        "backward",
        "bed",
        "bird",
        "cat",
        "dog",
        "down",
        "eight",
        "five",
        "follow",
        "forward",
        "four",
        "go",
        "happy",
        "house",
        "learn",
        "left",
        "marvin",
        "nine",
        "no",
        "off",
        "on",
        "one",
        "right",
        "seven",
        "sheila",
        "six",
        "stop",
        "three",
        "tree",
        "two",
        "up",
        "visual",
        "wow",
        "yes",
        "zero",
]

import collections


def build_mapping(labels):
    labels = list(collections.OrderedDict.fromkeys(list("".join(labels))))
    enumerated = list(enumerate(labels))
    flipped = [(sub[1], sub[0]) for sub in enumerated]

    d1 = collections.OrderedDict(enumerated)
    d2 = collections.OrderedDict(flipped)
    return {**d1, **d2}

def padding(l, max_length, fillwith):
    return l  + [fillwith] * (max_length-len(l))

def map_with_dict(mapping, l):
    return [mapping[t] for t in l]

def apply_with_padding(l, mapping, max_length, fillwith):
    l = map_with_dict(mapping, l)
    l = padding(l, max_length, mapping["*"])
    return l


test = "house"
max_length = max(map(len, labels))
vocab_size = len(labels) + 2

mapping = build_mapping(labels)

# test = apply(mapping, test)
# test = padding(test, max_length, mapping["*"])

encode = lambda l: apply_with_padding(l, mapping, max_length, mapping["*"])
decode = lambda l: apply_with_padding(l, mapping, max_length, mapping[1])

decode(encode(test))

['h', 'o', 'u', 's', 'e', '*', '*', '*']

In [None]:
from torchaudio.transforms import MFCC

num_features = 13

melkwargs = {
    'n_fft': 512,
    'n_mels': 20,
    'hop_length': 80,
}

mfcc = MFCC(sample_rate=16000, n_mfcc=num_features, melkwargs=melkwargs)

# audio, self.sr, window_stride=(160, 80),
# fft_size=512, num_filt=20, num_coeffs=13

def process_waveform(waveform):
    # pick first channel, apply mfcc, tranpose for pad_sequence
    return mfcc(waveform)[0, ...].transpose(0, -1)

def process_target(target):

    # targets = []
    # for b in batch:
    #     if b:
    #         token = sp.encode_as_pieces(b[2])
    #         print(len(token))
    #         token = " ".join(token)
    #         targets.append(token)

    # return " ".join(sp.encode_as_ids(target))
    
    # return torch.IntTensor(sp.encode_as_ids(target))
    # print(target)
    return torch.IntTensor(encode(target))

In [80]:
from torch.utils.data import DataLoader
from random import randint



def collate_fn(batch):

    tensors = [process_waveform(b[0]) for b in batch if b]
    targets = [process_target(b[2]) for b in batch if b]

    # truncate tensor list
    # length = 2**10
    # a = max(0, min([tensor.shape[-1] for tensor in tensors]) - length)
    # m = randint(0, a)
    # n = m + length
    # tensors = [t[..., m:n] for t in tensors]
    
    input_lengths = [t.shape[0] for t in tensors]
    target_lengths = [len(t) for t in targets]

    if tensors:    
        targets = torch.nn.utils.rnn.pad_sequence(targets, batch_first=True)
        tensors = torch.nn.utils.rnn.pad_sequence(tensors, batch_first=True)
        tensors = tensors.transpose(1, -1)
        return tensors, targets, input_lengths, target_lengths
    else:
        return None, None, None, None

max_tokens = 5000  # max number of tokens per batch
# vocab_size = max_tokens
batch_size = 32  # max number of sentences per batch
loader_train = DataLoader(train, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)

In [81]:
for a, b, c, d in loader_train:
    print(a.shape)
    print(b.shape)
    print(c)
    print(d)
    break

torch.Size([32, 13, 201])
torch.Size([32, 8])
[201, 201, 201, 201, 201, 201, 201, 201, 201, 180, 201, 201, 171, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 188, 201, 182, 201, 201, 201, 201, 201, 201]
[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]


In [82]:
input_feat_per_channel = 80
vggblock_enc_config = "[(64, 3, 2, 2, True), (128, 3, 2, 2, True)]"
transformer_enc_config = "((1024, 16, 4096, True, 0.15, 0.15, 0.15),) * 16"
enc_output_dim = 1024
tgt_embed_dim = 512
conv_dec_config = "((256, 3, True),) * 4"
transformer_dec_config = "((1024, 16, 4096, True, 0.15, 0.15, 0.15),) * 6"

In [96]:
from torch import nn


class PrintLayer(nn.Module):
    def __init__(self):
        super(PrintLayer, self).__init__()
    
    def forward(self, x):
        # Do your print / debug stuff here
        print(x)
        return x
    
    

class Wav2Letter(nn.Module):
    """Wav2Letter Speech Recognition model
        Architecture is based off of Facebooks AI Research paper
        https://arxiv.org/pdf/1609.03193.pdf
        This specific architecture accepts mfcc or
        power spectrums speech signals
        TODO: use cuda if available
        Args:
            num_features (int): number of mfcc features
            num_classes (int): number of unique grapheme class labels
    """

    def __init__(self, num_features, num_classes):
        super(Wav2Letter, self).__init__()

        # Conv1d(in_channels, out_channels, kernel_size, stride)
        self.layers = nn.Sequential(
            # PrintLayer(),
            nn.Conv1d(num_features, 250, 48, 2),
            nn.ReLU(),
            nn.Conv1d(250, 250, 7),
            nn.ReLU(),
            nn.Conv1d(250, 250, 7),
            nn.ReLU(),
            nn.Conv1d(250, 250, 7),
            nn.ReLU(),
            nn.Conv1d(250, 250, 7),
            nn.ReLU(),
            # nn.Conv1d(250, 250, 7),
            # nn.ReLU(),
            # nn.Conv1d(250, 250, 7),
            # nn.ReLU(),
            nn.Conv1d(250, 250, 7),
            nn.ReLU(),
            nn.Conv1d(250, 2000, 32),
            nn.ReLU(),
            nn.Conv1d(2000, 2000, 1),
            nn.ReLU(),
            nn.Conv1d(2000, num_classes, 1),
        )

    def forward(self, batch):
        """Forward pass through Wav2Letter network than 
            takes log probability of output
        Args:
            batch (int): mini batch of data
             shape (batch, num_features, frame_len)
        Returns:
            log_probs (torch.Tensor):
                shape  (batch_size, num_classes, output_len)
        """
        # y_pred shape (batch_size, num_classes, output_len)
        y_pred = self.layers(batch)

        # compute log softmax probability on graphemes
        log_probs = nn.functional.log_softmax(y_pred, dim=1)

        return log_probs


model = Wav2Letter(num_features, vocab_size)

In [99]:
import torchaudio
from torch.optim import Adadelta

model = Wav2Letter(num_features, vocab_size)

optimizer_params = {
    "lr": 1.0,
    "eps": 1e-8,
    "rho": 0.95,
}
optimizer = Adadelta(model.parameters(), **optimizer_params)

max_epoch = 2 # 80
clip_norm = 10.

criterion = torch.nn.CTCLoss()

# max_files = 10
for epoch in range(max_epoch):
    # print(epoch)
    
    i_files = 0
    for inputs, targets, _, target_lengths in loader_train:
        # if i_files > max_files:
        #     break

        # print(i_files, max_files)

        if inputs is None or targets is None:
            continue

        # print("input", inputs.shape)
        outputs = model(inputs)
        # (input length, batch size, number of classes)
        # input_lengths = [len(o) for o in outputs]

        outputs = outputs.transpose(1, 2).transpose(0, 1)
        # print("output", outputs.shape)
        # print("target", targets.shape)
        
        # print(inputs.shape)
        # print(outputs.shape)
        # print(targets.shape)
        # print(len(targets))
        # print(targets.shape)
        # print(input_lengths)
        # input_lengths = [len(o) for o in outputs]
        # print(len(input_lengths))
        # target_lengths = [len(t) for t in targets]
        # print(target_lengths)
        # ctc_loss(input, target, input_lengths, target_lengths)

        # input_lengths = [outputs.shape[0]] * outputs.shape[1]
        
        # CTC arguments
        # https://pytorch.org/docs/master/nn.html#torch.nn.CTCLoss
        # better definitions for ctc arguments
        # https://discuss.pytorch.org/t/ctcloss-with-warp-ctc-help/8788/3
        mini_batch_size = len(inputs)
        
        input_lengths = torch.full((mini_batch_size,), outputs.shape[0], dtype=torch.long)
        target_lengths = torch.IntTensor([target.shape[0] for target in targets])
        
        # print(torch.isnan(outputs).any())
        # print(torch.isnan(targets).any())
        # print(torch.isnan(input_lengths).any())
        # print(torch.isnan(target_lengths).any())
        # print(outputs.shape)
        # print(targets.shape)
        # print(input_lengths.shape)
        # print(target_lengths.shape)

        # outputs: input length, batch size, number of classes (including blank) 
        # targets: batch size, max target length
        # input_lengths: batch size
        # target_lengths: batch size
        loss = criterion(outputs, targets, input_lengths, target_lengths)

        # print("stepping")
        optimizer.zero_grad()
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(model.parameters(), clip_norm)
        optimizer.step()
        
        i_files += 1
    
    print(epoch, loss)

KeyboardInterrupt: 

In [87]:
from torch import topk

def GreedyDecoder(ctc_matrix, blank_label=0):
    """Greedy Decoder. Returns highest probability of
        class labels for each timestep
        # TODO: collapse blank labels
    Args:
        ctc_matrix (torch.Tensor): 
            shape (1, num_classes, output_len)
        blank_label (int): blank labels to collapse
    
    Returns:
        torch.Tensor: class labels per time step.
         shape (ctc timesteps)
    """
    _, indices = topk(ctc_matrix, k=1, dim=1)
    return indices[:, 0, :]

In [88]:
sample = inputs[0].unsqueeze(0)
target = targets[0]

# decode(targets[0].tolist())

output = model(sample)
print(output)

# output = GreedyDecoder(output)

# print(output.shape)

tensor([[[nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
         [nan, nan, nan, nan],
        