[example](https://github.com/LearnedVector/Wav2Letter/blob/master/Google%20Speech%20Command%20Example.ipynb)

In [6]:
# python train.py
# $DIR_FOR_PREPROCESSED_DATA
# --save-dir $MODEL_PATH
# --max-epoch 80
# --task speech_recognition
# --arch vggtransformer_2
# --optimizer adadelta
# --lr 1.0
# --adadelta-eps 1e-8
# --adadelta-rho 0.95
# --clip-norm 10.0
# --max-tokens 5000
# --log-format json
# --log-interval 1
# --criterion cross_entropy_acc
# --user-dir examples/speech_recognition/

In [7]:
import torch
import torchaudio
from torchaudio.datasets import LIBRISPEECH


In [8]:
# Select torchaudio's "soundfile" backend for audio file I/O.
torchaudio.set_audio_backend("soundfile")

In [9]:
from torchaudio.datasets import LIBRISPEECH
# waveform, sample_rate, utterance, speaker_id, chapter_id, utterance_id

class SAFE_LIBRISPEECH(LIBRISPEECH):
    """LIBRISPEECH dataset that tolerates samples which fail to load.

    Indexing returns ``None`` instead of raising when the parent class raises
    ``FileNotFoundError`` or ``RuntimeError``; consumers (e.g. a collate
    function) are expected to drop those ``None`` entries.
    """

    def __getitem__(self, n):
        """Return sample ``n``, or ``None`` if it cannot be loaded."""
        try:
            return super().__getitem__(n)
        except (FileNotFoundError, RuntimeError):
            return None

    def __next__(self):
        """Return the next sample that loads, skipping the ones that fail.

        Retries in a loop rather than by recursion: ``RecursionError`` is a
        subclass of ``RuntimeError``, so a recursive retry would swallow its
        own stack overflow after a long run of unreadable samples.
        Any other exception from the parent (e.g. ``StopIteration``) propagates.
        """
        while True:
            try:
                return super().__next__()
            except (FileNotFoundError, RuntimeError):
                # Unreadable sample: move on to the following one.
                continue

        
def datasets(root="./", folder_in_archive='LibriSpeech', download=True):
    """Build the train / valid / test LibriSpeech datasets.

    Args:
        root (str): directory passed to the dataset as its root.
        folder_in_archive (str): top-level folder name of the archive.
        download (bool): forwarded to the dataset's ``download`` argument.

    Returns:
        tuple: ``(train, valid, test)``, each a
        ``torch.utils.data.ConcatDataset`` of two ``SAFE_LIBRISPEECH`` subsets:
        train = train-clean-100 + train-clean-360,
        valid = dev-clean + dev-other,
        test  = test-other + test-clean.
        (train-other-500 is intentionally not included.)
    """

    def _concat(split_name, urls):
        # Announce the split, then build its subsets in the given order.
        print(split_name)
        subsets = [
            SAFE_LIBRISPEECH(root, url=url, folder_in_archive=folder_in_archive, download=download)
            for url in urls
        ]
        return torch.utils.data.ConcatDataset(subsets)

    train = _concat("train", ['train-clean-100', 'train-clean-360'])
    valid = _concat("valid", ['dev-clean', 'dev-other'])
    test = _concat("test", ['test-other', 'test-clean'])

    return train, valid, test

In [10]:
# Build the three dataset splits (datasets() requests a download of the archives).
train, valid, test = datasets()

# Bare expressions: in a notebook cell only the last one (`test`) is displayed.
train
valid
test


In [11]:
# spm_train 
# --input=data/lang_char/input.txt
# --vocab_size=${nbpe}
# --model_type=${bpemode}
# --model_prefix=${bpemodel}
# --input_sentence_size=100000000
# --unk_id=3
# --eos_id=2
# --pad_id=1
# --bos_id=-1
# --character_coverage=1

In [12]:
import os

import sentencepiece as spm

# Location of the trained SentencePiece model. Set the SPM_MODEL_PATH
# environment variable to run this notebook on another machine; the default
# is the original author's local path.
SPM_MODEL_PATH = os.environ.get("SPM_MODEL_PATH", "/Users/vincentqb/spm.model")

sp = spm.SentencePieceProcessor()
# sp.Load(bpemodel + ".model")
sp.Load(SPM_MODEL_PATH)

# Sanity check 1: show the sub-word pieces of a sample sentence.
token = sp.encode_as_pieces("This is a test")
token = " ".join(token)

print(token)

# Sanity check 2: show the integer ids of the same sentence
# (these ids are what collate_fn uses as training targets).
token = sp.encode_as_ids("This is a test")

print(token)

▁T his ▁ is ▁ a ▁ test
[640, 3, 394, 3, 394, 3, 394, 3]


In [13]:
# Display the first training sample:
# (waveform, sample_rate, utterance, speaker_id, chapter_id, utterance_id)
next(iter(train))

(tensor([[ 7.9346e-04,  0.0000e+00, -6.4087e-04,  ...,  6.4087e-04,
           2.1362e-04,  6.1035e-05]]),
 16000,
 "YES SIR SAID THE SERVANT SUCH AN INSTRUCTION WAS REMARKABLY PLEASING TO HIM THERE WAS MUCH THAT HE HAD TO DO AND THAT NIGHT'S FREEDOM WOULD ASSIST HIM MATERIALLY PERHAPS KARA HESITATED PERHAPS YOU HAD BETTER WAIT UNTIL ELEVEN O'CLOCK",
 1088,
 134318,
 8)

In [14]:
from torch.utils.data import DataLoader
from random import randint

from torchaudio.transforms import MFCC

# Number of MFCC coefficients per frame; also passed to Wav2Letter as its input channel count.
num_features = 13
# MFCC transform applied to each waveform inside collate_fn.
mfcc = MFCC(sample_rate=16000, n_mfcc=num_features)

def collate_fn(batch):
    """Collate raw dataset samples into padded MFCC and target batches.

    Args:
        batch (list): dataset samples, each a tuple
            (waveform, sample_rate, utterance, speaker_id, chapter_id,
            utterance_id). Falsy entries (``None`` for samples that failed
            to load) are dropped.

    Returns:
        tuple: ``(tensors, targets, input_lengths, target_lengths)``
            tensors: MFCC features, shape (batch, num_features, max_frames),
                zero padded along the frame axis.
            targets: sentencepiece ids, shape (batch, max_target_len),
                zero padded.
            input_lengths (list[int]): unpadded frame count of each sample.
            target_lengths (list[int]): unpadded id count of each sample.
        ``(None, None, None, None)`` when no usable sample is left.
    """
    # Filter once so features and targets always come from the same samples.
    samples = [b for b in batch if b]
    if not samples:
        return None, None, None, None

    # pick first channel, apply mfcc, transpose to (frames, features) for pad_sequence
    tensors = [mfcc(s[0])[0, ...].transpose(0, -1) for s in samples]
    targets = [torch.IntTensor(sp.encode_as_ids(s[2])) for s in samples]

    # Record the true lengths before padding hides them.
    input_lengths = [t.shape[0] for t in tensors]
    target_lengths = [len(t) for t in targets]

    tensors = torch.nn.utils.rnn.pad_sequence(tensors, batch_first=True)
    # back to (batch, features, frames), the layout Conv1d expects
    tensors = tensors.transpose(1, -1)

    targets = torch.nn.utils.rnn.pad_sequence(targets, batch_first=True)

    return tensors, targets, input_lengths, target_lengths

# max_tokens = 5000  # max number of tokens per batch
batch_size = 3  # max number of sentences per batch
# Training loader; collate_fn converts raw samples into padded MFCC / target batches.
loader_train = DataLoader(train, batch_size=batch_size, collate_fn=collate_fn)

In [15]:
# Display the first training sample again (same expression as an earlier cell).
next(iter(train))

(tensor([[ 7.9346e-04,  0.0000e+00, -6.4087e-04,  ...,  6.4087e-04,
           2.1362e-04,  6.1035e-05]]),
 16000,
 "YES SIR SAID THE SERVANT SUCH AN INSTRUCTION WAS REMARKABLY PLEASING TO HIM THERE WAS MUCH THAT HE HAD TO DO AND THAT NIGHT'S FREEDOM WOULD ASSIST HIM MATERIALLY PERHAPS KARA HESITATED PERHAPS YOU HAD BETTER WAIT UNTIL ELEVEN O'CLOCK",
 1088,
 134318,
 8)

In [16]:
from torch.utils.data import DataLoader
from random import randint

from torchaudio.transforms import MFCC

# Number of MFCC coefficients per frame; also passed to Wav2Letter as its input channel count.
num_features = 13
# MFCC transform applied to each waveform inside collate_fn.
mfcc = MFCC(sample_rate=16000, n_mfcc=num_features)

def collate_fn(batch):
    """Collate raw dataset samples into padded MFCC and target batches.

    NOTE: a function with this name is also defined in an earlier cell; this
    later definition is the one in effect when the notebook is run top to bottom.

    Args:
        batch (list): dataset samples, each a tuple
            (waveform, sample_rate, utterance, speaker_id, chapter_id,
            utterance_id). Falsy entries (``None`` for samples that failed
            to load) are dropped.

    Returns:
        tuple: ``(tensors, targets, input_lengths, target_lengths)``
            tensors: MFCC features, shape (batch, num_features, max_frames),
                zero padded along the frame axis.
            targets: sentencepiece ids, shape (batch, max_target_len),
                zero padded.
            input_lengths (list[int]): unpadded frame count of each sample.
            target_lengths (list[int]): unpadded id count of each sample.
        ``(None, None, None, None)`` when no usable sample is left.
    """
    # Filter once so features and targets always come from the same samples.
    samples = [b for b in batch if b]
    if not samples:
        return None, None, None, None

    # pick first channel, apply mfcc, transpose to (frames, features) for pad_sequence
    tensors = [mfcc(s[0])[0, ...].transpose(0, -1) for s in samples]
    targets = [torch.IntTensor(sp.encode_as_ids(s[2])) for s in samples]

    # Record the true lengths before padding hides them.
    input_lengths = [t.shape[0] for t in tensors]
    target_lengths = [len(t) for t in targets]

    tensors = torch.nn.utils.rnn.pad_sequence(tensors, batch_first=True)
    # back to (batch, features, frames), the layout Conv1d expects
    tensors = tensors.transpose(1, -1)

    targets = torch.nn.utils.rnn.pad_sequence(targets, batch_first=True)

    return tensors, targets, input_lengths, target_lengths

# max_tokens = 5000  # max number of tokens per batch
batch_size = 3  # max number of sentences per batch
# Training loader; collate_fn converts raw samples into padded MFCC / target batches.
loader_train = DataLoader(train, batch_size=batch_size, collate_fn=collate_fn)

In [17]:
# Same first sample, fetched by index instead of through an iterator.
train[0]

(tensor([[ 7.9346e-04,  0.0000e+00, -6.4087e-04,  ...,  6.4087e-04,
           2.1362e-04,  6.1035e-05]]),
 16000,
 "YES SIR SAID THE SERVANT SUCH AN INSTRUCTION WAS REMARKABLY PLEASING TO HIM THERE WAS MUCH THAT HE HAD TO DO AND THAT NIGHT'S FREEDOM WOULD ASSIST HIM MATERIALLY PERHAPS KARA HESITATED PERHAPS YOU HAD BETTER WAIT UNTIL ELEVEN O'CLOCK",
 1088,
 134318,
 8)

In [18]:
# Smoke test of the loader: print the shapes / lengths of the first batch only.
# a: padded features, b: padded targets, c: input lengths, d: target lengths
# (the order in which collate_fn returns them).
for a, b, c, d in loader_train:
    print(a.shape)
    print(b.shape)
    print(c)
    print(d)
    break  # one batch is enough

torch.Size([3, 13, 1227])
torch.Size([3, 55])
[1203, 627, 1227]
[50, 27, 55]


In [19]:
# NOTE(review): presumably the settings of the `vggtransformer_2` architecture
# named in the reference command at the top of the notebook — confirm.
# None of these names is referenced by the other cells visible in this
# notebook; the Wav2Letter model below uses `num_features` instead.
input_feat_per_channel = 80
vggblock_enc_config = "[(64, 3, 2, 2, True), (128, 3, 2, 2, True)]"
transformer_enc_config = "((1024, 16, 4096, True, 0.15, 0.15, 0.15),) * 16"
enc_output_dim = 1024
tgt_embed_dim = 512
conv_dec_config = "((256, 3, True),) * 4"
transformer_dec_config = "((1024, 16, 4096, True, 0.15, 0.15, 0.15),) * 6"

In [20]:
from torch import nn
class Wav2Letter(nn.Module):
    """Wav2Letter speech recognition model.

    Fully convolutional (1-D) network following the Facebook AI Research
    paper https://arxiv.org/pdf/1609.03193.pdf. It consumes MFCC or power
    spectrum features and emits per-frame log probabilities over classes.

    TODO: use cuda if available

    Args:
        num_features (int): number of input feature channels (e.g. MFCCs)
        num_classes (int): number of unique output class labels
    """

    def __init__(self, num_features, num_classes):
        super().__init__()

        # One (in_channels, out_channels, kernel_size, stride) entry per
        # convolution, listed in the order they are applied.
        conv_specs = (
            [(num_features, 250, 48, 2)]
            + [(250, 250, 7, 1)] * 7
            + [(250, 2000, 32, 1), (2000, 2000, 1, 1), (2000, num_classes, 1, 1)]
        )

        stack = []
        for position, (c_in, c_out, width, step) in enumerate(conv_specs):
            # A ReLU sits between consecutive convolutions; none after the last.
            if position:
                stack.append(nn.ReLU())
            stack.append(nn.Conv1d(c_in, c_out, width, step))

        self.layers = nn.Sequential(*stack)

    def forward(self, batch):
        """Run the convolution stack and normalise over the class axis.

        Args:
            batch (torch.Tensor): mini batch of features,
                shape (batch, num_features, frame_len)

        Returns:
            torch.Tensor: log probabilities,
                shape (batch_size, num_classes, output_len)
        """
        # raw scores, shape (batch_size, num_classes, output_len);
        # log-softmax over dim 1 turns them into per-frame class log probs
        return nn.functional.log_softmax(self.layers(batch), dim=1)

# Number of output classes of the network.
# NOTE(review): presumably meant to match the vocabulary size of the loaded
# sentencepiece model — confirm, since target ids must be < vocab_size.
vocab_size = 5000
model = Wav2Letter(num_features, vocab_size)

In [None]:
import torchaudio
from torch.optim import Adadelta

# Adadelta hyper-parameters (same values as the reference command at the top of the notebook).
optimizer_params = {
    "lr": 1.0,
    "eps": 1e-8,
    "rho": 0.95,
}
optimizer = Adadelta(model.parameters(), **optimizer_params)

max_epoch = 2 # 80
clip_norm = 10.  # gradient-norm ceiling applied before each optimizer step

criterion = torch.nn.CTCLoss()

max_files = 10  # number of batches to train on per epoch (keeps the demo short)

# Make sure the model is in training mode even if an evaluation cell ran before.
model.train()

for epoch in range(max_epoch):

    i_files = 0
    for inputs, targets, _, target_lengths in loader_train:
        print(i_files, max_files)

        # collate_fn returns Nones when no sample of the batch could be loaded
        if inputs is None or targets is None:
            continue

        # model output: (batch, num_classes, output_len)
        outputs = model(inputs)

        # CTCLoss expects (output_len, batch, num_classes)
        outputs = outputs.permute(2, 0, 1)

        # NOTE(review): every sample is given the full (padded) output length;
        # the true per-sample lengths from collate_fn are discarded (`_`)
        # because the convolutions shorten the sequence. Consider deriving
        # the per-sample output lengths instead.
        input_lengths = [outputs.shape[0]] * outputs.shape[1]
        loss = criterion(outputs, targets, input_lengths, target_lengths)

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip_norm)
        optimizer.step()

        # Count the batch first, then test the bound, so that exactly
        # `max_files` batches are trained per epoch.
        i_files += 1
        if i_files >= max_files:
            break

0 10
1 10
2 10
3 10
4 10
5 10
6 10


In [None]:
from torch import topk

def GreedyDecoder(ctc_matrix, blank_label=0):
    """Greedy (best-path) decoder: most probable class at every time step.

    Only the first item of the batch is decoded. Blank labels are NOT
    collapsed yet, so ``blank_label`` is currently unused.
    # TODO: collapse blank labels

    Args:
        ctc_matrix (torch.Tensor):
            shape (1, num_classes, output_len)
        blank_label (int): blank label to collapse (not used yet)

    Returns:
        torch.Tensor: class label per time step,
         shape (ctc timesteps)
    """
    # topk yields (values, indices); the indices along dim=1 are the class ids
    _, best_classes = topk(ctc_matrix, k=1, dim=1)
    # drop the batch axis and the singleton k axis
    return best_classes[0][0]

In [None]:
# Decode the first sample of the last training batch and compare with its target.
sample = inputs[0]
target = targets[0]

# Reference ids (includes any zero padding added by collate_fn).
print(target)

# nn.Module.eval() takes no arguments: it only switches the module to
# evaluation mode. The forward pass is a separate call.
model.eval()
with torch.no_grad():
    # add the batch dimension: (num_features, frames) -> (1, num_features, frames)
    output = model(sample.unsqueeze(0))
output = GreedyDecoder(output)

print(output)