In [1]:
import cProfile

pr = cProfile.Profile()
pr.enable()

[example](https://github.com/LearnedVector/Wav2Letter/blob/master/Google%20Speech%20Command%20Example.ipynb)

In [2]:
import torch
import torchaudio
from torchaudio.datasets import SPEECHCOMMANDS


In [3]:
torchaudio.set_audio_backend("soundfile")

In [4]:
labels = [
        '-', '*',
        "backward",
        "bed",
        "bird",
        "cat",
        "dog",
        "down",
        "eight",
        "five",
        "follow",
        "forward",
        "four",
        "go",
        "happy",
        "house",
        "learn",
        "left",
        "marvin",
        "nine",
        "no",
        "off",
        "on",
        "one",
        "right",
        "seven",
        "sheila",
        "six",
        "stop",
        "three",
        "tree",
        "two",
        "up",
        "visual",
        "wow",
        "yes",
        "zero",
]

import collections


def build_mapping(labels):
    labels = list(collections.OrderedDict.fromkeys(list("".join(labels))))
    enumerated = list(enumerate(labels))
    flipped = [(sub[1], sub[0]) for sub in enumerated]

    d1 = collections.OrderedDict(enumerated)
    d2 = collections.OrderedDict(flipped)
    return {**d1, **d2}

def padding(l, max_length, fillwith):
    return l  + [fillwith] * (max_length-len(l))

def map_with_dict(mapping, l):
    return [mapping[t] for t in l]

def apply_with_padding(l, mapping, max_length, fillwith):
    l = map_with_dict(mapping, l)
    l = padding(l, max_length, mapping["*"])
    return l


test = "house"
max_length = max(map(len, labels))
vocab_size = len(labels) + 2

mapping = build_mapping(labels)

# test = apply(mapping, test)
# test = padding(test, max_length, mapping["*"])

encode = lambda l: apply_with_padding(l, mapping, max_length, mapping["*"])
decode = lambda l: apply_with_padding(l, mapping, max_length, mapping[1])

decode(encode(test))

['h', 'o', 'u', 's', 'e', '*', '*', '*']

In [5]:
from torchaudio.transforms import MFCC

num_features = 13

melkwargs = {
    'n_fft': 512,
    'n_mels': 20,
    'hop_length': 80,
}

mfcc = MFCC(sample_rate=16000, n_mfcc=num_features, melkwargs=melkwargs)

# audio, self.sr, window_stride=(160, 80),
# fft_size=512, num_filt=20, num_coeffs=13

def process_waveform(waveform):
    # pick first channel, apply mfcc, tranpose for pad_sequence
    return mfcc(waveform)[0, ...].transpose(0, -1)

def process_target(target):

    # targets = []
    # for b in batch:
    #     if b:
    #         token = sp.encode_as_pieces(b[2])
    #         print(len(token))
    #         token = " ".join(token)
    #         targets.append(token)

    # return " ".join(sp.encode_as_ids(target))
    
    # return torch.IntTensor(sp.encode_as_ids(target))
    # print(target)
    return torch.IntTensor(encode(target))

In [6]:
class PROCESSED_SPEECHCOMMANDS(SPEECHCOMMANDS):
    def __getitem__(self, n):
        return self._process(super().__getitem__(n))
        
    def _process(self, item):
        # waveform, sample_rate, label, speaker_id, utterance_number
        waveform = process_waveform(item[0])
        label = process_target(item[2])
        return waveform, label

    def __next__(self):
        return self._process(super().__next__())
    

In [7]:
class MemoryCache(torch.utils.data.Dataset):
    """
    Wrap a dataset so that, whenever a new item is returned, it is saved to disk.
    """
                                                      
    def __init__(self, dataset):
        self.dataset = dataset                                                   
        self._id = id(self)                         
        self._cache = [None] * len(dataset)
                                  
    def __getitem__(self, n):      
        if self._cache[n]:                             
            return self._cache[n]          
                               
        item = self.dataset[n]          
        self._cache[n] = item             
                                          
        return item                       
                                  
    def __len__(self):                 
        return len(self.dataset) 

In [8]:
#     waveform, sample_rate, label, speaker_id, utterance_number

def datasets():

    download = True
    root = "./"

    dataset = PROCESSED_SPEECHCOMMANDS(root, download=download)
    dataset = MemoryCache(dataset)

    return dataset

In [9]:
train = datasets()

In [10]:
from torch.utils.data import DataLoader
from random import randint



def collate_fn(batch):

    tensors = [b[0] for b in batch if b]
    targets = [b[1] for b in batch if b]

    # tensors = [process_waveform(b[0]) for b in batch if b]
    # targets = [process_target(b[2]) for b in batch if b]

    # truncate tensor list
    # length = 2**10
    # a = max(0, min([tensor.shape[-1] for tensor in tensors]) - length)
    # m = randint(0, a)
    # n = m + length
    # tensors = [t[..., m:n] for t in tensors]
    
    input_lengths = [t.shape[0] for t in tensors]
    target_lengths = [len(t) for t in targets]

    targets = torch.nn.utils.rnn.pad_sequence(targets, batch_first=True)
    tensors = torch.nn.utils.rnn.pad_sequence(tensors, batch_first=True)
    tensors = tensors.transpose(1, -1)
    return tensors, targets, input_lengths, target_lengths


batch_size = 128  # max number of sentences per batch
loader_train = DataLoader(train, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)

In [11]:
#     waveform, sample_rate, label, speaker_id, utterance_number

def datasets():

    download = True
    root = "./"

    dataset = SPEECHCOMMANDS(root, download=download)

    return dataset

train = datasets()

In [12]:
from torch import nn


class PrintLayer(nn.Module):
    def __init__(self):
        super(PrintLayer, self).__init__()
    
    def forward(self, x):
        # Do your print / debug stuff here
        print(x)
        return x
    
    

class Wav2Letter(nn.Module):
    """Wav2Letter Speech Recognition model
        Architecture is based off of Facebooks AI Research paper
        https://arxiv.org/pdf/1609.03193.pdf
        This specific architecture accepts mfcc or
        power spectrums speech signals
        TODO: use cuda if available
        Args:
            num_features (int): number of mfcc features
            num_classes (int): number of unique grapheme class labels
    """

    def __init__(self, num_features, num_classes):
        super(Wav2Letter, self).__init__()

        # Conv1d(in_channels, out_channels, kernel_size, stride)
        self.layers = nn.Sequential(
            # PrintLayer(),
            nn.Conv1d(num_features, 250, 48, 2),
            nn.ReLU(),
            nn.Conv1d(250, 250, 7),
            nn.ReLU(),
            nn.Conv1d(250, 250, 7),
            nn.ReLU(),
            nn.Conv1d(250, 250, 7),
            nn.ReLU(),
            nn.Conv1d(250, 250, 7),
            nn.ReLU(),
            # nn.Conv1d(250, 250, 7),
            # nn.ReLU(),
            # nn.Conv1d(250, 250, 7),
            # nn.ReLU(),
            nn.Conv1d(250, 250, 7),
            nn.ReLU(),
            nn.Conv1d(250, 2000, 32),
            nn.ReLU(),
            nn.Conv1d(2000, 2000, 1),
            nn.ReLU(),
            nn.Conv1d(2000, num_classes, 1),
        )

    def forward(self, batch):
        """Forward pass through Wav2Letter network than 
            takes log probability of output
        Args:
            batch (int): mini batch of data
             shape (batch, num_features, frame_len)
        Returns:
            log_probs (torch.Tensor):
                shape  (batch_size, num_classes, output_len)
        """
        # y_pred shape (batch_size, num_classes, output_len)
        y_pred = self.layers(batch)

        # compute log softmax probability on graphemes
        log_probs = nn.functional.log_softmax(y_pred, dim=1)

        return log_probs


model = Wav2Letter(num_features, vocab_size)

In [13]:
import torchaudio
from torch.optim import Adadelta

model = Wav2Letter(num_features, vocab_size)

optimizer_params = {
    "lr": 1.0,
    "eps": 1e-8,
    "rho": 0.95,
}
optimizer = Adadelta(model.parameters(), **optimizer_params)

max_epoch = 10
clip_norm = 10.

criterion = torch.nn.CTCLoss()

max_files = 100

from tqdm import tqdm

for epoch in range(max_epoch):
    # print(epoch)
    
    i_files = 0
    for inputs, targets, _, target_lengths in tqdm(loader_train):
        
        if i_files > max_files:
            break

        if inputs is None or targets is None:
            continue

        # print("input", inputs.shape)
        outputs = model(inputs)
        # (input length, batch size, number of classes)
        # input_lengths = [len(o) for o in outputs]

        outputs = outputs.transpose(1, 2).transpose(0, 1)
        # print("output", outputs.shape)
        # print("target", targets.shape)
        
        # print(inputs.shape)
        # print(outputs.shape)
        # print(targets.shape)
        # print(len(targets))
        # print(targets.shape)
        # print(input_lengths)
        # input_lengths = [len(o) for o in outputs]
        # print(len(input_lengths))
        # target_lengths = [len(t) for t in targets]
        # print(target_lengths)
        # ctc_loss(input, target, input_lengths, target_lengths)

        # input_lengths = [outputs.shape[0]] * outputs.shape[1]
        
        # CTC arguments
        # https://pytorch.org/docs/master/nn.html#torch.nn.CTCLoss
        # better definitions for ctc arguments
        # https://discuss.pytorch.org/t/ctcloss-with-warp-ctc-help/8788/3
        mini_batch_size = len(inputs)
        
        input_lengths = torch.full((mini_batch_size,), outputs.shape[0], dtype=torch.long)
        target_lengths = torch.IntTensor([target.shape[0] for target in targets])
        
        # print(torch.isnan(outputs).any())
        # print(torch.isnan(targets).any())
        # print(torch.isnan(input_lengths).any())
        # print(torch.isnan(target_lengths).any())
        # print(outputs.shape)
        # print(targets.shape)
        # print(input_lengths.shape)
        # print(target_lengths.shape)

        # outputs: input length, batch size, number of classes (including blank) 
        # targets: batch size, max target length
        # input_lengths: batch size
        # target_lengths: batch size
        loss = criterion(outputs, targets, input_lengths, target_lengths)

        # print("stepping")
        optimizer.zero_grad()
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(model.parameters(), clip_norm)
        optimizer.step()
        
        i_files += 1
        
    print(epoch, loss)

 12%|█▏        | 101/827 [53:21<1:47:36,  8.89s/it]  
  0%|          | 0/827 [00:00<?, ?it/s][A

0 tensor(2.8296, grad_fn=<MeanBackward0>)



  0%|          | 1/827 [00:08<1:51:41,  8.11s/it][A
  0%|          | 2/827 [00:16<1:51:05,  8.08s/it][A
  0%|          | 3/827 [00:23<1:50:05,  8.02s/it][A
  0%|          | 4/827 [00:32<1:51:31,  8.13s/it][A
  1%|          | 5/827 [00:41<1:54:18,  8.34s/it][A
  1%|          | 6/827 [00:49<1:53:05,  8.26s/it][A
  1%|          | 7/827 [00:57<1:52:54,  8.26s/it][A
  1%|          | 8/827 [01:05<1:51:13,  8.15s/it][A
  1%|          | 9/827 [01:13<1:49:51,  8.06s/it][A
  1%|          | 10/827 [01:21<1:49:29,  8.04s/it][A
  1%|▏         | 11/827 [01:29<1:48:19,  7.97s/it][A
  1%|▏         | 12/827 [01:36<1:47:37,  7.92s/it][A
  2%|▏         | 13/827 [01:44<1:46:51,  7.88s/it][A
  2%|▏         | 14/827 [01:52<1:45:47,  7.81s/it][A
  2%|▏         | 15/827 [02:00<1:45:42,  7.81s/it][A
  2%|▏         | 16/827 [02:07<1:45:41,  7.82s/it][A
  2%|▏         | 17/827 [02:15<1:45:37,  7.82s/it][A
  2%|▏         | 18/827 [02:23<1:45:08,  7.80s/it][A
  2%|▏         | 19/827 [02:31<1:44:

1 tensor(2.5557, grad_fn=<MeanBackward0>)




  0%|          | 1/827 [00:07<1:43:51,  7.54s/it][A[A

  0%|          | 2/827 [00:15<1:44:48,  7.62s/it][A[A

  0%|          | 3/827 [00:22<1:44:37,  7.62s/it][A[A

  0%|          | 4/827 [00:30<1:44:20,  7.61s/it][A[A

  1%|          | 5/827 [00:38<1:43:43,  7.57s/it][A[A

  1%|          | 6/827 [00:45<1:45:06,  7.68s/it][A[A

  1%|          | 7/827 [00:54<1:48:57,  7.97s/it][A[A

  1%|          | 8/827 [01:02<1:47:37,  7.88s/it][A[A

  1%|          | 9/827 [01:10<1:50:45,  8.12s/it][A[A

  1%|          | 10/827 [01:18<1:49:12,  8.02s/it][A[A

  1%|▏         | 11/827 [01:27<1:50:38,  8.14s/it][A[A

  1%|▏         | 12/827 [01:35<1:50:17,  8.12s/it][A[A

  2%|▏         | 13/827 [01:46<2:04:36,  9.18s/it][A[A

  2%|▏         | 14/827 [01:55<2:00:56,  8.92s/it][A[A

  2%|▏         | 15/827 [02:03<1:57:27,  8.68s/it][A[A

  2%|▏         | 16/827 [02:11<1:54:13,  8.45s/it][A[A

  2%|▏         | 17/827 [02:19<1:53:53,  8.44s/it][A[A

  2%|▏         | 18/8

2 tensor(2.5633, grad_fn=<MeanBackward0>)





  0%|          | 1/827 [00:07<1:42:29,  7.45s/it][A[A[A


  0%|          | 2/827 [00:15<1:42:54,  7.48s/it][A[A[A


  0%|          | 3/827 [00:22<1:43:06,  7.51s/it][A[A[A


  0%|          | 4/827 [00:29<1:42:08,  7.45s/it][A[A[A


  1%|          | 5/827 [00:37<1:42:05,  7.45s/it][A[A[A


  1%|          | 6/827 [00:45<1:43:14,  7.54s/it][A[A[A


  1%|          | 7/827 [00:52<1:43:06,  7.54s/it][A[A[A


  1%|          | 8/827 [01:00<1:42:43,  7.53s/it][A[A[A


  1%|          | 9/827 [01:07<1:43:19,  7.58s/it][A[A[A


  1%|          | 10/827 [01:15<1:44:26,  7.67s/it][A[A[A


  1%|▏         | 11/827 [01:23<1:44:30,  7.68s/it][A[A[A


  1%|▏         | 12/827 [01:31<1:44:07,  7.67s/it][A[A[A


  2%|▏         | 13/827 [01:38<1:43:36,  7.64s/it][A[A[A


  2%|▏         | 14/827 [01:46<1:42:38,  7.58s/it][A[A[A


  2%|▏         | 15/827 [01:53<1:42:38,  7.58s/it][A[A[A


  2%|▏         | 16/827 [02:01<1:42:01,  7.55s/it][A[A[A


  2%|▏        

KeyboardInterrupt: 

In [28]:
from torch import topk

def GreedyDecoder(ctc_matrix, blank_label=0):
    """Greedy Decoder. Returns highest probability of
        class labels for each timestep
        # TODO: collapse blank labels
    Args:
        ctc_matrix (torch.Tensor): 
            shape (1, num_classes, output_len)
        blank_label (int): blank labels to collapse
    
    Returns:
        torch.Tensor: class labels per time step.
         shape (ctc timesteps)
    """
    _, indices = topk(ctc_matrix, k=1, dim=1)
    return indices[:, 0, :]

In [30]:
output[0, 0, :]

tensor([-0.7700, -0.7678, -0.7659, -0.7641, -0.7627, -0.7614, -0.7602, -0.7592,
        -0.7584, -0.7579, -0.7575, -0.7571, -0.7568, -0.7573, -0.7576, -0.7583],
       grad_fn=<SliceBackward>)

In [29]:
sample = inputs[0].unsqueeze(0)
target = targets[0]

print(decode(targets[0].tolist()))

output = model(sample)
print(output.shape)

greedy_output = GreedyDecoder(output)

print(greedy_output.shape)
print(greedy_output)
print(decode(greedy_output.tolist()[0]))

['o', 'n', '*', '*', '*', '*', '*', '*']
torch.Size([1, 39, 16])
torch.Size([1, 16])
tensor([[9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9]])
['e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e']


In [16]:
pr.disable()
pr.print_stats(sort='time')

         15923535 function calls (15545921 primitive calls) in 5101.043 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
      342 2324.966    6.798 2324.966    6.798 {method 'run_backward' of 'torch._C._EngineBase' objects}
     3087 2126.677    0.689 2126.677    0.689 {built-in method conv1d}
    37883  502.738    0.013  502.738    0.013 {built-in method norm}
    12312   23.398    0.002   23.398    0.002 {method 'add' of 'torch._C._TensorBase' objects}
    37883   14.301    0.000   14.800    0.000 soundfile.py:1314(_cdata_io)
    37883   14.047    0.000   15.030    0.000 soundfile.py:1160(_open)
       20   10.930    0.546   10.930    0.546 {method 'control' of 'select.kqueue' objects}
    18468    8.064    0.000    8.064    0.000 {method 'mul_' of 'torch._C._TensorBase' objects}
    12312    6.508    0.001    6.508    0.001 {method 'addcmul_' of 'torch._C._TensorBase' objects}
      342    6.201    0.018   55.031    0.16

      164    0.000    0.000    0.000    0.000 py3compat.py:19(encode)
       36    0.000    0.000    0.000    0.000 module.py:136(register_parameter)
      386    0.000    0.000    0.000    0.000 sre_parse.py:172(append)
        1    0.000    0.000    0.000    0.000 tarfile.py:30(<module>)
       31    0.000    0.000    0.000    0.000 traitlets.py:988(__init__)
       22    0.000    0.000    0.002    0.000 ipkernel.py:143(set_parent)
       60    0.000    0.000    0.000    0.000 signal.py:35(_enum_to_int)
       72    0.000    0.000    0.000    0.000 zmqstream.py:351(update_flag)
        1    0.000    0.000    0.000    0.000 umath.py:7(<module>)
        1    0.000    0.000    0.000    0.000 _recursive.py:62(infer_concrete_type_builder)
        4    0.000    0.000    0.005    0.001 std.py:787(__init__)
      122    0.000    0.000    0.000    0.000 compilerop.py:138(extra_flags)
      115    0.000    0.000    0.000    0.000 sre_parse.py:343(_escape)
        1    0.000    0.000    0.000  

        9    0.000    0.000    0.000    0.000 observer.py:12(_with_args)
       52    0.000    0.000    0.000    0.000 {method 'translate' of 'str' objects}
       22    0.000    0.000    0.000    0.000 codecs.py:260(__init__)
        1    0.000    0.000    0.000    0.000 traceback.py:366(from_list)
       31    0.000    0.000    0.000    0.000 loader.py:252(__getitem__)
       12    0.000    0.000    0.001    0.000 formatters.py:220(catch_format_error)
        9    0.000    0.000    0.000    0.000 formatters.py:331(__call__)
        3    0.000    0.000    0.000    0.000 base_events.py:1668(_add_callback)
       23    0.000    0.000    0.000    0.000 locks.py:227(clear)
       32    0.000    0.000    0.000    0.000 _type_aliases.py:68(<genexpr>)
        1    0.000    0.000    0.000    0.000 _asarray.py:5(<module>)
        1    0.000    0.000    0.000    0.000 utils.py:133(_get_indent)
        1    0.000    0.000    0.000    0.000 core.py:6358(__new__)
       15    0.000    0.000    0.0

        1    0.000    0.000    0.000    0.000 widget_box.py:112(GridBox)
        1    0.000    0.000    0.000    0.000 widget_string.py:111(Password)
        1    0.000    0.000    0.000    0.000 beta.py:10(Beta)
        1    0.000    0.000    0.000    0.000 transforms.py:29(Transform)
        1    0.000    0.000    0.000    0.000 fishersnedecor.py:10(FisherSnedecor)
        1    0.000    0.000    0.000    0.000 half_cauchy.py:11(HalfCauchy)
        1    0.000    0.000    0.000    0.000 pareto.py:8(Pareto)
        1    0.000    0.000    0.000    0.000 logistic_normal.py:8(LogisticNormal)
        1    0.000    0.000    0.000    0.000 relaxed_categorical.py:10(ExpRelaxedCategorical)
        1    0.000    0.000    0.000    0.000 relaxed_categorical.py:87(RelaxedOneHotCategorical)
        1    0.000    0.000    0.000    0.000 studentT.py:10(StudentT)
        1    0.000    0.000    0.000    0.000 conv_relu.py:11(ConvReLU2d)
        1    0.000    0.000    0.000    0.000 stubs.py:2(<module>)


        1    0.000    0.000    0.000    0.000 lr_scheduler.py:354(MultiStepLR)
        1    0.000    0.000    0.000    0.000 lr_scheduler.py:399(ExponentialLR)
        1    0.000    0.000    0.000    0.000 lr_scheduler.py:1005(OneCycleLR)
        1    0.000    0.000    0.000    0.000 reductions.py:22(StorageWeakRef)
        1    0.000    0.000    0.000    0.000 reductions.py:54(_after_fork)
        1    0.000    0.000    0.000    0.000 reductions.py:41(SharedCache)
        1    0.000    0.000    0.000    0.000 context.py:196(get_start_method)
        1    0.000    0.000    0.000    0.000 context.py:202(reducer)
        1    0.000    0.000    0.000    0.000 context.py:228(__init__)
        1    0.000    0.000    0.000    0.000 context.py:279(SpawnProcess)
        1    0.000    0.000    0.000    0.000 context.py:297(SpawnContext)
        1    0.000    0.000    0.000    0.000 context.py:301(ForkServerContext)
        1    0.000    0.000    0.000    0.000 process.py:36(current_process)
   