In [1]:
import os
# Work from the homework directory so the relative ./data and ./models paths used below resolve.
os.chdir('../HW4')

In [2]:
# Dev split: spectrogram array plus its transcript file (read one transcript per line).
dev_spectrograms_path = './data/dev.npy'
dev_labels_path = './data/dev_transcripts_cleaned.txt'

# Test split: spectrograms only; no transcript file is supplied, so the label path stays None.
test_spectrograms_path = './data/test.npy'
test_labels_path = None

In [3]:
# Import all the required libraries.
import torch # deep learning framework
import torch.nn as nn
import torchaudio # time masking and frequency masking (SpecAugment) for regularizing the models
import time # to keep track of the time for each pass and iteration
import numpy as np # array manipulation library

from tqdm.notebook import tqdm # progress bar for the prediction loop

In [4]:
from ctcdecode import CTCBeamDecoder # github:parlance/ctcdecode; For beam decoding the model.
import Levenshtein # to measure the edit distance of the prediction from ground truth

In [5]:
# Run-wide settings, looked up by key in the cells below.
configuration = {
    'batchsize':32, 
    'learning_rate':5e-4,
    'weight_decay':5e-5,
    'freq_masking':[3, 10], # frequency masking
    'time_masking':[2, 40], # time masking
    'num_workers':4, # worker count passed to the DataLoaders
    'cuda':True, # request the GPU; DEVICE is derived from this flag
    'num_processes':os.cpu_count(), # process count handed to the CTC beam decoder
    'experiment_name':'Wav2letterProj_002' # final model
}

In [6]:
# Honour the config flag only when a CUDA device is actually present, so the
# notebook falls back to the CPU instead of failing on the first .to(DEVICE).
CUDA = bool(configuration['cuda']) and torch.cuda.is_available()
DEVICE = torch.device('cuda') if CUDA else torch.device('cpu')
print(f"Using device: {DEVICE} and cuda available: {torch.cuda.is_available()}")

Using device: cuda and cuda available: True


In [7]:
# Character vocabulary: a token's position in this list is its integer id.
label_map = ['<sos>', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '-', "'", '.', '_', '+', ' ', '<eos>']
# Reverse lookup: character -> integer id.
label_map_c2i = {c:i for i, c in enumerate(label_map)}
print("Total unique characters in the token set: ", len(label_map))

Total unique characters in the token set:  34


In [8]:
# Placeholder text for ids that cannot be looked up in label_map; it is appended to the
# decoder's label list and stripped again by HelperFns.normalizeS.
BLANK_TAG = '<b>'

In [9]:
# No transforms are configured: both datasets return their spectrograms unmodified.
dev_audio_transforms = None
test_audio_transforms = None

print(dev_audio_transforms)
print(test_audio_transforms)

None
None


In [10]:
class HW4P2datasetProj(torch.utils.data.Dataset):
    """Spectrogram dataset with optional character-level transcripts.

    Args:
        infile: path of a ``.npy`` file holding one spectrogram per entry.
        outfile: optional text file with one transcript per line; when it is
            omitted every item gets the placeholder label ``[-1]``.
        label_map_c2i: character -> integer id mapping used to encode the
            transcripts (required whenever ``outfile`` is given).
        transforms: optional callable applied to the transposed spectrogram.
    """

    def __init__(self, infile, outfile=None, label_map_c2i=None, transforms=None):
        started = time.time()
        # NOTE: allow_pickle=True unpickles object arrays -- only load trusted files.
        self.spectrograms = np.load(infile, allow_pickle=True)
        print(f"Spectrograms-> {infile} -> {len(self.spectrograms)}")
        self.label_map_c2i = label_map_c2i
        self.labels = None
        if outfile is not None:
            with open(outfile) as handle:
                transcripts = handle.read().splitlines()
            # Encode every transcript character by character; labels are stored as-is, no offset.
            self.labels = [[self.label_map_c2i[ch] for ch in text] for text in transcripts]
            print(f"Labels-> {outfile} -> {len(self.labels)}")
        self.transforms = transforms
        finished = time.time()
        print(f"Loaded in {finished-started:3.3f} secs")

    def __len__(self):
        """Number of spectrograms in the loaded array."""
        return len(self.spectrograms)

    def __getitem__(self, idx):
        """Return ``(float32 spectrogram, int32 label ids)`` for entry ``idx``."""
        features = torch.from_numpy(self.spectrograms[idx]).float()
        if self.transforms is not None:
            # The transform receives the transposed spectrogram; transpose its result back.
            features = self.transforms(features.T).T
        # Without transcripts a single -1 stands in for the label sequence.
        target_ids = [-1] if self.labels is None else self.labels[idx]
        target = torch.from_numpy(np.array(target_ids)).int()
        return features, target

def collate_function(batch):
    """Pad a list of ``(spectrogram, label)`` pairs into batch tensors.

    Args:
        batch: sequence of items whose element 0 is a spectrogram tensor and
            element 1 a label tensor; lengths are taken along dim 0.

    Returns:
        A 5-tuple ``(spectrograms, labels, spectrogram_lens, label_lens, len_ratios)``:
        zero-padded batch-first spectrograms and labels, LongTensors with the
        unpadded lengths, and a floating-point tensor holding each spectrogram
        length divided by the longest length in the batch.
    """
    spectrograms = [item[0] for item in batch]
    labels = [item[1] for item in batch]

    spectrograms_ = torch.nn.utils.rnn.pad_sequence(spectrograms, batch_first=True)
    labels_ = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True)

    spectrograms_lens = torch.LongTensor([len(s) for s in spectrograms])
    labels_lens = torch.LongTensor([len(l) for l in labels])

    # Divide in floating point explicitly: "/" between integer tensors floor-divides
    # (or raises) on older PyTorch releases, which would corrupt the ratios.
    len_ratios = spectrograms_lens.float() / spectrograms_lens.max().float()
    return spectrograms_, labels_, spectrograms_lens, labels_lens, len_ratios

In [11]:
# Dev split: spectrograms together with their encoded transcripts.
dev_dataset = HW4P2datasetProj(infile=dev_spectrograms_path, 
                               outfile=dev_labels_path, 
                               label_map_c2i=label_map_c2i,
                               transforms=dev_audio_transforms)

# Test split: outfile is None here, so every item carries the placeholder label.
test_dataset = HW4P2datasetProj(infile=test_spectrograms_path, 
                                outfile=test_labels_path,  
                                label_map_c2i=label_map_c2i, 
                                transforms=test_audio_transforms)

Spectrograms-> ./data/dev.npy -> 2703
Labels-> ./data/dev_transcripts_cleaned.txt -> 2703
Loaded in 0.200 secs
Spectrograms-> ./data/test.npy -> 2620
Loaded in 0.142 secs


In [12]:
# Evaluation-only loaders: neither split is shuffled, so batch composition, the
# decodes logged by test_model and the order of the returned predictions are the
# same on every run and follow dataset order.
dev_data = torch.utils.data.DataLoader(dev_dataset,
                                       shuffle=False,
                                       batch_size=configuration['batchsize'],
                                       collate_fn=collate_function,
                                       num_workers=configuration['num_workers'],
                                       pin_memory=True)
test_data = torch.utils.data.DataLoader(test_dataset,
                                        shuffle=False,
                                        batch_size=configuration['batchsize'],
                                        collate_fn=collate_function,
                                        num_workers=configuration['num_workers'],
                                        pin_memory=True)

In [13]:
# Small utilities for converting between token ids and text and for scoring decodes.
class HelperFns:
    """Id <-> text conversion helpers plus an edit-distance scorer."""

    def __init__(self, label_map, label_map_c2i):
        self.label_map = label_map
        self.label_map_c2i = label_map_c2i
        self.distance = Levenshtein.distance

    def convertID2Sentence(self, I):
        """Concatenate the label_map entries for ids ``I`` and normalize the result."""
        return self.normalizeS("".join(self.label_map[token_id] for token_id in I))

    def convertID2SentenceRaw(self, I):
        """Like convertID2Sentence, but any id whose lookup fails contributes BLANK_TAG."""
        pieces = []
        for token_id in I:
            try:
                piece = self.label_map[token_id]
            except Exception:
                piece = BLANK_TAG
            pieces.append(piece)
        return self.normalizeS("".join(pieces))

    def calculate_cer(self, S1, S2):
        """Return ``self.distance`` between the two normalized strings.

        The value is the raw distance; it is not divided by any length here.
        """
        cleaned_first = self.normalizeS(S1)
        cleaned_second = self.normalizeS(S2)
        return self.distance(cleaned_first, cleaned_second)

    def normalizeS(self, S):
        """Remove BLANK_TAG, '<sos>' and '<eos>' (in that order), then strip whitespace."""
        for tag in (BLANK_TAG, '<sos>', '<eos>'):
            S = S.replace(tag, "")
        return S.strip()

    def convertSentence2IDs(self, sentence):
        """Map every character of ``sentence`` to its id via label_map_c2i."""
        return [self.label_map_c2i[ch] for ch in sentence]

In [14]:
# Shared helper instance; decode_logits and test_model read it as a global.
helper = HelperFns(label_map, label_map_c2i)

In [15]:
# Round-trip check: encode a sentence to ids, then decode the ids back to text.
t_ = helper.convertSentence2IDs('this is a test sentence')
print(t_)
print(helper.convertID2SentenceRaw(t_))

[20, 8, 9, 19, 32, 9, 19, 32, 1, 32, 20, 5, 19, 20, 32, 19, 5, 14, 20, 5, 14, 3, 5]
this is a test sentence


In [16]:
def decode_logits(logits, data_lens, beam_width=1):
    """Beam-decode a batch of model outputs into normalized transcripts.

    Args:
        logits: model output; it is permuted with ``(1, 0, 2)`` before being
            handed to the decoder, which is told the values are log-probabilities.
        data_lens: per-example lengths passed to the decoder.
        beam_width: beam size given to ``CTCBeamDecoder``.

    Returns:
        A list with one string per example: the first beam converted to text,
        or ``""`` when that beam has length 0.
    """
    decoder = CTCBeamDecoder(labels=label_map + [BLANK_TAG],
                             beam_width=beam_width,
                             num_processes=configuration['num_processes'],
                             log_probs_input=True)
    # Only the beam ids and their lengths are used; scores and timesteps are ignored.
    beams, _scores, _timesteps, beam_lens = decoder.decode(logits.permute(1, 0, 2), data_lens)

    def first_beam_text(beam, lens):
        first_len = lens[0]
        if first_len > 0:
            return helper.convertID2SentenceRaw((beam[0, :first_len].numpy()).tolist())
        return ""

    return [first_beam_text(beam, lens) for beam, lens in zip(beams, beam_lens)]

@torch.no_grad()
def test_model(model, data, loss, current_epoch=1, decode=False, beam_width=1):
    """Evaluate ``model`` on ``data`` and optionally decode its outputs.

    Args:
        model: network called on the padded spectrogram batch; dim 0 of its
            output is used as the output sequence length.
        data: sized iterable of batches indexed as
            ``(spectrograms, labels, spectrogram_lens, label_lens, len_ratios)``.
        loss: criterion called as ``loss(logits, labels, output_lens, label_lens)``
            with the last three arguments on the CPU.
        current_epoch: only used in the log lines.
        decode: when True, decode every batch, accumulate the edit distance
            against the reference and write per-example results to
            ``sample_decodes.txt``.
        beam_width: beam size forwarded to ``decode_logits``.

    Returns:
        ``(mean loss per batch, mean edit distance per decoded example)``.
        Both are 0.0 when there was nothing to average over.
    """
    model.eval()
    stime = time.time()
    len_data = len(data)
    print(f"Started Testing Epoch: {current_epoch}")
    total_loss = 0.0
    average_cer = 0.0  # running SUM of distances; divided by nexamples when reported
    nexamples = 0
    # The file is (re)created even when decode is False, as before; the context
    # manager guarantees it is closed if a batch raises.
    with open('sample_decodes.txt', 'w') as outwriterfile:
        for i, batch in enumerate(data):
            spectrograms = batch[0].to(DEVICE)
            # Targets and lengths are only consumed on the CPU (loss call, decoding),
            # so they are not shipped to DEVICE and back.
            labels, labels_lens, input_len_ratio = batch[1], batch[3], batch[4]
            output_logits = model(spectrograms)
            seq_length = output_logits.size(0)
            # Scale each example's length ratio to the model's output length
            # (out of place, so the batch tensor is left untouched).
            output_lens = (input_len_ratio * int(seq_length)).int()

            l = loss(output_logits, labels.cpu(), output_lens.cpu(), labels_lens.cpu())
            # It is efficient to decode the transcripts while we have the model predictions.
            if decode:
                decoded_transcripts = decode_logits(output_logits, output_lens, beam_width=beam_width)
                for ii, (decoded_transcript, label) in enumerate(zip(decoded_transcripts, labels)):
                    nexamples += 1
                    label_string = helper.convertID2SentenceRaw((label[:labels_lens[ii]].cpu().numpy()).tolist())
                    CER = helper.calculate_cer(decoded_transcript, label_string)
                    average_cer += CER
                    print(f"{nexamples} label_string: {label_string}", file=outwriterfile)
                    print(f"{nexamples} decoded_transcript: {decoded_transcript}", file=outwriterfile)
                    print(f"{nexamples} cer: {CER}\n", file=outwriterfile)

            if i%50==0:
                if decode:
                    print(f'\tE:{current_epoch}\tBatch:{i}/{len_data}\tLoss: {l.item():3.3f}\tcer: {average_cer/(1e-16+nexamples)}\tTelapsed:{time.time()-stime:3.3f} Secs')
                else:
                    print(f'\tE:{current_epoch}\tBatch:{i}/{len_data}\tLoss: {l.item():3.3f}\tTelapsed:{time.time()-stime:3.3f} Secs')
            total_loss += l.item()
            # Drop the references first so empty_cache() can actually release them.
            del spectrograms, output_logits
            torch.cuda.empty_cache()
    etime = time.time()
    # Guard both averages so an empty loader reports zeros instead of raising.
    mean_loss = total_loss/len_data if len_data else 0.0
    mean_cer = average_cer/(1e-16+nexamples)
    if not decode:
        print(f"Completed Testing Epoch: Loss: {mean_loss}\tTime: {etime-stime:3.3f} Secs")
    else:
        print(f"Completed Testing Epoch: Loss: {mean_loss}\tCER:{mean_cer}\tTime: {etime-stime:3.3f} Secs")
    return mean_loss, mean_cer

In [17]:
@torch.no_grad()
def get_predictions(model, data, beam_width=1):
    """Decode every batch of ``data`` and return the transcripts in iteration order.

    Args:
        model: network called on the padded spectrogram batch; dim 0 of its
            output is used as the output sequence length.
        data: sized iterable of batches whose item 0 is the padded spectrogram
            tensor and item 4 the per-example length ratios.
        beam_width: beam size forwarded to ``decode_logits``.

    Returns:
        A flat list of decoded strings, one per example.
    """
    model.eval()
    len_data = len(data)
    # len(data) counts batches, not individual files.
    print(f"Getting Predictions on {len_data} batches")
    all_transcripts = []
    for batch in tqdm(data):
        spectrograms = batch[0].to(DEVICE)
        # One forward pass per batch.
        output_logits = model(spectrograms)
        seq_length = output_logits.size(0)
        # Scale each example's length ratio to the model's output length.
        output_lens = (batch[4] * int(seq_length)).int()
        decoded_transcripts = decode_logits(output_logits, output_lens, beam_width=beam_width)
        all_transcripts.extend(decoded_transcripts)
        # Drop the references first so empty_cache() can actually release them.
        del spectrograms, output_logits
        torch.cuda.empty_cache()
    return all_transcripts

In [18]:
class ConvBlock(nn.Module):
    """1-D convolution followed by an in-place ReLU and dropout with p=0.2."""

    def __init__(self, in_channels, out_channels, kernel_size, stride, padding):
        super().__init__()
        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride=stride, padding=padding)
        self.relu = nn.ReLU(inplace=True)
        self.dropout = nn.Dropout(p=0.2)

    def forward(self, x):
        """Apply convolution, activation and dropout, in that order."""
        return self.dropout(self.relu(self.conv(x)))

class Wav2Letter(nn.Module):
    """Stack of 1-D convolution blocks that outputs per-step log-probabilities.

    Args:
        num_classes: number of output channels of the final 1x1 convolution.
        num_features: number of input channels per time step (default 40).
    """

    def __init__(self, num_classes, num_features = 40):
        super().__init__()

        # Front end: the only block with stride 2.
        layers = [ConvBlock(in_channels=num_features, out_channels=250, kernel_size=48, stride=2, padding=23)]
        # Seven identical 250-channel blocks with kernel size 7.
        layers.extend(
            ConvBlock(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3)
            for _ in range(7)
        )
        # Wide block expanding to 2000 channels, a 1x1 block, then the 1x1 classifier.
        layers.append(ConvBlock(in_channels=250, out_channels=2000, kernel_size=32, stride=1, padding=16))
        layers.append(ConvBlock(in_channels=2000, out_channels=2000, kernel_size=1, stride=1, padding=0))
        layers.append(nn.Conv1d(in_channels=2000, out_channels=num_classes, kernel_size=1, stride=1, padding=0))

        self.model = nn.Sequential(*layers)
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Map ``(batch_size, input_length, num_features)`` input to log-probabilities.

        Returns a tensor laid out as ``(output_length, batch_size, num_classes)``.
        """
        channels_first = x.permute(0, 2, 1)    # (batch_size, num_features, input_length)
        scores = self.model(channels_first)    # (batch_size, num_classes, output_length)
        log_probs = self.log_softmax(scores)   # normalizes over dim 1, the class dimension
        return log_probs.permute(2, 0, 1)      # (output_length, batch_size, num_classes)

In [19]:
# One output class per label_map entry plus one extra class, then move the model to DEVICE.
model =  Wav2Letter(len(label_map) + 1) # this is for the blank character
model.to(DEVICE)

Wav2Letter(
  (model): Sequential(
    (0): ConvBlock(
      (conv): Conv1d(40, 250, kernel_size=(48,), stride=(2,), padding=(23,))
      (relu): ReLU(inplace=True)
      (dropout): Dropout(p=0.2, inplace=False)
    )
    (1): ConvBlock(
      (conv): Conv1d(250, 250, kernel_size=(7,), stride=(1,), padding=(3,))
      (relu): ReLU(inplace=True)
      (dropout): Dropout(p=0.2, inplace=False)
    )
    (2): ConvBlock(
      (conv): Conv1d(250, 250, kernel_size=(7,), stride=(1,), padding=(3,))
      (relu): ReLU(inplace=True)
      (dropout): Dropout(p=0.2, inplace=False)
    )
    (3): ConvBlock(
      (conv): Conv1d(250, 250, kernel_size=(7,), stride=(1,), padding=(3,))
      (relu): ReLU(inplace=True)
      (dropout): Dropout(p=0.2, inplace=False)
    )
    (4): ConvBlock(
      (conv): Conv1d(250, 250, kernel_size=(7,), stride=(1,), padding=(3,))
      (relu): ReLU(inplace=True)
      (dropout): Dropout(p=0.2, inplace=False)
    )
    (5): ConvBlock(
      (conv): Conv1d(250, 250, ker

In [20]:
# The checkpoint holds a whole pickled model object (its state_dict() is read below),
# and unpickling can execute arbitrary code -- only load files you trust.
# map_location keeps the load working when the device the checkpoint was saved from
# is not available on this machine.
mdl = torch.load('./models/wav2letter_31.pth', map_location=DEVICE)
model.load_state_dict(mdl.state_dict())

<All keys matched successfully>

In [21]:
# CTC criterion built with PyTorch's default arguments.
# NOTE(review): per the PyTorch docs the default blank index of CTCLoss is 0, while the
# extra class added to the model is the last index -- confirm this matches how the
# checkpoint was trained.
loss = torch.nn.CTCLoss()

In [22]:
# Evaluate on the dev split with beam_width=1; decode=True also writes sample_decodes.txt.
test_model(model, dev_data, loss, decode=True, beam_width=1)

Started Testing Epoch: 1
	E:1	Batch:0/85	Loss: 1.286	cer: 34.78125	Telapsed:0.367 Secs
	E:1	Batch:50/85	Loss: 1.298	cer: 32.099877450980394	Telapsed:11.833 Secs
Completed Testing Epoch: Loss: 1.256102619451635	CER:31.69515353311136	Time: 19.261 Secs


(1.256102619451635, 31.69515353311136)

In [23]:
# Decode the test split (which has no transcripts) with a beam width of 20.
preds = get_predictions(model, test_data, beam_width=20)

Getting Predictions on 82 files


HBox(children=(FloatProgress(value=0.0, max=82.0), HTML(value='')))




In [24]:
# Show the first 20 decoded test transcripts.
preds[:20]

['yes serdary the trembling for ing holsh for this dears the yeu to day you see a for geur olle to mar alwayfulden anxety about  monney the day after to morrow the di atfi of a slan er the day afk ter that the missforture of some find then the prevaling weather then something that is been built in ther waws thenn aleazerly whul ster concence san year bur ol colm re hot youyou again the course of public affairs',
 'we were wor intersta ed sent lical condition of the station lat in the comersalbiy',
 'ih saw but i mest at money to be bhat what',
 "in  re gards y brabbing the compry i'l say that i saye tare heavy last one day",
 'but they to te i acprortin a dase against lacy that she was innosa ano thers trots of the resten hoverpropy of the blefh',
 'the wil human division is this to meminus and mis saety',
 'almost instanly wast forced to the topk',
 'as that he wonderf she could laugh about it wilh e now',
 "and thi tay i don'tknow mur tod at thedain of ae",
 'fa  hunded min more he h