# HW4P2: Attention-based Speech Recognition

Welcome to the final assignment in 11785. In this HW, you will work on building a speech recognition system with attention. <br> <br>

HW Writeup: https://piazza.com/class_profile/get_resource/l37uyxe87cq5xn/lam1lcjjj0314e <br>
Kaggle competition link: https://www.kaggle.com/competitions/11-785-f22-hw4p2/ <br>
LAS Paper: https://arxiv.org/pdf/1508.01211.pdf <br>
Attention Is All You Need: https://arxiv.org/pdf/1706.03762.pdf

# Initial Set-up

In [None]:
# Show the GPU (if any) attached to this runtime.
!nvidia-smi

In [None]:
# Install some required libraries
# (python-levenshtein, torchsummaryX and wandb are imported in the next cell).
# Feel free to add more if you want
!pip install -q python-levenshtein torchsummaryX wandb

# Import

In [None]:
import os
import pandas as pd
import numpy as np
import Levenshtein
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
import torchaudio.transforms as tat
from sklearn.metrics import accuracy_score
import gc
import math
import zipfile
from tqdm import tqdm
import datetime

import torch
import torchaudio

import seaborn as sns
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
import gc
from torchsummaryX import summary
import wandb
from glob import glob

import warnings
warnings.filterwarnings('ignore')

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print("Device: ", DEVICE)

In [None]:
# TODO: Import drive if you are a colab user
# NOTE(review): google.colab is expected to exist only on Colab runtimes —
# skip this cell (and the Drive-based cells below) elsewhere.
from google.colab import drive
# Mount Google Drive at /content/drive.
drive.mount('/content/drive')

In [None]:
# Make the mounted Drive folder the working directory (IPython magic).
%cd /content/drive/MyDrive

In [None]:
# Quietly (-q) extract hw4p2.zip into Drive, overwriting existing files (-o).
# NOTE(review): the dataset cells below read from /content/data/hw4p2, not from
# this directory — confirm this copy is still needed.
!unzip -qo 'hw4p2.zip' -d '/content/drive/MyDrive/data'

# Config

In [None]:
# Notebook-wide hyperparameters. Feel free to add or change entries.
config = dict(
    batch_size=96,  # passed to every DataLoader created below
    epochs=30,
    lr=1e-3,
)

# Toy Data Setup

The toy dataset is essential for this HW. The model you will be building is complicated, so you first need to make sure that it runs on the toy dataset. <br>
In other words, you need convergence - the attention diagonal. Take a look at the write-up for this. <br>
We have given you the following code to download and load the toy data. You can use it as is. But be careful: the transcripts are different from the original Kaggle data. The toy dataset has phonemes, but the actual data has characters.

In [None]:
# Download the four toy-dataset files from CMU Box. --content-disposition keeps
# the file names sent by the server (presumably the f0176_*.npy files loaded in
# the next cell — verify after downloading).
!wget -q https://cmu.box.com/shared/static/wok08c2z2dp4clufhy79c5ee6jx3pyj9 --content-disposition --show-progress
!wget -q https://cmu.box.com/shared/static/zctr6mvh7npfn01forli8n45duhp2g85 --content-disposition --show-progress
!wget -q https://cmu.box.com/shared/static/m2oaek69145ljeu6srtbbb7k0ip6yfup --content-disposition --show-progress
!wget -q https://cmu.box.com/shared/static/owrjy0tqra3v7zq2ru7mocy2djskydy9 --content-disposition --show-progress

In [None]:
# Load the toy dataset: MFCC features (X_*) and their transcripts (Y_*).
X_train = np.load("f0176_mfccs_train.npy")
X_valid = np.load("f0176_mfccs_dev.npy")
Y_train = np.load("f0176_hw3p2_train.npy")
Y_valid = np.load("f0176_hw3p2_dev.npy")

# This is how you actually need to find out the different transcripts in a dataset.
# Can you think what's going on here? Why are we using np.unique?
# Every distinct token of the validation transcripts gets the id of its position
# in the sorted unique array; [PAD] is appended as one extra id at the end.
VOCAB_MAP = {token: index for index, token in enumerate(np.unique(Y_valid))}
VOCAB_MAP["[PAD]"] = len(VOCAB_MAP)
VOCAB = list(VOCAB_MAP.keys())

SOS_TOKEN = VOCAB_MAP["[SOS]"]
EOS_TOKEN = VOCAB_MAP["[EOS]"]
PAD_TOKEN = VOCAB_MAP["[PAD]"]

# Replace every token by its integer id, one array per utterance.
# NOTE(review): a token that appears only in Y_train raises KeyError here,
# because the vocabulary is built from Y_valid alone.
Y_train = [np.array(list(map(VOCAB_MAP.__getitem__, seq))) for seq in Y_train]
Y_valid = [np.array(list(map(VOCAB_MAP.__getitem__, seq))) for seq in Y_valid]

In [None]:
# Dataset class for the Toy dataset
class ToyDataset(torch.utils.data.Dataset):
    """Toy utterances (first 15 MFCC coefficients) with integer transcripts.

    Reads the notebook-level arrays ``X_train``/``Y_train`` or
    ``X_valid``/``Y_valid`` produced by the loading cell above.

    Args:
        partition: ``"train"`` or ``"valid"``.

    Raises:
        ValueError: if ``partition`` is neither ``"train"`` nor ``"valid"``.
    """

    def __init__(self, partition):

        if partition == "train":
            mfccs, transcripts = X_train, Y_train
        elif partition == "valid":
            mfccs, transcripts = X_valid, Y_valid
        else:
            # Previously an unknown partition left the attributes unset and
            # failed later with an unrelated AttributeError.
            raise ValueError(
                f"partition must be 'train' or 'valid', got {partition!r}"
            )

        # Keep only the first 15 coefficients of every frame.
        self.mfccs = mfccs[:, :, :15]
        self.transcripts = transcripts

        assert len(self.mfccs) == len(self.transcripts)

        self.length = len(self.mfccs)

    def __len__(self):
        """Return the number of utterances."""
        return self.length

    def __getitem__(self, i):
        """Return ``(features, transcript)`` of utterance ``i`` as tensors."""
        x = torch.tensor(self.mfccs[i])
        y = torch.tensor(self.transcripts[i])

        return x, y

    @staticmethod
    def collate_fn(batch):
        """Pad a list of ``(x, y)`` pairs into batch-first tensors.

        Returns:
            Padded features, padded transcripts, feature lengths and
            transcript lengths (the lengths as 1-D tensors).
        """
        x_batch, y_batch = list(zip(*batch))
        x_lens      = [x.shape[0] for x in x_batch]
        y_lens      = [y.shape[0] for y in y_batch]
        # NOTE(review): both tensors are padded with the notebook-global
        # EOS_TOKEN, which a later cell rebinds for the Kaggle vocabulary.
        # The features are slices of one 3-D array, so they share a length and
        # the feature padding never actually inserts values.
        x_batch_pad = torch.nn.utils.rnn.pad_sequence(x_batch, batch_first=True, padding_value= EOS_TOKEN)
        y_batch_pad = torch.nn.utils.rnn.pad_sequence(y_batch, batch_first=True, padding_value= EOS_TOKEN)

        return x_batch_pad, y_batch_pad, torch.tensor(x_lens), torch.tensor(y_lens)

In [None]:
# Build the toy training loader; batches are assembled by ToyDataset.collate_fn.
train_data = ToyDataset(partition= "train")
train_loader = torch.utils.data.DataLoader(train_data, 
                                           collate_fn=ToyDataset.collate_fn,
                                           num_workers= 4, 
                                           batch_size=config['batch_size'], 
                                           pin_memory= True,shuffle=True
                                           )
# Pull only the first batch so that its shapes can be inspected in the next cell.
for data in train_loader:
    x_batch_pad, y_batch_pad, x_len, y_len = data
    break

In [None]:
# Shapes of the raw toy arrays and of the first toy batch, one value per line.
print(
    X_train.shape,
    len(Y_train),
    x_batch_pad.shape,
    x_len.shape,
    y_batch_pad.shape,
    y_len.shape,
    x_len,
    y_len,
    sep="\n",
)

In [None]:
# Keep the first toy batch (features and their lengths) around as example input.
example_batch, example_len = x_batch_pad, x_len

# Kaggle Data

In [None]:
# TODO: Use the same Kaggle code from HW1P2, HW2P2, HW3P2
# Install the pinned Kaggle CLI (1.5.8) and create the directory for its credentials.
!pip install --upgrade --force-reinstall --no-deps kaggle==1.5.8
!mkdir /root/.kaggle/

# Write the credentials file for the Kaggle CLI.
# NOTE(review): never commit a real username/key together with the notebook.
with open("/root/.kaggle/kaggle.json", "w+") as f:
    f.write('{"username":"","key":""}') # Put your kaggle username & key here

# Restrict the credentials file to owner read/write.
!chmod 600 /root/.kaggle/kaggle.json

In [None]:
# Download the data
# The competition archive is saved into the current working directory.
!kaggle competitions download -c 11-785-f22-hw4p2
!mkdir '/content/data'

# Quietly (-q) extract the archive into /content/data, overwriting existing files (-o).
!unzip -qo '11-785-f22-hw4p2.zip' -d '/content/data'

# Dataset Class

In [None]:
# Characters that can appear in the transcripts of the dataset: the 26 capital
# letters, the apostrophe and the space, framed by the <sos>/<eos> markers.
VOCAB = ['<sos>', *'ABCDEFGHIJKLMNOPQRSTUVWXYZ', "'", ' ', '<eos>']

# Token -> integer id; a token's id is its position in VOCAB.
VOCAB_MAP = {token: index for index, token in enumerate(VOCAB)}
SOS_TOKEN = VOCAB_MAP["<sos>"]
EOS_TOKEN = VOCAB_MAP["<eos>"]
# The collate functions below pad with the <eos> id.
padding = EOS_TOKEN

In [None]:
# TODO: Create a dataset class which is exactly the same as HW3P2. You are free to reuse it. 
# The only change is that the transcript mapping is different for this HW.
# Note: We also want to retain SOS and EOS tokens in the transcript this time.
class AudioDataset(torch.utils.data.Dataset):
    """Utterance-level MFCC features paired with character transcripts.

    Every ``.npy`` file under ``<data_path>/<partition>/mfcc`` is loaded,
    normalised per coefficient (zero mean, unit variance over time) and kept
    in memory together with the transcript found at the same sorted position
    under ``<data_path>/<partition>/transcript/raw``. Transcripts are stored
    as lists of vocabulary ids exactly as found in the files (no token is
    stripped).

    Args:
        data_path: Root directory that contains the partition folders.
        partition: Name of the partition folder.
        train: When true, ``__getitem__`` applies time and frequency masking.
        limit: If positive, only the first ``limit`` files (in sorted order)
            are loaded; any other value loads everything.
        time_mask_param: Only used to build ``self.transform``. ``__getitem__``
            masks up to 25% of each utterance instead, as it always did.
        freq_mask_param: Maximum width of the frequency mask.
    """

    # Character vocabulary; a token's id is its position in this list.
    VOCAB = ['<sos>',
             'A',   'B',    'C',    'D',
             'E',   'F',    'G',    'H',
             'I',   'J',    'K',    'L',
             'M',   'N',    'O',    'P',
             'Q',   'R',    'S',    'T',
             'U',   'V',    'W',    'X',
             'Y',   'Z',    "'",    ' ',
             '<eos>']
    VOCAB_MAP = {token: index for index, token in enumerate(VOCAB)}
    SOS_TOKEN = VOCAB_MAP["<sos>"]
    EOS_TOKEN = VOCAB_MAP["<eos>"]
    # Transcripts are padded with the <eos> id, as before.
    TRANSCRIPT_PAD = EOS_TOKEN
    # Features are padded with zeros, i.e. the mean of the normalised features.
    # They used to be padded with the <eos> id (29), a huge outlier that is
    # seen by any layer applied to the padded tensor before it is packed.
    FEATURE_PAD = 0.0

    def __init__(self, data_path, partition= "train-clean-100", train = True, limit=-1, time_mask_param = 100, freq_mask_param = 6):
        # Kept so that code reading `self.transform` keeps working; the
        # augmentation itself happens in __getitem__.
        timemask = torchaudio.transforms.TimeMasking(time_mask_param=time_mask_param)
        freqmask = torchaudio.transforms.FrequencyMasking(freq_mask_param=freq_mask_param)
        self.transform = nn.Sequential(timemask, freqmask)
        self.freq_mask_param = freq_mask_param
        self.train = train

        self.data_path = data_path
        self.mfcc_dir = self.data_path + '/' + partition + '/mfcc'
        self.transcript_dir = self.data_path + '/' + partition + '/transcript/raw'

        # Features and transcripts are paired by their sorted position.
        self.mfcc_files = sorted(os.listdir(self.mfcc_dir))
        self.transcript_files = sorted(os.listdir(self.transcript_dir))
        assert len(self.mfcc_files) == len(self.transcript_files)

        if limit > 0:
            self.mfcc_files = self.mfcc_files[:limit]
            self.transcript_files = self.transcript_files[:limit]

        self.length = len(self.mfcc_files)

        self.padding = self.EOS_TOKEN
        self.PHONEMES = self.VOCAB
        self.mapping = self.VOCAB_MAP

        # Load everything up front so that __getitem__ stays cheap during training.
        self.mfccs, self.transcripts = [], []
        for mfcc_file, transcript_file in zip(self.mfcc_files, self.transcript_files):
            mfcc = self._normalize(np.load(self.mfcc_dir + '/' + mfcc_file))
            transcript = np.load(self.transcript_dir + '/' + transcript_file)
            self.mfccs.append(mfcc)
            self.transcripts.append([self.mapping[token] for token in transcript])

    @staticmethod
    def _normalize(mfcc, eps=1e-8):
        """Standardise each coefficient over time; ``eps`` guards constant columns."""
        mean = np.mean(mfcc, axis=0)
        sigma = np.std(mfcc, axis=0)
        return (mfcc - mean) / (sigma + eps)

    def __len__(self):
        """Return the number of utterances."""
        return self.length

    def __getitem__(self, ind):
        """Return ``(mfcc, transcript)`` for utterance ``ind``.

        ``mfcc`` is a float tensor with time on dim 0 and ``transcript`` a
        LongTensor of vocabulary ids. With ``train=True`` a random time mask
        (up to 25% of the frames) and a random frequency mask are applied.
        """
        mfcc = torch.Tensor(self.mfccs[ind])
        if self.train:
            freqmask = torchaudio.transforms.FrequencyMasking(freq_mask_param=self.freq_mask_param)
            timemask = torchaudio.transforms.TimeMasking(time_mask_param=int(mfcc.size(0) * 0.25))
            # torchaudio masks expect (..., freq, time): (T, D) -> (1, D, T).
            spec = mfcc.transpose(0, 1).unsqueeze(0)
            spec = freqmask(timemask(spec))
            mfcc = spec.squeeze(0).transpose(0, 1)
        transcript = torch.LongTensor(self.transcripts[ind])

        return mfcc, transcript

    @staticmethod
    def collate_fn(batch):
        """Pad a list of ``(mfcc, transcript)`` pairs into batch-first tensors.

        Returns:
            Padded features (zero padded), padded transcripts (``<eos>``
            padded), feature lengths and transcript lengths (LongTensors).
        """
        batch_mfcc, batch_transcript = zip(*batch)

        batch_mfcc_pad = pad_sequence(batch_mfcc, batch_first=True,
                                      padding_value=AudioDataset.FEATURE_PAD)
        lengths_mfcc = torch.LongTensor([len(x) for x in batch_mfcc])

        batch_transcript_pad = pad_sequence(batch_transcript, batch_first=True,
                                            padding_value=AudioDataset.TRANSCRIPT_PAD)
        lengths_transcript = torch.LongTensor([len(y) for y in batch_transcript])

        return batch_mfcc_pad, batch_transcript_pad, lengths_mfcc, lengths_transcript

In [None]:
# TODO: Similarly, create a test dataset class
# Test Dataloader
#TODO
class AudioDatasetTest(torch.utils.data.Dataset):
    """Unlabelled MFCC features of a test partition.

    Every ``.npy`` file under ``<data_path>/<partition>/mfcc`` is loaded in
    sorted order, normalised per coefficient (zero mean, unit variance over
    time) and kept in memory.

    Args:
        data_path: Root directory that contains the partition folders.
        partition: Name of the partition folder.
        limit: If positive, only the first ``limit`` files (in sorted order)
            are loaded; any other value loads everything.
    """

    # Character vocabulary; a token's id is its position in this list.
    VOCAB = ['<sos>',
             'A',   'B',    'C',    'D',
             'E',   'F',    'G',    'H',
             'I',   'J',    'K',    'L',
             'M',   'N',    'O',    'P',
             'Q',   'R',    'S',    'T',
             'U',   'V',    'W',    'X',
             'Y',   'Z',    "'",    ' ',
             '<eos>']
    VOCAB_MAP = {token: index for index, token in enumerate(VOCAB)}
    # Features are padded with zeros, i.e. the mean of the normalised features.
    # They used to be padded with the <eos> id (29) read from a notebook global.
    FEATURE_PAD = 0.0

    def __init__(self, data_path, partition= "test-clean", limit=-1):
        self.data_path = data_path
        self.mfcc_dir = self.data_path + '/' + partition + '/mfcc'
        self.mfcc_files = sorted(os.listdir(self.mfcc_dir))
        if limit > 0:
            self.mfcc_files = self.mfcc_files[:limit]

        self.length = len(self.mfcc_files)

        self.PHONEMES = self.VOCAB
        self.mapping = self.VOCAB_MAP

        # Load everything up front so that __getitem__ stays cheap.
        self.mfccs = [
            self._normalize(np.load(self.mfcc_dir + '/' + mfcc_file))
            for mfcc_file in self.mfcc_files
        ]

    @staticmethod
    def _normalize(mfcc, eps=1e-8):
        """Standardise each coefficient over time; ``eps`` guards constant columns."""
        mean = np.mean(mfcc, axis=0)
        sigma = np.std(mfcc, axis=0)
        return (mfcc - mean) / (sigma + eps)

    def __len__(self):
        """Return the number of utterances."""
        return self.length

    def __getitem__(self, ind):
        """Return the normalised features of utterance ``ind`` as a float tensor."""
        return torch.Tensor(self.mfccs[ind])

    @staticmethod
    def collate_fn(batch):
        """Pad a list of feature tensors into one batch-first tensor.

        Returns:
            Zero-padded features and the original lengths (LongTensor).
        """
        batch_mfcc_pad = pad_sequence(batch, batch_first=True,
                                      padding_value=AudioDatasetTest.FEATURE_PAD)
        lengths_mfcc = torch.LongTensor([len(x) for x in batch])

        return batch_mfcc_pad, lengths_mfcc

# Dataset and Dataloaders

In [None]:
# Directory holding the partition folders (train-clean-100, dev-clean, test-clean).
root = '/content/data/hw4p2' 

In [None]:
# Datasets: one object per partition (all of this mirrors HW3P2).
# train_data1 is the same training partition with train=False, i.e. without
# the masking that AudioDataset.__getitem__ applies when train=True.
train_data = AudioDataset(root, partition="train-clean-100", train=True, limit=-1)
train_data1 = AudioDataset(root, partition="train-clean-100", train=False, limit=-1)
val_data = AudioDataset(root, partition="dev-clean", train=False, limit=-1)
test_data = AudioDatasetTest(root, partition="test-clean", limit=-1)

# Settings shared by every loader.
_loader_settings = dict(
    num_workers=4,
    batch_size=config['batch_size'],
    pin_memory=True,
)

# Do NOT forget to pass in the collate function while creating a dataloader.
# The two training loaders shuffle; validation and test keep the file order.
train_loader = torch.utils.data.DataLoader(
    train_data, collate_fn=AudioDataset.collate_fn, shuffle=True, **_loader_settings
)
train_loader1 = torch.utils.data.DataLoader(
    train_data1, collate_fn=AudioDataset.collate_fn, shuffle=True, **_loader_settings
)
val_loader = torch.utils.data.DataLoader(
    val_data, collate_fn=AudioDataset.collate_fn, shuffle=False, **_loader_settings
)
test_loader = torch.utils.data.DataLoader(
    test_data, collate_fn=AudioDatasetTest.collate_fn, shuffle=False, **_loader_settings
)
# The sanity check for shapes is similar to HW3P2 as well.
# Please remember that the only change in the dataset for this HW is the transcripts,
# so you are expected to get shapes similar to HW3P2 (Pad, pack and Oh my!)

In [None]:
# Pull only the first training batch so that its shapes can be inspected below.
for data in train_loader:
    x_batch_pad, y_batch_pad, x_len, y_len = data
    break

In [None]:
# Sanity check of the first Kaggle training batch, one value per output line.
print(
    padding,
    x_batch_pad.shape,  # batch_size x length x dim, B x T x D
    max(x_len),
    x_len.shape,
    y_batch_pad.shape,  # batch_size x length
    max(y_len),
    y_len.shape,
    x_len,
    y_len,
    x_batch_pad[0, :, 0].shape,
    y_batch_pad[0].shape,
    sep="\n",
)

In [None]:
# Pack the padded batch-first features using their true lengths;
# enforce_sorted=False means the batch does not have to be sorted by length.
packed_x = pack_padded_sequence(x_batch_pad, x_len, batch_first = True,  enforce_sorted=False)

In [None]:
# Inspect the packed batch: the object itself, its flattened data and batch sizes.
print(
    packed_x,
    packed_x.data.shape,
    packed_x.batch_sizes.shape,
    sep="\n",
)

# Model

In this section you will be building the LAS model from scratch. Before starting to code, please read the writeup and the paper, and make sure you understand the following parts completely.<br>
- Pyramidal Bi-LSTM 
- Listener
- Attention
- Speller

After getting a good grasp of the workings of these modules, start coding. Follow the TODOs carefully. We will also be adding some extra features to the attention mechanism like keys and values which are not originally present in LAS. So we will be creating a hybrid network based on LAS and Attention is All You Need.


## Encoder

### LockedDropout

In [None]:
class LockedDropout(nn.Module):
    """Dropout whose mask is drawn once per call and broadcast along dim 0.

    The mask has shape ``(1, x.size(1), x.size(2))``, so every index along
    dim 0 of the input is multiplied by the same mask. Pass the input with
    the dimension that should share the mask first.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x, dropout=0.5):
        """Return ``x`` with one shared mask applied.

        Args:
            x: 3-D tensor.
            dropout: Drop probability; kept entries are scaled by
                ``1 / (1 - dropout)``.

        Returns:
            ``x`` itself in eval mode or when ``dropout`` is falsy, otherwise
            the masked and rescaled tensor.
        """
        if self.training and dropout:
            keep_prob = 1 - dropout
            shared = x.new_empty(1, x.size(1), x.size(2), requires_grad=False)
            shared = shared.bernoulli_(keep_prob).div_(keep_prob)
            return shared.expand_as(x) * x
        return x

### ResidualBlock


In [None]:
class ResidualBlock(nn.Module):
    """Two-convolution 1-D residual block with a shortcut connection.

    The main path is pad -> conv(k=3) -> BN -> ReLU -> dropout -> pad ->
    conv(k=3) -> BN -> dropout; the shortcut is added before the final ReLU.

    Args:
        in_channels: Channels of the input.
        out_channels: Channels produced by both convolutions.
        stride: Stride of the first convolution.
        dropout: Drop probability of both dropout layers.
        downsample: Optional module applied to the input to form the shortcut;
            when falsy the input itself is used.
    """

    def __init__(self, in_channels = 15, out_channels = 64, stride = 1, dropout = 0.0, downsample = None):
        super().__init__()
        kernel = 3
        halo = kernel // 2  # zero padding added on each side before a convolution
        main_path = [
            nn.ConstantPad1d(halo, 0),
            nn.Conv1d(in_channels, out_channels, kernel_size=kernel, stride=stride, bias=False),
            nn.BatchNorm1d(out_channels),
            nn.ReLU(),
            nn.Dropout(dropout),

            nn.ConstantPad1d(halo, 0),
            nn.Conv1d(out_channels, out_channels, kernel_size=kernel, stride=1, bias=False),
            nn.BatchNorm1d(out_channels),
            nn.Dropout(dropout),
        ]
        self.conv1 = nn.Sequential(*main_path)

        self.downsample = downsample
        self.relu = nn.ReLU()
        self.out_channels = out_channels

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        out = self.conv1(x)
        shortcut = self.downsample(x) if self.downsample else x
        out += shortcut
        return self.relu(out)

### Pyramidal Bi-LSTM

In [None]:
class pBLSTM(torch.nn.Module):
    '''
    Pyramidal BiLSTM: one bidirectional LSTM layer applied after halving the
    time resolution.

    At each step,
    1. Unpack the packed input into a padded batch-first tensor
    2. Merge every two consecutive frames into one frame of twice the feature
       size (a trailing odd frame is dropped) and halve the lengths
    3. Pack the result again
    4. Pass it through the LSTM layer
    To keep the implementation modular, the module holds a single layer.

    Args:
        input_size: feature size of the frames before merging.
        hidden_size: hidden size per direction of the LSTM.
    '''
    def __init__(self, input_size, hidden_size):
        super(pBLSTM, self).__init__()
        self.downsamples = 2
        self.input_size = input_size
        # The LSTM sees `downsamples` merged frames per step.
        self.blstm = nn.LSTM(input_size = input_size * self.downsamples,
                             hidden_size = hidden_size,
                             num_layers = 1,
                             batch_first = True,
                             bidirectional = True)

    def forward(self, x_packed): # x_packed is a PackedSequence
        '''Return the packed LSTM output for the time-halved input.'''
        x, x_len = pad_packed_sequence(x_packed, batch_first=True)
        x, x_len = self.trunc_reshape(x, x_len)
        x_packed = pack_padded_sequence(x, x_len, batch_first = True,  enforce_sorted=False)
        packed_out, _ = self.blstm(x_packed)
        return packed_out

    def trunc_reshape(self, x, x_lens):
        '''
        Merge groups of `self.downsamples` consecutive frames.

        Args:
            x: padded batch-first tensor (batch, length, feature).
            x_lens: tensor with the valid length of every sequence.

        Returns:
            The reshaped tensor (batch, length // downsamples,
            feature * downsamples) and the reduced lengths.
        '''
        batch_size, length, feature = x.shape
        # Drop trailing frames that do not fill a complete group.
        length -= length % self.downsamples
        x = x[:, :length, :]
        x = x.view((batch_size, length // self.downsamples, feature * self.downsamples))
        # A sequence shorter than one group would otherwise get length 0,
        # which pack_padded_sequence rejects; keep at least one merged frame.
        x_lens = torch.clamp(x_lens // self.downsamples, min=1)
        return x, x_lens

In [None]:
class pBLSTMwithLockdrop(torch.nn.Module):
    '''
    Four stacked pBLSTM layers, each followed by locked dropout.

    Args:
        encoder_hidden_size: hidden size per direction; every layer takes and
            produces 2 * encoder_hidden_size features.
        drop: indexable with (at least) four dropout probabilities, one per layer.
    '''
    def __init__(self, encoder_hidden_size, drop):
        super(pBLSTMwithLockdrop, self).__init__()
        self.blstm1 = pBLSTM(input_size = 2 * encoder_hidden_size, hidden_size= 1* encoder_hidden_size)
        self.blstm2 = pBLSTM(input_size = 2 * encoder_hidden_size, hidden_size= 1* encoder_hidden_size)
        self.blstm3 = pBLSTM(input_size = 2 * encoder_hidden_size, hidden_size= 1* encoder_hidden_size)
        self.blstm4 = pBLSTM(input_size = 2 * encoder_hidden_size, hidden_size= 1* encoder_hidden_size)
        self.drop1 = LockedDropout()
        self.drop2 = LockedDropout()
        self.drop3 = LockedDropout()
        self.drop4 = LockedDropout()
        self.p = drop

    def forward(self, x_packed): # x_packed is a PackedSequence
        '''Run the four pBLSTM + locked-dropout stages and return a PackedSequence.'''
        stages = (
            (self.blstm1, self.drop1),
            (self.blstm2, self.drop2),
            (self.blstm3, self.drop3),
            (self.blstm4, self.drop4),
        )
        for index, (blstm, lockdrop) in enumerate(stages):
            x_packed = blstm(x_packed)
            # LockedDropout draws a mask of shape (1, size(1), size(2)) and
            # broadcasts it along dim 0. Unpack time-major (T, B, D) so that the
            # mask is shared across time steps and differs per sequence; the
            # batch-first layout used before shared it across the batch instead.
            x_outputs, x_lens = pad_packed_sequence(x_packed)
            x_outputs = lockdrop(x_outputs, dropout=self.p[index])
            x_packed = pack_padded_sequence(x_outputs, x_lens, enforce_sorted=False)

        return x_packed

### Listener

In [None]:
class Listener(torch.nn.Module):
    '''
    The Encoder takes utterances as inputs and returns latent feature representations.

    Pipeline: two stages of 1-D residual blocks over the feature channels,
    one bidirectional LSTM at full time resolution, then the pyramidal
    BiLSTM stack (pBLSTMwithLockdrop).

    Args:
        input_size: number of features per input frame.
        cnn: channel widths of the two residual stages (cnn[0], cnn[1]);
            cnn[-1] is the input size of the first LSTM.
        encoder_hidden_size: hidden size per direction of every LSTM.
        layer: number of residual blocks in each stage (layer[0], layer[1]).
        cnn_drop: dropout probability inside the residual blocks.
        lstm_drop: dropout probabilities handed to pBLSTMwithLockdrop.
    '''
    def __init__(self, input_size, cnn, encoder_hidden_size, layer, cnn_drop, lstm_drop):
        super(Listener, self).__init__()
        self.cnn_outputsize = cnn[-1]
        self.down_sample = 2
        self.cnn1 = self.make_layer(input_size, cnn[0], block_num = layer[0], stride = 1, drop = cnn_drop)
        self.cnn2 = self.make_layer(cnn[0], cnn[1], block_num = layer[1], stride = 1, drop = cnn_drop)
        # The first LSTM at the very bottom (no time reduction).
        self.base_lstm = torch.nn.LSTM(input_size = self.cnn_outputsize,
                                       hidden_size = encoder_hidden_size,
                                       num_layers = 1,
                                       batch_first = True,
                                       bidirectional = True)
        self.pBLSTMs = pBLSTMwithLockdrop(encoder_hidden_size = encoder_hidden_size, drop = lstm_drop)

    def make_layer(self, in_channel, out_channel, block_num, stride=1, drop=0):
        '''
        Build a stage of `block_num` residual blocks.

        Only the first block changes the channel count (and applies `stride`);
        its shortcut is projected with a 1x1 convolution whenever the shape
        changes. The remaining blocks keep the shape, so they use stride 1 and
        an identity shortcut.

        NOTE: forward() does not rescale x_lens, so it only supports stride=1.
        '''
        downsample = None
        if stride != 1 or in_channel != out_channel:
            downsample = nn.Sequential(
                 # The projection must use the same stride as the main path,
                 # otherwise the two branches differ in length.
                 nn.Conv1d(in_channel, out_channel, kernel_size=1, stride=stride, bias = False),
                 nn.BatchNorm1d(out_channel),
            )
        layers = [ResidualBlock(in_channel, out_channel, stride, drop, downsample)]
        for _ in range(1, block_num):
            layers.append(ResidualBlock(out_channel, out_channel, 1, drop))

        return nn.Sequential(*layers)

    def forward(self, x, x_lens):
        '''
        Encode a padded batch.

        Args:
            x: padded features from the dataloader, B x L x D.
            x_lens: valid length of every utterance in the batch.

        Returns:
            The padded encoder outputs (batch first) and their lengths.
        '''
        # B x L x D --> B x D x L --> B x cnn_outputsize x L --> B x L x cnn_outputsize
        x = self.cnn1(x.transpose(1,2))
        x = self.cnn2(x)
        x = x.transpose(1,2)
        # The convolutions above run on the padded tensor; padding is only
        # excluded from here on, once the batch is packed.
        x_packed = pack_padded_sequence(x, x_lens, batch_first = True,  enforce_sorted=False)
        # First LSTM layer (no truncation).
        x_packed = self.base_lstm(x_packed)[0]
        # Pyramidal Bi-LSTM stack.
        encoder_packed = self.pBLSTMs(x_packed)
        encoder_outputs, encoder_lens = pad_packed_sequence(encoder_packed, batch_first=True)
        return encoder_outputs, encoder_lens

In [None]:
# example_batch, example_len = x_batch_pad, x_len
# encoder = Listener(input_size = 15, cnn=[64, 128], encoder_hidden_size = 256, layer =[2,2], cnn_drop=0).to(DEVICE)# TODO: Initialize Listener
# print(encoder)
# # summary(encoder, example_batch[0].to(DEVICE), example_batch[3])
# summary(encoder, example_batch.to(DEVICE), example_len)
# encoder_outputs, encoder_lens = encoder(example_batch.to(DEVICE), example_len)
# print(encoder_outputs.shape)
# print(encoder_lens.shape)
# print(encoder_lens)
# del encoder
# del example_batch
# del example_len

## Attention (Attend)

### Different ways to compute Attention

1. Dot-product attention
    * raw_weights = bmm(key, query) 
    * Optional: Scaled dot-product by normalizing with sqrt key dimension 
    * Check "Attention is All You Need" Section 3.2.1
    * 1st way is what most TAs are comfortable with, but if you want to explore, check out other methods below


2. Cosine attention
    * raw_weights = cosine(query, key) # almost the same as dot-product xD 

3. Bi-linear attention
    * W = Linear transformation (learnable parameter): d_k -> d_q
    * raw_weights = bmm(key @ W, query)

4. Multi-layer perceptron
    * Check "Neural Machine Translation and Sequence-to-sequence Models: A Tutorial" Section 8.4

5. Multi-Head Attention
    * Check "Attention is All You Need" Section 3.2.2
    * h = Number of heads
    * W_Q, W_K, W_V: Weight matrix for Q, K, V (h of them in total)
    * W_O: d_v -> d_v
    * Reshape K: (B, T, d_k) to (B, T, h, d_k // h) and transpose to (B, h, T, d_k // h)
    * Reshape V: (B, T, d_v) to (B, T, h, d_v // h) and transpose to (B, h, T, d_v // h)
    * Reshape Q: (B, d_q) to (B, h, d_q // h)
    * raw_weights = Q @ K^T
    * masked_raw_weights = mask(raw_weights)
    * attention = softmax(masked_raw_weights)
    * multi_head = attention @ V
    * multi_head = multi_head reshaped to (B, d_v)
    * context = multi_head @ W_O

In [None]:
def plot_attention(attention):
    """Draw `attention` as a seaborn heatmap on a freshly cleared figure.

    For a model that has learned to align, the plot should look roughly diagonal.
    """
    plt.clf()  # drop whatever was drawn on the current figure
    sns.heatmap(attention, cmap='GnBu')
    plt.show()

class Attention(torch.nn.Module):
    '''
    Single-head dot-product attention between one decoder query and the encoder outputs.

    Usage: call `set_key_value_mask` once per batch to cache the projected keys,
    values and the padding mask, then call the module once per decoding step.

    Per step:
        raw_weights         = bmm(key, query)
        masked_raw_weights  = mask(raw_weights)  # padded positions get a big negative number
        attention           = softmax(masked_raw_weights)
        context             = bmm(attention, value)
    '''
    def __init__(self, encoder_hidden_size, decoder_output_size, projection_size):
        super(Attention, self).__init__()
        self.encoder_hidden_size = encoder_hidden_size
        # The key/value projections consume encoder features of width 2 * encoder_hidden_size.
        self.key_projection     = nn.Linear(2 * encoder_hidden_size, projection_size)
        self.value_projection   = nn.Linear(2 * encoder_hidden_size, projection_size)
        self.query_projection   = nn.Linear(decoder_output_size, projection_size)
        # Scores are (batch_size, timesteps); normalise over the timestep axis.
        self.softmax            = nn.Softmax(dim = 1)

    def set_key_value_mask(self, encoder_outputs, encoder_lens):
        '''
        Cache keys, values and the padding mask for the current batch.

        Args:
            encoder_outputs: (batch_size, timesteps, 2 * encoder_hidden_size)
            encoder_lens:    (batch_size,) valid length of every sequence
        '''
        self.key      = self.key_projection(encoder_outputs)    # B x L x projection_size
        self.value    = self.value_projection(encoder_outputs)  # B x L x projection_size
        # Build the mask directly on the device the encoder outputs live on, so it is
        # not re-copied on every decoding step and works wherever encoder_lens is stored.
        device    = encoder_outputs.device
        positions = torch.arange(encoder_outputs.size(1), device=device).unsqueeze(0)  # 1 x L
        lengths   = torch.as_tensor(encoder_lens, device=device).unsqueeze(1)          # B x 1
        # 1 x L >= B x 1 --> B x L; True marks padding, False marks real frames.
        self.padding_mask = positions >= lengths

    def forward(self, decoder_output_embedding):
        '''
        Attend over the cached keys/values with one decoder state per batch element.

        Args:
            decoder_output_embedding: (batch_size, decoder_output_size)
        Returns:
            context:           (batch_size, projection_size)
            attention_weights: (batch_size, timesteps)
        '''
        # The projected query is kept on self because the decoder reads it back.
        self.query  = self.query_projection(decoder_output_embedding)  # B x projection_size
        # B x L x P @ B x P x 1 = B x L x 1 --> B x L
        raw_weights = torch.bmm(self.key, self.query.unsqueeze(2)).squeeze(2)
        # -1e9 is not representable in half precision, so fall back to -1e4 there.
        masking_value = -1e+9 if raw_weights.dtype == torch.float32 else -1e+4
        # Out-of-place fill: leaves the bmm result untouched for autograd.
        mask = self.padding_mask.to(raw_weights.device)
        masked_raw_weights = raw_weights.masked_fill(mask, masking_value)
        attention_weights  = self.softmax(masked_raw_weights)  # B x L
        # B x 1 x L @ B x L x P = B x 1 x P --> B x P
        context = torch.bmm(attention_weights.unsqueeze(1), self.value).squeeze(1)
        return context, attention_weights

In [None]:
class Attend(torch.nn.Module):
    '''
    Parameter-free masked dot-product attention for a single query per batch element.
    '''
    def __init__(self):
        super(Attend, self).__init__()
        # Scores are (batch_size, timesteps); normalise over the timestep axis.
        self.softmax = nn.Softmax(dim = 1)

    def forward(self, query, key, value, mask):
        '''
        Args:
            query: (batch_size, size)
            key:   (batch_size, timesteps, size)
            value: (batch_size, timesteps, size)
            mask:  (batch_size, timesteps) boolean, True where the position is padding
        Returns:
            context:           (batch_size, size)
            attention_weights: (batch_size, timesteps)
        '''
        # B x L x P @ B x P x 1 = B x L x 1 --> B x L
        raw_weights = torch.bmm(key, query.unsqueeze(2)).squeeze(2)
        # -1e9 is not representable in half precision, so fall back to -1e4 there.
        masking_value = -1e+9 if raw_weights.dtype == torch.float32 else -1e+4
        # Follow the device of the scores instead of a global, and fill out-of-place
        # so the bmm result is not modified under autograd.
        masked_raw_weights = raw_weights.masked_fill(mask.to(raw_weights.device), masking_value)
        attention_weights  = self.softmax(masked_raw_weights)  # B x L
        # B x 1 x L @ B x L x P = B x 1 x P --> B x P
        context = torch.bmm(attention_weights.unsqueeze(1), value).squeeze(1)
        return context, attention_weights

In [None]:
def plot_attention(attention):
    """Draw `attention` as a seaborn heatmap on a freshly cleared figure.

    For a model that has learned to align, the plot should look roughly diagonal.
    """
    plt.clf()  # drop whatever was drawn on the current figure
    sns.heatmap(attention, cmap='GnBu')
    plt.show()

class Multi_Attention(torch.nn.Module):
    '''
    Multi-head dot-product attention between one decoder query and the encoder outputs.

    Keys, values and the query are projected to `projection_size`, split into `head`
    equal slices, attended independently with `Attend`, and the per-head contexts are
    concatenated and mixed by a final linear layer.

    Usage: call `set_key_value_mask` once per batch, then call the module once per
    decoding step.
    '''
    def __init__(self, encoder_hidden_size, decoder_output_size, projection_size, head):
        super(Multi_Attention, self).__init__()
        # Fail early with a clear message instead of an opaque reshape error in forward().
        if head <= 0 or projection_size % head != 0:
            raise ValueError(
                "projection_size (%s) must be a positive multiple of head (%s)"
                % (projection_size, head)
            )
        self.encoder_hidden_size = encoder_hidden_size
        # The key/value projections consume encoder features of width 2 * encoder_hidden_size.
        self.key_projection     = nn.Linear(2 * encoder_hidden_size, projection_size)
        self.value_projection   = nn.Linear(2 * encoder_hidden_size, projection_size)
        self.query_projection   = nn.Linear(decoder_output_size, projection_size)
        # Not used by forward() (Attend applies its own softmax); kept so the module
        # layout stays the same.
        self.softmax            = nn.Softmax(dim = 1)
        self.head = head
        self.projection_size = projection_size
        self.attend = Attend()

        # Mixes the concatenated per-head contexts.
        self.comb = nn.Linear(projection_size, projection_size)

    def set_key_value_mask(self, encoder_outputs, encoder_lens):
        '''
        Cache keys, values and the padding mask for the current batch.

        Args:
            encoder_outputs: (batch_size, timesteps, 2 * encoder_hidden_size)
            encoder_lens:    (batch_size,) valid length of every sequence
        '''
        self.B = encoder_outputs.size(0)
        self.L = encoder_outputs.size(1)
        self.key      = self.key_projection(encoder_outputs)    # B x L x projection_size
        self.value    = self.value_projection(encoder_outputs)  # B x L x projection_size
        # Build the mask on the encoder outputs' device so it works wherever
        # encoder_lens is stored.
        device    = encoder_outputs.device
        positions = torch.arange(self.L, device=device).unsqueeze(0)           # 1 x L
        lengths   = torch.as_tensor(encoder_lens, device=device).unsqueeze(1)  # B x 1
        # 1 x L >= B x 1 --> B x L; True marks padding, False marks real frames.
        self.padding_mask = positions >= lengths

    def forward(self, decoder_output_embedding):
        '''
        Args:
            decoder_output_embedding: (batch_size, decoder_output_size)
        Returns:
            context:           (batch_size, projection_size)
            attention_weights: (batch_size, head * timesteps), the per-head weights
                               concatenated along dim 1
        '''
        # The projected query is kept on self because the decoder reads it back.
        self.query = self.query_projection(decoder_output_embedding)  # B x projection_size

        head_dim  = self.projection_size // self.head
        # B x P --> B x head x P/head
        com_query = torch.reshape(self.query, (self.B, self.head, head_dim))
        # B x L x P --> B x L x head x P/head --> B x head x L x P/head
        com_key   = torch.reshape(self.key, (self.B, self.L, self.head, head_dim)).transpose(1, 2)
        com_value = torch.reshape(self.value, (self.B, self.L, self.head, head_dim)).transpose(1, 2)

        # Collect per-head results and concatenate once instead of growing tensors in the loop.
        contexts, weights = [], []
        for i in range(self.head):
            head_context, head_weights = self.attend(
                com_query[:, i, :], com_key[:, i, :, :], com_value[:, i, :, :], self.padding_mask
            )  # B x P/head, B x L
            contexts.append(head_context)
            weights.append(head_weights)

        context = self.comb(torch.cat(contexts, dim=1))  # B x projection_size
        attention_weights = torch.cat(weights, dim=1)    # B x (head * L)
        return context, attention_weights

## Decoder

### Speller

#### Beam(batch size = 1)(slow)

In [None]:
def prune(new_beams, beam_width, batch_size):
    """
    Keep the `beam_width` highest-scoring hypotheses for every batch element.

    Each beam is laid out as
        [char, predictions, char_prob, hidden_states, context, log_probs, attention_plot]
    where `predictions` is a list of per-step tensors, `hidden_states` is a list of
    per-LSTM-cell state tuples and `log_probs` holds one score per batch element.
    Every entry except `attention_plot` is indexed by batch element on its first axis.

    Output beam j takes, for batch element i, the rows of the j-th best candidate for
    that element. The attention plot is copied whole from the candidate picked for
    batch element 0 (the caller in this file only records attention for element 0).

    The inputs are never modified: results are freshly stacked tensors/arrays, so
    hypotheses that share storage cannot overwrite one another.

    Returns `new_beams` unchanged when there is nothing to prune.
    """
    if len(new_beams) <= beam_width:
        return new_beams

    # ranking[i] lists candidate indices for batch element i, best score first.
    ranking = [
        sorted(range(len(new_beams)),
               key=lambda cand: float(new_beams[cand][5][i]),
               reverse=True)[:beam_width]
        for i in range(batch_size)
    ]

    def stack_rows(rows):
        # Re-assemble one value per batch element into a batch-first container.
        if isinstance(rows[0], torch.Tensor):
            return torch.stack(rows)
        return np.asarray(rows)

    def gather(extract, slot):
        # Row i comes from the candidate ranked `slot` for batch element i.
        return stack_rows([
            extract(new_beams[ranking[i][slot]])[i] for i in range(batch_size)
        ])

    beams = []
    for slot in range(beam_width):
        template = new_beams[ranking[0][slot]]
        predictions = [
            gather(lambda beam, t=t: beam[1][t], slot)
            for t in range(len(template[1]))
        ]
        # Cover every LSTM cell and every state tensor (h, c) of each cell.
        hidden_states = [
            tuple(
                gather(lambda beam, cell=cell, part=part: beam[3][cell][part], slot)
                for part in range(len(template[3][cell]))
            )
            for cell in range(len(template[3]))
        ]
        beams.append([
            gather(lambda beam: beam[0], slot),
            predictions,
            gather(lambda beam: beam[2], slot),
            hidden_states,
            gather(lambda beam: beam[4], slot),
            gather(lambda beam: beam[5], slot),
            list(template[6]),
        ])
    return beams

In [None]:
from torch._C import get_num_interop_threads
class Speller_beam(torch.nn.Module):
    '''
    Attention-based character decoder that carries several hypotheses ("beams") per step.

    Every step, each live beam is advanced through the LSTM cells and the attention
    module, the `beam_width` most likely next characters are expanded into candidate
    beams, and `prune` cuts the candidates back down.

    A beam is the list
        [char, predictions, char_prob, hidden_states, context, log_probs, attention_plot]

    NOTE(review): the layer sizes only line up when the attention projection size equals
    `decoder_output_size`, the attention query projection accepts `decoder_hidden_size`
    inputs, and (because of weight tying) `embed_size == 2 * decoder_output_size`.
    Confirm against the configuration used to build the model.
    '''

    def __init__(self, embed_size, decoder_hidden_size, decoder_output_size, vocab_size, attention_module= None, beam = 10):
        super().__init__()
        self.vocab_size         = vocab_size
        # `padding` is the module-level padding token index.
        self.embedding          = nn.Embedding(num_embeddings = self.vocab_size,
                                               embedding_dim = embed_size,
                                               padding_idx = padding)
        # LSTMCells (not LSTM) because the attention query depends on the output of the
        # previous step, so the sequence has to be unrolled one step at a time.
        self.lstm_cells         = torch.nn.Sequential(
                                # The first cell consumes [character embedding ; attention context].
                                nn.LSTMCell(input_size = embed_size + decoder_output_size,
                                            hidden_size = decoder_hidden_size),
                                nn.LSTMCell(input_size = decoder_hidden_size,
                                            hidden_size = decoder_hidden_size),
                                nn.LSTMCell(input_size = decoder_hidden_size,
                                            hidden_size = decoder_hidden_size)
                                )
        # Classification layer over [projected query ; context].
        self.char_prob          = nn.Linear(2 * decoder_output_size, vocab_size)
        self.char_prob.weight   = self.embedding.weight # Weight tying
        self.attention          = attention_module

        self.beam_width = beam

    def forward(self, encoder_outputs, encoder_lens, y = None, tf_rate = 1, Gumbel = False):
        '''
        Args:
            encoder_outputs: (batch_size, encoder timesteps, features) encoder output
            encoder_lens:    (batch_size,) valid encoder length per sequence
            y:               (batch_size, target length) transcript; required in training mode
            tf_rate:         probability of feeding the ground-truth character at a step
            Gumbel:          feed a Gumbel-softmax mixture of embeddings instead of the
                             embedding of the chosen character
        Returns:
            predictions:     per-step logits of the first surviving beam, stacked on dim 1
            attention_plot:  per-step attention weights of batch element 0, stacked on dim 1
        Raises:
            ValueError: if the module was built without an attention module.
        '''
        if self.attention is None:
            # The context, the projected query and the output layer all come from attention.
            raise ValueError("Speller_beam requires an attention module")

        batch_size, encoder_max_seq_len, _ = encoder_outputs.shape
        device = encoder_outputs.device
        if self.training:
            timesteps     = y.shape[1]          # decode exactly as many steps as the transcript
            label_embed   = self.embedding(y)   # B x seq_size x embed_size, for teacher forcing
        else:
            timesteps     = 600                 # fixed decoding budget at inference time

        # Keys, values and the padding mask are computed once per batch.
        self.attention.set_key_value_mask(encoder_outputs, encoder_lens)

        # Initial beam: SOS character, no history, zero score for every batch element.
        char        = torch.full((batch_size,), fill_value=SOS_TOKEN, dtype= torch.long).to(device)
        # Placeholder only; it is never read because step 0 always embeds SOS.
        char_prob   = torch.full((batch_size,), fill_value=1.0, dtype= torch.long).to(device)
        log_probs   = np.zeros(batch_size)
        context     = self.attention.value[:, 0, :]  # C(-1) = V(0)

        beams = [[char, [], char_prob, [None] * len(self.lstm_cells), context, log_probs, []]]
        for t in range(timesteps):
            new_beams = []
            for beam in beams:
                char = beam[0]
                char_prob = beam[2]
                context = beam[4]
                log_probs = beam[5]
                # Copy the per-beam histories: sibling beams are created from the same
                # list objects, and mutating them in place would leak one hypothesis'
                # state into another.
                predictions = list(beam[1])
                hidden_states = list(beam[3])
                attention_plot = list(beam[6])

                # Teacher forcing is only considered in training mode.
                teacher_forcing = self.training and np.random.random() <= tf_rate
                if t == 0:
                    # Nothing has been predicted yet: always start from SOS.
                    char_embed = self.embedding(char)
                elif teacher_forcing:
                    # Use ground truth
                    char_embed = label_embed[:, t-1, :]
                elif Gumbel:
                    char_embed = torch.nn.functional.gumbel_softmax(char_prob).mm(self.embedding.weight)
                else:
                    char_embed = self.embedding(char)

                # B x embed_size + B x projection_size --> B x (embed_size + projection_size)
                decoder_input_embedding = torch.cat((char_embed, context), dim = 1)
                # Each cell is fed the hidden state of the cell below it (current step)
                # together with its own (h, c) from the previous step.
                for i in range(len(self.lstm_cells)):
                    hidden_states[i] = self.lstm_cells[i](decoder_input_embedding, hidden_states[i])
                    decoder_input_embedding = hidden_states[i][0]  # B x decoder_hidden_size
                # The decoder output is the hidden state of the last cell.
                decoder_output_embedding = hidden_states[-1][0]

                context, attention_weights = self.attention(decoder_output_embedding)
                # Only batch element 0 is tracked for plotting.
                attention_plot.append(attention_weights[0].detach().cpu())

                # [projected query ; context] --> logits over the vocabulary
                output_embedding = torch.cat((self.attention.query, context), dim=1)
                char_prob        = self.char_prob(output_embedding)  # B x vocab_size
                predictions.append(char_prob)

                # char_prob holds raw logits, so scores must come from log_softmax;
                # taking log() of the logits directly gives NaN for negative values.
                step_log_probs = torch.nn.functional.log_softmax(char_prob, dim = 1)
                num_expand = min(self.beam_width, step_log_probs.size(1))
                values, indices = torch.topk(step_log_probs, num_expand, dim = 1)  # B x num_expand
                for i in range(num_expand):
                    logprob = log_probs + values[:, i].detach().cpu().numpy()
                    new_beams.append([indices[:, i], predictions, char_prob, hidden_states,
                                      context, logprob, attention_plot])
            # NOTE: this is the three-argument `prune` of the batch-size-1 beam cell; the
            # later "Beam2" cell rebinds `prune` with a two-argument signature, so run order matters.
            beams = prune(new_beams, self.beam_width, batch_size)

        # Report the first surviving beam (whether it is the best one depends on how
        # `prune` orders its output).
        beam = beams[0]
        attention_plot  = torch.stack(beam[6], dim =1)
        predictions     = torch.stack(beam[1], dim = 1)

        return predictions, attention_plot

#### Beam2 (batch size > 1)


In [None]:
def prune(new_beams, beam_width):
    """
    Keep, for every batch element, the `beam_width` candidates with the highest log-prob.

    Each candidate is laid out as
        [char, predictions, char_prob, hidden_states, context, logprob, seq, ...]
    where every tensor is batch-first and
        char:          (B,)
        char_prob:     (B, vocab_size)
        hidden_states: list of (h, c) pairs, one per LSTM cell, each (B, hidden_size)
        context:       (B, projection_size)
        logprob:       (B,)
    `predictions` and `seq` may have any trailing shape as long as it is the same
    for all candidates. Entries after index 6 are ignored.

    Selection uses indexing rather than a one-hot multiply-and-sum, so a non-selected
    candidate holding inf/NaN (e.g. a log-prob of -inf) cannot contaminate the chosen
    rows, and any number of LSTM cells is supported.

    Returns a list of at most `beam_width` beams
        [char, predictions, char_prob, hidden_states, context, logprob, seq]
    where beam i holds, for every batch element, the i-th best candidate of that element.
    """
    n = len(new_beams)
    if n == 0:
        return []

    def stack(getter):
        # One (B, ...) tensor per candidate --> (B, n, ...)
        return torch.stack([getter(candidate) for candidate in new_beams], dim=1)

    char = stack(lambda b: b[0])
    predictions = stack(lambda b: b[1])
    char_prob = stack(lambda b: b[2])
    context = stack(lambda b: b[4])
    logprob = stack(lambda b: b[5])  # B x n
    seq = stack(lambda b: b[6])
    hidden = [
        (stack(lambda b, cell=cell: b[3][cell][0]), stack(lambda b, cell=cell: b[3][cell][1]))
        for cell in range(len(new_beams[0][3]))
    ]

    # Never ask topk for more entries than there are candidates.
    keep = min(beam_width, n)
    _, indice = torch.topk(logprob, keep, dim=1)  # B x keep
    rows = torch.arange(logprob.size(0), device=logprob.device)

    beams = []
    for i in range(keep):
        chosen = indice[:, i]  # candidate index per batch element, (B,)

        def take(stacked, chosen=chosen):
            # (B, n, ...) --> (B, ...): row b comes from candidate chosen[b]
            return stacked[rows, chosen]

        i_hidden_states = [(take(h), take(c)) for h, c in hidden]
        beams.append([
            take(char),
            take(predictions).contiguous(),
            take(char_prob),
            i_hidden_states,
            take(context),
            take(logprob),
            take(seq),
        ])
    return beams

In [None]:
from torch._C import get_num_interop_threads
class Speller(torch.nn.Module):
    """Attention decoder that expands hypotheses with a batched beam search.

    Every beam is a 7-item list:
    ``[char, predictions, char_prob, hidden_states, context, log_probs, seq]``
    where each tensor carries one entry per batch element.  Pruning is
    delegated to the module-level ``prune(new_beams, beam_width)`` helper.
    """

    def __init__(self, embed_size, decoder_hidden_size, decoder_output_size, vocab_size, lstmcell_drop, attention_module=None, beam=3):
        """Build the embedding, the three stacked LSTM cells and the output layer.

        Args:
            embed_size: width of the character embedding.
            decoder_hidden_size: hidden size of every LSTM cell.
            decoder_output_size: width of the attention context fed to the first
                cell; the output layer consumes ``2 * decoder_output_size``.
            vocab_size: number of output classes.
            lstmcell_drop: dropout probability applied to each cell's (h, c).
            attention_module: module exposing ``set_key_value_mask``, ``value``,
                ``query`` and a call returning ``(context, attention_weights)``.
            beam: number of hypotheses kept after each step.
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(num_embeddings=self.vocab_size,
                                      embedding_dim=embed_size,
                                      padding_idx=padding)
        # LSTMCells (not nn.LSTM) because decoding advances one timestep at a
        # time and the attention query depends on the current step's output.
        self.lstm_cells = torch.nn.Sequential(
            nn.LSTMCell(input_size=embed_size + decoder_output_size,
                        hidden_size=decoder_hidden_size),
            nn.LSTMCell(input_size=decoder_hidden_size,
                        hidden_size=decoder_hidden_size),
            nn.LSTMCell(input_size=decoder_hidden_size,
                        hidden_size=decoder_hidden_size),
        )
        self.char_prob = nn.Linear(2 * decoder_output_size, vocab_size)
        # Weight tying: the output projection shares the embedding matrix, so
        # embed_size has to equal 2 * decoder_output_size for this to work.
        self.char_prob.weight = self.embedding.weight
        self.attention = attention_module

        self.beam_width = beam
        self.drop = nn.Dropout(p=lstmcell_drop)

    def forward(self, encoder_outputs, encoder_lens, y=None, tf_rate=1, Gumbel=False):
        """Decode the encoder output and return the best beam.

        Args:
            encoder_outputs: 3-D encoder output; its first dimension is the batch.
            encoder_lens: per-utterance lengths, forwarded to the attention mask.
            y: target token ids (B x T); required in training mode, where it
                fixes the number of steps and supplies teacher-forcing inputs.
            tf_rate: probability of teacher forcing, drawn once per timestep.
            Gumbel: in training, when a step is not teacher forced, feed a
                Gumbel-softmax mixture of embeddings instead of the hard token.

        Returns:
            ``(predictions, seq)`` of the first beam after pruning:
            ``predictions`` holds the per-step logits (B x T x vocab_size) and
            ``seq`` the chosen token ids including the leading SOS.
        """
        # NOTE(review): `self.attention.value` / `.query` are read
        # unconditionally below, so attention_module=None is not usable here.
        batch_size, _, _ = encoder_outputs.shape
        if self.training:
            timesteps = y.shape[1]           # decode exactly as many steps as the transcript has
            label_embed = self.embedding(y)  # B x T x embed_size, used for teacher forcing
        else:
            timesteps = 600                  # fixed decoding budget at inference
        if self.attention is not None:
            # Keys, values and the padding mask are set once per batch.
            self.attention.set_key_value_mask(encoder_outputs, encoder_lens)

        # Initial hypothesis: every sequence starts with SOS.
        char = torch.full((batch_size,), fill_value=SOS_TOKEN, dtype=torch.long).to(DEVICE)
        seq = torch.full((batch_size, 1), fill_value=SOS_TOKEN, dtype=torch.long).to(DEVICE)
        # Placeholder only; real logits replace it after the first step.
        char_prob = torch.full((batch_size,), fill_value=1.0, dtype=torch.long).to(DEVICE)
        log_probs = torch.full((batch_size,), fill_value=0.0).to(DEVICE)
        hidden_states = [None] * len(self.lstm_cells)
        context = self.attention.value[:, 0, :]  # C(-1) = V(0)

        beams = [[char, [], char_prob, hidden_states, context, log_probs, seq]]
        for t in range(timesteps):
            new_beams = []
            if self.training:
                # One teacher-forcing decision per timestep, shared by all beams.
                teacher_forcing = np.random.random() <= tf_rate

            for beam in beams:
                char = beam[0].clone()
                if t > 0:
                    predictions = beam[1].clone()
                char_prob = beam[2].clone()
                # Shallow copy: sibling beams share this list, so it must not
                # be mutated in place.
                hidden_states = beam[3].copy()
                context = beam[4].clone()
                log_probs = beam[5].clone()
                seq = beam[6].clone()

                if self.training:
                    if t == 0:
                        char_embed = self.embedding(char)
                    elif teacher_forcing:
                        char_embed = label_embed[:, t - 1, :]
                    elif Gumbel:
                        char_embed = torch.nn.functional.gumbel_softmax(char_prob).mm(self.embedding.weight)
                    else:
                        char_embed = self.embedding(char)
                else:
                    # A beam is finished only when *every* sequence in the batch
                    # emitted EOS.  Comparing the sum of ids against
                    # EOS_TOKEN * batch_size could also match unfinished beams.
                    if torch.all(char == EOS_TOKEN):
                        new_beams.append(beam)
                        continue
                    char_embed = self.embedding(char)

                # B x (embed_size + context width)
                decoder_input_embedding = torch.cat((char_embed, context), dim=1)
                for i, cell in enumerate(self.lstm_cells):
                    h, c = cell(decoder_input_embedding, hidden_states[i])
                    hidden_states[i] = (self.drop(h), self.drop(c))
                    decoder_input_embedding = hidden_states[i][0]  # B x decoder_hidden_size
                decoder_output_embedding = hidden_states[-1][0]

                if self.attention is not None:
                    context, _ = self.attention(decoder_output_embedding)
                output_embedding = torch.cat((self.attention.query, context), dim=1)
                char_prob = self.char_prob(output_embedding)  # B x vocab_size logits

                if t > 0:
                    predictions = torch.cat((predictions, char_prob.unsqueeze(2)), dim=2)
                else:
                    predictions = char_prob.unsqueeze(2)  # B x vocab_size x T

                # log_softmax with an explicit dim: stable (no log(0) = -inf)
                # and independent of softmax's deprecated implicit-dim rule.
                step_log_probs = F.log_softmax(char_prob, dim=1)
                values, indices = torch.topk(step_log_probs, self.beam_width, dim=1)  # B x beam_width each
                for k in range(self.beam_width):
                    next_char = indices[:, k]              # B
                    next_log_probs = log_probs + values[:, k]  # B
                    next_seq = torch.cat((seq, next_char.unsqueeze(1)), dim=1)
                    new_beams.append([next_char, predictions, char_prob, hidden_states,
                                      context, next_log_probs, next_seq])

            beams = prune(new_beams, self.beam_width)

        best = beams[0]  # first beam returned by prune
        predictions = torch.transpose(best[1], 1, 2).contiguous()  # B x T x vocab_size
        seq = best[6]  # B x (T + 1), leading SOS included
        return predictions, seq

#### Beam node (failed attempt)

In [None]:
class BeamSearchNode:
    """One step of a beam-search hypothesis, linked to its predecessor.

    The constructor stores every argument verbatim; no validation or copying
    is done.

    Args:
        prev_node: node this one was expanded from, or ``None`` for a root.
        char: token chosen at this step.
        predictions: output distribution recorded for this step.
        char_prob: raw scores the token was drawn from.
        hidden_states: decoder LSTM states after this step.
        context: attention context after this step.
        log_probs: accumulated score of the hypothesis.
        attention_plot: attention weights recorded for this step.
        length: number of steps taken so far.  Defaults to 0 so that callers
            which do not track the length (the expansion call in ``Speller``
            passes only eight arguments) no longer fail with ``TypeError``.
    """

    def __init__(self, prev_node, char, predictions, char_prob, hidden_states, context, log_probs, attention_plot, length=0):
        self.prev_node = prev_node
        self.char = char
        self.predictions = predictions
        self.char_prob = char_prob
        self.hidden_states = hidden_states
        self.context = context
        self.log_probs = log_probs
        self.attention_plot = attention_plot
        self.length = length

In [None]:
def prune(new_beams, score, beam_width, batch_size):
    """Keep the ``beam_width`` highest-scoring candidates.

    Args:
        new_beams: list of candidates (e.g. ``BeamSearchNode`` objects).
        score: one score per candidate, in the same order as ``new_beams``.
        beam_width: number of candidates to keep.
        batch_size: accepted for call compatibility; not used.

    Returns:
        ``new_beams`` itself when it already holds at most ``beam_width``
        entries, otherwise a new list ordered from best to worst score.
    """
    if len(new_beams) <= beam_width:
        return new_beams
    _, best_positions = torch.topk(torch.tensor(score), beam_width)
    return [new_beams[position] for position in best_positions]

In [None]:
from torch._C import get_num_interop_threads
class Speller(torch.nn.Module):
    """Node-based beam-search decoder kept from an abandoned experiment.

    The notebook heading above this cell marks it as a failed attempt.
    NOTE(review): ``forward`` cannot run to completion as written; the inline
    NOTE(review) comments point at the responsible lines.  The code is left
    untouched on purpose -- a later cell redefines ``Speller``.
    """

    def __init__(self, embed_size, decoder_hidden_size, decoder_output_size, vocab_size, attention_module= None, beam = 4):
        """Create the embedding, three stacked LSTM cells and the tied output layer.

        ``beam`` is stored as ``self.beam_width``; ``attention_module`` is
        stored as ``self.attention``.
        """
        super().__init__()
        self.vocab_size         = vocab_size
        # `padding` is a module-level name defined elsewhere in the notebook.
        self.embedding          = nn.Embedding(num_embeddings = self.vocab_size, 
                                               embedding_dim = embed_size,
                                               padding_idx = padding)
        self.lstm_cells         = torch.nn.Sequential(
                                # Three stacked LSTM cells. The first one consumes the
                                # character embedding concatenated with the attention context.
                                nn.LSTMCell(input_size = embed_size + decoder_output_size, 
                                            hidden_size = decoder_hidden_size),
                                nn.LSTMCell(input_size = decoder_hidden_size, 
                                            hidden_size = decoder_hidden_size),
                                nn.LSTMCell(input_size = decoder_hidden_size, 
                                            hidden_size = decoder_hidden_size)
                                )
                                # LSTMCells are used because decoding processes one timestep
                                # at a time rather than a whole sequence.
        # Classification layer producing one score per vocabulary entry.
        self.char_prob          = nn.Linear(2 * decoder_output_size, vocab_size)
        self.char_prob.weight   = self.embedding.weight # Weight tying
        self.attention          = attention_module

        self.beam_width = beam

    def forward(self, encoder_outputs, encoder_lens, y = None, tf_rate = 1, Gumbel = False): 
        '''
        Intended to beam-search decode ``encoder_outputs`` and return
        ``(predictions, attention_plot)`` reconstructed by walking the
        ``prev_node`` chain of the first surviving node.

        Args:
            encoder_outputs: 3-D tensor; unpacked as (batch, max_len, _).
            encoder_lens: forwarded to ``attention.set_key_value_mask``.
            y: target token ids; read only in training mode.
            tf_rate: teacher-forcing probability.
            Gumbel: use a Gumbel-softmax mixture of embeddings as the next input.
        ''' 
        batch_size, encoder_max_seq_len, _ = encoder_outputs.shape # B, L
        if self.training:
            timesteps     = y.shape[1] # number of steps = transcript length during training
            label_embed   = self.embedding(y) # transcript embeddings, used for teacher forcing
            # B x seq_size --> B x seq_size x embed_size
        else:
            timesteps     = 600 # fixed decoding budget at inference
        # Set attention key, value and padding mask just once per batch.
        if self.attention != None:
            self.attention.set_key_value_mask(encoder_outputs, encoder_lens)
        # INITS
        predictions     = None
        attention_plot  = None
        # First decoder input is SOS for every batch element.
        char            = torch.full((batch_size,), fill_value=SOS_TOKEN, dtype= torch.long).to(DEVICE) 
        char_prob       = torch.full((batch_size,), fill_value=1.0, dtype= torch.long).to(DEVICE)
        log_probs       =  torch.full((batch_size,), fill_value=0.0).to(DEVICE) 
        # One (h, c) slot per LSTM cell, initially None.
        hidden_states   = [None]*len(self.lstm_cells) #len(self.decoder.lstm_cells) 
        # NOTE(review): read unconditionally, so attention_module=None fails here.
        context                 = self.attention.value[:, 0, :] # C(-1) = V(0)
        attention_weights       = torch.zeros(batch_size, encoder_max_seq_len) # stays zero when no attention module is used
        beam = []
        for i in range(batch_size):
            # NOTE(review): `predictions` is still None here, so `predictions[i]`
            # raises TypeError on the first iteration.
            node = BeamSearchNode(None, char[i], predictions[i], char_prob[i], hidden_states, context, log_probs, attention_plot, 0)
            beam.append(node)
        beams = [[beam]]
        for t in range(timesteps):
            new_beams = []
            score = []
            for beam in beams:
              # NOTE(review): the loop variable `beam` is never read; the body uses
              # `node`, i.e. whatever node was bound last.
              char = node.char
              predictions = node.predictions
              char_prob = node.char_prob
              hidden_states = node.hidden_states
              context = node.context
              log_probs = node.log_probs

              if self.training:
                  # Teacher forcing: with probability tf_rate feed the ground-truth
                  # character instead of the previous prediction.
                  if np.random.random() <= tf_rate :
                      teacher_forcing = True 
                  else:
                      teacher_forcing = False 
                  if t == 0:
                      char_embed = self.embedding(char)
                  else:
                    if teacher_forcing:
                        # Use ground truth
                        char_embed = label_embed[:, t-1, :]
                    else:
                        if Gumbel:
                            char_embed = torch.nn.functional.gumbel_softmax(char_prob).mm(self.embedding.weight)
                        else:
                            char_embed = self.embedding(char)
              else:
                  if Gumbel:
                      char_embed = torch.nn.functional.gumbel_softmax(char_prob).mm(self.embedding.weight)
                  else:
                      char_embed = self.embedding(char)
              # Decoder input = character embedding concatenated with the context.
              decoder_input_embedding =  torch.cat((char_embed, context), dim = 1)
              # Run the stacked cells; each cell feeds its hidden state to the next.
              # NOTE(review): `hidden_states` is the node's own list and is mutated in
              # place, so every node created from it shares the same state list.
              for i in range(len(self.lstm_cells)):
                  hidden_states[i] = self.lstm_cells[i](decoder_input_embedding, hidden_states[i]) 
                  decoder_input_embedding = hidden_states[i][0] # B x decoder_hidden_size
              # The decoder output is the hidden state of the last LSTM cell.
              decoder_output_embedding = hidden_states[-1][0] # B x decoder_hidden_size
              if self.attention != None:
                  context, attention_weights = self.attention(decoder_output_embedding) # the attention module also stores the projected query
              attention_plot = attention_weights[0].detach().cpu()
              # B x projection_size + B x projection_size = B x 2 projection_size
              output_embedding     =  torch.cat((self.attention.query, context), dim=1)
              char_prob            = self.char_prob(output_embedding) # B x vocab_size
              # Only the current step's scores are stored on the node.
              predictions = char_prob
              # NOTE(review): top-k is taken over the raw linear outputs, and the
              # log below is applied to those values rather than to probabilities;
              # negative values give NaN.
              values, indices = torch.topk(char_prob, self.beam_width, dim = 1) #B x beam_width
              for i in range(self.beam_width):
                  char = indices[:, i]
                  logprob = log_probs + torch.log(values[:, i])
                  # NOTE(review): eight arguments are passed, but the BeamSearchNode
                  # constructor shown above requires a ninth (`length`).
                  new_node = BeamSearchNode(node, char, predictions, char_prob, hidden_states, context, logprob, attention_plot)
                  new_beams.append(new_node)
                  # Candidate score = log-probability summed over the batch.
                  score.append(torch.sum(logprob))
            beams = prune(new_beams, score, self.beam_width, batch_size)
        node = beams[0] # choose the top 1 result
        predictions = []
        attention_plot = []
        # Walk back through prev_node links, one hop per timestep.
        for t in range(timesteps):
            predictions.append(node.predictions)
            attention_plot.append(node.attention_plot)
            node = node.prev_node     
        # Restore chronological order before stacking along dim 1.
        predictions     = predictions[::-1]
        attention_plot  = attention_plot[::-1]
        attention_plot  = torch.stack(attention_plot, dim =1)
        predictions     = torch.stack(predictions, dim = 1)
        return predictions, attention_plot

#### Greedy


In [None]:
from torch._C import get_num_interop_threads
class Speller(torch.nn.Module):
    """Attention decoder with greedy (arg-max) decoding.

    At every timestep the previous character embedding is concatenated with the
    attention context, pushed through three stacked LSTM cells (dropout applied
    to each cell's h and c), and the attention query + new context are projected
    to vocabulary scores.
    """

    def __init__(self, embed_size, decoder_hidden_size, decoder_output_size, vocab_size, lstmcell_drop, attention_module=None):
        """Create the embedding, LSTM cells, dropout and tied output layer.

        Args:
            embed_size: width of the character embedding.
            decoder_hidden_size: hidden size of every LSTM cell.
            decoder_output_size: width of the attention context fed to the first
                cell; the output layer consumes ``2 * decoder_output_size``.
            vocab_size: number of output classes.
            lstmcell_drop: dropout probability applied to each cell's (h, c).
            attention_module: module exposing ``set_key_value_mask``, ``value``,
                ``query`` and a call returning ``(context, attention_weights)``.
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(num_embeddings=self.vocab_size,
                                      embedding_dim=embed_size,
                                      padding_idx=padding)
        # LSTMCells (not nn.LSTM) because decoding advances one timestep at a
        # time and the attention query depends on the current step's output.
        self.lstm_cells = torch.nn.Sequential(
            nn.LSTMCell(input_size=embed_size + decoder_output_size,
                        hidden_size=decoder_hidden_size),
            nn.LSTMCell(input_size=decoder_hidden_size,
                        hidden_size=decoder_hidden_size),
            nn.LSTMCell(input_size=decoder_hidden_size,
                        hidden_size=decoder_hidden_size),
        )
        self.drop = nn.Dropout(p=lstmcell_drop)
        self.char_prob = nn.Linear(2 * decoder_output_size, vocab_size)
        # Weight tying: the output projection shares the embedding matrix, so
        # embed_size has to equal 2 * decoder_output_size for this to work.
        self.char_prob.weight = self.embedding.weight
        self.attention = attention_module

    def forward(self, encoder_outputs, encoder_lens, y=None, tf_rate=1, Gumbel=False):
        """Greedily decode the encoder output.

        Args:
            encoder_outputs: 3-D encoder output, unpacked as (batch, max_len, _).
            encoder_lens: per-utterance lengths, forwarded to the attention mask.
            y: target token ids (B x T); required in training mode, where it
                fixes the number of steps and supplies teacher-forcing inputs.
            tf_rate: probability of teacher forcing, drawn once per timestep.
            Gumbel: when a step is not teacher forced, feed a Gumbel-softmax
                mixture of embeddings built from the previous step's scores
                instead of the embedding of the arg-max character.

        Returns:
            ``(predictions, attention_plot)``: the per-step scores stacked along
            dim 1 (B x T x vocab_size) and the attention weights of the first
            batch element stacked along dim 1.
        """
        # NOTE(review): `self.attention.value` / `.query` are read
        # unconditionally below, so attention_module=None is not usable here.
        batch_size, encoder_max_seq_len, _ = encoder_outputs.shape
        if self.training:
            timesteps = y.shape[1]           # decode exactly as many steps as the transcript has
            label_embed = self.embedding(y)  # B x T x embed_size, used for teacher forcing
        else:
            timesteps = 600                  # fixed decoding budget at inference
        if self.attention is not None:
            # Keys, values and the padding mask are set once per batch.
            self.attention.set_key_value_mask(encoder_outputs, encoder_lens)

        predictions = []
        attention_plot = []
        # First decoder input is SOS for every batch element.
        char = torch.full((batch_size,), fill_value=SOS_TOKEN, dtype=torch.long).to(DEVICE)
        hidden_states = [None] * len(self.lstm_cells)  # one (h, c) slot per cell
        context = self.attention.value[:, 0, :]        # C(-1) = V(0)
        # Recorded as-is when no attention module overwrites it.
        attention_weights = torch.zeros(batch_size, encoder_max_seq_len)

        for t in range(timesteps):
            # The teacher-forcing draw happens on every training step
            # (including t == 0) so the random stream matches earlier runs.
            teacher_forcing = self.training and np.random.random() <= tf_rate

            if t == 0:
                # No previous scores exist yet, so the first input is always the
                # SOS embedding.  Previously eval mode with Gumbel=True read
                # `char_prob` here before it was ever assigned.
                char_embed = self.embedding(char)
            elif teacher_forcing:
                char_embed = label_embed[:, t - 1, :]
            elif Gumbel:
                char_embed = torch.nn.functional.gumbel_softmax(char_prob).mm(self.embedding.weight)
            else:
                char_embed = self.embedding(char)

            # B x (embed_size + context width)
            decoder_input_embedding = torch.cat((char_embed, context), dim=1)
            # Each cell receives the (dropped-out) hidden state of the cell
            # below it plus its own state from the previous timestep.
            for i, cell in enumerate(self.lstm_cells):
                h, c = cell(decoder_input_embedding, hidden_states[i])
                hidden_states[i] = (self.drop(h), self.drop(c))
                decoder_input_embedding = hidden_states[i][0]  # B x decoder_hidden_size
            decoder_output_embedding = hidden_states[-1][0]

            if self.attention is not None:
                context, attention_weights = self.attention(decoder_output_embedding)
            attention_plot.append(attention_weights[0].detach().cpu())

            # Projected query and context together form the output embedding.
            output_embedding = torch.cat((self.attention.query, context), dim=1)
            char_prob = self.char_prob(output_embedding)  # B x vocab_size
            predictions.append(char_prob)

            # Greedy choice of the next input character.
            char = torch.argmax(char_prob, dim=1)

        attention_plot = torch.stack(attention_plot, dim=1)
        predictions = torch.stack(predictions, dim=1)

        return predictions, attention_plot

## Sequence-to-Sequence Model

### LAS

In [None]:
class LAS(torch.nn.Module):
    """Sequence-to-sequence model: a ``Listener`` encoder feeding a ``Speller`` decoder.

    The ``Attention`` module is created here and handed to the decoder, so it
    is owned by ``self.decoder``.
    """

    def __init__(self, input_size, cnn, layer, cnn_drop, lstm_drop, lstmcell_drop,
                 encoder_hidden_size, vocab_size, embed_size, decoder_hidden_size,
                 decoder_output_size, projection_size= 128):
        """Instantiate encoder, attention and decoder from the given hyper-parameters.

        ``input_size``, ``cnn``, ``layer``, ``cnn_drop``, ``lstm_drop`` and
        ``encoder_hidden_size`` go to the encoder; ``embed_size``,
        ``decoder_hidden_size``, ``decoder_output_size``, ``vocab_size`` and
        ``lstmcell_drop`` go to the decoder.
        """
        super().__init__()

        listener_args = dict(
            input_size=input_size,
            cnn=cnn,
            encoder_hidden_size=encoder_hidden_size,
            layer=layer,
            cnn_drop=cnn_drop,
            lstm_drop=lstm_drop,
        )
        self.encoder = Listener(**listener_args)

        # The attention module's `decoder_output_size` argument receives the
        # decoder *hidden* size here.
        attention = Attention(
            encoder_hidden_size=encoder_hidden_size,
            decoder_output_size=decoder_hidden_size,
            projection_size=projection_size,
        )

        speller_args = dict(
            embed_size=embed_size,
            decoder_hidden_size=decoder_hidden_size,
            decoder_output_size=decoder_output_size,
            vocab_size=vocab_size,
            lstmcell_drop=lstmcell_drop,
            attention_module=attention,
        )
        self.decoder = Speller(**speller_args)

    def forward(self, x, x_lens, y = None, tf_rate = 1, gumbel = False):
        """Encode ``x`` and decode it; returns the decoder's two outputs as a tuple."""
        enc_out, enc_lens = self.encoder(x, x_lens)
        decoded, plot = self.decoder(enc_out, enc_lens, y, tf_rate, Gumbel=gumbel)
        return decoded, plot

# Training Setup

In [None]:
# Global hyper-parameter dictionary for the run tagged 'try6_mask'.
# Note: a later cell assigns `config` again, replacing these values.
config = dict(
    batch_size=96,
    epochs=60,
    lr=1e-3,
    model='try6_mask',
    cnn=[64, 128],
    layer=[3, 3],
    cnn_drop=0.1,
    lstm_drop=[0.2, 0.2, 0.2, 0.2],
    encoder_hidden_size=256,
    embed_size=256,
    decoder_hidden_size=256,
    decoder_output_size=128,
)

In [None]:
# Global hyper-parameter dictionary for the run tagged 'try7'
# (per-layer encoder dropout and a 512-wide decoder hidden state).
config = dict(
    batch_size=96,
    epochs=60,
    lr=1e-3,
    model='try7',
    cnn=[64, 128],
    layer=[3, 3],
    cnn_drop=0.1,
    lstm_drop=[0.2, 0.3, 0.4, 0.5],
    encoder_hidden_size=256,
    embed_size=256,
    decoder_hidden_size=512,
    decoder_output_size=128,
)

## Model Setup










In [None]:
# Baseline LAS configuration suggested by the assignment:
#   Encoder bLSTM/pbLSTM hidden dimension of 512 (256 per direction)
#   Decoder embedding layer dimension of 256
#   Decoder hidden dimension of 512
#   Decoder output dimension of 128
#   Attention projection size of 128
# Feel free to experiment with these.
#
# NOTE(review): the arguments below are literals and are not read from the
# `config` dict; several differ from the most recent `config` (e.g.
# decoder_hidden_size is 256 here vs 512 there, lstm_drop is 0.3 per layer
# here vs [0.2, 0.3, 0.4, 0.5] there). Confirm which set is intended.
model = LAS(
    # Read the paper and think about what dimensions should be used.
    # You can experiment with these as well, but that is not required for the early submission.
    # With weight tying, embed_size has to equal 2 * decoder_output_size.
          input_size = 15, 
          cnn = [64, 128],
          layer = [3, 3],
          cnn_drop = 0.1,
          lstm_drop = [0.3,0.3,0.3,0.3],
          lstmcell_drop = 0.10,
          encoder_hidden_size = 256, 
          vocab_size = len(VOCAB), 
          embed_size = 256,
          decoder_hidden_size =  256, 
          decoder_output_size = 128,
          projection_size= 128
)
print(model)
# Moves the model to DEVICE; as the last expression of the cell its repr is also displayed.
model.to(DEVICE)

In [None]:
# Free unreachable Python objects first, then ask PyTorch to release its
# cached, currently unused CUDA memory.
gc.collect()
torch.cuda.empty_cache()

## Optimizer, Scheduler, Loss

In [None]:
class XEntLoss(torch.nn.Module):
    """Per-token cross entropy that ignores the last vocabulary class.

    The loss is returned unreduced (one value per token) so that the caller
    can mask padded positions before averaging.
    """

    def __init__(self):
        super().__init__()

    def forward(self, prob_list, y):
        """Compute the unreduced loss.

        Args:
            prob_list: raw logits of shape (batch, timesteps, num_classes).
            y: integer class indices of shape (batch, timesteps).

        Returns:
            1-D tensor with batch * timesteps entries. Tokens whose target is
            the last class index contribute a loss of 0, because that class is
            sliced away before the sum.
        """
        # log_softmax stays finite where log(softmax(x)) underflows to -inf,
        # which would otherwise turn into NaN when multiplied by a zero weight.
        log_probs = F.log_softmax(prob_list, dim=2)
        # Take the class count from the logits; it has to match for the
        # element-wise product below anyway.
        y_one = F.one_hot(y, num_classes=prob_list.size(2))
        loss = torch.sum(-y_one[:, :, :-1] * log_probs[:, :, :-1], dim=2)
        return loss.view(-1)

In [None]:
# Adam with AMSGrad and a small weight decay; the learning rate comes from config['lr'].
optimizer   = torch.optim.Adam(model.parameters(), lr= config['lr'], amsgrad= True, weight_decay= 5e-6)
# reduction='none' keeps one loss value per token, so train() can multiply the
# losses by a padding mask before averaging over the real tokens only.
criterion   = torch.nn.CrossEntropyLoss(reduction='none') # Why are we using reduction = 'none' ? 
#criterion = nn.CTCLoss(blank = VOCAB_MAP[" "])
# criterion = XEntLoss()
# Gradient scaler for mixed-precision training; train() reads it as a global.
scaler      = torch.cuda.amp.GradScaler()
# Multiplies the LR by 0.98 when the metric passed to scheduler.step() plateaus (patience=4).
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.98, patience=4, verbose=True, threshold=1e-2)
# Optional: Create a custom class for a Teacher Force Schedule 

# Levenshtein Distance

In [None]:
# We have given you this utility function which takes a sequence of indices and converts them to a list of characters
def indices_to_chars(indices, vocab):
    """Translate a sequence of token indices into the corresponding vocab entries.

    SOS tokens are dropped wherever they appear; decoding stops at the first
    EOS token, which is not included in the result.
    """
    decoded = []
    for idx in indices:
        token_id = int(idx)
        if token_id == SOS_TOKEN:
            # Start-of-sequence markers never reach the output.
            continue
        if token_id == EOS_TOKEN:
            # Everything after end-of-sequence is ignored.
            break
        decoded.append(vocab[idx])
    return decoded

# To make your life more easier, we have given the Levenshtein distantce / Edit distance calculation code
def calc_edit_distance(predictions, y, ly, vocab= VOCAB, print_example= False):
    """Return the mean Levenshtein distance between predictions and targets.

    Args:
        predictions: 2-D array of predicted token indices, one row per utterance.
        y: target token indices, indexed as y[row, 0:length].
        ly: per-utterance target lengths used to slice `y`.
        vocab: index-to-character lookup.
        print_example: when True, print the last utterance of the batch.
    """
    num_utterances, _ = predictions.shape
    total = 0
    for row in range(num_utterances):
        # Strings - when you are using characters from the AudioDataset.
        y_string = ''.join(indices_to_chars(y[row, 0:ly[row]], vocab))
        pred_string = ''.join(indices_to_chars(predictions[row], vocab))
        # For the toy dataset, compare the token lists instead of the joined strings.
        total += Levenshtein.distance(pred_string, y_string)

    if print_example:
        # Only the final pair of the batch is still in scope here.
        print("Ground Truth : ", y_string)
        print("Prediction   : ", pred_string)

    return total / num_utterances

# Train and Evaluate

In [None]:
from tqdm.std import TRLock
def train(model, dataloader, criterion, optimizer, teacher_forcing_rate):
    """Run one mixed-precision training epoch.

    `criterion` must return an unreduced, per-token loss. The module-level
    `scaler` (a GradScaler) is used for loss scaling.

    Returns:
        (mean masked loss, mean perplexity, attention plot of the last batch).
    """
    model.train()
    progress = tqdm(total=len(dataloader), dynamic_ncols=True, leave=False, position=0, desc='Train')

    loss_total = 0.0
    perplexity_total = 0.0

    for step, (x, y, lx, ly) in enumerate(dataloader, start=1):
        optimizer.zero_grad()

        # Only the padded tensors go to the device; the length tensors stay put.
        x = x.to(DEVICE)
        y = y.to(DEVICE)

        with torch.cuda.amp.autocast():
            predictions, attention_plot = model(x, lx, y, tf_rate= teacher_forcing_rate, gumbel = True)

            # Flatten (batch, timesteps, vocab) -> (batch * timesteps, vocab) and
            # (batch, timesteps) -> (batch * timesteps,) for the criterion.
            vocab_dim = predictions.size(2)
            token_loss = criterion(predictions.view(-1, vocab_dim), y.view(-1))

            # (1, L) compared with (B, 1) broadcasts to a (B, L) boolean mask.
            # NOTE(review): `<=` also keeps position `ly` itself; confirm that
            # this extra step is intended and is not a padded position.
            positions = torch.arange(y.shape[1]).unsqueeze(0)
            mask = (positions <= ly.unsqueeze(1)).to(DEVICE)

            # Average the loss over unmasked positions only.
            masked_loss = torch.sum(token_loss * mask.view(-1)) / torch.sum(mask)
            perplexity = torch.exp(masked_loss)

            loss_total += masked_loss.item()
            perplexity_total += perplexity.item()

        # Backward on the scaled masked loss, then let the scaler drive the step.
        scaler.scale(masked_loss).backward()
        scaler.step(optimizer)
        scaler.update()

        progress.set_postfix(
            loss="{:.04f}".format(loss_total/step),
            perplexity="{:.04f}".format(perplexity_total/step),
            lr="{:.04f}".format(float(optimizer.param_groups[0]['lr'])),
            tf_rate='{:.02f}'.format(teacher_forcing_rate))
        progress.update()

        del x, y, lx, ly
        torch.cuda.empty_cache()

    num_batches = len(dataloader)
    mean_loss = loss_total / num_batches
    mean_perplexity = perplexity_total / num_batches
    progress.close()

    return mean_loss, mean_perplexity, attention_plot

In [None]:
def validate(model, dataloader):
    """Greedy-decode every batch and return the mean Levenshtein distance.

    Args:
        model: network called as model(x, lx, y=None); its first return value
            is expected to hold per-step class scores.
        dataloader: yields (x, y, lx, ly) batches.

    Returns:
        Average of the per-batch mean edit distances.
    """
    model.eval()

    batch_bar = tqdm(total=len(dataloader), dynamic_ncols=True, position=0, leave=False, desc="Val")

    running_lev_dist = 0.0

    for i, (x, y, lx, ly) in enumerate(dataloader):

        # Only the features are needed on the device. The targets are consumed
        # token by token on the host, so copying them to the GPU would force
        # one device sync per token inside indices_to_chars.
        x = x.to(DEVICE)

        with torch.inference_mode():
            predictions, attentions = model(x, lx, y = None)

        # Greedy decoding: most likely class at every timestep.
        greedy_predictions = predictions.argmax(-1).detach().cpu().numpy()

        # Pass print_example=True for a specific batch to inspect one utterance.
        running_lev_dist += calc_edit_distance(greedy_predictions, y, ly, VOCAB, print_example = False)

        batch_bar.set_postfix(
            dist="{:.04f}".format(running_lev_dist/(i+1)))
        batch_bar.update()

        del x, y, lx, ly
        torch.cuda.empty_cache()

    batch_bar.close()
    running_lev_dist /= len(dataloader)

    return running_lev_dist

# Train and Evaluate(Beam)

In [None]:
from tqdm.std import TRLock
def train(model, dataloader, criterion, optimizer, teacher_forcing_rate):
    """Run one mixed-precision training epoch.

    `criterion` must return an unreduced, per-token loss. The module-level
    `scaler` (a GradScaler) is used for loss scaling.

    Returns:
        (mean masked loss, mean perplexity, attention plot of the last batch).
    """
    model.train()
    progress = tqdm(total=len(dataloader), dynamic_ncols=True, leave=False, position=0, desc='Train')

    loss_total = 0.0
    perplexity_total = 0.0

    for step, (x, y, lx, ly) in enumerate(dataloader, start=1):
        optimizer.zero_grad()

        # Only the padded tensors go to the device; the length tensors stay put.
        x = x.to(DEVICE)
        y = y.to(DEVICE)

        with torch.cuda.amp.autocast():
            predictions, attention_plot = model(x, lx, y, tf_rate= teacher_forcing_rate, gumbel = True)

            # Flatten (batch, timesteps, vocab) -> (batch * timesteps, vocab) and
            # (batch, timesteps) -> (batch * timesteps,) for the criterion.
            vocab_dim = predictions.size(2)
            token_loss = criterion(predictions.view(-1, vocab_dim), y.view(-1))

            # (1, L) compared with (B, 1) broadcasts to a (B, L) boolean mask.
            # NOTE(review): `<=` also keeps position `ly` itself; confirm that
            # this extra step is intended and is not a padded position.
            positions = torch.arange(y.shape[1]).unsqueeze(0)
            mask = (positions <= ly.unsqueeze(1)).to(DEVICE)

            # Average the loss over unmasked positions only.
            masked_loss = torch.sum(token_loss * mask.view(-1)) / torch.sum(mask)
            perplexity = torch.exp(masked_loss)

            loss_total += masked_loss.item()
            perplexity_total += perplexity.item()

        # Backward on the scaled masked loss, then let the scaler drive the step.
        scaler.scale(masked_loss).backward()
        scaler.step(optimizer)
        scaler.update()

        progress.set_postfix(
            loss="{:.04f}".format(loss_total/step),
            perplexity="{:.04f}".format(perplexity_total/step),
            lr="{:.04f}".format(float(optimizer.param_groups[0]['lr'])),
            tf_rate='{:.02f}'.format(teacher_forcing_rate))
        progress.update()

        del x, y, lx, ly
        torch.cuda.empty_cache()

    num_batches = len(dataloader)
    mean_loss = loss_total / num_batches
    mean_perplexity = perplexity_total / num_batches
    progress.close()

    return mean_loss, mean_perplexity, attention_plot

In [None]:
def validate(model, dataloader):
    """Score a beam-search model by mean Levenshtein distance.

    Args:
        model: network called as model(x, lx, y=None) that returns
            (predictions, seq), where `seq` holds the decoded token indices.
        dataloader: yields (x, y, lx, ly) batches.

    Returns:
        Average of the per-batch mean edit distances.
    """
    model.eval()

    batch_bar = tqdm(total=len(dataloader), dynamic_ncols=True, position=0, leave=False, desc="Val")

    running_lev_dist = 0.0

    for i, (x, y, lx, ly) in enumerate(dataloader):

        # Only the features are needed on the device. The targets are consumed
        # token by token on the host, so copying them to the GPU would force
        # one device sync per token inside indices_to_chars.
        x = x.to(DEVICE)

        with torch.inference_mode():
            predictions, seq = model(x, lx, y = None)

        # A decoder that handles a single utterance returns a flat 1-D token
        # sequence; calc_edit_distance unpacks a (batch, seq_len) shape, so
        # promote it to one row.
        if seq.dim() == 1:
            seq = seq.unsqueeze(0)
        greedy_predictions = seq.cpu().numpy()

        # Pass print_example=True for a specific batch to inspect one utterance.
        running_lev_dist += calc_edit_distance(greedy_predictions, y, ly, VOCAB, print_example = False)

        batch_bar.set_postfix(
            dist="{:.04f}".format(running_lev_dist/(i+1)))
        batch_bar.update()

        del x, y, lx, ly
        torch.cuda.empty_cache()

    batch_bar.close()
    running_lev_dist /= len(dataloader)

    return running_lev_dist

# Wandb

In [None]:
import wandb
# Authenticate with Weights & Biases. The API key is left blank in this
# notebook; supply your own key before running and never commit it.
wandb.login(key="") 

In [None]:
# Login to Wandb
# Initialize your Wandb Run Here
# Optional: Save your model architecture in a txt file, and save the file to Wandb
# Start a run named after config['model'] and attach the whole config dict to it.
# `project` and `entity` are left blank in this notebook; set them for your workspace.
run = wandb.init(
    name = config['model'], ## Wandb creates random run names if you skip this field
    reinit = True, ### Allows reinitalizing runs when you re-run this cell
    project = '', entity = '',
    config = config)

In [None]:
# Fetch a previously logged model artifact through the active run. The artifact
# name is left blank here; `artifact_dir` receives the local download directory.
artifact = run.use_artifact('', type='model')
artifact_dir = artifact.download()

In [None]:
# Restore model and optimizer state from a single checkpoint file. The path is
# left blank in this notebook; point it at the downloaded checkpoint.
# The file is deserialised once (not once per state dict) and mapped onto
# DEVICE so that a checkpoint saved on a GPU also loads on a CPU-only runtime.
checkpoint = torch.load('', map_location=DEVICE)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

In [None]:
# Move the (re)loaded model's parameters and buffers to the selected device.
model.to(DEVICE)

In [None]:
# Sanity check: edit distance on training data, decoded without teacher forcing.
# `train_loader1` is defined elsewhere in the notebook.
train_dist = validate(model, train_loader1)

# Experiments

In [None]:
# Replace the scheduler for this experiment: a stronger decay (x0.95) with a
# shorter patience (1) than the scheduler created in the setup cell.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.95, patience=1, verbose=True, threshold=1e-2)

In [None]:
# Manual (re)start state for the training loop below.
# Only the first parameter group's learning rate is reset.
optimizer.param_groups[0]['lr'] = 0.001
# Start with full teacher forcing; the loop lowers this once validation improves.
tf_rate =1
# Best validation distance so far; checkpoints are saved only at or below this value.
best_lev_dist = 1500

In [None]:
# best_lev_dist = 1e6
# tf_rate = 1.0
#model.to(DEVICE)
# Main experiment loop: one training pass and one validation pass per epoch.
for epoch in range(config['epochs']):
    print("\nEpoch: {}/{}".format(epoch+1, config['epochs']))

    train_loss, train_perplexity, attention_plot = train(
        model, train_loader, criterion, optimizer, teacher_forcing_rate=tf_rate)
    val_dist = validate(model, val_loader)
    # The plateau scheduler is driven by the validation edit distance.
    scheduler.step(val_dist)

    print("\nEpoch {}/{}: \t Train Loss {:.04f} \t Train perplexity {:.04f} ".format(
        epoch + 1, config['epochs'], train_loss, train_perplexity))
    print("Val dist {:.04f}\t ".format(val_dist))

    # Log the metrics of this epoch (teacher-forcing rate as used for training).
    wandb.log({
        "train_loss": train_loss,
        'train_perplexity': train_perplexity,
        'validation_dist': val_dist,
        'lr': optimizer.param_groups[0]['lr'],
        'teacher_forcing_rate': tf_rate,
    })

    # Teacher-forcing schedule: once validation distance drops below 12,
    # lower the rate by 0.03 per epoch until it is no longer above 0.6.
    if val_dist < 12 and tf_rate > 0.6:
        tf_rate -= 0.03

    torch.cuda.empty_cache()

    # Nothing to save unless this epoch matched or beat the best distance.
    if not val_dist <= best_lev_dist:
        continue

    best_lev_dist = val_dist
    print("Saving model")
    # Model and optimizer state plus bookkeeping go into one local file.
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'val_dist': val_dist,
        'epoch': epoch,
    }, "Model")
    # Upload the checkpoint file to W&B as a model artifact named after the run.
    model_artifact = wandb.Artifact(config['model'], type='model')
    model_artifact.add_file("Model")
    run.log_artifact(model_artifact)
    best_val_dist = val_dist

run.finish()

In [None]:
# Manually checkpoint the current model/optimizer state, regardless of whether
# it is the best one, reusing `val_dist` and `epoch` from the last loop iteration.
# NOTE(review): the training-loop cell ends with run.finish(); confirm the run
# is still active (or re-initialise it) before logging the artifact below.
torch.save({
              'model_state_dict': model.state_dict(),
              'optimizer_state_dict': optimizer.state_dict(),
              #'scheduler_state_dict':scheduler.state_dict()
              'val_dist': val_dist, 
               'epoch': epoch
              }, "Model")
# Creating Artifact
model_artifact = wandb.Artifact(config['model'], type='model')
# Adding model file to Artifact
model_artifact.add_file("Model")
# Saving Artifact to WandB
run.log_artifact(model_artifact)

# Testing

## Greedy search

In [None]:
# Optional: Load your best model Checkpoint here

In [None]:
class LAS(torch.nn.Module):
    """Listen-Attend-Spell model: a Listener encoder feeding an attention-based Speller."""

    def __init__(self, 
                 input_size,
                 cnn,
                 layer,
                 cnn_drop,
                 lstm_drop,
                 encoder_hidden_size, 
                 vocab_size, 
                 embed_size,
                 decoder_hidden_size, 
                 decoder_output_size,
                 projection_size= 128):
        super().__init__()

        # Encoder over the input feature sequence.
        self.encoder = Listener(
            input_size = input_size,
            cnn = cnn,
            encoder_hidden_size = encoder_hidden_size,
            layer = layer,
            cnn_drop = cnn_drop,
            lstm_drop= lstm_drop,
        )

        # The attention module is owned by the decoder. Note that its
        # `decoder_output_size` argument receives the decoder *hidden* size.
        attention_module = Attention(
            encoder_hidden_size = encoder_hidden_size,
            decoder_output_size= decoder_hidden_size,
            projection_size=projection_size,
        )

        # Decoder that emits one distribution over the vocabulary per step.
        self.decoder = Speller(
            embed_size = embed_size,
            decoder_hidden_size = decoder_hidden_size,
            decoder_output_size = decoder_output_size,
            vocab_size = vocab_size,
            attention_module= attention_module,
        )

    def forward(self, x, x_lens, y = None, tf_rate = 1, gumbel = False):
        """Encode `x`, then decode; returns whatever pair the decoder produces.

        `y`, `tf_rate` and `gumbel` are forwarded unchanged to the decoder.
        """
        encoded, encoded_lens = self.encoder(x, x_lens)
        outputs, attention_plot = self.decoder(encoded, encoded_lens, y, tf_rate, Gumbel = gumbel)
        return outputs, attention_plot

## Load Model

In [None]:
# Baseline LAS has the following configuration:
# Encoder bLSTM/pbLSTM Hidden Dimension of 512 (256 per direction)
# Decoder Embedding Layer Dimension of 256
# Decoder Hidden Dimension of 512 
# Decoder Output Dimension of 128
# Attention Projection Size of 128
# Feel Free to Experiment with this 
# NOTE(review): the sizes must match the checkpoint loaded afterwards; they are
# hard-coded here rather than read from `config`.
# NOTE(review): this rebinds `model`, but an `optimizer` created earlier still
# references the previous model's parameters; re-create it before training this
# instance further.
model = LAS(
    # Initialize your model 
    # Read the paper and think about what dimensions should be used
    # You can experiment on these as well, but they are not required for the early submission
    # Remember that if you are using weight tying, some sizes need to be the same
          input_size = 15, 
          cnn = [64, 128],
          layer = [3, 3],
          cnn_drop = 0.1,
          lstm_drop = [0.2,0.3,0.3,0.2],
          encoder_hidden_size = 256, 
          vocab_size = len(VOCAB), 
          embed_size = 256,
          decoder_hidden_size =  256, 
          decoder_output_size = 128,
          projection_size= 128
)
print(model)
# Move all parameters and buffers to the selected device.
model.to(DEVICE)

In [None]:
# Fetch the model artifact to evaluate through the active W&B run. The artifact
# name is left blank here; `artifact_dir` receives the local download directory.
artifact = run.use_artifact('', type='model')
artifact_dir = artifact.download()

In [None]:
# Restore model and optimizer state from a single checkpoint file. The path is
# left blank in this notebook; point it at the downloaded checkpoint.
# The file is deserialised once (not once per state dict) and mapped onto
# DEVICE so that a checkpoint saved on a GPU also loads on a CPU-only runtime.
checkpoint = torch.load('', map_location=DEVICE)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

In [None]:
# TODO: Create a testing function similar to validation 
def test(model, dataloader):
    """Greedy-decode an unlabeled dataset into a list of transcript strings.

    Args:
        model: network called as model(x, lx, y=None); its first return value
            is indexed as (batch, timesteps, vocab) class scores.
        dataloader: yields (x, lx) batches.

    Returns:
        One decoded string per utterance, in dataloader order.
    """
    model.eval()
    batch_bar = tqdm(total=len(dataloader), dynamic_ncols=True, position=0, leave=False, desc="Test")
    pred = []
    for x, lx in dataloader:
        x = x.to(DEVICE)
        with torch.inference_mode():
            predictions, _ = model(x, lx, y = None)

        # Greedy decoding: most likely class at every timestep.
        greedy_predictions = predictions.argmax(-1).detach().cpu().numpy()

        # Strings - when you are using characters from the AudioDataset.
        pred.extend(''.join(indices_to_chars(row, VOCAB)) for row in greedy_predictions)

        # Advance the bar; without this it stays at 0 for the whole run.
        batch_bar.update()

        del x, lx
        torch.cuda.empty_cache()

    batch_bar.close()
    return pred


In [None]:
# Release cached CUDA memory, then decode the whole test set.
# `predictions` becomes a list with one transcript string per test utterance.
torch.cuda.empty_cache()
predictions = test(model, test_loader)

In [None]:
import pandas as pd
# Assemble the submission table: sequential ids alongside the decoded transcripts.
df = pd.DataFrame({
    'id': list(range(len(predictions))),
    'label': predictions,
})
df.to_csv('submission.csv', index = False)

In [None]:
# TODO: Submit to Kaggle
!kaggle competitions submit -c <competition> -f 'submission.csv' -m "I made it!"

# Testing(Beam2)

#### Beam(batch size >1)

In [None]:
from torch._C import get_num_interop_threads
class Speller_beam(torch.nn.Module):
    """Attention decoder that transcribes a single utterance with beam search.

    Each hypothesis ("beam") is a 7-slot list
    ``[char, predictions, char_prob, hidden_states, context, log_probs, seq]``.
    ``prune`` ranks hypotheses by slot 5, so this layout must not change.

    Only a batch of exactly one utterance is supported, because a hypothesis is
    marked finished by comparing its whole ``char`` tensor with ``EOS_TOKEN``.
    Teacher forcing is not implemented: ``tf_rate`` and ``Gumbel`` are accepted
    for interface compatibility and ignored.
    """

    def __init__(self, embed_size, decoder_hidden_size, decoder_output_size, vocab_size, attention_module= None, beam = 10):
        super().__init__()
        self.vocab_size         = vocab_size
        # `padding` is the padding index defined at module level elsewhere in the notebook.
        self.embedding          = nn.Embedding(num_embeddings = self.vocab_size,
                                               embedding_dim = embed_size,
                                               padding_idx = padding)
        # LSTMCells (not nn.LSTM) because decoding advances one timestep at a
        # time. Sequential is used only as an indexable container here; the
        # cells are stepped individually in forward().
        self.lstm_cells         = torch.nn.Sequential(
                                nn.LSTMCell(input_size = embed_size + decoder_output_size,
                                            hidden_size = decoder_hidden_size),
                                nn.LSTMCell(input_size = decoder_hidden_size,
                                            hidden_size = decoder_hidden_size),
                                nn.LSTMCell(input_size = decoder_hidden_size,
                                            hidden_size = decoder_hidden_size)
                                )
        # Classification layer over the concatenated [query, context] vector.
        self.char_prob          = nn.Linear(2 * decoder_output_size, vocab_size)
        # Weight tying: the output layer shares the embedding matrix, which
        # requires embed_size == 2 * decoder_output_size.
        self.char_prob.weight   = self.embedding.weight
        self.attention          = attention_module

        self.beam_width = beam

    def forward(self, encoder_outputs, encoder_lens, y = None, tf_rate = 1, Gumbel = False):
        '''
        Beam-search decode one utterance.

        Args:
            encoder_outputs: encoder features, indexed as (batch, time, feature);
                batch must be 1.
            encoder_lens: lengths forwarded to the attention module's
                set_key_value_mask.
            y: target transcript; only its second dimension is read, and only
                in training mode, to bound the number of decoding steps.
            tf_rate, Gumbel: unused (kept for interface compatibility).

        Returns:
            predictions: logits of the best hypothesis stacked along dim 1,
                i.e. (1, steps, vocab_size).
            seq: 1-D tensor with the token indices of the best hypothesis,
                starting with SOS_TOKEN.

        Raises:
            ValueError: if the batch holds more than one utterance or no
                attention module was supplied.
        '''
        batch_size, _, _ = encoder_outputs.shape
        if batch_size != 1:
            raise ValueError(
                "beam-search decoding supports batch_size == 1 only, got {}".format(batch_size))
        if self.attention is None:
            raise ValueError("beam-search decoding requires an attention module")

        # In training mode the transcript length bounds the loop. At inference
        # time 600 steps is the recommended upper limit.
        timesteps = y.shape[1] if self.training else 600

        self.attention.set_key_value_mask(encoder_outputs, encoder_lens)

        # O(-1) = SOS; the decoded sequence starts with the same token.
        start = torch.full((batch_size,), fill_value=SOS_TOKEN, dtype= torch.long).to(DEVICE)
        # C(-1) = V(0): the first value vector serves as the initial context.
        initial_context = self.attention.value[:, 0, :]
        # One (h, c) slot per LSTM cell; None lets the cell start from zeros.
        initial_hidden = [None] * len(self.lstm_cells)

        beams = [[start, [], None, initial_hidden, initial_context, [0.0] * batch_size, start]]

        for _ in range(timesteps):
            candidates = []
            for beam in beams:
                char, predictions, _, hidden_states, context, log_probs, seq = beam
                if char.item() == EOS_TOKEN:
                    # Finished hypotheses are carried over unchanged.
                    candidates.append(beam.copy())
                    continue

                # Both lists are modified below, so work on private copies to
                # leave sibling hypotheses untouched.
                predictions = predictions.copy()
                hidden_states = hidden_states.copy()

                char_embed = self.embedding(char)
                cell_input = torch.cat((char_embed, context), dim = 1)
                for i, cell in enumerate(self.lstm_cells):
                    hidden_states[i] = cell(cell_input, hidden_states[i])
                    cell_input = hidden_states[i][0]
                # The hidden state of the last cell is the attention query input.
                decoder_output_embedding = hidden_states[-1][0]

                context, _ = self.attention(decoder_output_embedding)
                # The attention module exposes its projected query as `.query`.
                output_embedding = torch.cat((self.attention.query, context), dim=1)
                char_prob = self.char_prob(output_embedding)
                predictions.append(char_prob)

                # log_softmax with an explicit dim is numerically stable and
                # avoids the deprecated implicit-dim softmax.
                step_log_probs = F.log_softmax(char_prob, dim=1)
                num_expand = min(self.beam_width, step_log_probs.size(1))
                values, indices = torch.topk(step_log_probs, num_expand, dim = 1)
                # One device-to-host copy per hypothesis instead of one per candidate.
                values = values.detach().cpu().numpy()
                for i in range(num_expand):
                    next_char = indices[:, i]
                    candidates.append([next_char, predictions, char_prob, hidden_states, context,
                                       log_probs + values[:, i], torch.cat((seq, next_char))])

            beams = prune(candidates, self.beam_width, batch_size)
            # Once every surviving hypothesis has emitted EOS, further steps
            # would only copy the same beams around.
            if all(b[0].item() == EOS_TOKEN for b in beams):
                break

        best = beams[0] # highest cumulative log-probability
        predictions = torch.stack(best[1], dim = 1)
        seq = best[-1]
        return predictions, seq

#### Beam (batch size = 1)

In [None]:
def prune(new_beams, beam_width, batch_size):
    """
    Keep the `beam_width` best hypotheses, best first.

    beams = [[char, predictions, char_prob, hidden_states, context, log_probs, seq]]
    Hypotheses are ranked by slot 5 (cumulative log-probability), highest first.
    `batch_size` is accepted but not used (batch_size = 1).
    """
    ranked = sorted(new_beams, key = lambda candidate: candidate[5], reverse=True)
    # Slicing already copes with fewer than `beam_width` candidates.
    return ranked[:beam_width]

In [None]:
from torch._C import get_num_interop_threads
class Speller(torch.nn.Module):
    """Attention decoder that transcribes a single utterance with beam search.

    Each hypothesis ("beam") is a 7-slot list
    ``[char, predictions, char_prob, hidden_states, context, log_probs, seq]``.
    ``prune`` ranks hypotheses by slot 5, so this layout must not change.

    Only a batch of exactly one utterance is supported, because a hypothesis is
    marked finished by comparing its whole ``char`` tensor with ``EOS_TOKEN``.
    Teacher forcing is not implemented: ``tf_rate`` and ``Gumbel`` are accepted
    for interface compatibility and ignored.
    """

    def __init__(self, embed_size, decoder_hidden_size, decoder_output_size, vocab_size, attention_module= None, beam = 10):
        super().__init__()
        self.vocab_size         = vocab_size
        # `padding` is the padding index defined at module level elsewhere in the notebook.
        self.embedding          = nn.Embedding(num_embeddings = self.vocab_size,
                                               embedding_dim = embed_size,
                                               padding_idx = padding)
        # LSTMCells (not nn.LSTM) because decoding advances one timestep at a
        # time. Sequential is used only as an indexable container here; the
        # cells are stepped individually in forward().
        self.lstm_cells         = torch.nn.Sequential(
                                nn.LSTMCell(input_size = embed_size + decoder_output_size,
                                            hidden_size = decoder_hidden_size),
                                nn.LSTMCell(input_size = decoder_hidden_size,
                                            hidden_size = decoder_hidden_size),
                                nn.LSTMCell(input_size = decoder_hidden_size,
                                            hidden_size = decoder_hidden_size)
                                )
        # Classification layer over the concatenated [query, context] vector.
        self.char_prob          = nn.Linear(2 * decoder_output_size, vocab_size)
        # Weight tying: the output layer shares the embedding matrix, which
        # requires embed_size == 2 * decoder_output_size.
        self.char_prob.weight   = self.embedding.weight
        self.attention          = attention_module

        self.beam_width = beam

    def forward(self, encoder_outputs, encoder_lens, y = None, tf_rate = 1, Gumbel = False):
        '''
        Beam-search decode one utterance.

        Args:
            encoder_outputs: encoder features, indexed as (batch, time, feature);
                batch must be 1.
            encoder_lens: lengths forwarded to the attention module's
                set_key_value_mask.
            y: target transcript; only its second dimension is read, and only
                in training mode, to bound the number of decoding steps.
            tf_rate, Gumbel: unused (kept for interface compatibility).

        Returns:
            predictions: logits of the best hypothesis stacked along dim 1,
                i.e. (1, steps, vocab_size).
            seq: 1-D tensor with the token indices of the best hypothesis,
                starting with SOS_TOKEN.

        Raises:
            ValueError: if the batch holds more than one utterance or no
                attention module was supplied.
        '''
        batch_size, _, _ = encoder_outputs.shape
        if batch_size != 1:
            raise ValueError(
                "beam-search decoding supports batch_size == 1 only, got {}".format(batch_size))
        if self.attention is None:
            raise ValueError("beam-search decoding requires an attention module")

        # In training mode the transcript length bounds the loop. At inference
        # time 600 steps is the recommended upper limit.
        timesteps = y.shape[1] if self.training else 600

        self.attention.set_key_value_mask(encoder_outputs, encoder_lens)

        # O(-1) = SOS; the decoded sequence starts with the same token.
        start = torch.full((batch_size,), fill_value=SOS_TOKEN, dtype= torch.long).to(DEVICE)
        # C(-1) = V(0): the first value vector serves as the initial context.
        initial_context = self.attention.value[:, 0, :]
        # One (h, c) slot per LSTM cell; None lets the cell start from zeros.
        initial_hidden = [None] * len(self.lstm_cells)

        beams = [[start, [], None, initial_hidden, initial_context, [0.0] * batch_size, start]]

        for _ in range(timesteps):
            candidates = []
            for beam in beams:
                char, predictions, _, hidden_states, context, log_probs, seq = beam
                if char.item() == EOS_TOKEN:
                    # Finished hypotheses are carried over unchanged.
                    candidates.append(beam.copy())
                    continue

                # Both lists are modified below, so work on private copies to
                # leave sibling hypotheses untouched.
                predictions = predictions.copy()
                hidden_states = hidden_states.copy()

                char_embed = self.embedding(char)
                cell_input = torch.cat((char_embed, context), dim = 1)
                for i, cell in enumerate(self.lstm_cells):
                    hidden_states[i] = cell(cell_input, hidden_states[i])
                    cell_input = hidden_states[i][0]
                # The hidden state of the last cell is the attention query input.
                decoder_output_embedding = hidden_states[-1][0]

                context, _ = self.attention(decoder_output_embedding)
                # The attention module exposes its projected query as `.query`.
                output_embedding = torch.cat((self.attention.query, context), dim=1)
                char_prob = self.char_prob(output_embedding)
                predictions.append(char_prob)

                # log_softmax with an explicit dim is numerically stable and
                # avoids the deprecated implicit-dim softmax.
                step_log_probs = F.log_softmax(char_prob, dim=1)
                num_expand = min(self.beam_width, step_log_probs.size(1))
                values, indices = torch.topk(step_log_probs, num_expand, dim = 1)
                # One device-to-host copy per hypothesis instead of one per candidate.
                values = values.detach().cpu().numpy()
                for i in range(num_expand):
                    next_char = indices[:, i]
                    candidates.append([next_char, predictions, char_prob, hidden_states, context,
                                       log_probs + values[:, i], torch.cat((seq, next_char))])

            beams = prune(candidates, self.beam_width, batch_size)
            # Once every surviving hypothesis has emitted EOS, further steps
            # would only copy the same beams around.
            if all(b[0].item() == EOS_TOKEN for b in beams):
                break

        best = beams[0] # highest cumulative log-probability
        predictions = torch.stack(best[1], dim = 1)
        seq = best[-1]
        return predictions, seq

## Load model

In [None]:
class LAS(torch.nn.Module):
    """Listen-Attend-Spell model: a `Listener` encoder feeding a `Speller_beam` decoder.

    The attention module is built here and handed to the decoder; it is not
    stored as a direct attribute of this class.

    Args:
        input_size, cnn, layer, cnn_drop, lstm_drop, encoder_hidden_size:
            forwarded unchanged to `Listener`.
        vocab_size, embed_size, decoder_hidden_size, decoder_output_size:
            forwarded unchanged to `Speller_beam`.
        projection_size: forwarded to `Attention` (default 128).
        beam: forwarded to `Speller_beam` as `beam` (default 10).
    """

    def __init__(self, input_size, cnn, layer, cnn_drop, lstm_drop,
                 encoder_hidden_size, vocab_size, embed_size,
                 decoder_hidden_size, decoder_output_size,
                 projection_size=128, beam=10):
        super().__init__()

        # Encoder ("listener").
        self.encoder = Listener(
            input_size=input_size,
            cnn=cnn,
            encoder_hidden_size=encoder_hidden_size,
            layer=layer,
            cnn_drop=cnn_drop,
            lstm_drop=lstm_drop,
        )

        # Attention shared with the decoder. Note that its `decoder_output_size`
        # argument receives `decoder_hidden_size`, not `decoder_output_size`.
        attn = Attention(
            encoder_hidden_size=encoder_hidden_size,
            decoder_output_size=decoder_hidden_size,
            projection_size=projection_size,
        )

        # Decoder ("speller") with beam search; it owns the attention module.
        self.decoder = Speller_beam(
            embed_size=embed_size,
            decoder_hidden_size=decoder_hidden_size,
            decoder_output_size=decoder_output_size,
            vocab_size=vocab_size,
            attention_module=attn,
            beam=beam,
        )

    def forward(self, x, x_lens, y=None, tf_rate=1, gumbel=False):
        """Encode `(x, x_lens)` and decode the result.

        `y` and `tf_rate` are passed positionally to the decoder and `gumbel`
        is passed as its `Gumbel` keyword.

        Returns:
            The `(predictions, seq)` pair produced by the decoder.
        """
        listener_out, listener_lens = self.encoder(x, x_lens)
        char_scores, token_seq = self.decoder(
            listener_out, listener_lens, y, tf_rate, Gumbel=gumbel
        )
        return char_scores, token_seq

In [None]:
# Reference ("baseline") LAS configuration, for comparison:
#   - encoder bLSTM/pbLSTM hidden dimension: 512 (256 per direction)
#   - decoder embedding layer dimension:     256
#   - decoder hidden dimension:              512
#   - decoder output dimension:              128
#   - attention projection size:             128
# Feel free to experiment with these (not required for the early submission).
# Read the paper and think about which dimensions should be used; remember
# that with weight tying some of the sizes need to be the same.
model = LAS(
    # --- Listener (encoder) ---
    input_size=15,
    cnn=[64, 128],
    layer=[3, 3],
    cnn_drop=0.1,
    lstm_drop=[0.2, 0.3, 0.3, 0.2],
    encoder_hidden_size=256,
    # --- Attention ---
    projection_size=128,
    # --- Speller (decoder) ---
    vocab_size=len(VOCAB),
    embed_size=256,
    decoder_hidden_size=256,
    decoder_output_size=128,
)
print(model)
model.to(DEVICE)

In [None]:
# Look up a 'model'-type artifact on `run` and download it; `artifact_dir`
# holds the value returned by download().
# `run` is defined elsewhere in the notebook — presumably the active wandb run; confirm.
# NOTE(review): the artifact name is an empty string; fill in the artifact to
# restore before running this cell.
artifact = run.use_artifact('', type='model')
artifact_dir = artifact.download()

In [None]:
# Restore the model and optimizer from a saved checkpoint.
# The checkpoint is deserialized once and reused for both state dicts.
# map_location=DEVICE lets a checkpoint saved on a GPU be restored on whatever
# device this runtime uses (including CPU-only).
# NOTE(review): the checkpoint path is an empty string; fill it in before running.
checkpoint = torch.load('', map_location=DEVICE)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
# Drop the extra reference so the loaded tensors can be freed.
del checkpoint

In [None]:
# TODO: Create a testing function similar to validation 
def test(model, dataloader):
    """Run inference over `dataloader` and return one decoded string per item.

    The model is switched to eval mode and called as `model(x, lx, y=None)`
    under `torch.inference_mode()`. The second value it returns (`seq`) is used
    directly as the decoded index sequence, so no argmax over the returned
    predictions is performed here.

    Args:
        model: module returning a `(predictions, seq)` pair.
        dataloader: iterable yielding `(x, lx)` batches.

    Returns:
        list[str]: decoded strings, in the order the loader produced them.
    """
    model.eval()
    pred = []
    batch_bar = tqdm(total=len(dataloader), dynamic_ncols=True, position=0, leave=False, desc="Test")
    try:
        for x, lx in dataloader:
            x = x.to(DEVICE)
            with torch.inference_mode():
                _, seq = model(x, lx, y=None)

            # assumes the first axis of `seq` indexes the items of the batch — TODO confirm
            decoded = seq.cpu().numpy()
            for token_indices in decoded:
                # Strings - when using characters from the AudioDataset
                pred.append(''.join(indices_to_chars(token_indices, VOCAB)))

            # Free the batch before the next one to keep GPU memory low.
            del x, lx
            torch.cuda.empty_cache()
            batch_bar.update()
    finally:
        # Close the progress bar even if decoding fails part-way through.
        batch_bar.close()
    return pred


In [None]:
# Release cached GPU memory, then decode every batch of the loader with the model.
# NOTE(review): predictions are generated from `val_loader`, and the following
# cells write them to submission.csv. Confirm this loader yields the (x, lx)
# batches that test() unpacks and that it is the intended data split.
torch.cuda.empty_cache()
predictions = test(model, val_loader)

In [None]:
import pandas as pd

# Write the submission file: one row per prediction, with a 0-based running
# `id` column and the decoded string in the `label` column.
df = pd.DataFrame({
    'id': list(range(len(predictions))),
    'label': predictions,
})
df.to_csv('submission.csv', index=False)

In [None]:
# Submit submission.csv to the 11-785-f22-hw4p2 Kaggle competition using the Kaggle CLI.
!kaggle competitions submit -c 11-785-f22-hw4p2 -f submission.csv -m "hahahahaha!"
# Template: !kaggle competitions submit -c <competition> -f 'submission.csv' -m "I made it!"