# Installs

In [1]:
%pip install torch==1.13.1+cu117 torchvision==0.14.1+cu117 torchtext==0.14.1 torchaudio==0.13.1 torchdata==0.5.1 --extra-index-url https://download.pytorch.org/whl/cu117 -q

Note: you may need to restart the kernel to use updated packages.


In [2]:
!sudo apt-get install git

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  git-man liberror-perl
Suggested packages:
  git-daemon-run | git-daemon-sysvinit git-doc git-el git-email git-gui gitk
  gitweb git-cvs git-mediawiki git-svn
The following NEW packages will be installed:
  git git-man liberror-perl
0 upgraded, 3 newly installed, 0 to remove and 0 not upgraded.
Need to get 7285 kB of archives.
After this operation, 38.1 MB of additional disk space will be used.
Do you want to continue? [Y/n] ^C



This may take a while

In [1]:
!pip install wandb --quiet
!pip install python-Levenshtein -q
!git clone --recursive https://github.com/parlance/ctcdecode.git
!pip install wget -q
%cd ctcdecode
!pip install . -q
%cd ..

# !pip install torchsummaryX -q

fatal: destination path 'ctcdecode' already exists and is not an empty directory.
/home/22941/ctcdecode
/home/22941


In [2]:
'''
If torchsummaryX doesn't work, please run this cell. Alternatively, please refer to Piazza post @209 for more assistance:
'''

!pip install torchsummaryx==1.3.0

Defaulting to user installation because normal site-packages is not writeable


# Imports

In [1]:
import torch
import random
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torchsummaryX import summary
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

import torchaudio.transforms as tat

from sklearn.metrics import accuracy_score
import gc

import zipfile
import pandas as pd
from tqdm import tqdm
import os
import datetime

# imports for decoding and distance calculation
import ctcdecode
import Levenshtein
from ctcdecode import CTCBeamDecoder
import gc
import warnings
warnings.filterwarnings('ignore')

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device: ", device)

Device:  cuda


# Kaggle Setup

In [8]:
!pip install --upgrade --force-reinstall --no-deps kaggle==1.5.8 -q
# !mkdir /root/.kaggle

# with open("/root/.kaggle/kaggle.json", "w+") as f:
#     f.write('{"username":"","key":""}') # TODO: Put your kaggle username & key here

!chmod 600 /root/.kaggle/kaggle.json

chmod: cannot access '/root/.kaggle/kaggle.json': Permission denied


In [5]:
!kaggle competitions download -c hw3p2asr-s24

Downloading hw3p2asr-s24.zip to /home/22941
100%|██████████████████████████████████████▉| 3.73G/3.74G [00:27<00:00, 157MB/s]
100%|███████████████████████████████████████| 3.74G/3.74G [00:27<00:00, 148MB/s]


In [6]:
'''
This will take a couple minutes, but you should see at least the following:
11-785-s24-hw3p2  ctcdecode  hw3p2asr-s24.zip  sample_data
'''
!unzip -q hw3p2asr-s24.zip
!ls

11-785-s24-hw3p2		    ctcdecode	      install_gpu_driver.py
HW3P2_Starter_S24_fromVM.ipynb	    get-pip.py	      setup.sh
NVIDIA-Linux-x86_64-525.125.06.run  hw3p2asr-s24.zip


# Google Drive

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

# Dataset and Dataloader

In [2]:
# ARPABET PHONEME MAPPING
# DO NOT CHANGE

CMUdict_ARPAbet = {
    "" : " ",
    "[SIL]": "-", "NG": "G", "F" : "f", "M" : "m", "AE": "@",
    "R"    : "r", "UW": "u", "N" : "n", "IY": "i", "AW": "W",
    "V"    : "v", "UH": "U", "OW": "o", "AA": "a", "ER": "R",
    "HH"   : "h", "Z" : "z", "K" : "k", "CH": "C", "W" : "w",
    "EY"   : "e", "ZH": "Z", "T" : "t", "EH": "E", "Y" : "y",
    "AH"   : "A", "B" : "b", "P" : "p", "TH": "T", "DH": "D",
    "AO"   : "c", "G" : "g", "L" : "l", "JH": "j", "OY": "O",
    "SH"   : "S", "D" : "d", "AY": "Y", "S" : "s", "IH": "I",
    "[SOS]": "[SOS]", "[EOS]": "[EOS]"
}

CMUdict = list(CMUdict_ARPAbet.keys())
ARPAbet = list(CMUdict_ARPAbet.values())


PHONEMES = CMUdict[:-2]
LABELS = ARPAbet[:-2]

In [3]:
# You might want to play around with the mapping as a sanity check here
print(CMUdict,' ',ARPAbet)

['', '[SIL]', 'NG', 'F', 'M', 'AE', 'R', 'UW', 'N', 'IY', 'AW', 'V', 'UH', 'OW', 'AA', 'ER', 'HH', 'Z', 'K', 'CH', 'W', 'EY', 'ZH', 'T', 'EH', 'Y', 'AH', 'B', 'P', 'TH', 'DH', 'AO', 'G', 'L', 'JH', 'OY', 'SH', 'D', 'AY', 'S', 'IH', '[SOS]', '[EOS]']   [' ', '-', 'G', 'f', 'm', '@', 'r', 'u', 'n', 'i', 'W', 'v', 'U', 'o', 'a', 'R', 'h', 'z', 'k', 'C', 'w', 'e', 'Z', 't', 'E', 'y', 'A', 'b', 'p', 'T', 'D', 'c', 'g', 'l', 'j', 'O', 'S', 'd', 'Y', 's', 'I', '[SOS]', '[EOS]']


### Train Data

In [4]:
class AudioDataset(torch.utils.data.Dataset):

    # For this homework, we give you full flexibility to design your data set class.
    # Hint: The data from HW1 is very similar to this HW

    #TODO
    def __init__(self, root, PHONEMES=PHONEMES, partition= "train-clean-100/", limit = None, transforms = True):
        '''
        Initializes the dataset.

        INPUTS: What inputs do you need here?
        '''
        self.transforms = transforms
        # Load the directory and all files in them

        self.mfcc_dir = os.path.join(root,partition,'mfcc')
        self.transcript_dir = os.path.join(root,partition,'transcript')

        if limit is None:
            self.mfcc_files = sorted(os.listdir(self.mfcc_dir))
            self.transcript_files = sorted(os.listdir(self.transcript_dir))
        else:
            self.mfcc_files = sorted(os.listdir(self.mfcc_dir))[:limit]
            self.transcript_files = sorted(os.listdir(self.transcript_dir))[:limit]

        self.PHONEMES = PHONEMES

        self.mfccs, self.transcripts = [], []

        # Iterate through mfccs and transcripts
        for i in range(len(self.mfcc_files)):
        #   Load a single mfcc
            mfcc        = np.load(os.path.join(self.mfcc_dir,self.mfcc_files[i])) # this is the actual values
        #   Do Cepstral Normalization of mfcc (explained in writeup for hw1p1)
            mfcc = (mfcc - np.mean(mfcc)) / np.std(mfcc)
        #   Load the corresponding transcript
            transcript  = np.load(os.path.join(self.transcript_dir,self.transcript_files[i])) # Remove [SOS] and [EOS] from the transcript
            transcript = transcript[1:-1]
            # (Is there an efficient way to do this without traversing through the transcript?)
            # Note that SOS will always be in the starting and EOS at end, as the name suggests.
            self.mfccs.append(mfcc)
            #self.transcripts.append(transcript)

            self.transcripts.append(np.array([self.PHONEMES.index(i) for i in transcript]))


        # Each mfcc is of shape T1 x 27, T2 x 27, ...
        # Each transcript is of shape (T1_out+2), (T2_out+2) before removing [SOS] and [EOS]

        #TODO
        # WHAT SHOULD THE LENGTH OF THE DATASET BE?
        self.length = len(self.mfccs)

        #TODO
        # HOW CAN WE REPRESENT PHONEMES? CAN WE CREATE A MAPPING FOR THEM?
        # HINT: TENSORS CANNOT STORE NON-NUMERICAL VALUES OR STRINGS
        # Map the phonemes to their corresponding list indexes in self.phonemes
        # phonemes_map = {phoneme: int(index) for index, phoneme in enumerate(self.PHONEMES)}
        # # labe_map = {label : index for index, label in enumerate()}
        # for i, transcript in enumerate(self.transcripts): # This will iterate the self.transcript list
        #     # for j in range(len(transcript)): # This will iterate the each transcript in self.transcript list
        #     self.transcripts[i] = phonemes_map[transcript]
        # #TODO
        # CREATE AN ARRAY OF ALL FEATUERS AND LABELS
        # WHAT NORMALIZATION TECHNIQUE DID YOU USE IN HW1? CAN WE USE IT HERE?
        '''
        You may decide to do this in __getitem__ if you wish.
        However, doing this here will make the __init__ function take the load of
        loading the data, and shift it away from training.
        '''


    def __len__(self):

        '''
        TODO: What do we return here?
        '''
        return self.length

    def __getitem__(self, ind):
        '''
        TODO: RETURN THE MFCC COEFFICIENTS AND ITS CORRESPONDING LABELS

        If you didn't do the loading and processing of the data in __init__,
        do that here.

        Once done, return a tuple of features and labels.
        '''

        mfcc =  self.mfccs[ind] #This will return the mfcc of shape[1,27]
        transcript =  self.transcripts[ind]#Now here we want the correspond transcript
        # mfcc      = torch.FloatTensor(mfcc) # Convert to tensors
        # transcript    = torch.tensor(transcript)

        return mfcc, transcript


    def collate_fn(self,batch):
        '''
        TODO:
        1.  Extract the features and labels from 'batch'
        2.  We will additionally need to pad both features and labels,
            look at pytorch's docs for pad_sequence
        3.  This is a good place to perform transforms, if you so wish.
            Performing them on batches will speed the process up a bit.
        4.  Return batch of features, labels, lenghts of features,
            and lengths of labels.
        '''
        batch_mfcc = [item[0] for item in batch]  # item[0]
        batch_transcript = [item[1] for item in batch]  # item[1]

        # HINT: CHECK OUT -> pad_sequence (imported above)
        # Also be sure to check the input format (batch_first)

        #This is to store the original length information
        lengths_mfcc = [len(mfcc) for mfcc in batch_mfcc]
        lengths_transcript = [len(transcript) for transcript in batch_transcript]

        batch_mfcc_pad = pad_sequence(sequences=[torch.tensor(mfcc) for mfcc in batch_mfcc],batch_first=True, padding_value=0)
        # print(batch_mfcc_pad.shape)
        batch_transcript_pad = pad_sequence(sequences=[torch.tensor(transcript) for transcript in batch_transcript], batch_first=True, padding_value=0)
        # print(batch_transcript_pad.shape)
        if self.transforms == True:
            time_mask = tat.TimeMasking(time_mask_param=80,iid_masks=True)
            frequency_mask = tat.FrequencyMasking(freq_mask_param=2,iid_masks=True)
            batch_mfcc_pad = torch.permute(batch_mfcc_pad, (0, 2, 1))
            batch_mfcc_pad = time_mask(batch_mfcc_pad)
            batch_mfcc_pad = frequency_mask(batch_mfcc_pad)
            batch_mfcc_pad = torch.permute(batch_mfcc_pad, (0, 2, 1))
        # You may apply some transformation, Time and Frequency masking, here in the collate function;
        # Food for thought -> Why are we applying the transformation here and not in the __getitem__?
        #                  -> Would we apply transformation on the validation set as well?
        #                  -> Is the order of axes / dimensions as expected for the transform functions?

        # Return the following values: padded features, padded labels, actual length of features, actual length of the labels
        return batch_mfcc_pad, batch_transcript_pad, torch.tensor(lengths_mfcc), torch.tensor(lengths_transcript)



### Test Data

In [5]:
# Test Dataloader
#TODO
class AudioDatasetTest(torch.utils.data.Dataset):
   #TODO
    def __init__(self, root, PHONEMES=PHONEMES, partition= None, limit = None):
        '''
        Initializes the dataset.

        INPUTS: What inputs do you need here?
        '''

        # Load the directory and all files in them

        self.mfcc_dir = os.path.join(root,partition,'mfcc')

        if limit is None:
            self.mfcc_files = sorted(os.listdir(self.mfcc_dir))
        else:
            self.mfcc_files = sorted(os.listdir(self.mfcc_dir))[:limit]

        self.PHONEMES = PHONEMES

        self.mfccs = []

        # Iterate through mfccs and transcripts
        for i in range(len(self.mfcc_files)):
        #   Load a single mfcc
            mfcc        = np.load(os.path.join(self.mfcc_dir,self.mfcc_files[i])) # this is the actual values
        #   Do Cepstral Normalization of mfcc (explained in writeup for hw1p1)
            mfcc = (mfcc - np.mean(mfcc)) / np.std(mfcc)
            self.mfccs.append(mfcc)


        # Each mfcc is of shape T1 x 27, T2 x 27, ...
        # Each transcript is of shape (T1_out+2), (T2_out+2) before removing [SOS] and [EOS]

        #TODO
        # WHAT SHOULD THE LENGTH OF THE DATASET BE?
        self.length = len(self.mfccs)


    def __len__(self):

        '''
        TODO: What do we return here?
        '''
        return self.length

    def __getitem__(self, ind):
        '''
        TODO: RETURN THE MFCC COEFFICIENTS AND ITS CORRESPONDING LABELS

        If you didn't do the loading and processing of the data in __init__,
        do that here.

        Once done, return a tuple of features and labels.
        '''

        mfcc =  self.mfccs[ind] #This will return the mfcc of shape[1,27]
        return mfcc

    def collate_fn(self,batch):
        '''
        TODO:
        1.  Extract the features and labels from 'batch'
        2.  We will additionally need to pad both features and labels,
            look at pytorch's docs for pad_sequence
        3.  This is a good place to perform transforms, if you so wish.
            Performing them on batches will speed the process up a bit.
        4.  Return batch of features, labels, lenghts of features,
            and lengths of labels.
        '''
        batch_mfcc = [item for item in batch]  # item[0]

        # HINT: CHECK OUT -> pad_sequence (imported above)
        # Also be sure to check the input format (batch_first)

        #This is to store the original length information
        lengths_mfcc = [len(mfcc) for mfcc in batch_mfcc]

        batch_mfcc_pad = pad_sequence(sequences=[torch.tensor(mfcc) for mfcc in batch_mfcc],batch_first=True, padding_value=0)
        # print(batch_mfcc_pad.shape)
        # print(batch_transcript_pad.shape)
        # You may apply some transformation, Time and Frequency masking, here in the collate function;
        # Food for thought -> Why are we applying the transformation here and not in the __getitem__?
        #                  -> Would we apply transformation on the validation set as well?
        #                  -> Is the order of axes / dimensions as expected for the transform functions?

        # Return the following values: padded features, padded labels, actual length of features, actual length of the labels
        return batch_mfcc_pad, torch.tensor(lengths_mfcc)

### Config - Hyperparameters

In [6]:
# root = '/content/11-785-s24-hw3p2/'

# Feel free to add more items here
config = {
    "beam_width" : 3,
    "lr"         : 2e-3,
    "epochs"     : 50,
    "batch_size" : 64  # Increase if your device can handle it
}

# You may pass this as a parameter to the dataset class above
# This will help modularize your implementation
transforms = [] # set of tranformations

### Data loaders

In [7]:
# get me RAMMM!!!!
gc.collect()

63

In [8]:
# Create objects for the dataset class
train_data = AudioDataset(root='11-785-s24-hw3p2/', PHONEMES=PHONEMES, partition='train-clean-100/', limit=None, transforms=True) #TODO
val_data = AudioDataset(root='11-785-s24-hw3p2', PHONEMES=PHONEMES, partition='dev-clean/', limit=None, transforms=False)# TODO : You can either use the same class with some modifications or make a new one :)
test_data = AudioDatasetTest(root='11-785-s24-hw3p2', PHONEMES=PHONEMES, partition='test-clean/', limit=None) #TODO

# Do NOT forget to pass in the collate function as parameter while creating the dataloader
train_loader = torch.utils.data.DataLoader(
    dataset     = train_data,
    num_workers = 1,
    batch_size  = config['batch_size'],
    pin_memory  = True,
    shuffle     = True,
    collate_fn  = train_data.collate_fn
)

val_loader = torch.utils.data.DataLoader(
    dataset     = val_data,
    num_workers = 1,
    batch_size  = config['batch_size'],
    pin_memory  = True,
    shuffle     = False,
    collate_fn  = val_data.collate_fn
)

test_loader = torch.utils.data.DataLoader(
    dataset     = test_data,
    num_workers = 1,
    batch_size  = config['batch_size'],
    pin_memory  = True,
    shuffle     = False,
    collate_fn  = test_data.collate_fn
)

print("Batch size: ", config['batch_size'])
print("Train dataset samples = {}, batches = {}".format(train_data.__len__(), len(train_loader)))
print("Val dataset samples = {}, batches = {}".format(val_data.__len__(), len(val_loader)))
print("Test dataset samples = {}, batches = {}".format(test_data.__len__(), len(test_loader)))
# x, y = train_data.collate_fn(5)


Batch size:  64
Train dataset samples = 28539, batches = 446
Val dataset samples = 2703, batches = 43
Test dataset samples = 2620, batches = 41


In [9]:
# sanity check
for data in train_loader:
    # m = data
    x, y, lx, ly = data
    print(x.shape, y.shape, lx.shape, ly.shape)
    break

for data in test_loader:
    x,lx = data
    print(x.shape,lx.shape)

torch.Size([64, 1688, 27]) torch.Size([64, 195]) torch.Size([64]) torch.Size([64])
torch.Size([64, 2001, 27]) torch.Size([64])
torch.Size([64, 2363, 27]) torch.Size([64])
torch.Size([64, 3000, 27]) torch.Size([64])
torch.Size([64, 2052, 27]) torch.Size([64])
torch.Size([64, 2329, 27]) torch.Size([64])
torch.Size([64, 1818, 27]) torch.Size([64])
torch.Size([64, 1013, 27]) torch.Size([64])
torch.Size([64, 3387, 27]) torch.Size([64])
torch.Size([64, 3161, 27]) torch.Size([64])
torch.Size([64, 2590, 27]) torch.Size([64])
torch.Size([64, 1280, 27]) torch.Size([64])
torch.Size([64, 2114, 27]) torch.Size([64])
torch.Size([64, 1608, 27]) torch.Size([64])
torch.Size([64, 2714, 27]) torch.Size([64])
torch.Size([64, 2594, 27]) torch.Size([64])
torch.Size([64, 2823, 27]) torch.Size([64])
torch.Size([64, 2853, 27]) torch.Size([64])
torch.Size([64, 2607, 27]) torch.Size([64])
torch.Size([64, 960, 27]) torch.Size([64])
torch.Size([64, 3491, 27]) torch.Size([64])
torch.Size([64, 2956, 27]) torch.Size(

# NETWORK

## Basic

This is a basic block for understanding, you can skip this and move to pBLSTM one

In [10]:
# torch.cuda.empty_cache()

# class Network(nn.Module):

#     def __init__(self):

#         super(Network, self).__init__()

#         # Adding some sort of embedding layer or feature extractor might help performance.
#         # self.embedding = ?

#         self.lstm = nn.LSTM(input_size=input_size, hidden_size=256, num_layers=1, batch_first=True)

#         self.classification = nn.Sequential(
#             nn.Linear(in_features=256, out_features=output_size)
#         )

#         self.logSoftmax = nn.LogSoftmax(dim=1)


#         # TODO : look up the documentation. You might need to pass some additional parameters.
#         self.lstm = nn.LSTM(input_size = __, hidden_size = 256, num_layers = 1)

#         self.classification = nn.Sequential(
#             #TODO: Linear layer with in_features from the lstm module above and out_features = OUT_SIZE
#         )


#         self.logSoftmax = #TODO: Apply a log softmax here. Which dimension would apply it on ?

#     def forward(self, x, lx):
#         #TODO
#         # The forward function takes 2 parameter inputs here. Why?
#         # Refer to the handout for hints

#         # packing first to pass through the lstm
#         x_packed = nn.utils.rnn.pack_padded_sequence(x, lx, batch_first=True, enforce_sorted=False)
#         x_lstm, (h_n, c_n) = self.lstm(x_packed)

#         # unpack the packed signal through classification layer
#         x_unpacked, _ = nn.utils.rnn.pad_packed_sequence(x_lstm, batch_first=True)

#         # classify finally
#         x_classified = self.classification(x_unpacked)

#         # final softmax
#         x_logsoftmax = self.logSoftmax(x_classified)
#         pass

## Initialize Basic Network
(If trying out the basic Network)

In [11]:
# torch.cuda.empty_cache()

# model = Network().to(device)
# summary(model, x.to(device), lx) # x and lx come from the sanity check above :)

## ASR Network

### Pyramid Bi-LSTM (pBLSTM)

In [12]:
# Utils for network
torch.cuda.empty_cache()
# from torch.autograd import Variable

class PermuteBlock(torch.nn.Module):
    def forward(self, x):
        return x.transpose(1, 2)
    
class LockedDropout(nn.Module):
    """ LockedDropout applies the same dropout mask to every time step.

    **Thank you** to Sales Force for their initial implementation of :class:`WeightDrop`. Here is
    their `License
    <https://github.com/salesforce/awd-lstm-lm/blob/master/LICENSE>`__.

    Args:
        p (float): Probability of an element in the dropout mask to be zeroed.
    """

    def __init__(self, p=0.25):
        self.p = p
        super().__init__()

    def forward(self, x):
        """
        Args:
            x (:class:`torch.FloatTensor` [sequence length, batch size, rnn hidden size]): Input to
                apply dropout too.
        """
        if not self.training or not self.p:
            return x
        x = x.clone()
        mask = x.new_empty(1, x.size(1), x.size(2), requires_grad=False).bernoulli_(1 - self.p)
        mask = mask.div_(1 - self.p)
        mask = mask.expand_as(x)
        return x * mask


    def __repr__(self):
        return self.__class__.__name__ + '(' \
            + 'p=' + str(self.p) + ')'
            
            
class Residual_Block(torch.nn.Module):
    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()

        self.conv1 = torch.nn.Conv1d(in_channels, 64, kernel_size=3, padding=1, stride=1)
        self.bn1 = torch.nn.BatchNorm1d(64)
        self.conv2 = torch.nn.Conv1d(64, 128, kernel_size=3, padding=1, stride=1)
        self.bn2 = torch.nn.BatchNorm1d(128)
        self.conv3 = torch.nn.Conv1d(128, out_channels, kernel_size=3, padding=1, stride=1)
        self.bn3 = torch.nn.BatchNorm1d(out_channels)
        self.relu = torch.nn.ReLU()
        self.shortcut = torch.nn.Sequential()
        if stride != 1 or in_channels != out_channels:
            self.shortcut = torch.nn.Sequential(
                torch.nn.Conv1d(in_channels, out_channels, kernel_size=3, stride=1, padding=1),
                torch.nn.BatchNorm1d(out_channels)
            )
    def forward(self, x):
        identity = self.shortcut(x)

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)
        
        out = self.conv3(out)
        out = self.bn3(out)
        
        out += identity
        out = self.relu(out)

        return out


In [14]:
class pBLSTM(torch.nn.Module):

    '''
    Pyramidal BiLSTM
    Read the write up/paper and understand the concepts and then write your implementation here.

    At each step,
    1. Pad your input if it is packed (Unpack it)
    2. Reduce the input length dimension by concatenating feature dimension
        (Tip: Write down the shapes and understand)
        (i) How should  you deal with odd/even length input?
        (ii) How should you deal with input length array (x_lens) after truncating the input?
    3. Pack your input
    4. Pass it into LSTM layer

    To make our implementation modular, we pass 1 layer at a time.
    '''

    def __init__(self, input_size, hidden_size):
        super(pBLSTM, self).__init__()

        self.blstm = torch.nn.LSTM(input_size=2*input_size, hidden_size=hidden_size, num_layers=3, bidirectional=True, dropout = 0.3)  # TODO: Initialize a single layer bidirectional LSTM with the given input_size and hidden_size
        #could drop out
    def forward(self, x_packed): # x_packed is a PackedSequence

        # TODO: Pad Packed Sequence
        seq_unpacked, lens_unpacked = pad_packed_sequence(x_packed, batch_first=True)
        # print(f'in the plstm section after pad {seq_unpacked.shape},{lens_unpacked.shape}')
        # Call self.trunc_reshape() which downsamples the time steps of x and increases the feature dimensions as mentioned above
        x, x_lens = self.trunc_reshape(x=seq_unpacked, x_lens=lens_unpacked)
        # print(f'in the plstm section after truncate {x.shape},{x_lens.shape}')
        # self.trunc_reshape will return 2 outputs. What are they? Think about what quantites are changing.
        # TODO: Pack Padded Sequence. What output(s) would you get?
        PackedSequence = pack_padded_sequence(input=x,lengths=x_lens,batch_first=True,enforce_sorted=False)
        # TODO: Pass the sequence through bLSTM
        packed_output, (_, _) = self.blstm(PackedSequence)
        # What do you return?

        return packed_output

    def trunc_reshape(self, x, x_lens):
        # TODO: If you have odd number of timesteps, how can you handle it? (Hint: You can exclude them)
        # TODO: Reshape x. When reshaping x, you have to reduce number of timesteps by a downsampling factor while increasing number of features by the same factor
        # TODO: Reduce lengths by the same downsampling factor
        # The shape of x is [batch_size, frames, features]
        batch_size = x.shape[0]
        frames = x.shape[1]
        features = x.shape[2]
        if frames % 2 != 0:
            x = x[:, :-1, :]
        x = x.reshape((batch_size, x.shape[1]//2, features*2))
        x_lens = x_lens/2
        return x, x_lens

### Encoder

In [15]:
class Encoder(torch.nn.Module):
    '''
    The Encoder takes utterances as inputs and returns latent feature representations
    '''
    def __init__(self, input_size, encoder_hidden_size):
        super(Encoder, self).__init__()

        self.PermuteBlock = PermuteBlock()
        self.embedding = torch.nn.Sequential(torch.nn.Conv1d(in_channels=input_size,out_channels=64,kernel_size=3,padding=1,stride=1),
                                             torch.nn.BatchNorm1d(64),
                                             torch.nn.GELU(),
                                             torch.nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, padding=1, stride=1),
                                             torch.nn.BatchNorm1d(128),
                                             torch.nn.GELU(),
                                             torch.nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, padding=1, stride=1),
                                             torch.nn.BatchNorm1d(256),
                                             torch.nn.GELU()
                                             )
        # self.embedding = torch.nn.Sequential(Residual_Block(in_channels=input_size, out_channels=encoder_hidden_size,stride=1)
        #                                     #  Residual_Block(in_channels=encoder_hidden_size, out_channels=encoder_hidden_size, stride=1),
        #                                     #  Residual_Block(in_channels=encoder_hidden_size, out_channels=encoder_hidden_size, stride=1)
        #                                      )
        # self.embedding = torch.nn.Conv1d(in_channels=input_size,out_channels=encoder_hidden_size,kernel_size=3,padding=1,stride=1)
        # self.conv1 = torch.nn.Conv1d(in_channels=input_size, out_channels=64, kernel_size=3, stride=1, padding=1)#TODO: You can use CNNs as Embedding layer to extract features. Keep in mind the Input dimensions and expected dimension of Pytorch CNN.
        # self.batchnorm1 = torch.nn.BatchNorm1d(64)
        # self.relu = torch.nn.ReLU()
        # self.conv2 = torch.nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1)#TODO: You can use CNNs as Embedding layer to extract features. Keep in mind the Input dimensions and expected dimension of Pytorch CNN.
        # self.batchnorm2 = torch.nn.BatchNorm1d(encoder_hidden_size)
        # self.conv3 = torch.nn.Conv1d(in_channels=64, out_channels=encoder_hidden_size, kernel_size=3, stride=1, padding=1)
        # self.batch
        # self.shortcut = torch.nn.Sequential(torch.nn.Conv1d(in_channels=input_size, out_channels=encoder_hidden_size, kernel_size=3, stride=1, padding=1), torch.nn.BatchNorm1d(encoder_hidden_size))
        # could dropout, batchnorm, RELU, and residual connection
        
        # self.pBLSTMs = torch.nn.Sequential( # How many pBLSTMs are required?
        #     # TODO: Fill this up with pBLSTMs - What should the input_size be?
        #     # Hint: You are downsampling timesteps by a factor of 2, upsampling features by a factor of 2 and the LSTM is bidirectional)
        #     # Optional: Dropout/Locked Dropout after each pBLSTM (Not needed for early submission)
        #     # https://github.com/salesforce/awd-lstm-lm/blob/dfd3cb0235d2caf2847a4d53e1cbd495b781b5d2/locked_dropout.py#L5
        #     # ...
        #     # ...
        #     pBLSTM(input_size=encoder_hidden_size,hidden_size=encoder_hidden_size),
        #     # LockedDropout(),
        #     pBLSTM(input_size=2*encoder_hidden_size,hidden_size=encoder_hidden_size),
        #     # LockedDropout(),
        #     pBLSTM(input_size=2*encoder_hidden_size,hidden_size=encoder_hidden_size)
        #     # LockedDropout()
        # )
        # self.pBLSTM1 = pBLSTM(input_size=encoder_hidden_size,hidden_size=encoder_hidden_size)
        # self.dropout1 = LockedDropout()
        # self.pBLSTM2 = pBLSTM(input_size=2*encoder_hidden_size,hidden_size=encoder_hidden_size)
        # self.pBLSTM3 = pBLSTM(input_size=2*encoder_hidden_size,hidden_size=encoder_hidden_size)
        self.lstm_layers = torch.nn.ModuleList()
        for i in range(2):
            input_size_pblstm = 256 if i == 0 else encoder_hidden_size * 2
            self.lstm_layers.append(pBLSTM(input_size=input_size_pblstm, hidden_size=encoder_hidden_size))
            self.lstm_layers.append(LockedDropout())
        
    def forward(self, x, x_lens):
        # Where are x and x_lens coming from? The dataloader
        #TODO: Call the permute layer and embedding layer
        x = self.PermuteBlock(x)
        x = self.embedding(x)
        x = self.PermuteBlock(x)
        # # TODO: Pack Padded Sequence
        # packed = pack_padded_sequence(input=x,lengths=x_lens,batch_first=True,enforce_sorted=False)
        # # TODO: Pass Sequence through the pyramidal Bi-LSTM layer
        # x = self.pBLSTM1(packed)
        # # TODO: Pad Packed Sequence
        # encoder_outputs, encoder_lens= pad_packed_sequence(sequence=x,batch_first=True)
        # print(f'The output from plstm, {encoder_outputs.shape},{encoder_lens.shape}')
        for layer in self.lstm_layers:
            if isinstance(layer, pBLSTM):
                x_packed = pack_padded_sequence(x, x_lens, batch_first=True, enforce_sorted=False)
                x_packed = layer(x_packed)
                x, x_lens = pad_packed_sequence(x_packed, batch_first=True)
            else:
                x = torch.permute(x, (1, 0, 2))
                x = layer(x)
                x = torch.permute(x,(1,0,2))
        # Remember the number of output(s) each function returns

        return x, x_lens
    

### Decoder

In [28]:
class Decoder(torch.nn.Module):

    def __init__(self, embed_size, output_size= 41):
        super().__init__()

        self.mlp = torch.nn.Sequential( #maybe change the hidden size to be normal 256,512...
            PermuteBlock(), torch.nn.BatchNorm1d(embed_size), PermuteBlock(),
            #TODO define your MLP arch. Refer HW1P2
            #Use Permute Block before and after BatchNorm1d() to match the size
            torch.nn.Linear(in_features=embed_size,out_features=128),
            torch.nn.GELU(),
            # torch.nn.Dropout(p=0.2),
            torch.nn.Linear(in_features=128, out_features=64),
            torch.nn.GELU(),
            # torch.nn.Dropout(p=0.2),
            torch.nn.Linear(in_features=64, out_features=output_size)
            # PermuteBlock(), torch.nn.BatchNorm1d(512*4), PermuteBlock(),
            # torch.nn.GELU(),
            # torch.nn.Dropout(p=0.2),
            # torch.nn.Linear(in_features=512*4, out_features=512*8),
            # PermuteBlock(), torch.nn.BatchNorm1d(512*8), PermuteBlock(),
            # torch.nn.GELU(),
            # torch.nn.Dropout(p=0.2),
            # torch.nn.Linear(in_features=512*8, out_features=512*4),
            # PermuteBlock(), torch.nn.BatchNorm1d(512*4), PermuteBlock(),
            # torch.nn.GELU(),
            # torch.nn.Dropout(p=0.2),
            # torch.nn.Linear(in_features=512*4, out_features=512*2),
            # PermuteBlock(), torch.nn.BatchNorm1d(512*2), PermuteBlock(),
            # torch.nn.GELU(),
            # torch.nn.Dropout(p=0.2),
            # torch.nn.Linear(in_features=512*2, out_features=512*1),
            # PermuteBlock(), torch.nn.BatchNorm1d(512*1), PermuteBlock(),
            # torch.nn.GELU(),
            # torch.nn.Dropout(p=0.2),
            # torch.nn.Linear(in_features=512*1, out_features=output_size)
        )

        self.softmax = torch.nn.LogSoftmax(dim=2)

    def forward(self, encoder_out):
        #TODO call your MLP
        out = self.mlp(encoder_out)
        #TODO Think what should be the final output of the decoder for the classification
        decoder_out = self.softmax(out)
        return decoder_out

In [29]:
class ASRModel(torch.nn.Module):

    def __init__(self, input_size, embed_size= 192, output_size= len(PHONEMES)):
        super().__init__()

        self.augmentations  = torch.nn.Sequential(
            #TODO Add Time Masking/ Frequency Masking
            #Hint: See how to use PermuteBlock() function defined above
        )
        self.encoder        = Encoder(input_size=input_size, encoder_hidden_size=embed_size*2)
        self.decoder        = Decoder(embed_size=4* embed_size, output_size=41) # this is because the encoder double the hidden size



    def forward(self, x, lengths_x):

        if self.training:
            x = self.augmentations(x)

        encoder_out, encoder_lens   = self.encoder(x, lengths_x)
        decoder_out                 = self.decoder(encoder_out)
        # print(f'The final dimension: {decoder_out.shape},{encoder_lens.shape}')
        return decoder_out, encoder_lens

## Initialize ASR Network

In [30]:
model = ASRModel(
    input_size  = 27,
    embed_size  = 192,
    output_size = len(PHONEMES)
).to(device)
print(model)
summary(model, x.to(device), lx)

ASRModel(
  (augmentations): Sequential()
  (encoder): Encoder(
    (PermuteBlock): PermuteBlock()
    (embedding): Sequential(
      (0): Conv1d(27, 64, kernel_size=(3,), stride=(1,), padding=(1,))
      (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): GELU(approximate='none')
      (3): Conv1d(64, 128, kernel_size=(3,), stride=(1,), padding=(1,))
      (4): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (5): GELU(approximate='none')
      (6): Conv1d(128, 256, kernel_size=(3,), stride=(1,), padding=(1,))
      (7): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (8): GELU(approximate='none')
    )
    (lstm_layers): ModuleList(
      (0): pBLSTM(
        (blstm): LSTM(512, 384, num_layers=3, dropout=0.3, bidirectional=True)
      )
      (1): LockedDropout(p=0.25)
      (2): pBLSTM(
        (blstm): LSTM(1536, 384, num_layers=3, dropout=0.3, bidirectional=True

Unnamed: 0_level_0,Kernel Shape,Output Shape,Params,Mult-Adds
Layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0_augmentations,-,"[64, 2936, 27]",,
1_encoder.PermuteBlock_PermuteBlock,-,"[64, 27, 2936]",,
2_encoder.embedding.Conv1d_0,"[27, 64, 3]","[64, 64, 2936]",5248.0,15220224.0
3_encoder.embedding.BatchNorm1d_1,[64],"[64, 64, 2936]",128.0,64.0
4_encoder.embedding.GELU_2,-,"[64, 64, 2936]",,
5_encoder.embedding.Conv1d_3,"[64, 128, 3]","[64, 128, 2936]",24704.0,72155136.0
6_encoder.embedding.BatchNorm1d_4,[128],"[64, 128, 2936]",256.0,128.0
7_encoder.embedding.GELU_5,-,"[64, 128, 2936]",,
8_encoder.embedding.Conv1d_6,"[128, 256, 3]","[64, 256, 2936]",98560.0,288620544.0
9_encoder.embedding.BatchNorm1d_7,[256],"[64, 256, 2936]",512.0,256.0


In [31]:
import multiprocessing
n_cpus = multiprocessing.cpu_count()
print(n_cpus)

4


# Training Config
Initialize Loss Criterion, Optimizer, CTC Beam Decoder, Scheduler, Scaler (Mixed-Precision), etc.

In [32]:
#TODO
criterion = torch.nn.CTCLoss(blank=0, reduction='mean',zero_infinity=True)  # Define CTC loss as the criterion. How would the losses be reduced?
# CTC Loss: https://pytorch.org/docs/stable/generated/torch.nn.CTCLoss.html
# Refer to the handout for hints

optimizer = torch.optim.AdamW(model.parameters(), lr= config['lr'], weight_decay=1e-2) #Defining Optimizer
# Declare the decoder. Use the CTC Beam Decoder to decode phonemes
# CTC Beam Decoder Doc: https://github.com/parlance/ctcdecode
decoder = CTCBeamDecoder(
    labels = LABELS, #
    model_path=None,
    alpha=0,
    beta=0,
    cutoff_top_n=40,
    cutoff_prob=1.0,
    beam_width=config['beam_width'],
    num_processes=1,
    blank_id=0,
    log_probs_input=True
)

# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=1, threshold=0.5, threshold_mode = 'rel',factor=0.8)
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, threshold=1e-2, threshold_mode = 'abs',factor=0.8)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=1, threshold=0.1, threshold_mode = 'abs',factor=0.8, min_lr=5e-5)
# Mixed Precision, if you need it
scaler = torch.cuda.amp.GradScaler()

# Decode Prediction

In [33]:
def decode_prediction(output, output_lens, decoder, PHONEME_MAP= LABELS):

    # output = torch.permute(output, (1, 0, 2))
    # TODO: look at docs for CTC.decoder and find out what is returned here. Check the shape of output and expected shape in decode.
    beam_results, beam_scores, timesteps, out_lens = decoder.decode(output, seq_lens = output_lens) #lengths - list of lengths
    pred_strings                    = []

    for i in range(output_lens.shape[0]):
        #TODO: Create the prediction from the output of decoder.decode. Don't forget to map it using PHONEMES_MAP.
        # get the single batch result
        pred = beam_results[i][0][:out_lens[i][0]]  # we only care about the highest result
        #
        decoded_string = ''.join([PHONEME_MAP[p] for p in pred])
        pred_strings.append(decoded_string)
    return pred_strings # this returns for each batch

def calculate_levenshtein(output, label, output_lens, label_lens, decoder, PHONEME_MAP= LABELS): # y - sequence of integers

    dist            = 0
    batch_size      = label.shape[0]

    pred_strings    = decode_prediction(output, output_lens, decoder, PHONEME_MAP)

    for i in range(batch_size):
        # TODO: Get predicted string and label string for each element in the batch
        pred_string = pred_strings[i]
        label_index = label[i, :label_lens[i]].tolist()
        label_string = ''.join([PHONEME_MAP[index] for index in label_index])
        dist += Levenshtein.distance(pred_string, label_string)

    dist /= batch_size # TODO: Uncomment this, but think about why we are doing this

    return dist

# Test Implementation

In [34]:
# test code to check shapes
torch.cuda.empty_cache()
model.eval()
for i, data in enumerate(val_loader,0):
    x, y, lx, ly = data
    x, y = x.to(device), y.to(device)
    h, lh = model(x, lx)
    print(h.shape)
    h = torch.permute(h, (1, 0, 2))
    print(h.shape, y.shape)
    loss = criterion(h, y, lh, ly)
    print(loss)
    h = torch.permute(h, (1, 0, 2))
    print(calculate_levenshtein(h, y, lx, ly, decoder, LABELS))

    break

torch.Size([64, 734, 41])
torch.Size([734, 64, 41]) torch.Size([64, 265])
tensor(7.6026, device='cuda:0', grad_fn=<MeanBackward0>)
203.703125


# WandB

You will need to fetch your api key from wandb.ai

In [35]:
import wandb
# from typing_extensions import Literal
wandb.login(key="b9fec08c02d03c41bdf5fe1f5268589277270fbe")



True

In [36]:
run = wandb.init(
    name = "try17_TA", ## Wandb creates random run names if you skip this field
    reinit = True, ### Allows reinitalizing runs when you re-run this cell
    # id = 'yvuki600',### Insert specific run id here if you want to resume a previous run
    # resume = "must", ### You need this to resume previous runs, but comment out reinit = True when using this
    project = "hw3p2-ablations", ### Project should be created in your wandb account
    config = config ### Wandb Config for your run
)

VBox(children=(Label(value='528.512 MB of 528.512 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
lr,▁▁▁▁
train_loss,█▂▁▁
valid_dist,▅▁▇█
valid_loss,█▃▂▁

0,1
lr,0.002
train_loss,3.34129
valid_dist,71.73961
valid_loss,3.29424


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011114110611106297, max=1.0…

# Train Functions

In [37]:
# from tqdm import tqdm

def train_model(model, train_loader, criterion, optimizer):

    model.train()
    batch_bar = tqdm(total=len(train_loader), dynamic_ncols=True, leave=False, position=0, desc='Train')

    total_loss = 0

    for i, data in enumerate(train_loader):
        optimizer.zero_grad()

        x, y, lx, ly = data
        x, y = x.to(device), y.to(device)

        with torch.cuda.amp.autocast():
            h, lh = model(x, lx)
            h = torch.permute(h, (1, 0, 2))
            loss = criterion(h, y, lh, ly)

        total_loss += loss.item()

        batch_bar.set_postfix(
            loss="{:.04f}".format(float(total_loss / (i + 1))),
            lr="{:.06f}".format(float(optimizer.param_groups[0]['lr'])))

        batch_bar.update() # Update tqdm bar

        # Another couple things you need for FP16.
        scaler.scale(loss).backward() # This is a replacement for loss.backward()
        scaler.step(optimizer) # This is a replacement for optimizer.step()
        scaler.update() # This is something added just for FP16

        del x, y, lx, ly, h, lh, loss
        torch.cuda.empty_cache()

    batch_bar.close() # You need this to close the tqdm bar

    return total_loss / len(train_loader)


def validate_model(model, val_loader, decoder, phoneme_map= LABELS):

    model.eval()
    batch_bar = tqdm(total=len(val_loader), dynamic_ncols=True, position=0, leave=False, desc='Val')

    total_loss = 0
    vdist = 0

    for i, data in enumerate(val_loader):

        x, y, lx, ly = data
        x, y = x.to(device), y.to(device)

        with torch.inference_mode():
            h, lh = model(x, lx)
            h = torch.permute(h, (1, 0, 2))
            loss = criterion(h, y, lh, ly)

        total_loss += float(loss)
        vdist += calculate_levenshtein(torch.permute(h, (1, 0, 2)), y, lh, ly, decoder, phoneme_map)

        batch_bar.set_postfix(loss="{:.04f}".format(float(total_loss / (i + 1))), dist="{:.04f}".format(float(vdist / (i + 1))))

        batch_bar.update()

        del x, y, lx, ly, h, lh, loss
        torch.cuda.empty_cache()

    batch_bar.close()
    total_loss = total_loss/len(val_loader)
    val_dist = vdist/len(val_loader)
    return total_loss, val_dist

## Training Setup

In [38]:
def save_model(model, optimizer, scheduler, metric, epoch, path):
    torch.save(
        {'model_state_dict'         : model.state_dict(),
         'optimizer_state_dict'     : optimizer.state_dict(),
         'scheduler_state_dict'     : scheduler.state_dict(),
         metric[0]                  : metric[1],
         'epoch'                    : epoch},
         path
    )

def load_model(path, model, metric= 'valid_dist', optimizer= None, scheduler= None):

    checkpoint = torch.load(path)
    model.load_state_dict(checkpoint['model_state_dict'])

    if optimizer != None:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    if scheduler != None:
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])

    epoch   = checkpoint['epoch']
    metric  = checkpoint[metric]

    return [model, optimizer, scheduler, epoch, metric]

In [39]:
# This is for checkpointing, if you're doing it over multiple sessions


last_epoch_completed = 0
start = last_epoch_completed
end = config["epochs"]
best_lev_dist = float("inf") # if you're restarting from some checkpoint, use what you saw there.
epoch_model_path =  '/home/22941/epoch_model_path.pth'#set the model path( Optional, you can just store best one. Make sure to make the changes below )
best_model_path = '/home/22941/best_model_path.pth'#TODO set best model path

In [41]:
model,optimizer,scheduler,epochs,metric= load_model(path=best_model_path,model=model,optimizer=optimizer,scheduler=scheduler)
# manually change the optimizer state
# optimizer.param_groups[0]['lr'] = 2e-4
# 
print(f'model: {model}\n',f'optimizer: {optimizer}\n',f'scheduler: {scheduler}\n',f'epoch: {epochs}\n',f'metric: {metric}\n')


model: ASRModel(
  (augmentations): Sequential()
  (encoder): Encoder(
    (PermuteBlock): PermuteBlock()
    (embedding): Sequential(
      (0): Conv1d(27, 64, kernel_size=(3,), stride=(1,), padding=(1,))
      (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): GELU(approximate='none')
      (3): Conv1d(64, 128, kernel_size=(3,), stride=(1,), padding=(1,))
      (4): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (5): GELU(approximate='none')
      (6): Conv1d(128, 256, kernel_size=(3,), stride=(1,), padding=(1,))
      (7): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (8): GELU(approximate='none')
    )
    (lstm_layers): ModuleList(
      (0): pBLSTM(
        (blstm): LSTM(512, 384, num_layers=3, dropout=0.3, bidirectional=True)
      )
      (1): LockedDropout(p=0.25)
      (2): pBLSTM(
        (blstm): LSTM(1536, 384, num_layers=3, dropout=0.3, bidirection

In [40]:
torch.cuda.empty_cache()
gc.collect()

#TODO: Please complete the training loop

for epoch in range(0, 100):

    print("\nEpoch: {}/{}".format(epoch+1, config['epochs']))

    curr_lr = float(optimizer.param_groups[0]['lr'])

    train_loss              = train_model(model=model,train_loader=train_loader,criterion=criterion,optimizer=optimizer)
    valid_loss, valid_dist  = validate_model(model=model,val_loader=val_loader,decoder=decoder,phoneme_map=LABELS)
    scheduler.step(valid_dist)

    print("\tTrain Loss {:.04f}\t Learning Rate {:.07f}".format(train_loss, curr_lr))
    print("\tVal Dist {:.04f}%\t Val Loss {:.04f}".format(valid_dist, valid_loss))


    wandb.log({
        'train_loss': train_loss,
        'valid_dist': valid_dist,
        'valid_loss': valid_loss,
        'lr'        : curr_lr
    })

    save_model(model, optimizer, scheduler, ['valid_dist', valid_dist], epoch, epoch_model_path)
    wandb.save(epoch_model_path)
    print("Saved epoch model")

    if valid_dist <= best_lev_dist:
        best_lev_dist = valid_dist
        save_model(model, optimizer, scheduler, ['valid_dist', valid_dist], epoch, best_model_path)
        wandb.save(best_model_path)
        print("Saved best model")
      # You may find it interesting to exlplore Wandb Artifcats to version your models
run.finish()


Epoch: 1/50


                                                                                  

	Train Loss 3.4057	 Learning Rate 0.0020000
	Val Dist 69.4148%	 Val Loss 3.3000
Saved epoch model
Saved best model

Epoch: 2/50


                                                                                  

	Train Loss 2.4861	 Learning Rate 0.0020000
	Val Dist 27.0820%	 Val Loss 1.2052
Saved epoch model
Saved best model

Epoch: 3/50


                                                                                  

	Train Loss 1.0008	 Learning Rate 0.0020000
	Val Dist 15.9071%	 Val Loss 0.7239
Saved epoch model
Saved best model

Epoch: 4/50


                                                                                  

	Train Loss 0.7192	 Learning Rate 0.0020000
	Val Dist 12.2523%	 Val Loss 0.5619
Saved epoch model
Saved best model

Epoch: 5/50


                                                                                  

	Train Loss 0.6041	 Learning Rate 0.0020000
	Val Dist 10.4472%	 Val Loss 0.4801
Saved epoch model
Saved best model

Epoch: 6/50


                                                                                  

	Train Loss 0.5286	 Learning Rate 0.0020000
	Val Dist 9.2939%	 Val Loss 0.4338
Saved epoch model
Saved best model

Epoch: 7/50


                                                                                  

	Train Loss 0.4774	 Learning Rate 0.0020000
	Val Dist 8.8095%	 Val Loss 0.4144
Saved epoch model
Saved best model

Epoch: 8/50


                                                                                  

	Train Loss 0.4378	 Learning Rate 0.0020000
	Val Dist 7.9649%	 Val Loss 0.3766
Saved epoch model
Saved best model

Epoch: 9/50


                                                                                  

	Train Loss 0.4155	 Learning Rate 0.0020000
	Val Dist 7.7082%	 Val Loss 0.3714
Saved epoch model
Saved best model

Epoch: 10/50


                                                                                  

	Train Loss 0.3852	 Learning Rate 0.0020000
	Val Dist 7.2935%	 Val Loss 0.3475
Saved epoch model
Saved best model

Epoch: 11/50


                                                                                  

	Train Loss 0.3705	 Learning Rate 0.0020000
	Val Dist 7.1881%	 Val Loss 0.3443
Saved epoch model
Saved best model

Epoch: 12/50


                                                                                  

	Train Loss 0.3510	 Learning Rate 0.0020000
	Val Dist 6.8393%	 Val Loss 0.3312
Saved epoch model
Saved best model

Epoch: 13/50


                                                                                  

	Train Loss 0.3353	 Learning Rate 0.0020000
	Val Dist 6.7756%	 Val Loss 0.3356
Saved epoch model
Saved best model

Epoch: 14/50


                                                                                  

	Train Loss 0.3266	 Learning Rate 0.0020000
	Val Dist 6.6070%	 Val Loss 0.3314
Saved epoch model
Saved best model

Epoch: 15/50


                                                                                  

	Train Loss 0.3158	 Learning Rate 0.0020000
	Val Dist 6.3494%	 Val Loss 0.3202
Saved epoch model
Saved best model

Epoch: 16/50


                                                                                  

	Train Loss 0.2971	 Learning Rate 0.0020000
	Val Dist 6.3296%	 Val Loss 0.3209
Saved epoch model
Saved best model

Epoch: 17/50


                                                                                  

	Train Loss 0.2914	 Learning Rate 0.0020000
	Val Dist 6.1091%	 Val Loss 0.3103
Saved epoch model
Saved best model

Epoch: 18/50


                                                                                  

	Train Loss 0.2819	 Learning Rate 0.0020000
	Val Dist 6.1229%	 Val Loss 0.3083
Saved epoch model

Epoch: 19/50


                                                                                  

	Train Loss 0.2752	 Learning Rate 0.0020000
	Val Dist 5.9623%	 Val Loss 0.3061
Saved epoch model
Saved best model

Epoch: 20/50


                                                                                  

	Train Loss 0.2653	 Learning Rate 0.0020000
	Val Dist 5.8691%	 Val Loss 0.3025
Saved epoch model
Saved best model

Epoch: 21/50


                                                                                  

	Train Loss 0.2716	 Learning Rate 0.0020000
	Val Dist 5.7870%	 Val Loss 0.2971
Saved epoch model
Saved best model

Epoch: 22/50


                                                                                  

	Train Loss 0.2563	 Learning Rate 0.0020000
	Val Dist 5.7264%	 Val Loss 0.2937
Saved epoch model
Saved best model

Epoch: 23/50


                                                                                  

	Train Loss 0.2508	 Learning Rate 0.0020000
	Val Dist 5.5070%	 Val Loss 0.2944
Saved epoch model
Saved best model

Epoch: 24/50


                                                                                  

	Train Loss 0.2437	 Learning Rate 0.0020000
	Val Dist 5.8229%	 Val Loss 0.3054
Saved epoch model

Epoch: 25/50


                                                                                  

	Train Loss 0.2463	 Learning Rate 0.0020000
	Val Dist 5.6554%	 Val Loss 0.2957
Saved epoch model

Epoch: 26/50


                                                                                  

	Train Loss 0.2227	 Learning Rate 0.0016000
	Val Dist 5.3176%	 Val Loss 0.2821
Saved epoch model
Saved best model

Epoch: 27/50


                                                                                  

	Train Loss 0.2144	 Learning Rate 0.0016000
	Val Dist 5.1831%	 Val Loss 0.2782
Saved epoch model
Saved best model

Epoch: 28/50


                                                                                  

	Train Loss 0.2073	 Learning Rate 0.0016000
	Val Dist 5.2796%	 Val Loss 0.2886
Saved epoch model

Epoch: 29/50


                                                                                  

	Train Loss 0.2089	 Learning Rate 0.0016000
	Val Dist 5.1708%	 Val Loss 0.2845
Saved epoch model
Saved best model

Epoch: 30/50


                                                                                  

	Train Loss 0.1926	 Learning Rate 0.0012800
	Val Dist 5.0038%	 Val Loss 0.2772
Saved epoch model
Saved best model

Epoch: 31/50


                                                                                  

	Train Loss 0.1920	 Learning Rate 0.0012800
	Val Dist 4.9870%	 Val Loss 0.2775
Saved epoch model
Saved best model

Epoch: 32/50


                                                                                  

	Train Loss 0.1844	 Learning Rate 0.0012800
	Val Dist 4.9770%	 Val Loss 0.2766
Saved epoch model
Saved best model

Epoch: 33/50


                                                                                  

	Train Loss 0.1750	 Learning Rate 0.0010240
	Val Dist 4.7753%	 Val Loss 0.2762
Saved epoch model
Saved best model

Epoch: 34/50


                                                                                  

	Train Loss 0.1693	 Learning Rate 0.0010240
	Val Dist 4.7401%	 Val Loss 0.2728
Saved epoch model
Saved best model

Epoch: 35/50


                                                                                  

	Train Loss 0.1688	 Learning Rate 0.0010240
	Val Dist 4.8311%	 Val Loss 0.2773
Saved epoch model

Epoch: 36/50


                                                                                  

	Train Loss 0.1591	 Learning Rate 0.0008192
	Val Dist 4.6689%	 Val Loss 0.2752
Saved epoch model
Saved best model

Epoch: 37/50


                                                                                  

	Train Loss 0.1537	 Learning Rate 0.0008192
	Val Dist 4.7284%	 Val Loss 0.2806
Saved epoch model

Epoch: 38/50


Train:   3%|▎         | 15/446 [00:11<05:29,  1.31it/s, loss=0.1496, lr=0.000819]

KeyboardInterrupt: 

# Generate Predictions and Submit to Kaggle

In [42]:
#TODO: Make predictions

# Follow the steps below:
# 1. Create a new object for CTCBeamDecoder with larger (why?) number of beams
# 2. Get prediction string by decoding the results of the beam decoder

TEST_BEAM_WIDTH = 10

test_decoder  = CTCBeamDecoder(
    labels = LABELS, #
    model_path=None,
    alpha=0,
    beta=0,
    cutoff_top_n=40,
    cutoff_prob=1.0,
    beam_width=TEST_BEAM_WIDTH,
    num_processes=1,
    blank_id=0,
    log_probs_input=True
)
results = []

model.eval()
print("Testing")
for data in tqdm(test_loader):

    x, lx   = data
    x       = x.to(device)

    with torch.no_grad():
        h, lh = model(x, lx)

    prediction_string= decode_prediction(output=h,output_lens=lh,decoder=test_decoder,PHONEME_MAP=LABELS) # TODO call decode_prediction
    print(len(prediction_string))
    #TODO save the output in results array.
    for string in prediction_string:
        results.append(string)
    del x, lx, h, lh
    torch.cuda.empty_cache()

Testing




64




64




64




64




64




64




64




64




64




64




64




64




64




64




64




64




64




64




64




64




64




64




64




64




64




64




64




64




64




64




64




64




64




64




64




64




64




64




64




64


100%|██████████| 41/41 [00:44<00:00,  1.10s/it]

60





In [43]:
data_dir = f"11-785-s24-hw3p2//test-clean/random_submission.csv"
df = pd.read_csv(data_dir)
df.label = results
df.to_csv('submission.csv', index = False)

In [39]:
!kaggle competitions submit -c hw3p2asr-s24 -f submission.csv -m "I made it!"

100%|█████████████████████████████████████████| 209k/209k [00:00<00:00, 332kB/s]
Successfully submitted to HW3P2_ASR-S24