# Start

In [None]:
# Run-time switches for this notebook (1 = enabled, 0 = disabled).
DOWNLOAD_DATA = 1  # gates the Kaggle download/unzip cell
BATCH_SIZE = 128  # mini-batch size used for the train/val/test loaders

CON_TRAIN = 1  # NOTE(review): presumably "continue training" from a checkpoint; not read in the visible cells -- confirm

DEBUG = 0  # when set: run on CPU, shrink the batch size to 2, and print one batch of shapes

SUBMIT_TO_KAGGLE = 0  # NOTE(review): presumably gates the submission step; not read in the visible cells -- confirm

# Libraries and Initial Processing

In [None]:
import os
import sys
import pandas as pd
import numpy as np
import Levenshtein as lev
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.utils.rnn as rnn_utils
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.nn.utils as utils
import seaborn as sns
import matplotlib.pyplot as plt
import time
import random
import datetime
from torch.utils import data
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

from tqdm import tqdm

import pdb
# Detect whether a CUDA device is available; everything below keys off this flag.
cuda = torch.cuda.is_available()

print(cuda, sys.version)

# `device` is read as a module-level global by the model and training code.
device = torch.device("cuda" if cuda else "cpu")
# NOTE(review): num_workers is computed here but is not passed to any DataLoader
# in the visible cells -- confirm whether that is intentional.
num_workers = 4 if cuda else 0
print("Cuda = "+str(cuda)+" with num_workers = "+str(num_workers))
# Seed NumPy and PyTorch. Python's `random` module is imported but not seeded here.
np.random.seed(11785)
torch.manual_seed(11785)

# The labels of the dataset contain letters in LETTER_LIST.
# You should use this to convert the letters to the corresponding indices
# and train your model with numerical labels.
# Index 0 is '<sos>' and the last index (29) is '<eos>'.
LETTER_LIST = ['<sos>', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', \
         'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', "'", ' ', '<eos>']

True 3.7.13 (default, Apr 24 2022, 01:04:09) 
[GCC 7.5.0]
Cuda = True with num_workers = 4


In [None]:
# Debug override: force CPU execution and a tiny batch so shapes are easy to inspect.
# Note that `device` becomes a plain string here rather than a torch.device.
if DEBUG:
    device = "cpu"
    BATCH_SIZE = 2
    

In [None]:
def create_dictionaries(letter_list):
    '''
    Build the forward and reverse lookup tables for a vocabulary.

    Args:
        letter_list: sequence of vocabulary symbols (e.g. LETTER_LIST); the
            position of a symbol in the sequence is its index.

    Return:
        letter2index: dict mapping each symbol to its position
        index2letter: dict mapping each position to its symbol
    '''
    letter2index = {}
    index2letter = {}
    for position, letter in enumerate(letter_list):
        # A repeated symbol keeps its last position in letter2index.
        letter2index[letter] = position
        index2letter[position] = letter
    return letter2index, index2letter

def transform_index_to_letter(batch_indices):  #(B,Tmax)
    '''
    Convert a batch of index sequences into transcript strings using the
    module-level letter2index / index2letter tables built from LETTER_LIST.

    Decoding of a sequence stops at the first index that is '<eos>', is
    out of range for LETTER_LIST, or is <= 0 (index 0 is '<sos>'); that
    index and everything after it are dropped.

    Args:
        batch_indices: iterable of index sequences, one per utterance

    Return:
        transcripts: list of strings, one per input sequence
    '''
    eos_index = letter2index['<eos>']
    vocab_size = len(LETTER_LIST)

    transcripts = []
    for row in batch_indices:
        letters = []
        for idx in row:
            # Stop symbol, out-of-range index, or '<sos>'/non-positive index ends the transcript.
            if idx == eos_index or idx >= vocab_size or idx <= 0:
                break
            letters.append(index2letter[idx])
        transcripts.append("".join(letters))
    return transcripts
# Module-level lookup tables; the dataset classes and the decoding helpers read these globals.
letter2index, index2letter = create_dictionaries(LETTER_LIST)

In [None]:
# Fetch and unpack the competition data (IPython shell escapes; only runs when DOWNLOAD_DATA is set).
if DOWNLOAD_DATA:

    # Download the competition archive with the Kaggle CLI.
    !kaggle competitions download -c 11-785-s22-hw4p2

    # Extract the archive into the current working directory.
    !unzip 11-785-s22-hw4p2.zip

    # List the working directory to confirm the extraction.
    !ls

# Dataset and Dataloading

You will need to implement the Dataset class on your own. You can implement it similarly to HW3P2. However, you are welcome to do it your own way if that is more comfortable or efficient.

Note that you need to use LETTER_LIST to convert the transcript into numerical labels for the model.


Example of raw transcript:

    ['<sos>', 'N', 'O', 'R', 'T', 'H', 'A', 'N', 'G', 'E', 'R', ' ','A', 'B', 'B', 'E', 'Y', '<eos>']

Example of converted transcript ready to process for the model:

    [0, 14, 15, 18, 20, 8, 1, 14, 7, 5, 18, 28, 1, 2, 2, 5, 25, 29]


In [None]:
# Cepstral mean normalization.
# x: tensor; the mean is taken over dim 0 and subtracted from every row.
def normalize(x):
    """Return a new tensor equal to `x` minus its mean along dim 0 (input is not modified)."""
    return x - x.mean(dim=0)


In [None]:
# Dataset variant that loads a whole partition from two .npy files into memory.
class LibriSimple(torch.utils.data.Dataset):
    """
    In-memory dataset read from `<data_path>/<partition>.npy` (features) and
    `<data_path>/<partition>_transcripts.npy` (transcripts).

    Each transcript is converted to a LongTensor of letter2index indices with
    its first token dropped; the remaining tokens (including the final one)
    are kept.
    """

    def __init__(self, data_path, partition= "train"): # You can use partition to specify train or dev
        self.X_path = data_path + "/" + partition + ".npy"
        self.Y_path = data_path + "/" + partition + "_transcripts.npy"

        # NOTE: allow_pickle=True executes pickled content; only load trusted files.
        features = np.load(self.X_path, allow_pickle=True)
        self.X = [torch.from_numpy(utterance) for utterance in features]

        transcripts = np.load(self.Y_path, allow_pickle=True)
        # transcript[1:] skips the first token of every transcript.
        self.Yy = [
            torch.LongTensor([letter2index[token] for token in transcript[1:]])
            for transcript in transcripts
        ]

        assert len(self.X) == len(self.Yy)

    def __len__(self):
        return len(self.Yy)

    def __getitem__(self, ind):
        features = normalize(self.X[ind])
        labels = self.Yy[ind]
        return features, labels

    def collate_fn(self, batch):
        """
        Pad a list of (features, labels) pairs to common lengths.

        Returns (padded features, padded labels, feature lengths, label
        lengths); padding is batch-first and lengths are the pre-padding sizes.
        """
        feature_seqs = []
        label_seqs = []
        for features, labels in batch:
            feature_seqs.append(features)
            label_seqs.append(labels)

        lengths_x = torch.tensor([len(seq) for seq in feature_seqs])
        lengths_y = torch.tensor([len(seq) for seq in label_seqs])

        padded_x = pad_sequence(feature_seqs, batch_first=True)
        padded_y = pad_sequence(label_seqs, batch_first=True)

        return padded_x, padded_y, lengths_x, lengths_y

In [None]:
class LibriSamples(torch.utils.data.Dataset):
    """
    Dataset that lazily loads one utterance per item from
    `<data_path>/<partition>/mfcc/` and `<data_path>/<partition>/transcript/`.

    Items are paired by position: the i-th feature file goes with the i-th
    transcript file. Both listings are sorted so that this pairing is
    deterministic; `os.listdir` alone returns entries in arbitrary order, which
    can silently pair a feature file with the wrong transcript.
    Assumes corresponding files in the two directories sort into the same
    order (e.g. identical file names) -- TODO confirm against the data layout.
    """

    def __init__(self, data_path, partition= "train"): # You can use partition to specify train or dev

        self.X_dir = data_path + "/" + partition + "/mfcc/" # mfcc directory path
        self.Y_dir = data_path + "/" + partition +"/transcript/" # transcript path

        # Sorted listings keep X_files[i] and Y_files[i] aligned across runs and machines.
        self.X_files = sorted(os.listdir(self.X_dir))   # files in the mfcc directory
        self.Y_files = sorted(os.listdir(self.Y_dir))   # files in the transcript directory

        assert(len(self.X_files) == len(self.Y_files))


    def __len__(self):
        return len(self.X_files)

    def __getitem__(self, ind):
        """Return (normalized features, label indices) for the utterance at position `ind`."""
        X_path = self.X_dir + self.X_files[ind]
        Y_path = self.Y_dir + self.Y_files[ind]

        X = torch.from_numpy(np.load(X_path)) # feature array stored in the mfcc file
        Y = np.load(Y_path) # raw transcript tokens
        # Y[1:] skips the first token; the remaining tokens (including the last) are kept.
        Yy = torch.LongTensor([letter2index[yy] for yy in Y[1:]]) # numerical labels for the model

        return normalize(X), Yy

    def collate_fn(self, batch):
        """
        Pad the features and labels of a batch to common lengths (batch-first).

        Returns (padded features, padded labels, feature lengths, label
        lengths), where the lengths are the sizes before padding.
        """
        batch_x = [x for x,y in batch]
        batch_y = [y for x,y in batch]

        batch_x_pad = pad_sequence(batch_x, batch_first=True)
        lengths_x = [len(x) for x in batch_x] # original lengths before padding

        batch_y_pad = pad_sequence(batch_y, batch_first=True)
        lengths_y = [len(y) for y in batch_y] # original lengths before padding

        return batch_x_pad, batch_y_pad, torch.tensor(lengths_x), torch.tensor(lengths_y)

# You can either try to combine test data in the previous class or write a new Dataset class for test data
class LibriSamplesTest(torch.utils.data.Dataset):
    """
    Unlabelled dataset that lazily loads feature files from
    `<data_path>/<partition>/mfcc/`.

    Args:
        data_path: root directory of the data
        partition: sub-directory holding the `mfcc` folder
        test_order: optional path to a CSV with a `file` column listing the
            file names in the order they must be served. Without it, the
            directory listing is used in sorted order so the item order is
            deterministic.
    """

    def __init__(self, data_path, partition = "test", test_order = None): # test_order is the csv similar to what you used in hw1
        self.X_dir = data_path + "/" + partition + "/mfcc/"

        if test_order:
            # The CSV defines both which files are used and their order.
            self.X_names = list(pd.read_csv(test_order)["file"])
        else:
            # os.listdir order is arbitrary; sort it so predictions map to files reproducibly.
            self.X_names = sorted(os.listdir(self.X_dir))


    def __len__(self):
        return len(self.X_names)

    def __getitem__(self, ind):
        """Return the normalized features of the file at position `ind`."""
        X_path = self.X_dir + self.X_names[ind]
        X = torch.from_numpy(np.load(X_path))

        return normalize(X)

    def collate_fn(self, batch):
        """Pad a list of feature tensors (batch-first) and return (padded batch, original lengths)."""
        batch_x = [x for x in batch]
        batch_x_pad = pad_sequence(batch_x, batch_first=True)
        lengths_x = [len(x) for x in batch_x] # original lengths before padding

        return batch_x_pad, torch.tensor(lengths_x)

In [None]:
batch_size = BATCH_SIZE
root = '/content/hw4p2_student_data/hw4p2_student_data'

# Datasets: labelled train/dev partitions and the unlabelled test partition,
# the latter served in the order given by test_order.csv.
train_data = LibriSamples(root, partition="train")
val_data = LibriSamples(root, partition="dev")
test_data = LibriSamplesTest(root, partition="test", test_order=root+"/test/test_order.csv")

# Loaders: only the training loader shuffles; every loader pads with its dataset's own collate_fn.
train_loader = torch.utils.data.DataLoader(
    train_data,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=train_data.collate_fn,
    pin_memory=True,
)
val_loader = torch.utils.data.DataLoader(
    val_data,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=val_data.collate_fn,
    pin_memory=True,
)
test_loader = torch.utils.data.DataLoader(
    test_data,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=test_data.collate_fn,
    pin_memory=True,
)

print("Batch size: ", batch_size)
print("Train dataset samples = {}, batches = {}".format(len(train_data), len(train_loader)))
print("Val dataset samples = {}, batches = {}".format(len(val_data), len(val_loader)))
print("Test dataset samples = {}, batches = {}".format(len(test_data), len(test_loader)))

Batch size:  128
Train dataset samples = 28539, batches = 223
Val dataset samples = 2703, batches = 22
Test dataset samples = 2620, batches = 21


In [None]:
# Sanity check (DEBUG only): print the shapes of one batch from each labelled loader.
# The batch is unpacked directly in the loop header so the loop no longer rebinds
# the name `data`, which the import cell binds to the torch.utils.data module.
if DEBUG:
    for x, y, lx, ly in train_loader:
        print(x.shape, y.shape, lx.shape, len(ly))
        print(y[0]) # label indices of the first utterance in the batch
        break

    for x, y, lx, ly in val_loader:
        print(x.shape, y.shape, lx.shape, len(ly))
        print(y[0]) # label indices of the first utterance in the batch
        break

# Model

In [None]:
# Dropout whose mask is sampled once per sequence and reused at every time step.
class LockedDropout(nn.Module):
    """
    Dropout for 3-D inputs of shape (batch, seq, feature): a single
    (batch, 1, feature) Bernoulli mask is drawn and broadcast over the
    sequence dimension, and kept values are rescaled by 1 / (1 - dropout).
    """

    def __init__(self):
        super().__init__()

    def forward(self, x, dropout=0.5):
        # Only active in training mode with a non-zero rate; otherwise the input is returned untouched.
        if self.training and dropout:
            keep_prob = 1 - dropout
            n_batch, _, n_feat = x.size()
            # Same dtype/device as x; one row per batch element, shared by all time steps.
            mask = x.data.new(n_batch, 1, n_feat).bernoulli_(keep_prob) / keep_prob
            return mask.expand_as(x) * x
        return x

## Listen

In [None]:
class pBLSTM(nn.Module):
    '''
    Pyramidal BiLSTM layer: halves the time resolution of its input by merging
    every two consecutive frames into one (doubling the feature size) and then
    runs a single bidirectional LSTM over the shortened sequence.

    Args:
        input_dim: feature size fed to the LSTM, i.e. twice the feature size
            of the incoming sequence (two frames are concatenated).
        hidden_dim: hidden size per direction; output features are 2 * hidden_dim.
        dropouth: locked-dropout rate applied to the LSTM output (0 disables it).

    Input and output of forward() are PackedSequence objects so several of
    these layers can be chained in nn.Sequential.
    '''
    def __init__(self, input_dim, hidden_dim, dropouth=0.5):
        super(pBLSTM, self).__init__()

        self.hidden_dim = hidden_dim
        self.dropouth = dropouth

        self.blstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, num_layers=1, bidirectional=True, batch_first=True)
        self.lockdrop = LockedDropout()

    def forward(self, x): # packed in, packed out
        # Unpack to (B, T, dim) plus the true length of every sequence.
        x_padded, x_len = pad_packed_sequence(x, batch_first=True)

        # Drop the last frame when T is odd so frames can be merged pairwise.
        even_steps = (x_padded.size(1) // 2) * 2
        x_padded = x_padded[:, :even_steps, :] # (B, T_even, dim)
        # Concatenate each pair of neighbouring frames: (B, T_even/2, dim*2).
        x_reshaped = x_padded.reshape(x_padded.size(0), even_steps // 2, x_padded.size(2) * 2)

        # Halve the lengths. A length-1 sequence would become 0, which
        # pack_padded_sequence rejects, so keep at least one (partly padded) step.
        # Lengths stay on the CPU, which is what pack_padded_sequence expects;
        # no round trip through the global device is needed.
        x_len = torch.clamp(x_len // 2, min=1)

        x_packed = pack_padded_sequence(x_reshaped, lengths=x_len.cpu(), batch_first=True, enforce_sorted=False)

        # LSTM returns (output, (h_n, c_n)); only the per-step output is used.
        out_packed, _ = self.blstm(x_packed) # features: hidden_dim * 2

        # Locked dropout needs a padded tensor; skip the unpack/repack entirely
        # when dropout would be an identity (eval mode or rate 0).
        if self.training and self.dropouth:
            out, out_len = pad_packed_sequence(out_packed, batch_first=True)
            out = self.lockdrop(out, self.dropouth)
            out_packed = pack_padded_sequence(out, lengths=out_len.cpu(), batch_first=True, enforce_sorted=False)

        return out_packed
        

In [None]:
class Encoder(nn.Module):
    '''
    Encoder takes the padded utterances and their lengths and returns the
    attention key, the attention value, and the sequence lengths after the
    pyramidal layers have reduced the time resolution.

    Args:
        input_dim: feature size of the input frames.
        encoder_hidden_dim: hidden size per direction of every LSTM layer.
        key_value_size: output size of the key and value projections.
        dropouti: locked-dropout rate applied to the input (0 disables it).
    '''
    def __init__(self, input_dim, encoder_hidden_dim, key_value_size=128, dropouti=0):
        super(Encoder, self).__init__()

        # Stored because init_hidden() reads it; without it that method raised AttributeError.
        self.encoder_hidden_dim = encoder_hidden_dim
        self.dropouti = dropouti

        # The first LSTM layer at the bottom (no time reduction)
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=encoder_hidden_dim, num_layers=1, bidirectional=True, batch_first=True)

        # Three pyramidal layers. Each receives 2 frames x 2 directions x hidden
        # features per step, hence encoder_hidden_dim*4 inputs.
        self.pBLSTMs = nn.Sequential(
            pBLSTM(encoder_hidden_dim*4, encoder_hidden_dim),
            pBLSTM(encoder_hidden_dim*4, encoder_hidden_dim),
            pBLSTM(encoder_hidden_dim*4, encoder_hidden_dim, dropouth=0) #  no drop out in last layer
        )

        self.lockdrop = LockedDropout()

        # Linear projections producing Key and Value for attention from the
        # bidirectional output (2 * encoder_hidden_dim features).
        self.key_network = nn.Linear(encoder_hidden_dim*2, key_value_size)
        self.value_network = nn.Linear(encoder_hidden_dim*2, key_value_size)

    def init_hidden(self, bsz):
        """
        Build a zero initial state for the first (bidirectional, single-layer) LSTM.

        Returns a one-element list holding the (h_0, c_0) tuple; each tensor has
        shape (2, bsz, encoder_hidden_dim) -- one slice per direction -- and the
        dtype/device of the module parameters. Not used by forward().
        """
        weight = next(self.parameters())
        state_shape = (2, bsz, self.encoder_hidden_dim)
        return [(weight.new_zeros(state_shape), weight.new_zeros(state_shape))]

    def forward(self, x, x_len, hidden = None):
        """
        1. Pack the input and pass it through the first LSTM layer (no truncation)
        2. Pass it through the pyramidal LSTM layers
        3. Pad the result back to (B, T', *)
        4. Project to Key and Value

        Args:
            x: padded input batch, batch-first.
            x_len: length of every sequence in x before padding.
            hidden: unused; kept for interface compatibility.

        Returns:
            key, value: linear projections of the last pBLSTM output.
            lh: sequence lengths after the pyramidal reduction, as returned by
                pad_packed_sequence.
        """

        ## dropout to input before packing
        x = self.lockdrop(x, self.dropouti)

        # pack input (lengths must be on the CPU)
        x_packed = pack_padded_sequence(x, lengths=x_len.cpu(), enforce_sorted=False, batch_first=True)
        # output from first LSTM layer
        out1_packed, _ = self.lstm(x_packed) # features: encoder_hidden_dim*2

        # pass to pblstm: each of the three layers halves the time dimension
        out_packed = self.pBLSTMs(out1_packed)
        # pad back
        out, lh  = pad_packed_sequence(out_packed, batch_first = True)
        # get key and value
        key = self.key_network(out)
        value = self.value_network(out)

        return key, value, lh


In [None]:
# encoder = Encoder(input_dim=13,encoder_hidden_dim=256)
# # Try out your encoder on a tiny input before moving to the next step...
# print(encoder)

## Attend


In [None]:
# def plot_attention(attention):
#     # utility function for debugging
#     plt.clf()
#     sns.heatmap(attention, cmap='GnBu')
#     plt.show()

# Dot-product attention
class Attention(nn.Module):
    '''
    Unscaled dot-product attention between one decoder query per batch item
    and the encoder keys; the resulting weights average the encoder values.
    '''
    def __init__(self):
        super(Attention, self).__init__()
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, query, key, value, mask):
        """
        input:
            query: (batch_size, d_q)
            key: (batch_size, seq_len, d_k)
            value: (batch_size, seq_len, d_v)
            mask: (batch_size, seq_len) boolean, True at positions to ignore

        d_q must equal d_k for the dot product.

        return:
            context: (batch_size, d_v) attention-weighted sum of the values
            attention: (batch_size, seq_len) weights (returned for plotting/debugging)
        """
        query_row = query.unsqueeze(1)       # (B, 1, d)
        keys_t = key.transpose(1, 2)         # (B, d, T)
        energy = torch.bmm(query_row, keys_t).squeeze(1)  # (B, T)

        # Masked positions get a very negative score so softmax gives them ~0 weight.
        energy.masked_fill_(mask, -1e9)
        attention = self.softmax(energy)     # (B, T)

        weights_row = attention.unsqueeze(1)  # (B, 1, T)
        context = torch.bmm(weights_row, value).squeeze(1)  # (B, d_v)

        return context, attention
        # we return attention weights for plotting (for debugging)

## Spell

In [None]:
class EmbeddingDropout(nn.Module):
    """
    Applies dropout in the embedding layer by zeroing out some elements of
    the embedding vector. Dropout is applied to full rows of the embedding
    matrix: we drop out entire words and not components of a word's dense
    embedding.

    Args:
        embed_prob: probability of KEEPING a row of the embedding matrix;
            must be in (0, 1]. Kept rows are rescaled by 1 / embed_prob.
            A value of 1 disables the dropout.
    """
    def __init__(self, embed_prob):
        super().__init__()
        self.embed_prob = embed_prob

    def forward(self, emb, words):
        """
        Look up `words` in the (possibly row-dropped) weight of `emb`.

        Args:
            emb: embedding module whose `weight` is used as the lookup table.
            words: integer tensor of indices to look up.

        Returns:
            float tensor of shape words.shape + (embedding_dim,). In eval mode
            (or with embed_prob == 1) this is the plain embedding lookup.
        """
        weight = emb.weight
        if self.training and self.embed_prob != 1:
            # One keep/drop decision per vocabulary row, created directly on the
            # weight's device and dtype (no dependence on a global device).
            keep_probs = weight.new_full((weight.size(0), 1), float(self.embed_prob))
            mask = torch.bernoulli(keep_probs) / self.embed_prob
            # The masked table must be the one used for the lookup; previously
            # the mask was computed and then discarded.
            weight = mask * weight
        return F.embedding(words.long(), weight).float()


In [None]:
class Decoder(nn.Module):
    '''
    Attention-based character decoder that is unrolled one time step at a time.

    Each step embeds the previous character, concatenates it with the previous
    attention context, runs two stacked LSTMCells, uses the hidden state of the
    second cell as the attention query, and maps [query, context] to character
    scores through a small MLP. Teacher forcing and Gumbel noise are used in
    training mode.

    Args:
        vocab_size: number of output symbols.
        decoder_hidden_dim: hidden size of the first LSTMCell.
        embed_dim: character embedding size.
        key_value_size: size of the encoder key/value vectors; also the hidden
            size of the second LSTMCell so its output can serve as the query.
        dropouth: dropout rate applied between the two LSTMCells.
        dropoutemb: embedding dropout rate (passed on as keep probability 1 - dropoutemb).
    '''
    def __init__(self, vocab_size, decoder_hidden_dim, embed_dim, key_value_size=128, dropouth=0.5, dropoutemb=0.3):
        super(Decoder, self).__init__()
        # Hint: Be careful with the padding_idx (none is set here)

        # Lookup table with one embed_dim vector per vocabulary symbol
        self.embedding = nn.Embedding(vocab_size, embed_dim)

        # Embedding dropout (word level); EmbeddingDropout takes a keep probability
        self.embedding_dropout = EmbeddingDropout(embed_prob=1-dropoutemb)

        # Two stacked LSTM cells. The first consumes [char embedding, context];
        # the second outputs key_value_size features, used as the attention query.
        self.lstm1 = nn.LSTMCell(embed_dim+key_value_size, decoder_hidden_dim)
        self.lstm2 = nn.LSTMCell(decoder_hidden_dim,key_value_size)
    
        self.attention = Attention()
        self.vocab_size = vocab_size
        
        self.key_value_size = key_value_size

        # Output MLP: [query, context] (2 * key_value_size) -> key_value_size -> vocab_size
        self.character_prob = nn.Sequential(
            nn.Linear(key_value_size * 2,
                      key_value_size),
            nn.Tanh(),
            nn.Linear(key_value_size, vocab_size))
        
        # Optional: Weight-tying
        # NOTE(review): character_prob is an nn.Sequential, so this sets a `weight`
        # attribute on the container rather than on either Linear layer inside it.
        # It looks like the output projection is therefore NOT tied to the
        # embedding, while the attribute presumably still appears in the
        # state_dict -- confirm before changing, since saved checkpoints may
        # depend on it.
        self.character_prob.weight = self.embedding.weight

        # Dropout rate used between the two LSTM cells
        self.dropouth = dropouth


        # A regular nn.Dropout is used because the decoder handles one time step per call
        self.drop = nn.Dropout(self.dropouth)

    def forward(self, key, value, encoder_len, y=None, mode='train'):
        '''
        Args:
            key :(B, T, d_k) - Output of the Encoder (possibly from the Key projection layer)
            value: (B, T, d_v) - Output of the Encoder (possibly from the Value projection layer)
            encoder_len: (B,) - valid length of every encoder output sequence,
                used to mask padded positions in the attention
            y: (B, text_len) - Batch of target label indices; required when mode == 'train'
            mode: 'train' enables teacher forcing and decodes y.shape[1] steps;
                any other value decodes a fixed 600 steps from the model's own predictions

        Return:
            predictions: (B, num_steps, vocab_size) unnormalized character scores
            attentions: (num_steps, T) attention weights of the first batch item,
                stacked over decoding steps (for debugging/plotting)
        '''
        # LAS step structure: 1. decoder RNN  2. attention  3. concatenate and MLP



        B, key_seq_max_len, key_value_size = key.shape

        if mode == 'train':
            max_len =  y.shape[1]
            # Embeddings of all target characters, with embedding dropout: (B, text_len, embed_dim)
            char_embeddings = self.embedding_dropout(self.embedding,y.long()).to(device)
        else:
            # Fixed number of decoding steps; there is no early stop on '<eos>' in this function
            max_len = 600

        # Attention mask, created once outside the loop: True at padded encoder positions.
        mask = torch.arange(key_seq_max_len).unsqueeze(0) >= encoder_len.unsqueeze(1) # (1,T) >= (B,1)-> (B,T)
        mask = mask.to(device)
        
        predictions = []
        # First decoder input. Filled with 0, the index of '<sos>'; argmax over its
        # single column also yields 0, so the first embedded character is '<sos>'.
        prediction = torch.full((B,1), fill_value=0, device=device)  # (B,1)
        # One (h, c) state per LSTMCell; None lets LSTMCell start from zeros.
        hidden_states = [None, None] 
        
        
        # Initial context: the value vector of the first encoder step
        context = value[:, 0, :]    # (B, d_v)

        attention_plot = [] # this is for debugging

        # Probability of feeding the ground truth (teacher forcing) at each step
        rate = 0.9
        gumbel_noise = True

        for i in range(max_len):
            if mode == 'train':
                # Draw per step whether to use teacher forcing (1) or the model's own output (0)
                teacher_forcing_choice = np.random.choice([0,1],p=[1-rate,rate])
                if teacher_forcing_choice:
                    if i == 0:
                        # First time step: feed '<sos>' (index 0)
                        start_char = torch.zeros((B,), dtype=torch.long).fill_(0).to(device)
                        char_embed = self.embedding(start_char)
                    else:
                        # Otherwise, feed the label of the **previous** time step (ground truth)
                        char_embed = char_embeddings[:,i-1,:]
                else: # use the model's own previous output, with Gumbel noise after step 0
                    if i!=0 and (gumbel_noise):
                        # Soft sample over the vocabulary, mixed through the embedding matrix
                        char_embed = F.gumbel_softmax(prediction).mm(self.embedding.weight)
                    else:
                        char_embed = self.embedding(prediction.argmax(dim=-1)) # embedding of the previous prediction
            else: # no ground truth if not train
                 char_embed = self.embedding(prediction.argmax(dim=-1)) # embedding of the previous prediction

            # step 1. decoder RNN: s_i = RNN(s_i-1, y_i-1, c_i-1)
            # y_i-1 = char_embed; c_i-1 = context
            y_context = torch.cat([char_embed, context],dim=1) # (B, embed_dim + d_v)
            # First cell: previous-step state of lstm1 plus the new input
            hidden_states[0] = self.lstm1(y_context, hidden_states[0])

            # Dropout on the hidden output of the first cell before it enters the second
            out1 = self.drop(hidden_states[0][0])

            # Second cell: output of the first cell plus the previous-step state of lstm2
            hidden_states[1] = self.lstm2(out1, hidden_states[1])
            # The hidden state of the second cell is the attention query
            query = hidden_states[1][0]

            # step 2. attention: c_i = AttentionContext(s_i, h)
            context, attention = self.attention(query, key, value, mask) 
            # Keep the attention of the first batch item for debugging
            attention_plot.append(attention[0].detach().cpu())
            
            # step 3. concatenate s_i and c_i, and feed the MLP
            output_context = torch.cat([query, context], dim=1) 
            prediction = self.character_prob(output_context) # (B, vocab_size)
            # store predictions
            predictions.append(prediction.unsqueeze(1))
        
        
        # Stack the per-step results to return
        attentions = torch.stack(attention_plot, dim=0) # (num_steps, T)
        predictions = torch.cat(predictions, dim=1) # (B, num_steps, vocab_size)

        return predictions, attentions

In [None]:
class Seq2Seq(nn.Module):
    '''
    We train an end-to-end sequence to sequence model comprising of Encoder and Decoder.
    This is simply a wrapper "model" for your encoder and decoder.

    Args:
        input_dim: feature size of the input frames.
        vocab_size: number of output symbols.
        encoder_hidden_dim: hidden size per direction of the encoder LSTMs.
        decoder_hidden_dim: hidden size of the first decoder LSTMCell.
        embed_dim: character embedding size.
        key_value_size: size of the attention key/value vectors, shared by the
            encoder projections and the decoder.
    '''
    def __init__(self, input_dim, vocab_size, encoder_hidden_dim, decoder_hidden_dim, embed_dim, key_value_size=128):
        super(Seq2Seq,self).__init__()
        # key_value_size must reach both halves: the encoder projects to it and
        # the decoder sizes its query and input layers from it. It was previously
        # ignored, so any non-default value had no effect.
        self.encoder = Encoder(input_dim, encoder_hidden_dim, key_value_size=key_value_size)
        self.decoder = Decoder(vocab_size, decoder_hidden_dim, embed_dim, key_value_size=key_value_size)

    def forward(self, x, x_len, y=None, mode='train'):
        """
        Encode the utterances and decode character scores.

        Args:
            x: padded input batch.
            x_len: input lengths before padding.
            y: target label indices (needed when mode == 'train').
            mode: forwarded to the decoder.

        Returns:
            (predictions, attentions) exactly as returned by the decoder.
        """
        key, value, encoder_len = self.encoder(x, x_len)
        predictions, attentions = self.decoder(key, value, encoder_len, y=y, mode=mode)
        return predictions, attentions

# Model Info

In [None]:
# Build the sequence-to-sequence model and move it to the selected device.
model = Seq2Seq(
    input_dim=13,
    vocab_size=30,
    encoder_hidden_dim=256,
    decoder_hidden_dim=512,
    embed_dim=256,
).to(device)
print(model)

Seq2Seq(
  (encoder): Encoder(
    (lstm): LSTM(13, 256, batch_first=True, bidirectional=True)
    (pBLSTMs): Sequential(
      (0): pBLSTM(
        (blstm): LSTM(1024, 256, batch_first=True, bidirectional=True)
        (lockdrop): LockedDropout()
      )
      (1): pBLSTM(
        (blstm): LSTM(1024, 256, batch_first=True, bidirectional=True)
        (lockdrop): LockedDropout()
      )
      (2): pBLSTM(
        (blstm): LSTM(1024, 256, batch_first=True, bidirectional=True)
        (lockdrop): LockedDropout()
      )
    )
    (lockdrop): LockedDropout()
    (key_network): Linear(in_features=512, out_features=128, bias=True)
    (value_network): Linear(in_features=512, out_features=128, bias=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(30, 256)
    (embedding_dropout): EmbeddingDropout()
    (lstm1): LSTMCell(384, 512)
    (lstm2): LSTMCell(512, 128)
    (attention): Attention(
      (softmax): Softmax(dim=-1)
    )
    (character_prob): Sequential(
      (0): Linear(in_

# Functions

## Train

In [None]:
#  edit distance : number of character modifications needed to change the sequence to the gold sequence. 
def calc_edit_dist(preds, targets):
    """
    Average Levenshtein distance between paired predictions and targets.

    Args:
        preds: list of predicted strings.
        targets: list of reference strings, paired with `preds` by position.

    Returns:
        Sum of the pairwise distances divided by len(preds), as a float.
        An empty `preds` yields 0.0 instead of raising ZeroDivisionError.
    """
    if len(preds) == 0:
        return 0.0
    tot_dist = 0.0
    for pred, target in zip(preds, targets):
        tot_dist += lev.distance(pred, target)
    return tot_dist / len(preds)


In [None]:
def train(model, train_loader, criterion, optimizer, mode, epoch):
    """
    Run one training epoch and save a checkpoint.

    Args:
        model: sequence-to-sequence model called as model(x, x_len, y, mode='train')
            and returning (predictions, attentions).
        train_loader: iterable of (x, y, x_len, y_len) batches.
        criterion: per-element loss; it is multiplied by a padding mask, so it is
            assumed to be created with reduction='none' -- TODO confirm at the call site.
        optimizer: optimizer over the model parameters.
        mode: unused; kept for interface compatibility.
        epoch: zero-based epoch number, used for logging and the checkpoint name.

    Side effects:
        Updates the model in place, prints the mean loss, and writes a checkpoint
        to the hard-coded Drive path below.
    """
    model.train()

    batch_bar = tqdm(total=len(train_loader), dynamic_ncols=True, leave=False, position=0, desc='Train❤') 

    # Accumulated as a Python float so no autograd graph is kept alive across batches.
    running_loss = 0.0
    torch.cuda.empty_cache()
    
    # 0) Iterate through the data loader
    for batch_num, (x,y,x_len,y_len) in enumerate(train_loader):
        # 1) Send the inputs to the device
        x, x_len, y, y_len = x.to(device), x_len.to(device), y.to(device), y_len.to(device)
        # 2) Pass the inputs and their lengths into the model.
        predictions, attentions = model(x, x_len, y, mode='train') # (B, Tmax, vocab_size)

        # 3) Mask of valid (non-padded) target positions: 1.0 where t < y_len[b].
        steps = torch.arange(y.size(1), device=y.device).unsqueeze(0) # (1, Tmax)
        mask = (steps < y_len.unsqueeze(1)).float().reshape(-1)        # (B * Tmax,)
        
        # 4) Flatten predictions/targets so the criterion sees one row per character
        loss = criterion(predictions.reshape(-1, predictions.size(2)), y.reshape(-1))
        # Average the loss over valid positions only
        masked_loss = torch.sum(loss*mask)
        current_loss = masked_loss / torch.sum(mask)

        # 5) backprop
        optimizer.zero_grad()
        current_loss.backward()
        # Clip BEFORE the optimizer step; clipping afterwards has no effect on the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 2)
        optimizer.step()

        running_loss += current_loss.item()

        # Show the running mean loss and the learning rate on the progress bar
        batch_bar.set_postfix(
            loss="{:.04f}".format(float(running_loss / (batch_num + 1))),
            lr="{:.08f}".format(float(optimizer.param_groups[0]['lr'])))
        batch_bar.update() # Update tqdm bar
    batch_bar.close() 

    # Mean loss per batch; max() guards against an empty loader.
    running_loss = running_loss / max(len(train_loader), 1)

    print("Epoch {}: Train Loss {:.08f}, Learning Rate {:.08f}".format(
        epoch + 1,
        float(running_loss),
        float(optimizer.param_groups[0]['lr'])))

    # Optional: plot your attention for debugging
    # plot_attention(attentions)

    # Checkpoint after every epoch (file name carries the 1-based epoch number)
    path = "/content/drive/MyDrive/models/hw4p2/"
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': running_loss,
        }, path + "hw4p2_model_v3_424_" + str(epoch+1) + ".tar" )
    

## Validation

In [None]:
def greedy_decode(probs):
    """
    Greedy decoding: pick the highest-scoring symbol at every step.

    Args:
        probs: per-utterance score arrays, indexed as (B, T, vocab_size).

    Returns:
        List of B strings. Each string is built from the argmax symbol of every
        step, looked up in the module-level index2letter table, and ends just
        before the first '<eos>' (or after the last step if none is produced).
    """
    decoded = []
    for utterance in probs:
        chars = []
        for step in utterance:  # one score vector per time step
            letter = index2letter[step.argmax(0)]
            if letter == '<eos>':
                break
            chars.append(letter)
        decoded.append("".join(chars))
    return decoded

In [None]:
def val(model, valid_loader):
    """Run the model over `valid_loader` and report the edit distance.

    Each batch is decoded greedily, the targets are truncated using `y_len`,
    and `calc_edit_dist` is evaluated on the (prediction, target) lists.

    Args:
        model: network called as `model(x, x_len, y, mode='val')`; it must
            return a `(predictions, attentions)` pair.
        valid_loader: iterable yielding `(x, y, x_len, y_len)` batches.

    Returns:
        Mean of the per-batch `calc_edit_dist` values (0.0 when the loader
        yields no batches).
    """
    model.eval()
    total_dist = 0.0
    # Defined up front so an empty loader reports 0.0 instead of raising
    # UnboundLocalError at the final print/return.
    running_dist = 0.0

    batch_bar = tqdm(total=len(valid_loader), dynamic_ncols=True, leave=False, position=0, desc='Val★')

    try:
        for batch_num, (x, y, x_len, y_len) in enumerate(valid_loader):
            # 1) Send the model inputs to the device. y_len is only used for
            #    host-side slicing below, so it is not transferred.
            x, x_len, y = x.to(device), x_len.to(device), y.to(device)
            with torch.no_grad():
                # 2) Pass the inputs and the speech lengths into the model.
                predictions, _ = model(x, x_len, y, mode='val')  # (B, Tmax, vocab_size)

            # TODO: use beam search to decode; greedy decoding is used for now.
            pred_text = greedy_decode(predictions.detach().cpu().numpy())
            target_text = transform_index_to_letter(y.detach().cpu().numpy())

            # Truncate each target to its true length. Per the original note,
            # y_len counts the trailing <eos>, hence the "- 1". Predictions
            # are left as decoded (greedy_decode already stops at <eos>).
            target_lens = y_len.tolist()
            for i, target_len in enumerate(target_lens):
                target_text[i] = target_text[i][:target_len - 1]

            # The original code marks this value as already averaged — TODO confirm.
            curr_dist = calc_edit_dist(pred_text, target_text)
            total_dist += curr_dist
            running_dist = total_dist / (batch_num + 1)

            batch_bar.set_postfix(
                curr_dist="{:.04f}".format(curr_dist),
                running_dist="{:.04f}".format(running_dist))
            batch_bar.update()
    finally:
        # Always release the progress bar, even if a batch raises.
        batch_bar.close()

    print("Val: Distance {:.04f}".format(running_dist))
    return running_dist

    


## Test

In [None]:
def test(model, test_loader):
    model.eval()
    
    pred_list = []
    for batch_num, (x,x_len) in enumerate(test_loader):
        # 1) Send the inputs to the device
        x, x_len = x.to(device), x_len.to(device)
        with torch.no_grad():
            # 2) Pass your inputs, and length of speech into the model.
            predictions, _ = model(x, x_len, mode='test') # (B, Tmax, vocab_size)

        # TODO: use beam search to decode
        # decoder1
        # pred_text = transform_index_to_letter(predictions.argmax(-1).detach().cpu().numpy()) #(B,Tmax,1)
        # decoder2
        pred_text = greedy_decode(predictions.detach().cpu().numpy())
        # decoder3
        # pred_text = decoder.decode(predictions.detach().cpu().numpy())
        # decoder4: ctcbeam
        # pred_text,_,_,_ = decoder.decode(predictions)  #out = (B,k,T)
        # pred_text,_,_,_ = decoder_test.decode(predictions)  #out = (B,k,T)
        # pred_text = pred_text[:,0,:]
        # pred_text = transform_index_to_letter(pred_text.detach().cpu().numpy())

        for pred_text_one in pred_text:#one single transcript
            pred_list.append(pred_text_one)
    return pred_list


# Training

In [None]:
# Training configuration. `model` must already be defined and placed on the
# device before this cell runs.
n_epochs = 150

# Adam with an initial learning rate of 7.5e-4.
optimizer = optim.Adam(model.parameters(), lr=7.5e-4)

# Multiply the learning rate by 0.75 once the monitored metric has stopped
# improving (relative threshold 1e-2) for more than 4 consecutive steps.
scheduler = lr_scheduler.ReduceLROnPlateau(
    optimizer,
    factor=0.75,
    patience=4,
    threshold=1e-2,
    verbose=True,
)

# reduction='none' keeps one loss value per element, so the training loop can
# mask out padded positions before averaging.
criterion = nn.CrossEntropyLoss(reduction='none')
mode = 'train'

In [None]:
print('just test')
# Load the checkpoint inside this cell so `checkpoint` exists on a fresh
# kernel (previously this cell relied on a later cell having been run first).
# map_location keeps the load working when the runtime has no GPU.
path = "/content/drive/MyDrive/models/hw4p2/hw4p2_model_v3_424_268.tar"
checkpoint = torch.load(path, map_location=device)
# NOTE(review): the weights are intentionally still not restored here, so the
# distance below is for whatever state `model` currently holds, not
# necessarily this checkpoint. Uncomment to evaluate the checkpoint itself.
#model.load_state_dict(checkpoint['model_state_dict'])
epoch0 = checkpoint['epoch']
print(epoch0)
#train(model, train_loader, criterion, optimizer, mode, epoch0+1)
dist = val(model, val_loader)
print("dist:")
print(dist)


In [None]:
print('just test')
path = "/content/drive/MyDrive/models/hw4p2/hw4p2_model_v3_424_275.tar"
# map_location lets a checkpoint saved on GPU be read on a CPU-only / DEBUG
# runtime instead of failing while deserializing CUDA tensors.
checkpoint = torch.load(path, map_location=device)
# NOTE(review): the weights are not restored, so the distance printed below
# belongs to the in-memory `model`, not necessarily to this checkpoint.
#model.load_state_dict(checkpoint['model_state_dict'])
epoch0 = checkpoint['epoch']  # 0-based epoch index stored by the training loop
print(epoch0)
#train(model, train_loader, criterion, optimizer, mode, epoch0+1)
dist = val(model, val_loader)
print("dist:")
print(dist)


just test
274


                                                                                            

Val: Distance 6.8380
dist:
6.838044507575758




In [None]:
# for what in checkpoint['model_state_dict']:
#     print(what)

In [None]:
# Resume training from a saved checkpoint when CON_TRAIN is set.
if CON_TRAIN:
    path = '/content/drive/MyDrive/models/hw4p2/hw4p2_model_v3_424_268.tar'
    print(path)
    # map_location lets a checkpoint saved on GPU be restored on a CPU-only /
    # DEBUG runtime as well.
    checkpoint = torch.load(path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    # A fresh optimizer is created instead of restoring the saved state (the
    # author notes the optimizer could not be resumed properly). The learning
    # rate equals 0.00075 * 0.75**6, i.e. the initial rate after six decays
    # by the scheduler factor.
    optimizer = torch.optim.Adam(model.parameters(), lr=0.00013348388671875002)
    #optimizer.load_state_dict(checkpoint['optimizer_state_dict']) # maybe not use this # cannot resume optimizer properly
    epoch0 = checkpoint['epoch']  # 0-based epoch index stored by the training loop
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.75, patience=5, verbose=True, threshold=1e-4)
    criterion = nn.CrossEntropyLoss(reduction='none')

    print('Epoch end at:', epoch0)

/content/drive/MyDrive/models/hw4p2/hw4p2_model_v3_424_268.tar
Epoch end at: 267


# Predicting

In [None]:
#df = pd.read_csv("hw4p2_416_submission_v2.csv")
# First 5 rows
#df.head()
# print(len(df)) # = 2620

Unnamed: 0,id,predictions
0,0,HE BEGAN A CONFUSED CONTLAIN TO GENS TO THE WI...
1,1,KIH HAVE NOT SO EARNEST A MINE TO THESE MURMER...
2,2,A GOLDEN FORTUNE AND A HAPPY LIFE
3,3,HE WAS LIKE UP TO MY FATHER AND AWAY THAN YET ...
4,4,OSSO THERE WAS ASTRIPLING PAGE WHO TURNED ATTO...


In [None]:
# Decode the whole test set with the current `model` weights.
# test() also switches the model to eval mode itself, so this call is redundant but harmless.
model.eval()
pred_list = test(model, test_loader)



## Write to CSV

In [None]:
# Sanity check: show the first ten decoded transcripts before writing the submission.
pred_list[:10]

['HE BEGAN A CONFUSED COMPLAINT AGAINST THE WISHER WHO ADVANTAGE BEHIND THE CURTAIN ON THE LEFT',
 'KIY HAVE NOT SO EARNEST A MIND TO THESE MUMMERIES CHILD',
 'A GOLDEN FORTUNE AND A HAPPY LIFE',
 'HE WAS LIKE UP TO MY FATHER IN A WAY AND YET WAS NOT MY FATHER',
 'ALSO THERE WAS A STRIPLING PAGE WHO TURNED IT TO A MAID',
 'THIS WAS SO SWEETLED EIGHTY SIR AND IN SOME MANNER I DO THINK SHE DIED',
 'BUT THEN THE PICTURE WAS GONE AS QUICKLY AS IT CAME',
 'SISTER NOW DO YOU HEAR THESE MARVELS',
 'TAKE YOUR PLACE AND LET US SEE WITH THE CRIST BOTH AND SHOW TO YOU',
 'LIKE AS NOT YOUNG MASTER THOUGH I AM AN OLD MAN']

In [None]:
# Write the Kaggle submission file: a header row followed by one
# "<row index>,<transcript>" line per prediction.
# The loop is driven by pred_list itself (not len(test_data)), so the number
# of rows always matches the predictions actually produced and the cell does
# not depend on another variable having the same length.
with open("hw4p2_427_submission_v4.csv", "w") as f:
    f.write("id,predictions\n")
    for i, prediction in enumerate(pred_list):
        f.write("{},{}\n".format(i, prediction))

In [None]:
# if SUBMIT_TO_KAGGLE:
if 1:
    !kaggle competitions submit -c 11-785-s22-hw4p2 -f hw4p2_427_submission_v4.csv -m "v4 atp1"

100% 290k/290k [00:02<00:00, 121kB/s]
Successfully submitted to Attention-Based Speech Recognition