In [None]:
import pandas as pd
import numpy as np 
import torch
import torch.nn as nn
import torch.tensor as tensor
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import *
import torch.optim as optim
import time
import Levenshtein as ls

# Data Loader

In [None]:
'''
Load all the numpy files containing the utterance information and the text information.
'''
def load_data():
    '''
    Read the speech features and transcripts from the .npy files in the working directory.

    :return: tuple (speech_train, speech_valid, speech_test, transcript_train, transcript_valid)
    '''
    def _read(path):
        # allow_pickle=True lets numpy unpickle stored objects: only use on trusted files.
        return np.load(path, allow_pickle=True, encoding='bytes')

    speech_train, speech_valid, speech_test = [
        _read(path) for path in ('train.npy', 'dev.npy', 'test.npy')
    ]
    transcript_train, transcript_valid = [
        _read(path) for path in ('./train_transcripts.npy', './dev_transcripts.npy')
    ]
    return speech_train, speech_valid, speech_test, transcript_train, transcript_valid


'''
Transforms alphabetical input to numerical input, replace each letter by its corresponding 
index from letter_list
'''
def transform_letter_to_index(transcript, letter_list, dic):
    '''
    Convert every transcript into a list of character indices terminated by '<eos>'.

    Words are joined with the space index; the trailing separator is replaced by
    the '<eos>' index. No '<sos>' index is prepended.

    :param transcript :(N, ) Transcripts; each one is an iterable of words (bytes or str)
    :param letter_list: Letter list defined above (unused, kept for interface compatibility)
    :param dic: Mapping from letter to index
    :return letter_to_index_list: Returns a list for all the transcript sentence to index
    '''
    SPACE_IDX = 32  # idx for space
    EOS_IDX = 34    # idx for 'eos'

    l_to_l_idx = []
    for sentence in transcript:
        indices = []
        for word in sentence:
            # Transcripts loaded with encoding='bytes' hold bytes; accept plain str as well.
            if isinstance(word, bytes):
                word = word.decode('UTF-8')
            indices.extend(dic[letter] for letter in word)
            indices.append(SPACE_IDX)
        if indices:
            # Replace the separator that follows the last word by the end marker.
            indices[-1] = EOS_IDX
        else:
            # An empty sentence still needs an end marker (previously an IndexError).
            indices.append(EOS_IDX)
        l_to_l_idx.append(indices)
    return l_to_l_idx

'''
Optional, create dictionaries for letter2index and index2letter transformations
'''
def create_dictionaries(letter_list):
    '''
    Build the two lookup tables between letters and their positions in letter_list.

    :param letter_list: Ordered iterable of letters
    :return: (letter2index, index2letter)
    '''
    index2letter = dict(enumerate(letter_list))
    letter2index = {letter: position for position, letter in index2letter.items()}
    return letter2index, index2letter


class Speech2TextDataset(Dataset):
    '''
    Dataset class for the speech to text data.

    In training mode every item is (speech, text, speech_len, text_len); otherwise
    it is (speech, speech_len). Speech is converted to a float32 tensor, text to a
    long tensor and both lengths to 0-dim long tensors.
    '''
    def __init__(self, speech, text=None, isTrain=True):
        '''
        :param speech: Indexable collection of per-utterance numpy arrays
        :param text: Indexable collection of index lists, or None when there are no labels
        :param isTrain: Whether __getitem__ should also return the transcript
        '''
        self.speech = speech
        self.isTrain = isTrain
        # Always define the attribute so a missing transcript is detectable.
        self.text = text

    def __len__(self):
        # len() works for numpy arrays and for plain lists alike.
        return len(self.speech)

    def __getitem__(self, index):
        features = torch.tensor(self.speech[index].astype(np.float32))
        x_len = torch.tensor(features.shape[0]).long()
        if not self.isTrain:
            return features, x_len
        if self.text is None:
            raise ValueError("Speech2TextDataset was created with isTrain=True but without text")
        transcript = self.text[index]
        y_len = torch.tensor(len(transcript)).long()
        return features, torch.tensor(transcript).long(), x_len, y_len


def collate_train(batch_data):
    '''
    Collate (speech, text, speech_len, text_len) items into one padded batch.

    :return: speech padded time-first (T, N, D), text padded batch-first (N, L),
             and the two length vectors as LongTensors
    '''
    speech, text, speech_lens, text_lens = zip(*batch_data)
    return (
        pad_sequence(speech),
        pad_sequence(text, batch_first=True),
        torch.LongTensor(speech_lens),
        torch.LongTensor(text_lens),
    )


def collate_test(batch_data):
    '''
    Collate (speech, speech_len) items into one padded batch.

    :return: speech padded time-first (T, N, D) and the lengths as a LongTensor
    '''
    speech, speech_lens = zip(*batch_data)
    lengths = torch.LongTensor(speech_lens)
    return pad_sequence(speech), lengths


# Model

In [None]:
import torch.nn.utils as utils
from torchnlp.nn import LockedDropout

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

class Attention(nn.Module):
    '''
    Dot-product attention between one decoder query and the encoder keys/values.
    The operations performed are:
        energy = bmm(key, query)
        attention = softmax(energy)   (padded time steps are masked out first)
        context = bmm(attention, value)
    '''
    def __init__(self):
        super(Attention, self).__init__()

    def forward(self, query, key, value, lens):
        '''
        :param query :(batch_size, hidden_size) Query is the output of LSTMCell from Decoder
        :param key: (batch_size, max_len, key_size) Key Projection from Encoder
        :param value: (batch_size, max_len, value_size) Value Projection from Encoder
        :param lens: (batch_size, ) number of valid time steps per utterance
        :return context: (batch_size, value_size) Attended Context
        :return attention: (batch_size, max_len) Attention weights that can be plotted
        '''
        energy = torch.bmm(key, query.unsqueeze(2)).squeeze(2)  # Shape (batch_size, max_len)
        # Build the padding mask on the device the tensors live on rather than on a
        # module-level DEVICE, so the layer also works when the model is not on DEVICE.
        positions = torch.arange(key.shape[1], device=energy.device).unsqueeze(0)
        mask = positions >= lens.to(energy.device).unsqueeze(1)  # Shape (batch_size, max_len)
        # A large negative value (not -inf) keeps the softmax finite even if a row is fully masked.
        energy = energy.masked_fill(mask, -1e9)
        attention = F.softmax(energy, dim=1)  # Shape (batch_size, max_len)
        context = torch.bmm(attention.unsqueeze(1), value).squeeze(1)  # Shape (batch_size, value_size)
        return context, attention


class pBLSTM(nn.Module):
    '''
    Pyramidal BiLSTM
    The length of utterance (speech input) can be hundreds to thousands of frames long.
    The Paper reports that a direct LSTM implementation as Encoder resulted in slow convergence,
    and inferior results even after extensive training.
    The major reason is inability of AttendAndSpell operation to extract relevant information
    from a large number of input steps.

    Each forward call concatenates every pair of consecutive frames, which halves the
    time resolution and doubles the feature size, before running a bidirectional LSTM.
    '''
    def __init__(self, input_dim, hidden_dim, dropout=0.0):
        '''
        :param input_dim: Feature size AFTER pairs of frames have been concatenated
        :param hidden_dim: Hidden size of each LSTM direction
        :param dropout: Forwarded to nn.LSTM
        '''
        super(pBLSTM, self).__init__()
        self.blstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, num_layers=1, dropout=dropout, bidirectional=True)

    def forward(self, x):
        '''
        :param x: PackedSequence whose padded form is (T, N, D)
        :return output: PackedSequence whose padded form is (T // 2, N, 2 * hidden_dim)
        :return hidden: (h_n, c_n) of the LSTM
        '''
        padded, lengths = pad_packed_sequence(x, batch_first=True)  # (N, T, D)
        batch_size, timestep, feature_dim = padded.shape
        if timestep < 2:
            raise ValueError("pBLSTM needs at least two time steps to merge a pair of frames")
        if timestep % 2 == 1:
            # Drop the last frame so the frames can be paired up.
            padded = padded[:, :-1, :]
        padded = padded.reshape(batch_size, timestep // 2, feature_dim * 2)
        # A length-1 sequence would halve to 0, which pack_padded_sequence rejects;
        # keep one (partly padded) merged frame for it instead.
        halved_lens = torch.clamp(lengths // 2, min=1)
        packed = pack_padded_sequence(padded, halved_lens, batch_first=True, enforce_sorted=False)
        return self.blstm(packed)




class Encoder(nn.Module):
    '''
    Encoder takes the utterances as inputs and returns the key and value.
    Key and value are simple linear projections of the output of the pBLSTM stack.

    Pipeline: BiLSTM -> 3 x (BatchNorm + LockedDropout -> pBLSTM) -> key / value projections.
    The three pBLSTM layers reduce the time resolution by a factor of 8.
    '''
    def __init__(self, input_dim, hidden_dim, value_size=128,key_size=128):
        '''
        :param input_dim: Feature size of one speech frame
        :param hidden_dim: Hidden size of each LSTM direction
        :param value_size: Output size of the value projection
        :param key_size: Output size of the key projection
        '''
        super(Encoder, self).__init__()
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, num_layers=1, bidirectional=True)
        self.bn1 = nn.BatchNorm1d(2*hidden_dim)
        self.lockedDO1 = LockedDropout(0.4)
        self.lockedDO2 = LockedDropout(0.5)
        self.lockedDO3 = LockedDropout(0.5)
        # Every pBLSTM sees two concatenated bidirectional frames: 2 * (2 * hidden_dim).
        self.pblstm1 = pBLSTM(input_dim=hidden_dim*4, hidden_dim=hidden_dim)
        self.bn2 = nn.BatchNorm1d(2*hidden_dim)
        self.pblstm2 = pBLSTM(input_dim=hidden_dim*4, hidden_dim=hidden_dim)
        self.bn3 = nn.BatchNorm1d(2*hidden_dim)
        self.pblstm3 = pBLSTM(input_dim=hidden_dim*4, hidden_dim=hidden_dim)

        # The output sizes used to be swapped, which only worked while key_size == value_size.
        self.key_network = nn.Linear(hidden_dim*2, key_size)
        self.value_network = nn.Linear(hidden_dim*2, value_size)

    def _regularize(self, packed, batch_norm, locked_dropout):
        '''Apply BatchNorm1d over the channels, then LockedDropout, to a PackedSequence.'''
        padded, lengths = pad_packed_sequence(packed, batch_first=False)  # (T, N, C)
        # BatchNorm1d expects (N, C, T).
        padded = batch_norm(padded.permute(1, 2, 0))
        # LockedDropout expects (T, N, C) and shares one mask across dim 0, so it must
        # be applied time-first for the mask to be locked over time and not over the batch.
        padded = locked_dropout(padded.permute(2, 0, 1))
        return pack_padded_sequence(padded, lengths, batch_first=False, enforce_sorted=False)

    def forward(self, x, lens):
        '''
        :param x: (T, N, input_dim) padded speech features, time-first
        :param lens: (N, ) number of valid frames per utterance
        :return keys: (N, T', key_size)
        :return value: (N, T', value_size)
        :return encoder_lens: (N, ) valid lengths after the pyramidal reduction
        '''
        rnn_inp = pack_padded_sequence(x, lengths=lens, batch_first=False, enforce_sorted=False)
        outputs, _ = self.lstm(rnn_inp)

        stages = (
            (self.bn1, self.lockedDO1, self.pblstm1),
            (self.bn2, self.lockedDO2, self.pblstm2),
            (self.bn3, self.lockedDO3, self.pblstm3),
        )
        for batch_norm, locked_dropout, pblstm in stages:
            outputs = self._regularize(outputs, batch_norm, locked_dropout)
            outputs, _ = pblstm(outputs)

        linear_input, encoder_lens = pad_packed_sequence(outputs, batch_first=True)
        keys = self.key_network(linear_input)
        value = self.value_network(linear_input)

        return keys, value, encoder_lens


class Decoder(nn.Module):
    '''
    As mentioned in a previous recitation, each forward call of decoder deals with just one time step,
    thus we use LSTMCell instead of LSTM here.
    The output from the second LSTMCell is used as the query for the attention module.
    The attended context replaces the raw encoder value as the second input of the first LSTMCell.
    Teacher forcing is applied during training; its rate is scheduled from the module-level
    `epoch_counter` when that name exists.
    '''
    SOS_INDEX = 33         # idx for 'sos', used as the first decoder input
    MAX_DECODE_LEN = 600   # number of decoding steps when no transcript is given

    def __init__(self, vocab_size, hidden_dim, value_size=128, key_size=128, isAttended=False):
        '''
        :param vocab_size: Number of output characters
        :param hidden_dim: Embedding size; the first LSTMCell has 2 * hidden_dim units
        :param value_size: Size of the encoder value / context vectors
        :param key_size: Size of the encoder key vectors and of the second LSTMCell
        :param isAttended: Whether to use the attention module
        '''
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_dim, padding_idx=0)
        self.lstm1 = nn.LSTMCell(input_size=hidden_dim + value_size, hidden_size=hidden_dim*2)
        self.lstm2 = nn.LSTMCell(input_size=hidden_dim*2, hidden_size=key_size)

        self.isAttended = isAttended
        if isAttended:
            self.attention = Attention()
        self.fc = nn.Linear(key_size+value_size, key_size+value_size)
        self.character_prob = nn.Linear(key_size + value_size, vocab_size)
        self.teacher_force_rate = 0.95
        self.vocab_size = vocab_size
        # Weight tying is only valid when both matrices have the same shape.
        if hidden_dim == key_size + value_size:
            self.character_prob.weight = self.embedding.weight

    def forward(self, key, values, lens, text=None, isTrain=True):
        '''
        :param key :(N, T, key_size) Output of the Encoder Key projection layer
        :param values: (N, T, value_size) Output of the Encoder Value projection layer
        :param lens: (N, ) valid encoder lengths, used to mask the attention
        :param text: (N, text_len) Batch input of text with text_length
        :param isTrain: Train or eval mode
        :return predictions: (N, max_len, vocab_size) unnormalised character scores; when
                             isAttended is set, a tuple (predictions, attention weights of
                             shape (N, max_len, T)) is returned instead
        '''
        batch_size = key.shape[0]
        device = key.device
        if isTrain:
            max_len = text.shape[1]
            embeddings = self.embedding(text)
        else:
            max_len = self.MAX_DECODE_LEN
            embeddings = None

        # The training cell owns `epoch_counter`; default to 0 so that the decoder can
        # also be run (e.g. for validation or testing) before that cell was executed.
        epochs_done = globals().get('epoch_counter', 0)
        if epochs_done % 3 == 0:
            self.teacher_force_rate = max(0.95 - 0.01*epochs_done/3, 0.7)

        last_frame = values.shape[1] - 1
        predictions = []
        attention_masks = []
        hidden_states = [None, None]
        # One-hot '<sos>' so that the first argmax yields the start symbol.
        prediction = torch.zeros((batch_size, self.vocab_size), device=device)
        prediction[:, self.SOS_INDEX] = 1
        context = None
        for i in range(max_len):
            teacher_force = np.random.random_sample() < self.teacher_force_rate
            if i > 0 and isTrain and teacher_force:
                # Feed the ground-truth character of the previous step.
                char_embed = embeddings[:, i-1, :]
            else:
                char_embed = self.embedding(prediction.argmax(dim=-1))

            if self.isAttended and i > 0:
                step_context = context
            else:
                # Without attention (or at the first step) fall back to the encoder value
                # at this position, clamped so that long outputs cannot index past T.
                step_context = values[:, min(i, last_frame), :]
            hidden_states[0] = self.lstm1(torch.cat([char_embed, step_context], dim=1), hidden_states[0])
            hidden_states[1] = self.lstm2(hidden_states[0][0], hidden_states[1])

            ### Compute attention from the output of the second LSTM Cell ###
            output = hidden_states[1][0]
            if self.isAttended:
                context, attention_mask = self.attention(output, key, values, lens)
                attention_masks.append(attention_mask.detach())
            else:
                context = values[:, min(i, last_frame), :]
            prediction = self.character_prob(self.fc(torch.cat([output, context], dim=1)))
            # prediction shape after unsqueeze is (batch_size, 1, vocab_size)
            predictions.append(prediction.unsqueeze(1))

        # return shape is (batch_size, max_len, vocab_size)
        if self.isAttended:
            return torch.cat(predictions, dim=1), torch.stack(attention_masks, dim=1)
        return torch.cat(predictions, dim=1)


class Seq2Seq(nn.Module):
    '''
    We train an end-to-end sequence to sequence model comprising of Encoder and Decoder.
    This is simply a wrapper "model" for your encoder and decoder.
    '''
    def __init__(self, input_dim, vocab_size, hidden_dim, value_size=128, key_size=128, isAttended=True):
        '''
        :param input_dim: Feature size of one speech frame
        :param vocab_size: Number of output characters
        :param hidden_dim: Hidden size shared by encoder and decoder
        :param value_size: Size of the value projection
        :param key_size: Size of the key projection
        :param isAttended: Whether the decoder uses attention
        '''
        super(Seq2Seq, self).__init__()
        # value_size / key_size used to be accepted but silently ignored.
        self.encoder = Encoder(input_dim, hidden_dim, value_size=value_size, key_size=key_size)
        self.decoder = Decoder(vocab_size, hidden_dim, value_size=value_size, key_size=key_size, isAttended=isAttended)
        self.isAttended = isAttended

    def forward(self, speech_input, speech_len, text_input=None, isTrain=True):
        '''
        :param speech_input: (T, N, input_dim) padded speech features
        :param speech_len: (N, ) number of valid frames per utterance
        :param text_input: (N, text_len) transcripts, required when isTrain is True
        :param isTrain: Train or eval mode
        :return: whatever the decoder returns: (predictions, attention) when attended,
                 predictions only otherwise
        '''
        key, value, encoder_lens = self.encoder(speech_input, speech_len)
        if isTrain:
            return self.decoder(key, value, encoder_lens, text_input)
        return self.decoder(key, value, encoder_lens, text=None, isTrain=False)


In [None]:
# Scratch check 1: an Embedding with padding_idx=0 maps index 0 to an all-zero vector.
embedding_demo = nn.Embedding(10, 3, padding_idx=0)
token_ids = torch.LongTensor([[1, 2, 0, 5]])  # not named `input`, which would shadow the builtin
print(embedding_demo(token_ids))

# Scratch check 2: building a padding mask by broadcasting positions against lengths.
# Note that `>` leaves position == length unmasked; Attention.forward uses `>=`.
labels = torch.randn(3,5)
labels_lens = torch.ones(3)*2
a = torch.arange(labels.shape[1]).unsqueeze(0) > labels_lens.unsqueeze(1)
print(a)

tensor([[[ 0.3443, -0.2239, -1.0013],
         [-1.1447,  0.5026,  0.0103],
         [ 0.0000,  0.0000,  0.0000],
         [-1.4789,  0.9979,  0.2122]]], grad_fn=<EmbeddingBackward>)
tensor([[False, False, False,  True,  True],
        [False, False, False,  True,  True],
        [False, False, False,  True,  True]])


# Train & Test

In [None]:
def output2string(out):
    '''
    Decode a 1-D sequence of index tensors into text using the module-level idx2letter.

    Decoding stops right after the first '<pad>' (0) or '<eos>' (34); the letter of
    that terminating index is still part of the returned string.
    '''
    pieces = []
    for position in range(len(out)):
        idx = out[position].item()
        pieces.append(idx2letter[idx])
        if idx in (0, 34):
            break
    return "".join(pieces)

In [None]:
### Add Your Other Necessary Imports Here! ###

def train(model, train_loader, criterion, optimizer, epoch):
    '''
    Run one training epoch and report the running loss every 50 batches.

    :param model: Model returning (predictions, attention) for a batch
    :param train_loader: Loader yielding (features, labels, feature_lens, labels_lens)
    :param criterion: Loss taking (N, vocab_size, L) scores and (N, L) targets
    :param optimizer: Optimizer stepping the model parameters
    :param epoch: Zero-based epoch index, only used for logging
    :return: attention weights of the last batch as a numpy array
    '''
    model.train()
    start = time.time()
    running_loss = 0.0
    attention_mask = None
    for batch_idx, (features, labels, feature_lens, labels_lens) in enumerate(train_loader):
        features = features.to(DEVICE)
        labels = labels.to(DEVICE)
        optimizer.zero_grad()
        # outputs is shaped (batch_size, max_text_len, vocab_size)
        outputs, attention_mask = model(features, feature_lens, text_input=labels)
        # The criterion wants the class dimension second.
        loss = criterion(outputs.permute(0,2,1), labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 2)
        optimizer.step()
        running_loss += loss.item()
        if (batch_idx + 1) % 50 == 0:
            print(f"Epoch: {epoch+1}, Batch: {batch_idx+1}, Running train loss is {running_loss/50}")
            running_loss = 0.0
            greedy_output = outputs.argmax(dim=-1)
            print(f"sample output is {output2string(greedy_output[0, :])}")
            print(f"sample label is {output2string(labels[0, :])}")
            print(f"This epoch takes up to now {time.time()-start} seconds")
    elapsed = time.time() - start
    print(f"The time taken for epoch {epoch+1} is {elapsed}")
    torch.cuda.empty_cache()
    return attention_mask.cpu().numpy()

def val(model, val_loader, criterion, epoch):
    '''
    Greedy-decode the validation set and return the mean Levenshtein distance.

    :param model: Model returning (predictions, attention) for a batch
    :param val_loader: Loader yielding (features, labels, feature_lens, labels_lens)
    :param criterion: Unused, kept for interface compatibility
    :param epoch: Unused, kept for interface compatibility
    :return: a single float, total edit distance divided by the number of utterances
    '''
    model.eval()
    dist = 0
    total = 0
    # Decoding runs for many steps; without no_grad every step would keep its
    # autograd graph alive and waste memory.
    with torch.no_grad():
        for i, data in enumerate(val_loader):
            features, labels, feature_lens, labels_lens = data
            features, labels = features.to(DEVICE), labels.to(DEVICE)
            outputs, _ = model(features, feature_lens, text_input=None, isTrain=False)
            greedy_output = outputs.argmax(dim=-1)
            for j in range(len(outputs)):
                string1 = output2string(greedy_output[j, :])
                string2 = output2string(labels[j, :])
                if j == 0:
                    print(f"Output string is {string1}")
                    print(f"Label string is {string2}")
                dist += ls.distance(string1, string2)
            total += len(outputs)
    torch.cuda.empty_cache()
    return dist/total

def test(model, test_loader, epoch):
    '''
    Greedy-decode every utterance of the test set.

    :param model: Model returning (predictions, attention) for a batch
    :param test_loader: Loader yielding (features, feature_lens)
    :param epoch: Unused, kept for interface compatibility
    :return: list with one decoded string per utterance, in loader order
    '''
    model.eval()
    result = []
    with torch.no_grad():
        for features, feature_lens in test_loader:
            outputs, _ = model(features.to(DEVICE), feature_lens, text_input=None, isTrain=False)
            greedy_output = outputs.argmax(dim=-1)
            result.extend(output2string(greedy_output[row, :]) for row in range(len(outputs)))
    return result
        
        

# Plot

In [None]:
from matplotlib.lines import Line2D
import matplotlib.pyplot as plt
import numpy as np
def plot_attn_flow(attn_mask, path):
    '''
    Write the 2-D attention matrix to `path` as an image using the 'hot' colormap.

    :return: the pyplot module
    '''
    plt.imsave(fname=path, arr=attn_mask, cmap='hot')
    return plt

def plot_grad_flow(named_parameters, path):
    '''
    Plot the mean and max absolute gradient of every non-bias trainable parameter.

    :param named_parameters: Iterable of (name, parameter), e.g. model.named_parameters()
    :param path: File the figure is written to
    :return: (pyplot module, list of max-gradient tensors, one per plotted parameter)
    '''
    ave_grads = []
    max_grads = []
    layers = []
    for n, p in named_parameters:
        # Parameters that took no part in the backward pass have grad None: skip them.
        if p.requires_grad and ("bias" not in n) and p.grad is not None:
            grad_abs = p.grad.detach().abs()
            layers.append(n)
            ave_grads.append(grad_abs.mean())
            max_grads.append(grad_abs.max())
    # matplotlib cannot consume CUDA tensors, so plot plain Python floats.
    ave_values = [g.item() for g in ave_grads]
    max_values = [g.item() for g in max_grads]
    plt.bar(np.arange(len(max_values)), max_values, alpha=0.1, lw=1, color="c")
    plt.bar(np.arange(len(max_values)), ave_values, alpha=0.1, lw=1, color="b")
    plt.hlines(0, 0, len(ave_values)+1, lw=2, color="k" )
    plt.xticks(range(0,len(ave_values), 1), layers, rotation="vertical")
    plt.xlim(left=0, right=len(ave_values))
    plt.ylim(bottom = -0.001, top=0.02) # zoom in on the lower gradient regions
    plt.xlabel("Layers")
    plt.ylabel("average gradient")
    plt.title("Gradient flow")
    plt.grid(True)
    plt.legend([Line2D([0], [0], color="c", lw=4),
                Line2D([0], [0], color="b", lw=4),
                Line2D([0], [0], color="k", lw=4)], ['max-gradient', 'mean-gradient', 'zero-gradient'])
    # Save before show(): once the figure has been shown, savefig may write a blank image.
    plt.savefig(path)
    plt.show()
    return plt, max_grads


# Main Driver

In [None]:
# Vocabulary: index 0 is padding, 1-26 the letters, then punctuation, space (32),
# '<sos>' (33) and '<eos>' (34).
LETTER_LIST = (
    ['<pad>']
    + list('abcdefghijklmnopqrstuvwxyz')
    + ['-', "'", '.', '_', '+', ' ', '<sos>', '<eos>']
)

# Model, optimisation and schedule.
model = Seq2Seq(input_dim=40, vocab_size=len(LETTER_LIST), hidden_dim=256)
model.to(DEVICE)
print(model)
optimizer = optim.Adam(model.parameters(), lr=0.002)
# Index 0 is '<pad>', so padded target positions do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)
nepochs = 60
batch_size = 64 if DEVICE == 'cuda' else 1
# The learning rate is multiplied by gamma at each of these epochs.
milestones = [20, 25, 30, 35, 40, 45, 50, 52, 54, 56, 58, 60]
# milestones=[2,5,8,11,14,17,20,23,26,29,32,35,38]
scheduler = torch.optim.lr_scheduler.MultiStepLR(
    optimizer, milestones=milestones, gamma=0.6, verbose=True
)

Seq2Seq(
  (encoder): Encoder(
    (lstm): LSTM(40, 256, bidirectional=True)
    (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (lockedDO1): LockedDropout(p=0.4)
    (lockedDO2): LockedDropout(p=0.5)
    (lockedDO3): LockedDropout(p=0.5)
    (pblstm1): pBLSTM(
      (blstm): LSTM(1024, 256, bidirectional=True)
    )
    (bn2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (pblstm2): pBLSTM(
      (blstm): LSTM(1024, 256, bidirectional=True)
    )
    (bn3): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (pblstm3): pBLSTM(
      (blstm): LSTM(1024, 256, bidirectional=True)
    )
    (key_network): Linear(in_features=512, out_features=128, bias=True)
    (value_network): Linear(in_features=512, out_features=128, bias=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(35, 256, padding_idx=0)
    (lstm1): LSTMCell(384, 512)
    (lstm2): LSTMCell(512, 128)
    (a

In [None]:
print("Start loading data")
speech_train, speech_valid, speech_test, transcript_train, transcript_valid = load_data()
letter2idx, idx2letter = create_dictionaries(LETTER_LIST)
character_text_train = transform_letter_to_index(transcript_train, LETTER_LIST, letter2idx)
character_text_valid = transform_letter_to_index(transcript_valid, LETTER_LIST, letter2idx)

train_dataset = Speech2TextDataset(speech_train, character_text_train)
# train_dataset = Speech2TextDataset(speech_valid, character_text_valid)
val_dataset = Speech2TextDataset(speech_valid, character_text_valid)
test_dataset = Speech2TextDataset(speech_test, None, False)

# Only the training data is shuffled; validation reuses the training collate function
# because it also carries transcripts.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_train)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_train)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_test)

# Remove the intermediate names from the notebook namespace.
del (
    train_dataset, val_dataset, test_dataset,
    speech_train, speech_valid, speech_test,
    transcript_train, transcript_valid,
    character_text_train, character_text_valid,
)
print("Done loading data")

Start loading data
Done loading data


In [None]:
# Look at the transcript indices of the first utterance in one training batch.
data = next(iter(train_loader), None)
if data is not None:
    a, b, c, d = data
    print(b[0])

tensor([21, 18,  7,  9, 14,  7, 32,  8,  9, 13, 32, 20, 15, 32,  1,  3, 20,  9,
        15, 14, 32, 23,  8,  9,  3,  8, 32, 23, 15, 21, 12,  4, 32, 15, 14, 12,
        25, 32,  8,  1, 22,  5, 32,  4,  5, 19, 20, 18, 15, 25,  5,  4, 32,  8,
         9, 13, 32,  6, 15, 18,  5, 22,  5, 18, 32,  9, 14, 32, 16, 15, 12,  9,
        20,  9,  3,  1, 12, 32, 12,  9,  6,  5, 32, 23, 15, 21, 12,  4, 32,  8,
         1, 22,  5, 32, 19, 20, 18,  9, 16, 16,  5,  4, 32,  8,  9, 13, 32, 15,
         6, 32,  8,  9, 19, 32,  9, 14,  6, 12, 21,  5, 14,  3,  5, 32,  5, 24,
         9, 12,  5,  4, 32,  8,  9, 13, 32,  6, 18, 15, 13, 32, 20,  8,  1, 20,
        32, 16, 15, 19,  9, 20,  9, 15, 14, 32,  9, 14, 32,  3, 15, 14,  7, 18,
         5, 19, 19, 32, 23,  8,  5, 18,  5, 32,  8,  5, 32,  3, 15, 21, 12,  4,
        32, 18,  5, 14,  4,  5, 18, 32, 20,  8,  5, 32, 13, 15, 19, 20, 32,  5,
         6,  6,  9,  3,  9,  5, 14, 20, 32, 19,  5, 18, 22,  9,  3,  5, 32, 20,
         8,  1, 20, 32, 23,  1, 19, 32, 

In [None]:
# Decode the third validation transcript back to text, skipping '<pad>' (0) and '<eos>' (34).
# The name character_text_valid was deleted in the data loading cell, so read the
# transcript through the dataset that val_loader still holds.
sample_indices = val_loader.dataset.text[2]
res = "".join(idx2letter[idx] for idx in sample_indices if idx not in (0, 34))
print(res)

In [None]:
# Module-level counter that Decoder.forward reads to schedule the teacher-forcing rate.
epoch_counter = 0
for epoch in range(nepochs):
    print(f"Start epoch {epoch+1}")
    print("Start training")
    # train() returns the attention weights of the last batch as a numpy array.
    attn_mask = train(model, train_loader, criterion, optimizer, epoch)
    # Advance the MultiStepLR schedule once per epoch.
    scheduler.step()
    print(f"the tf rate now is {model.decoder.teacher_force_rate}")
    # The counter only starts advancing once epoch index 17 is reached, so the
    # teacher-forcing rate keeps its starting value before that.
    if epoch >= 17:
        epoch_counter += 1 
    # Save the attention map of the first utterance of the last batch.
    # NOTE(review): '/content/' looks like a Colab-specific path - confirm before running elsewhere.
    plot_attn_flow(attn_mask[0,:,:], '/content/epoch_'+str(epoch+1)+'_adaTF.jpg')
    # print("Start validation")
    # dist = val(model, val_loader, criterion, epoch)
    # print(f"val dist is {dist}")
print("Ended training")

Start epoch 1
Start training
Epoch: 1, Batch: 50, Running train loss is 2.595596504211426
sample output is tn thet taand tot ttiseet tn ti thet tn ths ansrre n tnsrneytn tntast ae tare tiaorg  rtes  teul thetn tntast ah l thr  aai sn tn tf taaid te i  tnd tn txaau  teoat nieinsaaain aaaiin aaaiiiiiiiiiiiiiiiiiiin  aaiiiiiiiiiiiiiiiiiiiiiiiiin  aaiiiiiiiiiiiiiin  aaiiin  aaiiii
sample label is it that cried nora didn't i say that it was a person a girl if i must be more definite ruth roberts if i must tell just who it is oh cried belle and ah echoed brenda<eos>
This epoch takes up to now 58.02694225311279 seconds
Epoch: 1, Batch: 100, Running train loss is 1.8414970636367798
sample output is tou tuseennt n ts teice toase  tnain tou teoohtf too s tvr hetou to n ed the can   tnd thmk d tpcer the coald s tantees  oonreng tone tor ard thth tncocehean tes soeln tnd tas sadd ttast tft eeaaaaiieen eenteeaaaeeeeeeeeeeeeeeeeeeeeeen eeenteeaeenteeeeeenteeenteeaeeeen eeaaaaaeeeeeeeeeeen eeeeeeeeee

KeyboardInterrupt: ignored

In [None]:
print("Start validation")
# val() returns a single value (the mean Levenshtein distance), not a (loss, dist) pair.
dist = val(model, val_loader, criterion, 0)
print(f"val dist is {dist}")

Start validation
val loss is 0.186939703461802, dist is 20.594203852686743


In [None]:
# Decode the test set and write one predicted transcript per row, indexed by "id".
result = test(model, test_loader, 0)
print(result[0])
df = pd.DataFrame({'label': result})
df.to_csv("./submission.csv", index_label="id")

iitt                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    
