# Question Answering with SQUAD

### Setup (if using Google Colaboratory)

In [4]:
# from google.colab import drive
# drive.mount('/content/drive')

In [5]:
# %cd /content/drive/My\ Drive/GaTech/Fall20/NLP-TA/HW5

### Imports

In [7]:
import pandas as pd
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
from tqdm.notebook import tqdm
random.seed(1)
%load_ext autoreload
%autoreload 2

## Load Data

Read in the pickle files provided to you. We have preprocessed and tokenized all the question answering data form you. Just run the following cells to initialize your word and char to index maps as well as training data. 

For `word2idx` and `char2idx`, index 0 is reserved for *<unk>* and 1 is reserved for *<pad>*. 

In [11]:
word2idx_file = 'data/word2idx.pickle'
char2idx_file = 'data/char2idx.pickle'
train_pkl = 'data/train.pkl'
valid_pkl = 'data/valid.pkl'

In [9]:
word2idx = pickle.load(open(word2idx_file, 'rb')) # word2idx 
idx2word = {v : k for k, v in word2idx.items()}
word_vocab = list(word2idx.keys())
print("Vocab size:", len(word_vocab))
# for i in range(5):
    # print(word_vocab[i], word2idx[word_vocab[i]], idx2word[word2idx[word_vocab[i]]])
# Print <unk> and <pad>
print(idx2word[0], word2idx['<unk>'])
print(idx2word[1], word2idx['<pad>'])

Vocab size: 23284
<unk> 0
<pad> 1


In [10]:
MAX_WORD_LEN = max([len(x) for x in word2idx.keys()])
MAX_WORD_LEN

30

In [11]:
char2idx = pickle.load(open(char2idx_file, 'rb')) # char2idx 
idx2char = {v : k for k, v in char2idx.items()}
char_vocab = list(char2idx.keys())
print("Unique chars:", len(char_vocab))
print(idx2char[0], char2idx['<unk>'])
print(idx2char[1], char2idx['<pad>'])

Unique chars: 96
<unk> 0
<pad> 1


In [12]:
val_df = pd.read_pickle(valid_pkl).reset_index(drop=True)
train_df = pd.read_pickle(train_pkl).reset_index(drop=True)
print("Train samples", len(train_df))
print("Valid samples", len(val_df))
train_df.head()

Train samples 8504
Valid samples 2013


Unnamed: 0,id,context,question,label,answer,context_ids,question_ids,label_idx
0,56be85543aeaaa14008c9063,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,"[269, 286]",in the late 1990s,"[72, 9510, 3491, 19, 2772, 28, 13132, 9511, 19...","[55, 21, 98, 412, 1067, 510, 5]","[56, 59]"
1,56be85543aeaaa14008c9065,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,"[207, 226]",singing and dancing,"[72, 9510, 3491, 19, 2772, 28, 13132, 9511, 19...","[11, 270, 21, 98, 5164, 7, 90, 128, 12, 1204, ...","[44, 46]"
2,56be85543aeaaa14008c9066,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,"[526, 530]",2003,"[72, 9510, 3491, 19, 2772, 28, 13132, 9511, 19...","[55, 21, 98, 1571, 550, 16, 609, 8, 176, 10, 7...","[112, 112]"
3,56bf6b0f3aeaaa14008c9601,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,"[166, 180]","Houston, Texas","[72, 9510, 3491, 19, 2772, 28, 13132, 9511, 19...","[31, 25, 82, 8, 153, 21, 98, 293, 4104, 113, 5]","[36, 38]"
4,56bf6b0f3aeaaa14008c9602,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,"[276, 286]",late 1990s,"[72, 9510, 3491, 19, 2772, 28, 13132, 9511, 19...","[31, 35, 857, 21, 98, 176, 765, 5]","[58, 59]"


In [13]:
sample = train_df.sample(1).iloc[0]
print("Question:", sample.question)
print("Question Ids:", sample.question_ids)
print("Answer:", sample.answer)
print("Answer Ids:", [word2idx[x] for x in sample.answer.split()])
[char_start, char_end] = sample.label
print("Context:", sample.context[char_start - 30 : char_end + 30])
[token_start, token_end] = sample.label_idx
print("Context Ids:", sample.context_ids[token_start - 5: token_end + 5])

Question: Shuman patented his solar engine system in what year?
Question Ids: [2581, 12835, 36, 107, 2090, 151, 7, 25, 59, 5]
Answer: 1912
Answer Ids: [11205]
Context: entire solar engine system by 1912.
Context Ids: [935, 107, 2090, 151, 24, 11205, 6]


In [14]:
# Counts
print("Train samples", len(train_df))
print("Val samples", len(val_df))

Train samples 8504
Val samples 2013


In [15]:
gold_val_answers = val_df[['id', 'answer']].groupby('id')['answer'].apply(list)
gold_val_answers.head()

id
56ddde6b9a695914005b9628                     [France, France, France, France]
56ddde6b9a695914005b9629    [10th and 11th centuries, in the 10th and 11th...
56ddde6b9a695914005b962a    [Denmark, Iceland and Norway, Denmark, Iceland...
56ddde6b9a695914005b962b                         [Rollo, Rollo, Rollo, Rollo]
56ddde6b9a695914005b962c    [10th century, the first half of the 10th cent...
Name: answer, dtype: object

# 1. BiLSTM + Attention (same as [BiDAF](https://arxiv.org/abs/1611.01603) but different encoder)

> Let's implement a variant of BiDAF using the same LSTM code from HW4 as encoder. 


<img src="img/BiDAF.png">

Here, we will swap out *Contextual Embed Layer*, *Word Embed Layer*, and *Character Embed Layer* with our own LSTM-based encoder. 

In [37]:
# you will need to import again after every change you make
#    autoreload does not work here for some reason (let us 
#    know on piazza if you figure out a fix)
from MyQA import *

In [32]:
sampleInput = train_df.iloc[0]
context = sampleInput.context_ids # tokenized context converted to ids
question = sampleInput.question_ids # tokenized query converted to ids
labels = sampleInput.label_idx # [start_idx, end_idx] of answer
# create char inputs for context and question
context_chars = []
for w in context:
    w = idx2word[w]
    padding = ['<pad>'] * (MAX_WORD_LEN - len(w))
    context_chars += padding + list(w)
context_chars = prepare_sequence(context_chars, char2idx)

question_chars = []
for w in question:
    w = idx2word[w]
    padding = ['<pad>'] * (MAX_WORD_LEN - len(w))
    question_chars += padding + list(w)
question_chars = prepare_sequence(question_chars, char2idx)
context = torch.tensor(context, dtype=torch.long)
question = torch.tensor(question, dtype=torch.long)

In [33]:
# Hyperparameters (you will need to change these)
# Encoding layer
EMBEDDING_DIM = 6
HIDDEN_DIM = 3
LEARNING_RATE = 0.1
LSTM_LAYERS = 1
DROPOUT = 0
EPOCHS = 2
CHAR_EMBEDDING_DIM = 3
CHAR_HIDDEN_DIM = 3
BIDIRECTIONAL_LSTM = True
# Modeling layer
LSTM_LAYERS_MODELING = 2

### Part 1: Encoder

We will use the same POSTagger architecture to encode our passage and questions. The minor distinction here is, you have to encode inputs twice. Once for passage ("context") and one for the question ("query"). *Check that this works before moving on the the next part.*
We will also omit the HighWay Layer for this HW. However, you are welcome to add it for Part 2. 

Implement code in `LSTMEncoder` class in `MyQA.py`. 

In [34]:
directions = 1 if not BIDIRECTIONAL_LSTM else 2
encode = LSTMEncoder(EMBEDDING_DIM, HIDDEN_DIM, 
                     CHAR_EMBEDDING_DIM, CHAR_HIDDEN_DIM, 
                     len(char2idx), len(word2idx), lstm_layers=LSTM_LAYERS, 
                     bidirectional=BIDIRECTIONAL_LSTM, dropout=DROPOUT)
context_enc, query_enc = encode(context, context_chars, question, question_chars)

# verify this with various hyperparameters (by running this cell multiple times)

assert context_enc.shape == (len(context), HIDDEN_DIM*directions)
assert query_enc.shape == (len(question), HIDDEN_DIM*directions)

print("SUCCESS")

SUCCESS


### Part 2: Attention Layer

Implement the Attention Flow Layer same as in BiDAF paper inside `AttentionFlow`. More instructions are included in `MyQA.py`. 



In [35]:
attention = AttentionFlow(HIDDEN_DIM * directions * 3)
G = attention(context_enc, query_enc)
print(G.shape)

# verify this with various hyperparameters (by running this cell multiple times)
assert G.shape == (len(context), HIDDEN_DIM * directions * 4)
print("SUCCESS")


torch.Size([149, 24])
SUCCESS


### Part 3: Modeling Layer

Implement the `ModelingLayer` class in `MyQA.py` same as in BiDAF paper. More instructions are included in `MyQA.py`

In [46]:
from MyQA import *
modeling = ModelingLayer(HIDDEN_DIM * directions * 4, HIDDEN_DIM, 
                         num_layers=LSTM_LAYERS_MODELING, dropout=DROPOUT, 
                         bidirectional=BIDIRECTIONAL_LSTM)
M = modeling(G)
print('M', M.shape, len(context), HIDDEN_DIM * directions)
# verify this with various hyperparameters (by running this cell multiple times)
assert M.shape == (len(context), HIDDEN_DIM * directions)

print("SUCCESS")

M torch.Size([149, 6]) 149 6
SUCCESS


### Part 4: Output Layer

Implement the `OutputLayer` class in `MyQA.py` that will yield start_idx and end_idx vectors.


In [12]:
output = OutputLayer(HIDDEN_DIM*directions*5, HIDDEN_DIM * directions, 
                     HIDDEN_DIM, bidirectional=BIDIRECTIONAL_LSTM)

start, end = output(G.double(), M)

# verify this with various hyperparameters (by running this cell multiple times)
start = start.unsqueeze(0)
end = end.unsqueeze(0)

assert start.shape == (1, len(context))
assert end.shape == (1, len(context))

print("SUCCESS")

SUCCESS


### Finally implement the `BiDAF` class

Now combine all the modules you just wrote into `BiDAF` class in `MyQA.py`

In [17]:
model = BiDAF(EMBEDDING_DIM, HIDDEN_DIM, CHAR_EMBEDDING_DIM, 
               CHAR_HIDDEN_DIM, len(char2idx), len(word2idx), 
               bidirectional=BIDIRECTIONAL_LSTM, phrase_LSTM_layers=LSTM_LAYERS, 
               modeling_LSTM_layers=LSTM_LAYERS_MODELING, dropout=DROPOUT)

start, end = model(context, context_chars, question, question_chars)
start = start.unsqueeze(0)
end = end.unsqueeze(0)

# verify this with various hyperparameters (by running this cell multiple times)
assert start.shape == (1, len(context))
assert end.shape == (1, len(context))

print("SUCCESS")

SUCCESS


# 2. Implement Training and Eval

For this homework, you are required to just implement the train and eval functions. You don't need to train it until convergence or until it reaches high accuracy. However, as you might've noticed, the hyperparameters are the same as HW4. After implementing train and eval, your goal is to train the network for 2 epochs and tune hyperparameters so that the loss is decreasing consistently. We will verify this from your outputs. 

The idea here is to see the network we constructed above work. 

In [68]:
# # Hyperparameters (you will need to change these)
# #Iter: 8503/8504	Avg Train Loss: 9.4279	Avg Val Loss: 9.2280
# # Encoding layer
# EMBEDDING_DIM = 12
# HIDDEN_DIM = 6
# LEARNING_RATE = 0.005
# LSTM_LAYERS = 1
# DROPOUT = 0.1
# EPOCHS = 2
# CHAR_EMBEDDING_DIM = 6
# CHAR_HIDDEN_DIM = 6
# BIDIRECTIONAL_LSTM = True
# # Modeling layer
# LSTM_LAYERS_MODELING = 2

# Hyperparameters (you will need to change these)
# Iter: 8503/8504	Avg Train Loss: 9.2655	Avg Val Loss: 9.0793
# Encoding layer
# EMBEDDING_DIM = 32
# HIDDEN_DIM = 16
# LEARNING_RATE = 0.01
# LSTM_LAYERS = 2
# DROPOUT = 0.1
# EPOCHS = 2
# CHAR_EMBEDDING_DIM = 16
# CHAR_HIDDEN_DIM = 16
# BIDIRECTIONAL_LSTM = True
# # Modeling layer
# LSTM_LAYERS_MODELING = 2

#Epoch1: Iter: 8503/8504	Avg Train Loss: 9.3010	Avg Val Loss: 9.0261
#Epoch2: Iter: 8503/8504	Avg Train Loss: 8.8123	Avg Val Loss: 8.7616
# Encoding layer
# EMBEDDING_DIM = 64
# HIDDEN_DIM = 32
# LEARNING_RATE = 0.01
# LSTM_LAYERS = 2
# DROPOUT = 0.1
# EPOCHS = 2
# CHAR_EMBEDDING_DIM = 32
# CHAR_HIDDEN_DIM = 32
# BIDIRECTIONAL_LSTM = True
# # Modeling layer
# LSTM_LAYERS_MODELING = 2


# Encoding layer

EMBEDDING_DIM = 256
HIDDEN_DIM = 128
LEARNING_RATE = 0.01
LSTM_LAYERS = 2
DROPOUT = 0.1
EPOCHS = 2
CHAR_EMBEDDING_DIM = 128
CHAR_HIDDEN_DIM = 128
BIDIRECTIONAL_LSTM = True
# Modeling layer
LSTM_LAYERS_MODELING = 2

In [69]:
def get_eval_scores(predictions):
    """2D list of [[ID, PRED], [ID, PRED], ...]"""
    f1 = exact_match = 0
    for row in predictions:
        [ID, PRED] = row
        goldans = gold_val_answers.loc[ID]
        f1 += max(compute_f1(g, PRED) for g in goldans)
        exact_match += max(compute_exact(g, PRED) for g in goldans)
    num_samples = len(predictions)
    return f1 / num_samples, exact_match / num_samples

In [71]:
def train_BiDAF(epoch, model, loss_function, optimizer):
    train_loss = 0
    train_examples = len(train_df)
    itr = 0
    for i, row in tqdm(train_df.iterrows(), total=len(train_df)):
        # for i, row in train_df.iterrows():
        context = row.context_ids # tokenized context converted to ids
        question = row.question_ids # tokenized query converted to ids
        labels = row.label_idx # [start_idx, end_idx] of answer
        #############################################################################
        # TODO: Implement the training loop
        # Find the gradient with respect to the loss and update the model 
        #   parameters using the optimizer.
        # `context` and `question` are both already tokenized and converted to ids
        # You need to prepare your character level input. Feel free to use the 
        #   prepare_sequence method for this. 
        #############################################################################
        
        # create char inputs for context and question           
        context_chars = []
        for w in context:
            w = idx2word[w]
            padding = ['<pad>'] * (MAX_WORD_LEN - len(w))
            context_chars += padding + list(w)
        context_chars = prepare_sequence(context_chars, char2idx)

        question_chars = []
        for w in question:
            w = idx2word[w]
            padding = ['<pad>'] * (MAX_WORD_LEN - len(w))
            question_chars += padding + list(w)
        question_chars = prepare_sequence(question_chars, char2idx)
        
        # use the model
        context = torch.tensor(context, dtype=torch.long)
        question = torch.tensor(question, dtype=torch.long)
        start, end = model(context, context_chars, question, question_chars)
        
        # prepare data
        start = start.reshape(1,-1)
        end = end.reshape(1,-1)
        labels = torch.tensor(labels)
        se = torch.cat((start, end), dim=0)
        
        # caculate the loss here
#         loss = 2 * loss_function(se, labels)
        loss = loss_function(se, labels)
        
        # update the loss
        model.zero_grad()
        loss.backward()
        optimizer.step()
        
        # update train_loss
        train_loss += loss.item()
        
#         print(torch.norm(model.outputLayer.fc_GM.weight))
               
        #############################################################################
        #                             END OF YOUR CODE                              #
        #############################################################################
        if ((i+1) % 100) == 0:
            avg_train_loss = train_loss / i
#             print("\nLoss:", avg_train_loss)
            avg_val_loss, f1_avg, exact_match_avg = eval_BiDAF(model, loss_function, rand_samples=10)
            print("Iter: {}/{}\tAvg Train Loss: {:.4f}\tAvg Val Loss: {:.4f}\tVal F1: {:.0f} \tVal ExactMatch: {:.0f}".format(i, train_examples, avg_train_loss, avg_val_loss, f1_avg, exact_match_avg))
    
    avg_train_loss = train_loss / train_examples
    avg_val_loss, f1_avg, exact_match_avg = eval_BiDAF(model, loss_function, rand_samples=1000)        
    print("Iter: {}/{}\tAvg Train Loss: {:.4f}\tAvg Val Loss: {:.4f}\tVal F1: {:.0f} \tVal ExactMatch: {:.0f}".format(i, train_examples, avg_train_loss, avg_val_loss, f1_avg, exact_match_avg))
    return model

def eval_BiDAF(model, loss_function, rand_samples=-1):
    val_loss = 0
    if rand_samples != -1:
        validation_set = val_df.sample(rand_samples)
    else:
        validation_set = val_df
    val_examples = len(validation_set)
    predictions = []
    with torch.no_grad():
        for i, row in tqdm(validation_set.iterrows(), total=len(validation_set)):
            # for i, row in validation_set.iterrows(): # if tqdm is not installed 
            context = row.context_ids # tokenized context converted to ids
            orig_context = prepare_sequence(context, idx2word)
            question = row.question_ids # tokenized query converted to ids
            [true_start, true_end] = row.label_idx # [start_idx, end_idx] of answer
            pred_start, pred_end = None, None # maxidx of the softmax outputs
            #############################################################################
            # TODO: Implement the training loop
            # Find the gradient with respect to the loss and update the model 
            #   parameters using the optimizer.
            # `context` and `question` are both already tokenized and converted to ids
            # You need to prepare your character level input. Feel free to use the 
            #   prepare_sequence method for this. 
            #############################################################################
            
            # create char inputs for context and question           
            context_chars = []
            for w in context:
                w = idx2word[w]
                padding = ['<pad>'] * (MAX_WORD_LEN - len(w))
                context_chars += padding + list(w)
            context_chars = prepare_sequence(context_chars, char2idx)

            question_chars = []
            for w in question:
                w = idx2word[w]
                padding = ['<pad>'] * (MAX_WORD_LEN - len(w))
                question_chars += padding + list(w)
            question_chars = prepare_sequence(question_chars, char2idx)

            # use the model
            context = torch.tensor(context, dtype=torch.long)
            question = torch.tensor(question, dtype=torch.long)
            start, end = model(context, context_chars, question, question_chars)
            
            # prepare data
            start = start.reshape(1,-1)
            end = end.reshape(1,-1)
            labels = torch.tensor([true_start, true_end])
            se = torch.cat((start, end), dim=0)

            # caculate the loss here
            loss = loss_function(se, labels)
            
            #update loss
            val_loss += loss.item()
            
            # get the prediction here.
            pred_start, pred_end = int(start.argmax(dim = 1)[0]), int(end.argmax(dim = 1)[0])
            #############################################################################
            #                             END OF YOUR CODE                              #
            #############################################################################
#             predictions.append([row.id, orig_context[pred_start : pred_end + 1]])
            predictions.append([row.id, ' '.join(orig_context[pred_start : pred_end + 1])])
            
    f1_avg, exact_match_avg = get_eval_scores(predictions)
    avg_val_loss = val_loss / val_examples
    return avg_val_loss, f1_avg*100, exact_match_avg*100

#############################################################################
# TODO: define loss function and optimizer to be used in train_BiDAF 
#############################################################################
model = BiDAF(EMBEDDING_DIM, HIDDEN_DIM, CHAR_EMBEDDING_DIM, 
               CHAR_HIDDEN_DIM, len(char2idx), len(word2idx), 
               bidirectional=BIDIRECTIONAL_LSTM, phrase_LSTM_layers=LSTM_LAYERS, 
               modeling_LSTM_layers=LSTM_LAYERS_MODELING, dropout=DROPOUT)
device = torch.device("cpu")
model.to(device)

from torch.autograd import Variable

def loss_fn(data, labels):
    loss = Variable(torch.zeros(1))
    for d, label in zip(data, labels):
        loss -= torch.log(d[label]).cpu()
    return loss

# Define loss function
loss_function = loss_fn

# Define optimizer
parameters = filter(lambda p: p.requires_grad, model.parameters())
        
    # in the paper: We use the AdaDelta (Zeiler, 2012) optimizer
    # with a minibatch size of 60 and an initial learning rate of 0.5, for 12 epochs.

optimizer = optim.Adadelta(parameters, lr=LEARNING_RATE)
# optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()))
for epoch in range(1, EPOCHS + 1):
    model = train_BiDAF(epoch, model, loss_function, optimizer)
#     eval_BiDAF(model, loss_function, optimizer)

#############################################################################
#                             END OF YOUR CODE                              #
#############################################################################

HBox(children=(FloatProgress(value=0.0, max=8504.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 99/8504	Avg Train Loss: 10.5517	Avg Val Loss: 9.2310	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 199/8504	Avg Train Loss: 10.6183	Avg Val Loss: 9.0218	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 299/8504	Avg Train Loss: 10.3697	Avg Val Loss: 8.9682	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 399/8504	Avg Train Loss: 10.2203	Avg Val Loss: 9.3847	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 499/8504	Avg Train Loss: 10.1632	Avg Val Loss: 9.1513	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 599/8504	Avg Train Loss: 10.0834	Avg Val Loss: 9.0893	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 699/8504	Avg Train Loss: 10.0746	Avg Val Loss: 8.9206	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 799/8504	Avg Train Loss: 10.0609	Avg Val Loss: 8.6278	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 899/8504	Avg Train Loss: 9.9895	Avg Val Loss: 8.7143	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 999/8504	Avg Train Loss: 9.9620	Avg Val Loss: 9.7531	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 1099/8504	Avg Train Loss: 10.0033	Avg Val Loss: 8.8350	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 1199/8504	Avg Train Loss: 9.9142	Avg Val Loss: 9.2636	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 1299/8504	Avg Train Loss: 9.8350	Avg Val Loss: 8.8611	Val F1: 10 	Val ExactMatch: 10


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 1399/8504	Avg Train Loss: 9.8006	Avg Val Loss: 8.8387	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 1499/8504	Avg Train Loss: 9.7771	Avg Val Loss: 8.8880	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 1599/8504	Avg Train Loss: 9.7464	Avg Val Loss: 9.6214	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 1699/8504	Avg Train Loss: 9.7665	Avg Val Loss: 8.7381	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 1799/8504	Avg Train Loss: 9.7382	Avg Val Loss: 9.5916	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 1899/8504	Avg Train Loss: 9.7069	Avg Val Loss: 9.0205	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 1999/8504	Avg Train Loss: 9.6607	Avg Val Loss: 9.4314	Val F1: 10 	Val ExactMatch: 10


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 2099/8504	Avg Train Loss: 9.6409	Avg Val Loss: 8.4356	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 2199/8504	Avg Train Loss: 9.6288	Avg Val Loss: 8.8209	Val F1: 7 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 2299/8504	Avg Train Loss: 9.6212	Avg Val Loss: 8.6899	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 2399/8504	Avg Train Loss: 9.6046	Avg Val Loss: 9.5524	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 2499/8504	Avg Train Loss: 9.5841	Avg Val Loss: 8.8215	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 2599/8504	Avg Train Loss: 9.5735	Avg Val Loss: 8.9406	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 2699/8504	Avg Train Loss: 9.5413	Avg Val Loss: 9.5628	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 2799/8504	Avg Train Loss: 9.4964	Avg Val Loss: 9.0848	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 2899/8504	Avg Train Loss: 9.4771	Avg Val Loss: 8.7419	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 2999/8504	Avg Train Loss: 9.4544	Avg Val Loss: 8.9423	Val F1: 7 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 3099/8504	Avg Train Loss: 9.4264	Avg Val Loss: 8.6001	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 3199/8504	Avg Train Loss: 9.3971	Avg Val Loss: 8.5990	Val F1: 7 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 3299/8504	Avg Train Loss: 9.3749	Avg Val Loss: 9.6370	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 3399/8504	Avg Train Loss: 9.3484	Avg Val Loss: 7.7346	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 3499/8504	Avg Train Loss: 9.3238	Avg Val Loss: 8.9533	Val F1: 5 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 3599/8504	Avg Train Loss: 9.2916	Avg Val Loss: 9.0662	Val F1: 3 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 3699/8504	Avg Train Loss: 9.2653	Avg Val Loss: 8.4618	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 3799/8504	Avg Train Loss: 9.2443	Avg Val Loss: 9.1021	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 3899/8504	Avg Train Loss: 9.2316	Avg Val Loss: 7.3838	Val F1: 10 	Val ExactMatch: 10


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 3999/8504	Avg Train Loss: 9.2344	Avg Val Loss: 8.7836	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 4099/8504	Avg Train Loss: 9.2250	Avg Val Loss: 8.5577	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 4199/8504	Avg Train Loss: 9.2270	Avg Val Loss: 8.0411	Val F1: 10 	Val ExactMatch: 10


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 4299/8504	Avg Train Loss: 9.2058	Avg Val Loss: 8.2953	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 4399/8504	Avg Train Loss: 9.1995	Avg Val Loss: 8.1728	Val F1: 10 	Val ExactMatch: 10


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 4499/8504	Avg Train Loss: 9.2079	Avg Val Loss: 8.1139	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 4599/8504	Avg Train Loss: 9.2004	Avg Val Loss: 8.9907	Val F1: 10 	Val ExactMatch: 10


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 4699/8504	Avg Train Loss: 9.1931	Avg Val Loss: 7.9327	Val F1: 10 	Val ExactMatch: 10


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 4799/8504	Avg Train Loss: 9.1803	Avg Val Loss: 8.0523	Val F1: 7 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 4899/8504	Avg Train Loss: 9.1652	Avg Val Loss: 9.0543	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 4999/8504	Avg Train Loss: 9.1470	Avg Val Loss: 9.1354	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 5099/8504	Avg Train Loss: 9.1260	Avg Val Loss: 7.5989	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 5199/8504	Avg Train Loss: 9.1116	Avg Val Loss: 8.2519	Val F1: 10 	Val ExactMatch: 10


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 5299/8504	Avg Train Loss: 9.0809	Avg Val Loss: 8.4289	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 5399/8504	Avg Train Loss: 9.0680	Avg Val Loss: 9.1075	Val F1: 10 	Val ExactMatch: 10


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 5499/8504	Avg Train Loss: 9.0494	Avg Val Loss: 9.2547	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 5599/8504	Avg Train Loss: 9.0118	Avg Val Loss: 7.6234	Val F1: 10 	Val ExactMatch: 10


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 5699/8504	Avg Train Loss: 8.9791	Avg Val Loss: 8.4357	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 5799/8504	Avg Train Loss: 8.9490	Avg Val Loss: 9.1461	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 5899/8504	Avg Train Loss: 8.9165	Avg Val Loss: 8.8867	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 5999/8504	Avg Train Loss: 8.9075	Avg Val Loss: 9.2342	Val F1: 10 	Val ExactMatch: 10


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 6099/8504	Avg Train Loss: 8.8970	Avg Val Loss: 8.5831	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 6199/8504	Avg Train Loss: 8.8875	Avg Val Loss: 8.5495	Val F1: 10 	Val ExactMatch: 10


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 6299/8504	Avg Train Loss: 8.8877	Avg Val Loss: 7.9889	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 6399/8504	Avg Train Loss: 8.8810	Avg Val Loss: 8.3196	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 6499/8504	Avg Train Loss: 8.8715	Avg Val Loss: 8.9264	Val F1: 7 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 6599/8504	Avg Train Loss: 8.8663	Avg Val Loss: 7.4644	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 6699/8504	Avg Train Loss: 8.8646	Avg Val Loss: 7.9143	Val F1: 7 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 6799/8504	Avg Train Loss: 8.8603	Avg Val Loss: 9.6422	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 6899/8504	Avg Train Loss: 8.8638	Avg Val Loss: 8.1678	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 6999/8504	Avg Train Loss: 8.8516	Avg Val Loss: 7.4809	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 7099/8504	Avg Train Loss: 8.8373	Avg Val Loss: 7.9319	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 7199/8504	Avg Train Loss: 8.8185	Avg Val Loss: 9.1129	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 7299/8504	Avg Train Loss: 8.7995	Avg Val Loss: 8.1023	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 7399/8504	Avg Train Loss: 8.7791	Avg Val Loss: 7.3643	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 7499/8504	Avg Train Loss: 8.7578	Avg Val Loss: 8.3251	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 7599/8504	Avg Train Loss: 8.7439	Avg Val Loss: 8.2591	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 7699/8504	Avg Train Loss: 8.7346	Avg Val Loss: 8.4338	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 7799/8504	Avg Train Loss: 8.7250	Avg Val Loss: 8.2312	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 7899/8504	Avg Train Loss: 8.7195	Avg Val Loss: 7.8782	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 7999/8504	Avg Train Loss: 8.7011	Avg Val Loss: 9.3012	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 8099/8504	Avg Train Loss: 8.6914	Avg Val Loss: 8.3324	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 8199/8504	Avg Train Loss: 8.6772	Avg Val Loss: 7.7303	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 8299/8504	Avg Train Loss: 8.6657	Avg Val Loss: 8.5995	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 8399/8504	Avg Train Loss: 8.6535	Avg Val Loss: 8.1370	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 8499/8504	Avg Train Loss: 8.6388	Avg Val Loss: 7.5894	Val F1: 20 	Val ExactMatch: 20



HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))


Iter: 8503/8504	Avg Train Loss: 8.6370	Avg Val Loss: 8.0690	Val F1: 1 	Val ExactMatch: 1


HBox(children=(FloatProgress(value=0.0, max=8504.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 99/8504	Avg Train Loss: 8.9243	Avg Val Loss: 7.8045	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 199/8504	Avg Train Loss: 9.0872	Avg Val Loss: 7.6557	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 299/8504	Avg Train Loss: 8.8741	Avg Val Loss: 7.8198	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 399/8504	Avg Train Loss: 8.7081	Avg Val Loss: 7.9156	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 499/8504	Avg Train Loss: 8.6598	Avg Val Loss: 8.9914	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 599/8504	Avg Train Loss: 8.5495	Avg Val Loss: 8.3991	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 699/8504	Avg Train Loss: 8.5541	Avg Val Loss: 8.3606	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 799/8504	Avg Train Loss: 8.4733	Avg Val Loss: 8.2148	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 899/8504	Avg Train Loss: 8.3473	Avg Val Loss: 7.2722	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 999/8504	Avg Train Loss: 8.2623	Avg Val Loss: 8.9138	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 1099/8504	Avg Train Loss: 8.3073	Avg Val Loss: 8.5810	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 1199/8504	Avg Train Loss: 8.2254	Avg Val Loss: 8.0908	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 1299/8504	Avg Train Loss: 8.1501	Avg Val Loss: 7.9142	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 1399/8504	Avg Train Loss: 8.1377	Avg Val Loss: 7.7923	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 1499/8504	Avg Train Loss: 8.1349	Avg Val Loss: 8.7703	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 1599/8504	Avg Train Loss: 8.1213	Avg Val Loss: 8.7080	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 1699/8504	Avg Train Loss: 8.1366	Avg Val Loss: 8.5209	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 1799/8504	Avg Train Loss: 8.1030	Avg Val Loss: 9.1756	Val F1: 10 	Val ExactMatch: 10


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 1899/8504	Avg Train Loss: 8.0698	Avg Val Loss: 7.6496	Val F1: 20 	Val ExactMatch: 20


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 1999/8504	Avg Train Loss: 8.0352	Avg Val Loss: 8.3879	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 2099/8504	Avg Train Loss: 8.0280	Avg Val Loss: 9.6482	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 2199/8504	Avg Train Loss: 8.0143	Avg Val Loss: 7.7835	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 2299/8504	Avg Train Loss: 8.0124	Avg Val Loss: 8.5985	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 2399/8504	Avg Train Loss: 8.0030	Avg Val Loss: 7.1830	Val F1: 10 	Val ExactMatch: 10


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 2499/8504	Avg Train Loss: 7.9870	Avg Val Loss: 6.8910	Val F1: 20 	Val ExactMatch: 20


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 2599/8504	Avg Train Loss: 7.9937	Avg Val Loss: 7.4313	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 2699/8504	Avg Train Loss: 7.9667	Avg Val Loss: 7.5607	Val F1: 10 	Val ExactMatch: 10


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 2799/8504	Avg Train Loss: 7.9469	Avg Val Loss: 8.6725	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 2899/8504	Avg Train Loss: 7.9373	Avg Val Loss: 7.7464	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 2999/8504	Avg Train Loss: 7.9333	Avg Val Loss: 8.1197	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 3099/8504	Avg Train Loss: 7.9038	Avg Val Loss: 8.9863	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 3199/8504	Avg Train Loss: 7.8699	Avg Val Loss: 9.4811	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 3299/8504	Avg Train Loss: 7.8486	Avg Val Loss: 8.6827	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 3399/8504	Avg Train Loss: 7.8179	Avg Val Loss: 7.5281	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 3499/8504	Avg Train Loss: 7.7812	Avg Val Loss: 9.1783	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 3599/8504	Avg Train Loss: 7.7422	Avg Val Loss: 7.2604	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 3699/8504	Avg Train Loss: 7.7103	Avg Val Loss: 10.0745	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 3799/8504	Avg Train Loss: 7.6954	Avg Val Loss: 9.5206	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 3899/8504	Avg Train Loss: 7.6913	Avg Val Loss: 8.2574	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 3999/8504	Avg Train Loss: 7.7029	Avg Val Loss: 7.7940	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 4099/8504	Avg Train Loss: 7.6990	Avg Val Loss: 7.8519	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 4199/8504	Avg Train Loss: 7.7125	Avg Val Loss: 8.3942	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 4299/8504	Avg Train Loss: 7.7055	Avg Val Loss: 7.7486	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 4399/8504	Avg Train Loss: 7.7041	Avg Val Loss: 8.1656	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 4499/8504	Avg Train Loss: 7.7176	Avg Val Loss: 8.7309	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 4599/8504	Avg Train Loss: 7.7150	Avg Val Loss: 8.0743	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 4699/8504	Avg Train Loss: 7.7180	Avg Val Loss: 6.6568	Val F1: 20 	Val ExactMatch: 20


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 4799/8504	Avg Train Loss: 7.7214	Avg Val Loss: 7.7282	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 4899/8504	Avg Train Loss: 7.7173	Avg Val Loss: 8.3094	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 4999/8504	Avg Train Loss: 7.7155	Avg Val Loss: 7.0743	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 5099/8504	Avg Train Loss: 7.7105	Avg Val Loss: 8.6019	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 5199/8504	Avg Train Loss: 7.7105	Avg Val Loss: 7.4391	Val F1: 10 	Val ExactMatch: 10


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 5299/8504	Avg Train Loss: 7.6877	Avg Val Loss: 7.8425	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 5399/8504	Avg Train Loss: 7.6812	Avg Val Loss: 7.8866	Val F1: 10 	Val ExactMatch: 10


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 5499/8504	Avg Train Loss: 7.6597	Avg Val Loss: 8.9851	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 5599/8504	Avg Train Loss: 7.6209	Avg Val Loss: 8.3597	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 5699/8504	Avg Train Loss: 7.5931	Avg Val Loss: 8.6731	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 5799/8504	Avg Train Loss: 7.5701	Avg Val Loss: 8.3024	Val F1: 10 	Val ExactMatch: 10


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 5899/8504	Avg Train Loss: 7.5431	Avg Val Loss: 8.0344	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 5999/8504	Avg Train Loss: 7.5398	Avg Val Loss: 9.0069	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 6099/8504	Avg Train Loss: 7.5363	Avg Val Loss: 8.6908	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 6199/8504	Avg Train Loss: 7.5422	Avg Val Loss: 8.5533	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 6299/8504	Avg Train Loss: 7.5532	Avg Val Loss: 8.5357	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 6399/8504	Avg Train Loss: 7.5594	Avg Val Loss: 7.2305	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 6499/8504	Avg Train Loss: 7.5576	Avg Val Loss: 8.0178	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 6599/8504	Avg Train Loss: 7.5570	Avg Val Loss: 7.3979	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 6699/8504	Avg Train Loss: 7.5585	Avg Val Loss: 6.0215	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 6799/8504	Avg Train Loss: 7.5599	Avg Val Loss: 7.8409	Val F1: 10 	Val ExactMatch: 10


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 6899/8504	Avg Train Loss: 7.5679	Avg Val Loss: 8.9082	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 6999/8504	Avg Train Loss: 7.5636	Avg Val Loss: 9.1221	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 7099/8504	Avg Train Loss: 7.5589	Avg Val Loss: 9.1005	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 7199/8504	Avg Train Loss: 7.5490	Avg Val Loss: 7.4146	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 7299/8504	Avg Train Loss: 7.5400	Avg Val Loss: 7.5686	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 7399/8504	Avg Train Loss: 7.5268	Avg Val Loss: 9.1987	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 7499/8504	Avg Train Loss: 7.5108	Avg Val Loss: 7.2683	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 7599/8504	Avg Train Loss: 7.5058	Avg Val Loss: 7.8747	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 7699/8504	Avg Train Loss: 7.5074	Avg Val Loss: 7.7366	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 7799/8504	Avg Train Loss: 7.5053	Avg Val Loss: 7.0224	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 7899/8504	Avg Train Loss: 7.5069	Avg Val Loss: 9.0812	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 7999/8504	Avg Train Loss: 7.4964	Avg Val Loss: 7.7116	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 8099/8504	Avg Train Loss: 7.4937	Avg Val Loss: 8.5764	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 8199/8504	Avg Train Loss: 7.4868	Avg Val Loss: 7.6860	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 8299/8504	Avg Train Loss: 7.4803	Avg Val Loss: 7.7987	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 8399/8504	Avg Train Loss: 7.4770	Avg Val Loss: 7.4427	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 8499/8504	Avg Train Loss: 7.4703	Avg Val Loss: 9.1803	Val F1: 0 	Val ExactMatch: 0



HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))


Iter: 8503/8504	Avg Train Loss: 7.4693	Avg Val Loss: 7.9120	Val F1: 1 	Val ExactMatch: 1


In [26]:
len(torch.tensor([[[1]]]).shape)

3

In [53]:
a = '123'
string.ascii_lowercase

'abcdefghijklmnopqrstuvwxyz'