<a href="https://colab.research.google.com/github/whoami-Lory271/NN-project-memorizing-transformers/blob/main/NN_project_Antonelli_DeSantis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
from torch import nn as nn
import numpy as np
from torch.nn import functional as F
from math import sqrt
import matplotlib.pyplot as plt
from torch.autograd import Variable
from pathlib import Path
from filelock import FileLock
import random
import tqdm
import gzip
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.utils import resample
from sklearn.model_selection import train_test_split

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# KNN Memory

In [3]:
!pip install faiss-gpu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [4]:
!pip install einops

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting einops
  Downloading einops-0.6.0-py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.6/41.6 KB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: einops
Successfully installed einops-0.6.0


In [5]:
#import per la knn memory
import os
import math
import torch
import faiss
import numpy as np
from pathlib import Path
from functools import wraps

from contextlib import ExitStack, contextmanager

from einops import rearrange, pack, unpack

# multiprocessing

from joblib import Parallel, delayed, cpu_count

In [6]:
FAISS_INDEX_GPU_ID = int(os.getenv('FAISS_INDEX_GPU_ID', 0))

DEFAULT_KNN_MEMORY_MEMMAP_DIRECTORY = './.tmp/knn.memories'

# helper functions

def exists(val):
    return val is not None

def default(val, d):
    return val if exists(val) else d

def cast_list(val):
    return val if isinstance(val, list) else [val]

def all_el_unique(arr):
    return len(set(arr)) == len(arr)

@contextmanager
def multi_context(*cms):
    with ExitStack() as stack:
        yield [stack.enter_context(cls) for cls in cms]

def count_intersect(x, y):
    # returns an array that shows how many times an element in x is contained in tensor y
    return np.sum(rearrange(x, 'i -> i 1') == rearrange(y, 'j -> 1 j'), axis = -1)

def check_shape(tensor, pattern, **kwargs):
    return rearrange(tensor, f"{pattern} -> {pattern}", **kwargs)

# a wrapper around faiss IndexIVFFlat
# taking care of expiring old keys automagically

class KNN():
    def __init__(
        self,
        dim,
        max_num_entries,
        cap_num_entries = False,
        M = 15,
        keep_stats = False
    ):
        index = faiss.IndexHNSWFlat(dim, M, faiss.METRIC_INNER_PRODUCT)
        self.index = index
        self.max_num_entries = max_num_entries
        self.cap_num_entries = cap_num_entries
        self.is_trained = False
        self.keep_stats = keep_stats

        self.reset()

    def __del__(self):
        if hasattr(self, 'index'):
            del self.index

    def reset(self):
        self.ids = np.empty((0,), dtype = np.int32)

        if self.keep_stats:
            self.hits = np.empty((0,), dtype = np.int32)
            self.age_num_iterations = np.empty((0,), dtype = np.int32)
            self.ages_since_last_hit = np.empty((0,), dtype = np.int32)

        self.index.reset()
        self.is_trained = False

    def train(self, x):
        self.index.train(x)
        self.is_trained = True

    def add(self, x, ids):
        if not self.is_trained:
            self.train(x)

        self.ids = np.concatenate((ids, self.ids))

        if self.keep_stats:
            self.hits = np.concatenate((np.zeros_like(ids), self.hits))
            self.age_num_iterations = np.concatenate((np.zeros_like(ids), self.age_num_iterations))
            self.ages_since_last_hit = np.concatenate((np.zeros_like(ids), self.ages_since_last_hit))

        if self.cap_num_entries and len(self.ids) > self.max_num_entries:
            self.reset()

        return self.index.add(x)

    def search(
        self,
        x,
        topk,
        nprobe = 8,
        return_distances = False,
        increment_hits = False,
        increment_age = True
    ):
        if not self.is_trained:
            return np.full((x.shape[0], topk), -1)

        distances, indices = self.index.search(x, k = topk)

        if increment_hits and self.keep_stats:
            hits = count_intersect(self.ids, rearrange(indices, '... -> (...)'))
            self.hits += hits

            self.ages_since_last_hit += 1
            self.ages_since_last_hit *= (hits == 0)

        if increment_age and self.keep_stats:
            self.age_num_iterations += 1

        if return_distances:
            return indices, distances

        return indices

# KNN memory layer, where one can store key / value memories
# can automatically take care of a collection of faiss indices (across batch dimension)

class KNNMemory():
    def __init__(
        self,
        dim,
        max_memories = 16000,
        num_indices = 1,
        memmap_filename = './knn.memory.memmap',
        multiprocessing = True
    ):
        self.dim = dim
        self.num_indices = num_indices
        self.scoped_indices = list(range(num_indices))

        self.max_memories = max_memories
        self.shape = (num_indices, max_memories, 2, dim)
        self.db_offsets = np.zeros(num_indices, dtype = np.int32)

        self.db = np.memmap(memmap_filename, mode = 'w+', dtype = np.float32, shape = self.shape)
        self.knns = [KNN(dim = dim, max_num_entries = max_memories, cap_num_entries = True) for _ in range(num_indices)]
    
        self.n_jobs = cpu_count() if multiprocessing else 1

    def set_scoped_indices(self, indices):
        indices = list(indices)
        assert all_el_unique(indices), f'all scoped batch indices must be unique, received: {indices}'
        assert all([0 <= i < self.num_indices for i in indices]), f'each batch index must be between 0 and less than {self.num_indices}: received {indices}'
        self.scoped_indices = indices

    @contextmanager
    def at_batch_indices(self, indices):
        prev_indices = self.scoped_indices
        self.set_scoped_indices(indices)
        yield self
        self.set_scoped_indices(prev_indices)

    def clear(self, batch_indices = None):
        if not exists(batch_indices):
            batch_indices = list(range(self.num_indices))

        batch_indices = cast_list(batch_indices)

        for index in batch_indices:
            knn = self.knns[index]
            knn.reset()

        self.db_offsets[batch_indices] = 0

    def add(self, memories):
        check_shape(memories, 'b n kv d', d = self.dim, kv = 2, b = len(self.scoped_indices))

        memories = memories.detach().cpu().numpy()
        memories = memories[:, -self.max_memories:]
        num_memories = memories.shape[1]

        knn_insert_ids = np.arange(num_memories)

        keys = np.ascontiguousarray(memories[..., 0, :])
        knns = [self.knns[i] for i in self.scoped_indices]
        db_offsets = [self.db_offsets[i] for i in self.scoped_indices]

        # use joblib to insert new key / value memories into faiss index

        @delayed
        def knn_add(knn, key, db_offset):
            knn.add(key, ids = knn_insert_ids + db_offset)
            return knn

        updated_knns = Parallel(n_jobs = self.n_jobs)(knn_add(*args) for args in zip(knns, keys, db_offsets))
        for knn_idx, scoped_idx in enumerate(self.scoped_indices):
            self.knns[scoped_idx] = updated_knns[knn_idx]

        # add the new memories to the memmap "database"

        add_indices = (rearrange(np.arange(num_memories), 'j -> 1 j') + rearrange(self.db_offsets[list(self.scoped_indices)], 'i -> i 1')) % self.max_memories
        self.db[rearrange(np.array(self.scoped_indices), 'i -> i 1'), add_indices] = memories
        self.db.flush()

        self.db_offsets += num_memories

    def search(
        self,
        queries,
        topk,
        nprobe = 8,
        increment_hits = True,
        increment_age = True
    ):
        check_shape(queries, 'b ... d', d = self.dim, b = len(self.scoped_indices))
        queries, ps = pack([queries], 'b * d')

        device = queries.device
        queries = queries.detach().cpu().numpy()

        all_masks = []
        all_key_values = []

        knns = [self.knns[i] for i in self.scoped_indices]

        # parallelize faiss search

        @delayed
        def knn_search(knn, query):
            return knn.search(query, topk, nprobe, increment_hits = increment_hits, increment_age = increment_age)

        fetched_indices = Parallel(n_jobs = self.n_jobs)(knn_search(*args) for args in zip(knns, queries))

        # get all the memory key / values from memmap 'database'
        # todo - remove for loop below

        for batch_index, indices in zip(self.scoped_indices, fetched_indices):
            mask = indices !=  -1
            db_indices = np.where(mask, indices, 0)

            all_masks.append(torch.from_numpy(mask))

            key_values = self.db[batch_index, db_indices % self.max_memories]
            all_key_values.append(torch.from_numpy(key_values))

        all_masks = torch.stack(all_masks)
        all_key_values = torch.stack(all_key_values)
        all_key_values = all_key_values.masked_fill(~rearrange(all_masks, '... -> ... 1 1'), 0.)

        all_key_values, = unpack(all_key_values, ps, 'b * n kv d')
        all_masks, = unpack(all_masks, ps, 'b * n')

        return all_key_values.to(device), all_masks.to(device)

    def __del__(self):
        if hasattr(self, 'knns'):
            for knn in self.knns:
                del knn
        del self.db

# extends list with some extra methods for collections of KNN memories

class KNNMemoryList(list):
    def cleanup(self):
        for memory in self:
            del memory

    @classmethod
    def create_memories(
        self,
        *,
        batch_size,
        num_memory_layers,
        memories_directory = DEFAULT_KNN_MEMORY_MEMMAP_DIRECTORY
    ):
        memories_path = Path(memories_directory)
        memories_path.mkdir(exist_ok = True, parents = True)

        def inner(*args, **kwargs):
            return self([KNNMemory(*args, num_indices = batch_size, memmap_filename = str(memories_path / f'knn.memory.layer.{ind + 1}.memmap'), **kwargs) for ind in range(num_memory_layers)])
        return inner

    @contextmanager
    def at_batch_indices(
        self,
        indices
    ):
        knn_batch_indices_contexts = [memory.at_batch_indices(indices) for memory in self]
        with multi_context(*knn_batch_indices_contexts):
            yield

    def clear_memory(
        self,
        batch_indices = None,
        memory_indices = None
    ):
        memory_indices = default(memory_indices, tuple(range(len(self))))

        for memory_index in memory_indices:
            memory = self[memory_index]
            memory.clear(batch_indices)

# Memorizing transformers

In [7]:
def attention(query, key, value, sqrt_q, device):
    t = torch.matmul(query, key.transpose(-2, -1))/sqrt_q
    i, j = t.shape[-2:]
    mask = torch.ones((i, j), dtype = torch.bool, device = device).triu(j - i + 1)
    return torch.matmul(F.softmax(t.masked_fill_(mask, -1e-9), dim = -1), value)

def KNNattention(query, key, value, sqrt_q, mask):
    t = torch.einsum('b h i d, b h i j d -> b h i j', query, key)/sqrt_q
    return torch.einsum('b h i j, b h i j d -> b h i d', F.softmax(t.masked_fill_(mask, -1e-9), dim = -1), value)

In [8]:
class MultiHeadAttention(nn.Module):
  def __init__(self, n, d, h, batch_size):
    super(MultiHeadAttention, self).__init__()
    assert d % h == 0
    #assume q = v 
    self.q = d // h
    self.sqrt_q = sqrt(self.q)
    self.h = h
    self.batch_size = batch_size
    self.W_q = nn.Linear(d, d, bias = False) #stack of h matrices of dimension (d, q), one for each head
    self.W_k = nn.Linear(d, d, bias = False)
    self.W_v = nn.Linear(d, d, bias = False)
    self.W_o = nn.Linear(d, d, bias = False)

  def forward(self, x, device):
    query = self.W_q(x).view(self.batch_size, -1, self.h, self.q).transpose(1, 2)
    key = self.W_k(x).view(self.batch_size, -1, self.h, self.q).transpose(1, 2)
    value = self.W_v(x).view(self.batch_size, -1, self.h, self.q).transpose(1, 2)
    new_memories = torch.stack((key, value), dim = -2).detach()
    attention_value = attention(query, key, value, self.sqrt_q, device)
    return self.W_o(attention_value.transpose(1, 2).contiguous().view(self.batch_size, -1, self.h*self.q)), new_memories

In [9]:
class KNNAttention(nn.Module):
   def __init__(self, n, d, h, num_retrieved_memories, batch_size):
      super(KNNAttention, self).__init__()
      assert d % h == 0
      #assume q = v 
      self.q = d // h
      self.sqrt_q = sqrt(self.q)
      self.h = h
      self.W_q = nn.Linear(d, d, bias = False)
      self.W_k = nn.Linear(d, d, bias = False)
      self.W_v = nn.Linear(d, d, bias = False)
      self.W_o = nn.Linear(d, d, bias = False)
      self.b_g = nn.Parameter(torch.randn((h,))) #one for each head
      self.num_retrieved_memories = num_retrieved_memories
      self.batch_size = batch_size

   def forward(self, x, knn_memory, device):
      # calculate local attention 
      query = self.W_q(x).view(self.batch_size, -1, self.h, self.q).transpose(1, 2)
      key = self.W_k(x).view(self.batch_size, -1, self.h, self.q).transpose(1, 2)
      value = self.W_v(x).view(self.batch_size, -1, self.h, self.q).transpose(1, 2)
      local_attention = attention(query, key, value, self.sqrt_q, device)

      # calculate knn attention over memory
      mem_kv, mem_mask = knn_memory[0].search(query, self.num_retrieved_memories)
      mem_key, mem_value = mem_kv.unbind(dim = -2)
      knn_attention = KNNattention(query, mem_key, mem_value, self.sqrt_q, ~mem_mask)

      # memory to be stored
      new_kv_memories = torch.stack((key, value), dim = -2).view(self.batch_size, -1, 2, self.q).detach()

      # add to knn memory
      if new_kv_memories.numel() > 0:
        knn_memory[0].add(new_kv_memories)

      # combining local and memory
      g = torch.sigmoid(self.b_g)
      final_attention = torch.einsum('b h n d, h -> b h n d', knn_attention, g) + \
                        torch.einsum('b h n d, h -> b h n d', local_attention, (1 - g))
      
      return self.W_o(final_attention.transpose(1, 2).contiguous().view(self.batch_size, -1, self.h*self.q)), new_kv_memories

In [10]:
class SubLayer(nn.Module):
  def __init__(self, d, dropout, hidden_size):
    super(SubLayer, self).__init__()
    self.norm = nn.LayerNorm(d)
    self.mlp = nn.Sequential(nn.Linear(d, hidden_size, bias = True), 
                             nn.ReLU(),
                             nn.Dropout(dropout),
                             nn.Linear(hidden_size, d, bias = True))

  def forward(self, x):
    return x + self.mlp(self.norm(x)) #residual connection and normalization

In [11]:
class PositionalEncoding(nn.Module):
  def __init__(self, d_model, max_len=5000):
    super(PositionalEncoding, self).__init__()
    
    # Compute the positional encodings once in log space.
    pe = torch.zeros(max_len, d_model)
    position = torch.arange(0, max_len).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, d_model, 2) *
                          -(math.log(10000.0) / d_model))
    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)
    pe = pe.unsqueeze(0)
    self.register_buffer('pe', pe)
      
  def forward(self, x):
    return x + Variable(self.pe[:, :x.size(1)], requires_grad=False)

In [12]:
class MemorizingTransformer(nn.Module):
    def __init__(
          self,
          num_tokens,
          d,
          heads = 8,
          depth = 10,
          knn_attn_idx = 7,
          attn_dropout = 0.,
          hidden_size = 1000,
          dropout = 0.3,
          max_knn_memories = 1000,
          num_retrieved_memories = 8,
          knn_memories_directory = DEFAULT_KNN_MEMORY_MEMMAP_DIRECTORY,
          knn_memory_multiprocessing = False,
          batch_size = 16
      ):
          # asserts
          assert d % heads == 0
          assert knn_attn_idx < depth

          super(MemorizingTransformer, self).__init__()
          self.token_emb = nn.Embedding(num_tokens, d)
          self.positional_emb = PositionalEncoding(d, max_len = 5000)
          self.dim_head = d // heads
          self.d = d
          self.heads = heads
          self.knn_attn_idx = knn_attn_idx
          self.depth = depth
          self.attn_dropout = attn_dropout
          self.hidden_size = hidden_size
          self.dropout = dropout
          self.max_knn_memories = max_knn_memories
          self.num_retrieved_memories = num_retrieved_memories
          self.knn_memories_directory = knn_memories_directory
          self.knn_memory_multiprocessing =knn_memory_multiprocessing
          self.batch_size = batch_size

          self.layers = nn.ModuleList([])
          for idx in range(depth):
              attn = KNNAttention(num_tokens, d, heads, num_retrieved_memories, self.batch_size) \
                  if idx == knn_attn_idx else MultiHeadAttention(num_tokens, d, heads, self.batch_size)

              self.layers.append(nn.ModuleList([
                  attn,
                  SubLayer(d, dropout, hidden_size)
              ]))

          self.to_out = nn.Sequential(
               nn.LayerNorm(d),
               nn.Linear(d, num_tokens)
          )

          # knn memories init

          self.knn_mem_kwargs = dict(
              dim = self.dim_head,
              max_memories = self.max_knn_memories,
              multiprocessing = knn_memory_multiprocessing
          )
          
    def forward(
        self,
        x,
        knn_memory
    ):
        batch_size, seq_len, *_, device = *x.shape, x.device
        x = self.token_emb(x)
        x = self.positional_emb(x)

        for idx, (attn, sub_l) in enumerate(self.layers):
            
            #attention

            x, mem = attn(x, knn_memory, device) if self.knn_attn_idx == idx else attn(x, device)
      
            # normalization + feedforward + residual connection

            x = sub_l(x)

        return self.to_out(x).transpose(1, 2)

    
    def create_knn_memories(
          self,
          *,
          batch_size
      ):  
          return KNNMemoryList.create_memories(
              batch_size = batch_size,
              num_memory_layers = 1,
              memories_directory = self.knn_memories_directory
          )(**self.knn_mem_kwargs)
      
    @contextmanager
    def knn_memories_context(
        self,
        **kwargs
    ):
        knn_dir = Path(self.knn_memories_directory)
        knn_dir.mkdir(exist_ok = True, parents = True)
        lock = FileLock(str(knn_dir / 'mutex'))

        with lock:
            knn_memories = self.create_knn_memories(**kwargs)
            yield knn_memories
            knn_memories.cleanup()

    def clear_memory(self, x, token_id):
        """ clears the KNN memories based on if the batch row contains the specified token id """
        """ for auto-clearing KNN memories based on start and end of strings """

        clear_memory = (x == token_id).any(dim = -1)
        batch_indices, _ = clear_memory.nonzero(as_tuple = True)
        batch_indices_to_clear = batch_indices.tolist()

        if len(batch_indices_to_clear) == 0:
            return

        knn_memories.clear_memory(batch_indices_to_clear)

# Training

In [13]:
# constants

NUM_BATCHES = int(1e5)
BATCH_SIZE = 16
SEQ_LEN = 512
SEGMENTS = 5
HEADS = 8
DIM_HEAD = SEQ_LEN // HEADS

LEARNING_RATE = 2e-4
MAX_GRAD_CLIP_NORM = 0.5

EVAL_EVERY = 20
GENERATE_EVERY  = 500
GENERATE_LENGTH = 512
CHECKPOINT = 100

In [14]:
model = MemorizingTransformer(
    num_tokens = 256,
    d = SEQ_LEN,
    heads = HEADS,
    batch_size = BATCH_SIZE,
    num_retrieved_memories = 32
).cuda()

# prepare enwik8 data

#Lorenzo
with gzip.open('/content/drive/MyDrive/Secondo Anno/Neural Networks/project/enwik8.gz') as file:
    X = np.fromstring(file.read(int(95e6)), dtype=np.uint8)
    print(X.shape)
    # number of samples to take
    n_samples = math.ceil(0.6*X.shape[0])
    # take the set uniformly at random
    data = resample(X, n_samples=n_samples, replace=False)
    trX, vaX = train_test_split(data, test_size=math.ceil(0.2*data.shape[0]))
    # trX, vaX = np.split(X, [int(90e6)])
    print(trX.shape)
    print(vaX.shape)
    # assert False
    data_train, data_val = torch.from_numpy(trX), torch.from_numpy(vaX)
    # data = torch.from_numpy(X)
"""

#Luigi
with gzip.open('/content/drive/MyDrive/Colab Notebooks/enwik8.gz') as file:
    X = np.fromstring(file.read(int(95e6)), dtype=np.uint8)
    trX, vaX = np.split(X, [int(90e6)])
    data_train, data_val = torch.from_numpy(trX), torch.from_numpy(vaX)
"""

class TextSamplerDataset(Dataset):
    def __init__(self, data, seq_len):
        super().__init__()
        self.data = data
        self.seq_len = seq_len

    def __getitem__(self, index):
        rand_start = torch.randint(0, self.data.size(0) - self.seq_len, (1,))
        full_seq = self.data[rand_start: rand_start + self.seq_len + 1].long()
        return full_seq.cuda()

    def __len__(self):
        return self.data.size(0) // self.seq_len

# # dataset and dataloader
# dataset = TextSamplerDataset(data, SEQ_LEN)
# # test_dataset = TextSamplerDataset(data_val, SEQ_LEN)

# data_size = dataset.__len__()
# # data_test_size = test_dataset.__len__()

# perc_data = 0.3
# valid_size=0.2
# indices = list(range(data_size))
# np.random.shuffle(indices)
# data_size = int(np.floor(data_size * 0.3))
# print(data_size)
# indices = indices[:data_size]

# split = int(np.floor(valid_size * data_size))
# train_idx, valid_idx = indices[split:], indices[:split]

# # define samplers for obtaining training and validation batches
# train_sampler = torch.utils.data.SubsetRandomSampler(train_idx)
# valid_sampler = torch.utils.data.SubsetRandomSampler(valid_idx)

# train_loader  = DataLoader(dataset, batch_size = BATCH_SIZE, sampler = train_sampler, drop_last = True)

# test_loader = DataLoader(dataset, batch_size = BATCH_SIZE, sampler =valid_sampler, drop_last = True)


train_dataset = TextSamplerDataset(data_train, SEQ_LEN)
train_loader  = DataLoader(train_dataset, batch_size = BATCH_SIZE, drop_last = True)
test_dataset = TextSamplerDataset(data_val, SEQ_LEN)
test_loader = DataLoader(test_dataset, batch_size = BATCH_SIZE, drop_last = True)

  X = np.fromstring(file.read(int(95e6)), dtype=np.uint8)


(95000000,)
(45600000,)
(11400000,)


In [15]:
def print_string(a):
  seq = ""
  for word in a:
    for letter in word:
      seq += chr(letter)
    seq += " "
  return seq

In [None]:
# optimizer

optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)
loss = nn.CrossEntropyLoss()

# training

perplexity_list = []
for i, data in enumerate(tqdm.tqdm(train_loader, desc = 'training')):
    model.train()

    train_loss = 0.
    with model.knn_memories_context(batch_size = BATCH_SIZE) as knn_memories:
        
        seq, labels = data[:, :-1], data[:, 1:] #the labels are the same sequences shifted by one

        out = model(
              seq,
              knn_memory = knn_memories
        )
        #loss_item = torch.exp(loss(out, labels)) #perplexity
        loss_item = loss(out, labels)
        train_loss += loss_item
        loss_item.backward() 

    print(f'training loss: {train_loss}', flush = True)
    torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_CLIP_NORM)
    optimizer.step()
    optimizer.zero_grad()

    if i % EVAL_EVERY == 0:
        model.eval()
      
        test_data = None
        for test_data in test_loader:
          break

        test_loss = 0.

        with torch.no_grad(), model.knn_memories_context(batch_size = BATCH_SIZE) as knn_memories: 
            seq, labels = test_data[:, :-1], test_data[:, 1:]
            
            out = model(
              seq,
              knn_memory = knn_memories
            )

            loss_item = loss(out, labels)
            test_loss +=  loss_item
            

        print(f'valid loss: {test_loss}', flush = True)
        print(f'perplexity: {torch.exp(test_loss)}', flush = True)
        perplexity_list.append(torch.exp(test_loss).to('cpu').item())
    
    if i % CHECKPOINT == 0:
      torch.save({
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
      }, 'model_optimizer2.pt')
      #Lorenzo
      with open('/content/drive/MyDrive/Università/Magistrale/Secondo Anno/Neural Networks/project/perplexity_moreNN.npy', 'wb') as f:
        np.save(f, np.array(perplexity_list))

plt.plot(perplexity_list, label = "Memorizing Transformer Perplexity Plot")
plt.legend()
plt.show()

training:   0%|          | 0/5566 [00:00<?, ?it/s]

training loss: 5.972207546234131
valid loss: 4.803369045257568
perplexity: 121.92048645019531


training:   0%|          | 1/5566 [00:09<14:56:40,  9.67s/it]

training loss: 4.894824028015137


training:   0%|          | 2/5566 [00:13<9:27:59,  6.12s/it] 

training loss: 4.410308837890625


training:   0%|          | 3/5566 [00:16<7:44:06,  5.01s/it]

training loss: 4.162806034088135


training:   0%|          | 4/5566 [00:20<7:01:52,  4.55s/it]

training loss: 3.996558666229248


training:   0%|          | 5/5566 [00:23<6:02:16,  3.91s/it]

training loss: 3.9087722301483154


training:   0%|          | 6/5566 [00:25<5:10:02,  3.35s/it]

training loss: 3.810528516769409


training:   0%|          | 7/5566 [00:28<4:36:51,  2.99s/it]

training loss: 3.80539608001709


training:   0%|          | 8/5566 [00:30<4:14:34,  2.75s/it]

training loss: 3.7608442306518555


training:   0%|          | 9/5566 [00:32<4:00:47,  2.60s/it]

training loss: 3.698481321334839


training:   0%|          | 10/5566 [00:34<3:50:36,  2.49s/it]

training loss: 3.6910696029663086


training:   0%|          | 11/5566 [00:37<3:43:28,  2.41s/it]

training loss: 3.642563819885254


training:   0%|          | 12/5566 [00:39<3:38:03,  2.36s/it]

training loss: 3.64902400970459


training:   0%|          | 13/5566 [00:41<3:34:15,  2.32s/it]

training loss: 3.6028151512145996


training:   0%|          | 14/5566 [00:43<3:31:39,  2.29s/it]

training loss: 3.617840528488159


training:   0%|          | 15/5566 [00:46<3:31:47,  2.29s/it]

training loss: 3.5949044227600098


training:   0%|          | 16/5566 [00:48<3:30:39,  2.28s/it]

training loss: 3.5943567752838135


training:   0%|          | 17/5566 [00:50<3:30:59,  2.28s/it]

training loss: 3.579521417617798


training:   0%|          | 18/5566 [00:52<3:29:31,  2.27s/it]

training loss: 3.5569164752960205


training:   0%|          | 19/5566 [00:55<3:28:35,  2.26s/it]

training loss: 3.55481219291687


training:   0%|          | 20/5566 [00:57<3:28:03,  2.25s/it]

training loss: 3.566908597946167
valid loss: 3.5559935569763184
perplexity: 35.02259826660156


training:   0%|          | 21/5566 [01:01<4:24:42,  2.86s/it]

training loss: 3.5539965629577637


training:   0%|          | 22/5566 [01:03<4:09:32,  2.70s/it]

training loss: 3.5470376014709473


training:   0%|          | 23/5566 [01:06<3:57:06,  2.57s/it]

training loss: 3.5318338871002197


training:   0%|          | 24/5566 [01:08<3:48:07,  2.47s/it]

training loss: 3.5373806953430176


training:   0%|          | 25/5566 [01:10<3:41:48,  2.40s/it]

training loss: 3.5495803356170654


training:   0%|          | 26/5566 [01:12<3:37:56,  2.36s/it]

training loss: 3.52970552444458


training:   0%|          | 27/5566 [01:15<3:34:57,  2.33s/it]

training loss: 3.5340049266815186


training:   1%|          | 28/5566 [01:17<3:32:26,  2.30s/it]

training loss: 3.539123058319092


training:   1%|          | 29/5566 [01:20<3:42:28,  2.41s/it]

training loss: 3.5448155403137207


training:   1%|          | 30/5566 [01:23<3:56:29,  2.56s/it]

training loss: 3.5408754348754883


training:   1%|          | 31/5566 [01:25<3:47:04,  2.46s/it]

training loss: 3.5160176753997803


training:   1%|          | 32/5566 [01:27<3:41:26,  2.40s/it]

training loss: 3.515538454055786


training:   1%|          | 33/5566 [01:29<3:36:22,  2.35s/it]

training loss: 3.5209438800811768


training:   1%|          | 34/5566 [01:31<3:33:34,  2.32s/it]

training loss: 3.5307443141937256


training:   1%|          | 35/5566 [01:34<3:31:28,  2.29s/it]

training loss: 3.5478322505950928


training:   1%|          | 36/5566 [01:36<3:29:46,  2.28s/it]

training loss: 3.5634641647338867


training:   1%|          | 37/5566 [01:38<3:29:05,  2.27s/it]

training loss: 3.522979974746704


training:   1%|          | 38/5566 [01:40<3:27:15,  2.25s/it]

training loss: 3.501546621322632


training:   1%|          | 39/5566 [01:43<3:28:12,  2.26s/it]

training loss: 3.564802885055542


training:   1%|          | 40/5566 [01:45<3:26:46,  2.25s/it]

training loss: 3.5432887077331543
valid loss: 3.5377936363220215
perplexity: 34.39095687866211


training:   1%|          | 41/5566 [01:49<4:20:58,  2.83s/it]

training loss: 3.539222002029419


training:   1%|          | 42/5566 [01:51<4:04:34,  2.66s/it]

training loss: 3.5374701023101807


training:   1%|          | 43/5566 [01:54<3:52:34,  2.53s/it]

training loss: 3.510319471359253


training:   1%|          | 44/5566 [01:56<3:44:51,  2.44s/it]

training loss: 3.542377471923828


training:   1%|          | 45/5566 [01:58<3:39:07,  2.38s/it]

training loss: 3.5089974403381348


training:   1%|          | 46/5566 [02:00<3:35:13,  2.34s/it]

training loss: 3.5247931480407715


training:   1%|          | 47/5566 [02:03<3:32:26,  2.31s/it]

training loss: 3.526601791381836


training:   1%|          | 48/5566 [02:05<3:30:52,  2.29s/it]

training loss: 3.5296740531921387


training:   1%|          | 49/5566 [02:07<3:28:56,  2.27s/it]

training loss: 3.517510175704956


training:   1%|          | 50/5566 [02:09<3:28:09,  2.26s/it]

training loss: 3.5229320526123047


training:   1%|          | 51/5566 [02:11<3:27:06,  2.25s/it]

training loss: 3.525209665298462


training:   1%|          | 52/5566 [02:14<3:26:51,  2.25s/it]

training loss: 3.537555694580078


training:   1%|          | 53/5566 [02:16<3:26:44,  2.25s/it]

training loss: 3.544950008392334


training:   1%|          | 54/5566 [02:18<3:26:04,  2.24s/it]

training loss: 3.5144436359405518


training:   1%|          | 55/5566 [02:20<3:26:04,  2.24s/it]

training loss: 3.5118203163146973


training:   1%|          | 56/5566 [02:23<3:26:21,  2.25s/it]

training loss: 3.5408713817596436


training:   1%|          | 57/5566 [02:25<3:26:36,  2.25s/it]

training loss: 3.5375285148620605


training:   1%|          | 58/5566 [02:28<3:41:54,  2.42s/it]

training loss: 3.5448381900787354


training:   1%|          | 59/5566 [02:30<3:44:06,  2.44s/it]

training loss: 3.525055408477783


training:   1%|          | 60/5566 [02:32<3:37:59,  2.38s/it]

training loss: 3.519597053527832
valid loss: 3.5168869495391846
perplexity: 33.679420471191406


training:   1%|          | 61/5566 [02:37<4:29:24,  2.94s/it]

training loss: 3.5500926971435547


training:   1%|          | 62/5566 [02:39<4:11:18,  2.74s/it]

training loss: 3.5401909351348877


training:   1%|          | 63/5566 [02:41<3:57:39,  2.59s/it]

training loss: 3.5083985328674316


training:   1%|          | 64/5566 [02:44<3:48:24,  2.49s/it]

training loss: 3.511108160018921


training:   1%|          | 65/5566 [02:46<3:41:27,  2.42s/it]

training loss: 3.516770124435425


training:   1%|          | 66/5566 [02:48<3:37:14,  2.37s/it]

training loss: 3.502488613128662


training:   1%|          | 67/5566 [02:50<3:33:24,  2.33s/it]

training loss: 3.533906936645508


training:   1%|          | 68/5566 [02:52<3:31:13,  2.31s/it]

training loss: 3.527475357055664


training:   1%|          | 69/5566 [02:55<3:29:05,  2.28s/it]

training loss: 3.5153963565826416


training:   1%|▏         | 70/5566 [02:57<3:27:52,  2.27s/it]

training loss: 3.5022294521331787


training:   1%|▏         | 71/5566 [02:59<3:26:46,  2.26s/it]

training loss: 3.518782615661621


training:   1%|▏         | 72/5566 [03:01<3:26:06,  2.25s/it]

training loss: 3.5292270183563232


training:   1%|▏         | 73/5566 [03:04<3:25:21,  2.24s/it]

training loss: 3.543004274368286


training:   1%|▏         | 74/5566 [03:06<3:25:12,  2.24s/it]

training loss: 3.5110223293304443


training:   1%|▏         | 75/5566 [03:08<3:24:53,  2.24s/it]

training loss: 3.5410406589508057


training:   1%|▏         | 76/5566 [03:10<3:24:05,  2.23s/it]

training loss: 3.5465986728668213


training:   1%|▏         | 77/5566 [03:13<3:24:15,  2.23s/it]

training loss: 3.5146267414093018


training:   1%|▏         | 78/5566 [03:15<3:25:25,  2.25s/it]

training loss: 3.538908004760742


training:   1%|▏         | 79/5566 [03:17<3:25:16,  2.24s/it]

training loss: 3.538386821746826


training:   1%|▏         | 80/5566 [03:19<3:24:55,  2.24s/it]

training loss: 3.5317893028259277
valid loss: 3.530390977859497
perplexity: 34.13731002807617


training:   1%|▏         | 81/5566 [03:24<4:19:14,  2.84s/it]

training loss: 3.501634359359741


training:   1%|▏         | 82/5566 [03:26<4:04:42,  2.68s/it]

training loss: 3.5527305603027344


training:   1%|▏         | 83/5566 [03:28<3:53:15,  2.55s/it]

training loss: 3.5295097827911377


training:   2%|▏         | 84/5566 [03:30<3:44:30,  2.46s/it]

training loss: 3.5051913261413574


training:   2%|▏         | 85/5566 [03:33<3:39:16,  2.40s/it]

training loss: 3.5154366493225098


training:   2%|▏         | 86/5566 [03:35<3:34:07,  2.34s/it]

training loss: 3.550697088241577


training:   2%|▏         | 87/5566 [03:37<3:30:22,  2.30s/it]

training loss: 3.5150420665740967


training:   2%|▏         | 88/5566 [03:39<3:29:16,  2.29s/it]

training loss: 3.518157958984375


training:   2%|▏         | 89/5566 [03:42<3:27:45,  2.28s/it]

training loss: 3.526460647583008


training:   2%|▏         | 90/5566 [03:44<3:26:28,  2.26s/it]

training loss: 3.526115655899048


training:   2%|▏         | 91/5566 [03:46<3:27:17,  2.27s/it]

training loss: 3.523620367050171


training:   2%|▏         | 92/5566 [03:48<3:26:02,  2.26s/it]

training loss: 3.5215985774993896


training:   2%|▏         | 93/5566 [03:51<3:25:48,  2.26s/it]

training loss: 3.548900604248047


training:   2%|▏         | 94/5566 [03:53<3:25:25,  2.25s/it]

training loss: 3.491687774658203


training:   2%|▏         | 95/5566 [03:55<3:24:26,  2.24s/it]

training loss: 3.5384018421173096


training:   2%|▏         | 96/5566 [03:57<3:24:33,  2.24s/it]

training loss: 3.5007450580596924


training:   2%|▏         | 97/5566 [03:59<3:23:44,  2.24s/it]

training loss: 3.5169763565063477


training:   2%|▏         | 98/5566 [04:02<3:23:06,  2.23s/it]

training loss: 3.5269501209259033


training:   2%|▏         | 99/5566 [04:04<3:23:46,  2.24s/it]

training loss: 3.5327420234680176


training:   2%|▏         | 100/5566 [04:06<3:22:53,  2.23s/it]

training loss: 3.51596736907959
valid loss: 3.513460397720337
perplexity: 33.564212799072266


training:   2%|▏         | 101/5566 [04:11<4:40:25,  3.08s/it]

training loss: 3.509328842163086


training:   2%|▏         | 102/5566 [04:14<4:21:00,  2.87s/it]

training loss: 3.5345778465270996


training:   2%|▏         | 103/5566 [04:16<4:04:13,  2.68s/it]

training loss: 3.543735980987549


training:   2%|▏         | 104/5566 [04:18<3:53:17,  2.56s/it]

training loss: 3.5277233123779297


training:   2%|▏         | 105/5566 [04:20<3:44:31,  2.47s/it]

training loss: 3.5322518348693848


training:   2%|▏         | 106/5566 [04:23<3:38:20,  2.40s/it]

training loss: 3.517869234085083


training:   2%|▏         | 107/5566 [04:25<3:33:59,  2.35s/it]

training loss: 3.5270607471466064


training:   2%|▏         | 108/5566 [04:27<3:30:24,  2.31s/it]

training loss: 3.5274875164031982


training:   2%|▏         | 109/5566 [04:29<3:27:49,  2.29s/it]

training loss: 3.5305821895599365


training:   2%|▏         | 110/5566 [04:32<3:27:05,  2.28s/it]

training loss: 3.5424680709838867


training:   2%|▏         | 111/5566 [04:34<3:26:20,  2.27s/it]

training loss: 3.5335538387298584


training:   2%|▏         | 112/5566 [04:36<3:25:23,  2.26s/it]

training loss: 3.5266380310058594


training:   2%|▏         | 113/5566 [04:38<3:24:37,  2.25s/it]

training loss: 3.5121288299560547


training:   2%|▏         | 114/5566 [04:41<3:24:06,  2.25s/it]

training loss: 3.5118978023529053


training:   2%|▏         | 115/5566 [04:43<3:23:28,  2.24s/it]

training loss: 3.520329475402832


training:   2%|▏         | 116/5566 [04:45<3:24:11,  2.25s/it]

training loss: 3.5142838954925537


training:   2%|▏         | 117/5566 [04:47<3:25:03,  2.26s/it]

training loss: 3.51157546043396


training:   2%|▏         | 118/5566 [04:50<3:24:44,  2.25s/it]

training loss: 3.516266107559204


training:   2%|▏         | 119/5566 [04:52<3:25:18,  2.26s/it]

training loss: 3.5178987979888916


training:   2%|▏         | 120/5566 [04:54<3:26:06,  2.27s/it]

training loss: 3.5266497135162354
valid loss: 3.523954391479492
perplexity: 33.91829299926758


training:   2%|▏         | 121/5566 [04:58<4:20:35,  2.87s/it]

training loss: 3.5208089351654053


training:   2%|▏         | 122/5566 [05:01<4:04:36,  2.70s/it]

training loss: 3.524679183959961


training:   2%|▏         | 123/5566 [05:03<3:52:08,  2.56s/it]

training loss: 3.509251594543457


training:   2%|▏         | 124/5566 [05:05<3:44:36,  2.48s/it]

training loss: 3.5398285388946533


training:   2%|▏         | 125/5566 [05:07<3:39:20,  2.42s/it]

training loss: 3.536311626434326


training:   2%|▏         | 126/5566 [05:10<3:34:20,  2.36s/it]

training loss: 3.5258677005767822


training:   2%|▏         | 127/5566 [05:12<3:30:47,  2.33s/it]

training loss: 3.51609206199646


training:   2%|▏         | 128/5566 [05:14<3:28:31,  2.30s/it]

training loss: 3.5248334407806396


training:   2%|▏         | 129/5566 [05:16<3:28:32,  2.30s/it]

training loss: 3.5250308513641357


training:   2%|▏         | 130/5566 [05:19<3:27:09,  2.29s/it]

training loss: 3.497001886367798


training:   2%|▏         | 131/5566 [05:21<3:26:15,  2.28s/it]

training loss: 3.550635576248169


training:   2%|▏         | 132/5566 [05:23<3:24:38,  2.26s/it]

training loss: 3.496206760406494


training:   2%|▏         | 133/5566 [05:25<3:24:11,  2.25s/it]

training loss: 3.5082743167877197


training:   2%|▏         | 134/5566 [05:28<3:23:44,  2.25s/it]

training loss: 3.535228967666626


training:   2%|▏         | 135/5566 [05:30<3:23:56,  2.25s/it]

training loss: 3.524559736251831


training:   2%|▏         | 136/5566 [05:32<3:24:09,  2.26s/it]

training loss: 3.528853178024292


training:   2%|▏         | 137/5566 [05:35<3:25:06,  2.27s/it]

training loss: 3.5257463455200195


training:   2%|▏         | 138/5566 [05:37<3:42:56,  2.46s/it]

training loss: 3.520045757293701


training:   2%|▏         | 139/5566 [05:40<3:38:39,  2.42s/it]

training loss: 3.5147182941436768


training:   3%|▎         | 140/5566 [05:42<3:35:10,  2.38s/it]

training loss: 3.5289981365203857
valid loss: 3.5278923511505127
perplexity: 34.052120208740234


training:   3%|▎         | 141/5566 [05:46<4:28:28,  2.97s/it]

training loss: 3.52886700630188


training:   3%|▎         | 142/5566 [05:49<4:10:37,  2.77s/it]

training loss: 3.516242027282715


training:   3%|▎         | 143/5566 [05:51<3:56:37,  2.62s/it]

training loss: 3.5281996726989746


training:   3%|▎         | 144/5566 [05:53<3:47:17,  2.52s/it]

training loss: 3.529928684234619


training:   3%|▎         | 145/5566 [05:55<3:40:49,  2.44s/it]

training loss: 3.536365270614624


training:   3%|▎         | 146/5566 [05:58<3:36:08,  2.39s/it]

training loss: 3.5344135761260986


training:   3%|▎         | 147/5566 [06:00<3:31:35,  2.34s/it]

training loss: 3.5213894844055176


training:   3%|▎         | 148/5566 [06:02<3:29:06,  2.32s/it]

training loss: 3.519585371017456


training:   3%|▎         | 149/5566 [06:05<3:27:26,  2.30s/it]

training loss: 3.5321543216705322


training:   3%|▎         | 150/5566 [06:07<3:26:07,  2.28s/it]

training loss: 3.4973182678222656


training:   3%|▎         | 151/5566 [06:09<3:24:34,  2.27s/it]

training loss: 3.5220413208007812


training:   3%|▎         | 152/5566 [06:11<3:24:34,  2.27s/it]

training loss: 3.5103602409362793


training:   3%|▎         | 153/5566 [06:13<3:23:49,  2.26s/it]

training loss: 3.5214362144470215


training:   3%|▎         | 154/5566 [06:16<3:23:06,  2.25s/it]

training loss: 3.5181589126586914


training:   3%|▎         | 155/5566 [06:18<3:23:58,  2.26s/it]

training loss: 3.4976861476898193


training:   3%|▎         | 156/5566 [06:20<3:24:01,  2.26s/it]

training loss: 3.4940412044525146


training:   3%|▎         | 157/5566 [06:23<3:24:24,  2.27s/it]

training loss: 3.525578737258911


training:   3%|▎         | 158/5566 [06:25<3:23:31,  2.26s/it]

training loss: 3.5541341304779053


training:   3%|▎         | 159/5566 [06:27<3:23:35,  2.26s/it]

training loss: 3.5227863788604736


training:   3%|▎         | 160/5566 [06:29<3:23:30,  2.26s/it]

training loss: 3.518350839614868
valid loss: 3.5152857303619385
perplexity: 33.62553405761719


training:   3%|▎         | 161/5566 [06:34<4:17:20,  2.86s/it]

training loss: 3.5133795738220215


training:   3%|▎         | 162/5566 [06:36<4:01:16,  2.68s/it]

training loss: 3.553286552429199


training:   3%|▎         | 163/5566 [06:38<3:49:47,  2.55s/it]

training loss: 3.5327301025390625


training:   3%|▎         | 164/5566 [06:40<3:41:19,  2.46s/it]

training loss: 3.5321998596191406


training:   3%|▎         | 165/5566 [06:43<3:34:56,  2.39s/it]

training loss: 3.5116958618164062


training:   3%|▎         | 166/5566 [06:45<3:30:42,  2.34s/it]

training loss: 3.5395116806030273


training:   3%|▎         | 167/5566 [06:47<3:28:03,  2.31s/it]

training loss: 3.5089268684387207


training:   3%|▎         | 168/5566 [06:49<3:27:11,  2.30s/it]

training loss: 3.528923273086548


training:   3%|▎         | 169/5566 [06:52<3:26:14,  2.29s/it]

training loss: 3.53751540184021


training:   3%|▎         | 170/5566 [06:54<3:24:51,  2.28s/it]

training loss: 3.545137643814087


training:   3%|▎         | 171/5566 [06:56<3:23:40,  2.27s/it]

training loss: 3.544090986251831


training:   3%|▎         | 172/5566 [06:58<3:22:59,  2.26s/it]

training loss: 3.525536060333252


training:   3%|▎         | 173/5566 [07:01<3:22:39,  2.25s/it]

training loss: 3.526228904724121


training:   3%|▎         | 174/5566 [07:03<3:22:13,  2.25s/it]

training loss: 3.530625343322754


training:   3%|▎         | 175/5566 [07:05<3:21:56,  2.25s/it]

training loss: 3.5225162506103516


training:   3%|▎         | 176/5566 [07:07<3:22:00,  2.25s/it]

training loss: 3.5137128829956055


training:   3%|▎         | 177/5566 [07:10<3:22:01,  2.25s/it]

training loss: 3.5237598419189453


training:   3%|▎         | 178/5566 [07:12<3:21:36,  2.25s/it]

training loss: 3.529651165008545


training:   3%|▎         | 179/5566 [07:14<3:22:17,  2.25s/it]

training loss: 3.521252155303955


training:   3%|▎         | 180/5566 [07:16<3:22:36,  2.26s/it]

training loss: 3.498506546020508
valid loss: 3.4964115619659424
perplexity: 32.99683380126953


training:   3%|▎         | 181/5566 [07:21<4:18:21,  2.88s/it]

training loss: 3.5389199256896973


training:   3%|▎         | 182/5566 [07:23<4:02:06,  2.70s/it]

training loss: 3.5188770294189453


training:   3%|▎         | 183/5566 [07:25<3:49:24,  2.56s/it]

training loss: 3.493840217590332


training:   3%|▎         | 184/5566 [07:27<3:41:14,  2.47s/it]

training loss: 3.515063762664795


training:   3%|▎         | 185/5566 [07:30<3:36:12,  2.41s/it]

training loss: 3.5129196643829346


training:   3%|▎         | 186/5566 [07:32<3:31:39,  2.36s/it]

training loss: 3.544434070587158


training:   3%|▎         | 187/5566 [07:34<3:28:16,  2.32s/it]

training loss: 3.50960111618042


training:   3%|▎         | 188/5566 [07:36<3:26:49,  2.31s/it]

training loss: 3.5033912658691406


training:   3%|▎         | 189/5566 [07:39<3:25:10,  2.29s/it]

training loss: 3.5141799449920654


training:   3%|▎         | 190/5566 [07:41<3:24:16,  2.28s/it]

training loss: 3.520286798477173


training:   3%|▎         | 191/5566 [07:43<3:23:21,  2.27s/it]

training loss: 3.5125603675842285


training:   3%|▎         | 192/5566 [07:45<3:22:45,  2.26s/it]

training loss: 3.524343967437744


training:   3%|▎         | 193/5566 [07:48<3:22:51,  2.27s/it]

training loss: 3.52691912651062


training:   3%|▎         | 194/5566 [07:50<3:23:53,  2.28s/it]

training loss: 3.506443977355957


training:   4%|▎         | 195/5566 [07:52<3:22:41,  2.26s/it]

training loss: 3.5336482524871826


training:   4%|▎         | 196/5566 [07:54<3:21:43,  2.25s/it]

training loss: 3.53178071975708


training:   4%|▎         | 197/5566 [07:57<3:22:19,  2.26s/it]

training loss: 3.5183348655700684


training:   4%|▎         | 198/5566 [07:59<3:21:23,  2.25s/it]

training loss: 3.5105743408203125


training:   4%|▎         | 199/5566 [08:01<3:21:08,  2.25s/it]

training loss: 3.5177276134490967


training:   4%|▎         | 200/5566 [08:03<3:21:07,  2.25s/it]

training loss: 3.513054132461548
valid loss: 3.5109188556671143
perplexity: 33.4790153503418


training:   4%|▎         | 201/5566 [08:08<4:35:42,  3.08s/it]

training loss: 3.51533842086792


training:   4%|▎         | 202/5566 [08:11<4:17:24,  2.88s/it]

training loss: 3.5194549560546875


training:   4%|▎         | 203/5566 [08:13<4:00:06,  2.69s/it]

training loss: 3.56184983253479


training:   4%|▎         | 204/5566 [08:15<3:48:21,  2.56s/it]

training loss: 3.524418592453003


training:   4%|▎         | 205/5566 [08:18<3:40:07,  2.46s/it]

training loss: 3.527775526046753


training:   4%|▎         | 206/5566 [08:20<3:33:59,  2.40s/it]

training loss: 3.5521438121795654


training:   4%|▎         | 207/5566 [08:22<3:29:47,  2.35s/it]

training loss: 3.523855209350586


training:   4%|▎         | 208/5566 [08:24<3:27:11,  2.32s/it]

training loss: 3.5305445194244385


training:   4%|▍         | 209/5566 [08:27<3:25:49,  2.31s/it]

training loss: 3.5284571647644043


training:   4%|▍         | 210/5566 [08:29<3:23:53,  2.28s/it]

training loss: 3.5080726146698


training:   4%|▍         | 211/5566 [08:31<3:23:11,  2.28s/it]

training loss: 3.53425669670105


training:   4%|▍         | 212/5566 [08:33<3:21:42,  2.26s/it]

training loss: 3.5400912761688232


training:   4%|▍         | 213/5566 [08:36<3:22:21,  2.27s/it]

training loss: 3.5174882411956787


training:   4%|▍         | 214/5566 [08:38<3:21:35,  2.26s/it]

training loss: 3.547619581222534


training:   4%|▍         | 215/5566 [08:40<3:21:22,  2.26s/it]

training loss: 3.513223648071289


training:   4%|▍         | 216/5566 [08:42<3:20:57,  2.25s/it]

training loss: 3.5268070697784424


training:   4%|▍         | 217/5566 [08:45<3:22:17,  2.27s/it]

training loss: 3.5346572399139404


training:   4%|▍         | 218/5566 [08:48<3:37:43,  2.44s/it]

training loss: 3.5228445529937744


training:   4%|▍         | 219/5566 [08:50<3:33:41,  2.40s/it]

training loss: 3.513798713684082


training:   4%|▍         | 220/5566 [08:52<3:30:17,  2.36s/it]

training loss: 3.519906759262085
valid loss: 3.518752336502075
perplexity: 33.74230194091797


training:   4%|▍         | 221/5566 [08:57<4:31:51,  3.05s/it]

training loss: 3.539046287536621


training:   4%|▍         | 222/5566 [08:59<4:11:02,  2.82s/it]

training loss: 3.53328275680542


training:   4%|▍         | 223/5566 [09:01<3:56:19,  2.65s/it]

training loss: 3.552621364593506


training:   4%|▍         | 224/5566 [09:04<3:45:06,  2.53s/it]

training loss: 3.4877583980560303


training:   4%|▍         | 225/5566 [09:06<3:37:28,  2.44s/it]

training loss: 3.506963014602661


training:   4%|▍         | 226/5566 [09:08<3:32:34,  2.39s/it]

training loss: 3.535046100616455


training:   4%|▍         | 227/5566 [09:10<3:28:31,  2.34s/it]

training loss: 3.5318539142608643


training:   4%|▍         | 228/5566 [09:13<3:25:50,  2.31s/it]

training loss: 3.4947569370269775


training:   4%|▍         | 229/5566 [09:15<3:23:56,  2.29s/it]

training loss: 3.54038667678833


training:   4%|▍         | 230/5566 [09:17<3:22:15,  2.27s/it]

training loss: 3.536180019378662


training:   4%|▍         | 231/5566 [09:19<3:21:48,  2.27s/it]

training loss: 3.514939069747925


training:   4%|▍         | 232/5566 [09:21<3:20:53,  2.26s/it]

training loss: 3.526047706604004


training:   4%|▍         | 233/5566 [09:24<3:21:01,  2.26s/it]

training loss: 3.511435031890869


training:   4%|▍         | 234/5566 [09:26<3:20:26,  2.26s/it]

training loss: 3.4996871948242188


training:   4%|▍         | 235/5566 [09:28<3:19:41,  2.25s/it]

training loss: 3.5265371799468994


training:   4%|▍         | 236/5566 [09:30<3:19:59,  2.25s/it]

training loss: 3.5016660690307617


training:   4%|▍         | 237/5566 [09:33<3:20:41,  2.26s/it]

training loss: 3.53715443611145


training:   4%|▍         | 238/5566 [09:35<3:19:50,  2.25s/it]

training loss: 3.5338971614837646


training:   4%|▍         | 239/5566 [09:37<3:19:33,  2.25s/it]

training loss: 3.50876522064209


training:   4%|▍         | 240/5566 [09:39<3:19:44,  2.25s/it]

training loss: 3.5261940956115723
valid loss: 3.5237019062042236
perplexity: 33.90972900390625


training:   4%|▍         | 241/5566 [09:44<4:12:52,  2.85s/it]

training loss: 3.5205090045928955


training:   4%|▍         | 242/5566 [09:46<3:56:20,  2.66s/it]

training loss: 3.5290184020996094


training:   4%|▍         | 243/5566 [09:48<3:44:37,  2.53s/it]

training loss: 3.497907876968384


training:   4%|▍         | 244/5566 [09:50<3:36:37,  2.44s/it]

training loss: 3.520749092102051


training:   4%|▍         | 245/5566 [09:53<3:31:04,  2.38s/it]

training loss: 3.5225977897644043


training:   4%|▍         | 246/5566 [09:55<3:26:56,  2.33s/it]

training loss: 3.5276875495910645


training:   4%|▍         | 247/5566 [09:57<3:24:48,  2.31s/it]

training loss: 3.5581560134887695


training:   4%|▍         | 248/5566 [09:59<3:22:53,  2.29s/it]

training loss: 3.4987733364105225


training:   4%|▍         | 249/5566 [10:02<3:21:39,  2.28s/it]

training loss: 3.5152699947357178


training:   4%|▍         | 250/5566 [10:04<3:21:09,  2.27s/it]

training loss: 3.5117223262786865


training:   5%|▍         | 251/5566 [10:06<3:21:22,  2.27s/it]

training loss: 3.510258436203003


training:   5%|▍         | 252/5566 [10:08<3:19:58,  2.26s/it]

training loss: 3.488001823425293


training:   5%|▍         | 253/5566 [10:11<3:19:18,  2.25s/it]

training loss: 3.51019549369812


training:   5%|▍         | 254/5566 [10:13<3:18:57,  2.25s/it]

training loss: 3.525163173675537


training:   5%|▍         | 255/5566 [10:15<3:19:00,  2.25s/it]

training loss: 3.4874494075775146


training:   5%|▍         | 256/5566 [10:17<3:19:11,  2.25s/it]

training loss: 3.5196919441223145


training:   5%|▍         | 257/5566 [10:20<3:18:10,  2.24s/it]

training loss: 3.5328197479248047


training:   5%|▍         | 258/5566 [10:22<3:18:38,  2.25s/it]

training loss: 3.5100302696228027


training:   5%|▍         | 259/5566 [10:24<3:18:55,  2.25s/it]

training loss: 3.4945056438446045


training:   5%|▍         | 260/5566 [10:26<3:19:47,  2.26s/it]

training loss: 3.5030384063720703
valid loss: 3.502045154571533
perplexity: 33.18324661254883


training:   5%|▍         | 261/5566 [10:31<4:13:15,  2.86s/it]

training loss: 3.511324644088745


training:   5%|▍         | 262/5566 [10:33<3:55:52,  2.67s/it]

training loss: 3.5314133167266846


training:   5%|▍         | 263/5566 [10:35<3:44:46,  2.54s/it]

training loss: 3.5212414264678955


training:   5%|▍         | 264/5566 [10:37<3:36:28,  2.45s/it]

training loss: 3.543017625808716


training:   5%|▍         | 265/5566 [10:40<3:30:26,  2.38s/it]

training loss: 3.5450990200042725


training:   5%|▍         | 266/5566 [10:42<3:27:14,  2.35s/it]

training loss: 3.543285369873047


training:   5%|▍         | 267/5566 [10:44<3:24:56,  2.32s/it]

training loss: 3.51900577545166


training:   5%|▍         | 268/5566 [10:46<3:23:18,  2.30s/it]

training loss: 3.508892059326172


training:   5%|▍         | 269/5566 [10:49<3:22:08,  2.29s/it]

training loss: 3.5409154891967773


training:   5%|▍         | 270/5566 [10:51<3:21:24,  2.28s/it]

training loss: 3.5345170497894287


training:   5%|▍         | 271/5566 [10:53<3:20:42,  2.27s/it]

training loss: 3.496413469314575


training:   5%|▍         | 272/5566 [10:55<3:20:03,  2.27s/it]

training loss: 3.4784133434295654


training:   5%|▍         | 273/5566 [10:58<3:20:25,  2.27s/it]

training loss: 3.5242629051208496


training:   5%|▍         | 274/5566 [11:00<3:20:09,  2.27s/it]

training loss: 3.5121006965637207


training:   5%|▍         | 275/5566 [11:02<3:19:48,  2.27s/it]

training loss: 3.522474765777588


training:   5%|▍         | 276/5566 [11:04<3:19:20,  2.26s/it]

training loss: 3.5126953125


training:   5%|▍         | 277/5566 [11:07<3:19:04,  2.26s/it]

training loss: 3.5387017726898193


training:   5%|▍         | 278/5566 [11:09<3:19:07,  2.26s/it]

training loss: 3.5137112140655518


training:   5%|▌         | 279/5566 [11:11<3:18:00,  2.25s/it]

training loss: 3.5040953159332275


training:   5%|▌         | 280/5566 [11:13<3:18:04,  2.25s/it]

training loss: 3.537353754043579
valid loss: 3.53590726852417
perplexity: 34.32614517211914


training:   5%|▌         | 281/5566 [11:18<4:11:03,  2.85s/it]

training loss: 3.538309097290039


training:   5%|▌         | 282/5566 [11:20<3:55:01,  2.67s/it]

training loss: 3.5046145915985107


training:   5%|▌         | 283/5566 [11:22<3:43:01,  2.53s/it]

training loss: 3.5268609523773193


training:   5%|▌         | 284/5566 [11:24<3:34:37,  2.44s/it]

training loss: 3.5325188636779785


training:   5%|▌         | 285/5566 [11:27<3:29:36,  2.38s/it]

training loss: 3.5276131629943848


training:   5%|▌         | 286/5566 [11:29<3:26:14,  2.34s/it]

training loss: 3.5228915214538574


training:   5%|▌         | 287/5566 [11:31<3:24:01,  2.32s/it]

training loss: 3.5269250869750977


training:   5%|▌         | 288/5566 [11:33<3:21:55,  2.30s/it]

training loss: 3.5085935592651367


training:   5%|▌         | 289/5566 [11:36<3:20:03,  2.27s/it]

training loss: 3.5333549976348877


training:   5%|▌         | 290/5566 [11:38<3:18:57,  2.26s/it]

training loss: 3.5311403274536133


training:   5%|▌         | 291/5566 [11:40<3:19:01,  2.26s/it]

training loss: 3.5041234493255615


training:   5%|▌         | 292/5566 [11:42<3:19:13,  2.27s/it]

training loss: 3.521927833557129


training:   5%|▌         | 293/5566 [11:45<3:30:46,  2.40s/it]

training loss: 3.529604911804199


training:   5%|▌         | 294/5566 [11:49<4:02:05,  2.76s/it]

training loss: 3.518495559692383


training:   5%|▌         | 295/5566 [11:51<3:49:30,  2.61s/it]

training loss: 3.5410492420196533


training:   5%|▌         | 296/5566 [11:53<3:40:21,  2.51s/it]

training loss: 3.5174574851989746


training:   5%|▌         | 297/5566 [11:55<3:33:10,  2.43s/it]

training loss: 3.538304567337036


training:   5%|▌         | 298/5566 [11:58<3:43:24,  2.54s/it]

training loss: 3.518373966217041


training:   5%|▌         | 299/5566 [12:01<3:43:52,  2.55s/it]

training loss: 3.52641224861145


training:   5%|▌         | 300/5566 [12:03<3:36:19,  2.46s/it]

training loss: 3.526799201965332
valid loss: 3.525646924972534
perplexity: 33.975746154785156


training:   5%|▌         | 301/5566 [12:08<4:44:06,  3.24s/it]

training loss: 3.5144288539886475


training:   5%|▌         | 302/5566 [12:11<4:21:24,  2.98s/it]

training loss: 3.5387473106384277


training:   5%|▌         | 303/5566 [12:13<4:02:08,  2.76s/it]

training loss: 3.5396416187286377


training:   5%|▌         | 304/5566 [12:15<3:48:21,  2.60s/it]

training loss: 3.505685806274414


training:   5%|▌         | 305/5566 [12:17<3:39:41,  2.51s/it]

training loss: 3.502286672592163


training:   5%|▌         | 306/5566 [12:20<3:32:51,  2.43s/it]

training loss: 3.5077707767486572


training:   6%|▌         | 307/5566 [12:22<3:28:46,  2.38s/it]

training loss: 3.531334400177002


training:   6%|▌         | 308/5566 [12:24<3:25:53,  2.35s/it]

training loss: 3.5195183753967285


training:   6%|▌         | 309/5566 [12:26<3:23:36,  2.32s/it]

training loss: 3.516446590423584


training:   6%|▌         | 310/5566 [12:29<3:20:57,  2.29s/it]

training loss: 3.512080669403076


training:   6%|▌         | 311/5566 [12:31<3:28:58,  2.39s/it]

training loss: 3.5462417602539062


training:   6%|▌         | 312/5566 [12:33<3:24:59,  2.34s/it]

training loss: 3.5006654262542725


training:   6%|▌         | 313/5566 [12:36<3:22:39,  2.31s/it]

training loss: 3.5445597171783447


training:   6%|▌         | 314/5566 [12:38<3:22:37,  2.31s/it]

training loss: 3.520495653152466


training:   6%|▌         | 315/5566 [12:40<3:21:03,  2.30s/it]

training loss: 3.5223586559295654


training:   6%|▌         | 316/5566 [12:42<3:19:27,  2.28s/it]

training loss: 3.539130687713623


training:   6%|▌         | 317/5566 [12:45<3:19:06,  2.28s/it]

training loss: 3.5291292667388916


training:   6%|▌         | 318/5566 [12:47<3:18:22,  2.27s/it]

training loss: 3.528139352798462


training:   6%|▌         | 319/5566 [12:49<3:18:10,  2.27s/it]

training loss: 3.522174119949341


training:   6%|▌         | 320/5566 [12:52<3:18:39,  2.27s/it]

training loss: 3.5191760063171387
valid loss: 3.518040180206299
perplexity: 33.718284606933594


training:   6%|▌         | 321/5566 [12:58<4:56:13,  3.39s/it]

training loss: 3.5134944915771484


training:   6%|▌         | 322/5566 [13:00<4:27:17,  3.06s/it]

training loss: 3.5207531452178955


training:   6%|▌         | 323/5566 [13:02<4:05:43,  2.81s/it]

training loss: 3.554213047027588


training:   6%|▌         | 324/5566 [13:04<3:50:41,  2.64s/it]

training loss: 3.5145139694213867


training:   6%|▌         | 325/5566 [13:07<3:40:05,  2.52s/it]

training loss: 3.501779556274414


training:   6%|▌         | 326/5566 [13:09<3:32:42,  2.44s/it]

training loss: 3.548882007598877


training:   6%|▌         | 327/5566 [13:11<3:27:42,  2.38s/it]

training loss: 3.528438091278076


training:   6%|▌         | 328/5566 [13:13<3:24:20,  2.34s/it]

training loss: 3.502228260040283


training:   6%|▌         | 329/5566 [13:15<3:21:12,  2.31s/it]

training loss: 3.5289247035980225


training:   6%|▌         | 330/5566 [13:18<3:18:48,  2.28s/it]

training loss: 3.5017647743225098


training:   6%|▌         | 331/5566 [13:20<3:18:18,  2.27s/it]

training loss: 3.5408496856689453


training:   6%|▌         | 332/5566 [13:22<3:17:13,  2.26s/it]

training loss: 3.5244410037994385


training:   6%|▌         | 333/5566 [13:24<3:17:49,  2.27s/it]

training loss: 3.5405852794647217


training:   6%|▌         | 334/5566 [13:27<3:18:01,  2.27s/it]

training loss: 3.535090684890747


training:   6%|▌         | 335/5566 [13:29<3:17:20,  2.26s/it]

training loss: 3.5523464679718018


training:   6%|▌         | 336/5566 [13:31<3:17:13,  2.26s/it]

training loss: 3.529280185699463


training:   6%|▌         | 337/5566 [13:33<3:16:37,  2.26s/it]

training loss: 3.515904188156128


training:   6%|▌         | 338/5566 [13:36<3:17:03,  2.26s/it]

training loss: 3.5294787883758545


training:   6%|▌         | 339/5566 [13:38<3:16:19,  2.25s/it]

training loss: 3.526048183441162


training:   6%|▌         | 340/5566 [13:40<3:15:11,  2.24s/it]

training loss: 3.5212502479553223
valid loss: 3.520505905151367
perplexity: 33.8015251159668


training:   6%|▌         | 341/5566 [13:45<4:20:05,  2.99s/it]

training loss: 3.5026376247406006


training:   6%|▌         | 342/5566 [13:47<4:01:33,  2.77s/it]

training loss: 3.5213310718536377


training:   6%|▌         | 343/5566 [13:49<3:47:21,  2.61s/it]

training loss: 3.5409042835235596


training:   6%|▌         | 344/5566 [13:52<3:38:15,  2.51s/it]

training loss: 3.5078866481781006


training:   6%|▌         | 345/5566 [13:54<3:30:52,  2.42s/it]

training loss: 3.527876853942871


training:   6%|▌         | 346/5566 [13:56<3:26:13,  2.37s/it]

training loss: 3.4863317012786865


training:   6%|▌         | 347/5566 [13:58<3:23:02,  2.33s/it]

training loss: 3.5426619052886963


training:   6%|▋         | 348/5566 [14:01<3:21:17,  2.31s/it]

training loss: 3.5035929679870605


training:   6%|▋         | 349/5566 [14:03<3:19:54,  2.30s/it]

training loss: 3.526050567626953


training:   6%|▋         | 350/5566 [14:05<3:18:19,  2.28s/it]

training loss: 3.5067968368530273


training:   6%|▋         | 351/5566 [14:07<3:17:13,  2.27s/it]

training loss: 3.5230724811553955


training:   6%|▋         | 352/5566 [14:10<3:16:00,  2.26s/it]

training loss: 3.5287511348724365


training:   6%|▋         | 353/5566 [14:12<3:15:04,  2.25s/it]

training loss: 3.5066909790039062


training:   6%|▋         | 354/5566 [14:14<3:15:07,  2.25s/it]

training loss: 3.537202835083008


training:   6%|▋         | 355/5566 [14:16<3:15:17,  2.25s/it]

training loss: 3.5309269428253174


training:   6%|▋         | 356/5566 [14:19<3:15:02,  2.25s/it]

training loss: 3.5477027893066406


training:   6%|▋         | 357/5566 [14:21<3:15:26,  2.25s/it]

training loss: 3.51053524017334


training:   6%|▋         | 358/5566 [14:23<3:15:01,  2.25s/it]

training loss: 3.5166752338409424


training:   6%|▋         | 359/5566 [14:25<3:15:09,  2.25s/it]

training loss: 3.552980899810791


training:   6%|▋         | 360/5566 [14:28<3:15:50,  2.26s/it]

training loss: 3.5250797271728516
valid loss: 3.524061679840088
perplexity: 33.92192840576172


training:   6%|▋         | 361/5566 [14:32<4:08:35,  2.87s/it]

training loss: 3.5400965213775635


training:   7%|▋         | 362/5566 [14:34<3:52:55,  2.69s/it]

training loss: 3.5379068851470947


training:   7%|▋         | 363/5566 [14:36<3:41:17,  2.55s/it]

training loss: 3.5119998455047607


training:   7%|▋         | 364/5566 [14:39<3:32:56,  2.46s/it]

training loss: 3.544726610183716


training:   7%|▋         | 365/5566 [14:41<3:27:14,  2.39s/it]

training loss: 3.5273494720458984


training:   7%|▋         | 366/5566 [14:43<3:23:31,  2.35s/it]

training loss: 3.5102860927581787


training:   7%|▋         | 367/5566 [14:45<3:21:03,  2.32s/it]

training loss: 3.5159363746643066


training:   7%|▋         | 368/5566 [14:48<3:18:43,  2.29s/it]

training loss: 3.5174269676208496


training:   7%|▋         | 369/5566 [14:50<3:17:22,  2.28s/it]

training loss: 3.5181469917297363


training:   7%|▋         | 370/5566 [14:52<3:16:07,  2.26s/it]

training loss: 3.5550012588500977


training:   7%|▋         | 371/5566 [14:54<3:15:39,  2.26s/it]

training loss: 3.5183801651000977


training:   7%|▋         | 372/5566 [14:57<3:15:10,  2.25s/it]

training loss: 3.532576084136963


training:   7%|▋         | 373/5566 [14:59<3:15:01,  2.25s/it]

training loss: 3.5243144035339355


training:   7%|▋         | 374/5566 [15:01<3:15:42,  2.26s/it]

training loss: 3.549412727355957


training:   7%|▋         | 375/5566 [15:03<3:15:10,  2.26s/it]

training loss: 3.525127410888672


training:   7%|▋         | 376/5566 [15:06<3:14:24,  2.25s/it]

training loss: 3.5163044929504395


training:   7%|▋         | 377/5566 [15:08<3:14:35,  2.25s/it]

training loss: 3.555107355117798


training:   7%|▋         | 378/5566 [15:10<3:13:53,  2.24s/it]

training loss: 3.5216028690338135


training:   7%|▋         | 379/5566 [15:13<3:30:34,  2.44s/it]

training loss: 3.492828845977783


training:   7%|▋         | 380/5566 [15:15<3:27:30,  2.40s/it]

training loss: 3.527756452560425
valid loss: 3.525965690612793
perplexity: 33.98657989501953


training:   7%|▋         | 381/5566 [15:20<4:16:16,  2.97s/it]

training loss: 3.5128350257873535


training:   7%|▋         | 382/5566 [15:22<3:58:06,  2.76s/it]

training loss: 3.5301263332366943


training:   7%|▋         | 383/5566 [15:24<3:46:36,  2.62s/it]

training loss: 3.5231881141662598


training:   7%|▋         | 384/5566 [15:27<3:39:21,  2.54s/it]

training loss: 3.5465950965881348


training:   7%|▋         | 385/5566 [15:29<3:32:32,  2.46s/it]

training loss: 3.5139260292053223


training:   7%|▋         | 386/5566 [15:31<3:27:06,  2.40s/it]

training loss: 3.510803461074829


training:   7%|▋         | 387/5566 [15:33<3:24:34,  2.37s/it]

training loss: 3.522463798522949


training:   7%|▋         | 388/5566 [15:36<3:21:28,  2.33s/it]

training loss: 3.5296993255615234


training:   7%|▋         | 389/5566 [15:38<3:18:59,  2.31s/it]

training loss: 3.512791872024536


training:   7%|▋         | 390/5566 [15:40<3:17:54,  2.29s/it]

training loss: 3.542623519897461


training:   7%|▋         | 391/5566 [15:42<3:19:02,  2.31s/it]

training loss: 3.536439895629883


training:   7%|▋         | 392/5566 [15:45<3:17:39,  2.29s/it]

training loss: 3.5270955562591553


training:   7%|▋         | 393/5566 [15:47<3:17:04,  2.29s/it]

training loss: 3.5365657806396484


training:   7%|▋         | 394/5566 [15:49<3:16:35,  2.28s/it]

training loss: 3.521383047103882


training:   7%|▋         | 395/5566 [15:52<3:16:03,  2.27s/it]

training loss: 3.5276331901550293


training:   7%|▋         | 396/5566 [15:54<3:16:00,  2.27s/it]

training loss: 3.5302746295928955


training:   7%|▋         | 397/5566 [15:56<3:16:20,  2.28s/it]

training loss: 3.4946229457855225


training:   7%|▋         | 398/5566 [15:58<3:16:25,  2.28s/it]

training loss: 3.514996290206909


training:   7%|▋         | 399/5566 [16:02<3:53:36,  2.71s/it]

training loss: 3.5116279125213623


training:   7%|▋         | 400/5566 [16:07<5:01:55,  3.51s/it]

training loss: 3.526262044906616
valid loss: 3.526339530944824
perplexity: 33.99928665161133


training:   7%|▋         | 401/5566 [16:15<6:40:23,  4.65s/it]

training loss: 3.5198872089385986


training:   7%|▋         | 402/5566 [16:17<5:41:43,  3.97s/it]

training loss: 3.525813579559326


training:   7%|▋         | 403/5566 [16:19<4:57:48,  3.46s/it]

training loss: 3.493673801422119


training:   7%|▋         | 404/5566 [16:22<4:25:55,  3.09s/it]

training loss: 3.493185043334961


training:   7%|▋         | 405/5566 [16:24<4:04:41,  2.84s/it]

training loss: 3.5046019554138184


training:   7%|▋         | 406/5566 [16:26<3:48:38,  2.66s/it]

training loss: 3.5302951335906982


training:   7%|▋         | 407/5566 [16:28<3:37:46,  2.53s/it]

training loss: 3.543768882751465


training:   7%|▋         | 408/5566 [16:31<3:30:37,  2.45s/it]

training loss: 3.5017731189727783


training:   7%|▋         | 409/5566 [16:33<3:25:02,  2.39s/it]

training loss: 3.5275161266326904


training:   7%|▋         | 410/5566 [16:35<3:20:59,  2.34s/it]

training loss: 3.5017833709716797


training:   7%|▋         | 411/5566 [16:37<3:19:11,  2.32s/it]

training loss: 3.5282788276672363


training:   7%|▋         | 412/5566 [16:40<3:17:25,  2.30s/it]

training loss: 3.504310131072998


training:   7%|▋         | 413/5566 [16:42<3:15:31,  2.28s/it]

training loss: 3.5150721073150635


training:   7%|▋         | 414/5566 [16:44<3:14:44,  2.27s/it]

training loss: 3.530588388442993


training:   7%|▋         | 415/5566 [16:46<3:14:00,  2.26s/it]

training loss: 3.5085577964782715


training:   7%|▋         | 416/5566 [16:49<3:13:13,  2.25s/it]

training loss: 3.518629312515259


training:   7%|▋         | 417/5566 [16:51<3:12:43,  2.25s/it]

training loss: 3.511051893234253


training:   8%|▊         | 418/5566 [16:53<3:12:31,  2.24s/it]

training loss: 3.5115132331848145


training:   8%|▊         | 419/5566 [16:55<3:14:30,  2.27s/it]

training loss: 3.5289974212646484


training:   8%|▊         | 420/5566 [16:58<3:13:33,  2.26s/it]

training loss: 3.5099873542785645
valid loss: 3.5088489055633545
perplexity: 33.4097900390625


training:   8%|▊         | 421/5566 [17:02<4:05:54,  2.87s/it]

training loss: 3.5404393672943115


training:   8%|▊         | 422/5566 [17:04<3:50:57,  2.69s/it]

training loss: 3.516512393951416


training:   8%|▊         | 423/5566 [17:06<3:39:38,  2.56s/it]

training loss: 3.529611587524414


training:   8%|▊         | 424/5566 [17:09<3:31:35,  2.47s/it]

training loss: 3.5187876224517822


training:   8%|▊         | 425/5566 [17:11<3:25:28,  2.40s/it]

training loss: 3.547938346862793


training:   8%|▊         | 426/5566 [17:13<3:21:46,  2.36s/it]

training loss: 3.5061233043670654


training:   8%|▊         | 427/5566 [17:15<3:19:20,  2.33s/it]

training loss: 3.5160608291625977


training:   8%|▊         | 428/5566 [17:18<3:17:19,  2.30s/it]

training loss: 3.535691976547241


training:   8%|▊         | 429/5566 [17:20<3:28:51,  2.44s/it]

training loss: 3.5260725021362305


training:   8%|▊         | 430/5566 [17:23<3:23:50,  2.38s/it]

training loss: 3.520447254180908


training:   8%|▊         | 431/5566 [17:25<3:20:10,  2.34s/it]

training loss: 3.504593849182129


training:   8%|▊         | 432/5566 [17:27<3:18:12,  2.32s/it]

training loss: 3.4999265670776367


training:   8%|▊         | 433/5566 [17:29<3:16:31,  2.30s/it]

training loss: 3.5265047550201416


training:   8%|▊         | 434/5566 [17:32<3:14:37,  2.28s/it]

training loss: 3.519876003265381


training:   8%|▊         | 435/5566 [17:34<3:13:30,  2.26s/it]

training loss: 3.5097122192382812


training:   8%|▊         | 436/5566 [17:36<3:12:27,  2.25s/it]

training loss: 3.507136821746826


training:   8%|▊         | 437/5566 [17:38<3:12:25,  2.25s/it]

training loss: 3.517643690109253


training:   8%|▊         | 438/5566 [17:41<3:11:55,  2.25s/it]

training loss: 3.532390832901001


training:   8%|▊         | 439/5566 [17:43<3:12:17,  2.25s/it]

training loss: 3.538360357284546


training:   8%|▊         | 440/5566 [17:45<3:11:51,  2.25s/it]

training loss: 3.513040781021118
valid loss: 3.5117900371551514
perplexity: 33.50819396972656


training:   8%|▊         | 441/5566 [17:49<4:03:32,  2.85s/it]

training loss: 3.4951894283294678


training:   8%|▊         | 442/5566 [17:52<3:47:59,  2.67s/it]

training loss: 3.5258264541625977


training:   8%|▊         | 443/5566 [17:54<3:36:21,  2.53s/it]

training loss: 3.497304916381836


training:   8%|▊         | 444/5566 [17:56<3:28:59,  2.45s/it]

training loss: 3.5255885124206543


training:   8%|▊         | 445/5566 [17:58<3:23:47,  2.39s/it]

training loss: 3.540921449661255


training:   8%|▊         | 446/5566 [18:01<3:20:54,  2.35s/it]

training loss: 3.496058225631714


training:   8%|▊         | 447/5566 [18:03<3:17:58,  2.32s/it]

training loss: 3.5589427947998047


training:   8%|▊         | 448/5566 [18:05<3:16:21,  2.30s/it]

training loss: 3.515293836593628


training:   8%|▊         | 449/5566 [18:07<3:14:54,  2.29s/it]

training loss: 3.524601459503174


training:   8%|▊         | 450/5566 [18:10<3:14:03,  2.28s/it]

training loss: 3.5455305576324463


training:   8%|▊         | 451/5566 [18:12<3:13:19,  2.27s/it]

training loss: 3.5253918170928955


training:   8%|▊         | 452/5566 [18:14<3:12:48,  2.26s/it]

training loss: 3.5302011966705322


training:   8%|▊         | 453/5566 [18:16<3:11:30,  2.25s/it]

training loss: 3.5419998168945312


training:   8%|▊         | 454/5566 [18:19<3:11:33,  2.25s/it]

training loss: 3.487197160720825


training:   8%|▊         | 455/5566 [18:21<3:11:28,  2.25s/it]

training loss: 3.529195785522461


training:   8%|▊         | 456/5566 [18:23<3:10:29,  2.24s/it]

training loss: 3.5134212970733643


training:   8%|▊         | 457/5566 [18:25<3:11:34,  2.25s/it]

training loss: 3.5302093029022217


training:   8%|▊         | 458/5566 [18:28<3:12:09,  2.26s/it]

training loss: 3.5279738903045654


training:   8%|▊         | 459/5566 [18:30<3:11:35,  2.25s/it]

training loss: 3.523145914077759


training:   8%|▊         | 460/5566 [18:32<3:11:08,  2.25s/it]

training loss: 3.5266811847686768
valid loss: 3.525747537612915
perplexity: 33.97916793823242


training:   8%|▊         | 461/5566 [18:36<4:02:56,  2.86s/it]

training loss: 3.5467209815979004


training:   8%|▊         | 462/5566 [18:39<3:48:12,  2.68s/it]

training loss: 3.525205135345459


training:   8%|▊         | 463/5566 [18:41<3:52:42,  2.74s/it]

training loss: 3.5333824157714844


training:   8%|▊         | 464/5566 [18:44<3:42:03,  2.61s/it]

training loss: 3.5197153091430664


training:   8%|▊         | 465/5566 [18:46<3:31:44,  2.49s/it]

training loss: 3.5041284561157227


training:   8%|▊         | 466/5566 [18:48<3:25:23,  2.42s/it]

training loss: 3.5267531871795654


training:   8%|▊         | 467/5566 [18:50<3:21:20,  2.37s/it]

training loss: 3.5067570209503174


training:   8%|▊         | 468/5566 [18:53<3:18:37,  2.34s/it]

training loss: 3.5144448280334473


training:   8%|▊         | 469/5566 [18:55<3:15:20,  2.30s/it]

training loss: 3.5253446102142334


training:   8%|▊         | 470/5566 [18:57<3:13:16,  2.28s/it]

training loss: 3.531435251235962


training:   8%|▊         | 471/5566 [18:59<3:12:24,  2.27s/it]

training loss: 3.5095579624176025


training:   8%|▊         | 472/5566 [19:02<3:12:18,  2.27s/it]

training loss: 3.5369973182678223


training:   8%|▊         | 473/5566 [19:04<3:11:49,  2.26s/it]

training loss: 3.528026580810547


training:   9%|▊         | 474/5566 [19:06<3:10:52,  2.25s/it]

training loss: 3.5236964225769043


training:   9%|▊         | 475/5566 [19:08<3:10:22,  2.24s/it]

training loss: 3.5303311347961426


training:   9%|▊         | 476/5566 [19:11<3:09:55,  2.24s/it]

training loss: 3.5189454555511475


training:   9%|▊         | 477/5566 [19:13<3:10:27,  2.25s/it]

training loss: 3.5361154079437256


training:   9%|▊         | 478/5566 [19:15<3:10:21,  2.24s/it]

training loss: 3.5332329273223877


training:   9%|▊         | 479/5566 [19:17<3:09:43,  2.24s/it]

training loss: 3.5214626789093018


training:   9%|▊         | 480/5566 [19:20<3:09:33,  2.24s/it]

training loss: 3.5330541133880615
valid loss: 3.531921148300171
perplexity: 34.1895866394043


training:   9%|▊         | 481/5566 [19:24<4:00:50,  2.84s/it]

training loss: 3.519874095916748


training:   9%|▊         | 482/5566 [19:26<3:46:08,  2.67s/it]

training loss: 3.5093185901641846


training:   9%|▊         | 483/5566 [19:28<3:35:17,  2.54s/it]

training loss: 3.5400006771087646


training:   9%|▊         | 484/5566 [19:31<3:27:59,  2.46s/it]

training loss: 3.540929079055786


training:   9%|▊         | 485/5566 [19:33<3:22:12,  2.39s/it]

training loss: 3.5084304809570312


training:   9%|▊         | 486/5566 [19:35<3:17:47,  2.34s/it]

training loss: 3.5248043537139893


training:   9%|▊         | 487/5566 [19:37<3:15:45,  2.31s/it]

training loss: 3.547220230102539


training:   9%|▉         | 488/5566 [19:40<3:13:59,  2.29s/it]

training loss: 3.5110201835632324


training:   9%|▉         | 489/5566 [19:42<3:12:40,  2.28s/it]

training loss: 3.528867483139038


training:   9%|▉         | 490/5566 [19:44<3:12:46,  2.28s/it]

training loss: 3.5289411544799805


training:   9%|▉         | 491/5566 [19:46<3:11:41,  2.27s/it]

training loss: 3.5377392768859863


training:   9%|▉         | 492/5566 [19:49<3:11:29,  2.26s/it]

training loss: 3.5079824924468994


training:   9%|▉         | 493/5566 [19:51<3:11:14,  2.26s/it]

training loss: 3.486053943634033


training:   9%|▉         | 494/5566 [19:53<3:10:19,  2.25s/it]

training loss: 3.5629425048828125


training:   9%|▉         | 495/5566 [19:55<3:09:23,  2.24s/it]

training loss: 3.4944911003112793


training:   9%|▉         | 496/5566 [19:58<3:09:10,  2.24s/it]

training loss: 3.500671148300171


training:   9%|▉         | 497/5566 [20:00<3:09:19,  2.24s/it]

training loss: 3.5143139362335205


training:   9%|▉         | 498/5566 [20:02<3:09:38,  2.25s/it]

training loss: 3.5082364082336426


training:   9%|▉         | 499/5566 [20:04<3:09:06,  2.24s/it]

training loss: 3.5098469257354736


training:   9%|▉         | 500/5566 [20:06<3:09:33,  2.24s/it]

training loss: 3.5196776390075684
valid loss: 3.5191280841827393
perplexity: 33.75498580932617


training:   9%|▉         | 501/5566 [20:11<4:19:21,  3.07s/it]

training loss: 3.530848264694214


training:   9%|▉         | 502/5566 [20:14<4:00:05,  2.84s/it]

training loss: 3.5165152549743652


training:   9%|▉         | 503/5566 [20:16<3:44:20,  2.66s/it]

training loss: 3.535754680633545


training:   9%|▉         | 504/5566 [20:18<3:34:30,  2.54s/it]

training loss: 3.5293638706207275


training:   9%|▉         | 505/5566 [20:21<3:27:24,  2.46s/it]

training loss: 3.514580011367798


training:   9%|▉         | 506/5566 [20:23<3:22:11,  2.40s/it]

training loss: 3.5413429737091064


training:   9%|▉         | 507/5566 [20:25<3:17:39,  2.34s/it]

training loss: 3.525223731994629


training:   9%|▉         | 508/5566 [20:27<3:15:28,  2.32s/it]

training loss: 3.524085521697998


training:   9%|▉         | 509/5566 [20:30<3:13:39,  2.30s/it]

training loss: 3.5192971229553223


training:   9%|▉         | 510/5566 [20:32<3:11:55,  2.28s/it]

training loss: 3.526453733444214


training:   9%|▉         | 511/5566 [20:34<3:10:47,  2.26s/it]

training loss: 3.508308172225952


training:   9%|▉         | 512/5566 [20:36<3:10:37,  2.26s/it]

training loss: 3.5217087268829346


training:   9%|▉         | 513/5566 [20:39<3:09:48,  2.25s/it]

training loss: 3.5203065872192383


training:   9%|▉         | 514/5566 [20:41<3:08:58,  2.24s/it]

training loss: 3.5039405822753906


training:   9%|▉         | 515/5566 [20:43<3:09:07,  2.25s/it]

training loss: 3.5062479972839355


training:   9%|▉         | 516/5566 [20:45<3:10:01,  2.26s/it]

training loss: 3.554804801940918


training:   9%|▉         | 517/5566 [20:48<3:09:52,  2.26s/it]

training loss: 3.5247108936309814


training:   9%|▉         | 518/5566 [20:50<3:09:21,  2.25s/it]

training loss: 3.5248911380767822


training:   9%|▉         | 519/5566 [20:52<3:09:13,  2.25s/it]

training loss: 3.5496628284454346


training:   9%|▉         | 520/5566 [20:54<3:08:44,  2.24s/it]

training loss: 3.498093843460083
valid loss: 3.49686336517334
perplexity: 33.011741638183594


training:   9%|▉         | 521/5566 [20:58<3:58:20,  2.83s/it]

training loss: 3.525751829147339


training:   9%|▉         | 522/5566 [21:01<3:42:35,  2.65s/it]

training loss: 3.5325734615325928


training:   9%|▉         | 523/5566 [21:03<3:33:02,  2.53s/it]

training loss: 3.518052101135254


training:   9%|▉         | 524/5566 [21:05<3:25:40,  2.45s/it]

training loss: 3.55735445022583


training:   9%|▉         | 525/5566 [21:07<3:21:30,  2.40s/it]

training loss: 3.5439796447753906


training:   9%|▉         | 526/5566 [21:10<3:17:43,  2.35s/it]

training loss: 3.5243072509765625


training:   9%|▉         | 527/5566 [21:12<3:14:51,  2.32s/it]

training loss: 3.482872486114502


training:   9%|▉         | 528/5566 [21:14<3:13:34,  2.31s/it]

training loss: 3.547664165496826


training:  10%|▉         | 529/5566 [21:16<3:12:10,  2.29s/it]

training loss: 3.52020525932312


training:  10%|▉         | 530/5566 [21:19<3:11:29,  2.28s/it]

training loss: 3.527787685394287


training:  10%|▉         | 531/5566 [21:21<3:09:51,  2.26s/it]

training loss: 3.538026809692383


training:  10%|▉         | 532/5566 [21:23<3:09:22,  2.26s/it]

training loss: 3.5301218032836914


training:  10%|▉         | 533/5566 [21:25<3:08:29,  2.25s/it]

training loss: 3.5407814979553223


training:  10%|▉         | 534/5566 [21:28<3:09:01,  2.25s/it]

training loss: 3.506195068359375


training:  10%|▉         | 535/5566 [21:30<3:08:14,  2.24s/it]

training loss: 3.5282576084136963


training:  10%|▉         | 536/5566 [21:32<3:07:35,  2.24s/it]

training loss: 3.5365686416625977


training:  10%|▉         | 537/5566 [21:34<3:07:41,  2.24s/it]

training loss: 3.512665033340454


training:  10%|▉         | 538/5566 [21:37<3:07:35,  2.24s/it]

training loss: 3.5116288661956787


training:  10%|▉         | 539/5566 [21:39<3:06:51,  2.23s/it]

training loss: 3.536078453063965


training:  10%|▉         | 540/5566 [21:41<3:06:29,  2.23s/it]

training loss: 3.5368692874908447
valid loss: 3.536100387573242
perplexity: 34.3327751159668


training:  10%|▉         | 541/5566 [21:45<3:55:25,  2.81s/it]

training loss: 3.512432813644409


training:  10%|▉         | 542/5566 [21:47<3:40:44,  2.64s/it]

training loss: 3.5143959522247314


training:  10%|▉         | 543/5566 [21:50<3:29:40,  2.50s/it]

training loss: 3.523895502090454


training:  10%|▉         | 544/5566 [21:52<3:21:39,  2.41s/it]

training loss: 3.5527267456054688


training:  10%|▉         | 545/5566 [21:55<3:30:55,  2.52s/it]

training loss: 3.518895387649536


training:  10%|▉         | 546/5566 [21:57<3:29:37,  2.51s/it]

training loss: 3.5239334106445312


training:  10%|▉         | 547/5566 [21:59<3:22:45,  2.42s/it]

training loss: 3.5040409564971924


training:  10%|▉         | 548/5566 [22:02<3:17:46,  2.36s/it]

training loss: 3.4993793964385986


training:  10%|▉         | 549/5566 [22:04<3:15:26,  2.34s/it]

training loss: 3.532562017440796


training:  10%|▉         | 550/5566 [22:06<3:13:51,  2.32s/it]

training loss: 3.521552085876465


training:  10%|▉         | 551/5566 [22:08<3:12:08,  2.30s/it]

training loss: 3.510631799697876


training:  10%|▉         | 552/5566 [22:11<3:10:15,  2.28s/it]

training loss: 3.508192539215088


training:  10%|▉         | 553/5566 [22:13<3:09:34,  2.27s/it]

training loss: 3.5257019996643066


training:  10%|▉         | 554/5566 [22:15<3:08:22,  2.26s/it]

training loss: 3.535377264022827


training:  10%|▉         | 555/5566 [22:17<3:07:38,  2.25s/it]

training loss: 3.5018999576568604


training:  10%|▉         | 556/5566 [22:19<3:06:45,  2.24s/it]

training loss: 3.5266833305358887


training:  10%|█         | 557/5566 [22:22<3:06:24,  2.23s/it]

training loss: 3.5086212158203125


training:  10%|█         | 558/5566 [22:24<3:06:07,  2.23s/it]

training loss: 3.5141749382019043


training:  10%|█         | 559/5566 [22:26<3:05:39,  2.22s/it]

training loss: 3.516791582107544


training:  10%|█         | 560/5566 [22:28<3:06:01,  2.23s/it]

training loss: 3.514031171798706
valid loss: 3.51336932182312
perplexity: 33.5611572265625


training:  10%|█         | 561/5566 [22:33<3:56:34,  2.84s/it]

training loss: 3.518803596496582


training:  10%|█         | 562/5566 [22:35<3:42:50,  2.67s/it]

training loss: 3.5070929527282715


training:  10%|█         | 563/5566 [22:37<3:32:21,  2.55s/it]

training loss: 3.519101142883301


training:  10%|█         | 564/5566 [22:39<3:24:23,  2.45s/it]

training loss: 3.544318437576294


training:  10%|█         | 565/5566 [22:42<3:18:18,  2.38s/it]

training loss: 3.5425703525543213


training:  10%|█         | 566/5566 [22:44<3:13:49,  2.33s/it]

training loss: 3.5454647541046143


training:  10%|█         | 567/5566 [22:46<3:10:57,  2.29s/it]

training loss: 3.547192335128784


training:  10%|█         | 568/5566 [22:48<3:09:03,  2.27s/it]

training loss: 3.527390480041504


training:  10%|█         | 569/5566 [22:50<3:07:55,  2.26s/it]

training loss: 3.53467059135437


training:  10%|█         | 570/5566 [22:53<3:07:33,  2.25s/it]

training loss: 3.5400490760803223


training:  10%|█         | 571/5566 [22:55<3:06:26,  2.24s/it]

training loss: 3.5405497550964355


training:  10%|█         | 572/5566 [22:57<3:06:29,  2.24s/it]

training loss: 3.52852725982666


training:  10%|█         | 573/5566 [22:59<3:06:47,  2.24s/it]

training loss: 3.5118281841278076


training:  10%|█         | 574/5566 [23:02<3:06:46,  2.24s/it]

training loss: 3.5254690647125244


training:  10%|█         | 575/5566 [23:04<3:07:37,  2.26s/it]

training loss: 3.5184080600738525


training:  10%|█         | 576/5566 [23:06<3:07:15,  2.25s/it]

training loss: 3.534252643585205


training:  10%|█         | 577/5566 [23:08<3:06:37,  2.24s/it]

training loss: 3.5212883949279785


training:  10%|█         | 578/5566 [23:11<3:05:53,  2.24s/it]

training loss: 3.5277552604675293


training:  10%|█         | 579/5566 [23:13<3:05:24,  2.23s/it]

training loss: 3.512477397918701


training:  10%|█         | 580/5566 [23:15<3:04:37,  2.22s/it]

training loss: 3.511207103729248
valid loss: 3.5098836421966553
perplexity: 33.444374084472656


training:  10%|█         | 581/5566 [23:19<3:54:08,  2.82s/it]

training loss: 3.527737617492676


training:  10%|█         | 582/5566 [23:21<3:38:53,  2.64s/it]

training loss: 3.5274529457092285


training:  10%|█         | 583/5566 [23:24<3:28:09,  2.51s/it]

training loss: 3.524651288986206


training:  10%|█         | 584/5566 [23:26<3:20:48,  2.42s/it]

training loss: 3.5301566123962402


training:  11%|█         | 585/5566 [23:28<3:15:12,  2.35s/it]

training loss: 3.5274503231048584


training:  11%|█         | 586/5566 [23:30<3:11:33,  2.31s/it]

training loss: 3.5373172760009766


training:  11%|█         | 587/5566 [23:33<3:09:33,  2.28s/it]

training loss: 3.514202117919922


training:  11%|█         | 588/5566 [23:35<3:08:49,  2.28s/it]

training loss: 3.5347323417663574


training:  11%|█         | 589/5566 [23:37<3:07:38,  2.26s/it]

training loss: 3.5306689739227295


training:  11%|█         | 590/5566 [23:39<3:06:16,  2.25s/it]

training loss: 3.534959316253662


training:  11%|█         | 591/5566 [23:41<3:05:09,  2.23s/it]

training loss: 3.5214693546295166


training:  11%|█         | 592/5566 [23:44<3:04:36,  2.23s/it]

training loss: 3.5072410106658936


training:  11%|█         | 593/5566 [23:46<3:04:24,  2.22s/it]

training loss: 3.5116822719573975


training:  11%|█         | 594/5566 [23:48<3:03:59,  2.22s/it]

training loss: 3.4953830242156982


training:  11%|█         | 595/5566 [23:50<3:03:38,  2.22s/it]

training loss: 3.5204856395721436


training:  11%|█         | 596/5566 [23:52<3:03:11,  2.21s/it]

training loss: 3.521742820739746


training:  11%|█         | 597/5566 [23:55<3:02:49,  2.21s/it]

training loss: 3.5255508422851562


training:  11%|█         | 598/5566 [23:57<3:02:59,  2.21s/it]

training loss: 3.531222343444824


training:  11%|█         | 599/5566 [23:59<3:03:25,  2.22s/it]

training loss: 3.5152857303619385


training:  11%|█         | 600/5566 [24:01<3:03:31,  2.22s/it]

training loss: 3.493480920791626
valid loss: 3.4933829307556152
perplexity: 32.89704513549805


training:  11%|█         | 601/5566 [24:06<4:11:37,  3.04s/it]

training loss: 3.5338902473449707


training:  11%|█         | 602/5566 [24:09<3:54:02,  2.83s/it]

training loss: 3.506960153579712


training:  11%|█         | 603/5566 [24:11<3:38:37,  2.64s/it]

training loss: 3.519029378890991


training:  11%|█         | 604/5566 [24:13<3:27:37,  2.51s/it]

training loss: 3.506481409072876


training:  11%|█         | 605/5566 [24:15<3:20:11,  2.42s/it]

training loss: 3.5257604122161865


training:  11%|█         | 606/5566 [24:17<3:15:07,  2.36s/it]

training loss: 3.5265846252441406


training:  11%|█         | 607/5566 [24:20<3:12:04,  2.32s/it]

training loss: 3.503265857696533


training:  11%|█         | 608/5566 [24:22<3:09:11,  2.29s/it]

training loss: 3.531257390975952


training:  11%|█         | 609/5566 [24:24<3:06:25,  2.26s/it]

training loss: 3.539879560470581


training:  11%|█         | 610/5566 [24:26<3:05:28,  2.25s/it]

training loss: 3.508676290512085


training:  11%|█         | 611/5566 [24:29<3:05:50,  2.25s/it]

training loss: 3.541640281677246


training:  11%|█         | 612/5566 [24:31<3:05:27,  2.25s/it]

training loss: 3.5252649784088135


training:  11%|█         | 613/5566 [24:33<3:04:17,  2.23s/it]

training loss: 3.5037152767181396


training:  11%|█         | 614/5566 [24:35<3:04:58,  2.24s/it]

training loss: 3.5311264991760254


training:  11%|█         | 615/5566 [24:38<3:04:56,  2.24s/it]

training loss: 3.545804023742676


training:  11%|█         | 616/5566 [24:40<3:04:47,  2.24s/it]

training loss: 3.531459093093872


training:  11%|█         | 617/5566 [24:42<3:04:27,  2.24s/it]

training loss: 3.5188064575195312


training:  11%|█         | 618/5566 [24:44<3:03:54,  2.23s/it]

training loss: 3.528233051300049


training:  11%|█         | 619/5566 [24:46<3:03:39,  2.23s/it]

training loss: 3.5013840198516846


training:  11%|█         | 620/5566 [24:49<3:03:09,  2.22s/it]

training loss: 3.5090994834899902
valid loss: 3.5081169605255127
perplexity: 33.385345458984375


training:  11%|█         | 621/5566 [24:53<3:52:19,  2.82s/it]

training loss: 3.52644419670105


training:  11%|█         | 622/5566 [24:55<3:37:56,  2.64s/it]

training loss: 3.5243005752563477


training:  11%|█         | 623/5566 [24:57<3:27:04,  2.51s/it]

training loss: 3.5368316173553467


training:  11%|█         | 624/5566 [25:00<3:21:28,  2.45s/it]

training loss: 3.5339279174804688


training:  11%|█         | 625/5566 [25:02<3:16:00,  2.38s/it]

training loss: 3.539572238922119


training:  11%|█         | 626/5566 [25:04<3:12:11,  2.33s/it]

training loss: 3.5174379348754883


training:  11%|█▏        | 627/5566 [25:07<3:26:11,  2.50s/it]

training loss: 3.553171396255493


training:  11%|█▏        | 628/5566 [25:09<3:20:20,  2.43s/it]

training loss: 3.508822441101074


training:  11%|█▏        | 629/5566 [25:11<3:15:02,  2.37s/it]

training loss: 3.52020263671875


training:  11%|█▏        | 630/5566 [25:14<3:11:44,  2.33s/it]

training loss: 3.5217816829681396


training:  11%|█▏        | 631/5566 [25:16<3:08:58,  2.30s/it]

training loss: 3.506293773651123


training:  11%|█▏        | 632/5566 [25:18<3:07:13,  2.28s/it]

training loss: 3.526761054992676


training:  11%|█▏        | 633/5566 [25:20<3:05:44,  2.26s/it]

training loss: 3.518599271774292


training:  11%|█▏        | 634/5566 [25:23<3:05:08,  2.25s/it]

training loss: 3.520402431488037


training:  11%|█▏        | 635/5566 [25:25<3:04:11,  2.24s/it]

training loss: 3.5449817180633545


training:  11%|█▏        | 636/5566 [25:27<3:03:30,  2.23s/it]

training loss: 3.507333993911743


training:  11%|█▏        | 637/5566 [25:29<3:02:36,  2.22s/it]

training loss: 3.534885883331299


training:  11%|█▏        | 638/5566 [25:31<3:02:11,  2.22s/it]

training loss: 3.5182337760925293


training:  11%|█▏        | 639/5566 [25:34<3:01:57,  2.22s/it]

training loss: 3.538986921310425


training:  11%|█▏        | 640/5566 [25:36<3:02:36,  2.22s/it]

training loss: 3.5427298545837402
valid loss: 3.540945053100586
perplexity: 34.499507904052734


training:  12%|█▏        | 641/5566 [25:40<3:50:21,  2.81s/it]

training loss: 3.5239624977111816


training:  12%|█▏        | 642/5566 [25:42<3:35:46,  2.63s/it]

training loss: 3.518505096435547


training:  12%|█▏        | 643/5566 [25:44<3:25:14,  2.50s/it]

training loss: 3.5048117637634277


training:  12%|█▏        | 644/5566 [25:47<3:17:29,  2.41s/it]

training loss: 3.5108680725097656


training:  12%|█▏        | 645/5566 [25:49<3:12:08,  2.34s/it]

training loss: 3.502323865890503


training:  12%|█▏        | 646/5566 [25:51<3:08:27,  2.30s/it]

training loss: 3.537083387374878


training:  12%|█▏        | 647/5566 [25:53<3:06:09,  2.27s/it]

training loss: 3.5198402404785156


training:  12%|█▏        | 648/5566 [25:55<3:05:26,  2.26s/it]

training loss: 3.527345895767212


training:  12%|█▏        | 649/5566 [25:58<3:04:17,  2.25s/it]

training loss: 3.524230480194092


training:  12%|█▏        | 650/5566 [26:00<3:03:22,  2.24s/it]

training loss: 3.521991729736328


training:  12%|█▏        | 651/5566 [26:02<3:02:39,  2.23s/it]

training loss: 3.5300967693328857


training:  12%|█▏        | 652/5566 [26:04<3:01:56,  2.22s/it]

training loss: 3.5327224731445312


training:  12%|█▏        | 653/5566 [26:07<3:02:02,  2.22s/it]

training loss: 3.538027286529541


training:  12%|█▏        | 654/5566 [26:09<3:02:06,  2.22s/it]

training loss: 3.510733127593994


training:  12%|█▏        | 655/5566 [26:11<3:01:26,  2.22s/it]

training loss: 3.5253875255584717


training:  12%|█▏        | 656/5566 [26:13<3:02:13,  2.23s/it]

training loss: 3.517080783843994


training:  12%|█▏        | 657/5566 [26:15<3:01:36,  2.22s/it]

training loss: 3.5173916816711426


training:  12%|█▏        | 658/5566 [26:18<3:01:31,  2.22s/it]

training loss: 3.5257554054260254


training:  12%|█▏        | 659/5566 [26:20<3:01:28,  2.22s/it]

training loss: 3.5216565132141113


training:  12%|█▏        | 660/5566 [26:22<3:01:19,  2.22s/it]

training loss: 3.540726661682129
valid loss: 3.5398354530334473
perplexity: 34.461246490478516


training:  12%|█▏        | 661/5566 [26:26<3:51:17,  2.83s/it]

training loss: 3.51955246925354


training:  12%|█▏        | 662/5566 [26:29<3:38:01,  2.67s/it]

training loss: 3.4980244636535645


training:  12%|█▏        | 663/5566 [26:31<3:27:03,  2.53s/it]

training loss: 3.506105899810791


training:  12%|█▏        | 664/5566 [26:33<3:19:06,  2.44s/it]

training loss: 3.5048797130584717


training:  12%|█▏        | 665/5566 [26:35<3:13:48,  2.37s/it]

training loss: 3.528859853744507


training:  12%|█▏        | 666/5566 [26:37<3:09:46,  2.32s/it]

training loss: 3.52343487739563


training:  12%|█▏        | 667/5566 [26:40<3:07:40,  2.30s/it]

training loss: 3.505192518234253


training:  12%|█▏        | 668/5566 [26:42<3:05:55,  2.28s/it]

training loss: 3.510000467300415


training:  12%|█▏        | 669/5566 [26:44<3:04:11,  2.26s/it]

training loss: 3.5118839740753174


training:  12%|█▏        | 670/5566 [26:46<3:02:48,  2.24s/it]

training loss: 3.506046772003174


training:  12%|█▏        | 671/5566 [26:49<3:02:28,  2.24s/it]

training loss: 3.5431606769561768


training:  12%|█▏        | 672/5566 [26:51<3:01:22,  2.22s/it]

training loss: 3.5188238620758057


training:  12%|█▏        | 673/5566 [26:53<3:00:48,  2.22s/it]

training loss: 3.5312137603759766


training:  12%|█▏        | 674/5566 [26:55<3:00:49,  2.22s/it]

training loss: 3.5240769386291504


training:  12%|█▏        | 675/5566 [26:57<3:00:30,  2.21s/it]

training loss: 3.5285770893096924


training:  12%|█▏        | 676/5566 [27:00<3:00:24,  2.21s/it]

training loss: 3.530601739883423


training:  12%|█▏        | 677/5566 [27:02<3:00:38,  2.22s/it]

training loss: 3.515754222869873


training:  12%|█▏        | 678/5566 [27:04<3:00:22,  2.21s/it]

training loss: 3.529761791229248


training:  12%|█▏        | 679/5566 [27:06<3:00:31,  2.22s/it]

training loss: 3.5222089290618896


training:  12%|█▏        | 680/5566 [27:08<2:59:52,  2.21s/it]

training loss: 3.5193209648132324
valid loss: 3.51849365234375
perplexity: 33.733577728271484


training:  12%|█▏        | 681/5566 [27:13<3:47:30,  2.79s/it]

training loss: 3.5119071006774902


training:  12%|█▏        | 682/5566 [27:15<3:33:29,  2.62s/it]

training loss: 3.537062644958496


training:  12%|█▏        | 683/5566 [27:17<3:23:19,  2.50s/it]

training loss: 3.519853115081787


training:  12%|█▏        | 684/5566 [27:19<3:16:15,  2.41s/it]

training loss: 3.5152952671051025


training:  12%|█▏        | 685/5566 [27:21<3:11:01,  2.35s/it]

training loss: 3.5327391624450684


training:  12%|█▏        | 686/5566 [27:24<3:07:39,  2.31s/it]

training loss: 3.520040512084961


training:  12%|█▏        | 687/5566 [27:26<3:06:15,  2.29s/it]

training loss: 3.5088417530059814


training:  12%|█▏        | 688/5566 [27:28<3:04:48,  2.27s/it]

training loss: 3.5247678756713867


training:  12%|█▏        | 689/5566 [27:30<3:03:03,  2.25s/it]

training loss: 3.5462377071380615


training:  12%|█▏        | 690/5566 [27:33<3:02:09,  2.24s/it]

training loss: 3.506371259689331


training:  12%|█▏        | 691/5566 [27:35<3:01:31,  2.23s/it]

training loss: 3.5213377475738525


training:  12%|█▏        | 692/5566 [27:37<3:01:13,  2.23s/it]

training loss: 3.516280174255371


training:  12%|█▏        | 693/5566 [27:39<3:00:24,  2.22s/it]

training loss: 3.491790294647217


training:  12%|█▏        | 694/5566 [27:41<3:00:51,  2.23s/it]

training loss: 3.528404951095581


training:  12%|█▏        | 695/5566 [27:44<3:00:25,  2.22s/it]

training loss: 3.4900522232055664


training:  13%|█▎        | 696/5566 [27:46<2:59:48,  2.22s/it]

training loss: 3.5482137203216553


training:  13%|█▎        | 697/5566 [27:48<3:00:00,  2.22s/it]

training loss: 3.5073840618133545


training:  13%|█▎        | 698/5566 [27:50<3:00:23,  2.22s/it]

training loss: 3.51187801361084


training:  13%|█▎        | 699/5566 [27:53<2:59:54,  2.22s/it]

training loss: 3.537260055541992


training:  13%|█▎        | 700/5566 [27:55<2:59:37,  2.21s/it]

training loss: 3.522343873977661
valid loss: 3.521998167037964
perplexity: 33.85200500488281


training:  13%|█▎        | 701/5566 [28:00<4:06:20,  3.04s/it]

training loss: 3.5030057430267334


training:  13%|█▎        | 702/5566 [28:02<3:49:59,  2.84s/it]

training loss: 3.513101577758789


training:  13%|█▎        | 703/5566 [28:04<3:34:35,  2.65s/it]

training loss: 3.5292062759399414


training:  13%|█▎        | 704/5566 [28:07<3:24:28,  2.52s/it]

training loss: 3.5251779556274414


training:  13%|█▎        | 705/5566 [28:09<3:16:39,  2.43s/it]

training loss: 3.5203280448913574


training:  13%|█▎        | 706/5566 [28:11<3:12:05,  2.37s/it]

training loss: 3.5407416820526123


training:  13%|█▎        | 707/5566 [28:13<3:07:41,  2.32s/it]

training loss: 3.5337140560150146


training:  13%|█▎        | 708/5566 [28:15<3:05:43,  2.29s/it]

training loss: 3.5089337825775146


training:  13%|█▎        | 709/5566 [28:18<3:04:12,  2.28s/it]

training loss: 3.503772020339966


training:  13%|█▎        | 710/5566 [28:20<3:18:08,  2.45s/it]

training loss: 3.518794298171997


training:  13%|█▎        | 711/5566 [28:23<3:14:23,  2.40s/it]

training loss: 3.511536121368408


training:  13%|█▎        | 712/5566 [28:25<3:09:34,  2.34s/it]

training loss: 3.497789144515991


training:  13%|█▎        | 713/5566 [28:27<3:06:54,  2.31s/it]

training loss: 3.5244481563568115


training:  13%|█▎        | 714/5566 [28:29<3:04:17,  2.28s/it]

training loss: 3.5196518898010254


training:  13%|█▎        | 715/5566 [28:32<3:03:03,  2.26s/it]

training loss: 3.514599323272705


training:  13%|█▎        | 716/5566 [28:34<3:02:00,  2.25s/it]

training loss: 3.518662214279175


training:  13%|█▎        | 717/5566 [28:36<3:01:20,  2.24s/it]

training loss: 3.517766237258911


training:  13%|█▎        | 718/5566 [28:38<3:00:49,  2.24s/it]

training loss: 3.5334017276763916


training:  13%|█▎        | 719/5566 [28:41<3:00:23,  2.23s/it]

training loss: 3.5308845043182373


training:  13%|█▎        | 720/5566 [28:43<2:59:37,  2.22s/it]

training loss: 3.5355865955352783
valid loss: 3.53505277633667
perplexity: 34.29682159423828


training:  13%|█▎        | 721/5566 [28:47<3:47:45,  2.82s/it]

training loss: 3.515200614929199


training:  13%|█▎        | 722/5566 [28:49<3:33:06,  2.64s/it]

training loss: 3.5156941413879395


training:  13%|█▎        | 723/5566 [28:51<3:22:35,  2.51s/it]

training loss: 3.541433334350586


training:  13%|█▎        | 724/5566 [28:54<3:14:48,  2.41s/it]

training loss: 3.5231523513793945


training:  13%|█▎        | 725/5566 [28:56<3:09:30,  2.35s/it]

training loss: 3.5141637325286865


training:  13%|█▎        | 726/5566 [28:58<3:06:07,  2.31s/it]

training loss: 3.519092082977295


training:  13%|█▎        | 727/5566 [29:00<3:03:37,  2.28s/it]

training loss: 3.520580768585205


training:  13%|█▎        | 728/5566 [29:02<3:03:21,  2.27s/it]

training loss: 3.520383596420288


training:  13%|█▎        | 729/5566 [29:05<3:01:48,  2.26s/it]

training loss: 3.525158405303955


training:  13%|█▎        | 730/5566 [29:07<3:01:02,  2.25s/it]

training loss: 3.495732545852661


training:  13%|█▎        | 731/5566 [29:09<3:00:11,  2.24s/it]

training loss: 3.543489456176758


training:  13%|█▎        | 732/5566 [29:11<2:59:54,  2.23s/it]

training loss: 3.5021767616271973


training:  13%|█▎        | 733/5566 [29:14<2:59:55,  2.23s/it]

training loss: 3.5371651649475098


training:  13%|█▎        | 734/5566 [29:16<2:59:41,  2.23s/it]

training loss: 3.5245697498321533


training:  13%|█▎        | 735/5566 [29:18<2:59:35,  2.23s/it]

training loss: 3.550438165664673


training:  13%|█▎        | 736/5566 [29:20<2:59:11,  2.23s/it]

training loss: 3.5441622734069824


training:  13%|█▎        | 737/5566 [29:22<2:59:13,  2.23s/it]

training loss: 3.5310897827148438


training:  13%|█▎        | 738/5566 [29:25<2:58:51,  2.22s/it]

training loss: 3.516078233718872


training:  13%|█▎        | 739/5566 [29:27<2:58:06,  2.21s/it]

training loss: 3.5176892280578613


training:  13%|█▎        | 740/5566 [29:29<2:58:56,  2.22s/it]

training loss: 3.5253970623016357
valid loss: 3.5244953632354736
perplexity: 33.936641693115234


training:  13%|█▎        | 741/5566 [29:33<3:46:40,  2.82s/it]

training loss: 3.559157609939575


training:  13%|█▎        | 742/5566 [29:36<3:32:08,  2.64s/it]

training loss: 3.5523533821105957


training:  13%|█▎        | 743/5566 [29:38<3:21:11,  2.50s/it]

training loss: 3.5326108932495117


training:  13%|█▎        | 744/5566 [29:40<3:14:44,  2.42s/it]

training loss: 3.533346176147461


training:  13%|█▎        | 745/5566 [29:42<3:10:29,  2.37s/it]

training loss: 3.5288658142089844


training:  13%|█▎        | 746/5566 [29:44<3:06:53,  2.33s/it]

training loss: 3.5504870414733887


training:  13%|█▎        | 747/5566 [29:47<3:04:48,  2.30s/it]

training loss: 3.481996774673462


training:  13%|█▎        | 748/5566 [29:49<3:02:28,  2.27s/it]

training loss: 3.5390915870666504


training:  13%|█▎        | 749/5566 [29:51<3:01:52,  2.27s/it]

training loss: 3.5502090454101562


training:  13%|█▎        | 750/5566 [29:53<3:00:07,  2.24s/it]

training loss: 3.521949052810669


training:  13%|█▎        | 751/5566 [29:56<2:59:41,  2.24s/it]

training loss: 3.502000331878662


training:  14%|█▎        | 752/5566 [29:58<2:58:39,  2.23s/it]

training loss: 3.5404186248779297


training:  14%|█▎        | 753/5566 [30:00<2:58:08,  2.22s/it]

training loss: 3.5369420051574707


training:  14%|█▎        | 754/5566 [30:02<2:58:52,  2.23s/it]

training loss: 3.524230718612671


training:  14%|█▎        | 755/5566 [30:04<2:58:26,  2.23s/it]

training loss: 3.5031354427337646


training:  14%|█▎        | 756/5566 [30:07<2:58:17,  2.22s/it]

training loss: 3.5106406211853027


training:  14%|█▎        | 757/5566 [30:09<2:57:49,  2.22s/it]

training loss: 3.524815559387207


training:  14%|█▎        | 758/5566 [30:11<2:57:45,  2.22s/it]

training loss: 3.509094715118408


training:  14%|█▎        | 759/5566 [30:13<2:58:06,  2.22s/it]

training loss: 3.5179359912872314


training:  14%|█▎        | 760/5566 [30:16<2:57:59,  2.22s/it]

training loss: 3.528834819793701
valid loss: 3.5276196002960205
perplexity: 34.04283905029297


training:  14%|█▎        | 761/5566 [30:20<3:44:25,  2.80s/it]

training loss: 3.535613536834717


training:  14%|█▎        | 762/5566 [30:22<3:30:14,  2.63s/it]

training loss: 3.5338528156280518


training:  14%|█▎        | 763/5566 [30:24<3:20:51,  2.51s/it]

training loss: 3.5414185523986816


training:  14%|█▎        | 764/5566 [30:26<3:14:12,  2.43s/it]

training loss: 3.512195110321045


training:  14%|█▎        | 765/5566 [30:29<3:09:48,  2.37s/it]

training loss: 3.5241987705230713


training:  14%|█▍        | 766/5566 [30:31<3:05:26,  2.32s/it]

training loss: 3.504662036895752


training:  14%|█▍        | 767/5566 [30:33<3:02:53,  2.29s/it]

training loss: 3.5211265087127686


training:  14%|█▍        | 768/5566 [30:35<3:01:28,  2.27s/it]

training loss: 3.50592303276062


training:  14%|█▍        | 769/5566 [30:37<3:00:46,  2.26s/it]

training loss: 3.531820774078369


training:  14%|█▍        | 770/5566 [30:40<2:59:50,  2.25s/it]

training loss: 3.550323486328125


training:  14%|█▍        | 771/5566 [30:42<2:59:08,  2.24s/it]

training loss: 3.523120641708374


training:  14%|█▍        | 772/5566 [30:44<2:59:36,  2.25s/it]

training loss: 3.521002769470215


training:  14%|█▍        | 773/5566 [30:46<2:58:55,  2.24s/it]

training loss: 3.521131992340088


training:  14%|█▍        | 774/5566 [30:49<2:58:12,  2.23s/it]

training loss: 3.524628162384033


training:  14%|█▍        | 775/5566 [30:51<2:57:16,  2.22s/it]

training loss: 3.513645887374878


training:  14%|█▍        | 776/5566 [30:53<2:57:01,  2.22s/it]

training loss: 3.5142130851745605


training:  14%|█▍        | 777/5566 [30:55<2:56:57,  2.22s/it]

training loss: 3.516763210296631


training:  14%|█▍        | 778/5566 [30:57<2:56:46,  2.22s/it]

training loss: 3.5229949951171875


training:  14%|█▍        | 779/5566 [31:00<2:56:28,  2.21s/it]

training loss: 3.5307564735412598


training:  14%|█▍        | 780/5566 [31:02<2:57:01,  2.22s/it]

training loss: 3.522012948989868
valid loss: 3.520366668701172
perplexity: 33.796817779541016


training:  14%|█▍        | 781/5566 [31:06<3:45:12,  2.82s/it]

training loss: 3.5283074378967285


training:  14%|█▍        | 782/5566 [31:08<3:30:45,  2.64s/it]

training loss: 3.5434415340423584


training:  14%|█▍        | 783/5566 [31:11<3:20:07,  2.51s/it]

training loss: 3.5177624225616455


training:  14%|█▍        | 784/5566 [31:13<3:13:12,  2.42s/it]

training loss: 3.5279018878936768


training:  14%|█▍        | 785/5566 [31:15<3:08:24,  2.36s/it]

training loss: 3.4912095069885254


training:  14%|█▍        | 786/5566 [31:17<3:05:42,  2.33s/it]

training loss: 3.502603530883789


training:  14%|█▍        | 787/5566 [31:19<3:03:28,  2.30s/it]

training loss: 3.4806511402130127


training:  14%|█▍        | 788/5566 [31:22<3:01:46,  2.28s/it]

training loss: 3.487401247024536


training:  14%|█▍        | 789/5566 [31:24<3:01:08,  2.28s/it]

training loss: 3.532559394836426


training:  14%|█▍        | 790/5566 [31:26<3:00:24,  2.27s/it]

training loss: 3.510946750640869


training:  14%|█▍        | 791/5566 [31:29<3:01:55,  2.29s/it]

training loss: 3.5299856662750244


training:  14%|█▍        | 792/5566 [31:31<3:00:19,  2.27s/it]

training loss: 3.5122737884521484


training:  14%|█▍        | 793/5566 [31:34<3:15:24,  2.46s/it]

training loss: 3.5173544883728027


training:  14%|█▍        | 794/5566 [31:36<3:15:12,  2.45s/it]

training loss: 3.474290132522583


training:  14%|█▍        | 795/5566 [31:38<3:09:49,  2.39s/it]

training loss: 3.541475534439087


training:  14%|█▍        | 796/5566 [31:41<3:06:10,  2.34s/it]

training loss: 3.535827159881592


training:  14%|█▍        | 797/5566 [31:43<3:03:29,  2.31s/it]

training loss: 3.5122416019439697


training:  14%|█▍        | 798/5566 [31:45<3:02:21,  2.29s/it]

training loss: 3.5287652015686035


training:  14%|█▍        | 799/5566 [31:47<3:00:58,  2.28s/it]

training loss: 3.496534824371338


training:  14%|█▍        | 800/5566 [31:50<2:59:12,  2.26s/it]

training loss: 3.554720163345337
valid loss: 3.553577423095703
perplexity: 34.93808364868164


training:  14%|█▍        | 801/5566 [31:54<4:02:16,  3.05s/it]

training loss: 3.5212297439575195


training:  14%|█▍        | 802/5566 [31:57<3:45:19,  2.84s/it]

training loss: 3.5106663703918457


training:  14%|█▍        | 803/5566 [31:59<3:30:40,  2.65s/it]

training loss: 3.5214130878448486


training:  14%|█▍        | 804/5566 [32:01<3:21:00,  2.53s/it]

training loss: 3.511016845703125


training:  14%|█▍        | 805/5566 [32:03<3:13:49,  2.44s/it]

training loss: 3.5310354232788086


training:  14%|█▍        | 806/5566 [32:06<3:08:48,  2.38s/it]

training loss: 3.5157852172851562


training:  14%|█▍        | 807/5566 [32:08<3:04:55,  2.33s/it]

training loss: 3.5066070556640625


training:  15%|█▍        | 808/5566 [32:10<3:01:45,  2.29s/it]

training loss: 3.5193681716918945


training:  15%|█▍        | 809/5566 [32:12<2:59:35,  2.27s/it]

training loss: 3.5351412296295166


training:  15%|█▍        | 810/5566 [32:15<2:58:33,  2.25s/it]

training loss: 3.5128283500671387


training:  15%|█▍        | 811/5566 [32:17<2:57:35,  2.24s/it]

training loss: 3.5058820247650146


training:  15%|█▍        | 812/5566 [32:19<2:56:48,  2.23s/it]

training loss: 3.529343605041504


training:  15%|█▍        | 813/5566 [32:21<2:56:15,  2.23s/it]

training loss: 3.510948657989502


training:  15%|█▍        | 814/5566 [32:23<2:55:49,  2.22s/it]

training loss: 3.542797088623047


training:  15%|█▍        | 815/5566 [32:26<2:55:27,  2.22s/it]

training loss: 3.5203561782836914


training:  15%|█▍        | 816/5566 [32:28<2:55:22,  2.22s/it]

training loss: 3.5303115844726562


training:  15%|█▍        | 817/5566 [32:30<2:54:58,  2.21s/it]

training loss: 3.5175206661224365


training:  15%|█▍        | 818/5566 [32:32<2:54:43,  2.21s/it]

training loss: 3.5230469703674316


training:  15%|█▍        | 819/5566 [32:34<2:55:07,  2.21s/it]

training loss: 3.541409492492676


training:  15%|█▍        | 820/5566 [32:37<2:55:10,  2.21s/it]

training loss: 3.5036518573760986
valid loss: 3.502699851989746
perplexity: 33.204978942871094


training:  15%|█▍        | 821/5566 [32:41<3:42:27,  2.81s/it]

training loss: 3.5297749042510986


training:  15%|█▍        | 822/5566 [32:43<3:29:44,  2.65s/it]

training loss: 3.523282766342163


training:  15%|█▍        | 823/5566 [32:45<3:19:45,  2.53s/it]

training loss: 3.5324196815490723


training:  15%|█▍        | 824/5566 [32:48<3:12:22,  2.43s/it]

training loss: 3.533745050430298


training:  15%|█▍        | 825/5566 [32:50<3:07:15,  2.37s/it]

training loss: 3.5186312198638916


training:  15%|█▍        | 826/5566 [32:52<3:03:35,  2.32s/it]

training loss: 3.524587869644165


training:  15%|█▍        | 827/5566 [32:54<3:00:53,  2.29s/it]

training loss: 3.530917167663574


training:  15%|█▍        | 828/5566 [32:57<2:59:51,  2.28s/it]

training loss: 3.5443825721740723


training:  15%|█▍        | 829/5566 [32:59<2:58:44,  2.26s/it]

training loss: 3.526076078414917


training:  15%|█▍        | 830/5566 [33:01<2:58:09,  2.26s/it]

training loss: 3.501342535018921


training:  15%|█▍        | 831/5566 [33:03<2:58:07,  2.26s/it]

training loss: 3.530456781387329


training:  15%|█▍        | 832/5566 [33:06<2:59:16,  2.27s/it]

training loss: 3.54109525680542


training:  15%|█▍        | 833/5566 [33:08<2:58:25,  2.26s/it]

training loss: 3.525939464569092


training:  15%|█▍        | 834/5566 [33:10<2:57:39,  2.25s/it]

training loss: 3.5420913696289062


training:  15%|█▌        | 835/5566 [33:12<2:56:44,  2.24s/it]

training loss: 3.497699022293091


training:  15%|█▌        | 836/5566 [33:14<2:56:10,  2.23s/it]

training loss: 3.5150928497314453


training:  15%|█▌        | 837/5566 [33:17<2:55:35,  2.23s/it]

training loss: 3.5598084926605225


training:  15%|█▌        | 838/5566 [33:19<2:55:42,  2.23s/it]

training loss: 3.506570339202881


training:  15%|█▌        | 839/5566 [33:21<2:56:35,  2.24s/it]

training loss: 3.5144128799438477


training:  15%|█▌        | 840/5566 [33:23<2:56:43,  2.24s/it]

training loss: 3.530289888381958
valid loss: 3.529171943664551
perplexity: 34.09572219848633


training:  15%|█▌        | 841/5566 [33:28<3:44:00,  2.84s/it]

training loss: 3.559783458709717


training:  15%|█▌        | 842/5566 [33:30<3:29:28,  2.66s/it]

training loss: 3.5061635971069336


training:  15%|█▌        | 843/5566 [33:32<3:19:14,  2.53s/it]

training loss: 3.5145890712738037


training:  15%|█▌        | 844/5566 [33:34<3:12:15,  2.44s/it]

training loss: 3.536134719848633


training:  15%|█▌        | 845/5566 [33:37<3:07:37,  2.38s/it]

training loss: 3.5418496131896973


training:  15%|█▌        | 846/5566 [33:39<3:03:52,  2.34s/it]

training loss: 3.5125999450683594


training:  15%|█▌        | 847/5566 [33:41<3:01:22,  2.31s/it]

training loss: 3.537712335586548


training:  15%|█▌        | 848/5566 [33:43<2:59:28,  2.28s/it]

training loss: 3.533766031265259


training:  15%|█▌        | 849/5566 [33:46<2:59:01,  2.28s/it]

training loss: 3.5113039016723633


training:  15%|█▌        | 850/5566 [33:48<2:58:43,  2.27s/it]

training loss: 3.523104667663574


training:  15%|█▌        | 851/5566 [33:50<2:57:32,  2.26s/it]

training loss: 3.5189731121063232


training:  15%|█▌        | 852/5566 [33:52<2:57:00,  2.25s/it]

training loss: 3.5222251415252686


training:  15%|█▌        | 853/5566 [33:55<2:56:06,  2.24s/it]

training loss: 3.5488569736480713


training:  15%|█▌        | 854/5566 [33:57<2:55:21,  2.23s/it]

training loss: 3.5364022254943848


training:  15%|█▌        | 855/5566 [33:59<2:55:04,  2.23s/it]

training loss: 3.5177934169769287


training:  15%|█▌        | 856/5566 [34:01<2:54:42,  2.23s/it]

training loss: 3.514603853225708


training:  15%|█▌        | 857/5566 [34:03<2:54:47,  2.23s/it]

training loss: 3.504486083984375


training:  15%|█▌        | 858/5566 [34:06<2:54:53,  2.23s/it]

training loss: 3.524973154067993


training:  15%|█▌        | 859/5566 [34:08<2:55:22,  2.24s/it]

training loss: 3.4960925579071045


training:  15%|█▌        | 860/5566 [34:10<2:55:25,  2.24s/it]

training loss: 3.5475122928619385
valid loss: 3.546816349029541
perplexity: 34.70266342163086


training:  15%|█▌        | 861/5566 [34:14<3:42:41,  2.84s/it]

training loss: 3.507235288619995


training:  15%|█▌        | 862/5566 [34:17<3:30:08,  2.68s/it]

training loss: 3.5163826942443848


training:  16%|█▌        | 863/5566 [34:19<3:18:25,  2.53s/it]

training loss: 3.521623134613037


training:  16%|█▌        | 864/5566 [34:21<3:10:40,  2.43s/it]

training loss: 3.5226364135742188


training:  16%|█▌        | 865/5566 [34:23<3:05:24,  2.37s/it]

training loss: 3.512221574783325


training:  16%|█▌        | 866/5566 [34:25<3:02:12,  2.33s/it]

training loss: 3.5272305011749268


training:  16%|█▌        | 867/5566 [34:28<2:59:23,  2.29s/it]

training loss: 3.5543060302734375


training:  16%|█▌        | 868/5566 [34:30<2:57:37,  2.27s/it]

training loss: 3.530604124069214


training:  16%|█▌        | 869/5566 [34:32<2:56:25,  2.25s/it]

training loss: 3.5066676139831543


training:  16%|█▌        | 870/5566 [34:34<2:54:51,  2.23s/it]

training loss: 3.539608955383301


training:  16%|█▌        | 871/5566 [34:37<2:55:19,  2.24s/it]

training loss: 3.4940662384033203


training:  16%|█▌        | 872/5566 [34:39<2:55:20,  2.24s/it]

training loss: 3.4923107624053955


training:  16%|█▌        | 873/5566 [34:41<2:55:00,  2.24s/it]

training loss: 3.498643159866333


training:  16%|█▌        | 874/5566 [34:43<2:55:15,  2.24s/it]

training loss: 3.5009212493896484


training:  16%|█▌        | 875/5566 [34:46<3:06:53,  2.39s/it]

training loss: 3.544064521789551


training:  16%|█▌        | 876/5566 [34:49<3:10:25,  2.44s/it]

training loss: 3.529298782348633


training:  16%|█▌        | 877/5566 [34:51<3:05:44,  2.38s/it]

training loss: 3.5134193897247314


training:  16%|█▌        | 878/5566 [34:53<3:01:39,  2.33s/it]

training loss: 3.524074077606201


training:  16%|█▌        | 879/5566 [34:55<2:59:37,  2.30s/it]

training loss: 3.533114194869995


training:  16%|█▌        | 880/5566 [34:57<2:57:54,  2.28s/it]

training loss: 3.5108389854431152
valid loss: 3.5106866359710693
perplexity: 33.47124099731445


training:  16%|█▌        | 881/5566 [35:02<3:42:59,  2.86s/it]

training loss: 3.5113420486450195


training:  16%|█▌        | 882/5566 [35:04<3:27:37,  2.66s/it]

training loss: 3.5422048568725586


training:  16%|█▌        | 883/5566 [35:06<3:17:25,  2.53s/it]

training loss: 3.5183324813842773


training:  16%|█▌        | 884/5566 [35:08<3:10:50,  2.45s/it]

training loss: 3.5069730281829834


training:  16%|█▌        | 885/5566 [35:11<3:04:54,  2.37s/it]

training loss: 3.5194334983825684


training:  16%|█▌        | 886/5566 [35:13<3:00:42,  2.32s/it]

training loss: 3.510719060897827


training:  16%|█▌        | 887/5566 [35:15<2:59:02,  2.30s/it]

training loss: 3.5284228324890137


training:  16%|█▌        | 888/5566 [35:17<2:58:17,  2.29s/it]

training loss: 3.5115420818328857


training:  16%|█▌        | 889/5566 [35:19<2:56:37,  2.27s/it]

training loss: 3.531107187271118


training:  16%|█▌        | 890/5566 [35:22<2:55:41,  2.25s/it]

training loss: 3.51235032081604


training:  16%|█▌        | 891/5566 [35:24<2:54:57,  2.25s/it]

training loss: 3.5315394401550293


training:  16%|█▌        | 892/5566 [35:26<2:54:25,  2.24s/it]

training loss: 3.508580207824707


training:  16%|█▌        | 893/5566 [35:28<2:53:29,  2.23s/it]

training loss: 3.523970365524292


training:  16%|█▌        | 894/5566 [35:31<2:53:16,  2.23s/it]

training loss: 3.5438194274902344


training:  16%|█▌        | 895/5566 [35:33<2:52:58,  2.22s/it]

training loss: 3.5238866806030273


training:  16%|█▌        | 896/5566 [35:35<2:52:38,  2.22s/it]

training loss: 3.5347366333007812


training:  16%|█▌        | 897/5566 [35:37<2:53:26,  2.23s/it]

training loss: 3.521301031112671


training:  16%|█▌        | 898/5566 [35:39<2:53:06,  2.23s/it]

training loss: 3.5314207077026367


training:  16%|█▌        | 899/5566 [35:42<2:52:34,  2.22s/it]

training loss: 3.493108034133911


training:  16%|█▌        | 900/5566 [35:44<2:52:16,  2.22s/it]

training loss: 3.5355656147003174
valid loss: 3.534714698791504
perplexity: 34.28523254394531


training:  16%|█▌        | 901/5566 [35:49<3:57:27,  3.05s/it]

training loss: 3.556734323501587


training:  16%|█▌        | 902/5566 [35:51<3:38:27,  2.81s/it]

training loss: 3.5148565769195557


training:  16%|█▌        | 903/5566 [35:53<3:25:25,  2.64s/it]

training loss: 3.547602415084839


training:  16%|█▌        | 904/5566 [35:56<3:14:31,  2.50s/it]

training loss: 3.5261027812957764


training:  16%|█▋        | 905/5566 [35:58<3:07:16,  2.41s/it]

training loss: 3.52293062210083


training:  16%|█▋        | 906/5566 [36:00<3:02:45,  2.35s/it]

training loss: 3.526083469390869


training:  16%|█▋        | 907/5566 [36:02<2:59:43,  2.31s/it]

training loss: 3.533254861831665


training:  16%|█▋        | 908/5566 [36:04<2:57:14,  2.28s/it]

training loss: 3.528440475463867


training:  16%|█▋        | 909/5566 [36:07<2:56:40,  2.28s/it]

training loss: 3.502939462661743


training:  16%|█▋        | 910/5566 [36:09<2:54:57,  2.25s/it]

training loss: 3.5174808502197266


training:  16%|█▋        | 911/5566 [36:11<2:53:49,  2.24s/it]

training loss: 3.5130488872528076


training:  16%|█▋        | 912/5566 [36:13<2:52:53,  2.23s/it]

training loss: 3.5313432216644287


training:  16%|█▋        | 913/5566 [36:16<2:52:18,  2.22s/it]

training loss: 3.513363838195801


training:  16%|█▋        | 914/5566 [36:18<2:52:40,  2.23s/it]

training loss: 3.5182816982269287


training:  16%|█▋        | 915/5566 [36:20<2:52:52,  2.23s/it]

training loss: 3.5338945388793945


training:  16%|█▋        | 916/5566 [36:22<2:52:04,  2.22s/it]

training loss: 3.500441551208496


training:  16%|█▋        | 917/5566 [36:24<2:52:14,  2.22s/it]

training loss: 3.5314338207244873


training:  16%|█▋        | 918/5566 [36:27<2:51:13,  2.21s/it]

training loss: 3.49589204788208


training:  17%|█▋        | 919/5566 [36:29<2:51:16,  2.21s/it]

training loss: 3.523298501968384


training:  17%|█▋        | 920/5566 [36:31<2:50:40,  2.20s/it]

training loss: 3.5309362411499023
valid loss: 3.530243396759033
perplexity: 34.13227462768555


training:  17%|█▋        | 921/5566 [36:35<3:36:20,  2.79s/it]

training loss: 3.516420602798462


training:  17%|█▋        | 922/5566 [36:37<3:23:17,  2.63s/it]

training loss: 3.5179316997528076


training:  17%|█▋        | 923/5566 [36:40<3:13:32,  2.50s/it]

training loss: 3.5187129974365234


training:  17%|█▋        | 924/5566 [36:42<3:06:55,  2.42s/it]

training loss: 3.5077261924743652


training:  17%|█▋        | 925/5566 [36:44<3:02:43,  2.36s/it]

training loss: 3.5234463214874268


training:  17%|█▋        | 926/5566 [36:46<2:59:25,  2.32s/it]

training loss: 3.5142362117767334


training:  17%|█▋        | 927/5566 [36:49<2:57:07,  2.29s/it]

training loss: 3.529130220413208


training:  17%|█▋        | 928/5566 [36:51<2:56:36,  2.28s/it]

training loss: 3.514087677001953


training:  17%|█▋        | 929/5566 [36:53<2:54:51,  2.26s/it]

training loss: 3.5295703411102295


training:  17%|█▋        | 930/5566 [36:55<2:53:31,  2.25s/it]

training loss: 3.5097811222076416


training:  17%|█▋        | 931/5566 [36:57<2:52:20,  2.23s/it]

training loss: 3.53714656829834


training:  17%|█▋        | 932/5566 [37:00<2:51:50,  2.23s/it]

training loss: 3.502939224243164


training:  17%|█▋        | 933/5566 [37:02<2:51:50,  2.23s/it]

training loss: 3.529271125793457


training:  17%|█▋        | 934/5566 [37:04<2:51:46,  2.23s/it]

training loss: 3.505953550338745


training:  17%|█▋        | 935/5566 [37:06<2:51:34,  2.22s/it]

training loss: 3.4991116523742676


training:  17%|█▋        | 936/5566 [37:08<2:51:19,  2.22s/it]

training loss: 3.537867784500122


training:  17%|█▋        | 937/5566 [37:11<2:51:00,  2.22s/it]

training loss: 3.5138747692108154


training:  17%|█▋        | 938/5566 [37:13<2:50:44,  2.21s/it]

training loss: 3.539616107940674


training:  17%|█▋        | 939/5566 [37:15<2:50:51,  2.22s/it]

training loss: 3.515366792678833


training:  17%|█▋        | 940/5566 [37:17<2:50:39,  2.21s/it]

training loss: 3.537369728088379
valid loss: 3.5364058017730713
perplexity: 34.34326171875


training:  17%|█▋        | 941/5566 [37:21<3:35:51,  2.80s/it]

training loss: 3.516395092010498


training:  17%|█▋        | 942/5566 [37:24<3:22:08,  2.62s/it]

training loss: 3.5137650966644287


training:  17%|█▋        | 943/5566 [37:26<3:12:57,  2.50s/it]

training loss: 3.518646240234375


training:  17%|█▋        | 944/5566 [37:28<3:06:22,  2.42s/it]

training loss: 3.528378486633301


training:  17%|█▋        | 945/5566 [37:30<3:02:02,  2.36s/it]

training loss: 3.5312588214874268


training:  17%|█▋        | 946/5566 [37:33<2:58:46,  2.32s/it]

training loss: 3.533217191696167


training:  17%|█▋        | 947/5566 [37:35<2:56:21,  2.29s/it]

training loss: 3.5243077278137207


training:  17%|█▋        | 948/5566 [37:37<2:54:46,  2.27s/it]

training loss: 3.550029754638672


training:  17%|█▋        | 949/5566 [37:39<2:53:23,  2.25s/it]

training loss: 3.5327565670013428


training:  17%|█▋        | 950/5566 [37:41<2:52:28,  2.24s/it]

training loss: 3.5045840740203857


training:  17%|█▋        | 951/5566 [37:44<2:51:58,  2.24s/it]

training loss: 3.5095560550689697


training:  17%|█▋        | 952/5566 [37:46<2:51:09,  2.23s/it]

training loss: 3.506225824356079


training:  17%|█▋        | 953/5566 [37:48<2:50:21,  2.22s/it]

training loss: 3.5274152755737305


training:  17%|█▋        | 954/5566 [37:50<2:50:35,  2.22s/it]

training loss: 3.5323829650878906


training:  17%|█▋        | 955/5566 [37:53<2:49:59,  2.21s/it]

training loss: 3.5318028926849365


training:  17%|█▋        | 956/5566 [37:55<2:49:49,  2.21s/it]

training loss: 3.5640664100646973


training:  17%|█▋        | 957/5566 [37:57<2:49:47,  2.21s/it]

training loss: 3.5370125770568848


training:  17%|█▋        | 958/5566 [37:59<2:50:29,  2.22s/it]

training loss: 3.4993715286254883


training:  17%|█▋        | 959/5566 [38:02<3:04:01,  2.40s/it]

training loss: 3.534135103225708


training:  17%|█▋        | 960/5566 [38:04<3:00:20,  2.35s/it]

training loss: 3.5277199745178223
valid loss: 3.525991201400757
perplexity: 33.98744583129883


training:  17%|█▋        | 961/5566 [38:08<3:43:19,  2.91s/it]

training loss: 3.519143581390381


training:  17%|█▋        | 962/5566 [38:11<3:27:18,  2.70s/it]

training loss: 3.5260016918182373


training:  17%|█▋        | 963/5566 [38:13<3:16:26,  2.56s/it]

training loss: 3.5513105392456055


training:  17%|█▋        | 964/5566 [38:15<3:08:20,  2.46s/it]

training loss: 3.53222918510437


training:  17%|█▋        | 965/5566 [38:17<3:02:24,  2.38s/it]

training loss: 3.5315942764282227


training:  17%|█▋        | 966/5566 [38:20<2:58:53,  2.33s/it]

training loss: 3.5149269104003906


training:  17%|█▋        | 967/5566 [38:22<2:56:00,  2.30s/it]

training loss: 3.5052759647369385


training:  17%|█▋        | 968/5566 [38:24<2:53:46,  2.27s/it]

training loss: 3.5175509452819824


training:  17%|█▋        | 969/5566 [38:26<2:52:42,  2.25s/it]

training loss: 3.530040740966797


training:  17%|█▋        | 970/5566 [38:28<2:52:23,  2.25s/it]

training loss: 3.5311570167541504


training:  17%|█▋        | 971/5566 [38:31<2:51:16,  2.24s/it]

training loss: 3.5201854705810547


training:  17%|█▋        | 972/5566 [38:33<2:50:35,  2.23s/it]

training loss: 3.529611110687256


training:  17%|█▋        | 973/5566 [38:35<2:50:08,  2.22s/it]

training loss: 3.5180246829986572


training:  17%|█▋        | 974/5566 [38:37<2:50:37,  2.23s/it]

training loss: 3.517730236053467


training:  18%|█▊        | 975/5566 [38:39<2:50:14,  2.22s/it]

training loss: 3.5018911361694336


training:  18%|█▊        | 976/5566 [38:42<2:50:14,  2.23s/it]

training loss: 3.534827709197998


training:  18%|█▊        | 977/5566 [38:44<2:50:26,  2.23s/it]

training loss: 3.497544527053833


training:  18%|█▊        | 978/5566 [38:46<2:50:30,  2.23s/it]

training loss: 3.518672227859497


training:  18%|█▊        | 979/5566 [38:48<2:50:12,  2.23s/it]

training loss: 3.5440568923950195


training:  18%|█▊        | 980/5566 [38:51<2:50:11,  2.23s/it]

training loss: 3.522733211517334
valid loss: 3.5216708183288574
perplexity: 33.84092330932617


training:  18%|█▊        | 981/5566 [38:55<3:34:25,  2.81s/it]

training loss: 3.52972149848938


training:  18%|█▊        | 982/5566 [38:57<3:20:17,  2.62s/it]

training loss: 3.525529146194458


training:  18%|█▊        | 983/5566 [38:59<3:11:00,  2.50s/it]

training loss: 3.5072500705718994


training:  18%|█▊        | 984/5566 [39:01<3:04:19,  2.41s/it]

training loss: 3.524336814880371


training:  18%|█▊        | 985/5566 [39:04<2:59:15,  2.35s/it]

training loss: 3.5236263275146484


training:  18%|█▊        | 986/5566 [39:06<2:56:08,  2.31s/it]

training loss: 3.5248892307281494


training:  18%|█▊        | 987/5566 [39:08<2:54:36,  2.29s/it]

training loss: 3.5263102054595947


training:  18%|█▊        | 988/5566 [39:10<2:52:33,  2.26s/it]

training loss: 3.5405492782592773


training:  18%|█▊        | 989/5566 [39:12<2:51:19,  2.25s/it]

training loss: 3.526672601699829


training:  18%|█▊        | 990/5566 [39:15<2:50:19,  2.23s/it]

training loss: 3.5259571075439453


training:  18%|█▊        | 991/5566 [39:17<2:50:19,  2.23s/it]

training loss: 3.5211338996887207


training:  18%|█▊        | 992/5566 [39:19<2:49:47,  2.23s/it]

training loss: 3.5243980884552


training:  18%|█▊        | 993/5566 [39:21<2:49:16,  2.22s/it]

training loss: 3.5174601078033447


training:  18%|█▊        | 994/5566 [39:24<2:49:08,  2.22s/it]

training loss: 3.528003692626953


training:  18%|█▊        | 995/5566 [39:26<2:48:44,  2.21s/it]

training loss: 3.5656301975250244


training:  18%|█▊        | 996/5566 [39:28<2:49:28,  2.23s/it]

training loss: 3.5194761753082275


training:  18%|█▊        | 997/5566 [39:30<2:49:39,  2.23s/it]

training loss: 3.5221946239471436


training:  18%|█▊        | 998/5566 [39:32<2:49:14,  2.22s/it]

training loss: 3.5315122604370117


training:  18%|█▊        | 999/5566 [39:35<2:48:35,  2.21s/it]

training loss: 3.5085744857788086


training:  18%|█▊        | 1000/5566 [39:37<2:48:05,  2.21s/it]

training loss: 3.5158486366271973
valid loss: 3.5150649547576904
perplexity: 33.61811065673828


training:  18%|█▊        | 1001/5566 [39:42<3:52:21,  3.05s/it]

training loss: 3.5108420848846436


training:  18%|█▊        | 1002/5566 [39:44<3:37:33,  2.86s/it]

training loss: 3.4895637035369873


training:  18%|█▊        | 1003/5566 [39:46<3:22:28,  2.66s/it]

training loss: 3.546320676803589


training:  18%|█▊        | 1004/5566 [39:49<3:13:28,  2.54s/it]

training loss: 3.5183258056640625


training:  18%|█▊        | 1005/5566 [39:51<3:05:36,  2.44s/it]

training loss: 3.526167631149292


training:  18%|█▊        | 1006/5566 [39:53<3:00:30,  2.38s/it]

training loss: 3.499843120574951


training:  18%|█▊        | 1007/5566 [39:55<2:56:30,  2.32s/it]

training loss: 3.5036463737487793


training:  18%|█▊        | 1008/5566 [39:58<2:53:49,  2.29s/it]

training loss: 3.521946668624878


training:  18%|█▊        | 1009/5566 [40:00<2:51:51,  2.26s/it]

training loss: 3.5363247394561768


training:  18%|█▊        | 1010/5566 [40:02<2:51:00,  2.25s/it]

training loss: 3.51332950592041


training:  18%|█▊        | 1011/5566 [40:04<2:50:19,  2.24s/it]

training loss: 3.5177130699157715


training:  18%|█▊        | 1012/5566 [40:06<2:49:26,  2.23s/it]

training loss: 3.515716552734375


training:  18%|█▊        | 1013/5566 [40:09<2:50:04,  2.24s/it]

training loss: 3.524113178253174


training:  18%|█▊        | 1014/5566 [40:11<2:49:10,  2.23s/it]

training loss: 3.5281331539154053


training:  18%|█▊        | 1015/5566 [40:13<2:48:27,  2.22s/it]

training loss: 3.5226268768310547


training:  18%|█▊        | 1016/5566 [40:15<2:47:46,  2.21s/it]

training loss: 3.5310401916503906


training:  18%|█▊        | 1017/5566 [40:17<2:47:43,  2.21s/it]

training loss: 3.5264453887939453


training:  18%|█▊        | 1018/5566 [40:20<2:48:27,  2.22s/it]

training loss: 3.5104739665985107


training:  18%|█▊        | 1019/5566 [40:22<2:47:39,  2.21s/it]

training loss: 3.5447771549224854


training:  18%|█▊        | 1020/5566 [40:24<2:47:51,  2.22s/it]

training loss: 3.5108537673950195
valid loss: 3.5092203617095947
perplexity: 33.42219924926758


training:  18%|█▊        | 1021/5566 [40:28<3:32:44,  2.81s/it]

training loss: 3.5418527126312256


training:  18%|█▊        | 1022/5566 [40:31<3:19:39,  2.64s/it]

training loss: 3.5091004371643066


training:  18%|█▊        | 1023/5566 [40:33<3:10:06,  2.51s/it]

training loss: 3.5235021114349365


training:  18%|█▊        | 1024/5566 [40:35<3:02:36,  2.41s/it]

training loss: 3.5344338417053223


training:  18%|█▊        | 1025/5566 [40:37<2:57:25,  2.34s/it]

training loss: 3.504207134246826


training:  18%|█▊        | 1026/5566 [40:39<2:55:06,  2.31s/it]

training loss: 3.544619560241699


training:  18%|█▊        | 1027/5566 [40:42<2:53:05,  2.29s/it]

training loss: 3.510016441345215


training:  18%|█▊        | 1028/5566 [40:44<2:51:40,  2.27s/it]

training loss: 3.512514591217041


training:  18%|█▊        | 1029/5566 [40:46<2:50:45,  2.26s/it]

training loss: 3.5163605213165283


training:  19%|█▊        | 1030/5566 [40:48<2:50:23,  2.25s/it]

training loss: 3.5176167488098145


training:  19%|█▊        | 1031/5566 [40:51<2:49:51,  2.25s/it]

training loss: 3.511714458465576


training:  19%|█▊        | 1032/5566 [40:53<2:49:30,  2.24s/it]

training loss: 3.5276522636413574


training:  19%|█▊        | 1033/5566 [40:55<2:48:53,  2.24s/it]

training loss: 3.5190064907073975


training:  19%|█▊        | 1034/5566 [40:57<2:48:23,  2.23s/it]

training loss: 3.5384774208068848


training:  19%|█▊        | 1035/5566 [40:59<2:47:41,  2.22s/it]

training loss: 3.510364294052124


training:  19%|█▊        | 1036/5566 [41:02<2:47:26,  2.22s/it]

training loss: 3.5323922634124756


training:  19%|█▊        | 1037/5566 [41:04<2:48:04,  2.23s/it]

training loss: 3.5150437355041504


training:  19%|█▊        | 1038/5566 [41:06<2:48:02,  2.23s/it]

training loss: 3.52302622795105


training:  19%|█▊        | 1039/5566 [41:08<2:47:47,  2.22s/it]

training loss: 3.524402618408203


training:  19%|█▊        | 1040/5566 [41:11<2:48:26,  2.23s/it]

training loss: 3.5212745666503906
valid loss: 3.520918369293213
perplexity: 33.815467834472656


training:  19%|█▊        | 1041/5566 [41:15<3:34:17,  2.84s/it]

training loss: 3.4967715740203857


training:  19%|█▊        | 1042/5566 [41:17<3:26:26,  2.74s/it]

training loss: 3.532261610031128


training:  19%|█▊        | 1043/5566 [41:20<3:27:41,  2.76s/it]

training loss: 3.5101351737976074


training:  19%|█▉        | 1044/5566 [41:22<3:16:19,  2.60s/it]

training loss: 3.5205814838409424


training:  19%|█▉        | 1045/5566 [41:25<3:07:54,  2.49s/it]

training loss: 3.533461332321167


training:  19%|█▉        | 1046/5566 [41:27<3:01:39,  2.41s/it]

training loss: 3.5379669666290283


training:  19%|█▉        | 1047/5566 [41:29<2:58:02,  2.36s/it]

training loss: 3.545510768890381


training:  19%|█▉        | 1048/5566 [41:31<2:55:17,  2.33s/it]

training loss: 3.5387680530548096


training:  19%|█▉        | 1049/5566 [41:34<2:53:09,  2.30s/it]

training loss: 3.514158248901367


training:  19%|█▉        | 1050/5566 [41:36<2:51:24,  2.28s/it]

training loss: 3.5017693042755127


training:  19%|█▉        | 1051/5566 [41:38<2:50:38,  2.27s/it]

training loss: 3.5049428939819336


training:  19%|█▉        | 1052/5566 [41:40<2:50:05,  2.26s/it]

training loss: 3.532076597213745


training:  19%|█▉        | 1053/5566 [41:43<2:49:22,  2.25s/it]

training loss: 3.5252254009246826


training:  19%|█▉        | 1054/5566 [41:45<2:49:50,  2.26s/it]

training loss: 3.5321738719940186


training:  19%|█▉        | 1055/5566 [41:47<2:49:39,  2.26s/it]

training loss: 3.5203680992126465


training:  19%|█▉        | 1056/5566 [41:49<2:48:50,  2.25s/it]

training loss: 3.533656120300293


training:  19%|█▉        | 1057/5566 [41:52<2:48:41,  2.24s/it]

training loss: 3.5164403915405273


training:  19%|█▉        | 1058/5566 [41:54<2:49:32,  2.26s/it]

training loss: 3.509126663208008


training:  19%|█▉        | 1059/5566 [41:56<2:49:13,  2.25s/it]

training loss: 3.5330970287323


training:  19%|█▉        | 1060/5566 [41:58<2:48:54,  2.25s/it]

training loss: 3.518716335296631
valid loss: 3.5189573764801025
perplexity: 33.74922561645508


training:  19%|█▉        | 1061/5566 [42:03<3:35:36,  2.87s/it]

training loss: 3.5208184719085693


training:  19%|█▉        | 1062/5566 [42:05<3:23:12,  2.71s/it]

training loss: 3.5284950733184814


training:  19%|█▉        | 1063/5566 [42:07<3:13:20,  2.58s/it]

training loss: 3.5105140209198


training:  19%|█▉        | 1064/5566 [42:09<3:06:01,  2.48s/it]

training loss: 3.5415194034576416


training:  19%|█▉        | 1065/5566 [42:12<3:01:12,  2.42s/it]

training loss: 3.530951738357544


training:  19%|█▉        | 1066/5566 [42:14<2:58:11,  2.38s/it]

training loss: 3.49446439743042


training:  19%|█▉        | 1067/5566 [42:16<2:56:14,  2.35s/it]

training loss: 3.509828805923462


training:  19%|█▉        | 1068/5566 [42:19<2:54:43,  2.33s/it]

training loss: 3.543775796890259


training:  19%|█▉        | 1069/5566 [42:21<2:53:01,  2.31s/it]

training loss: 3.532926082611084


training:  19%|█▉        | 1070/5566 [42:23<2:51:36,  2.29s/it]

training loss: 3.4996490478515625


training:  19%|█▉        | 1071/5566 [42:25<2:51:03,  2.28s/it]

training loss: 3.5247325897216797


training:  19%|█▉        | 1072/5566 [42:28<2:50:32,  2.28s/it]

training loss: 3.545159101486206


training:  19%|█▉        | 1073/5566 [42:30<2:51:20,  2.29s/it]

training loss: 3.5272278785705566


training:  19%|█▉        | 1074/5566 [42:32<2:49:49,  2.27s/it]

training loss: 3.5114564895629883


training:  19%|█▉        | 1075/5566 [42:34<2:49:00,  2.26s/it]

training loss: 3.50799298286438


training:  19%|█▉        | 1076/5566 [42:37<2:49:02,  2.26s/it]

training loss: 3.539180040359497


training:  19%|█▉        | 1077/5566 [42:39<2:48:54,  2.26s/it]

training loss: 3.5045125484466553


training:  19%|█▉        | 1078/5566 [42:41<2:49:27,  2.27s/it]

training loss: 3.5246739387512207


training:  19%|█▉        | 1079/5566 [42:43<2:48:39,  2.26s/it]

training loss: 3.5400657653808594


training:  19%|█▉        | 1080/5566 [42:46<2:48:00,  2.25s/it]

training loss: 3.523733377456665
valid loss: 3.522951126098633
perplexity: 33.88427734375


training:  19%|█▉        | 1081/5566 [42:50<3:33:37,  2.86s/it]

training loss: 3.5311901569366455


training:  19%|█▉        | 1082/5566 [42:52<3:21:06,  2.69s/it]

training loss: 3.511810064315796


training:  19%|█▉        | 1083/5566 [42:55<3:11:42,  2.57s/it]

training loss: 3.53757381439209


training:  19%|█▉        | 1084/5566 [42:57<3:04:06,  2.46s/it]

training loss: 3.5157597064971924


training:  19%|█▉        | 1085/5566 [42:59<2:59:35,  2.40s/it]

training loss: 3.5353801250457764


training:  20%|█▉        | 1086/5566 [43:01<2:55:49,  2.35s/it]

training loss: 3.517280101776123


training:  20%|█▉        | 1087/5566 [43:03<2:53:26,  2.32s/it]

training loss: 3.5271425247192383


training:  20%|█▉        | 1088/5566 [43:06<2:51:06,  2.29s/it]

training loss: 3.5164167881011963


training:  20%|█▉        | 1089/5566 [43:08<2:49:58,  2.28s/it]

training loss: 3.483120918273926


training:  20%|█▉        | 1090/5566 [43:10<2:49:59,  2.28s/it]

training loss: 3.505991220474243


training:  20%|█▉        | 1091/5566 [43:12<2:49:21,  2.27s/it]

training loss: 3.518582820892334


training:  20%|█▉        | 1092/5566 [43:15<2:48:40,  2.26s/it]

training loss: 3.528074264526367


training:  20%|█▉        | 1093/5566 [43:17<2:48:39,  2.26s/it]

training loss: 3.537015438079834


training:  20%|█▉        | 1094/5566 [43:19<2:48:41,  2.26s/it]

training loss: 3.5032482147216797


training:  20%|█▉        | 1095/5566 [43:22<2:48:13,  2.26s/it]

training loss: 3.5367319583892822


training:  20%|█▉        | 1096/5566 [43:24<2:47:42,  2.25s/it]

training loss: 3.5220510959625244


training:  20%|█▉        | 1097/5566 [43:26<2:47:56,  2.25s/it]

training loss: 3.534583568572998


training:  20%|█▉        | 1098/5566 [43:28<2:47:08,  2.24s/it]

training loss: 3.5320169925689697


training:  20%|█▉        | 1099/5566 [43:31<2:48:19,  2.26s/it]

training loss: 3.506082773208618


training:  20%|█▉        | 1100/5566 [43:33<2:47:59,  2.26s/it]

training loss: 3.5160677433013916
valid loss: 3.5156607627868652
perplexity: 33.638145446777344


training:  20%|█▉        | 1101/5566 [43:38<3:49:51,  3.09s/it]

training loss: 3.512120485305786


training:  20%|█▉        | 1102/5566 [43:40<3:33:33,  2.87s/it]

training loss: 3.489920139312744


training:  20%|█▉        | 1103/5566 [43:42<3:19:07,  2.68s/it]

training loss: 3.553040027618408


training:  20%|█▉        | 1104/5566 [43:45<3:09:18,  2.55s/it]

training loss: 3.521151304244995


training:  20%|█▉        | 1105/5566 [43:47<3:02:18,  2.45s/it]

training loss: 3.492738962173462


training:  20%|█▉        | 1106/5566 [43:49<2:57:44,  2.39s/it]

training loss: 3.528425693511963


training:  20%|█▉        | 1107/5566 [43:51<2:55:13,  2.36s/it]

training loss: 3.53132700920105


training:  20%|█▉        | 1108/5566 [43:54<2:53:03,  2.33s/it]

training loss: 3.524623155593872


training:  20%|█▉        | 1109/5566 [43:56<2:51:26,  2.31s/it]

training loss: 3.531071186065674


training:  20%|█▉        | 1110/5566 [43:58<2:49:23,  2.28s/it]

training loss: 3.5429813861846924


training:  20%|█▉        | 1111/5566 [44:00<2:48:21,  2.27s/it]

training loss: 3.507615804672241


training:  20%|█▉        | 1112/5566 [44:03<2:47:50,  2.26s/it]

training loss: 3.543365240097046


training:  20%|█▉        | 1113/5566 [44:05<2:48:04,  2.26s/it]

training loss: 3.5261495113372803


training:  20%|██        | 1114/5566 [44:07<2:48:41,  2.27s/it]

training loss: 3.5168731212615967


training:  20%|██        | 1115/5566 [44:09<2:48:17,  2.27s/it]

training loss: 3.499098300933838


training:  20%|██        | 1116/5566 [44:12<2:48:14,  2.27s/it]

training loss: 3.534869432449341


training:  20%|██        | 1117/5566 [44:14<2:48:04,  2.27s/it]

training loss: 3.5190062522888184


training:  20%|██        | 1118/5566 [44:16<2:47:29,  2.26s/it]

training loss: 3.5145585536956787


training:  20%|██        | 1119/5566 [44:18<2:46:45,  2.25s/it]

training loss: 3.4990057945251465


training:  20%|██        | 1120/5566 [44:21<2:46:09,  2.24s/it]

training loss: 3.503507375717163
valid loss: 3.5024912357330322
perplexity: 33.19805145263672


training:  20%|██        | 1121/5566 [44:25<3:30:03,  2.84s/it]

training loss: 3.5091874599456787


training:  20%|██        | 1122/5566 [44:27<3:17:08,  2.66s/it]

training loss: 3.5216336250305176


training:  20%|██        | 1123/5566 [44:29<3:08:10,  2.54s/it]

training loss: 3.5514838695526123


training:  20%|██        | 1124/5566 [44:32<3:01:04,  2.45s/it]

training loss: 3.52333927154541


training:  20%|██        | 1125/5566 [44:34<2:57:13,  2.39s/it]

training loss: 3.51725172996521


training:  20%|██        | 1126/5566 [44:37<3:07:41,  2.54s/it]

training loss: 3.5305402278900146


training:  20%|██        | 1127/5566 [44:39<3:06:39,  2.52s/it]

training loss: 3.5528180599212646


training:  20%|██        | 1128/5566 [44:42<3:00:57,  2.45s/it]

training loss: 3.5159454345703125


training:  20%|██        | 1129/5566 [44:44<2:56:31,  2.39s/it]

training loss: 3.4868249893188477


training:  20%|██        | 1130/5566 [44:46<2:54:00,  2.35s/it]

training loss: 3.5192770957946777


training:  20%|██        | 1131/5566 [44:48<2:52:16,  2.33s/it]

training loss: 3.4942731857299805


training:  20%|██        | 1132/5566 [44:51<2:50:32,  2.31s/it]

training loss: 3.544872760772705


training:  20%|██        | 1133/5566 [44:53<2:49:11,  2.29s/it]

training loss: 3.551483392715454


training:  20%|██        | 1134/5566 [44:55<2:48:48,  2.29s/it]

training loss: 3.499922037124634


training:  20%|██        | 1135/5566 [44:57<2:48:20,  2.28s/it]

training loss: 3.5120675563812256


training:  20%|██        | 1136/5566 [45:00<2:47:54,  2.27s/it]

training loss: 3.528972625732422


training:  20%|██        | 1137/5566 [45:02<2:46:27,  2.25s/it]

training loss: 3.5143723487854004


training:  20%|██        | 1138/5566 [45:04<2:46:19,  2.25s/it]

training loss: 3.506455659866333


training:  20%|██        | 1139/5566 [45:06<2:47:01,  2.26s/it]

training loss: 3.518399715423584


training:  20%|██        | 1140/5566 [45:09<2:46:48,  2.26s/it]

training loss: 3.5240297317504883
valid loss: 3.5235519409179688
perplexity: 33.90464401245117


training:  20%|██        | 1141/5566 [45:13<3:30:51,  2.86s/it]

training loss: 3.5268545150756836


training:  21%|██        | 1142/5566 [45:15<3:17:05,  2.67s/it]

training loss: 3.5109071731567383


training:  21%|██        | 1143/5566 [45:17<3:07:01,  2.54s/it]

training loss: 3.5183515548706055


training:  21%|██        | 1144/5566 [45:20<2:59:44,  2.44s/it]

training loss: 3.529790163040161


training:  21%|██        | 1145/5566 [45:22<2:55:04,  2.38s/it]

training loss: 3.526646852493286


training:  21%|██        | 1146/5566 [45:24<2:52:16,  2.34s/it]

training loss: 3.5185489654541016


training:  21%|██        | 1147/5566 [45:26<2:51:00,  2.32s/it]

training loss: 3.5401039123535156


training:  21%|██        | 1148/5566 [45:29<2:48:59,  2.29s/it]

training loss: 3.508169174194336


training:  21%|██        | 1149/5566 [45:31<2:48:29,  2.29s/it]

training loss: 3.531735897064209


training:  21%|██        | 1150/5566 [45:33<2:47:32,  2.28s/it]

training loss: 3.5283689498901367


training:  21%|██        | 1151/5566 [45:35<2:46:40,  2.27s/it]

training loss: 3.5202252864837646


training:  21%|██        | 1152/5566 [45:38<2:46:16,  2.26s/it]

training loss: 3.5321264266967773


training:  21%|██        | 1153/5566 [45:40<2:45:56,  2.26s/it]

training loss: 3.524538993835449


training:  21%|██        | 1154/5566 [45:42<2:46:27,  2.26s/it]

training loss: 3.5111968517303467


training:  21%|██        | 1155/5566 [45:44<2:46:07,  2.26s/it]

training loss: 3.532252788543701


training:  21%|██        | 1156/5566 [45:47<2:45:27,  2.25s/it]

training loss: 3.5161244869232178


training:  21%|██        | 1157/5566 [45:49<2:44:55,  2.24s/it]

training loss: 3.5166354179382324


training:  21%|██        | 1158/5566 [45:51<2:44:42,  2.24s/it]

training loss: 3.511737823486328


training:  21%|██        | 1159/5566 [45:53<2:45:22,  2.25s/it]

training loss: 3.5236542224884033


training:  21%|██        | 1160/5566 [45:56<2:45:16,  2.25s/it]

training loss: 3.5392303466796875
valid loss: 3.5386438369750977
perplexity: 34.42020797729492


training:  21%|██        | 1161/5566 [46:00<3:29:50,  2.86s/it]

training loss: 3.5315637588500977


training:  21%|██        | 1162/5566 [46:02<3:16:58,  2.68s/it]

training loss: 3.544820547103882


training:  21%|██        | 1163/5566 [46:04<3:07:49,  2.56s/it]

training loss: 3.534675121307373


training:  21%|██        | 1164/5566 [46:07<3:01:18,  2.47s/it]

training loss: 3.525611400604248


training:  21%|██        | 1165/5566 [46:09<2:56:14,  2.40s/it]

training loss: 3.5155444145202637


training:  21%|██        | 1166/5566 [46:11<2:53:06,  2.36s/it]

training loss: 3.526451826095581


training:  21%|██        | 1167/5566 [46:13<2:50:44,  2.33s/it]

training loss: 3.5058095455169678


training:  21%|██        | 1168/5566 [46:16<2:48:55,  2.30s/it]

training loss: 3.522258758544922


training:  21%|██        | 1169/5566 [46:18<2:47:24,  2.28s/it]

training loss: 3.5177133083343506


training:  21%|██        | 1170/5566 [46:20<2:46:04,  2.27s/it]

training loss: 3.5335001945495605


training:  21%|██        | 1171/5566 [46:22<2:45:27,  2.26s/it]

training loss: 3.5527455806732178


training:  21%|██        | 1172/5566 [46:25<2:44:34,  2.25s/it]

training loss: 3.5173962116241455


training:  21%|██        | 1173/5566 [46:27<2:44:16,  2.24s/it]

training loss: 3.5364902019500732


training:  21%|██        | 1174/5566 [46:29<2:43:57,  2.24s/it]

training loss: 3.5207674503326416


training:  21%|██        | 1175/5566 [46:31<2:44:00,  2.24s/it]

training loss: 3.5203704833984375


training:  21%|██        | 1176/5566 [46:34<2:45:35,  2.26s/it]

training loss: 3.5341644287109375


training:  21%|██        | 1177/5566 [46:36<2:45:05,  2.26s/it]

training loss: 3.545711040496826


training:  21%|██        | 1178/5566 [46:38<2:44:10,  2.24s/it]

training loss: 3.5071678161621094


training:  21%|██        | 1179/5566 [46:40<2:43:35,  2.24s/it]

training loss: 3.5091028213500977


training:  21%|██        | 1180/5566 [46:43<2:43:44,  2.24s/it]

training loss: 3.5048861503601074
valid loss: 3.5035221576690674
perplexity: 33.23229217529297


training:  21%|██        | 1181/5566 [46:47<3:28:10,  2.85s/it]

training loss: 3.514357089996338


training:  21%|██        | 1182/5566 [46:49<3:15:31,  2.68s/it]

training loss: 3.563021183013916


training:  21%|██▏       | 1183/5566 [46:51<3:05:45,  2.54s/it]

training loss: 3.5017175674438477


training:  21%|██▏       | 1184/5566 [46:54<2:59:29,  2.46s/it]

training loss: 3.5312583446502686


training:  21%|██▏       | 1185/5566 [46:56<2:54:19,  2.39s/it]

training loss: 3.5015573501586914


training:  21%|██▏       | 1186/5566 [46:58<2:50:22,  2.33s/it]

training loss: 3.5152313709259033


training:  21%|██▏       | 1187/5566 [47:00<2:48:54,  2.31s/it]

training loss: 3.5183024406433105


training:  21%|██▏       | 1188/5566 [47:03<2:47:26,  2.29s/it]

training loss: 3.540168046951294


training:  21%|██▏       | 1189/5566 [47:05<2:47:40,  2.30s/it]

training loss: 3.534579277038574


training:  21%|██▏       | 1190/5566 [47:07<2:46:26,  2.28s/it]

training loss: 3.537889003753662


training:  21%|██▏       | 1191/5566 [47:09<2:45:27,  2.27s/it]

training loss: 3.5382206439971924


training:  21%|██▏       | 1192/5566 [47:12<2:44:23,  2.26s/it]

training loss: 3.524014711380005


training:  21%|██▏       | 1193/5566 [47:14<2:43:56,  2.25s/it]

training loss: 3.5202784538269043


training:  21%|██▏       | 1194/5566 [47:16<2:43:28,  2.24s/it]

training loss: 3.5101590156555176


training:  21%|██▏       | 1195/5566 [47:18<2:43:15,  2.24s/it]

training loss: 3.528374671936035


training:  21%|██▏       | 1196/5566 [47:20<2:42:46,  2.23s/it]

training loss: 3.5419681072235107


training:  22%|██▏       | 1197/5566 [47:23<2:42:30,  2.23s/it]

training loss: 3.51288104057312


training:  22%|██▏       | 1198/5566 [47:25<2:42:35,  2.23s/it]

training loss: 3.529766798019409


training:  22%|██▏       | 1199/5566 [47:27<2:42:47,  2.24s/it]

training loss: 3.5168607234954834


training:  22%|██▏       | 1200/5566 [47:29<2:42:33,  2.23s/it]

training loss: 3.505974769592285
valid loss: 3.5046019554138184
perplexity: 33.2681999206543


training:  22%|██▏       | 1201/5566 [47:34<3:43:41,  3.07s/it]

training loss: 3.5279550552368164


training:  22%|██▏       | 1202/5566 [47:37<3:27:59,  2.86s/it]

training loss: 3.5532798767089844


training:  22%|██▏       | 1203/5566 [47:39<3:14:48,  2.68s/it]

training loss: 3.5146543979644775


training:  22%|██▏       | 1204/5566 [47:41<3:05:09,  2.55s/it]

training loss: 3.526841878890991


training:  22%|██▏       | 1205/5566 [47:44<2:58:30,  2.46s/it]

training loss: 3.557929039001465


training:  22%|██▏       | 1206/5566 [47:46<2:53:20,  2.39s/it]

training loss: 3.5302555561065674


training:  22%|██▏       | 1207/5566 [47:48<2:49:42,  2.34s/it]

training loss: 3.507761240005493


training:  22%|██▏       | 1208/5566 [47:50<2:47:21,  2.30s/it]

training loss: 3.517970323562622


training:  22%|██▏       | 1209/5566 [47:52<2:45:35,  2.28s/it]

training loss: 3.516035795211792


training:  22%|██▏       | 1210/5566 [47:55<2:52:41,  2.38s/it]

training loss: 3.504270553588867


training:  22%|██▏       | 1211/5566 [47:58<3:00:53,  2.49s/it]

training loss: 3.5315051078796387


training:  22%|██▏       | 1212/5566 [48:00<2:55:09,  2.41s/it]

training loss: 3.5321178436279297


training:  22%|██▏       | 1213/5566 [48:02<2:51:35,  2.37s/it]

training loss: 3.5162932872772217


training:  22%|██▏       | 1214/5566 [48:05<2:49:13,  2.33s/it]

training loss: 3.530606746673584


training:  22%|██▏       | 1215/5566 [48:07<2:47:32,  2.31s/it]

training loss: 3.515314817428589


training:  22%|██▏       | 1216/5566 [48:09<2:47:02,  2.30s/it]

training loss: 3.536282539367676


training:  22%|██▏       | 1217/5566 [48:11<2:45:35,  2.28s/it]

training loss: 3.5448925495147705


training:  22%|██▏       | 1218/5566 [48:14<2:44:50,  2.27s/it]

training loss: 3.5511562824249268


training:  22%|██▏       | 1219/5566 [48:16<2:43:37,  2.26s/it]

training loss: 3.517378568649292


training:  22%|██▏       | 1220/5566 [48:18<2:43:04,  2.25s/it]

training loss: 3.509641408920288
valid loss: 3.50929594039917
perplexity: 33.42472839355469


training:  22%|██▏       | 1221/5566 [48:22<3:25:29,  2.84s/it]

training loss: 3.5586907863616943


training:  22%|██▏       | 1222/5566 [48:24<3:11:56,  2.65s/it]

training loss: 3.5178096294403076


training:  22%|██▏       | 1223/5566 [48:27<3:02:24,  2.52s/it]

training loss: 3.5058107376098633


training:  22%|██▏       | 1224/5566 [48:29<2:55:59,  2.43s/it]

training loss: 3.5180864334106445


training:  22%|██▏       | 1225/5566 [48:31<2:52:45,  2.39s/it]

training loss: 3.5095880031585693


training:  22%|██▏       | 1226/5566 [48:33<2:49:32,  2.34s/it]

training loss: 3.5316970348358154


training:  22%|██▏       | 1227/5566 [48:36<2:47:05,  2.31s/it]

training loss: 3.5598373413085938


training:  22%|██▏       | 1228/5566 [48:38<2:45:34,  2.29s/it]

training loss: 3.537264585494995


training:  22%|██▏       | 1229/5566 [48:40<2:44:09,  2.27s/it]

training loss: 3.5016438961029053


training:  22%|██▏       | 1230/5566 [48:42<2:43:19,  2.26s/it]

training loss: 3.508877992630005


training:  22%|██▏       | 1231/5566 [48:45<2:43:26,  2.26s/it]

training loss: 3.520596981048584


training:  22%|██▏       | 1232/5566 [48:47<2:43:05,  2.26s/it]

training loss: 3.506532669067383


training:  22%|██▏       | 1233/5566 [48:49<2:42:50,  2.25s/it]

training loss: 3.5146989822387695


training:  22%|██▏       | 1234/5566 [48:51<2:42:49,  2.26s/it]

training loss: 3.536581516265869


training:  22%|██▏       | 1235/5566 [48:54<2:42:50,  2.26s/it]

training loss: 3.5321712493896484


training:  22%|██▏       | 1236/5566 [48:56<2:42:53,  2.26s/it]

training loss: 3.5074172019958496


training:  22%|██▏       | 1237/5566 [48:58<2:42:24,  2.25s/it]

training loss: 3.510903835296631


training:  22%|██▏       | 1238/5566 [49:00<2:42:31,  2.25s/it]

training loss: 3.5232343673706055


training:  22%|██▏       | 1239/5566 [49:03<2:42:55,  2.26s/it]

training loss: 3.5187461376190186


training:  22%|██▏       | 1240/5566 [49:05<2:42:36,  2.26s/it]

training loss: 3.50679612159729
valid loss: 3.505801200866699
perplexity: 33.30812072753906


training:  22%|██▏       | 1241/5566 [49:09<3:25:44,  2.85s/it]

training loss: 3.5307369232177734


training:  22%|██▏       | 1242/5566 [49:11<3:12:35,  2.67s/it]

training loss: 3.5452637672424316


training:  22%|██▏       | 1243/5566 [49:14<3:03:42,  2.55s/it]

training loss: 3.524848222732544


training:  22%|██▏       | 1244/5566 [49:16<2:57:36,  2.47s/it]

training loss: 3.511214256286621


training:  22%|██▏       | 1245/5566 [49:18<2:52:41,  2.40s/it]

training loss: 3.5195388793945312


training:  22%|██▏       | 1246/5566 [49:20<2:48:58,  2.35s/it]

training loss: 3.520380973815918


training:  22%|██▏       | 1247/5566 [49:23<2:47:02,  2.32s/it]

training loss: 3.533047914505005


training:  22%|██▏       | 1248/5566 [49:25<2:45:22,  2.30s/it]

training loss: 3.533341884613037


training:  22%|██▏       | 1249/5566 [49:27<2:44:08,  2.28s/it]

training loss: 3.5270326137542725


training:  22%|██▏       | 1250/5566 [49:29<2:43:33,  2.27s/it]

training loss: 3.522902011871338


training:  22%|██▏       | 1251/5566 [49:32<2:42:47,  2.26s/it]

training loss: 3.510714292526245


training:  22%|██▏       | 1252/5566 [49:34<2:42:26,  2.26s/it]

training loss: 3.5316519737243652


training:  23%|██▎       | 1253/5566 [49:36<2:42:40,  2.26s/it]

training loss: 3.5044000148773193


training:  23%|██▎       | 1254/5566 [49:38<2:42:50,  2.27s/it]

training loss: 3.5113158226013184


training:  23%|██▎       | 1255/5566 [49:41<2:42:19,  2.26s/it]

training loss: 3.502767562866211


training:  23%|██▎       | 1256/5566 [49:43<2:42:18,  2.26s/it]

training loss: 3.5283026695251465


training:  23%|██▎       | 1257/5566 [49:45<2:41:52,  2.25s/it]

training loss: 3.528331756591797


training:  23%|██▎       | 1258/5566 [49:47<2:41:35,  2.25s/it]

training loss: 3.552481174468994


training:  23%|██▎       | 1259/5566 [49:50<2:41:16,  2.25s/it]

training loss: 3.522486925125122


training:  23%|██▎       | 1260/5566 [49:52<2:41:00,  2.24s/it]

training loss: 3.516289234161377
valid loss: 3.51526141166687
perplexity: 33.62471389770508


training:  23%|██▎       | 1261/5566 [49:56<3:23:19,  2.83s/it]

training loss: 3.521409273147583


training:  23%|██▎       | 1262/5566 [49:58<3:10:10,  2.65s/it]

training loss: 3.5189709663391113


training:  23%|██▎       | 1263/5566 [50:01<3:01:00,  2.52s/it]

training loss: 3.5290539264678955


training:  23%|██▎       | 1264/5566 [50:03<2:55:22,  2.45s/it]

training loss: 3.513612747192383


training:  23%|██▎       | 1265/5566 [50:05<2:51:19,  2.39s/it]

training loss: 3.510772466659546


training:  23%|██▎       | 1266/5566 [50:07<2:47:41,  2.34s/it]

training loss: 3.5002496242523193


training:  23%|██▎       | 1267/5566 [50:10<2:45:22,  2.31s/it]

training loss: 3.508450508117676


training:  23%|██▎       | 1268/5566 [50:12<2:44:01,  2.29s/it]

training loss: 3.5164291858673096


training:  23%|██▎       | 1269/5566 [50:14<2:43:27,  2.28s/it]

training loss: 3.554170608520508


training:  23%|██▎       | 1270/5566 [50:16<2:42:18,  2.27s/it]

training loss: 3.5103201866149902


training:  23%|██▎       | 1271/5566 [50:18<2:41:33,  2.26s/it]

training loss: 3.529282808303833


training:  23%|██▎       | 1272/5566 [50:21<2:41:31,  2.26s/it]

training loss: 3.5428807735443115


training:  23%|██▎       | 1273/5566 [50:23<2:41:12,  2.25s/it]

training loss: 3.5350449085235596


training:  23%|██▎       | 1274/5566 [50:25<2:40:59,  2.25s/it]

training loss: 3.534226894378662


training:  23%|██▎       | 1275/5566 [50:27<2:40:22,  2.24s/it]

training loss: 3.5092408657073975


training:  23%|██▎       | 1276/5566 [50:30<2:40:03,  2.24s/it]

training loss: 3.5414068698883057


training:  23%|██▎       | 1277/5566 [50:32<2:40:24,  2.24s/it]

training loss: 3.516979694366455


training:  23%|██▎       | 1278/5566 [50:34<2:40:07,  2.24s/it]

training loss: 3.5198683738708496


training:  23%|██▎       | 1279/5566 [50:36<2:40:04,  2.24s/it]

training loss: 3.547257423400879


training:  23%|██▎       | 1280/5566 [50:39<2:39:24,  2.23s/it]

training loss: 3.5373969078063965
valid loss: 3.5361998081207275
perplexity: 34.336185455322266


training:  23%|██▎       | 1281/5566 [50:43<3:22:05,  2.83s/it]

training loss: 3.518664598464966


training:  23%|██▎       | 1282/5566 [50:45<3:10:21,  2.67s/it]

training loss: 3.5397355556488037


training:  23%|██▎       | 1283/5566 [50:47<3:01:24,  2.54s/it]

training loss: 3.546684980392456


training:  23%|██▎       | 1284/5566 [50:50<2:55:17,  2.46s/it]

training loss: 3.5313892364501953


training:  23%|██▎       | 1285/5566 [50:52<2:51:13,  2.40s/it]

training loss: 3.5343215465545654


training:  23%|██▎       | 1286/5566 [50:54<2:48:20,  2.36s/it]

training loss: 3.53128719329834


training:  23%|██▎       | 1287/5566 [50:56<2:46:06,  2.33s/it]

training loss: 3.5220255851745605


training:  23%|██▎       | 1288/5566 [50:59<2:44:18,  2.30s/it]

training loss: 3.5521602630615234


training:  23%|██▎       | 1289/5566 [51:01<2:42:34,  2.28s/it]

training loss: 3.521789073944092


training:  23%|██▎       | 1290/5566 [51:03<2:42:33,  2.28s/it]

training loss: 3.514843225479126


training:  23%|██▎       | 1291/5566 [51:05<2:41:43,  2.27s/it]

training loss: 3.5356578826904297


training:  23%|██▎       | 1292/5566 [51:08<2:41:06,  2.26s/it]

training loss: 3.5016558170318604


training:  23%|██▎       | 1293/5566 [51:10<2:40:34,  2.25s/it]

training loss: 3.5277957916259766


training:  23%|██▎       | 1294/5566 [51:12<2:40:46,  2.26s/it]

training loss: 3.5228798389434814


training:  23%|██▎       | 1295/5566 [51:15<2:54:20,  2.45s/it]

training loss: 3.5155744552612305


training:  23%|██▎       | 1296/5566 [51:17<2:51:52,  2.42s/it]

training loss: 3.5238964557647705


training:  23%|██▎       | 1297/5566 [51:20<2:48:13,  2.36s/it]

training loss: 3.5273008346557617


training:  23%|██▎       | 1298/5566 [51:22<2:45:51,  2.33s/it]

training loss: 3.513045310974121


training:  23%|██▎       | 1299/5566 [51:24<2:44:33,  2.31s/it]

training loss: 3.5436131954193115


training:  23%|██▎       | 1300/5566 [51:26<2:42:52,  2.29s/it]

training loss: 3.5152511596679688
valid loss: 3.5148119926452637
perplexity: 33.6096076965332


training:  23%|██▎       | 1301/5566 [51:31<3:39:43,  3.09s/it]

training loss: 3.510826826095581


training:  23%|██▎       | 1302/5566 [51:34<3:25:42,  2.89s/it]

training loss: 3.505354881286621


training:  23%|██▎       | 1303/5566 [51:36<3:11:30,  2.70s/it]

training loss: 3.524681568145752


training:  23%|██▎       | 1304/5566 [51:38<3:01:36,  2.56s/it]

training loss: 3.5149712562561035


training:  23%|██▎       | 1305/5566 [51:41<2:55:10,  2.47s/it]

training loss: 3.5170400142669678


training:  23%|██▎       | 1306/5566 [51:43<2:49:48,  2.39s/it]

training loss: 3.5564136505126953


training:  23%|██▎       | 1307/5566 [51:45<2:46:38,  2.35s/it]

training loss: 3.5201728343963623


training:  23%|██▎       | 1308/5566 [51:47<2:44:50,  2.32s/it]

training loss: 3.532057285308838


training:  24%|██▎       | 1309/5566 [51:50<2:43:09,  2.30s/it]

training loss: 3.527658462524414


training:  24%|██▎       | 1310/5566 [51:52<2:42:00,  2.28s/it]

training loss: 3.5143871307373047


training:  24%|██▎       | 1311/5566 [51:54<2:41:22,  2.28s/it]

training loss: 3.525465250015259


training:  24%|██▎       | 1312/5566 [51:56<2:41:38,  2.28s/it]

training loss: 3.522244691848755


training:  24%|██▎       | 1313/5566 [51:59<2:40:47,  2.27s/it]

training loss: 3.510775089263916


training:  24%|██▎       | 1314/5566 [52:01<2:40:18,  2.26s/it]

training loss: 3.4902515411376953


training:  24%|██▎       | 1315/5566 [52:03<2:40:15,  2.26s/it]

training loss: 3.523312568664551


training:  24%|██▎       | 1316/5566 [52:05<2:39:50,  2.26s/it]

training loss: 3.5233051776885986


training:  24%|██▎       | 1317/5566 [52:08<2:40:42,  2.27s/it]

training loss: 3.516615390777588


training:  24%|██▎       | 1318/5566 [52:10<2:40:24,  2.27s/it]

training loss: 3.5306754112243652


training:  24%|██▎       | 1319/5566 [52:12<2:40:14,  2.26s/it]

training loss: 3.5169148445129395


training:  24%|██▎       | 1320/5566 [52:14<2:39:48,  2.26s/it]

training loss: 3.504366159439087
valid loss: 3.5044398307800293
perplexity: 33.2628059387207


training:  24%|██▎       | 1321/5566 [52:19<3:22:47,  2.87s/it]

training loss: 3.5276310443878174


training:  24%|██▍       | 1322/5566 [52:21<3:10:08,  2.69s/it]

training loss: 3.530705690383911


training:  24%|██▍       | 1323/5566 [52:23<3:00:42,  2.56s/it]

training loss: 3.53769588470459


training:  24%|██▍       | 1324/5566 [52:25<2:54:04,  2.46s/it]

training loss: 3.527010679244995


training:  24%|██▍       | 1325/5566 [52:28<2:49:12,  2.39s/it]

training loss: 3.5235977172851562


training:  24%|██▍       | 1326/5566 [52:30<2:45:59,  2.35s/it]

training loss: 3.5292744636535645


training:  24%|██▍       | 1327/5566 [52:32<2:43:19,  2.31s/it]

training loss: 3.5183334350585938


training:  24%|██▍       | 1328/5566 [52:34<2:41:57,  2.29s/it]

training loss: 3.5249178409576416


training:  24%|██▍       | 1329/5566 [52:37<2:40:18,  2.27s/it]

training loss: 3.5242137908935547


training:  24%|██▍       | 1330/5566 [52:39<2:39:40,  2.26s/it]

training loss: 3.4959259033203125


training:  24%|██▍       | 1331/5566 [52:41<2:39:21,  2.26s/it]

training loss: 3.5082647800445557


training:  24%|██▍       | 1332/5566 [52:43<2:39:12,  2.26s/it]

training loss: 3.5123307704925537


training:  24%|██▍       | 1333/5566 [52:46<2:38:46,  2.25s/it]

training loss: 3.5558359622955322


training:  24%|██▍       | 1334/5566 [52:48<2:39:00,  2.25s/it]

training loss: 3.529496192932129


training:  24%|██▍       | 1335/5566 [52:50<2:38:51,  2.25s/it]

training loss: 3.515735626220703


training:  24%|██▍       | 1336/5566 [52:52<2:38:40,  2.25s/it]

training loss: 3.5156784057617188


training:  24%|██▍       | 1337/5566 [52:55<2:38:56,  2.26s/it]

training loss: 3.5142204761505127


training:  24%|██▍       | 1338/5566 [52:57<2:38:14,  2.25s/it]

training loss: 3.5140702724456787


training:  24%|██▍       | 1339/5566 [52:59<2:38:32,  2.25s/it]

training loss: 3.5018727779388428


training:  24%|██▍       | 1340/5566 [53:01<2:38:00,  2.24s/it]

training loss: 3.5091166496276855
valid loss: 3.508692979812622
perplexity: 33.404579162597656


training:  24%|██▍       | 1341/5566 [53:06<3:20:48,  2.85s/it]

training loss: 3.5193400382995605


training:  24%|██▍       | 1342/5566 [53:08<3:08:43,  2.68s/it]

training loss: 3.5260567665100098


training:  24%|██▍       | 1343/5566 [53:10<2:59:29,  2.55s/it]

training loss: 3.517437696456909


training:  24%|██▍       | 1344/5566 [53:12<2:53:30,  2.47s/it]

training loss: 3.5088226795196533


training:  24%|██▍       | 1345/5566 [53:15<2:48:52,  2.40s/it]

training loss: 3.5125181674957275


training:  24%|██▍       | 1346/5566 [53:17<2:47:08,  2.38s/it]

training loss: 3.5177741050720215


training:  24%|██▍       | 1347/5566 [53:19<2:44:14,  2.34s/it]

training loss: 3.5189263820648193


training:  24%|██▍       | 1348/5566 [53:21<2:42:03,  2.31s/it]

training loss: 3.5095012187957764


training:  24%|██▍       | 1349/5566 [53:24<2:40:04,  2.28s/it]

training loss: 3.537980079650879


training:  24%|██▍       | 1350/5566 [53:26<2:38:26,  2.25s/it]

training loss: 3.506885051727295


training:  24%|██▍       | 1351/5566 [53:28<2:38:06,  2.25s/it]

training loss: 3.4987893104553223


training:  24%|██▍       | 1352/5566 [53:30<2:38:10,  2.25s/it]

training loss: 3.5218913555145264


training:  24%|██▍       | 1353/5566 [53:33<2:38:11,  2.25s/it]

training loss: 3.510957717895508


training:  24%|██▍       | 1354/5566 [53:35<2:37:46,  2.25s/it]

training loss: 3.5316526889801025


training:  24%|██▍       | 1355/5566 [53:37<2:37:53,  2.25s/it]

training loss: 3.5068273544311523


training:  24%|██▍       | 1356/5566 [53:39<2:38:14,  2.26s/it]

training loss: 3.5114104747772217


training:  24%|██▍       | 1357/5566 [53:42<2:38:05,  2.25s/it]

training loss: 3.5151827335357666


training:  24%|██▍       | 1358/5566 [53:44<2:38:01,  2.25s/it]

training loss: 3.5306460857391357


training:  24%|██▍       | 1359/5566 [53:46<2:38:15,  2.26s/it]

training loss: 3.507859706878662


training:  24%|██▍       | 1360/5566 [53:48<2:38:24,  2.26s/it]

training loss: 3.484619140625
valid loss: 3.4835660457611084
perplexity: 32.57568359375


training:  24%|██▍       | 1361/5566 [53:53<3:19:40,  2.85s/it]

training loss: 3.5054140090942383


training:  24%|██▍       | 1362/5566 [53:55<3:08:02,  2.68s/it]

training loss: 3.5455965995788574


training:  24%|██▍       | 1363/5566 [53:57<2:58:12,  2.54s/it]

training loss: 3.5182650089263916


training:  25%|██▍       | 1364/5566 [53:59<2:51:15,  2.45s/it]

training loss: 3.501938819885254


training:  25%|██▍       | 1365/5566 [54:02<2:46:43,  2.38s/it]

training loss: 3.487614154815674


training:  25%|██▍       | 1366/5566 [54:04<2:43:20,  2.33s/it]

training loss: 3.5102596282958984


training:  25%|██▍       | 1367/5566 [54:06<2:42:45,  2.33s/it]

training loss: 3.511664628982544


training:  25%|██▍       | 1368/5566 [54:08<2:41:02,  2.30s/it]

training loss: 3.521263360977173


training:  25%|██▍       | 1369/5566 [54:11<2:39:44,  2.28s/it]

training loss: 3.5121846199035645


training:  25%|██▍       | 1370/5566 [54:13<2:39:04,  2.27s/it]

training loss: 3.5105392932891846


training:  25%|██▍       | 1371/5566 [54:15<2:38:09,  2.26s/it]

training loss: 3.526561975479126


training:  25%|██▍       | 1372/5566 [54:17<2:37:59,  2.26s/it]

training loss: 3.4945268630981445


training:  25%|██▍       | 1373/5566 [54:20<2:37:57,  2.26s/it]

training loss: 3.5073907375335693


training:  25%|██▍       | 1374/5566 [54:22<2:37:38,  2.26s/it]

training loss: 3.5406856536865234


training:  25%|██▍       | 1375/5566 [54:24<2:36:58,  2.25s/it]

training loss: 3.523179054260254


training:  25%|██▍       | 1376/5566 [54:26<2:36:57,  2.25s/it]

training loss: 3.5378646850585938


training:  25%|██▍       | 1377/5566 [54:29<2:38:21,  2.27s/it]

training loss: 3.5039329528808594


training:  25%|██▍       | 1378/5566 [54:31<2:38:07,  2.27s/it]

training loss: 3.5384817123413086


training:  25%|██▍       | 1379/5566 [54:33<2:37:30,  2.26s/it]

training loss: 3.516007661819458


training:  25%|██▍       | 1380/5566 [54:36<2:44:15,  2.35s/it]

training loss: 3.5047457218170166
valid loss: 3.5037200450897217
perplexity: 33.23887252807617


training:  25%|██▍       | 1381/5566 [54:40<3:34:37,  3.08s/it]

training loss: 3.5297048091888428


training:  25%|██▍       | 1382/5566 [54:43<3:16:54,  2.82s/it]

training loss: 3.530198097229004


training:  25%|██▍       | 1383/5566 [54:45<3:05:20,  2.66s/it]

training loss: 3.506287097930908


training:  25%|██▍       | 1384/5566 [54:47<2:57:31,  2.55s/it]

training loss: 3.532162666320801


training:  25%|██▍       | 1385/5566 [54:50<2:51:50,  2.47s/it]

training loss: 3.5146756172180176


training:  25%|██▍       | 1386/5566 [54:52<2:47:24,  2.40s/it]

training loss: 3.5242502689361572


training:  25%|██▍       | 1387/5566 [54:54<2:44:03,  2.36s/it]

training loss: 3.5130136013031006


training:  25%|██▍       | 1388/5566 [54:56<2:42:25,  2.33s/it]

training loss: 3.5290277004241943


training:  25%|██▍       | 1389/5566 [54:59<2:40:30,  2.31s/it]

training loss: 3.526508331298828


training:  25%|██▍       | 1390/5566 [55:01<2:39:35,  2.29s/it]

training loss: 3.522111654281616


training:  25%|██▍       | 1391/5566 [55:03<2:38:01,  2.27s/it]

training loss: 3.5209739208221436


training:  25%|██▌       | 1392/5566 [55:05<2:36:56,  2.26s/it]

training loss: 3.526000499725342


training:  25%|██▌       | 1393/5566 [55:08<2:37:23,  2.26s/it]

training loss: 3.520720958709717


training:  25%|██▌       | 1394/5566 [55:10<2:37:28,  2.26s/it]

training loss: 3.5365841388702393


training:  25%|██▌       | 1395/5566 [55:12<2:36:54,  2.26s/it]

training loss: 3.5097687244415283


training:  25%|██▌       | 1396/5566 [55:14<2:36:18,  2.25s/it]

training loss: 3.535595178604126


training:  25%|██▌       | 1397/5566 [55:17<2:36:02,  2.25s/it]

training loss: 3.519253730773926


training:  25%|██▌       | 1398/5566 [55:19<2:36:25,  2.25s/it]

training loss: 3.528118371963501


training:  25%|██▌       | 1399/5566 [55:21<2:36:04,  2.25s/it]

training loss: 3.4984614849090576


training:  25%|██▌       | 1400/5566 [55:23<2:35:53,  2.25s/it]

training loss: 3.5075645446777344
valid loss: 3.506896734237671
perplexity: 33.34463119506836


training:  25%|██▌       | 1401/5566 [55:28<3:32:56,  3.07s/it]

training loss: 3.522347927093506


training:  25%|██▌       | 1402/5566 [55:31<3:19:38,  2.88s/it]

training loss: 3.511253833770752


training:  25%|██▌       | 1403/5566 [55:33<3:05:40,  2.68s/it]

training loss: 3.5396568775177


training:  25%|██▌       | 1404/5566 [55:35<2:56:16,  2.54s/it]

training loss: 3.52763032913208


training:  25%|██▌       | 1405/5566 [55:37<2:50:19,  2.46s/it]

training loss: 3.5252621173858643


training:  25%|██▌       | 1406/5566 [55:40<2:46:11,  2.40s/it]

training loss: 3.496107339859009


training:  25%|██▌       | 1407/5566 [55:42<2:42:26,  2.34s/it]

training loss: 3.5349414348602295


training:  25%|██▌       | 1408/5566 [55:44<2:39:53,  2.31s/it]

training loss: 3.5318386554718018


training:  25%|██▌       | 1409/5566 [55:46<2:38:29,  2.29s/it]

training loss: 3.5403544902801514


training:  25%|██▌       | 1410/5566 [55:49<2:37:16,  2.27s/it]

training loss: 3.500973701477051


training:  25%|██▌       | 1411/5566 [55:51<2:36:44,  2.26s/it]

training loss: 3.522174119949341


training:  25%|██▌       | 1412/5566 [55:53<2:36:18,  2.26s/it]

training loss: 3.5156893730163574


training:  25%|██▌       | 1413/5566 [55:55<2:35:50,  2.25s/it]

training loss: 3.503061056137085


training:  25%|██▌       | 1414/5566 [55:58<2:35:44,  2.25s/it]

training loss: 3.5347683429718018


training:  25%|██▌       | 1415/5566 [56:00<2:35:37,  2.25s/it]

training loss: 3.53725528717041


training:  25%|██▌       | 1416/5566 [56:02<2:35:29,  2.25s/it]

training loss: 3.515089511871338


training:  25%|██▌       | 1417/5566 [56:04<2:35:01,  2.24s/it]

training loss: 3.522554397583008


training:  25%|██▌       | 1418/5566 [56:06<2:35:11,  2.24s/it]

training loss: 3.536590099334717


training:  25%|██▌       | 1419/5566 [56:09<2:35:06,  2.24s/it]

training loss: 3.52579927444458


training:  26%|██▌       | 1420/5566 [56:11<2:35:07,  2.24s/it]

training loss: 3.5227725505828857
valid loss: 3.5223777294158936
perplexity: 33.86485290527344


training:  26%|██▌       | 1421/5566 [56:15<3:17:30,  2.86s/it]

training loss: 3.508507490158081


training:  26%|██▌       | 1422/5566 [56:18<3:05:11,  2.68s/it]

training loss: 3.5425829887390137


training:  26%|██▌       | 1423/5566 [56:20<2:55:50,  2.55s/it]

training loss: 3.4972453117370605


training:  26%|██▌       | 1424/5566 [56:22<2:49:34,  2.46s/it]

training loss: 3.5023534297943115


training:  26%|██▌       | 1425/5566 [56:24<2:45:22,  2.40s/it]

training loss: 3.517117977142334


training:  26%|██▌       | 1426/5566 [56:26<2:41:47,  2.34s/it]

training loss: 3.5387022495269775


training:  26%|██▌       | 1427/5566 [56:29<2:39:28,  2.31s/it]

training loss: 3.538663148880005


training:  26%|██▌       | 1428/5566 [56:31<2:37:51,  2.29s/it]

training loss: 3.5076541900634766


training:  26%|██▌       | 1429/5566 [56:33<2:37:00,  2.28s/it]

training loss: 3.491023063659668


training:  26%|██▌       | 1430/5566 [56:35<2:36:21,  2.27s/it]

training loss: 3.5306637287139893


training:  26%|██▌       | 1431/5566 [56:38<2:35:23,  2.25s/it]

training loss: 3.505173921585083


training:  26%|██▌       | 1432/5566 [56:40<2:35:29,  2.26s/it]

training loss: 3.5083513259887695


training:  26%|██▌       | 1433/5566 [56:42<2:35:30,  2.26s/it]

training loss: 3.5447287559509277


training:  26%|██▌       | 1434/5566 [56:44<2:35:22,  2.26s/it]

training loss: 3.532583475112915


training:  26%|██▌       | 1435/5566 [56:47<2:35:02,  2.25s/it]

training loss: 3.513716459274292


training:  26%|██▌       | 1436/5566 [56:49<2:34:56,  2.25s/it]

training loss: 3.5259764194488525


training:  26%|██▌       | 1437/5566 [56:51<2:34:56,  2.25s/it]

training loss: 3.523383378982544


training:  26%|██▌       | 1438/5566 [56:53<2:34:13,  2.24s/it]

training loss: 3.5379669666290283


training:  26%|██▌       | 1439/5566 [56:56<2:33:52,  2.24s/it]

training loss: 3.5263516902923584


training:  26%|██▌       | 1440/5566 [56:58<2:33:52,  2.24s/it]

training loss: 3.5281014442443848
valid loss: 3.5270464420318604
perplexity: 34.02333068847656


training:  26%|██▌       | 1441/5566 [57:02<3:15:07,  2.84s/it]

training loss: 3.543693780899048


training:  26%|██▌       | 1442/5566 [57:04<3:03:26,  2.67s/it]

training loss: 3.5053019523620605


training:  26%|██▌       | 1443/5566 [57:07<2:54:36,  2.54s/it]

training loss: 3.524649143218994


training:  26%|██▌       | 1444/5566 [57:09<2:48:38,  2.45s/it]

training loss: 3.5138134956359863


training:  26%|██▌       | 1445/5566 [57:11<2:44:12,  2.39s/it]

training loss: 3.5311195850372314


training:  26%|██▌       | 1446/5566 [57:13<2:41:11,  2.35s/it]

training loss: 3.4977850914001465


training:  26%|██▌       | 1447/5566 [57:16<2:39:24,  2.32s/it]

training loss: 3.524031400680542


training:  26%|██▌       | 1448/5566 [57:18<2:38:12,  2.31s/it]

training loss: 3.502831220626831


training:  26%|██▌       | 1449/5566 [57:20<2:36:48,  2.29s/it]

training loss: 3.533935546875


training:  26%|██▌       | 1450/5566 [57:22<2:36:00,  2.27s/it]

training loss: 3.5171618461608887


training:  26%|██▌       | 1451/5566 [57:25<2:35:21,  2.27s/it]

training loss: 3.5081517696380615


training:  26%|██▌       | 1452/5566 [57:27<2:35:07,  2.26s/it]

training loss: 3.5029494762420654


training:  26%|██▌       | 1453/5566 [57:29<2:34:53,  2.26s/it]

training loss: 3.5103049278259277


training:  26%|██▌       | 1454/5566 [57:31<2:34:48,  2.26s/it]

training loss: 3.529689311981201


training:  26%|██▌       | 1455/5566 [57:34<2:34:51,  2.26s/it]

training loss: 3.521808385848999


training:  26%|██▌       | 1456/5566 [57:36<2:34:39,  2.26s/it]

training loss: 3.5234007835388184


training:  26%|██▌       | 1457/5566 [57:38<2:34:28,  2.26s/it]

training loss: 3.5129425525665283


training:  26%|██▌       | 1458/5566 [57:40<2:34:30,  2.26s/it]

training loss: 3.5097668170928955


training:  26%|██▌       | 1459/5566 [57:43<2:34:58,  2.26s/it]

training loss: 3.520388603210449


training:  26%|██▌       | 1460/5566 [57:45<2:34:54,  2.26s/it]

training loss: 3.5368471145629883
valid loss: 3.5358567237854004
perplexity: 34.32440948486328


training:  26%|██▌       | 1461/5566 [57:49<3:16:06,  2.87s/it]

training loss: 3.526859998703003


training:  26%|██▋       | 1462/5566 [57:52<3:04:47,  2.70s/it]

training loss: 3.509939193725586


training:  26%|██▋       | 1463/5566 [57:54<3:06:28,  2.73s/it]

training loss: 3.5289762020111084


training:  26%|██▋       | 1464/5566 [57:57<3:00:37,  2.64s/it]

training loss: 3.515570640563965


training:  26%|██▋       | 1465/5566 [57:59<2:52:10,  2.52s/it]

training loss: 3.526186466217041


training:  26%|██▋       | 1466/5566 [58:01<2:46:31,  2.44s/it]

training loss: 3.518526792526245


training:  26%|██▋       | 1467/5566 [58:04<2:42:30,  2.38s/it]

training loss: 3.510735511779785


training:  26%|██▋       | 1468/5566 [58:06<2:40:06,  2.34s/it]

training loss: 3.5080816745758057


training:  26%|██▋       | 1469/5566 [58:08<2:38:37,  2.32s/it]

training loss: 3.5045576095581055


training:  26%|██▋       | 1470/5566 [58:10<2:36:40,  2.30s/it]

training loss: 3.524681568145752


training:  26%|██▋       | 1471/5566 [58:13<2:36:10,  2.29s/it]

training loss: 3.5339767932891846


training:  26%|██▋       | 1472/5566 [58:15<2:34:56,  2.27s/it]

training loss: 3.531010627746582


training:  26%|██▋       | 1473/5566 [58:17<2:35:17,  2.28s/it]

training loss: 3.533658504486084


training:  26%|██▋       | 1474/5566 [58:19<2:35:05,  2.27s/it]

training loss: 3.5151138305664062


training:  27%|██▋       | 1475/5566 [58:22<2:34:24,  2.26s/it]

training loss: 3.524597406387329


training:  27%|██▋       | 1476/5566 [58:24<2:34:04,  2.26s/it]

training loss: 3.5290708541870117


training:  27%|██▋       | 1477/5566 [58:26<2:33:30,  2.25s/it]

training loss: 3.5291695594787598


training:  27%|██▋       | 1478/5566 [58:28<2:32:55,  2.24s/it]

training loss: 3.5254268646240234


training:  27%|██▋       | 1479/5566 [58:31<2:33:01,  2.25s/it]

training loss: 3.5065085887908936


training:  27%|██▋       | 1480/5566 [58:33<2:33:08,  2.25s/it]

training loss: 3.5347204208374023
valid loss: 3.5341439247131348
perplexity: 34.26566696166992


training:  27%|██▋       | 1481/5566 [58:37<3:13:43,  2.85s/it]

training loss: 3.5244734287261963


training:  27%|██▋       | 1482/5566 [58:39<3:01:02,  2.66s/it]

training loss: 3.5104613304138184


training:  27%|██▋       | 1483/5566 [58:42<2:53:20,  2.55s/it]

training loss: 3.5189428329467773


training:  27%|██▋       | 1484/5566 [58:44<2:46:41,  2.45s/it]

training loss: 3.5213773250579834


training:  27%|██▋       | 1485/5566 [58:46<2:42:22,  2.39s/it]

training loss: 3.507990837097168


training:  27%|██▋       | 1486/5566 [58:48<2:40:06,  2.35s/it]

training loss: 3.5334763526916504


training:  27%|██▋       | 1487/5566 [58:51<2:38:31,  2.33s/it]

training loss: 3.4812145233154297


training:  27%|██▋       | 1488/5566 [58:53<2:37:00,  2.31s/it]

training loss: 3.521674871444702


training:  27%|██▋       | 1489/5566 [58:55<2:35:23,  2.29s/it]

training loss: 3.5392839908599854


training:  27%|██▋       | 1490/5566 [58:57<2:34:17,  2.27s/it]

training loss: 3.517138719558716


training:  27%|██▋       | 1491/5566 [59:00<2:33:33,  2.26s/it]

training loss: 3.5409393310546875


training:  27%|██▋       | 1492/5566 [59:02<2:33:03,  2.25s/it]

training loss: 3.5227479934692383


training:  27%|██▋       | 1493/5566 [59:04<2:32:37,  2.25s/it]

training loss: 3.5144712924957275


training:  27%|██▋       | 1494/5566 [59:06<2:32:30,  2.25s/it]

training loss: 3.531094789505005


training:  27%|██▋       | 1495/5566 [59:08<2:32:03,  2.24s/it]

training loss: 3.5198802947998047


training:  27%|██▋       | 1496/5566 [59:11<2:32:00,  2.24s/it]

training loss: 3.517982006072998


training:  27%|██▋       | 1497/5566 [59:13<2:32:33,  2.25s/it]

training loss: 3.500190258026123


training:  27%|██▋       | 1498/5566 [59:15<2:32:30,  2.25s/it]

training loss: 3.5099844932556152


training:  27%|██▋       | 1499/5566 [59:18<2:32:57,  2.26s/it]

training loss: 3.5312232971191406


training:  27%|██▋       | 1500/5566 [59:20<2:32:53,  2.26s/it]

training loss: 3.5540003776550293
valid loss: 3.553241014480591
perplexity: 34.92633056640625


training:  27%|██▋       | 1501/5566 [59:25<3:28:28,  3.08s/it]

training loss: 3.530576467514038


training:  27%|██▋       | 1502/5566 [59:27<3:14:40,  2.87s/it]

training loss: 3.494375228881836


training:  27%|██▋       | 1503/5566 [59:29<3:02:19,  2.69s/it]

training loss: 3.52954363822937


training:  27%|██▋       | 1504/5566 [59:32<2:53:32,  2.56s/it]

training loss: 3.53015398979187


training:  27%|██▋       | 1505/5566 [59:34<2:46:54,  2.47s/it]

training loss: 3.4981765747070312


training:  27%|██▋       | 1506/5566 [59:36<2:42:07,  2.40s/it]

training loss: 3.53836727142334


training:  27%|██▋       | 1507/5566 [59:38<2:39:13,  2.35s/it]

training loss: 3.5342085361480713


training:  27%|██▋       | 1508/5566 [59:41<2:36:23,  2.31s/it]

training loss: 3.5255680084228516


training:  27%|██▋       | 1509/5566 [59:43<2:35:06,  2.29s/it]

training loss: 3.500398874282837


training:  27%|██▋       | 1510/5566 [59:45<2:34:05,  2.28s/it]

training loss: 3.551137685775757


training:  27%|██▋       | 1511/5566 [59:47<2:33:11,  2.27s/it]

training loss: 3.5209295749664307


training:  27%|██▋       | 1512/5566 [59:50<2:32:42,  2.26s/it]

training loss: 3.532942056655884


training:  27%|██▋       | 1513/5566 [59:52<2:32:13,  2.25s/it]

training loss: 3.5273778438568115


training:  27%|██▋       | 1514/5566 [59:54<2:32:06,  2.25s/it]

training loss: 3.539336919784546


training:  27%|██▋       | 1515/5566 [59:56<2:32:06,  2.25s/it]

training loss: 3.543823719024658


training:  27%|██▋       | 1516/5566 [59:59<2:31:58,  2.25s/it]

training loss: 3.527745008468628


training:  27%|██▋       | 1517/5566 [1:00:01<2:32:30,  2.26s/it]

training loss: 3.514953851699829


training:  27%|██▋       | 1518/5566 [1:00:03<2:32:15,  2.26s/it]

training loss: 3.5219645500183105


training:  27%|██▋       | 1519/5566 [1:00:05<2:31:40,  2.25s/it]

training loss: 3.519233465194702


training:  27%|██▋       | 1520/5566 [1:00:08<2:31:49,  2.25s/it]

training loss: 3.5336859226226807
valid loss: 3.5327491760253906
perplexity: 34.21791076660156


training:  27%|██▋       | 1521/5566 [1:00:12<3:12:25,  2.85s/it]

training loss: 3.5293049812316895


training:  27%|██▋       | 1522/5566 [1:00:14<3:00:31,  2.68s/it]

training loss: 3.5715184211730957


training:  27%|██▋       | 1523/5566 [1:00:16<2:51:40,  2.55s/it]

training loss: 3.531343936920166


training:  27%|██▋       | 1524/5566 [1:00:19<2:45:19,  2.45s/it]

training loss: 3.527968406677246


training:  27%|██▋       | 1525/5566 [1:00:21<2:41:49,  2.40s/it]

training loss: 3.518728733062744


training:  27%|██▋       | 1526/5566 [1:00:23<2:38:14,  2.35s/it]

training loss: 3.5121688842773438


training:  27%|██▋       | 1527/5566 [1:00:25<2:35:52,  2.32s/it]

training loss: 3.5443716049194336


training:  27%|██▋       | 1528/5566 [1:00:28<2:34:04,  2.29s/it]

training loss: 3.515235424041748


training:  27%|██▋       | 1529/5566 [1:00:30<2:32:41,  2.27s/it]

training loss: 3.526305675506592


training:  27%|██▋       | 1530/5566 [1:00:32<2:31:50,  2.26s/it]

training loss: 3.52917218208313


training:  28%|██▊       | 1531/5566 [1:00:34<2:31:11,  2.25s/it]

training loss: 3.521054267883301


training:  28%|██▊       | 1532/5566 [1:00:36<2:30:32,  2.24s/it]

training loss: 3.5005292892456055


training:  28%|██▊       | 1533/5566 [1:00:39<2:29:59,  2.23s/it]

training loss: 3.542353868484497


training:  28%|██▊       | 1534/5566 [1:00:41<2:29:33,  2.23s/it]

training loss: 3.511585235595703


training:  28%|██▊       | 1535/5566 [1:00:43<2:30:07,  2.23s/it]

training loss: 3.523618459701538


training:  28%|██▊       | 1536/5566 [1:00:45<2:29:31,  2.23s/it]

training loss: 3.5263514518737793


training:  28%|██▊       | 1537/5566 [1:00:48<2:30:02,  2.23s/it]

training loss: 3.529752016067505


training:  28%|██▊       | 1538/5566 [1:00:50<2:29:41,  2.23s/it]

training loss: 3.5240554809570312


training:  28%|██▊       | 1539/5566 [1:00:52<2:29:35,  2.23s/it]

training loss: 3.540163278579712


training:  28%|██▊       | 1540/5566 [1:00:54<2:29:05,  2.22s/it]

training loss: 3.530864715576172
valid loss: 3.529731035232544
perplexity: 34.11479187011719


training:  28%|██▊       | 1541/5566 [1:00:58<3:08:44,  2.81s/it]

training loss: 3.51627779006958


training:  28%|██▊       | 1542/5566 [1:01:01<2:57:17,  2.64s/it]

training loss: 3.5204951763153076


training:  28%|██▊       | 1543/5566 [1:01:03<2:48:33,  2.51s/it]

training loss: 3.5302798748016357


training:  28%|██▊       | 1544/5566 [1:01:05<2:43:04,  2.43s/it]

training loss: 3.542506694793701


training:  28%|██▊       | 1545/5566 [1:01:07<2:39:09,  2.37s/it]

training loss: 3.504938840866089


training:  28%|██▊       | 1546/5566 [1:01:10<2:36:23,  2.33s/it]

training loss: 3.5325920581817627


training:  28%|██▊       | 1547/5566 [1:01:12<2:43:13,  2.44s/it]

training loss: 3.5315351486206055


training:  28%|██▊       | 1548/5566 [1:01:15<2:46:02,  2.48s/it]

training loss: 3.5270273685455322


training:  28%|██▊       | 1549/5566 [1:01:17<2:40:55,  2.40s/it]

training loss: 3.5127007961273193


training:  28%|██▊       | 1550/5566 [1:01:19<2:37:29,  2.35s/it]

training loss: 3.516525983810425


training:  28%|██▊       | 1551/5566 [1:01:22<2:36:10,  2.33s/it]

training loss: 3.5374231338500977


training:  28%|██▊       | 1552/5566 [1:01:24<2:34:56,  2.32s/it]

training loss: 3.5236082077026367


training:  28%|██▊       | 1553/5566 [1:01:26<2:33:36,  2.30s/it]

training loss: 3.4873504638671875


training:  28%|██▊       | 1554/5566 [1:01:28<2:32:13,  2.28s/it]

training loss: 3.5276997089385986


training:  28%|██▊       | 1555/5566 [1:01:31<2:31:15,  2.26s/it]

training loss: 3.519925117492676


training:  28%|██▊       | 1556/5566 [1:01:33<2:31:10,  2.26s/it]

training loss: 3.511251449584961


training:  28%|██▊       | 1557/5566 [1:01:35<2:30:27,  2.25s/it]

training loss: 3.5128896236419678


training:  28%|██▊       | 1558/5566 [1:01:37<2:30:25,  2.25s/it]

training loss: 3.5331192016601562


training:  28%|██▊       | 1559/5566 [1:01:40<2:30:27,  2.25s/it]

training loss: 3.5457751750946045


training:  28%|██▊       | 1560/5566 [1:01:42<2:30:11,  2.25s/it]

training loss: 3.5259265899658203
valid loss: 3.525796413421631
perplexity: 33.98082733154297


training:  28%|██▊       | 1561/5566 [1:01:46<3:11:54,  2.88s/it]

training loss: 3.539163589477539


training:  28%|██▊       | 1562/5566 [1:01:49<3:00:19,  2.70s/it]

training loss: 3.5202746391296387


training:  28%|██▊       | 1563/5566 [1:01:51<2:51:13,  2.57s/it]

training loss: 3.516190528869629


training:  28%|██▊       | 1564/5566 [1:01:53<2:44:58,  2.47s/it]

training loss: 3.5146853923797607


training:  28%|██▊       | 1565/5566 [1:01:55<2:40:24,  2.41s/it]

training loss: 3.5334746837615967


training:  28%|██▊       | 1566/5566 [1:01:58<2:36:56,  2.35s/it]

training loss: 3.5313894748687744


training:  28%|██▊       | 1567/5566 [1:02:00<2:33:39,  2.31s/it]

training loss: 3.5404434204101562


training:  28%|██▊       | 1568/5566 [1:02:02<2:32:02,  2.28s/it]

training loss: 3.5023014545440674


training:  28%|██▊       | 1569/5566 [1:02:04<2:31:08,  2.27s/it]

training loss: 3.535773992538452


training:  28%|██▊       | 1570/5566 [1:02:06<2:30:13,  2.26s/it]

training loss: 3.5503759384155273


training:  28%|██▊       | 1571/5566 [1:02:09<2:30:11,  2.26s/it]

training loss: 3.5091660022735596


training:  28%|██▊       | 1572/5566 [1:02:11<2:30:01,  2.25s/it]

training loss: 3.5286777019500732


training:  28%|██▊       | 1573/5566 [1:02:13<2:29:53,  2.25s/it]

training loss: 3.553100347518921


training:  28%|██▊       | 1574/5566 [1:02:15<2:30:16,  2.26s/it]

training loss: 3.527240753173828


training:  28%|██▊       | 1575/5566 [1:02:18<2:29:26,  2.25s/it]

training loss: 3.515146255493164


training:  28%|██▊       | 1576/5566 [1:02:20<2:28:50,  2.24s/it]

training loss: 3.5209765434265137


training:  28%|██▊       | 1577/5566 [1:02:22<2:28:21,  2.23s/it]

training loss: 3.518118143081665


training:  28%|██▊       | 1578/5566 [1:02:24<2:28:01,  2.23s/it]

training loss: 3.5295560359954834


training:  28%|██▊       | 1579/5566 [1:02:27<2:28:00,  2.23s/it]

training loss: 3.5226218700408936


training:  28%|██▊       | 1580/5566 [1:02:29<2:28:16,  2.23s/it]

training loss: 3.534806251525879
valid loss: 3.533778429031372
perplexity: 34.25314712524414


training:  28%|██▊       | 1581/5566 [1:02:33<3:08:33,  2.84s/it]

training loss: 3.5402626991271973


training:  28%|██▊       | 1582/5566 [1:02:35<2:57:14,  2.67s/it]

training loss: 3.521514654159546


training:  28%|██▊       | 1583/5566 [1:02:38<2:48:45,  2.54s/it]

training loss: 3.5027005672454834


training:  28%|██▊       | 1584/5566 [1:02:40<2:42:14,  2.44s/it]

training loss: 3.528245687484741


training:  28%|██▊       | 1585/5566 [1:02:42<2:38:30,  2.39s/it]

training loss: 3.5206596851348877


training:  28%|██▊       | 1586/5566 [1:02:44<2:34:57,  2.34s/it]

training loss: 3.5194265842437744


training:  29%|██▊       | 1587/5566 [1:02:46<2:32:49,  2.30s/it]

training loss: 3.5318238735198975


training:  29%|██▊       | 1588/5566 [1:02:49<2:31:26,  2.28s/it]

training loss: 3.5097310543060303


training:  29%|██▊       | 1589/5566 [1:02:51<2:29:43,  2.26s/it]

training loss: 3.5351929664611816


training:  29%|██▊       | 1590/5566 [1:02:53<2:29:21,  2.25s/it]

training loss: 3.517566442489624


training:  29%|██▊       | 1591/5566 [1:02:55<2:28:25,  2.24s/it]

training loss: 3.524733781814575


training:  29%|██▊       | 1592/5566 [1:02:58<2:28:17,  2.24s/it]

training loss: 3.5178964138031006


training:  29%|██▊       | 1593/5566 [1:03:00<2:27:47,  2.23s/it]

training loss: 3.546893358230591


training:  29%|██▊       | 1594/5566 [1:03:02<2:27:47,  2.23s/it]

training loss: 3.521544933319092


training:  29%|██▊       | 1595/5566 [1:03:04<2:27:48,  2.23s/it]

training loss: 3.5284595489501953


training:  29%|██▊       | 1596/5566 [1:03:06<2:27:44,  2.23s/it]

training loss: 3.5047693252563477


training:  29%|██▊       | 1597/5566 [1:03:09<2:27:40,  2.23s/it]

training loss: 3.5110058784484863


training:  29%|██▊       | 1598/5566 [1:03:11<2:27:38,  2.23s/it]

training loss: 3.5255661010742188


training:  29%|██▊       | 1599/5566 [1:03:13<2:27:39,  2.23s/it]

training loss: 3.518646717071533


training:  29%|██▊       | 1600/5566 [1:03:15<2:27:23,  2.23s/it]

training loss: 3.5210647583007812
valid loss: 3.5198264122009277
perplexity: 33.778564453125


training:  29%|██▉       | 1601/5566 [1:03:20<3:21:18,  3.05s/it]

training loss: 3.5366556644439697


training:  29%|██▉       | 1602/5566 [1:03:23<3:06:16,  2.82s/it]

training loss: 3.5278193950653076


training:  29%|██▉       | 1603/5566 [1:03:25<2:53:45,  2.63s/it]

training loss: 3.5290331840515137


training:  29%|██▉       | 1604/5566 [1:03:27<2:45:33,  2.51s/it]

training loss: 3.516211986541748


training:  29%|██▉       | 1605/5566 [1:03:29<2:39:43,  2.42s/it]

training loss: 3.5269508361816406


training:  29%|██▉       | 1606/5566 [1:03:32<2:36:11,  2.37s/it]

training loss: 3.5120296478271484


training:  29%|██▉       | 1607/5566 [1:03:34<2:32:57,  2.32s/it]

training loss: 3.5100204944610596


training:  29%|██▉       | 1608/5566 [1:03:36<2:31:15,  2.29s/it]

training loss: 3.562730073928833


training:  29%|██▉       | 1609/5566 [1:03:38<2:30:03,  2.28s/it]

training loss: 3.5315029621124268


training:  29%|██▉       | 1610/5566 [1:03:40<2:29:00,  2.26s/it]

training loss: 3.5262134075164795


training:  29%|██▉       | 1611/5566 [1:03:43<2:28:19,  2.25s/it]

training loss: 3.5478336811065674


training:  29%|██▉       | 1612/5566 [1:03:45<2:28:34,  2.25s/it]

training loss: 3.530257225036621


training:  29%|██▉       | 1613/5566 [1:03:47<2:27:54,  2.25s/it]

training loss: 3.5383853912353516


training:  29%|██▉       | 1614/5566 [1:03:49<2:27:20,  2.24s/it]

training loss: 3.520963430404663


training:  29%|██▉       | 1615/5566 [1:03:52<2:26:26,  2.22s/it]

training loss: 3.5006704330444336


training:  29%|██▉       | 1616/5566 [1:03:54<2:26:23,  2.22s/it]

training loss: 3.5218961238861084


training:  29%|██▉       | 1617/5566 [1:03:56<2:26:51,  2.23s/it]

training loss: 3.511216402053833


training:  29%|██▉       | 1618/5566 [1:03:58<2:26:52,  2.23s/it]

training loss: 3.547278881072998


training:  29%|██▉       | 1619/5566 [1:04:00<2:26:46,  2.23s/it]

training loss: 3.5125339031219482


training:  29%|██▉       | 1620/5566 [1:04:03<2:28:05,  2.25s/it]

training loss: 3.5319571495056152
valid loss: 3.530823230743408
perplexity: 34.15207290649414


training:  29%|██▉       | 1621/5566 [1:04:07<3:06:46,  2.84s/it]

training loss: 3.511874198913574


training:  29%|██▉       | 1622/5566 [1:04:09<2:54:25,  2.65s/it]

training loss: 3.516897678375244


training:  29%|██▉       | 1623/5566 [1:04:11<2:45:39,  2.52s/it]

training loss: 3.518491744995117


training:  29%|██▉       | 1624/5566 [1:04:14<2:39:54,  2.43s/it]

training loss: 3.511976718902588


training:  29%|██▉       | 1625/5566 [1:04:16<2:36:17,  2.38s/it]

training loss: 3.5257554054260254


training:  29%|██▉       | 1626/5566 [1:04:18<2:32:55,  2.33s/it]

training loss: 3.523859739303589


training:  29%|██▉       | 1627/5566 [1:04:20<2:30:35,  2.29s/it]

training loss: 3.526120662689209


training:  29%|██▉       | 1628/5566 [1:04:23<2:29:26,  2.28s/it]

training loss: 3.5223612785339355


training:  29%|██▉       | 1629/5566 [1:04:25<2:28:01,  2.26s/it]

training loss: 3.488070249557495


training:  29%|██▉       | 1630/5566 [1:04:27<2:27:01,  2.24s/it]

training loss: 3.524456739425659


training:  29%|██▉       | 1631/5566 [1:04:29<2:32:08,  2.32s/it]

training loss: 3.534947633743286


training:  29%|██▉       | 1632/5566 [1:04:32<2:39:14,  2.43s/it]

training loss: 3.526390552520752


training:  29%|██▉       | 1633/5566 [1:04:34<2:35:10,  2.37s/it]

training loss: 3.5179800987243652


training:  29%|██▉       | 1634/5566 [1:04:37<2:32:20,  2.32s/it]

training loss: 3.538154363632202


training:  29%|██▉       | 1635/5566 [1:04:39<2:29:40,  2.28s/it]

training loss: 3.5408170223236084


training:  29%|██▉       | 1636/5566 [1:04:41<2:28:15,  2.26s/it]

training loss: 3.5123541355133057


training:  29%|██▉       | 1637/5566 [1:04:43<2:27:13,  2.25s/it]

training loss: 3.5213916301727295


training:  29%|██▉       | 1638/5566 [1:04:45<2:26:20,  2.24s/it]

training loss: 3.5366580486297607


training:  29%|██▉       | 1639/5566 [1:04:48<2:26:09,  2.23s/it]

training loss: 3.5321085453033447


training:  29%|██▉       | 1640/5566 [1:04:50<2:25:36,  2.23s/it]

training loss: 3.511077880859375
valid loss: 3.5097157955169678
perplexity: 33.43876266479492


training:  29%|██▉       | 1641/5566 [1:04:54<3:03:08,  2.80s/it]

training loss: 3.504002094268799


training:  30%|██▉       | 1642/5566 [1:04:56<2:51:51,  2.63s/it]

training loss: 3.531660795211792


training:  30%|██▉       | 1643/5566 [1:04:58<2:43:29,  2.50s/it]

training loss: 3.535825729370117


training:  30%|██▉       | 1644/5566 [1:05:01<2:37:55,  2.42s/it]

training loss: 3.5227181911468506


training:  30%|██▉       | 1645/5566 [1:05:03<2:34:02,  2.36s/it]

training loss: 3.5129477977752686


training:  30%|██▉       | 1646/5566 [1:05:05<2:31:29,  2.32s/it]

training loss: 3.527770757675171


training:  30%|██▉       | 1647/5566 [1:05:07<2:29:28,  2.29s/it]

training loss: 3.5053181648254395


training:  30%|██▉       | 1648/5566 [1:05:10<2:27:50,  2.26s/it]

training loss: 3.5344038009643555


training:  30%|██▉       | 1649/5566 [1:05:12<2:27:06,  2.25s/it]

training loss: 3.515862464904785


training:  30%|██▉       | 1650/5566 [1:05:14<2:26:21,  2.24s/it]

training loss: 3.5083632469177246


training:  30%|██▉       | 1651/5566 [1:05:16<2:26:26,  2.24s/it]

training loss: 3.533663272857666


training:  30%|██▉       | 1652/5566 [1:05:18<2:25:52,  2.24s/it]

training loss: 3.526057720184326


training:  30%|██▉       | 1653/5566 [1:05:21<2:25:19,  2.23s/it]

training loss: 3.5441036224365234


training:  30%|██▉       | 1654/5566 [1:05:23<2:25:14,  2.23s/it]

training loss: 3.5502889156341553


training:  30%|██▉       | 1655/5566 [1:05:25<2:24:43,  2.22s/it]

training loss: 3.52833890914917


training:  30%|██▉       | 1656/5566 [1:05:27<2:24:11,  2.21s/it]

training loss: 3.522580146789551


training:  30%|██▉       | 1657/5566 [1:05:30<2:24:23,  2.22s/it]

training loss: 3.5241024494171143


training:  30%|██▉       | 1658/5566 [1:05:32<2:23:59,  2.21s/it]

training loss: 3.5362401008605957


training:  30%|██▉       | 1659/5566 [1:05:34<2:24:21,  2.22s/it]

training loss: 3.517091989517212


training:  30%|██▉       | 1660/5566 [1:05:36<2:24:33,  2.22s/it]

training loss: 3.51193904876709
valid loss: 3.5116260051727295
perplexity: 33.50270080566406


training:  30%|██▉       | 1661/5566 [1:05:40<3:03:01,  2.81s/it]

training loss: 3.5120368003845215


training:  30%|██▉       | 1662/5566 [1:05:43<2:52:23,  2.65s/it]

training loss: 3.5053629875183105


training:  30%|██▉       | 1663/5566 [1:05:45<2:44:00,  2.52s/it]

training loss: 3.5018579959869385


training:  30%|██▉       | 1664/5566 [1:05:47<2:38:06,  2.43s/it]

training loss: 3.5237693786621094


training:  30%|██▉       | 1665/5566 [1:05:49<2:33:46,  2.37s/it]

training loss: 3.532757043838501


training:  30%|██▉       | 1666/5566 [1:05:52<2:30:54,  2.32s/it]

training loss: 3.538456439971924


training:  30%|██▉       | 1667/5566 [1:05:54<2:29:20,  2.30s/it]

training loss: 3.51045560836792


training:  30%|██▉       | 1668/5566 [1:05:56<2:27:44,  2.27s/it]

training loss: 3.5275118350982666


training:  30%|██▉       | 1669/5566 [1:05:58<2:26:13,  2.25s/it]

training loss: 3.5319979190826416


training:  30%|███       | 1670/5566 [1:06:00<2:25:11,  2.24s/it]

training loss: 3.5021543502807617


training:  30%|███       | 1671/5566 [1:06:03<2:24:40,  2.23s/it]

training loss: 3.515427827835083


training:  30%|███       | 1672/5566 [1:06:05<2:24:16,  2.22s/it]

training loss: 3.5448215007781982


training:  30%|███       | 1673/5566 [1:06:07<2:24:08,  2.22s/it]

training loss: 3.52538800239563


training:  30%|███       | 1674/5566 [1:06:09<2:23:52,  2.22s/it]

training loss: 3.5095126628875732


training:  30%|███       | 1675/5566 [1:06:11<2:23:40,  2.22s/it]

training loss: 3.521106004714966


training:  30%|███       | 1676/5566 [1:06:14<2:23:20,  2.21s/it]

training loss: 3.5303475856781006


training:  30%|███       | 1677/5566 [1:06:16<2:23:18,  2.21s/it]

training loss: 3.5148563385009766


training:  30%|███       | 1678/5566 [1:06:18<2:23:05,  2.21s/it]

training loss: 3.535550594329834


training:  30%|███       | 1679/5566 [1:06:20<2:23:39,  2.22s/it]

training loss: 3.5473177433013916


training:  30%|███       | 1680/5566 [1:06:22<2:23:14,  2.21s/it]

training loss: 3.5295262336730957
valid loss: 3.527869939804077
perplexity: 34.051361083984375


training:  30%|███       | 1681/5566 [1:06:27<3:01:45,  2.81s/it]

training loss: 3.5324885845184326


training:  30%|███       | 1682/5566 [1:06:29<2:51:05,  2.64s/it]

training loss: 3.5220513343811035


training:  30%|███       | 1683/5566 [1:06:31<2:42:21,  2.51s/it]

training loss: 3.535776138305664


training:  30%|███       | 1684/5566 [1:06:33<2:36:41,  2.42s/it]

training loss: 3.5342342853546143


training:  30%|███       | 1685/5566 [1:06:36<2:32:37,  2.36s/it]

training loss: 3.5027589797973633


training:  30%|███       | 1686/5566 [1:06:38<2:29:16,  2.31s/it]

training loss: 3.528766632080078


training:  30%|███       | 1687/5566 [1:06:40<2:27:31,  2.28s/it]

training loss: 3.5207977294921875


training:  30%|███       | 1688/5566 [1:06:42<2:25:46,  2.26s/it]

training loss: 3.5350892543792725


training:  30%|███       | 1689/5566 [1:06:44<2:25:34,  2.25s/it]

training loss: 3.5072271823883057


training:  30%|███       | 1690/5566 [1:06:47<2:24:44,  2.24s/it]

training loss: 3.548165798187256


training:  30%|███       | 1691/5566 [1:06:49<2:24:07,  2.23s/it]

training loss: 3.537917375564575


training:  30%|███       | 1692/5566 [1:06:51<2:24:02,  2.23s/it]

training loss: 3.5114328861236572


training:  30%|███       | 1693/5566 [1:06:53<2:23:36,  2.22s/it]

training loss: 3.537283420562744


training:  30%|███       | 1694/5566 [1:06:56<2:24:12,  2.23s/it]

training loss: 3.526865005493164


training:  30%|███       | 1695/5566 [1:06:58<2:23:51,  2.23s/it]

training loss: 3.518059253692627


training:  30%|███       | 1696/5566 [1:07:00<2:23:31,  2.23s/it]

training loss: 3.5260884761810303


training:  30%|███       | 1697/5566 [1:07:02<2:22:47,  2.21s/it]

training loss: 3.5393123626708984


training:  31%|███       | 1698/5566 [1:07:04<2:23:13,  2.22s/it]

training loss: 3.5535857677459717


training:  31%|███       | 1699/5566 [1:07:07<2:22:39,  2.21s/it]

training loss: 3.5194106101989746


training:  31%|███       | 1700/5566 [1:07:09<2:22:20,  2.21s/it]

training loss: 3.511953830718994
valid loss: 3.511582374572754
perplexity: 33.50123596191406


training:  31%|███       | 1701/5566 [1:07:14<3:16:10,  3.05s/it]

training loss: 3.5039806365966797


training:  31%|███       | 1702/5566 [1:07:16<3:02:12,  2.83s/it]

training loss: 3.5484654903411865


training:  31%|███       | 1703/5566 [1:07:18<2:50:37,  2.65s/it]

training loss: 3.5268585681915283


training:  31%|███       | 1704/5566 [1:07:21<2:42:29,  2.52s/it]

training loss: 3.5044217109680176


training:  31%|███       | 1705/5566 [1:07:23<2:36:18,  2.43s/it]

training loss: 3.5055952072143555


training:  31%|███       | 1706/5566 [1:07:25<2:31:59,  2.36s/it]

training loss: 3.5199837684631348


training:  31%|███       | 1707/5566 [1:07:27<2:28:50,  2.31s/it]

training loss: 3.5200634002685547


training:  31%|███       | 1708/5566 [1:07:29<2:26:36,  2.28s/it]

training loss: 3.5301666259765625


training:  31%|███       | 1709/5566 [1:07:32<2:25:21,  2.26s/it]

training loss: 3.522318124771118


training:  31%|███       | 1710/5566 [1:07:34<2:24:34,  2.25s/it]

training loss: 3.5357789993286133


training:  31%|███       | 1711/5566 [1:07:36<2:23:33,  2.23s/it]

training loss: 3.521712064743042


training:  31%|███       | 1712/5566 [1:07:38<2:22:32,  2.22s/it]

training loss: 3.496952772140503


training:  31%|███       | 1713/5566 [1:07:40<2:22:34,  2.22s/it]

training loss: 3.509012460708618


training:  31%|███       | 1714/5566 [1:07:43<2:22:18,  2.22s/it]

training loss: 3.5084118843078613


training:  31%|███       | 1715/5566 [1:07:45<2:22:37,  2.22s/it]

training loss: 3.515455484390259


training:  31%|███       | 1716/5566 [1:07:47<2:22:56,  2.23s/it]

training loss: 3.551619291305542


training:  31%|███       | 1717/5566 [1:07:50<2:33:57,  2.40s/it]

training loss: 3.503056526184082


training:  31%|███       | 1718/5566 [1:07:52<2:31:09,  2.36s/it]

training loss: 3.511331796646118


training:  31%|███       | 1719/5566 [1:07:54<2:28:13,  2.31s/it]

training loss: 3.502988815307617


training:  31%|███       | 1720/5566 [1:07:57<2:26:11,  2.28s/it]

training loss: 3.5105791091918945
valid loss: 3.510014295578003
perplexity: 33.44874954223633


training:  31%|███       | 1721/5566 [1:08:01<3:03:13,  2.86s/it]

training loss: 3.5330634117126465


training:  31%|███       | 1722/5566 [1:08:03<2:51:05,  2.67s/it]

training loss: 3.538743495941162


training:  31%|███       | 1723/5566 [1:08:05<2:42:05,  2.53s/it]

training loss: 3.517765998840332


training:  31%|███       | 1724/5566 [1:08:07<2:36:26,  2.44s/it]

training loss: 3.5236475467681885


training:  31%|███       | 1725/5566 [1:08:10<2:31:54,  2.37s/it]

training loss: 3.516961097717285


training:  31%|███       | 1726/5566 [1:08:12<2:28:32,  2.32s/it]

training loss: 3.544712781906128


training:  31%|███       | 1727/5566 [1:08:14<2:26:42,  2.29s/it]

training loss: 3.528184175491333


training:  31%|███       | 1728/5566 [1:08:16<2:25:07,  2.27s/it]

training loss: 3.5393569469451904


training:  31%|███       | 1729/5566 [1:08:19<2:24:08,  2.25s/it]

training loss: 3.5249550342559814


training:  31%|███       | 1730/5566 [1:08:21<2:23:42,  2.25s/it]

training loss: 3.507993459701538


training:  31%|███       | 1731/5566 [1:08:23<2:22:57,  2.24s/it]

training loss: 3.518378973007202


training:  31%|███       | 1732/5566 [1:08:25<2:22:48,  2.23s/it]

training loss: 3.530841112136841


training:  31%|███       | 1733/5566 [1:08:27<2:22:20,  2.23s/it]

training loss: 3.5229949951171875


training:  31%|███       | 1734/5566 [1:08:30<2:21:54,  2.22s/it]

training loss: 3.5517895221710205


training:  31%|███       | 1735/5566 [1:08:32<2:21:58,  2.22s/it]

training loss: 3.5661277770996094


training:  31%|███       | 1736/5566 [1:08:34<2:21:35,  2.22s/it]

training loss: 3.531212568283081


training:  31%|███       | 1737/5566 [1:08:36<2:21:43,  2.22s/it]

training loss: 3.4957985877990723


training:  31%|███       | 1738/5566 [1:08:38<2:21:22,  2.22s/it]

training loss: 3.5038015842437744


training:  31%|███       | 1739/5566 [1:08:41<2:21:21,  2.22s/it]

training loss: 3.5339369773864746


training:  31%|███▏      | 1740/5566 [1:08:43<2:20:48,  2.21s/it]

training loss: 3.5308282375335693
valid loss: 3.53053879737854
perplexity: 34.142356872558594


training:  31%|███▏      | 1741/5566 [1:08:47<2:58:49,  2.81s/it]

training loss: 3.510730266571045


training:  31%|███▏      | 1742/5566 [1:08:49<2:47:52,  2.63s/it]

training loss: 3.513578176498413


training:  31%|███▏      | 1743/5566 [1:08:52<2:39:57,  2.51s/it]

training loss: 3.5006141662597656


training:  31%|███▏      | 1744/5566 [1:08:54<2:33:54,  2.42s/it]

training loss: 3.50526762008667


training:  31%|███▏      | 1745/5566 [1:08:56<2:29:41,  2.35s/it]

training loss: 3.5360066890716553


training:  31%|███▏      | 1746/5566 [1:08:58<2:26:34,  2.30s/it]

training loss: 3.5068557262420654


training:  31%|███▏      | 1747/5566 [1:09:00<2:24:44,  2.27s/it]

training loss: 3.527585506439209


training:  31%|███▏      | 1748/5566 [1:09:03<2:24:02,  2.26s/it]

training loss: 3.5365490913391113


training:  31%|███▏      | 1749/5566 [1:09:05<2:22:42,  2.24s/it]

training loss: 3.5002872943878174


training:  31%|███▏      | 1750/5566 [1:09:07<2:22:40,  2.24s/it]

training loss: 3.5144307613372803


training:  31%|███▏      | 1751/5566 [1:09:09<2:22:33,  2.24s/it]

training loss: 3.5210399627685547


training:  31%|███▏      | 1752/5566 [1:09:11<2:21:22,  2.22s/it]

training loss: 3.5163910388946533


training:  31%|███▏      | 1753/5566 [1:09:14<2:21:33,  2.23s/it]

training loss: 3.542449951171875


training:  32%|███▏      | 1754/5566 [1:09:16<2:21:54,  2.23s/it]

training loss: 3.518664836883545


training:  32%|███▏      | 1755/5566 [1:09:18<2:21:17,  2.22s/it]

training loss: 3.5268895626068115


training:  32%|███▏      | 1756/5566 [1:09:20<2:21:10,  2.22s/it]

training loss: 3.5033822059631348


training:  32%|███▏      | 1757/5566 [1:09:23<2:21:33,  2.23s/it]

training loss: 3.496473789215088


training:  32%|███▏      | 1758/5566 [1:09:25<2:21:20,  2.23s/it]

training loss: 3.525402784347534


training:  32%|███▏      | 1759/5566 [1:09:27<2:21:12,  2.23s/it]

training loss: 3.4919986724853516


training:  32%|███▏      | 1760/5566 [1:09:29<2:21:07,  2.22s/it]

training loss: 3.514930248260498
valid loss: 3.5144009590148926
perplexity: 33.59579849243164


training:  32%|███▏      | 1761/5566 [1:09:33<2:58:27,  2.81s/it]

training loss: 3.490318775177002


training:  32%|███▏      | 1762/5566 [1:09:36<2:47:21,  2.64s/it]

training loss: 3.530726432800293


training:  32%|███▏      | 1763/5566 [1:09:38<2:39:33,  2.52s/it]

training loss: 3.5232415199279785


training:  32%|███▏      | 1764/5566 [1:09:40<2:33:50,  2.43s/it]

training loss: 3.505645275115967


training:  32%|███▏      | 1765/5566 [1:09:42<2:29:35,  2.36s/it]

training loss: 3.521972894668579


training:  32%|███▏      | 1766/5566 [1:09:45<2:26:45,  2.32s/it]

training loss: 3.5315818786621094


training:  32%|███▏      | 1767/5566 [1:09:47<2:24:37,  2.28s/it]

training loss: 3.497495651245117


training:  32%|███▏      | 1768/5566 [1:09:49<2:23:18,  2.26s/it]

training loss: 3.516573429107666


training:  32%|███▏      | 1769/5566 [1:09:51<2:22:20,  2.25s/it]

training loss: 3.5059568881988525


training:  32%|███▏      | 1770/5566 [1:09:53<2:21:30,  2.24s/it]

training loss: 3.50974702835083


training:  32%|███▏      | 1771/5566 [1:09:56<2:20:41,  2.22s/it]

training loss: 3.5308122634887695


training:  32%|███▏      | 1772/5566 [1:09:58<2:20:29,  2.22s/it]

training loss: 3.52030086517334


training:  32%|███▏      | 1773/5566 [1:10:00<2:19:47,  2.21s/it]

training loss: 3.552678346633911


training:  32%|███▏      | 1774/5566 [1:10:02<2:20:01,  2.22s/it]

training loss: 3.5058696269989014


training:  32%|███▏      | 1775/5566 [1:10:04<2:19:43,  2.21s/it]

training loss: 3.5149340629577637


training:  32%|███▏      | 1776/5566 [1:10:07<2:19:22,  2.21s/it]

training loss: 3.526259660720825


training:  32%|███▏      | 1777/5566 [1:10:09<2:19:02,  2.20s/it]

training loss: 3.5295186042785645


training:  32%|███▏      | 1778/5566 [1:10:11<2:19:00,  2.20s/it]

training loss: 3.527783155441284


training:  32%|███▏      | 1779/5566 [1:10:13<2:19:12,  2.21s/it]

training loss: 3.516788959503174


training:  32%|███▏      | 1780/5566 [1:10:15<2:19:14,  2.21s/it]

training loss: 3.528607130050659
valid loss: 3.527681350708008
perplexity: 34.04494094848633


training:  32%|███▏      | 1781/5566 [1:10:20<2:56:06,  2.79s/it]

training loss: 3.5286178588867188


training:  32%|███▏      | 1782/5566 [1:10:22<2:45:01,  2.62s/it]

training loss: 3.516122579574585


training:  32%|███▏      | 1783/5566 [1:10:24<2:37:14,  2.49s/it]

training loss: 3.5045382976531982


training:  32%|███▏      | 1784/5566 [1:10:26<2:31:50,  2.41s/it]

training loss: 3.50449275970459


training:  32%|███▏      | 1785/5566 [1:10:28<2:28:06,  2.35s/it]

training loss: 3.5328853130340576


training:  32%|███▏      | 1786/5566 [1:10:31<2:24:44,  2.30s/it]

training loss: 3.5210120677948


training:  32%|███▏      | 1787/5566 [1:10:33<2:23:47,  2.28s/it]

training loss: 3.5443344116210938


training:  32%|███▏      | 1788/5566 [1:10:35<2:22:00,  2.26s/it]

training loss: 3.50260591506958


training:  32%|███▏      | 1789/5566 [1:10:37<2:21:00,  2.24s/it]

training loss: 3.522010087966919


training:  32%|███▏      | 1790/5566 [1:10:39<2:20:34,  2.23s/it]

training loss: 3.5025177001953125


training:  32%|███▏      | 1791/5566 [1:10:42<2:20:16,  2.23s/it]

training loss: 3.527862548828125


training:  32%|███▏      | 1792/5566 [1:10:44<2:20:00,  2.23s/it]

training loss: 3.512388229370117


training:  32%|███▏      | 1793/5566 [1:10:46<2:20:23,  2.23s/it]

training loss: 3.5322887897491455


training:  32%|███▏      | 1794/5566 [1:10:48<2:20:12,  2.23s/it]

training loss: 3.501492500305176


training:  32%|███▏      | 1795/5566 [1:10:51<2:19:36,  2.22s/it]

training loss: 3.5018157958984375


training:  32%|███▏      | 1796/5566 [1:10:53<2:19:53,  2.23s/it]

training loss: 3.536072015762329


training:  32%|███▏      | 1797/5566 [1:10:55<2:19:11,  2.22s/it]

training loss: 3.5172793865203857


training:  32%|███▏      | 1798/5566 [1:10:57<2:18:58,  2.21s/it]

training loss: 3.548156261444092


training:  32%|███▏      | 1799/5566 [1:10:59<2:19:07,  2.22s/it]

training loss: 3.51338529586792


training:  32%|███▏      | 1800/5566 [1:11:02<2:19:20,  2.22s/it]

training loss: 3.5124316215515137
valid loss: 3.511467695236206
perplexity: 33.49739456176758


training:  32%|███▏      | 1801/5566 [1:11:07<3:10:12,  3.03s/it]

training loss: 3.5306241512298584


training:  32%|███▏      | 1802/5566 [1:11:09<2:57:43,  2.83s/it]

training loss: 3.525437593460083


training:  32%|███▏      | 1803/5566 [1:11:12<2:57:05,  2.82s/it]

training loss: 3.5144803524017334


training:  32%|███▏      | 1804/5566 [1:11:14<2:46:47,  2.66s/it]

training loss: 3.501131057739258


training:  32%|███▏      | 1805/5566 [1:11:16<2:37:58,  2.52s/it]

training loss: 3.4966259002685547


training:  32%|███▏      | 1806/5566 [1:11:18<2:32:07,  2.43s/it]

training loss: 3.495252847671509


training:  32%|███▏      | 1807/5566 [1:11:21<2:28:10,  2.37s/it]

training loss: 3.5207133293151855


training:  32%|███▏      | 1808/5566 [1:11:23<2:25:45,  2.33s/it]

training loss: 3.525534152984619


training:  33%|███▎      | 1809/5566 [1:11:25<2:23:41,  2.29s/it]

training loss: 3.514317512512207


training:  33%|███▎      | 1810/5566 [1:11:27<2:22:18,  2.27s/it]

training loss: 3.5178627967834473


training:  33%|███▎      | 1811/5566 [1:11:30<2:21:13,  2.26s/it]

training loss: 3.5214152336120605


training:  33%|███▎      | 1812/5566 [1:11:32<2:20:18,  2.24s/it]

training loss: 3.5049171447753906


training:  33%|███▎      | 1813/5566 [1:11:34<2:19:19,  2.23s/it]

training loss: 3.5479512214660645


training:  33%|███▎      | 1814/5566 [1:11:36<2:18:29,  2.21s/it]

training loss: 3.52274227142334


training:  33%|███▎      | 1815/5566 [1:11:38<2:18:21,  2.21s/it]

training loss: 3.520961046218872


training:  33%|███▎      | 1816/5566 [1:11:41<2:18:03,  2.21s/it]

training loss: 3.5151915550231934


training:  33%|███▎      | 1817/5566 [1:11:43<2:18:34,  2.22s/it]

training loss: 3.527611255645752


training:  33%|███▎      | 1818/5566 [1:11:47<2:55:39,  2.81s/it]

training loss: 3.4924585819244385


training:  33%|███▎      | 1819/5566 [1:11:50<3:04:12,  2.95s/it]

training loss: 3.510984182357788


training:  33%|███▎      | 1820/5566 [1:11:52<2:50:37,  2.73s/it]

training loss: 3.5101242065429688
valid loss: 3.509185791015625
perplexity: 33.42104721069336


training:  33%|███▎      | 1821/5566 [1:11:57<3:17:38,  3.17s/it]

training loss: 3.5127458572387695


training:  33%|███▎      | 1822/5566 [1:11:59<2:59:53,  2.88s/it]

training loss: 3.4911487102508545


training:  33%|███▎      | 1823/5566 [1:12:01<2:47:06,  2.68s/it]

training loss: 3.528381109237671


training:  33%|███▎      | 1824/5566 [1:12:03<2:38:30,  2.54s/it]

training loss: 3.5367090702056885


training:  33%|███▎      | 1825/5566 [1:12:06<2:32:28,  2.45s/it]

training loss: 3.5408716201782227


training:  33%|███▎      | 1826/5566 [1:12:08<2:28:15,  2.38s/it]

training loss: 3.497992753982544


training:  33%|███▎      | 1827/5566 [1:12:10<2:24:51,  2.32s/it]

training loss: 3.5181996822357178


training:  33%|███▎      | 1828/5566 [1:12:12<2:22:58,  2.30s/it]

training loss: 3.538783073425293


training:  33%|███▎      | 1829/5566 [1:12:14<2:21:32,  2.27s/it]

training loss: 3.5369668006896973


training:  33%|███▎      | 1830/5566 [1:12:17<2:21:01,  2.26s/it]

training loss: 3.514587640762329


training:  33%|███▎      | 1831/5566 [1:12:19<2:20:20,  2.25s/it]

training loss: 3.5289626121520996


training:  33%|███▎      | 1832/5566 [1:12:21<2:19:20,  2.24s/it]

training loss: 3.491628646850586


training:  33%|███▎      | 1833/5566 [1:12:23<2:19:03,  2.23s/it]

training loss: 3.522339105606079


training:  33%|███▎      | 1834/5566 [1:12:26<2:18:48,  2.23s/it]

training loss: 3.5357446670532227


training:  33%|███▎      | 1835/5566 [1:12:28<2:18:47,  2.23s/it]

training loss: 3.5288612842559814


training:  33%|███▎      | 1836/5566 [1:12:30<2:18:25,  2.23s/it]

training loss: 3.543177366256714


training:  33%|███▎      | 1837/5566 [1:12:32<2:17:59,  2.22s/it]

training loss: 3.5372583866119385


training:  33%|███▎      | 1838/5566 [1:12:34<2:18:01,  2.22s/it]

training loss: 3.4898881912231445


training:  33%|███▎      | 1839/5566 [1:12:37<2:17:29,  2.21s/it]

training loss: 3.5257973670959473


training:  33%|███▎      | 1840/5566 [1:12:39<2:17:47,  2.22s/it]

training loss: 3.5486793518066406
valid loss: 3.5483107566833496
perplexity: 34.75455856323242


training:  33%|███▎      | 1841/5566 [1:12:43<2:54:09,  2.81s/it]

training loss: 3.5131309032440186


training:  33%|███▎      | 1842/5566 [1:12:45<2:42:54,  2.62s/it]

training loss: 3.532636880874634


training:  33%|███▎      | 1843/5566 [1:12:47<2:35:25,  2.50s/it]

training loss: 3.5096163749694824


training:  33%|███▎      | 1844/5566 [1:12:50<2:29:54,  2.42s/it]

training loss: 3.5214569568634033


training:  33%|███▎      | 1845/5566 [1:12:52<2:26:14,  2.36s/it]

training loss: 3.5076866149902344


training:  33%|███▎      | 1846/5566 [1:12:54<2:24:31,  2.33s/it]

training loss: 3.4989681243896484


training:  33%|███▎      | 1847/5566 [1:12:56<2:22:02,  2.29s/it]

training loss: 3.516305446624756


training:  33%|███▎      | 1848/5566 [1:12:59<2:20:35,  2.27s/it]

training loss: 3.5073189735412598


training:  33%|███▎      | 1849/5566 [1:13:01<2:19:18,  2.25s/it]

training loss: 3.509005308151245


training:  33%|███▎      | 1850/5566 [1:13:03<2:18:47,  2.24s/it]

training loss: 3.491011619567871


training:  33%|███▎      | 1851/5566 [1:13:05<2:17:50,  2.23s/it]

training loss: 3.5131163597106934


training:  33%|███▎      | 1852/5566 [1:13:07<2:18:03,  2.23s/it]

training loss: 3.5165810585021973


training:  33%|███▎      | 1853/5566 [1:13:10<2:17:21,  2.22s/it]

training loss: 3.5390970706939697


training:  33%|███▎      | 1854/5566 [1:13:13<2:33:01,  2.47s/it]

training loss: 3.5404725074768066


training:  33%|███▎      | 1855/5566 [1:13:15<2:30:42,  2.44s/it]

training loss: 3.526930809020996


training:  33%|███▎      | 1856/5566 [1:13:17<2:26:38,  2.37s/it]

training loss: 3.531442165374756


training:  33%|███▎      | 1857/5566 [1:13:19<2:23:39,  2.32s/it]

training loss: 3.515937328338623


training:  33%|███▎      | 1858/5566 [1:13:22<2:21:31,  2.29s/it]

training loss: 3.524810314178467


training:  33%|███▎      | 1859/5566 [1:13:24<2:20:05,  2.27s/it]

training loss: 3.501765251159668


training:  33%|███▎      | 1860/5566 [1:13:26<2:19:17,  2.26s/it]

training loss: 3.5149407386779785
valid loss: 3.513824224472046
perplexity: 33.5764274597168


training:  33%|███▎      | 1861/5566 [1:13:30<2:55:05,  2.84s/it]

training loss: 3.535754919052124


training:  33%|███▎      | 1862/5566 [1:13:33<2:43:40,  2.65s/it]

training loss: 3.514446973800659


training:  33%|███▎      | 1863/5566 [1:13:35<2:35:43,  2.52s/it]

training loss: 3.5024242401123047


training:  33%|███▎      | 1864/5566 [1:13:37<2:30:04,  2.43s/it]

training loss: 3.501832962036133


training:  34%|███▎      | 1865/5566 [1:13:39<2:25:57,  2.37s/it]

training loss: 3.5031135082244873


training:  34%|███▎      | 1866/5566 [1:13:41<2:22:59,  2.32s/it]

training loss: 3.5448412895202637


training:  34%|███▎      | 1867/5566 [1:13:44<2:21:20,  2.29s/it]

training loss: 3.5018038749694824


training:  34%|███▎      | 1868/5566 [1:13:46<2:19:45,  2.27s/it]

training loss: 3.5306942462921143


training:  34%|███▎      | 1869/5566 [1:13:48<2:19:08,  2.26s/it]

training loss: 3.498419761657715


training:  34%|███▎      | 1870/5566 [1:13:50<2:18:07,  2.24s/it]

training loss: 3.4948558807373047


training:  34%|███▎      | 1871/5566 [1:13:52<2:17:43,  2.24s/it]

training loss: 3.5405452251434326


training:  34%|███▎      | 1872/5566 [1:13:55<2:17:46,  2.24s/it]

training loss: 3.521590232849121


training:  34%|███▎      | 1873/5566 [1:13:57<2:17:02,  2.23s/it]

training loss: 3.524376392364502


training:  34%|███▎      | 1874/5566 [1:13:59<2:16:43,  2.22s/it]

training loss: 3.5145750045776367


training:  34%|███▎      | 1875/5566 [1:14:01<2:16:37,  2.22s/it]

training loss: 3.520326614379883


training:  34%|███▎      | 1876/5566 [1:14:04<2:16:15,  2.22s/it]

training loss: 3.5134341716766357


training:  34%|███▎      | 1877/5566 [1:14:06<2:15:54,  2.21s/it]

training loss: 3.524362802505493


training:  34%|███▎      | 1878/5566 [1:14:08<2:15:45,  2.21s/it]

training loss: 3.516951322555542


training:  34%|███▍      | 1879/5566 [1:14:10<2:15:23,  2.20s/it]

training loss: 3.535695791244507


training:  34%|███▍      | 1880/5566 [1:14:12<2:15:28,  2.21s/it]

training loss: 3.5283305644989014
valid loss: 3.5274736881256104
perplexity: 34.037872314453125


training:  34%|███▍      | 1881/5566 [1:14:17<2:52:17,  2.81s/it]

training loss: 3.5367367267608643


training:  34%|███▍      | 1882/5566 [1:14:19<2:41:09,  2.62s/it]

training loss: 3.5304222106933594


training:  34%|███▍      | 1883/5566 [1:14:21<2:33:50,  2.51s/it]

training loss: 3.5311179161071777


training:  34%|███▍      | 1884/5566 [1:14:23<2:28:42,  2.42s/it]

training loss: 3.531482219696045


training:  34%|███▍      | 1885/5566 [1:14:25<2:25:19,  2.37s/it]

training loss: 3.5218093395233154


training:  34%|███▍      | 1886/5566 [1:14:28<2:22:35,  2.32s/it]

training loss: 3.5413575172424316


training:  34%|███▍      | 1887/5566 [1:14:30<2:31:11,  2.47s/it]

training loss: 3.528702974319458


training:  34%|███▍      | 1888/5566 [1:14:33<2:27:20,  2.40s/it]

training loss: 3.5079030990600586


training:  34%|███▍      | 1889/5566 [1:14:35<2:23:32,  2.34s/it]

training loss: 3.535505771636963


training:  34%|███▍      | 1890/5566 [1:14:37<2:21:19,  2.31s/it]

training loss: 3.5011684894561768


training:  34%|███▍      | 1891/5566 [1:14:39<2:19:37,  2.28s/it]

training loss: 3.534351110458374


training:  34%|███▍      | 1892/5566 [1:14:42<2:18:46,  2.27s/it]

training loss: 3.5236663818359375


training:  34%|███▍      | 1893/5566 [1:14:44<2:17:38,  2.25s/it]

training loss: 3.496981620788574


training:  34%|███▍      | 1894/5566 [1:14:46<2:16:59,  2.24s/it]

training loss: 3.536355495452881


training:  34%|███▍      | 1895/5566 [1:14:48<2:17:01,  2.24s/it]

training loss: 3.531750440597534


training:  34%|███▍      | 1896/5566 [1:14:50<2:16:17,  2.23s/it]

training loss: 3.522247314453125


training:  34%|███▍      | 1897/5566 [1:14:53<2:15:37,  2.22s/it]

training loss: 3.537796974182129


training:  34%|███▍      | 1898/5566 [1:14:55<2:15:38,  2.22s/it]

training loss: 3.525456666946411


training:  34%|███▍      | 1899/5566 [1:14:57<2:15:24,  2.22s/it]

training loss: 3.541897773742676


training:  34%|███▍      | 1900/5566 [1:14:59<2:15:16,  2.21s/it]

training loss: 3.543536901473999
valid loss: 3.5429251194000244
perplexity: 34.56788635253906


training:  34%|███▍      | 1901/5566 [1:15:04<3:05:12,  3.03s/it]

training loss: 3.512540340423584


training:  34%|███▍      | 1902/5566 [1:15:07<2:51:53,  2.81s/it]

training loss: 3.5360183715820312


training:  34%|███▍      | 1903/5566 [1:15:09<2:41:01,  2.64s/it]

training loss: 3.533639907836914


training:  34%|███▍      | 1904/5566 [1:15:11<2:32:54,  2.51s/it]

training loss: 3.49509859085083


training:  34%|███▍      | 1905/5566 [1:15:13<2:27:32,  2.42s/it]

training loss: 3.524498224258423


training:  34%|███▍      | 1906/5566 [1:15:15<2:23:38,  2.35s/it]

training loss: 3.5083963871002197


training:  34%|███▍      | 1907/5566 [1:15:18<2:20:57,  2.31s/it]

training loss: 3.524171829223633


training:  34%|███▍      | 1908/5566 [1:15:20<2:19:06,  2.28s/it]

training loss: 3.5334911346435547


training:  34%|███▍      | 1909/5566 [1:15:22<2:17:57,  2.26s/it]

training loss: 3.5123839378356934


training:  34%|███▍      | 1910/5566 [1:15:24<2:17:02,  2.25s/it]

training loss: 3.5143609046936035


training:  34%|███▍      | 1911/5566 [1:15:27<2:16:28,  2.24s/it]

training loss: 3.505664348602295


training:  34%|███▍      | 1912/5566 [1:15:29<2:16:06,  2.24s/it]

training loss: 3.5188510417938232


training:  34%|███▍      | 1913/5566 [1:15:31<2:15:21,  2.22s/it]

training loss: 3.523259401321411


training:  34%|███▍      | 1914/5566 [1:15:33<2:15:27,  2.23s/it]

training loss: 3.5209736824035645


training:  34%|███▍      | 1915/5566 [1:15:35<2:15:02,  2.22s/it]

training loss: 3.53568172454834


training:  34%|███▍      | 1916/5566 [1:15:38<2:14:43,  2.21s/it]

training loss: 3.5264222621917725


training:  34%|███▍      | 1917/5566 [1:15:40<2:14:48,  2.22s/it]

training loss: 3.5213379859924316


training:  34%|███▍      | 1918/5566 [1:15:42<2:14:23,  2.21s/it]

training loss: 3.5445430278778076


training:  34%|███▍      | 1919/5566 [1:15:44<2:14:29,  2.21s/it]

training loss: 3.540802001953125


training:  34%|███▍      | 1920/5566 [1:15:46<2:14:57,  2.22s/it]

training loss: 3.553783655166626
valid loss: 3.552466630935669
perplexity: 34.89929962158203


training:  35%|███▍      | 1921/5566 [1:15:51<2:50:13,  2.80s/it]

training loss: 3.520594358444214


training:  35%|███▍      | 1922/5566 [1:15:53<2:39:02,  2.62s/it]

training loss: 3.534909725189209


training:  35%|███▍      | 1923/5566 [1:15:55<2:31:26,  2.49s/it]

training loss: 3.5278379917144775


training:  35%|███▍      | 1924/5566 [1:15:57<2:26:35,  2.41s/it]

training loss: 3.514190435409546


training:  35%|███▍      | 1925/5566 [1:15:59<2:22:20,  2.35s/it]

training loss: 3.5170767307281494


training:  35%|███▍      | 1926/5566 [1:16:02<2:19:49,  2.30s/it]

training loss: 3.5198886394500732


training:  35%|███▍      | 1927/5566 [1:16:04<2:18:21,  2.28s/it]

training loss: 3.514988660812378


training:  35%|███▍      | 1928/5566 [1:16:06<2:16:58,  2.26s/it]

training loss: 3.5242016315460205


training:  35%|███▍      | 1929/5566 [1:16:08<2:16:08,  2.25s/it]

training loss: 3.5154688358306885


training:  35%|███▍      | 1930/5566 [1:16:10<2:15:12,  2.23s/it]

training loss: 3.5215537548065186


training:  35%|███▍      | 1931/5566 [1:16:13<2:14:32,  2.22s/it]

training loss: 3.502960681915283


training:  35%|███▍      | 1932/5566 [1:16:15<2:13:59,  2.21s/it]

training loss: 3.4983510971069336


training:  35%|███▍      | 1933/5566 [1:16:17<2:14:06,  2.21s/it]

training loss: 3.5203213691711426


training:  35%|███▍      | 1934/5566 [1:16:19<2:14:22,  2.22s/it]

training loss: 3.5263049602508545


training:  35%|███▍      | 1935/5566 [1:16:22<2:14:14,  2.22s/it]

training loss: 3.525806427001953


training:  35%|███▍      | 1936/5566 [1:16:24<2:14:17,  2.22s/it]

training loss: 3.539191484451294


training:  35%|███▍      | 1937/5566 [1:16:26<2:14:04,  2.22s/it]

training loss: 3.5271944999694824


training:  35%|███▍      | 1938/5566 [1:16:28<2:14:08,  2.22s/it]

training loss: 3.5150856971740723


training:  35%|███▍      | 1939/5566 [1:16:30<2:14:02,  2.22s/it]

training loss: 3.5160953998565674


training:  35%|███▍      | 1940/5566 [1:16:33<2:14:03,  2.22s/it]

training loss: 3.5266151428222656
valid loss: 3.5262997150421143
perplexity: 33.99793243408203


training:  35%|███▍      | 1941/5566 [1:16:37<2:49:51,  2.81s/it]

training loss: 3.5210793018341064


training:  35%|███▍      | 1942/5566 [1:16:39<2:39:24,  2.64s/it]

training loss: 3.5580835342407227


training:  35%|███▍      | 1943/5566 [1:16:41<2:31:32,  2.51s/it]

training loss: 3.545457124710083


training:  35%|███▍      | 1944/5566 [1:16:43<2:25:40,  2.41s/it]

training loss: 3.5069069862365723


training:  35%|███▍      | 1945/5566 [1:16:46<2:21:30,  2.34s/it]

training loss: 3.5212597846984863


training:  35%|███▍      | 1946/5566 [1:16:48<2:19:36,  2.31s/it]

training loss: 3.5084166526794434


training:  35%|███▍      | 1947/5566 [1:16:50<2:17:13,  2.27s/it]

training loss: 3.5324437618255615


training:  35%|███▍      | 1948/5566 [1:16:52<2:16:09,  2.26s/it]

training loss: 3.5266940593719482


training:  35%|███▌      | 1949/5566 [1:16:54<2:15:28,  2.25s/it]

training loss: 3.5243968963623047


training:  35%|███▌      | 1950/5566 [1:16:57<2:14:28,  2.23s/it]

training loss: 3.544748067855835


training:  35%|███▌      | 1951/5566 [1:16:59<2:14:11,  2.23s/it]

training loss: 3.5216052532196045


training:  35%|███▌      | 1952/5566 [1:17:01<2:14:12,  2.23s/it]

training loss: 3.524041175842285


training:  35%|███▌      | 1953/5566 [1:17:03<2:13:53,  2.22s/it]

training loss: 3.5260913372039795


training:  35%|███▌      | 1954/5566 [1:17:06<2:13:45,  2.22s/it]

training loss: 3.529951572418213


training:  35%|███▌      | 1955/5566 [1:17:08<2:13:40,  2.22s/it]

training loss: 3.5205676555633545


training:  35%|███▌      | 1956/5566 [1:17:10<2:13:09,  2.21s/it]

training loss: 3.520409345626831


training:  35%|███▌      | 1957/5566 [1:17:12<2:13:33,  2.22s/it]

training loss: 3.513784885406494


training:  35%|███▌      | 1958/5566 [1:17:14<2:13:26,  2.22s/it]

training loss: 3.53653621673584


training:  35%|███▌      | 1959/5566 [1:17:17<2:13:38,  2.22s/it]

training loss: 3.5097222328186035


training:  35%|███▌      | 1960/5566 [1:17:19<2:13:16,  2.22s/it]

training loss: 3.525157928466797
valid loss: 3.524630546569824
perplexity: 33.94123077392578


training:  35%|███▌      | 1961/5566 [1:17:23<2:48:20,  2.80s/it]

training loss: 3.507533311843872


training:  35%|███▌      | 1962/5566 [1:17:25<2:37:18,  2.62s/it]

training loss: 3.544325351715088


training:  35%|███▌      | 1963/5566 [1:17:27<2:29:45,  2.49s/it]

training loss: 3.519519090652466


training:  35%|███▌      | 1964/5566 [1:17:30<2:24:43,  2.41s/it]

training loss: 3.521732807159424


training:  35%|███▌      | 1965/5566 [1:17:32<2:20:34,  2.34s/it]

training loss: 3.4886109828948975


training:  35%|███▌      | 1966/5566 [1:17:34<2:18:05,  2.30s/it]

training loss: 3.5256991386413574


training:  35%|███▌      | 1967/5566 [1:17:36<2:16:16,  2.27s/it]

training loss: 3.526090145111084


training:  35%|███▌      | 1968/5566 [1:17:38<2:14:40,  2.25s/it]

training loss: 3.5105977058410645


training:  35%|███▌      | 1969/5566 [1:17:41<2:14:00,  2.24s/it]

training loss: 3.517883062362671


training:  35%|███▌      | 1970/5566 [1:17:43<2:13:50,  2.23s/it]

training loss: 3.5325441360473633


training:  35%|███▌      | 1971/5566 [1:17:45<2:13:50,  2.23s/it]

training loss: 3.4881813526153564


training:  35%|███▌      | 1972/5566 [1:17:48<2:24:14,  2.41s/it]

training loss: 3.5323688983917236


training:  35%|███▌      | 1973/5566 [1:17:50<2:22:29,  2.38s/it]

training loss: 3.51617169380188


training:  35%|███▌      | 1974/5566 [1:17:52<2:19:23,  2.33s/it]

training loss: 3.5268704891204834


training:  35%|███▌      | 1975/5566 [1:17:55<2:17:04,  2.29s/it]

training loss: 3.5019447803497314


training:  36%|███▌      | 1976/5566 [1:17:57<2:16:10,  2.28s/it]

training loss: 3.5173089504241943


training:  36%|███▌      | 1977/5566 [1:17:59<2:15:15,  2.26s/it]

training loss: 3.511289596557617


training:  36%|███▌      | 1978/5566 [1:18:01<2:14:46,  2.25s/it]

training loss: 3.527547597885132


training:  36%|███▌      | 1979/5566 [1:18:04<2:14:21,  2.25s/it]

training loss: 3.5440523624420166


training:  36%|███▌      | 1980/5566 [1:18:06<2:13:50,  2.24s/it]

training loss: 3.5453226566314697
valid loss: 3.5446395874023438
perplexity: 34.62720489501953


training:  36%|███▌      | 1981/5566 [1:18:10<2:49:35,  2.84s/it]

training loss: 3.5366363525390625


training:  36%|███▌      | 1982/5566 [1:18:12<2:39:27,  2.67s/it]

training loss: 3.545677661895752


training:  36%|███▌      | 1983/5566 [1:18:15<2:31:09,  2.53s/it]

training loss: 3.5312204360961914


training:  36%|███▌      | 1984/5566 [1:18:17<2:25:36,  2.44s/it]

training loss: 3.527740478515625


training:  36%|███▌      | 1985/5566 [1:18:19<2:21:41,  2.37s/it]

training loss: 3.4917266368865967


training:  36%|███▌      | 1986/5566 [1:18:21<2:18:31,  2.32s/it]

training loss: 3.5044894218444824


training:  36%|███▌      | 1987/5566 [1:18:23<2:15:58,  2.28s/it]

training loss: 3.5306835174560547


training:  36%|███▌      | 1988/5566 [1:18:26<2:15:04,  2.27s/it]

training loss: 3.5104594230651855


training:  36%|███▌      | 1989/5566 [1:18:28<2:14:09,  2.25s/it]

training loss: 3.538356065750122


training:  36%|███▌      | 1990/5566 [1:18:30<2:13:01,  2.23s/it]

training loss: 3.519665479660034


training:  36%|███▌      | 1991/5566 [1:18:32<2:13:17,  2.24s/it]

training loss: 3.5198726654052734


training:  36%|███▌      | 1992/5566 [1:18:34<2:13:17,  2.24s/it]

training loss: 3.516125440597534


training:  36%|███▌      | 1993/5566 [1:18:37<2:12:57,  2.23s/it]

training loss: 3.5214080810546875


training:  36%|███▌      | 1994/5566 [1:18:39<2:12:21,  2.22s/it]

training loss: 3.491307258605957


training:  36%|███▌      | 1995/5566 [1:18:41<2:12:04,  2.22s/it]

training loss: 3.5053067207336426


training:  36%|███▌      | 1996/5566 [1:18:43<2:11:56,  2.22s/it]

training loss: 3.495415687561035


training:  36%|███▌      | 1997/5566 [1:18:46<2:12:11,  2.22s/it]

training loss: 3.5003838539123535


training:  36%|███▌      | 1998/5566 [1:18:48<2:12:02,  2.22s/it]

training loss: 3.5424180030822754


training:  36%|███▌      | 1999/5566 [1:18:50<2:12:09,  2.22s/it]

training loss: 3.5240185260772705


training:  36%|███▌      | 2000/5566 [1:18:52<2:11:39,  2.22s/it]

training loss: 3.5458409786224365
valid loss: 3.5447709560394287
perplexity: 34.631752014160156


training:  36%|███▌      | 2001/5566 [1:18:57<2:59:54,  3.03s/it]

training loss: 3.5199575424194336


training:  36%|███▌      | 2002/5566 [1:18:59<2:47:08,  2.81s/it]

training loss: 3.5310018062591553


training:  36%|███▌      | 2003/5566 [1:19:02<2:36:42,  2.64s/it]

training loss: 3.5146684646606445


training:  36%|███▌      | 2004/5566 [1:19:04<2:29:26,  2.52s/it]

training loss: 3.5301084518432617


training:  36%|███▌      | 2005/5566 [1:19:06<2:24:20,  2.43s/it]

training loss: 3.5325076580047607


training:  36%|███▌      | 2006/5566 [1:19:08<2:20:27,  2.37s/it]

training loss: 3.517641544342041


training:  36%|███▌      | 2007/5566 [1:19:11<2:18:10,  2.33s/it]

training loss: 3.5377323627471924


training:  36%|███▌      | 2008/5566 [1:19:13<2:15:41,  2.29s/it]

training loss: 3.505075216293335


training:  36%|███▌      | 2009/5566 [1:19:15<2:14:33,  2.27s/it]

training loss: 3.528827428817749


training:  36%|███▌      | 2010/5566 [1:19:17<2:13:27,  2.25s/it]

training loss: 3.533808946609497


training:  36%|███▌      | 2011/5566 [1:19:19<2:13:00,  2.24s/it]

training loss: 3.5033669471740723


training:  36%|███▌      | 2012/5566 [1:19:22<2:12:21,  2.23s/it]

training loss: 3.515259027481079


training:  36%|███▌      | 2013/5566 [1:19:24<2:11:44,  2.22s/it]

training loss: 3.5163514614105225


training:  36%|███▌      | 2014/5566 [1:19:26<2:10:56,  2.21s/it]

training loss: 3.4910030364990234


training:  36%|███▌      | 2015/5566 [1:19:28<2:11:13,  2.22s/it]

training loss: 3.5238912105560303


training:  36%|███▌      | 2016/5566 [1:19:30<2:11:14,  2.22s/it]

training loss: 3.5198142528533936


training:  36%|███▌      | 2017/5566 [1:19:33<2:10:45,  2.21s/it]

training loss: 3.4999918937683105


training:  36%|███▋      | 2018/5566 [1:19:35<2:10:41,  2.21s/it]

training loss: 3.538057327270508


training:  36%|███▋      | 2019/5566 [1:19:37<2:10:54,  2.21s/it]

training loss: 3.530179500579834


training:  36%|███▋      | 2020/5566 [1:19:39<2:10:27,  2.21s/it]

training loss: 3.5255918502807617
valid loss: 3.5246803760528564
perplexity: 33.94292449951172


training:  36%|███▋      | 2021/5566 [1:19:43<2:45:30,  2.80s/it]

training loss: 3.507636070251465


training:  36%|███▋      | 2022/5566 [1:19:46<2:35:21,  2.63s/it]

training loss: 3.5376315116882324


training:  36%|███▋      | 2023/5566 [1:19:48<2:27:42,  2.50s/it]

training loss: 3.514127731323242


training:  36%|███▋      | 2024/5566 [1:19:50<2:23:12,  2.43s/it]

training loss: 3.5036280155181885


training:  36%|███▋      | 2025/5566 [1:19:52<2:19:28,  2.36s/it]

training loss: 3.5282459259033203


training:  36%|███▋      | 2026/5566 [1:19:55<2:16:30,  2.31s/it]

training loss: 3.536458969116211


training:  36%|███▋      | 2027/5566 [1:19:57<2:14:40,  2.28s/it]

training loss: 3.5121819972991943


training:  36%|███▋      | 2028/5566 [1:19:59<2:13:22,  2.26s/it]

training loss: 3.5573437213897705


training:  36%|███▋      | 2029/5566 [1:20:01<2:12:45,  2.25s/it]

training loss: 3.5268263816833496


training:  36%|███▋      | 2030/5566 [1:20:03<2:12:13,  2.24s/it]

training loss: 3.5136590003967285


training:  36%|███▋      | 2031/5566 [1:20:06<2:11:39,  2.23s/it]

training loss: 3.527966260910034


training:  37%|███▋      | 2032/5566 [1:20:08<2:11:30,  2.23s/it]

training loss: 3.5070910453796387


training:  37%|███▋      | 2033/5566 [1:20:10<2:10:31,  2.22s/it]

training loss: 3.507749080657959


training:  37%|███▋      | 2034/5566 [1:20:12<2:10:08,  2.21s/it]

training loss: 3.506772756576538


training:  37%|███▋      | 2035/5566 [1:20:14<2:10:16,  2.21s/it]

training loss: 3.501652717590332


training:  37%|███▋      | 2036/5566 [1:20:17<2:09:59,  2.21s/it]

training loss: 3.5304336547851562


training:  37%|███▋      | 2037/5566 [1:20:19<2:09:31,  2.20s/it]

training loss: 3.5469822883605957


training:  37%|███▋      | 2038/5566 [1:20:21<2:09:36,  2.20s/it]

training loss: 3.5169286727905273


training:  37%|███▋      | 2039/5566 [1:20:23<2:09:58,  2.21s/it]

training loss: 3.5472095012664795


training:  37%|███▋      | 2040/5566 [1:20:26<2:10:00,  2.21s/it]

training loss: 3.530454397201538
valid loss: 3.530118942260742
perplexity: 34.12802505493164


training:  37%|███▋      | 2041/5566 [1:20:30<2:44:57,  2.81s/it]

training loss: 3.5351247787475586


training:  37%|███▋      | 2042/5566 [1:20:32<2:34:18,  2.63s/it]

training loss: 3.503859519958496


training:  37%|███▋      | 2043/5566 [1:20:34<2:27:00,  2.50s/it]

training loss: 3.533281087875366


training:  37%|███▋      | 2044/5566 [1:20:36<2:21:39,  2.41s/it]

training loss: 3.5280144214630127


training:  37%|███▋      | 2045/5566 [1:20:39<2:17:45,  2.35s/it]

training loss: 3.521355152130127


training:  37%|███▋      | 2046/5566 [1:20:41<2:15:24,  2.31s/it]

training loss: 3.523704767227173


training:  37%|███▋      | 2047/5566 [1:20:43<2:13:24,  2.27s/it]

training loss: 3.5164029598236084


training:  37%|███▋      | 2048/5566 [1:20:45<2:11:52,  2.25s/it]

training loss: 3.518939256668091


training:  37%|███▋      | 2049/5566 [1:20:47<2:10:51,  2.23s/it]

training loss: 3.5391972064971924


training:  37%|███▋      | 2050/5566 [1:20:50<2:10:02,  2.22s/it]

training loss: 3.5344066619873047


training:  37%|███▋      | 2051/5566 [1:20:52<2:09:22,  2.21s/it]

training loss: 3.5466995239257812


training:  37%|███▋      | 2052/5566 [1:20:54<2:09:13,  2.21s/it]

training loss: 3.519304037094116


training:  37%|███▋      | 2053/5566 [1:20:56<2:09:13,  2.21s/it]

training loss: 3.5306029319763184


training:  37%|███▋      | 2054/5566 [1:20:58<2:09:32,  2.21s/it]

training loss: 3.5276272296905518


training:  37%|███▋      | 2055/5566 [1:21:01<2:09:20,  2.21s/it]

training loss: 3.50817608833313


training:  37%|███▋      | 2056/5566 [1:21:03<2:09:58,  2.22s/it]

training loss: 3.519895315170288


training:  37%|███▋      | 2057/5566 [1:21:06<2:19:54,  2.39s/it]

training loss: 3.537508249282837


training:  37%|███▋      | 2058/5566 [1:21:08<2:18:33,  2.37s/it]

training loss: 3.51945161819458


training:  37%|███▋      | 2059/5566 [1:21:10<2:15:46,  2.32s/it]

training loss: 3.5357260704040527


training:  37%|███▋      | 2060/5566 [1:21:12<2:13:35,  2.29s/it]

training loss: 3.541100025177002
valid loss: 3.5402989387512207
perplexity: 34.47722625732422


training:  37%|███▋      | 2061/5566 [1:21:17<2:46:55,  2.86s/it]

training loss: 3.543095350265503


training:  37%|███▋      | 2062/5566 [1:21:19<2:35:30,  2.66s/it]

training loss: 3.5343716144561768


training:  37%|███▋      | 2063/5566 [1:21:21<2:27:26,  2.53s/it]

training loss: 3.522749423980713


training:  37%|███▋      | 2064/5566 [1:21:23<2:21:46,  2.43s/it]

training loss: 3.5361499786376953


training:  37%|███▋      | 2065/5566 [1:21:25<2:18:15,  2.37s/it]

training loss: 3.5274972915649414


training:  37%|███▋      | 2066/5566 [1:21:28<2:15:23,  2.32s/it]

training loss: 3.502589225769043


training:  37%|███▋      | 2067/5566 [1:21:30<2:13:30,  2.29s/it]

training loss: 3.495842456817627


training:  37%|███▋      | 2068/5566 [1:21:32<2:12:00,  2.26s/it]

training loss: 3.4969515800476074


training:  37%|███▋      | 2069/5566 [1:21:34<2:10:59,  2.25s/it]

training loss: 3.540558338165283


training:  37%|███▋      | 2070/5566 [1:21:36<2:10:12,  2.23s/it]

training loss: 3.5152392387390137


training:  37%|███▋      | 2071/5566 [1:21:39<2:09:52,  2.23s/it]

training loss: 3.5278830528259277


training:  37%|███▋      | 2072/5566 [1:21:41<2:09:03,  2.22s/it]

training loss: 3.514866590499878


training:  37%|███▋      | 2073/5566 [1:21:43<2:08:36,  2.21s/it]

training loss: 3.5221686363220215


training:  37%|███▋      | 2074/5566 [1:21:45<2:08:53,  2.21s/it]

training loss: 3.5146703720092773


training:  37%|███▋      | 2075/5566 [1:21:47<2:08:48,  2.21s/it]

training loss: 3.534222364425659


training:  37%|███▋      | 2076/5566 [1:21:50<2:08:34,  2.21s/it]

training loss: 3.5216612815856934


training:  37%|███▋      | 2077/5566 [1:21:52<2:08:15,  2.21s/it]

training loss: 3.5143301486968994


training:  37%|███▋      | 2078/5566 [1:21:54<2:08:06,  2.20s/it]

training loss: 3.5218863487243652


training:  37%|███▋      | 2079/5566 [1:21:56<2:07:47,  2.20s/it]

training loss: 3.5222296714782715


training:  37%|███▋      | 2080/5566 [1:21:58<2:08:03,  2.20s/it]

training loss: 3.525286912918091
valid loss: 3.5241947174072266
perplexity: 33.92644500732422


training:  37%|███▋      | 2081/5566 [1:22:03<2:42:08,  2.79s/it]

training loss: 3.5187220573425293


training:  37%|███▋      | 2082/5566 [1:22:05<2:31:39,  2.61s/it]

training loss: 3.533026695251465


training:  37%|███▋      | 2083/5566 [1:22:07<2:24:36,  2.49s/it]

training loss: 3.530141592025757


training:  37%|███▋      | 2084/5566 [1:22:09<2:19:17,  2.40s/it]

training loss: 3.5390615463256836


training:  37%|███▋      | 2085/5566 [1:22:11<2:15:47,  2.34s/it]

training loss: 3.510354995727539


training:  37%|███▋      | 2086/5566 [1:22:14<2:13:45,  2.31s/it]

training loss: 3.516183614730835


training:  37%|███▋      | 2087/5566 [1:22:16<2:11:43,  2.27s/it]

training loss: 3.53983736038208


training:  38%|███▊      | 2088/5566 [1:22:18<2:10:43,  2.26s/it]

training loss: 3.545668363571167


training:  38%|███▊      | 2089/5566 [1:22:20<2:09:34,  2.24s/it]

training loss: 3.533186197280884


training:  38%|███▊      | 2090/5566 [1:22:22<2:09:09,  2.23s/it]

training loss: 3.50492787361145


training:  38%|███▊      | 2091/5566 [1:22:25<2:08:51,  2.22s/it]

training loss: 3.524258613586426


training:  38%|███▊      | 2092/5566 [1:22:27<2:07:57,  2.21s/it]

training loss: 3.534257173538208


training:  38%|███▊      | 2093/5566 [1:22:29<2:07:48,  2.21s/it]

training loss: 3.4981138706207275


training:  38%|███▊      | 2094/5566 [1:22:31<2:07:54,  2.21s/it]

training loss: 3.511915683746338


training:  38%|███▊      | 2095/5566 [1:22:33<2:07:47,  2.21s/it]

training loss: 3.511030912399292


training:  38%|███▊      | 2096/5566 [1:22:36<2:07:40,  2.21s/it]

training loss: 3.5157570838928223


training:  38%|███▊      | 2097/5566 [1:22:38<2:07:22,  2.20s/it]

training loss: 3.5134291648864746


training:  38%|███▊      | 2098/5566 [1:22:40<2:07:15,  2.20s/it]

training loss: 3.518019914627075


training:  38%|███▊      | 2099/5566 [1:22:42<2:06:54,  2.20s/it]

training loss: 3.5298075675964355


training:  38%|███▊      | 2100/5566 [1:22:44<2:06:53,  2.20s/it]

training loss: 3.542389154434204
valid loss: 3.540933609008789
perplexity: 34.49911117553711


training:  38%|███▊      | 2101/5566 [1:22:49<2:53:32,  3.01s/it]

training loss: 3.515082836151123


training:  38%|███▊      | 2102/5566 [1:22:52<2:40:30,  2.78s/it]

training loss: 3.524268865585327


training:  38%|███▊      | 2103/5566 [1:22:54<2:30:38,  2.61s/it]

training loss: 3.4886112213134766


training:  38%|███▊      | 2104/5566 [1:22:56<2:23:16,  2.48s/it]

training loss: 3.5263383388519287


training:  38%|███▊      | 2105/5566 [1:22:58<2:18:24,  2.40s/it]

training loss: 3.5230910778045654


training:  38%|███▊      | 2106/5566 [1:23:00<2:14:49,  2.34s/it]

training loss: 3.535928249359131


training:  38%|███▊      | 2107/5566 [1:23:03<2:12:50,  2.30s/it]

training loss: 3.536189556121826


training:  38%|███▊      | 2108/5566 [1:23:05<2:11:29,  2.28s/it]

training loss: 3.512611150741577


training:  38%|███▊      | 2109/5566 [1:23:07<2:10:32,  2.27s/it]

training loss: 3.5014920234680176


training:  38%|███▊      | 2110/5566 [1:23:09<2:09:42,  2.25s/it]

training loss: 3.5112125873565674


training:  38%|███▊      | 2111/5566 [1:23:11<2:08:55,  2.24s/it]

training loss: 3.5296261310577393


training:  38%|███▊      | 2112/5566 [1:23:14<2:08:34,  2.23s/it]

training loss: 3.543879270553589


training:  38%|███▊      | 2113/5566 [1:23:16<2:08:11,  2.23s/it]

training loss: 3.529341459274292


training:  38%|███▊      | 2114/5566 [1:23:18<2:07:58,  2.22s/it]

training loss: 3.4980664253234863


training:  38%|███▊      | 2115/5566 [1:23:20<2:07:45,  2.22s/it]

training loss: 3.4926095008850098


training:  38%|███▊      | 2116/5566 [1:23:23<2:08:14,  2.23s/it]

training loss: 3.4905319213867188


training:  38%|███▊      | 2117/5566 [1:23:25<2:07:54,  2.23s/it]

training loss: 3.5214481353759766


training:  38%|███▊      | 2118/5566 [1:23:27<2:07:37,  2.22s/it]

training loss: 3.511593818664551


training:  38%|███▊      | 2119/5566 [1:23:29<2:07:10,  2.21s/it]

training loss: 3.5034396648406982


training:  38%|███▊      | 2120/5566 [1:23:31<2:07:13,  2.22s/it]

training loss: 3.5201358795166016
valid loss: 3.5196518898010254
perplexity: 33.77267074584961


training:  38%|███▊      | 2121/5566 [1:23:36<2:41:30,  2.81s/it]

training loss: 3.5189177989959717


training:  38%|███▊      | 2122/5566 [1:23:38<2:30:44,  2.63s/it]

training loss: 3.5362212657928467


training:  38%|███▊      | 2123/5566 [1:23:40<2:23:25,  2.50s/it]

training loss: 3.528529644012451


training:  38%|███▊      | 2124/5566 [1:23:42<2:18:15,  2.41s/it]

training loss: 3.5201950073242188


training:  38%|███▊      | 2125/5566 [1:23:44<2:14:13,  2.34s/it]

training loss: 3.5262856483459473


training:  38%|███▊      | 2126/5566 [1:23:47<2:11:30,  2.29s/it]

training loss: 3.5039138793945312


training:  38%|███▊      | 2127/5566 [1:23:49<2:09:36,  2.26s/it]

training loss: 3.530696392059326


training:  38%|███▊      | 2128/5566 [1:23:51<2:08:45,  2.25s/it]

training loss: 3.5258355140686035


training:  38%|███▊      | 2129/5566 [1:23:53<2:07:27,  2.22s/it]

training loss: 3.515796184539795


training:  38%|███▊      | 2130/5566 [1:23:55<2:06:48,  2.21s/it]

training loss: 3.5333118438720703


training:  38%|███▊      | 2131/5566 [1:23:58<2:06:24,  2.21s/it]

training loss: 3.5326755046844482


training:  38%|███▊      | 2132/5566 [1:24:00<2:06:39,  2.21s/it]

training loss: 3.5190181732177734


training:  38%|███▊      | 2133/5566 [1:24:02<2:06:31,  2.21s/it]

training loss: 3.5383694171905518


training:  38%|███▊      | 2134/5566 [1:24:04<2:06:14,  2.21s/it]

training loss: 3.5224175453186035


training:  38%|███▊      | 2135/5566 [1:24:06<2:05:58,  2.20s/it]

training loss: 3.553504467010498


training:  38%|███▊      | 2136/5566 [1:24:09<2:06:04,  2.21s/it]

training loss: 3.509277582168579


training:  38%|███▊      | 2137/5566 [1:24:11<2:05:49,  2.20s/it]

training loss: 3.5357284545898438


training:  38%|███▊      | 2138/5566 [1:24:13<2:05:32,  2.20s/it]

training loss: 3.507642984390259


training:  38%|███▊      | 2139/5566 [1:24:15<2:05:44,  2.20s/it]

training loss: 3.5336201190948486


training:  38%|███▊      | 2140/5566 [1:24:17<2:05:41,  2.20s/it]

training loss: 3.52474308013916
valid loss: 3.5240001678466797
perplexity: 33.91984176635742


training:  38%|███▊      | 2141/5566 [1:24:22<2:43:33,  2.87s/it]

training loss: 3.52081561088562


training:  38%|███▊      | 2142/5566 [1:24:25<2:42:01,  2.84s/it]

training loss: 3.52390456199646


training:  39%|███▊      | 2143/5566 [1:24:27<2:31:44,  2.66s/it]

training loss: 3.519573926925659


training:  39%|███▊      | 2144/5566 [1:24:29<2:24:18,  2.53s/it]

training loss: 3.525351047515869


training:  39%|███▊      | 2145/5566 [1:24:31<2:18:41,  2.43s/it]

training loss: 3.5302066802978516


training:  39%|███▊      | 2146/5566 [1:24:33<2:14:43,  2.36s/it]

training loss: 3.5034244060516357


training:  39%|███▊      | 2147/5566 [1:24:36<2:12:28,  2.32s/it]

training loss: 3.5264947414398193


training:  39%|███▊      | 2148/5566 [1:24:38<2:10:29,  2.29s/it]

training loss: 3.5194215774536133


training:  39%|███▊      | 2149/5566 [1:24:40<2:08:37,  2.26s/it]

training loss: 3.5257859230041504


training:  39%|███▊      | 2150/5566 [1:24:42<2:07:31,  2.24s/it]

training loss: 3.5316834449768066


training:  39%|███▊      | 2151/5566 [1:24:44<2:06:46,  2.23s/it]

training loss: 3.5257623195648193


training:  39%|███▊      | 2152/5566 [1:24:47<2:06:04,  2.22s/it]

training loss: 3.511603593826294


training:  39%|███▊      | 2153/5566 [1:24:49<2:05:27,  2.21s/it]

training loss: 3.524024724960327


training:  39%|███▊      | 2154/5566 [1:24:51<2:05:24,  2.21s/it]

training loss: 3.4858765602111816


training:  39%|███▊      | 2155/5566 [1:24:53<2:04:55,  2.20s/it]

training loss: 3.5346899032592773


training:  39%|███▊      | 2156/5566 [1:24:55<2:04:45,  2.20s/it]

training loss: 3.537921905517578


training:  39%|███▉      | 2157/5566 [1:24:58<2:04:47,  2.20s/it]

training loss: 3.515291929244995


training:  39%|███▉      | 2158/5566 [1:25:00<2:04:31,  2.19s/it]

training loss: 3.5107827186584473


training:  39%|███▉      | 2159/5566 [1:25:02<2:04:53,  2.20s/it]

training loss: 3.5277657508850098


training:  39%|███▉      | 2160/5566 [1:25:04<2:04:58,  2.20s/it]

training loss: 3.5101418495178223
valid loss: 3.50972843170166
perplexity: 33.439186096191406


training:  39%|███▉      | 2161/5566 [1:25:08<2:38:12,  2.79s/it]

training loss: 3.5198256969451904


training:  39%|███▉      | 2162/5566 [1:25:11<2:27:52,  2.61s/it]

training loss: 3.528327703475952


training:  39%|███▉      | 2163/5566 [1:25:13<2:21:14,  2.49s/it]

training loss: 3.531848430633545


training:  39%|███▉      | 2164/5566 [1:25:15<2:16:11,  2.40s/it]

training loss: 3.5398471355438232


training:  39%|███▉      | 2165/5566 [1:25:17<2:12:37,  2.34s/it]

training loss: 3.5180301666259766


training:  39%|███▉      | 2166/5566 [1:25:19<2:10:02,  2.29s/it]

training loss: 3.5214459896087646


training:  39%|███▉      | 2167/5566 [1:25:22<2:08:10,  2.26s/it]

training loss: 3.5339791774749756


training:  39%|███▉      | 2168/5566 [1:25:24<2:06:50,  2.24s/it]

training loss: 3.5175845623016357


training:  39%|███▉      | 2169/5566 [1:25:26<2:06:25,  2.23s/it]

training loss: 3.514796018600464


training:  39%|███▉      | 2170/5566 [1:25:28<2:05:57,  2.23s/it]

training loss: 3.5280230045318604


training:  39%|███▉      | 2171/5566 [1:25:30<2:05:51,  2.22s/it]

training loss: 3.526568651199341


training:  39%|███▉      | 2172/5566 [1:25:33<2:05:15,  2.21s/it]

training loss: 3.5273876190185547


training:  39%|███▉      | 2173/5566 [1:25:35<2:04:59,  2.21s/it]

training loss: 3.519609212875366


training:  39%|███▉      | 2174/5566 [1:25:37<2:05:14,  2.22s/it]

training loss: 3.5214996337890625


training:  39%|███▉      | 2175/5566 [1:25:39<2:04:56,  2.21s/it]

training loss: 3.5181891918182373


training:  39%|███▉      | 2176/5566 [1:25:41<2:04:42,  2.21s/it]

training loss: 3.5300493240356445


training:  39%|███▉      | 2177/5566 [1:25:44<2:04:28,  2.20s/it]

training loss: 3.5208401679992676


training:  39%|███▉      | 2178/5566 [1:25:46<2:04:23,  2.20s/it]

training loss: 3.5092086791992188


training:  39%|███▉      | 2179/5566 [1:25:48<2:04:38,  2.21s/it]

training loss: 3.540224075317383


training:  39%|███▉      | 2180/5566 [1:25:50<2:04:22,  2.20s/it]

training loss: 3.5059330463409424
valid loss: 3.505580425262451
perplexity: 33.30076599121094


training:  39%|███▉      | 2181/5566 [1:25:54<2:37:25,  2.79s/it]

training loss: 3.5077359676361084


training:  39%|███▉      | 2182/5566 [1:25:57<2:27:36,  2.62s/it]

training loss: 3.513903856277466


training:  39%|███▉      | 2183/5566 [1:25:59<2:20:36,  2.49s/it]

training loss: 3.532604455947876


training:  39%|███▉      | 2184/5566 [1:26:01<2:15:13,  2.40s/it]

training loss: 3.494523763656616


training:  39%|███▉      | 2185/5566 [1:26:03<2:11:41,  2.34s/it]

training loss: 3.5017406940460205


training:  39%|███▉      | 2186/5566 [1:26:05<2:09:27,  2.30s/it]

training loss: 3.5425865650177


training:  39%|███▉      | 2187/5566 [1:26:08<2:07:34,  2.27s/it]

training loss: 3.536344528198242


training:  39%|███▉      | 2188/5566 [1:26:10<2:06:40,  2.25s/it]

training loss: 3.5136871337890625


training:  39%|███▉      | 2189/5566 [1:26:12<2:05:58,  2.24s/it]

training loss: 3.5363128185272217


training:  39%|███▉      | 2190/5566 [1:26:14<2:05:06,  2.22s/it]

training loss: 3.4995102882385254


training:  39%|███▉      | 2191/5566 [1:26:16<2:04:36,  2.22s/it]

training loss: 3.5270657539367676


training:  39%|███▉      | 2192/5566 [1:26:19<2:04:25,  2.21s/it]

training loss: 3.498666286468506


training:  39%|███▉      | 2193/5566 [1:26:21<2:04:08,  2.21s/it]

training loss: 3.529218912124634


training:  39%|███▉      | 2194/5566 [1:26:23<2:04:20,  2.21s/it]

training loss: 3.5086259841918945


training:  39%|███▉      | 2195/5566 [1:26:25<2:04:28,  2.22s/it]

training loss: 3.505333423614502


training:  39%|███▉      | 2196/5566 [1:26:27<2:04:19,  2.21s/it]

training loss: 3.5053725242614746


training:  39%|███▉      | 2197/5566 [1:26:30<2:04:15,  2.21s/it]

training loss: 3.526547431945801


training:  39%|███▉      | 2198/5566 [1:26:32<2:03:26,  2.20s/it]

training loss: 3.5238261222839355


training:  40%|███▉      | 2199/5566 [1:26:34<2:03:31,  2.20s/it]

training loss: 3.515591621398926


training:  40%|███▉      | 2200/5566 [1:26:36<2:03:20,  2.20s/it]

training loss: 3.5008089542388916
valid loss: 3.5001325607299805
perplexity: 33.119842529296875


training:  40%|███▉      | 2201/5566 [1:26:41<2:49:34,  3.02s/it]

training loss: 3.5107202529907227


training:  40%|███▉      | 2202/5566 [1:26:43<2:36:54,  2.80s/it]

training loss: 3.51347017288208


training:  40%|███▉      | 2203/5566 [1:26:46<2:27:10,  2.63s/it]

training loss: 3.508479595184326


training:  40%|███▉      | 2204/5566 [1:26:48<2:19:35,  2.49s/it]

training loss: 3.5022032260894775


training:  40%|███▉      | 2205/5566 [1:26:50<2:14:43,  2.41s/it]

training loss: 3.5033462047576904


training:  40%|███▉      | 2206/5566 [1:26:52<2:11:26,  2.35s/it]

training loss: 3.5461673736572266


training:  40%|███▉      | 2207/5566 [1:26:54<2:09:23,  2.31s/it]

training loss: 3.518808364868164


training:  40%|███▉      | 2208/5566 [1:26:57<2:08:05,  2.29s/it]

training loss: 3.5259664058685303


training:  40%|███▉      | 2209/5566 [1:26:59<2:06:45,  2.27s/it]

training loss: 3.5233359336853027


training:  40%|███▉      | 2210/5566 [1:27:01<2:12:02,  2.36s/it]

training loss: 3.508338212966919


training:  40%|███▉      | 2211/5566 [1:27:04<2:12:45,  2.37s/it]

training loss: 3.5325300693511963


training:  40%|███▉      | 2212/5566 [1:27:06<2:16:23,  2.44s/it]

training loss: 3.5019402503967285


training:  40%|███▉      | 2213/5566 [1:27:09<2:12:50,  2.38s/it]

training loss: 3.515389919281006


training:  40%|███▉      | 2214/5566 [1:27:11<2:10:06,  2.33s/it]

training loss: 3.5331573486328125


training:  40%|███▉      | 2215/5566 [1:27:13<2:09:16,  2.31s/it]

training loss: 3.520888566970825


training:  40%|███▉      | 2216/5566 [1:27:15<2:07:14,  2.28s/it]

training loss: 3.514986276626587


training:  40%|███▉      | 2217/5566 [1:27:18<2:06:22,  2.26s/it]

training loss: 3.515899658203125


training:  40%|███▉      | 2218/5566 [1:27:20<2:05:41,  2.25s/it]

training loss: 3.5216188430786133


training:  40%|███▉      | 2219/5566 [1:27:22<2:05:22,  2.25s/it]

training loss: 3.516665458679199


training:  40%|███▉      | 2220/5566 [1:27:24<2:05:23,  2.25s/it]

training loss: 3.537134885787964
valid loss: 3.536769151687622
perplexity: 34.35573959350586


training:  40%|███▉      | 2221/5566 [1:27:29<2:38:00,  2.83s/it]

training loss: 3.529237985610962


training:  40%|███▉      | 2222/5566 [1:27:31<2:28:30,  2.66s/it]

training loss: 3.5353446006774902


training:  40%|███▉      | 2223/5566 [1:27:33<2:20:43,  2.53s/it]

training loss: 3.53037691116333


training:  40%|███▉      | 2224/5566 [1:27:35<2:15:17,  2.43s/it]

training loss: 3.5159597396850586


training:  40%|███▉      | 2225/5566 [1:27:37<2:11:43,  2.37s/it]

training loss: 3.5226590633392334


training:  40%|███▉      | 2226/5566 [1:27:40<2:14:09,  2.41s/it]

training loss: 3.510967254638672


training:  40%|████      | 2227/5566 [1:27:43<2:19:53,  2.51s/it]

training loss: 3.5186398029327393


training:  40%|████      | 2228/5566 [1:27:45<2:14:20,  2.41s/it]

training loss: 3.4953830242156982


training:  40%|████      | 2229/5566 [1:27:47<2:10:46,  2.35s/it]

training loss: 3.5020530223846436


training:  40%|████      | 2230/5566 [1:27:49<2:08:28,  2.31s/it]

training loss: 3.531635284423828


training:  40%|████      | 2231/5566 [1:27:52<2:07:04,  2.29s/it]

training loss: 3.5357017517089844


training:  40%|████      | 2232/5566 [1:27:54<2:05:54,  2.27s/it]

training loss: 3.5256006717681885


training:  40%|████      | 2233/5566 [1:27:56<2:05:10,  2.25s/it]

training loss: 3.5128977298736572


training:  40%|████      | 2234/5566 [1:27:58<2:04:15,  2.24s/it]

training loss: 3.5268120765686035


training:  40%|████      | 2235/5566 [1:28:00<2:03:45,  2.23s/it]

training loss: 3.539278745651245


training:  40%|████      | 2236/5566 [1:28:03<2:03:08,  2.22s/it]

training loss: 3.5408310890197754


training:  40%|████      | 2237/5566 [1:28:05<2:02:51,  2.21s/it]

training loss: 3.5374717712402344


training:  40%|████      | 2238/5566 [1:28:07<2:02:49,  2.21s/it]

training loss: 3.5424067974090576


training:  40%|████      | 2239/5566 [1:28:09<2:02:50,  2.22s/it]

training loss: 3.543905258178711


training:  40%|████      | 2240/5566 [1:28:11<2:03:14,  2.22s/it]

training loss: 3.538429021835327
valid loss: 3.5373470783233643
perplexity: 34.37560272216797


training:  40%|████      | 2241/5566 [1:28:16<2:35:48,  2.81s/it]

training loss: 3.536512851715088


training:  40%|████      | 2242/5566 [1:28:18<2:26:08,  2.64s/it]

training loss: 3.530308485031128


training:  40%|████      | 2243/5566 [1:28:20<2:19:03,  2.51s/it]

training loss: 3.525611400604248


training:  40%|████      | 2244/5566 [1:28:22<2:14:14,  2.42s/it]

training loss: 3.5155410766601562


training:  40%|████      | 2245/5566 [1:28:25<2:10:49,  2.36s/it]

training loss: 3.513622999191284


training:  40%|████      | 2246/5566 [1:28:27<2:08:31,  2.32s/it]

training loss: 3.545377731323242


training:  40%|████      | 2247/5566 [1:28:29<2:07:26,  2.30s/it]

training loss: 3.538961887359619


training:  40%|████      | 2248/5566 [1:28:31<2:06:01,  2.28s/it]

training loss: 3.4976351261138916


training:  40%|████      | 2249/5566 [1:28:33<2:05:01,  2.26s/it]

training loss: 3.500945568084717


training:  40%|████      | 2250/5566 [1:28:36<2:05:17,  2.27s/it]

training loss: 3.5292229652404785


training:  40%|████      | 2251/5566 [1:28:38<2:04:29,  2.25s/it]

training loss: 3.535979986190796


training:  40%|████      | 2252/5566 [1:28:40<2:03:48,  2.24s/it]

training loss: 3.513650894165039


training:  40%|████      | 2253/5566 [1:28:42<2:03:10,  2.23s/it]

training loss: 3.5369720458984375


training:  40%|████      | 2254/5566 [1:28:45<2:02:46,  2.22s/it]

training loss: 3.5423684120178223


training:  41%|████      | 2255/5566 [1:28:47<2:03:21,  2.24s/it]

training loss: 3.5331220626831055


training:  41%|████      | 2256/5566 [1:28:49<2:02:54,  2.23s/it]

training loss: 3.512298583984375


training:  41%|████      | 2257/5566 [1:28:51<2:02:32,  2.22s/it]

training loss: 3.532207489013672


training:  41%|████      | 2258/5566 [1:28:53<2:02:07,  2.22s/it]

training loss: 3.49692702293396


training:  41%|████      | 2259/5566 [1:28:56<2:01:43,  2.21s/it]

training loss: 3.5037715435028076


training:  41%|████      | 2260/5566 [1:28:58<2:01:38,  2.21s/it]

training loss: 3.5127062797546387
valid loss: 3.5121853351593018
perplexity: 33.52144241333008


training:  41%|████      | 2261/5566 [1:29:02<2:33:39,  2.79s/it]

training loss: 3.514904260635376


training:  41%|████      | 2262/5566 [1:29:04<2:24:47,  2.63s/it]

training loss: 3.511554718017578


training:  41%|████      | 2263/5566 [1:29:07<2:17:44,  2.50s/it]

training loss: 3.51084566116333


training:  41%|████      | 2264/5566 [1:29:09<2:12:52,  2.41s/it]

In [None]:
class TransformerDecoder(nn.Module):
    def __init__(
          self,
          num_tokens,
          d,
          heads = 8,
          depth = 4,
          hidden_size = 1000,
          dropout = 0.3,
          batch_size = 16
      ):
          # asserts
          assert d % heads == 0

          super(TransformerDecoder, self).__init__()
          self.token_emb = nn.Embedding(num_tokens, d)
          self.positional_emb = PositionalEncoding(d, max_len = 5000)
          self.dim_head = d // heads
          self.d = d
          self.heads = heads
          self.depth = depth
          self.hidden_size = hidden_size
          self.dropout = dropout
          self.batch_size = batch_size

          self.layers = nn.ModuleList([])
          for idx in range(depth):
              attn = MultiHeadAttention(num_tokens, d, heads, self.batch_size)

              self.layers.append(nn.ModuleList([
                  attn,
                  SubLayer(d, dropout, hidden_size)
              ]))

          self.to_out = nn.Sequential(
               nn.LayerNorm(d),
               nn.Linear(d, num_tokens)
          )
          
    def forward(
        self,
        x
    ):
        batch_size, seq_len, *_, device = *x.shape, x.device
        x = self.token_emb(x)
        x = self.positional_emb(x)

        for idx, (attn, sub_l) in enumerate(self.layers):
            
            #attention
            x, mem = attn(x, device)
      
            # normalization + feedforward + residual connection
            x = sub_l(x)

        return self.to_out(x).transpose(1, 2)