<a href="https://colab.research.google.com/github/whoami-Lory271/NN-project-memorizing-transformers/blob/main/NN_project_Antonelli_DeSantis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from torch import nn as nn
import numpy as np
from torch.nn import functional as F
from math import sqrt
import matplotlib.pyplot as plt
from torch.autograd import Variable
from pathlib import Path
from filelock import FileLock
import random
import tqdm
import gzip
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# KNN Memory

In [None]:
!pip install faiss-gpu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install einops

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# imports for the KNN memory
import os
import math
import torch
import faiss
import numpy as np
from pathlib import Path
from functools import wraps

from contextlib import ExitStack, contextmanager

from einops import rearrange, pack, unpack

# multiprocessing

from joblib import Parallel, delayed, cpu_count

In [None]:
# faiss GPU id, read from the FAISS_INDEX_GPU_ID environment variable (falls back to 0)
FAISS_INDEX_GPU_ID = int(os.environ.get('FAISS_INDEX_GPU_ID', 0))

# default directory used for the memmap files that back the KNN memories
DEFAULT_KNN_MEMORY_MEMMAP_DIRECTORY = './.tmp/knn.memories'

# helper functions

def exists(val):
    """Return True for anything other than None (falsy values such as 0 or '' still count as existing)."""
    is_missing = val is None
    return not is_missing

def default(val, d):
    """Return `val` unless it is None, in which case return the fallback `d`."""
    if val is None:
        return d
    return val

def cast_list(val):
    """Return `val` untouched when it is already a list, otherwise wrap it in a one-element list."""
    if isinstance(val, list):
        return val
    return [val]

def all_el_unique(arr):
    """Return True when no element of `arr` occurs more than once."""
    distinct = set(arr)
    return len(distinct) == len(arr)

@contextmanager
def multi_context(*cms):
    """Enter all given context managers in order and yield the list of values they produced.

    The managers are exited in reverse order when the `with` block ends.
    """
    with ExitStack() as stack:
        entered = []
        for manager in cms:
            entered.append(stack.enter_context(manager))
        yield entered

def count_intersect(x, y):
    """For every element of the 1-D array `x`, count how many entries of the 1-D array `y` equal it."""
    column = rearrange(x, 'i -> i 1')
    row = rearrange(y, 'j -> 1 j')
    # (i, j) boolean table of pairwise equality, summed over the `y` axis
    matches = column == row
    return np.sum(matches, axis = -1)

def check_shape(tensor, pattern, **kwargs):
    """Validate `tensor` against an einops `pattern` by rearranging it onto itself.

    Axis sizes passed as keyword arguments are forwarded to `rearrange`; the
    (unchanged) tensor is returned.
    """
    identity_pattern = f"{pattern} -> {pattern}"
    return rearrange(tensor, identity_pattern, **kwargs)

# a wrapper around faiss IndexHNSWFlat
# that wipes the whole index once the configured capacity is exceeded (when capping is enabled)

class KNN():
    """Thin wrapper around a faiss `IndexHNSWFlat` (inner-product metric).

    Besides the faiss index it keeps the ids handed to `add` and, when
    `keep_stats` is set, per-id hit / age counters. When `cap_num_entries`
    is enabled the whole index is wiped as soon as adding a batch would push
    the number of stored ids above `max_num_entries`.
    """

    def __init__(
        self,
        dim,
        max_num_entries,
        cap_num_entries = False,
        M = 15,
        keep_stats = False
    ):
        """
        Args:
            dim: dimensionality of the indexed vectors.
            max_num_entries: capacity used by the capping logic in `add`.
            cap_num_entries: whether `add` enforces `max_num_entries`.
            M: passed to `faiss.IndexHNSWFlat` as its second argument.
            keep_stats: whether hit / age counters are maintained.
        """
        index = faiss.IndexHNSWFlat(dim, M, faiss.METRIC_INNER_PRODUCT)
        self.index = index
        self.max_num_entries = max_num_entries
        self.cap_num_entries = cap_num_entries
        self.is_trained = False
        self.keep_stats = keep_stats

        self.reset()

    def __del__(self):
        if hasattr(self, 'index'):
            del self.index

    def reset(self):
        """Forget every stored id / statistic and empty the faiss index."""
        self.ids = np.empty((0,), dtype = np.int32)

        if self.keep_stats:
            self.hits = np.empty((0,), dtype = np.int32)
            self.age_num_iterations = np.empty((0,), dtype = np.int32)
            self.ages_since_last_hit = np.empty((0,), dtype = np.int32)

        self.index.reset()
        self.is_trained = False

    def train(self, x):
        """Train the underlying index on `x` and mark this wrapper as searchable."""
        self.index.train(x)
        self.is_trained = True

    def add(self, x, ids):
        """Add the vectors `x`, registered under `ids`, to the index.

        The capacity check runs BEFORE the new batch is registered: if the batch
        would overflow `max_num_entries`, the index is wiped first and the batch
        then becomes its sole content. This keeps `ids`, the statistics arrays
        and `is_trained` consistent with what the faiss index actually holds.

        Returns:
            Whatever `index.add(x)` returns.
        """
        if self.cap_num_entries and len(self.ids) + len(ids) > self.max_num_entries:
            self.reset()

        if not self.is_trained:
            self.train(x)

        self.ids = np.concatenate((ids, self.ids))

        if self.keep_stats:
            self.hits = np.concatenate((np.zeros_like(ids), self.hits))
            self.age_num_iterations = np.concatenate((np.zeros_like(ids), self.age_num_iterations))
            self.ages_since_last_hit = np.concatenate((np.zeros_like(ids), self.ages_since_last_hit))

        return self.index.add(x)

    def search(
        self,
        x,
        topk,
        nprobe = 8,
        return_distances = False,
        increment_hits = False,
        increment_age = True
    ):
        """Look up the `topk` nearest stored vectors for every row of `x`.

        Args:
            x: query array; `x.shape[0]` is the number of queries.
            topk: number of neighbours per query.
            nprobe: accepted for interface compatibility; not used by this method.
            return_distances: also return the distances reported by faiss.
            increment_hits: update hit statistics (only when `keep_stats`).
            increment_age: update age statistics (only when `keep_stats`).

        Returns:
            `indices`, or `(indices, distances)` when `return_distances` is set.
            While nothing has been added yet, `indices` is filled with -1 and
            `distances` with -inf placeholders.
        """
        if not self.is_trained:
            indices = np.full((x.shape[0], topk), -1)

            if return_distances:
                # keep the return arity identical to the trained path
                distances = np.full((x.shape[0], topk), -np.inf, dtype = np.float32)
                return indices, distances

            return indices

        distances, indices = self.index.search(x, k = topk)

        if increment_hits and self.keep_stats:
            hits = count_intersect(self.ids, rearrange(indices, '... -> (...)'))
            self.hits += hits

            # age every entry, then zero the age of the ones that were just hit
            self.ages_since_last_hit += 1
            self.ages_since_last_hit *= (hits == 0)

        if increment_age and self.keep_stats:
            self.age_num_iterations += 1

        if return_distances:
            return indices, distances

        return indices

# KNN memory layer, where one can store key / value memories
# can automatically take care of a collection of faiss indices (across batch dimension)

class KNNMemory():
    """Key / value memory made of one `KNN` index per batch row plus a memmap "database".

    Keys are indexed in faiss for retrieval; the full (key, value) pairs are
    stored in a `np.memmap` of shape `(num_indices, max_memories, 2, dim)`,
    written in a ring-buffer fashion (positions are taken modulo `max_memories`).
    `scoped_indices` selects which batch rows `add` / `search` operate on.
    """

    def __init__(
        self,
        dim,
        max_memories = 16000,
        num_indices = 1,
        memmap_filename = './knn.memory.memmap',
        multiprocessing = True
    ):
        """
        Args:
            dim: dimensionality of each key / value vector.
            max_memories: number of (key, value) slots per index.
            num_indices: number of independent indices (one per batch row).
            memmap_filename: file backing the memmap; opened with mode 'w+'.
            multiprocessing: use `cpu_count()` joblib workers instead of 1.
        """
        self.dim = dim
        self.num_indices = num_indices
        self.scoped_indices = list(range(num_indices))

        self.max_memories = max_memories
        self.shape = (num_indices, max_memories, 2, dim)
        self.db_offsets = np.zeros(num_indices, dtype = np.int32)

        self.db = np.memmap(memmap_filename, mode = 'w+', dtype = np.float32, shape = self.shape)
        self.knns = [KNN(dim = dim, max_num_entries = max_memories, cap_num_entries = True) for _ in range(num_indices)]

        self.n_jobs = cpu_count() if multiprocessing else 1

    def set_scoped_indices(self, indices):
        """Restrict subsequent `add` / `search` calls to the given unique, in-range batch rows."""
        indices = list(indices)
        assert all_el_unique(indices), f'all scoped batch indices must be unique, received: {indices}'
        assert all([0 <= i < self.num_indices for i in indices]), f'each batch index must be between 0 and less than {self.num_indices}: received {indices}'
        self.scoped_indices = indices

    @contextmanager
    def at_batch_indices(self, indices):
        """Temporarily scope the memory to `indices`; the previous scope is restored on exit."""
        prev_indices = self.scoped_indices
        self.set_scoped_indices(indices)
        try:
            yield self
        finally:
            # restore even when the body raised, so the scope cannot get stuck
            self.set_scoped_indices(prev_indices)

    def clear(self, batch_indices = None):
        """Reset the KNN index and write offset of the given batch rows (all rows when None)."""
        if not exists(batch_indices):
            batch_indices = list(range(self.num_indices))

        batch_indices = cast_list(batch_indices)

        for index in batch_indices:
            knn = self.knns[index]
            knn.reset()

        self.db_offsets[batch_indices] = 0

    def add(self, memories):
        """Store new memories for the currently scoped batch rows.

        Args:
            memories: tensor of shape `(b, n, 2, dim)` with `b == len(scoped_indices)`;
                index 0 of the third axis is the key, index 1 the value. Only the
                last `max_memories` entries along `n` are kept.
        """
        check_shape(memories, 'b n kv d', d = self.dim, kv = 2, b = len(self.scoped_indices))

        memories = memories.detach().cpu().numpy()
        memories = memories[:, -self.max_memories:]
        num_memories = memories.shape[1]

        knn_insert_ids = np.arange(num_memories)

        keys = np.ascontiguousarray(memories[..., 0, :])
        knns = [self.knns[i] for i in self.scoped_indices]
        db_offsets = [self.db_offsets[i] for i in self.scoped_indices]

        # use joblib to insert new key / value memories into faiss index

        @delayed
        def knn_add(knn, key, db_offset):
            knn.add(key, ids = knn_insert_ids + db_offset)
            return knn

        updated_knns = Parallel(n_jobs = self.n_jobs)(knn_add(*args) for args in zip(knns, keys, db_offsets))
        for knn_idx, scoped_idx in enumerate(self.scoped_indices):
            self.knns[scoped_idx] = updated_knns[knn_idx]

        # add the new memories to the memmap "database"

        scoped = list(self.scoped_indices)
        add_indices = (rearrange(np.arange(num_memories), 'j -> 1 j') + rearrange(self.db_offsets[scoped], 'i -> i 1')) % self.max_memories
        self.db[rearrange(np.array(scoped), 'i -> i 1'), add_indices] = memories
        self.db.flush()

        # only the rows that were actually written advance their offset
        self.db_offsets[scoped] += num_memories

    def search(
        self,
        queries,
        topk,
        nprobe = 8,
        increment_hits = True,
        increment_age = True
    ):
        """Retrieve the `topk` nearest memories for every query vector.

        Args:
            queries: tensor of shape `(b, ..., dim)` with `b == len(scoped_indices)`.
            topk: number of memories fetched per query.
            nprobe, increment_hits, increment_age: forwarded to `KNN.search`.

        Returns:
            `(key_values, mask)` on the device of `queries`: `key_values` has shape
            `(b, ..., topk, 2, dim)` and `mask` has shape `(b, ..., topk)`; `mask` is
            False where no memory was found, and those key / value slots are zeroed.
        """
        check_shape(queries, 'b ... d', d = self.dim, b = len(self.scoped_indices))
        queries, ps = pack([queries], 'b * d')

        device = queries.device
        queries = queries.detach().cpu().numpy()

        all_masks = []
        all_key_values = []

        knns = [self.knns[i] for i in self.scoped_indices]

        # parallelize faiss search

        @delayed
        def knn_search(knn, query):
            return knn.search(query, topk, nprobe, increment_hits = increment_hits, increment_age = increment_age)

        fetched_indices = Parallel(n_jobs = self.n_jobs)(knn_search(*args) for args in zip(knns, queries))

        # get all the memory key / values from memmap 'database'
        # todo - remove for loop below

        for batch_index, indices in zip(self.scoped_indices, fetched_indices):
            mask = indices !=  -1
            # -1 means "nothing found"; read slot 0 instead and blank it out below
            db_indices = np.where(mask, indices, 0)

            all_masks.append(torch.from_numpy(mask))

            key_values = self.db[batch_index, db_indices % self.max_memories]
            all_key_values.append(torch.from_numpy(key_values))

        all_masks = torch.stack(all_masks)
        all_key_values = torch.stack(all_key_values)
        all_key_values = all_key_values.masked_fill(~rearrange(all_masks, '... -> ... 1 1'), 0.)

        all_key_values, = unpack(all_key_values, ps, 'b * n kv d')
        all_masks, = unpack(all_masks, ps, 'b * n')

        return all_key_values.to(device), all_masks.to(device)

    def __del__(self):
        # guard both attributes: __init__ may have failed before creating them
        if hasattr(self, 'knns'):
            del self.knns
        if hasattr(self, 'db'):
            del self.db

# extends list with some extra methods for collections of KNN memories

class KNNMemoryList(list):
    """A list of `KNNMemory` objects with helpers that act on all of them at once."""

    def cleanup(self):
        """Drop every memory held by this list.

        Emptying the list releases the references it holds, so each
        `KNNMemory` can be finalised (its `__del__` lets go of the faiss
        indices and the memmap) once nothing else refers to it.
        """
        self.clear()

    @classmethod
    def create_memories(
        cls,
        *,
        batch_size,
        num_memory_layers,
        memories_directory = DEFAULT_KNN_MEMORY_MEMMAP_DIRECTORY
    ):
        """Return a factory that builds `num_memory_layers` memories.

        The directory is created immediately. The returned callable forwards
        its arguments to `KNNMemory`, adding `num_indices = batch_size` and a
        per-layer memmap file inside `memories_directory`.
        """
        memories_path = Path(memories_directory)
        memories_path.mkdir(exist_ok = True, parents = True)

        def inner(*args, **kwargs):
            return cls([KNNMemory(*args, num_indices = batch_size, memmap_filename = str(memories_path / f'knn.memory.layer.{ind + 1}.memmap'), **kwargs) for ind in range(num_memory_layers)])
        return inner

    @contextmanager
    def at_batch_indices(
        self,
        indices
    ):
        """Scope every memory in the list to the given batch `indices` for the duration of the block."""
        knn_batch_indices_contexts = [memory.at_batch_indices(indices) for memory in self]
        with multi_context(*knn_batch_indices_contexts):
            yield

    def clear_memory(
        self,
        batch_indices = None,
        memory_indices = None
    ):
        """Clear the given batch rows of the selected memories.

        Args:
            batch_indices: forwarded to `KNNMemory.clear`.
            memory_indices: positions in this list to clear; all of them when None.
        """
        memory_indices = default(memory_indices, tuple(range(len(self))))

        for memory_index in memory_indices:
            memory = self[memory_index]
            memory.clear(batch_indices)

# Memorizing transformers

In [None]:
def attention(query, key, value, sqrt_q, device):
    """Causal scaled dot-product attention.

    Args:
        query, key, value: tensors whose last two axes are (sequence, head_dim).
        sqrt_q: scaling divisor applied to the raw scores.
        device: device on which the causal mask is created.

    Returns:
        The attention-weighted sum of `value`, same leading shape as `query`.
    """
    scores = torch.matmul(query, key.transpose(-2, -1)) / sqrt_q
    i, j = scores.shape[-2:]
    # True strictly above the diagonal (offset so it also works when j > i): future positions
    future = torch.ones((i, j), dtype = torch.bool, device = device).triu(j - i + 1)
    # the fill must be the most negative representable score so softmax gives those
    # positions zero weight; a value near zero (e.g. -1e-9) would not mask anything
    scores = scores.masked_fill(future, -torch.finfo(scores.dtype).max)
    return torch.matmul(F.softmax(scores, dim = -1), value)

def KNNattention(query, key, value, sqrt_q, mask):
    """Attention of each query over its own set of retrieved memories.

    Args:
        query: tensor of shape (b, h, i, d).
        key, value: tensors of shape (b, h, i, j, d), j memories per query.
        sqrt_q: scaling divisor applied to the raw scores.
        mask: boolean tensor of shape (b, h, i, j); True marks slots to ignore.

    Returns:
        Tensor of shape (b, h, i, d). If every slot of a query is masked the
        softmax is uniform, so the result is the plain mean of that query's values.
    """
    scores = torch.einsum('b h i d, b h i j d -> b h i j', query, key) / sqrt_q
    # most negative representable score => zero softmax weight for ignored slots
    # (a fill value near zero such as -1e-9 would leave them fully attended)
    scores = scores.masked_fill(mask, -torch.finfo(scores.dtype).max)
    weights = F.softmax(scores, dim = -1)
    return torch.einsum('b h i j, b h i j d -> b h i d', weights, value)

In [None]:
class MultiHeadAttention(nn.Module):
  """Causal multi-head self-attention that also returns its (key, value) pairs.

  Args:
    n: accepted for signature compatibility; not used by this module.
    d: model width; must be divisible by `h`.
    h: number of heads.
    batch_size: stored on the module; the forward pass reads the batch
      dimension from its input instead.
  """
  def __init__(self, n, d, h, batch_size):
    super(MultiHeadAttention, self).__init__()
    assert d % h == 0
    #assume q = v
    self.q = d // h
    self.sqrt_q = sqrt(self.q)
    self.h = h
    self.batch_size = batch_size
    self.W_q = nn.Linear(d, d, bias = False) #stack of h matrices of dimension (d, q), one for each head
    self.W_k = nn.Linear(d, d, bias = False)
    self.W_v = nn.Linear(d, d, bias = False)
    self.W_o = nn.Linear(d, d, bias = False)

  def forward(self, x, device):
    """Return `(output, new_memories)`.

    `output` has the shape of `x`; `new_memories` is the detached stack of
    keys and values with shape (batch, h, seq, 2, q).
    """
    # take the batch dimension from the input: viewing with a fixed batch size
    # would silently mix rows (or fail) whenever the actual batch differs
    batch = x.size(0)
    query = self.W_q(x).view(batch, -1, self.h, self.q).transpose(1, 2)
    key = self.W_k(x).view(batch, -1, self.h, self.q).transpose(1, 2)
    value = self.W_v(x).view(batch, -1, self.h, self.q).transpose(1, 2)
    new_memories = torch.stack((key, value), dim = -2).detach()
    attention_value = attention(query, key, value, self.sqrt_q, device)
    # merge the heads back into the model width before the output projection
    merged = attention_value.transpose(1, 2).contiguous().view(batch, -1, self.h*self.q)
    return self.W_o(merged), new_memories

In [None]:
class KNNAttention(nn.Module):
   """Multi-head attention that mixes local causal attention with attention over a KNN memory.

   Args:
      n: accepted for signature compatibility; not used by this module.
      d: model width; must be divisible by `h`.
      h: number of heads.
      num_retrieved_memories: how many memories are fetched per query.
      batch_size: stored on the module; the forward pass reads the batch
         dimension from its input instead.
   """
   def __init__(self, n, d, h, num_retrieved_memories, batch_size):
      super(KNNAttention, self).__init__()
      assert d % h == 0
      #assume q = v
      self.q = d // h
      self.sqrt_q = sqrt(self.q)
      self.h = h
      self.W_q = nn.Linear(d, d, bias = False)
      self.W_k = nn.Linear(d, d, bias = False)
      self.W_v = nn.Linear(d, d, bias = False)
      self.W_o = nn.Linear(d, d, bias = False)
      self.b_g = nn.Parameter(torch.randn((h,))) #one for each head
      self.num_retrieved_memories = num_retrieved_memories
      self.batch_size = batch_size

   def forward(self, x, knn_memory, device):
      """Return `(output, new_kv_memories)`.

      Only `knn_memory[0]` is used. The memory is searched BEFORE the current
      keys / values are added to it, so a segment never retrieves itself.
      `new_kv_memories` has shape (batch, h * seq, 2, q) and is detached.
      """
      # take the batch dimension from the input: viewing with a fixed batch size
      # would silently mix rows (or fail) whenever the actual batch differs
      batch = x.size(0)

      # calculate local attention
      query = self.W_q(x).view(batch, -1, self.h, self.q).transpose(1, 2)
      key = self.W_k(x).view(batch, -1, self.h, self.q).transpose(1, 2)
      value = self.W_v(x).view(batch, -1, self.h, self.q).transpose(1, 2)
      local_attention = attention(query, key, value, self.sqrt_q, device)

      # calculate knn attention over memory
      mem_kv, mem_mask = knn_memory[0].search(query, self.num_retrieved_memories)
      mem_key, mem_value = mem_kv.unbind(dim = -2)
      # mem_mask is True where a memory was found; KNNattention expects True = ignore
      knn_attention = KNNattention(query, mem_key, mem_value, self.sqrt_q, ~mem_mask)

      # memory to be stored
      new_kv_memories = torch.stack((key, value), dim = -2).view(batch, -1, 2, self.q).detach()

      # add to knn memory
      if new_kv_memories.numel() > 0:
        knn_memory[0].add(new_kv_memories)

      # combining local and memory: per-head gate g weights the memory branch, 1 - g the local one
      g = torch.sigmoid(self.b_g)
      final_attention = torch.einsum('b h n d, h -> b h n d', knn_attention, g) + \
                        torch.einsum('b h n d, h -> b h n d', local_attention, (1 - g))

      merged = final_attention.transpose(1, 2).contiguous().view(batch, -1, self.h*self.q)
      return self.W_o(merged), new_kv_memories

In [None]:
class SubLayer(nn.Module):
  """Feed-forward sub-layer: LayerNorm, then a two-layer MLP, added back onto the input."""
  def __init__(self, d, dropout, hidden_size):
    super().__init__()
    self.norm = nn.LayerNorm(d)
    # widen to `hidden_size`, apply ReLU + dropout, then project back to `d`
    expand = nn.Linear(d, hidden_size, bias = True)
    project = nn.Linear(hidden_size, d, bias = True)
    self.mlp = nn.Sequential(expand, nn.ReLU(), nn.Dropout(dropout), project)

  def forward(self, x):
    normed = self.norm(x)
    update = self.mlp(normed)
    # residual connection around the normalised MLP branch
    return x + update

In [None]:
class PositionalEncoding(nn.Module):
  """Adds fixed sinusoidal positional encodings to a (batch, seq, d_model) input.

  Args:
    d_model: width of the encoding; both even and odd values are supported.
    max_len: number of positions precomputed; inputs may not be longer than this.
  """
  def __init__(self, d_model, max_len=5000):
    super(PositionalEncoding, self).__init__()

    # Compute the positional encodings once in log space.
    pe = torch.zeros(max_len, d_model)
    position = torch.arange(0, max_len).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, d_model, 2) *
                          -(math.log(10000.0) / d_model))
    angles = position * div_term
    pe[:, 0::2] = torch.sin(angles)
    # for odd d_model there is one fewer odd column than even columns
    pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
    pe = pe.unsqueeze(0)
    # a buffer follows the module across devices and is never trained
    self.register_buffer('pe', pe)

  def forward(self, x):
    """Return `x` plus the encodings of its first `x.size(1)` positions."""
    return x + self.pe[:, :x.size(1)]

In [None]:
class MemorizingTransformer(nn.Module):
    """Decoder-style transformer in which one layer also attends over a KNN memory.

    Args:
        num_tokens: vocabulary size (embedding rows and output logits).
        d: model width; must be divisible by `heads`.
        heads: number of attention heads.
        depth: number of (attention, feed-forward) layers.
        knn_attn_idx: index of the layer that uses `KNNAttention`; must be < depth.
        attn_dropout: stored on the module; not used by the layers built here.
        hidden_size: hidden width of each feed-forward sub-layer.
        dropout: dropout rate inside each feed-forward sub-layer.
        max_knn_memories: capacity of each KNN memory.
        num_retrieved_memories: memories fetched per query in the KNN layer.
        knn_memories_directory: directory for the memmap files and the lock file.
        knn_memory_multiprocessing: forwarded to `KNNMemory(multiprocessing=...)`.
        batch_size: forwarded to the attention modules.
    """

    def __init__(
        self,
        num_tokens,
        d,
        heads = 8,
        depth = 4,
        knn_attn_idx = 2,
        attn_dropout = 0.,
        hidden_size = 1000,
        dropout = 0.3,
        max_knn_memories = 1000,
        num_retrieved_memories = 8,
        knn_memories_directory = DEFAULT_KNN_MEMORY_MEMMAP_DIRECTORY,
        knn_memory_multiprocessing = False,
        batch_size = 16
    ):
        # asserts
        assert d % heads == 0
        assert knn_attn_idx < depth

        super(MemorizingTransformer, self).__init__()
        self.token_emb = nn.Embedding(num_tokens, d)
        self.positional_emb = PositionalEncoding(d, max_len = 5000)
        self.dim_head = d // heads
        self.d = d
        self.heads = heads
        self.knn_attn_idx = knn_attn_idx
        self.depth = depth
        self.attn_dropout = attn_dropout
        self.hidden_size = hidden_size
        self.dropout = dropout
        self.max_knn_memories = max_knn_memories
        self.num_retrieved_memories = num_retrieved_memories
        self.knn_memories_directory = knn_memories_directory
        self.knn_memory_multiprocessing = knn_memory_multiprocessing
        self.batch_size = batch_size

        self.layers = nn.ModuleList([])
        for idx in range(depth):
            attn = KNNAttention(num_tokens, d, heads, num_retrieved_memories, self.batch_size) \
                if idx == knn_attn_idx else MultiHeadAttention(num_tokens, d, heads, self.batch_size)

            self.layers.append(nn.ModuleList([
                attn,
                SubLayer(d, dropout, hidden_size)
            ]))

        self.to_out = nn.Sequential(
            nn.LayerNorm(d),
            nn.Linear(d, num_tokens)
        )

        # knn memories init

        self.knn_mem_kwargs = dict(
            dim = self.dim_head,
            max_memories = self.max_knn_memories,
            multiprocessing = knn_memory_multiprocessing
        )

    def forward(
        self,
        x,
        knn_memory
    ):
        """Return logits of shape (batch, num_tokens, seq) for token ids `x` of shape (batch, seq).

        `knn_memory` is handed to the KNN attention layer, which reads from and
        writes to it. The class axis is moved to position 1, the layout
        `nn.CrossEntropyLoss` expects for sequence targets.
        """
        device = x.device
        x = self.token_emb(x)
        x = self.positional_emb(x)

        for idx, (attn, sub_l) in enumerate(self.layers):

            # attention
            # NOTE(review): the attention output replaces x with no residual
            # connection or normalisation around it - confirm this is intended.

            x, _ = attn(x, knn_memory, device) if self.knn_attn_idx == idx else attn(x, device)

            # normalization + feedforward + residual connection

            x = sub_l(x)

        return self.to_out(x).transpose(1, 2)

    def create_knn_memories(
        self,
        *,
        batch_size
    ):
        """Build a `KNNMemoryList` holding one memory with `batch_size` indices."""
        return KNNMemoryList.create_memories(
            batch_size = batch_size,
            num_memory_layers = 1,
            memories_directory = self.knn_memories_directory
        )(**self.knn_mem_kwargs)

    @contextmanager
    def knn_memories_context(
        self,
        **kwargs
    ):
        """Yield freshly created KNN memories while holding a file lock on the memory directory.

        Keyword arguments are forwarded to `create_knn_memories`. The memories
        are cleaned up on exit, including when the body raises.
        """
        knn_dir = Path(self.knn_memories_directory)
        knn_dir.mkdir(exist_ok = True, parents = True)
        lock = FileLock(str(knn_dir / 'mutex'))

        with lock:
            knn_memories = self.create_knn_memories(**kwargs)
            try:
                yield knn_memories
            finally:
                knn_memories.cleanup()

    def clear_memory(self, x, token_id, knn_memories = None):
        """Clear the KNN memories of every batch row of `x` that contains `token_id`.

        Meant for auto-clearing memories at the start / end of strings.

        Args:
            x: tensor of token ids whose first axis is the batch.
            token_id: id whose presence in a row triggers clearing that row.
            knn_memories: the `KNNMemoryList` to clear; must be supplied, since
                the model does not keep a reference to its memories.

        Raises:
            ValueError: if rows need clearing but `knn_memories` was not given.
        """
        # one boolean per batch row, whatever the number of trailing axes
        rows_to_clear = (x == token_id).flatten(1).any(dim = -1)
        batch_indices_to_clear = rows_to_clear.nonzero(as_tuple = True)[0].tolist()

        if len(batch_indices_to_clear) == 0:
            return

        if knn_memories is None:
            raise ValueError('clear_memory needs the knn_memories to clear; pass them via the `knn_memories` argument')

        knn_memories.clear_memory(batch_indices_to_clear)

# Training

In [None]:
# constants

# data / model sizes
NUM_BATCHES = 100_000
BATCH_SIZE = 16
SEQ_LEN = 512
SEGMENTS = 5
HEADS = 8
DIM_HEAD = SEQ_LEN // HEADS

# optimisation
LEARNING_RATE = 0.0002
MAX_GRAD_CLIP_NORM = 0.5

# cadence, in training iterations
EVAL_EVERY = 20
GENERATE_EVERY = 500
GENERATE_LENGTH = 512
CHECKPOINT = 10

In [None]:
# byte-level model: 256 possible tokens; the model width is set to SEQ_LEN here
model = MemorizingTransformer(
    num_tokens = 256,
    d = SEQ_LEN,
    heads = HEADS,
    batch_size = BATCH_SIZE
).cuda()

# prepare enwik8 data

# Location of the gzipped enwik8 dump on the mounted Drive - adjust to your own copy.
# (Luigi's path is active; Lorenzo's is
#  '/content/drive/MyDrive/Secondo Anno/Neural Networks/project/enwik8.gz')
ENWIK8_PATH = '/content/drive/MyDrive/Colab Notebooks/enwik8.gz'

with gzip.open(ENWIK8_PATH) as file:
    # np.frombuffer replaces the deprecated binary mode of np.fromstring; it returns a
    # read-only view of the bytes, so copy to get a writable array for torch.from_numpy
    X = np.frombuffer(file.read(int(95e6)), dtype=np.uint8).copy()
    # first 90e6 bytes for training, the remainder for validation
    trX, vaX = np.split(X, [int(90e6)])
    data_train, data_val = torch.from_numpy(trX), torch.from_numpy(vaX)

class TextSamplerDataset(Dataset):
    """Dataset that serves random windows of `seq_len + 1` consecutive tokens from a 1-D tensor."""

    def __init__(self, data, seq_len):
        super().__init__()
        self.seq_len = seq_len
        self.data = data

    def __len__(self):
        # number of whole `seq_len` windows that fit in the data
        total = self.data.size(0)
        return total // self.seq_len

    def __getitem__(self, index):
        # `index` is ignored: every item is a freshly sampled random window
        start = torch.randint(0, self.data.size(0) - self.seq_len, (1,))
        stop = start + self.seq_len + 1
        window = self.data[start: stop]
        return window.long().cuda()

# datasets and their batched loaders (training split first, then validation split)

train_dataset, test_dataset = (
    TextSamplerDataset(split, SEQ_LEN) for split in (data_train, data_val)
)
train_loader, test_loader = (
    DataLoader(dataset, batch_size = BATCH_SIZE, drop_last = True)
    for dataset in (train_dataset, test_dataset)
)

  X = np.fromstring(file.read(int(95e6)), dtype=np.uint8)


In [None]:
def print_string(a):
  """Decode an iterable of code-point sequences into text, appending one space after each sequence."""
  pieces = []
  for word in a:
    pieces.append("".join(chr(letter) for letter in word))
    pieces.append(" ")
  return "".join(pieces)

In [None]:
# optimizer

optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)
loss = nn.CrossEntropyLoss()

# training

for i, data in enumerate(tqdm.tqdm(train_loader, desc = 'training')):
    model.train()

    # NOTE(review): a fresh, empty KNN memory is created for every batch and the
    # KNN layer searches before it adds, so the lookup never finds anything here.
    # SEGMENTS is defined but unused - the memory presumably should persist
    # across several consecutive segments of one document; confirm.
    with model.knn_memories_context(batch_size = BATCH_SIZE) as knn_memories:

        seq, labels = data[:, :-1], data[:, 1:] #the labels are the same sequences shifted by one

        out = model(
              seq,
              knn_memory = knn_memories
        )
        loss_item = loss(out, labels)
        loss_item.backward()

    # keep a plain float so the autograd graph of the batch is not retained
    train_loss = loss_item.item()

    print(f'training loss: {train_loss}', flush = True)
    torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_CLIP_NORM)
    optimizer.step()
    optimizer.zero_grad()

    if i % EVAL_EVERY == 0:
        model.eval()

        # one validation batch per evaluation
        test_data = next(iter(test_loader))

        with torch.no_grad(), model.knn_memories_context(batch_size = BATCH_SIZE) as knn_memories:
            # evaluate on the validation batch, not on the training batch `data`
            seq, labels = test_data[:, :-1], test_data[:, 1:]

            out = model(
              seq,
              knn_memory = knn_memories
            )

            test_loss = loss(out, labels).item()

        print(f'valid loss: {test_loss}', flush = True)
        print(f'perplexity: {math.exp(test_loss)}', flush = True)

    if i % CHECKPOINT == 0:
      torch.save({
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
      }, 'model_optimizer2.pt')

training:   0%|          | 0/10986 [00:00<?, ?it/s]

training loss: 5.681037902832031
valid loss: 4.7362446784973145
perplexity: 114.0052719116211


training:   0%|          | 1/10986 [00:04<14:06:38,  4.62s/it]

training loss: 4.814972877502441


training:   0%|          | 2/10986 [00:07<10:24:58,  3.41s/it]

training loss: 4.3826422691345215


training:   0%|          | 3/10986 [00:10<9:40:01,  3.17s/it] 

training loss: 4.210513591766357


training:   0%|          | 4/10986 [00:12<8:19:40,  2.73s/it]

training loss: 3.988753318786621


training:   0%|          | 5/10986 [00:13<7:12:44,  2.36s/it]

training loss: 3.946422815322876


training:   0%|          | 6/10986 [00:15<6:04:12,  1.99s/it]

training loss: 3.8310883045196533


training:   0%|          | 7/10986 [00:16<5:20:29,  1.75s/it]

training loss: 3.8024702072143555


training:   0%|          | 8/10986 [00:17<4:53:02,  1.60s/it]

training loss: 3.759239912033081


training:   0%|          | 9/10986 [00:18<4:35:08,  1.50s/it]

training loss: 3.748157024383545


training:   0%|          | 10/10986 [00:20<4:21:11,  1.43s/it]

training loss: 3.7810616493225098


training:   0%|          | 11/10986 [00:21<4:25:47,  1.45s/it]

training loss: 3.649822473526001


training:   0%|          | 12/10986 [00:23<4:20:33,  1.42s/it]

training loss: 3.6404829025268555


training:   0%|          | 13/10986 [00:24<4:12:01,  1.38s/it]

training loss: 3.6724584102630615


training:   0%|          | 14/10986 [00:25<4:05:44,  1.34s/it]

training loss: 3.7242419719696045


training:   0%|          | 15/10986 [00:26<4:01:25,  1.32s/it]

training loss: 3.7174465656280518


training:   0%|          | 16/10986 [00:28<3:58:48,  1.31s/it]

training loss: 3.6523590087890625


training:   0%|          | 17/10986 [00:29<3:56:54,  1.30s/it]

training loss: 3.5001118183135986


training:   0%|          | 18/10986 [00:30<3:55:42,  1.29s/it]

training loss: 3.52960205078125


training:   0%|          | 19/10986 [00:31<3:55:16,  1.29s/it]

training loss: 3.6247010231018066


training:   0%|          | 20/10986 [00:33<3:56:07,  1.29s/it]

training loss: 3.513913631439209
valid loss: 3.4985275268554688
perplexity: 33.06672286987305


training:   0%|          | 21/10986 [00:35<5:11:47,  1.71s/it]

training loss: 3.5048933029174805


training:   0%|          | 22/10986 [00:37<4:52:36,  1.60s/it]

training loss: 3.5997586250305176


training:   0%|          | 23/10986 [00:38<4:34:50,  1.50s/it]

training loss: 3.5698401927948


training:   0%|          | 24/10986 [00:39<4:22:07,  1.43s/it]

training loss: 3.4274373054504395


training:   0%|          | 25/10986 [00:41<4:14:27,  1.39s/it]

training loss: 3.415440797805786


training:   0%|          | 26/10986 [00:42<4:08:35,  1.36s/it]

training loss: 3.5088753700256348


training:   0%|          | 27/10986 [00:43<4:04:21,  1.34s/it]

training loss: 3.5230541229248047


training:   0%|          | 28/10986 [00:44<4:01:08,  1.32s/it]

training loss: 3.573179244995117


training:   0%|          | 29/10986 [00:46<3:59:17,  1.31s/it]

training loss: 3.582728385925293


training:   0%|          | 30/10986 [00:47<3:57:16,  1.30s/it]

training loss: 3.668659210205078


training:   0%|          | 31/10986 [00:49<4:07:44,  1.36s/it]

training loss: 3.5954484939575195


training:   0%|          | 32/10986 [00:50<4:05:04,  1.34s/it]

training loss: 3.4493870735168457


training:   0%|          | 33/10986 [00:51<4:01:16,  1.32s/it]

training loss: 3.580963373184204


training:   0%|          | 34/10986 [00:52<4:00:11,  1.32s/it]

training loss: 3.5612220764160156


training:   0%|          | 35/10986 [00:54<3:59:05,  1.31s/it]

training loss: 3.475693702697754


training:   0%|          | 36/10986 [00:55<3:57:37,  1.30s/it]

training loss: 3.6187849044799805


training:   0%|          | 37/10986 [00:56<3:56:24,  1.30s/it]

training loss: 3.6008498668670654


training:   0%|          | 38/10986 [00:58<3:54:41,  1.29s/it]

training loss: 3.6643519401550293


training:   0%|          | 39/10986 [00:59<3:54:15,  1.28s/it]

training loss: 3.4262547492980957


training:   0%|          | 40/10986 [01:00<3:54:45,  1.29s/it]

training loss: 3.545903444290161
valid loss: 3.5374763011932373
perplexity: 34.380043029785156


training:   0%|          | 41/10986 [01:03<5:34:03,  1.83s/it]

training loss: 3.4525632858276367


training:   0%|          | 42/10986 [01:05<5:09:49,  1.70s/it]

training loss: 3.513957977294922


training:   0%|          | 43/10986 [01:06<4:46:23,  1.57s/it]

training loss: 3.373309850692749


training:   0%|          | 44/10986 [01:07<4:29:26,  1.48s/it]

training loss: 3.467700481414795


training:   0%|          | 45/10986 [01:08<4:18:06,  1.42s/it]

training loss: 3.701101541519165


training:   0%|          | 46/10986 [01:10<4:10:28,  1.37s/it]

training loss: 3.524662494659424


training:   0%|          | 47/10986 [01:11<4:05:30,  1.35s/it]

training loss: 3.503230333328247


training:   0%|          | 48/10986 [01:12<4:02:31,  1.33s/it]

training loss: 3.4924161434173584


training:   0%|          | 49/10986 [01:14<4:00:00,  1.32s/it]

training loss: 3.540271282196045


training:   0%|          | 50/10986 [01:15<3:57:41,  1.30s/it]

training loss: 3.6498491764068604


training:   0%|          | 51/10986 [01:16<4:09:06,  1.37s/it]

training loss: 3.503150463104248


training:   0%|          | 52/10986 [01:18<4:04:46,  1.34s/it]

training loss: 3.4544386863708496


training:   0%|          | 53/10986 [01:19<4:01:50,  1.33s/it]

training loss: 3.481593608856201


training:   0%|          | 54/10986 [01:20<3:58:44,  1.31s/it]

training loss: 3.5097358226776123


training:   1%|          | 55/10986 [01:21<3:58:32,  1.31s/it]

training loss: 3.6247291564941406


training:   1%|          | 56/10986 [01:23<3:56:43,  1.30s/it]

training loss: 3.611325740814209


training:   1%|          | 57/10986 [01:24<3:55:56,  1.30s/it]

training loss: 3.5820393562316895


training:   1%|          | 58/10986 [01:25<3:54:47,  1.29s/it]

training loss: 3.7263755798339844


training:   1%|          | 59/10986 [01:27<3:53:21,  1.28s/it]

training loss: 3.400751829147339


training:   1%|          | 60/10986 [01:28<3:54:36,  1.29s/it]

training loss: 3.554039716720581
valid loss: 3.553358316421509
perplexity: 34.9304313659668


training:   1%|          | 61/10986 [01:31<5:32:09,  1.82s/it]

training loss: 3.4114625453948975


training:   1%|          | 62/10986 [01:33<5:24:30,  1.78s/it]

training loss: 3.479671001434326


training:   1%|          | 63/10986 [01:34<4:56:28,  1.63s/it]

training loss: 3.5564181804656982


training:   1%|          | 64/10986 [01:35<4:36:14,  1.52s/it]

training loss: 3.510979175567627


training:   1%|          | 65/10986 [01:36<4:23:17,  1.45s/it]

training loss: 3.7624940872192383


training:   1%|          | 66/10986 [01:38<4:13:53,  1.39s/it]

training loss: 3.6217126846313477


training:   1%|          | 67/10986 [01:39<4:07:18,  1.36s/it]

training loss: 3.4363906383514404


training:   1%|          | 68/10986 [01:40<4:01:54,  1.33s/it]

training loss: 3.336665630340576


training:   1%|          | 69/10986 [01:42<3:59:07,  1.31s/it]

training loss: 3.3781638145446777


training:   1%|          | 70/10986 [01:43<3:56:33,  1.30s/it]

training loss: 3.4395391941070557


training:   1%|          | 71/10986 [01:44<4:06:05,  1.35s/it]

training loss: 3.53521728515625


training:   1%|          | 72/10986 [01:46<4:07:29,  1.36s/it]

training loss: 3.4302778244018555


training:   1%|          | 73/10986 [01:47<4:01:35,  1.33s/it]

training loss: 3.468618154525757


training:   1%|          | 74/10986 [01:48<3:57:19,  1.30s/it]

training loss: 3.484208106994629


training:   1%|          | 75/10986 [01:49<3:54:42,  1.29s/it]

training loss: 3.5284249782562256


training:   1%|          | 76/10986 [01:51<3:53:18,  1.28s/it]

training loss: 3.533720016479492


training:   1%|          | 77/10986 [01:52<3:52:04,  1.28s/it]

training loss: 3.389390707015991


training:   1%|          | 78/10986 [01:53<3:51:22,  1.27s/it]

training loss: 3.5749239921569824


training:   1%|          | 79/10986 [01:55<3:51:09,  1.27s/it]

training loss: 3.612715005874634


training:   1%|          | 80/10986 [01:56<3:50:51,  1.27s/it]

training loss: 3.569838762283325
valid loss: 3.5615220069885254
perplexity: 35.216758728027344


training:   1%|          | 81/10986 [01:58<5:09:27,  1.70s/it]

training loss: 3.419520854949951


training:   1%|          | 82/10986 [02:00<4:50:24,  1.60s/it]

training loss: 3.5407795906066895


training:   1%|          | 83/10986 [02:01<4:32:19,  1.50s/it]

training loss: 3.429990291595459


training:   1%|          | 84/10986 [02:02<4:20:18,  1.43s/it]

training loss: 3.4804420471191406


training:   1%|          | 85/10986 [02:04<4:14:53,  1.40s/it]

training loss: 3.4311907291412354


training:   1%|          | 86/10986 [02:05<4:07:57,  1.36s/it]

training loss: 3.4461441040039062


training:   1%|          | 87/10986 [02:06<4:03:19,  1.34s/it]

training loss: 3.5875391960144043


training:   1%|          | 88/10986 [02:08<3:59:16,  1.32s/it]

training loss: 3.637474536895752


training:   1%|          | 89/10986 [02:09<3:56:19,  1.30s/it]

training loss: 3.612819194793701


training:   1%|          | 90/10986 [02:10<3:57:13,  1.31s/it]

training loss: 3.561148166656494


training:   1%|          | 91/10986 [02:12<4:18:34,  1.42s/it]

training loss: 3.383760690689087


training:   1%|          | 92/10986 [02:13<4:17:01,  1.42s/it]

training loss: 3.5112290382385254


training:   1%|          | 93/10986 [02:14<4:09:16,  1.37s/it]

training loss: 3.3633792400360107


training:   1%|          | 94/10986 [02:16<4:03:38,  1.34s/it]

training loss: 3.552936553955078


training:   1%|          | 95/10986 [02:17<3:58:55,  1.32s/it]

training loss: 3.5769567489624023


training:   1%|          | 96/10986 [02:18<3:57:01,  1.31s/it]

training loss: 3.443922996520996


training:   1%|          | 97/10986 [02:20<3:54:44,  1.29s/it]

training loss: 3.491767168045044


training:   1%|          | 98/10986 [02:21<3:52:58,  1.28s/it]

training loss: 3.5254478454589844


training:   1%|          | 99/10986 [02:22<3:51:43,  1.28s/it]

training loss: 3.500261068344116


training:   1%|          | 100/10986 [02:23<3:52:31,  1.28s/it]

training loss: 3.518612861633301
valid loss: 3.5110111236572266
perplexity: 33.48210525512695


training:   1%|          | 101/10986 [02:26<5:07:26,  1.69s/it]

training loss: 3.452504873275757


training:   1%|          | 102/10986 [02:27<4:49:06,  1.59s/it]

training loss: 3.514434814453125


training:   1%|          | 103/10986 [02:29<4:31:08,  1.49s/it]

training loss: 3.3814053535461426


training:   1%|          | 104/10986 [02:30<4:18:31,  1.43s/it]

training loss: 3.5290307998657227


training:   1%|          | 105/10986 [02:31<4:09:15,  1.37s/it]

training loss: 3.5639395713806152


training:   1%|          | 106/10986 [02:32<4:03:38,  1.34s/it]

training loss: 3.5220589637756348


training:   1%|          | 107/10986 [02:34<4:01:16,  1.33s/it]

training loss: 3.6238512992858887


training:   1%|          | 108/10986 [02:35<3:59:25,  1.32s/it]

training loss: 3.464083194732666


training:   1%|          | 109/10986 [02:36<3:57:20,  1.31s/it]

training loss: 3.6190133094787598


training:   1%|          | 110/10986 [02:38<3:54:08,  1.29s/it]

training loss: 3.427234411239624


training:   1%|          | 111/10986 [02:39<4:06:01,  1.36s/it]

training loss: 3.4031894207000732


training:   1%|          | 112/10986 [02:40<4:03:26,  1.34s/it]

training loss: 3.5764975547790527


training:   1%|          | 113/10986 [02:42<3:59:36,  1.32s/it]

training loss: 3.5984528064727783


training:   1%|          | 114/10986 [02:43<3:56:45,  1.31s/it]

training loss: 3.5733954906463623


training:   1%|          | 115/10986 [02:44<3:56:24,  1.30s/it]

training loss: 3.310150623321533


training:   1%|          | 116/10986 [02:46<3:55:26,  1.30s/it]

training loss: 3.545153856277466


training:   1%|          | 117/10986 [02:47<3:54:33,  1.29s/it]

training loss: 3.5517284870147705


training:   1%|          | 118/10986 [02:48<3:54:03,  1.29s/it]

training loss: 3.483367443084717


training:   1%|          | 119/10986 [02:49<3:53:49,  1.29s/it]

training loss: 3.5831356048583984


training:   1%|          | 120/10986 [02:51<3:53:24,  1.29s/it]

training loss: 3.463876724243164
valid loss: 3.457840919494629
perplexity: 31.748353958129883


training:   1%|          | 121/10986 [02:53<5:10:33,  1.72s/it]

training loss: 3.6196885108947754


training:   1%|          | 122/10986 [02:55<4:55:15,  1.63s/it]

training loss: 3.2451775074005127


training:   1%|          | 123/10986 [02:56<4:36:22,  1.53s/it]

training loss: 3.424381732940674


training:   1%|          | 124/10986 [02:57<4:22:38,  1.45s/it]

training loss: 3.532890558242798


training:   1%|          | 125/10986 [03:00<5:02:33,  1.67s/it]

training loss: 3.553417682647705


training:   1%|          | 126/10986 [03:01<5:03:05,  1.67s/it]

training loss: 3.482050657272339


training:   1%|          | 127/10986 [03:03<4:41:18,  1.55s/it]

training loss: 3.5674149990081787


training:   1%|          | 128/10986 [03:04<4:28:42,  1.48s/it]

training loss: 3.4447453022003174


training:   1%|          | 129/10986 [03:05<4:19:32,  1.43s/it]

training loss: 3.604048252105713


training:   1%|          | 130/10986 [03:06<4:11:10,  1.39s/it]

training loss: 3.4131057262420654


training:   1%|          | 131/10986 [03:08<4:18:43,  1.43s/it]

training loss: 3.4370648860931396


training:   1%|          | 132/10986 [03:09<4:13:13,  1.40s/it]

training loss: 3.4639334678649902


training:   1%|          | 133/10986 [03:11<4:07:04,  1.37s/it]

training loss: 3.4760990142822266


training:   1%|          | 134/10986 [03:12<4:03:17,  1.35s/it]

training loss: 3.6757657527923584


training:   1%|          | 135/10986 [03:13<3:59:30,  1.32s/it]

training loss: 3.5923409461975098


training:   1%|          | 136/10986 [03:14<3:57:50,  1.32s/it]

training loss: 3.501678466796875


training:   1%|          | 137/10986 [03:16<3:55:19,  1.30s/it]

training loss: 3.5156993865966797


training:   1%|▏         | 138/10986 [03:17<3:54:20,  1.30s/it]

training loss: 3.498730421066284


training:   1%|▏         | 139/10986 [03:18<3:54:55,  1.30s/it]

training loss: 3.467867374420166


training:   1%|▏         | 140/10986 [03:20<3:54:19,  1.30s/it]

training loss: 3.3322088718414307
valid loss: 3.3264575004577637
perplexity: 27.83954429626465


training:   1%|▏         | 141/10986 [03:22<5:11:47,  1.72s/it]

training loss: 3.536808490753174


training:   1%|▏         | 142/10986 [03:24<4:53:32,  1.62s/it]

training loss: 3.4889156818389893


training:   1%|▏         | 143/10986 [03:25<4:38:05,  1.54s/it]

training loss: 3.500053882598877


training:   1%|▏         | 144/10986 [03:26<4:25:08,  1.47s/it]

training loss: 3.3966217041015625


training:   1%|▏         | 145/10986 [03:28<4:14:34,  1.41s/it]

training loss: 3.3855905532836914


training:   1%|▏         | 146/10986 [03:29<4:08:28,  1.38s/it]

training loss: 3.5296437740325928


training:   1%|▏         | 147/10986 [03:30<4:04:07,  1.35s/it]

training loss: 3.4992594718933105


training:   1%|▏         | 148/10986 [03:32<4:00:40,  1.33s/it]

training loss: 3.440089702606201


training:   1%|▏         | 149/10986 [03:33<4:09:16,  1.38s/it]

training loss: 3.437566041946411


training:   1%|▏         | 150/10986 [03:34<4:06:01,  1.36s/it]

training loss: 3.474841594696045


training:   1%|▏         | 151/10986 [03:36<4:17:47,  1.43s/it]

training loss: 3.5818934440612793


training:   1%|▏         | 152/10986 [03:37<4:11:08,  1.39s/it]

training loss: 3.4361324310302734


training:   1%|▏         | 153/10986 [03:39<4:07:06,  1.37s/it]

training loss: 3.5136454105377197


training:   1%|▏         | 154/10986 [03:40<4:02:32,  1.34s/it]

training loss: 3.367056131362915


training:   1%|▏         | 155/10986 [03:41<4:02:28,  1.34s/it]

training loss: 3.399841547012329


training:   1%|▏         | 156/10986 [03:42<4:00:37,  1.33s/it]

training loss: 3.4228873252868652


training:   1%|▏         | 157/10986 [03:44<3:59:00,  1.32s/it]

training loss: 3.527817726135254


training:   1%|▏         | 158/10986 [03:45<3:57:43,  1.32s/it]

training loss: 3.4251930713653564


training:   1%|▏         | 159/10986 [03:46<3:57:02,  1.31s/it]

training loss: 3.4082839488983154


training:   1%|▏         | 160/10986 [03:48<3:55:47,  1.31s/it]

training loss: 3.3097307682037354
valid loss: 3.3052520751953125
perplexity: 27.25541114807129


training:   1%|▏         | 161/10986 [03:50<5:14:55,  1.75s/it]

training loss: 3.3509435653686523


training:   1%|▏         | 162/10986 [03:52<4:56:35,  1.64s/it]

training loss: 3.416659355163574


training:   1%|▏         | 163/10986 [03:53<4:46:51,  1.59s/it]

training loss: 3.3288400173187256


training:   1%|▏         | 164/10986 [03:55<4:58:38,  1.66s/it]

training loss: 3.533806324005127


training:   2%|▏         | 165/10986 [03:57<5:16:34,  1.76s/it]

training loss: 3.4574763774871826


training:   2%|▏         | 166/10986 [03:59<5:19:46,  1.77s/it]

training loss: 3.4095540046691895


training:   2%|▏         | 167/10986 [04:00<4:57:07,  1.65s/it]

training loss: 3.458979606628418


training:   2%|▏         | 168/10986 [04:02<4:39:23,  1.55s/it]

training loss: 3.4004154205322266


training:   2%|▏         | 169/10986 [04:03<4:26:21,  1.48s/it]

training loss: 3.5030224323272705


training:   2%|▏         | 170/10986 [04:04<4:17:42,  1.43s/it]

training loss: 3.5291597843170166


training:   2%|▏         | 171/10986 [04:06<4:23:57,  1.46s/it]

training loss: 3.610795736312866


training:   2%|▏         | 172/10986 [04:07<4:15:48,  1.42s/it]

training loss: 3.430988311767578


training:   2%|▏         | 173/10986 [04:08<4:09:49,  1.39s/it]

training loss: 3.4757819175720215


training:   2%|▏         | 174/10986 [04:10<4:04:32,  1.36s/it]

training loss: 3.5755300521850586


training:   2%|▏         | 175/10986 [04:11<4:01:28,  1.34s/it]

training loss: 3.4080681800842285


training:   2%|▏         | 176/10986 [04:12<3:58:26,  1.32s/it]

training loss: 3.509337902069092


training:   2%|▏         | 177/10986 [04:14<3:56:08,  1.31s/it]

training loss: 3.466003656387329


training:   2%|▏         | 178/10986 [04:15<3:54:53,  1.30s/it]

training loss: 3.4660396575927734


training:   2%|▏         | 179/10986 [04:16<3:53:30,  1.30s/it]

training loss: 3.3202359676361084


training:   2%|▏         | 180/10986 [04:17<3:53:11,  1.29s/it]

training loss: 3.4205169677734375
valid loss: 3.4116363525390625
perplexity: 30.314809799194336


training:   2%|▏         | 181/10986 [04:20<5:05:36,  1.70s/it]

training loss: 3.490306854248047


training:   2%|▏         | 182/10986 [04:22<4:58:42,  1.66s/it]

training loss: 3.4589316844940186


training:   2%|▏         | 183/10986 [04:23<4:53:13,  1.63s/it]

training loss: 3.5362188816070557


training:   2%|▏         | 184/10986 [04:24<4:35:50,  1.53s/it]

training loss: 3.4199953079223633


training:   2%|▏         | 185/10986 [04:26<4:22:35,  1.46s/it]

training loss: 3.389787197113037


training:   2%|▏         | 186/10986 [04:27<4:12:23,  1.40s/it]

training loss: 3.436598539352417


training:   2%|▏         | 187/10986 [04:28<4:05:22,  1.36s/it]

training loss: 3.3355700969696045


training:   2%|▏         | 188/10986 [04:30<4:01:35,  1.34s/it]

training loss: 3.488196849822998


training:   2%|▏         | 189/10986 [04:31<3:58:33,  1.33s/it]

training loss: 3.4842541217803955


training:   2%|▏         | 190/10986 [04:32<3:57:08,  1.32s/it]

training loss: 3.3225622177124023


training:   2%|▏         | 191/10986 [04:34<4:07:03,  1.37s/it]

training loss: 3.528108596801758


training:   2%|▏         | 192/10986 [04:35<4:05:45,  1.37s/it]

training loss: 3.5786314010620117


training:   2%|▏         | 193/10986 [04:36<4:02:22,  1.35s/it]

training loss: 3.482042074203491


training:   2%|▏         | 194/10986 [04:38<3:58:28,  1.33s/it]

training loss: 3.4695773124694824


training:   2%|▏         | 195/10986 [04:39<3:55:28,  1.31s/it]

training loss: 3.6459853649139404


training:   2%|▏         | 196/10986 [04:40<3:55:20,  1.31s/it]

training loss: 3.4848906993865967


training:   2%|▏         | 197/10986 [04:41<3:54:19,  1.30s/it]

training loss: 3.3582875728607178


training:   2%|▏         | 198/10986 [04:43<3:54:04,  1.30s/it]

training loss: 3.34336256980896


training:   2%|▏         | 199/10986 [04:44<3:52:38,  1.29s/it]

training loss: 3.5116610527038574


training:   2%|▏         | 200/10986 [04:45<3:52:14,  1.29s/it]

training loss: 3.540149688720703
valid loss: 3.530245304107666
perplexity: 34.13233947753906


training:   2%|▏         | 201/10986 [04:48<5:03:16,  1.69s/it]

training loss: 3.5342254638671875


training:   2%|▏         | 202/10986 [04:49<4:45:01,  1.59s/it]

training loss: 3.5313055515289307


training:   2%|▏         | 203/10986 [04:51<4:29:21,  1.50s/it]

training loss: 3.4010651111602783


training:   2%|▏         | 204/10986 [04:52<4:17:09,  1.43s/it]

training loss: 3.459660053253174


training:   2%|▏         | 205/10986 [04:53<4:09:34,  1.39s/it]

training loss: 3.354768991470337


training:   2%|▏         | 206/10986 [04:54<4:03:38,  1.36s/it]

training loss: 3.525045394897461


training:   2%|▏         | 207/10986 [04:56<4:00:03,  1.34s/it]

training loss: 3.462669849395752


training:   2%|▏         | 208/10986 [04:57<3:58:01,  1.33s/it]

training loss: 3.3351759910583496


training:   2%|▏         | 209/10986 [04:58<3:55:51,  1.31s/it]

training loss: 3.4641335010528564


training:   2%|▏         | 210/10986 [05:00<3:54:37,  1.31s/it]

training loss: 3.419130802154541


training:   2%|▏         | 211/10986 [05:01<4:05:29,  1.37s/it]

training loss: 3.4473841190338135


training:   2%|▏         | 212/10986 [05:02<4:02:50,  1.35s/it]

training loss: 3.5411624908447266


training:   2%|▏         | 213/10986 [05:04<3:59:06,  1.33s/it]

training loss: 3.4831690788269043


training:   2%|▏         | 214/10986 [05:05<3:58:15,  1.33s/it]

training loss: 3.456819534301758


training:   2%|▏         | 215/10986 [05:06<3:56:04,  1.32s/it]

training loss: 3.3912763595581055


training:   2%|▏         | 216/10986 [05:08<3:54:32,  1.31s/it]

training loss: 3.514355421066284


training:   2%|▏         | 217/10986 [05:09<3:54:30,  1.31s/it]

training loss: 3.453356981277466


training:   2%|▏         | 218/10986 [05:10<3:54:19,  1.31s/it]

training loss: 3.4141650199890137


training:   2%|▏         | 219/10986 [05:11<3:53:21,  1.30s/it]

training loss: 3.3989880084991455


training:   2%|▏         | 220/10986 [05:13<3:52:09,  1.29s/it]

training loss: 3.397167205810547
valid loss: 3.392866373062134
perplexity: 29.751108169555664


training:   2%|▏         | 221/10986 [05:15<5:07:35,  1.71s/it]

training loss: 3.3892900943756104


training:   2%|▏         | 222/10986 [05:17<4:48:43,  1.61s/it]

training loss: 3.434086322784424


training:   2%|▏         | 223/10986 [05:18<4:31:33,  1.51s/it]

training loss: 3.4018642902374268


training:   2%|▏         | 224/10986 [05:19<4:19:03,  1.44s/it]

training loss: 3.439013719558716


training:   2%|▏         | 225/10986 [05:21<4:10:12,  1.40s/it]

training loss: 3.3898661136627197


training:   2%|▏         | 226/10986 [05:22<4:04:43,  1.36s/it]

training loss: 3.4294540882110596


training:   2%|▏         | 227/10986 [05:23<3:59:53,  1.34s/it]

training loss: 3.3706612586975098


training:   2%|▏         | 228/10986 [05:25<3:58:48,  1.33s/it]

training loss: 3.2924671173095703


training:   2%|▏         | 229/10986 [05:26<3:56:41,  1.32s/it]

training loss: 3.397976875305176


training:   2%|▏         | 230/10986 [05:27<3:55:06,  1.31s/it]

training loss: 3.4025940895080566


training:   2%|▏         | 231/10986 [05:29<4:06:43,  1.38s/it]

training loss: 3.470703125


training:   2%|▏         | 232/10986 [05:30<4:04:25,  1.36s/it]

training loss: 3.333287000656128


training:   2%|▏         | 233/10986 [05:31<4:01:55,  1.35s/it]

training loss: 3.402494430541992


training:   2%|▏         | 234/10986 [05:33<3:58:49,  1.33s/it]

training loss: 3.3935530185699463


training:   2%|▏         | 235/10986 [05:34<3:55:53,  1.32s/it]

training loss: 3.4669954776763916


training:   2%|▏         | 236/10986 [05:35<3:54:24,  1.31s/it]

training loss: 3.359135389328003


training:   2%|▏         | 237/10986 [05:37<3:55:35,  1.32s/it]

training loss: 3.3771088123321533


training:   2%|▏         | 238/10986 [05:38<3:53:12,  1.30s/it]

training loss: 3.5074574947357178


training:   2%|▏         | 239/10986 [05:39<3:52:20,  1.30s/it]

training loss: 3.3602988719940186


training:   2%|▏         | 240/10986 [05:40<3:52:56,  1.30s/it]

training loss: 3.565607786178589
valid loss: 3.5513765811920166
perplexity: 34.861270904541016


training:   2%|▏         | 241/10986 [05:43<5:06:20,  1.71s/it]

training loss: 3.3660924434661865


training:   2%|▏         | 242/10986 [05:44<4:46:36,  1.60s/it]

training loss: 3.4365594387054443


training:   2%|▏         | 243/10986 [05:46<4:30:28,  1.51s/it]

training loss: 3.3712470531463623


training:   2%|▏         | 244/10986 [05:47<4:19:03,  1.45s/it]

training loss: 3.4273386001586914


training:   2%|▏         | 245/10986 [05:48<4:10:42,  1.40s/it]

training loss: 3.3705239295959473


training:   2%|▏         | 246/10986 [05:50<4:04:40,  1.37s/it]

training loss: 3.357623815536499


training:   2%|▏         | 247/10986 [05:51<4:00:41,  1.34s/it]

training loss: 3.4024455547332764


training:   2%|▏         | 248/10986 [05:52<3:56:58,  1.32s/it]

training loss: 3.4694089889526367


training:   2%|▏         | 249/10986 [05:53<3:57:41,  1.33s/it]

training loss: 3.443791627883911


training:   2%|▏         | 250/10986 [05:55<3:55:42,  1.32s/it]

training loss: 3.4150078296661377


training:   2%|▏         | 251/10986 [05:56<4:06:13,  1.38s/it]

training loss: 3.391085624694824


training:   2%|▏         | 252/10986 [05:58<4:02:03,  1.35s/it]

training loss: 3.4565038681030273


training:   2%|▏         | 253/10986 [05:59<3:57:13,  1.33s/it]

training loss: 3.4490914344787598


training:   2%|▏         | 254/10986 [06:00<3:54:34,  1.31s/it]

training loss: 3.321467638015747


training:   2%|▏         | 255/10986 [06:01<3:52:25,  1.30s/it]

training loss: 3.449155569076538


training:   2%|▏         | 256/10986 [06:03<3:51:39,  1.30s/it]

training loss: 3.4309799671173096


training:   2%|▏         | 257/10986 [06:04<3:51:55,  1.30s/it]

training loss: 3.366734743118286


training:   2%|▏         | 258/10986 [06:05<3:50:21,  1.29s/it]

training loss: 3.4522316455841064


training:   2%|▏         | 259/10986 [06:07<3:53:57,  1.31s/it]

training loss: 3.414597988128662


training:   2%|▏         | 260/10986 [06:08<3:53:12,  1.30s/it]

training loss: 3.3767151832580566
valid loss: 3.373201608657837
perplexity: 29.171775817871094


training:   2%|▏         | 261/10986 [06:11<5:07:43,  1.72s/it]

training loss: 3.4429879188537598


training:   2%|▏         | 262/10986 [06:12<4:51:47,  1.63s/it]

training loss: 3.4491634368896484


training:   2%|▏         | 263/10986 [06:13<4:34:02,  1.53s/it]

training loss: 3.3506906032562256


training:   2%|▏         | 264/10986 [06:15<4:20:38,  1.46s/it]

training loss: 3.562206268310547


training:   2%|▏         | 265/10986 [06:16<4:11:38,  1.41s/it]

training loss: 3.356653928756714


training:   2%|▏         | 266/10986 [06:17<4:04:51,  1.37s/it]

training loss: 3.3870654106140137


training:   2%|▏         | 267/10986 [06:18<4:01:32,  1.35s/it]

training loss: 3.3292415142059326


training:   2%|▏         | 268/10986 [06:20<3:56:49,  1.33s/it]

training loss: 3.432405710220337


training:   2%|▏         | 269/10986 [06:21<3:54:14,  1.31s/it]

training loss: 3.4508602619171143


training:   2%|▏         | 270/10986 [06:22<3:52:35,  1.30s/it]

training loss: 3.4067494869232178


training:   2%|▏         | 271/10986 [06:24<4:03:15,  1.36s/it]

training loss: 3.3992998600006104


training:   2%|▏         | 272/10986 [06:25<4:00:30,  1.35s/it]

training loss: 3.3990514278411865


training:   2%|▏         | 273/10986 [06:26<3:56:05,  1.32s/it]

training loss: 3.3934764862060547


training:   2%|▏         | 274/10986 [06:28<3:53:56,  1.31s/it]

training loss: 3.3775107860565186


training:   3%|▎         | 275/10986 [06:29<3:52:11,  1.30s/it]

training loss: 3.510993480682373


training:   3%|▎         | 276/10986 [06:30<3:50:09,  1.29s/it]

training loss: 3.407928705215454


training:   3%|▎         | 277/10986 [06:31<3:49:20,  1.28s/it]

training loss: 3.326474666595459


training:   3%|▎         | 278/10986 [06:33<3:48:25,  1.28s/it]

training loss: 3.5013222694396973


training:   3%|▎         | 279/10986 [06:34<3:48:35,  1.28s/it]

training loss: 3.4412295818328857


training:   3%|▎         | 280/10986 [06:35<3:48:11,  1.28s/it]

training loss: 3.46174955368042
valid loss: 3.442466974258423
perplexity: 31.263988494873047


training:   3%|▎         | 281/10986 [06:38<5:05:27,  1.71s/it]

training loss: 3.3417890071868896


training:   3%|▎         | 282/10986 [06:39<4:45:14,  1.60s/it]

training loss: 3.5478155612945557


training:   3%|▎         | 283/10986 [06:41<4:29:06,  1.51s/it]

training loss: 3.437451124191284


training:   3%|▎         | 284/10986 [06:42<4:16:32,  1.44s/it]

training loss: 3.3770923614501953


training:   3%|▎         | 285/10986 [06:43<4:09:16,  1.40s/it]

training loss: 3.357442855834961


training:   3%|▎         | 286/10986 [06:45<4:02:51,  1.36s/it]

training loss: 3.3831162452697754


training:   3%|▎         | 287/10986 [06:46<3:56:38,  1.33s/it]

training loss: 3.3423092365264893


training:   3%|▎         | 288/10986 [06:47<3:54:33,  1.32s/it]

training loss: 3.306029796600342


training:   3%|▎         | 289/10986 [06:48<3:51:42,  1.30s/it]

training loss: 3.325347661972046


training:   3%|▎         | 290/10986 [06:50<3:48:56,  1.28s/it]

training loss: 3.316263198852539


training:   3%|▎         | 291/10986 [06:51<3:59:41,  1.34s/it]

training loss: 3.3966095447540283


training:   3%|▎         | 292/10986 [06:52<3:55:53,  1.32s/it]

training loss: 3.3971660137176514


training:   3%|▎         | 293/10986 [06:54<3:51:08,  1.30s/it]

training loss: 3.480912685394287


training:   3%|▎         | 294/10986 [06:55<3:49:38,  1.29s/it]

training loss: 3.4830100536346436


training:   3%|▎         | 295/10986 [06:56<3:48:20,  1.28s/it]

training loss: 3.3655357360839844


training:   3%|▎         | 296/10986 [06:57<3:46:28,  1.27s/it]

training loss: 3.3268637657165527


training:   3%|▎         | 297/10986 [06:59<3:45:29,  1.27s/it]

training loss: 3.407987117767334


training:   3%|▎         | 298/10986 [07:00<3:45:02,  1.26s/it]

training loss: 3.365912914276123


training:   3%|▎         | 299/10986 [07:01<3:45:18,  1.26s/it]

training loss: 3.4400453567504883


training:   3%|▎         | 300/10986 [07:02<3:45:34,  1.27s/it]

training loss: 3.481367588043213
valid loss: 3.462796211242676
perplexity: 31.906068801879883


training:   3%|▎         | 301/10986 [07:05<4:57:03,  1.67s/it]

training loss: 3.5487236976623535


training:   3%|▎         | 302/10986 [07:06<4:39:32,  1.57s/it]

training loss: 3.3025221824645996


training:   3%|▎         | 303/10986 [07:08<4:22:20,  1.47s/it]

training loss: 3.4723987579345703


training:   3%|▎         | 304/10986 [07:09<4:09:27,  1.40s/it]

training loss: 3.3748834133148193


training:   3%|▎         | 305/10986 [07:10<4:05:07,  1.38s/it]

training loss: 3.4298505783081055


training:   3%|▎         | 306/10986 [07:11<3:58:56,  1.34s/it]

training loss: 3.2661736011505127


training:   3%|▎         | 307/10986 [07:13<3:54:14,  1.32s/it]

training loss: 3.4128901958465576


training:   3%|▎         | 308/10986 [07:14<3:51:20,  1.30s/it]

training loss: 3.34087872505188


training:   3%|▎         | 309/10986 [07:15<3:48:56,  1.29s/it]

training loss: 3.2550277709960938


training:   3%|▎         | 310/10986 [07:16<3:47:03,  1.28s/it]

training loss: 3.4744770526885986


training:   3%|▎         | 311/10986 [07:18<3:58:33,  1.34s/it]

training loss: 3.360365629196167


training:   3%|▎         | 312/10986 [07:19<3:55:57,  1.33s/it]

training loss: 3.231797456741333


training:   3%|▎         | 313/10986 [07:20<3:51:42,  1.30s/it]

training loss: 3.500286817550659


training:   3%|▎         | 314/10986 [07:22<3:49:02,  1.29s/it]

training loss: 3.24808406829834


training:   3%|▎         | 315/10986 [07:23<3:47:05,  1.28s/it]

training loss: 3.31526255607605


training:   3%|▎         | 316/10986 [07:24<3:45:56,  1.27s/it]

training loss: 3.33931303024292


training:   3%|▎         | 317/10986 [07:25<3:44:25,  1.26s/it]

training loss: 3.3431928157806396


training:   3%|▎         | 318/10986 [07:27<3:44:15,  1.26s/it]

training loss: 3.4374938011169434


training:   3%|▎         | 319/10986 [07:28<3:43:52,  1.26s/it]

training loss: 3.3117191791534424


training:   3%|▎         | 320/10986 [07:29<3:43:24,  1.26s/it]

training loss: 3.2936477661132812
valid loss: 3.288661003112793
perplexity: 26.806943893432617


training:   3%|▎         | 321/10986 [07:32<4:55:04,  1.66s/it]

training loss: 3.3815808296203613


training:   3%|▎         | 322/10986 [07:33<4:37:04,  1.56s/it]

training loss: 3.3947649002075195


training:   3%|▎         | 323/10986 [07:34<4:20:06,  1.46s/it]

training loss: 3.32698392868042


training:   3%|▎         | 324/10986 [07:36<4:08:53,  1.40s/it]

training loss: 3.2601711750030518


training:   3%|▎         | 325/10986 [07:37<4:00:46,  1.36s/it]

training loss: 3.265634536743164


training:   3%|▎         | 326/10986 [07:38<3:55:19,  1.32s/it]

training loss: 3.223222494125366


training:   3%|▎         | 327/10986 [07:39<3:51:08,  1.30s/it]

training loss: 3.342470169067383


training:   3%|▎         | 328/10986 [07:41<3:50:14,  1.30s/it]

training loss: 3.3950698375701904


training:   3%|▎         | 329/10986 [07:42<3:48:04,  1.28s/it]

training loss: 3.2670023441314697


training:   3%|▎         | 330/10986 [07:43<3:46:38,  1.28s/it]

training loss: 3.259854555130005


training:   3%|▎         | 331/10986 [07:45<3:57:42,  1.34s/it]

training loss: 3.3060126304626465


training:   3%|▎         | 332/10986 [07:46<3:53:02,  1.31s/it]

training loss: 3.2846148014068604


training:   3%|▎         | 333/10986 [07:47<3:49:03,  1.29s/it]

training loss: 3.294328451156616


training:   3%|▎         | 334/10986 [07:48<3:45:40,  1.27s/it]

training loss: 3.3427133560180664


training:   3%|▎         | 335/10986 [07:50<3:43:50,  1.26s/it]

training loss: 3.2885260581970215


training:   3%|▎         | 336/10986 [07:51<3:42:44,  1.25s/it]

training loss: 3.439486265182495


training:   3%|▎         | 337/10986 [07:52<3:41:43,  1.25s/it]

training loss: 3.3345937728881836


training:   3%|▎         | 338/10986 [07:53<3:40:24,  1.24s/it]

training loss: 3.432260513305664


training:   3%|▎         | 339/10986 [07:55<3:40:29,  1.24s/it]

training loss: 3.3208167552948


training:   3%|▎         | 340/10986 [07:56<3:39:37,  1.24s/it]

training loss: 3.4334158897399902
valid loss: 3.4273898601531982
perplexity: 30.79615592956543


training:   3%|▎         | 341/10986 [07:58<4:52:25,  1.65s/it]

training loss: 3.4009830951690674


training:   3%|▎         | 342/10986 [08:00<4:35:24,  1.55s/it]

training loss: 3.3605873584747314


training:   3%|▎         | 343/10986 [08:01<4:19:03,  1.46s/it]

training loss: 3.3259851932525635


training:   3%|▎         | 344/10986 [08:02<4:07:54,  1.40s/it]

training loss: 3.4046080112457275


training:   3%|▎         | 345/10986 [08:03<3:58:45,  1.35s/it]

training loss: 3.3229758739471436


training:   3%|▎         | 346/10986 [08:05<3:54:08,  1.32s/it]

training loss: 3.417463779449463


training:   3%|▎         | 347/10986 [08:06<3:50:47,  1.30s/it]

training loss: 3.2930123805999756


training:   3%|▎         | 348/10986 [08:07<3:47:41,  1.28s/it]

training loss: 3.315553903579712


training:   3%|▎         | 349/10986 [08:08<3:44:49,  1.27s/it]

training loss: 3.2739555835723877


training:   3%|▎         | 350/10986 [08:10<3:43:27,  1.26s/it]

training loss: 3.279862880706787


training:   3%|▎         | 351/10986 [08:11<3:55:41,  1.33s/it]

training loss: 3.2989940643310547


training:   3%|▎         | 352/10986 [08:12<3:53:29,  1.32s/it]

training loss: 3.377471923828125


training:   3%|▎         | 353/10986 [08:14<3:48:23,  1.29s/it]

training loss: 3.357830286026001


training:   3%|▎         | 354/10986 [08:15<3:44:30,  1.27s/it]

training loss: 3.380685567855835


training:   3%|▎         | 355/10986 [08:16<3:44:01,  1.26s/it]

training loss: 3.363132953643799


training:   3%|▎         | 356/10986 [08:17<3:41:04,  1.25s/it]

training loss: 3.3218719959259033


training:   3%|▎         | 357/10986 [08:19<3:40:36,  1.25s/it]

training loss: 3.2313220500946045


training:   3%|▎         | 358/10986 [08:20<3:39:27,  1.24s/it]

training loss: 3.3609812259674072


training:   3%|▎         | 359/10986 [08:21<3:38:45,  1.24s/it]

training loss: 3.3318305015563965


training:   3%|▎         | 360/10986 [08:22<3:39:24,  1.24s/it]

training loss: 3.3470258712768555
valid loss: 3.3305912017822266
perplexity: 27.954864501953125


training:   3%|▎         | 361/10986 [08:25<4:48:57,  1.63s/it]

training loss: 3.4700405597686768


training:   3%|▎         | 362/10986 [08:27<4:50:04,  1.64s/it]

training loss: 3.3064537048339844


training:   3%|▎         | 363/10986 [08:28<4:28:32,  1.52s/it]

training loss: 3.3723950386047363


training:   3%|▎         | 364/10986 [08:29<4:12:58,  1.43s/it]

training loss: 3.2819290161132812


training:   3%|▎         | 365/10986 [08:30<4:02:44,  1.37s/it]

training loss: 3.2725419998168945


training:   3%|▎         | 366/10986 [08:31<3:54:50,  1.33s/it]

training loss: 3.254519462585449


training:   3%|▎         | 367/10986 [08:33<3:49:48,  1.30s/it]

training loss: 3.358698606491089


training:   3%|▎         | 368/10986 [08:34<3:45:32,  1.27s/it]

training loss: 3.225776195526123


training:   3%|▎         | 369/10986 [08:35<3:44:35,  1.27s/it]

training loss: 3.383451461791992


training:   3%|▎         | 370/10986 [08:36<3:41:53,  1.25s/it]

training loss: 3.3798928260803223


training:   3%|▎         | 371/10986 [08:38<3:52:48,  1.32s/it]

training loss: 3.276560068130493


training:   3%|▎         | 372/10986 [08:39<3:48:32,  1.29s/it]

training loss: 3.344076633453369


training:   3%|▎         | 373/10986 [08:40<3:45:12,  1.27s/it]

training loss: 3.338165760040283


training:   3%|▎         | 374/10986 [08:42<3:45:43,  1.28s/it]

training loss: 3.2349305152893066


training:   3%|▎         | 375/10986 [08:43<3:43:21,  1.26s/it]

training loss: 3.2726054191589355


training:   3%|▎         | 376/10986 [08:44<3:41:37,  1.25s/it]

training loss: 3.30753231048584


training:   3%|▎         | 377/10986 [08:45<3:39:59,  1.24s/it]

training loss: 3.3927130699157715


training:   3%|▎         | 378/10986 [08:47<3:39:39,  1.24s/it]

training loss: 3.3541312217712402


training:   3%|▎         | 379/10986 [08:48<3:37:57,  1.23s/it]

training loss: 3.3626248836517334


training:   3%|▎         | 380/10986 [08:49<3:37:25,  1.23s/it]

training loss: 3.32796311378479
valid loss: 3.325742483139038
perplexity: 27.81964683532715


training:   3%|▎         | 381/10986 [08:52<4:48:19,  1.63s/it]

training loss: 3.3394434452056885


training:   3%|▎         | 382/10986 [08:53<4:31:21,  1.54s/it]

training loss: 3.2989578247070312


training:   3%|▎         | 383/10986 [08:54<4:16:45,  1.45s/it]

training loss: 3.322610855102539


training:   3%|▎         | 384/10986 [08:55<4:05:01,  1.39s/it]

training loss: 3.2898037433624268


training:   4%|▎         | 385/10986 [08:57<3:56:49,  1.34s/it]

training loss: 3.2849621772766113


training:   4%|▎         | 386/10986 [08:58<3:50:19,  1.30s/it]

training loss: 3.3485522270202637


training:   4%|▎         | 387/10986 [08:59<3:47:28,  1.29s/it]

training loss: 3.3303885459899902


training:   4%|▎         | 388/10986 [09:00<3:44:22,  1.27s/it]

training loss: 3.429349422454834


training:   4%|▎         | 389/10986 [09:01<3:41:56,  1.26s/it]

training loss: 3.3560760021209717


training:   4%|▎         | 390/10986 [09:03<3:40:24,  1.25s/it]

training loss: 3.22709321975708


training:   4%|▎         | 391/10986 [09:04<3:52:27,  1.32s/it]

training loss: 3.4933900833129883


training:   4%|▎         | 392/10986 [09:05<3:49:01,  1.30s/it]

training loss: 3.303072452545166


training:   4%|▎         | 393/10986 [09:07<3:45:25,  1.28s/it]

training loss: 3.305647134780884


training:   4%|▎         | 394/10986 [09:08<3:44:11,  1.27s/it]

training loss: 3.2919230461120605


training:   4%|▎         | 395/10986 [09:10<4:02:13,  1.37s/it]

training loss: 3.3602287769317627


training:   4%|▎         | 396/10986 [09:11<4:28:35,  1.52s/it]

training loss: 3.3417320251464844


training:   4%|▎         | 397/10986 [09:13<4:38:22,  1.58s/it]

training loss: 3.3417274951934814


training:   4%|▎         | 398/10986 [09:15<4:44:51,  1.61s/it]

training loss: 3.213169574737549


training:   4%|▎         | 399/10986 [09:16<4:25:57,  1.51s/it]

training loss: 3.4379971027374268


training:   4%|▎         | 400/10986 [09:17<4:12:11,  1.43s/it]

training loss: 3.352809429168701
valid loss: 3.3493733406066895
perplexity: 28.48487663269043


training:   4%|▎         | 401/10986 [09:20<5:14:05,  1.78s/it]

training loss: 3.3196511268615723


training:   4%|▎         | 402/10986 [09:21<4:50:37,  1.65s/it]

training loss: 3.3812079429626465


training:   4%|▎         | 403/10986 [09:22<4:28:48,  1.52s/it]

training loss: 3.373450517654419


training:   4%|▎         | 404/10986 [09:24<4:13:16,  1.44s/it]

training loss: 3.4546055793762207


training:   4%|▎         | 405/10986 [09:25<4:03:34,  1.38s/it]

training loss: 3.3066136837005615


training:   4%|▎         | 406/10986 [09:26<3:57:41,  1.35s/it]

training loss: 3.32763671875


training:   4%|▎         | 407/10986 [09:27<3:51:40,  1.31s/it]

training loss: 3.2680611610412598


training:   4%|▎         | 408/10986 [09:29<3:48:48,  1.30s/it]

training loss: 3.2996156215667725


training:   4%|▎         | 409/10986 [09:30<3:45:55,  1.28s/it]

training loss: 3.336099624633789


training:   4%|▎         | 410/10986 [09:31<3:43:40,  1.27s/it]

training loss: 3.2540786266326904


training:   4%|▎         | 411/10986 [09:33<3:55:03,  1.33s/it]

training loss: 3.470247745513916


training:   4%|▍         | 412/10986 [09:34<3:51:27,  1.31s/it]

training loss: 3.2909793853759766


training:   4%|▍         | 413/10986 [09:35<3:48:06,  1.29s/it]

training loss: 3.253317356109619


training:   4%|▍         | 414/10986 [09:36<3:46:30,  1.29s/it]

training loss: 3.229188919067383


training:   4%|▍         | 415/10986 [09:38<3:43:46,  1.27s/it]

training loss: 3.330669403076172


training:   4%|▍         | 416/10986 [09:39<3:42:16,  1.26s/it]

training loss: 3.268969774246216


training:   4%|▍         | 417/10986 [09:40<3:41:29,  1.26s/it]

training loss: 3.2890381813049316


training:   4%|▍         | 418/10986 [09:41<3:43:29,  1.27s/it]

training loss: 3.2191569805145264


training:   4%|▍         | 419/10986 [09:43<3:43:45,  1.27s/it]

training loss: 3.3308379650115967


training:   4%|▍         | 420/10986 [09:44<3:42:13,  1.26s/it]

training loss: 3.352377414703369
valid loss: 3.340909242630005
perplexity: 28.244794845581055


training:   4%|▍         | 421/10986 [09:47<4:52:40,  1.66s/it]

training loss: 3.324498414993286


training:   4%|▍         | 422/10986 [09:48<4:34:41,  1.56s/it]

training loss: 3.336932897567749


training:   4%|▍         | 423/10986 [09:49<4:18:31,  1.47s/it]

training loss: 3.234102249145508


training:   4%|▍         | 424/10986 [09:50<4:06:55,  1.40s/it]

training loss: 3.2214674949645996


training:   4%|▍         | 425/10986 [09:52<3:59:15,  1.36s/it]

training loss: 3.2441017627716064


training:   4%|▍         | 426/10986 [09:53<3:53:09,  1.32s/it]

training loss: 3.3122615814208984


training:   4%|▍         | 427/10986 [09:54<3:48:57,  1.30s/it]

training loss: 3.2290449142456055


training:   4%|▍         | 428/10986 [09:55<3:45:30,  1.28s/it]

training loss: 3.1995368003845215


training:   4%|▍         | 429/10986 [09:57<3:43:12,  1.27s/it]

training loss: 3.383207082748413


training:   4%|▍         | 430/10986 [09:58<3:41:14,  1.26s/it]

training loss: 3.305208206176758


training:   4%|▍         | 431/10986 [09:59<3:51:28,  1.32s/it]

training loss: 3.1602773666381836


training:   4%|▍         | 432/10986 [10:01<3:48:34,  1.30s/it]

training loss: 3.3186612129211426


training:   4%|▍         | 433/10986 [10:02<3:44:39,  1.28s/it]

training loss: 3.369387626647949


training:   4%|▍         | 434/10986 [10:03<3:41:41,  1.26s/it]

training loss: 3.288630485534668


training:   4%|▍         | 435/10986 [10:04<3:39:21,  1.25s/it]

training loss: 3.318410634994507


training:   4%|▍         | 436/10986 [10:05<3:38:33,  1.24s/it]

training loss: 3.422703266143799


training:   4%|▍         | 437/10986 [10:07<3:38:07,  1.24s/it]

training loss: 3.302913188934326


training:   4%|▍         | 438/10986 [10:08<3:38:08,  1.24s/it]

training loss: 3.2775661945343018


training:   4%|▍         | 439/10986 [10:09<3:37:46,  1.24s/it]

training loss: 3.304898262023926


training:   4%|▍         | 440/10986 [10:10<3:38:09,  1.24s/it]

training loss: 3.275312900543213
valid loss: 3.267761468887329
perplexity: 26.252506256103516


training:   4%|▍         | 441/10986 [10:13<4:55:58,  1.68s/it]

training loss: 3.1958703994750977


training:   4%|▍         | 442/10986 [10:15<4:49:38,  1.65s/it]

training loss: 3.2240848541259766


training:   4%|▍         | 443/10986 [10:16<4:29:01,  1.53s/it]

training loss: 3.3194046020507812


training:   4%|▍         | 444/10986 [10:17<4:14:23,  1.45s/it]

training loss: 3.2504820823669434


training:   4%|▍         | 445/10986 [10:18<4:03:31,  1.39s/it]

training loss: 3.2992775440216064


training:   4%|▍         | 446/10986 [10:20<3:55:35,  1.34s/it]

training loss: 3.220973491668701


training:   4%|▍         | 447/10986 [10:21<3:50:15,  1.31s/it]

training loss: 3.2757296562194824


training:   4%|▍         | 448/10986 [10:22<3:46:20,  1.29s/it]

training loss: 3.245180130004883


training:   4%|▍         | 449/10986 [10:23<3:42:43,  1.27s/it]

training loss: 3.289691686630249


training:   4%|▍         | 450/10986 [10:25<3:41:50,  1.26s/it]

training loss: 3.3266143798828125


training:   4%|▍         | 451/10986 [10:26<3:53:08,  1.33s/it]

training loss: 3.3404369354248047


training:   4%|▍         | 452/10986 [10:28<4:07:11,  1.41s/it]

training loss: 3.2821409702301025


training:   4%|▍         | 453/10986 [10:29<3:57:35,  1.35s/it]

training loss: 3.349771022796631


training:   4%|▍         | 454/10986 [10:30<3:51:09,  1.32s/it]

training loss: 3.245473623275757


training:   4%|▍         | 455/10986 [10:31<3:46:59,  1.29s/it]

training loss: 3.233610153198242


training:   4%|▍         | 456/10986 [10:33<3:43:27,  1.27s/it]

training loss: 3.294912338256836


training:   4%|▍         | 457/10986 [10:34<3:41:15,  1.26s/it]

training loss: 3.4217402935028076


training:   4%|▍         | 458/10986 [10:35<3:41:09,  1.26s/it]

training loss: 3.2761387825012207


training:   4%|▍         | 459/10986 [10:36<3:41:54,  1.26s/it]

training loss: 3.350388288497925


training:   4%|▍         | 460/10986 [10:38<3:40:27,  1.26s/it]

training loss: 3.25089693069458
valid loss: 3.2526497840881348
perplexity: 25.858768463134766


training:   4%|▍         | 461/10986 [10:40<4:49:45,  1.65s/it]

training loss: 3.399468421936035


training:   4%|▍         | 462/10986 [10:42<4:31:41,  1.55s/it]

training loss: 3.4059810638427734


training:   4%|▍         | 463/10986 [10:43<4:18:27,  1.47s/it]

training loss: 3.392638921737671


training:   4%|▍         | 464/10986 [10:44<4:06:30,  1.41s/it]

training loss: 3.2544639110565186


training:   4%|▍         | 465/10986 [10:45<3:57:53,  1.36s/it]

training loss: 3.2197115421295166


training:   4%|▍         | 466/10986 [10:47<3:51:56,  1.32s/it]

training loss: 3.203342914581299


training:   4%|▍         | 467/10986 [10:48<3:47:31,  1.30s/it]

training loss: 3.2837882041931152


training:   4%|▍         | 468/10986 [10:49<3:44:22,  1.28s/it]

training loss: 3.516284227371216


training:   4%|▍         | 469/10986 [10:50<3:42:27,  1.27s/it]

training loss: 3.2765941619873047


training:   4%|▍         | 470/10986 [10:52<3:41:30,  1.26s/it]

training loss: 3.3001577854156494


training:   4%|▍         | 471/10986 [10:53<3:53:12,  1.33s/it]

training loss: 3.303161382675171


training:   4%|▍         | 472/10986 [10:54<3:51:02,  1.32s/it]

training loss: 3.227660655975342


training:   4%|▍         | 473/10986 [10:56<3:46:12,  1.29s/it]

training loss: 3.3731541633605957


training:   4%|▍         | 474/10986 [10:57<3:43:48,  1.28s/it]

training loss: 3.3798251152038574


training:   4%|▍         | 475/10986 [10:58<3:42:03,  1.27s/it]

training loss: 3.2404513359069824


training:   4%|▍         | 476/10986 [10:59<3:39:39,  1.25s/it]

training loss: 3.223442316055298


training:   4%|▍         | 477/10986 [11:01<3:38:14,  1.25s/it]

training loss: 3.272278070449829


training:   4%|▍         | 478/10986 [11:02<3:38:06,  1.25s/it]

training loss: 3.168220043182373


training:   4%|▍         | 479/10986 [11:03<3:37:51,  1.24s/it]

training loss: 3.2692880630493164


training:   4%|▍         | 480/10986 [11:04<3:37:37,  1.24s/it]

training loss: 3.232296943664551
valid loss: 3.222557544708252
perplexity: 25.092212677001953


training:   4%|▍         | 481/10986 [11:07<4:49:01,  1.65s/it]

training loss: 3.298379898071289


training:   4%|▍         | 482/10986 [11:08<4:31:20,  1.55s/it]

training loss: 3.450866460800171


training:   4%|▍         | 483/10986 [11:09<4:16:37,  1.47s/it]

training loss: 3.2757349014282227


training:   4%|▍         | 484/10986 [11:11<4:03:43,  1.39s/it]

training loss: 3.280369281768799


training:   4%|▍         | 485/10986 [11:12<3:56:00,  1.35s/it]

training loss: 3.248339891433716


training:   4%|▍         | 486/10986 [11:13<3:52:52,  1.33s/it]

training loss: 3.30782413482666


training:   4%|▍         | 487/10986 [11:14<3:48:12,  1.30s/it]

training loss: 3.2889018058776855


training:   4%|▍         | 488/10986 [11:16<3:44:11,  1.28s/it]

training loss: 3.345853805541992


training:   4%|▍         | 489/10986 [11:17<3:42:06,  1.27s/it]

training loss: 3.285244941711426


training:   4%|▍         | 490/10986 [11:18<3:40:13,  1.26s/it]

training loss: 3.2730252742767334


training:   4%|▍         | 491/10986 [11:20<3:50:51,  1.32s/it]

training loss: 3.2481889724731445


training:   4%|▍         | 492/10986 [11:21<4:04:45,  1.40s/it]

training loss: 3.2235605716705322


training:   4%|▍         | 493/10986 [11:22<3:55:55,  1.35s/it]

training loss: 3.333974599838257


training:   4%|▍         | 494/10986 [11:24<3:49:06,  1.31s/it]

training loss: 3.269589900970459


training:   5%|▍         | 495/10986 [11:25<3:45:54,  1.29s/it]

training loss: 3.1913599967956543


training:   5%|▍         | 496/10986 [11:26<3:42:55,  1.28s/it]

training loss: 3.242003917694092


training:   5%|▍         | 497/10986 [11:27<3:41:38,  1.27s/it]

training loss: 3.3367791175842285


training:   5%|▍         | 498/10986 [11:29<3:39:59,  1.26s/it]

training loss: 3.1548285484313965


training:   5%|▍         | 499/10986 [11:30<3:38:18,  1.25s/it]

training loss: 3.2408595085144043


training:   5%|▍         | 500/10986 [11:31<3:37:35,  1.25s/it]

training loss: 3.2718698978424072
valid loss: 3.2661755084991455
perplexity: 26.21090316772461


training:   5%|▍         | 501/10986 [11:34<4:48:50,  1.65s/it]

training loss: 3.216973304748535


training:   5%|▍         | 502/10986 [11:35<4:31:05,  1.55s/it]

training loss: 3.2729287147521973


training:   5%|▍         | 503/10986 [11:36<4:15:32,  1.46s/it]

training loss: 3.270498275756836


training:   5%|▍         | 504/10986 [11:37<4:02:54,  1.39s/it]

training loss: 3.1730055809020996


training:   5%|▍         | 505/10986 [11:39<3:57:04,  1.36s/it]

training loss: 3.147705078125


training:   5%|▍         | 506/10986 [11:40<3:51:39,  1.33s/it]

training loss: 3.259490966796875


training:   5%|▍         | 507/10986 [11:41<3:46:55,  1.30s/it]

training loss: 3.2845072746276855


training:   5%|▍         | 508/10986 [11:42<3:43:17,  1.28s/it]

training loss: 3.3335134983062744


training:   5%|▍         | 509/10986 [11:44<3:42:09,  1.27s/it]

training loss: 3.178614616394043


training:   5%|▍         | 510/10986 [11:45<3:41:47,  1.27s/it]

training loss: 3.3376855850219727


training:   5%|▍         | 511/10986 [11:46<3:51:25,  1.33s/it]

training loss: 3.3029637336730957


training:   5%|▍         | 512/10986 [11:48<3:47:25,  1.30s/it]

training loss: 3.239888906478882


training:   5%|▍         | 513/10986 [11:49<3:44:31,  1.29s/it]

training loss: 3.1912882328033447


training:   5%|▍         | 514/10986 [11:50<3:42:05,  1.27s/it]

training loss: 3.2553842067718506


training:   5%|▍         | 515/10986 [11:51<3:39:46,  1.26s/it]

training loss: 3.198575496673584


training:   5%|▍         | 516/10986 [11:53<3:38:39,  1.25s/it]

training loss: 3.3413643836975098


training:   5%|▍         | 517/10986 [11:54<3:37:37,  1.25s/it]

training loss: 3.2386810779571533


training:   5%|▍         | 518/10986 [11:55<3:36:48,  1.24s/it]

training loss: 3.257364511489868


training:   5%|▍         | 519/10986 [11:56<3:36:14,  1.24s/it]

training loss: 3.2929718494415283


training:   5%|▍         | 520/10986 [11:58<3:35:28,  1.24s/it]

training loss: 3.2924270629882812
valid loss: 3.2863521575927734
perplexity: 26.7451229095459


training:   5%|▍         | 521/10986 [12:00<4:46:38,  1.64s/it]

training loss: 3.2721688747406006


training:   5%|▍         | 522/10986 [12:01<4:29:13,  1.54s/it]

training loss: 3.2255465984344482


training:   5%|▍         | 523/10986 [12:03<4:13:06,  1.45s/it]

training loss: 3.3261938095092773


training:   5%|▍         | 524/10986 [12:04<4:02:08,  1.39s/it]

training loss: 3.2166707515716553


training:   5%|▍         | 525/10986 [12:05<3:55:08,  1.35s/it]

training loss: 3.3937411308288574


training:   5%|▍         | 526/10986 [12:06<3:48:03,  1.31s/it]

training loss: 3.1961076259613037


training:   5%|▍         | 527/10986 [12:08<3:44:42,  1.29s/it]

training loss: 3.374115228652954


training:   5%|▍         | 528/10986 [12:09<3:42:15,  1.28s/it]

training loss: 3.3668298721313477


training:   5%|▍         | 529/10986 [12:10<3:43:02,  1.28s/it]

training loss: 3.2345826625823975


training:   5%|▍         | 530/10986 [12:11<3:41:01,  1.27s/it]

training loss: 3.265042543411255


training:   5%|▍         | 531/10986 [12:13<3:51:25,  1.33s/it]

training loss: 3.23838210105896


training:   5%|▍         | 532/10986 [12:14<3:48:40,  1.31s/it]

training loss: 3.382387161254883


training:   5%|▍         | 533/10986 [12:15<3:45:38,  1.30s/it]

training loss: 3.3010096549987793


training:   5%|▍         | 534/10986 [12:17<3:42:32,  1.28s/it]

training loss: 3.253370523452759


training:   5%|▍         | 535/10986 [12:18<3:40:47,  1.27s/it]

training loss: 3.2671403884887695


training:   5%|▍         | 536/10986 [12:19<3:38:49,  1.26s/it]

training loss: 3.247535467147827


training:   5%|▍         | 537/10986 [12:20<3:37:40,  1.25s/it]

training loss: 3.323788642883301


training:   5%|▍         | 538/10986 [12:22<3:37:59,  1.25s/it]

training loss: 3.2317357063293457


training:   5%|▍         | 539/10986 [12:23<3:36:28,  1.24s/it]

training loss: 3.340543746948242


training:   5%|▍         | 540/10986 [12:24<3:35:26,  1.24s/it]

training loss: 3.2257747650146484
valid loss: 3.220066547393799
perplexity: 25.02978515625


training:   5%|▍         | 541/10986 [12:27<4:44:04,  1.63s/it]

training loss: 3.2915127277374268


training:   5%|▍         | 542/10986 [12:28<4:28:29,  1.54s/it]

training loss: 3.2892768383026123


training:   5%|▍         | 543/10986 [12:29<4:12:26,  1.45s/it]

training loss: 3.2671380043029785


training:   5%|▍         | 544/10986 [12:30<4:01:19,  1.39s/it]

training loss: 3.366360664367676


training:   5%|▍         | 545/10986 [12:32<3:54:43,  1.35s/it]

training loss: 3.2510387897491455


training:   5%|▍         | 546/10986 [12:33<3:49:02,  1.32s/it]

training loss: 3.237968921661377


training:   5%|▍         | 547/10986 [12:34<3:44:46,  1.29s/it]

training loss: 3.302152633666992


training:   5%|▍         | 548/10986 [12:35<3:40:26,  1.27s/it]

training loss: 3.294438600540161


training:   5%|▍         | 549/10986 [12:37<3:38:59,  1.26s/it]

training loss: 3.201038122177124


training:   5%|▌         | 550/10986 [12:38<3:37:06,  1.25s/it]

training loss: 3.344170093536377


training:   5%|▌         | 551/10986 [12:39<3:49:44,  1.32s/it]

training loss: 3.299743175506592


training:   5%|▌         | 552/10986 [12:41<3:45:47,  1.30s/it]

training loss: 3.353316068649292


training:   5%|▌         | 553/10986 [12:42<3:42:33,  1.28s/it]

training loss: 3.2921624183654785


training:   5%|▌         | 554/10986 [12:43<3:40:11,  1.27s/it]

training loss: 3.2967002391815186


training:   5%|▌         | 555/10986 [12:44<3:40:32,  1.27s/it]

training loss: 3.2372870445251465


training:   5%|▌         | 556/10986 [12:46<3:39:27,  1.26s/it]

training loss: 3.2495296001434326


training:   5%|▌         | 557/10986 [12:47<3:38:50,  1.26s/it]

training loss: 3.2132654190063477


training:   5%|▌         | 558/10986 [12:48<3:37:16,  1.25s/it]

training loss: 3.203991174697876


training:   5%|▌         | 559/10986 [12:49<3:36:38,  1.25s/it]

training loss: 3.3362979888916016


training:   5%|▌         | 560/10986 [12:51<3:35:52,  1.24s/it]

training loss: 3.254314422607422
valid loss: 3.2549643516540527
perplexity: 25.918691635131836


training:   5%|▌         | 561/10986 [12:53<4:48:09,  1.66s/it]

training loss: 3.2287583351135254


training:   5%|▌         | 562/10986 [12:54<4:30:38,  1.56s/it]

training loss: 3.1917829513549805


training:   5%|▌         | 563/10986 [12:56<4:13:52,  1.46s/it]

training loss: 3.2667620182037354


training:   5%|▌         | 564/10986 [12:57<4:01:23,  1.39s/it]

training loss: 3.2105929851531982


training:   5%|▌         | 565/10986 [12:58<3:52:13,  1.34s/it]

training loss: 3.291900157928467


training:   5%|▌         | 566/10986 [12:59<3:46:34,  1.30s/it]

training loss: 3.1914358139038086


training:   5%|▌         | 567/10986 [13:01<3:42:24,  1.28s/it]

training loss: 3.3385133743286133


training:   5%|▌         | 568/10986 [13:02<3:40:23,  1.27s/it]

training loss: 3.3090262413024902


training:   5%|▌         | 569/10986 [13:03<3:38:48,  1.26s/it]

training loss: 3.206334352493286


training:   5%|▌         | 570/10986 [13:04<3:37:31,  1.25s/it]

training loss: 3.327563524246216


training:   5%|▌         | 571/10986 [13:06<3:49:02,  1.32s/it]

training loss: 3.2523598670959473


training:   5%|▌         | 572/10986 [13:07<3:45:42,  1.30s/it]

training loss: 3.3174777030944824


training:   5%|▌         | 573/10986 [13:08<3:41:24,  1.28s/it]

training loss: 3.2023847103118896


training:   5%|▌         | 574/10986 [13:10<3:39:00,  1.26s/it]

training loss: 3.219133138656616


training:   5%|▌         | 575/10986 [13:11<3:38:03,  1.26s/it]

training loss: 3.2760913372039795


training:   5%|▌         | 576/10986 [13:12<3:36:58,  1.25s/it]

training loss: 3.2126622200012207


training:   5%|▌         | 577/10986 [13:13<3:35:53,  1.24s/it]

training loss: 3.2894933223724365


training:   5%|▌         | 578/10986 [13:15<3:38:10,  1.26s/it]

training loss: 3.2773969173431396


training:   5%|▌         | 579/10986 [13:16<3:36:31,  1.25s/it]

training loss: 3.3293330669403076


training:   5%|▌         | 580/10986 [13:17<3:35:03,  1.24s/it]

training loss: 3.2845067977905273
valid loss: 3.283951759338379
perplexity: 26.681001663208008


training:   5%|▌         | 581/10986 [13:20<4:44:09,  1.64s/it]

training loss: 3.3288893699645996


training:   5%|▌         | 582/10986 [13:21<4:26:28,  1.54s/it]

training loss: 3.2122747898101807


training:   5%|▌         | 583/10986 [13:22<4:10:17,  1.44s/it]

training loss: 3.315425395965576


training:   5%|▌         | 584/10986 [13:23<3:58:53,  1.38s/it]

training loss: 3.282315731048584


training:   5%|▌         | 585/10986 [13:25<3:51:43,  1.34s/it]

training loss: 3.19797420501709


training:   5%|▌         | 586/10986 [13:26<3:45:24,  1.30s/it]

training loss: 3.3230082988739014


training:   5%|▌         | 587/10986 [13:27<3:42:38,  1.28s/it]

training loss: 3.278057098388672


training:   5%|▌         | 588/10986 [13:28<3:40:31,  1.27s/it]

training loss: 3.3097751140594482


training:   5%|▌         | 589/10986 [13:29<3:38:42,  1.26s/it]

training loss: 3.163273811340332


training:   5%|▌         | 590/10986 [13:31<3:37:22,  1.25s/it]

training loss: 3.2035839557647705


training:   5%|▌         | 591/10986 [13:32<3:48:48,  1.32s/it]

training loss: 3.2710189819335938


training:   5%|▌         | 592/10986 [13:33<3:45:42,  1.30s/it]

training loss: 3.222705364227295


training:   5%|▌         | 593/10986 [13:35<3:42:38,  1.29s/it]

training loss: 3.250222682952881


training:   5%|▌         | 594/10986 [13:36<3:43:30,  1.29s/it]

training loss: 3.174072265625


training:   5%|▌         | 595/10986 [13:37<3:40:55,  1.28s/it]

training loss: 3.2293789386749268


training:   5%|▌         | 596/10986 [13:38<3:38:57,  1.26s/it]

training loss: 3.247846841812134


training:   5%|▌         | 597/10986 [13:40<3:38:01,  1.26s/it]

training loss: 3.27305269241333


training:   5%|▌         | 598/10986 [13:41<3:36:07,  1.25s/it]

training loss: 3.2342796325683594


training:   5%|▌         | 599/10986 [13:42<3:35:51,  1.25s/it]

training loss: 3.351224184036255


training:   5%|▌         | 600/10986 [13:43<3:34:27,  1.24s/it]

training loss: 3.1932146549224854
valid loss: 3.1896729469299316
perplexity: 24.280485153198242


training:   5%|▌         | 601/10986 [13:46<4:49:48,  1.67s/it]

training loss: 3.2507688999176025


training:   5%|▌         | 602/10986 [13:47<4:31:02,  1.57s/it]

training loss: 3.198760747909546


training:   5%|▌         | 603/10986 [13:49<4:13:49,  1.47s/it]

training loss: 3.3074774742126465


training:   5%|▌         | 604/10986 [13:50<4:01:03,  1.39s/it]

training loss: 3.2141644954681396


training:   6%|▌         | 605/10986 [13:51<3:52:06,  1.34s/it]

training loss: 3.3057122230529785


training:   6%|▌         | 606/10986 [13:52<3:45:41,  1.30s/it]

training loss: 3.3072729110717773


training:   6%|▌         | 607/10986 [13:54<3:41:23,  1.28s/it]

training loss: 3.2861716747283936


training:   6%|▌         | 608/10986 [13:55<3:39:00,  1.27s/it]

training loss: 3.347006320953369


training:   6%|▌         | 609/10986 [13:56<3:36:33,  1.25s/it]

training loss: 3.2435989379882812


training:   6%|▌         | 610/10986 [13:57<3:35:18,  1.25s/it]

training loss: 3.2145252227783203


training:   6%|▌         | 611/10986 [13:59<3:47:13,  1.31s/it]

training loss: 3.3092448711395264


training:   6%|▌         | 612/10986 [14:00<3:45:06,  1.30s/it]

training loss: 3.1899797916412354


training:   6%|▌         | 613/10986 [14:01<3:41:28,  1.28s/it]

training loss: 3.277656316757202


training:   6%|▌         | 614/10986 [14:02<3:38:55,  1.27s/it]

training loss: 3.294429302215576


training:   6%|▌         | 615/10986 [14:04<3:37:01,  1.26s/it]

training loss: 3.21043062210083


training:   6%|▌         | 616/10986 [14:05<3:36:08,  1.25s/it]

training loss: 3.2354514598846436


training:   6%|▌         | 617/10986 [14:06<3:35:21,  1.25s/it]

training loss: 3.16222882270813


training:   6%|▌         | 618/10986 [14:07<3:35:01,  1.24s/it]

training loss: 3.303844451904297


training:   6%|▌         | 619/10986 [14:09<3:34:47,  1.24s/it]

training loss: 3.277271270751953


training:   6%|▌         | 620/10986 [14:10<3:34:07,  1.24s/it]

training loss: 3.297833204269409
valid loss: 3.292121648788452
perplexity: 26.89987564086914


training:   6%|▌         | 621/10986 [14:12<4:44:24,  1.65s/it]

training loss: 3.3008224964141846


training:   6%|▌         | 622/10986 [14:14<4:26:27,  1.54s/it]

training loss: 3.307766914367676


training:   6%|▌         | 623/10986 [14:15<4:14:14,  1.47s/it]

training loss: 3.302565813064575


training:   6%|▌         | 624/10986 [14:16<4:02:06,  1.40s/it]

training loss: 3.2441554069519043


training:   6%|▌         | 625/10986 [14:18<3:54:15,  1.36s/it]

training loss: 3.2867591381073


training:   6%|▌         | 626/10986 [14:19<3:52:19,  1.35s/it]

training loss: 3.2097513675689697


training:   6%|▌         | 627/10986 [14:20<4:07:26,  1.43s/it]

training loss: 3.3423852920532227


training:   6%|▌         | 628/10986 [14:22<4:19:17,  1.50s/it]

training loss: 3.22530460357666


training:   6%|▌         | 629/10986 [14:24<4:27:50,  1.55s/it]

training loss: 3.1936750411987305


training:   6%|▌         | 630/10986 [14:25<4:32:11,  1.58s/it]

training loss: 3.312330722808838


training:   6%|▌         | 631/10986 [14:27<4:26:30,  1.54s/it]

training loss: 3.2318060398101807


training:   6%|▌         | 632/10986 [14:28<4:12:00,  1.46s/it]

training loss: 3.3473124504089355


training:   6%|▌         | 633/10986 [14:29<4:00:23,  1.39s/it]

training loss: 3.4912638664245605


training:   6%|▌         | 634/10986 [14:31<3:51:32,  1.34s/it]

training loss: 3.4140102863311768


training:   6%|▌         | 635/10986 [14:32<3:46:43,  1.31s/it]

training loss: 3.2384586334228516


training:   6%|▌         | 636/10986 [14:33<3:42:01,  1.29s/it]

training loss: 3.2861154079437256


training:   6%|▌         | 637/10986 [14:34<3:40:14,  1.28s/it]

training loss: 3.3458871841430664


training:   6%|▌         | 638/10986 [14:36<3:38:04,  1.26s/it]

training loss: 3.3125104904174805


training:   6%|▌         | 639/10986 [14:37<3:37:33,  1.26s/it]

training loss: 3.2638118267059326


training:   6%|▌         | 640/10986 [14:38<3:36:19,  1.25s/it]

training loss: 3.344782590866089
valid loss: 3.3302927017211914
perplexity: 27.946521759033203


training:   6%|▌         | 641/10986 [14:41<4:45:57,  1.66s/it]

training loss: 3.2761995792388916


training:   6%|▌         | 642/10986 [14:42<4:27:41,  1.55s/it]

training loss: 3.209200143814087


training:   6%|▌         | 643/10986 [14:43<4:11:37,  1.46s/it]

training loss: 3.2589340209960938


training:   6%|▌         | 644/10986 [14:44<4:01:03,  1.40s/it]

training loss: 3.27681565284729


training:   6%|▌         | 645/10986 [14:46<3:56:39,  1.37s/it]

training loss: 3.372628688812256


training:   6%|▌         | 646/10986 [14:47<3:50:46,  1.34s/it]

training loss: 3.371737003326416


training:   6%|▌         | 647/10986 [14:48<3:45:52,  1.31s/it]

training loss: 3.2605180740356445


training:   6%|▌         | 648/10986 [14:50<3:41:54,  1.29s/it]

training loss: 3.2542033195495605


training:   6%|▌         | 649/10986 [14:51<3:40:15,  1.28s/it]

training loss: 3.2958009243011475


training:   6%|▌         | 650/10986 [14:52<3:38:50,  1.27s/it]

training loss: 3.2643144130706787


training:   6%|▌         | 651/10986 [14:54<3:49:32,  1.33s/it]

training loss: 3.3048858642578125


training:   6%|▌         | 652/10986 [14:55<3:51:54,  1.35s/it]

training loss: 3.3092896938323975


training:   6%|▌         | 653/10986 [14:56<3:46:26,  1.31s/it]

training loss: 3.2403202056884766


training:   6%|▌         | 654/10986 [14:57<3:43:05,  1.30s/it]

training loss: 3.243328809738159


training:   6%|▌         | 655/10986 [14:59<3:38:53,  1.27s/it]

training loss: 3.2255194187164307


training:   6%|▌         | 656/10986 [15:00<3:38:52,  1.27s/it]

training loss: 3.290483236312866


training:   6%|▌         | 657/10986 [15:01<3:37:02,  1.26s/it]

training loss: 3.2585084438323975


training:   6%|▌         | 658/10986 [15:02<3:35:49,  1.25s/it]

training loss: 3.373309373855591


training:   6%|▌         | 659/10986 [15:04<3:34:47,  1.25s/it]

training loss: 3.270752429962158


training:   6%|▌         | 660/10986 [15:05<3:33:19,  1.24s/it]

training loss: 3.266528367996216
valid loss: 3.2634732723236084
perplexity: 26.14017105102539


training:   6%|▌         | 661/10986 [15:07<4:42:28,  1.64s/it]

training loss: 3.2138020992279053


training:   6%|▌         | 662/10986 [15:09<4:25:14,  1.54s/it]

training loss: 3.1815531253814697


training:   6%|▌         | 663/10986 [15:10<4:10:12,  1.45s/it]

training loss: 3.254310131072998


training:   6%|▌         | 664/10986 [15:11<3:58:51,  1.39s/it]

training loss: 3.216093063354492


training:   6%|▌         | 665/10986 [15:12<3:51:16,  1.34s/it]

training loss: 3.184215545654297


training:   6%|▌         | 666/10986 [15:14<3:45:10,  1.31s/it]

training loss: 3.3324289321899414


training:   6%|▌         | 667/10986 [15:15<3:41:56,  1.29s/it]

training loss: 3.3071224689483643


training:   6%|▌         | 668/10986 [15:16<3:40:54,  1.28s/it]

training loss: 3.3217244148254395


training:   6%|▌         | 669/10986 [15:17<3:38:18,  1.27s/it]

training loss: 3.218062400817871


training:   6%|▌         | 670/10986 [15:19<3:36:38,  1.26s/it]

training loss: 3.2031912803649902


training:   6%|▌         | 671/10986 [15:20<3:49:13,  1.33s/it]

training loss: 3.1878767013549805


training:   6%|▌         | 672/10986 [15:22<4:03:14,  1.42s/it]

training loss: 3.2250757217407227


training:   6%|▌         | 673/10986 [15:23<3:54:43,  1.37s/it]

training loss: 3.3157544136047363


training:   6%|▌         | 674/10986 [15:24<3:46:48,  1.32s/it]

training loss: 3.2672057151794434


training:   6%|▌         | 675/10986 [15:25<3:42:27,  1.29s/it]

training loss: 3.2225818634033203


training:   6%|▌         | 676/10986 [15:27<3:39:27,  1.28s/it]

training loss: 3.3066773414611816


training:   6%|▌         | 677/10986 [15:28<3:37:12,  1.26s/it]

training loss: 3.2264721393585205


training:   6%|▌         | 678/10986 [15:29<3:36:07,  1.26s/it]

training loss: 3.226976156234741


training:   6%|▌         | 679/10986 [15:30<3:34:53,  1.25s/it]

training loss: 3.27648663520813


training:   6%|▌         | 680/10986 [15:32<3:34:18,  1.25s/it]

training loss: 3.3177521228790283
valid loss: 3.3100340366363525
perplexity: 27.386056900024414


training:   6%|▌         | 681/10986 [15:34<4:41:12,  1.64s/it]

training loss: 3.2771122455596924


training:   6%|▌         | 682/10986 [15:36<4:38:27,  1.62s/it]

training loss: 3.2388131618499756


training:   6%|▌         | 683/10986 [15:37<4:20:10,  1.52s/it]

training loss: 3.259913682937622


training:   6%|▌         | 684/10986 [15:38<4:05:10,  1.43s/it]

training loss: 3.2486343383789062


training:   6%|▌         | 685/10986 [15:40<3:55:12,  1.37s/it]

training loss: 3.2341678142547607


training:   6%|▌         | 686/10986 [15:41<3:48:36,  1.33s/it]

training loss: 3.272266387939453


training:   6%|▋         | 687/10986 [15:42<3:43:26,  1.30s/it]

training loss: 3.31872820854187


training:   6%|▋         | 688/10986 [15:43<3:39:47,  1.28s/it]

training loss: 3.2802157402038574


training:   6%|▋         | 689/10986 [15:44<3:38:51,  1.28s/it]

training loss: 3.281294584274292


training:   6%|▋         | 690/10986 [15:46<3:37:32,  1.27s/it]

training loss: 3.3568432331085205


training:   6%|▋         | 691/10986 [15:47<3:49:56,  1.34s/it]

training loss: 3.300135612487793


training:   6%|▋         | 692/10986 [15:48<3:44:37,  1.31s/it]

training loss: 3.411484718322754


training:   6%|▋         | 693/10986 [15:50<3:40:14,  1.28s/it]

training loss: 3.276738166809082


training:   6%|▋         | 694/10986 [15:51<3:38:30,  1.27s/it]

training loss: 3.3450350761413574


training:   6%|▋         | 695/10986 [15:52<3:36:56,  1.26s/it]

training loss: 3.2539424896240234


training:   6%|▋         | 696/10986 [15:53<3:36:21,  1.26s/it]

training loss: 3.2691516876220703


training:   6%|▋         | 697/10986 [15:55<3:35:51,  1.26s/it]

training loss: 3.2347500324249268


training:   6%|▋         | 698/10986 [15:56<3:35:04,  1.25s/it]

training loss: 3.3380846977233887


training:   6%|▋         | 699/10986 [15:57<3:33:19,  1.24s/it]

training loss: 3.327848196029663


training:   6%|▋         | 700/10986 [15:58<3:32:40,  1.24s/it]

training loss: 3.3007497787475586
valid loss: 3.301729202270508
perplexity: 27.159563064575195


training:   6%|▋         | 701/10986 [16:01<4:42:51,  1.65s/it]

training loss: 3.183619260787964


training:   6%|▋         | 702/10986 [16:02<4:26:51,  1.56s/it]

training loss: 3.343238353729248


training:   6%|▋         | 703/10986 [16:04<4:09:37,  1.46s/it]

training loss: 3.271822929382324


training:   6%|▋         | 704/10986 [16:05<3:57:35,  1.39s/it]

training loss: 3.2782106399536133


training:   6%|▋         | 705/10986 [16:06<3:50:50,  1.35s/it]

training loss: 3.2418384552001953


training:   6%|▋         | 706/10986 [16:07<3:44:35,  1.31s/it]

training loss: 3.3319814205169678


training:   6%|▋         | 707/10986 [16:08<3:40:26,  1.29s/it]

training loss: 3.2775535583496094


training:   6%|▋         | 708/10986 [16:10<3:36:58,  1.27s/it]

training loss: 3.2415502071380615


training:   6%|▋         | 709/10986 [16:11<3:36:22,  1.26s/it]

training loss: 3.3623149394989014


training:   6%|▋         | 710/10986 [16:12<3:35:59,  1.26s/it]

training loss: 3.3424389362335205


training:   6%|▋         | 711/10986 [16:14<3:46:31,  1.32s/it]

training loss: 3.284111261367798


training:   6%|▋         | 712/10986 [16:15<3:43:49,  1.31s/it]

training loss: 3.2830841541290283


training:   6%|▋         | 713/10986 [16:16<3:40:07,  1.29s/it]

training loss: 3.2339675426483154


training:   6%|▋         | 714/10986 [16:17<3:38:43,  1.28s/it]

training loss: 3.3009138107299805


training:   7%|▋         | 715/10986 [16:19<3:36:38,  1.27s/it]

training loss: 3.200998544692993


training:   7%|▋         | 716/10986 [16:20<3:35:13,  1.26s/it]

training loss: 3.2101898193359375


training:   7%|▋         | 717/10986 [16:21<3:33:51,  1.25s/it]

training loss: 3.1439342498779297


training:   7%|▋         | 718/10986 [16:22<3:31:57,  1.24s/it]

training loss: 3.2477104663848877


training:   7%|▋         | 719/10986 [16:24<3:30:41,  1.23s/it]

training loss: 3.335160732269287


training:   7%|▋         | 720/10986 [16:25<3:31:46,  1.24s/it]

training loss: 3.2501323223114014
valid loss: 3.2504937648773193
perplexity: 25.803077697753906


training:   7%|▋         | 721/10986 [16:27<4:41:52,  1.65s/it]

training loss: 3.376446008682251


training:   7%|▋         | 722/10986 [16:29<4:40:52,  1.64s/it]

training loss: 3.1992640495300293


training:   7%|▋         | 723/10986 [16:30<4:20:41,  1.52s/it]

training loss: 3.3283557891845703


training:   7%|▋         | 724/10986 [16:32<4:04:53,  1.43s/it]

training loss: 3.2124783992767334


training:   7%|▋         | 725/10986 [16:33<3:54:29,  1.37s/it]

training loss: 3.2724995613098145


training:   7%|▋         | 726/10986 [16:34<3:46:19,  1.32s/it]

training loss: 3.3142433166503906


training:   7%|▋         | 727/10986 [16:35<3:40:39,  1.29s/it]

training loss: 3.292935371398926


training:   7%|▋         | 728/10986 [16:36<3:37:33,  1.27s/it]

training loss: 3.2034642696380615


training:   7%|▋         | 729/10986 [16:38<3:35:09,  1.26s/it]

training loss: 3.3197033405303955


training:   7%|▋         | 730/10986 [16:39<3:33:44,  1.25s/it]

training loss: 3.270534038543701


training:   7%|▋         | 731/10986 [16:40<3:44:40,  1.31s/it]

training loss: 3.2600739002227783


training:   7%|▋         | 732/10986 [16:42<3:59:17,  1.40s/it]

training loss: 3.2504773139953613


training:   7%|▋         | 733/10986 [16:43<3:50:03,  1.35s/it]

training loss: 3.2797961235046387


training:   7%|▋         | 734/10986 [16:44<3:43:29,  1.31s/it]

training loss: 3.2974870204925537


training:   7%|▋         | 735/10986 [16:46<3:39:14,  1.28s/it]

training loss: 3.2332253456115723


training:   7%|▋         | 736/10986 [16:47<3:37:36,  1.27s/it]

training loss: 3.186840772628784


training:   7%|▋         | 737/10986 [16:48<3:36:23,  1.27s/it]

training loss: 3.301318645477295


training:   7%|▋         | 738/10986 [16:49<3:38:39,  1.28s/it]

training loss: 3.1858952045440674


training:   7%|▋         | 739/10986 [16:51<3:37:15,  1.27s/it]

training loss: 3.198760986328125


training:   7%|▋         | 740/10986 [16:52<3:35:36,  1.26s/it]

training loss: 3.2161152362823486
valid loss: 3.2126083374023438
perplexity: 24.84380340576172


training:   7%|▋         | 741/10986 [16:55<4:44:42,  1.67s/it]

training loss: 3.2283637523651123


training:   7%|▋         | 742/10986 [16:56<4:26:10,  1.56s/it]

training loss: 3.3344388008117676


training:   7%|▋         | 743/10986 [16:57<4:09:59,  1.46s/it]

training loss: 3.3727686405181885


training:   7%|▋         | 744/10986 [16:58<3:57:27,  1.39s/it]

training loss: 3.4095396995544434


training:   7%|▋         | 745/10986 [17:00<3:50:16,  1.35s/it]

training loss: 3.316645622253418


training:   7%|▋         | 746/10986 [17:01<3:43:13,  1.31s/it]

training loss: 3.3329694271087646


training:   7%|▋         | 747/10986 [17:02<3:40:47,  1.29s/it]

training loss: 3.3440635204315186


training:   7%|▋         | 748/10986 [17:03<3:36:43,  1.27s/it]

training loss: 3.21142578125


training:   7%|▋         | 749/10986 [17:04<3:35:35,  1.26s/it]

training loss: 3.168980836868286


training:   7%|▋         | 750/10986 [17:06<3:34:53,  1.26s/it]

training loss: 3.1140825748443604


training:   7%|▋         | 751/10986 [17:07<3:45:39,  1.32s/it]

training loss: 3.288886785507202


training:   7%|▋         | 752/10986 [17:09<3:58:46,  1.40s/it]

training loss: 3.2783281803131104


training:   7%|▋         | 753/10986 [17:10<3:50:56,  1.35s/it]

training loss: 3.268430233001709


training:   7%|▋         | 754/10986 [17:11<3:44:32,  1.32s/it]

training loss: 3.2521533966064453


training:   7%|▋         | 755/10986 [17:12<3:40:09,  1.29s/it]

training loss: 3.206012725830078


training:   7%|▋         | 756/10986 [17:14<3:36:56,  1.27s/it]

training loss: 3.3149337768554688


training:   7%|▋         | 757/10986 [17:15<3:34:15,  1.26s/it]

training loss: 3.2576003074645996


training:   7%|▋         | 758/10986 [17:16<3:33:01,  1.25s/it]

training loss: 3.2656478881835938


training:   7%|▋         | 759/10986 [17:17<3:31:47,  1.24s/it]

training loss: 3.2168729305267334


training:   7%|▋         | 760/10986 [17:19<3:31:27,  1.24s/it]

training loss: 3.244009256362915
valid loss: 3.242283344268799
perplexity: 25.592090606689453


training:   7%|▋         | 761/10986 [17:21<4:41:34,  1.65s/it]

training loss: 3.282665491104126


training:   7%|▋         | 762/10986 [17:23<4:26:17,  1.56s/it]

training loss: 3.2538182735443115


training:   7%|▋         | 763/10986 [17:24<4:08:53,  1.46s/it]

training loss: 3.3394458293914795


training:   7%|▋         | 764/10986 [17:25<3:55:55,  1.38s/it]

training loss: 3.2894277572631836


training:   7%|▋         | 765/10986 [17:26<3:47:43,  1.34s/it]

training loss: 3.2136847972869873


training:   7%|▋         | 766/10986 [17:27<3:41:37,  1.30s/it]

training loss: 3.3682451248168945


training:   7%|▋         | 767/10986 [17:29<3:37:07,  1.27s/it]

training loss: 3.3008947372436523


training:   7%|▋         | 768/10986 [17:30<3:34:08,  1.26s/it]

training loss: 3.1735360622406006


training:   7%|▋         | 769/10986 [17:31<3:32:23,  1.25s/it]

training loss: 3.220689058303833


training:   7%|▋         | 770/10986 [17:32<3:32:00,  1.25s/it]

training loss: 3.268489360809326


training:   7%|▋         | 771/10986 [17:34<3:42:48,  1.31s/it]

training loss: 3.2462406158447266


training:   7%|▋         | 772/10986 [17:35<3:58:03,  1.40s/it]

training loss: 3.2153100967407227


training:   7%|▋         | 773/10986 [17:37<3:49:32,  1.35s/it]

training loss: 3.2735087871551514


training:   7%|▋         | 774/10986 [17:38<3:42:34,  1.31s/it]

training loss: 3.2009236812591553


training:   7%|▋         | 775/10986 [17:39<3:38:37,  1.28s/it]

training loss: 3.283794641494751


training:   7%|▋         | 776/10986 [17:40<3:34:52,  1.26s/it]

training loss: 3.1533210277557373


training:   7%|▋         | 777/10986 [17:42<3:34:13,  1.26s/it]

training loss: 3.282965660095215


training:   7%|▋         | 778/10986 [17:43<3:32:24,  1.25s/it]

training loss: 3.188041925430298


training:   7%|▋         | 779/10986 [17:44<3:30:57,  1.24s/it]

training loss: 3.264863967895508


training:   7%|▋         | 780/10986 [17:45<3:29:56,  1.23s/it]

training loss: 3.1989760398864746
valid loss: 3.193636178970337
perplexity: 24.376907348632812


training:   7%|▋         | 781/10986 [17:48<4:39:04,  1.64s/it]

training loss: 3.233243227005005


training:   7%|▋         | 782/10986 [17:49<4:23:37,  1.55s/it]

training loss: 3.2028675079345703


training:   7%|▋         | 783/10986 [17:50<4:11:08,  1.48s/it]

training loss: 3.274859666824341


training:   7%|▋         | 784/10986 [17:52<3:58:42,  1.40s/it]

training loss: 3.241636276245117


training:   7%|▋         | 785/10986 [17:53<3:49:33,  1.35s/it]

training loss: 3.333156108856201


training:   7%|▋         | 786/10986 [17:54<3:42:41,  1.31s/it]

training loss: 3.223137617111206


training:   7%|▋         | 787/10986 [17:55<3:37:48,  1.28s/it]

training loss: 3.213104724884033


training:   7%|▋         | 788/10986 [17:57<3:33:58,  1.26s/it]

training loss: 3.1856513023376465


training:   7%|▋         | 789/10986 [17:58<3:33:19,  1.26s/it]

training loss: 3.2130520343780518


training:   7%|▋         | 790/10986 [17:59<3:31:59,  1.25s/it]

training loss: 3.218975305557251


training:   7%|▋         | 791/10986 [18:01<3:43:53,  1.32s/it]

training loss: 3.367633104324341


training:   7%|▋         | 792/10986 [18:02<3:41:17,  1.30s/it]

training loss: 3.2285263538360596


training:   7%|▋         | 793/10986 [18:03<3:37:49,  1.28s/it]

training loss: 3.3138246536254883


training:   7%|▋         | 794/10986 [18:04<3:35:09,  1.27s/it]

training loss: 3.263524055480957


training:   7%|▋         | 795/10986 [18:06<3:34:58,  1.27s/it]

training loss: 3.1902196407318115


training:   7%|▋         | 796/10986 [18:07<3:33:09,  1.26s/it]

training loss: 3.309701442718506


training:   7%|▋         | 797/10986 [18:08<3:30:22,  1.24s/it]

training loss: 3.2948567867279053


training:   7%|▋         | 798/10986 [18:09<3:28:42,  1.23s/it]

training loss: 3.1565189361572266


training:   7%|▋         | 799/10986 [18:10<3:28:45,  1.23s/it]

training loss: 3.2835254669189453


training:   7%|▋         | 800/10986 [18:12<3:28:54,  1.23s/it]

training loss: 3.3108909130096436
valid loss: 3.304211139678955
perplexity: 27.227054595947266


training:   7%|▋         | 801/10986 [18:14<4:38:16,  1.64s/it]

training loss: 3.190701723098755


training:   7%|▋         | 802/10986 [18:16<4:40:13,  1.65s/it]

training loss: 3.223039150238037


training:   7%|▋         | 803/10986 [18:17<4:20:05,  1.53s/it]

training loss: 3.2647135257720947


training:   7%|▋         | 804/10986 [18:18<4:05:51,  1.45s/it]

training loss: 3.3005518913269043


training:   7%|▋         | 805/10986 [18:20<3:54:30,  1.38s/it]

training loss: 3.291053056716919


training:   7%|▋         | 806/10986 [18:21<3:49:54,  1.36s/it]

training loss: 3.2933268547058105


training:   7%|▋         | 807/10986 [18:22<3:44:04,  1.32s/it]

training loss: 3.2260093688964844


training:   7%|▋         | 808/10986 [18:23<3:39:16,  1.29s/it]

training loss: 3.2751946449279785


training:   7%|▋         | 809/10986 [18:25<3:35:57,  1.27s/it]

training loss: 3.3872761726379395


training:   7%|▋         | 810/10986 [18:26<3:33:34,  1.26s/it]

training loss: 3.4210920333862305


training:   7%|▋         | 811/10986 [18:27<3:44:27,  1.32s/it]

training loss: 3.293259620666504


training:   7%|▋         | 812/10986 [18:29<3:45:45,  1.33s/it]

training loss: 3.339155673980713


training:   7%|▋         | 813/10986 [18:30<3:39:19,  1.29s/it]

training loss: 3.2442333698272705


training:   7%|▋         | 814/10986 [18:31<3:36:13,  1.28s/it]

training loss: 3.2488763332366943


training:   7%|▋         | 815/10986 [18:32<3:33:32,  1.26s/it]

training loss: 3.3756957054138184


training:   7%|▋         | 816/10986 [18:34<3:31:25,  1.25s/it]

training loss: 3.416971206665039


training:   7%|▋         | 817/10986 [18:35<3:30:05,  1.24s/it]

training loss: 3.235990524291992


training:   7%|▋         | 818/10986 [18:36<3:30:15,  1.24s/it]

training loss: 3.364626884460449


training:   7%|▋         | 819/10986 [18:37<3:29:39,  1.24s/it]

training loss: 3.316328525543213


training:   7%|▋         | 820/10986 [18:38<3:27:57,  1.23s/it]

training loss: 3.3863885402679443
valid loss: 3.3788883686065674
perplexity: 29.338138580322266


training:   7%|▋         | 821/10986 [18:41<4:36:50,  1.63s/it]

training loss: 3.2457101345062256


training:   7%|▋         | 822/10986 [18:42<4:20:39,  1.54s/it]

training loss: 3.2484216690063477


training:   7%|▋         | 823/10986 [18:44<4:05:36,  1.45s/it]

training loss: 3.1984643936157227


training:   8%|▊         | 824/10986 [18:45<3:54:03,  1.38s/it]

training loss: 3.235858678817749


training:   8%|▊         | 825/10986 [18:46<3:46:27,  1.34s/it]

training loss: 3.296126365661621


training:   8%|▊         | 826/10986 [18:47<3:39:13,  1.29s/it]

training loss: 3.3217432498931885


training:   8%|▊         | 827/10986 [18:48<3:35:45,  1.27s/it]

training loss: 3.36629581451416


training:   8%|▊         | 828/10986 [18:50<3:33:06,  1.26s/it]

training loss: 3.276381254196167


training:   8%|▊         | 829/10986 [18:51<3:31:59,  1.25s/it]

training loss: 3.188109874725342


training:   8%|▊         | 830/10986 [18:52<3:31:45,  1.25s/it]

training loss: 3.301981210708618


training:   8%|▊         | 831/10986 [18:54<3:43:54,  1.32s/it]

training loss: 3.1999213695526123


training:   8%|▊         | 832/10986 [18:55<3:59:42,  1.42s/it]

training loss: 3.2366597652435303


training:   8%|▊         | 833/10986 [18:57<3:52:10,  1.37s/it]

training loss: 3.188929319381714


training:   8%|▊         | 834/10986 [18:58<3:45:56,  1.34s/it]

training loss: 3.1701927185058594


training:   8%|▊         | 835/10986 [18:59<3:41:50,  1.31s/it]

training loss: 3.229637622833252


training:   8%|▊         | 836/10986 [19:00<3:37:50,  1.29s/it]

training loss: 3.3131237030029297


training:   8%|▊         | 837/10986 [19:02<3:35:11,  1.27s/it]

training loss: 3.3914365768432617


training:   8%|▊         | 838/10986 [19:03<3:31:46,  1.25s/it]

training loss: 3.264841318130493


training:   8%|▊         | 839/10986 [19:04<3:31:00,  1.25s/it]

training loss: 3.3534271717071533


training:   8%|▊         | 840/10986 [19:05<3:30:40,  1.25s/it]

training loss: 3.4075353145599365
valid loss: 3.4012610912323
perplexity: 30.001911163330078


training:   8%|▊         | 841/10986 [19:08<4:38:08,  1.64s/it]

training loss: 3.204272747039795


training:   8%|▊         | 842/10986 [19:09<4:22:19,  1.55s/it]

training loss: 3.399143695831299


training:   8%|▊         | 843/10986 [19:10<4:07:19,  1.46s/it]

training loss: 3.3456685543060303


training:   8%|▊         | 844/10986 [19:12<3:55:51,  1.40s/it]

training loss: 3.3674113750457764


training:   8%|▊         | 845/10986 [19:13<3:47:49,  1.35s/it]

training loss: 3.222263813018799


training:   8%|▊         | 846/10986 [19:14<3:42:10,  1.31s/it]

training loss: 3.207571268081665


training:   8%|▊         | 847/10986 [19:15<3:37:19,  1.29s/it]

training loss: 3.341219425201416


training:   8%|▊         | 848/10986 [19:17<3:32:35,  1.26s/it]

training loss: 3.1358494758605957


training:   8%|▊         | 849/10986 [19:18<3:32:49,  1.26s/it]

training loss: 3.2927300930023193


training:   8%|▊         | 850/10986 [19:19<3:31:34,  1.25s/it]

training loss: 3.3057267665863037


training:   8%|▊         | 851/10986 [19:21<3:43:32,  1.32s/it]

training loss: 3.2543416023254395


training:   8%|▊         | 852/10986 [19:22<3:49:45,  1.36s/it]

training loss: 3.306854248046875


training:   8%|▊         | 853/10986 [19:23<3:44:31,  1.33s/it]

training loss: 3.3169631958007812


training:   8%|▊         | 854/10986 [19:24<3:39:34,  1.30s/it]

training loss: 3.21964955329895


training:   8%|▊         | 855/10986 [19:26<3:36:45,  1.28s/it]

training loss: 3.260425329208374


training:   8%|▊         | 856/10986 [19:27<3:35:50,  1.28s/it]

training loss: 3.3423187732696533


training:   8%|▊         | 857/10986 [19:29<3:56:23,  1.40s/it]

training loss: 3.322031259536743


training:   8%|▊         | 858/10986 [19:30<4:13:59,  1.50s/it]

training loss: 3.2838330268859863


training:   8%|▊         | 859/10986 [19:32<4:24:55,  1.57s/it]

training loss: 3.397559642791748


training:   8%|▊         | 860/10986 [19:34<4:25:29,  1.57s/it]

training loss: 3.2234537601470947
valid loss: 3.220463752746582
perplexity: 25.03972816467285


training:   8%|▊         | 861/10986 [19:36<5:18:23,  1.89s/it]

training loss: 3.21307373046875


training:   8%|▊         | 862/10986 [19:38<4:49:51,  1.72s/it]

training loss: 3.3409125804901123


training:   8%|▊         | 863/10986 [19:39<4:26:09,  1.58s/it]

training loss: 3.1660807132720947


training:   8%|▊         | 864/10986 [19:40<4:09:28,  1.48s/it]

training loss: 3.2540717124938965


training:   8%|▊         | 865/10986 [19:41<3:57:46,  1.41s/it]

training loss: 3.192495822906494


training:   8%|▊         | 866/10986 [19:43<3:49:29,  1.36s/it]

training loss: 3.248297691345215


training:   8%|▊         | 867/10986 [19:44<3:43:08,  1.32s/it]

training loss: 3.2269277572631836


training:   8%|▊         | 868/10986 [19:45<3:41:14,  1.31s/it]

training loss: 3.2509350776672363


training:   8%|▊         | 869/10986 [19:46<3:37:22,  1.29s/it]

training loss: 3.280735969543457


training:   8%|▊         | 870/10986 [19:48<3:34:05,  1.27s/it]

training loss: 3.21575665473938


training:   8%|▊         | 871/10986 [19:49<3:44:11,  1.33s/it]

training loss: 3.292694330215454


training:   8%|▊         | 872/10986 [19:50<3:43:29,  1.33s/it]

training loss: 3.250305652618408


training:   8%|▊         | 873/10986 [19:52<3:40:43,  1.31s/it]

training loss: 3.2533838748931885


training:   8%|▊         | 874/10986 [19:53<3:37:55,  1.29s/it]

training loss: 3.2295031547546387


training:   8%|▊         | 875/10986 [19:54<3:35:06,  1.28s/it]

training loss: 3.2292282581329346


training:   8%|▊         | 876/10986 [19:55<3:32:42,  1.26s/it]

training loss: 3.4723713397979736


training:   8%|▊         | 877/10986 [19:57<3:30:39,  1.25s/it]

training loss: 3.2276906967163086


training:   8%|▊         | 878/10986 [19:58<3:29:00,  1.24s/it]

training loss: 3.2263526916503906


training:   8%|▊         | 879/10986 [19:59<3:29:43,  1.24s/it]

training loss: 3.296773910522461


training:   8%|▊         | 880/10986 [20:00<3:29:36,  1.24s/it]

training loss: 3.266425609588623
valid loss: 3.2575416564941406
perplexity: 25.985578536987305


training:   8%|▊         | 881/10986 [20:03<4:38:26,  1.65s/it]

training loss: 3.2740564346313477


training:   8%|▊         | 882/10986 [20:04<4:21:50,  1.55s/it]

training loss: 3.374540090560913


training:   8%|▊         | 883/10986 [20:06<4:06:26,  1.46s/it]

training loss: 3.3122456073760986


training:   8%|▊         | 884/10986 [20:07<3:54:00,  1.39s/it]

training loss: 3.2396984100341797


training:   8%|▊         | 885/10986 [20:08<3:46:16,  1.34s/it]

training loss: 3.2500391006469727


training:   8%|▊         | 886/10986 [20:09<3:40:47,  1.31s/it]

training loss: 3.1909844875335693


training:   8%|▊         | 887/10986 [20:10<3:36:52,  1.29s/it]

training loss: 3.31398606300354


training:   8%|▊         | 888/10986 [20:12<3:35:39,  1.28s/it]

training loss: 3.1761467456817627


training:   8%|▊         | 889/10986 [20:13<3:33:46,  1.27s/it]

training loss: 3.1788692474365234


training:   8%|▊         | 890/10986 [20:14<3:33:26,  1.27s/it]

training loss: 3.220003843307495


training:   8%|▊         | 891/10986 [20:16<3:45:54,  1.34s/it]

training loss: 3.2576258182525635


training:   8%|▊         | 892/10986 [20:17<3:47:25,  1.35s/it]

training loss: 3.321540355682373


training:   8%|▊         | 893/10986 [20:18<3:41:51,  1.32s/it]

training loss: 3.232123374938965


training:   8%|▊         | 894/10986 [20:20<3:39:30,  1.31s/it]

training loss: 3.2926597595214844


training:   8%|▊         | 895/10986 [20:21<3:36:11,  1.29s/it]

training loss: 3.229336977005005


training:   8%|▊         | 896/10986 [20:22<3:35:54,  1.28s/it]

training loss: 3.2454047203063965


training:   8%|▊         | 897/10986 [20:23<3:33:02,  1.27s/it]

training loss: 3.2430036067962646


training:   8%|▊         | 898/10986 [20:25<3:31:51,  1.26s/it]

training loss: 3.298375368118286


training:   8%|▊         | 899/10986 [20:26<3:29:50,  1.25s/it]

training loss: 3.233375310897827


training:   8%|▊         | 900/10986 [20:27<3:28:34,  1.24s/it]

training loss: 3.232126474380493
valid loss: 3.237461566925049
perplexity: 25.4689884185791


training:   8%|▊         | 901/10986 [20:30<4:39:03,  1.66s/it]

training loss: 3.2931034564971924


training:   8%|▊         | 902/10986 [20:31<4:28:06,  1.60s/it]

training loss: 3.3483693599700928


training:   8%|▊         | 903/10986 [20:32<4:11:42,  1.50s/it]

training loss: 3.284017562866211


training:   8%|▊         | 904/10986 [20:34<3:57:45,  1.41s/it]

training loss: 3.3230252265930176


training:   8%|▊         | 905/10986 [20:35<3:48:07,  1.36s/it]

training loss: 3.286679744720459


training:   8%|▊         | 906/10986 [20:36<3:40:47,  1.31s/it]

training loss: 3.38910174369812


training:   8%|▊         | 907/10986 [20:37<3:35:42,  1.28s/it]

training loss: 3.266268014907837


training:   8%|▊         | 908/10986 [20:39<3:33:37,  1.27s/it]

training loss: 3.2600157260894775


training:   8%|▊         | 909/10986 [20:40<3:31:02,  1.26s/it]

training loss: 3.3234317302703857


training:   8%|▊         | 910/10986 [20:41<3:29:31,  1.25s/it]

training loss: 3.1753721237182617


training:   8%|▊         | 911/10986 [20:42<3:43:13,  1.33s/it]

training loss: 3.3490567207336426


training:   8%|▊         | 912/10986 [20:44<3:39:19,  1.31s/it]

training loss: 3.274740695953369


training:   8%|▊         | 913/10986 [20:45<3:35:08,  1.28s/it]

training loss: 3.2403478622436523


training:   8%|▊         | 914/10986 [20:46<3:31:34,  1.26s/it]

training loss: 3.153299331665039


training:   8%|▊         | 915/10986 [20:47<3:30:29,  1.25s/it]

training loss: 3.265868902206421


training:   8%|▊         | 916/10986 [20:49<3:28:59,  1.25s/it]

training loss: 3.187251091003418


training:   8%|▊         | 917/10986 [20:50<3:28:15,  1.24s/it]

training loss: 3.2934885025024414


training:   8%|▊         | 918/10986 [20:51<3:27:17,  1.24s/it]

training loss: 3.3034746646881104


training:   8%|▊         | 919/10986 [20:52<3:30:32,  1.25s/it]

training loss: 3.2479512691497803


training:   8%|▊         | 920/10986 [20:54<3:29:21,  1.25s/it]

training loss: 3.2905113697052
valid loss: 3.287821054458618
perplexity: 26.784439086914062


training:   8%|▊         | 921/10986 [20:56<4:33:24,  1.63s/it]

training loss: 3.176764726638794


training:   8%|▊         | 922/10986 [20:57<4:16:38,  1.53s/it]

training loss: 3.2159063816070557


training:   8%|▊         | 923/10986 [20:59<4:00:08,  1.43s/it]

training loss: 3.2667694091796875


training:   8%|▊         | 924/10986 [21:00<3:47:34,  1.36s/it]

training loss: 3.301018476486206


training:   8%|▊         | 925/10986 [21:01<3:41:19,  1.32s/it]

training loss: 3.269465446472168


training:   8%|▊         | 926/10986 [21:02<3:36:29,  1.29s/it]

training loss: 3.437561273574829


training:   8%|▊         | 927/10986 [21:03<3:32:12,  1.27s/it]

training loss: 3.284165620803833


training:   8%|▊         | 928/10986 [21:05<3:29:04,  1.25s/it]

training loss: 3.338444709777832


training:   8%|▊         | 929/10986 [21:06<3:27:36,  1.24s/it]

training loss: 3.1580960750579834


training:   8%|▊         | 930/10986 [21:07<3:26:51,  1.23s/it]

training loss: 3.183170795440674


training:   8%|▊         | 931/10986 [21:09<3:38:50,  1.31s/it]

training loss: 3.2358219623565674


training:   8%|▊         | 932/10986 [21:10<3:34:56,  1.28s/it]

training loss: 3.2589051723480225


training:   8%|▊         | 933/10986 [21:11<3:30:55,  1.26s/it]

training loss: 3.3283255100250244


training:   9%|▊         | 934/10986 [21:12<3:27:32,  1.24s/it]

training loss: 3.374207019805908


training:   9%|▊         | 935/10986 [21:13<3:26:04,  1.23s/it]

training loss: 3.2297980785369873


training:   9%|▊         | 936/10986 [21:15<3:24:58,  1.22s/it]

training loss: 3.280089855194092


training:   9%|▊         | 937/10986 [21:16<3:23:40,  1.22s/it]

training loss: 3.299671173095703


training:   9%|▊         | 938/10986 [21:17<3:23:15,  1.21s/it]

training loss: 3.233612537384033


training:   9%|▊         | 939/10986 [21:18<3:23:30,  1.22s/it]

training loss: 3.204162120819092


training:   9%|▊         | 940/10986 [21:19<3:22:51,  1.21s/it]

training loss: 3.2440249919891357
valid loss: 3.2436232566833496
perplexity: 25.62640380859375


training:   9%|▊         | 941/10986 [21:22<4:32:33,  1.63s/it]

training loss: 3.233752727508545


training:   9%|▊         | 942/10986 [21:23<4:17:14,  1.54s/it]

training loss: 3.2086398601531982


training:   9%|▊         | 943/10986 [21:25<4:03:49,  1.46s/it]

training loss: 3.2414865493774414


training:   9%|▊         | 944/10986 [21:26<3:51:09,  1.38s/it]

training loss: 3.2264764308929443


training:   9%|▊         | 945/10986 [21:27<3:42:57,  1.33s/it]

training loss: 3.1800425052642822


training:   9%|▊         | 946/10986 [21:28<3:36:46,  1.30s/it]

training loss: 3.2132270336151123


training:   9%|▊         | 947/10986 [21:30<3:32:59,  1.27s/it]

training loss: 3.128074884414673


training:   9%|▊         | 948/10986 [21:31<3:31:06,  1.26s/it]

training loss: 3.297501564025879


training:   9%|▊         | 949/10986 [21:32<3:30:09,  1.26s/it]

training loss: 3.387115716934204


training:   9%|▊         | 950/10986 [21:33<3:27:56,  1.24s/it]

training loss: 3.1892776489257812


training:   9%|▊         | 951/10986 [21:35<3:38:52,  1.31s/it]

training loss: 3.2767438888549805


training:   9%|▊         | 952/10986 [21:36<3:37:23,  1.30s/it]

training loss: 3.2704989910125732


training:   9%|▊         | 953/10986 [21:37<3:34:25,  1.28s/it]

training loss: 3.2430155277252197


training:   9%|▊         | 954/10986 [21:38<3:33:23,  1.28s/it]

training loss: 3.3158669471740723


training:   9%|▊         | 955/10986 [21:40<3:31:45,  1.27s/it]

training loss: 3.2182445526123047


training:   9%|▊         | 956/10986 [21:41<3:29:51,  1.26s/it]

training loss: 3.3604135513305664


training:   9%|▊         | 957/10986 [21:42<3:28:29,  1.25s/it]

training loss: 3.3033742904663086


training:   9%|▊         | 958/10986 [21:43<3:27:39,  1.24s/it]

training loss: 3.2777159214019775


training:   9%|▊         | 959/10986 [21:45<3:26:43,  1.24s/it]

training loss: 3.3080549240112305


training:   9%|▊         | 960/10986 [21:46<3:26:52,  1.24s/it]

training loss: 3.261090040206909
valid loss: 3.251239538192749
perplexity: 25.82232666015625


training:   9%|▊         | 961/10986 [21:48<4:32:30,  1.63s/it]

training loss: 3.2654449939727783


training:   9%|▉         | 962/10986 [21:50<4:19:31,  1.55s/it]

training loss: 3.2941343784332275


training:   9%|▉         | 963/10986 [21:51<4:03:29,  1.46s/it]

training loss: 3.348548650741577


training:   9%|▉         | 964/10986 [21:52<3:51:40,  1.39s/it]

training loss: 3.194007396697998


training:   9%|▉         | 965/10986 [21:54<3:47:34,  1.36s/it]

training loss: 3.261874198913574


training:   9%|▉         | 966/10986 [21:55<3:41:44,  1.33s/it]

training loss: 3.2768707275390625


training:   9%|▉         | 967/10986 [21:56<3:39:06,  1.31s/it]

training loss: 3.2598190307617188


training:   9%|▉         | 968/10986 [21:57<3:36:02,  1.29s/it]

training loss: 3.2358920574188232


training:   9%|▉         | 969/10986 [21:59<3:34:26,  1.28s/it]

training loss: 3.3136613368988037


training:   9%|▉         | 970/10986 [22:00<3:32:13,  1.27s/it]

training loss: 3.2207579612731934


training:   9%|▉         | 971/10986 [22:01<3:42:50,  1.34s/it]

training loss: 3.1956164836883545


training:   9%|▉         | 972/10986 [22:03<3:40:44,  1.32s/it]

training loss: 3.2786266803741455


training:   9%|▉         | 973/10986 [22:04<3:37:07,  1.30s/it]

training loss: 3.2004506587982178


training:   9%|▉         | 974/10986 [22:05<3:34:30,  1.29s/it]

training loss: 3.272897243499756


training:   9%|▉         | 975/10986 [22:06<3:33:52,  1.28s/it]

training loss: 3.31392502784729


training:   9%|▉         | 976/10986 [22:08<3:32:03,  1.27s/it]

training loss: 3.1860544681549072


training:   9%|▉         | 977/10986 [22:09<3:33:50,  1.28s/it]

training loss: 3.274840831756592


training:   9%|▉         | 978/10986 [22:10<3:32:36,  1.27s/it]

training loss: 3.2560606002807617


training:   9%|▉         | 979/10986 [22:11<3:31:37,  1.27s/it]

training loss: 3.1516273021698


training:   9%|▉         | 980/10986 [22:13<3:31:16,  1.27s/it]

training loss: 3.29341721534729
valid loss: 3.282304048538208
perplexity: 26.637075424194336


training:   9%|▉         | 981/10986 [22:15<4:39:33,  1.68s/it]

training loss: 3.222205400466919


training:   9%|▉         | 982/10986 [22:17<4:23:43,  1.58s/it]

training loss: 3.2949142456054688


training:   9%|▉         | 983/10986 [22:18<4:07:32,  1.48s/it]

training loss: 3.180043935775757


training:   9%|▉         | 984/10986 [22:19<3:56:31,  1.42s/it]

training loss: 3.278292179107666


training:   9%|▉         | 985/10986 [22:20<3:48:14,  1.37s/it]

training loss: 3.3250200748443604


training:   9%|▉         | 986/10986 [22:22<3:42:02,  1.33s/it]

training loss: 3.1821277141571045


training:   9%|▉         | 987/10986 [22:23<3:38:09,  1.31s/it]

training loss: 3.247736930847168


training:   9%|▉         | 988/10986 [22:24<3:35:41,  1.29s/it]

training loss: 3.2181777954101562


training:   9%|▉         | 989/10986 [22:25<3:34:06,  1.29s/it]

training loss: 3.25576114654541


training:   9%|▉         | 990/10986 [22:27<3:33:03,  1.28s/it]

training loss: 3.2672476768493652


training:   9%|▉         | 991/10986 [22:28<3:44:01,  1.34s/it]

training loss: 3.320789098739624


training:   9%|▉         | 992/10986 [22:30<3:57:34,  1.43s/it]

training loss: 3.250389575958252


training:   9%|▉         | 993/10986 [22:31<3:49:00,  1.38s/it]

training loss: 3.3123080730438232


training:   9%|▉         | 994/10986 [22:32<3:43:13,  1.34s/it]

training loss: 3.147329330444336


training:   9%|▉         | 995/10986 [22:34<3:40:02,  1.32s/it]

training loss: 3.269408941268921


training:   9%|▉         | 996/10986 [22:35<3:36:30,  1.30s/it]

training loss: 3.2232656478881836


training:   9%|▉         | 997/10986 [22:36<3:33:44,  1.28s/it]

training loss: 3.2410926818847656


training:   9%|▉         | 998/10986 [22:37<3:33:06,  1.28s/it]

training loss: 3.2030961513519287


training:   9%|▉         | 999/10986 [22:39<3:32:24,  1.28s/it]

training loss: 3.1860010623931885


training:   9%|▉         | 1000/10986 [22:40<3:31:59,  1.27s/it]

training loss: 3.1222198009490967
valid loss: 3.1277458667755127
perplexity: 22.822477340698242


training:   9%|▉         | 1001/10986 [22:43<4:40:47,  1.69s/it]

training loss: 3.271968364715576


training:   9%|▉         | 1002/10986 [22:44<4:22:25,  1.58s/it]

training loss: 3.335857391357422


training:   9%|▉         | 1003/10986 [22:45<4:06:27,  1.48s/it]

training loss: 3.1421847343444824


training:   9%|▉         | 1004/10986 [22:46<3:55:50,  1.42s/it]

training loss: 3.2026970386505127


training:   9%|▉         | 1005/10986 [22:48<3:46:55,  1.36s/it]

training loss: 3.34499454498291


training:   9%|▉         | 1006/10986 [22:49<3:39:51,  1.32s/it]

training loss: 3.1829655170440674


training:   9%|▉         | 1007/10986 [22:50<3:36:08,  1.30s/it]

training loss: 3.1913630962371826


training:   9%|▉         | 1008/10986 [22:51<3:33:12,  1.28s/it]

training loss: 3.27734112739563


training:   9%|▉         | 1009/10986 [22:53<3:31:01,  1.27s/it]

training loss: 3.309622287750244


training:   9%|▉         | 1010/10986 [22:54<3:29:20,  1.26s/it]

training loss: 3.2407889366149902


training:   9%|▉         | 1011/10986 [22:55<3:39:22,  1.32s/it]

training loss: 3.2967472076416016


training:   9%|▉         | 1012/10986 [22:57<3:37:08,  1.31s/it]

training loss: 3.185992479324341


training:   9%|▉         | 1013/10986 [22:58<3:33:44,  1.29s/it]

training loss: 3.3032405376434326


training:   9%|▉         | 1014/10986 [22:59<3:31:49,  1.27s/it]

training loss: 3.289538621902466


training:   9%|▉         | 1015/10986 [23:00<3:31:11,  1.27s/it]

training loss: 3.254359245300293


training:   9%|▉         | 1016/10986 [23:02<3:30:29,  1.27s/it]

training loss: 3.307446002960205


training:   9%|▉         | 1017/10986 [23:03<3:29:19,  1.26s/it]

training loss: 3.351720094680786


training:   9%|▉         | 1018/10986 [23:04<3:28:16,  1.25s/it]

training loss: 3.210397481918335


training:   9%|▉         | 1019/10986 [23:05<3:28:06,  1.25s/it]

training loss: 3.2941572666168213


training:   9%|▉         | 1020/10986 [23:07<3:29:02,  1.26s/it]

training loss: 3.263089179992676
valid loss: 3.25559139251709
perplexity: 25.934947967529297


training:   9%|▉         | 1021/10986 [23:09<4:37:17,  1.67s/it]

training loss: 3.292994976043701


training:   9%|▉         | 1022/10986 [23:11<4:21:15,  1.57s/it]

training loss: 3.277355432510376


training:   9%|▉         | 1023/10986 [23:12<4:05:48,  1.48s/it]

training loss: 3.2515456676483154


training:   9%|▉         | 1024/10986 [23:13<3:54:07,  1.41s/it]

training loss: 3.241006374359131


training:   9%|▉         | 1025/10986 [23:14<3:45:28,  1.36s/it]

training loss: 3.1899940967559814


training:   9%|▉         | 1026/10986 [23:16<3:39:56,  1.32s/it]

training loss: 3.183821678161621


training:   9%|▉         | 1027/10986 [23:17<3:36:28,  1.30s/it]

training loss: 3.244713306427002


training:   9%|▉         | 1028/10986 [23:18<3:34:34,  1.29s/it]

training loss: 3.155043363571167


training:   9%|▉         | 1029/10986 [23:19<3:33:52,  1.29s/it]

training loss: 3.2760086059570312


training:   9%|▉         | 1030/10986 [23:21<3:31:40,  1.28s/it]

training loss: 3.300053358078003


training:   9%|▉         | 1031/10986 [23:22<3:41:43,  1.34s/it]

training loss: 3.1969308853149414


training:   9%|▉         | 1032/10986 [23:23<3:40:48,  1.33s/it]

training loss: 3.3653342723846436


training:   9%|▉         | 1033/10986 [23:25<3:39:19,  1.32s/it]

training loss: 3.298213005065918


training:   9%|▉         | 1034/10986 [23:26<3:35:35,  1.30s/it]

training loss: 3.2210071086883545


training:   9%|▉         | 1035/10986 [23:27<3:33:28,  1.29s/it]

training loss: 3.3938486576080322


training:   9%|▉         | 1036/10986 [23:29<3:31:18,  1.27s/it]

training loss: 3.260409355163574


training:   9%|▉         | 1037/10986 [23:30<3:29:18,  1.26s/it]

training loss: 3.2694647312164307


training:   9%|▉         | 1038/10986 [23:31<3:28:03,  1.25s/it]

training loss: 3.264272928237915


training:   9%|▉         | 1039/10986 [23:32<3:27:12,  1.25s/it]

training loss: 3.1748862266540527


training:   9%|▉         | 1040/10986 [23:33<3:26:10,  1.24s/it]

training loss: 3.282799482345581
valid loss: 3.2777960300445557
perplexity: 26.51726531982422


training:   9%|▉         | 1041/10986 [23:36<4:34:18,  1.65s/it]

training loss: 3.2228362560272217


training:   9%|▉         | 1042/10986 [23:37<4:18:05,  1.56s/it]

training loss: 3.386183738708496


training:   9%|▉         | 1043/10986 [23:39<4:03:52,  1.47s/it]

training loss: 3.327211380004883


training:  10%|▉         | 1044/10986 [23:40<3:51:47,  1.40s/it]

training loss: 3.2412595748901367


training:  10%|▉         | 1045/10986 [23:41<3:43:46,  1.35s/it]

training loss: 3.2605693340301514


training:  10%|▉         | 1046/10986 [23:42<3:38:23,  1.32s/it]

training loss: 3.221205234527588


training:  10%|▉         | 1047/10986 [23:44<3:33:41,  1.29s/it]

training loss: 3.2625486850738525


training:  10%|▉         | 1048/10986 [23:45<3:30:41,  1.27s/it]

training loss: 3.276646614074707


training:  10%|▉         | 1049/10986 [23:46<3:27:35,  1.25s/it]

training loss: 3.2730371952056885


training:  10%|▉         | 1050/10986 [23:47<3:25:55,  1.24s/it]

training loss: 3.3050410747528076


training:  10%|▉         | 1051/10986 [23:49<3:37:03,  1.31s/it]

training loss: 3.3069021701812744


training:  10%|▉         | 1052/10986 [23:50<3:36:27,  1.31s/it]

training loss: 3.20810604095459


training:  10%|▉         | 1053/10986 [23:51<3:34:14,  1.29s/it]

training loss: 3.2675726413726807


training:  10%|▉         | 1054/10986 [23:53<3:31:59,  1.28s/it]

training loss: 3.3305201530456543


training:  10%|▉         | 1055/10986 [23:54<3:29:19,  1.26s/it]

training loss: 3.4470138549804688


training:  10%|▉         | 1056/10986 [23:55<3:28:09,  1.26s/it]

training loss: 3.310819387435913


training:  10%|▉         | 1057/10986 [23:56<3:27:03,  1.25s/it]

training loss: 3.17401123046875


training:  10%|▉         | 1058/10986 [23:57<3:26:22,  1.25s/it]

training loss: 3.171949863433838


training:  10%|▉         | 1059/10986 [23:59<3:27:01,  1.25s/it]

training loss: 3.2527856826782227


training:  10%|▉         | 1060/10986 [24:00<3:26:39,  1.25s/it]

training loss: 3.2833216190338135
valid loss: 3.2871501445770264
perplexity: 26.766475677490234


training:  10%|▉         | 1061/10986 [24:03<4:33:51,  1.66s/it]

training loss: 3.3204777240753174


training:  10%|▉         | 1062/10986 [24:04<4:18:48,  1.56s/it]

training loss: 3.32275128364563


training:  10%|▉         | 1063/10986 [24:05<4:02:38,  1.47s/it]

training loss: 3.323065757751465


training:  10%|▉         | 1064/10986 [24:06<3:53:52,  1.41s/it]

training loss: 3.3982205390930176


training:  10%|▉         | 1065/10986 [24:08<3:45:02,  1.36s/it]

training loss: 3.3437743186950684


training:  10%|▉         | 1066/10986 [24:09<3:39:27,  1.33s/it]

training loss: 3.216029167175293


training:  10%|▉         | 1067/10986 [24:10<3:36:09,  1.31s/it]

training loss: 3.330141544342041


training:  10%|▉         | 1068/10986 [24:11<3:33:53,  1.29s/it]

training loss: 3.249072551727295


training:  10%|▉         | 1069/10986 [24:13<3:33:30,  1.29s/it]

training loss: 3.245729923248291


training:  10%|▉         | 1070/10986 [24:14<3:33:08,  1.29s/it]

training loss: 3.195364236831665


training:  10%|▉         | 1071/10986 [24:16<3:45:43,  1.37s/it]

training loss: 3.2484560012817383


training:  10%|▉         | 1072/10986 [24:17<3:42:09,  1.34s/it]

training loss: 3.399789810180664


training:  10%|▉         | 1073/10986 [24:18<3:38:28,  1.32s/it]

training loss: 3.2193455696105957


training:  10%|▉         | 1074/10986 [24:19<3:36:46,  1.31s/it]

training loss: 3.3378024101257324


training:  10%|▉         | 1075/10986 [24:21<3:34:17,  1.30s/it]

training loss: 3.323500871658325


training:  10%|▉         | 1076/10986 [24:22<3:33:17,  1.29s/it]

training loss: 3.255589008331299


training:  10%|▉         | 1077/10986 [24:23<3:32:50,  1.29s/it]

training loss: 3.251877784729004


training:  10%|▉         | 1078/10986 [24:25<3:33:36,  1.29s/it]

training loss: 3.2986080646514893


training:  10%|▉         | 1079/10986 [24:26<3:32:53,  1.29s/it]

training loss: 3.2560150623321533


training:  10%|▉         | 1080/10986 [24:27<3:32:10,  1.29s/it]

training loss: 3.29205322265625
valid loss: 3.286752700805664
perplexity: 26.75583839416504


training:  10%|▉         | 1081/10986 [24:30<4:37:15,  1.68s/it]

training loss: 3.236966609954834


training:  10%|▉         | 1082/10986 [24:31<4:20:54,  1.58s/it]

training loss: 3.2892935276031494


training:  10%|▉         | 1083/10986 [24:32<4:06:07,  1.49s/it]

training loss: 3.2957122325897217


training:  10%|▉         | 1084/10986 [24:34<3:54:21,  1.42s/it]

training loss: 3.32491397857666


training:  10%|▉         | 1085/10986 [24:35<3:45:51,  1.37s/it]

training loss: 3.315809488296509


training:  10%|▉         | 1086/10986 [24:36<3:40:40,  1.34s/it]

training loss: 3.2927796840667725


training:  10%|▉         | 1087/10986 [24:37<3:36:22,  1.31s/it]

training loss: 3.2455058097839355


training:  10%|▉         | 1088/10986 [24:39<3:33:42,  1.30s/it]

training loss: 3.2131829261779785


training:  10%|▉         | 1089/10986 [24:40<3:33:13,  1.29s/it]

training loss: 3.2974374294281006


training:  10%|▉         | 1090/10986 [24:41<3:45:23,  1.37s/it]

training loss: 3.1845242977142334


training:  10%|▉         | 1091/10986 [24:44<4:22:41,  1.59s/it]

training loss: 3.2099413871765137


training:  10%|▉         | 1092/10986 [24:45<4:35:19,  1.67s/it]

training loss: 3.249577522277832


training:  10%|▉         | 1093/10986 [24:47<4:39:03,  1.69s/it]

training loss: 3.2405850887298584


training:  10%|▉         | 1094/10986 [24:48<4:17:23,  1.56s/it]

training loss: 3.350201368331909


training:  10%|▉         | 1095/10986 [24:50<4:01:13,  1.46s/it]

training loss: 3.197335958480835


training:  10%|▉         | 1096/10986 [24:51<3:50:40,  1.40s/it]

training loss: 3.318798065185547


training:  10%|▉         | 1097/10986 [24:52<3:43:38,  1.36s/it]

training loss: 3.3117594718933105


training:  10%|▉         | 1098/10986 [24:53<3:38:27,  1.33s/it]

training loss: 3.370875835418701


training:  10%|█         | 1099/10986 [24:55<3:35:46,  1.31s/it]

training loss: 3.3827528953552246


training:  10%|█         | 1100/10986 [24:56<3:34:56,  1.30s/it]

training loss: 3.2794785499572754
valid loss: 3.25692081451416
perplexity: 25.969449996948242


training:  10%|█         | 1101/10986 [24:59<4:40:52,  1.70s/it]

training loss: 3.2661659717559814


training:  10%|█         | 1102/10986 [25:00<4:21:16,  1.59s/it]

training loss: 3.282560348510742


training:  10%|█         | 1103/10986 [25:01<4:03:05,  1.48s/it]

training loss: 3.238058090209961


training:  10%|█         | 1104/10986 [25:02<3:52:15,  1.41s/it]

training loss: 3.247563123703003


training:  10%|█         | 1105/10986 [25:04<3:44:25,  1.36s/it]

training loss: 3.3361268043518066


training:  10%|█         | 1106/10986 [25:05<3:38:44,  1.33s/it]

training loss: 3.374300003051758


training:  10%|█         | 1107/10986 [25:06<3:35:41,  1.31s/it]

training loss: 3.2497596740722656


training:  10%|█         | 1108/10986 [25:07<3:33:40,  1.30s/it]

training loss: 3.192444324493408


training:  10%|█         | 1109/10986 [25:09<3:31:33,  1.29s/it]

training loss: 3.204508066177368


training:  10%|█         | 1110/10986 [25:10<3:29:43,  1.27s/it]

training loss: 3.2198498249053955


training:  10%|█         | 1111/10986 [25:11<3:39:43,  1.34s/it]

training loss: 3.205441474914551


training:  10%|█         | 1112/10986 [25:13<3:34:25,  1.30s/it]

training loss: 3.2417163848876953


training:  10%|█         | 1113/10986 [25:14<3:30:24,  1.28s/it]

training loss: 3.3249096870422363


training:  10%|█         | 1114/10986 [25:15<3:26:24,  1.25s/it]

training loss: 3.262831211090088


training:  10%|█         | 1115/10986 [25:16<3:23:59,  1.24s/it]

training loss: 3.274918794631958


training:  10%|█         | 1116/10986 [25:18<3:23:29,  1.24s/it]

training loss: 3.3022444248199463


training:  10%|█         | 1117/10986 [25:19<3:21:57,  1.23s/it]

training loss: 3.240645408630371


training:  10%|█         | 1118/10986 [25:20<3:20:46,  1.22s/it]

training loss: 3.392080783843994


training:  10%|█         | 1119/10986 [25:21<3:19:35,  1.21s/it]

training loss: 3.238144874572754


training:  10%|█         | 1120/10986 [25:22<3:18:16,  1.21s/it]

training loss: 3.2306690216064453
valid loss: 3.228867530822754
perplexity: 25.25104522705078


training:  10%|█         | 1121/10986 [25:25<4:24:13,  1.61s/it]

training loss: 3.2591991424560547


training:  10%|█         | 1122/10986 [25:26<4:16:02,  1.56s/it]

training loss: 3.300525665283203


training:  10%|█         | 1123/10986 [25:28<3:58:34,  1.45s/it]

training loss: 3.236790180206299


training:  10%|█         | 1124/10986 [25:29<3:46:39,  1.38s/it]

training loss: 3.262965202331543


training:  10%|█         | 1125/10986 [25:30<3:38:35,  1.33s/it]

training loss: 3.2651851177215576


training:  10%|█         | 1126/10986 [25:31<3:31:45,  1.29s/it]

training loss: 3.2928805351257324


training:  10%|█         | 1127/10986 [25:32<3:28:31,  1.27s/it]

training loss: 3.2193446159362793


training:  10%|█         | 1128/10986 [25:34<3:25:44,  1.25s/it]

training loss: 3.2537200450897217


training:  10%|█         | 1129/10986 [25:35<3:24:43,  1.25s/it]

training loss: 3.374098300933838


training:  10%|█         | 1130/10986 [25:36<3:22:37,  1.23s/it]

training loss: 3.3314905166625977


training:  10%|█         | 1131/10986 [25:37<3:34:10,  1.30s/it]

training loss: 3.1010820865631104


training:  10%|█         | 1132/10986 [25:39<3:34:35,  1.31s/it]

training loss: 3.2065114974975586


training:  10%|█         | 1133/10986 [25:40<3:31:08,  1.29s/it]

training loss: 3.301267147064209


training:  10%|█         | 1134/10986 [25:41<3:27:38,  1.26s/it]

training loss: 3.2888808250427246


training:  10%|█         | 1135/10986 [25:42<3:25:10,  1.25s/it]

training loss: 3.3140149116516113


training:  10%|█         | 1136/10986 [25:44<3:23:43,  1.24s/it]

training loss: 3.205042600631714


training:  10%|█         | 1137/10986 [25:45<3:22:52,  1.24s/it]

training loss: 3.2497236728668213


training:  10%|█         | 1138/10986 [25:46<3:21:38,  1.23s/it]

training loss: 3.1878528594970703


training:  10%|█         | 1139/10986 [25:47<3:19:55,  1.22s/it]

training loss: 3.2334299087524414


training:  10%|█         | 1140/10986 [25:48<3:18:41,  1.21s/it]

training loss: 3.206759452819824
valid loss: 3.200417995452881
perplexity: 24.542787551879883


training:  10%|█         | 1141/10986 [25:51<4:25:33,  1.62s/it]

training loss: 3.2758748531341553


training:  10%|█         | 1142/10986 [25:52<4:15:51,  1.56s/it]

training loss: 3.2721331119537354


training:  10%|█         | 1143/10986 [25:54<4:01:05,  1.47s/it]

training loss: 3.26487135887146


training:  10%|█         | 1144/10986 [25:55<3:49:16,  1.40s/it]

training loss: 3.26481294631958


training:  10%|█         | 1145/10986 [25:56<3:42:57,  1.36s/it]

training loss: 3.2522664070129395


training:  10%|█         | 1146/10986 [25:57<3:36:33,  1.32s/it]

training loss: 3.2375845909118652


training:  10%|█         | 1147/10986 [25:59<3:33:37,  1.30s/it]

training loss: 3.3380253314971924


training:  10%|█         | 1148/10986 [26:00<3:29:58,  1.28s/it]

training loss: 3.3066611289978027


training:  10%|█         | 1149/10986 [26:01<3:28:21,  1.27s/it]

training loss: 3.3285293579101562


training:  10%|█         | 1150/10986 [26:02<3:27:06,  1.26s/it]

training loss: 3.337381362915039


training:  10%|█         | 1151/10986 [26:04<3:38:52,  1.34s/it]

training loss: 3.3412582874298096


training:  10%|█         | 1152/10986 [26:05<3:47:26,  1.39s/it]

training loss: 3.2444710731506348


training:  10%|█         | 1153/10986 [26:07<3:40:26,  1.35s/it]

training loss: 3.3138480186462402


training:  11%|█         | 1154/10986 [26:08<3:34:55,  1.31s/it]

training loss: 3.3095717430114746


training:  11%|█         | 1155/10986 [26:09<3:31:07,  1.29s/it]

training loss: 3.191293716430664


training:  11%|█         | 1156/10986 [26:10<3:28:52,  1.27s/it]

training loss: 3.270573377609253


training:  11%|█         | 1157/10986 [26:12<3:27:07,  1.26s/it]

training loss: 3.211622476577759


training:  11%|█         | 1158/10986 [26:13<3:24:41,  1.25s/it]

training loss: 3.252424716949463


training:  11%|█         | 1159/10986 [26:14<3:23:23,  1.24s/it]

training loss: 3.32686710357666


training:  11%|█         | 1160/10986 [26:15<3:21:51,  1.23s/it]

training loss: 3.2770795822143555
valid loss: 3.2725741863250732
perplexity: 26.3791561126709


training:  11%|█         | 1161/10986 [26:18<4:28:48,  1.64s/it]

training loss: 3.2861554622650146


training:  11%|█         | 1162/10986 [26:19<4:12:25,  1.54s/it]

training loss: 3.313845157623291


training:  11%|█         | 1163/10986 [26:20<3:57:30,  1.45s/it]

training loss: 3.321606159210205


training:  11%|█         | 1164/10986 [26:22<3:45:34,  1.38s/it]

training loss: 3.3828537464141846


training:  11%|█         | 1165/10986 [26:23<3:37:57,  1.33s/it]

training loss: 3.241311550140381


training:  11%|█         | 1166/10986 [26:24<3:32:07,  1.30s/it]

training loss: 3.3291687965393066


training:  11%|█         | 1167/10986 [26:25<3:27:27,  1.27s/it]

training loss: 3.2626333236694336


training:  11%|█         | 1168/10986 [26:27<3:25:14,  1.25s/it]

training loss: 3.2698490619659424


training:  11%|█         | 1169/10986 [26:28<3:23:15,  1.24s/it]

training loss: 3.3852028846740723


training:  11%|█         | 1170/10986 [26:29<3:21:02,  1.23s/it]

training loss: 3.271770477294922


training:  11%|█         | 1171/10986 [26:30<3:32:49,  1.30s/it]

training loss: 3.3849070072174072


training:  11%|█         | 1172/10986 [26:32<3:44:06,  1.37s/it]

training loss: 3.363494396209717


training:  11%|█         | 1173/10986 [26:33<3:34:54,  1.31s/it]

training loss: 3.2510175704956055


training:  11%|█         | 1174/10986 [26:34<3:30:58,  1.29s/it]

training loss: 3.186594009399414


training:  11%|█         | 1175/10986 [26:36<3:26:18,  1.26s/it]

training loss: 3.261017322540283


training:  11%|█         | 1176/10986 [26:37<3:22:44,  1.24s/it]

training loss: 3.294603109359741


training:  11%|█         | 1177/10986 [26:38<3:20:37,  1.23s/it]

training loss: 3.3261125087738037


training:  11%|█         | 1178/10986 [26:39<3:18:27,  1.21s/it]

training loss: 3.310023069381714


training:  11%|█         | 1179/10986 [26:40<3:17:08,  1.21s/it]

training loss: 3.3355441093444824


training:  11%|█         | 1180/10986 [26:42<3:17:23,  1.21s/it]

training loss: 3.3126800060272217
valid loss: 3.3081181049346924
perplexity: 27.333637237548828


training:  11%|█         | 1181/10986 [26:44<4:23:01,  1.61s/it]

training loss: 3.2954230308532715


training:  11%|█         | 1182/10986 [26:46<4:22:45,  1.61s/it]

training loss: 3.300502061843872


training:  11%|█         | 1183/10986 [26:47<4:03:09,  1.49s/it]

training loss: 3.291651725769043


training:  11%|█         | 1184/10986 [26:48<3:50:33,  1.41s/it]

training loss: 3.2451844215393066


training:  11%|█         | 1185/10986 [26:49<3:41:28,  1.36s/it]

training loss: 3.2535417079925537


training:  11%|█         | 1186/10986 [26:51<3:35:16,  1.32s/it]

training loss: 3.196702241897583


training:  11%|█         | 1187/10986 [26:52<3:31:44,  1.30s/it]

training loss: 3.3129239082336426


training:  11%|█         | 1188/10986 [26:53<3:28:35,  1.28s/it]

training loss: 3.311572551727295


training:  11%|█         | 1189/10986 [26:54<3:28:26,  1.28s/it]

training loss: 3.395465612411499


training:  11%|█         | 1190/10986 [26:56<3:26:11,  1.26s/it]

training loss: 3.250683307647705


training:  11%|█         | 1191/10986 [26:57<3:36:41,  1.33s/it]

training loss: 3.2955856323242188


training:  11%|█         | 1192/10986 [26:58<3:33:38,  1.31s/it]

training loss: 3.3041558265686035


training:  11%|█         | 1193/10986 [27:00<3:30:37,  1.29s/it]

training loss: 3.2763054370880127


training:  11%|█         | 1194/10986 [27:01<3:28:15,  1.28s/it]

training loss: 3.291860818862915


training:  11%|█         | 1195/10986 [27:02<3:27:17,  1.27s/it]

training loss: 3.3580753803253174


training:  11%|█         | 1196/10986 [27:03<3:25:14,  1.26s/it]

training loss: 3.3126845359802246


training:  11%|█         | 1197/10986 [27:05<3:25:47,  1.26s/it]

training loss: 3.2378275394439697


training:  11%|█         | 1198/10986 [27:06<3:25:29,  1.26s/it]

training loss: 3.2927403450012207


training:  11%|█         | 1199/10986 [27:07<3:25:02,  1.26s/it]

training loss: 3.2329607009887695


training:  11%|█         | 1200/10986 [27:08<3:23:31,  1.25s/it]

training loss: 3.340172290802002
valid loss: 3.325255870819092
perplexity: 27.806114196777344


training:  11%|█         | 1201/10986 [27:11<4:31:32,  1.67s/it]

training loss: 3.125701904296875


training:  11%|█         | 1202/10986 [27:12<4:18:22,  1.58s/it]

training loss: 3.3184964656829834


training:  11%|█         | 1203/10986 [27:14<4:00:52,  1.48s/it]

training loss: 3.147385597229004


training:  11%|█         | 1204/10986 [27:15<3:49:25,  1.41s/it]

training loss: 3.3167405128479004


training:  11%|█         | 1205/10986 [27:16<3:41:53,  1.36s/it]

training loss: 3.326860189437866


training:  11%|█         | 1206/10986 [27:17<3:35:14,  1.32s/it]

training loss: 3.2308802604675293


training:  11%|█         | 1207/10986 [27:19<3:32:57,  1.31s/it]

training loss: 3.2186081409454346


training:  11%|█         | 1208/10986 [27:20<3:29:54,  1.29s/it]

training loss: 3.3486883640289307


training:  11%|█         | 1209/10986 [27:21<3:27:46,  1.28s/it]

training loss: 3.3733813762664795


training:  11%|█         | 1210/10986 [27:22<3:25:03,  1.26s/it]

training loss: 3.188026189804077


training:  11%|█         | 1211/10986 [27:24<3:36:33,  1.33s/it]

training loss: 3.298701763153076


training:  11%|█         | 1212/10986 [27:25<3:47:29,  1.40s/it]

training loss: 3.381073236465454


training:  11%|█         | 1213/10986 [27:27<3:39:52,  1.35s/it]

training loss: 3.180915117263794


training:  11%|█         | 1214/10986 [27:28<3:33:44,  1.31s/it]

training loss: 3.141404867172241


training:  11%|█         | 1215/10986 [27:29<3:30:19,  1.29s/it]

training loss: 3.2089831829071045


training:  11%|█         | 1216/10986 [27:30<3:30:06,  1.29s/it]

training loss: 3.2722291946411133


training:  11%|█         | 1217/10986 [27:32<3:27:29,  1.27s/it]

training loss: 3.224102020263672


training:  11%|█         | 1218/10986 [27:33<3:25:05,  1.26s/it]

training loss: 3.23032808303833


training:  11%|█         | 1219/10986 [27:34<3:24:44,  1.26s/it]

training loss: 3.245229959487915


training:  11%|█         | 1220/10986 [27:35<3:24:10,  1.25s/it]

training loss: 3.2462806701660156
valid loss: 3.2377984523773193
perplexity: 25.477569580078125


training:  11%|█         | 1221/10986 [27:38<4:30:31,  1.66s/it]

training loss: 3.268301248550415


training:  11%|█         | 1222/10986 [27:39<4:14:17,  1.56s/it]

training loss: 3.29599666595459


training:  11%|█         | 1223/10986 [27:40<3:59:37,  1.47s/it]

training loss: 3.2863125801086426


training:  11%|█         | 1224/10986 [27:42<3:47:30,  1.40s/it]

training loss: 3.1879124641418457


training:  11%|█         | 1225/10986 [27:43<3:39:08,  1.35s/it]

training loss: 3.3841395378112793


training:  11%|█         | 1226/10986 [27:44<3:32:42,  1.31s/it]

training loss: 3.431004047393799


training:  11%|█         | 1227/10986 [27:45<3:28:23,  1.28s/it]

training loss: 3.281318187713623


training:  11%|█         | 1228/10986 [27:47<3:24:57,  1.26s/it]

training loss: 3.1809983253479004


training:  11%|█         | 1229/10986 [27:48<3:23:53,  1.25s/it]

training loss: 3.279585361480713


training:  11%|█         | 1230/10986 [27:49<3:21:28,  1.24s/it]

training loss: 3.2220511436462402


training:  11%|█         | 1231/10986 [27:50<3:33:36,  1.31s/it]

training loss: 3.2655930519104004


training:  11%|█         | 1232/10986 [27:52<3:33:10,  1.31s/it]

training loss: 3.2844152450561523


training:  11%|█         | 1233/10986 [27:53<3:29:02,  1.29s/it]

training loss: 3.2485694885253906


training:  11%|█         | 1234/10986 [27:54<3:26:15,  1.27s/it]

training loss: 3.2791247367858887


training:  11%|█         | 1235/10986 [27:55<3:23:47,  1.25s/it]

training loss: 3.3462510108947754


training:  11%|█▏        | 1236/10986 [27:57<3:21:04,  1.24s/it]

training loss: 3.3544654846191406


training:  11%|█▏        | 1237/10986 [27:58<3:19:59,  1.23s/it]

training loss: 3.2168686389923096


training:  11%|█▏        | 1238/10986 [27:59<3:18:59,  1.22s/it]

training loss: 3.3397064208984375


training:  11%|█▏        | 1239/10986 [28:00<3:20:29,  1.23s/it]

training loss: 3.3687591552734375


training:  11%|█▏        | 1240/10986 [28:02<3:20:39,  1.24s/it]

training loss: 3.224301338195801
valid loss: 3.2188024520874023
perplexity: 24.998165130615234


training:  11%|█▏        | 1241/10986 [28:04<4:26:08,  1.64s/it]

training loss: 3.309746265411377


training:  11%|█▏        | 1242/10986 [28:06<4:25:20,  1.63s/it]

training loss: 3.4401559829711914


training:  11%|█▏        | 1243/10986 [28:07<4:06:03,  1.52s/it]

training loss: 3.196732997894287


training:  11%|█▏        | 1244/10986 [28:08<3:50:14,  1.42s/it]

training loss: 3.202530860900879


training:  11%|█▏        | 1245/10986 [28:09<3:40:13,  1.36s/it]

training loss: 3.2542724609375


training:  11%|█▏        | 1246/10986 [28:11<3:32:50,  1.31s/it]

training loss: 3.344937324523926


training:  11%|█▏        | 1247/10986 [28:12<3:26:52,  1.27s/it]

training loss: 3.1516804695129395


training:  11%|█▏        | 1248/10986 [28:13<3:24:27,  1.26s/it]

training loss: 3.2814137935638428


training:  11%|█▏        | 1249/10986 [28:14<3:22:15,  1.25s/it]

training loss: 3.370481014251709


training:  11%|█▏        | 1250/10986 [28:15<3:20:33,  1.24s/it]

training loss: 3.2621235847473145


training:  11%|█▏        | 1251/10986 [28:17<3:31:17,  1.30s/it]

training loss: 3.385420322418213


training:  11%|█▏        | 1252/10986 [28:18<3:28:20,  1.28s/it]

training loss: 3.251565933227539


training:  11%|█▏        | 1253/10986 [28:19<3:26:00,  1.27s/it]

training loss: 3.2621355056762695


training:  11%|█▏        | 1254/10986 [28:21<3:24:29,  1.26s/it]

training loss: 3.315662145614624


training:  11%|█▏        | 1255/10986 [28:22<3:22:32,  1.25s/it]

training loss: 3.1637418270111084


training:  11%|█▏        | 1256/10986 [28:23<3:21:55,  1.25s/it]

training loss: 3.3258748054504395


training:  11%|█▏        | 1257/10986 [28:24<3:21:37,  1.24s/it]

training loss: 3.378567695617676


training:  11%|█▏        | 1258/10986 [28:26<3:20:37,  1.24s/it]

training loss: 3.2767372131347656


training:  11%|█▏        | 1259/10986 [28:27<3:20:24,  1.24s/it]

training loss: 3.2074007987976074


training:  11%|█▏        | 1260/10986 [28:28<3:19:06,  1.23s/it]

training loss: 3.282404899597168
valid loss: 3.2819173336029053
perplexity: 26.62677574157715


training:  11%|█▏        | 1261/10986 [28:31<4:29:03,  1.66s/it]

training loss: 3.259457588195801


training:  11%|█▏        | 1262/10986 [28:32<4:25:17,  1.64s/it]

training loss: 3.278289794921875


training:  11%|█▏        | 1263/10986 [28:34<4:06:37,  1.52s/it]

training loss: 3.2384917736053467


training:  12%|█▏        | 1264/10986 [28:35<3:52:37,  1.44s/it]

training loss: 3.2199532985687256


training:  12%|█▏        | 1265/10986 [28:36<3:42:40,  1.37s/it]

training loss: 3.3786368370056152


training:  12%|█▏        | 1266/10986 [28:37<3:35:52,  1.33s/it]

training loss: 3.3231518268585205


training:  12%|█▏        | 1267/10986 [28:38<3:31:11,  1.30s/it]

training loss: 3.3437366485595703


training:  12%|█▏        | 1268/10986 [28:40<3:27:20,  1.28s/it]

training loss: 3.3103885650634766


training:  12%|█▏        | 1269/10986 [28:41<3:23:37,  1.26s/it]

training loss: 3.2069218158721924


training:  12%|█▏        | 1270/10986 [28:42<3:22:09,  1.25s/it]

training loss: 3.2772653102874756


training:  12%|█▏        | 1271/10986 [28:44<3:33:37,  1.32s/it]

training loss: 3.258835792541504


training:  12%|█▏        | 1272/10986 [28:45<3:45:45,  1.39s/it]

training loss: 3.3251547813415527


training:  12%|█▏        | 1273/10986 [28:46<3:37:30,  1.34s/it]

training loss: 3.221329689025879


training:  12%|█▏        | 1274/10986 [28:48<3:31:58,  1.31s/it]

training loss: 3.3673527240753174


training:  12%|█▏        | 1275/10986 [28:49<3:27:29,  1.28s/it]

training loss: 3.202803134918213


training:  12%|█▏        | 1276/10986 [28:50<3:24:54,  1.27s/it]

training loss: 3.202810287475586


training:  12%|█▏        | 1277/10986 [28:51<3:23:34,  1.26s/it]

training loss: 3.27899432182312


training:  12%|█▏        | 1278/10986 [28:53<3:21:35,  1.25s/it]

training loss: 3.3638880252838135


training:  12%|█▏        | 1279/10986 [28:54<3:21:07,  1.24s/it]

training loss: 3.348292350769043


training:  12%|█▏        | 1280/10986 [28:55<3:21:17,  1.24s/it]

training loss: 3.2714781761169434
valid loss: 3.2638356685638428
perplexity: 26.149646759033203


training:  12%|█▏        | 1281/10986 [28:58<4:25:35,  1.64s/it]

training loss: 3.1941299438476562


training:  12%|█▏        | 1282/10986 [28:59<4:10:05,  1.55s/it]

training loss: 3.2264482975006104


training:  12%|█▏        | 1283/10986 [29:00<3:55:09,  1.45s/it]

training loss: 3.2740957736968994


training:  12%|█▏        | 1284/10986 [29:01<3:46:39,  1.40s/it]

training loss: 3.415111541748047


training:  12%|█▏        | 1285/10986 [29:03<3:38:03,  1.35s/it]

training loss: 3.228957176208496


training:  12%|█▏        | 1286/10986 [29:04<3:33:24,  1.32s/it]

training loss: 3.248440980911255


training:  12%|█▏        | 1287/10986 [29:05<3:29:04,  1.29s/it]

training loss: 3.314976215362549


training:  12%|█▏        | 1288/10986 [29:06<3:25:56,  1.27s/it]

training loss: 3.3073792457580566


training:  12%|█▏        | 1289/10986 [29:08<3:23:38,  1.26s/it]

training loss: 3.259392023086548


training:  12%|█▏        | 1290/10986 [29:09<3:22:30,  1.25s/it]

training loss: 3.1441633701324463


training:  12%|█▏        | 1291/10986 [29:10<3:33:44,  1.32s/it]

training loss: 3.2583937644958496


training:  12%|█▏        | 1292/10986 [29:12<3:45:42,  1.40s/it]

training loss: 3.323417901992798


training:  12%|█▏        | 1293/10986 [29:13<3:37:04,  1.34s/it]

training loss: 3.296849489212036


training:  12%|█▏        | 1294/10986 [29:14<3:31:13,  1.31s/it]

training loss: 3.374723434448242


training:  12%|█▏        | 1295/10986 [29:16<3:28:25,  1.29s/it]

training loss: 3.305388927459717


training:  12%|█▏        | 1296/10986 [29:17<3:25:53,  1.27s/it]

training loss: 3.155691623687744


training:  12%|█▏        | 1297/10986 [29:18<3:24:18,  1.27s/it]

training loss: 3.2839229106903076


training:  12%|█▏        | 1298/10986 [29:19<3:22:34,  1.25s/it]

training loss: 3.1858320236206055


training:  12%|█▏        | 1299/10986 [29:21<3:22:05,  1.25s/it]

training loss: 3.309368371963501


training:  12%|█▏        | 1300/10986 [29:22<3:21:31,  1.25s/it]

training loss: 3.1705121994018555
valid loss: 3.176217794418335
perplexity: 23.955976486206055


training:  12%|█▏        | 1301/10986 [29:24<4:31:37,  1.68s/it]

training loss: 3.2611029148101807


training:  12%|█▏        | 1302/10986 [29:26<4:16:44,  1.59s/it]

training loss: 3.438833475112915


training:  12%|█▏        | 1303/10986 [29:27<4:00:39,  1.49s/it]

training loss: 3.4345061779022217


training:  12%|█▏        | 1304/10986 [29:28<3:48:07,  1.41s/it]

training loss: 3.3251357078552246


training:  12%|█▏        | 1305/10986 [29:30<3:41:42,  1.37s/it]

training loss: 3.3542861938476562


training:  12%|█▏        | 1306/10986 [29:31<3:36:32,  1.34s/it]

training loss: 3.3148484230041504


training:  12%|█▏        | 1307/10986 [29:32<3:34:17,  1.33s/it]

training loss: 3.3044419288635254


training:  12%|█▏        | 1308/10986 [29:33<3:30:45,  1.31s/it]

training loss: 3.4573352336883545


training:  12%|█▏        | 1309/10986 [29:35<3:28:59,  1.30s/it]

training loss: 3.2596049308776855


training:  12%|█▏        | 1310/10986 [29:36<3:29:17,  1.30s/it]

training loss: 3.33554744720459


training:  12%|█▏        | 1311/10986 [29:38<3:41:57,  1.38s/it]

training loss: 3.2996044158935547


training:  12%|█▏        | 1312/10986 [29:39<3:50:24,  1.43s/it]

training loss: 3.2732994556427


training:  12%|█▏        | 1313/10986 [29:40<3:42:14,  1.38s/it]

training loss: 3.2894914150238037


training:  12%|█▏        | 1314/10986 [29:42<3:37:19,  1.35s/it]

training loss: 3.2571797370910645


training:  12%|█▏        | 1315/10986 [29:43<3:34:20,  1.33s/it]

training loss: 3.393369674682617


training:  12%|█▏        | 1316/10986 [29:44<3:30:36,  1.31s/it]

training loss: 3.373704195022583


training:  12%|█▏        | 1317/10986 [29:45<3:29:29,  1.30s/it]

training loss: 3.3733224868774414


training:  12%|█▏        | 1318/10986 [29:47<3:27:00,  1.28s/it]

training loss: 3.2057693004608154


training:  12%|█▏        | 1319/10986 [29:48<3:26:40,  1.28s/it]

training loss: 3.2130126953125


training:  12%|█▏        | 1320/10986 [29:49<3:26:08,  1.28s/it]

training loss: 3.2493293285369873
valid loss: 3.2354934215545654
perplexity: 25.41891098022461


training:  12%|█▏        | 1321/10986 [29:52<4:34:22,  1.70s/it]

training loss: 3.30818510055542


training:  12%|█▏        | 1322/10986 [29:53<4:16:54,  1.60s/it]

training loss: 3.269103527069092


training:  12%|█▏        | 1323/10986 [29:55<4:03:39,  1.51s/it]

training loss: 3.290299892425537


training:  12%|█▏        | 1324/10986 [29:56<3:56:46,  1.47s/it]

training loss: 3.2747957706451416


training:  12%|█▏        | 1325/10986 [29:58<4:08:39,  1.54s/it]

training loss: 3.3400487899780273


training:  12%|█▏        | 1326/10986 [29:59<4:15:36,  1.59s/it]

training loss: 3.37125825881958


training:  12%|█▏        | 1327/10986 [30:01<4:26:09,  1.65s/it]

training loss: 3.313201904296875


training:  12%|█▏        | 1328/10986 [30:03<4:21:40,  1.63s/it]

training loss: 3.2629647254943848


training:  12%|█▏        | 1329/10986 [30:04<4:04:30,  1.52s/it]

training loss: 3.2688629627227783


training:  12%|█▏        | 1330/10986 [30:05<3:52:50,  1.45s/it]

training loss: 3.3406243324279785


training:  12%|█▏        | 1331/10986 [30:07<3:56:19,  1.47s/it]

training loss: 3.2894086837768555


training:  12%|█▏        | 1332/10986 [30:08<3:48:45,  1.42s/it]

training loss: 3.2764058113098145


training:  12%|█▏        | 1333/10986 [30:09<3:43:16,  1.39s/it]

training loss: 3.377037286758423


training:  12%|█▏        | 1334/10986 [30:11<3:36:39,  1.35s/it]

training loss: 3.2800188064575195


training:  12%|█▏        | 1335/10986 [30:12<3:33:07,  1.33s/it]

training loss: 3.229905843734741


training:  12%|█▏        | 1336/10986 [30:13<3:31:32,  1.32s/it]

training loss: 3.410262107849121


training:  12%|█▏        | 1337/10986 [30:15<3:28:46,  1.30s/it]

training loss: 3.3899800777435303


training:  12%|█▏        | 1338/10986 [30:16<3:27:47,  1.29s/it]

training loss: 3.336522340774536


training:  12%|█▏        | 1339/10986 [30:17<3:26:31,  1.28s/it]

training loss: 3.21268892288208


training:  12%|█▏        | 1340/10986 [30:18<3:26:31,  1.28s/it]

training loss: 3.2592766284942627
valid loss: 3.2601547241210938
perplexity: 26.053569793701172


training:  12%|█▏        | 1341/10986 [30:21<4:33:53,  1.70s/it]

training loss: 3.2493205070495605


training:  12%|█▏        | 1342/10986 [30:22<4:18:23,  1.61s/it]

training loss: 3.375453472137451


training:  12%|█▏        | 1343/10986 [30:24<4:01:10,  1.50s/it]

training loss: 3.3395702838897705


training:  12%|█▏        | 1344/10986 [30:25<3:49:52,  1.43s/it]

training loss: 3.26216983795166


training:  12%|█▏        | 1345/10986 [30:26<3:42:13,  1.38s/it]

training loss: 3.368257522583008


training:  12%|█▏        | 1346/10986 [30:27<3:36:04,  1.34s/it]

training loss: 3.2514760494232178


training:  12%|█▏        | 1347/10986 [30:29<3:31:49,  1.32s/it]

training loss: 3.203639030456543


training:  12%|█▏        | 1348/10986 [30:30<3:30:36,  1.31s/it]

training loss: 3.4412641525268555


training:  12%|█▏        | 1349/10986 [30:31<3:29:11,  1.30s/it]

training loss: 3.2346370220184326


training:  12%|█▏        | 1350/10986 [30:33<3:29:39,  1.31s/it]

training loss: 3.2673213481903076


training:  12%|█▏        | 1351/10986 [30:34<3:42:00,  1.38s/it]

training loss: 3.241732597351074


training:  12%|█▏        | 1352/10986 [30:36<3:39:31,  1.37s/it]

training loss: 3.315394163131714


training:  12%|█▏        | 1353/10986 [30:37<3:35:40,  1.34s/it]

training loss: 3.3445889949798584


training:  12%|█▏        | 1354/10986 [30:38<3:32:21,  1.32s/it]

training loss: 3.3927855491638184


training:  12%|█▏        | 1355/10986 [30:39<3:30:11,  1.31s/it]

training loss: 3.2700228691101074


training:  12%|█▏        | 1356/10986 [30:41<3:28:59,  1.30s/it]

training loss: 3.3559720516204834


training:  12%|█▏        | 1357/10986 [30:42<3:27:26,  1.29s/it]

training loss: 3.3360142707824707


training:  12%|█▏        | 1358/10986 [30:43<3:26:12,  1.29s/it]

training loss: 3.359074592590332


training:  12%|█▏        | 1359/10986 [30:44<3:26:40,  1.29s/it]

training loss: 3.278170108795166


training:  12%|█▏        | 1360/10986 [30:46<3:26:09,  1.29s/it]

training loss: 3.191166877746582
valid loss: 3.182049512863159
perplexity: 24.096088409423828


training:  12%|█▏        | 1361/10986 [30:48<4:32:24,  1.70s/it]

training loss: 3.3052940368652344


training:  12%|█▏        | 1362/10986 [30:50<4:16:54,  1.60s/it]

training loss: 3.3509275913238525


training:  12%|█▏        | 1363/10986 [30:51<4:01:39,  1.51s/it]

training loss: 3.2005796432495117


training:  12%|█▏        | 1364/10986 [30:52<3:52:34,  1.45s/it]

training loss: 3.2802789211273193


training:  12%|█▏        | 1365/10986 [30:54<3:46:41,  1.41s/it]

training loss: 3.2843072414398193


training:  12%|█▏        | 1366/10986 [30:55<3:41:34,  1.38s/it]

training loss: 3.3119025230407715


training:  12%|█▏        | 1367/10986 [30:56<3:38:53,  1.37s/it]

training loss: 3.282496690750122


training:  12%|█▏        | 1368/10986 [30:58<3:36:11,  1.35s/it]

training loss: 3.321885347366333


training:  12%|█▏        | 1369/10986 [30:59<3:33:53,  1.33s/it]

training loss: 3.3130197525024414


training:  12%|█▏        | 1370/10986 [31:00<3:31:14,  1.32s/it]

training loss: 3.257780075073242


training:  12%|█▏        | 1371/10986 [31:02<3:42:55,  1.39s/it]

training loss: 3.300921678543091


training:  12%|█▏        | 1372/10986 [31:03<3:39:31,  1.37s/it]

training loss: 3.3091559410095215


training:  12%|█▏        | 1373/10986 [31:04<3:35:10,  1.34s/it]

training loss: 3.252103090286255


training:  13%|█▎        | 1374/10986 [31:06<3:31:48,  1.32s/it]

training loss: 3.305988073348999


training:  13%|█▎        | 1375/10986 [31:07<3:29:24,  1.31s/it]

training loss: 3.2037763595581055


training:  13%|█▎        | 1376/10986 [31:08<3:28:00,  1.30s/it]

training loss: 3.211061954498291


training:  13%|█▎        | 1377/10986 [31:10<3:25:57,  1.29s/it]

training loss: 3.292095184326172


training:  13%|█▎        | 1378/10986 [31:11<3:25:25,  1.28s/it]

training loss: 3.2889084815979004


training:  13%|█▎        | 1379/10986 [31:12<3:26:17,  1.29s/it]

training loss: 3.1493232250213623


training:  13%|█▎        | 1380/10986 [31:13<3:24:57,  1.28s/it]

training loss: 3.195948839187622
valid loss: 3.1826136112213135
perplexity: 24.109683990478516


training:  13%|█▎        | 1381/10986 [31:16<4:29:56,  1.69s/it]

training loss: 3.3480641841888428


training:  13%|█▎        | 1382/10986 [31:17<4:12:42,  1.58s/it]

training loss: 3.488360643386841


training:  13%|█▎        | 1383/10986 [31:19<3:58:20,  1.49s/it]

training loss: 3.2162227630615234


training:  13%|█▎        | 1384/10986 [31:20<3:46:34,  1.42s/it]

training loss: 3.324465036392212


training:  13%|█▎        | 1385/10986 [31:21<3:39:26,  1.37s/it]

training loss: 3.2390546798706055


training:  13%|█▎        | 1386/10986 [31:22<3:33:04,  1.33s/it]

training loss: 3.286001443862915


training:  13%|█▎        | 1387/10986 [31:24<3:28:46,  1.31s/it]

training loss: 3.3162384033203125


training:  13%|█▎        | 1388/10986 [31:25<3:25:56,  1.29s/it]

training loss: 3.2471601963043213


training:  13%|█▎        | 1389/10986 [31:26<3:23:09,  1.27s/it]

training loss: 3.199679374694824


training:  13%|█▎        | 1390/10986 [31:27<3:22:09,  1.26s/it]

training loss: 3.236710786819458


training:  13%|█▎        | 1391/10986 [31:29<3:33:33,  1.34s/it]

training loss: 3.2203571796417236


training:  13%|█▎        | 1392/10986 [31:30<3:46:15,  1.42s/it]

training loss: 3.544219732284546


training:  13%|█▎        | 1393/10986 [31:32<3:39:06,  1.37s/it]

training loss: 3.2362048625946045


training:  13%|█▎        | 1394/10986 [31:33<3:35:26,  1.35s/it]

training loss: 3.334094524383545


training:  13%|█▎        | 1395/10986 [31:34<3:30:56,  1.32s/it]

training loss: 3.25441837310791


training:  13%|█▎        | 1396/10986 [31:35<3:28:16,  1.30s/it]

training loss: 3.316751480102539


training:  13%|█▎        | 1397/10986 [31:37<3:26:33,  1.29s/it]

training loss: 3.2894182205200195


training:  13%|█▎        | 1398/10986 [31:38<3:24:53,  1.28s/it]

training loss: 3.155784845352173


training:  13%|█▎        | 1399/10986 [31:39<3:23:29,  1.27s/it]

training loss: 3.220445394515991


training:  13%|█▎        | 1400/10986 [31:41<3:22:26,  1.27s/it]

training loss: 3.314284324645996
valid loss: 3.3079988956451416
perplexity: 27.33037757873535


training:  13%|█▎        | 1401/10986 [31:43<4:27:31,  1.67s/it]

training loss: 3.3225224018096924


training:  13%|█▎        | 1402/10986 [31:45<4:25:28,  1.66s/it]

training loss: 3.301970958709717


training:  13%|█▎        | 1403/10986 [31:46<4:06:13,  1.54s/it]

training loss: 3.432690382003784


training:  13%|█▎        | 1404/10986 [31:47<3:53:15,  1.46s/it]

training loss: 3.3724381923675537


training:  13%|█▎        | 1405/10986 [31:49<3:43:32,  1.40s/it]

training loss: 3.286766767501831


training:  13%|█▎        | 1406/10986 [31:50<3:36:46,  1.36s/it]

training loss: 3.2729480266571045


training:  13%|█▎        | 1407/10986 [31:51<3:33:16,  1.34s/it]

training loss: 3.3545846939086914


training:  13%|█▎        | 1408/10986 [31:52<3:29:29,  1.31s/it]

training loss: 3.3235087394714355


training:  13%|█▎        | 1409/10986 [31:54<3:27:18,  1.30s/it]

training loss: 3.3129827976226807


training:  13%|█▎        | 1410/10986 [31:55<3:27:31,  1.30s/it]

training loss: 3.3187248706817627


training:  13%|█▎        | 1411/10986 [31:56<3:37:50,  1.37s/it]

training loss: 3.389698028564453


training:  13%|█▎        | 1412/10986 [31:58<3:37:31,  1.36s/it]

training loss: 3.1983914375305176


training:  13%|█▎        | 1413/10986 [31:59<3:32:54,  1.33s/it]

training loss: 3.4508612155914307


training:  13%|█▎        | 1414/10986 [32:00<3:29:14,  1.31s/it]

training loss: 3.374544858932495


training:  13%|█▎        | 1415/10986 [32:02<3:27:47,  1.30s/it]

training loss: 3.231792688369751


training:  13%|█▎        | 1416/10986 [32:03<3:26:06,  1.29s/it]

training loss: 3.253679037094116


training:  13%|█▎        | 1417/10986 [32:04<3:28:43,  1.31s/it]

training loss: 3.2325096130371094


training:  13%|█▎        | 1418/10986 [32:06<3:28:27,  1.31s/it]

training loss: 3.2981557846069336


training:  13%|█▎        | 1419/10986 [32:07<3:28:12,  1.31s/it]

training loss: 3.2357919216156006


training:  13%|█▎        | 1420/10986 [32:08<3:28:27,  1.31s/it]

training loss: 3.2791104316711426
valid loss: 3.2682318687438965
perplexity: 26.264860153198242


training:  13%|█▎        | 1421/10986 [32:11<4:37:07,  1.74s/it]

training loss: 3.2449569702148438


training:  13%|█▎        | 1422/10986 [32:12<4:19:35,  1.63s/it]

training loss: 3.297696590423584


training:  13%|█▎        | 1423/10986 [32:14<4:03:22,  1.53s/it]

training loss: 3.271726369857788


training:  13%|█▎        | 1424/10986 [32:15<3:52:29,  1.46s/it]

training loss: 3.2880027294158936


training:  13%|█▎        | 1425/10986 [32:16<3:44:34,  1.41s/it]

training loss: 3.4327030181884766


training:  13%|█▎        | 1426/10986 [32:17<3:38:34,  1.37s/it]

training loss: 3.3117151260375977


training:  13%|█▎        | 1427/10986 [32:19<3:34:18,  1.35s/it]

training loss: 3.2693593502044678


training:  13%|█▎        | 1428/10986 [32:20<3:32:05,  1.33s/it]

training loss: 3.362879753112793


training:  13%|█▎        | 1429/10986 [32:21<3:29:06,  1.31s/it]

training loss: 3.1516332626342773


training:  13%|█▎        | 1430/10986 [32:23<3:27:32,  1.30s/it]

training loss: 3.283245325088501


training:  13%|█▎        | 1431/10986 [32:24<3:39:35,  1.38s/it]

training loss: 3.251157760620117


training:  13%|█▎        | 1432/10986 [32:25<3:35:30,  1.35s/it]

training loss: 3.2991862297058105


training:  13%|█▎        | 1433/10986 [32:27<3:32:00,  1.33s/it]

training loss: 3.2629106044769287


training:  13%|█▎        | 1434/10986 [32:28<3:28:18,  1.31s/it]

training loss: 3.213686943054199


training:  13%|█▎        | 1435/10986 [32:29<3:26:16,  1.30s/it]

training loss: 3.2936084270477295


training:  13%|█▎        | 1436/10986 [32:30<3:25:17,  1.29s/it]

training loss: 3.323246717453003


training:  13%|█▎        | 1437/10986 [32:32<3:25:01,  1.29s/it]

training loss: 3.2234232425689697


training:  13%|█▎        | 1438/10986 [32:33<3:23:44,  1.28s/it]

training loss: 3.3320305347442627


training:  13%|█▎        | 1439/10986 [32:34<3:25:08,  1.29s/it]

training loss: 3.3078653812408447


training:  13%|█▎        | 1440/10986 [32:36<3:23:56,  1.28s/it]

training loss: 3.2373013496398926
valid loss: 3.227987289428711
perplexity: 25.22882843017578


training:  13%|█▎        | 1441/10986 [32:38<4:29:00,  1.69s/it]

training loss: 3.2968928813934326


training:  13%|█▎        | 1442/10986 [32:40<4:12:31,  1.59s/it]

training loss: 3.1736319065093994


training:  13%|█▎        | 1443/10986 [32:41<3:57:21,  1.49s/it]

training loss: 3.278130292892456


training:  13%|█▎        | 1444/10986 [32:42<3:46:29,  1.42s/it]

training loss: 3.23816180229187


training:  13%|█▎        | 1445/10986 [32:43<3:38:00,  1.37s/it]

training loss: 3.3428943157196045


training:  13%|█▎        | 1446/10986 [32:45<3:33:13,  1.34s/it]

training loss: 3.221179962158203


training:  13%|█▎        | 1447/10986 [32:46<3:28:34,  1.31s/it]

training loss: 3.256863832473755


training:  13%|█▎        | 1448/10986 [32:47<3:25:40,  1.29s/it]

training loss: 3.243919610977173


training:  13%|█▎        | 1449/10986 [32:48<3:24:51,  1.29s/it]

training loss: 3.2409307956695557


training:  13%|█▎        | 1450/10986 [32:50<3:23:54,  1.28s/it]

training loss: 3.2784833908081055


training:  13%|█▎        | 1451/10986 [32:51<3:34:49,  1.35s/it]

training loss: 3.3960440158843994


training:  13%|█▎        | 1452/10986 [32:53<3:38:27,  1.37s/it]

training loss: 3.345625400543213


training:  13%|█▎        | 1453/10986 [32:54<3:33:37,  1.34s/it]

training loss: 3.2677016258239746


training:  13%|█▎        | 1454/10986 [32:55<3:30:46,  1.33s/it]

training loss: 3.339658260345459


training:  13%|█▎        | 1455/10986 [32:56<3:28:00,  1.31s/it]

training loss: 3.244252920150757


training:  13%|█▎        | 1456/10986 [32:58<3:27:53,  1.31s/it]

training loss: 3.263429641723633


training:  13%|█▎        | 1457/10986 [32:59<3:27:32,  1.31s/it]

training loss: 3.383207321166992


training:  13%|█▎        | 1458/10986 [33:00<3:25:56,  1.30s/it]

training loss: 3.2768290042877197


training:  13%|█▎        | 1459/10986 [33:02<3:25:44,  1.30s/it]

training loss: 3.3370583057403564


training:  13%|█▎        | 1460/10986 [33:03<3:24:22,  1.29s/it]

training loss: 3.34578537940979
valid loss: 3.345888614654541
perplexity: 28.385786056518555


training:  13%|█▎        | 1461/10986 [33:06<4:34:06,  1.73s/it]

training loss: 3.2929418087005615


training:  13%|█▎        | 1462/10986 [33:07<4:33:03,  1.72s/it]

training loss: 3.286846876144409


training:  13%|█▎        | 1463/10986 [33:09<4:11:45,  1.59s/it]

training loss: 3.3447728157043457


training:  13%|█▎        | 1464/10986 [33:10<3:57:41,  1.50s/it]

training loss: 3.2554852962493896


training:  13%|█▎        | 1465/10986 [33:11<3:47:59,  1.44s/it]

training loss: 3.2317070960998535


training:  13%|█▎        | 1466/10986 [33:13<3:41:26,  1.40s/it]

training loss: 3.3990421295166016


training:  13%|█▎        | 1467/10986 [33:14<3:35:23,  1.36s/it]

training loss: 3.283536672592163


training:  13%|█▎        | 1468/10986 [33:15<3:31:56,  1.34s/it]

training loss: 3.2377052307128906


training:  13%|█▎        | 1469/10986 [33:16<3:28:58,  1.32s/it]

training loss: 3.252227306365967


training:  13%|█▎        | 1470/10986 [33:18<3:27:02,  1.31s/it]

training loss: 3.3741161823272705


training:  13%|█▎        | 1471/10986 [33:19<3:38:58,  1.38s/it]

training loss: 3.326594352722168


training:  13%|█▎        | 1472/10986 [33:21<3:53:13,  1.47s/it]

training loss: 3.229553461074829


training:  13%|█▎        | 1473/10986 [33:22<3:44:42,  1.42s/it]

training loss: 3.2693185806274414


training:  13%|█▎        | 1474/10986 [33:23<3:39:47,  1.39s/it]

training loss: 3.2705135345458984


training:  13%|█▎        | 1475/10986 [33:25<3:34:07,  1.35s/it]

training loss: 3.3418822288513184


training:  13%|█▎        | 1476/10986 [33:26<3:30:58,  1.33s/it]

training loss: 3.237334966659546


training:  13%|█▎        | 1477/10986 [33:27<3:29:58,  1.32s/it]

training loss: 3.1762139797210693


training:  13%|█▎        | 1478/10986 [33:29<3:27:06,  1.31s/it]

training loss: 3.250676393508911


training:  13%|█▎        | 1479/10986 [33:30<3:26:36,  1.30s/it]

training loss: 3.2247047424316406


training:  13%|█▎        | 1480/10986 [33:31<3:25:30,  1.30s/it]

training loss: 3.2997448444366455
valid loss: 3.295295238494873
perplexity: 26.985380172729492


training:  13%|█▎        | 1481/10986 [33:34<4:33:29,  1.73s/it]

training loss: 3.2675411701202393


training:  13%|█▎        | 1482/10986 [33:35<4:18:28,  1.63s/it]

training loss: 3.259659767150879


training:  13%|█▎        | 1483/10986 [33:37<4:01:05,  1.52s/it]

training loss: 3.255178928375244


training:  14%|█▎        | 1484/10986 [33:38<3:50:52,  1.46s/it]

training loss: 3.140674114227295


training:  14%|█▎        | 1485/10986 [33:39<3:42:37,  1.41s/it]

training loss: 3.249953269958496


training:  14%|█▎        | 1486/10986 [33:40<3:37:21,  1.37s/it]

training loss: 3.2354300022125244


training:  14%|█▎        | 1487/10986 [33:42<3:34:55,  1.36s/it]

training loss: 3.2279646396636963


training:  14%|█▎        | 1488/10986 [33:43<3:32:29,  1.34s/it]

training loss: 3.2912039756774902


training:  14%|█▎        | 1489/10986 [33:44<3:30:44,  1.33s/it]

training loss: 3.303741216659546


training:  14%|█▎        | 1490/10986 [33:46<3:29:52,  1.33s/it]

training loss: 3.3566956520080566


training:  14%|█▎        | 1491/10986 [33:47<3:39:51,  1.39s/it]

training loss: 3.4290266036987305


training:  14%|█▎        | 1492/10986 [33:49<3:35:57,  1.36s/it]

training loss: 3.2913310527801514


training:  14%|█▎        | 1493/10986 [33:50<3:31:18,  1.34s/it]

training loss: 3.365689754486084


training:  14%|█▎        | 1494/10986 [33:51<3:29:03,  1.32s/it]

training loss: 3.2931442260742188


training:  14%|█▎        | 1495/10986 [33:52<3:26:19,  1.30s/it]

training loss: 3.419025421142578


training:  14%|█▎        | 1496/10986 [33:54<3:24:32,  1.29s/it]

training loss: 3.4245941638946533


training:  14%|█▎        | 1497/10986 [33:55<3:23:44,  1.29s/it]

training loss: 3.32771897315979


training:  14%|█▎        | 1498/10986 [33:56<3:23:33,  1.29s/it]

training loss: 3.313692569732666


training:  14%|█▎        | 1499/10986 [33:57<3:22:41,  1.28s/it]

training loss: 3.246347188949585


training:  14%|█▎        | 1500/10986 [33:59<3:23:33,  1.29s/it]

training loss: 3.3916015625
valid loss: 3.3935019969940186
perplexity: 29.770023345947266


training:  14%|█▎        | 1501/10986 [34:02<4:31:41,  1.72s/it]

training loss: 3.302513599395752


training:  14%|█▎        | 1502/10986 [34:03<4:16:21,  1.62s/it]

training loss: 3.2727949619293213


training:  14%|█▎        | 1503/10986 [34:04<4:01:34,  1.53s/it]

training loss: 3.298405408859253


training:  14%|█▎        | 1504/10986 [34:06<3:52:45,  1.47s/it]

training loss: 3.1997151374816895


training:  14%|█▎        | 1505/10986 [34:07<3:46:13,  1.43s/it]

training loss: 3.2346255779266357


training:  14%|█▎        | 1506/10986 [34:08<3:40:39,  1.40s/it]

training loss: 3.325395345687866


training:  14%|█▎        | 1507/10986 [34:10<3:36:27,  1.37s/it]

training loss: 3.306206464767456


training:  14%|█▎        | 1508/10986 [34:11<3:32:50,  1.35s/it]

training loss: 3.303518533706665


training:  14%|█▎        | 1509/10986 [34:12<3:31:11,  1.34s/it]

training loss: 3.2757248878479004


training:  14%|█▎        | 1510/10986 [34:13<3:30:19,  1.33s/it]

training loss: 3.193377733230591


training:  14%|█▍        | 1511/10986 [34:15<3:40:39,  1.40s/it]

training loss: 3.3062119483947754


training:  14%|█▍        | 1512/10986 [34:16<3:36:48,  1.37s/it]

training loss: 3.344564914703369


training:  14%|█▍        | 1513/10986 [34:18<3:32:52,  1.35s/it]

training loss: 3.306164026260376


training:  14%|█▍        | 1514/10986 [34:19<3:30:45,  1.34s/it]

training loss: 3.3014273643493652


training:  14%|█▍        | 1515/10986 [34:20<3:27:46,  1.32s/it]

training loss: 3.2964887619018555


training:  14%|█▍        | 1516/10986 [34:21<3:26:23,  1.31s/it]

training loss: 3.360726833343506


training:  14%|█▍        | 1517/10986 [34:23<3:24:16,  1.29s/it]

training loss: 3.394512176513672


training:  14%|█▍        | 1518/10986 [34:24<3:24:08,  1.29s/it]

training loss: 3.398859739303589


training:  14%|█▍        | 1519/10986 [34:25<3:23:45,  1.29s/it]

training loss: 3.29872727394104


training:  14%|█▍        | 1520/10986 [34:27<3:22:09,  1.28s/it]

training loss: 3.2064781188964844
valid loss: 3.202751636505127
perplexity: 24.600128173828125


training:  14%|█▍        | 1521/10986 [34:29<4:27:04,  1.69s/it]

training loss: 3.3429009914398193


training:  14%|█▍        | 1522/10986 [34:31<4:11:05,  1.59s/it]

training loss: 3.404435634613037


training:  14%|█▍        | 1523/10986 [34:32<3:56:07,  1.50s/it]

training loss: 3.4164531230926514


training:  14%|█▍        | 1524/10986 [34:33<3:44:54,  1.43s/it]

training loss: 3.258953094482422


training:  14%|█▍        | 1525/10986 [34:34<3:37:55,  1.38s/it]

training loss: 3.3368659019470215


training:  14%|█▍        | 1526/10986 [34:36<3:34:39,  1.36s/it]

training loss: 3.3107712268829346


training:  14%|█▍        | 1527/10986 [34:37<3:30:20,  1.33s/it]

training loss: 3.271653175354004


training:  14%|█▍        | 1528/10986 [34:38<3:26:59,  1.31s/it]

training loss: 3.288285732269287


training:  14%|█▍        | 1529/10986 [34:40<3:24:44,  1.30s/it]

training loss: 3.3404970169067383


training:  14%|█▍        | 1530/10986 [34:41<3:22:38,  1.29s/it]

training loss: 3.2833120822906494


training:  14%|█▍        | 1531/10986 [34:42<3:34:34,  1.36s/it]

training loss: 3.2286863327026367


training:  14%|█▍        | 1532/10986 [34:44<3:30:52,  1.34s/it]

training loss: 3.3585352897644043


training:  14%|█▍        | 1533/10986 [34:45<3:28:19,  1.32s/it]

training loss: 3.3142964839935303


training:  14%|█▍        | 1534/10986 [34:46<3:25:47,  1.31s/it]

training loss: 3.345179796218872


training:  14%|█▍        | 1535/10986 [34:47<3:23:59,  1.30s/it]

training loss: 3.3398871421813965


training:  14%|█▍        | 1536/10986 [34:49<3:22:09,  1.28s/it]

training loss: 3.275592803955078


training:  14%|█▍        | 1537/10986 [34:50<3:21:37,  1.28s/it]

training loss: 3.256499767303467


training:  14%|█▍        | 1538/10986 [34:51<3:20:30,  1.27s/it]

training loss: 3.3060476779937744


training:  14%|█▍        | 1539/10986 [34:52<3:19:54,  1.27s/it]

training loss: 3.378662586212158


training:  14%|█▍        | 1540/10986 [34:54<3:19:27,  1.27s/it]

training loss: 3.2143454551696777
valid loss: 3.2146782875061035
perplexity: 24.895280838012695


training:  14%|█▍        | 1541/10986 [34:57<4:31:33,  1.73s/it]

training loss: 3.239945650100708


training:  14%|█▍        | 1542/10986 [34:58<4:17:52,  1.64s/it]

training loss: 3.316629409790039


training:  14%|█▍        | 1543/10986 [35:00<4:20:12,  1.65s/it]

training loss: 3.271223783493042


training:  14%|█▍        | 1544/10986 [35:01<4:21:47,  1.66s/it]

training loss: 3.3897016048431396


training:  14%|█▍        | 1545/10986 [35:03<4:06:08,  1.56s/it]

training loss: 3.3303966522216797


training:  14%|█▍        | 1546/10986 [35:04<3:51:58,  1.47s/it]

training loss: 3.243915557861328


training:  14%|█▍        | 1547/10986 [35:05<3:42:02,  1.41s/it]

training loss: 3.3340365886688232


training:  14%|█▍        | 1548/10986 [35:06<3:35:12,  1.37s/it]

training loss: 3.2575840950012207


training:  14%|█▍        | 1549/10986 [35:08<3:31:29,  1.34s/it]

training loss: 3.286726713180542


training:  14%|█▍        | 1550/10986 [35:09<3:28:53,  1.33s/it]

training loss: 3.3400275707244873


training:  14%|█▍        | 1551/10986 [35:11<3:39:53,  1.40s/it]

training loss: 3.3084278106689453


training:  14%|█▍        | 1552/10986 [35:12<3:35:57,  1.37s/it]

training loss: 3.35800838470459


training:  14%|█▍        | 1553/10986 [35:13<3:31:08,  1.34s/it]

training loss: 3.3553829193115234


training:  14%|█▍        | 1554/10986 [35:14<3:29:36,  1.33s/it]

training loss: 3.2903451919555664


training:  14%|█▍        | 1555/10986 [35:16<3:29:15,  1.33s/it]

training loss: 3.3225250244140625


training:  14%|█▍        | 1556/10986 [35:17<3:28:27,  1.33s/it]

training loss: 3.2628536224365234


training:  14%|█▍        | 1557/10986 [35:18<3:28:49,  1.33s/it]

training loss: 3.362229585647583


training:  14%|█▍        | 1558/10986 [35:20<3:29:36,  1.33s/it]

training loss: 3.3588554859161377


training:  14%|█▍        | 1559/10986 [35:21<3:27:29,  1.32s/it]

training loss: 3.3431174755096436


training:  14%|█▍        | 1560/10986 [35:22<3:25:21,  1.31s/it]

training loss: 3.279421806335449
valid loss: 3.2736318111419678
perplexity: 26.40707015991211


training:  14%|█▍        | 1561/10986 [35:25<4:33:21,  1.74s/it]

training loss: 3.3532304763793945


training:  14%|█▍        | 1562/10986 [35:27<4:17:31,  1.64s/it]

training loss: 3.2131035327911377


training:  14%|█▍        | 1563/10986 [35:28<4:01:39,  1.54s/it]

training loss: 3.2036752700805664


training:  14%|█▍        | 1564/10986 [35:29<3:49:45,  1.46s/it]

training loss: 3.2840301990509033


training:  14%|█▍        | 1565/10986 [35:30<3:41:38,  1.41s/it]

training loss: 3.280512809753418


training:  14%|█▍        | 1566/10986 [35:32<3:35:37,  1.37s/it]

training loss: 3.2147321701049805


training:  14%|█▍        | 1567/10986 [35:33<3:31:22,  1.35s/it]

training loss: 3.2949633598327637


training:  14%|█▍        | 1568/10986 [35:34<3:28:46,  1.33s/it]

training loss: 3.2765536308288574


training:  14%|█▍        | 1569/10986 [35:36<3:26:51,  1.32s/it]

training loss: 3.4323854446411133


training:  14%|█▍        | 1570/10986 [35:37<3:27:15,  1.32s/it]

training loss: 3.242145299911499


training:  14%|█▍        | 1571/10986 [35:38<3:38:43,  1.39s/it]

training loss: 3.271209239959717


training:  14%|█▍        | 1572/10986 [35:40<3:49:06,  1.46s/it]

training loss: 3.2899322509765625


training:  14%|█▍        | 1573/10986 [35:41<3:40:37,  1.41s/it]

training loss: 3.189574718475342


training:  14%|█▍        | 1574/10986 [35:43<3:34:53,  1.37s/it]

training loss: 3.274259090423584


training:  14%|█▍        | 1575/10986 [35:44<3:30:35,  1.34s/it]

training loss: 3.382801055908203


training:  14%|█▍        | 1576/10986 [35:45<3:27:11,  1.32s/it]

training loss: 3.27701735496521


training:  14%|█▍        | 1577/10986 [35:46<3:24:51,  1.31s/it]

training loss: 3.343766450881958


training:  14%|█▍        | 1578/10986 [35:48<3:23:39,  1.30s/it]

training loss: 3.1958189010620117


training:  14%|█▍        | 1579/10986 [35:49<3:22:20,  1.29s/it]

training loss: 3.2819430828094482


training:  14%|█▍        | 1580/10986 [35:50<3:20:33,  1.28s/it]

training loss: 3.3513681888580322
valid loss: 3.3609204292297363
perplexity: 28.815702438354492


training:  14%|█▍        | 1581/10986 [35:53<4:29:54,  1.72s/it]

training loss: 3.3159031867980957


training:  14%|█▍        | 1582/10986 [35:54<4:12:58,  1.61s/it]

training loss: 3.3572096824645996


training:  14%|█▍        | 1583/10986 [35:56<3:56:54,  1.51s/it]

training loss: 3.33488130569458


training:  14%|█▍        | 1584/10986 [35:57<3:45:00,  1.44s/it]

training loss: 3.3371822834014893


training:  14%|█▍        | 1585/10986 [35:58<3:37:25,  1.39s/it]

training loss: 3.379087448120117


training:  14%|█▍        | 1586/10986 [35:59<3:33:23,  1.36s/it]

training loss: 3.2326245307922363


training:  14%|█▍        | 1587/10986 [36:01<3:28:52,  1.33s/it]

training loss: 3.3631246089935303


training:  14%|█▍        | 1588/10986 [36:02<3:25:39,  1.31s/it]

training loss: 3.2696573734283447


training:  14%|█▍        | 1589/10986 [36:03<3:23:36,  1.30s/it]

training loss: 3.194106101989746


training:  14%|█▍        | 1590/10986 [36:05<3:21:31,  1.29s/it]

training loss: 3.306915760040283


training:  14%|█▍        | 1591/10986 [36:06<3:33:57,  1.37s/it]

training loss: 3.194603204727173


training:  14%|█▍        | 1592/10986 [36:07<3:32:26,  1.36s/it]

training loss: 3.276686429977417


training:  15%|█▍        | 1593/10986 [36:09<3:27:22,  1.32s/it]

training loss: 3.3307738304138184


training:  15%|█▍        | 1594/10986 [36:10<3:24:47,  1.31s/it]

training loss: 3.235546350479126


training:  15%|█▍        | 1595/10986 [36:11<3:22:36,  1.29s/it]

training loss: 3.3054122924804688


training:  15%|█▍        | 1596/10986 [36:13<3:21:46,  1.29s/it]

training loss: 3.24349308013916


training:  15%|█▍        | 1597/10986 [36:14<3:19:47,  1.28s/it]

training loss: 3.2281501293182373


training:  15%|█▍        | 1598/10986 [36:15<3:18:24,  1.27s/it]

training loss: 3.339871644973755


training:  15%|█▍        | 1599/10986 [36:16<3:18:17,  1.27s/it]

training loss: 3.3100829124450684


training:  15%|█▍        | 1600/10986 [36:18<3:18:19,  1.27s/it]

training loss: 3.2430849075317383
valid loss: 3.234027147293091
perplexity: 25.38166618347168


training:  15%|█▍        | 1601/10986 [36:20<4:24:39,  1.69s/it]

training loss: 3.37741756439209


training:  15%|█▍        | 1602/10986 [36:22<4:09:13,  1.59s/it]

training loss: 3.21346116065979


training:  15%|█▍        | 1603/10986 [36:23<3:54:24,  1.50s/it]

training loss: 3.253023624420166


training:  15%|█▍        | 1604/10986 [36:24<3:43:36,  1.43s/it]

training loss: 3.228079080581665


training:  15%|█▍        | 1605/10986 [36:25<3:35:45,  1.38s/it]

training loss: 3.2835581302642822


training:  15%|█▍        | 1606/10986 [36:27<3:31:01,  1.35s/it]

training loss: 3.2686643600463867


training:  15%|█▍        | 1607/10986 [36:28<3:26:40,  1.32s/it]

training loss: 3.3193881511688232


training:  15%|█▍        | 1608/10986 [36:29<3:24:05,  1.31s/it]

training loss: 3.317561626434326


training:  15%|█▍        | 1609/10986 [36:30<3:22:11,  1.29s/it]

training loss: 3.279466390609741


training:  15%|█▍        | 1610/10986 [36:32<3:21:01,  1.29s/it]

training loss: 3.329383611679077


training:  15%|█▍        | 1611/10986 [36:33<3:33:06,  1.36s/it]

training loss: 3.285263776779175


training:  15%|█▍        | 1612/10986 [36:35<3:41:23,  1.42s/it]

training loss: 3.292644500732422


training:  15%|█▍        | 1613/10986 [36:36<3:33:40,  1.37s/it]

training loss: 3.275965452194214


training:  15%|█▍        | 1614/10986 [36:37<3:28:59,  1.34s/it]

training loss: 3.366189479827881


training:  15%|█▍        | 1615/10986 [36:39<3:25:43,  1.32s/it]

training loss: 3.227088689804077


training:  15%|█▍        | 1616/10986 [36:40<3:23:18,  1.30s/it]

training loss: 3.2882463932037354


training:  15%|█▍        | 1617/10986 [36:41<3:21:48,  1.29s/it]

training loss: 3.241035223007202


training:  15%|█▍        | 1618/10986 [36:42<3:21:35,  1.29s/it]

training loss: 3.365478992462158


training:  15%|█▍        | 1619/10986 [36:44<3:20:42,  1.29s/it]

training loss: 3.278517246246338


training:  15%|█▍        | 1620/10986 [36:45<3:23:21,  1.30s/it]

training loss: 3.391868829727173
valid loss: 3.4055278301239014
perplexity: 30.13019561767578


training:  15%|█▍        | 1621/10986 [36:48<4:31:03,  1.74s/it]

training loss: 3.379324436187744


training:  15%|█▍        | 1622/10986 [36:49<4:13:47,  1.63s/it]

training loss: 3.322352170944214


training:  15%|█▍        | 1623/10986 [36:50<3:57:55,  1.52s/it]

training loss: 3.4235737323760986


training:  15%|█▍        | 1624/10986 [36:52<3:45:42,  1.45s/it]

training loss: 3.4377732276916504


training:  15%|█▍        | 1625/10986 [36:53<3:38:24,  1.40s/it]

training loss: 3.502789258956909


training:  15%|█▍        | 1626/10986 [36:54<3:33:15,  1.37s/it]

training loss: 3.4406442642211914


training:  15%|█▍        | 1627/10986 [36:56<3:29:41,  1.34s/it]

training loss: 3.3401944637298584


training:  15%|█▍        | 1628/10986 [36:57<3:26:40,  1.33s/it]

training loss: 3.3016436100006104


training:  15%|█▍        | 1629/10986 [36:58<3:25:11,  1.32s/it]

training loss: 3.3675179481506348


training:  15%|█▍        | 1630/10986 [36:59<3:24:12,  1.31s/it]

training loss: 3.5041697025299072


training:  15%|█▍        | 1631/10986 [37:01<3:35:44,  1.38s/it]

training loss: 3.4352893829345703


training:  15%|█▍        | 1632/10986 [37:02<3:32:35,  1.36s/it]

training loss: 3.3168609142303467


training:  15%|█▍        | 1633/10986 [37:04<3:29:20,  1.34s/it]

training loss: 3.257671356201172


training:  15%|█▍        | 1634/10986 [37:05<3:27:16,  1.33s/it]

training loss: 3.3763508796691895


training:  15%|█▍        | 1635/10986 [37:06<3:24:34,  1.31s/it]

training loss: 3.4203743934631348


training:  15%|█▍        | 1636/10986 [37:08<3:24:13,  1.31s/it]

training loss: 3.2875330448150635


training:  15%|█▍        | 1637/10986 [37:09<3:22:57,  1.30s/it]

training loss: 3.356172800064087


training:  15%|█▍        | 1638/10986 [37:10<3:25:10,  1.32s/it]

training loss: 3.275761365890503


training:  15%|█▍        | 1639/10986 [37:11<3:23:57,  1.31s/it]

training loss: 3.3010871410369873


training:  15%|█▍        | 1640/10986 [37:13<3:22:14,  1.30s/it]

training loss: 3.3112952709198
valid loss: 3.3087875843048096
perplexity: 27.35194206237793


training:  15%|█▍        | 1641/10986 [37:15<4:27:22,  1.72s/it]

training loss: 3.309028387069702


training:  15%|█▍        | 1642/10986 [37:17<4:22:43,  1.69s/it]

training loss: 3.4578359127044678


training:  15%|█▍        | 1643/10986 [37:18<4:04:03,  1.57s/it]

training loss: 3.2715160846710205


training:  15%|█▍        | 1644/10986 [37:20<3:50:37,  1.48s/it]

training loss: 3.4141476154327393


training:  15%|█▍        | 1645/10986 [37:21<3:40:23,  1.42s/it]

training loss: 3.357980728149414


training:  15%|█▍        | 1646/10986 [37:22<3:32:50,  1.37s/it]

training loss: 3.296721935272217


training:  15%|█▍        | 1647/10986 [37:23<3:28:34,  1.34s/it]

training loss: 3.363638401031494


training:  15%|█▌        | 1648/10986 [37:25<3:26:10,  1.32s/it]

training loss: 3.3696250915527344


training:  15%|█▌        | 1649/10986 [37:26<3:24:36,  1.31s/it]

training loss: 3.367549419403076


training:  15%|█▌        | 1650/10986 [37:27<3:23:07,  1.31s/it]

training loss: 3.339409112930298


training:  15%|█▌        | 1651/10986 [37:29<3:33:28,  1.37s/it]

training loss: 3.30728816986084


training:  15%|█▌        | 1652/10986 [37:30<3:30:31,  1.35s/it]

training loss: 3.497654438018799


training:  15%|█▌        | 1653/10986 [37:31<3:27:11,  1.33s/it]

training loss: 3.209930658340454


training:  15%|█▌        | 1654/10986 [37:33<3:25:24,  1.32s/it]

training loss: 3.382256507873535


training:  15%|█▌        | 1655/10986 [37:34<3:22:36,  1.30s/it]

training loss: 3.3962628841400146


training:  15%|█▌        | 1656/10986 [37:35<3:22:01,  1.30s/it]

training loss: 3.299753427505493


training:  15%|█▌        | 1657/10986 [37:36<3:21:01,  1.29s/it]

training loss: 3.3495662212371826


training:  15%|█▌        | 1658/10986 [37:38<3:20:35,  1.29s/it]

training loss: 3.3811867237091064


training:  15%|█▌        | 1659/10986 [37:39<3:19:17,  1.28s/it]

training loss: 3.3255796432495117


training:  15%|█▌        | 1660/10986 [37:40<3:21:11,  1.29s/it]

training loss: 3.330723524093628
valid loss: 3.3305320739746094
perplexity: 27.95321273803711


training:  15%|█▌        | 1661/10986 [37:43<4:27:38,  1.72s/it]

training loss: 3.5181708335876465


training:  15%|█▌        | 1662/10986 [37:44<4:13:14,  1.63s/it]

training loss: 3.341071605682373


training:  15%|█▌        | 1663/10986 [37:46<3:59:13,  1.54s/it]

training loss: 3.379450559616089


training:  15%|█▌        | 1664/10986 [37:47<3:47:19,  1.46s/it]

training loss: 3.335486888885498


training:  15%|█▌        | 1665/10986 [37:48<3:39:10,  1.41s/it]

training loss: 3.403190851211548


training:  15%|█▌        | 1666/10986 [37:50<3:33:16,  1.37s/it]

training loss: 3.3909571170806885


training:  15%|█▌        | 1667/10986 [37:51<3:29:57,  1.35s/it]

training loss: 3.2870430946350098


training:  15%|█▌        | 1668/10986 [37:52<3:25:58,  1.33s/it]

training loss: 3.269444465637207


training:  15%|█▌        | 1669/10986 [37:54<3:24:35,  1.32s/it]

training loss: 3.3557474613189697


training:  15%|█▌        | 1670/10986 [37:55<3:22:55,  1.31s/it]

training loss: 3.3283627033233643


training:  15%|█▌        | 1671/10986 [37:56<3:34:20,  1.38s/it]

training loss: 3.237577199935913


training:  15%|█▌        | 1672/10986 [37:58<3:44:19,  1.45s/it]

training loss: 3.423766613006592


training:  15%|█▌        | 1673/10986 [37:59<3:36:38,  1.40s/it]

training loss: 3.2984986305236816


training:  15%|█▌        | 1674/10986 [38:01<3:31:01,  1.36s/it]

training loss: 3.3522634506225586


training:  15%|█▌        | 1675/10986 [38:02<3:28:24,  1.34s/it]

training loss: 3.22031307220459


training:  15%|█▌        | 1676/10986 [38:03<3:25:14,  1.32s/it]

training loss: 3.392986297607422


training:  15%|█▌        | 1677/10986 [38:04<3:23:39,  1.31s/it]

training loss: 3.382542371749878


training:  15%|█▌        | 1678/10986 [38:06<3:21:22,  1.30s/it]

training loss: 3.2460482120513916


training:  15%|█▌        | 1679/10986 [38:07<3:19:53,  1.29s/it]

training loss: 3.328104019165039


training:  15%|█▌        | 1680/10986 [38:08<3:19:08,  1.28s/it]

training loss: 3.3957743644714355
valid loss: 3.388392686843872
perplexity: 29.61830711364746


training:  15%|█▌        | 1681/10986 [38:11<4:27:38,  1.73s/it]

training loss: 3.2798848152160645


training:  15%|█▌        | 1682/10986 [38:12<4:10:55,  1.62s/it]

training loss: 3.2864878177642822


training:  15%|█▌        | 1683/10986 [38:14<3:54:36,  1.51s/it]

training loss: 3.338855743408203


training:  15%|█▌        | 1684/10986 [38:15<3:42:34,  1.44s/it]

training loss: 3.3264713287353516


training:  15%|█▌        | 1685/10986 [38:16<3:34:52,  1.39s/it]

training loss: 3.3055734634399414


training:  15%|█▌        | 1686/10986 [38:17<3:29:41,  1.35s/it]

training loss: 3.2809486389160156


training:  15%|█▌        | 1687/10986 [38:19<3:26:01,  1.33s/it]

training loss: 3.2925665378570557


training:  15%|█▌        | 1688/10986 [38:20<3:22:36,  1.31s/it]

training loss: 3.4089066982269287


training:  15%|█▌        | 1689/10986 [38:21<3:21:05,  1.30s/it]

training loss: 3.312520980834961


training:  15%|█▌        | 1690/10986 [38:22<3:19:53,  1.29s/it]

training loss: 3.3105921745300293


training:  15%|█▌        | 1691/10986 [38:24<3:31:21,  1.36s/it]

training loss: 3.2831227779388428


training:  15%|█▌        | 1692/10986 [38:25<3:37:19,  1.40s/it]

training loss: 3.2868123054504395


training:  15%|█▌        | 1693/10986 [38:27<3:30:40,  1.36s/it]

training loss: 3.314213275909424


training:  15%|█▌        | 1694/10986 [38:28<3:26:48,  1.34s/it]

training loss: 3.2413032054901123


training:  15%|█▌        | 1695/10986 [38:30<3:40:10,  1.42s/it]

training loss: 3.2283546924591064


training:  15%|█▌        | 1696/10986 [38:31<3:51:50,  1.50s/it]

training loss: 3.348539352416992


training:  15%|█▌        | 1697/10986 [38:33<3:46:21,  1.46s/it]

training loss: 3.3023598194122314


training:  15%|█▌        | 1698/10986 [38:34<3:37:05,  1.40s/it]

training loss: 3.3156983852386475


training:  15%|█▌        | 1699/10986 [38:35<3:31:19,  1.37s/it]

training loss: 3.3036935329437256


training:  15%|█▌        | 1700/10986 [38:37<3:26:17,  1.33s/it]

training loss: 3.35011625289917
valid loss: 3.354973793029785
perplexity: 28.644853591918945


training:  15%|█▌        | 1701/10986 [38:39<4:29:38,  1.74s/it]

training loss: 3.265190839767456


training:  15%|█▌        | 1702/10986 [38:41<4:14:00,  1.64s/it]

training loss: 3.330920934677124


training:  16%|█▌        | 1703/10986 [38:42<3:59:57,  1.55s/it]

training loss: 3.347437858581543


training:  16%|█▌        | 1704/10986 [38:43<3:47:00,  1.47s/it]

training loss: 3.3217177391052246


training:  16%|█▌        | 1705/10986 [38:45<3:38:15,  1.41s/it]

training loss: 3.278362274169922


training:  16%|█▌        | 1706/10986 [38:46<3:31:46,  1.37s/it]

training loss: 3.3036410808563232


training:  16%|█▌        | 1707/10986 [38:47<3:27:41,  1.34s/it]

training loss: 3.293963670730591


training:  16%|█▌        | 1708/10986 [38:48<3:25:49,  1.33s/it]

training loss: 3.3192296028137207


training:  16%|█▌        | 1709/10986 [38:50<3:23:51,  1.32s/it]

training loss: 3.3921852111816406


training:  16%|█▌        | 1710/10986 [38:51<3:22:55,  1.31s/it]

training loss: 3.3083722591400146


training:  16%|█▌        | 1711/10986 [38:52<3:32:19,  1.37s/it]

training loss: 3.345473051071167


training:  16%|█▌        | 1712/10986 [38:54<3:29:46,  1.36s/it]

training loss: 3.301147699356079


training:  16%|█▌        | 1713/10986 [38:55<3:26:59,  1.34s/it]

training loss: 3.3286468982696533


training:  16%|█▌        | 1714/10986 [38:56<3:24:40,  1.32s/it]

training loss: 3.3794612884521484


training:  16%|█▌        | 1715/10986 [38:58<3:22:20,  1.31s/it]

training loss: 3.28759765625


training:  16%|█▌        | 1716/10986 [38:59<3:22:37,  1.31s/it]

training loss: 3.3346734046936035


training:  16%|█▌        | 1717/10986 [39:00<3:21:00,  1.30s/it]

training loss: 3.405744791030884


training:  16%|█▌        | 1718/10986 [39:02<3:20:44,  1.30s/it]

training loss: 3.3228561878204346


training:  16%|█▌        | 1719/10986 [39:03<3:20:45,  1.30s/it]

training loss: 3.317467212677002


training:  16%|█▌        | 1720/10986 [39:04<3:20:43,  1.30s/it]

training loss: 3.257478952407837
valid loss: 3.252753496170044
perplexity: 25.8614501953125


training:  16%|█▌        | 1721/10986 [39:07<4:27:16,  1.73s/it]

training loss: 3.3325729370117188


training:  16%|█▌        | 1722/10986 [39:08<4:11:51,  1.63s/it]

training loss: 3.3824450969696045


training:  16%|█▌        | 1723/10986 [39:10<3:58:14,  1.54s/it]

training loss: 3.277711868286133


training:  16%|█▌        | 1724/10986 [39:11<3:47:16,  1.47s/it]

training loss: 3.2936301231384277


training:  16%|█▌        | 1725/10986 [39:12<3:38:45,  1.42s/it]

training loss: 3.3551573753356934


training:  16%|█▌        | 1726/10986 [39:13<3:32:41,  1.38s/it]

training loss: 3.413289785385132


training:  16%|█▌        | 1727/10986 [39:15<3:29:28,  1.36s/it]

training loss: 3.4873950481414795


training:  16%|█▌        | 1728/10986 [39:16<3:26:27,  1.34s/it]

training loss: 3.2290592193603516


training:  16%|█▌        | 1729/10986 [39:17<3:24:53,  1.33s/it]

training loss: 3.3161394596099854


training:  16%|█▌        | 1730/10986 [39:19<3:23:22,  1.32s/it]

training loss: 3.3903188705444336


training:  16%|█▌        | 1731/10986 [39:20<3:33:53,  1.39s/it]

training loss: 3.3498423099517822


training:  16%|█▌        | 1732/10986 [39:22<3:44:56,  1.46s/it]

training loss: 3.45418381690979


training:  16%|█▌        | 1733/10986 [39:23<3:37:29,  1.41s/it]

training loss: 3.276447057723999


training:  16%|█▌        | 1734/10986 [39:24<3:31:53,  1.37s/it]

training loss: 3.3921260833740234


training:  16%|█▌        | 1735/10986 [39:26<3:26:35,  1.34s/it]

training loss: 3.3801259994506836


training:  16%|█▌        | 1736/10986 [39:27<3:24:20,  1.33s/it]

training loss: 3.3335256576538086


training:  16%|█▌        | 1737/10986 [39:28<3:22:25,  1.31s/it]

training loss: 3.3712575435638428


training:  16%|█▌        | 1738/10986 [39:30<3:20:31,  1.30s/it]

training loss: 3.309021472930908


training:  16%|█▌        | 1739/10986 [39:31<3:20:24,  1.30s/it]

training loss: 3.2440273761749268


training:  16%|█▌        | 1740/10986 [39:32<3:20:04,  1.30s/it]

training loss: 3.2445483207702637
valid loss: 3.24369478225708
perplexity: 25.628238677978516


training:  16%|█▌        | 1741/10986 [39:35<4:27:13,  1.73s/it]

training loss: 3.2819366455078125


training:  16%|█▌        | 1742/10986 [39:37<4:23:01,  1.71s/it]

training loss: 3.364769458770752


training:  16%|█▌        | 1743/10986 [39:38<4:04:36,  1.59s/it]

training loss: 3.3561599254608154


training:  16%|█▌        | 1744/10986 [39:39<3:50:39,  1.50s/it]

training loss: 3.429758071899414


training:  16%|█▌        | 1745/10986 [39:40<3:40:19,  1.43s/it]

training loss: 3.5640575885772705


training:  16%|█▌        | 1746/10986 [39:42<3:36:04,  1.40s/it]

training loss: 3.366183280944824


training:  16%|█▌        | 1747/10986 [39:43<3:31:33,  1.37s/it]

training loss: 3.3694028854370117


training:  16%|█▌        | 1748/10986 [39:44<3:27:01,  1.34s/it]

training loss: 3.2881908416748047


training:  16%|█▌        | 1749/10986 [39:46<3:24:07,  1.33s/it]

training loss: 3.4114115238189697


training:  16%|█▌        | 1750/10986 [39:47<3:21:47,  1.31s/it]

training loss: 3.433615207672119


training:  16%|█▌        | 1751/10986 [39:48<3:34:06,  1.39s/it]

training loss: 3.454624652862549


training:  16%|█▌        | 1752/10986 [39:50<3:29:08,  1.36s/it]

training loss: 3.423488140106201


training:  16%|█▌        | 1753/10986 [39:51<3:25:57,  1.34s/it]

training loss: 3.4138336181640625


training:  16%|█▌        | 1754/10986 [39:52<3:23:13,  1.32s/it]

training loss: 3.376110792160034


training:  16%|█▌        | 1755/10986 [39:54<3:21:19,  1.31s/it]

training loss: 3.33101224899292


training:  16%|█▌        | 1756/10986 [39:55<3:19:29,  1.30s/it]

training loss: 3.4077954292297363


training:  16%|█▌        | 1757/10986 [39:56<3:17:58,  1.29s/it]

training loss: 3.5174720287323


training:  16%|█▌        | 1758/10986 [39:57<3:17:58,  1.29s/it]

training loss: 3.4498801231384277


training:  16%|█▌        | 1759/10986 [39:59<3:17:02,  1.28s/it]

training loss: 3.4023921489715576


training:  16%|█▌        | 1760/10986 [40:00<3:17:05,  1.28s/it]

training loss: 3.4504382610321045
valid loss: 3.4433021545410156
perplexity: 31.290111541748047


training:  16%|█▌        | 1761/10986 [40:03<4:22:10,  1.71s/it]

training loss: 3.3250319957733154


training:  16%|█▌        | 1762/10986 [40:04<4:06:03,  1.60s/it]

training loss: 3.242459535598755


training:  16%|█▌        | 1763/10986 [40:05<3:51:59,  1.51s/it]

training loss: 3.343832492828369


training:  16%|█▌        | 1764/10986 [40:07<3:41:00,  1.44s/it]

training loss: 3.396890878677368


training:  16%|█▌        | 1765/10986 [40:08<3:34:25,  1.40s/it]

training loss: 3.351365566253662


training:  16%|█▌        | 1766/10986 [40:09<3:29:18,  1.36s/it]

training loss: 3.3649938106536865


training:  16%|█▌        | 1767/10986 [40:10<3:25:18,  1.34s/it]

training loss: 3.4275004863739014


training:  16%|█▌        | 1768/10986 [40:12<3:23:10,  1.32s/it]

training loss: 3.282423496246338


training:  16%|█▌        | 1769/10986 [40:13<3:21:18,  1.31s/it]

training loss: 3.3649561405181885


training:  16%|█▌        | 1770/10986 [40:14<3:20:04,  1.30s/it]

training loss: 3.3510828018188477


training:  16%|█▌        | 1771/10986 [40:16<3:30:31,  1.37s/it]

training loss: 3.3937370777130127


training:  16%|█▌        | 1772/10986 [40:17<3:27:12,  1.35s/it]

training loss: 3.4145283699035645


training:  16%|█▌        | 1773/10986 [40:18<3:24:27,  1.33s/it]

training loss: 3.4315505027770996


training:  16%|█▌        | 1774/10986 [40:20<3:22:19,  1.32s/it]

training loss: 3.3338513374328613


training:  16%|█▌        | 1775/10986 [40:21<3:20:40,  1.31s/it]

training loss: 3.403665542602539


training:  16%|█▌        | 1776/10986 [40:22<3:19:45,  1.30s/it]

training loss: 3.3043859004974365


training:  16%|█▌        | 1777/10986 [40:24<3:18:35,  1.29s/it]

training loss: 3.3765830993652344


training:  16%|█▌        | 1778/10986 [40:25<3:19:53,  1.30s/it]

training loss: 3.3462140560150146


training:  16%|█▌        | 1779/10986 [40:26<3:18:31,  1.29s/it]

training loss: 3.369384527206421


training:  16%|█▌        | 1780/10986 [40:27<3:18:35,  1.29s/it]

training loss: 3.3628830909729004
valid loss: 3.3540894985198975
perplexity: 28.61953353881836


training:  16%|█▌        | 1781/10986 [40:30<4:24:40,  1.73s/it]

training loss: 3.481048822402954


training:  16%|█▌        | 1782/10986 [40:32<4:13:21,  1.65s/it]

training loss: 3.379013776779175


training:  16%|█▌        | 1783/10986 [40:33<3:57:47,  1.55s/it]

training loss: 3.4695262908935547


training:  16%|█▌        | 1784/10986 [40:34<3:45:21,  1.47s/it]

training loss: 3.3498616218566895


training:  16%|█▌        | 1785/10986 [40:36<3:37:05,  1.42s/it]

training loss: 3.3692784309387207


training:  16%|█▋        | 1786/10986 [40:37<3:32:17,  1.38s/it]

training loss: 3.4408516883850098


training:  16%|█▋        | 1787/10986 [40:38<3:29:01,  1.36s/it]

training loss: 3.4783668518066406


training:  16%|█▋        | 1788/10986 [40:39<3:25:45,  1.34s/it]

training loss: 3.3563477993011475


training:  16%|█▋        | 1789/10986 [40:41<3:24:45,  1.34s/it]

training loss: 3.3485589027404785


training:  16%|█▋        | 1790/10986 [40:42<3:23:54,  1.33s/it]

training loss: 3.469421863555908


training:  16%|█▋        | 1791/10986 [40:44<3:35:45,  1.41s/it]

training loss: 3.421971559524536


training:  16%|█▋        | 1792/10986 [40:45<3:44:10,  1.46s/it]

training loss: 3.442563533782959


training:  16%|█▋        | 1793/10986 [40:47<3:36:08,  1.41s/it]

training loss: 3.4946014881134033


training:  16%|█▋        | 1794/10986 [40:48<3:30:59,  1.38s/it]

training loss: 3.4998390674591064


training:  16%|█▋        | 1795/10986 [40:49<3:26:49,  1.35s/it]

training loss: 3.4581098556518555


training:  16%|█▋        | 1796/10986 [40:50<3:24:03,  1.33s/it]

training loss: 3.4651105403900146


training:  16%|█▋        | 1797/10986 [40:52<3:21:53,  1.32s/it]

training loss: 3.316715717315674


training:  16%|█▋        | 1798/10986 [40:53<3:22:49,  1.32s/it]

training loss: 3.3497848510742188


training:  16%|█▋        | 1799/10986 [40:54<3:22:05,  1.32s/it]

training loss: 3.469794988632202


training:  16%|█▋        | 1800/10986 [40:56<3:21:04,  1.31s/it]

training loss: 3.3707242012023926
valid loss: 3.3736233711242676
perplexity: 29.184080123901367


training:  16%|█▋        | 1801/10986 [40:58<4:26:47,  1.74s/it]

training loss: 3.4532060623168945


training:  16%|█▋        | 1802/10986 [41:00<4:07:50,  1.62s/it]

training loss: 3.2841296195983887


training:  16%|█▋        | 1803/10986 [41:01<3:52:04,  1.52s/it]

training loss: 3.28897762298584


training:  16%|█▋        | 1804/10986 [41:02<3:42:00,  1.45s/it]

training loss: 3.4788200855255127


training:  16%|█▋        | 1805/10986 [41:04<3:33:58,  1.40s/it]

training loss: 3.3578977584838867


training:  16%|█▋        | 1806/10986 [41:05<3:29:12,  1.37s/it]

training loss: 3.6042864322662354


training:  16%|█▋        | 1807/10986 [41:06<3:25:23,  1.34s/it]

training loss: 3.481905460357666


training:  16%|█▋        | 1808/10986 [41:07<3:23:04,  1.33s/it]

training loss: 3.451570510864258


training:  16%|█▋        | 1809/10986 [41:09<3:20:31,  1.31s/it]

training loss: 3.3591222763061523


training:  16%|█▋        | 1810/10986 [41:10<3:19:02,  1.30s/it]

training loss: 3.4649946689605713


training:  16%|█▋        | 1811/10986 [41:12<3:29:33,  1.37s/it]

training loss: 3.387939929962158


training:  16%|█▋        | 1812/10986 [41:13<3:28:33,  1.36s/it]

training loss: 3.39388108253479


training:  17%|█▋        | 1813/10986 [41:14<3:26:48,  1.35s/it]

training loss: 3.479020357131958


training:  17%|█▋        | 1814/10986 [41:16<3:23:03,  1.33s/it]

training loss: 3.490114688873291


training:  17%|█▋        | 1815/10986 [41:17<3:21:02,  1.32s/it]

training loss: 3.389036178588867


training:  17%|█▋        | 1816/10986 [41:18<3:18:11,  1.30s/it]

training loss: 3.4155519008636475


training:  17%|█▋        | 1817/10986 [41:19<3:16:54,  1.29s/it]

training loss: 3.465115547180176


training:  17%|█▋        | 1818/10986 [41:21<3:16:24,  1.29s/it]

training loss: 3.3385732173919678


training:  17%|█▋        | 1819/10986 [41:22<3:16:39,  1.29s/it]

training loss: 3.3789491653442383


training:  17%|█▋        | 1820/10986 [41:23<3:15:13,  1.28s/it]

training loss: 3.581289052963257
valid loss: 3.581486463546753
perplexity: 35.926902770996094


training:  17%|█▋        | 1821/10986 [41:26<4:19:21,  1.70s/it]

training loss: 3.555495262145996


training:  17%|█▋        | 1822/10986 [41:28<4:18:27,  1.69s/it]

training loss: 3.250847339630127


training:  17%|█▋        | 1823/10986 [41:29<3:58:30,  1.56s/it]

training loss: 3.425049066543579


training:  17%|█▋        | 1824/10986 [41:30<3:44:17,  1.47s/it]

training loss: 3.437689781188965


training:  17%|█▋        | 1825/10986 [41:31<3:33:57,  1.40s/it]

training loss: 3.6214487552642822


training:  17%|█▋        | 1826/10986 [41:33<3:26:50,  1.35s/it]

training loss: 3.337789535522461


training:  17%|█▋        | 1827/10986 [41:34<3:22:09,  1.32s/it]

training loss: 3.4840545654296875


training:  17%|█▋        | 1828/10986 [41:35<3:19:06,  1.30s/it]

training loss: 3.4522502422332764


training:  17%|█▋        | 1829/10986 [41:36<3:17:30,  1.29s/it]

training loss: 3.502906560897827


training:  17%|█▋        | 1830/10986 [41:38<3:16:44,  1.29s/it]

training loss: 3.3865225315093994


training:  17%|█▋        | 1831/10986 [41:39<3:29:25,  1.37s/it]

training loss: 3.3976051807403564


training:  17%|█▋        | 1832/10986 [41:40<3:28:16,  1.37s/it]

training loss: 3.448098659515381


training:  17%|█▋        | 1833/10986 [41:42<3:23:38,  1.33s/it]

training loss: 3.439159631729126


training:  17%|█▋        | 1834/10986 [41:43<3:21:11,  1.32s/it]

training loss: 3.2801029682159424


training:  17%|█▋        | 1835/10986 [41:44<3:19:15,  1.31s/it]

training loss: 3.3859658241271973


training:  17%|█▋        | 1836/10986 [41:46<3:17:27,  1.29s/it]

training loss: 3.344791889190674


training:  17%|█▋        | 1837/10986 [41:47<3:16:55,  1.29s/it]

training loss: 3.391979455947876


training:  17%|█▋        | 1838/10986 [41:48<3:16:12,  1.29s/it]

training loss: 3.4559435844421387


training:  17%|█▋        | 1839/10986 [41:49<3:15:18,  1.28s/it]

training loss: 3.385637044906616


training:  17%|█▋        | 1840/10986 [41:51<3:15:24,  1.28s/it]

training loss: 3.3954668045043945
valid loss: 3.3855276107788086
perplexity: 29.533571243286133


training:  17%|█▋        | 1841/10986 [41:53<4:21:19,  1.71s/it]

training loss: 3.3801398277282715


training:  17%|█▋        | 1842/10986 [41:55<4:03:59,  1.60s/it]

training loss: 3.4766857624053955


training:  17%|█▋        | 1843/10986 [41:56<3:49:33,  1.51s/it]

training loss: 3.4544153213500977


training:  17%|█▋        | 1844/10986 [41:58<3:51:38,  1.52s/it]

training loss: 3.4044694900512695


training:  17%|█▋        | 1845/10986 [41:59<3:56:54,  1.56s/it]

training loss: 3.437685012817383


training:  17%|█▋        | 1846/10986 [42:01<3:51:08,  1.52s/it]

training loss: 3.3690948486328125


training:  17%|█▋        | 1847/10986 [42:02<3:41:05,  1.45s/it]

training loss: 3.3013925552368164


training:  17%|█▋        | 1848/10986 [42:03<3:33:37,  1.40s/it]

training loss: 3.4256668090820312


training:  17%|█▋        | 1849/10986 [42:04<3:27:15,  1.36s/it]

training loss: 3.383713483810425


training:  17%|█▋        | 1850/10986 [42:06<3:22:34,  1.33s/it]

training loss: 3.5056872367858887


training:  17%|█▋        | 1851/10986 [42:07<3:31:01,  1.39s/it]

training loss: 3.4278082847595215


training:  17%|█▋        | 1852/10986 [42:09<3:26:04,  1.35s/it]

training loss: 3.3553357124328613


training:  17%|█▋        | 1853/10986 [42:10<3:22:34,  1.33s/it]

training loss: 3.526496410369873


training:  17%|█▋        | 1854/10986 [42:11<3:19:45,  1.31s/it]

training loss: 3.38141131401062


training:  17%|█▋        | 1855/10986 [42:12<3:17:58,  1.30s/it]

training loss: 3.427738904953003


training:  17%|█▋        | 1856/10986 [42:14<3:18:23,  1.30s/it]

training loss: 3.3007192611694336


training:  17%|█▋        | 1857/10986 [42:15<3:17:16,  1.30s/it]

training loss: 3.400387763977051


training:  17%|█▋        | 1858/10986 [42:16<3:15:31,  1.29s/it]

training loss: 3.4024834632873535


training:  17%|█▋        | 1859/10986 [42:17<3:14:51,  1.28s/it]

training loss: 3.3359079360961914


training:  17%|█▋        | 1860/10986 [42:19<3:14:34,  1.28s/it]

training loss: 3.368582248687744
valid loss: 3.3608479499816895
perplexity: 28.81361198425293


training:  17%|█▋        | 1861/10986 [42:22<4:21:35,  1.72s/it]

training loss: 3.318188190460205


training:  17%|█▋        | 1862/10986 [42:23<4:04:49,  1.61s/it]

training loss: 3.4068753719329834


training:  17%|█▋        | 1863/10986 [42:24<3:51:17,  1.52s/it]

training loss: 3.4214489459991455


training:  17%|█▋        | 1864/10986 [42:25<3:40:15,  1.45s/it]

training loss: 3.386690139770508


training:  17%|█▋        | 1865/10986 [42:27<3:32:58,  1.40s/it]

training loss: 3.3817524909973145


training:  17%|█▋        | 1866/10986 [42:28<3:26:22,  1.36s/it]

training loss: 3.39935040473938


training:  17%|█▋        | 1867/10986 [42:29<3:21:59,  1.33s/it]

training loss: 3.306265354156494


training:  17%|█▋        | 1868/10986 [42:31<3:19:45,  1.31s/it]

training loss: 3.3767523765563965


training:  17%|█▋        | 1869/10986 [42:32<3:17:26,  1.30s/it]

training loss: 3.336151123046875


training:  17%|█▋        | 1870/10986 [42:33<3:16:03,  1.29s/it]

training loss: 3.3782055377960205


training:  17%|█▋        | 1871/10986 [42:35<3:26:35,  1.36s/it]

training loss: 3.336906671524048


training:  17%|█▋        | 1872/10986 [42:36<3:23:04,  1.34s/it]

training loss: 3.325653553009033


training:  17%|█▋        | 1873/10986 [42:37<3:19:43,  1.31s/it]

training loss: 3.456083297729492


training:  17%|█▋        | 1874/10986 [42:38<3:16:25,  1.29s/it]

training loss: 3.3314504623413086


training:  17%|█▋        | 1875/10986 [42:40<3:14:01,  1.28s/it]

training loss: 3.306797504425049


training:  17%|█▋        | 1876/10986 [42:41<3:13:54,  1.28s/it]

training loss: 3.332979679107666


training:  17%|█▋        | 1877/10986 [42:42<3:12:10,  1.27s/it]

training loss: 3.414078950881958


training:  17%|█▋        | 1878/10986 [42:43<3:11:41,  1.26s/it]

training loss: 3.4427947998046875


training:  17%|█▋        | 1879/10986 [42:45<3:13:17,  1.27s/it]

training loss: 3.3612568378448486


training:  17%|█▋        | 1880/10986 [42:46<3:12:17,  1.27s/it]

training loss: 3.310006856918335
valid loss: 3.304645299911499
perplexity: 27.23887825012207


training:  17%|█▋        | 1881/10986 [42:49<4:15:17,  1.68s/it]

training loss: 3.355114459991455


training:  17%|█▋        | 1882/10986 [42:50<3:58:29,  1.57s/it]

training loss: 3.426957845687866


training:  17%|█▋        | 1883/10986 [42:51<3:45:02,  1.48s/it]

training loss: 3.4365689754486084


training:  17%|█▋        | 1884/10986 [42:52<3:34:40,  1.42s/it]

training loss: 3.302372694015503


training:  17%|█▋        | 1885/10986 [42:54<3:27:27,  1.37s/it]

training loss: 3.379127264022827


training:  17%|█▋        | 1886/10986 [42:55<3:22:25,  1.33s/it]

training loss: 3.3377671241760254


training:  17%|█▋        | 1887/10986 [42:56<3:19:59,  1.32s/it]

training loss: 3.3591179847717285


training:  17%|█▋        | 1888/10986 [42:57<3:16:11,  1.29s/it]

training loss: 3.2891881465911865


training:  17%|█▋        | 1889/10986 [42:59<3:13:28,  1.28s/it]

training loss: 3.418637275695801


training:  17%|█▋        | 1890/10986 [43:00<3:12:55,  1.27s/it]

training loss: 3.491175413131714


training:  17%|█▋        | 1891/10986 [43:01<3:23:02,  1.34s/it]

training loss: 3.4507429599761963


training:  17%|█▋        | 1892/10986 [43:03<3:23:42,  1.34s/it]

training loss: 3.368274211883545


training:  17%|█▋        | 1893/10986 [43:04<3:19:30,  1.32s/it]

training loss: 3.432433843612671


training:  17%|█▋        | 1894/10986 [43:05<3:15:53,  1.29s/it]

training loss: 3.3376271724700928


training:  17%|█▋        | 1895/10986 [43:07<3:13:20,  1.28s/it]

training loss: 3.498316764831543


training:  17%|█▋        | 1896/10986 [43:08<3:11:48,  1.27s/it]

training loss: 3.4285728931427


training:  17%|█▋        | 1897/10986 [43:09<3:11:37,  1.26s/it]

training loss: 3.331326961517334


training:  17%|█▋        | 1898/10986 [43:10<3:11:01,  1.26s/it]

training loss: 3.298828601837158


training:  17%|█▋        | 1899/10986 [43:12<3:10:47,  1.26s/it]

training loss: 3.396803617477417


training:  17%|█▋        | 1900/10986 [43:13<3:12:28,  1.27s/it]

training loss: 3.3545055389404297
valid loss: 3.345665454864502
perplexity: 28.37945556640625


training:  17%|█▋        | 1901/10986 [43:16<4:14:49,  1.68s/it]

training loss: 3.3658931255340576


training:  17%|█▋        | 1902/10986 [43:17<3:59:04,  1.58s/it]

training loss: 3.538093328475952


training:  17%|█▋        | 1903/10986 [43:18<3:45:14,  1.49s/it]

training loss: 3.355475664138794


training:  17%|█▋        | 1904/10986 [43:19<3:34:12,  1.42s/it]

training loss: 3.3243489265441895


training:  17%|█▋        | 1905/10986 [43:21<3:26:40,  1.37s/it]

training loss: 3.4067845344543457


training:  17%|█▋        | 1906/10986 [43:22<3:21:56,  1.33s/it]

training loss: 3.434547185897827


training:  17%|█▋        | 1907/10986 [43:23<3:17:31,  1.31s/it]

training loss: 3.361154317855835


training:  17%|█▋        | 1908/10986 [43:24<3:15:02,  1.29s/it]

training loss: 3.4635322093963623


training:  17%|█▋        | 1909/10986 [43:26<3:14:28,  1.29s/it]

training loss: 3.430994987487793


training:  17%|█▋        | 1910/10986 [43:27<3:12:45,  1.27s/it]

training loss: 3.329000234603882


training:  17%|█▋        | 1911/10986 [43:28<3:23:15,  1.34s/it]

training loss: 3.393679618835449


training:  17%|█▋        | 1912/10986 [43:30<3:23:14,  1.34s/it]

training loss: 3.3171629905700684


training:  17%|█▋        | 1913/10986 [43:31<3:19:38,  1.32s/it]

training loss: 3.434642791748047


training:  17%|█▋        | 1914/10986 [43:32<3:17:58,  1.31s/it]

training loss: 3.450866460800171


training:  17%|█▋        | 1915/10986 [43:34<3:18:01,  1.31s/it]

training loss: 3.454252243041992


training:  17%|█▋        | 1916/10986 [43:35<3:15:49,  1.30s/it]

training loss: 3.4941117763519287


training:  17%|█▋        | 1917/10986 [43:36<3:13:44,  1.28s/it]

training loss: 3.2781505584716797


training:  17%|█▋        | 1918/10986 [43:37<3:12:22,  1.27s/it]

training loss: 3.443272113800049


training:  17%|█▋        | 1919/10986 [43:39<3:12:34,  1.27s/it]

training loss: 3.3490536212921143


training:  17%|█▋        | 1920/10986 [43:40<3:12:51,  1.28s/it]

training loss: 3.563370704650879
valid loss: 3.572376251220703
perplexity: 35.60108947753906


training:  17%|█▋        | 1921/10986 [43:43<4:17:41,  1.71s/it]

training loss: 3.407744884490967


training:  17%|█▋        | 1922/10986 [43:44<4:03:43,  1.61s/it]

training loss: 3.4637742042541504


training:  18%|█▊        | 1923/10986 [43:45<3:51:29,  1.53s/it]

training loss: 3.4815802574157715


training:  18%|█▊        | 1924/10986 [43:47<3:41:23,  1.47s/it]

training loss: 3.488175392150879


training:  18%|█▊        | 1925/10986 [43:48<3:33:49,  1.42s/it]

training loss: 3.368955612182617


training:  18%|█▊        | 1926/10986 [43:49<3:27:52,  1.38s/it]

training loss: 3.537553310394287


training:  18%|█▊        | 1927/10986 [43:51<3:24:27,  1.35s/it]

training loss: 3.4114959239959717


training:  18%|█▊        | 1928/10986 [43:52<3:21:55,  1.34s/it]

training loss: 3.5287086963653564


training:  18%|█▊        | 1929/10986 [43:53<3:19:45,  1.32s/it]

training loss: 3.6294217109680176


training:  18%|█▊        | 1930/10986 [43:54<3:18:19,  1.31s/it]

training loss: 3.5550687313079834


training:  18%|█▊        | 1931/10986 [43:56<3:27:47,  1.38s/it]

training loss: 3.387746572494507


training:  18%|█▊        | 1932/10986 [43:57<3:24:50,  1.36s/it]

training loss: 3.3472232818603516


training:  18%|█▊        | 1933/10986 [43:59<3:22:05,  1.34s/it]

training loss: 3.6011714935302734


training:  18%|█▊        | 1934/10986 [44:00<3:19:45,  1.32s/it]

training loss: 3.6441287994384766


training:  18%|█▊        | 1935/10986 [44:01<3:18:27,  1.32s/it]

training loss: 3.561267137527466


training:  18%|█▊        | 1936/10986 [44:02<3:18:06,  1.31s/it]

training loss: 3.423793077468872


training:  18%|█▊        | 1937/10986 [44:04<3:17:32,  1.31s/it]

training loss: 3.411850929260254


training:  18%|█▊        | 1938/10986 [44:05<3:17:31,  1.31s/it]

training loss: 3.43778395652771


training:  18%|█▊        | 1939/10986 [44:06<3:16:41,  1.30s/it]

training loss: 3.482506275177002


training:  18%|█▊        | 1940/10986 [44:08<3:16:02,  1.30s/it]

training loss: 3.400515556335449
valid loss: 3.4024219512939453
perplexity: 30.036758422851562


training:  18%|█▊        | 1941/10986 [44:10<4:19:46,  1.72s/it]

training loss: 3.456000804901123


training:  18%|█▊        | 1942/10986 [44:12<4:06:28,  1.64s/it]

training loss: 3.401374101638794


training:  18%|█▊        | 1943/10986 [44:13<3:51:12,  1.53s/it]

training loss: 3.357586145401001


training:  18%|█▊        | 1944/10986 [44:14<3:40:13,  1.46s/it]

training loss: 3.3907735347747803


training:  18%|█▊        | 1945/10986 [44:16<3:33:41,  1.42s/it]

training loss: 3.407489538192749


training:  18%|█▊        | 1946/10986 [44:17<3:27:52,  1.38s/it]

training loss: 3.3544533252716064


training:  18%|█▊        | 1947/10986 [44:18<3:23:26,  1.35s/it]

training loss: 3.436821222305298


training:  18%|█▊        | 1948/10986 [44:20<3:19:56,  1.33s/it]

training loss: 3.58514142036438


training:  18%|█▊        | 1949/10986 [44:21<3:18:30,  1.32s/it]

training loss: 3.432807445526123


training:  18%|█▊        | 1950/10986 [44:22<3:17:55,  1.31s/it]

training loss: 3.4677493572235107


training:  18%|█▊        | 1951/10986 [44:24<3:27:24,  1.38s/it]

training loss: 3.3127739429473877


training:  18%|█▊        | 1952/10986 [44:25<3:36:55,  1.44s/it]

training loss: 3.3141279220581055


training:  18%|█▊        | 1953/10986 [44:27<3:31:04,  1.40s/it]

training loss: 3.446816921234131


training:  18%|█▊        | 1954/10986 [44:28<3:25:16,  1.36s/it]

training loss: 3.4663877487182617


training:  18%|█▊        | 1955/10986 [44:29<3:21:31,  1.34s/it]

training loss: 3.3625729084014893


training:  18%|█▊        | 1956/10986 [44:30<3:18:09,  1.32s/it]

training loss: 3.4034550189971924


training:  18%|█▊        | 1957/10986 [44:32<3:17:00,  1.31s/it]

training loss: 3.391545534133911


training:  18%|█▊        | 1958/10986 [44:33<3:15:09,  1.30s/it]

training loss: 3.462808847427368


training:  18%|█▊        | 1959/10986 [44:34<3:13:43,  1.29s/it]

training loss: 3.3508455753326416


training:  18%|█▊        | 1960/10986 [44:36<3:13:45,  1.29s/it]

training loss: 3.489546298980713
valid loss: 3.48407244682312
perplexity: 32.59218215942383


training:  18%|█▊        | 1961/10986 [44:38<4:14:39,  1.69s/it]

training loss: 3.3546302318573


training:  18%|█▊        | 1962/10986 [44:40<3:58:46,  1.59s/it]

training loss: 3.259744167327881


training:  18%|█▊        | 1963/10986 [44:41<3:44:23,  1.49s/it]

training loss: 3.328444719314575


training:  18%|█▊        | 1964/10986 [44:42<3:34:12,  1.42s/it]

training loss: 3.4288694858551025


training:  18%|█▊        | 1965/10986 [44:43<3:26:20,  1.37s/it]

training loss: 3.374166488647461


training:  18%|█▊        | 1966/10986 [44:45<3:22:09,  1.34s/it]

training loss: 3.4341769218444824


training:  18%|█▊        | 1967/10986 [44:46<3:20:07,  1.33s/it]

training loss: 3.3667354583740234


training:  18%|█▊        | 1968/10986 [44:47<3:17:15,  1.31s/it]

training loss: 3.3417623043060303


training:  18%|█▊        | 1969/10986 [44:48<3:15:10,  1.30s/it]

training loss: 3.31204891204834


training:  18%|█▊        | 1970/10986 [44:50<3:13:30,  1.29s/it]

training loss: 3.285741090774536


training:  18%|█▊        | 1971/10986 [44:51<3:24:36,  1.36s/it]

training loss: 3.282224655151367


training:  18%|█▊        | 1972/10986 [44:53<3:21:34,  1.34s/it]

training loss: 3.39092755317688


training:  18%|█▊        | 1973/10986 [44:54<3:18:13,  1.32s/it]

training loss: 3.461069345474243


training:  18%|█▊        | 1974/10986 [44:55<3:15:49,  1.30s/it]

training loss: 3.354856014251709


training:  18%|█▊        | 1975/10986 [44:56<3:13:04,  1.29s/it]

training loss: 3.5372161865234375


training:  18%|█▊        | 1976/10986 [44:58<3:10:11,  1.27s/it]

training loss: 3.381193161010742


training:  18%|█▊        | 1977/10986 [44:59<3:09:59,  1.27s/it]

training loss: 3.2839672565460205


training:  18%|█▊        | 1978/10986 [45:00<3:09:31,  1.26s/it]

training loss: 3.4368655681610107


training:  18%|█▊        | 1979/10986 [45:01<3:09:25,  1.26s/it]

training loss: 3.315554618835449


training:  18%|█▊        | 1980/10986 [45:03<3:09:05,  1.26s/it]

training loss: 3.429717540740967
valid loss: 3.4322047233581543
perplexity: 30.944793701171875


training:  18%|█▊        | 1981/10986 [45:05<4:12:14,  1.68s/it]

training loss: 3.3468332290649414


training:  18%|█▊        | 1982/10986 [45:07<3:58:12,  1.59s/it]

training loss: 3.348087787628174


training:  18%|█▊        | 1983/10986 [45:08<3:43:54,  1.49s/it]

training loss: 3.3283140659332275


training:  18%|█▊        | 1984/10986 [45:09<3:32:46,  1.42s/it]

training loss: 3.384441614151001


training:  18%|█▊        | 1985/10986 [45:10<3:25:47,  1.37s/it]

training loss: 3.347522497177124


training:  18%|█▊        | 1986/10986 [45:12<3:20:35,  1.34s/it]

training loss: 3.35918927192688


training:  18%|█▊        | 1987/10986 [45:13<3:16:20,  1.31s/it]

training loss: 3.4861836433410645


training:  18%|█▊        | 1988/10986 [45:14<3:13:23,  1.29s/it]

training loss: 3.4058117866516113


training:  18%|█▊        | 1989/10986 [45:15<3:12:56,  1.29s/it]

training loss: 3.3334240913391113


training:  18%|█▊        | 1990/10986 [45:17<3:13:29,  1.29s/it]

training loss: 3.3782894611358643


training:  18%|█▊        | 1991/10986 [45:18<3:24:01,  1.36s/it]

training loss: 3.3400187492370605


training:  18%|█▊        | 1992/10986 [45:19<3:20:07,  1.34s/it]

training loss: 3.4724488258361816


training:  18%|█▊        | 1993/10986 [45:21<3:18:35,  1.33s/it]

training loss: 3.315929651260376


training:  18%|█▊        | 1994/10986 [45:22<3:16:17,  1.31s/it]

training loss: 3.419241189956665


training:  18%|█▊        | 1995/10986 [45:23<3:14:31,  1.30s/it]

training loss: 3.308321237564087


training:  18%|█▊        | 1996/10986 [45:25<3:14:06,  1.30s/it]

training loss: 3.3898634910583496


training:  18%|█▊        | 1997/10986 [45:26<3:13:26,  1.29s/it]

training loss: 3.503370523452759


training:  18%|█▊        | 1998/10986 [45:28<3:30:56,  1.41s/it]

training loss: 3.435081720352173


training:  18%|█▊        | 1999/10986 [45:29<3:44:02,  1.50s/it]

training loss: 3.4089698791503906


training:  18%|█▊        | 2000/10986 [45:31<3:37:06,  1.45s/it]

training loss: 3.4704763889312744
valid loss: 3.464466094970703
perplexity: 31.959392547607422


training:  18%|█▊        | 2001/10986 [45:33<4:33:34,  1.83s/it]

training loss: 3.2534337043762207


training:  18%|█▊        | 2002/10986 [45:35<4:13:20,  1.69s/it]

training loss: 3.4218311309814453


training:  18%|█▊        | 2003/10986 [45:36<3:54:45,  1.57s/it]

training loss: 3.3514785766601562


training:  18%|█▊        | 2004/10986 [45:37<3:41:27,  1.48s/it]

training loss: 3.3200483322143555


training:  18%|█▊        | 2005/10986 [45:39<3:32:30,  1.42s/it]

training loss: 3.407050371170044


training:  18%|█▊        | 2006/10986 [45:40<3:26:13,  1.38s/it]

training loss: 3.3382740020751953


training:  18%|█▊        | 2007/10986 [45:41<3:22:44,  1.35s/it]

training loss: 3.4016239643096924


training:  18%|█▊        | 2008/10986 [45:42<3:19:00,  1.33s/it]

training loss: 3.3595786094665527


training:  18%|█▊        | 2009/10986 [45:44<3:16:40,  1.31s/it]

training loss: 3.308332920074463


training:  18%|█▊        | 2010/10986 [45:45<3:14:51,  1.30s/it]

training loss: 3.299360513687134


training:  18%|█▊        | 2011/10986 [45:47<3:27:15,  1.39s/it]

training loss: 3.452410936355591


training:  18%|█▊        | 2012/10986 [45:48<3:23:55,  1.36s/it]

training loss: 3.247069835662842


training:  18%|█▊        | 2013/10986 [45:49<3:20:28,  1.34s/it]

training loss: 3.376972198486328


training:  18%|█▊        | 2014/10986 [45:50<3:17:25,  1.32s/it]

training loss: 3.2974934577941895


training:  18%|█▊        | 2015/10986 [45:52<3:16:48,  1.32s/it]

training loss: 3.3637778759002686


training:  18%|█▊        | 2016/10986 [45:53<3:15:11,  1.31s/it]

training loss: 3.433131694793701


training:  18%|█▊        | 2017/10986 [45:54<3:15:23,  1.31s/it]

training loss: 3.333113670349121


training:  18%|█▊        | 2018/10986 [45:56<3:14:39,  1.30s/it]

training loss: 3.300506114959717


training:  18%|█▊        | 2019/10986 [45:57<3:12:26,  1.29s/it]

training loss: 3.3236074447631836


training:  18%|█▊        | 2020/10986 [45:58<3:11:43,  1.28s/it]

training loss: 3.3189857006073
valid loss: 3.3195912837982178
perplexity: 27.6490478515625


training:  18%|█▊        | 2021/10986 [46:01<4:14:58,  1.71s/it]

training loss: 3.3240463733673096


training:  18%|█▊        | 2022/10986 [46:02<4:01:52,  1.62s/it]

training loss: 3.318967580795288


training:  18%|█▊        | 2023/10986 [46:04<3:46:49,  1.52s/it]

training loss: 3.34383487701416


training:  18%|█▊        | 2024/10986 [46:05<3:36:24,  1.45s/it]

training loss: 3.3167500495910645


training:  18%|█▊        | 2025/10986 [46:06<3:29:05,  1.40s/it]

training loss: 3.314340114593506


training:  18%|█▊        | 2026/10986 [46:07<3:23:25,  1.36s/it]

training loss: 3.2374441623687744


training:  18%|█▊        | 2027/10986 [46:09<3:20:57,  1.35s/it]

training loss: 3.4210047721862793


training:  18%|█▊        | 2028/10986 [46:10<3:17:19,  1.32s/it]

training loss: 3.502011299133301


training:  18%|█▊        | 2029/10986 [46:11<3:14:54,  1.31s/it]

training loss: 3.2369654178619385


training:  18%|█▊        | 2030/10986 [46:12<3:14:07,  1.30s/it]

training loss: 3.389420986175537


training:  18%|█▊        | 2031/10986 [46:14<3:24:32,  1.37s/it]

training loss: 3.253239631652832


training:  18%|█▊        | 2032/10986 [46:16<3:32:49,  1.43s/it]

training loss: 3.2895636558532715


training:  19%|█▊        | 2033/10986 [46:17<3:25:56,  1.38s/it]

training loss: 3.3822991847991943


training:  19%|█▊        | 2034/10986 [46:18<3:20:50,  1.35s/it]

training loss: 3.487544536590576


training:  19%|█▊        | 2035/10986 [46:19<3:17:24,  1.32s/it]

training loss: 3.298410177230835


training:  19%|█▊        | 2036/10986 [46:21<3:15:23,  1.31s/it]

training loss: 3.33577036857605


training:  19%|█▊        | 2037/10986 [46:22<3:13:51,  1.30s/it]

training loss: 3.230386734008789


training:  19%|█▊        | 2038/10986 [46:23<3:12:51,  1.29s/it]

training loss: 3.319938898086548


training:  19%|█▊        | 2039/10986 [46:25<3:13:43,  1.30s/it]

training loss: 3.3810276985168457


training:  19%|█▊        | 2040/10986 [46:26<3:13:10,  1.30s/it]

training loss: 3.3623621463775635
valid loss: 3.368079900741577
perplexity: 29.022747039794922


training:  19%|█▊        | 2041/10986 [46:28<4:13:48,  1.70s/it]

training loss: 3.320822238922119


training:  19%|█▊        | 2042/10986 [46:30<3:58:56,  1.60s/it]

training loss: 3.25125789642334


training:  19%|█▊        | 2043/10986 [46:31<3:45:05,  1.51s/it]

training loss: 3.355335235595703


training:  19%|█▊        | 2044/10986 [46:32<3:35:04,  1.44s/it]

training loss: 3.384061336517334


training:  19%|█▊        | 2045/10986 [46:34<3:27:19,  1.39s/it]

training loss: 3.3163347244262695


training:  19%|█▊        | 2046/10986 [46:35<3:22:24,  1.36s/it]

training loss: 3.30248761177063


training:  19%|█▊        | 2047/10986 [46:36<3:18:30,  1.33s/it]

training loss: 3.375251054763794


training:  19%|█▊        | 2048/10986 [46:38<3:16:12,  1.32s/it]

training loss: 3.3289780616760254


training:  19%|█▊        | 2049/10986 [46:39<3:15:09,  1.31s/it]

training loss: 3.3351471424102783


training:  19%|█▊        | 2050/10986 [46:40<3:14:34,  1.31s/it]

training loss: 3.3356094360351562


training:  19%|█▊        | 2051/10986 [46:42<3:25:31,  1.38s/it]

training loss: 3.3833367824554443


training:  19%|█▊        | 2052/10986 [46:43<3:21:56,  1.36s/it]

training loss: 3.344409704208374


training:  19%|█▊        | 2053/10986 [46:44<3:19:10,  1.34s/it]

training loss: 3.436426877975464


training:  19%|█▊        | 2054/10986 [46:46<3:17:33,  1.33s/it]

training loss: 3.306567668914795


training:  19%|█▊        | 2055/10986 [46:47<3:15:33,  1.31s/it]

training loss: 3.291471481323242


training:  19%|█▊        | 2056/10986 [46:48<3:14:01,  1.30s/it]

training loss: 3.415923595428467


training:  19%|█▊        | 2057/10986 [46:49<3:14:26,  1.31s/it]

training loss: 3.4534010887145996


training:  19%|█▊        | 2058/10986 [46:51<3:13:44,  1.30s/it]

training loss: 3.421447992324829


training:  19%|█▊        | 2059/10986 [46:52<3:12:23,  1.29s/it]

training loss: 3.3402390480041504


training:  19%|█▉        | 2060/10986 [46:53<3:12:20,  1.29s/it]

training loss: 3.4269466400146484
valid loss: 3.424999475479126
perplexity: 30.722627639770508


training:  19%|█▉        | 2061/10986 [46:56<4:18:09,  1.74s/it]

training loss: 3.413524866104126


training:  19%|█▉        | 2062/10986 [46:58<4:05:27,  1.65s/it]

training loss: 3.4143903255462646


training:  19%|█▉        | 2063/10986 [46:59<3:49:02,  1.54s/it]

training loss: 3.349574565887451


training:  19%|█▉        | 2064/10986 [47:00<3:37:43,  1.46s/it]

training loss: 3.390650749206543


training:  19%|█▉        | 2065/10986 [47:01<3:28:25,  1.40s/it]

training loss: 3.4049127101898193


training:  19%|█▉        | 2066/10986 [47:03<3:22:16,  1.36s/it]

training loss: 3.3245527744293213


training:  19%|█▉        | 2067/10986 [47:04<3:17:54,  1.33s/it]

training loss: 3.4088284969329834


training:  19%|█▉        | 2068/10986 [47:05<3:14:49,  1.31s/it]

training loss: 3.3375964164733887


training:  19%|█▉        | 2069/10986 [47:06<3:13:00,  1.30s/it]

training loss: 3.449636459350586


training:  19%|█▉        | 2070/10986 [47:08<3:12:10,  1.29s/it]

training loss: 3.5145201683044434


training:  19%|█▉        | 2071/10986 [47:09<3:24:39,  1.38s/it]

training loss: 3.388171672821045


training:  19%|█▉        | 2072/10986 [47:11<3:22:27,  1.36s/it]

training loss: 3.461212635040283


training:  19%|█▉        | 2073/10986 [47:12<3:18:57,  1.34s/it]

training loss: 3.271970510482788


training:  19%|█▉        | 2074/10986 [47:13<3:16:19,  1.32s/it]

training loss: 3.498974084854126


training:  19%|█▉        | 2075/10986 [47:14<3:14:39,  1.31s/it]

training loss: 3.3203353881835938


training:  19%|█▉        | 2076/10986 [47:16<3:12:43,  1.30s/it]

training loss: 3.313786506652832


training:  19%|█▉        | 2077/10986 [47:17<3:12:59,  1.30s/it]

training loss: 3.4837710857391357


training:  19%|█▉        | 2078/10986 [47:18<3:12:05,  1.29s/it]

training loss: 3.390571355819702


training:  19%|█▉        | 2079/10986 [47:20<3:11:52,  1.29s/it]

training loss: 3.3534533977508545


training:  19%|█▉        | 2080/10986 [47:21<3:11:20,  1.29s/it]

training loss: 3.3437819480895996
valid loss: 3.348397970199585
perplexity: 28.457107543945312


training:  19%|█▉        | 2081/10986 [47:24<4:13:56,  1.71s/it]

training loss: 3.447636127471924


training:  19%|█▉        | 2082/10986 [47:25<4:10:17,  1.69s/it]

training loss: 3.4740469455718994


training:  19%|█▉        | 2083/10986 [47:26<3:53:38,  1.57s/it]

training loss: 3.4672293663024902


training:  19%|█▉        | 2084/10986 [47:28<3:40:31,  1.49s/it]

training loss: 3.3751540184020996


training:  19%|█▉        | 2085/10986 [47:29<3:30:53,  1.42s/it]

training loss: 3.387838363647461


training:  19%|█▉        | 2086/10986 [47:30<3:23:50,  1.37s/it]

training loss: 3.3518717288970947


training:  19%|█▉        | 2087/10986 [47:32<3:19:11,  1.34s/it]

training loss: 3.4751968383789062


training:  19%|█▉        | 2088/10986 [47:33<3:15:30,  1.32s/it]

training loss: 3.4321885108947754


training:  19%|█▉        | 2089/10986 [47:34<3:13:39,  1.31s/it]

training loss: 3.3593807220458984


training:  19%|█▉        | 2090/10986 [47:35<3:12:14,  1.30s/it]

training loss: 3.415811061859131


training:  19%|█▉        | 2091/10986 [47:37<3:22:48,  1.37s/it]

training loss: 3.490135908126831


training:  19%|█▉        | 2092/10986 [47:39<3:36:27,  1.46s/it]

training loss: 3.3085458278656006


training:  19%|█▉        | 2093/10986 [47:40<3:28:59,  1.41s/it]

training loss: 3.405940055847168


training:  19%|█▉        | 2094/10986 [47:41<3:23:27,  1.37s/it]

training loss: 3.2942025661468506


training:  19%|█▉        | 2095/10986 [47:42<3:19:29,  1.35s/it]

training loss: 3.3805952072143555


training:  19%|█▉        | 2096/10986 [47:44<3:17:16,  1.33s/it]

training loss: 3.308218002319336


training:  19%|█▉        | 2097/10986 [47:45<3:14:59,  1.32s/it]

training loss: 3.3239290714263916


training:  19%|█▉        | 2098/10986 [47:46<3:13:22,  1.31s/it]

training loss: 3.3484060764312744


training:  19%|█▉        | 2099/10986 [47:48<3:12:21,  1.30s/it]

training loss: 3.4184751510620117


training:  19%|█▉        | 2100/10986 [47:49<3:11:04,  1.29s/it]

training loss: 3.3985748291015625
valid loss: 3.397578716278076
perplexity: 29.891637802124023


training:  19%|█▉        | 2101/10986 [47:52<4:15:39,  1.73s/it]

training loss: 3.3701231479644775


training:  19%|█▉        | 2102/10986 [47:53<4:13:01,  1.71s/it]

training loss: 3.361267566680908


training:  19%|█▉        | 2103/10986 [47:55<3:54:28,  1.58s/it]

training loss: 3.388545036315918


training:  19%|█▉        | 2104/10986 [47:56<3:41:05,  1.49s/it]

training loss: 3.440329074859619


training:  19%|█▉        | 2105/10986 [47:57<3:31:33,  1.43s/it]

training loss: 3.353943347930908


training:  19%|█▉        | 2106/10986 [47:58<3:25:22,  1.39s/it]

training loss: 3.357283115386963


training:  19%|█▉        | 2107/10986 [48:00<3:21:00,  1.36s/it]

training loss: 3.403451919555664


training:  19%|█▉        | 2108/10986 [48:01<3:19:32,  1.35s/it]

training loss: 3.4166860580444336


training:  19%|█▉        | 2109/10986 [48:02<3:17:56,  1.34s/it]

training loss: 3.39058256149292


training:  19%|█▉        | 2110/10986 [48:04<3:15:24,  1.32s/it]

training loss: 3.2588255405426025


training:  19%|█▉        | 2111/10986 [48:05<3:26:14,  1.39s/it]

training loss: 3.31502628326416


training:  19%|█▉        | 2112/10986 [48:07<3:23:19,  1.37s/it]

training loss: 3.363994598388672


training:  19%|█▉        | 2113/10986 [48:08<3:20:16,  1.35s/it]

training loss: 3.446735382080078


training:  19%|█▉        | 2114/10986 [48:09<3:18:04,  1.34s/it]

training loss: 3.369234085083008


training:  19%|█▉        | 2115/10986 [48:10<3:14:58,  1.32s/it]

training loss: 3.407823085784912


training:  19%|█▉        | 2116/10986 [48:12<3:13:37,  1.31s/it]

training loss: 3.363341808319092


training:  19%|█▉        | 2117/10986 [48:13<3:12:14,  1.30s/it]

training loss: 3.3735837936401367


training:  19%|█▉        | 2118/10986 [48:14<3:11:40,  1.30s/it]

training loss: 3.4005236625671387


training:  19%|█▉        | 2119/10986 [48:16<3:11:04,  1.29s/it]

training loss: 3.2969417572021484


training:  19%|█▉        | 2120/10986 [48:17<3:10:14,  1.29s/it]

training loss: 3.355729579925537
valid loss: 3.3434126377105713
perplexity: 28.315593719482422


training:  19%|█▉        | 2121/10986 [48:20<4:11:55,  1.71s/it]

training loss: 3.4154651165008545


training:  19%|█▉        | 2122/10986 [48:21<4:14:18,  1.72s/it]

training loss: 3.3006246089935303


training:  19%|█▉        | 2123/10986 [48:23<3:55:46,  1.60s/it]

training loss: 3.5172905921936035


training:  19%|█▉        | 2124/10986 [48:24<3:41:50,  1.50s/it]

training loss: 3.390477180480957


training:  19%|█▉        | 2125/10986 [48:25<3:32:49,  1.44s/it]

training loss: 3.3353283405303955


training:  19%|█▉        | 2126/10986 [48:26<3:25:15,  1.39s/it]

training loss: 3.395519971847534


training:  19%|█▉        | 2127/10986 [48:28<3:20:04,  1.36s/it]

training loss: 3.408433198928833


training:  19%|█▉        | 2128/10986 [48:29<3:17:01,  1.33s/it]

training loss: 3.3848519325256348


training:  19%|█▉        | 2129/10986 [48:30<3:15:43,  1.33s/it]

training loss: 3.3295531272888184


training:  19%|█▉        | 2130/10986 [48:32<3:14:51,  1.32s/it]

training loss: 3.3663265705108643


training:  19%|█▉        | 2131/10986 [48:33<3:24:58,  1.39s/it]

training loss: 3.317600727081299


training:  19%|█▉        | 2132/10986 [48:35<3:32:58,  1.44s/it]

training loss: 3.4106006622314453


training:  19%|█▉        | 2133/10986 [48:36<3:25:59,  1.40s/it]

training loss: 3.4629037380218506


training:  19%|█▉        | 2134/10986 [48:37<3:21:25,  1.37s/it]

training loss: 3.442500352859497


training:  19%|█▉        | 2135/10986 [48:39<3:17:08,  1.34s/it]

training loss: 3.4597537517547607


training:  19%|█▉        | 2136/10986 [48:40<3:15:31,  1.33s/it]

training loss: 3.499436616897583


training:  19%|█▉        | 2137/10986 [48:41<3:13:09,  1.31s/it]

training loss: 3.4688026905059814


training:  19%|█▉        | 2138/10986 [48:42<3:11:42,  1.30s/it]

training loss: 3.4386179447174072


training:  19%|█▉        | 2139/10986 [48:44<3:10:08,  1.29s/it]

training loss: 3.396615743637085


training:  19%|█▉        | 2140/10986 [48:45<3:09:28,  1.29s/it]

training loss: 3.473557472229004
valid loss: 3.4588112831115723
perplexity: 31.779178619384766


training:  19%|█▉        | 2141/10986 [48:48<4:11:40,  1.71s/it]

training loss: 3.48077654838562


training:  19%|█▉        | 2142/10986 [48:49<3:56:59,  1.61s/it]

training loss: 3.4551682472229004


training:  20%|█▉        | 2143/10986 [48:50<3:42:57,  1.51s/it]

training loss: 3.374654769897461


training:  20%|█▉        | 2144/10986 [48:52<3:34:10,  1.45s/it]

training loss: 3.438244342803955


training:  20%|█▉        | 2145/10986 [48:53<3:26:34,  1.40s/it]

training loss: 3.4475326538085938


training:  20%|█▉        | 2146/10986 [48:54<3:22:27,  1.37s/it]

training loss: 3.2592995166778564


training:  20%|█▉        | 2147/10986 [48:56<3:25:58,  1.40s/it]

training loss: 3.4019241333007812


training:  20%|█▉        | 2148/10986 [48:57<3:38:12,  1.48s/it]

training loss: 3.386086940765381


training:  20%|█▉        | 2149/10986 [48:59<3:41:27,  1.50s/it]

training loss: 3.310741662979126


training:  20%|█▉        | 2150/10986 [49:00<3:31:46,  1.44s/it]

training loss: 3.433689594268799


training:  20%|█▉        | 2151/10986 [49:02<3:36:25,  1.47s/it]

training loss: 3.399165391921997


training:  20%|█▉        | 2152/10986 [49:03<3:29:36,  1.42s/it]

training loss: 3.3412387371063232


training:  20%|█▉        | 2153/10986 [49:04<3:22:38,  1.38s/it]

training loss: 3.392617702484131


training:  20%|█▉        | 2154/10986 [49:06<3:17:56,  1.34s/it]

training loss: 3.4501945972442627


training:  20%|█▉        | 2155/10986 [49:07<3:15:33,  1.33s/it]

training loss: 3.2904043197631836


training:  20%|█▉        | 2156/10986 [49:08<3:13:59,  1.32s/it]

training loss: 3.390199899673462


training:  20%|█▉        | 2157/10986 [49:09<3:12:13,  1.31s/it]

training loss: 3.3526995182037354


training:  20%|█▉        | 2158/10986 [49:11<3:11:28,  1.30s/it]

training loss: 3.5388691425323486


training:  20%|█▉        | 2159/10986 [49:12<3:10:50,  1.30s/it]

training loss: 3.3602499961853027


training:  20%|█▉        | 2160/10986 [49:13<3:10:18,  1.29s/it]

training loss: 3.508765697479248
valid loss: 3.500387191772461
perplexity: 33.128273010253906


training:  20%|█▉        | 2161/10986 [49:16<4:11:55,  1.71s/it]

training loss: 3.3432393074035645


training:  20%|█▉        | 2162/10986 [49:17<3:57:41,  1.62s/it]

training loss: 3.284576892852783


training:  20%|█▉        | 2163/10986 [49:19<3:44:04,  1.52s/it]

training loss: 3.388762950897217


training:  20%|█▉        | 2164/10986 [49:20<3:33:20,  1.45s/it]

training loss: 3.384383201599121


training:  20%|█▉        | 2165/10986 [49:21<3:28:47,  1.42s/it]

training loss: 3.4342286586761475


training:  20%|█▉        | 2166/10986 [49:23<3:23:31,  1.38s/it]

training loss: 3.4445950984954834


training:  20%|█▉        | 2167/10986 [49:24<3:19:47,  1.36s/it]

training loss: 3.38179087638855


training:  20%|█▉        | 2168/10986 [49:25<3:16:31,  1.34s/it]

training loss: 3.3508501052856445


training:  20%|█▉        | 2169/10986 [49:27<3:15:15,  1.33s/it]

training loss: 3.2685494422912598


training:  20%|█▉        | 2170/10986 [49:28<3:12:47,  1.31s/it]

training loss: 3.3304824829101562


training:  20%|█▉        | 2171/10986 [49:29<3:23:54,  1.39s/it]

training loss: 3.3259313106536865


training:  20%|█▉        | 2172/10986 [49:31<3:19:53,  1.36s/it]

training loss: 3.3660545349121094


training:  20%|█▉        | 2173/10986 [49:32<3:17:00,  1.34s/it]

training loss: 3.3075249195098877


training:  20%|█▉        | 2174/10986 [49:33<3:14:59,  1.33s/it]

training loss: 3.3101747035980225


training:  20%|█▉        | 2175/10986 [49:35<3:13:19,  1.32s/it]

training loss: 3.4788758754730225


training:  20%|█▉        | 2176/10986 [49:36<3:11:22,  1.30s/it]

training loss: 3.324969530105591


training:  20%|█▉        | 2177/10986 [49:37<3:10:39,  1.30s/it]

training loss: 3.2625348567962646


training:  20%|█▉        | 2178/10986 [49:38<3:10:24,  1.30s/it]

training loss: 3.445427656173706


training:  20%|█▉        | 2179/10986 [49:40<3:09:23,  1.29s/it]

training loss: 3.447550058364868


training:  20%|█▉        | 2180/10986 [49:41<3:08:41,  1.29s/it]

training loss: 3.4585330486297607
valid loss: 3.453723907470703
perplexity: 31.617916107177734


training:  20%|█▉        | 2181/10986 [49:44<4:10:41,  1.71s/it]

training loss: 3.3538687229156494


training:  20%|█▉        | 2182/10986 [49:45<3:57:31,  1.62s/it]

training loss: 3.3493881225585938


training:  20%|█▉        | 2183/10986 [49:46<3:42:09,  1.51s/it]

training loss: 3.3863320350646973


training:  20%|█▉        | 2184/10986 [49:48<3:32:17,  1.45s/it]

training loss: 3.3656179904937744


training:  20%|█▉        | 2185/10986 [49:49<3:24:43,  1.40s/it]

training loss: 3.5053532123565674


training:  20%|█▉        | 2186/10986 [49:50<3:19:59,  1.36s/it]

training loss: 3.379753351211548


training:  20%|█▉        | 2187/10986 [49:51<3:16:57,  1.34s/it]

training loss: 3.3910183906555176


training:  20%|█▉        | 2188/10986 [49:53<3:14:17,  1.32s/it]

training loss: 3.46830153465271


training:  20%|█▉        | 2189/10986 [49:54<3:13:45,  1.32s/it]

training loss: 3.3800272941589355


training:  20%|█▉        | 2190/10986 [49:55<3:12:42,  1.31s/it]

training loss: 3.402582883834839


training:  20%|█▉        | 2191/10986 [49:57<3:24:16,  1.39s/it]

training loss: 3.3567466735839844


training:  20%|█▉        | 2192/10986 [49:58<3:20:58,  1.37s/it]

training loss: 3.437394380569458


training:  20%|█▉        | 2193/10986 [50:00<3:16:56,  1.34s/it]

training loss: 3.437993049621582


training:  20%|█▉        | 2194/10986 [50:01<3:14:03,  1.32s/it]

training loss: 3.427151679992676


training:  20%|█▉        | 2195/10986 [50:02<3:11:29,  1.31s/it]

training loss: 3.3445427417755127


training:  20%|█▉        | 2196/10986 [50:03<3:10:22,  1.30s/it]

training loss: 3.488710641860962


training:  20%|█▉        | 2197/10986 [50:05<3:09:28,  1.29s/it]

training loss: 3.300999402999878


training:  20%|██        | 2198/10986 [50:06<3:08:59,  1.29s/it]

training loss: 3.2801496982574463


training:  20%|██        | 2199/10986 [50:07<3:07:23,  1.28s/it]

training loss: 3.4551374912261963


training:  20%|██        | 2200/10986 [50:08<3:08:20,  1.29s/it]

training loss: 3.3614747524261475
valid loss: 3.360902786254883
perplexity: 28.81519317626953


training:  20%|██        | 2201/10986 [50:11<4:09:22,  1.70s/it]

training loss: 3.42124080657959


training:  20%|██        | 2202/10986 [50:13<3:54:09,  1.60s/it]

training loss: 3.3481383323669434


training:  20%|██        | 2203/10986 [50:14<3:39:59,  1.50s/it]

training loss: 3.406977653503418


training:  20%|██        | 2204/10986 [50:15<3:29:07,  1.43s/it]

training loss: 3.3605639934539795


training:  20%|██        | 2205/10986 [50:16<3:22:10,  1.38s/it]

training loss: 3.3779842853546143


training:  20%|██        | 2206/10986 [50:18<3:16:42,  1.34s/it]

training loss: 3.4174301624298096


training:  20%|██        | 2207/10986 [50:19<3:13:58,  1.33s/it]

training loss: 3.496804714202881


training:  20%|██        | 2208/10986 [50:20<3:11:38,  1.31s/it]

training loss: 3.2885923385620117


training:  20%|██        | 2209/10986 [50:21<3:09:52,  1.30s/it]

training loss: 3.4655518531799316


training:  20%|██        | 2210/10986 [50:23<3:10:53,  1.31s/it]

training loss: 3.4194118976593018


training:  20%|██        | 2211/10986 [50:24<3:21:13,  1.38s/it]

training loss: 3.3375661373138428


training:  20%|██        | 2212/10986 [50:26<3:17:40,  1.35s/it]

training loss: 3.3563404083251953


training:  20%|██        | 2213/10986 [50:27<3:14:49,  1.33s/it]

training loss: 3.344226837158203


training:  20%|██        | 2214/10986 [50:28<3:12:20,  1.32s/it]

training loss: 3.370190382003784


training:  20%|██        | 2215/10986 [50:29<3:10:32,  1.30s/it]

training loss: 3.48315167427063


training:  20%|██        | 2216/10986 [50:31<3:09:17,  1.30s/it]

training loss: 3.3690011501312256


training:  20%|██        | 2217/10986 [50:32<3:10:02,  1.30s/it]

training loss: 3.277778387069702


training:  20%|██        | 2218/10986 [50:33<3:09:04,  1.29s/it]

training loss: 3.4394781589508057


training:  20%|██        | 2219/10986 [50:35<3:08:15,  1.29s/it]

training loss: 3.341860055923462


training:  20%|██        | 2220/10986 [50:36<3:07:42,  1.28s/it]

training loss: 3.4438729286193848
valid loss: 3.441110849380493
perplexity: 31.221620559692383


training:  20%|██        | 2221/10986 [50:39<4:09:28,  1.71s/it]

training loss: 3.382563352584839


training:  20%|██        | 2222/10986 [50:40<4:07:17,  1.69s/it]

training loss: 3.3699915409088135


training:  20%|██        | 2223/10986 [50:41<3:50:06,  1.58s/it]

training loss: 3.3640196323394775


training:  20%|██        | 2224/10986 [50:43<3:37:39,  1.49s/it]

training loss: 3.3313188552856445


training:  20%|██        | 2225/10986 [50:44<3:28:00,  1.42s/it]

training loss: 3.4665656089782715


training:  20%|██        | 2226/10986 [50:45<3:21:07,  1.38s/it]

training loss: 3.49021053314209


training:  20%|██        | 2227/10986 [50:47<3:16:49,  1.35s/it]

training loss: 3.467665433883667


training:  20%|██        | 2228/10986 [50:48<3:12:50,  1.32s/it]

training loss: 3.3507235050201416


training:  20%|██        | 2229/10986 [50:49<3:10:19,  1.30s/it]

training loss: 3.3395400047302246


training:  20%|██        | 2230/10986 [50:50<3:09:00,  1.30s/it]

training loss: 3.4163005352020264


training:  20%|██        | 2231/10986 [50:52<3:20:32,  1.37s/it]

training loss: 3.34332013130188


training:  20%|██        | 2232/10986 [50:53<3:18:52,  1.36s/it]

training loss: 3.2623701095581055


training:  20%|██        | 2233/10986 [50:55<3:15:33,  1.34s/it]

training loss: 3.3725409507751465


training:  20%|██        | 2234/10986 [50:56<3:11:57,  1.32s/it]

training loss: 3.355985641479492


training:  20%|██        | 2235/10986 [50:57<3:09:58,  1.30s/it]

training loss: 3.3303096294403076


training:  20%|██        | 2236/10986 [50:58<3:09:06,  1.30s/it]

training loss: 3.3839614391326904


training:  20%|██        | 2237/10986 [51:00<3:07:57,  1.29s/it]

training loss: 3.4339168071746826


training:  20%|██        | 2238/10986 [51:01<3:07:26,  1.29s/it]

training loss: 3.3166890144348145


training:  20%|██        | 2239/10986 [51:02<3:07:22,  1.29s/it]

training loss: 3.4311623573303223


training:  20%|██        | 2240/10986 [51:03<3:06:27,  1.28s/it]

training loss: 3.479588031768799
valid loss: 3.4811129570007324
perplexity: 32.49586868286133


training:  20%|██        | 2241/10986 [51:06<4:06:52,  1.69s/it]

training loss: 3.325894832611084


training:  20%|██        | 2242/10986 [51:07<3:51:42,  1.59s/it]

training loss: 3.3419928550720215


training:  20%|██        | 2243/10986 [51:09<3:38:43,  1.50s/it]

training loss: 3.3250021934509277


training:  20%|██        | 2244/10986 [51:10<3:30:16,  1.44s/it]

training loss: 3.474318504333496


training:  20%|██        | 2245/10986 [51:11<3:23:31,  1.40s/it]

training loss: 3.40321683883667


training:  20%|██        | 2246/10986 [51:13<3:18:50,  1.37s/it]

training loss: 3.349590539932251


training:  20%|██        | 2247/10986 [51:14<3:14:33,  1.34s/it]

training loss: 3.393988847732544


training:  20%|██        | 2248/10986 [51:15<3:11:51,  1.32s/it]

training loss: 3.6011104583740234


training:  20%|██        | 2249/10986 [51:16<3:09:47,  1.30s/it]

training loss: 3.4330978393554688


training:  20%|██        | 2250/10986 [51:18<3:08:39,  1.30s/it]

training loss: 3.4198405742645264


training:  20%|██        | 2251/10986 [51:19<3:20:12,  1.38s/it]

training loss: 3.3701300621032715


training:  20%|██        | 2252/10986 [51:21<3:27:16,  1.42s/it]

training loss: 3.4265213012695312


training:  21%|██        | 2253/10986 [51:22<3:20:28,  1.38s/it]

training loss: 3.304572820663452


training:  21%|██        | 2254/10986 [51:23<3:17:26,  1.36s/it]

training loss: 3.4071874618530273


training:  21%|██        | 2255/10986 [51:25<3:13:38,  1.33s/it]

training loss: 3.4032058715820312


training:  21%|██        | 2256/10986 [51:26<3:10:57,  1.31s/it]

training loss: 3.414263963699341


training:  21%|██        | 2257/10986 [51:27<3:08:45,  1.30s/it]

training loss: 3.35715913772583


training:  21%|██        | 2258/10986 [51:29<3:07:15,  1.29s/it]

training loss: 3.3535232543945312


training:  21%|██        | 2259/10986 [51:30<3:06:30,  1.28s/it]

training loss: 3.3793914318084717


training:  21%|██        | 2260/10986 [51:31<3:06:53,  1.29s/it]

training loss: 3.396682024002075
valid loss: 3.4037046432495117
perplexity: 30.075313568115234


training:  21%|██        | 2261/10986 [51:34<4:09:30,  1.72s/it]

training loss: 3.4307761192321777


training:  21%|██        | 2262/10986 [51:35<3:52:12,  1.60s/it]

training loss: 3.3093724250793457


training:  21%|██        | 2263/10986 [51:36<3:37:40,  1.50s/it]

training loss: 3.392122745513916


training:  21%|██        | 2264/10986 [51:38<3:28:15,  1.43s/it]

training loss: 3.2843058109283447


training:  21%|██        | 2265/10986 [51:39<3:21:29,  1.39s/it]

training loss: 3.349000930786133


training:  21%|██        | 2266/10986 [51:40<3:16:05,  1.35s/it]

training loss: 3.4515862464904785


training:  21%|██        | 2267/10986 [51:41<3:12:07,  1.32s/it]

training loss: 3.3062052726745605


training:  21%|██        | 2268/10986 [51:43<3:09:30,  1.30s/it]

training loss: 3.467311143875122


training:  21%|██        | 2269/10986 [51:44<3:07:25,  1.29s/it]

training loss: 3.357140064239502


training:  21%|██        | 2270/10986 [51:45<3:08:07,  1.30s/it]

training loss: 3.3872885704040527


training:  21%|██        | 2271/10986 [51:47<3:19:36,  1.37s/it]

training loss: 3.4564919471740723


training:  21%|██        | 2272/10986 [51:48<3:15:47,  1.35s/it]

training loss: 3.3703720569610596


training:  21%|██        | 2273/10986 [51:49<3:12:04,  1.32s/it]

training loss: 3.410430669784546


training:  21%|██        | 2274/10986 [51:51<3:09:01,  1.30s/it]

training loss: 3.3721230030059814


training:  21%|██        | 2275/10986 [51:52<3:06:37,  1.29s/it]

training loss: 3.312274217605591


training:  21%|██        | 2276/10986 [51:53<3:07:33,  1.29s/it]

training loss: 3.3984971046447754


training:  21%|██        | 2277/10986 [51:55<3:08:23,  1.30s/it]

training loss: 3.299013137817383


training:  21%|██        | 2278/10986 [51:56<3:06:08,  1.28s/it]

training loss: 3.476703405380249


training:  21%|██        | 2279/10986 [51:57<3:04:43,  1.27s/it]

training loss: 3.333828926086426


training:  21%|██        | 2280/10986 [51:58<3:04:25,  1.27s/it]

training loss: 3.2836601734161377
valid loss: 3.2810418605804443
perplexity: 26.60347557067871


training:  21%|██        | 2281/10986 [52:01<4:06:25,  1.70s/it]

training loss: 3.3091421127319336


training:  21%|██        | 2282/10986 [52:02<3:50:28,  1.59s/it]

training loss: 3.3445241451263428


training:  21%|██        | 2283/10986 [52:04<3:36:37,  1.49s/it]

training loss: 3.2842280864715576


training:  21%|██        | 2284/10986 [52:05<3:26:43,  1.43s/it]

training loss: 3.475160837173462


training:  21%|██        | 2285/10986 [52:06<3:19:34,  1.38s/it]

training loss: 3.414008140563965


training:  21%|██        | 2286/10986 [52:07<3:14:21,  1.34s/it]

training loss: 3.47802472114563


training:  21%|██        | 2287/10986 [52:09<3:11:15,  1.32s/it]

training loss: 3.323575258255005


training:  21%|██        | 2288/10986 [52:10<3:10:53,  1.32s/it]

training loss: 3.4303126335144043


training:  21%|██        | 2289/10986 [52:11<3:08:49,  1.30s/it]

training loss: 3.3439600467681885


training:  21%|██        | 2290/10986 [52:12<3:07:21,  1.29s/it]

training loss: 3.400717258453369


training:  21%|██        | 2291/10986 [52:14<3:18:52,  1.37s/it]

training loss: 3.4009580612182617


training:  21%|██        | 2292/10986 [52:15<3:14:51,  1.34s/it]

training loss: 3.394509792327881


training:  21%|██        | 2293/10986 [52:17<3:11:19,  1.32s/it]

training loss: 3.500798225402832


training:  21%|██        | 2294/10986 [52:18<3:08:47,  1.30s/it]

training loss: 3.3645544052124023


training:  21%|██        | 2295/10986 [52:19<3:07:40,  1.30s/it]

training loss: 3.3644566535949707


training:  21%|██        | 2296/10986 [52:20<3:06:41,  1.29s/it]

training loss: 3.4095208644866943


training:  21%|██        | 2297/10986 [52:22<3:05:32,  1.28s/it]

training loss: 3.3977127075195312


training:  21%|██        | 2298/10986 [52:23<3:05:37,  1.28s/it]

training loss: 3.4215738773345947


training:  21%|██        | 2299/10986 [52:25<3:24:35,  1.41s/it]

training loss: 3.3257181644439697


training:  21%|██        | 2300/10986 [52:26<3:36:33,  1.50s/it]

training loss: 3.4314208030700684
valid loss: 3.4330010414123535
perplexity: 30.969444274902344


training:  21%|██        | 2301/10986 [52:29<4:31:25,  1.88s/it]

training loss: 3.432997465133667


training:  21%|██        | 2302/10986 [52:31<4:19:02,  1.79s/it]

training loss: 3.5150346755981445


training:  21%|██        | 2303/10986 [52:32<3:56:36,  1.64s/it]

training loss: 3.4390664100646973


training:  21%|██        | 2304/10986 [52:33<3:40:16,  1.52s/it]

training loss: 3.3699560165405273


training:  21%|██        | 2305/10986 [52:34<3:28:36,  1.44s/it]

training loss: 3.4175145626068115


training:  21%|██        | 2306/10986 [52:36<3:21:30,  1.39s/it]

training loss: 3.3984739780426025


training:  21%|██        | 2307/10986 [52:37<3:15:14,  1.35s/it]

training loss: 3.3612940311431885


training:  21%|██        | 2308/10986 [52:38<3:11:17,  1.32s/it]

training loss: 3.392261028289795


training:  21%|██        | 2309/10986 [52:40<3:09:33,  1.31s/it]

training loss: 3.2617011070251465


training:  21%|██        | 2310/10986 [52:41<3:07:50,  1.30s/it]

training loss: 3.39839243888855


training:  21%|██        | 2311/10986 [52:42<3:17:03,  1.36s/it]

training loss: 3.2865755558013916


training:  21%|██        | 2312/10986 [52:44<3:14:05,  1.34s/it]

training loss: 3.355987071990967


training:  21%|██        | 2313/10986 [52:45<3:10:13,  1.32s/it]

training loss: 3.369357109069824


training:  21%|██        | 2314/10986 [52:46<3:09:00,  1.31s/it]

training loss: 3.4424500465393066


training:  21%|██        | 2315/10986 [52:47<3:06:37,  1.29s/it]

training loss: 3.5051820278167725


training:  21%|██        | 2316/10986 [52:49<3:04:42,  1.28s/it]

training loss: 3.402207136154175


training:  21%|██        | 2317/10986 [52:50<3:03:50,  1.27s/it]

training loss: 3.4588708877563477


training:  21%|██        | 2318/10986 [52:51<3:03:08,  1.27s/it]

training loss: 3.3814594745635986


training:  21%|██        | 2319/10986 [52:52<3:02:32,  1.26s/it]

training loss: 3.396854877471924


training:  21%|██        | 2320/10986 [52:54<3:03:56,  1.27s/it]

training loss: 3.4120681285858154
valid loss: 3.4007601737976074
perplexity: 29.986886978149414


training:  21%|██        | 2321/10986 [52:56<4:05:11,  1.70s/it]

training loss: 3.405351400375366


training:  21%|██        | 2322/10986 [52:58<3:49:30,  1.59s/it]

training loss: 3.424964427947998


training:  21%|██        | 2323/10986 [52:59<3:36:00,  1.50s/it]

training loss: 3.31355357170105


training:  21%|██        | 2324/10986 [53:00<3:25:30,  1.42s/it]

training loss: 3.3784897327423096


training:  21%|██        | 2325/10986 [53:02<3:20:22,  1.39s/it]

training loss: 3.358593463897705


training:  21%|██        | 2326/10986 [53:03<3:14:45,  1.35s/it]

training loss: 3.4661412239074707


training:  21%|██        | 2327/10986 [53:04<3:10:36,  1.32s/it]

training loss: 3.374877691268921


training:  21%|██        | 2328/10986 [53:05<3:07:52,  1.30s/it]

training loss: 3.3698267936706543


training:  21%|██        | 2329/10986 [53:07<3:05:54,  1.29s/it]

training loss: 3.365906000137329


training:  21%|██        | 2330/10986 [53:08<3:04:38,  1.28s/it]

training loss: 3.5321178436279297


training:  21%|██        | 2331/10986 [53:09<3:16:18,  1.36s/it]

training loss: 3.3804900646209717


training:  21%|██        | 2332/10986 [53:11<3:13:50,  1.34s/it]

training loss: 3.362729072570801


training:  21%|██        | 2333/10986 [53:12<3:11:28,  1.33s/it]

training loss: 3.3933944702148438


training:  21%|██        | 2334/10986 [53:13<3:07:21,  1.30s/it]

training loss: 3.335758686065674


training:  21%|██▏       | 2335/10986 [53:15<3:05:47,  1.29s/it]

training loss: 3.3702118396759033


training:  21%|██▏       | 2336/10986 [53:16<3:04:26,  1.28s/it]

training loss: 3.3714375495910645


training:  21%|██▏       | 2337/10986 [53:17<3:04:29,  1.28s/it]

training loss: 3.393979072570801


training:  21%|██▏       | 2338/10986 [53:18<3:04:28,  1.28s/it]

training loss: 3.395427942276001


training:  21%|██▏       | 2339/10986 [53:20<3:02:55,  1.27s/it]

training loss: 3.380387783050537


training:  21%|██▏       | 2340/10986 [53:21<3:02:27,  1.27s/it]

training loss: 3.400696277618408
valid loss: 3.3950436115264893
perplexity: 29.81595230102539


training:  21%|██▏       | 2341/10986 [53:24<4:02:15,  1.68s/it]

training loss: 3.3926188945770264


training:  21%|██▏       | 2342/10986 [53:25<3:47:55,  1.58s/it]

training loss: 3.4143075942993164


training:  21%|██▏       | 2343/10986 [53:26<3:34:27,  1.49s/it]

training loss: 3.3447256088256836


training:  21%|██▏       | 2344/10986 [53:27<3:23:59,  1.42s/it]

training loss: 3.4358716011047363


training:  21%|██▏       | 2345/10986 [53:29<3:17:07,  1.37s/it]

training loss: 3.3527719974517822


training:  21%|██▏       | 2346/10986 [53:30<3:12:12,  1.33s/it]

training loss: 3.437932014465332


training:  21%|██▏       | 2347/10986 [53:31<3:08:29,  1.31s/it]

training loss: 3.508592128753662


training:  21%|██▏       | 2348/10986 [53:32<3:05:56,  1.29s/it]

training loss: 3.2973251342773438


training:  21%|██▏       | 2349/10986 [53:34<3:04:11,  1.28s/it]

training loss: 3.312535524368286


training:  21%|██▏       | 2350/10986 [53:35<3:03:21,  1.27s/it]

training loss: 3.3565151691436768


training:  21%|██▏       | 2351/10986 [53:36<3:15:27,  1.36s/it]

training loss: 3.4223432540893555


training:  21%|██▏       | 2352/10986 [53:38<3:12:41,  1.34s/it]

training loss: 3.3424222469329834


training:  21%|██▏       | 2353/10986 [53:39<3:09:03,  1.31s/it]

training loss: 3.32519268989563


training:  21%|██▏       | 2354/10986 [53:40<3:06:54,  1.30s/it]

training loss: 3.429675340652466


training:  21%|██▏       | 2355/10986 [53:42<3:04:53,  1.29s/it]

training loss: 3.445303201675415


training:  21%|██▏       | 2356/10986 [53:43<3:03:20,  1.27s/it]

training loss: 3.3549869060516357


training:  21%|██▏       | 2357/10986 [53:44<3:02:09,  1.27s/it]

training loss: 3.4455864429473877


training:  21%|██▏       | 2358/10986 [53:45<3:00:46,  1.26s/it]

training loss: 3.3472280502319336


training:  21%|██▏       | 2359/10986 [53:47<3:00:33,  1.26s/it]

training loss: 3.4252922534942627


training:  21%|██▏       | 2360/10986 [53:48<2:59:57,  1.25s/it]

training loss: 3.406893253326416
valid loss: 3.4071521759033203
perplexity: 30.17917823791504


training:  21%|██▏       | 2361/10986 [53:50<4:01:27,  1.68s/it]

training loss: 3.4355263710021973


training:  22%|██▏       | 2362/10986 [53:52<3:46:41,  1.58s/it]

training loss: 3.313868761062622


training:  22%|██▏       | 2363/10986 [53:53<3:33:30,  1.49s/it]

training loss: 3.5248379707336426


training:  22%|██▏       | 2364/10986 [53:54<3:25:18,  1.43s/it]

training loss: 3.278104066848755


training:  22%|██▏       | 2365/10986 [53:56<3:17:42,  1.38s/it]

training loss: 3.3218955993652344


training:  22%|██▏       | 2366/10986 [53:57<3:13:19,  1.35s/it]

training loss: 3.339073896408081


training:  22%|██▏       | 2367/10986 [53:58<3:09:27,  1.32s/it]

training loss: 3.3841440677642822


training:  22%|██▏       | 2368/10986 [53:59<3:07:50,  1.31s/it]

training loss: 3.460188388824463


training:  22%|██▏       | 2369/10986 [54:01<3:06:29,  1.30s/it]

training loss: 3.41508150100708


training:  22%|██▏       | 2370/10986 [54:02<3:04:47,  1.29s/it]

training loss: 3.339722156524658


training:  22%|██▏       | 2371/10986 [54:03<3:13:56,  1.35s/it]

training loss: 3.381425380706787


training:  22%|██▏       | 2372/10986 [54:05<3:11:05,  1.33s/it]

training loss: 3.408015727996826


training:  22%|██▏       | 2373/10986 [54:06<3:07:23,  1.31s/it]

training loss: 3.378355026245117


training:  22%|██▏       | 2374/10986 [54:07<3:04:57,  1.29s/it]

training loss: 3.358628273010254


training:  22%|██▏       | 2375/10986 [54:08<3:02:44,  1.27s/it]

training loss: 3.438471794128418


training:  22%|██▏       | 2376/10986 [54:10<3:02:56,  1.27s/it]

training loss: 3.3816323280334473


training:  22%|██▏       | 2377/10986 [54:11<3:02:41,  1.27s/it]

training loss: 3.329076051712036


training:  22%|██▏       | 2378/10986 [54:12<3:02:09,  1.27s/it]

training loss: 3.4682207107543945


training:  22%|██▏       | 2379/10986 [54:14<3:01:04,  1.26s/it]

training loss: 3.363363742828369


training:  22%|██▏       | 2380/10986 [54:15<3:00:23,  1.26s/it]

training loss: 3.368212938308716
valid loss: 3.370236396789551
perplexity: 29.08540153503418


training:  22%|██▏       | 2381/10986 [54:17<4:02:56,  1.69s/it]

training loss: 3.3115432262420654


training:  22%|██▏       | 2382/10986 [54:19<3:47:17,  1.59s/it]

training loss: 3.449554920196533


training:  22%|██▏       | 2383/10986 [54:20<3:32:53,  1.48s/it]

training loss: 3.3395609855651855


training:  22%|██▏       | 2384/10986 [54:21<3:22:39,  1.41s/it]

training loss: 3.454900026321411


training:  22%|██▏       | 2385/10986 [54:23<3:15:17,  1.36s/it]

training loss: 3.438641309738159


training:  22%|██▏       | 2386/10986 [54:24<3:09:32,  1.32s/it]

training loss: 3.4562814235687256


training:  22%|██▏       | 2387/10986 [54:25<3:08:09,  1.31s/it]

training loss: 3.407794237136841


training:  22%|██▏       | 2388/10986 [54:26<3:07:42,  1.31s/it]

training loss: 3.434908390045166


training:  22%|██▏       | 2389/10986 [54:28<3:04:24,  1.29s/it]

training loss: 3.4504129886627197


training:  22%|██▏       | 2390/10986 [54:29<3:01:55,  1.27s/it]

training loss: 3.3458328247070312


training:  22%|██▏       | 2391/10986 [54:30<3:12:02,  1.34s/it]

training loss: 3.4861087799072266


training:  22%|██▏       | 2392/10986 [54:32<3:09:33,  1.32s/it]

training loss: 3.4495186805725098


training:  22%|██▏       | 2393/10986 [54:33<3:05:27,  1.29s/it]

training loss: 3.492875337600708


training:  22%|██▏       | 2394/10986 [54:34<3:03:22,  1.28s/it]

training loss: 3.5311355590820312


training:  22%|██▏       | 2395/10986 [54:35<3:01:34,  1.27s/it]

training loss: 3.3713455200195312


training:  22%|██▏       | 2396/10986 [54:37<3:00:43,  1.26s/it]

training loss: 3.319011926651001


training:  22%|██▏       | 2397/10986 [54:38<2:59:40,  1.26s/it]

training loss: 3.3935580253601074


training:  22%|██▏       | 2398/10986 [54:39<2:59:01,  1.25s/it]

training loss: 3.389972686767578


training:  22%|██▏       | 2399/10986 [54:40<2:59:04,  1.25s/it]

training loss: 3.3927245140075684


training:  22%|██▏       | 2400/10986 [54:42<2:58:22,  1.25s/it]

training loss: 3.412187337875366
valid loss: 3.4189400672912598
perplexity: 30.537031173706055


training:  22%|██▏       | 2401/10986 [54:44<3:58:24,  1.67s/it]

training loss: 3.412122964859009


training:  22%|██▏       | 2402/10986 [54:46<3:57:05,  1.66s/it]

training loss: 3.3128256797790527


training:  22%|██▏       | 2403/10986 [54:47<3:40:06,  1.54s/it]

training loss: 3.3071811199188232


training:  22%|██▏       | 2404/10986 [54:48<3:27:14,  1.45s/it]

training loss: 3.2950706481933594


training:  22%|██▏       | 2405/10986 [54:50<3:18:10,  1.39s/it]

training loss: 3.3293066024780273


training:  22%|██▏       | 2406/10986 [54:51<3:12:11,  1.34s/it]

training loss: 3.4478330612182617


training:  22%|██▏       | 2407/10986 [54:52<3:07:20,  1.31s/it]

training loss: 3.386103630065918


training:  22%|██▏       | 2408/10986 [54:53<3:04:35,  1.29s/it]

training loss: 3.460526466369629


training:  22%|██▏       | 2409/10986 [54:55<3:03:40,  1.28s/it]

training loss: 3.3749632835388184


training:  22%|██▏       | 2410/10986 [54:56<3:04:05,  1.29s/it]

training loss: 3.355771780014038


training:  22%|██▏       | 2411/10986 [54:57<3:14:06,  1.36s/it]

training loss: 3.4305267333984375


training:  22%|██▏       | 2412/10986 [54:59<3:13:27,  1.35s/it]

training loss: 3.4642622470855713


training:  22%|██▏       | 2413/10986 [55:00<3:09:51,  1.33s/it]

training loss: 3.3574838638305664


training:  22%|██▏       | 2414/10986 [55:01<3:06:05,  1.30s/it]

training loss: 3.3227014541625977


training:  22%|██▏       | 2415/10986 [55:03<3:04:57,  1.29s/it]

training loss: 3.373410224914551


training:  22%|██▏       | 2416/10986 [55:04<3:03:41,  1.29s/it]

training loss: 3.3612921237945557


training:  22%|██▏       | 2417/10986 [55:05<3:02:24,  1.28s/it]

training loss: 3.382258415222168


training:  22%|██▏       | 2418/10986 [55:06<3:01:13,  1.27s/it]

training loss: 3.3555643558502197


training:  22%|██▏       | 2419/10986 [55:08<3:00:37,  1.27s/it]

training loss: 3.353653907775879


training:  22%|██▏       | 2420/10986 [55:09<3:00:40,  1.27s/it]

training loss: 3.3514885902404785
valid loss: 3.347374200820923
perplexity: 28.427989959716797


training:  22%|██▏       | 2421/10986 [55:12<4:02:27,  1.70s/it]

training loss: 3.3178586959838867


training:  22%|██▏       | 2422/10986 [55:13<3:46:49,  1.59s/it]

training loss: 3.4050533771514893


training:  22%|██▏       | 2423/10986 [55:14<3:33:52,  1.50s/it]

training loss: 3.395937919616699


training:  22%|██▏       | 2424/10986 [55:15<3:23:53,  1.43s/it]

training loss: 3.4073574542999268


training:  22%|██▏       | 2425/10986 [55:17<3:16:18,  1.38s/it]

training loss: 3.3922231197357178


training:  22%|██▏       | 2426/10986 [55:18<3:12:01,  1.35s/it]

training loss: 3.301217794418335


training:  22%|██▏       | 2427/10986 [55:19<3:09:13,  1.33s/it]

training loss: 3.5296881198883057


training:  22%|██▏       | 2428/10986 [55:20<3:06:37,  1.31s/it]

training loss: 3.312562942504883


training:  22%|██▏       | 2429/10986 [55:22<3:05:11,  1.30s/it]

training loss: 3.343043088912964


training:  22%|██▏       | 2430/10986 [55:23<3:03:03,  1.28s/it]

training loss: 3.4020748138427734


training:  22%|██▏       | 2431/10986 [55:25<3:14:41,  1.37s/it]

training loss: 3.317042112350464


training:  22%|██▏       | 2432/10986 [55:26<3:12:04,  1.35s/it]

training loss: 3.3596134185791016


training:  22%|██▏       | 2433/10986 [55:27<3:10:08,  1.33s/it]

training loss: 3.4048030376434326


training:  22%|██▏       | 2434/10986 [55:28<3:06:53,  1.31s/it]

training loss: 3.392390727996826


training:  22%|██▏       | 2435/10986 [55:30<3:04:19,  1.29s/it]

training loss: 3.3675765991210938


training:  22%|██▏       | 2436/10986 [55:31<3:02:13,  1.28s/it]

training loss: 3.3511345386505127


training:  22%|██▏       | 2437/10986 [55:32<3:01:19,  1.27s/it]

training loss: 3.3007898330688477


training:  22%|██▏       | 2438/10986 [55:33<3:00:23,  1.27s/it]

training loss: 3.384669542312622


training:  22%|██▏       | 2439/10986 [55:35<2:59:48,  1.26s/it]

training loss: 3.4022932052612305


training:  22%|██▏       | 2440/10986 [55:36<3:00:31,  1.27s/it]

training loss: 3.3383584022521973
valid loss: 3.3332250118255615
perplexity: 28.028589248657227


training:  22%|██▏       | 2441/10986 [55:39<3:59:49,  1.68s/it]

training loss: 3.350604295730591


training:  22%|██▏       | 2442/10986 [55:40<3:47:34,  1.60s/it]

training loss: 3.4465699195861816


training:  22%|██▏       | 2443/10986 [55:41<3:33:12,  1.50s/it]

training loss: 3.4366416931152344


training:  22%|██▏       | 2444/10986 [55:43<3:23:04,  1.43s/it]

training loss: 3.3712337017059326


training:  22%|██▏       | 2445/10986 [55:44<3:16:10,  1.38s/it]

training loss: 3.327286720275879


training:  22%|██▏       | 2446/10986 [55:45<3:11:19,  1.34s/it]

training loss: 3.4435181617736816


training:  22%|██▏       | 2447/10986 [55:46<3:08:23,  1.32s/it]

training loss: 3.3338732719421387


training:  22%|██▏       | 2448/10986 [55:48<3:06:18,  1.31s/it]

training loss: 3.485821008682251


training:  22%|██▏       | 2449/10986 [55:49<3:04:25,  1.30s/it]

training loss: 3.357032537460327


training:  22%|██▏       | 2450/10986 [55:50<3:02:56,  1.29s/it]

training loss: 3.4273693561553955


training:  22%|██▏       | 2451/10986 [55:52<3:22:55,  1.43s/it]

training loss: 3.316586971282959


training:  22%|██▏       | 2452/10986 [55:54<3:49:23,  1.61s/it]

training loss: 3.3531179428100586


training:  22%|██▏       | 2453/10986 [55:55<3:40:03,  1.55s/it]

training loss: 3.424311399459839


training:  22%|██▏       | 2454/10986 [55:57<3:30:47,  1.48s/it]

training loss: 3.380208969116211


training:  22%|██▏       | 2455/10986 [55:58<3:21:19,  1.42s/it]

training loss: 3.41306734085083


training:  22%|██▏       | 2456/10986 [55:59<3:14:41,  1.37s/it]

training loss: 3.3344321250915527


training:  22%|██▏       | 2457/10986 [56:00<3:10:19,  1.34s/it]

training loss: 3.4533867835998535


training:  22%|██▏       | 2458/10986 [56:02<3:07:29,  1.32s/it]

training loss: 3.354081153869629


training:  22%|██▏       | 2459/10986 [56:03<3:05:58,  1.31s/it]

training loss: 3.4355337619781494


training:  22%|██▏       | 2460/10986 [56:04<3:04:17,  1.30s/it]

training loss: 3.37333083152771
valid loss: 3.368225336074829
perplexity: 29.026968002319336


training:  22%|██▏       | 2461/10986 [56:07<4:02:48,  1.71s/it]

training loss: 3.439511299133301


training:  22%|██▏       | 2462/10986 [56:08<3:49:36,  1.62s/it]

training loss: 3.573122024536133


training:  22%|██▏       | 2463/10986 [56:10<3:36:20,  1.52s/it]

training loss: 3.6299889087677


training:  22%|██▏       | 2464/10986 [56:11<3:25:31,  1.45s/it]

training loss: 3.3281054496765137


training:  22%|██▏       | 2465/10986 [56:12<3:18:43,  1.40s/it]

training loss: 3.3492605686187744


training:  22%|██▏       | 2466/10986 [56:14<3:14:03,  1.37s/it]

training loss: 3.454425096511841


training:  22%|██▏       | 2467/10986 [56:15<3:09:36,  1.34s/it]

training loss: 3.3324220180511475


training:  22%|██▏       | 2468/10986 [56:16<3:06:41,  1.32s/it]

training loss: 3.461787223815918


training:  22%|██▏       | 2469/10986 [56:17<3:04:05,  1.30s/it]

training loss: 3.408001184463501


training:  22%|██▏       | 2470/10986 [56:19<3:02:35,  1.29s/it]

training loss: 3.4706408977508545


training:  22%|██▏       | 2471/10986 [56:20<3:13:11,  1.36s/it]

training loss: 3.421786069869995


training:  23%|██▎       | 2472/10986 [56:22<3:23:45,  1.44s/it]

training loss: 3.37152099609375


training:  23%|██▎       | 2473/10986 [56:23<3:16:45,  1.39s/it]

training loss: 3.4465079307556152


training:  23%|██▎       | 2474/10986 [56:24<3:11:37,  1.35s/it]

training loss: 3.458540916442871


training:  23%|██▎       | 2475/10986 [56:26<3:07:43,  1.32s/it]

training loss: 3.3256332874298096


training:  23%|██▎       | 2476/10986 [56:27<3:04:59,  1.30s/it]

training loss: 3.2894725799560547


training:  23%|██▎       | 2477/10986 [56:28<3:04:17,  1.30s/it]

training loss: 3.4235308170318604


training:  23%|██▎       | 2478/10986 [56:29<3:03:16,  1.29s/it]

training loss: 3.480116128921509


training:  23%|██▎       | 2479/10986 [56:31<3:01:41,  1.28s/it]

training loss: 3.4911551475524902


training:  23%|██▎       | 2480/10986 [56:32<3:02:33,  1.29s/it]

training loss: 3.3443808555603027
valid loss: 3.3358309268951416
perplexity: 28.10172462463379


training:  23%|██▎       | 2481/10986 [56:35<4:01:48,  1.71s/it]

training loss: 3.442704200744629


training:  23%|██▎       | 2482/10986 [56:36<3:50:59,  1.63s/it]

training loss: 3.3700826168060303


training:  23%|██▎       | 2483/10986 [56:37<3:36:29,  1.53s/it]

training loss: 3.48490309715271


training:  23%|██▎       | 2484/10986 [56:39<3:26:32,  1.46s/it]

training loss: 3.3697214126586914


training:  23%|██▎       | 2485/10986 [56:40<3:18:52,  1.40s/it]

training loss: 3.431605815887451


training:  23%|██▎       | 2486/10986 [56:41<3:13:03,  1.36s/it]

training loss: 3.445972442626953


training:  23%|██▎       | 2487/10986 [56:42<3:09:36,  1.34s/it]

training loss: 3.4687602519989014


training:  23%|██▎       | 2488/10986 [56:44<3:06:15,  1.32s/it]

training loss: 3.3011820316314697


training:  23%|██▎       | 2489/10986 [56:45<3:04:43,  1.30s/it]

training loss: 3.4148285388946533


training:  23%|██▎       | 2490/10986 [56:46<3:03:24,  1.30s/it]

training loss: 3.3995766639709473


training:  23%|██▎       | 2491/10986 [56:48<3:14:06,  1.37s/it]

training loss: 3.4361155033111572


training:  23%|██▎       | 2492/10986 [56:49<3:10:41,  1.35s/it]

training loss: 3.349679708480835


training:  23%|██▎       | 2493/10986 [56:50<3:07:50,  1.33s/it]

training loss: 3.306227207183838


training:  23%|██▎       | 2494/10986 [56:52<3:05:10,  1.31s/it]

training loss: 3.478771209716797


training:  23%|██▎       | 2495/10986 [56:53<3:03:50,  1.30s/it]

training loss: 3.360471248626709


training:  23%|██▎       | 2496/10986 [56:54<3:03:37,  1.30s/it]

training loss: 3.404172658920288


training:  23%|██▎       | 2497/10986 [56:55<3:02:24,  1.29s/it]

training loss: 3.381460666656494


training:  23%|██▎       | 2498/10986 [56:57<3:01:32,  1.28s/it]

training loss: 3.45365834236145


training:  23%|██▎       | 2499/10986 [56:58<3:01:15,  1.28s/it]

training loss: 3.4023687839508057


training:  23%|██▎       | 2500/10986 [56:59<3:01:53,  1.29s/it]

training loss: 3.4154515266418457
valid loss: 3.4189555644989014
perplexity: 30.537504196166992


training:  23%|██▎       | 2501/10986 [57:02<4:02:11,  1.71s/it]

training loss: 3.4462485313415527


training:  23%|██▎       | 2502/10986 [57:03<3:48:01,  1.61s/it]

training loss: 3.400676965713501


training:  23%|██▎       | 2503/10986 [57:05<3:32:57,  1.51s/it]

training loss: 3.3994524478912354


training:  23%|██▎       | 2504/10986 [57:06<3:22:26,  1.43s/it]

training loss: 3.4884440898895264


training:  23%|██▎       | 2505/10986 [57:07<3:15:21,  1.38s/it]

training loss: 3.3940718173980713


training:  23%|██▎       | 2506/10986 [57:08<3:10:55,  1.35s/it]

training loss: 3.380601406097412


training:  23%|██▎       | 2507/10986 [57:10<3:08:06,  1.33s/it]

training loss: 3.4488861560821533


training:  23%|██▎       | 2508/10986 [57:11<3:05:36,  1.31s/it]

training loss: 3.494161605834961


training:  23%|██▎       | 2509/10986 [57:12<3:03:35,  1.30s/it]

training loss: 3.371997833251953


training:  23%|██▎       | 2510/10986 [57:14<3:02:20,  1.29s/it]

training loss: 3.432830810546875


training:  23%|██▎       | 2511/10986 [57:15<3:12:42,  1.36s/it]

training loss: 3.32411527633667


training:  23%|██▎       | 2512/10986 [57:17<3:22:24,  1.43s/it]

training loss: 3.3631458282470703


training:  23%|██▎       | 2513/10986 [57:18<3:15:16,  1.38s/it]

training loss: 3.4448530673980713


training:  23%|██▎       | 2514/10986 [57:19<3:10:19,  1.35s/it]

training loss: 3.306440591812134


training:  23%|██▎       | 2515/10986 [57:21<3:07:28,  1.33s/it]

training loss: 3.2242627143859863


training:  23%|██▎       | 2516/10986 [57:22<3:05:39,  1.32s/it]

training loss: 3.3924293518066406


training:  23%|██▎       | 2517/10986 [57:23<3:03:12,  1.30s/it]

training loss: 3.473532199859619


training:  23%|██▎       | 2518/10986 [57:24<3:01:59,  1.29s/it]

training loss: 3.255770206451416


training:  23%|██▎       | 2519/10986 [57:26<3:00:46,  1.28s/it]

training loss: 3.4259252548217773


training:  23%|██▎       | 2520/10986 [57:27<3:00:41,  1.28s/it]

training loss: 3.318582773208618
valid loss: 3.3065261840820312
perplexity: 27.290159225463867


training:  23%|██▎       | 2521/10986 [57:30<3:59:17,  1.70s/it]

training loss: 3.3553998470306396


training:  23%|██▎       | 2522/10986 [57:31<3:46:25,  1.61s/it]

training loss: 3.4053149223327637


training:  23%|██▎       | 2523/10986 [57:32<3:33:35,  1.51s/it]

training loss: 3.38617205619812


training:  23%|██▎       | 2524/10986 [57:33<3:22:25,  1.44s/it]

training loss: 3.3658459186553955


training:  23%|██▎       | 2525/10986 [57:35<3:15:10,  1.38s/it]

training loss: 3.31957745552063


training:  23%|██▎       | 2526/10986 [57:36<3:09:46,  1.35s/it]

training loss: 3.483792781829834


training:  23%|██▎       | 2527/10986 [57:37<3:06:39,  1.32s/it]

training loss: 3.479912757873535


training:  23%|██▎       | 2528/10986 [57:39<3:04:37,  1.31s/it]

training loss: 3.352567434310913


training:  23%|██▎       | 2529/10986 [57:40<3:03:08,  1.30s/it]

training loss: 3.427088737487793


training:  23%|██▎       | 2530/10986 [57:41<3:01:26,  1.29s/it]

training loss: 3.5414721965789795


training:  23%|██▎       | 2531/10986 [57:43<3:12:51,  1.37s/it]

training loss: 3.4894611835479736


training:  23%|██▎       | 2532/10986 [57:44<3:09:42,  1.35s/it]

training loss: 3.343289375305176


training:  23%|██▎       | 2533/10986 [57:45<3:06:37,  1.32s/it]

training loss: 3.2581164836883545


training:  23%|██▎       | 2534/10986 [57:46<3:04:42,  1.31s/it]

training loss: 3.3273158073425293


training:  23%|██▎       | 2535/10986 [57:48<3:03:18,  1.30s/it]

training loss: 3.422572135925293


training:  23%|██▎       | 2536/10986 [57:49<3:02:57,  1.30s/it]

training loss: 3.418788433074951


training:  23%|██▎       | 2537/10986 [57:50<3:02:10,  1.29s/it]

training loss: 3.4429335594177246


training:  23%|██▎       | 2538/10986 [57:52<3:00:48,  1.28s/it]

training loss: 3.260930299758911


training:  23%|██▎       | 2539/10986 [57:53<3:00:21,  1.28s/it]

training loss: 3.3548638820648193


training:  23%|██▎       | 2540/10986 [57:54<3:01:37,  1.29s/it]

training loss: 3.3298397064208984
valid loss: 3.3343186378479004
perplexity: 28.05925750732422


training:  23%|██▎       | 2541/10986 [57:57<4:03:46,  1.73s/it]

training loss: 3.4244635105133057


training:  23%|██▎       | 2542/10986 [57:58<3:47:26,  1.62s/it]

training loss: 3.4117352962493896


training:  23%|██▎       | 2543/10986 [58:00<3:33:38,  1.52s/it]

training loss: 3.4522156715393066


training:  23%|██▎       | 2544/10986 [58:01<3:23:50,  1.45s/it]

training loss: 3.4224534034729004


training:  23%|██▎       | 2545/10986 [58:02<3:16:42,  1.40s/it]

training loss: 3.4736709594726562


training:  23%|██▎       | 2546/10986 [58:03<3:12:17,  1.37s/it]

training loss: 3.434894561767578


training:  23%|██▎       | 2547/10986 [58:05<3:08:25,  1.34s/it]

training loss: 3.3348281383514404


training:  23%|██▎       | 2548/10986 [58:06<3:05:17,  1.32s/it]

training loss: 3.336794853210449


training:  23%|██▎       | 2549/10986 [58:07<3:02:34,  1.30s/it]

training loss: 3.3027703762054443


training:  23%|██▎       | 2550/10986 [58:09<3:01:16,  1.29s/it]

training loss: 3.3537988662719727


training:  23%|██▎       | 2551/10986 [58:10<3:12:42,  1.37s/it]

training loss: 3.3187241554260254


training:  23%|██▎       | 2552/10986 [58:11<3:10:01,  1.35s/it]

training loss: 3.3749279975891113


training:  23%|██▎       | 2553/10986 [58:13<3:06:44,  1.33s/it]

training loss: 3.370393991470337


training:  23%|██▎       | 2554/10986 [58:14<3:04:53,  1.32s/it]

training loss: 3.496774435043335


training:  23%|██▎       | 2555/10986 [58:15<3:05:04,  1.32s/it]

training loss: 3.375871181488037


training:  23%|██▎       | 2556/10986 [58:17<3:03:31,  1.31s/it]

training loss: 3.4348297119140625


training:  23%|██▎       | 2557/10986 [58:18<3:02:18,  1.30s/it]

training loss: 3.435899257659912


training:  23%|██▎       | 2558/10986 [58:19<3:02:03,  1.30s/it]

training loss: 3.4514517784118652


training:  23%|██▎       | 2559/10986 [58:20<3:01:40,  1.29s/it]

training loss: 3.4144649505615234


training:  23%|██▎       | 2560/10986 [58:22<3:03:42,  1.31s/it]

training loss: 3.369324207305908
valid loss: 3.3581855297088623
perplexity: 28.737001419067383


training:  23%|██▎       | 2561/10986 [58:24<4:04:18,  1.74s/it]

training loss: 3.361701250076294


training:  23%|██▎       | 2562/10986 [58:26<3:48:50,  1.63s/it]

training loss: 3.4095261096954346


training:  23%|██▎       | 2563/10986 [58:27<3:35:41,  1.54s/it]

training loss: 3.3995611667633057


training:  23%|██▎       | 2564/10986 [58:28<3:24:44,  1.46s/it]

training loss: 3.420440912246704


training:  23%|██▎       | 2565/10986 [58:30<3:16:45,  1.40s/it]

training loss: 3.335439682006836


training:  23%|██▎       | 2566/10986 [58:31<3:13:57,  1.38s/it]

training loss: 3.440225839614868


training:  23%|██▎       | 2567/10986 [58:32<3:11:13,  1.36s/it]

training loss: 3.297178030014038


training:  23%|██▎       | 2568/10986 [58:34<3:07:27,  1.34s/it]

training loss: 3.4742610454559326


training:  23%|██▎       | 2569/10986 [58:35<3:05:02,  1.32s/it]

training loss: 3.3839590549468994


training:  23%|██▎       | 2570/10986 [58:36<3:03:45,  1.31s/it]

training loss: 3.3963475227355957


training:  23%|██▎       | 2571/10986 [58:38<3:12:32,  1.37s/it]

training loss: 3.4042999744415283


training:  23%|██▎       | 2572/10986 [58:39<3:10:47,  1.36s/it]

training loss: 3.373403787612915


training:  23%|██▎       | 2573/10986 [58:40<3:07:58,  1.34s/it]

training loss: 3.2861809730529785


training:  23%|██▎       | 2574/10986 [58:42<3:04:35,  1.32s/it]

training loss: 3.3078057765960693


training:  23%|██▎       | 2575/10986 [58:43<3:03:17,  1.31s/it]

training loss: 3.3867266178131104


training:  23%|██▎       | 2576/10986 [58:44<3:01:45,  1.30s/it]

training loss: 3.462657928466797


training:  23%|██▎       | 2577/10986 [58:45<3:00:49,  1.29s/it]

training loss: 3.326648712158203


training:  23%|██▎       | 2578/10986 [58:47<3:00:49,  1.29s/it]

training loss: 3.379709482192993


training:  23%|██▎       | 2579/10986 [58:48<3:00:24,  1.29s/it]

training loss: 3.426842451095581


training:  23%|██▎       | 2580/10986 [58:49<2:59:41,  1.28s/it]

training loss: 3.344829559326172
valid loss: 3.34248948097229
perplexity: 28.28946304321289


training:  23%|██▎       | 2581/10986 [58:52<3:59:05,  1.71s/it]

training loss: 3.3614604473114014


training:  24%|██▎       | 2582/10986 [58:53<3:45:26,  1.61s/it]

training loss: 3.4178004264831543


training:  24%|██▎       | 2583/10986 [58:55<3:31:46,  1.51s/it]

training loss: 3.47352933883667


training:  24%|██▎       | 2584/10986 [58:56<3:21:58,  1.44s/it]

training loss: 3.47149658203125


training:  24%|██▎       | 2585/10986 [58:57<3:14:34,  1.39s/it]

training loss: 3.433152914047241


training:  24%|██▎       | 2586/10986 [58:58<3:09:25,  1.35s/it]

training loss: 3.490062952041626


training:  24%|██▎       | 2587/10986 [59:00<3:06:12,  1.33s/it]

training loss: 3.544215679168701


training:  24%|██▎       | 2588/10986 [59:01<3:05:53,  1.33s/it]

training loss: 3.407712936401367


training:  24%|██▎       | 2589/10986 [59:02<3:05:13,  1.32s/it]

training loss: 3.33662748336792


training:  24%|██▎       | 2590/10986 [59:04<3:03:26,  1.31s/it]

training loss: 3.402862071990967


training:  24%|██▎       | 2591/10986 [59:05<3:12:58,  1.38s/it]

training loss: 3.4141573905944824


training:  24%|██▎       | 2592/10986 [59:07<3:09:22,  1.35s/it]

training loss: 3.3620312213897705


training:  24%|██▎       | 2593/10986 [59:08<3:05:27,  1.33s/it]

training loss: 3.4769933223724365


training:  24%|██▎       | 2594/10986 [59:09<3:03:11,  1.31s/it]

training loss: 3.4532222747802734


training:  24%|██▎       | 2595/10986 [59:10<3:01:47,  1.30s/it]

training loss: 3.4918618202209473


training:  24%|██▎       | 2596/10986 [59:12<3:00:03,  1.29s/it]

training loss: 3.322392702102661


training:  24%|██▎       | 2597/10986 [59:13<3:00:20,  1.29s/it]

training loss: 3.458890438079834


training:  24%|██▎       | 2598/10986 [59:14<2:59:27,  1.28s/it]

training loss: 3.3631765842437744


training:  24%|██▎       | 2599/10986 [59:15<2:58:49,  1.28s/it]

training loss: 3.3609044551849365


training:  24%|██▎       | 2600/10986 [59:17<2:58:35,  1.28s/it]

training loss: 3.3207194805145264
valid loss: 3.3208467960357666
perplexity: 27.68378257751465


training:  24%|██▎       | 2601/10986 [59:20<4:37:11,  1.98s/it]

training loss: 3.4350223541259766


training:  24%|██▎       | 2602/10986 [59:22<4:12:59,  1.81s/it]

training loss: 3.404940128326416


training:  24%|██▎       | 2603/10986 [59:23<3:50:43,  1.65s/it]

training loss: 3.2609126567840576


training:  24%|██▎       | 2604/10986 [59:24<3:35:19,  1.54s/it]

training loss: 3.4531610012054443


training:  24%|██▎       | 2605/10986 [59:26<3:23:52,  1.46s/it]

training loss: 3.3854880332946777


training:  24%|██▎       | 2606/10986 [59:27<3:15:48,  1.40s/it]

training loss: 3.4244163036346436


training:  24%|██▎       | 2607/10986 [59:28<3:10:08,  1.36s/it]

training loss: 3.426994800567627


training:  24%|██▎       | 2608/10986 [59:29<3:05:39,  1.33s/it]

training loss: 3.303938865661621


training:  24%|██▎       | 2609/10986 [59:31<3:02:22,  1.31s/it]

training loss: 3.4113729000091553


training:  24%|██▍       | 2610/10986 [59:32<3:02:20,  1.31s/it]

training loss: 3.301903486251831


training:  24%|██▍       | 2611/10986 [59:33<3:11:28,  1.37s/it]

training loss: 3.25762677192688


training:  24%|██▍       | 2612/10986 [59:35<3:07:40,  1.34s/it]

training loss: 3.5641112327575684


training:  24%|██▍       | 2613/10986 [59:36<3:02:56,  1.31s/it]

training loss: 3.501863479614258


training:  24%|██▍       | 2614/10986 [59:37<3:00:21,  1.29s/it]

training loss: 3.355689287185669


training:  24%|██▍       | 2615/10986 [59:38<2:59:46,  1.29s/it]

training loss: 3.365154504776001


training:  24%|██▍       | 2616/10986 [59:40<2:58:11,  1.28s/it]

training loss: 3.428614616394043


training:  24%|██▍       | 2617/10986 [59:41<2:57:17,  1.27s/it]

training loss: 3.3068435192108154


training:  24%|██▍       | 2618/10986 [59:42<2:56:53,  1.27s/it]

training loss: 3.3099253177642822


training:  24%|██▍       | 2619/10986 [59:43<2:56:18,  1.26s/it]

training loss: 3.3982913494110107


training:  24%|██▍       | 2620/10986 [59:45<2:56:02,  1.26s/it]

training loss: 3.3768484592437744
valid loss: 3.3632638454437256
perplexity: 28.88330841064453


training:  24%|██▍       | 2621/10986 [59:47<3:55:02,  1.69s/it]

training loss: 3.3328299522399902


training:  24%|██▍       | 2622/10986 [59:49<3:39:57,  1.58s/it]

training loss: 3.4763872623443604


training:  24%|██▍       | 2623/10986 [59:50<3:26:18,  1.48s/it]

training loss: 3.381225824356079


training:  24%|██▍       | 2624/10986 [59:51<3:16:40,  1.41s/it]

training loss: 3.3047399520874023


training:  24%|██▍       | 2625/10986 [59:53<3:10:02,  1.36s/it]

training loss: 3.3936047554016113


training:  24%|██▍       | 2626/10986 [59:54<3:05:44,  1.33s/it]

training loss: 3.477566957473755


training:  24%|██▍       | 2627/10986 [59:55<3:02:37,  1.31s/it]

training loss: 3.4100451469421387


training:  24%|██▍       | 2628/10986 [59:56<3:00:36,  1.30s/it]

training loss: 3.3498027324676514


training:  24%|██▍       | 2629/10986 [59:58<2:58:24,  1.28s/it]

training loss: 3.358213424682617


training:  24%|██▍       | 2630/10986 [59:59<2:57:28,  1.27s/it]

training loss: 3.3212976455688477


training:  24%|██▍       | 2631/10986 [1:00:00<3:08:00,  1.35s/it]

training loss: 3.3565192222595215


training:  24%|██▍       | 2632/10986 [1:00:02<3:07:53,  1.35s/it]

training loss: 3.378134250640869


training:  24%|██▍       | 2633/10986 [1:00:03<3:03:46,  1.32s/it]

training loss: 3.413076877593994


training:  24%|██▍       | 2634/10986 [1:00:04<3:01:19,  1.30s/it]

training loss: 3.303217887878418


training:  24%|██▍       | 2635/10986 [1:00:05<2:59:45,  1.29s/it]

training loss: 3.3623392581939697


training:  24%|██▍       | 2636/10986 [1:00:07<2:58:04,  1.28s/it]

training loss: 3.33089280128479


training:  24%|██▍       | 2637/10986 [1:00:08<2:57:11,  1.27s/it]

training loss: 3.4233102798461914


training:  24%|██▍       | 2638/10986 [1:00:09<2:56:17,  1.27s/it]

training loss: 3.3222317695617676


training:  24%|██▍       | 2639/10986 [1:00:11<2:57:07,  1.27s/it]

training loss: 3.5082056522369385


training:  24%|██▍       | 2640/10986 [1:00:12<2:56:54,  1.27s/it]

training loss: 3.398266077041626
valid loss: 3.408643960952759
perplexity: 30.22422981262207


training:  24%|██▍       | 2641/10986 [1:00:15<3:58:15,  1.71s/it]

training loss: 3.3706274032592773


training:  24%|██▍       | 2642/10986 [1:00:16<3:42:21,  1.60s/it]

training loss: 3.3759214878082275


training:  24%|██▍       | 2643/10986 [1:00:17<3:29:25,  1.51s/it]

training loss: 3.391535520553589


training:  24%|██▍       | 2644/10986 [1:00:18<3:20:16,  1.44s/it]

training loss: 3.4811413288116455


training:  24%|██▍       | 2645/10986 [1:00:20<3:12:03,  1.38s/it]

training loss: 3.329185962677002


training:  24%|██▍       | 2646/10986 [1:00:21<3:06:54,  1.34s/it]

training loss: 3.423292398452759


training:  24%|██▍       | 2647/10986 [1:00:22<3:03:49,  1.32s/it]

training loss: 3.4190375804901123


training:  24%|██▍       | 2648/10986 [1:00:23<3:02:44,  1.32s/it]

training loss: 3.5243735313415527


training:  24%|██▍       | 2649/10986 [1:00:25<3:00:22,  1.30s/it]

training loss: 3.4173715114593506


training:  24%|██▍       | 2650/10986 [1:00:26<2:59:19,  1.29s/it]

training loss: 3.456500291824341


training:  24%|██▍       | 2651/10986 [1:00:28<3:08:09,  1.35s/it]

training loss: 3.4274280071258545


training:  24%|██▍       | 2652/10986 [1:00:29<3:06:06,  1.34s/it]

training loss: 3.5582637786865234


training:  24%|██▍       | 2653/10986 [1:00:30<3:03:09,  1.32s/it]

training loss: 3.3046603202819824


training:  24%|██▍       | 2654/10986 [1:00:31<3:00:41,  1.30s/it]

training loss: 3.4559803009033203


training:  24%|██▍       | 2655/10986 [1:00:33<3:01:30,  1.31s/it]

training loss: 3.3928329944610596


training:  24%|██▍       | 2656/10986 [1:00:34<3:01:54,  1.31s/it]

training loss: 3.3701305389404297


training:  24%|██▍       | 2657/10986 [1:00:35<3:00:51,  1.30s/it]

training loss: 3.4627506732940674


training:  24%|██▍       | 2658/10986 [1:00:37<2:59:50,  1.30s/it]

training loss: 3.382657527923584


training:  24%|██▍       | 2659/10986 [1:00:38<2:59:08,  1.29s/it]

training loss: 3.2775330543518066


training:  24%|██▍       | 2660/10986 [1:00:39<2:58:37,  1.29s/it]

training loss: 3.302091360092163
valid loss: 3.2972209453582764
perplexity: 27.037395477294922


training:  24%|██▍       | 2661/10986 [1:00:42<4:00:08,  1.73s/it]

training loss: 3.4714744091033936


training:  24%|██▍       | 2662/10986 [1:00:43<3:47:06,  1.64s/it]

training loss: 3.3446531295776367


training:  24%|██▍       | 2663/10986 [1:00:45<3:33:51,  1.54s/it]

training loss: 3.245269775390625


training:  24%|██▍       | 2664/10986 [1:00:46<3:22:56,  1.46s/it]

training loss: 3.4083991050720215


training:  24%|██▍       | 2665/10986 [1:00:47<3:14:51,  1.41s/it]

training loss: 3.3625805377960205


training:  24%|██▍       | 2666/10986 [1:00:48<3:08:43,  1.36s/it]

training loss: 3.46207332611084


training:  24%|██▍       | 2667/10986 [1:00:50<3:05:05,  1.33s/it]

training loss: 3.2860686779022217


training:  24%|██▍       | 2668/10986 [1:00:51<3:02:26,  1.32s/it]

training loss: 3.39902400970459


training:  24%|██▍       | 2669/10986 [1:00:52<3:01:16,  1.31s/it]

training loss: 3.310589075088501


training:  24%|██▍       | 2670/10986 [1:00:54<3:00:54,  1.31s/it]

training loss: 3.3310441970825195


training:  24%|██▍       | 2671/10986 [1:00:55<3:10:49,  1.38s/it]

training loss: 3.3268954753875732


training:  24%|██▍       | 2672/10986 [1:00:56<3:08:22,  1.36s/it]

training loss: 3.4285778999328613


training:  24%|██▍       | 2673/10986 [1:00:58<3:04:40,  1.33s/it]

training loss: 3.2999625205993652


training:  24%|██▍       | 2674/10986 [1:00:59<3:03:13,  1.32s/it]

training loss: 3.3773252964019775


training:  24%|██▍       | 2675/10986 [1:01:00<3:02:33,  1.32s/it]

training loss: 3.430316686630249


training:  24%|██▍       | 2676/10986 [1:01:02<3:01:29,  1.31s/it]

training loss: 3.3800482749938965


training:  24%|██▍       | 2677/10986 [1:01:03<3:02:33,  1.32s/it]

training loss: 3.362910032272339


training:  24%|██▍       | 2678/10986 [1:01:04<3:01:31,  1.31s/it]

training loss: 3.518320083618164


training:  24%|██▍       | 2679/10986 [1:01:06<3:01:02,  1.31s/it]

training loss: 3.3839526176452637


training:  24%|██▍       | 2680/10986 [1:01:07<3:00:34,  1.30s/it]

training loss: 3.465214967727661
valid loss: 3.462667465209961
perplexity: 31.901960372924805


training:  24%|██▍       | 2681/10986 [1:01:10<4:01:34,  1.75s/it]

training loss: 3.310856580734253


training:  24%|██▍       | 2682/10986 [1:01:11<4:00:45,  1.74s/it]

training loss: 3.397416353225708


training:  24%|██▍       | 2683/10986 [1:01:13<3:46:08,  1.63s/it]

training loss: 3.3367056846618652


training:  24%|██▍       | 2684/10986 [1:01:14<3:35:26,  1.56s/it]

training loss: 3.424790620803833


training:  24%|██▍       | 2685/10986 [1:01:15<3:27:14,  1.50s/it]

training loss: 3.3192849159240723


training:  24%|██▍       | 2686/10986 [1:01:17<3:20:37,  1.45s/it]

training loss: 3.28953218460083


training:  24%|██▍       | 2687/10986 [1:01:18<3:16:13,  1.42s/it]

training loss: 3.421734094619751


training:  24%|██▍       | 2688/10986 [1:01:19<3:11:55,  1.39s/it]

training loss: 3.4748761653900146


training:  24%|██▍       | 2689/10986 [1:01:21<3:06:55,  1.35s/it]

training loss: 3.387465715408325


training:  24%|██▍       | 2690/10986 [1:01:22<3:03:58,  1.33s/it]

training loss: 3.423022747039795


training:  24%|██▍       | 2691/10986 [1:01:24<3:15:08,  1.41s/it]

training loss: 3.3539576530456543


training:  25%|██▍       | 2692/10986 [1:01:25<3:12:29,  1.39s/it]

training loss: 3.3766067028045654


training:  25%|██▍       | 2693/10986 [1:01:26<3:09:12,  1.37s/it]

training loss: 3.4161012172698975


training:  25%|██▍       | 2694/10986 [1:01:28<3:06:52,  1.35s/it]

training loss: 3.3844895362854004


training:  25%|██▍       | 2695/10986 [1:01:29<3:03:20,  1.33s/it]

training loss: 3.4092330932617188


training:  25%|██▍       | 2696/10986 [1:01:30<3:00:58,  1.31s/it]

training loss: 3.378376007080078


training:  25%|██▍       | 2697/10986 [1:01:31<2:59:38,  1.30s/it]

training loss: 3.3066558837890625


training:  25%|██▍       | 2698/10986 [1:01:33<2:58:47,  1.29s/it]

training loss: 3.4026780128479004


training:  25%|██▍       | 2699/10986 [1:01:34<2:59:20,  1.30s/it]

training loss: 3.2291488647460938


training:  25%|██▍       | 2700/10986 [1:01:35<2:58:39,  1.29s/it]

training loss: 3.358741283416748
valid loss: 3.357308864593506
perplexity: 28.711820602416992


training:  25%|██▍       | 2701/10986 [1:01:38<3:57:31,  1.72s/it]

training loss: 3.494494915008545


training:  25%|██▍       | 2702/10986 [1:01:39<3:43:00,  1.62s/it]

training loss: 3.476635217666626


training:  25%|██▍       | 2703/10986 [1:01:41<3:28:26,  1.51s/it]

training loss: 3.3962795734405518


training:  25%|██▍       | 2704/10986 [1:01:42<3:18:22,  1.44s/it]

training loss: 3.323423385620117


training:  25%|██▍       | 2705/10986 [1:01:43<3:11:36,  1.39s/it]

training loss: 3.5188891887664795


training:  25%|██▍       | 2706/10986 [1:01:44<3:06:12,  1.35s/it]

training loss: 3.338623523712158


training:  25%|██▍       | 2707/10986 [1:01:46<3:02:08,  1.32s/it]

training loss: 3.248588800430298


training:  25%|██▍       | 2708/10986 [1:01:47<2:59:13,  1.30s/it]

training loss: 3.321204423904419


training:  25%|██▍       | 2709/10986 [1:01:48<2:58:37,  1.29s/it]

training loss: 3.4011075496673584


training:  25%|██▍       | 2710/10986 [1:01:49<2:56:56,  1.28s/it]

training loss: 3.3826780319213867


training:  25%|██▍       | 2711/10986 [1:01:51<3:06:32,  1.35s/it]

training loss: 3.3440189361572266


training:  25%|██▍       | 2712/10986 [1:01:52<3:05:44,  1.35s/it]

training loss: 3.480588436126709


training:  25%|██▍       | 2713/10986 [1:01:54<3:03:00,  1.33s/it]

training loss: 3.3719358444213867


training:  25%|██▍       | 2714/10986 [1:01:55<3:00:19,  1.31s/it]

training loss: 3.360806703567505


training:  25%|██▍       | 2715/10986 [1:01:56<2:58:18,  1.29s/it]

training loss: 3.4600374698638916


training:  25%|██▍       | 2716/10986 [1:01:57<2:57:13,  1.29s/it]

training loss: 3.3931939601898193


training:  25%|██▍       | 2717/10986 [1:01:59<2:57:25,  1.29s/it]

training loss: 3.422365427017212


training:  25%|██▍       | 2718/10986 [1:02:00<2:56:34,  1.28s/it]

training loss: 3.335658550262451


training:  25%|██▍       | 2719/10986 [1:02:01<2:56:08,  1.28s/it]

training loss: 3.462656259536743


training:  25%|██▍       | 2720/10986 [1:02:02<2:55:59,  1.28s/it]

training loss: 3.37395977973938
valid loss: 3.3678488731384277
perplexity: 29.016042709350586


training:  25%|██▍       | 2721/10986 [1:02:05<3:55:49,  1.71s/it]

training loss: 3.538414716720581


training:  25%|██▍       | 2722/10986 [1:02:07<3:41:43,  1.61s/it]

training loss: 3.3699088096618652


training:  25%|██▍       | 2723/10986 [1:02:08<3:27:20,  1.51s/it]

training loss: 3.355222463607788


training:  25%|██▍       | 2724/10986 [1:02:09<3:17:28,  1.43s/it]

training loss: 3.421427011489868


training:  25%|██▍       | 2725/10986 [1:02:10<3:12:51,  1.40s/it]

training loss: 3.345510721206665


training:  25%|██▍       | 2726/10986 [1:02:12<3:07:43,  1.36s/it]

training loss: 3.29461669921875


training:  25%|██▍       | 2727/10986 [1:02:13<3:03:26,  1.33s/it]

training loss: 3.5643997192382812


training:  25%|██▍       | 2728/10986 [1:02:14<3:00:29,  1.31s/it]

training loss: 3.4162449836730957


training:  25%|██▍       | 2729/10986 [1:02:16<2:58:33,  1.30s/it]

training loss: 3.3568506240844727


training:  25%|██▍       | 2730/10986 [1:02:17<2:56:55,  1.29s/it]

training loss: 3.4073047637939453


training:  25%|██▍       | 2731/10986 [1:02:18<3:07:20,  1.36s/it]

training loss: 3.3288755416870117


training:  25%|██▍       | 2732/10986 [1:02:20<3:03:58,  1.34s/it]

training loss: 3.3608057498931885


training:  25%|██▍       | 2733/10986 [1:02:21<3:01:26,  1.32s/it]

training loss: 3.357295036315918


training:  25%|██▍       | 2734/10986 [1:02:22<2:58:56,  1.30s/it]

training loss: 3.2721118927001953


training:  25%|██▍       | 2735/10986 [1:02:23<2:57:51,  1.29s/it]

training loss: 3.3633837699890137


training:  25%|██▍       | 2736/10986 [1:02:25<2:56:46,  1.29s/it]

training loss: 3.306104898452759


training:  25%|██▍       | 2737/10986 [1:02:26<2:55:47,  1.28s/it]

training loss: 3.4199435710906982


training:  25%|██▍       | 2738/10986 [1:02:27<2:55:01,  1.27s/it]

training loss: 3.3541836738586426


training:  25%|██▍       | 2739/10986 [1:02:28<2:54:11,  1.27s/it]

training loss: 3.3493905067443848


training:  25%|██▍       | 2740/10986 [1:02:30<2:53:26,  1.26s/it]

training loss: 3.4280927181243896
valid loss: 3.4196741580963135
perplexity: 30.55945587158203


training:  25%|██▍       | 2741/10986 [1:02:32<3:51:52,  1.69s/it]

training loss: 3.328561782836914


training:  25%|██▍       | 2742/10986 [1:02:34<3:39:05,  1.59s/it]

training loss: 3.439953565597534


training:  25%|██▍       | 2743/10986 [1:02:35<3:27:06,  1.51s/it]

training loss: 3.5904691219329834


training:  25%|██▍       | 2744/10986 [1:02:36<3:16:59,  1.43s/it]

training loss: 3.422312021255493


training:  25%|██▍       | 2745/10986 [1:02:38<3:09:42,  1.38s/it]

training loss: 3.346259832382202


training:  25%|██▍       | 2746/10986 [1:02:39<3:04:51,  1.35s/it]

training loss: 3.3403377532958984


training:  25%|██▌       | 2747/10986 [1:02:40<3:01:16,  1.32s/it]

training loss: 3.406972646713257


training:  25%|██▌       | 2748/10986 [1:02:41<2:58:39,  1.30s/it]

training loss: 3.4445321559906006


training:  25%|██▌       | 2749/10986 [1:02:43<3:02:34,  1.33s/it]

training loss: 3.580688238143921


training:  25%|██▌       | 2750/10986 [1:02:44<3:17:43,  1.44s/it]

training loss: 3.3748245239257812


training:  25%|██▌       | 2751/10986 [1:02:46<3:34:03,  1.56s/it]

training loss: 3.4508142471313477


training:  25%|██▌       | 2752/10986 [1:02:48<3:33:52,  1.56s/it]

training loss: 3.539431571960449


training:  25%|██▌       | 2753/10986 [1:02:49<3:21:42,  1.47s/it]

training loss: 3.3921828269958496


training:  25%|██▌       | 2754/10986 [1:02:50<3:13:06,  1.41s/it]

training loss: 3.4445972442626953


training:  25%|██▌       | 2755/10986 [1:02:52<3:06:41,  1.36s/it]

training loss: 3.338064193725586


training:  25%|██▌       | 2756/10986 [1:02:53<3:03:15,  1.34s/it]

training loss: 3.419126272201538


training:  25%|██▌       | 2757/10986 [1:02:54<3:00:12,  1.31s/it]

training loss: 3.5838074684143066


training:  25%|██▌       | 2758/10986 [1:02:55<2:57:57,  1.30s/it]

training loss: 3.31014347076416


training:  25%|██▌       | 2759/10986 [1:02:57<2:56:21,  1.29s/it]

training loss: 3.381458044052124


training:  25%|██▌       | 2760/10986 [1:02:58<2:55:11,  1.28s/it]

training loss: 3.4251272678375244
valid loss: 3.4189350605010986
perplexity: 30.536876678466797


training:  25%|██▌       | 2761/10986 [1:03:01<3:52:50,  1.70s/it]

training loss: 3.3636553287506104


training:  25%|██▌       | 2762/10986 [1:03:02<3:39:04,  1.60s/it]

training loss: 3.4472391605377197


training:  25%|██▌       | 2763/10986 [1:03:03<3:25:16,  1.50s/it]

training loss: 3.3410489559173584


training:  25%|██▌       | 2764/10986 [1:03:05<3:17:35,  1.44s/it]

training loss: 3.4665935039520264


training:  25%|██▌       | 2765/10986 [1:03:06<3:10:05,  1.39s/it]

training loss: 3.3889663219451904


training:  25%|██▌       | 2766/10986 [1:03:07<3:05:25,  1.35s/it]

training loss: 3.3201630115509033


training:  25%|██▌       | 2767/10986 [1:03:08<3:01:44,  1.33s/it]

training loss: 3.296539068222046


training:  25%|██▌       | 2768/10986 [1:03:10<2:59:38,  1.31s/it]

training loss: 3.2936692237854004


training:  25%|██▌       | 2769/10986 [1:03:11<2:59:35,  1.31s/it]

training loss: 3.3123254776000977


training:  25%|██▌       | 2770/10986 [1:03:12<2:58:04,  1.30s/it]

training loss: 3.3018064498901367


training:  25%|██▌       | 2771/10986 [1:03:14<3:07:56,  1.37s/it]

training loss: 3.3811254501342773


training:  25%|██▌       | 2772/10986 [1:03:15<3:14:43,  1.42s/it]

training loss: 3.2659521102905273


training:  25%|██▌       | 2773/10986 [1:03:17<3:09:37,  1.39s/it]

training loss: 3.338893175125122


training:  25%|██▌       | 2774/10986 [1:03:18<3:04:37,  1.35s/it]

training loss: 3.463737726211548


training:  25%|██▌       | 2775/10986 [1:03:19<3:01:24,  1.33s/it]

training loss: 3.4623217582702637


training:  25%|██▌       | 2776/10986 [1:03:20<2:59:05,  1.31s/it]

training loss: 3.3813493251800537


training:  25%|██▌       | 2777/10986 [1:03:22<2:57:43,  1.30s/it]

training loss: 3.339132070541382


training:  25%|██▌       | 2778/10986 [1:03:23<2:56:45,  1.29s/it]

training loss: 3.316269874572754


training:  25%|██▌       | 2779/10986 [1:03:24<2:56:48,  1.29s/it]

training loss: 3.393852472305298


training:  25%|██▌       | 2780/10986 [1:03:26<2:56:09,  1.29s/it]

training loss: 3.3714189529418945
valid loss: 3.362748861312866
perplexity: 28.868438720703125


training:  25%|██▌       | 2781/10986 [1:03:28<3:53:51,  1.71s/it]

training loss: 3.3155195713043213


training:  25%|██▌       | 2782/10986 [1:03:30<3:39:38,  1.61s/it]

training loss: 3.3398053646087646


training:  25%|██▌       | 2783/10986 [1:03:31<3:25:48,  1.51s/it]

training loss: 3.336466073989868


training:  25%|██▌       | 2784/10986 [1:03:32<3:18:11,  1.45s/it]

training loss: 3.432008743286133


training:  25%|██▌       | 2785/10986 [1:03:33<3:11:05,  1.40s/it]

training loss: 3.408205270767212


training:  25%|██▌       | 2786/10986 [1:03:35<3:08:42,  1.38s/it]

training loss: 3.4127285480499268


training:  25%|██▌       | 2787/10986 [1:03:36<3:04:24,  1.35s/it]

training loss: 3.3677642345428467


training:  25%|██▌       | 2788/10986 [1:03:37<3:01:59,  1.33s/it]

training loss: 3.398942470550537


training:  25%|██▌       | 2789/10986 [1:03:39<2:59:27,  1.31s/it]

training loss: 3.466588258743286


training:  25%|██▌       | 2790/10986 [1:03:40<2:57:52,  1.30s/it]

training loss: 3.3589696884155273


training:  25%|██▌       | 2791/10986 [1:03:41<3:08:34,  1.38s/it]

training loss: 3.4083244800567627


training:  25%|██▌       | 2792/10986 [1:03:43<3:04:46,  1.35s/it]

training loss: 3.370788335800171


training:  25%|██▌       | 2793/10986 [1:03:44<3:02:08,  1.33s/it]

training loss: 3.3176589012145996


training:  25%|██▌       | 2794/10986 [1:03:45<2:59:49,  1.32s/it]

training loss: 3.4447226524353027


training:  25%|██▌       | 2795/10986 [1:03:47<2:57:56,  1.30s/it]

training loss: 3.385982036590576


training:  25%|██▌       | 2796/10986 [1:03:48<2:56:47,  1.30s/it]

training loss: 3.22524356842041


training:  25%|██▌       | 2797/10986 [1:03:49<2:55:24,  1.29s/it]

training loss: 3.420036554336548


training:  25%|██▌       | 2798/10986 [1:03:50<2:54:47,  1.28s/it]

training loss: 3.4400076866149902


training:  25%|██▌       | 2799/10986 [1:03:52<2:54:18,  1.28s/it]

training loss: 3.3155033588409424


training:  25%|██▌       | 2800/10986 [1:03:53<2:54:40,  1.28s/it]

training loss: 3.3416175842285156
valid loss: 3.3371567726135254
perplexity: 28.139007568359375


training:  25%|██▌       | 2801/10986 [1:03:56<3:53:48,  1.71s/it]

training loss: 3.4041390419006348


training:  26%|██▌       | 2802/10986 [1:03:57<3:49:44,  1.68s/it]

training loss: 3.309739828109741


training:  26%|██▌       | 2803/10986 [1:03:59<3:32:58,  1.56s/it]

training loss: 3.482006549835205


training:  26%|██▌       | 2804/10986 [1:04:00<3:20:25,  1.47s/it]

training loss: 3.4683356285095215


training:  26%|██▌       | 2805/10986 [1:04:01<3:12:40,  1.41s/it]

training loss: 3.38417649269104


training:  26%|██▌       | 2806/10986 [1:04:02<3:06:47,  1.37s/it]

training loss: 3.4003164768218994


training:  26%|██▌       | 2807/10986 [1:04:04<3:02:26,  1.34s/it]

training loss: 3.253406047821045


training:  26%|██▌       | 2808/10986 [1:04:05<3:01:57,  1.33s/it]

training loss: 3.395838975906372


training:  26%|██▌       | 2809/10986 [1:04:06<3:01:33,  1.33s/it]

training loss: 3.4965367317199707


training:  26%|██▌       | 2810/10986 [1:04:08<2:59:43,  1.32s/it]

training loss: 3.4834117889404297


training:  26%|██▌       | 2811/10986 [1:04:09<3:08:29,  1.38s/it]

training loss: 3.5779430866241455


training:  26%|██▌       | 2812/10986 [1:04:11<3:16:59,  1.45s/it]

training loss: 3.390540599822998


training:  26%|██▌       | 2813/10986 [1:04:12<3:09:55,  1.39s/it]

training loss: 3.343325614929199


training:  26%|██▌       | 2814/10986 [1:04:13<3:04:53,  1.36s/it]

training loss: 3.3380658626556396


training:  26%|██▌       | 2815/10986 [1:04:15<3:01:03,  1.33s/it]

training loss: 3.3229641914367676


training:  26%|██▌       | 2816/10986 [1:04:16<2:58:44,  1.31s/it]

training loss: 3.354684591293335


training:  26%|██▌       | 2817/10986 [1:04:17<2:57:55,  1.31s/it]

training loss: 3.3509957790374756


training:  26%|██▌       | 2818/10986 [1:04:18<2:57:28,  1.30s/it]

training loss: 3.3257699012756348


training:  26%|██▌       | 2819/10986 [1:04:20<2:55:36,  1.29s/it]

training loss: 3.337207555770874


training:  26%|██▌       | 2820/10986 [1:04:21<2:55:02,  1.29s/it]

training loss: 3.4315927028656006
valid loss: 3.427058458328247
perplexity: 30.785951614379883


training:  26%|██▌       | 2821/10986 [1:04:24<3:52:27,  1.71s/it]

training loss: 3.4015886783599854


training:  26%|██▌       | 2822/10986 [1:04:25<3:48:45,  1.68s/it]

training loss: 3.357241630554199


training:  26%|██▌       | 2823/10986 [1:04:27<3:33:01,  1.57s/it]

training loss: 3.314937114715576


training:  26%|██▌       | 2824/10986 [1:04:28<3:21:35,  1.48s/it]

training loss: 3.478515148162842


training:  26%|██▌       | 2825/10986 [1:04:29<3:12:04,  1.41s/it]

training loss: 3.4189178943634033


training:  26%|██▌       | 2826/10986 [1:04:30<3:06:14,  1.37s/it]

training loss: 3.489900827407837


training:  26%|██▌       | 2827/10986 [1:04:32<3:02:28,  1.34s/it]

training loss: 3.3416907787323


training:  26%|██▌       | 2828/10986 [1:04:33<2:59:52,  1.32s/it]

training loss: 3.3927996158599854


training:  26%|██▌       | 2829/10986 [1:04:34<2:57:36,  1.31s/it]

training loss: 3.4322922229766846


training:  26%|██▌       | 2830/10986 [1:04:35<2:58:10,  1.31s/it]

training loss: 3.3537895679473877


training:  26%|██▌       | 2831/10986 [1:04:37<3:08:03,  1.38s/it]

training loss: 3.414818286895752


training:  26%|██▌       | 2832/10986 [1:04:38<3:06:30,  1.37s/it]

training loss: 3.261446475982666


training:  26%|██▌       | 2833/10986 [1:04:40<3:02:33,  1.34s/it]

training loss: 3.202472686767578


training:  26%|██▌       | 2834/10986 [1:04:41<3:00:43,  1.33s/it]

training loss: 3.2979092597961426


training:  26%|██▌       | 2835/10986 [1:04:42<2:59:13,  1.32s/it]

training loss: 3.4329330921173096


training:  26%|██▌       | 2836/10986 [1:04:44<2:57:05,  1.30s/it]

training loss: 3.3408663272857666


training:  26%|██▌       | 2837/10986 [1:04:45<2:55:40,  1.29s/it]

training loss: 3.337228298187256


training:  26%|██▌       | 2838/10986 [1:04:46<2:55:02,  1.29s/it]

training loss: 3.364009380340576


training:  26%|██▌       | 2839/10986 [1:04:47<2:54:57,  1.29s/it]

training loss: 3.324246406555176


training:  26%|██▌       | 2840/10986 [1:04:49<2:53:39,  1.28s/it]

training loss: 3.322962999343872
valid loss: 3.3223960399627686
perplexity: 27.726707458496094


training:  26%|██▌       | 2841/10986 [1:04:51<3:52:34,  1.71s/it]

training loss: 3.4160220623016357


training:  26%|██▌       | 2842/10986 [1:04:53<3:50:07,  1.70s/it]

training loss: 3.367413282394409


training:  26%|██▌       | 2843/10986 [1:04:54<3:34:00,  1.58s/it]

training loss: 3.3252360820770264


training:  26%|██▌       | 2844/10986 [1:04:56<3:21:35,  1.49s/it]

training loss: 3.390364646911621


training:  26%|██▌       | 2845/10986 [1:04:57<3:12:39,  1.42s/it]

training loss: 3.3470335006713867


training:  26%|██▌       | 2846/10986 [1:04:58<3:06:42,  1.38s/it]

training loss: 3.3286960124969482


training:  26%|██▌       | 2847/10986 [1:04:59<3:04:14,  1.36s/it]

training loss: 3.2932145595550537


training:  26%|██▌       | 2848/10986 [1:05:01<3:00:31,  1.33s/it]

training loss: 3.295754909515381


training:  26%|██▌       | 2849/10986 [1:05:02<2:58:38,  1.32s/it]

training loss: 3.4325966835021973


training:  26%|██▌       | 2850/10986 [1:05:03<2:57:13,  1.31s/it]

training loss: 3.37972354888916


training:  26%|██▌       | 2851/10986 [1:05:05<3:06:14,  1.37s/it]

training loss: 3.2357537746429443


training:  26%|██▌       | 2852/10986 [1:05:06<3:05:34,  1.37s/it]

training loss: 3.40095853805542


training:  26%|██▌       | 2853/10986 [1:05:07<3:01:42,  1.34s/it]

training loss: 3.255647897720337


training:  26%|██▌       | 2854/10986 [1:05:09<2:58:57,  1.32s/it]

training loss: 3.535024642944336


training:  26%|██▌       | 2855/10986 [1:05:10<2:57:49,  1.31s/it]

training loss: 3.3480300903320312


training:  26%|██▌       | 2856/10986 [1:05:11<2:56:07,  1.30s/it]

training loss: 3.39994478225708


training:  26%|██▌       | 2857/10986 [1:05:13<2:55:43,  1.30s/it]

training loss: 3.311924695968628


training:  26%|██▌       | 2858/10986 [1:05:14<2:54:39,  1.29s/it]

training loss: 3.3300724029541016


training:  26%|██▌       | 2859/10986 [1:05:15<2:53:50,  1.28s/it]

training loss: 3.4013874530792236


training:  26%|██▌       | 2860/10986 [1:05:16<2:52:52,  1.28s/it]

training loss: 3.393965005874634
valid loss: 3.3955039978027344
perplexity: 29.829683303833008


training:  26%|██▌       | 2861/10986 [1:05:19<3:50:46,  1.70s/it]

training loss: 3.3205695152282715


training:  26%|██▌       | 2862/10986 [1:05:21<3:48:42,  1.69s/it]

training loss: 3.429621934890747


training:  26%|██▌       | 2863/10986 [1:05:22<3:31:52,  1.56s/it]

training loss: 3.399960517883301


training:  26%|██▌       | 2864/10986 [1:05:23<3:20:19,  1.48s/it]

training loss: 3.33004093170166


training:  26%|██▌       | 2865/10986 [1:05:25<3:11:49,  1.42s/it]

training loss: 3.293175458908081


training:  26%|██▌       | 2866/10986 [1:05:26<3:06:19,  1.38s/it]

training loss: 3.3098061084747314


training:  26%|██▌       | 2867/10986 [1:05:27<3:02:23,  1.35s/it]

training loss: 3.2689733505249023


training:  26%|██▌       | 2868/10986 [1:05:28<2:59:32,  1.33s/it]

training loss: 3.3492910861968994


training:  26%|██▌       | 2869/10986 [1:05:30<2:57:15,  1.31s/it]

training loss: 3.3743302822113037


training:  26%|██▌       | 2870/10986 [1:05:31<2:56:04,  1.30s/it]

training loss: 3.3375980854034424


training:  26%|██▌       | 2871/10986 [1:05:32<3:05:33,  1.37s/it]

training loss: 3.498225688934326


training:  26%|██▌       | 2872/10986 [1:05:34<3:13:55,  1.43s/it]

training loss: 3.434412717819214


training:  26%|██▌       | 2873/10986 [1:05:35<3:07:29,  1.39s/it]

training loss: 3.233246326446533


training:  26%|██▌       | 2874/10986 [1:05:37<3:05:37,  1.37s/it]

training loss: 3.385692596435547


training:  26%|██▌       | 2875/10986 [1:05:38<3:03:59,  1.36s/it]

training loss: 3.2987425327301025


training:  26%|██▌       | 2876/10986 [1:05:39<3:01:21,  1.34s/it]

training loss: 3.2681427001953125


training:  26%|██▌       | 2877/10986 [1:05:41<2:58:44,  1.32s/it]

training loss: 3.234145164489746


training:  26%|██▌       | 2878/10986 [1:05:42<2:56:57,  1.31s/it]

training loss: 3.3996036052703857


training:  26%|██▌       | 2879/10986 [1:05:43<2:55:28,  1.30s/it]

training loss: 3.3954343795776367


training:  26%|██▌       | 2880/10986 [1:05:44<2:54:25,  1.29s/it]

training loss: 3.379230499267578
valid loss: 3.37050724029541
perplexity: 29.093280792236328


training:  26%|██▌       | 2881/10986 [1:05:47<3:51:35,  1.71s/it]

training loss: 3.4060494899749756


training:  26%|██▌       | 2882/10986 [1:05:48<3:37:35,  1.61s/it]

training loss: 3.352752923965454


training:  26%|██▌       | 2883/10986 [1:05:50<3:24:25,  1.51s/it]

training loss: 3.395695686340332


training:  26%|██▋       | 2884/10986 [1:05:51<3:14:29,  1.44s/it]

training loss: 3.289625644683838


training:  26%|██▋       | 2885/10986 [1:05:52<3:08:37,  1.40s/it]

training loss: 3.338160276412964


training:  26%|██▋       | 2886/10986 [1:05:54<3:05:45,  1.38s/it]

training loss: 3.469386339187622


training:  26%|██▋       | 2887/10986 [1:05:55<3:01:58,  1.35s/it]

training loss: 3.4397263526916504


training:  26%|██▋       | 2888/10986 [1:05:56<2:58:20,  1.32s/it]

training loss: 3.439663887023926


training:  26%|██▋       | 2889/10986 [1:05:57<2:56:12,  1.31s/it]

training loss: 3.355067253112793


training:  26%|██▋       | 2890/10986 [1:05:59<2:54:23,  1.29s/it]

training loss: 3.3168041706085205


training:  26%|██▋       | 2891/10986 [1:06:00<3:04:18,  1.37s/it]

training loss: 3.39365291595459


training:  26%|██▋       | 2892/10986 [1:06:02<3:14:08,  1.44s/it]

training loss: 3.467226028442383


training:  26%|██▋       | 2893/10986 [1:06:03<3:08:22,  1.40s/it]

training loss: 3.337519884109497


training:  26%|██▋       | 2894/10986 [1:06:04<3:03:05,  1.36s/it]

training loss: 3.31484317779541


training:  26%|██▋       | 2895/10986 [1:06:06<2:59:50,  1.33s/it]

training loss: 3.4257655143737793


training:  26%|██▋       | 2896/10986 [1:06:07<2:57:52,  1.32s/it]

training loss: 3.406175136566162


training:  26%|██▋       | 2897/10986 [1:06:09<3:11:03,  1.42s/it]

training loss: 3.4808804988861084


training:  26%|██▋       | 2898/10986 [1:06:10<3:22:42,  1.50s/it]

training loss: 3.3347599506378174


training:  26%|██▋       | 2899/10986 [1:06:12<3:17:03,  1.46s/it]

training loss: 3.3175933361053467


training:  26%|██▋       | 2900/10986 [1:06:13<3:09:20,  1.40s/it]

training loss: 3.3550453186035156
valid loss: 3.355175256729126
perplexity: 28.650625228881836


training:  26%|██▋       | 2901/10986 [1:06:16<4:02:25,  1.80s/it]

training loss: 3.382547378540039


training:  26%|██▋       | 2902/10986 [1:06:17<3:43:25,  1.66s/it]

training loss: 3.391467571258545


training:  26%|██▋       | 2903/10986 [1:06:18<3:28:18,  1.55s/it]

training loss: 3.386766195297241


training:  26%|██▋       | 2904/10986 [1:06:20<3:16:19,  1.46s/it]

training loss: 3.3865206241607666


training:  26%|██▋       | 2905/10986 [1:06:21<3:08:00,  1.40s/it]

training loss: 3.393956184387207


training:  26%|██▋       | 2906/10986 [1:06:22<3:02:04,  1.35s/it]

training loss: 3.4491114616394043


training:  26%|██▋       | 2907/10986 [1:06:23<2:58:17,  1.32s/it]

training loss: 3.4087507724761963


training:  26%|██▋       | 2908/10986 [1:06:25<2:56:10,  1.31s/it]

training loss: 3.3394834995269775


training:  26%|██▋       | 2909/10986 [1:06:26<2:53:48,  1.29s/it]

training loss: 3.2825160026550293


training:  26%|██▋       | 2910/10986 [1:06:27<2:52:40,  1.28s/it]

training loss: 3.325018882751465


training:  26%|██▋       | 2911/10986 [1:06:29<3:02:31,  1.36s/it]

training loss: 3.391207695007324


training:  27%|██▋       | 2912/10986 [1:06:30<2:59:19,  1.33s/it]

training loss: 3.3294951915740967


training:  27%|██▋       | 2913/10986 [1:06:31<2:56:29,  1.31s/it]

training loss: 3.554546356201172


training:  27%|██▋       | 2914/10986 [1:06:32<2:54:29,  1.30s/it]

training loss: 3.359644651412964


training:  27%|██▋       | 2915/10986 [1:06:34<2:53:30,  1.29s/it]

training loss: 3.496978998184204


training:  27%|██▋       | 2916/10986 [1:06:35<2:52:57,  1.29s/it]

training loss: 3.2669732570648193


training:  27%|██▋       | 2917/10986 [1:06:36<2:52:06,  1.28s/it]

training loss: 3.4262564182281494


training:  27%|██▋       | 2918/10986 [1:06:38<2:51:07,  1.27s/it]

training loss: 3.3955793380737305


training:  27%|██▋       | 2919/10986 [1:06:39<2:51:10,  1.27s/it]

training loss: 3.409904718399048


training:  27%|██▋       | 2920/10986 [1:06:40<2:50:54,  1.27s/it]

training loss: 3.332367420196533
valid loss: 3.337888479232788
perplexity: 28.159603118896484


training:  27%|██▋       | 2921/10986 [1:06:43<3:48:00,  1.70s/it]

training loss: 3.41644549369812


training:  27%|██▋       | 2922/10986 [1:06:44<3:34:17,  1.59s/it]

training loss: 3.4167418479919434


training:  27%|██▋       | 2923/10986 [1:06:45<3:24:51,  1.52s/it]

training loss: 3.4356791973114014


training:  27%|██▋       | 2924/10986 [1:06:47<3:13:17,  1.44s/it]

training loss: 3.357729434967041


training:  27%|██▋       | 2925/10986 [1:06:48<3:06:28,  1.39s/it]

training loss: 3.536428213119507


training:  27%|██▋       | 2926/10986 [1:06:49<3:01:49,  1.35s/it]

training loss: 3.381143093109131


training:  27%|██▋       | 2927/10986 [1:06:50<2:58:16,  1.33s/it]

training loss: 3.426981210708618


training:  27%|██▋       | 2928/10986 [1:06:52<2:56:02,  1.31s/it]

training loss: 3.320477247238159


training:  27%|██▋       | 2929/10986 [1:06:53<2:54:17,  1.30s/it]

training loss: 3.3657429218292236


training:  27%|██▋       | 2930/10986 [1:06:54<2:53:36,  1.29s/it]

training loss: 3.492241621017456


training:  27%|██▋       | 2931/10986 [1:06:56<3:03:17,  1.37s/it]

training loss: 3.3573269844055176


training:  27%|██▋       | 2932/10986 [1:06:57<3:01:13,  1.35s/it]

training loss: 3.4093844890594482


training:  27%|██▋       | 2933/10986 [1:06:58<2:57:23,  1.32s/it]

training loss: 3.3909809589385986


training:  27%|██▋       | 2934/10986 [1:07:00<2:54:47,  1.30s/it]

training loss: 3.445242166519165


training:  27%|██▋       | 2935/10986 [1:07:01<2:53:21,  1.29s/it]

training loss: 3.372112512588501


training:  27%|██▋       | 2936/10986 [1:07:02<2:52:09,  1.28s/it]

training loss: 3.3245794773101807


training:  27%|██▋       | 2937/10986 [1:07:03<2:51:05,  1.28s/it]

training loss: 3.405783176422119


training:  27%|██▋       | 2938/10986 [1:07:05<2:49:52,  1.27s/it]

training loss: 3.373183250427246


training:  27%|██▋       | 2939/10986 [1:07:06<2:50:00,  1.27s/it]

training loss: 3.3661952018737793


training:  27%|██▋       | 2940/10986 [1:07:07<2:50:09,  1.27s/it]

training loss: 3.375880718231201
valid loss: 3.377194881439209
perplexity: 29.288497924804688


training:  27%|██▋       | 2941/10986 [1:07:10<3:47:35,  1.70s/it]

training loss: 3.5010626316070557


training:  27%|██▋       | 2942/10986 [1:07:11<3:34:41,  1.60s/it]

training loss: 3.4506568908691406


training:  27%|██▋       | 2943/10986 [1:07:13<3:20:30,  1.50s/it]

training loss: 3.356081485748291


training:  27%|██▋       | 2944/10986 [1:07:14<3:10:11,  1.42s/it]

training loss: 3.2892348766326904


training:  27%|██▋       | 2945/10986 [1:07:15<3:03:45,  1.37s/it]

training loss: 3.453768253326416


training:  27%|██▋       | 2946/10986 [1:07:16<2:58:09,  1.33s/it]

training loss: 3.4499340057373047


training:  27%|██▋       | 2947/10986 [1:07:18<2:54:32,  1.30s/it]

training loss: 3.257399320602417


training:  27%|██▋       | 2948/10986 [1:07:19<2:52:43,  1.29s/it]

training loss: 3.3720543384552


training:  27%|██▋       | 2949/10986 [1:07:20<2:51:52,  1.28s/it]

training loss: 3.4544670581817627


training:  27%|██▋       | 2950/10986 [1:07:21<2:50:28,  1.27s/it]

training loss: 3.329545021057129


training:  27%|██▋       | 2951/10986 [1:07:23<2:59:03,  1.34s/it]

training loss: 3.3253884315490723


training:  27%|██▋       | 2952/10986 [1:07:24<2:57:03,  1.32s/it]

training loss: 3.348769426345825


training:  27%|██▋       | 2953/10986 [1:07:25<2:54:00,  1.30s/it]

training loss: 3.368159294128418


training:  27%|██▋       | 2954/10986 [1:07:27<2:51:35,  1.28s/it]

training loss: 3.404611110687256


training:  27%|██▋       | 2955/10986 [1:07:28<2:50:04,  1.27s/it]

training loss: 3.4739999771118164


training:  27%|██▋       | 2956/10986 [1:07:29<2:49:59,  1.27s/it]

training loss: 3.36444354057312


training:  27%|██▋       | 2957/10986 [1:07:30<2:49:24,  1.27s/it]

training loss: 3.4166910648345947


training:  27%|██▋       | 2958/10986 [1:07:32<2:48:34,  1.26s/it]

training loss: 3.302980661392212


training:  27%|██▋       | 2959/10986 [1:07:33<2:47:48,  1.25s/it]

training loss: 3.5380141735076904


training:  27%|██▋       | 2960/10986 [1:07:34<2:47:20,  1.25s/it]

training loss: 3.365394353866577
valid loss: 3.367574691772461
perplexity: 29.008089065551758


training:  27%|██▋       | 2961/10986 [1:07:37<3:42:46,  1.67s/it]

training loss: 3.4235854148864746


training:  27%|██▋       | 2962/10986 [1:07:38<3:29:07,  1.56s/it]

training loss: 3.3068809509277344


training:  27%|██▋       | 2963/10986 [1:07:39<3:18:37,  1.49s/it]

training loss: 3.3578312397003174


training:  27%|██▋       | 2964/10986 [1:07:41<3:11:22,  1.43s/it]

training loss: 3.3800013065338135


training:  27%|██▋       | 2965/10986 [1:07:42<3:04:21,  1.38s/it]

training loss: 3.3827977180480957


training:  27%|██▋       | 2966/10986 [1:07:43<2:59:06,  1.34s/it]

training loss: 3.44003963470459


training:  27%|██▋       | 2967/10986 [1:07:44<2:55:44,  1.31s/it]

training loss: 3.4020395278930664


training:  27%|██▋       | 2968/10986 [1:07:46<2:53:51,  1.30s/it]

training loss: 3.3169641494750977


training:  27%|██▋       | 2969/10986 [1:07:47<2:52:15,  1.29s/it]

training loss: 3.476553440093994


training:  27%|██▋       | 2970/10986 [1:07:48<2:50:36,  1.28s/it]

training loss: 3.416735887527466


training:  27%|██▋       | 2971/10986 [1:07:50<3:00:52,  1.35s/it]

training loss: 3.3430235385894775


training:  27%|██▋       | 2972/10986 [1:07:51<3:09:07,  1.42s/it]

training loss: 3.400452136993408


training:  27%|██▋       | 2973/10986 [1:07:53<3:03:43,  1.38s/it]

training loss: 3.458022117614746


training:  27%|██▋       | 2974/10986 [1:07:54<2:59:42,  1.35s/it]

training loss: 3.424800157546997


training:  27%|██▋       | 2975/10986 [1:07:55<2:56:22,  1.32s/it]

training loss: 3.365382194519043


training:  27%|██▋       | 2976/10986 [1:07:56<2:54:11,  1.30s/it]

training loss: 3.3502273559570312


training:  27%|██▋       | 2977/10986 [1:07:58<2:52:28,  1.29s/it]

training loss: 3.353550434112549


training:  27%|██▋       | 2978/10986 [1:07:59<2:52:00,  1.29s/it]

training loss: 3.3182013034820557


training:  27%|██▋       | 2979/10986 [1:08:00<2:51:29,  1.29s/it]

training loss: 3.381329298019409


training:  27%|██▋       | 2980/10986 [1:08:01<2:50:54,  1.28s/it]

training loss: 3.4683456420898438
valid loss: 3.4680819511413574
perplexity: 32.07516098022461


training:  27%|██▋       | 2981/10986 [1:08:04<3:45:34,  1.69s/it]

training loss: 3.420222282409668


training:  27%|██▋       | 2982/10986 [1:08:05<3:31:54,  1.59s/it]

training loss: 3.455479621887207


training:  27%|██▋       | 2983/10986 [1:08:07<3:19:35,  1.50s/it]

training loss: 3.3484537601470947


training:  27%|██▋       | 2984/10986 [1:08:08<3:10:15,  1.43s/it]

training loss: 3.375108003616333


training:  27%|██▋       | 2985/10986 [1:08:09<3:03:51,  1.38s/it]

training loss: 3.3229732513427734


training:  27%|██▋       | 2986/10986 [1:08:11<3:01:41,  1.36s/it]

training loss: 3.3588945865631104


training:  27%|██▋       | 2987/10986 [1:08:12<2:58:54,  1.34s/it]

training loss: 3.458495616912842


training:  27%|██▋       | 2988/10986 [1:08:13<2:55:28,  1.32s/it]

training loss: 3.4241185188293457


training:  27%|██▋       | 2989/10986 [1:08:14<2:53:05,  1.30s/it]

training loss: 3.286195993423462


training:  27%|██▋       | 2990/10986 [1:08:16<2:51:44,  1.29s/it]

training loss: 3.392808675765991


training:  27%|██▋       | 2991/10986 [1:08:17<3:01:51,  1.36s/it]

training loss: 3.4737534523010254


training:  27%|██▋       | 2992/10986 [1:08:19<2:58:41,  1.34s/it]

training loss: 3.3946332931518555


training:  27%|██▋       | 2993/10986 [1:08:20<2:57:35,  1.33s/it]

training loss: 3.4234633445739746


training:  27%|██▋       | 2994/10986 [1:08:21<3:00:09,  1.35s/it]

training loss: 3.3616628646850586


training:  27%|██▋       | 2995/10986 [1:08:23<3:00:59,  1.36s/it]

training loss: 3.3689022064208984


training:  27%|██▋       | 2996/10986 [1:08:24<3:00:06,  1.35s/it]

training loss: 3.363872528076172


training:  27%|██▋       | 2997/10986 [1:08:25<2:57:57,  1.34s/it]

training loss: 3.395310640335083


training:  27%|██▋       | 2998/10986 [1:08:27<2:55:51,  1.32s/it]

training loss: 3.30464506149292


training:  27%|██▋       | 2999/10986 [1:08:28<2:54:24,  1.31s/it]

training loss: 3.2932095527648926


training:  27%|██▋       | 3000/10986 [1:08:29<2:53:54,  1.31s/it]

training loss: 3.4249486923217773
valid loss: 3.418198823928833
perplexity: 30.514402389526367


training:  27%|██▋       | 3001/10986 [1:08:32<3:50:49,  1.73s/it]

training loss: 3.2924368381500244


training:  27%|██▋       | 3002/10986 [1:08:33<3:36:10,  1.62s/it]

training loss: 3.375368118286133


training:  27%|██▋       | 3003/10986 [1:08:34<3:23:08,  1.53s/it]

training loss: 3.397014617919922


training:  27%|██▋       | 3004/10986 [1:08:36<3:13:05,  1.45s/it]

training loss: 3.4236299991607666


training:  27%|██▋       | 3005/10986 [1:08:37<3:06:15,  1.40s/it]

training loss: 3.373096466064453


training:  27%|██▋       | 3006/10986 [1:08:38<3:02:26,  1.37s/it]

training loss: 3.264928102493286


training:  27%|██▋       | 3007/10986 [1:08:40<3:01:44,  1.37s/it]

training loss: 3.2869927883148193


training:  27%|██▋       | 3008/10986 [1:08:41<3:00:14,  1.36s/it]

training loss: 3.1941895484924316


training:  27%|██▋       | 3009/10986 [1:08:42<2:59:49,  1.35s/it]

training loss: 3.354930877685547


training:  27%|██▋       | 3010/10986 [1:08:44<2:57:30,  1.34s/it]

training loss: 3.2979345321655273


training:  27%|██▋       | 3011/10986 [1:08:45<3:09:50,  1.43s/it]

training loss: 3.4731621742248535


training:  27%|██▋       | 3012/10986 [1:08:47<3:08:42,  1.42s/it]

training loss: 3.32661771774292


training:  27%|██▋       | 3013/10986 [1:08:48<3:08:09,  1.42s/it]

training loss: 3.31350040435791


training:  27%|██▋       | 3014/10986 [1:08:50<3:07:03,  1.41s/it]

training loss: 3.320540189743042


training:  27%|██▋       | 3015/10986 [1:08:51<3:05:44,  1.40s/it]

training loss: 3.4302380084991455


training:  27%|██▋       | 3016/10986 [1:08:52<3:05:10,  1.39s/it]

training loss: 3.3268978595733643


training:  27%|██▋       | 3017/10986 [1:08:54<3:05:40,  1.40s/it]

training loss: 3.3334975242614746


training:  27%|██▋       | 3018/10986 [1:08:55<3:04:26,  1.39s/it]

training loss: 3.382009744644165


training:  27%|██▋       | 3019/10986 [1:08:56<3:03:20,  1.38s/it]

training loss: 3.407832622528076


training:  27%|██▋       | 3020/10986 [1:08:58<3:02:07,  1.37s/it]

training loss: 3.3722338676452637
valid loss: 3.37166690826416
perplexity: 29.12704086303711


training:  27%|██▋       | 3021/10986 [1:09:01<4:00:44,  1.81s/it]

training loss: 3.320176839828491


training:  28%|██▊       | 3022/10986 [1:09:02<3:46:48,  1.71s/it]

training loss: 3.425182342529297


training:  28%|██▊       | 3023/10986 [1:09:03<3:32:25,  1.60s/it]

training loss: 3.4322433471679688


training:  28%|██▊       | 3024/10986 [1:09:05<3:20:34,  1.51s/it]

training loss: 3.3245949745178223


training:  28%|██▊       | 3025/10986 [1:09:06<3:12:06,  1.45s/it]

training loss: 3.5314791202545166


training:  28%|██▊       | 3026/10986 [1:09:07<3:04:42,  1.39s/it]

training loss: 3.3625142574310303


training:  28%|██▊       | 3027/10986 [1:09:09<2:59:49,  1.36s/it]

training loss: 3.3675668239593506


training:  28%|██▊       | 3028/10986 [1:09:10<2:56:19,  1.33s/it]

training loss: 3.3661906719207764


training:  28%|██▊       | 3029/10986 [1:09:11<2:55:42,  1.32s/it]

training loss: 3.356943130493164


training:  28%|██▊       | 3030/10986 [1:09:12<2:55:09,  1.32s/it]

training loss: 3.3475589752197266


training:  28%|██▊       | 3031/10986 [1:09:14<3:03:35,  1.38s/it]

training loss: 3.317309856414795


training:  28%|██▊       | 3032/10986 [1:09:16<3:12:49,  1.45s/it]

training loss: 3.4217801094055176


training:  28%|██▊       | 3033/10986 [1:09:17<3:05:10,  1.40s/it]

training loss: 3.404905319213867


training:  28%|██▊       | 3034/10986 [1:09:18<3:00:00,  1.36s/it]

training loss: 3.316638708114624


training:  28%|██▊       | 3035/10986 [1:09:19<2:56:20,  1.33s/it]

training loss: 3.2801125049591064


training:  28%|██▊       | 3036/10986 [1:09:21<2:53:50,  1.31s/it]

training loss: 3.373746156692505


training:  28%|██▊       | 3037/10986 [1:09:22<2:52:17,  1.30s/it]

training loss: 3.4583332538604736


training:  28%|██▊       | 3038/10986 [1:09:23<2:50:49,  1.29s/it]

training loss: 3.383443832397461


training:  28%|██▊       | 3039/10986 [1:09:24<2:49:55,  1.28s/it]

training loss: 3.3032760620117188


training:  28%|██▊       | 3040/10986 [1:09:26<2:50:11,  1.29s/it]

training loss: 3.4587719440460205
valid loss: 3.4600272178649902
perplexity: 31.817842483520508


training:  28%|██▊       | 3041/10986 [1:09:28<3:46:10,  1.71s/it]

training loss: 3.338207721710205


training:  28%|██▊       | 3042/10986 [1:09:30<3:43:01,  1.68s/it]

training loss: 3.336221933364868


training:  28%|██▊       | 3043/10986 [1:09:31<3:26:51,  1.56s/it]

training loss: 3.3251657485961914


training:  28%|██▊       | 3044/10986 [1:09:33<3:15:25,  1.48s/it]

training loss: 3.363600730895996


training:  28%|██▊       | 3045/10986 [1:09:34<3:07:42,  1.42s/it]

training loss: 3.388957977294922


training:  28%|██▊       | 3046/10986 [1:09:36<3:18:45,  1.50s/it]

training loss: 3.311173915863037


training:  28%|██▊       | 3047/10986 [1:09:37<3:27:02,  1.56s/it]

training loss: 3.4162516593933105


training:  28%|██▊       | 3048/10986 [1:09:39<3:15:33,  1.48s/it]

training loss: 3.3565123081207275


training:  28%|██▊       | 3049/10986 [1:09:40<3:07:22,  1.42s/it]

training loss: 3.352234363555908


training:  28%|██▊       | 3050/10986 [1:09:41<3:01:05,  1.37s/it]

training loss: 3.5454978942871094


training:  28%|██▊       | 3051/10986 [1:09:43<3:08:42,  1.43s/it]

training loss: 3.3312039375305176


training:  28%|██▊       | 3052/10986 [1:09:44<3:05:39,  1.40s/it]

training loss: 3.341609477996826


training:  28%|██▊       | 3053/10986 [1:09:45<3:01:08,  1.37s/it]

training loss: 3.3870561122894287


training:  28%|██▊       | 3054/10986 [1:09:47<2:57:53,  1.35s/it]

training loss: 3.3273847103118896


training:  28%|██▊       | 3055/10986 [1:09:48<2:55:17,  1.33s/it]

training loss: 3.4648139476776123


training:  28%|██▊       | 3056/10986 [1:09:49<2:52:53,  1.31s/it]

training loss: 3.391747236251831


training:  28%|██▊       | 3057/10986 [1:09:50<2:51:33,  1.30s/it]

training loss: 3.382199287414551


training:  28%|██▊       | 3058/10986 [1:09:52<2:50:40,  1.29s/it]

training loss: 3.3365423679351807


training:  28%|██▊       | 3059/10986 [1:09:53<2:49:52,  1.29s/it]

training loss: 3.3612592220306396


training:  28%|██▊       | 3060/10986 [1:09:54<2:49:14,  1.28s/it]

training loss: 3.316852331161499
valid loss: 3.313426971435547
perplexity: 27.47913360595703


training:  28%|██▊       | 3061/10986 [1:09:57<3:45:11,  1.70s/it]

training loss: 3.3304977416992188


training:  28%|██▊       | 3062/10986 [1:09:58<3:31:23,  1.60s/it]

training loss: 3.3409762382507324


training:  28%|██▊       | 3063/10986 [1:10:00<3:18:16,  1.50s/it]

training loss: 3.367856740951538


training:  28%|██▊       | 3064/10986 [1:10:01<3:09:28,  1.44s/it]

training loss: 3.328237295150757


training:  28%|██▊       | 3065/10986 [1:10:02<3:03:23,  1.39s/it]

training loss: 3.327392816543579


training:  28%|██▊       | 3066/10986 [1:10:03<2:58:03,  1.35s/it]

training loss: 3.262498617172241


training:  28%|██▊       | 3067/10986 [1:10:05<2:54:34,  1.32s/it]

training loss: 3.2815001010894775


training:  28%|██▊       | 3068/10986 [1:10:06<2:51:42,  1.30s/it]

training loss: 3.266314744949341


training:  28%|██▊       | 3069/10986 [1:10:07<2:50:21,  1.29s/it]

training loss: 3.307185411453247


training:  28%|██▊       | 3070/10986 [1:10:08<2:49:42,  1.29s/it]

training loss: 3.3127336502075195


training:  28%|██▊       | 3071/10986 [1:10:10<3:00:09,  1.37s/it]

training loss: 3.3124847412109375


training:  28%|██▊       | 3072/10986 [1:10:11<2:57:53,  1.35s/it]

training loss: 3.3726680278778076


training:  28%|██▊       | 3073/10986 [1:10:13<2:56:06,  1.34s/it]

training loss: 3.464081287384033


training:  28%|██▊       | 3074/10986 [1:10:14<2:54:26,  1.32s/it]

training loss: 3.4124999046325684


training:  28%|██▊       | 3075/10986 [1:10:15<2:51:45,  1.30s/it]

training loss: 3.36641526222229


training:  28%|██▊       | 3076/10986 [1:10:16<2:49:57,  1.29s/it]

training loss: 3.3125665187835693


training:  28%|██▊       | 3077/10986 [1:10:18<2:48:39,  1.28s/it]

training loss: 3.393195629119873


training:  28%|██▊       | 3078/10986 [1:10:19<2:48:18,  1.28s/it]

training loss: 3.4244914054870605


training:  28%|██▊       | 3079/10986 [1:10:20<2:47:32,  1.27s/it]

training loss: 3.3830580711364746


training:  28%|██▊       | 3080/10986 [1:10:21<2:47:00,  1.27s/it]

training loss: 3.358757495880127
valid loss: 3.3555052280426025
perplexity: 28.66008186340332


training:  28%|██▊       | 3081/10986 [1:10:24<3:43:37,  1.70s/it]

training loss: 3.420823812484741


training:  28%|██▊       | 3082/10986 [1:10:26<3:29:31,  1.59s/it]

training loss: 3.3137447834014893


training:  28%|██▊       | 3083/10986 [1:10:27<3:17:25,  1.50s/it]

training loss: 3.4256601333618164


training:  28%|██▊       | 3084/10986 [1:10:28<3:07:44,  1.43s/it]

training loss: 3.3061585426330566


training:  28%|██▊       | 3085/10986 [1:10:29<3:00:52,  1.37s/it]

training loss: 3.3471200466156006


training:  28%|██▊       | 3086/10986 [1:10:31<2:56:19,  1.34s/it]

training loss: 3.480808734893799


training:  28%|██▊       | 3087/10986 [1:10:32<2:53:01,  1.31s/it]

training loss: 3.461214780807495


training:  28%|██▊       | 3088/10986 [1:10:33<2:50:43,  1.30s/it]

training loss: 3.312133312225342


training:  28%|██▊       | 3089/10986 [1:10:34<2:49:50,  1.29s/it]

training loss: 3.399540662765503


training:  28%|██▊       | 3090/10986 [1:10:36<2:48:30,  1.28s/it]

training loss: 3.4420037269592285


training:  28%|██▊       | 3091/10986 [1:10:37<2:57:29,  1.35s/it]

training loss: 3.3479130268096924


training:  28%|██▊       | 3092/10986 [1:10:39<3:07:16,  1.42s/it]

training loss: 3.3198866844177246


training:  28%|██▊       | 3093/10986 [1:10:40<3:01:19,  1.38s/it]

training loss: 3.4402756690979004


training:  28%|██▊       | 3094/10986 [1:10:41<2:57:14,  1.35s/it]

training loss: 3.3635294437408447


training:  28%|██▊       | 3095/10986 [1:10:43<2:54:55,  1.33s/it]

training loss: 3.5197246074676514


training:  28%|██▊       | 3096/10986 [1:10:44<2:52:25,  1.31s/it]

training loss: 3.43713641166687


training:  28%|██▊       | 3097/10986 [1:10:45<2:50:08,  1.29s/it]

training loss: 3.2669119834899902


training:  28%|██▊       | 3098/10986 [1:10:46<2:47:49,  1.28s/it]

training loss: 3.397784948348999


training:  28%|██▊       | 3099/10986 [1:10:48<2:46:59,  1.27s/it]

training loss: 3.516911029815674


training:  28%|██▊       | 3100/10986 [1:10:49<2:45:58,  1.26s/it]

training loss: 3.37058162689209
valid loss: 3.3691985607147217
perplexity: 29.05523109436035


training:  28%|██▊       | 3101/10986 [1:10:52<3:42:22,  1.69s/it]

training loss: 3.3820645809173584


training:  28%|██▊       | 3102/10986 [1:10:53<3:31:05,  1.61s/it]

training loss: 3.450392961502075


training:  28%|██▊       | 3103/10986 [1:10:54<3:18:29,  1.51s/it]

training loss: 3.3280773162841797


training:  28%|██▊       | 3104/10986 [1:10:55<3:08:16,  1.43s/it]

training loss: 3.461404323577881


training:  28%|██▊       | 3105/10986 [1:10:57<3:01:15,  1.38s/it]

training loss: 3.4166460037231445


training:  28%|██▊       | 3106/10986 [1:10:58<2:56:33,  1.34s/it]

training loss: 3.307471752166748


training:  28%|██▊       | 3107/10986 [1:10:59<2:53:25,  1.32s/it]

training loss: 3.3802103996276855


training:  28%|██▊       | 3108/10986 [1:11:01<2:50:39,  1.30s/it]

training loss: 3.338408946990967


training:  28%|██▊       | 3109/10986 [1:11:02<2:49:32,  1.29s/it]

training loss: 3.437880039215088


training:  28%|██▊       | 3110/10986 [1:11:03<2:47:53,  1.28s/it]

training loss: 3.334803819656372


training:  28%|██▊       | 3111/10986 [1:11:05<2:56:43,  1.35s/it]

training loss: 3.285860061645508


training:  28%|██▊       | 3112/10986 [1:11:06<2:54:03,  1.33s/it]

training loss: 3.324988603591919


training:  28%|██▊       | 3113/10986 [1:11:07<2:50:49,  1.30s/it]

training loss: 3.415130138397217


training:  28%|██▊       | 3114/10986 [1:11:08<2:48:27,  1.28s/it]

training loss: 3.3881311416625977


training:  28%|██▊       | 3115/10986 [1:11:10<2:47:23,  1.28s/it]

training loss: 3.348862648010254


training:  28%|██▊       | 3116/10986 [1:11:11<2:48:30,  1.28s/it]

training loss: 3.401867151260376


training:  28%|██▊       | 3117/10986 [1:11:12<2:47:06,  1.27s/it]

training loss: 3.358492612838745


training:  28%|██▊       | 3118/10986 [1:11:13<2:47:31,  1.28s/it]

training loss: 3.3634860515594482


training:  28%|██▊       | 3119/10986 [1:11:15<2:48:16,  1.28s/it]

training loss: 3.3287975788116455


training:  28%|██▊       | 3120/10986 [1:11:16<2:47:01,  1.27s/it]

training loss: 3.3979122638702393
valid loss: 3.3991498947143555
perplexity: 29.938636779785156


training:  28%|██▊       | 3121/10986 [1:11:19<3:41:48,  1.69s/it]

training loss: 3.4036192893981934


training:  28%|██▊       | 3122/10986 [1:11:20<3:26:59,  1.58s/it]

training loss: 3.3257339000701904


training:  28%|██▊       | 3123/10986 [1:11:21<3:15:58,  1.50s/it]

training loss: 3.4698944091796875


training:  28%|██▊       | 3124/10986 [1:11:22<3:07:02,  1.43s/it]

training loss: 3.3310890197753906


training:  28%|██▊       | 3125/10986 [1:11:24<3:00:57,  1.38s/it]

training loss: 3.3572633266448975


training:  28%|██▊       | 3126/10986 [1:11:25<2:57:12,  1.35s/it]

training loss: 3.3635780811309814


training:  28%|██▊       | 3127/10986 [1:11:26<2:54:23,  1.33s/it]

training loss: 3.3114798069000244


training:  28%|██▊       | 3128/10986 [1:11:28<2:52:14,  1.32s/it]

training loss: 3.4289581775665283


training:  28%|██▊       | 3129/10986 [1:11:29<2:50:15,  1.30s/it]

training loss: 3.2513012886047363


training:  28%|██▊       | 3130/10986 [1:11:30<2:49:43,  1.30s/it]

training loss: 3.4123189449310303


training:  28%|██▊       | 3131/10986 [1:11:32<3:01:29,  1.39s/it]

training loss: 3.415743112564087


training:  29%|██▊       | 3132/10986 [1:11:33<2:57:25,  1.36s/it]

training loss: 3.3566346168518066


training:  29%|██▊       | 3133/10986 [1:11:34<2:53:52,  1.33s/it]

training loss: 3.42836332321167


training:  29%|██▊       | 3134/10986 [1:11:36<2:50:56,  1.31s/it]

training loss: 3.502687454223633


training:  29%|██▊       | 3135/10986 [1:11:37<2:48:47,  1.29s/it]

training loss: 3.4485371112823486


training:  29%|██▊       | 3136/10986 [1:11:38<2:48:12,  1.29s/it]

training loss: 3.424896001815796


training:  29%|██▊       | 3137/10986 [1:11:39<2:47:19,  1.28s/it]

training loss: 3.2979812622070312


training:  29%|██▊       | 3138/10986 [1:11:41<2:46:53,  1.28s/it]

training loss: 3.5202295780181885


training:  29%|██▊       | 3139/10986 [1:11:42<2:45:45,  1.27s/it]

training loss: 3.4762041568756104


training:  29%|██▊       | 3140/10986 [1:11:43<2:47:05,  1.28s/it]

training loss: 3.285900592803955
valid loss: 3.2859084606170654
perplexity: 26.733259201049805


training:  29%|██▊       | 3141/10986 [1:11:46<3:45:15,  1.72s/it]

training loss: 3.319286584854126


training:  29%|██▊       | 3142/10986 [1:11:47<3:31:06,  1.61s/it]

training loss: 3.3709864616394043


training:  29%|██▊       | 3143/10986 [1:11:49<3:17:58,  1.51s/it]

training loss: 3.3472259044647217


training:  29%|██▊       | 3144/10986 [1:11:50<3:07:40,  1.44s/it]

training loss: 3.4833927154541016


training:  29%|██▊       | 3145/10986 [1:11:51<3:01:13,  1.39s/it]

training loss: 3.356569290161133


training:  29%|██▊       | 3146/10986 [1:11:52<2:56:05,  1.35s/it]

training loss: 3.3833518028259277


training:  29%|██▊       | 3147/10986 [1:11:54<2:53:19,  1.33s/it]

training loss: 3.4049525260925293


training:  29%|██▊       | 3148/10986 [1:11:55<2:51:20,  1.31s/it]

training loss: 3.2905850410461426


training:  29%|██▊       | 3149/10986 [1:11:56<2:49:35,  1.30s/it]

training loss: 3.3474338054656982


training:  29%|██▊       | 3150/10986 [1:11:57<2:48:15,  1.29s/it]

training loss: 3.389204740524292


training:  29%|██▊       | 3151/10986 [1:11:59<2:56:31,  1.35s/it]

training loss: 3.3966572284698486


training:  29%|██▊       | 3152/10986 [1:12:00<2:54:09,  1.33s/it]

training loss: 3.5266594886779785


training:  29%|██▊       | 3153/10986 [1:12:01<2:50:54,  1.31s/it]

training loss: 3.292022705078125


training:  29%|██▊       | 3154/10986 [1:12:03<2:48:53,  1.29s/it]

training loss: 3.269644260406494


training:  29%|██▊       | 3155/10986 [1:12:04<2:47:13,  1.28s/it]

training loss: 3.3657236099243164


training:  29%|██▊       | 3156/10986 [1:12:05<2:46:18,  1.27s/it]

training loss: 3.3555643558502197


training:  29%|██▊       | 3157/10986 [1:12:07<2:45:51,  1.27s/it]

training loss: 3.2787468433380127


training:  29%|██▊       | 3158/10986 [1:12:08<2:44:59,  1.26s/it]

training loss: 3.319844961166382


training:  29%|██▉       | 3159/10986 [1:12:09<2:44:58,  1.26s/it]

training loss: 3.409508228302002


training:  29%|██▉       | 3160/10986 [1:12:10<2:45:15,  1.27s/it]

training loss: 3.4742836952209473
valid loss: 3.4716243743896484
perplexity: 32.188987731933594


training:  29%|██▉       | 3161/10986 [1:12:13<3:40:30,  1.69s/it]

training loss: 3.5325381755828857


training:  29%|██▉       | 3162/10986 [1:12:14<3:29:17,  1.61s/it]

training loss: 3.380183219909668


training:  29%|██▉       | 3163/10986 [1:12:16<3:16:16,  1.51s/it]

training loss: 3.3400466442108154


training:  29%|██▉       | 3164/10986 [1:12:17<3:06:54,  1.43s/it]

training loss: 3.4100189208984375


training:  29%|██▉       | 3165/10986 [1:12:18<3:00:48,  1.39s/it]

training loss: 3.48343563079834


training:  29%|██▉       | 3166/10986 [1:12:19<2:55:33,  1.35s/it]

training loss: 3.327903985977173


training:  29%|██▉       | 3167/10986 [1:12:21<2:51:54,  1.32s/it]

training loss: 3.372621774673462


training:  29%|██▉       | 3168/10986 [1:12:22<2:49:19,  1.30s/it]

training loss: 3.446956157684326


training:  29%|██▉       | 3169/10986 [1:12:23<2:47:39,  1.29s/it]

training loss: 3.2785215377807617


training:  29%|██▉       | 3170/10986 [1:12:25<2:47:32,  1.29s/it]

training loss: 3.346740961074829


training:  29%|██▉       | 3171/10986 [1:12:26<2:56:43,  1.36s/it]

training loss: 3.3234574794769287


training:  29%|██▉       | 3172/10986 [1:12:28<3:05:40,  1.43s/it]

training loss: 3.472557544708252


training:  29%|██▉       | 3173/10986 [1:12:29<2:59:09,  1.38s/it]

training loss: 3.405919313430786


training:  29%|██▉       | 3174/10986 [1:12:30<2:54:30,  1.34s/it]

training loss: 3.3448851108551025


training:  29%|██▉       | 3175/10986 [1:12:31<2:51:30,  1.32s/it]

training loss: 3.3308844566345215


training:  29%|██▉       | 3176/10986 [1:12:33<2:49:18,  1.30s/it]

training loss: 3.395906925201416


training:  29%|██▉       | 3177/10986 [1:12:34<2:47:55,  1.29s/it]

training loss: 3.2740564346313477


training:  29%|██▉       | 3178/10986 [1:12:35<2:47:18,  1.29s/it]

training loss: 3.467580795288086


training:  29%|██▉       | 3179/10986 [1:12:36<2:46:40,  1.28s/it]

training loss: 3.2852280139923096


training:  29%|██▉       | 3180/10986 [1:12:38<2:45:51,  1.27s/it]

training loss: 3.30910325050354
valid loss: 3.2991981506347656
perplexity: 27.09090805053711


training:  29%|██▉       | 3181/10986 [1:12:40<3:41:08,  1.70s/it]

training loss: 3.4135141372680664


training:  29%|██▉       | 3182/10986 [1:12:42<3:26:07,  1.58s/it]

training loss: 3.2973315715789795


training:  29%|██▉       | 3183/10986 [1:12:43<3:13:48,  1.49s/it]

training loss: 3.446589708328247


training:  29%|██▉       | 3184/10986 [1:12:44<3:05:59,  1.43s/it]

training loss: 3.380037784576416


training:  29%|██▉       | 3185/10986 [1:12:46<2:59:40,  1.38s/it]

training loss: 3.3093650341033936


training:  29%|██▉       | 3186/10986 [1:12:47<2:55:39,  1.35s/it]

training loss: 3.349919080734253


training:  29%|██▉       | 3187/10986 [1:12:48<2:51:55,  1.32s/it]

training loss: 3.267322301864624


training:  29%|██▉       | 3188/10986 [1:12:49<2:50:13,  1.31s/it]

training loss: 3.359663248062134


training:  29%|██▉       | 3189/10986 [1:12:51<2:49:05,  1.30s/it]

training loss: 3.4090330600738525


training:  29%|██▉       | 3190/10986 [1:12:52<2:48:11,  1.29s/it]

training loss: 3.300213575363159


training:  29%|██▉       | 3191/10986 [1:12:53<2:57:11,  1.36s/it]

training loss: 3.3720273971557617


training:  29%|██▉       | 3192/10986 [1:12:55<2:54:13,  1.34s/it]

training loss: 3.387988328933716


training:  29%|██▉       | 3193/10986 [1:12:56<2:51:35,  1.32s/it]

training loss: 3.2864725589752197


training:  29%|██▉       | 3194/10986 [1:12:57<2:49:01,  1.30s/it]

training loss: 3.3204421997070312


training:  29%|██▉       | 3195/10986 [1:12:59<2:46:56,  1.29s/it]

training loss: 3.4408531188964844


training:  29%|██▉       | 3196/10986 [1:13:00<2:46:16,  1.28s/it]

training loss: 3.2937533855438232


training:  29%|██▉       | 3197/10986 [1:13:01<2:52:16,  1.33s/it]

training loss: 3.3850655555725098


training:  29%|██▉       | 3198/10986 [1:13:03<3:08:05,  1.45s/it]

training loss: 3.356686592102051


training:  29%|██▉       | 3199/10986 [1:13:05<3:11:40,  1.48s/it]

training loss: 3.442539930343628


training:  29%|██▉       | 3200/10986 [1:13:06<3:03:09,  1.41s/it]

training loss: 3.355046033859253
valid loss: 3.355794668197632
perplexity: 28.668378829956055


training:  29%|██▉       | 3201/10986 [1:13:08<3:52:25,  1.79s/it]

training loss: 3.359325408935547


training:  29%|██▉       | 3202/10986 [1:13:10<3:46:22,  1.74s/it]

training loss: 3.3378188610076904


training:  29%|██▉       | 3203/10986 [1:13:11<3:28:44,  1.61s/it]

training loss: 3.4261958599090576


training:  29%|██▉       | 3204/10986 [1:13:13<3:15:01,  1.50s/it]

training loss: 3.3294005393981934


training:  29%|██▉       | 3205/10986 [1:13:14<3:06:18,  1.44s/it]

training loss: 3.2680013179779053


training:  29%|██▉       | 3206/10986 [1:13:15<3:00:34,  1.39s/it]

training loss: 3.3442234992980957


training:  29%|██▉       | 3207/10986 [1:13:16<2:55:40,  1.36s/it]

training loss: 3.3527870178222656


training:  29%|██▉       | 3208/10986 [1:13:18<2:51:45,  1.32s/it]

training loss: 3.4079716205596924


training:  29%|██▉       | 3209/10986 [1:13:19<2:49:11,  1.31s/it]

training loss: 3.334428548812866


training:  29%|██▉       | 3210/10986 [1:13:20<2:47:23,  1.29s/it]

training loss: 3.437551259994507


training:  29%|██▉       | 3211/10986 [1:13:22<2:56:28,  1.36s/it]

training loss: 3.3089821338653564


training:  29%|██▉       | 3212/10986 [1:13:23<3:05:49,  1.43s/it]

training loss: 3.4352810382843018


training:  29%|██▉       | 3213/10986 [1:13:25<2:59:14,  1.38s/it]

training loss: 3.3013415336608887


training:  29%|██▉       | 3214/10986 [1:13:26<2:54:56,  1.35s/it]

training loss: 3.320542573928833


training:  29%|██▉       | 3215/10986 [1:13:27<2:51:21,  1.32s/it]

training loss: 3.3775854110717773


training:  29%|██▉       | 3216/10986 [1:13:28<2:49:25,  1.31s/it]

training loss: 3.4007272720336914


training:  29%|██▉       | 3217/10986 [1:13:30<2:48:15,  1.30s/it]

training loss: 3.399535655975342


training:  29%|██▉       | 3218/10986 [1:13:31<2:48:13,  1.30s/it]

training loss: 3.3765242099761963


training:  29%|██▉       | 3219/10986 [1:13:32<2:47:12,  1.29s/it]

training loss: 3.3916726112365723


training:  29%|██▉       | 3220/10986 [1:13:34<2:45:45,  1.28s/it]

training loss: 3.4273295402526855
valid loss: 3.424769401550293
perplexity: 30.715560913085938


training:  29%|██▉       | 3221/10986 [1:13:36<3:40:30,  1.70s/it]

training loss: 3.354384660720825


training:  29%|██▉       | 3222/10986 [1:13:38<3:27:15,  1.60s/it]

training loss: 3.426912546157837


training:  29%|██▉       | 3223/10986 [1:13:39<3:15:05,  1.51s/it]

training loss: 3.562537908554077


training:  29%|██▉       | 3224/10986 [1:13:40<3:06:19,  1.44s/it]

training loss: 3.5389840602874756


training:  29%|██▉       | 3225/10986 [1:13:41<2:59:33,  1.39s/it]

training loss: 3.480233669281006


training:  29%|██▉       | 3226/10986 [1:13:43<2:54:49,  1.35s/it]

training loss: 3.3861966133117676


training:  29%|██▉       | 3227/10986 [1:13:44<2:51:59,  1.33s/it]

training loss: 3.291233539581299


training:  29%|██▉       | 3228/10986 [1:13:45<2:51:23,  1.33s/it]

training loss: 3.3458120822906494


training:  29%|██▉       | 3229/10986 [1:13:47<2:50:00,  1.31s/it]

training loss: 3.322335720062256


training:  29%|██▉       | 3230/10986 [1:13:48<2:47:49,  1.30s/it]

training loss: 3.3575563430786133


training:  29%|██▉       | 3231/10986 [1:13:49<2:57:59,  1.38s/it]

training loss: 3.3016703128814697


training:  29%|██▉       | 3232/10986 [1:13:51<2:55:49,  1.36s/it]

training loss: 3.373579263687134


training:  29%|██▉       | 3233/10986 [1:13:52<2:52:25,  1.33s/it]

training loss: 3.372274398803711


training:  29%|██▉       | 3234/10986 [1:13:53<2:51:41,  1.33s/it]

training loss: 3.2817883491516113


training:  29%|██▉       | 3235/10986 [1:13:55<2:49:35,  1.31s/it]

training loss: 3.40116548538208


training:  29%|██▉       | 3236/10986 [1:13:56<2:48:23,  1.30s/it]

training loss: 3.4797701835632324


training:  29%|██▉       | 3237/10986 [1:13:57<2:47:25,  1.30s/it]

training loss: 3.39862060546875


training:  29%|██▉       | 3238/10986 [1:13:58<2:47:45,  1.30s/it]

training loss: 3.3523967266082764


training:  29%|██▉       | 3239/10986 [1:14:00<2:47:04,  1.29s/it]

training loss: 3.3784022331237793


training:  29%|██▉       | 3240/10986 [1:14:01<2:46:28,  1.29s/it]

training loss: 3.5161781311035156
valid loss: 3.5159196853637695
perplexity: 33.64685821533203


training:  30%|██▉       | 3241/10986 [1:14:04<3:41:57,  1.72s/it]

training loss: 3.4050076007843018


training:  30%|██▉       | 3242/10986 [1:14:05<3:28:21,  1.61s/it]

training loss: 3.3620169162750244


training:  30%|██▉       | 3243/10986 [1:14:06<3:16:05,  1.52s/it]

training loss: 3.3751590251922607


training:  30%|██▉       | 3244/10986 [1:14:08<3:07:33,  1.45s/it]

training loss: 3.3238627910614014


training:  30%|██▉       | 3245/10986 [1:14:09<3:01:21,  1.41s/it]

training loss: 3.3379087448120117


training:  30%|██▉       | 3246/10986 [1:14:10<2:56:32,  1.37s/it]

training loss: 3.338195562362671


training:  30%|██▉       | 3247/10986 [1:14:12<2:54:04,  1.35s/it]

training loss: 3.4374947547912598


training:  30%|██▉       | 3248/10986 [1:14:13<2:52:02,  1.33s/it]

training loss: 3.3216001987457275


training:  30%|██▉       | 3249/10986 [1:14:14<2:50:09,  1.32s/it]

training loss: 3.4916574954986572


training:  30%|██▉       | 3250/10986 [1:14:16<2:50:33,  1.32s/it]

training loss: 3.399905204772949


training:  30%|██▉       | 3251/10986 [1:14:17<2:59:13,  1.39s/it]

training loss: 3.3972933292388916


training:  30%|██▉       | 3252/10986 [1:14:18<2:55:32,  1.36s/it]

training loss: 3.4188637733459473


training:  30%|██▉       | 3253/10986 [1:14:20<2:52:15,  1.34s/it]

training loss: 3.345724105834961


training:  30%|██▉       | 3254/10986 [1:14:21<2:50:07,  1.32s/it]

training loss: 3.2773549556732178


training:  30%|██▉       | 3255/10986 [1:14:22<2:48:39,  1.31s/it]

training loss: 3.3823506832122803


training:  30%|██▉       | 3256/10986 [1:14:24<2:48:08,  1.31s/it]

training loss: 3.292066812515259


training:  30%|██▉       | 3257/10986 [1:14:25<2:47:22,  1.30s/it]

training loss: 3.400101900100708


training:  30%|██▉       | 3258/10986 [1:14:26<2:46:17,  1.29s/it]

training loss: 3.5052490234375


training:  30%|██▉       | 3259/10986 [1:14:27<2:45:42,  1.29s/it]

training loss: 3.428971529006958


training:  30%|██▉       | 3260/10986 [1:14:29<2:45:41,  1.29s/it]

training loss: 3.4840173721313477
valid loss: 3.4994876384735107
perplexity: 33.098487854003906


training:  30%|██▉       | 3261/10986 [1:14:31<3:40:50,  1.72s/it]

training loss: 3.4793334007263184


training:  30%|██▉       | 3262/10986 [1:14:33<3:27:31,  1.61s/it]

training loss: 3.3774917125701904


training:  30%|██▉       | 3263/10986 [1:14:34<3:16:06,  1.52s/it]

training loss: 3.426877975463867


training:  30%|██▉       | 3264/10986 [1:14:35<3:07:23,  1.46s/it]

training loss: 3.3477513790130615


training:  30%|██▉       | 3265/10986 [1:14:37<3:00:38,  1.40s/it]

training loss: 3.3356258869171143


training:  30%|██▉       | 3266/10986 [1:14:38<2:55:59,  1.37s/it]

training loss: 3.436022996902466


training:  30%|██▉       | 3267/10986 [1:14:39<2:53:37,  1.35s/it]

training loss: 3.354776382446289


training:  30%|██▉       | 3268/10986 [1:14:40<2:50:57,  1.33s/it]

training loss: 3.4012911319732666


training:  30%|██▉       | 3269/10986 [1:14:42<2:49:05,  1.31s/it]

training loss: 3.4986941814422607


training:  30%|██▉       | 3270/10986 [1:14:43<2:48:00,  1.31s/it]

training loss: 3.3911561965942383


training:  30%|██▉       | 3271/10986 [1:14:45<2:57:33,  1.38s/it]

training loss: 3.4381320476531982


training:  30%|██▉       | 3272/10986 [1:14:46<2:56:55,  1.38s/it]

training loss: 3.406052350997925


training:  30%|██▉       | 3273/10986 [1:14:47<2:54:07,  1.35s/it]

training loss: 3.604332685470581


training:  30%|██▉       | 3274/10986 [1:14:49<2:51:38,  1.34s/it]

training loss: 3.486509323120117


training:  30%|██▉       | 3275/10986 [1:14:50<2:49:29,  1.32s/it]

training loss: 3.4780306816101074


training:  30%|██▉       | 3276/10986 [1:14:51<2:48:21,  1.31s/it]

training loss: 3.5318362712860107


training:  30%|██▉       | 3277/10986 [1:14:52<2:47:30,  1.30s/it]

training loss: 3.3779666423797607


training:  30%|██▉       | 3278/10986 [1:14:54<2:47:09,  1.30s/it]

training loss: 3.422893524169922


training:  30%|██▉       | 3279/10986 [1:14:55<2:47:31,  1.30s/it]

training loss: 3.5341672897338867


training:  30%|██▉       | 3280/10986 [1:14:56<2:47:05,  1.30s/it]

training loss: 3.409250259399414
valid loss: 3.4051380157470703
perplexity: 30.118453979492188


training:  30%|██▉       | 3281/10986 [1:14:59<3:41:57,  1.73s/it]

training loss: 3.4319283962249756


training:  30%|██▉       | 3282/10986 [1:15:00<3:27:52,  1.62s/it]

training loss: 3.4449024200439453


training:  30%|██▉       | 3283/10986 [1:15:02<3:15:54,  1.53s/it]

training loss: 3.4395267963409424


training:  30%|██▉       | 3284/10986 [1:15:03<3:06:55,  1.46s/it]

training loss: 3.4956440925598145


training:  30%|██▉       | 3285/10986 [1:15:04<3:00:56,  1.41s/it]

training loss: 3.3371424674987793


training:  30%|██▉       | 3286/10986 [1:15:06<2:56:48,  1.38s/it]

training loss: 3.4348886013031006


training:  30%|██▉       | 3287/10986 [1:15:07<2:53:19,  1.35s/it]

training loss: 3.4455666542053223


training:  30%|██▉       | 3288/10986 [1:15:08<2:50:47,  1.33s/it]

training loss: 3.4906442165374756


training:  30%|██▉       | 3289/10986 [1:15:10<2:49:53,  1.32s/it]

training loss: 3.5216426849365234


training:  30%|██▉       | 3290/10986 [1:15:11<2:48:47,  1.32s/it]

training loss: 3.4379801750183105


training:  30%|██▉       | 3291/10986 [1:15:12<2:58:10,  1.39s/it]

training loss: 3.3305559158325195


training:  30%|██▉       | 3292/10986 [1:15:14<2:55:25,  1.37s/it]

training loss: 3.3810224533081055


training:  30%|██▉       | 3293/10986 [1:15:15<2:52:18,  1.34s/it]

training loss: 3.467010498046875


training:  30%|██▉       | 3294/10986 [1:15:16<2:53:06,  1.35s/it]

training loss: 3.4429688453674316


training:  30%|██▉       | 3295/10986 [1:15:18<2:51:29,  1.34s/it]

training loss: 3.4551258087158203


training:  30%|███       | 3296/10986 [1:15:19<2:49:43,  1.32s/it]

training loss: 3.5223300457000732


training:  30%|███       | 3297/10986 [1:15:20<2:48:01,  1.31s/it]

training loss: 3.386592388153076


training:  30%|███       | 3298/10986 [1:15:21<2:46:46,  1.30s/it]

training loss: 3.45158052444458


training:  30%|███       | 3299/10986 [1:15:23<2:46:14,  1.30s/it]

training loss: 3.4835901260375977


training:  30%|███       | 3300/10986 [1:15:24<2:47:06,  1.30s/it]

training loss: 3.4103004932403564
valid loss: 3.409780979156494
perplexity: 30.258617401123047


training:  30%|███       | 3301/10986 [1:15:27<3:41:53,  1.73s/it]

training loss: 3.37549090385437


training:  30%|███       | 3302/10986 [1:15:28<3:39:14,  1.71s/it]

training loss: 3.423600912094116


training:  30%|███       | 3303/10986 [1:15:30<3:24:00,  1.59s/it]

training loss: 3.4584293365478516


training:  30%|███       | 3304/10986 [1:15:31<3:13:13,  1.51s/it]

training loss: 3.3212203979492188


training:  30%|███       | 3305/10986 [1:15:32<3:05:13,  1.45s/it]

training loss: 3.3913779258728027


training:  30%|███       | 3306/10986 [1:15:34<2:59:33,  1.40s/it]

training loss: 3.393887996673584


training:  30%|███       | 3307/10986 [1:15:35<2:55:45,  1.37s/it]

training loss: 3.3885326385498047


training:  30%|███       | 3308/10986 [1:15:36<2:52:46,  1.35s/it]

training loss: 3.34312105178833


training:  30%|███       | 3309/10986 [1:15:38<2:51:15,  1.34s/it]

training loss: 3.4091265201568604


training:  30%|███       | 3310/10986 [1:15:39<2:49:38,  1.33s/it]

training loss: 3.4653618335723877


training:  30%|███       | 3311/10986 [1:15:41<2:58:54,  1.40s/it]

training loss: 3.5189616680145264


training:  30%|███       | 3312/10986 [1:15:42<3:07:42,  1.47s/it]

training loss: 3.3599507808685303


training:  30%|███       | 3313/10986 [1:15:43<3:00:49,  1.41s/it]

training loss: 3.4517276287078857


training:  30%|███       | 3314/10986 [1:15:45<2:55:55,  1.38s/it]

training loss: 3.607248306274414


training:  30%|███       | 3315/10986 [1:15:46<2:53:56,  1.36s/it]

training loss: 3.3675825595855713


training:  30%|███       | 3316/10986 [1:15:47<2:51:12,  1.34s/it]

training loss: 3.4417386054992676


training:  30%|███       | 3317/10986 [1:15:49<2:49:13,  1.32s/it]

training loss: 3.3532090187072754


training:  30%|███       | 3318/10986 [1:15:50<2:47:20,  1.31s/it]

training loss: 3.429985523223877


training:  30%|███       | 3319/10986 [1:15:51<2:46:06,  1.30s/it]

training loss: 3.5642333030700684


training:  30%|███       | 3320/10986 [1:15:52<2:46:31,  1.30s/it]

training loss: 3.4992895126342773
valid loss: 3.4994149208068848
perplexity: 33.0960807800293


training:  30%|███       | 3321/10986 [1:15:55<3:42:21,  1.74s/it]

training loss: 3.3398001194000244


training:  30%|███       | 3322/10986 [1:15:57<3:29:04,  1.64s/it]

training loss: 3.520082950592041


training:  30%|███       | 3323/10986 [1:15:58<3:16:19,  1.54s/it]

training loss: 3.3123323917388916


training:  30%|███       | 3324/10986 [1:15:59<3:07:48,  1.47s/it]

training loss: 3.3665568828582764


training:  30%|███       | 3325/10986 [1:16:01<3:00:47,  1.42s/it]

training loss: 3.5352795124053955


training:  30%|███       | 3326/10986 [1:16:02<2:56:21,  1.38s/it]

training loss: 3.424152135848999


training:  30%|███       | 3327/10986 [1:16:03<2:52:08,  1.35s/it]

training loss: 3.4006175994873047


training:  30%|███       | 3328/10986 [1:16:04<2:50:19,  1.33s/it]

training loss: 3.3989973068237305


training:  30%|███       | 3329/10986 [1:16:06<2:48:24,  1.32s/it]

training loss: 3.3333754539489746


training:  30%|███       | 3330/10986 [1:16:07<2:47:27,  1.31s/it]

training loss: 3.386279582977295


training:  30%|███       | 3331/10986 [1:16:09<2:55:35,  1.38s/it]

training loss: 3.3413846492767334


training:  30%|███       | 3332/10986 [1:16:10<2:53:57,  1.36s/it]

training loss: 3.396077871322632


training:  30%|███       | 3333/10986 [1:16:11<2:50:25,  1.34s/it]

training loss: 3.351015329360962


training:  30%|███       | 3334/10986 [1:16:12<2:48:47,  1.32s/it]

training loss: 3.3302621841430664


training:  30%|███       | 3335/10986 [1:16:14<2:47:13,  1.31s/it]

training loss: 3.3302526473999023


training:  30%|███       | 3336/10986 [1:16:15<2:45:19,  1.30s/it]

training loss: 3.4172818660736084


training:  30%|███       | 3337/10986 [1:16:16<2:44:32,  1.29s/it]

training loss: 3.4335150718688965


training:  30%|███       | 3338/10986 [1:16:18<2:44:19,  1.29s/it]

training loss: 3.3534224033355713


training:  30%|███       | 3339/10986 [1:16:19<2:44:32,  1.29s/it]

training loss: 3.2768607139587402


training:  30%|███       | 3340/10986 [1:16:20<2:45:11,  1.30s/it]

training loss: 3.4397530555725098
valid loss: 3.441676139831543
perplexity: 31.239276885986328


training:  30%|███       | 3341/10986 [1:16:23<3:39:41,  1.72s/it]

training loss: 3.3870325088500977


training:  30%|███       | 3342/10986 [1:16:24<3:28:22,  1.64s/it]

training loss: 3.343543291091919


training:  30%|███       | 3343/10986 [1:16:26<3:15:10,  1.53s/it]

training loss: 3.3021178245544434


training:  30%|███       | 3344/10986 [1:16:27<3:05:41,  1.46s/it]

training loss: 3.297194004058838


training:  30%|███       | 3345/10986 [1:16:28<3:01:32,  1.43s/it]

training loss: 3.4326934814453125


training:  30%|███       | 3346/10986 [1:16:30<3:10:28,  1.50s/it]

training loss: 3.458880662918091


training:  30%|███       | 3347/10986 [1:16:32<3:18:26,  1.56s/it]

training loss: 3.452235221862793


training:  30%|███       | 3348/10986 [1:16:33<3:07:04,  1.47s/it]

training loss: 3.4398279190063477


training:  30%|███       | 3349/10986 [1:16:34<2:59:11,  1.41s/it]

training loss: 3.391530752182007


training:  30%|███       | 3350/10986 [1:16:35<2:54:12,  1.37s/it]

training loss: 3.278989791870117


training:  31%|███       | 3351/10986 [1:16:37<3:00:38,  1.42s/it]

training loss: 3.344996690750122


training:  31%|███       | 3352/10986 [1:16:38<2:55:23,  1.38s/it]

training loss: 3.332676649093628


training:  31%|███       | 3353/10986 [1:16:39<2:51:49,  1.35s/it]

training loss: 3.411036252975464


training:  31%|███       | 3354/10986 [1:16:41<2:48:42,  1.33s/it]

training loss: 3.2669482231140137


training:  31%|███       | 3355/10986 [1:16:42<2:46:41,  1.31s/it]

training loss: 3.3493144512176514


training:  31%|███       | 3356/10986 [1:16:43<2:44:57,  1.30s/it]

training loss: 3.317103385925293


training:  31%|███       | 3357/10986 [1:16:45<2:43:19,  1.28s/it]

training loss: 3.371946334838867


training:  31%|███       | 3358/10986 [1:16:46<2:42:35,  1.28s/it]

training loss: 3.4774773120880127


training:  31%|███       | 3359/10986 [1:16:47<2:41:52,  1.27s/it]

training loss: 3.340017318725586


training:  31%|███       | 3360/10986 [1:16:48<2:41:38,  1.27s/it]

training loss: 3.422445297241211
valid loss: 3.422536849975586
perplexity: 30.647064208984375


training:  31%|███       | 3361/10986 [1:16:51<3:35:05,  1.69s/it]

training loss: 3.342374801635742


training:  31%|███       | 3362/10986 [1:16:52<3:20:47,  1.58s/it]

training loss: 3.434027910232544


training:  31%|███       | 3363/10986 [1:16:54<3:08:56,  1.49s/it]

training loss: 3.4055991172790527


training:  31%|███       | 3364/10986 [1:16:55<3:00:54,  1.42s/it]

training loss: 3.380389451980591


training:  31%|███       | 3365/10986 [1:16:56<2:54:40,  1.38s/it]

training loss: 3.2651784420013428


training:  31%|███       | 3366/10986 [1:16:57<2:49:55,  1.34s/it]

training loss: 3.2694170475006104


training:  31%|███       | 3367/10986 [1:16:59<2:46:43,  1.31s/it]

training loss: 3.2937891483306885


training:  31%|███       | 3368/10986 [1:17:00<2:44:57,  1.30s/it]

training loss: 3.4576916694641113


training:  31%|███       | 3369/10986 [1:17:01<2:43:01,  1.28s/it]

training loss: 3.2141103744506836


training:  31%|███       | 3370/10986 [1:17:02<2:42:49,  1.28s/it]

training loss: 3.38254714012146


training:  31%|███       | 3371/10986 [1:17:04<2:52:45,  1.36s/it]

training loss: 3.38291335105896


training:  31%|███       | 3372/10986 [1:17:05<2:49:11,  1.33s/it]

training loss: 3.3995535373687744


training:  31%|███       | 3373/10986 [1:17:07<2:46:21,  1.31s/it]

training loss: 3.526686191558838


training:  31%|███       | 3374/10986 [1:17:08<2:43:57,  1.29s/it]

training loss: 3.358323097229004


training:  31%|███       | 3375/10986 [1:17:09<2:43:08,  1.29s/it]

training loss: 3.399716377258301


training:  31%|███       | 3376/10986 [1:17:10<2:41:44,  1.28s/it]

training loss: 3.402686595916748


training:  31%|███       | 3377/10986 [1:17:12<2:41:52,  1.28s/it]

training loss: 3.2999255657196045


training:  31%|███       | 3378/10986 [1:17:13<2:40:54,  1.27s/it]

training loss: 3.354983329772949


training:  31%|███       | 3379/10986 [1:17:14<2:41:09,  1.27s/it]

training loss: 3.4230947494506836


training:  31%|███       | 3380/10986 [1:17:15<2:40:45,  1.27s/it]

training loss: 3.2890336513519287
valid loss: 3.282168388366699
perplexity: 26.633460998535156


training:  31%|███       | 3381/10986 [1:17:18<3:33:45,  1.69s/it]

training loss: 3.3685693740844727


training:  31%|███       | 3382/10986 [1:17:19<3:22:34,  1.60s/it]

training loss: 3.404834747314453


training:  31%|███       | 3383/10986 [1:17:21<3:11:15,  1.51s/it]

training loss: 3.3488998413085938


training:  31%|███       | 3384/10986 [1:17:22<3:01:33,  1.43s/it]

training loss: 3.4881057739257812


training:  31%|███       | 3385/10986 [1:17:23<2:54:58,  1.38s/it]

training loss: 3.42946457862854


training:  31%|███       | 3386/10986 [1:17:25<2:50:59,  1.35s/it]

training loss: 3.37849497795105


training:  31%|███       | 3387/10986 [1:17:26<2:48:05,  1.33s/it]

training loss: 3.3145792484283447


training:  31%|███       | 3388/10986 [1:17:27<2:46:03,  1.31s/it]

training loss: 3.391623020172119


training:  31%|███       | 3389/10986 [1:17:28<2:44:12,  1.30s/it]

training loss: 3.3870389461517334


training:  31%|███       | 3390/10986 [1:17:30<2:42:48,  1.29s/it]

training loss: 3.3059191703796387


training:  31%|███       | 3391/10986 [1:17:31<2:52:33,  1.36s/it]

training loss: 3.315073251724243


training:  31%|███       | 3392/10986 [1:17:32<2:49:33,  1.34s/it]

training loss: 3.51570200920105


training:  31%|███       | 3393/10986 [1:17:34<2:46:18,  1.31s/it]

training loss: 3.416292905807495


training:  31%|███       | 3394/10986 [1:17:35<2:44:16,  1.30s/it]

training loss: 3.316298246383667


training:  31%|███       | 3395/10986 [1:17:36<2:43:13,  1.29s/it]

training loss: 3.3455007076263428


training:  31%|███       | 3396/10986 [1:17:37<2:42:02,  1.28s/it]

training loss: 3.333918809890747


training:  31%|███       | 3397/10986 [1:17:39<2:40:49,  1.27s/it]

training loss: 3.4164326190948486


training:  31%|███       | 3398/10986 [1:17:40<2:40:46,  1.27s/it]

training loss: 3.387944459915161


training:  31%|███       | 3399/10986 [1:17:41<2:39:46,  1.26s/it]

training loss: 3.350034713745117


training:  31%|███       | 3400/10986 [1:17:42<2:39:52,  1.26s/it]

training loss: 3.2872135639190674
valid loss: 3.2859554290771484
perplexity: 26.734516143798828


training:  31%|███       | 3401/10986 [1:17:45<3:35:34,  1.71s/it]

training loss: 3.290409803390503


training:  31%|███       | 3402/10986 [1:17:47<3:23:18,  1.61s/it]

training loss: 3.3229808807373047


training:  31%|███       | 3403/10986 [1:17:48<3:10:47,  1.51s/it]

training loss: 3.411304473876953


training:  31%|███       | 3404/10986 [1:17:49<3:02:33,  1.44s/it]

training loss: 3.410081624984741


training:  31%|███       | 3405/10986 [1:17:51<2:58:24,  1.41s/it]

training loss: 3.501835584640503


training:  31%|███       | 3406/10986 [1:17:52<2:52:42,  1.37s/it]

training loss: 3.322065591812134


training:  31%|███       | 3407/10986 [1:17:53<2:48:26,  1.33s/it]

training loss: 3.3420188426971436


training:  31%|███       | 3408/10986 [1:17:54<2:46:18,  1.32s/it]

training loss: 3.3407747745513916


training:  31%|███       | 3409/10986 [1:17:56<2:44:31,  1.30s/it]

training loss: 3.268401861190796


training:  31%|███       | 3410/10986 [1:17:57<2:42:53,  1.29s/it]

training loss: 3.3435235023498535


training:  31%|███       | 3411/10986 [1:17:58<2:52:52,  1.37s/it]

training loss: 3.3280179500579834


training:  31%|███       | 3412/10986 [1:18:00<2:50:02,  1.35s/it]

training loss: 3.34964656829834


training:  31%|███       | 3413/10986 [1:18:01<2:46:41,  1.32s/it]

training loss: 3.3556504249572754


training:  31%|███       | 3414/10986 [1:18:02<2:44:56,  1.31s/it]

training loss: 3.4030187129974365


training:  31%|███       | 3415/10986 [1:18:03<2:43:57,  1.30s/it]

training loss: 3.335071325302124


training:  31%|███       | 3416/10986 [1:18:05<2:42:19,  1.29s/it]

training loss: 3.4303526878356934


training:  31%|███       | 3417/10986 [1:18:06<2:41:22,  1.28s/it]

training loss: 3.3913121223449707


training:  31%|███       | 3418/10986 [1:18:07<2:41:17,  1.28s/it]

training loss: 3.268150568008423


training:  31%|███       | 3419/10986 [1:18:09<2:42:11,  1.29s/it]

training loss: 3.3024516105651855


training:  31%|███       | 3420/10986 [1:18:10<2:42:03,  1.29s/it]

training loss: 3.327592611312866
valid loss: 3.3206887245178223
perplexity: 27.679407119750977


training:  31%|███       | 3421/10986 [1:18:13<3:37:48,  1.73s/it]

training loss: 3.2992324829101562


training:  31%|███       | 3422/10986 [1:18:14<3:22:37,  1.61s/it]

training loss: 3.4621620178222656


training:  31%|███       | 3423/10986 [1:18:15<3:11:04,  1.52s/it]

training loss: 3.47082781791687


training:  31%|███       | 3424/10986 [1:18:17<3:01:57,  1.44s/it]

training loss: 3.3702175617218018


training:  31%|███       | 3425/10986 [1:18:18<2:56:00,  1.40s/it]

training loss: 3.264214277267456


training:  31%|███       | 3426/10986 [1:18:19<2:50:25,  1.35s/it]

training loss: 3.3042218685150146


training:  31%|███       | 3427/10986 [1:18:20<2:49:13,  1.34s/it]

training loss: 3.3268864154815674


training:  31%|███       | 3428/10986 [1:18:22<2:45:52,  1.32s/it]

training loss: 3.4155783653259277


training:  31%|███       | 3429/10986 [1:18:23<2:43:19,  1.30s/it]

training loss: 3.339932441711426


training:  31%|███       | 3430/10986 [1:18:24<2:41:37,  1.28s/it]

training loss: 3.287153482437134


training:  31%|███       | 3431/10986 [1:18:26<2:52:14,  1.37s/it]

training loss: 3.259782314300537


training:  31%|███       | 3432/10986 [1:18:27<2:48:22,  1.34s/it]

training loss: 3.3354413509368896


training:  31%|███       | 3433/10986 [1:18:28<2:44:39,  1.31s/it]

training loss: 3.3259389400482178


training:  31%|███▏      | 3434/10986 [1:18:29<2:43:05,  1.30s/it]

training loss: 3.362639904022217


training:  31%|███▏      | 3435/10986 [1:18:31<2:41:26,  1.28s/it]

training loss: 3.2615699768066406


training:  31%|███▏      | 3436/10986 [1:18:32<2:40:01,  1.27s/it]

training loss: 3.2797181606292725


training:  31%|███▏      | 3437/10986 [1:18:33<2:38:55,  1.26s/it]

training loss: 3.370577096939087


training:  31%|███▏      | 3438/10986 [1:18:34<2:38:41,  1.26s/it]

training loss: 3.317499876022339


training:  31%|███▏      | 3439/10986 [1:18:36<2:38:40,  1.26s/it]

training loss: 3.3867220878601074


training:  31%|███▏      | 3440/10986 [1:18:37<2:38:24,  1.26s/it]

training loss: 3.32948637008667
valid loss: 3.3264193534851074
perplexity: 27.838485717773438


training:  31%|███▏      | 3441/10986 [1:18:40<3:32:05,  1.69s/it]

training loss: 3.3389337062835693


training:  31%|███▏      | 3442/10986 [1:18:41<3:19:08,  1.58s/it]

training loss: 3.30541729927063


training:  31%|███▏      | 3443/10986 [1:18:42<3:06:41,  1.49s/it]

training loss: 3.307837963104248


training:  31%|███▏      | 3444/10986 [1:18:44<2:57:51,  1.41s/it]

training loss: 3.499715805053711


training:  31%|███▏      | 3445/10986 [1:18:45<2:51:10,  1.36s/it]

training loss: 3.369479179382324


training:  31%|███▏      | 3446/10986 [1:18:46<2:47:04,  1.33s/it]

training loss: 3.457345962524414


training:  31%|███▏      | 3447/10986 [1:18:47<2:44:03,  1.31s/it]

training loss: 3.352766990661621


training:  31%|███▏      | 3448/10986 [1:18:49<2:41:20,  1.28s/it]

training loss: 3.3794374465942383


training:  31%|███▏      | 3449/10986 [1:18:50<2:40:10,  1.28s/it]

training loss: 3.369887590408325


training:  31%|███▏      | 3450/10986 [1:18:51<2:40:09,  1.28s/it]

training loss: 3.3470981121063232


training:  31%|███▏      | 3451/10986 [1:18:53<2:49:38,  1.35s/it]

training loss: 3.4022672176361084


training:  31%|███▏      | 3452/10986 [1:18:54<2:46:40,  1.33s/it]

training loss: 3.3744523525238037


training:  31%|███▏      | 3453/10986 [1:18:55<2:44:49,  1.31s/it]

training loss: 3.393322706222534


training:  31%|███▏      | 3454/10986 [1:18:56<2:42:56,  1.30s/it]

training loss: 3.420091152191162


training:  31%|███▏      | 3455/10986 [1:18:58<2:41:31,  1.29s/it]

training loss: 3.4182372093200684


training:  31%|███▏      | 3456/10986 [1:18:59<2:39:53,  1.27s/it]

training loss: 3.342865228652954


training:  31%|███▏      | 3457/10986 [1:19:00<2:39:41,  1.27s/it]

training loss: 3.4708573818206787


training:  31%|███▏      | 3458/10986 [1:19:01<2:39:56,  1.27s/it]

training loss: 3.4330906867980957


training:  31%|███▏      | 3459/10986 [1:19:03<2:39:53,  1.27s/it]

training loss: 3.3921704292297363


training:  31%|███▏      | 3460/10986 [1:19:04<2:40:32,  1.28s/it]

training loss: 3.289158821105957
valid loss: 3.288149118423462
perplexity: 26.79322624206543


training:  32%|███▏      | 3461/10986 [1:19:07<3:33:31,  1.70s/it]

training loss: 3.359325647354126


training:  32%|███▏      | 3462/10986 [1:19:08<3:20:08,  1.60s/it]

training loss: 3.370633125305176


training:  32%|███▏      | 3463/10986 [1:19:09<3:08:23,  1.50s/it]

training loss: 3.4012057781219482


training:  32%|███▏      | 3464/10986 [1:19:11<3:00:01,  1.44s/it]

training loss: 3.4790892601013184


training:  32%|███▏      | 3465/10986 [1:19:12<2:54:24,  1.39s/it]

training loss: 3.346616268157959


training:  32%|███▏      | 3466/10986 [1:19:13<2:49:10,  1.35s/it]

training loss: 3.267345666885376


training:  32%|███▏      | 3467/10986 [1:19:14<2:45:50,  1.32s/it]

training loss: 3.401167631149292


training:  32%|███▏      | 3468/10986 [1:19:16<2:43:41,  1.31s/it]

training loss: 3.29099440574646


training:  32%|███▏      | 3469/10986 [1:19:17<2:42:22,  1.30s/it]

training loss: 3.297807216644287


training:  32%|███▏      | 3470/10986 [1:19:18<2:41:05,  1.29s/it]

training loss: 3.4160728454589844


training:  32%|███▏      | 3471/10986 [1:19:20<2:50:43,  1.36s/it]

training loss: 3.40224027633667


training:  32%|███▏      | 3472/10986 [1:19:21<2:49:50,  1.36s/it]

training loss: 3.317700147628784


training:  32%|███▏      | 3473/10986 [1:19:22<2:46:33,  1.33s/it]

training loss: 3.3299143314361572


training:  32%|███▏      | 3474/10986 [1:19:24<2:43:52,  1.31s/it]

training loss: 3.3403818607330322


training:  32%|███▏      | 3475/10986 [1:19:25<2:42:32,  1.30s/it]

training loss: 3.3094301223754883


training:  32%|███▏      | 3476/10986 [1:19:26<2:42:18,  1.30s/it]

training loss: 3.3515613079071045


training:  32%|███▏      | 3477/10986 [1:19:27<2:41:51,  1.29s/it]

training loss: 3.251610517501831


training:  32%|███▏      | 3478/10986 [1:19:29<2:40:35,  1.28s/it]

training loss: 3.524566173553467


training:  32%|███▏      | 3479/10986 [1:19:30<2:40:11,  1.28s/it]

training loss: 3.424210786819458


training:  32%|███▏      | 3480/10986 [1:19:31<2:39:59,  1.28s/it]

training loss: 3.4494569301605225
valid loss: 3.442847728729248
perplexity: 31.275896072387695


training:  32%|███▏      | 3481/10986 [1:19:34<3:35:28,  1.72s/it]

training loss: 3.417360305786133


training:  32%|███▏      | 3482/10986 [1:19:35<3:21:57,  1.61s/it]

training loss: 3.4438610076904297


training:  32%|███▏      | 3483/10986 [1:19:37<3:08:49,  1.51s/it]

training loss: 3.398419141769409


training:  32%|███▏      | 3484/10986 [1:19:38<3:00:18,  1.44s/it]

training loss: 3.3074042797088623


training:  32%|███▏      | 3485/10986 [1:19:39<2:54:26,  1.40s/it]

training loss: 3.4613940715789795


training:  32%|███▏      | 3486/10986 [1:19:41<2:49:32,  1.36s/it]

training loss: 3.490931510925293


training:  32%|███▏      | 3487/10986 [1:19:42<2:45:58,  1.33s/it]

training loss: 3.365149974822998


training:  32%|███▏      | 3488/10986 [1:19:43<2:44:31,  1.32s/it]

training loss: 3.4632203578948975


training:  32%|███▏      | 3489/10986 [1:19:44<2:42:51,  1.30s/it]

training loss: 3.368858575820923


training:  32%|███▏      | 3490/10986 [1:19:46<2:41:53,  1.30s/it]

training loss: 3.289945602416992


training:  32%|███▏      | 3491/10986 [1:19:47<2:50:42,  1.37s/it]

training loss: 3.407414674758911


training:  32%|███▏      | 3492/10986 [1:19:48<2:47:32,  1.34s/it]

training loss: 3.42557954788208


training:  32%|███▏      | 3493/10986 [1:19:50<2:44:46,  1.32s/it]

training loss: 3.412036657333374


training:  32%|███▏      | 3494/10986 [1:19:51<2:44:03,  1.31s/it]

training loss: 3.3846473693847656


training:  32%|███▏      | 3495/10986 [1:19:52<2:44:09,  1.31s/it]

training loss: 3.428079843521118


training:  32%|███▏      | 3496/10986 [1:19:54<2:42:32,  1.30s/it]

training loss: 3.479851722717285


training:  32%|███▏      | 3497/10986 [1:19:55<2:49:45,  1.36s/it]

training loss: 3.3526062965393066


training:  32%|███▏      | 3498/10986 [1:19:57<3:01:17,  1.45s/it]

training loss: 3.5027153491973877


training:  32%|███▏      | 3499/10986 [1:19:58<3:03:21,  1.47s/it]

training loss: 3.428403854370117


training:  32%|███▏      | 3500/10986 [1:20:00<2:56:19,  1.41s/it]

training loss: 3.345228672027588
valid loss: 3.3401291370391846
perplexity: 28.22277069091797


training:  32%|███▏      | 3501/10986 [1:20:02<3:45:05,  1.80s/it]

training loss: 3.340859889984131


training:  32%|███▏      | 3502/10986 [1:20:04<3:28:51,  1.67s/it]

training loss: 3.402742385864258


training:  32%|███▏      | 3503/10986 [1:20:05<3:14:58,  1.56s/it]

training loss: 3.459784746170044


training:  32%|███▏      | 3504/10986 [1:20:06<3:04:31,  1.48s/it]

training loss: 3.3781003952026367


training:  32%|███▏      | 3505/10986 [1:20:08<2:57:39,  1.42s/it]

training loss: 3.353667974472046


training:  32%|███▏      | 3506/10986 [1:20:09<2:52:01,  1.38s/it]

training loss: 3.340372323989868


training:  32%|███▏      | 3507/10986 [1:20:10<2:48:12,  1.35s/it]

training loss: 3.3993146419525146


training:  32%|███▏      | 3508/10986 [1:20:11<2:45:43,  1.33s/it]

training loss: 3.48929500579834


training:  32%|███▏      | 3509/10986 [1:20:13<2:44:30,  1.32s/it]

training loss: 3.3664183616638184


training:  32%|███▏      | 3510/10986 [1:20:14<2:43:24,  1.31s/it]

training loss: 3.437249183654785


training:  32%|███▏      | 3511/10986 [1:20:15<2:52:01,  1.38s/it]

training loss: 3.320997476577759


training:  32%|███▏      | 3512/10986 [1:20:17<2:57:54,  1.43s/it]

training loss: 3.3769500255584717


training:  32%|███▏      | 3513/10986 [1:20:18<2:52:06,  1.38s/it]

training loss: 3.435389995574951


training:  32%|███▏      | 3514/10986 [1:20:20<2:48:13,  1.35s/it]

training loss: 3.491783618927002


training:  32%|███▏      | 3515/10986 [1:20:21<2:45:54,  1.33s/it]

training loss: 3.4072799682617188


training:  32%|███▏      | 3516/10986 [1:20:22<2:45:55,  1.33s/it]

training loss: 3.3781697750091553


training:  32%|███▏      | 3517/10986 [1:20:23<2:43:44,  1.32s/it]

training loss: 3.3744890689849854


training:  32%|███▏      | 3518/10986 [1:20:25<2:42:16,  1.30s/it]

training loss: 3.2745282649993896


training:  32%|███▏      | 3519/10986 [1:20:26<2:41:03,  1.29s/it]

training loss: 3.34342622756958


training:  32%|███▏      | 3520/10986 [1:20:27<2:40:02,  1.29s/it]

training loss: 3.389728307723999
valid loss: 3.3841464519500732
perplexity: 29.492809295654297


training:  32%|███▏      | 3521/10986 [1:20:30<3:32:56,  1.71s/it]

training loss: 3.2646777629852295


training:  32%|███▏      | 3522/10986 [1:20:31<3:20:42,  1.61s/it]

training loss: 3.552548885345459


training:  32%|███▏      | 3523/10986 [1:20:33<3:08:09,  1.51s/it]

training loss: 3.406609535217285


training:  32%|███▏      | 3524/10986 [1:20:34<2:58:33,  1.44s/it]

training loss: 3.369202136993408


training:  32%|███▏      | 3525/10986 [1:20:35<2:51:43,  1.38s/it]

training loss: 3.6125876903533936


training:  32%|███▏      | 3526/10986 [1:20:36<2:46:53,  1.34s/it]

training loss: 3.3679206371307373


training:  32%|███▏      | 3527/10986 [1:20:38<2:43:37,  1.32s/it]

training loss: 3.481206178665161


training:  32%|███▏      | 3528/10986 [1:20:39<2:40:56,  1.29s/it]

training loss: 3.4458365440368652


training:  32%|███▏      | 3529/10986 [1:20:40<2:40:12,  1.29s/it]

training loss: 3.4301416873931885


training:  32%|███▏      | 3530/10986 [1:20:41<2:39:02,  1.28s/it]

training loss: 3.4164528846740723


training:  32%|███▏      | 3531/10986 [1:20:43<2:48:27,  1.36s/it]

training loss: 3.409294843673706


training:  32%|███▏      | 3532/10986 [1:20:44<2:46:01,  1.34s/it]

training loss: 3.391472101211548


training:  32%|███▏      | 3533/10986 [1:20:46<2:43:17,  1.31s/it]

training loss: 3.3931195735931396


training:  32%|███▏      | 3534/10986 [1:20:47<2:40:24,  1.29s/it]

training loss: 3.32240629196167


training:  32%|███▏      | 3535/10986 [1:20:48<2:39:42,  1.29s/it]

training loss: 3.3719322681427


training:  32%|███▏      | 3536/10986 [1:20:49<2:38:54,  1.28s/it]

training loss: 3.380023241043091


training:  32%|███▏      | 3537/10986 [1:20:51<2:38:52,  1.28s/it]

training loss: 3.372694969177246


training:  32%|███▏      | 3538/10986 [1:20:52<2:38:57,  1.28s/it]

training loss: 3.24884295463562


training:  32%|███▏      | 3539/10986 [1:20:53<2:39:14,  1.28s/it]

training loss: 3.428706645965576


training:  32%|███▏      | 3540/10986 [1:20:54<2:39:04,  1.28s/it]

training loss: 3.3985860347747803
valid loss: 3.395784616470337
perplexity: 29.838054656982422


training:  32%|███▏      | 3541/10986 [1:20:57<3:31:32,  1.70s/it]

training loss: 3.3788681030273438


training:  32%|███▏      | 3542/10986 [1:20:58<3:18:01,  1.60s/it]

training loss: 3.4042575359344482


training:  32%|███▏      | 3543/10986 [1:21:00<3:06:01,  1.50s/it]

training loss: 3.433164358139038


training:  32%|███▏      | 3544/10986 [1:21:01<2:56:54,  1.43s/it]

training loss: 3.284940004348755


training:  32%|███▏      | 3545/10986 [1:21:02<2:51:59,  1.39s/it]

training loss: 3.395561695098877


training:  32%|███▏      | 3546/10986 [1:21:04<2:48:04,  1.36s/it]

training loss: 3.3064076900482178


training:  32%|███▏      | 3547/10986 [1:21:05<2:45:01,  1.33s/it]

training loss: 3.387570858001709


training:  32%|███▏      | 3548/10986 [1:21:06<2:42:27,  1.31s/it]

training loss: 3.3769609928131104


training:  32%|███▏      | 3549/10986 [1:21:07<2:40:40,  1.30s/it]

training loss: 3.4969608783721924


training:  32%|███▏      | 3550/10986 [1:21:09<2:39:42,  1.29s/it]

training loss: 3.5454182624816895


training:  32%|███▏      | 3551/10986 [1:21:10<2:47:48,  1.35s/it]

training loss: 3.340398073196411


training:  32%|███▏      | 3552/10986 [1:21:11<2:45:07,  1.33s/it]

training loss: 3.415081262588501


training:  32%|███▏      | 3553/10986 [1:21:13<2:43:23,  1.32s/it]

training loss: 3.4729151725769043


training:  32%|███▏      | 3554/10986 [1:21:14<2:41:08,  1.30s/it]

training loss: 3.4398434162139893


training:  32%|███▏      | 3555/10986 [1:21:15<2:39:44,  1.29s/it]

training loss: 3.4334869384765625


training:  32%|███▏      | 3556/10986 [1:21:17<2:38:33,  1.28s/it]

training loss: 3.4377894401550293


training:  32%|███▏      | 3557/10986 [1:21:18<2:38:26,  1.28s/it]

training loss: 3.4154152870178223


training:  32%|███▏      | 3558/10986 [1:21:19<2:38:03,  1.28s/it]

training loss: 3.4459431171417236


training:  32%|███▏      | 3559/10986 [1:21:20<2:37:35,  1.27s/it]

training loss: 3.327155590057373


training:  32%|███▏      | 3560/10986 [1:21:22<2:36:47,  1.27s/it]

training loss: 3.4466331005096436
valid loss: 3.4484970569610596
perplexity: 31.45308494567871


training:  32%|███▏      | 3561/10986 [1:21:24<3:32:58,  1.72s/it]

training loss: 3.501802921295166


training:  32%|███▏      | 3562/10986 [1:21:26<3:29:19,  1.69s/it]

training loss: 3.3979156017303467


training:  32%|███▏      | 3563/10986 [1:21:27<3:13:39,  1.57s/it]

training loss: 3.4134271144866943


training:  32%|███▏      | 3564/10986 [1:21:29<3:03:42,  1.49s/it]

training loss: 3.4379138946533203


training:  32%|███▏      | 3565/10986 [1:21:30<2:55:27,  1.42s/it]

training loss: 3.2907416820526123


training:  32%|███▏      | 3566/10986 [1:21:31<2:50:18,  1.38s/it]

training loss: 3.3555030822753906


training:  32%|███▏      | 3567/10986 [1:21:32<2:46:39,  1.35s/it]

training loss: 3.316563606262207


training:  32%|███▏      | 3568/10986 [1:21:34<2:43:55,  1.33s/it]

training loss: 3.3955535888671875


training:  32%|███▏      | 3569/10986 [1:21:35<2:43:01,  1.32s/it]

training loss: 3.4713809490203857


training:  32%|███▏      | 3570/10986 [1:21:36<2:41:16,  1.30s/it]

training loss: 3.4914956092834473


training:  33%|███▎      | 3571/10986 [1:21:38<2:50:03,  1.38s/it]

training loss: 3.452697277069092


training:  33%|███▎      | 3572/10986 [1:21:39<2:48:14,  1.36s/it]

training loss: 3.368157148361206


training:  33%|███▎      | 3573/10986 [1:21:40<2:45:00,  1.34s/it]

training loss: 3.5422306060791016


training:  33%|███▎      | 3574/10986 [1:21:42<2:43:23,  1.32s/it]

training loss: 3.349816083908081


training:  33%|███▎      | 3575/10986 [1:21:43<2:41:56,  1.31s/it]

training loss: 3.4320473670959473


training:  33%|███▎      | 3576/10986 [1:21:44<2:41:16,  1.31s/it]

training loss: 3.446676731109619


training:  33%|███▎      | 3577/10986 [1:21:46<2:42:41,  1.32s/it]

training loss: 3.3924567699432373


training:  33%|███▎      | 3578/10986 [1:21:47<2:41:59,  1.31s/it]

training loss: 3.3740692138671875


training:  33%|███▎      | 3579/10986 [1:21:48<2:40:49,  1.30s/it]

training loss: 3.458559274673462


training:  33%|███▎      | 3580/10986 [1:21:49<2:39:50,  1.29s/it]

training loss: 3.4058034420013428
valid loss: 3.3963260650634766
perplexity: 29.854215621948242


training:  33%|███▎      | 3581/10986 [1:21:52<3:32:48,  1.72s/it]

training loss: 3.387636184692383


training:  33%|███▎      | 3582/10986 [1:21:54<3:30:28,  1.71s/it]

training loss: 3.384732723236084


training:  33%|███▎      | 3583/10986 [1:21:55<3:15:24,  1.58s/it]

training loss: 3.358186721801758


training:  33%|███▎      | 3584/10986 [1:21:56<3:03:43,  1.49s/it]

training loss: 3.466583013534546


training:  33%|███▎      | 3585/10986 [1:21:58<2:55:41,  1.42s/it]

training loss: 3.4099440574645996


training:  33%|███▎      | 3586/10986 [1:21:59<2:50:18,  1.38s/it]

training loss: 3.324505567550659


training:  33%|███▎      | 3587/10986 [1:22:00<2:47:13,  1.36s/it]

training loss: 3.396031141281128


training:  33%|███▎      | 3588/10986 [1:22:02<2:44:43,  1.34s/it]

training loss: 3.44728422164917


training:  33%|███▎      | 3589/10986 [1:22:03<2:42:04,  1.31s/it]

training loss: 3.44269061088562


training:  33%|███▎      | 3590/10986 [1:22:04<2:40:59,  1.31s/it]

training loss: 3.314309597015381


training:  33%|███▎      | 3591/10986 [1:22:06<2:49:38,  1.38s/it]

training loss: 3.5098726749420166


training:  33%|███▎      | 3592/10986 [1:22:07<2:57:47,  1.44s/it]

training loss: 3.500847101211548


training:  33%|███▎      | 3593/10986 [1:22:09<2:51:44,  1.39s/it]

training loss: 3.341254234313965


training:  33%|███▎      | 3594/10986 [1:22:10<2:47:50,  1.36s/it]

training loss: 3.3104097843170166


training:  33%|███▎      | 3595/10986 [1:22:11<2:45:04,  1.34s/it]

training loss: 3.3061540126800537


training:  33%|███▎      | 3596/10986 [1:22:12<2:44:21,  1.33s/it]

training loss: 3.3239879608154297


training:  33%|███▎      | 3597/10986 [1:22:14<2:42:42,  1.32s/it]

training loss: 3.391695976257324


training:  33%|███▎      | 3598/10986 [1:22:15<2:41:18,  1.31s/it]

training loss: 3.3309507369995117


training:  33%|███▎      | 3599/10986 [1:22:16<2:40:18,  1.30s/it]

training loss: 3.3829243183135986


training:  33%|███▎      | 3600/10986 [1:22:18<2:39:18,  1.29s/it]

training loss: 3.4042670726776123
valid loss: 3.4057979583740234
perplexity: 30.138336181640625


training:  33%|███▎      | 3601/10986 [1:22:20<3:32:51,  1.73s/it]

training loss: 3.4801695346832275


training:  33%|███▎      | 3602/10986 [1:22:22<3:27:51,  1.69s/it]

training loss: 3.375739812850952


training:  33%|███▎      | 3603/10986 [1:22:23<3:15:08,  1.59s/it]

training loss: 3.339573383331299


training:  33%|███▎      | 3604/10986 [1:22:25<3:04:15,  1.50s/it]

training loss: 3.386086940765381


training:  33%|███▎      | 3605/10986 [1:22:26<2:55:56,  1.43s/it]

training loss: 3.3109471797943115


training:  33%|███▎      | 3606/10986 [1:22:27<2:50:17,  1.38s/it]

training loss: 3.4570000171661377


training:  33%|███▎      | 3607/10986 [1:22:28<2:46:05,  1.35s/it]

training loss: 3.4346437454223633


training:  33%|███▎      | 3608/10986 [1:22:30<2:43:10,  1.33s/it]

training loss: 3.432628631591797


training:  33%|███▎      | 3609/10986 [1:22:31<2:41:11,  1.31s/it]

training loss: 3.3253517150878906


training:  33%|███▎      | 3610/10986 [1:22:32<2:40:24,  1.30s/it]

training loss: 3.3879330158233643


training:  33%|███▎      | 3611/10986 [1:22:34<2:50:03,  1.38s/it]

training loss: 3.3533079624176025


training:  33%|███▎      | 3612/10986 [1:22:35<2:46:08,  1.35s/it]

training loss: 3.418653964996338


training:  33%|███▎      | 3613/10986 [1:22:36<2:44:04,  1.34s/it]

training loss: 3.33132004737854


training:  33%|███▎      | 3614/10986 [1:22:38<2:41:55,  1.32s/it]

training loss: 3.4180872440338135


training:  33%|███▎      | 3615/10986 [1:22:39<2:40:13,  1.30s/it]

training loss: 3.461801052093506


training:  33%|███▎      | 3616/10986 [1:22:40<2:39:33,  1.30s/it]

training loss: 3.4255030155181885


training:  33%|███▎      | 3617/10986 [1:22:41<2:39:03,  1.30s/it]

training loss: 3.385420799255371


training:  33%|███▎      | 3618/10986 [1:22:43<2:40:05,  1.30s/it]

training loss: 3.408503293991089


training:  33%|███▎      | 3619/10986 [1:22:44<2:39:20,  1.30s/it]

training loss: 3.3624019622802734


training:  33%|███▎      | 3620/10986 [1:22:45<2:39:49,  1.30s/it]

training loss: 3.4396233558654785
valid loss: 3.429447889328003
perplexity: 30.859601974487305


training:  33%|███▎      | 3621/10986 [1:22:48<3:33:20,  1.74s/it]

training loss: 3.468188524246216


training:  33%|███▎      | 3622/10986 [1:22:49<3:18:48,  1.62s/it]

training loss: 3.3258039951324463


training:  33%|███▎      | 3623/10986 [1:22:51<3:06:51,  1.52s/it]

training loss: 3.4938957691192627


training:  33%|███▎      | 3624/10986 [1:22:52<2:58:03,  1.45s/it]

training loss: 3.3712751865386963


training:  33%|███▎      | 3625/10986 [1:22:53<2:53:28,  1.41s/it]

training loss: 3.471327304840088


training:  33%|███▎      | 3626/10986 [1:22:55<2:49:37,  1.38s/it]

training loss: 3.5788912773132324


training:  33%|███▎      | 3627/10986 [1:22:56<2:45:33,  1.35s/it]

training loss: 3.363736867904663


training:  33%|███▎      | 3628/10986 [1:22:57<2:43:21,  1.33s/it]

training loss: 3.3468263149261475


training:  33%|███▎      | 3629/10986 [1:22:59<2:41:30,  1.32s/it]

training loss: 3.3431925773620605


training:  33%|███▎      | 3630/10986 [1:23:00<2:40:22,  1.31s/it]

training loss: 3.5291941165924072


training:  33%|███▎      | 3631/10986 [1:23:01<2:49:29,  1.38s/it]

training loss: 3.351659059524536


training:  33%|███▎      | 3632/10986 [1:23:03<2:46:54,  1.36s/it]

training loss: 3.375330924987793


training:  33%|███▎      | 3633/10986 [1:23:04<2:43:30,  1.33s/it]

training loss: 3.5643653869628906


training:  33%|███▎      | 3634/10986 [1:23:05<2:43:22,  1.33s/it]

training loss: 3.3633551597595215


training:  33%|███▎      | 3635/10986 [1:23:07<2:41:22,  1.32s/it]

training loss: 3.4646408557891846


training:  33%|███▎      | 3636/10986 [1:23:08<2:39:53,  1.31s/it]

training loss: 3.447176694869995


training:  33%|███▎      | 3637/10986 [1:23:09<2:38:57,  1.30s/it]

training loss: 3.394543170928955


training:  33%|███▎      | 3638/10986 [1:23:10<2:38:32,  1.29s/it]

training loss: 3.3793489933013916


training:  33%|███▎      | 3639/10986 [1:23:12<2:39:03,  1.30s/it]

training loss: 3.434363842010498


training:  33%|███▎      | 3640/10986 [1:23:13<2:38:46,  1.30s/it]

training loss: 3.533169746398926
valid loss: 3.5314085483551025
perplexity: 34.17206573486328


training:  33%|███▎      | 3641/10986 [1:23:16<3:33:06,  1.74s/it]

training loss: 3.325978994369507


training:  33%|███▎      | 3642/10986 [1:23:17<3:20:16,  1.64s/it]

training loss: 3.5870158672332764


training:  33%|███▎      | 3643/10986 [1:23:18<3:08:12,  1.54s/it]

training loss: 3.358506679534912


training:  33%|███▎      | 3644/10986 [1:23:20<2:59:23,  1.47s/it]

training loss: 3.3485419750213623


training:  33%|███▎      | 3645/10986 [1:23:21<3:02:41,  1.49s/it]

training loss: 3.3618221282958984


training:  33%|███▎      | 3646/10986 [1:23:23<3:10:42,  1.56s/it]

training loss: 3.4097955226898193


training:  33%|███▎      | 3647/10986 [1:23:25<3:11:28,  1.57s/it]

training loss: 3.4004344940185547


training:  33%|███▎      | 3648/10986 [1:23:26<3:01:57,  1.49s/it]

training loss: 3.4279556274414062


training:  33%|███▎      | 3649/10986 [1:23:27<2:54:56,  1.43s/it]

training loss: 3.3835442066192627


training:  33%|███▎      | 3650/10986 [1:23:29<2:50:25,  1.39s/it]

training loss: 3.305020332336426


training:  33%|███▎      | 3651/10986 [1:23:30<2:57:24,  1.45s/it]

training loss: 3.6209166049957275


training:  33%|███▎      | 3652/10986 [1:23:31<2:52:55,  1.41s/it]

training loss: 3.367436647415161


training:  33%|███▎      | 3653/10986 [1:23:33<2:48:55,  1.38s/it]

training loss: 3.461597442626953


training:  33%|███▎      | 3654/10986 [1:23:34<2:45:45,  1.36s/it]

training loss: 3.2572743892669678


training:  33%|███▎      | 3655/10986 [1:23:35<2:43:58,  1.34s/it]

training loss: 3.4860880374908447


training:  33%|███▎      | 3656/10986 [1:23:37<2:42:14,  1.33s/it]

training loss: 3.4407565593719482


training:  33%|███▎      | 3657/10986 [1:23:38<2:40:23,  1.31s/it]

training loss: 3.4378511905670166


training:  33%|███▎      | 3658/10986 [1:23:39<2:40:06,  1.31s/it]

training loss: 3.4571478366851807


training:  33%|███▎      | 3659/10986 [1:23:41<2:39:16,  1.30s/it]

training loss: 3.4006521701812744


training:  33%|███▎      | 3660/10986 [1:23:42<2:38:25,  1.30s/it]

training loss: 3.58136248588562
valid loss: 3.5570228099823
perplexity: 35.05866241455078


training:  33%|███▎      | 3661/10986 [1:23:45<3:32:17,  1.74s/it]

training loss: 3.4390878677368164


training:  33%|███▎      | 3662/10986 [1:23:46<3:19:27,  1.63s/it]

training loss: 3.3930790424346924


training:  33%|███▎      | 3663/10986 [1:23:47<3:07:02,  1.53s/it]

training loss: 3.3714911937713623


training:  33%|███▎      | 3664/10986 [1:23:49<2:57:29,  1.45s/it]

training loss: 3.5004162788391113


training:  33%|███▎      | 3665/10986 [1:23:50<2:51:11,  1.40s/it]

training loss: 3.313079357147217


training:  33%|███▎      | 3666/10986 [1:23:51<2:46:46,  1.37s/it]

training loss: 3.5539751052856445


training:  33%|███▎      | 3667/10986 [1:23:52<2:43:03,  1.34s/it]

training loss: 3.441504716873169


training:  33%|███▎      | 3668/10986 [1:23:54<2:41:44,  1.33s/it]

training loss: 3.417285442352295


training:  33%|███▎      | 3669/10986 [1:23:55<2:40:34,  1.32s/it]

training loss: 3.486299753189087


training:  33%|███▎      | 3670/10986 [1:23:56<2:39:25,  1.31s/it]

training loss: 3.358819007873535


training:  33%|███▎      | 3671/10986 [1:23:58<2:47:52,  1.38s/it]

training loss: 3.410785675048828


training:  33%|███▎      | 3672/10986 [1:23:59<2:55:18,  1.44s/it]

training loss: 3.4520974159240723


training:  33%|███▎      | 3673/10986 [1:24:01<2:49:55,  1.39s/it]

training loss: 3.5491247177124023


training:  33%|███▎      | 3674/10986 [1:24:02<2:45:48,  1.36s/it]

training loss: 3.4610042572021484


training:  33%|███▎      | 3675/10986 [1:24:03<2:42:38,  1.33s/it]

training loss: 3.560662269592285


training:  33%|███▎      | 3676/10986 [1:24:04<2:40:14,  1.32s/it]

training loss: 3.3726937770843506


training:  33%|███▎      | 3677/10986 [1:24:06<2:38:24,  1.30s/it]

training loss: 3.37562894821167


training:  33%|███▎      | 3678/10986 [1:24:07<2:37:25,  1.29s/it]

training loss: 3.4214282035827637


training:  33%|███▎      | 3679/10986 [1:24:08<2:36:28,  1.28s/it]

training loss: 3.401853561401367


training:  33%|███▎      | 3680/10986 [1:24:10<2:35:52,  1.28s/it]

training loss: 3.5511248111724854
valid loss: 3.5535171031951904
perplexity: 34.935977935791016


training:  34%|███▎      | 3681/10986 [1:24:12<3:28:46,  1.71s/it]

training loss: 3.407797336578369


training:  34%|███▎      | 3682/10986 [1:24:14<3:18:08,  1.63s/it]

training loss: 3.343355894088745


training:  34%|███▎      | 3683/10986 [1:24:15<3:06:05,  1.53s/it]

training loss: 3.4705679416656494


training:  34%|███▎      | 3684/10986 [1:24:16<2:56:32,  1.45s/it]

training loss: 3.338287830352783


training:  34%|███▎      | 3685/10986 [1:24:18<2:50:35,  1.40s/it]

training loss: 3.5307235717773438


training:  34%|███▎      | 3686/10986 [1:24:19<2:46:47,  1.37s/it]

training loss: 3.4506006240844727


training:  34%|███▎      | 3687/10986 [1:24:20<2:43:03,  1.34s/it]

training loss: 3.323431968688965


training:  34%|███▎      | 3688/10986 [1:24:21<2:41:01,  1.32s/it]

training loss: 3.3303415775299072


training:  34%|███▎      | 3689/10986 [1:24:23<2:38:51,  1.31s/it]

training loss: 3.453514814376831


training:  34%|███▎      | 3690/10986 [1:24:24<2:38:11,  1.30s/it]

training loss: 3.41800856590271


training:  34%|███▎      | 3691/10986 [1:24:26<2:49:35,  1.39s/it]

training loss: 3.3358044624328613


training:  34%|███▎      | 3692/10986 [1:24:27<2:55:13,  1.44s/it]

training loss: 3.391845703125


training:  34%|███▎      | 3693/10986 [1:24:28<2:49:22,  1.39s/it]

training loss: 3.419351100921631


training:  34%|███▎      | 3694/10986 [1:24:30<2:45:09,  1.36s/it]

training loss: 3.3979814052581787


training:  34%|███▎      | 3695/10986 [1:24:31<2:42:07,  1.33s/it]

training loss: 3.439293384552002


training:  34%|███▎      | 3696/10986 [1:24:32<2:40:35,  1.32s/it]

training loss: 3.415621042251587


training:  34%|███▎      | 3697/10986 [1:24:34<2:38:37,  1.31s/it]

training loss: 3.431262969970703


training:  34%|███▎      | 3698/10986 [1:24:35<2:37:46,  1.30s/it]

training loss: 3.4443979263305664


training:  34%|███▎      | 3699/10986 [1:24:36<2:37:40,  1.30s/it]

training loss: 3.369821071624756


training:  34%|███▎      | 3700/10986 [1:24:37<2:36:47,  1.29s/it]

training loss: 3.325279712677002
valid loss: 3.3250718116760254
perplexity: 27.800994873046875


training:  34%|███▎      | 3701/10986 [1:24:40<3:28:11,  1.71s/it]

training loss: 3.5419085025787354


training:  34%|███▎      | 3702/10986 [1:24:41<3:15:02,  1.61s/it]

training loss: 3.30104660987854


training:  34%|███▎      | 3703/10986 [1:24:43<3:03:51,  1.51s/it]

training loss: 3.325200319290161


training:  34%|███▎      | 3704/10986 [1:24:44<2:55:04,  1.44s/it]

training loss: 3.3230862617492676


training:  34%|███▎      | 3705/10986 [1:24:45<2:49:51,  1.40s/it]

training loss: 3.4303526878356934


training:  34%|███▎      | 3706/10986 [1:24:47<2:45:26,  1.36s/it]

training loss: 3.343512535095215


training:  34%|███▎      | 3707/10986 [1:24:48<2:43:50,  1.35s/it]

training loss: 3.4077835083007812


training:  34%|███▍      | 3708/10986 [1:24:49<2:41:21,  1.33s/it]

training loss: 3.3790361881256104


training:  34%|███▍      | 3709/10986 [1:24:51<2:40:24,  1.32s/it]

training loss: 3.369267225265503


training:  34%|███▍      | 3710/10986 [1:24:52<2:39:36,  1.32s/it]

training loss: 3.3414459228515625


training:  34%|███▍      | 3711/10986 [1:24:53<2:49:03,  1.39s/it]

training loss: 3.5080127716064453


training:  34%|███▍      | 3712/10986 [1:24:55<2:47:55,  1.39s/it]

training loss: 3.373948335647583


training:  34%|███▍      | 3713/10986 [1:24:56<2:45:27,  1.37s/it]

training loss: 3.311582565307617


training:  34%|███▍      | 3714/10986 [1:24:57<2:43:09,  1.35s/it]

training loss: 3.558708906173706


training:  34%|███▍      | 3715/10986 [1:24:59<2:41:20,  1.33s/it]

training loss: 3.5348429679870605


training:  34%|███▍      | 3716/10986 [1:25:00<2:39:56,  1.32s/it]

training loss: 3.4656894207000732


training:  34%|███▍      | 3717/10986 [1:25:01<2:38:37,  1.31s/it]

training loss: 3.4407312870025635


training:  34%|███▍      | 3718/10986 [1:25:03<2:37:55,  1.30s/it]

training loss: 3.351713180541992


training:  34%|███▍      | 3719/10986 [1:25:04<2:37:10,  1.30s/it]

training loss: 3.3937413692474365


training:  34%|███▍      | 3720/10986 [1:25:05<2:37:06,  1.30s/it]

training loss: 3.4840335845947266
valid loss: 3.483734130859375
perplexity: 32.58115768432617


training:  34%|███▍      | 3721/10986 [1:25:08<3:29:07,  1.73s/it]

training loss: 3.3724911212921143


training:  34%|███▍      | 3722/10986 [1:25:09<3:15:36,  1.62s/it]

training loss: 3.4016592502593994


training:  34%|███▍      | 3723/10986 [1:25:10<3:03:30,  1.52s/it]

training loss: 3.3795371055603027


training:  34%|███▍      | 3724/10986 [1:25:12<2:55:05,  1.45s/it]

training loss: 3.4494526386260986


training:  34%|███▍      | 3725/10986 [1:25:13<2:49:26,  1.40s/it]

training loss: 3.3164749145507812


training:  34%|███▍      | 3726/10986 [1:25:14<2:45:04,  1.36s/it]

training loss: 3.4030213356018066


training:  34%|███▍      | 3727/10986 [1:25:16<2:42:34,  1.34s/it]

training loss: 3.455979824066162


training:  34%|███▍      | 3728/10986 [1:25:17<2:40:06,  1.32s/it]

training loss: 3.434675455093384


training:  34%|███▍      | 3729/10986 [1:25:18<2:39:07,  1.32s/it]

training loss: 3.323021650314331


training:  34%|███▍      | 3730/10986 [1:25:20<2:37:41,  1.30s/it]

training loss: 3.4380276203155518


training:  34%|███▍      | 3731/10986 [1:25:21<2:46:06,  1.37s/it]

training loss: 3.345057725906372


training:  34%|███▍      | 3732/10986 [1:25:22<2:43:29,  1.35s/it]

training loss: 3.410250663757324


training:  34%|███▍      | 3733/10986 [1:25:24<2:41:14,  1.33s/it]

training loss: 3.278900146484375


training:  34%|███▍      | 3734/10986 [1:25:25<2:40:19,  1.33s/it]

training loss: 3.467097520828247


training:  34%|███▍      | 3735/10986 [1:25:26<2:39:52,  1.32s/it]

training loss: 3.2998313903808594


training:  34%|███▍      | 3736/10986 [1:25:28<2:38:22,  1.31s/it]

training loss: 3.3401870727539062


training:  34%|███▍      | 3737/10986 [1:25:29<2:36:06,  1.29s/it]

training loss: 3.3938562870025635


training:  34%|███▍      | 3738/10986 [1:25:30<2:35:04,  1.28s/it]

training loss: 3.3102517127990723


training:  34%|███▍      | 3739/10986 [1:25:31<2:34:55,  1.28s/it]

training loss: 3.2491886615753174


training:  34%|███▍      | 3740/10986 [1:25:33<2:34:21,  1.28s/it]

training loss: 3.497889280319214
valid loss: 3.4896059036254883
perplexity: 32.77302932739258


training:  34%|███▍      | 3741/10986 [1:25:35<3:24:59,  1.70s/it]

training loss: 3.4104089736938477


training:  34%|███▍      | 3742/10986 [1:25:37<3:11:57,  1.59s/it]

training loss: 3.3026561737060547


training:  34%|███▍      | 3743/10986 [1:25:38<3:00:31,  1.50s/it]

training loss: 3.4147441387176514


training:  34%|███▍      | 3744/10986 [1:25:39<2:51:53,  1.42s/it]

training loss: 3.361663579940796


training:  34%|███▍      | 3745/10986 [1:25:40<2:46:59,  1.38s/it]

training loss: 3.391752243041992


training:  34%|███▍      | 3746/10986 [1:25:42<2:42:34,  1.35s/it]

training loss: 3.4107022285461426


training:  34%|███▍      | 3747/10986 [1:25:43<2:40:00,  1.33s/it]

training loss: 3.350045680999756


training:  34%|███▍      | 3748/10986 [1:25:44<2:37:49,  1.31s/it]

training loss: 3.4074363708496094


training:  34%|███▍      | 3749/10986 [1:25:45<2:35:53,  1.29s/it]

training loss: 3.3762331008911133


training:  34%|███▍      | 3750/10986 [1:25:47<2:34:49,  1.28s/it]

training loss: 3.3941681385040283


training:  34%|███▍      | 3751/10986 [1:25:48<2:43:34,  1.36s/it]

training loss: 3.3740792274475098


training:  34%|███▍      | 3752/10986 [1:25:50<2:41:36,  1.34s/it]

training loss: 3.2481849193573


training:  34%|███▍      | 3753/10986 [1:25:51<2:39:28,  1.32s/it]

training loss: 3.370591163635254


training:  34%|███▍      | 3754/10986 [1:25:52<2:37:45,  1.31s/it]

training loss: 3.4430394172668457


training:  34%|███▍      | 3755/10986 [1:25:53<2:35:39,  1.29s/it]

training loss: 3.4826505184173584


training:  34%|███▍      | 3756/10986 [1:25:55<2:33:53,  1.28s/it]

training loss: 3.411639451980591


training:  34%|███▍      | 3757/10986 [1:25:56<2:35:39,  1.29s/it]

training loss: 3.353498935699463


training:  34%|███▍      | 3758/10986 [1:25:57<2:35:46,  1.29s/it]

training loss: 3.2954893112182617


training:  34%|███▍      | 3759/10986 [1:25:59<2:34:53,  1.29s/it]

training loss: 3.4756600856781006


training:  34%|███▍      | 3760/10986 [1:26:00<2:35:19,  1.29s/it]

training loss: 3.4575374126434326
valid loss: 3.45046329498291
perplexity: 31.51498794555664


training:  34%|███▍      | 3761/10986 [1:26:03<3:27:52,  1.73s/it]

training loss: 3.586117744445801


training:  34%|███▍      | 3762/10986 [1:26:04<3:14:24,  1.61s/it]

training loss: 3.3509058952331543


training:  34%|███▍      | 3763/10986 [1:26:05<3:02:13,  1.51s/it]

training loss: 3.4769749641418457


training:  34%|███▍      | 3764/10986 [1:26:06<2:53:55,  1.45s/it]

training loss: 3.4402379989624023


training:  34%|███▍      | 3765/10986 [1:26:08<2:47:44,  1.39s/it]

training loss: 3.5222415924072266


training:  34%|███▍      | 3766/10986 [1:26:09<2:43:08,  1.36s/it]

training loss: 3.3632757663726807


training:  34%|███▍      | 3767/10986 [1:26:10<2:40:10,  1.33s/it]

training loss: 3.399360418319702


training:  34%|███▍      | 3768/10986 [1:26:12<2:38:31,  1.32s/it]

training loss: 3.5324718952178955


training:  34%|███▍      | 3769/10986 [1:26:13<2:38:02,  1.31s/it]

training loss: 3.5422210693359375


training:  34%|███▍      | 3770/10986 [1:26:14<2:36:54,  1.30s/it]

training loss: 3.4675798416137695


training:  34%|███▍      | 3771/10986 [1:26:16<2:46:07,  1.38s/it]

training loss: 3.2968697547912598


training:  34%|███▍      | 3772/10986 [1:26:17<2:43:09,  1.36s/it]

training loss: 3.4373319149017334


training:  34%|███▍      | 3773/10986 [1:26:18<2:41:23,  1.34s/it]

training loss: 3.3901994228363037


training:  34%|███▍      | 3774/10986 [1:26:20<2:38:43,  1.32s/it]

training loss: 3.3568179607391357


training:  34%|███▍      | 3775/10986 [1:26:21<2:37:37,  1.31s/it]

training loss: 3.3103199005126953


training:  34%|███▍      | 3776/10986 [1:26:22<2:36:28,  1.30s/it]

training loss: 3.320711374282837


training:  34%|███▍      | 3777/10986 [1:26:23<2:35:32,  1.29s/it]

training loss: 3.4583723545074463


training:  34%|███▍      | 3778/10986 [1:26:25<2:34:40,  1.29s/it]

training loss: 3.3400166034698486


training:  34%|███▍      | 3779/10986 [1:26:26<2:34:38,  1.29s/it]

training loss: 3.323911666870117


training:  34%|███▍      | 3780/10986 [1:26:27<2:34:34,  1.29s/it]

training loss: 3.322662591934204
valid loss: 3.326179265975952
perplexity: 27.83180046081543


training:  34%|███▍      | 3781/10986 [1:26:30<3:24:53,  1.71s/it]

training loss: 3.351093292236328


training:  34%|███▍      | 3782/10986 [1:26:31<3:13:04,  1.61s/it]

training loss: 3.3375439643859863


training:  34%|███▍      | 3783/10986 [1:26:33<3:03:47,  1.53s/it]

training loss: 3.3083314895629883


training:  34%|███▍      | 3784/10986 [1:26:34<2:55:27,  1.46s/it]

training loss: 3.5742971897125244


training:  34%|███▍      | 3785/10986 [1:26:35<2:49:47,  1.41s/it]

training loss: 3.4913687705993652


training:  34%|███▍      | 3786/10986 [1:26:37<2:45:02,  1.38s/it]

training loss: 3.486337661743164


training:  34%|███▍      | 3787/10986 [1:26:38<2:41:03,  1.34s/it]

training loss: 3.4031715393066406


training:  34%|███▍      | 3788/10986 [1:26:39<2:39:08,  1.33s/it]

training loss: 3.483630418777466


training:  34%|███▍      | 3789/10986 [1:26:40<2:38:22,  1.32s/it]

training loss: 3.31894588470459


training:  34%|███▍      | 3790/10986 [1:26:42<2:36:48,  1.31s/it]

training loss: 3.3561909198760986


training:  35%|███▍      | 3791/10986 [1:26:43<2:46:21,  1.39s/it]

training loss: 3.403575897216797


training:  35%|███▍      | 3792/10986 [1:26:45<2:53:12,  1.44s/it]

training loss: 3.3689193725585938


training:  35%|███▍      | 3793/10986 [1:26:47<3:00:13,  1.50s/it]

training loss: 3.420383930206299


training:  35%|███▍      | 3794/10986 [1:26:48<3:06:07,  1.55s/it]

training loss: 3.4041900634765625


training:  35%|███▍      | 3795/10986 [1:26:50<3:01:35,  1.52s/it]

training loss: 3.337310791015625


training:  35%|███▍      | 3796/10986 [1:26:51<2:53:20,  1.45s/it]

training loss: 3.346587657928467


training:  35%|███▍      | 3797/10986 [1:26:52<2:47:28,  1.40s/it]

training loss: 3.435608148574829


training:  35%|███▍      | 3798/10986 [1:26:54<2:44:02,  1.37s/it]

training loss: 3.395735263824463


training:  35%|███▍      | 3799/10986 [1:26:55<2:41:55,  1.35s/it]

training loss: 3.3892502784729004


training:  35%|███▍      | 3800/10986 [1:26:56<2:40:13,  1.34s/it]

training loss: 3.301926612854004
valid loss: 3.3016929626464844
perplexity: 27.158578872680664


training:  35%|███▍      | 3801/10986 [1:26:59<3:31:16,  1.76s/it]

training loss: 3.470851182937622


training:  35%|███▍      | 3802/10986 [1:27:00<3:16:53,  1.64s/it]

training loss: 3.502157211303711


training:  35%|███▍      | 3803/10986 [1:27:02<3:04:56,  1.54s/it]

training loss: 3.4353280067443848


training:  35%|███▍      | 3804/10986 [1:27:03<2:55:34,  1.47s/it]

training loss: 3.468364953994751


training:  35%|███▍      | 3805/10986 [1:27:04<2:49:27,  1.42s/it]

training loss: 3.5360426902770996


training:  35%|███▍      | 3806/10986 [1:27:05<2:44:28,  1.37s/it]

training loss: 3.42360258102417


training:  35%|███▍      | 3807/10986 [1:27:07<2:40:45,  1.34s/it]

training loss: 3.3598899841308594


training:  35%|███▍      | 3808/10986 [1:27:08<2:38:33,  1.33s/it]

training loss: 3.5142548084259033


training:  35%|███▍      | 3809/10986 [1:27:09<2:36:58,  1.31s/it]

training loss: 3.3824055194854736


training:  35%|███▍      | 3810/10986 [1:27:11<2:37:19,  1.32s/it]

training loss: 3.389045000076294


training:  35%|███▍      | 3811/10986 [1:27:12<2:46:44,  1.39s/it]

training loss: 3.3715617656707764


training:  35%|███▍      | 3812/10986 [1:27:13<2:43:16,  1.37s/it]

training loss: 3.616525173187256


training:  35%|███▍      | 3813/10986 [1:27:15<2:40:52,  1.35s/it]

training loss: 3.528252124786377


training:  35%|███▍      | 3814/10986 [1:27:16<2:38:56,  1.33s/it]

training loss: 3.5009078979492188


training:  35%|███▍      | 3815/10986 [1:27:17<2:36:48,  1.31s/it]

training loss: 3.653372287750244


training:  35%|███▍      | 3816/10986 [1:27:19<2:35:41,  1.30s/it]

training loss: 3.3616340160369873


training:  35%|███▍      | 3817/10986 [1:27:20<2:35:11,  1.30s/it]

training loss: 3.3937466144561768


training:  35%|███▍      | 3818/10986 [1:27:21<2:34:50,  1.30s/it]

training loss: 3.348970890045166


training:  35%|███▍      | 3819/10986 [1:27:22<2:34:38,  1.29s/it]

training loss: 3.404649496078491


training:  35%|███▍      | 3820/10986 [1:27:24<2:33:54,  1.29s/it]

training loss: 3.4371554851531982
valid loss: 3.43674635887146
perplexity: 31.085651397705078


training:  35%|███▍      | 3821/10986 [1:27:27<3:27:15,  1.74s/it]

training loss: 3.493189573287964


training:  35%|███▍      | 3822/10986 [1:27:28<3:13:52,  1.62s/it]

training loss: 3.4115970134735107


training:  35%|███▍      | 3823/10986 [1:27:29<3:03:46,  1.54s/it]

training loss: 3.383580446243286


training:  35%|███▍      | 3824/10986 [1:27:31<2:56:34,  1.48s/it]

training loss: 3.386953830718994


training:  35%|███▍      | 3825/10986 [1:27:32<2:51:12,  1.43s/it]

training loss: 3.3053784370422363


training:  35%|███▍      | 3826/10986 [1:27:33<2:45:50,  1.39s/it]

training loss: 3.4538185596466064


training:  35%|███▍      | 3827/10986 [1:27:34<2:42:46,  1.36s/it]

training loss: 3.471477508544922


training:  35%|███▍      | 3828/10986 [1:27:36<2:40:29,  1.35s/it]

training loss: 3.4538984298706055


training:  35%|███▍      | 3829/10986 [1:27:37<2:38:43,  1.33s/it]

training loss: 3.407599925994873


training:  35%|███▍      | 3830/10986 [1:27:38<2:37:57,  1.32s/it]

training loss: 3.3905017375946045


training:  35%|███▍      | 3831/10986 [1:27:40<2:46:19,  1.39s/it]

training loss: 3.3249993324279785


training:  35%|███▍      | 3832/10986 [1:27:41<2:44:13,  1.38s/it]

training loss: 3.4964230060577393


training:  35%|███▍      | 3833/10986 [1:27:43<2:41:08,  1.35s/it]

training loss: 3.420131206512451


training:  35%|███▍      | 3834/10986 [1:27:44<2:38:46,  1.33s/it]

training loss: 3.4605367183685303


training:  35%|███▍      | 3835/10986 [1:27:45<2:37:12,  1.32s/it]

training loss: 3.505953073501587


training:  35%|███▍      | 3836/10986 [1:27:46<2:36:00,  1.31s/it]

training loss: 3.3642995357513428


training:  35%|███▍      | 3837/10986 [1:27:48<2:35:45,  1.31s/it]

training loss: 3.4348089694976807


training:  35%|███▍      | 3838/10986 [1:27:49<2:34:37,  1.30s/it]

training loss: 3.428729295730591


training:  35%|███▍      | 3839/10986 [1:27:50<2:34:18,  1.30s/it]

training loss: 3.315218448638916


training:  35%|███▍      | 3840/10986 [1:27:52<2:34:13,  1.29s/it]

training loss: 3.375812292098999
valid loss: 3.377135753631592
perplexity: 29.286767959594727


training:  35%|███▍      | 3841/10986 [1:27:54<3:26:58,  1.74s/it]

training loss: 3.4280734062194824


training:  35%|███▍      | 3842/10986 [1:27:56<3:14:32,  1.63s/it]

training loss: 3.457428455352783


training:  35%|███▍      | 3843/10986 [1:27:57<3:02:32,  1.53s/it]

training loss: 3.4089608192443848


training:  35%|███▍      | 3844/10986 [1:27:58<2:54:12,  1.46s/it]

training loss: 3.4474925994873047


training:  35%|███▍      | 3845/10986 [1:28:00<2:49:19,  1.42s/it]

training loss: 3.366220712661743


training:  35%|███▌      | 3846/10986 [1:28:01<2:45:09,  1.39s/it]

training loss: 3.392505407333374


training:  35%|███▌      | 3847/10986 [1:28:02<2:42:42,  1.37s/it]

training loss: 3.3453316688537598


training:  35%|███▌      | 3848/10986 [1:28:04<2:39:58,  1.34s/it]

training loss: 3.4221715927124023


training:  35%|███▌      | 3849/10986 [1:28:05<2:38:23,  1.33s/it]

training loss: 3.3637733459472656


training:  35%|███▌      | 3850/10986 [1:28:06<2:37:22,  1.32s/it]

training loss: 3.4267208576202393


training:  35%|███▌      | 3851/10986 [1:28:08<2:46:04,  1.40s/it]

training loss: 3.3245153427124023


training:  35%|███▌      | 3852/10986 [1:28:09<2:43:56,  1.38s/it]

training loss: 3.3954339027404785


training:  35%|███▌      | 3853/10986 [1:28:10<2:40:39,  1.35s/it]

training loss: 3.3980183601379395


training:  35%|███▌      | 3854/10986 [1:28:12<2:38:08,  1.33s/it]

training loss: 3.3447647094726562


training:  35%|███▌      | 3855/10986 [1:28:13<2:37:37,  1.33s/it]

training loss: 3.4534308910369873


training:  35%|███▌      | 3856/10986 [1:28:14<2:36:45,  1.32s/it]

training loss: 3.452291965484619


training:  35%|███▌      | 3857/10986 [1:28:16<2:36:13,  1.31s/it]

training loss: 3.3371615409851074


training:  35%|███▌      | 3858/10986 [1:28:17<2:35:22,  1.31s/it]

training loss: 3.3488430976867676


training:  35%|███▌      | 3859/10986 [1:28:18<2:35:03,  1.31s/it]

training loss: 3.472618818283081


training:  35%|███▌      | 3860/10986 [1:28:20<2:34:39,  1.30s/it]

training loss: 3.4918997287750244
valid loss: 3.4872236251831055
perplexity: 32.69504928588867


training:  35%|███▌      | 3861/10986 [1:28:22<3:24:56,  1.73s/it]

training loss: 3.4473154544830322


training:  35%|███▌      | 3862/10986 [1:28:24<3:11:57,  1.62s/it]

training loss: 3.477431297302246


training:  35%|███▌      | 3863/10986 [1:28:25<3:00:33,  1.52s/it]

training loss: 3.383713483810425


training:  35%|███▌      | 3864/10986 [1:28:26<2:52:12,  1.45s/it]

training loss: 3.5692811012268066


training:  35%|███▌      | 3865/10986 [1:28:27<2:46:39,  1.40s/it]

training loss: 3.518728733062744


training:  35%|███▌      | 3866/10986 [1:28:29<2:42:26,  1.37s/it]

training loss: 3.4114153385162354


training:  35%|███▌      | 3867/10986 [1:28:30<2:40:36,  1.35s/it]

training loss: 3.461862564086914


training:  35%|███▌      | 3868/10986 [1:28:31<2:38:39,  1.34s/it]

training loss: 3.3231072425842285


training:  35%|███▌      | 3869/10986 [1:28:33<2:36:39,  1.32s/it]

training loss: 3.3887364864349365


training:  35%|███▌      | 3870/10986 [1:28:34<2:35:14,  1.31s/it]

training loss: 3.416898727416992


training:  35%|███▌      | 3871/10986 [1:28:35<2:44:20,  1.39s/it]

training loss: 3.4482500553131104


training:  35%|███▌      | 3872/10986 [1:28:37<2:41:51,  1.37s/it]

training loss: 3.315451145172119


training:  35%|███▌      | 3873/10986 [1:28:38<2:38:59,  1.34s/it]

training loss: 3.397197723388672


training:  35%|███▌      | 3874/10986 [1:28:39<2:37:43,  1.33s/it]

training loss: 3.455634832382202


training:  35%|███▌      | 3875/10986 [1:28:41<2:36:43,  1.32s/it]

training loss: 3.359452486038208


training:  35%|███▌      | 3876/10986 [1:28:42<2:35:20,  1.31s/it]

training loss: 3.423736095428467


training:  35%|███▌      | 3877/10986 [1:28:43<2:34:28,  1.30s/it]

training loss: 3.308561325073242


training:  35%|███▌      | 3878/10986 [1:28:45<2:33:57,  1.30s/it]

training loss: 3.4993653297424316


training:  35%|███▌      | 3879/10986 [1:28:46<2:33:30,  1.30s/it]

training loss: 3.554856061935425


training:  35%|███▌      | 3880/10986 [1:28:47<2:33:08,  1.29s/it]

training loss: 3.4493980407714844
valid loss: 3.441633462905884
perplexity: 31.23794174194336


training:  35%|███▌      | 3881/10986 [1:28:50<3:24:28,  1.73s/it]

training loss: 3.5411770343780518


training:  35%|███▌      | 3882/10986 [1:28:52<3:21:49,  1.70s/it]

training loss: 3.374220371246338


training:  35%|███▌      | 3883/10986 [1:28:53<3:07:30,  1.58s/it]

training loss: 3.352888345718384


training:  35%|███▌      | 3884/10986 [1:28:54<2:57:28,  1.50s/it]

training loss: 3.393453359603882


training:  35%|███▌      | 3885/10986 [1:28:55<2:49:43,  1.43s/it]

training loss: 3.5359363555908203


training:  35%|███▌      | 3886/10986 [1:28:57<2:45:05,  1.40s/it]

training loss: 3.3479104042053223


training:  35%|███▌      | 3887/10986 [1:28:58<2:41:49,  1.37s/it]

training loss: 3.267446279525757


training:  35%|███▌      | 3888/10986 [1:28:59<2:39:25,  1.35s/it]

training loss: 3.4464409351348877


training:  35%|███▌      | 3889/10986 [1:29:01<2:37:54,  1.34s/it]

training loss: 3.40671443939209


training:  35%|███▌      | 3890/10986 [1:29:02<2:37:27,  1.33s/it]

training loss: 3.4402337074279785


training:  35%|███▌      | 3891/10986 [1:29:04<2:45:20,  1.40s/it]

training loss: 3.3640477657318115


training:  35%|███▌      | 3892/10986 [1:29:05<2:52:05,  1.46s/it]

training loss: 3.372506856918335


training:  35%|███▌      | 3893/10986 [1:29:06<2:46:58,  1.41s/it]

training loss: 3.3376035690307617


training:  35%|███▌      | 3894/10986 [1:29:08<2:42:20,  1.37s/it]

training loss: 3.488949775695801


training:  35%|███▌      | 3895/10986 [1:29:09<2:39:48,  1.35s/it]

training loss: 3.419257640838623


training:  35%|███▌      | 3896/10986 [1:29:10<2:38:01,  1.34s/it]

training loss: 3.487438201904297


training:  35%|███▌      | 3897/10986 [1:29:12<2:36:39,  1.33s/it]

training loss: 3.4043772220611572


training:  35%|███▌      | 3898/10986 [1:29:13<2:36:29,  1.32s/it]

training loss: 3.4482874870300293


training:  35%|███▌      | 3899/10986 [1:29:14<2:35:47,  1.32s/it]

training loss: 3.3360729217529297


training:  35%|███▌      | 3900/10986 [1:29:16<2:34:50,  1.31s/it]

training loss: 3.5001285076141357
valid loss: 3.4898412227630615
perplexity: 32.78074264526367


training:  36%|███▌      | 3901/10986 [1:29:18<3:26:15,  1.75s/it]

training loss: 3.461085319519043


training:  36%|███▌      | 3902/10986 [1:29:20<3:13:48,  1.64s/it]

training loss: 3.4640398025512695


training:  36%|███▌      | 3903/10986 [1:29:21<3:02:04,  1.54s/it]

training loss: 3.404325246810913


training:  36%|███▌      | 3904/10986 [1:29:22<2:53:32,  1.47s/it]

training loss: 3.5522613525390625


training:  36%|███▌      | 3905/10986 [1:29:24<2:46:59,  1.41s/it]

training loss: 3.3842997550964355


training:  36%|███▌      | 3906/10986 [1:29:25<2:43:10,  1.38s/it]

training loss: 3.321694850921631


training:  36%|███▌      | 3907/10986 [1:29:26<2:39:54,  1.36s/it]

training loss: 3.3934435844421387


training:  36%|███▌      | 3908/10986 [1:29:27<2:38:18,  1.34s/it]

training loss: 3.3888015747070312


training:  36%|███▌      | 3909/10986 [1:29:29<2:37:25,  1.33s/it]

training loss: 3.423929452896118


training:  36%|███▌      | 3910/10986 [1:29:30<2:36:33,  1.33s/it]

training loss: 3.4288058280944824


training:  36%|███▌      | 3911/10986 [1:29:32<2:46:37,  1.41s/it]

training loss: 3.478890895843506


training:  36%|███▌      | 3912/10986 [1:29:33<2:52:48,  1.47s/it]

training loss: 3.442408561706543


training:  36%|███▌      | 3913/10986 [1:29:35<2:46:51,  1.42s/it]

training loss: 3.37441086769104


training:  36%|███▌      | 3914/10986 [1:29:36<2:42:24,  1.38s/it]

training loss: 3.4077887535095215


training:  36%|███▌      | 3915/10986 [1:29:37<2:39:04,  1.35s/it]

training loss: 3.531667470932007


training:  36%|███▌      | 3916/10986 [1:29:38<2:37:29,  1.34s/it]

training loss: 3.5338802337646484


training:  36%|███▌      | 3917/10986 [1:29:40<2:35:36,  1.32s/it]

training loss: 3.4338810443878174


training:  36%|███▌      | 3918/10986 [1:29:41<2:35:41,  1.32s/it]

training loss: 3.438641309738159


training:  36%|███▌      | 3919/10986 [1:29:42<2:34:35,  1.31s/it]

training loss: 3.4420552253723145


training:  36%|███▌      | 3920/10986 [1:29:44<2:34:26,  1.31s/it]

training loss: 3.4461209774017334
valid loss: 3.4418702125549316
perplexity: 31.245338439941406


training:  36%|███▌      | 3921/10986 [1:29:46<3:26:23,  1.75s/it]

training loss: 3.4449381828308105


training:  36%|███▌      | 3922/10986 [1:29:48<3:13:16,  1.64s/it]

training loss: 3.386828660964966


training:  36%|███▌      | 3923/10986 [1:29:49<3:01:19,  1.54s/it]

training loss: 3.3439784049987793


training:  36%|███▌      | 3924/10986 [1:29:50<2:52:38,  1.47s/it]

training loss: 3.3227875232696533


training:  36%|███▌      | 3925/10986 [1:29:52<2:46:11,  1.41s/it]

training loss: 3.3425779342651367


training:  36%|███▌      | 3926/10986 [1:29:53<2:42:31,  1.38s/it]

training loss: 3.437998056411743


training:  36%|███▌      | 3927/10986 [1:29:54<2:40:44,  1.37s/it]

training loss: 3.3592872619628906


training:  36%|███▌      | 3928/10986 [1:29:56<2:38:55,  1.35s/it]

training loss: 3.3088059425354004


training:  36%|███▌      | 3929/10986 [1:29:57<2:36:54,  1.33s/it]

training loss: 3.49462628364563


training:  36%|███▌      | 3930/10986 [1:29:58<2:36:04,  1.33s/it]

training loss: 3.375929832458496


training:  36%|███▌      | 3931/10986 [1:30:00<2:44:44,  1.40s/it]

training loss: 3.38405704498291


training:  36%|███▌      | 3932/10986 [1:30:01<2:51:32,  1.46s/it]

training loss: 3.4332292079925537


training:  36%|███▌      | 3933/10986 [1:30:03<2:46:10,  1.41s/it]

training loss: 3.345183849334717


training:  36%|███▌      | 3934/10986 [1:30:04<2:41:56,  1.38s/it]

training loss: 3.3298521041870117


training:  36%|███▌      | 3935/10986 [1:30:05<2:40:58,  1.37s/it]

training loss: 3.505117177963257


training:  36%|███▌      | 3936/10986 [1:30:07<2:39:44,  1.36s/it]

training loss: 3.4080045223236084


training:  36%|███▌      | 3937/10986 [1:30:08<2:37:34,  1.34s/it]

training loss: 3.4180819988250732


training:  36%|███▌      | 3938/10986 [1:30:09<2:38:16,  1.35s/it]

training loss: 3.4169387817382812


training:  36%|███▌      | 3939/10986 [1:30:11<2:51:53,  1.46s/it]

training loss: 3.4467687606811523


training:  36%|███▌      | 3940/10986 [1:30:13<3:01:48,  1.55s/it]

training loss: 3.515519380569458
valid loss: 3.513535499572754
perplexity: 33.566734313964844


training:  36%|███▌      | 3941/10986 [1:30:16<3:46:20,  1.93s/it]

training loss: 3.478024482727051


training:  36%|███▌      | 3942/10986 [1:30:17<3:27:23,  1.77s/it]

training loss: 3.429358959197998


training:  36%|███▌      | 3943/10986 [1:30:18<3:11:29,  1.63s/it]

training loss: 3.391390562057495


training:  36%|███▌      | 3944/10986 [1:30:20<2:59:41,  1.53s/it]

training loss: 3.3575942516326904


training:  36%|███▌      | 3945/10986 [1:30:21<2:51:20,  1.46s/it]

training loss: 3.337264060974121


training:  36%|███▌      | 3946/10986 [1:30:22<2:45:54,  1.41s/it]

training loss: 3.401050090789795


training:  36%|███▌      | 3947/10986 [1:30:24<2:43:36,  1.39s/it]

training loss: 3.393402099609375


training:  36%|███▌      | 3948/10986 [1:30:25<2:40:34,  1.37s/it]

training loss: 3.4266457557678223


training:  36%|███▌      | 3949/10986 [1:30:26<2:38:28,  1.35s/it]

training loss: 3.36201548576355


training:  36%|███▌      | 3950/10986 [1:30:28<2:36:58,  1.34s/it]

training loss: 3.4053187370300293


training:  36%|███▌      | 3951/10986 [1:30:29<2:47:14,  1.43s/it]

training loss: 3.465226411819458


training:  36%|███▌      | 3952/10986 [1:30:31<2:43:14,  1.39s/it]

training loss: 3.4767203330993652


training:  36%|███▌      | 3953/10986 [1:30:32<2:42:26,  1.39s/it]

training loss: 3.431851387023926


training:  36%|███▌      | 3954/10986 [1:30:33<2:40:03,  1.37s/it]

training loss: 3.290165424346924


training:  36%|███▌      | 3955/10986 [1:30:35<2:37:56,  1.35s/it]

training loss: 3.4282596111297607


training:  36%|███▌      | 3956/10986 [1:30:36<2:35:53,  1.33s/it]

training loss: 3.340665340423584


training:  36%|███▌      | 3957/10986 [1:30:37<2:34:49,  1.32s/it]

training loss: 3.4699058532714844


training:  36%|███▌      | 3958/10986 [1:30:38<2:33:44,  1.31s/it]

training loss: 3.3806893825531006


training:  36%|███▌      | 3959/10986 [1:30:40<2:33:10,  1.31s/it]

training loss: 3.4246559143066406


training:  36%|███▌      | 3960/10986 [1:30:41<2:32:47,  1.30s/it]

training loss: 3.340963363647461
valid loss: 3.3408586978912354
perplexity: 28.24336814880371


training:  36%|███▌      | 3961/10986 [1:30:44<3:24:09,  1.74s/it]

training loss: 3.4113850593566895


training:  36%|███▌      | 3962/10986 [1:30:45<3:19:27,  1.70s/it]

training loss: 3.3826236724853516


training:  36%|███▌      | 3963/10986 [1:30:47<3:04:55,  1.58s/it]

training loss: 3.4127039909362793


training:  36%|███▌      | 3964/10986 [1:30:48<2:55:15,  1.50s/it]

training loss: 3.3505125045776367


training:  36%|███▌      | 3965/10986 [1:30:49<2:47:45,  1.43s/it]

training loss: 3.499187707901001


training:  36%|███▌      | 3966/10986 [1:30:51<2:42:28,  1.39s/it]

training loss: 3.5648908615112305


training:  36%|███▌      | 3967/10986 [1:30:52<2:39:42,  1.37s/it]

training loss: 3.50765061378479


training:  36%|███▌      | 3968/10986 [1:30:53<2:39:19,  1.36s/it]

training loss: 3.505225419998169


training:  36%|███▌      | 3969/10986 [1:30:55<2:37:41,  1.35s/it]

training loss: 3.327472448348999


training:  36%|███▌      | 3970/10986 [1:30:56<2:35:53,  1.33s/it]

training loss: 3.4367334842681885


training:  36%|███▌      | 3971/10986 [1:30:57<2:44:10,  1.40s/it]

training loss: 3.5353569984436035


training:  36%|███▌      | 3972/10986 [1:30:59<2:49:15,  1.45s/it]

training loss: 3.526214838027954


training:  36%|███▌      | 3973/10986 [1:31:00<2:44:10,  1.40s/it]

training loss: 3.532351493835449


training:  36%|███▌      | 3974/10986 [1:31:02<2:41:29,  1.38s/it]

training loss: 3.365931272506714


training:  36%|███▌      | 3975/10986 [1:31:03<2:38:36,  1.36s/it]

training loss: 3.3337788581848145


training:  36%|███▌      | 3976/10986 [1:31:04<2:36:31,  1.34s/it]

training loss: 3.4131953716278076


training:  36%|███▌      | 3977/10986 [1:31:05<2:34:44,  1.32s/it]

training loss: 3.4642748832702637


training:  36%|███▌      | 3978/10986 [1:31:07<2:33:45,  1.32s/it]

training loss: 3.310932159423828


training:  36%|███▌      | 3979/10986 [1:31:08<2:34:18,  1.32s/it]

training loss: 3.4249987602233887


training:  36%|███▌      | 3980/10986 [1:31:09<2:33:12,  1.31s/it]

training loss: 3.413231611251831
valid loss: 3.4194412231445312
perplexity: 30.552337646484375


training:  36%|███▌      | 3981/10986 [1:31:12<3:23:14,  1.74s/it]

training loss: 3.4027926921844482


training:  36%|███▌      | 3982/10986 [1:31:14<3:11:29,  1.64s/it]

training loss: 3.4209094047546387


training:  36%|███▋      | 3983/10986 [1:31:15<2:59:33,  1.54s/it]

training loss: 3.471257448196411


training:  36%|███▋      | 3984/10986 [1:31:16<2:51:13,  1.47s/it]

training loss: 3.400418281555176


training:  36%|███▋      | 3985/10986 [1:31:17<2:45:51,  1.42s/it]

training loss: 3.5663418769836426


training:  36%|███▋      | 3986/10986 [1:31:19<2:42:43,  1.39s/it]

training loss: 3.422471523284912


training:  36%|███▋      | 3987/10986 [1:31:20<2:38:50,  1.36s/it]

training loss: 3.388517141342163


training:  36%|███▋      | 3988/10986 [1:31:21<2:36:58,  1.35s/it]

training loss: 3.3717334270477295


training:  36%|███▋      | 3989/10986 [1:31:23<2:34:49,  1.33s/it]

training loss: 3.338663339614868


training:  36%|███▋      | 3990/10986 [1:31:24<2:33:23,  1.32s/it]

training loss: 3.3720335960388184


training:  36%|███▋      | 3991/10986 [1:31:26<2:42:34,  1.39s/it]

training loss: 3.506196975708008


training:  36%|███▋      | 3992/10986 [1:31:27<2:40:52,  1.38s/it]

training loss: 3.5517115592956543


training:  36%|███▋      | 3993/10986 [1:31:28<2:37:19,  1.35s/it]

training loss: 3.597693681716919


training:  36%|███▋      | 3994/10986 [1:31:30<2:36:29,  1.34s/it]

training loss: 3.433762788772583


training:  36%|███▋      | 3995/10986 [1:31:31<2:34:36,  1.33s/it]

training loss: 3.484168291091919


training:  36%|███▋      | 3996/10986 [1:31:32<2:34:40,  1.33s/it]

training loss: 3.520555257797241


training:  36%|███▋      | 3997/10986 [1:31:33<2:35:12,  1.33s/it]

training loss: 3.43664288520813


training:  36%|███▋      | 3998/10986 [1:31:35<2:34:55,  1.33s/it]

training loss: 3.3912062644958496


training:  36%|███▋      | 3999/10986 [1:31:36<2:34:06,  1.32s/it]

training loss: 3.31646466255188


training:  36%|███▋      | 4000/10986 [1:31:37<2:33:25,  1.32s/it]

training loss: 3.4310500621795654
valid loss: 3.4263062477111816
perplexity: 30.762802124023438


training:  36%|███▋      | 4001/10986 [1:31:40<3:25:47,  1.77s/it]

training loss: 3.425278425216675


training:  36%|███▋      | 4002/10986 [1:31:42<3:22:50,  1.74s/it]

training loss: 3.465538263320923


training:  36%|███▋      | 4003/10986 [1:31:43<3:07:32,  1.61s/it]

training loss: 3.3652920722961426


training:  36%|███▋      | 4004/10986 [1:31:45<2:56:53,  1.52s/it]

training loss: 3.392913341522217


training:  36%|███▋      | 4005/10986 [1:31:46<2:50:50,  1.47s/it]

training loss: 3.4593355655670166


training:  36%|███▋      | 4006/10986 [1:31:47<2:44:48,  1.42s/it]

training loss: 3.393826484680176


training:  36%|███▋      | 4007/10986 [1:31:48<2:40:36,  1.38s/it]

training loss: 3.3591532707214355


training:  36%|███▋      | 4008/10986 [1:31:50<2:37:18,  1.35s/it]

training loss: 3.5465338230133057


training:  36%|███▋      | 4009/10986 [1:31:51<2:35:32,  1.34s/it]

training loss: 3.4653213024139404


training:  37%|███▋      | 4010/10986 [1:31:52<2:33:29,  1.32s/it]

training loss: 3.4717376232147217


training:  37%|███▋      | 4011/10986 [1:31:54<2:41:44,  1.39s/it]

training loss: 3.4281251430511475


training:  37%|███▋      | 4012/10986 [1:31:55<2:39:11,  1.37s/it]

training loss: 3.4612278938293457


training:  37%|███▋      | 4013/10986 [1:31:56<2:36:03,  1.34s/it]

training loss: 3.4326107501983643


training:  37%|███▋      | 4014/10986 [1:31:58<2:34:30,  1.33s/it]

training loss: 3.5618066787719727


training:  37%|███▋      | 4015/10986 [1:31:59<2:33:09,  1.32s/it]

training loss: 3.390212297439575


training:  37%|███▋      | 4016/10986 [1:32:00<2:32:18,  1.31s/it]

training loss: 3.337362766265869


training:  37%|███▋      | 4017/10986 [1:32:02<2:32:56,  1.32s/it]

training loss: 3.3262856006622314


training:  37%|███▋      | 4018/10986 [1:32:03<2:33:51,  1.32s/it]

training loss: 3.424799680709839


training:  37%|███▋      | 4019/10986 [1:32:04<2:32:43,  1.32s/it]

training loss: 3.443789005279541


training:  37%|███▋      | 4020/10986 [1:32:06<2:32:23,  1.31s/it]

training loss: 3.4806551933288574
valid loss: 3.4766533374786377
perplexity: 32.35127258300781


training:  37%|███▋      | 4021/10986 [1:32:08<3:23:05,  1.75s/it]

training loss: 3.3858907222747803


training:  37%|███▋      | 4022/10986 [1:32:10<3:18:50,  1.71s/it]

training loss: 3.2195324897766113


training:  37%|███▋      | 4023/10986 [1:32:11<3:04:39,  1.59s/it]

training loss: 3.474360942840576


training:  37%|███▋      | 4024/10986 [1:32:13<2:54:03,  1.50s/it]

training loss: 3.4200081825256348


training:  37%|███▋      | 4025/10986 [1:32:14<2:47:08,  1.44s/it]

training loss: 3.4727933406829834


training:  37%|███▋      | 4026/10986 [1:32:15<2:42:03,  1.40s/it]

training loss: 3.46028208732605


training:  37%|███▋      | 4027/10986 [1:32:17<2:38:30,  1.37s/it]

training loss: 3.3105721473693848


training:  37%|███▋      | 4028/10986 [1:32:18<2:35:46,  1.34s/it]

training loss: 3.3960068225860596


training:  37%|███▋      | 4029/10986 [1:32:19<2:34:17,  1.33s/it]

training loss: 3.305835008621216


training:  37%|███▋      | 4030/10986 [1:32:20<2:33:24,  1.32s/it]

training loss: 3.521963119506836


training:  37%|███▋      | 4031/10986 [1:32:22<2:41:39,  1.39s/it]

training loss: 3.384753465652466


training:  37%|███▋      | 4032/10986 [1:32:24<2:46:25,  1.44s/it]

training loss: 3.483950614929199


training:  37%|███▋      | 4033/10986 [1:32:25<2:41:37,  1.39s/it]

training loss: 3.399451971054077


training:  37%|███▋      | 4034/10986 [1:32:26<2:37:45,  1.36s/it]

training loss: 3.4441416263580322


training:  37%|███▋      | 4035/10986 [1:32:27<2:34:56,  1.34s/it]

training loss: 3.348111152648926


training:  37%|███▋      | 4036/10986 [1:32:29<2:33:21,  1.32s/it]

training loss: 3.4024014472961426


training:  37%|███▋      | 4037/10986 [1:32:30<2:32:05,  1.31s/it]

training loss: 3.410377025604248


training:  37%|███▋      | 4038/10986 [1:32:31<2:30:46,  1.30s/it]

training loss: 3.5498759746551514


training:  37%|███▋      | 4039/10986 [1:32:33<2:31:03,  1.30s/it]

training loss: 3.4104301929473877


training:  37%|███▋      | 4040/10986 [1:32:34<2:32:13,  1.31s/it]

training loss: 3.4860177040100098
valid loss: 3.4800186157226562
perplexity: 32.4603271484375


training:  37%|███▋      | 4041/10986 [1:32:37<3:21:47,  1.74s/it]

training loss: 3.421804189682007


training:  37%|███▋      | 4042/10986 [1:32:38<3:08:35,  1.63s/it]

training loss: 3.431614398956299


training:  37%|███▋      | 4043/10986 [1:32:39<2:57:45,  1.54s/it]

training loss: 3.5157651901245117


training:  37%|███▋      | 4044/10986 [1:32:41<2:49:18,  1.46s/it]

training loss: 3.480710506439209


training:  37%|███▋      | 4045/10986 [1:32:42<2:43:41,  1.42s/it]

training loss: 3.403639793395996


training:  37%|███▋      | 4046/10986 [1:32:43<2:40:09,  1.38s/it]

training loss: 3.4114913940429688


training:  37%|███▋      | 4047/10986 [1:32:45<2:37:41,  1.36s/it]

training loss: 3.476659059524536


training:  37%|███▋      | 4048/10986 [1:32:46<2:35:23,  1.34s/it]

training loss: 3.39589262008667


training:  37%|███▋      | 4049/10986 [1:32:47<2:33:24,  1.33s/it]

training loss: 3.4055542945861816


training:  37%|███▋      | 4050/10986 [1:32:48<2:31:54,  1.31s/it]

training loss: 3.333317995071411


training:  37%|███▋      | 4051/10986 [1:32:50<2:39:40,  1.38s/it]

training loss: 3.5233876705169678


training:  37%|███▋      | 4052/10986 [1:32:51<2:39:26,  1.38s/it]

training loss: 3.4956092834472656


training:  37%|███▋      | 4053/10986 [1:32:53<2:36:24,  1.35s/it]

training loss: 3.419261932373047


training:  37%|███▋      | 4054/10986 [1:32:54<2:34:34,  1.34s/it]

training loss: 3.3735873699188232


training:  37%|███▋      | 4055/10986 [1:32:55<2:33:27,  1.33s/it]

training loss: 3.2993576526641846


training:  37%|███▋      | 4056/10986 [1:32:57<2:32:57,  1.32s/it]

training loss: 3.5019750595092773


training:  37%|███▋      | 4057/10986 [1:32:58<2:32:27,  1.32s/it]

training loss: 3.4310379028320312


training:  37%|███▋      | 4058/10986 [1:32:59<2:32:25,  1.32s/it]

training loss: 3.362887144088745


training:  37%|███▋      | 4059/10986 [1:33:00<2:32:37,  1.32s/it]

training loss: 3.471243381500244


training:  37%|███▋      | 4060/10986 [1:33:02<2:32:55,  1.32s/it]

training loss: 3.483635902404785
valid loss: 3.4794726371765137
perplexity: 32.44260787963867


training:  37%|███▋      | 4061/10986 [1:33:05<3:24:33,  1.77s/it]

training loss: 3.435922384262085


training:  37%|███▋      | 4062/10986 [1:33:06<3:10:39,  1.65s/it]

training loss: 3.3115861415863037


training:  37%|███▋      | 4063/10986 [1:33:07<2:58:23,  1.55s/it]

training loss: 3.5562994480133057


training:  37%|███▋      | 4064/10986 [1:33:09<2:49:37,  1.47s/it]

training loss: 3.320399761199951


training:  37%|███▋      | 4065/10986 [1:33:10<2:43:55,  1.42s/it]

training loss: 3.4359095096588135


training:  37%|███▋      | 4066/10986 [1:33:11<2:39:10,  1.38s/it]

training loss: 3.4737436771392822


training:  37%|███▋      | 4067/10986 [1:33:12<2:36:16,  1.36s/it]

training loss: 3.435868740081787


training:  37%|███▋      | 4068/10986 [1:33:14<2:35:25,  1.35s/it]

training loss: 3.392228364944458


training:  37%|███▋      | 4069/10986 [1:33:15<2:33:41,  1.33s/it]

training loss: 3.480214834213257


training:  37%|███▋      | 4070/10986 [1:33:16<2:32:51,  1.33s/it]

training loss: 3.4244418144226074


training:  37%|███▋      | 4071/10986 [1:33:18<2:41:03,  1.40s/it]

training loss: 3.5203332901000977


training:  37%|███▋      | 4072/10986 [1:33:19<2:39:52,  1.39s/it]

training loss: 3.4033586978912354


training:  37%|███▋      | 4073/10986 [1:33:21<2:37:36,  1.37s/it]

training loss: 3.461264133453369


training:  37%|███▋      | 4074/10986 [1:33:22<2:34:34,  1.34s/it]

training loss: 3.3733487129211426


training:  37%|███▋      | 4075/10986 [1:33:23<2:32:59,  1.33s/it]

training loss: 3.344566583633423


training:  37%|███▋      | 4076/10986 [1:33:25<2:32:33,  1.32s/it]

training loss: 3.4702253341674805


training:  37%|███▋      | 4077/10986 [1:33:26<2:31:52,  1.32s/it]

training loss: 3.3813955783843994


training:  37%|███▋      | 4078/10986 [1:33:27<2:31:20,  1.31s/it]

training loss: 3.4431657791137695


training:  37%|███▋      | 4079/10986 [1:33:28<2:30:53,  1.31s/it]

training loss: 3.4023244380950928


training:  37%|███▋      | 4080/10986 [1:33:30<2:30:13,  1.31s/it]

training loss: 3.516915798187256
valid loss: 3.5302634239196777
perplexity: 34.132957458496094


training:  37%|███▋      | 4081/10986 [1:33:33<3:23:10,  1.77s/it]

training loss: 3.367046356201172


training:  37%|███▋      | 4082/10986 [1:33:34<3:22:50,  1.76s/it]

training loss: 3.3935136795043945


training:  37%|███▋      | 4083/10986 [1:33:36<3:21:44,  1.75s/it]

training loss: 3.376863479614258


training:  37%|███▋      | 4084/10986 [1:33:38<3:12:12,  1.67s/it]

training loss: 3.5129268169403076


training:  37%|███▋      | 4085/10986 [1:33:39<2:59:48,  1.56s/it]

training loss: 3.358693838119507


training:  37%|███▋      | 4086/10986 [1:33:40<2:50:04,  1.48s/it]

training loss: 3.342661142349243


training:  37%|███▋      | 4087/10986 [1:33:41<2:44:24,  1.43s/it]

training loss: 3.34389066696167


training:  37%|███▋      | 4088/10986 [1:33:43<2:40:02,  1.39s/it]

training loss: 3.393491744995117


training:  37%|███▋      | 4089/10986 [1:33:44<2:38:03,  1.38s/it]

training loss: 3.5388684272766113


training:  37%|███▋      | 4090/10986 [1:33:45<2:35:01,  1.35s/it]

training loss: 3.406365394592285


training:  37%|███▋      | 4091/10986 [1:33:47<2:43:16,  1.42s/it]

training loss: 3.3084897994995117


training:  37%|███▋      | 4092/10986 [1:33:48<2:45:09,  1.44s/it]

training loss: 3.3804638385772705


training:  37%|███▋      | 4093/10986 [1:33:50<2:40:29,  1.40s/it]

training loss: 3.378822088241577


training:  37%|███▋      | 4094/10986 [1:33:51<2:37:35,  1.37s/it]

training loss: 3.3248674869537354


training:  37%|███▋      | 4095/10986 [1:33:52<2:34:52,  1.35s/it]

training loss: 3.4262449741363525


training:  37%|███▋      | 4096/10986 [1:33:54<2:32:50,  1.33s/it]

training loss: 3.5029213428497314


training:  37%|███▋      | 4097/10986 [1:33:55<2:31:23,  1.32s/it]

training loss: 3.4627535343170166


training:  37%|███▋      | 4098/10986 [1:33:56<2:30:39,  1.31s/it]

training loss: 3.368403196334839


training:  37%|███▋      | 4099/10986 [1:33:58<2:30:30,  1.31s/it]

training loss: 3.426833152770996


training:  37%|███▋      | 4100/10986 [1:33:59<2:30:38,  1.31s/it]

training loss: 3.447444438934326
valid loss: 3.4463155269622803
perplexity: 31.38454246520996


training:  37%|███▋      | 4101/10986 [1:34:02<3:20:19,  1.75s/it]

training loss: 3.3239364624023438


training:  37%|███▋      | 4102/10986 [1:34:03<3:07:50,  1.64s/it]

training loss: 3.3751208782196045


training:  37%|███▋      | 4103/10986 [1:34:04<2:58:45,  1.56s/it]

training loss: 3.5332117080688477


training:  37%|███▋      | 4104/10986 [1:34:06<2:49:32,  1.48s/it]

training loss: 3.3562066555023193


training:  37%|███▋      | 4105/10986 [1:34:07<2:42:39,  1.42s/it]

training loss: 3.483245372772217


training:  37%|███▋      | 4106/10986 [1:34:08<2:38:23,  1.38s/it]

training loss: 3.370582342147827


training:  37%|███▋      | 4107/10986 [1:34:10<2:35:40,  1.36s/it]

training loss: 3.3868446350097656


training:  37%|███▋      | 4108/10986 [1:34:11<2:32:39,  1.33s/it]

training loss: 3.4231927394866943


training:  37%|███▋      | 4109/10986 [1:34:12<2:31:50,  1.32s/it]

training loss: 3.3681726455688477


training:  37%|███▋      | 4110/10986 [1:34:13<2:30:56,  1.32s/it]

training loss: 3.540729522705078


training:  37%|███▋      | 4111/10986 [1:34:15<2:41:01,  1.41s/it]

training loss: 3.3910348415374756


training:  37%|███▋      | 4112/10986 [1:34:17<2:49:13,  1.48s/it]

training loss: 3.3810853958129883


training:  37%|███▋      | 4113/10986 [1:34:18<2:43:28,  1.43s/it]

training loss: 3.3305318355560303


training:  37%|███▋      | 4114/10986 [1:34:19<2:39:43,  1.39s/it]

training loss: 3.4559426307678223


training:  37%|███▋      | 4115/10986 [1:34:21<2:36:14,  1.36s/it]

training loss: 3.3382046222686768


training:  37%|███▋      | 4116/10986 [1:34:22<2:33:31,  1.34s/it]

training loss: 3.4117534160614014


training:  37%|███▋      | 4117/10986 [1:34:23<2:32:35,  1.33s/it]

training loss: 3.2993953227996826


training:  37%|███▋      | 4118/10986 [1:34:25<2:32:16,  1.33s/it]

training loss: 3.3758058547973633


training:  37%|███▋      | 4119/10986 [1:34:26<2:31:28,  1.32s/it]

training loss: 3.4119598865509033


training:  38%|███▊      | 4120/10986 [1:34:27<2:30:24,  1.31s/it]

training loss: 3.3854801654815674
valid loss: 3.3826651573181152
perplexity: 29.449153900146484


training:  38%|███▊      | 4121/10986 [1:34:30<3:20:39,  1.75s/it]

training loss: 3.3418519496917725


training:  38%|███▊      | 4122/10986 [1:34:31<3:09:13,  1.65s/it]

training loss: 3.4076688289642334


training:  38%|███▊      | 4123/10986 [1:34:33<2:57:46,  1.55s/it]

training loss: 3.3876776695251465


training:  38%|███▊      | 4124/10986 [1:34:34<2:50:21,  1.49s/it]

training loss: 3.329456329345703


training:  38%|███▊      | 4125/10986 [1:34:35<2:44:45,  1.44s/it]

training loss: 3.374326229095459


training:  38%|███▊      | 4126/10986 [1:34:37<2:39:23,  1.39s/it]

training loss: 3.419826030731201


training:  38%|███▊      | 4127/10986 [1:34:38<2:36:21,  1.37s/it]

training loss: 3.342756986618042


training:  38%|███▊      | 4128/10986 [1:34:39<2:34:59,  1.36s/it]

training loss: 3.450481653213501


training:  38%|███▊      | 4129/10986 [1:34:41<2:33:27,  1.34s/it]

training loss: 3.417842149734497


training:  38%|███▊      | 4130/10986 [1:34:42<2:32:14,  1.33s/it]

training loss: 3.32066011428833


training:  38%|███▊      | 4131/10986 [1:34:43<2:41:15,  1.41s/it]

training loss: 3.381138324737549


training:  38%|███▊      | 4132/10986 [1:34:45<2:49:26,  1.48s/it]

training loss: 3.3870768547058105


training:  38%|███▊      | 4133/10986 [1:34:46<2:43:23,  1.43s/it]

training loss: 3.4244747161865234


training:  38%|███▊      | 4134/10986 [1:34:48<2:39:19,  1.40s/it]

training loss: 3.3606038093566895


training:  38%|███▊      | 4135/10986 [1:34:49<2:36:41,  1.37s/it]

training loss: 3.48093318939209


training:  38%|███▊      | 4136/10986 [1:34:50<2:34:00,  1.35s/it]

training loss: 3.3495326042175293


training:  38%|███▊      | 4137/10986 [1:34:52<2:33:39,  1.35s/it]

training loss: 3.5738298892974854


training:  38%|███▊      | 4138/10986 [1:34:53<2:32:10,  1.33s/it]

training loss: 3.498474597930908


training:  38%|███▊      | 4139/10986 [1:34:54<2:31:51,  1.33s/it]

training loss: 3.4135727882385254


training:  38%|███▊      | 4140/10986 [1:34:56<2:31:26,  1.33s/it]

training loss: 3.4708187580108643
valid loss: 3.462797164916992
perplexity: 31.906097412109375


training:  38%|███▊      | 4141/10986 [1:34:58<3:21:17,  1.76s/it]

training loss: 3.3834195137023926


training:  38%|███▊      | 4142/10986 [1:35:00<3:07:21,  1.64s/it]

training loss: 3.478337049484253


training:  38%|███▊      | 4143/10986 [1:35:01<2:55:57,  1.54s/it]

training loss: 3.447571277618408


training:  38%|███▊      | 4144/10986 [1:35:02<2:47:27,  1.47s/it]

training loss: 3.4185867309570312


training:  38%|███▊      | 4145/10986 [1:35:04<2:42:24,  1.42s/it]

training loss: 3.421288013458252


training:  38%|███▊      | 4146/10986 [1:35:05<2:39:03,  1.40s/it]

training loss: 3.3561081886291504


training:  38%|███▊      | 4147/10986 [1:35:06<2:35:37,  1.37s/it]

training loss: 3.3980069160461426


training:  38%|███▊      | 4148/10986 [1:35:08<2:33:27,  1.35s/it]

training loss: 3.363003730773926


training:  38%|███▊      | 4149/10986 [1:35:09<2:31:47,  1.33s/it]

training loss: 3.427976131439209


training:  38%|███▊      | 4150/10986 [1:35:10<2:31:02,  1.33s/it]

training loss: 3.3844480514526367


training:  38%|███▊      | 4151/10986 [1:35:12<2:39:13,  1.40s/it]

training loss: 3.4543139934539795


training:  38%|███▊      | 4152/10986 [1:35:13<2:39:54,  1.40s/it]

training loss: 3.399589776992798


training:  38%|███▊      | 4153/10986 [1:35:15<2:36:29,  1.37s/it]

training loss: 3.4632480144500732


training:  38%|███▊      | 4154/10986 [1:35:16<2:33:37,  1.35s/it]

training loss: 3.3299362659454346


training:  38%|███▊      | 4155/10986 [1:35:17<2:31:33,  1.33s/it]

training loss: 3.4757161140441895


training:  38%|███▊      | 4156/10986 [1:35:18<2:29:54,  1.32s/it]

training loss: 3.428662061691284


training:  38%|███▊      | 4157/10986 [1:35:20<2:29:12,  1.31s/it]

training loss: 3.4677271842956543


training:  38%|███▊      | 4158/10986 [1:35:21<2:28:06,  1.30s/it]

training loss: 3.4860806465148926


training:  38%|███▊      | 4159/10986 [1:35:22<2:27:43,  1.30s/it]

training loss: 3.5039265155792236


training:  38%|███▊      | 4160/10986 [1:35:24<2:27:29,  1.30s/it]

training loss: 3.422966957092285
valid loss: 3.4240622520446777
perplexity: 30.693849563598633


training:  38%|███▊      | 4161/10986 [1:35:26<3:18:34,  1.75s/it]

training loss: 3.3515517711639404


training:  38%|███▊      | 4162/10986 [1:35:28<3:14:46,  1.71s/it]

training loss: 3.4065327644348145


training:  38%|███▊      | 4163/10986 [1:35:29<3:00:46,  1.59s/it]

training loss: 3.3857688903808594


training:  38%|███▊      | 4164/10986 [1:35:31<2:50:12,  1.50s/it]

training loss: 3.52546763420105


training:  38%|███▊      | 4165/10986 [1:35:32<2:43:44,  1.44s/it]

training loss: 3.3251395225524902


training:  38%|███▊      | 4166/10986 [1:35:33<2:39:17,  1.40s/it]

training loss: 3.4115071296691895


training:  38%|███▊      | 4167/10986 [1:35:35<2:36:10,  1.37s/it]

training loss: 3.5176870822906494


training:  38%|███▊      | 4168/10986 [1:35:36<2:34:58,  1.36s/it]

training loss: 3.425551414489746


training:  38%|███▊      | 4169/10986 [1:35:37<2:33:45,  1.35s/it]

training loss: 3.5535693168640137


training:  38%|███▊      | 4170/10986 [1:35:39<2:32:49,  1.35s/it]

training loss: 3.385463237762451


training:  38%|███▊      | 4171/10986 [1:35:40<2:41:53,  1.43s/it]

training loss: 3.3926427364349365


training:  38%|███▊      | 4172/10986 [1:35:41<2:38:02,  1.39s/it]

training loss: 3.4655327796936035


training:  38%|███▊      | 4173/10986 [1:35:43<2:34:52,  1.36s/it]

training loss: 3.366763114929199


training:  38%|███▊      | 4174/10986 [1:35:44<2:32:41,  1.34s/it]

training loss: 3.485611915588379


training:  38%|███▊      | 4175/10986 [1:35:45<2:31:33,  1.34s/it]

training loss: 3.3752410411834717


training:  38%|███▊      | 4176/10986 [1:35:47<2:31:42,  1.34s/it]

training loss: 3.427950382232666


training:  38%|███▊      | 4177/10986 [1:35:48<2:30:22,  1.33s/it]

training loss: 3.275887966156006


training:  38%|███▊      | 4178/10986 [1:35:49<2:29:28,  1.32s/it]

training loss: 3.327785015106201


training:  38%|███▊      | 4179/10986 [1:35:51<2:28:50,  1.31s/it]

training loss: 3.3997740745544434


training:  38%|███▊      | 4180/10986 [1:35:52<2:28:33,  1.31s/it]

training loss: 3.302217960357666
valid loss: 3.296567916870117
perplexity: 27.019744873046875


training:  38%|███▊      | 4181/10986 [1:35:55<3:21:42,  1.78s/it]

training loss: 3.411271810531616


training:  38%|███▊      | 4182/10986 [1:35:56<3:08:27,  1.66s/it]

training loss: 3.4530885219573975


training:  38%|███▊      | 4183/10986 [1:35:57<2:56:12,  1.55s/it]

training loss: 3.3576643466949463


training:  38%|███▊      | 4184/10986 [1:35:59<2:47:45,  1.48s/it]

training loss: 3.4715280532836914


training:  38%|███▊      | 4185/10986 [1:36:00<2:42:34,  1.43s/it]

training loss: 3.442385196685791


training:  38%|███▊      | 4186/10986 [1:36:01<2:38:50,  1.40s/it]

training loss: 3.4170756340026855


training:  38%|███▊      | 4187/10986 [1:36:03<2:36:45,  1.38s/it]

training loss: 3.4484755992889404


training:  38%|███▊      | 4188/10986 [1:36:04<2:34:10,  1.36s/it]

training loss: 3.3464341163635254


training:  38%|███▊      | 4189/10986 [1:36:05<2:34:15,  1.36s/it]

training loss: 3.3647358417510986


training:  38%|███▊      | 4190/10986 [1:36:07<2:33:12,  1.35s/it]

training loss: 3.4526238441467285


training:  38%|███▊      | 4191/10986 [1:36:08<2:40:52,  1.42s/it]

training loss: 3.46157169342041


training:  38%|███▊      | 4192/10986 [1:36:10<2:37:14,  1.39s/it]

training loss: 3.513625383377075


training:  38%|███▊      | 4193/10986 [1:36:11<2:34:14,  1.36s/it]

training loss: 3.5635225772857666


training:  38%|███▊      | 4194/10986 [1:36:12<2:32:32,  1.35s/it]

training loss: 3.4838809967041016


training:  38%|███▊      | 4195/10986 [1:36:14<2:31:40,  1.34s/it]

training loss: 3.2350234985351562


training:  38%|███▊      | 4196/10986 [1:36:15<2:30:39,  1.33s/it]

training loss: 3.3665218353271484


training:  38%|███▊      | 4197/10986 [1:36:16<2:30:34,  1.33s/it]

training loss: 3.4028706550598145


training:  38%|███▊      | 4198/10986 [1:36:18<2:29:45,  1.32s/it]

training loss: 3.476792335510254


training:  38%|███▊      | 4199/10986 [1:36:19<2:29:12,  1.32s/it]

training loss: 3.4587388038635254


training:  38%|███▊      | 4200/10986 [1:36:20<2:29:08,  1.32s/it]

training loss: 3.3918216228485107
valid loss: 3.391475200653076
perplexity: 29.709749221801758


training:  38%|███▊      | 4201/10986 [1:36:23<3:19:56,  1.77s/it]

training loss: 3.3837814331054688


training:  38%|███▊      | 4202/10986 [1:36:24<3:06:53,  1.65s/it]

training loss: 3.283485174179077


training:  38%|███▊      | 4203/10986 [1:36:26<2:54:53,  1.55s/it]

training loss: 3.519409656524658


training:  38%|███▊      | 4204/10986 [1:36:27<2:46:48,  1.48s/it]

training loss: 3.4396328926086426


training:  38%|███▊      | 4205/10986 [1:36:28<2:40:29,  1.42s/it]

training loss: 3.4413108825683594


training:  38%|███▊      | 4206/10986 [1:36:30<2:36:33,  1.39s/it]

training loss: 3.3146860599517822


training:  38%|███▊      | 4207/10986 [1:36:31<2:33:43,  1.36s/it]

training loss: 3.446641206741333


training:  38%|███▊      | 4208/10986 [1:36:32<2:33:24,  1.36s/it]

training loss: 3.397488594055176


training:  38%|███▊      | 4209/10986 [1:36:34<2:31:11,  1.34s/it]

training loss: 3.469433546066284


training:  38%|███▊      | 4210/10986 [1:36:35<2:29:36,  1.32s/it]

training loss: 3.4110357761383057


training:  38%|███▊      | 4211/10986 [1:36:36<2:38:07,  1.40s/it]

training loss: 3.275331974029541


training:  38%|███▊      | 4212/10986 [1:36:38<2:38:38,  1.41s/it]

training loss: 3.4646222591400146


training:  38%|███▊      | 4213/10986 [1:36:39<2:34:09,  1.37s/it]

training loss: 3.370148181915283


training:  38%|███▊      | 4214/10986 [1:36:40<2:32:32,  1.35s/it]

training loss: 3.4046590328216553


training:  38%|███▊      | 4215/10986 [1:36:42<2:30:36,  1.33s/it]

training loss: 3.4306938648223877


training:  38%|███▊      | 4216/10986 [1:36:43<2:31:36,  1.34s/it]

training loss: 3.386746406555176


training:  38%|███▊      | 4217/10986 [1:36:44<2:29:40,  1.33s/it]

training loss: 3.5003559589385986


training:  38%|███▊      | 4218/10986 [1:36:46<2:27:46,  1.31s/it]

training loss: 3.407438278198242


training:  38%|███▊      | 4219/10986 [1:36:47<2:26:29,  1.30s/it]

training loss: 3.4490954875946045


training:  38%|███▊      | 4220/10986 [1:36:48<2:27:00,  1.30s/it]

training loss: 3.397507905960083
valid loss: 3.396580219268799
perplexity: 29.861804962158203


training:  38%|███▊      | 4221/10986 [1:36:51<3:15:54,  1.74s/it]

training loss: 3.4508726596832275


training:  38%|███▊      | 4222/10986 [1:36:53<3:11:57,  1.70s/it]

training loss: 3.440387010574341


training:  38%|███▊      | 4223/10986 [1:36:54<2:58:17,  1.58s/it]

training loss: 3.310443878173828


training:  38%|███▊      | 4224/10986 [1:36:55<2:49:29,  1.50s/it]

training loss: 3.4922196865081787


training:  38%|███▊      | 4225/10986 [1:36:56<2:42:04,  1.44s/it]

training loss: 3.3447823524475098


training:  38%|███▊      | 4226/10986 [1:36:58<2:51:10,  1.52s/it]

training loss: 3.283493757247925


training:  38%|███▊      | 4227/10986 [1:37:00<2:57:29,  1.58s/it]

training loss: 3.4412238597869873


training:  38%|███▊      | 4228/10986 [1:37:01<2:52:28,  1.53s/it]

training loss: 3.3054745197296143


training:  38%|███▊      | 4229/10986 [1:37:03<2:44:18,  1.46s/it]

training loss: 3.295898914337158


training:  39%|███▊      | 4230/10986 [1:37:04<2:38:42,  1.41s/it]

training loss: 3.437767267227173


training:  39%|███▊      | 4231/10986 [1:37:05<2:44:23,  1.46s/it]

training loss: 3.4038548469543457


training:  39%|███▊      | 4232/10986 [1:37:07<2:48:23,  1.50s/it]

training loss: 3.4070281982421875


training:  39%|███▊      | 4233/10986 [1:37:08<2:42:27,  1.44s/it]

training loss: 3.393313407897949


training:  39%|███▊      | 4234/10986 [1:37:10<2:39:15,  1.42s/it]

training loss: 3.344979763031006


training:  39%|███▊      | 4235/10986 [1:37:11<2:35:56,  1.39s/it]

training loss: 3.387497663497925


training:  39%|███▊      | 4236/10986 [1:37:12<2:32:39,  1.36s/it]

training loss: 3.378316879272461


training:  39%|███▊      | 4237/10986 [1:37:14<2:30:21,  1.34s/it]

training loss: 3.383568525314331


training:  39%|███▊      | 4238/10986 [1:37:15<2:29:31,  1.33s/it]

training loss: 3.4553308486938477


training:  39%|███▊      | 4239/10986 [1:37:16<2:28:11,  1.32s/it]

training loss: 3.3914742469787598


training:  39%|███▊      | 4240/10986 [1:37:18<2:27:34,  1.31s/it]

training loss: 3.343071699142456
valid loss: 3.338481903076172
perplexity: 28.176319122314453


training:  39%|███▊      | 4241/10986 [1:37:20<3:16:39,  1.75s/it]

training loss: 3.282038927078247


training:  39%|███▊      | 4242/10986 [1:37:22<3:04:10,  1.64s/it]

training loss: 3.4663312435150146


training:  39%|███▊      | 4243/10986 [1:37:23<2:52:52,  1.54s/it]

training loss: 3.402468204498291


training:  39%|███▊      | 4244/10986 [1:37:24<2:44:49,  1.47s/it]

training loss: 3.320356607437134


training:  39%|███▊      | 4245/10986 [1:37:26<2:39:30,  1.42s/it]

training loss: 3.43190336227417


training:  39%|███▊      | 4246/10986 [1:37:27<2:35:07,  1.38s/it]

training loss: 3.424628973007202


training:  39%|███▊      | 4247/10986 [1:37:28<2:32:31,  1.36s/it]

training loss: 3.41382098197937


training:  39%|███▊      | 4248/10986 [1:37:29<2:30:38,  1.34s/it]

training loss: 3.411745071411133


training:  39%|███▊      | 4249/10986 [1:37:31<2:29:15,  1.33s/it]

training loss: 3.392331123352051


training:  39%|███▊      | 4250/10986 [1:37:32<2:29:38,  1.33s/it]

training loss: 3.3956832885742188


training:  39%|███▊      | 4251/10986 [1:37:34<2:38:47,  1.41s/it]

training loss: 3.400261640548706


training:  39%|███▊      | 4252/10986 [1:37:35<2:35:00,  1.38s/it]

training loss: 3.3808932304382324


training:  39%|███▊      | 4253/10986 [1:37:36<2:33:14,  1.37s/it]

training loss: 3.3894946575164795


training:  39%|███▊      | 4254/10986 [1:37:38<2:31:44,  1.35s/it]

training loss: 3.3307456970214844


training:  39%|███▊      | 4255/10986 [1:37:39<2:30:18,  1.34s/it]

training loss: 3.502251148223877


training:  39%|███▊      | 4256/10986 [1:37:40<2:30:44,  1.34s/it]

training loss: 3.3690531253814697


training:  39%|███▊      | 4257/10986 [1:37:42<2:29:05,  1.33s/it]

training loss: 3.472830057144165


training:  39%|███▉      | 4258/10986 [1:37:43<2:27:47,  1.32s/it]

training loss: 3.342599868774414


training:  39%|███▉      | 4259/10986 [1:37:44<2:27:23,  1.31s/it]

training loss: 3.5211095809936523


training:  39%|███▉      | 4260/10986 [1:37:46<2:26:15,  1.30s/it]

training loss: 3.4503579139709473
valid loss: 3.447413206100464
perplexity: 31.41901206970215


training:  39%|███▉      | 4261/10986 [1:37:48<3:15:51,  1.75s/it]

training loss: 3.507416009902954


training:  39%|███▉      | 4262/10986 [1:37:50<3:03:21,  1.64s/it]

training loss: 3.3609821796417236


training:  39%|███▉      | 4263/10986 [1:37:51<2:52:34,  1.54s/it]

training loss: 3.4607558250427246


training:  39%|███▉      | 4264/10986 [1:37:52<2:45:07,  1.47s/it]

training loss: 3.4386327266693115


training:  39%|███▉      | 4265/10986 [1:37:54<2:39:30,  1.42s/it]

training loss: 3.440131187438965


training:  39%|███▉      | 4266/10986 [1:37:55<2:35:07,  1.39s/it]

training loss: 3.3696978092193604


training:  39%|███▉      | 4267/10986 [1:37:56<2:32:24,  1.36s/it]

training loss: 3.304749011993408


training:  39%|███▉      | 4268/10986 [1:37:58<2:30:38,  1.35s/it]

training loss: 3.3525755405426025


training:  39%|███▉      | 4269/10986 [1:37:59<2:29:40,  1.34s/it]

training loss: 3.371722459793091


training:  39%|███▉      | 4270/10986 [1:38:00<2:28:15,  1.32s/it]

training loss: 3.380582094192505


training:  39%|███▉      | 4271/10986 [1:38:02<2:36:03,  1.39s/it]

training loss: 3.228503465652466


training:  39%|███▉      | 4272/10986 [1:38:03<2:34:35,  1.38s/it]

training loss: 3.3692774772644043


training:  39%|███▉      | 4273/10986 [1:38:04<2:32:14,  1.36s/it]

training loss: 3.3084716796875


training:  39%|███▉      | 4274/10986 [1:38:06<2:30:40,  1.35s/it]

training loss: 3.314894914627075


training:  39%|███▉      | 4275/10986 [1:38:07<2:29:27,  1.34s/it]

training loss: 3.3678884506225586


training:  39%|███▉      | 4276/10986 [1:38:08<2:28:48,  1.33s/it]

training loss: 3.4704387187957764


training:  39%|███▉      | 4277/10986 [1:38:10<2:29:43,  1.34s/it]

training loss: 3.3989932537078857


training:  39%|███▉      | 4278/10986 [1:38:11<2:29:11,  1.33s/it]

training loss: 3.446885585784912


training:  39%|███▉      | 4279/10986 [1:38:12<2:28:25,  1.33s/it]

training loss: 3.4016640186309814


training:  39%|███▉      | 4280/10986 [1:38:14<2:28:23,  1.33s/it]

training loss: 3.3626999855041504
valid loss: 3.361238479614258
perplexity: 28.82486915588379


training:  39%|███▉      | 4281/10986 [1:38:16<3:17:29,  1.77s/it]

training loss: 3.439469337463379


training:  39%|███▉      | 4282/10986 [1:38:18<3:05:26,  1.66s/it]

training loss: 3.475564479827881


training:  39%|███▉      | 4283/10986 [1:38:19<2:54:26,  1.56s/it]

training loss: 3.433612823486328


training:  39%|███▉      | 4284/10986 [1:38:20<2:46:28,  1.49s/it]

training loss: 3.4035215377807617


training:  39%|███▉      | 4285/10986 [1:38:22<2:40:42,  1.44s/it]

training loss: 3.4547104835510254


training:  39%|███▉      | 4286/10986 [1:38:23<2:36:13,  1.40s/it]

training loss: 3.5178885459899902


training:  39%|███▉      | 4287/10986 [1:38:24<2:33:59,  1.38s/it]

training loss: 3.435805559158325


training:  39%|███▉      | 4288/10986 [1:38:26<2:31:43,  1.36s/it]

training loss: 3.318413019180298


training:  39%|███▉      | 4289/10986 [1:38:27<2:30:48,  1.35s/it]

training loss: 3.4253950119018555


training:  39%|███▉      | 4290/10986 [1:38:28<2:29:22,  1.34s/it]

training loss: 3.3216543197631836


training:  39%|███▉      | 4291/10986 [1:38:30<2:37:15,  1.41s/it]

training loss: 3.3837783336639404


training:  39%|███▉      | 4292/10986 [1:38:31<2:34:11,  1.38s/it]

training loss: 3.3649306297302246


training:  39%|███▉      | 4293/10986 [1:38:33<2:31:40,  1.36s/it]

training loss: 3.4272360801696777


training:  39%|███▉      | 4294/10986 [1:38:34<2:30:01,  1.35s/it]

training loss: 3.4064102172851562


training:  39%|███▉      | 4295/10986 [1:38:35<2:29:25,  1.34s/it]

training loss: 3.360832929611206


training:  39%|███▉      | 4296/10986 [1:38:37<2:27:39,  1.32s/it]

training loss: 3.446648597717285


training:  39%|███▉      | 4297/10986 [1:38:38<2:27:06,  1.32s/it]

training loss: 3.47629714012146


training:  39%|███▉      | 4298/10986 [1:38:39<2:26:48,  1.32s/it]

training loss: 3.3495914936065674


training:  39%|███▉      | 4299/10986 [1:38:41<2:28:40,  1.33s/it]

training loss: 3.2679383754730225


training:  39%|███▉      | 4300/10986 [1:38:42<2:27:59,  1.33s/it]

training loss: 3.4709346294403076
valid loss: 3.473714828491211
perplexity: 32.25634765625


training:  39%|███▉      | 4301/10986 [1:38:45<3:16:36,  1.76s/it]

training loss: 3.439210891723633


training:  39%|███▉      | 4302/10986 [1:38:46<3:04:59,  1.66s/it]

training loss: 3.3903136253356934


training:  39%|███▉      | 4303/10986 [1:38:47<2:53:09,  1.55s/it]

training loss: 3.4389359951019287


training:  39%|███▉      | 4304/10986 [1:38:49<2:44:47,  1.48s/it]

training loss: 3.3357551097869873


training:  39%|███▉      | 4305/10986 [1:38:50<2:38:49,  1.43s/it]

training loss: 3.3713560104370117


training:  39%|███▉      | 4306/10986 [1:38:51<2:35:13,  1.39s/it]

training loss: 3.502462148666382


training:  39%|███▉      | 4307/10986 [1:38:53<2:32:06,  1.37s/it]

training loss: 3.387624740600586


training:  39%|███▉      | 4308/10986 [1:38:54<2:30:07,  1.35s/it]

training loss: 3.421333074569702


training:  39%|███▉      | 4309/10986 [1:38:55<2:28:28,  1.33s/it]

training loss: 3.3228278160095215


training:  39%|███▉      | 4310/10986 [1:38:56<2:27:44,  1.33s/it]

training loss: 3.373307704925537


training:  39%|███▉      | 4311/10986 [1:38:58<2:36:31,  1.41s/it]

training loss: 3.4632039070129395


training:  39%|███▉      | 4312/10986 [1:38:59<2:33:44,  1.38s/it]

training loss: 3.43507719039917


training:  39%|███▉      | 4313/10986 [1:39:01<2:31:29,  1.36s/it]

training loss: 3.4807910919189453


training:  39%|███▉      | 4314/10986 [1:39:02<2:31:05,  1.36s/it]

training loss: 3.516141653060913


training:  39%|███▉      | 4315/10986 [1:39:03<2:29:20,  1.34s/it]

training loss: 3.4025821685791016


training:  39%|███▉      | 4316/10986 [1:39:05<2:28:16,  1.33s/it]

training loss: 3.3501508235931396


training:  39%|███▉      | 4317/10986 [1:39:06<2:26:56,  1.32s/it]

training loss: 3.396299123764038


training:  39%|███▉      | 4318/10986 [1:39:07<2:27:09,  1.32s/it]

training loss: 3.347336769104004


training:  39%|███▉      | 4319/10986 [1:39:09<2:26:37,  1.32s/it]

training loss: 3.403710126876831


training:  39%|███▉      | 4320/10986 [1:39:10<2:27:21,  1.33s/it]

training loss: 3.525008201599121
valid loss: 3.521846294403076
perplexity: 33.84686279296875


training:  39%|███▉      | 4321/10986 [1:39:13<3:18:00,  1.78s/it]

training loss: 3.416076183319092


training:  39%|███▉      | 4322/10986 [1:39:14<3:06:25,  1.68s/it]

training loss: 3.4367644786834717


training:  39%|███▉      | 4323/10986 [1:39:16<2:54:03,  1.57s/it]

training loss: 3.435227870941162


training:  39%|███▉      | 4324/10986 [1:39:17<2:45:27,  1.49s/it]

training loss: 3.459871292114258


training:  39%|███▉      | 4325/10986 [1:39:18<2:39:42,  1.44s/it]

training loss: 3.3806674480438232


training:  39%|███▉      | 4326/10986 [1:39:20<2:35:33,  1.40s/it]

training loss: 3.3503777980804443


training:  39%|███▉      | 4327/10986 [1:39:21<2:32:41,  1.38s/it]

training loss: 3.323678970336914


training:  39%|███▉      | 4328/10986 [1:39:22<2:30:45,  1.36s/it]

training loss: 3.316606283187866


training:  39%|███▉      | 4329/10986 [1:39:23<2:29:13,  1.34s/it]

training loss: 3.306170701980591


training:  39%|███▉      | 4330/10986 [1:39:25<2:28:36,  1.34s/it]

training loss: 3.4007928371429443


training:  39%|███▉      | 4331/10986 [1:39:26<2:36:59,  1.42s/it]

training loss: 3.2720441818237305


training:  39%|███▉      | 4332/10986 [1:39:28<2:42:53,  1.47s/it]

training loss: 3.432961940765381


training:  39%|███▉      | 4333/10986 [1:39:29<2:37:45,  1.42s/it]

training loss: 3.3943076133728027


training:  39%|███▉      | 4334/10986 [1:39:31<2:33:31,  1.38s/it]

training loss: 3.334857940673828


training:  39%|███▉      | 4335/10986 [1:39:32<2:32:06,  1.37s/it]

training loss: 3.576934576034546


training:  39%|███▉      | 4336/10986 [1:39:33<2:29:50,  1.35s/it]

training loss: 3.4460277557373047


training:  39%|███▉      | 4337/10986 [1:39:35<2:28:27,  1.34s/it]

training loss: 3.2642252445220947


training:  39%|███▉      | 4338/10986 [1:39:36<2:28:02,  1.34s/it]

training loss: 3.361325740814209


training:  39%|███▉      | 4339/10986 [1:39:37<2:27:47,  1.33s/it]

training loss: 3.3519344329833984


training:  40%|███▉      | 4340/10986 [1:39:39<2:27:36,  1.33s/it]

training loss: 3.347111701965332
valid loss: 3.340447425842285
perplexity: 28.231754302978516


training:  40%|███▉      | 4341/10986 [1:39:41<3:16:51,  1.78s/it]

training loss: 3.3358635902404785


training:  40%|███▉      | 4342/10986 [1:39:43<3:12:15,  1.74s/it]

training loss: 3.391324281692505


training:  40%|███▉      | 4343/10986 [1:39:44<2:58:23,  1.61s/it]

training loss: 3.3181395530700684


training:  40%|███▉      | 4344/10986 [1:39:46<2:48:42,  1.52s/it]

training loss: 3.429715156555176


training:  40%|███▉      | 4345/10986 [1:39:47<2:41:47,  1.46s/it]

training loss: 3.3535547256469727


training:  40%|███▉      | 4346/10986 [1:39:48<2:38:26,  1.43s/it]

training loss: 3.329343795776367


training:  40%|███▉      | 4347/10986 [1:39:50<2:34:39,  1.40s/it]

training loss: 3.482619285583496


training:  40%|███▉      | 4348/10986 [1:39:51<2:31:45,  1.37s/it]

training loss: 3.424360752105713


training:  40%|███▉      | 4349/10986 [1:39:52<2:29:29,  1.35s/it]

training loss: 3.238450288772583


training:  40%|███▉      | 4350/10986 [1:39:54<2:28:19,  1.34s/it]

training loss: 3.373985767364502


training:  40%|███▉      | 4351/10986 [1:39:55<2:36:59,  1.42s/it]

training loss: 3.427603006362915


training:  40%|███▉      | 4352/10986 [1:39:57<2:34:48,  1.40s/it]

training loss: 3.2695438861846924


training:  40%|███▉      | 4353/10986 [1:39:58<2:31:28,  1.37s/it]

training loss: 3.417144775390625


training:  40%|███▉      | 4354/10986 [1:39:59<2:29:22,  1.35s/it]

training loss: 3.4413342475891113


training:  40%|███▉      | 4355/10986 [1:40:00<2:27:56,  1.34s/it]

training loss: 3.5270440578460693


training:  40%|███▉      | 4356/10986 [1:40:02<2:27:26,  1.33s/it]

training loss: 3.3333563804626465


training:  40%|███▉      | 4357/10986 [1:40:03<2:26:47,  1.33s/it]

training loss: 3.4043991565704346


training:  40%|███▉      | 4358/10986 [1:40:04<2:26:25,  1.33s/it]

training loss: 3.475759983062744


training:  40%|███▉      | 4359/10986 [1:40:06<2:26:14,  1.32s/it]

training loss: 3.446239948272705


training:  40%|███▉      | 4360/10986 [1:40:07<2:25:36,  1.32s/it]

training loss: 3.343628406524658
valid loss: 3.335587978363037
perplexity: 28.09489631652832


training:  40%|███▉      | 4361/10986 [1:40:10<3:14:35,  1.76s/it]

training loss: 3.314439058303833


training:  40%|███▉      | 4362/10986 [1:40:11<3:04:14,  1.67s/it]

training loss: 3.523787498474121


training:  40%|███▉      | 4363/10986 [1:40:13<2:52:43,  1.56s/it]

training loss: 3.567735195159912


training:  40%|███▉      | 4364/10986 [1:40:14<2:45:02,  1.50s/it]

training loss: 3.5128588676452637


training:  40%|███▉      | 4365/10986 [1:40:15<2:38:52,  1.44s/it]

training loss: 3.2969541549682617


training:  40%|███▉      | 4366/10986 [1:40:17<2:34:57,  1.40s/it]

training loss: 3.4468209743499756


training:  40%|███▉      | 4367/10986 [1:40:18<2:31:47,  1.38s/it]

training loss: 3.473783254623413


training:  40%|███▉      | 4368/10986 [1:40:19<2:31:03,  1.37s/it]

training loss: 3.4885172843933105


training:  40%|███▉      | 4369/10986 [1:40:21<2:35:46,  1.41s/it]

training loss: 3.3460326194763184


training:  40%|███▉      | 4370/10986 [1:40:22<2:46:54,  1.51s/it]

training loss: 3.4127252101898193


training:  40%|███▉      | 4371/10986 [1:40:24<2:58:47,  1.62s/it]

training loss: 3.300529956817627


training:  40%|███▉      | 4372/10986 [1:40:26<2:58:06,  1.62s/it]

training loss: 3.4252099990844727


training:  40%|███▉      | 4373/10986 [1:40:27<2:48:15,  1.53s/it]

training loss: 3.4699952602386475


training:  40%|███▉      | 4374/10986 [1:40:29<2:40:24,  1.46s/it]

training loss: 3.367952823638916


training:  40%|███▉      | 4375/10986 [1:40:30<2:34:52,  1.41s/it]

training loss: 3.37359881401062


training:  40%|███▉      | 4376/10986 [1:40:31<2:32:16,  1.38s/it]

training loss: 3.3949737548828125


training:  40%|███▉      | 4377/10986 [1:40:33<2:31:04,  1.37s/it]

training loss: 3.3837881088256836


training:  40%|███▉      | 4378/10986 [1:40:34<2:28:38,  1.35s/it]

training loss: 3.405141830444336


training:  40%|███▉      | 4379/10986 [1:40:35<2:26:47,  1.33s/it]

training loss: 3.543250799179077


training:  40%|███▉      | 4380/10986 [1:40:36<2:25:56,  1.33s/it]

training loss: 3.3935258388519287
valid loss: 3.3894107341766357
perplexity: 29.648475646972656


training:  40%|███▉      | 4381/10986 [1:40:39<3:15:07,  1.77s/it]

training loss: 3.3945388793945312


training:  40%|███▉      | 4382/10986 [1:40:41<3:12:15,  1.75s/it]

training loss: 3.3918495178222656


training:  40%|███▉      | 4383/10986 [1:40:42<2:59:29,  1.63s/it]

training loss: 3.3612725734710693


training:  40%|███▉      | 4384/10986 [1:40:44<2:49:09,  1.54s/it]

training loss: 3.4496910572052


training:  40%|███▉      | 4385/10986 [1:40:45<2:42:12,  1.47s/it]

training loss: 3.5407612323760986


training:  40%|███▉      | 4386/10986 [1:40:46<2:36:55,  1.43s/it]

training loss: 3.4105477333068848


training:  40%|███▉      | 4387/10986 [1:40:48<2:32:59,  1.39s/it]

training loss: 3.417271852493286


training:  40%|███▉      | 4388/10986 [1:40:49<2:30:48,  1.37s/it]

training loss: 3.447845697402954


training:  40%|███▉      | 4389/10986 [1:40:50<2:29:29,  1.36s/it]

training loss: 3.3994476795196533


training:  40%|███▉      | 4390/10986 [1:40:52<2:29:08,  1.36s/it]

training loss: 3.4983229637145996


training:  40%|███▉      | 4391/10986 [1:40:53<2:39:33,  1.45s/it]

training loss: 3.3804986476898193


training:  40%|███▉      | 4392/10986 [1:40:55<2:43:56,  1.49s/it]

training loss: 3.343371868133545


training:  40%|███▉      | 4393/10986 [1:40:56<2:38:37,  1.44s/it]

training loss: 3.298023223876953


training:  40%|███▉      | 4394/10986 [1:40:57<2:35:18,  1.41s/it]

training loss: 3.4794187545776367


training:  40%|████      | 4395/10986 [1:40:59<2:32:31,  1.39s/it]

training loss: 3.3896119594573975


training:  40%|████      | 4396/10986 [1:41:00<2:30:32,  1.37s/it]

training loss: 3.436249256134033


training:  40%|████      | 4397/10986 [1:41:01<2:29:07,  1.36s/it]

training loss: 3.3652865886688232


training:  40%|████      | 4398/10986 [1:41:03<2:28:26,  1.35s/it]

training loss: 3.4053351879119873


training:  40%|████      | 4399/10986 [1:41:04<2:28:03,  1.35s/it]

training loss: 3.585531234741211


training:  40%|████      | 4400/10986 [1:41:05<2:27:21,  1.34s/it]

training loss: 3.3575236797332764
valid loss: 3.358186721801758
perplexity: 28.737035751342773


training:  40%|████      | 4401/10986 [1:41:08<3:16:01,  1.79s/it]

training loss: 3.3019204139709473


training:  40%|████      | 4402/10986 [1:41:10<3:03:33,  1.67s/it]

training loss: 3.4773244857788086


training:  40%|████      | 4403/10986 [1:41:11<2:51:41,  1.56s/it]

training loss: 3.389723777770996


training:  40%|████      | 4404/10986 [1:41:12<2:44:53,  1.50s/it]

training loss: 3.4351131916046143


training:  40%|████      | 4405/10986 [1:41:14<2:39:08,  1.45s/it]

training loss: 3.319690704345703


training:  40%|████      | 4406/10986 [1:41:15<2:35:35,  1.42s/it]

training loss: 3.3451335430145264


training:  40%|████      | 4407/10986 [1:41:16<2:32:36,  1.39s/it]

training loss: 3.4037413597106934


training:  40%|████      | 4408/10986 [1:41:18<2:30:15,  1.37s/it]

training loss: 3.5493571758270264


training:  40%|████      | 4409/10986 [1:41:19<2:28:52,  1.36s/it]

training loss: 3.4072556495666504


training:  40%|████      | 4410/10986 [1:41:20<2:27:07,  1.34s/it]

training loss: 3.5057225227355957


training:  40%|████      | 4411/10986 [1:41:22<2:35:36,  1.42s/it]

training loss: 3.405046224594116


training:  40%|████      | 4412/10986 [1:41:23<2:32:42,  1.39s/it]

training loss: 3.309936285018921


training:  40%|████      | 4413/10986 [1:41:25<2:30:10,  1.37s/it]

training loss: 3.466491460800171


training:  40%|████      | 4414/10986 [1:41:26<2:28:34,  1.36s/it]

training loss: 3.347421407699585


training:  40%|████      | 4415/10986 [1:41:27<2:27:25,  1.35s/it]

training loss: 3.4343559741973877


training:  40%|████      | 4416/10986 [1:41:29<2:26:26,  1.34s/it]

training loss: 3.440556287765503


training:  40%|████      | 4417/10986 [1:41:30<2:25:54,  1.33s/it]

training loss: 3.404482841491699


training:  40%|████      | 4418/10986 [1:41:31<2:26:03,  1.33s/it]

training loss: 3.4573936462402344


training:  40%|████      | 4419/10986 [1:41:33<2:25:48,  1.33s/it]

training loss: 3.3705530166625977


training:  40%|████      | 4420/10986 [1:41:34<2:25:02,  1.33s/it]

training loss: 3.4127111434936523
valid loss: 3.4082274436950684
perplexity: 30.211647033691406


training:  40%|████      | 4421/10986 [1:41:37<3:13:44,  1.77s/it]

training loss: 3.4569897651672363


training:  40%|████      | 4422/10986 [1:41:38<3:03:18,  1.68s/it]

training loss: 3.425776958465576


training:  40%|████      | 4423/10986 [1:41:39<2:52:07,  1.57s/it]

training loss: 3.5251142978668213


training:  40%|████      | 4424/10986 [1:41:41<2:44:16,  1.50s/it]

training loss: 3.460864305496216


training:  40%|████      | 4425/10986 [1:41:42<2:40:11,  1.46s/it]

training loss: 3.270231246948242


training:  40%|████      | 4426/10986 [1:41:43<2:35:33,  1.42s/it]

training loss: 3.472731590270996


training:  40%|████      | 4427/10986 [1:41:45<2:32:18,  1.39s/it]

training loss: 3.447399377822876


training:  40%|████      | 4428/10986 [1:41:46<2:31:48,  1.39s/it]

training loss: 3.444843292236328


training:  40%|████      | 4429/10986 [1:41:48<2:30:01,  1.37s/it]

training loss: 3.4140944480895996


training:  40%|████      | 4430/10986 [1:41:49<2:27:56,  1.35s/it]

training loss: 3.270932197570801


training:  40%|████      | 4431/10986 [1:41:50<2:35:52,  1.43s/it]

training loss: 3.332707405090332


training:  40%|████      | 4432/10986 [1:41:52<2:42:24,  1.49s/it]

training loss: 3.3911523818969727


training:  40%|████      | 4433/10986 [1:41:53<2:37:01,  1.44s/it]

training loss: 3.3867101669311523


training:  40%|████      | 4434/10986 [1:41:55<2:34:01,  1.41s/it]

training loss: 3.4410455226898193


training:  40%|████      | 4435/10986 [1:41:56<2:31:00,  1.38s/it]

training loss: 3.494317054748535


training:  40%|████      | 4436/10986 [1:41:57<2:29:45,  1.37s/it]

training loss: 3.468956470489502


training:  40%|████      | 4437/10986 [1:41:59<2:27:48,  1.35s/it]

training loss: 3.2843923568725586


training:  40%|████      | 4438/10986 [1:42:00<2:26:15,  1.34s/it]

training loss: 3.386643648147583


training:  40%|████      | 4439/10986 [1:42:01<2:26:09,  1.34s/it]

training loss: 3.326896905899048


training:  40%|████      | 4440/10986 [1:42:03<2:25:36,  1.33s/it]

training loss: 3.3859899044036865
valid loss: 3.381751298904419
perplexity: 29.422252655029297


training:  40%|████      | 4441/10986 [1:42:05<3:13:06,  1.77s/it]

training loss: 3.270179033279419


training:  40%|████      | 4442/10986 [1:42:07<3:01:36,  1.67s/it]

training loss: 3.2888457775115967


training:  40%|████      | 4443/10986 [1:42:08<2:51:21,  1.57s/it]

training loss: 3.364741802215576


training:  40%|████      | 4444/10986 [1:42:10<2:42:25,  1.49s/it]

training loss: 3.3666491508483887


training:  40%|████      | 4445/10986 [1:42:11<2:36:45,  1.44s/it]

training loss: 3.4627747535705566


training:  40%|████      | 4446/10986 [1:42:12<2:34:03,  1.41s/it]

training loss: 3.4099628925323486


training:  40%|████      | 4447/10986 [1:42:14<2:31:49,  1.39s/it]

training loss: 3.3564863204956055


training:  40%|████      | 4448/10986 [1:42:15<2:29:28,  1.37s/it]

training loss: 3.384622097015381


training:  40%|████      | 4449/10986 [1:42:16<2:27:05,  1.35s/it]

training loss: 3.377443552017212


training:  41%|████      | 4450/10986 [1:42:18<2:26:10,  1.34s/it]

training loss: 3.5176427364349365


training:  41%|████      | 4451/10986 [1:42:19<2:35:02,  1.42s/it]

training loss: 3.4230692386627197


training:  41%|████      | 4452/10986 [1:42:21<2:42:07,  1.49s/it]

training loss: 3.5218100547790527


training:  41%|████      | 4453/10986 [1:42:22<2:37:02,  1.44s/it]

training loss: 3.3238136768341064


training:  41%|████      | 4454/10986 [1:42:23<2:33:06,  1.41s/it]

training loss: 3.382640838623047


training:  41%|████      | 4455/10986 [1:42:25<2:30:39,  1.38s/it]

training loss: 3.427487850189209


training:  41%|████      | 4456/10986 [1:42:26<2:28:21,  1.36s/it]

training loss: 3.37984561920166


training:  41%|████      | 4457/10986 [1:42:27<2:27:14,  1.35s/it]

training loss: 3.3458712100982666


training:  41%|████      | 4458/10986 [1:42:29<2:26:17,  1.34s/it]

training loss: 3.4596846103668213


training:  41%|████      | 4459/10986 [1:42:30<2:26:52,  1.35s/it]

training loss: 3.3338770866394043


training:  41%|████      | 4460/10986 [1:42:31<2:26:11,  1.34s/it]

training loss: 3.3002846240997314
valid loss: 3.295030117034912
perplexity: 26.978227615356445


training:  41%|████      | 4461/10986 [1:42:34<3:14:59,  1.79s/it]

training loss: 3.426560640335083


training:  41%|████      | 4462/10986 [1:42:36<3:02:41,  1.68s/it]

training loss: 3.3901772499084473


training:  41%|████      | 4463/10986 [1:42:37<2:50:30,  1.57s/it]

training loss: 3.343775749206543


training:  41%|████      | 4464/10986 [1:42:38<2:42:19,  1.49s/it]

training loss: 3.4361767768859863


training:  41%|████      | 4465/10986 [1:42:40<2:36:11,  1.44s/it]

training loss: 3.4511046409606934


training:  41%|████      | 4466/10986 [1:42:41<2:33:04,  1.41s/it]

training loss: 3.385502338409424


training:  41%|████      | 4467/10986 [1:42:42<2:29:50,  1.38s/it]

training loss: 3.3807947635650635


training:  41%|████      | 4468/10986 [1:42:44<2:28:59,  1.37s/it]

training loss: 3.4936251640319824


training:  41%|████      | 4469/10986 [1:42:45<2:27:05,  1.35s/it]

training loss: 3.3512794971466064


training:  41%|████      | 4470/10986 [1:42:46<2:26:43,  1.35s/it]

training loss: 3.317091226577759


training:  41%|████      | 4471/10986 [1:42:48<2:34:22,  1.42s/it]

training loss: 3.4374401569366455


training:  41%|████      | 4472/10986 [1:42:49<2:32:13,  1.40s/it]

training loss: 3.314687967300415


training:  41%|████      | 4473/10986 [1:42:51<2:29:41,  1.38s/it]

training loss: 3.403731346130371


training:  41%|████      | 4474/10986 [1:42:52<2:27:09,  1.36s/it]

training loss: 3.2574424743652344


training:  41%|████      | 4475/10986 [1:42:53<2:25:49,  1.34s/it]

training loss: 3.4812190532684326


training:  41%|████      | 4476/10986 [1:42:54<2:24:07,  1.33s/it]

training loss: 3.3107430934906006


training:  41%|████      | 4477/10986 [1:42:56<2:23:55,  1.33s/it]

training loss: 3.411409854888916


training:  41%|████      | 4478/10986 [1:42:57<2:22:46,  1.32s/it]

training loss: 3.358689785003662


training:  41%|████      | 4479/10986 [1:42:58<2:22:55,  1.32s/it]

training loss: 3.387190818786621


training:  41%|████      | 4480/10986 [1:43:00<2:22:30,  1.31s/it]

training loss: 3.5510594844818115
valid loss: 3.5395498275756836
perplexity: 34.45140838623047


training:  41%|████      | 4481/10986 [1:43:02<3:11:07,  1.76s/it]

training loss: 3.4496965408325195


training:  41%|████      | 4482/10986 [1:43:04<2:59:19,  1.65s/it]

training loss: 3.3374733924865723


training:  41%|████      | 4483/10986 [1:43:05<2:48:43,  1.56s/it]

training loss: 3.492852210998535


training:  41%|████      | 4484/10986 [1:43:07<2:40:56,  1.49s/it]

training loss: 3.403902053833008


training:  41%|████      | 4485/10986 [1:43:08<2:34:35,  1.43s/it]

training loss: 3.3356945514678955


training:  41%|████      | 4486/10986 [1:43:09<2:30:46,  1.39s/it]

training loss: 3.2618143558502197


training:  41%|████      | 4487/10986 [1:43:10<2:28:04,  1.37s/it]

training loss: 3.2842135429382324


training:  41%|████      | 4488/10986 [1:43:12<2:26:11,  1.35s/it]

training loss: 3.420109748840332


training:  41%|████      | 4489/10986 [1:43:13<2:26:45,  1.36s/it]

training loss: 3.450640916824341


training:  41%|████      | 4490/10986 [1:43:15<2:27:23,  1.36s/it]

training loss: 3.413869857788086


training:  41%|████      | 4491/10986 [1:43:16<2:34:22,  1.43s/it]

training loss: 3.325242519378662


training:  41%|████      | 4492/10986 [1:43:17<2:32:04,  1.41s/it]

training loss: 3.3140993118286133


training:  41%|████      | 4493/10986 [1:43:19<2:29:04,  1.38s/it]

training loss: 3.416050910949707


training:  41%|████      | 4494/10986 [1:43:20<2:26:37,  1.36s/it]

training loss: 3.5277373790740967


training:  41%|████      | 4495/10986 [1:43:21<2:25:03,  1.34s/it]

training loss: 3.558541774749756


training:  41%|████      | 4496/10986 [1:43:23<2:24:07,  1.33s/it]

training loss: 3.2732086181640625


training:  41%|████      | 4497/10986 [1:43:24<2:23:26,  1.33s/it]

training loss: 3.3808434009552


training:  41%|████      | 4498/10986 [1:43:25<2:22:35,  1.32s/it]

training loss: 3.401240825653076


training:  41%|████      | 4499/10986 [1:43:27<2:22:24,  1.32s/it]

training loss: 3.458146572113037


training:  41%|████      | 4500/10986 [1:43:28<2:21:53,  1.31s/it]

training loss: 3.5013532638549805
valid loss: 3.494117498397827
perplexity: 32.92122268676758


training:  41%|████      | 4501/10986 [1:43:31<3:09:27,  1.75s/it]

training loss: 3.567962169647217


training:  41%|████      | 4502/10986 [1:43:32<2:57:54,  1.65s/it]

training loss: 3.4831998348236084


training:  41%|████      | 4503/10986 [1:43:33<2:47:59,  1.55s/it]

training loss: 3.4088003635406494


training:  41%|████      | 4504/10986 [1:43:35<2:39:38,  1.48s/it]

training loss: 3.5181479454040527


training:  41%|████      | 4505/10986 [1:43:36<2:33:58,  1.43s/it]

training loss: 3.459015130996704


training:  41%|████      | 4506/10986 [1:43:37<2:30:10,  1.39s/it]

training loss: 3.38906192779541


training:  41%|████      | 4507/10986 [1:43:39<2:27:58,  1.37s/it]

training loss: 3.3362231254577637


training:  41%|████      | 4508/10986 [1:43:40<2:25:51,  1.35s/it]

training loss: 3.360593795776367


training:  41%|████      | 4509/10986 [1:43:41<2:24:52,  1.34s/it]

training loss: 3.4814798831939697


training:  41%|████      | 4510/10986 [1:43:43<2:23:59,  1.33s/it]

training loss: 3.3171215057373047


training:  41%|████      | 4511/10986 [1:43:44<2:33:18,  1.42s/it]

training loss: 3.474459648132324


training:  41%|████      | 4512/10986 [1:43:46<2:40:57,  1.49s/it]

training loss: 3.499122381210327


training:  41%|████      | 4513/10986 [1:43:47<2:36:24,  1.45s/it]

training loss: 3.4185500144958496


training:  41%|████      | 4514/10986 [1:43:49<2:45:08,  1.53s/it]

training loss: 3.5132648944854736


training:  41%|████      | 4515/10986 [1:43:51<2:50:49,  1.58s/it]

training loss: 3.4820220470428467


training:  41%|████      | 4516/10986 [1:43:52<2:42:44,  1.51s/it]

training loss: 3.5232534408569336


training:  41%|████      | 4517/10986 [1:43:53<2:36:26,  1.45s/it]

training loss: 3.3865714073181152


training:  41%|████      | 4518/10986 [1:43:55<2:32:02,  1.41s/it]

training loss: 3.431910276412964


training:  41%|████      | 4519/10986 [1:43:56<2:28:43,  1.38s/it]

training loss: 3.2810285091400146


training:  41%|████      | 4520/10986 [1:43:57<2:26:09,  1.36s/it]

training loss: 3.327427864074707
valid loss: 3.3299291133880615
perplexity: 27.93636131286621


training:  41%|████      | 4521/10986 [1:44:00<3:11:35,  1.78s/it]

training loss: 3.357633590698242


training:  41%|████      | 4522/10986 [1:44:01<2:58:33,  1.66s/it]

training loss: 3.585726499557495


training:  41%|████      | 4523/10986 [1:44:03<2:47:14,  1.55s/it]

training loss: 3.382864475250244


training:  41%|████      | 4524/10986 [1:44:04<2:39:26,  1.48s/it]

training loss: 3.3711719512939453


training:  41%|████      | 4525/10986 [1:44:05<2:34:51,  1.44s/it]

training loss: 3.450852394104004


training:  41%|████      | 4526/10986 [1:44:07<2:30:13,  1.40s/it]

training loss: 3.4267702102661133


training:  41%|████      | 4527/10986 [1:44:08<2:26:32,  1.36s/it]

training loss: 3.3432776927948


training:  41%|████      | 4528/10986 [1:44:09<2:24:25,  1.34s/it]

training loss: 3.3639674186706543


training:  41%|████      | 4529/10986 [1:44:11<2:23:18,  1.33s/it]

training loss: 3.3340604305267334


training:  41%|████      | 4530/10986 [1:44:12<2:22:13,  1.32s/it]

training loss: 3.3645007610321045


training:  41%|████      | 4531/10986 [1:44:13<2:32:01,  1.41s/it]

training loss: 3.3577752113342285


training:  41%|████▏     | 4532/10986 [1:44:15<2:30:13,  1.40s/it]

training loss: 3.4440040588378906


training:  41%|████▏     | 4533/10986 [1:44:16<2:27:07,  1.37s/it]

training loss: 3.467743396759033


training:  41%|████▏     | 4534/10986 [1:44:17<2:24:49,  1.35s/it]

training loss: 3.370948553085327


training:  41%|████▏     | 4535/10986 [1:44:19<2:23:38,  1.34s/it]

training loss: 3.420215606689453


training:  41%|████▏     | 4536/10986 [1:44:20<2:22:15,  1.32s/it]

training loss: 3.493699073791504


training:  41%|████▏     | 4537/10986 [1:44:21<2:21:48,  1.32s/it]

training loss: 3.4435722827911377


training:  41%|████▏     | 4538/10986 [1:44:23<2:20:52,  1.31s/it]

training loss: 3.3897147178649902


training:  41%|████▏     | 4539/10986 [1:44:24<2:20:39,  1.31s/it]

training loss: 3.3328709602355957


training:  41%|████▏     | 4540/10986 [1:44:25<2:20:08,  1.30s/it]

training loss: 3.343562364578247
valid loss: 3.3370320796966553
perplexity: 28.135499954223633


training:  41%|████▏     | 4541/10986 [1:44:28<3:07:58,  1.75s/it]

training loss: 3.4117796421051025


training:  41%|████▏     | 4542/10986 [1:44:29<2:55:43,  1.64s/it]

training loss: 3.42638897895813


training:  41%|████▏     | 4543/10986 [1:44:31<2:44:57,  1.54s/it]

training loss: 3.3464560508728027


training:  41%|████▏     | 4544/10986 [1:44:32<2:37:48,  1.47s/it]

training loss: 3.474203586578369


training:  41%|████▏     | 4545/10986 [1:44:33<2:32:00,  1.42s/it]

training loss: 3.532396078109741


training:  41%|████▏     | 4546/10986 [1:44:35<2:27:47,  1.38s/it]

training loss: 3.5641074180603027


training:  41%|████▏     | 4547/10986 [1:44:36<2:25:34,  1.36s/it]

training loss: 3.488309860229492


training:  41%|████▏     | 4548/10986 [1:44:37<2:24:18,  1.34s/it]

training loss: 3.3651485443115234


training:  41%|████▏     | 4549/10986 [1:44:38<2:23:24,  1.34s/it]

training loss: 3.360927104949951


training:  41%|████▏     | 4550/10986 [1:44:40<2:22:49,  1.33s/it]

training loss: 3.3330230712890625


training:  41%|████▏     | 4551/10986 [1:44:41<2:32:35,  1.42s/it]

training loss: 3.3633506298065186


training:  41%|████▏     | 4552/10986 [1:44:43<2:29:49,  1.40s/it]

training loss: 3.373878240585327


training:  41%|████▏     | 4553/10986 [1:44:44<2:27:44,  1.38s/it]

training loss: 3.3729469776153564


training:  41%|████▏     | 4554/10986 [1:44:45<2:25:41,  1.36s/it]

training loss: 3.348226308822632


training:  41%|████▏     | 4555/10986 [1:44:47<2:24:40,  1.35s/it]

training loss: 3.381802797317505


training:  41%|████▏     | 4556/10986 [1:44:48<2:24:02,  1.34s/it]

training loss: 3.3543930053710938


training:  41%|████▏     | 4557/10986 [1:44:49<2:22:38,  1.33s/it]

training loss: 3.530089855194092


training:  41%|████▏     | 4558/10986 [1:44:51<2:21:51,  1.32s/it]

training loss: 3.5402932167053223


training:  41%|████▏     | 4559/10986 [1:44:52<2:21:41,  1.32s/it]

training loss: 3.36140513420105


training:  42%|████▏     | 4560/10986 [1:44:53<2:21:06,  1.32s/it]

training loss: 3.4528369903564453
valid loss: 3.4483208656311035
perplexity: 31.447542190551758


training:  42%|████▏     | 4561/10986 [1:44:56<3:08:53,  1.76s/it]

training loss: 3.3278021812438965


training:  42%|████▏     | 4562/10986 [1:44:58<2:57:13,  1.66s/it]

training loss: 3.4307470321655273


training:  42%|████▏     | 4563/10986 [1:44:59<2:46:24,  1.55s/it]

training loss: 3.333254814147949


training:  42%|████▏     | 4564/10986 [1:45:00<2:38:30,  1.48s/it]

training loss: 3.4383633136749268


training:  42%|████▏     | 4565/10986 [1:45:01<2:32:26,  1.42s/it]

training loss: 3.34150767326355


training:  42%|████▏     | 4566/10986 [1:45:03<2:28:54,  1.39s/it]

training loss: 3.4725942611694336


training:  42%|████▏     | 4567/10986 [1:45:04<2:26:13,  1.37s/it]

training loss: 3.391040325164795


training:  42%|████▏     | 4568/10986 [1:45:05<2:23:59,  1.35s/it]

training loss: 3.510955810546875


training:  42%|████▏     | 4569/10986 [1:45:07<2:23:30,  1.34s/it]

training loss: 3.398146629333496


training:  42%|████▏     | 4570/10986 [1:45:08<2:22:31,  1.33s/it]

training loss: 3.412646532058716


training:  42%|████▏     | 4571/10986 [1:45:10<2:30:39,  1.41s/it]

training loss: 3.3268189430236816


training:  42%|████▏     | 4572/10986 [1:45:11<2:28:13,  1.39s/it]

training loss: 3.304227113723755


training:  42%|████▏     | 4573/10986 [1:45:12<2:26:47,  1.37s/it]

training loss: 3.3927299976348877


training:  42%|████▏     | 4574/10986 [1:45:14<2:24:07,  1.35s/it]

training loss: 3.458310604095459


training:  42%|████▏     | 4575/10986 [1:45:15<2:23:57,  1.35s/it]

training loss: 3.3869407176971436


training:  42%|████▏     | 4576/10986 [1:45:16<2:22:02,  1.33s/it]

training loss: 3.3899147510528564


training:  42%|████▏     | 4577/10986 [1:45:17<2:20:31,  1.32s/it]

training loss: 3.3021507263183594


training:  42%|████▏     | 4578/10986 [1:45:19<2:20:33,  1.32s/it]

training loss: 3.3744056224823


training:  42%|████▏     | 4579/10986 [1:45:20<2:19:56,  1.31s/it]

training loss: 3.4581363201141357


training:  42%|████▏     | 4580/10986 [1:45:21<2:19:14,  1.30s/it]

training loss: 3.410540819168091
valid loss: 3.4124081134796143
perplexity: 30.338214874267578


training:  42%|████▏     | 4581/10986 [1:45:24<3:05:34,  1.74s/it]

training loss: 3.506984233856201


training:  42%|████▏     | 4582/10986 [1:45:26<3:04:20,  1.73s/it]

training loss: 3.394296407699585


training:  42%|████▏     | 4583/10986 [1:45:27<2:50:57,  1.60s/it]

training loss: 3.2779858112335205


training:  42%|████▏     | 4584/10986 [1:45:28<2:40:28,  1.50s/it]

training loss: 3.3381989002227783


training:  42%|████▏     | 4585/10986 [1:45:30<2:33:23,  1.44s/it]

training loss: 3.34367036819458


training:  42%|████▏     | 4586/10986 [1:45:31<2:28:51,  1.40s/it]

training loss: 3.559351682662964


training:  42%|████▏     | 4587/10986 [1:45:32<2:26:19,  1.37s/it]

training loss: 3.39631724357605


training:  42%|████▏     | 4588/10986 [1:45:34<2:23:56,  1.35s/it]

training loss: 3.4104583263397217


training:  42%|████▏     | 4589/10986 [1:45:35<2:22:20,  1.34s/it]

training loss: 3.428495168685913


training:  42%|████▏     | 4590/10986 [1:45:36<2:20:49,  1.32s/it]

training loss: 3.323310375213623


training:  42%|████▏     | 4591/10986 [1:45:38<2:28:51,  1.40s/it]

training loss: 3.309717893600464


training:  42%|████▏     | 4592/10986 [1:45:39<2:36:07,  1.47s/it]

training loss: 3.4333086013793945


training:  42%|████▏     | 4593/10986 [1:45:41<2:31:37,  1.42s/it]

training loss: 3.2552521228790283


training:  42%|████▏     | 4594/10986 [1:45:42<2:27:42,  1.39s/it]

training loss: 3.391571521759033


training:  42%|████▏     | 4595/10986 [1:45:43<2:25:58,  1.37s/it]

training loss: 3.3956756591796875


training:  42%|████▏     | 4596/10986 [1:45:45<2:24:15,  1.35s/it]

training loss: 3.317704200744629


training:  42%|████▏     | 4597/10986 [1:45:46<2:23:55,  1.35s/it]

training loss: 3.4584176540374756


training:  42%|████▏     | 4598/10986 [1:45:47<2:22:57,  1.34s/it]

training loss: 3.3810298442840576


training:  42%|████▏     | 4599/10986 [1:45:49<2:22:22,  1.34s/it]

training loss: 3.331730842590332


training:  42%|████▏     | 4600/10986 [1:45:50<2:20:45,  1.32s/it]

training loss: 3.3550214767456055
valid loss: 3.348252773284912
perplexity: 28.45297622680664


training:  42%|████▏     | 4601/10986 [1:45:53<3:07:00,  1.76s/it]

training loss: 3.3668038845062256


training:  42%|████▏     | 4602/10986 [1:45:54<2:55:37,  1.65s/it]

training loss: 3.340022087097168


training:  42%|████▏     | 4603/10986 [1:45:55<2:44:25,  1.55s/it]

training loss: 3.3512697219848633


training:  42%|████▏     | 4604/10986 [1:45:57<2:36:19,  1.47s/it]

training loss: 3.415083408355713


training:  42%|████▏     | 4605/10986 [1:45:58<2:30:28,  1.41s/it]

training loss: 3.352368116378784


training:  42%|████▏     | 4606/10986 [1:45:59<2:26:29,  1.38s/it]

training loss: 3.5476465225219727


training:  42%|████▏     | 4607/10986 [1:46:01<2:24:00,  1.35s/it]

training loss: 3.381960391998291


training:  42%|████▏     | 4608/10986 [1:46:02<2:22:29,  1.34s/it]

training loss: 3.3627748489379883


training:  42%|████▏     | 4609/10986 [1:46:03<2:21:06,  1.33s/it]

training loss: 3.404067277908325


training:  42%|████▏     | 4610/10986 [1:46:05<2:19:41,  1.31s/it]

training loss: 3.4451637268066406


training:  42%|████▏     | 4611/10986 [1:46:06<2:27:34,  1.39s/it]

training loss: 3.4615414142608643


training:  42%|████▏     | 4612/10986 [1:46:08<2:33:02,  1.44s/it]

training loss: 3.3635268211364746


training:  42%|████▏     | 4613/10986 [1:46:09<2:28:51,  1.40s/it]

training loss: 3.445007801055908


training:  42%|████▏     | 4614/10986 [1:46:10<2:25:07,  1.37s/it]

training loss: 3.4440627098083496


training:  42%|████▏     | 4615/10986 [1:46:11<2:22:06,  1.34s/it]

training loss: 3.326906681060791


training:  42%|████▏     | 4616/10986 [1:46:13<2:21:51,  1.34s/it]

training loss: 3.3647751808166504


training:  42%|████▏     | 4617/10986 [1:46:14<2:20:27,  1.32s/it]

training loss: 3.544067859649658


training:  42%|████▏     | 4618/10986 [1:46:15<2:19:45,  1.32s/it]

training loss: 3.4472367763519287


training:  42%|████▏     | 4619/10986 [1:46:17<2:18:15,  1.30s/it]

training loss: 3.3403544425964355


training:  42%|████▏     | 4620/10986 [1:46:18<2:17:21,  1.29s/it]

training loss: 3.3723719120025635
valid loss: 3.3670880794525146
perplexity: 28.993976593017578


training:  42%|████▏     | 4621/10986 [1:46:21<3:03:45,  1.73s/it]

training loss: 3.3018102645874023


training:  42%|████▏     | 4622/10986 [1:46:22<3:00:19,  1.70s/it]

training loss: 3.306225299835205


training:  42%|████▏     | 4623/10986 [1:46:24<2:47:27,  1.58s/it]

training loss: 3.418139934539795


training:  42%|████▏     | 4624/10986 [1:46:25<2:38:01,  1.49s/it]

training loss: 3.372401475906372


training:  42%|████▏     | 4625/10986 [1:46:26<2:31:05,  1.43s/it]

training loss: 3.341679573059082


training:  42%|████▏     | 4626/10986 [1:46:27<2:25:57,  1.38s/it]

training loss: 3.3467113971710205


training:  42%|████▏     | 4627/10986 [1:46:29<2:22:46,  1.35s/it]

training loss: 3.341323137283325


training:  42%|████▏     | 4628/10986 [1:46:30<2:21:07,  1.33s/it]

training loss: 3.3304104804992676


training:  42%|████▏     | 4629/10986 [1:46:31<2:19:52,  1.32s/it]

training loss: 3.5422472953796387


training:  42%|████▏     | 4630/10986 [1:46:33<2:18:06,  1.30s/it]

training loss: 3.528348922729492


training:  42%|████▏     | 4631/10986 [1:46:34<2:26:26,  1.38s/it]

training loss: 3.348320245742798


training:  42%|████▏     | 4632/10986 [1:46:35<2:24:05,  1.36s/it]

training loss: 3.53756046295166


training:  42%|████▏     | 4633/10986 [1:46:37<2:20:42,  1.33s/it]

training loss: 3.312823534011841


training:  42%|████▏     | 4634/10986 [1:46:38<2:18:51,  1.31s/it]

training loss: 3.3867080211639404


training:  42%|████▏     | 4635/10986 [1:46:39<2:17:14,  1.30s/it]

training loss: 3.3744096755981445


training:  42%|████▏     | 4636/10986 [1:46:41<2:16:21,  1.29s/it]

training loss: 3.4189960956573486


training:  42%|████▏     | 4637/10986 [1:46:42<2:16:11,  1.29s/it]

training loss: 3.4470059871673584


training:  42%|████▏     | 4638/10986 [1:46:43<2:15:42,  1.28s/it]

training loss: 3.386512041091919


training:  42%|████▏     | 4639/10986 [1:46:44<2:15:50,  1.28s/it]

training loss: 3.376710891723633


training:  42%|████▏     | 4640/10986 [1:46:46<2:15:31,  1.28s/it]

training loss: 3.50968074798584
valid loss: 3.507500648498535
perplexity: 33.36477279663086


training:  42%|████▏     | 4641/10986 [1:46:48<3:03:25,  1.73s/it]

training loss: 3.4487247467041016


training:  42%|████▏     | 4642/10986 [1:46:50<2:54:18,  1.65s/it]

training loss: 3.4121451377868652


training:  42%|████▏     | 4643/10986 [1:46:51<2:43:12,  1.54s/it]

training loss: 3.4202284812927246


training:  42%|████▏     | 4644/10986 [1:46:52<2:34:45,  1.46s/it]

training loss: 3.53973650932312


training:  42%|████▏     | 4645/10986 [1:46:54<2:29:04,  1.41s/it]

training loss: 3.55735182762146


training:  42%|████▏     | 4646/10986 [1:46:55<2:25:34,  1.38s/it]

training loss: 3.471240520477295


training:  42%|████▏     | 4647/10986 [1:46:56<2:22:37,  1.35s/it]

training loss: 3.4905285835266113


training:  42%|████▏     | 4648/10986 [1:46:58<2:20:50,  1.33s/it]

training loss: 3.4503660202026367


training:  42%|████▏     | 4649/10986 [1:46:59<2:19:21,  1.32s/it]

training loss: 3.395643711090088


training:  42%|████▏     | 4650/10986 [1:47:00<2:18:31,  1.31s/it]

training loss: 3.4327707290649414


training:  42%|████▏     | 4651/10986 [1:47:02<2:25:38,  1.38s/it]

training loss: 3.4367785453796387


training:  42%|████▏     | 4652/10986 [1:47:03<2:23:35,  1.36s/it]

training loss: 3.394197463989258


training:  42%|████▏     | 4653/10986 [1:47:04<2:20:44,  1.33s/it]

training loss: 3.447502851486206


training:  42%|████▏     | 4654/10986 [1:47:06<2:19:23,  1.32s/it]

training loss: 3.381387233734131


training:  42%|████▏     | 4655/10986 [1:47:07<2:18:28,  1.31s/it]

training loss: 3.551835298538208


training:  42%|████▏     | 4656/10986 [1:47:08<2:18:00,  1.31s/it]

training loss: 3.3963942527770996


training:  42%|████▏     | 4657/10986 [1:47:10<2:17:30,  1.30s/it]

training loss: 3.4831292629241943


training:  42%|████▏     | 4658/10986 [1:47:11<2:17:30,  1.30s/it]

training loss: 3.446812391281128


training:  42%|████▏     | 4659/10986 [1:47:12<2:17:32,  1.30s/it]

training loss: 3.4824233055114746


training:  42%|████▏     | 4660/10986 [1:47:14<2:24:47,  1.37s/it]

training loss: 3.3898465633392334
valid loss: 3.3918917179107666
perplexity: 29.722124099731445


training:  42%|████▏     | 4661/10986 [1:47:17<3:31:11,  2.00s/it]

training loss: 3.4941232204437256


training:  42%|████▏     | 4662/10986 [1:47:19<3:12:03,  1.82s/it]

training loss: 3.231269598007202


training:  42%|████▏     | 4663/10986 [1:47:20<2:56:17,  1.67s/it]

training loss: 3.3135979175567627


training:  42%|████▏     | 4664/10986 [1:47:21<2:44:45,  1.56s/it]

training loss: 3.4032652378082275


training:  42%|████▏     | 4665/10986 [1:47:22<2:36:28,  1.49s/it]

training loss: 3.344573974609375


training:  42%|████▏     | 4666/10986 [1:47:24<2:31:18,  1.44s/it]

training loss: 3.4206273555755615


training:  42%|████▏     | 4667/10986 [1:47:25<2:26:59,  1.40s/it]

training loss: 3.3028719425201416


training:  42%|████▏     | 4668/10986 [1:47:26<2:23:52,  1.37s/it]

training loss: 3.4401938915252686


training:  42%|████▏     | 4669/10986 [1:47:28<2:20:47,  1.34s/it]

training loss: 3.3750007152557373


training:  43%|████▎     | 4670/10986 [1:47:29<2:19:01,  1.32s/it]

training loss: 3.3612661361694336


training:  43%|████▎     | 4671/10986 [1:47:31<2:27:02,  1.40s/it]

training loss: 3.4072768688201904


training:  43%|████▎     | 4672/10986 [1:47:32<2:25:07,  1.38s/it]

training loss: 3.5539627075195312


training:  43%|████▎     | 4673/10986 [1:47:33<2:22:14,  1.35s/it]

training loss: 3.358436107635498


training:  43%|████▎     | 4674/10986 [1:47:34<2:19:41,  1.33s/it]

training loss: 3.3958733081817627


training:  43%|████▎     | 4675/10986 [1:47:36<2:18:47,  1.32s/it]

training loss: 3.29439640045166


training:  43%|████▎     | 4676/10986 [1:47:37<2:19:23,  1.33s/it]

training loss: 3.4853570461273193


training:  43%|████▎     | 4677/10986 [1:47:38<2:17:42,  1.31s/it]

training loss: 3.460251569747925


training:  43%|████▎     | 4678/10986 [1:47:40<2:16:47,  1.30s/it]

training loss: 3.6232433319091797


training:  43%|████▎     | 4679/10986 [1:47:41<2:16:32,  1.30s/it]

training loss: 3.356428861618042


training:  43%|████▎     | 4680/10986 [1:47:42<2:15:56,  1.29s/it]

training loss: 3.388765573501587
valid loss: 3.3872454166412354
perplexity: 29.584346771240234


training:  43%|████▎     | 4681/10986 [1:47:45<3:02:53,  1.74s/it]

training loss: 3.408137321472168


training:  43%|████▎     | 4682/10986 [1:47:47<3:01:59,  1.73s/it]

training loss: 3.3888397216796875


training:  43%|████▎     | 4683/10986 [1:47:48<2:49:00,  1.61s/it]

training loss: 3.2996878623962402


training:  43%|████▎     | 4684/10986 [1:47:49<2:39:29,  1.52s/it]

training loss: 3.4441287517547607


training:  43%|████▎     | 4685/10986 [1:47:51<2:31:55,  1.45s/it]

training loss: 3.3320069313049316


training:  43%|████▎     | 4686/10986 [1:47:52<2:27:58,  1.41s/it]

training loss: 3.3961691856384277


training:  43%|████▎     | 4687/10986 [1:47:53<2:24:15,  1.37s/it]

training loss: 3.3973910808563232


training:  43%|████▎     | 4688/10986 [1:47:55<2:22:56,  1.36s/it]

training loss: 3.3571300506591797


training:  43%|████▎     | 4689/10986 [1:47:56<2:20:41,  1.34s/it]

training loss: 3.3684425354003906


training:  43%|████▎     | 4690/10986 [1:47:57<2:19:31,  1.33s/it]

training loss: 3.3566384315490723


training:  43%|████▎     | 4691/10986 [1:47:59<2:27:13,  1.40s/it]

training loss: 3.3304243087768555


training:  43%|████▎     | 4692/10986 [1:48:00<2:26:21,  1.40s/it]

training loss: 3.3631110191345215


training:  43%|████▎     | 4693/10986 [1:48:01<2:24:13,  1.38s/it]

training loss: 3.4519448280334473


training:  43%|████▎     | 4694/10986 [1:48:03<2:20:34,  1.34s/it]

training loss: 3.338726758956909


training:  43%|████▎     | 4695/10986 [1:48:04<2:18:57,  1.33s/it]

training loss: 3.4244415760040283


training:  43%|████▎     | 4696/10986 [1:48:05<2:17:12,  1.31s/it]

training loss: 3.4060120582580566


training:  43%|████▎     | 4697/10986 [1:48:07<2:16:36,  1.30s/it]

training loss: 3.3772776126861572


training:  43%|████▎     | 4698/10986 [1:48:08<2:16:54,  1.31s/it]

training loss: 3.370680093765259


training:  43%|████▎     | 4699/10986 [1:48:09<2:15:36,  1.29s/it]

training loss: 3.395012617111206


training:  43%|████▎     | 4700/10986 [1:48:10<2:14:46,  1.29s/it]

training loss: 3.5267648696899414
valid loss: 3.524223804473877
perplexity: 33.92742919921875


training:  43%|████▎     | 4701/10986 [1:48:13<3:00:14,  1.72s/it]

training loss: 3.513430118560791


training:  43%|████▎     | 4702/10986 [1:48:14<2:48:21,  1.61s/it]

training loss: 3.525524616241455


training:  43%|████▎     | 4703/10986 [1:48:16<2:38:30,  1.51s/it]

training loss: 3.37069034576416


training:  43%|████▎     | 4704/10986 [1:48:17<2:31:19,  1.45s/it]

training loss: 3.4990696907043457


training:  43%|████▎     | 4705/10986 [1:48:18<2:26:54,  1.40s/it]

training loss: 3.435763120651245


training:  43%|████▎     | 4706/10986 [1:48:20<2:25:00,  1.39s/it]

training loss: 3.342045783996582


training:  43%|████▎     | 4707/10986 [1:48:21<2:22:40,  1.36s/it]

training loss: 3.311838150024414


training:  43%|████▎     | 4708/10986 [1:48:22<2:20:10,  1.34s/it]

training loss: 3.4511170387268066


training:  43%|████▎     | 4709/10986 [1:48:24<2:17:35,  1.32s/it]

training loss: 3.439093828201294


training:  43%|████▎     | 4710/10986 [1:48:25<2:16:51,  1.31s/it]

training loss: 3.3959693908691406


training:  43%|████▎     | 4711/10986 [1:48:26<2:24:38,  1.38s/it]

training loss: 3.2945563793182373


training:  43%|████▎     | 4712/10986 [1:48:28<2:29:56,  1.43s/it]

training loss: 3.4353129863739014


training:  43%|████▎     | 4713/10986 [1:48:29<2:26:06,  1.40s/it]

training loss: 3.4436612129211426


training:  43%|████▎     | 4714/10986 [1:48:31<2:22:50,  1.37s/it]

training loss: 3.4926040172576904


training:  43%|████▎     | 4715/10986 [1:48:32<2:20:24,  1.34s/it]

training loss: 3.3540024757385254


training:  43%|████▎     | 4716/10986 [1:48:33<2:17:52,  1.32s/it]

training loss: 3.3037798404693604


training:  43%|████▎     | 4717/10986 [1:48:34<2:17:48,  1.32s/it]

training loss: 3.4392893314361572


training:  43%|████▎     | 4718/10986 [1:48:36<2:16:51,  1.31s/it]

training loss: 3.2685036659240723


training:  43%|████▎     | 4719/10986 [1:48:37<2:16:01,  1.30s/it]

training loss: 3.406541347503662


training:  43%|████▎     | 4720/10986 [1:48:38<2:16:08,  1.30s/it]

training loss: 3.3471686840057373
valid loss: 3.3425440788269043
perplexity: 28.2910099029541


training:  43%|████▎     | 4721/10986 [1:48:41<3:01:37,  1.74s/it]

training loss: 3.3804426193237305


training:  43%|████▎     | 4722/10986 [1:48:43<2:58:29,  1.71s/it]

training loss: 3.301029682159424


training:  43%|████▎     | 4723/10986 [1:48:44<2:45:24,  1.58s/it]

training loss: 3.4921507835388184


training:  43%|████▎     | 4724/10986 [1:48:45<2:36:22,  1.50s/it]

training loss: 3.383449077606201


training:  43%|████▎     | 4725/10986 [1:48:47<2:29:26,  1.43s/it]

training loss: 3.3831207752227783


training:  43%|████▎     | 4726/10986 [1:48:48<2:24:32,  1.39s/it]

training loss: 3.3950111865997314


training:  43%|████▎     | 4727/10986 [1:48:49<2:21:51,  1.36s/it]

training loss: 3.321842670440674


training:  43%|████▎     | 4728/10986 [1:48:50<2:20:48,  1.35s/it]

training loss: 3.46492338180542


training:  43%|████▎     | 4729/10986 [1:48:52<2:18:26,  1.33s/it]

training loss: 3.394623279571533


training:  43%|████▎     | 4730/10986 [1:48:53<2:17:10,  1.32s/it]

training loss: 3.4376723766326904


training:  43%|████▎     | 4731/10986 [1:48:55<2:25:21,  1.39s/it]

training loss: 3.331943988800049


training:  43%|████▎     | 4732/10986 [1:48:56<2:22:33,  1.37s/it]

training loss: 3.370382308959961


training:  43%|████▎     | 4733/10986 [1:48:57<2:20:14,  1.35s/it]

training loss: 3.3641152381896973


training:  43%|████▎     | 4734/10986 [1:48:58<2:18:17,  1.33s/it]

training loss: 3.387515068054199


training:  43%|████▎     | 4735/10986 [1:49:00<2:16:43,  1.31s/it]

training loss: 3.4060189723968506


training:  43%|████▎     | 4736/10986 [1:49:01<2:16:40,  1.31s/it]

training loss: 3.2954721450805664


training:  43%|████▎     | 4737/10986 [1:49:02<2:15:53,  1.30s/it]

training loss: 3.431367874145508


training:  43%|████▎     | 4738/10986 [1:49:04<2:15:42,  1.30s/it]

training loss: 3.421219825744629


training:  43%|████▎     | 4739/10986 [1:49:05<2:15:02,  1.30s/it]

training loss: 3.3414416313171387


training:  43%|████▎     | 4740/10986 [1:49:06<2:14:53,  1.30s/it]

training loss: 3.4418485164642334
valid loss: 3.444622755050659
perplexity: 31.33146095275879


training:  43%|████▎     | 4741/10986 [1:49:09<3:00:22,  1.73s/it]

training loss: 3.2473819255828857


training:  43%|████▎     | 4742/10986 [1:49:10<2:50:38,  1.64s/it]

training loss: 3.3723490238189697


training:  43%|████▎     | 4743/10986 [1:49:12<2:41:03,  1.55s/it]

training loss: 3.386770009994507


training:  43%|████▎     | 4744/10986 [1:49:13<2:33:41,  1.48s/it]

training loss: 3.3377485275268555


training:  43%|████▎     | 4745/10986 [1:49:14<2:28:44,  1.43s/it]

training loss: 3.342662811279297


training:  43%|████▎     | 4746/10986 [1:49:16<2:24:35,  1.39s/it]

training loss: 3.299036979675293


training:  43%|████▎     | 4747/10986 [1:49:17<2:22:09,  1.37s/it]

training loss: 3.3228018283843994


training:  43%|████▎     | 4748/10986 [1:49:18<2:20:27,  1.35s/it]

training loss: 3.4828171730041504


training:  43%|████▎     | 4749/10986 [1:49:20<2:18:26,  1.33s/it]

training loss: 3.369213819503784


training:  43%|████▎     | 4750/10986 [1:49:21<2:19:11,  1.34s/it]

training loss: 3.412501573562622


training:  43%|████▎     | 4751/10986 [1:49:23<2:27:50,  1.42s/it]

training loss: 3.4506142139434814


training:  43%|████▎     | 4752/10986 [1:49:24<2:27:23,  1.42s/it]

training loss: 3.3945422172546387


training:  43%|████▎     | 4753/10986 [1:49:25<2:24:14,  1.39s/it]

training loss: 3.4548466205596924


training:  43%|████▎     | 4754/10986 [1:49:27<2:20:57,  1.36s/it]

training loss: 3.3123435974121094


training:  43%|████▎     | 4755/10986 [1:49:28<2:19:40,  1.35s/it]

training loss: 3.451998472213745


training:  43%|████▎     | 4756/10986 [1:49:29<2:17:37,  1.33s/it]

training loss: 3.4106976985931396


training:  43%|████▎     | 4757/10986 [1:49:30<2:16:50,  1.32s/it]

training loss: 3.38859224319458


training:  43%|████▎     | 4758/10986 [1:49:32<2:16:44,  1.32s/it]

training loss: 3.3682284355163574


training:  43%|████▎     | 4759/10986 [1:49:33<2:16:15,  1.31s/it]

training loss: 3.395322322845459


training:  43%|████▎     | 4760/10986 [1:49:34<2:16:04,  1.31s/it]

training loss: 3.4815425872802734
valid loss: 3.4723925590515137
perplexity: 32.21372604370117


training:  43%|████▎     | 4761/10986 [1:49:37<3:01:03,  1.75s/it]

training loss: 3.280928611755371


training:  43%|████▎     | 4762/10986 [1:49:39<2:57:08,  1.71s/it]

training loss: 3.484180212020874


training:  43%|████▎     | 4763/10986 [1:49:40<2:44:22,  1.58s/it]

training loss: 3.406465530395508


training:  43%|████▎     | 4764/10986 [1:49:41<2:35:56,  1.50s/it]

training loss: 3.386522054672241


training:  43%|████▎     | 4765/10986 [1:49:43<2:30:56,  1.46s/it]

training loss: 3.2694413661956787


training:  43%|████▎     | 4766/10986 [1:49:44<2:26:03,  1.41s/it]

training loss: 3.41384220123291


training:  43%|████▎     | 4767/10986 [1:49:45<2:22:32,  1.38s/it]

training loss: 3.399158239364624


training:  43%|████▎     | 4768/10986 [1:49:47<2:19:50,  1.35s/it]

training loss: 3.3714070320129395


training:  43%|████▎     | 4769/10986 [1:49:48<2:17:33,  1.33s/it]

training loss: 3.286053419113159


training:  43%|████▎     | 4770/10986 [1:49:49<2:16:32,  1.32s/it]

training loss: 3.2836508750915527


training:  43%|████▎     | 4771/10986 [1:49:51<2:26:15,  1.41s/it]

training loss: 3.316652297973633


training:  43%|████▎     | 4772/10986 [1:49:52<2:31:45,  1.47s/it]

training loss: 3.4020771980285645


training:  43%|████▎     | 4773/10986 [1:49:54<2:26:25,  1.41s/it]

training loss: 3.325129270553589


training:  43%|████▎     | 4774/10986 [1:49:55<2:22:56,  1.38s/it]

training loss: 3.4404282569885254


training:  43%|████▎     | 4775/10986 [1:49:56<2:20:37,  1.36s/it]

training loss: 3.5025382041931152


training:  43%|████▎     | 4776/10986 [1:49:58<2:18:58,  1.34s/it]

training loss: 3.401728391647339


training:  43%|████▎     | 4777/10986 [1:49:59<2:17:14,  1.33s/it]

training loss: 3.3421547412872314


training:  43%|████▎     | 4778/10986 [1:50:00<2:16:05,  1.32s/it]

training loss: 3.459545373916626


training:  44%|████▎     | 4779/10986 [1:50:01<2:15:21,  1.31s/it]

training loss: 3.454815626144409


training:  44%|████▎     | 4780/10986 [1:50:03<2:14:52,  1.30s/it]

training loss: 3.3614652156829834
valid loss: 3.3639984130859375
perplexity: 28.90453338623047


training:  44%|████▎     | 4781/10986 [1:50:06<3:01:20,  1.75s/it]

training loss: 3.3437342643737793


training:  44%|████▎     | 4782/10986 [1:50:07<2:48:24,  1.63s/it]

training loss: 3.4181246757507324


training:  44%|████▎     | 4783/10986 [1:50:08<2:38:06,  1.53s/it]

training loss: 3.3771915435791016


training:  44%|████▎     | 4784/10986 [1:50:10<2:30:58,  1.46s/it]

training loss: 3.3803763389587402


training:  44%|████▎     | 4785/10986 [1:50:11<2:25:21,  1.41s/it]

training loss: 3.3290908336639404


training:  44%|████▎     | 4786/10986 [1:50:12<2:21:07,  1.37s/it]

training loss: 3.408830165863037


training:  44%|████▎     | 4787/10986 [1:50:13<2:19:21,  1.35s/it]

training loss: 3.5380942821502686


training:  44%|████▎     | 4788/10986 [1:50:15<2:17:22,  1.33s/it]

training loss: 3.4827065467834473


training:  44%|████▎     | 4789/10986 [1:50:16<2:16:00,  1.32s/it]

training loss: 3.3763036727905273


training:  44%|████▎     | 4790/10986 [1:50:17<2:14:36,  1.30s/it]

training loss: 3.402686595916748


training:  44%|████▎     | 4791/10986 [1:50:19<2:23:20,  1.39s/it]

training loss: 3.3898959159851074


training:  44%|████▎     | 4792/10986 [1:50:20<2:20:20,  1.36s/it]

training loss: 3.3628673553466797


training:  44%|████▎     | 4793/10986 [1:50:21<2:20:14,  1.36s/it]

training loss: 3.3526060581207275


training:  44%|████▎     | 4794/10986 [1:50:23<2:18:21,  1.34s/it]

training loss: 3.2953596115112305


training:  44%|████▎     | 4795/10986 [1:50:24<2:17:07,  1.33s/it]

training loss: 3.255235433578491


training:  44%|████▎     | 4796/10986 [1:50:25<2:16:10,  1.32s/it]

training loss: 3.405357599258423


training:  44%|████▎     | 4797/10986 [1:50:27<2:15:21,  1.31s/it]

training loss: 3.3257293701171875


training:  44%|████▎     | 4798/10986 [1:50:28<2:15:01,  1.31s/it]

training loss: 3.381134510040283


training:  44%|████▎     | 4799/10986 [1:50:29<2:14:18,  1.30s/it]

training loss: 3.4333925247192383


training:  44%|████▎     | 4800/10986 [1:50:31<2:13:50,  1.30s/it]

training loss: 3.3848090171813965
valid loss: 3.385347843170166
perplexity: 29.528261184692383


training:  44%|████▎     | 4801/10986 [1:50:33<2:59:58,  1.75s/it]

training loss: 3.357646942138672


training:  44%|████▎     | 4802/10986 [1:50:35<2:48:01,  1.63s/it]

training loss: 3.4268858432769775


training:  44%|████▎     | 4803/10986 [1:50:36<2:37:49,  1.53s/it]

training loss: 3.4307122230529785


training:  44%|████▎     | 4804/10986 [1:50:37<2:30:13,  1.46s/it]

training loss: 3.4806604385375977


training:  44%|████▎     | 4805/10986 [1:50:39<2:25:02,  1.41s/it]

training loss: 3.3881733417510986


training:  44%|████▎     | 4806/10986 [1:50:40<2:21:39,  1.38s/it]

training loss: 3.329517126083374


training:  44%|████▍     | 4807/10986 [1:50:41<2:20:49,  1.37s/it]

training loss: 3.413625955581665


training:  44%|████▍     | 4808/10986 [1:50:43<2:31:32,  1.47s/it]

training loss: 3.468683958053589


training:  44%|████▍     | 4809/10986 [1:50:45<2:37:55,  1.53s/it]

training loss: 3.431093692779541


training:  44%|████▍     | 4810/10986 [1:50:46<2:31:54,  1.48s/it]

training loss: 3.327927350997925


training:  44%|████▍     | 4811/10986 [1:50:47<2:34:04,  1.50s/it]

training loss: 3.3876562118530273


training:  44%|████▍     | 4812/10986 [1:50:49<2:27:46,  1.44s/it]

training loss: 3.3586442470550537


training:  44%|████▍     | 4813/10986 [1:50:50<2:23:14,  1.39s/it]

training loss: 3.384099006652832


training:  44%|████▍     | 4814/10986 [1:50:51<2:21:39,  1.38s/it]

training loss: 3.299628973007202


training:  44%|████▍     | 4815/10986 [1:50:53<2:19:49,  1.36s/it]

training loss: 3.459796667098999


training:  44%|████▍     | 4816/10986 [1:50:54<2:18:35,  1.35s/it]

training loss: 3.496295213699341


training:  44%|████▍     | 4817/10986 [1:50:55<2:16:20,  1.33s/it]

training loss: 3.351468086242676


training:  44%|████▍     | 4818/10986 [1:50:57<2:15:12,  1.32s/it]

training loss: 3.309799909591675


training:  44%|████▍     | 4819/10986 [1:50:58<2:14:06,  1.30s/it]

training loss: 3.488431453704834


training:  44%|████▍     | 4820/10986 [1:50:59<2:13:12,  1.30s/it]

training loss: 3.3979859352111816
valid loss: 3.3945884704589844
perplexity: 29.802385330200195


training:  44%|████▍     | 4821/10986 [1:51:02<2:58:05,  1.73s/it]

training loss: 3.416358470916748


training:  44%|████▍     | 4822/10986 [1:51:03<2:46:39,  1.62s/it]

training loss: 3.326601028442383


training:  44%|████▍     | 4823/10986 [1:51:05<2:36:27,  1.52s/it]

training loss: 3.543611526489258


training:  44%|████▍     | 4824/10986 [1:51:06<2:29:05,  1.45s/it]

training loss: 3.4656476974487305


training:  44%|████▍     | 4825/10986 [1:51:07<2:23:57,  1.40s/it]

training loss: 3.3924007415771484


training:  44%|████▍     | 4826/10986 [1:51:08<2:19:50,  1.36s/it]

training loss: 3.3416881561279297


training:  44%|████▍     | 4827/10986 [1:51:10<2:18:08,  1.35s/it]

training loss: 3.320559501647949


training:  44%|████▍     | 4828/10986 [1:51:11<2:17:15,  1.34s/it]

training loss: 3.3950204849243164


training:  44%|████▍     | 4829/10986 [1:51:12<2:15:53,  1.32s/it]

training loss: 3.469334602355957


training:  44%|████▍     | 4830/10986 [1:51:14<2:15:14,  1.32s/it]

training loss: 3.3874270915985107


training:  44%|████▍     | 4831/10986 [1:51:15<2:23:07,  1.40s/it]

training loss: 3.364964008331299


training:  44%|████▍     | 4832/10986 [1:51:17<2:30:31,  1.47s/it]

training loss: 3.4187076091766357


training:  44%|████▍     | 4833/10986 [1:51:18<2:24:49,  1.41s/it]

training loss: 3.341014862060547


training:  44%|████▍     | 4834/10986 [1:51:19<2:20:46,  1.37s/it]

training loss: 3.326925039291382


training:  44%|████▍     | 4835/10986 [1:51:21<2:18:22,  1.35s/it]

training loss: 3.3562068939208984


training:  44%|████▍     | 4836/10986 [1:51:22<2:18:11,  1.35s/it]

training loss: 3.3963420391082764


training:  44%|████▍     | 4837/10986 [1:51:23<2:16:31,  1.33s/it]

training loss: 3.385808229446411


training:  44%|████▍     | 4838/10986 [1:51:25<2:15:40,  1.32s/it]

training loss: 3.410404920578003


training:  44%|████▍     | 4839/10986 [1:51:26<2:14:50,  1.32s/it]

training loss: 3.4944887161254883


training:  44%|████▍     | 4840/10986 [1:51:27<2:15:02,  1.32s/it]

training loss: 3.5317471027374268
valid loss: 3.5217089653015137
perplexity: 33.84221649169922


training:  44%|████▍     | 4841/10986 [1:51:30<2:58:30,  1.74s/it]

training loss: 3.3719048500061035


training:  44%|████▍     | 4842/10986 [1:51:31<2:47:33,  1.64s/it]

training loss: 3.3356363773345947


training:  44%|████▍     | 4843/10986 [1:51:33<2:37:19,  1.54s/it]

training loss: 3.4114253520965576


training:  44%|████▍     | 4844/10986 [1:51:34<2:29:43,  1.46s/it]

training loss: 3.393094062805176


training:  44%|████▍     | 4845/10986 [1:51:35<2:24:20,  1.41s/it]

training loss: 3.2965850830078125


training:  44%|████▍     | 4846/10986 [1:51:37<2:20:42,  1.38s/it]

training loss: 3.3914101123809814


training:  44%|████▍     | 4847/10986 [1:51:38<2:20:13,  1.37s/it]

training loss: 3.410989999771118


training:  44%|████▍     | 4848/10986 [1:51:39<2:17:56,  1.35s/it]

training loss: 3.555110216140747


training:  44%|████▍     | 4849/10986 [1:51:41<2:15:52,  1.33s/it]

training loss: 3.35304856300354


training:  44%|████▍     | 4850/10986 [1:51:42<2:14:05,  1.31s/it]

training loss: 3.3834118843078613


training:  44%|████▍     | 4851/10986 [1:51:43<2:21:29,  1.38s/it]

training loss: 3.4843642711639404


training:  44%|████▍     | 4852/10986 [1:51:45<2:28:10,  1.45s/it]

training loss: 3.5106751918792725


training:  44%|████▍     | 4853/10986 [1:51:46<2:23:17,  1.40s/it]

training loss: 3.3627023696899414


training:  44%|████▍     | 4854/10986 [1:51:48<2:19:28,  1.36s/it]

training loss: 3.4264352321624756


training:  44%|████▍     | 4855/10986 [1:51:49<2:17:29,  1.35s/it]

training loss: 3.3838632106781006


training:  44%|████▍     | 4856/10986 [1:51:50<2:15:46,  1.33s/it]

training loss: 3.425246477127075


training:  44%|████▍     | 4857/10986 [1:51:51<2:14:44,  1.32s/it]

training loss: 3.2801547050476074


training:  44%|████▍     | 4858/10986 [1:51:53<2:13:42,  1.31s/it]

training loss: 3.4439990520477295


training:  44%|████▍     | 4859/10986 [1:51:54<2:12:58,  1.30s/it]

training loss: 3.4181344509124756


training:  44%|████▍     | 4860/10986 [1:51:55<2:12:25,  1.30s/it]

training loss: 3.4204490184783936
valid loss: 3.423367977142334
perplexity: 30.672544479370117


training:  44%|████▍     | 4861/10986 [1:51:58<2:56:56,  1.73s/it]

training loss: 3.4411568641662598


training:  44%|████▍     | 4862/10986 [1:52:00<2:53:49,  1.70s/it]

training loss: 3.4631686210632324


training:  44%|████▍     | 4863/10986 [1:52:01<2:41:26,  1.58s/it]

training loss: 3.4071085453033447


training:  44%|████▍     | 4864/10986 [1:52:02<2:32:52,  1.50s/it]

training loss: 3.4659736156463623


training:  44%|████▍     | 4865/10986 [1:52:04<2:26:00,  1.43s/it]

training loss: 3.37866473197937


training:  44%|████▍     | 4866/10986 [1:52:05<2:21:43,  1.39s/it]

training loss: 3.3717126846313477


training:  44%|████▍     | 4867/10986 [1:52:06<2:18:36,  1.36s/it]

training loss: 3.6151368618011475


training:  44%|████▍     | 4868/10986 [1:52:07<2:15:52,  1.33s/it]

training loss: 3.461073398590088


training:  44%|████▍     | 4869/10986 [1:52:09<2:13:35,  1.31s/it]

training loss: 3.4970364570617676


training:  44%|████▍     | 4870/10986 [1:52:10<2:12:36,  1.30s/it]

training loss: 3.3063979148864746


training:  44%|████▍     | 4871/10986 [1:52:11<2:20:27,  1.38s/it]

training loss: 3.4345591068267822


training:  44%|████▍     | 4872/10986 [1:52:13<2:18:13,  1.36s/it]

training loss: 3.45585560798645


training:  44%|████▍     | 4873/10986 [1:52:14<2:15:49,  1.33s/it]

training loss: 3.3277242183685303


training:  44%|████▍     | 4874/10986 [1:52:15<2:14:51,  1.32s/it]

training loss: 3.3519699573516846


training:  44%|████▍     | 4875/10986 [1:52:17<2:12:59,  1.31s/it]

training loss: 3.3462157249450684


training:  44%|████▍     | 4876/10986 [1:52:18<2:11:45,  1.29s/it]

training loss: 3.480441093444824


training:  44%|████▍     | 4877/10986 [1:52:19<2:11:07,  1.29s/it]

training loss: 3.347888231277466


training:  44%|████▍     | 4878/10986 [1:52:20<2:10:59,  1.29s/it]

training loss: 3.352936029434204


training:  44%|████▍     | 4879/10986 [1:52:22<2:10:50,  1.29s/it]

training loss: 3.3958942890167236


training:  44%|████▍     | 4880/10986 [1:52:23<2:10:47,  1.29s/it]

training loss: 3.4978959560394287
valid loss: 3.501244068145752
perplexity: 33.15667724609375


training:  44%|████▍     | 4881/10986 [1:52:26<2:55:12,  1.72s/it]

training loss: 3.3367679119110107


training:  44%|████▍     | 4882/10986 [1:52:27<2:46:38,  1.64s/it]

training loss: 3.4008593559265137


training:  44%|████▍     | 4883/10986 [1:52:29<2:36:48,  1.54s/it]

training loss: 3.4021356105804443


training:  44%|████▍     | 4884/10986 [1:52:30<2:28:46,  1.46s/it]

training loss: 3.432346820831299


training:  44%|████▍     | 4885/10986 [1:52:31<2:23:29,  1.41s/it]

training loss: 3.3828980922698975


training:  44%|████▍     | 4886/10986 [1:52:32<2:19:17,  1.37s/it]

training loss: 3.3274333477020264


training:  44%|████▍     | 4887/10986 [1:52:34<2:17:10,  1.35s/it]

training loss: 3.411776542663574


training:  44%|████▍     | 4888/10986 [1:52:35<2:15:09,  1.33s/it]

training loss: 3.3854148387908936


training:  45%|████▍     | 4889/10986 [1:52:36<2:14:37,  1.32s/it]

training loss: 3.366853952407837


training:  45%|████▍     | 4890/10986 [1:52:38<2:13:08,  1.31s/it]

training loss: 3.4583442211151123


training:  45%|████▍     | 4891/10986 [1:52:39<2:20:49,  1.39s/it]

training loss: 3.4187986850738525


training:  45%|████▍     | 4892/10986 [1:52:40<2:18:16,  1.36s/it]

training loss: 3.399178981781006


training:  45%|████▍     | 4893/10986 [1:52:42<2:16:45,  1.35s/it]

training loss: 3.360139846801758


training:  45%|████▍     | 4894/10986 [1:52:43<2:13:49,  1.32s/it]

training loss: 3.2674810886383057


training:  45%|████▍     | 4895/10986 [1:52:44<2:12:48,  1.31s/it]

training loss: 3.3537158966064453


training:  45%|████▍     | 4896/10986 [1:52:46<2:11:58,  1.30s/it]

training loss: 3.4855127334594727


training:  45%|████▍     | 4897/10986 [1:52:47<2:11:31,  1.30s/it]

training loss: 3.47947096824646


training:  45%|████▍     | 4898/10986 [1:52:48<2:11:57,  1.30s/it]

training loss: 3.427417278289795


training:  45%|████▍     | 4899/10986 [1:52:49<2:11:45,  1.30s/it]

training loss: 3.556356906890869


training:  45%|████▍     | 4900/10986 [1:52:51<2:11:39,  1.30s/it]

training loss: 3.4055168628692627
valid loss: 3.399226665496826
perplexity: 29.940935134887695


training:  45%|████▍     | 4901/10986 [1:52:54<2:57:27,  1.75s/it]

training loss: 3.3860270977020264


training:  45%|████▍     | 4902/10986 [1:52:55<2:46:29,  1.64s/it]

training loss: 3.448859214782715


training:  45%|████▍     | 4903/10986 [1:52:56<2:36:13,  1.54s/it]

training loss: 3.416295051574707


training:  45%|████▍     | 4904/10986 [1:52:58<2:29:34,  1.48s/it]

training loss: 3.4118869304656982


training:  45%|████▍     | 4905/10986 [1:52:59<2:23:50,  1.42s/it]

training loss: 3.345768928527832


training:  45%|████▍     | 4906/10986 [1:53:00<2:19:51,  1.38s/it]

training loss: 3.4699418544769287


training:  45%|████▍     | 4907/10986 [1:53:01<2:17:15,  1.35s/it]

training loss: 3.443082809448242


training:  45%|████▍     | 4908/10986 [1:53:03<2:16:12,  1.34s/it]

training loss: 3.4718716144561768


training:  45%|████▍     | 4909/10986 [1:53:04<2:15:24,  1.34s/it]

training loss: 3.431814193725586


training:  45%|████▍     | 4910/10986 [1:53:05<2:13:53,  1.32s/it]

training loss: 3.4185688495635986


training:  45%|████▍     | 4911/10986 [1:53:07<2:20:58,  1.39s/it]

training loss: 3.372372627258301


training:  45%|████▍     | 4912/10986 [1:53:08<2:19:12,  1.38s/it]

training loss: 3.436861515045166


training:  45%|████▍     | 4913/10986 [1:53:10<2:16:38,  1.35s/it]

training loss: 3.4390408992767334


training:  45%|████▍     | 4914/10986 [1:53:11<2:15:11,  1.34s/it]

training loss: 3.4165213108062744


training:  45%|████▍     | 4915/10986 [1:53:12<2:13:14,  1.32s/it]

training loss: 3.384281873703003


training:  45%|████▍     | 4916/10986 [1:53:13<2:12:45,  1.31s/it]

training loss: 3.4693353176116943


training:  45%|████▍     | 4917/10986 [1:53:15<2:13:01,  1.32s/it]

training loss: 3.342600107192993


training:  45%|████▍     | 4918/10986 [1:53:16<2:12:03,  1.31s/it]

training loss: 3.355968952178955


training:  45%|████▍     | 4919/10986 [1:53:17<2:12:01,  1.31s/it]

training loss: 3.3346540927886963


training:  45%|████▍     | 4920/10986 [1:53:19<2:11:30,  1.30s/it]

training loss: 3.351384162902832
valid loss: 3.348987340927124
perplexity: 28.47388458251953


training:  45%|████▍     | 4921/10986 [1:53:21<2:53:46,  1.72s/it]

training loss: 3.367042064666748


training:  45%|████▍     | 4922/10986 [1:53:23<2:42:45,  1.61s/it]

training loss: 3.271942615509033


training:  45%|████▍     | 4923/10986 [1:53:24<2:34:21,  1.53s/it]

training loss: 3.3899664878845215


training:  45%|████▍     | 4924/10986 [1:53:25<2:27:31,  1.46s/it]

training loss: 3.375615119934082


training:  45%|████▍     | 4925/10986 [1:53:27<2:22:18,  1.41s/it]

training loss: 3.3710265159606934


training:  45%|████▍     | 4926/10986 [1:53:28<2:18:30,  1.37s/it]

training loss: 3.2729620933532715


training:  45%|████▍     | 4927/10986 [1:53:29<2:15:31,  1.34s/it]

training loss: 3.5239336490631104


training:  45%|████▍     | 4928/10986 [1:53:30<2:13:33,  1.32s/it]

training loss: 3.3277950286865234


training:  45%|████▍     | 4929/10986 [1:53:32<2:13:13,  1.32s/it]

training loss: 3.4170849323272705


training:  45%|████▍     | 4930/10986 [1:53:33<2:12:34,  1.31s/it]

training loss: 3.3787362575531006


training:  45%|████▍     | 4931/10986 [1:53:35<2:19:43,  1.38s/it]

training loss: 3.3562607765197754


training:  45%|████▍     | 4932/10986 [1:53:36<2:26:30,  1.45s/it]

training loss: 3.3411405086517334


training:  45%|████▍     | 4933/10986 [1:53:37<2:21:08,  1.40s/it]

training loss: 3.445870876312256


training:  45%|████▍     | 4934/10986 [1:53:39<2:17:44,  1.37s/it]

training loss: 3.451869010925293


training:  45%|████▍     | 4935/10986 [1:53:40<2:15:27,  1.34s/it]

training loss: 3.4376280307769775


training:  45%|████▍     | 4936/10986 [1:53:41<2:13:46,  1.33s/it]

training loss: 3.477638006210327


training:  45%|████▍     | 4937/10986 [1:53:43<2:12:38,  1.32s/it]

training loss: 3.4386825561523438


training:  45%|████▍     | 4938/10986 [1:53:44<2:11:27,  1.30s/it]

training loss: 3.2624459266662598


training:  45%|████▍     | 4939/10986 [1:53:45<2:11:41,  1.31s/it]

training loss: 3.4242758750915527


training:  45%|████▍     | 4940/10986 [1:53:47<2:11:49,  1.31s/it]

training loss: 3.304474353790283
valid loss: 3.3009138107299805
perplexity: 27.137426376342773


training:  45%|████▍     | 4941/10986 [1:53:49<2:55:43,  1.74s/it]

training loss: 3.2840769290924072


training:  45%|████▍     | 4942/10986 [1:53:51<2:54:17,  1.73s/it]

training loss: 3.3737895488739014


training:  45%|████▍     | 4943/10986 [1:53:52<2:42:10,  1.61s/it]

training loss: 3.4789435863494873


training:  45%|████▌     | 4944/10986 [1:53:54<2:33:55,  1.53s/it]

training loss: 3.3783836364746094


training:  45%|████▌     | 4945/10986 [1:53:55<2:26:58,  1.46s/it]

training loss: 3.379307270050049


training:  45%|████▌     | 4946/10986 [1:53:56<2:21:49,  1.41s/it]

training loss: 3.4816179275512695


training:  45%|████▌     | 4947/10986 [1:53:58<2:18:17,  1.37s/it]

training loss: 3.421588897705078


training:  45%|████▌     | 4948/10986 [1:53:59<2:16:11,  1.35s/it]

training loss: 3.429363489151001


training:  45%|████▌     | 4949/10986 [1:54:00<2:14:18,  1.33s/it]

training loss: 3.3866782188415527


training:  45%|████▌     | 4950/10986 [1:54:01<2:12:36,  1.32s/it]

training loss: 3.283812999725342


training:  45%|████▌     | 4951/10986 [1:54:03<2:20:06,  1.39s/it]

training loss: 3.4225144386291504


training:  45%|████▌     | 4952/10986 [1:54:05<2:25:27,  1.45s/it]

training loss: 3.5142006874084473


training:  45%|████▌     | 4953/10986 [1:54:06<2:21:00,  1.40s/it]

training loss: 3.369955539703369


training:  45%|████▌     | 4954/10986 [1:54:07<2:18:04,  1.37s/it]

training loss: 3.3681225776672363


training:  45%|████▌     | 4955/10986 [1:54:08<2:15:51,  1.35s/it]

training loss: 3.3846161365509033


training:  45%|████▌     | 4956/10986 [1:54:10<2:19:16,  1.39s/it]

training loss: 3.4483089447021484


training:  45%|████▌     | 4957/10986 [1:54:12<2:28:52,  1.48s/it]

training loss: 3.3286385536193848


training:  45%|████▌     | 4958/10986 [1:54:13<2:32:49,  1.52s/it]

training loss: 3.313558578491211


training:  45%|████▌     | 4959/10986 [1:54:15<2:25:58,  1.45s/it]

training loss: 3.4605464935302734


training:  45%|████▌     | 4960/10986 [1:54:16<2:22:16,  1.42s/it]

training loss: 3.329207420349121
valid loss: 3.330655574798584
perplexity: 27.956663131713867


training:  45%|████▌     | 4961/10986 [1:54:19<3:02:48,  1.82s/it]

training loss: 3.368743896484375


training:  45%|████▌     | 4962/10986 [1:54:20<2:49:49,  1.69s/it]

training loss: 3.4376277923583984


training:  45%|████▌     | 4963/10986 [1:54:21<2:37:57,  1.57s/it]

training loss: 3.4564051628112793


training:  45%|████▌     | 4964/10986 [1:54:23<2:29:20,  1.49s/it]

training loss: 3.3525004386901855


training:  45%|████▌     | 4965/10986 [1:54:24<2:24:47,  1.44s/it]

training loss: 3.337197780609131


training:  45%|████▌     | 4966/10986 [1:54:25<2:20:36,  1.40s/it]

training loss: 3.4145853519439697


training:  45%|████▌     | 4967/10986 [1:54:27<2:17:56,  1.38s/it]

training loss: 3.4064688682556152


training:  45%|████▌     | 4968/10986 [1:54:28<2:16:02,  1.36s/it]

training loss: 3.4247796535491943


training:  45%|████▌     | 4969/10986 [1:54:29<2:15:13,  1.35s/it]

training loss: 3.4425389766693115


training:  45%|████▌     | 4970/10986 [1:54:30<2:13:46,  1.33s/it]

training loss: 3.4140849113464355


training:  45%|████▌     | 4971/10986 [1:54:32<2:21:47,  1.41s/it]

training loss: 3.3109171390533447


training:  45%|████▌     | 4972/10986 [1:54:33<2:19:12,  1.39s/it]

training loss: 3.344613790512085


training:  45%|████▌     | 4973/10986 [1:54:35<2:16:34,  1.36s/it]

training loss: 3.388477325439453


training:  45%|████▌     | 4974/10986 [1:54:36<2:15:23,  1.35s/it]

training loss: 3.322200298309326


training:  45%|████▌     | 4975/10986 [1:54:37<2:13:55,  1.34s/it]

training loss: 3.4146671295166016


training:  45%|████▌     | 4976/10986 [1:54:39<2:13:27,  1.33s/it]

training loss: 3.3704121112823486


training:  45%|████▌     | 4977/10986 [1:54:40<2:12:35,  1.32s/it]

training loss: 3.395732879638672


training:  45%|████▌     | 4978/10986 [1:54:41<2:12:07,  1.32s/it]

training loss: 3.332537889480591


training:  45%|████▌     | 4979/10986 [1:54:43<2:11:24,  1.31s/it]

training loss: 3.3917062282562256


training:  45%|████▌     | 4980/10986 [1:54:44<2:11:04,  1.31s/it]

training loss: 3.591299295425415
valid loss: 3.5865910053253174
perplexity: 36.11076736450195


training:  45%|████▌     | 4981/10986 [1:54:47<2:55:37,  1.75s/it]

training loss: 3.3364357948303223


training:  45%|████▌     | 4982/10986 [1:54:48<2:51:27,  1.71s/it]

training loss: 3.2835235595703125


training:  45%|████▌     | 4983/10986 [1:54:50<2:38:51,  1.59s/it]

training loss: 3.325474739074707


training:  45%|████▌     | 4984/10986 [1:54:51<2:30:31,  1.50s/it]

training loss: 3.4900381565093994


training:  45%|████▌     | 4985/10986 [1:54:52<2:24:19,  1.44s/it]

training loss: 3.5046133995056152


training:  45%|████▌     | 4986/10986 [1:54:54<2:20:25,  1.40s/it]

training loss: 3.552522659301758


training:  45%|████▌     | 4987/10986 [1:54:55<2:19:19,  1.39s/it]

training loss: 3.422645330429077


training:  45%|████▌     | 4988/10986 [1:54:56<2:16:56,  1.37s/it]

training loss: 3.3990671634674072


training:  45%|████▌     | 4989/10986 [1:54:58<2:15:14,  1.35s/it]

training loss: 3.3800106048583984


training:  45%|████▌     | 4990/10986 [1:54:59<2:13:46,  1.34s/it]

training loss: 3.3246994018554688


training:  45%|████▌     | 4991/10986 [1:55:00<2:21:35,  1.42s/it]

training loss: 3.416337490081787


training:  45%|████▌     | 4992/10986 [1:55:02<2:30:26,  1.51s/it]

training loss: 3.4501843452453613


training:  45%|████▌     | 4993/10986 [1:55:03<2:24:11,  1.44s/it]

training loss: 3.4399306774139404


training:  45%|████▌     | 4994/10986 [1:55:05<2:19:56,  1.40s/it]

training loss: 3.477379560470581


training:  45%|████▌     | 4995/10986 [1:55:06<2:17:55,  1.38s/it]

training loss: 3.298912525177002


training:  45%|████▌     | 4996/10986 [1:55:07<2:15:28,  1.36s/it]

training loss: 3.326810359954834


training:  45%|████▌     | 4997/10986 [1:55:09<2:13:18,  1.34s/it]

training loss: 3.446475028991699


training:  45%|████▌     | 4998/10986 [1:55:10<2:12:41,  1.33s/it]

training loss: 3.4240376949310303


training:  46%|████▌     | 4999/10986 [1:55:11<2:11:04,  1.31s/it]

training loss: 3.4515626430511475


training:  46%|████▌     | 5000/10986 [1:55:13<2:10:32,  1.31s/it]

training loss: 3.394256591796875
valid loss: 3.3860061168670654
perplexity: 29.547706604003906


training:  46%|████▌     | 5001/10986 [1:55:15<2:54:53,  1.75s/it]

training loss: 3.36225962638855


training:  46%|████▌     | 5002/10986 [1:55:17<2:53:34,  1.74s/it]

training loss: 3.353498697280884


training:  46%|████▌     | 5003/10986 [1:55:18<2:40:08,  1.61s/it]

training loss: 3.52262806892395


training:  46%|████▌     | 5004/10986 [1:55:20<2:30:43,  1.51s/it]

training loss: 3.4968361854553223


training:  46%|████▌     | 5005/10986 [1:55:21<2:24:37,  1.45s/it]

training loss: 3.416879892349243


training:  46%|████▌     | 5006/10986 [1:55:22<2:19:28,  1.40s/it]

training loss: 3.397183895111084


training:  46%|████▌     | 5007/10986 [1:55:23<2:15:28,  1.36s/it]

training loss: 3.364264726638794


training:  46%|████▌     | 5008/10986 [1:55:25<2:14:24,  1.35s/it]

training loss: 3.428863763809204


training:  46%|████▌     | 5009/10986 [1:55:26<2:12:38,  1.33s/it]

training loss: 3.361720085144043


training:  46%|████▌     | 5010/10986 [1:55:27<2:11:01,  1.32s/it]

training loss: 3.380666971206665


training:  46%|████▌     | 5011/10986 [1:55:29<2:18:42,  1.39s/it]

training loss: 3.4107728004455566


training:  46%|████▌     | 5012/10986 [1:55:30<2:15:31,  1.36s/it]

training loss: 3.3306643962860107


training:  46%|████▌     | 5013/10986 [1:55:32<2:13:32,  1.34s/it]

training loss: 3.2928929328918457


training:  46%|████▌     | 5014/10986 [1:55:33<2:11:45,  1.32s/it]

training loss: 3.46252179145813


training:  46%|████▌     | 5015/10986 [1:55:34<2:10:26,  1.31s/it]

training loss: 3.4255850315093994


training:  46%|████▌     | 5016/10986 [1:55:35<2:09:11,  1.30s/it]

training loss: 3.3624110221862793


training:  46%|████▌     | 5017/10986 [1:55:37<2:08:04,  1.29s/it]

training loss: 3.5831592082977295


training:  46%|████▌     | 5018/10986 [1:55:38<2:07:48,  1.28s/it]

training loss: 3.320915937423706


training:  46%|████▌     | 5019/10986 [1:55:39<2:07:13,  1.28s/it]

training loss: 3.5694377422332764


training:  46%|████▌     | 5020/10986 [1:55:40<2:06:53,  1.28s/it]

training loss: 3.36555814743042
valid loss: 3.3741207122802734
perplexity: 29.198598861694336


training:  46%|████▌     | 5021/10986 [1:55:43<2:50:39,  1.72s/it]

training loss: 3.298374891281128


training:  46%|████▌     | 5022/10986 [1:55:45<2:48:35,  1.70s/it]

training loss: 3.3996410369873047


training:  46%|████▌     | 5023/10986 [1:55:46<2:36:00,  1.57s/it]

training loss: 3.4285662174224854


training:  46%|████▌     | 5024/10986 [1:55:47<2:27:33,  1.48s/it]

training loss: 3.417595863342285


training:  46%|████▌     | 5025/10986 [1:55:49<2:20:46,  1.42s/it]

training loss: 3.4087905883789062


training:  46%|████▌     | 5026/10986 [1:55:50<2:16:46,  1.38s/it]

training loss: 3.4388234615325928


training:  46%|████▌     | 5027/10986 [1:55:51<2:13:55,  1.35s/it]

training loss: 3.389065742492676


training:  46%|████▌     | 5028/10986 [1:55:53<2:12:23,  1.33s/it]

training loss: 3.3864331245422363


training:  46%|████▌     | 5029/10986 [1:55:54<2:10:29,  1.31s/it]

training loss: 3.3948171138763428


training:  46%|████▌     | 5030/10986 [1:55:55<2:09:50,  1.31s/it]

training loss: 3.3680362701416016


training:  46%|████▌     | 5031/10986 [1:55:57<2:16:36,  1.38s/it]

training loss: 3.5774946212768555


training:  46%|████▌     | 5032/10986 [1:55:58<2:13:44,  1.35s/it]

training loss: 3.4199914932250977


training:  46%|████▌     | 5033/10986 [1:55:59<2:12:04,  1.33s/it]

training loss: 3.3697011470794678


training:  46%|████▌     | 5034/10986 [1:56:00<2:10:48,  1.32s/it]

training loss: 3.487412452697754


training:  46%|████▌     | 5035/10986 [1:56:02<2:09:30,  1.31s/it]

training loss: 3.229586601257324


training:  46%|████▌     | 5036/10986 [1:56:03<2:09:09,  1.30s/it]

training loss: 3.3704819679260254


training:  46%|████▌     | 5037/10986 [1:56:04<2:09:16,  1.30s/it]

training loss: 3.3992984294891357


training:  46%|████▌     | 5038/10986 [1:56:06<2:09:03,  1.30s/it]

training loss: 3.314584732055664


training:  46%|████▌     | 5039/10986 [1:56:07<2:08:17,  1.29s/it]

training loss: 3.363499402999878


training:  46%|████▌     | 5040/10986 [1:56:08<2:08:17,  1.29s/it]

training loss: 3.343038558959961
valid loss: 3.3404300212860107
perplexity: 28.23126220703125


training:  46%|████▌     | 5041/10986 [1:56:11<2:50:16,  1.72s/it]

training loss: 3.433835506439209


training:  46%|████▌     | 5042/10986 [1:56:12<2:41:31,  1.63s/it]

training loss: 3.307116746902466


training:  46%|████▌     | 5043/10986 [1:56:14<2:31:19,  1.53s/it]

training loss: 3.5104780197143555


training:  46%|████▌     | 5044/10986 [1:56:15<2:24:13,  1.46s/it]

training loss: 3.4219143390655518


training:  46%|████▌     | 5045/10986 [1:56:16<2:19:00,  1.40s/it]

training loss: 3.394144058227539


training:  46%|████▌     | 5046/10986 [1:56:17<2:14:59,  1.36s/it]

training loss: 3.4862403869628906


training:  46%|████▌     | 5047/10986 [1:56:19<2:12:35,  1.34s/it]

training loss: 3.262057065963745


training:  46%|████▌     | 5048/10986 [1:56:20<2:10:59,  1.32s/it]

training loss: 3.4820384979248047


training:  46%|████▌     | 5049/10986 [1:56:21<2:10:14,  1.32s/it]

training loss: 3.4606761932373047


training:  46%|████▌     | 5050/10986 [1:56:23<2:08:25,  1.30s/it]

training loss: 3.4699745178222656


training:  46%|████▌     | 5051/10986 [1:56:24<2:16:50,  1.38s/it]

training loss: 3.4147274494171143


training:  46%|████▌     | 5052/10986 [1:56:25<2:14:28,  1.36s/it]

training loss: 3.287627935409546


training:  46%|████▌     | 5053/10986 [1:56:27<2:11:33,  1.33s/it]

training loss: 3.3955986499786377


training:  46%|████▌     | 5054/10986 [1:56:28<2:10:04,  1.32s/it]

training loss: 3.3050460815429688


training:  46%|████▌     | 5055/10986 [1:56:29<2:08:24,  1.30s/it]

training loss: 3.377440929412842


training:  46%|████▌     | 5056/10986 [1:56:31<2:07:49,  1.29s/it]

training loss: 3.4101510047912598


training:  46%|████▌     | 5057/10986 [1:56:32<2:07:25,  1.29s/it]

training loss: 3.4288060665130615


training:  46%|████▌     | 5058/10986 [1:56:33<2:09:10,  1.31s/it]

training loss: 3.389902353286743


training:  46%|████▌     | 5059/10986 [1:56:35<2:09:11,  1.31s/it]

training loss: 3.370382070541382


training:  46%|████▌     | 5060/10986 [1:56:36<2:09:05,  1.31s/it]

training loss: 3.440988540649414
valid loss: 3.4404778480529785
perplexity: 31.20186424255371


training:  46%|████▌     | 5061/10986 [1:56:39<2:52:40,  1.75s/it]

training loss: 3.3590075969696045


training:  46%|████▌     | 5062/10986 [1:56:40<2:41:42,  1.64s/it]

training loss: 3.479732036590576


training:  46%|████▌     | 5063/10986 [1:56:41<2:31:44,  1.54s/it]

training loss: 3.380880832672119


training:  46%|████▌     | 5064/10986 [1:56:43<2:24:03,  1.46s/it]

training loss: 3.5209310054779053


training:  46%|████▌     | 5065/10986 [1:56:44<2:18:16,  1.40s/it]

training loss: 3.337799310684204


training:  46%|████▌     | 5066/10986 [1:56:45<2:14:44,  1.37s/it]

training loss: 3.3574366569519043


training:  46%|████▌     | 5067/10986 [1:56:46<2:12:15,  1.34s/it]

training loss: 3.417393684387207


training:  46%|████▌     | 5068/10986 [1:56:48<2:10:35,  1.32s/it]

training loss: 3.291348457336426


training:  46%|████▌     | 5069/10986 [1:56:49<2:09:32,  1.31s/it]

training loss: 3.431230068206787


training:  46%|████▌     | 5070/10986 [1:56:50<2:08:54,  1.31s/it]

training loss: 3.4666624069213867


training:  46%|████▌     | 5071/10986 [1:56:52<2:16:22,  1.38s/it]

training loss: 3.4120230674743652


training:  46%|████▌     | 5072/10986 [1:56:53<2:13:44,  1.36s/it]

training loss: 3.362112283706665


training:  46%|████▌     | 5073/10986 [1:56:54<2:11:40,  1.34s/it]

training loss: 3.4543826580047607


training:  46%|████▌     | 5074/10986 [1:56:56<2:09:34,  1.32s/it]

training loss: 3.312382221221924


training:  46%|████▌     | 5075/10986 [1:56:57<2:08:32,  1.30s/it]

training loss: 3.352903366088867


training:  46%|████▌     | 5076/10986 [1:56:58<2:09:19,  1.31s/it]

training loss: 3.365324020385742


training:  46%|████▌     | 5077/10986 [1:57:00<2:08:31,  1.31s/it]

training loss: 3.313760757446289


training:  46%|████▌     | 5078/10986 [1:57:01<2:07:56,  1.30s/it]

training loss: 3.427116632461548


training:  46%|████▌     | 5079/10986 [1:57:02<2:08:34,  1.31s/it]

training loss: 3.4059059619903564


training:  46%|████▌     | 5080/10986 [1:57:03<2:08:27,  1.30s/it]

training loss: 3.371354103088379
valid loss: 3.3684027194976807
perplexity: 29.03211784362793


training:  46%|████▌     | 5081/10986 [1:57:06<2:50:03,  1.73s/it]

training loss: 3.471190929412842


training:  46%|████▋     | 5082/10986 [1:57:08<2:46:26,  1.69s/it]

training loss: 3.419924020767212


training:  46%|████▋     | 5083/10986 [1:57:09<2:34:39,  1.57s/it]

training loss: 3.4363954067230225


training:  46%|████▋     | 5084/10986 [1:57:10<2:26:54,  1.49s/it]

training loss: 3.384634017944336


training:  46%|████▋     | 5085/10986 [1:57:12<2:20:45,  1.43s/it]

training loss: 3.353584051132202


training:  46%|████▋     | 5086/10986 [1:57:13<2:17:21,  1.40s/it]

training loss: 3.364495277404785


training:  46%|████▋     | 5087/10986 [1:57:14<2:14:00,  1.36s/it]

training loss: 3.433445930480957


training:  46%|████▋     | 5088/10986 [1:57:16<2:12:08,  1.34s/it]

training loss: 3.472095489501953


training:  46%|████▋     | 5089/10986 [1:57:17<2:10:51,  1.33s/it]

training loss: 3.3622548580169678


training:  46%|████▋     | 5090/10986 [1:57:18<2:10:14,  1.33s/it]

training loss: 3.502861738204956


training:  46%|████▋     | 5091/10986 [1:57:20<2:17:45,  1.40s/it]

training loss: 3.607736349105835


training:  46%|████▋     | 5092/10986 [1:57:21<2:18:03,  1.41s/it]

training loss: 3.4645557403564453


training:  46%|████▋     | 5093/10986 [1:57:22<2:14:32,  1.37s/it]

training loss: 3.5325443744659424


training:  46%|████▋     | 5094/10986 [1:57:24<2:11:47,  1.34s/it]

training loss: 3.380091667175293


training:  46%|████▋     | 5095/10986 [1:57:25<2:10:26,  1.33s/it]

training loss: 3.3098864555358887


training:  46%|████▋     | 5096/10986 [1:57:26<2:09:08,  1.32s/it]

training loss: 3.3948333263397217


training:  46%|████▋     | 5097/10986 [1:57:28<2:08:18,  1.31s/it]

training loss: 3.4384899139404297


training:  46%|████▋     | 5098/10986 [1:57:29<2:08:59,  1.31s/it]

training loss: 3.3676609992980957


training:  46%|████▋     | 5099/10986 [1:57:30<2:09:59,  1.32s/it]

training loss: 3.5043463706970215


training:  46%|████▋     | 5100/10986 [1:57:32<2:09:03,  1.32s/it]

training loss: 3.519451379776001
valid loss: 3.5201148986816406
perplexity: 33.78831100463867


training:  46%|████▋     | 5101/10986 [1:57:34<2:51:20,  1.75s/it]

training loss: 3.462759017944336


training:  46%|████▋     | 5102/10986 [1:57:36<2:39:56,  1.63s/it]

training loss: 3.4066734313964844


training:  46%|████▋     | 5103/10986 [1:57:37<2:42:32,  1.66s/it]

training loss: 3.47359037399292


training:  46%|████▋     | 5104/10986 [1:57:39<2:43:24,  1.67s/it]

training loss: 3.3170440196990967


training:  46%|████▋     | 5105/10986 [1:57:40<2:33:50,  1.57s/it]

training loss: 3.4485023021698


training:  46%|████▋     | 5106/10986 [1:57:42<2:25:34,  1.49s/it]

training loss: 3.4793426990509033


training:  46%|████▋     | 5107/10986 [1:57:43<2:20:04,  1.43s/it]

training loss: 3.548736572265625


training:  46%|████▋     | 5108/10986 [1:57:44<2:15:57,  1.39s/it]

training loss: 3.415386199951172


training:  47%|████▋     | 5109/10986 [1:57:46<2:13:16,  1.36s/it]

training loss: 3.422947645187378


training:  47%|████▋     | 5110/10986 [1:57:47<2:11:12,  1.34s/it]

training loss: 3.3245975971221924


training:  47%|████▋     | 5111/10986 [1:57:49<2:18:47,  1.42s/it]

training loss: 3.572498321533203


training:  47%|████▋     | 5112/10986 [1:57:50<2:15:46,  1.39s/it]

training loss: 3.3748698234558105


training:  47%|████▋     | 5113/10986 [1:57:51<2:13:05,  1.36s/it]

training loss: 3.3372573852539062


training:  47%|████▋     | 5114/10986 [1:57:52<2:10:53,  1.34s/it]

training loss: 3.513115406036377


training:  47%|████▋     | 5115/10986 [1:57:54<2:09:09,  1.32s/it]

training loss: 3.441664218902588


training:  47%|████▋     | 5116/10986 [1:57:55<2:08:17,  1.31s/it]

training loss: 3.3303723335266113


training:  47%|████▋     | 5117/10986 [1:57:56<2:07:28,  1.30s/it]

training loss: 3.3304975032806396


training:  47%|████▋     | 5118/10986 [1:57:58<2:07:50,  1.31s/it]

training loss: 3.3772990703582764


training:  47%|████▋     | 5119/10986 [1:57:59<2:08:00,  1.31s/it]

training loss: 3.4227423667907715


training:  47%|████▋     | 5120/10986 [1:58:00<2:08:00,  1.31s/it]

training loss: 3.4621171951293945
valid loss: 3.4656243324279785
perplexity: 31.996429443359375


training:  47%|████▋     | 5121/10986 [1:58:03<2:49:56,  1.74s/it]

training loss: 3.4994449615478516


training:  47%|████▋     | 5122/10986 [1:58:05<2:46:49,  1.71s/it]

training loss: 3.5813424587249756


training:  47%|████▋     | 5123/10986 [1:58:06<2:34:21,  1.58s/it]

training loss: 3.4240546226501465


training:  47%|████▋     | 5124/10986 [1:58:07<2:25:40,  1.49s/it]

training loss: 3.315598487854004


training:  47%|████▋     | 5125/10986 [1:58:08<2:19:27,  1.43s/it]

training loss: 3.3331198692321777


training:  47%|████▋     | 5126/10986 [1:58:10<2:15:37,  1.39s/it]

training loss: 3.341346025466919


training:  47%|████▋     | 5127/10986 [1:58:11<2:12:51,  1.36s/it]

training loss: 3.348959445953369


training:  47%|████▋     | 5128/10986 [1:58:12<2:11:42,  1.35s/it]

training loss: 3.42326021194458


training:  47%|████▋     | 5129/10986 [1:58:14<2:10:34,  1.34s/it]

training loss: 3.4252939224243164


training:  47%|████▋     | 5130/10986 [1:58:15<2:09:20,  1.33s/it]

training loss: 3.308284282684326


training:  47%|████▋     | 5131/10986 [1:58:17<2:17:57,  1.41s/it]

training loss: 3.4911279678344727


training:  47%|████▋     | 5132/10986 [1:58:18<2:22:41,  1.46s/it]

training loss: 3.4061338901519775


training:  47%|████▋     | 5133/10986 [1:58:19<2:18:16,  1.42s/it]

training loss: 3.420109272003174


training:  47%|████▋     | 5134/10986 [1:58:21<2:16:16,  1.40s/it]

training loss: 3.489269256591797


training:  47%|████▋     | 5135/10986 [1:58:22<2:13:31,  1.37s/it]

training loss: 3.4003524780273438


training:  47%|████▋     | 5136/10986 [1:58:23<2:11:03,  1.34s/it]

training loss: 3.357343912124634


training:  47%|████▋     | 5137/10986 [1:58:25<2:10:03,  1.33s/it]

training loss: 3.370725631713867


training:  47%|████▋     | 5138/10986 [1:58:26<2:09:38,  1.33s/it]

training loss: 3.305102586746216


training:  47%|████▋     | 5139/10986 [1:58:27<2:08:53,  1.32s/it]

training loss: 3.483185052871704


training:  47%|████▋     | 5140/10986 [1:58:29<2:08:45,  1.32s/it]

training loss: 3.5389564037323
valid loss: 3.532158613204956
perplexity: 34.19770812988281


training:  47%|████▋     | 5141/10986 [1:58:32<2:53:58,  1.79s/it]

training loss: 3.4685516357421875


training:  47%|████▋     | 5142/10986 [1:58:33<2:51:16,  1.76s/it]

training loss: 3.624824047088623


training:  47%|████▋     | 5143/10986 [1:58:35<2:37:57,  1.62s/it]

training loss: 3.387070655822754


training:  47%|████▋     | 5144/10986 [1:58:36<2:29:16,  1.53s/it]

training loss: 3.341346025466919


training:  47%|████▋     | 5145/10986 [1:58:37<2:23:20,  1.47s/it]

training loss: 3.3531181812286377


training:  47%|████▋     | 5146/10986 [1:58:39<2:18:50,  1.43s/it]

training loss: 3.3397765159606934


training:  47%|████▋     | 5147/10986 [1:58:40<2:15:53,  1.40s/it]

training loss: 3.318939685821533


training:  47%|████▋     | 5148/10986 [1:58:41<2:13:16,  1.37s/it]

training loss: 3.357360601425171


training:  47%|████▋     | 5149/10986 [1:58:42<2:11:49,  1.36s/it]

training loss: 3.3249385356903076


training:  47%|████▋     | 5150/10986 [1:58:44<2:10:33,  1.34s/it]

training loss: 3.4405112266540527


training:  47%|████▋     | 5151/10986 [1:58:45<2:17:54,  1.42s/it]

training loss: 3.3311243057250977


training:  47%|████▋     | 5152/10986 [1:58:47<2:14:53,  1.39s/it]

training loss: 3.4813387393951416


training:  47%|████▋     | 5153/10986 [1:58:48<2:12:47,  1.37s/it]

training loss: 3.375014066696167


training:  47%|████▋     | 5154/10986 [1:58:49<2:11:10,  1.35s/it]

training loss: 3.4684183597564697


training:  47%|████▋     | 5155/10986 [1:58:51<2:09:57,  1.34s/it]

training loss: 3.4600677490234375


training:  47%|████▋     | 5156/10986 [1:58:52<2:09:24,  1.33s/it]

training loss: 3.4517433643341064


training:  47%|████▋     | 5157/10986 [1:58:53<2:09:06,  1.33s/it]

training loss: 3.2640326023101807


training:  47%|████▋     | 5158/10986 [1:58:55<2:08:19,  1.32s/it]

training loss: 3.407583713531494


training:  47%|████▋     | 5159/10986 [1:58:56<2:07:54,  1.32s/it]

training loss: 3.653588056564331


training:  47%|████▋     | 5160/10986 [1:58:57<2:07:18,  1.31s/it]

training loss: 3.445172071456909
valid loss: 3.440187931060791
perplexity: 31.192819595336914


training:  47%|████▋     | 5161/10986 [1:59:00<2:51:11,  1.76s/it]

training loss: 3.403452157974243


training:  47%|████▋     | 5162/10986 [1:59:02<2:49:00,  1.74s/it]

training loss: 3.424307346343994


training:  47%|████▋     | 5163/10986 [1:59:03<2:36:52,  1.62s/it]

training loss: 3.432509660720825


training:  47%|████▋     | 5164/10986 [1:59:04<2:27:53,  1.52s/it]

training loss: 3.3924965858459473


training:  47%|████▋     | 5165/10986 [1:59:06<2:21:28,  1.46s/it]

training loss: 3.4164037704467773


training:  47%|████▋     | 5166/10986 [1:59:07<2:17:04,  1.41s/it]

training loss: 3.318293809890747


training:  47%|████▋     | 5167/10986 [1:59:08<2:13:47,  1.38s/it]

training loss: 3.3956565856933594


training:  47%|████▋     | 5168/10986 [1:59:10<2:12:02,  1.36s/it]

training loss: 3.3269877433776855


training:  47%|████▋     | 5169/10986 [1:59:11<2:10:37,  1.35s/it]

training loss: 3.3132164478302


training:  47%|████▋     | 5170/10986 [1:59:12<2:09:42,  1.34s/it]

training loss: 3.2957494258880615


training:  47%|████▋     | 5171/10986 [1:59:14<2:17:42,  1.42s/it]

training loss: 3.4840023517608643


training:  47%|████▋     | 5172/10986 [1:59:15<2:21:53,  1.46s/it]

training loss: 3.4748618602752686


training:  47%|████▋     | 5173/10986 [1:59:17<2:17:44,  1.42s/it]

training loss: 3.34405255317688


training:  47%|████▋     | 5174/10986 [1:59:18<2:14:07,  1.38s/it]

training loss: 3.3827359676361084


training:  47%|████▋     | 5175/10986 [1:59:19<2:12:07,  1.36s/it]

training loss: 3.4987244606018066


training:  47%|████▋     | 5176/10986 [1:59:21<2:10:16,  1.35s/it]

training loss: 3.438323497772217


training:  47%|████▋     | 5177/10986 [1:59:22<2:09:14,  1.33s/it]

training loss: 3.4917001724243164


training:  47%|████▋     | 5178/10986 [1:59:23<2:08:00,  1.32s/it]

training loss: 3.32812762260437


training:  47%|████▋     | 5179/10986 [1:59:25<2:08:14,  1.32s/it]

training loss: 3.5020558834075928


training:  47%|████▋     | 5180/10986 [1:59:26<2:07:42,  1.32s/it]

training loss: 3.4289188385009766
valid loss: 3.4288995265960693
perplexity: 30.842681884765625


training:  47%|████▋     | 5181/10986 [1:59:29<2:50:32,  1.76s/it]

training loss: 3.464353561401367


training:  47%|████▋     | 5182/10986 [1:59:30<2:40:03,  1.65s/it]

training loss: 3.408477306365967


training:  47%|████▋     | 5183/10986 [1:59:31<2:31:40,  1.57s/it]

training loss: 3.425626754760742


training:  47%|████▋     | 5184/10986 [1:59:33<2:24:29,  1.49s/it]

training loss: 3.440495014190674


training:  47%|████▋     | 5185/10986 [1:59:34<2:20:20,  1.45s/it]

training loss: 3.483703851699829


training:  47%|████▋     | 5186/10986 [1:59:35<2:16:42,  1.41s/it]

training loss: 3.447542428970337


training:  47%|████▋     | 5187/10986 [1:59:37<2:15:06,  1.40s/it]

training loss: 3.3859450817108154


training:  47%|████▋     | 5188/10986 [1:59:38<2:12:59,  1.38s/it]

training loss: 3.497619390487671


training:  47%|████▋     | 5189/10986 [1:59:39<2:10:58,  1.36s/it]

training loss: 3.446052312850952


training:  47%|████▋     | 5190/10986 [1:59:41<2:09:38,  1.34s/it]

training loss: 3.3187649250030518


training:  47%|████▋     | 5191/10986 [1:59:42<2:17:38,  1.43s/it]

training loss: 3.419776439666748


training:  47%|████▋     | 5192/10986 [1:59:44<2:15:10,  1.40s/it]

training loss: 3.5074896812438965


training:  47%|████▋     | 5193/10986 [1:59:45<2:12:47,  1.38s/it]

training loss: 3.375570297241211


training:  47%|████▋     | 5194/10986 [1:59:46<2:10:46,  1.35s/it]

training loss: 3.406493902206421


training:  47%|████▋     | 5195/10986 [1:59:48<2:09:32,  1.34s/it]

training loss: 3.3683459758758545


training:  47%|████▋     | 5196/10986 [1:59:49<2:08:02,  1.33s/it]

training loss: 3.394228935241699


training:  47%|████▋     | 5197/10986 [1:59:50<2:07:24,  1.32s/it]

training loss: 3.3829009532928467


training:  47%|████▋     | 5198/10986 [1:59:52<2:07:22,  1.32s/it]

training loss: 3.4122793674468994


training:  47%|████▋     | 5199/10986 [1:59:53<2:07:03,  1.32s/it]

training loss: 3.455181360244751


training:  47%|████▋     | 5200/10986 [1:59:54<2:07:27,  1.32s/it]

training loss: 3.3750734329223633
valid loss: 3.375227928161621
perplexity: 29.230945587158203


training:  47%|████▋     | 5201/10986 [1:59:57<2:50:43,  1.77s/it]

training loss: 3.3657689094543457


training:  47%|████▋     | 5202/10986 [1:59:58<2:40:19,  1.66s/it]

training loss: 3.3201119899749756


training:  47%|████▋     | 5203/10986 [2:00:00<2:31:01,  1.57s/it]

training loss: 3.486790895462036


training:  47%|████▋     | 5204/10986 [2:00:01<2:24:44,  1.50s/it]

training loss: 3.3600378036499023


training:  47%|████▋     | 5205/10986 [2:00:02<2:20:22,  1.46s/it]

training loss: 3.3125834465026855


training:  47%|████▋     | 5206/10986 [2:00:04<2:16:06,  1.41s/it]

training loss: 3.351487874984741


training:  47%|████▋     | 5207/10986 [2:00:05<2:12:46,  1.38s/it]

training loss: 3.3602523803710938


training:  47%|████▋     | 5208/10986 [2:00:06<2:11:07,  1.36s/it]

training loss: 3.283935308456421


training:  47%|████▋     | 5209/10986 [2:00:08<2:09:50,  1.35s/it]

training loss: 3.485328197479248


training:  47%|████▋     | 5210/10986 [2:00:09<2:09:00,  1.34s/it]

training loss: 3.448829412460327


training:  47%|████▋     | 5211/10986 [2:00:11<2:16:13,  1.42s/it]

training loss: 3.500847101211548


training:  47%|████▋     | 5212/10986 [2:00:12<2:15:20,  1.41s/it]

training loss: 3.378237724304199


training:  47%|████▋     | 5213/10986 [2:00:13<2:13:29,  1.39s/it]

training loss: 3.4726669788360596


training:  47%|████▋     | 5214/10986 [2:00:15<2:12:15,  1.37s/it]

training loss: 3.517573833465576


training:  47%|████▋     | 5215/10986 [2:00:16<2:11:34,  1.37s/it]

training loss: 3.4505226612091064


training:  47%|████▋     | 5216/10986 [2:00:17<2:11:00,  1.36s/it]

training loss: 3.3759443759918213


training:  47%|████▋     | 5217/10986 [2:00:19<2:10:04,  1.35s/it]

training loss: 3.4506075382232666


training:  47%|████▋     | 5218/10986 [2:00:20<2:09:32,  1.35s/it]

training loss: 3.2958271503448486


training:  48%|████▊     | 5219/10986 [2:00:21<2:08:52,  1.34s/it]

training loss: 3.4016032218933105


training:  48%|████▊     | 5220/10986 [2:00:23<2:08:19,  1.34s/it]

training loss: 3.4213407039642334
valid loss: 3.413663625717163
perplexity: 30.376327514648438


training:  48%|████▊     | 5221/10986 [2:00:26<2:51:05,  1.78s/it]

training loss: 3.448469638824463


training:  48%|████▊     | 5222/10986 [2:00:27<2:49:39,  1.77s/it]

training loss: 3.3758652210235596


training:  48%|████▊     | 5223/10986 [2:00:29<2:36:35,  1.63s/it]

training loss: 3.298318862915039


training:  48%|████▊     | 5224/10986 [2:00:30<2:28:07,  1.54s/it]

training loss: 3.339777708053589


training:  48%|████▊     | 5225/10986 [2:00:31<2:23:26,  1.49s/it]

training loss: 3.439790964126587


training:  48%|████▊     | 5226/10986 [2:00:33<2:18:53,  1.45s/it]

training loss: 3.2897775173187256


training:  48%|████▊     | 5227/10986 [2:00:34<2:15:12,  1.41s/it]

training loss: 3.328127145767212


training:  48%|████▊     | 5228/10986 [2:00:35<2:12:48,  1.38s/it]

training loss: 3.3316643238067627


training:  48%|████▊     | 5229/10986 [2:00:37<2:11:14,  1.37s/it]

training loss: 3.400540351867676


training:  48%|████▊     | 5230/10986 [2:00:38<2:10:17,  1.36s/it]

training loss: 3.3095903396606445


training:  48%|████▊     | 5231/10986 [2:00:40<2:17:27,  1.43s/it]

training loss: 3.4000768661499023


training:  48%|████▊     | 5232/10986 [2:00:41<2:15:16,  1.41s/it]

training loss: 3.37679386138916


training:  48%|████▊     | 5233/10986 [2:00:42<2:13:14,  1.39s/it]

training loss: 3.3274478912353516


training:  48%|████▊     | 5234/10986 [2:00:44<2:11:02,  1.37s/it]

training loss: 3.46716046333313


training:  48%|████▊     | 5235/10986 [2:00:45<2:09:35,  1.35s/it]

training loss: 3.405892848968506


training:  48%|████▊     | 5236/10986 [2:00:46<2:08:45,  1.34s/it]

training loss: 3.399595260620117


training:  48%|████▊     | 5237/10986 [2:00:48<2:08:37,  1.34s/it]

training loss: 3.359025478363037


training:  48%|████▊     | 5238/10986 [2:00:49<2:08:10,  1.34s/it]

training loss: 3.512507677078247


training:  48%|████▊     | 5239/10986 [2:00:50<2:08:28,  1.34s/it]

training loss: 3.486077308654785


training:  48%|████▊     | 5240/10986 [2:00:52<2:08:12,  1.34s/it]

training loss: 3.4029624462127686
valid loss: 3.397120237350464
perplexity: 29.8779354095459


training:  48%|████▊     | 5241/10986 [2:00:54<2:53:03,  1.81s/it]

training loss: 3.435704469680786


training:  48%|████▊     | 5242/10986 [2:00:56<2:41:46,  1.69s/it]

training loss: 3.4748077392578125


training:  48%|████▊     | 5243/10986 [2:00:57<2:31:48,  1.59s/it]

training loss: 3.3444955348968506


training:  48%|████▊     | 5244/10986 [2:00:59<2:24:16,  1.51s/it]

training loss: 3.3564884662628174


training:  48%|████▊     | 5245/10986 [2:01:00<2:18:55,  1.45s/it]

training loss: 3.4450414180755615


training:  48%|████▊     | 5246/10986 [2:01:01<2:16:08,  1.42s/it]

training loss: 3.4798736572265625


training:  48%|████▊     | 5247/10986 [2:01:03<2:22:30,  1.49s/it]

training loss: 3.3956711292266846


training:  48%|████▊     | 5248/10986 [2:01:05<2:30:23,  1.57s/it]

training loss: 3.3089704513549805


training:  48%|████▊     | 5249/10986 [2:01:06<2:30:18,  1.57s/it]

training loss: 3.304931879043579


training:  48%|████▊     | 5250/10986 [2:01:08<2:23:20,  1.50s/it]

training loss: 3.400392770767212


training:  48%|████▊     | 5251/10986 [2:01:09<2:27:32,  1.54s/it]

training loss: 3.272407293319702


training:  48%|████▊     | 5252/10986 [2:01:11<2:21:57,  1.49s/it]

training loss: 3.326470375061035


training:  48%|████▊     | 5253/10986 [2:01:12<2:17:26,  1.44s/it]

training loss: 3.4443557262420654


training:  48%|████▊     | 5254/10986 [2:01:13<2:14:35,  1.41s/it]

training loss: 3.4854726791381836


training:  48%|████▊     | 5255/10986 [2:01:15<2:12:21,  1.39s/it]

training loss: 3.4065213203430176


training:  48%|████▊     | 5256/10986 [2:01:16<2:10:45,  1.37s/it]

training loss: 3.3881711959838867


training:  48%|████▊     | 5257/10986 [2:01:17<2:09:48,  1.36s/it]

training loss: 3.436293840408325


training:  48%|████▊     | 5258/10986 [2:01:19<2:09:26,  1.36s/it]

training loss: 3.3776469230651855


training:  48%|████▊     | 5259/10986 [2:01:20<2:09:16,  1.35s/it]

training loss: 3.387295961380005


training:  48%|████▊     | 5260/10986 [2:01:21<2:09:07,  1.35s/it]

training loss: 3.5380849838256836
valid loss: 3.5346906185150146
perplexity: 34.28440856933594


training:  48%|████▊     | 5261/10986 [2:01:24<2:51:12,  1.79s/it]

training loss: 3.4562721252441406


training:  48%|████▊     | 5262/10986 [2:01:25<2:39:45,  1.67s/it]

training loss: 3.2838985919952393


training:  48%|████▊     | 5263/10986 [2:01:27<2:29:53,  1.57s/it]

training loss: 3.439974784851074


training:  48%|████▊     | 5264/10986 [2:01:28<2:22:06,  1.49s/it]

training loss: 3.268219470977783


training:  48%|████▊     | 5265/10986 [2:01:29<2:17:28,  1.44s/it]

training loss: 3.4043467044830322


training:  48%|████▊     | 5266/10986 [2:01:31<2:14:25,  1.41s/it]

training loss: 3.4378139972686768


training:  48%|████▊     | 5267/10986 [2:01:32<2:13:49,  1.40s/it]

training loss: 3.381009817123413


training:  48%|████▊     | 5268/10986 [2:01:33<2:12:27,  1.39s/it]

training loss: 3.42423415184021


training:  48%|████▊     | 5269/10986 [2:01:35<2:10:18,  1.37s/it]

training loss: 3.4083714485168457


training:  48%|████▊     | 5270/10986 [2:01:36<2:09:17,  1.36s/it]

training loss: 3.3907041549682617


training:  48%|████▊     | 5271/10986 [2:01:38<2:16:50,  1.44s/it]

training loss: 3.3076674938201904


training:  48%|████▊     | 5272/10986 [2:01:39<2:15:12,  1.42s/it]

training loss: 3.3752527236938477


training:  48%|████▊     | 5273/10986 [2:01:40<2:12:40,  1.39s/it]

training loss: 3.267958164215088


training:  48%|████▊     | 5274/10986 [2:01:42<2:10:53,  1.37s/it]

training loss: 3.5080113410949707


training:  48%|████▊     | 5275/10986 [2:01:43<2:09:57,  1.37s/it]

training loss: 3.304560661315918


training:  48%|████▊     | 5276/10986 [2:01:44<2:09:33,  1.36s/it]

training loss: 3.357110023498535


training:  48%|████▊     | 5277/10986 [2:01:46<2:10:14,  1.37s/it]

training loss: 3.402318000793457


training:  48%|████▊     | 5278/10986 [2:01:47<2:08:50,  1.35s/it]

training loss: 3.2681236267089844


training:  48%|████▊     | 5279/10986 [2:01:49<2:08:11,  1.35s/it]

training loss: 3.235541820526123


training:  48%|████▊     | 5280/10986 [2:01:50<2:07:29,  1.34s/it]

training loss: 3.30092716217041
valid loss: 3.3001022338867188
perplexity: 27.11540985107422


training:  48%|████▊     | 5281/10986 [2:01:53<2:49:29,  1.78s/it]

training loss: 3.3438925743103027


training:  48%|████▊     | 5282/10986 [2:01:54<2:46:25,  1.75s/it]

training loss: 3.3617751598358154


training:  48%|████▊     | 5283/10986 [2:01:56<2:34:20,  1.62s/it]

training loss: 3.4113216400146484


training:  48%|████▊     | 5284/10986 [2:01:57<2:25:52,  1.54s/it]

training loss: 3.5676915645599365


training:  48%|████▊     | 5285/10986 [2:01:58<2:20:03,  1.47s/it]

training loss: 3.486941337585449


training:  48%|████▊     | 5286/10986 [2:02:00<2:15:25,  1.43s/it]

training loss: 3.4539215564727783


training:  48%|████▊     | 5287/10986 [2:02:01<2:12:28,  1.39s/it]

training loss: 3.369904041290283


training:  48%|████▊     | 5288/10986 [2:02:02<2:12:28,  1.40s/it]

training loss: 3.381244659423828


training:  48%|████▊     | 5289/10986 [2:02:04<2:10:33,  1.38s/it]

training loss: 3.374948501586914


training:  48%|████▊     | 5290/10986 [2:02:05<2:09:03,  1.36s/it]

training loss: 3.3680968284606934


training:  48%|████▊     | 5291/10986 [2:02:07<2:16:10,  1.43s/it]

training loss: 3.468438148498535


training:  48%|████▊     | 5292/10986 [2:02:08<2:14:16,  1.41s/it]

training loss: 3.466207504272461


training:  48%|████▊     | 5293/10986 [2:02:09<2:12:16,  1.39s/it]

training loss: 3.487398147583008


training:  48%|████▊     | 5294/10986 [2:02:11<2:10:58,  1.38s/it]

training loss: 3.42061710357666


training:  48%|████▊     | 5295/10986 [2:02:12<2:09:40,  1.37s/it]

training loss: 3.437201976776123


training:  48%|████▊     | 5296/10986 [2:02:13<2:08:08,  1.35s/it]

training loss: 3.323324203491211


training:  48%|████▊     | 5297/10986 [2:02:15<2:07:31,  1.35s/it]

training loss: 3.4323363304138184


training:  48%|████▊     | 5298/10986 [2:02:16<2:07:25,  1.34s/it]

training loss: 3.525923013687134


training:  48%|████▊     | 5299/10986 [2:02:17<2:07:12,  1.34s/it]

training loss: 3.3853774070739746


training:  48%|████▊     | 5300/10986 [2:02:19<2:06:58,  1.34s/it]

training loss: 3.2733795642852783
valid loss: 3.2658302783966064
perplexity: 26.20185661315918


training:  48%|████▊     | 5301/10986 [2:02:22<2:49:23,  1.79s/it]

training loss: 3.3751158714294434


training:  48%|████▊     | 5302/10986 [2:02:23<2:46:15,  1.76s/it]

training loss: 3.5337400436401367


training:  48%|████▊     | 5303/10986 [2:02:25<2:35:07,  1.64s/it]

training loss: 3.3688125610351562


training:  48%|████▊     | 5304/10986 [2:02:26<2:26:09,  1.54s/it]

training loss: 3.433175802230835


training:  48%|████▊     | 5305/10986 [2:02:27<2:20:04,  1.48s/it]

training loss: 3.3874435424804688


training:  48%|████▊     | 5306/10986 [2:02:29<2:15:35,  1.43s/it]

training loss: 3.4273524284362793


training:  48%|████▊     | 5307/10986 [2:02:30<2:11:47,  1.39s/it]

training loss: 3.431520700454712


training:  48%|████▊     | 5308/10986 [2:02:31<2:10:51,  1.38s/it]

training loss: 3.403984546661377


training:  48%|████▊     | 5309/10986 [2:02:33<2:09:43,  1.37s/it]

training loss: 3.513749361038208


training:  48%|████▊     | 5310/10986 [2:02:34<2:08:55,  1.36s/it]

training loss: 3.3974859714508057


training:  48%|████▊     | 5311/10986 [2:02:36<2:16:37,  1.44s/it]

training loss: 3.323164939880371


training:  48%|████▊     | 5312/10986 [2:02:37<2:14:19,  1.42s/it]

training loss: 3.3848042488098145


training:  48%|████▊     | 5313/10986 [2:02:38<2:12:10,  1.40s/it]

training loss: 3.4559781551361084


training:  48%|████▊     | 5314/10986 [2:02:40<2:10:27,  1.38s/it]

training loss: 3.322950601577759


training:  48%|████▊     | 5315/10986 [2:02:41<2:09:24,  1.37s/it]

training loss: 3.4735684394836426


training:  48%|████▊     | 5316/10986 [2:02:42<2:08:43,  1.36s/it]

training loss: 3.439284086227417


training:  48%|████▊     | 5317/10986 [2:02:44<2:08:22,  1.36s/it]

training loss: 3.305711507797241


training:  48%|████▊     | 5318/10986 [2:02:45<2:08:44,  1.36s/it]

training loss: 3.4330968856811523


training:  48%|████▊     | 5319/10986 [2:02:46<2:07:41,  1.35s/it]

training loss: 3.5787220001220703


training:  48%|████▊     | 5320/10986 [2:02:48<2:07:08,  1.35s/it]

training loss: 3.4706473350524902
valid loss: 3.4646387100219727
perplexity: 31.964908599853516


training:  48%|████▊     | 5321/10986 [2:02:50<2:49:45,  1.80s/it]

training loss: 3.428396701812744


training:  48%|████▊     | 5322/10986 [2:02:52<2:46:18,  1.76s/it]

training loss: 3.332426071166992


training:  48%|████▊     | 5323/10986 [2:02:54<2:34:04,  1.63s/it]

training loss: 3.330504894256592


training:  48%|████▊     | 5324/10986 [2:02:55<2:25:50,  1.55s/it]

training loss: 3.494281530380249


training:  48%|████▊     | 5325/10986 [2:02:56<2:19:39,  1.48s/it]

training loss: 3.456958293914795


training:  48%|████▊     | 5326/10986 [2:02:58<2:16:00,  1.44s/it]

training loss: 3.472606658935547


training:  48%|████▊     | 5327/10986 [2:02:59<2:13:11,  1.41s/it]

training loss: 3.393448829650879


training:  48%|████▊     | 5328/10986 [2:03:00<2:11:16,  1.39s/it]

training loss: 3.4085922241210938


training:  49%|████▊     | 5329/10986 [2:03:02<2:09:18,  1.37s/it]

training loss: 3.4497859477996826


training:  49%|████▊     | 5330/10986 [2:03:03<2:09:52,  1.38s/it]

training loss: 3.3987653255462646


training:  49%|████▊     | 5331/10986 [2:03:05<2:16:58,  1.45s/it]

training loss: 3.3204140663146973


training:  49%|████▊     | 5332/10986 [2:03:06<2:21:36,  1.50s/it]

training loss: 3.445589303970337


training:  49%|████▊     | 5333/10986 [2:03:08<2:17:11,  1.46s/it]

training loss: 3.4042887687683105


training:  49%|████▊     | 5334/10986 [2:03:09<2:14:08,  1.42s/it]

training loss: 3.465798854827881


training:  49%|████▊     | 5335/10986 [2:03:10<2:11:18,  1.39s/it]

training loss: 3.481288433074951


training:  49%|████▊     | 5336/10986 [2:03:12<2:09:49,  1.38s/it]

training loss: 3.5338308811187744


training:  49%|████▊     | 5337/10986 [2:03:13<2:08:58,  1.37s/it]

training loss: 3.387324333190918


training:  49%|████▊     | 5338/10986 [2:03:14<2:08:11,  1.36s/it]

training loss: 3.5426666736602783


training:  49%|████▊     | 5339/10986 [2:03:16<2:06:43,  1.35s/it]

training loss: 3.311908483505249


training:  49%|████▊     | 5340/10986 [2:03:17<2:06:24,  1.34s/it]

training loss: 3.2905025482177734
valid loss: 3.287468910217285
perplexity: 26.775007247924805


training:  49%|████▊     | 5341/10986 [2:03:20<2:48:57,  1.80s/it]

training loss: 3.3879456520080566


training:  49%|████▊     | 5342/10986 [2:03:21<2:38:18,  1.68s/it]

training loss: 3.399625301361084


training:  49%|████▊     | 5343/10986 [2:03:22<2:28:52,  1.58s/it]

training loss: 3.349132776260376


training:  49%|████▊     | 5344/10986 [2:03:24<2:21:27,  1.50s/it]

training loss: 3.3332648277282715


training:  49%|████▊     | 5345/10986 [2:03:25<2:16:20,  1.45s/it]

training loss: 3.4323766231536865


training:  49%|████▊     | 5346/10986 [2:03:26<2:13:30,  1.42s/it]

training loss: 3.4211578369140625


training:  49%|████▊     | 5347/10986 [2:03:28<2:10:38,  1.39s/it]

training loss: 3.368887186050415


training:  49%|████▊     | 5348/10986 [2:03:29<2:09:07,  1.37s/it]

training loss: 3.3966007232666016


training:  49%|████▊     | 5349/10986 [2:03:30<2:07:50,  1.36s/it]

training loss: 3.4501585960388184


training:  49%|████▊     | 5350/10986 [2:03:32<2:06:53,  1.35s/it]

training loss: 3.3306775093078613


training:  49%|████▊     | 5351/10986 [2:03:33<2:15:58,  1.45s/it]

training loss: 3.339120864868164


training:  49%|████▊     | 5352/10986 [2:03:35<2:19:40,  1.49s/it]

training loss: 3.4662251472473145


training:  49%|████▊     | 5353/10986 [2:03:36<2:15:37,  1.44s/it]

training loss: 3.417922258377075


training:  49%|████▊     | 5354/10986 [2:03:38<2:12:25,  1.41s/it]

training loss: 3.346684455871582


training:  49%|████▊     | 5355/10986 [2:03:39<2:10:15,  1.39s/it]

training loss: 3.4033873081207275


training:  49%|████▉     | 5356/10986 [2:03:40<2:08:25,  1.37s/it]

training loss: 3.4460902214050293


training:  49%|████▉     | 5357/10986 [2:03:42<2:07:26,  1.36s/it]

training loss: 3.3525757789611816


training:  49%|████▉     | 5358/10986 [2:03:43<2:06:21,  1.35s/it]

training loss: 3.4194416999816895


training:  49%|████▉     | 5359/10986 [2:03:44<2:06:09,  1.35s/it]

training loss: 3.4094529151916504


training:  49%|████▉     | 5360/10986 [2:03:46<2:05:24,  1.34s/it]

training loss: 3.3731396198272705
valid loss: 3.3689627647399902
perplexity: 29.048383712768555


training:  49%|████▉     | 5361/10986 [2:03:49<2:47:28,  1.79s/it]

training loss: 3.441080093383789


training:  49%|████▉     | 5362/10986 [2:03:50<2:37:03,  1.68s/it]

training loss: 3.502389907836914


training:  49%|████▉     | 5363/10986 [2:03:51<2:27:52,  1.58s/it]

training loss: 3.2996673583984375


training:  49%|████▉     | 5364/10986 [2:03:53<2:20:44,  1.50s/it]

training loss: 3.329251766204834


training:  49%|████▉     | 5365/10986 [2:03:54<2:16:26,  1.46s/it]

training loss: 3.368441343307495


training:  49%|████▉     | 5366/10986 [2:03:55<2:13:24,  1.42s/it]

training loss: 3.3493828773498535


training:  49%|████▉     | 5367/10986 [2:03:57<2:10:29,  1.39s/it]

training loss: 3.2827258110046387


training:  49%|████▉     | 5368/10986 [2:03:58<2:09:51,  1.39s/it]

training loss: 3.3558597564697266


training:  49%|████▉     | 5369/10986 [2:03:59<2:08:04,  1.37s/it]

training loss: 3.369699716567993


training:  49%|████▉     | 5370/10986 [2:04:01<2:06:51,  1.36s/it]

training loss: 3.3958113193511963


training:  49%|████▉     | 5371/10986 [2:04:02<2:13:53,  1.43s/it]

training loss: 3.394301176071167


training:  49%|████▉     | 5372/10986 [2:04:04<2:12:23,  1.41s/it]

training loss: 3.4697418212890625


training:  49%|████▉     | 5373/10986 [2:04:05<2:09:33,  1.38s/it]

training loss: 3.302515745162964


training:  49%|████▉     | 5374/10986 [2:04:06<2:08:08,  1.37s/it]

training loss: 3.4350240230560303


training:  49%|████▉     | 5375/10986 [2:04:08<2:06:58,  1.36s/it]

training loss: 3.355062484741211


training:  49%|████▉     | 5376/10986 [2:04:09<2:06:30,  1.35s/it]

training loss: 3.4425175189971924


training:  49%|████▉     | 5377/10986 [2:04:10<2:05:43,  1.34s/it]

training loss: 3.496303081512451


training:  49%|████▉     | 5378/10986 [2:04:12<2:04:52,  1.34s/it]

training loss: 3.4253344535827637


training:  49%|████▉     | 5379/10986 [2:04:13<2:04:18,  1.33s/it]

training loss: 3.463493824005127


training:  49%|████▉     | 5380/10986 [2:04:14<2:04:27,  1.33s/it]

training loss: 3.399068593978882
valid loss: 3.3965327739715576
perplexity: 29.86038589477539


training:  49%|████▉     | 5381/10986 [2:04:17<2:47:11,  1.79s/it]

training loss: 3.4363222122192383


training:  49%|████▉     | 5382/10986 [2:04:19<2:37:06,  1.68s/it]

training loss: 3.5321853160858154


training:  49%|████▉     | 5383/10986 [2:04:20<2:27:33,  1.58s/it]

training loss: 3.3729407787323


training:  49%|████▉     | 5384/10986 [2:04:21<2:20:15,  1.50s/it]

training loss: 3.4548864364624023


training:  49%|████▉     | 5385/10986 [2:04:23<2:15:09,  1.45s/it]

training loss: 3.440033197402954


training:  49%|████▉     | 5386/10986 [2:04:24<2:11:10,  1.41s/it]

training loss: 3.3539581298828125


training:  49%|████▉     | 5387/10986 [2:04:25<2:08:34,  1.38s/it]

training loss: 3.4555790424346924


training:  49%|████▉     | 5388/10986 [2:04:26<2:07:06,  1.36s/it]

training loss: 3.572460174560547


training:  49%|████▉     | 5389/10986 [2:04:28<2:05:54,  1.35s/it]

training loss: 3.3170862197875977


training:  49%|████▉     | 5390/10986 [2:04:29<2:05:10,  1.34s/it]

training loss: 3.2890543937683105


training:  49%|████▉     | 5391/10986 [2:04:31<2:25:10,  1.56s/it]

training loss: 3.5229294300079346


training:  49%|████▉     | 5392/10986 [2:04:33<2:39:30,  1.71s/it]

training loss: 3.398364543914795


training:  49%|████▉     | 5393/10986 [2:04:35<2:28:29,  1.59s/it]

training loss: 3.445899248123169


training:  49%|████▉     | 5394/10986 [2:04:36<2:20:49,  1.51s/it]

training loss: 3.3728644847869873


training:  49%|████▉     | 5395/10986 [2:04:37<2:16:03,  1.46s/it]

training loss: 3.3926713466644287


training:  49%|████▉     | 5396/10986 [2:04:39<2:11:58,  1.42s/it]

training loss: 3.504110813140869


training:  49%|████▉     | 5397/10986 [2:04:40<2:09:11,  1.39s/it]

training loss: 3.3427670001983643


training:  49%|████▉     | 5398/10986 [2:04:41<2:07:36,  1.37s/it]

training loss: 3.4007320404052734


training:  49%|████▉     | 5399/10986 [2:04:43<2:06:17,  1.36s/it]

training loss: 3.482584238052368


training:  49%|████▉     | 5400/10986 [2:04:44<2:04:25,  1.34s/it]

training loss: 3.306048631668091
valid loss: 3.304137706756592
perplexity: 27.225055694580078


training:  49%|████▉     | 5401/10986 [2:04:47<2:46:06,  1.78s/it]

training loss: 3.4218740463256836


training:  49%|████▉     | 5402/10986 [2:04:48<2:35:10,  1.67s/it]

training loss: 3.4289872646331787


training:  49%|████▉     | 5403/10986 [2:04:49<2:26:02,  1.57s/it]

training loss: 3.3447420597076416


training:  49%|████▉     | 5404/10986 [2:04:51<2:19:51,  1.50s/it]

training loss: 3.459775686264038


training:  49%|████▉     | 5405/10986 [2:04:52<2:14:42,  1.45s/it]

training loss: 3.450361728668213


training:  49%|████▉     | 5406/10986 [2:04:53<2:11:32,  1.41s/it]

training loss: 3.4178926944732666


training:  49%|████▉     | 5407/10986 [2:04:55<2:09:24,  1.39s/it]

training loss: 3.539841651916504


training:  49%|████▉     | 5408/10986 [2:04:56<2:07:36,  1.37s/it]

training loss: 3.4741177558898926


training:  49%|████▉     | 5409/10986 [2:04:57<2:06:08,  1.36s/it]

training loss: 3.36897349357605


training:  49%|████▉     | 5410/10986 [2:04:59<2:06:23,  1.36s/it]

training loss: 3.5013105869293213


training:  49%|████▉     | 5411/10986 [2:05:00<2:13:07,  1.43s/it]

training loss: 3.3662970066070557


training:  49%|████▉     | 5412/10986 [2:05:02<2:12:42,  1.43s/it]

training loss: 3.5058887004852295


training:  49%|████▉     | 5413/10986 [2:05:03<2:10:04,  1.40s/it]

training loss: 3.3657140731811523


training:  49%|████▉     | 5414/10986 [2:05:04<2:08:51,  1.39s/it]

training loss: 3.390324592590332


training:  49%|████▉     | 5415/10986 [2:05:06<2:06:53,  1.37s/it]

training loss: 3.2541301250457764


training:  49%|████▉     | 5416/10986 [2:05:07<2:05:54,  1.36s/it]

training loss: 3.30698561668396


training:  49%|████▉     | 5417/10986 [2:05:08<2:04:24,  1.34s/it]

training loss: 3.3107385635375977


training:  49%|████▉     | 5418/10986 [2:05:10<2:03:26,  1.33s/it]

training loss: 3.4192423820495605


training:  49%|████▉     | 5419/10986 [2:05:11<2:03:02,  1.33s/it]

training loss: 3.4576034545898438


training:  49%|████▉     | 5420/10986 [2:05:12<2:03:11,  1.33s/it]

training loss: 3.4119482040405273
valid loss: 3.4098637104034424
perplexity: 30.261119842529297


training:  49%|████▉     | 5421/10986 [2:05:15<2:45:12,  1.78s/it]

training loss: 3.401028871536255


training:  49%|████▉     | 5422/10986 [2:05:17<2:38:12,  1.71s/it]

training loss: 3.4189555644989014


training:  49%|████▉     | 5423/10986 [2:05:18<2:28:46,  1.60s/it]

training loss: 3.351635694503784


training:  49%|████▉     | 5424/10986 [2:05:19<2:21:18,  1.52s/it]

training loss: 3.437533378601074


training:  49%|████▉     | 5425/10986 [2:05:21<2:16:39,  1.47s/it]

training loss: 3.38594913482666


training:  49%|████▉     | 5426/10986 [2:05:22<2:13:13,  1.44s/it]

training loss: 3.3947513103485107


training:  49%|████▉     | 5427/10986 [2:05:24<2:11:19,  1.42s/it]

training loss: 3.393765687942505


training:  49%|████▉     | 5428/10986 [2:05:25<2:08:52,  1.39s/it]

training loss: 3.435755729675293


training:  49%|████▉     | 5429/10986 [2:05:26<2:07:19,  1.37s/it]

training loss: 3.330005645751953


training:  49%|████▉     | 5430/10986 [2:05:28<2:06:38,  1.37s/it]

training loss: 3.44683837890625


training:  49%|████▉     | 5431/10986 [2:05:29<2:14:20,  1.45s/it]

training loss: 3.488495349884033


training:  49%|████▉     | 5432/10986 [2:05:31<2:11:29,  1.42s/it]

training loss: 3.3832201957702637


training:  49%|████▉     | 5433/10986 [2:05:32<2:09:16,  1.40s/it]

training loss: 3.4155077934265137


training:  49%|████▉     | 5434/10986 [2:05:33<2:07:53,  1.38s/it]

training loss: 3.3393146991729736


training:  49%|████▉     | 5435/10986 [2:05:35<2:07:20,  1.38s/it]

training loss: 3.410220146179199


training:  49%|████▉     | 5436/10986 [2:05:36<2:06:14,  1.36s/it]

training loss: 3.479496717453003


training:  49%|████▉     | 5437/10986 [2:05:37<2:05:37,  1.36s/it]

training loss: 3.380608320236206


training:  49%|████▉     | 5438/10986 [2:05:39<2:04:54,  1.35s/it]

training loss: 3.4217560291290283


training:  50%|████▉     | 5439/10986 [2:05:40<2:04:58,  1.35s/it]

training loss: 3.4512884616851807


training:  50%|████▉     | 5440/10986 [2:05:41<2:05:06,  1.35s/it]

training loss: 3.3664989471435547
valid loss: 3.3636233806610107
perplexity: 28.893693923950195


training:  50%|████▉     | 5441/10986 [2:05:44<2:48:28,  1.82s/it]

training loss: 3.3693790435791016


training:  50%|████▉     | 5442/10986 [2:05:46<2:38:44,  1.72s/it]

training loss: 3.363178014755249


training:  50%|████▉     | 5443/10986 [2:05:48<2:41:28,  1.75s/it]

training loss: 3.3541650772094727


training:  50%|████▉     | 5444/10986 [2:05:49<2:30:35,  1.63s/it]

training loss: 3.455636501312256


training:  50%|████▉     | 5445/10986 [2:05:50<2:22:28,  1.54s/it]

training loss: 3.3690948486328125


training:  50%|████▉     | 5446/10986 [2:05:52<2:17:15,  1.49s/it]

training loss: 3.3391711711883545


training:  50%|████▉     | 5447/10986 [2:05:53<2:14:21,  1.46s/it]

training loss: 3.6067774295806885


training:  50%|████▉     | 5448/10986 [2:05:54<2:11:50,  1.43s/it]

training loss: 3.3054275512695312


training:  50%|████▉     | 5449/10986 [2:05:56<2:09:03,  1.40s/it]

training loss: 3.459611415863037


training:  50%|████▉     | 5450/10986 [2:05:57<2:07:29,  1.38s/it]

training loss: 3.4374473094940186


training:  50%|████▉     | 5451/10986 [2:05:59<2:13:36,  1.45s/it]

training loss: 3.3794338703155518


training:  50%|████▉     | 5452/10986 [2:06:00<2:12:23,  1.44s/it]

training loss: 3.427460193634033


training:  50%|████▉     | 5453/10986 [2:06:01<2:10:19,  1.41s/it]

training loss: 3.3278932571411133


training:  50%|████▉     | 5454/10986 [2:06:03<2:07:46,  1.39s/it]

training loss: 3.4236104488372803


training:  50%|████▉     | 5455/10986 [2:06:04<2:06:17,  1.37s/it]

training loss: 3.4969639778137207


training:  50%|████▉     | 5456/10986 [2:06:05<2:06:17,  1.37s/it]

training loss: 3.3355071544647217


training:  50%|████▉     | 5457/10986 [2:06:07<2:05:09,  1.36s/it]

training loss: 3.367401123046875


training:  50%|████▉     | 5458/10986 [2:06:08<2:04:14,  1.35s/it]

training loss: 3.469111442565918


training:  50%|████▉     | 5459/10986 [2:06:09<2:03:43,  1.34s/it]

training loss: 3.3672733306884766


training:  50%|████▉     | 5460/10986 [2:06:11<2:03:19,  1.34s/it]

training loss: 3.347959518432617
valid loss: 3.3475730419158936
perplexity: 28.433643341064453


training:  50%|████▉     | 5461/10986 [2:06:14<2:44:06,  1.78s/it]

training loss: 3.250770092010498


training:  50%|████▉     | 5462/10986 [2:06:15<2:42:19,  1.76s/it]

training loss: 3.3771865367889404


training:  50%|████▉     | 5463/10986 [2:06:17<2:31:37,  1.65s/it]

training loss: 3.3473339080810547


training:  50%|████▉     | 5464/10986 [2:06:18<2:22:47,  1.55s/it]

training loss: 3.4637985229492188


training:  50%|████▉     | 5465/10986 [2:06:19<2:16:48,  1.49s/it]

training loss: 3.449514389038086


training:  50%|████▉     | 5466/10986 [2:06:21<2:12:41,  1.44s/it]

training loss: 3.5115809440612793


training:  50%|████▉     | 5467/10986 [2:06:22<2:10:06,  1.41s/it]

training loss: 3.4989383220672607


training:  50%|████▉     | 5468/10986 [2:06:23<2:08:02,  1.39s/it]

training loss: 3.4061524868011475


training:  50%|████▉     | 5469/10986 [2:06:25<2:06:34,  1.38s/it]

training loss: 3.269871234893799


training:  50%|████▉     | 5470/10986 [2:06:26<2:05:26,  1.36s/it]

training loss: 3.412114143371582


training:  50%|████▉     | 5471/10986 [2:06:28<2:12:52,  1.45s/it]

training loss: 3.4797866344451904


training:  50%|████▉     | 5472/10986 [2:06:29<2:19:33,  1.52s/it]

training loss: 3.3730831146240234


training:  50%|████▉     | 5473/10986 [2:06:31<2:14:48,  1.47s/it]

training loss: 3.4316530227661133


training:  50%|████▉     | 5474/10986 [2:06:32<2:11:08,  1.43s/it]

training loss: 3.3654491901397705


training:  50%|████▉     | 5475/10986 [2:06:33<2:08:25,  1.40s/it]

training loss: 3.44572377204895


training:  50%|████▉     | 5476/10986 [2:06:35<2:06:55,  1.38s/it]

training loss: 3.403933048248291


training:  50%|████▉     | 5477/10986 [2:06:36<2:05:45,  1.37s/it]

training loss: 3.3622846603393555


training:  50%|████▉     | 5478/10986 [2:06:37<2:05:26,  1.37s/it]

training loss: 3.459810256958008


training:  50%|████▉     | 5479/10986 [2:06:39<2:04:47,  1.36s/it]

training loss: 3.4666590690612793


training:  50%|████▉     | 5480/10986 [2:06:40<2:04:26,  1.36s/it]

training loss: 3.3976693153381348
valid loss: 3.397259473800659
perplexity: 29.882095336914062


training:  50%|████▉     | 5481/10986 [2:06:43<2:46:45,  1.82s/it]

training loss: 3.373819589614868


training:  50%|████▉     | 5482/10986 [2:06:44<2:36:47,  1.71s/it]

training loss: 3.373506784439087


training:  50%|████▉     | 5483/10986 [2:06:46<2:26:48,  1.60s/it]

training loss: 3.4580554962158203


training:  50%|████▉     | 5484/10986 [2:06:47<2:19:44,  1.52s/it]

training loss: 3.3869740962982178


training:  50%|████▉     | 5485/10986 [2:06:48<2:14:17,  1.46s/it]

training loss: 3.42071795463562


training:  50%|████▉     | 5486/10986 [2:06:50<2:10:26,  1.42s/it]

training loss: 3.3583643436431885


training:  50%|████▉     | 5487/10986 [2:06:51<2:08:00,  1.40s/it]

training loss: 3.3427200317382812


training:  50%|████▉     | 5488/10986 [2:06:52<2:06:43,  1.38s/it]

training loss: 3.4180281162261963


training:  50%|████▉     | 5489/10986 [2:06:54<2:06:08,  1.38s/it]

training loss: 3.41629958152771


training:  50%|████▉     | 5490/10986 [2:06:55<2:05:18,  1.37s/it]

training loss: 3.3830416202545166


training:  50%|████▉     | 5491/10986 [2:06:57<2:12:52,  1.45s/it]

training loss: 3.4505062103271484


training:  50%|████▉     | 5492/10986 [2:06:58<2:10:37,  1.43s/it]

training loss: 3.4360427856445312


training:  50%|█████     | 5493/10986 [2:07:00<2:08:17,  1.40s/it]

training loss: 3.3582470417022705


training:  50%|█████     | 5494/10986 [2:07:01<2:06:50,  1.39s/it]

training loss: 3.3964598178863525


training:  50%|█████     | 5495/10986 [2:07:02<2:05:48,  1.37s/it]

training loss: 3.453967332839966


training:  50%|█████     | 5496/10986 [2:07:04<2:05:10,  1.37s/it]

training loss: 3.493760108947754


training:  50%|█████     | 5497/10986 [2:07:05<2:04:05,  1.36s/it]

training loss: 3.4617369174957275


training:  50%|█████     | 5498/10986 [2:07:06<2:03:53,  1.35s/it]

training loss: 3.3516721725463867


training:  50%|█████     | 5499/10986 [2:07:08<2:04:00,  1.36s/it]

training loss: 3.4899322986602783


training:  50%|█████     | 5500/10986 [2:07:09<2:05:23,  1.37s/it]

training loss: 3.4706931114196777
valid loss: 3.457667112350464
perplexity: 31.74283790588379


training:  50%|█████     | 5501/10986 [2:07:12<2:47:06,  1.83s/it]

training loss: 3.4818782806396484


training:  50%|█████     | 5502/10986 [2:07:13<2:39:59,  1.75s/it]

training loss: 3.4338877201080322


training:  50%|█████     | 5503/10986 [2:07:15<2:28:47,  1.63s/it]

training loss: 3.337740898132324


training:  50%|█████     | 5504/10986 [2:07:16<2:20:22,  1.54s/it]

training loss: 3.382716178894043


training:  50%|█████     | 5505/10986 [2:07:17<2:14:41,  1.47s/it]

training loss: 3.4289121627807617


training:  50%|█████     | 5506/10986 [2:07:19<2:11:50,  1.44s/it]

training loss: 3.354861259460449


training:  50%|█████     | 5507/10986 [2:07:20<2:08:56,  1.41s/it]

training loss: 3.439486026763916


training:  50%|█████     | 5508/10986 [2:07:22<2:07:39,  1.40s/it]

training loss: 3.3081068992614746


training:  50%|█████     | 5509/10986 [2:07:23<2:06:22,  1.38s/it]

training loss: 3.260141372680664


training:  50%|█████     | 5510/10986 [2:07:24<2:05:30,  1.38s/it]

training loss: 3.4305710792541504


training:  50%|█████     | 5511/10986 [2:07:26<2:13:09,  1.46s/it]

training loss: 3.431406259536743


training:  50%|█████     | 5512/10986 [2:07:28<2:18:55,  1.52s/it]

training loss: 3.508917808532715


training:  50%|█████     | 5513/10986 [2:07:29<2:14:04,  1.47s/it]

training loss: 3.372694730758667


training:  50%|█████     | 5514/10986 [2:07:30<2:11:13,  1.44s/it]

training loss: 3.4141829013824463


training:  50%|█████     | 5515/10986 [2:07:32<2:08:33,  1.41s/it]

training loss: 3.323429584503174


training:  50%|█████     | 5516/10986 [2:07:33<2:06:35,  1.39s/it]

training loss: 3.404402256011963


training:  50%|█████     | 5517/10986 [2:07:34<2:05:27,  1.38s/it]

training loss: 3.4222092628479004


training:  50%|█████     | 5518/10986 [2:07:36<2:04:08,  1.36s/it]

training loss: 3.3810088634490967


training:  50%|█████     | 5519/10986 [2:07:37<2:03:51,  1.36s/it]

training loss: 3.397979736328125


training:  50%|█████     | 5520/10986 [2:07:38<2:03:55,  1.36s/it]

training loss: 3.4622933864593506
valid loss: 3.460719108581543
perplexity: 31.839862823486328


training:  50%|█████     | 5521/10986 [2:07:41<2:44:54,  1.81s/it]

training loss: 3.5483386516571045


training:  50%|█████     | 5522/10986 [2:07:43<2:34:55,  1.70s/it]

training loss: 3.371392011642456


training:  50%|█████     | 5523/10986 [2:07:44<2:25:31,  1.60s/it]

training loss: 3.4442999362945557


training:  50%|█████     | 5524/10986 [2:07:45<2:19:01,  1.53s/it]

training loss: 3.383985996246338


training:  50%|█████     | 5525/10986 [2:07:47<2:13:50,  1.47s/it]

training loss: 3.28731632232666


training:  50%|█████     | 5526/10986 [2:07:48<2:10:35,  1.44s/it]

training loss: 3.323150396347046


training:  50%|█████     | 5527/10986 [2:07:49<2:07:52,  1.41s/it]

training loss: 3.5056331157684326


training:  50%|█████     | 5528/10986 [2:07:51<2:06:33,  1.39s/it]

training loss: 3.430431842803955


training:  50%|█████     | 5529/10986 [2:07:52<2:05:19,  1.38s/it]

training loss: 3.4332313537597656


training:  50%|█████     | 5530/10986 [2:07:53<2:03:58,  1.36s/it]

training loss: 3.400881290435791


training:  50%|█████     | 5531/10986 [2:07:55<2:12:24,  1.46s/it]

training loss: 3.3835625648498535


training:  50%|█████     | 5532/10986 [2:07:57<2:21:00,  1.55s/it]

training loss: 3.3149514198303223


training:  50%|█████     | 5533/10986 [2:07:59<2:28:27,  1.63s/it]

training loss: 3.4808762073516846


training:  50%|█████     | 5534/10986 [2:08:00<2:28:55,  1.64s/it]

training loss: 3.390803098678589


training:  50%|█████     | 5535/10986 [2:08:02<2:21:41,  1.56s/it]

training loss: 3.4078612327575684


training:  50%|█████     | 5536/10986 [2:08:03<2:15:49,  1.50s/it]

training loss: 3.409708023071289


training:  50%|█████     | 5537/10986 [2:08:04<2:12:13,  1.46s/it]

training loss: 3.3666653633117676


training:  50%|█████     | 5538/10986 [2:08:06<2:09:05,  1.42s/it]

training loss: 3.359330654144287


training:  50%|█████     | 5539/10986 [2:08:07<2:06:42,  1.40s/it]

training loss: 3.340831756591797


training:  50%|█████     | 5540/10986 [2:08:08<2:05:07,  1.38s/it]

training loss: 3.3797812461853027
valid loss: 3.376114845275879
perplexity: 29.256881713867188


training:  50%|█████     | 5541/10986 [2:08:11<2:46:43,  1.84s/it]

training loss: 3.3812005519866943


training:  50%|█████     | 5542/10986 [2:08:13<2:36:15,  1.72s/it]

training loss: 3.4443860054016113


training:  50%|█████     | 5543/10986 [2:08:14<2:26:09,  1.61s/it]

training loss: 3.4175143241882324


training:  50%|█████     | 5544/10986 [2:08:16<2:18:47,  1.53s/it]

training loss: 3.406459093093872


training:  50%|█████     | 5545/10986 [2:08:17<2:14:19,  1.48s/it]

training loss: 3.38252592086792


training:  50%|█████     | 5546/10986 [2:08:18<2:10:47,  1.44s/it]

training loss: 3.460109233856201


training:  50%|█████     | 5547/10986 [2:08:20<2:08:16,  1.42s/it]

training loss: 3.341412305831909


training:  51%|█████     | 5548/10986 [2:08:21<2:06:16,  1.39s/it]

training loss: 3.479811429977417


training:  51%|█████     | 5549/10986 [2:08:22<2:05:32,  1.39s/it]

training loss: 3.360499858856201


training:  51%|█████     | 5550/10986 [2:08:24<2:04:23,  1.37s/it]

training loss: 3.3632349967956543


training:  51%|█████     | 5551/10986 [2:08:25<2:11:30,  1.45s/it]

training loss: 3.435976505279541


training:  51%|█████     | 5552/10986 [2:08:27<2:14:12,  1.48s/it]

training loss: 3.447824239730835


training:  51%|█████     | 5553/10986 [2:08:28<2:10:04,  1.44s/it]

training loss: 3.5044257640838623


training:  51%|█████     | 5554/10986 [2:08:30<2:07:37,  1.41s/it]

training loss: 3.3279531002044678


training:  51%|█████     | 5555/10986 [2:08:31<2:05:21,  1.38s/it]

training loss: 3.428011178970337


training:  51%|█████     | 5556/10986 [2:08:32<2:04:21,  1.37s/it]

training loss: 3.4390804767608643


training:  51%|█████     | 5557/10986 [2:08:34<2:04:10,  1.37s/it]

training loss: 3.5596108436584473


training:  51%|█████     | 5558/10986 [2:08:35<2:03:27,  1.36s/it]

training loss: 3.4637818336486816


training:  51%|█████     | 5559/10986 [2:08:36<2:03:02,  1.36s/it]

training loss: 3.4474806785583496


training:  51%|█████     | 5560/10986 [2:08:38<2:02:33,  1.36s/it]

training loss: 3.3814539909362793
valid loss: 3.3773534297943115
perplexity: 29.293142318725586


training:  51%|█████     | 5561/10986 [2:08:41<2:45:08,  1.83s/it]

training loss: 3.4407026767730713


training:  51%|█████     | 5562/10986 [2:08:42<2:41:17,  1.78s/it]

training loss: 3.369274854660034


training:  51%|█████     | 5563/10986 [2:08:44<2:28:51,  1.65s/it]

training loss: 3.4427311420440674


training:  51%|█████     | 5564/10986 [2:08:45<2:21:16,  1.56s/it]

training loss: 3.3962318897247314


training:  51%|█████     | 5565/10986 [2:08:46<2:15:02,  1.49s/it]

training loss: 3.3902130126953125


training:  51%|█████     | 5566/10986 [2:08:48<2:10:38,  1.45s/it]

training loss: 3.4354019165039062


training:  51%|█████     | 5567/10986 [2:08:49<2:07:39,  1.41s/it]

training loss: 3.4159841537475586


training:  51%|█████     | 5568/10986 [2:08:50<2:05:30,  1.39s/it]

training loss: 3.2736005783081055


training:  51%|█████     | 5569/10986 [2:08:52<2:04:06,  1.37s/it]

training loss: 3.329012632369995


training:  51%|█████     | 5570/10986 [2:08:53<2:03:00,  1.36s/it]

training loss: 3.481721878051758


training:  51%|█████     | 5571/10986 [2:08:55<2:10:38,  1.45s/it]

training loss: 3.4986114501953125


training:  51%|█████     | 5572/10986 [2:08:56<2:11:38,  1.46s/it]

training loss: 3.269289493560791


training:  51%|█████     | 5573/10986 [2:08:57<2:08:41,  1.43s/it]

training loss: 3.479579448699951


training:  51%|█████     | 5574/10986 [2:08:59<2:06:35,  1.40s/it]

training loss: 3.4428021907806396


training:  51%|█████     | 5575/10986 [2:09:00<2:04:37,  1.38s/it]

training loss: 3.467599391937256


training:  51%|█████     | 5576/10986 [2:09:01<2:03:11,  1.37s/it]

training loss: 3.3456337451934814


training:  51%|█████     | 5577/10986 [2:09:03<2:02:30,  1.36s/it]

training loss: 3.30625581741333


training:  51%|█████     | 5578/10986 [2:09:04<2:02:07,  1.35s/it]

training loss: 3.3523669242858887


training:  51%|█████     | 5579/10986 [2:09:05<2:02:13,  1.36s/it]

training loss: 3.3681888580322266


training:  51%|█████     | 5580/10986 [2:09:07<2:01:49,  1.35s/it]

training loss: 3.4862942695617676
valid loss: 3.4811506271362305
perplexity: 32.497093200683594


training:  51%|█████     | 5581/10986 [2:09:10<2:43:36,  1.82s/it]

training loss: 3.3885722160339355


training:  51%|█████     | 5582/10986 [2:09:11<2:33:49,  1.71s/it]

training loss: 3.3899543285369873


training:  51%|█████     | 5583/10986 [2:09:13<2:24:37,  1.61s/it]

training loss: 3.450777530670166


training:  51%|█████     | 5584/10986 [2:09:14<2:16:59,  1.52s/it]

training loss: 3.333998918533325


training:  51%|█████     | 5585/10986 [2:09:15<2:12:11,  1.47s/it]

training loss: 3.430934190750122


training:  51%|█████     | 5586/10986 [2:09:17<2:08:56,  1.43s/it]

training loss: 3.327854633331299


training:  51%|█████     | 5587/10986 [2:09:18<2:06:57,  1.41s/it]

training loss: 3.3930938243865967


training:  51%|█████     | 5588/10986 [2:09:19<2:05:27,  1.39s/it]

training loss: 3.4601709842681885


training:  51%|█████     | 5589/10986 [2:09:21<2:04:02,  1.38s/it]

training loss: 3.3344779014587402


training:  51%|█████     | 5590/10986 [2:09:22<2:03:03,  1.37s/it]

training loss: 3.425245523452759


training:  51%|█████     | 5591/10986 [2:09:24<2:10:25,  1.45s/it]

training loss: 3.3786728382110596


training:  51%|█████     | 5592/10986 [2:09:25<2:14:34,  1.50s/it]

training loss: 3.4339373111724854


training:  51%|█████     | 5593/10986 [2:09:27<2:10:50,  1.46s/it]

training loss: 3.551914691925049


training:  51%|█████     | 5594/10986 [2:09:28<2:07:48,  1.42s/it]

training loss: 3.487471342086792


training:  51%|█████     | 5595/10986 [2:09:29<2:05:39,  1.40s/it]

training loss: 3.350637435913086


training:  51%|█████     | 5596/10986 [2:09:31<2:04:10,  1.38s/it]

training loss: 3.332730531692505


training:  51%|█████     | 5597/10986 [2:09:32<2:03:20,  1.37s/it]

training loss: 3.291691780090332


training:  51%|█████     | 5598/10986 [2:09:33<2:02:07,  1.36s/it]

training loss: 3.3069961071014404


training:  51%|█████     | 5599/10986 [2:09:35<2:01:36,  1.35s/it]

training loss: 3.444488048553467


training:  51%|█████     | 5600/10986 [2:09:36<2:01:09,  1.35s/it]

training loss: 3.437556266784668
valid loss: 3.431830406188965
perplexity: 30.933210372924805


training:  51%|█████     | 5601/10986 [2:09:39<2:41:40,  1.80s/it]

training loss: 3.3197391033172607


training:  51%|█████     | 5602/10986 [2:09:40<2:34:23,  1.72s/it]

training loss: 3.407942056655884


training:  51%|█████     | 5603/10986 [2:09:42<2:24:32,  1.61s/it]

training loss: 3.438318967819214


training:  51%|█████     | 5604/10986 [2:09:43<2:17:05,  1.53s/it]

training loss: 3.3257699012756348


training:  51%|█████     | 5605/10986 [2:09:44<2:12:15,  1.47s/it]

training loss: 3.3933253288269043


training:  51%|█████     | 5606/10986 [2:09:46<2:08:08,  1.43s/it]

training loss: 3.4401192665100098


training:  51%|█████     | 5607/10986 [2:09:47<2:05:05,  1.40s/it]

training loss: 3.355175018310547


training:  51%|█████     | 5608/10986 [2:09:48<2:03:21,  1.38s/it]

training loss: 3.443770408630371


training:  51%|█████     | 5609/10986 [2:09:50<2:02:04,  1.36s/it]

training loss: 3.3943238258361816


training:  51%|█████     | 5610/10986 [2:09:51<2:01:04,  1.35s/it]

training loss: 3.379024028778076


training:  51%|█████     | 5611/10986 [2:09:53<2:07:59,  1.43s/it]

training loss: 3.458771228790283


training:  51%|█████     | 5612/10986 [2:09:54<2:14:02,  1.50s/it]

training loss: 3.4495015144348145


training:  51%|█████     | 5613/10986 [2:09:56<2:10:18,  1.46s/it]

training loss: 3.3849904537200928


training:  51%|█████     | 5614/10986 [2:09:57<2:07:14,  1.42s/it]

training loss: 3.409146547317505


training:  51%|█████     | 5615/10986 [2:09:58<2:05:01,  1.40s/it]

training loss: 3.3178751468658447


training:  51%|█████     | 5616/10986 [2:10:00<2:03:21,  1.38s/it]

training loss: 3.4818918704986572


training:  51%|█████     | 5617/10986 [2:10:01<2:01:42,  1.36s/it]

training loss: 3.4023709297180176


training:  51%|█████     | 5618/10986 [2:10:02<2:00:12,  1.34s/it]

training loss: 3.391857147216797


training:  51%|█████     | 5619/10986 [2:10:04<1:59:23,  1.33s/it]

training loss: 3.379997491836548


training:  51%|█████     | 5620/10986 [2:10:05<1:58:38,  1.33s/it]

training loss: 3.4706311225891113
valid loss: 3.4635086059570312
perplexity: 31.928804397583008


training:  51%|█████     | 5621/10986 [2:10:08<2:39:57,  1.79s/it]

training loss: 3.335087299346924


training:  51%|█████     | 5622/10986 [2:10:09<2:37:45,  1.76s/it]

training loss: 3.418241500854492


training:  51%|█████     | 5623/10986 [2:10:11<2:28:09,  1.66s/it]

training loss: 3.367436647415161


training:  51%|█████     | 5624/10986 [2:10:12<2:19:11,  1.56s/it]

training loss: 3.4130470752716064


training:  51%|█████     | 5625/10986 [2:10:14<2:12:47,  1.49s/it]

training loss: 3.3493199348449707


training:  51%|█████     | 5626/10986 [2:10:15<2:08:06,  1.43s/it]

training loss: 3.451460838317871


training:  51%|█████     | 5627/10986 [2:10:16<2:05:08,  1.40s/it]

training loss: 3.442711591720581


training:  51%|█████     | 5628/10986 [2:10:17<2:03:37,  1.38s/it]

training loss: 3.366975784301758


training:  51%|█████     | 5629/10986 [2:10:19<2:02:26,  1.37s/it]

training loss: 3.37666392326355


training:  51%|█████     | 5630/10986 [2:10:20<2:01:10,  1.36s/it]

training loss: 3.5507009029388428


training:  51%|█████▏    | 5631/10986 [2:10:22<2:08:40,  1.44s/it]

training loss: 3.452157974243164


training:  51%|█████▏    | 5632/10986 [2:10:23<2:06:28,  1.42s/it]

training loss: 3.550351619720459


training:  51%|█████▏    | 5633/10986 [2:10:24<2:04:23,  1.39s/it]

training loss: 3.3088064193725586


training:  51%|█████▏    | 5634/10986 [2:10:26<2:02:58,  1.38s/it]

training loss: 3.3793327808380127


training:  51%|█████▏    | 5635/10986 [2:10:27<2:02:08,  1.37s/it]

training loss: 3.4155020713806152


training:  51%|█████▏    | 5636/10986 [2:10:29<2:00:50,  1.36s/it]

training loss: 3.342817544937134


training:  51%|█████▏    | 5637/10986 [2:10:30<1:59:53,  1.34s/it]

training loss: 3.323845624923706


training:  51%|█████▏    | 5638/10986 [2:10:31<1:59:29,  1.34s/it]

training loss: 3.2657151222229004


training:  51%|█████▏    | 5639/10986 [2:10:33<2:00:04,  1.35s/it]

training loss: 3.428290843963623


training:  51%|█████▏    | 5640/10986 [2:10:34<1:59:08,  1.34s/it]

training loss: 3.4409475326538086
valid loss: 3.4395034313201904
perplexity: 31.17147445678711


training:  51%|█████▏    | 5641/10986 [2:10:37<2:39:42,  1.79s/it]

training loss: 3.443545341491699


training:  51%|█████▏    | 5642/10986 [2:10:38<2:29:59,  1.68s/it]

training loss: 3.3889143466949463


training:  51%|█████▏    | 5643/10986 [2:10:39<2:21:23,  1.59s/it]

training loss: 3.393960952758789


training:  51%|█████▏    | 5644/10986 [2:10:41<2:15:03,  1.52s/it]

training loss: 3.396188259124756


training:  51%|█████▏    | 5645/10986 [2:10:42<2:10:53,  1.47s/it]

training loss: 3.325167179107666


training:  51%|█████▏    | 5646/10986 [2:10:44<2:07:43,  1.44s/it]

training loss: 3.3834023475646973


training:  51%|█████▏    | 5647/10986 [2:10:45<2:05:21,  1.41s/it]

training loss: 3.3375322818756104


training:  51%|█████▏    | 5648/10986 [2:10:46<2:03:54,  1.39s/it]

training loss: 3.4128191471099854


training:  51%|█████▏    | 5649/10986 [2:10:48<2:03:19,  1.39s/it]

training loss: 3.309814691543579


training:  51%|█████▏    | 5650/10986 [2:10:49<2:01:33,  1.37s/it]

training loss: 3.3942184448242188


training:  51%|█████▏    | 5651/10986 [2:10:51<2:08:53,  1.45s/it]

training loss: 3.51878023147583


training:  51%|█████▏    | 5652/10986 [2:10:52<2:06:15,  1.42s/it]

training loss: 3.3051609992980957


training:  51%|█████▏    | 5653/10986 [2:10:53<2:05:26,  1.41s/it]

training loss: 3.434959650039673


training:  51%|█████▏    | 5654/10986 [2:10:55<2:03:31,  1.39s/it]

training loss: 3.4247584342956543


training:  51%|█████▏    | 5655/10986 [2:10:56<2:01:53,  1.37s/it]

training loss: 3.355461835861206


training:  51%|█████▏    | 5656/10986 [2:10:57<2:01:02,  1.36s/it]

training loss: 3.423610210418701


training:  51%|█████▏    | 5657/10986 [2:10:59<2:00:05,  1.35s/it]

training loss: 3.284440755844116


training:  52%|█████▏    | 5658/10986 [2:11:00<1:59:13,  1.34s/it]

training loss: 3.3611838817596436


training:  52%|█████▏    | 5659/10986 [2:11:01<1:58:50,  1.34s/it]

training loss: 3.3085153102874756


training:  52%|█████▏    | 5660/10986 [2:11:03<1:58:31,  1.34s/it]

training loss: 3.4492502212524414
valid loss: 3.449138879776001
perplexity: 31.473278045654297


training:  52%|█████▏    | 5661/10986 [2:11:05<2:38:20,  1.78s/it]

training loss: 3.35365891456604


training:  52%|█████▏    | 5662/10986 [2:11:07<2:28:17,  1.67s/it]

training loss: 3.415019989013672


training:  52%|█████▏    | 5663/10986 [2:11:08<2:19:27,  1.57s/it]

training loss: 3.448808193206787


training:  52%|█████▏    | 5664/10986 [2:11:10<2:13:44,  1.51s/it]

training loss: 3.423563241958618


training:  52%|█████▏    | 5665/10986 [2:11:11<2:09:50,  1.46s/it]

training loss: 3.3639838695526123


training:  52%|█████▏    | 5666/10986 [2:11:12<2:07:57,  1.44s/it]

training loss: 3.3583154678344727


training:  52%|█████▏    | 5667/10986 [2:11:14<2:04:50,  1.41s/it]

training loss: 3.3181958198547363


training:  52%|█████▏    | 5668/10986 [2:11:15<2:03:27,  1.39s/it]

training loss: 3.4441421031951904


training:  52%|█████▏    | 5669/10986 [2:11:16<2:01:36,  1.37s/it]

training loss: 3.4176998138427734


training:  52%|█████▏    | 5670/10986 [2:11:18<2:00:58,  1.37s/it]

training loss: 3.3456974029541016


training:  52%|█████▏    | 5671/10986 [2:11:19<2:07:35,  1.44s/it]

training loss: 3.3832011222839355


training:  52%|█████▏    | 5672/10986 [2:11:21<2:13:17,  1.51s/it]

training loss: 3.357994556427002


training:  52%|█████▏    | 5673/10986 [2:11:22<2:08:33,  1.45s/it]

training loss: 3.3447306156158447


training:  52%|█████▏    | 5674/10986 [2:11:24<2:05:43,  1.42s/it]

training loss: 3.3025894165039062


training:  52%|█████▏    | 5675/10986 [2:11:25<2:10:09,  1.47s/it]

training loss: 3.4977951049804688


training:  52%|█████▏    | 5676/10986 [2:11:27<2:17:44,  1.56s/it]

training loss: 3.3922624588012695


training:  52%|█████▏    | 5677/10986 [2:11:29<2:18:38,  1.57s/it]

training loss: 3.4880080223083496


training:  52%|█████▏    | 5678/10986 [2:11:30<2:11:54,  1.49s/it]

training loss: 3.483818292617798


training:  52%|█████▏    | 5679/10986 [2:11:31<2:07:35,  1.44s/it]

training loss: 3.425647258758545


training:  52%|█████▏    | 5680/10986 [2:11:33<2:04:57,  1.41s/it]

training loss: 3.372042655944824
valid loss: 3.3733298778533936
perplexity: 29.17551612854004


training:  52%|█████▏    | 5681/10986 [2:11:35<2:43:45,  1.85s/it]

training loss: 3.441159248352051


training:  52%|█████▏    | 5682/10986 [2:11:37<2:37:15,  1.78s/it]

training loss: 3.4413278102874756


training:  52%|█████▏    | 5683/10986 [2:11:38<2:26:16,  1.65s/it]

training loss: 3.4778077602386475


training:  52%|█████▏    | 5684/10986 [2:11:40<2:17:10,  1.55s/it]

training loss: 3.4490621089935303


training:  52%|█████▏    | 5685/10986 [2:11:41<2:11:03,  1.48s/it]

training loss: 3.3561341762542725


training:  52%|█████▏    | 5686/10986 [2:11:42<2:07:06,  1.44s/it]

training loss: 3.496215581893921


training:  52%|█████▏    | 5687/10986 [2:11:44<2:04:00,  1.40s/it]

training loss: 3.435750961303711


training:  52%|█████▏    | 5688/10986 [2:11:45<2:02:51,  1.39s/it]

training loss: 3.459916591644287


training:  52%|█████▏    | 5689/10986 [2:11:46<2:01:15,  1.37s/it]

training loss: 3.4877848625183105


training:  52%|█████▏    | 5690/10986 [2:11:48<1:59:42,  1.36s/it]

training loss: 3.340141534805298


training:  52%|█████▏    | 5691/10986 [2:11:49<2:08:17,  1.45s/it]

training loss: 3.4016709327697754


training:  52%|█████▏    | 5692/10986 [2:11:51<2:15:43,  1.54s/it]

training loss: 3.407169818878174


training:  52%|█████▏    | 5693/10986 [2:11:52<2:10:14,  1.48s/it]

training loss: 3.4574708938598633


training:  52%|█████▏    | 5694/10986 [2:11:54<2:05:59,  1.43s/it]

training loss: 3.478259325027466


training:  52%|█████▏    | 5695/10986 [2:11:55<2:03:08,  1.40s/it]

training loss: 3.344825267791748


training:  52%|█████▏    | 5696/10986 [2:11:56<2:01:23,  1.38s/it]

training loss: 3.3493289947509766


training:  52%|█████▏    | 5697/10986 [2:11:58<2:00:47,  1.37s/it]

training loss: 3.3975601196289062


training:  52%|█████▏    | 5698/10986 [2:11:59<1:59:27,  1.36s/it]

training loss: 3.399247884750366


training:  52%|█████▏    | 5699/10986 [2:12:00<1:58:23,  1.34s/it]

training loss: 3.3530683517456055


training:  52%|█████▏    | 5700/10986 [2:12:02<1:58:20,  1.34s/it]

training loss: 3.3689184188842773
valid loss: 3.374643325805664
perplexity: 29.213863372802734


training:  52%|█████▏    | 5701/10986 [2:12:05<2:37:19,  1.79s/it]

training loss: 3.4223859310150146


training:  52%|█████▏    | 5702/10986 [2:12:06<2:32:42,  1.73s/it]

training loss: 3.340902328491211


training:  52%|█████▏    | 5703/10986 [2:12:08<2:22:10,  1.61s/it]

training loss: 3.4482948780059814


training:  52%|█████▏    | 5704/10986 [2:12:09<2:13:45,  1.52s/it]

training loss: 3.4574968814849854


training:  52%|█████▏    | 5705/10986 [2:12:10<2:07:57,  1.45s/it]

training loss: 3.3675873279571533


training:  52%|█████▏    | 5706/10986 [2:12:11<2:04:08,  1.41s/it]

training loss: 3.3384504318237305


training:  52%|█████▏    | 5707/10986 [2:12:13<2:02:28,  1.39s/it]

training loss: 3.3725075721740723


training:  52%|█████▏    | 5708/10986 [2:12:14<2:00:29,  1.37s/it]

training loss: 3.3101232051849365


training:  52%|█████▏    | 5709/10986 [2:12:15<1:59:00,  1.35s/it]

training loss: 3.3942668437957764


training:  52%|█████▏    | 5710/10986 [2:12:17<1:57:42,  1.34s/it]

training loss: 3.441704034805298


training:  52%|█████▏    | 5711/10986 [2:12:18<2:06:04,  1.43s/it]

training loss: 3.313887596130371


training:  52%|█████▏    | 5712/10986 [2:12:20<2:02:31,  1.39s/it]

training loss: 3.3783793449401855


training:  52%|█████▏    | 5713/10986 [2:12:21<2:00:37,  1.37s/it]

training loss: 3.3395891189575195


training:  52%|█████▏    | 5714/10986 [2:12:22<1:58:29,  1.35s/it]

training loss: 3.4497103691101074


training:  52%|█████▏    | 5715/10986 [2:12:24<1:56:48,  1.33s/it]

training loss: 3.4492807388305664


training:  52%|█████▏    | 5716/10986 [2:12:25<1:56:16,  1.32s/it]

training loss: 3.2641329765319824


training:  52%|█████▏    | 5717/10986 [2:12:26<1:55:42,  1.32s/it]

training loss: 3.3620684146881104


training:  52%|█████▏    | 5718/10986 [2:12:28<1:55:22,  1.31s/it]

training loss: 3.3783092498779297


training:  52%|█████▏    | 5719/10986 [2:12:29<1:54:42,  1.31s/it]

training loss: 3.38637638092041


training:  52%|█████▏    | 5720/10986 [2:12:30<1:54:36,  1.31s/it]

training loss: 3.5232155323028564
valid loss: 3.5170655250549316
perplexity: 33.6854362487793


training:  52%|█████▏    | 5721/10986 [2:12:33<2:33:43,  1.75s/it]

training loss: 3.4928126335144043


training:  52%|█████▏    | 5722/10986 [2:12:34<2:23:44,  1.64s/it]

training loss: 3.3341386318206787


training:  52%|█████▏    | 5723/10986 [2:12:36<2:15:12,  1.54s/it]

training loss: 3.4175961017608643


training:  52%|█████▏    | 5724/10986 [2:12:37<2:08:25,  1.46s/it]

training loss: 3.293008804321289


training:  52%|█████▏    | 5725/10986 [2:12:38<2:04:03,  1.41s/it]

training loss: 3.3489086627960205


training:  52%|█████▏    | 5726/10986 [2:12:39<2:01:32,  1.39s/it]

training loss: 3.442939043045044


training:  52%|█████▏    | 5727/10986 [2:12:41<1:59:25,  1.36s/it]

training loss: 3.3311924934387207


training:  52%|█████▏    | 5728/10986 [2:12:42<1:58:12,  1.35s/it]

training loss: 3.2659990787506104


training:  52%|█████▏    | 5729/10986 [2:12:43<1:57:21,  1.34s/it]

training loss: 3.486522912979126


training:  52%|█████▏    | 5730/10986 [2:12:45<1:56:32,  1.33s/it]

training loss: 3.420962333679199


training:  52%|█████▏    | 5731/10986 [2:12:46<2:03:02,  1.40s/it]

training loss: 3.3660428524017334


training:  52%|█████▏    | 5732/10986 [2:12:48<2:00:47,  1.38s/it]

training loss: 3.321042776107788


training:  52%|█████▏    | 5733/10986 [2:12:49<1:58:31,  1.35s/it]

training loss: 3.464566230773926


training:  52%|█████▏    | 5734/10986 [2:12:50<1:56:57,  1.34s/it]

training loss: 3.460566282272339


training:  52%|█████▏    | 5735/10986 [2:12:52<1:55:51,  1.32s/it]

training loss: 3.3563106060028076


training:  52%|█████▏    | 5736/10986 [2:12:53<1:55:35,  1.32s/it]

training loss: 3.382850170135498


training:  52%|█████▏    | 5737/10986 [2:12:54<1:55:25,  1.32s/it]

training loss: 3.3030028343200684


training:  52%|█████▏    | 5738/10986 [2:12:55<1:54:59,  1.31s/it]

training loss: 3.33756160736084


training:  52%|█████▏    | 5739/10986 [2:12:57<1:54:30,  1.31s/it]

training loss: 3.3600268363952637


training:  52%|█████▏    | 5740/10986 [2:12:58<1:54:40,  1.31s/it]

training loss: 3.3914709091186523
valid loss: 3.385063886642456
perplexity: 29.519878387451172


training:  52%|█████▏    | 5741/10986 [2:13:01<2:33:45,  1.76s/it]

training loss: 3.393810510635376


training:  52%|█████▏    | 5742/10986 [2:13:02<2:23:30,  1.64s/it]

training loss: 3.368473529815674


training:  52%|█████▏    | 5743/10986 [2:13:04<2:14:37,  1.54s/it]

training loss: 3.395591974258423


training:  52%|█████▏    | 5744/10986 [2:13:05<2:08:28,  1.47s/it]

training loss: 3.303342819213867


training:  52%|█████▏    | 5745/10986 [2:13:06<2:04:16,  1.42s/it]

training loss: 3.4542603492736816


training:  52%|█████▏    | 5746/10986 [2:13:07<2:00:53,  1.38s/it]

training loss: 3.4508047103881836


training:  52%|█████▏    | 5747/10986 [2:13:09<1:58:22,  1.36s/it]

training loss: 3.384770154953003


training:  52%|█████▏    | 5748/10986 [2:13:10<1:57:08,  1.34s/it]

training loss: 3.5061373710632324


training:  52%|█████▏    | 5749/10986 [2:13:11<1:55:25,  1.32s/it]

training loss: 3.3352293968200684


training:  52%|█████▏    | 5750/10986 [2:13:13<1:55:29,  1.32s/it]

training loss: 3.3858795166015625


training:  52%|█████▏    | 5751/10986 [2:13:14<2:03:42,  1.42s/it]

training loss: 3.3602042198181152


training:  52%|█████▏    | 5752/10986 [2:13:16<2:01:04,  1.39s/it]

training loss: 3.435922622680664


training:  52%|█████▏    | 5753/10986 [2:13:17<1:58:30,  1.36s/it]

training loss: 3.38938570022583


training:  52%|█████▏    | 5754/10986 [2:13:18<1:57:25,  1.35s/it]

training loss: 3.4156298637390137


training:  52%|█████▏    | 5755/10986 [2:13:20<1:56:17,  1.33s/it]

training loss: 3.413950204849243


training:  52%|█████▏    | 5756/10986 [2:13:21<1:55:47,  1.33s/it]

training loss: 3.403102159500122


training:  52%|█████▏    | 5757/10986 [2:13:22<1:55:09,  1.32s/it]

training loss: 3.3042707443237305


training:  52%|█████▏    | 5758/10986 [2:13:23<1:54:44,  1.32s/it]

training loss: 3.340245008468628


training:  52%|█████▏    | 5759/10986 [2:13:25<1:55:09,  1.32s/it]

training loss: 3.3711588382720947


training:  52%|█████▏    | 5760/10986 [2:13:26<1:55:01,  1.32s/it]

training loss: 3.398881196975708
valid loss: 3.3991925716400146
perplexity: 29.939916610717773


training:  52%|█████▏    | 5761/10986 [2:13:29<2:33:46,  1.77s/it]

training loss: 3.435861110687256


training:  52%|█████▏    | 5762/10986 [2:13:31<2:30:53,  1.73s/it]

training loss: 3.426724433898926


training:  52%|█████▏    | 5763/10986 [2:13:32<2:20:16,  1.61s/it]

training loss: 3.401946544647217


training:  52%|█████▏    | 5764/10986 [2:13:33<2:12:09,  1.52s/it]

training loss: 3.3919448852539062


training:  52%|█████▏    | 5765/10986 [2:13:34<2:06:14,  1.45s/it]

training loss: 3.3556127548217773


training:  52%|█████▏    | 5766/10986 [2:13:36<2:02:30,  1.41s/it]

training loss: 3.379920482635498


training:  52%|█████▏    | 5767/10986 [2:13:37<1:59:46,  1.38s/it]

training loss: 3.416018486022949


training:  53%|█████▎    | 5768/10986 [2:13:38<1:57:52,  1.36s/it]

training loss: 3.350771427154541


training:  53%|█████▎    | 5769/10986 [2:13:40<1:57:03,  1.35s/it]

training loss: 3.45609450340271


training:  53%|█████▎    | 5770/10986 [2:13:41<1:56:18,  1.34s/it]

training loss: 3.4633121490478516


training:  53%|█████▎    | 5771/10986 [2:13:43<2:04:41,  1.43s/it]

training loss: 3.430851459503174


training:  53%|█████▎    | 5772/10986 [2:13:44<2:03:26,  1.42s/it]

training loss: 3.4197304248809814


training:  53%|█████▎    | 5773/10986 [2:13:45<2:01:03,  1.39s/it]

training loss: 3.4345035552978516


training:  53%|█████▎    | 5774/10986 [2:13:47<1:59:05,  1.37s/it]

training loss: 3.3941869735717773


training:  53%|█████▎    | 5775/10986 [2:13:48<1:58:34,  1.37s/it]

training loss: 3.3187413215637207


training:  53%|█████▎    | 5776/10986 [2:13:49<1:56:57,  1.35s/it]

training loss: 3.3940773010253906


training:  53%|█████▎    | 5777/10986 [2:13:51<1:56:03,  1.34s/it]

training loss: 3.3670217990875244


training:  53%|█████▎    | 5778/10986 [2:13:52<1:55:58,  1.34s/it]

training loss: 3.3499410152435303


training:  53%|█████▎    | 5779/10986 [2:13:53<1:55:22,  1.33s/it]

training loss: 3.3246984481811523


training:  53%|█████▎    | 5780/10986 [2:13:55<1:55:39,  1.33s/it]

training loss: 3.409809112548828
valid loss: 3.4037322998046875
perplexity: 30.076143264770508


training:  53%|█████▎    | 5781/10986 [2:13:58<2:34:21,  1.78s/it]

training loss: 3.385131359100342


training:  53%|█████▎    | 5782/10986 [2:13:59<2:30:58,  1.74s/it]

training loss: 3.36722993850708


training:  53%|█████▎    | 5783/10986 [2:14:01<2:20:01,  1.61s/it]

training loss: 3.513671398162842


training:  53%|█████▎    | 5784/10986 [2:14:02<2:13:08,  1.54s/it]

training loss: 3.415125608444214


training:  53%|█████▎    | 5785/10986 [2:14:03<2:07:20,  1.47s/it]

training loss: 3.35075044631958


training:  53%|█████▎    | 5786/10986 [2:14:04<2:03:02,  1.42s/it]

training loss: 3.506654739379883


training:  53%|█████▎    | 5787/10986 [2:14:06<2:03:16,  1.42s/it]

training loss: 3.3784408569335938


training:  53%|█████▎    | 5788/10986 [2:14:07<2:00:52,  1.40s/it]

training loss: 3.2952301502227783


training:  53%|█████▎    | 5789/10986 [2:14:09<1:59:43,  1.38s/it]

training loss: 3.41947603225708


training:  53%|█████▎    | 5790/10986 [2:14:10<1:58:10,  1.36s/it]

training loss: 3.3511829376220703


training:  53%|█████▎    | 5791/10986 [2:14:12<2:05:35,  1.45s/it]

training loss: 3.416114568710327


training:  53%|█████▎    | 5792/10986 [2:14:13<2:05:39,  1.45s/it]

training loss: 3.5691163539886475


training:  53%|█████▎    | 5793/10986 [2:14:14<2:03:13,  1.42s/it]

training loss: 3.4457900524139404


training:  53%|█████▎    | 5794/10986 [2:14:16<2:00:23,  1.39s/it]

training loss: 3.419142246246338


training:  53%|█████▎    | 5795/10986 [2:14:17<1:58:25,  1.37s/it]

training loss: 3.39373779296875


training:  53%|█████▎    | 5796/10986 [2:14:18<1:57:31,  1.36s/it]

training loss: 3.3136796951293945


training:  53%|█████▎    | 5797/10986 [2:14:20<1:56:23,  1.35s/it]

training loss: 3.3348546028137207


training:  53%|█████▎    | 5798/10986 [2:14:21<1:55:19,  1.33s/it]

training loss: 3.4470314979553223


training:  53%|█████▎    | 5799/10986 [2:14:22<1:55:00,  1.33s/it]

training loss: 3.377931594848633


training:  53%|█████▎    | 5800/10986 [2:14:24<1:54:56,  1.33s/it]

training loss: 3.3368232250213623
valid loss: 3.334139585494995
perplexity: 28.05423355102539


training:  53%|█████▎    | 5801/10986 [2:14:26<2:34:04,  1.78s/it]

training loss: 3.3089420795440674


training:  53%|█████▎    | 5802/10986 [2:14:28<2:26:31,  1.70s/it]

training loss: 3.3531877994537354


training:  53%|█████▎    | 5803/10986 [2:14:29<2:17:22,  1.59s/it]

training loss: 3.357790470123291


training:  53%|█████▎    | 5804/10986 [2:14:31<2:10:02,  1.51s/it]

training loss: 3.3373725414276123


training:  53%|█████▎    | 5805/10986 [2:14:32<2:05:21,  1.45s/it]

training loss: 3.577948570251465


training:  53%|█████▎    | 5806/10986 [2:14:33<2:02:15,  1.42s/it]

training loss: 3.3724324703216553


training:  53%|█████▎    | 5807/10986 [2:14:35<2:00:14,  1.39s/it]

training loss: 3.3818912506103516


training:  53%|█████▎    | 5808/10986 [2:14:36<1:58:35,  1.37s/it]

training loss: 3.4354329109191895


training:  53%|█████▎    | 5809/10986 [2:14:37<1:57:24,  1.36s/it]

training loss: 3.3937673568725586


training:  53%|█████▎    | 5810/10986 [2:14:39<1:56:39,  1.35s/it]

training loss: 3.342703342437744


training:  53%|█████▎    | 5811/10986 [2:14:40<2:03:01,  1.43s/it]

training loss: 3.4363205432891846


training:  53%|█████▎    | 5812/10986 [2:14:42<2:00:32,  1.40s/it]

training loss: 3.3598031997680664


training:  53%|█████▎    | 5813/10986 [2:14:43<1:58:31,  1.37s/it]

training loss: 3.485253095626831


training:  53%|█████▎    | 5814/10986 [2:14:44<1:57:37,  1.36s/it]

training loss: 3.3339812755584717


training:  53%|█████▎    | 5815/10986 [2:14:46<1:56:52,  1.36s/it]

training loss: 3.394456624984741


training:  53%|█████▎    | 5816/10986 [2:14:47<1:56:17,  1.35s/it]

training loss: 3.4245176315307617


training:  53%|█████▎    | 5817/10986 [2:14:48<1:55:37,  1.34s/it]

training loss: 3.4565205574035645


training:  53%|█████▎    | 5818/10986 [2:14:49<1:54:56,  1.33s/it]

training loss: 3.4609856605529785


training:  53%|█████▎    | 5819/10986 [2:14:51<1:54:11,  1.33s/it]

training loss: 3.369523286819458


training:  53%|█████▎    | 5820/10986 [2:14:52<1:53:56,  1.32s/it]

training loss: 3.450713872909546
valid loss: 3.448819398880005
perplexity: 31.463224411010742


training:  53%|█████▎    | 5821/10986 [2:14:56<2:53:37,  2.02s/it]

training loss: 3.406527042388916


training:  53%|█████▎    | 5822/10986 [2:14:58<2:49:50,  1.97s/it]

training loss: 3.4609389305114746


training:  53%|█████▎    | 5823/10986 [2:14:59<2:32:56,  1.78s/it]

training loss: 3.35245418548584


training:  53%|█████▎    | 5824/10986 [2:15:00<2:21:06,  1.64s/it]

training loss: 3.3434202671051025


training:  53%|█████▎    | 5825/10986 [2:15:02<2:12:25,  1.54s/it]

training loss: 3.352302312850952


training:  53%|█████▎    | 5826/10986 [2:15:03<2:06:46,  1.47s/it]

training loss: 3.4076201915740967


training:  53%|█████▎    | 5827/10986 [2:15:04<2:01:56,  1.42s/it]

training loss: 3.3653533458709717


training:  53%|█████▎    | 5828/10986 [2:15:05<1:59:30,  1.39s/it]

training loss: 3.480404853820801


training:  53%|█████▎    | 5829/10986 [2:15:07<1:57:24,  1.37s/it]

training loss: 3.2260525226593018


training:  53%|█████▎    | 5830/10986 [2:15:08<1:55:58,  1.35s/it]

training loss: 3.459716796875


training:  53%|█████▎    | 5831/10986 [2:15:10<2:02:40,  1.43s/it]

training loss: 3.339229106903076


training:  53%|█████▎    | 5832/10986 [2:15:11<2:00:26,  1.40s/it]

training loss: 3.340226411819458


training:  53%|█████▎    | 5833/10986 [2:15:12<1:58:48,  1.38s/it]

training loss: 3.366926908493042


training:  53%|█████▎    | 5834/10986 [2:15:14<1:57:08,  1.36s/it]

training loss: 3.469041585922241


training:  53%|█████▎    | 5835/10986 [2:15:15<1:56:50,  1.36s/it]

training loss: 3.404510259628296


training:  53%|█████▎    | 5836/10986 [2:15:16<1:56:33,  1.36s/it]

training loss: 3.466341257095337


training:  53%|█████▎    | 5837/10986 [2:15:18<1:55:24,  1.34s/it]

training loss: 3.4396982192993164


training:  53%|█████▎    | 5838/10986 [2:15:19<1:54:16,  1.33s/it]

training loss: 3.294130325317383


training:  53%|█████▎    | 5839/10986 [2:15:20<1:53:29,  1.32s/it]

training loss: 3.4515137672424316


training:  53%|█████▎    | 5840/10986 [2:15:22<1:52:38,  1.31s/it]

training loss: 3.328558921813965
valid loss: 3.333103895187378
perplexity: 28.02519416809082


training:  53%|█████▎    | 5841/10986 [2:15:24<2:31:33,  1.77s/it]

training loss: 3.37563419342041


training:  53%|█████▎    | 5842/10986 [2:15:26<2:21:58,  1.66s/it]

training loss: 3.4074740409851074


training:  53%|█████▎    | 5843/10986 [2:15:27<2:13:52,  1.56s/it]

training loss: 3.383788585662842


training:  53%|█████▎    | 5844/10986 [2:15:29<2:07:11,  1.48s/it]

training loss: 3.3560450077056885


training:  53%|█████▎    | 5845/10986 [2:15:30<2:02:21,  1.43s/it]

training loss: 3.288705348968506


training:  53%|█████▎    | 5846/10986 [2:15:31<1:59:44,  1.40s/it]

training loss: 3.3563485145568848


training:  53%|█████▎    | 5847/10986 [2:15:32<1:57:18,  1.37s/it]

training loss: 3.5139613151550293


training:  53%|█████▎    | 5848/10986 [2:15:34<1:55:41,  1.35s/it]

training loss: 3.402332067489624


training:  53%|█████▎    | 5849/10986 [2:15:35<1:55:14,  1.35s/it]

training loss: 3.3917064666748047


training:  53%|█████▎    | 5850/10986 [2:15:36<1:54:08,  1.33s/it]

training loss: 3.4445152282714844


training:  53%|█████▎    | 5851/10986 [2:15:38<2:01:11,  1.42s/it]

training loss: 3.4433891773223877


training:  53%|█████▎    | 5852/10986 [2:15:39<1:59:15,  1.39s/it]

training loss: 3.460376262664795


training:  53%|█████▎    | 5853/10986 [2:15:41<1:56:56,  1.37s/it]

training loss: 3.472079277038574


training:  53%|█████▎    | 5854/10986 [2:15:42<1:55:44,  1.35s/it]

training loss: 3.3747787475585938


training:  53%|█████▎    | 5855/10986 [2:15:43<1:54:42,  1.34s/it]

training loss: 3.413574695587158


training:  53%|█████▎    | 5856/10986 [2:15:45<1:54:22,  1.34s/it]

training loss: 3.3487720489501953


training:  53%|█████▎    | 5857/10986 [2:15:46<1:54:15,  1.34s/it]

training loss: 3.594228506088257


training:  53%|█████▎    | 5858/10986 [2:15:47<1:53:43,  1.33s/it]

training loss: 3.4214437007904053


training:  53%|█████▎    | 5859/10986 [2:15:49<1:52:41,  1.32s/it]

training loss: 3.437276601791382


training:  53%|█████▎    | 5860/10986 [2:15:50<1:52:24,  1.32s/it]

training loss: 3.4486680030822754
valid loss: 3.437453031539917
perplexity: 31.107627868652344


training:  53%|█████▎    | 5861/10986 [2:15:53<2:31:48,  1.78s/it]

training loss: 3.3863890171051025


training:  53%|█████▎    | 5862/10986 [2:15:54<2:24:08,  1.69s/it]

training loss: 3.297354221343994


training:  53%|█████▎    | 5863/10986 [2:15:55<2:14:18,  1.57s/it]

training loss: 3.393073558807373


training:  53%|█████▎    | 5864/10986 [2:15:57<2:07:15,  1.49s/it]

training loss: 3.3205528259277344


training:  53%|█████▎    | 5865/10986 [2:15:58<2:03:05,  1.44s/it]

training loss: 3.4210028648376465


training:  53%|█████▎    | 5866/10986 [2:15:59<1:59:34,  1.40s/it]

training loss: 3.3476455211639404


training:  53%|█████▎    | 5867/10986 [2:16:01<1:56:58,  1.37s/it]

training loss: 3.3975746631622314


training:  53%|█████▎    | 5868/10986 [2:16:02<1:56:10,  1.36s/it]

training loss: 3.4526233673095703


training:  53%|█████▎    | 5869/10986 [2:16:03<1:55:03,  1.35s/it]

training loss: 3.3701460361480713


training:  53%|█████▎    | 5870/10986 [2:16:05<1:54:30,  1.34s/it]

training loss: 3.4105536937713623


training:  53%|█████▎    | 5871/10986 [2:16:06<2:01:06,  1.42s/it]

training loss: 3.3022539615631104


training:  53%|█████▎    | 5872/10986 [2:16:08<1:58:37,  1.39s/it]

training loss: 3.367173194885254


training:  53%|█████▎    | 5873/10986 [2:16:09<1:57:06,  1.37s/it]

training loss: 3.434041738510132


training:  53%|█████▎    | 5874/10986 [2:16:10<1:55:06,  1.35s/it]

training loss: 3.4539148807525635


training:  53%|█████▎    | 5875/10986 [2:16:12<1:53:03,  1.33s/it]

training loss: 3.438993215560913


training:  53%|█████▎    | 5876/10986 [2:16:13<1:52:01,  1.32s/it]

training loss: 3.352593183517456


training:  53%|█████▎    | 5877/10986 [2:16:14<1:51:55,  1.31s/it]

training loss: 3.416214942932129


training:  54%|█████▎    | 5878/10986 [2:16:15<1:52:10,  1.32s/it]

training loss: 3.306422710418701


training:  54%|█████▎    | 5879/10986 [2:16:17<1:51:49,  1.31s/it]

training loss: 3.3480374813079834


training:  54%|█████▎    | 5880/10986 [2:16:18<1:52:19,  1.32s/it]

training loss: 3.4449052810668945
valid loss: 3.4396843910217285
perplexity: 31.17711639404297


training:  54%|█████▎    | 5881/10986 [2:16:21<2:29:57,  1.76s/it]

training loss: 3.3905680179595947


training:  54%|█████▎    | 5882/10986 [2:16:22<2:21:18,  1.66s/it]

training loss: 3.461143732070923


training:  54%|█████▎    | 5883/10986 [2:16:24<2:13:18,  1.57s/it]

training loss: 3.37082576751709


training:  54%|█████▎    | 5884/10986 [2:16:25<2:07:02,  1.49s/it]

training loss: 3.422752618789673


training:  54%|█████▎    | 5885/10986 [2:16:26<2:01:55,  1.43s/it]

training loss: 3.4642794132232666


training:  54%|█████▎    | 5886/10986 [2:16:28<1:57:42,  1.38s/it]

training loss: 3.4635353088378906


training:  54%|█████▎    | 5887/10986 [2:16:29<1:55:32,  1.36s/it]

training loss: 3.350261688232422


training:  54%|█████▎    | 5888/10986 [2:16:30<1:54:07,  1.34s/it]

training loss: 3.4007797241210938


training:  54%|█████▎    | 5889/10986 [2:16:31<1:52:40,  1.33s/it]

training loss: 3.2722408771514893


training:  54%|█████▎    | 5890/10986 [2:16:33<1:52:38,  1.33s/it]

training loss: 3.340402841567993


training:  54%|█████▎    | 5891/10986 [2:16:34<1:59:55,  1.41s/it]

training loss: 3.353572368621826


training:  54%|█████▎    | 5892/10986 [2:16:36<2:07:13,  1.50s/it]

training loss: 3.429971933364868


training:  54%|█████▎    | 5893/10986 [2:16:37<2:02:14,  1.44s/it]

training loss: 3.483779191970825


training:  54%|█████▎    | 5894/10986 [2:16:39<1:58:45,  1.40s/it]

training loss: 3.3994333744049072


training:  54%|█████▎    | 5895/10986 [2:16:40<1:57:02,  1.38s/it]

training loss: 3.395897388458252


training:  54%|█████▎    | 5896/10986 [2:16:41<1:55:07,  1.36s/it]

training loss: 3.39658260345459


training:  54%|█████▎    | 5897/10986 [2:16:43<1:54:15,  1.35s/it]

training loss: 3.437008857727051


training:  54%|█████▎    | 5898/10986 [2:16:44<1:53:35,  1.34s/it]

training loss: 3.369232416152954


training:  54%|█████▎    | 5899/10986 [2:16:45<1:53:12,  1.34s/it]

training loss: 3.3263423442840576


training:  54%|█████▎    | 5900/10986 [2:16:47<1:52:35,  1.33s/it]

training loss: 3.4648172855377197
valid loss: 3.4504759311676025
perplexity: 31.5153865814209


training:  54%|█████▎    | 5901/10986 [2:16:49<2:30:50,  1.78s/it]

training loss: 3.3671255111694336


training:  54%|█████▎    | 5902/10986 [2:16:51<2:23:00,  1.69s/it]

training loss: 3.4752378463745117


training:  54%|█████▎    | 5903/10986 [2:16:52<2:14:27,  1.59s/it]

training loss: 3.4625203609466553


training:  54%|█████▎    | 5904/10986 [2:16:54<2:07:55,  1.51s/it]

training loss: 3.4306447505950928


training:  54%|█████▍    | 5905/10986 [2:16:55<2:03:04,  1.45s/it]

training loss: 3.3217318058013916


training:  54%|█████▍    | 5906/10986 [2:16:56<1:59:07,  1.41s/it]

training loss: 3.320531129837036


training:  54%|█████▍    | 5907/10986 [2:16:58<1:56:46,  1.38s/it]

training loss: 3.4281744956970215


training:  54%|█████▍    | 5908/10986 [2:16:59<1:54:50,  1.36s/it]

training loss: 3.3904621601104736


training:  54%|█████▍    | 5909/10986 [2:17:00<1:54:19,  1.35s/it]

training loss: 3.382136583328247


training:  54%|█████▍    | 5910/10986 [2:17:02<1:53:20,  1.34s/it]

training loss: 3.4093234539031982


training:  54%|█████▍    | 5911/10986 [2:17:03<2:00:06,  1.42s/it]

training loss: 3.421760320663452


training:  54%|█████▍    | 5912/10986 [2:17:04<1:57:33,  1.39s/it]

training loss: 3.4760026931762695


training:  54%|█████▍    | 5913/10986 [2:17:06<1:55:24,  1.36s/it]

training loss: 3.3838865756988525


training:  54%|█████▍    | 5914/10986 [2:17:07<1:53:35,  1.34s/it]

training loss: 3.4337356090545654


training:  54%|█████▍    | 5915/10986 [2:17:08<1:52:52,  1.34s/it]

training loss: 3.4928441047668457


training:  54%|█████▍    | 5916/10986 [2:17:10<1:52:34,  1.33s/it]

training loss: 3.4803249835968018


training:  54%|█████▍    | 5917/10986 [2:17:11<1:52:23,  1.33s/it]

training loss: 3.4492876529693604


training:  54%|█████▍    | 5918/10986 [2:17:12<1:51:43,  1.32s/it]

training loss: 3.4871339797973633


training:  54%|█████▍    | 5919/10986 [2:17:14<1:50:55,  1.31s/it]

training loss: 3.3742010593414307


training:  54%|█████▍    | 5920/10986 [2:17:15<1:51:05,  1.32s/it]

training loss: 3.341381549835205
valid loss: 3.3368308544158936
perplexity: 28.129838943481445


training:  54%|█████▍    | 5921/10986 [2:17:18<2:29:54,  1.78s/it]

training loss: 3.4483437538146973


training:  54%|█████▍    | 5922/10986 [2:17:19<2:19:42,  1.66s/it]

training loss: 3.396941661834717


training:  54%|█████▍    | 5923/10986 [2:17:20<2:11:05,  1.55s/it]

training loss: 3.3416175842285156


training:  54%|█████▍    | 5924/10986 [2:17:22<2:05:09,  1.48s/it]

training loss: 3.417015552520752


training:  54%|█████▍    | 5925/10986 [2:17:23<2:01:28,  1.44s/it]

training loss: 3.419203758239746


training:  54%|█████▍    | 5926/10986 [2:17:24<1:58:37,  1.41s/it]

training loss: 3.408444404602051


training:  54%|█████▍    | 5927/10986 [2:17:26<1:56:24,  1.38s/it]

training loss: 3.302469491958618


training:  54%|█████▍    | 5928/10986 [2:17:27<1:55:07,  1.37s/it]

training loss: 3.5219900608062744


training:  54%|█████▍    | 5929/10986 [2:17:28<1:53:05,  1.34s/it]

training loss: 3.458699941635132


training:  54%|█████▍    | 5930/10986 [2:17:30<1:52:08,  1.33s/it]

training loss: 3.4833908081054688


training:  54%|█████▍    | 5931/10986 [2:17:31<1:59:32,  1.42s/it]

training loss: 3.3825957775115967


training:  54%|█████▍    | 5932/10986 [2:17:33<1:57:27,  1.39s/it]

training loss: 3.3738062381744385


training:  54%|█████▍    | 5933/10986 [2:17:34<1:56:01,  1.38s/it]

training loss: 3.3500828742980957


training:  54%|█████▍    | 5934/10986 [2:17:35<1:54:58,  1.37s/it]

training loss: 3.411660671234131


training:  54%|█████▍    | 5935/10986 [2:17:37<1:53:39,  1.35s/it]

training loss: 3.3868637084960938


training:  54%|█████▍    | 5936/10986 [2:17:38<1:52:24,  1.34s/it]

training loss: 3.372539758682251


training:  54%|█████▍    | 5937/10986 [2:17:39<1:52:12,  1.33s/it]

training loss: 3.281229019165039


training:  54%|█████▍    | 5938/10986 [2:17:41<1:51:30,  1.33s/it]

training loss: 3.409198045730591


training:  54%|█████▍    | 5939/10986 [2:17:42<1:51:14,  1.32s/it]

training loss: 3.3765876293182373


training:  54%|█████▍    | 5940/10986 [2:17:43<1:51:21,  1.32s/it]

training loss: 3.4594433307647705
valid loss: 3.4562997817993164
perplexity: 31.699464797973633


training:  54%|█████▍    | 5941/10986 [2:17:46<2:29:03,  1.77s/it]

training loss: 3.3666622638702393


training:  54%|█████▍    | 5942/10986 [2:17:47<2:19:20,  1.66s/it]

training loss: 3.3937718868255615


training:  54%|█████▍    | 5943/10986 [2:17:49<2:11:08,  1.56s/it]

training loss: 3.334153890609741


training:  54%|█████▍    | 5944/10986 [2:17:50<2:05:22,  1.49s/it]

training loss: 3.364839792251587


training:  54%|█████▍    | 5945/10986 [2:17:51<2:01:31,  1.45s/it]

training loss: 3.4319779872894287


training:  54%|█████▍    | 5946/10986 [2:17:53<1:58:04,  1.41s/it]

training loss: 3.4432809352874756


training:  54%|█████▍    | 5947/10986 [2:17:54<1:55:59,  1.38s/it]

training loss: 3.381587505340576


training:  54%|█████▍    | 5948/10986 [2:17:55<1:54:20,  1.36s/it]

training loss: 3.407581090927124


training:  54%|█████▍    | 5949/10986 [2:17:57<1:53:01,  1.35s/it]

training loss: 3.382870674133301


training:  54%|█████▍    | 5950/10986 [2:17:58<1:52:33,  1.34s/it]

training loss: 3.462815523147583


training:  54%|█████▍    | 5951/10986 [2:18:00<2:00:13,  1.43s/it]

training loss: 3.413145065307617


training:  54%|█████▍    | 5952/10986 [2:18:01<1:58:37,  1.41s/it]

training loss: 3.3240716457366943


training:  54%|█████▍    | 5953/10986 [2:18:02<1:56:36,  1.39s/it]

training loss: 3.5119752883911133


training:  54%|█████▍    | 5954/10986 [2:18:04<1:54:24,  1.36s/it]

training loss: 3.497640609741211


training:  54%|█████▍    | 5955/10986 [2:18:05<1:53:33,  1.35s/it]

training loss: 3.2942214012145996


training:  54%|█████▍    | 5956/10986 [2:18:06<1:52:50,  1.35s/it]

training loss: 3.4288671016693115


training:  54%|█████▍    | 5957/10986 [2:18:08<1:52:26,  1.34s/it]

training loss: 3.4396677017211914


training:  54%|█████▍    | 5958/10986 [2:18:09<1:52:09,  1.34s/it]

training loss: 3.3833494186401367


training:  54%|█████▍    | 5959/10986 [2:18:10<1:51:27,  1.33s/it]

training loss: 3.426021099090576


training:  54%|█████▍    | 5960/10986 [2:18:12<1:51:08,  1.33s/it]

training loss: 3.401925802230835
valid loss: 3.4024744033813477
perplexity: 30.0383358001709


training:  54%|█████▍    | 5961/10986 [2:18:14<2:28:04,  1.77s/it]

training loss: 3.4606125354766846


training:  54%|█████▍    | 5962/10986 [2:18:16<2:24:55,  1.73s/it]

training loss: 3.471231698989868


training:  54%|█████▍    | 5963/10986 [2:18:17<2:14:23,  1.61s/it]

training loss: 3.3680245876312256


training:  54%|█████▍    | 5964/10986 [2:18:19<2:07:52,  1.53s/it]

training loss: 3.5125417709350586


training:  54%|█████▍    | 5965/10986 [2:18:20<2:02:41,  1.47s/it]

training loss: 3.4119505882263184


training:  54%|█████▍    | 5966/10986 [2:18:22<2:06:32,  1.51s/it]

training loss: 3.3275668621063232


training:  54%|█████▍    | 5967/10986 [2:18:23<2:12:01,  1.58s/it]

training loss: 3.4378912448883057


training:  54%|█████▍    | 5968/10986 [2:18:25<2:12:01,  1.58s/it]

training loss: 3.399763822555542


training:  54%|█████▍    | 5969/10986 [2:18:26<2:05:32,  1.50s/it]

training loss: 3.3652639389038086


training:  54%|█████▍    | 5970/10986 [2:18:28<2:00:37,  1.44s/it]

training loss: 3.327648878097534


training:  54%|█████▍    | 5971/10986 [2:18:29<2:03:44,  1.48s/it]

training loss: 3.4166817665100098


training:  54%|█████▍    | 5972/10986 [2:18:30<1:59:32,  1.43s/it]

training loss: 3.337327480316162


training:  54%|█████▍    | 5973/10986 [2:18:32<1:56:42,  1.40s/it]

training loss: 3.3847579956054688


training:  54%|█████▍    | 5974/10986 [2:18:33<1:54:10,  1.37s/it]

training loss: 3.3907315731048584


training:  54%|█████▍    | 5975/10986 [2:18:34<1:53:13,  1.36s/it]

training loss: 3.456105947494507


training:  54%|█████▍    | 5976/10986 [2:18:36<1:52:28,  1.35s/it]

training loss: 3.4193592071533203


training:  54%|█████▍    | 5977/10986 [2:18:37<1:51:57,  1.34s/it]

training loss: 3.377880334854126


training:  54%|█████▍    | 5978/10986 [2:18:38<1:51:19,  1.33s/it]

training loss: 3.3059184551239014


training:  54%|█████▍    | 5979/10986 [2:18:40<1:50:40,  1.33s/it]

training loss: 3.3385109901428223


training:  54%|█████▍    | 5980/10986 [2:18:41<1:50:23,  1.32s/it]

training loss: 3.3629140853881836
valid loss: 3.3646738529205322
perplexity: 28.924062728881836


training:  54%|█████▍    | 5981/10986 [2:18:44<2:27:57,  1.77s/it]

training loss: 3.3396103382110596


training:  54%|█████▍    | 5982/10986 [2:18:46<2:25:24,  1.74s/it]

training loss: 3.358873128890991


training:  54%|█████▍    | 5983/10986 [2:18:47<2:14:59,  1.62s/it]

training loss: 3.3391170501708984


training:  54%|█████▍    | 5984/10986 [2:18:48<2:07:31,  1.53s/it]

training loss: 3.4545657634735107


training:  54%|█████▍    | 5985/10986 [2:18:50<2:02:46,  1.47s/it]

training loss: 3.5149030685424805


training:  54%|█████▍    | 5986/10986 [2:18:51<1:59:06,  1.43s/it]

training loss: 3.477128267288208


training:  54%|█████▍    | 5987/10986 [2:18:52<1:56:40,  1.40s/it]

training loss: 3.37967848777771


training:  55%|█████▍    | 5988/10986 [2:18:53<1:54:28,  1.37s/it]

training loss: 3.4903523921966553


training:  55%|█████▍    | 5989/10986 [2:18:55<1:53:14,  1.36s/it]

training loss: 3.453270673751831


training:  55%|█████▍    | 5990/10986 [2:18:56<1:52:07,  1.35s/it]

training loss: 3.500814199447632


training:  55%|█████▍    | 5991/10986 [2:18:58<1:58:21,  1.42s/it]

training loss: 3.294435977935791


training:  55%|█████▍    | 5992/10986 [2:18:59<1:56:22,  1.40s/it]

training loss: 3.4306232929229736


training:  55%|█████▍    | 5993/10986 [2:19:00<1:54:45,  1.38s/it]

training loss: 3.3484480381011963


training:  55%|█████▍    | 5994/10986 [2:19:02<1:53:53,  1.37s/it]

training loss: 3.512767791748047


training:  55%|█████▍    | 5995/10986 [2:19:03<1:52:36,  1.35s/it]

training loss: 3.465916872024536


training:  55%|█████▍    | 5996/10986 [2:19:04<1:51:51,  1.35s/it]

training loss: 3.4412872791290283


training:  55%|█████▍    | 5997/10986 [2:19:06<1:51:08,  1.34s/it]

training loss: 3.4122960567474365


training:  55%|█████▍    | 5998/10986 [2:19:07<1:50:12,  1.33s/it]

training loss: 3.4874255657196045


training:  55%|█████▍    | 5999/10986 [2:19:08<1:50:24,  1.33s/it]

training loss: 3.3870465755462646


training:  55%|█████▍    | 6000/10986 [2:19:10<1:50:42,  1.33s/it]

training loss: 3.3740909099578857
valid loss: 3.3673317432403564
perplexity: 29.001041412353516


training:  55%|█████▍    | 6001/10986 [2:19:13<2:28:41,  1.79s/it]

training loss: 3.352571725845337


training:  55%|█████▍    | 6002/10986 [2:19:14<2:18:54,  1.67s/it]

training loss: 3.422666072845459


training:  55%|█████▍    | 6003/10986 [2:19:15<2:10:40,  1.57s/it]

training loss: 3.2913365364074707


training:  55%|█████▍    | 6004/10986 [2:19:17<2:04:29,  1.50s/it]

training loss: 3.4010069370269775


training:  55%|█████▍    | 6005/10986 [2:19:18<2:00:55,  1.46s/it]

training loss: 3.4566445350646973


training:  55%|█████▍    | 6006/10986 [2:19:19<1:58:14,  1.42s/it]

training loss: 3.426663637161255


training:  55%|█████▍    | 6007/10986 [2:19:21<1:57:04,  1.41s/it]

training loss: 3.425650119781494


training:  55%|█████▍    | 6008/10986 [2:19:22<1:55:20,  1.39s/it]

training loss: 3.3624980449676514


training:  55%|█████▍    | 6009/10986 [2:19:23<1:53:41,  1.37s/it]

training loss: 3.3938074111938477


training:  55%|█████▍    | 6010/10986 [2:19:25<1:52:28,  1.36s/it]

training loss: 3.5068717002868652


training:  55%|█████▍    | 6011/10986 [2:19:26<1:59:23,  1.44s/it]

training loss: 3.418304681777954


training:  55%|█████▍    | 6012/10986 [2:19:28<1:56:20,  1.40s/it]

training loss: 3.457169771194458


training:  55%|█████▍    | 6013/10986 [2:19:29<1:54:06,  1.38s/it]

training loss: 3.3724348545074463


training:  55%|█████▍    | 6014/10986 [2:19:30<1:52:56,  1.36s/it]

training loss: 3.6071794033050537


training:  55%|█████▍    | 6015/10986 [2:19:32<1:52:00,  1.35s/it]

training loss: 3.399106979370117


training:  55%|█████▍    | 6016/10986 [2:19:33<1:51:13,  1.34s/it]

training loss: 3.3838090896606445


training:  55%|█████▍    | 6017/10986 [2:19:34<1:50:51,  1.34s/it]

training loss: 3.375418186187744


training:  55%|█████▍    | 6018/10986 [2:19:36<1:50:29,  1.33s/it]

training loss: 3.309680461883545


training:  55%|█████▍    | 6019/10986 [2:19:37<1:50:09,  1.33s/it]

training loss: 3.354313611984253


training:  55%|█████▍    | 6020/10986 [2:19:38<1:50:18,  1.33s/it]

training loss: 3.4792613983154297
valid loss: 3.4737586975097656
perplexity: 32.25776290893555


training:  55%|█████▍    | 6021/10986 [2:19:41<2:28:18,  1.79s/it]

training loss: 3.3449864387512207


training:  55%|█████▍    | 6022/10986 [2:19:43<2:25:53,  1.76s/it]

training loss: 3.471278667449951


training:  55%|█████▍    | 6023/10986 [2:19:44<2:15:01,  1.63s/it]

training loss: 3.4557292461395264


training:  55%|█████▍    | 6024/10986 [2:19:45<2:07:12,  1.54s/it]

training loss: 3.6655304431915283


training:  55%|█████▍    | 6025/10986 [2:19:47<2:01:37,  1.47s/it]

training loss: 3.5338895320892334


training:  55%|█████▍    | 6026/10986 [2:19:48<1:57:24,  1.42s/it]

training loss: 3.349989414215088


training:  55%|█████▍    | 6027/10986 [2:19:49<1:54:30,  1.39s/it]

training loss: 3.408700942993164


training:  55%|█████▍    | 6028/10986 [2:19:51<1:53:20,  1.37s/it]

training loss: 3.531792640686035


training:  55%|█████▍    | 6029/10986 [2:19:52<1:52:21,  1.36s/it]

training loss: 3.3873817920684814


training:  55%|█████▍    | 6030/10986 [2:19:53<1:51:33,  1.35s/it]

training loss: 3.3685426712036133


training:  55%|█████▍    | 6031/10986 [2:19:55<1:58:05,  1.43s/it]

training loss: 3.3700716495513916


training:  55%|█████▍    | 6032/10986 [2:19:57<2:01:46,  1.47s/it]

training loss: 3.4321892261505127


training:  55%|█████▍    | 6033/10986 [2:19:58<1:57:51,  1.43s/it]

training loss: 3.415872812271118


training:  55%|█████▍    | 6034/10986 [2:19:59<1:55:30,  1.40s/it]

training loss: 3.399200201034546


training:  55%|█████▍    | 6035/10986 [2:20:01<1:53:22,  1.37s/it]

training loss: 3.510822057723999


training:  55%|█████▍    | 6036/10986 [2:20:02<1:53:10,  1.37s/it]

training loss: 3.313598394393921


training:  55%|█████▍    | 6037/10986 [2:20:03<1:51:59,  1.36s/it]

training loss: 3.4150807857513428


training:  55%|█████▍    | 6038/10986 [2:20:05<1:51:32,  1.35s/it]

training loss: 3.343266010284424


training:  55%|█████▍    | 6039/10986 [2:20:06<1:51:01,  1.35s/it]

training loss: 3.3552029132843018


training:  55%|█████▍    | 6040/10986 [2:20:07<1:51:17,  1.35s/it]

training loss: 3.369370937347412
valid loss: 3.3680803775787354
perplexity: 29.02276039123535


training:  55%|█████▍    | 6041/10986 [2:20:10<2:29:00,  1.81s/it]

training loss: 3.408104181289673


training:  55%|█████▍    | 6042/10986 [2:20:12<2:25:29,  1.77s/it]

training loss: 3.430635690689087


training:  55%|█████▌    | 6043/10986 [2:20:13<2:15:22,  1.64s/it]

training loss: 3.3036630153656006


training:  55%|█████▌    | 6044/10986 [2:20:14<2:07:31,  1.55s/it]

training loss: 3.350818157196045


training:  55%|█████▌    | 6045/10986 [2:20:16<2:01:55,  1.48s/it]

training loss: 3.4618232250213623


training:  55%|█████▌    | 6046/10986 [2:20:17<1:57:51,  1.43s/it]

training loss: 3.3162217140197754


training:  55%|█████▌    | 6047/10986 [2:20:18<1:55:29,  1.40s/it]

training loss: 3.4625959396362305


training:  55%|█████▌    | 6048/10986 [2:20:20<1:53:11,  1.38s/it]

training loss: 3.3496272563934326


training:  55%|█████▌    | 6049/10986 [2:20:21<1:52:04,  1.36s/it]

training loss: 3.356118679046631


training:  55%|█████▌    | 6050/10986 [2:20:22<1:50:48,  1.35s/it]

training loss: 3.379120349884033


training:  55%|█████▌    | 6051/10986 [2:20:24<1:57:33,  1.43s/it]

training loss: 3.3747293949127197


training:  55%|█████▌    | 6052/10986 [2:20:25<1:57:41,  1.43s/it]

training loss: 3.3091440200805664


training:  55%|█████▌    | 6053/10986 [2:20:27<1:55:07,  1.40s/it]

training loss: 3.3822274208068848


training:  55%|█████▌    | 6054/10986 [2:20:28<1:53:15,  1.38s/it]

training loss: 3.3890981674194336


training:  55%|█████▌    | 6055/10986 [2:20:29<1:51:47,  1.36s/it]

training loss: 3.351424217224121


training:  55%|█████▌    | 6056/10986 [2:20:31<1:50:14,  1.34s/it]

training loss: 3.5180060863494873


training:  55%|█████▌    | 6057/10986 [2:20:32<1:49:44,  1.34s/it]

training loss: 3.4434640407562256


training:  55%|█████▌    | 6058/10986 [2:20:33<1:48:54,  1.33s/it]

training loss: 3.5556414127349854


training:  55%|█████▌    | 6059/10986 [2:20:35<1:48:45,  1.32s/it]

training loss: 3.4486680030822754


training:  55%|█████▌    | 6060/10986 [2:20:36<1:48:58,  1.33s/it]

training loss: 3.305860996246338
valid loss: 3.306244373321533
perplexity: 27.282470703125


training:  55%|█████▌    | 6061/10986 [2:20:39<2:25:42,  1.78s/it]

training loss: 3.4543070793151855


training:  55%|█████▌    | 6062/10986 [2:20:41<2:22:37,  1.74s/it]

training loss: 3.494835138320923


training:  55%|█████▌    | 6063/10986 [2:20:42<2:12:50,  1.62s/it]

training loss: 3.408596992492676


training:  55%|█████▌    | 6064/10986 [2:20:43<2:05:48,  1.53s/it]

training loss: 3.537750005722046


training:  55%|█████▌    | 6065/10986 [2:20:45<2:01:14,  1.48s/it]

training loss: 3.424410104751587


training:  55%|█████▌    | 6066/10986 [2:20:46<1:57:49,  1.44s/it]

training loss: 3.2985057830810547


training:  55%|█████▌    | 6067/10986 [2:20:47<1:55:17,  1.41s/it]

training loss: 3.4437437057495117


training:  55%|█████▌    | 6068/10986 [2:20:49<1:53:14,  1.38s/it]

training loss: 3.491898775100708


training:  55%|█████▌    | 6069/10986 [2:20:50<1:51:57,  1.37s/it]

training loss: 3.337787389755249


training:  55%|█████▌    | 6070/10986 [2:20:51<1:51:37,  1.36s/it]

training loss: 3.3969225883483887


training:  55%|█████▌    | 6071/10986 [2:20:53<1:58:23,  1.45s/it]

training loss: 3.3946588039398193


training:  55%|█████▌    | 6072/10986 [2:20:54<2:02:39,  1.50s/it]

training loss: 3.36026930809021


training:  55%|█████▌    | 6073/10986 [2:20:56<1:58:37,  1.45s/it]

training loss: 3.3878753185272217


training:  55%|█████▌    | 6074/10986 [2:20:57<1:55:14,  1.41s/it]

training loss: 3.315993070602417


training:  55%|█████▌    | 6075/10986 [2:20:58<1:53:39,  1.39s/it]

training loss: 3.4413516521453857


training:  55%|█████▌    | 6076/10986 [2:21:00<1:52:21,  1.37s/it]

training loss: 3.304171323776245


training:  55%|█████▌    | 6077/10986 [2:21:01<1:51:05,  1.36s/it]

training loss: 3.3606374263763428


training:  55%|█████▌    | 6078/10986 [2:21:02<1:50:29,  1.35s/it]

training loss: 3.3818845748901367


training:  55%|█████▌    | 6079/10986 [2:21:04<1:49:22,  1.34s/it]

training loss: 3.3595240116119385


training:  55%|█████▌    | 6080/10986 [2:21:05<1:49:20,  1.34s/it]

training loss: 3.3480260372161865
valid loss: 3.343021869659424
perplexity: 28.30453109741211


training:  55%|█████▌    | 6081/10986 [2:21:08<2:27:23,  1.80s/it]

training loss: 3.4091272354125977


training:  55%|█████▌    | 6082/10986 [2:21:09<2:17:56,  1.69s/it]

training loss: 3.4379312992095947


training:  55%|█████▌    | 6083/10986 [2:21:11<2:08:58,  1.58s/it]

training loss: 3.3906972408294678


training:  55%|█████▌    | 6084/10986 [2:21:12<2:02:37,  1.50s/it]

training loss: 3.3088538646698


training:  55%|█████▌    | 6085/10986 [2:21:13<1:58:24,  1.45s/it]

training loss: 3.3572354316711426


training:  55%|█████▌    | 6086/10986 [2:21:15<1:55:08,  1.41s/it]

training loss: 3.392705202102661


training:  55%|█████▌    | 6087/10986 [2:21:16<1:53:18,  1.39s/it]

training loss: 3.4068827629089355


training:  55%|█████▌    | 6088/10986 [2:21:17<1:51:36,  1.37s/it]

training loss: 3.429088830947876


training:  55%|█████▌    | 6089/10986 [2:21:19<1:50:55,  1.36s/it]

training loss: 3.3510918617248535


training:  55%|█████▌    | 6090/10986 [2:21:20<1:49:53,  1.35s/it]

training loss: 3.4109325408935547


training:  55%|█████▌    | 6091/10986 [2:21:22<1:57:22,  1.44s/it]

training loss: 3.457744836807251


training:  55%|█████▌    | 6092/10986 [2:21:23<2:00:54,  1.48s/it]

training loss: 3.40852689743042


training:  55%|█████▌    | 6093/10986 [2:21:25<1:57:11,  1.44s/it]

training loss: 3.277236223220825


training:  55%|█████▌    | 6094/10986 [2:21:26<1:54:26,  1.40s/it]

training loss: 3.577169179916382


training:  55%|█████▌    | 6095/10986 [2:21:27<1:53:08,  1.39s/it]

training loss: 3.420860767364502


training:  55%|█████▌    | 6096/10986 [2:21:29<1:51:24,  1.37s/it]

training loss: 3.3774609565734863


training:  55%|█████▌    | 6097/10986 [2:21:30<1:50:21,  1.35s/it]

training loss: 3.4006550312042236


training:  56%|█████▌    | 6098/10986 [2:21:31<1:49:15,  1.34s/it]

training loss: 3.303454875946045


training:  56%|█████▌    | 6099/10986 [2:21:33<1:49:26,  1.34s/it]

training loss: 3.4776153564453125


training:  56%|█████▌    | 6100/10986 [2:21:34<1:48:53,  1.34s/it]

training loss: 3.5464882850646973
valid loss: 3.5386481285095215
perplexity: 34.42035675048828


training:  56%|█████▌    | 6101/10986 [2:21:37<2:24:58,  1.78s/it]

training loss: 3.3336544036865234


training:  56%|█████▌    | 6102/10986 [2:21:38<2:16:51,  1.68s/it]

training loss: 3.439849376678467


training:  56%|█████▌    | 6103/10986 [2:21:39<2:08:18,  1.58s/it]

training loss: 3.4043257236480713


training:  56%|█████▌    | 6104/10986 [2:21:41<2:01:42,  1.50s/it]

training loss: 3.4661571979522705


training:  56%|█████▌    | 6105/10986 [2:21:42<1:57:07,  1.44s/it]

training loss: 3.3673503398895264


training:  56%|█████▌    | 6106/10986 [2:21:43<1:54:19,  1.41s/it]

training loss: 3.402991533279419


training:  56%|█████▌    | 6107/10986 [2:21:45<1:52:13,  1.38s/it]

training loss: 3.287102699279785


training:  56%|█████▌    | 6108/10986 [2:21:46<1:51:22,  1.37s/it]

training loss: 3.2705581188201904


training:  56%|█████▌    | 6109/10986 [2:21:47<1:50:10,  1.36s/it]

training loss: 3.428231716156006


training:  56%|█████▌    | 6110/10986 [2:21:49<1:49:33,  1.35s/it]

training loss: 3.28696346282959


training:  56%|█████▌    | 6111/10986 [2:21:51<2:09:47,  1.60s/it]

training loss: 3.348723888397217


training:  56%|█████▌    | 6112/10986 [2:21:53<2:22:40,  1.76s/it]

training loss: 3.441711902618408


training:  56%|█████▌    | 6113/10986 [2:21:54<2:12:21,  1.63s/it]

training loss: 3.310206174850464


training:  56%|█████▌    | 6114/10986 [2:21:56<2:05:09,  1.54s/it]

training loss: 3.4176242351531982


training:  56%|█████▌    | 6115/10986 [2:21:57<1:59:41,  1.47s/it]

training loss: 3.6057183742523193


training:  56%|█████▌    | 6116/10986 [2:21:58<1:56:02,  1.43s/it]

training loss: 3.432450294494629


training:  56%|█████▌    | 6117/10986 [2:22:00<1:53:59,  1.40s/it]

training loss: 3.496922731399536


training:  56%|█████▌    | 6118/10986 [2:22:01<1:52:10,  1.38s/it]

training loss: 3.42991042137146


training:  56%|█████▌    | 6119/10986 [2:22:02<1:51:00,  1.37s/it]

training loss: 3.434659957885742


training:  56%|█████▌    | 6120/10986 [2:22:04<1:49:50,  1.35s/it]

training loss: 3.4923081398010254
valid loss: 3.491196870803833
perplexity: 32.82521057128906


training:  56%|█████▌    | 6121/10986 [2:22:07<2:25:59,  1.80s/it]

training loss: 3.388261318206787


training:  56%|█████▌    | 6122/10986 [2:22:08<2:22:21,  1.76s/it]

training loss: 3.4360501766204834


training:  56%|█████▌    | 6123/10986 [2:22:09<2:11:35,  1.62s/it]

training loss: 3.4395768642425537


training:  56%|█████▌    | 6124/10986 [2:22:11<2:04:41,  1.54s/it]

training loss: 3.3735451698303223


training:  56%|█████▌    | 6125/10986 [2:22:12<1:59:17,  1.47s/it]

training loss: 3.4076223373413086


training:  56%|█████▌    | 6126/10986 [2:22:13<1:55:22,  1.42s/it]

training loss: 3.5458974838256836


training:  56%|█████▌    | 6127/10986 [2:22:15<1:52:38,  1.39s/it]

training loss: 3.3769068717956543


training:  56%|█████▌    | 6128/10986 [2:22:16<1:50:29,  1.36s/it]

training loss: 3.3464441299438477


training:  56%|█████▌    | 6129/10986 [2:22:17<1:48:43,  1.34s/it]

training loss: 3.4553961753845215


training:  56%|█████▌    | 6130/10986 [2:22:19<1:48:14,  1.34s/it]

training loss: 3.6147611141204834


training:  56%|█████▌    | 6131/10986 [2:22:20<1:55:21,  1.43s/it]

training loss: 3.393054246902466


training:  56%|█████▌    | 6132/10986 [2:22:22<2:00:21,  1.49s/it]

training loss: 3.4592597484588623


training:  56%|█████▌    | 6133/10986 [2:22:23<1:56:21,  1.44s/it]

training loss: 3.314143180847168


training:  56%|█████▌    | 6134/10986 [2:22:25<1:53:19,  1.40s/it]

training loss: 3.336890459060669


training:  56%|█████▌    | 6135/10986 [2:22:26<1:51:13,  1.38s/it]

training loss: 3.2946200370788574


training:  56%|█████▌    | 6136/10986 [2:22:27<1:49:36,  1.36s/it]

training loss: 3.2340452671051025


training:  56%|█████▌    | 6137/10986 [2:22:29<1:48:44,  1.35s/it]

training loss: 3.4900460243225098


training:  56%|█████▌    | 6138/10986 [2:22:30<1:47:58,  1.34s/it]

training loss: 3.346954584121704


training:  56%|█████▌    | 6139/10986 [2:22:31<1:47:37,  1.33s/it]

training loss: 3.289005994796753


training:  56%|█████▌    | 6140/10986 [2:22:33<1:47:28,  1.33s/it]

training loss: 3.3488433361053467
valid loss: 3.3468379974365234
perplexity: 28.412750244140625


training:  56%|█████▌    | 6141/10986 [2:22:35<2:23:29,  1.78s/it]

training loss: 3.3997251987457275


training:  56%|█████▌    | 6142/10986 [2:22:37<2:20:37,  1.74s/it]

training loss: 3.3747212886810303


training:  56%|█████▌    | 6143/10986 [2:22:38<2:10:05,  1.61s/it]

training loss: 3.4043338298797607


training:  56%|█████▌    | 6144/10986 [2:22:40<2:03:00,  1.52s/it]

training loss: 3.5060770511627197


training:  56%|█████▌    | 6145/10986 [2:22:41<1:57:59,  1.46s/it]

training loss: 3.4940295219421387


training:  56%|█████▌    | 6146/10986 [2:22:42<1:55:00,  1.43s/it]

training loss: 3.430635452270508


training:  56%|█████▌    | 6147/10986 [2:22:44<1:52:58,  1.40s/it]

training loss: 3.430424213409424


training:  56%|█████▌    | 6148/10986 [2:22:45<1:51:10,  1.38s/it]

training loss: 3.2844207286834717


training:  56%|█████▌    | 6149/10986 [2:22:46<1:49:39,  1.36s/it]

training loss: 3.324266195297241


training:  56%|█████▌    | 6150/10986 [2:22:48<1:48:30,  1.35s/it]

training loss: 3.3756303787231445


training:  56%|█████▌    | 6151/10986 [2:22:49<1:54:49,  1.42s/it]

training loss: 3.385619640350342


training:  56%|█████▌    | 6152/10986 [2:22:51<1:59:39,  1.49s/it]

training loss: 3.366913080215454


training:  56%|█████▌    | 6153/10986 [2:22:52<1:56:27,  1.45s/it]

training loss: 3.3774030208587646


training:  56%|█████▌    | 6154/10986 [2:22:54<1:54:04,  1.42s/it]

training loss: 3.5268239974975586


training:  56%|█████▌    | 6155/10986 [2:22:55<1:51:50,  1.39s/it]

training loss: 3.4864330291748047


training:  56%|█████▌    | 6156/10986 [2:22:56<1:50:14,  1.37s/it]

training loss: 3.4961681365966797


training:  56%|█████▌    | 6157/10986 [2:22:57<1:48:47,  1.35s/it]

training loss: 3.391162157058716


training:  56%|█████▌    | 6158/10986 [2:22:59<1:48:12,  1.34s/it]

training loss: 3.38631534576416


training:  56%|█████▌    | 6159/10986 [2:23:00<1:47:42,  1.34s/it]

training loss: 3.3119757175445557


training:  56%|█████▌    | 6160/10986 [2:23:01<1:47:32,  1.34s/it]

training loss: 3.398505449295044
valid loss: 3.3933682441711426
perplexity: 29.76604461669922


training:  56%|█████▌    | 6161/10986 [2:23:04<2:24:04,  1.79s/it]

training loss: 3.4088399410247803


training:  56%|█████▌    | 6162/10986 [2:23:06<2:20:35,  1.75s/it]

training loss: 3.4335360527038574


training:  56%|█████▌    | 6163/10986 [2:23:07<2:11:07,  1.63s/it]

training loss: 3.2992353439331055


training:  56%|█████▌    | 6164/10986 [2:23:09<2:03:28,  1.54s/it]

training loss: 3.3311045169830322


training:  56%|█████▌    | 6165/10986 [2:23:10<1:58:33,  1.48s/it]

training loss: 3.381875514984131


training:  56%|█████▌    | 6166/10986 [2:23:11<1:54:35,  1.43s/it]

training loss: 3.5057241916656494


training:  56%|█████▌    | 6167/10986 [2:23:13<1:52:22,  1.40s/it]

training loss: 3.484687328338623


training:  56%|█████▌    | 6168/10986 [2:23:14<1:50:45,  1.38s/it]

training loss: 3.381429433822632


training:  56%|█████▌    | 6169/10986 [2:23:15<1:49:20,  1.36s/it]

training loss: 3.316852569580078


training:  56%|█████▌    | 6170/10986 [2:23:17<1:48:22,  1.35s/it]

training loss: 3.436723470687866


training:  56%|█████▌    | 6171/10986 [2:23:18<1:54:39,  1.43s/it]

training loss: 3.3960041999816895


training:  56%|█████▌    | 6172/10986 [2:23:20<1:52:23,  1.40s/it]

training loss: 3.4959921836853027


training:  56%|█████▌    | 6173/10986 [2:23:21<1:50:35,  1.38s/it]

training loss: 3.4018213748931885


training:  56%|█████▌    | 6174/10986 [2:23:22<1:49:35,  1.37s/it]

training loss: 3.500704526901245


training:  56%|█████▌    | 6175/10986 [2:23:24<1:48:39,  1.36s/it]

training loss: 3.363323211669922


training:  56%|█████▌    | 6176/10986 [2:23:25<1:47:30,  1.34s/it]

training loss: 3.3751235008239746


training:  56%|█████▌    | 6177/10986 [2:23:26<1:47:01,  1.34s/it]

training loss: 3.326805591583252


training:  56%|█████▌    | 6178/10986 [2:23:27<1:46:29,  1.33s/it]

training loss: 3.4364981651306152


training:  56%|█████▌    | 6179/10986 [2:23:29<1:46:22,  1.33s/it]

training loss: 3.332676887512207


training:  56%|█████▋    | 6180/10986 [2:23:30<1:46:29,  1.33s/it]

training loss: 3.343733310699463
valid loss: 3.335125684738159
perplexity: 28.081911087036133


training:  56%|█████▋    | 6181/10986 [2:23:33<2:22:10,  1.78s/it]

training loss: 3.344959259033203


training:  56%|█████▋    | 6182/10986 [2:23:34<2:13:38,  1.67s/it]

training loss: 3.3777923583984375


training:  56%|█████▋    | 6183/10986 [2:23:36<2:05:08,  1.56s/it]

training loss: 3.461442708969116


training:  56%|█████▋    | 6184/10986 [2:23:37<1:59:11,  1.49s/it]

training loss: 3.3013103008270264


training:  56%|█████▋    | 6185/10986 [2:23:38<1:55:28,  1.44s/it]

training loss: 3.344395399093628


training:  56%|█████▋    | 6186/10986 [2:23:40<1:52:25,  1.41s/it]

training loss: 3.28167986869812


training:  56%|█████▋    | 6187/10986 [2:23:41<1:50:32,  1.38s/it]

training loss: 3.3859703540802


training:  56%|█████▋    | 6188/10986 [2:23:42<1:49:12,  1.37s/it]

training loss: 3.4165120124816895


training:  56%|█████▋    | 6189/10986 [2:23:44<1:48:18,  1.35s/it]

training loss: 3.3906586170196533


training:  56%|█████▋    | 6190/10986 [2:23:45<1:47:56,  1.35s/it]

training loss: 3.358976125717163


training:  56%|█████▋    | 6191/10986 [2:23:47<1:54:03,  1.43s/it]

training loss: 3.265390396118164


training:  56%|█████▋    | 6192/10986 [2:23:48<1:58:26,  1.48s/it]

training loss: 3.3275508880615234


training:  56%|█████▋    | 6193/10986 [2:23:50<1:54:42,  1.44s/it]

training loss: 3.2872111797332764


training:  56%|█████▋    | 6194/10986 [2:23:51<1:52:12,  1.41s/it]

training loss: 3.3395345211029053


training:  56%|█████▋    | 6195/10986 [2:23:52<1:49:54,  1.38s/it]

training loss: 3.3710479736328125


training:  56%|█████▋    | 6196/10986 [2:23:54<1:49:32,  1.37s/it]

training loss: 3.4147393703460693


training:  56%|█████▋    | 6197/10986 [2:23:55<1:48:19,  1.36s/it]

training loss: 3.4372479915618896


training:  56%|█████▋    | 6198/10986 [2:23:56<1:47:51,  1.35s/it]

training loss: 3.4742119312286377


training:  56%|█████▋    | 6199/10986 [2:23:58<1:47:11,  1.34s/it]

training loss: 3.323854923248291


training:  56%|█████▋    | 6200/10986 [2:23:59<1:46:37,  1.34s/it]

training loss: 3.473198652267456
valid loss: 3.473478317260742
perplexity: 32.248722076416016


training:  56%|█████▋    | 6201/10986 [2:24:02<2:23:00,  1.79s/it]

training loss: 3.333789587020874


training:  56%|█████▋    | 6202/10986 [2:24:03<2:19:33,  1.75s/it]

training loss: 3.316169023513794


training:  56%|█████▋    | 6203/10986 [2:24:05<2:09:06,  1.62s/it]

training loss: 3.3931198120117188


training:  56%|█████▋    | 6204/10986 [2:24:06<2:02:21,  1.54s/it]

training loss: 3.3539130687713623


training:  56%|█████▋    | 6205/10986 [2:24:07<1:57:12,  1.47s/it]

training loss: 3.5399491786956787


training:  56%|█████▋    | 6206/10986 [2:24:09<1:53:22,  1.42s/it]

training loss: 3.522510528564453


training:  56%|█████▋    | 6207/10986 [2:24:10<1:50:30,  1.39s/it]

training loss: 3.3383703231811523


training:  57%|█████▋    | 6208/10986 [2:24:11<1:48:40,  1.36s/it]

training loss: 3.3296566009521484


training:  57%|█████▋    | 6209/10986 [2:24:13<1:47:36,  1.35s/it]

training loss: 3.411242723464966


training:  57%|█████▋    | 6210/10986 [2:24:14<1:46:39,  1.34s/it]

training loss: 3.450899124145508


training:  57%|█████▋    | 6211/10986 [2:24:16<1:53:33,  1.43s/it]

training loss: 3.4201924800872803


training:  57%|█████▋    | 6212/10986 [2:24:17<1:56:31,  1.46s/it]

training loss: 3.3264079093933105


training:  57%|█████▋    | 6213/10986 [2:24:18<1:52:32,  1.41s/it]

training loss: 3.376051187515259


training:  57%|█████▋    | 6214/10986 [2:24:20<1:50:05,  1.38s/it]

training loss: 3.6003687381744385


training:  57%|█████▋    | 6215/10986 [2:24:21<1:48:35,  1.37s/it]

training loss: 3.4396843910217285


training:  57%|█████▋    | 6216/10986 [2:24:22<1:47:16,  1.35s/it]

training loss: 3.421292543411255


training:  57%|█████▋    | 6217/10986 [2:24:24<1:47:36,  1.35s/it]

training loss: 3.469611406326294


training:  57%|█████▋    | 6218/10986 [2:24:25<1:47:24,  1.35s/it]

training loss: 3.606821060180664


training:  57%|█████▋    | 6219/10986 [2:24:26<1:46:52,  1.35s/it]

training loss: 3.419201374053955


training:  57%|█████▋    | 6220/10986 [2:24:28<1:46:23,  1.34s/it]

training loss: 3.3313746452331543
valid loss: 3.328075885772705
perplexity: 27.8846378326416


training:  57%|█████▋    | 6221/10986 [2:24:30<2:20:56,  1.77s/it]

training loss: 3.374476671218872


training:  57%|█████▋    | 6222/10986 [2:24:32<2:11:34,  1.66s/it]

training loss: 3.429979085922241


training:  57%|█████▋    | 6223/10986 [2:24:33<2:03:48,  1.56s/it]

training loss: 3.341938018798828


training:  57%|█████▋    | 6224/10986 [2:24:34<1:57:58,  1.49s/it]

training loss: 3.3951494693756104


training:  57%|█████▋    | 6225/10986 [2:24:36<1:53:47,  1.43s/it]

training loss: 3.42880916595459


training:  57%|█████▋    | 6226/10986 [2:24:37<1:50:45,  1.40s/it]

training loss: 3.373908281326294


training:  57%|█████▋    | 6227/10986 [2:24:38<1:49:22,  1.38s/it]

training loss: 3.5355801582336426


training:  57%|█████▋    | 6228/10986 [2:24:40<1:47:47,  1.36s/it]

training loss: 3.3927831649780273


training:  57%|█████▋    | 6229/10986 [2:24:41<1:46:47,  1.35s/it]

training loss: 3.373098611831665


training:  57%|█████▋    | 6230/10986 [2:24:42<1:46:13,  1.34s/it]

training loss: 3.376039505004883


training:  57%|█████▋    | 6231/10986 [2:24:44<1:52:12,  1.42s/it]

training loss: 3.312300205230713


training:  57%|█████▋    | 6232/10986 [2:24:45<1:50:28,  1.39s/it]

training loss: 3.422455310821533


training:  57%|█████▋    | 6233/10986 [2:24:47<1:48:49,  1.37s/it]

training loss: 3.477642059326172


training:  57%|█████▋    | 6234/10986 [2:24:48<1:47:25,  1.36s/it]

training loss: 3.3604977130889893


training:  57%|█████▋    | 6235/10986 [2:24:49<1:46:25,  1.34s/it]

training loss: 3.5531435012817383


training:  57%|█████▋    | 6236/10986 [2:24:51<1:45:38,  1.33s/it]

training loss: 3.4402732849121094


training:  57%|█████▋    | 6237/10986 [2:24:52<1:45:21,  1.33s/it]

training loss: 3.4897513389587402


training:  57%|█████▋    | 6238/10986 [2:24:53<1:45:17,  1.33s/it]

training loss: 3.3796074390411377


training:  57%|█████▋    | 6239/10986 [2:24:55<1:45:21,  1.33s/it]

training loss: 3.4716176986694336


training:  57%|█████▋    | 6240/10986 [2:24:56<1:44:32,  1.32s/it]

training loss: 3.432678699493408
valid loss: 3.4336624145507812
perplexity: 30.989933013916016


training:  57%|█████▋    | 6241/10986 [2:24:59<2:19:22,  1.76s/it]

training loss: 3.4721224308013916


training:  57%|█████▋    | 6242/10986 [2:25:00<2:10:27,  1.65s/it]

training loss: 3.319992780685425


training:  57%|█████▋    | 6243/10986 [2:25:01<2:02:28,  1.55s/it]

training loss: 3.305133581161499


training:  57%|█████▋    | 6244/10986 [2:25:03<1:56:54,  1.48s/it]

training loss: 3.2928872108459473


training:  57%|█████▋    | 6245/10986 [2:25:04<1:52:47,  1.43s/it]

training loss: 3.399843215942383


training:  57%|█████▋    | 6246/10986 [2:25:05<1:49:58,  1.39s/it]

training loss: 3.418428421020508


training:  57%|█████▋    | 6247/10986 [2:25:07<1:47:26,  1.36s/it]

training loss: 3.457192897796631


training:  57%|█████▋    | 6248/10986 [2:25:08<1:46:24,  1.35s/it]

training loss: 3.376209259033203


training:  57%|█████▋    | 6249/10986 [2:25:09<1:45:34,  1.34s/it]

training loss: 3.3085896968841553


training:  57%|█████▋    | 6250/10986 [2:25:11<1:45:18,  1.33s/it]

training loss: 3.3266148567199707


training:  57%|█████▋    | 6251/10986 [2:25:12<1:51:46,  1.42s/it]

training loss: 3.404573440551758


training:  57%|█████▋    | 6252/10986 [2:25:14<1:50:03,  1.39s/it]

training loss: 3.3897781372070312


training:  57%|█████▋    | 6253/10986 [2:25:15<1:48:40,  1.38s/it]

training loss: 3.40012264251709


training:  57%|█████▋    | 6254/10986 [2:25:16<1:47:21,  1.36s/it]

training loss: 3.4353790283203125


training:  57%|█████▋    | 6255/10986 [2:25:17<1:46:03,  1.35s/it]

training loss: 3.3871679306030273


training:  57%|█████▋    | 6256/10986 [2:25:19<1:46:00,  1.34s/it]

training loss: 3.3359272480010986


training:  57%|█████▋    | 6257/10986 [2:25:20<1:45:12,  1.33s/it]

training loss: 3.3663253784179688


training:  57%|█████▋    | 6258/10986 [2:25:21<1:45:18,  1.34s/it]

training loss: 3.3175408840179443


training:  57%|█████▋    | 6259/10986 [2:25:23<1:52:57,  1.43s/it]

training loss: 3.4535558223724365


training:  57%|█████▋    | 6260/10986 [2:25:25<2:01:33,  1.54s/it]

training loss: 3.4383206367492676
valid loss: 3.4378490447998047
perplexity: 31.11994743347168


training:  57%|█████▋    | 6261/10986 [2:25:28<2:35:40,  1.98s/it]

training loss: 3.455646514892578


training:  57%|█████▋    | 6262/10986 [2:25:29<2:22:36,  1.81s/it]

training loss: 3.461207151412964


training:  57%|█████▋    | 6263/10986 [2:25:31<2:10:56,  1.66s/it]

training loss: 3.442854166030884


training:  57%|█████▋    | 6264/10986 [2:25:32<2:03:00,  1.56s/it]

training loss: 3.4877498149871826


training:  57%|█████▋    | 6265/10986 [2:25:33<1:57:36,  1.49s/it]

training loss: 3.3079328536987305


training:  57%|█████▋    | 6266/10986 [2:25:35<1:53:38,  1.44s/it]

training loss: 3.3769309520721436


training:  57%|█████▋    | 6267/10986 [2:25:36<1:50:42,  1.41s/it]

training loss: 3.480365514755249


training:  57%|█████▋    | 6268/10986 [2:25:37<1:49:00,  1.39s/it]

training loss: 3.3820526599884033


training:  57%|█████▋    | 6269/10986 [2:25:39<1:47:45,  1.37s/it]

training loss: 3.4864861965179443


training:  57%|█████▋    | 6270/10986 [2:25:40<1:46:35,  1.36s/it]

training loss: 3.3760271072387695


training:  57%|█████▋    | 6271/10986 [2:25:42<1:52:38,  1.43s/it]

training loss: 3.3279857635498047


training:  57%|█████▋    | 6272/10986 [2:25:43<1:51:52,  1.42s/it]

training loss: 3.332267999649048


training:  57%|█████▋    | 6273/10986 [2:25:44<1:49:38,  1.40s/it]

training loss: 3.4075894355773926


training:  57%|█████▋    | 6274/10986 [2:25:46<1:47:34,  1.37s/it]

training loss: 3.349564552307129


training:  57%|█████▋    | 6275/10986 [2:25:47<1:46:15,  1.35s/it]

training loss: 3.434781551361084


training:  57%|█████▋    | 6276/10986 [2:25:48<1:45:23,  1.34s/it]

training loss: 3.2934885025024414


training:  57%|█████▋    | 6277/10986 [2:25:50<1:44:28,  1.33s/it]

training loss: 3.4111011028289795


training:  57%|█████▋    | 6278/10986 [2:25:51<1:43:57,  1.32s/it]

training loss: 3.3809378147125244


training:  57%|█████▋    | 6279/10986 [2:25:52<1:44:23,  1.33s/it]

training loss: 3.342393636703491


training:  57%|█████▋    | 6280/10986 [2:25:54<1:44:22,  1.33s/it]

training loss: 3.3335416316986084
valid loss: 3.331918239593506
perplexity: 27.991985321044922


training:  57%|█████▋    | 6281/10986 [2:25:56<2:20:02,  1.79s/it]

training loss: 3.4126503467559814


training:  57%|█████▋    | 6282/10986 [2:25:58<2:16:48,  1.74s/it]

training loss: 3.4195196628570557


training:  57%|█████▋    | 6283/10986 [2:25:59<2:07:07,  1.62s/it]

training loss: 3.3451709747314453


training:  57%|█████▋    | 6284/10986 [2:26:01<2:00:11,  1.53s/it]

training loss: 3.3723323345184326


training:  57%|█████▋    | 6285/10986 [2:26:02<1:55:36,  1.48s/it]

training loss: 3.4263792037963867


training:  57%|█████▋    | 6286/10986 [2:26:03<1:52:50,  1.44s/it]

training loss: 3.422351837158203


training:  57%|█████▋    | 6287/10986 [2:26:05<1:50:25,  1.41s/it]

training loss: 3.370835781097412


training:  57%|█████▋    | 6288/10986 [2:26:06<1:48:33,  1.39s/it]

training loss: 3.539059638977051


training:  57%|█████▋    | 6289/10986 [2:26:07<1:47:21,  1.37s/it]

training loss: 3.4962258338928223


training:  57%|█████▋    | 6290/10986 [2:26:09<1:46:30,  1.36s/it]

training loss: 3.4439785480499268


training:  57%|█████▋    | 6291/10986 [2:26:10<1:53:28,  1.45s/it]

training loss: 3.466691732406616


training:  57%|█████▋    | 6292/10986 [2:26:12<1:58:07,  1.51s/it]

training loss: 3.4582576751708984


training:  57%|█████▋    | 6293/10986 [2:26:13<1:54:19,  1.46s/it]

training loss: 3.4130077362060547


training:  57%|█████▋    | 6294/10986 [2:26:15<1:51:15,  1.42s/it]

training loss: 3.3585965633392334


training:  57%|█████▋    | 6295/10986 [2:26:16<1:49:21,  1.40s/it]

training loss: 3.4323441982269287


training:  57%|█████▋    | 6296/10986 [2:26:17<1:47:34,  1.38s/it]

training loss: 3.3038768768310547


training:  57%|█████▋    | 6297/10986 [2:26:19<1:47:29,  1.38s/it]

training loss: 3.429014205932617


training:  57%|█████▋    | 6298/10986 [2:26:20<1:46:27,  1.36s/it]

training loss: 3.3980624675750732


training:  57%|█████▋    | 6299/10986 [2:26:21<1:45:55,  1.36s/it]

training loss: 3.302412509918213


training:  57%|█████▋    | 6300/10986 [2:26:23<1:45:09,  1.35s/it]

training loss: 3.3821802139282227
valid loss: 3.3806018829345703
perplexity: 29.388452529907227


training:  57%|█████▋    | 6301/10986 [2:26:26<2:20:49,  1.80s/it]

training loss: 3.438399314880371


training:  57%|█████▋    | 6302/10986 [2:26:27<2:12:08,  1.69s/it]

training loss: 3.347038507461548


training:  57%|█████▋    | 6303/10986 [2:26:28<2:03:54,  1.59s/it]

training loss: 3.500242233276367


training:  57%|█████▋    | 6304/10986 [2:26:30<1:57:52,  1.51s/it]

training loss: 3.443885087966919


training:  57%|█████▋    | 6305/10986 [2:26:31<1:54:14,  1.46s/it]

training loss: 3.519091844558716


training:  57%|█████▋    | 6306/10986 [2:26:32<1:51:39,  1.43s/it]

training loss: 3.5511341094970703


training:  57%|█████▋    | 6307/10986 [2:26:34<1:49:36,  1.41s/it]

training loss: 3.3620026111602783


training:  57%|█████▋    | 6308/10986 [2:26:35<1:47:48,  1.38s/it]

training loss: 3.401221513748169


training:  57%|█████▋    | 6309/10986 [2:26:36<1:46:30,  1.37s/it]

training loss: 3.4051566123962402


training:  57%|█████▋    | 6310/10986 [2:26:38<1:45:38,  1.36s/it]

training loss: 3.417283773422241


training:  57%|█████▋    | 6311/10986 [2:26:39<1:51:50,  1.44s/it]

training loss: 3.4471826553344727


training:  57%|█████▋    | 6312/10986 [2:26:41<1:49:38,  1.41s/it]

training loss: 3.555516242980957


training:  57%|█████▋    | 6313/10986 [2:26:42<1:47:23,  1.38s/it]

training loss: 3.397339344024658


training:  57%|█████▋    | 6314/10986 [2:26:43<1:46:06,  1.36s/it]

training loss: 3.532170534133911


training:  57%|█████▋    | 6315/10986 [2:26:45<1:45:14,  1.35s/it]

training loss: 3.388404369354248


training:  57%|█████▋    | 6316/10986 [2:26:46<1:45:19,  1.35s/it]

training loss: 3.474081039428711


training:  58%|█████▊    | 6317/10986 [2:26:47<1:44:46,  1.35s/it]

training loss: 3.5498664379119873


training:  58%|█████▊    | 6318/10986 [2:26:49<1:43:58,  1.34s/it]

training loss: 3.4334704875946045


training:  58%|█████▊    | 6319/10986 [2:26:50<1:43:45,  1.33s/it]

training loss: 3.4583730697631836


training:  58%|█████▊    | 6320/10986 [2:26:51<1:43:43,  1.33s/it]

training loss: 3.310225009918213
valid loss: 3.307774782180786
perplexity: 27.324254989624023


training:  58%|█████▊    | 6321/10986 [2:26:54<2:18:50,  1.79s/it]

training loss: 3.339838981628418


training:  58%|█████▊    | 6322/10986 [2:26:56<2:10:34,  1.68s/it]

training loss: 3.370800018310547


training:  58%|█████▊    | 6323/10986 [2:26:57<2:03:06,  1.58s/it]

training loss: 3.360626697540283


training:  58%|█████▊    | 6324/10986 [2:26:58<1:57:32,  1.51s/it]

training loss: 3.406712055206299


training:  58%|█████▊    | 6325/10986 [2:27:00<1:53:14,  1.46s/it]

training loss: 3.4289770126342773


training:  58%|█████▊    | 6326/10986 [2:27:01<1:50:17,  1.42s/it]

training loss: 3.3730921745300293


training:  58%|█████▊    | 6327/10986 [2:27:02<1:48:19,  1.40s/it]

training loss: 3.4141478538513184


training:  58%|█████▊    | 6328/10986 [2:27:04<1:46:24,  1.37s/it]

training loss: 3.4451935291290283


training:  58%|█████▊    | 6329/10986 [2:27:05<1:45:20,  1.36s/it]

training loss: 3.481719970703125


training:  58%|█████▊    | 6330/10986 [2:27:06<1:44:34,  1.35s/it]

training loss: 3.369291305541992


training:  58%|█████▊    | 6331/10986 [2:27:08<1:51:23,  1.44s/it]

training loss: 3.2909576892852783


training:  58%|█████▊    | 6332/10986 [2:27:09<1:49:57,  1.42s/it]

training loss: 3.424553871154785


training:  58%|█████▊    | 6333/10986 [2:27:11<1:48:11,  1.40s/it]

training loss: 3.442983388900757


training:  58%|█████▊    | 6334/10986 [2:27:12<1:46:33,  1.37s/it]

training loss: 3.4676120281219482


training:  58%|█████▊    | 6335/10986 [2:27:13<1:45:21,  1.36s/it]

training loss: 3.494732618331909


training:  58%|█████▊    | 6336/10986 [2:27:15<1:44:56,  1.35s/it]

training loss: 3.3553049564361572


training:  58%|█████▊    | 6337/10986 [2:27:16<1:44:00,  1.34s/it]

training loss: 3.498326539993286


training:  58%|█████▊    | 6338/10986 [2:27:17<1:43:45,  1.34s/it]

training loss: 3.3053460121154785


training:  58%|█████▊    | 6339/10986 [2:27:19<1:43:14,  1.33s/it]

training loss: 3.3399393558502197


training:  58%|█████▊    | 6340/10986 [2:27:20<1:43:07,  1.33s/it]

training loss: 3.2917699813842773
valid loss: 3.291489839553833
perplexity: 26.882883071899414


training:  58%|█████▊    | 6341/10986 [2:27:23<2:17:33,  1.78s/it]

training loss: 3.3931641578674316


training:  58%|█████▊    | 6342/10986 [2:27:24<2:15:28,  1.75s/it]

training loss: 3.4913856983184814


training:  58%|█████▊    | 6343/10986 [2:27:26<2:05:57,  1.63s/it]

training loss: 3.4666552543640137


training:  58%|█████▊    | 6344/10986 [2:27:27<1:58:44,  1.53s/it]

training loss: 3.4234426021575928


training:  58%|█████▊    | 6345/10986 [2:27:29<1:55:11,  1.49s/it]

training loss: 3.4385082721710205


training:  58%|█████▊    | 6346/10986 [2:27:30<1:51:45,  1.45s/it]

training loss: 3.434480667114258


training:  58%|█████▊    | 6347/10986 [2:27:31<1:48:32,  1.40s/it]

training loss: 3.340810775756836


training:  58%|█████▊    | 6348/10986 [2:27:33<1:46:27,  1.38s/it]

training loss: 3.418740749359131


training:  58%|█████▊    | 6349/10986 [2:27:34<1:44:57,  1.36s/it]

training loss: 3.2989277839660645


training:  58%|█████▊    | 6350/10986 [2:27:35<1:44:09,  1.35s/it]

training loss: 3.3711671829223633


training:  58%|█████▊    | 6351/10986 [2:27:37<1:50:43,  1.43s/it]

training loss: 3.3876984119415283


training:  58%|█████▊    | 6352/10986 [2:27:38<1:55:47,  1.50s/it]

training loss: 3.443891763687134


training:  58%|█████▊    | 6353/10986 [2:27:40<1:52:39,  1.46s/it]

training loss: 3.5008628368377686


training:  58%|█████▊    | 6354/10986 [2:27:41<1:49:54,  1.42s/it]

training loss: 3.374924898147583


training:  58%|█████▊    | 6355/10986 [2:27:42<1:47:45,  1.40s/it]

training loss: 3.373589515686035


training:  58%|█████▊    | 6356/10986 [2:27:44<1:45:44,  1.37s/it]

training loss: 3.3234846591949463


training:  58%|█████▊    | 6357/10986 [2:27:45<1:44:38,  1.36s/it]

training loss: 3.3805251121520996


training:  58%|█████▊    | 6358/10986 [2:27:46<1:43:20,  1.34s/it]

training loss: 3.4311306476593018


training:  58%|█████▊    | 6359/10986 [2:27:48<1:42:17,  1.33s/it]

training loss: 3.4219894409179688


training:  58%|█████▊    | 6360/10986 [2:27:49<1:42:06,  1.32s/it]

training loss: 3.4559342861175537
valid loss: 3.4519612789154053
perplexity: 31.562232971191406


training:  58%|█████▊    | 6361/10986 [2:27:52<2:16:33,  1.77s/it]

training loss: 3.4438788890838623


training:  58%|█████▊    | 6362/10986 [2:27:54<2:15:03,  1.75s/it]

training loss: 3.4523932933807373


training:  58%|█████▊    | 6363/10986 [2:27:55<2:04:43,  1.62s/it]

training loss: 3.378331184387207


training:  58%|█████▊    | 6364/10986 [2:27:56<1:57:32,  1.53s/it]

training loss: 3.3007678985595703


training:  58%|█████▊    | 6365/10986 [2:27:57<1:52:27,  1.46s/it]

training loss: 3.3437368869781494


training:  58%|█████▊    | 6366/10986 [2:27:59<1:49:47,  1.43s/it]

training loss: 3.2509279251098633


training:  58%|█████▊    | 6367/10986 [2:28:00<1:47:11,  1.39s/it]

training loss: 3.3215880393981934


training:  58%|█████▊    | 6368/10986 [2:28:01<1:45:47,  1.37s/it]

training loss: 3.3533520698547363


training:  58%|█████▊    | 6369/10986 [2:28:03<1:44:34,  1.36s/it]

training loss: 3.466564178466797


training:  58%|█████▊    | 6370/10986 [2:28:04<1:43:28,  1.34s/it]

training loss: 3.405801773071289


training:  58%|█████▊    | 6371/10986 [2:28:06<1:49:54,  1.43s/it]

training loss: 3.4955756664276123


training:  58%|█████▊    | 6372/10986 [2:28:07<1:53:41,  1.48s/it]

training loss: 3.4852828979492188


training:  58%|█████▊    | 6373/10986 [2:28:09<1:50:08,  1.43s/it]

training loss: 3.3711202144622803


training:  58%|█████▊    | 6374/10986 [2:28:10<1:47:35,  1.40s/it]

training loss: 3.315561532974243


training:  58%|█████▊    | 6375/10986 [2:28:11<1:46:04,  1.38s/it]

training loss: 3.477964162826538


training:  58%|█████▊    | 6376/10986 [2:28:13<1:45:22,  1.37s/it]

training loss: 3.505432605743408


training:  58%|█████▊    | 6377/10986 [2:28:14<1:44:15,  1.36s/it]

training loss: 3.493678331375122


training:  58%|█████▊    | 6378/10986 [2:28:15<1:43:20,  1.35s/it]

training loss: 3.444939613342285


training:  58%|█████▊    | 6379/10986 [2:28:17<1:43:02,  1.34s/it]

training loss: 3.4631476402282715


training:  58%|█████▊    | 6380/10986 [2:28:18<1:42:36,  1.34s/it]

training loss: 3.421797275543213
valid loss: 3.4167561531066895
perplexity: 30.470413208007812


training:  58%|█████▊    | 6381/10986 [2:28:21<2:17:40,  1.79s/it]

training loss: 3.3537137508392334


training:  58%|█████▊    | 6382/10986 [2:28:22<2:10:07,  1.70s/it]

training loss: 3.3565452098846436


training:  58%|█████▊    | 6383/10986 [2:28:24<2:01:47,  1.59s/it]

training loss: 3.4443576335906982


training:  58%|█████▊    | 6384/10986 [2:28:25<1:55:40,  1.51s/it]

training loss: 3.4292893409729004


training:  58%|█████▊    | 6385/10986 [2:28:26<1:51:16,  1.45s/it]

training loss: 3.4327163696289062


training:  58%|█████▊    | 6386/10986 [2:28:28<1:48:05,  1.41s/it]

training loss: 3.367922306060791


training:  58%|█████▊    | 6387/10986 [2:28:29<1:46:34,  1.39s/it]

training loss: 3.391422748565674


training:  58%|█████▊    | 6388/10986 [2:28:30<1:45:57,  1.38s/it]

training loss: 3.396289110183716


training:  58%|█████▊    | 6389/10986 [2:28:32<1:44:29,  1.36s/it]

training loss: 3.3162951469421387


training:  58%|█████▊    | 6390/10986 [2:28:33<1:43:16,  1.35s/it]

training loss: 3.4169986248016357


training:  58%|█████▊    | 6391/10986 [2:28:35<1:49:39,  1.43s/it]

training loss: 3.481041193008423


training:  58%|█████▊    | 6392/10986 [2:28:36<1:47:19,  1.40s/it]

training loss: 3.347726821899414


training:  58%|█████▊    | 6393/10986 [2:28:37<1:45:22,  1.38s/it]

training loss: 3.522590160369873


training:  58%|█████▊    | 6394/10986 [2:28:38<1:43:55,  1.36s/it]

training loss: 3.6010539531707764


training:  58%|█████▊    | 6395/10986 [2:28:40<1:43:07,  1.35s/it]

training loss: 3.470224142074585


training:  58%|█████▊    | 6396/10986 [2:28:41<1:42:54,  1.35s/it]

training loss: 3.4429471492767334


training:  58%|█████▊    | 6397/10986 [2:28:43<1:43:08,  1.35s/it]

training loss: 3.3105502128601074


training:  58%|█████▊    | 6398/10986 [2:28:44<1:42:38,  1.34s/it]

training loss: 3.52622652053833


training:  58%|█████▊    | 6399/10986 [2:28:45<1:42:32,  1.34s/it]

training loss: 3.3249149322509766


training:  58%|█████▊    | 6400/10986 [2:28:46<1:42:05,  1.34s/it]

training loss: 3.359739303588867
valid loss: 3.3560314178466797
perplexity: 28.6751651763916


training:  58%|█████▊    | 6401/10986 [2:28:49<2:16:32,  1.79s/it]

training loss: 3.3306150436401367


training:  58%|█████▊    | 6402/10986 [2:28:51<2:09:47,  1.70s/it]

training loss: 3.389244318008423


training:  58%|█████▊    | 6403/10986 [2:28:52<2:01:23,  1.59s/it]

training loss: 3.4530029296875


training:  58%|█████▊    | 6404/10986 [2:28:53<1:55:27,  1.51s/it]

training loss: 3.440293788909912


training:  58%|█████▊    | 6405/10986 [2:28:55<1:51:03,  1.45s/it]

training loss: 3.407270908355713


training:  58%|█████▊    | 6406/10986 [2:28:56<1:47:45,  1.41s/it]

training loss: 3.37709903717041


training:  58%|█████▊    | 6407/10986 [2:28:57<1:45:45,  1.39s/it]

training loss: 3.4990978240966797


training:  58%|█████▊    | 6408/10986 [2:28:59<1:45:00,  1.38s/it]

training loss: 3.4016470909118652


training:  58%|█████▊    | 6409/10986 [2:29:01<1:53:25,  1.49s/it]

training loss: 3.48939847946167


training:  58%|█████▊    | 6410/10986 [2:29:02<2:00:20,  1.58s/it]

training loss: 3.331638813018799


training:  58%|█████▊    | 6411/10986 [2:29:04<2:03:18,  1.62s/it]

training loss: 3.4031448364257812


training:  58%|█████▊    | 6412/10986 [2:29:06<2:02:21,  1.61s/it]

training loss: 3.4190547466278076


training:  58%|█████▊    | 6413/10986 [2:29:07<1:55:40,  1.52s/it]

training loss: 3.364044666290283


training:  58%|█████▊    | 6414/10986 [2:29:08<1:51:36,  1.46s/it]

training loss: 3.499025583267212


training:  58%|█████▊    | 6415/10986 [2:29:10<1:48:03,  1.42s/it]

training loss: 3.472975730895996


training:  58%|█████▊    | 6416/10986 [2:29:11<1:45:53,  1.39s/it]

training loss: 3.4420289993286133


training:  58%|█████▊    | 6417/10986 [2:29:12<1:44:10,  1.37s/it]

training loss: 3.457263231277466


training:  58%|█████▊    | 6418/10986 [2:29:14<1:42:55,  1.35s/it]

training loss: 3.481633424758911


training:  58%|█████▊    | 6419/10986 [2:29:15<1:42:02,  1.34s/it]

training loss: 3.3573670387268066


training:  58%|█████▊    | 6420/10986 [2:29:16<1:41:31,  1.33s/it]

training loss: 3.3229873180389404
valid loss: 3.3193044662475586
perplexity: 27.6411190032959


training:  58%|█████▊    | 6421/10986 [2:29:19<2:16:06,  1.79s/it]

training loss: 3.342054843902588


training:  58%|█████▊    | 6422/10986 [2:29:21<2:09:59,  1.71s/it]

training loss: 3.438628673553467


training:  58%|█████▊    | 6423/10986 [2:29:22<2:01:09,  1.59s/it]

training loss: 3.336515188217163


training:  58%|█████▊    | 6424/10986 [2:29:23<1:54:56,  1.51s/it]

training loss: 3.3120675086975098


training:  58%|█████▊    | 6425/10986 [2:29:25<1:51:13,  1.46s/it]

training loss: 3.3984577655792236


training:  58%|█████▊    | 6426/10986 [2:29:26<1:47:41,  1.42s/it]

training loss: 3.453078269958496


training:  59%|█████▊    | 6427/10986 [2:29:27<1:45:51,  1.39s/it]

training loss: 3.421766996383667


training:  59%|█████▊    | 6428/10986 [2:29:29<1:44:16,  1.37s/it]

training loss: 3.385221242904663


training:  59%|█████▊    | 6429/10986 [2:29:30<1:43:12,  1.36s/it]

training loss: 3.415233612060547


training:  59%|█████▊    | 6430/10986 [2:29:31<1:42:46,  1.35s/it]

training loss: 3.566340446472168


training:  59%|█████▊    | 6431/10986 [2:29:33<1:48:53,  1.43s/it]

training loss: 3.474902629852295


training:  59%|█████▊    | 6432/10986 [2:29:34<1:54:30,  1.51s/it]

training loss: 3.307849884033203


training:  59%|█████▊    | 6433/10986 [2:29:36<1:51:39,  1.47s/it]

training loss: 3.4224720001220703


training:  59%|█████▊    | 6434/10986 [2:29:37<1:48:04,  1.42s/it]

training loss: 3.4270050525665283


training:  59%|█████▊    | 6435/10986 [2:29:39<1:45:45,  1.39s/it]

training loss: 3.5425941944122314


training:  59%|█████▊    | 6436/10986 [2:29:40<1:44:07,  1.37s/it]

training loss: 3.3842620849609375


training:  59%|█████▊    | 6437/10986 [2:29:41<1:43:14,  1.36s/it]

training loss: 3.4768104553222656


training:  59%|█████▊    | 6438/10986 [2:29:42<1:41:47,  1.34s/it]

training loss: 3.332716941833496


training:  59%|█████▊    | 6439/10986 [2:29:44<1:41:44,  1.34s/it]

training loss: 3.365802526473999


training:  59%|█████▊    | 6440/10986 [2:29:45<1:41:37,  1.34s/it]

training loss: 3.264148712158203
valid loss: 3.2622897624969482
perplexity: 26.109251022338867


training:  59%|█████▊    | 6441/10986 [2:29:48<2:15:50,  1.79s/it]

training loss: 3.407341957092285


training:  59%|█████▊    | 6442/10986 [2:29:49<2:06:59,  1.68s/it]

training loss: 3.456613779067993


training:  59%|█████▊    | 6443/10986 [2:29:51<1:58:52,  1.57s/it]

training loss: 3.4334871768951416


training:  59%|█████▊    | 6444/10986 [2:29:52<1:53:19,  1.50s/it]

training loss: 3.4429187774658203


training:  59%|█████▊    | 6445/10986 [2:29:53<1:48:46,  1.44s/it]

training loss: 3.4407432079315186


training:  59%|█████▊    | 6446/10986 [2:29:55<1:46:07,  1.40s/it]

training loss: 3.441927909851074


training:  59%|█████▊    | 6447/10986 [2:29:56<1:44:29,  1.38s/it]

training loss: 3.4845290184020996


training:  59%|█████▊    | 6448/10986 [2:29:57<1:43:16,  1.37s/it]

training loss: 3.5310049057006836


training:  59%|█████▊    | 6449/10986 [2:29:59<1:42:26,  1.35s/it]

training loss: 3.469554901123047


training:  59%|█████▊    | 6450/10986 [2:30:00<1:42:33,  1.36s/it]

training loss: 3.4188032150268555


training:  59%|█████▊    | 6451/10986 [2:30:02<1:48:51,  1.44s/it]

training loss: 3.512089252471924


training:  59%|█████▊    | 6452/10986 [2:30:03<1:51:51,  1.48s/it]

training loss: 3.3727920055389404


training:  59%|█████▊    | 6453/10986 [2:30:05<1:47:46,  1.43s/it]

training loss: 3.3814446926116943


training:  59%|█████▊    | 6454/10986 [2:30:06<1:45:12,  1.39s/it]

training loss: 3.416455030441284


training:  59%|█████▉    | 6455/10986 [2:30:07<1:43:44,  1.37s/it]

training loss: 3.4998483657836914


training:  59%|█████▉    | 6456/10986 [2:30:09<1:42:42,  1.36s/it]

training loss: 3.3590118885040283


training:  59%|█████▉    | 6457/10986 [2:30:10<1:41:29,  1.34s/it]

training loss: 3.3647968769073486


training:  59%|█████▉    | 6458/10986 [2:30:11<1:41:20,  1.34s/it]

training loss: 3.498967409133911


training:  59%|█████▉    | 6459/10986 [2:30:12<1:41:15,  1.34s/it]

training loss: 3.515648365020752


training:  59%|█████▉    | 6460/10986 [2:30:14<1:40:57,  1.34s/it]

training loss: 3.421665906906128
valid loss: 3.4163014888763428
perplexity: 30.456562042236328


training:  59%|█████▉    | 6461/10986 [2:30:17<2:15:33,  1.80s/it]

training loss: 3.319526433944702


training:  59%|█████▉    | 6462/10986 [2:30:18<2:07:40,  1.69s/it]

training loss: 3.4141738414764404


training:  59%|█████▉    | 6463/10986 [2:30:19<1:59:40,  1.59s/it]

training loss: 3.3192238807678223


training:  59%|█████▉    | 6464/10986 [2:30:21<1:54:09,  1.51s/it]

training loss: 3.4668946266174316


training:  59%|█████▉    | 6465/10986 [2:30:22<1:50:12,  1.46s/it]

training loss: 3.4945991039276123


training:  59%|█████▉    | 6466/10986 [2:30:24<1:47:32,  1.43s/it]

training loss: 3.4411301612854004


training:  59%|█████▉    | 6467/10986 [2:30:25<1:45:19,  1.40s/it]

training loss: 3.394155502319336


training:  59%|█████▉    | 6468/10986 [2:30:26<1:43:48,  1.38s/it]

training loss: 3.453512191772461


training:  59%|█████▉    | 6469/10986 [2:30:28<1:42:48,  1.37s/it]

training loss: 3.374753475189209


training:  59%|█████▉    | 6470/10986 [2:30:29<1:42:31,  1.36s/it]

training loss: 3.3838188648223877


training:  59%|█████▉    | 6471/10986 [2:30:31<1:49:21,  1.45s/it]

training loss: 3.3399765491485596


training:  59%|█████▉    | 6472/10986 [2:30:32<1:48:26,  1.44s/it]

training loss: 3.511608600616455


training:  59%|█████▉    | 6473/10986 [2:30:33<1:46:04,  1.41s/it]

training loss: 3.461294174194336


training:  59%|█████▉    | 6474/10986 [2:30:35<1:44:24,  1.39s/it]

training loss: 3.411591053009033


training:  59%|█████▉    | 6475/10986 [2:30:36<1:43:21,  1.37s/it]

training loss: 3.3952884674072266


training:  59%|█████▉    | 6476/10986 [2:30:37<1:42:26,  1.36s/it]

training loss: 3.3233957290649414


training:  59%|█████▉    | 6477/10986 [2:30:39<1:41:34,  1.35s/it]

training loss: 3.365945339202881


training:  59%|█████▉    | 6478/10986 [2:30:40<1:41:09,  1.35s/it]

training loss: 3.461787462234497


training:  59%|█████▉    | 6479/10986 [2:30:41<1:41:04,  1.35s/it]

training loss: 3.339864492416382


training:  59%|█████▉    | 6480/10986 [2:30:43<1:40:57,  1.34s/it]

training loss: 3.43815541267395
valid loss: 3.4360530376434326
perplexity: 31.06410789489746


training:  59%|█████▉    | 6481/10986 [2:30:46<2:15:26,  1.80s/it]

training loss: 3.4787631034851074


training:  59%|█████▉    | 6482/10986 [2:30:47<2:06:30,  1.69s/it]

training loss: 3.478877067565918


training:  59%|█████▉    | 6483/10986 [2:30:48<1:59:09,  1.59s/it]

training loss: 3.382528305053711


training:  59%|█████▉    | 6484/10986 [2:30:50<1:53:47,  1.52s/it]

training loss: 3.509772539138794


training:  59%|█████▉    | 6485/10986 [2:30:51<1:49:59,  1.47s/it]

training loss: 3.369802713394165


training:  59%|█████▉    | 6486/10986 [2:30:52<1:47:51,  1.44s/it]

training loss: 3.437988042831421


training:  59%|█████▉    | 6487/10986 [2:30:54<1:45:30,  1.41s/it]

training loss: 3.3996469974517822


training:  59%|█████▉    | 6488/10986 [2:30:55<1:44:02,  1.39s/it]

training loss: 3.627056837081909


training:  59%|█████▉    | 6489/10986 [2:30:56<1:42:55,  1.37s/it]

training loss: 3.5260117053985596


training:  59%|█████▉    | 6490/10986 [2:30:58<1:41:56,  1.36s/it]

training loss: 3.3881378173828125


training:  59%|█████▉    | 6491/10986 [2:30:59<1:48:44,  1.45s/it]

training loss: 3.3482022285461426


training:  59%|█████▉    | 6492/10986 [2:31:01<1:54:27,  1.53s/it]

training loss: 3.455394983291626


training:  59%|█████▉    | 6493/10986 [2:31:02<1:50:29,  1.48s/it]

training loss: 3.3577513694763184


training:  59%|█████▉    | 6494/10986 [2:31:04<1:47:01,  1.43s/it]

training loss: 3.4271326065063477


training:  59%|█████▉    | 6495/10986 [2:31:05<1:45:00,  1.40s/it]

training loss: 3.4201459884643555


training:  59%|█████▉    | 6496/10986 [2:31:06<1:42:49,  1.37s/it]

training loss: 3.4222631454467773


training:  59%|█████▉    | 6497/10986 [2:31:08<1:41:14,  1.35s/it]

training loss: 3.4859371185302734


training:  59%|█████▉    | 6498/10986 [2:31:09<1:40:41,  1.35s/it]

training loss: 3.441077470779419


training:  59%|█████▉    | 6499/10986 [2:31:10<1:40:49,  1.35s/it]

training loss: 3.482816219329834


training:  59%|█████▉    | 6500/10986 [2:31:12<1:40:58,  1.35s/it]

training loss: 3.296586036682129
valid loss: 3.291755199432373
perplexity: 26.8900203704834


training:  59%|█████▉    | 6501/10986 [2:31:15<2:14:07,  1.79s/it]

training loss: 3.3834357261657715


training:  59%|█████▉    | 6502/10986 [2:31:16<2:05:51,  1.68s/it]

training loss: 3.4301249980926514


training:  59%|█████▉    | 6503/10986 [2:31:17<1:57:56,  1.58s/it]

training loss: 3.4842441082000732


training:  59%|█████▉    | 6504/10986 [2:31:19<1:52:36,  1.51s/it]

training loss: 3.378154754638672


training:  59%|█████▉    | 6505/10986 [2:31:20<1:49:10,  1.46s/it]

training loss: 3.400486469268799


training:  59%|█████▉    | 6506/10986 [2:31:21<1:46:26,  1.43s/it]

training loss: 3.4821348190307617


training:  59%|█████▉    | 6507/10986 [2:31:23<1:43:57,  1.39s/it]

training loss: 3.415360689163208


training:  59%|█████▉    | 6508/10986 [2:31:24<1:42:20,  1.37s/it]

training loss: 3.3411166667938232


training:  59%|█████▉    | 6509/10986 [2:31:25<1:41:18,  1.36s/it]

training loss: 3.4355132579803467


training:  59%|█████▉    | 6510/10986 [2:31:27<1:40:37,  1.35s/it]

training loss: 3.344619035720825


training:  59%|█████▉    | 6511/10986 [2:31:28<1:46:59,  1.43s/it]

training loss: 3.4079020023345947


training:  59%|█████▉    | 6512/10986 [2:31:30<1:44:26,  1.40s/it]

training loss: 3.4099903106689453


training:  59%|█████▉    | 6513/10986 [2:31:31<1:43:05,  1.38s/it]

training loss: 3.470737934112549


training:  59%|█████▉    | 6514/10986 [2:31:32<1:42:34,  1.38s/it]

training loss: 3.4753355979919434


training:  59%|█████▉    | 6515/10986 [2:31:34<1:41:23,  1.36s/it]

training loss: 3.421603202819824


training:  59%|█████▉    | 6516/10986 [2:31:35<1:41:02,  1.36s/it]

training loss: 3.422001600265503


training:  59%|█████▉    | 6517/10986 [2:31:36<1:41:09,  1.36s/it]

training loss: 3.433772087097168


training:  59%|█████▉    | 6518/10986 [2:31:38<1:40:52,  1.35s/it]

training loss: 3.298321008682251


training:  59%|█████▉    | 6519/10986 [2:31:39<1:40:29,  1.35s/it]

training loss: 3.4009201526641846


training:  59%|█████▉    | 6520/10986 [2:31:40<1:40:22,  1.35s/it]

training loss: 3.379122257232666
valid loss: 3.372297525405884
perplexity: 29.14541244506836


training:  59%|█████▉    | 6521/10986 [2:31:43<2:15:24,  1.82s/it]

training loss: 3.3479788303375244


training:  59%|█████▉    | 6522/10986 [2:31:45<2:06:26,  1.70s/it]

training loss: 3.2888941764831543


training:  59%|█████▉    | 6523/10986 [2:31:46<1:58:14,  1.59s/it]

training loss: 3.421537160873413


training:  59%|█████▉    | 6524/10986 [2:31:47<1:52:09,  1.51s/it]

training loss: 3.4228146076202393


training:  59%|█████▉    | 6525/10986 [2:31:49<1:47:57,  1.45s/it]

training loss: 3.4123916625976562


training:  59%|█████▉    | 6526/10986 [2:31:50<1:44:58,  1.41s/it]

training loss: 3.4325990676879883


training:  59%|█████▉    | 6527/10986 [2:31:51<1:43:09,  1.39s/it]

training loss: 3.377234697341919


training:  59%|█████▉    | 6528/10986 [2:31:53<1:42:33,  1.38s/it]

training loss: 3.3391401767730713


training:  59%|█████▉    | 6529/10986 [2:31:54<1:41:53,  1.37s/it]

training loss: 3.491222858428955


training:  59%|█████▉    | 6530/10986 [2:31:55<1:40:46,  1.36s/it]

training loss: 3.3765676021575928


training:  59%|█████▉    | 6531/10986 [2:31:57<1:46:54,  1.44s/it]

training loss: 3.3232226371765137


training:  59%|█████▉    | 6532/10986 [2:31:58<1:46:05,  1.43s/it]

training loss: 3.344566583633423


training:  59%|█████▉    | 6533/10986 [2:32:00<1:43:42,  1.40s/it]

training loss: 3.369016408920288


training:  59%|█████▉    | 6534/10986 [2:32:01<1:42:52,  1.39s/it]

training loss: 3.4699652194976807


training:  59%|█████▉    | 6535/10986 [2:32:02<1:42:36,  1.38s/it]

training loss: 3.3595542907714844


training:  59%|█████▉    | 6536/10986 [2:32:04<1:41:33,  1.37s/it]

training loss: 3.3730950355529785


training:  60%|█████▉    | 6537/10986 [2:32:05<1:40:49,  1.36s/it]

training loss: 3.3966031074523926


training:  60%|█████▉    | 6538/10986 [2:32:06<1:40:14,  1.35s/it]

training loss: 3.5638210773468018


training:  60%|█████▉    | 6539/10986 [2:32:08<1:39:42,  1.35s/it]

training loss: 3.5015249252319336


training:  60%|█████▉    | 6540/10986 [2:32:09<1:39:31,  1.34s/it]

training loss: 3.3154735565185547
valid loss: 3.312354564666748
perplexity: 27.449682235717773


training:  60%|█████▉    | 6541/10986 [2:32:12<2:14:42,  1.82s/it]

training loss: 3.2857351303100586


training:  60%|█████▉    | 6542/10986 [2:32:14<2:06:10,  1.70s/it]

training loss: 3.389216899871826


training:  60%|█████▉    | 6543/10986 [2:32:15<1:57:47,  1.59s/it]

training loss: 3.504302740097046


training:  60%|█████▉    | 6544/10986 [2:32:16<1:51:42,  1.51s/it]

training loss: 3.343111753463745


training:  60%|█████▉    | 6545/10986 [2:32:17<1:47:43,  1.46s/it]

training loss: 3.4402997493743896


training:  60%|█████▉    | 6546/10986 [2:32:19<1:44:54,  1.42s/it]

training loss: 3.4579901695251465


training:  60%|█████▉    | 6547/10986 [2:32:20<1:42:55,  1.39s/it]

training loss: 3.364769697189331


training:  60%|█████▉    | 6548/10986 [2:32:21<1:41:34,  1.37s/it]

training loss: 3.398151397705078


training:  60%|█████▉    | 6549/10986 [2:32:23<1:40:28,  1.36s/it]

training loss: 3.4381604194641113


training:  60%|█████▉    | 6550/10986 [2:32:24<1:40:05,  1.35s/it]

training loss: 3.400296688079834


training:  60%|█████▉    | 6551/10986 [2:32:26<1:46:52,  1.45s/it]

training loss: 3.3487613201141357


training:  60%|█████▉    | 6552/10986 [2:32:27<1:45:32,  1.43s/it]

training loss: 3.4101204872131348


training:  60%|█████▉    | 6553/10986 [2:32:29<1:43:48,  1.41s/it]

training loss: 3.5037753582000732


training:  60%|█████▉    | 6554/10986 [2:32:30<1:42:04,  1.38s/it]

training loss: 3.4014267921447754


training:  60%|█████▉    | 6555/10986 [2:32:31<1:41:25,  1.37s/it]

training loss: 3.4324584007263184


training:  60%|█████▉    | 6556/10986 [2:32:33<1:40:33,  1.36s/it]

training loss: 3.3271732330322266


training:  60%|█████▉    | 6557/10986 [2:32:34<1:39:12,  1.34s/it]

training loss: 3.3729028701782227


training:  60%|█████▉    | 6558/10986 [2:32:35<1:39:30,  1.35s/it]

training loss: 3.3377933502197266


training:  60%|█████▉    | 6559/10986 [2:32:37<1:40:27,  1.36s/it]

training loss: 3.4676802158355713


training:  60%|█████▉    | 6560/10986 [2:32:38<1:48:16,  1.47s/it]

training loss: 3.4332404136657715
valid loss: 3.4356331825256348
perplexity: 31.05106544494629


training:  60%|█████▉    | 6561/10986 [2:32:42<2:28:18,  2.01s/it]

training loss: 3.4200499057769775


training:  60%|█████▉    | 6562/10986 [2:32:43<2:14:36,  1.83s/it]

training loss: 3.2938244342803955


training:  60%|█████▉    | 6563/10986 [2:32:44<2:04:14,  1.69s/it]

training loss: 3.4861204624176025


training:  60%|█████▉    | 6564/10986 [2:32:46<1:56:38,  1.58s/it]

training loss: 3.3532426357269287


training:  60%|█████▉    | 6565/10986 [2:32:47<1:51:05,  1.51s/it]

training loss: 3.443972587585449


training:  60%|█████▉    | 6566/10986 [2:32:48<1:46:40,  1.45s/it]

training loss: 3.436488151550293


training:  60%|█████▉    | 6567/10986 [2:32:50<1:44:15,  1.42s/it]

training loss: 3.3958287239074707


training:  60%|█████▉    | 6568/10986 [2:32:51<1:42:19,  1.39s/it]

training loss: 3.3592371940612793


training:  60%|█████▉    | 6569/10986 [2:32:52<1:40:46,  1.37s/it]

training loss: 3.384427309036255


training:  60%|█████▉    | 6570/10986 [2:32:54<1:39:46,  1.36s/it]

training loss: 3.4386439323425293


training:  60%|█████▉    | 6571/10986 [2:32:55<1:46:12,  1.44s/it]

training loss: 3.4026033878326416


training:  60%|█████▉    | 6572/10986 [2:32:57<1:51:00,  1.51s/it]

training loss: 3.45713472366333


training:  60%|█████▉    | 6573/10986 [2:32:58<1:46:53,  1.45s/it]

training loss: 3.466299057006836


training:  60%|█████▉    | 6574/10986 [2:33:00<1:44:13,  1.42s/it]

training loss: 3.4625489711761475


training:  60%|█████▉    | 6575/10986 [2:33:01<1:42:16,  1.39s/it]

training loss: 3.3766441345214844


training:  60%|█████▉    | 6576/10986 [2:33:02<1:41:56,  1.39s/it]

training loss: 3.445190668106079


training:  60%|█████▉    | 6577/10986 [2:33:04<1:40:20,  1.37s/it]

training loss: 3.299961805343628


training:  60%|█████▉    | 6578/10986 [2:33:05<1:39:27,  1.35s/it]

training loss: 3.4746603965759277


training:  60%|█████▉    | 6579/10986 [2:33:06<1:39:25,  1.35s/it]

training loss: 3.39494252204895


training:  60%|█████▉    | 6580/10986 [2:33:08<1:39:10,  1.35s/it]

training loss: 3.432673692703247
valid loss: 3.4334921836853027
perplexity: 30.98465919494629


training:  60%|█████▉    | 6581/10986 [2:33:11<2:12:45,  1.81s/it]

training loss: 3.486156702041626


training:  60%|█████▉    | 6582/10986 [2:33:12<2:10:42,  1.78s/it]

training loss: 3.4358742237091064


training:  60%|█████▉    | 6583/10986 [2:33:14<2:01:03,  1.65s/it]

training loss: 3.460667848587036


training:  60%|█████▉    | 6584/10986 [2:33:15<1:54:08,  1.56s/it]

training loss: 3.4042835235595703


training:  60%|█████▉    | 6585/10986 [2:33:16<1:48:58,  1.49s/it]

training loss: 3.3191611766815186


training:  60%|█████▉    | 6586/10986 [2:33:18<1:45:32,  1.44s/it]

training loss: 3.4523301124572754


training:  60%|█████▉    | 6587/10986 [2:33:19<1:42:58,  1.40s/it]

training loss: 3.473275899887085


training:  60%|█████▉    | 6588/10986 [2:33:20<1:41:36,  1.39s/it]

training loss: 3.375614643096924


training:  60%|█████▉    | 6589/10986 [2:33:22<1:40:29,  1.37s/it]

training loss: 3.343345880508423


training:  60%|█████▉    | 6590/10986 [2:33:23<1:39:51,  1.36s/it]

training loss: 3.3735971450805664


training:  60%|█████▉    | 6591/10986 [2:33:25<1:45:47,  1.44s/it]

training loss: 3.374443292617798


training:  60%|██████    | 6592/10986 [2:33:26<1:44:20,  1.42s/it]

training loss: 3.4117319583892822


training:  60%|██████    | 6593/10986 [2:33:27<1:42:54,  1.41s/it]

training loss: 3.4614665508270264


training:  60%|██████    | 6594/10986 [2:33:29<1:41:06,  1.38s/it]

training loss: 3.3927862644195557


training:  60%|██████    | 6595/10986 [2:33:30<1:40:01,  1.37s/it]

training loss: 3.4371018409729004


training:  60%|██████    | 6596/10986 [2:33:31<1:39:15,  1.36s/it]

training loss: 3.439490795135498


training:  60%|██████    | 6597/10986 [2:33:33<1:39:00,  1.35s/it]

training loss: 3.4129815101623535


training:  60%|██████    | 6598/10986 [2:33:34<1:38:51,  1.35s/it]

training loss: 3.378617286682129


training:  60%|██████    | 6599/10986 [2:33:35<1:38:25,  1.35s/it]

training loss: 3.3747527599334717


training:  60%|██████    | 6600/10986 [2:33:37<1:38:50,  1.35s/it]

training loss: 3.4492740631103516
valid loss: 3.4491991996765137
perplexity: 31.475175857543945


training:  60%|██████    | 6601/10986 [2:33:40<2:11:57,  1.81s/it]

training loss: 3.343522310256958


training:  60%|██████    | 6602/10986 [2:33:41<2:08:36,  1.76s/it]

training loss: 3.314627170562744


training:  60%|██████    | 6603/10986 [2:33:43<1:59:32,  1.64s/it]

training loss: 3.394787549972534


training:  60%|██████    | 6604/10986 [2:33:44<1:53:10,  1.55s/it]

training loss: 3.423738718032837


training:  60%|██████    | 6605/10986 [2:33:45<1:48:09,  1.48s/it]

training loss: 3.5339853763580322


training:  60%|██████    | 6606/10986 [2:33:47<1:44:48,  1.44s/it]

training loss: 3.399096727371216


training:  60%|██████    | 6607/10986 [2:33:48<1:42:14,  1.40s/it]

training loss: 3.344752311706543


training:  60%|██████    | 6608/10986 [2:33:49<1:40:40,  1.38s/it]

training loss: 3.414177656173706


training:  60%|██████    | 6609/10986 [2:33:51<1:39:35,  1.37s/it]

training loss: 3.446134090423584


training:  60%|██████    | 6610/10986 [2:33:52<1:38:33,  1.35s/it]

training loss: 3.416018486022949


training:  60%|██████    | 6611/10986 [2:33:54<1:44:40,  1.44s/it]

training loss: 3.3711817264556885


training:  60%|██████    | 6612/10986 [2:33:55<1:42:53,  1.41s/it]

training loss: 3.495260715484619


training:  60%|██████    | 6613/10986 [2:33:56<1:40:46,  1.38s/it]

training loss: 3.3955304622650146


training:  60%|██████    | 6614/10986 [2:33:58<1:39:26,  1.36s/it]

training loss: 3.4269416332244873


training:  60%|██████    | 6615/10986 [2:33:59<1:39:06,  1.36s/it]

training loss: 3.4560208320617676


training:  60%|██████    | 6616/10986 [2:34:00<1:38:17,  1.35s/it]

training loss: 3.417795181274414


training:  60%|██████    | 6617/10986 [2:34:01<1:37:38,  1.34s/it]

training loss: 3.3047420978546143


training:  60%|██████    | 6618/10986 [2:34:03<1:38:12,  1.35s/it]

training loss: 3.35166335105896


training:  60%|██████    | 6619/10986 [2:34:04<1:37:57,  1.35s/it]

training loss: 3.4112517833709717


training:  60%|██████    | 6620/10986 [2:34:06<1:37:27,  1.34s/it]

training loss: 3.5107028484344482
valid loss: 3.5054824352264404
perplexity: 33.29750442504883


training:  60%|██████    | 6621/10986 [2:34:08<2:09:55,  1.79s/it]

training loss: 3.4676425457000732


training:  60%|██████    | 6622/10986 [2:34:10<2:02:05,  1.68s/it]

training loss: 3.459946632385254


training:  60%|██████    | 6623/10986 [2:34:11<1:54:46,  1.58s/it]

training loss: 3.338353157043457


training:  60%|██████    | 6624/10986 [2:34:12<1:49:21,  1.50s/it]

training loss: 3.323467493057251


training:  60%|██████    | 6625/10986 [2:34:14<1:45:44,  1.45s/it]

training loss: 3.4455111026763916


training:  60%|██████    | 6626/10986 [2:34:15<1:43:08,  1.42s/it]

training loss: 3.380028486251831


training:  60%|██████    | 6627/10986 [2:34:16<1:41:18,  1.39s/it]

training loss: 3.376160144805908


training:  60%|██████    | 6628/10986 [2:34:18<1:40:13,  1.38s/it]

training loss: 3.284520387649536


training:  60%|██████    | 6629/10986 [2:34:19<1:39:28,  1.37s/it]

training loss: 3.4023921489715576


training:  60%|██████    | 6630/10986 [2:34:21<1:39:02,  1.36s/it]

training loss: 3.4169585704803467


training:  60%|██████    | 6631/10986 [2:34:22<1:44:55,  1.45s/it]

training loss: 3.4171934127807617


training:  60%|██████    | 6632/10986 [2:34:23<1:42:47,  1.42s/it]

training loss: 3.400561571121216


training:  60%|██████    | 6633/10986 [2:34:25<1:40:55,  1.39s/it]

training loss: 3.324575185775757


training:  60%|██████    | 6634/10986 [2:34:26<1:39:11,  1.37s/it]

training loss: 3.4592084884643555


training:  60%|██████    | 6635/10986 [2:34:27<1:38:27,  1.36s/it]

training loss: 3.4194040298461914


training:  60%|██████    | 6636/10986 [2:34:29<1:37:54,  1.35s/it]

training loss: 3.400733709335327


training:  60%|██████    | 6637/10986 [2:34:30<1:37:44,  1.35s/it]

training loss: 3.358463764190674


training:  60%|██████    | 6638/10986 [2:34:31<1:37:37,  1.35s/it]

training loss: 3.4179022312164307


training:  60%|██████    | 6639/10986 [2:34:33<1:37:23,  1.34s/it]

training loss: 3.388068437576294


training:  60%|██████    | 6640/10986 [2:34:34<1:38:22,  1.36s/it]

training loss: 3.3711979389190674
valid loss: 3.36751651763916
perplexity: 29.00640106201172


training:  60%|██████    | 6641/10986 [2:34:37<2:10:23,  1.80s/it]

training loss: 3.3427352905273438


training:  60%|██████    | 6642/10986 [2:34:39<2:02:42,  1.69s/it]

training loss: 3.346806287765503


training:  60%|██████    | 6643/10986 [2:34:40<1:55:06,  1.59s/it]

training loss: 3.4998104572296143


training:  60%|██████    | 6644/10986 [2:34:41<1:49:41,  1.52s/it]

training loss: 3.4627790451049805


training:  60%|██████    | 6645/10986 [2:34:43<1:45:48,  1.46s/it]

training loss: 3.4811503887176514


training:  60%|██████    | 6646/10986 [2:34:44<1:43:00,  1.42s/it]

training loss: 3.421177387237549


training:  61%|██████    | 6647/10986 [2:34:45<1:41:18,  1.40s/it]

training loss: 3.419484853744507


training:  61%|██████    | 6648/10986 [2:34:47<1:39:49,  1.38s/it]

training loss: 3.3957862854003906


training:  61%|██████    | 6649/10986 [2:34:48<1:38:47,  1.37s/it]

training loss: 3.4357261657714844


training:  61%|██████    | 6650/10986 [2:34:49<1:38:22,  1.36s/it]

training loss: 3.3699183464050293


training:  61%|██████    | 6651/10986 [2:34:51<1:43:57,  1.44s/it]

training loss: 3.3420517444610596


training:  61%|██████    | 6652/10986 [2:34:53<1:48:49,  1.51s/it]

training loss: 3.3823959827423096


training:  61%|██████    | 6653/10986 [2:34:54<1:45:08,  1.46s/it]

training loss: 3.353769540786743


training:  61%|██████    | 6654/10986 [2:34:55<1:42:45,  1.42s/it]

training loss: 3.3297533988952637


training:  61%|██████    | 6655/10986 [2:34:57<1:40:50,  1.40s/it]

training loss: 3.5881032943725586


training:  61%|██████    | 6656/10986 [2:34:58<1:39:32,  1.38s/it]

training loss: 3.372453451156616


training:  61%|██████    | 6657/10986 [2:34:59<1:38:33,  1.37s/it]

training loss: 3.4083170890808105


training:  61%|██████    | 6658/10986 [2:35:01<1:37:55,  1.36s/it]

training loss: 3.4399240016937256


training:  61%|██████    | 6659/10986 [2:35:02<1:37:38,  1.35s/it]

training loss: 3.386305570602417


training:  61%|██████    | 6660/10986 [2:35:03<1:37:28,  1.35s/it]

training loss: 3.395380973815918
valid loss: 3.388421058654785
perplexity: 29.619150161743164


training:  61%|██████    | 6661/10986 [2:35:06<2:10:53,  1.82s/it]

training loss: 3.5147581100463867


training:  61%|██████    | 6662/10986 [2:35:08<2:02:23,  1.70s/it]

training loss: 3.4346835613250732


training:  61%|██████    | 6663/10986 [2:35:09<1:54:24,  1.59s/it]

training loss: 3.5160059928894043


training:  61%|██████    | 6664/10986 [2:35:10<1:48:37,  1.51s/it]

training loss: 3.433898687362671


training:  61%|██████    | 6665/10986 [2:35:12<1:45:32,  1.47s/it]

training loss: 3.3326528072357178


training:  61%|██████    | 6666/10986 [2:35:13<1:42:46,  1.43s/it]

training loss: 3.3097991943359375


training:  61%|██████    | 6667/10986 [2:35:14<1:40:35,  1.40s/it]

training loss: 3.3786187171936035


training:  61%|██████    | 6668/10986 [2:35:16<1:39:24,  1.38s/it]

training loss: 3.5224337577819824


training:  61%|██████    | 6669/10986 [2:35:17<1:38:09,  1.36s/it]

training loss: 3.4595718383789062


training:  61%|██████    | 6670/10986 [2:35:18<1:37:16,  1.35s/it]

training loss: 3.475412368774414


training:  61%|██████    | 6671/10986 [2:35:20<1:43:02,  1.43s/it]

training loss: 3.466040849685669


training:  61%|██████    | 6672/10986 [2:35:21<1:42:36,  1.43s/it]

training loss: 3.2991340160369873


training:  61%|██████    | 6673/10986 [2:35:23<1:40:13,  1.39s/it]

training loss: 3.3656303882598877


training:  61%|██████    | 6674/10986 [2:35:24<1:38:28,  1.37s/it]

training loss: 3.277844190597534


training:  61%|██████    | 6675/10986 [2:35:25<1:37:33,  1.36s/it]

training loss: 3.3149783611297607


training:  61%|██████    | 6676/10986 [2:35:27<1:36:52,  1.35s/it]

training loss: 3.3364999294281006


training:  61%|██████    | 6677/10986 [2:35:28<1:36:46,  1.35s/it]

training loss: 3.385915517807007


training:  61%|██████    | 6678/10986 [2:35:29<1:36:23,  1.34s/it]

training loss: 3.391629695892334


training:  61%|██████    | 6679/10986 [2:35:31<1:36:21,  1.34s/it]

training loss: 3.40462327003479


training:  61%|██████    | 6680/10986 [2:35:32<1:36:05,  1.34s/it]

training loss: 3.4084789752960205
valid loss: 3.4028372764587402
perplexity: 30.049238204956055


training:  61%|██████    | 6681/10986 [2:35:35<2:09:41,  1.81s/it]

training loss: 3.36588454246521


training:  61%|██████    | 6682/10986 [2:35:36<2:01:29,  1.69s/it]

training loss: 3.416825771331787


training:  61%|██████    | 6683/10986 [2:35:38<1:53:40,  1.59s/it]

training loss: 3.4766898155212402


training:  61%|██████    | 6684/10986 [2:35:39<1:47:58,  1.51s/it]

training loss: 3.3580026626586914


training:  61%|██████    | 6685/10986 [2:35:40<1:44:58,  1.46s/it]

training loss: 3.345550537109375


training:  61%|██████    | 6686/10986 [2:35:42<1:41:52,  1.42s/it]

training loss: 3.3157942295074463


training:  61%|██████    | 6687/10986 [2:35:43<1:39:56,  1.39s/it]

training loss: 3.444355010986328


training:  61%|██████    | 6688/10986 [2:35:44<1:38:46,  1.38s/it]

training loss: 3.4081497192382812


training:  61%|██████    | 6689/10986 [2:35:46<1:37:37,  1.36s/it]

training loss: 3.4345617294311523


training:  61%|██████    | 6690/10986 [2:35:47<1:36:57,  1.35s/it]

training loss: 3.4218459129333496


training:  61%|██████    | 6691/10986 [2:35:49<1:42:45,  1.44s/it]

training loss: 3.4010097980499268


training:  61%|██████    | 6692/10986 [2:35:50<1:40:52,  1.41s/it]

training loss: 3.3761816024780273


training:  61%|██████    | 6693/10986 [2:35:51<1:39:12,  1.39s/it]

training loss: 3.3841805458068848


training:  61%|██████    | 6694/10986 [2:35:53<1:38:26,  1.38s/it]

training loss: 3.4155080318450928


training:  61%|██████    | 6695/10986 [2:35:54<1:38:00,  1.37s/it]

training loss: 3.4627928733825684


training:  61%|██████    | 6696/10986 [2:35:55<1:37:35,  1.36s/it]

training loss: 3.4342222213745117


training:  61%|██████    | 6697/10986 [2:35:57<1:36:56,  1.36s/it]

training loss: 3.3621838092803955


training:  61%|██████    | 6698/10986 [2:35:58<1:36:34,  1.35s/it]

training loss: 3.3174190521240234


training:  61%|██████    | 6699/10986 [2:35:59<1:35:52,  1.34s/it]

training loss: 3.400860071182251


training:  61%|██████    | 6700/10986 [2:36:01<1:35:52,  1.34s/it]

training loss: 3.3734447956085205
valid loss: 3.3723161220550537
perplexity: 29.14595603942871


training:  61%|██████    | 6701/10986 [2:36:04<2:08:51,  1.80s/it]

training loss: 3.314148426055908


training:  61%|██████    | 6702/10986 [2:36:05<2:01:25,  1.70s/it]

training loss: 3.4511191844940186


training:  61%|██████    | 6703/10986 [2:36:06<1:54:01,  1.60s/it]

training loss: 3.421262264251709


training:  61%|██████    | 6704/10986 [2:36:08<1:48:14,  1.52s/it]

training loss: 3.3475725650787354


training:  61%|██████    | 6705/10986 [2:36:09<1:44:46,  1.47s/it]

training loss: 3.420217514038086


training:  61%|██████    | 6706/10986 [2:36:10<1:41:47,  1.43s/it]

training loss: 3.587653636932373


training:  61%|██████    | 6707/10986 [2:36:12<1:40:05,  1.40s/it]

training loss: 3.48078989982605


training:  61%|██████    | 6708/10986 [2:36:13<1:38:50,  1.39s/it]

training loss: 3.4333977699279785


training:  61%|██████    | 6709/10986 [2:36:15<1:46:35,  1.50s/it]

training loss: 3.4152443408966064


training:  61%|██████    | 6710/10986 [2:36:17<1:52:18,  1.58s/it]

training loss: 3.5711607933044434


training:  61%|██████    | 6711/10986 [2:36:18<1:56:48,  1.64s/it]

training loss: 3.3276736736297607


training:  61%|██████    | 6712/10986 [2:36:20<1:50:39,  1.55s/it]

training loss: 3.348555564880371


training:  61%|██████    | 6713/10986 [2:36:21<1:46:12,  1.49s/it]

training loss: 3.5012664794921875


training:  61%|██████    | 6714/10986 [2:36:22<1:42:45,  1.44s/it]

training loss: 3.395753860473633


training:  61%|██████    | 6715/10986 [2:36:24<1:39:55,  1.40s/it]

training loss: 3.386289119720459


training:  61%|██████    | 6716/10986 [2:36:25<1:38:21,  1.38s/it]

training loss: 3.47711181640625


training:  61%|██████    | 6717/10986 [2:36:26<1:37:32,  1.37s/it]

training loss: 3.334460735321045


training:  61%|██████    | 6718/10986 [2:36:28<1:36:25,  1.36s/it]

training loss: 3.4400060176849365


training:  61%|██████    | 6719/10986 [2:36:29<1:36:06,  1.35s/it]

training loss: 3.350245952606201


training:  61%|██████    | 6720/10986 [2:36:30<1:35:40,  1.35s/it]

training loss: 3.447291374206543
valid loss: 3.44558048248291
perplexity: 31.361482620239258


training:  61%|██████    | 6721/10986 [2:36:33<2:08:18,  1.81s/it]

training loss: 3.465778112411499


training:  61%|██████    | 6722/10986 [2:36:35<1:59:41,  1.68s/it]

training loss: 3.39428448677063


training:  61%|██████    | 6723/10986 [2:36:36<1:52:18,  1.58s/it]

training loss: 3.4694674015045166


training:  61%|██████    | 6724/10986 [2:36:37<1:46:58,  1.51s/it]

training loss: 3.4677734375


training:  61%|██████    | 6725/10986 [2:36:39<1:43:19,  1.45s/it]

training loss: 3.56522798538208


training:  61%|██████    | 6726/10986 [2:36:40<1:41:01,  1.42s/it]

training loss: 3.3466386795043945


training:  61%|██████    | 6727/10986 [2:36:41<1:39:11,  1.40s/it]

training loss: 3.415484666824341


training:  61%|██████    | 6728/10986 [2:36:43<1:38:36,  1.39s/it]

training loss: 3.4752097129821777


training:  61%|██████▏   | 6729/10986 [2:36:44<1:37:48,  1.38s/it]

training loss: 3.4909727573394775


training:  61%|██████▏   | 6730/10986 [2:36:45<1:36:47,  1.36s/it]

training loss: 3.260021448135376


training:  61%|██████▏   | 6731/10986 [2:36:47<1:42:51,  1.45s/it]

training loss: 3.497084140777588


training:  61%|██████▏   | 6732/10986 [2:36:48<1:40:50,  1.42s/it]

training loss: 3.355560779571533


training:  61%|██████▏   | 6733/10986 [2:36:50<1:38:27,  1.39s/it]

training loss: 3.4328746795654297


training:  61%|██████▏   | 6734/10986 [2:36:51<1:37:27,  1.38s/it]

training loss: 3.400811195373535


training:  61%|██████▏   | 6735/10986 [2:36:52<1:36:37,  1.36s/it]

training loss: 3.398916244506836


training:  61%|██████▏   | 6736/10986 [2:36:54<1:35:41,  1.35s/it]

training loss: 3.3879568576812744


training:  61%|██████▏   | 6737/10986 [2:36:55<1:35:03,  1.34s/it]

training loss: 3.5691847801208496


training:  61%|██████▏   | 6738/10986 [2:36:56<1:34:56,  1.34s/it]

training loss: 3.3938026428222656


training:  61%|██████▏   | 6739/10986 [2:36:58<1:34:40,  1.34s/it]

training loss: 3.304807662963867


training:  61%|██████▏   | 6740/10986 [2:36:59<1:34:38,  1.34s/it]

training loss: 3.4012269973754883
valid loss: 3.3976902961730957
perplexity: 29.89497184753418


training:  61%|██████▏   | 6741/10986 [2:37:02<2:06:12,  1.78s/it]

training loss: 3.4590704441070557


training:  61%|██████▏   | 6742/10986 [2:37:03<1:58:14,  1.67s/it]

training loss: 3.483206033706665


training:  61%|██████▏   | 6743/10986 [2:37:05<1:51:31,  1.58s/it]

training loss: 3.3964900970458984


training:  61%|██████▏   | 6744/10986 [2:37:06<1:46:14,  1.50s/it]

training loss: 3.4038684368133545


training:  61%|██████▏   | 6745/10986 [2:37:07<1:42:54,  1.46s/it]

training loss: 3.372356414794922


training:  61%|██████▏   | 6746/10986 [2:37:09<1:40:41,  1.42s/it]

training loss: 3.462552070617676


training:  61%|██████▏   | 6747/10986 [2:37:10<1:39:15,  1.40s/it]

training loss: 3.45173978805542


training:  61%|██████▏   | 6748/10986 [2:37:11<1:37:18,  1.38s/it]

training loss: 3.335904836654663


training:  61%|██████▏   | 6749/10986 [2:37:13<1:36:12,  1.36s/it]

training loss: 3.394101858139038


training:  61%|██████▏   | 6750/10986 [2:37:14<1:35:17,  1.35s/it]

training loss: 3.320319890975952


training:  61%|██████▏   | 6751/10986 [2:37:16<1:41:06,  1.43s/it]

training loss: 3.3934695720672607


training:  61%|██████▏   | 6752/10986 [2:37:17<1:41:37,  1.44s/it]

training loss: 3.441838026046753


training:  61%|██████▏   | 6753/10986 [2:37:18<1:39:22,  1.41s/it]

training loss: 3.3580410480499268


training:  61%|██████▏   | 6754/10986 [2:37:20<1:37:48,  1.39s/it]

training loss: 3.417576551437378


training:  61%|██████▏   | 6755/10986 [2:37:21<1:36:22,  1.37s/it]

training loss: 3.5125153064727783


training:  61%|██████▏   | 6756/10986 [2:37:22<1:35:42,  1.36s/it]

training loss: 3.4409449100494385


training:  62%|██████▏   | 6757/10986 [2:37:24<1:35:23,  1.35s/it]

training loss: 3.439396619796753


training:  62%|██████▏   | 6758/10986 [2:37:25<1:34:51,  1.35s/it]

training loss: 3.434636116027832


training:  62%|██████▏   | 6759/10986 [2:37:26<1:34:07,  1.34s/it]

training loss: 3.4851412773132324


training:  62%|██████▏   | 6760/10986 [2:37:28<1:34:04,  1.34s/it]

training loss: 3.3498477935791016
valid loss: 3.3437275886535645
perplexity: 28.32451057434082


training:  62%|██████▏   | 6761/10986 [2:37:31<2:05:49,  1.79s/it]

training loss: 3.3228533267974854


training:  62%|██████▏   | 6762/10986 [2:37:32<2:03:13,  1.75s/it]

training loss: 3.4117212295532227


training:  62%|██████▏   | 6763/10986 [2:37:34<1:54:34,  1.63s/it]

training loss: 3.3934366703033447


training:  62%|██████▏   | 6764/10986 [2:37:35<1:47:59,  1.53s/it]

training loss: 3.3698301315307617


training:  62%|██████▏   | 6765/10986 [2:37:36<1:43:33,  1.47s/it]

training loss: 3.411062240600586


training:  62%|██████▏   | 6766/10986 [2:37:37<1:40:17,  1.43s/it]

training loss: 3.4499967098236084


training:  62%|██████▏   | 6767/10986 [2:37:39<1:39:15,  1.41s/it]

training loss: 3.370725631713867


training:  62%|██████▏   | 6768/10986 [2:37:40<1:37:44,  1.39s/it]

training loss: 3.410656690597534


training:  62%|██████▏   | 6769/10986 [2:37:42<1:36:47,  1.38s/it]

training loss: 3.4582810401916504


training:  62%|██████▏   | 6770/10986 [2:37:43<1:36:39,  1.38s/it]

training loss: 3.367769241333008


training:  62%|██████▏   | 6771/10986 [2:37:45<1:42:13,  1.46s/it]

training loss: 3.3466012477874756


training:  62%|██████▏   | 6772/10986 [2:37:46<1:47:03,  1.52s/it]

training loss: 3.3459627628326416


training:  62%|██████▏   | 6773/10986 [2:37:48<1:43:08,  1.47s/it]

training loss: 3.3286008834838867


training:  62%|██████▏   | 6774/10986 [2:37:49<1:40:20,  1.43s/it]

training loss: 3.386930465698242


training:  62%|██████▏   | 6775/10986 [2:37:50<1:38:09,  1.40s/it]

training loss: 3.309939384460449


training:  62%|██████▏   | 6776/10986 [2:37:52<1:36:39,  1.38s/it]

training loss: 3.364931583404541


training:  62%|██████▏   | 6777/10986 [2:37:53<1:35:39,  1.36s/it]

training loss: 3.5297226905822754


training:  62%|██████▏   | 6778/10986 [2:37:54<1:35:02,  1.36s/it]

training loss: 3.348231792449951


training:  62%|██████▏   | 6779/10986 [2:37:56<1:34:22,  1.35s/it]

training loss: 3.397193670272827


training:  62%|██████▏   | 6780/10986 [2:37:57<1:33:56,  1.34s/it]

training loss: 3.355400323867798
valid loss: 3.3549211025238037
perplexity: 28.643342971801758


training:  62%|██████▏   | 6781/10986 [2:38:00<2:05:43,  1.79s/it]

training loss: 3.258211851119995


training:  62%|██████▏   | 6782/10986 [2:38:01<2:02:33,  1.75s/it]

training loss: 3.341576099395752


training:  62%|██████▏   | 6783/10986 [2:38:03<1:54:02,  1.63s/it]

training loss: 3.393984794616699


training:  62%|██████▏   | 6784/10986 [2:38:04<1:47:52,  1.54s/it]

training loss: 3.443493127822876


training:  62%|██████▏   | 6785/10986 [2:38:05<1:43:47,  1.48s/it]

training loss: 3.3640646934509277


training:  62%|██████▏   | 6786/10986 [2:38:07<1:40:32,  1.44s/it]

training loss: 3.3334546089172363


training:  62%|██████▏   | 6787/10986 [2:38:08<1:38:36,  1.41s/it]

training loss: 3.5126423835754395


training:  62%|██████▏   | 6788/10986 [2:38:09<1:37:03,  1.39s/it]

training loss: 3.4622530937194824


training:  62%|██████▏   | 6789/10986 [2:38:11<1:35:56,  1.37s/it]

training loss: 3.527036428451538


training:  62%|██████▏   | 6790/10986 [2:38:12<1:35:07,  1.36s/it]

training loss: 3.4689791202545166


training:  62%|██████▏   | 6791/10986 [2:38:14<1:41:26,  1.45s/it]

training loss: 3.3978188037872314


training:  62%|██████▏   | 6792/10986 [2:38:15<1:45:26,  1.51s/it]

training loss: 3.3071088790893555


training:  62%|██████▏   | 6793/10986 [2:38:17<1:41:24,  1.45s/it]

training loss: 3.3780386447906494


training:  62%|██████▏   | 6794/10986 [2:38:18<1:38:41,  1.41s/it]

training loss: 3.3299496173858643


training:  62%|██████▏   | 6795/10986 [2:38:19<1:36:58,  1.39s/it]

training loss: 3.4488086700439453


training:  62%|██████▏   | 6796/10986 [2:38:21<1:35:53,  1.37s/it]

training loss: 3.4463131427764893


training:  62%|██████▏   | 6797/10986 [2:38:22<1:35:02,  1.36s/it]

training loss: 3.3556222915649414


training:  62%|██████▏   | 6798/10986 [2:38:23<1:34:30,  1.35s/it]

training loss: 3.374809503555298


training:  62%|██████▏   | 6799/10986 [2:38:25<1:33:40,  1.34s/it]

training loss: 3.4769582748413086


training:  62%|██████▏   | 6800/10986 [2:38:26<1:33:27,  1.34s/it]

training loss: 3.476381778717041
valid loss: 3.474351167678833
perplexity: 32.276878356933594


training:  62%|██████▏   | 6801/10986 [2:38:29<2:04:35,  1.79s/it]

training loss: 3.3587827682495117


training:  62%|██████▏   | 6802/10986 [2:38:30<1:56:59,  1.68s/it]

training loss: 3.370426893234253


training:  62%|██████▏   | 6803/10986 [2:38:32<1:49:45,  1.57s/it]

training loss: 3.3676657676696777


training:  62%|██████▏   | 6804/10986 [2:38:33<1:44:22,  1.50s/it]

training loss: 3.415393352508545


training:  62%|██████▏   | 6805/10986 [2:38:34<1:41:15,  1.45s/it]

training loss: 3.331287145614624


training:  62%|██████▏   | 6806/10986 [2:38:36<1:38:19,  1.41s/it]

training loss: 3.291563034057617


training:  62%|██████▏   | 6807/10986 [2:38:37<1:36:44,  1.39s/it]

training loss: 3.434126138687134


training:  62%|██████▏   | 6808/10986 [2:38:38<1:35:21,  1.37s/it]

training loss: 3.4011917114257812


training:  62%|██████▏   | 6809/10986 [2:38:40<1:34:08,  1.35s/it]

training loss: 3.327517032623291


training:  62%|██████▏   | 6810/10986 [2:38:41<1:33:37,  1.35s/it]

training loss: 3.3691742420196533


training:  62%|██████▏   | 6811/10986 [2:38:43<1:39:49,  1.43s/it]

training loss: 3.3227248191833496


training:  62%|██████▏   | 6812/10986 [2:38:44<1:38:11,  1.41s/it]

training loss: 3.4399056434631348


training:  62%|██████▏   | 6813/10986 [2:38:45<1:36:10,  1.38s/it]

training loss: 3.4418587684631348


training:  62%|██████▏   | 6814/10986 [2:38:47<1:34:50,  1.36s/it]

training loss: 3.501232862472534


training:  62%|██████▏   | 6815/10986 [2:38:48<1:33:46,  1.35s/it]

training loss: 3.4476969242095947


training:  62%|██████▏   | 6816/10986 [2:38:49<1:33:05,  1.34s/it]

training loss: 3.3926100730895996


training:  62%|██████▏   | 6817/10986 [2:38:51<1:32:55,  1.34s/it]

training loss: 3.377454996109009


training:  62%|██████▏   | 6818/10986 [2:38:52<1:32:35,  1.33s/it]

training loss: 3.389012575149536


training:  62%|██████▏   | 6819/10986 [2:38:53<1:32:15,  1.33s/it]

training loss: 3.389260768890381


training:  62%|██████▏   | 6820/10986 [2:38:54<1:32:24,  1.33s/it]

training loss: 3.3107831478118896
valid loss: 3.308588981628418
perplexity: 27.346511840820312


training:  62%|██████▏   | 6821/10986 [2:38:57<2:03:45,  1.78s/it]

training loss: 3.3905129432678223


training:  62%|██████▏   | 6822/10986 [2:38:59<1:56:01,  1.67s/it]

training loss: 3.380401134490967


training:  62%|██████▏   | 6823/10986 [2:39:00<1:48:52,  1.57s/it]

training loss: 3.370802879333496


training:  62%|██████▏   | 6824/10986 [2:39:01<1:43:30,  1.49s/it]

training loss: 3.465975284576416


training:  62%|██████▏   | 6825/10986 [2:39:03<1:39:38,  1.44s/it]

training loss: 3.4147651195526123


training:  62%|██████▏   | 6826/10986 [2:39:04<1:37:06,  1.40s/it]

training loss: 3.4144973754882812


training:  62%|██████▏   | 6827/10986 [2:39:05<1:35:31,  1.38s/it]

training loss: 3.4396910667419434


training:  62%|██████▏   | 6828/10986 [2:39:07<1:34:16,  1.36s/it]

training loss: 3.520972490310669


training:  62%|██████▏   | 6829/10986 [2:39:08<1:33:27,  1.35s/it]

training loss: 3.459373712539673


training:  62%|██████▏   | 6830/10986 [2:39:09<1:33:30,  1.35s/it]

training loss: 3.4232141971588135


training:  62%|██████▏   | 6831/10986 [2:39:11<1:40:01,  1.44s/it]

training loss: 3.412888765335083


training:  62%|██████▏   | 6832/10986 [2:39:12<1:38:26,  1.42s/it]

training loss: 3.390630006790161


training:  62%|██████▏   | 6833/10986 [2:39:14<1:36:41,  1.40s/it]

training loss: 3.456146478652954


training:  62%|██████▏   | 6834/10986 [2:39:15<1:34:46,  1.37s/it]

training loss: 3.426708698272705


training:  62%|██████▏   | 6835/10986 [2:39:16<1:33:55,  1.36s/it]

training loss: 3.4160947799682617


training:  62%|██████▏   | 6836/10986 [2:39:18<1:33:21,  1.35s/it]

training loss: 3.4025790691375732


training:  62%|██████▏   | 6837/10986 [2:39:19<1:32:48,  1.34s/it]

training loss: 3.435096502304077


training:  62%|██████▏   | 6838/10986 [2:39:20<1:32:24,  1.34s/it]

training loss: 3.3817408084869385


training:  62%|██████▏   | 6839/10986 [2:39:22<1:31:51,  1.33s/it]

training loss: 3.4041588306427


training:  62%|██████▏   | 6840/10986 [2:39:23<1:31:18,  1.32s/it]

training loss: 3.366752862930298
valid loss: 3.3644371032714844
perplexity: 28.91721534729004


training:  62%|██████▏   | 6841/10986 [2:39:26<2:01:53,  1.76s/it]

training loss: 3.379927396774292


training:  62%|██████▏   | 6842/10986 [2:39:27<1:54:37,  1.66s/it]

training loss: 3.3544907569885254


training:  62%|██████▏   | 6843/10986 [2:39:28<1:47:30,  1.56s/it]

training loss: 3.4020020961761475


training:  62%|██████▏   | 6844/10986 [2:39:30<1:42:17,  1.48s/it]

training loss: 3.4934282302856445


training:  62%|██████▏   | 6845/10986 [2:39:31<1:38:51,  1.43s/it]

training loss: 3.321598768234253


training:  62%|██████▏   | 6846/10986 [2:39:32<1:36:23,  1.40s/it]

training loss: 3.361502170562744


training:  62%|██████▏   | 6847/10986 [2:39:34<1:34:18,  1.37s/it]

training loss: 3.392038345336914


training:  62%|██████▏   | 6848/10986 [2:39:35<1:33:34,  1.36s/it]

training loss: 3.4169342517852783


training:  62%|██████▏   | 6849/10986 [2:39:36<1:32:30,  1.34s/it]

training loss: 3.3855319023132324


training:  62%|██████▏   | 6850/10986 [2:39:38<1:31:49,  1.33s/it]

training loss: 3.320361852645874


training:  62%|██████▏   | 6851/10986 [2:39:39<1:37:52,  1.42s/it]

training loss: 3.393531084060669


training:  62%|██████▏   | 6852/10986 [2:39:41<1:36:33,  1.40s/it]

training loss: 3.4728474617004395


training:  62%|██████▏   | 6853/10986 [2:39:42<1:34:40,  1.37s/it]

training loss: 3.403658628463745


training:  62%|██████▏   | 6854/10986 [2:39:43<1:33:13,  1.35s/it]

training loss: 3.354043960571289


training:  62%|██████▏   | 6855/10986 [2:39:45<1:32:34,  1.34s/it]

training loss: 3.386004686355591


training:  62%|██████▏   | 6856/10986 [2:39:46<1:31:42,  1.33s/it]

training loss: 3.4664905071258545


training:  62%|██████▏   | 6857/10986 [2:39:47<1:31:23,  1.33s/it]

training loss: 3.383699417114258


training:  62%|██████▏   | 6858/10986 [2:39:49<1:31:27,  1.33s/it]

training loss: 3.3682174682617188


training:  62%|██████▏   | 6859/10986 [2:39:50<1:38:22,  1.43s/it]

training loss: 3.3567020893096924


training:  62%|██████▏   | 6860/10986 [2:39:52<1:44:31,  1.52s/it]

training loss: 3.3527474403381348
valid loss: 3.3514223098754883
perplexity: 28.543302536010742


training:  62%|██████▏   | 6861/10986 [2:39:55<2:15:26,  1.97s/it]

training loss: 3.4430863857269287


training:  62%|██████▏   | 6862/10986 [2:39:56<2:03:26,  1.80s/it]

training loss: 3.494323253631592


training:  62%|██████▏   | 6863/10986 [2:39:58<1:53:19,  1.65s/it]

training loss: 3.3777987957000732


training:  62%|██████▏   | 6864/10986 [2:39:59<1:46:51,  1.56s/it]

training loss: 3.367941379547119


training:  62%|██████▏   | 6865/10986 [2:40:00<1:41:44,  1.48s/it]

training loss: 3.493098258972168


training:  62%|██████▏   | 6866/10986 [2:40:02<1:37:47,  1.42s/it]

training loss: 3.3779361248016357


training:  63%|██████▎   | 6867/10986 [2:40:03<1:35:37,  1.39s/it]

training loss: 3.4699044227600098


training:  63%|██████▎   | 6868/10986 [2:40:04<1:33:46,  1.37s/it]

training loss: 3.4419054985046387


training:  63%|██████▎   | 6869/10986 [2:40:06<1:32:39,  1.35s/it]

training loss: 3.3165035247802734


training:  63%|██████▎   | 6870/10986 [2:40:07<1:32:58,  1.36s/it]

training loss: 3.432349920272827


training:  63%|██████▎   | 6871/10986 [2:40:09<1:38:52,  1.44s/it]

training loss: 3.3517374992370605


training:  63%|██████▎   | 6872/10986 [2:40:10<1:44:25,  1.52s/it]

training loss: 3.3617501258850098


training:  63%|██████▎   | 6873/10986 [2:40:12<1:40:33,  1.47s/it]

training loss: 3.518056869506836


training:  63%|██████▎   | 6874/10986 [2:40:13<1:37:38,  1.42s/it]

training loss: 3.3395462036132812


training:  63%|██████▎   | 6875/10986 [2:40:14<1:36:24,  1.41s/it]

training loss: 3.4241695404052734


training:  63%|██████▎   | 6876/10986 [2:40:16<1:34:50,  1.38s/it]

training loss: 3.409123420715332


training:  63%|██████▎   | 6877/10986 [2:40:17<1:34:24,  1.38s/it]

training loss: 3.319190502166748


training:  63%|██████▎   | 6878/10986 [2:40:18<1:34:24,  1.38s/it]

training loss: 3.429642915725708


training:  63%|██████▎   | 6879/10986 [2:40:20<1:33:53,  1.37s/it]

training loss: 3.4029786586761475


training:  63%|██████▎   | 6880/10986 [2:40:21<1:33:26,  1.37s/it]

training loss: 3.448364496231079
valid loss: 3.4475817680358887
perplexity: 31.42430877685547


training:  63%|██████▎   | 6881/10986 [2:40:24<2:04:47,  1.82s/it]

training loss: 3.4705967903137207


training:  63%|██████▎   | 6882/10986 [2:40:25<1:56:44,  1.71s/it]

training loss: 3.4085395336151123


training:  63%|██████▎   | 6883/10986 [2:40:27<1:49:16,  1.60s/it]

training loss: 3.528010845184326


training:  63%|██████▎   | 6884/10986 [2:40:28<1:43:32,  1.51s/it]

training loss: 3.448608636856079


training:  63%|██████▎   | 6885/10986 [2:40:29<1:39:55,  1.46s/it]

training loss: 3.5093960762023926


training:  63%|██████▎   | 6886/10986 [2:40:31<1:37:24,  1.43s/it]

training loss: 3.3577091693878174


training:  63%|██████▎   | 6887/10986 [2:40:32<1:35:40,  1.40s/it]

training loss: 3.400519609451294


training:  63%|██████▎   | 6888/10986 [2:40:33<1:34:37,  1.39s/it]

training loss: 3.4807586669921875


training:  63%|██████▎   | 6889/10986 [2:40:35<1:33:59,  1.38s/it]

training loss: 3.531034231185913


training:  63%|██████▎   | 6890/10986 [2:40:36<1:33:02,  1.36s/it]

training loss: 3.32770037651062


training:  63%|██████▎   | 6891/10986 [2:40:38<1:38:50,  1.45s/it]

training loss: 3.418731451034546


training:  63%|██████▎   | 6892/10986 [2:40:39<1:36:38,  1.42s/it]

training loss: 3.4508485794067383


training:  63%|██████▎   | 6893/10986 [2:40:40<1:35:52,  1.41s/it]

training loss: 3.4413468837738037


training:  63%|██████▎   | 6894/10986 [2:40:42<1:35:19,  1.40s/it]

training loss: 3.3126726150512695


training:  63%|██████▎   | 6895/10986 [2:40:43<1:33:51,  1.38s/it]

training loss: 3.358187198638916


training:  63%|██████▎   | 6896/10986 [2:40:45<1:33:06,  1.37s/it]

training loss: 3.495161294937134


training:  63%|██████▎   | 6897/10986 [2:40:46<1:32:06,  1.35s/it]

training loss: 3.47129487991333


training:  63%|██████▎   | 6898/10986 [2:40:47<1:31:41,  1.35s/it]

training loss: 3.424182415008545


training:  63%|██████▎   | 6899/10986 [2:40:48<1:31:27,  1.34s/it]

training loss: 3.3259506225585938


training:  63%|██████▎   | 6900/10986 [2:40:50<1:31:22,  1.34s/it]

training loss: 3.3495757579803467
valid loss: 3.3536183834075928
perplexity: 28.606056213378906


training:  63%|██████▎   | 6901/10986 [2:40:53<2:03:49,  1.82s/it]

training loss: 3.235475778579712


training:  63%|██████▎   | 6902/10986 [2:40:54<1:56:09,  1.71s/it]

training loss: 3.471876621246338


training:  63%|██████▎   | 6903/10986 [2:40:56<1:48:38,  1.60s/it]

training loss: 3.395193099975586


training:  63%|██████▎   | 6904/10986 [2:40:57<1:43:05,  1.52s/it]

training loss: 3.530240297317505


training:  63%|██████▎   | 6905/10986 [2:40:58<1:39:14,  1.46s/it]

training loss: 3.320514678955078


training:  63%|██████▎   | 6906/10986 [2:41:00<1:36:10,  1.41s/it]

training loss: 3.5335640907287598


training:  63%|██████▎   | 6907/10986 [2:41:01<1:34:04,  1.38s/it]

training loss: 3.4129638671875


training:  63%|██████▎   | 6908/10986 [2:41:02<1:32:52,  1.37s/it]

training loss: 3.2944040298461914


training:  63%|██████▎   | 6909/10986 [2:41:04<1:32:55,  1.37s/it]

training loss: 3.343041181564331


training:  63%|██████▎   | 6910/10986 [2:41:05<1:31:58,  1.35s/it]

training loss: 3.3720970153808594


training:  63%|██████▎   | 6911/10986 [2:41:06<1:37:20,  1.43s/it]

training loss: 3.3417458534240723


training:  63%|██████▎   | 6912/10986 [2:41:08<1:35:13,  1.40s/it]

training loss: 3.5340280532836914


training:  63%|██████▎   | 6913/10986 [2:41:09<1:33:40,  1.38s/it]

training loss: 3.367302656173706


training:  63%|██████▎   | 6914/10986 [2:41:10<1:33:23,  1.38s/it]

training loss: 3.307272434234619


training:  63%|██████▎   | 6915/10986 [2:41:12<1:32:19,  1.36s/it]

training loss: 3.448566198348999


training:  63%|██████▎   | 6916/10986 [2:41:13<1:32:02,  1.36s/it]

training loss: 3.4852404594421387


training:  63%|██████▎   | 6917/10986 [2:41:14<1:31:37,  1.35s/it]

training loss: 3.393150568008423


training:  63%|██████▎   | 6918/10986 [2:41:16<1:31:17,  1.35s/it]

training loss: 3.501774787902832


training:  63%|██████▎   | 6919/10986 [2:41:17<1:31:09,  1.34s/it]

training loss: 3.374307632446289


training:  63%|██████▎   | 6920/10986 [2:41:18<1:30:41,  1.34s/it]

training loss: 3.3073277473449707
valid loss: 3.3058154582977295
perplexity: 27.270771026611328


training:  63%|██████▎   | 6921/10986 [2:41:21<2:01:46,  1.80s/it]

training loss: 3.3778185844421387


training:  63%|██████▎   | 6922/10986 [2:41:23<1:59:32,  1.76s/it]

training loss: 3.467115879058838


training:  63%|██████▎   | 6923/10986 [2:41:24<1:51:25,  1.65s/it]

training loss: 3.456517457962036


training:  63%|██████▎   | 6924/10986 [2:41:26<1:44:55,  1.55s/it]

training loss: 3.3525121212005615


training:  63%|██████▎   | 6925/10986 [2:41:27<1:40:13,  1.48s/it]

training loss: 3.484889030456543


training:  63%|██████▎   | 6926/10986 [2:41:28<1:36:44,  1.43s/it]

training loss: 3.4841487407684326


training:  63%|██████▎   | 6927/10986 [2:41:30<1:34:28,  1.40s/it]

training loss: 3.4302892684936523


training:  63%|██████▎   | 6928/10986 [2:41:31<1:32:53,  1.37s/it]

training loss: 3.5149543285369873


training:  63%|██████▎   | 6929/10986 [2:41:32<1:31:57,  1.36s/it]

training loss: 3.408889055252075


training:  63%|██████▎   | 6930/10986 [2:41:34<1:31:19,  1.35s/it]

training loss: 3.3199057579040527


training:  63%|██████▎   | 6931/10986 [2:41:35<1:37:02,  1.44s/it]

training loss: 3.4410510063171387


training:  63%|██████▎   | 6932/10986 [2:41:37<1:40:37,  1.49s/it]

training loss: 3.397143840789795


training:  63%|██████▎   | 6933/10986 [2:41:38<1:37:40,  1.45s/it]

training loss: 3.520507574081421


training:  63%|██████▎   | 6934/10986 [2:41:40<1:35:45,  1.42s/it]

training loss: 3.4262821674346924


training:  63%|██████▎   | 6935/10986 [2:41:41<1:34:29,  1.40s/it]

training loss: 3.3834190368652344


training:  63%|██████▎   | 6936/10986 [2:41:42<1:33:11,  1.38s/it]

training loss: 3.479215145111084


training:  63%|██████▎   | 6937/10986 [2:41:44<1:32:14,  1.37s/it]

training loss: 3.42877459526062


training:  63%|██████▎   | 6938/10986 [2:41:45<1:32:31,  1.37s/it]

training loss: 3.4669041633605957


training:  63%|██████▎   | 6939/10986 [2:41:46<1:31:24,  1.36s/it]

training loss: 3.355015277862549


training:  63%|██████▎   | 6940/10986 [2:41:48<1:30:51,  1.35s/it]

training loss: 3.3339452743530273
valid loss: 3.3338115215301514
perplexity: 28.045032501220703


training:  63%|██████▎   | 6941/10986 [2:41:51<2:01:42,  1.81s/it]

training loss: 3.2972965240478516


training:  63%|██████▎   | 6942/10986 [2:41:52<1:53:47,  1.69s/it]

training loss: 3.468334674835205


training:  63%|██████▎   | 6943/10986 [2:41:53<1:46:12,  1.58s/it]

training loss: 3.555891513824463


training:  63%|██████▎   | 6944/10986 [2:41:55<1:41:13,  1.50s/it]

training loss: 3.459455966949463


training:  63%|██████▎   | 6945/10986 [2:41:56<1:37:33,  1.45s/it]

training loss: 3.3271591663360596


training:  63%|██████▎   | 6946/10986 [2:41:57<1:35:00,  1.41s/it]

training loss: 3.501232147216797


training:  63%|██████▎   | 6947/10986 [2:41:59<1:32:49,  1.38s/it]

training loss: 3.3559341430664062


training:  63%|██████▎   | 6948/10986 [2:42:00<1:31:03,  1.35s/it]

training loss: 3.435929775238037


training:  63%|██████▎   | 6949/10986 [2:42:01<1:30:18,  1.34s/it]

training loss: 3.4501149654388428


training:  63%|██████▎   | 6950/10986 [2:42:02<1:29:48,  1.34s/it]

training loss: 3.3459558486938477


training:  63%|██████▎   | 6951/10986 [2:42:04<1:35:05,  1.41s/it]

training loss: 3.526205539703369


training:  63%|██████▎   | 6952/10986 [2:42:05<1:33:01,  1.38s/it]

training loss: 3.393691062927246


training:  63%|██████▎   | 6953/10986 [2:42:07<1:31:31,  1.36s/it]

training loss: 3.48205828666687


training:  63%|██████▎   | 6954/10986 [2:42:08<1:30:05,  1.34s/it]

training loss: 3.3006584644317627


training:  63%|██████▎   | 6955/10986 [2:42:09<1:29:23,  1.33s/it]

training loss: 3.361462116241455


training:  63%|██████▎   | 6956/10986 [2:42:11<1:28:52,  1.32s/it]

training loss: 3.3899381160736084


training:  63%|██████▎   | 6957/10986 [2:42:12<1:29:23,  1.33s/it]

training loss: 3.418320417404175


training:  63%|██████▎   | 6958/10986 [2:42:13<1:29:17,  1.33s/it]

training loss: 3.413588047027588


training:  63%|██████▎   | 6959/10986 [2:42:15<1:28:57,  1.33s/it]

training loss: 3.3783226013183594


training:  63%|██████▎   | 6960/10986 [2:42:16<1:28:46,  1.32s/it]

training loss: 3.4262654781341553
valid loss: 3.423018455505371
perplexity: 30.66182518005371


training:  63%|██████▎   | 6961/10986 [2:42:19<1:59:02,  1.77s/it]

training loss: 3.400718927383423


training:  63%|██████▎   | 6962/10986 [2:42:20<1:51:38,  1.66s/it]

training loss: 3.4217770099639893


training:  63%|██████▎   | 6963/10986 [2:42:21<1:44:45,  1.56s/it]

training loss: 3.35080623626709


training:  63%|██████▎   | 6964/10986 [2:42:23<1:39:45,  1.49s/it]

training loss: 3.4039227962493896


training:  63%|██████▎   | 6965/10986 [2:42:24<1:36:27,  1.44s/it]

training loss: 3.373495578765869


training:  63%|██████▎   | 6966/10986 [2:42:25<1:33:29,  1.40s/it]

training loss: 3.511836290359497


training:  63%|██████▎   | 6967/10986 [2:42:27<1:31:52,  1.37s/it]

training loss: 3.362766742706299


training:  63%|██████▎   | 6968/10986 [2:42:28<1:30:47,  1.36s/it]

training loss: 3.321558952331543


training:  63%|██████▎   | 6969/10986 [2:42:29<1:29:33,  1.34s/it]

training loss: 3.4564337730407715


training:  63%|██████▎   | 6970/10986 [2:42:31<1:28:41,  1.33s/it]

training loss: 3.507486581802368


training:  63%|██████▎   | 6971/10986 [2:42:32<1:34:17,  1.41s/it]

training loss: 3.3594613075256348


training:  63%|██████▎   | 6972/10986 [2:42:34<1:32:46,  1.39s/it]

training loss: 3.366326332092285


training:  63%|██████▎   | 6973/10986 [2:42:35<1:31:16,  1.36s/it]

training loss: 3.405377149581909


training:  63%|██████▎   | 6974/10986 [2:42:36<1:29:50,  1.34s/it]

training loss: 3.4058687686920166


training:  63%|██████▎   | 6975/10986 [2:42:37<1:29:00,  1.33s/it]

training loss: 3.5385563373565674


training:  63%|██████▎   | 6976/10986 [2:42:39<1:28:16,  1.32s/it]

training loss: 3.2847299575805664


training:  64%|██████▎   | 6977/10986 [2:42:40<1:27:46,  1.31s/it]

training loss: 3.4214019775390625


training:  64%|██████▎   | 6978/10986 [2:42:41<1:28:04,  1.32s/it]

training loss: 3.3312864303588867


training:  64%|██████▎   | 6979/10986 [2:42:43<1:28:32,  1.33s/it]

training loss: 3.3799219131469727


training:  64%|██████▎   | 6980/10986 [2:42:44<1:28:26,  1.32s/it]

training loss: 3.4088613986968994
valid loss: 3.409679889678955
perplexity: 30.255556106567383


training:  64%|██████▎   | 6981/10986 [2:42:47<1:57:07,  1.75s/it]

training loss: 3.342153787612915


training:  64%|██████▎   | 6982/10986 [2:42:48<1:49:23,  1.64s/it]

training loss: 3.352926731109619


training:  64%|██████▎   | 6983/10986 [2:42:50<1:43:27,  1.55s/it]

training loss: 3.469792127609253


training:  64%|██████▎   | 6984/10986 [2:42:51<1:38:19,  1.47s/it]

training loss: 3.3243038654327393


training:  64%|██████▎   | 6985/10986 [2:42:52<1:35:50,  1.44s/it]

training loss: 3.4664969444274902


training:  64%|██████▎   | 6986/10986 [2:42:54<1:33:17,  1.40s/it]

training loss: 3.48232364654541


training:  64%|██████▎   | 6987/10986 [2:42:55<1:31:47,  1.38s/it]

training loss: 3.3315188884735107


training:  64%|██████▎   | 6988/10986 [2:42:56<1:30:33,  1.36s/it]

training loss: 3.4032983779907227


training:  64%|██████▎   | 6989/10986 [2:42:58<1:30:17,  1.36s/it]

training loss: 3.3859448432922363


training:  64%|██████▎   | 6990/10986 [2:42:59<1:29:40,  1.35s/it]

training loss: 3.395550489425659


training:  64%|██████▎   | 6991/10986 [2:43:00<1:35:43,  1.44s/it]

training loss: 3.377039909362793


training:  64%|██████▎   | 6992/10986 [2:43:02<1:33:47,  1.41s/it]

training loss: 3.411391496658325


training:  64%|██████▎   | 6993/10986 [2:43:03<1:32:32,  1.39s/it]

training loss: 3.424842357635498


training:  64%|██████▎   | 6994/10986 [2:43:04<1:31:08,  1.37s/it]

training loss: 3.4019246101379395


training:  64%|██████▎   | 6995/10986 [2:43:06<1:30:10,  1.36s/it]

training loss: 3.4046521186828613


training:  64%|██████▎   | 6996/10986 [2:43:07<1:29:10,  1.34s/it]

training loss: 3.3258867263793945


training:  64%|██████▎   | 6997/10986 [2:43:08<1:28:52,  1.34s/it]

training loss: 3.368347406387329


training:  64%|██████▎   | 6998/10986 [2:43:10<1:28:33,  1.33s/it]

training loss: 3.3425419330596924


training:  64%|██████▎   | 6999/10986 [2:43:11<1:28:42,  1.33s/it]

training loss: 3.345381736755371


training:  64%|██████▎   | 7000/10986 [2:43:12<1:28:55,  1.34s/it]

training loss: 3.361656665802002
valid loss: 3.3553876876831055
perplexity: 28.65671157836914


training:  64%|██████▎   | 7001/10986 [2:43:15<1:59:22,  1.80s/it]

training loss: 3.393710136413574


training:  64%|██████▎   | 7002/10986 [2:43:17<1:51:50,  1.68s/it]

training loss: 3.369570732116699


training:  64%|██████▎   | 7003/10986 [2:43:18<1:44:27,  1.57s/it]

training loss: 3.4073195457458496


training:  64%|██████▍   | 7004/10986 [2:43:19<1:39:27,  1.50s/it]

training loss: 3.3274312019348145


training:  64%|██████▍   | 7005/10986 [2:43:21<1:36:53,  1.46s/it]

training loss: 3.3625898361206055


training:  64%|██████▍   | 7006/10986 [2:43:22<1:34:03,  1.42s/it]

training loss: 3.499582052230835


training:  64%|██████▍   | 7007/10986 [2:43:23<1:31:50,  1.38s/it]

training loss: 3.594820022583008


training:  64%|██████▍   | 7008/10986 [2:43:25<1:31:02,  1.37s/it]

training loss: 3.353271961212158


training:  64%|██████▍   | 7009/10986 [2:43:26<1:31:28,  1.38s/it]

training loss: 3.472630023956299


training:  64%|██████▍   | 7010/10986 [2:43:28<1:37:38,  1.47s/it]

training loss: 3.397531032562256


training:  64%|██████▍   | 7011/10986 [2:43:30<1:48:01,  1.63s/it]

training loss: 3.390960216522217


training:  64%|██████▍   | 7012/10986 [2:43:31<1:42:38,  1.55s/it]

training loss: 3.3896541595458984


training:  64%|██████▍   | 7013/10986 [2:43:33<1:38:59,  1.49s/it]

training loss: 3.3807125091552734


training:  64%|██████▍   | 7014/10986 [2:43:34<1:35:34,  1.44s/it]

training loss: 3.404177188873291


training:  64%|██████▍   | 7015/10986 [2:43:35<1:33:25,  1.41s/it]

training loss: 3.3983588218688965


training:  64%|██████▍   | 7016/10986 [2:43:37<1:31:26,  1.38s/it]

training loss: 3.37062931060791


training:  64%|██████▍   | 7017/10986 [2:43:38<1:30:24,  1.37s/it]

training loss: 3.439573049545288


training:  64%|██████▍   | 7018/10986 [2:43:39<1:29:17,  1.35s/it]

training loss: 3.4155404567718506


training:  64%|██████▍   | 7019/10986 [2:43:41<1:29:40,  1.36s/it]

training loss: 3.4982705116271973


training:  64%|██████▍   | 7020/10986 [2:43:42<1:28:28,  1.34s/it]

training loss: 3.461003303527832
valid loss: 3.4595470428466797
perplexity: 31.802568435668945


training:  64%|██████▍   | 7021/10986 [2:43:45<1:58:41,  1.80s/it]

training loss: 3.430647373199463


training:  64%|██████▍   | 7022/10986 [2:43:46<1:50:42,  1.68s/it]

training loss: 3.458620548248291


training:  64%|██████▍   | 7023/10986 [2:43:47<1:44:01,  1.58s/it]

training loss: 3.3936350345611572


training:  64%|██████▍   | 7024/10986 [2:43:49<1:38:17,  1.49s/it]

training loss: 3.4266505241394043


training:  64%|██████▍   | 7025/10986 [2:43:50<1:34:57,  1.44s/it]

training loss: 3.467616319656372


training:  64%|██████▍   | 7026/10986 [2:43:51<1:32:31,  1.40s/it]

training loss: 3.386958122253418


training:  64%|██████▍   | 7027/10986 [2:43:53<1:30:53,  1.38s/it]

training loss: 3.361116886138916


training:  64%|██████▍   | 7028/10986 [2:43:54<1:29:34,  1.36s/it]

training loss: 3.47721004486084


training:  64%|██████▍   | 7029/10986 [2:43:55<1:28:34,  1.34s/it]

training loss: 3.358044385910034


training:  64%|██████▍   | 7030/10986 [2:43:57<1:27:23,  1.33s/it]

training loss: 3.3130807876586914


training:  64%|██████▍   | 7031/10986 [2:43:58<1:32:52,  1.41s/it]

training loss: 3.404569387435913


training:  64%|██████▍   | 7032/10986 [2:44:00<1:39:36,  1.51s/it]

training loss: 3.4512274265289307


training:  64%|██████▍   | 7033/10986 [2:44:01<1:35:43,  1.45s/it]

training loss: 3.445115804672241


training:  64%|██████▍   | 7034/10986 [2:44:03<1:33:00,  1.41s/it]

training loss: 3.2823402881622314


training:  64%|██████▍   | 7035/10986 [2:44:04<1:30:53,  1.38s/it]

training loss: 3.379276752471924


training:  64%|██████▍   | 7036/10986 [2:44:05<1:29:17,  1.36s/it]

training loss: 3.3631200790405273


training:  64%|██████▍   | 7037/10986 [2:44:06<1:28:28,  1.34s/it]

training loss: 3.466275930404663


training:  64%|██████▍   | 7038/10986 [2:44:08<1:27:23,  1.33s/it]

training loss: 3.426372766494751


training:  64%|██████▍   | 7039/10986 [2:44:09<1:26:37,  1.32s/it]

training loss: 3.5395545959472656


training:  64%|██████▍   | 7040/10986 [2:44:10<1:25:33,  1.30s/it]

training loss: 3.394723653793335
valid loss: 3.40000319480896
perplexity: 29.964195251464844


training:  64%|██████▍   | 7041/10986 [2:44:13<1:56:30,  1.77s/it]

training loss: 3.5207810401916504


training:  64%|██████▍   | 7042/10986 [2:44:15<1:54:35,  1.74s/it]

training loss: 3.4536423683166504


training:  64%|██████▍   | 7043/10986 [2:44:16<1:46:47,  1.63s/it]

training loss: 3.313424587249756


training:  64%|██████▍   | 7044/10986 [2:44:18<1:40:18,  1.53s/it]

training loss: 3.4157447814941406


training:  64%|██████▍   | 7045/10986 [2:44:19<1:35:52,  1.46s/it]

training loss: 3.3977842330932617


training:  64%|██████▍   | 7046/10986 [2:44:20<1:33:02,  1.42s/it]

training loss: 3.4538981914520264


training:  64%|██████▍   | 7047/10986 [2:44:21<1:31:04,  1.39s/it]

training loss: 3.322843313217163


training:  64%|██████▍   | 7048/10986 [2:44:23<1:29:57,  1.37s/it]

training loss: 3.457937240600586


training:  64%|██████▍   | 7049/10986 [2:44:24<1:28:27,  1.35s/it]

training loss: 3.37874436378479


training:  64%|██████▍   | 7050/10986 [2:44:25<1:27:48,  1.34s/it]

training loss: 3.363208532333374


training:  64%|██████▍   | 7051/10986 [2:44:27<1:33:13,  1.42s/it]

training loss: 3.4850258827209473


training:  64%|██████▍   | 7052/10986 [2:44:28<1:31:24,  1.39s/it]

training loss: 3.406520366668701


training:  64%|██████▍   | 7053/10986 [2:44:30<1:29:40,  1.37s/it]

training loss: 3.381439685821533


training:  64%|██████▍   | 7054/10986 [2:44:31<1:27:59,  1.34s/it]

training loss: 3.424577474594116


training:  64%|██████▍   | 7055/10986 [2:44:32<1:26:43,  1.32s/it]

training loss: 3.3820621967315674


training:  64%|██████▍   | 7056/10986 [2:44:34<1:26:17,  1.32s/it]

training loss: 3.4444289207458496


training:  64%|██████▍   | 7057/10986 [2:44:35<1:25:49,  1.31s/it]

training loss: 3.449509620666504


training:  64%|██████▍   | 7058/10986 [2:44:36<1:25:00,  1.30s/it]

training loss: 3.2833211421966553


training:  64%|██████▍   | 7059/10986 [2:44:37<1:24:33,  1.29s/it]

training loss: 3.359065532684326


training:  64%|██████▍   | 7060/10986 [2:44:39<1:24:43,  1.29s/it]

training loss: 3.4868381023406982
valid loss: 3.4829092025756836
perplexity: 32.554290771484375


training:  64%|██████▍   | 7061/10986 [2:44:41<1:52:51,  1.73s/it]

training loss: 3.3330111503601074


training:  64%|██████▍   | 7062/10986 [2:44:43<1:45:53,  1.62s/it]

training loss: 3.5662896633148193


training:  64%|██████▍   | 7063/10986 [2:44:44<1:39:23,  1.52s/it]

training loss: 3.3139870166778564


training:  64%|██████▍   | 7064/10986 [2:44:45<1:34:54,  1.45s/it]

training loss: 3.420020341873169


training:  64%|██████▍   | 7065/10986 [2:44:47<1:31:36,  1.40s/it]

training loss: 3.350954532623291


training:  64%|██████▍   | 7066/10986 [2:44:48<1:29:09,  1.36s/it]

training loss: 3.3704726696014404


training:  64%|██████▍   | 7067/10986 [2:44:49<1:27:13,  1.34s/it]

training loss: 3.4411375522613525


training:  64%|██████▍   | 7068/10986 [2:44:50<1:25:54,  1.32s/it]

training loss: 3.3290200233459473


training:  64%|██████▍   | 7069/10986 [2:44:52<1:25:01,  1.30s/it]

training loss: 3.3917274475097656


training:  64%|██████▍   | 7070/10986 [2:44:53<1:24:39,  1.30s/it]

training loss: 3.395124912261963


training:  64%|██████▍   | 7071/10986 [2:44:55<1:30:30,  1.39s/it]

training loss: 3.4606354236602783


training:  64%|██████▍   | 7072/10986 [2:44:56<1:30:17,  1.38s/it]

training loss: 3.4071590900421143


training:  64%|██████▍   | 7073/10986 [2:44:57<1:28:12,  1.35s/it]

training loss: 3.431976795196533


training:  64%|██████▍   | 7074/10986 [2:44:59<1:26:35,  1.33s/it]

training loss: 3.358097791671753


training:  64%|██████▍   | 7075/10986 [2:45:00<1:25:49,  1.32s/it]

training loss: 3.3589723110198975


training:  64%|██████▍   | 7076/10986 [2:45:01<1:25:17,  1.31s/it]

training loss: 3.3548686504364014


training:  64%|██████▍   | 7077/10986 [2:45:02<1:24:50,  1.30s/it]

training loss: 3.3470489978790283


training:  64%|██████▍   | 7078/10986 [2:45:04<1:24:38,  1.30s/it]

training loss: 3.4563987255096436


training:  64%|██████▍   | 7079/10986 [2:45:05<1:24:24,  1.30s/it]

training loss: 3.3606278896331787


training:  64%|██████▍   | 7080/10986 [2:45:06<1:24:35,  1.30s/it]

training loss: 3.4134774208068848
valid loss: 3.4105231761932373
perplexity: 30.281082153320312


training:  64%|██████▍   | 7081/10986 [2:45:09<1:53:01,  1.74s/it]

training loss: 3.409097194671631


training:  64%|██████▍   | 7082/10986 [2:45:11<1:51:43,  1.72s/it]

training loss: 3.3240175247192383


training:  64%|██████▍   | 7083/10986 [2:45:12<1:43:42,  1.59s/it]

training loss: 3.4548346996307373


training:  64%|██████▍   | 7084/10986 [2:45:13<1:37:57,  1.51s/it]

training loss: 3.4404184818267822


training:  64%|██████▍   | 7085/10986 [2:45:15<1:33:39,  1.44s/it]

training loss: 3.436613082885742


training:  65%|██████▍   | 7086/10986 [2:45:16<1:30:30,  1.39s/it]

training loss: 3.5017929077148438


training:  65%|██████▍   | 7087/10986 [2:45:17<1:28:07,  1.36s/it]

training loss: 3.53157114982605


training:  65%|██████▍   | 7088/10986 [2:45:18<1:26:41,  1.33s/it]

training loss: 3.386434316635132


training:  65%|██████▍   | 7089/10986 [2:45:20<1:25:42,  1.32s/it]

training loss: 3.436978816986084


training:  65%|██████▍   | 7090/10986 [2:45:21<1:25:04,  1.31s/it]

training loss: 3.3679850101470947


training:  65%|██████▍   | 7091/10986 [2:45:23<1:30:39,  1.40s/it]

training loss: 3.323436737060547


training:  65%|██████▍   | 7092/10986 [2:45:24<1:28:42,  1.37s/it]

training loss: 3.4592039585113525


training:  65%|██████▍   | 7093/10986 [2:45:25<1:27:15,  1.34s/it]

training loss: 3.46700382232666


training:  65%|██████▍   | 7094/10986 [2:45:27<1:26:26,  1.33s/it]

training loss: 3.340888023376465


training:  65%|██████▍   | 7095/10986 [2:45:28<1:26:05,  1.33s/it]

training loss: 3.3716342449188232


training:  65%|██████▍   | 7096/10986 [2:45:29<1:25:18,  1.32s/it]

training loss: 3.3802287578582764


training:  65%|██████▍   | 7097/10986 [2:45:30<1:24:14,  1.30s/it]

training loss: 3.3610997200012207


training:  65%|██████▍   | 7098/10986 [2:45:32<1:23:50,  1.29s/it]

training loss: 3.4067330360412598


training:  65%|██████▍   | 7099/10986 [2:45:33<1:23:27,  1.29s/it]

training loss: 3.3822433948516846


training:  65%|██████▍   | 7100/10986 [2:45:34<1:23:00,  1.28s/it]

training loss: 3.555030345916748
valid loss: 3.549941301345825
perplexity: 34.81127166748047


training:  65%|██████▍   | 7101/10986 [2:45:37<1:51:49,  1.73s/it]

training loss: 3.2942683696746826


training:  65%|██████▍   | 7102/10986 [2:45:38<1:45:14,  1.63s/it]

training loss: 3.3834381103515625


training:  65%|██████▍   | 7103/10986 [2:45:40<1:38:52,  1.53s/it]

training loss: 3.3565707206726074


training:  65%|██████▍   | 7104/10986 [2:45:41<1:34:30,  1.46s/it]

training loss: 3.4786832332611084


training:  65%|██████▍   | 7105/10986 [2:45:42<1:31:57,  1.42s/it]

training loss: 3.406566858291626


training:  65%|██████▍   | 7106/10986 [2:45:44<1:29:39,  1.39s/it]

training loss: 3.3653504848480225


training:  65%|██████▍   | 7107/10986 [2:45:45<1:28:16,  1.37s/it]

training loss: 3.5455322265625


training:  65%|██████▍   | 7108/10986 [2:45:46<1:26:57,  1.35s/it]

training loss: 3.283243179321289


training:  65%|██████▍   | 7109/10986 [2:45:48<1:26:25,  1.34s/it]

training loss: 3.4403939247131348


training:  65%|██████▍   | 7110/10986 [2:45:49<1:25:31,  1.32s/it]

training loss: 3.4149913787841797


training:  65%|██████▍   | 7111/10986 [2:45:50<1:30:02,  1.39s/it]

training loss: 3.3665761947631836


training:  65%|██████▍   | 7112/10986 [2:45:52<1:33:42,  1.45s/it]

training loss: 3.42588472366333


training:  65%|██████▍   | 7113/10986 [2:45:53<1:31:33,  1.42s/it]

training loss: 3.3184690475463867


training:  65%|██████▍   | 7114/10986 [2:45:55<1:29:36,  1.39s/it]

training loss: 3.378154754638672


training:  65%|██████▍   | 7115/10986 [2:45:56<1:27:59,  1.36s/it]

training loss: 3.395540237426758


training:  65%|██████▍   | 7116/10986 [2:45:57<1:26:38,  1.34s/it]

training loss: 3.33250093460083


training:  65%|██████▍   | 7117/10986 [2:45:59<1:25:48,  1.33s/it]

training loss: 3.4088709354400635


training:  65%|██████▍   | 7118/10986 [2:46:00<1:24:54,  1.32s/it]

training loss: 3.4360671043395996


training:  65%|██████▍   | 7119/10986 [2:46:01<1:24:15,  1.31s/it]

training loss: 3.3946609497070312


training:  65%|██████▍   | 7120/10986 [2:46:02<1:24:05,  1.31s/it]

training loss: 3.301154613494873
valid loss: 3.297025680541992
perplexity: 27.032115936279297


training:  65%|██████▍   | 7121/10986 [2:46:05<1:52:52,  1.75s/it]

training loss: 3.3799407482147217


training:  65%|██████▍   | 7122/10986 [2:46:07<1:45:25,  1.64s/it]

training loss: 3.382297992706299


training:  65%|██████▍   | 7123/10986 [2:46:08<1:38:45,  1.53s/it]

training loss: 3.291656017303467


training:  65%|██████▍   | 7124/10986 [2:46:09<1:33:58,  1.46s/it]

training loss: 3.419109344482422


training:  65%|██████▍   | 7125/10986 [2:46:10<1:30:27,  1.41s/it]

training loss: 3.2742114067077637


training:  65%|██████▍   | 7126/10986 [2:46:12<1:28:24,  1.37s/it]

training loss: 3.4636154174804688


training:  65%|██████▍   | 7127/10986 [2:46:13<1:27:16,  1.36s/it]

training loss: 3.3211376667022705


training:  65%|██████▍   | 7128/10986 [2:46:14<1:26:06,  1.34s/it]

training loss: 3.4391491413116455


training:  65%|██████▍   | 7129/10986 [2:46:16<1:25:33,  1.33s/it]

training loss: 3.3857877254486084


training:  65%|██████▍   | 7130/10986 [2:46:17<1:25:05,  1.32s/it]

training loss: 3.36930251121521


training:  65%|██████▍   | 7131/10986 [2:46:19<1:29:49,  1.40s/it]

training loss: 3.373267889022827


training:  65%|██████▍   | 7132/10986 [2:46:20<1:28:24,  1.38s/it]

training loss: 3.4327545166015625


training:  65%|██████▍   | 7133/10986 [2:46:21<1:27:23,  1.36s/it]

training loss: 3.3823020458221436


training:  65%|██████▍   | 7134/10986 [2:46:22<1:26:26,  1.35s/it]

training loss: 3.3304929733276367


training:  65%|██████▍   | 7135/10986 [2:46:24<1:25:53,  1.34s/it]

training loss: 3.3032445907592773


training:  65%|██████▍   | 7136/10986 [2:46:25<1:24:54,  1.32s/it]

training loss: 3.2984695434570312


training:  65%|██████▍   | 7137/10986 [2:46:26<1:24:04,  1.31s/it]

training loss: 3.3598439693450928


training:  65%|██████▍   | 7138/10986 [2:46:28<1:23:52,  1.31s/it]

training loss: 3.3883137702941895


training:  65%|██████▍   | 7139/10986 [2:46:29<1:23:41,  1.31s/it]

training loss: 3.326737642288208


training:  65%|██████▍   | 7140/10986 [2:46:30<1:23:08,  1.30s/it]

training loss: 3.3873205184936523
valid loss: 3.3871958255767822
perplexity: 29.5828800201416


training:  65%|██████▌   | 7141/10986 [2:46:33<1:51:31,  1.74s/it]

training loss: 3.4164905548095703


training:  65%|██████▌   | 7142/10986 [2:46:35<1:46:37,  1.66s/it]

training loss: 3.3192334175109863


training:  65%|██████▌   | 7143/10986 [2:46:36<1:40:04,  1.56s/it]

training loss: 3.3757781982421875


training:  65%|██████▌   | 7144/10986 [2:46:37<1:35:06,  1.49s/it]

training loss: 3.462815761566162


training:  65%|██████▌   | 7145/10986 [2:46:38<1:31:10,  1.42s/it]

training loss: 3.45278000831604


training:  65%|██████▌   | 7146/10986 [2:46:40<1:28:51,  1.39s/it]

training loss: 3.4280946254730225


training:  65%|██████▌   | 7147/10986 [2:46:41<1:27:19,  1.36s/it]

training loss: 3.3774049282073975


training:  65%|██████▌   | 7148/10986 [2:46:42<1:26:00,  1.34s/it]

training loss: 3.483776569366455


training:  65%|██████▌   | 7149/10986 [2:46:44<1:25:02,  1.33s/it]

training loss: 3.401494026184082


training:  65%|██████▌   | 7150/10986 [2:46:45<1:24:16,  1.32s/it]

training loss: 3.34492564201355


training:  65%|██████▌   | 7151/10986 [2:46:46<1:28:53,  1.39s/it]

training loss: 3.3766376972198486


training:  65%|██████▌   | 7152/10986 [2:46:48<1:27:26,  1.37s/it]

training loss: 3.411200523376465


training:  65%|██████▌   | 7153/10986 [2:46:49<1:26:21,  1.35s/it]

training loss: 3.4333624839782715


training:  65%|██████▌   | 7154/10986 [2:46:50<1:25:17,  1.34s/it]

training loss: 3.3555514812469482


training:  65%|██████▌   | 7155/10986 [2:46:52<1:24:56,  1.33s/it]

training loss: 3.4411075115203857


training:  65%|██████▌   | 7156/10986 [2:46:53<1:24:38,  1.33s/it]

training loss: 3.3457629680633545


training:  65%|██████▌   | 7157/10986 [2:46:54<1:24:36,  1.33s/it]

training loss: 3.4029159545898438


training:  65%|██████▌   | 7158/10986 [2:46:56<1:24:32,  1.33s/it]

training loss: 3.33957839012146


training:  65%|██████▌   | 7159/10986 [2:46:57<1:23:57,  1.32s/it]

training loss: 3.3291311264038086


training:  65%|██████▌   | 7160/10986 [2:46:58<1:24:47,  1.33s/it]

training loss: 3.372781753540039
valid loss: 3.368535280227661
perplexity: 29.035966873168945


training:  65%|██████▌   | 7161/10986 [2:47:01<1:55:22,  1.81s/it]

training loss: 3.3720431327819824


training:  65%|██████▌   | 7162/10986 [2:47:03<1:56:12,  1.82s/it]

training loss: 3.4529805183410645


training:  65%|██████▌   | 7163/10986 [2:47:05<1:56:12,  1.82s/it]

training loss: 3.4458863735198975


training:  65%|██████▌   | 7164/10986 [2:47:07<1:51:02,  1.74s/it]

training loss: 3.402223825454712


training:  65%|██████▌   | 7165/10986 [2:47:08<1:42:57,  1.62s/it]

training loss: 3.337111473083496


training:  65%|██████▌   | 7166/10986 [2:47:09<1:37:12,  1.53s/it]

training loss: 3.371244192123413


training:  65%|██████▌   | 7167/10986 [2:47:11<1:33:49,  1.47s/it]

training loss: 3.4098849296569824


training:  65%|██████▌   | 7168/10986 [2:47:12<1:30:58,  1.43s/it]

training loss: 3.487623691558838


training:  65%|██████▌   | 7169/10986 [2:47:13<1:29:13,  1.40s/it]

training loss: 3.4327731132507324


training:  65%|██████▌   | 7170/10986 [2:47:15<1:28:14,  1.39s/it]

training loss: 3.399826765060425


training:  65%|██████▌   | 7171/10986 [2:47:16<1:32:17,  1.45s/it]

training loss: 3.47691011428833


training:  65%|██████▌   | 7172/10986 [2:47:18<1:31:39,  1.44s/it]

training loss: 3.388092279434204


training:  65%|██████▌   | 7173/10986 [2:47:19<1:29:42,  1.41s/it]

training loss: 3.406686782836914


training:  65%|██████▌   | 7174/10986 [2:47:20<1:27:31,  1.38s/it]

training loss: 3.3892323970794678


training:  65%|██████▌   | 7175/10986 [2:47:22<1:26:32,  1.36s/it]

training loss: 3.429298162460327


training:  65%|██████▌   | 7176/10986 [2:47:23<1:26:01,  1.35s/it]

training loss: 3.461369514465332


training:  65%|██████▌   | 7177/10986 [2:47:24<1:25:57,  1.35s/it]

training loss: 3.3065202236175537


training:  65%|██████▌   | 7178/10986 [2:47:26<1:25:02,  1.34s/it]

training loss: 3.3734443187713623


training:  65%|██████▌   | 7179/10986 [2:47:27<1:24:24,  1.33s/it]

training loss: 3.3833189010620117


training:  65%|██████▌   | 7180/10986 [2:47:28<1:24:11,  1.33s/it]

training loss: 3.4222733974456787
valid loss: 3.4207401275634766
perplexity: 30.592050552368164


training:  65%|██████▌   | 7181/10986 [2:47:31<1:52:58,  1.78s/it]

training loss: 3.371342182159424


training:  65%|██████▌   | 7182/10986 [2:47:32<1:45:33,  1.67s/it]

training loss: 3.339651346206665


training:  65%|██████▌   | 7183/10986 [2:47:34<1:38:47,  1.56s/it]

training loss: 3.5284337997436523


training:  65%|██████▌   | 7184/10986 [2:47:35<1:33:51,  1.48s/it]

training loss: 3.4970669746398926


training:  65%|██████▌   | 7185/10986 [2:47:36<1:30:32,  1.43s/it]

training loss: 3.3345866203308105


training:  65%|██████▌   | 7186/10986 [2:47:38<1:28:20,  1.39s/it]

training loss: 3.541897773742676


training:  65%|██████▌   | 7187/10986 [2:47:39<1:26:36,  1.37s/it]

training loss: 3.3086044788360596


training:  65%|██████▌   | 7188/10986 [2:47:40<1:25:43,  1.35s/it]

training loss: 3.4577553272247314


training:  65%|██████▌   | 7189/10986 [2:47:42<1:25:20,  1.35s/it]

training loss: 3.3935787677764893


training:  65%|██████▌   | 7190/10986 [2:47:43<1:24:59,  1.34s/it]

training loss: 3.424431324005127


training:  65%|██████▌   | 7191/10986 [2:47:45<1:30:12,  1.43s/it]

training loss: 3.296006202697754


training:  65%|██████▌   | 7192/10986 [2:47:46<1:28:38,  1.40s/it]

training loss: 3.457773208618164


training:  65%|██████▌   | 7193/10986 [2:47:47<1:27:26,  1.38s/it]

training loss: 3.491082191467285


training:  65%|██████▌   | 7194/10986 [2:47:49<1:27:09,  1.38s/it]

training loss: 3.4387784004211426


training:  65%|██████▌   | 7195/10986 [2:47:50<1:26:27,  1.37s/it]

training loss: 3.3298983573913574


training:  66%|██████▌   | 7196/10986 [2:47:51<1:25:27,  1.35s/it]

training loss: 3.3323943614959717


training:  66%|██████▌   | 7197/10986 [2:47:53<1:24:47,  1.34s/it]

training loss: 3.4619624614715576


training:  66%|██████▌   | 7198/10986 [2:47:54<1:24:30,  1.34s/it]

training loss: 3.367013692855835


training:  66%|██████▌   | 7199/10986 [2:47:55<1:24:17,  1.34s/it]

training loss: 3.355424642562866


training:  66%|██████▌   | 7200/10986 [2:47:57<1:24:11,  1.33s/it]

training loss: 3.4456236362457275
valid loss: 3.4385550022125244
perplexity: 31.141925811767578


training:  66%|██████▌   | 7201/10986 [2:47:59<1:51:32,  1.77s/it]

training loss: 3.417442560195923


training:  66%|██████▌   | 7202/10986 [2:48:01<1:49:06,  1.73s/it]

training loss: 3.282838821411133


training:  66%|██████▌   | 7203/10986 [2:48:02<1:41:58,  1.62s/it]

training loss: 3.405731439590454


training:  66%|██████▌   | 7204/10986 [2:48:04<1:36:24,  1.53s/it]

training loss: 3.409644365310669


training:  66%|██████▌   | 7205/10986 [2:48:05<1:32:08,  1.46s/it]

training loss: 3.4791624546051025


training:  66%|██████▌   | 7206/10986 [2:48:06<1:29:02,  1.41s/it]

training loss: 3.366591215133667


training:  66%|██████▌   | 7207/10986 [2:48:08<1:27:04,  1.38s/it]

training loss: 3.512697219848633


training:  66%|██████▌   | 7208/10986 [2:48:09<1:25:19,  1.36s/it]

training loss: 3.4516711235046387


training:  66%|██████▌   | 7209/10986 [2:48:10<1:24:20,  1.34s/it]

training loss: 3.316624402999878


training:  66%|██████▌   | 7210/10986 [2:48:11<1:23:46,  1.33s/it]

training loss: 3.6201274394989014


training:  66%|██████▌   | 7211/10986 [2:48:13<1:30:16,  1.43s/it]

training loss: 3.417628288269043


training:  66%|██████▌   | 7212/10986 [2:48:14<1:28:18,  1.40s/it]

training loss: 3.421464681625366


training:  66%|██████▌   | 7213/10986 [2:48:16<1:26:28,  1.38s/it]

training loss: 3.409797430038452


training:  66%|██████▌   | 7214/10986 [2:48:17<1:25:32,  1.36s/it]

training loss: 3.3893814086914062


training:  66%|██████▌   | 7215/10986 [2:48:18<1:25:13,  1.36s/it]

training loss: 3.326442241668701


training:  66%|██████▌   | 7216/10986 [2:48:20<1:24:38,  1.35s/it]

training loss: 3.4117958545684814


training:  66%|██████▌   | 7217/10986 [2:48:21<1:24:13,  1.34s/it]

training loss: 3.482595205307007


training:  66%|██████▌   | 7218/10986 [2:48:22<1:23:44,  1.33s/it]

training loss: 3.4190733432769775


training:  66%|██████▌   | 7219/10986 [2:48:24<1:23:18,  1.33s/it]

training loss: 3.4258780479431152


training:  66%|██████▌   | 7220/10986 [2:48:25<1:23:17,  1.33s/it]

training loss: 3.388796091079712
valid loss: 3.3848891258239746
perplexity: 29.514720916748047


training:  66%|██████▌   | 7221/10986 [2:48:28<1:50:24,  1.76s/it]

training loss: 3.5311923027038574


training:  66%|██████▌   | 7222/10986 [2:48:29<1:44:33,  1.67s/it]

training loss: 3.505023956298828


training:  66%|██████▌   | 7223/10986 [2:48:31<1:37:41,  1.56s/it]

training loss: 3.4073245525360107


training:  66%|██████▌   | 7224/10986 [2:48:32<1:33:01,  1.48s/it]

training loss: 3.534693956375122


training:  66%|██████▌   | 7225/10986 [2:48:33<1:29:50,  1.43s/it]

training loss: 3.395528793334961


training:  66%|██████▌   | 7226/10986 [2:48:34<1:27:06,  1.39s/it]

training loss: 3.3744726181030273


training:  66%|██████▌   | 7227/10986 [2:48:36<1:25:11,  1.36s/it]

training loss: 3.335071086883545


training:  66%|██████▌   | 7228/10986 [2:48:37<1:24:15,  1.35s/it]

training loss: 3.37477970123291


training:  66%|██████▌   | 7229/10986 [2:48:38<1:23:39,  1.34s/it]

training loss: 3.3431496620178223


training:  66%|██████▌   | 7230/10986 [2:48:40<1:23:02,  1.33s/it]

training loss: 3.2577898502349854


training:  66%|██████▌   | 7231/10986 [2:48:41<1:27:52,  1.40s/it]

training loss: 3.5182712078094482


training:  66%|██████▌   | 7232/10986 [2:48:43<1:26:44,  1.39s/it]

training loss: 3.4505550861358643


training:  66%|██████▌   | 7233/10986 [2:48:44<1:25:06,  1.36s/it]

training loss: 3.399665117263794


training:  66%|██████▌   | 7234/10986 [2:48:45<1:24:06,  1.35s/it]

training loss: 3.4660048484802246


training:  66%|██████▌   | 7235/10986 [2:48:47<1:23:37,  1.34s/it]

training loss: 3.4098904132843018


training:  66%|██████▌   | 7236/10986 [2:48:48<1:22:56,  1.33s/it]

training loss: 3.374573230743408


training:  66%|██████▌   | 7237/10986 [2:48:49<1:23:16,  1.33s/it]

training loss: 3.4549951553344727


training:  66%|██████▌   | 7238/10986 [2:48:51<1:23:03,  1.33s/it]

training loss: 3.4120593070983887


training:  66%|██████▌   | 7239/10986 [2:48:52<1:22:15,  1.32s/it]

training loss: 3.3817203044891357


training:  66%|██████▌   | 7240/10986 [2:48:53<1:21:55,  1.31s/it]

training loss: 3.407395124435425
valid loss: 3.41408109664917
perplexity: 30.38901138305664


training:  66%|██████▌   | 7241/10986 [2:48:56<1:50:24,  1.77s/it]

training loss: 3.406052589416504


training:  66%|██████▌   | 7242/10986 [2:48:57<1:42:57,  1.65s/it]

training loss: 3.3962814807891846


training:  66%|██████▌   | 7243/10986 [2:48:59<1:36:51,  1.55s/it]

training loss: 3.527733325958252


training:  66%|██████▌   | 7244/10986 [2:49:00<1:31:43,  1.47s/it]

training loss: 3.423781633377075


training:  66%|██████▌   | 7245/10986 [2:49:01<1:28:37,  1.42s/it]

training loss: 3.4855949878692627


training:  66%|██████▌   | 7246/10986 [2:49:03<1:26:57,  1.40s/it]

training loss: 3.342578887939453


training:  66%|██████▌   | 7247/10986 [2:49:04<1:25:22,  1.37s/it]

training loss: 3.4464924335479736


training:  66%|██████▌   | 7248/10986 [2:49:05<1:24:11,  1.35s/it]

training loss: 3.4368088245391846


training:  66%|██████▌   | 7249/10986 [2:49:07<1:23:03,  1.33s/it]

training loss: 3.4794442653656006


training:  66%|██████▌   | 7250/10986 [2:49:08<1:21:48,  1.31s/it]

training loss: 3.3377115726470947


training:  66%|██████▌   | 7251/10986 [2:49:09<1:26:59,  1.40s/it]

training loss: 3.4201443195343018


training:  66%|██████▌   | 7252/10986 [2:49:11<1:25:18,  1.37s/it]

training loss: 3.403700828552246


training:  66%|██████▌   | 7253/10986 [2:49:12<1:23:55,  1.35s/it]

training loss: 3.3565802574157715


training:  66%|██████▌   | 7254/10986 [2:49:13<1:23:17,  1.34s/it]

training loss: 3.3480303287506104


training:  66%|██████▌   | 7255/10986 [2:49:15<1:22:26,  1.33s/it]

training loss: 3.4125075340270996


training:  66%|██████▌   | 7256/10986 [2:49:16<1:21:56,  1.32s/it]

training loss: 3.3644561767578125


training:  66%|██████▌   | 7257/10986 [2:49:17<1:21:40,  1.31s/it]

training loss: 3.40527606010437


training:  66%|██████▌   | 7258/10986 [2:49:19<1:22:26,  1.33s/it]

training loss: 3.4297893047332764


training:  66%|██████▌   | 7259/10986 [2:49:20<1:22:49,  1.33s/it]

training loss: 3.3696274757385254


training:  66%|██████▌   | 7260/10986 [2:49:21<1:22:56,  1.34s/it]

training loss: 3.447164297103882
valid loss: 3.4374115467071533
perplexity: 31.10633659362793


training:  66%|██████▌   | 7261/10986 [2:49:24<1:51:08,  1.79s/it]

training loss: 3.373901844024658


training:  66%|██████▌   | 7262/10986 [2:49:25<1:43:53,  1.67s/it]

training loss: 3.3415393829345703


training:  66%|██████▌   | 7263/10986 [2:49:27<1:37:01,  1.56s/it]

training loss: 3.409391403198242


training:  66%|██████▌   | 7264/10986 [2:49:28<1:32:01,  1.48s/it]

training loss: 3.349820613861084


training:  66%|██████▌   | 7265/10986 [2:49:29<1:28:35,  1.43s/it]

training loss: 3.4084880352020264


training:  66%|██████▌   | 7266/10986 [2:49:31<1:26:32,  1.40s/it]

training loss: 3.517331123352051


training:  66%|██████▌   | 7267/10986 [2:49:32<1:25:00,  1.37s/it]

training loss: 3.4735264778137207


training:  66%|██████▌   | 7268/10986 [2:49:33<1:23:27,  1.35s/it]

training loss: 3.3399219512939453


training:  66%|██████▌   | 7269/10986 [2:49:35<1:22:56,  1.34s/it]

training loss: 3.4444780349731445


training:  66%|██████▌   | 7270/10986 [2:49:36<1:22:35,  1.33s/it]

training loss: 3.430694103240967


training:  66%|██████▌   | 7271/10986 [2:49:38<1:27:24,  1.41s/it]

training loss: 3.427461624145508


training:  66%|██████▌   | 7272/10986 [2:49:39<1:31:10,  1.47s/it]

training loss: 3.336361885070801


training:  66%|██████▌   | 7273/10986 [2:49:40<1:28:31,  1.43s/it]

training loss: 3.391791820526123


training:  66%|██████▌   | 7274/10986 [2:49:42<1:26:32,  1.40s/it]

training loss: 3.2937707901000977


training:  66%|██████▌   | 7275/10986 [2:49:43<1:25:28,  1.38s/it]

training loss: 3.322993755340576


training:  66%|██████▌   | 7276/10986 [2:49:44<1:24:26,  1.37s/it]

training loss: 3.430896043777466


training:  66%|██████▌   | 7277/10986 [2:49:46<1:23:16,  1.35s/it]

training loss: 3.496932029724121


training:  66%|██████▌   | 7278/10986 [2:49:47<1:22:55,  1.34s/it]

training loss: 3.413787603378296


training:  66%|██████▋   | 7279/10986 [2:49:48<1:22:26,  1.33s/it]

training loss: 3.314342737197876


training:  66%|██████▋   | 7280/10986 [2:49:50<1:22:31,  1.34s/it]

training loss: 3.3517284393310547
valid loss: 3.3503775596618652
perplexity: 28.513498306274414


training:  66%|██████▋   | 7281/10986 [2:49:53<1:49:55,  1.78s/it]

training loss: 3.4622654914855957


training:  66%|██████▋   | 7282/10986 [2:49:54<1:48:18,  1.75s/it]

training loss: 3.489558219909668


training:  66%|██████▋   | 7283/10986 [2:49:56<1:40:17,  1.63s/it]

training loss: 3.3760809898376465


training:  66%|██████▋   | 7284/10986 [2:49:57<1:34:38,  1.53s/it]

training loss: 3.393728256225586


training:  66%|██████▋   | 7285/10986 [2:49:58<1:30:31,  1.47s/it]

training loss: 3.500537872314453


training:  66%|██████▋   | 7286/10986 [2:50:00<1:27:41,  1.42s/it]

training loss: 3.3674497604370117


training:  66%|██████▋   | 7287/10986 [2:50:01<1:25:45,  1.39s/it]

training loss: 3.4804577827453613


training:  66%|██████▋   | 7288/10986 [2:50:02<1:24:18,  1.37s/it]

training loss: 3.476494550704956


training:  66%|██████▋   | 7289/10986 [2:50:04<1:23:24,  1.35s/it]

training loss: 3.369879722595215


training:  66%|██████▋   | 7290/10986 [2:50:05<1:22:30,  1.34s/it]

training loss: 3.3135221004486084


training:  66%|██████▋   | 7291/10986 [2:50:06<1:27:44,  1.42s/it]

training loss: 3.4206442832946777


training:  66%|██████▋   | 7292/10986 [2:50:08<1:30:39,  1.47s/it]

training loss: 3.324871301651001


training:  66%|██████▋   | 7293/10986 [2:50:09<1:27:42,  1.42s/it]

training loss: 3.4526939392089844


training:  66%|██████▋   | 7294/10986 [2:50:11<1:25:12,  1.38s/it]

training loss: 3.5137572288513184


training:  66%|██████▋   | 7295/10986 [2:50:12<1:23:38,  1.36s/it]

training loss: 3.285198450088501


training:  66%|██████▋   | 7296/10986 [2:50:13<1:22:46,  1.35s/it]

training loss: 3.357699394226074


training:  66%|██████▋   | 7297/10986 [2:50:15<1:22:11,  1.34s/it]

training loss: 3.393247127532959


training:  66%|██████▋   | 7298/10986 [2:50:16<1:21:41,  1.33s/it]

training loss: 3.3444697856903076


training:  66%|██████▋   | 7299/10986 [2:50:17<1:21:06,  1.32s/it]

training loss: 3.39393949508667


training:  66%|██████▋   | 7300/10986 [2:50:18<1:21:01,  1.32s/it]

training loss: 3.3398969173431396
valid loss: 3.334331512451172
perplexity: 28.059619903564453


training:  66%|██████▋   | 7301/10986 [2:50:21<1:49:11,  1.78s/it]

training loss: 3.309678077697754


training:  66%|██████▋   | 7302/10986 [2:50:23<1:42:00,  1.66s/it]

training loss: 3.3841168880462646


training:  66%|██████▋   | 7303/10986 [2:50:24<1:35:50,  1.56s/it]

training loss: 3.4060776233673096


training:  66%|██████▋   | 7304/10986 [2:50:25<1:31:09,  1.49s/it]

training loss: 3.374329090118408


training:  66%|██████▋   | 7305/10986 [2:50:27<1:27:41,  1.43s/it]

training loss: 3.3117947578430176


training:  67%|██████▋   | 7306/10986 [2:50:28<1:25:34,  1.40s/it]

training loss: 3.4048893451690674


training:  67%|██████▋   | 7307/10986 [2:50:29<1:24:04,  1.37s/it]

training loss: 3.4258482456207275


training:  67%|██████▋   | 7308/10986 [2:50:31<1:23:09,  1.36s/it]

training loss: 3.350292921066284


training:  67%|██████▋   | 7309/10986 [2:50:32<1:22:50,  1.35s/it]

training loss: 3.496581792831421


training:  67%|██████▋   | 7310/10986 [2:50:33<1:22:05,  1.34s/it]

training loss: 3.424161434173584


training:  67%|██████▋   | 7311/10986 [2:50:35<1:26:46,  1.42s/it]

training loss: 3.437577486038208


training:  67%|██████▋   | 7312/10986 [2:50:36<1:25:21,  1.39s/it]

training loss: 3.455286979675293


training:  67%|██████▋   | 7313/10986 [2:50:38<1:23:47,  1.37s/it]

training loss: 3.4376261234283447


training:  67%|██████▋   | 7314/10986 [2:50:39<1:22:12,  1.34s/it]

training loss: 3.3117244243621826


training:  67%|██████▋   | 7315/10986 [2:50:40<1:21:58,  1.34s/it]

training loss: 3.43670392036438


training:  67%|██████▋   | 7316/10986 [2:50:42<1:24:49,  1.39s/it]

training loss: 3.4375128746032715


training:  67%|██████▋   | 7317/10986 [2:50:43<1:30:28,  1.48s/it]

training loss: 3.3903164863586426


training:  67%|██████▋   | 7318/10986 [2:50:45<1:34:02,  1.54s/it]

training loss: 3.4915027618408203


training:  67%|██████▋   | 7319/10986 [2:50:46<1:30:02,  1.47s/it]

training loss: 3.4269728660583496


training:  67%|██████▋   | 7320/10986 [2:50:48<1:26:39,  1.42s/it]

training loss: 3.3677902221679688
valid loss: 3.3704638481140137
perplexity: 29.092018127441406


training:  67%|██████▋   | 7321/10986 [2:50:50<1:52:47,  1.85s/it]

training loss: 3.3014400005340576


training:  67%|██████▋   | 7322/10986 [2:50:52<1:49:02,  1.79s/it]

training loss: 3.398442506790161


training:  67%|██████▋   | 7323/10986 [2:50:53<1:41:06,  1.66s/it]

training loss: 3.4473869800567627


training:  67%|██████▋   | 7324/10986 [2:50:55<1:35:02,  1.56s/it]

training loss: 3.3963727951049805


training:  67%|██████▋   | 7325/10986 [2:50:56<1:30:51,  1.49s/it]

training loss: 3.440051317214966


training:  67%|██████▋   | 7326/10986 [2:50:57<1:27:36,  1.44s/it]

training loss: 3.413414239883423


training:  67%|██████▋   | 7327/10986 [2:50:59<1:25:02,  1.39s/it]

training loss: 3.4412691593170166


training:  67%|██████▋   | 7328/10986 [2:51:00<1:23:30,  1.37s/it]

training loss: 3.3209228515625


training:  67%|██████▋   | 7329/10986 [2:51:01<1:22:35,  1.36s/it]

training loss: 3.405977964401245


training:  67%|██████▋   | 7330/10986 [2:51:03<1:22:16,  1.35s/it]

training loss: 3.4602596759796143


training:  67%|██████▋   | 7331/10986 [2:51:04<1:27:07,  1.43s/it]

training loss: 3.377016067504883


training:  67%|██████▋   | 7332/10986 [2:51:06<1:25:32,  1.40s/it]

training loss: 3.4031822681427


training:  67%|██████▋   | 7333/10986 [2:51:07<1:24:00,  1.38s/it]

training loss: 3.4270780086517334


training:  67%|██████▋   | 7334/10986 [2:51:08<1:23:10,  1.37s/it]

training loss: 3.348637580871582


training:  67%|██████▋   | 7335/10986 [2:51:10<1:22:39,  1.36s/it]

training loss: 3.418954849243164


training:  67%|██████▋   | 7336/10986 [2:51:11<1:21:59,  1.35s/it]

training loss: 3.3661556243896484


training:  67%|██████▋   | 7337/10986 [2:51:12<1:21:27,  1.34s/it]

training loss: 3.4139695167541504


training:  67%|██████▋   | 7338/10986 [2:51:14<1:21:07,  1.33s/it]

training loss: 3.405815362930298


training:  67%|██████▋   | 7339/10986 [2:51:15<1:20:42,  1.33s/it]

training loss: 3.4380412101745605


training:  67%|██████▋   | 7340/10986 [2:51:16<1:20:41,  1.33s/it]

training loss: 3.3595001697540283
valid loss: 3.3527469635009766
perplexity: 28.581138610839844


training:  67%|██████▋   | 7341/10986 [2:51:19<1:48:17,  1.78s/it]

training loss: 3.3983154296875


training:  67%|██████▋   | 7342/10986 [2:51:21<1:46:04,  1.75s/it]

training loss: 3.390922784805298


training:  67%|██████▋   | 7343/10986 [2:51:22<1:38:52,  1.63s/it]

training loss: 3.279946804046631


training:  67%|██████▋   | 7344/10986 [2:51:23<1:33:35,  1.54s/it]

training loss: 3.306267499923706


training:  67%|██████▋   | 7345/10986 [2:51:25<1:29:55,  1.48s/it]

training loss: 3.40510892868042


training:  67%|██████▋   | 7346/10986 [2:51:26<1:26:50,  1.43s/it]

training loss: 3.387261152267456


training:  67%|██████▋   | 7347/10986 [2:51:27<1:24:50,  1.40s/it]

training loss: 3.5085031986236572


training:  67%|██████▋   | 7348/10986 [2:51:29<1:22:43,  1.36s/it]

training loss: 3.513627767562866


training:  67%|██████▋   | 7349/10986 [2:51:30<1:21:49,  1.35s/it]

training loss: 3.34753680229187


training:  67%|██████▋   | 7350/10986 [2:51:31<1:20:56,  1.34s/it]

training loss: 3.5522782802581787


training:  67%|██████▋   | 7351/10986 [2:51:33<1:25:54,  1.42s/it]

training loss: 3.470794200897217


training:  67%|██████▋   | 7352/10986 [2:51:35<1:30:39,  1.50s/it]

training loss: 3.4004690647125244


training:  67%|██████▋   | 7353/10986 [2:51:36<1:27:32,  1.45s/it]

training loss: 3.32669734954834


training:  67%|██████▋   | 7354/10986 [2:51:37<1:25:17,  1.41s/it]

training loss: 3.464927911758423


training:  67%|██████▋   | 7355/10986 [2:51:39<1:23:39,  1.38s/it]

training loss: 3.419247627258301


training:  67%|██████▋   | 7356/10986 [2:51:40<1:22:03,  1.36s/it]

training loss: 3.481055498123169


training:  67%|██████▋   | 7357/10986 [2:51:41<1:21:03,  1.34s/it]

training loss: 3.424316644668579


training:  67%|██████▋   | 7358/10986 [2:51:43<1:20:58,  1.34s/it]

training loss: 3.367785930633545


training:  67%|██████▋   | 7359/10986 [2:51:44<1:20:43,  1.34s/it]

training loss: 3.457127094268799


training:  67%|██████▋   | 7360/10986 [2:51:45<1:21:18,  1.35s/it]

training loss: 3.443913221359253
valid loss: 3.442298412322998
perplexity: 31.25872039794922


training:  67%|██████▋   | 7361/10986 [2:51:48<1:48:34,  1.80s/it]

training loss: 3.4141948223114014


training:  67%|██████▋   | 7362/10986 [2:51:50<1:41:57,  1.69s/it]

training loss: 3.472235918045044


training:  67%|██████▋   | 7363/10986 [2:51:51<1:36:11,  1.59s/it]

training loss: 3.394564628601074


training:  67%|██████▋   | 7364/10986 [2:51:52<1:31:06,  1.51s/it]

training loss: 3.2831692695617676


training:  67%|██████▋   | 7365/10986 [2:51:54<1:27:34,  1.45s/it]

training loss: 3.387585401535034


training:  67%|██████▋   | 7366/10986 [2:51:55<1:25:00,  1.41s/it]

training loss: 3.434218168258667


training:  67%|██████▋   | 7367/10986 [2:51:56<1:23:08,  1.38s/it]

training loss: 3.478050708770752


training:  67%|██████▋   | 7368/10986 [2:51:57<1:21:37,  1.35s/it]

training loss: 3.3845901489257812


training:  67%|██████▋   | 7369/10986 [2:51:59<1:20:49,  1.34s/it]

training loss: 3.505042552947998


training:  67%|██████▋   | 7370/10986 [2:52:00<1:20:14,  1.33s/it]

training loss: 3.4374711513519287


training:  67%|██████▋   | 7371/10986 [2:52:02<1:24:40,  1.41s/it]

training loss: 3.381340503692627


training:  67%|██████▋   | 7372/10986 [2:52:03<1:23:14,  1.38s/it]

training loss: 3.312779188156128


training:  67%|██████▋   | 7373/10986 [2:52:04<1:22:14,  1.37s/it]

training loss: 3.356081485748291


training:  67%|██████▋   | 7374/10986 [2:52:06<1:20:55,  1.34s/it]

training loss: 3.3817226886749268


training:  67%|██████▋   | 7375/10986 [2:52:07<1:19:56,  1.33s/it]

training loss: 3.3667454719543457


training:  67%|██████▋   | 7376/10986 [2:52:08<1:19:31,  1.32s/it]

training loss: 3.3639776706695557


training:  67%|██████▋   | 7377/10986 [2:52:09<1:19:30,  1.32s/it]

training loss: 3.4960079193115234


training:  67%|██████▋   | 7378/10986 [2:52:11<1:19:41,  1.33s/it]

training loss: 3.4211647510528564


training:  67%|██████▋   | 7379/10986 [2:52:12<1:18:52,  1.31s/it]

training loss: 3.448625326156616


training:  67%|██████▋   | 7380/10986 [2:52:13<1:18:26,  1.31s/it]

training loss: 3.386566162109375
valid loss: 3.3854634761810303
perplexity: 29.531675338745117


training:  67%|██████▋   | 7381/10986 [2:52:16<1:45:03,  1.75s/it]

training loss: 3.355214834213257


training:  67%|██████▋   | 7382/10986 [2:52:18<1:42:20,  1.70s/it]

training loss: 3.4763011932373047


training:  67%|██████▋   | 7383/10986 [2:52:19<1:35:27,  1.59s/it]

training loss: 3.4201691150665283


training:  67%|██████▋   | 7384/10986 [2:52:20<1:30:31,  1.51s/it]

training loss: 3.3584861755371094


training:  67%|██████▋   | 7385/10986 [2:52:22<1:27:43,  1.46s/it]

training loss: 3.343712568283081


training:  67%|██████▋   | 7386/10986 [2:52:23<1:25:07,  1.42s/it]

training loss: 3.3701651096343994


training:  67%|██████▋   | 7387/10986 [2:52:24<1:23:34,  1.39s/it]

training loss: 3.2807846069335938


training:  67%|██████▋   | 7388/10986 [2:52:26<1:22:01,  1.37s/it]

training loss: 3.3390908241271973


training:  67%|██████▋   | 7389/10986 [2:52:27<1:20:46,  1.35s/it]

training loss: 3.4104366302490234


training:  67%|██████▋   | 7390/10986 [2:52:28<1:20:11,  1.34s/it]

training loss: 3.4209699630737305


training:  67%|██████▋   | 7391/10986 [2:52:30<1:24:32,  1.41s/it]

training loss: 3.3443431854248047


training:  67%|██████▋   | 7392/10986 [2:52:32<1:27:54,  1.47s/it]

training loss: 3.2986581325531006


training:  67%|██████▋   | 7393/10986 [2:52:33<1:24:45,  1.42s/it]

training loss: 3.580425262451172


training:  67%|██████▋   | 7394/10986 [2:52:34<1:22:25,  1.38s/it]

training loss: 3.361633062362671


training:  67%|██████▋   | 7395/10986 [2:52:35<1:20:42,  1.35s/it]

training loss: 3.356466293334961


training:  67%|██████▋   | 7396/10986 [2:52:37<1:19:20,  1.33s/it]

training loss: 3.291766405105591


training:  67%|██████▋   | 7397/10986 [2:52:38<1:19:05,  1.32s/it]

training loss: 3.5466339588165283


training:  67%|██████▋   | 7398/10986 [2:52:39<1:18:36,  1.31s/it]

training loss: 3.4259519577026367


training:  67%|██████▋   | 7399/10986 [2:52:41<1:18:21,  1.31s/it]

training loss: 3.4864492416381836


training:  67%|██████▋   | 7400/10986 [2:52:42<1:17:55,  1.30s/it]

training loss: 3.4602279663085938
valid loss: 3.4576587677001953
perplexity: 31.742572784423828


training:  67%|██████▋   | 7401/10986 [2:52:45<1:43:47,  1.74s/it]

training loss: 3.3422248363494873


training:  67%|██████▋   | 7402/10986 [2:52:46<1:37:00,  1.62s/it]

training loss: 3.3728067874908447


training:  67%|██████▋   | 7403/10986 [2:52:47<1:31:15,  1.53s/it]

training loss: 3.368049383163452


training:  67%|██████▋   | 7404/10986 [2:52:49<1:27:04,  1.46s/it]

training loss: 3.5205891132354736


training:  67%|██████▋   | 7405/10986 [2:52:50<1:24:10,  1.41s/it]

training loss: 3.3361828327178955


training:  67%|██████▋   | 7406/10986 [2:52:51<1:23:24,  1.40s/it]

training loss: 3.4396328926086426


training:  67%|██████▋   | 7407/10986 [2:52:53<1:21:30,  1.37s/it]

training loss: 3.394380807876587


training:  67%|██████▋   | 7408/10986 [2:52:54<1:19:58,  1.34s/it]

training loss: 3.427750825881958


training:  67%|██████▋   | 7409/10986 [2:52:55<1:19:17,  1.33s/it]

training loss: 3.4238083362579346


training:  67%|██████▋   | 7410/10986 [2:52:56<1:19:04,  1.33s/it]

training loss: 3.3012850284576416


training:  67%|██████▋   | 7411/10986 [2:52:58<1:23:24,  1.40s/it]

training loss: 3.4063045978546143


training:  67%|██████▋   | 7412/10986 [2:53:00<1:27:29,  1.47s/it]

training loss: 3.4479763507843018


training:  67%|██████▋   | 7413/10986 [2:53:01<1:24:23,  1.42s/it]

training loss: 3.4489665031433105


training:  67%|██████▋   | 7414/10986 [2:53:02<1:22:19,  1.38s/it]

training loss: 3.3678653240203857


training:  67%|██████▋   | 7415/10986 [2:53:04<1:20:41,  1.36s/it]

training loss: 3.3813743591308594


training:  68%|██████▊   | 7416/10986 [2:53:05<1:19:28,  1.34s/it]

training loss: 3.319822072982788


training:  68%|██████▊   | 7417/10986 [2:53:06<1:19:03,  1.33s/it]

training loss: 3.372586965560913


training:  68%|██████▊   | 7418/10986 [2:53:07<1:18:36,  1.32s/it]

training loss: 3.3786401748657227


training:  68%|██████▊   | 7419/10986 [2:53:09<1:18:17,  1.32s/it]

training loss: 3.3549981117248535


training:  68%|██████▊   | 7420/10986 [2:53:10<1:17:51,  1.31s/it]

training loss: 3.4075944423675537
valid loss: 3.4053168296813965
perplexity: 30.123838424682617


training:  68%|██████▊   | 7421/10986 [2:53:13<1:43:18,  1.74s/it]

training loss: 3.372088670730591


training:  68%|██████▊   | 7422/10986 [2:53:14<1:36:47,  1.63s/it]

training loss: 3.3948440551757812


training:  68%|██████▊   | 7423/10986 [2:53:15<1:31:01,  1.53s/it]

training loss: 3.402355194091797


training:  68%|██████▊   | 7424/10986 [2:53:17<1:26:40,  1.46s/it]

training loss: 3.5195109844207764


training:  68%|██████▊   | 7425/10986 [2:53:18<1:23:56,  1.41s/it]

training loss: 3.3415586948394775


training:  68%|██████▊   | 7426/10986 [2:53:19<1:21:46,  1.38s/it]

training loss: 3.271162509918213


training:  68%|██████▊   | 7427/10986 [2:53:21<1:20:16,  1.35s/it]

training loss: 3.3220179080963135


training:  68%|██████▊   | 7428/10986 [2:53:22<1:20:12,  1.35s/it]

training loss: 3.3900394439697266


training:  68%|██████▊   | 7429/10986 [2:53:23<1:19:04,  1.33s/it]

training loss: 3.3706512451171875


training:  68%|██████▊   | 7430/10986 [2:53:25<1:18:49,  1.33s/it]

training loss: 3.3199007511138916


training:  68%|██████▊   | 7431/10986 [2:53:26<1:23:10,  1.40s/it]

training loss: 3.4220688343048096


training:  68%|██████▊   | 7432/10986 [2:53:28<1:21:45,  1.38s/it]

training loss: 3.459848165512085


training:  68%|██████▊   | 7433/10986 [2:53:29<1:20:17,  1.36s/it]

training loss: 3.310772180557251


training:  68%|██████▊   | 7434/10986 [2:53:30<1:19:18,  1.34s/it]

training loss: 3.2925283908843994


training:  68%|██████▊   | 7435/10986 [2:53:31<1:18:40,  1.33s/it]

training loss: 3.38865065574646


training:  68%|██████▊   | 7436/10986 [2:53:33<1:18:20,  1.32s/it]

training loss: 3.306478261947632


training:  68%|██████▊   | 7437/10986 [2:53:34<1:17:52,  1.32s/it]

training loss: 3.3494865894317627


training:  68%|██████▊   | 7438/10986 [2:53:35<1:17:34,  1.31s/it]

training loss: 3.3960061073303223


training:  68%|██████▊   | 7439/10986 [2:53:37<1:17:36,  1.31s/it]

training loss: 3.3568530082702637


training:  68%|██████▊   | 7440/10986 [2:53:38<1:16:53,  1.30s/it]

training loss: 3.329880714416504
valid loss: 3.3276407718658447
perplexity: 27.87250518798828


training:  68%|██████▊   | 7441/10986 [2:53:41<1:43:21,  1.75s/it]

training loss: 3.365121841430664


training:  68%|██████▊   | 7442/10986 [2:53:42<1:37:00,  1.64s/it]

training loss: 3.3322744369506836


training:  68%|██████▊   | 7443/10986 [2:53:43<1:31:34,  1.55s/it]

training loss: 3.387845039367676


training:  68%|██████▊   | 7444/10986 [2:53:45<1:27:07,  1.48s/it]

training loss: 3.3971498012542725


training:  68%|██████▊   | 7445/10986 [2:53:46<1:23:48,  1.42s/it]

training loss: 3.3044960498809814


training:  68%|██████▊   | 7446/10986 [2:53:47<1:21:58,  1.39s/it]

training loss: 3.3382201194763184


training:  68%|██████▊   | 7447/10986 [2:53:49<1:20:00,  1.36s/it]

training loss: 3.4102284908294678


training:  68%|██████▊   | 7448/10986 [2:53:50<1:19:03,  1.34s/it]

training loss: 3.3115975856781006


training:  68%|██████▊   | 7449/10986 [2:53:51<1:18:13,  1.33s/it]

training loss: 3.4553399085998535


training:  68%|██████▊   | 7450/10986 [2:53:53<1:17:38,  1.32s/it]

training loss: 3.3619837760925293


training:  68%|██████▊   | 7451/10986 [2:53:54<1:22:46,  1.41s/it]

training loss: 3.3226418495178223


training:  68%|██████▊   | 7452/10986 [2:53:55<1:21:03,  1.38s/it]

training loss: 3.4307289123535156


training:  68%|██████▊   | 7453/10986 [2:53:57<1:19:30,  1.35s/it]

training loss: 3.330275535583496


training:  68%|██████▊   | 7454/10986 [2:53:58<1:18:27,  1.33s/it]

training loss: 3.397669553756714


training:  68%|██████▊   | 7455/10986 [2:53:59<1:17:56,  1.32s/it]

training loss: 3.2897160053253174


training:  68%|██████▊   | 7456/10986 [2:54:01<1:17:15,  1.31s/it]

training loss: 3.4490156173706055


training:  68%|██████▊   | 7457/10986 [2:54:02<1:16:30,  1.30s/it]

training loss: 3.500433921813965


training:  68%|██████▊   | 7458/10986 [2:54:03<1:16:28,  1.30s/it]

training loss: 3.429494857788086


training:  68%|██████▊   | 7459/10986 [2:54:04<1:16:25,  1.30s/it]

training loss: 3.273143768310547


training:  68%|██████▊   | 7460/10986 [2:54:06<1:16:39,  1.30s/it]

training loss: 3.4172627925872803
valid loss: 3.4190115928649902
perplexity: 30.539215087890625


training:  68%|██████▊   | 7461/10986 [2:54:09<1:42:38,  1.75s/it]

training loss: 3.5229671001434326


training:  68%|██████▊   | 7462/10986 [2:54:10<1:41:04,  1.72s/it]

training loss: 3.399651527404785


training:  68%|██████▊   | 7463/10986 [2:54:12<1:33:53,  1.60s/it]

training loss: 3.3877885341644287


training:  68%|██████▊   | 7464/10986 [2:54:13<1:28:34,  1.51s/it]

training loss: 3.4057867527008057


training:  68%|██████▊   | 7465/10986 [2:54:14<1:24:46,  1.44s/it]

training loss: 3.373516082763672


training:  68%|██████▊   | 7466/10986 [2:54:15<1:22:33,  1.41s/it]

training loss: 3.4779434204101562


training:  68%|██████▊   | 7467/10986 [2:54:17<1:20:31,  1.37s/it]

training loss: 3.4247303009033203


training:  68%|██████▊   | 7468/10986 [2:54:18<1:19:23,  1.35s/it]

training loss: 3.288567304611206


training:  68%|██████▊   | 7469/10986 [2:54:19<1:18:31,  1.34s/it]

training loss: 3.370753765106201


training:  68%|██████▊   | 7470/10986 [2:54:21<1:21:27,  1.39s/it]

training loss: 3.3294506072998047


training:  68%|██████▊   | 7471/10986 [2:54:23<1:34:10,  1.61s/it]

training loss: 3.3899266719818115


training:  68%|██████▊   | 7472/10986 [2:54:25<1:39:53,  1.71s/it]

training loss: 3.3474996089935303


training:  68%|██████▊   | 7473/10986 [2:54:26<1:33:21,  1.59s/it]

training loss: 3.500577449798584


training:  68%|██████▊   | 7474/10986 [2:54:28<1:28:31,  1.51s/it]

training loss: 3.4169106483459473


training:  68%|██████▊   | 7475/10986 [2:54:29<1:24:45,  1.45s/it]

training loss: 3.4254796504974365


training:  68%|██████▊   | 7476/10986 [2:54:30<1:22:24,  1.41s/it]

training loss: 3.3472249507904053


training:  68%|██████▊   | 7477/10986 [2:54:31<1:20:15,  1.37s/it]

training loss: 3.364954710006714


training:  68%|██████▊   | 7478/10986 [2:54:33<1:18:56,  1.35s/it]

training loss: 3.4612364768981934


training:  68%|██████▊   | 7479/10986 [2:54:34<1:18:05,  1.34s/it]

training loss: 3.3076066970825195


training:  68%|██████▊   | 7480/10986 [2:54:35<1:17:41,  1.33s/it]

training loss: 3.3996589183807373
valid loss: 3.3942883014678955
perplexity: 29.793441772460938


training:  68%|██████▊   | 7481/10986 [2:54:38<1:43:56,  1.78s/it]

training loss: 3.5076560974121094


training:  68%|██████▊   | 7482/10986 [2:54:40<1:36:57,  1.66s/it]

training loss: 3.333163022994995


training:  68%|██████▊   | 7483/10986 [2:54:41<1:31:20,  1.56s/it]

training loss: 3.593679904937744


training:  68%|██████▊   | 7484/10986 [2:54:42<1:26:28,  1.48s/it]

training loss: 3.5341503620147705


training:  68%|██████▊   | 7485/10986 [2:54:44<1:23:38,  1.43s/it]

training loss: 3.426568031311035


training:  68%|██████▊   | 7486/10986 [2:54:45<1:21:43,  1.40s/it]

training loss: 3.363755226135254


training:  68%|██████▊   | 7487/10986 [2:54:46<1:20:05,  1.37s/it]

training loss: 3.368176221847534


training:  68%|██████▊   | 7488/10986 [2:54:48<1:18:53,  1.35s/it]

training loss: 3.4598388671875


training:  68%|██████▊   | 7489/10986 [2:54:49<1:17:50,  1.34s/it]

training loss: 3.5093555450439453


training:  68%|██████▊   | 7490/10986 [2:54:50<1:16:57,  1.32s/it]

training loss: 3.4677181243896484


training:  68%|██████▊   | 7491/10986 [2:54:52<1:21:40,  1.40s/it]

training loss: 3.579822063446045


training:  68%|██████▊   | 7492/10986 [2:54:53<1:25:45,  1.47s/it]

training loss: 3.63960337638855


training:  68%|██████▊   | 7493/10986 [2:54:55<1:23:25,  1.43s/it]

training loss: 3.5363903045654297


training:  68%|██████▊   | 7494/10986 [2:54:56<1:20:40,  1.39s/it]

training loss: 3.433587074279785


training:  68%|██████▊   | 7495/10986 [2:54:57<1:19:26,  1.37s/it]

training loss: 3.4525716304779053


training:  68%|██████▊   | 7496/10986 [2:54:59<1:18:37,  1.35s/it]

training loss: 3.474915027618408


training:  68%|██████▊   | 7497/10986 [2:55:00<1:18:18,  1.35s/it]

training loss: 3.4816057682037354


training:  68%|██████▊   | 7498/10986 [2:55:01<1:17:19,  1.33s/it]

training loss: 3.4393229484558105


training:  68%|██████▊   | 7499/10986 [2:55:03<1:17:29,  1.33s/it]

training loss: 3.460801601409912


training:  68%|██████▊   | 7500/10986 [2:55:04<1:16:58,  1.32s/it]

training loss: 3.674455165863037
valid loss: 3.68743634223938
perplexity: 39.94231414794922


training:  68%|██████▊   | 7501/10986 [2:55:07<1:42:26,  1.76s/it]

training loss: 3.359348773956299


training:  68%|██████▊   | 7502/10986 [2:55:08<1:36:23,  1.66s/it]

training loss: 3.492290496826172


training:  68%|██████▊   | 7503/10986 [2:55:09<1:30:29,  1.56s/it]

training loss: 3.373650312423706


training:  68%|██████▊   | 7504/10986 [2:55:11<1:26:23,  1.49s/it]

training loss: 3.4968764781951904


training:  68%|██████▊   | 7505/10986 [2:55:12<1:23:38,  1.44s/it]

training loss: 3.459261655807495


training:  68%|██████▊   | 7506/10986 [2:55:13<1:21:24,  1.40s/it]

training loss: 3.464238166809082


training:  68%|██████▊   | 7507/10986 [2:55:15<1:20:00,  1.38s/it]

training loss: 3.4242496490478516


training:  68%|██████▊   | 7508/10986 [2:55:16<1:19:16,  1.37s/it]

training loss: 3.4019813537597656


training:  68%|██████▊   | 7509/10986 [2:55:17<1:18:26,  1.35s/it]

training loss: 3.5061118602752686


training:  68%|██████▊   | 7510/10986 [2:55:19<1:17:59,  1.35s/it]

training loss: 3.4641077518463135


training:  68%|██████▊   | 7511/10986 [2:55:20<1:22:00,  1.42s/it]

training loss: 3.524637460708618


training:  68%|██████▊   | 7512/10986 [2:55:22<1:21:27,  1.41s/it]

training loss: 3.4893081188201904


training:  68%|██████▊   | 7513/10986 [2:55:23<1:20:56,  1.40s/it]

training loss: 3.4314258098602295


training:  68%|██████▊   | 7514/10986 [2:55:24<1:19:46,  1.38s/it]

training loss: 3.4694459438323975


training:  68%|██████▊   | 7515/10986 [2:55:26<1:18:33,  1.36s/it]

training loss: 3.4546496868133545


training:  68%|██████▊   | 7516/10986 [2:55:27<1:17:52,  1.35s/it]

training loss: 3.3658790588378906


training:  68%|██████▊   | 7517/10986 [2:55:28<1:17:44,  1.34s/it]

training loss: 3.543461322784424


training:  68%|██████▊   | 7518/10986 [2:55:30<1:16:57,  1.33s/it]

training loss: 3.4965474605560303


training:  68%|██████▊   | 7519/10986 [2:55:31<1:16:50,  1.33s/it]

training loss: 3.4078052043914795


training:  68%|██████▊   | 7520/10986 [2:55:32<1:17:11,  1.34s/it]

training loss: 3.4586899280548096
valid loss: 3.47192645072937
perplexity: 32.19871520996094


training:  68%|██████▊   | 7521/10986 [2:55:35<1:43:03,  1.78s/it]

training loss: 3.54624605178833


training:  68%|██████▊   | 7522/10986 [2:55:37<1:36:35,  1.67s/it]

training loss: 3.4854021072387695


training:  68%|██████▊   | 7523/10986 [2:55:38<1:30:18,  1.56s/it]

training loss: 3.4832069873809814


training:  68%|██████▊   | 7524/10986 [2:55:39<1:26:13,  1.49s/it]

training loss: 3.3515031337738037


training:  68%|██████▊   | 7525/10986 [2:55:41<1:23:41,  1.45s/it]

training loss: 3.4177703857421875


training:  69%|██████▊   | 7526/10986 [2:55:42<1:21:07,  1.41s/it]

training loss: 3.417353868484497


training:  69%|██████▊   | 7527/10986 [2:55:43<1:19:44,  1.38s/it]

training loss: 3.475728750228882


training:  69%|██████▊   | 7528/10986 [2:55:44<1:18:52,  1.37s/it]

training loss: 3.3420121669769287


training:  69%|██████▊   | 7529/10986 [2:55:46<1:17:59,  1.35s/it]

training loss: 3.626380443572998


training:  69%|██████▊   | 7530/10986 [2:55:47<1:17:18,  1.34s/it]

training loss: 3.386568546295166


training:  69%|██████▊   | 7531/10986 [2:55:49<1:21:51,  1.42s/it]

training loss: 3.458474636077881


training:  69%|██████▊   | 7532/10986 [2:55:50<1:20:50,  1.40s/it]

training loss: 3.4717745780944824


training:  69%|██████▊   | 7533/10986 [2:55:51<1:19:35,  1.38s/it]

training loss: 3.408310890197754


training:  69%|██████▊   | 7534/10986 [2:55:53<1:19:17,  1.38s/it]

training loss: 3.610462188720703


training:  69%|██████▊   | 7535/10986 [2:55:54<1:18:43,  1.37s/it]

training loss: 3.483999729156494


training:  69%|██████▊   | 7536/10986 [2:55:55<1:17:45,  1.35s/it]

training loss: 3.4769415855407715


training:  69%|██████▊   | 7537/10986 [2:55:57<1:17:02,  1.34s/it]

training loss: 3.458665609359741


training:  69%|██████▊   | 7538/10986 [2:55:58<1:16:48,  1.34s/it]

training loss: 3.4245333671569824


training:  69%|██████▊   | 7539/10986 [2:55:59<1:16:14,  1.33s/it]

training loss: 3.381958484649658


training:  69%|██████▊   | 7540/10986 [2:56:01<1:16:17,  1.33s/it]

training loss: 3.4217922687530518
valid loss: 3.417618989944458
perplexity: 30.496715545654297


training:  69%|██████▊   | 7541/10986 [2:56:04<1:41:49,  1.77s/it]

training loss: 3.476355791091919


training:  69%|██████▊   | 7542/10986 [2:56:05<1:34:57,  1.65s/it]

training loss: 3.4497101306915283


training:  69%|██████▊   | 7543/10986 [2:56:06<1:28:53,  1.55s/it]

training loss: 3.8352973461151123


training:  69%|██████▊   | 7544/10986 [2:56:08<1:24:22,  1.47s/it]

training loss: 3.479624032974243


training:  69%|██████▊   | 7545/10986 [2:56:09<1:21:27,  1.42s/it]

training loss: 3.5589280128479004


training:  69%|██████▊   | 7546/10986 [2:56:10<1:18:58,  1.38s/it]

training loss: 3.4224774837493896


training:  69%|██████▊   | 7547/10986 [2:56:11<1:17:46,  1.36s/it]

training loss: 3.4623591899871826


training:  69%|██████▊   | 7548/10986 [2:56:13<1:16:33,  1.34s/it]

training loss: 3.4124269485473633


training:  69%|██████▊   | 7549/10986 [2:56:14<1:15:54,  1.33s/it]

training loss: 3.4628846645355225


training:  69%|██████▊   | 7550/10986 [2:56:15<1:15:23,  1.32s/it]

training loss: 3.523192882537842


training:  69%|██████▊   | 7551/10986 [2:56:17<1:19:24,  1.39s/it]

training loss: 3.5834109783172607


training:  69%|██████▊   | 7552/10986 [2:56:18<1:23:10,  1.45s/it]

training loss: 3.4578166007995605


training:  69%|██████▉   | 7553/10986 [2:56:20<1:20:23,  1.40s/it]

training loss: 3.3898863792419434


training:  69%|██████▉   | 7554/10986 [2:56:21<1:18:13,  1.37s/it]

training loss: 3.479475736618042


training:  69%|██████▉   | 7555/10986 [2:56:22<1:17:15,  1.35s/it]

training loss: 3.5151445865631104


training:  69%|██████▉   | 7556/10986 [2:56:24<1:16:16,  1.33s/it]

training loss: 3.453523874282837


training:  69%|██████▉   | 7557/10986 [2:56:25<1:15:29,  1.32s/it]

training loss: 3.6037607192993164


training:  69%|██████▉   | 7558/10986 [2:56:26<1:15:09,  1.32s/it]

training loss: 3.6518945693969727


training:  69%|██████▉   | 7559/10986 [2:56:28<1:14:48,  1.31s/it]

training loss: 3.3901326656341553


training:  69%|██████▉   | 7560/10986 [2:56:29<1:14:31,  1.31s/it]

training loss: 3.510213851928711
valid loss: 3.521028757095337
perplexity: 33.8192024230957


training:  69%|██████▉   | 7561/10986 [2:56:32<1:38:56,  1.73s/it]

training loss: 3.5680582523345947


training:  69%|██████▉   | 7562/10986 [2:56:33<1:36:35,  1.69s/it]

training loss: 3.3963823318481445


training:  69%|██████▉   | 7563/10986 [2:56:34<1:29:20,  1.57s/it]

training loss: 3.402878522872925


training:  69%|██████▉   | 7564/10986 [2:56:36<1:23:57,  1.47s/it]

training loss: 3.3873751163482666


training:  69%|██████▉   | 7565/10986 [2:56:37<1:20:18,  1.41s/it]

training loss: 3.5750653743743896


training:  69%|██████▉   | 7566/10986 [2:56:38<1:17:39,  1.36s/it]

training loss: 3.4545583724975586


training:  69%|██████▉   | 7567/10986 [2:56:39<1:16:12,  1.34s/it]

training loss: 3.3373382091522217


training:  69%|██████▉   | 7568/10986 [2:56:41<1:15:04,  1.32s/it]

training loss: 3.4529008865356445


training:  69%|██████▉   | 7569/10986 [2:56:42<1:14:01,  1.30s/it]

training loss: 3.540992498397827


training:  69%|██████▉   | 7570/10986 [2:56:43<1:13:30,  1.29s/it]

training loss: 3.3153038024902344


training:  69%|██████▉   | 7571/10986 [2:56:45<1:18:21,  1.38s/it]

training loss: 3.4715278148651123


training:  69%|██████▉   | 7572/10986 [2:56:46<1:21:30,  1.43s/it]

training loss: 3.396435260772705


training:  69%|██████▉   | 7573/10986 [2:56:48<1:18:49,  1.39s/it]

training loss: 3.4162497520446777


training:  69%|██████▉   | 7574/10986 [2:56:49<1:16:52,  1.35s/it]

training loss: 3.487720489501953


training:  69%|██████▉   | 7575/10986 [2:56:50<1:15:29,  1.33s/it]

training loss: 3.455798864364624


training:  69%|██████▉   | 7576/10986 [2:56:51<1:14:29,  1.31s/it]

training loss: 3.589104413986206


training:  69%|██████▉   | 7577/10986 [2:56:53<1:13:37,  1.30s/it]

training loss: 3.5716989040374756


training:  69%|██████▉   | 7578/10986 [2:56:54<1:13:48,  1.30s/it]

training loss: 3.5086958408355713


training:  69%|██████▉   | 7579/10986 [2:56:55<1:13:22,  1.29s/it]

training loss: 3.4735360145568848


training:  69%|██████▉   | 7580/10986 [2:56:57<1:13:26,  1.29s/it]

training loss: 3.421391725540161
valid loss: 3.4024593830108643
perplexity: 30.037883758544922


training:  69%|██████▉   | 7581/10986 [2:56:59<1:37:57,  1.73s/it]

training loss: 3.48423433303833


training:  69%|██████▉   | 7582/10986 [2:57:01<1:31:22,  1.61s/it]

training loss: 3.4545960426330566


training:  69%|██████▉   | 7583/10986 [2:57:02<1:25:34,  1.51s/it]

training loss: 3.402517080307007


training:  69%|██████▉   | 7584/10986 [2:57:03<1:21:35,  1.44s/it]

training loss: 3.5257036685943604


training:  69%|██████▉   | 7585/10986 [2:57:04<1:18:14,  1.38s/it]

training loss: 3.43192195892334


training:  69%|██████▉   | 7586/10986 [2:57:06<1:16:07,  1.34s/it]

training loss: 3.488215446472168


training:  69%|██████▉   | 7587/10986 [2:57:07<1:14:12,  1.31s/it]

training loss: 3.563427209854126


training:  69%|██████▉   | 7588/10986 [2:57:08<1:12:58,  1.29s/it]

training loss: 3.4791743755340576


training:  69%|██████▉   | 7589/10986 [2:57:09<1:12:07,  1.27s/it]

training loss: 3.527163028717041


training:  69%|██████▉   | 7590/10986 [2:57:11<1:12:10,  1.28s/it]

training loss: 3.4782724380493164


training:  69%|██████▉   | 7591/10986 [2:57:12<1:16:32,  1.35s/it]

training loss: 3.5268449783325195


training:  69%|██████▉   | 7592/10986 [2:57:14<1:14:47,  1.32s/it]

training loss: 3.3702521324157715


training:  69%|██████▉   | 7593/10986 [2:57:15<1:13:47,  1.30s/it]

training loss: 3.4331119060516357


training:  69%|██████▉   | 7594/10986 [2:57:16<1:12:38,  1.29s/it]

training loss: 3.3928370475769043


training:  69%|██████▉   | 7595/10986 [2:57:17<1:11:45,  1.27s/it]

training loss: 3.3320343494415283


training:  69%|██████▉   | 7596/10986 [2:57:19<1:11:19,  1.26s/it]

training loss: 3.4204013347625732


training:  69%|██████▉   | 7597/10986 [2:57:20<1:10:38,  1.25s/it]

training loss: 3.5172505378723145


training:  69%|██████▉   | 7598/10986 [2:57:21<1:10:32,  1.25s/it]

training loss: 3.4177448749542236


training:  69%|██████▉   | 7599/10986 [2:57:22<1:10:51,  1.26s/it]

training loss: 3.392498016357422


training:  69%|██████▉   | 7600/10986 [2:57:23<1:10:31,  1.25s/it]

training loss: 3.6651413440704346
valid loss: 3.6598761081695557
perplexity: 38.856529235839844


training:  69%|██████▉   | 7601/10986 [2:57:26<1:34:07,  1.67s/it]

training loss: 3.57597017288208


training:  69%|██████▉   | 7602/10986 [2:57:28<1:32:48,  1.65s/it]

training loss: 3.346304178237915


training:  69%|██████▉   | 7603/10986 [2:57:29<1:26:00,  1.53s/it]

training loss: 3.343273639678955


training:  69%|██████▉   | 7604/10986 [2:57:30<1:21:14,  1.44s/it]

training loss: 3.543428421020508


training:  69%|██████▉   | 7605/10986 [2:57:31<1:17:38,  1.38s/it]

training loss: 3.4725754261016846


training:  69%|██████▉   | 7606/10986 [2:57:33<1:15:11,  1.33s/it]

training loss: 3.3458173274993896


training:  69%|██████▉   | 7607/10986 [2:57:34<1:13:35,  1.31s/it]

training loss: 3.3684983253479004


training:  69%|██████▉   | 7608/10986 [2:57:35<1:11:58,  1.28s/it]

training loss: 3.3728444576263428


training:  69%|██████▉   | 7609/10986 [2:57:36<1:10:48,  1.26s/it]

training loss: 3.584320306777954


training:  69%|██████▉   | 7610/10986 [2:57:38<1:10:09,  1.25s/it]

training loss: 3.4519100189208984


training:  69%|██████▉   | 7611/10986 [2:57:39<1:14:19,  1.32s/it]

training loss: 3.5350303649902344


training:  69%|██████▉   | 7612/10986 [2:57:40<1:13:06,  1.30s/it]

training loss: 3.427314519882202


training:  69%|██████▉   | 7613/10986 [2:57:42<1:11:40,  1.27s/it]

training loss: 3.622884750366211


training:  69%|██████▉   | 7614/10986 [2:57:43<1:10:52,  1.26s/it]

training loss: 3.566361665725708


training:  69%|██████▉   | 7615/10986 [2:57:44<1:10:08,  1.25s/it]

training loss: 3.4152512550354004


training:  69%|██████▉   | 7616/10986 [2:57:45<1:09:51,  1.24s/it]

training loss: 3.3180418014526367


training:  69%|██████▉   | 7617/10986 [2:57:46<1:09:45,  1.24s/it]

training loss: 3.4733095169067383


training:  69%|██████▉   | 7618/10986 [2:57:48<1:09:21,  1.24s/it]

training loss: 3.494814872741699


training:  69%|██████▉   | 7619/10986 [2:57:49<1:09:15,  1.23s/it]

training loss: 3.480095624923706


training:  69%|██████▉   | 7620/10986 [2:57:50<1:09:00,  1.23s/it]

training loss: 3.5821409225463867
valid loss: 3.5716075897216797
perplexity: 35.573734283447266


training:  69%|██████▉   | 7621/10986 [2:57:53<1:33:25,  1.67s/it]

training loss: 3.481522798538208


training:  69%|██████▉   | 7622/10986 [2:57:54<1:32:35,  1.65s/it]

training loss: 3.505025625228882


training:  69%|██████▉   | 7623/10986 [2:57:56<1:25:37,  1.53s/it]

training loss: 3.363600730895996


training:  69%|██████▉   | 7624/10986 [2:57:57<1:20:44,  1.44s/it]

training loss: 3.4054954051971436


training:  69%|██████▉   | 7625/10986 [2:57:58<1:17:45,  1.39s/it]

training loss: 3.5463297367095947


training:  69%|██████▉   | 7626/10986 [2:57:59<1:15:22,  1.35s/it]

training loss: 3.3773889541625977


training:  69%|██████▉   | 7627/10986 [2:58:01<1:18:39,  1.40s/it]

training loss: 3.4150984287261963


training:  69%|██████▉   | 7628/10986 [2:58:03<1:21:26,  1.46s/it]

training loss: 3.510485887527466


training:  69%|██████▉   | 7629/10986 [2:58:04<1:22:46,  1.48s/it]

training loss: 3.4055581092834473


training:  69%|██████▉   | 7630/10986 [2:58:05<1:18:26,  1.40s/it]

training loss: 3.438910722732544


training:  69%|██████▉   | 7631/10986 [2:58:07<1:20:08,  1.43s/it]

training loss: 3.4387967586517334


training:  69%|██████▉   | 7632/10986 [2:58:08<1:17:38,  1.39s/it]

training loss: 3.404787540435791


training:  69%|██████▉   | 7633/10986 [2:58:09<1:15:02,  1.34s/it]

training loss: 3.428950548171997


training:  69%|██████▉   | 7634/10986 [2:58:11<1:13:10,  1.31s/it]

training loss: 3.4334053993225098


training:  69%|██████▉   | 7635/10986 [2:58:12<1:12:04,  1.29s/it]

training loss: 3.377002716064453


training:  70%|██████▉   | 7636/10986 [2:58:13<1:11:10,  1.27s/it]

training loss: 3.517845630645752


training:  70%|██████▉   | 7637/10986 [2:58:14<1:10:21,  1.26s/it]

training loss: 3.594860792160034


training:  70%|██████▉   | 7638/10986 [2:58:15<1:09:39,  1.25s/it]

training loss: 3.658144474029541


training:  70%|██████▉   | 7639/10986 [2:58:17<1:09:06,  1.24s/it]

training loss: 3.45864200592041


training:  70%|██████▉   | 7640/10986 [2:58:18<1:09:17,  1.24s/it]

training loss: 3.4140055179595947
valid loss: 3.4080810546875
perplexity: 30.207223892211914


training:  70%|██████▉   | 7641/10986 [2:58:21<1:33:34,  1.68s/it]

training loss: 3.57889461517334


training:  70%|██████▉   | 7642/10986 [2:58:22<1:31:57,  1.65s/it]

training loss: 3.4748218059539795


training:  70%|██████▉   | 7643/10986 [2:58:23<1:25:09,  1.53s/it]

training loss: 3.4189047813415527


training:  70%|██████▉   | 7644/10986 [2:58:25<1:20:41,  1.45s/it]

training loss: 3.4478535652160645


training:  70%|██████▉   | 7645/10986 [2:58:26<1:17:16,  1.39s/it]

training loss: 3.5924956798553467


training:  70%|██████▉   | 7646/10986 [2:58:27<1:14:52,  1.35s/it]

training loss: 3.460204601287842


training:  70%|██████▉   | 7647/10986 [2:58:28<1:13:46,  1.33s/it]

training loss: 3.4746358394622803


training:  70%|██████▉   | 7648/10986 [2:58:30<1:12:34,  1.30s/it]

training loss: 3.4798245429992676


training:  70%|██████▉   | 7649/10986 [2:58:31<1:11:59,  1.29s/it]

training loss: 3.446390390396118


training:  70%|██████▉   | 7650/10986 [2:58:32<1:11:34,  1.29s/it]

training loss: 3.4315481185913086


training:  70%|██████▉   | 7651/10986 [2:58:34<1:15:50,  1.36s/it]

training loss: 3.3948886394500732


training:  70%|██████▉   | 7652/10986 [2:58:35<1:14:31,  1.34s/it]

training loss: 3.5649240016937256


training:  70%|██████▉   | 7653/10986 [2:58:36<1:13:14,  1.32s/it]

training loss: 3.4360294342041016


training:  70%|██████▉   | 7654/10986 [2:58:38<1:12:09,  1.30s/it]

training loss: 3.358210802078247


training:  70%|██████▉   | 7655/10986 [2:58:39<1:11:47,  1.29s/it]

training loss: 3.5705041885375977


training:  70%|██████▉   | 7656/10986 [2:58:40<1:10:49,  1.28s/it]

training loss: 3.377047300338745


training:  70%|██████▉   | 7657/10986 [2:58:41<1:10:38,  1.27s/it]

training loss: 3.491452932357788


training:  70%|██████▉   | 7658/10986 [2:58:43<1:10:23,  1.27s/it]

training loss: 3.4386425018310547


training:  70%|██████▉   | 7659/10986 [2:58:44<1:10:16,  1.27s/it]

training loss: 3.4442732334136963


training:  70%|██████▉   | 7660/10986 [2:58:45<1:10:17,  1.27s/it]

training loss: 3.6139185428619385
valid loss: 3.608833074569702
perplexity: 36.922943115234375


training:  70%|██████▉   | 7661/10986 [2:58:48<1:33:53,  1.69s/it]

training loss: 3.4824295043945312


training:  70%|██████▉   | 7662/10986 [2:58:49<1:28:10,  1.59s/it]

training loss: 3.5337555408477783


training:  70%|██████▉   | 7663/10986 [2:58:51<1:22:28,  1.49s/it]

training loss: 3.5008676052093506


training:  70%|██████▉   | 7664/10986 [2:58:52<1:18:24,  1.42s/it]

training loss: 3.380155563354492


training:  70%|██████▉   | 7665/10986 [2:58:53<1:15:42,  1.37s/it]

training loss: 3.3827333450317383


training:  70%|██████▉   | 7666/10986 [2:58:54<1:14:28,  1.35s/it]

training loss: 3.41336727142334


training:  70%|██████▉   | 7667/10986 [2:58:56<1:13:14,  1.32s/it]

training loss: 3.3745498657226562


training:  70%|██████▉   | 7668/10986 [2:58:57<1:12:04,  1.30s/it]

training loss: 3.4136950969696045


training:  70%|██████▉   | 7669/10986 [2:58:58<1:12:13,  1.31s/it]

training loss: 3.3670082092285156


training:  70%|██████▉   | 7670/10986 [2:58:59<1:11:58,  1.30s/it]

training loss: 3.427464008331299


training:  70%|██████▉   | 7671/10986 [2:59:01<1:15:54,  1.37s/it]

training loss: 3.4904251098632812


training:  70%|██████▉   | 7672/10986 [2:59:02<1:14:46,  1.35s/it]

training loss: 3.6120526790618896


training:  70%|██████▉   | 7673/10986 [2:59:04<1:13:05,  1.32s/it]

training loss: 3.358560562133789


training:  70%|██████▉   | 7674/10986 [2:59:05<1:12:17,  1.31s/it]

training loss: 3.4581234455108643


training:  70%|██████▉   | 7675/10986 [2:59:06<1:11:56,  1.30s/it]

training loss: 3.4807822704315186


training:  70%|██████▉   | 7676/10986 [2:59:07<1:11:22,  1.29s/it]

training loss: 3.3878324031829834


training:  70%|██████▉   | 7677/10986 [2:59:09<1:10:57,  1.29s/it]

training loss: 3.4904913902282715


training:  70%|██████▉   | 7678/10986 [2:59:10<1:10:47,  1.28s/it]

training loss: 3.436021327972412


training:  70%|██████▉   | 7679/10986 [2:59:11<1:10:34,  1.28s/it]

training loss: 3.4709417819976807


training:  70%|██████▉   | 7680/10986 [2:59:12<1:10:26,  1.28s/it]

training loss: 3.4868810176849365
valid loss: 3.483872413635254
perplexity: 32.585662841796875


training:  70%|██████▉   | 7681/10986 [2:59:15<1:34:10,  1.71s/it]

training loss: 3.3446083068847656


training:  70%|██████▉   | 7682/10986 [2:59:17<1:28:04,  1.60s/it]

training loss: 3.530877113342285


training:  70%|██████▉   | 7683/10986 [2:59:18<1:22:43,  1.50s/it]

training loss: 3.3326897621154785


training:  70%|██████▉   | 7684/10986 [2:59:19<1:19:03,  1.44s/it]

training loss: 3.3825485706329346


training:  70%|██████▉   | 7685/10986 [2:59:20<1:16:12,  1.39s/it]

training loss: 3.365548610687256


training:  70%|██████▉   | 7686/10986 [2:59:22<1:14:25,  1.35s/it]

training loss: 3.5038793087005615


training:  70%|██████▉   | 7687/10986 [2:59:23<1:12:57,  1.33s/it]

training loss: 3.479538679122925


training:  70%|██████▉   | 7688/10986 [2:59:24<1:11:57,  1.31s/it]

training loss: 3.396790027618408


training:  70%|██████▉   | 7689/10986 [2:59:25<1:11:29,  1.30s/it]

training loss: 3.4486045837402344


training:  70%|██████▉   | 7690/10986 [2:59:27<1:11:15,  1.30s/it]

training loss: 3.316775321960449


training:  70%|███████   | 7691/10986 [2:59:28<1:15:48,  1.38s/it]

training loss: 3.4683382511138916


training:  70%|███████   | 7692/10986 [2:59:30<1:18:52,  1.44s/it]

training loss: 3.3081042766571045


training:  70%|███████   | 7693/10986 [2:59:31<1:16:43,  1.40s/it]

training loss: 3.467341899871826


training:  70%|███████   | 7694/10986 [2:59:32<1:14:52,  1.36s/it]

training loss: 3.466526508331299


training:  70%|███████   | 7695/10986 [2:59:34<1:13:06,  1.33s/it]

training loss: 3.474576950073242


training:  70%|███████   | 7696/10986 [2:59:35<1:12:08,  1.32s/it]

training loss: 3.444437265396118


training:  70%|███████   | 7697/10986 [2:59:36<1:11:08,  1.30s/it]

training loss: 3.3040006160736084


training:  70%|███████   | 7698/10986 [2:59:38<1:10:58,  1.30s/it]

training loss: 3.392483711242676


training:  70%|███████   | 7699/10986 [2:59:39<1:11:00,  1.30s/it]

training loss: 3.3977551460266113


training:  70%|███████   | 7700/10986 [2:59:40<1:10:48,  1.29s/it]

training loss: 3.3476972579956055
valid loss: 3.352632522583008
perplexity: 28.57786750793457


training:  70%|███████   | 7701/10986 [2:59:43<1:34:56,  1.73s/it]

training loss: 3.42395281791687


training:  70%|███████   | 7702/10986 [2:59:44<1:28:45,  1.62s/it]

training loss: 3.6319103240966797


training:  70%|███████   | 7703/10986 [2:59:46<1:23:06,  1.52s/it]

training loss: 3.4581618309020996


training:  70%|███████   | 7704/10986 [2:59:47<1:18:37,  1.44s/it]

training loss: 3.435760021209717


training:  70%|███████   | 7705/10986 [2:59:48<1:16:01,  1.39s/it]

training loss: 3.332535982131958


training:  70%|███████   | 7706/10986 [2:59:49<1:14:00,  1.35s/it]

training loss: 3.339024782180786


training:  70%|███████   | 7707/10986 [2:59:51<1:12:33,  1.33s/it]

training loss: 3.5606486797332764


training:  70%|███████   | 7708/10986 [2:59:52<1:11:29,  1.31s/it]

training loss: 3.3826916217803955


training:  70%|███████   | 7709/10986 [2:59:53<1:10:36,  1.29s/it]

training loss: 3.503000259399414


training:  70%|███████   | 7710/10986 [2:59:54<1:10:48,  1.30s/it]

training loss: 3.592731237411499


training:  70%|███████   | 7711/10986 [2:59:56<1:14:52,  1.37s/it]

training loss: 3.431379556655884


training:  70%|███████   | 7712/10986 [2:59:57<1:13:55,  1.35s/it]

training loss: 3.409106731414795


training:  70%|███████   | 7713/10986 [2:59:59<1:13:00,  1.34s/it]

training loss: 3.507976770401001


training:  70%|███████   | 7714/10986 [3:00:00<1:11:30,  1.31s/it]

training loss: 3.5242319107055664


training:  70%|███████   | 7715/10986 [3:00:01<1:10:48,  1.30s/it]

training loss: 3.449582099914551


training:  70%|███████   | 7716/10986 [3:00:02<1:10:19,  1.29s/it]

training loss: 3.467958688735962


training:  70%|███████   | 7717/10986 [3:00:04<1:09:34,  1.28s/it]

training loss: 3.3779287338256836


training:  70%|███████   | 7718/10986 [3:00:05<1:09:34,  1.28s/it]

training loss: 3.306129217147827


training:  70%|███████   | 7719/10986 [3:00:06<1:09:54,  1.28s/it]

training loss: 3.334937572479248


training:  70%|███████   | 7720/10986 [3:00:07<1:09:49,  1.28s/it]

training loss: 3.5000109672546387
valid loss: 3.4949705600738525
perplexity: 32.949317932128906


training:  70%|███████   | 7721/10986 [3:00:10<1:33:00,  1.71s/it]

training loss: 3.4447226524353027


training:  70%|███████   | 7722/10986 [3:00:12<1:27:14,  1.60s/it]

training loss: 3.443018913269043


training:  70%|███████   | 7723/10986 [3:00:13<1:22:15,  1.51s/it]

training loss: 3.5519564151763916


training:  70%|███████   | 7724/10986 [3:00:14<1:18:04,  1.44s/it]

training loss: 3.3190107345581055


training:  70%|███████   | 7725/10986 [3:00:15<1:15:14,  1.38s/it]

training loss: 3.435786247253418


training:  70%|███████   | 7726/10986 [3:00:17<1:13:05,  1.35s/it]

training loss: 3.5146942138671875


training:  70%|███████   | 7727/10986 [3:00:18<1:11:44,  1.32s/it]

training loss: 3.3431589603424072


training:  70%|███████   | 7728/10986 [3:00:19<1:11:01,  1.31s/it]

training loss: 3.40954327583313


training:  70%|███████   | 7729/10986 [3:00:20<1:10:22,  1.30s/it]

training loss: 3.449206829071045


training:  70%|███████   | 7730/10986 [3:00:22<1:09:42,  1.28s/it]

training loss: 3.449805736541748


training:  70%|███████   | 7731/10986 [3:00:23<1:14:39,  1.38s/it]

training loss: 3.397373914718628


training:  70%|███████   | 7732/10986 [3:00:25<1:13:32,  1.36s/it]

training loss: 3.427117347717285


training:  70%|███████   | 7733/10986 [3:00:26<1:11:48,  1.32s/it]

training loss: 3.350552797317505


training:  70%|███████   | 7734/10986 [3:00:27<1:11:11,  1.31s/it]

training loss: 3.3243656158447266


training:  70%|███████   | 7735/10986 [3:00:28<1:10:37,  1.30s/it]

training loss: 3.4292945861816406


training:  70%|███████   | 7736/10986 [3:00:30<1:10:04,  1.29s/it]

training loss: 3.370797872543335


training:  70%|███████   | 7737/10986 [3:00:31<1:09:23,  1.28s/it]

training loss: 3.4422032833099365


training:  70%|███████   | 7738/10986 [3:00:32<1:09:19,  1.28s/it]

training loss: 3.3990769386291504


training:  70%|███████   | 7739/10986 [3:00:33<1:09:15,  1.28s/it]

training loss: 3.4012975692749023


training:  70%|███████   | 7740/10986 [3:00:35<1:09:20,  1.28s/it]

training loss: 3.4069135189056396
valid loss: 3.3952817916870117
perplexity: 29.823057174682617


training:  70%|███████   | 7741/10986 [3:00:37<1:32:46,  1.72s/it]

training loss: 3.375467538833618


training:  70%|███████   | 7742/10986 [3:00:39<1:30:31,  1.67s/it]

training loss: 3.483599901199341


training:  70%|███████   | 7743/10986 [3:00:40<1:23:56,  1.55s/it]

training loss: 3.4494881629943848


training:  70%|███████   | 7744/10986 [3:00:42<1:19:22,  1.47s/it]

training loss: 3.53670072555542


training:  70%|███████   | 7745/10986 [3:00:43<1:15:57,  1.41s/it]

training loss: 3.369655132293701


training:  71%|███████   | 7746/10986 [3:00:44<1:13:34,  1.36s/it]

training loss: 3.456907272338867


training:  71%|███████   | 7747/10986 [3:00:45<1:12:24,  1.34s/it]

training loss: 3.4929988384246826


training:  71%|███████   | 7748/10986 [3:00:47<1:11:32,  1.33s/it]

training loss: 3.460937023162842


training:  71%|███████   | 7749/10986 [3:00:48<1:10:29,  1.31s/it]

training loss: 3.3186216354370117


training:  71%|███████   | 7750/10986 [3:00:49<1:09:56,  1.30s/it]

training loss: 3.429694652557373


training:  71%|███████   | 7751/10986 [3:00:51<1:14:01,  1.37s/it]

training loss: 3.526418924331665


training:  71%|███████   | 7752/10986 [3:00:52<1:17:11,  1.43s/it]

training loss: 3.3255021572113037


training:  71%|███████   | 7753/10986 [3:00:54<1:15:23,  1.40s/it]

training loss: 3.4652175903320312


training:  71%|███████   | 7754/10986 [3:00:55<1:13:31,  1.37s/it]

training loss: 3.3172547817230225


training:  71%|███████   | 7755/10986 [3:00:56<1:12:15,  1.34s/it]

training loss: 3.397294759750366


training:  71%|███████   | 7756/10986 [3:00:58<1:11:06,  1.32s/it]

training loss: 3.42324161529541


training:  71%|███████   | 7757/10986 [3:00:59<1:10:24,  1.31s/it]

training loss: 3.4780328273773193


training:  71%|███████   | 7758/10986 [3:01:00<1:10:33,  1.31s/it]

training loss: 3.28722882270813


training:  71%|███████   | 7759/10986 [3:01:01<1:10:45,  1.32s/it]

training loss: 3.4476981163024902


training:  71%|███████   | 7760/10986 [3:01:03<1:10:28,  1.31s/it]

training loss: 3.3806262016296387
valid loss: 3.382931709289551
perplexity: 29.45700454711914


training:  71%|███████   | 7761/10986 [3:01:05<1:33:21,  1.74s/it]

training loss: 3.6537928581237793


training:  71%|███████   | 7762/10986 [3:01:07<1:27:10,  1.62s/it]

training loss: 3.3115055561065674


training:  71%|███████   | 7763/10986 [3:01:08<1:22:15,  1.53s/it]

training loss: 3.4118783473968506


training:  71%|███████   | 7764/10986 [3:01:09<1:18:23,  1.46s/it]

training loss: 3.4391963481903076


training:  71%|███████   | 7765/10986 [3:01:11<1:15:43,  1.41s/it]

training loss: 3.4822750091552734


training:  71%|███████   | 7766/10986 [3:01:12<1:13:19,  1.37s/it]

training loss: 3.5211431980133057


training:  71%|███████   | 7767/10986 [3:01:13<1:12:12,  1.35s/it]

training loss: 3.439333438873291


training:  71%|███████   | 7768/10986 [3:01:15<1:11:33,  1.33s/it]

training loss: 3.5178778171539307


training:  71%|███████   | 7769/10986 [3:01:16<1:10:42,  1.32s/it]

training loss: 3.3918163776397705


training:  71%|███████   | 7770/10986 [3:01:17<1:10:25,  1.31s/it]

training loss: 3.3477511405944824


training:  71%|███████   | 7771/10986 [3:01:19<1:14:20,  1.39s/it]

training loss: 3.5058889389038086


training:  71%|███████   | 7772/10986 [3:01:20<1:13:00,  1.36s/it]

training loss: 3.3766138553619385


training:  71%|███████   | 7773/10986 [3:01:21<1:12:01,  1.35s/it]

training loss: 3.3996200561523438


training:  71%|███████   | 7774/10986 [3:01:23<1:11:15,  1.33s/it]

training loss: 3.4068045616149902


training:  71%|███████   | 7775/10986 [3:01:24<1:10:32,  1.32s/it]

training loss: 3.3232388496398926


training:  71%|███████   | 7776/10986 [3:01:25<1:09:47,  1.30s/it]

training loss: 3.4921250343322754


training:  71%|███████   | 7777/10986 [3:01:27<1:09:26,  1.30s/it]

training loss: 3.3323564529418945


training:  71%|███████   | 7778/10986 [3:01:28<1:09:35,  1.30s/it]

training loss: 3.347459077835083


training:  71%|███████   | 7779/10986 [3:01:29<1:09:23,  1.30s/it]

training loss: 3.3543179035186768


training:  71%|███████   | 7780/10986 [3:01:30<1:09:43,  1.30s/it]

training loss: 3.507232189178467
valid loss: 3.51059627532959
perplexity: 33.46821975708008


training:  71%|███████   | 7781/10986 [3:01:33<1:32:12,  1.73s/it]

training loss: 3.4176254272460938


training:  71%|███████   | 7782/10986 [3:01:35<1:26:47,  1.63s/it]

training loss: 3.4245193004608154


training:  71%|███████   | 7783/10986 [3:01:36<1:21:28,  1.53s/it]

training loss: 3.5099480152130127


training:  71%|███████   | 7784/10986 [3:01:37<1:17:32,  1.45s/it]

training loss: 3.4371330738067627


training:  71%|███████   | 7785/10986 [3:01:38<1:15:36,  1.42s/it]

training loss: 3.4674341678619385


training:  71%|███████   | 7786/10986 [3:01:40<1:13:21,  1.38s/it]

training loss: 3.4374237060546875


training:  71%|███████   | 7787/10986 [3:01:41<1:12:05,  1.35s/it]

training loss: 3.462878704071045


training:  71%|███████   | 7788/10986 [3:01:42<1:11:24,  1.34s/it]

training loss: 3.338646173477173


training:  71%|███████   | 7789/10986 [3:01:44<1:16:36,  1.44s/it]

training loss: 3.4773623943328857


training:  71%|███████   | 7790/10986 [3:01:46<1:20:59,  1.52s/it]

training loss: 3.380261182785034


training:  71%|███████   | 7791/10986 [3:01:48<1:25:09,  1.60s/it]

training loss: 3.354987621307373


training:  71%|███████   | 7792/10986 [3:01:49<1:20:09,  1.51s/it]

training loss: 3.3501996994018555


training:  71%|███████   | 7793/10986 [3:01:50<1:16:23,  1.44s/it]

training loss: 3.394810676574707


training:  71%|███████   | 7794/10986 [3:01:51<1:14:25,  1.40s/it]

training loss: 3.441499948501587


training:  71%|███████   | 7795/10986 [3:01:53<1:12:36,  1.37s/it]

training loss: 3.4696102142333984


training:  71%|███████   | 7796/10986 [3:01:54<1:10:57,  1.33s/it]

training loss: 3.340014696121216


training:  71%|███████   | 7797/10986 [3:01:55<1:10:53,  1.33s/it]

training loss: 3.5057783126831055


training:  71%|███████   | 7798/10986 [3:01:57<1:10:02,  1.32s/it]

training loss: 3.3340437412261963


training:  71%|███████   | 7799/10986 [3:01:58<1:09:51,  1.32s/it]

training loss: 3.385481834411621


training:  71%|███████   | 7800/10986 [3:01:59<1:09:34,  1.31s/it]

training loss: 3.4640018939971924
valid loss: 3.461832284927368
perplexity: 31.87532615661621


training:  71%|███████   | 7801/10986 [3:02:02<1:33:44,  1.77s/it]

training loss: 3.4162755012512207


training:  71%|███████   | 7802/10986 [3:02:03<1:27:45,  1.65s/it]

training loss: 3.409348726272583


training:  71%|███████   | 7803/10986 [3:02:05<1:21:50,  1.54s/it]

training loss: 3.3684465885162354


training:  71%|███████   | 7804/10986 [3:02:06<1:17:24,  1.46s/it]

training loss: 3.3890628814697266


training:  71%|███████   | 7805/10986 [3:02:07<1:14:43,  1.41s/it]

training loss: 3.360947847366333


training:  71%|███████   | 7806/10986 [3:02:09<1:12:48,  1.37s/it]

training loss: 3.5148801803588867


training:  71%|███████   | 7807/10986 [3:02:10<1:11:09,  1.34s/it]

training loss: 3.4334611892700195


training:  71%|███████   | 7808/10986 [3:02:11<1:10:03,  1.32s/it]

training loss: 3.500105381011963


training:  71%|███████   | 7809/10986 [3:02:12<1:09:05,  1.30s/it]

training loss: 3.3746204376220703


training:  71%|███████   | 7810/10986 [3:02:14<1:08:59,  1.30s/it]

training loss: 3.4471583366394043


training:  71%|███████   | 7811/10986 [3:02:15<1:13:17,  1.39s/it]

training loss: 3.3752825260162354


training:  71%|███████   | 7812/10986 [3:02:16<1:12:03,  1.36s/it]

training loss: 3.472411870956421


training:  71%|███████   | 7813/10986 [3:02:18<1:11:11,  1.35s/it]

training loss: 3.4582316875457764


training:  71%|███████   | 7814/10986 [3:02:19<1:10:02,  1.32s/it]

training loss: 3.387157678604126


training:  71%|███████   | 7815/10986 [3:02:20<1:09:31,  1.32s/it]

training loss: 3.497706651687622


training:  71%|███████   | 7816/10986 [3:02:22<1:09:09,  1.31s/it]

training loss: 3.5235402584075928


training:  71%|███████   | 7817/10986 [3:02:23<1:09:07,  1.31s/it]

training loss: 3.487091541290283


training:  71%|███████   | 7818/10986 [3:02:24<1:09:05,  1.31s/it]

training loss: 3.3833982944488525


training:  71%|███████   | 7819/10986 [3:02:26<1:08:53,  1.31s/it]

training loss: 3.3481860160827637


training:  71%|███████   | 7820/10986 [3:02:27<1:08:37,  1.30s/it]

training loss: 3.3507370948791504
valid loss: 3.354659080505371
perplexity: 28.635839462280273


training:  71%|███████   | 7821/10986 [3:02:30<1:32:22,  1.75s/it]

training loss: 3.314035654067993


training:  71%|███████   | 7822/10986 [3:02:31<1:27:43,  1.66s/it]

training loss: 3.527740240097046


training:  71%|███████   | 7823/10986 [3:02:32<1:22:02,  1.56s/it]

training loss: 3.38988995552063


training:  71%|███████   | 7824/10986 [3:02:34<1:17:48,  1.48s/it]

training loss: 3.3808724880218506


training:  71%|███████   | 7825/10986 [3:02:35<1:14:41,  1.42s/it]

training loss: 3.370375871658325


training:  71%|███████   | 7826/10986 [3:02:36<1:13:11,  1.39s/it]

training loss: 3.3294434547424316


training:  71%|███████   | 7827/10986 [3:02:38<1:11:34,  1.36s/it]

training loss: 3.363825559616089


training:  71%|███████▏  | 7828/10986 [3:02:39<1:10:24,  1.34s/it]

training loss: 3.3990283012390137


training:  71%|███████▏  | 7829/10986 [3:02:40<1:09:43,  1.33s/it]

training loss: 3.4363033771514893


training:  71%|███████▏  | 7830/10986 [3:02:42<1:09:31,  1.32s/it]

training loss: 3.3019707202911377


training:  71%|███████▏  | 7831/10986 [3:02:43<1:13:49,  1.40s/it]

training loss: 3.366403579711914


training:  71%|███████▏  | 7832/10986 [3:02:45<1:15:54,  1.44s/it]

training loss: 3.2742867469787598


training:  71%|███████▏  | 7833/10986 [3:02:46<1:13:31,  1.40s/it]

training loss: 3.547987699508667


training:  71%|███████▏  | 7834/10986 [3:02:47<1:11:59,  1.37s/it]

training loss: 3.485264778137207


training:  71%|███████▏  | 7835/10986 [3:02:49<1:10:43,  1.35s/it]

training loss: 3.361917734146118


training:  71%|███████▏  | 7836/10986 [3:02:50<1:09:36,  1.33s/it]

training loss: 3.4752681255340576


training:  71%|███████▏  | 7837/10986 [3:02:51<1:09:09,  1.32s/it]

training loss: 3.3067843914031982


training:  71%|███████▏  | 7838/10986 [3:02:52<1:08:29,  1.31s/it]

training loss: 3.428466320037842


training:  71%|███████▏  | 7839/10986 [3:02:54<1:08:01,  1.30s/it]

training loss: 3.4140312671661377


training:  71%|███████▏  | 7840/10986 [3:02:55<1:07:46,  1.29s/it]

training loss: 3.443835735321045
valid loss: 3.4456355571746826
perplexity: 31.363208770751953


training:  71%|███████▏  | 7841/10986 [3:02:58<1:30:50,  1.73s/it]

training loss: 3.3686141967773438


training:  71%|███████▏  | 7842/10986 [3:02:59<1:25:04,  1.62s/it]

training loss: 3.4234557151794434


training:  71%|███████▏  | 7843/10986 [3:03:00<1:19:40,  1.52s/it]

training loss: 3.4672625064849854


training:  71%|███████▏  | 7844/10986 [3:03:02<1:16:34,  1.46s/it]

training loss: 3.4555020332336426


training:  71%|███████▏  | 7845/10986 [3:03:03<1:13:37,  1.41s/it]

training loss: 3.360377550125122


training:  71%|███████▏  | 7846/10986 [3:03:04<1:11:24,  1.36s/it]

training loss: 3.3885841369628906


training:  71%|███████▏  | 7847/10986 [3:03:06<1:10:08,  1.34s/it]

training loss: 3.399582624435425


training:  71%|███████▏  | 7848/10986 [3:03:07<1:09:31,  1.33s/it]

training loss: 3.398526906967163


training:  71%|███████▏  | 7849/10986 [3:03:08<1:09:08,  1.32s/it]

training loss: 3.4110107421875


training:  71%|███████▏  | 7850/10986 [3:03:09<1:08:54,  1.32s/it]

training loss: 3.4147727489471436


training:  71%|███████▏  | 7851/10986 [3:03:11<1:13:12,  1.40s/it]

training loss: 3.4527928829193115


training:  71%|███████▏  | 7852/10986 [3:03:12<1:12:32,  1.39s/it]

training loss: 3.379274368286133


training:  71%|███████▏  | 7853/10986 [3:03:14<1:11:16,  1.36s/it]

training loss: 3.42343807220459


training:  71%|███████▏  | 7854/10986 [3:03:15<1:10:12,  1.35s/it]

training loss: 3.3572301864624023


training:  72%|███████▏  | 7855/10986 [3:03:16<1:09:37,  1.33s/it]

training loss: 3.329862117767334


training:  72%|███████▏  | 7856/10986 [3:03:18<1:09:06,  1.32s/it]

training loss: 3.324769973754883


training:  72%|███████▏  | 7857/10986 [3:03:19<1:08:45,  1.32s/it]

training loss: 3.416788339614868


training:  72%|███████▏  | 7858/10986 [3:03:20<1:08:13,  1.31s/it]

training loss: 3.3111767768859863


training:  72%|███████▏  | 7859/10986 [3:03:21<1:07:59,  1.30s/it]

training loss: 3.450622081756592


training:  72%|███████▏  | 7860/10986 [3:03:23<1:08:01,  1.31s/it]

training loss: 3.43074369430542
valid loss: 3.4305014610290527
perplexity: 30.89212989807129


training:  72%|███████▏  | 7861/10986 [3:03:26<1:31:46,  1.76s/it]

training loss: 3.5179083347320557


training:  72%|███████▏  | 7862/10986 [3:03:27<1:26:42,  1.67s/it]

training loss: 3.3331756591796875


training:  72%|███████▏  | 7863/10986 [3:03:28<1:20:59,  1.56s/it]

training loss: 3.427915573120117


training:  72%|███████▏  | 7864/10986 [3:03:30<1:16:45,  1.48s/it]

training loss: 3.432682752609253


training:  72%|███████▏  | 7865/10986 [3:03:31<1:13:45,  1.42s/it]

training loss: 3.3482344150543213


training:  72%|███████▏  | 7866/10986 [3:03:32<1:12:14,  1.39s/it]

training loss: 3.380315065383911


training:  72%|███████▏  | 7867/10986 [3:03:34<1:10:28,  1.36s/it]

training loss: 3.3496129512786865


training:  72%|███████▏  | 7868/10986 [3:03:35<1:09:38,  1.34s/it]

training loss: 3.389115810394287


training:  72%|███████▏  | 7869/10986 [3:03:36<1:08:48,  1.32s/it]

training loss: 3.3333206176757812


training:  72%|███████▏  | 7870/10986 [3:03:37<1:08:28,  1.32s/it]

training loss: 3.3888092041015625


training:  72%|███████▏  | 7871/10986 [3:03:39<1:12:21,  1.39s/it]

training loss: 3.324347496032715


training:  72%|███████▏  | 7872/10986 [3:03:40<1:11:21,  1.37s/it]

training loss: 3.3961527347564697


training:  72%|███████▏  | 7873/10986 [3:03:42<1:10:01,  1.35s/it]

training loss: 3.4234209060668945


training:  72%|███████▏  | 7874/10986 [3:03:43<1:09:25,  1.34s/it]

training loss: 3.488737106323242


training:  72%|███████▏  | 7875/10986 [3:03:44<1:08:48,  1.33s/it]

training loss: 3.34873104095459


training:  72%|███████▏  | 7876/10986 [3:03:46<1:08:22,  1.32s/it]

training loss: 3.4721732139587402


training:  72%|███████▏  | 7877/10986 [3:03:47<1:07:45,  1.31s/it]

training loss: 3.4474411010742188


training:  72%|███████▏  | 7878/10986 [3:03:48<1:07:20,  1.30s/it]

training loss: 3.3670527935028076


training:  72%|███████▏  | 7879/10986 [3:03:49<1:07:12,  1.30s/it]

training loss: 3.5080854892730713


training:  72%|███████▏  | 7880/10986 [3:03:51<1:06:56,  1.29s/it]

training loss: 3.4455339908599854
valid loss: 3.442944049835205
perplexity: 31.27890968322754


training:  72%|███████▏  | 7881/10986 [3:03:53<1:30:25,  1.75s/it]

training loss: 3.5271146297454834


training:  72%|███████▏  | 7882/10986 [3:03:55<1:24:29,  1.63s/it]

training loss: 3.534895181655884


training:  72%|███████▏  | 7883/10986 [3:03:56<1:19:08,  1.53s/it]

training loss: 3.310387372970581


training:  72%|███████▏  | 7884/10986 [3:03:57<1:15:21,  1.46s/it]

training loss: 3.380340337753296


training:  72%|███████▏  | 7885/10986 [3:03:59<1:12:45,  1.41s/it]

training loss: 3.5153632164001465


training:  72%|███████▏  | 7886/10986 [3:04:00<1:10:57,  1.37s/it]

training loss: 3.3471896648406982


training:  72%|███████▏  | 7887/10986 [3:04:01<1:10:31,  1.37s/it]

training loss: 3.4662652015686035


training:  72%|███████▏  | 7888/10986 [3:04:03<1:09:38,  1.35s/it]

training loss: 3.343574285507202


training:  72%|███████▏  | 7889/10986 [3:04:04<1:08:55,  1.34s/it]

training loss: 3.4101483821868896


training:  72%|███████▏  | 7890/10986 [3:04:05<1:08:36,  1.33s/it]

training loss: 3.444641590118408


training:  72%|███████▏  | 7891/10986 [3:04:07<1:12:32,  1.41s/it]

training loss: 3.3277344703674316


training:  72%|███████▏  | 7892/10986 [3:04:08<1:11:31,  1.39s/it]

training loss: 3.363257884979248


training:  72%|███████▏  | 7893/10986 [3:04:10<1:10:30,  1.37s/it]

training loss: 3.5320355892181396


training:  72%|███████▏  | 7894/10986 [3:04:11<1:09:02,  1.34s/it]

training loss: 3.3237698078155518


training:  72%|███████▏  | 7895/10986 [3:04:12<1:08:48,  1.34s/it]

training loss: 3.419221878051758


training:  72%|███████▏  | 7896/10986 [3:04:13<1:08:30,  1.33s/it]

training loss: 3.4234702587127686


training:  72%|███████▏  | 7897/10986 [3:04:15<1:07:57,  1.32s/it]

training loss: 3.3154234886169434


training:  72%|███████▏  | 7898/10986 [3:04:16<1:07:32,  1.31s/it]

training loss: 3.457188367843628


training:  72%|███████▏  | 7899/10986 [3:04:17<1:07:10,  1.31s/it]

training loss: 3.3761038780212402


training:  72%|███████▏  | 7900/10986 [3:04:19<1:07:14,  1.31s/it]

training loss: 3.4371161460876465
valid loss: 3.4350619316101074
perplexity: 31.033334732055664


training:  72%|███████▏  | 7901/10986 [3:04:21<1:30:20,  1.76s/it]

training loss: 3.4088916778564453


training:  72%|███████▏  | 7902/10986 [3:04:23<1:29:23,  1.74s/it]

training loss: 3.451866388320923


training:  72%|███████▏  | 7903/10986 [3:04:24<1:23:02,  1.62s/it]

training loss: 3.443269729614258


training:  72%|███████▏  | 7904/10986 [3:04:26<1:18:26,  1.53s/it]

training loss: 3.424468994140625


training:  72%|███████▏  | 7905/10986 [3:04:27<1:15:05,  1.46s/it]

training loss: 3.364698648452759


training:  72%|███████▏  | 7906/10986 [3:04:28<1:12:34,  1.41s/it]

training loss: 3.4346442222595215


training:  72%|███████▏  | 7907/10986 [3:04:30<1:10:37,  1.38s/it]

training loss: 3.3831186294555664


training:  72%|███████▏  | 7908/10986 [3:04:31<1:09:36,  1.36s/it]

training loss: 3.4131550788879395


training:  72%|███████▏  | 7909/10986 [3:04:32<1:09:01,  1.35s/it]

training loss: 3.408432960510254


training:  72%|███████▏  | 7910/10986 [3:04:34<1:08:08,  1.33s/it]

training loss: 3.4745736122131348


training:  72%|███████▏  | 7911/10986 [3:04:35<1:12:20,  1.41s/it]

training loss: 3.280811071395874


training:  72%|███████▏  | 7912/10986 [3:04:37<1:11:37,  1.40s/it]

training loss: 3.375838041305542


training:  72%|███████▏  | 7913/10986 [3:04:38<1:10:21,  1.37s/it]

training loss: 3.297804594039917


training:  72%|███████▏  | 7914/10986 [3:04:39<1:09:40,  1.36s/it]

training loss: 3.415447950363159


training:  72%|███████▏  | 7915/10986 [3:04:41<1:09:01,  1.35s/it]

training loss: 3.4137649536132812


training:  72%|███████▏  | 7916/10986 [3:04:42<1:08:07,  1.33s/it]

training loss: 3.437774658203125


training:  72%|███████▏  | 7917/10986 [3:04:43<1:07:25,  1.32s/it]

training loss: 3.420710802078247


training:  72%|███████▏  | 7918/10986 [3:04:44<1:07:06,  1.31s/it]

training loss: 3.461236000061035


training:  72%|███████▏  | 7919/10986 [3:04:46<1:06:57,  1.31s/it]

training loss: 3.473966121673584


training:  72%|███████▏  | 7920/10986 [3:04:47<1:06:24,  1.30s/it]

training loss: 3.329676389694214
valid loss: 3.326260805130005
perplexity: 27.834070205688477


training:  72%|███████▏  | 7921/10986 [3:04:50<1:28:49,  1.74s/it]

training loss: 3.347039222717285


training:  72%|███████▏  | 7922/10986 [3:04:51<1:23:25,  1.63s/it]

training loss: 3.4448375701904297


training:  72%|███████▏  | 7923/10986 [3:04:52<1:18:07,  1.53s/it]

training loss: 3.3998425006866455


training:  72%|███████▏  | 7924/10986 [3:04:54<1:14:12,  1.45s/it]

training loss: 3.6187100410461426


training:  72%|███████▏  | 7925/10986 [3:04:55<1:11:55,  1.41s/it]

training loss: 3.373047113418579


training:  72%|███████▏  | 7926/10986 [3:04:56<1:09:49,  1.37s/it]

training loss: 3.350412130355835


training:  72%|███████▏  | 7927/10986 [3:04:58<1:08:39,  1.35s/it]

training loss: 3.4021339416503906


training:  72%|███████▏  | 7928/10986 [3:04:59<1:07:51,  1.33s/it]

training loss: 3.3576760292053223


training:  72%|███████▏  | 7929/10986 [3:05:00<1:06:58,  1.31s/it]

training loss: 3.373143434524536


training:  72%|███████▏  | 7930/10986 [3:05:01<1:06:29,  1.31s/it]

training loss: 3.44333553314209


training:  72%|███████▏  | 7931/10986 [3:05:03<1:11:11,  1.40s/it]

training loss: 3.4058897495269775


training:  72%|███████▏  | 7932/10986 [3:05:05<1:13:39,  1.45s/it]

training loss: 3.3749496936798096


training:  72%|███████▏  | 7933/10986 [3:05:06<1:11:17,  1.40s/it]

training loss: 3.402217388153076


training:  72%|███████▏  | 7934/10986 [3:05:07<1:09:35,  1.37s/it]

training loss: 3.4147419929504395


training:  72%|███████▏  | 7935/10986 [3:05:09<1:08:42,  1.35s/it]

training loss: 3.405524492263794


training:  72%|███████▏  | 7936/10986 [3:05:10<1:07:54,  1.34s/it]

training loss: 3.3975412845611572


training:  72%|███████▏  | 7937/10986 [3:05:11<1:07:20,  1.33s/it]

training loss: 3.4403505325317383


training:  72%|███████▏  | 7938/10986 [3:05:12<1:06:59,  1.32s/it]

training loss: 3.447751760482788


training:  72%|███████▏  | 7939/10986 [3:05:14<1:06:30,  1.31s/it]

training loss: 3.3320095539093018


training:  72%|███████▏  | 7940/10986 [3:05:15<1:06:26,  1.31s/it]

training loss: 3.4675278663635254
valid loss: 3.466298818588257
perplexity: 32.01801681518555


training:  72%|███████▏  | 7941/10986 [3:05:18<1:28:29,  1.74s/it]

training loss: 3.44106388092041


training:  72%|███████▏  | 7942/10986 [3:05:19<1:22:44,  1.63s/it]

training loss: 3.54032564163208


training:  72%|███████▏  | 7943/10986 [3:05:20<1:17:38,  1.53s/it]

training loss: 3.3266942501068115


training:  72%|███████▏  | 7944/10986 [3:05:22<1:14:13,  1.46s/it]

training loss: 3.412571907043457


training:  72%|███████▏  | 7945/10986 [3:05:23<1:11:50,  1.42s/it]

training loss: 3.488156318664551


training:  72%|███████▏  | 7946/10986 [3:05:24<1:10:37,  1.39s/it]

training loss: 3.3796513080596924


training:  72%|███████▏  | 7947/10986 [3:05:26<1:09:11,  1.37s/it]

training loss: 3.4012887477874756


training:  72%|███████▏  | 7948/10986 [3:05:27<1:08:02,  1.34s/it]

training loss: 3.362926721572876


training:  72%|███████▏  | 7949/10986 [3:05:28<1:09:51,  1.38s/it]

training loss: 3.4106359481811523


training:  72%|███████▏  | 7950/10986 [3:05:30<1:15:12,  1.49s/it]

training loss: 3.3453242778778076


training:  72%|███████▏  | 7951/10986 [3:05:32<1:23:03,  1.64s/it]

training loss: 3.3561761379241943


training:  72%|███████▏  | 7952/10986 [3:05:34<1:24:27,  1.67s/it]

training loss: 3.393930673599243


training:  72%|███████▏  | 7953/10986 [3:05:35<1:19:16,  1.57s/it]

training loss: 3.457369327545166


training:  72%|███████▏  | 7954/10986 [3:05:37<1:15:23,  1.49s/it]

training loss: 3.3699002265930176


training:  72%|███████▏  | 7955/10986 [3:05:38<1:12:55,  1.44s/it]

training loss: 3.440664529800415


training:  72%|███████▏  | 7956/10986 [3:05:39<1:11:08,  1.41s/it]

training loss: 3.3064708709716797


training:  72%|███████▏  | 7957/10986 [3:05:41<1:09:39,  1.38s/it]

training loss: 3.3400278091430664


training:  72%|███████▏  | 7958/10986 [3:05:42<1:08:30,  1.36s/it]

training loss: 3.4260318279266357


training:  72%|███████▏  | 7959/10986 [3:05:43<1:07:51,  1.35s/it]

training loss: 3.468766689300537


training:  72%|███████▏  | 7960/10986 [3:05:45<1:07:24,  1.34s/it]

training loss: 3.414144277572632
valid loss: 3.4149186611175537
perplexity: 30.414474487304688


training:  72%|███████▏  | 7961/10986 [3:05:47<1:30:21,  1.79s/it]

training loss: 3.421271800994873


training:  72%|███████▏  | 7962/10986 [3:05:49<1:24:17,  1.67s/it]

training loss: 3.4456839561462402


training:  72%|███████▏  | 7963/10986 [3:05:50<1:19:27,  1.58s/it]

training loss: 3.3904836177825928


training:  72%|███████▏  | 7964/10986 [3:05:51<1:15:36,  1.50s/it]

training loss: 3.44207501411438


training:  73%|███████▎  | 7965/10986 [3:05:53<1:12:52,  1.45s/it]

training loss: 3.522690773010254


training:  73%|███████▎  | 7966/10986 [3:05:54<1:10:28,  1.40s/it]

training loss: 3.3195841312408447


training:  73%|███████▎  | 7967/10986 [3:05:55<1:09:47,  1.39s/it]

training loss: 3.444239616394043


training:  73%|███████▎  | 7968/10986 [3:05:57<1:08:40,  1.37s/it]

training loss: 3.4367809295654297


training:  73%|███████▎  | 7969/10986 [3:05:58<1:07:45,  1.35s/it]

training loss: 3.3898141384124756


training:  73%|███████▎  | 7970/10986 [3:05:59<1:07:26,  1.34s/it]

training loss: 3.4319145679473877


training:  73%|███████▎  | 7971/10986 [3:06:01<1:11:49,  1.43s/it]

training loss: 3.3685245513916016


training:  73%|███████▎  | 7972/10986 [3:06:02<1:10:28,  1.40s/it]

training loss: 3.324842691421509


training:  73%|███████▎  | 7973/10986 [3:06:04<1:09:37,  1.39s/it]

training loss: 3.472553253173828


training:  73%|███████▎  | 7974/10986 [3:06:05<1:08:26,  1.36s/it]

training loss: 3.413792371749878


training:  73%|███████▎  | 7975/10986 [3:06:06<1:07:04,  1.34s/it]

training loss: 3.445871353149414


training:  73%|███████▎  | 7976/10986 [3:06:08<1:06:28,  1.33s/it]

training loss: 3.3512370586395264


training:  73%|███████▎  | 7977/10986 [3:06:09<1:06:01,  1.32s/it]

training loss: 3.435774326324463


training:  73%|███████▎  | 7978/10986 [3:06:10<1:05:23,  1.30s/it]

training loss: 3.3136866092681885


training:  73%|███████▎  | 7979/10986 [3:06:11<1:05:34,  1.31s/it]

training loss: 3.4004099369049072


training:  73%|███████▎  | 7980/10986 [3:06:13<1:05:23,  1.31s/it]

training loss: 3.423015594482422
valid loss: 3.422346591949463
perplexity: 30.641231536865234


training:  73%|███████▎  | 7981/10986 [3:06:16<1:27:15,  1.74s/it]

training loss: 3.2579457759857178


training:  73%|███████▎  | 7982/10986 [3:06:17<1:25:59,  1.72s/it]

training loss: 3.4623947143554688


training:  73%|███████▎  | 7983/10986 [3:06:18<1:19:47,  1.59s/it]

training loss: 3.4301884174346924


training:  73%|███████▎  | 7984/10986 [3:06:20<1:15:13,  1.50s/it]

training loss: 3.3646531105041504


training:  73%|███████▎  | 7985/10986 [3:06:21<1:12:16,  1.44s/it]

training loss: 3.4214484691619873


training:  73%|███████▎  | 7986/10986 [3:06:22<1:09:57,  1.40s/it]

training loss: 3.3406546115875244


training:  73%|███████▎  | 7987/10986 [3:06:24<1:08:27,  1.37s/it]

training loss: 3.575944423675537


training:  73%|███████▎  | 7988/10986 [3:06:25<1:07:19,  1.35s/it]

training loss: 3.3758022785186768


training:  73%|███████▎  | 7989/10986 [3:06:26<1:06:56,  1.34s/it]

training loss: 3.4002411365509033


training:  73%|███████▎  | 7990/10986 [3:06:28<1:06:09,  1.33s/it]

training loss: 3.523068428039551


training:  73%|███████▎  | 7991/10986 [3:06:29<1:10:07,  1.40s/it]

training loss: 3.46587872505188


training:  73%|███████▎  | 7992/10986 [3:06:31<1:12:10,  1.45s/it]

training loss: 3.3947746753692627


training:  73%|███████▎  | 7993/10986 [3:06:32<1:10:22,  1.41s/it]

training loss: 3.39542293548584


training:  73%|███████▎  | 7994/10986 [3:06:33<1:08:34,  1.38s/it]

training loss: 3.3498799800872803


training:  73%|███████▎  | 7995/10986 [3:06:35<1:07:03,  1.35s/it]

training loss: 3.351591110229492


training:  73%|███████▎  | 7996/10986 [3:06:36<1:06:19,  1.33s/it]

training loss: 3.352602243423462


training:  73%|███████▎  | 7997/10986 [3:06:37<1:05:51,  1.32s/it]

training loss: 3.3052570819854736


training:  73%|███████▎  | 7998/10986 [3:06:38<1:05:22,  1.31s/it]

training loss: 3.3119890689849854


training:  73%|███████▎  | 7999/10986 [3:06:40<1:05:15,  1.31s/it]

training loss: 3.3795928955078125


training:  73%|███████▎  | 8000/10986 [3:06:41<1:05:14,  1.31s/it]

training loss: 3.392869472503662
valid loss: 3.392737865447998
perplexity: 29.747285842895508


training:  73%|███████▎  | 8001/10986 [3:06:44<1:27:54,  1.77s/it]

training loss: 3.466449499130249


training:  73%|███████▎  | 8002/10986 [3:06:46<1:25:47,  1.72s/it]

training loss: 3.3168938159942627


training:  73%|███████▎  | 8003/10986 [3:06:47<1:19:25,  1.60s/it]

training loss: 3.3663647174835205


training:  73%|███████▎  | 8004/10986 [3:06:48<1:14:57,  1.51s/it]

training loss: 3.4148499965667725


training:  73%|███████▎  | 8005/10986 [3:06:49<1:11:52,  1.45s/it]

training loss: 3.3364131450653076


training:  73%|███████▎  | 8006/10986 [3:06:51<1:09:25,  1.40s/it]

training loss: 3.342212438583374


training:  73%|███████▎  | 8007/10986 [3:06:52<1:07:55,  1.37s/it]

training loss: 3.386500597000122


training:  73%|███████▎  | 8008/10986 [3:06:53<1:06:43,  1.34s/it]

training loss: 3.4460747241973877


training:  73%|███████▎  | 8009/10986 [3:06:55<1:06:25,  1.34s/it]

training loss: 3.306946039199829


training:  73%|███████▎  | 8010/10986 [3:06:56<1:05:47,  1.33s/it]

training loss: 3.4847052097320557


training:  73%|███████▎  | 8011/10986 [3:06:58<1:09:09,  1.39s/it]

training loss: 3.4971847534179688


training:  73%|███████▎  | 8012/10986 [3:06:59<1:08:23,  1.38s/it]

training loss: 3.357215404510498


training:  73%|███████▎  | 8013/10986 [3:07:00<1:07:06,  1.35s/it]

training loss: 3.395038604736328


training:  73%|███████▎  | 8014/10986 [3:07:01<1:06:12,  1.34s/it]

training loss: 3.274468421936035


training:  73%|███████▎  | 8015/10986 [3:07:03<1:05:24,  1.32s/it]

training loss: 3.34928822517395


training:  73%|███████▎  | 8016/10986 [3:07:04<1:05:10,  1.32s/it]

training loss: 3.4061357975006104


training:  73%|███████▎  | 8017/10986 [3:07:05<1:04:50,  1.31s/it]

training loss: 3.438282012939453


training:  73%|███████▎  | 8018/10986 [3:07:07<1:05:05,  1.32s/it]

training loss: 3.4139387607574463


training:  73%|███████▎  | 8019/10986 [3:07:08<1:04:54,  1.31s/it]

training loss: 3.3529465198516846


training:  73%|███████▎  | 8020/10986 [3:07:09<1:04:36,  1.31s/it]

training loss: 3.3922784328460693
valid loss: 3.3897757530212402
perplexity: 29.6593017578125


training:  73%|███████▎  | 8021/10986 [3:07:12<1:27:06,  1.76s/it]

training loss: 3.3839993476867676


training:  73%|███████▎  | 8022/10986 [3:07:13<1:21:37,  1.65s/it]

training loss: 3.4369192123413086


training:  73%|███████▎  | 8023/10986 [3:07:15<1:16:16,  1.54s/it]

training loss: 3.3898613452911377


training:  73%|███████▎  | 8024/10986 [3:07:16<1:12:44,  1.47s/it]

training loss: 3.3513858318328857


training:  73%|███████▎  | 8025/10986 [3:07:17<1:10:13,  1.42s/it]

training loss: 3.278012275695801


training:  73%|███████▎  | 8026/10986 [3:07:19<1:08:28,  1.39s/it]

training loss: 3.3514392375946045


training:  73%|███████▎  | 8027/10986 [3:07:20<1:07:23,  1.37s/it]

training loss: 3.397155284881592


training:  73%|███████▎  | 8028/10986 [3:07:21<1:06:36,  1.35s/it]

training loss: 3.3652901649475098


training:  73%|███████▎  | 8029/10986 [3:07:23<1:05:33,  1.33s/it]

training loss: 3.240016460418701


training:  73%|███████▎  | 8030/10986 [3:07:24<1:05:31,  1.33s/it]

training loss: 3.328373908996582


training:  73%|███████▎  | 8031/10986 [3:07:26<1:10:04,  1.42s/it]

training loss: 3.297846794128418


training:  73%|███████▎  | 8032/10986 [3:07:27<1:08:14,  1.39s/it]

training loss: 3.460895538330078


training:  73%|███████▎  | 8033/10986 [3:07:28<1:07:03,  1.36s/it]

training loss: 3.5130717754364014


training:  73%|███████▎  | 8034/10986 [3:07:30<1:06:11,  1.35s/it]

training loss: 3.3893866539001465


training:  73%|███████▎  | 8035/10986 [3:07:31<1:05:49,  1.34s/it]

training loss: 3.4191243648529053


training:  73%|███████▎  | 8036/10986 [3:07:32<1:05:24,  1.33s/it]

training loss: 3.370745897293091


training:  73%|███████▎  | 8037/10986 [3:07:33<1:04:49,  1.32s/it]

training loss: 3.456967353820801


training:  73%|███████▎  | 8038/10986 [3:07:35<1:04:41,  1.32s/it]

training loss: 3.3711841106414795


training:  73%|███████▎  | 8039/10986 [3:07:36<1:04:25,  1.31s/it]

training loss: 3.514620304107666


training:  73%|███████▎  | 8040/10986 [3:07:37<1:05:10,  1.33s/it]

training loss: 3.3136422634124756
valid loss: 3.312047243118286
perplexity: 27.441246032714844


training:  73%|███████▎  | 8041/10986 [3:07:40<1:26:46,  1.77s/it]

training loss: 3.439091682434082


training:  73%|███████▎  | 8042/10986 [3:07:42<1:22:42,  1.69s/it]

training loss: 3.3508777618408203


training:  73%|███████▎  | 8043/10986 [3:07:43<1:17:44,  1.58s/it]

training loss: 3.4646716117858887


training:  73%|███████▎  | 8044/10986 [3:07:44<1:13:16,  1.49s/it]

training loss: 3.3404102325439453


training:  73%|███████▎  | 8045/10986 [3:07:46<1:10:43,  1.44s/it]

training loss: 3.5264065265655518


training:  73%|███████▎  | 8046/10986 [3:07:47<1:08:37,  1.40s/it]

training loss: 3.3193554878234863


training:  73%|███████▎  | 8047/10986 [3:07:48<1:07:13,  1.37s/it]

training loss: 3.362597703933716


training:  73%|███████▎  | 8048/10986 [3:07:50<1:05:50,  1.34s/it]

training loss: 3.289494037628174


training:  73%|███████▎  | 8049/10986 [3:07:51<1:05:03,  1.33s/it]

training loss: 3.398350477218628


training:  73%|███████▎  | 8050/10986 [3:07:52<1:04:51,  1.33s/it]

training loss: 3.3214457035064697


training:  73%|███████▎  | 8051/10986 [3:07:54<1:08:21,  1.40s/it]

training loss: 3.3541274070739746


training:  73%|███████▎  | 8052/10986 [3:07:55<1:07:13,  1.37s/it]

training loss: 3.308605909347534


training:  73%|███████▎  | 8053/10986 [3:07:56<1:06:28,  1.36s/it]

training loss: 3.477121114730835


training:  73%|███████▎  | 8054/10986 [3:07:58<1:05:15,  1.34s/it]

training loss: 3.391735553741455


training:  73%|███████▎  | 8055/10986 [3:07:59<1:04:25,  1.32s/it]

training loss: 3.442294120788574


training:  73%|███████▎  | 8056/10986 [3:08:00<1:03:47,  1.31s/it]

training loss: 3.380016565322876


training:  73%|███████▎  | 8057/10986 [3:08:01<1:03:40,  1.30s/it]

training loss: 3.331456422805786


training:  73%|███████▎  | 8058/10986 [3:08:03<1:03:43,  1.31s/it]

training loss: 3.340041399002075


training:  73%|███████▎  | 8059/10986 [3:08:04<1:04:03,  1.31s/it]

training loss: 3.4123356342315674


training:  73%|███████▎  | 8060/10986 [3:08:05<1:03:51,  1.31s/it]

training loss: 3.3762757778167725
valid loss: 3.3728599548339844
perplexity: 29.16180992126465


training:  73%|███████▎  | 8061/10986 [3:08:08<1:26:03,  1.77s/it]

training loss: 3.391024589538574


training:  73%|███████▎  | 8062/10986 [3:08:10<1:21:37,  1.68s/it]

training loss: 3.3316197395324707


training:  73%|███████▎  | 8063/10986 [3:08:11<1:16:24,  1.57s/it]

training loss: 3.406376600265503


training:  73%|███████▎  | 8064/10986 [3:08:12<1:12:12,  1.48s/it]

training loss: 3.454305410385132


training:  73%|███████▎  | 8065/10986 [3:08:14<1:09:40,  1.43s/it]

training loss: 3.4201786518096924


training:  73%|███████▎  | 8066/10986 [3:08:15<1:07:24,  1.39s/it]

training loss: 3.393834114074707


training:  73%|███████▎  | 8067/10986 [3:08:16<1:06:13,  1.36s/it]

training loss: 3.399400234222412


training:  73%|███████▎  | 8068/10986 [3:08:17<1:04:46,  1.33s/it]

training loss: 3.4205660820007324


training:  73%|███████▎  | 8069/10986 [3:08:19<1:04:21,  1.32s/it]

training loss: 3.305630922317505


training:  73%|███████▎  | 8070/10986 [3:08:20<1:03:57,  1.32s/it]

training loss: 3.4918110370635986


training:  73%|███████▎  | 8071/10986 [3:08:22<1:08:01,  1.40s/it]

training loss: 3.4660708904266357


training:  73%|███████▎  | 8072/10986 [3:08:23<1:06:42,  1.37s/it]

training loss: 3.3937506675720215


training:  73%|███████▎  | 8073/10986 [3:08:24<1:06:01,  1.36s/it]

training loss: 3.4182558059692383


training:  73%|███████▎  | 8074/10986 [3:08:26<1:04:56,  1.34s/it]

training loss: 3.3515543937683105


training:  74%|███████▎  | 8075/10986 [3:08:27<1:04:00,  1.32s/it]

training loss: 3.4532644748687744


training:  74%|███████▎  | 8076/10986 [3:08:28<1:03:43,  1.31s/it]

training loss: 3.3684282302856445


training:  74%|███████▎  | 8077/10986 [3:08:29<1:03:33,  1.31s/it]

training loss: 3.496630907058716


training:  74%|███████▎  | 8078/10986 [3:08:31<1:03:22,  1.31s/it]

training loss: 3.392911434173584


training:  74%|███████▎  | 8079/10986 [3:08:32<1:03:48,  1.32s/it]

training loss: 3.3851475715637207


training:  74%|███████▎  | 8080/10986 [3:08:33<1:03:31,  1.31s/it]

training loss: 3.360610008239746
valid loss: 3.3570005893707275
perplexity: 28.702970504760742


training:  74%|███████▎  | 8081/10986 [3:08:36<1:24:40,  1.75s/it]

training loss: 3.418943166732788


training:  74%|███████▎  | 8082/10986 [3:08:38<1:19:29,  1.64s/it]

training loss: 3.37345290184021


training:  74%|███████▎  | 8083/10986 [3:08:39<1:14:24,  1.54s/it]

training loss: 3.376174211502075


training:  74%|███████▎  | 8084/10986 [3:08:40<1:11:01,  1.47s/it]

training loss: 3.3642919063568115


training:  74%|███████▎  | 8085/10986 [3:08:42<1:08:59,  1.43s/it]

training loss: 3.389824390411377


training:  74%|███████▎  | 8086/10986 [3:08:43<1:07:07,  1.39s/it]

training loss: 3.388408899307251


training:  74%|███████▎  | 8087/10986 [3:08:44<1:05:39,  1.36s/it]

training loss: 3.3257393836975098


training:  74%|███████▎  | 8088/10986 [3:08:45<1:04:53,  1.34s/it]

training loss: 3.3980917930603027


training:  74%|███████▎  | 8089/10986 [3:08:47<1:04:20,  1.33s/it]

training loss: 3.415085554122925


training:  74%|███████▎  | 8090/10986 [3:08:48<1:03:39,  1.32s/it]

training loss: 3.4435598850250244


training:  74%|███████▎  | 8091/10986 [3:08:50<1:07:08,  1.39s/it]

training loss: 3.355477809906006


training:  74%|███████▎  | 8092/10986 [3:08:51<1:10:47,  1.47s/it]

training loss: 3.480774402618408


training:  74%|███████▎  | 8093/10986 [3:08:53<1:08:24,  1.42s/it]

training loss: 3.3989450931549072


training:  74%|███████▎  | 8094/10986 [3:08:54<1:06:44,  1.38s/it]

training loss: 3.2446885108947754


training:  74%|███████▎  | 8095/10986 [3:08:55<1:05:53,  1.37s/it]

training loss: 3.397716999053955


training:  74%|███████▎  | 8096/10986 [3:08:56<1:05:12,  1.35s/it]

training loss: 3.399594783782959


training:  74%|███████▎  | 8097/10986 [3:08:58<1:04:33,  1.34s/it]

training loss: 3.391319990158081


training:  74%|███████▎  | 8098/10986 [3:08:59<1:04:01,  1.33s/it]

training loss: 3.442538261413574


training:  74%|███████▎  | 8099/10986 [3:09:00<1:03:35,  1.32s/it]

training loss: 3.4498372077941895


training:  74%|███████▎  | 8100/10986 [3:09:02<1:03:26,  1.32s/it]

training loss: 3.45757794380188
valid loss: 3.45534610748291
perplexity: 31.669246673583984


training:  74%|███████▎  | 8101/10986 [3:09:05<1:24:54,  1.77s/it]

training loss: 3.3609228134155273


training:  74%|███████▎  | 8102/10986 [3:09:06<1:19:47,  1.66s/it]

training loss: 3.2739546298980713


training:  74%|███████▍  | 8103/10986 [3:09:07<1:14:32,  1.55s/it]

training loss: 3.45117449760437


training:  74%|███████▍  | 8104/10986 [3:09:09<1:11:27,  1.49s/it]

training loss: 3.3299307823181152


training:  74%|███████▍  | 8105/10986 [3:09:10<1:08:44,  1.43s/it]

training loss: 3.3795206546783447


training:  74%|███████▍  | 8106/10986 [3:09:11<1:06:48,  1.39s/it]

training loss: 3.3472800254821777


training:  74%|███████▍  | 8107/10986 [3:09:12<1:05:38,  1.37s/it]

training loss: 3.4552602767944336


training:  74%|███████▍  | 8108/10986 [3:09:14<1:04:29,  1.34s/it]

training loss: 3.373952627182007


training:  74%|███████▍  | 8109/10986 [3:09:15<1:04:07,  1.34s/it]

training loss: 3.4135773181915283


training:  74%|███████▍  | 8110/10986 [3:09:16<1:04:37,  1.35s/it]

training loss: 3.4277186393737793


training:  74%|███████▍  | 8111/10986 [3:09:19<1:14:44,  1.56s/it]

training loss: 3.523057222366333


training:  74%|███████▍  | 8112/10986 [3:09:20<1:17:46,  1.62s/it]

training loss: 3.476203441619873


training:  74%|███████▍  | 8113/10986 [3:09:22<1:13:00,  1.52s/it]

training loss: 3.329653024673462


training:  74%|███████▍  | 8114/10986 [3:09:23<1:09:51,  1.46s/it]

training loss: 3.347947597503662


training:  74%|███████▍  | 8115/10986 [3:09:24<1:08:00,  1.42s/it]

training loss: 3.38366961479187


training:  74%|███████▍  | 8116/10986 [3:09:26<1:06:37,  1.39s/it]

training loss: 3.420196533203125


training:  74%|███████▍  | 8117/10986 [3:09:27<1:05:25,  1.37s/it]

training loss: 3.476867914199829


training:  74%|███████▍  | 8118/10986 [3:09:28<1:04:27,  1.35s/it]

training loss: 3.3542470932006836


training:  74%|███████▍  | 8119/10986 [3:09:29<1:03:57,  1.34s/it]

training loss: 3.439424991607666


training:  74%|███████▍  | 8120/10986 [3:09:31<1:03:20,  1.33s/it]

training loss: 3.4016222953796387
valid loss: 3.4010143280029297
perplexity: 29.994508743286133


training:  74%|███████▍  | 8121/10986 [3:09:34<1:24:38,  1.77s/it]

training loss: 3.383054494857788


training:  74%|███████▍  | 8122/10986 [3:09:35<1:22:11,  1.72s/it]

training loss: 3.4270331859588623


training:  74%|███████▍  | 8123/10986 [3:09:36<1:16:06,  1.59s/it]

training loss: 3.409440517425537


training:  74%|███████▍  | 8124/10986 [3:09:38<1:12:03,  1.51s/it]

training loss: 3.4609222412109375


training:  74%|███████▍  | 8125/10986 [3:09:39<1:09:09,  1.45s/it]

training loss: 3.33912992477417


training:  74%|███████▍  | 8126/10986 [3:09:40<1:06:53,  1.40s/it]

training loss: 3.445553779602051


training:  74%|███████▍  | 8127/10986 [3:09:42<1:05:15,  1.37s/it]

training loss: 3.407684087753296


training:  74%|███████▍  | 8128/10986 [3:09:43<1:04:08,  1.35s/it]

training loss: 3.344527244567871


training:  74%|███████▍  | 8129/10986 [3:09:44<1:03:33,  1.33s/it]

training loss: 3.432058334350586


training:  74%|███████▍  | 8130/10986 [3:09:46<1:02:57,  1.32s/it]

training loss: 3.430202007293701


training:  74%|███████▍  | 8131/10986 [3:09:47<1:06:47,  1.40s/it]

training loss: 3.381544828414917


training:  74%|███████▍  | 8132/10986 [3:09:49<1:05:56,  1.39s/it]

training loss: 3.342174530029297


training:  74%|███████▍  | 8133/10986 [3:09:50<1:04:47,  1.36s/it]

training loss: 3.4064550399780273


training:  74%|███████▍  | 8134/10986 [3:09:51<1:03:51,  1.34s/it]

training loss: 3.4724643230438232


training:  74%|███████▍  | 8135/10986 [3:09:52<1:03:09,  1.33s/it]

training loss: 3.4976980686187744


training:  74%|███████▍  | 8136/10986 [3:09:54<1:02:28,  1.32s/it]

training loss: 3.400534152984619


training:  74%|███████▍  | 8137/10986 [3:09:55<1:02:13,  1.31s/it]

training loss: 3.3411951065063477


training:  74%|███████▍  | 8138/10986 [3:09:56<1:02:29,  1.32s/it]

training loss: 3.4304256439208984


training:  74%|███████▍  | 8139/10986 [3:09:58<1:02:33,  1.32s/it]

training loss: 3.375555992126465


training:  74%|███████▍  | 8140/10986 [3:09:59<1:02:21,  1.31s/it]

training loss: 3.313823699951172
valid loss: 3.3094327449798584
perplexity: 27.36959457397461


training:  74%|███████▍  | 8141/10986 [3:10:02<1:23:48,  1.77s/it]

training loss: 3.382214307785034


training:  74%|███████▍  | 8142/10986 [3:10:03<1:21:05,  1.71s/it]

training loss: 3.4238545894622803


training:  74%|███████▍  | 8143/10986 [3:10:05<1:14:46,  1.58s/it]

training loss: 3.5119662284851074


training:  74%|███████▍  | 8144/10986 [3:10:06<1:10:51,  1.50s/it]

training loss: 3.347196102142334


training:  74%|███████▍  | 8145/10986 [3:10:07<1:07:55,  1.43s/it]

training loss: 3.370638370513916


training:  74%|███████▍  | 8146/10986 [3:10:09<1:06:44,  1.41s/it]

training loss: 3.473536491394043


training:  74%|███████▍  | 8147/10986 [3:10:10<1:05:26,  1.38s/it]

training loss: 3.475102424621582


training:  74%|███████▍  | 8148/10986 [3:10:11<1:04:18,  1.36s/it]

training loss: 3.4562416076660156


training:  74%|███████▍  | 8149/10986 [3:10:13<1:03:42,  1.35s/it]

training loss: 3.3591253757476807


training:  74%|███████▍  | 8150/10986 [3:10:14<1:02:56,  1.33s/it]

training loss: 3.4199464321136475


training:  74%|███████▍  | 8151/10986 [3:10:15<1:06:20,  1.40s/it]

training loss: 3.4327592849731445


training:  74%|███████▍  | 8152/10986 [3:10:17<1:06:04,  1.40s/it]

training loss: 3.4820988178253174


training:  74%|███████▍  | 8153/10986 [3:10:18<1:04:46,  1.37s/it]

training loss: 3.3929390907287598


training:  74%|███████▍  | 8154/10986 [3:10:19<1:03:43,  1.35s/it]

training loss: 3.3367528915405273


training:  74%|███████▍  | 8155/10986 [3:10:21<1:03:03,  1.34s/it]

training loss: 3.4081716537475586


training:  74%|███████▍  | 8156/10986 [3:10:22<1:02:24,  1.32s/it]

training loss: 3.4641735553741455


training:  74%|███████▍  | 8157/10986 [3:10:23<1:02:09,  1.32s/it]

training loss: 3.351701498031616


training:  74%|███████▍  | 8158/10986 [3:10:25<1:01:52,  1.31s/it]

training loss: 3.2814395427703857


training:  74%|███████▍  | 8159/10986 [3:10:26<1:01:38,  1.31s/it]

training loss: 3.4666988849639893


training:  74%|███████▍  | 8160/10986 [3:10:27<1:01:37,  1.31s/it]

training loss: 3.3728697299957275
valid loss: 3.371537208557129
perplexity: 29.123262405395508


training:  74%|███████▍  | 8161/10986 [3:10:30<1:22:29,  1.75s/it]

training loss: 3.3316991329193115


training:  74%|███████▍  | 8162/10986 [3:10:31<1:17:20,  1.64s/it]

training loss: 3.4627597332000732


training:  74%|███████▍  | 8163/10986 [3:10:33<1:12:44,  1.55s/it]

training loss: 3.413721799850464


training:  74%|███████▍  | 8164/10986 [3:10:34<1:09:26,  1.48s/it]

training loss: 3.458665609359741


training:  74%|███████▍  | 8165/10986 [3:10:35<1:07:13,  1.43s/it]

training loss: 3.383288860321045


training:  74%|███████▍  | 8166/10986 [3:10:37<1:05:20,  1.39s/it]

training loss: 3.34126353263855


training:  74%|███████▍  | 8167/10986 [3:10:38<1:04:08,  1.37s/it]

training loss: 3.386915445327759


training:  74%|███████▍  | 8168/10986 [3:10:39<1:04:00,  1.36s/it]

training loss: 3.4176113605499268


training:  74%|███████▍  | 8169/10986 [3:10:41<1:03:31,  1.35s/it]

training loss: 3.3476407527923584


training:  74%|███████▍  | 8170/10986 [3:10:42<1:02:33,  1.33s/it]

training loss: 3.3669683933258057


training:  74%|███████▍  | 8171/10986 [3:10:44<1:06:27,  1.42s/it]

training loss: 3.374150276184082


training:  74%|███████▍  | 8172/10986 [3:10:45<1:08:45,  1.47s/it]

training loss: 3.377699375152588


training:  74%|███████▍  | 8173/10986 [3:10:46<1:06:48,  1.42s/it]

training loss: 3.394657850265503


training:  74%|███████▍  | 8174/10986 [3:10:48<1:05:09,  1.39s/it]

training loss: 3.3373289108276367


training:  74%|███████▍  | 8175/10986 [3:10:49<1:03:45,  1.36s/it]

training loss: 3.4205548763275146


training:  74%|███████▍  | 8176/10986 [3:10:50<1:02:45,  1.34s/it]

training loss: 3.3252780437469482


training:  74%|███████▍  | 8177/10986 [3:10:52<1:02:09,  1.33s/it]

training loss: 3.363952875137329


training:  74%|███████▍  | 8178/10986 [3:10:53<1:02:25,  1.33s/it]

training loss: 3.3517401218414307


training:  74%|███████▍  | 8179/10986 [3:10:54<1:01:59,  1.32s/it]

training loss: 3.4266629219055176


training:  74%|███████▍  | 8180/10986 [3:10:56<1:01:27,  1.31s/it]

training loss: 3.421004056930542
valid loss: 3.4247591495513916
perplexity: 30.715246200561523


training:  74%|███████▍  | 8181/10986 [3:10:58<1:21:44,  1.75s/it]

training loss: 3.4312825202941895


training:  74%|███████▍  | 8182/10986 [3:11:00<1:16:30,  1.64s/it]

training loss: 3.432602882385254


training:  74%|███████▍  | 8183/10986 [3:11:01<1:11:43,  1.54s/it]

training loss: 3.429568290710449


training:  74%|███████▍  | 8184/10986 [3:11:02<1:08:42,  1.47s/it]

training loss: 3.424382209777832


training:  75%|███████▍  | 8185/10986 [3:11:04<1:06:18,  1.42s/it]

training loss: 3.3695571422576904


training:  75%|███████▍  | 8186/10986 [3:11:05<1:04:31,  1.38s/it]

training loss: 3.421265125274658


training:  75%|███████▍  | 8187/10986 [3:11:06<1:03:08,  1.35s/it]

training loss: 3.3595094680786133


training:  75%|███████▍  | 8188/10986 [3:11:08<1:02:23,  1.34s/it]

training loss: 3.316025972366333


training:  75%|███████▍  | 8189/10986 [3:11:09<1:02:18,  1.34s/it]

training loss: 3.514106273651123


training:  75%|███████▍  | 8190/10986 [3:11:10<1:02:24,  1.34s/it]

training loss: 3.4600391387939453


training:  75%|███████▍  | 8191/10986 [3:11:12<1:05:52,  1.41s/it]

training loss: 3.3991150856018066


training:  75%|███████▍  | 8192/10986 [3:11:13<1:04:57,  1.40s/it]

training loss: 3.3607218265533447


training:  75%|███████▍  | 8193/10986 [3:11:14<1:03:37,  1.37s/it]

training loss: 3.4288008213043213


training:  75%|███████▍  | 8194/10986 [3:11:16<1:02:30,  1.34s/it]

training loss: 3.407248020172119


training:  75%|███████▍  | 8195/10986 [3:11:17<1:01:56,  1.33s/it]

training loss: 3.4048163890838623


training:  75%|███████▍  | 8196/10986 [3:11:18<1:01:49,  1.33s/it]

training loss: 3.3249685764312744


training:  75%|███████▍  | 8197/10986 [3:11:20<1:01:16,  1.32s/it]

training loss: 3.4811933040618896


training:  75%|███████▍  | 8198/10986 [3:11:21<1:00:54,  1.31s/it]

training loss: 3.3369994163513184


training:  75%|███████▍  | 8199/10986 [3:11:22<1:01:12,  1.32s/it]

training loss: 3.3221049308776855


training:  75%|███████▍  | 8200/10986 [3:11:24<1:01:36,  1.33s/it]

training loss: 3.3086633682250977
valid loss: 3.3045310974121094
perplexity: 27.23576545715332


training:  75%|███████▍  | 8201/10986 [3:11:26<1:22:00,  1.77s/it]

training loss: 3.444396734237671


training:  75%|███████▍  | 8202/10986 [3:11:28<1:19:25,  1.71s/it]

training loss: 3.335150718688965


training:  75%|███████▍  | 8203/10986 [3:11:29<1:13:34,  1.59s/it]

training loss: 3.2861363887786865


training:  75%|███████▍  | 8204/10986 [3:11:31<1:09:22,  1.50s/it]

training loss: 3.453390121459961


training:  75%|███████▍  | 8205/10986 [3:11:32<1:06:41,  1.44s/it]

training loss: 3.3504014015197754


training:  75%|███████▍  | 8206/10986 [3:11:33<1:04:46,  1.40s/it]

training loss: 3.4106037616729736


training:  75%|███████▍  | 8207/10986 [3:11:34<1:03:20,  1.37s/it]

training loss: 3.4850575923919678


training:  75%|███████▍  | 8208/10986 [3:11:36<1:01:51,  1.34s/it]

training loss: 3.4182288646698


training:  75%|███████▍  | 8209/10986 [3:11:37<1:00:54,  1.32s/it]

training loss: 3.409083843231201


training:  75%|███████▍  | 8210/10986 [3:11:38<1:00:44,  1.31s/it]

training loss: 3.3873095512390137


training:  75%|███████▍  | 8211/10986 [3:11:40<1:05:12,  1.41s/it]

training loss: 3.368314266204834


training:  75%|███████▍  | 8212/10986 [3:11:41<1:04:04,  1.39s/it]

training loss: 3.3922746181488037


training:  75%|███████▍  | 8213/10986 [3:11:43<1:02:40,  1.36s/it]

training loss: 3.42840313911438


training:  75%|███████▍  | 8214/10986 [3:11:44<1:02:09,  1.35s/it]

training loss: 3.500908851623535


training:  75%|███████▍  | 8215/10986 [3:11:45<1:01:30,  1.33s/it]

training loss: 3.31717586517334


training:  75%|███████▍  | 8216/10986 [3:11:46<1:00:32,  1.31s/it]

training loss: 3.410109043121338


training:  75%|███████▍  | 8217/10986 [3:11:48<59:59,  1.30s/it]  

training loss: 3.3222131729125977


training:  75%|███████▍  | 8218/10986 [3:11:49<59:27,  1.29s/it]

training loss: 3.4055707454681396


training:  75%|███████▍  | 8219/10986 [3:11:50<59:16,  1.29s/it]

training loss: 3.532238483428955


training:  75%|███████▍  | 8220/10986 [3:11:52<59:16,  1.29s/it]

training loss: 3.4888386726379395
valid loss: 3.485689640045166
perplexity: 32.644935607910156


training:  75%|███████▍  | 8221/10986 [3:11:54<1:20:05,  1.74s/it]

training loss: 3.507995367050171


training:  75%|███████▍  | 8222/10986 [3:11:56<1:15:03,  1.63s/it]

training loss: 3.3779008388519287


training:  75%|███████▍  | 8223/10986 [3:11:57<1:10:06,  1.52s/it]

training loss: 3.3303329944610596


training:  75%|███████▍  | 8224/10986 [3:11:58<1:06:52,  1.45s/it]

training loss: 3.43510103225708


training:  75%|███████▍  | 8225/10986 [3:12:00<1:04:20,  1.40s/it]

training loss: 3.3005785942077637


training:  75%|███████▍  | 8226/10986 [3:12:01<1:02:24,  1.36s/it]

training loss: 3.405034065246582


training:  75%|███████▍  | 8227/10986 [3:12:02<1:01:19,  1.33s/it]

training loss: 3.3575756549835205


training:  75%|███████▍  | 8228/10986 [3:12:03<1:00:36,  1.32s/it]

training loss: 3.3197436332702637


training:  75%|███████▍  | 8229/10986 [3:12:05<1:00:00,  1.31s/it]

training loss: 3.3406691551208496


training:  75%|███████▍  | 8230/10986 [3:12:06<59:18,  1.29s/it]  

training loss: 3.4677741527557373


training:  75%|███████▍  | 8231/10986 [3:12:07<1:02:59,  1.37s/it]

training loss: 3.315692663192749


training:  75%|███████▍  | 8232/10986 [3:12:09<1:06:12,  1.44s/it]

training loss: 3.362213611602783


training:  75%|███████▍  | 8233/10986 [3:12:10<1:04:59,  1.42s/it]

training loss: 3.3271899223327637


training:  75%|███████▍  | 8234/10986 [3:12:12<1:03:04,  1.38s/it]

training loss: 3.266130208969116


training:  75%|███████▍  | 8235/10986 [3:12:13<1:01:46,  1.35s/it]

training loss: 3.386873483657837


training:  75%|███████▍  | 8236/10986 [3:12:14<1:00:48,  1.33s/it]

training loss: 3.3928065299987793


training:  75%|███████▍  | 8237/10986 [3:12:16<1:00:27,  1.32s/it]

training loss: 3.3216800689697266


training:  75%|███████▍  | 8238/10986 [3:12:17<1:00:01,  1.31s/it]

training loss: 3.342808485031128


training:  75%|███████▍  | 8239/10986 [3:12:18<59:40,  1.30s/it]  

training loss: 3.3897902965545654


training:  75%|███████▌  | 8240/10986 [3:12:19<59:11,  1.29s/it]

training loss: 3.3197126388549805
valid loss: 3.3187780380249023
perplexity: 27.626571655273438


training:  75%|███████▌  | 8241/10986 [3:12:22<1:19:50,  1.75s/it]

training loss: 3.3803658485412598


training:  75%|███████▌  | 8242/10986 [3:12:24<1:17:45,  1.70s/it]

training loss: 3.3599557876586914


training:  75%|███████▌  | 8243/10986 [3:12:25<1:12:26,  1.58s/it]

training loss: 3.405494451522827


training:  75%|███████▌  | 8244/10986 [3:12:26<1:08:25,  1.50s/it]

training loss: 3.342410087585449


training:  75%|███████▌  | 8245/10986 [3:12:28<1:05:12,  1.43s/it]

training loss: 3.366577625274658


training:  75%|███████▌  | 8246/10986 [3:12:29<1:03:24,  1.39s/it]

training loss: 3.3486435413360596


training:  75%|███████▌  | 8247/10986 [3:12:30<1:02:04,  1.36s/it]

training loss: 3.4053494930267334


training:  75%|███████▌  | 8248/10986 [3:12:32<1:01:24,  1.35s/it]

training loss: 3.455474853515625


training:  75%|███████▌  | 8249/10986 [3:12:33<1:00:43,  1.33s/it]

training loss: 3.399449586868286


training:  75%|███████▌  | 8250/10986 [3:12:34<1:00:21,  1.32s/it]

training loss: 3.3163726329803467


training:  75%|███████▌  | 8251/10986 [3:12:36<1:04:02,  1.41s/it]

training loss: 3.326052188873291


training:  75%|███████▌  | 8252/10986 [3:12:37<1:06:34,  1.46s/it]

training loss: 3.3216652870178223


training:  75%|███████▌  | 8253/10986 [3:12:39<1:04:31,  1.42s/it]

training loss: 3.357649803161621


training:  75%|███████▌  | 8254/10986 [3:12:40<1:02:54,  1.38s/it]

training loss: 3.3845701217651367


training:  75%|███████▌  | 8255/10986 [3:12:41<1:01:51,  1.36s/it]

training loss: 3.427698850631714


training:  75%|███████▌  | 8256/10986 [3:12:43<1:01:20,  1.35s/it]

training loss: 3.4052059650421143


training:  75%|███████▌  | 8257/10986 [3:12:44<1:00:51,  1.34s/it]

training loss: 3.4541189670562744


training:  75%|███████▌  | 8258/10986 [3:12:45<1:00:40,  1.33s/it]

training loss: 3.410095453262329


training:  75%|███████▌  | 8259/10986 [3:12:47<59:54,  1.32s/it]  

training loss: 3.4352774620056152


training:  75%|███████▌  | 8260/10986 [3:12:48<59:21,  1.31s/it]

training loss: 3.2479312419891357
valid loss: 3.2464067935943604
perplexity: 25.69783592224121


training:  75%|███████▌  | 8261/10986 [3:12:51<1:19:10,  1.74s/it]

training loss: 3.463169813156128


training:  75%|███████▌  | 8262/10986 [3:12:52<1:18:00,  1.72s/it]

training loss: 3.4086172580718994


training:  75%|███████▌  | 8263/10986 [3:12:54<1:12:20,  1.59s/it]

training loss: 3.3031015396118164


training:  75%|███████▌  | 8264/10986 [3:12:55<1:08:15,  1.50s/it]

training loss: 3.384610176086426


training:  75%|███████▌  | 8265/10986 [3:12:56<1:05:31,  1.44s/it]

training loss: 3.3504552841186523


training:  75%|███████▌  | 8266/10986 [3:12:57<1:03:30,  1.40s/it]

training loss: 3.3798017501831055


training:  75%|███████▌  | 8267/10986 [3:12:59<1:01:49,  1.36s/it]

training loss: 3.320659637451172


training:  75%|███████▌  | 8268/10986 [3:13:00<1:00:46,  1.34s/it]

training loss: 3.4396400451660156


training:  75%|███████▌  | 8269/10986 [3:13:01<1:00:15,  1.33s/it]

training loss: 3.4679179191589355


training:  75%|███████▌  | 8270/10986 [3:13:03<1:01:46,  1.36s/it]

training loss: 3.3882429599761963


training:  75%|███████▌  | 8271/10986 [3:13:05<1:11:24,  1.58s/it]

training loss: 3.3586885929107666


training:  75%|███████▌  | 8272/10986 [3:13:07<1:15:18,  1.66s/it]

training loss: 3.5498015880584717


training:  75%|███████▌  | 8273/10986 [3:13:08<1:10:25,  1.56s/it]

training loss: 3.277172088623047


training:  75%|███████▌  | 8274/10986 [3:13:09<1:06:49,  1.48s/it]

training loss: 3.3763041496276855


training:  75%|███████▌  | 8275/10986 [3:13:11<1:04:43,  1.43s/it]

training loss: 3.4091603755950928


training:  75%|███████▌  | 8276/10986 [3:13:12<1:02:59,  1.39s/it]

training loss: 3.2545952796936035


training:  75%|███████▌  | 8277/10986 [3:13:13<1:01:37,  1.37s/it]

training loss: 3.4303576946258545


training:  75%|███████▌  | 8278/10986 [3:13:15<1:00:52,  1.35s/it]

training loss: 3.450485944747925


training:  75%|███████▌  | 8279/10986 [3:13:16<1:00:07,  1.33s/it]

training loss: 3.382049083709717


training:  75%|███████▌  | 8280/10986 [3:13:17<59:25,  1.32s/it]  

training loss: 3.3957033157348633
valid loss: 3.396939992904663
perplexity: 29.87255096435547


training:  75%|███████▌  | 8281/10986 [3:13:20<1:19:20,  1.76s/it]

training loss: 3.306668519973755


training:  75%|███████▌  | 8282/10986 [3:13:22<1:17:52,  1.73s/it]

training loss: 3.486201286315918


training:  75%|███████▌  | 8283/10986 [3:13:23<1:11:36,  1.59s/it]

training loss: 3.411050319671631


training:  75%|███████▌  | 8284/10986 [3:13:24<1:07:46,  1.50s/it]

training loss: 3.4122886657714844


training:  75%|███████▌  | 8285/10986 [3:13:25<1:04:35,  1.43s/it]

training loss: 3.3037283420562744


training:  75%|███████▌  | 8286/10986 [3:13:27<1:02:25,  1.39s/it]

training loss: 3.438551425933838


training:  75%|███████▌  | 8287/10986 [3:13:28<1:00:56,  1.35s/it]

training loss: 3.412977933883667


training:  75%|███████▌  | 8288/10986 [3:13:29<59:52,  1.33s/it]  

training loss: 3.3892741203308105


training:  75%|███████▌  | 8289/10986 [3:13:31<59:26,  1.32s/it]

training loss: 3.3915653228759766


training:  75%|███████▌  | 8290/10986 [3:13:32<59:03,  1.31s/it]

training loss: 3.438107967376709


training:  75%|███████▌  | 8291/10986 [3:13:33<1:02:46,  1.40s/it]

training loss: 3.406966209411621


training:  75%|███████▌  | 8292/10986 [3:13:35<1:02:11,  1.39s/it]

training loss: 3.430100202560425


training:  75%|███████▌  | 8293/10986 [3:13:36<1:00:53,  1.36s/it]

training loss: 3.2794156074523926


training:  75%|███████▌  | 8294/10986 [3:13:37<59:40,  1.33s/it]  

training loss: 3.491349220275879


training:  76%|███████▌  | 8295/10986 [3:13:39<58:57,  1.31s/it]

training loss: 3.385070323944092


training:  76%|███████▌  | 8296/10986 [3:13:40<58:32,  1.31s/it]

training loss: 3.3233184814453125


training:  76%|███████▌  | 8297/10986 [3:13:41<58:26,  1.30s/it]

training loss: 3.3292620182037354


training:  76%|███████▌  | 8298/10986 [3:13:43<58:12,  1.30s/it]

training loss: 3.3392701148986816


training:  76%|███████▌  | 8299/10986 [3:13:44<58:19,  1.30s/it]

training loss: 3.3938241004943848


training:  76%|███████▌  | 8300/10986 [3:13:45<58:15,  1.30s/it]

training loss: 3.3510873317718506
valid loss: 3.3478951454162598
perplexity: 28.44280242919922


training:  76%|███████▌  | 8301/10986 [3:13:48<1:18:15,  1.75s/it]

training loss: 3.3427770137786865


training:  76%|███████▌  | 8302/10986 [3:13:49<1:13:16,  1.64s/it]

training loss: 3.3259785175323486


training:  76%|███████▌  | 8303/10986 [3:13:51<1:08:57,  1.54s/it]

training loss: 3.511387586593628


training:  76%|███████▌  | 8304/10986 [3:13:52<1:05:31,  1.47s/it]

training loss: 3.2919650077819824


training:  76%|███████▌  | 8305/10986 [3:13:53<1:03:10,  1.41s/it]

training loss: 3.4325671195983887


training:  76%|███████▌  | 8306/10986 [3:13:55<1:01:48,  1.38s/it]

training loss: 3.372199773788452


training:  76%|███████▌  | 8307/10986 [3:13:56<1:00:43,  1.36s/it]

training loss: 3.380650281906128


training:  76%|███████▌  | 8308/10986 [3:13:57<59:49,  1.34s/it]  

training loss: 3.4616780281066895


training:  76%|███████▌  | 8309/10986 [3:13:58<59:06,  1.32s/it]

training loss: 3.361271858215332


training:  76%|███████▌  | 8310/10986 [3:14:00<58:18,  1.31s/it]

training loss: 3.414773464202881


training:  76%|███████▌  | 8311/10986 [3:14:01<1:02:10,  1.39s/it]

training loss: 3.3616116046905518


training:  76%|███████▌  | 8312/10986 [3:14:03<1:01:07,  1.37s/it]

training loss: 3.340334177017212


training:  76%|███████▌  | 8313/10986 [3:14:04<59:09,  1.33s/it]  

training loss: 3.377153158187866


training:  76%|███████▌  | 8314/10986 [3:14:05<58:31,  1.31s/it]

training loss: 3.4115383625030518


training:  76%|███████▌  | 8315/10986 [3:14:06<58:25,  1.31s/it]

training loss: 3.3927805423736572


training:  76%|███████▌  | 8316/10986 [3:14:08<57:42,  1.30s/it]

training loss: 3.4012324810028076


training:  76%|███████▌  | 8317/10986 [3:14:09<57:26,  1.29s/it]

training loss: 3.4121830463409424


training:  76%|███████▌  | 8318/10986 [3:14:10<57:04,  1.28s/it]

training loss: 3.4061436653137207


training:  76%|███████▌  | 8319/10986 [3:14:12<57:20,  1.29s/it]

training loss: 3.3539252281188965


training:  76%|███████▌  | 8320/10986 [3:14:13<57:10,  1.29s/it]

training loss: 3.4568207263946533
valid loss: 3.44746470451355
perplexity: 31.420629501342773


training:  76%|███████▌  | 8321/10986 [3:14:16<1:16:54,  1.73s/it]

training loss: 3.3577399253845215


training:  76%|███████▌  | 8322/10986 [3:14:17<1:15:02,  1.69s/it]

training loss: 3.3904643058776855


training:  76%|███████▌  | 8323/10986 [3:14:18<1:09:26,  1.56s/it]

training loss: 3.519111156463623


training:  76%|███████▌  | 8324/10986 [3:14:20<1:05:33,  1.48s/it]

training loss: 3.551140069961548


training:  76%|███████▌  | 8325/10986 [3:14:21<1:03:06,  1.42s/it]

training loss: 3.4003753662109375


training:  76%|███████▌  | 8326/10986 [3:14:22<1:01:33,  1.39s/it]

training loss: 3.4685888290405273


training:  76%|███████▌  | 8327/10986 [3:14:24<59:49,  1.35s/it]  

training loss: 3.3292720317840576


training:  76%|███████▌  | 8328/10986 [3:14:25<59:07,  1.33s/it]

training loss: 3.305237054824829


training:  76%|███████▌  | 8329/10986 [3:14:26<58:15,  1.32s/it]

training loss: 3.3551883697509766


training:  76%|███████▌  | 8330/10986 [3:14:27<57:49,  1.31s/it]

training loss: 3.2990496158599854


training:  76%|███████▌  | 8331/10986 [3:14:29<1:01:22,  1.39s/it]

training loss: 3.4944756031036377


training:  76%|███████▌  | 8332/10986 [3:14:30<1:00:34,  1.37s/it]

training loss: 3.496152877807617


training:  76%|███████▌  | 8333/10986 [3:14:32<59:19,  1.34s/it]  

training loss: 3.474182605743408


training:  76%|███████▌  | 8334/10986 [3:14:33<58:33,  1.32s/it]

training loss: 3.353240966796875


training:  76%|███████▌  | 8335/10986 [3:14:34<58:06,  1.32s/it]

training loss: 3.4460597038269043


training:  76%|███████▌  | 8336/10986 [3:14:35<57:54,  1.31s/it]

training loss: 3.4193503856658936


training:  76%|███████▌  | 8337/10986 [3:14:37<57:27,  1.30s/it]

training loss: 3.4186196327209473


training:  76%|███████▌  | 8338/10986 [3:14:38<57:10,  1.30s/it]

training loss: 3.4300997257232666


training:  76%|███████▌  | 8339/10986 [3:14:39<56:57,  1.29s/it]

training loss: 3.388514995574951


training:  76%|███████▌  | 8340/10986 [3:14:41<57:20,  1.30s/it]

training loss: 3.336942672729492
valid loss: 3.342902898788452
perplexity: 28.301162719726562


training:  76%|███████▌  | 8341/10986 [3:14:43<1:16:57,  1.75s/it]

training loss: 3.464500665664673


training:  76%|███████▌  | 8342/10986 [3:14:45<1:11:49,  1.63s/it]

training loss: 3.573138952255249


training:  76%|███████▌  | 8343/10986 [3:14:46<1:07:26,  1.53s/it]

training loss: 3.4843392372131348


training:  76%|███████▌  | 8344/10986 [3:14:47<1:04:22,  1.46s/it]

training loss: 3.319334030151367


training:  76%|███████▌  | 8345/10986 [3:14:49<1:01:53,  1.41s/it]

training loss: 3.4100375175476074


training:  76%|███████▌  | 8346/10986 [3:14:50<1:00:08,  1.37s/it]

training loss: 3.3682804107666016


training:  76%|███████▌  | 8347/10986 [3:14:51<58:54,  1.34s/it]  

training loss: 3.4473936557769775


training:  76%|███████▌  | 8348/10986 [3:14:52<58:13,  1.32s/it]

training loss: 3.394766330718994


training:  76%|███████▌  | 8349/10986 [3:14:54<57:28,  1.31s/it]

training loss: 3.4310388565063477


training:  76%|███████▌  | 8350/10986 [3:14:55<57:15,  1.30s/it]

training loss: 3.455716371536255


training:  76%|███████▌  | 8351/10986 [3:14:57<1:00:47,  1.38s/it]

training loss: 3.4167184829711914


training:  76%|███████▌  | 8352/10986 [3:14:58<59:57,  1.37s/it]  

training loss: 3.352924346923828


training:  76%|███████▌  | 8353/10986 [3:14:59<59:00,  1.34s/it]

training loss: 3.4780614376068115


training:  76%|███████▌  | 8354/10986 [3:15:01<58:19,  1.33s/it]

training loss: 3.3820624351501465


training:  76%|███████▌  | 8355/10986 [3:15:02<57:56,  1.32s/it]

training loss: 3.4009292125701904


training:  76%|███████▌  | 8356/10986 [3:15:03<57:20,  1.31s/it]

training loss: 3.344682455062866


training:  76%|███████▌  | 8357/10986 [3:15:04<56:52,  1.30s/it]

training loss: 3.403820514678955


training:  76%|███████▌  | 8358/10986 [3:15:06<56:37,  1.29s/it]

training loss: 3.384185552597046


training:  76%|███████▌  | 8359/10986 [3:15:07<56:16,  1.29s/it]

training loss: 3.2955644130706787


training:  76%|███████▌  | 8360/10986 [3:15:08<56:20,  1.29s/it]

training loss: 3.492602586746216
valid loss: 3.492290496826172
perplexity: 32.86112976074219


training:  76%|███████▌  | 8361/10986 [3:15:11<1:15:13,  1.72s/it]

training loss: 3.5670909881591797


training:  76%|███████▌  | 8362/10986 [3:15:13<1:15:26,  1.72s/it]

training loss: 3.342050313949585


training:  76%|███████▌  | 8363/10986 [3:15:14<1:09:37,  1.59s/it]

training loss: 3.470208168029785


training:  76%|███████▌  | 8364/10986 [3:15:15<1:05:42,  1.50s/it]

training loss: 3.4017739295959473


training:  76%|███████▌  | 8365/10986 [3:15:17<1:02:55,  1.44s/it]

training loss: 3.393148183822632


training:  76%|███████▌  | 8366/10986 [3:15:18<1:00:54,  1.39s/it]

training loss: 3.3339927196502686


training:  76%|███████▌  | 8367/10986 [3:15:19<59:56,  1.37s/it]  

training loss: 3.445232629776001


training:  76%|███████▌  | 8368/10986 [3:15:20<58:40,  1.34s/it]

training loss: 3.34965181350708


training:  76%|███████▌  | 8369/10986 [3:15:22<57:45,  1.32s/it]

training loss: 3.3731744289398193


training:  76%|███████▌  | 8370/10986 [3:15:23<57:20,  1.32s/it]

training loss: 3.4116480350494385


training:  76%|███████▌  | 8371/10986 [3:15:25<1:01:12,  1.40s/it]

training loss: 3.3855409622192383


training:  76%|███████▌  | 8372/10986 [3:15:26<1:00:09,  1.38s/it]

training loss: 3.3836522102355957


training:  76%|███████▌  | 8373/10986 [3:15:27<59:02,  1.36s/it]  

training loss: 3.3863937854766846


training:  76%|███████▌  | 8374/10986 [3:15:29<58:01,  1.33s/it]

training loss: 3.4755759239196777


training:  76%|███████▌  | 8375/10986 [3:15:30<57:17,  1.32s/it]

training loss: 3.417231559753418


training:  76%|███████▌  | 8376/10986 [3:15:31<56:49,  1.31s/it]

training loss: 3.4481842517852783


training:  76%|███████▋  | 8377/10986 [3:15:32<56:28,  1.30s/it]

training loss: 3.4142494201660156


training:  76%|███████▋  | 8378/10986 [3:15:34<55:52,  1.29s/it]

training loss: 3.375734567642212


training:  76%|███████▋  | 8379/10986 [3:15:35<55:54,  1.29s/it]

training loss: 3.6296117305755615


training:  76%|███████▋  | 8380/10986 [3:15:36<55:58,  1.29s/it]

training loss: 3.291381359100342
valid loss: 3.286388397216797
perplexity: 26.74609375


training:  76%|███████▋  | 8381/10986 [3:15:39<1:14:58,  1.73s/it]

training loss: 3.356700897216797


training:  76%|███████▋  | 8382/10986 [3:15:40<1:10:31,  1.62s/it]

training loss: 3.379568099975586


training:  76%|███████▋  | 8383/10986 [3:15:42<1:06:20,  1.53s/it]

training loss: 3.4284138679504395


training:  76%|███████▋  | 8384/10986 [3:15:43<1:03:23,  1.46s/it]

training loss: 3.3380930423736572


training:  76%|███████▋  | 8385/10986 [3:15:44<1:00:52,  1.40s/it]

training loss: 3.3516995906829834


training:  76%|███████▋  | 8386/10986 [3:15:46<59:23,  1.37s/it]  

training loss: 3.3399429321289062


training:  76%|███████▋  | 8387/10986 [3:15:47<58:22,  1.35s/it]

training loss: 3.2735586166381836


training:  76%|███████▋  | 8388/10986 [3:15:48<57:12,  1.32s/it]

training loss: 3.3440256118774414


training:  76%|███████▋  | 8389/10986 [3:15:49<56:37,  1.31s/it]

training loss: 3.3348636627197266


training:  76%|███████▋  | 8390/10986 [3:15:51<56:10,  1.30s/it]

training loss: 3.3625922203063965


training:  76%|███████▋  | 8391/10986 [3:15:52<59:49,  1.38s/it]

training loss: 3.408337116241455


training:  76%|███████▋  | 8392/10986 [3:15:54<58:51,  1.36s/it]

training loss: 3.4088222980499268


training:  76%|███████▋  | 8393/10986 [3:15:55<57:41,  1.34s/it]

training loss: 3.3730740547180176


training:  76%|███████▋  | 8394/10986 [3:15:56<57:00,  1.32s/it]

training loss: 3.4155635833740234


training:  76%|███████▋  | 8395/10986 [3:15:57<56:32,  1.31s/it]

training loss: 3.37460994720459


training:  76%|███████▋  | 8396/10986 [3:15:59<56:06,  1.30s/it]

training loss: 3.319432258605957


training:  76%|███████▋  | 8397/10986 [3:16:00<55:39,  1.29s/it]

training loss: 3.3806445598602295


training:  76%|███████▋  | 8398/10986 [3:16:01<55:31,  1.29s/it]

training loss: 3.4007632732391357


training:  76%|███████▋  | 8399/10986 [3:16:03<55:33,  1.29s/it]

training loss: 3.3686535358428955


training:  76%|███████▋  | 8400/10986 [3:16:04<55:09,  1.28s/it]

training loss: 3.4443278312683105
valid loss: 3.448533058166504
perplexity: 31.45421600341797


training:  76%|███████▋  | 8401/10986 [3:16:06<1:13:33,  1.71s/it]

training loss: 3.39851975440979


training:  76%|███████▋  | 8402/10986 [3:16:08<1:13:06,  1.70s/it]

training loss: 3.5182340145111084


training:  76%|███████▋  | 8403/10986 [3:16:09<1:07:43,  1.57s/it]

training loss: 3.515829086303711


training:  76%|███████▋  | 8404/10986 [3:16:11<1:03:54,  1.48s/it]

training loss: 3.5551657676696777


training:  77%|███████▋  | 8405/10986 [3:16:12<1:01:17,  1.42s/it]

training loss: 3.377077579498291


training:  77%|███████▋  | 8406/10986 [3:16:13<59:15,  1.38s/it]  

training loss: 3.4312896728515625


training:  77%|███████▋  | 8407/10986 [3:16:15<57:41,  1.34s/it]

training loss: 3.4230594635009766


training:  77%|███████▋  | 8408/10986 [3:16:16<56:44,  1.32s/it]

training loss: 3.329732894897461


training:  77%|███████▋  | 8409/10986 [3:16:17<56:10,  1.31s/it]

training loss: 3.4921090602874756


training:  77%|███████▋  | 8410/10986 [3:16:18<55:48,  1.30s/it]

training loss: 3.4281234741210938


training:  77%|███████▋  | 8411/10986 [3:16:20<59:05,  1.38s/it]

training loss: 3.464921474456787


training:  77%|███████▋  | 8412/10986 [3:16:22<1:02:40,  1.46s/it]

training loss: 3.395958423614502


training:  77%|███████▋  | 8413/10986 [3:16:23<1:00:44,  1.42s/it]

training loss: 3.4338300228118896


training:  77%|███████▋  | 8414/10986 [3:16:24<59:25,  1.39s/it]  

training loss: 3.572810649871826


training:  77%|███████▋  | 8415/10986 [3:16:25<57:38,  1.35s/it]

training loss: 3.394602060317993


training:  77%|███████▋  | 8416/10986 [3:16:27<56:43,  1.32s/it]

training loss: 3.3475358486175537


training:  77%|███████▋  | 8417/10986 [3:16:28<56:02,  1.31s/it]

training loss: 3.379570245742798


training:  77%|███████▋  | 8418/10986 [3:16:29<55:35,  1.30s/it]

training loss: 3.3248820304870605


training:  77%|███████▋  | 8419/10986 [3:16:31<55:20,  1.29s/it]

training loss: 3.342773675918579


training:  77%|███████▋  | 8420/10986 [3:16:32<55:18,  1.29s/it]

training loss: 3.4132790565490723
valid loss: 3.419140338897705
perplexity: 30.543148040771484


training:  77%|███████▋  | 8421/10986 [3:16:35<1:13:41,  1.72s/it]

training loss: 3.3300602436065674


training:  77%|███████▋  | 8422/10986 [3:16:36<1:12:04,  1.69s/it]

training loss: 3.465282440185547


training:  77%|███████▋  | 8423/10986 [3:16:37<1:06:44,  1.56s/it]

training loss: 3.455594778060913


training:  77%|███████▋  | 8424/10986 [3:16:39<1:03:07,  1.48s/it]

training loss: 3.3727049827575684


training:  77%|███████▋  | 8425/10986 [3:16:40<1:00:42,  1.42s/it]

training loss: 3.450259208679199


training:  77%|███████▋  | 8426/10986 [3:16:41<58:56,  1.38s/it]  

training loss: 3.3409481048583984


training:  77%|███████▋  | 8427/10986 [3:16:43<57:39,  1.35s/it]

training loss: 3.353569984436035


training:  77%|███████▋  | 8428/10986 [3:16:44<56:33,  1.33s/it]

training loss: 3.3915603160858154


training:  77%|███████▋  | 8429/10986 [3:16:45<55:59,  1.31s/it]

training loss: 3.464902400970459


training:  77%|███████▋  | 8430/10986 [3:16:46<55:58,  1.31s/it]

training loss: 3.406071186065674


training:  77%|███████▋  | 8431/10986 [3:16:48<59:49,  1.41s/it]

training loss: 3.3899755477905273


training:  77%|███████▋  | 8432/10986 [3:16:49<58:18,  1.37s/it]

training loss: 3.3512582778930664


training:  77%|███████▋  | 8433/10986 [3:16:51<1:01:34,  1.45s/it]

training loss: 3.3812968730926514


training:  77%|███████▋  | 8434/10986 [3:16:53<1:04:43,  1.52s/it]

training loss: 3.4237732887268066


training:  77%|███████▋  | 8435/10986 [3:16:54<1:04:28,  1.52s/it]

training loss: 3.3802003860473633


training:  77%|███████▋  | 8436/10986 [3:16:55<1:01:29,  1.45s/it]

training loss: 3.3881170749664307


training:  77%|███████▋  | 8437/10986 [3:16:57<59:04,  1.39s/it]  

training loss: 3.346315383911133


training:  77%|███████▋  | 8438/10986 [3:16:58<57:37,  1.36s/it]

training loss: 3.4729344844818115


training:  77%|███████▋  | 8439/10986 [3:16:59<56:37,  1.33s/it]

training loss: 3.439197301864624


training:  77%|███████▋  | 8440/10986 [3:17:01<55:48,  1.32s/it]

training loss: 3.4339020252227783
valid loss: 3.4273786544799805
perplexity: 30.795808792114258


training:  77%|███████▋  | 8441/10986 [3:17:03<1:13:57,  1.74s/it]

training loss: 3.4462974071502686


training:  77%|███████▋  | 8442/10986 [3:17:05<1:08:50,  1.62s/it]

training loss: 3.3349008560180664


training:  77%|███████▋  | 8443/10986 [3:17:06<1:04:38,  1.53s/it]

training loss: 3.4022223949432373


training:  77%|███████▋  | 8444/10986 [3:17:07<1:01:25,  1.45s/it]

training loss: 3.412574529647827


training:  77%|███████▋  | 8445/10986 [3:17:08<59:00,  1.39s/it]  

training loss: 3.397524118423462


training:  77%|███████▋  | 8446/10986 [3:17:10<57:28,  1.36s/it]

training loss: 3.370954990386963


training:  77%|███████▋  | 8447/10986 [3:17:11<56:35,  1.34s/it]

training loss: 3.391237735748291


training:  77%|███████▋  | 8448/10986 [3:17:12<55:38,  1.32s/it]

training loss: 3.270808458328247


training:  77%|███████▋  | 8449/10986 [3:17:14<55:21,  1.31s/it]

training loss: 3.5460214614868164


training:  77%|███████▋  | 8450/10986 [3:17:15<55:03,  1.30s/it]

training loss: 3.374814987182617


training:  77%|███████▋  | 8451/10986 [3:17:16<59:03,  1.40s/it]

training loss: 3.3487253189086914


training:  77%|███████▋  | 8452/10986 [3:17:18<57:34,  1.36s/it]

training loss: 3.377514123916626


training:  77%|███████▋  | 8453/10986 [3:17:19<56:33,  1.34s/it]

training loss: 3.469434976577759


training:  77%|███████▋  | 8454/10986 [3:17:20<55:52,  1.32s/it]

training loss: 3.4044082164764404


training:  77%|███████▋  | 8455/10986 [3:17:22<55:18,  1.31s/it]

training loss: 3.4020919799804688


training:  77%|███████▋  | 8456/10986 [3:17:23<54:46,  1.30s/it]

training loss: 3.3022396564483643


training:  77%|███████▋  | 8457/10986 [3:17:24<54:59,  1.30s/it]

training loss: 3.3612802028656006


training:  77%|███████▋  | 8458/10986 [3:17:25<54:30,  1.29s/it]

training loss: 3.400019407272339


training:  77%|███████▋  | 8459/10986 [3:17:27<54:24,  1.29s/it]

training loss: 3.3519341945648193


training:  77%|███████▋  | 8460/10986 [3:17:28<54:05,  1.28s/it]

training loss: 3.389010429382324
valid loss: 3.3831701278686523
perplexity: 29.464027404785156


training:  77%|███████▋  | 8461/10986 [3:17:31<1:13:09,  1.74s/it]

training loss: 3.4093003273010254


training:  77%|███████▋  | 8462/10986 [3:17:32<1:08:08,  1.62s/it]

training loss: 3.406515598297119


training:  77%|███████▋  | 8463/10986 [3:17:33<1:03:43,  1.52s/it]

training loss: 3.344529867172241


training:  77%|███████▋  | 8464/10986 [3:17:35<1:00:27,  1.44s/it]

training loss: 3.418766736984253


training:  77%|███████▋  | 8465/10986 [3:17:36<58:21,  1.39s/it]  

training loss: 3.4234910011291504


training:  77%|███████▋  | 8466/10986 [3:17:37<56:43,  1.35s/it]

training loss: 3.2963666915893555


training:  77%|███████▋  | 8467/10986 [3:17:39<55:51,  1.33s/it]

training loss: 3.3096344470977783


training:  77%|███████▋  | 8468/10986 [3:17:40<55:06,  1.31s/it]

training loss: 3.4130005836486816


training:  77%|███████▋  | 8469/10986 [3:17:41<54:49,  1.31s/it]

training loss: 3.39469051361084


training:  77%|███████▋  | 8470/10986 [3:17:42<54:17,  1.29s/it]

training loss: 3.3766486644744873


training:  77%|███████▋  | 8471/10986 [3:17:44<57:53,  1.38s/it]

training loss: 3.5490825176239014


training:  77%|███████▋  | 8472/10986 [3:17:45<59:53,  1.43s/it]

training loss: 3.352896213531494


training:  77%|███████▋  | 8473/10986 [3:17:47<58:10,  1.39s/it]

training loss: 3.4347429275512695


training:  77%|███████▋  | 8474/10986 [3:17:48<56:43,  1.35s/it]

training loss: 3.478574514389038


training:  77%|███████▋  | 8475/10986 [3:17:49<55:52,  1.34s/it]

training loss: 3.3913350105285645


training:  77%|███████▋  | 8476/10986 [3:17:51<55:21,  1.32s/it]

training loss: 3.2995007038116455


training:  77%|███████▋  | 8477/10986 [3:17:52<55:02,  1.32s/it]

training loss: 3.4042797088623047


training:  77%|███████▋  | 8478/10986 [3:17:53<54:19,  1.30s/it]

training loss: 3.50447416305542


training:  77%|███████▋  | 8479/10986 [3:17:55<54:47,  1.31s/it]

training loss: 3.382481813430786


training:  77%|███████▋  | 8480/10986 [3:17:56<54:30,  1.31s/it]

training loss: 3.4150390625
valid loss: 3.4128706455230713
perplexity: 30.352251052856445


training:  77%|███████▋  | 8481/10986 [3:17:59<1:13:19,  1.76s/it]

training loss: 3.2952518463134766


training:  77%|███████▋  | 8482/10986 [3:18:00<1:08:09,  1.63s/it]

training loss: 3.5103261470794678


training:  77%|███████▋  | 8483/10986 [3:18:01<1:04:05,  1.54s/it]

training loss: 3.3220086097717285


training:  77%|███████▋  | 8484/10986 [3:18:03<1:01:10,  1.47s/it]

training loss: 3.5164883136749268


training:  77%|███████▋  | 8485/10986 [3:18:04<58:52,  1.41s/it]  

training loss: 3.3072896003723145


training:  77%|███████▋  | 8486/10986 [3:18:05<57:31,  1.38s/it]

training loss: 3.505666971206665


training:  77%|███████▋  | 8487/10986 [3:18:06<56:30,  1.36s/it]

training loss: 3.35929012298584


training:  77%|███████▋  | 8488/10986 [3:18:08<55:48,  1.34s/it]

training loss: 3.309248924255371


training:  77%|███████▋  | 8489/10986 [3:18:09<54:51,  1.32s/it]

training loss: 3.355696201324463


training:  77%|███████▋  | 8490/10986 [3:18:10<54:15,  1.30s/it]

training loss: 3.294583320617676


training:  77%|███████▋  | 8491/10986 [3:18:12<57:35,  1.38s/it]

training loss: 3.353522777557373


training:  77%|███████▋  | 8492/10986 [3:18:13<56:55,  1.37s/it]

training loss: 3.422161340713501


training:  77%|███████▋  | 8493/10986 [3:18:15<55:55,  1.35s/it]

training loss: 3.402527093887329


training:  77%|███████▋  | 8494/10986 [3:18:16<54:59,  1.32s/it]

training loss: 3.443173408508301


training:  77%|███████▋  | 8495/10986 [3:18:17<55:00,  1.32s/it]

training loss: 3.414790630340576


training:  77%|███████▋  | 8496/10986 [3:18:18<54:40,  1.32s/it]

training loss: 3.3929505348205566


training:  77%|███████▋  | 8497/10986 [3:18:20<54:10,  1.31s/it]

training loss: 3.4942679405212402


training:  77%|███████▋  | 8498/10986 [3:18:21<53:48,  1.30s/it]

training loss: 3.4475724697113037


training:  77%|███████▋  | 8499/10986 [3:18:22<53:49,  1.30s/it]

training loss: 3.3975682258605957


training:  77%|███████▋  | 8500/10986 [3:18:24<53:45,  1.30s/it]

training loss: 3.4131641387939453
valid loss: 3.4206976890563965
perplexity: 30.590749740600586


training:  77%|███████▋  | 8501/10986 [3:18:26<1:12:35,  1.75s/it]

training loss: 3.329277515411377


training:  77%|███████▋  | 8502/10986 [3:18:28<1:10:54,  1.71s/it]

training loss: 3.3658246994018555


training:  77%|███████▋  | 8503/10986 [3:18:29<1:06:04,  1.60s/it]

training loss: 3.361974000930786


training:  77%|███████▋  | 8504/10986 [3:18:31<1:02:08,  1.50s/it]

training loss: 3.4674251079559326


training:  77%|███████▋  | 8505/10986 [3:18:32<59:49,  1.45s/it]  

training loss: 3.3867132663726807


training:  77%|███████▋  | 8506/10986 [3:18:33<57:47,  1.40s/it]

training loss: 3.4275131225585938


training:  77%|███████▋  | 8507/10986 [3:18:35<56:25,  1.37s/it]

training loss: 3.454540252685547


training:  77%|███████▋  | 8508/10986 [3:18:36<55:22,  1.34s/it]

training loss: 3.3787364959716797


training:  77%|███████▋  | 8509/10986 [3:18:37<54:39,  1.32s/it]

training loss: 3.3842575550079346


training:  77%|███████▋  | 8510/10986 [3:18:38<54:05,  1.31s/it]

training loss: 3.3421642780303955


training:  77%|███████▋  | 8511/10986 [3:18:40<57:03,  1.38s/it]

training loss: 3.246631145477295


training:  77%|███████▋  | 8512/10986 [3:18:41<56:14,  1.36s/it]

training loss: 3.514122486114502


training:  77%|███████▋  | 8513/10986 [3:18:43<55:12,  1.34s/it]

training loss: 3.401294708251953


training:  77%|███████▋  | 8514/10986 [3:18:44<54:26,  1.32s/it]

training loss: 3.447929859161377


training:  78%|███████▊  | 8515/10986 [3:18:45<53:55,  1.31s/it]

training loss: 3.428971290588379


training:  78%|███████▊  | 8516/10986 [3:18:46<53:40,  1.30s/it]

training loss: 3.4304776191711426


training:  78%|███████▊  | 8517/10986 [3:18:48<54:19,  1.32s/it]

training loss: 3.4137701988220215


training:  78%|███████▊  | 8518/10986 [3:18:49<53:48,  1.31s/it]

training loss: 3.2954907417297363


training:  78%|███████▊  | 8519/10986 [3:18:50<53:36,  1.30s/it]

training loss: 3.5370192527770996


training:  78%|███████▊  | 8520/10986 [3:18:52<53:31,  1.30s/it]

training loss: 3.424887180328369
valid loss: 3.4170145988464355
perplexity: 30.478290557861328


training:  78%|███████▊  | 8521/10986 [3:18:54<1:11:41,  1.75s/it]

training loss: 3.3568854331970215


training:  78%|███████▊  | 8522/10986 [3:18:56<1:10:17,  1.71s/it]

training loss: 3.4704477787017822


training:  78%|███████▊  | 8523/10986 [3:18:57<1:05:21,  1.59s/it]

training loss: 3.4262399673461914


training:  78%|███████▊  | 8524/10986 [3:18:59<1:01:46,  1.51s/it]

training loss: 3.5084197521209717


training:  78%|███████▊  | 8525/10986 [3:19:00<59:05,  1.44s/it]  

training loss: 3.4490106105804443


training:  78%|███████▊  | 8526/10986 [3:19:01<57:06,  1.39s/it]

training loss: 3.424553394317627


training:  78%|███████▊  | 8527/10986 [3:19:03<55:56,  1.36s/it]

training loss: 3.3743088245391846


training:  78%|███████▊  | 8528/10986 [3:19:04<54:56,  1.34s/it]

training loss: 3.3583595752716064


training:  78%|███████▊  | 8529/10986 [3:19:05<54:15,  1.32s/it]

training loss: 3.4089574813842773


training:  78%|███████▊  | 8530/10986 [3:19:06<53:59,  1.32s/it]

training loss: 3.4639573097229004


training:  78%|███████▊  | 8531/10986 [3:19:08<57:04,  1.40s/it]

training loss: 3.4244515895843506


training:  78%|███████▊  | 8532/10986 [3:19:09<56:20,  1.38s/it]

training loss: 3.4752590656280518


training:  78%|███████▊  | 8533/10986 [3:19:11<55:25,  1.36s/it]

training loss: 3.3987390995025635


training:  78%|███████▊  | 8534/10986 [3:19:12<54:33,  1.33s/it]

training loss: 3.3773372173309326


training:  78%|███████▊  | 8535/10986 [3:19:13<54:04,  1.32s/it]

training loss: 3.4759981632232666


training:  78%|███████▊  | 8536/10986 [3:19:15<54:04,  1.32s/it]

training loss: 3.4715616703033447


training:  78%|███████▊  | 8537/10986 [3:19:16<53:47,  1.32s/it]

training loss: 3.42295241355896


training:  78%|███████▊  | 8538/10986 [3:19:17<54:15,  1.33s/it]

training loss: 3.4270148277282715


training:  78%|███████▊  | 8539/10986 [3:19:18<54:13,  1.33s/it]

training loss: 3.370516538619995


training:  78%|███████▊  | 8540/10986 [3:19:20<53:50,  1.32s/it]

training loss: 3.3595364093780518
valid loss: 3.356804132461548
perplexity: 28.697330474853516


training:  78%|███████▊  | 8541/10986 [3:19:23<1:11:40,  1.76s/it]

training loss: 3.455095052719116


training:  78%|███████▊  | 8542/10986 [3:19:24<1:07:11,  1.65s/it]

training loss: 3.3852176666259766


training:  78%|███████▊  | 8543/10986 [3:19:25<1:03:12,  1.55s/it]

training loss: 3.384422779083252


training:  78%|███████▊  | 8544/10986 [3:19:27<1:00:13,  1.48s/it]

training loss: 3.428628921508789


training:  78%|███████▊  | 8545/10986 [3:19:28<58:25,  1.44s/it]  

training loss: 3.359182119369507


training:  78%|███████▊  | 8546/10986 [3:19:29<56:51,  1.40s/it]

training loss: 3.5497329235076904


training:  78%|███████▊  | 8547/10986 [3:19:31<55:29,  1.37s/it]

training loss: 3.504390239715576


training:  78%|███████▊  | 8548/10986 [3:19:32<54:57,  1.35s/it]

training loss: 3.4153358936309814


training:  78%|███████▊  | 8549/10986 [3:19:33<54:13,  1.34s/it]

training loss: 3.5122809410095215


training:  78%|███████▊  | 8550/10986 [3:19:34<54:01,  1.33s/it]

training loss: 3.5622382164001465


training:  78%|███████▊  | 8551/10986 [3:19:36<57:09,  1.41s/it]

training loss: 3.433406114578247


training:  78%|███████▊  | 8552/10986 [3:19:37<55:51,  1.38s/it]

training loss: 3.3765370845794678


training:  78%|███████▊  | 8553/10986 [3:19:39<54:53,  1.35s/it]

training loss: 3.3649394512176514


training:  78%|███████▊  | 8554/10986 [3:19:40<54:04,  1.33s/it]

training loss: 3.411438226699829


training:  78%|███████▊  | 8555/10986 [3:19:41<53:35,  1.32s/it]

training loss: 3.434706926345825


training:  78%|███████▊  | 8556/10986 [3:19:43<53:28,  1.32s/it]

training loss: 3.366293430328369


training:  78%|███████▊  | 8557/10986 [3:19:44<53:10,  1.31s/it]

training loss: 3.3554904460906982


training:  78%|███████▊  | 8558/10986 [3:19:45<53:15,  1.32s/it]

training loss: 3.472754716873169


training:  78%|███████▊  | 8559/10986 [3:19:47<53:08,  1.31s/it]

training loss: 3.4367926120758057


training:  78%|███████▊  | 8560/10986 [3:19:48<53:33,  1.32s/it]

training loss: 3.5082058906555176
valid loss: 3.4996445178985596
perplexity: 33.10367965698242


training:  78%|███████▊  | 8561/10986 [3:19:51<1:11:33,  1.77s/it]

training loss: 3.378690481185913


training:  78%|███████▊  | 8562/10986 [3:19:52<1:09:45,  1.73s/it]

training loss: 3.4670026302337646


training:  78%|███████▊  | 8563/10986 [3:19:54<1:04:35,  1.60s/it]

training loss: 3.377073287963867


training:  78%|███████▊  | 8564/10986 [3:19:55<1:01:11,  1.52s/it]

training loss: 3.41025447845459


training:  78%|███████▊  | 8565/10986 [3:19:56<58:08,  1.44s/it]  

training loss: 3.410019874572754


training:  78%|███████▊  | 8566/10986 [3:19:57<56:26,  1.40s/it]

training loss: 3.426624298095703


training:  78%|███████▊  | 8567/10986 [3:19:59<55:38,  1.38s/it]

training loss: 3.497415065765381


training:  78%|███████▊  | 8568/10986 [3:20:00<54:48,  1.36s/it]

training loss: 3.391253709793091


training:  78%|███████▊  | 8569/10986 [3:20:01<54:19,  1.35s/it]

training loss: 3.4286482334136963


training:  78%|███████▊  | 8570/10986 [3:20:03<53:41,  1.33s/it]

training loss: 3.4263508319854736


training:  78%|███████▊  | 8571/10986 [3:20:04<56:56,  1.41s/it]

training loss: 3.5316007137298584


training:  78%|███████▊  | 8572/10986 [3:20:06<56:13,  1.40s/it]

training loss: 3.2712018489837646


training:  78%|███████▊  | 8573/10986 [3:20:07<55:02,  1.37s/it]

training loss: 3.4272377490997314


training:  78%|███████▊  | 8574/10986 [3:20:08<54:17,  1.35s/it]

training loss: 3.423996686935425


training:  78%|███████▊  | 8575/10986 [3:20:10<53:56,  1.34s/it]

training loss: 3.4146728515625


training:  78%|███████▊  | 8576/10986 [3:20:11<53:33,  1.33s/it]

training loss: 3.4004781246185303


training:  78%|███████▊  | 8577/10986 [3:20:12<53:20,  1.33s/it]

training loss: 3.387047290802002


training:  78%|███████▊  | 8578/10986 [3:20:14<53:04,  1.32s/it]

training loss: 3.4083003997802734


training:  78%|███████▊  | 8579/10986 [3:20:15<52:47,  1.32s/it]

training loss: 3.553913116455078


training:  78%|███████▊  | 8580/10986 [3:20:16<52:38,  1.31s/it]

training loss: 3.4960219860076904
valid loss: 3.4857707023620605
perplexity: 32.647579193115234


training:  78%|███████▊  | 8581/10986 [3:20:19<1:11:16,  1.78s/it]

training loss: 3.395913600921631


training:  78%|███████▊  | 8582/10986 [3:20:20<1:06:42,  1.66s/it]

training loss: 3.361968517303467


training:  78%|███████▊  | 8583/10986 [3:20:22<1:02:19,  1.56s/it]

training loss: 3.3943002223968506


training:  78%|███████▊  | 8584/10986 [3:20:23<59:14,  1.48s/it]  

training loss: 3.4125277996063232


training:  78%|███████▊  | 8585/10986 [3:20:24<57:13,  1.43s/it]

training loss: 3.342449188232422


training:  78%|███████▊  | 8586/10986 [3:20:26<55:28,  1.39s/it]

training loss: 3.3505420684814453


training:  78%|███████▊  | 8587/10986 [3:20:27<54:17,  1.36s/it]

training loss: 3.377936840057373


training:  78%|███████▊  | 8588/10986 [3:20:28<53:17,  1.33s/it]

training loss: 3.3203322887420654


training:  78%|███████▊  | 8589/10986 [3:20:30<52:43,  1.32s/it]

training loss: 3.432379722595215


training:  78%|███████▊  | 8590/10986 [3:20:31<52:19,  1.31s/it]

training loss: 3.4088735580444336


training:  78%|███████▊  | 8591/10986 [3:20:32<55:54,  1.40s/it]

training loss: 3.445814371109009


training:  78%|███████▊  | 8592/10986 [3:20:34<55:03,  1.38s/it]

training loss: 3.431485891342163


training:  78%|███████▊  | 8593/10986 [3:20:35<53:58,  1.35s/it]

training loss: 3.3454666137695312


training:  78%|███████▊  | 8594/10986 [3:20:36<53:03,  1.33s/it]

training loss: 3.5119218826293945


training:  78%|███████▊  | 8595/10986 [3:20:38<52:28,  1.32s/it]

training loss: 3.3425545692443848


training:  78%|███████▊  | 8596/10986 [3:20:39<52:10,  1.31s/it]

training loss: 3.2874767780303955


training:  78%|███████▊  | 8597/10986 [3:20:40<52:14,  1.31s/it]

training loss: 3.440934658050537


training:  78%|███████▊  | 8598/10986 [3:20:42<52:14,  1.31s/it]

training loss: 3.305354118347168


training:  78%|███████▊  | 8599/10986 [3:20:43<52:11,  1.31s/it]

training loss: 3.332225799560547


training:  78%|███████▊  | 8600/10986 [3:20:44<53:08,  1.34s/it]

training loss: 3.398798704147339
valid loss: 3.3969388008117676
perplexity: 29.872512817382812


training:  78%|███████▊  | 8601/10986 [3:20:48<1:20:18,  2.02s/it]

training loss: 3.4027926921844482


training:  78%|███████▊  | 8602/10986 [3:20:49<1:13:07,  1.84s/it]

training loss: 3.4610297679901123


training:  78%|███████▊  | 8603/10986 [3:20:51<1:07:02,  1.69s/it]

training loss: 3.4402754306793213


training:  78%|███████▊  | 8604/10986 [3:20:52<1:02:26,  1.57s/it]

training loss: 3.4088876247406006


training:  78%|███████▊  | 8605/10986 [3:20:53<59:47,  1.51s/it]  

training loss: 3.36487078666687


training:  78%|███████▊  | 8606/10986 [3:20:55<57:33,  1.45s/it]

training loss: 3.4198458194732666


training:  78%|███████▊  | 8607/10986 [3:20:56<55:33,  1.40s/it]

training loss: 3.359982967376709


training:  78%|███████▊  | 8608/10986 [3:20:57<54:21,  1.37s/it]

training loss: 3.392648220062256


training:  78%|███████▊  | 8609/10986 [3:20:58<53:32,  1.35s/it]

training loss: 3.3557145595550537


training:  78%|███████▊  | 8610/10986 [3:21:00<52:50,  1.33s/it]

training loss: 3.453382968902588


training:  78%|███████▊  | 8611/10986 [3:21:01<55:48,  1.41s/it]

training loss: 3.486853837966919


training:  78%|███████▊  | 8612/10986 [3:21:03<54:53,  1.39s/it]

training loss: 3.425405263900757


training:  78%|███████▊  | 8613/10986 [3:21:04<53:51,  1.36s/it]

training loss: 3.3983993530273438


training:  78%|███████▊  | 8614/10986 [3:21:05<53:05,  1.34s/it]

training loss: 3.394192934036255


training:  78%|███████▊  | 8615/10986 [3:21:07<52:21,  1.33s/it]

training loss: 3.439326047897339


training:  78%|███████▊  | 8616/10986 [3:21:08<52:05,  1.32s/it]

training loss: 3.3753936290740967


training:  78%|███████▊  | 8617/10986 [3:21:09<51:55,  1.31s/it]

training loss: 3.3215885162353516


training:  78%|███████▊  | 8618/10986 [3:21:10<51:39,  1.31s/it]

training loss: 3.462838888168335


training:  78%|███████▊  | 8619/10986 [3:21:12<51:21,  1.30s/it]

training loss: 3.4356331825256348


training:  78%|███████▊  | 8620/10986 [3:21:13<51:33,  1.31s/it]

training loss: 3.3941268920898438
valid loss: 3.393009662628174
perplexity: 29.75537109375


training:  78%|███████▊  | 8621/10986 [3:21:16<1:08:58,  1.75s/it]

training loss: 3.339992046356201


training:  78%|███████▊  | 8622/10986 [3:21:17<1:04:37,  1.64s/it]

training loss: 3.4459550380706787


training:  78%|███████▊  | 8623/10986 [3:21:19<1:01:15,  1.56s/it]

training loss: 3.435635805130005


training:  78%|███████▊  | 8624/10986 [3:21:20<58:10,  1.48s/it]  

training loss: 3.3880650997161865


training:  79%|███████▊  | 8625/10986 [3:21:21<56:11,  1.43s/it]

training loss: 3.4811251163482666


training:  79%|███████▊  | 8626/10986 [3:21:22<54:35,  1.39s/it]

training loss: 3.299644947052002


training:  79%|███████▊  | 8627/10986 [3:21:24<53:28,  1.36s/it]

training loss: 3.3659489154815674


training:  79%|███████▊  | 8628/10986 [3:21:25<53:11,  1.35s/it]

training loss: 3.3335187435150146


training:  79%|███████▊  | 8629/10986 [3:21:26<52:22,  1.33s/it]

training loss: 3.352367877960205


training:  79%|███████▊  | 8630/10986 [3:21:28<51:52,  1.32s/it]

training loss: 3.424604654312134


training:  79%|███████▊  | 8631/10986 [3:21:29<55:14,  1.41s/it]

training loss: 3.4224014282226562


training:  79%|███████▊  | 8632/10986 [3:21:31<54:03,  1.38s/it]

training loss: 3.4407670497894287


training:  79%|███████▊  | 8633/10986 [3:21:32<53:22,  1.36s/it]

training loss: 3.3505704402923584


training:  79%|███████▊  | 8634/10986 [3:21:33<52:36,  1.34s/it]

training loss: 3.3666977882385254


training:  79%|███████▊  | 8635/10986 [3:21:35<52:13,  1.33s/it]

training loss: 3.3590831756591797


training:  79%|███████▊  | 8636/10986 [3:21:36<51:46,  1.32s/it]

training loss: 3.4997146129608154


training:  79%|███████▊  | 8637/10986 [3:21:37<51:36,  1.32s/it]

training loss: 3.336343765258789


training:  79%|███████▊  | 8638/10986 [3:21:39<51:46,  1.32s/it]

training loss: 3.320744752883911


training:  79%|███████▊  | 8639/10986 [3:21:40<51:29,  1.32s/it]

training loss: 3.397580146789551


training:  79%|███████▊  | 8640/10986 [3:21:41<51:25,  1.32s/it]

training loss: 3.307644844055176
valid loss: 3.306478977203369
perplexity: 27.288869857788086


training:  79%|███████▊  | 8641/10986 [3:21:44<1:08:57,  1.76s/it]

training loss: 3.391007900238037


training:  79%|███████▊  | 8642/10986 [3:21:45<1:04:47,  1.66s/it]

training loss: 3.392084836959839


training:  79%|███████▊  | 8643/10986 [3:21:47<1:00:48,  1.56s/it]

training loss: 3.407087802886963


training:  79%|███████▊  | 8644/10986 [3:21:48<58:09,  1.49s/it]  

training loss: 3.292240619659424


training:  79%|███████▊  | 8645/10986 [3:21:49<56:15,  1.44s/it]

training loss: 3.3372366428375244


training:  79%|███████▊  | 8646/10986 [3:21:51<54:27,  1.40s/it]

training loss: 3.263030529022217


training:  79%|███████▊  | 8647/10986 [3:21:52<53:31,  1.37s/it]

training loss: 3.3405792713165283


training:  79%|███████▊  | 8648/10986 [3:21:53<52:41,  1.35s/it]

training loss: 3.519731283187866


training:  79%|███████▊  | 8649/10986 [3:21:55<52:26,  1.35s/it]

training loss: 3.357104778289795


training:  79%|███████▊  | 8650/10986 [3:21:56<51:39,  1.33s/it]

training loss: 3.378176212310791


training:  79%|███████▊  | 8651/10986 [3:21:57<55:01,  1.41s/it]

training loss: 3.4961507320404053


training:  79%|███████▉  | 8652/10986 [3:21:59<56:51,  1.46s/it]

training loss: 3.387881278991699


training:  79%|███████▉  | 8653/10986 [3:22:00<55:14,  1.42s/it]

training loss: 3.4706802368164062


training:  79%|███████▉  | 8654/10986 [3:22:02<54:00,  1.39s/it]

training loss: 3.398512363433838


training:  79%|███████▉  | 8655/10986 [3:22:03<53:03,  1.37s/it]

training loss: 3.3689401149749756


training:  79%|███████▉  | 8656/10986 [3:22:04<52:17,  1.35s/it]

training loss: 3.4227523803710938


training:  79%|███████▉  | 8657/10986 [3:22:06<52:00,  1.34s/it]

training loss: 3.396164655685425


training:  79%|███████▉  | 8658/10986 [3:22:07<51:40,  1.33s/it]

training loss: 3.343647003173828


training:  79%|███████▉  | 8659/10986 [3:22:08<51:31,  1.33s/it]

training loss: 3.4753787517547607


training:  79%|███████▉  | 8660/10986 [3:22:10<51:25,  1.33s/it]

training loss: 3.4236972332000732
valid loss: 3.4233713150024414
perplexity: 30.672649383544922


training:  79%|███████▉  | 8661/10986 [3:22:12<1:08:39,  1.77s/it]

training loss: 3.4157681465148926


training:  79%|███████▉  | 8662/10986 [3:22:14<1:04:03,  1.65s/it]

training loss: 3.404330015182495


training:  79%|███████▉  | 8663/10986 [3:22:15<1:00:08,  1.55s/it]

training loss: 3.387768030166626


training:  79%|███████▉  | 8664/10986 [3:22:16<57:10,  1.48s/it]  

training loss: 3.5455386638641357


training:  79%|███████▉  | 8665/10986 [3:22:18<55:15,  1.43s/it]

training loss: 3.326730966567993


training:  79%|███████▉  | 8666/10986 [3:22:19<54:02,  1.40s/it]

training loss: 3.3287200927734375


training:  79%|███████▉  | 8667/10986 [3:22:20<53:18,  1.38s/it]

training loss: 3.2919278144836426


training:  79%|███████▉  | 8668/10986 [3:22:22<52:22,  1.36s/it]

training loss: 3.3736462593078613


training:  79%|███████▉  | 8669/10986 [3:22:23<51:50,  1.34s/it]

training loss: 3.589801788330078


training:  79%|███████▉  | 8670/10986 [3:22:24<51:30,  1.33s/it]

training loss: 3.3874902725219727


training:  79%|███████▉  | 8671/10986 [3:22:26<54:46,  1.42s/it]

training loss: 3.366387367248535


training:  79%|███████▉  | 8672/10986 [3:22:27<53:39,  1.39s/it]

training loss: 3.411757707595825


training:  79%|███████▉  | 8673/10986 [3:22:29<52:36,  1.36s/it]

training loss: 3.385101079940796


training:  79%|███████▉  | 8674/10986 [3:22:30<51:40,  1.34s/it]

training loss: 3.4786224365234375


training:  79%|███████▉  | 8675/10986 [3:22:31<51:17,  1.33s/it]

training loss: 3.3236477375030518


training:  79%|███████▉  | 8676/10986 [3:22:32<51:08,  1.33s/it]

training loss: 3.498741626739502


training:  79%|███████▉  | 8677/10986 [3:22:34<50:42,  1.32s/it]

training loss: 3.504575729370117


training:  79%|███████▉  | 8678/10986 [3:22:35<50:36,  1.32s/it]

training loss: 3.4822041988372803


training:  79%|███████▉  | 8679/10986 [3:22:36<50:26,  1.31s/it]

training loss: 3.4174251556396484


training:  79%|███████▉  | 8680/10986 [3:22:38<50:14,  1.31s/it]

training loss: 3.5543642044067383
valid loss: 3.5528998374938965
perplexity: 34.9144172668457


training:  79%|███████▉  | 8681/10986 [3:22:40<1:07:26,  1.76s/it]

training loss: 3.4596824645996094


training:  79%|███████▉  | 8682/10986 [3:22:42<1:06:04,  1.72s/it]

training loss: 3.347970962524414


training:  79%|███████▉  | 8683/10986 [3:22:43<1:01:30,  1.60s/it]

training loss: 3.308612585067749


training:  79%|███████▉  | 8684/10986 [3:22:45<57:45,  1.51s/it]  

training loss: 3.4690651893615723


training:  79%|███████▉  | 8685/10986 [3:22:46<55:30,  1.45s/it]

training loss: 3.4175846576690674


training:  79%|███████▉  | 8686/10986 [3:22:47<53:48,  1.40s/it]

training loss: 3.437082290649414


training:  79%|███████▉  | 8687/10986 [3:22:49<52:43,  1.38s/it]

training loss: 3.4290249347686768


training:  79%|███████▉  | 8688/10986 [3:22:50<52:28,  1.37s/it]

training loss: 3.3889546394348145


training:  79%|███████▉  | 8689/10986 [3:22:51<51:58,  1.36s/it]

training loss: 3.416109561920166


training:  79%|███████▉  | 8690/10986 [3:22:53<51:31,  1.35s/it]

training loss: 3.4823687076568604


training:  79%|███████▉  | 8691/10986 [3:22:54<54:15,  1.42s/it]

training loss: 3.3357083797454834


training:  79%|███████▉  | 8692/10986 [3:22:56<56:03,  1.47s/it]

training loss: 3.4288671016693115


training:  79%|███████▉  | 8693/10986 [3:22:57<54:10,  1.42s/it]

training loss: 3.4480719566345215


training:  79%|███████▉  | 8694/10986 [3:22:58<52:47,  1.38s/it]

training loss: 3.467195987701416


training:  79%|███████▉  | 8695/10986 [3:23:00<52:02,  1.36s/it]

training loss: 3.332045316696167


training:  79%|███████▉  | 8696/10986 [3:23:01<51:27,  1.35s/it]

training loss: 3.4652295112609863


training:  79%|███████▉  | 8697/10986 [3:23:02<51:08,  1.34s/it]

training loss: 3.4088683128356934


training:  79%|███████▉  | 8698/10986 [3:23:04<50:53,  1.33s/it]

training loss: 3.382007598876953


training:  79%|███████▉  | 8699/10986 [3:23:05<50:46,  1.33s/it]

training loss: 3.490185499191284


training:  79%|███████▉  | 8700/10986 [3:23:06<50:30,  1.33s/it]

training loss: 3.4292945861816406
valid loss: 3.4285311698913574
perplexity: 30.831323623657227


training:  79%|███████▉  | 8701/10986 [3:23:09<1:07:16,  1.77s/it]

training loss: 3.4850339889526367


training:  79%|███████▉  | 8702/10986 [3:23:11<1:05:48,  1.73s/it]

training loss: 3.379396438598633


training:  79%|███████▉  | 8703/10986 [3:23:12<1:01:22,  1.61s/it]

training loss: 3.4858593940734863


training:  79%|███████▉  | 8704/10986 [3:23:13<57:59,  1.52s/it]  

training loss: 3.4174811840057373


training:  79%|███████▉  | 8705/10986 [3:23:15<55:42,  1.47s/it]

training loss: 3.5142242908477783


training:  79%|███████▉  | 8706/10986 [3:23:16<53:45,  1.41s/it]

training loss: 3.4063282012939453


training:  79%|███████▉  | 8707/10986 [3:23:17<52:28,  1.38s/it]

training loss: 3.4264135360717773


training:  79%|███████▉  | 8708/10986 [3:23:19<51:29,  1.36s/it]

training loss: 3.635122537612915


training:  79%|███████▉  | 8709/10986 [3:23:20<51:27,  1.36s/it]

training loss: 3.4065091609954834


training:  79%|███████▉  | 8710/10986 [3:23:21<50:54,  1.34s/it]

training loss: 3.404247522354126


training:  79%|███████▉  | 8711/10986 [3:23:23<54:22,  1.43s/it]

training loss: 3.3964672088623047


training:  79%|███████▉  | 8712/10986 [3:23:24<53:33,  1.41s/it]

training loss: 3.4555132389068604


training:  79%|███████▉  | 8713/10986 [3:23:26<52:29,  1.39s/it]

training loss: 3.5140628814697266


training:  79%|███████▉  | 8714/10986 [3:23:27<52:00,  1.37s/it]

training loss: 3.3703417778015137


training:  79%|███████▉  | 8715/10986 [3:23:28<51:20,  1.36s/it]

training loss: 3.289936065673828


training:  79%|███████▉  | 8716/10986 [3:23:30<50:41,  1.34s/it]

training loss: 3.458634614944458


training:  79%|███████▉  | 8717/10986 [3:23:31<50:33,  1.34s/it]

training loss: 3.3699536323547363


training:  79%|███████▉  | 8718/10986 [3:23:32<50:31,  1.34s/it]

training loss: 3.33730411529541


training:  79%|███████▉  | 8719/10986 [3:23:34<50:15,  1.33s/it]

training loss: 3.3848514556884766


training:  79%|███████▉  | 8720/10986 [3:23:35<49:48,  1.32s/it]

training loss: 3.4660120010375977
valid loss: 3.464919328689575
perplexity: 31.973878860473633


training:  79%|███████▉  | 8721/10986 [3:23:38<1:06:54,  1.77s/it]

training loss: 3.29679274559021


training:  79%|███████▉  | 8722/10986 [3:23:39<1:02:27,  1.66s/it]

training loss: 3.368823289871216


training:  79%|███████▉  | 8723/10986 [3:23:40<58:23,  1.55s/it]  

training loss: 3.478227138519287


training:  79%|███████▉  | 8724/10986 [3:23:42<55:35,  1.47s/it]

training loss: 3.493596315383911


training:  79%|███████▉  | 8725/10986 [3:23:43<53:55,  1.43s/it]

training loss: 3.4761850833892822


training:  79%|███████▉  | 8726/10986 [3:23:44<52:35,  1.40s/it]

training loss: 3.310344696044922


training:  79%|███████▉  | 8727/10986 [3:23:46<51:24,  1.37s/it]

training loss: 3.4551734924316406


training:  79%|███████▉  | 8728/10986 [3:23:47<50:46,  1.35s/it]

training loss: 3.4579503536224365


training:  79%|███████▉  | 8729/10986 [3:23:48<50:34,  1.34s/it]

training loss: 3.550192356109619


training:  79%|███████▉  | 8730/10986 [3:23:50<49:58,  1.33s/it]

training loss: 3.3925206661224365


training:  79%|███████▉  | 8731/10986 [3:23:51<53:39,  1.43s/it]

training loss: 3.457961082458496


training:  79%|███████▉  | 8732/10986 [3:23:53<55:48,  1.49s/it]

training loss: 3.407090187072754


training:  79%|███████▉  | 8733/10986 [3:23:54<53:55,  1.44s/it]

training loss: 3.514054298400879


training:  80%|███████▉  | 8734/10986 [3:23:55<52:28,  1.40s/it]

training loss: 3.3383021354675293


training:  80%|███████▉  | 8735/10986 [3:23:57<51:11,  1.36s/it]

training loss: 3.3926000595092773


training:  80%|███████▉  | 8736/10986 [3:23:58<50:53,  1.36s/it]

training loss: 3.435368537902832


training:  80%|███████▉  | 8737/10986 [3:23:59<50:26,  1.35s/it]

training loss: 3.499591827392578


training:  80%|███████▉  | 8738/10986 [3:24:01<49:48,  1.33s/it]

training loss: 3.333415985107422


training:  80%|███████▉  | 8739/10986 [3:24:02<49:36,  1.32s/it]

training loss: 3.4125595092773438


training:  80%|███████▉  | 8740/10986 [3:24:03<49:32,  1.32s/it]

training loss: 3.408156156539917
valid loss: 3.408806800842285
perplexity: 30.22915267944336


training:  80%|███████▉  | 8741/10986 [3:24:06<1:05:57,  1.76s/it]

training loss: 3.377788782119751


training:  80%|███████▉  | 8742/10986 [3:24:08<1:04:23,  1.72s/it]

training loss: 3.315600872039795


training:  80%|███████▉  | 8743/10986 [3:24:09<59:45,  1.60s/it]  

training loss: 3.4676759243011475


training:  80%|███████▉  | 8744/10986 [3:24:10<56:38,  1.52s/it]

training loss: 3.5204598903656006


training:  80%|███████▉  | 8745/10986 [3:24:12<54:08,  1.45s/it]

training loss: 3.4680161476135254


training:  80%|███████▉  | 8746/10986 [3:24:13<52:39,  1.41s/it]

training loss: 3.3240339756011963


training:  80%|███████▉  | 8747/10986 [3:24:14<51:29,  1.38s/it]

training loss: 3.419991970062256


training:  80%|███████▉  | 8748/10986 [3:24:16<50:40,  1.36s/it]

training loss: 3.4591774940490723


training:  80%|███████▉  | 8749/10986 [3:24:17<50:08,  1.34s/it]

training loss: 3.4546890258789062


training:  80%|███████▉  | 8750/10986 [3:24:18<49:41,  1.33s/it]

training loss: 3.3624393939971924


training:  80%|███████▉  | 8751/10986 [3:24:20<52:43,  1.42s/it]

training loss: 3.4024336338043213


training:  80%|███████▉  | 8752/10986 [3:24:21<52:08,  1.40s/it]

training loss: 3.360989809036255


training:  80%|███████▉  | 8753/10986 [3:24:23<51:18,  1.38s/it]

training loss: 3.482579231262207


training:  80%|███████▉  | 8754/10986 [3:24:24<50:44,  1.36s/it]

training loss: 3.413086414337158


training:  80%|███████▉  | 8755/10986 [3:24:25<50:25,  1.36s/it]

training loss: 3.4801180362701416


training:  80%|███████▉  | 8756/10986 [3:24:27<49:49,  1.34s/it]

training loss: 3.479478597640991


training:  80%|███████▉  | 8757/10986 [3:24:28<49:37,  1.34s/it]

training loss: 3.3990461826324463


training:  80%|███████▉  | 8758/10986 [3:24:29<49:11,  1.32s/it]

training loss: 3.4128286838531494


training:  80%|███████▉  | 8759/10986 [3:24:30<49:04,  1.32s/it]

training loss: 3.4447221755981445


training:  80%|███████▉  | 8760/10986 [3:24:32<49:09,  1.33s/it]

training loss: 3.467039108276367
valid loss: 3.4643962383270264
perplexity: 31.95716094970703


training:  80%|███████▉  | 8761/10986 [3:24:35<1:05:54,  1.78s/it]

training loss: 3.2463130950927734


training:  80%|███████▉  | 8762/10986 [3:24:36<1:01:17,  1.65s/it]

training loss: 3.4277844429016113


training:  80%|███████▉  | 8763/10986 [3:24:37<57:25,  1.55s/it]  

training loss: 3.3298535346984863


training:  80%|███████▉  | 8764/10986 [3:24:39<54:51,  1.48s/it]

training loss: 3.499906301498413


training:  80%|███████▉  | 8765/10986 [3:24:40<53:00,  1.43s/it]

training loss: 3.3441128730773926


training:  80%|███████▉  | 8766/10986 [3:24:41<51:41,  1.40s/it]

training loss: 3.279663562774658


training:  80%|███████▉  | 8767/10986 [3:24:43<50:47,  1.37s/it]

training loss: 3.40724778175354


training:  80%|███████▉  | 8768/10986 [3:24:44<49:59,  1.35s/it]

training loss: 3.4574458599090576


training:  80%|███████▉  | 8769/10986 [3:24:45<49:35,  1.34s/it]

training loss: 3.3393466472625732


training:  80%|███████▉  | 8770/10986 [3:24:46<49:21,  1.34s/it]

training loss: 3.372598886489868


training:  80%|███████▉  | 8771/10986 [3:24:48<52:30,  1.42s/it]

training loss: 3.341547727584839


training:  80%|███████▉  | 8772/10986 [3:24:49<51:47,  1.40s/it]

training loss: 3.352839946746826


training:  80%|███████▉  | 8773/10986 [3:24:51<55:45,  1.51s/it]

training loss: 3.39892578125


training:  80%|███████▉  | 8774/10986 [3:24:53<58:33,  1.59s/it]

training loss: 3.447260618209839


training:  80%|███████▉  | 8775/10986 [3:24:55<57:32,  1.56s/it]

training loss: 3.358719825744629


training:  80%|███████▉  | 8776/10986 [3:24:56<54:45,  1.49s/it]

training loss: 3.4359400272369385


training:  80%|███████▉  | 8777/10986 [3:24:57<52:41,  1.43s/it]

training loss: 3.420924425125122


training:  80%|███████▉  | 8778/10986 [3:24:58<51:26,  1.40s/it]

training loss: 3.454896926879883


training:  80%|███████▉  | 8779/10986 [3:25:00<50:11,  1.36s/it]

training loss: 3.3287086486816406


training:  80%|███████▉  | 8780/10986 [3:25:01<49:48,  1.35s/it]

training loss: 3.4087562561035156
valid loss: 3.405860662460327
perplexity: 30.14022445678711


training:  80%|███████▉  | 8781/10986 [3:25:04<1:06:09,  1.80s/it]

training loss: 3.4897525310516357


training:  80%|███████▉  | 8782/10986 [3:25:05<1:01:36,  1.68s/it]

training loss: 3.4098260402679443


training:  80%|███████▉  | 8783/10986 [3:25:07<57:21,  1.56s/it]  

training loss: 3.4316892623901367


training:  80%|███████▉  | 8784/10986 [3:25:08<54:32,  1.49s/it]

training loss: 3.3982017040252686


training:  80%|███████▉  | 8785/10986 [3:25:09<52:31,  1.43s/it]

training loss: 3.4974594116210938


training:  80%|███████▉  | 8786/10986 [3:25:11<51:10,  1.40s/it]

training loss: 3.4614322185516357


training:  80%|███████▉  | 8787/10986 [3:25:12<50:11,  1.37s/it]

training loss: 3.352736234664917


training:  80%|███████▉  | 8788/10986 [3:25:13<49:44,  1.36s/it]

training loss: 3.48095703125


training:  80%|████████  | 8789/10986 [3:25:14<49:06,  1.34s/it]

training loss: 3.3786215782165527


training:  80%|████████  | 8790/10986 [3:25:16<48:33,  1.33s/it]

training loss: 3.440979242324829


training:  80%|████████  | 8791/10986 [3:25:17<51:53,  1.42s/it]

training loss: 3.4422247409820557


training:  80%|████████  | 8792/10986 [3:25:19<50:43,  1.39s/it]

training loss: 3.4820282459259033


training:  80%|████████  | 8793/10986 [3:25:20<49:48,  1.36s/it]

training loss: 3.4321906566619873


training:  80%|████████  | 8794/10986 [3:25:21<49:28,  1.35s/it]

training loss: 3.5759620666503906


training:  80%|████████  | 8795/10986 [3:25:23<49:11,  1.35s/it]

training loss: 3.403290271759033


training:  80%|████████  | 8796/10986 [3:25:24<48:37,  1.33s/it]

training loss: 3.3779585361480713


training:  80%|████████  | 8797/10986 [3:25:25<49:02,  1.34s/it]

training loss: 3.3435564041137695


training:  80%|████████  | 8798/10986 [3:25:27<48:39,  1.33s/it]

training loss: 3.4870941638946533


training:  80%|████████  | 8799/10986 [3:25:28<48:22,  1.33s/it]

training loss: 3.3750953674316406


training:  80%|████████  | 8800/10986 [3:25:29<48:21,  1.33s/it]

training loss: 3.3106298446655273
valid loss: 3.307727336883545
perplexity: 27.32295799255371


training:  80%|████████  | 8801/10986 [3:25:32<1:04:46,  1.78s/it]

training loss: 3.458998680114746


training:  80%|████████  | 8802/10986 [3:25:34<1:00:32,  1.66s/it]

training loss: 3.4752702713012695


training:  80%|████████  | 8803/10986 [3:25:35<56:38,  1.56s/it]  

training loss: 3.508329391479492


training:  80%|████████  | 8804/10986 [3:25:36<53:52,  1.48s/it]

training loss: 3.471006155014038


training:  80%|████████  | 8805/10986 [3:25:37<52:00,  1.43s/it]

training loss: 3.36450457572937


training:  80%|████████  | 8806/10986 [3:25:39<50:36,  1.39s/it]

training loss: 3.354088306427002


training:  80%|████████  | 8807/10986 [3:25:40<49:43,  1.37s/it]

training loss: 3.4657585620880127


training:  80%|████████  | 8808/10986 [3:25:41<49:12,  1.36s/it]

training loss: 3.3007800579071045


training:  80%|████████  | 8809/10986 [3:25:43<48:50,  1.35s/it]

training loss: 3.5025646686553955


training:  80%|████████  | 8810/10986 [3:25:44<48:25,  1.34s/it]

training loss: 3.254645347595215


training:  80%|████████  | 8811/10986 [3:25:46<51:34,  1.42s/it]

training loss: 3.4051191806793213


training:  80%|████████  | 8812/10986 [3:25:47<50:22,  1.39s/it]

training loss: 3.3162641525268555


training:  80%|████████  | 8813/10986 [3:25:48<49:28,  1.37s/it]

training loss: 3.5329978466033936


training:  80%|████████  | 8814/10986 [3:25:50<48:56,  1.35s/it]

training loss: 3.3926541805267334


training:  80%|████████  | 8815/10986 [3:25:51<48:29,  1.34s/it]

training loss: 3.4574968814849854


training:  80%|████████  | 8816/10986 [3:25:52<48:53,  1.35s/it]

training loss: 3.4612536430358887


training:  80%|████████  | 8817/10986 [3:25:54<48:41,  1.35s/it]

training loss: 3.422868251800537


training:  80%|████████  | 8818/10986 [3:25:55<48:31,  1.34s/it]

training loss: 3.472806453704834


training:  80%|████████  | 8819/10986 [3:25:56<48:16,  1.34s/it]

training loss: 3.4327921867370605


training:  80%|████████  | 8820/10986 [3:25:58<47:55,  1.33s/it]

training loss: 3.4101579189300537
valid loss: 3.406384229660034
perplexity: 30.1560115814209


training:  80%|████████  | 8821/10986 [3:26:00<1:04:06,  1.78s/it]

training loss: 3.3374695777893066


training:  80%|████████  | 8822/10986 [3:26:02<1:02:55,  1.74s/it]

training loss: 3.4455182552337646


training:  80%|████████  | 8823/10986 [3:26:03<58:20,  1.62s/it]  

training loss: 3.3643789291381836


training:  80%|████████  | 8824/10986 [3:26:05<55:00,  1.53s/it]

training loss: 3.5069451332092285


training:  80%|████████  | 8825/10986 [3:26:06<52:36,  1.46s/it]

training loss: 3.472259283065796


training:  80%|████████  | 8826/10986 [3:26:07<51:15,  1.42s/it]

training loss: 3.331904172897339


training:  80%|████████  | 8827/10986 [3:26:09<50:03,  1.39s/it]

training loss: 3.36887264251709


training:  80%|████████  | 8828/10986 [3:26:10<49:02,  1.36s/it]

training loss: 3.373239040374756


training:  80%|████████  | 8829/10986 [3:26:11<48:42,  1.36s/it]

training loss: 3.431337833404541


training:  80%|████████  | 8830/10986 [3:26:13<48:09,  1.34s/it]

training loss: 3.4486751556396484


training:  80%|████████  | 8831/10986 [3:26:14<50:42,  1.41s/it]

training loss: 3.3952407836914062


training:  80%|████████  | 8832/10986 [3:26:16<52:47,  1.47s/it]

training loss: 3.410001754760742


training:  80%|████████  | 8833/10986 [3:26:17<50:57,  1.42s/it]

training loss: 3.284616708755493


training:  80%|████████  | 8834/10986 [3:26:18<49:49,  1.39s/it]

training loss: 3.531252861022949


training:  80%|████████  | 8835/10986 [3:26:20<48:58,  1.37s/it]

training loss: 3.2801287174224854


training:  80%|████████  | 8836/10986 [3:26:21<48:22,  1.35s/it]

training loss: 3.53444766998291


training:  80%|████████  | 8837/10986 [3:26:22<47:57,  1.34s/it]

training loss: 3.487536668777466


training:  80%|████████  | 8838/10986 [3:26:24<47:41,  1.33s/it]

training loss: 3.378380298614502


training:  80%|████████  | 8839/10986 [3:26:25<47:43,  1.33s/it]

training loss: 3.6540865898132324


training:  80%|████████  | 8840/10986 [3:26:26<47:32,  1.33s/it]

training loss: 3.3486263751983643
valid loss: 3.346144199371338
perplexity: 28.39304542541504


training:  80%|████████  | 8841/10986 [3:26:29<1:04:10,  1.80s/it]

training loss: 3.3960936069488525


training:  80%|████████  | 8842/10986 [3:26:31<1:02:39,  1.75s/it]

training loss: 3.424771785736084


training:  80%|████████  | 8843/10986 [3:26:32<58:19,  1.63s/it]  

training loss: 3.4314401149749756


training:  81%|████████  | 8844/10986 [3:26:34<54:53,  1.54s/it]

training loss: 3.3816676139831543


training:  81%|████████  | 8845/10986 [3:26:35<52:26,  1.47s/it]

training loss: 3.403312921524048


training:  81%|████████  | 8846/10986 [3:26:36<51:00,  1.43s/it]

training loss: 3.394827365875244


training:  81%|████████  | 8847/10986 [3:26:37<49:50,  1.40s/it]

training loss: 3.500476837158203


training:  81%|████████  | 8848/10986 [3:26:39<49:13,  1.38s/it]

training loss: 3.4565587043762207


training:  81%|████████  | 8849/10986 [3:26:40<48:31,  1.36s/it]

training loss: 3.3264591693878174


training:  81%|████████  | 8850/10986 [3:26:41<47:59,  1.35s/it]

training loss: 3.4387073516845703


training:  81%|████████  | 8851/10986 [3:26:43<50:44,  1.43s/it]

training loss: 3.4077768325805664


training:  81%|████████  | 8852/10986 [3:26:44<49:50,  1.40s/it]

training loss: 3.5015060901641846


training:  81%|████████  | 8853/10986 [3:26:46<48:58,  1.38s/it]

training loss: 3.3065412044525146


training:  81%|████████  | 8854/10986 [3:26:47<48:28,  1.36s/it]

training loss: 3.3520350456237793


training:  81%|████████  | 8855/10986 [3:26:48<47:59,  1.35s/it]

training loss: 3.426917552947998


training:  81%|████████  | 8856/10986 [3:26:50<47:30,  1.34s/it]

training loss: 3.3689279556274414


training:  81%|████████  | 8857/10986 [3:26:51<47:12,  1.33s/it]

training loss: 3.376253604888916


training:  81%|████████  | 8858/10986 [3:26:52<47:11,  1.33s/it]

training loss: 3.4682140350341797


training:  81%|████████  | 8859/10986 [3:26:54<46:52,  1.32s/it]

training loss: 3.4498050212860107


training:  81%|████████  | 8860/10986 [3:26:55<47:19,  1.34s/it]

training loss: 3.383971929550171
valid loss: 3.3877201080322266
perplexity: 29.5983943939209


training:  81%|████████  | 8861/10986 [3:26:58<1:03:41,  1.80s/it]

training loss: 3.3776509761810303


training:  81%|████████  | 8862/10986 [3:26:59<59:21,  1.68s/it]  

training loss: 3.413052558898926


training:  81%|████████  | 8863/10986 [3:27:01<55:41,  1.57s/it]

training loss: 3.4180755615234375


training:  81%|████████  | 8864/10986 [3:27:02<53:04,  1.50s/it]

training loss: 3.399646282196045


training:  81%|████████  | 8865/10986 [3:27:03<51:02,  1.44s/it]

training loss: 3.469578266143799


training:  81%|████████  | 8866/10986 [3:27:05<49:41,  1.41s/it]

training loss: 3.4221670627593994


training:  81%|████████  | 8867/10986 [3:27:06<48:39,  1.38s/it]

training loss: 3.434479236602783


training:  81%|████████  | 8868/10986 [3:27:07<47:54,  1.36s/it]

training loss: 3.5023138523101807


training:  81%|████████  | 8869/10986 [3:27:09<47:30,  1.35s/it]

training loss: 3.4887869358062744


training:  81%|████████  | 8870/10986 [3:27:10<47:21,  1.34s/it]

training loss: 3.4194624423980713


training:  81%|████████  | 8871/10986 [3:27:11<49:56,  1.42s/it]

training loss: 3.578495979309082


training:  81%|████████  | 8872/10986 [3:27:13<49:03,  1.39s/it]

training loss: 3.6832385063171387


training:  81%|████████  | 8873/10986 [3:27:14<48:17,  1.37s/it]

training loss: 3.425954580307007


training:  81%|████████  | 8874/10986 [3:27:15<47:37,  1.35s/it]

training loss: 3.376511335372925


training:  81%|████████  | 8875/10986 [3:27:17<47:13,  1.34s/it]

training loss: 3.406869411468506


training:  81%|████████  | 8876/10986 [3:27:18<46:45,  1.33s/it]

training loss: 3.5402300357818604


training:  81%|████████  | 8877/10986 [3:27:19<46:23,  1.32s/it]

training loss: 3.350250244140625


training:  81%|████████  | 8878/10986 [3:27:21<46:06,  1.31s/it]

training loss: 3.360907554626465


training:  81%|████████  | 8879/10986 [3:27:22<46:12,  1.32s/it]

training loss: 3.4802780151367188


training:  81%|████████  | 8880/10986 [3:27:23<46:13,  1.32s/it]

training loss: 3.358776092529297
valid loss: 3.3540360927581787
perplexity: 28.618005752563477


training:  81%|████████  | 8881/10986 [3:27:26<1:02:32,  1.78s/it]

training loss: 3.353682041168213


training:  81%|████████  | 8882/10986 [3:27:28<1:00:52,  1.74s/it]

training loss: 3.329437494277954


training:  81%|████████  | 8883/10986 [3:27:29<56:36,  1.62s/it]  

training loss: 3.538654088973999


training:  81%|████████  | 8884/10986 [3:27:30<53:22,  1.52s/it]

training loss: 3.304105758666992


training:  81%|████████  | 8885/10986 [3:27:32<51:14,  1.46s/it]

training loss: 3.467911720275879


training:  81%|████████  | 8886/10986 [3:27:33<49:39,  1.42s/it]

training loss: 3.3924412727355957


training:  81%|████████  | 8887/10986 [3:27:34<48:35,  1.39s/it]

training loss: 3.4809999465942383


training:  81%|████████  | 8888/10986 [3:27:36<47:51,  1.37s/it]

training loss: 3.408834934234619


training:  81%|████████  | 8889/10986 [3:27:37<47:13,  1.35s/it]

training loss: 3.4383668899536133


training:  81%|████████  | 8890/10986 [3:27:38<46:54,  1.34s/it]

training loss: 3.397380828857422


training:  81%|████████  | 8891/10986 [3:27:40<49:38,  1.42s/it]

training loss: 3.347668170928955


training:  81%|████████  | 8892/10986 [3:27:41<48:22,  1.39s/it]

training loss: 3.4368793964385986


training:  81%|████████  | 8893/10986 [3:27:43<47:31,  1.36s/it]

training loss: 3.4806575775146484


training:  81%|████████  | 8894/10986 [3:27:44<46:59,  1.35s/it]

training loss: 3.3259787559509277


training:  81%|████████  | 8895/10986 [3:27:45<46:29,  1.33s/it]

training loss: 3.443324565887451


training:  81%|████████  | 8896/10986 [3:27:46<46:23,  1.33s/it]

training loss: 3.3392295837402344


training:  81%|████████  | 8897/10986 [3:27:48<46:02,  1.32s/it]

training loss: 3.336336851119995


training:  81%|████████  | 8898/10986 [3:27:49<45:50,  1.32s/it]

training loss: 3.3588600158691406


training:  81%|████████  | 8899/10986 [3:27:50<45:46,  1.32s/it]

training loss: 3.350604295730591


training:  81%|████████  | 8900/10986 [3:27:52<45:43,  1.32s/it]

training loss: 3.4047110080718994
valid loss: 3.4056432247161865
perplexity: 30.1336727142334


training:  81%|████████  | 8901/10986 [3:27:55<1:01:29,  1.77s/it]

training loss: 3.436955213546753


training:  81%|████████  | 8902/10986 [3:27:56<57:49,  1.66s/it]  

training loss: 3.44336199760437


training:  81%|████████  | 8903/10986 [3:27:57<54:21,  1.57s/it]

training loss: 3.4594521522521973


training:  81%|████████  | 8904/10986 [3:27:59<51:45,  1.49s/it]

training loss: 3.441556692123413


training:  81%|████████  | 8905/10986 [3:28:00<49:58,  1.44s/it]

training loss: 3.3624181747436523


training:  81%|████████  | 8906/10986 [3:28:01<48:31,  1.40s/it]

training loss: 3.440927743911743


training:  81%|████████  | 8907/10986 [3:28:03<47:33,  1.37s/it]

training loss: 3.382610321044922


training:  81%|████████  | 8908/10986 [3:28:04<46:47,  1.35s/it]

training loss: 3.449193000793457


training:  81%|████████  | 8909/10986 [3:28:05<46:24,  1.34s/it]

training loss: 3.36225962638855


training:  81%|████████  | 8910/10986 [3:28:06<46:03,  1.33s/it]

training loss: 3.2822513580322266


training:  81%|████████  | 8911/10986 [3:28:08<48:45,  1.41s/it]

training loss: 3.3958542346954346


training:  81%|████████  | 8912/10986 [3:28:10<50:26,  1.46s/it]

training loss: 3.525590181350708


training:  81%|████████  | 8913/10986 [3:28:11<48:43,  1.41s/it]

training loss: 3.3847107887268066


training:  81%|████████  | 8914/10986 [3:28:12<47:38,  1.38s/it]

training loss: 3.3770697116851807


training:  81%|████████  | 8915/10986 [3:28:14<47:06,  1.36s/it]

training loss: 3.3867053985595703


training:  81%|████████  | 8916/10986 [3:28:15<46:25,  1.35s/it]

training loss: 3.3875486850738525


training:  81%|████████  | 8917/10986 [3:28:16<45:55,  1.33s/it]

training loss: 3.3735055923461914


training:  81%|████████  | 8918/10986 [3:28:17<45:38,  1.32s/it]

training loss: 3.292874574661255


training:  81%|████████  | 8919/10986 [3:28:19<45:22,  1.32s/it]

training loss: 3.3477423191070557


training:  81%|████████  | 8920/10986 [3:28:20<45:04,  1.31s/it]

training loss: 3.3812997341156006
valid loss: 3.3780152797698975
perplexity: 29.312536239624023


training:  81%|████████  | 8921/10986 [3:28:23<1:00:26,  1.76s/it]

training loss: 3.2944252490997314


training:  81%|████████  | 8922/10986 [3:28:24<56:48,  1.65s/it]  

training loss: 3.442976474761963


training:  81%|████████  | 8923/10986 [3:28:26<53:30,  1.56s/it]

training loss: 3.422238349914551


training:  81%|████████  | 8924/10986 [3:28:27<51:42,  1.50s/it]

training loss: 3.3790321350097656


training:  81%|████████  | 8925/10986 [3:28:28<49:44,  1.45s/it]

training loss: 3.356767416000366


training:  81%|████████  | 8926/10986 [3:28:30<48:20,  1.41s/it]

training loss: 3.4132609367370605


training:  81%|████████▏ | 8927/10986 [3:28:31<47:15,  1.38s/it]

training loss: 3.4229588508605957


training:  81%|████████▏ | 8928/10986 [3:28:32<46:45,  1.36s/it]

training loss: 3.461669445037842


training:  81%|████████▏ | 8929/10986 [3:28:34<45:57,  1.34s/it]

training loss: 3.4040729999542236


training:  81%|████████▏ | 8930/10986 [3:28:35<45:24,  1.32s/it]

training loss: 3.527881622314453


training:  81%|████████▏ | 8931/10986 [3:28:36<48:16,  1.41s/it]

training loss: 3.424105644226074


training:  81%|████████▏ | 8932/10986 [3:28:38<49:45,  1.45s/it]

training loss: 3.4237985610961914


training:  81%|████████▏ | 8933/10986 [3:28:39<48:15,  1.41s/it]

training loss: 3.47507381439209


training:  81%|████████▏ | 8934/10986 [3:28:41<47:14,  1.38s/it]

training loss: 3.4097681045532227


training:  81%|████████▏ | 8935/10986 [3:28:42<46:23,  1.36s/it]

training loss: 3.2996790409088135


training:  81%|████████▏ | 8936/10986 [3:28:43<45:52,  1.34s/it]

training loss: 3.469668388366699


training:  81%|████████▏ | 8937/10986 [3:28:45<45:34,  1.33s/it]

training loss: 3.410386562347412


training:  81%|████████▏ | 8938/10986 [3:28:46<45:25,  1.33s/it]

training loss: 3.506873846054077


training:  81%|████████▏ | 8939/10986 [3:28:47<45:25,  1.33s/it]

training loss: 3.467055320739746


training:  81%|████████▏ | 8940/10986 [3:28:49<45:10,  1.32s/it]

training loss: 3.34645414352417
valid loss: 3.3426218032836914
perplexity: 28.293209075927734


training:  81%|████████▏ | 8941/10986 [3:28:51<1:00:22,  1.77s/it]

training loss: 3.383018732070923


training:  81%|████████▏ | 8942/10986 [3:28:53<58:52,  1.73s/it]  

training loss: 3.488152503967285


training:  81%|████████▏ | 8943/10986 [3:28:54<54:49,  1.61s/it]

training loss: 3.4673678874969482


training:  81%|████████▏ | 8944/10986 [3:28:56<51:25,  1.51s/it]

training loss: 3.385507583618164


training:  81%|████████▏ | 8945/10986 [3:28:57<49:49,  1.46s/it]

training loss: 3.3671934604644775


training:  81%|████████▏ | 8946/10986 [3:28:58<48:25,  1.42s/it]

training loss: 3.4218661785125732


training:  81%|████████▏ | 8947/10986 [3:29:00<47:02,  1.38s/it]

training loss: 3.354440689086914


training:  81%|████████▏ | 8948/10986 [3:29:01<46:23,  1.37s/it]

training loss: 3.4104135036468506


training:  81%|████████▏ | 8949/10986 [3:29:02<46:09,  1.36s/it]

training loss: 3.392458915710449


training:  81%|████████▏ | 8950/10986 [3:29:04<45:51,  1.35s/it]

training loss: 3.3174102306365967


training:  81%|████████▏ | 8951/10986 [3:29:05<50:35,  1.49s/it]

training loss: 3.3618805408477783


training:  81%|████████▏ | 8952/10986 [3:29:07<53:54,  1.59s/it]

training loss: 3.496387004852295


training:  81%|████████▏ | 8953/10986 [3:29:09<54:04,  1.60s/it]

training loss: 3.409019947052002


training:  82%|████████▏ | 8954/10986 [3:29:10<51:03,  1.51s/it]

training loss: 3.4820053577423096


training:  82%|████████▏ | 8955/10986 [3:29:11<48:50,  1.44s/it]

training loss: 3.356435537338257


training:  82%|████████▏ | 8956/10986 [3:29:13<47:24,  1.40s/it]

training loss: 3.4543442726135254


training:  82%|████████▏ | 8957/10986 [3:29:14<46:16,  1.37s/it]

training loss: 3.3741564750671387


training:  82%|████████▏ | 8958/10986 [3:29:15<45:45,  1.35s/it]

training loss: 3.4079556465148926


training:  82%|████████▏ | 8959/10986 [3:29:17<45:11,  1.34s/it]

training loss: 3.4032106399536133


training:  82%|████████▏ | 8960/10986 [3:29:18<44:50,  1.33s/it]

training loss: 3.385018825531006
valid loss: 3.385909080505371
perplexity: 29.544837951660156


training:  82%|████████▏ | 8961/10986 [3:29:21<59:45,  1.77s/it]

training loss: 3.452542543411255


training:  82%|████████▏ | 8962/10986 [3:29:22<55:43,  1.65s/it]

training loss: 3.357062816619873


training:  82%|████████▏ | 8963/10986 [3:29:23<52:04,  1.54s/it]

training loss: 3.503645896911621


training:  82%|████████▏ | 8964/10986 [3:29:25<49:51,  1.48s/it]

training loss: 3.351409912109375


training:  82%|████████▏ | 8965/10986 [3:29:26<48:11,  1.43s/it]

training loss: 3.4071621894836426


training:  82%|████████▏ | 8966/10986 [3:29:27<47:34,  1.41s/it]

training loss: 3.3703908920288086


training:  82%|████████▏ | 8967/10986 [3:29:29<46:39,  1.39s/it]

training loss: 3.3413634300231934


training:  82%|████████▏ | 8968/10986 [3:29:30<45:40,  1.36s/it]

training loss: 3.3001747131347656


training:  82%|████████▏ | 8969/10986 [3:29:31<45:17,  1.35s/it]

training loss: 3.360675811767578


training:  82%|████████▏ | 8970/10986 [3:29:33<44:51,  1.34s/it]

training loss: 3.4766364097595215


training:  82%|████████▏ | 8971/10986 [3:29:34<47:41,  1.42s/it]

training loss: 3.348231554031372


training:  82%|████████▏ | 8972/10986 [3:29:36<46:28,  1.38s/it]

training loss: 3.3816120624542236


training:  82%|████████▏ | 8973/10986 [3:29:37<45:35,  1.36s/it]

training loss: 3.367490530014038


training:  82%|████████▏ | 8974/10986 [3:29:38<44:54,  1.34s/it]

training loss: 3.5203535556793213


training:  82%|████████▏ | 8975/10986 [3:29:39<44:29,  1.33s/it]

training loss: 3.34232497215271


training:  82%|████████▏ | 8976/10986 [3:29:41<44:10,  1.32s/it]

training loss: 3.38450288772583


training:  82%|████████▏ | 8977/10986 [3:29:42<43:55,  1.31s/it]

training loss: 3.2862119674682617


training:  82%|████████▏ | 8978/10986 [3:29:43<43:55,  1.31s/it]

training loss: 3.3425984382629395


training:  82%|████████▏ | 8979/10986 [3:29:45<43:40,  1.31s/it]

training loss: 3.336761474609375


training:  82%|████████▏ | 8980/10986 [3:29:46<43:30,  1.30s/it]

training loss: 3.448822021484375
valid loss: 3.447418689727783
perplexity: 31.4191837310791


training:  82%|████████▏ | 8981/10986 [3:29:49<58:33,  1.75s/it]

training loss: 3.4570062160491943


training:  82%|████████▏ | 8982/10986 [3:29:50<54:50,  1.64s/it]

training loss: 3.4628758430480957


training:  82%|████████▏ | 8983/10986 [3:29:51<51:29,  1.54s/it]

training loss: 3.4665470123291016


training:  82%|████████▏ | 8984/10986 [3:29:53<49:04,  1.47s/it]

training loss: 3.3964157104492188


training:  82%|████████▏ | 8985/10986 [3:29:54<46:59,  1.41s/it]

training loss: 3.483492612838745


training:  82%|████████▏ | 8986/10986 [3:29:55<45:52,  1.38s/it]

training loss: 3.3576056957244873


training:  82%|████████▏ | 8987/10986 [3:29:57<45:04,  1.35s/it]

training loss: 3.4049124717712402


training:  82%|████████▏ | 8988/10986 [3:29:58<45:15,  1.36s/it]

training loss: 3.3191442489624023


training:  82%|████████▏ | 8989/10986 [3:29:59<44:33,  1.34s/it]

training loss: 3.4663968086242676


training:  82%|████████▏ | 8990/10986 [3:30:01<44:05,  1.33s/it]

training loss: 3.612520456314087


training:  82%|████████▏ | 8991/10986 [3:30:02<46:42,  1.40s/it]

training loss: 3.4485950469970703


training:  82%|████████▏ | 8992/10986 [3:30:04<47:57,  1.44s/it]

training loss: 3.4431517124176025


training:  82%|████████▏ | 8993/10986 [3:30:05<46:52,  1.41s/it]

training loss: 3.327934980392456


training:  82%|████████▏ | 8994/10986 [3:30:06<45:52,  1.38s/it]

training loss: 3.451021671295166


training:  82%|████████▏ | 8995/10986 [3:30:08<45:04,  1.36s/it]

training loss: 3.443214178085327


training:  82%|████████▏ | 8996/10986 [3:30:09<44:29,  1.34s/it]

training loss: 3.4313278198242188


training:  82%|████████▏ | 8997/10986 [3:30:10<44:17,  1.34s/it]

training loss: 3.475037097930908


training:  82%|████████▏ | 8998/10986 [3:30:12<44:08,  1.33s/it]

training loss: 3.5527374744415283


training:  82%|████████▏ | 8999/10986 [3:30:13<43:54,  1.33s/it]

training loss: 3.3857080936431885


training:  82%|████████▏ | 9000/10986 [3:30:14<44:00,  1.33s/it]

training loss: 3.3432345390319824
valid loss: 3.3483169078826904
perplexity: 28.454801559448242


training:  82%|████████▏ | 9001/10986 [3:30:17<58:26,  1.77s/it]

training loss: 3.264147996902466


training:  82%|████████▏ | 9002/10986 [3:30:18<54:54,  1.66s/it]

training loss: 3.2892558574676514


training:  82%|████████▏ | 9003/10986 [3:30:20<51:12,  1.55s/it]

training loss: 3.459531784057617


training:  82%|████████▏ | 9004/10986 [3:30:21<48:35,  1.47s/it]

training loss: 3.4186666011810303


training:  82%|████████▏ | 9005/10986 [3:30:22<46:50,  1.42s/it]

training loss: 3.4481201171875


training:  82%|████████▏ | 9006/10986 [3:30:24<45:46,  1.39s/it]

training loss: 3.2942070960998535


training:  82%|████████▏ | 9007/10986 [3:30:25<44:57,  1.36s/it]

training loss: 3.454744577407837


training:  82%|████████▏ | 9008/10986 [3:30:26<44:41,  1.36s/it]

training loss: 3.434570074081421


training:  82%|████████▏ | 9009/10986 [3:30:28<44:28,  1.35s/it]

training loss: 3.316863536834717


training:  82%|████████▏ | 9010/10986 [3:30:29<44:37,  1.36s/it]

training loss: 3.428114652633667


training:  82%|████████▏ | 9011/10986 [3:30:31<47:11,  1.43s/it]

training loss: 3.3917007446289062


training:  82%|████████▏ | 9012/10986 [3:30:32<47:16,  1.44s/it]

training loss: 3.412626266479492


training:  82%|████████▏ | 9013/10986 [3:30:33<46:03,  1.40s/it]

training loss: 3.4524457454681396


training:  82%|████████▏ | 9014/10986 [3:30:35<45:13,  1.38s/it]

training loss: 3.4916810989379883


training:  82%|████████▏ | 9015/10986 [3:30:36<44:36,  1.36s/it]

training loss: 3.498887300491333


training:  82%|████████▏ | 9016/10986 [3:30:37<44:16,  1.35s/it]

training loss: 3.334470510482788


training:  82%|████████▏ | 9017/10986 [3:30:39<43:42,  1.33s/it]

training loss: 3.371140480041504


training:  82%|████████▏ | 9018/10986 [3:30:40<43:25,  1.32s/it]

training loss: 3.3904473781585693


training:  82%|████████▏ | 9019/10986 [3:30:41<43:25,  1.32s/it]

training loss: 3.298227071762085


training:  82%|████████▏ | 9020/10986 [3:30:43<43:06,  1.32s/it]

training loss: 3.4736835956573486
valid loss: 3.4740049839019775
perplexity: 32.265708923339844


training:  82%|████████▏ | 9021/10986 [3:30:45<58:08,  1.78s/it]

training loss: 3.436722993850708


training:  82%|████████▏ | 9022/10986 [3:30:47<56:45,  1.73s/it]

training loss: 3.3842391967773438


training:  82%|████████▏ | 9023/10986 [3:30:48<52:37,  1.61s/it]

training loss: 3.402662515640259


training:  82%|████████▏ | 9024/10986 [3:30:50<49:35,  1.52s/it]

training loss: 3.5048069953918457


training:  82%|████████▏ | 9025/10986 [3:30:51<47:37,  1.46s/it]

training loss: 3.3616933822631836


training:  82%|████████▏ | 9026/10986 [3:30:52<46:18,  1.42s/it]

training loss: 3.310777425765991


training:  82%|████████▏ | 9027/10986 [3:30:54<45:39,  1.40s/it]

training loss: 3.3703272342681885


training:  82%|████████▏ | 9028/10986 [3:30:55<44:57,  1.38s/it]

training loss: 3.3819005489349365


training:  82%|████████▏ | 9029/10986 [3:30:56<44:17,  1.36s/it]

training loss: 3.4553871154785156


training:  82%|████████▏ | 9030/10986 [3:30:58<44:06,  1.35s/it]

training loss: 3.381718158721924


training:  82%|████████▏ | 9031/10986 [3:30:59<47:03,  1.44s/it]

training loss: 3.3591129779815674


training:  82%|████████▏ | 9032/10986 [3:31:01<46:05,  1.42s/it]

training loss: 3.5875284671783447


training:  82%|████████▏ | 9033/10986 [3:31:02<45:19,  1.39s/it]

training loss: 3.339200735092163


training:  82%|████████▏ | 9034/10986 [3:31:03<44:29,  1.37s/it]

training loss: 3.4115538597106934


training:  82%|████████▏ | 9035/10986 [3:31:05<43:53,  1.35s/it]

training loss: 3.3714258670806885


training:  82%|████████▏ | 9036/10986 [3:31:06<43:24,  1.34s/it]

training loss: 3.3103861808776855


training:  82%|████████▏ | 9037/10986 [3:31:07<43:06,  1.33s/it]

training loss: 3.3608248233795166


training:  82%|████████▏ | 9038/10986 [3:31:09<42:51,  1.32s/it]

training loss: 3.3624162673950195


training:  82%|████████▏ | 9039/10986 [3:31:10<42:42,  1.32s/it]

training loss: 3.5040805339813232


training:  82%|████████▏ | 9040/10986 [3:31:11<42:23,  1.31s/it]

training loss: 3.4093451499938965
valid loss: 3.408223867416382
perplexity: 30.211536407470703


training:  82%|████████▏ | 9041/10986 [3:31:14<57:04,  1.76s/it]

training loss: 3.415609359741211


training:  82%|████████▏ | 9042/10986 [3:31:16<55:52,  1.72s/it]

training loss: 3.4194579124450684


training:  82%|████████▏ | 9043/10986 [3:31:17<51:48,  1.60s/it]

training loss: 3.4556126594543457


training:  82%|████████▏ | 9044/10986 [3:31:18<48:57,  1.51s/it]

training loss: 3.4555747509002686


training:  82%|████████▏ | 9045/10986 [3:31:19<46:50,  1.45s/it]

training loss: 3.4143056869506836


training:  82%|████████▏ | 9046/10986 [3:31:21<45:32,  1.41s/it]

training loss: 3.319291114807129


training:  82%|████████▏ | 9047/10986 [3:31:22<44:23,  1.37s/it]

training loss: 3.5543053150177


training:  82%|████████▏ | 9048/10986 [3:31:23<43:45,  1.35s/it]

training loss: 3.41660213470459


training:  82%|████████▏ | 9049/10986 [3:31:25<43:15,  1.34s/it]

training loss: 3.434990882873535


training:  82%|████████▏ | 9050/10986 [3:31:26<43:09,  1.34s/it]

training loss: 3.4170289039611816


training:  82%|████████▏ | 9051/10986 [3:31:28<46:01,  1.43s/it]

training loss: 3.3522183895111084


training:  82%|████████▏ | 9052/10986 [3:31:29<44:59,  1.40s/it]

training loss: 3.4250404834747314


training:  82%|████████▏ | 9053/10986 [3:31:30<44:04,  1.37s/it]

training loss: 3.3568267822265625


training:  82%|████████▏ | 9054/10986 [3:31:32<43:29,  1.35s/it]

training loss: 3.390101909637451


training:  82%|████████▏ | 9055/10986 [3:31:33<43:07,  1.34s/it]

training loss: 3.427689790725708


training:  82%|████████▏ | 9056/10986 [3:31:34<42:40,  1.33s/it]

training loss: 3.483292579650879


training:  82%|████████▏ | 9057/10986 [3:31:36<42:24,  1.32s/it]

training loss: 3.391116142272949


training:  82%|████████▏ | 9058/10986 [3:31:37<42:10,  1.31s/it]

training loss: 3.4838297367095947


training:  82%|████████▏ | 9059/10986 [3:31:38<42:31,  1.32s/it]

training loss: 3.4641096591949463


training:  82%|████████▏ | 9060/10986 [3:31:39<42:27,  1.32s/it]

training loss: 3.5420801639556885
valid loss: 3.5383715629577637
perplexity: 34.41083908081055


training:  82%|████████▏ | 9061/10986 [3:31:42<56:45,  1.77s/it]

training loss: 3.3489792346954346


training:  82%|████████▏ | 9062/10986 [3:31:44<53:00,  1.65s/it]

training loss: 3.3131508827209473


training:  82%|████████▏ | 9063/10986 [3:31:45<50:03,  1.56s/it]

training loss: 3.5333399772644043


training:  83%|████████▎ | 9064/10986 [3:31:46<47:46,  1.49s/it]

training loss: 3.3106632232666016


training:  83%|████████▎ | 9065/10986 [3:31:48<45:54,  1.43s/it]

training loss: 3.44789457321167


training:  83%|████████▎ | 9066/10986 [3:31:49<44:37,  1.39s/it]

training loss: 3.351217746734619


training:  83%|████████▎ | 9067/10986 [3:31:50<43:51,  1.37s/it]

training loss: 3.395188331604004


training:  83%|████████▎ | 9068/10986 [3:31:52<43:13,  1.35s/it]

training loss: 3.421396255493164


training:  83%|████████▎ | 9069/10986 [3:31:53<42:46,  1.34s/it]

training loss: 3.3414344787597656


training:  83%|████████▎ | 9070/10986 [3:31:54<42:32,  1.33s/it]

training loss: 3.3567590713500977


training:  83%|████████▎ | 9071/10986 [3:31:56<45:08,  1.41s/it]

training loss: 3.3901610374450684


training:  83%|████████▎ | 9072/10986 [3:31:57<45:00,  1.41s/it]

training loss: 3.400503396987915


training:  83%|████████▎ | 9073/10986 [3:31:59<44:15,  1.39s/it]

training loss: 3.433305025100708


training:  83%|████████▎ | 9074/10986 [3:32:00<43:35,  1.37s/it]

training loss: 3.512571334838867


training:  83%|████████▎ | 9075/10986 [3:32:01<43:21,  1.36s/it]

training loss: 3.429333209991455


training:  83%|████████▎ | 9076/10986 [3:32:03<42:57,  1.35s/it]

training loss: 3.334655284881592


training:  83%|████████▎ | 9077/10986 [3:32:04<42:49,  1.35s/it]

training loss: 3.4323787689208984


training:  83%|████████▎ | 9078/10986 [3:32:05<42:41,  1.34s/it]

training loss: 3.324662685394287


training:  83%|████████▎ | 9079/10986 [3:32:07<42:17,  1.33s/it]

training loss: 3.327282428741455


training:  83%|████████▎ | 9080/10986 [3:32:08<42:10,  1.33s/it]

training loss: 3.3803622722625732
valid loss: 3.3785269260406494
perplexity: 29.327537536621094


training:  83%|████████▎ | 9081/10986 [3:32:11<56:21,  1.78s/it]

training loss: 3.4572293758392334


training:  83%|████████▎ | 9082/10986 [3:32:12<55:00,  1.73s/it]

training loss: 3.460747003555298


training:  83%|████████▎ | 9083/10986 [3:32:14<51:06,  1.61s/it]

training loss: 3.5694947242736816


training:  83%|████████▎ | 9084/10986 [3:32:15<48:18,  1.52s/it]

training loss: 3.3675594329833984


training:  83%|████████▎ | 9085/10986 [3:32:16<46:08,  1.46s/it]

training loss: 3.3958261013031006


training:  83%|████████▎ | 9086/10986 [3:32:18<44:37,  1.41s/it]

training loss: 3.367239475250244


training:  83%|████████▎ | 9087/10986 [3:32:19<43:42,  1.38s/it]

training loss: 3.415860891342163


training:  83%|████████▎ | 9088/10986 [3:32:20<43:01,  1.36s/it]

training loss: 3.48043155670166


training:  83%|████████▎ | 9089/10986 [3:32:21<42:41,  1.35s/it]

training loss: 3.427814245223999


training:  83%|████████▎ | 9090/10986 [3:32:23<42:15,  1.34s/it]

training loss: 3.354099750518799


training:  83%|████████▎ | 9091/10986 [3:32:24<44:58,  1.42s/it]

training loss: 3.4205079078674316


training:  83%|████████▎ | 9092/10986 [3:32:26<44:03,  1.40s/it]

training loss: 3.566437005996704


training:  83%|████████▎ | 9093/10986 [3:32:27<43:11,  1.37s/it]

training loss: 3.440919876098633


training:  83%|████████▎ | 9094/10986 [3:32:28<42:35,  1.35s/it]

training loss: 3.3442986011505127


training:  83%|████████▎ | 9095/10986 [3:32:30<42:31,  1.35s/it]

training loss: 3.334563970565796


training:  83%|████████▎ | 9096/10986 [3:32:31<42:16,  1.34s/it]

training loss: 3.388021945953369


training:  83%|████████▎ | 9097/10986 [3:32:32<42:03,  1.34s/it]

training loss: 3.4262990951538086


training:  83%|████████▎ | 9098/10986 [3:32:34<41:47,  1.33s/it]

training loss: 3.329329252243042


training:  83%|████████▎ | 9099/10986 [3:32:35<41:29,  1.32s/it]

training loss: 3.3656203746795654


training:  83%|████████▎ | 9100/10986 [3:32:36<41:31,  1.32s/it]

training loss: 3.5146660804748535
valid loss: 3.5102717876434326
perplexity: 33.457359313964844


training:  83%|████████▎ | 9101/10986 [3:32:39<55:52,  1.78s/it]

training loss: 3.457632064819336


training:  83%|████████▎ | 9102/10986 [3:32:41<52:01,  1.66s/it]

training loss: 3.4281105995178223


training:  83%|████████▎ | 9103/10986 [3:32:42<48:41,  1.55s/it]

training loss: 3.364112138748169


training:  83%|████████▎ | 9104/10986 [3:32:43<46:35,  1.49s/it]

training loss: 3.3844330310821533


training:  83%|████████▎ | 9105/10986 [3:32:44<45:06,  1.44s/it]

training loss: 3.3936707973480225


training:  83%|████████▎ | 9106/10986 [3:32:46<43:56,  1.40s/it]

training loss: 3.410796880722046


training:  83%|████████▎ | 9107/10986 [3:32:47<43:09,  1.38s/it]

training loss: 3.4375152587890625


training:  83%|████████▎ | 9108/10986 [3:32:48<42:31,  1.36s/it]

training loss: 3.406386613845825


training:  83%|████████▎ | 9109/10986 [3:32:50<42:12,  1.35s/it]

training loss: 3.4550797939300537


training:  83%|████████▎ | 9110/10986 [3:32:51<41:49,  1.34s/it]

training loss: 3.354318857192993


training:  83%|████████▎ | 9111/10986 [3:32:53<44:15,  1.42s/it]

training loss: 3.3682408332824707


training:  83%|████████▎ | 9112/10986 [3:32:54<46:49,  1.50s/it]

training loss: 3.447871208190918


training:  83%|████████▎ | 9113/10986 [3:32:56<45:17,  1.45s/it]

training loss: 3.41202712059021


training:  83%|████████▎ | 9114/10986 [3:32:57<43:58,  1.41s/it]

training loss: 3.3998236656188965


training:  83%|████████▎ | 9115/10986 [3:32:58<43:15,  1.39s/it]

training loss: 3.375865936279297


training:  83%|████████▎ | 9116/10986 [3:33:00<43:04,  1.38s/it]

training loss: 3.46454119682312


training:  83%|████████▎ | 9117/10986 [3:33:01<42:36,  1.37s/it]

training loss: 3.519138813018799


training:  83%|████████▎ | 9118/10986 [3:33:02<42:17,  1.36s/it]

training loss: 3.5003087520599365


training:  83%|████████▎ | 9119/10986 [3:33:04<41:52,  1.35s/it]

training loss: 3.503610134124756


training:  83%|████████▎ | 9120/10986 [3:33:05<41:35,  1.34s/it]

training loss: 3.443917989730835
valid loss: 3.4416403770446777
perplexity: 31.2381591796875


training:  83%|████████▎ | 9121/10986 [3:33:08<55:22,  1.78s/it]

training loss: 3.341265916824341


training:  83%|████████▎ | 9122/10986 [3:33:09<51:53,  1.67s/it]

training loss: 3.352388620376587


training:  83%|████████▎ | 9123/10986 [3:33:11<48:34,  1.56s/it]

training loss: 3.397634506225586


training:  83%|████████▎ | 9124/10986 [3:33:12<46:26,  1.50s/it]

training loss: 3.395379066467285


training:  83%|████████▎ | 9125/10986 [3:33:13<44:45,  1.44s/it]

training loss: 3.4683563709259033


training:  83%|████████▎ | 9126/10986 [3:33:15<43:34,  1.41s/it]

training loss: 3.582702159881592


training:  83%|████████▎ | 9127/10986 [3:33:16<42:43,  1.38s/it]

training loss: 3.458608388900757


training:  83%|████████▎ | 9128/10986 [3:33:17<42:05,  1.36s/it]

training loss: 3.3722615242004395


training:  83%|████████▎ | 9129/10986 [3:33:19<41:50,  1.35s/it]

training loss: 3.451486825942993


training:  83%|████████▎ | 9130/10986 [3:33:20<41:07,  1.33s/it]

training loss: 3.4596023559570312


training:  83%|████████▎ | 9131/10986 [3:33:21<44:08,  1.43s/it]

training loss: 3.360006093978882


training:  83%|████████▎ | 9132/10986 [3:33:23<47:57,  1.55s/it]

training loss: 3.4583730697631836


training:  83%|████████▎ | 9133/10986 [3:33:25<49:09,  1.59s/it]

training loss: 3.397976875305176


training:  83%|████████▎ | 9134/10986 [3:33:26<47:44,  1.55s/it]

training loss: 3.4365220069885254


training:  83%|████████▎ | 9135/10986 [3:33:28<45:23,  1.47s/it]

training loss: 3.2989323139190674


training:  83%|████████▎ | 9136/10986 [3:33:29<43:48,  1.42s/it]

training loss: 3.3663997650146484


training:  83%|████████▎ | 9137/10986 [3:33:30<43:30,  1.41s/it]

training loss: 3.4011144638061523


training:  83%|████████▎ | 9138/10986 [3:33:32<42:35,  1.38s/it]

training loss: 3.419127941131592


training:  83%|████████▎ | 9139/10986 [3:33:33<42:13,  1.37s/it]

training loss: 3.449221611022949


training:  83%|████████▎ | 9140/10986 [3:33:34<41:43,  1.36s/it]

training loss: 3.3987441062927246
valid loss: 3.394892692565918
perplexity: 29.81145477294922


training:  83%|████████▎ | 9141/10986 [3:33:37<55:01,  1.79s/it]

training loss: 3.537576198577881


training:  83%|████████▎ | 9142/10986 [3:33:39<53:53,  1.75s/it]

training loss: 3.408104419708252


training:  83%|████████▎ | 9143/10986 [3:33:40<49:42,  1.62s/it]

training loss: 3.3088393211364746


training:  83%|████████▎ | 9144/10986 [3:33:41<46:50,  1.53s/it]

training loss: 3.351228952407837


training:  83%|████████▎ | 9145/10986 [3:33:43<44:50,  1.46s/it]

training loss: 3.3164548873901367


training:  83%|████████▎ | 9146/10986 [3:33:44<43:13,  1.41s/it]

training loss: 3.3241944313049316


training:  83%|████████▎ | 9147/10986 [3:33:45<42:13,  1.38s/it]

training loss: 3.233011245727539


training:  83%|████████▎ | 9148/10986 [3:33:47<41:37,  1.36s/it]

training loss: 3.403205394744873


training:  83%|████████▎ | 9149/10986 [3:33:48<40:57,  1.34s/it]

training loss: 3.488203525543213


training:  83%|████████▎ | 9150/10986 [3:33:49<40:26,  1.32s/it]

training loss: 3.317725419998169


training:  83%|████████▎ | 9151/10986 [3:33:51<43:02,  1.41s/it]

training loss: 3.3463871479034424


training:  83%|████████▎ | 9152/10986 [3:33:52<42:21,  1.39s/it]

training loss: 3.3416268825531006


training:  83%|████████▎ | 9153/10986 [3:33:54<41:46,  1.37s/it]

training loss: 3.3866770267486572


training:  83%|████████▎ | 9154/10986 [3:33:55<41:07,  1.35s/it]

training loss: 3.3488495349884033


training:  83%|████████▎ | 9155/10986 [3:33:56<41:02,  1.34s/it]

training loss: 3.5343658924102783


training:  83%|████████▎ | 9156/10986 [3:33:57<40:38,  1.33s/it]

training loss: 3.2570862770080566


training:  83%|████████▎ | 9157/10986 [3:33:59<40:19,  1.32s/it]

training loss: 3.4147138595581055


training:  83%|████████▎ | 9158/10986 [3:34:00<40:31,  1.33s/it]

training loss: 3.362769365310669


training:  83%|████████▎ | 9159/10986 [3:34:01<40:51,  1.34s/it]

training loss: 3.381267786026001


training:  83%|████████▎ | 9160/10986 [3:34:03<40:37,  1.33s/it]

training loss: 3.415585994720459
valid loss: 3.4135382175445557
perplexity: 30.37251853942871


training:  83%|████████▎ | 9161/10986 [3:34:06<54:16,  1.78s/it]

training loss: 3.2980165481567383


training:  83%|████████▎ | 9162/10986 [3:34:07<50:34,  1.66s/it]

training loss: 3.489708662033081


training:  83%|████████▎ | 9163/10986 [3:34:08<47:26,  1.56s/it]

training loss: 3.348341464996338


training:  83%|████████▎ | 9164/10986 [3:34:10<45:10,  1.49s/it]

training loss: 3.3949546813964844


training:  83%|████████▎ | 9165/10986 [3:34:11<43:27,  1.43s/it]

training loss: 3.427025556564331


training:  83%|████████▎ | 9166/10986 [3:34:12<42:29,  1.40s/it]

training loss: 3.349445343017578


training:  83%|████████▎ | 9167/10986 [3:34:14<41:43,  1.38s/it]

training loss: 3.3579461574554443


training:  83%|████████▎ | 9168/10986 [3:34:15<41:04,  1.36s/it]

training loss: 3.3294036388397217


training:  83%|████████▎ | 9169/10986 [3:34:16<40:51,  1.35s/it]

training loss: 3.3608486652374268


training:  83%|████████▎ | 9170/10986 [3:34:18<40:27,  1.34s/it]

training loss: 3.4467897415161133


training:  83%|████████▎ | 9171/10986 [3:34:19<42:52,  1.42s/it]

training loss: 3.399033308029175


training:  83%|████████▎ | 9172/10986 [3:34:21<44:10,  1.46s/it]

training loss: 3.450596570968628


training:  83%|████████▎ | 9173/10986 [3:34:22<42:43,  1.41s/it]

training loss: 3.4399216175079346


training:  84%|████████▎ | 9174/10986 [3:34:23<41:48,  1.38s/it]

training loss: 3.3505730628967285


training:  84%|████████▎ | 9175/10986 [3:34:25<40:59,  1.36s/it]

training loss: 3.4462480545043945


training:  84%|████████▎ | 9176/10986 [3:34:26<40:37,  1.35s/it]

training loss: 3.357609272003174


training:  84%|████████▎ | 9177/10986 [3:34:27<40:16,  1.34s/it]

training loss: 3.413583755493164


training:  84%|████████▎ | 9178/10986 [3:34:29<39:58,  1.33s/it]

training loss: 3.3727262020111084


training:  84%|████████▎ | 9179/10986 [3:34:30<39:49,  1.32s/it]

training loss: 3.508415699005127


training:  84%|████████▎ | 9180/10986 [3:34:31<40:19,  1.34s/it]

training loss: 3.4197287559509277
valid loss: 3.415269613265991
perplexity: 30.42514991760254


training:  84%|████████▎ | 9181/10986 [3:34:34<53:29,  1.78s/it]

training loss: 3.391108751296997


training:  84%|████████▎ | 9182/10986 [3:34:35<49:55,  1.66s/it]

training loss: 3.357590436935425


training:  84%|████████▎ | 9183/10986 [3:34:37<46:49,  1.56s/it]

training loss: 3.309170961380005


training:  84%|████████▎ | 9184/10986 [3:34:38<44:41,  1.49s/it]

training loss: 3.304439067840576


training:  84%|████████▎ | 9185/10986 [3:34:39<43:00,  1.43s/it]

training loss: 3.3582167625427246


training:  84%|████████▎ | 9186/10986 [3:34:41<41:56,  1.40s/it]

training loss: 3.2611095905303955


training:  84%|████████▎ | 9187/10986 [3:34:42<41:09,  1.37s/it]

training loss: 3.5009877681732178


training:  84%|████████▎ | 9188/10986 [3:34:43<40:42,  1.36s/it]

training loss: 3.3928775787353516


training:  84%|████████▎ | 9189/10986 [3:34:45<40:15,  1.34s/it]

training loss: 3.416006326675415


training:  84%|████████▎ | 9190/10986 [3:34:46<40:06,  1.34s/it]

training loss: 3.4104843139648438


training:  84%|████████▎ | 9191/10986 [3:34:48<42:12,  1.41s/it]

training loss: 3.4319918155670166


training:  84%|████████▎ | 9192/10986 [3:34:49<41:28,  1.39s/it]

training loss: 3.3628904819488525


training:  84%|████████▎ | 9193/10986 [3:34:50<41:00,  1.37s/it]

training loss: 3.4568793773651123


training:  84%|████████▎ | 9194/10986 [3:34:52<40:20,  1.35s/it]

training loss: 3.4320268630981445


training:  84%|████████▎ | 9195/10986 [3:34:53<39:53,  1.34s/it]

training loss: 3.453397750854492


training:  84%|████████▎ | 9196/10986 [3:34:54<39:40,  1.33s/it]

training loss: 3.3273019790649414


training:  84%|████████▎ | 9197/10986 [3:34:55<39:37,  1.33s/it]

training loss: 3.3910934925079346


training:  84%|████████▎ | 9198/10986 [3:34:57<39:27,  1.32s/it]

training loss: 3.407059669494629


training:  84%|████████▎ | 9199/10986 [3:34:58<39:21,  1.32s/it]

training loss: 3.441359043121338


training:  84%|████████▎ | 9200/10986 [3:34:59<39:28,  1.33s/it]

training loss: 3.3839292526245117
valid loss: 3.3800501823425293
perplexity: 29.37224578857422


training:  84%|████████▍ | 9201/10986 [3:35:02<53:16,  1.79s/it]

training loss: 3.483898162841797


training:  84%|████████▍ | 9202/10986 [3:35:04<51:40,  1.74s/it]

training loss: 3.424031972885132


training:  84%|████████▍ | 9203/10986 [3:35:05<47:47,  1.61s/it]

training loss: 3.352613925933838


training:  84%|████████▍ | 9204/10986 [3:35:07<45:05,  1.52s/it]

training loss: 3.34486985206604


training:  84%|████████▍ | 9205/10986 [3:35:08<43:05,  1.45s/it]

training loss: 3.4067416191101074


training:  84%|████████▍ | 9206/10986 [3:35:09<41:45,  1.41s/it]

training loss: 3.3740739822387695


training:  84%|████████▍ | 9207/10986 [3:35:10<41:06,  1.39s/it]

training loss: 3.372407913208008


training:  84%|████████▍ | 9208/10986 [3:35:12<40:14,  1.36s/it]

training loss: 3.3116044998168945


training:  84%|████████▍ | 9209/10986 [3:35:13<39:46,  1.34s/it]

training loss: 3.3317134380340576


training:  84%|████████▍ | 9210/10986 [3:35:14<39:37,  1.34s/it]

training loss: 3.3862593173980713


training:  84%|████████▍ | 9211/10986 [3:35:16<41:54,  1.42s/it]

training loss: 3.4052841663360596


training:  84%|████████▍ | 9212/10986 [3:35:18<43:24,  1.47s/it]

training loss: 3.453951358795166


training:  84%|████████▍ | 9213/10986 [3:35:19<42:02,  1.42s/it]

training loss: 3.4283132553100586


training:  84%|████████▍ | 9214/10986 [3:35:20<40:54,  1.39s/it]

training loss: 3.3922581672668457


training:  84%|████████▍ | 9215/10986 [3:35:22<40:26,  1.37s/it]

training loss: 3.456216335296631


training:  84%|████████▍ | 9216/10986 [3:35:23<39:53,  1.35s/it]

training loss: 3.3041903972625732


training:  84%|████████▍ | 9217/10986 [3:35:24<39:24,  1.34s/it]

training loss: 3.4738516807556152


training:  84%|████████▍ | 9218/10986 [3:35:25<39:02,  1.32s/it]

training loss: 3.3291215896606445


training:  84%|████████▍ | 9219/10986 [3:35:27<39:04,  1.33s/it]

training loss: 3.361632823944092


training:  84%|████████▍ | 9220/10986 [3:35:28<38:57,  1.32s/it]

training loss: 3.391463279724121
valid loss: 3.3932881355285645
perplexity: 29.76365852355957


training:  84%|████████▍ | 9221/10986 [3:35:31<52:23,  1.78s/it]

training loss: 3.547875165939331


training:  84%|████████▍ | 9222/10986 [3:35:33<51:19,  1.75s/it]

training loss: 3.3559205532073975


training:  84%|████████▍ | 9223/10986 [3:35:34<47:33,  1.62s/it]

training loss: 3.436100482940674


training:  84%|████████▍ | 9224/10986 [3:35:35<44:48,  1.53s/it]

training loss: 3.3658111095428467


training:  84%|████████▍ | 9225/10986 [3:35:37<42:55,  1.46s/it]

training loss: 3.627390146255493


training:  84%|████████▍ | 9226/10986 [3:35:38<41:34,  1.42s/it]

training loss: 3.377214193344116


training:  84%|████████▍ | 9227/10986 [3:35:39<40:38,  1.39s/it]

training loss: 3.4718031883239746


training:  84%|████████▍ | 9228/10986 [3:35:41<40:00,  1.37s/it]

training loss: 3.3070485591888428


training:  84%|████████▍ | 9229/10986 [3:35:42<39:16,  1.34s/it]

training loss: 3.374661684036255


training:  84%|████████▍ | 9230/10986 [3:35:43<39:07,  1.34s/it]

training loss: 3.435961961746216


training:  84%|████████▍ | 9231/10986 [3:35:45<41:37,  1.42s/it]

training loss: 3.4383974075317383


training:  84%|████████▍ | 9232/10986 [3:35:46<42:48,  1.46s/it]

training loss: 3.4959535598754883


training:  84%|████████▍ | 9233/10986 [3:35:48<41:40,  1.43s/it]

training loss: 3.458916187286377


training:  84%|████████▍ | 9234/10986 [3:35:49<40:33,  1.39s/it]

training loss: 3.3780553340911865


training:  84%|████████▍ | 9235/10986 [3:35:50<39:41,  1.36s/it]

training loss: 3.331845760345459


training:  84%|████████▍ | 9236/10986 [3:35:52<39:09,  1.34s/it]

training loss: 3.459270477294922


training:  84%|████████▍ | 9237/10986 [3:35:53<39:07,  1.34s/it]

training loss: 3.3758325576782227


training:  84%|████████▍ | 9238/10986 [3:35:54<39:20,  1.35s/it]

training loss: 3.4885611534118652


training:  84%|████████▍ | 9239/10986 [3:35:56<38:50,  1.33s/it]

training loss: 3.3324410915374756


training:  84%|████████▍ | 9240/10986 [3:35:57<38:36,  1.33s/it]

training loss: 3.5938186645507812
valid loss: 3.5937557220458984
perplexity: 36.37041473388672


training:  84%|████████▍ | 9241/10986 [3:36:00<51:32,  1.77s/it]

training loss: 3.410952091217041


training:  84%|████████▍ | 9242/10986 [3:36:01<48:05,  1.65s/it]

training loss: 3.4712941646575928


training:  84%|████████▍ | 9243/10986 [3:36:02<45:23,  1.56s/it]

training loss: 3.372030019760132


training:  84%|████████▍ | 9244/10986 [3:36:04<43:01,  1.48s/it]

training loss: 3.4657931327819824


training:  84%|████████▍ | 9245/10986 [3:36:05<41:38,  1.44s/it]

training loss: 3.401738166809082


training:  84%|████████▍ | 9246/10986 [3:36:06<40:25,  1.39s/it]

training loss: 3.407399892807007


training:  84%|████████▍ | 9247/10986 [3:36:08<39:38,  1.37s/it]

training loss: 3.37182879447937


training:  84%|████████▍ | 9248/10986 [3:36:09<39:11,  1.35s/it]

training loss: 3.479010581970215


training:  84%|████████▍ | 9249/10986 [3:36:10<38:51,  1.34s/it]

training loss: 3.411656379699707


training:  84%|████████▍ | 9250/10986 [3:36:12<38:25,  1.33s/it]

training loss: 3.486767292022705


training:  84%|████████▍ | 9251/10986 [3:36:13<40:43,  1.41s/it]

training loss: 3.4329605102539062


training:  84%|████████▍ | 9252/10986 [3:36:15<42:24,  1.47s/it]

training loss: 3.3160698413848877


training:  84%|████████▍ | 9253/10986 [3:36:16<41:05,  1.42s/it]

training loss: 3.450533390045166


training:  84%|████████▍ | 9254/10986 [3:36:17<40:10,  1.39s/it]

training loss: 3.4751036167144775


training:  84%|████████▍ | 9255/10986 [3:36:19<39:18,  1.36s/it]

training loss: 3.393254518508911


training:  84%|████████▍ | 9256/10986 [3:36:20<38:52,  1.35s/it]

training loss: 3.387995958328247


training:  84%|████████▍ | 9257/10986 [3:36:21<38:21,  1.33s/it]

training loss: 3.3705928325653076


training:  84%|████████▍ | 9258/10986 [3:36:23<38:01,  1.32s/it]

training loss: 3.37150502204895


training:  84%|████████▍ | 9259/10986 [3:36:24<37:44,  1.31s/it]

training loss: 3.4563982486724854


training:  84%|████████▍ | 9260/10986 [3:36:25<37:49,  1.32s/it]

training loss: 3.3494677543640137
valid loss: 3.3472518920898438
perplexity: 28.42451286315918


training:  84%|████████▍ | 9261/10986 [3:36:28<51:08,  1.78s/it]

training loss: 3.3258166313171387


training:  84%|████████▍ | 9262/10986 [3:36:29<47:57,  1.67s/it]

training loss: 3.35561466217041


training:  84%|████████▍ | 9263/10986 [3:36:31<44:59,  1.57s/it]

training loss: 3.469473361968994


training:  84%|████████▍ | 9264/10986 [3:36:32<43:14,  1.51s/it]

training loss: 3.343979597091675


training:  84%|████████▍ | 9265/10986 [3:36:33<41:31,  1.45s/it]

training loss: 3.43232798576355


training:  84%|████████▍ | 9266/10986 [3:36:35<40:18,  1.41s/it]

training loss: 3.3930094242095947


training:  84%|████████▍ | 9267/10986 [3:36:36<39:46,  1.39s/it]

training loss: 3.418046474456787


training:  84%|████████▍ | 9268/10986 [3:36:37<39:06,  1.37s/it]

training loss: 3.3975863456726074


training:  84%|████████▍ | 9269/10986 [3:36:39<38:44,  1.35s/it]

training loss: 3.478672504425049


training:  84%|████████▍ | 9270/10986 [3:36:40<38:59,  1.36s/it]

training loss: 3.3146603107452393


training:  84%|████████▍ | 9271/10986 [3:36:42<41:04,  1.44s/it]

training loss: 3.411349058151245


training:  84%|████████▍ | 9272/10986 [3:36:43<42:15,  1.48s/it]

training loss: 3.4343678951263428


training:  84%|████████▍ | 9273/10986 [3:36:45<41:02,  1.44s/it]

training loss: 3.4910881519317627


training:  84%|████████▍ | 9274/10986 [3:36:46<39:48,  1.40s/it]

training loss: 3.490276575088501


training:  84%|████████▍ | 9275/10986 [3:36:47<39:21,  1.38s/it]

training loss: 3.396487236022949


training:  84%|████████▍ | 9276/10986 [3:36:49<38:49,  1.36s/it]

training loss: 3.3981547355651855


training:  84%|████████▍ | 9277/10986 [3:36:50<38:21,  1.35s/it]

training loss: 3.347324848175049


training:  84%|████████▍ | 9278/10986 [3:36:51<38:16,  1.34s/it]

training loss: 3.4727115631103516


training:  84%|████████▍ | 9279/10986 [3:36:53<38:01,  1.34s/it]

training loss: 3.5399818420410156


training:  84%|████████▍ | 9280/10986 [3:36:54<37:55,  1.33s/it]

training loss: 3.4395172595977783
valid loss: 3.433279275894165
perplexity: 30.97806167602539


training:  84%|████████▍ | 9281/10986 [3:36:57<51:06,  1.80s/it]

training loss: 3.4615941047668457


training:  84%|████████▍ | 9282/10986 [3:36:59<50:06,  1.76s/it]

training loss: 3.3209195137023926


training:  84%|████████▍ | 9283/10986 [3:37:00<46:05,  1.62s/it]

training loss: 3.406614303588867


training:  85%|████████▍ | 9284/10986 [3:37:01<43:32,  1.53s/it]

training loss: 3.4347989559173584


training:  85%|████████▍ | 9285/10986 [3:37:02<41:45,  1.47s/it]

training loss: 3.3362998962402344


training:  85%|████████▍ | 9286/10986 [3:37:04<40:26,  1.43s/it]

training loss: 3.3324496746063232


training:  85%|████████▍ | 9287/10986 [3:37:05<40:02,  1.41s/it]

training loss: 3.394606590270996


training:  85%|████████▍ | 9288/10986 [3:37:07<39:29,  1.40s/it]

training loss: 3.4566192626953125


training:  85%|████████▍ | 9289/10986 [3:37:08<38:53,  1.38s/it]

training loss: 3.389634609222412


training:  85%|████████▍ | 9290/10986 [3:37:09<38:20,  1.36s/it]

training loss: 3.496016263961792


training:  85%|████████▍ | 9291/10986 [3:37:11<41:05,  1.45s/it]

training loss: 3.3703079223632812


training:  85%|████████▍ | 9292/10986 [3:37:12<41:00,  1.45s/it]

training loss: 3.2936043739318848


training:  85%|████████▍ | 9293/10986 [3:37:14<39:55,  1.41s/it]

training loss: 3.426349639892578


training:  85%|████████▍ | 9294/10986 [3:37:15<39:20,  1.40s/it]

training loss: 3.3977420330047607


training:  85%|████████▍ | 9295/10986 [3:37:16<38:48,  1.38s/it]

training loss: 3.332674026489258


training:  85%|████████▍ | 9296/10986 [3:37:18<38:21,  1.36s/it]

training loss: 3.442533493041992


training:  85%|████████▍ | 9297/10986 [3:37:19<38:06,  1.35s/it]

training loss: 3.375898838043213


training:  85%|████████▍ | 9298/10986 [3:37:20<37:48,  1.34s/it]

training loss: 3.4154720306396484


training:  85%|████████▍ | 9299/10986 [3:37:22<37:39,  1.34s/it]

training loss: 3.3567895889282227


training:  85%|████████▍ | 9300/10986 [3:37:23<37:39,  1.34s/it]

training loss: 3.481257438659668
valid loss: 3.483670234680176
perplexity: 32.57907485961914


training:  85%|████████▍ | 9301/10986 [3:37:26<50:33,  1.80s/it]

training loss: 3.5250091552734375


training:  85%|████████▍ | 9302/10986 [3:37:27<47:37,  1.70s/it]

training loss: 3.3685877323150635


training:  85%|████████▍ | 9303/10986 [3:37:29<44:30,  1.59s/it]

training loss: 3.435713768005371


training:  85%|████████▍ | 9304/10986 [3:37:30<42:12,  1.51s/it]

training loss: 3.443060874938965


training:  85%|████████▍ | 9305/10986 [3:37:31<40:33,  1.45s/it]

training loss: 3.3702619075775146


training:  85%|████████▍ | 9306/10986 [3:37:33<39:39,  1.42s/it]

training loss: 3.5135397911071777


training:  85%|████████▍ | 9307/10986 [3:37:34<38:48,  1.39s/it]

training loss: 3.447305679321289


training:  85%|████████▍ | 9308/10986 [3:37:35<38:31,  1.38s/it]

training loss: 3.4234654903411865


training:  85%|████████▍ | 9309/10986 [3:37:37<38:27,  1.38s/it]

training loss: 3.550783634185791


training:  85%|████████▍ | 9310/10986 [3:37:38<38:05,  1.36s/it]

training loss: 3.368180990219116


training:  85%|████████▍ | 9311/10986 [3:37:40<40:27,  1.45s/it]

training loss: 3.407348871231079


training:  85%|████████▍ | 9312/10986 [3:37:41<39:45,  1.42s/it]

training loss: 3.4063990116119385


training:  85%|████████▍ | 9313/10986 [3:37:42<40:03,  1.44s/it]

training loss: 3.440067768096924


training:  85%|████████▍ | 9314/10986 [3:37:44<42:21,  1.52s/it]

training loss: 3.4319536685943604


training:  85%|████████▍ | 9315/10986 [3:37:46<44:25,  1.60s/it]

training loss: 3.4181625843048096


training:  85%|████████▍ | 9316/10986 [3:37:47<42:21,  1.52s/it]

training loss: 3.4378957748413086


training:  85%|████████▍ | 9317/10986 [3:37:49<40:50,  1.47s/it]

training loss: 3.447613477706909


training:  85%|████████▍ | 9318/10986 [3:37:50<39:44,  1.43s/it]

training loss: 3.390180826187134


training:  85%|████████▍ | 9319/10986 [3:37:51<39:06,  1.41s/it]

training loss: 3.338500499725342


training:  85%|████████▍ | 9320/10986 [3:37:53<38:23,  1.38s/it]

training loss: 3.4622035026550293
valid loss: 3.461639642715454
perplexity: 31.86918830871582


training:  85%|████████▍ | 9321/10986 [3:37:56<50:48,  1.83s/it]

training loss: 3.3481855392456055


training:  85%|████████▍ | 9322/10986 [3:37:57<47:19,  1.71s/it]

training loss: 3.3625411987304688


training:  85%|████████▍ | 9323/10986 [3:37:58<44:18,  1.60s/it]

training loss: 3.415330171585083


training:  85%|████████▍ | 9324/10986 [3:38:00<41:55,  1.51s/it]

training loss: 3.3695409297943115


training:  85%|████████▍ | 9325/10986 [3:38:01<40:38,  1.47s/it]

training loss: 3.3435118198394775


training:  85%|████████▍ | 9326/10986 [3:38:02<39:51,  1.44s/it]

training loss: 3.3786532878875732


training:  85%|████████▍ | 9327/10986 [3:38:04<38:56,  1.41s/it]

training loss: 3.3627536296844482


training:  85%|████████▍ | 9328/10986 [3:38:05<38:25,  1.39s/it]

training loss: 3.460239887237549


training:  85%|████████▍ | 9329/10986 [3:38:06<38:18,  1.39s/it]

training loss: 3.3853917121887207


training:  85%|████████▍ | 9330/10986 [3:38:08<37:44,  1.37s/it]

training loss: 3.406440496444702


training:  85%|████████▍ | 9331/10986 [3:38:09<40:09,  1.46s/it]

training loss: 3.457257032394409


training:  85%|████████▍ | 9332/10986 [3:38:11<39:15,  1.42s/it]

training loss: 3.3761050701141357


training:  85%|████████▍ | 9333/10986 [3:38:12<38:27,  1.40s/it]

training loss: 3.411036252975464


training:  85%|████████▍ | 9334/10986 [3:38:13<38:23,  1.39s/it]

training loss: 3.5035245418548584


training:  85%|████████▍ | 9335/10986 [3:38:15<37:57,  1.38s/it]

training loss: 3.4181933403015137


training:  85%|████████▍ | 9336/10986 [3:38:16<37:49,  1.38s/it]

training loss: 3.456070899963379


training:  85%|████████▍ | 9337/10986 [3:38:18<37:36,  1.37s/it]

training loss: 3.3723223209381104


training:  85%|████████▍ | 9338/10986 [3:38:19<37:35,  1.37s/it]

training loss: 3.4469101428985596


training:  85%|████████▌ | 9339/10986 [3:38:20<37:14,  1.36s/it]

training loss: 3.379650831222534


training:  85%|████████▌ | 9340/10986 [3:38:22<37:13,  1.36s/it]

training loss: 3.4859941005706787
valid loss: 3.482733726501465
perplexity: 32.548580169677734


training:  85%|████████▌ | 9341/10986 [3:38:24<49:51,  1.82s/it]

training loss: 3.315553665161133


training:  85%|████████▌ | 9342/10986 [3:38:26<48:23,  1.77s/it]

training loss: 3.540743350982666


training:  85%|████████▌ | 9343/10986 [3:38:27<44:58,  1.64s/it]

training loss: 3.3716530799865723


training:  85%|████████▌ | 9344/10986 [3:38:29<42:31,  1.55s/it]

training loss: 3.5413599014282227


training:  85%|████████▌ | 9345/10986 [3:38:30<40:35,  1.48s/it]

training loss: 3.4405994415283203


training:  85%|████████▌ | 9346/10986 [3:38:31<39:18,  1.44s/it]

training loss: 3.4397003650665283


training:  85%|████████▌ | 9347/10986 [3:38:33<38:30,  1.41s/it]

training loss: 3.355053424835205


training:  85%|████████▌ | 9348/10986 [3:38:34<37:58,  1.39s/it]

training loss: 3.4149560928344727


training:  85%|████████▌ | 9349/10986 [3:38:35<37:23,  1.37s/it]

training loss: 3.4968345165252686


training:  85%|████████▌ | 9350/10986 [3:38:37<37:15,  1.37s/it]

training loss: 3.584613084793091


training:  85%|████████▌ | 9351/10986 [3:38:39<39:45,  1.46s/it]

training loss: 3.416461944580078


training:  85%|████████▌ | 9352/10986 [3:38:40<39:04,  1.43s/it]

training loss: 3.475794553756714


training:  85%|████████▌ | 9353/10986 [3:38:41<38:23,  1.41s/it]

training loss: 3.3965024948120117


training:  85%|████████▌ | 9354/10986 [3:38:43<37:53,  1.39s/it]

training loss: 3.3612778186798096


training:  85%|████████▌ | 9355/10986 [3:38:44<37:42,  1.39s/it]

training loss: 3.5466244220733643


training:  85%|████████▌ | 9356/10986 [3:38:45<37:27,  1.38s/it]

training loss: 3.4411890506744385


training:  85%|████████▌ | 9357/10986 [3:38:47<36:56,  1.36s/it]

training loss: 3.4034907817840576


training:  85%|████████▌ | 9358/10986 [3:38:48<36:31,  1.35s/it]

training loss: 3.448739528656006


training:  85%|████████▌ | 9359/10986 [3:38:49<36:21,  1.34s/it]

training loss: 3.576761484146118


training:  85%|████████▌ | 9360/10986 [3:38:51<36:07,  1.33s/it]

training loss: 3.3994300365448
valid loss: 3.4023609161376953
perplexity: 30.034927368164062


training:  85%|████████▌ | 9361/10986 [3:38:53<48:03,  1.77s/it]

training loss: 3.498441219329834


training:  85%|████████▌ | 9362/10986 [3:38:55<45:10,  1.67s/it]

training loss: 3.2881855964660645


training:  85%|████████▌ | 9363/10986 [3:38:56<42:27,  1.57s/it]

training loss: 3.4571115970611572


training:  85%|████████▌ | 9364/10986 [3:38:57<40:19,  1.49s/it]

training loss: 3.3543941974639893


training:  85%|████████▌ | 9365/10986 [3:38:59<38:51,  1.44s/it]

training loss: 3.3728134632110596


training:  85%|████████▌ | 9366/10986 [3:39:00<37:50,  1.40s/it]

training loss: 3.4315173625946045


training:  85%|████████▌ | 9367/10986 [3:39:01<37:15,  1.38s/it]

training loss: 3.4265313148498535


training:  85%|████████▌ | 9368/10986 [3:39:03<36:54,  1.37s/it]

training loss: 3.390305519104004


training:  85%|████████▌ | 9369/10986 [3:39:04<36:34,  1.36s/it]

training loss: 3.2999427318573


training:  85%|████████▌ | 9370/10986 [3:39:05<36:25,  1.35s/it]

training loss: 3.437924385070801


training:  85%|████████▌ | 9371/10986 [3:39:07<39:05,  1.45s/it]

training loss: 3.4204721450805664


training:  85%|████████▌ | 9372/10986 [3:39:08<38:16,  1.42s/it]

training loss: 3.376434564590454


training:  85%|████████▌ | 9373/10986 [3:39:10<37:32,  1.40s/it]

training loss: 3.3379197120666504


training:  85%|████████▌ | 9374/10986 [3:39:11<36:54,  1.37s/it]

training loss: 3.355816602706909


training:  85%|████████▌ | 9375/10986 [3:39:12<36:33,  1.36s/it]

training loss: 3.4845380783081055


training:  85%|████████▌ | 9376/10986 [3:39:14<36:04,  1.34s/it]

training loss: 3.3625123500823975


training:  85%|████████▌ | 9377/10986 [3:39:15<35:50,  1.34s/it]

training loss: 3.3432672023773193


training:  85%|████████▌ | 9378/10986 [3:39:16<35:46,  1.33s/it]

training loss: 3.4387543201446533


training:  85%|████████▌ | 9379/10986 [3:39:18<35:30,  1.33s/it]

training loss: 3.3144595623016357


training:  85%|████████▌ | 9380/10986 [3:39:19<35:40,  1.33s/it]

training loss: 3.370098829269409
valid loss: 3.3680038452148438
perplexity: 29.020538330078125


training:  85%|████████▌ | 9381/10986 [3:39:22<47:42,  1.78s/it]

training loss: 3.4063944816589355


training:  85%|████████▌ | 9382/10986 [3:39:23<44:37,  1.67s/it]

training loss: 3.462672233581543


training:  85%|████████▌ | 9383/10986 [3:39:25<41:51,  1.57s/it]

training loss: 3.3251044750213623


training:  85%|████████▌ | 9384/10986 [3:39:26<39:38,  1.48s/it]

training loss: 3.3638622760772705


training:  85%|████████▌ | 9385/10986 [3:39:27<38:38,  1.45s/it]

training loss: 3.3364694118499756


training:  85%|████████▌ | 9386/10986 [3:39:29<37:27,  1.40s/it]

training loss: 3.4625415802001953


training:  85%|████████▌ | 9387/10986 [3:39:30<36:48,  1.38s/it]

training loss: 3.495769739151001


training:  85%|████████▌ | 9388/10986 [3:39:31<36:13,  1.36s/it]

training loss: 3.4431567192077637


training:  85%|████████▌ | 9389/10986 [3:39:33<35:56,  1.35s/it]

training loss: 3.372143268585205


training:  85%|████████▌ | 9390/10986 [3:39:34<35:38,  1.34s/it]

training loss: 3.4405856132507324


training:  85%|████████▌ | 9391/10986 [3:39:36<38:02,  1.43s/it]

training loss: 3.399188995361328


training:  85%|████████▌ | 9392/10986 [3:39:37<37:26,  1.41s/it]

training loss: 3.40732741355896


training:  85%|████████▌ | 9393/10986 [3:39:38<36:49,  1.39s/it]

training loss: 3.519697427749634


training:  86%|████████▌ | 9394/10986 [3:39:40<36:26,  1.37s/it]

training loss: 3.552901029586792


training:  86%|████████▌ | 9395/10986 [3:39:41<35:50,  1.35s/it]

training loss: 3.293348550796509


training:  86%|████████▌ | 9396/10986 [3:39:42<35:32,  1.34s/it]

training loss: 3.4455065727233887


training:  86%|████████▌ | 9397/10986 [3:39:44<35:19,  1.33s/it]

training loss: 3.4086296558380127


training:  86%|████████▌ | 9398/10986 [3:39:45<35:14,  1.33s/it]

training loss: 3.3713343143463135


training:  86%|████████▌ | 9399/10986 [3:39:46<35:11,  1.33s/it]

training loss: 3.416606903076172


training:  86%|████████▌ | 9400/10986 [3:39:48<35:14,  1.33s/it]

training loss: 3.3663878440856934
valid loss: 3.3634533882141113
perplexity: 28.888782501220703


training:  86%|████████▌ | 9401/10986 [3:39:50<47:19,  1.79s/it]

training loss: 3.4282305240631104


training:  86%|████████▌ | 9402/10986 [3:39:52<44:16,  1.68s/it]

training loss: 3.5197651386260986


training:  86%|████████▌ | 9403/10986 [3:39:53<41:39,  1.58s/it]

training loss: 3.3131282329559326


training:  86%|████████▌ | 9404/10986 [3:39:54<39:40,  1.50s/it]

training loss: 3.425593376159668


training:  86%|████████▌ | 9405/10986 [3:39:56<38:25,  1.46s/it]

training loss: 3.3370578289031982


training:  86%|████████▌ | 9406/10986 [3:39:57<37:37,  1.43s/it]

training loss: 3.370360851287842


training:  86%|████████▌ | 9407/10986 [3:39:59<36:53,  1.40s/it]

training loss: 3.3831827640533447


training:  86%|████████▌ | 9408/10986 [3:40:00<36:38,  1.39s/it]

training loss: 3.3539860248565674


training:  86%|████████▌ | 9409/10986 [3:40:01<36:25,  1.39s/it]

training loss: 3.33577299118042


training:  86%|████████▌ | 9410/10986 [3:40:03<36:06,  1.37s/it]

training loss: 3.313016414642334


training:  86%|████████▌ | 9411/10986 [3:40:04<38:03,  1.45s/it]

training loss: 3.3646740913391113


training:  86%|████████▌ | 9412/10986 [3:40:06<39:05,  1.49s/it]

training loss: 3.4187426567077637


training:  86%|████████▌ | 9413/10986 [3:40:07<38:15,  1.46s/it]

training loss: 3.375380039215088


training:  86%|████████▌ | 9414/10986 [3:40:09<37:30,  1.43s/it]

training loss: 3.417963981628418


training:  86%|████████▌ | 9415/10986 [3:40:10<36:37,  1.40s/it]

training loss: 3.4932658672332764


training:  86%|████████▌ | 9416/10986 [3:40:11<36:11,  1.38s/it]

training loss: 3.4920105934143066


training:  86%|████████▌ | 9417/10986 [3:40:13<35:48,  1.37s/it]

training loss: 3.3629279136657715


training:  86%|████████▌ | 9418/10986 [3:40:14<35:28,  1.36s/it]

training loss: 3.345066547393799


training:  86%|████████▌ | 9419/10986 [3:40:15<35:18,  1.35s/it]

training loss: 3.4430418014526367


training:  86%|████████▌ | 9420/10986 [3:40:17<35:10,  1.35s/it]

training loss: 3.442263603210449
valid loss: 3.442133903503418
perplexity: 31.25358009338379


training:  86%|████████▌ | 9421/10986 [3:40:19<47:22,  1.82s/it]

training loss: 3.444789409637451


training:  86%|████████▌ | 9422/10986 [3:40:21<44:15,  1.70s/it]

training loss: 3.4190893173217773


training:  86%|████████▌ | 9423/10986 [3:40:22<41:22,  1.59s/it]

training loss: 3.455918550491333


training:  86%|████████▌ | 9424/10986 [3:40:24<39:03,  1.50s/it]

training loss: 3.32905912399292


training:  86%|████████▌ | 9425/10986 [3:40:25<37:33,  1.44s/it]

training loss: 3.426079273223877


training:  86%|████████▌ | 9426/10986 [3:40:26<36:41,  1.41s/it]

training loss: 3.423241376876831


training:  86%|████████▌ | 9427/10986 [3:40:28<36:00,  1.39s/it]

training loss: 3.3314969539642334


training:  86%|████████▌ | 9428/10986 [3:40:29<35:41,  1.37s/it]

training loss: 3.372159481048584


training:  86%|████████▌ | 9429/10986 [3:40:30<35:16,  1.36s/it]

training loss: 3.4382498264312744


training:  86%|████████▌ | 9430/10986 [3:40:32<35:05,  1.35s/it]

training loss: 3.408876419067383


training:  86%|████████▌ | 9431/10986 [3:40:33<37:22,  1.44s/it]

training loss: 3.407017469406128


training:  86%|████████▌ | 9432/10986 [3:40:35<36:38,  1.42s/it]

training loss: 3.4537854194641113


training:  86%|████████▌ | 9433/10986 [3:40:36<36:05,  1.39s/it]

training loss: 3.475684404373169


training:  86%|████████▌ | 9434/10986 [3:40:37<35:41,  1.38s/it]

training loss: 3.243339776992798


training:  86%|████████▌ | 9435/10986 [3:40:39<35:40,  1.38s/it]

training loss: 3.4290213584899902


training:  86%|████████▌ | 9436/10986 [3:40:40<35:08,  1.36s/it]

training loss: 3.384006977081299


training:  86%|████████▌ | 9437/10986 [3:40:41<34:58,  1.35s/it]

training loss: 3.410125255584717


training:  86%|████████▌ | 9438/10986 [3:40:43<34:56,  1.35s/it]

training loss: 3.3989830017089844


training:  86%|████████▌ | 9439/10986 [3:40:44<34:45,  1.35s/it]

training loss: 3.344604015350342


training:  86%|████████▌ | 9440/10986 [3:40:45<34:38,  1.34s/it]

training loss: 3.3810322284698486
valid loss: 3.378413200378418
perplexity: 29.324203491210938


training:  86%|████████▌ | 9441/10986 [3:40:48<46:48,  1.82s/it]

training loss: 3.317689895629883


training:  86%|████████▌ | 9442/10986 [3:40:50<43:32,  1.69s/it]

training loss: 3.3995308876037598


training:  86%|████████▌ | 9443/10986 [3:40:51<40:46,  1.59s/it]

training loss: 3.4752023220062256


training:  86%|████████▌ | 9444/10986 [3:40:52<39:01,  1.52s/it]

training loss: 3.3393707275390625


training:  86%|████████▌ | 9445/10986 [3:40:54<37:46,  1.47s/it]

training loss: 3.355921983718872


training:  86%|████████▌ | 9446/10986 [3:40:55<36:44,  1.43s/it]

training loss: 3.428244113922119


training:  86%|████████▌ | 9447/10986 [3:40:56<36:08,  1.41s/it]

training loss: 3.3934054374694824


training:  86%|████████▌ | 9448/10986 [3:40:58<35:45,  1.40s/it]

training loss: 3.3737080097198486


training:  86%|████████▌ | 9449/10986 [3:40:59<35:17,  1.38s/it]

training loss: 3.4503567218780518


training:  86%|████████▌ | 9450/10986 [3:41:00<34:56,  1.37s/it]

training loss: 3.325016498565674


training:  86%|████████▌ | 9451/10986 [3:41:02<37:05,  1.45s/it]

training loss: 3.518989086151123


training:  86%|████████▌ | 9452/10986 [3:41:04<38:20,  1.50s/it]

training loss: 3.436488151550293


training:  86%|████████▌ | 9453/10986 [3:41:05<36:55,  1.45s/it]

training loss: 3.4014382362365723


training:  86%|████████▌ | 9454/10986 [3:41:06<36:14,  1.42s/it]

training loss: 3.318264961242676


training:  86%|████████▌ | 9455/10986 [3:41:08<35:51,  1.41s/it]

training loss: 3.34018874168396


training:  86%|████████▌ | 9456/10986 [3:41:09<35:31,  1.39s/it]

training loss: 3.392026901245117


training:  86%|████████▌ | 9457/10986 [3:41:10<34:57,  1.37s/it]

training loss: 3.5048227310180664


training:  86%|████████▌ | 9458/10986 [3:41:12<34:36,  1.36s/it]

training loss: 3.4512412548065186


training:  86%|████████▌ | 9459/10986 [3:41:13<34:29,  1.36s/it]

training loss: 3.5206689834594727


training:  86%|████████▌ | 9460/10986 [3:41:14<34:20,  1.35s/it]

training loss: 3.4483227729797363
valid loss: 3.4432358741760254
perplexity: 31.28803825378418


training:  86%|████████▌ | 9461/10986 [3:41:17<46:16,  1.82s/it]

training loss: 3.463789939880371


training:  86%|████████▌ | 9462/10986 [3:41:19<42:56,  1.69s/it]

training loss: 3.3698089122772217


training:  86%|████████▌ | 9463/10986 [3:41:20<40:26,  1.59s/it]

training loss: 3.459355354309082


training:  86%|████████▌ | 9464/10986 [3:41:21<38:34,  1.52s/it]

training loss: 3.4722142219543457


training:  86%|████████▌ | 9465/10986 [3:41:23<37:08,  1.46s/it]

training loss: 3.3090620040893555


training:  86%|████████▌ | 9466/10986 [3:41:24<36:17,  1.43s/it]

training loss: 3.342766046524048


training:  86%|████████▌ | 9467/10986 [3:41:25<35:35,  1.41s/it]

training loss: 3.5177087783813477


training:  86%|████████▌ | 9468/10986 [3:41:27<35:04,  1.39s/it]

training loss: 3.4001617431640625


training:  86%|████████▌ | 9469/10986 [3:41:28<34:48,  1.38s/it]

training loss: 3.437669515609741


training:  86%|████████▌ | 9470/10986 [3:41:30<34:36,  1.37s/it]

training loss: 3.5207996368408203


training:  86%|████████▌ | 9471/10986 [3:41:31<36:31,  1.45s/it]

training loss: 3.532759428024292


training:  86%|████████▌ | 9472/10986 [3:41:32<35:49,  1.42s/it]

training loss: 3.420515775680542


training:  86%|████████▌ | 9473/10986 [3:41:34<35:08,  1.39s/it]

training loss: 3.439912796020508


training:  86%|████████▌ | 9474/10986 [3:41:35<34:59,  1.39s/it]

training loss: 3.3904523849487305


training:  86%|████████▌ | 9475/10986 [3:41:37<34:41,  1.38s/it]

training loss: 3.4185924530029297


training:  86%|████████▋ | 9476/10986 [3:41:38<34:42,  1.38s/it]

training loss: 3.3161110877990723


training:  86%|████████▋ | 9477/10986 [3:41:39<34:25,  1.37s/it]

training loss: 3.326005220413208


training:  86%|████████▋ | 9478/10986 [3:41:41<34:14,  1.36s/it]

training loss: 3.3766121864318848


training:  86%|████████▋ | 9479/10986 [3:41:42<34:03,  1.36s/it]

training loss: 3.380772590637207


training:  86%|████████▋ | 9480/10986 [3:41:43<34:03,  1.36s/it]

training loss: 3.5147292613983154
valid loss: 3.512024402618408
perplexity: 33.516048431396484


training:  86%|████████▋ | 9481/10986 [3:41:46<46:12,  1.84s/it]

training loss: 3.420032501220703


training:  86%|████████▋ | 9482/10986 [3:41:48<45:01,  1.80s/it]

training loss: 3.38470196723938


training:  86%|████████▋ | 9483/10986 [3:41:49<41:36,  1.66s/it]

training loss: 3.33628511428833


training:  86%|████████▋ | 9484/10986 [3:41:51<39:07,  1.56s/it]

training loss: 3.5594210624694824


training:  86%|████████▋ | 9485/10986 [3:41:52<37:22,  1.49s/it]

training loss: 3.466841220855713


training:  86%|████████▋ | 9486/10986 [3:41:53<36:15,  1.45s/it]

training loss: 3.4348435401916504


training:  86%|████████▋ | 9487/10986 [3:41:55<35:22,  1.42s/it]

training loss: 3.3179595470428467


training:  86%|████████▋ | 9488/10986 [3:41:56<34:46,  1.39s/it]

training loss: 3.3577382564544678


training:  86%|████████▋ | 9489/10986 [3:41:57<34:35,  1.39s/it]

training loss: 3.4454920291900635


training:  86%|████████▋ | 9490/10986 [3:41:59<34:17,  1.38s/it]

training loss: 3.4459123611450195


training:  86%|████████▋ | 9491/10986 [3:42:00<36:27,  1.46s/it]

training loss: 3.421163320541382


training:  86%|████████▋ | 9492/10986 [3:42:02<35:36,  1.43s/it]

training loss: 3.400076389312744


training:  86%|████████▋ | 9493/10986 [3:42:03<34:51,  1.40s/it]

training loss: 3.4124269485473633


training:  86%|████████▋ | 9494/10986 [3:42:04<34:33,  1.39s/it]

training loss: 3.427375078201294


training:  86%|████████▋ | 9495/10986 [3:42:06<34:01,  1.37s/it]

training loss: 3.4489974975585938


training:  86%|████████▋ | 9496/10986 [3:42:07<33:50,  1.36s/it]

training loss: 3.432661294937134


training:  86%|████████▋ | 9497/10986 [3:42:09<34:53,  1.41s/it]

training loss: 3.466323137283325


training:  86%|████████▋ | 9498/10986 [3:42:10<37:47,  1.52s/it]

training loss: 3.43440580368042


training:  86%|████████▋ | 9499/10986 [3:42:12<39:24,  1.59s/it]

training loss: 3.4782369136810303


training:  86%|████████▋ | 9500/10986 [3:42:14<37:44,  1.52s/it]

training loss: 3.3291101455688477
valid loss: 3.328230619430542
perplexity: 27.888952255249023


training:  86%|████████▋ | 9501/10986 [3:42:16<47:29,  1.92s/it]

training loss: 3.3213908672332764


training:  86%|████████▋ | 9502/10986 [3:42:18<44:16,  1.79s/it]

training loss: 3.398677110671997


training:  87%|████████▋ | 9503/10986 [3:42:19<41:01,  1.66s/it]

training loss: 3.4056670665740967


training:  87%|████████▋ | 9504/10986 [3:42:21<38:38,  1.56s/it]

training loss: 3.393390417098999


training:  87%|████████▋ | 9505/10986 [3:42:22<36:59,  1.50s/it]

training loss: 3.408389091491699


training:  87%|████████▋ | 9506/10986 [3:42:23<35:47,  1.45s/it]

training loss: 3.2345006465911865


training:  87%|████████▋ | 9507/10986 [3:42:25<34:50,  1.41s/it]

training loss: 3.3672115802764893


training:  87%|████████▋ | 9508/10986 [3:42:26<34:17,  1.39s/it]

training loss: 3.3748488426208496


training:  87%|████████▋ | 9509/10986 [3:42:27<34:05,  1.38s/it]

training loss: 3.374044895172119


training:  87%|████████▋ | 9510/10986 [3:42:29<33:51,  1.38s/it]

training loss: 3.4832756519317627


training:  87%|████████▋ | 9511/10986 [3:42:30<35:54,  1.46s/it]

training loss: 3.3294918537139893


training:  87%|████████▋ | 9512/10986 [3:42:32<37:18,  1.52s/it]

training loss: 3.368638515472412


training:  87%|████████▋ | 9513/10986 [3:42:33<36:04,  1.47s/it]

training loss: 3.3990137577056885


training:  87%|████████▋ | 9514/10986 [3:42:35<35:06,  1.43s/it]

training loss: 3.45998477935791


training:  87%|████████▋ | 9515/10986 [3:42:36<34:26,  1.40s/it]

training loss: 3.337129831314087


training:  87%|████████▋ | 9516/10986 [3:42:37<33:58,  1.39s/it]

training loss: 3.2967913150787354


training:  87%|████████▋ | 9517/10986 [3:42:39<33:46,  1.38s/it]

training loss: 3.3264963626861572


training:  87%|████████▋ | 9518/10986 [3:42:40<33:33,  1.37s/it]

training loss: 3.419219493865967


training:  87%|████████▋ | 9519/10986 [3:42:41<33:17,  1.36s/it]

training loss: 3.448535442352295


training:  87%|████████▋ | 9520/10986 [3:42:43<33:03,  1.35s/it]

training loss: 3.336569309234619
valid loss: 3.3322503566741943
perplexity: 28.001285552978516


training:  87%|████████▋ | 9521/10986 [3:42:46<44:14,  1.81s/it]

training loss: 3.4244351387023926


training:  87%|████████▋ | 9522/10986 [3:42:47<43:09,  1.77s/it]

training loss: 3.4467968940734863


training:  87%|████████▋ | 9523/10986 [3:42:49<40:03,  1.64s/it]

training loss: 3.4047372341156006


training:  87%|████████▋ | 9524/10986 [3:42:50<37:54,  1.56s/it]

training loss: 3.298630952835083


training:  87%|████████▋ | 9525/10986 [3:42:51<36:16,  1.49s/it]

training loss: 3.323974132537842


training:  87%|████████▋ | 9526/10986 [3:42:53<35:04,  1.44s/it]

training loss: 3.4864487648010254


training:  87%|████████▋ | 9527/10986 [3:42:54<34:15,  1.41s/it]

training loss: 3.3045616149902344


training:  87%|████████▋ | 9528/10986 [3:42:55<33:39,  1.38s/it]

training loss: 3.429917573928833


training:  87%|████████▋ | 9529/10986 [3:42:57<33:23,  1.38s/it]

training loss: 3.388113498687744


training:  87%|████████▋ | 9530/10986 [3:42:58<33:04,  1.36s/it]

training loss: 3.4132518768310547


training:  87%|████████▋ | 9531/10986 [3:43:00<35:11,  1.45s/it]

training loss: 3.3592023849487305


training:  87%|████████▋ | 9532/10986 [3:43:01<34:20,  1.42s/it]

training loss: 3.3717970848083496


training:  87%|████████▋ | 9533/10986 [3:43:02<33:46,  1.39s/it]

training loss: 3.403745412826538


training:  87%|████████▋ | 9534/10986 [3:43:04<33:32,  1.39s/it]

training loss: 3.3174819946289062


training:  87%|████████▋ | 9535/10986 [3:43:05<33:06,  1.37s/it]

training loss: 3.5024688243865967


training:  87%|████████▋ | 9536/10986 [3:43:06<32:44,  1.35s/it]

training loss: 3.405526638031006


training:  87%|████████▋ | 9537/10986 [3:43:08<32:40,  1.35s/it]

training loss: 3.3254354000091553


training:  87%|████████▋ | 9538/10986 [3:43:09<32:48,  1.36s/it]

training loss: 3.4378676414489746


training:  87%|████████▋ | 9539/10986 [3:43:10<32:43,  1.36s/it]

training loss: 3.3927161693573


training:  87%|████████▋ | 9540/10986 [3:43:12<32:46,  1.36s/it]

training loss: 3.3909385204315186
valid loss: 3.3858530521392822
perplexity: 29.543182373046875


training:  87%|████████▋ | 9541/10986 [3:43:15<43:46,  1.82s/it]

training loss: 3.3832380771636963


training:  87%|████████▋ | 9542/10986 [3:43:16<40:51,  1.70s/it]

training loss: 3.397348403930664


training:  87%|████████▋ | 9543/10986 [3:43:17<38:19,  1.59s/it]

training loss: 3.475865125656128


training:  87%|████████▋ | 9544/10986 [3:43:19<36:26,  1.52s/it]

training loss: 3.26663875579834


training:  87%|████████▋ | 9545/10986 [3:43:20<35:09,  1.46s/it]

training loss: 3.4113235473632812


training:  87%|████████▋ | 9546/10986 [3:43:21<34:12,  1.43s/it]

training loss: 3.408646583557129


training:  87%|████████▋ | 9547/10986 [3:43:23<33:42,  1.41s/it]

training loss: 3.321385383605957


training:  87%|████████▋ | 9548/10986 [3:43:24<33:15,  1.39s/it]

training loss: 3.438192129135132


training:  87%|████████▋ | 9549/10986 [3:43:26<33:02,  1.38s/it]

training loss: 3.369163990020752


training:  87%|████████▋ | 9550/10986 [3:43:27<32:50,  1.37s/it]

training loss: 3.375638961791992


training:  87%|████████▋ | 9551/10986 [3:43:29<34:55,  1.46s/it]

training loss: 3.534764051437378


training:  87%|████████▋ | 9552/10986 [3:43:30<35:40,  1.49s/it]

training loss: 3.4351344108581543


training:  87%|████████▋ | 9553/10986 [3:43:31<34:47,  1.46s/it]

training loss: 3.3818697929382324


training:  87%|████████▋ | 9554/10986 [3:43:33<34:10,  1.43s/it]

training loss: 3.378631353378296


training:  87%|████████▋ | 9555/10986 [3:43:34<33:21,  1.40s/it]

training loss: 3.3044912815093994


training:  87%|████████▋ | 9556/10986 [3:43:36<33:00,  1.38s/it]

training loss: 3.292353391647339


training:  87%|████████▋ | 9557/10986 [3:43:37<32:37,  1.37s/it]

training loss: 3.3963356018066406


training:  87%|████████▋ | 9558/10986 [3:43:38<32:32,  1.37s/it]

training loss: 3.3460447788238525


training:  87%|████████▋ | 9559/10986 [3:43:40<32:38,  1.37s/it]

training loss: 3.3771419525146484


training:  87%|████████▋ | 9560/10986 [3:43:41<32:19,  1.36s/it]

training loss: 3.327772617340088
valid loss: 3.32956600189209
perplexity: 27.926219940185547


training:  87%|████████▋ | 9561/10986 [3:43:44<43:21,  1.83s/it]

training loss: 3.3411378860473633


training:  87%|████████▋ | 9562/10986 [3:43:45<41:07,  1.73s/it]

training loss: 3.5618672370910645


training:  87%|████████▋ | 9563/10986 [3:43:47<38:20,  1.62s/it]

training loss: 3.343799114227295


training:  87%|████████▋ | 9564/10986 [3:43:48<36:22,  1.53s/it]

training loss: 3.4586758613586426


training:  87%|████████▋ | 9565/10986 [3:43:49<34:54,  1.47s/it]

training loss: 3.3146657943725586


training:  87%|████████▋ | 9566/10986 [3:43:51<33:53,  1.43s/it]

training loss: 3.4284887313842773


training:  87%|████████▋ | 9567/10986 [3:43:52<33:13,  1.40s/it]

training loss: 3.4094135761260986


training:  87%|████████▋ | 9568/10986 [3:43:53<32:54,  1.39s/it]

training loss: 3.3145296573638916


training:  87%|████████▋ | 9569/10986 [3:43:55<32:45,  1.39s/it]

training loss: 3.5013527870178223


training:  87%|████████▋ | 9570/10986 [3:43:56<32:34,  1.38s/it]

training loss: 3.3739476203918457


training:  87%|████████▋ | 9571/10986 [3:43:58<34:09,  1.45s/it]

training loss: 3.3412976264953613


training:  87%|████████▋ | 9572/10986 [3:43:59<33:29,  1.42s/it]

training loss: 3.365999460220337


training:  87%|████████▋ | 9573/10986 [3:44:00<32:54,  1.40s/it]

training loss: 3.3876230716705322


training:  87%|████████▋ | 9574/10986 [3:44:02<32:37,  1.39s/it]

training loss: 3.3768093585968018


training:  87%|████████▋ | 9575/10986 [3:44:03<32:20,  1.37s/it]

training loss: 3.5886268615722656


training:  87%|████████▋ | 9576/10986 [3:44:05<32:09,  1.37s/it]

training loss: 3.578455924987793


training:  87%|████████▋ | 9577/10986 [3:44:06<31:50,  1.36s/it]

training loss: 3.324575424194336


training:  87%|████████▋ | 9578/10986 [3:44:07<31:45,  1.35s/it]

training loss: 3.4505560398101807


training:  87%|████████▋ | 9579/10986 [3:44:09<31:39,  1.35s/it]

training loss: 3.509115219116211


training:  87%|████████▋ | 9580/10986 [3:44:10<31:49,  1.36s/it]

training loss: 3.426063299179077
valid loss: 3.4256246089935303
perplexity: 30.74184226989746


training:  87%|████████▋ | 9581/10986 [3:44:13<42:30,  1.82s/it]

training loss: 3.4291176795959473


training:  87%|████████▋ | 9582/10986 [3:44:14<39:39,  1.69s/it]

training loss: 3.358675479888916


training:  87%|████████▋ | 9583/10986 [3:44:16<37:16,  1.59s/it]

training loss: 3.4206671714782715


training:  87%|████████▋ | 9584/10986 [3:44:17<35:25,  1.52s/it]

training loss: 3.334048271179199


training:  87%|████████▋ | 9585/10986 [3:44:18<34:08,  1.46s/it]

training loss: 3.3779163360595703


training:  87%|████████▋ | 9586/10986 [3:44:20<33:08,  1.42s/it]

training loss: 3.3394978046417236


training:  87%|████████▋ | 9587/10986 [3:44:21<32:36,  1.40s/it]

training loss: 3.4914133548736572


training:  87%|████████▋ | 9588/10986 [3:44:22<32:14,  1.38s/it]

training loss: 3.3298568725585938


training:  87%|████████▋ | 9589/10986 [3:44:24<31:49,  1.37s/it]

training loss: 3.435683488845825


training:  87%|████████▋ | 9590/10986 [3:44:25<31:35,  1.36s/it]

training loss: 3.2788445949554443


training:  87%|████████▋ | 9591/10986 [3:44:27<33:26,  1.44s/it]

training loss: 3.4335854053497314


training:  87%|████████▋ | 9592/10986 [3:44:28<32:54,  1.42s/it]

training loss: 3.4531877040863037


training:  87%|████████▋ | 9593/10986 [3:44:29<32:18,  1.39s/it]

training loss: 3.447033166885376


training:  87%|████████▋ | 9594/10986 [3:44:31<31:55,  1.38s/it]

training loss: 3.419173240661621


training:  87%|████████▋ | 9595/10986 [3:44:32<31:47,  1.37s/it]

training loss: 3.3972856998443604


training:  87%|████████▋ | 9596/10986 [3:44:33<31:28,  1.36s/it]

training loss: 3.420187473297119


training:  87%|████████▋ | 9597/10986 [3:44:35<31:09,  1.35s/it]

training loss: 3.2255921363830566


training:  87%|████████▋ | 9598/10986 [3:44:36<30:58,  1.34s/it]

training loss: 3.370986223220825


training:  87%|████████▋ | 9599/10986 [3:44:37<30:56,  1.34s/it]

training loss: 3.4919562339782715


training:  87%|████████▋ | 9600/10986 [3:44:39<30:57,  1.34s/it]

training loss: 3.4121689796447754
valid loss: 3.4138989448547363
perplexity: 30.38347625732422


training:  87%|████████▋ | 9601/10986 [3:44:42<42:19,  1.83s/it]

training loss: 3.328035593032837


training:  87%|████████▋ | 9602/10986 [3:44:43<39:26,  1.71s/it]

training loss: 3.4478585720062256


training:  87%|████████▋ | 9603/10986 [3:44:44<37:03,  1.61s/it]

training loss: 3.3718693256378174


training:  87%|████████▋ | 9604/10986 [3:44:46<35:08,  1.53s/it]

training loss: 3.471742868423462


training:  87%|████████▋ | 9605/10986 [3:44:47<33:44,  1.47s/it]

training loss: 3.428481101989746


training:  87%|████████▋ | 9606/10986 [3:44:48<33:00,  1.43s/it]

training loss: 3.3443965911865234


training:  87%|████████▋ | 9607/10986 [3:44:50<32:13,  1.40s/it]

training loss: 3.429337739944458


training:  87%|████████▋ | 9608/10986 [3:44:51<31:55,  1.39s/it]

training loss: 3.3591790199279785


training:  87%|████████▋ | 9609/10986 [3:44:52<31:35,  1.38s/it]

training loss: 3.435185194015503


training:  87%|████████▋ | 9610/10986 [3:44:54<31:19,  1.37s/it]

training loss: 3.410491943359375


training:  87%|████████▋ | 9611/10986 [3:44:55<33:25,  1.46s/it]

training loss: 3.456206798553467


training:  87%|████████▋ | 9612/10986 [3:44:57<33:35,  1.47s/it]

training loss: 3.4011752605438232


training:  88%|████████▊ | 9613/10986 [3:44:58<32:49,  1.43s/it]

training loss: 3.2721004486083984


training:  88%|████████▊ | 9614/10986 [3:45:00<32:18,  1.41s/it]

training loss: 3.368809461593628


training:  88%|████████▊ | 9615/10986 [3:45:01<31:46,  1.39s/it]

training loss: 3.3487768173217773


training:  88%|████████▊ | 9616/10986 [3:45:02<31:25,  1.38s/it]

training loss: 3.513603687286377


training:  88%|████████▊ | 9617/10986 [3:45:04<31:09,  1.37s/it]

training loss: 3.4472241401672363


training:  88%|████████▊ | 9618/10986 [3:45:05<30:57,  1.36s/it]

training loss: 3.252955436706543


training:  88%|████████▊ | 9619/10986 [3:45:06<30:41,  1.35s/it]

training loss: 3.362704277038574


training:  88%|████████▊ | 9620/10986 [3:45:08<30:31,  1.34s/it]

training loss: 3.3956735134124756
valid loss: 3.3946328163146973
perplexity: 29.803709030151367


training:  88%|████████▊ | 9621/10986 [3:45:11<41:18,  1.82s/it]

training loss: 3.4356420040130615


training:  88%|████████▊ | 9622/10986 [3:45:12<38:42,  1.70s/it]

training loss: 3.45548939704895


training:  88%|████████▊ | 9623/10986 [3:45:13<36:19,  1.60s/it]

training loss: 3.3912851810455322


training:  88%|████████▊ | 9624/10986 [3:45:15<34:18,  1.51s/it]

training loss: 3.4236934185028076


training:  88%|████████▊ | 9625/10986 [3:45:16<33:13,  1.47s/it]

training loss: 3.4347729682922363


training:  88%|████████▊ | 9626/10986 [3:45:17<32:14,  1.42s/it]

training loss: 3.4793524742126465


training:  88%|████████▊ | 9627/10986 [3:45:19<31:51,  1.41s/it]

training loss: 3.5214972496032715


training:  88%|████████▊ | 9628/10986 [3:45:20<31:24,  1.39s/it]

training loss: 3.4124624729156494


training:  88%|████████▊ | 9629/10986 [3:45:21<31:07,  1.38s/it]

training loss: 3.4442248344421387


training:  88%|████████▊ | 9630/10986 [3:45:23<30:54,  1.37s/it]

training loss: 3.4164681434631348


training:  88%|████████▊ | 9631/10986 [3:45:24<32:39,  1.45s/it]

training loss: 3.3042073249816895


training:  88%|████████▊ | 9632/10986 [3:45:26<32:22,  1.43s/it]

training loss: 3.4524991512298584


training:  88%|████████▊ | 9633/10986 [3:45:27<31:43,  1.41s/it]

training loss: 3.350646495819092


training:  88%|████████▊ | 9634/10986 [3:45:29<31:05,  1.38s/it]

training loss: 3.3853557109832764


training:  88%|████████▊ | 9635/10986 [3:45:30<30:49,  1.37s/it]

training loss: 3.4566967487335205


training:  88%|████████▊ | 9636/10986 [3:45:31<30:25,  1.35s/it]

training loss: 3.3752026557922363


training:  88%|████████▊ | 9637/10986 [3:45:33<30:22,  1.35s/it]

training loss: 3.4351019859313965


training:  88%|████████▊ | 9638/10986 [3:45:34<30:03,  1.34s/it]

training loss: 3.4237122535705566


training:  88%|████████▊ | 9639/10986 [3:45:35<30:02,  1.34s/it]

training loss: 3.4115939140319824


training:  88%|████████▊ | 9640/10986 [3:45:36<30:03,  1.34s/it]

training loss: 3.4808390140533447
valid loss: 3.477623462677002
perplexity: 32.38267135620117


training:  88%|████████▊ | 9641/10986 [3:45:39<40:16,  1.80s/it]

training loss: 3.453554630279541


training:  88%|████████▊ | 9642/10986 [3:45:41<38:04,  1.70s/it]

training loss: 3.295241117477417


training:  88%|████████▊ | 9643/10986 [3:45:42<35:40,  1.59s/it]

training loss: 3.4031155109405518


training:  88%|████████▊ | 9644/10986 [3:45:44<34:00,  1.52s/it]

training loss: 3.31964111328125


training:  88%|████████▊ | 9645/10986 [3:45:45<32:48,  1.47s/it]

training loss: 3.358682155609131


training:  88%|████████▊ | 9646/10986 [3:45:46<31:48,  1.42s/it]

training loss: 3.3834760189056396


training:  88%|████████▊ | 9647/10986 [3:45:48<31:17,  1.40s/it]

training loss: 3.4549386501312256


training:  88%|████████▊ | 9648/10986 [3:45:49<30:50,  1.38s/it]

training loss: 3.5088868141174316


training:  88%|████████▊ | 9649/10986 [3:45:50<30:37,  1.37s/it]

training loss: 3.313140392303467


training:  88%|████████▊ | 9650/10986 [3:45:52<30:20,  1.36s/it]

training loss: 3.4444258213043213


training:  88%|████████▊ | 9651/10986 [3:45:53<32:28,  1.46s/it]

training loss: 3.3839688301086426


training:  88%|████████▊ | 9652/10986 [3:45:55<31:43,  1.43s/it]

training loss: 3.3111186027526855


training:  88%|████████▊ | 9653/10986 [3:45:56<30:57,  1.39s/it]

training loss: 3.3265864849090576


training:  88%|████████▊ | 9654/10986 [3:45:57<30:30,  1.37s/it]

training loss: 3.3393895626068115


training:  88%|████████▊ | 9655/10986 [3:45:59<30:15,  1.36s/it]

training loss: 3.369441270828247


training:  88%|████████▊ | 9656/10986 [3:46:00<29:49,  1.35s/it]

training loss: 3.375903844833374


training:  88%|████████▊ | 9657/10986 [3:46:01<29:48,  1.35s/it]

training loss: 3.3926711082458496


training:  88%|████████▊ | 9658/10986 [3:46:03<29:48,  1.35s/it]

training loss: 3.340482711791992


training:  88%|████████▊ | 9659/10986 [3:46:04<29:37,  1.34s/it]

training loss: 3.3802623748779297


training:  88%|████████▊ | 9660/10986 [3:46:05<29:45,  1.35s/it]

training loss: 3.4425857067108154
valid loss: 3.440993309020996
perplexity: 31.217952728271484


training:  88%|████████▊ | 9661/10986 [3:46:08<39:45,  1.80s/it]

training loss: 3.4307055473327637


training:  88%|████████▊ | 9662/10986 [3:46:10<38:50,  1.76s/it]

training loss: 3.3567819595336914


training:  88%|████████▊ | 9663/10986 [3:46:11<36:09,  1.64s/it]

training loss: 3.40250563621521


training:  88%|████████▊ | 9664/10986 [3:46:13<34:11,  1.55s/it]

training loss: 3.368767499923706


training:  88%|████████▊ | 9665/10986 [3:46:14<32:41,  1.48s/it]

training loss: 3.313938617706299


training:  88%|████████▊ | 9666/10986 [3:46:15<31:35,  1.44s/it]

training loss: 3.435133218765259


training:  88%|████████▊ | 9667/10986 [3:46:17<31:11,  1.42s/it]

training loss: 3.3581321239471436


training:  88%|████████▊ | 9668/10986 [3:46:18<30:42,  1.40s/it]

training loss: 3.380094051361084


training:  88%|████████▊ | 9669/10986 [3:46:19<30:24,  1.39s/it]

training loss: 3.418523073196411


training:  88%|████████▊ | 9670/10986 [3:46:21<30:24,  1.39s/it]

training loss: 3.4421582221984863


training:  88%|████████▊ | 9671/10986 [3:46:22<32:13,  1.47s/it]

training loss: 3.3362181186676025


training:  88%|████████▊ | 9672/10986 [3:46:24<31:26,  1.44s/it]

training loss: 3.392908811569214


training:  88%|████████▊ | 9673/10986 [3:46:25<30:44,  1.40s/it]

training loss: 3.3395814895629883


training:  88%|████████▊ | 9674/10986 [3:46:26<30:24,  1.39s/it]

training loss: 3.4550814628601074


training:  88%|████████▊ | 9675/10986 [3:46:28<30:13,  1.38s/it]

training loss: 3.313565254211426


training:  88%|████████▊ | 9676/10986 [3:46:29<29:48,  1.37s/it]

training loss: 3.3813860416412354


training:  88%|████████▊ | 9677/10986 [3:46:30<29:30,  1.35s/it]

training loss: 3.4340226650238037


training:  88%|████████▊ | 9678/10986 [3:46:32<29:23,  1.35s/it]

training loss: 3.301999092102051


training:  88%|████████▊ | 9679/10986 [3:46:33<29:25,  1.35s/it]

training loss: 3.419247627258301


training:  88%|████████▊ | 9680/10986 [3:46:34<29:15,  1.34s/it]

training loss: 3.6345677375793457
valid loss: 3.6222801208496094
perplexity: 37.42279815673828


training:  88%|████████▊ | 9681/10986 [3:46:37<38:58,  1.79s/it]

training loss: 3.3357810974121094


training:  88%|████████▊ | 9682/10986 [3:46:39<37:01,  1.70s/it]

training loss: 3.428069591522217


training:  88%|████████▊ | 9683/10986 [3:46:40<37:10,  1.71s/it]

training loss: 3.410491943359375


training:  88%|████████▊ | 9684/10986 [3:46:42<37:38,  1.73s/it]

training loss: 3.3268375396728516


training:  88%|████████▊ | 9685/10986 [3:46:44<36:30,  1.68s/it]

training loss: 3.3471527099609375


training:  88%|████████▊ | 9686/10986 [3:46:45<34:29,  1.59s/it]

training loss: 3.3259122371673584


training:  88%|████████▊ | 9687/10986 [3:46:47<32:53,  1.52s/it]

training loss: 3.4204351902008057


training:  88%|████████▊ | 9688/10986 [3:46:48<31:42,  1.47s/it]

training loss: 3.6356587409973145


training:  88%|████████▊ | 9689/10986 [3:46:49<30:55,  1.43s/it]

training loss: 3.5063507556915283


training:  88%|████████▊ | 9690/10986 [3:46:51<30:09,  1.40s/it]

training loss: 3.463677167892456


training:  88%|████████▊ | 9691/10986 [3:46:52<31:48,  1.47s/it]

training loss: 3.3496017456054688


training:  88%|████████▊ | 9692/10986 [3:46:54<30:56,  1.43s/it]

training loss: 3.430095911026001


training:  88%|████████▊ | 9693/10986 [3:46:55<30:23,  1.41s/it]

training loss: 3.485377311706543


training:  88%|████████▊ | 9694/10986 [3:46:56<29:57,  1.39s/it]

training loss: 3.447833776473999


training:  88%|████████▊ | 9695/10986 [3:46:58<29:36,  1.38s/it]

training loss: 3.5279858112335205


training:  88%|████████▊ | 9696/10986 [3:46:59<29:24,  1.37s/it]

training loss: 3.3558731079101562


training:  88%|████████▊ | 9697/10986 [3:47:00<29:12,  1.36s/it]

training loss: 3.5504913330078125


training:  88%|████████▊ | 9698/10986 [3:47:02<28:59,  1.35s/it]

training loss: 3.354069948196411


training:  88%|████████▊ | 9699/10986 [3:47:03<28:55,  1.35s/it]

training loss: 3.381082057952881


training:  88%|████████▊ | 9700/10986 [3:47:04<28:59,  1.35s/it]

training loss: 3.4269402027130127
valid loss: 3.4226858615875244
perplexity: 30.651630401611328


training:  88%|████████▊ | 9701/10986 [3:47:07<38:47,  1.81s/it]

training loss: 3.5565924644470215


training:  88%|████████▊ | 9702/10986 [3:47:09<36:03,  1.68s/it]

training loss: 3.337191104888916


training:  88%|████████▊ | 9703/10986 [3:47:10<33:54,  1.59s/it]

training loss: 3.408006429672241


training:  88%|████████▊ | 9704/10986 [3:47:11<32:18,  1.51s/it]

training loss: 3.437180995941162


training:  88%|████████▊ | 9705/10986 [3:47:13<31:09,  1.46s/it]

training loss: 3.3699090480804443


training:  88%|████████▊ | 9706/10986 [3:47:14<30:33,  1.43s/it]

training loss: 3.397432565689087


training:  88%|████████▊ | 9707/10986 [3:47:15<30:03,  1.41s/it]

training loss: 3.3860507011413574


training:  88%|████████▊ | 9708/10986 [3:47:17<29:39,  1.39s/it]

training loss: 3.4854283332824707


training:  88%|████████▊ | 9709/10986 [3:47:18<29:22,  1.38s/it]

training loss: 3.435793161392212


training:  88%|████████▊ | 9710/10986 [3:47:19<29:11,  1.37s/it]

training loss: 3.456643581390381


training:  88%|████████▊ | 9711/10986 [3:47:21<30:52,  1.45s/it]

training loss: 3.4836673736572266


training:  88%|████████▊ | 9712/10986 [3:47:22<30:06,  1.42s/it]

training loss: 3.431212902069092


training:  88%|████████▊ | 9713/10986 [3:47:24<29:31,  1.39s/it]

training loss: 3.374488353729248


training:  88%|████████▊ | 9714/10986 [3:47:25<29:12,  1.38s/it]

training loss: 3.5189037322998047


training:  88%|████████▊ | 9715/10986 [3:47:26<28:58,  1.37s/it]

training loss: 3.4163918495178223


training:  88%|████████▊ | 9716/10986 [3:47:28<28:49,  1.36s/it]

training loss: 3.3237972259521484


training:  88%|████████▊ | 9717/10986 [3:47:29<28:36,  1.35s/it]

training loss: 3.43692684173584


training:  88%|████████▊ | 9718/10986 [3:47:30<28:29,  1.35s/it]

training loss: 3.411966323852539


training:  88%|████████▊ | 9719/10986 [3:47:32<28:31,  1.35s/it]

training loss: 3.4669551849365234


training:  88%|████████▊ | 9720/10986 [3:47:33<28:19,  1.34s/it]

training loss: 3.4182584285736084
valid loss: 3.4214072227478027
perplexity: 30.612462997436523


training:  88%|████████▊ | 9721/10986 [3:47:36<38:06,  1.81s/it]

training loss: 3.412457227706909


training:  88%|████████▊ | 9722/10986 [3:47:37<35:43,  1.70s/it]

training loss: 3.359565019607544


training:  89%|████████▊ | 9723/10986 [3:47:39<33:35,  1.60s/it]

training loss: 3.408043146133423


training:  89%|████████▊ | 9724/10986 [3:47:40<31:55,  1.52s/it]

training loss: 3.4395856857299805


training:  89%|████████▊ | 9725/10986 [3:47:41<30:51,  1.47s/it]

training loss: 3.4579901695251465


training:  89%|████████▊ | 9726/10986 [3:47:43<30:12,  1.44s/it]

training loss: 3.368852376937866


training:  89%|████████▊ | 9727/10986 [3:47:44<29:35,  1.41s/it]

training loss: 3.4202685356140137


training:  89%|████████▊ | 9728/10986 [3:47:46<29:11,  1.39s/it]

training loss: 3.390148162841797


training:  89%|████████▊ | 9729/10986 [3:47:47<28:47,  1.37s/it]

training loss: 3.442974090576172


training:  89%|████████▊ | 9730/10986 [3:47:48<28:36,  1.37s/it]

training loss: 3.37738299369812


training:  89%|████████▊ | 9731/10986 [3:47:50<30:17,  1.45s/it]

training loss: 3.45338773727417


training:  89%|████████▊ | 9732/10986 [3:47:51<29:36,  1.42s/it]

training loss: 3.440031051635742


training:  89%|████████▊ | 9733/10986 [3:47:53<29:04,  1.39s/it]

training loss: 3.456106424331665


training:  89%|████████▊ | 9734/10986 [3:47:54<28:42,  1.38s/it]

training loss: 3.4099478721618652


training:  89%|████████▊ | 9735/10986 [3:47:55<28:27,  1.37s/it]

training loss: 3.378312587738037


training:  89%|████████▊ | 9736/10986 [3:47:57<28:08,  1.35s/it]

training loss: 3.4653730392456055


training:  89%|████████▊ | 9737/10986 [3:47:58<27:59,  1.34s/it]

training loss: 3.499798536300659


training:  89%|████████▊ | 9738/10986 [3:47:59<27:57,  1.34s/it]

training loss: 3.366330146789551


training:  89%|████████▊ | 9739/10986 [3:48:01<27:47,  1.34s/it]

training loss: 3.419304609298706


training:  89%|████████▊ | 9740/10986 [3:48:02<27:42,  1.33s/it]

training loss: 3.3129711151123047
valid loss: 3.3139703273773193
perplexity: 27.494070053100586


training:  89%|████████▊ | 9741/10986 [3:48:05<37:08,  1.79s/it]

training loss: 3.573873996734619


training:  89%|████████▊ | 9742/10986 [3:48:06<34:51,  1.68s/it]

training loss: 3.410449266433716


training:  89%|████████▊ | 9743/10986 [3:48:07<32:42,  1.58s/it]

training loss: 3.4074840545654297


training:  89%|████████▊ | 9744/10986 [3:48:09<31:18,  1.51s/it]

training loss: 3.523744821548462


training:  89%|████████▊ | 9745/10986 [3:48:10<30:18,  1.47s/it]

training loss: 3.3741674423217773


training:  89%|████████▊ | 9746/10986 [3:48:12<29:27,  1.43s/it]

training loss: 3.501957416534424


training:  89%|████████▊ | 9747/10986 [3:48:13<28:43,  1.39s/it]

training loss: 3.368952989578247


training:  89%|████████▊ | 9748/10986 [3:48:14<28:21,  1.37s/it]

training loss: 3.4561378955841064


training:  89%|████████▊ | 9749/10986 [3:48:16<28:32,  1.38s/it]

training loss: 3.330735921859741


training:  89%|████████▊ | 9750/10986 [3:48:17<28:13,  1.37s/it]

training loss: 3.452125072479248


training:  89%|████████▉ | 9751/10986 [3:48:19<29:59,  1.46s/it]

training loss: 3.430943727493286


training:  89%|████████▉ | 9752/10986 [3:48:20<29:26,  1.43s/it]

training loss: 3.604501247406006


training:  89%|████████▉ | 9753/10986 [3:48:21<28:55,  1.41s/it]

training loss: 3.329975128173828


training:  89%|████████▉ | 9754/10986 [3:48:23<28:32,  1.39s/it]

training loss: 3.415501832962036


training:  89%|████████▉ | 9755/10986 [3:48:24<28:21,  1.38s/it]

training loss: 3.3859188556671143


training:  89%|████████▉ | 9756/10986 [3:48:25<28:02,  1.37s/it]

training loss: 3.4110870361328125


training:  89%|████████▉ | 9757/10986 [3:48:27<27:44,  1.35s/it]

training loss: 3.265925407409668


training:  89%|████████▉ | 9758/10986 [3:48:28<27:44,  1.36s/it]

training loss: 3.2383031845092773


training:  89%|████████▉ | 9759/10986 [3:48:29<27:28,  1.34s/it]

training loss: 3.329397678375244


training:  89%|████████▉ | 9760/10986 [3:48:31<27:22,  1.34s/it]

training loss: 3.4437646865844727
valid loss: 3.4397478103637695
perplexity: 31.179094314575195


training:  89%|████████▉ | 9761/10986 [3:48:34<36:39,  1.80s/it]

training loss: 3.405735731124878


training:  89%|████████▉ | 9762/10986 [3:48:35<34:17,  1.68s/it]

training loss: 3.541339635848999


training:  89%|████████▉ | 9763/10986 [3:48:36<32:00,  1.57s/it]

training loss: 3.434513807296753


training:  89%|████████▉ | 9764/10986 [3:48:38<30:27,  1.50s/it]

training loss: 3.4994781017303467


training:  89%|████████▉ | 9765/10986 [3:48:39<29:20,  1.44s/it]

training loss: 3.46547532081604


training:  89%|████████▉ | 9766/10986 [3:48:40<28:40,  1.41s/it]

training loss: 3.3665976524353027


training:  89%|████████▉ | 9767/10986 [3:48:42<28:23,  1.40s/it]

training loss: 3.398543357849121


training:  89%|████████▉ | 9768/10986 [3:48:43<27:58,  1.38s/it]

training loss: 3.376413345336914


training:  89%|████████▉ | 9769/10986 [3:48:44<27:51,  1.37s/it]

training loss: 3.3997690677642822


training:  89%|████████▉ | 9770/10986 [3:48:46<27:45,  1.37s/it]

training loss: 3.4089133739471436


training:  89%|████████▉ | 9771/10986 [3:48:47<29:25,  1.45s/it]

training loss: 3.496213436126709


training:  89%|████████▉ | 9772/10986 [3:48:49<28:44,  1.42s/it]

training loss: 3.4234886169433594


training:  89%|████████▉ | 9773/10986 [3:48:50<28:10,  1.39s/it]

training loss: 3.3995957374572754


training:  89%|████████▉ | 9774/10986 [3:48:51<27:52,  1.38s/it]

training loss: 3.4025609493255615


training:  89%|████████▉ | 9775/10986 [3:48:53<27:28,  1.36s/it]

training loss: 3.449406385421753


training:  89%|████████▉ | 9776/10986 [3:48:54<27:16,  1.35s/it]

training loss: 3.4091391563415527


training:  89%|████████▉ | 9777/10986 [3:48:55<27:11,  1.35s/it]

training loss: 3.434105396270752


training:  89%|████████▉ | 9778/10986 [3:48:57<26:59,  1.34s/it]

training loss: 3.3052587509155273


training:  89%|████████▉ | 9779/10986 [3:48:58<26:51,  1.34s/it]

training loss: 3.3313677310943604


training:  89%|████████▉ | 9780/10986 [3:48:59<26:45,  1.33s/it]

training loss: 3.532900810241699
valid loss: 3.5297789573669434
perplexity: 34.11642837524414


training:  89%|████████▉ | 9781/10986 [3:49:02<36:05,  1.80s/it]

training loss: 3.3565659523010254


training:  89%|████████▉ | 9782/10986 [3:49:04<35:37,  1.78s/it]

training loss: 3.4089882373809814


training:  89%|████████▉ | 9783/10986 [3:49:05<32:57,  1.64s/it]

training loss: 3.375819206237793


training:  89%|████████▉ | 9784/10986 [3:49:07<31:04,  1.55s/it]

training loss: 3.3162484169006348


training:  89%|████████▉ | 9785/10986 [3:49:08<29:40,  1.48s/it]

training loss: 3.4302000999450684


training:  89%|████████▉ | 9786/10986 [3:49:09<28:32,  1.43s/it]

training loss: 3.3279025554656982


training:  89%|████████▉ | 9787/10986 [3:49:10<27:54,  1.40s/it]

training loss: 3.382819652557373


training:  89%|████████▉ | 9788/10986 [3:49:12<27:28,  1.38s/it]

training loss: 3.4259543418884277


training:  89%|████████▉ | 9789/10986 [3:49:13<27:15,  1.37s/it]

training loss: 3.41367244720459


training:  89%|████████▉ | 9790/10986 [3:49:15<27:07,  1.36s/it]

training loss: 3.3834121227264404


training:  89%|████████▉ | 9791/10986 [3:49:16<28:56,  1.45s/it]

training loss: 3.244868278503418


training:  89%|████████▉ | 9792/10986 [3:49:18<28:16,  1.42s/it]

training loss: 3.5251190662384033


training:  89%|████████▉ | 9793/10986 [3:49:19<27:46,  1.40s/it]

training loss: 3.419246196746826


training:  89%|████████▉ | 9794/10986 [3:49:20<27:20,  1.38s/it]

training loss: 3.3175854682922363


training:  89%|████████▉ | 9795/10986 [3:49:22<27:06,  1.37s/it]

training loss: 3.287775993347168


training:  89%|████████▉ | 9796/10986 [3:49:23<26:53,  1.36s/it]

training loss: 3.388277292251587


training:  89%|████████▉ | 9797/10986 [3:49:24<26:51,  1.36s/it]

training loss: 3.456260919570923


training:  89%|████████▉ | 9798/10986 [3:49:26<26:37,  1.35s/it]

training loss: 3.3588690757751465


training:  89%|████████▉ | 9799/10986 [3:49:27<26:35,  1.34s/it]

training loss: 3.546274423599243


training:  89%|████████▉ | 9800/10986 [3:49:28<26:27,  1.34s/it]

training loss: 3.3471553325653076
valid loss: 3.3466601371765137
perplexity: 28.407699584960938


training:  89%|████████▉ | 9801/10986 [3:49:31<35:13,  1.78s/it]

training loss: 3.515972137451172


training:  89%|████████▉ | 9802/10986 [3:49:32<33:09,  1.68s/it]

training loss: 3.387791872024536


training:  89%|████████▉ | 9803/10986 [3:49:34<31:01,  1.57s/it]

training loss: 3.39273738861084


training:  89%|████████▉ | 9804/10986 [3:49:35<29:21,  1.49s/it]

training loss: 3.4516470432281494


training:  89%|████████▉ | 9805/10986 [3:49:36<28:27,  1.45s/it]

training loss: 3.451097011566162


training:  89%|████████▉ | 9806/10986 [3:49:38<27:46,  1.41s/it]

training loss: 3.421076536178589


training:  89%|████████▉ | 9807/10986 [3:49:39<27:07,  1.38s/it]

training loss: 3.4305033683776855


training:  89%|████████▉ | 9808/10986 [3:49:40<26:50,  1.37s/it]

training loss: 3.540247917175293


training:  89%|████████▉ | 9809/10986 [3:49:42<26:32,  1.35s/it]

training loss: 3.424670696258545


training:  89%|████████▉ | 9810/10986 [3:49:43<26:34,  1.36s/it]

training loss: 3.3942322731018066


training:  89%|████████▉ | 9811/10986 [3:49:45<27:55,  1.43s/it]

training loss: 3.3645129203796387


training:  89%|████████▉ | 9812/10986 [3:49:46<27:34,  1.41s/it]

training loss: 3.4061453342437744


training:  89%|████████▉ | 9813/10986 [3:49:47<27:09,  1.39s/it]

training loss: 3.5222291946411133


training:  89%|████████▉ | 9814/10986 [3:49:49<26:40,  1.37s/it]

training loss: 3.4002161026000977


training:  89%|████████▉ | 9815/10986 [3:49:50<26:25,  1.35s/it]

training loss: 3.4670209884643555


training:  89%|████████▉ | 9816/10986 [3:49:51<26:12,  1.34s/it]

training loss: 3.405034303665161


training:  89%|████████▉ | 9817/10986 [3:49:53<25:55,  1.33s/it]

training loss: 3.4219093322753906


training:  89%|████████▉ | 9818/10986 [3:49:54<26:04,  1.34s/it]

training loss: 3.343240261077881


training:  89%|████████▉ | 9819/10986 [3:49:55<26:00,  1.34s/it]

training loss: 3.4434375762939453


training:  89%|████████▉ | 9820/10986 [3:49:57<25:54,  1.33s/it]

training loss: 3.3607022762298584
valid loss: 3.3579325675964355
perplexity: 28.729732513427734


training:  89%|████████▉ | 9821/10986 [3:50:00<34:52,  1.80s/it]

training loss: 3.3761048316955566


training:  89%|████████▉ | 9822/10986 [3:50:01<32:26,  1.67s/it]

training loss: 3.4653446674346924


training:  89%|████████▉ | 9823/10986 [3:50:02<30:31,  1.58s/it]

training loss: 3.3928258419036865


training:  89%|████████▉ | 9824/10986 [3:50:04<28:59,  1.50s/it]

training loss: 3.359332323074341


training:  89%|████████▉ | 9825/10986 [3:50:05<27:54,  1.44s/it]

training loss: 3.5163888931274414


training:  89%|████████▉ | 9826/10986 [3:50:06<27:16,  1.41s/it]

training loss: 3.480252742767334


training:  89%|████████▉ | 9827/10986 [3:50:08<26:52,  1.39s/it]

training loss: 3.4485361576080322


training:  89%|████████▉ | 9828/10986 [3:50:09<26:17,  1.36s/it]

training loss: 3.471252918243408


training:  89%|████████▉ | 9829/10986 [3:50:10<26:11,  1.36s/it]

training loss: 3.529043197631836


training:  89%|████████▉ | 9830/10986 [3:50:12<25:58,  1.35s/it]

training loss: 3.426297426223755


training:  89%|████████▉ | 9831/10986 [3:50:13<27:26,  1.43s/it]

training loss: 3.3483364582061768


training:  89%|████████▉ | 9832/10986 [3:50:15<28:48,  1.50s/it]

training loss: 3.427748680114746


training:  90%|████████▉ | 9833/10986 [3:50:16<27:58,  1.46s/it]

training loss: 3.3730456829071045


training:  90%|████████▉ | 9834/10986 [3:50:18<27:20,  1.42s/it]

training loss: 3.3256912231445312


training:  90%|████████▉ | 9835/10986 [3:50:19<26:45,  1.39s/it]

training loss: 3.375640392303467


training:  90%|████████▉ | 9836/10986 [3:50:20<26:18,  1.37s/it]

training loss: 3.350773334503174


training:  90%|████████▉ | 9837/10986 [3:50:22<26:13,  1.37s/it]

training loss: 3.464052677154541


training:  90%|████████▉ | 9838/10986 [3:50:23<26:04,  1.36s/it]

training loss: 3.3376779556274414


training:  90%|████████▉ | 9839/10986 [3:50:24<25:59,  1.36s/it]

training loss: 3.4734811782836914


training:  90%|████████▉ | 9840/10986 [3:50:26<25:51,  1.35s/it]

training loss: 3.469651222229004
valid loss: 3.465681791305542
perplexity: 31.99827003479004


training:  90%|████████▉ | 9841/10986 [3:50:28<34:37,  1.81s/it]

training loss: 3.364436149597168


training:  90%|████████▉ | 9842/10986 [3:50:30<32:15,  1.69s/it]

training loss: 3.317765474319458


training:  90%|████████▉ | 9843/10986 [3:50:31<30:17,  1.59s/it]

training loss: 3.4242959022521973


training:  90%|████████▉ | 9844/10986 [3:50:33<28:47,  1.51s/it]

training loss: 3.585233449935913


training:  90%|████████▉ | 9845/10986 [3:50:34<27:37,  1.45s/it]

training loss: 3.404916524887085


training:  90%|████████▉ | 9846/10986 [3:50:35<26:52,  1.41s/it]

training loss: 3.464071035385132


training:  90%|████████▉ | 9847/10986 [3:50:37<26:23,  1.39s/it]

training loss: 3.3520445823669434


training:  90%|████████▉ | 9848/10986 [3:50:38<26:02,  1.37s/it]

training loss: 3.4506032466888428


training:  90%|████████▉ | 9849/10986 [3:50:39<25:46,  1.36s/it]

training loss: 3.47119402885437


training:  90%|████████▉ | 9850/10986 [3:50:41<25:32,  1.35s/it]

training loss: 3.361382484436035


training:  90%|████████▉ | 9851/10986 [3:50:42<26:59,  1.43s/it]

training loss: 3.514195680618286


training:  90%|████████▉ | 9852/10986 [3:50:44<26:46,  1.42s/it]

training loss: 3.399319648742676


training:  90%|████████▉ | 9853/10986 [3:50:45<26:20,  1.39s/it]

training loss: 3.4425911903381348


training:  90%|████████▉ | 9854/10986 [3:50:46<25:52,  1.37s/it]

training loss: 3.4015634059906006


training:  90%|████████▉ | 9855/10986 [3:50:48<25:57,  1.38s/it]

training loss: 3.4348111152648926


training:  90%|████████▉ | 9856/10986 [3:50:49<25:37,  1.36s/it]

training loss: 3.4177184104919434


training:  90%|████████▉ | 9857/10986 [3:50:50<25:24,  1.35s/it]

training loss: 3.4031500816345215


training:  90%|████████▉ | 9858/10986 [3:50:52<25:13,  1.34s/it]

training loss: 3.360172986984253


training:  90%|████████▉ | 9859/10986 [3:50:53<25:27,  1.35s/it]

training loss: 3.3464670181274414


training:  90%|████████▉ | 9860/10986 [3:50:54<25:20,  1.35s/it]

training loss: 3.4556069374084473
valid loss: 3.452975273132324
perplexity: 31.594253540039062


training:  90%|████████▉ | 9861/10986 [3:50:57<33:50,  1.80s/it]

training loss: 3.344245672225952


training:  90%|████████▉ | 9862/10986 [3:50:59<33:02,  1.76s/it]

training loss: 3.283604621887207


training:  90%|████████▉ | 9863/10986 [3:51:00<30:41,  1.64s/it]

training loss: 3.2791597843170166


training:  90%|████████▉ | 9864/10986 [3:51:01<28:54,  1.55s/it]

training loss: 3.383942127227783


training:  90%|████████▉ | 9865/10986 [3:51:03<27:37,  1.48s/it]

training loss: 3.3879523277282715


training:  90%|████████▉ | 9866/10986 [3:51:04<26:45,  1.43s/it]

training loss: 3.4755146503448486


training:  90%|████████▉ | 9867/10986 [3:51:05<26:06,  1.40s/it]

training loss: 3.3879151344299316


training:  90%|████████▉ | 9868/10986 [3:51:07<25:43,  1.38s/it]

training loss: 3.3625314235687256


training:  90%|████████▉ | 9869/10986 [3:51:08<25:31,  1.37s/it]

training loss: 3.4760875701904297


training:  90%|████████▉ | 9870/10986 [3:51:09<25:20,  1.36s/it]

training loss: 3.3563177585601807


training:  90%|████████▉ | 9871/10986 [3:51:11<27:53,  1.50s/it]

training loss: 3.5142056941986084


training:  90%|████████▉ | 9872/10986 [3:51:13<29:59,  1.62s/it]

training loss: 3.36920166015625


training:  90%|████████▉ | 9873/10986 [3:51:15<29:58,  1.62s/it]

training loss: 3.409648895263672


training:  90%|████████▉ | 9874/10986 [3:51:16<28:24,  1.53s/it]

training loss: 3.3696701526641846


training:  90%|████████▉ | 9875/10986 [3:51:18<27:28,  1.48s/it]

training loss: 3.4497933387756348


training:  90%|████████▉ | 9876/10986 [3:51:19<26:34,  1.44s/it]

training loss: 3.5045158863067627


training:  90%|████████▉ | 9877/10986 [3:51:20<26:06,  1.41s/it]

training loss: 3.334453582763672


training:  90%|████████▉ | 9878/10986 [3:51:22<25:29,  1.38s/it]

training loss: 3.3905510902404785


training:  90%|████████▉ | 9879/10986 [3:51:23<25:08,  1.36s/it]

training loss: 3.449127197265625


training:  90%|████████▉ | 9880/10986 [3:51:24<25:13,  1.37s/it]

training loss: 3.356464385986328
valid loss: 3.3610711097717285
perplexity: 28.820043563842773


training:  90%|████████▉ | 9881/10986 [3:51:27<33:15,  1.81s/it]

training loss: 3.2625885009765625


training:  90%|████████▉ | 9882/10986 [3:51:28<31:10,  1.69s/it]

training loss: 3.5218491554260254


training:  90%|████████▉ | 9883/10986 [3:51:30<29:03,  1.58s/it]

training loss: 3.3588130474090576


training:  90%|████████▉ | 9884/10986 [3:51:31<27:42,  1.51s/it]

training loss: 3.3393561840057373


training:  90%|████████▉ | 9885/10986 [3:51:32<26:45,  1.46s/it]

training loss: 3.409952402114868


training:  90%|████████▉ | 9886/10986 [3:51:34<26:00,  1.42s/it]

training loss: 3.4185447692871094


training:  90%|████████▉ | 9887/10986 [3:51:35<25:31,  1.39s/it]

training loss: 3.356889247894287


training:  90%|█████████ | 9888/10986 [3:51:36<25:11,  1.38s/it]

training loss: 3.3178114891052246


training:  90%|█████████ | 9889/10986 [3:51:38<24:55,  1.36s/it]

training loss: 3.454315662384033


training:  90%|█████████ | 9890/10986 [3:51:39<24:40,  1.35s/it]

training loss: 3.4337780475616455


training:  90%|█████████ | 9891/10986 [3:51:41<26:10,  1.43s/it]

training loss: 3.397162914276123


training:  90%|█████████ | 9892/10986 [3:51:42<25:40,  1.41s/it]

training loss: 3.35935378074646


training:  90%|█████████ | 9893/10986 [3:51:43<25:16,  1.39s/it]

training loss: 3.3797757625579834


training:  90%|█████████ | 9894/10986 [3:51:45<25:03,  1.38s/it]

training loss: 3.379321813583374


training:  90%|█████████ | 9895/10986 [3:51:46<24:57,  1.37s/it]

training loss: 3.3747284412384033


training:  90%|█████████ | 9896/10986 [3:51:47<24:48,  1.37s/it]

training loss: 3.399104595184326


training:  90%|█████████ | 9897/10986 [3:51:49<24:38,  1.36s/it]

training loss: 3.447425603866577


training:  90%|█████████ | 9898/10986 [3:51:50<24:23,  1.34s/it]

training loss: 3.390389919281006


training:  90%|█████████ | 9899/10986 [3:51:51<24:20,  1.34s/it]

training loss: 3.371687889099121


training:  90%|█████████ | 9900/10986 [3:51:53<24:10,  1.34s/it]

training loss: 3.3507297039031982
valid loss: 3.3462576866149902
perplexity: 28.39626693725586


training:  90%|█████████ | 9901/10986 [3:51:56<32:30,  1.80s/it]

training loss: 3.4623022079467773


training:  90%|█████████ | 9902/10986 [3:51:57<30:18,  1.68s/it]

training loss: 3.459216594696045


training:  90%|█████████ | 9903/10986 [3:51:58<28:19,  1.57s/it]

training loss: 3.477501153945923


training:  90%|█████████ | 9904/10986 [3:52:00<27:03,  1.50s/it]

training loss: 3.303706645965576


training:  90%|█████████ | 9905/10986 [3:52:01<26:03,  1.45s/it]

training loss: 3.4167838096618652


training:  90%|█████████ | 9906/10986 [3:52:02<25:28,  1.42s/it]

training loss: 3.386171817779541


training:  90%|█████████ | 9907/10986 [3:52:04<25:01,  1.39s/it]

training loss: 3.345740556716919


training:  90%|█████████ | 9908/10986 [3:52:05<24:40,  1.37s/it]

training loss: 3.3715763092041016


training:  90%|█████████ | 9909/10986 [3:52:06<24:20,  1.36s/it]

training loss: 3.402843952178955


training:  90%|█████████ | 9910/10986 [3:52:08<24:13,  1.35s/it]

training loss: 3.4125962257385254


training:  90%|█████████ | 9911/10986 [3:52:09<25:39,  1.43s/it]

training loss: 3.481825828552246


training:  90%|█████████ | 9912/10986 [3:52:11<26:31,  1.48s/it]

training loss: 3.447746753692627


training:  90%|█████████ | 9913/10986 [3:52:12<25:44,  1.44s/it]

training loss: 3.4272079467773438


training:  90%|█████████ | 9914/10986 [3:52:14<25:08,  1.41s/it]

training loss: 3.332019805908203


training:  90%|█████████ | 9915/10986 [3:52:15<24:43,  1.39s/it]

training loss: 3.50032901763916


training:  90%|█████████ | 9916/10986 [3:52:16<24:27,  1.37s/it]

training loss: 3.396566390991211


training:  90%|█████████ | 9917/10986 [3:52:18<24:23,  1.37s/it]

training loss: 3.4550833702087402


training:  90%|█████████ | 9918/10986 [3:52:19<24:16,  1.36s/it]

training loss: 3.281937599182129


training:  90%|█████████ | 9919/10986 [3:52:20<23:57,  1.35s/it]

training loss: 3.3530092239379883


training:  90%|█████████ | 9920/10986 [3:52:22<23:48,  1.34s/it]

training loss: 3.5020604133605957
valid loss: 3.4974846839904785
perplexity: 33.03226089477539


training:  90%|█████████ | 9921/10986 [3:52:25<31:59,  1.80s/it]

training loss: 3.475860595703125


training:  90%|█████████ | 9922/10986 [3:52:26<29:57,  1.69s/it]

training loss: 3.4682581424713135


training:  90%|█████████ | 9923/10986 [3:52:27<27:56,  1.58s/it]

training loss: 3.385010242462158


training:  90%|█████████ | 9924/10986 [3:52:29<26:43,  1.51s/it]

training loss: 3.427051544189453


training:  90%|█████████ | 9925/10986 [3:52:30<25:39,  1.45s/it]

training loss: 3.437678337097168


training:  90%|█████████ | 9926/10986 [3:52:31<24:59,  1.41s/it]

training loss: 3.355114221572876


training:  90%|█████████ | 9927/10986 [3:52:33<24:25,  1.38s/it]

training loss: 3.42374324798584


training:  90%|█████████ | 9928/10986 [3:52:34<24:05,  1.37s/it]

training loss: 3.4662811756134033


training:  90%|█████████ | 9929/10986 [3:52:35<23:49,  1.35s/it]

training loss: 3.400545120239258


training:  90%|█████████ | 9930/10986 [3:52:37<23:37,  1.34s/it]

training loss: 3.4125092029571533


training:  90%|█████████ | 9931/10986 [3:52:38<25:07,  1.43s/it]

training loss: 3.4675707817077637


training:  90%|█████████ | 9932/10986 [3:52:40<24:41,  1.41s/it]

training loss: 3.288552761077881


training:  90%|█████████ | 9933/10986 [3:52:41<24:12,  1.38s/it]

training loss: 3.4902091026306152


training:  90%|█████████ | 9934/10986 [3:52:42<23:48,  1.36s/it]

training loss: 3.411168098449707


training:  90%|█████████ | 9935/10986 [3:52:43<23:35,  1.35s/it]

training loss: 3.3349287509918213


training:  90%|█████████ | 9936/10986 [3:52:45<23:31,  1.34s/it]

training loss: 3.3952646255493164


training:  90%|█████████ | 9937/10986 [3:52:46<23:28,  1.34s/it]

training loss: 3.386713743209839


training:  90%|█████████ | 9938/10986 [3:52:47<23:24,  1.34s/it]

training loss: 3.3776707649230957


training:  90%|█████████ | 9939/10986 [3:52:49<23:39,  1.36s/it]

training loss: 3.3150107860565186


training:  90%|█████████ | 9940/10986 [3:52:50<23:37,  1.36s/it]

training loss: 3.350721597671509
valid loss: 3.3467819690704346
perplexity: 28.41115951538086


training:  90%|█████████ | 9941/10986 [3:52:53<31:17,  1.80s/it]

training loss: 3.3858227729797363


training:  90%|█████████ | 9942/10986 [3:52:54<29:27,  1.69s/it]

training loss: 3.321566104888916


training:  91%|█████████ | 9943/10986 [3:52:56<27:30,  1.58s/it]

training loss: 3.396421432495117


training:  91%|█████████ | 9944/10986 [3:52:57<26:06,  1.50s/it]

training loss: 3.4019837379455566


training:  91%|█████████ | 9945/10986 [3:52:58<25:15,  1.46s/it]

training loss: 3.479006052017212


training:  91%|█████████ | 9946/10986 [3:53:00<24:30,  1.41s/it]

training loss: 3.4422314167022705


training:  91%|█████████ | 9947/10986 [3:53:01<24:02,  1.39s/it]

training loss: 3.404353141784668


training:  91%|█████████ | 9948/10986 [3:53:02<23:44,  1.37s/it]

training loss: 3.3632545471191406


training:  91%|█████████ | 9949/10986 [3:53:04<23:34,  1.36s/it]

training loss: 3.4329030513763428


training:  91%|█████████ | 9950/10986 [3:53:05<23:27,  1.36s/it]

training loss: 3.49175763130188


training:  91%|█████████ | 9951/10986 [3:53:07<24:53,  1.44s/it]

training loss: 3.3886590003967285


training:  91%|█████████ | 9952/10986 [3:53:08<24:14,  1.41s/it]

training loss: 3.4891417026519775


training:  91%|█████████ | 9953/10986 [3:53:09<23:41,  1.38s/it]

training loss: 3.6285171508789062


training:  91%|█████████ | 9954/10986 [3:53:11<23:31,  1.37s/it]

training loss: 3.3480775356292725


training:  91%|█████████ | 9955/10986 [3:53:12<23:11,  1.35s/it]

training loss: 3.3111793994903564


training:  91%|█████████ | 9956/10986 [3:53:13<23:08,  1.35s/it]

training loss: 3.4622230529785156


training:  91%|█████████ | 9957/10986 [3:53:15<22:59,  1.34s/it]

training loss: 3.3454174995422363


training:  91%|█████████ | 9958/10986 [3:53:16<22:53,  1.34s/it]

training loss: 3.3933143615722656


training:  91%|█████████ | 9959/10986 [3:53:17<22:57,  1.34s/it]

training loss: 3.3788347244262695


training:  91%|█████████ | 9960/10986 [3:53:19<22:56,  1.34s/it]

training loss: 3.575472831726074
valid loss: 3.567491292953491
perplexity: 35.42760467529297


training:  91%|█████████ | 9961/10986 [3:53:22<30:36,  1.79s/it]

training loss: 3.460050582885742


training:  91%|█████████ | 9962/10986 [3:53:23<29:49,  1.75s/it]

training loss: 3.408738613128662


training:  91%|█████████ | 9963/10986 [3:53:25<27:40,  1.62s/it]

training loss: 3.3842625617980957


training:  91%|█████████ | 9964/10986 [3:53:26<26:10,  1.54s/it]

training loss: 3.4151480197906494


training:  91%|█████████ | 9965/10986 [3:53:27<25:12,  1.48s/it]

training loss: 3.369633436203003


training:  91%|█████████ | 9966/10986 [3:53:29<24:24,  1.44s/it]

training loss: 3.4265706539154053


training:  91%|█████████ | 9967/10986 [3:53:30<23:47,  1.40s/it]

training loss: 3.482757568359375


training:  91%|█████████ | 9968/10986 [3:53:31<23:31,  1.39s/it]

training loss: 3.3258304595947266


training:  91%|█████████ | 9969/10986 [3:53:33<23:22,  1.38s/it]

training loss: 3.5640461444854736


training:  91%|█████████ | 9970/10986 [3:53:34<22:57,  1.36s/it]

training loss: 3.4322633743286133


training:  91%|█████████ | 9971/10986 [3:53:36<24:26,  1.45s/it]

training loss: 3.4346659183502197


training:  91%|█████████ | 9972/10986 [3:53:37<23:55,  1.42s/it]

training loss: 3.4898393154144287


training:  91%|█████████ | 9973/10986 [3:53:38<23:27,  1.39s/it]

training loss: 3.4340572357177734


training:  91%|█████████ | 9974/10986 [3:53:40<23:02,  1.37s/it]

training loss: 3.4789323806762695


training:  91%|█████████ | 9975/10986 [3:53:41<22:47,  1.35s/it]

training loss: 3.3357479572296143


training:  91%|█████████ | 9976/10986 [3:53:42<22:43,  1.35s/it]

training loss: 3.451355457305908


training:  91%|█████████ | 9977/10986 [3:53:44<22:44,  1.35s/it]

training loss: 3.387206554412842


training:  91%|█████████ | 9978/10986 [3:53:45<22:42,  1.35s/it]

training loss: 3.5240111351013184


training:  91%|█████████ | 9979/10986 [3:53:46<22:30,  1.34s/it]

training loss: 3.350208044052124


training:  91%|█████████ | 9980/10986 [3:53:48<22:33,  1.35s/it]

training loss: 3.4215760231018066
valid loss: 3.4233689308166504
perplexity: 30.672574996948242


training:  91%|█████████ | 9981/10986 [3:53:51<30:30,  1.82s/it]

training loss: 3.391554594039917


training:  91%|█████████ | 9982/10986 [3:53:52<29:37,  1.77s/it]

training loss: 3.3590049743652344


training:  91%|█████████ | 9983/10986 [3:53:54<27:32,  1.65s/it]

training loss: 3.404703140258789


training:  91%|█████████ | 9984/10986 [3:53:55<26:05,  1.56s/it]

training loss: 3.3556270599365234


training:  91%|█████████ | 9985/10986 [3:53:56<24:51,  1.49s/it]

training loss: 3.4239447116851807


training:  91%|█████████ | 9986/10986 [3:53:58<24:01,  1.44s/it]

training loss: 3.4358601570129395


training:  91%|█████████ | 9987/10986 [3:53:59<23:23,  1.41s/it]

training loss: 3.4734339714050293


training:  91%|█████████ | 9988/10986 [3:54:00<22:47,  1.37s/it]

training loss: 3.3045732975006104


training:  91%|█████████ | 9989/10986 [3:54:02<22:38,  1.36s/it]

training loss: 3.4502882957458496


training:  91%|█████████ | 9990/10986 [3:54:03<22:19,  1.35s/it]

training loss: 3.3910515308380127


training:  91%|█████████ | 9991/10986 [3:54:04<23:43,  1.43s/it]

training loss: 3.3185484409332275


training:  91%|█████████ | 9992/10986 [3:54:06<23:13,  1.40s/it]

training loss: 3.4695518016815186


training:  91%|█████████ | 9993/10986 [3:54:07<22:45,  1.38s/it]

training loss: 3.4774205684661865


training:  91%|█████████ | 9994/10986 [3:54:08<22:27,  1.36s/it]

training loss: 3.3063855171203613


training:  91%|█████████ | 9995/10986 [3:54:10<22:20,  1.35s/it]

training loss: 3.4153406620025635


training:  91%|█████████ | 9996/10986 [3:54:11<22:03,  1.34s/it]

training loss: 3.530360460281372


training:  91%|█████████ | 9997/10986 [3:54:12<21:57,  1.33s/it]

training loss: 3.3986587524414062


training:  91%|█████████ | 9998/10986 [3:54:14<21:54,  1.33s/it]

training loss: 3.3685717582702637


training:  91%|█████████ | 9999/10986 [3:54:15<21:58,  1.34s/it]

training loss: 3.3036375045776367


training:  91%|█████████ | 10000/10986 [3:54:16<21:55,  1.33s/it]

training loss: 3.3440229892730713
valid loss: 3.340843915939331
perplexity: 28.242952346801758


training:  91%|█████████ | 10001/10986 [3:54:19<29:41,  1.81s/it]

training loss: 3.4207308292388916


training:  91%|█████████ | 10002/10986 [3:54:21<28:45,  1.75s/it]

training loss: 3.3183581829071045


training:  91%|█████████ | 10003/10986 [3:54:22<26:38,  1.63s/it]

training loss: 3.365262269973755


training:  91%|█████████ | 10004/10986 [3:54:24<25:07,  1.54s/it]

training loss: 3.3960134983062744


training:  91%|█████████ | 10005/10986 [3:54:25<24:00,  1.47s/it]

training loss: 3.4478793144226074


training:  91%|█████████ | 10006/10986 [3:54:26<23:23,  1.43s/it]

training loss: 3.435201406478882


training:  91%|█████████ | 10007/10986 [3:54:28<22:53,  1.40s/it]

training loss: 3.365748882293701


training:  91%|█████████ | 10008/10986 [3:54:29<22:38,  1.39s/it]

training loss: 3.258885383605957


training:  91%|█████████ | 10009/10986 [3:54:30<22:22,  1.37s/it]

training loss: 3.437865734100342


training:  91%|█████████ | 10010/10986 [3:54:32<22:02,  1.36s/it]

training loss: 3.462679624557495


training:  91%|█████████ | 10011/10986 [3:54:33<23:22,  1.44s/it]

training loss: 3.4798080921173096


training:  91%|█████████ | 10012/10986 [3:54:35<22:48,  1.41s/it]

training loss: 3.5243895053863525


training:  91%|█████████ | 10013/10986 [3:54:36<22:16,  1.37s/it]

training loss: 3.426187753677368


training:  91%|█████████ | 10014/10986 [3:54:37<21:54,  1.35s/it]

training loss: 3.364957332611084


training:  91%|█████████ | 10015/10986 [3:54:38<21:48,  1.35s/it]

training loss: 3.4509942531585693


training:  91%|█████████ | 10016/10986 [3:54:40<21:38,  1.34s/it]

training loss: 3.3177928924560547


training:  91%|█████████ | 10017/10986 [3:54:41<21:34,  1.34s/it]

training loss: 3.533295154571533


training:  91%|█████████ | 10018/10986 [3:54:42<21:22,  1.33s/it]

training loss: 3.470414638519287


training:  91%|█████████ | 10019/10986 [3:54:44<21:19,  1.32s/it]

training loss: 3.389676809310913


training:  91%|█████████ | 10020/10986 [3:54:45<21:18,  1.32s/it]

training loss: 3.340343952178955
valid loss: 3.3372271060943604
perplexity: 28.1409854888916


training:  91%|█████████ | 10021/10986 [3:54:48<28:41,  1.78s/it]

training loss: 3.3540070056915283


training:  91%|█████████ | 10022/10986 [3:54:49<26:47,  1.67s/it]

training loss: 3.5179502964019775


training:  91%|█████████ | 10023/10986 [3:54:51<25:22,  1.58s/it]

training loss: 3.5432467460632324


training:  91%|█████████ | 10024/10986 [3:54:52<24:10,  1.51s/it]

training loss: 3.30472469329834


training:  91%|█████████▏| 10025/10986 [3:54:53<23:12,  1.45s/it]

training loss: 3.295332431793213


training:  91%|█████████▏| 10026/10986 [3:54:55<22:32,  1.41s/it]

training loss: 3.4859840869903564


training:  91%|█████████▏| 10027/10986 [3:54:56<22:01,  1.38s/it]

training loss: 3.3163347244262695


training:  91%|█████████▏| 10028/10986 [3:54:57<21:46,  1.36s/it]

training loss: 3.4295730590820312


training:  91%|█████████▏| 10029/10986 [3:54:59<21:29,  1.35s/it]

training loss: 3.3142104148864746


training:  91%|█████████▏| 10030/10986 [3:55:00<21:16,  1.34s/it]

training loss: 3.356870651245117


training:  91%|█████████▏| 10031/10986 [3:55:02<22:43,  1.43s/it]

training loss: 3.34659481048584


training:  91%|█████████▏| 10032/10986 [3:55:03<22:37,  1.42s/it]

training loss: 3.4325575828552246


training:  91%|█████████▏| 10033/10986 [3:55:04<21:58,  1.38s/it]

training loss: 3.411186695098877


training:  91%|█████████▏| 10034/10986 [3:55:06<21:36,  1.36s/it]

training loss: 3.417140245437622


training:  91%|█████████▏| 10035/10986 [3:55:07<21:19,  1.34s/it]

training loss: 3.308112621307373


training:  91%|█████████▏| 10036/10986 [3:55:08<21:09,  1.34s/it]

training loss: 3.4866204261779785


training:  91%|█████████▏| 10037/10986 [3:55:10<21:07,  1.34s/it]

training loss: 3.338573455810547


training:  91%|█████████▏| 10038/10986 [3:55:11<21:01,  1.33s/it]

training loss: 3.3925135135650635


training:  91%|█████████▏| 10039/10986 [3:55:12<21:02,  1.33s/it]

training loss: 3.40432071685791


training:  91%|█████████▏| 10040/10986 [3:55:14<20:55,  1.33s/it]

training loss: 3.436802864074707
valid loss: 3.438796043395996
perplexity: 31.149433135986328


training:  91%|█████████▏| 10041/10986 [3:55:16<27:55,  1.77s/it]

training loss: 3.4754068851470947


training:  91%|█████████▏| 10042/10986 [3:55:18<25:58,  1.65s/it]

training loss: 3.525716781616211


training:  91%|█████████▏| 10043/10986 [3:55:19<24:27,  1.56s/it]

training loss: 3.353100299835205


training:  91%|█████████▏| 10044/10986 [3:55:20<23:27,  1.49s/it]

training loss: 3.380220651626587


training:  91%|█████████▏| 10045/10986 [3:55:22<22:32,  1.44s/it]

training loss: 3.4403610229492188


training:  91%|█████████▏| 10046/10986 [3:55:23<21:55,  1.40s/it]

training loss: 3.347229480743408


training:  91%|█████████▏| 10047/10986 [3:55:24<21:26,  1.37s/it]

training loss: 3.36474347114563


training:  91%|█████████▏| 10048/10986 [3:55:26<21:10,  1.35s/it]

training loss: 3.3597867488861084


training:  91%|█████████▏| 10049/10986 [3:55:27<21:10,  1.36s/it]

training loss: 3.357745885848999


training:  91%|█████████▏| 10050/10986 [3:55:28<21:08,  1.36s/it]

training loss: 3.5480213165283203


training:  91%|█████████▏| 10051/10986 [3:55:30<22:20,  1.43s/it]

training loss: 3.390256643295288


training:  91%|█████████▏| 10052/10986 [3:55:31<21:50,  1.40s/it]

training loss: 3.386228561401367


training:  92%|█████████▏| 10053/10986 [3:55:33<21:21,  1.37s/it]

training loss: 3.3749067783355713


training:  92%|█████████▏| 10054/10986 [3:55:34<21:00,  1.35s/it]

training loss: 3.415623188018799


training:  92%|█████████▏| 10055/10986 [3:55:35<20:51,  1.34s/it]

training loss: 3.3994858264923096


training:  92%|█████████▏| 10056/10986 [3:55:37<20:43,  1.34s/it]

training loss: 3.367187023162842


training:  92%|█████████▏| 10057/10986 [3:55:38<20:34,  1.33s/it]

training loss: 3.433645486831665


training:  92%|█████████▏| 10058/10986 [3:55:39<20:32,  1.33s/it]

training loss: 3.3723223209381104


training:  92%|█████████▏| 10059/10986 [3:55:41<20:34,  1.33s/it]

training loss: 3.368161678314209


training:  92%|█████████▏| 10060/10986 [3:55:42<20:30,  1.33s/it]

training loss: 3.4502718448638916
valid loss: 3.439727783203125
perplexity: 31.178468704223633


training:  92%|█████████▏| 10061/10986 [3:55:45<29:03,  1.88s/it]

training loss: 3.4365320205688477


training:  92%|█████████▏| 10062/10986 [3:55:47<29:08,  1.89s/it]

training loss: 3.389946460723877


training:  92%|█████████▏| 10063/10986 [3:55:48<27:18,  1.78s/it]

training loss: 3.437967300415039


training:  92%|█████████▏| 10064/10986 [3:55:50<25:16,  1.64s/it]

training loss: 3.2867488861083984


training:  92%|█████████▏| 10065/10986 [3:55:51<23:56,  1.56s/it]

training loss: 3.3414881229400635


training:  92%|█████████▏| 10066/10986 [3:55:52<22:56,  1.50s/it]

training loss: 3.3411858081817627


training:  92%|█████████▏| 10067/10986 [3:55:54<22:06,  1.44s/it]

training loss: 3.459861993789673


training:  92%|█████████▏| 10068/10986 [3:55:55<21:35,  1.41s/it]

training loss: 3.4514782428741455


training:  92%|█████████▏| 10069/10986 [3:55:56<21:06,  1.38s/it]

training loss: 3.391824722290039


training:  92%|█████████▏| 10070/10986 [3:55:58<20:49,  1.36s/it]

training loss: 3.3363492488861084


training:  92%|█████████▏| 10071/10986 [3:55:59<21:49,  1.43s/it]

training loss: 3.3995237350463867


training:  92%|█████████▏| 10072/10986 [3:56:01<21:21,  1.40s/it]

training loss: 3.435892105102539


training:  92%|█████████▏| 10073/10986 [3:56:02<21:04,  1.38s/it]

training loss: 3.3228931427001953


training:  92%|█████████▏| 10074/10986 [3:56:03<20:45,  1.37s/it]

training loss: 3.3311963081359863


training:  92%|█████████▏| 10075/10986 [3:56:05<20:30,  1.35s/it]

training loss: 3.437026023864746


training:  92%|█████████▏| 10076/10986 [3:56:06<20:26,  1.35s/it]

training loss: 3.488776206970215


training:  92%|█████████▏| 10077/10986 [3:56:07<20:25,  1.35s/it]

training loss: 3.380889892578125


training:  92%|█████████▏| 10078/10986 [3:56:09<20:11,  1.33s/it]

training loss: 3.4147396087646484


training:  92%|█████████▏| 10079/10986 [3:56:10<20:02,  1.33s/it]

training loss: 3.4176692962646484


training:  92%|█████████▏| 10080/10986 [3:56:11<19:57,  1.32s/it]

training loss: 3.3902571201324463
valid loss: 3.3881142139434814
perplexity: 29.610061645507812


training:  92%|█████████▏| 10081/10986 [3:56:14<26:51,  1.78s/it]

training loss: 3.4481089115142822


training:  92%|█████████▏| 10082/10986 [3:56:16<26:06,  1.73s/it]

training loss: 3.3984453678131104


training:  92%|█████████▏| 10083/10986 [3:56:17<24:19,  1.62s/it]

training loss: 3.389754056930542


training:  92%|█████████▏| 10084/10986 [3:56:18<23:04,  1.54s/it]

training loss: 3.388118028640747


training:  92%|█████████▏| 10085/10986 [3:56:20<22:10,  1.48s/it]

training loss: 3.4492850303649902


training:  92%|█████████▏| 10086/10986 [3:56:21<21:18,  1.42s/it]

training loss: 3.463104009628296


training:  92%|█████████▏| 10087/10986 [3:56:22<20:52,  1.39s/it]

training loss: 3.485379695892334


training:  92%|█████████▏| 10088/10986 [3:56:24<20:36,  1.38s/it]

training loss: 3.464871883392334


training:  92%|█████████▏| 10089/10986 [3:56:25<20:16,  1.36s/it]

training loss: 3.301274061203003


training:  92%|█████████▏| 10090/10986 [3:56:26<20:10,  1.35s/it]

training loss: 3.3654088973999023


training:  92%|█████████▏| 10091/10986 [3:56:28<21:34,  1.45s/it]

training loss: 3.510561943054199


training:  92%|█████████▏| 10092/10986 [3:56:29<21:12,  1.42s/it]

training loss: 3.528148889541626


training:  92%|█████████▏| 10093/10986 [3:56:31<20:50,  1.40s/it]

training loss: 3.236558675765991


training:  92%|█████████▏| 10094/10986 [3:56:32<20:32,  1.38s/it]

training loss: 3.4046690464019775


training:  92%|█████████▏| 10095/10986 [3:56:33<20:17,  1.37s/it]

training loss: 3.4725401401519775


training:  92%|█████████▏| 10096/10986 [3:56:35<20:00,  1.35s/it]

training loss: 3.3614091873168945


training:  92%|█████████▏| 10097/10986 [3:56:36<19:52,  1.34s/it]

training loss: 3.515324831008911


training:  92%|█████████▏| 10098/10986 [3:56:37<19:43,  1.33s/it]

training loss: 3.332704782485962


training:  92%|█████████▏| 10099/10986 [3:56:39<19:39,  1.33s/it]

training loss: 3.3365793228149414


training:  92%|█████████▏| 10100/10986 [3:56:40<19:38,  1.33s/it]

training loss: 3.374819040298462
valid loss: 3.378760814666748
perplexity: 29.33439826965332


training:  92%|█████████▏| 10101/10986 [3:56:43<26:18,  1.78s/it]

training loss: 3.4903461933135986


training:  92%|█████████▏| 10102/10986 [3:56:44<24:36,  1.67s/it]

training loss: 3.420551300048828


training:  92%|█████████▏| 10103/10986 [3:56:46<23:02,  1.57s/it]

training loss: 3.3473129272460938


training:  92%|█████████▏| 10104/10986 [3:56:47<21:59,  1.50s/it]

training loss: 3.4905741214752197


training:  92%|█████████▏| 10105/10986 [3:56:48<21:17,  1.45s/it]

training loss: 3.4673850536346436


training:  92%|█████████▏| 10106/10986 [3:56:50<20:42,  1.41s/it]

training loss: 3.386711597442627


training:  92%|█████████▏| 10107/10986 [3:56:51<20:20,  1.39s/it]

training loss: 3.494856834411621


training:  92%|█████████▏| 10108/10986 [3:56:52<20:02,  1.37s/it]

training loss: 3.2955572605133057


training:  92%|█████████▏| 10109/10986 [3:56:54<20:03,  1.37s/it]

training loss: 3.3746275901794434


training:  92%|█████████▏| 10110/10986 [3:56:55<19:55,  1.36s/it]

training loss: 3.4305074214935303


training:  92%|█████████▏| 10111/10986 [3:56:57<21:12,  1.45s/it]

training loss: 3.4532055854797363


training:  92%|█████████▏| 10112/10986 [3:56:58<20:50,  1.43s/it]

training loss: 3.4149723052978516


training:  92%|█████████▏| 10113/10986 [3:56:59<20:21,  1.40s/it]

training loss: 3.442713499069214


training:  92%|█████████▏| 10114/10986 [3:57:01<20:10,  1.39s/it]

training loss: 3.4304537773132324


training:  92%|█████████▏| 10115/10986 [3:57:02<19:59,  1.38s/it]

training loss: 3.375828266143799


training:  92%|█████████▏| 10116/10986 [3:57:03<19:44,  1.36s/it]

training loss: 3.4069385528564453


training:  92%|█████████▏| 10117/10986 [3:57:05<19:38,  1.36s/it]

training loss: 3.3824877738952637


training:  92%|█████████▏| 10118/10986 [3:57:06<19:32,  1.35s/it]

training loss: 3.454646110534668


training:  92%|█████████▏| 10119/10986 [3:57:07<19:25,  1.34s/it]

training loss: 3.3041610717773438


training:  92%|█████████▏| 10120/10986 [3:57:09<19:23,  1.34s/it]

training loss: 3.4074559211730957
valid loss: 3.408402681350708
perplexity: 30.21693992614746


training:  92%|█████████▏| 10121/10986 [3:57:12<26:04,  1.81s/it]

training loss: 3.40160870552063


training:  92%|█████████▏| 10122/10986 [3:57:13<24:16,  1.69s/it]

training loss: 3.4032046794891357


training:  92%|█████████▏| 10123/10986 [3:57:14<22:39,  1.58s/it]

training loss: 3.4399611949920654


training:  92%|█████████▏| 10124/10986 [3:57:16<21:30,  1.50s/it]

training loss: 3.4548678398132324


training:  92%|█████████▏| 10125/10986 [3:57:17<20:49,  1.45s/it]

training loss: 3.508655309677124


training:  92%|█████████▏| 10126/10986 [3:57:18<20:18,  1.42s/it]

training loss: 3.3565714359283447


training:  92%|█████████▏| 10127/10986 [3:57:20<19:57,  1.39s/it]

training loss: 3.3808252811431885


training:  92%|█████████▏| 10128/10986 [3:57:21<19:39,  1.38s/it]

training loss: 3.338672161102295


training:  92%|█████████▏| 10129/10986 [3:57:22<19:29,  1.36s/it]

training loss: 3.3180766105651855


training:  92%|█████████▏| 10130/10986 [3:57:24<19:17,  1.35s/it]

training loss: 3.4083495140075684


training:  92%|█████████▏| 10131/10986 [3:57:25<20:34,  1.44s/it]

training loss: 3.4214134216308594


training:  92%|█████████▏| 10132/10986 [3:57:27<20:01,  1.41s/it]

training loss: 3.365943431854248


training:  92%|█████████▏| 10133/10986 [3:57:28<19:38,  1.38s/it]

training loss: 3.4924538135528564


training:  92%|█████████▏| 10134/10986 [3:57:29<19:30,  1.37s/it]

training loss: 3.379178285598755


training:  92%|█████████▏| 10135/10986 [3:57:31<19:19,  1.36s/it]

training loss: 3.4220480918884277


training:  92%|█████████▏| 10136/10986 [3:57:32<19:03,  1.34s/it]

training loss: 3.4882426261901855


training:  92%|█████████▏| 10137/10986 [3:57:33<18:52,  1.33s/it]

training loss: 3.2909016609191895


training:  92%|█████████▏| 10138/10986 [3:57:35<18:49,  1.33s/it]

training loss: 3.4109790325164795


training:  92%|█████████▏| 10139/10986 [3:57:36<18:42,  1.33s/it]

training loss: 3.45902943611145


training:  92%|█████████▏| 10140/10986 [3:57:37<18:42,  1.33s/it]

training loss: 3.3261163234710693
valid loss: 3.3217570781707764
perplexity: 27.708995819091797


training:  92%|█████████▏| 10141/10986 [3:57:40<25:04,  1.78s/it]

training loss: 3.4214894771575928


training:  92%|█████████▏| 10142/10986 [3:57:41<23:22,  1.66s/it]

training loss: 3.43485951423645


training:  92%|█████████▏| 10143/10986 [3:57:43<22:00,  1.57s/it]

training loss: 3.3201351165771484


training:  92%|█████████▏| 10144/10986 [3:57:44<20:59,  1.50s/it]

training loss: 3.428558111190796


training:  92%|█████████▏| 10145/10986 [3:57:45<20:09,  1.44s/it]

training loss: 3.4252378940582275


training:  92%|█████████▏| 10146/10986 [3:57:47<19:39,  1.40s/it]

training loss: 3.433791160583496


training:  92%|█████████▏| 10147/10986 [3:57:48<19:16,  1.38s/it]

training loss: 3.3915302753448486


training:  92%|█████████▏| 10148/10986 [3:57:49<18:58,  1.36s/it]

training loss: 3.46256422996521


training:  92%|█████████▏| 10149/10986 [3:57:51<18:50,  1.35s/it]

training loss: 3.3603830337524414


training:  92%|█████████▏| 10150/10986 [3:57:52<18:46,  1.35s/it]

training loss: 3.3499042987823486


training:  92%|█████████▏| 10151/10986 [3:57:54<19:51,  1.43s/it]

training loss: 3.349853277206421


training:  92%|█████████▏| 10152/10986 [3:57:55<19:40,  1.41s/it]

training loss: 3.4377248287200928


training:  92%|█████████▏| 10153/10986 [3:57:56<19:20,  1.39s/it]

training loss: 3.3615050315856934


training:  92%|█████████▏| 10154/10986 [3:57:58<18:58,  1.37s/it]

training loss: 3.4102790355682373


training:  92%|█████████▏| 10155/10986 [3:57:59<18:51,  1.36s/it]

training loss: 3.338219165802002


training:  92%|█████████▏| 10156/10986 [3:58:00<18:44,  1.35s/it]

training loss: 3.5650362968444824


training:  92%|█████████▏| 10157/10986 [3:58:02<18:36,  1.35s/it]

training loss: 3.391866445541382


training:  92%|█████████▏| 10158/10986 [3:58:03<18:29,  1.34s/it]

training loss: 3.3950042724609375


training:  92%|█████████▏| 10159/10986 [3:58:04<18:21,  1.33s/it]

training loss: 3.5317635536193848


training:  92%|█████████▏| 10160/10986 [3:58:06<18:20,  1.33s/it]

training loss: 3.3774824142456055
valid loss: 3.3767611980438232
perplexity: 29.275800704956055


training:  92%|█████████▏| 10161/10986 [3:58:09<24:30,  1.78s/it]

training loss: 3.3193609714508057


training:  92%|█████████▏| 10162/10986 [3:58:10<22:55,  1.67s/it]

training loss: 3.4301917552948


training:  93%|█████████▎| 10163/10986 [3:58:11<21:27,  1.56s/it]

training loss: 3.5427839756011963


training:  93%|█████████▎| 10164/10986 [3:58:13<20:29,  1.50s/it]

training loss: 3.3578803539276123


training:  93%|█████████▎| 10165/10986 [3:58:14<19:45,  1.44s/it]

training loss: 3.3001251220703125


training:  93%|█████████▎| 10166/10986 [3:58:15<19:16,  1.41s/it]

training loss: 3.574915885925293


training:  93%|█████████▎| 10167/10986 [3:58:17<18:53,  1.38s/it]

training loss: 3.44953989982605


training:  93%|█████████▎| 10168/10986 [3:58:18<18:34,  1.36s/it]

training loss: 3.544743061065674


training:  93%|█████████▎| 10169/10986 [3:58:19<18:24,  1.35s/it]

training loss: 3.346731424331665


training:  93%|█████████▎| 10170/10986 [3:58:21<18:15,  1.34s/it]

training loss: 3.3780291080474854


training:  93%|█████████▎| 10171/10986 [3:58:22<19:19,  1.42s/it]

training loss: 3.435141086578369


training:  93%|█████████▎| 10172/10986 [3:58:24<18:55,  1.40s/it]

training loss: 3.387934684753418


training:  93%|█████████▎| 10173/10986 [3:58:25<18:44,  1.38s/it]

training loss: 3.274496555328369


training:  93%|█████████▎| 10174/10986 [3:58:26<18:36,  1.37s/it]

training loss: 3.318291664123535


training:  93%|█████████▎| 10175/10986 [3:58:28<18:20,  1.36s/it]

training loss: 3.3242428302764893


training:  93%|█████████▎| 10176/10986 [3:58:29<18:24,  1.36s/it]

training loss: 3.4112987518310547


training:  93%|█████████▎| 10177/10986 [3:58:30<18:17,  1.36s/it]

training loss: 3.312835693359375


training:  93%|█████████▎| 10178/10986 [3:58:32<18:09,  1.35s/it]

training loss: 3.313424587249756


training:  93%|█████████▎| 10179/10986 [3:58:33<18:03,  1.34s/it]

training loss: 3.3978140354156494


training:  93%|█████████▎| 10180/10986 [3:58:34<17:59,  1.34s/it]

training loss: 3.370262861251831
valid loss: 3.3736274242401123
perplexity: 29.1841983795166


training:  93%|█████████▎| 10181/10986 [3:58:37<24:08,  1.80s/it]

training loss: 3.4333016872406006


training:  93%|█████████▎| 10182/10986 [3:58:39<22:33,  1.68s/it]

training loss: 3.426610231399536


training:  93%|█████████▎| 10183/10986 [3:58:40<20:59,  1.57s/it]

training loss: 3.3952343463897705


training:  93%|█████████▎| 10184/10986 [3:58:41<19:57,  1.49s/it]

training loss: 3.4251835346221924


training:  93%|█████████▎| 10185/10986 [3:58:42<19:09,  1.44s/it]

training loss: 3.35693359375


training:  93%|█████████▎| 10186/10986 [3:58:44<18:39,  1.40s/it]

training loss: 3.339357614517212


training:  93%|█████████▎| 10187/10986 [3:58:45<18:21,  1.38s/it]

training loss: 3.42080020904541


training:  93%|█████████▎| 10188/10986 [3:58:46<18:10,  1.37s/it]

training loss: 3.3172993659973145


training:  93%|█████████▎| 10189/10986 [3:58:48<17:59,  1.35s/it]

training loss: 3.2397022247314453


training:  93%|█████████▎| 10190/10986 [3:58:49<17:53,  1.35s/it]

training loss: 3.4695682525634766


training:  93%|█████████▎| 10191/10986 [3:58:51<18:56,  1.43s/it]

training loss: 3.336435556411743


training:  93%|█████████▎| 10192/10986 [3:58:52<18:37,  1.41s/it]

training loss: 3.461094856262207


training:  93%|█████████▎| 10193/10986 [3:58:53<18:12,  1.38s/it]

training loss: 3.4411885738372803


training:  93%|█████████▎| 10194/10986 [3:58:55<18:00,  1.36s/it]

training loss: 3.4793758392333984


training:  93%|█████████▎| 10195/10986 [3:58:56<18:02,  1.37s/it]

training loss: 3.3642055988311768


training:  93%|█████████▎| 10196/10986 [3:58:57<17:53,  1.36s/it]

training loss: 3.367647409439087


training:  93%|█████████▎| 10197/10986 [3:58:59<17:43,  1.35s/it]

training loss: 3.348684787750244


training:  93%|█████████▎| 10198/10986 [3:59:00<17:33,  1.34s/it]

training loss: 3.388139486312866


training:  93%|█████████▎| 10199/10986 [3:59:01<17:31,  1.34s/it]

training loss: 3.376405715942383


training:  93%|█████████▎| 10200/10986 [3:59:03<17:34,  1.34s/it]

training loss: 3.408055305480957
valid loss: 3.406942844390869
perplexity: 30.172861099243164


training:  93%|█████████▎| 10201/10986 [3:59:06<23:24,  1.79s/it]

training loss: 3.2931618690490723


training:  93%|█████████▎| 10202/10986 [3:59:07<22:53,  1.75s/it]

training loss: 3.4156863689422607


training:  93%|█████████▎| 10203/10986 [3:59:09<21:17,  1.63s/it]

training loss: 3.405027151107788


training:  93%|█████████▎| 10204/10986 [3:59:10<20:00,  1.54s/it]

training loss: 3.3338241577148438


training:  93%|█████████▎| 10205/10986 [3:59:11<19:07,  1.47s/it]

training loss: 3.3539767265319824


training:  93%|█████████▎| 10206/10986 [3:59:13<18:35,  1.43s/it]

training loss: 3.328660488128662


training:  93%|█████████▎| 10207/10986 [3:59:14<18:07,  1.40s/it]

training loss: 3.3314030170440674


training:  93%|█████████▎| 10208/10986 [3:59:15<17:51,  1.38s/it]

training loss: 3.550571918487549


training:  93%|█████████▎| 10209/10986 [3:59:17<17:43,  1.37s/it]

training loss: 3.456590175628662


training:  93%|█████████▎| 10210/10986 [3:59:18<17:32,  1.36s/it]

training loss: 3.3504536151885986


training:  93%|█████████▎| 10211/10986 [3:59:20<18:41,  1.45s/it]

training loss: 3.3467886447906494


training:  93%|█████████▎| 10212/10986 [3:59:21<18:20,  1.42s/it]

training loss: 3.391366958618164


training:  93%|█████████▎| 10213/10986 [3:59:22<17:57,  1.39s/it]

training loss: 3.4174540042877197


training:  93%|█████████▎| 10214/10986 [3:59:24<17:39,  1.37s/it]

training loss: 3.364177942276001


training:  93%|█████████▎| 10215/10986 [3:59:25<17:30,  1.36s/it]

training loss: 3.4428155422210693


training:  93%|█████████▎| 10216/10986 [3:59:26<17:31,  1.37s/it]

training loss: 3.342679738998413


training:  93%|█████████▎| 10217/10986 [3:59:28<17:22,  1.36s/it]

training loss: 3.364769458770752


training:  93%|█████████▎| 10218/10986 [3:59:29<17:22,  1.36s/it]

training loss: 3.354443311691284


training:  93%|█████████▎| 10219/10986 [3:59:30<17:22,  1.36s/it]

training loss: 3.4838497638702393


training:  93%|█████████▎| 10220/10986 [3:59:32<17:21,  1.36s/it]

training loss: 3.3453545570373535
valid loss: 3.34460186958313
perplexity: 28.349287033081055


training:  93%|█████████▎| 10221/10986 [3:59:34<22:47,  1.79s/it]

training loss: 3.4000165462493896


training:  93%|█████████▎| 10222/10986 [3:59:36<21:19,  1.68s/it]

training loss: 3.3373188972473145


training:  93%|█████████▎| 10223/10986 [3:59:37<20:01,  1.57s/it]

training loss: 3.446786642074585


training:  93%|█████████▎| 10224/10986 [3:59:39<19:10,  1.51s/it]

training loss: 3.394482135772705


training:  93%|█████████▎| 10225/10986 [3:59:40<18:22,  1.45s/it]

training loss: 3.389892101287842


training:  93%|█████████▎| 10226/10986 [3:59:41<17:49,  1.41s/it]

training loss: 3.4253957271575928


training:  93%|█████████▎| 10227/10986 [3:59:43<17:28,  1.38s/it]

training loss: 3.2942020893096924


training:  93%|█████████▎| 10228/10986 [3:59:44<17:16,  1.37s/it]

training loss: 3.3677566051483154


training:  93%|█████████▎| 10229/10986 [3:59:45<17:08,  1.36s/it]

training loss: 3.4229021072387695


training:  93%|█████████▎| 10230/10986 [3:59:47<16:57,  1.35s/it]

training loss: 3.343458890914917


training:  93%|█████████▎| 10231/10986 [3:59:48<18:01,  1.43s/it]

training loss: 3.4005026817321777


training:  93%|█████████▎| 10232/10986 [3:59:49<17:41,  1.41s/it]

training loss: 3.3454232215881348


training:  93%|█████████▎| 10233/10986 [3:59:51<17:21,  1.38s/it]

training loss: 3.297941207885742


training:  93%|█████████▎| 10234/10986 [3:59:52<17:14,  1.38s/it]

training loss: 3.440601110458374


training:  93%|█████████▎| 10235/10986 [3:59:53<17:01,  1.36s/it]

training loss: 3.4272685050964355


training:  93%|█████████▎| 10236/10986 [3:59:55<16:47,  1.34s/it]

training loss: 3.3499162197113037


training:  93%|█████████▎| 10237/10986 [3:59:56<16:50,  1.35s/it]

training loss: 3.3049073219299316


training:  93%|█████████▎| 10238/10986 [3:59:58<16:50,  1.35s/it]

training loss: 3.4449052810668945


training:  93%|█████████▎| 10239/10986 [3:59:59<16:49,  1.35s/it]

training loss: 3.418558359146118


training:  93%|█████████▎| 10240/10986 [4:00:00<16:41,  1.34s/it]

training loss: 3.4490644931793213
valid loss: 3.4363222122192383
perplexity: 31.072467803955078


training:  93%|█████████▎| 10241/10986 [4:00:03<22:22,  1.80s/it]

training loss: 3.4217708110809326


training:  93%|█████████▎| 10242/10986 [4:00:05<20:58,  1.69s/it]

training loss: 3.4611220359802246


training:  93%|█████████▎| 10243/10986 [4:00:06<19:31,  1.58s/it]

training loss: 3.4340877532958984


training:  93%|█████████▎| 10244/10986 [4:00:07<18:28,  1.49s/it]

training loss: 3.399937152862549


training:  93%|█████████▎| 10245/10986 [4:00:08<17:45,  1.44s/it]

training loss: 3.442906379699707


training:  93%|█████████▎| 10246/10986 [4:00:10<17:23,  1.41s/it]

training loss: 3.2872750759124756


training:  93%|█████████▎| 10247/10986 [4:00:11<17:00,  1.38s/it]

training loss: 3.478475332260132


training:  93%|█████████▎| 10248/10986 [4:00:12<16:47,  1.36s/it]

training loss: 3.4554219245910645


training:  93%|█████████▎| 10249/10986 [4:00:14<16:35,  1.35s/it]

training loss: 3.353137493133545


training:  93%|█████████▎| 10250/10986 [4:00:15<16:29,  1.35s/it]

training loss: 3.380183696746826


training:  93%|█████████▎| 10251/10986 [4:00:17<18:26,  1.51s/it]

training loss: 3.4520998001098633


training:  93%|█████████▎| 10252/10986 [4:00:19<19:28,  1.59s/it]

training loss: 3.4106788635253906


training:  93%|█████████▎| 10253/10986 [4:00:20<19:35,  1.60s/it]

training loss: 3.5040907859802246


training:  93%|█████████▎| 10254/10986 [4:00:22<18:26,  1.51s/it]

training loss: 3.2968461513519287


training:  93%|█████████▎| 10255/10986 [4:00:23<17:47,  1.46s/it]

training loss: 3.393418788909912


training:  93%|█████████▎| 10256/10986 [4:00:24<17:22,  1.43s/it]

training loss: 3.413548707962036


training:  93%|█████████▎| 10257/10986 [4:00:26<16:52,  1.39s/it]

training loss: 3.357471227645874


training:  93%|█████████▎| 10258/10986 [4:00:27<16:44,  1.38s/it]

training loss: 3.273993492126465


training:  93%|█████████▎| 10259/10986 [4:00:28<16:27,  1.36s/it]

training loss: 3.351801872253418


training:  93%|█████████▎| 10260/10986 [4:00:30<16:23,  1.35s/it]

training loss: 3.254305124282837
valid loss: 3.255999803543091
perplexity: 25.94554328918457


training:  93%|█████████▎| 10261/10986 [4:00:33<22:00,  1.82s/it]

training loss: 3.2943713665008545


training:  93%|█████████▎| 10262/10986 [4:00:34<20:22,  1.69s/it]

training loss: 3.293882131576538


training:  93%|█████████▎| 10263/10986 [4:00:35<19:07,  1.59s/it]

training loss: 3.532987356185913


training:  93%|█████████▎| 10264/10986 [4:00:37<18:06,  1.50s/it]

training loss: 3.3651652336120605


training:  93%|█████████▎| 10265/10986 [4:00:38<17:29,  1.46s/it]

training loss: 3.2951347827911377


training:  93%|█████████▎| 10266/10986 [4:00:39<17:01,  1.42s/it]

training loss: 3.443329095840454


training:  93%|█████████▎| 10267/10986 [4:00:41<16:42,  1.39s/it]

training loss: 3.329042911529541


training:  93%|█████████▎| 10268/10986 [4:00:42<16:30,  1.38s/it]

training loss: 3.316521167755127


training:  93%|█████████▎| 10269/10986 [4:00:43<16:22,  1.37s/it]

training loss: 3.453871965408325


training:  93%|█████████▎| 10270/10986 [4:00:45<16:12,  1.36s/it]

training loss: 3.4123282432556152


training:  93%|█████████▎| 10271/10986 [4:00:46<17:09,  1.44s/it]

training loss: 3.3235952854156494


training:  94%|█████████▎| 10272/10986 [4:00:48<16:51,  1.42s/it]

training loss: 3.415022373199463


training:  94%|█████████▎| 10273/10986 [4:00:49<16:29,  1.39s/it]

training loss: 3.4126713275909424


training:  94%|█████████▎| 10274/10986 [4:00:50<16:18,  1.37s/it]

training loss: 3.409851551055908


training:  94%|█████████▎| 10275/10986 [4:00:52<16:01,  1.35s/it]

training loss: 3.3662500381469727


training:  94%|█████████▎| 10276/10986 [4:00:53<16:05,  1.36s/it]

training loss: 3.389031171798706


training:  94%|█████████▎| 10277/10986 [4:00:54<15:58,  1.35s/it]

training loss: 3.3728208541870117


training:  94%|█████████▎| 10278/10986 [4:00:56<15:51,  1.34s/it]

training loss: 3.4232308864593506


training:  94%|█████████▎| 10279/10986 [4:00:57<15:53,  1.35s/it]

training loss: 3.334721088409424


training:  94%|█████████▎| 10280/10986 [4:00:58<15:50,  1.35s/it]

training loss: 3.328965187072754
valid loss: 3.33005952835083
perplexity: 27.940006256103516


training:  94%|█████████▎| 10281/10986 [4:01:01<21:12,  1.81s/it]

training loss: 3.326270580291748


training:  94%|█████████▎| 10282/10986 [4:01:03<19:47,  1.69s/it]

training loss: 3.376302719116211


training:  94%|█████████▎| 10283/10986 [4:01:04<18:33,  1.58s/it]

training loss: 3.428316354751587


training:  94%|█████████▎| 10284/10986 [4:01:05<17:36,  1.51s/it]

training loss: 3.2725794315338135


training:  94%|█████████▎| 10285/10986 [4:01:07<16:58,  1.45s/it]

training loss: 3.4091124534606934


training:  94%|█████████▎| 10286/10986 [4:01:08<16:30,  1.42s/it]

training loss: 3.44584059715271


training:  94%|█████████▎| 10287/10986 [4:01:09<16:10,  1.39s/it]

training loss: 3.2921857833862305


training:  94%|█████████▎| 10288/10986 [4:01:11<15:52,  1.36s/it]

training loss: 3.3705031871795654


training:  94%|█████████▎| 10289/10986 [4:01:12<15:40,  1.35s/it]

training loss: 3.3186819553375244


training:  94%|█████████▎| 10290/10986 [4:01:13<15:44,  1.36s/it]

training loss: 3.4853971004486084


training:  94%|█████████▎| 10291/10986 [4:01:15<16:35,  1.43s/it]

training loss: 3.384079694747925


training:  94%|█████████▎| 10292/10986 [4:01:16<16:51,  1.46s/it]

training loss: 3.3668243885040283


training:  94%|█████████▎| 10293/10986 [4:01:18<16:27,  1.42s/it]

training loss: 3.2609219551086426


training:  94%|█████████▎| 10294/10986 [4:01:19<16:05,  1.40s/it]

training loss: 3.3936612606048584


training:  94%|█████████▎| 10295/10986 [4:01:20<15:48,  1.37s/it]

training loss: 3.4203929901123047


training:  94%|█████████▎| 10296/10986 [4:01:22<15:35,  1.36s/it]

training loss: 3.3640058040618896


training:  94%|█████████▎| 10297/10986 [4:01:23<15:27,  1.35s/it]

training loss: 3.2816410064697266


training:  94%|█████████▎| 10298/10986 [4:01:24<15:27,  1.35s/it]

training loss: 3.2892932891845703


training:  94%|█████████▎| 10299/10986 [4:01:26<15:20,  1.34s/it]

training loss: 3.419616937637329


training:  94%|█████████▍| 10300/10986 [4:01:27<15:18,  1.34s/it]

training loss: 3.300896406173706
valid loss: 3.300224542617798
perplexity: 27.118728637695312


training:  94%|█████████▍| 10301/10986 [4:01:30<20:35,  1.80s/it]

training loss: 3.45770263671875


training:  94%|█████████▍| 10302/10986 [4:01:32<19:57,  1.75s/it]

training loss: 3.4274327754974365


training:  94%|█████████▍| 10303/10986 [4:01:33<18:29,  1.62s/it]

training loss: 3.354684829711914


training:  94%|█████████▍| 10304/10986 [4:01:34<17:22,  1.53s/it]

training loss: 3.3490161895751953


training:  94%|█████████▍| 10305/10986 [4:01:36<16:39,  1.47s/it]

training loss: 3.2479476928710938


training:  94%|█████████▍| 10306/10986 [4:01:37<16:08,  1.42s/it]

training loss: 3.293017864227295


training:  94%|█████████▍| 10307/10986 [4:01:38<15:59,  1.41s/it]

training loss: 3.4010369777679443


training:  94%|█████████▍| 10308/10986 [4:01:40<15:38,  1.38s/it]

training loss: 3.2567503452301025


training:  94%|█████████▍| 10309/10986 [4:01:41<15:28,  1.37s/it]

training loss: 3.3478786945343018


training:  94%|█████████▍| 10310/10986 [4:01:42<15:14,  1.35s/it]

training loss: 3.39530873298645


training:  94%|█████████▍| 10311/10986 [4:01:44<16:15,  1.44s/it]

training loss: 3.333153486251831


training:  94%|█████████▍| 10312/10986 [4:01:45<15:57,  1.42s/it]

training loss: 3.4466726779937744


training:  94%|█████████▍| 10313/10986 [4:01:47<15:36,  1.39s/it]

training loss: 3.3919308185577393


training:  94%|█████████▍| 10314/10986 [4:01:48<15:21,  1.37s/it]

training loss: 3.4628241062164307


training:  94%|█████████▍| 10315/10986 [4:01:49<15:14,  1.36s/it]

training loss: 3.3611700534820557


training:  94%|█████████▍| 10316/10986 [4:01:51<15:03,  1.35s/it]

training loss: 3.2923777103424072


training:  94%|█████████▍| 10317/10986 [4:01:52<14:55,  1.34s/it]

training loss: 3.298349380493164


training:  94%|█████████▍| 10318/10986 [4:01:53<14:53,  1.34s/it]

training loss: 3.3755650520324707


training:  94%|█████████▍| 10319/10986 [4:01:55<14:52,  1.34s/it]

training loss: 3.3236331939697266


training:  94%|█████████▍| 10320/10986 [4:01:56<14:47,  1.33s/it]

training loss: 3.3980727195739746
valid loss: 3.3974976539611816
perplexity: 29.88921356201172


training:  94%|█████████▍| 10321/10986 [4:01:59<19:56,  1.80s/it]

training loss: 3.416337490081787


training:  94%|█████████▍| 10322/10986 [4:02:00<18:35,  1.68s/it]

training loss: 3.2793543338775635


training:  94%|█████████▍| 10323/10986 [4:02:01<17:24,  1.58s/it]

training loss: 3.5095906257629395


training:  94%|█████████▍| 10324/10986 [4:02:03<16:34,  1.50s/it]

training loss: 3.4531311988830566


training:  94%|█████████▍| 10325/10986 [4:02:04<15:58,  1.45s/it]

training loss: 3.3506786823272705


training:  94%|█████████▍| 10326/10986 [4:02:05<15:33,  1.41s/it]

training loss: 3.446749448776245


training:  94%|█████████▍| 10327/10986 [4:02:07<15:16,  1.39s/it]

training loss: 3.4373037815093994


training:  94%|█████████▍| 10328/10986 [4:02:08<14:59,  1.37s/it]

training loss: 3.4633781909942627


training:  94%|█████████▍| 10329/10986 [4:02:09<14:52,  1.36s/it]

training loss: 3.4657270908355713


training:  94%|█████████▍| 10330/10986 [4:02:11<14:42,  1.35s/it]

training loss: 3.334449052810669


training:  94%|█████████▍| 10331/10986 [4:02:12<15:31,  1.42s/it]

training loss: 3.357745885848999


training:  94%|█████████▍| 10332/10986 [4:02:14<15:12,  1.40s/it]

training loss: 3.383789300918579


training:  94%|█████████▍| 10333/10986 [4:02:15<14:51,  1.36s/it]

training loss: 3.426258087158203


training:  94%|█████████▍| 10334/10986 [4:02:16<14:38,  1.35s/it]

training loss: 3.514584541320801


training:  94%|█████████▍| 10335/10986 [4:02:18<14:32,  1.34s/it]

training loss: 3.481440305709839


training:  94%|█████████▍| 10336/10986 [4:02:19<14:29,  1.34s/it]

training loss: 3.389559745788574


training:  94%|█████████▍| 10337/10986 [4:02:20<14:22,  1.33s/it]

training loss: 3.3139076232910156


training:  94%|█████████▍| 10338/10986 [4:02:22<14:19,  1.33s/it]

training loss: 3.3132877349853516


training:  94%|█████████▍| 10339/10986 [4:02:23<14:17,  1.33s/it]

training loss: 3.5282578468322754


training:  94%|█████████▍| 10340/10986 [4:02:24<14:18,  1.33s/it]

training loss: 3.286219835281372
valid loss: 3.2902462482452393
perplexity: 26.84947395324707


training:  94%|█████████▍| 10341/10986 [4:02:27<19:07,  1.78s/it]

training loss: 3.38141131401062


training:  94%|█████████▍| 10342/10986 [4:02:28<17:54,  1.67s/it]

training loss: 3.377880573272705


training:  94%|█████████▍| 10343/10986 [4:02:30<16:50,  1.57s/it]

training loss: 3.3893415927886963


training:  94%|█████████▍| 10344/10986 [4:02:31<16:01,  1.50s/it]

training loss: 3.347736358642578


training:  94%|█████████▍| 10345/10986 [4:02:32<15:28,  1.45s/it]

training loss: 3.327327013015747


training:  94%|█████████▍| 10346/10986 [4:02:34<15:00,  1.41s/it]

training loss: 3.331101179122925


training:  94%|█████████▍| 10347/10986 [4:02:35<14:42,  1.38s/it]

training loss: 3.3536694049835205


training:  94%|█████████▍| 10348/10986 [4:02:36<14:26,  1.36s/it]

training loss: 3.2584450244903564


training:  94%|█████████▍| 10349/10986 [4:02:38<14:14,  1.34s/it]

training loss: 3.3365797996520996


training:  94%|█████████▍| 10350/10986 [4:02:39<14:11,  1.34s/it]

training loss: 3.4557080268859863


training:  94%|█████████▍| 10351/10986 [4:02:41<15:00,  1.42s/it]

training loss: 3.466733455657959


training:  94%|█████████▍| 10352/10986 [4:02:42<15:20,  1.45s/it]

training loss: 3.391606569290161


training:  94%|█████████▍| 10353/10986 [4:02:43<14:53,  1.41s/it]

training loss: 3.3954520225524902


training:  94%|█████████▍| 10354/10986 [4:02:45<14:30,  1.38s/it]

training loss: 3.41166615486145


training:  94%|█████████▍| 10355/10986 [4:02:46<14:12,  1.35s/it]

training loss: 3.3888537883758545


training:  94%|█████████▍| 10356/10986 [4:02:47<14:02,  1.34s/it]

training loss: 3.3533718585968018


training:  94%|█████████▍| 10357/10986 [4:02:49<13:58,  1.33s/it]

training loss: 3.4546806812286377


training:  94%|█████████▍| 10358/10986 [4:02:50<13:48,  1.32s/it]

training loss: 3.302699565887451


training:  94%|█████████▍| 10359/10986 [4:02:51<13:47,  1.32s/it]

training loss: 3.3120572566986084


training:  94%|█████████▍| 10360/10986 [4:02:53<13:39,  1.31s/it]

training loss: 3.4846575260162354
valid loss: 3.487267017364502
perplexity: 32.69646453857422


training:  94%|█████████▍| 10361/10986 [4:02:55<18:24,  1.77s/it]

training loss: 3.4679408073425293


training:  94%|█████████▍| 10362/10986 [4:02:57<17:59,  1.73s/it]

training loss: 3.3116350173950195


training:  94%|█████████▍| 10363/10986 [4:02:58<16:44,  1.61s/it]

training loss: 3.3819594383239746


training:  94%|█████████▍| 10364/10986 [4:03:00<15:52,  1.53s/it]

training loss: 3.367398500442505


training:  94%|█████████▍| 10365/10986 [4:03:01<15:09,  1.46s/it]

training loss: 3.4233531951904297


training:  94%|█████████▍| 10366/10986 [4:03:02<14:42,  1.42s/it]

training loss: 3.4632959365844727


training:  94%|█████████▍| 10367/10986 [4:03:04<14:20,  1.39s/it]

training loss: 3.29915452003479


training:  94%|█████████▍| 10368/10986 [4:03:05<14:03,  1.36s/it]

training loss: 3.5679426193237305


training:  94%|█████████▍| 10369/10986 [4:03:06<13:54,  1.35s/it]

training loss: 3.4632129669189453


training:  94%|█████████▍| 10370/10986 [4:03:08<13:46,  1.34s/it]

training loss: 3.389575958251953


training:  94%|█████████▍| 10371/10986 [4:03:09<14:37,  1.43s/it]

training loss: 3.5121889114379883


training:  94%|█████████▍| 10372/10986 [4:03:11<14:28,  1.41s/it]

training loss: 3.336820602416992


training:  94%|█████████▍| 10373/10986 [4:03:12<14:07,  1.38s/it]

training loss: 3.5291662216186523


training:  94%|█████████▍| 10374/10986 [4:03:13<13:53,  1.36s/it]

training loss: 3.348395824432373


training:  94%|█████████▍| 10375/10986 [4:03:15<13:45,  1.35s/it]

training loss: 3.308894395828247


training:  94%|█████████▍| 10376/10986 [4:03:16<13:37,  1.34s/it]

training loss: 3.3110270500183105


training:  94%|█████████▍| 10377/10986 [4:03:17<13:33,  1.34s/it]

training loss: 3.4464309215545654


training:  94%|█████████▍| 10378/10986 [4:03:19<13:39,  1.35s/it]

training loss: 3.4327869415283203


training:  94%|█████████▍| 10379/10986 [4:03:20<13:32,  1.34s/it]

training loss: 3.3579342365264893


training:  94%|█████████▍| 10380/10986 [4:03:21<13:31,  1.34s/it]

training loss: 3.416316270828247
valid loss: 3.414257526397705
perplexity: 30.394372940063477


training:  94%|█████████▍| 10381/10986 [4:03:24<18:02,  1.79s/it]

training loss: 3.3886935710906982


training:  95%|█████████▍| 10382/10986 [4:03:26<17:49,  1.77s/it]

training loss: 3.4672865867614746


training:  95%|█████████▍| 10383/10986 [4:03:27<16:25,  1.63s/it]

training loss: 3.4083380699157715


training:  95%|█████████▍| 10384/10986 [4:03:29<15:31,  1.55s/it]

training loss: 3.3543639183044434


training:  95%|█████████▍| 10385/10986 [4:03:30<14:49,  1.48s/it]

training loss: 3.4731900691986084


training:  95%|█████████▍| 10386/10986 [4:03:31<14:22,  1.44s/it]

training loss: 3.411219358444214


training:  95%|█████████▍| 10387/10986 [4:03:33<14:08,  1.42s/it]

training loss: 3.424020290374756


training:  95%|█████████▍| 10388/10986 [4:03:34<13:52,  1.39s/it]

training loss: 3.3691844940185547


training:  95%|█████████▍| 10389/10986 [4:03:35<13:40,  1.37s/it]

training loss: 3.427090644836426


training:  95%|█████████▍| 10390/10986 [4:03:37<13:32,  1.36s/it]

training loss: 3.454982280731201


training:  95%|█████████▍| 10391/10986 [4:03:38<14:24,  1.45s/it]

training loss: 3.4683175086975098


training:  95%|█████████▍| 10392/10986 [4:03:40<14:05,  1.42s/it]

training loss: 3.3696391582489014


training:  95%|█████████▍| 10393/10986 [4:03:41<13:47,  1.39s/it]

training loss: 3.364922523498535


training:  95%|█████████▍| 10394/10986 [4:03:42<13:35,  1.38s/it]

training loss: 3.3092596530914307


training:  95%|█████████▍| 10395/10986 [4:03:44<13:28,  1.37s/it]

training loss: 3.490726947784424


training:  95%|█████████▍| 10396/10986 [4:03:45<13:20,  1.36s/it]

training loss: 3.396650552749634


training:  95%|█████████▍| 10397/10986 [4:03:46<13:16,  1.35s/it]

training loss: 3.439307689666748


training:  95%|█████████▍| 10398/10986 [4:03:48<13:10,  1.34s/it]

training loss: 3.393559217453003


training:  95%|█████████▍| 10399/10986 [4:03:49<13:08,  1.34s/it]

training loss: 3.425671100616455


training:  95%|█████████▍| 10400/10986 [4:03:50<13:01,  1.33s/it]

training loss: 3.364088773727417
valid loss: 3.3626413345336914
perplexity: 28.865333557128906


training:  95%|█████████▍| 10401/10986 [4:03:53<17:29,  1.79s/it]

training loss: 3.3275747299194336


training:  95%|█████████▍| 10402/10986 [4:03:55<17:02,  1.75s/it]

training loss: 3.4177751541137695


training:  95%|█████████▍| 10403/10986 [4:03:56<15:53,  1.63s/it]

training loss: 3.2404732704162598


training:  95%|█████████▍| 10404/10986 [4:03:57<14:59,  1.55s/it]

training loss: 3.3372671604156494


training:  95%|█████████▍| 10405/10986 [4:03:59<14:26,  1.49s/it]

training loss: 3.370912551879883


training:  95%|█████████▍| 10406/10986 [4:04:00<14:01,  1.45s/it]

training loss: 3.32177472114563


training:  95%|█████████▍| 10407/10986 [4:04:01<13:36,  1.41s/it]

training loss: 3.39609694480896


training:  95%|█████████▍| 10408/10986 [4:04:03<13:22,  1.39s/it]

training loss: 3.306954860687256


training:  95%|█████████▍| 10409/10986 [4:04:04<13:10,  1.37s/it]

training loss: 3.4424679279327393


training:  95%|█████████▍| 10410/10986 [4:04:05<13:02,  1.36s/it]

training loss: 3.3314244747161865


training:  95%|█████████▍| 10411/10986 [4:04:07<13:47,  1.44s/it]

training loss: 3.304072856903076


training:  95%|█████████▍| 10412/10986 [4:04:09<13:45,  1.44s/it]

training loss: 3.3702170848846436


training:  95%|█████████▍| 10413/10986 [4:04:10<13:26,  1.41s/it]

training loss: 3.545452117919922


training:  95%|█████████▍| 10414/10986 [4:04:11<13:08,  1.38s/it]

training loss: 3.37080454826355


training:  95%|█████████▍| 10415/10986 [4:04:13<12:57,  1.36s/it]

training loss: 3.4416325092315674


training:  95%|█████████▍| 10416/10986 [4:04:14<12:51,  1.35s/it]

training loss: 3.519709587097168


training:  95%|█████████▍| 10417/10986 [4:04:15<12:48,  1.35s/it]

training loss: 3.410079002380371


training:  95%|█████████▍| 10418/10986 [4:04:17<12:45,  1.35s/it]

training loss: 3.3597981929779053


training:  95%|█████████▍| 10419/10986 [4:04:18<12:39,  1.34s/it]

training loss: 3.4529731273651123


training:  95%|█████████▍| 10420/10986 [4:04:19<12:36,  1.34s/it]

training loss: 3.4116530418395996
valid loss: 3.406179428100586
perplexity: 30.14983558654785


training:  95%|█████████▍| 10421/10986 [4:04:22<16:50,  1.79s/it]

training loss: 3.3776750564575195


training:  95%|█████████▍| 10422/10986 [4:04:23<15:46,  1.68s/it]

training loss: 3.3284659385681152


training:  95%|█████████▍| 10423/10986 [4:04:25<14:49,  1.58s/it]

training loss: 3.4653384685516357


training:  95%|█████████▍| 10424/10986 [4:04:26<14:07,  1.51s/it]

training loss: 3.400909900665283


training:  95%|█████████▍| 10425/10986 [4:04:27<13:36,  1.46s/it]

training loss: 3.351249933242798


training:  95%|█████████▍| 10426/10986 [4:04:29<13:16,  1.42s/it]

training loss: 3.374183416366577


training:  95%|█████████▍| 10427/10986 [4:04:30<12:58,  1.39s/it]

training loss: 3.443671226501465


training:  95%|█████████▍| 10428/10986 [4:04:31<12:45,  1.37s/it]

training loss: 3.4702765941619873


training:  95%|█████████▍| 10429/10986 [4:04:33<12:34,  1.36s/it]

training loss: 3.4847068786621094


training:  95%|█████████▍| 10430/10986 [4:04:34<12:34,  1.36s/it]

training loss: 3.351043701171875


training:  95%|█████████▍| 10431/10986 [4:04:36<13:23,  1.45s/it]

training loss: 3.421515703201294


training:  95%|█████████▍| 10432/10986 [4:04:37<13:02,  1.41s/it]

training loss: 3.3717586994171143


training:  95%|█████████▍| 10433/10986 [4:04:38<12:43,  1.38s/it]

training loss: 3.3487019538879395


training:  95%|█████████▍| 10434/10986 [4:04:40<12:32,  1.36s/it]

training loss: 3.2653379440307617


training:  95%|█████████▍| 10435/10986 [4:04:41<12:30,  1.36s/it]

training loss: 3.489835739135742


training:  95%|█████████▍| 10436/10986 [4:04:42<12:23,  1.35s/it]

training loss: 3.419555902481079


training:  95%|█████████▌| 10437/10986 [4:04:44<12:19,  1.35s/it]

training loss: 3.4391822814941406


training:  95%|█████████▌| 10438/10986 [4:04:45<12:18,  1.35s/it]

training loss: 3.367286205291748


training:  95%|█████████▌| 10439/10986 [4:04:46<12:14,  1.34s/it]

training loss: 3.420492172241211


training:  95%|█████████▌| 10440/10986 [4:04:48<12:11,  1.34s/it]

training loss: 3.549062490463257
valid loss: 3.54742431640625
perplexity: 34.7237663269043


training:  95%|█████████▌| 10441/10986 [4:04:51<16:19,  1.80s/it]

training loss: 3.288133144378662


training:  95%|█████████▌| 10442/10986 [4:04:53<16:27,  1.81s/it]

training loss: 3.4117610454559326


training:  95%|█████████▌| 10443/10986 [4:04:54<16:14,  1.80s/it]

training loss: 3.4694416522979736


training:  95%|█████████▌| 10444/10986 [4:04:56<15:29,  1.71s/it]

training loss: 3.4509353637695312


training:  95%|█████████▌| 10445/10986 [4:04:57<14:26,  1.60s/it]

training loss: 3.3680310249328613


training:  95%|█████████▌| 10446/10986 [4:04:58<13:36,  1.51s/it]

training loss: 3.3310933113098145


training:  95%|█████████▌| 10447/10986 [4:05:00<13:13,  1.47s/it]

training loss: 3.441326141357422


training:  95%|█████████▌| 10448/10986 [4:05:01<12:52,  1.44s/it]

training loss: 3.327343225479126


training:  95%|█████████▌| 10449/10986 [4:05:02<12:35,  1.41s/it]

training loss: 3.3892197608947754


training:  95%|█████████▌| 10450/10986 [4:05:04<12:21,  1.38s/it]

training loss: 3.404414176940918


training:  95%|█████████▌| 10451/10986 [4:05:05<12:58,  1.46s/it]

training loss: 3.39119815826416


training:  95%|█████████▌| 10452/10986 [4:05:07<12:34,  1.41s/it]

training loss: 3.460216760635376


training:  95%|█████████▌| 10453/10986 [4:05:08<12:21,  1.39s/it]

training loss: 3.2958085536956787


training:  95%|█████████▌| 10454/10986 [4:05:09<12:09,  1.37s/it]

training loss: 3.4690606594085693


training:  95%|█████████▌| 10455/10986 [4:05:11<12:03,  1.36s/it]

training loss: 3.3928303718566895


training:  95%|█████████▌| 10456/10986 [4:05:12<11:59,  1.36s/it]

training loss: 3.4854187965393066


training:  95%|█████████▌| 10457/10986 [4:05:13<11:53,  1.35s/it]

training loss: 3.374713659286499


training:  95%|█████████▌| 10458/10986 [4:05:15<11:50,  1.35s/it]

training loss: 3.4468801021575928


training:  95%|█████████▌| 10459/10986 [4:05:16<11:44,  1.34s/it]

training loss: 3.4584341049194336


training:  95%|█████████▌| 10460/10986 [4:05:17<11:48,  1.35s/it]

training loss: 3.381746768951416
valid loss: 3.3763513565063477
perplexity: 29.263803482055664


training:  95%|█████████▌| 10461/10986 [4:05:20<15:44,  1.80s/it]

training loss: 3.5432374477386475


training:  95%|█████████▌| 10462/10986 [4:05:22<15:24,  1.76s/it]

training loss: 3.409538745880127


training:  95%|█████████▌| 10463/10986 [4:05:23<14:20,  1.65s/it]

training loss: 3.453068733215332


training:  95%|█████████▌| 10464/10986 [4:05:25<13:28,  1.55s/it]

training loss: 3.3718550205230713


training:  95%|█████████▌| 10465/10986 [4:05:26<12:52,  1.48s/it]

training loss: 3.4468870162963867


training:  95%|█████████▌| 10466/10986 [4:05:27<12:27,  1.44s/it]

training loss: 3.444685935974121


training:  95%|█████████▌| 10467/10986 [4:05:29<12:11,  1.41s/it]

training loss: 3.398428440093994


training:  95%|█████████▌| 10468/10986 [4:05:30<12:05,  1.40s/it]

training loss: 3.48663330078125


training:  95%|█████████▌| 10469/10986 [4:05:31<11:55,  1.38s/it]

training loss: 3.3925623893737793


training:  95%|█████████▌| 10470/10986 [4:05:33<11:46,  1.37s/it]

training loss: 3.3636348247528076


training:  95%|█████████▌| 10471/10986 [4:05:34<12:28,  1.45s/it]

training loss: 3.450737953186035


training:  95%|█████████▌| 10472/10986 [4:05:36<12:06,  1.41s/it]

training loss: 3.4518468379974365


training:  95%|█████████▌| 10473/10986 [4:05:37<11:50,  1.39s/it]

training loss: 3.4492428302764893


training:  95%|█████████▌| 10474/10986 [4:05:38<11:43,  1.37s/it]

training loss: 3.372864007949829


training:  95%|█████████▌| 10475/10986 [4:05:40<11:36,  1.36s/it]

training loss: 3.366144895553589


training:  95%|█████████▌| 10476/10986 [4:05:41<11:30,  1.35s/it]

training loss: 3.306471347808838


training:  95%|█████████▌| 10477/10986 [4:05:42<11:27,  1.35s/it]

training loss: 3.426572799682617


training:  95%|█████████▌| 10478/10986 [4:05:44<11:24,  1.35s/it]

training loss: 3.486173152923584


training:  95%|█████████▌| 10479/10986 [4:05:45<11:22,  1.35s/it]

training loss: 3.422111988067627


training:  95%|█████████▌| 10480/10986 [4:05:46<11:17,  1.34s/it]

training loss: 3.5551700592041016
valid loss: 3.5460848808288574
perplexity: 34.677284240722656


training:  95%|█████████▌| 10481/10986 [4:05:49<15:03,  1.79s/it]

training loss: 3.4085025787353516


training:  95%|█████████▌| 10482/10986 [4:05:51<14:05,  1.68s/it]

training loss: 3.4891762733459473


training:  95%|█████████▌| 10483/10986 [4:05:52<13:11,  1.57s/it]

training loss: 3.328120231628418


training:  95%|█████████▌| 10484/10986 [4:05:53<12:37,  1.51s/it]

training loss: 3.46824312210083


training:  95%|█████████▌| 10485/10986 [4:05:55<12:11,  1.46s/it]

training loss: 3.3043978214263916


training:  95%|█████████▌| 10486/10986 [4:05:56<11:47,  1.42s/it]

training loss: 3.362010955810547


training:  95%|█████████▌| 10487/10986 [4:05:57<11:36,  1.40s/it]

training loss: 3.3998970985412598


training:  95%|█████████▌| 10488/10986 [4:05:59<11:24,  1.37s/it]

training loss: 3.3406758308410645


training:  95%|█████████▌| 10489/10986 [4:06:00<11:20,  1.37s/it]

training loss: 3.3174800872802734


training:  95%|█████████▌| 10490/10986 [4:06:01<11:22,  1.38s/it]

training loss: 3.3869781494140625


training:  95%|█████████▌| 10491/10986 [4:06:03<11:58,  1.45s/it]

training loss: 3.384084701538086


training:  96%|█████████▌| 10492/10986 [4:06:04<11:51,  1.44s/it]

training loss: 3.45420503616333


training:  96%|█████████▌| 10493/10986 [4:06:06<11:33,  1.41s/it]

training loss: 3.2392120361328125


training:  96%|█████████▌| 10494/10986 [4:06:07<11:18,  1.38s/it]

training loss: 3.274358034133911


training:  96%|█████████▌| 10495/10986 [4:06:08<11:10,  1.36s/it]

training loss: 3.409459114074707


training:  96%|█████████▌| 10496/10986 [4:06:10<10:57,  1.34s/it]

training loss: 3.3206324577331543


training:  96%|█████████▌| 10497/10986 [4:06:11<10:57,  1.34s/it]

training loss: 3.460737705230713


training:  96%|█████████▌| 10498/10986 [4:06:12<10:51,  1.34s/it]

training loss: 3.3704655170440674


training:  96%|█████████▌| 10499/10986 [4:06:14<10:48,  1.33s/it]

training loss: 3.414714813232422


training:  96%|█████████▌| 10500/10986 [4:06:15<10:44,  1.33s/it]

training loss: 3.387777328491211
valid loss: 3.3886499404907227
perplexity: 29.62592887878418


training:  96%|█████████▌| 10501/10986 [4:06:18<14:17,  1.77s/it]

training loss: 3.345550537109375


training:  96%|█████████▌| 10502/10986 [4:06:19<13:24,  1.66s/it]

training loss: 3.4167091846466064


training:  96%|█████████▌| 10503/10986 [4:06:21<12:30,  1.55s/it]

training loss: 3.3751676082611084


training:  96%|█████████▌| 10504/10986 [4:06:22<11:58,  1.49s/it]

training loss: 3.3970916271209717


training:  96%|█████████▌| 10505/10986 [4:06:23<11:31,  1.44s/it]

training loss: 3.3132107257843018


training:  96%|█████████▌| 10506/10986 [4:06:25<11:13,  1.40s/it]

training loss: 3.3955559730529785


training:  96%|█████████▌| 10507/10986 [4:06:26<11:01,  1.38s/it]

training loss: 3.298849582672119


training:  96%|█████████▌| 10508/10986 [4:06:27<10:51,  1.36s/it]

training loss: 3.4833579063415527


training:  96%|█████████▌| 10509/10986 [4:06:29<10:43,  1.35s/it]

training loss: 3.417577028274536


training:  96%|█████████▌| 10510/10986 [4:06:30<10:39,  1.34s/it]

training loss: 3.4468157291412354


training:  96%|█████████▌| 10511/10986 [4:06:31<11:18,  1.43s/it]

training loss: 3.334789276123047


training:  96%|█████████▌| 10512/10986 [4:06:33<11:38,  1.47s/it]

training loss: 3.432269811630249


training:  96%|█████████▌| 10513/10986 [4:06:34<11:14,  1.43s/it]

training loss: 3.3812735080718994


training:  96%|█████████▌| 10514/10986 [4:06:36<10:58,  1.40s/it]

training loss: 3.4258015155792236


training:  96%|█████████▌| 10515/10986 [4:06:37<10:44,  1.37s/it]

training loss: 3.3542821407318115


training:  96%|█████████▌| 10516/10986 [4:06:38<10:44,  1.37s/it]

training loss: 3.449481248855591


training:  96%|█████████▌| 10517/10986 [4:06:40<10:34,  1.35s/it]

training loss: 3.3901991844177246


training:  96%|█████████▌| 10518/10986 [4:06:41<10:27,  1.34s/it]

training loss: 3.2978856563568115


training:  96%|█████████▌| 10519/10986 [4:06:42<10:23,  1.33s/it]

training loss: 3.5116162300109863


training:  96%|█████████▌| 10520/10986 [4:06:44<10:21,  1.33s/it]

training loss: 3.346250057220459
valid loss: 3.34552001953125
perplexity: 28.375328063964844


training:  96%|█████████▌| 10521/10986 [4:06:46<13:46,  1.78s/it]

training loss: 3.329709768295288


training:  96%|█████████▌| 10522/10986 [4:06:48<12:54,  1.67s/it]

training loss: 3.2713122367858887


training:  96%|█████████▌| 10523/10986 [4:06:49<12:02,  1.56s/it]

training loss: 3.264181613922119


training:  96%|█████████▌| 10524/10986 [4:06:51<11:29,  1.49s/it]

training loss: 3.3956334590911865


training:  96%|█████████▌| 10525/10986 [4:06:52<11:00,  1.43s/it]

training loss: 3.3455514907836914


training:  96%|█████████▌| 10526/10986 [4:06:53<10:42,  1.40s/it]

training loss: 3.368068218231201


training:  96%|█████████▌| 10527/10986 [4:06:54<10:29,  1.37s/it]

training loss: 3.4850995540618896


training:  96%|█████████▌| 10528/10986 [4:06:56<10:20,  1.35s/it]

training loss: 3.2913503646850586


training:  96%|█████████▌| 10529/10986 [4:06:57<10:12,  1.34s/it]

training loss: 3.430169105529785


training:  96%|█████████▌| 10530/10986 [4:06:58<10:07,  1.33s/it]

training loss: 3.5120062828063965


training:  96%|█████████▌| 10531/10986 [4:07:00<10:49,  1.43s/it]

training loss: 3.4282443523406982


training:  96%|█████████▌| 10532/10986 [4:07:01<10:33,  1.40s/it]

training loss: 3.4611079692840576


training:  96%|█████████▌| 10533/10986 [4:07:03<10:21,  1.37s/it]

training loss: 3.4100277423858643


training:  96%|█████████▌| 10534/10986 [4:07:04<10:22,  1.38s/it]

training loss: 3.318812370300293


training:  96%|█████████▌| 10535/10986 [4:07:05<10:14,  1.36s/it]

training loss: 3.4038500785827637


training:  96%|█████████▌| 10536/10986 [4:07:07<10:08,  1.35s/it]

training loss: 3.411686897277832


training:  96%|█████████▌| 10537/10986 [4:07:08<10:01,  1.34s/it]

training loss: 3.3074982166290283


training:  96%|█████████▌| 10538/10986 [4:07:09<09:55,  1.33s/it]

training loss: 3.351269245147705


training:  96%|█████████▌| 10539/10986 [4:07:11<09:54,  1.33s/it]

training loss: 3.378014087677002


training:  96%|█████████▌| 10540/10986 [4:07:12<09:53,  1.33s/it]

training loss: 3.4559338092803955
valid loss: 3.443950891494751
perplexity: 31.31041717529297


training:  96%|█████████▌| 10541/10986 [4:07:15<13:13,  1.78s/it]

training loss: 3.2647480964660645


training:  96%|█████████▌| 10542/10986 [4:07:16<12:16,  1.66s/it]

training loss: 3.2710647583007812


training:  96%|█████████▌| 10543/10986 [4:07:18<11:31,  1.56s/it]

training loss: 3.419217109680176


training:  96%|█████████▌| 10544/10986 [4:07:19<10:55,  1.48s/it]

training loss: 3.4360909461975098


training:  96%|█████████▌| 10545/10986 [4:07:20<10:29,  1.43s/it]

training loss: 3.485520124435425


training:  96%|█████████▌| 10546/10986 [4:07:21<10:16,  1.40s/it]

training loss: 3.213879346847534


training:  96%|█████████▌| 10547/10986 [4:07:23<10:01,  1.37s/it]

training loss: 3.455347776412964


training:  96%|█████████▌| 10548/10986 [4:07:24<09:55,  1.36s/it]

training loss: 3.418475389480591


training:  96%|█████████▌| 10549/10986 [4:07:25<09:50,  1.35s/it]

training loss: 3.3780529499053955


training:  96%|█████████▌| 10550/10986 [4:07:27<09:44,  1.34s/it]

training loss: 3.3047854900360107


training:  96%|█████████▌| 10551/10986 [4:07:28<10:20,  1.43s/it]

training loss: 3.446497678756714


training:  96%|█████████▌| 10552/10986 [4:07:30<10:10,  1.41s/it]

training loss: 3.4094953536987305


training:  96%|█████████▌| 10553/10986 [4:07:31<09:59,  1.39s/it]

training loss: 3.338914394378662


training:  96%|█████████▌| 10554/10986 [4:07:32<09:52,  1.37s/it]

training loss: 3.342768430709839


training:  96%|█████████▌| 10555/10986 [4:07:34<09:44,  1.36s/it]

training loss: 3.375053882598877


training:  96%|█████████▌| 10556/10986 [4:07:35<09:39,  1.35s/it]

training loss: 3.3713417053222656


training:  96%|█████████▌| 10557/10986 [4:07:36<09:35,  1.34s/it]

training loss: 3.3702316284179688


training:  96%|█████████▌| 10558/10986 [4:07:38<09:27,  1.33s/it]

training loss: 3.2998220920562744


training:  96%|█████████▌| 10559/10986 [4:07:39<09:28,  1.33s/it]

training loss: 3.3965086936950684


training:  96%|█████████▌| 10560/10986 [4:07:40<09:23,  1.32s/it]

training loss: 3.3736560344696045
valid loss: 3.3673930168151855
perplexity: 29.002817153930664


training:  96%|█████████▌| 10561/10986 [4:07:43<12:39,  1.79s/it]

training loss: 3.3357765674591064


training:  96%|█████████▌| 10562/10986 [4:07:45<11:48,  1.67s/it]

training loss: 3.3417863845825195


training:  96%|█████████▌| 10563/10986 [4:07:46<11:06,  1.58s/it]

training loss: 3.3812270164489746


training:  96%|█████████▌| 10564/10986 [4:07:47<10:38,  1.51s/it]

training loss: 3.307302474975586


training:  96%|█████████▌| 10565/10986 [4:07:49<10:15,  1.46s/it]

training loss: 3.3444032669067383


training:  96%|█████████▌| 10566/10986 [4:07:50<09:57,  1.42s/it]

training loss: 3.3713955879211426


training:  96%|█████████▌| 10567/10986 [4:07:51<09:42,  1.39s/it]

training loss: 3.3912861347198486


training:  96%|█████████▌| 10568/10986 [4:07:53<09:31,  1.37s/it]

training loss: 3.381648063659668


training:  96%|█████████▌| 10569/10986 [4:07:54<09:28,  1.36s/it]

training loss: 3.472153425216675


training:  96%|█████████▌| 10570/10986 [4:07:55<09:24,  1.36s/it]

training loss: 3.2671051025390625


training:  96%|█████████▌| 10571/10986 [4:07:57<09:56,  1.44s/it]

training loss: 3.3680002689361572


training:  96%|█████████▌| 10572/10986 [4:07:58<09:41,  1.40s/it]

training loss: 3.279797315597534


training:  96%|█████████▌| 10573/10986 [4:08:00<09:32,  1.39s/it]

training loss: 3.2615199089050293


training:  96%|█████████▌| 10574/10986 [4:08:01<09:23,  1.37s/it]

training loss: 3.333301544189453


training:  96%|█████████▋| 10575/10986 [4:08:02<09:15,  1.35s/it]

training loss: 3.414885997772217


training:  96%|█████████▋| 10576/10986 [4:08:04<09:08,  1.34s/it]

training loss: 3.481367588043213


training:  96%|█████████▋| 10577/10986 [4:08:05<09:10,  1.35s/it]

training loss: 3.4264400005340576


training:  96%|█████████▋| 10578/10986 [4:08:06<09:04,  1.33s/it]

training loss: 3.483381986618042


training:  96%|█████████▋| 10579/10986 [4:08:08<09:01,  1.33s/it]

training loss: 3.497875928878784


training:  96%|█████████▋| 10580/10986 [4:08:09<08:56,  1.32s/it]

training loss: 3.406266927719116
valid loss: 3.399484634399414
perplexity: 29.948659896850586


training:  96%|█████████▋| 10581/10986 [4:08:12<11:54,  1.76s/it]

training loss: 3.340175151824951


training:  96%|█████████▋| 10582/10986 [4:08:13<11:07,  1.65s/it]

training loss: 3.464304208755493


training:  96%|█████████▋| 10583/10986 [4:08:14<10:25,  1.55s/it]

training loss: 3.337934732437134


training:  96%|█████████▋| 10584/10986 [4:08:16<09:55,  1.48s/it]

training loss: 3.460273265838623


training:  96%|█████████▋| 10585/10986 [4:08:17<09:34,  1.43s/it]

training loss: 3.3230268955230713


training:  96%|█████████▋| 10586/10986 [4:08:18<09:20,  1.40s/it]

training loss: 3.429389476776123


training:  96%|█████████▋| 10587/10986 [4:08:20<09:08,  1.37s/it]

training loss: 3.33408260345459


training:  96%|█████████▋| 10588/10986 [4:08:21<09:02,  1.36s/it]

training loss: 3.311352252960205


training:  96%|█████████▋| 10589/10986 [4:08:22<08:56,  1.35s/it]

training loss: 3.3642306327819824


training:  96%|█████████▋| 10590/10986 [4:08:24<08:50,  1.34s/it]

training loss: 3.4384968280792236


training:  96%|█████████▋| 10591/10986 [4:08:25<09:25,  1.43s/it]

training loss: 3.348041534423828


training:  96%|█████████▋| 10592/10986 [4:08:27<09:10,  1.40s/it]

training loss: 3.4444527626037598


training:  96%|█████████▋| 10593/10986 [4:08:28<08:59,  1.37s/it]

training loss: 3.4053549766540527


training:  96%|█████████▋| 10594/10986 [4:08:29<08:55,  1.37s/it]

training loss: 3.383329153060913


training:  96%|█████████▋| 10595/10986 [4:08:31<08:50,  1.36s/it]

training loss: 3.5358171463012695


training:  96%|█████████▋| 10596/10986 [4:08:32<08:43,  1.34s/it]

training loss: 3.3150460720062256


training:  96%|█████████▋| 10597/10986 [4:08:33<08:39,  1.33s/it]

training loss: 3.361690044403076


training:  96%|█████████▋| 10598/10986 [4:08:35<08:38,  1.34s/it]

training loss: 3.480137348175049


training:  96%|█████████▋| 10599/10986 [4:08:36<08:37,  1.34s/it]

training loss: 3.4358959197998047


training:  96%|█████████▋| 10600/10986 [4:08:37<08:32,  1.33s/it]

training loss: 3.478299379348755
valid loss: 3.475473403930664
perplexity: 32.3131217956543


training:  96%|█████████▋| 10601/10986 [4:08:40<11:27,  1.79s/it]

training loss: 3.301596164703369


training:  97%|█████████▋| 10602/10986 [4:08:41<10:40,  1.67s/it]

training loss: 3.341176986694336


training:  97%|█████████▋| 10603/10986 [4:08:43<09:59,  1.57s/it]

training loss: 3.3509864807128906


training:  97%|█████████▋| 10604/10986 [4:08:44<09:27,  1.49s/it]

training loss: 3.339165210723877


training:  97%|█████████▋| 10605/10986 [4:08:45<09:05,  1.43s/it]

training loss: 3.4275715351104736


training:  97%|█████████▋| 10606/10986 [4:08:47<08:54,  1.41s/it]

training loss: 3.367666721343994


training:  97%|█████████▋| 10607/10986 [4:08:48<08:45,  1.39s/it]

training loss: 3.417104721069336


training:  97%|█████████▋| 10608/10986 [4:08:49<08:37,  1.37s/it]

training loss: 3.321842908859253


training:  97%|█████████▋| 10609/10986 [4:08:51<08:32,  1.36s/it]

training loss: 3.3576323986053467


training:  97%|█████████▋| 10610/10986 [4:08:52<08:26,  1.35s/it]

training loss: 3.501699209213257


training:  97%|█████████▋| 10611/10986 [4:08:54<08:59,  1.44s/it]

training loss: 3.325702667236328


training:  97%|█████████▋| 10612/10986 [4:08:55<08:43,  1.40s/it]

training loss: 3.3697926998138428


training:  97%|█████████▋| 10613/10986 [4:08:56<08:34,  1.38s/it]

training loss: 3.4438047409057617


training:  97%|█████████▋| 10614/10986 [4:08:58<08:27,  1.36s/it]

training loss: 3.3828866481781006


training:  97%|█████████▋| 10615/10986 [4:08:59<08:19,  1.35s/it]

training loss: 3.4432590007781982


training:  97%|█████████▋| 10616/10986 [4:09:00<08:16,  1.34s/it]

training loss: 3.4180679321289062


training:  97%|█████████▋| 10617/10986 [4:09:02<08:14,  1.34s/it]

training loss: 3.3959367275238037


training:  97%|█████████▋| 10618/10986 [4:09:03<08:09,  1.33s/it]

training loss: 3.4292800426483154


training:  97%|█████████▋| 10619/10986 [4:09:04<08:10,  1.34s/it]

training loss: 3.3830244541168213


training:  97%|█████████▋| 10620/10986 [4:09:06<08:13,  1.35s/it]

training loss: 3.3694908618927
valid loss: 3.3672921657562256
perplexity: 28.999895095825195


training:  97%|█████████▋| 10621/10986 [4:09:09<11:01,  1.81s/it]

training loss: 3.429033041000366


training:  97%|█████████▋| 10622/10986 [4:09:10<10:12,  1.68s/it]

training loss: 3.3036835193634033


training:  97%|█████████▋| 10623/10986 [4:09:11<09:34,  1.58s/it]

training loss: 3.3470563888549805


training:  97%|█████████▋| 10624/10986 [4:09:13<09:08,  1.52s/it]

training loss: 3.52122163772583


training:  97%|█████████▋| 10625/10986 [4:09:14<08:50,  1.47s/it]

In [None]:
class TransformerDecoder(nn.Module):
    """Decoder stack built from token embeddings, positional encoding and attention blocks.

    The model is assembled as:

    1. ``token_emb``: ``nn.Embedding(num_tokens, d)``.
    2. ``positional_emb``: ``PositionalEncoding(d, max_len=5000)``.
    3. ``layers``: ``depth`` pairs of ``(MultiHeadAttention, SubLayer)``.
    4. ``to_out``: ``LayerNorm(d)`` followed by ``Linear(d, num_tokens)``.

    Args:
        num_tokens: Vocabulary size; used for the embedding table, forwarded to
            every ``MultiHeadAttention`` and used as the output width.
        d: Model (embedding) width. Must be divisible by ``heads``.
        heads: Number of attention heads forwarded to ``MultiHeadAttention``.
        depth: Number of ``(attention, sub-layer)`` pairs.
        hidden_size: Forwarded to ``SubLayer``.
        dropout: Forwarded to ``SubLayer``.
        batch_size: Forwarded to ``MultiHeadAttention``.
    """

    def __init__(
        self,
        num_tokens: int,
        d: int,
        heads: int = 8,
        depth: int = 4,
        hidden_size: int = 1000,
        dropout: float = 0.3,
        batch_size: int = 16
    ):
        # Checked before any sub-module is built: d is split evenly across heads
        # (see dim_head below).
        assert d % heads == 0, f"d ({d}) must be divisible by heads ({heads})"

        super().__init__()
        self.token_emb = nn.Embedding(num_tokens, d)
        self.positional_emb = PositionalEncoding(d, max_len = 5000)

        # Hyper-parameters are kept as attributes for later inspection.
        self.dim_head = d // heads
        self.d = d
        self.heads = heads
        self.depth = depth
        self.hidden_size = hidden_size
        self.dropout = dropout
        self.batch_size = batch_size

        # Each entry is a two-element ModuleList [attention, sub-layer].
        # Construction order (attention first, then sub-layer) is kept so that
        # parameter initialisation under a fixed seed is unchanged.
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            attn = MultiHeadAttention(num_tokens, d, heads, self.batch_size)
            sub_layer = SubLayer(d, dropout, hidden_size)
            self.layers.append(nn.ModuleList([attn, sub_layer]))

        # Output head: normalise, then project to one score per vocabulary entry.
        self.to_out = nn.Sequential(
            nn.LayerNorm(d),
            nn.Linear(d, num_tokens)
        )

    def forward(
        self,
        x: torch.Tensor
    ) -> torch.Tensor:
        """Run the decoder on a batch of token ids.

        Args:
            x: Tensor of token ids whose first two dimensions are
                ``(batch, seq_len)``.

        Returns:
            The output of ``to_out`` with dimensions 1 and 2 swapped.
            Assuming the attention and sub-layer blocks preserve the
            ``(batch, seq_len, d)`` shape (TODO confirm), this is
            ``(batch, num_tokens, seq_len)``.

        Raises:
            ValueError: If ``x`` has fewer than two dimensions.
        """
        # The model needs at least (batch, seq_len); fail early with a clear message.
        if x.ndim < 2:
            raise ValueError(
                f"expected input with at least 2 dimensions (batch, seq_len), got {x.ndim}"
            )
        device = x.device

        x = self.token_emb(x)
        x = self.positional_emb(x)

        for attn, sub_l in self.layers:
            # Attention returns a pair; the second element is not used here.
            x, _ = attn(x, device)

            # Per the original author's note: normalization + feedforward +
            # residual connection (implemented inside SubLayer).
            x = sub_l(x)

        # Swap dims 1 and 2 so the vocabulary dimension comes second,
        # presumably for the loss function used by the training loop (verify there).
        return self.to_out(x).transpose(1, 2)