<a href="https://colab.research.google.com/github/whoami-Lory271/NN-project-memorizing-transformers/blob/main/NN_project_Antonelli_DeSantis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from torch import nn as nn
import numpy as np
from torch.nn import functional as F
from math import sqrt
import matplotlib.pyplot as plt
from torch.autograd import Variable
from pathlib import Path
from filelock import FileLock
import random
import tqdm
import gzip
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.utils import resample
from sklearn.model_selection import train_test_split

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset and perplexity-log paths used later in this notebook live under it.
drive.mount('/content/drive')

Mounted at /content/drive


# KNN Memory

In [None]:
!pip install faiss-gpu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [None]:
!pip install einops

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting einops
  Downloading einops-0.6.0-py3-none-any.whl (41 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/41.6 KB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.6/41.6 KB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: einops
Successfully installed einops-0.6.0


In [None]:
# imports for the KNN memory
import os
import math
import torch
import faiss
import numpy as np
from pathlib import Path
from functools import wraps

from contextlib import ExitStack, contextmanager

from einops import rearrange, pack, unpack

# multiprocessing

from joblib import Parallel, delayed, cpu_count

In [None]:
# Integer read from the FAISS_INDEX_GPU_ID environment variable (0 when unset).
# NOTE(review): not referenced anywhere else in the visible code.
FAISS_INDEX_GPU_ID = int(os.environ.get('FAISS_INDEX_GPU_ID', 0))

# Default directory in which the per-layer memmap files of the KNN memories are created.
DEFAULT_KNN_MEMORY_MEMMAP_DIRECTORY = './.tmp/knn.memories'

# helper functions

def exists(val):
    """Return True for every value except None (falsy values such as 0 or '' still count as existing)."""
    is_missing = val is None
    return not is_missing

def default(val, d):
    """Return `val` unless it is None, in which case return the fallback `d`."""
    if exists(val):
        return val
    return d

def cast_list(val):
    """Return `val` itself when it is already a list, otherwise wrap it in a one-element list."""
    if isinstance(val, list):
        return val
    return [val]

def all_el_unique(arr):
    """Return True when no element of `arr` occurs more than once (elements must be hashable)."""
    distinct = set(arr)
    return len(arr) == len(distinct)

@contextmanager
def multi_context(*cms):
    """Enter all given context managers in order and yield the list of values they returned.

    An ExitStack owns them, so they are exited (in reverse order) when the block ends.
    """
    with ExitStack() as stack:
        entered = []
        for manager in cms:
            entered.append(stack.enter_context(manager))
        yield entered

def count_intersect(x, y):
    """For each element of the 1-D array `x`, count how many entries of the 1-D array `y` equal it."""
    x_column = rearrange(x, 'i -> i 1')
    y_row = rearrange(y, 'j -> 1 j')
    # broadcasting the comparison gives a (len(x), len(y)) boolean match matrix
    matches = x_column == y_row
    return np.sum(matches, axis = -1)

def check_shape(tensor, pattern, **kwargs):
    """Validate `tensor` against an einops pattern by rearranging it onto the same pattern.

    `kwargs` pin individual axis sizes; the (unchanged) tensor is returned when the pattern fits.
    """
    identity_pattern = f"{pattern} -> {pattern}"
    return rearrange(tensor, identity_pattern, **kwargs)

# a wrapper around faiss IndexHNSWFlat
# taking care of expiring old keys automagically

class KNN():
    """Wrapper around a faiss `IndexHNSWFlat` (inner-product metric) with optional entry capping.

    Alongside the index it keeps `ids` (the ids passed to `add`) and, when `keep_stats`
    is set, per-id hit / age counters.

    NOTE(review): `add` calls `index.add(x)` without the ids, so `search` returns the
    indices faiss itself reports, not the values stored in `self.ids` — confirm that
    callers expect this.
    """

    def __init__(
        self,
        dim,
        max_num_entries,
        cap_num_entries = False,
        M = 15,
        keep_stats = False
    ):
        """
        dim: dimensionality of the indexed vectors.
        max_num_entries: entry limit enforced when `cap_num_entries` is True.
        cap_num_entries: when True, the index is reset before an add that would exceed the limit.
        M: second positional argument forwarded to `faiss.IndexHNSWFlat`.
        keep_stats: when True, track hits / ages per stored id.
        """
        index = faiss.IndexHNSWFlat(dim, M, faiss.METRIC_INNER_PRODUCT)
        self.index = index
        self.max_num_entries = max_num_entries
        self.cap_num_entries = cap_num_entries
        self.is_trained = False
        self.keep_stats = keep_stats

        self.reset()

    def __del__(self):
        if hasattr(self, 'index'):
            del self.index

    def reset(self):
        """Empty the index and all bookkeeping arrays and mark the index as untrained."""
        self.ids = np.empty((0,), dtype = np.int32)

        if self.keep_stats:
            self.hits = np.empty((0,), dtype = np.int32)
            self.age_num_iterations = np.empty((0,), dtype = np.int32)
            self.ages_since_last_hit = np.empty((0,), dtype = np.int32)

        self.index.reset()
        self.is_trained = False

    def train(self, x):
        """Forward `x` to the index's `train` and remember that training happened."""
        self.index.train(x)
        self.is_trained = True

    def add(self, x, ids):
        """Insert vectors `x` and register their `ids`.

        When capping is enabled and the insertion would push the number of registered
        ids above `max_num_entries`, everything stored so far is dropped first.
        Returns whatever `index.add` returns.
        """
        # Expire BEFORE registering the new batch. Resetting afterwards (as the previous
        # version did) wiped the ids / stats of the very entries that were then added to
        # the index and left `is_trained` False, so searches returned -1 for a populated index.
        if self.cap_num_entries and len(self.ids) + len(ids) > self.max_num_entries:
            self.reset()

        if not self.is_trained:
            self.train(x)

        self.ids = np.concatenate((ids, self.ids))

        if self.keep_stats:
            self.hits = np.concatenate((np.zeros_like(ids), self.hits))
            self.age_num_iterations = np.concatenate((np.zeros_like(ids), self.age_num_iterations))
            self.ages_since_last_hit = np.concatenate((np.zeros_like(ids), self.ages_since_last_hit))

        return self.index.add(x)

    def search(
        self,
        x,
        topk,
        nprobe = 8,
        return_distances = False,
        increment_hits = False,
        increment_age = True
    ):
        """Return the `topk` result indices for each query row of `x`.

        While nothing has been added (index untrained) every result is -1.
        `nprobe` is accepted for interface compatibility but is not used.
        With `return_distances` the result is the pair `(indices, distances)`.
        """
        if not self.is_trained:
            indices = np.full((x.shape[0], topk), -1)

            if return_distances:
                # keep the (indices, distances) contract on the empty path as well;
                # -inf is used as the placeholder score for "no result"
                distances = np.full((x.shape[0], topk), -np.inf, dtype = np.float32)
                return indices, distances

            return indices

        distances, indices = self.index.search(x, k = topk)

        if increment_hits and self.keep_stats:
            hits = count_intersect(self.ids, rearrange(indices, '... -> (...)'))
            self.hits += hits

            # age every id, then zero the age of the ones that were just hit
            self.ages_since_last_hit += 1
            self.ages_since_last_hit *= (hits == 0)

        if increment_age and self.keep_stats:
            self.age_num_iterations += 1

        if return_distances:
            return indices, distances

        return indices

# KNN memory layer, where one can store key / value memories
# can automatically take care of a collection of faiss indices (across batch dimension)

class KNNMemory():
    """Key / value memory backed by one `KNN` index per batch row and a numpy memmap.

    The memmap has shape `(num_indices, max_memories, 2, dim)`; axis 2 holds key (0) and
    value (1). `scoped_indices` selects which batch rows `add` and `search` operate on.
    """

    def __init__(
        self,
        dim,
        max_memories = 16000,
        num_indices = 1,
        memmap_filename = './knn.memory.memmap',
        multiprocessing = True
    ):
        """
        dim: size of each key / value vector.
        max_memories: number of slots per batch row in the memmap (also the cap of each KNN).
        num_indices: number of independent indices (batch rows).
        memmap_filename: file backing the memmap; it is opened with mode 'w+'.
        multiprocessing: when True joblib uses `cpu_count()` jobs, otherwise a single job.
        """
        self.dim = dim
        self.num_indices = num_indices
        self.scoped_indices = list(range(num_indices))

        self.max_memories = max_memories
        self.shape = (num_indices, max_memories, 2, dim)
        self.db_offsets = np.zeros(num_indices, dtype = np.int32)

        self.db = np.memmap(memmap_filename, mode = 'w+', dtype = np.float32, shape = self.shape)
        self.knns = [KNN(dim = dim, max_num_entries = max_memories, cap_num_entries = True) for _ in range(num_indices)]

        self.n_jobs = cpu_count() if multiprocessing else 1

    def set_scoped_indices(self, indices):
        """Restrict subsequent `add` / `search` calls to the given unique batch indices."""
        indices = list(indices)
        assert all_el_unique(indices), f'all scoped batch indices must be unique, received: {indices}'
        assert all([0 <= i < self.num_indices for i in indices]), f'each batch index must be between 0 and less than {self.num_indices}: received {indices}'
        self.scoped_indices = indices

    @contextmanager
    def at_batch_indices(self, indices):
        """Temporarily scope the memory to `indices`; the previous scope is restored on exit."""
        prev_indices = self.scoped_indices
        self.set_scoped_indices(indices)
        try:
            yield self
        finally:
            # restore even when the body raises, otherwise the narrowed scope would leak
            self.set_scoped_indices(prev_indices)

    def clear(self, batch_indices = None):
        """Reset the KNN index and write offset of the given batch rows (all rows when None)."""
        if not exists(batch_indices):
            batch_indices = list(range(self.num_indices))

        batch_indices = cast_list(batch_indices)

        for index in batch_indices:
            knn = self.knns[index]
            knn.reset()

        self.db_offsets[batch_indices] = 0

    def add(self, memories):
        """Store `memories` of shape (len(scoped_indices), n, 2, dim) for the scoped batch rows.

        Only the last `max_memories` entries along n are kept. Keys go into the faiss
        indices, key / value pairs into the memmap at `(offset + j) % max_memories`.
        """
        check_shape(memories, 'b n kv d', d = self.dim, kv = 2, b = len(self.scoped_indices))

        memories = memories.detach().cpu().numpy()
        memories = memories[:, -self.max_memories:]
        num_memories = memories.shape[1]

        knn_insert_ids = np.arange(num_memories)

        keys = np.ascontiguousarray(memories[..., 0, :])
        scoped = list(self.scoped_indices)
        knns = [self.knns[i] for i in scoped]
        db_offsets = [self.db_offsets[i] for i in scoped]

        # use joblib to insert new key / value memories into faiss index

        @delayed
        def knn_add(knn, key, db_offset):
            knn.add(key, ids = knn_insert_ids + db_offset)
            return knn

        # the (possibly copied) KNN objects come back from the workers and replace the old ones
        updated_knns = Parallel(n_jobs = self.n_jobs)(knn_add(*args) for args in zip(knns, keys, db_offsets))
        for knn_idx, scoped_idx in enumerate(scoped):
            self.knns[scoped_idx] = updated_knns[knn_idx]

        # add the new memories to the memmap "database"

        add_indices = (rearrange(np.arange(num_memories), 'j -> 1 j') + rearrange(self.db_offsets[scoped], 'i -> i 1')) % self.max_memories
        self.db[rearrange(np.array(scoped), 'i -> i 1'), add_indices] = memories
        self.db.flush()

        # advance the write offset only for the rows that received data; incrementing
        # every row would shift the offsets of batch rows outside the current scope
        self.db_offsets[scoped] += num_memories

    def search(
        self,
        queries,
        topk,
        nprobe = 8,
        increment_hits = True,
        increment_age = True
    ):
        """Retrieve `topk` key / value pairs for every query vector.

        queries: tensor of shape (len(scoped_indices), ..., dim).
        Returns `(key_values, mask)` on the queries' device: `key_values` has shape
        (b, ..., topk, 2, dim) and `mask` (b, ..., topk) is False where the index returned -1;
        those entries are zero-filled.
        """
        check_shape(queries, 'b ... d', d = self.dim, b = len(self.scoped_indices))
        queries, ps = pack([queries], 'b * d')

        device = queries.device
        queries = queries.detach().cpu().numpy()

        all_masks = []
        all_key_values = []

        knns = [self.knns[i] for i in self.scoped_indices]

        # parallelize faiss search

        @delayed
        def knn_search(knn, query):
            return knn.search(query, topk, nprobe, increment_hits = increment_hits, increment_age = increment_age)

        fetched_indices = Parallel(n_jobs = self.n_jobs)(knn_search(*args) for args in zip(knns, queries))

        # get all the memory key / values from memmap 'database'
        # todo - remove for loop below

        for batch_index, indices in zip(self.scoped_indices, fetched_indices):
            mask = indices !=  -1
            # -1 is not a valid slot; read slot 0 instead and zero it out through the mask below
            db_indices = np.where(mask, indices, 0)

            all_masks.append(torch.from_numpy(mask))

            key_values = self.db[batch_index, db_indices % self.max_memories]
            all_key_values.append(torch.from_numpy(key_values))

        all_masks = torch.stack(all_masks)
        all_key_values = torch.stack(all_key_values)
        all_key_values = all_key_values.masked_fill(~rearrange(all_masks, '... -> ... 1 1'), 0.)

        all_key_values, = unpack(all_key_values, ps, 'b * n kv d')
        all_masks, = unpack(all_masks, ps, 'b * n')

        return all_key_values.to(device), all_masks.to(device)

    def __del__(self):
        # __init__ may have failed before these attributes existed, so guard both;
        # deleting the attributes (not a loop variable) is what releases the references
        if hasattr(self, 'knns'):
            del self.knns
        if hasattr(self, 'db'):
            del self.db

# extends list with some extra methods for collections of KNN memories

class KNNMemoryList(list):
    """A list of `KNNMemory` objects (one per memory layer) with a few collective helpers."""

    def cleanup(self):
        """Drop this list's references to its memories so they can be garbage-collected."""
        # `for memory in self: del memory` only unbinds the loop variable and releases
        # nothing; emptying the list is what actually lets the memories be finalized
        self.clear()

    @classmethod
    def create_memories(
        cls,
        *,
        batch_size,
        num_memory_layers,
        memories_directory = DEFAULT_KNN_MEMORY_MEMMAP_DIRECTORY
    ):
        """Return a factory building `num_memory_layers` memories with `batch_size` indices each.

        The directory is created if needed. The returned callable forwards its arguments
        to `KNNMemory` and gives each layer its own memmap file inside `memories_directory`.
        """
        memories_path = Path(memories_directory)
        memories_path.mkdir(exist_ok = True, parents = True)

        def inner(*args, **kwargs):
            return cls([KNNMemory(*args, num_indices = batch_size, memmap_filename = str(memories_path / f'knn.memory.layer.{ind + 1}.memmap'), **kwargs) for ind in range(num_memory_layers)])
        return inner

    @contextmanager
    def at_batch_indices(
        self,
        indices
    ):
        """Scope every contained memory to the given batch indices for the duration of the block."""
        knn_batch_indices_contexts = [memory.at_batch_indices(indices) for memory in self]
        with multi_context(*knn_batch_indices_contexts):
            yield

    def clear_memory(
        self,
        batch_indices = None,
        memory_indices = None
    ):
        """Clear `batch_indices` in the memories selected by `memory_indices` (all memories when None)."""
        memory_indices = default(memory_indices, tuple(range(len(self))))

        for memory_index in memory_indices:
            memory = self[memory_index]
            memory.clear(batch_indices)

# Memorizing transformers

In [None]:
def attention(query, key, value, sqrt_q, device):
    """Causal scaled dot-product attention.

    query / key / value: tensors whose last two axes are (sequence, head_dim).
    sqrt_q: scaling divisor applied to the raw scores.
    device: device on which the causal mask is created (must match the scores' device).
    Returns the attention-weighted sum of `value`, same leading shape as `query`.
    """
    scores = torch.matmul(query, key.transpose(-2, -1)) / sqrt_q
    i, j = scores.shape[-2:]
    # True above the diagonal = key positions that lie in the query's future
    causal_mask = torch.ones((i, j), dtype = torch.bool, device = device).triu(j - i + 1)
    # Masked scores must be hugely negative so softmax gives them ~0 weight. The previous
    # fill value -1e-9 is practically 0, which left future positions with non-zero weight.
    # The finite dtype minimum is used instead of -inf so no NaN can arise.
    scores = scores.masked_fill(causal_mask, -torch.finfo(scores.dtype).max)
    return torch.matmul(F.softmax(scores, dim = -1), value)

def KNNattention(query, key, value, sqrt_q, mask):
    """Attention of each query over its own set of retrieved memories.

    query: (b, h, i, d); key / value: (b, h, i, j, d) with j retrieved memories per query.
    sqrt_q: scaling divisor applied to the raw scores.
    mask: boolean (b, h, i, j), True where a memory slot must be ignored.
    Returns a tensor of shape (b, h, i, d).
    """
    scores = torch.einsum('b h i d, b h i j d -> b h i j', query, key) / sqrt_q
    # Ignored slots need a hugely negative score; the previous fill value -1e-9 is ~0 and
    # did not suppress them. A finite minimum (not -inf) keeps the softmax free of NaN
    # when every slot of a query is masked.
    scores = scores.masked_fill(mask, -torch.finfo(scores.dtype).max)
    weights = F.softmax(scores, dim = -1)
    return torch.einsum('b h i j, b h i j d -> b h i d', weights, value)

In [None]:
class MultiHeadAttention(nn.Module):
  """Causal multi-head self-attention that also returns its (detached) keys / values."""

  def __init__(self, n, d, h, batch_size):
    """
    n: accepted for interface compatibility; not used by this module.
    d: model dimension, must be divisible by `h`.
    h: number of heads.
    batch_size: stored for reference; the forward pass reads the batch size from its input.
    """
    super(MultiHeadAttention, self).__init__()
    assert d % h == 0
    #assume q = v
    self.q = d // h
    self.sqrt_q = sqrt(self.q)
    self.h = h
    self.batch_size = batch_size
    self.W_q = nn.Linear(d, d, bias = False) #stack of h matrices of dimension (d, q), one for each head
    self.W_k = nn.Linear(d, d, bias = False)
    self.W_v = nn.Linear(d, d, bias = False)
    self.W_o = nn.Linear(d, d, bias = False)

  def forward(self, x, device):
    """
    x: tensor of shape (batch, seq, d).
    device: device on which the causal mask is built.
    Returns `(output, new_memories)`: output is (batch, seq, d); new_memories is the
    detached stack of keys and values with shape (batch, h, seq, 2, d // h).
    """
    # Take the batch size from the input. Reshaping with the constructor's batch_size
    # raised, or silently mixed rows, whenever a batch of another size was passed.
    batch = x.size(0)
    query = self.W_q(x).view(batch, -1, self.h, self.q).transpose(1, 2)
    key = self.W_k(x).view(batch, -1, self.h, self.q).transpose(1, 2)
    value = self.W_v(x).view(batch, -1, self.h, self.q).transpose(1, 2)
    new_memories = torch.stack((key, value), dim = -2).detach()
    attention_value = attention(query, key, value, self.sqrt_q, device)
    merged_heads = attention_value.transpose(1, 2).contiguous().view(batch, -1, self.h*self.q)
    return self.W_o(merged_heads), new_memories

In [None]:
class KNNAttention(nn.Module):
   """Self-attention layer that mixes local causal attention with attention over a KNN memory.

   A learned per-head gate `sigmoid(b_g)` weights the memory attention against the local one.
   """

   def __init__(self, n, d, h, num_retrieved_memories, batch_size):
      """
      n: accepted for interface compatibility; not used by this module.
      d: model dimension, must be divisible by `h`.
      h: number of heads.
      num_retrieved_memories: how many memories are fetched per query.
      batch_size: stored for reference; the forward pass reads the batch size from its input.
      """
      super(KNNAttention, self).__init__()
      assert d % h == 0
      #assume q = v
      self.q = d // h
      self.sqrt_q = sqrt(self.q)
      self.h = h
      self.W_q = nn.Linear(d, d, bias = False)
      self.W_k = nn.Linear(d, d, bias = False)
      self.W_v = nn.Linear(d, d, bias = False)
      self.W_o = nn.Linear(d, d, bias = False)
      self.b_g = nn.Parameter(torch.randn((h,))) #one for each head
      self.num_retrieved_memories = num_retrieved_memories
      self.batch_size = batch_size

   def forward(self, x, knn_memory, device):
      """
      x: tensor of shape (batch, seq, d).
      knn_memory: object providing `search(queries, topk)` and `add(memories)`.
      device: device on which the causal mask is built.
      Returns `(output, new_kv_memories)`: output is (batch, seq, d); new_kv_memories is the
      detached key / value stack of shape (batch, h * seq, 2, d // h) that was stored.
      """
      # Take the batch size from the input. Reshaping with the constructor's batch_size
      # raised, or silently mixed rows, whenever a batch of another size was passed.
      batch = x.size(0)

      # calculate local attention
      query = self.W_q(x).view(batch, -1, self.h, self.q).transpose(1, 2)
      key = self.W_k(x).view(batch, -1, self.h, self.q).transpose(1, 2)
      value = self.W_v(x).view(batch, -1, self.h, self.q).transpose(1, 2)
      local_attention = attention(query, key, value, self.sqrt_q, device)

      # calculate knn attention over memory
      mem_kv, mem_mask = knn_memory.search(query, self.num_retrieved_memories)
      mem_key, mem_value = mem_kv.unbind(dim = -2)
      # mem_mask is True for valid memories, KNNattention expects True for slots to ignore
      knn_attention = KNNattention(query, mem_key, mem_value, self.sqrt_q, ~mem_mask)

      # memory to be stored
      new_kv_memories = torch.stack((key, value), dim = -2).view(batch, -1, 2, self.q).detach()

      # add to knn memory (after the search, so a segment never retrieves its own entries)
      if new_kv_memories.numel() > 0:
        knn_memory.add(new_kv_memories)

      # combining local and memory
      g = torch.sigmoid(self.b_g)
      final_attention = torch.einsum('b h n d, h -> b h n d', knn_attention, g) + \
                        torch.einsum('b h n d, h -> b h n d', local_attention, (1 - g))

      merged_heads = final_attention.transpose(1, 2).contiguous().view(batch, -1, self.h*self.q)
      return self.W_o(merged_heads), new_kv_memories

In [None]:
class SubLayer(nn.Module):
  """Feed-forward sub-layer: LayerNorm, then a two-layer ReLU MLP with dropout, added back to the input."""

  def __init__(self, d, dropout, hidden_size):
    """
    d: input and output feature size.
    dropout: dropout probability applied after the ReLU.
    hidden_size: width of the hidden linear layer.
    """
    super().__init__()
    self.norm = nn.LayerNorm(d)
    feed_forward = [
        nn.Linear(d, hidden_size, bias = True),
        nn.ReLU(),
        nn.Dropout(dropout),
        nn.Linear(hidden_size, d, bias = True),
    ]
    self.mlp = nn.Sequential(*feed_forward)

  def forward(self, x):
    # normalize first, transform, then add the untouched input back (residual connection)
    normed = self.norm(x)
    update = self.mlp(normed)
    return x + update

In [None]:
class PositionalEncoding(nn.Module):
  """Adds fixed sinusoidal position encodings to a (batch, seq, d_model) tensor."""

  def __init__(self, d_model, max_len=5000):
    """
    d_model: feature size of the inputs (odd sizes are supported).
    max_len: number of positions for which encodings are precomputed.
    """
    super(PositionalEncoding, self).__init__()

    # Compute the positional encodings once in log space.
    pe = torch.zeros(max_len, d_model)
    position = torch.arange(0, max_len).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, d_model, 2) *
                          -(math.log(10000.0) / d_model))
    angles = position * div_term
    pe[:, 0::2] = torch.sin(angles)
    # for odd d_model there is one cosine column fewer than sine columns; trimming
    # avoids the shape mismatch the unsliced assignment raised
    pe[:, 1::2] = torch.cos(angles)[:, :d_model // 2]
    pe = pe.unsqueeze(0)
    # a buffer follows the module across devices and is never a trainable parameter
    self.register_buffer('pe', pe)

  def forward(self, x):
    # buffers carry no gradient, so the deprecated Variable(..., requires_grad=False)
    # wrapper is unnecessary
    return x + self.pe[:, :x.size(1)]

In [None]:
class MemorizingTransformer(nn.Module):
    """Decoder-style transformer in which one layer (`knn_attn_idx`) also attends to a KNN memory.

    The forward pass returns logits of shape (batch, num_tokens, seq), i.e. already
    transposed into the layout `nn.CrossEntropyLoss` expects.
    """

    def __init__(
        self,
        num_tokens,
        d,
        heads = 8,
        depth = 4,
        knn_attn_idx = 2,
        attn_dropout = 0.,
        hidden_size = 1000,
        dropout = 0.3,
        max_knn_memories = 1000,
        num_retrieved_memories = 8,
        knn_memories_directory = DEFAULT_KNN_MEMORY_MEMMAP_DIRECTORY,
        knn_memory_multiprocessing = False,
        batch_size = 16
    ):
        """
        num_tokens: vocabulary size (embedding rows and output logits).
        d: model dimension, must be divisible by `heads`.
        heads: number of attention heads.
        depth: number of (attention, feed-forward) layers.
        knn_attn_idx: index of the layer that uses `KNNAttention`; must be < depth.
        attn_dropout: stored only; not applied anywhere in this class.
        hidden_size: hidden width of each feed-forward sub-layer.
        dropout: dropout probability inside the feed-forward sub-layers.
        max_knn_memories: `max_memories` passed to memories built by `create_knn_memories`.
        num_retrieved_memories: memories fetched per query by the KNN layer.
        knn_memories_directory: directory for memmap files and the lock file.
        knn_memory_multiprocessing: `multiprocessing` flag passed to created memories.
        batch_size: forwarded to the attention layers.
        """
        # asserts
        assert d % heads == 0
        assert knn_attn_idx < depth

        super(MemorizingTransformer, self).__init__()
        self.token_emb = nn.Embedding(num_tokens, d)
        self.positional_emb = PositionalEncoding(d, max_len = 5000)
        self.dim_head = d // heads
        self.d = d
        self.heads = heads
        self.knn_attn_idx = knn_attn_idx
        self.depth = depth
        self.attn_dropout = attn_dropout
        self.hidden_size = hidden_size
        self.dropout = dropout
        self.max_knn_memories = max_knn_memories
        self.num_retrieved_memories = num_retrieved_memories
        self.knn_memories_directory = knn_memories_directory
        self.knn_memory_multiprocessing = knn_memory_multiprocessing
        self.batch_size = batch_size

        self.layers = nn.ModuleList([])
        for idx in range(depth):
            if idx == knn_attn_idx:
                attn = KNNAttention(num_tokens, d, heads, num_retrieved_memories, self.batch_size)
            else:
                attn = MultiHeadAttention(num_tokens, d, heads, self.batch_size)

            self.layers.append(nn.ModuleList([
                attn,
                SubLayer(d, dropout, hidden_size)
            ]))

        self.to_out = nn.Sequential(
            nn.LayerNorm(d),
            nn.Linear(d, num_tokens)
        )

        # knn memories init

        self.knn_mem_kwargs = dict(
            dim = self.dim_head,
            max_memories = self.max_knn_memories,
            multiprocessing = knn_memory_multiprocessing
        )

    def forward(
        self,
        x,
        knn_memory
    ):
        """
        x: integer token ids of shape (batch, seq).
        knn_memory: the memory used by the KNN attention layer. A list / tuple (such as the
            one yielded by `knn_memories_context`) is accepted too; its first element is used
            because the model has exactly one KNN layer.
        Returns logits of shape (batch, num_tokens, seq).
        """
        device = x.device

        if isinstance(knn_memory, (list, tuple)):
            knn_memory = knn_memory[0]

        x = self.token_emb(x)
        x = self.positional_emb(x)

        for idx, (attn, sub_l) in enumerate(self.layers):

            # attention (the second return value, the new memories, is not needed here)

            if idx == self.knn_attn_idx:
                x, _ = attn(x, knn_memory, device)
            else:
                x, _ = attn(x, device)

            # normalization + feedforward + residual connection

            x = sub_l(x)

        return self.to_out(x).transpose(1, 2)

    def create_knn_memories(
        self,
        *,
        batch_size
    ):
        """Build a `KNNMemoryList` holding the single memory layer this model uses."""
        return KNNMemoryList.create_memories(
            batch_size = batch_size,
            num_memory_layers = 1,
            memories_directory = self.knn_memories_directory
        )(**self.knn_mem_kwargs)

    @contextmanager
    def knn_memories_context(
        self,
        **kwargs
    ):
        """Yield freshly created KNN memories while holding a file lock on the memories directory.

        `kwargs` are forwarded to `create_knn_memories`. The memories are cleaned up
        when the block exits, including when it exits through an exception.
        """
        knn_dir = Path(self.knn_memories_directory)
        knn_dir.mkdir(exist_ok = True, parents = True)
        lock = FileLock(str(knn_dir / 'mutex'))

        with lock:
            knn_memories = self.create_knn_memories(**kwargs)
            try:
                yield knn_memories
            finally:
                knn_memories.cleanup()

    def clear_memory(self, x, token_id, knn_memories = None):
        """Clear the KNN memories of every batch row of `x` that contains `token_id`.

        Intended for auto-clearing memories at the start / end of documents.

        x: token ids — assumed to be (batch, seq); TODO confirm for other layouts.
        token_id: the id whose presence triggers clearing of that row.
        knn_memories: a `KNNMemoryList` (uses `clear_memory`) or a single `KNNMemory`
            (uses `clear`). Required: the previous version read an undefined global
            `knn_memories` and therefore could never run.
        Raises ValueError when `knn_memories` is not given.
        """
        if knn_memories is None:
            raise ValueError('knn_memories must be provided to clear_memory')

        rows_with_token = (x == token_id).any(dim = -1)
        # nonzero(as_tuple = True) yields one index tensor per dimension; after reducing the
        # last axis there is a single (batch) dimension left for a 2-D input
        batch_indices_to_clear = rows_with_token.nonzero(as_tuple = True)[0].tolist()

        if len(batch_indices_to_clear) == 0:
            return

        if hasattr(knn_memories, 'clear_memory'):
            knn_memories.clear_memory(batch_indices_to_clear)
        else:
            knn_memories.clear(batch_indices_to_clear)

# Training

In [None]:
# constants

# -- data / model sizes --
NUM_BATCHES = 100_000      # not referenced in the visible cells
BATCH_SIZE = 16
SEQ_LEN = 512              # also passed as the model dimension `d` when the model is built
SEGMENTS = 5               # not referenced in the visible cells
HEADS = 8
DIM_HEAD = SEQ_LEN // HEADS   # per-head size; equals d // heads only because d is set to SEQ_LEN

# -- optimisation --
LEARNING_RATE = 2e-4
MAX_GRAD_CLIP_NORM = 0.5

# -- schedule (all counted in training iterations) --
EVAL_EVERY = 20            # validation step frequency
GENERATE_EVERY = 500       # not referenced in the visible cells
GENERATE_LENGTH = 512      # not referenced in the visible cells
CHECKPOINT = 100           # checkpoint / perplexity-log frequency

In [None]:
# Build the model on the GPU. Note that the model dimension `d` is set to SEQ_LEN here,
# so DIM_HEAD (SEQ_LEN // HEADS) matches the per-head size d // heads used inside the model.
model = MemorizingTransformer(
    num_tokens = 256,
    d = SEQ_LEN,
    heads = HEADS,
    batch_size = BATCH_SIZE,
    depth = 4,
    knn_attn_idx = 2,
    num_retrieved_memories = 32
).cuda()

# A single KNN memory shared by every forward pass below (training and evaluation alike).
memory = KNNMemory(
    dim = DIM_HEAD,                   # dimension of key / values
    max_memories = 1000,       # slots per batch row; once more ids than this are registered the row's KNN index is reset
    num_indices = BATCH_SIZE          # one index per batch row, so each row keeps its own memories
)

# prepare enwik8 data

# NOTE: absolute Google Drive path of one author's account - adjust it to your environment.
#Lorenzo
with gzip.open('/content/drive/MyDrive/Secondo Anno/Neural Networks/project/enwik8.gz') as file:
    # np.fromstring is deprecated for binary data (see the DeprecationWarning in the recorded
    # output); np.frombuffer is its replacement. frombuffer returns a read-only view of the
    # bytes, so copy it to get a writable array that torch.from_numpy accepts without warnings.
    X = np.frombuffer(file.read(int(95e6)), dtype=np.uint8).copy()

print(X.shape)
# number of samples to take
# n_samples = math.ceil(0.6*X.shape[0])
# take the set uniformly at random
# data = resample(X, n_samples=n_samples, replace=False)
# trX, vaX = train_test_split(data, test_size=math.ceil(0.2*data.shape[0]))

# first 90M bytes for training, the remaining 5M for validation
trX, vaX = np.split(X, [int(90e6)])
print(trX.shape)
print(vaX.shape)
# assert False
data_train, data_val = torch.from_numpy(trX), torch.from_numpy(vaX)
# data = torch.from_numpy(X)

# Alternative location of the same file (kept for reference, previously stored in a string literal):
# #Luigi
# with gzip.open('/content/drive/MyDrive/Colab Notebooks/enwik8.gz') as file:
#     X = np.frombuffer(file.read(int(95e6)), dtype=np.uint8).copy()
#     trX, vaX = np.split(X, [int(90e6)])
#     data_train, data_val = torch.from_numpy(trX), torch.from_numpy(vaX)

class TextSamplerDataset(Dataset):
    """Dataset that returns random windows of `seq_len + 1` consecutive tokens.

    The requested index is ignored: every access draws a fresh random start position.
    The extra token lets callers split a window into inputs and shifted labels.
    """

    def __init__(self, data, seq_len, device = 'cuda'):
        """
        data: 1-D tensor of token ids.
        seq_len: window length; returned sequences have `seq_len + 1` elements.
        device: device the returned windows are moved to (defaults to 'cuda' as before).
        Raises ValueError when `data` holds fewer than `seq_len + 1` elements.
        """
        super().__init__()
        if data.size(0) <= seq_len:
            raise ValueError(f'data must contain more than seq_len ({seq_len}) elements, got {data.size(0)}')
        self.data = data
        self.seq_len = seq_len
        self.device = device

    def __getitem__(self, index):
        # the upper bound is exclusive, so start + seq_len + 1 never runs past the data
        rand_start = torch.randint(0, self.data.size(0) - self.seq_len, (1,)).item()
        full_seq = self.data[rand_start: rand_start + self.seq_len + 1].long()
        return full_seq.to(self.device)

    def __len__(self):
        # number of non-overlapping windows; only determines how many samples an epoch draws
        return self.data.size(0) // self.seq_len

# # dataset and dataloader
# dataset = TextSamplerDataset(data, SEQ_LEN)
# # test_dataset = TextSamplerDataset(data_val, SEQ_LEN)

# data_size = dataset.__len__()
# # data_test_size = test_dataset.__len__()

# perc_data = 0.3
# valid_size=0.2
# indices = list(range(data_size))
# np.random.shuffle(indices)
# data_size = int(np.floor(data_size * 0.3))
# print(data_size)
# indices = indices[:data_size]

# split = int(np.floor(valid_size * data_size))
# train_idx, valid_idx = indices[split:], indices[:split]

# # define samplers for obtaining training and validation batches
# train_sampler = torch.utils.data.SubsetRandomSampler(train_idx)
# valid_sampler = torch.utils.data.SubsetRandomSampler(valid_idx)

# train_loader  = DataLoader(dataset, batch_size = BATCH_SIZE, sampler = train_sampler, drop_last = True)

# test_loader = DataLoader(dataset, batch_size = BATCH_SIZE, sampler =valid_sampler, drop_last = True)


# Datasets draw random windows regardless of the requested index, so no shuffling is configured;
# drop_last keeps every batch at exactly BATCH_SIZE rows.
train_dataset = TextSamplerDataset(data_train, SEQ_LEN)
train_loader  = DataLoader(train_dataset, batch_size = BATCH_SIZE, drop_last = True)
test_dataset = TextSamplerDataset(data_val, SEQ_LEN)
test_loader = DataLoader(test_dataset, batch_size = BATCH_SIZE, drop_last = True)

(95000000,)
(90000000,)
(5000000,)


  X = np.fromstring(file.read(int(95e6)), dtype=np.uint8)


In [None]:
def print_string(a):
  """Decode nested integer code points into text.

  `a` is an iterable of sequences of code points; each inner sequence is converted with
  `chr` and followed by a single space. The string is returned, nothing is printed.
  """
  decoded_words = ("".join(chr(code) for code in word) for word in a)
  return "".join(text + " " for text in decoded_words)

In [None]:
# optimizer

optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)
loss = nn.CrossEntropyLoss()

# training

perplexity_list = []

for i, data in enumerate(tqdm.tqdm(train_loader, desc = 'training')):
    model.train()

    train_loss = 0.

    seq, labels = data[:, :-1], data[:, 1:] #the labels are the same sequences shifted by one

    out = model(
          seq,
          knn_memory = memory
    )
    loss_item = loss(out, labels)
    train_loss += loss_item
    loss_item.backward()

    print(f'training loss: {train_loss}', flush = True)
    torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_CLIP_NORM)
    optimizer.step()
    optimizer.zero_grad()

    if i % EVAL_EVERY == 0:
        model.eval()

        # one validation batch per evaluation step
        test_data = next(iter(test_loader))

        test_loss = 0.

        # Evaluation needs no gradients; without no_grad an autograd graph was built and
        # kept alive for every validation batch.
        # NOTE(review): the same `memory` object is used for training and evaluation, and the
        # KNN attention layer stores keys / values on every forward pass, so validation
        # entries end up in the memory used for training - confirm this is intended.
        with torch.no_grad():
            seq, labels = test_data[:, :-1], test_data[:, 1:]

            out = model(
              seq,
              knn_memory = memory
            )

            loss_item = loss(out, labels)
            test_loss += loss_item

        print(f'valid loss: {test_loss}', flush = True)
        print(f'perplexity: {torch.exp(test_loss)}', flush = True)
        perplexity_list.append(torch.exp(test_loss).to('cpu').item())

    if i % CHECKPOINT == 0:
      torch.save({
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
      }, 'model_optimizer2.pt')
      # NOTE: absolute Google Drive path of one author's account - adjust it to your environment.
      #Lorenzo
      with open('/content/drive/MyDrive/Università/Magistrale/Secondo Anno/Neural Networks/project/perplexity_moreNN.npy', 'wb') as f:
        np.save(f, np.array(perplexity_list))

plt.plot(perplexity_list, label = "Memorizing Transformer Perplexity Plot")
plt.xlabel("evaluation step")
plt.ylabel("validation perplexity")
plt.legend()
plt.show()

training:   0%|          | 0/10986 [00:00<?, ?it/s]

training loss: 5.6315836906433105
valid loss: 4.66663932800293
perplexity: 106.33976745605469


training:   0%|          | 1/10986 [00:13<41:32:22, 13.61s/it]

training loss: 4.827857971191406


training:   0%|          | 2/10986 [00:17<24:20:27,  7.98s/it]

training loss: 4.303380966186523


training:   0%|          | 3/10986 [00:23<21:47:30,  7.14s/it]

training loss: 4.066551208496094


training:   0%|          | 4/10986 [00:28<19:13:02,  6.30s/it]

training loss: 4.087843418121338


training:   0%|          | 5/10986 [00:32<16:21:47,  5.36s/it]

training loss: 3.930798053741455


training:   0%|          | 6/10986 [00:34<13:01:16,  4.27s/it]

training loss: 3.843836784362793


training:   0%|          | 7/10986 [00:37<11:30:07,  3.77s/it]

training loss: 3.811279296875


training:   0%|          | 8/10986 [00:39<9:56:31,  3.26s/it] 

training loss: 3.754079580307007


training:   0%|          | 9/10986 [00:42<9:29:04,  3.11s/it]

training loss: 3.8737785816192627


training:   0%|          | 10/10986 [00:44<8:35:47,  2.82s/it]

training loss: 3.6254830360412598


training:   0%|          | 11/10986 [00:47<8:31:55,  2.80s/it]

training loss: 3.645186424255371


training:   0%|          | 12/10986 [00:49<7:58:08,  2.61s/it]

training loss: 3.7075564861297607


training:   0%|          | 13/10986 [00:52<8:05:57,  2.66s/it]

training loss: 3.6014516353607178


training:   0%|          | 14/10986 [00:54<7:37:29,  2.50s/it]

training loss: 3.6774909496307373


training:   0%|          | 15/10986 [00:57<7:49:10,  2.57s/it]

training loss: 3.617432117462158


training:   0%|          | 16/10986 [00:59<7:29:13,  2.46s/it]

training loss: 3.6254374980926514


training:   0%|          | 17/10986 [01:01<7:42:53,  2.53s/it]

training loss: 3.4745559692382812


training:   0%|          | 18/10986 [01:04<7:22:50,  2.42s/it]

training loss: 3.645411491394043


training:   0%|          | 19/10986 [01:06<7:36:25,  2.50s/it]

training loss: 3.5424442291259766


training:   0%|          | 20/10986 [01:09<7:39:10,  2.51s/it]

training loss: 3.5661442279815674
valid loss: 3.5023722648620605
perplexity: 33.1941032409668


training:   0%|          | 21/10986 [01:14<9:56:18,  3.26s/it]

training loss: 3.492518186569214


training:   0%|          | 22/10986 [01:17<9:24:42,  3.09s/it]

training loss: 3.6016287803649902


training:   0%|          | 23/10986 [01:19<8:36:21,  2.83s/it]

training loss: 3.5618581771850586


training:   0%|          | 24/10986 [01:22<8:31:09,  2.80s/it]

training loss: 3.5852222442626953


training:   0%|          | 25/10986 [01:24<7:56:04,  2.61s/it]

training loss: 3.5763068199157715


training:   0%|          | 26/10986 [01:26<7:58:05,  2.62s/it]

training loss: 3.4411754608154297


training:   0%|          | 27/10986 [01:29<7:35:33,  2.49s/it]

training loss: 3.581010580062866


training:   0%|          | 28/10986 [01:31<7:48:45,  2.57s/it]

training loss: 3.4493818283081055


training:   0%|          | 29/10986 [01:33<7:27:50,  2.45s/it]

training loss: 3.632664442062378


training:   0%|          | 30/10986 [01:36<7:40:05,  2.52s/it]

training loss: 3.563114881515503


training:   0%|          | 31/10986 [01:38<7:23:22,  2.43s/it]

training loss: 3.4855003356933594


training:   0%|          | 32/10986 [01:41<7:37:01,  2.50s/it]

training loss: 3.45654296875


training:   0%|          | 33/10986 [01:44<8:02:50,  2.64s/it]

training loss: 3.578385353088379


training:   0%|          | 34/10986 [01:47<8:03:58,  2.65s/it]

training loss: 3.560065269470215


training:   0%|          | 35/10986 [01:49<7:39:47,  2.52s/it]

training loss: 3.411343812942505


training:   0%|          | 36/10986 [01:52<7:51:05,  2.58s/it]

training loss: 3.40154767036438


training:   0%|          | 37/10986 [01:54<7:30:26,  2.47s/it]

training loss: 3.440624952316284


training:   0%|          | 38/10986 [01:56<7:40:57,  2.53s/it]

training loss: 3.522461414337158


training:   0%|          | 39/10986 [01:59<7:26:58,  2.45s/it]

training loss: 3.443829298019409


training:   0%|          | 40/10986 [02:02<7:45:34,  2.55s/it]

training loss: 3.5131306648254395
valid loss: 3.407301187515259
perplexity: 30.18367576599121


training:   0%|          | 41/10986 [02:06<9:41:28,  3.19s/it]

training loss: 3.4372012615203857


training:   0%|          | 42/10986 [02:08<8:44:59,  2.88s/it]

training loss: 3.5561254024505615


training:   0%|          | 43/10986 [02:11<8:35:51,  2.83s/it]

training loss: 3.4938433170318604


training:   0%|          | 44/10986 [02:13<8:01:04,  2.64s/it]

training loss: 3.683349132537842


training:   0%|          | 45/10986 [02:16<8:04:22,  2.66s/it]

training loss: 3.44643235206604


training:   0%|          | 46/10986 [02:18<7:37:16,  2.51s/it]

training loss: 3.5773043632507324


training:   0%|          | 47/10986 [02:21<7:50:45,  2.58s/it]

training loss: 3.5098724365234375


training:   0%|          | 48/10986 [02:23<7:26:52,  2.45s/it]

training loss: 3.671140193939209


training:   0%|          | 49/10986 [02:26<7:43:31,  2.54s/it]

training loss: 3.319530487060547


training:   0%|          | 50/10986 [02:28<7:24:15,  2.44s/it]

training loss: 3.6249969005584717


training:   0%|          | 51/10986 [02:31<7:35:43,  2.50s/it]

training loss: 3.5395257472991943


training:   0%|          | 52/10986 [02:33<7:15:44,  2.39s/it]

training loss: 3.5614945888519287


training:   0%|          | 53/10986 [02:35<7:32:58,  2.49s/it]

training loss: 3.451840400695801


training:   0%|          | 54/10986 [02:38<7:12:32,  2.37s/it]

training loss: 3.474358320236206


training:   1%|          | 55/10986 [02:40<7:33:35,  2.49s/it]

training loss: 3.4761781692504883


training:   1%|          | 56/10986 [02:42<7:14:18,  2.38s/it]

training loss: 3.5674519538879395


training:   1%|          | 57/10986 [02:45<7:31:44,  2.48s/it]

training loss: 3.3840131759643555


training:   1%|          | 58/10986 [02:47<7:13:07,  2.38s/it]

training loss: 3.7109804153442383


training:   1%|          | 59/10986 [02:50<7:33:10,  2.49s/it]

training loss: 3.4722137451171875


training:   1%|          | 60/10986 [02:52<7:15:12,  2.39s/it]

training loss: 3.557234525680542
valid loss: 3.4847419261932373
perplexity: 32.614009857177734


training:   1%|          | 61/10986 [02:57<9:21:47,  3.09s/it]

training loss: 3.5127813816070557


training:   1%|          | 62/10986 [03:00<8:58:05,  2.96s/it]

training loss: 3.501962661743164


training:   1%|          | 63/10986 [03:02<8:11:34,  2.70s/it]

training loss: 3.386857032775879


training:   1%|          | 64/10986 [03:04<8:13:00,  2.71s/it]

training loss: 3.5048129558563232


training:   1%|          | 65/10986 [03:07<7:41:35,  2.54s/it]

training loss: 3.3922436237335205


training:   1%|          | 66/10986 [03:09<7:49:32,  2.58s/it]

training loss: 3.4217965602874756


training:   1%|          | 67/10986 [03:12<8:13:53,  2.71s/it]

training loss: 3.4582390785217285


training:   1%|          | 68/10986 [03:15<8:31:46,  2.81s/it]

training loss: 3.4513111114501953


training:   1%|          | 69/10986 [03:17<7:53:27,  2.60s/it]

training loss: 3.425396203994751


training:   1%|          | 70/10986 [03:20<7:53:17,  2.60s/it]

training loss: 3.623584032058716


training:   1%|          | 71/10986 [03:22<7:28:02,  2.46s/it]

training loss: 3.360506772994995


training:   1%|          | 72/10986 [03:25<7:34:06,  2.50s/it]

training loss: 3.502575397491455


training:   1%|          | 73/10986 [03:27<7:14:49,  2.39s/it]

training loss: 3.624379873275757


training:   1%|          | 74/10986 [03:29<7:22:23,  2.43s/it]

training loss: 3.3867926597595215


training:   1%|          | 75/10986 [03:32<7:05:56,  2.34s/it]

training loss: 3.669727325439453


training:   1%|          | 76/10986 [03:34<7:14:49,  2.39s/it]

training loss: 3.4684901237487793


training:   1%|          | 77/10986 [03:36<7:00:09,  2.31s/it]

training loss: 3.4174718856811523


training:   1%|          | 78/10986 [03:39<7:07:58,  2.35s/it]

training loss: 3.472921133041382


training:   1%|          | 79/10986 [03:41<6:55:28,  2.29s/it]

training loss: 3.5052733421325684


training:   1%|          | 80/10986 [03:43<7:02:53,  2.33s/it]

training loss: 3.4024248123168945
valid loss: 3.3245410919189453
perplexity: 27.786243438720703


training:   1%|          | 81/10986 [03:48<8:53:56,  2.94s/it]

training loss: 3.457066059112549


training:   1%|          | 82/10986 [03:50<8:13:00,  2.71s/it]

training loss: 3.409851551055908


training:   1%|          | 83/10986 [03:52<7:56:44,  2.62s/it]

training loss: 3.3979594707489014


training:   1%|          | 84/10986 [03:54<7:29:51,  2.48s/it]

training loss: 3.460843801498413


training:   1%|          | 85/10986 [03:57<7:27:58,  2.47s/it]

training loss: 3.3861072063446045


training:   1%|          | 86/10986 [03:59<7:07:06,  2.35s/it]

training loss: 3.3736937046051025


training:   1%|          | 87/10986 [04:01<7:13:03,  2.38s/it]

training loss: 3.372326612472534


training:   1%|          | 88/10986 [04:03<6:59:36,  2.31s/it]

training loss: 3.382915735244751


training:   1%|          | 89/10986 [04:06<7:08:06,  2.36s/it]

training loss: 3.411726713180542


training:   1%|          | 90/10986 [04:08<6:54:46,  2.28s/it]

training loss: 3.414977550506592


training:   1%|          | 91/10986 [04:10<7:03:21,  2.33s/it]

training loss: 3.345186233520508


training:   1%|          | 92/10986 [04:13<6:52:07,  2.27s/it]

training loss: 3.410555362701416


training:   1%|          | 93/10986 [04:15<7:01:56,  2.32s/it]

training loss: 3.505265951156616


training:   1%|          | 94/10986 [04:17<6:51:24,  2.27s/it]

training loss: 3.393073320388794


training:   1%|          | 95/10986 [04:20<7:06:38,  2.35s/it]

training loss: 3.2457876205444336


training:   1%|          | 96/10986 [04:22<6:53:46,  2.28s/it]

training loss: 3.341066837310791


training:   1%|          | 97/10986 [04:24<7:02:37,  2.33s/it]

training loss: 3.4359138011932373


training:   1%|          | 98/10986 [04:26<6:54:00,  2.28s/it]

training loss: 3.4870944023132324


training:   1%|          | 99/10986 [04:29<7:06:03,  2.35s/it]

training loss: 3.5165579319000244


training:   1%|          | 100/10986 [04:31<6:54:02,  2.28s/it]

training loss: 3.3799750804901123
valid loss: 3.551800012588501
perplexity: 34.87603759765625


training:   1%|          | 101/10986 [04:36<9:05:10,  3.01s/it]

training loss: 3.474655866622925


training:   1%|          | 102/10986 [04:38<8:39:45,  2.87s/it]

training loss: 3.3007497787475586


training:   1%|          | 103/10986 [04:40<8:00:49,  2.65s/it]

training loss: 3.435089588165283


training:   1%|          | 104/10986 [04:43<7:49:38,  2.59s/it]

training loss: 3.340294361114502


training:   1%|          | 105/10986 [04:45<7:23:57,  2.45s/it]

training loss: 3.3677749633789062


training:   1%|          | 106/10986 [04:47<7:23:21,  2.44s/it]

training loss: 3.396387815475464


training:   1%|          | 107/10986 [04:50<7:05:07,  2.34s/it]

training loss: 3.4837558269500732


training:   1%|          | 108/10986 [04:52<7:08:49,  2.37s/it]

training loss: 3.3384900093078613


training:   1%|          | 109/10986 [04:54<6:56:17,  2.30s/it]

training loss: 3.3748912811279297


training:   1%|          | 110/10986 [04:56<7:03:04,  2.33s/it]

training loss: 3.410780191421509


training:   1%|          | 111/10986 [04:59<6:53:11,  2.28s/it]

training loss: 3.3542280197143555


training:   1%|          | 112/10986 [05:01<7:02:08,  2.33s/it]

training loss: 3.362436532974243


training:   1%|          | 113/10986 [05:03<6:50:53,  2.27s/it]

training loss: 3.369995594024658


training:   1%|          | 114/10986 [05:06<6:59:10,  2.31s/it]

training loss: 3.449810028076172


training:   1%|          | 115/10986 [05:08<6:49:35,  2.26s/it]

training loss: 3.3567464351654053


training:   1%|          | 116/10986 [05:10<6:57:32,  2.30s/it]

training loss: 3.422344446182251


training:   1%|          | 117/10986 [05:13<7:05:54,  2.35s/it]

training loss: 3.378943681716919


training:   1%|          | 118/10986 [05:16<8:16:57,  2.74s/it]

training loss: 3.4065654277801514


training:   1%|          | 119/10986 [05:18<7:45:09,  2.57s/it]

training loss: 3.4574830532073975


training:   1%|          | 120/10986 [05:21<7:42:41,  2.55s/it]

training loss: 3.3369357585906982
valid loss: 3.4411251544952393
perplexity: 31.22206687927246


training:   1%|          | 121/10986 [05:25<9:24:57,  3.12s/it]

training loss: 3.283526659011841


training:   1%|          | 122/10986 [05:28<8:34:18,  2.84s/it]

training loss: 3.4404914379119873


training:   1%|          | 123/10986 [05:30<8:14:37,  2.73s/it]

training loss: 3.3660078048706055


training:   1%|          | 124/10986 [05:32<7:41:13,  2.55s/it]

training loss: 3.4537723064422607


training:   1%|          | 125/10986 [05:35<7:35:05,  2.51s/it]

training loss: 3.4532268047332764


training:   1%|          | 126/10986 [05:37<7:14:52,  2.40s/it]

training loss: 3.309603691101074


training:   1%|          | 127/10986 [05:39<7:18:24,  2.42s/it]

training loss: 3.5095748901367188


training:   1%|          | 128/10986 [05:41<7:03:58,  2.34s/it]

training loss: 3.2931532859802246


training:   1%|          | 129/10986 [05:44<7:09:41,  2.37s/it]

training loss: 3.4482479095458984


training:   1%|          | 130/10986 [05:46<6:56:57,  2.30s/it]

training loss: 3.4793739318847656


training:   1%|          | 131/10986 [05:48<7:06:33,  2.36s/it]

training loss: 3.359806537628174


training:   1%|          | 132/10986 [05:51<6:52:20,  2.28s/it]

training loss: 3.5018038749694824


training:   1%|          | 133/10986 [05:53<7:01:59,  2.33s/it]

training loss: 3.3654537200927734


training:   1%|          | 134/10986 [05:55<6:50:52,  2.27s/it]

training loss: 3.5085301399230957


training:   1%|          | 135/10986 [05:58<6:58:46,  2.32s/it]

training loss: 3.2894222736358643


training:   1%|          | 136/10986 [06:00<7:23:30,  2.45s/it]

training loss: 3.4631762504577637


training:   1%|          | 137/10986 [06:03<8:01:37,  2.66s/it]

training loss: 3.342254161834717


training:   1%|▏         | 138/10986 [06:06<7:32:44,  2.50s/it]

training loss: 3.4164352416992188


training:   1%|▏         | 139/10986 [06:08<7:30:46,  2.49s/it]

training loss: 3.3633108139038086


training:   1%|▏         | 140/10986 [06:10<7:11:31,  2.39s/it]

training loss: 3.301290988922119
valid loss: 3.2902796268463135
perplexity: 26.850370407104492


training:   1%|▏         | 141/10986 [06:16<10:33:43,  3.51s/it]

training loss: 3.4712424278259277


training:   1%|▏         | 142/10986 [06:19<9:43:07,  3.23s/it] 

training loss: 3.447274923324585


training:   1%|▏         | 143/10986 [06:21<8:44:56,  2.90s/it]

training loss: 3.411987781524658


training:   1%|▏         | 144/10986 [06:24<8:19:54,  2.77s/it]

training loss: 3.4072065353393555


training:   1%|▏         | 145/10986 [06:26<7:45:03,  2.57s/it]

training loss: 3.4156653881073


training:   1%|▏         | 146/10986 [06:28<7:37:43,  2.53s/it]

training loss: 3.419215679168701


training:   1%|▏         | 147/10986 [06:30<7:17:13,  2.42s/it]

training loss: 3.324310541152954


training:   1%|▏         | 148/10986 [06:33<7:17:25,  2.42s/it]

training loss: 3.375544548034668


training:   1%|▏         | 149/10986 [06:35<7:00:03,  2.33s/it]

training loss: 3.359344720840454


training:   1%|▏         | 150/10986 [06:37<7:11:40,  2.39s/it]

training loss: 3.5846683979034424


training:   1%|▏         | 151/10986 [06:39<6:58:12,  2.32s/it]

training loss: 3.4295332431793213


training:   1%|▏         | 152/10986 [06:42<7:06:22,  2.36s/it]

training loss: 3.5442042350769043


training:   1%|▏         | 153/10986 [06:44<6:53:38,  2.29s/it]

training loss: 3.4184072017669678


training:   1%|▏         | 154/10986 [06:47<7:04:22,  2.35s/it]

training loss: 3.41556978225708


training:   1%|▏         | 155/10986 [06:49<6:53:46,  2.29s/it]

training loss: 3.5798048973083496


training:   1%|▏         | 156/10986 [06:51<7:04:00,  2.35s/it]

training loss: 3.3592772483825684


training:   1%|▏         | 157/10986 [06:53<6:50:02,  2.27s/it]

training loss: 3.4966623783111572


training:   1%|▏         | 158/10986 [06:56<6:58:54,  2.32s/it]

training loss: 3.5103442668914795


training:   1%|▏         | 159/10986 [06:58<6:47:52,  2.26s/it]

training loss: 3.3440091609954834


training:   1%|▏         | 160/10986 [07:00<6:57:53,  2.32s/it]

training loss: 3.39200496673584
valid loss: 3.2805471420288086
perplexity: 26.59031867980957


training:   1%|▏         | 161/10986 [07:05<8:51:36,  2.95s/it]

training loss: 3.3975794315338135


training:   1%|▏         | 162/10986 [07:07<8:09:00,  2.71s/it]

training loss: 3.3992950916290283


training:   1%|▏         | 163/10986 [07:09<7:54:45,  2.63s/it]

training loss: 3.5184011459350586


training:   1%|▏         | 164/10986 [07:11<7:28:04,  2.48s/it]

training loss: 3.3713881969451904


training:   2%|▏         | 165/10986 [07:15<8:12:18,  2.73s/it]

training loss: 3.491175413131714


training:   2%|▏         | 166/10986 [07:17<7:39:04,  2.55s/it]

training loss: 3.3548827171325684


training:   2%|▏         | 167/10986 [07:19<7:36:06,  2.53s/it]

training loss: 3.521808385848999


training:   2%|▏         | 168/10986 [07:21<7:15:23,  2.41s/it]

training loss: 3.297417640686035


training:   2%|▏         | 169/10986 [07:24<7:15:08,  2.41s/it]

training loss: 3.3722236156463623


training:   2%|▏         | 170/10986 [07:26<6:58:31,  2.32s/it]

training loss: 3.552088975906372


training:   2%|▏         | 171/10986 [07:28<7:03:11,  2.35s/it]

training loss: 3.33099365234375


training:   2%|▏         | 172/10986 [07:31<6:50:38,  2.28s/it]

training loss: 3.4313130378723145


training:   2%|▏         | 173/10986 [07:33<6:59:43,  2.33s/it]

training loss: 3.4644622802734375


training:   2%|▏         | 174/10986 [07:35<6:49:48,  2.27s/it]

training loss: 3.29203200340271


training:   2%|▏         | 175/10986 [07:38<7:03:56,  2.35s/it]

training loss: 3.3325319290161133


training:   2%|▏         | 176/10986 [07:40<6:58:05,  2.32s/it]

training loss: 3.3207170963287354


training:   2%|▏         | 177/10986 [07:43<7:29:03,  2.49s/it]

training loss: 3.380668878555298


training:   2%|▏         | 178/10986 [07:45<7:09:11,  2.38s/it]

training loss: 3.385303258895874


training:   2%|▏         | 179/10986 [07:47<7:11:41,  2.40s/it]

training loss: 3.4434661865234375


training:   2%|▏         | 180/10986 [07:49<6:57:26,  2.32s/it]

training loss: 3.3570380210876465
valid loss: 3.486851215362549
perplexity: 32.6828727722168


training:   2%|▏         | 181/10986 [07:54<8:49:24,  2.94s/it]

training loss: 3.546269416809082


training:   2%|▏         | 182/10986 [07:56<8:26:47,  2.81s/it]

training loss: 3.4247779846191406


training:   2%|▏         | 183/10986 [07:59<7:50:04,  2.61s/it]

training loss: 3.4521324634552


training:   2%|▏         | 184/10986 [08:01<7:40:59,  2.56s/it]

training loss: 3.3980984687805176


training:   2%|▏         | 185/10986 [08:03<7:17:55,  2.43s/it]

training loss: 3.332761526107788


training:   2%|▏         | 186/10986 [08:06<7:19:07,  2.44s/it]

training loss: 3.4260449409484863


training:   2%|▏         | 187/10986 [08:08<7:02:36,  2.35s/it]

training loss: 3.3751070499420166


training:   2%|▏         | 188/10986 [08:10<7:08:02,  2.38s/it]

training loss: 3.4259321689605713


training:   2%|▏         | 189/10986 [08:12<7:03:39,  2.35s/it]

training loss: 3.3122849464416504


training:   2%|▏         | 190/10986 [08:16<7:57:33,  2.65s/it]

training loss: 3.3337769508361816


training:   2%|▏         | 191/10986 [08:18<7:29:16,  2.50s/it]

training loss: 3.3959877490997314


training:   2%|▏         | 192/10986 [08:20<7:26:16,  2.48s/it]

training loss: 3.2952568531036377


training:   2%|▏         | 193/10986 [08:22<7:06:28,  2.37s/it]

training loss: 3.2873196601867676


training:   2%|▏         | 194/10986 [08:25<7:09:38,  2.39s/it]

training loss: 3.376723527908325


training:   2%|▏         | 195/10986 [08:27<6:54:47,  2.31s/it]

training loss: 3.4430041313171387


training:   2%|▏         | 196/10986 [08:29<6:59:29,  2.33s/it]

training loss: 3.4679555892944336


training:   2%|▏         | 197/10986 [08:32<6:48:46,  2.27s/it]

training loss: 3.484804630279541


training:   2%|▏         | 198/10986 [08:34<6:57:55,  2.32s/it]

training loss: 3.3813681602478027


training:   2%|▏         | 199/10986 [08:36<6:45:40,  2.26s/it]

training loss: 3.61553955078125


training:   2%|▏         | 200/10986 [08:39<6:55:57,  2.31s/it]

training loss: 3.380452871322632
valid loss: 3.4514009952545166
perplexity: 31.5445556640625


training:   2%|▏         | 201/10986 [08:43<9:03:48,  3.03s/it]

training loss: 3.444988250732422


training:   2%|▏         | 202/10986 [08:45<8:15:52,  2.76s/it]

training loss: 3.5337774753570557


training:   2%|▏         | 203/10986 [08:49<8:36:17,  2.87s/it]

training loss: 3.429272413253784


training:   2%|▏         | 204/10986 [08:51<8:25:09,  2.81s/it]

training loss: 3.3644144535064697


training:   2%|▏         | 205/10986 [08:54<8:07:27,  2.71s/it]

training loss: 3.3719193935394287


training:   2%|▏         | 206/10986 [08:56<7:33:43,  2.53s/it]

training loss: 3.4681026935577393


training:   2%|▏         | 207/10986 [08:58<7:28:14,  2.50s/it]

training loss: 3.390671968460083


training:   2%|▏         | 208/10986 [09:00<7:07:41,  2.38s/it]

training loss: 3.392392158508301


training:   2%|▏         | 209/10986 [09:03<7:09:38,  2.39s/it]

training loss: 3.3493449687957764


training:   2%|▏         | 210/10986 [09:05<6:56:36,  2.32s/it]

training loss: 3.308044910430908


training:   2%|▏         | 211/10986 [09:07<7:07:25,  2.38s/it]

training loss: 3.3514251708984375


training:   2%|▏         | 212/10986 [09:10<6:54:58,  2.31s/it]

training loss: 3.3865182399749756


training:   2%|▏         | 213/10986 [09:12<7:20:36,  2.45s/it]

training loss: 3.3971822261810303


training:   2%|▏         | 214/10986 [09:15<7:23:18,  2.47s/it]

training loss: 3.442997455596924


training:   2%|▏         | 215/10986 [09:17<7:23:06,  2.47s/it]

training loss: 3.4523208141326904


training:   2%|▏         | 216/10986 [09:19<7:06:09,  2.37s/it]

training loss: 3.3916873931884766


training:   2%|▏         | 217/10986 [09:22<7:10:07,  2.40s/it]

training loss: 3.4091219902038574


training:   2%|▏         | 218/10986 [09:24<6:54:49,  2.31s/it]

training loss: 3.3520209789276123


training:   2%|▏         | 219/10986 [09:26<7:01:58,  2.35s/it]

training loss: 3.349928617477417


training:   2%|▏         | 220/10986 [09:29<6:49:00,  2.28s/it]

training loss: 3.358532428741455
valid loss: 3.4540867805480957
perplexity: 31.629392623901367


training:   2%|▏         | 221/10986 [09:33<8:47:04,  2.94s/it]

training loss: 3.412900686264038


training:   2%|▏         | 222/10986 [09:36<8:23:18,  2.81s/it]

training loss: 3.3707942962646484


training:   2%|▏         | 223/10986 [09:38<7:45:32,  2.60s/it]

training loss: 3.307438373565674


training:   2%|▏         | 224/10986 [09:40<7:38:22,  2.56s/it]

training loss: 3.2603249549865723


training:   2%|▏         | 225/10986 [09:42<7:17:21,  2.44s/it]

training loss: 3.397831916809082


training:   2%|▏         | 226/10986 [09:45<7:18:44,  2.45s/it]

training loss: 3.386843681335449


training:   2%|▏         | 227/10986 [09:47<7:02:11,  2.35s/it]

training loss: 3.3360164165496826


training:   2%|▏         | 228/10986 [09:49<7:08:24,  2.39s/it]

training loss: 3.3427541255950928


training:   2%|▏         | 229/10986 [09:51<6:54:40,  2.31s/it]

training loss: 3.5810980796813965


training:   2%|▏         | 230/10986 [09:54<7:04:15,  2.37s/it]

training loss: 3.3436436653137207


training:   2%|▏         | 231/10986 [09:56<6:53:10,  2.31s/it]

training loss: 3.37412166595459


training:   2%|▏         | 232/10986 [09:59<7:01:15,  2.35s/it]

training loss: 3.288571834564209


training:   2%|▏         | 233/10986 [10:01<6:50:20,  2.29s/it]

training loss: 3.3168888092041016


training:   2%|▏         | 234/10986 [10:03<7:00:26,  2.35s/it]

training loss: 3.36883282661438


training:   2%|▏         | 235/10986 [10:05<6:49:23,  2.28s/it]

training loss: 3.3717117309570312


training:   2%|▏         | 236/10986 [10:08<6:57:42,  2.33s/it]

training loss: 3.331984519958496


training:   2%|▏         | 237/10986 [10:10<6:45:32,  2.26s/it]

training loss: 3.2723805904388428


training:   2%|▏         | 238/10986 [10:13<7:14:06,  2.42s/it]

training loss: 3.414414644241333


training:   2%|▏         | 239/10986 [10:15<7:28:57,  2.51s/it]

training loss: 3.350464344024658


training:   2%|▏         | 240/10986 [10:18<7:24:25,  2.48s/it]

training loss: 3.3871614933013916
valid loss: 3.3839526176452637
perplexity: 29.487092971801758


training:   2%|▏         | 241/10986 [10:22<9:04:42,  3.04s/it]

training loss: 3.275193691253662


training:   2%|▏         | 242/10986 [10:24<8:16:25,  2.77s/it]

training loss: 3.3136472702026367


training:   2%|▏         | 243/10986 [10:27<7:59:02,  2.68s/it]

training loss: 3.3508663177490234


training:   2%|▏         | 244/10986 [10:29<7:30:39,  2.52s/it]

training loss: 3.535093069076538


training:   2%|▏         | 245/10986 [10:31<7:31:25,  2.52s/it]

training loss: 3.3443384170532227


training:   2%|▏         | 246/10986 [10:34<7:12:07,  2.41s/it]

training loss: 3.396237373352051


training:   2%|▏         | 247/10986 [10:36<7:19:50,  2.46s/it]

training loss: 3.4096081256866455


training:   2%|▏         | 248/10986 [10:38<7:03:09,  2.36s/it]

training loss: 3.453955888748169


training:   2%|▏         | 249/10986 [10:41<7:13:07,  2.42s/it]

training loss: 3.467348575592041


training:   2%|▏         | 250/10986 [10:43<7:01:00,  2.35s/it]

training loss: 3.3274595737457275


training:   2%|▏         | 251/10986 [10:46<7:10:36,  2.41s/it]

training loss: 3.4976253509521484


training:   2%|▏         | 252/10986 [10:48<6:58:13,  2.34s/it]

training loss: 3.4057273864746094


training:   2%|▏         | 253/10986 [10:50<7:09:40,  2.40s/it]

training loss: 3.336832046508789


training:   2%|▏         | 254/10986 [10:52<6:56:51,  2.33s/it]

training loss: 3.377087354660034


training:   2%|▏         | 255/10986 [10:55<7:11:25,  2.41s/it]

training loss: 3.4927921295166016


training:   2%|▏         | 256/10986 [10:57<6:58:35,  2.34s/it]

training loss: 3.4333138465881348


training:   2%|▏         | 257/10986 [11:00<7:06:42,  2.39s/it]

training loss: 3.343111276626587


training:   2%|▏         | 258/10986 [11:02<6:55:44,  2.33s/it]

training loss: 3.4235095977783203


training:   2%|▏         | 259/10986 [11:04<7:06:37,  2.39s/it]

training loss: 3.274869203567505


training:   2%|▏         | 260/10986 [11:07<6:51:43,  2.30s/it]

training loss: 3.4171860218048096
valid loss: 3.4095101356506348
perplexity: 30.25042152404785


training:   2%|▏         | 261/10986 [11:11<8:48:28,  2.96s/it]

training loss: 3.318885087966919


training:   2%|▏         | 262/10986 [11:14<9:00:31,  3.02s/it]

training loss: 3.3743646144866943


training:   2%|▏         | 263/10986 [11:16<8:12:14,  2.75s/it]

training loss: 3.3239076137542725


training:   2%|▏         | 264/10986 [11:19<7:57:07,  2.67s/it]

training loss: 3.3251824378967285


training:   2%|▏         | 265/10986 [11:21<7:26:25,  2.50s/it]

training loss: 3.3195416927337646


training:   2%|▏         | 266/10986 [11:23<7:23:26,  2.48s/it]

training loss: 3.3955090045928955


training:   2%|▏         | 267/10986 [11:26<7:05:12,  2.38s/it]

training loss: 3.5118134021759033


training:   2%|▏         | 268/10986 [11:28<7:11:02,  2.41s/it]

training loss: 3.2914376258850098


training:   2%|▏         | 269/10986 [11:30<6:54:39,  2.32s/it]

training loss: 3.2817153930664062


training:   2%|▏         | 270/10986 [11:33<7:02:14,  2.36s/it]

training loss: 3.3187108039855957


training:   2%|▏         | 271/10986 [11:35<6:51:04,  2.30s/it]

training loss: 3.4503440856933594


training:   2%|▏         | 272/10986 [11:38<7:44:45,  2.60s/it]

training loss: 3.3752646446228027


training:   2%|▏         | 273/10986 [11:40<7:32:28,  2.53s/it]

training loss: 3.4662580490112305


training:   2%|▏         | 274/10986 [11:43<7:31:56,  2.53s/it]

training loss: 3.4795827865600586


training:   3%|▎         | 275/10986 [11:45<7:10:52,  2.41s/it]

training loss: 3.327941656112671


training:   3%|▎         | 276/10986 [11:48<7:15:16,  2.44s/it]

training loss: 3.299503803253174


training:   3%|▎         | 277/10986 [11:50<6:58:23,  2.34s/it]

training loss: 3.4310145378112793


training:   3%|▎         | 278/10986 [11:52<7:04:13,  2.38s/it]

training loss: 3.347283363342285


training:   3%|▎         | 279/10986 [11:54<6:51:30,  2.31s/it]

training loss: 3.4522628784179688


training:   3%|▎         | 280/10986 [11:57<6:59:04,  2.35s/it]

training loss: 3.3971948623657227
valid loss: 3.516126871109009
perplexity: 33.65382766723633


training:   3%|▎         | 281/10986 [12:01<8:53:05,  2.99s/it]

training loss: 3.3460240364074707


training:   3%|▎         | 282/10986 [12:03<8:12:18,  2.76s/it]

training loss: 3.344149589538574


training:   3%|▎         | 283/10986 [12:06<7:57:20,  2.68s/it]

training loss: 3.3736071586608887


training:   3%|▎         | 284/10986 [12:08<7:29:02,  2.52s/it]

training loss: 3.4961419105529785


training:   3%|▎         | 285/10986 [12:11<7:28:14,  2.51s/it]

training loss: 3.3347458839416504


training:   3%|▎         | 286/10986 [12:13<7:44:56,  2.61s/it]

training loss: 3.3458425998687744


training:   3%|▎         | 287/10986 [12:16<8:05:36,  2.72s/it]

training loss: 3.456371307373047


training:   3%|▎         | 288/10986 [12:19<7:36:18,  2.56s/it]

training loss: 3.3550047874450684


training:   3%|▎         | 289/10986 [12:21<7:33:03,  2.54s/it]

training loss: 3.37756609916687


training:   3%|▎         | 290/10986 [12:23<7:10:56,  2.42s/it]

training loss: 3.4175937175750732


training:   3%|▎         | 291/10986 [12:26<7:14:06,  2.44s/it]

training loss: 3.4615015983581543


training:   3%|▎         | 292/10986 [12:28<6:57:08,  2.34s/it]

training loss: 3.4194979667663574


training:   3%|▎         | 293/10986 [12:30<7:02:01,  2.37s/it]

training loss: 3.4826438426971436


training:   3%|▎         | 294/10986 [12:32<6:48:32,  2.29s/it]

training loss: 3.567795991897583


training:   3%|▎         | 295/10986 [12:35<6:58:05,  2.35s/it]

training loss: 3.393167018890381


training:   3%|▎         | 296/10986 [12:37<6:45:56,  2.28s/it]

training loss: 3.5048534870147705


training:   3%|▎         | 297/10986 [12:39<6:55:31,  2.33s/it]

training loss: 3.403369903564453


training:   3%|▎         | 298/10986 [12:42<6:46:21,  2.28s/it]

training loss: 3.5281834602355957


training:   3%|▎         | 299/10986 [12:44<6:57:02,  2.34s/it]

training loss: 3.3141636848449707


training:   3%|▎         | 300/10986 [12:46<6:46:09,  2.28s/it]

training loss: 3.4108426570892334
valid loss: 3.5808768272399902
perplexity: 35.90501022338867


training:   3%|▎         | 301/10986 [12:51<8:57:34,  3.02s/it]

training loss: 3.368443489074707


training:   3%|▎         | 302/10986 [12:53<8:31:12,  2.87s/it]

training loss: 3.4658665657043457


training:   3%|▎         | 303/10986 [12:56<7:52:53,  2.66s/it]

training loss: 3.3480606079101562


training:   3%|▎         | 304/10986 [12:58<7:41:47,  2.59s/it]

training loss: 3.444871187210083


training:   3%|▎         | 305/10986 [13:00<7:14:41,  2.44s/it]

training loss: 3.405378580093384


training:   3%|▎         | 306/10986 [13:03<7:13:58,  2.44s/it]

training loss: 3.3553783893585205


training:   3%|▎         | 307/10986 [13:05<6:57:09,  2.34s/it]

training loss: 3.418992757797241


training:   3%|▎         | 308/10986 [13:07<7:05:37,  2.39s/it]

training loss: 3.5298900604248047


training:   3%|▎         | 309/10986 [13:09<6:51:46,  2.31s/it]

training loss: 3.3924713134765625


training:   3%|▎         | 310/10986 [13:12<7:06:46,  2.40s/it]

training loss: 3.4053468704223633


training:   3%|▎         | 311/10986 [13:15<7:19:24,  2.47s/it]

training loss: 3.3551371097564697


training:   3%|▎         | 312/10986 [13:17<7:23:04,  2.49s/it]

training loss: 3.411712408065796


training:   3%|▎         | 313/10986 [13:19<7:03:28,  2.38s/it]

training loss: 3.490002393722534


training:   3%|▎         | 314/10986 [13:22<7:10:35,  2.42s/it]

training loss: 3.340787887573242


training:   3%|▎         | 315/10986 [13:24<6:54:16,  2.33s/it]

training loss: 3.323364496231079


training:   3%|▎         | 316/10986 [13:26<7:01:14,  2.37s/it]

training loss: 3.3508694171905518


training:   3%|▎         | 317/10986 [13:28<6:47:02,  2.29s/it]

training loss: 3.4896159172058105


training:   3%|▎         | 318/10986 [13:31<6:55:39,  2.34s/it]

training loss: 3.37139892578125


training:   3%|▎         | 319/10986 [13:33<6:45:12,  2.28s/it]

training loss: 3.4809794425964355


training:   3%|▎         | 320/10986 [13:35<6:52:08,  2.32s/it]

training loss: 3.356628656387329
valid loss: 3.466200828552246
perplexity: 32.0148811340332


training:   3%|▎         | 321/10986 [13:40<8:39:51,  2.92s/it]

training loss: 3.4487273693084717


training:   3%|▎         | 322/10986 [13:42<7:58:20,  2.69s/it]

training loss: 3.428171396255493


training:   3%|▎         | 323/10986 [13:44<7:42:09,  2.60s/it]

training loss: 3.5721588134765625


training:   3%|▎         | 324/10986 [13:46<7:16:08,  2.45s/it]

training loss: 3.437046766281128


training:   3%|▎         | 325/10986 [13:49<7:16:19,  2.46s/it]

training loss: 3.3673808574676514


training:   3%|▎         | 326/10986 [13:51<6:57:32,  2.35s/it]

training loss: 3.3616182804107666


training:   3%|▎         | 327/10986 [13:53<7:04:50,  2.39s/it]

training loss: 3.38688588142395


training:   3%|▎         | 328/10986 [13:56<6:52:22,  2.32s/it]

training loss: 3.334791421890259


training:   3%|▎         | 329/10986 [13:58<6:58:16,  2.35s/it]

training loss: 3.2862603664398193


training:   3%|▎         | 330/10986 [14:00<6:44:58,  2.28s/it]

training loss: 3.5291895866394043


training:   3%|▎         | 331/10986 [14:03<6:54:06,  2.33s/it]

training loss: 3.4567694664001465


training:   3%|▎         | 332/10986 [14:05<6:41:20,  2.26s/it]

training loss: 3.2685489654541016


training:   3%|▎         | 333/10986 [14:07<6:50:22,  2.31s/it]

training loss: 3.370820999145508


training:   3%|▎         | 334/10986 [14:09<6:38:20,  2.24s/it]

training loss: 3.452652931213379


training:   3%|▎         | 335/10986 [14:12<6:49:10,  2.30s/it]

training loss: 3.3046886920928955


training:   3%|▎         | 336/10986 [14:15<7:30:32,  2.54s/it]

training loss: 3.421638011932373


training:   3%|▎         | 337/10986 [14:17<7:25:06,  2.51s/it]

training loss: 3.4358599185943604


training:   3%|▎         | 338/10986 [14:19<7:03:02,  2.38s/it]

training loss: 3.4395060539245605


training:   3%|▎         | 339/10986 [14:22<7:05:54,  2.40s/it]

training loss: 3.3784990310668945


training:   3%|▎         | 340/10986 [14:24<6:52:17,  2.32s/it]

training loss: 3.3340604305267334
valid loss: 3.3327512741088867
perplexity: 28.01531410217285


training:   3%|▎         | 341/10986 [14:29<9:47:08,  3.31s/it]

training loss: 3.384282112121582


training:   3%|▎         | 342/10986 [14:32<9:02:45,  3.06s/it]

training loss: 3.324615955352783


training:   3%|▎         | 343/10986 [14:34<8:13:31,  2.78s/it]

training loss: 3.357595682144165


training:   3%|▎         | 344/10986 [14:37<7:53:58,  2.67s/it]

training loss: 3.3399367332458496


training:   3%|▎         | 345/10986 [14:39<7:23:54,  2.50s/it]

training loss: 3.3714776039123535


training:   3%|▎         | 346/10986 [14:41<7:19:40,  2.48s/it]

training loss: 3.242638349533081


training:   3%|▎         | 347/10986 [14:43<6:58:58,  2.36s/it]

training loss: 3.2951877117156982


training:   3%|▎         | 348/10986 [14:46<7:03:36,  2.39s/it]

training loss: 3.335460901260376


training:   3%|▎         | 349/10986 [14:48<6:49:12,  2.31s/it]

training loss: 3.330359935760498


training:   3%|▎         | 350/10986 [14:50<6:54:35,  2.34s/it]

training loss: 3.2807652950286865


training:   3%|▎         | 351/10986 [14:52<6:40:45,  2.26s/it]

training loss: 3.427454948425293


training:   3%|▎         | 352/10986 [14:55<6:52:34,  2.33s/it]

training loss: 3.293405771255493


training:   3%|▎         | 353/10986 [14:57<6:41:45,  2.27s/it]

training loss: 3.3063507080078125


training:   3%|▎         | 354/10986 [14:59<6:51:09,  2.32s/it]

training loss: 3.3052213191986084


training:   3%|▎         | 355/10986 [15:01<6:40:05,  2.26s/it]

training loss: 3.388105630874634


training:   3%|▎         | 356/10986 [15:04<6:47:53,  2.30s/it]

training loss: 3.4644815921783447


training:   3%|▎         | 357/10986 [15:06<6:38:03,  2.25s/it]

training loss: 3.379546880722046


training:   3%|▎         | 358/10986 [15:08<6:47:40,  2.30s/it]

training loss: 3.4254379272460938


training:   3%|▎         | 359/10986 [15:10<6:37:47,  2.25s/it]

training loss: 3.3120665550231934


training:   3%|▎         | 360/10986 [15:13<7:15:09,  2.46s/it]

training loss: 3.35245680809021
valid loss: 3.2558279037475586
perplexity: 25.941082000732422


training:   3%|▎         | 361/10986 [15:18<8:57:09,  3.03s/it]

training loss: 3.2916197776794434


training:   3%|▎         | 362/10986 [15:20<8:09:58,  2.77s/it]

training loss: 3.3716726303100586


training:   3%|▎         | 363/10986 [15:22<7:51:28,  2.66s/it]

training loss: 3.3630013465881348


training:   3%|▎         | 364/10986 [15:24<7:22:57,  2.50s/it]

training loss: 3.615910053253174


training:   3%|▎         | 365/10986 [15:27<7:20:49,  2.49s/it]

training loss: 3.375600814819336


training:   3%|▎         | 366/10986 [15:29<6:59:55,  2.37s/it]

training loss: 3.368778705596924


training:   3%|▎         | 367/10986 [15:31<7:03:34,  2.39s/it]

training loss: 3.3922343254089355


training:   3%|▎         | 368/10986 [15:34<6:49:19,  2.31s/it]

training loss: 3.3413476943969727


training:   3%|▎         | 369/10986 [15:36<6:56:40,  2.35s/it]

training loss: 3.4734108448028564


training:   3%|▎         | 370/10986 [15:38<6:42:48,  2.28s/it]

training loss: 3.4208383560180664


training:   3%|▎         | 371/10986 [15:41<6:50:17,  2.32s/it]

training loss: 3.429687023162842


training:   3%|▎         | 372/10986 [15:43<6:40:12,  2.26s/it]

training loss: 3.424377202987671


training:   3%|▎         | 373/10986 [15:45<6:54:18,  2.34s/it]

training loss: 3.344327688217163


training:   3%|▎         | 374/10986 [15:47<6:41:55,  2.27s/it]

training loss: 3.3210368156433105


training:   3%|▎         | 375/10986 [15:50<6:52:29,  2.33s/it]

training loss: 3.372666597366333


training:   3%|▎         | 376/10986 [15:52<6:40:29,  2.26s/it]

training loss: 3.5321197509765625


training:   3%|▎         | 377/10986 [15:54<6:49:25,  2.32s/it]

training loss: 3.336339235305786


training:   3%|▎         | 378/10986 [15:56<6:39:44,  2.26s/it]

training loss: 3.3114941120147705


training:   3%|▎         | 379/10986 [15:59<6:49:10,  2.31s/it]

training loss: 3.412015438079834


training:   3%|▎         | 380/10986 [16:01<6:38:07,  2.25s/it]

training loss: 3.369478702545166
valid loss: 3.412127733230591
perplexity: 30.329710006713867


training:   3%|▎         | 381/10986 [16:05<8:32:41,  2.90s/it]

training loss: 3.376342535018921


training:   3%|▎         | 382/10986 [16:08<8:11:28,  2.78s/it]

training loss: 3.4430158138275146


training:   3%|▎         | 383/10986 [16:10<7:36:46,  2.58s/it]

training loss: 3.5147011280059814


training:   3%|▎         | 384/10986 [16:13<7:55:36,  2.69s/it]

training loss: 3.470041275024414


training:   4%|▎         | 385/10986 [16:15<7:30:49,  2.55s/it]

training loss: 3.451962947845459


training:   4%|▎         | 386/10986 [16:18<7:24:29,  2.52s/it]

training loss: 3.323967695236206


training:   4%|▎         | 387/10986 [16:20<7:04:21,  2.40s/it]

training loss: 3.3761825561523438


training:   4%|▎         | 388/10986 [16:22<7:06:48,  2.42s/it]

training loss: 3.470252275466919


training:   4%|▎         | 389/10986 [16:24<6:51:38,  2.33s/it]

training loss: 3.3525550365448


training:   4%|▎         | 390/10986 [16:27<6:59:52,  2.38s/it]

training loss: 3.3241589069366455


training:   4%|▎         | 391/10986 [16:29<6:47:26,  2.31s/it]

training loss: 3.4174447059631348


training:   4%|▎         | 392/10986 [16:31<6:54:10,  2.35s/it]

training loss: 3.45278263092041


training:   4%|▎         | 393/10986 [16:34<6:42:11,  2.28s/it]

training loss: 3.345853328704834


training:   4%|▎         | 394/10986 [16:36<6:50:31,  2.33s/it]

training loss: 3.3429551124572754


training:   4%|▎         | 395/10986 [16:38<6:40:00,  2.27s/it]

training loss: 3.4201576709747314


training:   4%|▎         | 396/10986 [16:41<6:50:37,  2.33s/it]

training loss: 3.3821027278900146


training:   4%|▎         | 397/10986 [16:43<6:41:06,  2.27s/it]

training loss: 3.4507250785827637


training:   4%|▎         | 398/10986 [16:45<6:50:25,  2.33s/it]

training loss: 3.30621337890625


training:   4%|▎         | 399/10986 [16:47<6:40:54,  2.27s/it]

training loss: 3.383885383605957


training:   4%|▎         | 400/10986 [16:50<6:50:42,  2.33s/it]

training loss: 3.416921377182007
valid loss: 3.361132860183716
perplexity: 28.821823120117188


training:   4%|▎         | 401/10986 [16:54<8:54:56,  3.03s/it]

training loss: 3.4239816665649414


training:   4%|▎         | 402/10986 [16:57<8:09:25,  2.77s/it]

training loss: 3.4430227279663086


training:   4%|▎         | 403/10986 [16:59<7:53:00,  2.68s/it]

training loss: 3.4087045192718506


training:   4%|▎         | 404/10986 [17:01<7:23:06,  2.51s/it]

training loss: 3.26790189743042


training:   4%|▎         | 405/10986 [17:04<7:19:54,  2.49s/it]

training loss: 3.326967239379883


training:   4%|▎         | 406/10986 [17:06<6:58:20,  2.37s/it]

training loss: 3.375230312347412


training:   4%|▎         | 407/10986 [17:08<7:00:45,  2.39s/it]

training loss: 3.432173252105713


training:   4%|▎         | 408/10986 [17:10<6:45:52,  2.30s/it]

training loss: 3.371022939682007


training:   4%|▎         | 409/10986 [17:13<7:31:11,  2.56s/it]

training loss: 3.3395724296569824


training:   4%|▎         | 410/10986 [17:16<7:13:38,  2.46s/it]

training loss: 3.541386127471924


training:   4%|▎         | 411/10986 [17:19<8:12:41,  2.80s/it]

training loss: 3.4033262729644775


training:   4%|▍         | 412/10986 [17:21<7:42:35,  2.62s/it]

training loss: 3.4001870155334473


training:   4%|▍         | 413/10986 [17:24<7:32:22,  2.57s/it]

training loss: 3.3528852462768555


training:   4%|▍         | 414/10986 [17:26<7:10:27,  2.44s/it]

training loss: 3.344435691833496


training:   4%|▍         | 415/10986 [17:29<7:11:23,  2.45s/it]

training loss: 3.3751535415649414


training:   4%|▍         | 416/10986 [17:31<6:55:15,  2.36s/it]

training loss: 3.4397544860839844


training:   4%|▍         | 417/10986 [17:33<6:59:53,  2.38s/it]

training loss: 3.367215156555176


training:   4%|▍         | 418/10986 [17:35<6:45:43,  2.30s/it]

training loss: 3.267009735107422


training:   4%|▍         | 419/10986 [17:38<6:51:52,  2.34s/it]

training loss: 3.4337575435638428


training:   4%|▍         | 420/10986 [17:40<6:39:07,  2.27s/it]

training loss: 3.443696975708008
valid loss: 3.3027453422546387
perplexity: 27.187175750732422


training:   4%|▍         | 421/10986 [17:44<8:32:36,  2.91s/it]

training loss: 3.303077459335327


training:   4%|▍         | 422/10986 [17:47<8:09:33,  2.78s/it]

training loss: 3.5573079586029053


training:   4%|▍         | 423/10986 [17:49<7:36:12,  2.59s/it]

training loss: 3.4262871742248535


training:   4%|▍         | 424/10986 [17:51<7:27:56,  2.54s/it]

training loss: 3.417149782180786


training:   4%|▍         | 425/10986 [17:53<7:05:13,  2.42s/it]

training loss: 3.3903419971466064


training:   4%|▍         | 426/10986 [17:56<7:06:59,  2.43s/it]

training loss: 3.361575126647949


training:   4%|▍         | 427/10986 [17:58<6:49:42,  2.33s/it]

training loss: 3.3671326637268066


training:   4%|▍         | 428/10986 [18:00<6:56:52,  2.37s/it]

training loss: 3.443739414215088


training:   4%|▍         | 429/10986 [18:02<6:43:35,  2.29s/it]

training loss: 3.3619463443756104


training:   4%|▍         | 430/10986 [18:05<6:51:52,  2.34s/it]

training loss: 3.401232957839966


training:   4%|▍         | 431/10986 [18:07<6:39:55,  2.27s/it]

training loss: 3.3192577362060547


training:   4%|▍         | 432/10986 [18:09<6:48:45,  2.32s/it]

training loss: 3.421520709991455


training:   4%|▍         | 433/10986 [18:12<6:39:22,  2.27s/it]

training loss: 3.391580104827881


training:   4%|▍         | 434/10986 [18:14<7:10:26,  2.45s/it]

training loss: 3.355780601501465


training:   4%|▍         | 435/10986 [18:17<6:52:08,  2.34s/it]

training loss: 3.324096202850342


training:   4%|▍         | 436/10986 [18:19<6:56:58,  2.37s/it]

training loss: 3.36427903175354


training:   4%|▍         | 437/10986 [18:21<6:43:32,  2.30s/it]

training loss: 3.4854676723480225


training:   4%|▍         | 438/10986 [18:24<6:50:36,  2.34s/it]

training loss: 3.228222131729126


training:   4%|▍         | 439/10986 [18:26<6:38:23,  2.27s/it]

training loss: 3.4063899517059326


training:   4%|▍         | 440/10986 [18:28<6:49:17,  2.33s/it]

training loss: 3.2451653480529785
valid loss: 3.3991432189941406
perplexity: 29.938438415527344


training:   4%|▍         | 441/10986 [18:33<8:36:38,  2.94s/it]

training loss: 3.3491013050079346


training:   4%|▍         | 442/10986 [18:35<7:55:12,  2.70s/it]

training loss: 3.46450138092041


training:   4%|▍         | 443/10986 [18:37<7:40:38,  2.62s/it]

training loss: 3.4710915088653564


training:   4%|▍         | 444/10986 [18:39<7:12:25,  2.46s/it]

training loss: 3.305370569229126


training:   4%|▍         | 445/10986 [18:42<7:09:37,  2.45s/it]

training loss: 3.2911922931671143


training:   4%|▍         | 446/10986 [18:44<6:52:40,  2.35s/it]

training loss: 3.4240643978118896


training:   4%|▍         | 447/10986 [18:46<6:56:40,  2.37s/it]

training loss: 3.330453395843506


training:   4%|▍         | 448/10986 [18:48<6:43:20,  2.30s/it]

training loss: 3.454399824142456


training:   4%|▍         | 449/10986 [18:51<6:54:59,  2.36s/it]

training loss: 3.318758964538574


training:   4%|▍         | 450/10986 [18:53<6:42:59,  2.29s/it]

training loss: 3.4379727840423584


training:   4%|▍         | 451/10986 [18:55<6:52:23,  2.35s/it]

training loss: 3.460925817489624


training:   4%|▍         | 452/10986 [18:58<6:40:08,  2.28s/it]

training loss: 3.2853479385375977


training:   4%|▍         | 453/10986 [19:01<7:23:41,  2.53s/it]

training loss: 3.3892979621887207


training:   4%|▍         | 454/10986 [19:03<7:36:05,  2.60s/it]

training loss: 3.3559019565582275


training:   4%|▍         | 455/10986 [19:06<7:29:07,  2.56s/it]

training loss: 3.4142143726348877


training:   4%|▍         | 456/10986 [19:08<7:08:48,  2.44s/it]

training loss: 3.342834234237671


training:   4%|▍         | 457/10986 [19:11<7:11:13,  2.46s/it]

training loss: 3.4485812187194824


training:   4%|▍         | 458/10986 [19:13<7:18:49,  2.50s/it]

training loss: 3.4194717407226562


training:   4%|▍         | 459/10986 [19:16<8:03:34,  2.76s/it]

training loss: 3.30525541305542


training:   4%|▍         | 460/10986 [19:19<7:30:59,  2.57s/it]

training loss: 3.342672824859619
valid loss: 3.3903470039367676
perplexity: 29.67624855041504


training:   4%|▍         | 461/10986 [19:23<9:07:41,  3.12s/it]

training loss: 3.419498920440674


training:   4%|▍         | 462/10986 [19:26<8:36:29,  2.94s/it]

training loss: 3.4587743282318115


training:   4%|▍         | 463/10986 [19:28<7:52:44,  2.70s/it]

training loss: 3.3284060955047607


training:   4%|▍         | 464/10986 [19:30<7:39:46,  2.62s/it]

training loss: 3.4134278297424316


training:   4%|▍         | 465/10986 [19:32<7:12:34,  2.47s/it]

training loss: 3.4566004276275635


training:   4%|▍         | 466/10986 [19:35<7:11:08,  2.46s/it]

training loss: 3.309882640838623


training:   4%|▍         | 467/10986 [19:37<6:52:18,  2.35s/it]

training loss: 3.352644920349121


training:   4%|▍         | 468/10986 [19:39<6:54:37,  2.37s/it]

training loss: 3.3617992401123047


training:   4%|▍         | 469/10986 [19:41<6:40:41,  2.29s/it]

training loss: 3.4130935668945312


training:   4%|▍         | 470/10986 [19:44<6:49:14,  2.33s/it]

training loss: 3.4108829498291016


training:   4%|▍         | 471/10986 [19:46<6:35:40,  2.26s/it]

training loss: 3.3075039386749268


training:   4%|▍         | 472/10986 [19:48<6:45:18,  2.31s/it]

training loss: 3.4427475929260254


training:   4%|▍         | 473/10986 [19:50<6:36:11,  2.26s/it]

training loss: 3.527923345565796


training:   4%|▍         | 474/10986 [19:53<6:47:01,  2.32s/it]

training loss: 3.482351541519165


training:   4%|▍         | 475/10986 [19:55<6:38:00,  2.27s/it]

training loss: 3.3512094020843506


training:   4%|▍         | 476/10986 [19:57<6:48:54,  2.33s/it]

training loss: 3.3336031436920166


training:   4%|▍         | 477/10986 [20:00<6:37:34,  2.27s/it]

training loss: 3.399188995361328


training:   4%|▍         | 478/10986 [20:02<6:48:20,  2.33s/it]

training loss: 3.4072961807250977


training:   4%|▍         | 479/10986 [20:04<6:50:39,  2.35s/it]

training loss: 3.3241255283355713


training:   4%|▍         | 480/10986 [20:08<7:54:09,  2.71s/it]

training loss: 3.3790345191955566
valid loss: 3.285900115966797
perplexity: 26.733036041259766


training:   4%|▍         | 481/10986 [20:13<9:50:19,  3.37s/it]

training loss: 3.336258888244629


training:   4%|▍         | 482/10986 [20:16<9:41:39,  3.32s/it]

training loss: 3.399623394012451


training:   4%|▍         | 483/10986 [20:19<8:55:46,  3.06s/it]

training loss: 3.359271287918091


training:   4%|▍         | 484/10986 [20:21<8:06:10,  2.78s/it]

training loss: 3.4799540042877197


training:   4%|▍         | 485/10986 [20:23<7:48:28,  2.68s/it]

training loss: 3.4141156673431396


training:   4%|▍         | 486/10986 [20:25<7:20:14,  2.52s/it]

training loss: 3.4049198627471924


training:   4%|▍         | 487/10986 [20:28<7:18:25,  2.51s/it]

training loss: 3.4184060096740723


training:   4%|▍         | 488/10986 [20:30<6:58:42,  2.39s/it]

training loss: 3.347435235977173


training:   4%|▍         | 489/10986 [20:32<7:03:17,  2.42s/it]

training loss: 3.2896387577056885


training:   4%|▍         | 490/10986 [20:34<6:48:01,  2.33s/it]

training loss: 3.2720258235931396


training:   4%|▍         | 491/10986 [20:37<6:55:11,  2.37s/it]

training loss: 3.3431549072265625


training:   4%|▍         | 492/10986 [20:39<6:43:08,  2.30s/it]

training loss: 3.3255996704101562


training:   4%|▍         | 493/10986 [20:42<6:53:56,  2.37s/it]

training loss: 3.3715662956237793


training:   4%|▍         | 494/10986 [20:44<6:43:28,  2.31s/it]

training loss: 3.367115020751953


training:   5%|▍         | 495/10986 [20:46<6:51:06,  2.35s/it]

training loss: 3.4497687816619873


training:   5%|▍         | 496/10986 [20:48<6:38:58,  2.28s/it]

training loss: 3.488314151763916


training:   5%|▍         | 497/10986 [20:51<6:47:28,  2.33s/it]

training loss: 3.4434664249420166


training:   5%|▍         | 498/10986 [20:53<6:36:21,  2.27s/it]

training loss: 3.328977584838867


training:   5%|▍         | 499/10986 [20:55<6:48:23,  2.34s/it]

training loss: 3.3977484703063965


training:   5%|▍         | 500/10986 [20:58<6:37:21,  2.27s/it]

training loss: 3.395475149154663
valid loss: 3.5526976585388184
perplexity: 34.9073600769043


training:   5%|▍         | 501/10986 [21:02<8:46:15,  3.01s/it]

training loss: 3.4149200916290283


training:   5%|▍         | 502/10986 [21:05<8:22:25,  2.88s/it]

training loss: 3.3353054523468018


training:   5%|▍         | 503/10986 [21:07<7:49:23,  2.69s/it]

training loss: 3.3068716526031494


training:   5%|▍         | 504/10986 [21:10<7:41:31,  2.64s/it]

training loss: 3.329251527786255


training:   5%|▍         | 505/10986 [21:12<7:16:00,  2.50s/it]

training loss: 3.418344259262085


training:   5%|▍         | 506/10986 [21:15<7:53:18,  2.71s/it]

training loss: 3.285565137863159


training:   5%|▍         | 507/10986 [21:17<7:23:31,  2.54s/it]

training loss: 3.351327896118164


training:   5%|▍         | 508/10986 [21:20<7:18:32,  2.51s/it]

training loss: 3.415191173553467


training:   5%|▍         | 509/10986 [21:22<6:58:48,  2.40s/it]

training loss: 3.429386615753174


training:   5%|▍         | 510/10986 [21:24<7:04:11,  2.43s/it]

training loss: 3.3644235134124756


training:   5%|▍         | 511/10986 [21:26<6:50:20,  2.35s/it]

training loss: 3.582082509994507


training:   5%|▍         | 512/10986 [21:29<6:57:30,  2.39s/it]

training loss: 3.334306478500366


training:   5%|▍         | 513/10986 [21:31<6:44:19,  2.32s/it]

training loss: 3.5426104068756104


training:   5%|▍         | 514/10986 [21:33<6:53:16,  2.37s/it]

training loss: 3.316206216812134


training:   5%|▍         | 515/10986 [21:36<6:40:48,  2.30s/it]

training loss: 3.3207547664642334


training:   5%|▍         | 516/10986 [21:38<6:49:27,  2.35s/it]

training loss: 3.3840980529785156


training:   5%|▍         | 517/10986 [21:40<6:38:40,  2.28s/it]

training loss: 3.3559110164642334


training:   5%|▍         | 518/10986 [21:43<6:49:01,  2.34s/it]

training loss: 3.42254376411438


training:   5%|▍         | 519/10986 [21:45<6:38:39,  2.29s/it]

training loss: 3.353759527206421


training:   5%|▍         | 520/10986 [21:47<6:46:19,  2.33s/it]

training loss: 3.314988851547241
valid loss: 3.419788360595703
perplexity: 30.562946319580078


training:   5%|▍         | 521/10986 [21:52<8:31:34,  2.93s/it]

training loss: 3.3847107887268066


training:   5%|▍         | 522/10986 [21:54<7:49:26,  2.69s/it]

training loss: 3.293046474456787


training:   5%|▍         | 523/10986 [21:56<7:37:49,  2.63s/it]

training loss: 3.510249137878418


training:   5%|▍         | 524/10986 [21:58<7:11:31,  2.47s/it]

training loss: 3.3977792263031006


training:   5%|▍         | 525/10986 [22:01<7:07:49,  2.45s/it]

training loss: 3.3413896560668945


training:   5%|▍         | 526/10986 [22:03<6:49:13,  2.35s/it]

training loss: 3.3442461490631104


training:   5%|▍         | 527/10986 [22:05<6:51:43,  2.36s/it]

training loss: 3.4558401107788086


training:   5%|▍         | 528/10986 [22:07<6:40:53,  2.30s/it]

training loss: 3.383284330368042


training:   5%|▍         | 529/10986 [22:10<6:47:42,  2.34s/it]

training loss: 3.475740671157837


training:   5%|▍         | 530/10986 [22:12<6:39:02,  2.29s/it]

training loss: 3.3417463302612305


training:   5%|▍         | 531/10986 [22:15<7:38:10,  2.63s/it]

training loss: 3.3308494091033936


training:   5%|▍         | 532/10986 [22:18<7:11:57,  2.48s/it]

training loss: 3.35322642326355


training:   5%|▍         | 533/10986 [22:20<7:10:42,  2.47s/it]

training loss: 3.489973783493042


training:   5%|▍         | 534/10986 [22:22<6:51:15,  2.36s/it]

training loss: 3.2636821269989014


training:   5%|▍         | 535/10986 [22:25<6:54:20,  2.38s/it]

training loss: 3.3794615268707275


training:   5%|▍         | 536/10986 [22:27<6:43:03,  2.31s/it]

training loss: 3.375863552093506


training:   5%|▍         | 537/10986 [22:29<6:48:59,  2.35s/it]

training loss: 3.2295305728912354


training:   5%|▍         | 538/10986 [22:31<6:36:41,  2.28s/it]

training loss: 3.2758500576019287


training:   5%|▍         | 539/10986 [22:34<6:45:12,  2.33s/it]

training loss: 3.3826851844787598


training:   5%|▍         | 540/10986 [22:36<6:33:16,  2.26s/it]

training loss: 3.367276906967163
valid loss: 3.4453935623168945
perplexity: 31.355621337890625


training:   5%|▍         | 541/10986 [22:40<8:28:25,  2.92s/it]

training loss: 3.2540347576141357


training:   5%|▍         | 542/10986 [22:43<8:08:00,  2.80s/it]

training loss: 3.2598726749420166


training:   5%|▍         | 543/10986 [22:45<7:32:41,  2.60s/it]

training loss: 3.4688878059387207


training:   5%|▍         | 544/10986 [22:47<7:25:52,  2.56s/it]

training loss: 3.453693389892578


training:   5%|▍         | 545/10986 [22:49<7:02:01,  2.43s/it]

training loss: 3.2203242778778076


training:   5%|▍         | 546/10986 [22:52<7:03:20,  2.43s/it]

training loss: 3.5044078826904297


training:   5%|▍         | 547/10986 [22:54<6:46:20,  2.34s/it]

training loss: 3.351166248321533


training:   5%|▍         | 548/10986 [22:57<6:52:51,  2.37s/it]

training loss: 3.3046813011169434


training:   5%|▍         | 549/10986 [23:00<7:26:48,  2.57s/it]

training loss: 3.3530631065368652


training:   5%|▌         | 550/10986 [23:02<7:47:42,  2.69s/it]

training loss: 3.450706720352173


training:   5%|▌         | 551/10986 [23:05<7:16:49,  2.51s/it]

training loss: 3.292771339416504


training:   5%|▌         | 552/10986 [23:07<7:12:02,  2.48s/it]

training loss: 3.3497703075408936


training:   5%|▌         | 553/10986 [23:09<6:53:11,  2.38s/it]

training loss: 3.473140001296997


training:   5%|▌         | 554/10986 [23:12<6:55:25,  2.39s/it]

training loss: 3.2449426651000977


training:   5%|▌         | 555/10986 [23:14<7:12:51,  2.49s/it]

training loss: 3.4584531784057617


training:   5%|▌         | 556/10986 [23:17<7:09:00,  2.47s/it]

training loss: 3.454648971557617


training:   5%|▌         | 557/10986 [23:19<6:51:34,  2.37s/it]

training loss: 3.26271390914917


training:   5%|▌         | 558/10986 [23:21<6:53:55,  2.38s/it]

training loss: 3.2680413722991943


training:   5%|▌         | 559/10986 [23:23<6:40:34,  2.31s/it]

training loss: 3.359919548034668


training:   5%|▌         | 560/10986 [23:26<6:47:39,  2.35s/it]

training loss: 3.459209442138672
valid loss: 3.273740291595459
perplexity: 26.409936904907227


training:   5%|▌         | 561/10986 [23:30<8:35:51,  2.97s/it]

training loss: 3.301515817642212


training:   5%|▌         | 562/10986 [23:32<7:51:18,  2.71s/it]

training loss: 3.3586671352386475


training:   5%|▌         | 563/10986 [23:35<7:34:44,  2.62s/it]

training loss: 3.330613374710083


training:   5%|▌         | 564/10986 [23:37<7:07:17,  2.46s/it]

training loss: 3.285332202911377


training:   5%|▌         | 565/10986 [23:39<7:05:45,  2.45s/it]

training loss: 3.4475672245025635


training:   5%|▌         | 566/10986 [23:41<6:48:23,  2.35s/it]

training loss: 3.2682433128356934


training:   5%|▌         | 567/10986 [23:44<6:52:54,  2.38s/it]

training loss: 3.3483004570007324


training:   5%|▌         | 568/10986 [23:46<6:39:11,  2.30s/it]

training loss: 3.4150960445404053


training:   5%|▌         | 569/10986 [23:48<6:46:17,  2.34s/it]

training loss: 3.375255823135376


training:   5%|▌         | 570/10986 [23:50<6:34:15,  2.27s/it]

training loss: 3.3266384601593018


training:   5%|▌         | 571/10986 [23:53<6:41:14,  2.31s/it]

training loss: 3.3593502044677734


training:   5%|▌         | 572/10986 [23:55<6:30:53,  2.25s/it]

training loss: 3.470388174057007


training:   5%|▌         | 573/10986 [23:57<6:39:53,  2.30s/it]

training loss: 3.269618034362793


training:   5%|▌         | 574/10986 [24:00<6:31:02,  2.25s/it]

training loss: 3.404369592666626


training:   5%|▌         | 575/10986 [24:02<6:39:58,  2.31s/it]

training loss: 3.3330657482147217


training:   5%|▌         | 576/10986 [24:04<6:30:51,  2.25s/it]

training loss: 3.3295021057128906


training:   5%|▌         | 577/10986 [24:07<6:41:07,  2.31s/it]

training loss: 3.254169464111328


training:   5%|▌         | 578/10986 [24:09<6:30:19,  2.25s/it]

training loss: 3.3482656478881836


training:   5%|▌         | 579/10986 [24:11<6:39:18,  2.30s/it]

training loss: 3.4634523391723633


training:   5%|▌         | 580/10986 [24:14<7:30:19,  2.60s/it]

training loss: 3.3531622886657715
valid loss: 3.4378252029418945
perplexity: 31.11920738220215


training:   5%|▌         | 581/10986 [24:19<9:05:04,  3.14s/it]

training loss: 3.4739580154418945


training:   5%|▌         | 582/10986 [24:21<8:32:39,  2.96s/it]

training loss: 3.441603660583496


training:   5%|▌         | 583/10986 [24:23<7:48:36,  2.70s/it]

training loss: 3.3138413429260254


training:   5%|▌         | 584/10986 [24:26<7:37:17,  2.64s/it]

training loss: 3.394014835357666


training:   5%|▌         | 585/10986 [24:28<7:10:54,  2.49s/it]

training loss: 3.2791576385498047


training:   5%|▌         | 586/10986 [24:31<7:09:18,  2.48s/it]

training loss: 3.2779366970062256


training:   5%|▌         | 587/10986 [24:33<6:50:38,  2.37s/it]

training loss: 3.347377300262451


training:   5%|▌         | 588/10986 [24:35<6:53:12,  2.38s/it]

training loss: 3.4736521244049072


training:   5%|▌         | 589/10986 [24:37<6:37:50,  2.30s/it]

training loss: 3.534878730773926


training:   5%|▌         | 590/10986 [24:40<6:43:53,  2.33s/it]

training loss: 3.4741644859313965


training:   5%|▌         | 591/10986 [24:42<6:34:29,  2.28s/it]

training loss: 3.5538251399993896


training:   5%|▌         | 592/10986 [24:44<6:42:40,  2.32s/it]

training loss: 3.3905508518218994


training:   5%|▌         | 593/10986 [24:46<6:31:58,  2.26s/it]

training loss: 3.4989089965820312


training:   5%|▌         | 594/10986 [24:49<6:41:18,  2.32s/it]

training loss: 3.389099359512329


training:   5%|▌         | 595/10986 [24:51<6:30:44,  2.26s/it]

training loss: 3.4073801040649414


training:   5%|▌         | 596/10986 [24:53<6:37:46,  2.30s/it]

training loss: 3.2556533813476562


training:   5%|▌         | 597/10986 [24:55<6:30:33,  2.26s/it]

training loss: 3.43241810798645


training:   5%|▌         | 598/10986 [24:58<6:40:49,  2.32s/it]

training loss: 3.448892831802368


training:   5%|▌         | 599/10986 [25:00<6:30:39,  2.26s/it]

training loss: 3.5568368434906006


training:   5%|▌         | 600/10986 [25:02<6:40:44,  2.32s/it]

training loss: 3.398106336593628
valid loss: 3.3413259983062744
perplexity: 28.25657081604004


training:   5%|▌         | 601/10986 [25:07<8:42:45,  3.02s/it]

training loss: 3.3649210929870605


training:   5%|▌         | 602/10986 [25:09<7:59:48,  2.77s/it]

training loss: 3.3204963207244873


training:   5%|▌         | 603/10986 [25:12<7:42:35,  2.67s/it]

training loss: 3.3968560695648193


training:   5%|▌         | 604/10986 [25:14<7:46:12,  2.69s/it]

training loss: 3.414199113845825


training:   6%|▌         | 605/10986 [25:17<7:32:24,  2.61s/it]

training loss: 3.4961349964141846


training:   6%|▌         | 606/10986 [25:19<7:06:16,  2.46s/it]

training loss: 3.35935115814209


training:   6%|▌         | 607/10986 [25:21<7:08:08,  2.48s/it]

training loss: 3.4752156734466553


training:   6%|▌         | 608/10986 [25:24<6:50:06,  2.37s/it]

training loss: 3.3931665420532227


training:   6%|▌         | 609/10986 [25:26<6:52:24,  2.38s/it]

training loss: 3.3506665229797363


training:   6%|▌         | 610/10986 [25:28<6:38:29,  2.30s/it]

training loss: 3.42455792427063


training:   6%|▌         | 611/10986 [25:31<6:47:16,  2.36s/it]

training loss: 3.3573055267333984


training:   6%|▌         | 612/10986 [25:33<6:33:53,  2.28s/it]

training loss: 3.3868229389190674


training:   6%|▌         | 613/10986 [25:35<6:41:24,  2.32s/it]

training loss: 3.3938910961151123


training:   6%|▌         | 614/10986 [25:37<6:31:14,  2.26s/it]

training loss: 3.3342905044555664


training:   6%|▌         | 615/10986 [25:40<6:38:30,  2.31s/it]

training loss: 3.303313732147217


training:   6%|▌         | 616/10986 [25:42<6:29:35,  2.25s/it]

training loss: 3.3542652130126953


training:   6%|▌         | 617/10986 [25:44<6:39:03,  2.31s/it]

training loss: 3.3134894371032715


training:   6%|▌         | 618/10986 [25:46<6:30:02,  2.26s/it]

training loss: 3.3118789196014404


training:   6%|▌         | 619/10986 [25:49<6:44:36,  2.34s/it]

training loss: 3.433276891708374


training:   6%|▌         | 620/10986 [25:52<7:17:10,  2.53s/it]

training loss: 3.338974952697754
valid loss: 3.3616602420806885
perplexity: 28.83702850341797


training:   6%|▌         | 621/10986 [25:57<9:27:24,  3.28s/it]

training loss: 3.4129512310028076


training:   6%|▌         | 622/10986 [25:59<8:48:44,  3.06s/it]

training loss: 3.498961925506592


training:   6%|▌         | 623/10986 [26:02<8:02:20,  2.79s/it]

training loss: 3.3264315128326416


training:   6%|▌         | 624/10986 [26:04<7:48:30,  2.71s/it]

training loss: 3.4594874382019043


training:   6%|▌         | 625/10986 [26:06<7:18:11,  2.54s/it]

training loss: 3.325613260269165


training:   6%|▌         | 626/10986 [26:09<7:15:21,  2.52s/it]

training loss: 3.362959146499634


training:   6%|▌         | 627/10986 [26:11<6:55:30,  2.41s/it]

training loss: 3.277078866958618


training:   6%|▌         | 628/10986 [26:14<7:55:55,  2.76s/it]

training loss: 3.468175172805786


training:   6%|▌         | 629/10986 [26:17<7:22:05,  2.56s/it]

training loss: 3.2944743633270264


training:   6%|▌         | 630/10986 [26:19<7:18:09,  2.54s/it]

training loss: 3.4064109325408936


training:   6%|▌         | 631/10986 [26:21<6:56:05,  2.41s/it]

training loss: 3.2971065044403076


training:   6%|▌         | 632/10986 [26:24<6:59:35,  2.43s/it]

training loss: 3.3797664642333984


training:   6%|▌         | 633/10986 [26:26<6:44:05,  2.34s/it]

training loss: 3.3758981227874756


training:   6%|▌         | 634/10986 [26:28<6:53:00,  2.39s/it]

training loss: 3.537675380706787


training:   6%|▌         | 635/10986 [26:30<6:41:29,  2.33s/it]

training loss: 3.355095863342285


training:   6%|▌         | 636/10986 [26:33<6:47:43,  2.36s/it]

training loss: 3.613224506378174


training:   6%|▌         | 637/10986 [26:35<6:35:55,  2.30s/it]

training loss: 3.393747091293335


training:   6%|▌         | 638/10986 [26:38<6:45:00,  2.35s/it]

training loss: 3.321230888366699


training:   6%|▌         | 639/10986 [26:40<6:33:04,  2.28s/it]

training loss: 3.3187854290008545


training:   6%|▌         | 640/10986 [26:42<6:39:19,  2.32s/it]

training loss: 3.4155821800231934
valid loss: 3.3249974250793457
perplexity: 27.798927307128906


training:   6%|▌         | 641/10986 [26:47<8:31:25,  2.97s/it]

training loss: 3.399531126022339


training:   6%|▌         | 642/10986 [26:49<7:49:04,  2.72s/it]

training loss: 3.379361629486084


training:   6%|▌         | 643/10986 [26:51<7:35:55,  2.64s/it]

training loss: 3.2982029914855957


training:   6%|▌         | 644/10986 [26:53<7:10:44,  2.50s/it]

training loss: 3.4781463146209717


training:   6%|▌         | 645/10986 [26:56<7:12:05,  2.51s/it]

training loss: 3.465458869934082


training:   6%|▌         | 646/10986 [26:58<6:51:34,  2.39s/it]

training loss: 3.31195330619812


training:   6%|▌         | 647/10986 [27:00<6:57:22,  2.42s/it]

training loss: 3.438386917114258


training:   6%|▌         | 648/10986 [27:03<6:43:08,  2.34s/it]

training loss: 3.4478414058685303


training:   6%|▌         | 649/10986 [27:05<6:53:28,  2.40s/it]

training loss: 3.533679723739624


training:   6%|▌         | 650/10986 [27:07<6:39:51,  2.32s/it]

training loss: 3.4986886978149414


training:   6%|▌         | 651/10986 [27:10<6:49:11,  2.38s/it]

training loss: 3.333897829055786


training:   6%|▌         | 652/10986 [27:12<6:37:50,  2.31s/it]

training loss: 3.476083993911743


training:   6%|▌         | 653/10986 [27:15<7:25:39,  2.59s/it]

training loss: 3.3423573970794678


training:   6%|▌         | 654/10986 [27:17<7:02:29,  2.45s/it]

training loss: 3.378265142440796


training:   6%|▌         | 655/10986 [27:20<7:03:12,  2.46s/it]

training loss: 3.538599729537964


training:   6%|▌         | 656/10986 [27:22<6:46:01,  2.36s/it]

training loss: 3.415736675262451


training:   6%|▌         | 657/10986 [27:24<6:53:03,  2.40s/it]

training loss: 3.493509292602539


training:   6%|▌         | 658/10986 [27:27<6:41:11,  2.33s/it]

training loss: 3.406257390975952


training:   6%|▌         | 659/10986 [27:29<6:47:15,  2.37s/it]

training loss: 3.4263410568237305


training:   6%|▌         | 660/10986 [27:31<6:37:09,  2.31s/it]

training loss: 3.3988378047943115
valid loss: 3.42781925201416
perplexity: 30.809383392333984


training:   6%|▌         | 661/10986 [27:36<8:30:28,  2.97s/it]

training loss: 3.34879732131958


training:   6%|▌         | 662/10986 [27:38<8:11:17,  2.86s/it]

training loss: 3.3418803215026855


training:   6%|▌         | 663/10986 [27:40<7:34:48,  2.64s/it]

training loss: 3.4051382541656494


training:   6%|▌         | 664/10986 [27:43<7:26:57,  2.60s/it]

training loss: 3.40096116065979


training:   6%|▌         | 665/10986 [27:45<7:03:23,  2.46s/it]

training loss: 3.4633901119232178


training:   6%|▌         | 666/10986 [27:48<7:05:09,  2.47s/it]

training loss: 3.470668315887451


training:   6%|▌         | 667/10986 [27:50<6:46:35,  2.36s/it]

training loss: 3.3928165435791016


training:   6%|▌         | 668/10986 [27:52<6:51:30,  2.39s/it]

training loss: 3.295870304107666


training:   6%|▌         | 669/10986 [27:54<6:37:27,  2.31s/it]

training loss: 3.310089588165283


training:   6%|▌         | 670/10986 [27:57<6:46:45,  2.37s/it]

training loss: 3.3337271213531494


training:   6%|▌         | 671/10986 [27:59<6:37:00,  2.31s/it]

training loss: 3.410559892654419


training:   6%|▌         | 672/10986 [28:01<6:48:20,  2.38s/it]

training loss: 3.340074300765991


training:   6%|▌         | 673/10986 [28:04<6:36:26,  2.31s/it]

training loss: 3.6544604301452637


training:   6%|▌         | 674/10986 [28:06<6:43:37,  2.35s/it]

training loss: 3.3753700256347656


training:   6%|▌         | 675/10986 [28:08<6:32:49,  2.29s/it]

training loss: 3.4284567832946777


training:   6%|▌         | 676/10986 [28:11<6:40:52,  2.33s/it]

training loss: 3.3638782501220703


training:   6%|▌         | 677/10986 [28:13<7:01:50,  2.46s/it]

training loss: 3.4533090591430664


training:   6%|▌         | 678/10986 [28:16<7:30:19,  2.62s/it]

training loss: 3.370151996612549


training:   6%|▌         | 679/10986 [28:19<7:05:14,  2.48s/it]

training loss: 3.6207644939422607


training:   6%|▌         | 680/10986 [28:21<7:03:25,  2.47s/it]

training loss: 3.3359034061431885
valid loss: 3.355933904647827
perplexity: 28.67237091064453


training:   6%|▌         | 681/10986 [28:25<8:43:48,  3.05s/it]

training loss: 3.2380847930908203


training:   6%|▌         | 682/10986 [28:28<7:59:15,  2.79s/it]

training loss: 3.270569086074829


training:   6%|▌         | 683/10986 [28:30<7:38:38,  2.67s/it]

training loss: 3.432230234146118


training:   6%|▌         | 684/10986 [28:32<7:12:14,  2.52s/it]

training loss: 3.421077251434326


training:   6%|▌         | 685/10986 [28:35<7:06:35,  2.48s/it]

training loss: 3.5226471424102783


training:   6%|▌         | 686/10986 [28:37<6:47:12,  2.37s/it]

training loss: 3.5163984298706055


training:   6%|▋         | 687/10986 [28:39<6:51:09,  2.40s/it]

training loss: 3.3554553985595703


training:   6%|▋         | 688/10986 [28:41<6:37:51,  2.32s/it]

training loss: 3.3176209926605225


training:   6%|▋         | 689/10986 [28:44<7:11:02,  2.51s/it]

training loss: 3.2348153591156006


training:   6%|▋         | 690/10986 [28:47<7:33:55,  2.65s/it]

training loss: 3.416696310043335


training:   6%|▋         | 691/10986 [28:50<7:22:48,  2.58s/it]

training loss: 3.436546564102173


training:   6%|▋         | 692/10986 [28:52<6:57:49,  2.44s/it]

training loss: 3.3246357440948486


training:   6%|▋         | 693/10986 [28:54<6:57:13,  2.43s/it]

training loss: 3.493659019470215


training:   6%|▋         | 694/10986 [28:56<6:43:10,  2.35s/it]

training loss: 3.378037214279175


training:   6%|▋         | 695/10986 [28:59<6:48:23,  2.38s/it]

training loss: 3.3998794555664062


training:   6%|▋         | 696/10986 [29:01<6:36:49,  2.31s/it]

training loss: 3.35280442237854


training:   6%|▋         | 697/10986 [29:03<6:44:46,  2.36s/it]

training loss: 3.4330363273620605


training:   6%|▋         | 698/10986 [29:05<6:33:10,  2.29s/it]

training loss: 3.3693041801452637


training:   6%|▋         | 699/10986 [29:08<6:42:55,  2.35s/it]

training loss: 3.4742276668548584


training:   6%|▋         | 700/10986 [29:10<6:30:58,  2.28s/it]

training loss: 3.377207040786743
valid loss: 3.459543466567993
perplexity: 31.80245590209961


training:   6%|▋         | 701/10986 [29:15<9:11:13,  3.22s/it]

training loss: 3.3482470512390137


training:   6%|▋         | 702/10986 [29:18<8:36:11,  3.01s/it]

training loss: 3.4544758796691895


training:   6%|▋         | 703/10986 [29:20<7:50:31,  2.75s/it]

training loss: 3.2674717903137207


training:   6%|▋         | 704/10986 [29:23<7:34:56,  2.65s/it]

training loss: 3.4377002716064453


training:   6%|▋         | 705/10986 [29:25<7:06:47,  2.49s/it]

training loss: 3.4513468742370605


training:   6%|▋         | 706/10986 [29:27<7:05:13,  2.48s/it]

training loss: 3.3430662155151367


training:   6%|▋         | 707/10986 [29:29<6:46:14,  2.37s/it]

training loss: 3.312486410140991


training:   6%|▋         | 708/10986 [29:32<6:50:09,  2.39s/it]

training loss: 3.4254696369171143


training:   6%|▋         | 709/10986 [29:34<6:35:55,  2.31s/it]

training loss: 3.2589855194091797


training:   6%|▋         | 710/10986 [29:36<6:41:45,  2.35s/it]

training loss: 3.440239906311035


training:   6%|▋         | 711/10986 [29:38<6:28:40,  2.27s/it]

training loss: 3.354264259338379


training:   6%|▋         | 712/10986 [29:41<6:37:28,  2.32s/it]

training loss: 3.4006922245025635


training:   6%|▋         | 713/10986 [29:43<6:28:43,  2.27s/it]

training loss: 3.4040026664733887


training:   6%|▋         | 714/10986 [29:45<6:41:03,  2.34s/it]

training loss: 3.5260891914367676


training:   7%|▋         | 715/10986 [29:48<6:30:32,  2.28s/it]

training loss: 3.4449727535247803


training:   7%|▋         | 716/10986 [29:50<6:42:08,  2.35s/it]

training loss: 3.4992589950561523


training:   7%|▋         | 717/10986 [29:52<6:30:38,  2.28s/it]

training loss: 3.385471820831299


training:   7%|▋         | 718/10986 [29:55<6:39:27,  2.33s/it]

training loss: 3.534820795059204


training:   7%|▋         | 719/10986 [29:57<6:29:11,  2.27s/it]

training loss: 3.3723514080047607


training:   7%|▋         | 720/10986 [29:59<6:41:14,  2.35s/it]

training loss: 3.2303147315979004
valid loss: 3.335461378097534
perplexity: 28.091341018676758


training:   7%|▋         | 721/10986 [30:04<8:30:40,  2.98s/it]

training loss: 3.3583474159240723


training:   7%|▋         | 722/10986 [30:06<7:48:30,  2.74s/it]

training loss: 3.512640953063965


training:   7%|▋         | 723/10986 [30:08<7:35:05,  2.66s/it]

training loss: 3.3493711948394775


training:   7%|▋         | 724/10986 [30:11<7:08:13,  2.50s/it]

training loss: 3.4520020484924316


training:   7%|▋         | 725/10986 [30:14<7:38:12,  2.68s/it]

training loss: 3.4669814109802246


training:   7%|▋         | 726/10986 [30:16<7:21:13,  2.58s/it]

training loss: 3.224576234817505


training:   7%|▋         | 727/10986 [30:18<7:13:30,  2.54s/it]

training loss: 3.3384461402893066


training:   7%|▋         | 728/10986 [30:21<6:52:14,  2.41s/it]

training loss: 3.438361406326294


training:   7%|▋         | 729/10986 [30:23<6:54:15,  2.42s/it]

training loss: 3.34985613822937


training:   7%|▋         | 730/10986 [30:25<6:40:16,  2.34s/it]

training loss: 3.478360414505005


training:   7%|▋         | 731/10986 [30:28<6:45:46,  2.37s/it]

training loss: 3.3604445457458496


training:   7%|▋         | 732/10986 [30:30<6:32:54,  2.30s/it]

training loss: 3.4265694618225098


training:   7%|▋         | 733/10986 [30:32<6:39:45,  2.34s/it]

training loss: 3.2969374656677246


training:   7%|▋         | 734/10986 [30:34<6:28:09,  2.27s/it]

training loss: 3.4164068698883057


training:   7%|▋         | 735/10986 [30:37<6:37:33,  2.33s/it]

training loss: 3.2956748008728027


training:   7%|▋         | 736/10986 [30:39<6:26:36,  2.26s/it]

training loss: 3.2847657203674316


training:   7%|▋         | 737/10986 [30:41<6:35:52,  2.32s/it]

training loss: 3.436091899871826


training:   7%|▋         | 738/10986 [30:43<6:27:29,  2.27s/it]

training loss: 3.392742395401001


training:   7%|▋         | 739/10986 [30:46<6:36:14,  2.32s/it]

training loss: 3.4918038845062256


training:   7%|▋         | 740/10986 [30:48<6:26:27,  2.26s/it]

training loss: 3.316311836242676
valid loss: 3.3130528926849365
perplexity: 27.468856811523438


training:   7%|▋         | 741/10986 [30:53<8:24:40,  2.96s/it]

training loss: 3.300093650817871


training:   7%|▋         | 742/10986 [30:55<8:03:53,  2.83s/it]

training loss: 3.2816569805145264


training:   7%|▋         | 743/10986 [30:57<7:26:52,  2.62s/it]

training loss: 3.4194509983062744


training:   7%|▋         | 744/10986 [31:00<7:19:23,  2.57s/it]

training loss: 3.411713123321533


training:   7%|▋         | 745/10986 [31:02<6:56:17,  2.44s/it]

training loss: 3.34263014793396


training:   7%|▋         | 746/10986 [31:04<6:57:49,  2.45s/it]

training loss: 3.324890375137329


training:   7%|▋         | 747/10986 [31:06<6:40:02,  2.34s/it]

training loss: 3.274214267730713


training:   7%|▋         | 748/10986 [31:09<6:44:54,  2.37s/it]

training loss: 3.36329984664917


training:   7%|▋         | 749/10986 [31:11<6:32:44,  2.30s/it]

training loss: 3.473341703414917


training:   7%|▋         | 750/10986 [31:14<7:13:22,  2.54s/it]

training loss: 3.4112226963043213


training:   7%|▋         | 751/10986 [31:16<6:53:26,  2.42s/it]

training loss: 3.3047268390655518


training:   7%|▋         | 752/10986 [31:19<6:54:26,  2.43s/it]

training loss: 3.410367250442505


training:   7%|▋         | 753/10986 [31:21<6:36:45,  2.33s/it]

training loss: 3.3846893310546875


training:   7%|▋         | 754/10986 [31:23<6:40:59,  2.35s/it]

training loss: 3.424065589904785


training:   7%|▋         | 755/10986 [31:25<6:31:08,  2.29s/it]

training loss: 3.3783018589019775


training:   7%|▋         | 756/10986 [31:28<6:38:24,  2.34s/it]

training loss: 3.4127120971679688


training:   7%|▋         | 757/10986 [31:30<6:26:41,  2.27s/it]

training loss: 3.423340320587158


training:   7%|▋         | 758/10986 [31:32<6:40:58,  2.35s/it]

training loss: 3.429353713989258


training:   7%|▋         | 759/10986 [31:35<7:13:59,  2.55s/it]

training loss: 3.477465867996216


training:   7%|▋         | 760/10986 [31:38<7:25:56,  2.62s/it]

training loss: 3.4228482246398926
valid loss: 3.26867938041687
perplexity: 26.2766170501709


training:   7%|▋         | 761/10986 [31:43<8:59:20,  3.16s/it]

training loss: 3.268840789794922


training:   7%|▋         | 762/10986 [31:45<8:08:46,  2.87s/it]

training loss: 3.37168025970459


training:   7%|▋         | 763/10986 [31:47<7:52:33,  2.77s/it]

training loss: 3.3637943267822266


training:   7%|▋         | 764/10986 [31:50<7:18:18,  2.57s/it]

training loss: 3.4722235202789307


training:   7%|▋         | 765/10986 [31:52<7:13:13,  2.54s/it]

training loss: 3.3393609523773193


training:   7%|▋         | 766/10986 [31:54<6:55:00,  2.44s/it]

training loss: 3.4900710582733154


training:   7%|▋         | 767/10986 [31:57<6:59:37,  2.46s/it]

training loss: 3.390354633331299


training:   7%|▋         | 768/10986 [31:59<6:42:00,  2.36s/it]

training loss: 3.359839677810669


training:   7%|▋         | 769/10986 [32:01<6:49:15,  2.40s/it]

training loss: 3.4206326007843018


training:   7%|▋         | 770/10986 [32:03<6:35:19,  2.32s/it]

training loss: 3.3593368530273438


training:   7%|▋         | 771/10986 [32:06<6:42:38,  2.37s/it]

training loss: 3.3891165256500244


training:   7%|▋         | 772/10986 [32:08<6:30:09,  2.29s/it]

training loss: 3.2793972492218018


training:   7%|▋         | 773/10986 [32:11<6:40:36,  2.35s/it]

training loss: 3.431649923324585


training:   7%|▋         | 774/10986 [32:13<6:53:58,  2.43s/it]

training loss: 3.4014501571655273


training:   7%|▋         | 775/10986 [32:16<7:01:47,  2.48s/it]

training loss: 3.565021276473999


training:   7%|▋         | 776/10986 [32:18<6:42:36,  2.37s/it]

training loss: 3.3971810340881348


training:   7%|▋         | 777/10986 [32:20<6:48:34,  2.40s/it]

training loss: 3.3010993003845215


training:   7%|▋         | 778/10986 [32:22<6:34:23,  2.32s/it]

training loss: 3.458418369293213


training:   7%|▋         | 779/10986 [32:25<6:45:46,  2.39s/it]

training loss: 3.290616273880005


training:   7%|▋         | 780/10986 [32:27<6:31:45,  2.30s/it]

training loss: 3.467296838760376
valid loss: 3.4084291458129883
perplexity: 30.217741012573242


training:   7%|▋         | 781/10986 [32:32<8:18:46,  2.93s/it]

training loss: 3.399603843688965


training:   7%|▋         | 782/10986 [32:34<7:56:38,  2.80s/it]

training loss: 3.30953311920166


training:   7%|▋         | 783/10986 [32:36<7:24:30,  2.61s/it]

training loss: 3.254009962081909


training:   7%|▋         | 784/10986 [32:39<7:15:38,  2.56s/it]

training loss: 3.3899765014648438


training:   7%|▋         | 785/10986 [32:41<6:52:34,  2.43s/it]

training loss: 3.4127097129821777


training:   7%|▋         | 786/10986 [32:43<6:54:45,  2.44s/it]

training loss: 3.3182382583618164


training:   7%|▋         | 787/10986 [32:45<6:40:05,  2.35s/it]

training loss: 3.3989672660827637


training:   7%|▋         | 788/10986 [32:48<6:45:43,  2.39s/it]

training loss: 3.199398994445801


training:   7%|▋         | 789/10986 [32:50<6:31:25,  2.30s/it]

training loss: 3.4007315635681152


training:   7%|▋         | 790/10986 [32:52<6:39:40,  2.35s/it]

training loss: 3.429999351501465


training:   7%|▋         | 791/10986 [32:55<6:28:00,  2.28s/it]

training loss: 3.3857383728027344


training:   7%|▋         | 792/10986 [32:57<6:38:30,  2.35s/it]

training loss: 3.276655673980713


training:   7%|▋         | 793/10986 [32:59<6:27:06,  2.28s/it]

training loss: 3.3136301040649414


training:   7%|▋         | 794/10986 [33:02<6:35:41,  2.33s/it]

training loss: 3.314382314682007


training:   7%|▋         | 795/10986 [33:04<6:25:26,  2.27s/it]

training loss: 3.339367151260376


training:   7%|▋         | 796/10986 [33:06<6:36:24,  2.33s/it]

training loss: 3.277670383453369


training:   7%|▋         | 797/10986 [33:08<6:26:02,  2.27s/it]

training loss: 3.2555487155914307


training:   7%|▋         | 798/10986 [33:11<6:38:06,  2.34s/it]

training loss: 3.282172918319702


training:   7%|▋         | 799/10986 [33:13<6:54:03,  2.44s/it]

training loss: 3.3682825565338135


training:   7%|▋         | 800/10986 [33:16<7:20:33,  2.60s/it]

training loss: 3.2677950859069824
valid loss: 3.455451011657715
perplexity: 31.672571182250977


training:   7%|▋         | 801/10986 [33:21<9:14:35,  3.27s/it]

training loss: 3.4316940307617188


training:   7%|▋         | 802/10986 [33:23<8:19:43,  2.94s/it]

training loss: 3.3477931022644043


training:   7%|▋         | 803/10986 [33:26<7:57:50,  2.82s/it]

training loss: 3.283125638961792


training:   7%|▋         | 804/10986 [33:28<7:22:39,  2.61s/it]

training loss: 3.4670462608337402


training:   7%|▋         | 805/10986 [33:31<7:14:00,  2.56s/it]

training loss: 3.3100359439849854


training:   7%|▋         | 806/10986 [33:33<6:50:38,  2.42s/it]

training loss: 3.4504311084747314


training:   7%|▋         | 807/10986 [33:35<6:52:49,  2.43s/it]

training loss: 3.580483913421631


training:   7%|▋         | 808/10986 [33:37<6:36:13,  2.34s/it]

training loss: 3.413691282272339


training:   7%|▋         | 809/10986 [33:40<6:42:03,  2.37s/it]

training loss: 3.5017480850219727


training:   7%|▋         | 810/10986 [33:42<6:28:32,  2.29s/it]

training loss: 3.518869400024414


training:   7%|▋         | 811/10986 [33:44<6:38:04,  2.35s/it]

training loss: 3.47066068649292


training:   7%|▋         | 812/10986 [33:46<6:30:04,  2.30s/it]

training loss: 3.3902621269226074


training:   7%|▋         | 813/10986 [33:49<6:38:50,  2.35s/it]

training loss: 3.423149824142456


training:   7%|▋         | 814/10986 [33:51<6:26:40,  2.28s/it]

training loss: 3.445967197418213


training:   7%|▋         | 815/10986 [33:54<6:35:39,  2.33s/it]

training loss: 3.2767457962036133


training:   7%|▋         | 816/10986 [33:56<6:26:18,  2.28s/it]

training loss: 3.371181011199951


training:   7%|▋         | 817/10986 [33:58<6:34:12,  2.33s/it]

training loss: 3.3370981216430664


training:   7%|▋         | 818/10986 [34:00<6:23:46,  2.26s/it]

training loss: 3.3123724460601807


training:   7%|▋         | 819/10986 [34:03<6:34:40,  2.33s/it]

training loss: 3.3629984855651855


training:   7%|▋         | 820/10986 [34:05<6:23:59,  2.27s/it]

training loss: 3.3572824001312256
valid loss: 3.250843048095703
perplexity: 25.812091827392578


training:   7%|▋         | 821/10986 [34:09<8:15:35,  2.93s/it]

training loss: 3.4390437602996826


training:   7%|▋         | 822/10986 [34:12<7:55:41,  2.81s/it]

training loss: 3.270078420639038


training:   7%|▋         | 823/10986 [34:14<7:37:42,  2.70s/it]

training loss: 3.268712043762207


training:   8%|▊         | 824/10986 [34:17<7:25:27,  2.63s/it]

training loss: 3.364211082458496


training:   8%|▊         | 825/10986 [34:19<7:00:52,  2.49s/it]

training loss: 3.400172233581543


training:   8%|▊         | 826/10986 [34:21<7:00:13,  2.48s/it]

training loss: 3.254680633544922


training:   8%|▊         | 827/10986 [34:24<7:24:27,  2.63s/it]

training loss: 3.332275152206421


training:   8%|▊         | 828/10986 [34:27<7:35:18,  2.69s/it]

training loss: 3.4342942237854004


training:   8%|▊         | 829/10986 [34:29<7:06:14,  2.52s/it]

training loss: 3.3560330867767334


training:   8%|▊         | 830/10986 [34:32<7:01:48,  2.49s/it]

training loss: 3.5638427734375


training:   8%|▊         | 831/10986 [34:34<6:43:02,  2.38s/it]

training loss: 3.431035041809082


training:   8%|▊         | 832/10986 [34:36<6:47:26,  2.41s/it]

training loss: 3.551863670349121


training:   8%|▊         | 833/10986 [34:38<6:32:11,  2.32s/it]

training loss: 3.444838285446167


training:   8%|▊         | 834/10986 [34:41<6:38:30,  2.36s/it]

training loss: 3.448702335357666


training:   8%|▊         | 835/10986 [34:43<6:27:12,  2.29s/it]

training loss: 3.5019946098327637


training:   8%|▊         | 836/10986 [34:45<6:35:49,  2.34s/it]

training loss: 3.4134345054626465


training:   8%|▊         | 837/10986 [34:48<6:24:47,  2.27s/it]

training loss: 3.3942737579345703


training:   8%|▊         | 838/10986 [34:50<6:36:02,  2.34s/it]

training loss: 3.3343212604522705


training:   8%|▊         | 839/10986 [34:52<6:25:22,  2.28s/it]

training loss: 3.310993194580078


training:   8%|▊         | 840/10986 [34:55<6:33:06,  2.32s/it]

training loss: 3.3726806640625
valid loss: 3.377659320831299
perplexity: 29.30210304260254


training:   8%|▊         | 841/10986 [34:59<8:19:04,  2.95s/it]

training loss: 3.471724271774292


training:   8%|▊         | 842/10986 [35:01<7:35:44,  2.70s/it]

training loss: 3.485485076904297


training:   8%|▊         | 843/10986 [35:04<7:22:02,  2.61s/it]

training loss: 3.501331090927124


training:   8%|▊         | 844/10986 [35:06<6:57:17,  2.47s/it]

training loss: 3.3024110794067383


training:   8%|▊         | 845/10986 [35:08<6:56:10,  2.46s/it]

training loss: 3.329850912094116


training:   8%|▊         | 846/10986 [35:10<6:37:28,  2.35s/it]

training loss: 3.453502893447876


training:   8%|▊         | 847/10986 [35:13<7:00:12,  2.49s/it]

training loss: 3.3326292037963867


training:   8%|▊         | 848/10986 [35:15<6:40:19,  2.37s/it]

training loss: 3.319793224334717


training:   8%|▊         | 849/10986 [35:18<6:42:39,  2.38s/it]

training loss: 3.238795518875122


training:   8%|▊         | 850/10986 [35:20<6:27:43,  2.30s/it]

training loss: 3.43068790435791


training:   8%|▊         | 851/10986 [35:22<6:33:45,  2.33s/it]

training loss: 3.3213157653808594


training:   8%|▊         | 852/10986 [35:24<6:22:22,  2.26s/it]

training loss: 3.4472901821136475


training:   8%|▊         | 853/10986 [35:27<6:32:35,  2.32s/it]

training loss: 3.3628249168395996


training:   8%|▊         | 854/10986 [35:29<6:20:54,  2.26s/it]

training loss: 3.3132903575897217


training:   8%|▊         | 855/10986 [35:31<6:29:47,  2.31s/it]

training loss: 3.5404067039489746


training:   8%|▊         | 856/10986 [35:33<6:18:55,  2.24s/it]

training loss: 3.3303887844085693


training:   8%|▊         | 857/10986 [35:36<6:28:43,  2.30s/it]

training loss: 3.3155620098114014


training:   8%|▊         | 858/10986 [35:38<6:17:43,  2.24s/it]

training loss: 3.5318758487701416


training:   8%|▊         | 859/10986 [35:40<6:26:21,  2.29s/it]

training loss: 3.2597293853759766


training:   8%|▊         | 860/10986 [35:42<6:18:34,  2.24s/it]

training loss: 3.455195665359497
valid loss: 3.2550861835479736
perplexity: 25.92184829711914


training:   8%|▊         | 861/10986 [35:47<8:06:12,  2.88s/it]

training loss: 3.3206965923309326


training:   8%|▊         | 862/10986 [35:49<7:43:50,  2.75s/it]

training loss: 3.25681209564209


training:   8%|▊         | 863/10986 [35:51<7:11:22,  2.56s/it]

training loss: 3.3027560710906982


training:   8%|▊         | 864/10986 [35:54<7:04:22,  2.52s/it]

training loss: 3.2540223598480225


training:   8%|▊         | 865/10986 [35:56<6:45:27,  2.40s/it]

training loss: 3.2878577709198


training:   8%|▊         | 866/10986 [35:58<6:47:18,  2.41s/it]

training loss: 3.3020503520965576


training:   8%|▊         | 867/10986 [36:00<6:33:30,  2.33s/it]

training loss: 3.4286117553710938


training:   8%|▊         | 868/10986 [36:03<6:38:51,  2.37s/it]

training loss: 3.339109182357788


training:   8%|▊         | 869/10986 [36:05<6:24:31,  2.28s/it]

training loss: 3.3524281978607178


training:   8%|▊         | 870/10986 [36:07<6:31:46,  2.32s/it]

training loss: 3.3869895935058594


training:   8%|▊         | 871/10986 [36:09<6:21:17,  2.26s/it]

training loss: 3.3224174976348877


training:   8%|▊         | 872/10986 [36:12<6:33:39,  2.34s/it]

training loss: 3.3391151428222656


training:   8%|▊         | 873/10986 [36:15<7:00:07,  2.49s/it]

training loss: 3.324782371520996


training:   8%|▊         | 874/10986 [36:17<6:55:35,  2.47s/it]

training loss: 3.4864675998687744


training:   8%|▊         | 875/10986 [36:19<6:36:55,  2.36s/it]

training loss: 3.400834560394287


training:   8%|▊         | 876/10986 [36:22<6:40:26,  2.38s/it]

training loss: 3.428105592727661


training:   8%|▊         | 877/10986 [36:24<6:25:38,  2.29s/it]

training loss: 3.340449094772339


training:   8%|▊         | 878/10986 [36:26<6:30:53,  2.32s/it]

training loss: 3.3134021759033203


training:   8%|▊         | 879/10986 [36:28<6:19:57,  2.26s/it]

training loss: 3.3797552585601807


training:   8%|▊         | 880/10986 [36:31<6:33:50,  2.34s/it]

training loss: 3.480712890625
valid loss: 3.41831111907959
perplexity: 30.51782989501953


training:   8%|▊         | 881/10986 [36:35<8:17:32,  2.95s/it]

training loss: 3.3324854373931885


training:   8%|▊         | 882/10986 [36:37<7:36:28,  2.71s/it]

training loss: 3.390080213546753


training:   8%|▊         | 883/10986 [36:40<7:24:07,  2.64s/it]

training loss: 3.341923952102661


training:   8%|▊         | 884/10986 [36:42<6:58:30,  2.49s/it]

training loss: 3.3845794200897217


training:   8%|▊         | 885/10986 [36:44<6:55:59,  2.47s/it]

training loss: 3.316504716873169


training:   8%|▊         | 886/10986 [36:47<6:39:38,  2.37s/it]

training loss: 3.456511974334717


training:   8%|▊         | 887/10986 [36:49<6:43:32,  2.40s/it]

training loss: 3.346118450164795


training:   8%|▊         | 888/10986 [36:51<6:29:15,  2.31s/it]

training loss: 3.297884225845337


training:   8%|▊         | 889/10986 [36:54<6:35:19,  2.35s/it]

training loss: 3.386728286743164


training:   8%|▊         | 890/10986 [36:56<6:23:38,  2.28s/it]

training loss: 3.388888120651245


training:   8%|▊         | 891/10986 [36:58<6:29:14,  2.31s/it]

training loss: 3.402527093887329


training:   8%|▊         | 892/10986 [37:00<6:17:58,  2.25s/it]

training loss: 3.417876958847046


training:   8%|▊         | 893/10986 [37:03<6:27:00,  2.30s/it]

training loss: 3.381676435470581


training:   8%|▊         | 894/10986 [37:05<6:18:49,  2.25s/it]

training loss: 3.3453047275543213


training:   8%|▊         | 895/10986 [37:07<6:24:24,  2.29s/it]

training loss: 3.371100902557373


training:   8%|▊         | 896/10986 [37:09<6:15:39,  2.23s/it]

training loss: 3.351407766342163


training:   8%|▊         | 897/10986 [37:12<6:24:32,  2.29s/it]

training loss: 3.271756649017334


training:   8%|▊         | 898/10986 [37:16<7:47:24,  2.78s/it]

training loss: 3.548548698425293


training:   8%|▊         | 899/10986 [37:18<7:33:53,  2.70s/it]

training loss: 3.3421154022216797


training:   8%|▊         | 900/10986 [37:20<7:04:02,  2.52s/it]

training loss: 3.390162229537964
valid loss: 3.2194652557373047
perplexity: 25.014738082885742


training:   8%|▊         | 901/10986 [37:25<9:04:38,  3.24s/it]

training loss: 3.287537097930908


training:   8%|▊         | 902/10986 [37:28<8:25:08,  3.01s/it]

training loss: 3.430565595626831


training:   8%|▊         | 903/10986 [37:30<7:39:29,  2.73s/it]

training loss: 3.334857940673828


training:   8%|▊         | 904/10986 [37:32<7:21:51,  2.63s/it]

training loss: 3.415471076965332


training:   8%|▊         | 905/10986 [37:34<6:58:13,  2.49s/it]

training loss: 3.3730480670928955


training:   8%|▊         | 906/10986 [37:37<6:57:20,  2.48s/it]

training loss: 3.3241658210754395


training:   8%|▊         | 907/10986 [37:39<6:37:51,  2.37s/it]

training loss: 3.3969998359680176


training:   8%|▊         | 908/10986 [37:41<6:41:07,  2.39s/it]

training loss: 3.2737226486206055


training:   8%|▊         | 909/10986 [37:43<6:29:47,  2.32s/it]

training loss: 3.3206098079681396


training:   8%|▊         | 910/10986 [37:46<6:34:39,  2.35s/it]

training loss: 3.377795457839966


training:   8%|▊         | 911/10986 [37:48<6:24:41,  2.29s/it]

training loss: 3.243717670440674


training:   8%|▊         | 912/10986 [37:50<6:31:15,  2.33s/it]

training loss: 3.325514078140259


training:   8%|▊         | 913/10986 [37:52<6:20:51,  2.27s/it]

training loss: 3.4914584159851074


training:   8%|▊         | 914/10986 [37:55<6:27:43,  2.31s/it]

training loss: 3.2961509227752686


training:   8%|▊         | 915/10986 [37:57<6:17:46,  2.25s/it]

training loss: 3.3155202865600586


training:   8%|▊         | 916/10986 [37:59<6:28:48,  2.32s/it]

training loss: 3.2839460372924805


training:   8%|▊         | 917/10986 [38:02<6:19:39,  2.26s/it]

training loss: 3.447110652923584


training:   8%|▊         | 918/10986 [38:04<6:27:44,  2.31s/it]

training loss: 3.270202398300171


training:   8%|▊         | 919/10986 [38:06<6:18:36,  2.26s/it]

training loss: 3.2554328441619873


training:   8%|▊         | 920/10986 [38:09<6:29:08,  2.32s/it]

training loss: 3.3852925300598145
valid loss: 3.5254788398742676
perplexity: 33.97003936767578


training:   8%|▊         | 921/10986 [38:13<8:33:00,  3.06s/it]

training loss: 3.471951484680176


training:   8%|▊         | 922/10986 [38:16<7:48:38,  2.79s/it]

training loss: 3.3978517055511475


training:   8%|▊         | 923/10986 [38:18<7:33:09,  2.70s/it]

training loss: 3.2582411766052246


training:   8%|▊         | 924/10986 [38:20<7:03:11,  2.52s/it]

training loss: 3.2847647666931152


training:   8%|▊         | 925/10986 [38:23<6:57:08,  2.49s/it]

training loss: 3.2841477394104004


training:   8%|▊         | 926/10986 [38:25<6:39:14,  2.38s/it]

training loss: 3.411438226699829


training:   8%|▊         | 927/10986 [38:27<6:41:51,  2.40s/it]

training loss: 3.266101837158203


training:   8%|▊         | 928/10986 [38:29<6:28:21,  2.32s/it]

training loss: 3.4699506759643555


training:   8%|▊         | 929/10986 [38:32<6:36:22,  2.36s/it]

training loss: 3.337472677230835


training:   8%|▊         | 930/10986 [38:34<6:22:03,  2.28s/it]

training loss: 3.295199394226074


training:   8%|▊         | 931/10986 [38:36<6:30:07,  2.33s/it]

training loss: 3.46474289894104


training:   8%|▊         | 932/10986 [38:38<6:20:47,  2.27s/it]

training loss: 3.455059051513672


training:   8%|▊         | 933/10986 [38:41<6:30:16,  2.33s/it]

training loss: 3.400831460952759


training:   9%|▊         | 934/10986 [38:43<6:21:46,  2.28s/it]

training loss: 3.312199592590332


training:   9%|▊         | 935/10986 [38:46<6:33:51,  2.35s/it]

training loss: 3.3213984966278076


training:   9%|▊         | 936/10986 [38:48<6:23:20,  2.29s/it]

training loss: 3.326104164123535


training:   9%|▊         | 937/10986 [38:50<6:38:29,  2.38s/it]

training loss: 3.27831768989563


training:   9%|▊         | 938/10986 [38:52<6:26:50,  2.31s/it]

training loss: 3.3603293895721436


training:   9%|▊         | 939/10986 [38:55<6:36:21,  2.37s/it]

training loss: 3.3405940532684326


training:   9%|▊         | 940/10986 [38:57<6:27:05,  2.31s/it]

training loss: 3.3900797367095947
valid loss: 3.2479801177978516
perplexity: 25.738300323486328


training:   9%|▊         | 941/10986 [39:02<8:15:15,  2.96s/it]

training loss: 3.3528072834014893


training:   9%|▊         | 942/10986 [39:04<7:55:55,  2.84s/it]

training loss: 3.4435739517211914


training:   9%|▊         | 943/10986 [39:06<7:19:52,  2.63s/it]

training loss: 3.329712390899658


training:   9%|▊         | 944/10986 [39:09<7:12:28,  2.58s/it]

training loss: 3.315577268600464


training:   9%|▊         | 945/10986 [39:11<6:51:04,  2.46s/it]

training loss: 3.408600330352783


training:   9%|▊         | 946/10986 [39:14<7:27:33,  2.67s/it]

training loss: 3.4381723403930664


training:   9%|▊         | 947/10986 [39:16<7:02:01,  2.52s/it]

training loss: 3.5389997959136963


training:   9%|▊         | 948/10986 [39:19<7:03:29,  2.53s/it]

training loss: 3.5091986656188965


training:   9%|▊         | 949/10986 [39:21<6:43:18,  2.41s/it]

training loss: 3.3247883319854736


training:   9%|▊         | 950/10986 [39:23<6:49:30,  2.45s/it]

training loss: 3.4323246479034424


training:   9%|▊         | 951/10986 [39:26<6:36:38,  2.37s/it]

training loss: 3.3698298931121826


training:   9%|▊         | 952/10986 [39:28<6:47:58,  2.44s/it]

training loss: 3.5203750133514404


training:   9%|▊         | 953/10986 [39:30<6:33:51,  2.36s/it]

training loss: 3.3690900802612305


training:   9%|▊         | 954/10986 [39:33<6:41:41,  2.40s/it]

training loss: 3.4655160903930664


training:   9%|▊         | 955/10986 [39:35<6:28:30,  2.32s/it]

training loss: 3.365241289138794


training:   9%|▊         | 956/10986 [39:38<6:39:34,  2.39s/it]

training loss: 3.495378255844116


training:   9%|▊         | 957/10986 [39:40<6:27:16,  2.32s/it]

training loss: 3.54664945602417


training:   9%|▊         | 958/10986 [39:42<6:38:10,  2.38s/it]

training loss: 3.3796463012695312


training:   9%|▊         | 959/10986 [39:44<6:26:20,  2.31s/it]

training loss: 3.47080659866333


training:   9%|▊         | 960/10986 [39:47<6:37:53,  2.38s/it]

training loss: 3.462707996368408
valid loss: 3.284339427947998
perplexity: 26.691347122192383


training:   9%|▊         | 961/10986 [39:52<8:23:35,  3.01s/it]

training loss: 3.380861759185791


training:   9%|▉         | 962/10986 [39:54<7:40:39,  2.76s/it]

training loss: 3.392345905303955


training:   9%|▉         | 963/10986 [39:56<7:29:42,  2.69s/it]

training loss: 3.374285936355591


training:   9%|▉         | 964/10986 [39:58<7:00:29,  2.52s/it]

training loss: 3.4971065521240234


training:   9%|▉         | 965/10986 [40:01<7:02:41,  2.53s/it]

training loss: 3.5304651260375977


training:   9%|▉         | 966/10986 [40:03<6:48:07,  2.44s/it]

training loss: 3.4256045818328857


training:   9%|▉         | 967/10986 [40:07<7:55:31,  2.85s/it]

training loss: 3.4254543781280518


training:   9%|▉         | 968/10986 [40:09<7:19:01,  2.63s/it]

training loss: 3.468167543411255


training:   9%|▉         | 969/10986 [40:12<7:13:59,  2.60s/it]

training loss: 3.4797043800354004


training:   9%|▉         | 970/10986 [40:14<7:04:57,  2.55s/it]

training loss: 3.5263960361480713


training:   9%|▉         | 971/10986 [40:17<7:05:50,  2.55s/it]

training loss: 3.575882911682129


training:   9%|▉         | 972/10986 [40:19<6:45:41,  2.43s/it]

training loss: 3.406083583831787


training:   9%|▉         | 973/10986 [40:21<6:54:00,  2.48s/it]

training loss: 3.5762455463409424


training:   9%|▉         | 974/10986 [40:23<6:36:28,  2.38s/it]

training loss: 3.7282192707061768


training:   9%|▉         | 975/10986 [40:26<6:49:17,  2.45s/it]

training loss: 3.5112500190734863


training:   9%|▉         | 976/10986 [40:28<6:31:47,  2.35s/it]

training loss: 3.567720413208008


training:   9%|▉         | 977/10986 [40:31<6:44:36,  2.43s/it]

training loss: 3.4386916160583496


training:   9%|▉         | 978/10986 [40:33<6:28:41,  2.33s/it]

training loss: 3.4943315982818604


training:   9%|▉         | 979/10986 [40:35<6:38:55,  2.39s/it]

training loss: 3.6052935123443604


training:   9%|▉         | 980/10986 [40:38<6:24:50,  2.31s/it]

training loss: 3.5931739807128906
valid loss: 3.4283695220947266
perplexity: 30.826339721679688


training:   9%|▉         | 981/10986 [40:42<8:16:48,  2.98s/it]

training loss: 3.741051435470581


training:   9%|▉         | 982/10986 [40:45<7:59:36,  2.88s/it]

training loss: 3.47988224029541


training:   9%|▉         | 983/10986 [40:47<7:21:51,  2.65s/it]

training loss: 3.513584852218628


training:   9%|▉         | 984/10986 [40:49<7:17:46,  2.63s/it]

training loss: 3.546180248260498


training:   9%|▉         | 985/10986 [40:52<6:52:28,  2.47s/it]

training loss: 3.4575116634368896


training:   9%|▉         | 986/10986 [40:54<6:57:34,  2.51s/it]

training loss: 3.492588520050049


training:   9%|▉         | 987/10986 [40:56<6:39:37,  2.40s/it]

training loss: 3.576916217803955


training:   9%|▉         | 988/10986 [40:59<6:47:55,  2.45s/it]

training loss: 3.581353187561035


training:   9%|▉         | 989/10986 [41:01<6:53:07,  2.48s/it]

training loss: 3.4310436248779297


training:   9%|▉         | 990/10986 [41:04<6:57:52,  2.51s/it]

training loss: 3.4283688068389893


training:   9%|▉         | 991/10986 [41:06<6:39:38,  2.40s/it]

training loss: 3.3915345668792725


training:   9%|▉         | 992/10986 [41:09<6:47:10,  2.44s/it]

training loss: 3.413515329360962


training:   9%|▉         | 993/10986 [41:11<6:32:34,  2.36s/it]

training loss: 3.366626262664795


training:   9%|▉         | 994/10986 [41:14<7:35:59,  2.74s/it]

training loss: 3.7816452980041504


training:   9%|▉         | 995/10986 [41:17<7:08:13,  2.57s/it]

training loss: 3.613309383392334


training:   9%|▉         | 996/10986 [41:19<7:08:51,  2.58s/it]

training loss: 3.5331785678863525


training:   9%|▉         | 997/10986 [41:21<6:48:22,  2.45s/it]

training loss: 3.651426315307617


training:   9%|▉         | 998/10986 [41:24<6:52:42,  2.48s/it]

training loss: 3.632882595062256


training:   9%|▉         | 999/10986 [41:26<6:37:07,  2.39s/it]

training loss: 3.4466781616210938


training:   9%|▉         | 1000/10986 [41:29<6:43:16,  2.42s/it]

training loss: 3.5991885662078857
valid loss: 3.426562547683716
perplexity: 30.770689010620117


training:   9%|▉         | 1001/10986 [41:33<8:38:48,  3.12s/it]

training loss: 3.3384668827056885


training:   9%|▉         | 1002/10986 [41:35<7:50:32,  2.83s/it]

training loss: 3.548130750656128


training:   9%|▉         | 1003/10986 [41:38<7:32:51,  2.72s/it]

training loss: 3.4997143745422363


training:   9%|▉         | 1004/10986 [41:40<7:02:04,  2.54s/it]

training loss: 3.528778553009033


training:   9%|▉         | 1005/10986 [41:43<7:01:55,  2.54s/it]

training loss: 3.4323782920837402


training:   9%|▉         | 1006/10986 [41:45<6:41:59,  2.42s/it]

training loss: 3.4335505962371826


training:   9%|▉         | 1007/10986 [41:47<6:45:17,  2.44s/it]

training loss: 3.400132179260254


training:   9%|▉         | 1008/10986 [41:49<6:30:57,  2.35s/it]

training loss: 3.3838515281677246


training:   9%|▉         | 1009/10986 [41:52<6:41:00,  2.41s/it]

training loss: 3.503671407699585


training:   9%|▉         | 1010/10986 [41:54<6:26:19,  2.32s/it]

training loss: 3.5873913764953613


training:   9%|▉         | 1011/10986 [41:56<6:34:05,  2.37s/it]

training loss: 3.4704651832580566


training:   9%|▉         | 1012/10986 [41:59<6:21:40,  2.30s/it]

training loss: 3.382533073425293


training:   9%|▉         | 1013/10986 [42:01<6:30:57,  2.35s/it]

training loss: 3.4002280235290527


training:   9%|▉         | 1014/10986 [42:03<6:20:19,  2.29s/it]

training loss: 3.282442092895508


training:   9%|▉         | 1015/10986 [42:06<6:29:26,  2.34s/it]

training loss: 3.368149995803833


training:   9%|▉         | 1016/10986 [42:08<6:16:08,  2.26s/it]

training loss: 3.496882915496826


training:   9%|▉         | 1017/10986 [42:10<6:28:39,  2.34s/it]

training loss: 3.423171281814575


training:   9%|▉         | 1018/10986 [42:13<6:42:01,  2.42s/it]

training loss: 3.476635217666626


training:   9%|▉         | 1019/10986 [42:16<7:28:53,  2.70s/it]

training loss: 3.4509270191192627


training:   9%|▉         | 1020/10986 [42:18<7:00:06,  2.53s/it]

training loss: 3.4736244678497314
valid loss: 3.3795535564422607
perplexity: 29.3576602935791


training:   9%|▉         | 1021/10986 [42:23<8:44:25,  3.16s/it]

training loss: 3.3550565242767334


training:   9%|▉         | 1022/10986 [42:26<8:16:22,  2.99s/it]

training loss: 3.27707839012146


training:   9%|▉         | 1023/10986 [42:28<7:35:06,  2.74s/it]

training loss: 3.2955288887023926


training:   9%|▉         | 1024/10986 [42:30<7:25:30,  2.68s/it]

training loss: 3.4293935298919678


training:   9%|▉         | 1025/10986 [42:32<6:57:14,  2.51s/it]

training loss: 3.397758722305298


training:   9%|▉         | 1026/10986 [42:35<7:01:01,  2.54s/it]

training loss: 3.5811994075775146


training:   9%|▉         | 1027/10986 [42:37<6:39:27,  2.41s/it]

training loss: 3.5152997970581055


training:   9%|▉         | 1028/10986 [42:40<6:44:10,  2.44s/it]

training loss: 3.4138400554656982


training:   9%|▉         | 1029/10986 [42:42<6:27:48,  2.34s/it]

training loss: 3.384382963180542


training:   9%|▉         | 1030/10986 [42:44<6:36:01,  2.39s/it]

training loss: 3.348526954650879


training:   9%|▉         | 1031/10986 [42:46<6:21:48,  2.30s/it]

training loss: 3.346144914627075


training:   9%|▉         | 1032/10986 [42:49<6:29:46,  2.35s/it]

training loss: 3.5218403339385986


training:   9%|▉         | 1033/10986 [42:51<6:18:21,  2.28s/it]

training loss: 3.340754747390747


training:   9%|▉         | 1034/10986 [42:54<6:35:26,  2.38s/it]

training loss: 3.545248508453369


training:   9%|▉         | 1035/10986 [42:57<7:10:31,  2.60s/it]

training loss: 3.491079330444336


training:   9%|▉         | 1036/10986 [42:59<7:13:17,  2.61s/it]

training loss: 3.423349142074585


training:   9%|▉         | 1037/10986 [43:01<6:48:13,  2.46s/it]

training loss: 3.5078816413879395


training:   9%|▉         | 1038/10986 [43:04<6:49:37,  2.47s/it]

training loss: 3.282214641571045


training:   9%|▉         | 1039/10986 [43:06<6:32:56,  2.37s/it]

training loss: 3.51859974861145


training:   9%|▉         | 1040/10986 [43:09<6:39:49,  2.41s/it]

training loss: 3.4798026084899902
valid loss: 3.5157973766326904
perplexity: 33.64274215698242


training:   9%|▉         | 1041/10986 [43:14<8:57:51,  3.24s/it]

training loss: 3.3470702171325684


training:   9%|▉         | 1042/10986 [43:16<8:04:26,  2.92s/it]

training loss: 3.4220755100250244


training:   9%|▉         | 1043/10986 [43:18<7:40:37,  2.78s/it]

training loss: 3.507110357284546


training:  10%|▉         | 1044/10986 [43:20<7:08:24,  2.59s/it]

training loss: 3.5532302856445312


training:  10%|▉         | 1045/10986 [43:23<7:02:35,  2.55s/it]

training loss: 3.3934216499328613


training:  10%|▉         | 1046/10986 [43:25<6:42:10,  2.43s/it]

training loss: 3.4961745738983154


training:  10%|▉         | 1047/10986 [43:28<6:42:12,  2.43s/it]

training loss: 3.4623067378997803


training:  10%|▉         | 1048/10986 [43:30<6:25:15,  2.33s/it]

training loss: 3.4097354412078857


training:  10%|▉         | 1049/10986 [43:32<6:29:47,  2.35s/it]

training loss: 3.383265495300293


training:  10%|▉         | 1050/10986 [43:34<6:19:08,  2.29s/it]

training loss: 3.3614139556884766


training:  10%|▉         | 1051/10986 [43:37<6:26:22,  2.33s/it]

training loss: 3.5089380741119385


training:  10%|▉         | 1052/10986 [43:39<6:14:17,  2.26s/it]

training loss: 3.38665771484375


training:  10%|▉         | 1053/10986 [43:41<6:24:10,  2.32s/it]

training loss: 3.525350570678711


training:  10%|▉         | 1054/10986 [43:43<6:13:25,  2.26s/it]

training loss: 3.4353864192962646


training:  10%|▉         | 1055/10986 [43:46<6:22:56,  2.31s/it]

training loss: 3.390040159225464


training:  10%|▉         | 1056/10986 [43:48<6:12:39,  2.25s/it]

training loss: 3.3538453578948975


training:  10%|▉         | 1057/10986 [43:50<6:23:01,  2.31s/it]

training loss: 3.4527206420898438


training:  10%|▉         | 1058/10986 [43:52<6:11:27,  2.24s/it]

training loss: 3.4972047805786133


training:  10%|▉         | 1059/10986 [43:55<6:21:14,  2.30s/it]

training loss: 3.5900654792785645


training:  10%|▉         | 1060/10986 [43:57<6:10:37,  2.24s/it]

training loss: 3.3222544193267822
valid loss: 3.4599688053131104
perplexity: 31.815982818603516


training:  10%|▉         | 1061/10986 [44:02<8:13:06,  2.98s/it]

training loss: 3.423304557800293


training:  10%|▉         | 1062/10986 [44:04<7:48:16,  2.83s/it]

training loss: 3.4681758880615234


training:  10%|▉         | 1063/10986 [44:06<7:13:17,  2.62s/it]

training loss: 3.341966152191162


training:  10%|▉         | 1064/10986 [44:09<7:02:58,  2.56s/it]

training loss: 3.5261454582214355


training:  10%|▉         | 1065/10986 [44:11<6:42:10,  2.43s/it]

training loss: 3.489966869354248


training:  10%|▉         | 1066/10986 [44:14<7:43:25,  2.80s/it]

training loss: 3.579648971557617


training:  10%|▉         | 1067/10986 [44:17<7:08:01,  2.59s/it]

training loss: 3.250882625579834


training:  10%|▉         | 1068/10986 [44:19<6:58:38,  2.53s/it]

training loss: 3.4229249954223633


training:  10%|▉         | 1069/10986 [44:21<6:35:55,  2.40s/it]

training loss: 3.4850821495056152


training:  10%|▉         | 1070/10986 [44:23<6:35:42,  2.39s/it]

training loss: 3.4898431301116943


training:  10%|▉         | 1071/10986 [44:26<6:23:58,  2.32s/it]

training loss: 3.4586658477783203


training:  10%|▉         | 1072/10986 [44:28<6:29:44,  2.36s/it]

training loss: 3.4266610145568848


training:  10%|▉         | 1073/10986 [44:30<6:18:25,  2.29s/it]

training loss: 3.414821147918701


training:  10%|▉         | 1074/10986 [44:33<6:24:13,  2.33s/it]

training loss: 3.415774345397949


training:  10%|▉         | 1075/10986 [44:35<6:12:57,  2.26s/it]

training loss: 3.3959624767303467


training:  10%|▉         | 1076/10986 [44:37<6:20:50,  2.31s/it]

training loss: 3.360036849975586


training:  10%|▉         | 1077/10986 [44:39<6:08:19,  2.23s/it]

training loss: 3.4927730560302734


training:  10%|▉         | 1078/10986 [44:41<6:16:22,  2.28s/it]

training loss: 3.383129835128784


training:  10%|▉         | 1079/10986 [44:44<6:07:36,  2.23s/it]

training loss: 3.359394073486328


training:  10%|▉         | 1080/10986 [44:46<6:17:51,  2.29s/it]

training loss: 3.5071070194244385
valid loss: 3.3639655113220215
perplexity: 28.903581619262695


training:  10%|▉         | 1081/10986 [44:50<8:02:41,  2.92s/it]

training loss: 3.447551727294922


training:  10%|▉         | 1082/10986 [44:53<7:24:29,  2.69s/it]

training loss: 3.4453656673431396


training:  10%|▉         | 1083/10986 [44:55<7:13:27,  2.63s/it]

training loss: 3.4972805976867676


training:  10%|▉         | 1084/10986 [44:57<6:47:03,  2.47s/it]

training loss: 3.3261148929595947


training:  10%|▉         | 1085/10986 [45:00<6:48:01,  2.47s/it]

training loss: 3.4949450492858887


training:  10%|▉         | 1086/10986 [45:02<6:28:24,  2.35s/it]

training loss: 3.5253467559814453


training:  10%|▉         | 1087/10986 [45:04<6:32:02,  2.38s/it]

training loss: 3.5297465324401855


training:  10%|▉         | 1088/10986 [45:06<6:18:10,  2.29s/it]

training loss: 3.3579697608947754


training:  10%|▉         | 1089/10986 [45:09<6:22:45,  2.32s/it]

training loss: 3.3818531036376953


training:  10%|▉         | 1090/10986 [45:11<6:11:12,  2.25s/it]

training loss: 3.4164576530456543


training:  10%|▉         | 1091/10986 [45:14<7:18:32,  2.66s/it]

training loss: 3.4749999046325684


training:  10%|▉         | 1092/10986 [45:16<6:51:08,  2.49s/it]

training loss: 3.4581027030944824


training:  10%|▉         | 1093/10986 [45:19<6:47:05,  2.47s/it]

training loss: 3.5372707843780518


training:  10%|▉         | 1094/10986 [45:21<6:26:50,  2.35s/it]

training loss: 3.6165342330932617


training:  10%|▉         | 1095/10986 [45:23<6:31:13,  2.37s/it]

training loss: 3.5395376682281494


training:  10%|▉         | 1096/10986 [45:25<6:18:37,  2.30s/it]

training loss: 3.4184961318969727


training:  10%|▉         | 1097/10986 [45:28<6:24:09,  2.33s/it]

training loss: 3.340407609939575


training:  10%|▉         | 1098/10986 [45:30<6:11:32,  2.25s/it]

training loss: 3.310365915298462


training:  10%|█         | 1099/10986 [45:32<6:19:45,  2.30s/it]

training loss: 3.4870448112487793


training:  10%|█         | 1100/10986 [45:34<6:09:31,  2.24s/it]

training loss: 3.424311876296997
valid loss: 3.4963221549987793
perplexity: 32.9938850402832


training:  10%|█         | 1101/10986 [45:39<8:09:04,  2.97s/it]

training loss: 3.3002443313598633


training:  10%|█         | 1102/10986 [45:42<7:44:55,  2.82s/it]

training loss: 3.518198013305664


training:  10%|█         | 1103/10986 [45:44<7:36:59,  2.77s/it]

training loss: 3.402956247329712


training:  10%|█         | 1104/10986 [45:47<7:58:55,  2.91s/it]

training loss: 3.5043466091156006


training:  10%|█         | 1105/10986 [45:50<7:21:41,  2.68s/it]

training loss: 3.4909396171569824


training:  10%|█         | 1106/10986 [45:52<7:09:09,  2.61s/it]

training loss: 3.4910926818847656


training:  10%|█         | 1107/10986 [45:54<6:44:33,  2.46s/it]

training loss: 3.457019805908203


training:  10%|█         | 1108/10986 [45:57<6:43:59,  2.45s/it]

training loss: 3.518056631088257


training:  10%|█         | 1109/10986 [45:59<6:26:49,  2.35s/it]

training loss: 3.499282121658325


training:  10%|█         | 1110/10986 [46:01<6:32:18,  2.38s/it]

training loss: 3.3987653255462646


training:  10%|█         | 1111/10986 [46:03<6:17:20,  2.29s/it]

training loss: 3.4625232219696045


training:  10%|█         | 1112/10986 [46:06<6:23:45,  2.33s/it]

training loss: 3.442781686782837


training:  10%|█         | 1113/10986 [46:08<6:12:07,  2.26s/it]

training loss: 3.4228885173797607


training:  10%|█         | 1114/10986 [46:10<6:22:29,  2.32s/it]

training loss: 3.480682611465454


training:  10%|█         | 1115/10986 [46:13<6:34:16,  2.40s/it]

training loss: 3.3902337551116943


training:  10%|█         | 1116/10986 [46:16<6:57:34,  2.54s/it]

training loss: 3.4871511459350586


training:  10%|█         | 1117/10986 [46:18<6:35:08,  2.40s/it]

training loss: 3.363795757293701


training:  10%|█         | 1118/10986 [46:20<6:38:54,  2.43s/it]

training loss: 3.346665143966675


training:  10%|█         | 1119/10986 [46:22<6:22:33,  2.33s/it]

training loss: 3.607534408569336


training:  10%|█         | 1120/10986 [46:25<6:28:09,  2.36s/it]

training loss: 3.4252355098724365
valid loss: 3.4995033740997314
perplexity: 33.09900665283203


training:  10%|█         | 1121/10986 [46:29<8:11:38,  2.99s/it]

training loss: 3.5410120487213135


training:  10%|█         | 1122/10986 [46:31<7:31:06,  2.74s/it]

training loss: 3.3935182094573975


training:  10%|█         | 1123/10986 [46:34<7:16:47,  2.66s/it]

training loss: 3.525092124938965


training:  10%|█         | 1124/10986 [46:36<6:48:23,  2.48s/it]

training loss: 3.603212594985962


training:  10%|█         | 1125/10986 [46:38<6:45:06,  2.46s/it]

training loss: 3.5064520835876465


training:  10%|█         | 1126/10986 [46:41<6:28:29,  2.36s/it]

training loss: 3.5179197788238525


training:  10%|█         | 1127/10986 [46:43<6:32:50,  2.39s/it]

training loss: 3.4580564498901367


training:  10%|█         | 1128/10986 [46:45<6:19:58,  2.31s/it]

training loss: 3.522803783416748


training:  10%|█         | 1129/10986 [46:48<6:26:04,  2.35s/it]

training loss: 3.4462051391601562


training:  10%|█         | 1130/10986 [46:50<6:13:33,  2.27s/it]

training loss: 3.490375518798828


training:  10%|█         | 1131/10986 [46:52<6:23:43,  2.34s/it]

training loss: 3.5424158573150635


training:  10%|█         | 1132/10986 [46:54<6:13:01,  2.27s/it]

training loss: 3.6890039443969727


training:  10%|█         | 1133/10986 [46:57<6:20:53,  2.32s/it]

training loss: 3.414735794067383


training:  10%|█         | 1134/10986 [46:59<6:12:43,  2.27s/it]

training loss: 3.4772653579711914


training:  10%|█         | 1135/10986 [47:02<6:39:26,  2.43s/it]

training loss: 3.632312059402466


training:  10%|█         | 1136/10986 [47:04<6:22:35,  2.33s/it]

training loss: 3.5292935371398926


training:  10%|█         | 1137/10986 [47:06<6:25:22,  2.35s/it]

training loss: 3.484821081161499


training:  10%|█         | 1138/10986 [47:08<6:12:58,  2.27s/it]

training loss: 3.525585889816284


training:  10%|█         | 1139/10986 [47:11<6:18:42,  2.31s/it]

training loss: 3.472843647003174


training:  10%|█         | 1140/10986 [47:13<6:46:24,  2.48s/it]

training loss: 3.636975049972534
valid loss: 3.490770101547241
perplexity: 32.81120300292969


training:  10%|█         | 1141/10986 [47:19<8:58:14,  3.28s/it]

training loss: 3.4273688793182373


training:  10%|█         | 1142/10986 [47:21<8:21:05,  3.05s/it]

training loss: 3.4549009799957275


training:  10%|█         | 1143/10986 [47:23<7:33:58,  2.77s/it]

training loss: 3.6269888877868652


training:  10%|█         | 1144/10986 [47:26<7:18:22,  2.67s/it]

training loss: 3.5721445083618164


training:  10%|█         | 1145/10986 [47:28<6:51:25,  2.51s/it]

training loss: 3.456173896789551


training:  10%|█         | 1146/10986 [47:30<6:47:18,  2.48s/it]

training loss: 3.4356086254119873


training:  10%|█         | 1147/10986 [47:32<6:31:49,  2.39s/it]

training loss: 3.5588197708129883


training:  10%|█         | 1148/10986 [47:35<6:32:30,  2.39s/it]

training loss: 3.4545419216156006


training:  10%|█         | 1149/10986 [47:37<6:18:16,  2.31s/it]

training loss: 3.493807792663574


training:  10%|█         | 1150/10986 [47:39<6:23:04,  2.34s/it]

training loss: 3.423541307449341


training:  10%|█         | 1151/10986 [47:41<6:12:24,  2.27s/it]

training loss: 3.4330766201019287


training:  10%|█         | 1152/10986 [47:44<6:19:13,  2.31s/it]

training loss: 3.499239444732666


training:  10%|█         | 1153/10986 [47:46<6:08:49,  2.25s/it]

training loss: 3.439091205596924


training:  11%|█         | 1154/10986 [47:48<6:18:18,  2.31s/it]

training loss: 3.4601755142211914


training:  11%|█         | 1155/10986 [47:51<6:08:35,  2.25s/it]

training loss: 3.5282528400421143


training:  11%|█         | 1156/10986 [47:53<6:17:25,  2.30s/it]

training loss: 3.460496425628662


training:  11%|█         | 1157/10986 [47:55<6:08:51,  2.25s/it]

training loss: 3.346463680267334


training:  11%|█         | 1158/10986 [47:58<6:18:45,  2.31s/it]

training loss: 3.552109718322754


training:  11%|█         | 1159/10986 [48:00<6:10:20,  2.26s/it]

training loss: 3.5153183937072754


training:  11%|█         | 1160/10986 [48:02<6:20:16,  2.32s/it]

training loss: 3.4508886337280273
valid loss: 3.367281198501587
perplexity: 28.999576568603516


training:  11%|█         | 1161/10986 [48:07<8:06:18,  2.97s/it]

training loss: 3.4664688110351562


training:  11%|█         | 1162/10986 [48:09<7:22:53,  2.71s/it]

training loss: 3.5399882793426514


training:  11%|█         | 1163/10986 [48:11<7:08:46,  2.62s/it]

training loss: 3.5154991149902344


training:  11%|█         | 1164/10986 [48:14<7:32:44,  2.77s/it]

training loss: 3.560065746307373


training:  11%|█         | 1165/10986 [48:17<7:25:48,  2.72s/it]

training loss: 3.610326051712036


training:  11%|█         | 1166/10986 [48:19<6:55:06,  2.54s/it]

training loss: 3.515373945236206


training:  11%|█         | 1167/10986 [48:21<6:53:04,  2.52s/it]

training loss: 3.4252803325653076


training:  11%|█         | 1168/10986 [48:24<6:33:57,  2.41s/it]

training loss: 3.5503275394439697


training:  11%|█         | 1169/10986 [48:26<6:37:42,  2.43s/it]

training loss: 3.5374765396118164


training:  11%|█         | 1170/10986 [48:28<6:22:18,  2.34s/it]

training loss: 3.432588577270508


training:  11%|█         | 1171/10986 [48:31<6:26:35,  2.36s/it]

training loss: 3.5071685314178467


training:  11%|█         | 1172/10986 [48:33<6:36:07,  2.42s/it]

training loss: 3.4357333183288574


training:  11%|█         | 1173/10986 [48:37<7:27:04,  2.73s/it]

training loss: 3.4849534034729004


training:  11%|█         | 1174/10986 [48:39<6:57:06,  2.55s/it]

training loss: 3.345733165740967


training:  11%|█         | 1175/10986 [48:41<6:52:36,  2.52s/it]

training loss: 3.4984967708587646


training:  11%|█         | 1176/10986 [48:43<6:31:53,  2.40s/it]

training loss: 3.6116747856140137


training:  11%|█         | 1177/10986 [48:46<6:33:57,  2.41s/it]

training loss: 3.4563117027282715


training:  11%|█         | 1178/10986 [48:48<6:19:17,  2.32s/it]

training loss: 3.4784152507781982


training:  11%|█         | 1179/10986 [48:50<6:24:44,  2.35s/it]

training loss: 3.4989655017852783


training:  11%|█         | 1180/10986 [48:52<6:11:52,  2.28s/it]

training loss: 3.3741466999053955
valid loss: 3.4260106086730957
perplexity: 30.75370979309082


training:  11%|█         | 1181/10986 [48:57<8:04:30,  2.96s/it]

training loss: 3.4600658416748047


training:  11%|█         | 1182/10986 [49:00<7:42:50,  2.83s/it]

training loss: 3.549475908279419


training:  11%|█         | 1183/10986 [49:02<7:07:12,  2.61s/it]

training loss: 3.4349358081817627


training:  11%|█         | 1184/10986 [49:04<6:57:56,  2.56s/it]

training loss: 3.5100150108337402


training:  11%|█         | 1185/10986 [49:06<6:35:18,  2.42s/it]

training loss: 3.455012321472168


training:  11%|█         | 1186/10986 [49:09<6:35:35,  2.42s/it]

training loss: 3.593244791030884


training:  11%|█         | 1187/10986 [49:11<6:19:46,  2.33s/it]

training loss: 3.426875114440918


training:  11%|█         | 1188/10986 [49:14<7:04:57,  2.60s/it]

training loss: 3.475159168243408


training:  11%|█         | 1189/10986 [49:16<6:42:46,  2.47s/it]

training loss: 3.5817196369171143


training:  11%|█         | 1190/10986 [49:18<6:40:48,  2.45s/it]

training loss: 3.4823780059814453


training:  11%|█         | 1191/10986 [49:21<6:21:36,  2.34s/it]

training loss: 3.498373031616211


training:  11%|█         | 1192/10986 [49:23<6:25:39,  2.36s/it]

training loss: 3.447490692138672


training:  11%|█         | 1193/10986 [49:25<6:15:17,  2.30s/it]

training loss: 3.5794153213500977


training:  11%|█         | 1194/10986 [49:28<6:24:16,  2.35s/it]

training loss: 3.5252294540405273


training:  11%|█         | 1195/10986 [49:30<6:13:18,  2.29s/it]

training loss: 3.578660011291504


training:  11%|█         | 1196/10986 [49:32<6:21:02,  2.34s/it]

training loss: 3.485560178756714


training:  11%|█         | 1197/10986 [49:34<6:09:11,  2.26s/it]

training loss: 3.3814480304718018


training:  11%|█         | 1198/10986 [49:37<6:18:53,  2.32s/it]

training loss: 3.5408108234405518


training:  11%|█         | 1199/10986 [49:39<6:07:34,  2.25s/it]

training loss: 3.563359260559082


training:  11%|█         | 1200/10986 [49:41<6:18:23,  2.32s/it]

training loss: 3.4808595180511475
valid loss: 3.3245346546173096
perplexity: 27.78606605529785


training:  11%|█         | 1201/10986 [49:46<8:18:04,  3.05s/it]

training loss: 3.4410977363586426


training:  11%|█         | 1202/10986 [49:48<7:35:16,  2.79s/it]

training loss: 3.474485397338867


training:  11%|█         | 1203/10986 [49:51<7:22:32,  2.71s/it]

training loss: 3.635359525680542


training:  11%|█         | 1204/10986 [49:53<6:55:23,  2.55s/it]

training loss: 3.3284413814544678


training:  11%|█         | 1205/10986 [49:55<6:51:34,  2.52s/it]

training loss: 3.344369649887085


training:  11%|█         | 1206/10986 [49:58<6:32:13,  2.41s/it]

training loss: 3.363710641860962


training:  11%|█         | 1207/10986 [50:00<6:35:32,  2.43s/it]

training loss: 3.3158130645751953


training:  11%|█         | 1208/10986 [50:02<6:20:38,  2.34s/it]

training loss: 3.3218400478363037


training:  11%|█         | 1209/10986 [50:05<6:24:22,  2.36s/it]

training loss: 3.3054449558258057


training:  11%|█         | 1210/10986 [50:07<6:12:42,  2.29s/it]

training loss: 3.4172849655151367


training:  11%|█         | 1211/10986 [50:09<6:21:00,  2.34s/it]

training loss: 3.3985419273376465


training:  11%|█         | 1212/10986 [50:11<6:10:05,  2.27s/it]

training loss: 3.4129350185394287


training:  11%|█         | 1213/10986 [50:15<7:05:15,  2.61s/it]

training loss: 3.4758317470550537


training:  11%|█         | 1214/10986 [50:17<6:43:01,  2.47s/it]

training loss: 3.341392993927002


training:  11%|█         | 1215/10986 [50:19<6:40:43,  2.46s/it]

training loss: 3.4012093544006348


training:  11%|█         | 1216/10986 [50:21<6:22:05,  2.35s/it]

training loss: 3.5030293464660645


training:  11%|█         | 1217/10986 [50:24<6:28:52,  2.39s/it]

training loss: 3.3589673042297363


training:  11%|█         | 1218/10986 [50:26<6:15:59,  2.31s/it]

training loss: 3.4860129356384277


training:  11%|█         | 1219/10986 [50:28<6:25:55,  2.37s/it]

training loss: 3.4200408458709717


training:  11%|█         | 1220/10986 [50:31<6:13:26,  2.29s/it]

training loss: 3.2778987884521484
valid loss: 3.389925718307495
perplexity: 29.663747787475586


training:  11%|█         | 1221/10986 [50:35<8:01:12,  2.96s/it]

training loss: 3.5342185497283936


training:  11%|█         | 1222/10986 [50:38<7:43:19,  2.85s/it]

training loss: 3.228473663330078


training:  11%|█         | 1223/10986 [50:40<7:07:28,  2.63s/it]

training loss: 3.469534397125244


training:  11%|█         | 1224/10986 [50:42<6:57:21,  2.57s/it]

training loss: 3.3985214233398438


training:  11%|█         | 1225/10986 [50:44<6:35:09,  2.43s/it]

training loss: 3.3037941455841064


training:  11%|█         | 1226/10986 [50:47<6:35:34,  2.43s/it]

training loss: 3.515657424926758


training:  11%|█         | 1227/10986 [50:49<6:19:33,  2.33s/it]

training loss: 3.3750243186950684


training:  11%|█         | 1228/10986 [50:51<6:24:49,  2.37s/it]

training loss: 3.460587501525879


training:  11%|█         | 1229/10986 [50:53<6:12:57,  2.29s/it]

training loss: 3.596026659011841


training:  11%|█         | 1230/10986 [50:56<6:19:33,  2.33s/it]

training loss: 3.5273077487945557


training:  11%|█         | 1231/10986 [50:58<6:07:22,  2.26s/it]

training loss: 3.3904221057891846


training:  11%|█         | 1232/10986 [51:00<6:17:13,  2.32s/it]

training loss: 3.432122230529785


training:  11%|█         | 1233/10986 [51:03<6:06:51,  2.26s/it]

training loss: 3.414276123046875


training:  11%|█         | 1234/10986 [51:05<6:16:05,  2.31s/it]

training loss: 3.430642604827881


training:  11%|█         | 1235/10986 [51:07<6:06:55,  2.26s/it]

training loss: 3.414013624191284


training:  11%|█▏        | 1236/10986 [51:10<6:16:13,  2.32s/it]

training loss: 3.3397631645202637


training:  11%|█▏        | 1237/10986 [51:12<6:07:12,  2.26s/it]

training loss: 3.4955482482910156


training:  11%|█▏        | 1238/10986 [51:15<6:50:34,  2.53s/it]

training loss: 3.442188024520874


training:  11%|█▏        | 1239/10986 [51:17<6:30:05,  2.40s/it]

training loss: 3.3582510948181152


training:  11%|█▏        | 1240/10986 [51:19<6:33:53,  2.42s/it]

training loss: 3.4004509449005127
valid loss: 3.398216962814331
perplexity: 29.910720825195312


training:  11%|█▏        | 1241/10986 [51:25<9:21:14,  3.46s/it]

training loss: 3.420666217803955


training:  11%|█▏        | 1242/10986 [51:27<8:19:48,  3.08s/it]

training loss: 3.339374303817749


training:  11%|█▏        | 1243/10986 [51:30<7:52:02,  2.91s/it]

training loss: 3.2679717540740967


training:  11%|█▏        | 1244/10986 [51:32<7:13:41,  2.67s/it]

training loss: 3.4366137981414795


training:  11%|█▏        | 1245/10986 [51:35<7:04:15,  2.61s/it]

training loss: 3.373669385910034


training:  11%|█▏        | 1246/10986 [51:37<6:40:31,  2.47s/it]

training loss: 3.4263806343078613


training:  11%|█▏        | 1247/10986 [51:39<6:39:31,  2.46s/it]

training loss: 3.3659780025482178


training:  11%|█▏        | 1248/10986 [51:41<6:22:19,  2.36s/it]

training loss: 3.5274431705474854


training:  11%|█▏        | 1249/10986 [51:44<6:26:43,  2.38s/it]

training loss: 3.3473408222198486


training:  11%|█▏        | 1250/10986 [51:46<6:13:51,  2.30s/it]

training loss: 3.4187498092651367


training:  11%|█▏        | 1251/10986 [51:48<6:19:49,  2.34s/it]

training loss: 3.4478886127471924


training:  11%|█▏        | 1252/10986 [51:50<6:10:18,  2.28s/it]

training loss: 3.3671998977661133


training:  11%|█▏        | 1253/10986 [51:53<6:17:33,  2.33s/it]

training loss: 3.324568748474121


training:  11%|█▏        | 1254/10986 [51:55<6:07:52,  2.27s/it]

training loss: 3.379378080368042


training:  11%|█▏        | 1255/10986 [51:57<6:16:16,  2.32s/it]

training loss: 3.419743299484253


training:  11%|█▏        | 1256/10986 [52:00<6:07:15,  2.26s/it]

training loss: 3.2786943912506104


training:  11%|█▏        | 1257/10986 [52:02<6:14:12,  2.31s/it]

training loss: 3.4022274017333984


training:  11%|█▏        | 1258/10986 [52:04<6:04:56,  2.25s/it]

training loss: 3.3982291221618652


training:  11%|█▏        | 1259/10986 [52:06<6:13:43,  2.31s/it]

training loss: 3.366743564605713


training:  11%|█▏        | 1260/10986 [52:09<6:03:22,  2.24s/it]

training loss: 3.405343532562256
valid loss: 3.3353524208068848
perplexity: 28.088279724121094


training:  11%|█▏        | 1261/10986 [52:14<8:20:03,  3.09s/it]

training loss: 3.2714881896972656


training:  11%|█▏        | 1262/10986 [52:16<7:53:53,  2.92s/it]

training loss: 3.5123229026794434


training:  11%|█▏        | 1263/10986 [52:18<7:13:46,  2.68s/it]

training loss: 3.448176383972168


training:  12%|█▏        | 1264/10986 [52:21<7:02:15,  2.61s/it]

training loss: 3.5391476154327393


training:  12%|█▏        | 1265/10986 [52:23<6:36:30,  2.45s/it]

training loss: 3.4166388511657715


training:  12%|█▏        | 1266/10986 [52:25<6:35:14,  2.44s/it]

training loss: 3.4133830070495605


training:  12%|█▏        | 1267/10986 [52:27<6:18:22,  2.34s/it]

training loss: 3.400054454803467


training:  12%|█▏        | 1268/10986 [52:30<6:25:37,  2.38s/it]

training loss: 3.441004514694214


training:  12%|█▏        | 1269/10986 [52:32<6:12:14,  2.30s/it]

training loss: 3.5612189769744873


training:  12%|█▏        | 1270/10986 [52:34<6:19:56,  2.35s/it]

training loss: 3.4036712646484375


training:  12%|█▏        | 1271/10986 [52:36<6:08:28,  2.28s/it]

training loss: 3.401308059692383


training:  12%|█▏        | 1272/10986 [52:39<6:16:51,  2.33s/it]

training loss: 3.4143073558807373


training:  12%|█▏        | 1273/10986 [52:41<6:09:48,  2.28s/it]

training loss: 3.335789442062378


training:  12%|█▏        | 1274/10986 [52:44<6:20:44,  2.35s/it]

training loss: 3.4082469940185547


training:  12%|█▏        | 1275/10986 [52:46<6:09:06,  2.28s/it]

training loss: 3.473968982696533


training:  12%|█▏        | 1276/10986 [52:48<6:18:43,  2.34s/it]

training loss: 3.397587537765503


training:  12%|█▏        | 1277/10986 [52:50<6:10:16,  2.29s/it]

training loss: 3.3750574588775635


training:  12%|█▏        | 1278/10986 [52:53<6:17:06,  2.33s/it]

training loss: 3.4819130897521973


training:  12%|█▏        | 1279/10986 [52:55<6:06:59,  2.27s/it]

training loss: 3.513766288757324


training:  12%|█▏        | 1280/10986 [52:57<6:15:02,  2.32s/it]

training loss: 3.3684074878692627
valid loss: 3.32621169090271
perplexity: 27.83270263671875


training:  12%|█▏        | 1281/10986 [53:02<7:59:54,  2.97s/it]

training loss: 3.500962257385254


training:  12%|█▏        | 1282/10986 [53:04<7:22:27,  2.74s/it]

training loss: 3.433077573776245


training:  12%|█▏        | 1283/10986 [53:06<7:08:43,  2.65s/it]

training loss: 3.301246404647827


training:  12%|█▏        | 1284/10986 [53:09<6:41:19,  2.48s/it]

training loss: 3.4609556198120117


training:  12%|█▏        | 1285/10986 [53:11<6:40:58,  2.48s/it]

training loss: 3.2923433780670166


training:  12%|█▏        | 1286/10986 [53:14<7:08:08,  2.65s/it]

training loss: 3.481390953063965


training:  12%|█▏        | 1287/10986 [53:17<6:57:33,  2.58s/it]

training loss: 3.2702276706695557


training:  12%|█▏        | 1288/10986 [53:19<6:33:49,  2.44s/it]

training loss: 3.547804117202759


training:  12%|█▏        | 1289/10986 [53:21<6:32:19,  2.43s/it]

training loss: 3.5303115844726562


training:  12%|█▏        | 1290/10986 [53:23<6:16:38,  2.33s/it]

training loss: 3.5002219676971436


training:  12%|█▏        | 1291/10986 [53:26<6:22:16,  2.37s/it]

training loss: 3.3466615676879883


training:  12%|█▏        | 1292/10986 [53:28<6:09:18,  2.29s/it]

training loss: 3.478728771209717


training:  12%|█▏        | 1293/10986 [53:30<6:17:52,  2.34s/it]

training loss: 3.4418177604675293


training:  12%|█▏        | 1294/10986 [53:32<6:06:26,  2.27s/it]

training loss: 3.510035991668701


training:  12%|█▏        | 1295/10986 [53:35<6:15:22,  2.32s/it]

training loss: 3.5695643424987793


training:  12%|█▏        | 1296/10986 [53:37<6:06:16,  2.27s/it]

training loss: 3.559117555618286


training:  12%|█▏        | 1297/10986 [53:39<6:15:03,  2.32s/it]

training loss: 3.5010361671447754


training:  12%|█▏        | 1298/10986 [53:41<6:06:16,  2.27s/it]

training loss: 3.6355247497558594


training:  12%|█▏        | 1299/10986 [53:44<6:14:46,  2.32s/it]

training loss: 3.3989086151123047


training:  12%|█▏        | 1300/10986 [53:46<6:05:07,  2.26s/it]

training loss: 3.4207942485809326
valid loss: 3.5059306621551514
perplexity: 33.312435150146484


training:  12%|█▏        | 1301/10986 [53:51<8:02:26,  2.99s/it]

training loss: 3.429138422012329


training:  12%|█▏        | 1302/10986 [53:53<7:41:46,  2.86s/it]

training loss: 3.411750316619873


training:  12%|█▏        | 1303/10986 [53:55<7:07:46,  2.65s/it]

training loss: 3.374730110168457


training:  12%|█▏        | 1304/10986 [53:58<6:58:06,  2.59s/it]

training loss: 3.345377206802368


training:  12%|█▏        | 1305/10986 [54:00<6:36:25,  2.46s/it]

training loss: 3.344268321990967


training:  12%|█▏        | 1306/10986 [54:02<6:37:24,  2.46s/it]

training loss: 3.3212597370147705


training:  12%|█▏        | 1307/10986 [54:05<6:22:06,  2.37s/it]

training loss: 3.2981834411621094


training:  12%|█▏        | 1308/10986 [54:07<6:25:45,  2.39s/it]

training loss: 3.566354751586914


training:  12%|█▏        | 1309/10986 [54:09<6:13:01,  2.31s/it]

training loss: 3.441593647003174


training:  12%|█▏        | 1310/10986 [54:13<7:20:05,  2.73s/it]

training loss: 3.32222056388855


training:  12%|█▏        | 1311/10986 [54:16<7:15:22,  2.70s/it]

training loss: 3.461681604385376


training:  12%|█▏        | 1312/10986 [54:18<7:03:16,  2.63s/it]

training loss: 3.4299771785736084


training:  12%|█▏        | 1313/10986 [54:20<6:39:52,  2.48s/it]

training loss: 3.4951956272125244


training:  12%|█▏        | 1314/10986 [54:23<6:41:54,  2.49s/it]

training loss: 3.3914406299591064


training:  12%|█▏        | 1315/10986 [54:25<6:24:01,  2.38s/it]

training loss: 3.433908700942993


training:  12%|█▏        | 1316/10986 [54:27<6:32:29,  2.44s/it]

training loss: 3.393963098526001


training:  12%|█▏        | 1317/10986 [54:29<6:17:45,  2.34s/it]

training loss: 3.363532543182373


training:  12%|█▏        | 1318/10986 [54:32<6:24:46,  2.39s/it]

training loss: 3.3840062618255615


training:  12%|█▏        | 1319/10986 [54:34<6:12:29,  2.31s/it]

training loss: 3.3715224266052246


training:  12%|█▏        | 1320/10986 [54:37<6:21:56,  2.37s/it]

training loss: 3.4170403480529785
valid loss: 3.4625377655029297
perplexity: 31.897823333740234


training:  12%|█▏        | 1321/10986 [54:41<8:01:04,  2.99s/it]

training loss: 3.4326376914978027


training:  12%|█▏        | 1322/10986 [54:43<7:23:28,  2.75s/it]

training loss: 3.559291124343872


training:  12%|█▏        | 1323/10986 [54:46<7:11:01,  2.68s/it]

training loss: 3.435016632080078


training:  12%|█▏        | 1324/10986 [54:48<6:45:52,  2.52s/it]

training loss: 3.4749045372009277


training:  12%|█▏        | 1325/10986 [54:50<6:44:05,  2.51s/it]

training loss: 3.431452512741089


training:  12%|█▏        | 1326/10986 [54:53<6:26:48,  2.40s/it]

training loss: 3.4080028533935547


training:  12%|█▏        | 1327/10986 [54:55<6:33:25,  2.44s/it]

training loss: 3.405715227127075


training:  12%|█▏        | 1328/10986 [54:57<6:17:50,  2.35s/it]

training loss: 3.3521132469177246


training:  12%|█▏        | 1329/10986 [55:00<6:22:53,  2.38s/it]

training loss: 3.357870101928711


training:  12%|█▏        | 1330/10986 [55:02<6:12:22,  2.31s/it]

training loss: 3.5033655166625977


training:  12%|█▏        | 1331/10986 [55:04<6:24:37,  2.39s/it]

training loss: 3.3715450763702393


training:  12%|█▏        | 1332/10986 [55:06<6:11:54,  2.31s/it]

training loss: 3.404827356338501


training:  12%|█▏        | 1333/10986 [55:09<6:18:16,  2.35s/it]

training loss: 3.490260124206543


training:  12%|█▏        | 1334/10986 [55:11<6:06:57,  2.28s/it]

training loss: 3.355830192565918


training:  12%|█▏        | 1335/10986 [55:14<6:32:08,  2.44s/it]

training loss: 3.5396227836608887


training:  12%|█▏        | 1336/10986 [55:16<6:17:17,  2.35s/it]

training loss: 3.4288272857666016


training:  12%|█▏        | 1337/10986 [55:18<6:21:24,  2.37s/it]

training loss: 3.404355525970459


training:  12%|█▏        | 1338/10986 [55:21<6:07:19,  2.28s/it]

training loss: 3.295170307159424


training:  12%|█▏        | 1339/10986 [55:23<6:14:00,  2.33s/it]

training loss: 3.341681480407715


training:  12%|█▏        | 1340/10986 [55:25<6:07:03,  2.28s/it]

training loss: 3.421769857406616
valid loss: 3.4156529903411865
perplexity: 30.436817169189453


training:  12%|█▏        | 1341/10986 [55:30<7:52:17,  2.94s/it]

training loss: 3.3574113845825195


training:  12%|█▏        | 1342/10986 [55:32<7:34:50,  2.83s/it]

training loss: 3.436699867248535


training:  12%|█▏        | 1343/10986 [55:34<6:59:50,  2.61s/it]

training loss: 3.4014947414398193


training:  12%|█▏        | 1344/10986 [55:37<6:54:10,  2.58s/it]

training loss: 3.389479398727417


training:  12%|█▏        | 1345/10986 [55:39<6:32:29,  2.44s/it]

training loss: 3.3529465198516846


training:  12%|█▏        | 1346/10986 [55:41<6:34:27,  2.46s/it]

training loss: 3.4747653007507324


training:  12%|█▏        | 1347/10986 [55:43<6:17:33,  2.35s/it]

training loss: 3.4453299045562744


training:  12%|█▏        | 1348/10986 [55:46<6:23:25,  2.39s/it]

training loss: 3.484004497528076


training:  12%|█▏        | 1349/10986 [55:48<6:11:02,  2.31s/it]

training loss: 3.5024008750915527


training:  12%|█▏        | 1350/10986 [55:51<6:20:14,  2.37s/it]

training loss: 3.3480048179626465


training:  12%|█▏        | 1351/10986 [55:53<6:08:42,  2.30s/it]

training loss: 3.4087302684783936


training:  12%|█▏        | 1352/10986 [55:55<6:19:43,  2.36s/it]

training loss: 3.7277350425720215


training:  12%|█▏        | 1353/10986 [55:57<6:08:13,  2.29s/it]

training loss: 3.4373719692230225


training:  12%|█▏        | 1354/10986 [56:00<6:15:19,  2.34s/it]

training loss: 3.5567214488983154


training:  12%|█▏        | 1355/10986 [56:02<6:05:50,  2.28s/it]

training loss: 3.684269428253174


training:  12%|█▏        | 1356/10986 [56:04<6:13:09,  2.33s/it]

training loss: 3.4062001705169678


training:  12%|█▏        | 1357/10986 [56:07<6:06:00,  2.28s/it]

training loss: 3.5576672554016113


training:  12%|█▏        | 1358/10986 [56:09<6:12:43,  2.32s/it]

training loss: 3.465599536895752


training:  12%|█▏        | 1359/10986 [56:11<6:02:59,  2.26s/it]

training loss: 3.4841108322143555


training:  12%|█▏        | 1360/10986 [56:14<6:25:12,  2.40s/it]

training loss: 3.617260217666626
valid loss: 3.584045886993408
perplexity: 36.01897430419922


training:  12%|█▏        | 1361/10986 [56:18<8:02:10,  3.01s/it]

training loss: 3.5107502937316895


training:  12%|█▏        | 1362/10986 [56:20<7:23:10,  2.76s/it]

training loss: 3.510348081588745


training:  12%|█▏        | 1363/10986 [56:23<7:09:18,  2.68s/it]

training loss: 3.4439897537231445


training:  12%|█▏        | 1364/10986 [56:25<6:42:05,  2.51s/it]

training loss: 3.4383721351623535


training:  12%|█▏        | 1365/10986 [56:27<6:39:53,  2.49s/it]

training loss: 3.437809944152832


training:  12%|█▏        | 1366/10986 [56:30<6:23:00,  2.39s/it]

training loss: 3.388607978820801


training:  12%|█▏        | 1367/10986 [56:32<6:27:47,  2.42s/it]

training loss: 3.6398777961730957


training:  12%|█▏        | 1368/10986 [56:34<6:12:18,  2.32s/it]

training loss: 3.485391855239868


training:  12%|█▏        | 1369/10986 [56:37<6:21:52,  2.38s/it]

training loss: 3.5167040824890137


training:  12%|█▏        | 1370/10986 [56:39<6:09:19,  2.30s/it]

training loss: 3.3615827560424805


training:  12%|█▏        | 1371/10986 [56:41<6:16:58,  2.35s/it]

training loss: 3.570946216583252


training:  12%|█▏        | 1372/10986 [56:43<6:06:46,  2.29s/it]

training loss: 3.4560799598693848


training:  12%|█▏        | 1373/10986 [56:46<6:17:08,  2.35s/it]

training loss: 3.6347367763519287


training:  13%|█▎        | 1374/10986 [56:48<6:08:06,  2.30s/it]

training loss: 3.325644016265869


training:  13%|█▎        | 1375/10986 [56:51<6:17:01,  2.35s/it]

training loss: 3.3984177112579346


training:  13%|█▎        | 1376/10986 [56:53<6:05:14,  2.28s/it]

training loss: 3.443452835083008


training:  13%|█▎        | 1377/10986 [56:55<6:15:31,  2.34s/it]

training loss: 3.4934234619140625


training:  13%|█▎        | 1378/10986 [56:57<6:06:04,  2.29s/it]

training loss: 3.4949166774749756


training:  13%|█▎        | 1379/10986 [57:01<6:47:33,  2.55s/it]

training loss: 3.443369150161743


training:  13%|█▎        | 1380/10986 [57:03<7:04:00,  2.65s/it]

training loss: 3.5364673137664795
valid loss: 3.4273557662963867
perplexity: 30.79510498046875


training:  13%|█▎        | 1381/10986 [57:08<8:33:21,  3.21s/it]

training loss: 3.369612455368042


training:  13%|█▎        | 1382/10986 [57:10<8:00:19,  3.00s/it]

training loss: 3.429542064666748


training:  13%|█▎        | 1383/10986 [57:13<7:41:12,  2.88s/it]

training loss: 3.313633918762207


training:  13%|█▎        | 1384/10986 [57:16<7:43:49,  2.90s/it]

training loss: 3.4984467029571533


training:  13%|█▎        | 1385/10986 [57:18<7:06:22,  2.66s/it]

training loss: 3.5802767276763916


training:  13%|█▎        | 1386/10986 [57:21<6:55:34,  2.60s/it]

training loss: 3.4180045127868652


training:  13%|█▎        | 1387/10986 [57:23<6:32:47,  2.46s/it]

training loss: 3.3387575149536133


training:  13%|█▎        | 1388/10986 [57:25<6:33:37,  2.46s/it]

training loss: 3.4412379264831543


training:  13%|█▎        | 1389/10986 [57:27<6:18:01,  2.36s/it]

training loss: 3.492105722427368


training:  13%|█▎        | 1390/10986 [57:30<6:23:58,  2.40s/it]

training loss: 3.4351470470428467


training:  13%|█▎        | 1391/10986 [57:32<6:09:52,  2.31s/it]

training loss: 3.4543468952178955


training:  13%|█▎        | 1392/10986 [57:34<6:15:49,  2.35s/it]

training loss: 3.4151663780212402


training:  13%|█▎        | 1393/10986 [57:36<6:04:45,  2.28s/it]

training loss: 3.4030542373657227


training:  13%|█▎        | 1394/10986 [57:39<6:13:02,  2.33s/it]

training loss: 3.434269666671753


training:  13%|█▎        | 1395/10986 [57:41<6:01:48,  2.26s/it]

training loss: 3.3777709007263184


training:  13%|█▎        | 1396/10986 [57:44<6:15:44,  2.35s/it]

training loss: 3.422065496444702


training:  13%|█▎        | 1397/10986 [57:46<6:03:45,  2.28s/it]

training loss: 3.6103644371032715


training:  13%|█▎        | 1398/10986 [57:48<6:12:35,  2.33s/it]

training loss: 3.476407051086426


training:  13%|█▎        | 1399/10986 [57:50<6:04:16,  2.28s/it]

training loss: 3.3675246238708496


training:  13%|█▎        | 1400/10986 [57:53<6:14:02,  2.34s/it]

training loss: 3.438887596130371
valid loss: 3.473670244216919
perplexity: 32.25490951538086


training:  13%|█▎        | 1401/10986 [57:58<8:11:03,  3.07s/it]

training loss: 3.449913501739502


training:  13%|█▎        | 1402/10986 [58:00<7:27:46,  2.80s/it]

training loss: 3.40693998336792


training:  13%|█▎        | 1403/10986 [58:02<7:10:34,  2.70s/it]

training loss: 3.516918182373047


training:  13%|█▎        | 1404/10986 [58:04<6:44:17,  2.53s/it]

training loss: 3.5787699222564697


training:  13%|█▎        | 1405/10986 [58:07<6:39:32,  2.50s/it]

training loss: 3.4757330417633057


training:  13%|█▎        | 1406/10986 [58:09<6:22:00,  2.39s/it]

training loss: 3.5022764205932617


training:  13%|█▎        | 1407/10986 [58:11<6:26:14,  2.42s/it]

training loss: 3.4770891666412354


training:  13%|█▎        | 1408/10986 [58:14<6:26:38,  2.42s/it]

training loss: 3.46659779548645


training:  13%|█▎        | 1409/10986 [58:16<6:27:48,  2.43s/it]

training loss: 3.603757858276367


training:  13%|█▎        | 1410/10986 [58:18<6:13:46,  2.34s/it]

training loss: 3.6107304096221924


training:  13%|█▎        | 1411/10986 [58:21<6:19:12,  2.38s/it]

training loss: 3.3836352825164795


training:  13%|█▎        | 1412/10986 [58:23<6:07:31,  2.30s/it]

training loss: 3.4873764514923096


training:  13%|█▎        | 1413/10986 [58:25<6:14:37,  2.35s/it]

training loss: 3.4229226112365723


training:  13%|█▎        | 1414/10986 [58:28<6:03:03,  2.28s/it]

training loss: 3.776456594467163


training:  13%|█▎        | 1415/10986 [58:30<6:11:22,  2.33s/it]

training loss: 3.418684720993042


training:  13%|█▎        | 1416/10986 [58:32<6:02:44,  2.27s/it]

training loss: 3.56912899017334


training:  13%|█▎        | 1417/10986 [58:35<6:08:53,  2.31s/it]

training loss: 3.5304551124572754


training:  13%|█▎        | 1418/10986 [58:37<5:58:48,  2.25s/it]

training loss: 3.584622383117676


training:  13%|█▎        | 1419/10986 [58:39<6:09:04,  2.31s/it]

training loss: 3.4562814235687256


training:  13%|█▎        | 1420/10986 [58:41<6:01:25,  2.27s/it]

training loss: 3.5731754302978516
valid loss: 3.449373722076416
perplexity: 31.480669021606445


training:  13%|█▎        | 1421/10986 [58:46<7:46:21,  2.93s/it]

training loss: 3.4002411365509033


training:  13%|█▎        | 1422/10986 [58:48<7:25:09,  2.79s/it]

training loss: 3.3541431427001953


training:  13%|█▎        | 1423/10986 [58:50<6:53:24,  2.59s/it]

training loss: 3.505615472793579


training:  13%|█▎        | 1424/10986 [58:53<6:45:33,  2.54s/it]

training loss: 3.471342086791992


training:  13%|█▎        | 1425/10986 [58:55<6:24:44,  2.41s/it]

training loss: 3.5432050228118896


training:  13%|█▎        | 1426/10986 [58:57<6:25:27,  2.42s/it]

training loss: 3.668050765991211


training:  13%|█▎        | 1427/10986 [59:00<6:32:43,  2.47s/it]

training loss: 3.5480422973632812


training:  13%|█▎        | 1428/10986 [59:03<7:11:48,  2.71s/it]

training loss: 3.5748202800750732


training:  13%|█▎        | 1429/10986 [59:05<6:44:09,  2.54s/it]

training loss: 3.581988573074341


training:  13%|█▎        | 1430/10986 [59:08<6:37:58,  2.50s/it]

training loss: 3.4788765907287598


training:  13%|█▎        | 1431/10986 [59:10<6:19:43,  2.38s/it]

training loss: 3.413032054901123


training:  13%|█▎        | 1432/10986 [59:13<6:36:17,  2.49s/it]

training loss: 3.403888463973999


training:  13%|█▎        | 1433/10986 [59:16<7:00:11,  2.64s/it]

training loss: 3.4646215438842773


training:  13%|█▎        | 1434/10986 [59:18<6:50:23,  2.58s/it]

training loss: 3.4886648654937744


training:  13%|█▎        | 1435/10986 [59:20<6:28:59,  2.44s/it]

training loss: 3.4343817234039307


training:  13%|█▎        | 1436/10986 [59:23<6:29:49,  2.45s/it]

training loss: 3.3914968967437744


training:  13%|█▎        | 1437/10986 [59:25<6:14:05,  2.35s/it]

training loss: 3.5399725437164307


training:  13%|█▎        | 1438/10986 [59:27<6:16:44,  2.37s/it]

training loss: 3.473616361618042


training:  13%|█▎        | 1439/10986 [59:29<6:08:44,  2.32s/it]

training loss: 3.4344444274902344


training:  13%|█▎        | 1440/10986 [59:32<6:13:41,  2.35s/it]

training loss: 3.4822723865509033
valid loss: 3.5538933277130127
perplexity: 34.94912338256836


training:  13%|█▎        | 1441/10986 [59:36<7:57:13,  3.00s/it]

training loss: 3.399188756942749


training:  13%|█▎        | 1442/10986 [59:38<7:18:39,  2.76s/it]

training loss: 3.4336979389190674


training:  13%|█▎        | 1443/10986 [59:41<7:04:56,  2.67s/it]

training loss: 3.534975528717041


training:  13%|█▎        | 1444/10986 [59:43<6:41:18,  2.52s/it]

training loss: 3.4821531772613525


training:  13%|█▎        | 1445/10986 [59:45<6:36:40,  2.49s/it]

training loss: 3.644254684448242


training:  13%|█▎        | 1446/10986 [59:48<6:43:51,  2.54s/it]

training loss: 3.4798426628112793


training:  13%|█▎        | 1447/10986 [59:51<7:19:48,  2.77s/it]

training loss: 3.4843385219573975


training:  13%|█▎        | 1448/10986 [59:54<6:50:33,  2.58s/it]

training loss: 3.4526567459106445


training:  13%|█▎        | 1449/10986 [59:56<6:43:03,  2.54s/it]

training loss: 3.4964168071746826


training:  13%|█▎        | 1450/10986 [59:58<6:23:25,  2.41s/it]

training loss: 3.4940524101257324


training:  13%|█▎        | 1451/10986 [1:00:01<6:22:43,  2.41s/it]

training loss: 3.480870246887207


training:  13%|█▎        | 1452/10986 [1:00:03<6:07:54,  2.32s/it]

training loss: 3.6245484352111816


training:  13%|█▎        | 1453/10986 [1:00:05<6:13:53,  2.35s/it]

training loss: 3.5661709308624268


training:  13%|█▎        | 1454/10986 [1:00:07<6:03:32,  2.29s/it]

training loss: 3.4075050354003906


training:  13%|█▎        | 1455/10986 [1:00:10<6:11:14,  2.34s/it]

training loss: 3.3674991130828857


training:  13%|█▎        | 1456/10986 [1:00:12<6:00:56,  2.27s/it]

training loss: 3.413228750228882


training:  13%|█▎        | 1457/10986 [1:00:16<7:25:03,  2.80s/it]

training loss: 3.4984729290008545


training:  13%|█▎        | 1458/10986 [1:00:18<6:51:18,  2.59s/it]

training loss: 3.4262640476226807


training:  13%|█▎        | 1459/10986 [1:00:20<6:43:40,  2.54s/it]

training loss: 3.3723089694976807


training:  13%|█▎        | 1460/10986 [1:00:22<6:22:54,  2.41s/it]

training loss: 3.458805561065674
valid loss: 3.333590269088745
perplexity: 28.038827896118164


training:  13%|█▎        | 1461/10986 [1:00:27<7:58:28,  3.01s/it]

training loss: 3.547602415084839


training:  13%|█▎        | 1462/10986 [1:00:29<7:33:47,  2.86s/it]

training loss: 3.5279276371002197


training:  13%|█▎        | 1463/10986 [1:00:31<6:58:16,  2.64s/it]

training loss: 3.601506233215332


training:  13%|█▎        | 1464/10986 [1:00:34<6:46:37,  2.56s/it]

training loss: 3.5059714317321777


training:  13%|█▎        | 1465/10986 [1:00:36<6:25:30,  2.43s/it]

training loss: 3.4333224296569824


training:  13%|█▎        | 1466/10986 [1:00:38<6:24:04,  2.42s/it]

training loss: 3.5929293632507324


training:  13%|█▎        | 1467/10986 [1:00:41<6:09:23,  2.33s/it]

training loss: 3.4574196338653564


training:  13%|█▎        | 1468/10986 [1:00:43<6:13:57,  2.36s/it]

training loss: 3.469334125518799


training:  13%|█▎        | 1469/10986 [1:00:45<6:04:08,  2.30s/it]

training loss: 3.51291823387146


training:  13%|█▎        | 1470/10986 [1:00:47<6:09:45,  2.33s/it]

training loss: 3.399885416030884


training:  13%|█▎        | 1471/10986 [1:00:50<5:59:09,  2.26s/it]

training loss: 3.347085952758789


training:  13%|█▎        | 1472/10986 [1:00:52<6:05:50,  2.31s/it]

training loss: 3.6176650524139404


training:  13%|█▎        | 1473/10986 [1:00:54<5:57:17,  2.25s/it]

training loss: 3.4704413414001465


training:  13%|█▎        | 1474/10986 [1:00:57<6:06:49,  2.31s/it]

training loss: 3.4013476371765137


training:  13%|█▎        | 1475/10986 [1:00:59<5:58:02,  2.26s/it]

training loss: 3.4642364978790283


training:  13%|█▎        | 1476/10986 [1:01:01<6:06:15,  2.31s/it]

training loss: 3.466108798980713


training:  13%|█▎        | 1477/10986 [1:01:03<5:56:49,  2.25s/it]

training loss: 3.424727439880371


training:  13%|█▎        | 1478/10986 [1:01:06<6:04:26,  2.30s/it]

training loss: 3.4364521503448486


training:  13%|█▎        | 1479/10986 [1:01:08<5:55:14,  2.24s/it]

training loss: 3.5673305988311768


training:  13%|█▎        | 1480/10986 [1:01:10<6:03:01,  2.29s/it]

training loss: 3.469956398010254
valid loss: 3.4479482173919678
perplexity: 31.435827255249023


training:  13%|█▎        | 1481/10986 [1:01:15<8:24:51,  3.19s/it]

training loss: 3.563309907913208


training:  13%|█▎        | 1482/10986 [1:01:18<7:36:21,  2.88s/it]

training loss: 3.626687526702881


training:  13%|█▎        | 1483/10986 [1:01:20<7:13:32,  2.74s/it]

training loss: 3.564455509185791


training:  14%|█▎        | 1484/10986 [1:01:22<6:43:39,  2.55s/it]

training loss: 3.3794960975646973


training:  14%|█▎        | 1485/10986 [1:01:25<6:36:43,  2.51s/it]

training loss: 3.5068840980529785


training:  14%|█▎        | 1486/10986 [1:01:27<6:19:15,  2.40s/it]

training loss: 3.3239169120788574


training:  14%|█▎        | 1487/10986 [1:01:29<6:19:47,  2.40s/it]

training loss: 3.5899765491485596


training:  14%|█▎        | 1488/10986 [1:01:31<6:08:36,  2.33s/it]

training loss: 3.4358270168304443


training:  14%|█▎        | 1489/10986 [1:01:34<6:13:12,  2.36s/it]

training loss: 3.5974626541137695


training:  14%|█▎        | 1490/10986 [1:01:36<6:00:57,  2.28s/it]

training loss: 3.5385537147521973


training:  14%|█▎        | 1491/10986 [1:01:38<6:08:15,  2.33s/it]

training loss: 3.5217511653900146


training:  14%|█▎        | 1492/10986 [1:01:40<5:59:52,  2.27s/it]

training loss: 3.3942131996154785


training:  14%|█▎        | 1493/10986 [1:01:43<6:06:31,  2.32s/it]

training loss: 3.4236044883728027


training:  14%|█▎        | 1494/10986 [1:01:45<5:57:02,  2.26s/it]

training loss: 3.468088150024414


training:  14%|█▎        | 1495/10986 [1:01:47<6:06:10,  2.31s/it]

training loss: 3.409670829772949


training:  14%|█▎        | 1496/10986 [1:01:49<5:56:46,  2.26s/it]

training loss: 3.6802940368652344


training:  14%|█▎        | 1497/10986 [1:01:52<6:03:49,  2.30s/it]

training loss: 3.4747860431671143


training:  14%|█▎        | 1498/10986 [1:01:54<5:55:23,  2.25s/it]

training loss: 3.5300140380859375


training:  14%|█▎        | 1499/10986 [1:01:56<6:06:33,  2.32s/it]

training loss: 3.435378074645996


training:  14%|█▎        | 1500/10986 [1:01:59<5:57:44,  2.26s/it]

training loss: 3.387843132019043
valid loss: 3.358336925506592
perplexity: 28.741350173950195


training:  14%|█▎        | 1501/10986 [1:02:03<7:53:50,  3.00s/it]

training loss: 3.5623488426208496


training:  14%|█▎        | 1502/10986 [1:02:06<7:31:00,  2.85s/it]

training loss: 3.54445219039917


training:  14%|█▎        | 1503/10986 [1:02:08<6:56:12,  2.63s/it]

training loss: 3.6292665004730225


training:  14%|█▎        | 1504/10986 [1:02:10<6:48:14,  2.58s/it]

training loss: 3.36419415473938


training:  14%|█▎        | 1505/10986 [1:02:13<6:50:57,  2.60s/it]

training loss: 3.483999490737915


training:  14%|█▎        | 1506/10986 [1:02:17<8:08:03,  3.09s/it]

training loss: 3.3832921981811523


training:  14%|█▎        | 1507/10986 [1:02:20<7:57:34,  3.02s/it]

training loss: 3.5975069999694824


training:  14%|█▎        | 1508/10986 [1:02:23<7:29:46,  2.85s/it]

training loss: 3.5318870544433594


training:  14%|█▎        | 1509/10986 [1:02:25<7:17:16,  2.77s/it]

training loss: 3.3248918056488037


training:  14%|█▎        | 1510/10986 [1:02:28<7:23:31,  2.81s/it]

training loss: 3.5358877182006836


training:  14%|█▍        | 1511/10986 [1:02:30<6:51:01,  2.60s/it]

training loss: 3.6044137477874756


training:  14%|█▍        | 1512/10986 [1:02:33<6:42:47,  2.55s/it]

training loss: 3.349834680557251


training:  14%|█▍        | 1513/10986 [1:02:35<6:22:12,  2.42s/it]

training loss: 3.3676648139953613


training:  14%|█▍        | 1514/10986 [1:02:40<8:18:03,  3.15s/it]

training loss: 3.5427844524383545


training:  14%|█▍        | 1515/10986 [1:02:43<8:36:14,  3.27s/it]

training loss: 3.4549686908721924


training:  14%|█▍        | 1516/10986 [1:02:46<7:57:47,  3.03s/it]

training loss: 3.4510645866394043


training:  14%|█▍        | 1517/10986 [1:02:48<7:15:12,  2.76s/it]

training loss: 3.4832019805908203


training:  14%|█▍        | 1518/10986 [1:02:50<7:00:26,  2.66s/it]

training loss: 3.4334259033203125


training:  14%|█▍        | 1519/10986 [1:02:52<6:36:27,  2.51s/it]

training loss: 3.4891879558563232


training:  14%|█▍        | 1520/10986 [1:02:55<6:33:51,  2.50s/it]

training loss: 3.544130802154541
valid loss: 3.619549512863159
perplexity: 37.32075119018555


training:  14%|█▍        | 1521/10986 [1:03:00<8:22:50,  3.19s/it]

training loss: 3.3761870861053467


training:  14%|█▍        | 1522/10986 [1:03:02<7:35:25,  2.89s/it]

training loss: 3.447762966156006


training:  14%|█▍        | 1523/10986 [1:03:04<7:15:33,  2.76s/it]

training loss: 3.495692253112793


training:  14%|█▍        | 1524/10986 [1:03:06<6:45:26,  2.57s/it]

training loss: 3.573892831802368


training:  14%|█▍        | 1525/10986 [1:03:09<6:39:47,  2.54s/it]

training loss: 3.5195436477661133


training:  14%|█▍        | 1526/10986 [1:03:11<6:19:53,  2.41s/it]

training loss: 3.538986921310425


training:  14%|█▍        | 1527/10986 [1:03:14<6:53:05,  2.62s/it]

training loss: 3.3325207233428955


training:  14%|█▍        | 1528/10986 [1:03:16<6:31:14,  2.48s/it]

training loss: 3.4944119453430176


training:  14%|█▍        | 1529/10986 [1:03:19<6:28:02,  2.46s/it]

training loss: 3.4660401344299316


training:  14%|█▍        | 1530/10986 [1:03:21<6:11:21,  2.36s/it]

training loss: 3.4866199493408203


training:  14%|█▍        | 1531/10986 [1:03:23<6:13:33,  2.37s/it]

training loss: 3.417637825012207


training:  14%|█▍        | 1532/10986 [1:03:25<6:01:50,  2.30s/it]

training loss: 3.3965423107147217


training:  14%|█▍        | 1533/10986 [1:03:28<6:09:18,  2.34s/it]

training loss: 3.392592191696167


training:  14%|█▍        | 1534/10986 [1:03:30<5:59:26,  2.28s/it]

training loss: 3.556962490081787


training:  14%|█▍        | 1535/10986 [1:03:32<6:04:01,  2.31s/it]

training loss: 3.3940541744232178


training:  14%|█▍        | 1536/10986 [1:03:34<5:55:38,  2.26s/it]

training loss: 3.369173049926758


training:  14%|█▍        | 1537/10986 [1:03:37<6:03:13,  2.31s/it]

training loss: 3.6496410369873047


training:  14%|█▍        | 1538/10986 [1:03:39<5:53:28,  2.24s/it]

training loss: 3.444786310195923


training:  14%|█▍        | 1539/10986 [1:03:41<6:00:36,  2.29s/it]

training loss: 3.4569742679595947


training:  14%|█▍        | 1540/10986 [1:03:43<5:53:47,  2.25s/it]

training loss: 3.407478094100952
valid loss: 3.4598450660705566
perplexity: 31.812047958374023


training:  14%|█▍        | 1541/10986 [1:03:48<7:40:09,  2.92s/it]

training loss: 3.463136672973633


training:  14%|█▍        | 1542/10986 [1:03:51<7:22:48,  2.81s/it]

training loss: 3.501248598098755


training:  14%|█▍        | 1543/10986 [1:03:53<6:49:30,  2.60s/it]

training loss: 3.481995105743408


training:  14%|█▍        | 1544/10986 [1:03:55<6:42:34,  2.56s/it]

training loss: 3.5898921489715576


training:  14%|█▍        | 1545/10986 [1:03:57<6:21:40,  2.43s/it]

training loss: 3.660313844680786


training:  14%|█▍        | 1546/10986 [1:04:00<6:22:57,  2.43s/it]

training loss: 3.5789477825164795


training:  14%|█▍        | 1547/10986 [1:04:02<6:09:49,  2.35s/it]

training loss: 3.5683631896972656


training:  14%|█▍        | 1548/10986 [1:04:04<6:12:00,  2.36s/it]

training loss: 3.421037197113037


training:  14%|█▍        | 1549/10986 [1:04:06<6:00:13,  2.29s/it]

training loss: 3.454874277114868


training:  14%|█▍        | 1550/10986 [1:04:09<6:03:52,  2.31s/it]

training loss: 3.4044361114501953


training:  14%|█▍        | 1551/10986 [1:04:11<5:55:47,  2.26s/it]

training loss: 3.551776647567749


training:  14%|█▍        | 1552/10986 [1:04:14<6:22:14,  2.43s/it]

training loss: 3.420771598815918


training:  14%|█▍        | 1553/10986 [1:04:16<6:07:03,  2.33s/it]

training loss: 3.658771276473999


training:  14%|█▍        | 1554/10986 [1:04:18<6:13:26,  2.38s/it]

training loss: 3.4409842491149902


training:  14%|█▍        | 1555/10986 [1:04:20<6:01:18,  2.30s/it]

training loss: 3.420775890350342


training:  14%|█▍        | 1556/10986 [1:04:23<6:07:57,  2.34s/it]

training loss: 3.517185926437378


training:  14%|█▍        | 1557/10986 [1:04:25<6:19:23,  2.41s/it]

training loss: 3.491215705871582


training:  14%|█▍        | 1558/10986 [1:04:28<6:20:13,  2.42s/it]

training loss: 3.5856311321258545


training:  14%|█▍        | 1559/10986 [1:04:30<6:06:28,  2.33s/it]

training loss: 3.5398595333099365


training:  14%|█▍        | 1560/10986 [1:04:32<6:12:48,  2.37s/it]

training loss: 3.6161766052246094
valid loss: 3.540475606918335
perplexity: 34.48331832885742


training:  14%|█▍        | 1561/10986 [1:04:37<7:52:51,  3.01s/it]

training loss: 3.4685022830963135


training:  14%|█▍        | 1562/10986 [1:04:39<7:12:53,  2.76s/it]

training loss: 3.5518367290496826


training:  14%|█▍        | 1563/10986 [1:04:42<6:59:49,  2.67s/it]

training loss: 3.5389976501464844


training:  14%|█▍        | 1564/10986 [1:04:44<6:33:29,  2.51s/it]

training loss: 3.4507923126220703


training:  14%|█▍        | 1565/10986 [1:04:46<6:33:27,  2.51s/it]

training loss: 3.5724236965179443


training:  14%|█▍        | 1566/10986 [1:04:48<6:14:58,  2.39s/it]

training loss: 3.4540505409240723


training:  14%|█▍        | 1567/10986 [1:04:51<6:17:26,  2.40s/it]

training loss: 3.5125107765197754


training:  14%|█▍        | 1568/10986 [1:04:53<6:03:17,  2.31s/it]

training loss: 3.5096168518066406


training:  14%|█▍        | 1569/10986 [1:04:55<6:10:38,  2.36s/it]

training loss: 3.3815407752990723


training:  14%|█▍        | 1570/10986 [1:04:57<5:57:44,  2.28s/it]

training loss: 3.2585718631744385


training:  14%|█▍        | 1571/10986 [1:05:00<6:05:43,  2.33s/it]

training loss: 3.4552910327911377


training:  14%|█▍        | 1572/10986 [1:05:02<5:55:48,  2.27s/it]

training loss: 3.3858485221862793


training:  14%|█▍        | 1573/10986 [1:05:04<6:04:01,  2.32s/it]

training loss: 3.390841484069824


training:  14%|█▍        | 1574/10986 [1:05:07<5:54:06,  2.26s/it]

training loss: 3.407398223876953


training:  14%|█▍        | 1575/10986 [1:05:09<6:01:36,  2.31s/it]

training loss: 3.438340187072754


training:  14%|█▍        | 1576/10986 [1:05:11<5:52:53,  2.25s/it]

training loss: 3.438248634338379


training:  14%|█▍        | 1577/10986 [1:05:15<6:56:01,  2.65s/it]

training loss: 3.3883068561553955


training:  14%|█▍        | 1578/10986 [1:05:17<6:32:37,  2.50s/it]

training loss: 3.5361642837524414


training:  14%|█▍        | 1579/10986 [1:05:19<6:29:01,  2.48s/it]

training loss: 3.417250394821167


training:  14%|█▍        | 1580/10986 [1:05:21<6:10:52,  2.37s/it]

training loss: 3.5664823055267334
valid loss: 3.3636932373046875
perplexity: 28.895713806152344


training:  14%|█▍        | 1581/10986 [1:05:27<8:56:44,  3.42s/it]

training loss: 3.524031639099121


training:  14%|█▍        | 1582/10986 [1:05:30<8:13:34,  3.15s/it]

training loss: 3.574204444885254


training:  14%|█▍        | 1583/10986 [1:05:32<7:24:26,  2.84s/it]

training loss: 3.4084312915802


training:  14%|█▍        | 1584/10986 [1:05:34<7:07:12,  2.73s/it]

training loss: 3.3928236961364746


training:  14%|█▍        | 1585/10986 [1:05:36<6:37:28,  2.54s/it]

training loss: 3.512040615081787


training:  14%|█▍        | 1586/10986 [1:05:39<6:32:36,  2.51s/it]

training loss: 3.4518778324127197


training:  14%|█▍        | 1587/10986 [1:05:41<6:14:50,  2.39s/it]

training loss: 3.4343066215515137


training:  14%|█▍        | 1588/10986 [1:05:43<6:16:46,  2.41s/it]

training loss: 3.4137649536132812


training:  14%|█▍        | 1589/10986 [1:05:46<6:03:21,  2.32s/it]

training loss: 3.4248008728027344


training:  14%|█▍        | 1590/10986 [1:05:48<6:10:57,  2.37s/it]

training loss: 3.5894978046417236


training:  14%|█▍        | 1591/10986 [1:05:50<5:58:38,  2.29s/it]

training loss: 3.4521656036376953


training:  14%|█▍        | 1592/10986 [1:05:53<6:06:13,  2.34s/it]

training loss: 3.5380196571350098


training:  15%|█▍        | 1593/10986 [1:05:55<5:56:46,  2.28s/it]

training loss: 3.5766806602478027


training:  15%|█▍        | 1594/10986 [1:05:57<6:02:49,  2.32s/it]

training loss: 3.5172479152679443


training:  15%|█▍        | 1595/10986 [1:05:59<5:53:37,  2.26s/it]

training loss: 3.6390457153320312


training:  15%|█▍        | 1596/10986 [1:06:02<6:02:57,  2.32s/it]

training loss: 3.4982593059539795


training:  15%|█▍        | 1597/10986 [1:06:04<5:52:43,  2.25s/it]

training loss: 3.5702898502349854


training:  15%|█▍        | 1598/10986 [1:06:06<6:03:04,  2.32s/it]

training loss: 3.3803815841674805


training:  15%|█▍        | 1599/10986 [1:06:08<5:52:46,  2.25s/it]

training loss: 3.5249998569488525


training:  15%|█▍        | 1600/10986 [1:06:11<6:00:54,  2.31s/it]

training loss: 3.5492451190948486
valid loss: 3.401215076446533
perplexity: 30.000532150268555


training:  15%|█▍        | 1601/10986 [1:06:18<10:02:22,  3.85s/it]

training loss: 3.438256025314331


training:  15%|█▍        | 1602/10986 [1:06:20<8:44:38,  3.35s/it] 

training loss: 3.42820405960083


training:  15%|█▍        | 1603/10986 [1:06:23<8:02:44,  3.09s/it]

training loss: 3.295098066329956


training:  15%|█▍        | 1604/10986 [1:06:25<7:20:34,  2.82s/it]

training loss: 3.293731212615967


training:  15%|█▍        | 1605/10986 [1:06:28<7:05:46,  2.72s/it]

training loss: 3.441788911819458


training:  15%|█▍        | 1606/10986 [1:06:30<6:36:49,  2.54s/it]

training loss: 3.4528958797454834


training:  15%|█▍        | 1607/10986 [1:06:32<6:32:30,  2.51s/it]

training loss: 3.500361919403076


training:  15%|█▍        | 1608/10986 [1:06:34<6:15:11,  2.40s/it]

training loss: 3.376307249069214


training:  15%|█▍        | 1609/10986 [1:06:37<6:16:05,  2.41s/it]

training loss: 3.563173532485962


training:  15%|█▍        | 1610/10986 [1:06:39<6:01:13,  2.31s/it]

training loss: 3.4309325218200684


training:  15%|█▍        | 1611/10986 [1:06:41<6:06:03,  2.34s/it]

training loss: 3.404849052429199


training:  15%|█▍        | 1612/10986 [1:06:43<5:56:15,  2.28s/it]

training loss: 3.3918533325195312


training:  15%|█▍        | 1613/10986 [1:06:46<6:03:43,  2.33s/it]

training loss: 3.37695050239563


training:  15%|█▍        | 1614/10986 [1:06:48<5:52:21,  2.26s/it]

training loss: 3.341146469116211


training:  15%|█▍        | 1615/10986 [1:06:50<6:00:52,  2.31s/it]

training loss: 3.4061453342437744


training:  15%|█▍        | 1616/10986 [1:06:52<5:51:38,  2.25s/it]

training loss: 3.3950958251953125


training:  15%|█▍        | 1617/10986 [1:06:55<6:00:00,  2.31s/it]

training loss: 3.453052520751953


training:  15%|█▍        | 1618/10986 [1:06:57<5:51:19,  2.25s/it]

training loss: 3.420048236846924


training:  15%|█▍        | 1619/10986 [1:06:59<6:00:45,  2.31s/it]

training loss: 3.304161548614502


training:  15%|█▍        | 1620/10986 [1:07:02<5:50:54,  2.25s/it]

training loss: 3.4479339122772217
valid loss: 3.431047201156616
perplexity: 30.908994674682617


training:  15%|█▍        | 1621/10986 [1:07:06<7:56:34,  3.05s/it]

training loss: 3.603912830352783


training:  15%|█▍        | 1622/10986 [1:07:09<7:29:40,  2.88s/it]

training loss: 3.524458885192871


training:  15%|█▍        | 1623/10986 [1:07:11<6:55:05,  2.66s/it]

training loss: 3.432630777359009


training:  15%|█▍        | 1624/10986 [1:07:15<7:51:39,  3.02s/it]

training loss: 3.497511625289917


training:  15%|█▍        | 1625/10986 [1:07:17<7:06:41,  2.73s/it]

training loss: 3.56851863861084


training:  15%|█▍        | 1626/10986 [1:07:20<6:54:20,  2.66s/it]

training loss: 3.367392063140869


training:  15%|█▍        | 1627/10986 [1:07:22<6:28:45,  2.49s/it]

training loss: 3.3504300117492676


training:  15%|█▍        | 1628/10986 [1:07:24<6:24:56,  2.47s/it]

training loss: 3.649754285812378


training:  15%|█▍        | 1629/10986 [1:07:26<6:07:20,  2.36s/it]

training loss: 3.3880982398986816


training:  15%|█▍        | 1630/10986 [1:07:29<6:12:17,  2.39s/it]

training loss: 3.491874933242798


training:  15%|█▍        | 1631/10986 [1:07:31<5:59:28,  2.31s/it]

training loss: 3.4490087032318115


training:  15%|█▍        | 1632/10986 [1:07:33<6:05:01,  2.34s/it]

training loss: 3.4682540893554688


training:  15%|█▍        | 1633/10986 [1:07:35<5:52:32,  2.26s/it]

training loss: 3.4696402549743652


training:  15%|█▍        | 1634/10986 [1:07:38<6:00:22,  2.31s/it]

training loss: 3.4414355754852295


training:  15%|█▍        | 1635/10986 [1:07:40<5:50:19,  2.25s/it]

training loss: 3.5570578575134277


training:  15%|█▍        | 1636/10986 [1:07:42<5:58:22,  2.30s/it]

training loss: 3.4144411087036133


training:  15%|█▍        | 1637/10986 [1:07:44<5:49:35,  2.24s/it]

training loss: 3.400364637374878


training:  15%|█▍        | 1638/10986 [1:07:47<6:00:41,  2.32s/it]

training loss: 3.354362964630127


training:  15%|█▍        | 1639/10986 [1:07:49<5:52:16,  2.26s/it]

training loss: 3.4732515811920166


training:  15%|█▍        | 1640/10986 [1:07:51<6:01:24,  2.32s/it]

training loss: 3.4696431159973145
valid loss: 3.543466567993164
perplexity: 34.58660888671875


training:  15%|█▍        | 1641/10986 [1:07:56<7:52:24,  3.03s/it]

training loss: 3.3241047859191895


training:  15%|█▍        | 1642/10986 [1:07:58<7:12:07,  2.77s/it]

training loss: 3.5730912685394287


training:  15%|█▍        | 1643/10986 [1:08:01<6:58:52,  2.69s/it]

training loss: 3.4245314598083496


training:  15%|█▍        | 1644/10986 [1:08:03<6:32:51,  2.52s/it]

training loss: 3.4200003147125244


training:  15%|█▍        | 1645/10986 [1:08:05<6:29:24,  2.50s/it]

training loss: 3.3846051692962646


training:  15%|█▍        | 1646/10986 [1:08:07<6:12:50,  2.40s/it]

training loss: 3.4399092197418213


training:  15%|█▍        | 1647/10986 [1:08:10<6:25:09,  2.47s/it]

training loss: 3.490471601486206


training:  15%|█▌        | 1648/10986 [1:08:14<7:29:18,  2.89s/it]

training loss: 3.3797457218170166


training:  15%|█▌        | 1649/10986 [1:08:17<7:20:43,  2.83s/it]

training loss: 3.589071273803711


training:  15%|█▌        | 1650/10986 [1:08:19<6:49:00,  2.63s/it]

training loss: 3.449211835861206


training:  15%|█▌        | 1651/10986 [1:08:21<6:40:42,  2.58s/it]

training loss: 3.497249126434326


training:  15%|█▌        | 1652/10986 [1:08:23<6:18:44,  2.43s/it]

training loss: 3.399070978164673


training:  15%|█▌        | 1653/10986 [1:08:26<6:18:26,  2.43s/it]

training loss: 3.3636584281921387


training:  15%|█▌        | 1654/10986 [1:08:28<6:01:34,  2.32s/it]

training loss: 3.3737423419952393


training:  15%|█▌        | 1655/10986 [1:08:30<6:05:23,  2.35s/it]

training loss: 3.5991768836975098


training:  15%|█▌        | 1656/10986 [1:08:32<5:54:01,  2.28s/it]

training loss: 3.530261278152466


training:  15%|█▌        | 1657/10986 [1:08:35<6:02:06,  2.33s/it]

training loss: 3.5771186351776123


training:  15%|█▌        | 1658/10986 [1:08:37<5:51:50,  2.26s/it]

training loss: 3.424121141433716


training:  15%|█▌        | 1659/10986 [1:08:39<6:00:47,  2.32s/it]

training loss: 3.429565191268921


training:  15%|█▌        | 1660/10986 [1:08:42<5:51:00,  2.26s/it]

training loss: 3.437077283859253
valid loss: 3.5367431640625
perplexity: 34.35485076904297


training:  15%|█▌        | 1661/10986 [1:08:46<7:33:59,  2.92s/it]

training loss: 3.5529403686523438


training:  15%|█▌        | 1662/10986 [1:08:49<7:15:46,  2.80s/it]

training loss: 3.4285502433776855


training:  15%|█▌        | 1663/10986 [1:08:51<6:45:24,  2.61s/it]

training loss: 3.4461817741394043


training:  15%|█▌        | 1664/10986 [1:08:53<6:38:56,  2.57s/it]

training loss: 3.4937856197357178


training:  15%|█▌        | 1665/10986 [1:08:55<6:17:39,  2.43s/it]

training loss: 3.4919352531433105


training:  15%|█▌        | 1666/10986 [1:08:58<6:16:31,  2.42s/it]

training loss: 3.558842182159424


training:  15%|█▌        | 1667/10986 [1:09:00<6:23:37,  2.47s/it]

training loss: 3.4380650520324707


training:  15%|█▌        | 1668/10986 [1:09:03<6:21:49,  2.46s/it]

training loss: 3.4076995849609375


training:  15%|█▌        | 1669/10986 [1:09:05<6:04:48,  2.35s/it]

training loss: 3.3601348400115967


training:  15%|█▌        | 1670/10986 [1:09:07<6:09:01,  2.38s/it]

training loss: 3.4374167919158936


training:  15%|█▌        | 1671/10986 [1:09:09<5:57:01,  2.30s/it]

training loss: 3.4554343223571777


training:  15%|█▌        | 1672/10986 [1:09:12<6:03:27,  2.34s/it]

training loss: 3.4175846576690674


training:  15%|█▌        | 1673/10986 [1:09:15<7:01:30,  2.72s/it]

training loss: 3.5272037982940674


training:  15%|█▌        | 1674/10986 [1:09:18<6:46:56,  2.62s/it]

training loss: 3.4211249351501465


training:  15%|█▌        | 1675/10986 [1:09:20<6:21:58,  2.46s/it]

training loss: 3.5057008266448975


training:  15%|█▌        | 1676/10986 [1:09:23<6:49:45,  2.64s/it]

training loss: 3.514115571975708


training:  15%|█▌        | 1677/10986 [1:09:25<6:24:47,  2.48s/it]

training loss: 3.4884767532348633


training:  15%|█▌        | 1678/10986 [1:09:27<6:22:02,  2.46s/it]

training loss: 3.4763360023498535


training:  15%|█▌        | 1679/10986 [1:09:30<6:04:31,  2.35s/it]

training loss: 3.419989824295044


training:  15%|█▌        | 1680/10986 [1:09:32<6:08:22,  2.38s/it]

training loss: 3.4924371242523193
valid loss: 3.452540636062622
perplexity: 31.580524444580078


training:  15%|█▌        | 1681/10986 [1:09:37<8:13:19,  3.18s/it]

training loss: 3.487961530685425


training:  15%|█▌        | 1682/10986 [1:09:39<7:24:46,  2.87s/it]

training loss: 3.4320082664489746


training:  15%|█▌        | 1683/10986 [1:09:42<7:05:26,  2.74s/it]

training loss: 3.5110726356506348


training:  15%|█▌        | 1684/10986 [1:09:44<6:34:23,  2.54s/it]

training loss: 3.606863498687744


training:  15%|█▌        | 1685/10986 [1:09:46<6:27:53,  2.50s/it]

training loss: 3.4996113777160645


training:  15%|█▌        | 1686/10986 [1:09:48<6:08:48,  2.38s/it]

training loss: 3.3963682651519775


training:  15%|█▌        | 1687/10986 [1:09:51<6:11:38,  2.40s/it]

training loss: 3.341275691986084


training:  15%|█▌        | 1688/10986 [1:09:53<5:58:51,  2.32s/it]

training loss: 3.52181077003479


training:  15%|█▌        | 1689/10986 [1:09:55<6:04:34,  2.35s/it]

training loss: 3.47542142868042


training:  15%|█▌        | 1690/10986 [1:09:57<5:52:06,  2.27s/it]

training loss: 3.5336923599243164


training:  15%|█▌        | 1691/10986 [1:10:00<5:58:02,  2.31s/it]

training loss: 3.3724982738494873


training:  15%|█▌        | 1692/10986 [1:10:02<5:48:19,  2.25s/it]

training loss: 3.4304089546203613


training:  15%|█▌        | 1693/10986 [1:10:04<5:56:05,  2.30s/it]

training loss: 3.375471830368042


training:  15%|█▌        | 1694/10986 [1:10:06<5:47:29,  2.24s/it]

training loss: 3.5492212772369385


training:  15%|█▌        | 1695/10986 [1:10:09<5:54:20,  2.29s/it]

training loss: 3.3933496475219727


training:  15%|█▌        | 1696/10986 [1:10:11<5:45:39,  2.23s/it]

training loss: 3.520205020904541


training:  15%|█▌        | 1697/10986 [1:10:14<6:53:59,  2.67s/it]

training loss: 3.611564874649048


training:  15%|█▌        | 1698/10986 [1:10:17<6:27:31,  2.50s/it]

training loss: 3.5237889289855957


training:  15%|█▌        | 1699/10986 [1:10:19<6:25:24,  2.49s/it]

training loss: 3.3119068145751953


training:  15%|█▌        | 1700/10986 [1:10:21<6:07:48,  2.38s/it]

training loss: 3.5782272815704346
valid loss: 3.4660942554473877
perplexity: 32.01146697998047


training:  15%|█▌        | 1701/10986 [1:10:26<7:56:57,  3.08s/it]

training loss: 3.4156901836395264


training:  15%|█▌        | 1702/10986 [1:10:28<7:29:39,  2.91s/it]

training loss: 3.4936532974243164


training:  16%|█▌        | 1703/10986 [1:10:31<6:54:27,  2.68s/it]

training loss: 3.409546136856079


training:  16%|█▌        | 1704/10986 [1:10:33<6:44:04,  2.61s/it]

training loss: 3.3166425228118896


training:  16%|█▌        | 1705/10986 [1:10:35<6:20:24,  2.46s/it]

training loss: 3.5756168365478516


training:  16%|█▌        | 1706/10986 [1:10:37<6:17:18,  2.44s/it]

training loss: 3.5662646293640137


training:  16%|█▌        | 1707/10986 [1:10:40<6:01:27,  2.34s/it]

training loss: 3.350152015686035


training:  16%|█▌        | 1708/10986 [1:10:42<6:05:08,  2.36s/it]

training loss: 3.3527181148529053


training:  16%|█▌        | 1709/10986 [1:10:44<5:51:57,  2.28s/it]

training loss: 3.538999557495117


training:  16%|█▌        | 1710/10986 [1:10:47<5:58:52,  2.32s/it]

training loss: 3.607391119003296


training:  16%|█▌        | 1711/10986 [1:10:49<5:49:57,  2.26s/it]

training loss: 3.4833126068115234


training:  16%|█▌        | 1712/10986 [1:10:51<5:56:34,  2.31s/it]

training loss: 3.329033136367798


training:  16%|█▌        | 1713/10986 [1:10:53<5:46:18,  2.24s/it]

training loss: 3.6031508445739746


training:  16%|█▌        | 1714/10986 [1:10:56<5:56:06,  2.30s/it]

training loss: 3.504563808441162


training:  16%|█▌        | 1715/10986 [1:10:58<6:23:38,  2.48s/it]

training loss: 3.4863834381103516


training:  16%|█▌        | 1716/10986 [1:11:02<6:49:40,  2.65s/it]

training loss: 3.4528555870056152


training:  16%|█▌        | 1717/10986 [1:11:04<6:23:40,  2.48s/it]

training loss: 3.643369436264038


training:  16%|█▌        | 1718/10986 [1:11:06<6:19:45,  2.46s/it]

training loss: 3.5811514854431152


training:  16%|█▌        | 1719/10986 [1:11:08<6:02:44,  2.35s/it]

training loss: 3.502168893814087


training:  16%|█▌        | 1720/10986 [1:11:11<6:07:35,  2.38s/it]

training loss: 3.4824752807617188
valid loss: 3.4959959983825684
perplexity: 32.983123779296875


training:  16%|█▌        | 1721/10986 [1:11:16<8:28:50,  3.30s/it]

training loss: 3.6829564571380615


training:  16%|█▌        | 1722/10986 [1:11:18<7:36:14,  2.95s/it]

training loss: 3.667996883392334


training:  16%|█▌        | 1723/10986 [1:11:21<7:12:56,  2.80s/it]

training loss: 3.500788450241089


training:  16%|█▌        | 1724/10986 [1:11:23<6:39:59,  2.59s/it]

training loss: 3.416881561279297


training:  16%|█▌        | 1725/10986 [1:11:25<6:34:08,  2.55s/it]

training loss: 3.5392632484436035


training:  16%|█▌        | 1726/10986 [1:11:27<6:13:14,  2.42s/it]

training loss: 3.408721446990967


training:  16%|█▌        | 1727/10986 [1:11:30<6:12:40,  2.41s/it]

training loss: 3.4384851455688477


training:  16%|█▌        | 1728/10986 [1:11:32<5:58:08,  2.32s/it]

training loss: 3.4978184700012207


training:  16%|█▌        | 1729/10986 [1:11:34<6:04:57,  2.37s/it]

training loss: 3.4890754222869873


training:  16%|█▌        | 1730/10986 [1:11:36<5:53:08,  2.29s/it]

training loss: 3.4974913597106934


training:  16%|█▌        | 1731/10986 [1:11:39<5:58:58,  2.33s/it]

training loss: 3.5032472610473633


training:  16%|█▌        | 1732/10986 [1:11:41<5:48:19,  2.26s/it]

training loss: 3.417341709136963


training:  16%|█▌        | 1733/10986 [1:11:43<5:55:38,  2.31s/it]

training loss: 3.4722254276275635


training:  16%|█▌        | 1734/10986 [1:11:45<5:45:55,  2.24s/it]

training loss: 3.4484314918518066


training:  16%|█▌        | 1735/10986 [1:11:48<5:54:36,  2.30s/it]

training loss: 3.421337127685547


training:  16%|█▌        | 1736/10986 [1:11:50<5:45:15,  2.24s/it]

training loss: 3.385990619659424


training:  16%|█▌        | 1737/10986 [1:11:52<5:55:03,  2.30s/it]

training loss: 3.4151620864868164


training:  16%|█▌        | 1738/10986 [1:11:55<5:47:35,  2.26s/it]

training loss: 3.4925405979156494


training:  16%|█▌        | 1739/10986 [1:11:57<5:57:41,  2.32s/it]

training loss: 3.349602699279785


training:  16%|█▌        | 1740/10986 [1:11:59<5:46:57,  2.25s/it]

training loss: 3.499385118484497
valid loss: 3.527801990509033
perplexity: 34.04904556274414


training:  16%|█▌        | 1741/10986 [1:12:04<7:30:02,  2.92s/it]

training loss: 3.506801128387451


training:  16%|█▌        | 1742/10986 [1:12:06<7:11:08,  2.80s/it]

training loss: 3.4524388313293457


training:  16%|█▌        | 1743/10986 [1:12:08<6:37:45,  2.58s/it]

training loss: 3.5143542289733887


training:  16%|█▌        | 1744/10986 [1:12:11<6:28:36,  2.52s/it]

training loss: 3.4383749961853027


training:  16%|█▌        | 1745/10986 [1:12:13<6:27:37,  2.52s/it]

training loss: 3.390069007873535


training:  16%|█▌        | 1746/10986 [1:12:16<6:27:02,  2.51s/it]

training loss: 3.6519393920898438


training:  16%|█▌        | 1747/10986 [1:12:18<6:07:17,  2.39s/it]

training loss: 3.5683722496032715


training:  16%|█▌        | 1748/10986 [1:12:20<6:10:06,  2.40s/it]

training loss: 3.439027786254883


training:  16%|█▌        | 1749/10986 [1:12:22<5:58:12,  2.33s/it]

training loss: 3.499169111251831


training:  16%|█▌        | 1750/10986 [1:12:25<6:01:59,  2.35s/it]

training loss: 3.476930618286133


training:  16%|█▌        | 1751/10986 [1:12:27<5:51:00,  2.28s/it]

training loss: 3.5365045070648193


training:  16%|█▌        | 1752/10986 [1:12:29<5:56:50,  2.32s/it]

training loss: 3.3436007499694824


training:  16%|█▌        | 1753/10986 [1:12:31<5:46:46,  2.25s/it]

training loss: 3.6597020626068115


training:  16%|█▌        | 1754/10986 [1:12:34<5:53:22,  2.30s/it]

training loss: 3.390537977218628


training:  16%|█▌        | 1755/10986 [1:12:36<5:43:34,  2.23s/it]

training loss: 3.427398920059204


training:  16%|█▌        | 1756/10986 [1:12:38<5:51:19,  2.28s/it]

training loss: 3.5381503105163574


training:  16%|█▌        | 1757/10986 [1:12:40<5:42:44,  2.23s/it]

training loss: 3.501394033432007


training:  16%|█▌        | 1758/10986 [1:12:43<5:50:57,  2.28s/it]

training loss: 3.471219539642334


training:  16%|█▌        | 1759/10986 [1:12:45<5:41:30,  2.22s/it]

training loss: 3.503558874130249


training:  16%|█▌        | 1760/10986 [1:12:47<5:49:19,  2.27s/it]

training loss: 3.482863664627075
valid loss: 3.451110601425171
perplexity: 31.5353946685791


training:  16%|█▌        | 1761/10986 [1:12:52<7:31:49,  2.94s/it]

training loss: 3.5036511421203613


training:  16%|█▌        | 1762/10986 [1:12:54<6:56:25,  2.71s/it]

training loss: 3.414510488510132


training:  16%|█▌        | 1763/10986 [1:12:56<6:44:49,  2.63s/it]

training loss: 3.4415571689605713


training:  16%|█▌        | 1764/10986 [1:12:58<6:19:07,  2.47s/it]

training loss: 3.566131114959717


training:  16%|█▌        | 1765/10986 [1:13:01<6:15:03,  2.44s/it]

training loss: 3.3874874114990234


training:  16%|█▌        | 1766/10986 [1:13:03<5:59:48,  2.34s/it]

training loss: 3.5745155811309814


training:  16%|█▌        | 1767/10986 [1:13:05<6:02:49,  2.36s/it]

training loss: 3.42106294631958


training:  16%|█▌        | 1768/10986 [1:13:07<5:51:19,  2.29s/it]

training loss: 3.444155693054199


training:  16%|█▌        | 1769/10986 [1:13:10<5:56:24,  2.32s/it]

training loss: 3.5026493072509766


training:  16%|█▌        | 1770/10986 [1:13:12<5:47:05,  2.26s/it]

training loss: 3.496558666229248


training:  16%|█▌        | 1771/10986 [1:13:15<6:10:36,  2.41s/it]

training loss: 3.401456356048584


training:  16%|█▌        | 1772/10986 [1:13:17<5:55:26,  2.31s/it]

training loss: 3.451967239379883


training:  16%|█▌        | 1773/10986 [1:13:19<6:00:26,  2.35s/it]

training loss: 3.4572548866271973


training:  16%|█▌        | 1774/10986 [1:13:21<5:49:01,  2.27s/it]

training loss: 3.529620885848999


training:  16%|█▌        | 1775/10986 [1:13:24<5:56:25,  2.32s/it]

training loss: 3.5536839962005615


training:  16%|█▌        | 1776/10986 [1:13:26<5:46:01,  2.25s/it]

training loss: 3.5073087215423584


training:  16%|█▌        | 1777/10986 [1:13:28<5:54:55,  2.31s/it]

training loss: 3.3312742710113525


training:  16%|█▌        | 1778/10986 [1:13:30<5:44:25,  2.24s/it]

training loss: 3.4038467407226562


training:  16%|█▌        | 1779/10986 [1:13:33<5:51:51,  2.29s/it]

training loss: 3.397757053375244


training:  16%|█▌        | 1780/10986 [1:13:35<5:42:44,  2.23s/it]

training loss: 3.4245729446411133
valid loss: 3.5499467849731445
perplexity: 34.811466217041016


training:  16%|█▌        | 1781/10986 [1:13:40<8:01:27,  3.14s/it]

training loss: 3.511568069458008


training:  16%|█▌        | 1782/10986 [1:13:44<8:26:09,  3.30s/it]

training loss: 3.4243297576904297


training:  16%|█▌        | 1783/10986 [1:13:46<7:30:52,  2.94s/it]

training loss: 3.4677743911743164


training:  16%|█▌        | 1784/10986 [1:13:48<7:07:38,  2.79s/it]

training loss: 3.3901126384735107


training:  16%|█▌        | 1785/10986 [1:13:50<6:36:59,  2.59s/it]

training loss: 3.525142192840576


training:  16%|█▋        | 1786/10986 [1:13:53<6:29:09,  2.54s/it]

training loss: 3.5414581298828125


training:  16%|█▋        | 1787/10986 [1:13:55<6:10:23,  2.42s/it]

training loss: 3.4762818813323975


training:  16%|█▋        | 1788/10986 [1:13:57<6:11:41,  2.42s/it]

training loss: 3.4211344718933105


training:  16%|█▋        | 1789/10986 [1:13:59<5:56:35,  2.33s/it]

training loss: 3.4356746673583984


training:  16%|█▋        | 1790/10986 [1:14:02<6:00:07,  2.35s/it]

training loss: 3.386465549468994


training:  16%|█▋        | 1791/10986 [1:14:04<5:51:11,  2.29s/it]

training loss: 3.357309103012085


training:  16%|█▋        | 1792/10986 [1:14:06<5:56:09,  2.32s/it]

training loss: 3.6420111656188965


training:  16%|█▋        | 1793/10986 [1:14:09<5:45:18,  2.25s/it]

training loss: 3.5096917152404785


training:  16%|█▋        | 1794/10986 [1:14:11<5:55:40,  2.32s/it]

training loss: 3.518308401107788


training:  16%|█▋        | 1795/10986 [1:14:14<6:24:34,  2.51s/it]

training loss: 3.4855599403381348


training:  16%|█▋        | 1796/10986 [1:14:16<6:23:51,  2.51s/it]

training loss: 3.4663407802581787


training:  16%|█▋        | 1797/10986 [1:14:19<6:05:36,  2.39s/it]

training loss: 3.443526029586792


training:  16%|█▋        | 1798/10986 [1:14:21<6:07:02,  2.40s/it]

training loss: 3.4617602825164795


training:  16%|█▋        | 1799/10986 [1:14:23<5:53:22,  2.31s/it]

training loss: 3.2940614223480225


training:  16%|█▋        | 1800/10986 [1:14:26<6:00:27,  2.35s/it]

training loss: 3.384329319000244
valid loss: 3.4439516067504883
perplexity: 31.310440063476562


training:  16%|█▋        | 1801/10986 [1:14:31<8:02:15,  3.15s/it]

training loss: 3.4624884128570557


training:  16%|█▋        | 1802/10986 [1:14:33<7:16:21,  2.85s/it]

training loss: 3.58147931098938


training:  16%|█▋        | 1803/10986 [1:14:35<6:57:49,  2.73s/it]

training loss: 3.4487385749816895


training:  16%|█▋        | 1804/10986 [1:14:37<6:29:34,  2.55s/it]

training loss: 3.4254262447357178


training:  16%|█▋        | 1805/10986 [1:14:40<6:24:43,  2.51s/it]

training loss: 3.5317344665527344


training:  16%|█▋        | 1806/10986 [1:14:42<6:05:35,  2.39s/it]

training loss: 3.5516483783721924


training:  16%|█▋        | 1807/10986 [1:14:44<6:07:57,  2.41s/it]

training loss: 3.640953540802002


training:  16%|█▋        | 1808/10986 [1:14:46<5:52:36,  2.31s/it]

training loss: 3.42059326171875


training:  16%|█▋        | 1809/10986 [1:14:49<5:58:58,  2.35s/it]

training loss: 3.387660026550293


training:  16%|█▋        | 1810/10986 [1:14:51<5:48:22,  2.28s/it]

training loss: 3.298969030380249


training:  16%|█▋        | 1811/10986 [1:14:53<5:54:46,  2.32s/it]

training loss: 3.381483554840088


training:  16%|█▋        | 1812/10986 [1:14:55<5:46:19,  2.27s/it]

training loss: 3.444197654724121


training:  17%|█▋        | 1813/10986 [1:14:58<5:53:21,  2.31s/it]

training loss: 3.4912593364715576


training:  17%|█▋        | 1814/10986 [1:15:00<5:45:02,  2.26s/it]

training loss: 3.4693384170532227


training:  17%|█▋        | 1815/10986 [1:15:02<5:53:22,  2.31s/it]

training loss: 3.486480236053467


training:  17%|█▋        | 1816/10986 [1:15:05<5:44:35,  2.25s/it]

training loss: 3.454498291015625


training:  17%|█▋        | 1817/10986 [1:15:07<5:51:32,  2.30s/it]

training loss: 3.4127097129821777


training:  17%|█▋        | 1818/10986 [1:15:09<5:43:14,  2.25s/it]

training loss: 3.5105044841766357


training:  17%|█▋        | 1819/10986 [1:15:12<5:54:41,  2.32s/it]

training loss: 3.3618276119232178


training:  17%|█▋        | 1820/10986 [1:15:14<6:07:25,  2.41s/it]

training loss: 3.455203056335449
valid loss: 3.4305455684661865
perplexity: 30.89349365234375


training:  17%|█▋        | 1821/10986 [1:15:19<7:47:13,  3.06s/it]

training loss: 3.4355480670928955


training:  17%|█▋        | 1822/10986 [1:15:21<7:20:58,  2.89s/it]

training loss: 3.524955987930298


training:  17%|█▋        | 1823/10986 [1:15:23<6:45:01,  2.65s/it]

training loss: 3.604804277420044


training:  17%|█▋        | 1824/10986 [1:15:26<6:34:46,  2.59s/it]

training loss: 3.5259063243865967


training:  17%|█▋        | 1825/10986 [1:15:28<6:12:09,  2.44s/it]

training loss: 3.3900532722473145


training:  17%|█▋        | 1826/10986 [1:15:30<6:13:01,  2.44s/it]

training loss: 3.626697301864624


training:  17%|█▋        | 1827/10986 [1:15:32<5:56:58,  2.34s/it]

training loss: 3.459364652633667


training:  17%|█▋        | 1828/10986 [1:15:35<6:04:55,  2.39s/it]

training loss: 3.502638339996338


training:  17%|█▋        | 1829/10986 [1:15:37<5:50:34,  2.30s/it]

training loss: 3.3747353553771973


training:  17%|█▋        | 1830/10986 [1:15:39<5:56:10,  2.33s/it]

training loss: 3.3618054389953613


training:  17%|█▋        | 1831/10986 [1:15:42<5:44:09,  2.26s/it]

training loss: 3.6481661796569824


training:  17%|█▋        | 1832/10986 [1:15:44<5:51:54,  2.31s/it]

training loss: 3.4265754222869873


training:  17%|█▋        | 1833/10986 [1:15:46<5:42:45,  2.25s/it]

training loss: 3.3105080127716064


training:  17%|█▋        | 1834/10986 [1:15:48<5:50:12,  2.30s/it]

training loss: 3.448152780532837


training:  17%|█▋        | 1835/10986 [1:15:51<5:40:50,  2.23s/it]

training loss: 3.4028236865997314


training:  17%|█▋        | 1836/10986 [1:15:53<5:49:40,  2.29s/it]

training loss: 3.570192575454712


training:  17%|█▋        | 1837/10986 [1:15:55<5:41:17,  2.24s/it]

training loss: 3.3551957607269287


training:  17%|█▋        | 1838/10986 [1:15:58<5:50:16,  2.30s/it]

training loss: 3.5185306072235107


training:  17%|█▋        | 1839/10986 [1:16:00<5:40:55,  2.24s/it]

training loss: 3.426950216293335


training:  17%|█▋        | 1840/10986 [1:16:02<5:49:29,  2.29s/it]

training loss: 3.479224443435669
valid loss: 3.627753973007202
perplexity: 37.62820816040039


training:  17%|█▋        | 1841/10986 [1:16:07<7:39:22,  3.01s/it]

training loss: 3.524317741394043


training:  17%|█▋        | 1842/10986 [1:16:09<7:01:36,  2.77s/it]

training loss: 3.4826254844665527


training:  17%|█▋        | 1843/10986 [1:16:11<6:49:08,  2.68s/it]

training loss: 3.5076301097869873


training:  17%|█▋        | 1844/10986 [1:16:14<6:38:50,  2.62s/it]

training loss: 3.435129404067993


training:  17%|█▋        | 1845/10986 [1:16:16<6:29:28,  2.56s/it]

training loss: 3.2973341941833496


training:  17%|█▋        | 1846/10986 [1:16:18<6:09:09,  2.42s/it]

training loss: 3.6008846759796143


training:  17%|█▋        | 1847/10986 [1:16:21<6:09:10,  2.42s/it]

training loss: 3.411877393722534


training:  17%|█▋        | 1848/10986 [1:16:23<5:53:33,  2.32s/it]

training loss: 3.411870002746582


training:  17%|█▋        | 1849/10986 [1:16:26<6:38:13,  2.62s/it]

training loss: 3.604254722595215


training:  17%|█▋        | 1850/10986 [1:16:29<6:32:25,  2.58s/it]

training loss: 3.48717999458313


training:  17%|█▋        | 1851/10986 [1:16:31<6:24:19,  2.52s/it]

training loss: 3.410870313644409


training:  17%|█▋        | 1852/10986 [1:16:33<6:04:03,  2.39s/it]

training loss: 3.4498097896575928


training:  17%|█▋        | 1853/10986 [1:16:36<6:05:35,  2.40s/it]

training loss: 3.4066412448883057


training:  17%|█▋        | 1854/10986 [1:16:38<5:52:55,  2.32s/it]

training loss: 3.5067660808563232


training:  17%|█▋        | 1855/10986 [1:16:40<5:58:03,  2.35s/it]

training loss: 3.375119209289551


training:  17%|█▋        | 1856/10986 [1:16:42<5:45:09,  2.27s/it]

training loss: 3.4347305297851562


training:  17%|█▋        | 1857/10986 [1:16:45<5:54:45,  2.33s/it]

training loss: 3.619208335876465


training:  17%|█▋        | 1858/10986 [1:16:47<5:44:03,  2.26s/it]

training loss: 3.4389443397521973


training:  17%|█▋        | 1859/10986 [1:16:49<5:51:54,  2.31s/it]

training loss: 3.4785213470458984


training:  17%|█▋        | 1860/10986 [1:16:51<5:41:28,  2.25s/it]

training loss: 3.488792896270752
valid loss: 3.464704990386963
perplexity: 31.96702766418457


training:  17%|█▋        | 1861/10986 [1:16:56<7:27:25,  2.94s/it]

training loss: 3.522675037384033


training:  17%|█▋        | 1862/10986 [1:16:58<7:07:49,  2.81s/it]

training loss: 3.379260540008545


training:  17%|█▋        | 1863/10986 [1:17:01<6:38:15,  2.62s/it]

training loss: 3.5429861545562744


training:  17%|█▋        | 1864/10986 [1:17:03<6:29:17,  2.56s/it]

training loss: 3.5418808460235596


training:  17%|█▋        | 1865/10986 [1:17:05<6:08:33,  2.42s/it]

training loss: 3.387699842453003


training:  17%|█▋        | 1866/10986 [1:17:08<6:09:20,  2.43s/it]

training loss: 3.5292816162109375


training:  17%|█▋        | 1867/10986 [1:17:10<5:54:26,  2.33s/it]

training loss: 3.4901509284973145


training:  17%|█▋        | 1868/10986 [1:17:12<6:06:05,  2.41s/it]

training loss: 3.4317119121551514


training:  17%|█▋        | 1869/10986 [1:17:15<6:21:27,  2.51s/it]

training loss: 3.345137357711792


training:  17%|█▋        | 1870/10986 [1:17:17<6:16:59,  2.48s/it]

training loss: 3.39609956741333


training:  17%|█▋        | 1871/10986 [1:17:20<5:59:47,  2.37s/it]

training loss: 3.4895966053009033


training:  17%|█▋        | 1872/10986 [1:17:22<6:03:05,  2.39s/it]

training loss: 3.40700101852417


training:  17%|█▋        | 1873/10986 [1:17:24<5:50:36,  2.31s/it]

training loss: 3.4465835094451904


training:  17%|█▋        | 1874/10986 [1:17:26<5:55:24,  2.34s/it]

training loss: 3.4673986434936523


training:  17%|█▋        | 1875/10986 [1:17:29<5:45:25,  2.27s/it]

training loss: 3.518368721008301


training:  17%|█▋        | 1876/10986 [1:17:31<5:53:18,  2.33s/it]

training loss: 3.448026180267334


training:  17%|█▋        | 1877/10986 [1:17:33<5:42:54,  2.26s/it]

training loss: 3.4491090774536133


training:  17%|█▋        | 1878/10986 [1:17:36<5:51:02,  2.31s/it]

training loss: 3.519531011581421


training:  17%|█▋        | 1879/10986 [1:17:38<5:41:04,  2.25s/it]

training loss: 3.4435243606567383


training:  17%|█▋        | 1880/10986 [1:17:40<5:50:28,  2.31s/it]

training loss: 3.3823163509368896
valid loss: 3.4613614082336426
perplexity: 31.860322952270508


training:  17%|█▋        | 1881/10986 [1:17:45<7:28:49,  2.96s/it]

training loss: 3.445842742919922


training:  17%|█▋        | 1882/10986 [1:17:47<6:52:11,  2.72s/it]

training loss: 3.28979754447937


training:  17%|█▋        | 1883/10986 [1:17:49<6:39:11,  2.63s/it]

training loss: 3.4619975090026855


training:  17%|█▋        | 1884/10986 [1:17:51<6:14:48,  2.47s/it]

training loss: 3.4570727348327637


training:  17%|█▋        | 1885/10986 [1:17:54<6:12:00,  2.45s/it]

training loss: 3.4060680866241455


training:  17%|█▋        | 1886/10986 [1:17:56<5:56:10,  2.35s/it]

training loss: 3.291372537612915


training:  17%|█▋        | 1887/10986 [1:17:58<5:58:39,  2.37s/it]

training loss: 3.343916893005371


training:  17%|█▋        | 1888/10986 [1:18:00<5:45:39,  2.28s/it]

training loss: 3.365717649459839


training:  17%|█▋        | 1889/10986 [1:18:03<5:53:19,  2.33s/it]

training loss: 3.4684455394744873


training:  17%|█▋        | 1890/10986 [1:18:05<5:43:47,  2.27s/it]

training loss: 3.613744020462036


training:  17%|█▋        | 1891/10986 [1:18:07<5:49:50,  2.31s/it]

training loss: 3.456033706665039


training:  17%|█▋        | 1892/10986 [1:18:09<5:40:18,  2.25s/it]

training loss: 3.3646328449249268


training:  17%|█▋        | 1893/10986 [1:18:12<5:47:14,  2.29s/it]

training loss: 3.5091311931610107


training:  17%|█▋        | 1894/10986 [1:18:14<5:51:30,  2.32s/it]

training loss: 3.4762253761291504


training:  17%|█▋        | 1895/10986 [1:18:17<5:55:50,  2.35s/it]

training loss: 3.391129970550537


training:  17%|█▋        | 1896/10986 [1:18:19<5:44:42,  2.28s/it]

training loss: 3.3889951705932617


training:  17%|█▋        | 1897/10986 [1:18:21<5:49:07,  2.30s/it]

training loss: 3.5634090900421143


training:  17%|█▋        | 1898/10986 [1:18:23<5:38:37,  2.24s/it]

training loss: 3.342630624771118


training:  17%|█▋        | 1899/10986 [1:18:26<5:46:55,  2.29s/it]

training loss: 3.3632423877716064


training:  17%|█▋        | 1900/10986 [1:18:28<5:38:59,  2.24s/it]

training loss: 3.398599863052368
valid loss: 3.4537622928619385
perplexity: 31.619129180908203


training:  17%|█▋        | 1901/10986 [1:18:33<7:43:37,  3.06s/it]

training loss: 3.4916508197784424


training:  17%|█▋        | 1902/10986 [1:18:35<7:23:55,  2.93s/it]

training loss: 3.360677480697632


training:  17%|█▋        | 1903/10986 [1:18:37<6:45:23,  2.68s/it]

training loss: 3.4462571144104004


training:  17%|█▋        | 1904/10986 [1:18:40<6:34:01,  2.60s/it]

training loss: 3.51444149017334


training:  17%|█▋        | 1905/10986 [1:18:42<6:12:20,  2.46s/it]

training loss: 3.482328414916992


training:  17%|█▋        | 1906/10986 [1:18:44<6:10:08,  2.45s/it]

training loss: 3.459540843963623


training:  17%|█▋        | 1907/10986 [1:18:46<5:54:21,  2.34s/it]

training loss: 3.552631139755249


training:  17%|█▋        | 1908/10986 [1:18:49<5:57:25,  2.36s/it]

training loss: 3.6243462562561035


training:  17%|█▋        | 1909/10986 [1:18:51<5:45:10,  2.28s/it]

training loss: 3.4565374851226807


training:  17%|█▋        | 1910/10986 [1:18:53<5:50:08,  2.31s/it]

training loss: 3.448803424835205


training:  17%|█▋        | 1911/10986 [1:18:55<5:40:53,  2.25s/it]

training loss: 3.440899610519409


training:  17%|█▋        | 1912/10986 [1:18:58<5:47:11,  2.30s/it]

training loss: 3.314592123031616


training:  17%|█▋        | 1913/10986 [1:19:00<5:37:26,  2.23s/it]

training loss: 3.5356571674346924


training:  17%|█▋        | 1914/10986 [1:19:02<5:45:28,  2.28s/it]

training loss: 3.450124979019165


training:  17%|█▋        | 1915/10986 [1:19:04<5:39:09,  2.24s/it]

training loss: 3.4228174686431885


training:  17%|█▋        | 1916/10986 [1:19:07<5:47:30,  2.30s/it]

training loss: 3.447244644165039


training:  17%|█▋        | 1917/10986 [1:19:10<6:14:13,  2.48s/it]

training loss: 3.440798759460449


training:  17%|█▋        | 1918/10986 [1:19:13<6:53:15,  2.73s/it]

training loss: 3.510350227355957


training:  17%|█▋        | 1919/10986 [1:19:15<6:23:29,  2.54s/it]

training loss: 3.602156400680542


training:  17%|█▋        | 1920/10986 [1:19:18<6:19:31,  2.51s/it]

training loss: 3.5180435180664062
valid loss: 3.5074801445007324
perplexity: 33.36408996582031


training:  17%|█▋        | 1921/10986 [1:19:22<7:55:42,  3.15s/it]

training loss: 3.496180534362793


training:  17%|█▋        | 1922/10986 [1:19:24<7:10:20,  2.85s/it]

training loss: 3.567455768585205


training:  18%|█▊        | 1923/10986 [1:19:27<6:53:28,  2.74s/it]

training loss: 3.4166383743286133


training:  18%|█▊        | 1924/10986 [1:19:29<6:27:19,  2.56s/it]

training loss: 3.4091057777404785


training:  18%|█▊        | 1925/10986 [1:19:31<6:19:35,  2.51s/it]

training loss: 3.393024206161499


training:  18%|█▊        | 1926/10986 [1:19:34<6:01:11,  2.39s/it]

training loss: 3.3251848220825195


training:  18%|█▊        | 1927/10986 [1:19:36<6:04:29,  2.41s/it]

training loss: 3.5741803646087646


training:  18%|█▊        | 1928/10986 [1:19:38<5:48:54,  2.31s/it]

training loss: 3.4728400707244873


training:  18%|█▊        | 1929/10986 [1:19:41<5:53:03,  2.34s/it]

training loss: 3.3708624839782715


training:  18%|█▊        | 1930/10986 [1:19:43<5:43:37,  2.28s/it]

training loss: 3.3864657878875732


training:  18%|█▊        | 1931/10986 [1:19:45<5:50:47,  2.32s/it]

training loss: 3.5172386169433594


training:  18%|█▊        | 1932/10986 [1:19:47<5:40:37,  2.26s/it]

training loss: 3.48335337638855


training:  18%|█▊        | 1933/10986 [1:19:50<5:49:20,  2.32s/it]

training loss: 3.500666618347168


training:  18%|█▊        | 1934/10986 [1:19:52<5:39:51,  2.25s/it]

training loss: 3.456923246383667


training:  18%|█▊        | 1935/10986 [1:19:54<5:48:18,  2.31s/it]

training loss: 3.3603103160858154


training:  18%|█▊        | 1936/10986 [1:19:56<5:38:54,  2.25s/it]

training loss: 3.4126675128936768


training:  18%|█▊        | 1937/10986 [1:19:59<5:45:09,  2.29s/it]

training loss: 3.5021612644195557


training:  18%|█▊        | 1938/10986 [1:20:01<5:37:24,  2.24s/it]

training loss: 3.4751410484313965


training:  18%|█▊        | 1939/10986 [1:20:03<5:47:15,  2.30s/it]

training loss: 3.4676270484924316


training:  18%|█▊        | 1940/10986 [1:20:05<5:38:05,  2.24s/it]

training loss: 3.4462594985961914
valid loss: 3.4534783363342285
perplexity: 31.610151290893555


training:  18%|█▊        | 1941/10986 [1:20:10<7:19:19,  2.91s/it]

training loss: 3.3444411754608154


training:  18%|█▊        | 1942/10986 [1:20:13<7:10:24,  2.86s/it]

training loss: 3.4167726039886475


training:  18%|█▊        | 1943/10986 [1:20:15<6:58:57,  2.78s/it]

training loss: 3.567357063293457


training:  18%|█▊        | 1944/10986 [1:20:18<6:41:14,  2.66s/it]

training loss: 3.528472900390625


training:  18%|█▊        | 1945/10986 [1:20:20<6:16:00,  2.50s/it]

training loss: 3.5576696395874023


training:  18%|█▊        | 1946/10986 [1:20:22<6:11:32,  2.47s/it]

training loss: 3.5154964923858643


training:  18%|█▊        | 1947/10986 [1:20:24<5:55:59,  2.36s/it]

training loss: 3.394697427749634


training:  18%|█▊        | 1948/10986 [1:20:27<5:58:45,  2.38s/it]

training loss: 3.4707274436950684


training:  18%|█▊        | 1949/10986 [1:20:29<5:45:49,  2.30s/it]

training loss: 3.3970730304718018


training:  18%|█▊        | 1950/10986 [1:20:31<5:50:24,  2.33s/it]

training loss: 3.4535973072052


training:  18%|█▊        | 1951/10986 [1:20:33<5:40:09,  2.26s/it]

training loss: 3.3544132709503174


training:  18%|█▊        | 1952/10986 [1:20:36<5:46:54,  2.30s/it]

training loss: 3.534130334854126


training:  18%|█▊        | 1953/10986 [1:20:38<5:39:22,  2.25s/it]

training loss: 3.5283877849578857


training:  18%|█▊        | 1954/10986 [1:20:40<5:46:40,  2.30s/it]

training loss: 3.430673599243164


training:  18%|█▊        | 1955/10986 [1:20:42<5:37:49,  2.24s/it]

training loss: 3.591376543045044


training:  18%|█▊        | 1956/10986 [1:20:45<5:45:20,  2.29s/it]

training loss: 3.370532751083374


training:  18%|█▊        | 1957/10986 [1:20:47<5:35:39,  2.23s/it]

training loss: 3.3935062885284424


training:  18%|█▊        | 1958/10986 [1:20:49<5:44:35,  2.29s/it]

training loss: 3.492398977279663


training:  18%|█▊        | 1959/10986 [1:20:51<5:38:21,  2.25s/it]

training loss: 3.545574188232422


training:  18%|█▊        | 1960/10986 [1:20:54<5:44:56,  2.29s/it]

training loss: 3.5055806636810303
valid loss: 3.5168256759643555
perplexity: 33.6773567199707


training:  18%|█▊        | 1961/10986 [1:20:58<7:30:48,  3.00s/it]

training loss: 3.540555000305176


training:  18%|█▊        | 1962/10986 [1:21:01<6:54:10,  2.75s/it]

training loss: 3.511733293533325


training:  18%|█▊        | 1963/10986 [1:21:03<6:39:21,  2.66s/it]

training loss: 3.4774370193481445


training:  18%|█▊        | 1964/10986 [1:21:05<6:17:09,  2.51s/it]

training loss: 3.4539027214050293


training:  18%|█▊        | 1965/10986 [1:21:08<6:14:45,  2.49s/it]

training loss: 3.5267210006713867


training:  18%|█▊        | 1966/10986 [1:21:10<5:58:44,  2.39s/it]

training loss: 3.4310882091522217


training:  18%|█▊        | 1967/10986 [1:21:12<6:11:23,  2.47s/it]

training loss: 3.4132890701293945


training:  18%|█▊        | 1968/10986 [1:21:15<5:59:51,  2.39s/it]

training loss: 3.412728786468506


training:  18%|█▊        | 1969/10986 [1:21:17<6:04:25,  2.42s/it]

training loss: 3.4502735137939453


training:  18%|█▊        | 1970/10986 [1:21:19<5:52:28,  2.35s/it]

training loss: 3.515279769897461


training:  18%|█▊        | 1971/10986 [1:21:22<5:55:08,  2.36s/it]

training loss: 3.465770721435547


training:  18%|█▊        | 1972/10986 [1:21:24<5:43:53,  2.29s/it]

training loss: 3.443298816680908


training:  18%|█▊        | 1973/10986 [1:21:26<5:54:33,  2.36s/it]

training loss: 3.356203317642212


training:  18%|█▊        | 1974/10986 [1:21:28<5:41:48,  2.28s/it]

training loss: 3.4679245948791504


training:  18%|█▊        | 1975/10986 [1:21:31<5:46:40,  2.31s/it]

training loss: 3.4294488430023193


training:  18%|█▊        | 1976/10986 [1:21:33<5:38:52,  2.26s/it]

training loss: 3.49961519241333


training:  18%|█▊        | 1977/10986 [1:21:35<5:46:45,  2.31s/it]

training loss: 3.397414207458496


training:  18%|█▊        | 1978/10986 [1:21:37<5:37:08,  2.25s/it]

training loss: 3.4708638191223145


training:  18%|█▊        | 1979/10986 [1:21:40<5:43:17,  2.29s/it]

training loss: 3.574890613555908


training:  18%|█▊        | 1980/10986 [1:21:42<5:33:57,  2.22s/it]

training loss: 3.3642022609710693
valid loss: 3.4897992610931396
perplexity: 32.77936553955078


training:  18%|█▊        | 1981/10986 [1:21:47<7:34:08,  3.03s/it]

training loss: 3.38657808303833


training:  18%|█▊        | 1982/10986 [1:21:49<7:11:34,  2.88s/it]

training loss: 3.432847023010254


training:  18%|█▊        | 1983/10986 [1:21:51<6:38:03,  2.65s/it]

training loss: 3.3605079650878906


training:  18%|█▊        | 1984/10986 [1:21:55<7:00:09,  2.80s/it]

training loss: 3.4905524253845215


training:  18%|█▊        | 1985/10986 [1:21:57<6:50:10,  2.73s/it]

training loss: 3.345304250717163


training:  18%|█▊        | 1986/10986 [1:22:00<6:36:30,  2.64s/it]

training loss: 3.36002254486084


training:  18%|█▊        | 1987/10986 [1:22:02<6:12:05,  2.48s/it]

training loss: 3.3625893592834473


training:  18%|█▊        | 1988/10986 [1:22:04<6:07:59,  2.45s/it]

training loss: 3.6998350620269775


training:  18%|█▊        | 1989/10986 [1:22:06<5:53:31,  2.36s/it]

training loss: 3.4182844161987305


training:  18%|█▊        | 1990/10986 [1:22:09<5:54:11,  2.36s/it]

training loss: 3.3685007095336914


training:  18%|█▊        | 1991/10986 [1:22:11<5:42:03,  2.28s/it]

training loss: 3.4527037143707275


training:  18%|█▊        | 1992/10986 [1:22:13<5:54:46,  2.37s/it]

training loss: 3.4807963371276855


training:  18%|█▊        | 1993/10986 [1:22:16<5:48:40,  2.33s/it]

training loss: 3.485844373703003


training:  18%|█▊        | 1994/10986 [1:22:18<5:51:37,  2.35s/it]

training loss: 3.378800868988037


training:  18%|█▊        | 1995/10986 [1:22:20<5:39:44,  2.27s/it]

training loss: 3.418782949447632


training:  18%|█▊        | 1996/10986 [1:22:22<5:45:09,  2.30s/it]

training loss: 3.45112681388855


training:  18%|█▊        | 1997/10986 [1:22:24<5:35:39,  2.24s/it]

training loss: 3.388533353805542


training:  18%|█▊        | 1998/10986 [1:22:27<5:43:22,  2.29s/it]

training loss: 3.4201910495758057


training:  18%|█▊        | 1999/10986 [1:22:29<5:33:17,  2.23s/it]

training loss: 3.395975112915039


training:  18%|█▊        | 2000/10986 [1:22:31<5:41:36,  2.28s/it]

training loss: 3.5259149074554443
valid loss: 3.445276975631714
perplexity: 31.351966857910156


training:  18%|█▊        | 2001/10986 [1:22:37<7:58:44,  3.20s/it]

training loss: 3.4499261379241943


training:  18%|█▊        | 2002/10986 [1:22:39<7:14:23,  2.90s/it]

training loss: 3.4139647483825684


training:  18%|█▊        | 2003/10986 [1:22:41<6:55:42,  2.78s/it]

training loss: 3.553722858428955


training:  18%|█▊        | 2004/10986 [1:22:43<6:25:40,  2.58s/it]

training loss: 3.4470784664154053


training:  18%|█▊        | 2005/10986 [1:22:46<6:16:13,  2.51s/it]

training loss: 3.376692771911621


training:  18%|█▊        | 2006/10986 [1:22:48<5:56:24,  2.38s/it]

training loss: 3.4550113677978516


training:  18%|█▊        | 2007/10986 [1:22:50<5:57:45,  2.39s/it]

training loss: 3.464614152908325


training:  18%|█▊        | 2008/10986 [1:22:52<5:44:32,  2.30s/it]

training loss: 3.4020824432373047


training:  18%|█▊        | 2009/10986 [1:22:55<5:49:58,  2.34s/it]

training loss: 3.4818832874298096


training:  18%|█▊        | 2010/10986 [1:22:57<5:38:55,  2.27s/it]

training loss: 3.5478193759918213


training:  18%|█▊        | 2011/10986 [1:22:59<5:44:29,  2.30s/it]

training loss: 3.4370462894439697


training:  18%|█▊        | 2012/10986 [1:23:01<5:34:26,  2.24s/it]

training loss: 3.4677159786224365


training:  18%|█▊        | 2013/10986 [1:23:04<5:41:10,  2.28s/it]

training loss: 3.4514098167419434


training:  18%|█▊        | 2014/10986 [1:23:06<5:33:21,  2.23s/it]

training loss: 3.376138687133789


training:  18%|█▊        | 2015/10986 [1:23:08<5:41:32,  2.28s/it]

training loss: 3.34822416305542


training:  18%|█▊        | 2016/10986 [1:23:10<5:33:13,  2.23s/it]

training loss: 3.421478033065796


training:  18%|█▊        | 2017/10986 [1:23:13<5:41:12,  2.28s/it]

training loss: 3.6391379833221436


training:  18%|█▊        | 2018/10986 [1:23:15<5:44:11,  2.30s/it]

training loss: 3.4077653884887695


training:  18%|█▊        | 2019/10986 [1:23:18<6:13:39,  2.50s/it]

training loss: 3.408769369125366


training:  18%|█▊        | 2020/10986 [1:23:20<5:55:29,  2.38s/it]

training loss: 3.367779493331909
valid loss: 3.4316253662109375
perplexity: 30.926870346069336


training:  18%|█▊        | 2021/10986 [1:23:25<7:31:39,  3.02s/it]

training loss: 3.591423511505127


training:  18%|█▊        | 2022/10986 [1:23:27<7:08:39,  2.87s/it]

training loss: 3.4546804428100586


training:  18%|█▊        | 2023/10986 [1:23:29<6:34:22,  2.64s/it]

training loss: 3.3643112182617188


training:  18%|█▊        | 2024/10986 [1:23:32<6:24:17,  2.57s/it]

training loss: 3.425421714782715


training:  18%|█▊        | 2025/10986 [1:23:34<6:02:36,  2.43s/it]

training loss: 3.553401470184326


training:  18%|█▊        | 2026/10986 [1:23:36<6:00:28,  2.41s/it]

training loss: 3.264932632446289


training:  18%|█▊        | 2027/10986 [1:23:38<5:46:37,  2.32s/it]

training loss: 3.3718924522399902


training:  18%|█▊        | 2028/10986 [1:23:41<5:49:41,  2.34s/it]

training loss: 3.305095672607422


training:  18%|█▊        | 2029/10986 [1:23:43<5:38:18,  2.27s/it]

training loss: 3.51167631149292


training:  18%|█▊        | 2030/10986 [1:23:45<5:45:28,  2.31s/it]

training loss: 3.3921408653259277


training:  18%|█▊        | 2031/10986 [1:23:47<5:35:48,  2.25s/it]

training loss: 3.356149196624756


training:  18%|█▊        | 2032/10986 [1:23:50<5:43:21,  2.30s/it]

training loss: 3.3759946823120117


training:  19%|█▊        | 2033/10986 [1:23:52<5:34:26,  2.24s/it]

training loss: 3.437296152114868


training:  19%|█▊        | 2034/10986 [1:23:54<5:43:21,  2.30s/it]

training loss: 3.444908618927002


training:  19%|█▊        | 2035/10986 [1:23:56<5:34:22,  2.24s/it]

training loss: 3.3747808933258057


training:  19%|█▊        | 2036/10986 [1:23:59<5:42:45,  2.30s/it]

training loss: 3.5122456550598145


training:  19%|█▊        | 2037/10986 [1:24:01<5:32:18,  2.23s/it]

training loss: 3.4521634578704834


training:  19%|█▊        | 2038/10986 [1:24:03<5:40:10,  2.28s/it]

training loss: 3.516543388366699


training:  19%|█▊        | 2039/10986 [1:24:05<5:32:12,  2.23s/it]

training loss: 3.430941104888916


training:  19%|█▊        | 2040/10986 [1:24:08<5:40:23,  2.28s/it]

training loss: 3.4850621223449707
valid loss: 3.3687734603881836
perplexity: 29.042884826660156


training:  19%|█▊        | 2041/10986 [1:24:13<7:26:13,  2.99s/it]

training loss: 3.411604404449463


training:  19%|█▊        | 2042/10986 [1:24:15<7:04:42,  2.85s/it]

training loss: 3.460312843322754


training:  19%|█▊        | 2043/10986 [1:24:17<6:45:10,  2.72s/it]

training loss: 3.578033685684204


training:  19%|█▊        | 2044/10986 [1:24:20<6:18:23,  2.54s/it]

training loss: 3.4130921363830566


training:  19%|█▊        | 2045/10986 [1:24:22<6:12:45,  2.50s/it]

training loss: 3.527859926223755


training:  19%|█▊        | 2046/10986 [1:24:24<5:53:10,  2.37s/it]

training loss: 3.287621021270752


training:  19%|█▊        | 2047/10986 [1:24:26<5:54:52,  2.38s/it]

training loss: 3.451366901397705


training:  19%|█▊        | 2048/10986 [1:24:29<5:41:07,  2.29s/it]

training loss: 3.3434231281280518


training:  19%|█▊        | 2049/10986 [1:24:31<5:46:42,  2.33s/it]

training loss: 3.7058701515197754


training:  19%|█▊        | 2050/10986 [1:24:33<5:46:06,  2.32s/it]

training loss: 3.4216039180755615


training:  19%|█▊        | 2051/10986 [1:24:37<6:43:51,  2.71s/it]

training loss: 3.5239126682281494


training:  19%|█▊        | 2052/10986 [1:24:39<6:21:11,  2.56s/it]

training loss: 3.4239654541015625


training:  19%|█▊        | 2053/10986 [1:24:42<6:15:20,  2.52s/it]

training loss: 3.4789888858795166


training:  19%|█▊        | 2054/10986 [1:24:44<5:56:55,  2.40s/it]

training loss: 3.360020399093628


training:  19%|█▊        | 2055/10986 [1:24:46<5:56:02,  2.39s/it]

training loss: 3.371049404144287


training:  19%|█▊        | 2056/10986 [1:24:48<5:42:48,  2.30s/it]

training loss: 3.4839982986450195


training:  19%|█▊        | 2057/10986 [1:24:51<5:48:10,  2.34s/it]

training loss: 3.367229461669922


training:  19%|█▊        | 2058/10986 [1:24:53<5:37:02,  2.27s/it]

training loss: 3.379373550415039


training:  19%|█▊        | 2059/10986 [1:24:55<5:44:47,  2.32s/it]

training loss: 3.527536153793335


training:  19%|█▉        | 2060/10986 [1:24:57<5:35:40,  2.26s/it]

training loss: 3.39176607131958
valid loss: 3.364893913269043
perplexity: 28.93042755126953


training:  19%|█▉        | 2061/10986 [1:25:02<7:28:48,  3.02s/it]

training loss: 3.501659631729126


training:  19%|█▉        | 2062/10986 [1:25:04<7:05:49,  2.86s/it]

training loss: 3.4560415744781494


training:  19%|█▉        | 2063/10986 [1:25:07<6:33:19,  2.64s/it]

training loss: 3.3581061363220215


training:  19%|█▉        | 2064/10986 [1:25:09<6:23:45,  2.58s/it]

training loss: 3.3262736797332764


training:  19%|█▉        | 2065/10986 [1:25:11<6:02:16,  2.44s/it]

training loss: 3.404278516769409


training:  19%|█▉        | 2066/10986 [1:25:14<5:59:45,  2.42s/it]

training loss: 3.3844144344329834


training:  19%|█▉        | 2067/10986 [1:25:16<5:45:37,  2.33s/it]

training loss: 3.3774449825286865


training:  19%|█▉        | 2068/10986 [1:25:18<5:50:05,  2.36s/it]

training loss: 3.3729734420776367


training:  19%|█▉        | 2069/10986 [1:25:20<5:38:57,  2.28s/it]

training loss: 3.4007015228271484


training:  19%|█▉        | 2070/10986 [1:25:23<5:45:19,  2.32s/it]

training loss: 3.3586933612823486


training:  19%|█▉        | 2071/10986 [1:25:25<5:35:37,  2.26s/it]

training loss: 3.5038368701934814


training:  19%|█▉        | 2072/10986 [1:25:27<5:42:10,  2.30s/it]

training loss: 3.433276891708374


training:  19%|█▉        | 2073/10986 [1:25:29<5:33:07,  2.24s/it]

training loss: 3.3498833179473877


training:  19%|█▉        | 2074/10986 [1:25:32<5:41:06,  2.30s/it]

training loss: 3.344728946685791


training:  19%|█▉        | 2075/10986 [1:25:34<5:31:23,  2.23s/it]

training loss: 3.443568706512451


training:  19%|█▉        | 2076/10986 [1:25:36<5:43:00,  2.31s/it]

training loss: 3.4331116676330566


training:  19%|█▉        | 2077/10986 [1:25:38<5:34:06,  2.25s/it]

training loss: 3.4091644287109375


training:  19%|█▉        | 2078/10986 [1:25:41<5:41:03,  2.30s/it]

training loss: 3.5419929027557373


training:  19%|█▉        | 2079/10986 [1:25:43<5:33:31,  2.25s/it]

training loss: 3.374526023864746


training:  19%|█▉        | 2080/10986 [1:25:45<5:43:42,  2.32s/it]

training loss: 3.3645899295806885
valid loss: 3.371920347213745
perplexity: 29.134422302246094


training:  19%|█▉        | 2081/10986 [1:25:51<7:58:06,  3.22s/it]

training loss: 3.3118748664855957


training:  19%|█▉        | 2082/10986 [1:25:53<7:12:25,  2.91s/it]

training loss: 3.507092237472534


training:  19%|█▉        | 2083/10986 [1:25:55<6:53:45,  2.79s/it]

training loss: 3.3181090354919434


training:  19%|█▉        | 2084/10986 [1:25:57<6:24:08,  2.59s/it]

training loss: 3.362124443054199


training:  19%|█▉        | 2085/10986 [1:26:00<6:15:37,  2.53s/it]

training loss: 3.4549498558044434


training:  19%|█▉        | 2086/10986 [1:26:02<5:56:31,  2.40s/it]

training loss: 3.3563108444213867


training:  19%|█▉        | 2087/10986 [1:26:04<5:56:40,  2.40s/it]

training loss: 3.40212082862854


training:  19%|█▉        | 2088/10986 [1:26:07<5:45:59,  2.33s/it]

training loss: 3.4393393993377686


training:  19%|█▉        | 2089/10986 [1:26:09<5:50:47,  2.37s/it]

training loss: 3.452462911605835


training:  19%|█▉        | 2090/10986 [1:26:11<5:38:25,  2.28s/it]

training loss: 3.492420196533203


training:  19%|█▉        | 2091/10986 [1:26:14<5:46:52,  2.34s/it]

training loss: 3.440023422241211


training:  19%|█▉        | 2092/10986 [1:26:16<5:37:16,  2.28s/it]

training loss: 3.555297613143921


training:  19%|█▉        | 2093/10986 [1:26:18<5:43:04,  2.31s/it]

training loss: 3.517090082168579


training:  19%|█▉        | 2094/10986 [1:26:20<5:33:22,  2.25s/it]

training loss: 3.4338369369506836


training:  19%|█▉        | 2095/10986 [1:26:23<5:40:27,  2.30s/it]

training loss: 3.5171585083007812


training:  19%|█▉        | 2096/10986 [1:26:25<5:31:20,  2.24s/it]

training loss: 3.2282285690307617


training:  19%|█▉        | 2097/10986 [1:26:27<5:40:06,  2.30s/it]

training loss: 3.6766467094421387


training:  19%|█▉        | 2098/10986 [1:26:29<5:31:20,  2.24s/it]

training loss: 3.4110429286956787


training:  19%|█▉        | 2099/10986 [1:26:32<5:38:10,  2.28s/it]

training loss: 3.3368940353393555


training:  19%|█▉        | 2100/10986 [1:26:34<5:30:43,  2.23s/it]

training loss: 3.408186197280884
valid loss: 3.552926540374756
perplexity: 34.915348052978516


training:  19%|█▉        | 2101/10986 [1:26:39<7:32:24,  3.06s/it]

training loss: 3.400491714477539


training:  19%|█▉        | 2102/10986 [1:26:41<7:09:48,  2.90s/it]

training loss: 3.427861452102661


training:  19%|█▉        | 2103/10986 [1:26:43<6:34:27,  2.66s/it]

training loss: 3.467454433441162


training:  19%|█▉        | 2104/10986 [1:26:46<6:23:45,  2.59s/it]

training loss: 3.435736894607544


training:  19%|█▉        | 2105/10986 [1:26:48<6:02:12,  2.45s/it]

training loss: 3.390953779220581


training:  19%|█▉        | 2106/10986 [1:26:50<6:01:01,  2.44s/it]

training loss: 3.366954803466797


training:  19%|█▉        | 2107/10986 [1:26:52<5:45:56,  2.34s/it]

training loss: 3.462665557861328


training:  19%|█▉        | 2108/10986 [1:26:55<5:48:34,  2.36s/it]

training loss: 3.438220977783203


training:  19%|█▉        | 2109/10986 [1:26:57<5:38:24,  2.29s/it]

training loss: 3.5173089504241943


training:  19%|█▉        | 2110/10986 [1:26:59<5:44:55,  2.33s/it]

training loss: 3.471652030944824


training:  19%|█▉        | 2111/10986 [1:27:01<5:35:38,  2.27s/it]

training loss: 3.247629404067993


training:  19%|█▉        | 2112/10986 [1:27:04<5:42:48,  2.32s/it]

training loss: 3.5470077991485596


training:  19%|█▉        | 2113/10986 [1:27:06<5:33:17,  2.25s/it]

training loss: 3.32305908203125


training:  19%|█▉        | 2114/10986 [1:27:08<5:40:34,  2.30s/it]

training loss: 3.4656801223754883


training:  19%|█▉        | 2115/10986 [1:27:11<5:31:07,  2.24s/it]

training loss: 3.6364104747772217


training:  19%|█▉        | 2116/10986 [1:27:13<5:42:08,  2.31s/it]

training loss: 3.413649559020996


training:  19%|█▉        | 2117/10986 [1:27:16<6:13:06,  2.52s/it]

training loss: 3.444823741912842


training:  19%|█▉        | 2118/10986 [1:27:19<6:28:10,  2.63s/it]

training loss: 3.419046401977539


training:  19%|█▉        | 2119/10986 [1:27:21<6:06:29,  2.48s/it]

training loss: 3.3629214763641357


training:  19%|█▉        | 2120/10986 [1:27:23<6:04:12,  2.46s/it]

training loss: 3.413147449493408
valid loss: 3.4847025871276855
perplexity: 32.61272430419922


training:  19%|█▉        | 2121/10986 [1:27:28<7:55:08,  3.22s/it]

training loss: 3.3192763328552246


training:  19%|█▉        | 2122/10986 [1:27:31<7:08:09,  2.90s/it]

training loss: 3.508225440979004


training:  19%|█▉        | 2123/10986 [1:27:33<6:45:00,  2.74s/it]

training loss: 3.4699995517730713


training:  19%|█▉        | 2124/10986 [1:27:35<6:17:07,  2.55s/it]

training loss: 3.4526870250701904


training:  19%|█▉        | 2125/10986 [1:27:37<6:11:01,  2.51s/it]

training loss: 3.5144410133361816


training:  19%|█▉        | 2126/10986 [1:27:40<5:52:56,  2.39s/it]

training loss: 3.495678424835205


training:  19%|█▉        | 2127/10986 [1:27:42<5:52:37,  2.39s/it]

training loss: 3.4886720180511475


training:  19%|█▉        | 2128/10986 [1:27:44<5:40:00,  2.30s/it]

training loss: 3.421842575073242


training:  19%|█▉        | 2129/10986 [1:27:47<5:45:30,  2.34s/it]

training loss: 3.445139169692993


training:  19%|█▉        | 2130/10986 [1:27:49<5:53:44,  2.40s/it]

training loss: 3.3387908935546875


training:  19%|█▉        | 2131/10986 [1:27:51<5:55:43,  2.41s/it]

training loss: 3.3190414905548096


training:  19%|█▉        | 2132/10986 [1:27:54<5:43:14,  2.33s/it]

training loss: 3.4666829109191895


training:  19%|█▉        | 2133/10986 [1:27:56<5:47:33,  2.36s/it]

training loss: 3.426417112350464


training:  19%|█▉        | 2134/10986 [1:27:58<5:36:20,  2.28s/it]

training loss: 3.5550427436828613


training:  19%|█▉        | 2135/10986 [1:28:01<5:42:06,  2.32s/it]

training loss: 3.4265475273132324


training:  19%|█▉        | 2136/10986 [1:28:03<5:34:37,  2.27s/it]

training loss: 3.3856425285339355


training:  19%|█▉        | 2137/10986 [1:28:05<5:43:17,  2.33s/it]

training loss: 3.333388090133667


training:  19%|█▉        | 2138/10986 [1:28:07<5:34:55,  2.27s/it]

training loss: 3.4039273262023926


training:  19%|█▉        | 2139/10986 [1:28:10<5:41:57,  2.32s/it]

training loss: 3.398165464401245


training:  19%|█▉        | 2140/10986 [1:28:12<5:32:56,  2.26s/it]

training loss: 3.4245448112487793
valid loss: 3.4430031776428223
perplexity: 31.280759811401367


training:  19%|█▉        | 2141/10986 [1:28:18<8:10:06,  3.32s/it]

training loss: 3.4109957218170166


training:  19%|█▉        | 2142/10986 [1:28:20<7:35:40,  3.09s/it]

training loss: 3.450355052947998


training:  20%|█▉        | 2143/10986 [1:28:22<6:50:08,  2.78s/it]

training loss: 3.4289755821228027


training:  20%|█▉        | 2144/10986 [1:28:25<6:32:42,  2.66s/it]

training loss: 3.3761749267578125


training:  20%|█▉        | 2145/10986 [1:28:27<6:07:31,  2.49s/it]

training loss: 3.360069751739502


training:  20%|█▉        | 2146/10986 [1:28:29<6:02:44,  2.46s/it]

training loss: 3.4233827590942383


training:  20%|█▉        | 2147/10986 [1:28:31<5:46:26,  2.35s/it]

training loss: 3.534393310546875


training:  20%|█▉        | 2148/10986 [1:28:34<5:49:26,  2.37s/it]

training loss: 3.5591955184936523


training:  20%|█▉        | 2149/10986 [1:28:36<5:37:17,  2.29s/it]

training loss: 3.439016342163086


training:  20%|█▉        | 2150/10986 [1:28:38<5:44:43,  2.34s/it]

training loss: 3.338066816329956


training:  20%|█▉        | 2151/10986 [1:28:40<5:34:26,  2.27s/it]

training loss: 3.4904708862304688


training:  20%|█▉        | 2152/10986 [1:28:43<5:40:25,  2.31s/it]

training loss: 3.535216808319092


training:  20%|█▉        | 2153/10986 [1:28:45<5:32:37,  2.26s/it]

training loss: 3.5244109630584717


training:  20%|█▉        | 2154/10986 [1:28:47<5:41:05,  2.32s/it]

training loss: 3.429048776626587


training:  20%|█▉        | 2155/10986 [1:28:49<5:30:54,  2.25s/it]

training loss: 3.3947291374206543


training:  20%|█▉        | 2156/10986 [1:28:52<5:38:20,  2.30s/it]

training loss: 3.401859998703003


training:  20%|█▉        | 2157/10986 [1:28:54<5:30:00,  2.24s/it]

training loss: 3.5113630294799805


training:  20%|█▉        | 2158/10986 [1:28:56<5:38:34,  2.30s/it]

training loss: 3.6478586196899414


training:  20%|█▉        | 2159/10986 [1:28:58<5:28:26,  2.23s/it]

training loss: 3.5053884983062744


training:  20%|█▉        | 2160/10986 [1:29:01<5:35:05,  2.28s/it]

training loss: 3.4912521839141846
valid loss: 3.318004846572876
perplexity: 27.6052188873291


training:  20%|█▉        | 2161/10986 [1:29:05<7:18:24,  2.98s/it]

training loss: 3.4351818561553955


training:  20%|█▉        | 2162/10986 [1:29:08<6:42:40,  2.74s/it]

training loss: 3.3600943088531494


training:  20%|█▉        | 2163/10986 [1:29:10<6:30:08,  2.65s/it]

training loss: 3.4447531700134277


training:  20%|█▉        | 2164/10986 [1:29:12<6:06:05,  2.49s/it]

training loss: 3.5419580936431885


training:  20%|█▉        | 2165/10986 [1:29:15<6:02:48,  2.47s/it]

training loss: 3.39910626411438


training:  20%|█▉        | 2166/10986 [1:29:17<5:46:53,  2.36s/it]

training loss: 3.4705238342285156


training:  20%|█▉        | 2167/10986 [1:29:19<5:49:29,  2.38s/it]

training loss: 3.355884313583374


training:  20%|█▉        | 2168/10986 [1:29:21<5:38:10,  2.30s/it]

training loss: 3.4828200340270996


training:  20%|█▉        | 2169/10986 [1:29:24<5:43:53,  2.34s/it]

training loss: 3.3593099117279053


training:  20%|█▉        | 2170/10986 [1:29:26<5:32:49,  2.27s/it]

training loss: 3.32374906539917


training:  20%|█▉        | 2171/10986 [1:29:28<5:37:39,  2.30s/it]

training loss: 3.354832410812378


training:  20%|█▉        | 2172/10986 [1:29:30<5:29:13,  2.24s/it]

training loss: 3.3653907775878906


training:  20%|█▉        | 2173/10986 [1:29:33<5:37:40,  2.30s/it]

training loss: 3.5589559078216553


training:  20%|█▉        | 2174/10986 [1:29:35<5:28:41,  2.24s/it]

training loss: 3.403062105178833


training:  20%|█▉        | 2175/10986 [1:29:37<5:36:15,  2.29s/it]

training loss: 3.4048991203308105


training:  20%|█▉        | 2176/10986 [1:29:39<5:28:46,  2.24s/it]

training loss: 3.436147451400757


training:  20%|█▉        | 2177/10986 [1:29:42<5:38:10,  2.30s/it]

training loss: 3.3604183197021484


training:  20%|█▉        | 2178/10986 [1:29:44<5:30:57,  2.25s/it]

training loss: 3.412114381790161


training:  20%|█▉        | 2179/10986 [1:29:47<6:13:22,  2.54s/it]

training loss: 3.426581859588623


training:  20%|█▉        | 2180/10986 [1:29:49<5:53:48,  2.41s/it]

training loss: 3.3827695846557617
valid loss: 3.3386404514312744
perplexity: 28.180788040161133


training:  20%|█▉        | 2181/10986 [1:29:55<8:29:36,  3.47s/it]

training loss: 3.349087715148926


training:  20%|█▉        | 2182/10986 [1:29:58<7:59:23,  3.27s/it]

training loss: 3.3329694271087646


training:  20%|█▉        | 2183/10986 [1:30:00<7:07:30,  2.91s/it]

training loss: 3.4551854133605957


training:  20%|█▉        | 2184/10986 [1:30:02<6:44:41,  2.76s/it]

training loss: 3.327178716659546


training:  20%|█▉        | 2185/10986 [1:30:05<6:15:28,  2.56s/it]

training loss: 3.543355703353882


training:  20%|█▉        | 2186/10986 [1:30:07<6:08:23,  2.51s/it]

training loss: 3.410407066345215


training:  20%|█▉        | 2187/10986 [1:30:09<5:50:08,  2.39s/it]

training loss: 3.412665843963623


training:  20%|█▉        | 2188/10986 [1:30:11<5:51:12,  2.40s/it]

training loss: 3.5229952335357666


training:  20%|█▉        | 2189/10986 [1:30:14<5:38:01,  2.31s/it]

training loss: 3.33489727973938


training:  20%|█▉        | 2190/10986 [1:30:16<5:42:43,  2.34s/it]

training loss: 3.5894393920898438


training:  20%|█▉        | 2191/10986 [1:30:18<5:32:48,  2.27s/it]

training loss: 3.429517984390259


training:  20%|█▉        | 2192/10986 [1:30:21<5:40:29,  2.32s/it]

training loss: 3.442668914794922


training:  20%|█▉        | 2193/10986 [1:30:23<5:29:56,  2.25s/it]

training loss: 3.428598642349243


training:  20%|█▉        | 2194/10986 [1:30:25<5:37:28,  2.30s/it]

training loss: 3.4282798767089844


training:  20%|█▉        | 2195/10986 [1:30:27<5:30:11,  2.25s/it]

training loss: 3.5086312294006348


training:  20%|█▉        | 2196/10986 [1:30:30<5:38:00,  2.31s/it]

training loss: 3.4269046783447266


training:  20%|█▉        | 2197/10986 [1:30:32<5:29:02,  2.25s/it]

training loss: 3.4415972232818604


training:  20%|██        | 2198/10986 [1:30:34<5:36:58,  2.30s/it]

training loss: 3.328488826751709


training:  20%|██        | 2199/10986 [1:30:36<5:29:42,  2.25s/it]

training loss: 3.425485849380493


training:  20%|██        | 2200/10986 [1:30:39<5:36:23,  2.30s/it]

training loss: 3.4743552207946777
valid loss: 3.4805452823638916
perplexity: 32.47742462158203


training:  20%|██        | 2201/10986 [1:30:44<7:42:06,  3.16s/it]

training loss: 3.485891342163086


training:  20%|██        | 2202/10986 [1:30:46<6:58:12,  2.86s/it]

training loss: 3.551285982131958


training:  20%|██        | 2203/10986 [1:30:48<6:38:37,  2.72s/it]

training loss: 3.4459292888641357


training:  20%|██        | 2204/10986 [1:30:51<6:12:56,  2.55s/it]

training loss: 3.416304588317871


training:  20%|██        | 2205/10986 [1:30:53<6:08:24,  2.52s/it]

training loss: 3.575197696685791


training:  20%|██        | 2206/10986 [1:30:55<5:51:43,  2.40s/it]

training loss: 3.379410743713379


training:  20%|██        | 2207/10986 [1:30:58<5:53:01,  2.41s/it]

training loss: 3.355238676071167


training:  20%|██        | 2208/10986 [1:31:00<5:39:42,  2.32s/it]

training loss: 3.496314287185669


training:  20%|██        | 2209/10986 [1:31:02<5:43:48,  2.35s/it]

training loss: 3.5657477378845215


training:  20%|██        | 2210/10986 [1:31:04<5:33:36,  2.28s/it]

training loss: 3.320802688598633


training:  20%|██        | 2211/10986 [1:31:07<5:40:01,  2.32s/it]

training loss: 3.439976215362549


training:  20%|██        | 2212/10986 [1:31:09<5:32:01,  2.27s/it]

training loss: 3.550098419189453


training:  20%|██        | 2213/10986 [1:31:11<5:40:00,  2.33s/it]

training loss: 3.521672010421753


training:  20%|██        | 2214/10986 [1:31:13<5:30:10,  2.26s/it]

training loss: 3.4174509048461914


training:  20%|██        | 2215/10986 [1:31:16<5:37:21,  2.31s/it]

training loss: 3.4467861652374268


training:  20%|██        | 2216/10986 [1:31:18<5:28:12,  2.25s/it]

training loss: 3.376007556915283


training:  20%|██        | 2217/10986 [1:31:20<5:36:53,  2.31s/it]

training loss: 3.3596243858337402


training:  20%|██        | 2218/10986 [1:31:22<5:27:12,  2.24s/it]

training loss: 3.3989386558532715


training:  20%|██        | 2219/10986 [1:31:25<5:35:26,  2.30s/it]

training loss: 3.354100227355957


training:  20%|██        | 2220/10986 [1:31:27<5:27:29,  2.24s/it]

training loss: 3.2946956157684326
valid loss: 3.2986488342285156
perplexity: 27.076030731201172


training:  20%|██        | 2221/10986 [1:31:31<7:07:15,  2.92s/it]

training loss: 3.4774482250213623


training:  20%|██        | 2222/10986 [1:31:34<6:50:09,  2.81s/it]

training loss: 3.446507215499878


training:  20%|██        | 2223/10986 [1:31:36<6:20:57,  2.61s/it]

training loss: 3.447371006011963


training:  20%|██        | 2224/10986 [1:31:39<6:12:26,  2.55s/it]

training loss: 3.3509719371795654


training:  20%|██        | 2225/10986 [1:31:41<5:54:01,  2.42s/it]

training loss: 3.406853675842285


training:  20%|██        | 2226/10986 [1:31:43<5:55:10,  2.43s/it]

training loss: 3.4856362342834473


training:  20%|██        | 2227/10986 [1:31:45<5:48:57,  2.39s/it]

training loss: 3.3303515911102295


training:  20%|██        | 2228/10986 [1:31:48<6:00:04,  2.47s/it]

training loss: 3.3983254432678223


training:  20%|██        | 2229/10986 [1:31:50<5:44:59,  2.36s/it]

training loss: 3.417778968811035


training:  20%|██        | 2230/10986 [1:31:53<5:49:40,  2.40s/it]

training loss: 3.507500171661377


training:  20%|██        | 2231/10986 [1:31:55<5:38:05,  2.32s/it]

training loss: 3.4442005157470703


training:  20%|██        | 2232/10986 [1:31:57<5:46:43,  2.38s/it]

training loss: 3.3763539791107178


training:  20%|██        | 2233/10986 [1:31:59<5:36:13,  2.30s/it]

training loss: 3.4019017219543457


training:  20%|██        | 2234/10986 [1:32:02<5:43:25,  2.35s/it]

training loss: 3.396972894668579


training:  20%|██        | 2235/10986 [1:32:04<5:35:24,  2.30s/it]

training loss: 3.4003026485443115


training:  20%|██        | 2236/10986 [1:32:07<5:45:24,  2.37s/it]

training loss: 3.4897334575653076


training:  20%|██        | 2237/10986 [1:32:09<5:34:18,  2.29s/it]

training loss: 3.4371652603149414


training:  20%|██        | 2238/10986 [1:32:11<5:43:55,  2.36s/it]

training loss: 3.5484397411346436


training:  20%|██        | 2239/10986 [1:32:13<5:36:28,  2.31s/it]

training loss: 3.3402562141418457


training:  20%|██        | 2240/10986 [1:32:16<5:43:18,  2.36s/it]

training loss: 3.378145933151245
valid loss: 3.3554012775421143
perplexity: 28.657102584838867


training:  20%|██        | 2241/10986 [1:32:21<7:36:23,  3.13s/it]

training loss: 3.3640120029449463


training:  20%|██        | 2242/10986 [1:32:23<6:55:42,  2.85s/it]

training loss: 3.5350728034973145


training:  20%|██        | 2243/10986 [1:32:26<6:38:44,  2.74s/it]

training loss: 3.429356575012207


training:  20%|██        | 2244/10986 [1:32:28<6:11:38,  2.55s/it]

training loss: 3.3144655227661133


training:  20%|██        | 2245/10986 [1:32:30<6:08:51,  2.53s/it]

training loss: 3.363912582397461


training:  20%|██        | 2246/10986 [1:32:32<5:55:11,  2.44s/it]

training loss: 3.3633310794830322


training:  20%|██        | 2247/10986 [1:32:36<6:49:13,  2.81s/it]

training loss: 3.4240269660949707


training:  20%|██        | 2248/10986 [1:32:38<6:24:17,  2.64s/it]

training loss: 3.3741931915283203


training:  20%|██        | 2249/10986 [1:32:41<6:19:37,  2.61s/it]

training loss: 3.353776454925537


training:  20%|██        | 2250/10986 [1:32:43<6:00:14,  2.47s/it]

training loss: 3.325479507446289


training:  20%|██        | 2251/10986 [1:32:45<6:00:00,  2.47s/it]

training loss: 3.4853627681732178


training:  20%|██        | 2252/10986 [1:32:48<5:43:43,  2.36s/it]

training loss: 3.5011355876922607


training:  21%|██        | 2253/10986 [1:32:50<5:50:11,  2.41s/it]

training loss: 3.4628283977508545


training:  21%|██        | 2254/10986 [1:32:52<5:39:23,  2.33s/it]

training loss: 3.4186477661132812


training:  21%|██        | 2255/10986 [1:32:55<5:46:01,  2.38s/it]

training loss: 3.4953808784484863


training:  21%|██        | 2256/10986 [1:32:57<5:35:40,  2.31s/it]

training loss: 3.3926596641540527


training:  21%|██        | 2257/10986 [1:32:59<5:45:04,  2.37s/it]

training loss: 3.5021791458129883


training:  21%|██        | 2258/10986 [1:33:01<5:34:34,  2.30s/it]

training loss: 3.3914847373962402


training:  21%|██        | 2259/10986 [1:33:04<5:40:48,  2.34s/it]

training loss: 3.4034781455993652


training:  21%|██        | 2260/10986 [1:33:06<5:31:35,  2.28s/it]

training loss: 3.3642473220825195
valid loss: 3.4149422645568848
perplexity: 30.415193557739258


training:  21%|██        | 2261/10986 [1:33:11<7:15:52,  3.00s/it]

training loss: 3.3927524089813232


training:  21%|██        | 2262/10986 [1:33:13<6:56:24,  2.86s/it]

training loss: 3.4367311000823975


training:  21%|██        | 2263/10986 [1:33:15<6:24:48,  2.65s/it]

training loss: 3.3794150352478027


training:  21%|██        | 2264/10986 [1:33:18<6:15:27,  2.58s/it]

training loss: 3.2860522270202637


training:  21%|██        | 2265/10986 [1:33:20<5:53:40,  2.43s/it]

training loss: 3.4848203659057617


training:  21%|██        | 2266/10986 [1:33:22<5:56:06,  2.45s/it]

training loss: 3.373837471008301


training:  21%|██        | 2267/10986 [1:33:25<5:41:00,  2.35s/it]

training loss: 3.4464569091796875


training:  21%|██        | 2268/10986 [1:33:27<5:46:55,  2.39s/it]

training loss: 3.33300518989563


training:  21%|██        | 2269/10986 [1:33:29<5:34:23,  2.30s/it]

training loss: 3.366116523742676


training:  21%|██        | 2270/10986 [1:33:32<5:37:52,  2.33s/it]

training loss: 3.3980441093444824


training:  21%|██        | 2271/10986 [1:33:34<5:28:04,  2.26s/it]

training loss: 3.225938558578491


training:  21%|██        | 2272/10986 [1:33:36<5:35:35,  2.31s/it]

training loss: 3.392759084701538


training:  21%|██        | 2273/10986 [1:33:38<5:25:35,  2.24s/it]

training loss: 3.302997350692749


training:  21%|██        | 2274/10986 [1:33:41<5:31:45,  2.28s/it]

training loss: 3.437734842300415


training:  21%|██        | 2275/10986 [1:33:43<5:24:40,  2.24s/it]

training loss: 3.3352105617523193


training:  21%|██        | 2276/10986 [1:33:45<5:32:42,  2.29s/it]

training loss: 3.3868813514709473


training:  21%|██        | 2277/10986 [1:33:47<5:25:12,  2.24s/it]

training loss: 3.2920267581939697


training:  21%|██        | 2278/10986 [1:33:50<5:35:02,  2.31s/it]

training loss: 3.3736824989318848


training:  21%|██        | 2279/10986 [1:33:52<5:26:56,  2.25s/it]

training loss: 3.44765043258667


training:  21%|██        | 2280/10986 [1:33:54<5:34:45,  2.31s/it]

training loss: 3.4384875297546387
valid loss: 3.3807549476623535
perplexity: 29.392953872680664


training:  21%|██        | 2281/10986 [1:33:59<7:26:45,  3.08s/it]

training loss: 3.4982709884643555


training:  21%|██        | 2282/10986 [1:34:01<6:46:50,  2.80s/it]

training loss: 3.2403924465179443


training:  21%|██        | 2283/10986 [1:34:04<6:32:29,  2.71s/it]

training loss: 3.2905917167663574


training:  21%|██        | 2284/10986 [1:34:06<6:06:37,  2.53s/it]

training loss: 3.3789780139923096


training:  21%|██        | 2285/10986 [1:34:08<6:03:16,  2.51s/it]

training loss: 3.271789789199829


training:  21%|██        | 2286/10986 [1:34:10<5:46:58,  2.39s/it]

training loss: 3.3703534603118896


training:  21%|██        | 2287/10986 [1:34:13<5:50:36,  2.42s/it]

training loss: 3.3766493797302246


training:  21%|██        | 2288/10986 [1:34:15<5:39:11,  2.34s/it]

training loss: 3.291599750518799


training:  21%|██        | 2289/10986 [1:34:17<5:43:47,  2.37s/it]

training loss: 3.5336103439331055


training:  21%|██        | 2290/10986 [1:34:20<5:32:53,  2.30s/it]

training loss: 3.3824429512023926


training:  21%|██        | 2291/10986 [1:34:22<5:38:35,  2.34s/it]

training loss: 3.3291964530944824


training:  21%|██        | 2292/10986 [1:34:24<5:28:16,  2.27s/it]

training loss: 3.3426077365875244


training:  21%|██        | 2293/10986 [1:34:27<5:36:48,  2.32s/it]

training loss: 3.441204786300659


training:  21%|██        | 2294/10986 [1:34:29<5:27:28,  2.26s/it]

training loss: 3.4885406494140625


training:  21%|██        | 2295/10986 [1:34:31<5:34:06,  2.31s/it]

training loss: 3.443342685699463


training:  21%|██        | 2296/10986 [1:34:33<5:26:25,  2.25s/it]

training loss: 3.438037157058716


training:  21%|██        | 2297/10986 [1:34:36<5:35:22,  2.32s/it]

training loss: 3.3168108463287354


training:  21%|██        | 2298/10986 [1:34:38<5:25:41,  2.25s/it]

training loss: 3.3210947513580322


training:  21%|██        | 2299/10986 [1:34:40<5:32:02,  2.29s/it]

training loss: 3.3117575645446777


training:  21%|██        | 2300/10986 [1:34:42<5:24:37,  2.24s/it]

training loss: 3.4318060874938965
valid loss: 3.270510673522949
perplexity: 26.324779510498047


training:  21%|██        | 2301/10986 [1:34:47<7:23:05,  3.06s/it]

training loss: 3.413771867752075


training:  21%|██        | 2302/10986 [1:34:50<6:58:30,  2.89s/it]

training loss: 3.2819700241088867


training:  21%|██        | 2303/10986 [1:34:52<6:24:44,  2.66s/it]

training loss: 3.4595322608947754


training:  21%|██        | 2304/10986 [1:34:54<6:15:39,  2.60s/it]

training loss: 3.358370065689087


training:  21%|██        | 2305/10986 [1:34:56<5:55:11,  2.45s/it]

training loss: 3.4184985160827637


training:  21%|██        | 2306/10986 [1:34:59<5:55:19,  2.46s/it]

training loss: 3.398066759109497


training:  21%|██        | 2307/10986 [1:35:01<5:39:56,  2.35s/it]

training loss: 3.4294326305389404


training:  21%|██        | 2308/10986 [1:35:04<5:45:20,  2.39s/it]

training loss: 3.418111562728882


training:  21%|██        | 2309/10986 [1:35:06<5:35:06,  2.32s/it]

training loss: 3.3896522521972656


training:  21%|██        | 2310/10986 [1:35:08<5:39:00,  2.34s/it]

training loss: 3.3714683055877686


training:  21%|██        | 2311/10986 [1:35:10<5:28:46,  2.27s/it]

training loss: 3.433069944381714


training:  21%|██        | 2312/10986 [1:35:13<6:07:41,  2.54s/it]

training loss: 3.4546988010406494


training:  21%|██        | 2313/10986 [1:35:16<6:09:59,  2.56s/it]

training loss: 3.317457437515259


training:  21%|██        | 2314/10986 [1:35:18<6:04:43,  2.52s/it]

training loss: 3.372079849243164


training:  21%|██        | 2315/10986 [1:35:21<5:47:12,  2.40s/it]

training loss: 3.380048990249634


training:  21%|██        | 2316/10986 [1:35:23<5:49:20,  2.42s/it]

training loss: 3.3199288845062256


training:  21%|██        | 2317/10986 [1:35:25<5:36:37,  2.33s/it]

training loss: 3.3844456672668457


training:  21%|██        | 2318/10986 [1:35:28<5:41:01,  2.36s/it]

training loss: 3.4349613189697266


training:  21%|██        | 2319/10986 [1:35:30<5:29:45,  2.28s/it]

training loss: 3.448127269744873


training:  21%|██        | 2320/10986 [1:35:32<5:35:03,  2.32s/it]

training loss: 3.414240598678589
valid loss: 3.352510452270508
perplexity: 28.574378967285156


training:  21%|██        | 2321/10986 [1:35:37<7:23:05,  3.07s/it]

training loss: 3.4637904167175293


training:  21%|██        | 2322/10986 [1:35:39<6:42:27,  2.79s/it]

training loss: 3.4848499298095703


training:  21%|██        | 2323/10986 [1:35:41<6:26:28,  2.68s/it]

training loss: 3.4434328079223633


training:  21%|██        | 2324/10986 [1:35:44<6:01:25,  2.50s/it]

training loss: 3.328713893890381


training:  21%|██        | 2325/10986 [1:35:46<6:12:56,  2.58s/it]

training loss: 3.3756086826324463


training:  21%|██        | 2326/10986 [1:35:48<5:51:19,  2.43s/it]

training loss: 3.457636833190918


training:  21%|██        | 2327/10986 [1:35:51<5:51:11,  2.43s/it]

training loss: 3.314324140548706


training:  21%|██        | 2328/10986 [1:35:53<5:36:54,  2.33s/it]

training loss: 3.451084852218628


training:  21%|██        | 2329/10986 [1:35:55<5:42:24,  2.37s/it]

training loss: 3.4200258255004883


training:  21%|██        | 2330/10986 [1:35:57<5:30:14,  2.29s/it]

training loss: 3.538257598876953


training:  21%|██        | 2331/10986 [1:36:00<5:35:48,  2.33s/it]

training loss: 3.4490771293640137


training:  21%|██        | 2332/10986 [1:36:02<5:26:13,  2.26s/it]

training loss: 3.3952503204345703


training:  21%|██        | 2333/10986 [1:36:04<5:32:45,  2.31s/it]

training loss: 3.534445285797119


training:  21%|██        | 2334/10986 [1:36:06<5:23:48,  2.25s/it]

training loss: 3.4137609004974365


training:  21%|██▏       | 2335/10986 [1:36:09<5:31:30,  2.30s/it]

training loss: 3.517415761947632


training:  21%|██▏       | 2336/10986 [1:36:11<5:23:46,  2.25s/it]

training loss: 3.550882339477539


training:  21%|██▏       | 2337/10986 [1:36:13<5:32:10,  2.30s/it]

training loss: 3.6177310943603516


training:  21%|██▏       | 2338/10986 [1:36:16<5:22:53,  2.24s/it]

training loss: 3.4535772800445557


training:  21%|██▏       | 2339/10986 [1:36:18<5:31:15,  2.30s/it]

training loss: 3.280791997909546


training:  21%|██▏       | 2340/10986 [1:36:20<5:23:15,  2.24s/it]

training loss: 3.4366817474365234
valid loss: 3.52579665184021
perplexity: 33.9808349609375


training:  21%|██▏       | 2341/10986 [1:36:25<7:00:07,  2.92s/it]

training loss: 3.566004991531372


training:  21%|██▏       | 2342/10986 [1:36:27<6:44:47,  2.81s/it]

training loss: 3.426182746887207


training:  21%|██▏       | 2343/10986 [1:36:29<6:15:08,  2.60s/it]

training loss: 3.4774723052978516


training:  21%|██▏       | 2344/10986 [1:36:32<6:05:14,  2.54s/it]

training loss: 3.5321717262268066


training:  21%|██▏       | 2345/10986 [1:36:34<5:46:15,  2.40s/it]

training loss: 3.329660415649414


training:  21%|██▏       | 2346/10986 [1:36:36<5:47:21,  2.41s/it]

training loss: 3.4182140827178955


training:  21%|██▏       | 2347/10986 [1:36:38<5:33:10,  2.31s/it]

training loss: 3.4394750595092773


training:  21%|██▏       | 2348/10986 [1:36:41<5:38:29,  2.35s/it]

training loss: 3.3288447856903076


training:  21%|██▏       | 2349/10986 [1:36:43<5:27:05,  2.27s/it]

training loss: 3.550619602203369


training:  21%|██▏       | 2350/10986 [1:36:45<5:34:30,  2.32s/it]

training loss: 3.332642078399658


training:  21%|██▏       | 2351/10986 [1:36:47<5:25:10,  2.26s/it]

training loss: 3.391055107116699


training:  21%|██▏       | 2352/10986 [1:36:50<5:31:34,  2.30s/it]

training loss: 3.3630571365356445


training:  21%|██▏       | 2353/10986 [1:36:52<5:22:02,  2.24s/it]

training loss: 3.516516923904419


training:  21%|██▏       | 2354/10986 [1:36:54<5:30:01,  2.29s/it]

training loss: 3.4139389991760254


training:  21%|██▏       | 2355/10986 [1:36:56<5:21:49,  2.24s/it]

training loss: 3.3973875045776367


training:  21%|██▏       | 2356/10986 [1:36:59<5:28:28,  2.28s/it]

training loss: 3.389127731323242


training:  21%|██▏       | 2357/10986 [1:37:01<5:21:01,  2.23s/it]

training loss: 3.3207757472991943


training:  21%|██▏       | 2358/10986 [1:37:03<5:32:33,  2.31s/it]

training loss: 3.443722724914551


training:  21%|██▏       | 2359/10986 [1:37:05<5:22:29,  2.24s/it]

training loss: 3.410701036453247


training:  21%|██▏       | 2360/10986 [1:37:08<5:30:29,  2.30s/it]

training loss: 3.4387083053588867
valid loss: 3.254734754562378
perplexity: 25.91274070739746


training:  21%|██▏       | 2361/10986 [1:37:13<7:35:02,  3.17s/it]

training loss: 3.5319113731384277


training:  22%|██▏       | 2362/10986 [1:37:15<6:52:18,  2.87s/it]

training loss: 3.3562793731689453


training:  22%|██▏       | 2363/10986 [1:37:18<6:33:39,  2.74s/it]

training loss: 3.504718065261841


training:  22%|██▏       | 2364/10986 [1:37:20<6:08:00,  2.56s/it]

training loss: 3.4102301597595215


training:  22%|██▏       | 2365/10986 [1:37:22<6:03:01,  2.53s/it]

training loss: 3.4001781940460205


training:  22%|██▏       | 2366/10986 [1:37:24<5:45:34,  2.41s/it]

training loss: 3.325535535812378


training:  22%|██▏       | 2367/10986 [1:37:27<5:49:23,  2.43s/it]

training loss: 3.4689748287200928


training:  22%|██▏       | 2368/10986 [1:37:29<5:37:39,  2.35s/it]

training loss: 3.410046100616455


training:  22%|██▏       | 2369/10986 [1:37:32<5:41:31,  2.38s/it]

training loss: 3.326860189437866


training:  22%|██▏       | 2370/10986 [1:37:34<5:29:57,  2.30s/it]

training loss: 3.4035820960998535


training:  22%|██▏       | 2371/10986 [1:37:36<5:36:13,  2.34s/it]

training loss: 3.4341940879821777


training:  22%|██▏       | 2372/10986 [1:37:38<5:28:20,  2.29s/it]

training loss: 3.324202537536621


training:  22%|██▏       | 2373/10986 [1:37:41<5:35:25,  2.34s/it]

training loss: 3.521422863006592


training:  22%|██▏       | 2374/10986 [1:37:43<5:25:51,  2.27s/it]

training loss: 3.314700126647949


training:  22%|██▏       | 2375/10986 [1:37:45<5:34:21,  2.33s/it]

training loss: 3.3446402549743652


training:  22%|██▏       | 2376/10986 [1:37:47<5:25:50,  2.27s/it]

training loss: 3.336852550506592


training:  22%|██▏       | 2377/10986 [1:37:50<5:33:14,  2.32s/it]

training loss: 3.3039333820343018


training:  22%|██▏       | 2378/10986 [1:37:52<5:24:25,  2.26s/it]

training loss: 3.4544546604156494


training:  22%|██▏       | 2379/10986 [1:37:56<6:21:27,  2.66s/it]

training loss: 3.3735482692718506


training:  22%|██▏       | 2380/10986 [1:37:58<6:04:10,  2.54s/it]

training loss: 3.3818485736846924
valid loss: 3.4461398124694824
perplexity: 31.379030227661133


training:  22%|██▏       | 2381/10986 [1:38:02<7:26:28,  3.11s/it]

training loss: 3.3984944820404053


training:  22%|██▏       | 2382/10986 [1:38:05<7:00:31,  2.93s/it]

training loss: 3.437814950942993


training:  22%|██▏       | 2383/10986 [1:38:07<6:26:00,  2.69s/it]

training loss: 3.3676950931549072


training:  22%|██▏       | 2384/10986 [1:38:09<6:17:18,  2.63s/it]

training loss: 3.594703435897827


training:  22%|██▏       | 2385/10986 [1:38:11<5:54:19,  2.47s/it]

training loss: 3.4996609687805176


training:  22%|██▏       | 2386/10986 [1:38:14<5:53:57,  2.47s/it]

training loss: 3.311954975128174


training:  22%|██▏       | 2387/10986 [1:38:16<5:38:05,  2.36s/it]

training loss: 3.3802173137664795


training:  22%|██▏       | 2388/10986 [1:38:19<5:42:09,  2.39s/it]

training loss: 3.355445623397827


training:  22%|██▏       | 2389/10986 [1:38:21<5:29:46,  2.30s/it]

training loss: 3.3143908977508545


training:  22%|██▏       | 2390/10986 [1:38:23<5:35:56,  2.34s/it]

training loss: 3.4936184883117676


training:  22%|██▏       | 2391/10986 [1:38:25<5:25:06,  2.27s/it]

training loss: 3.35650372505188


training:  22%|██▏       | 2392/10986 [1:38:28<5:34:51,  2.34s/it]

training loss: 3.375157117843628


training:  22%|██▏       | 2393/10986 [1:38:30<5:24:37,  2.27s/it]

training loss: 3.523207664489746


training:  22%|██▏       | 2394/10986 [1:38:32<5:33:09,  2.33s/it]

training loss: 3.3036656379699707


training:  22%|██▏       | 2395/10986 [1:38:34<5:24:18,  2.26s/it]

training loss: 3.3564610481262207


training:  22%|██▏       | 2396/10986 [1:38:37<5:31:21,  2.31s/it]

training loss: 3.3574185371398926


training:  22%|██▏       | 2397/10986 [1:38:39<5:22:41,  2.25s/it]

training loss: 3.4280283451080322


training:  22%|██▏       | 2398/10986 [1:38:41<5:32:00,  2.32s/it]

training loss: 3.388794422149658


training:  22%|██▏       | 2399/10986 [1:38:43<5:23:36,  2.26s/it]

training loss: 3.264051675796509


training:  22%|██▏       | 2400/10986 [1:38:46<5:32:13,  2.32s/it]

training loss: 3.4524312019348145
valid loss: 3.325693130493164
perplexity: 27.818275451660156


training:  22%|██▏       | 2401/10986 [1:38:51<7:22:05,  3.09s/it]

training loss: 3.3392343521118164


training:  22%|██▏       | 2402/10986 [1:38:53<6:43:24,  2.82s/it]

training loss: 3.32952880859375


training:  22%|██▏       | 2403/10986 [1:38:55<6:28:16,  2.71s/it]

training loss: 3.441149950027466


training:  22%|██▏       | 2404/10986 [1:38:58<6:04:18,  2.55s/it]

training loss: 3.3866465091705322


training:  22%|██▏       | 2405/10986 [1:39:00<6:01:32,  2.53s/it]

training loss: 3.279798746109009


training:  22%|██▏       | 2406/10986 [1:39:02<5:44:03,  2.41s/it]

training loss: 3.5078556537628174


training:  22%|██▏       | 2407/10986 [1:39:05<5:46:32,  2.42s/it]

training loss: 3.428255558013916


training:  22%|██▏       | 2408/10986 [1:39:07<5:33:09,  2.33s/it]

training loss: 3.495668888092041


training:  22%|██▏       | 2409/10986 [1:39:09<5:38:44,  2.37s/it]

training loss: 3.4169437885284424


training:  22%|██▏       | 2410/10986 [1:39:11<5:28:32,  2.30s/it]

training loss: 3.356067180633545


training:  22%|██▏       | 2411/10986 [1:39:14<5:34:13,  2.34s/it]

training loss: 3.563924551010132


training:  22%|██▏       | 2412/10986 [1:39:16<5:26:02,  2.28s/it]

training loss: 3.3555917739868164


training:  22%|██▏       | 2413/10986 [1:39:18<5:31:28,  2.32s/it]

training loss: 3.3407609462738037


training:  22%|██▏       | 2414/10986 [1:39:21<5:23:39,  2.27s/it]

training loss: 3.5095744132995605


training:  22%|██▏       | 2415/10986 [1:39:23<5:32:04,  2.32s/it]

training loss: 3.366656541824341


training:  22%|██▏       | 2416/10986 [1:39:25<5:23:50,  2.27s/it]

training loss: 3.386945962905884


training:  22%|██▏       | 2417/10986 [1:39:28<5:33:12,  2.33s/it]

training loss: 3.4988484382629395


training:  22%|██▏       | 2418/10986 [1:39:30<5:22:37,  2.26s/it]

training loss: 3.351188898086548


training:  22%|██▏       | 2419/10986 [1:39:32<5:29:24,  2.31s/it]

training loss: 3.353806257247925


training:  22%|██▏       | 2420/10986 [1:39:34<5:22:42,  2.26s/it]

training loss: 3.392096519470215
valid loss: 3.50449800491333
perplexity: 33.264739990234375


training:  22%|██▏       | 2421/10986 [1:39:39<6:56:41,  2.92s/it]

training loss: 3.5496413707733154


training:  22%|██▏       | 2422/10986 [1:39:41<6:44:08,  2.83s/it]

training loss: 3.393648862838745


training:  22%|██▏       | 2423/10986 [1:39:43<6:13:54,  2.62s/it]

training loss: 3.4050731658935547


training:  22%|██▏       | 2424/10986 [1:39:46<6:28:57,  2.73s/it]

training loss: 3.363905906677246


training:  22%|██▏       | 2425/10986 [1:39:49<6:11:06,  2.60s/it]

training loss: 3.4725584983825684


training:  22%|██▏       | 2426/10986 [1:39:51<6:06:22,  2.57s/it]

training loss: 3.4146618843078613


training:  22%|██▏       | 2427/10986 [1:39:53<5:47:17,  2.43s/it]

training loss: 3.3376286029815674


training:  22%|██▏       | 2428/10986 [1:39:56<5:48:13,  2.44s/it]

training loss: 3.449772834777832


training:  22%|██▏       | 2429/10986 [1:39:58<5:34:28,  2.35s/it]

training loss: 3.4698731899261475


training:  22%|██▏       | 2430/10986 [1:40:00<5:40:15,  2.39s/it]

training loss: 3.4253385066986084


training:  22%|██▏       | 2431/10986 [1:40:03<5:29:29,  2.31s/it]

training loss: 3.3836116790771484


training:  22%|██▏       | 2432/10986 [1:40:05<5:38:34,  2.37s/it]

training loss: 3.5995075702667236


training:  22%|██▏       | 2433/10986 [1:40:07<5:28:16,  2.30s/it]

training loss: 3.444535493850708


training:  22%|██▏       | 2434/10986 [1:40:10<5:36:38,  2.36s/it]

training loss: 3.4691503047943115


training:  22%|██▏       | 2435/10986 [1:40:12<5:26:36,  2.29s/it]

training loss: 3.381009578704834


training:  22%|██▏       | 2436/10986 [1:40:14<5:34:36,  2.35s/it]

training loss: 3.5440566539764404


training:  22%|██▏       | 2437/10986 [1:40:16<5:24:53,  2.28s/it]

training loss: 3.5220208168029785


training:  22%|██▏       | 2438/10986 [1:40:19<5:33:52,  2.34s/it]

training loss: 3.296440601348877


training:  22%|██▏       | 2439/10986 [1:40:21<5:23:40,  2.27s/it]

training loss: 3.5790131092071533


training:  22%|██▏       | 2440/10986 [1:40:24<5:32:16,  2.33s/it]

training loss: 3.612610340118408
valid loss: 3.4251768589019775
perplexity: 30.728078842163086


training:  22%|██▏       | 2441/10986 [1:40:29<7:46:04,  3.27s/it]

training loss: 3.5724809169769287


training:  22%|██▏       | 2442/10986 [1:40:32<7:13:53,  3.05s/it]

training loss: 3.4639716148376465


training:  22%|██▏       | 2443/10986 [1:40:34<6:48:49,  2.87s/it]

training loss: 3.5780436992645264


training:  22%|██▏       | 2444/10986 [1:40:36<6:17:16,  2.65s/it]

training loss: 3.4184000492095947


training:  22%|██▏       | 2445/10986 [1:40:39<6:08:53,  2.59s/it]

training loss: 3.4998908042907715


training:  22%|██▏       | 2446/10986 [1:40:41<5:48:25,  2.45s/it]

training loss: 3.3679726123809814


training:  22%|██▏       | 2447/10986 [1:40:43<5:50:25,  2.46s/it]

training loss: 3.5501973628997803


training:  22%|██▏       | 2448/10986 [1:40:45<5:35:09,  2.36s/it]

training loss: 3.469132661819458


training:  22%|██▏       | 2449/10986 [1:40:48<5:38:45,  2.38s/it]

training loss: 3.677098512649536


training:  22%|██▏       | 2450/10986 [1:40:50<5:30:36,  2.32s/it]

training loss: 3.396322011947632


training:  22%|██▏       | 2451/10986 [1:40:52<5:35:51,  2.36s/it]

training loss: 3.462233304977417


training:  22%|██▏       | 2452/10986 [1:40:54<5:25:39,  2.29s/it]

training loss: 3.3858773708343506


training:  22%|██▏       | 2453/10986 [1:40:57<5:36:13,  2.36s/it]

training loss: 3.438458204269409


training:  22%|██▏       | 2454/10986 [1:40:59<5:25:30,  2.29s/it]

training loss: 3.4984445571899414


training:  22%|██▏       | 2455/10986 [1:41:02<5:34:07,  2.35s/it]

training loss: 3.505765438079834


training:  22%|██▏       | 2456/10986 [1:41:04<5:23:30,  2.28s/it]

training loss: 3.4100074768066406


training:  22%|██▏       | 2457/10986 [1:41:06<5:29:37,  2.32s/it]

training loss: 3.387629270553589


training:  22%|██▏       | 2458/10986 [1:41:08<5:21:11,  2.26s/it]

training loss: 3.426424980163574


training:  22%|██▏       | 2459/10986 [1:41:11<5:28:42,  2.31s/it]

training loss: 3.4206480979919434


training:  22%|██▏       | 2460/10986 [1:41:13<5:20:03,  2.25s/it]

training loss: 3.478698968887329
valid loss: 3.4540305137634277
perplexity: 31.62761116027832


training:  22%|██▏       | 2461/10986 [1:41:17<6:53:28,  2.91s/it]

training loss: 3.3845975399017334


training:  22%|██▏       | 2462/10986 [1:41:20<6:38:00,  2.80s/it]

training loss: 3.2849061489105225


training:  22%|██▏       | 2463/10986 [1:41:22<6:10:13,  2.61s/it]

training loss: 3.4535410404205322


training:  22%|██▏       | 2464/10986 [1:41:24<6:04:25,  2.57s/it]

training loss: 3.4474899768829346


training:  22%|██▏       | 2465/10986 [1:41:27<5:45:41,  2.43s/it]

training loss: 3.396859645843506


training:  22%|██▏       | 2466/10986 [1:41:29<5:46:24,  2.44s/it]

training loss: 3.3687782287597656


training:  22%|██▏       | 2467/10986 [1:41:31<5:33:39,  2.35s/it]

training loss: 3.434079170227051


training:  22%|██▏       | 2468/10986 [1:41:34<5:35:31,  2.36s/it]

training loss: 3.244964599609375


training:  22%|██▏       | 2469/10986 [1:41:36<5:25:41,  2.29s/it]

training loss: 3.465348243713379


training:  22%|██▏       | 2470/10986 [1:41:38<5:33:55,  2.35s/it]

training loss: 3.4952664375305176


training:  22%|██▏       | 2471/10986 [1:41:40<5:25:16,  2.29s/it]

training loss: 3.360874891281128


training:  23%|██▎       | 2472/10986 [1:41:43<5:33:05,  2.35s/it]

training loss: 3.455188512802124


training:  23%|██▎       | 2473/10986 [1:41:45<5:24:27,  2.29s/it]

training loss: 3.341202735900879


training:  23%|██▎       | 2474/10986 [1:41:47<5:30:05,  2.33s/it]

training loss: 3.4708573818206787


training:  23%|██▎       | 2475/10986 [1:41:50<5:21:47,  2.27s/it]

training loss: 3.392374277114868


training:  23%|██▎       | 2476/10986 [1:41:52<5:30:07,  2.33s/it]

training loss: 3.378173589706421


training:  23%|██▎       | 2477/10986 [1:41:54<5:20:54,  2.26s/it]

training loss: 3.3303894996643066


training:  23%|██▎       | 2478/10986 [1:41:57<5:28:15,  2.31s/it]

training loss: 3.3419549465179443


training:  23%|██▎       | 2479/10986 [1:41:59<5:19:49,  2.26s/it]

training loss: 3.3203611373901367


training:  23%|██▎       | 2480/10986 [1:42:01<5:29:29,  2.32s/it]

training loss: 3.5407116413116455
valid loss: 3.3635706901550293
perplexity: 28.892173767089844


training:  23%|██▎       | 2481/10986 [1:42:06<7:05:33,  3.00s/it]

training loss: 3.3704612255096436


training:  23%|██▎       | 2482/10986 [1:42:08<6:30:13,  2.75s/it]

training loss: 3.2370383739471436


training:  23%|██▎       | 2483/10986 [1:42:10<6:17:00,  2.66s/it]

training loss: 3.3916993141174316


training:  23%|██▎       | 2484/10986 [1:42:12<5:55:51,  2.51s/it]

training loss: 3.441607713699341


training:  23%|██▎       | 2485/10986 [1:42:15<5:53:23,  2.49s/it]

training loss: 3.453871726989746


training:  23%|██▎       | 2486/10986 [1:42:17<5:37:27,  2.38s/it]

training loss: 3.358743190765381


training:  23%|██▎       | 2487/10986 [1:42:20<5:40:39,  2.40s/it]

training loss: 3.3372602462768555


training:  23%|██▎       | 2488/10986 [1:42:22<5:28:21,  2.32s/it]

training loss: 3.337674140930176


training:  23%|██▎       | 2489/10986 [1:42:24<5:33:41,  2.36s/it]

training loss: 3.510810136795044


training:  23%|██▎       | 2490/10986 [1:42:26<5:24:15,  2.29s/it]

training loss: 3.389744758605957


training:  23%|██▎       | 2491/10986 [1:42:29<5:31:53,  2.34s/it]

training loss: 3.376096248626709


training:  23%|██▎       | 2492/10986 [1:42:31<5:23:53,  2.29s/it]

training loss: 3.3304336071014404


training:  23%|██▎       | 2493/10986 [1:42:33<5:30:56,  2.34s/it]

training loss: 3.402073383331299


training:  23%|██▎       | 2494/10986 [1:42:35<5:22:30,  2.28s/it]

training loss: 3.434305191040039


training:  23%|██▎       | 2495/10986 [1:42:38<5:30:32,  2.34s/it]

training loss: 3.4715542793273926


training:  23%|██▎       | 2496/10986 [1:42:40<5:22:19,  2.28s/it]

training loss: 3.4307773113250732


training:  23%|██▎       | 2497/10986 [1:42:43<5:31:45,  2.34s/it]

training loss: 3.352954149246216


training:  23%|██▎       | 2498/10986 [1:42:45<5:23:11,  2.28s/it]

training loss: 3.4709079265594482


training:  23%|██▎       | 2499/10986 [1:42:47<5:30:23,  2.34s/it]

training loss: 3.5585110187530518


training:  23%|██▎       | 2500/10986 [1:42:49<5:23:08,  2.28s/it]

training loss: 3.4547224044799805
valid loss: 3.3724799156188965
perplexity: 29.15073013305664


training:  23%|██▎       | 2501/10986 [1:42:54<7:14:40,  3.07s/it]

training loss: 3.4351956844329834


training:  23%|██▎       | 2502/10986 [1:42:57<6:52:42,  2.92s/it]

training loss: 3.379695415496826


training:  23%|██▎       | 2503/10986 [1:42:59<6:18:49,  2.68s/it]

training loss: 3.30163836479187


training:  23%|██▎       | 2504/10986 [1:43:02<6:32:59,  2.78s/it]

training loss: 3.3312697410583496


training:  23%|██▎       | 2505/10986 [1:43:05<6:40:29,  2.83s/it]

training loss: 3.4623425006866455


training:  23%|██▎       | 2506/10986 [1:43:07<6:23:19,  2.71s/it]

training loss: 3.5354647636413574


training:  23%|██▎       | 2507/10986 [1:43:09<5:58:50,  2.54s/it]

training loss: 3.452691078186035


training:  23%|██▎       | 2508/10986 [1:43:12<5:54:32,  2.51s/it]

training loss: 3.5988564491271973


training:  23%|██▎       | 2509/10986 [1:43:14<5:39:59,  2.41s/it]

training loss: 3.4387147426605225


training:  23%|██▎       | 2510/10986 [1:43:17<5:42:01,  2.42s/it]

training loss: 3.5586915016174316


training:  23%|██▎       | 2511/10986 [1:43:19<5:30:38,  2.34s/it]

training loss: 3.504821538925171


training:  23%|██▎       | 2512/10986 [1:43:21<5:34:37,  2.37s/it]

training loss: 3.514549970626831


training:  23%|██▎       | 2513/10986 [1:43:23<5:24:05,  2.29s/it]

training loss: 3.459132671356201


training:  23%|██▎       | 2514/10986 [1:43:26<5:31:39,  2.35s/it]

training loss: 3.3611679077148438


training:  23%|██▎       | 2515/10986 [1:43:28<5:21:42,  2.28s/it]

training loss: 3.3604776859283447


training:  23%|██▎       | 2516/10986 [1:43:30<5:27:32,  2.32s/it]

training loss: 3.3326327800750732


training:  23%|██▎       | 2517/10986 [1:43:32<5:19:58,  2.27s/it]

training loss: 3.4311392307281494


training:  23%|██▎       | 2518/10986 [1:43:35<5:28:45,  2.33s/it]

training loss: 3.436542510986328


training:  23%|██▎       | 2519/10986 [1:43:37<5:21:06,  2.28s/it]

training loss: 3.5591604709625244


training:  23%|██▎       | 2520/10986 [1:43:39<5:27:28,  2.32s/it]

training loss: 3.412661552429199
valid loss: 3.300682544708252
perplexity: 27.13115119934082


training:  23%|██▎       | 2521/10986 [1:43:44<7:15:17,  3.09s/it]

training loss: 3.422351360321045


training:  23%|██▎       | 2522/10986 [1:43:46<6:37:39,  2.82s/it]

training loss: 3.4801623821258545


training:  23%|██▎       | 2523/10986 [1:43:49<6:24:21,  2.72s/it]

training loss: 3.4074130058288574


training:  23%|██▎       | 2524/10986 [1:43:51<5:59:43,  2.55s/it]

training loss: 3.3725476264953613


training:  23%|██▎       | 2525/10986 [1:43:54<5:55:31,  2.52s/it]

training loss: 3.328491687774658


training:  23%|██▎       | 2526/10986 [1:43:56<5:40:24,  2.41s/it]

training loss: 3.4741525650024414


training:  23%|██▎       | 2527/10986 [1:43:58<5:41:40,  2.42s/it]

training loss: 3.401871681213379


training:  23%|██▎       | 2528/10986 [1:44:00<5:29:33,  2.34s/it]

training loss: 3.5979163646698


training:  23%|██▎       | 2529/10986 [1:44:03<5:33:53,  2.37s/it]

training loss: 3.290559768676758


training:  23%|██▎       | 2530/10986 [1:44:05<5:24:46,  2.30s/it]

training loss: 3.354708433151245


training:  23%|██▎       | 2531/10986 [1:44:07<5:31:39,  2.35s/it]

training loss: 3.537074089050293


training:  23%|██▎       | 2532/10986 [1:44:09<5:20:47,  2.28s/it]

training loss: 3.304030418395996


training:  23%|██▎       | 2533/10986 [1:44:12<5:27:19,  2.32s/it]

training loss: 3.4189252853393555


training:  23%|██▎       | 2534/10986 [1:44:14<5:18:24,  2.26s/it]

training loss: 3.4637227058410645


training:  23%|██▎       | 2535/10986 [1:44:16<5:25:01,  2.31s/it]

training loss: 3.3765599727630615


training:  23%|██▎       | 2536/10986 [1:44:19<5:18:30,  2.26s/it]

training loss: 3.3300156593322754


training:  23%|██▎       | 2537/10986 [1:44:21<5:26:05,  2.32s/it]

training loss: 3.4194397926330566


training:  23%|██▎       | 2538/10986 [1:44:23<5:16:45,  2.25s/it]

training loss: 3.420539379119873


training:  23%|██▎       | 2539/10986 [1:44:26<5:26:23,  2.32s/it]

training loss: 3.349789619445801


training:  23%|██▎       | 2540/10986 [1:44:28<5:18:20,  2.26s/it]

training loss: 3.3336262702941895
valid loss: 3.3891654014587402
perplexity: 29.641202926635742


training:  23%|██▎       | 2541/10986 [1:44:32<6:53:00,  2.93s/it]

training loss: 3.4009974002838135


training:  23%|██▎       | 2542/10986 [1:44:35<6:37:26,  2.82s/it]

training loss: 3.526362657546997


training:  23%|██▎       | 2543/10986 [1:44:37<6:08:32,  2.62s/it]

training loss: 3.3905327320098877


training:  23%|██▎       | 2544/10986 [1:44:39<6:01:21,  2.57s/it]

training loss: 3.333026647567749


training:  23%|██▎       | 2545/10986 [1:44:42<5:42:22,  2.43s/it]

training loss: 3.2868809700012207


training:  23%|██▎       | 2546/10986 [1:44:44<5:43:35,  2.44s/it]

training loss: 3.449200391769409


training:  23%|██▎       | 2547/10986 [1:44:46<5:28:59,  2.34s/it]

training loss: 3.4515368938446045


training:  23%|██▎       | 2548/10986 [1:44:49<5:34:03,  2.38s/it]

training loss: 3.409919500350952


training:  23%|██▎       | 2549/10986 [1:44:51<5:22:03,  2.29s/it]

training loss: 3.380768060684204


training:  23%|██▎       | 2550/10986 [1:44:53<5:28:55,  2.34s/it]

training loss: 3.491156816482544


training:  23%|██▎       | 2551/10986 [1:44:55<5:21:41,  2.29s/it]

training loss: 3.3356692790985107


training:  23%|██▎       | 2552/10986 [1:44:58<5:28:29,  2.34s/it]

training loss: 3.3332629203796387


training:  23%|██▎       | 2553/10986 [1:45:00<5:19:10,  2.27s/it]

training loss: 3.4450910091400146


training:  23%|██▎       | 2554/10986 [1:45:02<5:26:37,  2.32s/it]

training loss: 3.2716357707977295


training:  23%|██▎       | 2555/10986 [1:45:04<5:18:02,  2.26s/it]

training loss: 3.3368592262268066


training:  23%|██▎       | 2556/10986 [1:45:07<5:27:28,  2.33s/it]

training loss: 3.3960752487182617


training:  23%|██▎       | 2557/10986 [1:45:09<5:18:36,  2.27s/it]

training loss: 3.286966562271118


training:  23%|██▎       | 2558/10986 [1:45:11<5:26:48,  2.33s/it]

training loss: 3.4841442108154297


training:  23%|██▎       | 2559/10986 [1:45:14<5:18:59,  2.27s/it]

training loss: 3.4991915225982666


training:  23%|██▎       | 2560/10986 [1:45:16<5:24:56,  2.31s/it]

training loss: 3.4501142501831055
valid loss: 3.3843791484832764
perplexity: 29.499671936035156


training:  23%|██▎       | 2561/10986 [1:45:21<7:04:37,  3.02s/it]

training loss: 3.406038999557495


training:  23%|██▎       | 2562/10986 [1:45:23<6:27:24,  2.76s/it]

training loss: 3.463634729385376


training:  23%|██▎       | 2563/10986 [1:45:25<6:15:43,  2.68s/it]

training loss: 3.430338144302368


training:  23%|██▎       | 2564/10986 [1:45:27<5:52:27,  2.51s/it]

training loss: 3.353562593460083


training:  23%|██▎       | 2565/10986 [1:45:30<5:48:55,  2.49s/it]

training loss: 3.3595266342163086


training:  23%|██▎       | 2566/10986 [1:45:32<5:34:35,  2.38s/it]

training loss: 3.3579297065734863


training:  23%|██▎       | 2567/10986 [1:45:35<6:10:49,  2.64s/it]

training loss: 3.316660165786743


training:  23%|██▎       | 2568/10986 [1:45:38<6:03:34,  2.59s/it]

training loss: 3.4487533569335938


training:  23%|██▎       | 2569/10986 [1:45:40<5:57:54,  2.55s/it]

training loss: 3.2258822917938232


training:  23%|██▎       | 2570/10986 [1:45:42<5:40:27,  2.43s/it]

training loss: 3.337286949157715


training:  23%|██▎       | 2571/10986 [1:45:45<5:39:55,  2.42s/it]

training loss: 3.3631503582000732


training:  23%|██▎       | 2572/10986 [1:45:47<5:25:59,  2.32s/it]

training loss: 3.2912495136260986


training:  23%|██▎       | 2573/10986 [1:45:49<5:31:13,  2.36s/it]

training loss: 3.43829345703125


training:  23%|██▎       | 2574/10986 [1:45:52<5:31:08,  2.36s/it]

training loss: 3.3472886085510254


training:  23%|██▎       | 2575/10986 [1:45:54<5:33:07,  2.38s/it]

training loss: 3.3404531478881836


training:  23%|██▎       | 2576/10986 [1:45:56<5:23:32,  2.31s/it]

training loss: 3.3237712383270264


training:  23%|██▎       | 2577/10986 [1:45:59<5:29:22,  2.35s/it]

training loss: 3.2360100746154785


training:  23%|██▎       | 2578/10986 [1:46:01<5:19:17,  2.28s/it]

training loss: 3.345092296600342


training:  23%|██▎       | 2579/10986 [1:46:03<5:25:17,  2.32s/it]

training loss: 3.526580572128296


training:  23%|██▎       | 2580/10986 [1:46:05<5:17:48,  2.27s/it]

training loss: 3.4311437606811523
valid loss: 3.4309282302856445
perplexity: 30.905317306518555


training:  23%|██▎       | 2581/10986 [1:46:10<6:47:00,  2.91s/it]

training loss: 3.3254222869873047


training:  24%|██▎       | 2582/10986 [1:46:12<6:32:15,  2.80s/it]

training loss: 3.4519810676574707


training:  24%|██▎       | 2583/10986 [1:46:14<6:02:44,  2.59s/it]

training loss: 3.340404987335205


training:  24%|██▎       | 2584/10986 [1:46:17<5:56:43,  2.55s/it]

training loss: 3.3887994289398193


training:  24%|██▎       | 2585/10986 [1:46:19<5:39:09,  2.42s/it]

training loss: 3.3489251136779785


training:  24%|██▎       | 2586/10986 [1:46:21<5:39:13,  2.42s/it]

training loss: 3.404829740524292


training:  24%|██▎       | 2587/10986 [1:46:24<5:28:17,  2.35s/it]

training loss: 3.451263427734375


training:  24%|██▎       | 2588/10986 [1:46:26<5:32:25,  2.38s/it]

training loss: 3.364839553833008


training:  24%|██▎       | 2589/10986 [1:46:28<5:20:36,  2.29s/it]

training loss: 3.3558759689331055


training:  24%|██▎       | 2590/10986 [1:46:31<5:25:24,  2.33s/it]

training loss: 3.295722723007202


training:  24%|██▎       | 2591/10986 [1:46:33<5:18:48,  2.28s/it]

training loss: 3.373824119567871


training:  24%|██▎       | 2592/10986 [1:46:35<5:25:47,  2.33s/it]

training loss: 3.287064790725708


training:  24%|██▎       | 2593/10986 [1:46:37<5:18:27,  2.28s/it]

training loss: 3.4009904861450195


training:  24%|██▎       | 2594/10986 [1:46:40<5:24:32,  2.32s/it]

training loss: 3.40824294090271


training:  24%|██▎       | 2595/10986 [1:46:42<5:16:14,  2.26s/it]

training loss: 3.5303139686584473


training:  24%|██▎       | 2596/10986 [1:46:44<5:24:05,  2.32s/it]

training loss: 3.4969780445098877


training:  24%|██▎       | 2597/10986 [1:46:46<5:16:06,  2.26s/it]

training loss: 3.304266929626465


training:  24%|██▎       | 2598/10986 [1:46:49<5:22:39,  2.31s/it]

training loss: 3.3378970623016357


training:  24%|██▎       | 2599/10986 [1:46:51<5:14:42,  2.25s/it]

training loss: 3.5208580493927


training:  24%|██▎       | 2600/10986 [1:46:53<5:24:10,  2.32s/it]

training loss: 3.225546360015869
valid loss: 3.3834028244018555
perplexity: 29.470884323120117


training:  24%|██▎       | 2601/10986 [1:46:59<7:21:34,  3.16s/it]

training loss: 3.332965850830078


training:  24%|██▎       | 2602/10986 [1:47:01<6:38:06,  2.85s/it]

training loss: 3.3752095699310303


training:  24%|██▎       | 2603/10986 [1:47:03<6:19:01,  2.71s/it]

training loss: 3.447221040725708


training:  24%|██▎       | 2604/10986 [1:47:05<5:54:36,  2.54s/it]

training loss: 3.4464542865753174


training:  24%|██▎       | 2605/10986 [1:47:08<5:50:15,  2.51s/it]

training loss: 3.5157341957092285


training:  24%|██▎       | 2606/10986 [1:47:10<5:33:49,  2.39s/it]

training loss: 3.3745667934417725


training:  24%|██▎       | 2607/10986 [1:47:12<5:34:39,  2.40s/it]

training loss: 3.4150238037109375


training:  24%|██▎       | 2608/10986 [1:47:14<5:23:06,  2.31s/it]

training loss: 3.4448347091674805


training:  24%|██▎       | 2609/10986 [1:47:17<5:29:02,  2.36s/it]

training loss: 3.539043664932251


training:  24%|██▍       | 2610/10986 [1:47:19<5:20:19,  2.29s/it]

training loss: 3.434180498123169


training:  24%|██▍       | 2611/10986 [1:47:21<5:26:12,  2.34s/it]

training loss: 3.348707675933838


training:  24%|██▍       | 2612/10986 [1:47:23<5:16:15,  2.27s/it]

training loss: 3.4638864994049072


training:  24%|██▍       | 2613/10986 [1:47:26<5:23:29,  2.32s/it]

training loss: 3.4361276626586914


training:  24%|██▍       | 2614/10986 [1:47:28<5:15:16,  2.26s/it]

training loss: 3.38643479347229


training:  24%|██▍       | 2615/10986 [1:47:30<5:21:39,  2.31s/it]

training loss: 3.416630506515503


training:  24%|██▍       | 2616/10986 [1:47:33<5:12:52,  2.24s/it]

training loss: 3.2636594772338867


training:  24%|██▍       | 2617/10986 [1:47:35<5:20:45,  2.30s/it]

training loss: 3.5151944160461426


training:  24%|██▍       | 2618/10986 [1:47:37<5:13:50,  2.25s/it]

training loss: 3.598031759262085


training:  24%|██▍       | 2619/10986 [1:47:40<5:22:13,  2.31s/it]

training loss: 3.498661756515503


training:  24%|██▍       | 2620/10986 [1:47:42<5:14:25,  2.26s/it]

training loss: 3.399847984313965
valid loss: 3.2825264930725098
perplexity: 26.64299964904785


training:  24%|██▍       | 2621/10986 [1:47:46<6:44:28,  2.90s/it]

training loss: 3.412351369857788


training:  24%|██▍       | 2622/10986 [1:47:49<6:29:31,  2.79s/it]

training loss: 3.428389549255371


training:  24%|██▍       | 2623/10986 [1:47:51<6:02:04,  2.60s/it]

training loss: 3.360595941543579


training:  24%|██▍       | 2624/10986 [1:47:53<5:54:54,  2.55s/it]

training loss: 3.3885576725006104


training:  24%|██▍       | 2625/10986 [1:47:55<5:38:07,  2.43s/it]

training loss: 3.348689556121826


training:  24%|██▍       | 2626/10986 [1:47:58<5:38:54,  2.43s/it]

training loss: 3.3433456420898438


training:  24%|██▍       | 2627/10986 [1:48:00<5:24:53,  2.33s/it]

training loss: 3.398224353790283


training:  24%|██▍       | 2628/10986 [1:48:02<5:29:27,  2.37s/it]

training loss: 3.363403081893921


training:  24%|██▍       | 2629/10986 [1:48:04<5:19:11,  2.29s/it]

training loss: 3.450556516647339


training:  24%|██▍       | 2630/10986 [1:48:07<5:24:19,  2.33s/it]

training loss: 3.309630870819092


training:  24%|██▍       | 2631/10986 [1:48:09<5:19:35,  2.30s/it]

training loss: 3.355933666229248


training:  24%|██▍       | 2632/10986 [1:48:13<6:15:27,  2.70s/it]

training loss: 3.3543567657470703


training:  24%|██▍       | 2633/10986 [1:48:15<5:55:22,  2.55s/it]

training loss: 3.3580400943756104


training:  24%|██▍       | 2634/10986 [1:48:17<5:49:35,  2.51s/it]

training loss: 3.421903610229492


training:  24%|██▍       | 2635/10986 [1:48:19<5:33:52,  2.40s/it]

training loss: 3.3844385147094727


training:  24%|██▍       | 2636/10986 [1:48:22<5:35:03,  2.41s/it]

training loss: 3.290332794189453


training:  24%|██▍       | 2637/10986 [1:48:24<5:22:23,  2.32s/it]

training loss: 3.361025333404541


training:  24%|██▍       | 2638/10986 [1:48:26<5:28:42,  2.36s/it]

training loss: 3.3687801361083984


training:  24%|██▍       | 2639/10986 [1:48:29<5:18:08,  2.29s/it]

training loss: 3.3434486389160156


training:  24%|██▍       | 2640/10986 [1:48:31<5:24:24,  2.33s/it]

training loss: 3.4076061248779297
valid loss: 3.3648505210876465
perplexity: 28.92917251586914


training:  24%|██▍       | 2641/10986 [1:48:36<7:15:40,  3.13s/it]

training loss: 3.364219903945923


training:  24%|██▍       | 2642/10986 [1:48:38<6:35:48,  2.85s/it]

training loss: 3.3679943084716797


training:  24%|██▍       | 2643/10986 [1:48:41<6:17:39,  2.72s/it]

training loss: 3.264172077178955


training:  24%|██▍       | 2644/10986 [1:48:43<5:51:40,  2.53s/it]

training loss: 3.2996666431427


training:  24%|██▍       | 2645/10986 [1:48:45<5:47:39,  2.50s/it]

training loss: 3.2849202156066895


training:  24%|██▍       | 2646/10986 [1:48:47<5:30:12,  2.38s/it]

training loss: 3.3405468463897705


training:  24%|██▍       | 2647/10986 [1:48:50<5:30:55,  2.38s/it]

training loss: 3.4131033420562744


training:  24%|██▍       | 2648/10986 [1:48:52<5:20:08,  2.30s/it]

training loss: 3.3890292644500732


training:  24%|██▍       | 2649/10986 [1:48:54<5:24:58,  2.34s/it]

training loss: 3.401111364364624


training:  24%|██▍       | 2650/10986 [1:48:56<5:16:31,  2.28s/it]

training loss: 3.3035295009613037


training:  24%|██▍       | 2651/10986 [1:48:59<5:22:49,  2.32s/it]

training loss: 3.2791383266448975


training:  24%|██▍       | 2652/10986 [1:49:01<5:13:37,  2.26s/it]

training loss: 3.4061179161071777


training:  24%|██▍       | 2653/10986 [1:49:03<5:19:49,  2.30s/it]

training loss: 3.4232711791992188


training:  24%|██▍       | 2654/10986 [1:49:05<5:12:14,  2.25s/it]

training loss: 3.6324634552001953


training:  24%|██▍       | 2655/10986 [1:49:08<5:19:33,  2.30s/it]

training loss: 3.363844871520996


training:  24%|██▍       | 2656/10986 [1:49:10<5:12:02,  2.25s/it]

training loss: 3.375204563140869


training:  24%|██▍       | 2657/10986 [1:49:12<5:20:46,  2.31s/it]

training loss: 3.352727174758911


training:  24%|██▍       | 2658/10986 [1:49:14<5:13:34,  2.26s/it]

training loss: 3.3199894428253174


training:  24%|██▍       | 2659/10986 [1:49:17<5:21:12,  2.31s/it]

training loss: 3.408708333969116


training:  24%|██▍       | 2660/10986 [1:49:19<5:13:13,  2.26s/it]

training loss: 3.4118120670318604
valid loss: 3.2911789417266846
perplexity: 26.874528884887695


training:  24%|██▍       | 2661/10986 [1:49:24<6:44:30,  2.92s/it]

training loss: 3.481738805770874


training:  24%|██▍       | 2662/10986 [1:49:26<6:29:25,  2.81s/it]

training loss: 3.300391435623169


training:  24%|██▍       | 2663/10986 [1:49:28<6:00:34,  2.60s/it]

training loss: 3.3181145191192627


training:  24%|██▍       | 2664/10986 [1:49:31<5:53:19,  2.55s/it]

training loss: 3.376199722290039


training:  24%|██▍       | 2665/10986 [1:49:33<5:36:23,  2.43s/it]

training loss: 3.3967888355255127


training:  24%|██▍       | 2666/10986 [1:49:35<5:37:29,  2.43s/it]

training loss: 3.312713861465454


training:  24%|██▍       | 2667/10986 [1:49:37<5:25:44,  2.35s/it]

training loss: 3.414189338684082


training:  24%|██▍       | 2668/10986 [1:49:40<5:30:28,  2.38s/it]

training loss: 3.3940417766571045


training:  24%|██▍       | 2669/10986 [1:49:42<5:19:52,  2.31s/it]

training loss: 3.3949456214904785


training:  24%|██▍       | 2670/10986 [1:49:44<5:25:57,  2.35s/it]

training loss: 3.339585304260254


training:  24%|██▍       | 2671/10986 [1:49:47<5:26:45,  2.36s/it]

training loss: 3.3913111686706543


training:  24%|██▍       | 2672/10986 [1:49:49<5:29:50,  2.38s/it]

training loss: 3.491765260696411


training:  24%|██▍       | 2673/10986 [1:49:51<5:18:23,  2.30s/it]

training loss: 3.455089569091797


training:  24%|██▍       | 2674/10986 [1:49:54<5:24:07,  2.34s/it]

training loss: 3.401315927505493


training:  24%|██▍       | 2675/10986 [1:49:56<5:15:05,  2.27s/it]

training loss: 3.4288954734802246


training:  24%|██▍       | 2676/10986 [1:49:58<5:21:47,  2.32s/it]

training loss: 3.334482431411743


training:  24%|██▍       | 2677/10986 [1:50:00<5:12:33,  2.26s/it]

training loss: 3.41196346282959


training:  24%|██▍       | 2678/10986 [1:50:03<5:18:21,  2.30s/it]

training loss: 3.3571934700012207


training:  24%|██▍       | 2679/10986 [1:50:05<5:10:58,  2.25s/it]

training loss: 3.330328941345215


training:  24%|██▍       | 2680/10986 [1:50:07<5:17:59,  2.30s/it]

training loss: 3.3136749267578125
valid loss: 3.43275785446167
perplexity: 30.9619140625


training:  24%|██▍       | 2681/10986 [1:50:12<6:49:06,  2.96s/it]

training loss: 3.30696439743042


training:  24%|██▍       | 2682/10986 [1:50:14<6:16:54,  2.72s/it]

training loss: 3.3526480197906494


training:  24%|██▍       | 2683/10986 [1:50:16<6:04:24,  2.63s/it]

training loss: 3.3869378566741943


training:  24%|██▍       | 2684/10986 [1:50:19<5:42:25,  2.47s/it]

training loss: 3.275777578353882


training:  24%|██▍       | 2685/10986 [1:50:21<5:41:57,  2.47s/it]

training loss: 3.528294324874878


training:  24%|██▍       | 2686/10986 [1:50:23<5:25:54,  2.36s/it]

training loss: 3.425180196762085


training:  24%|██▍       | 2687/10986 [1:50:26<5:30:03,  2.39s/it]

training loss: 3.450047254562378


training:  24%|██▍       | 2688/10986 [1:50:28<5:19:15,  2.31s/it]

training loss: 3.4016857147216797


training:  24%|██▍       | 2689/10986 [1:50:30<5:25:22,  2.35s/it]

training loss: 3.3054392337799072


training:  24%|██▍       | 2690/10986 [1:50:32<5:15:39,  2.28s/it]

training loss: 3.391615390777588


training:  24%|██▍       | 2691/10986 [1:50:35<5:24:20,  2.35s/it]

training loss: 3.2104616165161133


training:  25%|██▍       | 2692/10986 [1:50:37<5:16:30,  2.29s/it]

training loss: 3.2939822673797607


training:  25%|██▍       | 2693/10986 [1:50:39<5:24:32,  2.35s/it]

training loss: 3.3466813564300537


training:  25%|██▍       | 2694/10986 [1:50:42<5:31:07,  2.40s/it]

training loss: 3.3780407905578613


training:  25%|██▍       | 2695/10986 [1:50:45<6:20:16,  2.75s/it]

training loss: 3.4423575401306152


training:  25%|██▍       | 2696/10986 [1:50:48<5:54:36,  2.57s/it]

training loss: 3.5298352241516113


training:  25%|██▍       | 2697/10986 [1:50:50<5:48:45,  2.52s/it]

training loss: 3.4164018630981445


training:  25%|██▍       | 2698/10986 [1:50:52<5:34:44,  2.42s/it]

training loss: 3.2583394050598145


training:  25%|██▍       | 2699/10986 [1:50:55<5:36:11,  2.43s/it]

training loss: 3.351194381713867


training:  25%|██▍       | 2700/10986 [1:50:57<5:24:29,  2.35s/it]

training loss: 3.4086997509002686
valid loss: 3.2886593341827393
perplexity: 26.806900024414062


training:  25%|██▍       | 2701/10986 [1:51:02<7:00:46,  3.05s/it]

training loss: 3.2326576709747314


training:  25%|██▍       | 2702/10986 [1:51:04<6:35:50,  2.87s/it]

training loss: 3.3642139434814453


training:  25%|██▍       | 2703/10986 [1:51:06<6:05:15,  2.65s/it]

training loss: 3.387779712677002


training:  25%|██▍       | 2704/10986 [1:51:09<5:57:20,  2.59s/it]

training loss: 3.3707616329193115


training:  25%|██▍       | 2705/10986 [1:51:11<5:37:12,  2.44s/it]

training loss: 3.215022325515747


training:  25%|██▍       | 2706/10986 [1:51:13<5:39:19,  2.46s/it]

training loss: 3.3793296813964844


training:  25%|██▍       | 2707/10986 [1:51:15<5:24:44,  2.35s/it]

training loss: 3.449249029159546


training:  25%|██▍       | 2708/10986 [1:51:18<5:27:31,  2.37s/it]

training loss: 3.5414562225341797


training:  25%|██▍       | 2709/10986 [1:51:20<5:17:02,  2.30s/it]

training loss: 3.4142751693725586


training:  25%|██▍       | 2710/10986 [1:51:22<5:23:50,  2.35s/it]

training loss: 3.275679111480713


training:  25%|██▍       | 2711/10986 [1:51:24<5:15:38,  2.29s/it]

training loss: 3.328528642654419


training:  25%|██▍       | 2712/10986 [1:51:27<5:21:28,  2.33s/it]

training loss: 3.2973198890686035


training:  25%|██▍       | 2713/10986 [1:51:29<5:12:05,  2.26s/it]

training loss: 3.3505353927612305


training:  25%|██▍       | 2714/10986 [1:51:31<5:17:55,  2.31s/it]

training loss: 3.3499135971069336


training:  25%|██▍       | 2715/10986 [1:51:33<5:10:14,  2.25s/it]

training loss: 3.4570093154907227


training:  25%|██▍       | 2716/10986 [1:51:36<5:16:55,  2.30s/it]

training loss: 3.488311529159546


training:  25%|██▍       | 2717/10986 [1:51:38<5:08:53,  2.24s/it]

training loss: 3.354672431945801


training:  25%|██▍       | 2718/10986 [1:51:40<5:15:48,  2.29s/it]

training loss: 3.461627721786499


training:  25%|██▍       | 2719/10986 [1:51:43<5:09:13,  2.24s/it]

training loss: 3.372554063796997


training:  25%|██▍       | 2720/10986 [1:51:45<5:16:57,  2.30s/it]

training loss: 3.3181445598602295
valid loss: 3.4005372524261475
perplexity: 29.980201721191406


training:  25%|██▍       | 2721/10986 [1:51:49<6:44:33,  2.94s/it]

training loss: 3.262308120727539


training:  25%|██▍       | 2722/10986 [1:51:52<6:13:05,  2.71s/it]

training loss: 3.5246334075927734


training:  25%|██▍       | 2723/10986 [1:51:54<6:02:25,  2.63s/it]

training loss: 3.4153640270233154


training:  25%|██▍       | 2724/10986 [1:51:56<5:42:01,  2.48s/it]

training loss: 3.3507587909698486


training:  25%|██▍       | 2725/10986 [1:51:59<5:41:14,  2.48s/it]

training loss: 3.468305826187134


training:  25%|██▍       | 2726/10986 [1:52:01<5:25:56,  2.37s/it]

training loss: 3.3221819400787354


training:  25%|██▍       | 2727/10986 [1:52:03<5:28:04,  2.38s/it]

training loss: 3.342947006225586


training:  25%|██▍       | 2728/10986 [1:52:05<5:17:31,  2.31s/it]

training loss: 3.3848226070404053


training:  25%|██▍       | 2729/10986 [1:52:08<5:21:56,  2.34s/it]

training loss: 3.4926538467407227


training:  25%|██▍       | 2730/10986 [1:52:10<5:12:49,  2.27s/it]

training loss: 3.319030284881592


training:  25%|██▍       | 2731/10986 [1:52:12<5:19:02,  2.32s/it]

training loss: 3.2401115894317627


training:  25%|██▍       | 2732/10986 [1:52:14<5:12:04,  2.27s/it]

training loss: 3.26192307472229


training:  25%|██▍       | 2733/10986 [1:52:17<5:19:24,  2.32s/it]

training loss: 3.342089891433716


training:  25%|██▍       | 2734/10986 [1:52:19<5:10:40,  2.26s/it]

training loss: 3.4895355701446533


training:  25%|██▍       | 2735/10986 [1:52:21<5:17:13,  2.31s/it]

training loss: 3.33673357963562


training:  25%|██▍       | 2736/10986 [1:52:23<5:08:41,  2.25s/it]

training loss: 3.5654232501983643


training:  25%|██▍       | 2737/10986 [1:52:26<5:17:00,  2.31s/it]

training loss: 3.3371803760528564


training:  25%|██▍       | 2738/10986 [1:52:28<5:09:03,  2.25s/it]

training loss: 3.357015609741211


training:  25%|██▍       | 2739/10986 [1:52:30<5:16:06,  2.30s/it]

training loss: 3.3534188270568848


training:  25%|██▍       | 2740/10986 [1:52:33<5:07:11,  2.24s/it]

training loss: 3.4356234073638916
valid loss: 3.424898624420166
perplexity: 30.719532012939453


training:  25%|██▍       | 2741/10986 [1:52:37<6:32:49,  2.86s/it]

training loss: 3.3673346042633057


training:  25%|██▍       | 2742/10986 [1:52:39<6:18:07,  2.75s/it]

training loss: 3.5151662826538086


training:  25%|██▍       | 2743/10986 [1:52:41<5:52:01,  2.56s/it]

training loss: 3.353388547897339


training:  25%|██▍       | 2744/10986 [1:52:44<5:46:08,  2.52s/it]

training loss: 3.371758460998535


training:  25%|██▍       | 2745/10986 [1:52:46<5:28:44,  2.39s/it]

training loss: 3.3338308334350586


training:  25%|██▍       | 2746/10986 [1:52:48<5:29:42,  2.40s/it]

training loss: 3.4434142112731934


training:  25%|██▌       | 2747/10986 [1:52:51<5:17:12,  2.31s/it]

training loss: 3.304856538772583


training:  25%|██▌       | 2748/10986 [1:52:53<5:21:53,  2.34s/it]

training loss: 3.4300150871276855


training:  25%|██▌       | 2749/10986 [1:52:55<5:11:27,  2.27s/it]

training loss: 3.3564250469207764


training:  25%|██▌       | 2750/10986 [1:52:57<5:16:38,  2.31s/it]

training loss: 3.3952629566192627


training:  25%|██▌       | 2751/10986 [1:53:00<5:08:50,  2.25s/it]

training loss: 3.4280524253845215


training:  25%|██▌       | 2752/10986 [1:53:02<5:14:36,  2.29s/it]

training loss: 3.3296656608581543


training:  25%|██▌       | 2753/10986 [1:53:04<5:06:19,  2.23s/it]

training loss: 3.329744338989258


training:  25%|██▌       | 2754/10986 [1:53:06<5:14:26,  2.29s/it]

training loss: 3.348020553588867


training:  25%|██▌       | 2755/10986 [1:53:09<5:06:29,  2.23s/it]

training loss: 3.4061708450317383


training:  25%|██▌       | 2756/10986 [1:53:11<5:13:32,  2.29s/it]

training loss: 3.398029088973999


training:  25%|██▌       | 2757/10986 [1:53:13<5:19:37,  2.33s/it]

training loss: 3.3647515773773193


training:  25%|██▌       | 2758/10986 [1:53:17<6:09:39,  2.70s/it]

training loss: 3.4692976474761963


training:  25%|██▌       | 2759/10986 [1:53:19<5:45:47,  2.52s/it]

training loss: 3.385314702987671


training:  25%|██▌       | 2760/10986 [1:53:21<5:40:50,  2.49s/it]

training loss: 3.3777151107788086
valid loss: 3.433213233947754
perplexity: 30.976016998291016


training:  25%|██▌       | 2761/10986 [1:53:26<7:11:58,  3.15s/it]

training loss: 3.379946231842041


training:  25%|██▌       | 2762/10986 [1:53:28<6:30:37,  2.85s/it]

training loss: 3.45699143409729


training:  25%|██▌       | 2763/10986 [1:53:31<6:13:15,  2.72s/it]

training loss: 3.349177360534668


training:  25%|██▌       | 2764/10986 [1:53:33<5:47:55,  2.54s/it]

training loss: 3.356903076171875


training:  25%|██▌       | 2765/10986 [1:53:35<5:43:35,  2.51s/it]

training loss: 3.397874593734741


training:  25%|██▌       | 2766/10986 [1:53:37<5:28:22,  2.40s/it]

training loss: 3.418633222579956


training:  25%|██▌       | 2767/10986 [1:53:40<5:30:22,  2.41s/it]

training loss: 3.385805606842041


training:  25%|██▌       | 2768/10986 [1:53:42<5:17:11,  2.32s/it]

training loss: 3.289598226547241


training:  25%|██▌       | 2769/10986 [1:53:44<5:22:36,  2.36s/it]

training loss: 3.430579423904419


training:  25%|██▌       | 2770/10986 [1:53:47<5:12:59,  2.29s/it]

training loss: 3.343379020690918


training:  25%|██▌       | 2771/10986 [1:53:50<5:46:27,  2.53s/it]

training loss: 3.3929665088653564


training:  25%|██▌       | 2772/10986 [1:53:52<5:29:07,  2.40s/it]

training loss: 3.402729034423828


training:  25%|██▌       | 2773/10986 [1:53:54<5:30:18,  2.41s/it]

training loss: 3.435455083847046


training:  25%|██▌       | 2774/10986 [1:53:56<5:17:21,  2.32s/it]

training loss: 3.390122175216675


training:  25%|██▌       | 2775/10986 [1:53:59<5:20:22,  2.34s/it]

training loss: 3.3587920665740967


training:  25%|██▌       | 2776/10986 [1:54:01<5:10:34,  2.27s/it]

training loss: 3.3327157497406006


training:  25%|██▌       | 2777/10986 [1:54:03<5:16:41,  2.31s/it]

training loss: 3.3713409900665283


training:  25%|██▌       | 2778/10986 [1:54:05<5:08:05,  2.25s/it]

training loss: 3.355802536010742


training:  25%|██▌       | 2779/10986 [1:54:08<5:14:37,  2.30s/it]

training loss: 3.388686418533325


training:  25%|██▌       | 2780/10986 [1:54:10<5:08:17,  2.25s/it]

training loss: 3.4322776794433594
valid loss: 3.3838818073272705
perplexity: 29.485004425048828


training:  25%|██▌       | 2781/10986 [1:54:14<6:36:13,  2.90s/it]

training loss: 3.3263468742370605


training:  25%|██▌       | 2782/10986 [1:54:17<6:20:45,  2.78s/it]

training loss: 3.417959690093994


training:  25%|██▌       | 2783/10986 [1:54:19<5:53:57,  2.59s/it]

training loss: 3.4235076904296875


training:  25%|██▌       | 2784/10986 [1:54:21<5:46:46,  2.54s/it]

training loss: 3.4150075912475586


training:  25%|██▌       | 2785/10986 [1:54:23<5:29:47,  2.41s/it]

training loss: 3.383333683013916


training:  25%|██▌       | 2786/10986 [1:54:26<5:29:19,  2.41s/it]

training loss: 3.3382961750030518


training:  25%|██▌       | 2787/10986 [1:54:28<5:17:34,  2.32s/it]

training loss: 3.321639060974121


training:  25%|██▌       | 2788/10986 [1:54:30<5:21:25,  2.35s/it]

training loss: 3.3350722789764404


training:  25%|██▌       | 2789/10986 [1:54:32<5:10:52,  2.28s/it]

training loss: 3.2687392234802246


training:  25%|██▌       | 2790/10986 [1:54:35<5:17:22,  2.32s/it]

training loss: 3.3115458488464355


training:  25%|██▌       | 2791/10986 [1:54:37<5:09:46,  2.27s/it]

training loss: 3.3190135955810547


training:  25%|██▌       | 2792/10986 [1:54:39<5:16:34,  2.32s/it]

training loss: 3.390425205230713


training:  25%|██▌       | 2793/10986 [1:54:42<5:10:02,  2.27s/it]

training loss: 3.4300549030303955


training:  25%|██▌       | 2794/10986 [1:54:44<5:17:27,  2.33s/it]

training loss: 3.4049627780914307


training:  25%|██▌       | 2795/10986 [1:54:46<5:10:18,  2.27s/it]

training loss: 3.595193386077881


training:  25%|██▌       | 2796/10986 [1:54:49<5:16:47,  2.32s/it]

training loss: 3.475062370300293


training:  25%|██▌       | 2797/10986 [1:54:51<5:07:36,  2.25s/it]

training loss: 3.2537484169006348


training:  25%|██▌       | 2798/10986 [1:54:53<5:17:38,  2.33s/it]

training loss: 3.371325969696045


training:  25%|██▌       | 2799/10986 [1:54:55<5:08:00,  2.26s/it]

training loss: 3.2719531059265137


training:  25%|██▌       | 2800/10986 [1:54:58<5:14:42,  2.31s/it]

training loss: 3.309863805770874
valid loss: 3.2824723720550537
perplexity: 26.641557693481445


training:  25%|██▌       | 2801/10986 [1:55:03<6:55:00,  3.04s/it]

training loss: 3.3681344985961914


training:  26%|██▌       | 2802/10986 [1:55:05<6:18:16,  2.77s/it]

training loss: 3.2881417274475098


training:  26%|██▌       | 2803/10986 [1:55:07<6:02:38,  2.66s/it]

training loss: 3.3796439170837402


training:  26%|██▌       | 2804/10986 [1:55:09<5:40:02,  2.49s/it]

training loss: 3.3699302673339844


training:  26%|██▌       | 2805/10986 [1:55:12<5:36:36,  2.47s/it]

training loss: 3.375335216522217


training:  26%|██▌       | 2806/10986 [1:55:14<5:20:45,  2.35s/it]

training loss: 3.416989326477051


training:  26%|██▌       | 2807/10986 [1:55:16<5:38:12,  2.48s/it]

training loss: 3.462754964828491


training:  26%|██▌       | 2808/10986 [1:55:19<5:23:41,  2.37s/it]

training loss: 3.3011441230773926


training:  26%|██▌       | 2809/10986 [1:55:21<5:28:04,  2.41s/it]

training loss: 3.5102477073669434


training:  26%|██▌       | 2810/10986 [1:55:23<5:15:28,  2.32s/it]

training loss: 3.246891498565674


training:  26%|██▌       | 2811/10986 [1:55:26<5:20:18,  2.35s/it]

training loss: 3.462416172027588


training:  26%|██▌       | 2812/10986 [1:55:28<5:09:32,  2.27s/it]

training loss: 3.5230507850646973


training:  26%|██▌       | 2813/10986 [1:55:30<5:14:42,  2.31s/it]

training loss: 3.3402864933013916


training:  26%|██▌       | 2814/10986 [1:55:32<5:06:19,  2.25s/it]

training loss: 3.3542683124542236


training:  26%|██▌       | 2815/10986 [1:55:35<5:13:17,  2.30s/it]

training loss: 3.3668551445007324


training:  26%|██▌       | 2816/10986 [1:55:37<5:04:39,  2.24s/it]

training loss: 3.356779098510742


training:  26%|██▌       | 2817/10986 [1:55:39<5:12:22,  2.29s/it]

training loss: 3.358485698699951


training:  26%|██▌       | 2818/10986 [1:55:41<5:05:01,  2.24s/it]

training loss: 3.437877655029297


training:  26%|██▌       | 2819/10986 [1:55:44<5:11:56,  2.29s/it]

training loss: 3.3575029373168945


training:  26%|██▌       | 2820/10986 [1:55:47<5:39:02,  2.49s/it]

training loss: 3.468402624130249
valid loss: 3.3606581687927246
perplexity: 28.80814552307129


training:  26%|██▌       | 2821/10986 [1:55:52<7:18:46,  3.22s/it]

training loss: 3.3814234733581543


training:  26%|██▌       | 2822/10986 [1:55:54<6:50:59,  3.02s/it]

training loss: 3.35673189163208


training:  26%|██▌       | 2823/10986 [1:55:56<6:14:56,  2.76s/it]

training loss: 3.51874041557312


training:  26%|██▌       | 2824/10986 [1:55:59<6:01:50,  2.66s/it]

training loss: 3.498626708984375


training:  26%|██▌       | 2825/10986 [1:56:01<5:38:14,  2.49s/it]

training loss: 3.4324941635131836


training:  26%|██▌       | 2826/10986 [1:56:03<5:33:59,  2.46s/it]

training loss: 3.3586270809173584


training:  26%|██▌       | 2827/10986 [1:56:05<5:19:50,  2.35s/it]

training loss: 3.3463447093963623


training:  26%|██▌       | 2828/10986 [1:56:08<5:21:42,  2.37s/it]

training loss: 3.4367477893829346


training:  26%|██▌       | 2829/10986 [1:56:10<5:11:48,  2.29s/it]

training loss: 3.289874315261841


training:  26%|██▌       | 2830/10986 [1:56:12<5:28:15,  2.41s/it]

training loss: 3.458969831466675


training:  26%|██▌       | 2831/10986 [1:56:15<5:19:01,  2.35s/it]

training loss: 3.4048781394958496


training:  26%|██▌       | 2832/10986 [1:56:17<5:20:42,  2.36s/it]

training loss: 3.2505128383636475


training:  26%|██▌       | 2833/10986 [1:56:19<5:12:34,  2.30s/it]

training loss: 3.317145586013794


training:  26%|██▌       | 2834/10986 [1:56:22<5:16:59,  2.33s/it]

training loss: 3.3708648681640625


training:  26%|██▌       | 2835/10986 [1:56:24<5:07:33,  2.26s/it]

training loss: 3.457266330718994


training:  26%|██▌       | 2836/10986 [1:56:26<5:13:46,  2.31s/it]

training loss: 3.3673338890075684


training:  26%|██▌       | 2837/10986 [1:56:28<5:04:48,  2.24s/it]

training loss: 3.5404274463653564


training:  26%|██▌       | 2838/10986 [1:56:31<5:12:11,  2.30s/it]

training loss: 3.325576066970825


training:  26%|██▌       | 2839/10986 [1:56:33<5:03:26,  2.23s/it]

training loss: 3.4210498332977295


training:  26%|██▌       | 2840/10986 [1:56:35<5:11:49,  2.30s/it]

training loss: 3.3768868446350098
valid loss: 3.399771213531494
perplexity: 29.957246780395508


training:  26%|██▌       | 2841/10986 [1:56:40<6:58:50,  3.09s/it]

training loss: 3.2479898929595947


training:  26%|██▌       | 2842/10986 [1:56:42<6:21:50,  2.81s/it]

training loss: 3.296806573867798


training:  26%|██▌       | 2843/10986 [1:56:45<6:07:27,  2.71s/it]

training loss: 3.3091959953308105


training:  26%|██▌       | 2844/10986 [1:56:47<5:41:57,  2.52s/it]

training loss: 3.3674213886260986


training:  26%|██▌       | 2845/10986 [1:56:49<5:37:28,  2.49s/it]

training loss: 3.428284168243408


training:  26%|██▌       | 2846/10986 [1:56:51<5:22:28,  2.38s/it]

training loss: 3.3332839012145996


training:  26%|██▌       | 2847/10986 [1:56:54<5:25:36,  2.40s/it]

training loss: 3.3634910583496094


training:  26%|██▌       | 2848/10986 [1:56:56<5:13:13,  2.31s/it]

training loss: 3.44055438041687


training:  26%|██▌       | 2849/10986 [1:56:58<5:19:39,  2.36s/it]

training loss: 3.3994879722595215


training:  26%|██▌       | 2850/10986 [1:57:01<5:10:09,  2.29s/it]

training loss: 3.5134565830230713


training:  26%|██▌       | 2851/10986 [1:57:03<5:15:30,  2.33s/it]

training loss: 3.2883431911468506


training:  26%|██▌       | 2852/10986 [1:57:05<5:07:20,  2.27s/it]

training loss: 3.4177632331848145


training:  26%|██▌       | 2853/10986 [1:57:08<5:14:02,  2.32s/it]

training loss: 3.3822109699249268


training:  26%|██▌       | 2854/10986 [1:57:10<5:04:25,  2.25s/it]

training loss: 3.3091092109680176


training:  26%|██▌       | 2855/10986 [1:57:12<5:12:25,  2.31s/it]

training loss: 3.452488422393799


training:  26%|██▌       | 2856/10986 [1:57:15<5:28:42,  2.43s/it]

training loss: 3.2823660373687744


training:  26%|██▌       | 2857/10986 [1:57:17<5:28:36,  2.43s/it]

training loss: 3.3439888954162598


training:  26%|██▌       | 2858/10986 [1:57:19<5:15:32,  2.33s/it]

training loss: 3.414586305618286


training:  26%|██▌       | 2859/10986 [1:57:22<5:20:24,  2.37s/it]

training loss: 3.380786895751953


training:  26%|██▌       | 2860/10986 [1:57:24<5:10:08,  2.29s/it]

training loss: 3.4761757850646973
valid loss: 3.388087511062622
perplexity: 29.609270095825195


training:  26%|██▌       | 2861/10986 [1:57:28<6:34:45,  2.92s/it]

training loss: 3.4236769676208496


training:  26%|██▌       | 2862/10986 [1:57:31<6:19:45,  2.80s/it]

training loss: 3.4261810779571533


training:  26%|██▌       | 2863/10986 [1:57:33<5:53:22,  2.61s/it]

training loss: 3.3761327266693115


training:  26%|██▌       | 2864/10986 [1:57:35<5:45:59,  2.56s/it]

training loss: 3.3560214042663574


training:  26%|██▌       | 2865/10986 [1:57:37<5:29:15,  2.43s/it]

training loss: 3.3714592456817627


training:  26%|██▌       | 2866/10986 [1:57:40<5:28:34,  2.43s/it]

training loss: 3.480191230773926


training:  26%|██▌       | 2867/10986 [1:57:42<5:15:33,  2.33s/it]

training loss: 3.385549783706665


training:  26%|██▌       | 2868/10986 [1:57:44<5:19:19,  2.36s/it]

training loss: 3.232579231262207


training:  26%|██▌       | 2869/10986 [1:57:47<5:09:17,  2.29s/it]

training loss: 3.2529404163360596


training:  26%|██▌       | 2870/10986 [1:57:49<5:14:52,  2.33s/it]

training loss: 3.4293880462646484


training:  26%|██▌       | 2871/10986 [1:57:51<5:06:02,  2.26s/it]

training loss: 3.4468212127685547


training:  26%|██▌       | 2872/10986 [1:57:54<5:13:47,  2.32s/it]

training loss: 3.3554749488830566


training:  26%|██▌       | 2873/10986 [1:57:56<5:05:22,  2.26s/it]

training loss: 3.3778114318847656


training:  26%|██▌       | 2874/10986 [1:57:58<5:11:07,  2.30s/it]

training loss: 3.331132173538208


training:  26%|██▌       | 2875/10986 [1:58:00<5:02:17,  2.24s/it]

training loss: 3.4530534744262695


training:  26%|██▌       | 2876/10986 [1:58:03<5:10:04,  2.29s/it]

training loss: 3.422924518585205


training:  26%|██▌       | 2877/10986 [1:58:05<5:02:51,  2.24s/it]

training loss: 3.4200804233551025


training:  26%|██▌       | 2878/10986 [1:58:07<5:09:19,  2.29s/it]

training loss: 3.5744175910949707


training:  26%|██▌       | 2879/10986 [1:58:09<5:01:51,  2.23s/it]

training loss: 3.443117618560791


training:  26%|██▌       | 2880/10986 [1:58:12<5:10:00,  2.29s/it]

training loss: 3.3491859436035156
valid loss: 3.281691551208496
perplexity: 26.620765686035156


training:  26%|██▌       | 2881/10986 [1:58:18<8:00:24,  3.56s/it]

training loss: 3.247802257537842


training:  26%|██▌       | 2882/10986 [1:58:21<7:25:23,  3.30s/it]

training loss: 3.4698736667633057


training:  26%|██▌       | 2883/10986 [1:58:23<6:51:39,  3.05s/it]

training loss: 3.3314030170440674


training:  26%|██▋       | 2884/10986 [1:58:25<6:14:52,  2.78s/it]

training loss: 3.4541280269622803


training:  26%|██▋       | 2885/10986 [1:58:28<6:00:49,  2.67s/it]

training loss: 3.3821301460266113


training:  26%|██▋       | 2886/10986 [1:58:30<5:37:02,  2.50s/it]

training loss: 3.498811960220337


training:  26%|██▋       | 2887/10986 [1:58:32<5:33:38,  2.47s/it]

training loss: 3.3645553588867188


training:  26%|██▋       | 2888/10986 [1:58:34<5:19:46,  2.37s/it]

training loss: 3.373894453048706


training:  26%|██▋       | 2889/10986 [1:58:37<5:20:17,  2.37s/it]

training loss: 3.35632061958313


training:  26%|██▋       | 2890/10986 [1:58:39<5:08:19,  2.29s/it]

training loss: 3.4460158348083496


training:  26%|██▋       | 2891/10986 [1:58:41<5:13:08,  2.32s/it]

training loss: 3.3140451908111572


training:  26%|██▋       | 2892/10986 [1:58:43<5:05:10,  2.26s/it]

training loss: 3.4562885761260986


training:  26%|██▋       | 2893/10986 [1:58:46<5:11:43,  2.31s/it]

training loss: 3.486107349395752


training:  26%|██▋       | 2894/10986 [1:58:48<5:03:17,  2.25s/it]

training loss: 3.3328778743743896


training:  26%|██▋       | 2895/10986 [1:58:50<5:11:13,  2.31s/it]

training loss: 3.3343911170959473


training:  26%|██▋       | 2896/10986 [1:58:53<5:03:40,  2.25s/it]

training loss: 3.395899534225464


training:  26%|██▋       | 2897/10986 [1:58:55<5:10:48,  2.31s/it]

training loss: 3.3579070568084717


training:  26%|██▋       | 2898/10986 [1:58:57<5:03:29,  2.25s/it]

training loss: 3.4416122436523438


training:  26%|██▋       | 2899/10986 [1:59:00<5:11:31,  2.31s/it]

training loss: 3.4207279682159424


training:  26%|██▋       | 2900/10986 [1:59:02<5:02:49,  2.25s/it]

training loss: 3.5728554725646973
valid loss: 3.4490866661071777
perplexity: 31.471633911132812


training:  26%|██▋       | 2901/10986 [1:59:06<6:39:37,  2.97s/it]

training loss: 3.3512606620788574


training:  26%|██▋       | 2902/10986 [1:59:09<6:19:33,  2.82s/it]

training loss: 3.3797786235809326


training:  26%|██▋       | 2903/10986 [1:59:11<5:53:02,  2.62s/it]

training loss: 3.345890998840332


training:  26%|██▋       | 2904/10986 [1:59:13<5:46:12,  2.57s/it]

training loss: 3.2720746994018555


training:  26%|██▋       | 2905/10986 [1:59:16<5:28:09,  2.44s/it]

training loss: 3.506725311279297


training:  26%|██▋       | 2906/10986 [1:59:18<5:38:25,  2.51s/it]

training loss: 3.434816360473633


training:  26%|██▋       | 2907/10986 [1:59:20<5:23:08,  2.40s/it]

training loss: 3.336603879928589


training:  26%|██▋       | 2908/10986 [1:59:23<5:23:50,  2.41s/it]

training loss: 3.4916820526123047


training:  26%|██▋       | 2909/10986 [1:59:25<5:13:44,  2.33s/it]

training loss: 3.292192220687866


training:  26%|██▋       | 2910/10986 [1:59:27<5:17:02,  2.36s/it]

training loss: 3.3651304244995117


training:  26%|██▋       | 2911/10986 [1:59:29<5:06:59,  2.28s/it]

training loss: 3.3022732734680176


training:  27%|██▋       | 2912/10986 [1:59:32<5:11:50,  2.32s/it]

training loss: 3.3775100708007812


training:  27%|██▋       | 2913/10986 [1:59:34<5:02:38,  2.25s/it]

training loss: 3.4483087062835693


training:  27%|██▋       | 2914/10986 [1:59:36<5:08:42,  2.29s/it]

training loss: 3.3892502784729004


training:  27%|██▋       | 2915/10986 [1:59:38<5:02:08,  2.25s/it]

training loss: 3.325864553451538


training:  27%|██▋       | 2916/10986 [1:59:41<5:09:43,  2.30s/it]

training loss: 3.4197287559509277


training:  27%|██▋       | 2917/10986 [1:59:43<5:02:37,  2.25s/it]

training loss: 3.404210090637207


training:  27%|██▋       | 2918/10986 [1:59:45<5:09:50,  2.30s/it]

training loss: 3.3296546936035156


training:  27%|██▋       | 2919/10986 [1:59:48<5:00:56,  2.24s/it]

training loss: 3.3319599628448486


training:  27%|██▋       | 2920/10986 [1:59:50<5:07:02,  2.28s/it]

training loss: 3.327104330062866
valid loss: 3.558851718902588
perplexity: 35.122840881347656


training:  27%|██▋       | 2921/10986 [1:59:55<6:58:11,  3.11s/it]

training loss: 3.3800296783447266


training:  27%|██▋       | 2922/10986 [1:59:57<6:20:21,  2.83s/it]

training loss: 3.532670736312866


training:  27%|██▋       | 2923/10986 [2:00:00<6:03:01,  2.70s/it]

training loss: 3.3398382663726807


training:  27%|██▋       | 2924/10986 [2:00:02<5:39:36,  2.53s/it]

training loss: 3.466104745864868


training:  27%|██▋       | 2925/10986 [2:00:04<5:34:35,  2.49s/it]

training loss: 3.51739764213562


training:  27%|██▋       | 2926/10986 [2:00:06<5:19:48,  2.38s/it]

training loss: 3.5033938884735107


training:  27%|██▋       | 2927/10986 [2:00:09<5:19:40,  2.38s/it]

training loss: 3.394655704498291


training:  27%|██▋       | 2928/10986 [2:00:11<5:08:08,  2.29s/it]

training loss: 3.267110586166382


training:  27%|██▋       | 2929/10986 [2:00:13<5:13:00,  2.33s/it]

training loss: 3.449223518371582


training:  27%|██▋       | 2930/10986 [2:00:15<5:05:07,  2.27s/it]

training loss: 3.334057331085205


training:  27%|██▋       | 2931/10986 [2:00:18<5:43:14,  2.56s/it]

training loss: 3.4452760219573975


training:  27%|██▋       | 2932/10986 [2:00:21<5:24:50,  2.42s/it]

training loss: 3.392509937286377


training:  27%|██▋       | 2933/10986 [2:00:23<5:24:15,  2.42s/it]

training loss: 3.3912739753723145


training:  27%|██▋       | 2934/10986 [2:00:25<5:12:28,  2.33s/it]

training loss: 3.440056800842285


training:  27%|██▋       | 2935/10986 [2:00:28<5:16:09,  2.36s/it]

training loss: 3.3162050247192383


training:  27%|██▋       | 2936/10986 [2:00:30<5:06:04,  2.28s/it]

training loss: 3.516355276107788


training:  27%|██▋       | 2937/10986 [2:00:32<5:10:28,  2.31s/it]

training loss: 3.35111403465271


training:  27%|██▋       | 2938/10986 [2:00:34<5:02:33,  2.26s/it]

training loss: 3.3446881771087646


training:  27%|██▋       | 2939/10986 [2:00:37<5:09:35,  2.31s/it]

training loss: 3.4040775299072266


training:  27%|██▋       | 2940/10986 [2:00:39<5:00:00,  2.24s/it]

training loss: 3.5258474349975586
valid loss: 3.3252949714660645
perplexity: 27.807199478149414


training:  27%|██▋       | 2941/10986 [2:00:43<6:31:12,  2.92s/it]

training loss: 3.288419485092163


training:  27%|██▋       | 2942/10986 [2:00:46<6:14:55,  2.80s/it]

training loss: 3.4555814266204834


training:  27%|██▋       | 2943/10986 [2:00:48<5:46:44,  2.59s/it]

training loss: 3.368212938308716


training:  27%|██▋       | 2944/10986 [2:00:51<6:17:34,  2.82s/it]

training loss: 3.3820388317108154


training:  27%|██▋       | 2945/10986 [2:00:53<6:00:11,  2.69s/it]

training loss: 3.419581174850464


training:  27%|██▋       | 2946/10986 [2:00:56<5:48:31,  2.60s/it]

training loss: 3.3890693187713623


training:  27%|██▋       | 2947/10986 [2:00:58<5:28:34,  2.45s/it]

training loss: 3.3770906925201416


training:  27%|██▋       | 2948/10986 [2:01:00<5:26:16,  2.44s/it]

training loss: 3.427203893661499


training:  27%|██▋       | 2949/10986 [2:01:02<5:11:40,  2.33s/it]

training loss: 3.4329240322113037


training:  27%|██▋       | 2950/10986 [2:01:05<5:15:29,  2.36s/it]

training loss: 3.349900484085083


training:  27%|██▋       | 2951/10986 [2:01:07<5:04:41,  2.28s/it]

training loss: 3.429137945175171


training:  27%|██▋       | 2952/10986 [2:01:09<5:09:53,  2.31s/it]

training loss: 3.5520009994506836


training:  27%|██▋       | 2953/10986 [2:01:11<5:02:02,  2.26s/it]

training loss: 3.4599533081054688


training:  27%|██▋       | 2954/10986 [2:01:14<5:07:04,  2.29s/it]

training loss: 3.4239158630371094


training:  27%|██▋       | 2955/10986 [2:01:16<5:12:07,  2.33s/it]

training loss: 3.594125270843506


training:  27%|██▋       | 2956/10986 [2:01:19<5:13:45,  2.34s/it]

training loss: 3.3774445056915283


training:  27%|██▋       | 2957/10986 [2:01:21<5:04:00,  2.27s/it]

training loss: 3.376091718673706


training:  27%|██▋       | 2958/10986 [2:01:23<5:10:55,  2.32s/it]

training loss: 3.4417452812194824


training:  27%|██▋       | 2959/10986 [2:01:25<5:02:47,  2.26s/it]

training loss: 3.4152798652648926


training:  27%|██▋       | 2960/10986 [2:01:28<5:09:07,  2.31s/it]

training loss: 3.453293561935425
valid loss: 3.5471959114074707
perplexity: 34.71583557128906


training:  27%|██▋       | 2961/10986 [2:01:32<6:44:29,  3.02s/it]

training loss: 3.5431206226348877


training:  27%|██▋       | 2962/10986 [2:01:35<6:10:10,  2.77s/it]

training loss: 3.3144989013671875


training:  27%|██▋       | 2963/10986 [2:01:37<5:56:47,  2.67s/it]

training loss: 3.283069372177124


training:  27%|██▋       | 2964/10986 [2:01:39<5:34:34,  2.50s/it]

training loss: 3.472968578338623


training:  27%|██▋       | 2965/10986 [2:01:42<5:32:28,  2.49s/it]

training loss: 3.472278118133545


training:  27%|██▋       | 2966/10986 [2:01:44<5:17:46,  2.38s/it]

training loss: 3.432560443878174


training:  27%|██▋       | 2967/10986 [2:01:46<5:19:34,  2.39s/it]

training loss: 3.5091819763183594


training:  27%|██▋       | 2968/10986 [2:01:48<5:07:55,  2.30s/it]

training loss: 3.4886107444763184


training:  27%|██▋       | 2969/10986 [2:01:51<5:11:27,  2.33s/it]

training loss: 3.427156448364258


training:  27%|██▋       | 2970/10986 [2:01:53<5:02:02,  2.26s/it]

training loss: 3.3814890384674072


training:  27%|██▋       | 2971/10986 [2:01:55<5:09:18,  2.32s/it]

training loss: 3.3821780681610107


training:  27%|██▋       | 2972/10986 [2:01:57<5:00:59,  2.25s/it]

training loss: 3.4663727283477783


training:  27%|██▋       | 2973/10986 [2:02:00<5:07:22,  2.30s/it]

training loss: 3.365140438079834


training:  27%|██▋       | 2974/10986 [2:02:02<4:59:21,  2.24s/it]

training loss: 3.4457931518554688


training:  27%|██▋       | 2975/10986 [2:02:04<5:08:11,  2.31s/it]

training loss: 3.4070329666137695


training:  27%|██▋       | 2976/10986 [2:02:06<4:59:45,  2.25s/it]

training loss: 3.469418525695801


training:  27%|██▋       | 2977/10986 [2:02:09<5:06:47,  2.30s/it]

training loss: 3.478091239929199


training:  27%|██▋       | 2978/10986 [2:02:11<4:59:59,  2.25s/it]

training loss: 3.465388774871826


training:  27%|██▋       | 2979/10986 [2:02:13<5:06:14,  2.29s/it]

training loss: 3.398226022720337


training:  27%|██▋       | 2980/10986 [2:02:15<4:57:52,  2.23s/it]

training loss: 3.353752613067627
valid loss: 3.3864905834198
perplexity: 29.56202507019043


training:  27%|██▋       | 2981/10986 [2:02:20<6:23:47,  2.88s/it]

training loss: 3.57712984085083


training:  27%|██▋       | 2982/10986 [2:02:22<6:08:53,  2.77s/it]

training loss: 3.3547427654266357


training:  27%|██▋       | 2983/10986 [2:02:24<5:43:25,  2.57s/it]

training loss: 3.417301893234253


training:  27%|██▋       | 2984/10986 [2:02:27<5:37:22,  2.53s/it]

training loss: 3.4002878665924072


training:  27%|██▋       | 2985/10986 [2:02:29<5:20:36,  2.40s/it]

training loss: 3.426095485687256


training:  27%|██▋       | 2986/10986 [2:02:31<5:21:33,  2.41s/it]

training loss: 3.3736133575439453


training:  27%|██▋       | 2987/10986 [2:02:34<5:09:44,  2.32s/it]

training loss: 3.389810800552368


training:  27%|██▋       | 2988/10986 [2:02:36<5:13:34,  2.35s/it]

training loss: 3.537393569946289


training:  27%|██▋       | 2989/10986 [2:02:38<5:02:41,  2.27s/it]

training loss: 3.4193944931030273


training:  27%|██▋       | 2990/10986 [2:02:40<5:07:57,  2.31s/it]

training loss: 3.4335124492645264


training:  27%|██▋       | 2991/10986 [2:02:43<5:00:56,  2.26s/it]

training loss: 3.4815258979797363


training:  27%|██▋       | 2992/10986 [2:02:45<5:13:58,  2.36s/it]

training loss: 3.404336929321289


training:  27%|██▋       | 2993/10986 [2:02:47<5:09:39,  2.32s/it]

training loss: 3.3882248401641846


training:  27%|██▋       | 2994/10986 [2:02:50<5:12:53,  2.35s/it]

training loss: 3.385033130645752


training:  27%|██▋       | 2995/10986 [2:02:52<5:01:55,  2.27s/it]

training loss: 3.2537667751312256


training:  27%|██▋       | 2996/10986 [2:02:54<5:09:15,  2.32s/it]

training loss: 3.342724323272705


training:  27%|██▋       | 2997/10986 [2:02:56<5:01:52,  2.27s/it]

training loss: 3.4452476501464844


training:  27%|██▋       | 2998/10986 [2:02:59<5:09:11,  2.32s/it]

training loss: 3.3869893550872803


training:  27%|██▋       | 2999/10986 [2:03:01<5:00:33,  2.26s/it]

training loss: 3.4640238285064697


training:  27%|██▋       | 3000/10986 [2:03:03<5:07:40,  2.31s/it]

training loss: 3.448829412460327
valid loss: 3.4229824542999268
perplexity: 30.660724639892578


training:  27%|██▋       | 3001/10986 [2:03:08<6:45:57,  3.05s/it]

training loss: 3.4693071842193604


training:  27%|██▋       | 3002/10986 [2:03:10<6:08:46,  2.77s/it]

training loss: 3.3432672023773193


training:  27%|██▋       | 3003/10986 [2:03:14<6:26:00,  2.90s/it]

training loss: 3.4784390926361084


training:  27%|██▋       | 3004/10986 [2:03:16<5:55:18,  2.67s/it]

training loss: 3.3387389183044434


training:  27%|██▋       | 3005/10986 [2:03:18<5:45:58,  2.60s/it]

training loss: 3.270307779312134


training:  27%|██▋       | 3006/10986 [2:03:20<5:26:26,  2.45s/it]

training loss: 3.5299224853515625


training:  27%|██▋       | 3007/10986 [2:03:23<5:50:57,  2.64s/it]

training loss: 3.3147315979003906


training:  27%|██▋       | 3008/10986 [2:03:26<5:51:06,  2.64s/it]

training loss: 3.3378677368164062


training:  27%|██▋       | 3009/10986 [2:03:28<5:43:43,  2.59s/it]

training loss: 3.4185268878936768


training:  27%|██▋       | 3010/10986 [2:03:31<5:23:53,  2.44s/it]

training loss: 3.327057361602783


training:  27%|██▋       | 3011/10986 [2:03:33<5:23:53,  2.44s/it]

training loss: 3.4301376342773438


training:  27%|██▋       | 3012/10986 [2:03:35<5:11:24,  2.34s/it]

training loss: 3.4548752307891846


training:  27%|██▋       | 3013/10986 [2:03:37<5:13:27,  2.36s/it]

training loss: 3.3963680267333984


training:  27%|██▋       | 3014/10986 [2:03:40<5:02:58,  2.28s/it]

training loss: 3.420180082321167


training:  27%|██▋       | 3015/10986 [2:03:42<5:08:01,  2.32s/it]

training loss: 3.433706521987915


training:  27%|██▋       | 3016/10986 [2:03:44<5:00:34,  2.26s/it]

training loss: 3.4831578731536865


training:  27%|██▋       | 3017/10986 [2:03:47<5:07:43,  2.32s/it]

training loss: 3.3827803134918213


training:  27%|██▋       | 3018/10986 [2:03:49<5:01:28,  2.27s/it]

training loss: 3.4111826419830322


training:  27%|██▋       | 3019/10986 [2:03:51<5:09:30,  2.33s/it]

training loss: 3.317032814025879


training:  27%|██▋       | 3020/10986 [2:03:53<5:01:08,  2.27s/it]

training loss: 3.273838758468628
valid loss: 3.4861950874328613
perplexity: 32.66143798828125


training:  27%|██▋       | 3021/10986 [2:03:58<6:27:53,  2.92s/it]

training loss: 3.5938172340393066


training:  28%|██▊       | 3022/10986 [2:04:00<6:11:50,  2.80s/it]

training loss: 3.2945356369018555


training:  28%|██▊       | 3023/10986 [2:04:02<5:44:53,  2.60s/it]

training loss: 3.507143020629883


training:  28%|██▊       | 3024/10986 [2:04:05<5:39:10,  2.56s/it]

training loss: 3.3089423179626465


training:  28%|██▊       | 3025/10986 [2:04:07<5:23:48,  2.44s/it]

training loss: 3.344482898712158


training:  28%|██▊       | 3026/10986 [2:04:09<5:23:27,  2.44s/it]

training loss: 3.4272735118865967


training:  28%|██▊       | 3027/10986 [2:04:12<5:10:39,  2.34s/it]

training loss: 3.4130992889404297


training:  28%|██▊       | 3028/10986 [2:04:14<5:21:32,  2.42s/it]

training loss: 3.356766700744629


training:  28%|██▊       | 3029/10986 [2:04:16<5:16:15,  2.38s/it]

training loss: 3.422316551208496


training:  28%|██▊       | 3030/10986 [2:04:19<5:19:29,  2.41s/it]

training loss: 3.2886974811553955


training:  28%|██▊       | 3031/10986 [2:04:21<5:07:19,  2.32s/it]

training loss: 3.4713830947875977


training:  28%|██▊       | 3032/10986 [2:04:23<5:11:18,  2.35s/it]

training loss: 3.3820652961730957


training:  28%|██▊       | 3033/10986 [2:04:26<5:03:19,  2.29s/it]

training loss: 3.4757132530212402


training:  28%|██▊       | 3034/10986 [2:04:28<5:10:56,  2.35s/it]

training loss: 3.3305721282958984


training:  28%|██▊       | 3035/10986 [2:04:30<5:02:14,  2.28s/it]

training loss: 3.4060018062591553


training:  28%|██▊       | 3036/10986 [2:04:33<5:07:05,  2.32s/it]

training loss: 3.465238332748413


training:  28%|██▊       | 3037/10986 [2:04:35<4:58:44,  2.25s/it]

training loss: 3.5039634704589844


training:  28%|██▊       | 3038/10986 [2:04:37<5:05:07,  2.30s/it]

training loss: 3.4379658699035645


training:  28%|██▊       | 3039/10986 [2:04:39<4:58:11,  2.25s/it]

training loss: 3.3678202629089355


training:  28%|██▊       | 3040/10986 [2:04:42<5:08:19,  2.33s/it]

training loss: 3.3346595764160156
valid loss: 3.333519697189331
perplexity: 28.036849975585938


training:  28%|██▊       | 3041/10986 [2:04:47<6:50:01,  3.10s/it]

training loss: 3.404754400253296


training:  28%|██▊       | 3042/10986 [2:04:49<6:13:53,  2.82s/it]

training loss: 3.4252402782440186


training:  28%|██▊       | 3043/10986 [2:04:51<5:59:21,  2.71s/it]

training loss: 3.502020835876465


training:  28%|██▊       | 3044/10986 [2:04:53<5:36:28,  2.54s/it]

training loss: 3.4339089393615723


training:  28%|██▊       | 3045/10986 [2:04:56<5:33:31,  2.52s/it]

training loss: 3.3594605922698975


training:  28%|██▊       | 3046/10986 [2:04:58<5:18:23,  2.41s/it]

training loss: 3.4908993244171143


training:  28%|██▊       | 3047/10986 [2:05:01<5:19:14,  2.41s/it]

training loss: 3.4088239669799805


training:  28%|██▊       | 3048/10986 [2:05:03<5:08:01,  2.33s/it]

training loss: 3.3794877529144287


training:  28%|██▊       | 3049/10986 [2:05:05<5:10:19,  2.35s/it]

training loss: 3.4364259243011475


training:  28%|██▊       | 3050/10986 [2:05:07<5:02:51,  2.29s/it]

training loss: 3.357065200805664


training:  28%|██▊       | 3051/10986 [2:05:10<5:07:58,  2.33s/it]

training loss: 3.426906108856201


training:  28%|██▊       | 3052/10986 [2:05:12<4:59:58,  2.27s/it]

training loss: 3.5373449325561523


training:  28%|██▊       | 3053/10986 [2:05:14<5:06:18,  2.32s/it]

training loss: 3.3130643367767334


training:  28%|██▊       | 3054/10986 [2:05:17<5:11:08,  2.35s/it]

training loss: 3.422363042831421


training:  28%|██▊       | 3055/10986 [2:05:19<5:14:07,  2.38s/it]

training loss: 3.4380943775177


training:  28%|██▊       | 3056/10986 [2:05:21<5:03:30,  2.30s/it]

training loss: 3.4311470985412598


training:  28%|██▊       | 3057/10986 [2:05:24<5:08:17,  2.33s/it]

training loss: 3.4574217796325684


training:  28%|██▊       | 3058/10986 [2:05:26<5:03:09,  2.29s/it]

training loss: 3.431612253189087


training:  28%|██▊       | 3059/10986 [2:05:28<5:08:03,  2.33s/it]

training loss: 3.5608108043670654


training:  28%|██▊       | 3060/10986 [2:05:30<5:00:08,  2.27s/it]

training loss: 3.389847755432129
valid loss: 3.393064260482788
perplexity: 29.756996154785156


training:  28%|██▊       | 3061/10986 [2:05:35<6:25:19,  2.92s/it]

training loss: 3.431313991546631


training:  28%|██▊       | 3062/10986 [2:05:37<6:08:51,  2.79s/it]

training loss: 3.3198955059051514


training:  28%|██▊       | 3063/10986 [2:05:39<5:42:19,  2.59s/it]

training loss: 3.336463212966919


training:  28%|██▊       | 3064/10986 [2:05:42<5:36:07,  2.55s/it]

training loss: 3.309373140335083


training:  28%|██▊       | 3065/10986 [2:05:44<5:19:13,  2.42s/it]

training loss: 3.32985782623291


training:  28%|██▊       | 3066/10986 [2:05:46<5:18:18,  2.41s/it]

training loss: 3.4451611042022705


training:  28%|██▊       | 3067/10986 [2:05:48<5:07:00,  2.33s/it]

training loss: 3.4280142784118652


training:  28%|██▊       | 3068/10986 [2:05:51<5:09:59,  2.35s/it]

training loss: 3.3472230434417725


training:  28%|██▊       | 3069/10986 [2:05:53<5:00:48,  2.28s/it]

training loss: 3.3045589923858643


training:  28%|██▊       | 3070/10986 [2:05:56<5:35:20,  2.54s/it]

training loss: 3.4177236557006836


training:  28%|██▊       | 3071/10986 [2:05:59<5:39:58,  2.58s/it]

training loss: 3.363807439804077


training:  28%|██▊       | 3072/10986 [2:06:01<5:34:33,  2.54s/it]

training loss: 3.620880603790283


training:  28%|██▊       | 3073/10986 [2:06:03<5:18:47,  2.42s/it]

training loss: 3.3855679035186768


training:  28%|██▊       | 3074/10986 [2:06:06<5:17:49,  2.41s/it]

training loss: 3.345094919204712


training:  28%|██▊       | 3075/10986 [2:06:08<5:06:28,  2.32s/it]

training loss: 3.37209153175354


training:  28%|██▊       | 3076/10986 [2:06:10<5:11:15,  2.36s/it]

training loss: 3.31453800201416


training:  28%|██▊       | 3077/10986 [2:06:12<5:03:02,  2.30s/it]

training loss: 3.411761999130249


training:  28%|██▊       | 3078/10986 [2:06:15<5:10:20,  2.35s/it]

training loss: 3.5173051357269287


training:  28%|██▊       | 3079/10986 [2:06:18<5:34:06,  2.54s/it]

training loss: 3.4302122592926025


training:  28%|██▊       | 3080/10986 [2:06:20<5:33:24,  2.53s/it]

training loss: 3.4028372764587402
valid loss: 3.4055418968200684
perplexity: 30.1306209564209


training:  28%|██▊       | 3081/10986 [2:06:25<6:51:33,  3.12s/it]

training loss: 3.388136148452759


training:  28%|██▊       | 3082/10986 [2:06:27<6:16:13,  2.86s/it]

training loss: 3.4876937866210938


training:  28%|██▊       | 3083/10986 [2:06:30<6:02:04,  2.75s/it]

training loss: 3.526493549346924


training:  28%|██▊       | 3084/10986 [2:06:32<5:38:45,  2.57s/it]

training loss: 3.4028568267822266


training:  28%|██▊       | 3085/10986 [2:06:34<5:33:18,  2.53s/it]

training loss: 3.439159393310547


training:  28%|██▊       | 3086/10986 [2:06:36<5:17:42,  2.41s/it]

training loss: 3.470276117324829


training:  28%|██▊       | 3087/10986 [2:06:39<5:20:29,  2.43s/it]

training loss: 3.425208330154419


training:  28%|██▊       | 3088/10986 [2:06:41<5:08:49,  2.35s/it]

training loss: 3.412055730819702


training:  28%|██▊       | 3089/10986 [2:06:44<5:15:20,  2.40s/it]

training loss: 3.4191956520080566


training:  28%|██▊       | 3090/10986 [2:06:46<5:04:54,  2.32s/it]

training loss: 3.4317398071289062


training:  28%|██▊       | 3091/10986 [2:06:48<5:10:30,  2.36s/it]

training loss: 3.47992205619812


training:  28%|██▊       | 3092/10986 [2:06:50<5:01:43,  2.29s/it]

training loss: 3.5077686309814453


training:  28%|██▊       | 3093/10986 [2:06:53<5:07:14,  2.34s/it]

training loss: 3.3279471397399902


training:  28%|██▊       | 3094/10986 [2:06:55<4:59:48,  2.28s/it]

training loss: 3.400393486022949


training:  28%|██▊       | 3095/10986 [2:06:57<5:06:00,  2.33s/it]

training loss: 3.400075912475586


training:  28%|██▊       | 3096/10986 [2:06:59<4:58:11,  2.27s/it]

training loss: 3.3313727378845215


training:  28%|██▊       | 3097/10986 [2:07:02<5:05:13,  2.32s/it]

training loss: 3.431626081466675


training:  28%|██▊       | 3098/10986 [2:07:04<4:57:02,  2.26s/it]

training loss: 3.4935338497161865


training:  28%|██▊       | 3099/10986 [2:07:06<5:05:59,  2.33s/it]

training loss: 3.4813687801361084


training:  28%|██▊       | 3100/10986 [2:07:09<4:58:02,  2.27s/it]

training loss: 3.583467960357666
valid loss: 3.4149746894836426
perplexity: 30.41617774963379


training:  28%|██▊       | 3101/10986 [2:07:13<6:31:46,  2.98s/it]

training loss: 3.363887310028076


training:  28%|██▊       | 3102/10986 [2:07:16<6:21:45,  2.91s/it]

training loss: 3.2505199909210205


training:  28%|██▊       | 3103/10986 [2:07:18<5:53:25,  2.69s/it]

training loss: 3.4589693546295166


training:  28%|██▊       | 3104/10986 [2:07:21<5:43:38,  2.62s/it]

training loss: 3.454920530319214


training:  28%|██▊       | 3105/10986 [2:07:23<5:23:55,  2.47s/it]

training loss: 3.4791855812072754


training:  28%|██▊       | 3106/10986 [2:07:25<5:24:02,  2.47s/it]

training loss: 3.448284864425659


training:  28%|██▊       | 3107/10986 [2:07:27<5:10:41,  2.37s/it]

training loss: 3.547231435775757


training:  28%|██▊       | 3108/10986 [2:07:30<5:13:12,  2.39s/it]

training loss: 3.322110176086426


training:  28%|██▊       | 3109/10986 [2:07:32<5:04:29,  2.32s/it]

training loss: 3.520731210708618


training:  28%|██▊       | 3110/10986 [2:07:34<5:08:35,  2.35s/it]

training loss: 3.337512254714966


training:  28%|██▊       | 3111/10986 [2:07:37<5:00:57,  2.29s/it]

training loss: 3.4707348346710205


training:  28%|██▊       | 3112/10986 [2:07:39<5:06:00,  2.33s/it]

training loss: 3.421875238418579


training:  28%|██▊       | 3113/10986 [2:07:41<4:57:06,  2.26s/it]

training loss: 3.4902822971343994


training:  28%|██▊       | 3114/10986 [2:07:43<5:03:00,  2.31s/it]

training loss: 3.441539764404297


training:  28%|██▊       | 3115/10986 [2:07:46<4:55:26,  2.25s/it]

training loss: 3.449176549911499


training:  28%|██▊       | 3116/10986 [2:07:48<5:01:42,  2.30s/it]

training loss: 3.4475414752960205


training:  28%|██▊       | 3117/10986 [2:07:50<4:54:10,  2.24s/it]

training loss: 3.482170343399048


training:  28%|██▊       | 3118/10986 [2:07:53<5:01:05,  2.30s/it]

training loss: 3.442056179046631


training:  28%|██▊       | 3119/10986 [2:07:55<4:53:47,  2.24s/it]

training loss: 3.4216268062591553


training:  28%|██▊       | 3120/10986 [2:07:57<4:59:42,  2.29s/it]

training loss: 3.4033000469207764
valid loss: 3.473637104034424
perplexity: 32.253841400146484


training:  28%|██▊       | 3121/10986 [2:08:01<6:21:10,  2.91s/it]

training loss: 3.4582581520080566


training:  28%|██▊       | 3122/10986 [2:08:04<5:52:10,  2.69s/it]

training loss: 3.328354835510254


training:  28%|██▊       | 3123/10986 [2:08:06<5:44:15,  2.63s/it]

training loss: 3.421996593475342


training:  28%|██▊       | 3124/10986 [2:08:08<5:24:42,  2.48s/it]

training loss: 3.4012181758880615


training:  28%|██▊       | 3125/10986 [2:08:11<5:21:09,  2.45s/it]

training loss: 3.430938720703125


training:  28%|██▊       | 3126/10986 [2:08:13<5:07:32,  2.35s/it]

training loss: 3.4153928756713867


training:  28%|██▊       | 3127/10986 [2:08:15<5:08:00,  2.35s/it]

training loss: 3.4252982139587402


training:  28%|██▊       | 3128/10986 [2:08:17<5:04:40,  2.33s/it]

training loss: 3.3714537620544434


training:  28%|██▊       | 3129/10986 [2:08:20<5:15:22,  2.41s/it]

training loss: 3.522966146469116


training:  28%|██▊       | 3130/10986 [2:08:22<5:03:29,  2.32s/it]

training loss: 3.485974073410034


training:  28%|██▊       | 3131/10986 [2:08:24<5:07:17,  2.35s/it]

training loss: 3.4897139072418213


training:  29%|██▊       | 3132/10986 [2:08:27<5:08:37,  2.36s/it]

training loss: 3.5502238273620605


training:  29%|██▊       | 3133/10986 [2:08:30<5:55:44,  2.72s/it]

training loss: 3.4459340572357178


training:  29%|██▊       | 3134/10986 [2:08:32<5:32:02,  2.54s/it]

training loss: 3.3972206115722656


training:  29%|██▊       | 3135/10986 [2:08:35<5:26:57,  2.50s/it]

training loss: 3.4127860069274902


training:  29%|██▊       | 3136/10986 [2:08:37<5:11:57,  2.38s/it]

training loss: 3.298208475112915


training:  29%|██▊       | 3137/10986 [2:08:39<5:10:54,  2.38s/it]

training loss: 3.430436849594116


training:  29%|██▊       | 3138/10986 [2:08:41<5:00:15,  2.30s/it]

training loss: 3.439749002456665


training:  29%|██▊       | 3139/10986 [2:08:44<5:04:58,  2.33s/it]

training loss: 3.4533238410949707


training:  29%|██▊       | 3140/10986 [2:08:46<4:55:46,  2.26s/it]

training loss: 3.3541159629821777
valid loss: 3.423577308654785
perplexity: 30.67896842956543


training:  29%|██▊       | 3141/10986 [2:08:50<6:20:40,  2.91s/it]

training loss: 3.4207732677459717


training:  29%|██▊       | 3142/10986 [2:08:53<6:05:35,  2.80s/it]

training loss: 3.342336416244507


training:  29%|██▊       | 3143/10986 [2:08:55<5:40:21,  2.60s/it]

training loss: 3.46219801902771


training:  29%|██▊       | 3144/10986 [2:08:57<5:31:56,  2.54s/it]

training loss: 3.499089002609253


training:  29%|██▊       | 3145/10986 [2:09:00<5:16:15,  2.42s/it]

training loss: 3.2607219219207764


training:  29%|██▊       | 3146/10986 [2:09:02<5:16:11,  2.42s/it]

training loss: 3.339982271194458


training:  29%|██▊       | 3147/10986 [2:09:04<5:04:07,  2.33s/it]

training loss: 3.5157337188720703


training:  29%|██▊       | 3148/10986 [2:09:07<5:06:42,  2.35s/it]

training loss: 3.4598352909088135


training:  29%|██▊       | 3149/10986 [2:09:09<4:57:34,  2.28s/it]

training loss: 3.4575865268707275


training:  29%|██▊       | 3150/10986 [2:09:11<5:03:02,  2.32s/it]

training loss: 3.313884735107422


training:  29%|██▊       | 3151/10986 [2:09:13<4:54:42,  2.26s/it]

training loss: 3.3025455474853516


training:  29%|██▊       | 3152/10986 [2:09:16<4:59:43,  2.30s/it]

training loss: 3.524418354034424


training:  29%|██▊       | 3153/10986 [2:09:18<4:53:43,  2.25s/it]

training loss: 3.507781982421875


training:  29%|██▊       | 3154/10986 [2:09:20<5:00:12,  2.30s/it]

training loss: 3.4379942417144775


training:  29%|██▊       | 3155/10986 [2:09:22<4:53:06,  2.25s/it]

training loss: 3.5266242027282715


training:  29%|██▊       | 3156/10986 [2:09:25<4:58:11,  2.28s/it]

training loss: 3.420619010925293


training:  29%|██▊       | 3157/10986 [2:09:27<4:51:06,  2.23s/it]

training loss: 3.5097315311431885


training:  29%|██▊       | 3158/10986 [2:09:29<4:57:12,  2.28s/it]

training loss: 3.3348841667175293


training:  29%|██▉       | 3159/10986 [2:09:31<4:50:11,  2.22s/it]

training loss: 3.3072452545166016


training:  29%|██▉       | 3160/10986 [2:09:34<4:57:09,  2.28s/it]

training loss: 3.515909433364868
valid loss: 3.3802270889282227
perplexity: 29.377443313598633


training:  29%|██▉       | 3161/10986 [2:09:38<6:18:39,  2.90s/it]

training loss: 3.424252510070801


training:  29%|██▉       | 3162/10986 [2:09:40<5:49:38,  2.68s/it]

training loss: 3.3731155395507812


training:  29%|██▉       | 3163/10986 [2:09:43<5:40:17,  2.61s/it]

training loss: 3.4385645389556885


training:  29%|██▉       | 3164/10986 [2:09:45<5:21:15,  2.46s/it]

training loss: 3.4286179542541504


training:  29%|██▉       | 3165/10986 [2:09:47<5:19:47,  2.45s/it]

training loss: 3.551798105239868


training:  29%|██▉       | 3166/10986 [2:09:49<5:05:59,  2.35s/it]

training loss: 3.3630104064941406


training:  29%|██▉       | 3167/10986 [2:09:52<5:07:52,  2.36s/it]

training loss: 3.3746492862701416


training:  29%|██▉       | 3168/10986 [2:09:54<4:58:07,  2.29s/it]

training loss: 3.5483975410461426


training:  29%|██▉       | 3169/10986 [2:09:56<5:02:07,  2.32s/it]

training loss: 3.302168846130371


training:  29%|██▉       | 3170/10986 [2:09:58<4:53:08,  2.25s/it]

training loss: 3.35473370552063


training:  29%|██▉       | 3171/10986 [2:10:01<4:59:02,  2.30s/it]

training loss: 3.4061968326568604


training:  29%|██▉       | 3172/10986 [2:10:03<4:53:06,  2.25s/it]

training loss: 3.461907386779785


training:  29%|██▉       | 3173/10986 [2:10:05<4:59:26,  2.30s/it]

training loss: 3.3903260231018066


training:  29%|██▉       | 3174/10986 [2:10:07<4:52:12,  2.24s/it]

training loss: 3.3495941162109375


training:  29%|██▉       | 3175/10986 [2:10:10<4:57:12,  2.28s/it]

training loss: 3.4792442321777344


training:  29%|██▉       | 3176/10986 [2:10:12<4:50:10,  2.23s/it]

training loss: 3.476979970932007


training:  29%|██▉       | 3177/10986 [2:10:14<4:56:34,  2.28s/it]

training loss: 3.5378546714782715


training:  29%|██▉       | 3178/10986 [2:10:16<4:50:14,  2.23s/it]

training loss: 3.3934104442596436


training:  29%|██▉       | 3179/10986 [2:10:19<4:57:52,  2.29s/it]

training loss: 3.4087750911712646


training:  29%|██▉       | 3180/10986 [2:10:21<4:49:43,  2.23s/it]

training loss: 3.4802539348602295
valid loss: 3.4588751792907715
perplexity: 31.781208038330078


training:  29%|██▉       | 3181/10986 [2:10:25<6:13:38,  2.87s/it]

training loss: 3.4363889694213867


training:  29%|██▉       | 3182/10986 [2:10:28<5:57:37,  2.75s/it]

training loss: 3.3308839797973633


training:  29%|██▉       | 3183/10986 [2:10:30<5:33:28,  2.56s/it]

training loss: 3.3727216720581055


training:  29%|██▉       | 3184/10986 [2:10:32<5:26:58,  2.51s/it]

training loss: 3.3799643516540527


training:  29%|██▉       | 3185/10986 [2:10:34<5:10:37,  2.39s/it]

training loss: 3.359386444091797


training:  29%|██▉       | 3186/10986 [2:10:37<5:10:48,  2.39s/it]

training loss: 3.395960569381714


training:  29%|██▉       | 3187/10986 [2:10:39<5:00:20,  2.31s/it]

training loss: 3.2954776287078857


training:  29%|██▉       | 3188/10986 [2:10:41<5:05:41,  2.35s/it]

training loss: 3.3271138668060303


training:  29%|██▉       | 3189/10986 [2:10:43<4:57:01,  2.29s/it]

training loss: 3.304075241088867


training:  29%|██▉       | 3190/10986 [2:10:46<5:01:51,  2.32s/it]

training loss: 3.467194080352783


training:  29%|██▉       | 3191/10986 [2:10:48<4:53:43,  2.26s/it]

training loss: 3.275261163711548


training:  29%|██▉       | 3192/10986 [2:10:50<4:58:12,  2.30s/it]

training loss: 3.442565679550171


training:  29%|██▉       | 3193/10986 [2:10:52<4:50:44,  2.24s/it]

training loss: 3.3952038288116455


training:  29%|██▉       | 3194/10986 [2:10:55<4:56:17,  2.28s/it]

training loss: 3.394688367843628


training:  29%|██▉       | 3195/10986 [2:10:57<4:48:43,  2.22s/it]

training loss: 3.4342551231384277


training:  29%|██▉       | 3196/10986 [2:11:00<5:26:23,  2.51s/it]

training loss: 3.4235706329345703


training:  29%|██▉       | 3197/10986 [2:11:03<5:26:25,  2.51s/it]

training loss: 3.3066298961639404


training:  29%|██▉       | 3198/10986 [2:11:05<5:24:09,  2.50s/it]

training loss: 3.337721109390259


training:  29%|██▉       | 3199/10986 [2:11:07<5:09:25,  2.38s/it]

training loss: 3.395630359649658


training:  29%|██▉       | 3200/10986 [2:11:10<5:09:47,  2.39s/it]

training loss: 3.3127853870391846
valid loss: 3.4629101753234863
perplexity: 31.909704208374023


training:  29%|██▉       | 3201/10986 [2:11:14<6:40:14,  3.08s/it]

training loss: 3.355022430419922


training:  29%|██▉       | 3202/10986 [2:11:16<6:06:53,  2.83s/it]

training loss: 3.3887295722961426


training:  29%|██▉       | 3203/10986 [2:11:19<5:51:26,  2.71s/it]

training loss: 3.3875997066497803


training:  29%|██▉       | 3204/10986 [2:11:21<5:28:53,  2.54s/it]

training loss: 3.36696195602417


training:  29%|██▉       | 3205/10986 [2:11:23<5:24:18,  2.50s/it]

training loss: 3.4522080421447754


training:  29%|██▉       | 3206/10986 [2:11:26<5:08:54,  2.38s/it]

training loss: 3.2912261486053467


training:  29%|██▉       | 3207/10986 [2:11:28<5:08:31,  2.38s/it]

training loss: 3.410431385040283


training:  29%|██▉       | 3208/10986 [2:11:30<4:57:38,  2.30s/it]

training loss: 3.4441182613372803


training:  29%|██▉       | 3209/10986 [2:11:32<5:02:06,  2.33s/it]

training loss: 3.351109027862549


training:  29%|██▉       | 3210/10986 [2:11:35<4:54:18,  2.27s/it]

training loss: 3.3257694244384766


training:  29%|██▉       | 3211/10986 [2:11:37<4:59:01,  2.31s/it]

training loss: 3.3417694568634033


training:  29%|██▉       | 3212/10986 [2:11:39<4:50:36,  2.24s/it]

training loss: 3.3202905654907227


training:  29%|██▉       | 3213/10986 [2:11:41<4:56:33,  2.29s/it]

training loss: 3.3673863410949707


training:  29%|██▉       | 3214/10986 [2:11:44<4:48:38,  2.23s/it]

training loss: 3.4619202613830566


training:  29%|██▉       | 3215/10986 [2:11:46<4:56:20,  2.29s/it]

training loss: 3.374117851257324


training:  29%|██▉       | 3216/10986 [2:11:48<4:50:29,  2.24s/it]

training loss: 3.4023308753967285


training:  29%|██▉       | 3217/10986 [2:11:51<4:56:46,  2.29s/it]

training loss: 3.361278772354126


training:  29%|██▉       | 3218/10986 [2:11:53<4:49:05,  2.23s/it]

training loss: 3.4465270042419434


training:  29%|██▉       | 3219/10986 [2:11:55<4:57:05,  2.30s/it]

training loss: 3.4429945945739746


training:  29%|██▉       | 3220/10986 [2:11:57<4:49:15,  2.23s/it]

training loss: 3.496504783630371
valid loss: 3.3737688064575195
perplexity: 29.188325881958008


training:  29%|██▉       | 3221/10986 [2:12:02<6:15:33,  2.90s/it]

training loss: 3.3425803184509277


training:  29%|██▉       | 3222/10986 [2:12:04<6:00:38,  2.79s/it]

training loss: 3.4764106273651123


training:  29%|██▉       | 3223/10986 [2:12:06<5:34:00,  2.58s/it]

training loss: 3.339946746826172


training:  29%|██▉       | 3224/10986 [2:12:09<5:29:08,  2.54s/it]

training loss: 3.3510239124298096


training:  29%|██▉       | 3225/10986 [2:12:11<5:11:22,  2.41s/it]

training loss: 3.3816397190093994


training:  29%|██▉       | 3226/10986 [2:12:13<5:11:44,  2.41s/it]

training loss: 3.3943281173706055


training:  29%|██▉       | 3227/10986 [2:12:15<4:59:03,  2.31s/it]

training loss: 3.386840343475342


training:  29%|██▉       | 3228/10986 [2:12:18<5:14:59,  2.44s/it]

training loss: 3.454960823059082


training:  29%|██▉       | 3229/10986 [2:12:21<5:19:51,  2.47s/it]

training loss: 3.595524311065674


training:  29%|██▉       | 3230/10986 [2:12:23<5:17:57,  2.46s/it]

training loss: 3.460364580154419


training:  29%|██▉       | 3231/10986 [2:12:25<5:04:46,  2.36s/it]

training loss: 3.3878912925720215


training:  29%|██▉       | 3232/10986 [2:12:28<5:07:23,  2.38s/it]

training loss: 3.4683444499969482


training:  29%|██▉       | 3233/10986 [2:12:30<4:57:58,  2.31s/it]

training loss: 3.5328352451324463


training:  29%|██▉       | 3234/10986 [2:12:32<5:02:46,  2.34s/it]

training loss: 3.32354736328125


training:  29%|██▉       | 3235/10986 [2:12:34<4:53:21,  2.27s/it]

training loss: 3.500993490219116


training:  29%|██▉       | 3236/10986 [2:12:37<4:58:19,  2.31s/it]

training loss: 3.4189577102661133


training:  29%|██▉       | 3237/10986 [2:12:39<4:50:02,  2.25s/it]

training loss: 3.4118032455444336


training:  29%|██▉       | 3238/10986 [2:12:41<4:55:34,  2.29s/it]

training loss: 3.3553078174591064


training:  29%|██▉       | 3239/10986 [2:12:43<4:49:45,  2.24s/it]

training loss: 3.394111156463623


training:  29%|██▉       | 3240/10986 [2:12:46<4:56:30,  2.30s/it]

training loss: 3.3573737144470215
valid loss: 3.355663299560547
perplexity: 28.66461181640625


training:  30%|██▉       | 3241/10986 [2:12:50<6:29:43,  3.02s/it]

training loss: 3.4193685054779053


training:  30%|██▉       | 3242/10986 [2:12:53<5:55:57,  2.76s/it]

training loss: 3.3423447608947754


training:  30%|██▉       | 3243/10986 [2:12:55<5:46:09,  2.68s/it]

training loss: 3.4062652587890625


training:  30%|██▉       | 3244/10986 [2:12:57<5:23:08,  2.50s/it]

training loss: 3.378560781478882


training:  30%|██▉       | 3245/10986 [2:13:00<5:19:01,  2.47s/it]

training loss: 3.4507453441619873


training:  30%|██▉       | 3246/10986 [2:13:02<5:04:24,  2.36s/it]

training loss: 3.430260181427002


training:  30%|██▉       | 3247/10986 [2:13:04<5:06:12,  2.37s/it]

training loss: 3.4993338584899902


training:  30%|██▉       | 3248/10986 [2:13:06<4:55:26,  2.29s/it]

training loss: 3.4223971366882324


training:  30%|██▉       | 3249/10986 [2:13:09<4:59:45,  2.32s/it]

training loss: 3.4032843112945557


training:  30%|██▉       | 3250/10986 [2:13:11<4:50:55,  2.26s/it]

training loss: 3.496492862701416


training:  30%|██▉       | 3251/10986 [2:13:13<4:56:40,  2.30s/it]

training loss: 3.3305838108062744


training:  30%|██▉       | 3252/10986 [2:13:15<4:48:17,  2.24s/it]

training loss: 3.3226146697998047


training:  30%|██▉       | 3253/10986 [2:13:18<4:56:14,  2.30s/it]

training loss: 3.436957597732544


training:  30%|██▉       | 3254/10986 [2:13:20<4:48:29,  2.24s/it]

training loss: 3.3249576091766357


training:  30%|██▉       | 3255/10986 [2:13:22<5:06:59,  2.38s/it]

training loss: 3.2374625205993652


training:  30%|██▉       | 3256/10986 [2:13:24<4:56:00,  2.30s/it]

training loss: 3.4452340602874756


training:  30%|██▉       | 3257/10986 [2:13:27<5:00:37,  2.33s/it]

training loss: 3.4158060550689697


training:  30%|██▉       | 3258/10986 [2:13:30<5:23:58,  2.52s/it]

training loss: 3.4214046001434326


training:  30%|██▉       | 3259/10986 [2:13:33<5:42:32,  2.66s/it]

training loss: 3.4105629920959473


training:  30%|██▉       | 3260/10986 [2:13:35<5:20:58,  2.49s/it]

training loss: 3.374283790588379
valid loss: 3.6239402294158936
perplexity: 37.48497772216797


training:  30%|██▉       | 3261/10986 [2:13:39<6:36:21,  3.08s/it]

training loss: 3.388537645339966


training:  30%|██▉       | 3262/10986 [2:13:42<6:14:48,  2.91s/it]

training loss: 3.5864408016204834


training:  30%|██▉       | 3263/10986 [2:13:44<5:43:58,  2.67s/it]

training loss: 3.5054259300231934


training:  30%|██▉       | 3264/10986 [2:13:46<5:33:54,  2.59s/it]

training loss: 3.36711049079895


training:  30%|██▉       | 3265/10986 [2:13:49<5:15:07,  2.45s/it]

training loss: 3.4076712131500244


training:  30%|██▉       | 3266/10986 [2:13:51<5:14:55,  2.45s/it]

training loss: 3.6124002933502197


training:  30%|██▉       | 3267/10986 [2:13:53<5:01:30,  2.34s/it]

training loss: 3.400010108947754


training:  30%|██▉       | 3268/10986 [2:13:56<5:05:09,  2.37s/it]

training loss: 3.323573589324951


training:  30%|██▉       | 3269/10986 [2:13:58<4:55:11,  2.30s/it]

training loss: 3.5420944690704346


training:  30%|██▉       | 3270/10986 [2:14:00<4:59:15,  2.33s/it]

training loss: 3.417342185974121


training:  30%|██▉       | 3271/10986 [2:14:02<4:49:34,  2.25s/it]

training loss: 3.567312002182007


training:  30%|██▉       | 3272/10986 [2:14:04<4:54:16,  2.29s/it]

training loss: 3.481806755065918


training:  30%|██▉       | 3273/10986 [2:14:07<4:46:01,  2.23s/it]

training loss: 3.409466505050659


training:  30%|██▉       | 3274/10986 [2:14:09<4:54:30,  2.29s/it]

training loss: 3.3837506771087646


training:  30%|██▉       | 3275/10986 [2:14:11<4:46:53,  2.23s/it]

training loss: 3.3730196952819824


training:  30%|██▉       | 3276/10986 [2:14:14<4:54:19,  2.29s/it]

training loss: 3.539822578430176


training:  30%|██▉       | 3277/10986 [2:14:16<4:58:57,  2.33s/it]

training loss: 3.4150280952453613


training:  30%|██▉       | 3278/10986 [2:14:18<5:02:23,  2.35s/it]

training loss: 3.390441656112671


training:  30%|██▉       | 3279/10986 [2:14:20<4:51:34,  2.27s/it]

training loss: 3.3554153442382812


training:  30%|██▉       | 3280/10986 [2:14:23<4:55:49,  2.30s/it]

training loss: 3.31591796875
valid loss: 3.3975934982299805
perplexity: 29.89207649230957


training:  30%|██▉       | 3281/10986 [2:14:27<6:17:30,  2.94s/it]

training loss: 3.4125821590423584


training:  30%|██▉       | 3282/10986 [2:14:29<5:47:51,  2.71s/it]

training loss: 3.4413204193115234


training:  30%|██▉       | 3283/10986 [2:14:32<5:37:08,  2.63s/it]

training loss: 3.3084964752197266


training:  30%|██▉       | 3284/10986 [2:14:34<5:15:28,  2.46s/it]

training loss: 3.5347957611083984


training:  30%|██▉       | 3285/10986 [2:14:36<5:12:28,  2.43s/it]

training loss: 3.4884204864501953


training:  30%|██▉       | 3286/10986 [2:14:38<4:58:44,  2.33s/it]

training loss: 3.5186052322387695


training:  30%|██▉       | 3287/10986 [2:14:41<5:02:05,  2.35s/it]

training loss: 3.387741804122925


training:  30%|██▉       | 3288/10986 [2:14:43<4:53:02,  2.28s/it]

training loss: 3.5743038654327393


training:  30%|██▉       | 3289/10986 [2:14:45<4:58:25,  2.33s/it]

training loss: 3.38238263130188


training:  30%|██▉       | 3290/10986 [2:14:47<4:50:16,  2.26s/it]

training loss: 3.419511318206787


training:  30%|██▉       | 3291/10986 [2:14:50<4:57:22,  2.32s/it]

training loss: 3.4581658840179443


training:  30%|██▉       | 3292/10986 [2:14:52<4:49:02,  2.25s/it]

training loss: 3.678816080093384


training:  30%|██▉       | 3293/10986 [2:14:54<4:55:39,  2.31s/it]

training loss: 3.309767007827759


training:  30%|██▉       | 3294/10986 [2:14:57<4:47:30,  2.24s/it]

training loss: 3.4087412357330322


training:  30%|██▉       | 3295/10986 [2:14:59<4:53:47,  2.29s/it]

training loss: 3.38633131980896


training:  30%|███       | 3296/10986 [2:15:01<4:48:40,  2.25s/it]

training loss: 3.4337761402130127


training:  30%|███       | 3297/10986 [2:15:04<4:57:06,  2.32s/it]

training loss: 3.340853691101074


training:  30%|███       | 3298/10986 [2:15:06<4:49:23,  2.26s/it]

training loss: 3.3485894203186035


training:  30%|███       | 3299/10986 [2:15:08<4:55:39,  2.31s/it]

training loss: 3.54618501663208


training:  30%|███       | 3300/10986 [2:15:10<4:48:11,  2.25s/it]

training loss: 3.557936906814575
valid loss: 3.4217774868011475
perplexity: 30.62380027770996


training:  30%|███       | 3301/10986 [2:15:15<6:23:37,  3.00s/it]

training loss: 3.50278639793396


training:  30%|███       | 3302/10986 [2:15:17<6:02:55,  2.83s/it]

training loss: 3.387831211090088


training:  30%|███       | 3303/10986 [2:15:20<6:05:42,  2.86s/it]

training loss: 3.4250683784484863


training:  30%|███       | 3304/10986 [2:15:23<5:48:41,  2.72s/it]

training loss: 3.3917407989501953


training:  30%|███       | 3305/10986 [2:15:25<5:25:07,  2.54s/it]

training loss: 3.448373317718506


training:  30%|███       | 3306/10986 [2:15:27<5:19:41,  2.50s/it]

training loss: 3.3452515602111816


training:  30%|███       | 3307/10986 [2:15:29<5:04:04,  2.38s/it]

training loss: 3.391998291015625


training:  30%|███       | 3308/10986 [2:15:32<5:04:48,  2.38s/it]

training loss: 3.371889114379883


training:  30%|███       | 3309/10986 [2:15:34<4:54:34,  2.30s/it]

training loss: 3.5107598304748535


training:  30%|███       | 3310/10986 [2:15:36<4:59:48,  2.34s/it]

training loss: 3.4259276390075684


training:  30%|███       | 3311/10986 [2:15:38<4:50:43,  2.27s/it]

training loss: 3.3812201023101807


training:  30%|███       | 3312/10986 [2:15:41<4:57:11,  2.32s/it]

training loss: 3.3459181785583496


training:  30%|███       | 3313/10986 [2:15:43<4:48:41,  2.26s/it]

training loss: 3.51706862449646


training:  30%|███       | 3314/10986 [2:15:45<4:53:47,  2.30s/it]

training loss: 3.515711784362793


training:  30%|███       | 3315/10986 [2:15:47<4:45:03,  2.23s/it]

training loss: 3.353764533996582


training:  30%|███       | 3316/10986 [2:15:50<4:53:50,  2.30s/it]

training loss: 3.3744728565216064


training:  30%|███       | 3317/10986 [2:15:52<4:46:07,  2.24s/it]

training loss: 3.5600509643554688


training:  30%|███       | 3318/10986 [2:15:54<4:52:58,  2.29s/it]

training loss: 3.4442100524902344


training:  30%|███       | 3319/10986 [2:15:56<4:46:29,  2.24s/it]

training loss: 3.2773609161376953


training:  30%|███       | 3320/10986 [2:15:59<4:54:12,  2.30s/it]

training loss: 3.394848346710205
valid loss: 3.5893800258636475
perplexity: 36.21161651611328


training:  30%|███       | 3321/10986 [2:16:05<7:04:39,  3.32s/it]

training loss: 3.5799338817596436


training:  30%|███       | 3322/10986 [2:16:07<6:22:06,  2.99s/it]

training loss: 3.4707448482513428


training:  30%|███       | 3323/10986 [2:16:09<6:01:07,  2.83s/it]

training loss: 3.4666459560394287


training:  30%|███       | 3324/10986 [2:16:11<5:36:32,  2.64s/it]

training loss: 3.327042579650879


training:  30%|███       | 3325/10986 [2:16:14<5:43:52,  2.69s/it]

training loss: 3.414670944213867


training:  30%|███       | 3326/10986 [2:16:16<5:20:22,  2.51s/it]

training loss: 3.218473196029663


training:  30%|███       | 3327/10986 [2:16:19<5:16:33,  2.48s/it]

training loss: 3.5343823432922363


training:  30%|███       | 3328/10986 [2:16:21<5:02:12,  2.37s/it]

training loss: 3.460813522338867


training:  30%|███       | 3329/10986 [2:16:23<5:03:14,  2.38s/it]

training loss: 3.354461193084717


training:  30%|███       | 3330/10986 [2:16:25<4:52:42,  2.29s/it]

training loss: 3.321251153945923


training:  30%|███       | 3331/10986 [2:16:28<4:56:51,  2.33s/it]

training loss: 3.3633835315704346


training:  30%|███       | 3332/10986 [2:16:30<4:47:23,  2.25s/it]

training loss: 3.412092685699463


training:  30%|███       | 3333/10986 [2:16:32<4:53:32,  2.30s/it]

training loss: 3.3716280460357666


training:  30%|███       | 3334/10986 [2:16:34<4:45:40,  2.24s/it]

training loss: 3.5311472415924072


training:  30%|███       | 3335/10986 [2:16:37<4:51:20,  2.28s/it]

training loss: 3.460575580596924


training:  30%|███       | 3336/10986 [2:16:39<4:43:39,  2.22s/it]

training loss: 3.4826860427856445


training:  30%|███       | 3337/10986 [2:16:41<4:51:08,  2.28s/it]

training loss: 3.497889280319214


training:  30%|███       | 3338/10986 [2:16:43<4:44:28,  2.23s/it]

training loss: 3.448150873184204


training:  30%|███       | 3339/10986 [2:16:46<4:50:23,  2.28s/it]

training loss: 3.3190927505493164


training:  30%|███       | 3340/10986 [2:16:48<4:42:46,  2.22s/it]

training loss: 3.3314924240112305
valid loss: 3.34900164604187
perplexity: 28.474292755126953


training:  30%|███       | 3341/10986 [2:16:52<6:03:16,  2.85s/it]

training loss: 3.4035778045654297


training:  30%|███       | 3342/10986 [2:16:55<5:51:01,  2.76s/it]

training loss: 3.423037052154541


training:  30%|███       | 3343/10986 [2:16:57<5:25:28,  2.56s/it]

training loss: 3.4355945587158203


training:  30%|███       | 3344/10986 [2:16:59<5:18:08,  2.50s/it]

training loss: 3.316073417663574


training:  30%|███       | 3345/10986 [2:17:01<5:02:05,  2.37s/it]

training loss: 3.303109884262085


training:  30%|███       | 3346/10986 [2:17:04<5:02:40,  2.38s/it]

training loss: 3.4636638164520264


training:  30%|███       | 3347/10986 [2:17:06<4:50:46,  2.28s/it]

training loss: 3.3853888511657715


training:  30%|███       | 3348/10986 [2:17:08<4:54:21,  2.31s/it]

training loss: 3.4766063690185547


training:  30%|███       | 3349/10986 [2:17:10<4:45:12,  2.24s/it]

training loss: 3.469960927963257


training:  30%|███       | 3350/10986 [2:17:13<4:51:24,  2.29s/it]

training loss: 3.467938184738159


training:  31%|███       | 3351/10986 [2:17:15<4:44:20,  2.23s/it]

training loss: 3.465484619140625


training:  31%|███       | 3352/10986 [2:17:17<5:04:05,  2.39s/it]

training loss: 3.380092144012451


training:  31%|███       | 3353/10986 [2:17:20<4:52:20,  2.30s/it]

training loss: 3.4617762565612793


training:  31%|███       | 3354/10986 [2:17:22<4:55:10,  2.32s/it]

training loss: 3.4996230602264404


training:  31%|███       | 3355/10986 [2:17:24<4:46:33,  2.25s/it]

training loss: 3.4632229804992676


training:  31%|███       | 3356/10986 [2:17:26<4:52:49,  2.30s/it]

training loss: 3.3284716606140137


training:  31%|███       | 3357/10986 [2:17:28<4:44:51,  2.24s/it]

training loss: 3.3878326416015625


training:  31%|███       | 3358/10986 [2:17:31<4:51:26,  2.29s/it]

training loss: 3.444995403289795


training:  31%|███       | 3359/10986 [2:17:33<4:43:27,  2.23s/it]

training loss: 3.402268409729004


training:  31%|███       | 3360/10986 [2:17:35<4:49:44,  2.28s/it]

training loss: 3.2879343032836914
valid loss: 3.4065186977386475
perplexity: 30.160064697265625


training:  31%|███       | 3361/10986 [2:17:40<6:13:44,  2.94s/it]

training loss: 3.4765658378601074


training:  31%|███       | 3362/10986 [2:17:42<5:43:07,  2.70s/it]

training loss: 3.395402431488037


training:  31%|███       | 3363/10986 [2:17:44<5:31:35,  2.61s/it]

training loss: 3.478426456451416


training:  31%|███       | 3364/10986 [2:17:46<5:11:13,  2.45s/it]

training loss: 3.4523651599884033


training:  31%|███       | 3365/10986 [2:17:49<5:08:51,  2.43s/it]

training loss: 3.515056848526001


training:  31%|███       | 3366/10986 [2:17:51<4:55:17,  2.33s/it]

training loss: 3.3971152305603027


training:  31%|███       | 3367/10986 [2:17:53<4:58:28,  2.35s/it]

training loss: 3.4729068279266357


training:  31%|███       | 3368/10986 [2:17:55<4:49:24,  2.28s/it]

training loss: 3.4602794647216797


training:  31%|███       | 3369/10986 [2:17:58<4:52:03,  2.30s/it]

training loss: 3.372518539428711


training:  31%|███       | 3370/10986 [2:18:00<4:43:25,  2.23s/it]

training loss: 3.5240731239318848


training:  31%|███       | 3371/10986 [2:18:02<4:49:06,  2.28s/it]

training loss: 3.4885573387145996


training:  31%|███       | 3372/10986 [2:18:04<4:41:08,  2.22s/it]

training loss: 3.4461071491241455


training:  31%|███       | 3373/10986 [2:18:07<4:48:10,  2.27s/it]

training loss: 3.357724905014038


training:  31%|███       | 3374/10986 [2:18:09<4:40:43,  2.21s/it]

training loss: 3.487586259841919


training:  31%|███       | 3375/10986 [2:18:11<4:47:14,  2.26s/it]

training loss: 3.4126036167144775


training:  31%|███       | 3376/10986 [2:18:13<4:41:38,  2.22s/it]

training loss: 3.36221981048584


training:  31%|███       | 3377/10986 [2:18:16<4:48:45,  2.28s/it]

training loss: 3.395747423171997


training:  31%|███       | 3378/10986 [2:18:18<4:41:03,  2.22s/it]

training loss: 3.526827812194824


training:  31%|███       | 3379/10986 [2:18:20<4:48:18,  2.27s/it]

training loss: 3.426470994949341


training:  31%|███       | 3380/10986 [2:18:22<4:40:21,  2.21s/it]

training loss: 3.435194730758667
valid loss: 3.3381659984588623
perplexity: 28.16741943359375


training:  31%|███       | 3381/10986 [2:18:27<5:59:41,  2.84s/it]

training loss: 3.406707286834717


training:  31%|███       | 3382/10986 [2:18:29<5:48:55,  2.75s/it]

training loss: 3.4029617309570312


training:  31%|███       | 3383/10986 [2:18:32<5:57:32,  2.82s/it]

training loss: 3.436838150024414


training:  31%|███       | 3384/10986 [2:18:35<5:50:55,  2.77s/it]

training loss: 3.395918607711792


training:  31%|███       | 3385/10986 [2:18:37<5:25:03,  2.57s/it]

training loss: 3.447293758392334


training:  31%|███       | 3386/10986 [2:18:39<5:17:05,  2.50s/it]

training loss: 3.296781063079834


training:  31%|███       | 3387/10986 [2:18:41<5:01:12,  2.38s/it]

training loss: 3.39426326751709


training:  31%|███       | 3388/10986 [2:18:44<5:01:36,  2.38s/it]

training loss: 3.426934242248535


training:  31%|███       | 3389/10986 [2:18:46<4:49:36,  2.29s/it]

training loss: 3.356205463409424


training:  31%|███       | 3390/10986 [2:18:48<4:51:34,  2.30s/it]

training loss: 3.3911380767822266


training:  31%|███       | 3391/10986 [2:18:50<4:43:13,  2.24s/it]

training loss: 3.402925968170166


training:  31%|███       | 3392/10986 [2:18:53<4:49:10,  2.28s/it]

training loss: 3.530276298522949


training:  31%|███       | 3393/10986 [2:18:55<4:41:51,  2.23s/it]

training loss: 3.665011405944824


training:  31%|███       | 3394/10986 [2:18:57<4:47:27,  2.27s/it]

training loss: 3.320504903793335


training:  31%|███       | 3395/10986 [2:18:59<4:40:12,  2.21s/it]

training loss: 3.3777880668640137


training:  31%|███       | 3396/10986 [2:19:02<4:46:46,  2.27s/it]

training loss: 3.3264408111572266


training:  31%|███       | 3397/10986 [2:19:04<4:39:50,  2.21s/it]

training loss: 3.512192487716675


training:  31%|███       | 3398/10986 [2:19:06<4:46:04,  2.26s/it]

training loss: 3.3529088497161865


training:  31%|███       | 3399/10986 [2:19:08<4:39:44,  2.21s/it]

training loss: 3.4886412620544434


training:  31%|███       | 3400/10986 [2:19:10<4:45:46,  2.26s/it]

training loss: 3.399675130844116
valid loss: 3.3604676723480225
perplexity: 28.802658081054688


training:  31%|███       | 3401/10986 [2:19:15<6:16:50,  2.98s/it]

training loss: 3.344247579574585


training:  31%|███       | 3402/10986 [2:19:17<5:43:42,  2.72s/it]

training loss: 3.4874937534332275


training:  31%|███       | 3403/10986 [2:19:20<5:30:21,  2.61s/it]

training loss: 3.6043081283569336


training:  31%|███       | 3404/10986 [2:19:22<5:10:19,  2.46s/it]

training loss: 3.5399820804595947


training:  31%|███       | 3405/10986 [2:19:24<5:07:22,  2.43s/it]

training loss: 3.4372129440307617


training:  31%|███       | 3406/10986 [2:19:26<4:56:44,  2.35s/it]

training loss: 3.444801092147827


training:  31%|███       | 3407/10986 [2:19:29<4:57:59,  2.36s/it]

training loss: 3.4577736854553223


training:  31%|███       | 3408/10986 [2:19:31<4:47:45,  2.28s/it]

training loss: 3.364490509033203


training:  31%|███       | 3409/10986 [2:19:33<4:51:17,  2.31s/it]

training loss: 3.4580869674682617


training:  31%|███       | 3410/10986 [2:19:35<4:43:06,  2.24s/it]

training loss: 3.4424283504486084


training:  31%|███       | 3411/10986 [2:19:38<4:48:00,  2.28s/it]

training loss: 3.489422082901001


training:  31%|███       | 3412/10986 [2:19:40<4:39:44,  2.22s/it]

training loss: 3.4516117572784424


training:  31%|███       | 3413/10986 [2:19:42<4:46:38,  2.27s/it]

training loss: 3.371455669403076


training:  31%|███       | 3414/10986 [2:19:44<4:40:54,  2.23s/it]

training loss: 3.47921085357666


training:  31%|███       | 3415/10986 [2:19:47<4:48:16,  2.28s/it]

training loss: 3.3459455966949463


training:  31%|███       | 3416/10986 [2:19:49<4:40:30,  2.22s/it]

training loss: 3.45044207572937


training:  31%|███       | 3417/10986 [2:19:51<4:47:17,  2.28s/it]

training loss: 3.370969772338867


training:  31%|███       | 3418/10986 [2:19:53<4:40:25,  2.22s/it]

training loss: 3.5351884365081787


training:  31%|███       | 3419/10986 [2:19:56<4:48:01,  2.28s/it]

training loss: 3.453701972961426


training:  31%|███       | 3420/10986 [2:19:58<4:40:12,  2.22s/it]

training loss: 3.458327293395996
valid loss: 3.3892934322357178
perplexity: 29.64499855041504


training:  31%|███       | 3421/10986 [2:20:02<5:58:46,  2.85s/it]

training loss: 3.3834147453308105


training:  31%|███       | 3422/10986 [2:20:04<5:46:22,  2.75s/it]

training loss: 3.483936071395874


training:  31%|███       | 3423/10986 [2:20:06<5:20:28,  2.54s/it]

training loss: 3.5037643909454346


training:  31%|███       | 3424/10986 [2:20:09<5:13:42,  2.49s/it]

training loss: 3.4828035831451416


training:  31%|███       | 3425/10986 [2:20:11<4:58:48,  2.37s/it]

training loss: 3.3535664081573486


training:  31%|███       | 3426/10986 [2:20:13<5:00:33,  2.39s/it]

training loss: 3.4696459770202637


training:  31%|███       | 3427/10986 [2:20:15<4:49:41,  2.30s/it]

training loss: 3.5661203861236572


training:  31%|███       | 3428/10986 [2:20:18<4:52:53,  2.33s/it]

training loss: 3.414135456085205


training:  31%|███       | 3429/10986 [2:20:20<4:43:40,  2.25s/it]

training loss: 3.4282755851745605


training:  31%|███       | 3430/10986 [2:20:22<4:49:06,  2.30s/it]

training loss: 3.4414212703704834


training:  31%|███       | 3431/10986 [2:20:24<4:41:01,  2.23s/it]

training loss: 3.417876720428467


training:  31%|███       | 3432/10986 [2:20:27<4:47:06,  2.28s/it]

training loss: 3.5133960247039795


training:  31%|███       | 3433/10986 [2:20:29<4:40:53,  2.23s/it]

training loss: 3.458042860031128


training:  31%|███▏      | 3434/10986 [2:20:31<4:47:29,  2.28s/it]

training loss: 3.5800185203552246


training:  31%|███▏      | 3435/10986 [2:20:33<4:40:46,  2.23s/it]

training loss: 3.42159366607666


training:  31%|███▏      | 3436/10986 [2:20:36<4:46:31,  2.28s/it]

training loss: 3.4143056869506836


training:  31%|███▏      | 3437/10986 [2:20:38<4:39:20,  2.22s/it]

training loss: 3.4027438163757324


training:  31%|███▏      | 3438/10986 [2:20:40<4:45:45,  2.27s/it]

training loss: 3.5372095108032227


training:  31%|███▏      | 3439/10986 [2:20:42<4:38:58,  2.22s/it]

training loss: 3.3423070907592773


training:  31%|███▏      | 3440/10986 [2:20:45<4:45:55,  2.27s/it]

training loss: 3.4205050468444824
valid loss: 3.5942366123199463
perplexity: 36.387908935546875


training:  31%|███▏      | 3441/10986 [2:20:49<6:15:15,  2.98s/it]

training loss: 3.4200804233551025


training:  31%|███▏      | 3442/10986 [2:20:52<5:43:17,  2.73s/it]

training loss: 3.4829154014587402


training:  31%|███▏      | 3443/10986 [2:20:54<5:31:15,  2.63s/it]

training loss: 3.526750087738037


training:  31%|███▏      | 3444/10986 [2:20:56<5:13:25,  2.49s/it]

training loss: 3.3955941200256348


training:  31%|███▏      | 3445/10986 [2:20:59<5:09:34,  2.46s/it]

training loss: 3.487212657928467


training:  31%|███▏      | 3446/10986 [2:21:01<5:00:49,  2.39s/it]

training loss: 3.45404314994812


training:  31%|███▏      | 3447/10986 [2:21:04<5:44:23,  2.74s/it]

training loss: 3.4604909420013428


training:  31%|███▏      | 3448/10986 [2:21:06<5:20:57,  2.55s/it]

training loss: 3.428049087524414


training:  31%|███▏      | 3449/10986 [2:21:09<5:14:44,  2.51s/it]

training loss: 3.383565902709961


training:  31%|███▏      | 3450/10986 [2:21:11<4:58:54,  2.38s/it]

training loss: 3.377568244934082


training:  31%|███▏      | 3451/10986 [2:21:13<4:59:29,  2.38s/it]

training loss: 3.4772958755493164


training:  31%|███▏      | 3452/10986 [2:21:15<4:48:08,  2.29s/it]

training loss: 3.388310194015503


training:  31%|███▏      | 3453/10986 [2:21:18<4:50:48,  2.32s/it]

training loss: 3.5998599529266357


training:  31%|███▏      | 3454/10986 [2:21:20<4:41:39,  2.24s/it]

training loss: 3.4390974044799805


training:  31%|███▏      | 3455/10986 [2:21:22<4:46:40,  2.28s/it]

training loss: 3.385359764099121


training:  31%|███▏      | 3456/10986 [2:21:24<4:38:52,  2.22s/it]

training loss: 3.367868185043335


training:  31%|███▏      | 3457/10986 [2:21:27<4:45:58,  2.28s/it]

training loss: 3.387277603149414


training:  31%|███▏      | 3458/10986 [2:21:29<4:37:44,  2.21s/it]

training loss: 3.5563199520111084


training:  31%|███▏      | 3459/10986 [2:21:31<4:43:43,  2.26s/it]

training loss: 3.427884101867676


training:  31%|███▏      | 3460/10986 [2:21:33<4:36:48,  2.21s/it]

training loss: 3.611680030822754
valid loss: 3.4501357078552246
perplexity: 31.504667282104492


training:  32%|███▏      | 3461/10986 [2:21:38<5:57:26,  2.85s/it]

training loss: 3.4029572010040283


training:  32%|███▏      | 3462/10986 [2:21:40<5:42:56,  2.73s/it]

training loss: 3.391728639602661


training:  32%|███▏      | 3463/10986 [2:21:42<5:17:29,  2.53s/it]

training loss: 3.515047550201416


training:  32%|███▏      | 3464/10986 [2:21:44<5:11:44,  2.49s/it]

training loss: 3.4206669330596924


training:  32%|███▏      | 3465/10986 [2:21:47<4:56:02,  2.36s/it]

training loss: 3.4381296634674072


training:  32%|███▏      | 3466/10986 [2:21:49<4:56:10,  2.36s/it]

training loss: 3.4619596004486084


training:  32%|███▏      | 3467/10986 [2:21:51<4:46:09,  2.28s/it]

training loss: 3.561687707901001


training:  32%|███▏      | 3468/10986 [2:21:53<4:50:40,  2.32s/it]

training loss: 3.4292590618133545


training:  32%|███▏      | 3469/10986 [2:21:56<4:43:17,  2.26s/it]

training loss: 3.4624550342559814


training:  32%|███▏      | 3470/10986 [2:21:58<4:47:55,  2.30s/it]

training loss: 3.3008668422698975


training:  32%|███▏      | 3471/10986 [2:22:00<4:39:26,  2.23s/it]

training loss: 3.508237361907959


training:  32%|███▏      | 3472/10986 [2:22:02<4:44:50,  2.27s/it]

training loss: 3.4081170558929443


training:  32%|███▏      | 3473/10986 [2:22:04<4:37:25,  2.22s/it]

training loss: 3.3761215209960938


training:  32%|███▏      | 3474/10986 [2:22:07<4:43:11,  2.26s/it]

training loss: 3.439685583114624


training:  32%|███▏      | 3475/10986 [2:22:09<4:35:51,  2.20s/it]

training loss: 3.4875168800354004


training:  32%|███▏      | 3476/10986 [2:22:11<4:42:42,  2.26s/it]

training loss: 3.5223193168640137


training:  32%|███▏      | 3477/10986 [2:22:13<4:36:54,  2.21s/it]

training loss: 3.4517180919647217


training:  32%|███▏      | 3478/10986 [2:22:16<4:44:23,  2.27s/it]

training loss: 3.433758020401001


training:  32%|███▏      | 3479/10986 [2:22:18<4:37:31,  2.22s/it]

training loss: 3.511188268661499


training:  32%|███▏      | 3480/10986 [2:22:20<4:44:10,  2.27s/it]

training loss: 3.38004732131958
valid loss: 3.356994152069092
perplexity: 28.70278549194336


training:  32%|███▏      | 3481/10986 [2:22:25<6:00:16,  2.88s/it]

training loss: 3.474544048309326


training:  32%|███▏      | 3482/10986 [2:22:27<5:32:31,  2.66s/it]

training loss: 3.4501771926879883


training:  32%|███▏      | 3483/10986 [2:22:29<5:22:07,  2.58s/it]

training loss: 3.4254822731018066


training:  32%|███▏      | 3484/10986 [2:22:31<5:02:39,  2.42s/it]

training loss: 3.370661735534668


training:  32%|███▏      | 3485/10986 [2:22:34<5:00:43,  2.41s/it]

training loss: 3.4549567699432373


training:  32%|███▏      | 3486/10986 [2:22:36<4:47:43,  2.30s/it]

training loss: 3.4474875926971436


training:  32%|███▏      | 3487/10986 [2:22:38<4:50:23,  2.32s/it]

training loss: 3.5607478618621826


training:  32%|███▏      | 3488/10986 [2:22:40<4:41:26,  2.25s/it]

training loss: 3.345658540725708


training:  32%|███▏      | 3489/10986 [2:22:42<4:45:28,  2.28s/it]

training loss: 3.5352141857147217


training:  32%|███▏      | 3490/10986 [2:22:45<4:38:56,  2.23s/it]

training loss: 3.431001901626587


training:  32%|███▏      | 3491/10986 [2:22:48<5:11:00,  2.49s/it]

training loss: 3.568143367767334


training:  32%|███▏      | 3492/10986 [2:22:50<4:55:48,  2.37s/it]

training loss: 3.4423611164093018


training:  32%|███▏      | 3493/10986 [2:22:52<4:58:16,  2.39s/it]

training loss: 3.4492030143737793


training:  32%|███▏      | 3494/10986 [2:22:54<4:47:24,  2.30s/it]

training loss: 3.3797621726989746


training:  32%|███▏      | 3495/10986 [2:22:57<4:51:41,  2.34s/it]

training loss: 3.579374074935913


training:  32%|███▏      | 3496/10986 [2:22:59<4:42:19,  2.26s/it]

training loss: 3.4080002307891846


training:  32%|███▏      | 3497/10986 [2:23:01<4:47:42,  2.31s/it]

training loss: 3.428316354751587


training:  32%|███▏      | 3498/10986 [2:23:03<4:39:09,  2.24s/it]

training loss: 3.6373300552368164


training:  32%|███▏      | 3499/10986 [2:23:06<4:43:21,  2.27s/it]

training loss: 3.439133644104004


training:  32%|███▏      | 3500/10986 [2:23:08<4:37:23,  2.22s/it]

training loss: 3.452024459838867
valid loss: 3.596074104309082
perplexity: 36.454833984375


training:  32%|███▏      | 3501/10986 [2:23:12<6:09:58,  2.97s/it]

training loss: 3.4248452186584473


training:  32%|███▏      | 3502/10986 [2:23:15<5:50:47,  2.81s/it]

training loss: 3.425424575805664


training:  32%|███▏      | 3503/10986 [2:23:17<5:26:32,  2.62s/it]

training loss: 3.415236473083496


training:  32%|███▏      | 3504/10986 [2:23:19<5:18:17,  2.55s/it]

training loss: 3.549265146255493


training:  32%|███▏      | 3505/10986 [2:23:22<5:01:07,  2.42s/it]

training loss: 3.5535731315612793


training:  32%|███▏      | 3506/10986 [2:23:24<5:00:02,  2.41s/it]

training loss: 3.3490970134735107


training:  32%|███▏      | 3507/10986 [2:23:26<4:47:06,  2.30s/it]

training loss: 3.5877740383148193


training:  32%|███▏      | 3508/10986 [2:23:28<4:49:05,  2.32s/it]

training loss: 3.4414381980895996


training:  32%|███▏      | 3509/10986 [2:23:31<5:04:48,  2.45s/it]

training loss: 3.362736225128174


training:  32%|███▏      | 3510/10986 [2:23:34<5:28:31,  2.64s/it]

training loss: 3.445918560028076


training:  32%|███▏      | 3511/10986 [2:23:36<5:07:50,  2.47s/it]

training loss: 3.4910359382629395


training:  32%|███▏      | 3512/10986 [2:23:39<5:03:51,  2.44s/it]

training loss: 3.3775854110717773


training:  32%|███▏      | 3513/10986 [2:23:41<4:50:16,  2.33s/it]

training loss: 3.44781231880188


training:  32%|███▏      | 3514/10986 [2:23:43<4:51:47,  2.34s/it]

training loss: 3.437408685684204


training:  32%|███▏      | 3515/10986 [2:23:45<4:42:27,  2.27s/it]

training loss: 3.463563919067383


training:  32%|███▏      | 3516/10986 [2:23:48<4:45:55,  2.30s/it]

training loss: 3.5396885871887207


training:  32%|███▏      | 3517/10986 [2:23:50<4:38:19,  2.24s/it]

training loss: 3.427046537399292


training:  32%|███▏      | 3518/10986 [2:23:52<4:44:05,  2.28s/it]

training loss: 3.3271985054016113


training:  32%|███▏      | 3519/10986 [2:23:54<4:36:32,  2.22s/it]

training loss: 3.4025001525878906


training:  32%|███▏      | 3520/10986 [2:23:56<4:43:25,  2.28s/it]

training loss: 3.572875738143921
valid loss: 3.4645416736602783
perplexity: 31.961807250976562


training:  32%|███▏      | 3521/10986 [2:24:01<6:01:46,  2.91s/it]

training loss: 3.5165352821350098


training:  32%|███▏      | 3522/10986 [2:24:03<5:32:12,  2.67s/it]

training loss: 3.455312490463257


training:  32%|███▏      | 3523/10986 [2:24:05<5:21:59,  2.59s/it]

training loss: 3.527595043182373


training:  32%|███▏      | 3524/10986 [2:24:07<5:03:08,  2.44s/it]

training loss: 3.4383363723754883


training:  32%|███▏      | 3525/10986 [2:24:10<5:00:33,  2.42s/it]

training loss: 3.440171003341675


training:  32%|███▏      | 3526/10986 [2:24:12<4:47:59,  2.32s/it]

training loss: 3.290106773376465


training:  32%|███▏      | 3527/10986 [2:24:14<4:49:36,  2.33s/it]

training loss: 3.436166524887085


training:  32%|███▏      | 3528/10986 [2:24:16<4:42:57,  2.28s/it]

training loss: 3.408686399459839


training:  32%|███▏      | 3529/10986 [2:24:19<4:47:12,  2.31s/it]

training loss: 3.499509572982788


training:  32%|███▏      | 3530/10986 [2:24:21<4:38:59,  2.25s/it]

training loss: 3.3475303649902344


training:  32%|███▏      | 3531/10986 [2:24:23<4:44:38,  2.29s/it]

training loss: 3.6550064086914062


training:  32%|███▏      | 3532/10986 [2:24:25<4:38:26,  2.24s/it]

training loss: 3.3476974964141846


training:  32%|███▏      | 3533/10986 [2:24:28<4:45:50,  2.30s/it]

training loss: 3.4230074882507324


training:  32%|███▏      | 3534/10986 [2:24:30<4:37:32,  2.23s/it]

training loss: 3.409179210662842


training:  32%|███▏      | 3535/10986 [2:24:32<4:42:01,  2.27s/it]

training loss: 3.4965546131134033


training:  32%|███▏      | 3536/10986 [2:24:34<4:35:47,  2.22s/it]

training loss: 3.3214850425720215


training:  32%|███▏      | 3537/10986 [2:24:37<4:42:26,  2.28s/it]

training loss: 3.4179704189300537


training:  32%|███▏      | 3538/10986 [2:24:39<4:35:22,  2.22s/it]

training loss: 3.4990456104278564


training:  32%|███▏      | 3539/10986 [2:24:41<4:41:57,  2.27s/it]

training loss: 3.412719964981079


training:  32%|███▏      | 3540/10986 [2:24:43<4:34:46,  2.21s/it]

training loss: 3.5982325077056885
valid loss: 3.3461647033691406
perplexity: 28.393627166748047


training:  32%|███▏      | 3541/10986 [2:24:48<5:53:51,  2.85s/it]

training loss: 3.4593076705932617


training:  32%|███▏      | 3542/10986 [2:24:50<5:39:00,  2.73s/it]

training loss: 3.4176318645477295


training:  32%|███▏      | 3543/10986 [2:24:52<5:14:28,  2.54s/it]

training loss: 3.4427218437194824


training:  32%|███▏      | 3544/10986 [2:24:55<5:08:43,  2.49s/it]

training loss: 3.4359242916107178


training:  32%|███▏      | 3545/10986 [2:24:57<4:53:22,  2.37s/it]

training loss: 3.3229010105133057


training:  32%|███▏      | 3546/10986 [2:24:59<4:53:44,  2.37s/it]

training loss: 3.4777331352233887


training:  32%|███▏      | 3547/10986 [2:25:01<4:42:50,  2.28s/it]

training loss: 3.539987325668335


training:  32%|███▏      | 3548/10986 [2:25:04<4:45:35,  2.30s/it]

training loss: 3.3657872676849365


training:  32%|███▏      | 3549/10986 [2:25:06<4:36:48,  2.23s/it]

training loss: 3.5188803672790527


training:  32%|███▏      | 3550/10986 [2:25:08<4:41:56,  2.27s/it]

training loss: 3.451792001724243


training:  32%|███▏      | 3551/10986 [2:25:10<4:35:27,  2.22s/it]

training loss: 3.4562063217163086


training:  32%|███▏      | 3552/10986 [2:25:12<4:41:18,  2.27s/it]

training loss: 3.492582082748413


training:  32%|███▏      | 3553/10986 [2:25:15<4:35:09,  2.22s/it]

training loss: 3.3415870666503906


training:  32%|███▏      | 3554/10986 [2:25:17<4:41:13,  2.27s/it]

training loss: 3.3691489696502686


training:  32%|███▏      | 3555/10986 [2:25:19<4:34:06,  2.21s/it]

training loss: 3.4693245887756348


training:  32%|███▏      | 3556/10986 [2:25:21<4:39:24,  2.26s/it]

training loss: 3.4423770904541016


training:  32%|███▏      | 3557/10986 [2:25:23<4:33:14,  2.21s/it]

training loss: 3.3513550758361816


training:  32%|███▏      | 3558/10986 [2:25:26<4:40:23,  2.26s/it]

training loss: 3.3571479320526123


training:  32%|███▏      | 3559/10986 [2:25:28<4:32:55,  2.20s/it]

training loss: 3.508084774017334


training:  32%|███▏      | 3560/10986 [2:25:30<4:38:33,  2.25s/it]

training loss: 3.508725881576538
valid loss: 3.324445962905884
perplexity: 27.783601760864258


training:  32%|███▏      | 3561/10986 [2:25:35<5:54:00,  2.86s/it]

training loss: 3.3166563510894775


training:  32%|███▏      | 3562/10986 [2:25:37<5:27:28,  2.65s/it]

training loss: 3.4529523849487305


training:  32%|███▏      | 3563/10986 [2:25:39<5:18:30,  2.57s/it]

training loss: 3.5366451740264893


training:  32%|███▏      | 3564/10986 [2:25:41<5:00:58,  2.43s/it]

training loss: 3.383207321166992


training:  32%|███▏      | 3565/10986 [2:25:44<4:59:17,  2.42s/it]

training loss: 3.436748504638672


training:  32%|███▏      | 3566/10986 [2:25:46<4:47:23,  2.32s/it]

training loss: 3.3666255474090576


training:  32%|███▏      | 3567/10986 [2:25:48<4:51:30,  2.36s/it]

training loss: 3.5620665550231934


training:  32%|███▏      | 3568/10986 [2:25:50<4:42:24,  2.28s/it]

training loss: 3.4959049224853516


training:  32%|███▏      | 3569/10986 [2:25:53<4:45:34,  2.31s/it]

training loss: 3.3840484619140625


training:  32%|███▏      | 3570/10986 [2:25:55<4:41:12,  2.28s/it]

training loss: 3.4249370098114014


training:  33%|███▎      | 3571/10986 [2:25:57<4:46:25,  2.32s/it]

training loss: 3.5757524967193604


training:  33%|███▎      | 3572/10986 [2:26:00<5:12:24,  2.53s/it]

training loss: 3.5145580768585205


training:  33%|███▎      | 3573/10986 [2:26:03<5:18:23,  2.58s/it]

training loss: 3.5263442993164062


training:  33%|███▎      | 3574/10986 [2:26:05<5:01:31,  2.44s/it]

training loss: 3.4729104042053223


training:  33%|███▎      | 3575/10986 [2:26:07<5:00:32,  2.43s/it]

training loss: 3.4016735553741455


training:  33%|███▎      | 3576/10986 [2:26:10<4:46:50,  2.32s/it]

training loss: 3.307814359664917


training:  33%|███▎      | 3577/10986 [2:26:12<4:50:20,  2.35s/it]

training loss: 3.4341681003570557


training:  33%|███▎      | 3578/10986 [2:26:14<4:41:56,  2.28s/it]

training loss: 3.3146636486053467


training:  33%|███▎      | 3579/10986 [2:26:16<4:46:09,  2.32s/it]

training loss: 3.4235336780548096


training:  33%|███▎      | 3580/10986 [2:26:19<4:37:46,  2.25s/it]

training loss: 3.403636932373047
valid loss: 3.4623682498931885
perplexity: 31.892417907714844


training:  33%|███▎      | 3581/10986 [2:26:23<5:53:43,  2.87s/it]

training loss: 3.478809118270874


training:  33%|███▎      | 3582/10986 [2:26:25<5:40:28,  2.76s/it]

training loss: 3.5614941120147705


training:  33%|███▎      | 3583/10986 [2:26:27<5:14:48,  2.55s/it]

training loss: 3.428004741668701


training:  33%|███▎      | 3584/10986 [2:26:30<5:08:02,  2.50s/it]

training loss: 3.465312957763672


training:  33%|███▎      | 3585/10986 [2:26:32<4:52:58,  2.38s/it]

training loss: 3.3006482124328613


training:  33%|███▎      | 3586/10986 [2:26:34<4:53:51,  2.38s/it]

training loss: 3.3644230365753174


training:  33%|███▎      | 3587/10986 [2:26:36<4:42:41,  2.29s/it]

training loss: 3.524521827697754


training:  33%|███▎      | 3588/10986 [2:26:39<4:45:22,  2.31s/it]

training loss: 3.4939708709716797


training:  33%|███▎      | 3589/10986 [2:26:41<4:37:16,  2.25s/it]

training loss: 3.43658709526062


training:  33%|███▎      | 3590/10986 [2:26:43<4:42:21,  2.29s/it]

training loss: 3.5391087532043457


training:  33%|███▎      | 3591/10986 [2:26:46<4:44:37,  2.31s/it]

training loss: 3.4490320682525635


training:  33%|███▎      | 3592/10986 [2:26:48<4:51:39,  2.37s/it]

training loss: 3.4149436950683594


training:  33%|███▎      | 3593/10986 [2:26:50<4:41:22,  2.28s/it]

training loss: 3.598012924194336


training:  33%|███▎      | 3594/10986 [2:26:53<4:44:27,  2.31s/it]

training loss: 3.4063055515289307


training:  33%|███▎      | 3595/10986 [2:26:55<4:37:02,  2.25s/it]

training loss: 3.4346611499786377


training:  33%|███▎      | 3596/10986 [2:26:57<4:44:08,  2.31s/it]

training loss: 3.3612678050994873


training:  33%|███▎      | 3597/10986 [2:26:59<4:36:52,  2.25s/it]

training loss: 3.3769772052764893


training:  33%|███▎      | 3598/10986 [2:27:02<4:43:26,  2.30s/it]

training loss: 3.287184476852417


training:  33%|███▎      | 3599/10986 [2:27:04<4:36:23,  2.24s/it]

training loss: 3.4884660243988037


training:  33%|███▎      | 3600/10986 [2:27:06<4:42:24,  2.29s/it]

training loss: 3.487013339996338
valid loss: 3.489008665084839
perplexity: 32.75345993041992


training:  33%|███▎      | 3601/10986 [2:27:11<6:12:51,  3.03s/it]

training loss: 3.4937376976013184


training:  33%|███▎      | 3602/10986 [2:27:13<5:40:18,  2.77s/it]

training loss: 3.318272113800049


training:  33%|███▎      | 3603/10986 [2:27:15<5:26:59,  2.66s/it]

training loss: 3.356444835662842


training:  33%|███▎      | 3604/10986 [2:27:18<5:05:30,  2.48s/it]

training loss: 3.4563002586364746


training:  33%|███▎      | 3605/10986 [2:27:20<5:01:48,  2.45s/it]

training loss: 3.3928165435791016


training:  33%|███▎      | 3606/10986 [2:27:22<4:47:16,  2.34s/it]

training loss: 3.3646352291107178


training:  33%|███▎      | 3607/10986 [2:27:24<4:48:36,  2.35s/it]

training loss: 3.4224250316619873


training:  33%|███▎      | 3608/10986 [2:27:26<4:39:02,  2.27s/it]

training loss: 3.460939884185791


training:  33%|███▎      | 3609/10986 [2:27:29<4:41:41,  2.29s/it]

training loss: 3.3902342319488525


training:  33%|███▎      | 3610/10986 [2:27:31<4:33:21,  2.22s/it]

training loss: 3.410557985305786


training:  33%|███▎      | 3611/10986 [2:27:33<4:37:33,  2.26s/it]

training loss: 3.4791431427001953


training:  33%|███▎      | 3612/10986 [2:27:35<4:31:46,  2.21s/it]

training loss: 3.572993516921997


training:  33%|███▎      | 3613/10986 [2:27:38<4:38:03,  2.26s/it]

training loss: 3.5623979568481445


training:  33%|███▎      | 3614/10986 [2:27:40<4:31:20,  2.21s/it]

training loss: 3.5517020225524902


training:  33%|███▎      | 3615/10986 [2:27:42<4:37:53,  2.26s/it]

training loss: 3.4857900142669678


training:  33%|███▎      | 3616/10986 [2:27:44<4:31:45,  2.21s/it]

training loss: 3.4477062225341797


training:  33%|███▎      | 3617/10986 [2:27:47<4:37:53,  2.26s/it]

training loss: 3.4264779090881348


training:  33%|███▎      | 3618/10986 [2:27:49<4:31:26,  2.21s/it]

training loss: 3.441303014755249


training:  33%|███▎      | 3619/10986 [2:27:51<4:37:45,  2.26s/it]

training loss: 3.37664794921875


training:  33%|███▎      | 3620/10986 [2:27:53<4:30:52,  2.21s/it]

training loss: 3.387479782104492
valid loss: 3.38010573387146
perplexity: 29.373876571655273


training:  33%|███▎      | 3621/10986 [2:27:58<5:51:43,  2.87s/it]

training loss: 3.4494333267211914


training:  33%|███▎      | 3622/10986 [2:28:00<5:38:33,  2.76s/it]

training loss: 3.434880256652832


training:  33%|███▎      | 3623/10986 [2:28:02<5:14:01,  2.56s/it]

training loss: 3.4618141651153564


training:  33%|███▎      | 3624/10986 [2:28:05<5:06:56,  2.50s/it]

training loss: 3.3470358848571777


training:  33%|███▎      | 3625/10986 [2:28:07<4:51:08,  2.37s/it]

training loss: 3.2589101791381836


training:  33%|███▎      | 3626/10986 [2:28:09<4:51:41,  2.38s/it]

training loss: 3.444504737854004


training:  33%|███▎      | 3627/10986 [2:28:11<4:40:30,  2.29s/it]

training loss: 3.3617467880249023


training:  33%|███▎      | 3628/10986 [2:28:13<4:42:32,  2.30s/it]

training loss: 3.4434871673583984


training:  33%|███▎      | 3629/10986 [2:28:16<4:35:01,  2.24s/it]

training loss: 3.361248016357422


training:  33%|███▎      | 3630/10986 [2:28:18<4:39:39,  2.28s/it]

training loss: 3.4522762298583984


training:  33%|███▎      | 3631/10986 [2:28:20<4:32:25,  2.22s/it]

training loss: 3.417365550994873


training:  33%|███▎      | 3632/10986 [2:28:22<4:38:53,  2.28s/it]

training loss: 3.54451584815979


training:  33%|███▎      | 3633/10986 [2:28:25<4:33:31,  2.23s/it]

training loss: 3.317845582962036


training:  33%|███▎      | 3634/10986 [2:28:28<5:17:28,  2.59s/it]

training loss: 3.39418888092041


training:  33%|███▎      | 3635/10986 [2:28:30<5:04:27,  2.48s/it]

training loss: 3.376875400543213


training:  33%|███▎      | 3636/10986 [2:28:33<5:00:05,  2.45s/it]

training loss: 3.388273239135742


training:  33%|███▎      | 3637/10986 [2:28:35<4:46:41,  2.34s/it]

training loss: 3.4478518962860107


training:  33%|███▎      | 3638/10986 [2:28:37<4:48:08,  2.35s/it]

training loss: 3.5329694747924805


training:  33%|███▎      | 3639/10986 [2:28:39<4:38:13,  2.27s/it]

training loss: 3.6278982162475586


training:  33%|███▎      | 3640/10986 [2:28:41<4:42:19,  2.31s/it]

training loss: 3.330456018447876
valid loss: 3.6169273853302
perplexity: 37.223018646240234


training:  33%|███▎      | 3641/10986 [2:28:46<6:03:13,  2.97s/it]

training loss: 3.4203779697418213


training:  33%|███▎      | 3642/10986 [2:28:48<5:34:56,  2.74s/it]

training loss: 3.3718068599700928


training:  33%|███▎      | 3643/10986 [2:28:51<5:25:37,  2.66s/it]

training loss: 3.5599265098571777


training:  33%|███▎      | 3644/10986 [2:28:53<5:04:12,  2.49s/it]

training loss: 3.541879653930664


training:  33%|███▎      | 3645/10986 [2:28:55<5:01:39,  2.47s/it]

training loss: 3.449974536895752


training:  33%|███▎      | 3646/10986 [2:28:57<4:49:13,  2.36s/it]

training loss: 3.5631351470947266


training:  33%|███▎      | 3647/10986 [2:29:00<4:50:10,  2.37s/it]

training loss: 3.344494342803955


training:  33%|███▎      | 3648/10986 [2:29:02<4:40:35,  2.29s/it]

training loss: 3.576596975326538


training:  33%|███▎      | 3649/10986 [2:29:04<4:43:55,  2.32s/it]

training loss: 3.5060460567474365


training:  33%|███▎      | 3650/10986 [2:29:06<4:36:36,  2.26s/it]

training loss: 3.350064754486084


training:  33%|███▎      | 3651/10986 [2:29:09<4:41:43,  2.30s/it]

training loss: 3.482924222946167


training:  33%|███▎      | 3652/10986 [2:29:11<4:33:30,  2.24s/it]

training loss: 3.5275461673736572


training:  33%|███▎      | 3653/10986 [2:29:13<4:38:59,  2.28s/it]

training loss: 3.4468460083007812


training:  33%|███▎      | 3654/10986 [2:29:15<4:31:49,  2.22s/it]

training loss: 3.339473009109497


training:  33%|███▎      | 3655/10986 [2:29:18<4:37:13,  2.27s/it]

training loss: 3.3753387928009033


training:  33%|███▎      | 3656/10986 [2:29:20<4:30:58,  2.22s/it]

training loss: 3.343810558319092


training:  33%|███▎      | 3657/10986 [2:29:22<4:39:03,  2.28s/it]

training loss: 3.414802074432373


training:  33%|███▎      | 3658/10986 [2:29:24<4:32:42,  2.23s/it]

training loss: 3.381617546081543


training:  33%|███▎      | 3659/10986 [2:29:27<4:38:37,  2.28s/it]

training loss: 3.4937987327575684


training:  33%|███▎      | 3660/10986 [2:29:29<4:30:51,  2.22s/it]

training loss: 3.450288772583008
valid loss: 3.525527238845825
perplexity: 33.9716796875


training:  33%|███▎      | 3661/10986 [2:29:33<5:50:02,  2.87s/it]

training loss: 3.3743233680725098


training:  33%|███▎      | 3662/10986 [2:29:36<5:38:03,  2.77s/it]

training loss: 3.384401798248291


training:  33%|███▎      | 3663/10986 [2:29:38<5:13:29,  2.57s/it]

training loss: 3.3198418617248535


training:  33%|███▎      | 3664/10986 [2:29:40<5:07:42,  2.52s/it]

training loss: 3.450441360473633


training:  33%|███▎      | 3665/10986 [2:29:42<4:55:53,  2.42s/it]

training loss: 3.4811477661132812


training:  33%|███▎      | 3666/10986 [2:29:45<4:54:42,  2.42s/it]

training loss: 3.4148051738739014


training:  33%|███▎      | 3667/10986 [2:29:47<4:42:52,  2.32s/it]

training loss: 3.461280584335327


training:  33%|███▎      | 3668/10986 [2:29:49<4:45:08,  2.34s/it]

training loss: 3.714872360229492


training:  33%|███▎      | 3669/10986 [2:29:51<4:37:20,  2.27s/it]

training loss: 3.3775782585144043


training:  33%|███▎      | 3670/10986 [2:29:54<4:42:55,  2.32s/it]

training loss: 3.378861904144287


training:  33%|███▎      | 3671/10986 [2:29:56<4:35:13,  2.26s/it]

training loss: 3.413360357284546


training:  33%|███▎      | 3672/10986 [2:29:58<4:39:55,  2.30s/it]

training loss: 3.335906744003296


training:  33%|███▎      | 3673/10986 [2:30:00<4:35:22,  2.26s/it]

training loss: 3.5180914402008057


training:  33%|███▎      | 3674/10986 [2:30:03<4:42:27,  2.32s/it]

training loss: 3.3007805347442627


training:  33%|███▎      | 3675/10986 [2:30:05<4:37:30,  2.28s/it]

training loss: 3.368558168411255


training:  33%|███▎      | 3676/10986 [2:30:08<4:42:11,  2.32s/it]

training loss: 3.4053289890289307


training:  33%|███▎      | 3677/10986 [2:30:10<4:34:33,  2.25s/it]

training loss: 3.4642796516418457


training:  33%|███▎      | 3678/10986 [2:30:12<4:40:43,  2.30s/it]

training loss: 3.38594913482666


training:  33%|███▎      | 3679/10986 [2:30:14<4:33:01,  2.24s/it]

training loss: 3.5239696502685547


training:  33%|███▎      | 3680/10986 [2:30:17<4:39:35,  2.30s/it]

training loss: 3.4007771015167236
valid loss: 3.387295961380005
perplexity: 29.585844039916992


training:  34%|███▎      | 3681/10986 [2:30:21<6:03:02,  2.98s/it]

training loss: 3.373544454574585


training:  34%|███▎      | 3682/10986 [2:30:23<5:35:06,  2.75s/it]

training loss: 3.460286855697632


training:  34%|███▎      | 3683/10986 [2:30:26<5:26:02,  2.68s/it]

training loss: 3.3094029426574707


training:  34%|███▎      | 3684/10986 [2:30:28<5:05:11,  2.51s/it]

training loss: 3.613895893096924


training:  34%|███▎      | 3685/10986 [2:30:30<5:02:20,  2.48s/it]

training loss: 3.5202696323394775


training:  34%|███▎      | 3686/10986 [2:30:33<4:47:24,  2.36s/it]

training loss: 3.4103381633758545


training:  34%|███▎      | 3687/10986 [2:30:35<4:48:46,  2.37s/it]

training loss: 3.3858635425567627


training:  34%|███▎      | 3688/10986 [2:30:37<4:38:41,  2.29s/it]

training loss: 3.361179828643799


training:  34%|███▎      | 3689/10986 [2:30:39<4:42:50,  2.33s/it]

training loss: 3.330758571624756


training:  34%|███▎      | 3690/10986 [2:30:42<4:35:27,  2.27s/it]

training loss: 3.2986106872558594


training:  34%|███▎      | 3691/10986 [2:30:44<4:40:18,  2.31s/it]

training loss: 3.5229456424713135


training:  34%|███▎      | 3692/10986 [2:30:46<4:48:21,  2.37s/it]

training loss: 3.4840586185455322


training:  34%|███▎      | 3693/10986 [2:30:49<4:49:15,  2.38s/it]

training loss: 3.388960838317871


training:  34%|███▎      | 3694/10986 [2:30:51<4:39:16,  2.30s/it]

training loss: 3.312901735305786


training:  34%|███▎      | 3695/10986 [2:30:54<5:11:15,  2.56s/it]

training loss: 3.5106120109558105


training:  34%|███▎      | 3696/10986 [2:30:57<5:14:38,  2.59s/it]

training loss: 3.471299648284912


training:  34%|███▎      | 3697/10986 [2:30:59<5:09:20,  2.55s/it]

training loss: 3.3371005058288574


training:  34%|███▎      | 3698/10986 [2:31:01<4:54:37,  2.43s/it]

training loss: 3.356757640838623


training:  34%|███▎      | 3699/10986 [2:31:04<4:55:34,  2.43s/it]

training loss: 3.3628649711608887


training:  34%|███▎      | 3700/10986 [2:31:06<4:42:37,  2.33s/it]

training loss: 3.346820592880249
valid loss: 3.3560452461242676
perplexity: 28.675561904907227


training:  34%|███▎      | 3701/10986 [2:31:10<6:04:00,  3.00s/it]

training loss: 3.4097087383270264


training:  34%|███▎      | 3702/10986 [2:31:13<5:44:11,  2.84s/it]

training loss: 3.361175060272217


training:  34%|███▎      | 3703/10986 [2:31:15<5:18:42,  2.63s/it]

training loss: 3.605372428894043


training:  34%|███▎      | 3704/10986 [2:31:18<5:11:14,  2.56s/it]

training loss: 3.626549243927002


training:  34%|███▎      | 3705/10986 [2:31:20<4:54:38,  2.43s/it]

training loss: 3.465608835220337


training:  34%|███▎      | 3706/10986 [2:31:22<4:54:26,  2.43s/it]

training loss: 3.349813938140869


training:  34%|███▎      | 3707/10986 [2:31:24<4:43:50,  2.34s/it]

training loss: 3.4522879123687744


training:  34%|███▍      | 3708/10986 [2:31:27<4:46:21,  2.36s/it]

training loss: 3.444922685623169


training:  34%|███▍      | 3709/10986 [2:31:29<4:36:03,  2.28s/it]

training loss: 3.4139981269836426


training:  34%|███▍      | 3710/10986 [2:31:31<4:42:11,  2.33s/it]

training loss: 3.326798915863037


training:  34%|███▍      | 3711/10986 [2:31:33<4:34:12,  2.26s/it]

training loss: 3.3370308876037598


training:  34%|███▍      | 3712/10986 [2:31:36<4:40:01,  2.31s/it]

training loss: 3.54789137840271


training:  34%|███▍      | 3713/10986 [2:31:38<4:32:20,  2.25s/it]

training loss: 3.4319818019866943


training:  34%|███▍      | 3714/10986 [2:31:40<4:38:38,  2.30s/it]

training loss: 3.434040069580078


training:  34%|███▍      | 3715/10986 [2:31:42<4:32:18,  2.25s/it]

training loss: 3.5101850032806396


training:  34%|███▍      | 3716/10986 [2:31:45<4:37:54,  2.29s/it]

training loss: 3.4527525901794434


training:  34%|███▍      | 3717/10986 [2:31:47<4:30:38,  2.23s/it]

training loss: 3.497433662414551


training:  34%|███▍      | 3718/10986 [2:31:49<4:37:23,  2.29s/it]

training loss: 3.444573163986206


training:  34%|███▍      | 3719/10986 [2:31:51<4:30:57,  2.24s/it]

training loss: 3.4299588203430176


training:  34%|███▍      | 3720/10986 [2:31:54<4:37:19,  2.29s/it]

training loss: 3.4658915996551514
valid loss: 3.4170758724212646
perplexity: 30.48015594482422


training:  34%|███▍      | 3721/10986 [2:31:58<5:59:36,  2.97s/it]

training loss: 3.456209659576416


training:  34%|███▍      | 3722/10986 [2:32:00<5:30:42,  2.73s/it]

training loss: 3.3432540893554688


training:  34%|███▍      | 3723/10986 [2:32:03<5:19:38,  2.64s/it]

training loss: 3.4046146869659424


training:  34%|███▍      | 3724/10986 [2:32:05<5:00:39,  2.48s/it]

training loss: 3.4032931327819824


training:  34%|███▍      | 3725/10986 [2:32:07<4:57:35,  2.46s/it]

training loss: 3.6180200576782227


training:  34%|███▍      | 3726/10986 [2:32:09<4:43:40,  2.34s/it]

training loss: 3.347076416015625


training:  34%|███▍      | 3727/10986 [2:32:12<4:46:45,  2.37s/it]

training loss: 3.4423274993896484


training:  34%|███▍      | 3728/10986 [2:32:14<4:37:12,  2.29s/it]

training loss: 3.4968605041503906


training:  34%|███▍      | 3729/10986 [2:32:16<4:41:49,  2.33s/it]

training loss: 3.427985429763794


training:  34%|███▍      | 3730/10986 [2:32:19<4:34:00,  2.27s/it]

training loss: 3.457688808441162


training:  34%|███▍      | 3731/10986 [2:32:21<4:39:28,  2.31s/it]

training loss: 3.346450090408325


training:  34%|███▍      | 3732/10986 [2:32:23<4:33:51,  2.27s/it]

training loss: 3.521515369415283


training:  34%|███▍      | 3733/10986 [2:32:26<4:40:30,  2.32s/it]

training loss: 3.3985424041748047


training:  34%|███▍      | 3734/10986 [2:32:28<4:32:52,  2.26s/it]

training loss: 3.4263789653778076


training:  34%|███▍      | 3735/10986 [2:32:30<4:40:21,  2.32s/it]

training loss: 3.4885735511779785


training:  34%|███▍      | 3736/10986 [2:32:32<4:33:23,  2.26s/it]

training loss: 3.5603301525115967


training:  34%|███▍      | 3737/10986 [2:32:35<4:38:23,  2.30s/it]

training loss: 3.446840524673462


training:  34%|███▍      | 3738/10986 [2:32:37<4:31:42,  2.25s/it]

training loss: 3.519097328186035


training:  34%|███▍      | 3739/10986 [2:32:39<4:38:15,  2.30s/it]

training loss: 3.433462619781494


training:  34%|███▍      | 3740/10986 [2:32:41<4:31:27,  2.25s/it]

training loss: 3.5040009021759033
valid loss: 3.358765125274658
perplexity: 28.753662109375


training:  34%|███▍      | 3741/10986 [2:32:46<5:49:55,  2.90s/it]

training loss: 3.2944788932800293


training:  34%|███▍      | 3742/10986 [2:32:48<5:36:47,  2.79s/it]

training loss: 3.392409086227417


training:  34%|███▍      | 3743/10986 [2:32:50<5:12:44,  2.59s/it]

training loss: 3.5400784015655518


training:  34%|███▍      | 3744/10986 [2:32:53<5:06:56,  2.54s/it]

training loss: 3.4269087314605713


training:  34%|███▍      | 3745/10986 [2:32:55<4:52:41,  2.43s/it]

training loss: 3.4648735523223877


training:  34%|███▍      | 3746/10986 [2:32:57<4:54:22,  2.44s/it]

training loss: 3.387565851211548


training:  34%|███▍      | 3747/10986 [2:33:00<4:43:18,  2.35s/it]

training loss: 3.5781702995300293


training:  34%|███▍      | 3748/10986 [2:33:02<4:49:04,  2.40s/it]

training loss: 3.3662941455841064


training:  34%|███▍      | 3749/10986 [2:33:04<4:39:06,  2.31s/it]

training loss: 3.37009596824646


training:  34%|███▍      | 3750/10986 [2:33:07<4:43:03,  2.35s/it]

training loss: 3.5151727199554443


training:  34%|███▍      | 3751/10986 [2:33:09<4:35:29,  2.28s/it]

training loss: 3.5314507484436035


training:  34%|███▍      | 3752/10986 [2:33:11<4:42:11,  2.34s/it]

training loss: 3.4122416973114014


training:  34%|███▍      | 3753/10986 [2:33:13<4:33:40,  2.27s/it]

training loss: 3.445439100265503


training:  34%|███▍      | 3754/10986 [2:33:16<4:37:56,  2.31s/it]

training loss: 3.4717185497283936


training:  34%|███▍      | 3755/10986 [2:33:18<4:30:52,  2.25s/it]

training loss: 3.373831272125244


training:  34%|███▍      | 3756/10986 [2:33:21<4:54:36,  2.44s/it]

training loss: 3.5559682846069336


training:  34%|███▍      | 3757/10986 [2:33:24<5:17:24,  2.63s/it]

training loss: 3.3538262844085693


training:  34%|███▍      | 3758/10986 [2:33:26<5:09:20,  2.57s/it]

training loss: 3.2994208335876465


training:  34%|███▍      | 3759/10986 [2:33:28<4:52:43,  2.43s/it]

training loss: 3.386009931564331


training:  34%|███▍      | 3760/10986 [2:33:31<4:54:33,  2.45s/it]

training loss: 3.5722084045410156
valid loss: 3.494961738586426
perplexity: 32.94902801513672


training:  34%|███▍      | 3761/10986 [2:33:35<6:08:15,  3.06s/it]

training loss: 3.5871224403381348


training:  34%|███▍      | 3762/10986 [2:33:38<5:37:56,  2.81s/it]

training loss: 3.495530605316162


training:  34%|███▍      | 3763/10986 [2:33:40<5:25:31,  2.70s/it]

training loss: 3.381744861602783


training:  34%|███▍      | 3764/10986 [2:33:42<5:06:14,  2.54s/it]

training loss: 3.476551055908203


training:  34%|███▍      | 3765/10986 [2:33:45<5:03:05,  2.52s/it]

training loss: 3.4878079891204834


training:  34%|███▍      | 3766/10986 [2:33:47<4:50:35,  2.41s/it]

training loss: 3.3243629932403564


training:  34%|███▍      | 3767/10986 [2:33:49<4:52:19,  2.43s/it]

training loss: 3.5013186931610107


training:  34%|███▍      | 3768/10986 [2:33:51<4:41:26,  2.34s/it]

training loss: 3.294105291366577


training:  34%|███▍      | 3769/10986 [2:33:54<4:44:44,  2.37s/it]

training loss: 3.414727210998535


training:  34%|███▍      | 3770/10986 [2:33:56<4:36:36,  2.30s/it]

training loss: 3.383809804916382


training:  34%|███▍      | 3771/10986 [2:33:58<4:42:18,  2.35s/it]

training loss: 3.4186854362487793


training:  34%|███▍      | 3772/10986 [2:34:01<4:35:27,  2.29s/it]

training loss: 3.3565893173217773


training:  34%|███▍      | 3773/10986 [2:34:03<4:43:06,  2.35s/it]

training loss: 3.4708003997802734


training:  34%|███▍      | 3774/10986 [2:34:05<4:35:26,  2.29s/it]

training loss: 3.390934705734253


training:  34%|███▍      | 3775/10986 [2:34:08<4:41:31,  2.34s/it]

training loss: 3.4187681674957275


training:  34%|███▍      | 3776/10986 [2:34:10<4:33:32,  2.28s/it]

training loss: 3.489962100982666


training:  34%|███▍      | 3777/10986 [2:34:12<4:39:08,  2.32s/it]

training loss: 3.3964860439300537


training:  34%|███▍      | 3778/10986 [2:34:14<4:32:19,  2.27s/it]

training loss: 3.5133914947509766


training:  34%|███▍      | 3779/10986 [2:34:17<4:39:46,  2.33s/it]

training loss: 3.353109836578369


training:  34%|███▍      | 3780/10986 [2:34:19<4:32:47,  2.27s/it]

training loss: 3.427622079849243
valid loss: 3.508730411529541
perplexity: 33.40583038330078


training:  34%|███▍      | 3781/10986 [2:34:23<5:49:25,  2.91s/it]

training loss: 3.508084774017334


training:  34%|███▍      | 3782/10986 [2:34:26<5:36:34,  2.80s/it]

training loss: 3.352799892425537


training:  34%|███▍      | 3783/10986 [2:34:28<5:12:05,  2.60s/it]

training loss: 3.391158103942871


training:  34%|███▍      | 3784/10986 [2:34:31<5:06:14,  2.55s/it]

training loss: 3.4686787128448486


training:  34%|███▍      | 3785/10986 [2:34:33<4:50:18,  2.42s/it]

training loss: 3.4973537921905518


training:  34%|███▍      | 3786/10986 [2:34:35<4:49:40,  2.41s/it]

training loss: 3.397979974746704


training:  34%|███▍      | 3787/10986 [2:34:37<4:38:49,  2.32s/it]

training loss: 3.29150128364563


training:  34%|███▍      | 3788/10986 [2:34:40<4:44:20,  2.37s/it]

training loss: 3.4122562408447266


training:  34%|███▍      | 3789/10986 [2:34:42<4:34:57,  2.29s/it]

training loss: 3.4675683975219727


training:  34%|███▍      | 3790/10986 [2:34:44<4:39:34,  2.33s/it]

training loss: 3.3159942626953125


training:  35%|███▍      | 3791/10986 [2:34:46<4:31:38,  2.27s/it]

training loss: 3.4316000938415527


training:  35%|███▍      | 3792/10986 [2:34:49<4:39:14,  2.33s/it]

training loss: 3.44675612449646


training:  35%|███▍      | 3793/10986 [2:34:51<4:30:31,  2.26s/it]

training loss: 3.4649229049682617


training:  35%|███▍      | 3794/10986 [2:34:53<4:35:56,  2.30s/it]

training loss: 3.5595500469207764


training:  35%|███▍      | 3795/10986 [2:34:55<4:30:07,  2.25s/it]

training loss: 3.4189562797546387


training:  35%|███▍      | 3796/10986 [2:34:58<4:36:00,  2.30s/it]

training loss: 3.4329512119293213


training:  35%|███▍      | 3797/10986 [2:35:00<4:29:03,  2.25s/it]

training loss: 3.5314979553222656


training:  35%|███▍      | 3798/10986 [2:35:02<4:34:41,  2.29s/it]

training loss: 3.4780497550964355


training:  35%|███▍      | 3799/10986 [2:35:05<4:29:25,  2.25s/it]

training loss: 3.401822090148926


training:  35%|███▍      | 3800/10986 [2:35:07<4:34:49,  2.29s/it]

training loss: 3.4650180339813232
valid loss: 3.3507297039031982
perplexity: 28.523540496826172


training:  35%|███▍      | 3801/10986 [2:35:12<6:02:59,  3.03s/it]

training loss: 3.3082618713378906


training:  35%|███▍      | 3802/10986 [2:35:14<5:30:44,  2.76s/it]

training loss: 3.424330472946167


training:  35%|███▍      | 3803/10986 [2:35:16<5:19:53,  2.67s/it]

training loss: 3.409698009490967


training:  35%|███▍      | 3804/10986 [2:35:18<5:01:02,  2.52s/it]

training loss: 3.493980884552002


training:  35%|███▍      | 3805/10986 [2:35:21<4:58:33,  2.49s/it]

training loss: 3.420003652572632


training:  35%|███▍      | 3806/10986 [2:35:23<4:45:33,  2.39s/it]

training loss: 3.406355619430542


training:  35%|███▍      | 3807/10986 [2:35:25<4:47:44,  2.40s/it]

training loss: 3.3856823444366455


training:  35%|███▍      | 3808/10986 [2:35:28<4:37:25,  2.32s/it]

training loss: 3.445812463760376


training:  35%|███▍      | 3809/10986 [2:35:30<4:41:44,  2.36s/it]

training loss: 3.4304802417755127


training:  35%|███▍      | 3810/10986 [2:35:32<4:33:25,  2.29s/it]

training loss: 3.3576934337615967


training:  35%|███▍      | 3811/10986 [2:35:35<4:39:09,  2.33s/it]

training loss: 3.397273063659668


training:  35%|███▍      | 3812/10986 [2:35:37<4:30:24,  2.26s/it]

training loss: 3.5130152702331543


training:  35%|███▍      | 3813/10986 [2:35:39<4:35:53,  2.31s/it]

training loss: 3.4386000633239746


training:  35%|███▍      | 3814/10986 [2:35:41<4:28:46,  2.25s/it]

training loss: 3.3841657638549805


training:  35%|███▍      | 3815/10986 [2:35:44<4:35:31,  2.31s/it]

training loss: 3.421041250228882


training:  35%|███▍      | 3816/10986 [2:35:46<4:29:34,  2.26s/it]

training loss: 3.416621685028076


training:  35%|███▍      | 3817/10986 [2:35:50<5:54:45,  2.97s/it]

training loss: 3.372459888458252


training:  35%|███▍      | 3818/10986 [2:35:53<5:26:54,  2.74s/it]

training loss: 3.440821409225464


training:  35%|███▍      | 3819/10986 [2:35:55<5:15:29,  2.64s/it]

training loss: 3.4510421752929688


training:  35%|███▍      | 3820/10986 [2:35:57<4:56:25,  2.48s/it]

training loss: 3.37509822845459
valid loss: 3.4112296104431152
perplexity: 30.302478790283203


training:  35%|███▍      | 3821/10986 [2:36:02<6:07:41,  3.08s/it]

training loss: 3.4632697105407715


training:  35%|███▍      | 3822/10986 [2:36:04<5:49:10,  2.92s/it]

training loss: 3.4026689529418945


training:  35%|███▍      | 3823/10986 [2:36:06<5:21:45,  2.70s/it]

training loss: 3.4869790077209473


training:  35%|███▍      | 3824/10986 [2:36:09<5:12:22,  2.62s/it]

training loss: 3.4388277530670166


training:  35%|███▍      | 3825/10986 [2:36:11<4:55:59,  2.48s/it]

training loss: 3.372816801071167


training:  35%|███▍      | 3826/10986 [2:36:13<4:54:40,  2.47s/it]

training loss: 3.5043938159942627


training:  35%|███▍      | 3827/10986 [2:36:15<4:41:53,  2.36s/it]

training loss: 3.3283915519714355


training:  35%|███▍      | 3828/10986 [2:36:18<4:43:02,  2.37s/it]

training loss: 3.5406432151794434


training:  35%|███▍      | 3829/10986 [2:36:20<4:33:03,  2.29s/it]

training loss: 3.6024868488311768


training:  35%|███▍      | 3830/10986 [2:36:22<4:37:50,  2.33s/it]

training loss: 3.3674509525299072


training:  35%|███▍      | 3831/10986 [2:36:25<4:31:49,  2.28s/it]

training loss: 3.58650279045105


training:  35%|███▍      | 3832/10986 [2:36:27<4:37:41,  2.33s/it]

training loss: 3.385380506515503


training:  35%|███▍      | 3833/10986 [2:36:29<4:29:52,  2.26s/it]

training loss: 3.3739752769470215


training:  35%|███▍      | 3834/10986 [2:36:32<4:35:16,  2.31s/it]

training loss: 3.42598295211792


training:  35%|███▍      | 3835/10986 [2:36:34<4:28:15,  2.25s/it]

training loss: 3.602879285812378


training:  35%|███▍      | 3836/10986 [2:36:36<4:36:25,  2.32s/it]

training loss: 3.3710460662841797


training:  35%|███▍      | 3837/10986 [2:36:38<4:31:30,  2.28s/it]

training loss: 3.5089356899261475


training:  35%|███▍      | 3838/10986 [2:36:41<4:36:36,  2.32s/it]

training loss: 3.4675068855285645


training:  35%|███▍      | 3839/10986 [2:36:43<4:28:44,  2.26s/it]

training loss: 3.433823585510254


training:  35%|███▍      | 3840/10986 [2:36:45<4:36:51,  2.32s/it]

training loss: 3.4939651489257812
valid loss: 3.436692237854004
perplexity: 31.083967208862305


training:  35%|███▍      | 3841/10986 [2:36:50<5:53:06,  2.97s/it]

training loss: 3.6504950523376465


training:  35%|███▍      | 3842/10986 [2:36:52<5:25:43,  2.74s/it]

training loss: 3.4351518154144287


training:  35%|███▍      | 3843/10986 [2:36:54<5:14:48,  2.64s/it]

training loss: 3.365236282348633


training:  35%|███▍      | 3844/10986 [2:36:57<4:58:05,  2.50s/it]

training loss: 3.52889347076416


training:  35%|███▍      | 3845/10986 [2:36:59<4:56:41,  2.49s/it]

training loss: 3.4604005813598633


training:  35%|███▌      | 3846/10986 [2:37:01<4:43:19,  2.38s/it]

training loss: 3.4303507804870605


training:  35%|███▌      | 3847/10986 [2:37:04<4:45:04,  2.40s/it]

training loss: 3.4175589084625244


training:  35%|███▌      | 3848/10986 [2:37:06<4:35:11,  2.31s/it]

training loss: 3.5332834720611572


training:  35%|███▌      | 3849/10986 [2:37:08<4:41:14,  2.36s/it]

training loss: 3.2916250228881836


training:  35%|███▌      | 3850/10986 [2:37:10<4:32:02,  2.29s/it]

training loss: 3.406970262527466


training:  35%|███▌      | 3851/10986 [2:37:13<4:36:43,  2.33s/it]

training loss: 3.562285900115967


training:  35%|███▌      | 3852/10986 [2:37:15<4:30:28,  2.27s/it]

training loss: 3.4731268882751465


training:  35%|███▌      | 3853/10986 [2:37:17<4:36:47,  2.33s/it]

training loss: 3.393578052520752


training:  35%|███▌      | 3854/10986 [2:37:19<4:29:03,  2.26s/it]

training loss: 3.532409429550171


training:  35%|███▌      | 3855/10986 [2:37:22<4:33:26,  2.30s/it]

training loss: 3.4137144088745117


training:  35%|███▌      | 3856/10986 [2:37:24<4:27:36,  2.25s/it]

training loss: 3.3615670204162598


training:  35%|███▌      | 3857/10986 [2:37:26<4:34:57,  2.31s/it]

training loss: 3.4547693729400635


training:  35%|███▌      | 3858/10986 [2:37:29<4:30:21,  2.28s/it]

training loss: 3.4892940521240234


training:  35%|███▌      | 3859/10986 [2:37:31<4:36:40,  2.33s/it]

training loss: 3.3945653438568115


training:  35%|███▌      | 3860/10986 [2:37:33<4:30:37,  2.28s/it]

training loss: 3.4273681640625
valid loss: 3.4408841133117676
perplexity: 31.214542388916016


training:  35%|███▌      | 3861/10986 [2:37:38<5:51:14,  2.96s/it]

training loss: 3.4929730892181396


training:  35%|███▌      | 3862/10986 [2:37:40<5:35:50,  2.83s/it]

training loss: 3.590895175933838


training:  35%|███▌      | 3863/10986 [2:37:42<5:11:53,  2.63s/it]

training loss: 3.370079517364502


training:  35%|███▌      | 3864/10986 [2:37:45<5:08:13,  2.60s/it]

training loss: 3.356961727142334


training:  35%|███▌      | 3865/10986 [2:37:47<4:51:47,  2.46s/it]

training loss: 3.5410056114196777


training:  35%|███▌      | 3866/10986 [2:37:50<4:50:17,  2.45s/it]

training loss: 3.4599437713623047


training:  35%|███▌      | 3867/10986 [2:37:52<4:38:49,  2.35s/it]

training loss: 3.4372594356536865


training:  35%|███▌      | 3868/10986 [2:37:54<4:43:05,  2.39s/it]

training loss: 3.3284707069396973


training:  35%|███▌      | 3869/10986 [2:37:56<4:33:24,  2.30s/it]

training loss: 3.581448793411255


training:  35%|███▌      | 3870/10986 [2:37:59<4:37:00,  2.34s/it]

training loss: 3.511483669281006


training:  35%|███▌      | 3871/10986 [2:38:01<4:30:30,  2.28s/it]

training loss: 3.339217185974121


training:  35%|███▌      | 3872/10986 [2:38:03<4:35:24,  2.32s/it]

training loss: 3.502157688140869


training:  35%|███▌      | 3873/10986 [2:38:05<4:27:52,  2.26s/it]

training loss: 3.4293835163116455


training:  35%|███▌      | 3874/10986 [2:38:08<4:35:35,  2.32s/it]

training loss: 3.370483636856079


training:  35%|███▌      | 3875/10986 [2:38:10<4:28:48,  2.27s/it]

training loss: 3.471285820007324


training:  35%|███▌      | 3876/10986 [2:38:12<4:33:56,  2.31s/it]

training loss: 3.425222396850586


training:  35%|███▌      | 3877/10986 [2:38:15<4:39:25,  2.36s/it]

training loss: 3.4865968227386475


training:  35%|███▌      | 3878/10986 [2:38:18<5:20:10,  2.70s/it]

training loss: 3.4203474521636963


training:  35%|███▌      | 3879/10986 [2:38:20<4:59:08,  2.53s/it]

training loss: 3.524531364440918


training:  35%|███▌      | 3880/10986 [2:38:23<4:55:13,  2.49s/it]

training loss: 3.4476163387298584
valid loss: 3.463020086288452
perplexity: 31.913211822509766


training:  35%|███▌      | 3881/10986 [2:38:27<6:03:28,  3.07s/it]

training loss: 3.460991859436035


training:  35%|███▌      | 3882/10986 [2:38:29<5:31:33,  2.80s/it]

training loss: 3.307849407196045


training:  35%|███▌      | 3883/10986 [2:38:32<5:18:10,  2.69s/it]

training loss: 3.3741369247436523


training:  35%|███▌      | 3884/10986 [2:38:34<4:58:18,  2.52s/it]

training loss: 3.500659465789795


training:  35%|███▌      | 3885/10986 [2:38:36<4:54:43,  2.49s/it]

training loss: 3.2995765209198


training:  35%|███▌      | 3886/10986 [2:38:39<4:40:55,  2.37s/it]

training loss: 3.4283721446990967


training:  35%|███▌      | 3887/10986 [2:38:41<4:43:11,  2.39s/it]

training loss: 3.5603995323181152


training:  35%|███▌      | 3888/10986 [2:38:43<4:33:13,  2.31s/it]

training loss: 3.3959968090057373


training:  35%|███▌      | 3889/10986 [2:38:46<4:38:41,  2.36s/it]

training loss: 3.4234137535095215


training:  35%|███▌      | 3890/10986 [2:38:48<4:31:05,  2.29s/it]

training loss: 3.3987410068511963


training:  35%|███▌      | 3891/10986 [2:38:50<4:35:18,  2.33s/it]

training loss: 3.408569574356079


training:  35%|███▌      | 3892/10986 [2:38:52<4:28:43,  2.27s/it]

training loss: 3.5578322410583496


training:  35%|███▌      | 3893/10986 [2:38:55<4:36:20,  2.34s/it]

training loss: 3.402522087097168


training:  35%|███▌      | 3894/10986 [2:38:57<4:29:35,  2.28s/it]

training loss: 3.3794302940368652


training:  35%|███▌      | 3895/10986 [2:38:59<4:34:47,  2.33s/it]

training loss: 3.4905338287353516


training:  35%|███▌      | 3896/10986 [2:39:01<4:27:21,  2.26s/it]

training loss: 3.437981605529785


training:  35%|███▌      | 3897/10986 [2:39:04<4:31:31,  2.30s/it]

training loss: 3.4364266395568848


training:  35%|███▌      | 3898/10986 [2:39:06<4:24:57,  2.24s/it]

training loss: 3.371480941772461


training:  35%|███▌      | 3899/10986 [2:39:08<4:31:34,  2.30s/it]

training loss: 3.475815773010254


training:  35%|███▌      | 3900/10986 [2:39:10<4:24:57,  2.24s/it]

training loss: 3.446470022201538
valid loss: 3.4194955825805664
perplexity: 30.553998947143555


training:  36%|███▌      | 3901/10986 [2:39:15<5:50:43,  2.97s/it]

training loss: 3.4574341773986816


training:  36%|███▌      | 3902/10986 [2:39:18<5:33:35,  2.83s/it]

training loss: 3.5196549892425537


training:  36%|███▌      | 3903/10986 [2:39:20<5:08:35,  2.61s/it]

training loss: 3.5052192211151123


training:  36%|███▌      | 3904/10986 [2:39:22<5:05:25,  2.59s/it]

training loss: 3.4982664585113525


training:  36%|███▌      | 3905/10986 [2:39:24<4:48:58,  2.45s/it]

training loss: 3.396747589111328


training:  36%|███▌      | 3906/10986 [2:39:27<4:47:44,  2.44s/it]

training loss: 3.490149974822998


training:  36%|███▌      | 3907/10986 [2:39:29<4:36:44,  2.35s/it]

training loss: 3.487199068069458


training:  36%|███▌      | 3908/10986 [2:39:31<4:40:01,  2.37s/it]

training loss: 3.498028039932251


training:  36%|███▌      | 3909/10986 [2:39:34<4:31:11,  2.30s/it]

training loss: 3.452239513397217


training:  36%|███▌      | 3910/10986 [2:39:36<4:36:57,  2.35s/it]

training loss: 3.346310615539551


training:  36%|███▌      | 3911/10986 [2:39:38<4:29:44,  2.29s/it]

training loss: 3.4785008430480957


training:  36%|███▌      | 3912/10986 [2:39:41<4:36:20,  2.34s/it]

training loss: 3.507456064224243


training:  36%|███▌      | 3913/10986 [2:39:43<4:28:40,  2.28s/it]

training loss: 3.3577706813812256


training:  36%|███▌      | 3914/10986 [2:39:45<4:34:08,  2.33s/it]

training loss: 3.43353271484375


training:  36%|███▌      | 3915/10986 [2:39:47<4:28:13,  2.28s/it]

training loss: 3.5150375366210938


training:  36%|███▌      | 3916/10986 [2:39:50<4:42:27,  2.40s/it]

training loss: 3.401723623275757


training:  36%|███▌      | 3917/10986 [2:39:52<4:33:02,  2.32s/it]

training loss: 3.3511505126953125


training:  36%|███▌      | 3918/10986 [2:39:55<4:37:33,  2.36s/it]

training loss: 3.4590530395507812


training:  36%|███▌      | 3919/10986 [2:39:57<4:30:09,  2.29s/it]

training loss: 3.3600339889526367


training:  36%|███▌      | 3920/10986 [2:39:59<4:36:38,  2.35s/it]

training loss: 3.4977669715881348
valid loss: 3.4822933673858643
perplexity: 32.53424835205078


training:  36%|███▌      | 3921/10986 [2:40:04<5:52:31,  2.99s/it]

training loss: 3.3342432975769043


training:  36%|███▌      | 3922/10986 [2:40:06<5:23:49,  2.75s/it]

training loss: 3.446324586868286


training:  36%|███▌      | 3923/10986 [2:40:08<5:13:42,  2.66s/it]

training loss: 3.3334157466888428


training:  36%|███▌      | 3924/10986 [2:40:10<4:53:44,  2.50s/it]

training loss: 3.388718605041504


training:  36%|███▌      | 3925/10986 [2:40:13<4:55:50,  2.51s/it]

training loss: 3.4424049854278564


training:  36%|███▌      | 3926/10986 [2:40:15<4:41:53,  2.40s/it]

training loss: 3.2923405170440674


training:  36%|███▌      | 3927/10986 [2:40:18<4:43:54,  2.41s/it]

training loss: 3.3829121589660645


training:  36%|███▌      | 3928/10986 [2:40:20<4:34:55,  2.34s/it]

training loss: 3.320430278778076


training:  36%|███▌      | 3929/10986 [2:40:22<4:39:40,  2.38s/it]

training loss: 3.3996427059173584


training:  36%|███▌      | 3930/10986 [2:40:24<4:30:39,  2.30s/it]

training loss: 3.558821439743042


training:  36%|███▌      | 3931/10986 [2:40:27<4:35:31,  2.34s/it]

training loss: 3.4785540103912354


training:  36%|███▌      | 3932/10986 [2:40:29<4:27:08,  2.27s/it]

training loss: 3.3479366302490234


training:  36%|███▌      | 3933/10986 [2:40:31<4:33:05,  2.32s/it]

training loss: 3.5105133056640625


training:  36%|███▌      | 3934/10986 [2:40:33<4:26:13,  2.27s/it]

training loss: 3.398541212081909


training:  36%|███▌      | 3935/10986 [2:40:36<4:32:15,  2.32s/it]

training loss: 3.4717040061950684


training:  36%|███▌      | 3936/10986 [2:40:38<4:25:39,  2.26s/it]

training loss: 3.465512275695801


training:  36%|███▌      | 3937/10986 [2:40:41<4:39:24,  2.38s/it]

training loss: 3.4867966175079346


training:  36%|███▌      | 3938/10986 [2:40:44<5:06:39,  2.61s/it]

training loss: 3.379591941833496


training:  36%|███▌      | 3939/10986 [2:40:46<5:06:15,  2.61s/it]

training loss: 3.5103189945220947


training:  36%|███▌      | 3940/10986 [2:40:49<4:49:48,  2.47s/it]

training loss: 3.3420281410217285
valid loss: 3.4269516468048096
perplexity: 30.782663345336914


training:  36%|███▌      | 3941/10986 [2:40:53<6:00:41,  3.07s/it]

training loss: 3.446024179458618


training:  36%|███▌      | 3942/10986 [2:40:56<5:42:58,  2.92s/it]

training loss: 3.5996315479278564


training:  36%|███▌      | 3943/10986 [2:40:58<5:15:55,  2.69s/it]

training loss: 3.386277914047241


training:  36%|███▌      | 3944/10986 [2:41:00<5:07:56,  2.62s/it]

training loss: 3.4256420135498047


training:  36%|███▌      | 3945/10986 [2:41:02<4:50:24,  2.47s/it]

training loss: 3.5239548683166504


training:  36%|███▌      | 3946/10986 [2:41:05<4:51:38,  2.49s/it]

training loss: 3.4425158500671387


training:  36%|███▌      | 3947/10986 [2:41:07<4:38:19,  2.37s/it]

training loss: 3.528887987136841


training:  36%|███▌      | 3948/10986 [2:41:10<4:43:42,  2.42s/it]

training loss: 3.4963412284851074


training:  36%|███▌      | 3949/10986 [2:41:12<4:33:11,  2.33s/it]

training loss: 3.3543217182159424


training:  36%|███▌      | 3950/10986 [2:41:14<4:39:10,  2.38s/it]

training loss: 3.38244891166687


training:  36%|███▌      | 3951/10986 [2:41:16<4:29:34,  2.30s/it]

training loss: 3.5586276054382324


training:  36%|███▌      | 3952/10986 [2:41:19<4:36:53,  2.36s/it]

training loss: 3.457918405532837


training:  36%|███▌      | 3953/10986 [2:41:21<4:29:23,  2.30s/it]

training loss: 3.4680330753326416


training:  36%|███▌      | 3954/10986 [2:41:23<4:36:01,  2.36s/it]

training loss: 3.392333745956421


training:  36%|███▌      | 3955/10986 [2:41:26<4:28:56,  2.30s/it]

training loss: 3.560549736022949


training:  36%|███▌      | 3956/10986 [2:41:28<4:37:01,  2.36s/it]

training loss: 3.424642562866211


training:  36%|███▌      | 3957/10986 [2:41:30<4:29:20,  2.30s/it]

training loss: 3.328691005706787


training:  36%|███▌      | 3958/10986 [2:41:33<4:36:10,  2.36s/it]

training loss: 3.559027910232544


training:  36%|███▌      | 3959/10986 [2:41:35<4:27:27,  2.28s/it]

training loss: 3.365248203277588


training:  36%|███▌      | 3960/10986 [2:41:37<4:36:02,  2.36s/it]

training loss: 3.4027185440063477
valid loss: 3.3568809032440186
perplexity: 28.699533462524414


training:  36%|███▌      | 3961/10986 [2:41:42<5:52:30,  3.01s/it]

training loss: 3.296008348464966


training:  36%|███▌      | 3962/10986 [2:41:44<5:24:53,  2.78s/it]

training loss: 3.4552929401397705


training:  36%|███▌      | 3963/10986 [2:41:47<5:16:02,  2.70s/it]

training loss: 3.4544272422790527


training:  36%|███▌      | 3964/10986 [2:41:49<4:58:25,  2.55s/it]

training loss: 3.5243678092956543


training:  36%|███▌      | 3965/10986 [2:41:51<4:57:54,  2.55s/it]

training loss: 3.4403786659240723


training:  36%|███▌      | 3966/10986 [2:41:53<4:42:32,  2.41s/it]

training loss: 3.376392364501953


training:  36%|███▌      | 3967/10986 [2:41:56<4:46:40,  2.45s/it]

training loss: 3.2958805561065674


training:  36%|███▌      | 3968/10986 [2:41:58<4:35:23,  2.35s/it]

training loss: 3.5527048110961914


training:  36%|███▌      | 3969/10986 [2:42:01<4:41:13,  2.40s/it]

training loss: 3.340841770172119


training:  36%|███▌      | 3970/10986 [2:42:03<4:31:59,  2.33s/it]

training loss: 3.3203766345977783


training:  36%|███▌      | 3971/10986 [2:42:05<4:38:02,  2.38s/it]

training loss: 3.5649781227111816


training:  36%|███▌      | 3972/10986 [2:42:07<4:29:21,  2.30s/it]

training loss: 3.469102621078491


training:  36%|███▌      | 3973/10986 [2:42:10<4:34:33,  2.35s/it]

training loss: 3.3772687911987305


training:  36%|███▌      | 3974/10986 [2:42:12<4:26:20,  2.28s/it]

training loss: 3.4580185413360596


training:  36%|███▌      | 3975/10986 [2:42:14<4:33:03,  2.34s/it]

training loss: 3.3669991493225098


training:  36%|███▌      | 3976/10986 [2:42:17<4:25:20,  2.27s/it]

training loss: 3.4358553886413574


training:  36%|███▌      | 3977/10986 [2:42:19<4:32:47,  2.34s/it]

training loss: 3.3957319259643555


training:  36%|███▌      | 3978/10986 [2:42:21<4:25:42,  2.27s/it]

training loss: 3.416034698486328


training:  36%|███▌      | 3979/10986 [2:42:24<4:31:54,  2.33s/it]

training loss: 3.4442708492279053


training:  36%|███▌      | 3980/10986 [2:42:26<4:25:53,  2.28s/it]

training loss: 3.3304293155670166
valid loss: 3.4149270057678223
perplexity: 30.41472816467285


training:  36%|███▌      | 3981/10986 [2:42:30<5:41:08,  2.92s/it]

training loss: 3.4449915885925293


training:  36%|███▌      | 3982/10986 [2:42:33<5:27:44,  2.81s/it]

training loss: 3.436075210571289


training:  36%|███▋      | 3983/10986 [2:42:35<5:05:46,  2.62s/it]

training loss: 3.3375747203826904


training:  36%|███▋      | 3984/10986 [2:42:37<5:00:10,  2.57s/it]

training loss: 3.4899563789367676


training:  36%|███▋      | 3985/10986 [2:42:40<4:44:07,  2.44s/it]

training loss: 3.373122215270996


training:  36%|███▋      | 3986/10986 [2:42:42<4:46:46,  2.46s/it]

training loss: 3.4092836380004883


training:  36%|███▋      | 3987/10986 [2:42:44<4:35:31,  2.36s/it]

training loss: 3.2677841186523438


training:  36%|███▋      | 3988/10986 [2:42:47<4:37:43,  2.38s/it]

training loss: 3.398686647415161


training:  36%|███▋      | 3989/10986 [2:42:49<4:28:33,  2.30s/it]

training loss: 3.4185972213745117


training:  36%|███▋      | 3990/10986 [2:42:51<4:33:59,  2.35s/it]

training loss: 3.40328049659729


training:  36%|███▋      | 3991/10986 [2:42:53<4:26:51,  2.29s/it]

training loss: 3.3467001914978027


training:  36%|███▋      | 3992/10986 [2:42:56<4:33:20,  2.34s/it]

training loss: 3.2979564666748047


training:  36%|███▋      | 3993/10986 [2:42:58<4:27:27,  2.29s/it]

training loss: 3.3577680587768555


training:  36%|███▋      | 3994/10986 [2:43:00<4:32:03,  2.33s/it]

training loss: 3.329059362411499


training:  36%|███▋      | 3995/10986 [2:43:03<4:24:45,  2.27s/it]

training loss: 3.414876699447632


training:  36%|███▋      | 3996/10986 [2:43:05<4:30:51,  2.32s/it]

training loss: 3.484816074371338


training:  36%|███▋      | 3997/10986 [2:43:07<4:25:34,  2.28s/it]

training loss: 3.401686668395996


training:  36%|███▋      | 3998/10986 [2:43:11<5:14:37,  2.70s/it]

training loss: 3.506186008453369


training:  36%|███▋      | 3999/10986 [2:43:13<4:56:04,  2.54s/it]

training loss: 3.4595282077789307


training:  36%|███▋      | 4000/10986 [2:43:16<4:53:15,  2.52s/it]

training loss: 3.55818772315979
valid loss: 3.651463270187378
perplexity: 38.531005859375


training:  36%|███▋      | 4001/10986 [2:43:20<6:11:46,  3.19s/it]

training loss: 3.5587992668151855


training:  36%|███▋      | 4002/10986 [2:43:23<5:39:40,  2.92s/it]

training loss: 3.4038915634155273


training:  36%|███▋      | 4003/10986 [2:43:25<5:24:30,  2.79s/it]

training loss: 3.4568042755126953


training:  36%|███▋      | 4004/10986 [2:43:27<5:02:10,  2.60s/it]

training loss: 3.3668386936187744


training:  36%|███▋      | 4005/10986 [2:43:30<4:59:24,  2.57s/it]

training loss: 3.383669137954712


training:  36%|███▋      | 4006/10986 [2:43:32<4:43:56,  2.44s/it]

training loss: 3.50101637840271


training:  36%|███▋      | 4007/10986 [2:43:34<4:44:22,  2.44s/it]

training loss: 3.3414604663848877


training:  36%|███▋      | 4008/10986 [2:43:36<4:34:11,  2.36s/it]

training loss: 3.503211259841919


training:  36%|███▋      | 4009/10986 [2:43:39<4:37:18,  2.38s/it]

training loss: 3.4725003242492676


training:  37%|███▋      | 4010/10986 [2:43:41<4:27:42,  2.30s/it]

training loss: 3.443432092666626


training:  37%|███▋      | 4011/10986 [2:43:43<4:32:59,  2.35s/it]

training loss: 3.3956139087677


training:  37%|███▋      | 4012/10986 [2:43:46<4:40:46,  2.42s/it]

training loss: 3.353686809539795


training:  37%|███▋      | 4013/10986 [2:43:48<4:41:39,  2.42s/it]

training loss: 3.438589572906494


training:  37%|███▋      | 4014/10986 [2:43:51<4:30:24,  2.33s/it]

training loss: 3.5310471057891846


training:  37%|███▋      | 4015/10986 [2:43:53<4:34:04,  2.36s/it]

training loss: 3.4660651683807373


training:  37%|███▋      | 4016/10986 [2:43:55<4:26:44,  2.30s/it]

training loss: 3.5166211128234863


training:  37%|███▋      | 4017/10986 [2:43:58<4:32:44,  2.35s/it]

training loss: 3.3782691955566406


training:  37%|███▋      | 4018/10986 [2:44:00<4:25:18,  2.28s/it]

training loss: 3.4440152645111084


training:  37%|███▋      | 4019/10986 [2:44:02<4:30:29,  2.33s/it]

training loss: 3.521836042404175


training:  37%|███▋      | 4020/10986 [2:44:04<4:22:54,  2.26s/it]

training loss: 3.4129059314727783
valid loss: 3.4353153705596924
perplexity: 31.04119873046875


training:  37%|███▋      | 4021/10986 [2:44:09<5:40:14,  2.93s/it]

training loss: 3.4583473205566406


training:  37%|███▋      | 4022/10986 [2:44:11<5:25:49,  2.81s/it]

training loss: 3.479412078857422


training:  37%|███▋      | 4023/10986 [2:44:13<5:03:18,  2.61s/it]

training loss: 3.519732713699341


training:  37%|███▋      | 4024/10986 [2:44:16<4:56:26,  2.55s/it]

training loss: 3.359712600708008


training:  37%|███▋      | 4025/10986 [2:44:18<4:40:58,  2.42s/it]

training loss: 3.442861318588257


training:  37%|███▋      | 4026/10986 [2:44:20<4:42:19,  2.43s/it]

training loss: 3.6116816997528076


training:  37%|███▋      | 4027/10986 [2:44:23<4:31:36,  2.34s/it]

training loss: 3.5220303535461426


training:  37%|███▋      | 4028/10986 [2:44:25<4:35:20,  2.37s/it]

training loss: 3.520759344100952


training:  37%|███▋      | 4029/10986 [2:44:27<4:25:53,  2.29s/it]

training loss: 3.3320999145507812


training:  37%|███▋      | 4030/10986 [2:44:30<4:31:33,  2.34s/it]

training loss: 3.31520938873291


training:  37%|███▋      | 4031/10986 [2:44:32<4:24:00,  2.28s/it]

training loss: 3.464144706726074


training:  37%|███▋      | 4032/10986 [2:44:34<4:29:44,  2.33s/it]

training loss: 3.4827213287353516


training:  37%|███▋      | 4033/10986 [2:44:36<4:22:56,  2.27s/it]

training loss: 3.4186432361602783


training:  37%|███▋      | 4034/10986 [2:44:39<4:29:12,  2.32s/it]

training loss: 3.43394136428833


training:  37%|███▋      | 4035/10986 [2:44:41<4:23:21,  2.27s/it]

training loss: 3.4445111751556396


training:  37%|███▋      | 4036/10986 [2:44:43<4:31:16,  2.34s/it]

training loss: 3.2706222534179688


training:  37%|███▋      | 4037/10986 [2:44:46<4:24:27,  2.28s/it]

training loss: 3.368870735168457


training:  37%|███▋      | 4038/10986 [2:44:48<4:30:21,  2.33s/it]

training loss: 3.445120334625244


training:  37%|███▋      | 4039/10986 [2:44:50<4:23:22,  2.27s/it]

training loss: 3.341840982437134


training:  37%|███▋      | 4040/10986 [2:44:53<4:30:21,  2.34s/it]

training loss: 3.38447904586792
valid loss: 3.461580514907837
perplexity: 31.86730194091797


training:  37%|███▋      | 4041/10986 [2:44:57<5:43:38,  2.97s/it]

training loss: 3.3875651359558105


training:  37%|███▋      | 4042/10986 [2:44:59<5:16:33,  2.74s/it]

training loss: 3.392542600631714


training:  37%|███▋      | 4043/10986 [2:45:02<5:06:11,  2.65s/it]

training loss: 3.4321823120117188


training:  37%|███▋      | 4044/10986 [2:45:04<4:46:37,  2.48s/it]

training loss: 3.3017876148223877


training:  37%|███▋      | 4045/10986 [2:45:06<4:45:53,  2.47s/it]

training loss: 3.5607171058654785


training:  37%|███▋      | 4046/10986 [2:45:08<4:34:37,  2.37s/it]

training loss: 3.3389086723327637


training:  37%|███▋      | 4047/10986 [2:45:11<4:39:03,  2.41s/it]

training loss: 3.461345672607422


training:  37%|███▋      | 4048/10986 [2:45:13<4:28:28,  2.32s/it]

training loss: 3.396294355392456


training:  37%|███▋      | 4049/10986 [2:45:15<4:33:31,  2.37s/it]

training loss: 3.344359874725342


training:  37%|███▋      | 4050/10986 [2:45:18<4:26:09,  2.30s/it]

training loss: 3.4034879207611084


training:  37%|███▋      | 4051/10986 [2:45:20<4:32:22,  2.36s/it]

training loss: 3.5165116786956787


training:  37%|███▋      | 4052/10986 [2:45:22<4:25:19,  2.30s/it]

training loss: 3.377254009246826


training:  37%|███▋      | 4053/10986 [2:45:25<4:30:24,  2.34s/it]

training loss: 3.412177085876465


training:  37%|███▋      | 4054/10986 [2:45:27<4:22:51,  2.28s/it]

training loss: 3.403432846069336


training:  37%|███▋      | 4055/10986 [2:45:29<4:29:27,  2.33s/it]

training loss: 3.4651498794555664


training:  37%|███▋      | 4056/10986 [2:45:31<4:22:11,  2.27s/it]

training loss: 3.4299771785736084


training:  37%|███▋      | 4057/10986 [2:45:35<4:53:42,  2.54s/it]

training loss: 3.4693188667297363


training:  37%|███▋      | 4058/10986 [2:45:37<4:55:51,  2.56s/it]

training loss: 3.4307589530944824


training:  37%|███▋      | 4059/10986 [2:45:40<4:53:31,  2.54s/it]

training loss: 3.438378095626831


training:  37%|███▋      | 4060/10986 [2:45:42<4:38:52,  2.42s/it]

training loss: 3.324526309967041
valid loss: 3.3686957359313965
perplexity: 29.040626525878906


training:  37%|███▋      | 4061/10986 [2:45:46<5:49:17,  3.03s/it]

training loss: 3.4357242584228516


training:  37%|███▋      | 4062/10986 [2:45:49<5:34:12,  2.90s/it]

training loss: 3.3907995223999023


training:  37%|███▋      | 4063/10986 [2:45:51<5:09:01,  2.68s/it]

training loss: 3.568007469177246


training:  37%|███▋      | 4064/10986 [2:45:54<5:01:26,  2.61s/it]

training loss: 3.4738645553588867


training:  37%|███▋      | 4065/10986 [2:45:56<4:46:24,  2.48s/it]

training loss: 3.3301749229431152


training:  37%|███▋      | 4066/10986 [2:45:58<4:48:47,  2.50s/it]

training loss: 3.515902519226074


training:  37%|███▋      | 4067/10986 [2:46:00<4:35:57,  2.39s/it]

training loss: 3.462977647781372


training:  37%|███▋      | 4068/10986 [2:46:03<4:39:07,  2.42s/it]

training loss: 3.4793343544006348


training:  37%|███▋      | 4069/10986 [2:46:05<4:29:27,  2.34s/it]

training loss: 3.6219635009765625


training:  37%|███▋      | 4070/10986 [2:46:07<4:34:54,  2.39s/it]

training loss: 3.4257280826568604


training:  37%|███▋      | 4071/10986 [2:46:10<4:27:59,  2.33s/it]

training loss: 3.502384662628174


training:  37%|███▋      | 4072/10986 [2:46:12<4:34:13,  2.38s/it]

training loss: 3.3866467475891113


training:  37%|███▋      | 4073/10986 [2:46:14<4:25:28,  2.30s/it]

training loss: 3.3911335468292236


training:  37%|███▋      | 4074/10986 [2:46:17<4:31:48,  2.36s/it]

training loss: 3.509219169616699


training:  37%|███▋      | 4075/10986 [2:46:19<4:25:03,  2.30s/it]

training loss: 3.442345380783081


training:  37%|███▋      | 4076/10986 [2:46:21<4:31:09,  2.35s/it]

training loss: 3.5116143226623535


training:  37%|███▋      | 4077/10986 [2:46:24<4:22:53,  2.28s/it]

training loss: 3.384774684906006


training:  37%|███▋      | 4078/10986 [2:46:26<4:30:15,  2.35s/it]

training loss: 3.4096567630767822


training:  37%|███▋      | 4079/10986 [2:46:28<4:21:56,  2.28s/it]

training loss: 3.5138890743255615


training:  37%|███▋      | 4080/10986 [2:46:31<4:29:24,  2.34s/it]

training loss: 3.4426190853118896
valid loss: 3.5417966842651367
perplexity: 34.52890396118164


training:  37%|███▋      | 4081/10986 [2:46:35<5:43:44,  2.99s/it]

training loss: 3.3967347145080566


training:  37%|███▋      | 4082/10986 [2:46:37<5:16:20,  2.75s/it]

training loss: 3.425797462463379


training:  37%|███▋      | 4083/10986 [2:46:40<5:07:17,  2.67s/it]

training loss: 3.3506534099578857


training:  37%|███▋      | 4084/10986 [2:46:42<4:48:30,  2.51s/it]

training loss: 3.422192096710205


training:  37%|███▋      | 4085/10986 [2:46:44<4:47:32,  2.50s/it]

training loss: 3.4467215538024902


training:  37%|███▋      | 4086/10986 [2:46:47<4:34:45,  2.39s/it]

training loss: 3.397552251815796


training:  37%|███▋      | 4087/10986 [2:46:49<4:38:40,  2.42s/it]

training loss: 3.4602205753326416


training:  37%|███▋      | 4088/10986 [2:46:51<4:28:33,  2.34s/it]

training loss: 3.408125638961792


training:  37%|███▋      | 4089/10986 [2:46:54<4:32:52,  2.37s/it]

training loss: 3.3930416107177734


training:  37%|███▋      | 4090/10986 [2:46:56<4:25:22,  2.31s/it]

training loss: 3.5110220909118652


training:  37%|███▋      | 4091/10986 [2:46:58<4:31:10,  2.36s/it]

training loss: 3.673253297805786


training:  37%|███▋      | 4092/10986 [2:47:00<4:23:46,  2.30s/it]

training loss: 3.460242986679077


training:  37%|███▋      | 4093/10986 [2:47:03<4:30:13,  2.35s/it]

training loss: 3.474764108657837


training:  37%|███▋      | 4094/10986 [2:47:05<4:23:39,  2.30s/it]

training loss: 3.353543996810913


training:  37%|███▋      | 4095/10986 [2:47:08<4:30:11,  2.35s/it]

training loss: 3.3866262435913086


training:  37%|███▋      | 4096/10986 [2:47:10<4:22:27,  2.29s/it]

training loss: 3.4325296878814697


training:  37%|███▋      | 4097/10986 [2:47:12<4:29:11,  2.34s/it]

training loss: 3.487628221511841


training:  37%|███▋      | 4098/10986 [2:47:14<4:22:09,  2.28s/it]

training loss: 3.3813157081604004


training:  37%|███▋      | 4099/10986 [2:47:17<4:29:03,  2.34s/it]

training loss: 3.595036745071411


training:  37%|███▋      | 4100/10986 [2:47:19<4:23:02,  2.29s/it]

training loss: 3.4016404151916504
valid loss: 3.3727078437805176
perplexity: 29.157373428344727


training:  37%|███▋      | 4101/10986 [2:47:24<5:46:47,  3.02s/it]

training loss: 3.419187068939209


training:  37%|███▋      | 4102/10986 [2:47:26<5:30:23,  2.88s/it]

training loss: 3.3881561756134033


training:  37%|███▋      | 4103/10986 [2:47:28<5:04:22,  2.65s/it]

training loss: 3.3918142318725586


training:  37%|███▋      | 4104/10986 [2:47:31<5:00:44,  2.62s/it]

training loss: 3.534029006958008


training:  37%|███▋      | 4105/10986 [2:47:33<4:43:28,  2.47s/it]

training loss: 3.4150052070617676


training:  37%|███▋      | 4106/10986 [2:47:36<4:43:02,  2.47s/it]

training loss: 3.4033169746398926


training:  37%|███▋      | 4107/10986 [2:47:38<4:33:41,  2.39s/it]

training loss: 3.588153839111328


training:  37%|███▋      | 4108/10986 [2:47:40<4:38:08,  2.43s/it]

training loss: 3.5371713638305664


training:  37%|███▋      | 4109/10986 [2:47:42<4:27:17,  2.33s/it]

training loss: 3.4287643432617188


training:  37%|███▋      | 4110/10986 [2:47:45<4:32:43,  2.38s/it]

training loss: 3.491107940673828


training:  37%|███▋      | 4111/10986 [2:47:47<4:23:45,  2.30s/it]

training loss: 3.33752703666687


training:  37%|███▋      | 4112/10986 [2:47:49<4:29:18,  2.35s/it]

training loss: 3.3441545963287354


training:  37%|███▋      | 4113/10986 [2:47:52<4:21:42,  2.28s/it]

training loss: 3.5323293209075928


training:  37%|███▋      | 4114/10986 [2:47:54<4:28:04,  2.34s/it]

training loss: 3.348158359527588


training:  37%|███▋      | 4115/10986 [2:47:56<4:21:53,  2.29s/it]

training loss: 3.385678291320801


training:  37%|███▋      | 4116/10986 [2:47:59<4:48:30,  2.52s/it]

training loss: 3.346336841583252


training:  37%|███▋      | 4117/10986 [2:48:02<4:57:09,  2.60s/it]

training loss: 3.368250608444214


training:  37%|███▋      | 4118/10986 [2:48:04<4:52:38,  2.56s/it]

training loss: 3.3190412521362305


training:  37%|███▋      | 4119/10986 [2:48:07<4:38:03,  2.43s/it]

training loss: 3.528475522994995


training:  38%|███▊      | 4120/10986 [2:48:09<4:41:03,  2.46s/it]

training loss: 3.399688720703125
valid loss: 3.532590627670288
perplexity: 34.21248245239258


training:  38%|███▊      | 4121/10986 [2:48:14<5:48:39,  3.05s/it]

training loss: 3.4652557373046875


training:  38%|███▊      | 4122/10986 [2:48:16<5:41:04,  2.98s/it]

training loss: 3.460522413253784


training:  38%|███▊      | 4123/10986 [2:48:19<5:25:31,  2.85s/it]

training loss: 3.426873207092285


training:  38%|███▊      | 4124/10986 [2:48:21<5:00:54,  2.63s/it]

training loss: 3.3449783325195312


training:  38%|███▊      | 4125/10986 [2:48:24<4:56:57,  2.60s/it]

training loss: 3.4151430130004883


training:  38%|███▊      | 4126/10986 [2:48:26<4:41:27,  2.46s/it]

training loss: 3.3353278636932373


training:  38%|███▊      | 4127/10986 [2:48:28<4:42:10,  2.47s/it]

training loss: 3.4154138565063477


training:  38%|███▊      | 4128/10986 [2:48:30<4:30:01,  2.36s/it]

training loss: 3.38995623588562


training:  38%|███▊      | 4129/10986 [2:48:33<4:34:12,  2.40s/it]

training loss: 3.409550905227661


training:  38%|███▊      | 4130/10986 [2:48:35<4:24:26,  2.31s/it]

training loss: 3.409649133682251


training:  38%|███▊      | 4131/10986 [2:48:37<4:29:14,  2.36s/it]

training loss: 3.342907667160034


training:  38%|███▊      | 4132/10986 [2:48:40<4:21:17,  2.29s/it]

training loss: 3.6016018390655518


training:  38%|███▊      | 4133/10986 [2:48:42<4:29:50,  2.36s/it]

training loss: 3.4512856006622314


training:  38%|███▊      | 4134/10986 [2:48:44<4:21:55,  2.29s/it]

training loss: 3.4861581325531006


training:  38%|███▊      | 4135/10986 [2:48:47<4:28:53,  2.35s/it]

training loss: 3.457606554031372


training:  38%|███▊      | 4136/10986 [2:48:49<4:21:14,  2.29s/it]

training loss: 3.407810688018799


training:  38%|███▊      | 4137/10986 [2:48:51<4:30:11,  2.37s/it]

training loss: 3.4884088039398193


training:  38%|███▊      | 4138/10986 [2:48:53<4:20:45,  2.28s/it]

training loss: 3.366702079772949


training:  38%|███▊      | 4139/10986 [2:48:56<4:28:56,  2.36s/it]

training loss: 3.3256514072418213


training:  38%|███▊      | 4140/10986 [2:48:58<4:20:24,  2.28s/it]

training loss: 3.5514819622039795
valid loss: 3.4242632389068604
perplexity: 30.70001792907715


training:  38%|███▊      | 4141/10986 [2:49:03<5:36:24,  2.95s/it]

training loss: 3.3805007934570312


training:  38%|███▊      | 4142/10986 [2:49:05<5:25:29,  2.85s/it]

training loss: 3.4885315895080566


training:  38%|███▊      | 4143/10986 [2:49:07<5:02:06,  2.65s/it]

training loss: 3.4896602630615234


training:  38%|███▊      | 4144/10986 [2:49:10<4:58:41,  2.62s/it]

training loss: 3.3965346813201904


training:  38%|███▊      | 4145/10986 [2:49:12<4:42:33,  2.48s/it]

training loss: 3.379720687866211


training:  38%|███▊      | 4146/10986 [2:49:15<4:41:31,  2.47s/it]

training loss: 3.455662488937378


training:  38%|███▊      | 4147/10986 [2:49:17<4:30:30,  2.37s/it]

training loss: 3.3834338188171387


training:  38%|███▊      | 4148/10986 [2:49:20<4:47:43,  2.52s/it]

training loss: 3.553790330886841


training:  38%|███▊      | 4149/10986 [2:49:22<4:34:35,  2.41s/it]

training loss: 3.432716131210327


training:  38%|███▊      | 4150/10986 [2:49:24<4:38:25,  2.44s/it]

training loss: 3.4249978065490723


training:  38%|███▊      | 4151/10986 [2:49:26<4:28:15,  2.35s/it]

training loss: 3.4880306720733643


training:  38%|███▊      | 4152/10986 [2:49:29<4:32:03,  2.39s/it]

training loss: 3.4265968799591064


training:  38%|███▊      | 4153/10986 [2:49:31<4:23:18,  2.31s/it]

training loss: 3.3560373783111572


training:  38%|███▊      | 4154/10986 [2:49:34<4:30:28,  2.38s/it]

training loss: 3.3386592864990234


training:  38%|███▊      | 4155/10986 [2:49:36<4:22:55,  2.31s/it]

training loss: 3.369100570678711


training:  38%|███▊      | 4156/10986 [2:49:38<4:29:56,  2.37s/it]

training loss: 3.46484375


training:  38%|███▊      | 4157/10986 [2:49:40<4:21:29,  2.30s/it]

training loss: 3.5062222480773926


training:  38%|███▊      | 4158/10986 [2:49:43<4:31:13,  2.38s/it]

training loss: 3.3720767498016357


training:  38%|███▊      | 4159/10986 [2:49:45<4:22:36,  2.31s/it]

training loss: 3.583141565322876


training:  38%|███▊      | 4160/10986 [2:49:47<4:28:26,  2.36s/it]

training loss: 3.514564275741577
valid loss: 3.5034396648406982
perplexity: 33.22955322265625


training:  38%|███▊      | 4161/10986 [2:49:52<5:41:49,  3.00s/it]

training loss: 3.519594192504883


training:  38%|███▊      | 4162/10986 [2:49:54<5:14:43,  2.77s/it]

training loss: 3.47689151763916


training:  38%|███▊      | 4163/10986 [2:49:57<5:06:22,  2.69s/it]

training loss: 3.397247314453125


training:  38%|███▊      | 4164/10986 [2:49:59<4:46:44,  2.52s/it]

training loss: 3.4081876277923584


training:  38%|███▊      | 4165/10986 [2:50:01<4:46:07,  2.52s/it]

training loss: 3.5542469024658203


training:  38%|███▊      | 4166/10986 [2:50:03<4:31:40,  2.39s/it]

training loss: 3.4669272899627686


training:  38%|███▊      | 4167/10986 [2:50:06<4:35:47,  2.43s/it]

training loss: 3.5087227821350098


training:  38%|███▊      | 4168/10986 [2:50:08<4:25:16,  2.33s/it]

training loss: 3.487208604812622


training:  38%|███▊      | 4169/10986 [2:50:11<4:31:51,  2.39s/it]

training loss: 3.4538943767547607


training:  38%|███▊      | 4170/10986 [2:50:13<4:22:19,  2.31s/it]

training loss: 3.424172878265381


training:  38%|███▊      | 4171/10986 [2:50:15<4:28:40,  2.37s/it]

training loss: 3.4923086166381836


training:  38%|███▊      | 4172/10986 [2:50:17<4:20:25,  2.29s/it]

training loss: 3.609285831451416


training:  38%|███▊      | 4173/10986 [2:50:20<4:27:40,  2.36s/it]

training loss: 3.661241292953491


training:  38%|███▊      | 4174/10986 [2:50:22<4:33:12,  2.41s/it]

training loss: 3.390927791595459


training:  38%|███▊      | 4175/10986 [2:50:26<5:09:06,  2.72s/it]

training loss: 3.4098410606384277


training:  38%|███▊      | 4176/10986 [2:50:28<4:56:59,  2.62s/it]

training loss: 3.439229965209961


training:  38%|███▊      | 4177/10986 [2:50:31<4:53:32,  2.59s/it]

training loss: 3.410729169845581


training:  38%|███▊      | 4178/10986 [2:50:33<4:38:12,  2.45s/it]

training loss: 3.499542236328125


training:  38%|███▊      | 4179/10986 [2:50:35<4:41:38,  2.48s/it]

training loss: 3.4062225818634033


training:  38%|███▊      | 4180/10986 [2:50:38<4:31:22,  2.39s/it]

training loss: 3.382575273513794
valid loss: 3.590254783630371
perplexity: 36.24330520629883


training:  38%|███▊      | 4181/10986 [2:50:42<5:46:06,  3.05s/it]

training loss: 3.428046464920044


training:  38%|███▊      | 4182/10986 [2:50:45<5:32:13,  2.93s/it]

training loss: 3.688852310180664


training:  38%|███▊      | 4183/10986 [2:50:47<5:05:00,  2.69s/it]

training loss: 3.524756669998169


training:  38%|███▊      | 4184/10986 [2:50:50<5:00:51,  2.65s/it]

training loss: 3.4890239238739014


training:  38%|███▊      | 4185/10986 [2:50:52<4:43:30,  2.50s/it]

training loss: 3.47747540473938


training:  38%|███▊      | 4186/10986 [2:50:54<4:43:59,  2.51s/it]

training loss: 3.6339609622955322


training:  38%|███▊      | 4187/10986 [2:50:56<4:31:51,  2.40s/it]

training loss: 3.4102015495300293


training:  38%|███▊      | 4188/10986 [2:50:59<4:38:25,  2.46s/it]

training loss: 3.353673219680786


training:  38%|███▊      | 4189/10986 [2:51:01<4:27:13,  2.36s/it]

training loss: 3.4504599571228027


training:  38%|███▊      | 4190/10986 [2:51:04<4:32:56,  2.41s/it]

training loss: 3.3949334621429443


training:  38%|███▊      | 4191/10986 [2:51:06<4:25:25,  2.34s/it]

training loss: 3.4519546031951904


training:  38%|███▊      | 4192/10986 [2:51:08<4:31:23,  2.40s/it]

training loss: 3.3686740398406982


training:  38%|███▊      | 4193/10986 [2:51:10<4:22:10,  2.32s/it]

training loss: 3.3863275051116943


training:  38%|███▊      | 4194/10986 [2:51:13<4:29:14,  2.38s/it]

training loss: 3.4338765144348145


training:  38%|███▊      | 4195/10986 [2:51:15<4:21:00,  2.31s/it]

training loss: 3.603994607925415


training:  38%|███▊      | 4196/10986 [2:51:18<4:30:07,  2.39s/it]

training loss: 3.312474012374878


training:  38%|███▊      | 4197/10986 [2:51:20<4:22:35,  2.32s/it]

training loss: 3.3323566913604736


training:  38%|███▊      | 4198/10986 [2:51:23<4:52:58,  2.59s/it]

training loss: 3.4752886295318604


training:  38%|███▊      | 4199/10986 [2:51:25<4:36:43,  2.45s/it]

training loss: 3.433654308319092


training:  38%|███▊      | 4200/10986 [2:51:28<4:39:51,  2.47s/it]

training loss: 3.4436678886413574
valid loss: 3.3008553981781006
perplexity: 27.135841369628906


training:  38%|███▊      | 4201/10986 [2:51:33<5:59:41,  3.18s/it]

training loss: 3.314185619354248


training:  38%|███▊      | 4202/10986 [2:51:35<5:26:20,  2.89s/it]

training loss: 3.385068893432617


training:  38%|███▊      | 4203/10986 [2:51:37<5:12:15,  2.76s/it]

training loss: 3.385371685028076


training:  38%|███▊      | 4204/10986 [2:51:39<4:51:35,  2.58s/it]

training loss: 3.3857104778289795


training:  38%|███▊      | 4205/10986 [2:51:42<4:50:26,  2.57s/it]

training loss: 3.37984561920166


training:  38%|███▊      | 4206/10986 [2:51:44<4:35:37,  2.44s/it]

training loss: 3.2981619834899902


training:  38%|███▊      | 4207/10986 [2:51:47<4:38:47,  2.47s/it]

training loss: 3.3998491764068604


training:  38%|███▊      | 4208/10986 [2:51:49<4:26:24,  2.36s/it]

training loss: 3.390996217727661


training:  38%|███▊      | 4209/10986 [2:51:51<4:32:19,  2.41s/it]

training loss: 3.3568308353424072


training:  38%|███▊      | 4210/10986 [2:51:53<4:22:26,  2.32s/it]

training loss: 3.415987014770508


training:  38%|███▊      | 4211/10986 [2:51:56<4:28:31,  2.38s/it]

training loss: 3.4441707134246826


training:  38%|███▊      | 4212/10986 [2:51:58<4:20:11,  2.30s/it]

training loss: 3.5753896236419678


training:  38%|███▊      | 4213/10986 [2:52:00<4:26:43,  2.36s/it]

training loss: 3.461350202560425


training:  38%|███▊      | 4214/10986 [2:52:03<4:18:33,  2.29s/it]

training loss: 3.4749481678009033


training:  38%|███▊      | 4215/10986 [2:52:05<4:26:18,  2.36s/it]

training loss: 3.4106225967407227


training:  38%|███▊      | 4216/10986 [2:52:07<4:17:55,  2.29s/it]

training loss: 3.5984532833099365


training:  38%|███▊      | 4217/10986 [2:52:10<4:24:47,  2.35s/it]

training loss: 3.366428852081299


training:  38%|███▊      | 4218/10986 [2:52:12<4:17:22,  2.28s/it]

training loss: 3.50266695022583


training:  38%|███▊      | 4219/10986 [2:52:15<4:32:24,  2.42s/it]

training loss: 3.466620922088623


training:  38%|███▊      | 4220/10986 [2:52:17<4:22:36,  2.33s/it]

training loss: 3.550938844680786
valid loss: 3.4602694511413574
perplexity: 31.825551986694336


training:  38%|███▊      | 4221/10986 [2:52:21<5:34:40,  2.97s/it]

training loss: 3.4676120281219482


training:  38%|███▊      | 4222/10986 [2:52:24<5:21:27,  2.85s/it]

training loss: 3.4980015754699707


training:  38%|███▊      | 4223/10986 [2:52:26<4:57:38,  2.64s/it]

training loss: 3.521185874938965


training:  38%|███▊      | 4224/10986 [2:52:28<4:55:12,  2.62s/it]

training loss: 3.399683713912964


training:  38%|███▊      | 4225/10986 [2:52:31<4:39:14,  2.48s/it]

training loss: 3.350080966949463


training:  38%|███▊      | 4226/10986 [2:52:33<4:40:49,  2.49s/it]

training loss: 3.440498113632202


training:  38%|███▊      | 4227/10986 [2:52:35<4:30:18,  2.40s/it]

training loss: 3.392672061920166


training:  38%|███▊      | 4228/10986 [2:52:38<4:36:05,  2.45s/it]

training loss: 3.46899151802063


training:  38%|███▊      | 4229/10986 [2:52:40<4:24:41,  2.35s/it]

training loss: 3.4557416439056396


training:  39%|███▊      | 4230/10986 [2:52:43<4:31:19,  2.41s/it]

training loss: 3.432241439819336


training:  39%|███▊      | 4231/10986 [2:52:45<4:22:20,  2.33s/it]

training loss: 3.3909401893615723


training:  39%|███▊      | 4232/10986 [2:52:47<4:27:56,  2.38s/it]

training loss: 3.536402463912964


training:  39%|███▊      | 4233/10986 [2:52:50<4:49:25,  2.57s/it]

training loss: 3.4500999450683594


training:  39%|███▊      | 4234/10986 [2:52:53<5:08:30,  2.74s/it]

training loss: 3.507329225540161


training:  39%|███▊      | 4235/10986 [2:52:56<4:50:04,  2.58s/it]

training loss: 3.3953981399536133


training:  39%|███▊      | 4236/10986 [2:52:58<4:47:37,  2.56s/it]

training loss: 3.4239447116851807


training:  39%|███▊      | 4237/10986 [2:53:00<4:32:35,  2.42s/it]

training loss: 3.4792280197143555


training:  39%|███▊      | 4238/10986 [2:53:03<4:35:24,  2.45s/it]

training loss: 3.405203104019165


training:  39%|███▊      | 4239/10986 [2:53:05<4:24:21,  2.35s/it]

training loss: 3.387141466140747


training:  39%|███▊      | 4240/10986 [2:53:07<4:29:07,  2.39s/it]

training loss: 3.545450210571289
valid loss: 3.3382556438446045
perplexity: 28.169946670532227


training:  39%|███▊      | 4241/10986 [2:53:12<5:42:01,  3.04s/it]

training loss: 3.572976589202881


training:  39%|███▊      | 4242/10986 [2:53:14<5:14:28,  2.80s/it]

training loss: 3.5530145168304443


training:  39%|███▊      | 4243/10986 [2:53:17<5:05:10,  2.72s/it]

training loss: 3.6132302284240723


training:  39%|███▊      | 4244/10986 [2:53:19<4:56:47,  2.64s/it]

training loss: 3.4308886528015137


training:  39%|███▊      | 4245/10986 [2:53:22<4:53:15,  2.61s/it]

training loss: 3.508483648300171


training:  39%|███▊      | 4246/10986 [2:53:24<4:37:14,  2.47s/it]

training loss: 3.4396862983703613


training:  39%|███▊      | 4247/10986 [2:53:26<4:40:10,  2.49s/it]

training loss: 3.545773506164551


training:  39%|███▊      | 4248/10986 [2:53:28<4:27:23,  2.38s/it]

training loss: 3.3989524841308594


training:  39%|███▊      | 4249/10986 [2:53:31<4:31:26,  2.42s/it]

training loss: 3.4791741371154785


training:  39%|███▊      | 4250/10986 [2:53:33<4:21:47,  2.33s/it]

training loss: 3.3516108989715576


training:  39%|███▊      | 4251/10986 [2:53:36<4:27:07,  2.38s/it]

training loss: 3.497209310531616


training:  39%|███▊      | 4252/10986 [2:53:38<4:20:17,  2.32s/it]

training loss: 3.290987730026245


training:  39%|███▊      | 4253/10986 [2:53:40<4:28:11,  2.39s/it]

training loss: 3.4895308017730713


training:  39%|███▊      | 4254/10986 [2:53:42<4:18:43,  2.31s/it]

training loss: 3.5096404552459717


training:  39%|███▊      | 4255/10986 [2:53:45<4:24:54,  2.36s/it]

training loss: 3.525235891342163


training:  39%|███▊      | 4256/10986 [2:53:47<4:17:12,  2.29s/it]

training loss: 3.51947021484375


training:  39%|███▊      | 4257/10986 [2:53:50<4:25:05,  2.36s/it]

training loss: 3.3734984397888184


training:  39%|███▉      | 4258/10986 [2:53:52<4:16:19,  2.29s/it]

training loss: 3.5307328701019287


training:  39%|███▉      | 4259/10986 [2:53:54<4:25:22,  2.37s/it]

training loss: 3.485869884490967


training:  39%|███▉      | 4260/10986 [2:53:56<4:17:36,  2.30s/it]

training loss: 3.359090566635132
valid loss: 3.419555902481079
perplexity: 30.555843353271484


training:  39%|███▉      | 4261/10986 [2:54:01<5:32:05,  2.96s/it]

training loss: 3.408311605453491


training:  39%|███▉      | 4262/10986 [2:54:03<5:20:18,  2.86s/it]

training loss: 3.35064697265625


training:  39%|███▉      | 4263/10986 [2:54:06<4:56:29,  2.65s/it]

training loss: 3.342604398727417


training:  39%|███▉      | 4264/10986 [2:54:08<4:52:58,  2.62s/it]

training loss: 3.5621190071105957


training:  39%|███▉      | 4265/10986 [2:54:10<4:37:18,  2.48s/it]

training loss: 3.4040164947509766


training:  39%|███▉      | 4266/10986 [2:54:13<4:40:13,  2.50s/it]

training loss: 3.335200548171997


training:  39%|███▉      | 4267/10986 [2:54:15<4:28:35,  2.40s/it]

training loss: 3.406938314437866


training:  39%|███▉      | 4268/10986 [2:54:18<4:33:34,  2.44s/it]

training loss: 3.3624796867370605


training:  39%|███▉      | 4269/10986 [2:54:20<4:24:27,  2.36s/it]

training loss: 3.373312473297119


training:  39%|███▉      | 4270/10986 [2:54:22<4:30:54,  2.42s/it]

training loss: 3.338240385055542


training:  39%|███▉      | 4271/10986 [2:54:24<4:21:32,  2.34s/it]

training loss: 3.4127981662750244


training:  39%|███▉      | 4272/10986 [2:54:27<4:29:19,  2.41s/it]

training loss: 3.296602249145508


training:  39%|███▉      | 4273/10986 [2:54:29<4:19:36,  2.32s/it]

training loss: 3.3888099193573


training:  39%|███▉      | 4274/10986 [2:54:32<4:26:38,  2.38s/it]

training loss: 3.644599437713623


training:  39%|███▉      | 4275/10986 [2:54:34<4:19:57,  2.32s/it]

training loss: 3.4144694805145264


training:  39%|███▉      | 4276/10986 [2:54:36<4:28:56,  2.40s/it]

training loss: 3.476889133453369


training:  39%|███▉      | 4277/10986 [2:54:39<4:18:41,  2.31s/it]

training loss: 3.30595064163208


training:  39%|███▉      | 4278/10986 [2:54:41<4:27:09,  2.39s/it]

training loss: 3.4833779335021973


training:  39%|███▉      | 4279/10986 [2:54:43<4:19:28,  2.32s/it]

training loss: 3.3440396785736084


training:  39%|███▉      | 4280/10986 [2:54:46<4:27:55,  2.40s/it]

training loss: 3.4912679195404053
valid loss: 3.378772497177124
perplexity: 29.334739685058594


training:  39%|███▉      | 4281/10986 [2:54:50<5:38:42,  3.03s/it]

training loss: 3.2781291007995605


training:  39%|███▉      | 4282/10986 [2:54:53<5:11:13,  2.79s/it]

training loss: 3.3212203979492188


training:  39%|███▉      | 4283/10986 [2:54:55<5:04:28,  2.73s/it]

training loss: 3.4780478477478027


training:  39%|███▉      | 4284/10986 [2:54:57<4:44:38,  2.55s/it]

training loss: 3.5334103107452393


training:  39%|███▉      | 4285/10986 [2:55:00<4:46:59,  2.57s/it]

training loss: 3.3457388877868652


training:  39%|███▉      | 4286/10986 [2:55:02<4:32:03,  2.44s/it]

training loss: 3.4483180046081543


training:  39%|███▉      | 4287/10986 [2:55:05<4:35:59,  2.47s/it]

training loss: 3.475841522216797


training:  39%|███▉      | 4288/10986 [2:55:07<4:24:20,  2.37s/it]

training loss: 3.620438814163208


training:  39%|███▉      | 4289/10986 [2:55:09<4:29:40,  2.42s/it]

training loss: 3.428565502166748


training:  39%|███▉      | 4290/10986 [2:55:11<4:20:34,  2.33s/it]

training loss: 3.3613486289978027


training:  39%|███▉      | 4291/10986 [2:55:15<4:55:02,  2.64s/it]

training loss: 3.41825795173645


training:  39%|███▉      | 4292/10986 [2:55:17<4:54:57,  2.64s/it]

training loss: 3.406363010406494


training:  39%|███▉      | 4293/10986 [2:55:20<4:51:41,  2.61s/it]

training loss: 3.4213216304779053


training:  39%|███▉      | 4294/10986 [2:55:22<4:35:25,  2.47s/it]

training loss: 3.341440439224243


training:  39%|███▉      | 4295/10986 [2:55:25<4:40:13,  2.51s/it]

training loss: 3.4645907878875732


training:  39%|███▉      | 4296/10986 [2:55:27<4:27:39,  2.40s/it]

training loss: 3.5056374073028564


training:  39%|███▉      | 4297/10986 [2:55:29<4:32:53,  2.45s/it]

training loss: 3.4827122688293457


training:  39%|███▉      | 4298/10986 [2:55:32<4:22:51,  2.36s/it]

training loss: 3.395404577255249


training:  39%|███▉      | 4299/10986 [2:55:34<4:28:28,  2.41s/it]

training loss: 3.471360921859741


training:  39%|███▉      | 4300/10986 [2:55:36<4:20:29,  2.34s/it]

training loss: 3.5130932331085205
valid loss: 3.3498687744140625
perplexity: 28.498992919921875


training:  39%|███▉      | 4301/10986 [2:55:41<5:44:07,  3.09s/it]

training loss: 3.4365394115448


training:  39%|███▉      | 4302/10986 [2:55:44<5:27:09,  2.94s/it]

training loss: 3.3907723426818848


training:  39%|███▉      | 4303/10986 [2:55:46<4:59:30,  2.69s/it]

training loss: 3.361461877822876


training:  39%|███▉      | 4304/10986 [2:55:48<4:55:17,  2.65s/it]

training loss: 3.608218193054199


training:  39%|███▉      | 4305/10986 [2:55:50<4:37:17,  2.49s/it]

training loss: 3.3131649494171143


training:  39%|███▉      | 4306/10986 [2:55:53<4:40:15,  2.52s/it]

training loss: 3.366685628890991


training:  39%|███▉      | 4307/10986 [2:55:55<4:27:18,  2.40s/it]

training loss: 3.384511709213257


training:  39%|███▉      | 4308/10986 [2:55:58<4:32:33,  2.45s/it]

training loss: 3.501307487487793


training:  39%|███▉      | 4309/10986 [2:56:00<4:21:35,  2.35s/it]

training loss: 3.4925551414489746


training:  39%|███▉      | 4310/10986 [2:56:02<4:26:54,  2.40s/it]

training loss: 3.3813235759735107


training:  39%|███▉      | 4311/10986 [2:56:04<4:17:46,  2.32s/it]

training loss: 3.4090638160705566


training:  39%|███▉      | 4312/10986 [2:56:07<4:26:06,  2.39s/it]

training loss: 3.628915786743164


training:  39%|███▉      | 4313/10986 [2:56:09<4:17:53,  2.32s/it]

training loss: 3.5577714443206787


training:  39%|███▉      | 4314/10986 [2:56:12<4:25:31,  2.39s/it]

training loss: 3.453521728515625


training:  39%|███▉      | 4315/10986 [2:56:14<4:17:20,  2.31s/it]

training loss: 3.599917411804199


training:  39%|███▉      | 4316/10986 [2:56:16<4:25:39,  2.39s/it]

training loss: 3.502595901489258


training:  39%|███▉      | 4317/10986 [2:56:19<4:17:23,  2.32s/it]

training loss: 3.4669792652130127


training:  39%|███▉      | 4318/10986 [2:56:21<4:25:19,  2.39s/it]

training loss: 3.3599061965942383


training:  39%|███▉      | 4319/10986 [2:56:23<4:17:39,  2.32s/it]

training loss: 3.394529104232788


training:  39%|███▉      | 4320/10986 [2:56:26<4:25:13,  2.39s/it]

training loss: 3.3484392166137695
valid loss: 3.476982593536377
perplexity: 32.36192321777344


training:  39%|███▉      | 4321/10986 [2:56:30<5:37:50,  3.04s/it]

training loss: 3.3947224617004395


training:  39%|███▉      | 4322/10986 [2:56:33<5:09:46,  2.79s/it]

training loss: 3.373356342315674


training:  39%|███▉      | 4323/10986 [2:56:35<5:00:27,  2.71s/it]

training loss: 3.368302822113037


training:  39%|███▉      | 4324/10986 [2:56:37<4:41:05,  2.53s/it]

training loss: 3.5689585208892822


training:  39%|███▉      | 4325/10986 [2:56:40<4:39:52,  2.52s/it]

training loss: 3.5272631645202637


training:  39%|███▉      | 4326/10986 [2:56:42<4:26:24,  2.40s/it]

training loss: 3.359762191772461


training:  39%|███▉      | 4327/10986 [2:56:44<4:30:43,  2.44s/it]

training loss: 3.3823277950286865


training:  39%|███▉      | 4328/10986 [2:56:47<4:22:35,  2.37s/it]

training loss: 3.4502012729644775


training:  39%|███▉      | 4329/10986 [2:56:49<4:28:49,  2.42s/it]

training loss: 3.4211580753326416


training:  39%|███▉      | 4330/10986 [2:56:51<4:19:49,  2.34s/it]

training loss: 3.4171302318573


training:  39%|███▉      | 4331/10986 [2:56:54<4:25:59,  2.40s/it]

training loss: 3.4996745586395264


training:  39%|███▉      | 4332/10986 [2:56:56<4:17:17,  2.32s/it]

training loss: 3.5495543479919434


training:  39%|███▉      | 4333/10986 [2:56:58<4:23:03,  2.37s/it]

training loss: 3.4355554580688477


training:  39%|███▉      | 4334/10986 [2:57:01<4:15:00,  2.30s/it]

training loss: 3.504969358444214


training:  39%|███▉      | 4335/10986 [2:57:03<4:21:18,  2.36s/it]

training loss: 3.4501914978027344


training:  39%|███▉      | 4336/10986 [2:57:05<4:12:44,  2.28s/it]

training loss: 3.3957550525665283


training:  39%|███▉      | 4337/10986 [2:57:08<4:19:13,  2.34s/it]

training loss: 3.400362730026245


training:  39%|███▉      | 4338/10986 [2:57:10<4:12:06,  2.28s/it]

training loss: 3.4025301933288574


training:  39%|███▉      | 4339/10986 [2:57:12<4:20:56,  2.36s/it]

training loss: 3.3809022903442383


training:  40%|███▉      | 4340/10986 [2:57:14<4:14:12,  2.30s/it]

training loss: 3.577974319458008
valid loss: 3.415806531906128
perplexity: 30.441490173339844


training:  40%|███▉      | 4341/10986 [2:57:19<5:27:37,  2.96s/it]

training loss: 3.3674187660217285


training:  40%|███▉      | 4342/10986 [2:57:22<5:17:48,  2.87s/it]

training loss: 3.38130784034729


training:  40%|███▉      | 4343/10986 [2:57:24<4:53:14,  2.65s/it]

training loss: 3.507930278778076


training:  40%|███▉      | 4344/10986 [2:57:26<4:49:03,  2.61s/it]

training loss: 3.395963668823242


training:  40%|███▉      | 4345/10986 [2:57:28<4:33:07,  2.47s/it]

training loss: 3.4245638847351074


training:  40%|███▉      | 4346/10986 [2:57:31<4:35:06,  2.49s/it]

training loss: 3.4883012771606445


training:  40%|███▉      | 4347/10986 [2:57:33<4:22:37,  2.37s/it]

training loss: 3.4383702278137207


training:  40%|███▉      | 4348/10986 [2:57:36<4:26:50,  2.41s/it]

training loss: 3.477424383163452


training:  40%|███▉      | 4349/10986 [2:57:38<4:19:06,  2.34s/it]

training loss: 3.4883511066436768


training:  40%|███▉      | 4350/10986 [2:57:41<5:04:32,  2.75s/it]

training loss: 3.4677281379699707


training:  40%|███▉      | 4351/10986 [2:57:44<4:57:55,  2.69s/it]

training loss: 3.4209396839141846


training:  40%|███▉      | 4352/10986 [2:57:47<5:03:52,  2.75s/it]

training loss: 3.392829656600952


training:  40%|███▉      | 4353/10986 [2:57:49<4:42:56,  2.56s/it]

training loss: 3.3651602268218994


training:  40%|███▉      | 4354/10986 [2:57:52<4:43:00,  2.56s/it]

training loss: 3.5994930267333984


training:  40%|███▉      | 4355/10986 [2:57:54<4:28:04,  2.43s/it]

training loss: 3.3842835426330566


training:  40%|███▉      | 4356/10986 [2:57:56<4:32:56,  2.47s/it]

training loss: 3.4544897079467773


training:  40%|███▉      | 4357/10986 [2:57:58<4:23:05,  2.38s/it]

training loss: 3.42105770111084


training:  40%|███▉      | 4358/10986 [2:58:01<4:29:19,  2.44s/it]

training loss: 3.472813367843628


training:  40%|███▉      | 4359/10986 [2:58:03<4:18:50,  2.34s/it]

training loss: 3.487501859664917


training:  40%|███▉      | 4360/10986 [2:58:06<4:23:27,  2.39s/it]

training loss: 3.4935710430145264
valid loss: 3.471590995788574
perplexity: 32.18791198730469


training:  40%|███▉      | 4361/10986 [2:58:10<5:33:45,  3.02s/it]

training loss: 3.4375052452087402


training:  40%|███▉      | 4362/10986 [2:58:12<5:06:52,  2.78s/it]

training loss: 3.441962718963623


training:  40%|███▉      | 4363/10986 [2:58:15<4:58:52,  2.71s/it]

training loss: 3.4114491939544678


training:  40%|███▉      | 4364/10986 [2:58:17<4:39:26,  2.53s/it]

training loss: 3.3794915676116943


training:  40%|███▉      | 4365/10986 [2:58:20<4:39:31,  2.53s/it]

training loss: 3.615421772003174


training:  40%|███▉      | 4366/10986 [2:58:22<4:25:15,  2.40s/it]

training loss: 3.3173937797546387


training:  40%|███▉      | 4367/10986 [2:58:24<4:29:37,  2.44s/it]

training loss: 3.480306625366211


training:  40%|███▉      | 4368/10986 [2:58:26<4:18:24,  2.34s/it]

training loss: 3.4148566722869873


training:  40%|███▉      | 4369/10986 [2:58:29<4:23:44,  2.39s/it]

training loss: 3.6363723278045654


training:  40%|███▉      | 4370/10986 [2:58:31<4:14:26,  2.31s/it]

training loss: 3.3858418464660645


training:  40%|███▉      | 4371/10986 [2:58:33<4:21:35,  2.37s/it]

training loss: 3.4186673164367676


training:  40%|███▉      | 4372/10986 [2:58:36<4:13:20,  2.30s/it]

training loss: 3.442232608795166


training:  40%|███▉      | 4373/10986 [2:58:38<4:20:59,  2.37s/it]

training loss: 3.473679304122925


training:  40%|███▉      | 4374/10986 [2:58:40<4:12:15,  2.29s/it]

training loss: 3.585186004638672


training:  40%|███▉      | 4375/10986 [2:58:43<4:19:38,  2.36s/it]

training loss: 3.6091978549957275


training:  40%|███▉      | 4376/10986 [2:58:45<4:11:18,  2.28s/it]

training loss: 3.5200769901275635


training:  40%|███▉      | 4377/10986 [2:58:47<4:20:21,  2.36s/it]

training loss: 3.3008840084075928


training:  40%|███▉      | 4378/10986 [2:58:49<4:12:11,  2.29s/it]

training loss: 3.4134232997894287


training:  40%|███▉      | 4379/10986 [2:58:52<4:19:19,  2.35s/it]

training loss: 3.3040595054626465


training:  40%|███▉      | 4380/10986 [2:58:54<4:11:39,  2.29s/it]

training loss: 3.4536430835723877
valid loss: 3.377995252609253
perplexity: 29.31195068359375


training:  40%|███▉      | 4381/10986 [2:58:59<5:25:00,  2.95s/it]

training loss: 3.4688756465911865


training:  40%|███▉      | 4382/10986 [2:59:01<5:14:58,  2.86s/it]

training loss: 3.482562780380249


training:  40%|███▉      | 4383/10986 [2:59:03<4:50:23,  2.64s/it]

training loss: 3.4413652420043945


training:  40%|███▉      | 4384/10986 [2:59:06<4:46:04,  2.60s/it]

training loss: 3.5570104122161865


training:  40%|███▉      | 4385/10986 [2:59:08<4:30:29,  2.46s/it]

training loss: 3.3811728954315186


training:  40%|███▉      | 4386/10986 [2:59:11<4:31:56,  2.47s/it]

training loss: 3.4647185802459717


training:  40%|███▉      | 4387/10986 [2:59:13<4:20:06,  2.37s/it]

training loss: 3.399993658065796


training:  40%|███▉      | 4388/10986 [2:59:15<4:24:59,  2.41s/it]

training loss: 3.357640504837036


training:  40%|███▉      | 4389/10986 [2:59:17<4:15:23,  2.32s/it]

training loss: 3.3944921493530273


training:  40%|███▉      | 4390/10986 [2:59:20<4:21:15,  2.38s/it]

training loss: 3.4643044471740723


training:  40%|███▉      | 4391/10986 [2:59:22<4:12:01,  2.29s/it]

training loss: 3.4808530807495117


training:  40%|███▉      | 4392/10986 [2:59:24<4:18:48,  2.35s/it]

training loss: 3.5655462741851807


training:  40%|███▉      | 4393/10986 [2:59:27<4:11:05,  2.29s/it]

training loss: 3.4470207691192627


training:  40%|███▉      | 4394/10986 [2:59:29<4:17:42,  2.35s/it]

training loss: 3.41371750831604


training:  40%|████      | 4395/10986 [2:59:31<4:11:04,  2.29s/it]

training loss: 3.4053831100463867


training:  40%|████      | 4396/10986 [2:59:34<4:18:20,  2.35s/it]

training loss: 3.4533631801605225


training:  40%|████      | 4397/10986 [2:59:36<4:10:37,  2.28s/it]

training loss: 3.4493770599365234


training:  40%|████      | 4398/10986 [2:59:38<4:17:36,  2.35s/it]

training loss: 3.425001382827759


training:  40%|████      | 4399/10986 [2:59:40<4:10:23,  2.28s/it]

training loss: 3.38782000541687


training:  40%|████      | 4400/10986 [2:59:43<4:18:11,  2.35s/it]

training loss: 3.4534430503845215
valid loss: 3.5836164951324463
perplexity: 36.003509521484375


training:  40%|████      | 4401/10986 [2:59:48<5:38:29,  3.08s/it]

training loss: 3.5243918895721436


training:  40%|████      | 4402/10986 [2:59:50<5:07:53,  2.81s/it]

training loss: 3.596670389175415


training:  40%|████      | 4403/10986 [2:59:52<4:57:14,  2.71s/it]

training loss: 3.5109541416168213


training:  40%|████      | 4404/10986 [2:59:55<4:39:50,  2.55s/it]

training loss: 3.400848150253296


training:  40%|████      | 4405/10986 [2:59:57<4:38:28,  2.54s/it]

training loss: 3.4972965717315674


training:  40%|████      | 4406/10986 [2:59:59<4:24:25,  2.41s/it]

training loss: 3.5384953022003174


training:  40%|████      | 4407/10986 [3:00:02<4:34:30,  2.50s/it]

training loss: 3.314710855484009


training:  40%|████      | 4408/10986 [3:00:05<4:55:08,  2.69s/it]

training loss: 3.3495044708251953


training:  40%|████      | 4409/10986 [3:00:08<4:51:48,  2.66s/it]

training loss: 3.5561723709106445


training:  40%|████      | 4410/10986 [3:00:10<4:33:37,  2.50s/it]

training loss: 3.4955127239227295


training:  40%|████      | 4411/10986 [3:00:12<4:34:08,  2.50s/it]

training loss: 3.3423426151275635


training:  40%|████      | 4412/10986 [3:00:14<4:21:32,  2.39s/it]

training loss: 3.408003807067871


training:  40%|████      | 4413/10986 [3:00:17<4:24:42,  2.42s/it]

training loss: 3.2967238426208496


training:  40%|████      | 4414/10986 [3:00:19<4:15:46,  2.34s/it]

training loss: 3.5547776222229004


training:  40%|████      | 4415/10986 [3:00:22<4:22:56,  2.40s/it]

training loss: 3.5124704837799072


training:  40%|████      | 4416/10986 [3:00:24<4:13:18,  2.31s/it]

training loss: 3.3899309635162354


training:  40%|████      | 4417/10986 [3:00:26<4:20:36,  2.38s/it]

training loss: 3.402914524078369


training:  40%|████      | 4418/10986 [3:00:28<4:12:00,  2.30s/it]

training loss: 3.565473794937134


training:  40%|████      | 4419/10986 [3:00:31<4:18:32,  2.36s/it]

training loss: 3.469590187072754


training:  40%|████      | 4420/10986 [3:00:33<4:09:38,  2.28s/it]

training loss: 3.353064775466919
valid loss: 3.415459632873535
perplexity: 30.430932998657227


training:  40%|████      | 4421/10986 [3:00:37<5:24:03,  2.96s/it]

training loss: 3.3493568897247314


training:  40%|████      | 4422/10986 [3:00:40<5:11:07,  2.84s/it]

training loss: 3.5341086387634277


training:  40%|████      | 4423/10986 [3:00:42<4:46:54,  2.62s/it]

training loss: 3.5830531120300293


training:  40%|████      | 4424/10986 [3:00:45<4:42:18,  2.58s/it]

training loss: 3.4391942024230957


training:  40%|████      | 4425/10986 [3:00:47<4:27:02,  2.44s/it]

training loss: 3.3176991939544678


training:  40%|████      | 4426/10986 [3:00:49<4:26:57,  2.44s/it]

training loss: 3.499553680419922


training:  40%|████      | 4427/10986 [3:00:51<4:17:14,  2.35s/it]

training loss: 3.4479968547821045


training:  40%|████      | 4428/10986 [3:00:54<4:20:37,  2.38s/it]

training loss: 3.5171217918395996


training:  40%|████      | 4429/10986 [3:00:56<4:12:33,  2.31s/it]

training loss: 3.419541597366333


training:  40%|████      | 4430/10986 [3:00:58<4:17:48,  2.36s/it]

training loss: 3.5534920692443848


training:  40%|████      | 4431/10986 [3:01:00<4:09:46,  2.29s/it]

training loss: 3.4490952491760254


training:  40%|████      | 4432/10986 [3:01:03<4:15:29,  2.34s/it]

training loss: 3.4343066215515137


training:  40%|████      | 4433/10986 [3:01:05<4:09:13,  2.28s/it]

training loss: 3.4739649295806885


training:  40%|████      | 4434/10986 [3:01:08<4:16:09,  2.35s/it]

training loss: 3.493074893951416


training:  40%|████      | 4435/10986 [3:01:10<4:08:07,  2.27s/it]

training loss: 3.420846462249756


training:  40%|████      | 4436/10986 [3:01:12<4:14:32,  2.33s/it]

training loss: 3.412720203399658


training:  40%|████      | 4437/10986 [3:01:14<4:07:28,  2.27s/it]

training loss: 3.477234363555908


training:  40%|████      | 4438/10986 [3:01:17<4:12:50,  2.32s/it]

training loss: 3.456962823867798


training:  40%|████      | 4439/10986 [3:01:19<4:07:12,  2.27s/it]

training loss: 3.393129587173462


training:  40%|████      | 4440/10986 [3:01:21<4:13:24,  2.32s/it]

training loss: 3.462019681930542
valid loss: 3.4730324745178223
perplexity: 32.234344482421875


training:  40%|████      | 4441/10986 [3:01:26<5:24:47,  2.98s/it]

training loss: 3.3880300521850586


training:  40%|████      | 4442/10986 [3:01:28<4:58:30,  2.74s/it]

training loss: 3.430314302444458


training:  40%|████      | 4443/10986 [3:01:30<4:48:53,  2.65s/it]

training loss: 3.4067423343658447


training:  40%|████      | 4444/10986 [3:01:33<4:31:57,  2.49s/it]

training loss: 3.4368438720703125


training:  40%|████      | 4445/10986 [3:01:35<4:30:47,  2.48s/it]

training loss: 3.4522135257720947


training:  40%|████      | 4446/10986 [3:01:37<4:19:29,  2.38s/it]

training loss: 3.6300721168518066


training:  40%|████      | 4447/10986 [3:01:40<4:20:02,  2.39s/it]

training loss: 3.4140243530273438


training:  40%|████      | 4448/10986 [3:01:42<4:10:59,  2.30s/it]

training loss: 3.690797805786133


training:  40%|████      | 4449/10986 [3:01:45<4:27:55,  2.46s/it]

training loss: 3.494999408721924


training:  41%|████      | 4450/10986 [3:01:47<4:17:30,  2.36s/it]