In [1]:
# Based on crowsonkb's code

In [None]:
!pip install accelerate peft

In [None]:
!pip install bitsandbytes

In [1]:
import argparse
import json
from pathlib import Path
import sys

import accelerate
import peft
import torch
from torch import optim
from torch.utils import data
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from tqdm import trange, tqdm

In [2]:
# Route print() through tqdm's external-write mode so messages are written around
# active progress bars. The builtin is wrapped explicitly so that re-running this
# cell does not stack another wrapper on top of an already-wrapped `print`.
import builtins

print = tqdm.external_write_mode()(builtins.print)


In [3]:
def batch_to_tensors(batch, device="cpu"):
    """Collate variable-length token-id sequences into right-padded tensors.

    Args:
        batch: Sequence of token-id sequences (e.g. lists of ints). May be empty.
        device: Device on which the returned tensors are allocated.

    Returns:
        A tuple ``(input_ids, attention_mask)`` of ``torch.long`` tensors, each of
        shape ``(len(batch), length of the longest sequence)``. Positions past the
        end of a sequence hold 0 in both tensors; positions covered by a sequence
        hold its token ids in ``input_ids`` and 1 in ``attention_mask``.
    """
    # default=0 keeps an empty batch from raising ValueError inside max().
    seq_len = max((len(x) for x in batch), default=0)
    input_ids = torch.zeros(len(batch), seq_len, dtype=torch.long, device=device)
    attention_mask = torch.zeros_like(input_ids)
    for row, tokens in enumerate(batch):
        n_tokens = len(tokens)
        input_ids[row, :n_tokens] = torch.tensor(tokens, dtype=torch.long, device=device)
        attention_mask[row, :n_tokens] = 1
    return input_ids, attention_mask

In [4]:
# Accelerator configured for bf16 mixed precision; an optimizer step is taken on every batch
# (gradient_accumulation_steps=1).
accelerator = accelerate.Accelerator(
    gradient_accumulation_steps=1,
    mixed_precision="bf16",
)

In [5]:
# Convenience handles derived from the accelerator.
print0 = accelerator.on_main_process(print)  # print wrapped with accelerator.on_main_process
is_main = accelerator.is_main_process
device = accelerator.device

In [6]:
# bitsandbytes settings: 4-bit loading, "nf4" quantization type, double quantization,
# and bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [7]:
# Authenticate with the Hugging Face Hub (interactive; renders a login widget in the notebook).
# NOTE(review): presumably required because the checkpoint loaded below is access-restricted — confirm.
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
pip install -U bitsandbytes accelerate

In [26]:
# Load the 4-bit quantized base model. main_process_first() lets the main process run
# the body before the other processes do.
with accelerator.main_process_first():
    model_base = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-2-7b-chat-hf",
        # One process: let the loader place the model across the visible devices.
        # Several processes: each process keeps a full copy on its own device instead
        # of every process sharding the model over all GPUs.
        device_map="auto" if accelerator.num_processes == 1 else {"": device},
        quantization_config=bnb_config,
        torch_dtype=torch.bfloat16,
        # NOTE(review): trust_remote_code allows code shipped with the checkpoint to run;
        # confirm it is actually needed for this model.
        trust_remote_code=True,
    )

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
# Synchronisation point: wait until every process has finished loading the base model.
accelerator.wait_for_everyone()

In [10]:
dropout = 0.0  # value passed as lora_dropout to the LoRA config below; 0.1 is the alternative setting

In [None]:
# Only relevant if you already have a trained adapter to resume from
# if args.start_from is not None:
#         print0(f"Loading adapter: {args.start_from}", file=sys.stderr)
#         with accelerator.main_process_first():
#             model = peft.PeftModel.from_pretrained(model_base, args.start_from, is_trainable=True)
#         if args.dropout is not None:
#             model.active_peft_config.lora_dropout = dropout

In [11]:
# Otherwise, if starting from scratch: initialize a new LoRA adapter configuration.
peft_config = peft.LoraConfig(
    # Passed by keyword so the value is guaranteed to land in `task_type`. A positional
    # value is bound to whichever dataclass field comes first, which is not guaranteed
    # to be `task_type` in every peft release.
    task_type=peft.TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,  # LoRA rank
    lora_alpha=8,
    lora_dropout=dropout,
)

In [12]:
# Starting from scratch: wrap the quantized base model with a new adapter built from peft_config.
model = peft.get_peft_model(model_base, peft_config)

In [13]:
# Synchronisation point: wait until every process has built its adapter-wrapped model.
accelerator.wait_for_everyone()


In [19]:
# Switch the model to training mode; as the last expression of the cell, the returned model is displayed.
model.train()

PeftModel(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
              (v_proj): lora.Li

In [14]:
# Enable gradient checkpointing, then make the input embeddings' outputs require grad.
# NOTE(review): the second call is presumably what lets gradients flow through the
# checkpointed blocks while the base weights are frozen — confirm against the library docs.
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

In [21]:
# Report trainable vs. total parameter counts as a sanity check on the adapter setup.
model.print_trainable_parameters()

trainable params: 4,194,304 || all params: 6,742,609,920 || trainable%: 0.0622


In [15]:
# Adam over the model's parameters.
opt = optim.Adam(
    model.parameters(),
    betas=(0.9, 0.99),
    lr=1e-4,
)

In [28]:
# import pandas
# dataset_train = pandas.read_csv('train_tokenized.csv')

In [31]:
# dataset_test = pandas.read_csv('test_tokenized.csv')

In [32]:
# dataset = dataset_train['tokenized_texts'].tolist() + dataset_test['tokenized_texts'].tolist()

In [16]:
import pickle

# SECURITY: unpickling can execute arbitrary code stored in the file — only load a
# dataset.pkl that you produced yourself.
# NOTE(review): the collate function treats each item of `dataset` as a sequence of
# token ids — confirm that this is what the pickle contains.
dataset = pickle.loads(Path('dataset.pkl').read_bytes())

In [None]:
# dataset

In [32]:
# One sequence per batch, shuffled; batch_to_tensors pads each batch and builds its attention mask.
dataloader = data.DataLoader(
    dataset,
    collate_fn=batch_to_tensors,
    batch_size=1,
    drop_last=True,
    shuffle=True,
)

In [33]:
# Hand model, optimizer and dataloader to the accelerator; the returned objects replace the originals.
model, opt, dataloader = accelerator.prepare(model, opt, dataloader)

In [20]:
# Step counter used for logging in the training loop; it is only reset when this cell is re-run.
i = 0

In [29]:
# Diagnostic cell: report the accelerator's device and list the visible CUDA devices.
# It reuses the `accelerator` configured earlier instead of building a second,
# default-configured Accelerator, and it uses its own loop variable so the training
# step counter `i` is left untouched.
device = accelerator.device
print(f"Using device: {device}")
num_cuda_devices = torch.cuda.device_count()
print(f"Device count: {num_cuda_devices}")

for device_idx in range(num_cuda_devices):
    print(f"Device {device_idx}: {torch.cuda.get_device_name(device_idx)}")


Using device: cuda
Device count: 4
Device 0: NVIDIA A100 80GB PCIe
Device 1: NVIDIA A100 80GB PCIe
Device 2: NVIDIA A100 80GB PCIe
Device 3: NVIDIA A100 80GB PCIe


In [None]:
# Training loop (one epoch): next-token prediction with a loss averaged over real tokens only.
for epoch in trange(1):
    for input_ids, attention_mask in tqdm(dataloader):
        with accelerator.accumulate(model):
            # Inputs drop the last position and targets drop the first, so that
            # position t is trained to predict token t + 1.
            shifted_inputs = input_ids[:, :-1]
            shifted_targets = input_ids[:, 1:]
            shifted_mask = attention_mask[:, :-1]

            outputs = model(shifted_inputs, attention_mask=shifted_mask, use_cache=False)

            # cross_entropy expects the class dimension second, hence the transpose
            # (assumes logits are laid out as (batch, position, vocab) — TODO confirm).
            losses = torch.nn.functional.cross_entropy(
                outputs.logits.transpose(-1, -2), shifted_targets, reduction="none"
            )

            # A position counts only if both its input token and its target are unpadded.
            mask = shifted_mask * attention_mask[:, 1:]
            n_counted = torch.sum(mask, dtype=torch.float32)
            loss = torch.sum(losses * mask, dtype=torch.float32) / n_counted

            accelerator.backward(loss)
            opt.step()
            opt.zero_grad()

            # Average the loss across processes for logging; `i` is the step counter set in an earlier cell.
            loss_global = accelerator.reduce(loss, "mean")
            print0(f"epoch: {epoch}, step: {i}, loss: {loss_global.item():g}")
            i += 1

  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/2754 [00:00<?, ?it/s][A
                                        
  0%|          | 0/1 [00:00<?, ?it/s]/s][A
  0%|          | 1/2754 [00:00<12:11,  3.76it/s][A

epoch: 0, step: 205, loss: 3.66617



                                                
  0%|          | 0/1 [00:00<?, ?it/s]  3.76it/s][A
  0%|          | 2/2754 [00:00<12:57,  3.54it/s][A

epoch: 0, step: 206, loss: 5.42273



                                                
  0%|          | 0/1 [00:00<?, ?it/s]  3.54it/s][A
  0%|          | 3/2754 [00:00<12:43,  3.60it/s][A

epoch: 0, step: 207, loss: 3.80678



                                                
  0%|          | 0/1 [00:01<?, ?it/s]  3.60it/s][A
  0%|          | 4/2754 [00:01<14:29,  3.16it/s][A

epoch: 0, step: 208, loss: 2.43614



                                                
  0%|          | 0/1 [00:01<?, ?it/s]  3.16it/s][A
  0%|          | 5/2754 [00:01<14:06,  3.25it/s][A

epoch: 0, step: 209, loss: 3.3106



                                                
  0%|          | 0/1 [00:01<?, ?it/s]  3.25it/s][A
  0%|          | 6/2754 [00:01<13:11,  3.47it/s][A

epoch: 0, step: 210, loss: 5.35378



                                                
  0%|          | 0/1 [00:01<?, ?it/s]  3.47it/s][A
  0%|          | 7/2754 [00:01<12:30,  3.66it/s][A

epoch: 0, step: 211, loss: 3.39202



                                                
  0%|          | 0/1 [00:02<?, ?it/s]  3.66it/s][A
  0%|          | 8/2754 [00:02<12:09,  3.77it/s][A

epoch: 0, step: 212, loss: 2.24307



                                                
  0%|          | 0/1 [00:02<?, ?it/s]  3.77it/s][A
  0%|          | 9/2754 [00:02<18:01,  2.54it/s][A

epoch: 0, step: 213, loss: 1.17834



                                                
  0%|          | 0/1 [00:03<?, ?it/s]  2.54it/s][A
  0%|          | 10/2754 [00:03<16:42,  2.74it/s][A

epoch: 0, step: 214, loss: 3.3422



                                                 
  0%|          | 0/1 [00:03<?, ?it/s],  2.74it/s][A
  0%|          | 11/2754 [00:03<15:23,  2.97it/s][A

epoch: 0, step: 215, loss: 3.2632



                                                 
  0%|          | 0/1 [00:03<?, ?it/s],  2.97it/s][A
  0%|          | 12/2754 [00:03<14:08,  3.23it/s][A

epoch: 0, step: 216, loss: 3.34055



                                                 
  0%|          | 0/1 [00:04<?, ?it/s],  3.23it/s][A
  0%|          | 13/2754 [00:04<13:53,  3.29it/s][A

epoch: 0, step: 217, loss: 3.10835



                                                 
  0%|          | 0/1 [00:04<?, ?it/s],  3.29it/s][A
  1%|          | 14/2754 [00:04<15:23,  2.97it/s][A

epoch: 0, step: 218, loss: 2.80999



                                                 
  0%|          | 0/1 [00:04<?, ?it/s],  2.97it/s][A
  1%|          | 15/2754 [00:04<14:37,  3.12it/s][A

epoch: 0, step: 219, loss: 3.85311



                                                 
  0%|          | 0/1 [00:04<?, ?it/s],  3.12it/s][A
  1%|          | 16/2754 [00:04<13:47,  3.31it/s][A

epoch: 0, step: 220, loss: 3.46706



                                                 
  0%|          | 0/1 [00:05<?, ?it/s],  3.31it/s][A
  1%|          | 17/2754 [00:05<13:06,  3.48it/s][A

epoch: 0, step: 221, loss: 3.84933



                                                 
  0%|          | 0/1 [00:05<?, ?it/s],  3.48it/s][A
  1%|          | 18/2754 [00:05<13:01,  3.50it/s][A

epoch: 0, step: 222, loss: 4.15187



                                                 
  0%|          | 0/1 [00:05<?, ?it/s],  3.50it/s][A
  1%|          | 19/2754 [00:05<12:13,  3.73it/s][A

epoch: 0, step: 223, loss: 4.04624



                                                 
  0%|          | 0/1 [00:05<?, ?it/s],  3.73it/s][A
  1%|          | 20/2754 [00:05<11:26,  3.98it/s][A

epoch: 0, step: 224, loss: 2.64567



                                                 
  0%|          | 0/1 [00:06<?, ?it/s],  3.98it/s][A
  1%|          | 21/2754 [00:06<11:43,  3.89it/s][A

epoch: 0, step: 225, loss: 3.58748



                                                 
  0%|          | 0/1 [00:06<?, ?it/s],  3.89it/s][A
  1%|          | 22/2754 [00:06<12:08,  3.75it/s][A

epoch: 0, step: 226, loss: 3.51779



                                                 
  0%|          | 0/1 [00:06<?, ?it/s],  3.75it/s][A
  1%|          | 23/2754 [00:06<11:50,  3.85it/s][A

epoch: 0, step: 227, loss: 2.38848



                                                 
  0%|          | 0/1 [00:07<?, ?it/s],  3.85it/s][A
  1%|          | 24/2754 [00:07<13:41,  3.32it/s][A

epoch: 0, step: 228, loss: 3.32658



                                                 
  0%|          | 0/1 [00:07<?, ?it/s],  3.32it/s][A
  1%|          | 25/2754 [00:07<13:12,  3.44it/s][A

epoch: 0, step: 229, loss: 3.75199



                                                 
  0%|          | 0/1 [00:07<?, ?it/s],  3.44it/s][A
  1%|          | 26/2754 [00:07<12:33,  3.62it/s][A

epoch: 0, step: 230, loss: 1.97323



                                                 
  0%|          | 0/1 [00:07<?, ?it/s],  3.62it/s][A
  1%|          | 27/2754 [00:07<12:04,  3.77it/s][A

epoch: 0, step: 231, loss: 1.56016



                                                 
  0%|          | 0/1 [00:08<?, ?it/s],  3.77it/s][A
  1%|          | 28/2754 [00:08<11:49,  3.84it/s][A

epoch: 0, step: 232, loss: 3.35227



                                                 
  0%|          | 0/1 [00:08<?, ?it/s],  3.84it/s][A
  1%|          | 29/2754 [00:08<12:08,  3.74it/s][A

epoch: 0, step: 233, loss: 2.99571



                                                 
  0%|          | 0/1 [00:08<?, ?it/s],  3.74it/s][A
  1%|          | 30/2754 [00:08<11:53,  3.82it/s][A

epoch: 0, step: 234, loss: 3.99366



                                                 
  0%|          | 0/1 [00:08<?, ?it/s],  3.82it/s][A
  1%|          | 31/2754 [00:08<11:48,  3.84it/s][A

epoch: 0, step: 235, loss: 4.49229



                                                 
  0%|          | 0/1 [00:09<?, ?it/s],  3.84it/s][A
  1%|          | 32/2754 [00:09<12:04,  3.76it/s][A

epoch: 0, step: 236, loss: 4.22028



                                                 
  0%|          | 0/1 [00:09<?, ?it/s],  3.76it/s][A
  1%|          | 33/2754 [00:09<11:59,  3.78it/s][A

epoch: 0, step: 237, loss: 3.64427



                                                 
  0%|          | 0/1 [00:09<?, ?it/s],  3.78it/s][A
  1%|          | 34/2754 [00:09<11:42,  3.87it/s][A

epoch: 0, step: 238, loss: 2.72684



                                                 
  0%|          | 0/1 [00:09<?, ?it/s],  3.87it/s][A
  1%|▏         | 35/2754 [00:09<11:42,  3.87it/s][A

epoch: 0, step: 239, loss: 3.27409



                                                 
  0%|          | 0/1 [00:10<?, ?it/s],  3.87it/s][A
  1%|▏         | 36/2754 [00:10<11:47,  3.84it/s][A

epoch: 0, step: 240, loss: 4.13316



                                                 
  0%|          | 0/1 [00:10<?, ?it/s],  3.84it/s][A
  1%|▏         | 37/2754 [00:10<11:32,  3.92it/s][A

epoch: 0, step: 241, loss: 4.72855



                                                 
  0%|          | 0/1 [00:10<?, ?it/s],  3.92it/s][A
  1%|▏         | 38/2754 [00:10<11:28,  3.95it/s][A

epoch: 0, step: 242, loss: 3.62307



                                                 
  0%|          | 0/1 [00:11<?, ?it/s],  3.95it/s][A
  1%|▏         | 39/2754 [00:11<11:27,  3.95it/s][A

epoch: 0, step: 243, loss: 3.03644



                                                 
  0%|          | 0/1 [00:11<?, ?it/s],  3.95it/s][A
  1%|▏         | 40/2754 [00:11<11:15,  4.02it/s][A

epoch: 0, step: 244, loss: 3.03559



                                                 
  0%|          | 0/1 [00:11<?, ?it/s],  4.02it/s][A
  1%|▏         | 41/2754 [00:11<11:24,  3.97it/s][A

epoch: 0, step: 245, loss: 4.44678



                                                 
  0%|          | 0/1 [00:11<?, ?it/s],  3.97it/s][A
  2%|▏         | 42/2754 [00:11<11:51,  3.81it/s][A

epoch: 0, step: 246, loss: 3.73679



                                                 
  0%|          | 0/1 [00:12<?, ?it/s],  3.81it/s][A
  2%|▏         | 43/2754 [00:12<11:41,  3.86it/s][A

epoch: 0, step: 247, loss: 3.08996



                                                 
  0%|          | 0/1 [00:12<?, ?it/s],  3.86it/s][A
  2%|▏         | 44/2754 [00:12<11:15,  4.01it/s][A

epoch: 0, step: 248, loss: 3.6421



                                                 
  0%|          | 0/1 [00:12<?, ?it/s],  4.01it/s][A
  2%|▏         | 45/2754 [00:12<11:26,  3.94it/s][A

epoch: 0, step: 249, loss: 3.71057



                                                 
  0%|          | 0/1 [00:12<?, ?it/s],  3.94it/s][A
  2%|▏         | 46/2754 [00:12<11:04,  4.07it/s][A

epoch: 0, step: 250, loss: 4.03333



                                                 
  0%|          | 0/1 [00:13<?, ?it/s],  4.07it/s][A
  2%|▏         | 47/2754 [00:13<11:31,  3.92it/s][A

epoch: 0, step: 251, loss: 3.90862



                                                 
  0%|          | 0/1 [00:13<?, ?it/s],  3.92it/s][A
  2%|▏         | 48/2754 [00:13<11:34,  3.89it/s][A

epoch: 0, step: 252, loss: 1.18782



                                                 
  0%|          | 0/1 [00:13<?, ?it/s],  3.89it/s][A
  2%|▏         | 49/2754 [00:13<11:18,  3.99it/s][A

epoch: 0, step: 253, loss: 3.88649



                                                 
  0%|          | 0/1 [00:13<?, ?it/s],  3.99it/s][A
  2%|▏         | 50/2754 [00:13<11:31,  3.91it/s][A

epoch: 0, step: 254, loss: 3.10294



                                                 
  0%|          | 0/1 [00:14<?, ?it/s],  3.91it/s][A
  2%|▏         | 51/2754 [00:14<11:31,  3.91it/s][A

epoch: 0, step: 255, loss: 4.68226



                                                 
  0%|          | 0/1 [00:14<?, ?it/s],  3.91it/s][A
  2%|▏         | 52/2754 [00:14<11:22,  3.96it/s][A

epoch: 0, step: 256, loss: 3.27737



                                                 
  0%|          | 0/1 [00:14<?, ?it/s],  3.96it/s][A
  2%|▏         | 53/2754 [00:14<11:09,  4.03it/s][A

epoch: 0, step: 257, loss: 3.88352



                                                 
  0%|          | 0/1 [00:14<?, ?it/s],  4.03it/s][A
  2%|▏         | 54/2754 [00:14<12:21,  3.64it/s][A

epoch: 0, step: 258, loss: 3.34372



                                                 
  0%|          | 0/1 [00:15<?, ?it/s],  3.64it/s][A
  2%|▏         | 55/2754 [00:15<11:49,  3.80it/s][A

epoch: 0, step: 259, loss: 4.15517



                                                 
  0%|          | 0/1 [00:15<?, ?it/s],  3.80it/s][A
  2%|▏         | 56/2754 [00:15<11:26,  3.93it/s][A

epoch: 0, step: 260, loss: 2.71872



                                                 
  0%|          | 0/1 [00:15<?, ?it/s],  3.93it/s][A
  2%|▏         | 57/2754 [00:15<11:48,  3.81it/s][A

epoch: 0, step: 261, loss: 2.0941



                                                 
  0%|          | 0/1 [00:15<?, ?it/s],  3.81it/s][A
  2%|▏         | 58/2754 [00:15<11:32,  3.89it/s][A

epoch: 0, step: 262, loss: 4.57063



                                                 
  0%|          | 0/1 [00:16<?, ?it/s],  3.89it/s][A
  2%|▏         | 59/2754 [00:16<11:37,  3.87it/s][A

epoch: 0, step: 263, loss: 4.07173



                                                 
  0%|          | 0/1 [00:16<?, ?it/s],  3.87it/s][A
  2%|▏         | 60/2754 [00:16<11:14,  3.99it/s][A

epoch: 0, step: 264, loss: 2.80647



                                                 
  0%|          | 0/1 [00:16<?, ?it/s],  3.99it/s][A
  2%|▏         | 61/2754 [00:16<11:46,  3.81it/s][A

epoch: 0, step: 265, loss: 3.94362



                                                 
  0%|          | 0/1 [00:16<?, ?it/s],  3.81it/s][A
  2%|▏         | 62/2754 [00:16<12:52,  3.48it/s][A

epoch: 0, step: 266, loss: 3.29372



                                                 
  0%|          | 0/1 [00:17<?, ?it/s],  3.48it/s][A
  2%|▏         | 63/2754 [00:17<12:36,  3.56it/s][A

epoch: 0, step: 267, loss: 3.54486



                                                 
  0%|          | 0/1 [00:17<?, ?it/s],  3.56it/s][A
  2%|▏         | 64/2754 [00:17<12:40,  3.54it/s][A

epoch: 0, step: 268, loss: 3.41058



                                                 
  0%|          | 0/1 [00:17<?, ?it/s],  3.54it/s][A
  2%|▏         | 65/2754 [00:17<12:15,  3.65it/s][A

epoch: 0, step: 269, loss: 4.93275



                                                 
  0%|          | 0/1 [00:18<?, ?it/s],  3.65it/s][A
  2%|▏         | 66/2754 [00:18<11:47,  3.80it/s][A

epoch: 0, step: 270, loss: 4.52947



                                                 
  0%|          | 0/1 [00:18<?, ?it/s],  3.80it/s][A
  2%|▏         | 67/2754 [00:18<11:35,  3.86it/s][A

epoch: 0, step: 271, loss: 4.61609



                                                 
  0%|          | 0/1 [00:18<?, ?it/s],  3.86it/s][A
  2%|▏         | 68/2754 [00:18<11:21,  3.94it/s][A

epoch: 0, step: 272, loss: 3.64834



                                                 
  0%|          | 0/1 [00:18<?, ?it/s],  3.94it/s][A
  3%|▎         | 69/2754 [00:18<11:34,  3.87it/s][A

epoch: 0, step: 273, loss: 4.60011



                                                 
  0%|          | 0/1 [00:19<?, ?it/s],  3.87it/s][A
  3%|▎         | 70/2754 [00:19<11:25,  3.92it/s][A

epoch: 0, step: 274, loss: 1.88901



                                                 
  0%|          | 0/1 [00:19<?, ?it/s],  3.92it/s][A
  3%|▎         | 71/2754 [00:19<11:22,  3.93it/s][A

epoch: 0, step: 275, loss: 2.51918



                                                 
  0%|          | 0/1 [00:19<?, ?it/s],  3.93it/s][A
  3%|▎         | 72/2754 [00:19<11:37,  3.85it/s][A

epoch: 0, step: 276, loss: 1.48503



                                                 
  0%|          | 0/1 [00:19<?, ?it/s],  3.85it/s][A
  3%|▎         | 73/2754 [00:19<11:45,  3.80it/s][A

epoch: 0, step: 277, loss: 3.99357



                                                 
  0%|          | 0/1 [00:20<?, ?it/s],  3.80it/s][A
  3%|▎         | 74/2754 [00:20<12:18,  3.63it/s][A

epoch: 0, step: 278, loss: 4.17916



                                                 
  0%|          | 0/1 [00:20<?, ?it/s],  3.63it/s][A
  3%|▎         | 75/2754 [00:20<12:35,  3.54it/s][A

epoch: 0, step: 279, loss: 2.09533



                                                 
  0%|          | 0/1 [00:20<?, ?it/s],  3.54it/s][A
  3%|▎         | 76/2754 [00:20<11:53,  3.76it/s][A

epoch: 0, step: 280, loss: 3.3869



                                                 
  0%|          | 0/1 [00:20<?, ?it/s],  3.76it/s][A
  3%|▎         | 77/2754 [00:20<12:16,  3.64it/s][A

epoch: 0, step: 281, loss: 3.79856



                                                 
  0%|          | 0/1 [00:21<?, ?it/s],  3.64it/s][A
  3%|▎         | 78/2754 [00:21<11:51,  3.76it/s][A

epoch: 0, step: 282, loss: 3.54153



                                                 
  0%|          | 0/1 [00:21<?, ?it/s],  3.76it/s][A
  3%|▎         | 79/2754 [00:21<11:43,  3.80it/s][A

epoch: 0, step: 283, loss: 2.2063



                                                 
  0%|          | 0/1 [00:21<?, ?it/s],  3.80it/s][A
  3%|▎         | 80/2754 [00:21<11:25,  3.90it/s][A

epoch: 0, step: 284, loss: 4.02467



                                                 
  0%|          | 0/1 [00:21<?, ?it/s],  3.90it/s][A
  3%|▎         | 81/2754 [00:21<11:08,  4.00it/s][A

epoch: 0, step: 285, loss: 3.17518



                                                 
  0%|          | 0/1 [00:22<?, ?it/s],  4.00it/s][A
  3%|▎         | 82/2754 [00:22<11:01,  4.04it/s][A

epoch: 0, step: 286, loss: 3.90108



                                                 
  0%|          | 0/1 [00:22<?, ?it/s],  4.04it/s][A
  3%|▎         | 83/2754 [00:22<11:17,  3.94it/s][A

epoch: 0, step: 287, loss: 4.16667



                                                 
  0%|          | 0/1 [00:22<?, ?it/s],  3.94it/s][A
  3%|▎         | 84/2754 [00:22<11:02,  4.03it/s][A

epoch: 0, step: 288, loss: 5.39947



                                                 
  0%|          | 0/1 [00:22<?, ?it/s],  4.03it/s][A
  3%|▎         | 85/2754 [00:22<11:30,  3.87it/s][A

epoch: 0, step: 289, loss: 3.20376



                                                 
  0%|          | 0/1 [00:23<?, ?it/s],  3.87it/s][A
  3%|▎         | 86/2754 [00:23<11:28,  3.87it/s][A

epoch: 0, step: 290, loss: 5.92516



                                                 
  0%|          | 0/1 [00:23<?, ?it/s],  3.87it/s][A
  3%|▎         | 87/2754 [00:23<11:17,  3.94it/s][A

epoch: 0, step: 291, loss: 4.40173



                                                 
  0%|          | 0/1 [00:23<?, ?it/s],  3.94it/s][A
  3%|▎         | 88/2754 [00:23<11:57,  3.72it/s][A

epoch: 0, step: 292, loss: 3.67326



                                                 
  0%|          | 0/1 [00:24<?, ?it/s],  3.72it/s][A
  3%|▎         | 89/2754 [00:24<12:12,  3.64it/s][A

epoch: 0, step: 293, loss: 3.05385



                                                 
  0%|          | 0/1 [00:24<?, ?it/s],  3.64it/s][A
  3%|▎         | 90/2754 [00:24<11:43,  3.79it/s][A

epoch: 0, step: 294, loss: 6.83824



                                                 
  0%|          | 0/1 [00:24<?, ?it/s],  3.79it/s][A
  3%|▎         | 91/2754 [00:24<11:24,  3.89it/s][A

epoch: 0, step: 295, loss: 3.01629



                                                 
  0%|          | 0/1 [00:24<?, ?it/s],  3.89it/s][A
  3%|▎         | 92/2754 [00:24<11:46,  3.77it/s][A

epoch: 0, step: 296, loss: 2.8877



                                                 
  0%|          | 0/1 [00:25<?, ?it/s],  3.77it/s][A
  3%|▎         | 93/2754 [00:25<11:45,  3.77it/s][A

epoch: 0, step: 297, loss: 3.60201



                                                 
  0%|          | 0/1 [00:25<?, ?it/s],  3.77it/s][A
  3%|▎         | 94/2754 [00:25<13:48,  3.21it/s][A

epoch: 0, step: 298, loss: 3.32225



                                                 
  0%|          | 0/1 [00:25<?, ?it/s],  3.21it/s][A
  3%|▎         | 95/2754 [00:25<14:03,  3.15it/s][A

epoch: 0, step: 299, loss: 3.68336



                                                 
  0%|          | 0/1 [00:26<?, ?it/s],  3.15it/s][A
  3%|▎         | 96/2754 [00:26<13:40,  3.24it/s][A

epoch: 0, step: 300, loss: 3.31522



                                                 
  0%|          | 0/1 [00:26<?, ?it/s],  3.24it/s][A
  4%|▎         | 97/2754 [00:26<12:52,  3.44it/s][A

epoch: 0, step: 301, loss: 4.12134



                                                 
  0%|          | 0/1 [00:26<?, ?it/s],  3.44it/s][A
  4%|▎         | 98/2754 [00:26<11:57,  3.70it/s][A

epoch: 0, step: 302, loss: 3.46809



                                                 
  0%|          | 0/1 [00:26<?, ?it/s],  3.70it/s][A
  4%|▎         | 99/2754 [00:26<11:56,  3.70it/s][A

epoch: 0, step: 303, loss: 3.62503



                                                 
  0%|          | 0/1 [00:27<?, ?it/s],  3.70it/s][A
  4%|▎         | 100/2754 [00:27<12:19,  3.59it/s][A

epoch: 0, step: 304, loss: 3.42162



                                                  
  0%|          | 0/1 [00:27<?, ?it/s]9,  3.59it/s][A
  4%|▎         | 101/2754 [00:27<12:26,  3.55it/s][A

epoch: 0, step: 305, loss: 3.50941



                                                  
  0%|          | 0/1 [00:27<?, ?it/s]6,  3.55it/s][A
  4%|▎         | 102/2754 [00:27<15:36,  2.83it/s][A

epoch: 0, step: 306, loss: 3.40481



                                                  
  0%|          | 0/1 [00:28<?, ?it/s]6,  2.83it/s][A
  4%|▎         | 103/2754 [00:28<14:14,  3.10it/s][A

epoch: 0, step: 307, loss: 5.63833



                                                  
  0%|          | 0/1 [00:28<?, ?it/s]4,  3.10it/s][A
  4%|▍         | 104/2754 [00:28<13:20,  3.31it/s][A

epoch: 0, step: 308, loss: 6.26277



                                                  
  0%|          | 0/1 [00:28<?, ?it/s]0,  3.31it/s][A
  4%|▍         | 105/2754 [00:28<12:36,  3.50it/s][A

epoch: 0, step: 309, loss: 4.19889



                                                  
  0%|          | 0/1 [00:29<?, ?it/s]6,  3.50it/s][A
  4%|▍         | 106/2754 [00:29<12:17,  3.59it/s][A

epoch: 0, step: 310, loss: 2.58846



                                                  
  0%|          | 0/1 [00:29<?, ?it/s]7,  3.59it/s][A
  4%|▍         | 107/2754 [00:29<12:02,  3.66it/s][A

epoch: 0, step: 311, loss: 4.29682



                                                  
  0%|          | 0/1 [00:29<?, ?it/s]2,  3.66it/s][A
  4%|▍         | 108/2754 [00:29<11:41,  3.77it/s][A

epoch: 0, step: 312, loss: 3.14754



                                                  
  0%|          | 0/1 [00:29<?, ?it/s]1,  3.77it/s][A
  4%|▍         | 109/2754 [00:29<11:32,  3.82it/s][A

epoch: 0, step: 313, loss: 4.4692



                                                  
  0%|          | 0/1 [00:30<?, ?it/s]2,  3.82it/s][A
  4%|▍         | 110/2754 [00:30<11:24,  3.86it/s][A

epoch: 0, step: 314, loss: 4.54924



                                                  
  0%|          | 0/1 [00:30<?, ?it/s]4,  3.86it/s][A
  4%|▍         | 111/2754 [00:30<11:19,  3.89it/s][A

epoch: 0, step: 315, loss: 3.34663



                                                  
  0%|          | 0/1 [00:30<?, ?it/s]9,  3.89it/s][A
  4%|▍         | 112/2754 [00:30<11:14,  3.92it/s][A

epoch: 0, step: 316, loss: 2.8009



                                                  
  0%|          | 0/1 [00:30<?, ?it/s]4,  3.92it/s][A
  4%|▍         | 113/2754 [00:30<12:36,  3.49it/s][A

epoch: 0, step: 317, loss: 2.83788



                                                  
  0%|          | 0/1 [00:31<?, ?it/s]6,  3.49it/s][A
  4%|▍         | 114/2754 [00:31<12:04,  3.64it/s][A

epoch: 0, step: 318, loss: 3.03549



                                                  
  0%|          | 0/1 [00:31<?, ?it/s]4,  3.64it/s][A
  4%|▍         | 115/2754 [00:31<13:47,  3.19it/s][A

epoch: 0, step: 319, loss: 3.74989



                                                  
  0%|          | 0/1 [00:31<?, ?it/s]7,  3.19it/s][A
  4%|▍         | 116/2754 [00:31<14:04,  3.12it/s][A

epoch: 0, step: 320, loss: 2.38716



                                                  
  0%|          | 0/1 [00:32<?, ?it/s]4,  3.12it/s][A
  4%|▍         | 117/2754 [00:32<13:07,  3.35it/s][A

epoch: 0, step: 321, loss: 2.47



                                                  
  0%|          | 0/1 [00:32<?, ?it/s]7,  3.35it/s][A
  4%|▍         | 118/2754 [00:32<14:47,  2.97it/s][A

epoch: 0, step: 322, loss: 1.50511



                                                  
  0%|          | 0/1 [00:32<?, ?it/s]7,  2.97it/s][A
  4%|▍         | 119/2754 [00:32<14:59,  2.93it/s][A

epoch: 0, step: 323, loss: 2.63175



                                                  
  0%|          | 0/1 [00:33<?, ?it/s]9,  2.93it/s][A
  4%|▍         | 120/2754 [00:33<13:33,  3.24it/s][A

epoch: 0, step: 324, loss: 4.70768



                                                  
  0%|          | 0/1 [00:33<?, ?it/s]3,  3.24it/s][A
  4%|▍         | 121/2754 [00:33<12:41,  3.46it/s][A

epoch: 0, step: 325, loss: 3.26793



                                                  
  0%|          | 0/1 [00:33<?, ?it/s]1,  3.46it/s][A
  4%|▍         | 122/2754 [00:33<11:56,  3.67it/s][A

epoch: 0, step: 326, loss: 4.11758



                                                  
  0%|          | 0/1 [00:33<?, ?it/s]6,  3.67it/s][A
  4%|▍         | 123/2754 [00:33<11:37,  3.77it/s][A

epoch: 0, step: 327, loss: 2.75991



                                                  
  0%|          | 0/1 [00:34<?, ?it/s]7,  3.77it/s][A
  5%|▍         | 124/2754 [00:34<11:24,  3.84it/s][A

epoch: 0, step: 328, loss: 4.29884



                                                  
  0%|          | 0/1 [00:34<?, ?it/s]4,  3.84it/s][A
  5%|▍         | 125/2754 [00:34<11:36,  3.77it/s][A

epoch: 0, step: 329, loss: 4.00476



                                                  
  0%|          | 0/1 [00:34<?, ?it/s]6,  3.77it/s][A
  5%|▍         | 126/2754 [00:34<11:28,  3.82it/s][A

epoch: 0, step: 330, loss: 3.44235



                                                  
  0%|          | 0/1 [00:34<?, ?it/s]8,  3.82it/s][A
  5%|▍         | 127/2754 [00:34<11:19,  3.87it/s][A

epoch: 0, step: 331, loss: 2.54827



                                                  
  0%|          | 0/1 [00:35<?, ?it/s]9,  3.87it/s][A
  5%|▍         | 128/2754 [00:35<11:15,  3.89it/s][A

epoch: 0, step: 332, loss: 4.23158



                                                  
  0%|          | 0/1 [00:35<?, ?it/s]5,  3.89it/s][A
  5%|▍         | 129/2754 [00:35<11:02,  3.96it/s][A

epoch: 0, step: 333, loss: 6.52653



                                                  
  0%|          | 0/1 [00:35<?, ?it/s]2,  3.96it/s][A
  5%|▍         | 130/2754 [00:35<10:46,  4.06it/s][A

epoch: 0, step: 334, loss: 4.28433



                                                  
  0%|          | 0/1 [00:35<?, ?it/s]6,  4.06it/s][A
  5%|▍         | 131/2754 [00:35<10:32,  4.15it/s][A

epoch: 0, step: 335, loss: 3.08851



                                                  
  0%|          | 0/1 [00:36<?, ?it/s]2,  4.15it/s][A
  5%|▍         | 132/2754 [00:36<11:09,  3.92it/s][A

epoch: 0, step: 336, loss: 2.96878



                                                  
  0%|          | 0/1 [00:36<?, ?it/s]9,  3.92it/s][A
  5%|▍         | 133/2754 [00:36<11:28,  3.81it/s][A

epoch: 0, step: 337, loss: 4.18248



                                                  
  0%|          | 0/1 [00:36<?, ?it/s]8,  3.81it/s][A
  5%|▍         | 134/2754 [00:36<11:15,  3.88it/s][A

epoch: 0, step: 338, loss: 2.63918



                                                  
  0%|          | 0/1 [00:36<?, ?it/s]5,  3.88it/s][A
  5%|▍         | 135/2754 [00:36<11:11,  3.90it/s][A

epoch: 0, step: 339, loss: 2.81373



                                                  
  0%|          | 0/1 [00:37<?, ?it/s]1,  3.90it/s][A
  5%|▍         | 136/2754 [00:37<12:38,  3.45it/s][A

epoch: 0, step: 340, loss: 2.58313



                                                  
  0%|          | 0/1 [00:37<?, ?it/s]8,  3.45it/s][A
  5%|▍         | 137/2754 [00:37<12:20,  3.53it/s][A

epoch: 0, step: 341, loss: 2.8152



                                                  
  0%|          | 0/1 [00:37<?, ?it/s]0,  3.53it/s][A
  5%|▌         | 138/2754 [00:37<12:20,  3.53it/s][A

epoch: 0, step: 342, loss: 3.32677



                                                  
  0%|          | 0/1 [00:38<?, ?it/s]0,  3.53it/s][A
  5%|▌         | 139/2754 [00:38<12:02,  3.62it/s][A

epoch: 0, step: 343, loss: 4.62548



                                                  
  0%|          | 0/1 [00:38<?, ?it/s]2,  3.62it/s][A
  5%|▌         | 140/2754 [00:38<12:05,  3.60it/s][A

epoch: 0, step: 344, loss: 1.7856



                                                  
  0%|          | 0/1 [00:38<?, ?it/s]5,  3.60it/s][A
  5%|▌         | 141/2754 [00:38<11:36,  3.75it/s][A

epoch: 0, step: 345, loss: 2.87994



                                                  
  0%|          | 0/1 [00:38<?, ?it/s]6,  3.75it/s][A
  5%|▌         | 142/2754 [00:38<11:58,  3.64it/s][A

epoch: 0, step: 346, loss: 3.30511



                                                  
  0%|          | 0/1 [00:39<?, ?it/s]8,  3.64it/s][A
  5%|▌         | 143/2754 [00:39<11:39,  3.73it/s][A

epoch: 0, step: 347, loss: 3.40546



                                                  
  0%|          | 0/1 [00:39<?, ?it/s]9,  3.73it/s][A
  5%|▌         | 144/2754 [00:39<12:07,  3.59it/s][A

epoch: 0, step: 348, loss: 1.803



                                                  
  0%|          | 0/1 [00:39<?, ?it/s]7,  3.59it/s][A
  5%|▌         | 145/2754 [00:39<12:01,  3.61it/s][A

epoch: 0, step: 349, loss: 2.43636



                                                  
  0%|          | 0/1 [00:40<?, ?it/s]1,  3.61it/s][A
  5%|▌         | 146/2754 [00:40<12:12,  3.56it/s][A

epoch: 0, step: 350, loss: 3.57118



                                                  
  0%|          | 0/1 [00:40<?, ?it/s]2,  3.56it/s][A
  5%|▌         | 147/2754 [00:40<12:25,  3.50it/s][A

epoch: 0, step: 351, loss: 3.43656



                                                  
  0%|          | 0/1 [00:40<?, ?it/s]5,  3.50it/s][A
  5%|▌         | 148/2754 [00:40<11:59,  3.62it/s][A

epoch: 0, step: 352, loss: 4.15267



                                                  
  0%|          | 0/1 [00:40<?, ?it/s]9,  3.62it/s][A
  5%|▌         | 149/2754 [00:40<11:38,  3.73it/s][A

epoch: 0, step: 353, loss: 5.50641



                                                  
  0%|          | 0/1 [00:41<?, ?it/s]8,  3.73it/s][A
  5%|▌         | 150/2754 [00:41<11:26,  3.79it/s][A

epoch: 0, step: 354, loss: 4.16922



                                                  
  0%|          | 0/1 [00:41<?, ?it/s]6,  3.79it/s][A
  5%|▌         | 151/2754 [00:41<11:40,  3.72it/s][A

epoch: 0, step: 355, loss: 3.42682



                                                  
  0%|          | 0/1 [00:41<?, ?it/s]0,  3.72it/s][A
  6%|▌         | 152/2754 [00:41<11:36,  3.74it/s][A

epoch: 0, step: 356, loss: 3.61017



                                                  
  0%|          | 0/1 [00:41<?, ?it/s]6,  3.74it/s][A
  6%|▌         | 153/2754 [00:41<11:52,  3.65it/s][A

epoch: 0, step: 357, loss: 3.56195



                                                  
  0%|          | 0/1 [00:42<?, ?it/s]2,  3.65it/s][A
  6%|▌         | 154/2754 [00:42<11:23,  3.80it/s][A

epoch: 0, step: 358, loss: 2.97373



                                                  
  0%|          | 0/1 [00:42<?, ?it/s]3,  3.80it/s][A
  6%|▌         | 155/2754 [00:42<11:27,  3.78it/s][A

epoch: 0, step: 359, loss: 1.69608



                                                  
  0%|          | 0/1 [00:42<?, ?it/s]7,  3.78it/s][A
  6%|▌         | 156/2754 [00:42<11:29,  3.77it/s][A

epoch: 0, step: 360, loss: 3.58631



                                                  
  0%|          | 0/1 [00:42<?, ?it/s]9,  3.77it/s][A
  6%|▌         | 157/2754 [00:42<11:22,  3.81it/s][A

epoch: 0, step: 361, loss: 2.55297



                                                  
  0%|          | 0/1 [00:43<?, ?it/s]2,  3.81it/s][A
  6%|▌         | 158/2754 [00:43<11:11,  3.87it/s][A

epoch: 0, step: 362, loss: 3.78242



                                                  
  0%|          | 0/1 [00:43<?, ?it/s]1,  3.87it/s][A
  6%|▌         | 159/2754 [00:43<13:32,  3.19it/s][A

epoch: 0, step: 363, loss: 2.83898



                                                  
  0%|          | 0/1 [00:43<?, ?it/s]2,  3.19it/s][A
  6%|▌         | 160/2754 [00:43<13:33,  3.19it/s][A

epoch: 0, step: 364, loss: 3.37111



                                                  
  0%|          | 0/1 [00:44<?, ?it/s]3,  3.19it/s][A
  6%|▌         | 161/2754 [00:44<12:30,  3.45it/s][A

epoch: 0, step: 365, loss: 4.23903



                                                  
  0%|          | 0/1 [00:44<?, ?it/s]0,  3.45it/s][A
  6%|▌         | 162/2754 [00:44<11:48,  3.66it/s][A

epoch: 0, step: 366, loss: 2.99998



                                                  
  0%|          | 0/1 [00:45<?, ?it/s]8,  3.66it/s][A
  6%|▌         | 163/2754 [00:45<21:36,  2.00it/s][A

epoch: 0, step: 367, loss: 2.74293



                                                  
  0%|          | 0/1 [00:46<?, ?it/s]6,  2.00it/s][A
  6%|▌         | 164/2754 [00:46<24:02,  1.80it/s][A

epoch: 0, step: 368, loss: 2.91123



                                                  
  0%|          | 0/1 [00:46<?, ?it/s]2,  1.80it/s][A
  6%|▌         | 165/2754 [00:46<20:00,  2.16it/s][A

epoch: 0, step: 369, loss: 2.46685



                                                  
  0%|          | 0/1 [00:46<?, ?it/s]0,  2.16it/s][A
  6%|▌         | 166/2754 [00:46<17:14,  2.50it/s][A

epoch: 0, step: 370, loss: 0.642047



                                                  
  0%|          | 0/1 [00:46<?, ?it/s]4,  2.50it/s][A
  6%|▌         | 167/2754 [00:46<15:35,  2.76it/s][A

epoch: 0, step: 371, loss: 3.48201



                                                  
  0%|          | 0/1 [00:47<?, ?it/s]5,  2.76it/s][A
  6%|▌         | 168/2754 [00:47<14:04,  3.06it/s][A

epoch: 0, step: 372, loss: 3.26691



                                                  
  0%|          | 0/1 [00:47<?, ?it/s]4,  3.06it/s][A
  6%|▌         | 169/2754 [00:47<13:08,  3.28it/s][A

epoch: 0, step: 373, loss: 2.77259



                                                  
  0%|          | 0/1 [00:47<?, ?it/s]8,  3.28it/s][A
  6%|▌         | 170/2754 [00:47<12:28,  3.45it/s][A

epoch: 0, step: 374, loss: 2.36162



                                                  
  0%|          | 0/1 [00:47<?, ?it/s]8,  3.45it/s][A
  6%|▌         | 171/2754 [00:47<11:56,  3.61it/s][A

epoch: 0, step: 375, loss: 4.3228



                                                  
  0%|          | 0/1 [00:48<?, ?it/s]6,  3.61it/s][A
  6%|▌         | 172/2754 [00:48<11:34,  3.72it/s][A

epoch: 0, step: 376, loss: 4.60615



                                                  
  0%|          | 0/1 [00:48<?, ?it/s]4,  3.72it/s][A
  6%|▋         | 173/2754 [00:48<11:32,  3.73it/s][A

epoch: 0, step: 377, loss: 2.87302



                                                  
  0%|          | 0/1 [00:48<?, ?it/s]2,  3.73it/s][A
  6%|▋         | 174/2754 [00:48<11:27,  3.76it/s][A

epoch: 0, step: 378, loss: 3.57969



                                                  
  0%|          | 0/1 [00:48<?, ?it/s]7,  3.76it/s][A
  6%|▋         | 175/2754 [00:48<11:40,  3.68it/s][A

epoch: 0, step: 379, loss: 3.16934



                                                  
  0%|          | 0/1 [00:49<?, ?it/s]0,  3.68it/s][A
  6%|▋         | 176/2754 [00:49<11:37,  3.70it/s][A

epoch: 0, step: 380, loss: 3.80949



                                                  
  0%|          | 0/1 [00:49<?, ?it/s]7,  3.70it/s][A
  6%|▋         | 177/2754 [00:49<11:07,  3.86it/s][A

epoch: 0, step: 381, loss: 2.51054



                                                  
  0%|          | 0/1 [00:49<?, ?it/s]7,  3.86it/s][A
  6%|▋         | 178/2754 [00:49<11:32,  3.72it/s][A

epoch: 0, step: 382, loss: 3.62357



                                                  
  0%|          | 0/1 [00:50<?, ?it/s]2,  3.72it/s][A
  6%|▋         | 179/2754 [00:50<11:31,  3.72it/s][A

epoch: 0, step: 383, loss: 2.69603



                                                  
  0%|          | 0/1 [00:50<?, ?it/s]1,  3.72it/s][A
  7%|▋         | 180/2754 [00:50<11:04,  3.87it/s][A

epoch: 0, step: 384, loss: 3.93412



                                                  
  0%|          | 0/1 [00:50<?, ?it/s]4,  3.87it/s][A
  7%|▋         | 181/2754 [00:50<13:56,  3.08it/s][A

epoch: 0, step: 385, loss: 3.36212



                                                  
  0%|          | 0/1 [00:51<?, ?it/s]6,  3.08it/s][A
  7%|▋         | 182/2754 [00:51<13:28,  3.18it/s][A

epoch: 0, step: 386, loss: 3.1912



                                                  
  0%|          | 0/1 [00:51<?, ?it/s]8,  3.18it/s][A
  7%|▋         | 183/2754 [00:51<13:10,  3.25it/s][A

epoch: 0, step: 387, loss: 3.40212



                                                  
  0%|          | 0/1 [00:51<?, ?it/s]0,  3.25it/s][A
  7%|▋         | 184/2754 [00:51<12:20,  3.47it/s][A

epoch: 0, step: 388, loss: 2.83964



                                                  
  0%|          | 0/1 [00:51<?, ?it/s]0,  3.47it/s][A
  7%|▋         | 185/2754 [00:51<12:04,  3.55it/s][A

epoch: 0, step: 389, loss: 3.91988



                                                  
  0%|          | 0/1 [00:52<?, ?it/s]4,  3.55it/s][A
  7%|▋         | 186/2754 [00:52<12:10,  3.52it/s][A

epoch: 0, step: 390, loss: 3.7436



                                                  
  0%|          | 0/1 [00:52<?, ?it/s]0,  3.52it/s][A
  7%|▋         | 187/2754 [00:52<18:30,  2.31it/s][A

epoch: 0, step: 391, loss: 2.90823



                                                  
  0%|          | 0/1 [00:53<?, ?it/s]0,  2.31it/s][A
  7%|▋         | 188/2754 [00:53<17:20,  2.47it/s][A

epoch: 0, step: 392, loss: 3.48168



                                                  
  0%|          | 0/1 [00:53<?, ?it/s]0,  2.47it/s][A
  7%|▋         | 189/2754 [00:53<15:43,  2.72it/s][A

epoch: 0, step: 393, loss: 3.89455



                                                  
  0%|          | 0/1 [00:53<?, ?it/s]3,  2.72it/s][A
  7%|▋         | 190/2754 [00:53<14:00,  3.05it/s][A

epoch: 0, step: 394, loss: 3.82549



                                                  
  0%|          | 0/1 [00:53<?, ?it/s]0,  3.05it/s][A
  7%|▋         | 191/2754 [00:53<12:54,  3.31it/s][A

epoch: 0, step: 395, loss: 2.24587



                                                  
  0%|          | 0/1 [00:54<?, ?it/s]4,  3.31it/s][A
  7%|▋         | 192/2754 [00:54<12:37,  3.38it/s][A

epoch: 0, step: 396, loss: 3.88926



                                                  
  0%|          | 0/1 [00:54<?, ?it/s]7,  3.38it/s][A
  7%|▋         | 193/2754 [00:54<12:01,  3.55it/s][A

epoch: 0, step: 397, loss: 4.49968



                                                  
  0%|          | 0/1 [00:54<?, ?it/s]1,  3.55it/s][A
  7%|▋         | 194/2754 [00:54<11:57,  3.57it/s][A

epoch: 0, step: 398, loss: 2.98566



                                                  
  0%|          | 0/1 [00:55<?, ?it/s]7,  3.57it/s][A
  7%|▋         | 195/2754 [00:55<11:24,  3.74it/s][A

epoch: 0, step: 399, loss: 4.78037



                                                  
  0%|          | 0/1 [00:55<?, ?it/s]4,  3.74it/s][A
  7%|▋         | 196/2754 [00:55<11:12,  3.81it/s][A

epoch: 0, step: 400, loss: 4.19909



                                                  
  0%|          | 0/1 [00:55<?, ?it/s]2,  3.81it/s][A
  7%|▋         | 197/2754 [00:55<11:03,  3.86it/s][A

epoch: 0, step: 401, loss: 3.44992



                                                  
  0%|          | 0/1 [00:55<?, ?it/s]3,  3.86it/s][A
  7%|▋         | 198/2754 [00:55<10:59,  3.88it/s][A

epoch: 0, step: 402, loss: 4.33016



                                                  
  0%|          | 0/1 [00:56<?, ?it/s]9,  3.88it/s][A
  7%|▋         | 199/2754 [00:56<10:51,  3.92it/s][A

epoch: 0, step: 403, loss: 3.38394



                                                  
  0%|          | 0/1 [00:56<?, ?it/s]1,  3.92it/s][A
  7%|▋         | 200/2754 [00:56<11:18,  3.77it/s][A

epoch: 0, step: 404, loss: 2.93995



                                                  
  0%|          | 0/1 [00:56<?, ?it/s]8,  3.77it/s][A
  7%|▋         | 201/2754 [00:56<11:39,  3.65it/s][A

epoch: 0, step: 405, loss: 4.05374



                                                  
  0%|          | 0/1 [00:56<?, ?it/s]9,  3.65it/s][A
  7%|▋         | 202/2754 [00:56<11:18,  3.76it/s][A

epoch: 0, step: 406, loss: 3.48905



                                                  
  0%|          | 0/1 [00:57<?, ?it/s]8,  3.76it/s][A
  7%|▋         | 203/2754 [00:57<11:00,  3.86it/s][A

epoch: 0, step: 407, loss: 4.98204



                                                  
  0%|          | 0/1 [00:57<?, ?it/s]0,  3.86it/s][A
  7%|▋         | 204/2754 [00:57<10:46,  3.95it/s][A

epoch: 0, step: 408, loss: 3.28893



                                                  
  0%|          | 0/1 [00:57<?, ?it/s]6,  3.95it/s][A
  7%|▋         | 205/2754 [00:57<10:35,  4.01it/s][A

epoch: 0, step: 409, loss: 3.00081



                                                  
  0%|          | 0/1 [00:57<?, ?it/s]5,  4.01it/s][A
  7%|▋         | 206/2754 [00:57<11:16,  3.76it/s][A

epoch: 0, step: 410, loss: 3.38635



                                                  
  0%|          | 0/1 [00:58<?, ?it/s]6,  3.76it/s][A
  8%|▊         | 207/2754 [00:58<10:51,  3.91it/s][A

epoch: 0, step: 411, loss: 3.23054



                                                  
  0%|          | 0/1 [00:58<?, ?it/s]1,  3.91it/s][A
  8%|▊         | 208/2754 [00:58<10:58,  3.87it/s][A

epoch: 0, step: 412, loss: 5.01794



                                                  
  0%|          | 0/1 [00:58<?, ?it/s]8,  3.87it/s][A
  8%|▊         | 209/2754 [00:58<11:04,  3.83it/s][A

epoch: 0, step: 413, loss: 2.78829



                                                  
  0%|          | 0/1 [00:58<?, ?it/s]4,  3.83it/s][A
  8%|▊         | 210/2754 [00:58<11:06,  3.81it/s][A

epoch: 0, step: 414, loss: 2.76352



                                                  
  0%|          | 0/1 [00:59<?, ?it/s]6,  3.81it/s][A
  8%|▊         | 211/2754 [00:59<11:41,  3.63it/s][A

epoch: 0, step: 415, loss: 3.5562



                                                  
  0%|          | 0/1 [00:59<?, ?it/s]1,  3.63it/s][A
  8%|▊         | 212/2754 [00:59<11:57,  3.54it/s][A

epoch: 0, step: 416, loss: 3.2949



                                                  
  0%|          | 0/1 [00:59<?, ?it/s]7,  3.54it/s][A
  8%|▊         | 213/2754 [00:59<11:16,  3.75it/s][A

epoch: 0, step: 417, loss: 2.07085



                                                  
  0%|          | 0/1 [01:00<?, ?it/s]6,  3.75it/s][A
  8%|▊         | 214/2754 [01:00<10:58,  3.86it/s][A

epoch: 0, step: 418, loss: 4.09898



                                                  
  0%|          | 0/1 [01:00<?, ?it/s]8,  3.86it/s][A
  8%|▊         | 215/2754 [01:00<10:47,  3.92it/s][A

epoch: 0, step: 419, loss: 2.70977



                                                  
  0%|          | 0/1 [01:00<?, ?it/s]7,  3.92it/s][A
  8%|▊         | 216/2754 [01:00<10:38,  3.98it/s][A

epoch: 0, step: 420, loss: 3.30435



                                                  
  0%|          | 0/1 [01:00<?, ?it/s]8,  3.98it/s][A
  8%|▊         | 217/2754 [01:00<10:36,  3.99it/s][A

epoch: 0, step: 421, loss: 2.85375



                                                  
  0%|          | 0/1 [01:01<?, ?it/s]6,  3.99it/s][A
  8%|▊         | 218/2754 [01:01<11:03,  3.82it/s][A

epoch: 0, step: 422, loss: 4.37877



                                                  
  0%|          | 0/1 [01:01<?, ?it/s]3,  3.82it/s][A
  8%|▊         | 219/2754 [01:01<10:57,  3.86it/s][A

epoch: 0, step: 423, loss: 3.14568



                                                  
  0%|          | 0/1 [01:01<?, ?it/s]7,  3.86it/s][A
  8%|▊         | 220/2754 [01:01<10:52,  3.88it/s][A

epoch: 0, step: 424, loss: 2.94064



                                                  
  0%|          | 0/1 [01:01<?, ?it/s]2,  3.88it/s][A
  8%|▊         | 221/2754 [01:01<12:13,  3.46it/s][A

epoch: 0, step: 425, loss: 3.70271



                                                  
  0%|          | 0/1 [01:02<?, ?it/s]3,  3.46it/s][A
  8%|▊         | 222/2754 [01:02<12:13,  3.45it/s][A

epoch: 0, step: 426, loss: 3.62886



                                                  
  0%|          | 0/1 [01:02<?, ?it/s]3,  3.45it/s][A
  8%|▊         | 223/2754 [01:02<11:37,  3.63it/s][A

epoch: 0, step: 427, loss: 4.34209



                                                  
  0%|          | 0/1 [01:02<?, ?it/s]7,  3.63it/s][A
  8%|▊         | 224/2754 [01:02<11:24,  3.70it/s][A

epoch: 0, step: 428, loss: 4.58293



                                                  
  0%|          | 0/1 [01:03<?, ?it/s]4,  3.70it/s][A
  8%|▊         | 225/2754 [01:03<13:03,  3.23it/s][A

epoch: 0, step: 429, loss: 3.65505



                                                  
  0%|          | 0/1 [01:03<?, ?it/s]3,  3.23it/s][A
  8%|▊         | 226/2754 [01:03<15:27,  2.73it/s][A

epoch: 0, step: 430, loss: 3.60054



                                                  
  0%|          | 0/1 [01:03<?, ?it/s]7,  2.73it/s][A
  8%|▊         | 227/2754 [01:03<13:49,  3.05it/s][A

epoch: 0, step: 431, loss: 2.96328



                                                  
  0%|          | 0/1 [01:04<?, ?it/s]9,  3.05it/s][A
  8%|▊         | 228/2754 [01:04<12:43,  3.31it/s][A

epoch: 0, step: 432, loss: 2.96978



                                                  
  0%|          | 0/1 [01:04<?, ?it/s]3,  3.31it/s][A
  8%|▊         | 229/2754 [01:04<11:54,  3.53it/s][A

epoch: 0, step: 433, loss: 4.68107



                                                  
  0%|          | 0/1 [01:04<?, ?it/s]4,  3.53it/s][A
  8%|▊         | 230/2754 [01:04<11:46,  3.57it/s][A

epoch: 0, step: 434, loss: 2.24691



                                                  
  0%|          | 0/1 [01:04<?, ?it/s]6,  3.57it/s][A
  8%|▊         | 231/2754 [01:04<11:23,  3.69it/s][A

epoch: 0, step: 435, loss: 3.73914



                                                  
  0%|          | 0/1 [01:05<?, ?it/s]3,  3.69it/s][A
  8%|▊         | 232/2754 [01:05<10:55,  3.85it/s][A

epoch: 0, step: 436, loss: 4.50115



                                                  
  0%|          | 0/1 [01:05<?, ?it/s]5,  3.85it/s][A
  8%|▊         | 233/2754 [01:05<10:36,  3.96it/s][A

epoch: 0, step: 437, loss: 2.40228



                                                  
  0%|          | 0/1 [01:05<?, ?it/s]6,  3.96it/s][A
  8%|▊         | 234/2754 [01:05<10:21,  4.05it/s][A

epoch: 0, step: 438, loss: 5.67238



                                                  
  0%|          | 0/1 [01:05<?, ?it/s]1,  4.05it/s][A
  9%|▊         | 235/2754 [01:05<10:12,  4.11it/s][A

epoch: 0, step: 439, loss: 3.68044



                                                  
  0%|          | 0/1 [01:06<?, ?it/s]2,  4.11it/s][A
  9%|▊         | 236/2754 [01:06<10:04,  4.17it/s][A

epoch: 0, step: 440, loss: 3.24368



                                                  
  0%|          | 0/1 [01:06<?, ?it/s]4,  4.17it/s][A
  9%|▊         | 237/2754 [01:06<10:08,  4.14it/s][A

epoch: 0, step: 441, loss: 2.02785



                                                  
  0%|          | 0/1 [01:06<?, ?it/s]8,  4.14it/s][A
  9%|▊         | 238/2754 [01:06<10:05,  4.16it/s][A

epoch: 0, step: 442, loss: 4.83636



                                                  
  0%|          | 0/1 [01:07<?, ?it/s]5,  4.16it/s][A
  9%|▊         | 239/2754 [01:07<21:54,  1.91it/s][A

epoch: 0, step: 443, loss: 1.47581



                                                  
  0%|          | 0/1 [01:08<?, ?it/s]4,  1.91it/s][A
  9%|▊         | 240/2754 [01:08<25:35,  1.64it/s][A

epoch: 0, step: 444, loss: 3.21986



                                                  
  0%|          | 0/1 [01:08<?, ?it/s]5,  1.64it/s][A
  9%|▉         | 241/2754 [01:08<20:54,  2.00it/s][A

epoch: 0, step: 445, loss: 4.00437



                                                  
  0%|          | 0/1 [01:08<?, ?it/s]4,  2.00it/s][A
  9%|▉         | 242/2754 [01:08<17:41,  2.37it/s][A

epoch: 0, step: 446, loss: 2.63601



                                                  
  0%|          | 0/1 [01:09<?, ?it/s]1,  2.37it/s][A
  9%|▉         | 243/2754 [01:09<15:33,  2.69it/s][A

epoch: 0, step: 447, loss: 3.92877



                                                  
  0%|          | 0/1 [01:09<?, ?it/s]3,  2.69it/s][A
  9%|▉         | 244/2754 [01:09<14:30,  2.88it/s][A

epoch: 0, step: 448, loss: 3.02335



                                                  
  0%|          | 0/1 [01:09<?, ?it/s]0,  2.88it/s][A
  9%|▉         | 245/2754 [01:09<13:44,  3.04it/s][A

epoch: 0, step: 449, loss: 2.33233



                                                  
  0%|          | 0/1 [01:10<?, ?it/s]4,  3.04it/s][A
  9%|▉         | 246/2754 [01:10<12:40,  3.30it/s][A

epoch: 0, step: 450, loss: 4.13031



                                                  
  0%|          | 0/1 [01:10<?, ?it/s]0,  3.30it/s][A
  9%|▉         | 247/2754 [01:10<11:56,  3.50it/s][A

epoch: 0, step: 451, loss: 3.25196



                                                  
  0%|          | 0/1 [01:10<?, ?it/s]6,  3.50it/s][A
  9%|▉         | 248/2754 [01:10<11:18,  3.69it/s][A

epoch: 0, step: 452, loss: 3.19326



                                                  
  0%|          | 0/1 [01:10<?, ?it/s]8,  3.69it/s][A
  9%|▉         | 249/2754 [01:10<10:54,  3.83it/s][A

epoch: 0, step: 453, loss: 2.60717



                                                  
  0%|          | 0/1 [01:11<?, ?it/s]4,  3.83it/s][A
  9%|▉         | 250/2754 [01:11<10:45,  3.88it/s][A

epoch: 0, step: 454, loss: 3.75688



                                                  
  0%|          | 0/1 [01:11<?, ?it/s]5,  3.88it/s][A
  9%|▉         | 251/2754 [01:11<10:17,  4.05it/s][A

epoch: 0, step: 455, loss: 3.94674



                                                  
  0%|          | 0/1 [01:11<?, ?it/s]7,  4.05it/s][A
  9%|▉         | 252/2754 [01:11<10:10,  4.10it/s][A

epoch: 0, step: 456, loss: 4.47095



                                                  
  0%|          | 0/1 [01:11<?, ?it/s]0,  4.10it/s][A
  9%|▉         | 253/2754 [01:11<09:56,  4.20it/s][A

epoch: 0, step: 457, loss: 3.6167



                                                  
  0%|          | 0/1 [01:11<?, ?it/s]6,  4.20it/s][A
  9%|▉         | 254/2754 [01:11<10:03,  4.14it/s][A

epoch: 0, step: 458, loss: 2.28514



                                                  
  0%|          | 0/1 [01:12<?, ?it/s]3,  4.14it/s][A
  9%|▉         | 255/2754 [01:12<10:45,  3.87it/s][A

epoch: 0, step: 459, loss: 3.5062



                                                  
  0%|          | 0/1 [01:12<?, ?it/s]5,  3.87it/s][A
  9%|▉         | 256/2754 [01:12<10:35,  3.93it/s][A

epoch: 0, step: 460, loss: 1.37293



                                                  
  0%|          | 0/1 [01:12<?, ?it/s]5,  3.93it/s][A
  9%|▉         | 257/2754 [01:12<10:21,  4.02it/s][A

epoch: 0, step: 461, loss: 3.83897



                                                  
  0%|          | 0/1 [01:12<?, ?it/s]1,  4.02it/s][A
  9%|▉         | 258/2754 [01:12<10:22,  4.01it/s][A

epoch: 0, step: 462, loss: 1.86158



                                                  
  0%|          | 0/1 [01:13<?, ?it/s]2,  4.01it/s][A
  9%|▉         | 259/2754 [01:13<10:20,  4.02it/s][A

epoch: 0, step: 463, loss: 3.85383



                                                  
  0%|          | 0/1 [01:13<?, ?it/s]0,  4.02it/s][A
  9%|▉         | 260/2754 [01:13<10:22,  4.01it/s][A

epoch: 0, step: 464, loss: 3.95231



                                                  
  0%|          | 0/1 [01:13<?, ?it/s]2,  4.01it/s][A
  9%|▉         | 261/2754 [01:13<10:25,  3.99it/s][A

epoch: 0, step: 465, loss: 1.9576



                                                  
  0%|          | 0/1 [01:14<?, ?it/s]5,  3.99it/s][A
 10%|▉         | 262/2754 [01:14<10:50,  3.83it/s][A

epoch: 0, step: 466, loss: 3.83757



                                                  
  0%|          | 0/1 [01:14<?, ?it/s]0,  3.83it/s][A
 10%|▉         | 263/2754 [01:14<10:43,  3.87it/s][A

epoch: 0, step: 467, loss: 5.8115



                                                  
  0%|          | 0/1 [01:14<?, ?it/s]3,  3.87it/s][A
 10%|▉         | 264/2754 [01:14<10:33,  3.93it/s][A

epoch: 0, step: 468, loss: 2.64558



                                                  
  0%|          | 0/1 [01:14<?, ?it/s]3,  3.93it/s][A
 10%|▉         | 265/2754 [01:14<10:48,  3.84it/s][A

epoch: 0, step: 469, loss: 3.42539



                                                  
  0%|          | 0/1 [01:15<?, ?it/s]8,  3.84it/s][A
 10%|▉         | 266/2754 [01:15<10:55,  3.80it/s][A

epoch: 0, step: 470, loss: 2.81744



                                                  
  0%|          | 0/1 [01:15<?, ?it/s]5,  3.80it/s][A
 10%|▉         | 267/2754 [01:15<12:43,  3.26it/s][A

epoch: 0, step: 471, loss: 3.45125



                                                  
  0%|          | 0/1 [01:15<?, ?it/s]3,  3.26it/s][A
 10%|▉         | 268/2754 [01:15<11:40,  3.55it/s][A

epoch: 0, step: 472, loss: 2.56253



                                                  
  0%|          | 0/1 [01:16<?, ?it/s]0,  3.55it/s][A
 10%|▉         | 269/2754 [01:16<13:26,  3.08it/s][A

epoch: 0, step: 473, loss: 2.00227



                                                  
  0%|          | 0/1 [01:16<?, ?it/s]6,  3.08it/s][A
 10%|▉         | 270/2754 [01:16<12:25,  3.33it/s][A

epoch: 0, step: 474, loss: 3.63717



                                                  
  0%|          | 0/1 [01:16<?, ?it/s]5,  3.33it/s][A
 10%|▉         | 271/2754 [01:16<11:38,  3.55it/s][A

epoch: 0, step: 475, loss: 5.70425



                                                  
  0%|          | 0/1 [01:16<?, ?it/s]8,  3.55it/s][A
 10%|▉         | 272/2754 [01:16<11:28,  3.60it/s][A

epoch: 0, step: 476, loss: 3.83256



                                                  
  0%|          | 0/1 [01:17<?, ?it/s]8,  3.60it/s][A
 10%|▉         | 273/2754 [01:17<16:17,  2.54it/s][A

epoch: 0, step: 477, loss: 4.0797



                                                  
  0%|          | 0/1 [01:17<?, ?it/s]7,  2.54it/s][A
 10%|▉         | 274/2754 [01:17<14:40,  2.82it/s][A

epoch: 0, step: 478, loss: 2.52973



                                                  
  0%|          | 0/1 [01:18<?, ?it/s]0,  2.82it/s][A
 10%|▉         | 275/2754 [01:18<13:02,  3.17it/s][A

epoch: 0, step: 479, loss: 2.75232



                                                  
  0%|          | 0/1 [01:18<?, ?it/s]2,  3.17it/s][A
 10%|█         | 276/2754 [01:18<11:55,  3.47it/s][A

epoch: 0, step: 480, loss: 2.03971



                                                  
  0%|          | 0/1 [01:18<?, ?it/s]5,  3.47it/s][A
 10%|█         | 277/2754 [01:18<11:32,  3.58it/s][A

epoch: 0, step: 481, loss: 2.63565



                                                  
  0%|          | 0/1 [01:18<?, ?it/s]2,  3.58it/s][A
 10%|█         | 278/2754 [01:18<11:18,  3.65it/s][A

epoch: 0, step: 482, loss: 4.26055



                                                  
  0%|          | 0/1 [01:19<?, ?it/s]8,  3.65it/s][A
 10%|█         | 279/2754 [01:19<11:06,  3.72it/s][A

epoch: 0, step: 483, loss: 3.19127



                                                  
  0%|          | 0/1 [01:19<?, ?it/s]6,  3.72it/s][A
 10%|█         | 280/2754 [01:19<11:12,  3.68it/s][A

epoch: 0, step: 484, loss: 3.8822



                                                  
  0%|          | 0/1 [01:19<?, ?it/s]2,  3.68it/s][A
 10%|█         | 281/2754 [01:19<11:06,  3.71it/s][A

epoch: 0, step: 485, loss: 1.40462



                                                  
  0%|          | 0/1 [01:19<?, ?it/s]6,  3.71it/s][A
 10%|█         | 282/2754 [01:19<10:46,  3.82it/s][A

epoch: 0, step: 486, loss: 2.98328



                                                  
  0%|          | 0/1 [01:20<?, ?it/s]6,  3.82it/s][A
 10%|█         | 283/2754 [01:20<10:31,  3.91it/s][A

epoch: 0, step: 487, loss: 5.0309



                                                  
  0%|          | 0/1 [01:20<?, ?it/s]1,  3.91it/s][A
 10%|█         | 284/2754 [01:20<10:25,  3.95it/s][A

epoch: 0, step: 488, loss: 1.78357



                                                  
  0%|          | 0/1 [01:20<?, ?it/s]5,  3.95it/s][A
 10%|█         | 285/2754 [01:20<10:21,  3.97it/s][A

epoch: 0, step: 489, loss: 4.93008



                                                  
  0%|          | 0/1 [01:20<?, ?it/s]1,  3.97it/s][A
 10%|█         | 286/2754 [01:20<10:50,  3.79it/s][A

epoch: 0, step: 490, loss: 2.59657



                                                  
  0%|          | 0/1 [01:21<?, ?it/s]0,  3.79it/s][A
 10%|█         | 287/2754 [01:21<10:41,  3.85it/s][A

epoch: 0, step: 491, loss: 3.68342



                                                  
  0%|          | 0/1 [01:21<?, ?it/s]1,  3.85it/s][A
 10%|█         | 288/2754 [01:21<10:41,  3.85it/s][A

epoch: 0, step: 492, loss: 2.89054



                                                  
  0%|          | 0/1 [01:21<?, ?it/s]1,  3.85it/s][A
 10%|█         | 289/2754 [01:21<10:27,  3.93it/s][A

epoch: 0, step: 493, loss: 4.54565



                                                  
  0%|          | 0/1 [01:21<?, ?it/s]7,  3.93it/s][A
 11%|█         | 290/2754 [01:21<10:43,  3.83it/s][A

epoch: 0, step: 494, loss: 3.77442



                                                  
  0%|          | 0/1 [01:22<?, ?it/s]3,  3.83it/s][A
 11%|█         | 291/2754 [01:22<10:49,  3.79it/s][A

epoch: 0, step: 495, loss: 3.25548



                                                  
  0%|          | 0/1 [01:22<?, ?it/s]9,  3.79it/s][A
 11%|█         | 292/2754 [01:22<11:41,  3.51it/s][A

epoch: 0, step: 496, loss: 3.66676



                                                  
  0%|          | 0/1 [01:22<?, ?it/s]1,  3.51it/s][A
 11%|█         | 293/2754 [01:22<11:07,  3.69it/s][A

epoch: 0, step: 497, loss: 3.46608



                                                  
  0%|          | 0/1 [01:22<?, ?it/s]7,  3.69it/s][A
 11%|█         | 294/2754 [01:22<10:50,  3.78it/s][A

epoch: 0, step: 498, loss: 1.96166



                                                  
  0%|          | 0/1 [01:23<?, ?it/s]0,  3.78it/s][A
 11%|█         | 295/2754 [01:23<10:41,  3.83it/s][A

epoch: 0, step: 499, loss: 2.67237



                                                  
  0%|          | 0/1 [01:23<?, ?it/s]1,  3.83it/s][A
 11%|█         | 296/2754 [01:23<10:28,  3.91it/s][A

epoch: 0, step: 500, loss: 4.88365



                                                  
  0%|          | 0/1 [01:23<?, ?it/s]8,  3.91it/s][A
 11%|█         | 297/2754 [01:23<11:39,  3.51it/s][A

epoch: 0, step: 501, loss: 4.15286



                                                  
  0%|          | 0/1 [01:24<?, ?it/s]9,  3.51it/s][A
 11%|█         | 298/2754 [01:24<11:19,  3.61it/s][A

epoch: 0, step: 502, loss: 3.59999



                                                  
  0%|          | 0/1 [01:24<?, ?it/s]9,  3.61it/s][A
 11%|█         | 299/2754 [01:24<10:58,  3.73it/s][A

epoch: 0, step: 503, loss: 2.57541



                                                  
  0%|          | 0/1 [01:24<?, ?it/s]8,  3.73it/s][A
 11%|█         | 300/2754 [01:24<11:13,  3.64it/s][A

epoch: 0, step: 504, loss: 3.70754



                                                  
  0%|          | 0/1 [01:24<?, ?it/s]3,  3.64it/s][A
 11%|█         | 301/2754 [01:24<10:47,  3.79it/s][A

epoch: 0, step: 505, loss: 4.32977



                                                  
  0%|          | 0/1 [01:25<?, ?it/s]7,  3.79it/s][A
 11%|█         | 302/2754 [01:25<10:33,  3.87it/s][A

epoch: 0, step: 506, loss: 3.9637



                                                  
  0%|          | 0/1 [01:25<?, ?it/s]3,  3.87it/s][A
 11%|█         | 303/2754 [01:25<11:00,  3.71it/s][A

epoch: 0, step: 507, loss: 2.27854



                                                  
  0%|          | 0/1 [01:25<?, ?it/s]0,  3.71it/s][A
 11%|█         | 304/2754 [01:25<11:12,  3.64it/s][A

epoch: 0, step: 508, loss: 3.62948



                                                  
  0%|          | 0/1 [01:25<?, ?it/s]2,  3.64it/s][A
 11%|█         | 305/2754 [01:25<11:28,  3.55it/s][A

epoch: 0, step: 509, loss: 3.62172



                                                  
  0%|          | 0/1 [01:26<?, ?it/s]8,  3.55it/s][A
 11%|█         | 306/2754 [01:26<11:24,  3.58it/s][A

epoch: 0, step: 510, loss: 3.63466



                                                  
  0%|          | 0/1 [01:26<?, ?it/s]4,  3.58it/s][A
 11%|█         | 307/2754 [01:26<11:01,  3.70it/s][A

epoch: 0, step: 511, loss: 2.87558



                                                  
  0%|          | 0/1 [01:26<?, ?it/s]1,  3.70it/s][A
 11%|█         | 308/2754 [01:26<10:46,  3.78it/s][A

epoch: 0, step: 512, loss: 3.42901



                                                  
  0%|          | 0/1 [01:26<?, ?it/s]6,  3.78it/s][A
 11%|█         | 309/2754 [01:26<10:29,  3.88it/s][A

epoch: 0, step: 513, loss: 4.09837



                                                  
  0%|          | 0/1 [01:27<?, ?it/s]9,  3.88it/s][A
 11%|█▏        | 310/2754 [01:27<10:20,  3.94it/s][A

epoch: 0, step: 514, loss: 3.30951



                                                  
  0%|          | 0/1 [01:27<?, ?it/s]0,  3.94it/s][A
 11%|█▏        | 311/2754 [01:27<10:10,  4.00it/s][A

epoch: 0, step: 515, loss: 3.14488



                                                  
  0%|          | 0/1 [01:28<?, ?it/s]0,  4.00it/s][A
 11%|█▏        | 312/2754 [01:28<17:21,  2.35it/s][A

epoch: 0, step: 516, loss: 0.222191



                                                  
  0%|          | 0/1 [01:28<?, ?it/s]1,  2.35it/s][A
 11%|█▏        | 313/2754 [01:28<16:35,  2.45it/s][A

epoch: 0, step: 517, loss: 3.78664



                                                  
  0%|          | 0/1 [01:28<?, ?it/s]5,  2.45it/s][A
 11%|█▏        | 314/2754 [01:28<14:44,  2.76it/s][A

epoch: 0, step: 518, loss: 3.11413



                                                  
  0%|          | 0/1 [01:29<?, ?it/s]4,  2.76it/s][A
 11%|█▏        | 315/2754 [01:29<13:40,  2.97it/s][A

epoch: 0, step: 519, loss: 1.4084



                                                  
  0%|          | 0/1 [01:29<?, ?it/s]0,  2.97it/s][A
 11%|█▏        | 316/2754 [01:29<12:32,  3.24it/s][A

epoch: 0, step: 520, loss: 4.98929



                                                  
  0%|          | 0/1 [01:29<?, ?it/s]2,  3.24it/s][A
 12%|█▏        | 317/2754 [01:29<11:49,  3.44it/s][A

epoch: 0, step: 521, loss: 1.26857



                                                  
  0%|          | 0/1 [01:29<?, ?it/s]9,  3.44it/s][A
 12%|█▏        | 318/2754 [01:29<11:26,  3.55it/s][A

epoch: 0, step: 522, loss: 1.69235



                                                  
  0%|          | 0/1 [01:30<?, ?it/s]6,  3.55it/s][A
 12%|█▏        | 319/2754 [01:30<11:38,  3.49it/s][A

epoch: 0, step: 523, loss: 3.30089



                                                  
  0%|          | 0/1 [01:30<?, ?it/s]8,  3.49it/s][A
 12%|█▏        | 320/2754 [01:30<12:52,  3.15it/s][A

epoch: 0, step: 524, loss: 3.48872



                                                  
  0%|          | 0/1 [01:30<?, ?it/s]2,  3.15it/s][A
 12%|█▏        | 321/2754 [01:30<12:02,  3.37it/s][A

epoch: 0, step: 525, loss: 3.91087



                                                  
  0%|          | 0/1 [01:31<?, ?it/s]2,  3.37it/s][A
 12%|█▏        | 322/2754 [01:31<11:32,  3.51it/s][A

epoch: 0, step: 526, loss: 3.84629



                                                  
  0%|          | 0/1 [01:31<?, ?it/s]2,  3.51it/s][A
 12%|█▏        | 323/2754 [01:31<11:10,  3.63it/s][A

epoch: 0, step: 527, loss: 2.51842



                                                  
  0%|          | 0/1 [01:31<?, ?it/s]0,  3.63it/s][A
 12%|█▏        | 324/2754 [01:31<11:31,  3.51it/s][A

epoch: 0, step: 528, loss: 3.56736



                                                  
  0%|          | 0/1 [01:32<?, ?it/s]1,  3.51it/s][A
 12%|█▏        | 325/2754 [01:32<11:43,  3.46it/s][A

epoch: 0, step: 529, loss: 2.94026



                                                  
  0%|          | 0/1 [01:32<?, ?it/s]3,  3.46it/s][A
 12%|█▏        | 326/2754 [01:32<11:15,  3.59it/s][A

epoch: 0, step: 530, loss: 3.45644



                                                  
  0%|          | 0/1 [01:32<?, ?it/s]5,  3.59it/s][A
 12%|█▏        | 327/2754 [01:32<10:46,  3.75it/s][A

epoch: 0, step: 531, loss: 3.25691



                                                  
  0%|          | 0/1 [01:32<?, ?it/s]6,  3.75it/s][A
 12%|█▏        | 328/2754 [01:32<12:28,  3.24it/s][A

epoch: 0, step: 532, loss: 3.42884



                                                  
  0%|          | 0/1 [01:33<?, ?it/s]8,  3.24it/s][A
 12%|█▏        | 329/2754 [01:33<11:41,  3.46it/s][A

epoch: 0, step: 533, loss: 5.7889



                                                  
  0%|          | 0/1 [01:33<?, ?it/s]1,  3.46it/s][A
 12%|█▏        | 330/2754 [01:33<11:14,  3.59it/s][A

epoch: 0, step: 534, loss: 5.10139



                                                  
  0%|          | 0/1 [01:33<?, ?it/s]4,  3.59it/s][A
 12%|█▏        | 331/2754 [01:33<10:57,  3.68it/s][A

epoch: 0, step: 535, loss: 3.71362



                                                  
  0%|          | 0/1 [01:33<?, ?it/s]7,  3.68it/s][A
 12%|█▏        | 332/2754 [01:33<11:08,  3.62it/s][A

epoch: 0, step: 536, loss: 3.01064



                                                  
  0%|          | 0/1 [01:34<?, ?it/s]8,  3.62it/s][A
 12%|█▏        | 333/2754 [01:34<10:43,  3.76it/s][A

epoch: 0, step: 537, loss: 4.12719



                                                  
  0%|          | 0/1 [01:34<?, ?it/s]3,  3.76it/s][A
 12%|█▏        | 334/2754 [01:34<10:40,  3.78it/s][A

epoch: 0, step: 538, loss: 1.91683



                                                  
  0%|          | 0/1 [01:34<?, ?it/s]0,  3.78it/s][A
 12%|█▏        | 335/2754 [01:34<10:46,  3.74it/s][A

epoch: 0, step: 539, loss: 2.74799



                                                  
  0%|          | 0/1 [01:35<?, ?it/s]6,  3.74it/s][A
 12%|█▏        | 336/2754 [01:35<12:34,  3.20it/s][A

epoch: 0, step: 540, loss: 2.50401



                                                  
  0%|          | 0/1 [01:35<?, ?it/s]4,  3.20it/s][A
 12%|█▏        | 337/2754 [01:35<14:26,  2.79it/s][A

epoch: 0, step: 541, loss: 3.05647



                                                  
  0%|          | 0/1 [01:35<?, ?it/s]6,  2.79it/s][A
 12%|█▏        | 338/2754 [01:35<13:04,  3.08it/s][A

epoch: 0, step: 542, loss: 4.06017



                                                  
  0%|          | 0/1 [01:36<?, ?it/s]4,  3.08it/s][A
 12%|█▏        | 339/2754 [01:36<12:18,  3.27it/s][A

epoch: 0, step: 543, loss: 4.36296



                                                  
  0%|          | 0/1 [01:36<?, ?it/s]8,  3.27it/s][A
 12%|█▏        | 340/2754 [01:36<11:33,  3.48it/s][A

epoch: 0, step: 544, loss: 2.88101



                                                  
  0%|          | 0/1 [01:36<?, ?it/s]3,  3.48it/s][A
 12%|█▏        | 341/2754 [01:36<11:05,  3.63it/s][A

epoch: 0, step: 545, loss: 2.92074



                                                  
  0%|          | 0/1 [01:36<?, ?it/s]5,  3.63it/s][A
 12%|█▏        | 342/2754 [01:36<10:46,  3.73it/s][A

epoch: 0, step: 546, loss: 4.04173



                                                  
  0%|          | 0/1 [01:37<?, ?it/s]6,  3.73it/s][A
 12%|█▏        | 343/2754 [01:37<10:24,  3.86it/s][A

epoch: 0, step: 547, loss: 4.14515



                                                  
  0%|          | 0/1 [01:37<?, ?it/s]4,  3.86it/s][A
 12%|█▏        | 344/2754 [01:37<10:25,  3.85it/s][A

epoch: 0, step: 548, loss: 2.94577



                                                  
  0%|          | 0/1 [01:37<?, ?it/s]5,  3.85it/s][A
 13%|█▎        | 345/2754 [01:37<10:17,  3.90it/s][A

epoch: 0, step: 549, loss: 3.37229



                                                  
  0%|          | 0/1 [01:37<?, ?it/s]7,  3.90it/s][A
 13%|█▎        | 346/2754 [01:37<09:59,  4.02it/s][A

epoch: 0, step: 550, loss: 5.3425



                                                  
  0%|          | 0/1 [01:38<?, ?it/s]9,  4.02it/s][A
 13%|█▎        | 347/2754 [01:38<10:18,  3.89it/s][A

epoch: 0, step: 551, loss: 4.32398



                                                  
  0%|          | 0/1 [01:38<?, ?it/s]8,  3.89it/s][A
 13%|█▎        | 348/2754 [01:38<09:49,  4.08it/s][A

epoch: 0, step: 552, loss: 3.05302



                                                  
  0%|          | 0/1 [01:38<?, ?it/s]9,  4.08it/s][A
 13%|█▎        | 349/2754 [01:38<09:37,  4.16it/s][A

epoch: 0, step: 553, loss: 5.02375



                                                  
  0%|          | 0/1 [01:39<?, ?it/s]7,  4.16it/s][A
 13%|█▎        | 350/2754 [01:39<20:21,  1.97it/s][A

epoch: 0, step: 554, loss: 2.05149



                                                  
  0%|          | 0/1 [01:40<?, ?it/s]1,  1.97it/s][A
 13%|█▎        | 351/2754 [01:40<23:47,  1.68it/s][A

epoch: 0, step: 555, loss: 5.68106



                                                  
  0%|          | 0/1 [01:40<?, ?it/s]7,  1.68it/s][A
 13%|█▎        | 352/2754 [01:40<20:11,  1.98it/s][A

epoch: 0, step: 556, loss: 1.35809



                                                  
  0%|          | 0/1 [01:41<?, ?it/s]1,  1.98it/s][A
 13%|█▎        | 353/2754 [01:41<17:04,  2.34it/s][A

epoch: 0, step: 557, loss: 1.57351



                                                  
  0%|          | 0/1 [01:41<?, ?it/s]4,  2.34it/s][A
 13%|█▎        | 354/2754 [01:41<22:03,  1.81it/s][A

epoch: 0, step: 558, loss: 1.03864



                                                  
  0%|          | 0/1 [01:42<?, ?it/s]3,  1.81it/s][A
 13%|█▎        | 355/2754 [01:42<19:29,  2.05it/s][A

epoch: 0, step: 559, loss: 4.08268



                                                  
  0%|          | 0/1 [01:42<?, ?it/s]9,  2.05it/s][A
 13%|█▎        | 356/2754 [01:42<17:05,  2.34it/s][A

epoch: 0, step: 560, loss: 3.07362



                                                  
  0%|          | 0/1 [01:42<?, ?it/s]5,  2.34it/s][A
 13%|█▎        | 357/2754 [01:42<14:58,  2.67it/s][A

epoch: 0, step: 561, loss: 4.31722



                                                  
  0%|          | 0/1 [01:42<?, ?it/s]8,  2.67it/s][A
 13%|█▎        | 358/2754 [01:42<13:29,  2.96it/s][A

epoch: 0, step: 562, loss: 3.30806



                                                  
  0%|          | 0/1 [01:43<?, ?it/s]9,  2.96it/s][A
 13%|█▎        | 359/2754 [01:43<12:25,  3.21it/s][A

epoch: 0, step: 563, loss: 1.57692



                                                  
  0%|          | 0/1 [01:43<?, ?it/s]5,  3.21it/s][A
 13%|█▎        | 360/2754 [01:43<11:58,  3.33it/s][A

epoch: 0, step: 564, loss: 2.51654



                                                  
  0%|          | 0/1 [01:43<?, ?it/s]8,  3.33it/s][A
 13%|█▎        | 361/2754 [01:43<11:29,  3.47it/s][A

epoch: 0, step: 565, loss: 5.63772



                                                  
  0%|          | 0/1 [01:44<?, ?it/s]9,  3.47it/s][A
 13%|█▎        | 362/2754 [01:44<11:05,  3.60it/s][A

epoch: 0, step: 566, loss: 3.02487



                                                  
  0%|          | 0/1 [01:44<?, ?it/s]5,  3.60it/s][A
 13%|█▎        | 363/2754 [01:44<11:06,  3.59it/s][A

epoch: 0, step: 567, loss: 3.05741



                                                  
  0%|          | 0/1 [01:44<?, ?it/s]6,  3.59it/s][A
 13%|█▎        | 364/2754 [01:44<12:58,  3.07it/s][A

epoch: 0, step: 568, loss: 3.45887



                                                  
  0%|          | 0/1 [01:45<?, ?it/s]8,  3.07it/s][A
 13%|█▎        | 365/2754 [01:45<12:55,  3.08it/s][A

epoch: 0, step: 569, loss: 3.05449



                                                  
  0%|          | 0/1 [01:45<?, ?it/s]5,  3.08it/s][A
 13%|█▎        | 366/2754 [01:45<11:47,  3.38it/s][A

epoch: 0, step: 570, loss: 4.4857



                                                  
  0%|          | 0/1 [01:45<?, ?it/s]7,  3.38it/s][A
 13%|█▎        | 367/2754 [01:45<11:12,  3.55it/s][A

epoch: 0, step: 571, loss: 3.50441



                                                  
  0%|          | 0/1 [01:45<?, ?it/s]2,  3.55it/s][A
 13%|█▎        | 368/2754 [01:45<10:44,  3.70it/s][A

epoch: 0, step: 572, loss: 4.54028



                                                  
  0%|          | 0/1 [01:46<?, ?it/s]4,  3.70it/s][A
 13%|█▎        | 369/2754 [01:46<10:52,  3.66it/s][A

epoch: 0, step: 573, loss: 3.27876



                                                  
  0%|          | 0/1 [01:46<?, ?it/s]2,  3.66it/s][A
 13%|█▎        | 370/2754 [01:46<11:41,  3.40it/s][A

epoch: 0, step: 574, loss: 2.93172



                                                  
  0%|          | 0/1 [01:46<?, ?it/s]1,  3.40it/s][A
 13%|█▎        | 371/2754 [01:46<11:23,  3.49it/s][A

epoch: 0, step: 575, loss: 2.86381



                                                  
  0%|          | 0/1 [01:46<?, ?it/s]3,  3.49it/s][A
 14%|█▎        | 372/2754 [01:46<10:42,  3.71it/s][A

epoch: 0, step: 576, loss: 3.20447



                                                  
  0%|          | 0/1 [01:47<?, ?it/s]2,  3.71it/s][A
 14%|█▎        | 373/2754 [01:47<10:09,  3.90it/s][A

epoch: 0, step: 577, loss: 3.74476



                                                  
  0%|          | 0/1 [01:47<?, ?it/s]9,  3.90it/s][A
 14%|█▎        | 374/2754 [01:47<09:52,  4.01it/s][A

epoch: 0, step: 578, loss: 2.57044



                                                  
  0%|          | 0/1 [01:47<?, ?it/s]2,  4.01it/s][A
 14%|█▎        | 375/2754 [01:47<09:47,  4.05it/s][A

epoch: 0, step: 579, loss: 3.1965



                                                  
  0%|          | 0/1 [01:47<?, ?it/s]7,  4.05it/s][A
 14%|█▎        | 376/2754 [01:47<09:37,  4.12it/s][A

epoch: 0, step: 580, loss: 3.51375



                                                  
  0%|          | 0/1 [01:48<?, ?it/s]7,  4.12it/s][A
 14%|█▎        | 377/2754 [01:48<10:14,  3.87it/s][A

epoch: 0, step: 581, loss: 3.62327



                                                  
  0%|          | 0/1 [01:48<?, ?it/s]4,  3.87it/s][A
 14%|█▎        | 378/2754 [01:48<10:37,  3.73it/s][A

epoch: 0, step: 582, loss: 2.68696



                                                  
  0%|          | 0/1 [01:48<?, ?it/s]7,  3.73it/s][A
 14%|█▍        | 379/2754 [01:48<10:10,  3.89it/s][A

epoch: 0, step: 583, loss: 3.01817



                                                  
  0%|          | 0/1 [01:48<?, ?it/s]0,  3.89it/s][A
 14%|█▍        | 380/2754 [01:48<10:32,  3.76it/s][A

epoch: 0, step: 584, loss: 2.17752



                                                  
  0%|          | 0/1 [01:49<?, ?it/s]2,  3.76it/s][A
 14%|█▍        | 381/2754 [01:49<10:11,  3.88it/s][A

epoch: 0, step: 585, loss: 3.68214



                                                  
  0%|          | 0/1 [01:49<?, ?it/s]1,  3.88it/s][A
 14%|█▍        | 382/2754 [01:49<09:44,  4.06it/s][A

epoch: 0, step: 586, loss: 2.98878



                                                  
  0%|          | 0/1 [01:49<?, ?it/s]4,  4.06it/s][A
 14%|█▍        | 383/2754 [01:49<09:30,  4.16it/s][A

epoch: 0, step: 587, loss: 2.73083



                                                  
  0%|          | 0/1 [01:49<?, ?it/s]0,  4.16it/s][A
 14%|█▍        | 384/2754 [01:49<09:18,  4.25it/s][A

epoch: 0, step: 588, loss: 1.77671



                                                  
  0%|          | 0/1 [01:50<?, ?it/s]8,  4.25it/s][A
 14%|█▍        | 385/2754 [01:50<09:05,  4.34it/s][A

epoch: 0, step: 589, loss: 2.53895



                                                  
  0%|          | 0/1 [01:50<?, ?it/s]5,  4.34it/s][A
 14%|█▍        | 386/2754 [01:50<09:01,  4.38it/s][A

epoch: 0, step: 590, loss: 2.47196



                                                  
  0%|          | 0/1 [01:50<?, ?it/s]1,  4.38it/s][A
 14%|█▍        | 387/2754 [01:50<09:03,  4.35it/s][A

epoch: 0, step: 591, loss: 4.34527



                                                  
  0%|          | 0/1 [01:50<?, ?it/s]3,  4.35it/s][A
 14%|█▍        | 388/2754 [01:50<09:07,  4.32it/s][A

epoch: 0, step: 592, loss: 3.79839



                                                  
  0%|          | 0/1 [01:51<?, ?it/s]7,  4.32it/s][A
 14%|█▍        | 389/2754 [01:51<09:39,  4.08it/s][A

epoch: 0, step: 593, loss: 3.75813



                                                  
  0%|          | 0/1 [01:51<?, ?it/s]9,  4.08it/s][A
 14%|█▍        | 390/2754 [01:51<09:35,  4.11it/s][A

epoch: 0, step: 594, loss: 3.33495



                                                  
  0%|          | 0/1 [01:51<?, ?it/s]5,  4.11it/s][A
 14%|█▍        | 391/2754 [01:51<10:17,  3.82it/s][A

epoch: 0, step: 595, loss: 3.45617



                                                  
  0%|          | 0/1 [01:51<?, ?it/s]7,  3.82it/s][A
 14%|█▍        | 392/2754 [01:51<10:21,  3.80it/s][A

epoch: 0, step: 596, loss: 3.89488



                                                  
  0%|          | 0/1 [01:52<?, ?it/s]1,  3.80it/s][A
 14%|█▍        | 393/2754 [01:52<12:17,  3.20it/s][A

epoch: 0, step: 597, loss: 3.35498



                                                  
  0%|          | 0/1 [01:52<?, ?it/s]7,  3.20it/s][A
 14%|█▍        | 394/2754 [01:52<11:25,  3.44it/s][A

epoch: 0, step: 598, loss: 4.16769



                                                  
  0%|          | 0/1 [01:52<?, ?it/s]5,  3.44it/s][A
 14%|█▍        | 395/2754 [01:52<13:01,  3.02it/s][A

epoch: 0, step: 599, loss: 3.39073



                                                  
  0%|          | 0/1 [01:53<?, ?it/s]1,  3.02it/s][A
 14%|█▍        | 396/2754 [01:53<12:00,  3.27it/s][A

epoch: 0, step: 600, loss: 3.42377



                                                  
  0%|          | 0/1 [01:53<?, ?it/s]0,  3.27it/s][A
 14%|█▍        | 397/2754 [01:53<11:08,  3.53it/s][A

epoch: 0, step: 601, loss: 5.12151



                                                  
  0%|          | 0/1 [01:53<?, ?it/s]8,  3.53it/s][A
 14%|█▍        | 398/2754 [01:53<10:36,  3.70it/s][A

epoch: 0, step: 602, loss: 4.10243



                                                  
  0%|          | 0/1 [01:53<?, ?it/s]6,  3.70it/s][A
 14%|█▍        | 399/2754 [01:53<10:10,  3.86it/s][A

epoch: 0, step: 603, loss: 3.40884



                                                  
  0%|          | 0/1 [01:54<?, ?it/s]0,  3.86it/s][A
 15%|█▍        | 400/2754 [01:54<11:19,  3.47it/s][A

epoch: 0, step: 604, loss: 3.38331



                                                  
  0%|          | 0/1 [01:54<?, ?it/s]9,  3.47it/s][A
 15%|█▍        | 401/2754 [01:54<10:43,  3.66it/s][A

epoch: 0, step: 605, loss: 4.35488



                                                  
  0%|          | 0/1 [01:54<?, ?it/s]3,  3.66it/s][A
 15%|█▍        | 402/2754 [01:54<10:04,  3.89it/s][A

epoch: 0, step: 606, loss: 4.1387



                                                  
  0%|          | 0/1 [01:54<?, ?it/s]4,  3.89it/s][A
 15%|█▍        | 403/2754 [01:54<09:42,  4.04it/s][A

epoch: 0, step: 607, loss: 2.57835



                                                  
  0%|          | 0/1 [01:55<?, ?it/s]2,  4.04it/s][A
 15%|█▍        | 404/2754 [01:55<09:25,  4.15it/s][A

epoch: 0, step: 608, loss: 3.15401



                                                  
  0%|          | 0/1 [01:55<?, ?it/s]5,  4.15it/s][A
 15%|█▍        | 405/2754 [01:55<09:22,  4.18it/s][A

epoch: 0, step: 609, loss: 4.29306



                                                  
  0%|          | 0/1 [01:55<?, ?it/s]2,  4.18it/s][A
 15%|█▍        | 406/2754 [01:55<09:05,  4.30it/s][A

epoch: 0, step: 610, loss: 3.10146



                                                  
  0%|          | 0/1 [01:55<?, ?it/s]5,  4.30it/s][A
 15%|█▍        | 407/2754 [01:55<08:59,  4.35it/s][A

epoch: 0, step: 611, loss: 4.05203



                                                  
  0%|          | 0/1 [01:56<?, ?it/s]9,  4.35it/s][A
 15%|█▍        | 408/2754 [01:56<09:36,  4.07it/s][A

epoch: 0, step: 612, loss: 2.64517



                                                  
  0%|          | 0/1 [01:56<?, ?it/s]6,  4.07it/s][A
 15%|█▍        | 409/2754 [01:56<09:26,  4.14it/s][A

epoch: 0, step: 613, loss: 1.87236



                                                  
  0%|          | 0/1 [01:56<?, ?it/s]6,  4.14it/s][A
 15%|█▍        | 410/2754 [01:56<09:26,  4.14it/s][A

epoch: 0, step: 614, loss: 3.52633



                                                  
  0%|          | 0/1 [01:56<?, ?it/s]6,  4.14it/s][A
 15%|█▍        | 411/2754 [01:56<09:33,  4.09it/s][A

epoch: 0, step: 615, loss: 4.01138



                                                  
  0%|          | 0/1 [01:57<?, ?it/s]3,  4.09it/s][A
 15%|█▍        | 412/2754 [01:57<09:35,  4.07it/s][A

epoch: 0, step: 616, loss: 2.70379



                                                  
  0%|          | 0/1 [01:57<?, ?it/s]5,  4.07it/s][A
 15%|█▍        | 413/2754 [01:57<09:48,  3.98it/s][A

epoch: 0, step: 617, loss: 3.7661



                                                  
  0%|          | 0/1 [01:57<?, ?it/s]8,  3.98it/s][A
 15%|█▌        | 414/2754 [01:57<09:37,  4.05it/s][A

epoch: 0, step: 618, loss: 3.82838



                                                  
  0%|          | 0/1 [01:57<?, ?it/s]7,  4.05it/s][A
 15%|█▌        | 415/2754 [01:57<09:36,  4.06it/s][A

epoch: 0, step: 619, loss: 4.53705



                                                  
  0%|          | 0/1 [01:58<?, ?it/s]6,  4.06it/s][A
 15%|█▌        | 416/2754 [01:58<09:32,  4.08it/s][A

epoch: 0, step: 620, loss: 2.58041



                                                  
  0%|          | 0/1 [01:58<?, ?it/s]2,  4.08it/s][A
 15%|█▌        | 417/2754 [01:58<09:37,  4.05it/s][A

epoch: 0, step: 621, loss: 3.4867



                                                  
  0%|          | 0/1 [01:58<?, ?it/s]7,  4.05it/s][A
 15%|█▌        | 418/2754 [01:58<09:58,  3.90it/s][A

epoch: 0, step: 622, loss: 4.23767



                                                  
  0%|          | 0/1 [01:58<?, ?it/s]8,  3.90it/s][A
 15%|█▌        | 419/2754 [01:58<09:48,  3.97it/s][A

epoch: 0, step: 623, loss: 4.41



                                                  
  0%|          | 0/1 [01:59<?, ?it/s]8,  3.97it/s][A
 15%|█▌        | 420/2754 [01:59<09:44,  3.99it/s][A

epoch: 0, step: 624, loss: 2.96359



                                                  
  0%|          | 0/1 [01:59<?, ?it/s]4,  3.99it/s][A
 15%|█▌        | 421/2754 [01:59<09:25,  4.12it/s][A

epoch: 0, step: 625, loss: 6.35496



                                                  
  0%|          | 0/1 [01:59<?, ?it/s]5,  4.12it/s][A
 15%|█▌        | 422/2754 [01:59<09:19,  4.17it/s][A

epoch: 0, step: 626, loss: 6.17866



                                                  
  0%|          | 0/1 [01:59<?, ?it/s]9,  4.17it/s][A
 15%|█▌        | 423/2754 [01:59<09:04,  4.28it/s][A

epoch: 0, step: 627, loss: 2.22193



                                                  
  0%|          | 0/1 [02:00<?, ?it/s]4,  4.28it/s][A
 15%|█▌        | 424/2754 [02:00<08:56,  4.34it/s][A

epoch: 0, step: 628, loss: 1.87976



                                                  
  0%|          | 0/1 [02:00<?, ?it/s]6,  4.34it/s][A
 15%|█▌        | 425/2754 [02:00<08:51,  4.38it/s][A

epoch: 0, step: 629, loss: 5.66551



                                                  
  0%|          | 0/1 [02:00<?, ?it/s]1,  4.38it/s][A
 15%|█▌        | 426/2754 [02:00<09:23,  4.13it/s][A

epoch: 0, step: 630, loss: 1.20045



                                                  
  0%|          | 0/1 [02:00<?, ?it/s]3,  4.13it/s][A
 16%|█▌        | 427/2754 [02:00<09:20,  4.15it/s][A

epoch: 0, step: 631, loss: 3.793



                                                  
  0%|          | 0/1 [02:01<?, ?it/s]0,  4.15it/s][A
 16%|█▌        | 428/2754 [02:01<10:39,  3.64it/s][A

epoch: 0, step: 632, loss: 2.30791



                                                  
  0%|          | 0/1 [02:01<?, ?it/s]9,  3.64it/s][A
 16%|█▌        | 429/2754 [02:01<10:09,  3.82it/s][A

epoch: 0, step: 633, loss: 3.49693



                                                  
  0%|          | 0/1 [02:01<?, ?it/s]9,  3.82it/s][A
 16%|█▌        | 430/2754 [02:01<11:14,  3.45it/s][A

epoch: 0, step: 634, loss: 2.73143



                                                  
  0%|          | 0/1 [02:01<?, ?it/s]4,  3.45it/s][A
 16%|█▌        | 431/2754 [02:01<10:34,  3.66it/s][A

epoch: 0, step: 635, loss: 2.91767



                                                  
  0%|          | 0/1 [02:02<?, ?it/s]4,  3.66it/s][A
 16%|█▌        | 432/2754 [02:02<10:15,  3.77it/s][A

epoch: 0, step: 636, loss: 1.21772



                                                  
  0%|          | 0/1 [02:02<?, ?it/s]5,  3.77it/s][A
 16%|█▌        | 433/2754 [02:02<10:09,  3.81it/s][A

epoch: 0, step: 637, loss: 3.80911



                                                  
  0%|          | 0/1 [02:02<?, ?it/s]9,  3.81it/s][A
 16%|█▌        | 434/2754 [02:02<09:52,  3.92it/s][A

epoch: 0, step: 638, loss: 3.0427



                                                  
  0%|          | 0/1 [02:02<?, ?it/s]2,  3.92it/s][A
 16%|█▌        | 435/2754 [02:02<09:39,  4.00it/s][A

epoch: 0, step: 639, loss: 3.6104



                                                  
  0%|          | 0/1 [02:03<?, ?it/s]9,  4.00it/s][A
 16%|█▌        | 436/2754 [02:03<11:56,  3.24it/s][A

epoch: 0, step: 640, loss: 3.49817



                                                  
  0%|          | 0/1 [02:03<?, ?it/s]6,  3.24it/s][A
 16%|█▌        | 437/2754 [02:03<11:02,  3.50it/s][A

epoch: 0, step: 641, loss: 4.82429



                                                  
  0%|          | 0/1 [02:04<?, ?it/s]2,  3.50it/s][A
 16%|█▌        | 438/2754 [02:04<13:14,  2.92it/s][A

epoch: 0, step: 642, loss: 3.11778



                                                  
  0%|          | 0/1 [02:04<?, ?it/s]4,  2.92it/s][A
 16%|█▌        | 439/2754 [02:04<12:21,  3.12it/s][A

epoch: 0, step: 643, loss: 3.23272



                                                  
  0%|          | 0/1 [02:04<?, ?it/s]1,  3.12it/s][A
 16%|█▌        | 440/2754 [02:04<11:15,  3.43it/s][A

epoch: 0, step: 644, loss: 4.43022



                                                  
  0%|          | 0/1 [02:04<?, ?it/s]5,  3.43it/s][A
 16%|█▌        | 441/2754 [02:04<10:51,  3.55it/s][A

epoch: 0, step: 645, loss: 3.93354



                                                  
  0%|          | 0/1 [02:05<?, ?it/s]1,  3.55it/s][A
 16%|█▌        | 442/2754 [02:05<10:18,  3.74it/s][A

epoch: 0, step: 646, loss: 4.92041



                                                  
  0%|          | 0/1 [02:05<?, ?it/s]8,  3.74it/s][A
 16%|█▌        | 443/2754 [02:05<10:09,  3.79it/s][A

epoch: 0, step: 647, loss: 4.03707



                                                  
  0%|          | 0/1 [02:05<?, ?it/s]9,  3.79it/s][A
 16%|█▌        | 444/2754 [02:05<09:47,  3.93it/s][A

epoch: 0, step: 648, loss: 2.70463



                                                  
  0%|          | 0/1 [02:05<?, ?it/s]7,  3.93it/s][A
 16%|█▌        | 445/2754 [02:05<09:40,  3.98it/s][A

epoch: 0, step: 649, loss: 3.33753



                                                  
  0%|          | 0/1 [02:06<?, ?it/s]0,  3.98it/s][A
 16%|█▌        | 446/2754 [02:06<11:12,  3.43it/s][A

epoch: 0, step: 650, loss: 3.16105



                                                  
  0%|          | 0/1 [02:06<?, ?it/s]2,  3.43it/s][A
 16%|█▌        | 447/2754 [02:06<10:48,  3.56it/s][A

epoch: 0, step: 651, loss: 2.83741



                                                  
  0%|          | 0/1 [02:06<?, ?it/s]8,  3.56it/s][A
 16%|█▋        | 448/2754 [02:06<10:33,  3.64it/s][A

epoch: 0, step: 652, loss: 4.24687



                                                  
  0%|          | 0/1 [02:06<?, ?it/s]3,  3.64it/s][A
 16%|█▋        | 449/2754 [02:06<10:20,  3.71it/s][A

epoch: 0, step: 653, loss: 2.75106



                                                  
  0%|          | 0/1 [02:07<?, ?it/s]0,  3.71it/s][A
 16%|█▋        | 450/2754 [02:07<10:05,  3.80it/s][A

epoch: 0, step: 654, loss: 2.4846



                                                  
  0%|          | 0/1 [02:07<?, ?it/s]5,  3.80it/s][A
 16%|█▋        | 451/2754 [02:07<10:29,  3.66it/s][A

epoch: 0, step: 655, loss: 3.2484



                                                  
  0%|          | 0/1 [02:07<?, ?it/s]9,  3.66it/s][A
 16%|█▋        | 452/2754 [02:07<10:07,  3.79it/s][A

epoch: 0, step: 656, loss: 3.89288



                                                  
  0%|          | 0/1 [02:07<?, ?it/s]7,  3.79it/s][A
 16%|█▋        | 453/2754 [02:07<09:42,  3.95it/s][A

epoch: 0, step: 657, loss: 1.80363



                                                  
  0%|          | 0/1 [02:08<?, ?it/s]2,  3.95it/s][A
 16%|█▋        | 454/2754 [02:08<09:47,  3.92it/s][A

epoch: 0, step: 658, loss: 2.40063



                                                  
  0%|          | 0/1 [02:08<?, ?it/s]7,  3.92it/s][A
 17%|█▋        | 455/2754 [02:08<09:46,  3.92it/s][A

epoch: 0, step: 659, loss: 2.36914



                                                  
  0%|          | 0/1 [02:08<?, ?it/s]6,  3.92it/s][A
 17%|█▋        | 456/2754 [02:08<09:30,  4.03it/s][A

epoch: 0, step: 660, loss: 2.70656



                                                  
  0%|          | 0/1 [02:08<?, ?it/s]0,  4.03it/s][A
 17%|█▋        | 457/2754 [02:08<09:27,  4.05it/s][A

epoch: 0, step: 661, loss: 5.258



                                                  
  0%|          | 0/1 [02:09<?, ?it/s]7,  4.05it/s][A
 17%|█▋        | 458/2754 [02:09<09:52,  3.88it/s][A

epoch: 0, step: 662, loss: 3.58187



                                                  
  0%|          | 0/1 [02:09<?, ?it/s]2,  3.88it/s][A
 17%|█▋        | 459/2754 [02:09<09:44,  3.93it/s][A

epoch: 0, step: 663, loss: 3.01506



                                                  
  0%|          | 0/1 [02:09<?, ?it/s]4,  3.93it/s][A
 17%|█▋        | 460/2754 [02:09<09:40,  3.95it/s][A

epoch: 0, step: 664, loss: 2.96053



                                                  
  0%|          | 0/1 [02:09<?, ?it/s]0,  3.95it/s][A
 17%|█▋        | 461/2754 [02:09<09:47,  3.91it/s][A

epoch: 0, step: 665, loss: 2.46974



                                                  
  0%|          | 0/1 [02:10<?, ?it/s]7,  3.91it/s][A
 17%|█▋        | 462/2754 [02:10<09:46,  3.91it/s][A

epoch: 0, step: 666, loss: 6.59423



                                                  
  0%|          | 0/1 [02:10<?, ?it/s]6,  3.91it/s][A
 17%|█▋        | 463/2754 [02:10<10:20,  3.69it/s][A

epoch: 0, step: 667, loss: 3.48334



                                                  
  0%|          | 0/1 [02:10<?, ?it/s]0,  3.69it/s][A
 17%|█▋        | 464/2754 [02:10<10:21,  3.68it/s][A

epoch: 0, step: 668, loss: 3.39298



                                                  
  0%|          | 0/1 [02:11<?, ?it/s]1,  3.68it/s][A
 17%|█▋        | 465/2754 [02:11<10:02,  3.80it/s][A

epoch: 0, step: 669, loss: 4.15466



                                                  
  0%|          | 0/1 [02:11<?, ?it/s]2,  3.80it/s][A
 17%|█▋        | 466/2754 [02:11<09:31,  4.00it/s][A

epoch: 0, step: 670, loss: 2.74029



                                                  
  0%|          | 0/1 [02:11<?, ?it/s]1,  4.00it/s][A
 17%|█▋        | 467/2754 [02:11<09:59,  3.82it/s][A

epoch: 0, step: 671, loss: 2.86292



                                                  
  0%|          | 0/1 [02:11<?, ?it/s]9,  3.82it/s][A
 17%|█▋        | 468/2754 [02:11<09:51,  3.87it/s][A

epoch: 0, step: 672, loss: 5.20591



                                                  
  0%|          | 0/1 [02:12<?, ?it/s]1,  3.87it/s][A
 17%|█▋        | 469/2754 [02:12<09:46,  3.90it/s][A

epoch: 0, step: 673, loss: 2.12808



                                                  
  0%|          | 0/1 [02:12<?, ?it/s]6,  3.90it/s][A
 17%|█▋        | 470/2754 [02:12<09:28,  4.02it/s][A

epoch: 0, step: 674, loss: 4.09081



                                                  
  0%|          | 0/1 [02:12<?, ?it/s]8,  4.02it/s][A
 17%|█▋        | 471/2754 [02:12<09:31,  3.99it/s][A

epoch: 0, step: 675, loss: 3.30018



                                                  
  0%|          | 0/1 [02:12<?, ?it/s]1,  3.99it/s][A
 17%|█▋        | 472/2754 [02:12<09:31,  3.99it/s][A

epoch: 0, step: 676, loss: 3.46832



                                                  
  0%|          | 0/1 [02:13<?, ?it/s]1,  3.99it/s][A
 17%|█▋        | 473/2754 [02:13<10:02,  3.79it/s][A

epoch: 0, step: 677, loss: 3.2836



                                                  
  0%|          | 0/1 [02:13<?, ?it/s]2,  3.79it/s][A
 17%|█▋        | 474/2754 [02:13<09:56,  3.82it/s][A

epoch: 0, step: 678, loss: 4.63676



                                                  
  0%|          | 0/1 [02:13<?, ?it/s]6,  3.82it/s][A
 17%|█▋        | 475/2754 [02:13<12:19,  3.08it/s][A

epoch: 0, step: 679, loss: 2.3913



                                                  
  0%|          | 0/1 [02:14<?, ?it/s]9,  3.08it/s][A
 17%|█▋        | 476/2754 [02:14<15:24,  2.46it/s][A

epoch: 0, step: 680, loss: 3.174



                                                  
  0%|          | 0/1 [02:14<?, ?it/s]4,  2.46it/s][A
 17%|█▋        | 477/2754 [02:14<13:37,  2.78it/s][A

epoch: 0, step: 681, loss: 3.60543



                                                  
  0%|          | 0/1 [02:14<?, ?it/s]7,  2.78it/s][A
 17%|█▋        | 478/2754 [02:14<12:20,  3.08it/s][A

epoch: 0, step: 682, loss: 2.32514



                                                  
  0%|          | 0/1 [02:15<?, ?it/s]0,  3.08it/s][A
 17%|█▋        | 479/2754 [02:15<11:59,  3.16it/s][A

epoch: 0, step: 683, loss: 3.34257



                                                  
  0%|          | 0/1 [02:15<?, ?it/s]9,  3.16it/s][A
 17%|█▋        | 480/2754 [02:15<11:48,  3.21it/s][A

epoch: 0, step: 684, loss: 3.4047



                                                  
  0%|          | 0/1 [02:15<?, ?it/s]8,  3.21it/s][A
 17%|█▋        | 481/2754 [02:15<11:10,  3.39it/s][A

epoch: 0, step: 685, loss: 3.19972



                                                  
  0%|          | 0/1 [02:16<?, ?it/s]0,  3.39it/s][A
 18%|█▊        | 482/2754 [02:16<10:36,  3.57it/s][A

epoch: 0, step: 686, loss: 4.8929



                                                  
  0%|          | 0/1 [02:16<?, ?it/s]6,  3.57it/s][A
 18%|█▊        | 483/2754 [02:16<10:15,  3.69it/s][A

epoch: 0, step: 687, loss: 2.43353



                                                  
  0%|          | 0/1 [02:16<?, ?it/s]5,  3.69it/s][A
 18%|█▊        | 484/2754 [02:16<09:53,  3.83it/s][A

epoch: 0, step: 688, loss: 2.93697



                                                  
  0%|          | 0/1 [02:16<?, ?it/s]3,  3.83it/s][A
 18%|█▊        | 485/2754 [02:16<09:48,  3.85it/s][A

epoch: 0, step: 689, loss: 3.6036



                                                  
  0%|          | 0/1 [02:17<?, ?it/s]8,  3.85it/s][A
 18%|█▊        | 486/2754 [02:17<12:23,  3.05it/s][A

epoch: 0, step: 690, loss: 3.73221



                                                  
  0%|          | 0/1 [02:17<?, ?it/s]3,  3.05it/s][A
 18%|█▊        | 487/2754 [02:17<11:35,  3.26it/s][A

epoch: 0, step: 691, loss: 3.50092



                                                  
  0%|          | 0/1 [02:17<?, ?it/s]5,  3.26it/s][A
 18%|█▊        | 488/2754 [02:17<10:54,  3.46it/s][A

epoch: 0, step: 692, loss: 1.8134



                                                  
  0%|          | 0/1 [02:17<?, ?it/s]4,  3.46it/s][A
 18%|█▊        | 489/2754 [02:17<10:23,  3.63it/s][A

epoch: 0, step: 693, loss: 3.89509



                                                  
  0%|          | 0/1 [02:18<?, ?it/s]3,  3.63it/s][A
 18%|█▊        | 490/2754 [02:18<10:06,  3.73it/s][A

epoch: 0, step: 694, loss: 2.62442



                                                  
  0%|          | 0/1 [02:18<?, ?it/s]6,  3.73it/s][A
 18%|█▊        | 491/2754 [02:18<10:00,  3.77it/s][A

epoch: 0, step: 695, loss: 3.29111



                                                  
  0%|          | 0/1 [02:18<?, ?it/s]0,  3.77it/s][A
 18%|█▊        | 492/2754 [02:18<12:23,  3.04it/s][A

epoch: 0, step: 696, loss: 2.47877



                                                  
  0%|          | 0/1 [02:19<?, ?it/s]3,  3.04it/s][A
 18%|█▊        | 493/2754 [02:19<13:23,  2.81it/s][A

epoch: 0, step: 697, loss: 1.60257



                                                  
  0%|          | 0/1 [02:19<?, ?it/s]3,  2.81it/s][A
 18%|█▊        | 494/2754 [02:19<12:14,  3.08it/s][A

epoch: 0, step: 698, loss: 4.01418



                                                  
  0%|          | 0/1 [02:19<?, ?it/s]4,  3.08it/s][A
 18%|█▊        | 495/2754 [02:19<11:27,  3.29it/s][A

epoch: 0, step: 699, loss: 3.97293



                                                  
  0%|          | 0/1 [02:20<?, ?it/s]7,  3.29it/s][A
 18%|█▊        | 496/2754 [02:20<10:49,  3.48it/s][A

epoch: 0, step: 700, loss: 3.83908



                                                  
  0%|          | 0/1 [02:20<?, ?it/s]9,  3.48it/s][A
 18%|█▊        | 497/2754 [02:20<10:42,  3.51it/s][A

epoch: 0, step: 701, loss: 3.41565



                                                  
  0%|          | 0/1 [02:20<?, ?it/s]2,  3.51it/s][A
 18%|█▊        | 498/2754 [02:20<10:10,  3.70it/s][A

epoch: 0, step: 702, loss: 3.89602



                                                  
  0%|          | 0/1 [02:20<?, ?it/s]0,  3.70it/s][A
 18%|█▊        | 499/2754 [02:20<09:56,  3.78it/s][A

epoch: 0, step: 703, loss: 2.44275



                                                  
  0%|          | 0/1 [02:21<?, ?it/s]6,  3.78it/s][A
 18%|█▊        | 500/2754 [02:21<09:34,  3.92it/s][A

epoch: 0, step: 704, loss: 2.89072



                                                  
  0%|          | 0/1 [02:21<?, ?it/s]4,  3.92it/s][A
 18%|█▊        | 501/2754 [02:21<09:36,  3.91it/s][A

epoch: 0, step: 705, loss: 2.93976



                                                  
  0%|          | 0/1 [02:21<?, ?it/s]6,  3.91it/s][A
 18%|█▊        | 502/2754 [02:21<09:37,  3.90it/s][A

epoch: 0, step: 706, loss: 4.31124



                                                  
  0%|          | 0/1 [02:21<?, ?it/s]7,  3.90it/s][A
 18%|█▊        | 503/2754 [02:21<09:53,  3.80it/s][A

epoch: 0, step: 707, loss: 3.3171



                                                  
  0%|          | 0/1 [02:22<?, ?it/s]3,  3.80it/s][A
 18%|█▊        | 504/2754 [02:22<09:39,  3.88it/s][A

epoch: 0, step: 708, loss: 4.99418



                                                  
  0%|          | 0/1 [02:22<?, ?it/s]9,  3.88it/s][A
 18%|█▊        | 505/2754 [02:22<09:31,  3.93it/s][A

epoch: 0, step: 709, loss: 5.33856



                                                  
  0%|          | 0/1 [02:22<?, ?it/s]1,  3.93it/s][A
 18%|█▊        | 506/2754 [02:22<09:25,  3.98it/s][A

epoch: 0, step: 710, loss: 3.97889



                                                  
  0%|          | 0/1 [02:23<?, ?it/s]5,  3.98it/s][A
 18%|█▊        | 507/2754 [02:23<14:34,  2.57it/s][A

epoch: 0, step: 711, loss: 2.57453



                                                  
  0%|          | 0/1 [02:23<?, ?it/s]4,  2.57it/s][A
 18%|█▊        | 508/2754 [02:23<13:23,  2.80it/s][A

epoch: 0, step: 712, loss: 2.82506



                                                  
  0%|          | 0/1 [02:23<?, ?it/s]3,  2.80it/s][A
 18%|█▊        | 509/2754 [02:23<12:15,  3.05it/s][A

epoch: 0, step: 713, loss: 1.04479



                                                  
  0%|          | 0/1 [02:24<?, ?it/s]5,  3.05it/s][A
 19%|█▊        | 510/2754 [02:24<11:25,  3.28it/s][A

epoch: 0, step: 714, loss: 3.11042



                                                  
  0%|          | 0/1 [02:24<?, ?it/s]5,  3.28it/s][A
 19%|█▊        | 511/2754 [02:24<10:46,  3.47it/s][A

epoch: 0, step: 715, loss: 3.77747



                                                  
  0%|          | 0/1 [02:24<?, ?it/s]6,  3.47it/s][A
 19%|█▊        | 512/2754 [02:24<10:34,  3.53it/s][A

epoch: 0, step: 716, loss: 3.0285



                                                  
  0%|          | 0/1 [02:25<?, ?it/s]4,  3.53it/s][A
 19%|█▊        | 513/2754 [02:25<10:46,  3.47it/s][A

epoch: 0, step: 717, loss: 3.29921



                                                  
  0%|          | 0/1 [02:25<?, ?it/s]6,  3.47it/s][A
 19%|█▊        | 514/2754 [02:25<10:45,  3.47it/s][A

epoch: 0, step: 718, loss: 3.94614



                                                  
  0%|          | 0/1 [02:25<?, ?it/s]5,  3.47it/s][A
 19%|█▊        | 515/2754 [02:25<10:46,  3.46it/s][A

epoch: 0, step: 719, loss: 3.94537



                                                  
  0%|          | 0/1 [02:25<?, ?it/s]6,  3.46it/s][A
 19%|█▊        | 516/2754 [02:25<10:52,  3.43it/s][A

epoch: 0, step: 720, loss: 3.02624



                                                  
  0%|          | 0/1 [02:26<?, ?it/s]2,  3.43it/s][A
 19%|█▉        | 517/2754 [02:26<10:50,  3.44it/s][A

epoch: 0, step: 721, loss: 3.43191



                                                  
  0%|          | 0/1 [02:26<?, ?it/s]0,  3.44it/s][A
 19%|█▉        | 518/2754 [02:26<10:55,  3.41it/s][A

epoch: 0, step: 722, loss: 2.67969



                                                  
  0%|          | 0/1 [02:26<?, ?it/s]5,  3.41it/s][A
 19%|█▉        | 519/2754 [02:26<10:26,  3.56it/s][A

epoch: 0, step: 723, loss: 4.30982



                                                  
  0%|          | 0/1 [02:26<?, ?it/s]6,  3.56it/s][A
 19%|█▉        | 520/2754 [02:26<09:59,  3.72it/s][A

epoch: 0, step: 724, loss: 1.94057



                                                  
  0%|          | 0/1 [02:27<?, ?it/s]9,  3.72it/s][A
 19%|█▉        | 521/2754 [02:27<10:11,  3.65it/s][A

epoch: 0, step: 725, loss: 3.38694



                                                  
  0%|          | 0/1 [02:27<?, ?it/s]1,  3.65it/s][A
 19%|█▉        | 522/2754 [02:27<11:38,  3.19it/s][A

epoch: 0, step: 726, loss: 3.68153



                                                  
  0%|          | 0/1 [02:28<?, ?it/s]8,  3.19it/s][A
 19%|█▉        | 523/2754 [02:28<15:59,  2.33it/s][A

epoch: 0, step: 727, loss: 3.37222



                                                  
  0%|          | 0/1 [02:28<?, ?it/s]9,  2.33it/s][A
 19%|█▉        | 524/2754 [02:28<16:44,  2.22it/s][A

epoch: 0, step: 728, loss: 5.05358



                                                  
  0%|          | 0/1 [02:29<?, ?it/s]4,  2.22it/s][A
 19%|█▉        | 525/2754 [02:29<14:45,  2.52it/s][A

epoch: 0, step: 729, loss: 3.24217



                                                  
  0%|          | 0/1 [02:29<?, ?it/s]5,  2.52it/s][A
 19%|█▉        | 526/2754 [02:29<13:03,  2.84it/s][A

epoch: 0, step: 730, loss: 4.68602



                                                  
  0%|          | 0/1 [02:29<?, ?it/s]3,  2.84it/s][A
 19%|█▉        | 527/2754 [02:29<13:58,  2.66it/s][A

epoch: 0, step: 731, loss: 2.86296



                                                  
  0%|          | 0/1 [02:30<?, ?it/s]8,  2.66it/s][A
 19%|█▉        | 528/2754 [02:30<12:40,  2.93it/s][A

epoch: 0, step: 732, loss: 3.28781



                                                  
  0%|          | 0/1 [02:30<?, ?it/s]0,  2.93it/s][A
 19%|█▉        | 529/2754 [02:30<11:39,  3.18it/s][A

epoch: 0, step: 733, loss: 3.29327



                                                  
  0%|          | 0/1 [02:30<?, ?it/s]9,  3.18it/s][A
 19%|█▉        | 530/2754 [02:30<12:38,  2.93it/s][A

epoch: 0, step: 734, loss: 1.11434



                                                  
  0%|          | 0/1 [02:31<?, ?it/s]8,  2.93it/s][A
 19%|█▉        | 531/2754 [02:31<12:00,  3.08it/s][A

epoch: 0, step: 735, loss: 2.68353



                                                  
  0%|          | 0/1 [02:31<?, ?it/s]0,  3.08it/s][A
 19%|█▉        | 532/2754 [02:31<12:24,  2.99it/s][A

epoch: 0, step: 736, loss: 3.34288



                                                  
  0%|          | 0/1 [02:31<?, ?it/s]4,  2.99it/s][A
 19%|█▉        | 533/2754 [02:31<11:50,  3.13it/s][A

epoch: 0, step: 737, loss: 2.56942



                                                  
  0%|          | 0/1 [02:31<?, ?it/s]0,  3.13it/s][A
 19%|█▉        | 534/2754 [02:31<10:57,  3.37it/s][A

epoch: 0, step: 738, loss: 3.85673



                                                  
  0%|          | 0/1 [02:32<?, ?it/s]7,  3.37it/s][A
 19%|█▉        | 535/2754 [02:32<10:04,  3.67it/s][A

epoch: 0, step: 739, loss: 3.0508



                                                  
  0%|          | 0/1 [02:32<?, ?it/s]4,  3.67it/s][A
 19%|█▉        | 536/2754 [02:32<09:40,  3.82it/s][A

epoch: 0, step: 740, loss: 5.26848



                                                  
  0%|          | 0/1 [02:32<?, ?it/s]0,  3.82it/s][A
 19%|█▉        | 537/2754 [02:32<09:33,  3.86it/s][A

epoch: 0, step: 741, loss: 1.82545



                                                  
  0%|          | 0/1 [02:32<?, ?it/s]3,  3.86it/s][A
 20%|█▉        | 538/2754 [02:32<09:12,  4.01it/s][A

epoch: 0, step: 742, loss: 5.43971



                                                  
  0%|          | 0/1 [02:33<?, ?it/s]2,  4.01it/s][A
 20%|█▉        | 539/2754 [02:33<09:18,  3.96it/s][A

epoch: 0, step: 743, loss: 4.2209



                                                  
  0%|          | 0/1 [02:33<?, ?it/s]8,  3.96it/s][A
 20%|█▉        | 540/2754 [02:33<09:03,  4.07it/s][A

epoch: 0, step: 744, loss: 2.89614



                                                  
  0%|          | 0/1 [02:33<?, ?it/s]3,  4.07it/s][A
 20%|█▉        | 541/2754 [02:33<09:12,  4.01it/s][A

epoch: 0, step: 745, loss: 3.02436



                                                  
  0%|          | 0/1 [02:33<?, ?it/s]2,  4.01it/s][A
 20%|█▉        | 542/2754 [02:33<09:26,  3.91it/s][A

epoch: 0, step: 746, loss: 2.52999



                                                  
  0%|          | 0/1 [02:34<?, ?it/s]6,  3.91it/s][A
 20%|█▉        | 543/2754 [02:34<09:42,  3.79it/s][A

epoch: 0, step: 747, loss: 3.82267



                                                  
  0%|          | 0/1 [02:34<?, ?it/s]2,  3.79it/s][A
 20%|█▉        | 544/2754 [02:34<09:50,  3.74it/s][A

epoch: 0, step: 748, loss: 3.82891



                                                  
  0%|          | 0/1 [02:34<?, ?it/s]0,  3.74it/s][A
 20%|█▉        | 545/2754 [02:34<09:36,  3.83it/s][A

epoch: 0, step: 749, loss: 4.52819



                                                  
  0%|          | 0/1 [02:34<?, ?it/s]6,  3.83it/s][A
 20%|█▉        | 546/2754 [02:34<09:26,  3.90it/s][A

epoch: 0, step: 750, loss: 2.38659



                                                  
  0%|          | 0/1 [02:35<?, ?it/s]6,  3.90it/s][A
 20%|█▉        | 547/2754 [02:35<09:18,  3.95it/s][A

epoch: 0, step: 751, loss: 3.69552



                                                  
  0%|          | 0/1 [02:35<?, ?it/s]8,  3.95it/s][A
 20%|█▉        | 548/2754 [02:35<09:20,  3.93it/s][A

epoch: 0, step: 752, loss: 3.43579



                                                  
  0%|          | 0/1 [02:35<?, ?it/s]0,  3.93it/s][A
 20%|█▉        | 549/2754 [02:35<09:10,  4.01it/s][A

epoch: 0, step: 753, loss: 2.32587



                                                  
  0%|          | 0/1 [02:35<?, ?it/s]0,  4.01it/s][A
 20%|█▉        | 550/2754 [02:35<09:40,  3.80it/s][A

epoch: 0, step: 754, loss: 3.34205



                                                  
  0%|          | 0/1 [02:36<?, ?it/s]0,  3.80it/s][A
 20%|██        | 551/2754 [02:36<09:37,  3.81it/s][A

epoch: 0, step: 755, loss: 2.92927



                                                  
  0%|          | 0/1 [02:36<?, ?it/s]7,  3.81it/s][A
 20%|██        | 552/2754 [02:36<11:13,  3.27it/s][A

epoch: 0, step: 756, loss: 3.32101



                                                  
  0%|          | 0/1 [02:36<?, ?it/s]3,  3.27it/s][A
 20%|██        | 553/2754 [02:36<11:11,  3.28it/s][A

epoch: 0, step: 757, loss: 3.22408



                                                  
  0%|          | 0/1 [02:37<?, ?it/s]1,  3.28it/s][A
 20%|██        | 554/2754 [02:37<10:47,  3.40it/s][A

epoch: 0, step: 758, loss: 2.11218



                                                  
  0%|          | 0/1 [02:37<?, ?it/s]7,  3.40it/s][A
 20%|██        | 555/2754 [02:37<10:19,  3.55it/s][A

epoch: 0, step: 759, loss: 3.01814



                                                  
  0%|          | 0/1 [02:37<?, ?it/s]9,  3.55it/s][A
 20%|██        | 556/2754 [02:37<09:53,  3.70it/s][A

epoch: 0, step: 760, loss: 3.72665



                                                  
  0%|          | 0/1 [02:37<?, ?it/s]3,  3.70it/s][A
 20%|██        | 557/2754 [02:37<09:27,  3.87it/s][A

epoch: 0, step: 761, loss: 4.91441



                                                  
  0%|          | 0/1 [02:38<?, ?it/s]7,  3.87it/s][A
 20%|██        | 558/2754 [02:38<09:15,  3.95it/s][A

epoch: 0, step: 762, loss: 3.25099



                                                  
  0%|          | 0/1 [02:38<?, ?it/s]5,  3.95it/s][A
 20%|██        | 559/2754 [02:38<09:36,  3.81it/s][A

epoch: 0, step: 763, loss: 3.55467



                                                  
  0%|          | 0/1 [02:38<?, ?it/s]6,  3.81it/s][A
 20%|██        | 560/2754 [02:38<09:18,  3.93it/s][A

epoch: 0, step: 764, loss: 3.64306



                                                  
  0%|          | 0/1 [02:38<?, ?it/s]8,  3.93it/s][A
 20%|██        | 561/2754 [02:38<09:09,  3.99it/s][A

epoch: 0, step: 765, loss: 6.68936



                                                  
  0%|          | 0/1 [02:39<?, ?it/s]9,  3.99it/s][A
 20%|██        | 562/2754 [02:39<08:58,  4.07it/s][A

epoch: 0, step: 766, loss: 4.2469



                                                  
  0%|          | 0/1 [02:39<?, ?it/s]8,  4.07it/s][A
 20%|██        | 563/2754 [02:39<09:23,  3.89it/s][A

epoch: 0, step: 767, loss: 3.16889



                                                  
  0%|          | 0/1 [02:39<?, ?it/s]3,  3.89it/s][A
 20%|██        | 564/2754 [02:39<09:14,  3.95it/s][A

epoch: 0, step: 768, loss: 4.63799



                                                  
  0%|          | 0/1 [02:39<?, ?it/s]4,  3.95it/s][A
 21%|██        | 565/2754 [02:39<09:23,  3.89it/s][A

epoch: 0, step: 769, loss: 3.06548



                                                  
  0%|          | 0/1 [02:40<?, ?it/s]3,  3.89it/s][A
 21%|██        | 566/2754 [02:40<09:08,  3.99it/s][A

epoch: 0, step: 770, loss: 4.16906



                                                  
  0%|          | 0/1 [02:40<?, ?it/s]8,  3.99it/s][A
 21%|██        | 567/2754 [02:40<09:08,  3.99it/s][A

epoch: 0, step: 771, loss: 4.04402



                                                  
  0%|          | 0/1 [02:40<?, ?it/s]8,  3.99it/s][A
 21%|██        | 568/2754 [02:40<09:40,  3.77it/s][A

epoch: 0, step: 772, loss: 2.88516



                                                  
  0%|          | 0/1 [02:40<?, ?it/s]0,  3.77it/s][A
 21%|██        | 569/2754 [02:40<09:36,  3.79it/s][A

epoch: 0, step: 773, loss: 4.06771



                                                  
  0%|          | 0/1 [02:41<?, ?it/s]6,  3.79it/s][A
 21%|██        | 570/2754 [02:41<09:56,  3.66it/s][A

epoch: 0, step: 774, loss: 3.8376



                                                  
  0%|          | 0/1 [02:41<?, ?it/s]6,  3.66it/s][A
 21%|██        | 571/2754 [02:41<09:31,  3.82it/s][A

epoch: 0, step: 775, loss: 4.58053



                                                  
  0%|          | 0/1 [02:41<?, ?it/s]1,  3.82it/s][A
 21%|██        | 572/2754 [02:41<09:22,  3.88it/s][A

epoch: 0, step: 776, loss: 3.26955



                                                  
  0%|          | 0/1 [02:41<?, ?it/s]2,  3.88it/s][A
 21%|██        | 573/2754 [02:41<09:09,  3.97it/s][A

epoch: 0, step: 777, loss: 1.47281



                                                  
  0%|          | 0/1 [02:42<?, ?it/s]9,  3.97it/s][A
 21%|██        | 574/2754 [02:42<09:29,  3.83it/s][A

epoch: 0, step: 778, loss: 2.62695



                                                  
  0%|          | 0/1 [02:42<?, ?it/s]9,  3.83it/s][A
 21%|██        | 575/2754 [02:42<09:39,  3.76it/s][A

epoch: 0, step: 779, loss: 2.51779



                                                  
  0%|          | 0/1 [02:42<?, ?it/s]9,  3.76it/s][A
 21%|██        | 576/2754 [02:42<09:29,  3.83it/s][A

epoch: 0, step: 780, loss: 3.56703



                                                  
  0%|          | 0/1 [02:43<?, ?it/s]9,  3.83it/s][A
 21%|██        | 577/2754 [02:43<09:29,  3.82it/s][A

epoch: 0, step: 781, loss: 4.86908



                                                  
  0%|          | 0/1 [02:43<?, ?it/s]9,  3.82it/s][A
 21%|██        | 578/2754 [02:43<09:21,  3.88it/s][A

epoch: 0, step: 782, loss: 2.78346



                                                  
  0%|          | 0/1 [02:43<?, ?it/s]1,  3.88it/s][A
 21%|██        | 579/2754 [02:43<11:26,  3.17it/s][A

epoch: 0, step: 783, loss: 2.80912



                                                  
  0%|          | 0/1 [02:43<?, ?it/s]6,  3.17it/s][A
 21%|██        | 580/2754 [02:43<10:29,  3.45it/s][A

epoch: 0, step: 784, loss: 2.73297



                                                  
  0%|          | 0/1 [02:44<?, ?it/s]9,  3.45it/s][A
 21%|██        | 581/2754 [02:44<10:05,  3.59it/s][A

epoch: 0, step: 785, loss: 3.5378



                                                  
  0%|          | 0/1 [02:44<?, ?it/s]5,  3.59it/s][A
 21%|██        | 582/2754 [02:44<10:20,  3.50it/s][A

epoch: 0, step: 786, loss: 2.79075



                                                  
  0%|          | 0/1 [02:45<?, ?it/s]0,  3.50it/s][A
 21%|██        | 583/2754 [02:45<12:23,  2.92it/s][A

epoch: 0, step: 787, loss: 3.27405



                                                  
  0%|          | 0/1 [02:45<?, ?it/s]3,  2.92it/s][A
 21%|██        | 584/2754 [02:45<12:45,  2.83it/s][A

epoch: 0, step: 788, loss: 3.95457



                                                  
  0%|          | 0/1 [02:45<?, ?it/s]5,  2.83it/s][A
 21%|██        | 585/2754 [02:45<11:29,  3.15it/s][A

epoch: 0, step: 789, loss: 3.61431



                                                  
  0%|          | 0/1 [02:45<?, ?it/s]9,  3.15it/s][A
 21%|██▏       | 586/2754 [02:45<10:58,  3.29it/s][A

epoch: 0, step: 790, loss: 2.21222



                                                  
  0%|          | 0/1 [02:46<?, ?it/s]8,  3.29it/s][A
 21%|██▏       | 587/2754 [02:46<10:48,  3.34it/s][A

epoch: 0, step: 791, loss: 3.58115



                                                  
  0%|          | 0/1 [02:46<?, ?it/s]8,  3.34it/s][A
 21%|██▏       | 588/2754 [02:46<10:11,  3.54it/s][A

epoch: 0, step: 792, loss: 2.72751



                                                  
  0%|          | 0/1 [02:46<?, ?it/s]1,  3.54it/s][A
 21%|██▏       | 589/2754 [02:46<09:48,  3.68it/s][A

epoch: 0, step: 793, loss: 4.19221



                                                  
  0%|          | 0/1 [02:46<?, ?it/s]8,  3.68it/s][A
 21%|██▏       | 590/2754 [02:46<09:34,  3.77it/s][A

epoch: 0, step: 794, loss: 3.91462



                                                  
  0%|          | 0/1 [02:47<?, ?it/s]4,  3.77it/s][A
 21%|██▏       | 591/2754 [02:47<09:21,  3.85it/s][A

epoch: 0, step: 795, loss: 3.68454



                                                  
  0%|          | 0/1 [02:47<?, ?it/s]1,  3.85it/s][A
 21%|██▏       | 592/2754 [02:47<09:11,  3.92it/s][A

epoch: 0, step: 796, loss: 3.81735



                                                  
  0%|          | 0/1 [02:47<?, ?it/s]1,  3.92it/s][A
 22%|██▏       | 593/2754 [02:47<08:51,  4.07it/s][A

epoch: 0, step: 797, loss: 7.40459



                                                  
  0%|          | 0/1 [02:47<?, ?it/s]1,  4.07it/s][A
 22%|██▏       | 594/2754 [02:47<08:44,  4.12it/s][A

epoch: 0, step: 798, loss: 4.02112



                                                  
  0%|          | 0/1 [02:48<?, ?it/s]4,  4.12it/s][A
 22%|██▏       | 595/2754 [02:48<08:59,  4.00it/s][A

epoch: 0, step: 799, loss: 2.83432



                                                  
  0%|          | 0/1 [02:48<?, ?it/s]9,  4.00it/s][A
 22%|██▏       | 596/2754 [02:48<08:42,  4.13it/s][A

epoch: 0, step: 800, loss: 1.55846



                                                  
  0%|          | 0/1 [02:48<?, ?it/s]2,  4.13it/s][A
 22%|██▏       | 597/2754 [02:48<08:23,  4.28it/s][A

epoch: 0, step: 801, loss: 1.81385



                                                  
  0%|          | 0/1 [02:48<?, ?it/s]3,  4.28it/s][A
 22%|██▏       | 598/2754 [02:48<08:17,  4.33it/s][A

epoch: 0, step: 802, loss: 2.76525



                                                  
  0%|          | 0/1 [02:49<?, ?it/s]7,  4.33it/s][A
 22%|██▏       | 599/2754 [02:49<08:38,  4.16it/s][A

epoch: 0, step: 803, loss: 3.74053



                                                  
  0%|          | 0/1 [02:49<?, ?it/s]8,  4.16it/s][A
 22%|██▏       | 600/2754 [02:49<08:47,  4.09it/s][A

epoch: 0, step: 804, loss: 3.15798



                                                  
  0%|          | 0/1 [02:49<?, ?it/s]7,  4.09it/s][A
 22%|██▏       | 601/2754 [02:49<09:15,  3.88it/s][A

epoch: 0, step: 805, loss: 3.05041



                                                  
  0%|          | 0/1 [02:49<?, ?it/s]5,  3.88it/s][A
 22%|██▏       | 602/2754 [02:49<09:44,  3.68it/s][A

epoch: 0, step: 806, loss: 3.46133



                                                  
  0%|          | 0/1 [02:50<?, ?it/s]4,  3.68it/s][A
 22%|██▏       | 603/2754 [02:50<09:47,  3.66it/s][A

epoch: 0, step: 807, loss: 3.93285



                                                  
  0%|          | 0/1 [02:50<?, ?it/s]7,  3.66it/s][A
 22%|██▏       | 604/2754 [02:50<09:21,  3.83it/s][A

epoch: 0, step: 808, loss: 4.14027



                                                  
  0%|          | 0/1 [02:50<?, ?it/s]1,  3.83it/s][A
 22%|██▏       | 605/2754 [02:50<08:57,  4.00it/s][A

epoch: 0, step: 809, loss: 2.82113



                                                  
  0%|          | 0/1 [02:50<?, ?it/s]7,  4.00it/s][A
 22%|██▏       | 606/2754 [02:50<08:45,  4.09it/s][A

epoch: 0, step: 810, loss: 2.14622



                                                  
  0%|          | 0/1 [02:51<?, ?it/s]5,  4.09it/s][A
 22%|██▏       | 607/2754 [02:51<08:38,  4.14it/s][A

epoch: 0, step: 811, loss: 2.66562



                                                  
  0%|          | 0/1 [02:51<?, ?it/s]8,  4.14it/s][A
 22%|██▏       | 608/2754 [02:51<09:09,  3.91it/s][A

epoch: 0, step: 812, loss: 3.40897



                                                  
  0%|          | 0/1 [02:51<?, ?it/s]9,  3.91it/s][A
 22%|██▏       | 609/2754 [02:51<08:50,  4.04it/s][A

epoch: 0, step: 813, loss: 4.05257



                                                  
  0%|          | 0/1 [02:51<?, ?it/s]0,  4.04it/s][A
 22%|██▏       | 610/2754 [02:51<08:43,  4.10it/s][A

epoch: 0, step: 814, loss: 6.11911



                                                  
  0%|          | 0/1 [02:52<?, ?it/s]3,  4.10it/s][A
 22%|██▏       | 611/2754 [02:52<08:30,  4.20it/s][A

epoch: 0, step: 815, loss: 2.09087



                                                  
  0%|          | 0/1 [02:52<?, ?it/s]0,  4.20it/s][A
 22%|██▏       | 612/2754 [02:52<08:45,  4.08it/s][A

epoch: 0, step: 816, loss: 3.33885



                                                  
  0%|          | 0/1 [02:52<?, ?it/s]5,  4.08it/s][A
 22%|██▏       | 613/2754 [02:52<08:47,  4.06it/s][A

epoch: 0, step: 817, loss: 3.23109



                                                  
  0%|          | 0/1 [02:52<?, ?it/s]7,  4.06it/s][A
 22%|██▏       | 614/2754 [02:52<08:42,  4.09it/s][A

epoch: 0, step: 818, loss: 1.79288



                                                  
  0%|          | 0/1 [02:53<?, ?it/s]2,  4.09it/s][A
 22%|██▏       | 615/2754 [02:53<08:39,  4.11it/s][A

epoch: 0, step: 819, loss: 5.38987



                                                  
  0%|          | 0/1 [02:53<?, ?it/s]9,  4.11it/s][A
 22%|██▏       | 616/2754 [02:53<08:55,  3.99it/s][A

epoch: 0, step: 820, loss: 2.61269



                                                  
  0%|          | 0/1 [02:53<?, ?it/s]5,  3.99it/s][A
 22%|██▏       | 617/2754 [02:53<08:37,  4.13it/s][A

epoch: 0, step: 821, loss: 1.91422



                                                  
  0%|          | 0/1 [02:53<?, ?it/s]7,  4.13it/s][A
 22%|██▏       | 618/2754 [02:53<08:58,  3.96it/s][A

epoch: 0, step: 822, loss: 3.96184



                                                  
  0%|          | 0/1 [02:54<?, ?it/s]8,  3.96it/s][A
 22%|██▏       | 619/2754 [02:54<09:00,  3.95it/s][A

epoch: 0, step: 823, loss: 2.99876



                                                  
  0%|          | 0/1 [02:54<?, ?it/s]0,  3.95it/s][A
 23%|██▎       | 620/2754 [02:54<08:45,  4.06it/s][A

epoch: 0, step: 824, loss: 5.14599



                                                  
  0%|          | 0/1 [02:54<?, ?it/s]5,  4.06it/s][A
 23%|██▎       | 621/2754 [02:54<08:53,  4.00it/s][A

epoch: 0, step: 825, loss: 3.73222



                                                  
  0%|          | 0/1 [02:54<?, ?it/s]3,  4.00it/s][A
 23%|██▎       | 622/2754 [02:54<08:41,  4.09it/s][A

epoch: 0, step: 826, loss: 2.80507



                                                  
  0%|          | 0/1 [02:55<?, ?it/s]1,  4.09it/s][A
 23%|██▎       | 623/2754 [02:55<08:30,  4.18it/s][A

epoch: 0, step: 827, loss: 4.56621



                                                  
  0%|          | 0/1 [02:55<?, ?it/s]0,  4.18it/s][A
 23%|██▎       | 624/2754 [02:55<08:26,  4.20it/s][A

epoch: 0, step: 828, loss: 3.46173



                                                  
  0%|          | 0/1 [02:55<?, ?it/s]6,  4.20it/s][A
 23%|██▎       | 625/2754 [02:55<08:29,  4.18it/s][A

epoch: 0, step: 829, loss: 4.86788



                                                  
  0%|          | 0/1 [02:55<?, ?it/s]9,  4.18it/s][A
 23%|██▎       | 626/2754 [02:55<08:21,  4.25it/s][A

epoch: 0, step: 830, loss: 2.1984



                                                  
  0%|          | 0/1 [02:56<?, ?it/s]1,  4.25it/s][A
 23%|██▎       | 627/2754 [02:56<08:39,  4.09it/s][A

epoch: 0, step: 831, loss: 2.56658



                                                  
  0%|          | 0/1 [02:56<?, ?it/s]9,  4.09it/s][A
 23%|██▎       | 628/2754 [02:56<09:08,  3.88it/s][A

epoch: 0, step: 832, loss: 3.04381



                                                  
  0%|          | 0/1 [02:56<?, ?it/s]8,  3.88it/s][A
 23%|██▎       | 629/2754 [02:56<09:19,  3.80it/s][A

epoch: 0, step: 833, loss: 3.39317



                                                  
  0%|          | 0/1 [02:56<?, ?it/s]9,  3.80it/s][A
 23%|██▎       | 630/2754 [02:56<09:01,  3.92it/s][A

epoch: 0, step: 834, loss: 2.78006



                                                  
  0%|          | 0/1 [02:57<?, ?it/s]1,  3.92it/s][A
 23%|██▎       | 631/2754 [02:57<09:05,  3.89it/s][A

epoch: 0, step: 835, loss: 3.70413



                                                  
  0%|          | 0/1 [02:57<?, ?it/s]5,  3.89it/s][A
 23%|██▎       | 632/2754 [02:57<09:02,  3.91it/s][A

epoch: 0, step: 836, loss: 2.45856



                                                  
  0%|          | 0/1 [02:57<?, ?it/s]2,  3.91it/s][A
 23%|██▎       | 633/2754 [02:57<08:55,  3.96it/s][A

epoch: 0, step: 837, loss: 2.47413



                                                  
  0%|          | 0/1 [02:57<?, ?it/s]5,  3.96it/s][A
 23%|██▎       | 634/2754 [02:57<08:50,  3.99it/s][A

epoch: 0, step: 838, loss: 3.7268



                                                  
  0%|          | 0/1 [02:58<?, ?it/s]0,  3.99it/s][A
 23%|██▎       | 635/2754 [02:58<09:25,  3.75it/s][A

epoch: 0, step: 839, loss: 3.35016



                                                  
  0%|          | 0/1 [02:58<?, ?it/s]5,  3.75it/s][A
 23%|██▎       | 636/2754 [02:58<09:08,  3.86it/s][A

epoch: 0, step: 840, loss: 3.66964



                                                  
  0%|          | 0/1 [02:58<?, ?it/s]8,  3.86it/s][A
 23%|██▎       | 637/2754 [02:58<09:11,  3.84it/s][A

epoch: 0, step: 841, loss: 0.651018



                                                  
  0%|          | 0/1 [02:58<?, ?it/s]1,  3.84it/s][A
 23%|██▎       | 638/2754 [02:58<09:04,  3.89it/s][A

epoch: 0, step: 842, loss: 3.79439



                                                  
  0%|          | 0/1 [02:59<?, ?it/s]4,  3.89it/s][A
 23%|██▎       | 639/2754 [02:59<09:02,  3.90it/s][A

epoch: 0, step: 843, loss: 1.36973



                                                  
  0%|          | 0/1 [02:59<?, ?it/s]2,  3.90it/s][A
 23%|██▎       | 640/2754 [02:59<08:53,  3.96it/s][A

epoch: 0, step: 844, loss: 3.46651



                                                  
  0%|          | 0/1 [02:59<?, ?it/s]3,  3.96it/s][A
 23%|██▎       | 641/2754 [02:59<08:43,  4.04it/s][A

epoch: 0, step: 845, loss: 5.22301



                                                  
  0%|          | 0/1 [02:59<?, ?it/s]3,  4.04it/s][A
 23%|██▎       | 642/2754 [02:59<08:38,  4.07it/s][A

epoch: 0, step: 846, loss: 5.23563



                                                  
  0%|          | 0/1 [03:00<?, ?it/s]8,  4.07it/s][A
 23%|██▎       | 643/2754 [03:00<08:34,  4.10it/s][A

epoch: 0, step: 847, loss: 3.90329



                                                  
  0%|          | 0/1 [03:00<?, ?it/s]4,  4.10it/s][A
 23%|██▎       | 644/2754 [03:00<08:29,  4.14it/s][A

epoch: 0, step: 848, loss: 4.58864



                                                  
  0%|          | 0/1 [03:00<?, ?it/s]9,  4.14it/s][A
 23%|██▎       | 645/2754 [03:00<08:27,  4.16it/s][A

epoch: 0, step: 849, loss: 4.17612



                                                  
  0%|          | 0/1 [03:00<?, ?it/s]7,  4.16it/s][A
 23%|██▎       | 646/2754 [03:00<08:14,  4.26it/s][A

epoch: 0, step: 850, loss: 5.55103



                                                  
  0%|          | 0/1 [03:01<?, ?it/s]4,  4.26it/s][A
 23%|██▎       | 647/2754 [03:01<08:20,  4.21it/s][A

epoch: 0, step: 851, loss: 2.80057



                                                  
  0%|          | 0/1 [03:01<?, ?it/s]0,  4.21it/s][A
 24%|██▎       | 648/2754 [03:01<08:18,  4.23it/s][A

epoch: 0, step: 852, loss: 4.07701



                                                  
  0%|          | 0/1 [03:01<?, ?it/s]8,  4.23it/s][A
 24%|██▎       | 649/2754 [03:01<08:11,  4.29it/s][A

epoch: 0, step: 853, loss: 2.74368



                                                  
  0%|          | 0/1 [03:01<?, ?it/s]1,  4.29it/s][A
 24%|██▎       | 650/2754 [03:01<08:10,  4.29it/s][A

epoch: 0, step: 854, loss: 3.83556



                                                  
  0%|          | 0/1 [03:01<?, ?it/s]0,  4.29it/s][A
 24%|██▎       | 651/2754 [03:01<08:11,  4.28it/s][A

epoch: 0, step: 855, loss: 4.12421



                                                  
  0%|          | 0/1 [03:02<?, ?it/s]1,  4.28it/s][A
 24%|██▎       | 652/2754 [03:02<08:12,  4.27it/s][A

epoch: 0, step: 856, loss: 1.6859



                                                  
  0%|          | 0/1 [03:02<?, ?it/s]2,  4.27it/s][A
 24%|██▎       | 653/2754 [03:02<08:15,  4.24it/s][A

epoch: 0, step: 857, loss: 1.39192



                                                  
  0%|          | 0/1 [03:02<?, ?it/s]5,  4.24it/s][A
 24%|██▎       | 654/2754 [03:02<08:11,  4.27it/s][A

epoch: 0, step: 858, loss: 3.76473



                                                  
  0%|          | 0/1 [03:02<?, ?it/s]1,  4.27it/s][A
 24%|██▍       | 655/2754 [03:02<08:46,  3.98it/s][A

epoch: 0, step: 859, loss: 3.75692



                                                  
  0%|          | 0/1 [03:03<?, ?it/s]6,  3.98it/s][A
 24%|██▍       | 656/2754 [03:03<09:06,  3.84it/s][A

epoch: 0, step: 860, loss: 3.44579



                                                  
  0%|          | 0/1 [03:03<?, ?it/s]6,  3.84it/s][A
 24%|██▍       | 657/2754 [03:03<09:18,  3.76it/s][A

epoch: 0, step: 861, loss: 3.34575



                                                  
  0%|          | 0/1 [03:03<?, ?it/s]8,  3.76it/s][A
 24%|██▍       | 658/2754 [03:03<08:53,  3.93it/s][A

epoch: 0, step: 862, loss: 3.06489



                                                  
  0%|          | 0/1 [03:03<?, ?it/s]3,  3.93it/s][A
 24%|██▍       | 659/2754 [03:03<08:40,  4.03it/s][A

epoch: 0, step: 863, loss: 1.76736



                                                  
  0%|          | 0/1 [03:04<?, ?it/s]0,  4.03it/s][A
 24%|██▍       | 660/2754 [03:04<08:56,  3.91it/s][A

epoch: 0, step: 864, loss: 3.34289



                                                  
  0%|          | 0/1 [03:04<?, ?it/s]6,  3.91it/s][A
 24%|██▍       | 661/2754 [03:04<08:43,  4.00it/s][A

epoch: 0, step: 865, loss: 2.62877



                                                  
  0%|          | 0/1 [03:04<?, ?it/s]3,  4.00it/s][A
 24%|██▍       | 662/2754 [03:04<08:31,  4.09it/s][A

epoch: 0, step: 866, loss: 3.72356



                                                  
  0%|          | 0/1 [03:04<?, ?it/s]1,  4.09it/s][A
 24%|██▍       | 663/2754 [03:04<08:29,  4.11it/s][A

epoch: 0, step: 867, loss: 1.30641



                                                  
  0%|          | 0/1 [03:05<?, ?it/s]9,  4.11it/s][A
 24%|██▍       | 664/2754 [03:05<08:55,  3.90it/s][A

epoch: 0, step: 868, loss: 4.17637



                                                  
  0%|          | 0/1 [03:05<?, ?it/s]5,  3.90it/s][A
 24%|██▍       | 665/2754 [03:05<08:39,  4.02it/s][A

epoch: 0, step: 869, loss: 3.20079



                                                  
  0%|          | 0/1 [03:05<?, ?it/s]9,  4.02it/s][A
 24%|██▍       | 666/2754 [03:05<09:56,  3.50it/s][A

epoch: 0, step: 870, loss: 2.79667



                                                  
  0%|          | 0/1 [03:06<?, ?it/s]6,  3.50it/s][A
 24%|██▍       | 667/2754 [03:06<09:21,  3.72it/s][A

epoch: 0, step: 871, loss: 2.99904



                                                  
  0%|          | 0/1 [03:06<?, ?it/s]1,  3.72it/s][A
 24%|██▍       | 668/2754 [03:06<09:37,  3.61it/s][A

epoch: 0, step: 872, loss: 2.90592



                                                  
  0%|          | 0/1 [03:06<?, ?it/s]7,  3.61it/s][A
 24%|██▍       | 669/2754 [03:06<09:09,  3.80it/s][A

epoch: 0, step: 873, loss: 3.62972



                                                  
  0%|          | 0/1 [03:06<?, ?it/s]9,  3.80it/s][A
 24%|██▍       | 670/2754 [03:06<08:49,  3.93it/s][A

epoch: 0, step: 874, loss: 2.8969



                                                  
  0%|          | 0/1 [03:07<?, ?it/s]9,  3.93it/s][A
 24%|██▍       | 671/2754 [03:07<08:39,  4.01it/s][A

epoch: 0, step: 875, loss: 0.60858



                                                  
  0%|          | 0/1 [03:07<?, ?it/s]9,  4.01it/s][A
 24%|██▍       | 672/2754 [03:07<08:34,  4.05it/s][A

epoch: 0, step: 876, loss: 3.66008



                                                  
  0%|          | 0/1 [03:07<?, ?it/s]4,  4.05it/s][A
 24%|██▍       | 673/2754 [03:07<08:26,  4.11it/s][A

epoch: 0, step: 877, loss: 5.00867



                                                  
  0%|          | 0/1 [03:07<?, ?it/s]6,  4.11it/s][A
 24%|██▍       | 674/2754 [03:07<08:23,  4.13it/s][A

epoch: 0, step: 878, loss: 1.20858



                                                  
  0%|          | 0/1 [03:08<?, ?it/s]3,  4.13it/s][A
 25%|██▍       | 675/2754 [03:08<08:45,  3.95it/s][A

epoch: 0, step: 879, loss: 4.01285



                                                  
  0%|          | 0/1 [03:08<?, ?it/s]5,  3.95it/s][A
 25%|██▍       | 676/2754 [03:08<08:44,  3.96it/s][A

epoch: 0, step: 880, loss: 5.33803



                                                  
  0%|          | 0/1 [03:08<?, ?it/s]4,  3.96it/s][A
 25%|██▍       | 677/2754 [03:08<08:52,  3.90it/s][A

epoch: 0, step: 881, loss: 3.60928



                                                  
  0%|          | 0/1 [03:08<?, ?it/s]2,  3.90it/s][A
 25%|██▍       | 678/2754 [03:08<10:13,  3.39it/s][A

epoch: 0, step: 882, loss: 3.31934



                                                  
  0%|          | 0/1 [03:09<?, ?it/s]3,  3.39it/s][A
 25%|██▍       | 679/2754 [03:09<09:34,  3.61it/s][A

epoch: 0, step: 883, loss: 3.17987



                                                  
  0%|          | 0/1 [03:09<?, ?it/s]4,  3.61it/s][A
 25%|██▍       | 680/2754 [03:09<09:47,  3.53it/s][A

epoch: 0, step: 884, loss: 1.97974



                                                  
  0%|          | 0/1 [03:09<?, ?it/s]7,  3.53it/s][A
 25%|██▍       | 681/2754 [03:09<09:38,  3.59it/s][A

epoch: 0, step: 885, loss: 1.81954



                                                  
  0%|          | 0/1 [03:10<?, ?it/s]8,  3.59it/s][A
 25%|██▍       | 682/2754 [03:10<09:33,  3.61it/s][A

epoch: 0, step: 886, loss: 3.19664



                                                  
  0%|          | 0/1 [03:10<?, ?it/s]3,  3.61it/s][A
 25%|██▍       | 683/2754 [03:10<09:03,  3.81it/s][A

epoch: 0, step: 887, loss: 4.34264



                                                  
  0%|          | 0/1 [03:10<?, ?it/s]3,  3.81it/s][A
 25%|██▍       | 684/2754 [03:10<13:16,  2.60it/s][A

epoch: 0, step: 888, loss: 3.79537



                                                  
  0%|          | 0/1 [03:12<?, ?it/s]6,  2.60it/s][A
 25%|██▍       | 685/2754 [03:12<20:09,  1.71it/s][A

epoch: 0, step: 889, loss: 3.69378



                                                  
  0%|          | 0/1 [03:12<?, ?it/s]9,  1.71it/s][A
 25%|██▍       | 686/2754 [03:12<19:51,  1.74it/s][A

epoch: 0, step: 890, loss: 2.98829



                                                  
  0%|          | 0/1 [03:12<?, ?it/s]1,  1.74it/s][A
 25%|██▍       | 687/2754 [03:12<16:41,  2.06it/s][A

epoch: 0, step: 891, loss: 4.36758



                                                  
  0%|          | 0/1 [03:13<?, ?it/s]1,  2.06it/s][A
 25%|██▍       | 688/2754 [03:13<14:15,  2.42it/s][A

epoch: 0, step: 892, loss: 4.12559



                                                  
  0%|          | 0/1 [03:13<?, ?it/s]5,  2.42it/s][A
 25%|██▌       | 689/2754 [03:13<13:39,  2.52it/s][A

epoch: 0, step: 893, loss: 2.57503



                                                  
  0%|          | 0/1 [03:13<?, ?it/s]9,  2.52it/s][A
 25%|██▌       | 690/2754 [03:13<12:04,  2.85it/s][A

epoch: 0, step: 894, loss: 2.76636



                                                  
  0%|          | 0/1 [03:13<?, ?it/s]4,  2.85it/s][A
 25%|██▌       | 691/2754 [03:13<11:01,  3.12it/s][A

epoch: 0, step: 895, loss: 6.30764



                                                  
  0%|          | 0/1 [03:14<?, ?it/s]1,  3.12it/s][A
 25%|██▌       | 692/2754 [03:14<10:14,  3.35it/s][A

epoch: 0, step: 896, loss: 4.5876



                                                  
  0%|          | 0/1 [03:14<?, ?it/s]4,  3.35it/s][A
 25%|██▌       | 693/2754 [03:14<10:54,  3.15it/s][A

epoch: 0, step: 897, loss: 3.78429



                                                  
  0%|          | 0/1 [03:14<?, ?it/s]4,  3.15it/s][A
 25%|██▌       | 694/2754 [03:14<10:19,  3.32it/s][A

epoch: 0, step: 898, loss: 3.17489



                                                  
  0%|          | 0/1 [03:15<?, ?it/s]9,  3.32it/s][A
 25%|██▌       | 695/2754 [03:15<09:42,  3.54it/s][A

epoch: 0, step: 899, loss: 2.73069



                                                  
  0%|          | 0/1 [03:15<?, ?it/s]2,  3.54it/s][A
 25%|██▌       | 696/2754 [03:15<09:16,  3.70it/s][A

epoch: 0, step: 900, loss: 3.92199



                                                  
  0%|          | 0/1 [03:15<?, ?it/s]6,  3.70it/s][A
 25%|██▌       | 697/2754 [03:15<09:15,  3.70it/s][A

epoch: 0, step: 901, loss: 4.01892



                                                  
  0%|          | 0/1 [03:15<?, ?it/s]5,  3.70it/s][A
 25%|██▌       | 698/2754 [03:15<08:49,  3.88it/s][A

epoch: 0, step: 902, loss: 2.68383



                                                  
  0%|          | 0/1 [03:16<?, ?it/s]9,  3.88it/s][A
 25%|██▌       | 699/2754 [03:16<08:40,  3.94it/s][A

epoch: 0, step: 903, loss: 4.31086



                                                  
  0%|          | 0/1 [03:16<?, ?it/s]0,  3.94it/s][A
 25%|██▌       | 700/2754 [03:16<08:24,  4.07it/s][A

epoch: 0, step: 904, loss: 3.81222



                                                  
  0%|          | 0/1 [03:16<?, ?it/s]4,  4.07it/s][A
 25%|██▌       | 701/2754 [03:16<08:49,  3.88it/s][A

epoch: 0, step: 905, loss: 2.69943



                                                  
  0%|          | 0/1 [03:16<?, ?it/s]9,  3.88it/s][A
 25%|██▌       | 702/2754 [03:16<08:54,  3.84it/s][A

epoch: 0, step: 906, loss: 2.77667



                                                  
  0%|          | 0/1 [03:17<?, ?it/s]4,  3.84it/s][A
 26%|██▌       | 703/2754 [03:17<08:42,  3.93it/s][A

epoch: 0, step: 907, loss: 4.59099



                                                  
  0%|          | 0/1 [03:17<?, ?it/s]2,  3.93it/s][A
 26%|██▌       | 704/2754 [03:17<09:04,  3.76it/s][A

epoch: 0, step: 908, loss: 3.03022



                                                  
  0%|          | 0/1 [03:17<?, ?it/s]4,  3.76it/s][A
 26%|██▌       | 705/2754 [03:17<08:47,  3.88it/s][A

epoch: 0, step: 909, loss: 3.55908



                                                  
  0%|          | 0/1 [03:17<?, ?it/s]7,  3.88it/s][A
 26%|██▌       | 706/2754 [03:17<08:47,  3.88it/s][A

epoch: 0, step: 910, loss: 2.27929



                                                  
  0%|          | 0/1 [03:18<?, ?it/s]7,  3.88it/s][A
 26%|██▌       | 707/2754 [03:18<13:05,  2.60it/s][A

epoch: 0, step: 911, loss: 3.57139



                                                  
  0%|          | 0/1 [03:18<?, ?it/s]5,  2.60it/s][A
 26%|██▌       | 708/2754 [03:18<12:04,  2.82it/s][A

epoch: 0, step: 912, loss: 3.49912



                                                  
  0%|          | 0/1 [03:19<?, ?it/s]4,  2.82it/s][A
 26%|██▌       | 709/2754 [03:19<10:42,  3.18it/s][A

epoch: 0, step: 913, loss: 3.62191



                                                  
  0%|          | 0/1 [03:19<?, ?it/s]2,  3.18it/s][A
 26%|██▌       | 710/2754 [03:19<10:08,  3.36it/s][A

epoch: 0, step: 914, loss: 3.82755



                                                  
  0%|          | 0/1 [03:19<?, ?it/s]8,  3.36it/s][A
 26%|██▌       | 711/2754 [03:19<09:43,  3.50it/s][A

epoch: 0, step: 915, loss: 3.93783



                                                  
  0%|          | 0/1 [03:19<?, ?it/s]3,  3.50it/s][A
 26%|██▌       | 712/2754 [03:19<09:32,  3.57it/s][A

epoch: 0, step: 916, loss: 3.39031



                                                  
  0%|          | 0/1 [03:20<?, ?it/s]2,  3.57it/s][A
 26%|██▌       | 713/2754 [03:20<09:22,  3.63it/s][A

epoch: 0, step: 917, loss: 3.13602



                                                  
  0%|          | 0/1 [03:20<?, ?it/s]2,  3.63it/s][A
 26%|██▌       | 714/2754 [03:20<09:15,  3.67it/s][A

epoch: 0, step: 918, loss: 3.35471



                                                  
  0%|          | 0/1 [03:20<?, ?it/s]5,  3.67it/s][A
 26%|██▌       | 715/2754 [03:20<08:46,  3.87it/s][A

epoch: 0, step: 919, loss: 2.10521



                                                  
  0%|          | 0/1 [03:20<?, ?it/s]6,  3.87it/s][A
 26%|██▌       | 716/2754 [03:20<09:01,  3.76it/s][A

epoch: 0, step: 920, loss: 3.92563



                                                  
  0%|          | 0/1 [03:21<?, ?it/s]1,  3.76it/s][A
 26%|██▌       | 717/2754 [03:21<09:29,  3.58it/s][A

epoch: 0, step: 921, loss: 3.09916



                                                  
  0%|          | 0/1 [03:21<?, ?it/s]9,  3.58it/s][A
 26%|██▌       | 718/2754 [03:21<09:10,  3.70it/s][A

epoch: 0, step: 922, loss: 3.03716



                                                  
  0%|          | 0/1 [03:21<?, ?it/s]0,  3.70it/s][A
 26%|██▌       | 719/2754 [03:21<08:55,  3.80it/s][A

epoch: 0, step: 923, loss: 4.41782



                                                  
  0%|          | 0/1 [03:21<?, ?it/s]5,  3.80it/s][A
 26%|██▌       | 720/2754 [03:21<08:41,  3.90it/s][A

epoch: 0, step: 924, loss: 2.88882



                                                  
  0%|          | 0/1 [03:22<?, ?it/s]1,  3.90it/s][A
 26%|██▌       | 721/2754 [03:22<08:24,  4.03it/s][A

epoch: 0, step: 925, loss: 3.29239



                                                  
  0%|          | 0/1 [03:22<?, ?it/s]4,  4.03it/s][A
 26%|██▌       | 722/2754 [03:22<08:16,  4.09it/s][A

epoch: 0, step: 926, loss: 2.42864



                                                  
  0%|          | 0/1 [03:22<?, ?it/s]6,  4.09it/s][A
 26%|██▋       | 723/2754 [03:22<08:11,  4.13it/s][A

epoch: 0, step: 927, loss: 3.74244



                                                  
  0%|          | 0/1 [03:22<?, ?it/s]1,  4.13it/s][A
 26%|██▋       | 724/2754 [03:22<08:11,  4.13it/s][A

epoch: 0, step: 928, loss: 3.72025



                                                  
  0%|          | 0/1 [03:23<?, ?it/s]1,  4.13it/s][A
 26%|██▋       | 725/2754 [03:23<08:27,  4.00it/s][A

epoch: 0, step: 929, loss: 1.86321



                                                  
  0%|          | 0/1 [03:23<?, ?it/s]7,  4.00it/s][A
 26%|██▋       | 726/2754 [03:23<08:53,  3.80it/s][A

epoch: 0, step: 930, loss: 3.87872



                                                  
  0%|          | 0/1 [03:23<?, ?it/s]3,  3.80it/s][A
 26%|██▋       | 727/2754 [03:23<08:45,  3.86it/s][A

epoch: 0, step: 931, loss: 3.62278



                                                  
  0%|          | 0/1 [03:23<?, ?it/s]5,  3.86it/s][A
 26%|██▋       | 728/2754 [03:23<08:30,  3.97it/s][A

epoch: 0, step: 932, loss: 3.35823



                                                  
  0%|          | 0/1 [03:24<?, ?it/s]0,  3.97it/s][A
 26%|██▋       | 729/2754 [03:24<09:31,  3.55it/s][A

epoch: 0, step: 933, loss: 2.96461



                                                  
  0%|          | 0/1 [03:24<?, ?it/s]1,  3.55it/s][A
 27%|██▋       | 730/2754 [03:24<08:56,  3.77it/s][A

epoch: 0, step: 934, loss: 3.57993



                                                  
  0%|          | 0/1 [03:24<?, ?it/s]6,  3.77it/s][A
 27%|██▋       | 731/2754 [03:24<11:04,  3.04it/s][A

epoch: 0, step: 935, loss: 2.8791



                                                  
  0%|          | 0/1 [03:25<?, ?it/s]4,  3.04it/s][A
 27%|██▋       | 732/2754 [03:25<10:12,  3.30it/s][A

epoch: 0, step: 936, loss: 2.96825



                                                  
  0%|          | 0/1 [03:25<?, ?it/s]2,  3.30it/s][A
 27%|██▋       | 733/2754 [03:25<09:36,  3.50it/s][A

epoch: 0, step: 937, loss: 2.20131



                                                  
  0%|          | 0/1 [03:25<?, ?it/s]6,  3.50it/s][A
 27%|██▋       | 734/2754 [03:25<09:21,  3.60it/s][A

epoch: 0, step: 938, loss: 4.16726



                                                  
  0%|          | 0/1 [03:25<?, ?it/s]1,  3.60it/s][A
 27%|██▋       | 735/2754 [03:25<08:54,  3.78it/s][A

epoch: 0, step: 939, loss: 3.1127



                                                  
  0%|          | 0/1 [03:26<?, ?it/s]4,  3.78it/s][A
 27%|██▋       | 736/2754 [03:26<09:15,  3.63it/s][A

epoch: 0, step: 940, loss: 2.65507



                                                  
  0%|          | 0/1 [03:26<?, ?it/s]5,  3.63it/s][A
 27%|██▋       | 737/2754 [03:26<09:11,  3.66it/s][A

epoch: 0, step: 941, loss: 2.82136



                                                  
  0%|          | 0/1 [03:26<?, ?it/s]1,  3.66it/s][A
 27%|██▋       | 738/2754 [03:26<08:52,  3.79it/s][A

epoch: 0, step: 942, loss: 4.9543



                                                  
  0%|          | 0/1 [03:26<?, ?it/s]2,  3.79it/s][A
 27%|██▋       | 739/2754 [03:26<08:41,  3.86it/s][A

epoch: 0, step: 943, loss: 4.02588



                                                  
  0%|          | 0/1 [03:27<?, ?it/s]1,  3.86it/s][A
 27%|██▋       | 740/2754 [03:27<09:02,  3.71it/s][A

epoch: 0, step: 944, loss: 0.949936



                                                  
  0%|          | 0/1 [03:27<?, ?it/s]2,  3.71it/s][A
 27%|██▋       | 741/2754 [03:27<09:13,  3.64it/s][A

epoch: 0, step: 945, loss: 3.07098



                                                  
  0%|          | 0/1 [03:27<?, ?it/s]3,  3.64it/s][A
 27%|██▋       | 742/2754 [03:27<09:06,  3.68it/s][A

epoch: 0, step: 946, loss: 4.52563



                                                  
  0%|          | 0/1 [03:28<?, ?it/s]6,  3.68it/s][A
 27%|██▋       | 743/2754 [03:28<08:50,  3.79it/s][A

epoch: 0, step: 947, loss: 4.21883



                                                  
  0%|          | 0/1 [03:28<?, ?it/s]0,  3.79it/s][A
 27%|██▋       | 744/2754 [03:28<10:01,  3.34it/s][A

epoch: 0, step: 948, loss: 2.71964



                                                  
  0%|          | 0/1 [03:28<?, ?it/s]1,  3.34it/s][A
 27%|██▋       | 745/2754 [03:28<11:16,  2.97it/s][A

epoch: 0, step: 949, loss: 3.21514



                                                  
  0%|          | 0/1 [03:29<?, ?it/s]6,  2.97it/s][A
 27%|██▋       | 746/2754 [03:29<10:27,  3.20it/s][A

epoch: 0, step: 950, loss: 2.87673



                                                  
  0%|          | 0/1 [03:29<?, ?it/s]7,  3.20it/s][A
 27%|██▋       | 747/2754 [03:29<09:44,  3.43it/s][A

epoch: 0, step: 951, loss: 3.76012



                                                  
  0%|          | 0/1 [03:29<?, ?it/s]4,  3.43it/s][A
 27%|██▋       | 748/2754 [03:29<09:12,  3.63it/s][A

epoch: 0, step: 952, loss: 2.78699



                                                  
  0%|          | 0/1 [03:29<?, ?it/s]2,  3.63it/s][A
 27%|██▋       | 749/2754 [03:29<09:07,  3.66it/s][A

epoch: 0, step: 953, loss: 2.93658



                                                  
  0%|          | 0/1 [03:30<?, ?it/s]7,  3.66it/s][A
 27%|██▋       | 750/2754 [03:30<08:50,  3.77it/s][A

epoch: 0, step: 954, loss: 4.78479



                                                  
  0%|          | 0/1 [03:30<?, ?it/s]0,  3.77it/s][A
 27%|██▋       | 751/2754 [03:30<08:40,  3.85it/s][A

epoch: 0, step: 955, loss: 3.90196



                                                  
  0%|          | 0/1 [03:30<?, ?it/s]0,  3.85it/s][A
 27%|██▋       | 752/2754 [03:30<08:30,  3.92it/s][A

epoch: 0, step: 956, loss: 5.17603



                                                  
  0%|          | 0/1 [03:30<?, ?it/s]0,  3.92it/s][A
 27%|██▋       | 753/2754 [03:30<08:19,  4.00it/s][A

epoch: 0, step: 957, loss: 2.79054



                                                  
  0%|          | 0/1 [03:31<?, ?it/s]9,  4.00it/s][A
 27%|██▋       | 754/2754 [03:31<08:07,  4.10it/s][A

epoch: 0, step: 958, loss: 4.38681



                                                  
  0%|          | 0/1 [03:31<?, ?it/s]7,  4.10it/s][A
 27%|██▋       | 755/2754 [03:31<08:42,  3.83it/s][A

epoch: 0, step: 959, loss: 3.2782



                                                  
  0%|          | 0/1 [03:31<?, ?it/s]2,  3.83it/s][A
 27%|██▋       | 756/2754 [03:31<08:51,  3.76it/s][A

epoch: 0, step: 960, loss: 3.30756



                                                  
  0%|          | 0/1 [03:31<?, ?it/s]1,  3.76it/s][A
 27%|██▋       | 757/2754 [03:31<08:37,  3.86it/s][A

epoch: 0, step: 961, loss: 3.71181



                                                  
  0%|          | 0/1 [03:32<?, ?it/s]7,  3.86it/s][A
 28%|██▊       | 758/2754 [03:32<08:23,  3.96it/s][A

epoch: 0, step: 962, loss: 1.37957



                                                  
  0%|          | 0/1 [03:32<?, ?it/s]3,  3.96it/s][A
 28%|██▊       | 759/2754 [03:32<08:37,  3.85it/s][A

epoch: 0, step: 963, loss: 2.87928



                                                  
  0%|          | 0/1 [03:32<?, ?it/s]7,  3.85it/s][A
 28%|██▊       | 760/2754 [03:32<09:48,  3.39it/s][A

epoch: 0, step: 964, loss: 3.67084



                                                  
  0%|          | 0/1 [03:33<?, ?it/s]8,  3.39it/s][A
 28%|██▊       | 761/2754 [03:33<09:23,  3.54it/s][A

epoch: 0, step: 965, loss: 2.98454



                                                  
  0%|          | 0/1 [03:33<?, ?it/s]3,  3.54it/s][A
 28%|██▊       | 762/2754 [03:33<09:06,  3.65it/s][A

epoch: 0, step: 966, loss: 3.97047



                                                  
  0%|          | 0/1 [03:33<?, ?it/s]6,  3.65it/s][A
 28%|██▊       | 763/2754 [03:33<09:02,  3.67it/s][A

epoch: 0, step: 967, loss: 2.6462



                                                  
  0%|          | 0/1 [03:33<?, ?it/s]2,  3.67it/s][A
 28%|██▊       | 764/2754 [03:33<09:02,  3.67it/s][A

epoch: 0, step: 968, loss: 3.31596



                                                  
  0%|          | 0/1 [03:34<?, ?it/s]2,  3.67it/s][A
 28%|██▊       | 765/2754 [03:34<10:14,  3.24it/s][A

epoch: 0, step: 969, loss: 3.75537



                                                  
  0%|          | 0/1 [03:34<?, ?it/s]4,  3.24it/s][A
 28%|██▊       | 766/2754 [03:34<09:55,  3.34it/s][A

epoch: 0, step: 970, loss: 3.14941



                                                  
  0%|          | 0/1 [03:34<?, ?it/s]5,  3.34it/s][A
 28%|██▊       | 767/2754 [03:34<09:46,  3.39it/s][A

epoch: 0, step: 971, loss: 3.7548



                                                  
  0%|          | 0/1 [03:35<?, ?it/s]6,  3.39it/s][A
 28%|██▊       | 768/2754 [03:35<09:09,  3.61it/s][A

epoch: 0, step: 972, loss: 2.44397



                                                  
  0%|          | 0/1 [03:35<?, ?it/s]9,  3.61it/s][A
 28%|██▊       | 769/2754 [03:35<09:23,  3.52it/s][A

epoch: 0, step: 973, loss: 3.63816



                                                  
  0%|          | 0/1 [03:35<?, ?it/s]3,  3.52it/s][A
 28%|██▊       | 770/2754 [03:35<09:05,  3.64it/s][A

epoch: 0, step: 974, loss: 3.49755



                                                  
  0%|          | 0/1 [03:35<?, ?it/s]5,  3.64it/s][A
 28%|██▊       | 771/2754 [03:35<08:44,  3.78it/s][A

epoch: 0, step: 975, loss: 3.82188



                                                  
  0%|          | 0/1 [03:36<?, ?it/s]4,  3.78it/s][A
 28%|██▊       | 772/2754 [03:36<08:31,  3.88it/s][A

epoch: 0, step: 976, loss: 3.45634



                                                  
  0%|          | 0/1 [03:36<?, ?it/s]1,  3.88it/s][A
 28%|██▊       | 773/2754 [03:36<08:19,  3.96it/s][A

epoch: 0, step: 977, loss: 2.16427



                                                  
  0%|          | 0/1 [03:36<?, ?it/s]9,  3.96it/s][A
 28%|██▊       | 774/2754 [03:36<07:56,  4.15it/s][A

epoch: 0, step: 978, loss: 6.42143



                                                  
  0%|          | 0/1 [03:36<?, ?it/s]6,  4.15it/s][A
 28%|██▊       | 775/2754 [03:36<07:40,  4.30it/s][A

epoch: 0, step: 979, loss: 5.50633



                                                  
  0%|          | 0/1 [03:36<?, ?it/s]0,  4.30it/s][A
 28%|██▊       | 776/2754 [03:36<07:37,  4.32it/s][A

epoch: 0, step: 980, loss: 1.36789



                                                  
  0%|          | 0/1 [03:37<?, ?it/s]7,  4.32it/s][A
 28%|██▊       | 777/2754 [03:37<07:39,  4.30it/s][A

epoch: 0, step: 981, loss: 4.4948



                                                  
  0%|          | 0/1 [03:37<?, ?it/s]9,  4.30it/s][A
 28%|██▊       | 778/2754 [03:37<07:35,  4.34it/s][A

epoch: 0, step: 982, loss: 4.2079



                                                  
  0%|          | 0/1 [03:37<?, ?it/s]5,  4.34it/s][A
 28%|██▊       | 779/2754 [03:37<07:35,  4.34it/s][A

epoch: 0, step: 983, loss: 2.7806



                                                  
  0%|          | 0/1 [03:37<?, ?it/s]5,  4.34it/s][A
 28%|██▊       | 780/2754 [03:37<07:42,  4.26it/s][A

epoch: 0, step: 984, loss: 2.56639



                                                  
  0%|          | 0/1 [03:38<?, ?it/s]2,  4.26it/s][A
 28%|██▊       | 781/2754 [03:38<07:33,  4.35it/s][A

epoch: 0, step: 985, loss: 4.23566



                                                  
  0%|          | 0/1 [03:38<?, ?it/s]3,  4.35it/s][A
 28%|██▊       | 782/2754 [03:38<09:12,  3.57it/s][A

epoch: 0, step: 986, loss: 3.80734



                                                  
  0%|          | 0/1 [03:38<?, ?it/s]2,  3.57it/s][A
 28%|██▊       | 783/2754 [03:38<08:34,  3.83it/s][A

epoch: 0, step: 987, loss: 4.77928



                                                  
  0%|          | 0/1 [03:38<?, ?it/s]4,  3.83it/s][A
 28%|██▊       | 784/2754 [03:38<08:12,  4.00it/s][A

epoch: 0, step: 988, loss: 2.83028



                                                  
  0%|          | 0/1 [03:39<?, ?it/s]2,  4.00it/s][A
 29%|██▊       | 785/2754 [03:39<07:55,  4.14it/s][A

epoch: 0, step: 989, loss: 4.4774



                                                  
  0%|          | 0/1 [03:39<?, ?it/s]5,  4.14it/s][A
 29%|██▊       | 786/2754 [03:39<07:47,  4.21it/s][A

epoch: 0, step: 990, loss: 3.91552



                                                  
  0%|          | 0/1 [03:40<?, ?it/s]7,  4.21it/s][A
 29%|██▊       | 787/2754 [03:40<11:56,  2.74it/s][A

epoch: 0, step: 991, loss: 3.31774



                                                  
  0%|          | 0/1 [03:40<?, ?it/s]6,  2.74it/s][A
 29%|██▊       | 788/2754 [03:40<11:08,  2.94it/s][A

epoch: 0, step: 992, loss: 5.44827



                                                  
  0%|          | 0/1 [03:40<?, ?it/s]8,  2.94it/s][A
 29%|██▊       | 789/2754 [03:40<10:11,  3.21it/s][A

epoch: 0, step: 993, loss: 3.67267



                                                  
  0%|          | 0/1 [03:40<?, ?it/s]1,  3.21it/s][A
 29%|██▊       | 790/2754 [03:40<09:35,  3.41it/s][A

epoch: 0, step: 994, loss: 2.80343



                                                  
  0%|          | 0/1 [03:41<?, ?it/s]5,  3.41it/s][A
 29%|██▊       | 791/2754 [03:41<09:34,  3.42it/s][A

epoch: 0, step: 995, loss: 2.57621



                                                  
  0%|          | 0/1 [03:41<?, ?it/s]4,  3.42it/s][A
 29%|██▉       | 792/2754 [03:41<09:08,  3.57it/s][A

epoch: 0, step: 996, loss: 4.78389



                                                  
  0%|          | 0/1 [03:41<?, ?it/s]8,  3.57it/s][A
 29%|██▉       | 793/2754 [03:41<09:03,  3.61it/s][A

epoch: 0, step: 997, loss: 2.89641



                                                  
  0%|          | 0/1 [03:41<?, ?it/s]3,  3.61it/s][A
 29%|██▉       | 794/2754 [03:41<08:48,  3.71it/s][A

epoch: 0, step: 998, loss: 1.58849



                                                  
  0%|          | 0/1 [03:42<?, ?it/s]8,  3.71it/s][A
 29%|██▉       | 795/2754 [03:42<08:31,  3.83it/s][A

epoch: 0, step: 999, loss: 3.83393



                                                  
  0%|          | 0/1 [03:42<?, ?it/s]1,  3.83it/s][A
 29%|██▉       | 796/2754 [03:42<08:35,  3.79it/s][A

epoch: 0, step: 1000, loss: 3.57565



                                                  
  0%|          | 0/1 [03:42<?, ?it/s]5,  3.79it/s][A
 29%|██▉       | 797/2754 [03:42<08:35,  3.80it/s][A

epoch: 0, step: 1001, loss: 3.80316



                                                  
  0%|          | 0/1 [03:42<?, ?it/s]5,  3.80it/s][A
 29%|██▉       | 798/2754 [03:42<08:40,  3.76it/s][A

epoch: 0, step: 1002, loss: 3.53728



                                                  
  0%|          | 0/1 [03:43<?, ?it/s]0,  3.76it/s][A
 29%|██▉       | 799/2754 [03:43<08:41,  3.75it/s][A

epoch: 0, step: 1003, loss: 3.59834



                                                  
  0%|          | 0/1 [03:43<?, ?it/s]1,  3.75it/s][A
 29%|██▉       | 800/2754 [03:43<08:19,  3.91it/s][A

epoch: 0, step: 1004, loss: 5.51548



                                                  
  0%|          | 0/1 [03:43<?, ?it/s]9,  3.91it/s][A
 29%|██▉       | 801/2754 [03:43<08:03,  4.04it/s][A

epoch: 0, step: 1005, loss: 2.67834



                                                  
  0%|          | 0/1 [03:44<?, ?it/s]3,  4.04it/s][A
 29%|██▉       | 802/2754 [03:44<11:48,  2.76it/s][A

epoch: 0, step: 1006, loss: 3.28452



                                                  
  0%|          | 0/1 [03:44<?, ?it/s]8,  2.76it/s][A
 29%|██▉       | 803/2754 [03:44<13:34,  2.40it/s][A

epoch: 0, step: 1007, loss: 2.83922



                                                  
  0%|          | 0/1 [03:45<?, ?it/s]4,  2.40it/s][A
 29%|██▉       | 804/2754 [03:45<13:26,  2.42it/s][A

epoch: 0, step: 1008, loss: 3.18144



                                                  
  0%|          | 0/1 [03:45<?, ?it/s]6,  2.42it/s][A
 29%|██▉       | 805/2754 [03:45<11:44,  2.77it/s][A

epoch: 0, step: 1009, loss: 4.83122



                                                  
  0%|          | 0/1 [03:45<?, ?it/s]4,  2.77it/s][A
 29%|██▉       | 806/2754 [03:45<11:06,  2.92it/s][A

epoch: 0, step: 1010, loss: 3.04858



                                                  
  0%|          | 0/1 [03:46<?, ?it/s]6,  2.92it/s][A
 29%|██▉       | 807/2754 [03:46<10:14,  3.17it/s][A

epoch: 0, step: 1011, loss: 2.34464



                                                  
  0%|          | 0/1 [03:46<?, ?it/s]4,  3.17it/s][A
 29%|██▉       | 808/2754 [03:46<09:27,  3.43it/s][A

epoch: 0, step: 1012, loss: 3.47754



                                                  
  0%|          | 0/1 [03:46<?, ?it/s]7,  3.43it/s][A
 29%|██▉       | 809/2754 [03:46<09:05,  3.57it/s][A

epoch: 0, step: 1013, loss: 2.03275



                                                  
  0%|          | 0/1 [03:46<?, ?it/s]5,  3.57it/s][A
 29%|██▉       | 810/2754 [03:46<09:23,  3.45it/s][A

epoch: 0, step: 1014, loss: 3.6813



                                                  
  0%|          | 0/1 [03:47<?, ?it/s]3,  3.45it/s][A
 29%|██▉       | 811/2754 [03:47<09:00,  3.60it/s][A

epoch: 0, step: 1015, loss: 3.29213



                                                  
  0%|          | 0/1 [03:47<?, ?it/s]0,  3.60it/s][A
 29%|██▉       | 812/2754 [03:47<08:43,  3.71it/s][A

epoch: 0, step: 1016, loss: 3.69896



                                                  
  0%|          | 0/1 [03:47<?, ?it/s]3,  3.71it/s][A
 30%|██▉       | 813/2754 [03:47<08:24,  3.84it/s][A

epoch: 0, step: 1017, loss: 2.81969



                                                  
  0%|          | 0/1 [03:47<?, ?it/s]4,  3.84it/s][A
 30%|██▉       | 814/2754 [03:47<08:15,  3.91it/s][A

epoch: 0, step: 1018, loss: 4.54883



                                                  
  0%|          | 0/1 [03:48<?, ?it/s]5,  3.91it/s][A
 30%|██▉       | 815/2754 [03:48<08:06,  3.98it/s][A

epoch: 0, step: 1019, loss: 3.32794



                                                  
  0%|          | 0/1 [03:48<?, ?it/s]6,  3.98it/s][A
 30%|██▉       | 816/2754 [03:48<08:00,  4.03it/s][A

epoch: 0, step: 1020, loss: 2.53194



                                                  
  0%|          | 0/1 [03:48<?, ?it/s]0,  4.03it/s][A
 30%|██▉       | 817/2754 [03:48<08:14,  3.91it/s][A

epoch: 0, step: 1021, loss: 2.32716



                                                  
  0%|          | 0/1 [03:48<?, ?it/s]4,  3.91it/s][A
 30%|██▉       | 818/2754 [03:48<08:08,  3.96it/s][A

epoch: 0, step: 1022, loss: 2.81119



                                                  
  0%|          | 0/1 [03:49<?, ?it/s]8,  3.96it/s][A
 30%|██▉       | 819/2754 [03:49<08:29,  3.80it/s][A

epoch: 0, step: 1023, loss: 3.74536



                                                  
  0%|          | 0/1 [03:49<?, ?it/s]9,  3.80it/s][A
 30%|██▉       | 820/2754 [03:49<08:21,  3.86it/s][A

epoch: 0, step: 1024, loss: 3.96136



                                                  
  0%|          | 0/1 [03:49<?, ?it/s]1,  3.86it/s][A
 30%|██▉       | 821/2754 [03:49<08:06,  3.97it/s][A

epoch: 0, step: 1025, loss: 3.77831



                                                  
  0%|          | 0/1 [03:49<?, ?it/s]6,  3.97it/s][A
 30%|██▉       | 822/2754 [03:49<08:04,  3.99it/s]

In [39]:
def print_memory_usage():
    """Print the CUDA memory PyTorch reports as allocated and reserved.

    Both figures are queried without a device argument and converted from
    bytes by dividing by 1024 ** 3 before being printed with two decimals.
    Returns None; the only effect is the two printed lines.
    """
    bytes_per_gb = 1024 ** 3

    # Memory occupied by live tensors.
    allocated_gb = torch.cuda.memory_allocated() / bytes_per_gb
    print(f"Allocated memory: {allocated_gb:.2f} GB")

    # Memory held by the caching allocator (reported under the "Cached" label).
    reserved_gb = torch.cuda.memory_reserved() / bytes_per_gb
    print(f"Cached memory: {reserved_gb:.2f} GB")

print_memory_usage()

Allocated memory: 73.42 GB
Cached memory: 76.90 GB


In [None]:
# Save the model to the local "model" directory with safe_serialization=True
# (safetensors output). unwrap_model() is used so save_pretrained() is called
# on the underlying model rather than on any wrapper accelerate may have added.
accelerator.wait_for_everyone()  # make sure every process is done before writing
if accelerator.is_main_process:
    # Write from one process only so a multi-process launch does not have
    # several ranks writing the same output files at once.
    accelerator.unwrap_model(model).save_pretrained("model", safe_serialization=True)