In [1]:
import pandas as pd
import torch
from datasets import Dataset, load_dataset
from peft import LoraConfig, get_peft_model
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from trl import SFTConfig, SFTTrainer, setup_chat_format



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the training split of the Q&A dataset from the Hugging Face Hub
dataset = load_dataset("yunfan-y/trump-qa", split="train")

# Load the base model and its tokenizer
model_name = "google/gemma-2-2b-it"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    attn_implementation="eager",
)
# `trust_remote_code` is intentionally NOT enabled: it lets code stored in the
# Hub repository run locally, and nothing in this notebook requires it.
# NOTE(review): assumes the Gemma tokenizer class is bundled with the installed
# transformers release -- confirm on the target environment.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# LoRA config: rank-16 adapters (alpha 32, dropout 0.05) attached only to the
# attention query/value projections; bias terms are left untouched.
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
)

# `setup_chat_format` is imported from `trl` in the first cell (it is not a
# custom helper). It takes the model and tokenizer and returns updated versions
# of both, which is why both names are rebound here.
# NOTE(review): per the TRL docs this installs a ChatML-style template and its
# special tokens; the `system` turn produced by `format_chat_template` relies on
# that template -- confirm against the installed TRL version.
model, tokenizer = setup_chat_format(model, tokenizer)

# Wrap the (chat-format-adjusted) base model with the LoRA adapters. This must
# come after `setup_chat_format` so the adapters wrap the final base model.
model = get_peft_model(model, peft_config)

def format_chat_template(row):
    """Render one dataset record as a single chat-formatted training string.

    The record's ``instruction``, ``input`` and ``output`` fields become the
    ``system``, ``user`` and ``assistant`` turns of a conversation. That
    conversation is rendered to text (not token ids) with the module-level
    ``tokenizer``'s chat template and stored under ``row["text"]``.

    Args:
        row: Mapping with ``"instruction"``, ``"input"`` and ``"output"`` keys.

    Returns:
        The same ``row`` object, with the ``"text"`` key added.
    """
    # (chat role, dataset column) pairs, in conversation order.
    role_columns = (
        ("system", "instruction"),
        ("user", "input"),
        ("assistant", "output"),
    )
    conversation = [
        {"role": role, "content": row[column]} for role, column in role_columns
    ]
    # tokenize=False -> return the rendered string rather than token ids.
    rendered = tokenizer.apply_chat_template(conversation, tokenize=False)
    row["text"] = rendered
    return row

# Add the chat-formatted "text" column by running the formatter over every row
dataset = dataset.map(function=format_chat_template)

# Show the resulting dataset summary (columns and row count)
print(dataset)

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.06it/s]


Dataset({
    features: ['output', 'input', 'instruction', 'text'],
    num_rows: 33596
})


In [3]:
# Setting hyperparameters
# Hub repository that receives the fine-tuned adapter after training.
HUB_MODEL_ID = "yunfan-y/trump-gemma-qa"

# SFTConfig extends TrainingArguments with the SFT-specific options
# (max_seq_length, dataset_text_field, packing). Passing those directly to
# SFTTrainer triggers the "Deprecated positional argument(s)" warning.
training_arguments = SFTConfig(
    output_dir="./result",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,  # effective batch size of 2 per device
    num_train_epochs=1,
    warmup_steps=10,
    learning_rate=2e-4,
    weight_decay=0.01,
    group_by_length=True,
    logging_strategy="steps",
    # Logging every single step prints one line per optimizer step (~16.8k
    # lines for this run) and buries the notebook; log periodically instead.
    logging_steps=50,
    report_to="wandb",
    # Target repo for trainer.push_to_hub(); without it the repo name is
    # derived from output_dir.
    hub_model_id=HUB_MODEL_ID,
    # SFT-specific settings
    max_seq_length=512,
    dataset_text_field="text",
    packing=False,
)

# Disable the generation KV cache on the model config before training.
model.config.use_cache = False

# Setting SFT parameters
# `model` has already been wrapped with get_peft_model() in the previous cell,
# so no peft_config is passed here (the adapters must not be applied twice).
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    tokenizer=tokenizer,
    args=training_arguments,
)

trainer.train()

# Trainer.push_to_hub()'s first positional parameter is the commit message, not
# the repository id; the destination comes from `hub_model_id` above. One call
# uploads the contents of output_dir (saved model, tokenizer, model card), so a
# separate model.push_to_hub() is not needed.
trainer.push_to_hub(commit_message="End of training")


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33maidenyang66[0m ([33myyfsss[0m). Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 1/16798 [00:00<2:55:22,  1.60it/s]

{'loss': 3.5246, 'grad_norm': 3.473902463912964, 'learning_rate': 2e-05, 'epoch': 0.0}


  0%|          | 2/16798 [00:00<1:52:32,  2.49it/s]

{'loss': 4.0154, 'grad_norm': 3.20951771736145, 'learning_rate': 4e-05, 'epoch': 0.0}


  0%|          | 3/16798 [00:01<1:37:50,  2.86it/s]

{'loss': 4.7408, 'grad_norm': 4.272924423217773, 'learning_rate': 6e-05, 'epoch': 0.0}


  0%|          | 4/16798 [00:01<1:27:03,  3.22it/s]

{'loss': 4.5748, 'grad_norm': 3.4644196033477783, 'learning_rate': 8e-05, 'epoch': 0.0}


  0%|          | 5/16798 [00:01<1:21:15,  3.44it/s]

{'loss': 4.4156, 'grad_norm': 3.0535202026367188, 'learning_rate': 0.0001, 'epoch': 0.0}


  0%|          | 6/16798 [00:01<1:17:56,  3.59it/s]

{'loss': 4.5662, 'grad_norm': 3.3364381790161133, 'learning_rate': 0.00012, 'epoch': 0.0}


  0%|          | 7/16798 [00:02<1:23:37,  3.35it/s]

{'loss': 3.7148, 'grad_norm': 2.821734666824341, 'learning_rate': 0.00014, 'epoch': 0.0}


  0%|          | 8/16798 [00:02<1:19:43,  3.51it/s]

{'loss': 3.9282, 'grad_norm': 3.176267147064209, 'learning_rate': 0.00016, 'epoch': 0.0}


  0%|          | 9/16798 [00:02<1:15:05,  3.73it/s]

{'loss': 3.676, 'grad_norm': 3.2430899143218994, 'learning_rate': 0.00018, 'epoch': 0.0}


  0%|          | 10/16798 [00:02<1:13:12,  3.82it/s]

{'loss': 4.387, 'grad_norm': 3.1358554363250732, 'learning_rate': 0.0002, 'epoch': 0.0}


  0%|          | 11/16798 [00:03<1:09:50,  4.01it/s]

{'loss': 3.4169, 'grad_norm': 3.2043588161468506, 'learning_rate': 0.00019998808672861567, 'epoch': 0.0}


  0%|          | 12/16798 [00:03<1:07:44,  4.13it/s]

{'loss': 3.3419, 'grad_norm': 3.818754196166992, 'learning_rate': 0.00019997617345723136, 'epoch': 0.0}


  0%|          | 13/16798 [00:03<1:07:13,  4.16it/s]

{'loss': 2.7295, 'grad_norm': 3.978616237640381, 'learning_rate': 0.00019996426018584703, 'epoch': 0.0}


  0%|          | 14/16798 [00:03<1:07:00,  4.17it/s]

{'loss': 3.1863, 'grad_norm': 4.241055488586426, 'learning_rate': 0.00019995234691446272, 'epoch': 0.0}


  0%|          | 15/16798 [00:04<1:08:09,  4.10it/s]

{'loss': 3.1373, 'grad_norm': 4.385052680969238, 'learning_rate': 0.00019994043364307838, 'epoch': 0.0}


  0%|          | 16/16798 [00:04<1:16:46,  3.64it/s]

{'loss': 2.7138, 'grad_norm': 4.62932825088501, 'learning_rate': 0.00019992852037169407, 'epoch': 0.0}


  0%|          | 17/16798 [00:04<1:21:21,  3.44it/s]

{'loss': 2.4189, 'grad_norm': 4.3601274490356445, 'learning_rate': 0.00019991660710030974, 'epoch': 0.0}


  0%|          | 18/16798 [00:05<1:24:12,  3.32it/s]

{'loss': 2.1017, 'grad_norm': 6.609704494476318, 'learning_rate': 0.00019990469382892543, 'epoch': 0.0}


  0%|          | 19/16798 [00:05<1:20:15,  3.48it/s]

{'loss': 2.5272, 'grad_norm': 4.298148155212402, 'learning_rate': 0.0001998927805575411, 'epoch': 0.0}


  0%|          | 20/16798 [00:05<1:19:58,  3.50it/s]

{'loss': 2.3446, 'grad_norm': 5.478255748748779, 'learning_rate': 0.00019988086728615678, 'epoch': 0.0}


  0%|          | 21/16798 [00:05<1:17:15,  3.62it/s]

{'loss': 2.4561, 'grad_norm': 6.543558597564697, 'learning_rate': 0.00019986895401477245, 'epoch': 0.0}


  0%|          | 22/16798 [00:06<1:14:38,  3.75it/s]

{'loss': 2.3551, 'grad_norm': 5.497115135192871, 'learning_rate': 0.00019985704074338814, 'epoch': 0.0}


  0%|          | 23/16798 [00:06<1:14:06,  3.77it/s]

{'loss': 2.4243, 'grad_norm': 5.539404392242432, 'learning_rate': 0.0001998451274720038, 'epoch': 0.0}


  0%|          | 24/16798 [00:06<1:12:21,  3.86it/s]

{'loss': 2.1464, 'grad_norm': 4.99462366104126, 'learning_rate': 0.0001998332142006195, 'epoch': 0.0}


  0%|          | 25/16798 [00:06<1:12:07,  3.88it/s]

{'loss': 2.0497, 'grad_norm': 3.509547472000122, 'learning_rate': 0.00019982130092923515, 'epoch': 0.0}


  0%|          | 26/16798 [00:07<1:10:28,  3.97it/s]

{'loss': 1.5093, 'grad_norm': 3.4117929935455322, 'learning_rate': 0.00019980938765785087, 'epoch': 0.0}


  0%|          | 28/16798 [00:07<1:04:48,  4.31it/s]

{'loss': 1.832, 'grad_norm': 5.0644307136535645, 'learning_rate': 0.00019979747438646654, 'epoch': 0.0}


  0%|          | 28/16798 [00:07<1:04:48,  4.31it/s]

{'loss': 2.0855, 'grad_norm': 4.277103424072266, 'learning_rate': 0.00019978556111508223, 'epoch': 0.0}


  0%|          | 29/16798 [00:07<1:05:28,  4.27it/s]

{'loss': 2.02, 'grad_norm': 3.9931344985961914, 'learning_rate': 0.0001997736478436979, 'epoch': 0.0}


  0%|          | 30/16798 [00:08<1:06:17,  4.22it/s]

{'loss': 2.2582, 'grad_norm': 4.954016208648682, 'learning_rate': 0.00019976173457231358, 'epoch': 0.0}


  0%|          | 31/16798 [00:08<1:09:53,  4.00it/s]

{'loss': 2.0926, 'grad_norm': 3.674499034881592, 'learning_rate': 0.00019974982130092924, 'epoch': 0.0}


  0%|          | 32/16798 [00:08<1:10:04,  3.99it/s]

{'loss': 1.6636, 'grad_norm': 3.557192087173462, 'learning_rate': 0.00019973790802954494, 'epoch': 0.0}


  0%|          | 33/16798 [00:08<1:08:53,  4.06it/s]

{'loss': 1.9629, 'grad_norm': 3.05800461769104, 'learning_rate': 0.0001997259947581606, 'epoch': 0.0}


  0%|          | 34/16798 [00:09<1:06:29,  4.20it/s]

{'loss': 1.667, 'grad_norm': 2.958118200302124, 'learning_rate': 0.0001997140814867763, 'epoch': 0.0}


  0%|          | 35/16798 [00:09<1:04:48,  4.31it/s]

{'loss': 1.919, 'grad_norm': 3.529329776763916, 'learning_rate': 0.00019970216821539195, 'epoch': 0.0}


  0%|          | 37/16798 [00:09<1:03:32,  4.40it/s]

{'loss': 1.8032, 'grad_norm': 3.4424331188201904, 'learning_rate': 0.00019969025494400764, 'epoch': 0.0}


  0%|          | 37/16798 [00:09<1:03:32,  4.40it/s]

{'loss': 1.5493, 'grad_norm': 2.951488733291626, 'learning_rate': 0.0001996783416726233, 'epoch': 0.0}


  0%|          | 39/16798 [00:10<1:01:26,  4.55it/s]

{'loss': 1.6402, 'grad_norm': 3.027912139892578, 'learning_rate': 0.000199666428401239, 'epoch': 0.0}


  0%|          | 39/16798 [00:10<1:01:26,  4.55it/s]

{'loss': 1.2951, 'grad_norm': 2.9349021911621094, 'learning_rate': 0.00019965451512985466, 'epoch': 0.0}


  0%|          | 40/16798 [00:10<1:01:19,  4.55it/s]

{'loss': 1.4604, 'grad_norm': 2.693211793899536, 'learning_rate': 0.00019964260185847035, 'epoch': 0.0}


  0%|          | 41/16798 [00:10<1:01:02,  4.58it/s]

{'loss': 1.2901, 'grad_norm': 2.8101935386657715, 'learning_rate': 0.00019963068858708602, 'epoch': 0.0}


  0%|          | 42/16798 [00:10<1:00:57,  4.58it/s]

{'loss': 1.6399, 'grad_norm': 4.173896312713623, 'learning_rate': 0.0001996187753157017, 'epoch': 0.0}


  0%|          | 44/16798 [00:11<59:28,  4.69it/s]  

{'loss': 1.4014, 'grad_norm': 3.055272102355957, 'learning_rate': 0.00019960686204431737, 'epoch': 0.0}


  0%|          | 44/16798 [00:11<59:28,  4.69it/s]

{'loss': 1.1466, 'grad_norm': 3.1896145343780518, 'learning_rate': 0.00019959494877293306, 'epoch': 0.0}


  0%|          | 46/16798 [00:11<58:49,  4.75it/s]

{'loss': 1.1024, 'grad_norm': 2.987196922302246, 'learning_rate': 0.00019958303550154873, 'epoch': 0.0}


  0%|          | 46/16798 [00:11<58:49,  4.75it/s]

{'loss': 1.1624, 'grad_norm': 2.982992172241211, 'learning_rate': 0.00019957112223016442, 'epoch': 0.0}


  0%|          | 48/16798 [00:12<58:02,  4.81it/s]

{'loss': 1.0349, 'grad_norm': 3.1904730796813965, 'learning_rate': 0.00019955920895878008, 'epoch': 0.0}


  0%|          | 49/16798 [00:12<56:57,  4.90it/s]

{'loss': 0.7187, 'grad_norm': 3.956303834915161, 'learning_rate': 0.00019954729568739577, 'epoch': 0.0}


  0%|          | 49/16798 [00:12<56:57,  4.90it/s]

{'loss': 0.6549, 'grad_norm': 2.912600040435791, 'learning_rate': 0.00019953538241601143, 'epoch': 0.0}


  0%|          | 50/16798 [00:12<58:06,  4.80it/s]

{'loss': 0.6263, 'grad_norm': 3.053001880645752, 'learning_rate': 0.00019952346914462713, 'epoch': 0.0}


  0%|          | 51/16798 [00:12<1:00:26,  4.62it/s]

{'loss': 2.4403, 'grad_norm': 5.40882682800293, 'learning_rate': 0.0001995115558732428, 'epoch': 0.0}


  0%|          | 52/16798 [00:12<1:02:26,  4.47it/s]

{'loss': 2.3026, 'grad_norm': 4.075570106506348, 'learning_rate': 0.00019949964260185848, 'epoch': 0.0}


  0%|          | 53/16798 [00:13<1:03:10,  4.42it/s]

{'loss': 2.2079, 'grad_norm': 3.945481061935425, 'learning_rate': 0.00019948772933047414, 'epoch': 0.0}


  0%|          | 54/16798 [00:13<1:02:28,  4.47it/s]

{'loss': 2.5609, 'grad_norm': 3.8040714263916016, 'learning_rate': 0.00019947581605908983, 'epoch': 0.0}


  0%|          | 55/16798 [00:13<1:02:00,  4.50it/s]

{'loss': 2.0003, 'grad_norm': 2.6449522972106934, 'learning_rate': 0.0001994639027877055, 'epoch': 0.0}


  0%|          | 56/16798 [00:13<1:02:59,  4.43it/s]

{'loss': 2.008, 'grad_norm': 3.2706897258758545, 'learning_rate': 0.0001994519895163212, 'epoch': 0.0}


  0%|          | 57/16798 [00:14<1:03:05,  4.42it/s]

{'loss': 1.9291, 'grad_norm': 2.3812100887298584, 'learning_rate': 0.00019944007624493688, 'epoch': 0.0}


  0%|          | 58/16798 [00:14<1:02:12,  4.48it/s]

{'loss': 2.0577, 'grad_norm': 2.4738502502441406, 'learning_rate': 0.00019942816297355257, 'epoch': 0.0}


  0%|          | 59/16798 [00:14<1:01:53,  4.51it/s]

{'loss': 2.3598, 'grad_norm': 2.618337631225586, 'learning_rate': 0.00019941624970216823, 'epoch': 0.0}


  0%|          | 60/16798 [00:14<1:01:38,  4.53it/s]

{'loss': 2.0674, 'grad_norm': 2.2337894439697266, 'learning_rate': 0.00019940433643078392, 'epoch': 0.0}


  0%|          | 61/16798 [00:14<1:00:43,  4.59it/s]

{'loss': 2.6533, 'grad_norm': 2.4096269607543945, 'learning_rate': 0.0001993924231593996, 'epoch': 0.0}


  0%|          | 63/16798 [00:15<59:21,  4.70it/s]  

{'loss': 1.7188, 'grad_norm': 2.002056121826172, 'learning_rate': 0.00019938050988801528, 'epoch': 0.0}


  0%|          | 63/16798 [00:15<59:21,  4.70it/s]

{'loss': 2.267, 'grad_norm': 2.9906952381134033, 'learning_rate': 0.00019936859661663094, 'epoch': 0.0}


  0%|          | 64/16798 [00:15<1:02:10,  4.49it/s]

{'loss': 1.7019, 'grad_norm': 2.935520648956299, 'learning_rate': 0.00019935668334524663, 'epoch': 0.0}


  0%|          | 65/16798 [00:15<1:02:21,  4.47it/s]

{'loss': 2.1225, 'grad_norm': 2.9608304500579834, 'learning_rate': 0.0001993447700738623, 'epoch': 0.0}


  0%|          | 67/16798 [00:16<1:00:48,  4.59it/s]

{'loss': 1.6218, 'grad_norm': 1.9683712720870972, 'learning_rate': 0.000199332856802478, 'epoch': 0.0}


  0%|          | 68/16798 [00:16<59:55,  4.65it/s]  

{'loss': 1.5323, 'grad_norm': 2.2481374740600586, 'learning_rate': 0.00019932094353109365, 'epoch': 0.0}


  0%|          | 69/16798 [00:16<58:25,  4.77it/s]

{'loss': 1.5746, 'grad_norm': 2.57795786857605, 'learning_rate': 0.00019930903025970931, 'epoch': 0.0}


  0%|          | 70/16798 [00:16<58:14,  4.79it/s]

{'loss': 2.1818, 'grad_norm': 2.883207321166992, 'learning_rate': 0.000199297116988325, 'epoch': 0.0}


  0%|          | 71/16798 [00:17<57:53,  4.82it/s]

{'loss': 2.2942, 'grad_norm': 2.6793863773345947, 'learning_rate': 0.00019928520371694067, 'epoch': 0.0}


  0%|          | 72/16798 [00:17<57:30,  4.85it/s]

{'loss': 1.4353, 'grad_norm': 2.2051217555999756, 'learning_rate': 0.00019927329044555636, 'epoch': 0.0}


  0%|          | 73/16798 [00:17<57:26,  4.85it/s]

{'loss': 1.5152, 'grad_norm': 2.5738818645477295, 'learning_rate': 0.00019926137717417202, 'epoch': 0.0}


  0%|          | 73/16798 [00:17<57:26,  4.85it/s]

{'loss': 1.6438, 'grad_norm': 2.4852023124694824, 'learning_rate': 0.00019924946390278771, 'epoch': 0.0}


  0%|          | 75/16798 [00:17<57:30,  4.85it/s]

{'loss': 2.0485, 'grad_norm': 2.4955215454101562, 'learning_rate': 0.00019923755063140338, 'epoch': 0.0}


  0%|          | 76/16798 [00:18<58:07,  4.80it/s]

{'loss': 1.7082, 'grad_norm': 2.2850332260131836, 'learning_rate': 0.00019922563736001907, 'epoch': 0.0}


  0%|          | 76/16798 [00:18<58:07,  4.80it/s]

{'loss': 1.9258, 'grad_norm': 2.5296318531036377, 'learning_rate': 0.00019921372408863473, 'epoch': 0.0}


  0%|          | 77/16798 [00:18<57:59,  4.81it/s]

{'loss': 1.3471, 'grad_norm': 1.9650959968566895, 'learning_rate': 0.00019920181081725042, 'epoch': 0.0}


  0%|          | 78/16798 [00:18<59:32,  4.68it/s]

{'loss': 1.6016, 'grad_norm': 2.8467116355895996, 'learning_rate': 0.0001991898975458661, 'epoch': 0.0}


  0%|          | 79/16798 [00:18<59:56,  4.65it/s]

{'loss': 1.8747, 'grad_norm': 2.979612350463867, 'learning_rate': 0.00019917798427448178, 'epoch': 0.0}


  0%|          | 80/16798 [00:19<59:24,  4.69it/s]

{'loss': 1.504, 'grad_norm': 2.6125004291534424, 'learning_rate': 0.00019916607100309744, 'epoch': 0.0}


  0%|          | 82/16798 [00:19<58:00,  4.80it/s]

{'loss': 1.8073, 'grad_norm': 2.777540683746338, 'learning_rate': 0.00019915415773171313, 'epoch': 0.0}


  0%|          | 83/16798 [00:19<57:44,  4.83it/s]

{'loss': 1.4359, 'grad_norm': 2.4791438579559326, 'learning_rate': 0.0001991422444603288, 'epoch': 0.0}


  1%|          | 84/16798 [00:19<57:44,  4.82it/s]

{'loss': 1.8341, 'grad_norm': 2.4518983364105225, 'learning_rate': 0.0001991303311889445, 'epoch': 0.0}


  1%|          | 85/16798 [00:20<58:19,  4.78it/s]

{'loss': 1.2506, 'grad_norm': 2.192383050918579, 'learning_rate': 0.00019911841791756015, 'epoch': 0.01}


  1%|          | 85/16798 [00:20<58:19,  4.78it/s]

{'loss': 1.4987, 'grad_norm': 2.5649044513702393, 'learning_rate': 0.00019910650464617584, 'epoch': 0.01}


  1%|          | 86/16798 [00:20<58:18,  4.78it/s]

{'loss': 1.4396, 'grad_norm': 2.5283162593841553, 'learning_rate': 0.0001990945913747915, 'epoch': 0.01}


  1%|          | 88/16798 [00:20<58:58,  4.72it/s]

{'loss': 1.7395, 'grad_norm': 2.629659414291382, 'learning_rate': 0.0001990826781034072, 'epoch': 0.01}


  1%|          | 88/16798 [00:20<58:58,  4.72it/s]

{'loss': 1.1875, 'grad_norm': 2.723053216934204, 'learning_rate': 0.00019907076483202289, 'epoch': 0.01}


  1%|          | 89/16798 [00:20<59:23,  4.69it/s]

{'loss': 1.0595, 'grad_norm': 2.2763710021972656, 'learning_rate': 0.00019905885156063858, 'epoch': 0.01}


  1%|          | 91/16798 [00:21<58:35,  4.75it/s]

{'loss': 1.1455, 'grad_norm': 2.076033115386963, 'learning_rate': 0.00019904693828925424, 'epoch': 0.01}


  1%|          | 92/16798 [00:21<57:46,  4.82it/s]

{'loss': 1.3147, 'grad_norm': 2.608227491378784, 'learning_rate': 0.00019903502501786993, 'epoch': 0.01}


  1%|          | 93/16798 [00:21<57:31,  4.84it/s]

{'loss': 1.3731, 'grad_norm': 2.4622631072998047, 'learning_rate': 0.0001990231117464856, 'epoch': 0.01}


  1%|          | 94/16798 [00:21<57:08,  4.87it/s]

{'loss': 1.3776, 'grad_norm': 3.174724817276001, 'learning_rate': 0.00019901119847510129, 'epoch': 0.01}


  1%|          | 94/16798 [00:21<57:08,  4.87it/s]

{'loss': 1.6168, 'grad_norm': 2.9857230186462402, 'learning_rate': 0.00019899928520371695, 'epoch': 0.01}


  1%|          | 96/16798 [00:22<57:42,  4.82it/s]

{'loss': 1.0409, 'grad_norm': 2.139450788497925, 'learning_rate': 0.00019898737193233264, 'epoch': 0.01}


  1%|          | 96/16798 [00:22<57:42,  4.82it/s]

{'loss': 0.8962, 'grad_norm': 2.356783866882324, 'learning_rate': 0.0001989754586609483, 'epoch': 0.01}


  1%|          | 98/16798 [00:22<58:02,  4.80it/s]

{'loss': 1.5335, 'grad_norm': 3.0167176723480225, 'learning_rate': 0.000198963545389564, 'epoch': 0.01}


  1%|          | 99/16798 [00:22<58:30,  4.76it/s]

{'loss': 0.5441, 'grad_norm': 2.124826669692993, 'learning_rate': 0.00019895163211817966, 'epoch': 0.01}


  1%|          | 100/16798 [00:23<57:26,  4.84it/s]

{'loss': 0.55, 'grad_norm': 1.9362972974777222, 'learning_rate': 0.00019893971884679535, 'epoch': 0.01}


  1%|          | 100/16798 [00:23<57:26,  4.84it/s]

{'loss': 0.7243, 'grad_norm': 1.940068244934082, 'learning_rate': 0.000198927805575411, 'epoch': 0.01}


  1%|          | 101/16798 [00:23<58:59,  4.72it/s]

{'loss': 1.9972, 'grad_norm': 2.1533591747283936, 'learning_rate': 0.0001989158923040267, 'epoch': 0.01}


  1%|          | 102/16798 [00:23<1:00:06,  4.63it/s]

{'loss': 2.296, 'grad_norm': 2.565781593322754, 'learning_rate': 0.00019890397903264237, 'epoch': 0.01}


  1%|          | 104/16798 [00:24<59:39,  4.66it/s]  

{'loss': 2.2274, 'grad_norm': 2.704712390899658, 'learning_rate': 0.00019889206576125806, 'epoch': 0.01}


  1%|          | 104/16798 [00:24<59:39,  4.66it/s]

{'loss': 2.0814, 'grad_norm': 2.5417473316192627, 'learning_rate': 0.00019888015248987372, 'epoch': 0.01}


  1%|          | 105/16798 [00:24<1:00:04,  4.63it/s]

{'loss': 2.2843, 'grad_norm': 2.3420779705047607, 'learning_rate': 0.0001988682392184894, 'epoch': 0.01}


  1%|          | 106/16798 [00:24<59:55,  4.64it/s]  

{'loss': 2.4008, 'grad_norm': 2.454010248184204, 'learning_rate': 0.00019885632594710508, 'epoch': 0.01}


  1%|          | 107/16798 [00:24<1:00:21,  4.61it/s]

{'loss': 2.2881, 'grad_norm': 2.3762118816375732, 'learning_rate': 0.00019884441267572077, 'epoch': 0.01}


  1%|          | 108/16798 [00:24<1:00:39,  4.59it/s]

{'loss': 1.9764, 'grad_norm': 2.233440399169922, 'learning_rate': 0.00019883249940433643, 'epoch': 0.01}


  1%|          | 109/16798 [00:25<59:28,  4.68it/s]  

{'loss': 2.2282, 'grad_norm': 2.5956156253814697, 'learning_rate': 0.00019882058613295212, 'epoch': 0.01}


  1%|          | 110/16798 [00:25<1:00:25,  4.60it/s]

{'loss': 1.7037, 'grad_norm': 2.266998052597046, 'learning_rate': 0.00019880867286156778, 'epoch': 0.01}


  1%|          | 111/16798 [00:25<1:00:35,  4.59it/s]

{'loss': 1.7745, 'grad_norm': 2.1386892795562744, 'learning_rate': 0.00019879675959018348, 'epoch': 0.01}


  1%|          | 112/16798 [00:25<1:00:11,  4.62it/s]

{'loss': 1.9607, 'grad_norm': 2.373445987701416, 'learning_rate': 0.00019878484631879914, 'epoch': 0.01}


  1%|          | 113/16798 [00:26<59:47,  4.65it/s]  

{'loss': 1.9459, 'grad_norm': 2.2997825145721436, 'learning_rate': 0.00019877293304741483, 'epoch': 0.01}


  1%|          | 114/16798 [00:26<1:00:22,  4.61it/s]

{'loss': 1.9139, 'grad_norm': 2.3915417194366455, 'learning_rate': 0.0001987610197760305, 'epoch': 0.01}


  1%|          | 115/16798 [00:26<59:23,  4.68it/s]  

{'loss': 1.7437, 'grad_norm': 2.1672234535217285, 'learning_rate': 0.00019874910650464618, 'epoch': 0.01}


  1%|          | 116/16798 [00:26<58:58,  4.71it/s]

{'loss': 1.4941, 'grad_norm': 2.116889238357544, 'learning_rate': 0.00019873719323326185, 'epoch': 0.01}


  1%|          | 117/16798 [00:26<59:06,  4.70it/s]

{'loss': 1.5776, 'grad_norm': 2.035808801651001, 'learning_rate': 0.00019872527996187754, 'epoch': 0.01}


  1%|          | 118/16798 [00:27<1:00:48,  4.57it/s]

{'loss': 1.8668, 'grad_norm': 2.145151376724243, 'learning_rate': 0.0001987133666904932, 'epoch': 0.01}


  1%|          | 119/16798 [00:27<1:00:48,  4.57it/s]

{'loss': 1.889, 'grad_norm': 2.2098445892333984, 'learning_rate': 0.00019870145341910892, 'epoch': 0.01}


  1%|          | 120/16798 [00:27<59:30,  4.67it/s]  

{'loss': 1.913, 'grad_norm': 2.5893092155456543, 'learning_rate': 0.00019868954014772458, 'epoch': 0.01}


  1%|          | 122/16798 [00:27<59:13,  4.69it/s]

{'loss': 1.3982, 'grad_norm': 1.907443881034851, 'learning_rate': 0.00019867762687634027, 'epoch': 0.01}


  1%|          | 122/16798 [00:27<59:13,  4.69it/s]

{'loss': 1.617, 'grad_norm': 2.1048247814178467, 'learning_rate': 0.00019866571360495594, 'epoch': 0.01}


  1%|          | 123/16798 [00:28<1:00:40,  4.58it/s]

{'loss': 1.3489, 'grad_norm': 2.1700544357299805, 'learning_rate': 0.00019865380033357163, 'epoch': 0.01}


  1%|          | 124/16798 [00:28<1:00:24,  4.60it/s]

{'loss': 1.6822, 'grad_norm': 2.334956407546997, 'learning_rate': 0.0001986418870621873, 'epoch': 0.01}


  1%|          | 125/16798 [00:28<1:00:25,  4.60it/s]

{'loss': 1.7654, 'grad_norm': 2.2888238430023193, 'learning_rate': 0.00019862997379080296, 'epoch': 0.01}


  1%|          | 126/16798 [00:28<1:00:11,  4.62it/s]

{'loss': 2.0366, 'grad_norm': 2.4373486042022705, 'learning_rate': 0.00019861806051941865, 'epoch': 0.01}


  1%|          | 127/16798 [00:29<59:45,  4.65it/s]  

{'loss': 1.2877, 'grad_norm': 1.9650906324386597, 'learning_rate': 0.0001986061472480343, 'epoch': 0.01}


  1%|          | 128/16798 [00:29<59:00,  4.71it/s]

{'loss': 1.9169, 'grad_norm': 2.7252023220062256, 'learning_rate': 0.00019859423397665, 'epoch': 0.01}


  1%|          | 130/16798 [00:29<59:15,  4.69it/s]  

{'loss': 1.5521, 'grad_norm': 2.401395082473755, 'learning_rate': 0.00019858232070526567, 'epoch': 0.01}


  1%|          | 131/16798 [00:29<58:48,  4.72it/s]

{'loss': 1.7421, 'grad_norm': 2.5504753589630127, 'learning_rate': 0.00019857040743388136, 'epoch': 0.01}


  1%|          | 131/16798 [00:29<58:48,  4.72it/s]

{'loss': 1.1865, 'grad_norm': 1.9076839685440063, 'learning_rate': 0.00019855849416249702, 'epoch': 0.01}


  1%|          | 132/16798 [00:30<58:09,  4.78it/s]

{'loss': 1.4855, 'grad_norm': 2.0753769874572754, 'learning_rate': 0.0001985465808911127, 'epoch': 0.01}


  1%|          | 134/16798 [00:30<58:01,  4.79it/s]

{'loss': 1.5635, 'grad_norm': 2.3771166801452637, 'learning_rate': 0.00019853466761972837, 'epoch': 0.01}


  1%|          | 134/16798 [00:30<58:01,  4.79it/s]

{'loss': 1.3525, 'grad_norm': 2.2227914333343506, 'learning_rate': 0.00019852275434834407, 'epoch': 0.01}


  1%|          | 135/16798 [00:30<59:03,  4.70it/s]

{'loss': 1.725, 'grad_norm': 2.6166422367095947, 'learning_rate': 0.00019851084107695973, 'epoch': 0.01}


  1%|          | 137/16798 [00:31<59:56,  4.63it/s]  

{'loss': 1.5886, 'grad_norm': 2.4900121688842773, 'learning_rate': 0.00019849892780557542, 'epoch': 0.01}


  1%|          | 138/16798 [00:31<59:07,  4.70it/s]

{'loss': 0.9806, 'grad_norm': 1.8156174421310425, 'learning_rate': 0.00019848701453419108, 'epoch': 0.01}


  1%|          | 139/16798 [00:31<58:30,  4.74it/s]

{'loss': 1.1863, 'grad_norm': 2.363743543624878, 'learning_rate': 0.00019847510126280677, 'epoch': 0.01}


  1%|          | 140/16798 [00:31<58:24,  4.75it/s]

{'loss': 1.5108, 'grad_norm': 2.46883487701416, 'learning_rate': 0.00019846318799142244, 'epoch': 0.01}


  1%|          | 141/16798 [00:31<57:26,  4.83it/s]

{'loss': 1.5194, 'grad_norm': 2.4065682888031006, 'learning_rate': 0.00019845127472003813, 'epoch': 0.01}


  1%|          | 142/16798 [00:32<56:41,  4.90it/s]

{'loss': 1.4602, 'grad_norm': 2.813140392303467, 'learning_rate': 0.0001984393614486538, 'epoch': 0.01}


  1%|          | 143/16798 [00:32<56:48,  4.89it/s]

{'loss': 1.2304, 'grad_norm': 2.188528299331665, 'learning_rate': 0.00019842744817726948, 'epoch': 0.01}


  1%|          | 143/16798 [00:32<56:48,  4.89it/s]

{'loss': 1.3218, 'grad_norm': 2.665053129196167, 'learning_rate': 0.00019841553490588515, 'epoch': 0.01}


  1%|          | 145/16798 [00:32<57:43,  4.81it/s]

{'loss': 1.5525, 'grad_norm': 2.3601443767547607, 'learning_rate': 0.00019840362163450084, 'epoch': 0.01}


  1%|          | 146/16798 [00:33<56:54,  4.88it/s]

{'loss': 0.9858, 'grad_norm': 5.638690948486328, 'learning_rate': 0.0001983917083631165, 'epoch': 0.01}


  1%|          | 146/16798 [00:33<56:54,  4.88it/s]

{'loss': 1.0202, 'grad_norm': 2.7236976623535156, 'learning_rate': 0.0001983797950917322, 'epoch': 0.01}


  1%|          | 147/16798 [00:33<57:21,  4.84it/s]

{'loss': 1.2023, 'grad_norm': 2.6936593055725098, 'learning_rate': 0.00019836788182034786, 'epoch': 0.01}


  1%|          | 148/16798 [00:33<58:21,  4.75it/s]

{'loss': 0.6123, 'grad_norm': 1.8098739385604858, 'learning_rate': 0.00019835596854896355, 'epoch': 0.01}


  1%|          | 150/16798 [00:33<58:17,  4.76it/s]

{'loss': 0.6401, 'grad_norm': 1.7161237001419067, 'learning_rate': 0.00019834405527757924, 'epoch': 0.01}


  1%|          | 150/16798 [00:33<58:17,  4.76it/s]

{'loss': 0.6838, 'grad_norm': 1.909170150756836, 'learning_rate': 0.00019833214200619493, 'epoch': 0.01}


  1%|          | 151/16798 [00:34<59:19,  4.68it/s]

{'loss': 2.18, 'grad_norm': 3.1034228801727295, 'learning_rate': 0.0001983202287348106, 'epoch': 0.01}


  1%|          | 152/16798 [00:34<1:00:10,  4.61it/s]

{'loss': 2.4559, 'grad_norm': 2.7219955921173096, 'learning_rate': 0.00019830831546342628, 'epoch': 0.01}


  1%|          | 153/16798 [00:34<1:00:20,  4.60it/s]

{'loss': 2.4293, 'grad_norm': 2.67626690864563, 'learning_rate': 0.00019829640219204195, 'epoch': 0.01}


  1%|          | 154/16798 [00:34<1:00:45,  4.57it/s]

{'loss': 2.0208, 'grad_norm': 2.704387903213501, 'learning_rate': 0.00019828448892065764, 'epoch': 0.01}


  1%|          | 155/16798 [00:34<1:01:51,  4.48it/s]

{'loss': 2.4117, 'grad_norm': 2.6426379680633545, 'learning_rate': 0.0001982725756492733, 'epoch': 0.01}


  1%|          | 157/16798 [00:35<59:54,  4.63it/s]  

{'loss': 2.269, 'grad_norm': 2.3542702198028564, 'learning_rate': 0.000198260662377889, 'epoch': 0.01}


  1%|          | 158/16798 [00:35<59:55,  4.63it/s]

{'loss': 2.0394, 'grad_norm': 2.29575252532959, 'learning_rate': 0.00019824874910650465, 'epoch': 0.01}


  1%|          | 158/16798 [00:35<59:55,  4.63it/s]

{'loss': 1.978, 'grad_norm': 2.2662014961242676, 'learning_rate': 0.00019823683583512035, 'epoch': 0.01}


  1%|          | 159/16798 [00:35<59:43,  4.64it/s]

{'loss': 2.2014, 'grad_norm': 2.704313039779663, 'learning_rate': 0.000198224922563736, 'epoch': 0.01}


  1%|          | 160/16798 [00:36<1:00:32,  4.58it/s]

{'loss': 1.6968, 'grad_norm': 2.0236146450042725, 'learning_rate': 0.0001982130092923517, 'epoch': 0.01}


  1%|          | 161/16798 [00:36<1:01:35,  4.50it/s]

{'loss': 1.5209, 'grad_norm': 2.0073249340057373, 'learning_rate': 0.00019820109602096736, 'epoch': 0.01}


  1%|          | 162/16798 [00:36<1:00:49,  4.56it/s]

{'loss': 1.8277, 'grad_norm': 2.057034969329834, 'learning_rate': 0.00019818918274958305, 'epoch': 0.01}


  1%|          | 163/16798 [00:36<1:00:00,  4.62it/s]

{'loss': 1.6293, 'grad_norm': 2.1133875846862793, 'learning_rate': 0.00019817726947819872, 'epoch': 0.01}


  1%|          | 164/16798 [00:36<1:00:26,  4.59it/s]

{'loss': 1.7207, 'grad_norm': 2.2120563983917236, 'learning_rate': 0.0001981653562068144, 'epoch': 0.01}


  1%|          | 166/16798 [00:37<59:49,  4.63it/s]  

{'loss': 1.7284, 'grad_norm': 2.135263681411743, 'learning_rate': 0.00019815344293543007, 'epoch': 0.01}


  1%|          | 166/16798 [00:37<59:49,  4.63it/s]

{'loss': 1.9178, 'grad_norm': 1.9324592351913452, 'learning_rate': 0.00019814152966404576, 'epoch': 0.01}


  1%|          | 167/16798 [00:37<59:35,  4.65it/s]

{'loss': 2.089, 'grad_norm': 2.4446651935577393, 'learning_rate': 0.00019812961639266143, 'epoch': 0.01}


  1%|          | 168/16798 [00:37<59:21,  4.67it/s]

{'loss': 1.3794, 'grad_norm': 1.810041904449463, 'learning_rate': 0.00019811770312127712, 'epoch': 0.01}


  1%|          | 170/16798 [00:38<58:54,  4.70it/s]

{'loss': 1.8145, 'grad_norm': 2.1079189777374268, 'learning_rate': 0.00019810578984989278, 'epoch': 0.01}


  1%|          | 170/16798 [00:38<58:54,  4.70it/s]

{'loss': 1.7226, 'grad_norm': 1.990718126296997, 'learning_rate': 0.00019809387657850847, 'epoch': 0.01}


  1%|          | 171/16798 [00:38<58:41,  4.72it/s]

{'loss': 1.806, 'grad_norm': 2.3429062366485596, 'learning_rate': 0.00019808196330712414, 'epoch': 0.01}


  1%|          | 172/16798 [00:38<58:54,  4.70it/s]

{'loss': 1.5199, 'grad_norm': 1.9774984121322632, 'learning_rate': 0.00019807005003573983, 'epoch': 0.01}


  1%|          | 174/16798 [00:39<58:28,  4.74it/s]

{'loss': 1.5292, 'grad_norm': 2.3615505695343018, 'learning_rate': 0.0001980581367643555, 'epoch': 0.01}


  1%|          | 175/16798 [00:39<58:16,  4.75it/s]

{'loss': 1.8631, 'grad_norm': 2.314798593521118, 'learning_rate': 0.00019804622349297118, 'epoch': 0.01}


  1%|          | 176/16798 [00:39<58:00,  4.78it/s]

{'loss': 1.7103, 'grad_norm': 2.415259599685669, 'learning_rate': 0.00019803431022158684, 'epoch': 0.01}


  1%|          | 176/16798 [00:39<58:00,  4.78it/s]

{'loss': 2.151, 'grad_norm': 2.2372758388519287, 'learning_rate': 0.00019802239695020253, 'epoch': 0.01}


  1%|          | 177/16798 [00:39<57:59,  4.78it/s]

{'loss': 1.6604, 'grad_norm': 2.141331911087036, 'learning_rate': 0.0001980104836788182, 'epoch': 0.01}


  1%|          | 179/16798 [00:40<58:28,  4.74it/s]

{'loss': 1.6753, 'grad_norm': 2.042346954345703, 'learning_rate': 0.0001979985704074339, 'epoch': 0.01}


  1%|          | 180/16798 [00:40<57:32,  4.81it/s]

{'loss': 1.3117, 'grad_norm': 1.6105762720108032, 'learning_rate': 0.00019798665713604955, 'epoch': 0.01}


  1%|          | 180/16798 [00:40<57:32,  4.81it/s]

{'loss': 1.4588, 'grad_norm': 1.8968907594680786, 'learning_rate': 0.00019797474386466527, 'epoch': 0.01}


  1%|          | 182/16798 [00:40<57:03,  4.85it/s]

{'loss': 1.3975, 'grad_norm': 1.7669689655303955, 'learning_rate': 0.00019796283059328093, 'epoch': 0.01}


  1%|          | 182/16798 [00:40<57:03,  4.85it/s]

{'loss': 1.8118, 'grad_norm': 2.376194715499878, 'learning_rate': 0.0001979509173218966, 'epoch': 0.01}


  1%|          | 183/16798 [00:40<59:48,  4.63it/s]

{'loss': 1.5643, 'grad_norm': 2.0253546237945557, 'learning_rate': 0.0001979390040505123, 'epoch': 0.01}


  1%|          | 185/16798 [00:41<58:38,  4.72it/s]

{'loss': 1.6785, 'grad_norm': 2.458179235458374, 'learning_rate': 0.00019792709077912795, 'epoch': 0.01}


  1%|          | 185/16798 [00:41<58:38,  4.72it/s]

{'loss': 1.6313, 'grad_norm': 2.102092981338501, 'learning_rate': 0.00019791517750774364, 'epoch': 0.01}


  1%|          | 187/16798 [00:41<57:40,  4.80it/s]

{'loss': 1.4035, 'grad_norm': 1.922805666923523, 'learning_rate': 0.0001979032642363593, 'epoch': 0.01}


  1%|          | 188/16798 [00:42<57:57,  4.78it/s]

{'loss': 1.3389, 'grad_norm': 1.9028115272521973, 'learning_rate': 0.000197891350964975, 'epoch': 0.01}


  1%|          | 189/16798 [00:42<57:25,  4.82it/s]

{'loss': 1.9654, 'grad_norm': 2.319019079208374, 'learning_rate': 0.00019787943769359066, 'epoch': 0.01}


  1%|          | 190/16798 [00:42<57:38,  4.80it/s]

{'loss': 1.4281, 'grad_norm': 1.9038681983947754, 'learning_rate': 0.00019786752442220635, 'epoch': 0.01}


  1%|          | 190/16798 [00:42<57:38,  4.80it/s]

{'loss': 1.3518, 'grad_norm': 2.173862934112549, 'learning_rate': 0.00019785561115082202, 'epoch': 0.01}


  1%|          | 191/16798 [00:42<58:06,  4.76it/s]

{'loss': 1.4022, 'grad_norm': 2.2556653022766113, 'learning_rate': 0.0001978436978794377, 'epoch': 0.01}


  1%|          | 192/16798 [00:42<57:46,  4.79it/s]

{'loss': 1.0044, 'grad_norm': 1.7112444639205933, 'learning_rate': 0.00019783178460805337, 'epoch': 0.01}


  1%|          | 193/16798 [00:43<57:54,  4.78it/s]

{'loss': 1.7032, 'grad_norm': 2.477653980255127, 'learning_rate': 0.00019781987133666906, 'epoch': 0.01}


  1%|          | 195/16798 [00:43<58:17,  4.75it/s]

{'loss': 1.2602, 'grad_norm': 2.215149164199829, 'learning_rate': 0.00019780795806528472, 'epoch': 0.01}


  1%|          | 195/16798 [00:43<58:17,  4.75it/s]

{'loss': 1.202, 'grad_norm': 2.096195936203003, 'learning_rate': 0.00019779604479390042, 'epoch': 0.01}


  1%|          | 196/16798 [00:43<58:03,  4.77it/s]

{'loss': 0.9801, 'grad_norm': 1.9516470432281494, 'learning_rate': 0.00019778413152251608, 'epoch': 0.01}


  1%|          | 198/16798 [00:44<57:43,  4.79it/s]

{'loss': 1.2075, 'grad_norm': 2.4570207595825195, 'learning_rate': 0.00019777221825113177, 'epoch': 0.01}


  1%|          | 199/16798 [00:44<57:43,  4.79it/s]

{'loss': 0.6693, 'grad_norm': 1.8698316812515259, 'learning_rate': 0.00019776030497974743, 'epoch': 0.01}


  1%|          | 199/16798 [00:44<57:43,  4.79it/s]

{'loss': 0.4114, 'grad_norm': 1.446107029914856, 'learning_rate': 0.00019774839170836312, 'epoch': 0.01}


  1%|          | 200/16798 [00:44<57:31,  4.81it/s]

{'loss': 0.6357, 'grad_norm': 2.0771312713623047, 'learning_rate': 0.0001977364784369788, 'epoch': 0.01}


  1%|          | 201/16798 [00:44<59:08,  4.68it/s]

{'loss': 2.0382, 'grad_norm': 1.8953179121017456, 'learning_rate': 0.00019772456516559448, 'epoch': 0.01}


  1%|          | 202/16798 [00:44<1:00:37,  4.56it/s]

{'loss': 2.0846, 'grad_norm': 1.8845754861831665, 'learning_rate': 0.00019771265189421014, 'epoch': 0.01}


  1%|          | 203/16798 [00:45<1:01:02,  4.53it/s]

{'loss': 2.4369, 'grad_norm': 2.42836332321167, 'learning_rate': 0.00019770073862282583, 'epoch': 0.01}


  1%|          | 204/16798 [00:45<1:01:38,  4.49it/s]

{'loss': 2.4592, 'grad_norm': 2.6333417892456055, 'learning_rate': 0.0001976888253514415, 'epoch': 0.01}


  1%|          | 205/16798 [00:45<1:02:00,  4.46it/s]

{'loss': 2.0344, 'grad_norm': 2.170881986618042, 'learning_rate': 0.0001976769120800572, 'epoch': 0.01}


  1%|          | 206/16798 [00:45<1:02:12,  4.44it/s]

{'loss': 2.2768, 'grad_norm': 2.364957094192505, 'learning_rate': 0.00019766499880867285, 'epoch': 0.01}


  1%|          | 208/16798 [00:46<1:01:29,  4.50it/s]

{'loss': 1.9706, 'grad_norm': 2.209972620010376, 'learning_rate': 0.00019765308553728854, 'epoch': 0.01}


  1%|          | 209/16798 [00:46<1:00:53,  4.54it/s]

{'loss': 1.7102, 'grad_norm': 2.0537941455841064, 'learning_rate': 0.0001976411722659042, 'epoch': 0.01}


  1%|          | 209/16798 [00:46<1:00:53,  4.54it/s]

{'loss': 1.7851, 'grad_norm': 1.9925668239593506, 'learning_rate': 0.0001976292589945199, 'epoch': 0.01}


  1%|▏         | 210/16798 [00:46<1:01:46,  4.48it/s]

{'loss': 2.1173, 'grad_norm': 2.6618049144744873, 'learning_rate': 0.00019761734572313556, 'epoch': 0.01}


  1%|▏         | 211/16798 [00:46<1:02:13,  4.44it/s]

{'loss': 1.9866, 'grad_norm': 2.1765873432159424, 'learning_rate': 0.00019760543245175128, 'epoch': 0.01}


  1%|▏         | 212/16798 [00:47<1:01:36,  4.49it/s]

{'loss': 1.5583, 'grad_norm': 1.9965304136276245, 'learning_rate': 0.00019759351918036694, 'epoch': 0.01}


  1%|▏         | 214/16798 [00:47<1:00:12,  4.59it/s]

{'loss': 1.7436, 'grad_norm': 2.0396435260772705, 'learning_rate': 0.00019758160590898263, 'epoch': 0.01}


  1%|▏         | 215/16798 [00:47<59:36,  4.64it/s]  

{'loss': 1.6071, 'grad_norm': 2.0230987071990967, 'learning_rate': 0.0001975696926375983, 'epoch': 0.01}


  1%|▏         | 215/16798 [00:47<59:36,  4.64it/s]

{'loss': 1.3277, 'grad_norm': 1.9052561521530151, 'learning_rate': 0.000197557779366214, 'epoch': 0.01}


  1%|▏         | 217/16798 [00:48<58:27,  4.73it/s]

{'loss': 1.5723, 'grad_norm': 1.867274284362793, 'learning_rate': 0.00019754586609482965, 'epoch': 0.01}


  1%|▏         | 218/16798 [00:48<57:47,  4.78it/s]

{'loss': 1.5329, 'grad_norm': 2.0550572872161865, 'learning_rate': 0.00019753395282344534, 'epoch': 0.01}


  1%|▏         | 218/16798 [00:48<57:47,  4.78it/s]

{'loss': 1.667, 'grad_norm': 2.2016007900238037, 'learning_rate': 0.000197522039552061, 'epoch': 0.01}


  1%|▏         | 219/16798 [00:48<57:01,  4.84it/s]

{'loss': 1.6739, 'grad_norm': 1.9884381294250488, 'learning_rate': 0.0001975101262806767, 'epoch': 0.01}


  1%|▏         | 220/16798 [00:48<59:08,  4.67it/s]

{'loss': 1.6688, 'grad_norm': 2.054093837738037, 'learning_rate': 0.00019749821300929236, 'epoch': 0.01}


  1%|▏         | 221/16798 [00:49<59:57,  4.61it/s]

{'loss': 1.6554, 'grad_norm': 2.0312888622283936, 'learning_rate': 0.00019748629973790805, 'epoch': 0.01}


  1%|▏         | 222/16798 [00:49<1:00:16,  4.58it/s]

{'loss': 1.4674, 'grad_norm': 1.7785509824752808, 'learning_rate': 0.00019747438646652371, 'epoch': 0.01}


  1%|▏         | 223/16798 [00:49<1:00:29,  4.57it/s]

{'loss': 1.9997, 'grad_norm': 2.376373767852783, 'learning_rate': 0.0001974624731951394, 'epoch': 0.01}


  1%|▏         | 224/16798 [00:49<1:00:17,  4.58it/s]

{'loss': 1.3002, 'grad_norm': 1.7020272016525269, 'learning_rate': 0.00019745055992375507, 'epoch': 0.01}


  1%|▏         | 225/16798 [00:50<1:01:21,  4.50it/s]

{'loss': 1.6039, 'grad_norm': 2.208775520324707, 'learning_rate': 0.00019743864665237076, 'epoch': 0.01}


  1%|▏         | 226/16798 [00:50<1:00:55,  4.53it/s]

{'loss': 2.1595, 'grad_norm': 2.3835926055908203, 'learning_rate': 0.00019742673338098642, 'epoch': 0.01}


  1%|▏         | 228/16798 [00:50<59:25,  4.65it/s]  

{'loss': 1.4721, 'grad_norm': 2.2630388736724854, 'learning_rate': 0.0001974148201096021, 'epoch': 0.01}


  1%|▏         | 228/16798 [00:50<59:25,  4.65it/s]

{'loss': 1.4875, 'grad_norm': 2.201864242553711, 'learning_rate': 0.00019740290683821778, 'epoch': 0.01}


  1%|▏         | 229/16798 [00:50<59:53,  4.61it/s]

{'loss': 1.2913, 'grad_norm': 2.6516880989074707, 'learning_rate': 0.00019739099356683347, 'epoch': 0.01}


  1%|▏         | 230/16798 [00:51<1:00:05,  4.60it/s]

{'loss': 1.752, 'grad_norm': 2.224027395248413, 'learning_rate': 0.00019737908029544913, 'epoch': 0.01}


  1%|▏         | 231/16798 [00:51<1:00:55,  4.53it/s]

{'loss': 1.7153, 'grad_norm': 2.3002166748046875, 'learning_rate': 0.00019736716702406482, 'epoch': 0.01}


  1%|▏         | 232/16798 [00:51<1:00:32,  4.56it/s]

{'loss': 1.9527, 'grad_norm': 4.799469470977783, 'learning_rate': 0.00019735525375268049, 'epoch': 0.01}


  1%|▏         | 233/16798 [00:51<1:00:27,  4.57it/s]

{'loss': 1.8057, 'grad_norm': 2.1627395153045654, 'learning_rate': 0.00019734334048129618, 'epoch': 0.01}


  1%|▏         | 234/16798 [00:51<1:00:40,  4.55it/s]

{'loss': 1.4964, 'grad_norm': 2.0076022148132324, 'learning_rate': 0.00019733142720991184, 'epoch': 0.01}


  1%|▏         | 235/16798 [00:52<1:00:12,  4.58it/s]

{'loss': 1.6104, 'grad_norm': 2.0474343299865723, 'learning_rate': 0.00019731951393852753, 'epoch': 0.01}


  1%|▏         | 237/16798 [00:52<59:31,  4.64it/s]  

{'loss': 1.8749, 'grad_norm': 2.3191487789154053, 'learning_rate': 0.0001973076006671432, 'epoch': 0.01}


  1%|▏         | 237/16798 [00:52<59:31,  4.64it/s]

{'loss': 1.8098, 'grad_norm': 2.2568588256835938, 'learning_rate': 0.00019729568739575889, 'epoch': 0.01}


  1%|▏         | 239/16798 [00:53<58:12,  4.74it/s]

{'loss': 1.5792, 'grad_norm': 2.261024236679077, 'learning_rate': 0.00019728377412437455, 'epoch': 0.01}


  1%|▏         | 239/16798 [00:53<58:12,  4.74it/s]

{'loss': 1.5829, 'grad_norm': 2.245866537094116, 'learning_rate': 0.00019727186085299024, 'epoch': 0.01}


  1%|▏         | 240/16798 [00:53<58:29,  4.72it/s]

{'loss': 1.7417, 'grad_norm': 2.327711343765259, 'learning_rate': 0.0001972599475816059, 'epoch': 0.01}


  1%|▏         | 241/16798 [00:53<58:38,  4.71it/s]

{'loss': 1.3685, 'grad_norm': 1.8610303401947021, 'learning_rate': 0.0001972480343102216, 'epoch': 0.01}


  1%|▏         | 242/16798 [00:53<59:35,  4.63it/s]

{'loss': 1.0498, 'grad_norm': 1.9057050943374634, 'learning_rate': 0.00019723612103883729, 'epoch': 0.01}


  1%|▏         | 243/16798 [00:53<59:55,  4.60it/s]

{'loss': 1.2931, 'grad_norm': 1.7988786697387695, 'learning_rate': 0.00019722420776745295, 'epoch': 0.01}


  1%|▏         | 244/16798 [00:54<1:00:57,  4.53it/s]

{'loss': 1.2403, 'grad_norm': 2.15163254737854, 'learning_rate': 0.00019721229449606864, 'epoch': 0.01}


  1%|▏         | 246/16798 [00:54<59:55,  4.60it/s]  

{'loss': 1.2052, 'grad_norm': 1.8202126026153564, 'learning_rate': 0.0001972003812246843, 'epoch': 0.01}


  1%|▏         | 247/16798 [00:54<57:57,  4.76it/s]

{'loss': 1.3736, 'grad_norm': 2.4169023036956787, 'learning_rate': 0.0001971884679533, 'epoch': 0.01}


  1%|▏         | 247/16798 [00:54<57:57,  4.76it/s]

{'loss': 1.0763, 'grad_norm': 2.077352285385132, 'learning_rate': 0.00019717655468191566, 'epoch': 0.01}


  1%|▏         | 248/16798 [00:54<59:30,  4.64it/s]

{'loss': 1.1433, 'grad_norm': 2.1834876537323, 'learning_rate': 0.00019716464141053135, 'epoch': 0.01}


  1%|▏         | 249/16798 [00:55<1:01:19,  4.50it/s]

{'loss': 0.4021, 'grad_norm': 1.3587993383407593, 'learning_rate': 0.000197152728139147, 'epoch': 0.01}


  1%|▏         | 250/16798 [00:55<1:03:22,  4.35it/s]

{'loss': 0.951, 'grad_norm': 1.9515643119812012, 'learning_rate': 0.0001971408148677627, 'epoch': 0.01}


  1%|▏         | 251/16798 [00:55<1:07:35,  4.08it/s]

{'loss': 2.0051, 'grad_norm': 1.9619868993759155, 'learning_rate': 0.00019712890159637837, 'epoch': 0.01}


  2%|▏         | 252/16798 [00:56<1:19:07,  3.49it/s]

{'loss': 2.416, 'grad_norm': 2.379528522491455, 'learning_rate': 0.00019711698832499406, 'epoch': 0.02}


  2%|▏         | 253/16798 [00:56<1:19:35,  3.46it/s]

{'loss': 2.4047, 'grad_norm': 2.2018301486968994, 'learning_rate': 0.00019710507505360972, 'epoch': 0.02}


  2%|▏         | 254/16798 [00:56<1:16:03,  3.63it/s]

{'loss': 2.2409, 'grad_norm': 2.4836812019348145, 'learning_rate': 0.0001970931617822254, 'epoch': 0.02}


  2%|▏         | 255/16798 [00:56<1:16:20,  3.61it/s]

{'loss': 2.2257, 'grad_norm': 2.138796806335449, 'learning_rate': 0.00019708124851084108, 'epoch': 0.02}


  2%|▏         | 256/16798 [00:57<1:14:11,  3.72it/s]

{'loss': 2.1903, 'grad_norm': 2.200956344604492, 'learning_rate': 0.00019706933523945677, 'epoch': 0.02}


  2%|▏         | 257/16798 [00:57<1:11:50,  3.84it/s]

{'loss': 2.415, 'grad_norm': 2.4542815685272217, 'learning_rate': 0.00019705742196807243, 'epoch': 0.02}


  2%|▏         | 258/16798 [00:57<1:10:01,  3.94it/s]

{'loss': 2.2537, 'grad_norm': 2.1763806343078613, 'learning_rate': 0.00019704550869668812, 'epoch': 0.02}


  2%|▏         | 259/16798 [00:57<1:09:30,  3.97it/s]

{'loss': 2.378, 'grad_norm': 2.1888585090637207, 'learning_rate': 0.00019703359542530378, 'epoch': 0.02}


  2%|▏         | 261/16798 [00:58<1:06:29,  4.15it/s]

{'loss': 2.0755, 'grad_norm': 1.965177297592163, 'learning_rate': 0.00019702168215391947, 'epoch': 0.02}


  2%|▏         | 261/16798 [00:58<1:06:29,  4.15it/s]

{'loss': 1.7837, 'grad_norm': 1.8720804452896118, 'learning_rate': 0.00019700976888253514, 'epoch': 0.02}


  2%|▏         | 262/16798 [00:58<1:05:02,  4.24it/s]

{'loss': 1.5594, 'grad_norm': 1.8765202760696411, 'learning_rate': 0.00019699785561115083, 'epoch': 0.02}


  2%|▏         | 263/16798 [00:58<1:12:06,  3.82it/s]

{'loss': 1.729, 'grad_norm': 1.8997656106948853, 'learning_rate': 0.0001969859423397665, 'epoch': 0.02}


  2%|▏         | 264/16798 [00:59<1:12:15,  3.81it/s]

{'loss': 1.7651, 'grad_norm': 2.1443049907684326, 'learning_rate': 0.00019697402906838218, 'epoch': 0.02}


  2%|▏         | 265/16798 [00:59<1:08:58,  4.00it/s]

{'loss': 1.8296, 'grad_norm': 2.1163198947906494, 'learning_rate': 0.00019696211579699785, 'epoch': 0.02}


  2%|▏         | 266/16798 [00:59<1:07:11,  4.10it/s]

{'loss': 1.7773, 'grad_norm': 1.9756485223770142, 'learning_rate': 0.00019695020252561354, 'epoch': 0.02}


  2%|▏         | 267/16798 [00:59<1:06:46,  4.13it/s]

{'loss': 1.2862, 'grad_norm': 1.7551133632659912, 'learning_rate': 0.0001969382892542292, 'epoch': 0.02}


  2%|▏         | 268/16798 [01:00<1:06:24,  4.15it/s]

{'loss': 1.6815, 'grad_norm': 1.7944221496582031, 'learning_rate': 0.0001969263759828449, 'epoch': 0.02}


  2%|▏         | 269/16798 [01:00<1:06:46,  4.13it/s]

{'loss': 1.7629, 'grad_norm': 1.9090925455093384, 'learning_rate': 0.00019691446271146056, 'epoch': 0.02}


  2%|▏         | 270/16798 [01:00<1:05:59,  4.17it/s]

{'loss': 1.6676, 'grad_norm': 2.0765159130096436, 'learning_rate': 0.00019690254944007625, 'epoch': 0.02}


  2%|▏         | 271/16798 [01:00<1:05:44,  4.19it/s]

{'loss': 1.5883, 'grad_norm': 1.9542120695114136, 'learning_rate': 0.0001968906361686919, 'epoch': 0.02}


  2%|▏         | 272/16798 [01:01<1:06:57,  4.11it/s]

{'loss': 2.1191, 'grad_norm': 2.152585029602051, 'learning_rate': 0.0001968787228973076, 'epoch': 0.02}


  2%|▏         | 273/16798 [01:01<1:08:24,  4.03it/s]

{'loss': 1.5738, 'grad_norm': 1.7911635637283325, 'learning_rate': 0.0001968668096259233, 'epoch': 0.02}


  2%|▏         | 274/16798 [01:01<1:17:09,  3.57it/s]

{'loss': 1.7361, 'grad_norm': 1.9939531087875366, 'learning_rate': 0.00019685489635453898, 'epoch': 0.02}


  2%|▏         | 275/16798 [01:01<1:12:42,  3.79it/s]

{'loss': 1.682, 'grad_norm': 2.483640432357788, 'learning_rate': 0.00019684298308315465, 'epoch': 0.02}


  2%|▏         | 276/16798 [01:02<1:08:47,  4.00it/s]

{'loss': 1.4873, 'grad_norm': 2.124577760696411, 'learning_rate': 0.00019683106981177034, 'epoch': 0.02}


  2%|▏         | 277/16798 [01:02<1:06:15,  4.16it/s]

{'loss': 1.9259, 'grad_norm': 2.0771682262420654, 'learning_rate': 0.000196819156540386, 'epoch': 0.02}


  2%|▏         | 278/16798 [01:02<1:03:59,  4.30it/s]

{'loss': 1.4832, 'grad_norm': 1.7991842031478882, 'learning_rate': 0.0001968072432690017, 'epoch': 0.02}


  2%|▏         | 279/16798 [01:02<1:02:47,  4.39it/s]

{'loss': 1.646, 'grad_norm': 1.922683835029602, 'learning_rate': 0.00019679532999761736, 'epoch': 0.02}


  2%|▏         | 280/16798 [01:03<1:02:23,  4.41it/s]

{'loss': 1.622, 'grad_norm': 2.097231388092041, 'learning_rate': 0.00019678341672623305, 'epoch': 0.02}


  2%|▏         | 281/16798 [01:03<1:03:44,  4.32it/s]

{'loss': 1.1533, 'grad_norm': 1.6639786958694458, 'learning_rate': 0.0001967715034548487, 'epoch': 0.02}


  2%|▏         | 282/16798 [01:03<1:03:36,  4.33it/s]

{'loss': 1.0589, 'grad_norm': 1.9741259813308716, 'learning_rate': 0.0001967595901834644, 'epoch': 0.02}


  2%|▏         | 283/16798 [01:03<1:04:46,  4.25it/s]

{'loss': 1.465, 'grad_norm': 1.8762718439102173, 'learning_rate': 0.00019674767691208006, 'epoch': 0.02}


  2%|▏         | 284/16798 [01:03<1:05:07,  4.23it/s]

{'loss': 1.7553, 'grad_norm': 1.9125159978866577, 'learning_rate': 0.00019673576364069576, 'epoch': 0.02}


  2%|▏         | 285/16798 [01:04<1:04:47,  4.25it/s]

{'loss': 1.2548, 'grad_norm': 1.9569467306137085, 'learning_rate': 0.00019672385036931142, 'epoch': 0.02}


  2%|▏         | 286/16798 [01:04<1:03:05,  4.36it/s]

{'loss': 1.2796, 'grad_norm': 1.849221110343933, 'learning_rate': 0.0001967119370979271, 'epoch': 0.02}


  2%|▏         | 287/16798 [01:04<1:04:24,  4.27it/s]

{'loss': 1.3229, 'grad_norm': 1.923566222190857, 'learning_rate': 0.00019670002382654277, 'epoch': 0.02}


  2%|▏         | 288/16798 [01:04<1:03:43,  4.32it/s]

{'loss': 1.2996, 'grad_norm': 3.169421434402466, 'learning_rate': 0.00019668811055515846, 'epoch': 0.02}


  2%|▏         | 289/16798 [01:05<1:07:41,  4.06it/s]

{'loss': 1.6851, 'grad_norm': 2.0132029056549072, 'learning_rate': 0.00019667619728377413, 'epoch': 0.02}


  2%|▏         | 290/16798 [01:05<1:07:45,  4.06it/s]

{'loss': 0.8931, 'grad_norm': 1.7395206689834595, 'learning_rate': 0.00019666428401238982, 'epoch': 0.02}


  2%|▏         | 291/16798 [01:05<1:09:18,  3.97it/s]

{'loss': 1.5128, 'grad_norm': 2.2040810585021973, 'learning_rate': 0.00019665237074100548, 'epoch': 0.02}


  2%|▏         | 292/16798 [01:05<1:07:54,  4.05it/s]

{'loss': 1.3604, 'grad_norm': 1.9485152959823608, 'learning_rate': 0.00019664045746962117, 'epoch': 0.02}


  2%|▏         | 293/16798 [01:06<1:06:51,  4.11it/s]

{'loss': 1.3525, 'grad_norm': 2.358582019805908, 'learning_rate': 0.00019662854419823684, 'epoch': 0.02}


  2%|▏         | 294/16798 [01:06<1:05:25,  4.20it/s]

{'loss': 1.5771, 'grad_norm': 2.8431215286254883, 'learning_rate': 0.00019661663092685253, 'epoch': 0.02}


  2%|▏         | 295/16798 [01:06<1:04:04,  4.29it/s]

{'loss': 1.3115, 'grad_norm': 2.3401873111724854, 'learning_rate': 0.0001966047176554682, 'epoch': 0.02}


  2%|▏         | 297/16798 [01:07<1:01:14,  4.49it/s]

{'loss': 1.1807, 'grad_norm': 1.9833452701568604, 'learning_rate': 0.00019659280438408388, 'epoch': 0.02}


  2%|▏         | 297/16798 [01:07<1:01:14,  4.49it/s]

{'loss': 1.1946, 'grad_norm': 1.8205004930496216, 'learning_rate': 0.00019658089111269955, 'epoch': 0.02}


  2%|▏         | 298/16798 [01:07<1:01:48,  4.45it/s]

{'loss': 1.0069, 'grad_norm': 2.8054206371307373, 'learning_rate': 0.00019656897784131524, 'epoch': 0.02}


  2%|▏         | 299/16798 [01:07<1:01:06,  4.50it/s]

{'loss': 1.4594, 'grad_norm': 2.4612221717834473, 'learning_rate': 0.0001965570645699309, 'epoch': 0.02}


  2%|▏         | 300/16798 [01:07<1:02:37,  4.39it/s]

{'loss': 0.6376, 'grad_norm': 1.5667922496795654, 'learning_rate': 0.0001965451512985466, 'epoch': 0.02}


  2%|▏         | 301/16798 [01:08<1:05:57,  4.17it/s]

{'loss': 2.0443, 'grad_norm': 1.9498023986816406, 'learning_rate': 0.00019653323802716225, 'epoch': 0.02}


  2%|▏         | 302/16798 [01:08<1:10:13,  3.92it/s]

{'loss': 2.1028, 'grad_norm': 1.9106261730194092, 'learning_rate': 0.00019652132475577794, 'epoch': 0.02}


  2%|▏         | 303/16798 [01:08<1:11:29,  3.85it/s]

{'loss': 2.0677, 'grad_norm': 2.046537160873413, 'learning_rate': 0.0001965094114843936, 'epoch': 0.02}


  2%|▏         | 304/16798 [01:08<1:14:52,  3.67it/s]

{'loss': 2.7807, 'grad_norm': 2.2435238361358643, 'learning_rate': 0.0001964974982130093, 'epoch': 0.02}


  2%|▏         | 305/16798 [01:09<1:16:01,  3.62it/s]

{'loss': 2.0146, 'grad_norm': 2.1008853912353516, 'learning_rate': 0.000196485584941625, 'epoch': 0.02}


  2%|▏         | 306/16798 [01:09<1:16:50,  3.58it/s]

{'loss': 2.1812, 'grad_norm': 2.0031325817108154, 'learning_rate': 0.00019647367167024065, 'epoch': 0.02}


  2%|▏         | 307/16798 [01:09<1:13:54,  3.72it/s]

{'loss': 2.2633, 'grad_norm': 2.062415361404419, 'learning_rate': 0.00019646175839885634, 'epoch': 0.02}


  2%|▏         | 308/16798 [01:09<1:11:50,  3.83it/s]

{'loss': 2.3079, 'grad_norm': 1.9236615896224976, 'learning_rate': 0.000196449845127472, 'epoch': 0.02}


  2%|▏         | 309/16798 [01:10<1:09:49,  3.94it/s]

{'loss': 2.0169, 'grad_norm': 2.301265239715576, 'learning_rate': 0.0001964379318560877, 'epoch': 0.02}


  2%|▏         | 310/16798 [01:10<1:08:00,  4.04it/s]

{'loss': 2.2907, 'grad_norm': 2.196918487548828, 'learning_rate': 0.00019642601858470336, 'epoch': 0.02}


  2%|▏         | 311/16798 [01:10<1:15:03,  3.66it/s]

{'loss': 1.5479, 'grad_norm': 1.8359678983688354, 'learning_rate': 0.00019641410531331905, 'epoch': 0.02}


  2%|▏         | 312/16798 [01:11<1:14:49,  3.67it/s]

{'loss': 1.5043, 'grad_norm': 1.8483844995498657, 'learning_rate': 0.00019640219204193472, 'epoch': 0.02}


  2%|▏         | 313/16798 [01:11<1:13:44,  3.73it/s]

{'loss': 1.6852, 'grad_norm': 1.9930298328399658, 'learning_rate': 0.0001963902787705504, 'epoch': 0.02}


  2%|▏         | 314/16798 [01:11<1:12:53,  3.77it/s]

{'loss': 1.9381, 'grad_norm': 1.8213274478912354, 'learning_rate': 0.00019637836549916607, 'epoch': 0.02}


  2%|▏         | 315/16798 [01:11<1:09:38,  3.94it/s]

{'loss': 1.6577, 'grad_norm': 1.8549909591674805, 'learning_rate': 0.00019636645222778176, 'epoch': 0.02}


  2%|▏         | 316/16798 [01:12<1:12:59,  3.76it/s]

{'loss': 1.4813, 'grad_norm': 1.7188847064971924, 'learning_rate': 0.00019635453895639743, 'epoch': 0.02}


  2%|▏         | 317/16798 [01:12<1:11:41,  3.83it/s]

{'loss': 1.7676, 'grad_norm': 2.147650718688965, 'learning_rate': 0.00019634262568501312, 'epoch': 0.02}


  2%|▏         | 318/16798 [01:12<1:12:05,  3.81it/s]

{'loss': 1.7215, 'grad_norm': 1.9424092769622803, 'learning_rate': 0.00019633071241362878, 'epoch': 0.02}


  2%|▏         | 319/16798 [01:12<1:09:29,  3.95it/s]

{'loss': 1.492, 'grad_norm': 1.9470691680908203, 'learning_rate': 0.00019631879914224447, 'epoch': 0.02}


  2%|▏         | 320/16798 [01:13<1:07:46,  4.05it/s]

{'loss': 1.3669, 'grad_norm': 1.727635145187378, 'learning_rate': 0.00019630688587086013, 'epoch': 0.02}


  2%|▏         | 321/16798 [01:13<1:07:01,  4.10it/s]

{'loss': 1.8127, 'grad_norm': 2.103749990463257, 'learning_rate': 0.00019629497259947583, 'epoch': 0.02}


  2%|▏         | 322/16798 [01:13<1:06:38,  4.12it/s]

{'loss': 1.4879, 'grad_norm': 2.032835006713867, 'learning_rate': 0.0001962830593280915, 'epoch': 0.02}


  2%|▏         | 323/16798 [01:13<1:07:25,  4.07it/s]

{'loss': 1.7311, 'grad_norm': 2.098585367202759, 'learning_rate': 0.00019627114605670718, 'epoch': 0.02}


  2%|▏         | 324/16798 [01:13<1:05:42,  4.18it/s]

{'loss': 1.6552, 'grad_norm': 1.864400029182434, 'learning_rate': 0.00019625923278532284, 'epoch': 0.02}


  2%|▏         | 325/16798 [01:14<1:09:35,  3.95it/s]

{'loss': 1.7155, 'grad_norm': 2.262047529220581, 'learning_rate': 0.00019624731951393853, 'epoch': 0.02}


  2%|▏         | 326/16798 [01:14<1:07:07,  4.09it/s]

{'loss': 1.6151, 'grad_norm': 1.9935674667358398, 'learning_rate': 0.0001962354062425542, 'epoch': 0.02}


  2%|▏         | 327/16798 [01:14<1:12:53,  3.77it/s]

{'loss': 1.6979, 'grad_norm': 2.1889760494232178, 'learning_rate': 0.0001962234929711699, 'epoch': 0.02}


  2%|▏         | 328/16798 [01:15<1:15:32,  3.63it/s]

{'loss': 1.7715, 'grad_norm': 2.1491270065307617, 'learning_rate': 0.00019621157969978555, 'epoch': 0.02}


  2%|▏         | 329/16798 [01:15<1:18:45,  3.49it/s]

{'loss': 1.3468, 'grad_norm': 1.9933687448501587, 'learning_rate': 0.00019619966642840124, 'epoch': 0.02}


  2%|▏         | 330/16798 [01:15<1:16:07,  3.61it/s]

{'loss': 1.9109, 'grad_norm': 2.284022331237793, 'learning_rate': 0.0001961877531570169, 'epoch': 0.02}


  2%|▏         | 331/16798 [01:15<1:17:28,  3.54it/s]

{'loss': 1.5249, 'grad_norm': 1.835411787033081, 'learning_rate': 0.0001961758398856326, 'epoch': 0.02}


  2%|▏         | 332/16798 [01:16<1:13:15,  3.75it/s]

{'loss': 1.6344, 'grad_norm': 2.8117427825927734, 'learning_rate': 0.00019616392661424826, 'epoch': 0.02}


  2%|▏         | 333/16798 [01:16<1:10:58,  3.87it/s]

{'loss': 1.5291, 'grad_norm': 2.0891306400299072, 'learning_rate': 0.00019615201334286395, 'epoch': 0.02}


  2%|▏         | 334/16798 [01:16<1:10:51,  3.87it/s]

{'loss': 1.487, 'grad_norm': 2.2317512035369873, 'learning_rate': 0.00019614010007147962, 'epoch': 0.02}


  2%|▏         | 335/16798 [01:16<1:07:27,  4.07it/s]

{'loss': 1.4896, 'grad_norm': 1.77900230884552, 'learning_rate': 0.00019612818680009533, 'epoch': 0.02}


  2%|▏         | 336/16798 [01:17<1:08:01,  4.03it/s]

{'loss': 1.1677, 'grad_norm': 1.866194725036621, 'learning_rate': 0.000196116273528711, 'epoch': 0.02}


  2%|▏         | 337/16798 [01:17<1:06:58,  4.10it/s]

{'loss': 1.3409, 'grad_norm': 2.074066400527954, 'learning_rate': 0.0001961043602573267, 'epoch': 0.02}


  2%|▏         | 338/16798 [01:17<1:09:34,  3.94it/s]

{'loss': 1.3884, 'grad_norm': 2.1593739986419678, 'learning_rate': 0.00019609244698594235, 'epoch': 0.02}


  2%|▏         | 339/16798 [01:17<1:13:34,  3.73it/s]

{'loss': 1.4853, 'grad_norm': 2.0379538536071777, 'learning_rate': 0.00019608053371455804, 'epoch': 0.02}


  2%|▏         | 340/16798 [01:18<1:16:02,  3.61it/s]

{'loss': 1.0618, 'grad_norm': 1.622749924659729, 'learning_rate': 0.0001960686204431737, 'epoch': 0.02}


  2%|▏         | 341/16798 [01:18<1:13:22,  3.74it/s]

{'loss': 1.2181, 'grad_norm': 1.8286019563674927, 'learning_rate': 0.0001960567071717894, 'epoch': 0.02}


  2%|▏         | 342/16798 [01:18<1:14:22,  3.69it/s]

{'loss': 1.0533, 'grad_norm': 1.7527836561203003, 'learning_rate': 0.00019604479390040506, 'epoch': 0.02}


  2%|▏         | 343/16798 [01:19<1:14:46,  3.67it/s]

{'loss': 0.8144, 'grad_norm': 1.4755982160568237, 'learning_rate': 0.00019603288062902075, 'epoch': 0.02}


  2%|▏         | 344/16798 [01:19<1:12:14,  3.80it/s]

{'loss': 0.9376, 'grad_norm': 1.9681576490402222, 'learning_rate': 0.00019602096735763641, 'epoch': 0.02}


  2%|▏         | 345/16798 [01:19<1:10:55,  3.87it/s]

{'loss': 0.9204, 'grad_norm': 1.8118231296539307, 'learning_rate': 0.0001960090540862521, 'epoch': 0.02}


  2%|▏         | 346/16798 [01:19<1:12:01,  3.81it/s]

{'loss': 0.87, 'grad_norm': 1.8004264831542969, 'learning_rate': 0.00019599714081486777, 'epoch': 0.02}


  2%|▏         | 348/16798 [01:20<1:09:30,  3.94it/s]

{'loss': 1.1067, 'grad_norm': 2.0076382160186768, 'learning_rate': 0.00019598522754348346, 'epoch': 0.02}


  2%|▏         | 348/16798 [01:20<1:09:30,  3.94it/s]

{'loss': 0.4175, 'grad_norm': 1.3207827806472778, 'learning_rate': 0.00019597331427209912, 'epoch': 0.02}


  2%|▏         | 350/16798 [01:20<1:03:25,  4.32it/s]

{'loss': 0.2934, 'grad_norm': 1.2875959873199463, 'learning_rate': 0.00019596140100071481, 'epoch': 0.02}


  2%|▏         | 350/16798 [01:20<1:03:25,  4.32it/s]

{'loss': 0.4939, 'grad_norm': 58.730262756347656, 'learning_rate': 0.00019594948772933048, 'epoch': 0.02}


  2%|▏         | 351/16798 [01:21<1:07:59,  4.03it/s]

{'loss': 2.0094, 'grad_norm': 1.7960150241851807, 'learning_rate': 0.00019593757445794617, 'epoch': 0.02}


  2%|▏         | 352/16798 [01:21<1:12:27,  3.78it/s]

{'loss': 2.4395, 'grad_norm': 2.270406723022461, 'learning_rate': 0.00019592566118656183, 'epoch': 0.02}


  2%|▏         | 353/16798 [01:21<1:10:46,  3.87it/s]

{'loss': 1.8364, 'grad_norm': 2.135462999343872, 'learning_rate': 0.00019591374791517752, 'epoch': 0.02}


  2%|▏         | 354/16798 [01:21<1:07:57,  4.03it/s]

{'loss': 2.1887, 'grad_norm': 2.1596968173980713, 'learning_rate': 0.0001959018346437932, 'epoch': 0.02}


  2%|▏         | 356/16798 [01:22<1:03:36,  4.31it/s]

{'loss': 2.1713, 'grad_norm': 2.4562478065490723, 'learning_rate': 0.00019588992137240888, 'epoch': 0.02}


  2%|▏         | 356/16798 [01:22<1:03:36,  4.31it/s]

{'loss': 2.2489, 'grad_norm': 2.291203737258911, 'learning_rate': 0.00019587800810102454, 'epoch': 0.02}


  2%|▏         | 358/16798 [01:22<1:01:31,  4.45it/s]

{'loss': 2.0242, 'grad_norm': 2.2096705436706543, 'learning_rate': 0.00019586609482964023, 'epoch': 0.02}


  2%|▏         | 358/16798 [01:22<1:01:31,  4.45it/s]

{'loss': 1.7869, 'grad_norm': 2.0462961196899414, 'learning_rate': 0.0001958541815582559, 'epoch': 0.02}


  2%|▏         | 360/16798 [01:23<59:12,  4.63it/s]  

{'loss': 2.11, 'grad_norm': 2.0574872493743896, 'learning_rate': 0.0001958422682868716, 'epoch': 0.02}


  2%|▏         | 361/16798 [01:23<58:26,  4.69it/s]

{'loss': 1.7817, 'grad_norm': 2.184309959411621, 'learning_rate': 0.00019583035501548725, 'epoch': 0.02}


  2%|▏         | 361/16798 [01:23<58:26,  4.69it/s]

{'loss': 1.3181, 'grad_norm': 1.9053013324737549, 'learning_rate': 0.00019581844174410294, 'epoch': 0.02}


  2%|▏         | 362/16798 [01:23<1:00:25,  4.53it/s]

{'loss': 1.9418, 'grad_norm': 2.34025502204895, 'learning_rate': 0.0001958065284727186, 'epoch': 0.02}


  2%|▏         | 363/16798 [01:23<1:03:02,  4.35it/s]

{'loss': 1.2077, 'grad_norm': 1.842119812965393, 'learning_rate': 0.0001957946152013343, 'epoch': 0.02}


  2%|▏         | 364/16798 [01:24<1:06:48,  4.10it/s]

{'loss': 1.3321, 'grad_norm': 1.7025331258773804, 'learning_rate': 0.00019578270192994996, 'epoch': 0.02}


  2%|▏         | 365/16798 [01:24<1:09:42,  3.93it/s]

{'loss': 1.6739, 'grad_norm': 1.7770429849624634, 'learning_rate': 0.00019577078865856565, 'epoch': 0.02}


  2%|▏         | 366/16798 [01:24<1:11:17,  3.84it/s]

{'loss': 2.1479, 'grad_norm': 2.4355266094207764, 'learning_rate': 0.00019575887538718134, 'epoch': 0.02}


  2%|▏         | 367/16798 [01:24<1:09:06,  3.96it/s]

{'loss': 1.7902, 'grad_norm': 2.1801865100860596, 'learning_rate': 0.000195746962115797, 'epoch': 0.02}


  2%|▏         | 369/16798 [01:25<1:05:18,  4.19it/s]

{'loss': 1.4951, 'grad_norm': 1.9772958755493164, 'learning_rate': 0.0001957350488444127, 'epoch': 0.02}


  2%|▏         | 369/16798 [01:25<1:05:18,  4.19it/s]

{'loss': 1.6755, 'grad_norm': 1.93393075466156, 'learning_rate': 0.00019572313557302836, 'epoch': 0.02}


  2%|▏         | 370/16798 [01:25<1:04:10,  4.27it/s]

{'loss': 1.4889, 'grad_norm': 1.8790379762649536, 'learning_rate': 0.00019571122230164405, 'epoch': 0.02}


  2%|▏         | 371/16798 [01:25<1:05:51,  4.16it/s]

{'loss': 1.8397, 'grad_norm': 1.967643141746521, 'learning_rate': 0.0001956993090302597, 'epoch': 0.02}


  2%|▏         | 372/16798 [01:26<1:09:29,  3.94it/s]

{'loss': 2.2474, 'grad_norm': 2.8106484413146973, 'learning_rate': 0.0001956873957588754, 'epoch': 0.02}


  2%|▏         | 373/16798 [01:26<1:11:59,  3.80it/s]

{'loss': 1.7951, 'grad_norm': 2.0968785285949707, 'learning_rate': 0.00019567548248749107, 'epoch': 0.02}


  2%|▏         | 374/16798 [01:26<1:10:42,  3.87it/s]

{'loss': 1.6291, 'grad_norm': 1.9483661651611328, 'learning_rate': 0.00019566356921610676, 'epoch': 0.02}


  2%|▏         | 375/16798 [01:26<1:07:58,  4.03it/s]

{'loss': 1.5801, 'grad_norm': 1.889424204826355, 'learning_rate': 0.00019565165594472242, 'epoch': 0.02}


  2%|▏         | 376/16798 [01:27<1:12:34,  3.77it/s]

{'loss': 2.0788, 'grad_norm': 2.3638436794281006, 'learning_rate': 0.0001956397426733381, 'epoch': 0.02}


  2%|▏         | 377/16798 [01:27<1:11:03,  3.85it/s]

{'loss': 2.0366, 'grad_norm': 2.620436906814575, 'learning_rate': 0.00019562782940195378, 'epoch': 0.02}


  2%|▏         | 378/16798 [01:27<1:11:31,  3.83it/s]

{'loss': 1.664, 'grad_norm': 2.0769095420837402, 'learning_rate': 0.00019561591613056947, 'epoch': 0.02}


  2%|▏         | 379/16798 [01:27<1:11:18,  3.84it/s]

{'loss': 1.0354, 'grad_norm': 1.5249402523040771, 'learning_rate': 0.00019560400285918513, 'epoch': 0.02}


  2%|▏         | 380/16798 [01:28<1:09:43,  3.92it/s]

{'loss': 1.2051, 'grad_norm': 1.809664011001587, 'learning_rate': 0.00019559208958780082, 'epoch': 0.02}


  2%|▏         | 381/16798 [01:28<1:09:17,  3.95it/s]

{'loss': 1.3689, 'grad_norm': 1.8368676900863647, 'learning_rate': 0.00019558017631641649, 'epoch': 0.02}


  2%|▏         | 382/16798 [01:28<1:09:06,  3.96it/s]

{'loss': 1.6171, 'grad_norm': 2.023812770843506, 'learning_rate': 0.00019556826304503218, 'epoch': 0.02}


  2%|▏         | 383/16798 [01:28<1:09:41,  3.93it/s]

{'loss': 1.6484, 'grad_norm': 2.210949182510376, 'learning_rate': 0.00019555634977364784, 'epoch': 0.02}


  2%|▏         | 384/16798 [01:29<1:08:28,  4.00it/s]

{'loss': 1.451, 'grad_norm': 2.1276609897613525, 'learning_rate': 0.00019554443650226353, 'epoch': 0.02}


  2%|▏         | 385/16798 [01:29<1:08:01,  4.02it/s]

{'loss': 1.4205, 'grad_norm': 1.8627465963363647, 'learning_rate': 0.0001955325232308792, 'epoch': 0.02}


  2%|▏         | 386/16798 [01:29<1:07:27,  4.05it/s]

{'loss': 1.7305, 'grad_norm': 2.5618908405303955, 'learning_rate': 0.00019552060995949488, 'epoch': 0.02}


  2%|▏         | 387/16798 [01:29<1:07:45,  4.04it/s]

{'loss': 1.5257, 'grad_norm': 2.369739532470703, 'learning_rate': 0.00019550869668811055, 'epoch': 0.02}


  2%|▏         | 388/16798 [01:30<1:11:06,  3.85it/s]

{'loss': 1.0998, 'grad_norm': 1.853822112083435, 'learning_rate': 0.00019549678341672624, 'epoch': 0.02}


  2%|▏         | 389/16798 [01:30<1:12:02,  3.80it/s]

{'loss': 1.1891, 'grad_norm': 2.4934043884277344, 'learning_rate': 0.0001954848701453419, 'epoch': 0.02}


  2%|▏         | 390/16798 [01:30<1:11:57,  3.80it/s]

{'loss': 1.0748, 'grad_norm': 1.6352349519729614, 'learning_rate': 0.0001954729568739576, 'epoch': 0.02}


  2%|▏         | 391/16798 [01:30<1:10:19,  3.89it/s]

{'loss': 1.0025, 'grad_norm': 1.5658552646636963, 'learning_rate': 0.00019546104360257326, 'epoch': 0.02}


  2%|▏         | 392/16798 [01:31<1:11:07,  3.84it/s]

{'loss': 1.304, 'grad_norm': 1.85373055934906, 'learning_rate': 0.00019544913033118895, 'epoch': 0.02}


  2%|▏         | 393/16798 [01:31<1:08:10,  4.01it/s]

{'loss': 1.0754, 'grad_norm': 1.6274399757385254, 'learning_rate': 0.0001954372170598046, 'epoch': 0.02}


  2%|▏         | 394/16798 [01:31<1:08:51,  3.97it/s]

{'loss': 1.5811, 'grad_norm': 2.488400459289551, 'learning_rate': 0.0001954253037884203, 'epoch': 0.02}


  2%|▏         | 395/16798 [01:31<1:07:43,  4.04it/s]

{'loss': 1.01, 'grad_norm': 1.6976889371871948, 'learning_rate': 0.00019541339051703597, 'epoch': 0.02}


  2%|▏         | 396/16798 [01:32<1:06:40,  4.10it/s]

{'loss': 1.116, 'grad_norm': 1.8457515239715576, 'learning_rate': 0.00019540147724565168, 'epoch': 0.02}


  2%|▏         | 397/16798 [01:32<1:05:13,  4.19it/s]

{'loss': 0.4867, 'grad_norm': 1.1559808254241943, 'learning_rate': 0.00019538956397426735, 'epoch': 0.02}


  2%|▏         | 398/16798 [01:32<1:06:01,  4.14it/s]

{'loss': 0.8481, 'grad_norm': 1.6902529001235962, 'learning_rate': 0.00019537765070288304, 'epoch': 0.02}


  2%|▏         | 399/16798 [01:32<1:06:11,  4.13it/s]

{'loss': 0.2715, 'grad_norm': 0.9242864847183228, 'learning_rate': 0.0001953657374314987, 'epoch': 0.02}


  2%|▏         | 400/16798 [01:33<1:04:50,  4.22it/s]

{'loss': 0.3124, 'grad_norm': 1.1025429964065552, 'learning_rate': 0.0001953538241601144, 'epoch': 0.02}


  2%|▏         | 401/16798 [01:33<1:10:07,  3.90it/s]

{'loss': 2.2799, 'grad_norm': 2.0878682136535645, 'learning_rate': 0.00019534191088873006, 'epoch': 0.02}


  2%|▏         | 402/16798 [01:33<1:09:17,  3.94it/s]

{'loss': 2.377, 'grad_norm': 1.9095394611358643, 'learning_rate': 0.00019532999761734575, 'epoch': 0.02}


  2%|▏         | 403/16798 [01:33<1:06:26,  4.11it/s]

{'loss': 2.1381, 'grad_norm': 2.1706979274749756, 'learning_rate': 0.0001953180843459614, 'epoch': 0.02}


  2%|▏         | 404/16798 [01:34<1:04:42,  4.22it/s]

{'loss': 2.3128, 'grad_norm': 2.439021587371826, 'learning_rate': 0.0001953061710745771, 'epoch': 0.02}


  2%|▏         | 405/16798 [01:34<1:04:20,  4.25it/s]

{'loss': 2.6345, 'grad_norm': 2.224755048751831, 'learning_rate': 0.00019529425780319277, 'epoch': 0.02}


  2%|▏         | 406/16798 [01:34<1:03:36,  4.30it/s]

{'loss': 2.5652, 'grad_norm': 3.132610321044922, 'learning_rate': 0.00019528234453180846, 'epoch': 0.02}


  2%|▏         | 407/16798 [01:34<1:05:00,  4.20it/s]

{'loss': 1.9158, 'grad_norm': 2.060866594314575, 'learning_rate': 0.00019527043126042412, 'epoch': 0.02}


  2%|▏         | 408/16798 [01:35<1:05:47,  4.15it/s]

{'loss': 1.5733, 'grad_norm': 1.7876304388046265, 'learning_rate': 0.0001952585179890398, 'epoch': 0.02}


  2%|▏         | 409/16798 [01:35<1:07:02,  4.07it/s]

{'loss': 1.6357, 'grad_norm': 1.81991446018219, 'learning_rate': 0.00019524660471765547, 'epoch': 0.02}


  2%|▏         | 410/16798 [01:35<1:06:41,  4.10it/s]

{'loss': 1.8526, 'grad_norm': 2.34303879737854, 'learning_rate': 0.00019523469144627116, 'epoch': 0.02}


  2%|▏         | 411/16798 [01:35<1:05:23,  4.18it/s]

{'loss': 1.93, 'grad_norm': 2.669883966445923, 'learning_rate': 0.00019522277817488683, 'epoch': 0.02}


  2%|▏         | 412/16798 [01:36<1:04:35,  4.23it/s]

{'loss': 2.1728, 'grad_norm': 2.253340244293213, 'learning_rate': 0.00019521086490350252, 'epoch': 0.02}


  2%|▏         | 413/16798 [01:36<1:02:50,  4.35it/s]

{'loss': 1.9312, 'grad_norm': 2.2432825565338135, 'learning_rate': 0.00019519895163211818, 'epoch': 0.02}


  2%|▏         | 414/16798 [01:36<1:01:26,  4.44it/s]

{'loss': 1.4029, 'grad_norm': 2.16784930229187, 'learning_rate': 0.00019518703836073387, 'epoch': 0.02}


  2%|▏         | 415/16798 [01:36<1:00:49,  4.49it/s]

{'loss': 2.0349, 'grad_norm': 2.160498857498169, 'learning_rate': 0.00019517512508934954, 'epoch': 0.02}


  2%|▏         | 417/16798 [01:37<59:42,  4.57it/s]  

{'loss': 1.5421, 'grad_norm': 1.8004149198532104, 'learning_rate': 0.00019516321181796523, 'epoch': 0.02}


  2%|▏         | 417/16798 [01:37<59:42,  4.57it/s]

{'loss': 1.5231, 'grad_norm': 1.7772921323776245, 'learning_rate': 0.0001951512985465809, 'epoch': 0.02}


  2%|▏         | 418/16798 [01:37<59:53,  4.56it/s]

{'loss': 1.6049, 'grad_norm': 1.8950250148773193, 'learning_rate': 0.00019513938527519658, 'epoch': 0.02}


  2%|▏         | 419/16798 [01:37<1:01:41,  4.43it/s]

{'loss': 1.6411, 'grad_norm': 1.8577271699905396, 'learning_rate': 0.00019512747200381225, 'epoch': 0.02}


  3%|▎         | 420/16798 [01:37<1:00:48,  4.49it/s]

{'loss': 1.6922, 'grad_norm': 2.022001266479492, 'learning_rate': 0.00019511555873242794, 'epoch': 0.03}


  3%|▎         | 421/16798 [01:38<1:01:39,  4.43it/s]

{'loss': 1.5216, 'grad_norm': 1.9603067636489868, 'learning_rate': 0.0001951036454610436, 'epoch': 0.03}


  3%|▎         | 422/16798 [01:38<1:01:23,  4.45it/s]

{'loss': 1.5438, 'grad_norm': 2.0398361682891846, 'learning_rate': 0.0001950917321896593, 'epoch': 0.03}


  3%|▎         | 423/16798 [01:38<1:01:41,  4.42it/s]

{'loss': 1.349, 'grad_norm': 1.8398486375808716, 'learning_rate': 0.00019507981891827496, 'epoch': 0.03}


  3%|▎         | 424/16798 [01:38<1:01:26,  4.44it/s]

{'loss': 1.7264, 'grad_norm': 1.9805619716644287, 'learning_rate': 0.00019506790564689065, 'epoch': 0.03}


  3%|▎         | 425/16798 [01:38<1:00:15,  4.53it/s]

{'loss': 1.2395, 'grad_norm': 1.673750400543213, 'learning_rate': 0.0001950559923755063, 'epoch': 0.03}


  3%|▎         | 427/16798 [01:39<1:01:44,  4.42it/s]

{'loss': 1.6457, 'grad_norm': 2.968613386154175, 'learning_rate': 0.000195044079104122, 'epoch': 0.03}


  3%|▎         | 427/16798 [01:39<1:01:44,  4.42it/s]

{'loss': 1.7404, 'grad_norm': 2.165969133377075, 'learning_rate': 0.0001950321658327377, 'epoch': 0.03}


  3%|▎         | 428/16798 [01:39<1:00:47,  4.49it/s]

{'loss': 1.6188, 'grad_norm': 2.0122344493865967, 'learning_rate': 0.00019502025256135335, 'epoch': 0.03}


  3%|▎         | 429/16798 [01:39<1:00:02,  4.54it/s]

{'loss': 1.7353, 'grad_norm': 2.130331516265869, 'learning_rate': 0.00019500833928996905, 'epoch': 0.03}


  3%|▎         | 430/16798 [01:40<59:49,  4.56it/s]  

{'loss': 1.6591, 'grad_norm': 2.113694429397583, 'learning_rate': 0.0001949964260185847, 'epoch': 0.03}


  3%|▎         | 431/16798 [01:40<59:53,  4.55it/s]

{'loss': 1.7809, 'grad_norm': 2.116286516189575, 'learning_rate': 0.0001949845127472004, 'epoch': 0.03}


  3%|▎         | 432/16798 [01:40<1:00:06,  4.54it/s]

{'loss': 1.8463, 'grad_norm': 2.2574303150177, 'learning_rate': 0.00019497259947581606, 'epoch': 0.03}


  3%|▎         | 433/16798 [01:40<59:33,  4.58it/s]  

{'loss': 1.5421, 'grad_norm': 1.8333114385604858, 'learning_rate': 0.00019496068620443175, 'epoch': 0.03}


  3%|▎         | 434/16798 [01:40<1:00:26,  4.51it/s]

{'loss': 1.4845, 'grad_norm': 2.064150810241699, 'learning_rate': 0.00019494877293304742, 'epoch': 0.03}


  3%|▎         | 435/16798 [01:41<1:00:34,  4.50it/s]

{'loss': 0.9129, 'grad_norm': 1.6478369235992432, 'learning_rate': 0.0001949368596616631, 'epoch': 0.03}


  3%|▎         | 436/16798 [01:41<1:02:31,  4.36it/s]

{'loss': 1.156, 'grad_norm': 1.6052331924438477, 'learning_rate': 0.00019492494639027877, 'epoch': 0.03}


  3%|▎         | 437/16798 [01:41<1:06:18,  4.11it/s]

{'loss': 1.4006, 'grad_norm': 1.9411890506744385, 'learning_rate': 0.00019491303311889446, 'epoch': 0.03}


  3%|▎         | 438/16798 [01:41<1:05:45,  4.15it/s]

{'loss': 1.2902, 'grad_norm': 1.7974826097488403, 'learning_rate': 0.00019490111984751013, 'epoch': 0.03}


  3%|▎         | 439/16798 [01:42<1:10:01,  3.89it/s]

{'loss': 1.2486, 'grad_norm': 1.9324439764022827, 'learning_rate': 0.00019488920657612582, 'epoch': 0.03}


  3%|▎         | 441/16798 [01:42<1:06:53,  4.08it/s]

{'loss': 1.0216, 'grad_norm': 1.6260778903961182, 'learning_rate': 0.00019487729330474148, 'epoch': 0.03}


  3%|▎         | 441/16798 [01:42<1:06:53,  4.08it/s]

{'loss': 1.5137, 'grad_norm': 1.979149341583252, 'learning_rate': 0.00019486538003335717, 'epoch': 0.03}


  3%|▎         | 442/16798 [01:42<1:05:08,  4.18it/s]

{'loss': 1.3488, 'grad_norm': 1.7981865406036377, 'learning_rate': 0.00019485346676197284, 'epoch': 0.03}


  3%|▎         | 443/16798 [01:43<1:02:46,  4.34it/s]

{'loss': 0.9885, 'grad_norm': 2.1000640392303467, 'learning_rate': 0.00019484155349058853, 'epoch': 0.03}


  3%|▎         | 444/16798 [01:43<1:01:37,  4.42it/s]

{'loss': 1.4515, 'grad_norm': 2.300989866256714, 'learning_rate': 0.0001948296402192042, 'epoch': 0.03}


  3%|▎         | 445/16798 [01:43<1:01:43,  4.42it/s]

{'loss': 1.1087, 'grad_norm': 1.7678965330123901, 'learning_rate': 0.00019481772694781988, 'epoch': 0.03}


  3%|▎         | 446/16798 [01:43<1:00:37,  4.50it/s]

{'loss': 1.2791, 'grad_norm': 1.8839819431304932, 'learning_rate': 0.00019480581367643554, 'epoch': 0.03}


  3%|▎         | 447/16798 [01:43<1:00:57,  4.47it/s]

{'loss': 1.0726, 'grad_norm': 1.8233410120010376, 'learning_rate': 0.00019479390040505124, 'epoch': 0.03}


  3%|▎         | 448/16798 [01:44<1:02:54,  4.33it/s]

{'loss': 0.8783, 'grad_norm': 1.4949575662612915, 'learning_rate': 0.0001947819871336669, 'epoch': 0.03}


  3%|▎         | 450/16798 [01:44<1:00:07,  4.53it/s]

{'loss': 0.811, 'grad_norm': 1.527035117149353, 'learning_rate': 0.0001947700738622826, 'epoch': 0.03}


  3%|▎         | 450/16798 [01:44<1:00:07,  4.53it/s]

{'loss': 0.2165, 'grad_norm': 0.8388798832893372, 'learning_rate': 0.00019475816059089825, 'epoch': 0.03}


  3%|▎         | 451/16798 [01:44<1:01:10,  4.45it/s]

{'loss': 2.2614, 'grad_norm': 2.1690480709075928, 'learning_rate': 0.00019474624731951394, 'epoch': 0.03}


  3%|▎         | 452/16798 [01:45<1:01:44,  4.41it/s]

{'loss': 2.0802, 'grad_norm': 2.1112051010131836, 'learning_rate': 0.0001947343340481296, 'epoch': 0.03}


  3%|▎         | 453/16798 [01:45<1:02:42,  4.34it/s]

{'loss': 2.1265, 'grad_norm': 2.1233270168304443, 'learning_rate': 0.0001947224207767453, 'epoch': 0.03}


  3%|▎         | 454/16798 [01:45<1:04:19,  4.24it/s]

{'loss': 1.766, 'grad_norm': 2.0106287002563477, 'learning_rate': 0.00019471050750536096, 'epoch': 0.03}


  3%|▎         | 455/16798 [01:45<1:04:17,  4.24it/s]

{'loss': 2.3475, 'grad_norm': 2.2929368019104004, 'learning_rate': 0.00019469859423397665, 'epoch': 0.03}


  3%|▎         | 456/16798 [01:46<1:04:12,  4.24it/s]

{'loss': 2.1665, 'grad_norm': 2.2313811779022217, 'learning_rate': 0.00019468668096259232, 'epoch': 0.03}


  3%|▎         | 457/16798 [01:46<1:04:11,  4.24it/s]

{'loss': 1.9242, 'grad_norm': 2.8840696811676025, 'learning_rate': 0.000194674767691208, 'epoch': 0.03}


  3%|▎         | 458/16798 [01:46<1:03:27,  4.29it/s]

{'loss': 2.5605, 'grad_norm': 2.485210657119751, 'learning_rate': 0.0001946628544198237, 'epoch': 0.03}


  3%|▎         | 459/16798 [01:46<1:03:58,  4.26it/s]

{'loss': 1.8198, 'grad_norm': 2.0874600410461426, 'learning_rate': 0.0001946509411484394, 'epoch': 0.03}


  3%|▎         | 460/16798 [01:46<1:03:38,  4.28it/s]

{'loss': 1.9788, 'grad_norm': 2.2278525829315186, 'learning_rate': 0.00019463902787705505, 'epoch': 0.03}


  3%|▎         | 461/16798 [01:47<1:01:54,  4.40it/s]

{'loss': 2.3501, 'grad_norm': 2.142026424407959, 'learning_rate': 0.00019462711460567074, 'epoch': 0.03}


  3%|▎         | 462/16798 [01:47<1:00:48,  4.48it/s]

{'loss': 2.1357, 'grad_norm': 2.2621490955352783, 'learning_rate': 0.0001946152013342864, 'epoch': 0.03}


  3%|▎         | 463/16798 [01:47<1:01:25,  4.43it/s]

{'loss': 1.9432, 'grad_norm': 2.161118984222412, 'learning_rate': 0.0001946032880629021, 'epoch': 0.03}


  3%|▎         | 464/16798 [01:47<1:02:06,  4.38it/s]

{'loss': 2.1454, 'grad_norm': 2.089585781097412, 'learning_rate': 0.00019459137479151776, 'epoch': 0.03}


  3%|▎         | 465/16798 [01:48<1:02:38,  4.35it/s]

{'loss': 1.654, 'grad_norm': 1.7559691667556763, 'learning_rate': 0.00019457946152013345, 'epoch': 0.03}


  3%|▎         | 466/16798 [01:48<1:02:22,  4.36it/s]

{'loss': 1.8059, 'grad_norm': 2.4589881896972656, 'learning_rate': 0.00019456754824874912, 'epoch': 0.03}


  3%|▎         | 467/16798 [01:48<1:03:02,  4.32it/s]

{'loss': 1.8895, 'grad_norm': 2.0890419483184814, 'learning_rate': 0.0001945556349773648, 'epoch': 0.03}


  3%|▎         | 468/16798 [01:48<1:02:14,  4.37it/s]

{'loss': 1.9334, 'grad_norm': 2.028951644897461, 'learning_rate': 0.00019454372170598047, 'epoch': 0.03}


  3%|▎         | 469/16798 [01:49<1:02:26,  4.36it/s]

{'loss': 1.4306, 'grad_norm': 1.7041716575622559, 'learning_rate': 0.00019453180843459616, 'epoch': 0.03}


  3%|▎         | 470/16798 [01:49<1:00:54,  4.47it/s]

{'loss': 2.1083, 'grad_norm': 2.1654176712036133, 'learning_rate': 0.00019451989516321182, 'epoch': 0.03}


  3%|▎         | 471/16798 [01:49<1:01:41,  4.41it/s]

{'loss': 1.4675, 'grad_norm': 1.8715529441833496, 'learning_rate': 0.00019450798189182752, 'epoch': 0.03}


  3%|▎         | 472/16798 [01:49<1:01:40,  4.41it/s]

{'loss': 1.9926, 'grad_norm': 2.071564197540283, 'learning_rate': 0.00019449606862044318, 'epoch': 0.03}


  3%|▎         | 473/16798 [01:49<1:01:13,  4.44it/s]

{'loss': 1.393, 'grad_norm': 1.772890567779541, 'learning_rate': 0.00019448415534905887, 'epoch': 0.03}


  3%|▎         | 474/16798 [01:50<1:01:50,  4.40it/s]

{'loss': 1.6712, 'grad_norm': 2.097805976867676, 'learning_rate': 0.00019447224207767453, 'epoch': 0.03}


  3%|▎         | 475/16798 [01:50<1:03:03,  4.31it/s]

{'loss': 1.7243, 'grad_norm': 1.993105411529541, 'learning_rate': 0.00019446032880629022, 'epoch': 0.03}


  3%|▎         | 476/16798 [01:50<1:02:53,  4.33it/s]

{'loss': 2.0899, 'grad_norm': 2.7243452072143555, 'learning_rate': 0.0001944484155349059, 'epoch': 0.03}


  3%|▎         | 477/16798 [01:50<1:01:25,  4.43it/s]

{'loss': 1.2407, 'grad_norm': 1.6941945552825928, 'learning_rate': 0.00019443650226352158, 'epoch': 0.03}


  3%|▎         | 479/16798 [01:51<59:22,  4.58it/s]  

{'loss': 1.8607, 'grad_norm': 1.9582850933074951, 'learning_rate': 0.00019442458899213724, 'epoch': 0.03}


  3%|▎         | 480/16798 [01:51<57:55,  4.69it/s]

{'loss': 1.5021, 'grad_norm': 1.6675944328308105, 'learning_rate': 0.00019441267572075293, 'epoch': 0.03}


  3%|▎         | 481/16798 [01:51<57:42,  4.71it/s]

{'loss': 1.5852, 'grad_norm': 2.0538220405578613, 'learning_rate': 0.0001944007624493686, 'epoch': 0.03}


  3%|▎         | 481/16798 [01:51<57:42,  4.71it/s]

{'loss': 1.5859, 'grad_norm': 1.7626038789749146, 'learning_rate': 0.0001943888491779843, 'epoch': 0.03}


  3%|▎         | 483/16798 [01:52<56:29,  4.81it/s]

{'loss': 1.9587, 'grad_norm': 2.074087619781494, 'learning_rate': 0.00019437693590659995, 'epoch': 0.03}


  3%|▎         | 484/16798 [01:52<56:28,  4.81it/s]

{'loss': 1.475, 'grad_norm': 1.6583198308944702, 'learning_rate': 0.00019436502263521564, 'epoch': 0.03}


  3%|▎         | 485/16798 [01:52<56:16,  4.83it/s]

{'loss': 1.6738, 'grad_norm': 1.9625288248062134, 'learning_rate': 0.0001943531093638313, 'epoch': 0.03}


  3%|▎         | 485/16798 [01:52<56:16,  4.83it/s]

{'loss': 1.8152, 'grad_norm': 2.1870059967041016, 'learning_rate': 0.000194341196092447, 'epoch': 0.03}


  3%|▎         | 486/16798 [01:52<57:22,  4.74it/s]

{'loss': 1.5091, 'grad_norm': 1.9308750629425049, 'learning_rate': 0.00019432928282106266, 'epoch': 0.03}


  3%|▎         | 488/16798 [01:53<56:51,  4.78it/s]

{'loss': 1.8329, 'grad_norm': 2.2235312461853027, 'learning_rate': 0.00019431736954967835, 'epoch': 0.03}


  3%|▎         | 489/16798 [01:53<56:14,  4.83it/s]

{'loss': 1.9282, 'grad_norm': 2.467604875564575, 'learning_rate': 0.00019430545627829401, 'epoch': 0.03}


  3%|▎         | 490/16798 [01:53<54:58,  4.94it/s]

{'loss': 1.5078, 'grad_norm': 1.962799310684204, 'learning_rate': 0.0001942935430069097, 'epoch': 0.03}


  3%|▎         | 491/16798 [01:53<55:08,  4.93it/s]

{'loss': 1.5273, 'grad_norm': 2.2949087619781494, 'learning_rate': 0.0001942816297355254, 'epoch': 0.03}


  3%|▎         | 491/16798 [01:53<55:08,  4.93it/s]

{'loss': 1.611, 'grad_norm': 1.845600962638855, 'learning_rate': 0.00019426971646414106, 'epoch': 0.03}


  3%|▎         | 492/16798 [01:53<55:50,  4.87it/s]

{'loss': 1.6527, 'grad_norm': 2.198838710784912, 'learning_rate': 0.00019425780319275675, 'epoch': 0.03}


  3%|▎         | 494/16798 [01:54<56:13,  4.83it/s]

{'loss': 1.5755, 'grad_norm': 1.888519048690796, 'learning_rate': 0.00019424588992137241, 'epoch': 0.03}


  3%|▎         | 494/16798 [01:54<56:13,  4.83it/s]

{'loss': 1.1458, 'grad_norm': 1.5538139343261719, 'learning_rate': 0.0001942339766499881, 'epoch': 0.03}


  3%|▎         | 495/16798 [01:54<57:15,  4.75it/s]

{'loss': 1.0383, 'grad_norm': 1.768768310546875, 'learning_rate': 0.00019422206337860377, 'epoch': 0.03}


  3%|▎         | 497/16798 [01:55<57:16,  4.74it/s]

{'loss': 1.029, 'grad_norm': 1.7395625114440918, 'learning_rate': 0.00019421015010721946, 'epoch': 0.03}


  3%|▎         | 498/16798 [01:55<57:32,  4.72it/s]

{'loss': 1.0423, 'grad_norm': 1.7344508171081543, 'learning_rate': 0.00019419823683583512, 'epoch': 0.03}


  3%|▎         | 499/16798 [01:55<56:04,  4.84it/s]

{'loss': 0.593, 'grad_norm': 1.2486426830291748, 'learning_rate': 0.00019418632356445081, 'epoch': 0.03}


  3%|▎         | 500/16798 [01:55<55:13,  4.92it/s]

{'loss': 1.2629, 'grad_norm': 2.0359764099121094, 'learning_rate': 0.00019417441029306648, 'epoch': 0.03}




{'loss': 0.3907, 'grad_norm': 0.9378038048744202, 'learning_rate': 0.00019416249702168217, 'epoch': 0.03}


  3%|▎         | 501/16798 [01:58<4:11:22,  1.08it/s]

{'loss': 2.1408, 'grad_norm': 1.7441121339797974, 'learning_rate': 0.00019415058375029783, 'epoch': 0.03}


  3%|▎         | 502/16798 [01:58<3:14:07,  1.40it/s]

{'loss': 1.9573, 'grad_norm': 1.7155354022979736, 'learning_rate': 0.00019413867047891352, 'epoch': 0.03}


  3%|▎         | 503/16798 [01:58<2:34:23,  1.76it/s]

{'loss': 2.0263, 'grad_norm': 2.461183786392212, 'learning_rate': 0.00019412675720752919, 'epoch': 0.03}


  3%|▎         | 504/16798 [01:58<2:05:55,  2.16it/s]

{'loss': 2.3287, 'grad_norm': 1.9696872234344482, 'learning_rate': 0.00019411484393614488, 'epoch': 0.03}


  3%|▎         | 505/16798 [01:59<1:46:47,  2.54it/s]

{'loss': 2.3116, 'grad_norm': 1.8003199100494385, 'learning_rate': 0.00019410293066476054, 'epoch': 0.03}


  3%|▎         | 506/16798 [01:59<1:32:39,  2.93it/s]

{'loss': 2.0045, 'grad_norm': 1.7229338884353638, 'learning_rate': 0.00019409101739337623, 'epoch': 0.03}


  3%|▎         | 508/16798 [01:59<1:15:32,  3.59it/s]

{'loss': 2.3011, 'grad_norm': 1.8790409564971924, 'learning_rate': 0.0001940791041219919, 'epoch': 0.03}


  3%|▎         | 509/16798 [01:59<1:10:29,  3.85it/s]

{'loss': 2.7673, 'grad_norm': 2.3036913871765137, 'learning_rate': 0.00019406719085060759, 'epoch': 0.03}


  3%|▎         | 510/16798 [02:00<1:06:46,  4.07it/s]

{'loss': 1.4568, 'grad_norm': 1.545309066772461, 'learning_rate': 0.00019405527757922325, 'epoch': 0.03}


  3%|▎         | 510/16798 [02:00<1:06:46,  4.07it/s]

{'loss': 1.6646, 'grad_norm': 1.6434988975524902, 'learning_rate': 0.00019404336430783894, 'epoch': 0.03}


  3%|▎         | 512/16798 [02:00<1:02:11,  4.36it/s]

{'loss': 1.9504, 'grad_norm': 2.0322670936584473, 'learning_rate': 0.0001940314510364546, 'epoch': 0.03}


  3%|▎         | 513/16798 [02:00<1:00:56,  4.45it/s]

{'loss': 2.0756, 'grad_norm': 2.0868313312530518, 'learning_rate': 0.0001940195377650703, 'epoch': 0.03}


  3%|▎         | 514/16798 [02:01<59:58,  4.52it/s]  

{'loss': 1.8658, 'grad_norm': 2.0172457695007324, 'learning_rate': 0.00019400762449368596, 'epoch': 0.03}


  3%|▎         | 514/16798 [02:01<59:58,  4.52it/s]

{'loss': 1.3551, 'grad_norm': 1.6516530513763428, 'learning_rate': 0.00019399571122230165, 'epoch': 0.03}


  3%|▎         | 516/16798 [02:01<1:03:19,  4.28it/s]

{'loss': 2.0224, 'grad_norm': 1.9448654651641846, 'learning_rate': 0.0001939837979509173, 'epoch': 0.03}


  3%|▎         | 517/16798 [02:01<1:01:25,  4.42it/s]

{'loss': 1.9613, 'grad_norm': 2.693441867828369, 'learning_rate': 0.000193971884679533, 'epoch': 0.03}


  3%|▎         | 517/16798 [02:01<1:01:25,  4.42it/s]

{'loss': 1.672, 'grad_norm': 1.9374759197235107, 'learning_rate': 0.00019395997140814867, 'epoch': 0.03}


  3%|▎         | 519/16798 [02:02<58:02,  4.67it/s]  

{'loss': 2.368, 'grad_norm': 2.322611093521118, 'learning_rate': 0.00019394805813676436, 'epoch': 0.03}


  3%|▎         | 519/16798 [02:02<58:02,  4.67it/s]

{'loss': 1.5167, 'grad_norm': 1.7612282037734985, 'learning_rate': 0.00019393614486538002, 'epoch': 0.03}


  3%|▎         | 520/16798 [02:02<58:52,  4.61it/s]

{'loss': 1.4596, 'grad_norm': 1.7564226388931274, 'learning_rate': 0.00019392423159399574, 'epoch': 0.03}


  3%|▎         | 522/16798 [02:02<58:05,  4.67it/s]

{'loss': 2.3141, 'grad_norm': 2.6709518432617188, 'learning_rate': 0.0001939123183226114, 'epoch': 0.03}


  3%|▎         | 522/16798 [02:02<58:05,  4.67it/s]

{'loss': 1.4765, 'grad_norm': 1.5133304595947266, 'learning_rate': 0.0001939004050512271, 'epoch': 0.03}


  3%|▎         | 523/16798 [02:03<57:56,  4.68it/s]

{'loss': 1.2582, 'grad_norm': 2.1202828884124756, 'learning_rate': 0.00019388849177984276, 'epoch': 0.03}


  3%|▎         | 525/16798 [02:03<57:20,  4.73it/s]

{'loss': 1.7873, 'grad_norm': 1.7748254537582397, 'learning_rate': 0.00019387657850845845, 'epoch': 0.03}


  3%|▎         | 525/16798 [02:03<57:20,  4.73it/s]

{'loss': 1.4938, 'grad_norm': 1.8223223686218262, 'learning_rate': 0.0001938646652370741, 'epoch': 0.03}


  3%|▎         | 527/16798 [02:03<56:40,  4.78it/s]

{'loss': 1.4051, 'grad_norm': 1.6893097162246704, 'learning_rate': 0.0001938527519656898, 'epoch': 0.03}


  3%|▎         | 528/16798 [02:04<57:03,  4.75it/s]

{'loss': 1.7878, 'grad_norm': 2.238983392715454, 'learning_rate': 0.00019384083869430547, 'epoch': 0.03}


  3%|▎         | 528/16798 [02:04<57:03,  4.75it/s]

{'loss': 1.5667, 'grad_norm': 1.795053482055664, 'learning_rate': 0.00019382892542292116, 'epoch': 0.03}


  3%|▎         | 530/16798 [02:04<57:10,  4.74it/s]

{'loss': 1.6385, 'grad_norm': 2.0056586265563965, 'learning_rate': 0.00019381701215153682, 'epoch': 0.03}


  3%|▎         | 530/16798 [02:04<57:10,  4.74it/s]

{'loss': 1.1138, 'grad_norm': 1.7047337293624878, 'learning_rate': 0.0001938050988801525, 'epoch': 0.03}


  3%|▎         | 531/16798 [02:04<58:30,  4.63it/s]

{'loss': 1.7221, 'grad_norm': 1.9741166830062866, 'learning_rate': 0.00019379318560876818, 'epoch': 0.03}


  3%|▎         | 532/16798 [02:04<57:52,  4.68it/s]

{'loss': 1.5255, 'grad_norm': 1.8391205072402954, 'learning_rate': 0.00019378127233738387, 'epoch': 0.03}


  3%|▎         | 533/16798 [02:05<58:06,  4.67it/s]

{'loss': 1.3477, 'grad_norm': 2.1382601261138916, 'learning_rate': 0.00019376935906599953, 'epoch': 0.03}


  3%|▎         | 535/16798 [02:05<58:45,  4.61it/s]

{'loss': 1.462, 'grad_norm': 1.6058050394058228, 'learning_rate': 0.00019375744579461522, 'epoch': 0.03}


  3%|▎         | 535/16798 [02:05<58:45,  4.61it/s]

{'loss': 1.1642, 'grad_norm': 1.5082380771636963, 'learning_rate': 0.00019374553252323088, 'epoch': 0.03}


  3%|▎         | 536/16798 [02:05<58:34,  4.63it/s]

{'loss': 1.3332, 'grad_norm': 1.674491286277771, 'learning_rate': 0.00019373361925184657, 'epoch': 0.03}


  3%|▎         | 538/16798 [02:06<56:45,  4.77it/s]

{'loss': 1.8537, 'grad_norm': 2.25516414642334, 'learning_rate': 0.00019372170598046224, 'epoch': 0.03}


  3%|▎         | 538/16798 [02:06<56:45,  4.77it/s]

{'loss': 1.1188, 'grad_norm': 1.9373470544815063, 'learning_rate': 0.00019370979270907793, 'epoch': 0.03}


  3%|▎         | 540/16798 [02:06<55:45,  4.86it/s]

{'loss': 1.2229, 'grad_norm': 1.7628759145736694, 'learning_rate': 0.0001936978794376936, 'epoch': 0.03}


  3%|▎         | 541/16798 [02:06<56:03,  4.83it/s]

{'loss': 1.4446, 'grad_norm': 1.8164836168289185, 'learning_rate': 0.00019368596616630928, 'epoch': 0.03}


  3%|▎         | 542/16798 [02:07<55:51,  4.85it/s]

{'loss': 1.3389, 'grad_norm': 1.9517436027526855, 'learning_rate': 0.00019367405289492495, 'epoch': 0.03}


  3%|▎         | 543/16798 [02:07<55:20,  4.89it/s]

{'loss': 1.1648, 'grad_norm': 1.4847387075424194, 'learning_rate': 0.00019366213962354064, 'epoch': 0.03}


  3%|▎         | 544/16798 [02:07<54:50,  4.94it/s]

{'loss': 1.6397, 'grad_norm': 1.9424561262130737, 'learning_rate': 0.0001936502263521563, 'epoch': 0.03}


  3%|▎         | 545/16798 [02:07<54:45,  4.95it/s]

{'loss': 1.2142, 'grad_norm': 1.922444462776184, 'learning_rate': 0.000193638313080772, 'epoch': 0.03}


  3%|▎         | 545/16798 [02:07<54:45,  4.95it/s]

{'loss': 1.0937, 'grad_norm': 1.7082624435424805, 'learning_rate': 0.00019362639980938766, 'epoch': 0.03}


  3%|▎         | 547/16798 [02:08<55:32,  4.88it/s]

{'loss': 0.924, 'grad_norm': 1.3358575105667114, 'learning_rate': 0.00019361448653800335, 'epoch': 0.03}


  3%|▎         | 548/16798 [02:08<55:19,  4.90it/s]

{'loss': 1.0182, 'grad_norm': 1.7332050800323486, 'learning_rate': 0.000193602573266619, 'epoch': 0.03}


  3%|▎         | 548/16798 [02:08<55:19,  4.90it/s]

{'loss': 0.972, 'grad_norm': 2.0003602504730225, 'learning_rate': 0.0001935906599952347, 'epoch': 0.03}


  3%|▎         | 550/16798 [02:08<55:16,  4.90it/s]

{'loss': 0.7974, 'grad_norm': 1.8566737174987793, 'learning_rate': 0.00019357874672385036, 'epoch': 0.03}


  3%|▎         | 550/16798 [02:08<55:16,  4.90it/s]

{'loss': 0.5735, 'grad_norm': 1.6097453832626343, 'learning_rate': 0.00019356683345246606, 'epoch': 0.03}


  3%|▎         | 551/16798 [02:08<56:57,  4.75it/s]

{'loss': 1.8212, 'grad_norm': 2.3404312133789062, 'learning_rate': 0.00019355492018108175, 'epoch': 0.03}


  3%|▎         | 552/16798 [02:09<59:26,  4.56it/s]

{'loss': 1.7416, 'grad_norm': 1.8964289426803589, 'learning_rate': 0.0001935430069096974, 'epoch': 0.03}


  3%|▎         | 553/16798 [02:09<59:53,  4.52it/s]

{'loss': 2.089, 'grad_norm': 2.025188684463501, 'learning_rate': 0.0001935310936383131, 'epoch': 0.03}


  3%|▎         | 554/16798 [02:09<59:30,  4.55it/s]

{'loss': 2.2065, 'grad_norm': 2.0081748962402344, 'learning_rate': 0.00019351918036692876, 'epoch': 0.03}


  3%|▎         | 555/16798 [02:09<1:01:11,  4.42it/s]

{'loss': 2.1825, 'grad_norm': 2.277827024459839, 'learning_rate': 0.00019350726709554446, 'epoch': 0.03}


  3%|▎         | 556/16798 [02:10<1:02:20,  4.34it/s]

{'loss': 2.1821, 'grad_norm': 2.268972873687744, 'learning_rate': 0.00019349535382416012, 'epoch': 0.03}


  3%|▎         | 557/16798 [02:10<1:02:11,  4.35it/s]

{'loss': 1.8637, 'grad_norm': 2.068621873855591, 'learning_rate': 0.0001934834405527758, 'epoch': 0.03}


  3%|▎         | 558/16798 [02:10<1:02:04,  4.36it/s]

{'loss': 1.8377, 'grad_norm': 1.7113107442855835, 'learning_rate': 0.00019347152728139147, 'epoch': 0.03}


  3%|▎         | 559/16798 [02:10<1:02:48,  4.31it/s]

{'loss': 2.2918, 'grad_norm': 2.094525098800659, 'learning_rate': 0.00019345961401000716, 'epoch': 0.03}


  3%|▎         | 561/16798 [02:11<1:00:17,  4.49it/s]

{'loss': 1.6993, 'grad_norm': 1.601882815361023, 'learning_rate': 0.00019344770073862283, 'epoch': 0.03}


  3%|▎         | 561/16798 [02:11<1:00:17,  4.49it/s]

{'loss': 2.0658, 'grad_norm': 1.9075926542282104, 'learning_rate': 0.00019343578746723852, 'epoch': 0.03}


  3%|▎         | 562/16798 [02:11<59:44,  4.53it/s]  

{'loss': 1.6659, 'grad_norm': 1.697797179222107, 'learning_rate': 0.00019342387419585418, 'epoch': 0.03}


  3%|▎         | 563/16798 [02:11<59:32,  4.55it/s]

{'loss': 1.4827, 'grad_norm': 1.7971185445785522, 'learning_rate': 0.00019341196092446987, 'epoch': 0.03}


  3%|▎         | 564/16798 [02:11<59:52,  4.52it/s]

{'loss': 1.8551, 'grad_norm': 1.7847228050231934, 'learning_rate': 0.00019340004765308554, 'epoch': 0.03}


  3%|▎         | 565/16798 [02:12<1:00:27,  4.47it/s]

{'loss': 1.3905, 'grad_norm': 1.8073352575302124, 'learning_rate': 0.00019338813438170123, 'epoch': 0.03}


  3%|▎         | 566/16798 [02:12<59:43,  4.53it/s]  

{'loss': 1.5584, 'grad_norm': 1.6310566663742065, 'learning_rate': 0.0001933762211103169, 'epoch': 0.03}


  3%|▎         | 567/16798 [02:12<59:49,  4.52it/s]

{'loss': 1.5423, 'grad_norm': 1.818164348602295, 'learning_rate': 0.00019336430783893258, 'epoch': 0.03}


  3%|▎         | 568/16798 [02:12<59:10,  4.57it/s]

{'loss': 2.0711, 'grad_norm': 2.020521402359009, 'learning_rate': 0.00019335239456754825, 'epoch': 0.03}


  3%|▎         | 569/16798 [02:12<59:10,  4.57it/s]

{'loss': 2.0042, 'grad_norm': 1.7134065628051758, 'learning_rate': 0.00019334048129616394, 'epoch': 0.03}


  3%|▎         | 571/16798 [02:13<57:33,  4.70it/s]

{'loss': 1.4952, 'grad_norm': 1.569326639175415, 'learning_rate': 0.0001933285680247796, 'epoch': 0.03}


  3%|▎         | 571/16798 [02:13<57:33,  4.70it/s]

{'loss': 1.8782, 'grad_norm': 1.8165266513824463, 'learning_rate': 0.0001933166547533953, 'epoch': 0.03}


  3%|▎         | 572/16798 [02:13<58:03,  4.66it/s]

{'loss': 1.565, 'grad_norm': 1.5718610286712646, 'learning_rate': 0.00019330474148201095, 'epoch': 0.03}


  3%|▎         | 573/16798 [02:13<59:15,  4.56it/s]

{'loss': 1.8263, 'grad_norm': 1.7359883785247803, 'learning_rate': 0.00019329282821062665, 'epoch': 0.03}


  3%|▎         | 574/16798 [02:14<59:13,  4.57it/s]

{'loss': 1.5934, 'grad_norm': 1.8248622417449951, 'learning_rate': 0.0001932809149392423, 'epoch': 0.03}


  3%|▎         | 576/16798 [02:14<58:13,  4.64it/s]

{'loss': 1.7737, 'grad_norm': 1.7265145778656006, 'learning_rate': 0.000193269001667858, 'epoch': 0.03}


  3%|▎         | 577/16798 [02:14<58:02,  4.66it/s]

{'loss': 1.7024, 'grad_norm': 1.7490679025650024, 'learning_rate': 0.00019325708839647366, 'epoch': 0.03}


  3%|▎         | 577/16798 [02:14<58:02,  4.66it/s]

{'loss': 1.2144, 'grad_norm': 1.6132761240005493, 'learning_rate': 0.00019324517512508935, 'epoch': 0.03}


  3%|▎         | 578/16798 [02:14<58:35,  4.61it/s]

{'loss': 1.6699, 'grad_norm': 1.7421895265579224, 'learning_rate': 0.00019323326185370502, 'epoch': 0.03}


  3%|▎         | 579/16798 [02:15<59:45,  4.52it/s]

{'loss': 1.641, 'grad_norm': 1.7995988130569458, 'learning_rate': 0.0001932213485823207, 'epoch': 0.03}


  3%|▎         | 580/16798 [02:15<59:53,  4.51it/s]

{'loss': 1.8434, 'grad_norm': 2.1443793773651123, 'learning_rate': 0.00019320943531093637, 'epoch': 0.03}


  3%|▎         | 581/16798 [02:15<59:26,  4.55it/s]

{'loss': 1.2607, 'grad_norm': 1.6485099792480469, 'learning_rate': 0.0001931975220395521, 'epoch': 0.03}


  3%|▎         | 583/16798 [02:16<58:23,  4.63it/s]

{'loss': 1.612, 'grad_norm': 1.8047089576721191, 'learning_rate': 0.00019318560876816775, 'epoch': 0.03}


  3%|▎         | 583/16798 [02:16<58:23,  4.63it/s]

{'loss': 1.1971, 'grad_norm': 1.5489609241485596, 'learning_rate': 0.00019317369549678344, 'epoch': 0.03}


  3%|▎         | 584/16798 [02:16<58:27,  4.62it/s]

{'loss': 1.4506, 'grad_norm': 1.8853366374969482, 'learning_rate': 0.0001931617822253991, 'epoch': 0.03}


  3%|▎         | 586/16798 [02:16<57:52,  4.67it/s]

{'loss': 1.3851, 'grad_norm': 1.8462499380111694, 'learning_rate': 0.0001931498689540148, 'epoch': 0.03}


  3%|▎         | 587/16798 [02:16<57:54,  4.67it/s]

{'loss': 1.0577, 'grad_norm': 1.4263253211975098, 'learning_rate': 0.00019313795568263046, 'epoch': 0.03}


  3%|▎         | 587/16798 [02:16<57:54,  4.67it/s]

{'loss': 1.7435, 'grad_norm': 1.9864120483398438, 'learning_rate': 0.00019312604241124615, 'epoch': 0.03}


  4%|▎         | 589/16798 [02:17<56:14,  4.80it/s]

{'loss': 1.4096, 'grad_norm': 1.7701307535171509, 'learning_rate': 0.00019311412913986182, 'epoch': 0.04}


  4%|▎         | 589/16798 [02:17<56:14,  4.80it/s]

{'loss': 1.1728, 'grad_norm': 1.7058571577072144, 'learning_rate': 0.0001931022158684775, 'epoch': 0.04}


  4%|▎         | 591/16798 [02:17<56:25,  4.79it/s]

{'loss': 1.1784, 'grad_norm': 1.7032463550567627, 'learning_rate': 0.00019309030259709317, 'epoch': 0.04}


  4%|▎         | 591/16798 [02:17<56:25,  4.79it/s]

{'loss': 0.7443, 'grad_norm': 1.9959216117858887, 'learning_rate': 0.00019307838932570886, 'epoch': 0.04}


  4%|▎         | 593/16798 [02:18<56:36,  4.77it/s]

{'loss': 1.4722, 'grad_norm': 3.252854347229004, 'learning_rate': 0.00019306647605432453, 'epoch': 0.04}


  4%|▎         | 593/16798 [02:18<56:36,  4.77it/s]

{'loss': 1.2092, 'grad_norm': 1.6229982376098633, 'learning_rate': 0.00019305456278294022, 'epoch': 0.04}


  4%|▎         | 594/16798 [02:18<57:10,  4.72it/s]

{'loss': 0.8588, 'grad_norm': 1.5017250776290894, 'learning_rate': 0.00019304264951155588, 'epoch': 0.04}


  4%|▎         | 595/16798 [02:18<57:37,  4.69it/s]

{'loss': 1.3171, 'grad_norm': 1.7604674100875854, 'learning_rate': 0.00019303073624017157, 'epoch': 0.04}


  4%|▎         | 597/16798 [02:18<56:49,  4.75it/s]

{'loss': 1.355, 'grad_norm': 2.3334577083587646, 'learning_rate': 0.00019301882296878723, 'epoch': 0.04}


  4%|▎         | 598/16798 [02:19<55:44,  4.84it/s]

{'loss': 0.8408, 'grad_norm': 1.8041971921920776, 'learning_rate': 0.00019300690969740293, 'epoch': 0.04}


  4%|▎         | 599/16798 [02:19<55:19,  4.88it/s]

{'loss': 0.8425, 'grad_norm': 1.7767729759216309, 'learning_rate': 0.0001929949964260186, 'epoch': 0.04}


  4%|▎         | 600/16798 [02:19<55:55,  4.83it/s]

{'loss': 0.4281, 'grad_norm': 1.5315372943878174, 'learning_rate': 0.00019298308315463428, 'epoch': 0.04}


  4%|▎         | 600/16798 [02:19<55:55,  4.83it/s]

{'loss': 0.8208, 'grad_norm': 1.5944849252700806, 'learning_rate': 0.00019297116988324994, 'epoch': 0.04}


  4%|▎         | 601/16798 [02:19<56:29,  4.78it/s]

{'loss': 1.8679, 'grad_norm': 1.7619991302490234, 'learning_rate': 0.00019295925661186563, 'epoch': 0.04}


  4%|▎         | 602/16798 [02:20<57:09,  4.72it/s]

{'loss': 2.0248, 'grad_norm': 2.0837764739990234, 'learning_rate': 0.0001929473433404813, 'epoch': 0.04}


  4%|▎         | 603/16798 [02:20<59:22,  4.55it/s]

{'loss': 1.9996, 'grad_norm': 1.6680231094360352, 'learning_rate': 0.000192935430069097, 'epoch': 0.04}


  4%|▎         | 604/16798 [02:20<59:38,  4.53it/s]

{'loss': 2.4081, 'grad_norm': 2.1103973388671875, 'learning_rate': 0.00019292351679771265, 'epoch': 0.04}


  4%|▎         | 605/16798 [02:20<1:00:05,  4.49it/s]

{'loss': 2.4045, 'grad_norm': 2.304348945617676, 'learning_rate': 0.00019291160352632834, 'epoch': 0.04}


  4%|▎         | 606/16798 [02:20<59:55,  4.50it/s]  

{'loss': 2.1594, 'grad_norm': 1.8272343873977661, 'learning_rate': 0.000192899690254944, 'epoch': 0.04}


  4%|▎         | 607/16798 [02:21<59:17,  4.55it/s]

{'loss': 2.1435, 'grad_norm': 1.7986980676651, 'learning_rate': 0.0001928877769835597, 'epoch': 0.04}


  4%|▎         | 609/16798 [02:21<59:31,  4.53it/s]  

{'loss': 1.9689, 'grad_norm': 1.837700605392456, 'learning_rate': 0.00019287586371217536, 'epoch': 0.04}


  4%|▎         | 609/16798 [02:21<59:31,  4.53it/s]

{'loss': 1.8657, 'grad_norm': 1.9625017642974854, 'learning_rate': 0.00019286395044079105, 'epoch': 0.04}


  4%|▎         | 610/16798 [02:21<59:58,  4.50it/s]

{'loss': 1.646, 'grad_norm': 1.6908303499221802, 'learning_rate': 0.00019285203716940672, 'epoch': 0.04}


  4%|▎         | 611/16798 [02:22<59:07,  4.56it/s]

{'loss': 1.6208, 'grad_norm': 1.7405740022659302, 'learning_rate': 0.0001928401238980224, 'epoch': 0.04}


  4%|▎         | 612/16798 [02:22<59:32,  4.53it/s]

{'loss': 1.6773, 'grad_norm': 1.8257054090499878, 'learning_rate': 0.0001928282106266381, 'epoch': 0.04}


  4%|▎         | 613/16798 [02:22<59:54,  4.50it/s]

{'loss': 1.5357, 'grad_norm': 1.7338236570358276, 'learning_rate': 0.00019281629735525376, 'epoch': 0.04}


  4%|▎         | 615/16798 [02:22<58:34,  4.61it/s]

{'loss': 2.0363, 'grad_norm': 2.0960781574249268, 'learning_rate': 0.00019280438408386945, 'epoch': 0.04}


  4%|▎         | 615/16798 [02:22<58:34,  4.61it/s]

{'loss': 1.6585, 'grad_norm': 1.670904278755188, 'learning_rate': 0.00019279247081248512, 'epoch': 0.04}


  4%|▎         | 616/16798 [02:23<59:08,  4.56it/s]

{'loss': 1.5225, 'grad_norm': 1.7997241020202637, 'learning_rate': 0.0001927805575411008, 'epoch': 0.04}


  4%|▎         | 618/16798 [02:23<58:14,  4.63it/s]

{'loss': 1.3484, 'grad_norm': 1.6395933628082275, 'learning_rate': 0.00019276864426971647, 'epoch': 0.04}


  4%|▎         | 619/16798 [02:23<57:06,  4.72it/s]

{'loss': 1.6914, 'grad_norm': 1.6940118074417114, 'learning_rate': 0.00019275673099833216, 'epoch': 0.04}


  4%|▎         | 619/16798 [02:23<57:06,  4.72it/s]

{'loss': 1.9133, 'grad_norm': 2.193920135498047, 'learning_rate': 0.00019274481772694782, 'epoch': 0.04}


  4%|▎         | 620/16798 [02:23<57:02,  4.73it/s]

{'loss': 1.3854, 'grad_norm': 1.559238314628601, 'learning_rate': 0.00019273290445556351, 'epoch': 0.04}


  4%|▎         | 622/16798 [02:24<57:04,  4.72it/s]

{'loss': 1.442, 'grad_norm': 1.6211141347885132, 'learning_rate': 0.00019272099118417918, 'epoch': 0.04}


  4%|▎         | 622/16798 [02:24<57:04,  4.72it/s]

{'loss': 1.4108, 'grad_norm': 1.6408261060714722, 'learning_rate': 0.00019270907791279487, 'epoch': 0.04}


  4%|▎         | 624/16798 [02:24<58:04,  4.64it/s]

{'loss': 2.1959, 'grad_norm': 2.2563793659210205, 'learning_rate': 0.00019269716464141053, 'epoch': 0.04}


  4%|▎         | 624/16798 [02:24<58:04,  4.64it/s]

{'loss': 1.876, 'grad_norm': 1.8808259963989258, 'learning_rate': 0.00019268525137002622, 'epoch': 0.04}


  4%|▎         | 625/16798 [02:25<58:09,  4.63it/s]

{'loss': 1.8024, 'grad_norm': 2.1437320709228516, 'learning_rate': 0.0001926733380986419, 'epoch': 0.04}


  4%|▎         | 626/16798 [02:25<58:37,  4.60it/s]

{'loss': 2.1696, 'grad_norm': 2.3748693466186523, 'learning_rate': 0.00019266142482725758, 'epoch': 0.04}


  4%|▎         | 627/16798 [02:25<59:07,  4.56it/s]

{'loss': 1.8101, 'grad_norm': 2.126181125640869, 'learning_rate': 0.00019264951155587324, 'epoch': 0.04}


  4%|▎         | 628/16798 [02:25<58:31,  4.60it/s]

{'loss': 1.4465, 'grad_norm': 1.601849913597107, 'learning_rate': 0.00019263759828448893, 'epoch': 0.04}


  4%|▎         | 629/16798 [02:25<58:33,  4.60it/s]

{'loss': 1.9438, 'grad_norm': 2.5861592292785645, 'learning_rate': 0.0001926256850131046, 'epoch': 0.04}


  4%|▍         | 630/16798 [02:26<59:12,  4.55it/s]

{'loss': 1.5926, 'grad_norm': 1.825006365776062, 'learning_rate': 0.0001926137717417203, 'epoch': 0.04}


  4%|▍         | 631/16798 [02:26<58:59,  4.57it/s]

{'loss': 1.4807, 'grad_norm': 1.8225001096725464, 'learning_rate': 0.00019260185847033595, 'epoch': 0.04}


  4%|▍         | 632/16798 [02:26<58:13,  4.63it/s]

{'loss': 1.8179, 'grad_norm': 2.126580238342285, 'learning_rate': 0.00019258994519895164, 'epoch': 0.04}


  4%|▍         | 633/16798 [02:26<58:14,  4.63it/s]

{'loss': 1.8882, 'grad_norm': 1.8644559383392334, 'learning_rate': 0.0001925780319275673, 'epoch': 0.04}


  4%|▍         | 634/16798 [02:26<58:28,  4.61it/s]

{'loss': 1.4311, 'grad_norm': 1.7597105503082275, 'learning_rate': 0.000192566118656183, 'epoch': 0.04}


  4%|▍         | 636/16798 [02:27<57:55,  4.65it/s]

{'loss': 1.746, 'grad_norm': 1.980077862739563, 'learning_rate': 0.00019255420538479866, 'epoch': 0.04}


  4%|▍         | 636/16798 [02:27<57:55,  4.65it/s]

{'loss': 1.6501, 'grad_norm': 1.980024814605713, 'learning_rate': 0.00019254229211341435, 'epoch': 0.04}


  4%|▍         | 638/16798 [02:27<57:35,  4.68it/s]

{'loss': 1.0835, 'grad_norm': 1.446729302406311, 'learning_rate': 0.00019253037884203001, 'epoch': 0.04}


  4%|▍         | 639/16798 [02:28<56:54,  4.73it/s]

{'loss': 1.5202, 'grad_norm': 1.7984308004379272, 'learning_rate': 0.0001925184655706457, 'epoch': 0.04}


  4%|▍         | 639/16798 [02:28<56:54,  4.73it/s]

{'loss': 1.3799, 'grad_norm': 1.6584943532943726, 'learning_rate': 0.00019250655229926137, 'epoch': 0.04}


  4%|▍         | 640/16798 [02:28<57:49,  4.66it/s]

{'loss': 1.4302, 'grad_norm': 1.960970163345337, 'learning_rate': 0.00019249463902787706, 'epoch': 0.04}


  4%|▍         | 641/16798 [02:28<58:26,  4.61it/s]

{'loss': 0.8212, 'grad_norm': 1.4080028533935547, 'learning_rate': 0.00019248272575649272, 'epoch': 0.04}


  4%|▍         | 642/16798 [02:28<58:53,  4.57it/s]

{'loss': 1.3738, 'grad_norm': 1.8400934934616089, 'learning_rate': 0.0001924708124851084, 'epoch': 0.04}


  4%|▍         | 644/16798 [02:29<57:43,  4.66it/s]

{'loss': 0.7992, 'grad_norm': 1.31954026222229, 'learning_rate': 0.0001924588992137241, 'epoch': 0.04}


  4%|▍         | 645/16798 [02:29<57:07,  4.71it/s]

{'loss': 0.8761, 'grad_norm': 1.700609564781189, 'learning_rate': 0.0001924469859423398, 'epoch': 0.04}


  4%|▍         | 646/16798 [02:29<56:29,  4.76it/s]

{'loss': 0.7496, 'grad_norm': 1.5667835474014282, 'learning_rate': 0.00019243507267095546, 'epoch': 0.04}


  4%|▍         | 647/16798 [02:29<55:56,  4.81it/s]

{'loss': 1.3121, 'grad_norm': 1.8533105850219727, 'learning_rate': 0.00019242315939957115, 'epoch': 0.04}


  4%|▍         | 648/16798 [02:29<56:18,  4.78it/s]

{'loss': 1.1697, 'grad_norm': 2.26990008354187, 'learning_rate': 0.0001924112461281868, 'epoch': 0.04}


  4%|▍         | 649/16798 [02:30<55:59,  4.81it/s]

{'loss': 0.7806, 'grad_norm': 1.3977638483047485, 'learning_rate': 0.0001923993328568025, 'epoch': 0.04}


  4%|▍         | 649/16798 [02:30<55:59,  4.81it/s]

{'loss': 0.4863, 'grad_norm': 1.2922483682632446, 'learning_rate': 0.00019238741958541817, 'epoch': 0.04}


  4%|▍         | 650/16798 [02:30<56:33,  4.76it/s]

{'loss': 0.3026, 'grad_norm': 1.0222989320755005, 'learning_rate': 0.00019237550631403386, 'epoch': 0.04}


  4%|▍         | 651/16798 [02:30<58:04,  4.63it/s]

{'loss': 2.3312, 'grad_norm': 1.99472975730896, 'learning_rate': 0.00019236359304264952, 'epoch': 0.04}


  4%|▍         | 652/16798 [02:30<58:51,  4.57it/s]

{'loss': 2.1643, 'grad_norm': 1.8648037910461426, 'learning_rate': 0.0001923516797712652, 'epoch': 0.04}


  4%|▍         | 653/16798 [02:31<1:00:33,  4.44it/s]

{'loss': 2.5415, 'grad_norm': 2.284714937210083, 'learning_rate': 0.00019233976649988088, 'epoch': 0.04}


  4%|▍         | 654/16798 [02:31<1:01:29,  4.38it/s]

{'loss': 2.0496, 'grad_norm': 1.79265296459198, 'learning_rate': 0.00019232785322849657, 'epoch': 0.04}


  4%|▍         | 656/16798 [02:31<1:00:16,  4.46it/s]

{'loss': 1.9182, 'grad_norm': 1.859117865562439, 'learning_rate': 0.00019231593995711223, 'epoch': 0.04}


  4%|▍         | 656/16798 [02:31<1:00:16,  4.46it/s]

{'loss': 1.7823, 'grad_norm': 1.7486226558685303, 'learning_rate': 0.00019230402668572792, 'epoch': 0.04}


  4%|▍         | 657/16798 [02:32<1:02:25,  4.31it/s]

{'loss': 2.0746, 'grad_norm': 1.9664918184280396, 'learning_rate': 0.00019229211341434359, 'epoch': 0.04}


  4%|▍         | 658/16798 [02:32<1:01:32,  4.37it/s]

{'loss': 2.5344, 'grad_norm': 2.1688084602355957, 'learning_rate': 0.00019228020014295928, 'epoch': 0.04}


  4%|▍         | 659/16798 [02:32<1:01:10,  4.40it/s]

{'loss': 2.1281, 'grad_norm': 1.9373286962509155, 'learning_rate': 0.00019226828687157494, 'epoch': 0.04}


  4%|▍         | 660/16798 [02:32<1:01:35,  4.37it/s]

{'loss': 2.1829, 'grad_norm': 1.9274444580078125, 'learning_rate': 0.00019225637360019063, 'epoch': 0.04}


  4%|▍         | 662/16798 [02:33<1:01:35,  4.37it/s]

{'loss': 2.0235, 'grad_norm': 1.910102367401123, 'learning_rate': 0.0001922444603288063, 'epoch': 0.04}


  4%|▍         | 662/16798 [02:33<1:01:35,  4.37it/s]

{'loss': 2.0396, 'grad_norm': 1.881033182144165, 'learning_rate': 0.00019223254705742198, 'epoch': 0.04}


  4%|▍         | 664/16798 [02:33<59:57,  4.48it/s]  

{'loss': 2.1526, 'grad_norm': 2.0009942054748535, 'learning_rate': 0.00019222063378603765, 'epoch': 0.04}


  4%|▍         | 665/16798 [02:33<58:52,  4.57it/s]

{'loss': 2.0099, 'grad_norm': 2.1299386024475098, 'learning_rate': 0.00019220872051465334, 'epoch': 0.04}


  4%|▍         | 665/16798 [02:33<58:52,  4.57it/s]

{'loss': 1.5777, 'grad_norm': 1.9702975749969482, 'learning_rate': 0.000192196807243269, 'epoch': 0.04}


  4%|▍         | 666/16798 [02:34<1:00:03,  4.48it/s]

{'loss': 2.2197, 'grad_norm': 2.1572251319885254, 'learning_rate': 0.0001921848939718847, 'epoch': 0.04}


  4%|▍         | 667/16798 [02:34<1:02:01,  4.33it/s]

{'loss': 1.7031, 'grad_norm': 1.8265774250030518, 'learning_rate': 0.00019217298070050036, 'epoch': 0.04}


  4%|▍         | 669/16798 [02:34<59:30,  4.52it/s]  

{'loss': 1.6731, 'grad_norm': 1.7250169515609741, 'learning_rate': 0.00019216106742911605, 'epoch': 0.04}


  4%|▍         | 669/16798 [02:34<59:30,  4.52it/s]

{'loss': 1.8707, 'grad_norm': 2.0070033073425293, 'learning_rate': 0.0001921491541577317, 'epoch': 0.04}


  4%|▍         | 671/16798 [02:35<58:07,  4.62it/s]

{'loss': 1.571, 'grad_norm': 1.9529049396514893, 'learning_rate': 0.0001921372408863474, 'epoch': 0.04}


  4%|▍         | 671/16798 [02:35<58:07,  4.62it/s]

{'loss': 1.9459, 'grad_norm': 1.9345673322677612, 'learning_rate': 0.00019212532761496307, 'epoch': 0.04}


  4%|▍         | 672/16798 [02:35<57:39,  4.66it/s]

{'loss': 1.9595, 'grad_norm': 2.0267081260681152, 'learning_rate': 0.00019211341434357876, 'epoch': 0.04}


  4%|▍         | 673/16798 [02:35<57:49,  4.65it/s]

{'loss': 1.4014, 'grad_norm': 1.5735822916030884, 'learning_rate': 0.00019210150107219442, 'epoch': 0.04}


  4%|▍         | 674/16798 [02:35<58:48,  4.57it/s]

{'loss': 1.6773, 'grad_norm': 1.7251633405685425, 'learning_rate': 0.0001920895878008101, 'epoch': 0.04}


  4%|▍         | 676/16798 [02:36<58:16,  4.61it/s]

{'loss': 1.5585, 'grad_norm': 1.6117392778396606, 'learning_rate': 0.0001920776745294258, 'epoch': 0.04}


  4%|▍         | 677/16798 [02:36<56:44,  4.74it/s]

{'loss': 1.437, 'grad_norm': 1.6261518001556396, 'learning_rate': 0.00019206576125804147, 'epoch': 0.04}


  4%|▍         | 677/16798 [02:36<56:44,  4.74it/s]

{'loss': 1.8571, 'grad_norm': 1.9124634265899658, 'learning_rate': 0.00019205384798665716, 'epoch': 0.04}


  4%|▍         | 678/16798 [02:36<56:38,  4.74it/s]

{'loss': 1.4723, 'grad_norm': 1.7920902967453003, 'learning_rate': 0.00019204193471527282, 'epoch': 0.04}


  4%|▍         | 680/16798 [02:37<55:50,  4.81it/s]

{'loss': 1.5525, 'grad_norm': 1.776593565940857, 'learning_rate': 0.0001920300214438885, 'epoch': 0.04}


  4%|▍         | 680/16798 [02:37<55:50,  4.81it/s]

{'loss': 1.618, 'grad_norm': 1.9086583852767944, 'learning_rate': 0.00019201810817250417, 'epoch': 0.04}


  4%|▍         | 681/16798 [02:37<56:06,  4.79it/s]

{'loss': 1.5508, 'grad_norm': 1.9132760763168335, 'learning_rate': 0.00019200619490111987, 'epoch': 0.04}


  4%|▍         | 683/16798 [02:37<56:35,  4.75it/s]

{'loss': 1.4201, 'grad_norm': 1.6438477039337158, 'learning_rate': 0.00019199428162973553, 'epoch': 0.04}


  4%|▍         | 683/16798 [02:37<56:35,  4.75it/s]

{'loss': 1.3592, 'grad_norm': 1.639686942100525, 'learning_rate': 0.00019198236835835122, 'epoch': 0.04}


  4%|▍         | 684/16798 [02:37<56:55,  4.72it/s]

{'loss': 1.2837, 'grad_norm': 1.5978938341140747, 'learning_rate': 0.00019197045508696688, 'epoch': 0.04}


  4%|▍         | 685/16798 [02:38<56:48,  4.73it/s]

{'loss': 1.5389, 'grad_norm': 2.026198387145996, 'learning_rate': 0.00019195854181558257, 'epoch': 0.04}


  4%|▍         | 687/16798 [02:38<56:17,  4.77it/s]

{'loss': 1.5821, 'grad_norm': 1.7333906888961792, 'learning_rate': 0.00019194662854419824, 'epoch': 0.04}


  4%|▍         | 688/16798 [02:38<55:49,  4.81it/s]

{'loss': 1.5331, 'grad_norm': 1.8329980373382568, 'learning_rate': 0.00019193471527281393, 'epoch': 0.04}


  4%|▍         | 689/16798 [02:38<55:22,  4.85it/s]

{'loss': 1.4385, 'grad_norm': 1.7468243837356567, 'learning_rate': 0.0001919228020014296, 'epoch': 0.04}


  4%|▍         | 690/16798 [02:39<55:18,  4.85it/s]

{'loss': 1.3838, 'grad_norm': 1.6406660079956055, 'learning_rate': 0.00019191088873004528, 'epoch': 0.04}


  4%|▍         | 690/16798 [02:39<55:18,  4.85it/s]

{'loss': 1.7029, 'grad_norm': 1.9008805751800537, 'learning_rate': 0.00019189897545866095, 'epoch': 0.04}


  4%|▍         | 691/16798 [02:39<56:18,  4.77it/s]

{'loss': 1.1307, 'grad_norm': 1.464961290359497, 'learning_rate': 0.00019188706218727664, 'epoch': 0.04}


  4%|▍         | 693/16798 [02:39<56:36,  4.74it/s]

{'loss': 0.8778, 'grad_norm': 1.4489420652389526, 'learning_rate': 0.0001918751489158923, 'epoch': 0.04}


  4%|▍         | 693/16798 [02:39<56:36,  4.74it/s]

{'loss': 1.7459, 'grad_norm': 2.4316630363464355, 'learning_rate': 0.000191863235644508, 'epoch': 0.04}


  4%|▍         | 694/16798 [02:40<58:04,  4.62it/s]

{'loss': 1.5187, 'grad_norm': 1.6357868909835815, 'learning_rate': 0.00019185132237312366, 'epoch': 0.04}


  4%|▍         | 695/16798 [02:40<58:15,  4.61it/s]

{'loss': 1.2609, 'grad_norm': 1.476539134979248, 'learning_rate': 0.00019183940910173935, 'epoch': 0.04}


  4%|▍         | 696/16798 [02:40<58:54,  4.56it/s]

{'loss': 1.0797, 'grad_norm': 1.5659681558609009, 'learning_rate': 0.000191827495830355, 'epoch': 0.04}


  4%|▍         | 698/16798 [02:40<57:18,  4.68it/s]

{'loss': 1.2155, 'grad_norm': 1.924585223197937, 'learning_rate': 0.0001918155825589707, 'epoch': 0.04}


  4%|▍         | 698/16798 [02:40<57:18,  4.68it/s]

{'loss': 0.7782, 'grad_norm': 1.3124079704284668, 'learning_rate': 0.00019180366928758636, 'epoch': 0.04}


  4%|▍         | 699/16798 [02:41<58:22,  4.60it/s]

{'loss': 0.7462, 'grad_norm': 1.4483015537261963, 'learning_rate': 0.00019179175601620205, 'epoch': 0.04}


  4%|▍         | 700/16798 [02:41<57:29,  4.67it/s]

{'loss': 0.4249, 'grad_norm': 1.1347171068191528, 'learning_rate': 0.00019177984274481772, 'epoch': 0.04}


  4%|▍         | 702/16798 [02:41<57:07,  4.70it/s]

{'loss': 2.1933, 'grad_norm': 1.9606962203979492, 'learning_rate': 0.0001917679294734334, 'epoch': 0.04}


  4%|▍         | 703/16798 [02:41<57:13,  4.69it/s]

{'loss': 1.9817, 'grad_norm': 1.6861958503723145, 'learning_rate': 0.00019175601620204907, 'epoch': 0.04}


  4%|▍         | 703/16798 [02:41<57:13,  4.69it/s]

{'loss': 2.3631, 'grad_norm': 1.749089241027832, 'learning_rate': 0.00019174410293066476, 'epoch': 0.04}


  4%|▍         | 704/16798 [02:42<57:06,  4.70it/s]

{'loss': 2.0987, 'grad_norm': 1.8634501695632935, 'learning_rate': 0.00019173218965928043, 'epoch': 0.04}


  4%|▍         | 705/16798 [02:42<57:58,  4.63it/s]

{'loss': 2.1431, 'grad_norm': 1.7527209520339966, 'learning_rate': 0.00019172027638789615, 'epoch': 0.04}


  4%|▍         | 706/16798 [02:42<59:41,  4.49it/s]

{'loss': 2.5045, 'grad_norm': 2.2240540981292725, 'learning_rate': 0.0001917083631165118, 'epoch': 0.04}


  4%|▍         | 707/16798 [02:42<59:59,  4.47it/s]

{'loss': 2.3871, 'grad_norm': 1.9836840629577637, 'learning_rate': 0.0001916964498451275, 'epoch': 0.04}


  4%|▍         | 708/16798 [02:43<59:40,  4.49it/s]

{'loss': 1.7908, 'grad_norm': 1.77638840675354, 'learning_rate': 0.00019168453657374316, 'epoch': 0.04}


  4%|▍         | 709/16798 [02:43<59:33,  4.50it/s]

{'loss': 1.5585, 'grad_norm': 1.8299673795700073, 'learning_rate': 0.00019167262330235885, 'epoch': 0.04}


  4%|▍         | 711/16798 [02:43<58:17,  4.60it/s]

{'loss': 1.5211, 'grad_norm': 1.529987096786499, 'learning_rate': 0.00019166071003097452, 'epoch': 0.04}


  4%|▍         | 711/16798 [02:43<58:17,  4.60it/s]

{'loss': 1.5233, 'grad_norm': 1.5960613489151, 'learning_rate': 0.0001916487967595902, 'epoch': 0.04}


  4%|▍         | 712/16798 [02:43<58:14,  4.60it/s]

{'loss': 1.3701, 'grad_norm': 1.624371886253357, 'learning_rate': 0.00019163688348820587, 'epoch': 0.04}


  4%|▍         | 714/16798 [02:44<57:39,  4.65it/s]

{'loss': 1.5011, 'grad_norm': 1.6290501356124878, 'learning_rate': 0.00019162497021682156, 'epoch': 0.04}


  4%|▍         | 715/16798 [02:44<57:06,  4.69it/s]

{'loss': 1.8057, 'grad_norm': 2.0661120414733887, 'learning_rate': 0.00019161305694543723, 'epoch': 0.04}


  4%|▍         | 715/16798 [02:44<57:06,  4.69it/s]

{'loss': 1.1876, 'grad_norm': 1.6626931428909302, 'learning_rate': 0.00019160114367405292, 'epoch': 0.04}


  4%|▍         | 716/16798 [02:44<57:18,  4.68it/s]

{'loss': 1.6601, 'grad_norm': 1.7140357494354248, 'learning_rate': 0.00019158923040266858, 'epoch': 0.04}


  4%|▍         | 717/16798 [02:44<57:16,  4.68it/s]

{'loss': 1.8189, 'grad_norm': 2.0787150859832764, 'learning_rate': 0.00019157731713128427, 'epoch': 0.04}


  4%|▍         | 718/16798 [02:45<57:55,  4.63it/s]

{'loss': 1.5339, 'grad_norm': 1.4199596643447876, 'learning_rate': 0.00019156540385989994, 'epoch': 0.04}


  4%|▍         | 720/16798 [02:45<56:44,  4.72it/s]

{'loss': 1.6866, 'grad_norm': 1.9495898485183716, 'learning_rate': 0.00019155349058851563, 'epoch': 0.04}


  4%|▍         | 720/16798 [02:45<56:44,  4.72it/s]

{'loss': 2.1293, 'grad_norm': 1.9808859825134277, 'learning_rate': 0.0001915415773171313, 'epoch': 0.04}


  4%|▍         | 721/16798 [02:45<58:17,  4.60it/s]

{'loss': 1.4973, 'grad_norm': 1.775115966796875, 'learning_rate': 0.00019152966404574698, 'epoch': 0.04}


  4%|▍         | 722/16798 [02:46<58:08,  4.61it/s]

{'loss': 1.3745, 'grad_norm': 2.1230499744415283, 'learning_rate': 0.00019151775077436264, 'epoch': 0.04}


  4%|▍         | 724/16798 [02:46<57:01,  4.70it/s]

{'loss': 1.2825, 'grad_norm': 1.865043044090271, 'learning_rate': 0.00019150583750297834, 'epoch': 0.04}


  4%|▍         | 724/16798 [02:46<57:01,  4.70it/s]

{'loss': 2.0084, 'grad_norm': 2.0514163970947266, 'learning_rate': 0.000191493924231594, 'epoch': 0.04}


  4%|▍         | 725/16798 [02:46<58:23,  4.59it/s]

{'loss': 1.49, 'grad_norm': 2.089123487472534, 'learning_rate': 0.0001914820109602097, 'epoch': 0.04}


  4%|▍         | 727/16798 [02:47<57:56,  4.62it/s]

{'loss': 1.4912, 'grad_norm': 1.9562262296676636, 'learning_rate': 0.00019147009768882535, 'epoch': 0.04}


  4%|▍         | 727/16798 [02:47<57:56,  4.62it/s]

{'loss': 1.4087, 'grad_norm': 2.113961935043335, 'learning_rate': 0.00019145818441744104, 'epoch': 0.04}


  4%|▍         | 728/16798 [02:47<57:59,  4.62it/s]

{'loss': 1.7634, 'grad_norm': 2.002713680267334, 'learning_rate': 0.0001914462711460567, 'epoch': 0.04}


  4%|▍         | 729/16798 [02:47<58:46,  4.56it/s]

{'loss': 1.502, 'grad_norm': 2.341020107269287, 'learning_rate': 0.0001914343578746724, 'epoch': 0.04}


  4%|▍         | 730/16798 [02:47<58:48,  4.55it/s]

{'loss': 1.6958, 'grad_norm': 2.042858123779297, 'learning_rate': 0.00019142244460328806, 'epoch': 0.04}


  4%|▍         | 732/16798 [02:48<58:13,  4.60it/s]

{'loss': 2.049, 'grad_norm': 2.3206326961517334, 'learning_rate': 0.00019141053133190375, 'epoch': 0.04}


  4%|▍         | 733/16798 [02:48<57:20,  4.67it/s]

{'loss': 1.1858, 'grad_norm': 1.798403024673462, 'learning_rate': 0.00019139861806051942, 'epoch': 0.04}


  4%|▍         | 733/16798 [02:48<57:20,  4.67it/s]

{'loss': 1.8065, 'grad_norm': 1.9479106664657593, 'learning_rate': 0.0001913867047891351, 'epoch': 0.04}


  4%|▍         | 734/16798 [02:48<57:00,  4.70it/s]

{'loss': 1.4026, 'grad_norm': 1.9161487817764282, 'learning_rate': 0.00019137479151775077, 'epoch': 0.04}


  4%|▍         | 735/16798 [02:48<56:47,  4.71it/s]

{'loss': 1.5003, 'grad_norm': 1.9150766134262085, 'learning_rate': 0.00019136287824636646, 'epoch': 0.04}


  4%|▍         | 736/16798 [02:49<57:53,  4.62it/s]

{'loss': 1.5412, 'grad_norm': 1.9208394289016724, 'learning_rate': 0.00019135096497498215, 'epoch': 0.04}


  4%|▍         | 737/16798 [02:49<56:51,  4.71it/s]

{'loss': 1.4893, 'grad_norm': 1.8890607357025146, 'learning_rate': 0.00019133905170359782, 'epoch': 0.04}


  4%|▍         | 739/16798 [02:49<56:01,  4.78it/s]

{'loss': 1.4476, 'grad_norm': 1.7322957515716553, 'learning_rate': 0.0001913271384322135, 'epoch': 0.04}


  4%|▍         | 740/16798 [02:49<55:59,  4.78it/s]

{'loss': 1.3308, 'grad_norm': 1.9026557207107544, 'learning_rate': 0.00019131522516082917, 'epoch': 0.04}


  4%|▍         | 740/16798 [02:49<55:59,  4.78it/s]

{'loss': 1.2293, 'grad_norm': 1.7430790662765503, 'learning_rate': 0.00019130331188944486, 'epoch': 0.04}


  4%|▍         | 742/16798 [02:50<56:36,  4.73it/s]

{'loss': 1.2316, 'grad_norm': 1.7857102155685425, 'learning_rate': 0.00019129139861806052, 'epoch': 0.04}


  4%|▍         | 743/16798 [02:50<55:48,  4.79it/s]

{'loss': 1.3109, 'grad_norm': 1.737300157546997, 'learning_rate': 0.00019127948534667622, 'epoch': 0.04}


  4%|▍         | 744/16798 [02:50<55:34,  4.81it/s]

{'loss': 1.3736, 'grad_norm': 1.8900175094604492, 'learning_rate': 0.00019126757207529188, 'epoch': 0.04}


  4%|▍         | 744/16798 [02:50<55:34,  4.81it/s]

{'loss': 0.8298, 'grad_norm': 1.6286754608154297, 'learning_rate': 0.00019125565880390757, 'epoch': 0.04}


  4%|▍         | 746/16798 [02:51<55:37,  4.81it/s]

{'loss': 1.2334, 'grad_norm': 1.7815768718719482, 'learning_rate': 0.00019124374553252323, 'epoch': 0.04}


  4%|▍         | 746/16798 [02:51<55:37,  4.81it/s]

{'loss': 1.2059, 'grad_norm': 1.7751750946044922, 'learning_rate': 0.00019123183226113892, 'epoch': 0.04}


  4%|▍         | 748/16798 [02:51<55:13,  4.84it/s]

{'loss': 0.8535, 'grad_norm': 1.4246329069137573, 'learning_rate': 0.0001912199189897546, 'epoch': 0.04}


  4%|▍         | 748/16798 [02:51<55:13,  4.84it/s]

{'loss': 0.5704, 'grad_norm': 1.3398222923278809, 'learning_rate': 0.00019120800571837028, 'epoch': 0.04}


  4%|▍         | 749/16798 [02:51<55:30,  4.82it/s]

{'loss': 0.9045, 'grad_norm': 1.8815805912017822, 'learning_rate': 0.00019119609244698594, 'epoch': 0.04}


  4%|▍         | 750/16798 [02:52<56:17,  4.75it/s]

{'loss': 0.6908, 'grad_norm': 1.3192973136901855, 'learning_rate': 0.00019118417917560163, 'epoch': 0.04}


  4%|▍         | 751/16798 [02:52<57:59,  4.61it/s]

{'loss': 1.8923, 'grad_norm': 2.083181381225586, 'learning_rate': 0.0001911722659042173, 'epoch': 0.04}


  4%|▍         | 752/16798 [02:52<59:33,  4.49it/s]

{'loss': 1.8236, 'grad_norm': 1.7435011863708496, 'learning_rate': 0.000191160352632833, 'epoch': 0.04}


  4%|▍         | 753/16798 [02:52<1:00:22,  4.43it/s]

{'loss': 2.1942, 'grad_norm': 2.005570888519287, 'learning_rate': 0.00019114843936144865, 'epoch': 0.04}


  4%|▍         | 754/16798 [02:52<1:00:32,  4.42it/s]

{'loss': 2.2052, 'grad_norm': 1.9470131397247314, 'learning_rate': 0.00019113652609006434, 'epoch': 0.04}


  4%|▍         | 755/16798 [02:53<1:01:11,  4.37it/s]

{'loss': 2.5279, 'grad_norm': 1.9992280006408691, 'learning_rate': 0.00019112461281868, 'epoch': 0.04}


  5%|▍         | 756/16798 [02:53<1:00:47,  4.40it/s]

{'loss': 2.1409, 'grad_norm': 1.8864545822143555, 'learning_rate': 0.0001911126995472957, 'epoch': 0.05}


  5%|▍         | 757/16798 [02:53<1:02:18,  4.29it/s]

{'loss': 1.8926, 'grad_norm': 1.6767771244049072, 'learning_rate': 0.00019110078627591136, 'epoch': 0.05}


  5%|▍         | 758/16798 [02:53<1:00:42,  4.40it/s]

{'loss': 2.099, 'grad_norm': 1.7976267337799072, 'learning_rate': 0.00019108887300452705, 'epoch': 0.05}


  5%|▍         | 759/16798 [02:54<1:00:13,  4.44it/s]

{'loss': 2.3109, 'grad_norm': 2.0512166023254395, 'learning_rate': 0.00019107695973314271, 'epoch': 0.05}


  5%|▍         | 760/16798 [02:54<59:57,  4.46it/s]  

{'loss': 1.6483, 'grad_norm': 1.9884332418441772, 'learning_rate': 0.0001910650464617584, 'epoch': 0.05}


  5%|▍         | 761/16798 [02:54<1:00:01,  4.45it/s]

{'loss': 1.7419, 'grad_norm': 2.260653495788574, 'learning_rate': 0.00019105313319037407, 'epoch': 0.05}


  5%|▍         | 762/16798 [02:54<1:00:04,  4.45it/s]

{'loss': 2.1454, 'grad_norm': 2.021498918533325, 'learning_rate': 0.00019104121991898976, 'epoch': 0.05}


  5%|▍         | 763/16798 [02:54<59:15,  4.51it/s]  

{'loss': 1.5445, 'grad_norm': 1.9305245876312256, 'learning_rate': 0.00019102930664760542, 'epoch': 0.05}


  5%|▍         | 764/16798 [02:55<59:33,  4.49it/s]

{'loss': 1.3906, 'grad_norm': 1.4849697351455688, 'learning_rate': 0.00019101739337622111, 'epoch': 0.05}


  5%|▍         | 765/16798 [02:55<59:20,  4.50it/s]

{'loss': 1.3637, 'grad_norm': 1.4385509490966797, 'learning_rate': 0.00019100548010483678, 'epoch': 0.05}


  5%|▍         | 766/16798 [02:55<58:58,  4.53it/s]

{'loss': 1.2315, 'grad_norm': 1.5361342430114746, 'learning_rate': 0.00019099356683345247, 'epoch': 0.05}


  5%|▍         | 767/16798 [02:55<59:58,  4.46it/s]

{'loss': 1.4849, 'grad_norm': 1.7373743057250977, 'learning_rate': 0.00019098165356206816, 'epoch': 0.05}


  5%|▍         | 768/16798 [02:56<1:00:19,  4.43it/s]

{'loss': 1.321, 'grad_norm': 1.506162166595459, 'learning_rate': 0.00019096974029068385, 'epoch': 0.05}


  5%|▍         | 769/16798 [02:56<59:15,  4.51it/s]  

{'loss': 1.9518, 'grad_norm': 2.1760621070861816, 'learning_rate': 0.00019095782701929951, 'epoch': 0.05}


  5%|▍         | 770/16798 [02:56<59:21,  4.50it/s]

{'loss': 1.6269, 'grad_norm': 2.348519802093506, 'learning_rate': 0.0001909459137479152, 'epoch': 0.05}


  5%|▍         | 771/16798 [02:56<58:22,  4.58it/s]

{'loss': 1.6962, 'grad_norm': 1.7852137088775635, 'learning_rate': 0.00019093400047653087, 'epoch': 0.05}


  5%|▍         | 773/16798 [02:57<57:23,  4.65it/s]

{'loss': 1.6879, 'grad_norm': 1.8148365020751953, 'learning_rate': 0.00019092208720514656, 'epoch': 0.05}


  5%|▍         | 773/16798 [02:57<57:23,  4.65it/s]

{'loss': 1.431, 'grad_norm': 1.6759576797485352, 'learning_rate': 0.00019091017393376222, 'epoch': 0.05}


  5%|▍         | 774/16798 [02:57<58:00,  4.60it/s]

{'loss': 1.7845, 'grad_norm': 1.8634414672851562, 'learning_rate': 0.0001908982606623779, 'epoch': 0.05}


  5%|▍         | 776/16798 [02:57<57:41,  4.63it/s]

{'loss': 1.7744, 'grad_norm': 1.9208177328109741, 'learning_rate': 0.00019088634739099358, 'epoch': 0.05}


  5%|▍         | 776/16798 [02:57<57:41,  4.63it/s]

{'loss': 1.1581, 'grad_norm': 1.4314528703689575, 'learning_rate': 0.00019087443411960927, 'epoch': 0.05}


  5%|▍         | 777/16798 [02:58<57:51,  4.61it/s]

{'loss': 1.7536, 'grad_norm': 2.071823835372925, 'learning_rate': 0.00019086252084822493, 'epoch': 0.05}


  5%|▍         | 778/16798 [02:58<57:57,  4.61it/s]

{'loss': 1.5461, 'grad_norm': 1.7456815242767334, 'learning_rate': 0.00019085060757684062, 'epoch': 0.05}


  5%|▍         | 779/16798 [02:58<58:38,  4.55it/s]

{'loss': 1.4431, 'grad_norm': 1.8753553628921509, 'learning_rate': 0.00019083869430545629, 'epoch': 0.05}


  5%|▍         | 780/16798 [02:58<58:31,  4.56it/s]

{'loss': 1.9694, 'grad_norm': 2.033674478530884, 'learning_rate': 0.00019082678103407198, 'epoch': 0.05}


  5%|▍         | 781/16798 [02:58<58:34,  4.56it/s]

{'loss': 1.3432, 'grad_norm': 1.9747319221496582, 'learning_rate': 0.00019081486776268764, 'epoch': 0.05}


  5%|▍         | 783/16798 [02:59<57:34,  4.64it/s]

{'loss': 1.4888, 'grad_norm': 1.6352235078811646, 'learning_rate': 0.00019080295449130333, 'epoch': 0.05}


  5%|▍         | 783/16798 [02:59<57:34,  4.64it/s]

{'loss': 1.6182, 'grad_norm': 1.7003750801086426, 'learning_rate': 0.000190791041219919, 'epoch': 0.05}


  5%|▍         | 785/16798 [02:59<56:55,  4.69it/s]

{'loss': 1.4166, 'grad_norm': 1.5892857313156128, 'learning_rate': 0.00019077912794853469, 'epoch': 0.05}


  5%|▍         | 786/16798 [02:59<56:43,  4.70it/s]

{'loss': 1.3813, 'grad_norm': 1.9008976221084595, 'learning_rate': 0.00019076721467715035, 'epoch': 0.05}


  5%|▍         | 786/16798 [02:59<56:43,  4.70it/s]

{'loss': 1.684, 'grad_norm': 1.7497066259384155, 'learning_rate': 0.00019075530140576604, 'epoch': 0.05}


  5%|▍         | 788/16798 [03:00<55:36,  4.80it/s]

{'loss': 1.2983, 'grad_norm': 2.0052449703216553, 'learning_rate': 0.0001907433881343817, 'epoch': 0.05}


  5%|▍         | 788/16798 [03:00<55:36,  4.80it/s]

{'loss': 1.7681, 'grad_norm': 2.290151596069336, 'learning_rate': 0.0001907314748629974, 'epoch': 0.05}


  5%|▍         | 790/16798 [03:00<56:22,  4.73it/s]

{'loss': 1.0781, 'grad_norm': 2.0531797409057617, 'learning_rate': 0.00019071956159161306, 'epoch': 0.05}


  5%|▍         | 790/16798 [03:00<56:22,  4.73it/s]

{'loss': 1.682, 'grad_norm': 2.0117595195770264, 'learning_rate': 0.00019070764832022875, 'epoch': 0.05}


  5%|▍         | 792/16798 [03:01<57:06,  4.67it/s]

{'loss': 1.6635, 'grad_norm': 2.162740707397461, 'learning_rate': 0.0001906957350488444, 'epoch': 0.05}


  5%|▍         | 793/16798 [03:01<56:07,  4.75it/s]

{'loss': 1.2622, 'grad_norm': 1.822646975517273, 'learning_rate': 0.0001906838217774601, 'epoch': 0.05}


  5%|▍         | 793/16798 [03:01<56:07,  4.75it/s]

{'loss': 1.1249, 'grad_norm': 1.7245506048202515, 'learning_rate': 0.00019067190850607577, 'epoch': 0.05}


  5%|▍         | 794/16798 [03:01<56:08,  4.75it/s]

{'loss': 1.3271, 'grad_norm': 1.8773446083068848, 'learning_rate': 0.00019065999523469146, 'epoch': 0.05}


  5%|▍         | 796/16798 [03:02<57:09,  4.67it/s]

{'loss': 0.7208, 'grad_norm': 1.3497425317764282, 'learning_rate': 0.00019064808196330712, 'epoch': 0.05}


  5%|▍         | 797/16798 [03:02<55:47,  4.78it/s]

{'loss': 1.1121, 'grad_norm': 1.9362210035324097, 'learning_rate': 0.0001906361686919228, 'epoch': 0.05}


  5%|▍         | 798/16798 [03:02<55:17,  4.82it/s]

{'loss': 0.7134, 'grad_norm': 1.4774789810180664, 'learning_rate': 0.0001906242554205385, 'epoch': 0.05}


  5%|▍         | 798/16798 [03:02<55:17,  4.82it/s]

{'loss': 0.6993, 'grad_norm': 1.4214019775390625, 'learning_rate': 0.00019061234214915417, 'epoch': 0.05}


  5%|▍         | 800/16798 [03:02<55:44,  4.78it/s]

{'loss': 0.6168, 'grad_norm': 1.3470532894134521, 'learning_rate': 0.00019060042887776986, 'epoch': 0.05}


  5%|▍         | 800/16798 [03:02<55:44,  4.78it/s]

{'loss': 0.8574, 'grad_norm': 1.9584558010101318, 'learning_rate': 0.00019058851560638552, 'epoch': 0.05}


  5%|▍         | 801/16798 [03:03<56:48,  4.69it/s]

{'loss': 1.895, 'grad_norm': 1.8253053426742554, 'learning_rate': 0.0001905766023350012, 'epoch': 0.05}


  5%|▍         | 802/16798 [03:03<57:20,  4.65it/s]

{'loss': 1.991, 'grad_norm': 2.0995779037475586, 'learning_rate': 0.00019056468906361688, 'epoch': 0.05}


  5%|▍         | 803/16798 [03:03<1:00:01,  4.44it/s]

{'loss': 1.9391, 'grad_norm': 2.326353073120117, 'learning_rate': 0.00019055277579223257, 'epoch': 0.05}


  5%|▍         | 804/16798 [03:03<1:00:43,  4.39it/s]

{'loss': 2.2698, 'grad_norm': 2.0907840728759766, 'learning_rate': 0.00019054086252084823, 'epoch': 0.05}


  5%|▍         | 805/16798 [03:04<1:01:35,  4.33it/s]

{'loss': 2.3818, 'grad_norm': 2.0925099849700928, 'learning_rate': 0.00019052894924946392, 'epoch': 0.05}


  5%|▍         | 806/16798 [03:04<1:01:11,  4.36it/s]

{'loss': 2.0003, 'grad_norm': 2.201592445373535, 'learning_rate': 0.00019051703597807958, 'epoch': 0.05}


  5%|▍         | 807/16798 [03:04<1:00:29,  4.41it/s]

{'loss': 2.0414, 'grad_norm': 2.1035611629486084, 'learning_rate': 0.00019050512270669528, 'epoch': 0.05}


  5%|▍         | 808/16798 [03:04<1:00:40,  4.39it/s]

{'loss': 1.7987, 'grad_norm': 1.8717104196548462, 'learning_rate': 0.00019049320943531094, 'epoch': 0.05}


  5%|▍         | 809/16798 [03:05<59:57,  4.44it/s]  

{'loss': 2.1234, 'grad_norm': 2.0986180305480957, 'learning_rate': 0.00019048129616392663, 'epoch': 0.05}


  5%|▍         | 810/16798 [03:05<1:00:05,  4.43it/s]

{'loss': 2.3135, 'grad_norm': 2.057420015335083, 'learning_rate': 0.0001904693828925423, 'epoch': 0.05}


  5%|▍         | 811/16798 [03:05<59:41,  4.46it/s]  

{'loss': 1.9935, 'grad_norm': 1.9895904064178467, 'learning_rate': 0.00019045746962115798, 'epoch': 0.05}


  5%|▍         | 812/16798 [03:05<59:53,  4.45it/s]

{'loss': 1.4745, 'grad_norm': 1.9350024461746216, 'learning_rate': 0.00019044555634977365, 'epoch': 0.05}


  5%|▍         | 813/16798 [03:05<1:00:47,  4.38it/s]

{'loss': 1.81, 'grad_norm': 1.9515295028686523, 'learning_rate': 0.00019043364307838934, 'epoch': 0.05}


  5%|▍         | 814/16798 [03:06<1:00:15,  4.42it/s]

{'loss': 1.7032, 'grad_norm': 1.7902600765228271, 'learning_rate': 0.000190421729807005, 'epoch': 0.05}


  5%|▍         | 816/16798 [03:06<59:56,  4.44it/s]  

{'loss': 1.6036, 'grad_norm': 2.24596905708313, 'learning_rate': 0.0001904098165356207, 'epoch': 0.05}


  5%|▍         | 816/16798 [03:06<59:56,  4.44it/s]

{'loss': 1.6507, 'grad_norm': 1.857176423072815, 'learning_rate': 0.00019039790326423636, 'epoch': 0.05}


  5%|▍         | 817/16798 [03:06<58:51,  4.52it/s]

{'loss': 1.6528, 'grad_norm': 1.7223284244537354, 'learning_rate': 0.00019038598999285205, 'epoch': 0.05}


  5%|▍         | 818/16798 [03:07<58:25,  4.56it/s]

{'loss': 1.3312, 'grad_norm': 1.5063228607177734, 'learning_rate': 0.0001903740767214677, 'epoch': 0.05}


  5%|▍         | 820/16798 [03:07<57:09,  4.66it/s]

{'loss': 1.4687, 'grad_norm': 1.5849202871322632, 'learning_rate': 0.0001903621634500834, 'epoch': 0.05}


  5%|▍         | 820/16798 [03:07<57:09,  4.66it/s]

{'loss': 2.1559, 'grad_norm': 2.3391621112823486, 'learning_rate': 0.00019035025017869907, 'epoch': 0.05}


  5%|▍         | 821/16798 [03:07<56:19,  4.73it/s]

{'loss': 1.4202, 'grad_norm': 1.698606014251709, 'learning_rate': 0.00019033833690731476, 'epoch': 0.05}


  5%|▍         | 823/16798 [03:08<56:16,  4.73it/s]

{'loss': 1.881, 'grad_norm': 2.0895793437957764, 'learning_rate': 0.00019032642363593042, 'epoch': 0.05}


  5%|▍         | 823/16798 [03:08<56:16,  4.73it/s]

{'loss': 1.6601, 'grad_norm': 1.8840833902359009, 'learning_rate': 0.0001903145103645461, 'epoch': 0.05}


  5%|▍         | 825/16798 [03:08<55:45,  4.77it/s]

{'loss': 1.5651, 'grad_norm': 1.6669942140579224, 'learning_rate': 0.00019030259709316177, 'epoch': 0.05}


  5%|▍         | 825/16798 [03:08<55:45,  4.77it/s]

{'loss': 1.4933, 'grad_norm': 1.5679348707199097, 'learning_rate': 0.00019029068382177746, 'epoch': 0.05}


  5%|▍         | 827/16798 [03:08<55:16,  4.82it/s]

{'loss': 1.5092, 'grad_norm': 1.6331391334533691, 'learning_rate': 0.00019027877055039313, 'epoch': 0.05}


  5%|▍         | 828/16798 [03:09<55:32,  4.79it/s]

{'loss': 1.409, 'grad_norm': 1.5832830667495728, 'learning_rate': 0.00019026685727900882, 'epoch': 0.05}


  5%|▍         | 828/16798 [03:09<55:32,  4.79it/s]

{'loss': 1.3033, 'grad_norm': 1.7826411724090576, 'learning_rate': 0.0001902549440076245, 'epoch': 0.05}


  5%|▍         | 829/16798 [03:09<55:57,  4.76it/s]

{'loss': 1.8665, 'grad_norm': 2.1001434326171875, 'learning_rate': 0.0001902430307362402, 'epoch': 0.05}


  5%|▍         | 830/16798 [03:09<56:56,  4.67it/s]

{'loss': 1.6901, 'grad_norm': 1.8633062839508057, 'learning_rate': 0.00019023111746485586, 'epoch': 0.05}


  5%|▍         | 832/16798 [03:09<55:59,  4.75it/s]

{'loss': 1.5822, 'grad_norm': 1.972965121269226, 'learning_rate': 0.00019021920419347156, 'epoch': 0.05}


  5%|▍         | 832/16798 [03:09<55:59,  4.75it/s]

{'loss': 1.5329, 'grad_norm': 1.811586618423462, 'learning_rate': 0.00019020729092208722, 'epoch': 0.05}


  5%|▍         | 834/16798 [03:10<55:30,  4.79it/s]

{'loss': 1.5921, 'grad_norm': 2.231154680252075, 'learning_rate': 0.0001901953776507029, 'epoch': 0.05}


  5%|▍         | 834/16798 [03:10<55:30,  4.79it/s]

{'loss': 1.6025, 'grad_norm': 1.8042727708816528, 'learning_rate': 0.00019018346437931857, 'epoch': 0.05}


  5%|▍         | 836/16798 [03:10<55:57,  4.75it/s]

{'loss': 2.0758, 'grad_norm': 2.1687769889831543, 'learning_rate': 0.00019017155110793426, 'epoch': 0.05}


  5%|▍         | 836/16798 [03:10<55:57,  4.75it/s]

{'loss': 1.2735, 'grad_norm': 1.81400728225708, 'learning_rate': 0.00019015963783654993, 'epoch': 0.05}


  5%|▍         | 838/16798 [03:11<56:07,  4.74it/s]

{'loss': 1.2912, 'grad_norm': 1.6070311069488525, 'learning_rate': 0.00019014772456516562, 'epoch': 0.05}


  5%|▍         | 839/16798 [03:11<55:11,  4.82it/s]

{'loss': 1.2899, 'grad_norm': 1.6600723266601562, 'learning_rate': 0.00019013581129378128, 'epoch': 0.05}


  5%|▌         | 840/16798 [03:11<54:37,  4.87it/s]

{'loss': 1.4444, 'grad_norm': 1.7693606615066528, 'learning_rate': 0.00019012389802239697, 'epoch': 0.05}


  5%|▌         | 841/16798 [03:11<54:11,  4.91it/s]

{'loss': 1.4691, 'grad_norm': 1.8521568775177002, 'learning_rate': 0.00019011198475101264, 'epoch': 0.05}


  5%|▌         | 841/16798 [03:11<54:11,  4.91it/s]

{'loss': 1.0243, 'grad_norm': 1.8661543130874634, 'learning_rate': 0.00019010007147962833, 'epoch': 0.05}


  5%|▌         | 843/16798 [03:12<54:34,  4.87it/s]

{'loss': 1.2578, 'grad_norm': 1.682490348815918, 'learning_rate': 0.000190088158208244, 'epoch': 0.05}


  5%|▌         | 844/16798 [03:12<54:45,  4.86it/s]

{'loss': 1.6355, 'grad_norm': 2.075263738632202, 'learning_rate': 0.00019007624493685968, 'epoch': 0.05}


  5%|▌         | 845/16798 [03:12<54:22,  4.89it/s]

{'loss': 0.9518, 'grad_norm': 1.5534114837646484, 'learning_rate': 0.00019006433166547535, 'epoch': 0.05}


  5%|▌         | 846/16798 [03:12<53:49,  4.94it/s]

{'loss': 0.8721, 'grad_norm': 1.6122931241989136, 'learning_rate': 0.00019005241839409104, 'epoch': 0.05}


  5%|▌         | 847/16798 [03:13<53:42,  4.95it/s]

{'loss': 0.7398, 'grad_norm': 1.3563016653060913, 'learning_rate': 0.0001900405051227067, 'epoch': 0.05}


  5%|▌         | 847/16798 [03:13<53:42,  4.95it/s]

{'loss': 1.2094, 'grad_norm': 2.082747459411621, 'learning_rate': 0.0001900285918513224, 'epoch': 0.05}


  5%|▌         | 849/16798 [03:13<53:59,  4.92it/s]

{'loss': 0.6178, 'grad_norm': 1.3020697832107544, 'learning_rate': 0.00019001667857993805, 'epoch': 0.05}


  5%|▌         | 850/16798 [03:13<53:30,  4.97it/s]

{'loss': 0.3157, 'grad_norm': 1.2696211338043213, 'learning_rate': 0.00019000476530855374, 'epoch': 0.05}


  5%|▌         | 850/16798 [03:13<53:30,  4.97it/s]

{'loss': 0.2535, 'grad_norm': 0.8155974745750427, 'learning_rate': 0.0001899928520371694, 'epoch': 0.05}


  5%|▌         | 851/16798 [03:13<55:43,  4.77it/s]

{'loss': 2.5387, 'grad_norm': 2.147615432739258, 'learning_rate': 0.0001899809387657851, 'epoch': 0.05}


  5%|▌         | 852/16798 [03:14<56:32,  4.70it/s]

{'loss': 2.2184, 'grad_norm': 2.220660448074341, 'learning_rate': 0.00018996902549440076, 'epoch': 0.05}


  5%|▌         | 853/16798 [03:14<56:57,  4.67it/s]

{'loss': 2.3651, 'grad_norm': 2.128910779953003, 'learning_rate': 0.00018995711222301645, 'epoch': 0.05}


  5%|▌         | 854/16798 [03:14<57:03,  4.66it/s]

{'loss': 2.3312, 'grad_norm': 2.2736527919769287, 'learning_rate': 0.00018994519895163212, 'epoch': 0.05}


  5%|▌         | 855/16798 [03:14<57:20,  4.63it/s]

{'loss': 2.2092, 'grad_norm': 2.116311550140381, 'learning_rate': 0.0001899332856802478, 'epoch': 0.05}


  5%|▌         | 856/16798 [03:14<56:57,  4.66it/s]

{'loss': 1.8049, 'grad_norm': 1.8814952373504639, 'learning_rate': 0.00018992137240886347, 'epoch': 0.05}


  5%|▌         | 858/16798 [03:15<55:57,  4.75it/s]

{'loss': 1.6643, 'grad_norm': 1.765346884727478, 'learning_rate': 0.00018990945913747916, 'epoch': 0.05}


  5%|▌         | 858/16798 [03:15<55:57,  4.75it/s]

{'loss': 2.2764, 'grad_norm': 2.257655143737793, 'learning_rate': 0.00018989754586609483, 'epoch': 0.05}


  5%|▌         | 860/16798 [03:15<56:01,  4.74it/s]

{'loss': 1.6715, 'grad_norm': 1.8468362092971802, 'learning_rate': 0.00018988563259471052, 'epoch': 0.05}


  5%|▌         | 861/16798 [03:16<56:17,  4.72it/s]

{'loss': 1.7546, 'grad_norm': 1.8330812454223633, 'learning_rate': 0.0001898737193233262, 'epoch': 0.05}


  5%|▌         | 862/16798 [03:16<55:25,  4.79it/s]

{'loss': 1.8547, 'grad_norm': 2.0783464908599854, 'learning_rate': 0.00018986180605194187, 'epoch': 0.05}


  5%|▌         | 863/16798 [03:16<55:08,  4.82it/s]

{'loss': 1.7212, 'grad_norm': 1.8388092517852783, 'learning_rate': 0.00018984989278055756, 'epoch': 0.05}


  5%|▌         | 863/16798 [03:16<55:08,  4.82it/s]

{'loss': 1.4301, 'grad_norm': 1.6875574588775635, 'learning_rate': 0.00018983797950917323, 'epoch': 0.05}


  5%|▌         | 865/16798 [03:16<55:13,  4.81it/s]

{'loss': 1.7026, 'grad_norm': 1.835723876953125, 'learning_rate': 0.00018982606623778892, 'epoch': 0.05}


  5%|▌         | 865/16798 [03:16<55:13,  4.81it/s]

{'loss': 1.6019, 'grad_norm': 2.1055848598480225, 'learning_rate': 0.00018981415296640458, 'epoch': 0.05}


  5%|▌         | 867/16798 [03:17<54:30,  4.87it/s]

{'loss': 1.9087, 'grad_norm': 2.008970022201538, 'learning_rate': 0.00018980223969502027, 'epoch': 0.05}


  5%|▌         | 867/16798 [03:17<54:30,  4.87it/s]

{'loss': 1.4359, 'grad_norm': 1.5752484798431396, 'learning_rate': 0.00018979032642363593, 'epoch': 0.05}


  5%|▌         | 868/16798 [03:17<54:10,  4.90it/s]

{'loss': 1.9363, 'grad_norm': 1.9049582481384277, 'learning_rate': 0.00018977841315225163, 'epoch': 0.05}


  5%|▌         | 869/16798 [03:17<54:10,  4.90it/s]

{'loss': 1.5489, 'grad_norm': 1.6637940406799316, 'learning_rate': 0.0001897664998808673, 'epoch': 0.05}


  5%|▌         | 871/16798 [03:18<54:01,  4.91it/s]

{'loss': 1.551, 'grad_norm': 1.689407229423523, 'learning_rate': 0.00018975458660948298, 'epoch': 0.05}


  5%|▌         | 871/16798 [03:18<54:01,  4.91it/s]

{'loss': 1.3768, 'grad_norm': 1.9919698238372803, 'learning_rate': 0.00018974267333809864, 'epoch': 0.05}


  5%|▌         | 872/16798 [03:18<53:54,  4.92it/s]

{'loss': 1.5598, 'grad_norm': 1.71713125705719, 'learning_rate': 0.00018973076006671433, 'epoch': 0.05}


  5%|▌         | 873/16798 [03:18<54:31,  4.87it/s]

{'loss': 1.3267, 'grad_norm': 1.759618878364563, 'learning_rate': 0.00018971884679533, 'epoch': 0.05}


  5%|▌         | 875/16798 [03:18<55:02,  4.82it/s]

{'loss': 1.4195, 'grad_norm': 1.699475646018982, 'learning_rate': 0.0001897069335239457, 'epoch': 0.05}


  5%|▌         | 876/16798 [03:19<54:49,  4.84it/s]

{'loss': 1.3469, 'grad_norm': 1.7313382625579834, 'learning_rate': 0.00018969502025256135, 'epoch': 0.05}


  5%|▌         | 877/16798 [03:19<55:28,  4.78it/s]

{'loss': 1.6709, 'grad_norm': 1.6722029447555542, 'learning_rate': 0.00018968310698117704, 'epoch': 0.05}


  5%|▌         | 877/16798 [03:19<55:28,  4.78it/s]

{'loss': 1.7258, 'grad_norm': 2.1834661960601807, 'learning_rate': 0.0001896711937097927, 'epoch': 0.05}


  5%|▌         | 879/16798 [03:19<55:15,  4.80it/s]

{'loss': 1.7819, 'grad_norm': 2.020475149154663, 'learning_rate': 0.0001896592804384084, 'epoch': 0.05}


  5%|▌         | 880/16798 [03:19<54:53,  4.83it/s]

{'loss': 1.2993, 'grad_norm': 1.6504731178283691, 'learning_rate': 0.00018964736716702406, 'epoch': 0.05}


  5%|▌         | 881/16798 [03:20<54:43,  4.85it/s]

{'loss': 1.4234, 'grad_norm': 1.8790149688720703, 'learning_rate': 0.00018963545389563975, 'epoch': 0.05}


  5%|▌         | 881/16798 [03:20<54:43,  4.85it/s]

{'loss': 1.343, 'grad_norm': 1.6434285640716553, 'learning_rate': 0.00018962354062425542, 'epoch': 0.05}


  5%|▌         | 882/16798 [03:20<55:01,  4.82it/s]

{'loss': 2.0939, 'grad_norm': 2.3087401390075684, 'learning_rate': 0.0001896116273528711, 'epoch': 0.05}


  5%|▌         | 884/16798 [03:20<54:50,  4.84it/s]

{'loss': 1.6357, 'grad_norm': 2.1365396976470947, 'learning_rate': 0.00018959971408148677, 'epoch': 0.05}


  5%|▌         | 884/16798 [03:20<54:50,  4.84it/s]

{'loss': 0.9507, 'grad_norm': 1.3915929794311523, 'learning_rate': 0.00018958780081010246, 'epoch': 0.05}


  5%|▌         | 886/16798 [03:21<54:59,  4.82it/s]

{'loss': 1.7906, 'grad_norm': 2.1702051162719727, 'learning_rate': 0.00018957588753871812, 'epoch': 0.05}


  5%|▌         | 887/16798 [03:21<54:20,  4.88it/s]

{'loss': 1.2945, 'grad_norm': 1.6151862144470215, 'learning_rate': 0.00018956397426733382, 'epoch': 0.05}


  5%|▌         | 888/16798 [03:21<54:02,  4.91it/s]

{'loss': 1.0938, 'grad_norm': 1.5906826257705688, 'learning_rate': 0.00018955206099594948, 'epoch': 0.05}


  5%|▌         | 888/16798 [03:21<54:02,  4.91it/s]

{'loss': 1.2355, 'grad_norm': 1.8819440603256226, 'learning_rate': 0.00018954014772456517, 'epoch': 0.05}


  5%|▌         | 890/16798 [03:22<54:52,  4.83it/s]

{'loss': 1.0753, 'grad_norm': 1.8111649751663208, 'learning_rate': 0.00018952823445318083, 'epoch': 0.05}


  5%|▌         | 890/16798 [03:22<54:52,  4.83it/s]

{'loss': 1.1445, 'grad_norm': 1.6693525314331055, 'learning_rate': 0.00018951632118179655, 'epoch': 0.05}


  5%|▌         | 892/16798 [03:22<54:38,  4.85it/s]

{'loss': 1.3176, 'grad_norm': 1.7130684852600098, 'learning_rate': 0.00018950440791041221, 'epoch': 0.05}


  5%|▌         | 892/16798 [03:22<54:38,  4.85it/s]

{'loss': 1.548, 'grad_norm': 2.077601432800293, 'learning_rate': 0.0001894924946390279, 'epoch': 0.05}


  5%|▌         | 894/16798 [03:22<54:36,  4.85it/s]

{'loss': 1.2439, 'grad_norm': 1.7200108766555786, 'learning_rate': 0.00018948058136764357, 'epoch': 0.05}


  5%|▌         | 895/16798 [03:23<54:00,  4.91it/s]

{'loss': 1.5419, 'grad_norm': 2.022510290145874, 'learning_rate': 0.00018946866809625926, 'epoch': 0.05}


  5%|▌         | 896/16798 [03:23<53:40,  4.94it/s]

{'loss': 0.804, 'grad_norm': 1.4632219076156616, 'learning_rate': 0.00018945675482487492, 'epoch': 0.05}


  5%|▌         | 897/16798 [03:23<53:31,  4.95it/s]

{'loss': 0.6481, 'grad_norm': 1.2979031801223755, 'learning_rate': 0.00018944484155349061, 'epoch': 0.05}


  5%|▌         | 898/16798 [03:23<53:40,  4.94it/s]

{'loss': 0.9619, 'grad_norm': 1.5950844287872314, 'learning_rate': 0.00018943292828210628, 'epoch': 0.05}


  5%|▌         | 899/16798 [03:23<53:03,  4.99it/s]

{'loss': 0.702, 'grad_norm': 1.6069060564041138, 'learning_rate': 0.00018942101501072197, 'epoch': 0.05}


  5%|▌         | 899/16798 [03:23<53:03,  4.99it/s]

{'loss': 0.3568, 'grad_norm': 0.9999698996543884, 'learning_rate': 0.00018940910173933763, 'epoch': 0.05}


  5%|▌         | 900/16798 [03:24<53:18,  4.97it/s]

{'loss': 0.6261, 'grad_norm': 1.5089569091796875, 'learning_rate': 0.00018939718846795332, 'epoch': 0.05}


  5%|▌         | 902/16798 [03:24<55:13,  4.80it/s]

{'loss': 2.1956, 'grad_norm': 1.8736408948898315, 'learning_rate': 0.000189385275196569, 'epoch': 0.05}


  5%|▌         | 902/16798 [03:24<55:13,  4.80it/s]

{'loss': 2.1082, 'grad_norm': 1.7514430284500122, 'learning_rate': 0.00018937336192518468, 'epoch': 0.05}


  5%|▌         | 903/16798 [03:24<55:54,  4.74it/s]

{'loss': 2.154, 'grad_norm': 1.9079645872116089, 'learning_rate': 0.00018936144865380034, 'epoch': 0.05}


  5%|▌         | 904/16798 [03:24<56:09,  4.72it/s]

{'loss': 2.2061, 'grad_norm': 1.9813838005065918, 'learning_rate': 0.00018934953538241603, 'epoch': 0.05}


  5%|▌         | 905/16798 [03:25<57:23,  4.62it/s]

{'loss': 2.0122, 'grad_norm': 1.7976391315460205, 'learning_rate': 0.0001893376221110317, 'epoch': 0.05}


  5%|▌         | 907/16798 [03:25<57:05,  4.64it/s]

{'loss': 2.4708, 'grad_norm': 2.0699613094329834, 'learning_rate': 0.0001893257088396474, 'epoch': 0.05}


  5%|▌         | 907/16798 [03:25<57:05,  4.64it/s]

{'loss': 1.6854, 'grad_norm': 1.7905049324035645, 'learning_rate': 0.00018931379556826305, 'epoch': 0.05}


  5%|▌         | 908/16798 [03:25<57:08,  4.63it/s]

{'loss': 2.2369, 'grad_norm': 2.070136070251465, 'learning_rate': 0.00018930188229687874, 'epoch': 0.05}


  5%|▌         | 909/16798 [03:26<57:39,  4.59it/s]

{'loss': 2.1276, 'grad_norm': 2.317284345626831, 'learning_rate': 0.0001892899690254944, 'epoch': 0.05}


  5%|▌         | 910/16798 [03:26<58:19,  4.54it/s]

{'loss': 1.7234, 'grad_norm': 2.1359238624572754, 'learning_rate': 0.0001892780557541101, 'epoch': 0.05}


  5%|▌         | 912/16798 [03:26<57:24,  4.61it/s]

{'loss': 2.0235, 'grad_norm': 2.056838274002075, 'learning_rate': 0.00018926614248272576, 'epoch': 0.05}


  5%|▌         | 912/16798 [03:26<57:24,  4.61it/s]

{'loss': 1.7693, 'grad_norm': 1.797628402709961, 'learning_rate': 0.00018925422921134145, 'epoch': 0.05}


  5%|▌         | 913/16798 [03:26<57:06,  4.64it/s]

{'loss': 1.7804, 'grad_norm': 1.9331642389297485, 'learning_rate': 0.00018924231593995711, 'epoch': 0.05}


  5%|▌         | 914/16798 [03:27<56:47,  4.66it/s]

{'loss': 1.6377, 'grad_norm': 1.797864556312561, 'learning_rate': 0.0001892304026685728, 'epoch': 0.05}


  5%|▌         | 916/16798 [03:27<56:27,  4.69it/s]

{'loss': 1.5721, 'grad_norm': 1.780566930770874, 'learning_rate': 0.00018921848939718847, 'epoch': 0.05}


  5%|▌         | 916/16798 [03:27<56:27,  4.69it/s]

{'loss': 1.5126, 'grad_norm': 1.5900932550430298, 'learning_rate': 0.00018920657612580416, 'epoch': 0.05}


  5%|▌         | 918/16798 [03:27<56:38,  4.67it/s]

{'loss': 1.5505, 'grad_norm': 1.8506709337234497, 'learning_rate': 0.00018919466285441982, 'epoch': 0.05}


  5%|▌         | 919/16798 [03:28<55:22,  4.78it/s]

{'loss': 1.6749, 'grad_norm': 1.7806646823883057, 'learning_rate': 0.0001891827495830355, 'epoch': 0.05}


  5%|▌         | 920/16798 [03:28<54:52,  4.82it/s]

{'loss': 1.7004, 'grad_norm': 1.941994547843933, 'learning_rate': 0.00018917083631165118, 'epoch': 0.05}


  5%|▌         | 921/16798 [03:28<55:05,  4.80it/s]

{'loss': 2.3527, 'grad_norm': 2.2131824493408203, 'learning_rate': 0.00018915892304026687, 'epoch': 0.05}


  5%|▌         | 921/16798 [03:28<55:05,  4.80it/s]

{'loss': 1.396, 'grad_norm': 1.8413976430892944, 'learning_rate': 0.00018914700976888256, 'epoch': 0.05}


  5%|▌         | 923/16798 [03:28<55:19,  4.78it/s]

{'loss': 1.7021, 'grad_norm': 1.9618102312088013, 'learning_rate': 0.00018913509649749822, 'epoch': 0.05}


  6%|▌         | 924/16798 [03:29<54:57,  4.81it/s]

{'loss': 2.0636, 'grad_norm': 2.1737725734710693, 'learning_rate': 0.0001891231832261139, 'epoch': 0.05}


  6%|▌         | 925/16798 [03:29<54:28,  4.86it/s]

{'loss': 1.7851, 'grad_norm': 2.002532482147217, 'learning_rate': 0.00018911126995472958, 'epoch': 0.06}


  6%|▌         | 926/16798 [03:29<54:18,  4.87it/s]

{'loss': 1.3938, 'grad_norm': 1.871903657913208, 'learning_rate': 0.00018909935668334527, 'epoch': 0.06}


  6%|▌         | 927/16798 [03:29<54:10,  4.88it/s]

{'loss': 1.8382, 'grad_norm': 2.0149216651916504, 'learning_rate': 0.00018908744341196093, 'epoch': 0.06}


  6%|▌         | 928/16798 [03:29<54:09,  4.88it/s]

{'loss': 1.6359, 'grad_norm': 2.0432465076446533, 'learning_rate': 0.00018907553014057662, 'epoch': 0.06}


  6%|▌         | 929/16798 [03:30<54:30,  4.85it/s]

{'loss': 1.6126, 'grad_norm': 2.0101544857025146, 'learning_rate': 0.00018906361686919229, 'epoch': 0.06}


  6%|▌         | 929/16798 [03:30<54:30,  4.85it/s]

{'loss': 2.2814, 'grad_norm': 2.461045026779175, 'learning_rate': 0.00018905170359780798, 'epoch': 0.06}


  6%|▌         | 930/16798 [03:30<55:14,  4.79it/s]

{'loss': 1.6554, 'grad_norm': 2.084547758102417, 'learning_rate': 0.00018903979032642364, 'epoch': 0.06}


  6%|▌         | 932/16798 [03:30<55:30,  4.76it/s]

{'loss': 1.1991, 'grad_norm': 1.478849172592163, 'learning_rate': 0.00018902787705503933, 'epoch': 0.06}


  6%|▌         | 932/16798 [03:30<55:30,  4.76it/s]

{'loss': 1.4797, 'grad_norm': 1.838555932044983, 'learning_rate': 0.000189015963783655, 'epoch': 0.06}


  6%|▌         | 933/16798 [03:31<55:43,  4.75it/s]

{'loss': 1.1367, 'grad_norm': 1.5180041790008545, 'learning_rate': 0.00018900405051227068, 'epoch': 0.06}


  6%|▌         | 934/16798 [03:31<55:55,  4.73it/s]

{'loss': 1.1018, 'grad_norm': 2.0763580799102783, 'learning_rate': 0.00018899213724088635, 'epoch': 0.06}


  6%|▌         | 936/16798 [03:31<54:49,  4.82it/s]

{'loss': 1.3163, 'grad_norm': 1.6934870481491089, 'learning_rate': 0.00018898022396950204, 'epoch': 0.06}


  6%|▌         | 937/16798 [03:31<54:34,  4.84it/s]

{'loss': 1.5879, 'grad_norm': 2.0181498527526855, 'learning_rate': 0.0001889683106981177, 'epoch': 0.06}


  6%|▌         | 938/16798 [03:32<53:48,  4.91it/s]

{'loss': 1.5228, 'grad_norm': 1.9495701789855957, 'learning_rate': 0.0001889563974267334, 'epoch': 0.06}


  6%|▌         | 939/16798 [03:32<53:43,  4.92it/s]

{'loss': 1.5617, 'grad_norm': 1.9049137830734253, 'learning_rate': 0.00018894448415534906, 'epoch': 0.06}


  6%|▌         | 940/16798 [03:32<54:04,  4.89it/s]

{'loss': 1.5146, 'grad_norm': 1.7854827642440796, 'learning_rate': 0.00018893257088396475, 'epoch': 0.06}


  6%|▌         | 940/16798 [03:32<54:04,  4.89it/s]

{'loss': 1.5159, 'grad_norm': 1.8463459014892578, 'learning_rate': 0.0001889206576125804, 'epoch': 0.06}


  6%|▌         | 941/16798 [03:32<55:02,  4.80it/s]

{'loss': 1.4785, 'grad_norm': 2.256532907485962, 'learning_rate': 0.0001889087443411961, 'epoch': 0.06}


  6%|▌         | 943/16798 [03:33<54:26,  4.85it/s]

{'loss': 1.1099, 'grad_norm': 1.685430884361267, 'learning_rate': 0.00018889683106981177, 'epoch': 0.06}


  6%|▌         | 943/16798 [03:33<54:26,  4.85it/s]

{'loss': 1.7422, 'grad_norm': 2.421596050262451, 'learning_rate': 0.00018888491779842746, 'epoch': 0.06}


  6%|▌         | 945/16798 [03:33<54:06,  4.88it/s]

{'loss': 0.9287, 'grad_norm': 1.3710219860076904, 'learning_rate': 0.00018887300452704312, 'epoch': 0.06}


  6%|▌         | 946/16798 [03:33<53:42,  4.92it/s]

{'loss': 1.1145, 'grad_norm': 2.1513140201568604, 'learning_rate': 0.0001888610912556588, 'epoch': 0.06}


  6%|▌         | 947/16798 [03:33<53:17,  4.96it/s]

{'loss': 1.2561, 'grad_norm': 1.8927668333053589, 'learning_rate': 0.00018884917798427448, 'epoch': 0.06}


  6%|▌         | 948/16798 [03:34<53:03,  4.98it/s]

{'loss': 1.2123, 'grad_norm': 1.8519552946090698, 'learning_rate': 0.00018883726471289017, 'epoch': 0.06}


  6%|▌         | 949/16798 [03:34<53:44,  4.92it/s]

{'loss': 1.3543, 'grad_norm': 2.161372184753418, 'learning_rate': 0.00018882535144150583, 'epoch': 0.06}


  6%|▌         | 950/16798 [03:34<53:35,  4.93it/s]

{'loss': 0.7805, 'grad_norm': 1.421688437461853, 'learning_rate': 0.00018881343817012152, 'epoch': 0.06}


  6%|▌         | 950/16798 [03:34<53:35,  4.93it/s]

{'loss': 0.4745, 'grad_norm': 1.260144829750061, 'learning_rate': 0.00018880152489873718, 'epoch': 0.06}


  6%|▌         | 951/16798 [03:34<54:52,  4.81it/s]

{'loss': 2.2374, 'grad_norm': 2.5315585136413574, 'learning_rate': 0.00018878961162735287, 'epoch': 0.06}


  6%|▌         | 952/16798 [03:34<55:39,  4.75it/s]

{'loss': 2.2796, 'grad_norm': 2.2205638885498047, 'learning_rate': 0.00018877769835596857, 'epoch': 0.06}


  6%|▌         | 953/16798 [03:35<56:42,  4.66it/s]

{'loss': 2.1943, 'grad_norm': 2.221611976623535, 'learning_rate': 0.00018876578508458426, 'epoch': 0.06}


  6%|▌         | 954/16798 [03:35<57:34,  4.59it/s]

{'loss': 2.2741, 'grad_norm': 2.0704474449157715, 'learning_rate': 0.00018875387181319992, 'epoch': 0.06}


  6%|▌         | 955/16798 [03:35<57:36,  4.58it/s]

{'loss': 1.8547, 'grad_norm': 2.150266408920288, 'learning_rate': 0.0001887419585418156, 'epoch': 0.06}


  6%|▌         | 956/16798 [03:35<58:29,  4.51it/s]

{'loss': 1.6717, 'grad_norm': 1.8275965452194214, 'learning_rate': 0.00018873004527043127, 'epoch': 0.06}


  6%|▌         | 957/16798 [03:36<59:34,  4.43it/s]

{'loss': 1.9142, 'grad_norm': 1.8885908126831055, 'learning_rate': 0.00018871813199904697, 'epoch': 0.06}


  6%|▌         | 958/16798 [03:36<59:41,  4.42it/s]

{'loss': 1.712, 'grad_norm': 1.7041542530059814, 'learning_rate': 0.00018870621872766263, 'epoch': 0.06}


  6%|▌         | 959/16798 [03:36<58:45,  4.49it/s]

{'loss': 1.6918, 'grad_norm': 1.9905365705490112, 'learning_rate': 0.00018869430545627832, 'epoch': 0.06}


  6%|▌         | 960/16798 [03:36<57:58,  4.55it/s]

{'loss': 1.5917, 'grad_norm': 1.744814157485962, 'learning_rate': 0.00018868239218489398, 'epoch': 0.06}


  6%|▌         | 962/16798 [03:37<57:29,  4.59it/s]

{'loss': 2.1882, 'grad_norm': 2.1100914478302, 'learning_rate': 0.00018867047891350967, 'epoch': 0.06}


  6%|▌         | 962/16798 [03:37<57:29,  4.59it/s]

{'loss': 1.5418, 'grad_norm': 1.8458164930343628, 'learning_rate': 0.00018865856564212534, 'epoch': 0.06}


  6%|▌         | 964/16798 [03:37<55:40,  4.74it/s]

{'loss': 1.8283, 'grad_norm': 1.8010690212249756, 'learning_rate': 0.00018864665237074103, 'epoch': 0.06}


  6%|▌         | 965/16798 [03:37<54:37,  4.83it/s]

{'loss': 1.4195, 'grad_norm': 1.8216406106948853, 'learning_rate': 0.0001886347390993567, 'epoch': 0.06}


  6%|▌         | 966/16798 [03:37<54:01,  4.88it/s]

{'loss': 1.792, 'grad_norm': 1.8480165004730225, 'learning_rate': 0.00018862282582797238, 'epoch': 0.06}


  6%|▌         | 966/16798 [03:37<54:01,  4.88it/s]

{'loss': 1.6881, 'grad_norm': 1.7690348625183105, 'learning_rate': 0.00018861091255658805, 'epoch': 0.06}


  6%|▌         | 968/16798 [03:38<54:10,  4.87it/s]

{'loss': 2.1359, 'grad_norm': 2.0982766151428223, 'learning_rate': 0.00018859899928520374, 'epoch': 0.06}


  6%|▌         | 969/16798 [03:38<53:52,  4.90it/s]

{'loss': 1.5763, 'grad_norm': 1.7816877365112305, 'learning_rate': 0.0001885870860138194, 'epoch': 0.06}


  6%|▌         | 969/16798 [03:38<53:52,  4.90it/s]

{'loss': 1.9671, 'grad_norm': 2.162644386291504, 'learning_rate': 0.0001885751727424351, 'epoch': 0.06}


  6%|▌         | 971/16798 [03:39<53:40,  4.91it/s]

{'loss': 1.5853, 'grad_norm': 1.9031692743301392, 'learning_rate': 0.00018856325947105076, 'epoch': 0.06}


  6%|▌         | 972/16798 [03:39<53:14,  4.95it/s]

{'loss': 1.8013, 'grad_norm': 1.9397627115249634, 'learning_rate': 0.00018855134619966645, 'epoch': 0.06}


  6%|▌         | 973/16798 [03:39<53:32,  4.93it/s]

{'loss': 1.9882, 'grad_norm': 1.9478130340576172, 'learning_rate': 0.0001885394329282821, 'epoch': 0.06}


  6%|▌         | 974/16798 [03:39<53:23,  4.94it/s]

{'loss': 1.5931, 'grad_norm': 1.6517401933670044, 'learning_rate': 0.0001885275196568978, 'epoch': 0.06}


  6%|▌         | 975/16798 [03:39<53:59,  4.88it/s]

{'loss': 1.623, 'grad_norm': 1.772201657295227, 'learning_rate': 0.00018851560638551346, 'epoch': 0.06}


  6%|▌         | 975/16798 [03:39<53:59,  4.88it/s]

{'loss': 1.5586, 'grad_norm': 1.8657437562942505, 'learning_rate': 0.00018850369311412915, 'epoch': 0.06}


  6%|▌         | 976/16798 [03:40<54:47,  4.81it/s]

{'loss': 1.4451, 'grad_norm': 1.6267588138580322, 'learning_rate': 0.00018849177984274482, 'epoch': 0.06}


  6%|▌         | 977/16798 [03:40<55:14,  4.77it/s]

{'loss': 1.2347, 'grad_norm': 2.096024513244629, 'learning_rate': 0.0001884798665713605, 'epoch': 0.06}


  6%|▌         | 978/16798 [03:40<55:27,  4.75it/s]

{'loss': 1.4664, 'grad_norm': 1.838077425956726, 'learning_rate': 0.00018846795329997617, 'epoch': 0.06}


  6%|▌         | 979/16798 [03:40<56:21,  4.68it/s]

{'loss': 1.5999, 'grad_norm': 1.8438595533370972, 'learning_rate': 0.00018845604002859186, 'epoch': 0.06}


  6%|▌         | 981/16798 [03:41<54:56,  4.80it/s]

{'loss': 1.4778, 'grad_norm': 2.1953160762786865, 'learning_rate': 0.00018844412675720753, 'epoch': 0.06}


  6%|▌         | 982/16798 [03:41<54:19,  4.85it/s]

{'loss': 1.3144, 'grad_norm': 1.643864393234253, 'learning_rate': 0.00018843221348582322, 'epoch': 0.06}


  6%|▌         | 982/16798 [03:41<54:19,  4.85it/s]

{'loss': 1.2733, 'grad_norm': 1.7000091075897217, 'learning_rate': 0.00018842030021443888, 'epoch': 0.06}


  6%|▌         | 983/16798 [03:41<54:01,  4.88it/s]

{'loss': 2.0338, 'grad_norm': 2.4034390449523926, 'learning_rate': 0.00018840838694305457, 'epoch': 0.06}


  6%|▌         | 985/16798 [03:41<54:00,  4.88it/s]

{'loss': 1.5331, 'grad_norm': 2.0275940895080566, 'learning_rate': 0.00018839647367167026, 'epoch': 0.06}


  6%|▌         | 985/16798 [03:41<54:00,  4.88it/s]

{'loss': 1.4372, 'grad_norm': 1.5692620277404785, 'learning_rate': 0.00018838456040028593, 'epoch': 0.06}


  6%|▌         | 986/16798 [03:42<54:26,  4.84it/s]

{'loss': 1.3307, 'grad_norm': 1.9098625183105469, 'learning_rate': 0.00018837264712890162, 'epoch': 0.06}


  6%|▌         | 987/16798 [03:42<56:24,  4.67it/s]

{'loss': 1.5083, 'grad_norm': 1.8384400606155396, 'learning_rate': 0.00018836073385751728, 'epoch': 0.06}


  6%|▌         | 988/16798 [03:42<1:01:23,  4.29it/s]

{'loss': 1.4307, 'grad_norm': 2.0142319202423096, 'learning_rate': 0.00018834882058613297, 'epoch': 0.06}


  6%|▌         | 989/16798 [03:43<1:14:26,  3.54it/s]

{'loss': 1.0921, 'grad_norm': 1.6002296209335327, 'learning_rate': 0.00018833690731474864, 'epoch': 0.06}


  6%|▌         | 990/16798 [03:43<1:14:42,  3.53it/s]

{'loss': 1.4468, 'grad_norm': 2.528336763381958, 'learning_rate': 0.00018832499404336433, 'epoch': 0.06}


  6%|▌         | 991/16798 [03:43<1:13:33,  3.58it/s]

{'loss': 1.239, 'grad_norm': 2.070570230484009, 'learning_rate': 0.00018831308077198, 'epoch': 0.06}


  6%|▌         | 992/16798 [03:43<1:12:15,  3.65it/s]

{'loss': 1.4631, 'grad_norm': 1.8771042823791504, 'learning_rate': 0.00018830116750059568, 'epoch': 0.06}


  6%|▌         | 993/16798 [03:44<1:07:52,  3.88it/s]

{'loss': 1.1806, 'grad_norm': 1.8399282693862915, 'learning_rate': 0.00018828925422921134, 'epoch': 0.06}


  6%|▌         | 994/16798 [03:44<1:05:31,  4.02it/s]

{'loss': 0.8288, 'grad_norm': 1.3911793231964111, 'learning_rate': 0.00018827734095782704, 'epoch': 0.06}


  6%|▌         | 995/16798 [03:44<1:07:36,  3.90it/s]

{'loss': 1.2713, 'grad_norm': 1.8441399335861206, 'learning_rate': 0.0001882654276864427, 'epoch': 0.06}


  6%|▌         | 997/16798 [03:44<1:01:01,  4.32it/s]

{'loss': 1.0756, 'grad_norm': 1.7717511653900146, 'learning_rate': 0.0001882535144150584, 'epoch': 0.06}


  6%|▌         | 997/16798 [03:44<1:01:01,  4.32it/s]

{'loss': 0.7179, 'grad_norm': 1.4910273551940918, 'learning_rate': 0.00018824160114367405, 'epoch': 0.06}


  6%|▌         | 998/16798 [03:45<1:00:08,  4.38it/s]

{'loss': 0.6062, 'grad_norm': 1.5522454977035522, 'learning_rate': 0.00018822968787228974, 'epoch': 0.06}


  6%|▌         | 999/16798 [03:45<58:25,  4.51it/s]  

{'loss': 0.6026, 'grad_norm': 1.367972493171692, 'learning_rate': 0.0001882177746009054, 'epoch': 0.06}




{'loss': 0.4282, 'grad_norm': 1.2027567625045776, 'learning_rate': 0.0001882058613295211, 'epoch': 0.06}


  6%|▌         | 1001/16798 [03:48<4:07:41,  1.06it/s]

{'loss': 2.2794, 'grad_norm': 2.070108652114868, 'learning_rate': 0.00018819394805813676, 'epoch': 0.06}


  6%|▌         | 1002/16798 [03:48<3:12:15,  1.37it/s]

{'loss': 1.8435, 'grad_norm': 1.9049274921417236, 'learning_rate': 0.00018818203478675245, 'epoch': 0.06}


  6%|▌         | 1003/16798 [03:48<2:32:56,  1.72it/s]

{'loss': 1.971, 'grad_norm': 1.7940818071365356, 'learning_rate': 0.00018817012151536812, 'epoch': 0.06}


  6%|▌         | 1004/16798 [03:48<2:06:25,  2.08it/s]

{'loss': 2.2947, 'grad_norm': 3.053131103515625, 'learning_rate': 0.0001881582082439838, 'epoch': 0.06}


  6%|▌         | 1005/16798 [03:49<1:47:01,  2.46it/s]

{'loss': 1.4204, 'grad_norm': 1.749006986618042, 'learning_rate': 0.00018814629497259947, 'epoch': 0.06}


  6%|▌         | 1006/16798 [03:49<1:37:54,  2.69it/s]

{'loss': 2.2227, 'grad_norm': 2.0801854133605957, 'learning_rate': 0.00018813438170121516, 'epoch': 0.06}


  6%|▌         | 1007/16798 [03:49<1:31:06,  2.89it/s]

{'loss': 2.0418, 'grad_norm': 1.9356673955917358, 'learning_rate': 0.00018812246842983083, 'epoch': 0.06}


  6%|▌         | 1009/16798 [03:50<1:13:15,  3.59it/s]

{'loss': 2.4419, 'grad_norm': 2.2599239349365234, 'learning_rate': 0.00018811055515844652, 'epoch': 0.06}


  6%|▌         | 1009/16798 [03:50<1:13:15,  3.59it/s]

{'loss': 1.9507, 'grad_norm': 2.2198739051818848, 'learning_rate': 0.00018809864188706218, 'epoch': 0.06}


  6%|▌         | 1010/16798 [03:50<1:09:36,  3.78it/s]

{'loss': 1.541, 'grad_norm': 1.7418450117111206, 'learning_rate': 0.00018808672861567787, 'epoch': 0.06}


  6%|▌         | 1011/16798 [03:50<1:05:41,  4.01it/s]

{'loss': 1.817, 'grad_norm': 1.830138921737671, 'learning_rate': 0.00018807481534429353, 'epoch': 0.06}


  6%|▌         | 1012/16798 [03:50<1:03:54,  4.12it/s]

{'loss': 1.6084, 'grad_norm': 1.819258689880371, 'learning_rate': 0.00018806290207290923, 'epoch': 0.06}


  6%|▌         | 1013/16798 [03:51<1:05:06,  4.04it/s]

{'loss': 1.5414, 'grad_norm': 1.709615707397461, 'learning_rate': 0.00018805098880152492, 'epoch': 0.06}


  6%|▌         | 1014/16798 [03:51<1:07:16,  3.91it/s]

{'loss': 1.5055, 'grad_norm': 1.9212502241134644, 'learning_rate': 0.0001880390755301406, 'epoch': 0.06}


  6%|▌         | 1015/16798 [03:51<1:05:23,  4.02it/s]

{'loss': 1.665, 'grad_norm': 2.0279295444488525, 'learning_rate': 0.00018802716225875627, 'epoch': 0.06}


  6%|▌         | 1016/16798 [03:51<1:05:37,  4.01it/s]

{'loss': 1.5502, 'grad_norm': 1.997809648513794, 'learning_rate': 0.00018801524898737196, 'epoch': 0.06}


  6%|▌         | 1017/16798 [03:52<1:02:50,  4.19it/s]

{'loss': 1.7606, 'grad_norm': 1.867167353630066, 'learning_rate': 0.00018800333571598762, 'epoch': 0.06}


  6%|▌         | 1018/16798 [03:52<1:02:05,  4.24it/s]

{'loss': 1.7788, 'grad_norm': 2.158360719680786, 'learning_rate': 0.00018799142244460332, 'epoch': 0.06}


  6%|▌         | 1019/16798 [03:52<1:01:30,  4.28it/s]

{'loss': 1.7078, 'grad_norm': 2.129269599914551, 'learning_rate': 0.00018797950917321898, 'epoch': 0.06}


  6%|▌         | 1021/16798 [03:52<58:51,  4.47it/s]  

{'loss': 1.5888, 'grad_norm': 2.0197012424468994, 'learning_rate': 0.00018796759590183467, 'epoch': 0.06}


  6%|▌         | 1021/16798 [03:52<58:51,  4.47it/s]

{'loss': 1.3105, 'grad_norm': 1.639706015586853, 'learning_rate': 0.00018795568263045033, 'epoch': 0.06}


  6%|▌         | 1022/16798 [03:53<1:04:14,  4.09it/s]

{'loss': 1.3767, 'grad_norm': 1.6495846509933472, 'learning_rate': 0.00018794376935906602, 'epoch': 0.06}


  6%|▌         | 1023/16798 [03:53<1:02:32,  4.20it/s]

{'loss': 1.7049, 'grad_norm': 1.919082760810852, 'learning_rate': 0.0001879318560876817, 'epoch': 0.06}


  6%|▌         | 1024/16798 [03:53<1:00:28,  4.35it/s]

{'loss': 1.6664, 'grad_norm': 1.834770917892456, 'learning_rate': 0.00018791994281629738, 'epoch': 0.06}


  6%|▌         | 1025/16798 [03:53<1:00:48,  4.32it/s]

{'loss': 1.4174, 'grad_norm': 1.787580966949463, 'learning_rate': 0.00018790802954491304, 'epoch': 0.06}


  6%|▌         | 1026/16798 [03:54<59:54,  4.39it/s]  

{'loss': 1.359, 'grad_norm': 1.792832851409912, 'learning_rate': 0.00018789611627352873, 'epoch': 0.06}


  6%|▌         | 1027/16798 [03:54<1:00:38,  4.33it/s]

{'loss': 1.255, 'grad_norm': 1.908521294593811, 'learning_rate': 0.0001878842030021444, 'epoch': 0.06}


  6%|▌         | 1028/16798 [03:54<1:00:40,  4.33it/s]

{'loss': 1.5327, 'grad_norm': 1.9756954908370972, 'learning_rate': 0.0001878722897307601, 'epoch': 0.06}


  6%|▌         | 1029/16798 [03:54<59:25,  4.42it/s]  

{'loss': 1.6231, 'grad_norm': 1.874800443649292, 'learning_rate': 0.00018786037645937575, 'epoch': 0.06}


  6%|▌         | 1030/16798 [03:55<1:01:43,  4.26it/s]

{'loss': 1.7221, 'grad_norm': 1.9293546676635742, 'learning_rate': 0.00018784846318799144, 'epoch': 0.06}


  6%|▌         | 1032/16798 [03:55<1:00:48,  4.32it/s]

{'loss': 1.4606, 'grad_norm': 1.903199553489685, 'learning_rate': 0.0001878365499166071, 'epoch': 0.06}


  6%|▌         | 1032/16798 [03:55<1:00:48,  4.32it/s]

{'loss': 1.379, 'grad_norm': 1.7434484958648682, 'learning_rate': 0.0001878246366452228, 'epoch': 0.06}


  6%|▌         | 1033/16798 [03:55<1:01:32,  4.27it/s]

{'loss': 1.3641, 'grad_norm': 1.6761813163757324, 'learning_rate': 0.00018781272337383846, 'epoch': 0.06}


  6%|▌         | 1034/16798 [03:56<1:00:16,  4.36it/s]

{'loss': 1.3949, 'grad_norm': 1.6170984506607056, 'learning_rate': 0.00018780081010245415, 'epoch': 0.06}


  6%|▌         | 1035/16798 [03:56<1:03:05,  4.16it/s]

{'loss': 1.4073, 'grad_norm': 1.8362274169921875, 'learning_rate': 0.00018778889683106981, 'epoch': 0.06}


  6%|▌         | 1036/16798 [03:56<1:01:28,  4.27it/s]

{'loss': 1.8052, 'grad_norm': 2.071669101715088, 'learning_rate': 0.0001877769835596855, 'epoch': 0.06}


  6%|▌         | 1037/16798 [03:56<1:06:12,  3.97it/s]

{'loss': 1.0225, 'grad_norm': 1.4446154832839966, 'learning_rate': 0.00018776507028830117, 'epoch': 0.06}


  6%|▌         | 1038/16798 [03:57<1:03:25,  4.14it/s]

{'loss': 1.2801, 'grad_norm': 1.6776776313781738, 'learning_rate': 0.00018775315701691686, 'epoch': 0.06}


  6%|▌         | 1040/16798 [03:57<59:47,  4.39it/s]  

{'loss': 1.2601, 'grad_norm': 2.188445568084717, 'learning_rate': 0.00018774124374553252, 'epoch': 0.06}


  6%|▌         | 1040/16798 [03:57<59:47,  4.39it/s]

{'loss': 1.2821, 'grad_norm': 1.7539395093917847, 'learning_rate': 0.00018772933047414821, 'epoch': 0.06}


  6%|▌         | 1041/16798 [03:57<58:56,  4.46it/s]

{'loss': 1.0798, 'grad_norm': 1.763360619544983, 'learning_rate': 0.00018771741720276388, 'epoch': 0.06}


  6%|▌         | 1042/16798 [03:57<58:03,  4.52it/s]

{'loss': 1.314, 'grad_norm': 2.0672621726989746, 'learning_rate': 0.00018770550393137957, 'epoch': 0.06}


  6%|▌         | 1043/16798 [03:58<58:12,  4.51it/s]

{'loss': 0.9491, 'grad_norm': 1.6130516529083252, 'learning_rate': 0.00018769359065999523, 'epoch': 0.06}


  6%|▌         | 1044/16798 [03:58<1:01:24,  4.28it/s]

{'loss': 0.9243, 'grad_norm': 1.5079220533370972, 'learning_rate': 0.00018768167738861092, 'epoch': 0.06}


  6%|▌         | 1045/16798 [03:58<1:01:39,  4.26it/s]

{'loss': 0.7185, 'grad_norm': 1.5050783157348633, 'learning_rate': 0.00018766976411722661, 'epoch': 0.06}


  6%|▌         | 1046/16798 [03:58<1:00:27,  4.34it/s]

{'loss': 0.5954, 'grad_norm': 1.242690920829773, 'learning_rate': 0.00018765785084584228, 'epoch': 0.06}


  6%|▌         | 1047/16798 [03:59<1:00:02,  4.37it/s]

{'loss': 0.5228, 'grad_norm': 1.2468724250793457, 'learning_rate': 0.00018764593757445797, 'epoch': 0.06}


  6%|▌         | 1048/16798 [03:59<1:00:08,  4.37it/s]

{'loss': 0.7443, 'grad_norm': 1.3356356620788574, 'learning_rate': 0.00018763402430307363, 'epoch': 0.06}


  6%|▋         | 1050/16798 [03:59<56:29,  4.65it/s]  

{'loss': 0.3224, 'grad_norm': 0.959251344203949, 'learning_rate': 0.00018762211103168932, 'epoch': 0.06}


  6%|▋         | 1050/16798 [03:59<56:29,  4.65it/s]

{'loss': 0.3605, 'grad_norm': 1.10800302028656, 'learning_rate': 0.00018761019776030499, 'epoch': 0.06}


  6%|▋         | 1051/16798 [03:59<1:00:40,  4.33it/s]

{'loss': 2.5295, 'grad_norm': 2.158562421798706, 'learning_rate': 0.00018759828448892068, 'epoch': 0.06}


  6%|▋         | 1052/16798 [04:00<1:03:13,  4.15it/s]

{'loss': 2.0612, 'grad_norm': 1.7557501792907715, 'learning_rate': 0.00018758637121753634, 'epoch': 0.06}


  6%|▋         | 1053/16798 [04:00<1:02:41,  4.19it/s]

{'loss': 1.6226, 'grad_norm': 3.6769652366638184, 'learning_rate': 0.00018757445794615203, 'epoch': 0.06}


  6%|▋         | 1054/16798 [04:00<1:01:02,  4.30it/s]

{'loss': 2.1771, 'grad_norm': 1.9053635597229004, 'learning_rate': 0.0001875625446747677, 'epoch': 0.06}


  6%|▋         | 1055/16798 [04:00<1:00:29,  4.34it/s]

{'loss': 2.1123, 'grad_norm': 2.0149922370910645, 'learning_rate': 0.00018755063140338339, 'epoch': 0.06}


  6%|▋         | 1056/16798 [04:01<1:01:44,  4.25it/s]

{'loss': 2.5794, 'grad_norm': 2.260525703430176, 'learning_rate': 0.00018753871813199905, 'epoch': 0.06}


  6%|▋         | 1057/16798 [04:01<1:00:35,  4.33it/s]

{'loss': 1.3165, 'grad_norm': 1.7511036396026611, 'learning_rate': 0.00018752680486061474, 'epoch': 0.06}


  6%|▋         | 1058/16798 [04:01<1:03:31,  4.13it/s]

{'loss': 1.6836, 'grad_norm': 1.7314739227294922, 'learning_rate': 0.0001875148915892304, 'epoch': 0.06}


  6%|▋         | 1059/16798 [04:01<1:05:18,  4.02it/s]

{'loss': 2.3441, 'grad_norm': 2.1127095222473145, 'learning_rate': 0.0001875029783178461, 'epoch': 0.06}


  6%|▋         | 1060/16798 [04:02<1:03:20,  4.14it/s]

{'loss': 1.8161, 'grad_norm': 2.0754940509796143, 'learning_rate': 0.00018749106504646176, 'epoch': 0.06}


  6%|▋         | 1061/16798 [04:02<1:01:48,  4.24it/s]

{'loss': 1.5746, 'grad_norm': 2.1148312091827393, 'learning_rate': 0.00018747915177507745, 'epoch': 0.06}


  6%|▋         | 1062/16798 [04:02<1:01:01,  4.30it/s]

{'loss': 1.6673, 'grad_norm': 2.035737991333008, 'learning_rate': 0.0001874672385036931, 'epoch': 0.06}


  6%|▋         | 1063/16798 [04:02<59:45,  4.39it/s]  

{'loss': 1.7594, 'grad_norm': 2.008282423019409, 'learning_rate': 0.0001874553252323088, 'epoch': 0.06}


  6%|▋         | 1064/16798 [04:03<59:34,  4.40it/s]

{'loss': 1.4674, 'grad_norm': 1.8427278995513916, 'learning_rate': 0.00018744341196092447, 'epoch': 0.06}


  6%|▋         | 1065/16798 [04:03<1:02:39,  4.19it/s]

{'loss': 1.542, 'grad_norm': 1.8456463813781738, 'learning_rate': 0.00018743149868954016, 'epoch': 0.06}


  6%|▋         | 1066/16798 [04:03<1:01:13,  4.28it/s]

{'loss': 1.8275, 'grad_norm': 1.9901007413864136, 'learning_rate': 0.00018741958541815582, 'epoch': 0.06}


  6%|▋         | 1067/16798 [04:03<1:00:42,  4.32it/s]

{'loss': 2.0209, 'grad_norm': 2.103210210800171, 'learning_rate': 0.0001874076721467715, 'epoch': 0.06}


  6%|▋         | 1068/16798 [04:03<59:39,  4.39it/s]  

{'loss': 1.8155, 'grad_norm': 1.9570119380950928, 'learning_rate': 0.00018739575887538718, 'epoch': 0.06}


  6%|▋         | 1069/16798 [04:04<59:23,  4.41it/s]

{'loss': 1.435, 'grad_norm': 1.5852854251861572, 'learning_rate': 0.00018738384560400287, 'epoch': 0.06}


  6%|▋         | 1071/16798 [04:04<56:25,  4.65it/s]

{'loss': 1.6768, 'grad_norm': 2.0028367042541504, 'learning_rate': 0.00018737193233261853, 'epoch': 0.06}


  6%|▋         | 1071/16798 [04:04<56:25,  4.65it/s]

{'loss': 1.6243, 'grad_norm': 2.7499334812164307, 'learning_rate': 0.00018736001906123422, 'epoch': 0.06}


  6%|▋         | 1072/16798 [04:04<56:38,  4.63it/s]

{'loss': 1.3704, 'grad_norm': 1.7308777570724487, 'learning_rate': 0.00018734810578984988, 'epoch': 0.06}


  6%|▋         | 1073/16798 [04:05<59:45,  4.39it/s]

{'loss': 2.1219, 'grad_norm': 2.2153072357177734, 'learning_rate': 0.00018733619251846558, 'epoch': 0.06}


  6%|▋         | 1075/16798 [04:05<59:15,  4.42it/s]  

{'loss': 1.4328, 'grad_norm': 1.7136666774749756, 'learning_rate': 0.00018732427924708124, 'epoch': 0.06}


  6%|▋         | 1076/16798 [04:05<58:01,  4.52it/s]

{'loss': 1.5398, 'grad_norm': 1.9655497074127197, 'learning_rate': 0.00018731236597569696, 'epoch': 0.06}


  6%|▋         | 1076/16798 [04:05<58:01,  4.52it/s]

{'loss': 1.7208, 'grad_norm': 2.6211650371551514, 'learning_rate': 0.00018730045270431262, 'epoch': 0.06}


  6%|▋         | 1078/16798 [04:06<56:25,  4.64it/s]

{'loss': 1.4241, 'grad_norm': 1.725598692893982, 'learning_rate': 0.0001872885394329283, 'epoch': 0.06}


  6%|▋         | 1078/16798 [04:06<56:25,  4.64it/s]

{'loss': 1.6861, 'grad_norm': 1.8810793161392212, 'learning_rate': 0.00018727662616154398, 'epoch': 0.06}


  6%|▋         | 1079/16798 [04:06<55:18,  4.74it/s]

{'loss': 1.551, 'grad_norm': 1.9491121768951416, 'learning_rate': 0.00018726471289015967, 'epoch': 0.06}


  6%|▋         | 1080/16798 [04:06<59:32,  4.40it/s]

{'loss': 2.0775, 'grad_norm': 2.143501043319702, 'learning_rate': 0.00018725279961877533, 'epoch': 0.06}


  6%|▋         | 1081/16798 [04:06<58:43,  4.46it/s]

{'loss': 1.4036, 'grad_norm': 1.7699388265609741, 'learning_rate': 0.00018724088634739102, 'epoch': 0.06}


  6%|▋         | 1082/16798 [04:07<58:37,  4.47it/s]

{'loss': 1.3404, 'grad_norm': 1.6437442302703857, 'learning_rate': 0.00018722897307600668, 'epoch': 0.06}


  6%|▋         | 1083/16798 [04:07<1:02:09,  4.21it/s]

{'loss': 1.4913, 'grad_norm': 2.0067574977874756, 'learning_rate': 0.00018721705980462237, 'epoch': 0.06}


  6%|▋         | 1084/16798 [04:07<1:00:20,  4.34it/s]

{'loss': 1.4599, 'grad_norm': 2.185281276702881, 'learning_rate': 0.00018720514653323804, 'epoch': 0.06}


  6%|▋         | 1085/16798 [04:07<1:00:36,  4.32it/s]

{'loss': 1.46, 'grad_norm': 2.2591326236724854, 'learning_rate': 0.00018719323326185373, 'epoch': 0.06}


  6%|▋         | 1086/16798 [04:08<1:01:04,  4.29it/s]

{'loss': 1.5386, 'grad_norm': 2.1630818843841553, 'learning_rate': 0.0001871813199904694, 'epoch': 0.06}


  6%|▋         | 1087/16798 [04:08<1:01:37,  4.25it/s]

{'loss': 0.9215, 'grad_norm': 1.3960081338882446, 'learning_rate': 0.00018716940671908508, 'epoch': 0.06}


  6%|▋         | 1089/16798 [04:08<58:30,  4.47it/s]  

{'loss': 1.2819, 'grad_norm': 1.8949373960494995, 'learning_rate': 0.00018715749344770075, 'epoch': 0.06}


  6%|▋         | 1089/16798 [04:08<58:30,  4.47it/s]

{'loss': 1.7474, 'grad_norm': 2.493344783782959, 'learning_rate': 0.00018714558017631644, 'epoch': 0.06}


  6%|▋         | 1090/16798 [04:08<56:55,  4.60it/s]

{'loss': 1.537, 'grad_norm': 2.307473659515381, 'learning_rate': 0.0001871336669049321, 'epoch': 0.06}


  7%|▋         | 1092/16798 [04:09<56:44,  4.61it/s]

{'loss': 1.6508, 'grad_norm': 1.9828299283981323, 'learning_rate': 0.0001871217536335478, 'epoch': 0.06}


  7%|▋         | 1092/16798 [04:09<56:44,  4.61it/s]

{'loss': 1.5797, 'grad_norm': 2.0890896320343018, 'learning_rate': 0.00018710984036216346, 'epoch': 0.07}


  7%|▋         | 1093/16798 [04:09<56:19,  4.65it/s]

{'loss': 0.6807, 'grad_norm': 1.3332319259643555, 'learning_rate': 0.00018709792709077915, 'epoch': 0.07}


  7%|▋         | 1094/16798 [04:09<1:00:02,  4.36it/s]

{'loss': 0.8739, 'grad_norm': 1.7657835483551025, 'learning_rate': 0.0001870860138193948, 'epoch': 0.07}


  7%|▋         | 1095/16798 [04:09<57:53,  4.52it/s]  

{'loss': 0.8033, 'grad_norm': 1.205901026725769, 'learning_rate': 0.0001870741005480105, 'epoch': 0.07}


  7%|▋         | 1096/16798 [04:10<56:29,  4.63it/s]

{'loss': 0.8483, 'grad_norm': 1.5910736322402954, 'learning_rate': 0.00018706218727662617, 'epoch': 0.07}


  7%|▋         | 1097/16798 [04:10<54:59,  4.76it/s]

{'loss': 0.7066, 'grad_norm': 1.3898091316223145, 'learning_rate': 0.00018705027400524186, 'epoch': 0.07}


  7%|▋         | 1098/16798 [04:10<55:34,  4.71it/s]

{'loss': 0.6122, 'grad_norm': 1.2876545190811157, 'learning_rate': 0.00018703836073385752, 'epoch': 0.07}


  7%|▋         | 1100/16798 [04:11<57:08,  4.58it/s]

{'loss': 0.4534, 'grad_norm': 1.2564188241958618, 'learning_rate': 0.0001870264474624732, 'epoch': 0.07}


  7%|▋         | 1100/16798 [04:11<57:08,  4.58it/s]

{'loss': 0.931, 'grad_norm': 1.7178950309753418, 'learning_rate': 0.00018701453419108887, 'epoch': 0.07}


  7%|▋         | 1101/16798 [04:11<57:19,  4.56it/s]

{'loss': 2.2311, 'grad_norm': 1.8828524351119995, 'learning_rate': 0.00018700262091970456, 'epoch': 0.07}


  7%|▋         | 1102/16798 [04:11<57:17,  4.57it/s]

{'loss': 2.1202, 'grad_norm': 1.8528105020523071, 'learning_rate': 0.00018699070764832023, 'epoch': 0.07}


  7%|▋         | 1103/16798 [04:11<58:29,  4.47it/s]

{'loss': 1.6822, 'grad_norm': 1.6772204637527466, 'learning_rate': 0.00018697879437693592, 'epoch': 0.07}


  7%|▋         | 1104/16798 [04:12<1:01:25,  4.26it/s]

{'loss': 2.1033, 'grad_norm': 1.9443387985229492, 'learning_rate': 0.00018696688110555158, 'epoch': 0.07}


  7%|▋         | 1105/16798 [04:12<1:01:15,  4.27it/s]

{'loss': 2.3207, 'grad_norm': 2.054102659225464, 'learning_rate': 0.00018695496783416727, 'epoch': 0.07}


  7%|▋         | 1106/16798 [04:12<1:00:32,  4.32it/s]

{'loss': 2.4451, 'grad_norm': 1.9346671104431152, 'learning_rate': 0.00018694305456278296, 'epoch': 0.07}


  7%|▋         | 1107/16798 [04:12<1:00:35,  4.32it/s]

{'loss': 2.4406, 'grad_norm': 2.357818365097046, 'learning_rate': 0.00018693114129139863, 'epoch': 0.07}


  7%|▋         | 1108/16798 [04:12<1:00:23,  4.33it/s]

{'loss': 1.6605, 'grad_norm': 1.666832685470581, 'learning_rate': 0.00018691922802001432, 'epoch': 0.07}


  7%|▋         | 1109/16798 [04:13<59:36,  4.39it/s]  

{'loss': 1.6215, 'grad_norm': 1.7492499351501465, 'learning_rate': 0.00018690731474862998, 'epoch': 0.07}


  7%|▋         | 1111/16798 [04:13<1:01:04,  4.28it/s]

{'loss': 1.4537, 'grad_norm': 1.5529614686965942, 'learning_rate': 0.00018689540147724567, 'epoch': 0.07}


  7%|▋         | 1111/16798 [04:13<1:01:04,  4.28it/s]

{'loss': 2.025, 'grad_norm': 1.9535881280899048, 'learning_rate': 0.00018688348820586134, 'epoch': 0.07}


  7%|▋         | 1112/16798 [04:13<1:01:01,  4.28it/s]

{'loss': 1.3346, 'grad_norm': 1.5575013160705566, 'learning_rate': 0.00018687157493447703, 'epoch': 0.07}


  7%|▋         | 1113/16798 [04:14<1:06:08,  3.95it/s]

{'loss': 1.6313, 'grad_norm': 1.5340814590454102, 'learning_rate': 0.0001868596616630927, 'epoch': 0.07}


  7%|▋         | 1115/16798 [04:14<1:01:44,  4.23it/s]

{'loss': 1.3188, 'grad_norm': 1.7354415655136108, 'learning_rate': 0.00018684774839170838, 'epoch': 0.07}


  7%|▋         | 1115/16798 [04:14<1:01:44,  4.23it/s]

{'loss': 1.743, 'grad_norm': 1.7783241271972656, 'learning_rate': 0.00018683583512032405, 'epoch': 0.07}


  7%|▋         | 1117/16798 [04:15<58:33,  4.46it/s]  

{'loss': 1.7719, 'grad_norm': 1.7503159046173096, 'learning_rate': 0.00018682392184893974, 'epoch': 0.07}


  7%|▋         | 1117/16798 [04:15<58:33,  4.46it/s]

{'loss': 1.9624, 'grad_norm': 1.9253381490707397, 'learning_rate': 0.0001868120085775554, 'epoch': 0.07}


  7%|▋         | 1118/16798 [04:15<1:01:14,  4.27it/s]

{'loss': 1.3989, 'grad_norm': 1.8108466863632202, 'learning_rate': 0.0001868000953061711, 'epoch': 0.07}


  7%|▋         | 1120/16798 [04:15<58:46,  4.45it/s]  

{'loss': 1.9205, 'grad_norm': 2.028550386428833, 'learning_rate': 0.00018678818203478675, 'epoch': 0.07}


  7%|▋         | 1120/16798 [04:15<58:46,  4.45it/s]

{'loss': 1.8343, 'grad_norm': 2.1024856567382812, 'learning_rate': 0.00018677626876340245, 'epoch': 0.07}


  7%|▋         | 1121/16798 [04:15<59:06,  4.42it/s]

{'loss': 1.6922, 'grad_norm': 2.122840166091919, 'learning_rate': 0.0001867643554920181, 'epoch': 0.07}


  7%|▋         | 1122/16798 [04:16<1:00:40,  4.31it/s]

{'loss': 1.5702, 'grad_norm': 2.002171039581299, 'learning_rate': 0.0001867524422206338, 'epoch': 0.07}


  7%|▋         | 1123/16798 [04:16<1:01:53,  4.22it/s]

{'loss': 1.6259, 'grad_norm': 1.8531898260116577, 'learning_rate': 0.00018674052894924946, 'epoch': 0.07}


  7%|▋         | 1124/16798 [04:16<1:00:10,  4.34it/s]

{'loss': 1.2114, 'grad_norm': 1.5184513330459595, 'learning_rate': 0.00018672861567786515, 'epoch': 0.07}


  7%|▋         | 1125/16798 [04:16<58:56,  4.43it/s]  

{'loss': 1.8985, 'grad_norm': 2.016972064971924, 'learning_rate': 0.00018671670240648082, 'epoch': 0.07}


  7%|▋         | 1126/16798 [04:17<1:01:47,  4.23it/s]

{'loss': 1.5159, 'grad_norm': 1.7384198904037476, 'learning_rate': 0.0001867047891350965, 'epoch': 0.07}


  7%|▋         | 1128/16798 [04:17<58:36,  4.46it/s]  

{'loss': 1.7422, 'grad_norm': 1.9002978801727295, 'learning_rate': 0.00018669287586371217, 'epoch': 0.07}


  7%|▋         | 1128/16798 [04:17<58:36,  4.46it/s]

{'loss': 1.8384, 'grad_norm': 2.2689826488494873, 'learning_rate': 0.00018668096259232786, 'epoch': 0.07}


  7%|▋         | 1129/16798 [04:17<57:24,  4.55it/s]

{'loss': 0.8994, 'grad_norm': 1.626272201538086, 'learning_rate': 0.00018666904932094353, 'epoch': 0.07}


  7%|▋         | 1131/16798 [04:18<56:05,  4.66it/s]

{'loss': 1.5025, 'grad_norm': 1.9797279834747314, 'learning_rate': 0.00018665713604955922, 'epoch': 0.07}


  7%|▋         | 1131/16798 [04:18<56:05,  4.66it/s]

{'loss': 1.5065, 'grad_norm': 2.1287379264831543, 'learning_rate': 0.00018664522277817488, 'epoch': 0.07}


  7%|▋         | 1132/16798 [04:18<57:24,  4.55it/s]

{'loss': 1.5142, 'grad_norm': 1.7228323221206665, 'learning_rate': 0.00018663330950679057, 'epoch': 0.07}


  7%|▋         | 1133/16798 [04:18<1:01:20,  4.26it/s]

{'loss': 1.5599, 'grad_norm': 1.8083808422088623, 'learning_rate': 0.00018662139623540624, 'epoch': 0.07}


  7%|▋         | 1134/16798 [04:18<1:02:38,  4.17it/s]

{'loss': 1.1089, 'grad_norm': 1.689290165901184, 'learning_rate': 0.00018660948296402193, 'epoch': 0.07}


  7%|▋         | 1135/16798 [04:19<1:03:21,  4.12it/s]

{'loss': 1.6295, 'grad_norm': 1.9250352382659912, 'learning_rate': 0.0001865975696926376, 'epoch': 0.07}


  7%|▋         | 1137/16798 [04:19<59:15,  4.40it/s]  

{'loss': 1.1311, 'grad_norm': 1.6448317766189575, 'learning_rate': 0.00018658565642125328, 'epoch': 0.07}


  7%|▋         | 1138/16798 [04:19<57:46,  4.52it/s]

{'loss': 1.4821, 'grad_norm': 2.104764938354492, 'learning_rate': 0.00018657374314986897, 'epoch': 0.07}


  7%|▋         | 1138/16798 [04:19<57:46,  4.52it/s]

{'loss': 1.3438, 'grad_norm': 1.784734845161438, 'learning_rate': 0.00018656182987848466, 'epoch': 0.07}


  7%|▋         | 1139/16798 [04:20<1:04:10,  4.07it/s]

{'loss': 1.0696, 'grad_norm': 1.5245932340621948, 'learning_rate': 0.00018654991660710033, 'epoch': 0.07}


  7%|▋         | 1140/16798 [04:20<1:04:43,  4.03it/s]

{'loss': 0.9376, 'grad_norm': 1.4423025846481323, 'learning_rate': 0.00018653800333571602, 'epoch': 0.07}


  7%|▋         | 1141/16798 [04:20<1:02:20,  4.19it/s]

{'loss': 1.0663, 'grad_norm': 1.6591885089874268, 'learning_rate': 0.00018652609006433168, 'epoch': 0.07}


  7%|▋         | 1143/16798 [04:21<57:45,  4.52it/s]  

{'loss': 1.1502, 'grad_norm': 1.6810814142227173, 'learning_rate': 0.00018651417679294737, 'epoch': 0.07}


  7%|▋         | 1144/16798 [04:21<56:33,  4.61it/s]

{'loss': 1.1163, 'grad_norm': 1.5162686109542847, 'learning_rate': 0.00018650226352156303, 'epoch': 0.07}


  7%|▋         | 1145/16798 [04:21<55:22,  4.71it/s]

{'loss': 1.0434, 'grad_norm': 1.6076003313064575, 'learning_rate': 0.00018649035025017873, 'epoch': 0.07}


  7%|▋         | 1145/16798 [04:21<55:22,  4.71it/s]

{'loss': 0.9776, 'grad_norm': 1.5613048076629639, 'learning_rate': 0.0001864784369787944, 'epoch': 0.07}


  7%|▋         | 1147/16798 [04:21<54:29,  4.79it/s]

{'loss': 0.9975, 'grad_norm': 1.4893361330032349, 'learning_rate': 0.00018646652370741008, 'epoch': 0.07}


  7%|▋         | 1147/16798 [04:21<54:29,  4.79it/s]

{'loss': 1.1063, 'grad_norm': 1.8709772825241089, 'learning_rate': 0.00018645461043602574, 'epoch': 0.07}


  7%|▋         | 1148/16798 [04:22<58:47,  4.44it/s]

{'loss': 0.5308, 'grad_norm': 1.3019771575927734, 'learning_rate': 0.00018644269716464143, 'epoch': 0.07}


  7%|▋         | 1149/16798 [04:22<1:01:39,  4.23it/s]

{'loss': 0.2757, 'grad_norm': 0.8469470143318176, 'learning_rate': 0.0001864307838932571, 'epoch': 0.07}


  7%|▋         | 1150/16798 [04:22<1:00:07,  4.34it/s]

{'loss': 0.5782, 'grad_norm': 1.7268611192703247, 'learning_rate': 0.0001864188706218728, 'epoch': 0.07}


  7%|▋         | 1151/16798 [04:22<59:17,  4.40it/s]  

{'loss': 2.0551, 'grad_norm': 1.8133416175842285, 'learning_rate': 0.00018640695735048845, 'epoch': 0.07}


  7%|▋         | 1152/16798 [04:23<58:47,  4.44it/s]

{'loss': 2.0838, 'grad_norm': 1.7672529220581055, 'learning_rate': 0.00018639504407910414, 'epoch': 0.07}


  7%|▋         | 1153/16798 [04:23<1:00:13,  4.33it/s]

{'loss': 1.6399, 'grad_norm': 1.8643749952316284, 'learning_rate': 0.0001863831308077198, 'epoch': 0.07}


  7%|▋         | 1154/16798 [04:23<1:00:52,  4.28it/s]

{'loss': 2.0716, 'grad_norm': 1.8393211364746094, 'learning_rate': 0.0001863712175363355, 'epoch': 0.07}


  7%|▋         | 1155/16798 [04:23<59:24,  4.39it/s]  

{'loss': 2.5413, 'grad_norm': 2.281878709793091, 'learning_rate': 0.00018635930426495116, 'epoch': 0.07}


  7%|▋         | 1156/16798 [04:23<1:02:43,  4.16it/s]

{'loss': 2.0375, 'grad_norm': 2.1586227416992188, 'learning_rate': 0.00018634739099356685, 'epoch': 0.07}


  7%|▋         | 1157/16798 [04:24<1:06:52,  3.90it/s]

{'loss': 2.5664, 'grad_norm': 2.012437343597412, 'learning_rate': 0.00018633547772218252, 'epoch': 0.07}


  7%|▋         | 1158/16798 [04:24<1:07:08,  3.88it/s]

{'loss': 2.1996, 'grad_norm': 2.2236762046813965, 'learning_rate': 0.0001863235644507982, 'epoch': 0.07}


  7%|▋         | 1159/16798 [04:24<1:05:47,  3.96it/s]

{'loss': 2.0778, 'grad_norm': 2.0084285736083984, 'learning_rate': 0.00018631165117941387, 'epoch': 0.07}


  7%|▋         | 1160/16798 [04:25<1:04:55,  4.01it/s]

{'loss': 2.1933, 'grad_norm': 2.2388434410095215, 'learning_rate': 0.00018629973790802956, 'epoch': 0.07}


  7%|▋         | 1161/16798 [04:25<1:07:28,  3.86it/s]

{'loss': 1.95, 'grad_norm': 1.818354606628418, 'learning_rate': 0.00018628782463664522, 'epoch': 0.07}


  7%|▋         | 1162/16798 [04:25<1:04:23,  4.05it/s]

{'loss': 1.7216, 'grad_norm': 2.036541223526001, 'learning_rate': 0.00018627591136526092, 'epoch': 0.07}


  7%|▋         | 1163/16798 [04:25<1:05:33,  3.97it/s]

{'loss': 1.8407, 'grad_norm': 1.8783072233200073, 'learning_rate': 0.00018626399809387658, 'epoch': 0.07}


  7%|▋         | 1164/16798 [04:26<1:03:50,  4.08it/s]

{'loss': 1.7142, 'grad_norm': 2.0322344303131104, 'learning_rate': 0.00018625208482249227, 'epoch': 0.07}


  7%|▋         | 1166/16798 [04:26<1:00:09,  4.33it/s]

{'loss': 1.7948, 'grad_norm': 1.8298826217651367, 'learning_rate': 0.00018624017155110793, 'epoch': 0.07}


  7%|▋         | 1166/16798 [04:26<1:00:09,  4.33it/s]

{'loss': 1.6258, 'grad_norm': 1.84412682056427, 'learning_rate': 0.00018622825827972362, 'epoch': 0.07}


  7%|▋         | 1167/16798 [04:26<59:46,  4.36it/s]  

{'loss': 1.448, 'grad_norm': 1.5925965309143066, 'learning_rate': 0.0001862163450083393, 'epoch': 0.07}


  7%|▋         | 1168/16798 [04:26<1:04:43,  4.02it/s]

{'loss': 1.2984, 'grad_norm': 1.7930899858474731, 'learning_rate': 0.00018620443173695498, 'epoch': 0.07}


  7%|▋         | 1169/16798 [04:27<1:02:37,  4.16it/s]

{'loss': 1.7457, 'grad_norm': 1.9553946256637573, 'learning_rate': 0.00018619251846557067, 'epoch': 0.07}


  7%|▋         | 1170/16798 [04:27<1:01:49,  4.21it/s]

{'loss': 1.6844, 'grad_norm': 1.7518723011016846, 'learning_rate': 0.00018618060519418633, 'epoch': 0.07}


  7%|▋         | 1171/16798 [04:27<1:01:06,  4.26it/s]

{'loss': 1.5951, 'grad_norm': 2.175212860107422, 'learning_rate': 0.00018616869192280202, 'epoch': 0.07}


  7%|▋         | 1172/16798 [04:27<1:00:10,  4.33it/s]

{'loss': 1.5316, 'grad_norm': 1.8698900938034058, 'learning_rate': 0.0001861567786514177, 'epoch': 0.07}


  7%|▋         | 1173/16798 [04:28<58:49,  4.43it/s]  

{'loss': 1.7972, 'grad_norm': 1.8281855583190918, 'learning_rate': 0.00018614486538003338, 'epoch': 0.07}


  7%|▋         | 1174/16798 [04:28<1:00:58,  4.27it/s]

{'loss': 1.4575, 'grad_norm': 1.8200271129608154, 'learning_rate': 0.00018613295210864904, 'epoch': 0.07}


  7%|▋         | 1175/16798 [04:28<58:56,  4.42it/s]  

{'loss': 1.6757, 'grad_norm': 1.8718922138214111, 'learning_rate': 0.00018612103883726473, 'epoch': 0.07}


  7%|▋         | 1176/16798 [04:28<1:02:44,  4.15it/s]

{'loss': 2.0094, 'grad_norm': 2.1652023792266846, 'learning_rate': 0.0001861091255658804, 'epoch': 0.07}


  7%|▋         | 1177/16798 [04:29<1:03:21,  4.11it/s]

{'loss': 1.5956, 'grad_norm': 1.8123530149459839, 'learning_rate': 0.0001860972122944961, 'epoch': 0.07}


  7%|▋         | 1178/16798 [04:29<1:03:00,  4.13it/s]

{'loss': 1.3556, 'grad_norm': 1.7786035537719727, 'learning_rate': 0.00018608529902311175, 'epoch': 0.07}


  7%|▋         | 1179/16798 [04:29<1:01:25,  4.24it/s]

{'loss': 1.421, 'grad_norm': 1.7271257638931274, 'learning_rate': 0.00018607338575172744, 'epoch': 0.07}


  7%|▋         | 1180/16798 [04:29<1:00:24,  4.31it/s]

{'loss': 2.1167, 'grad_norm': 2.2004175186157227, 'learning_rate': 0.0001860614724803431, 'epoch': 0.07}


  7%|▋         | 1181/16798 [04:29<59:27,  4.38it/s]  

{'loss': 1.367, 'grad_norm': 2.05195951461792, 'learning_rate': 0.0001860495592089588, 'epoch': 0.07}


  7%|▋         | 1182/16798 [04:30<59:49,  4.35it/s]

{'loss': 1.841, 'grad_norm': 1.98722243309021, 'learning_rate': 0.00018603764593757446, 'epoch': 0.07}


  7%|▋         | 1183/16798 [04:30<1:05:03,  4.00it/s]

{'loss': 1.6581, 'grad_norm': 1.7670047283172607, 'learning_rate': 0.00018602573266619015, 'epoch': 0.07}


  7%|▋         | 1184/16798 [04:30<1:03:06,  4.12it/s]

{'loss': 1.7383, 'grad_norm': 2.075408935546875, 'learning_rate': 0.00018601381939480581, 'epoch': 0.07}


  7%|▋         | 1185/16798 [04:30<1:03:08,  4.12it/s]

{'loss': 1.6663, 'grad_norm': 1.7657783031463623, 'learning_rate': 0.0001860019061234215, 'epoch': 0.07}


  7%|▋         | 1186/16798 [04:31<1:01:37,  4.22it/s]

{'loss': 1.4059, 'grad_norm': 1.7112617492675781, 'learning_rate': 0.00018598999285203717, 'epoch': 0.07}


  7%|▋         | 1187/16798 [04:31<1:01:38,  4.22it/s]

{'loss': 1.3834, 'grad_norm': 1.6867995262145996, 'learning_rate': 0.00018597807958065286, 'epoch': 0.07}


  7%|▋         | 1188/16798 [04:31<1:01:51,  4.21it/s]

{'loss': 1.6886, 'grad_norm': 1.834862232208252, 'learning_rate': 0.00018596616630926852, 'epoch': 0.07}


  7%|▋         | 1189/16798 [04:31<1:04:30,  4.03it/s]

{'loss': 1.0589, 'grad_norm': 1.3876633644104004, 'learning_rate': 0.0001859542530378842, 'epoch': 0.07}


  7%|▋         | 1190/16798 [04:32<1:04:06,  4.06it/s]

{'loss': 1.2915, 'grad_norm': 1.476585865020752, 'learning_rate': 0.00018594233976649988, 'epoch': 0.07}


  7%|▋         | 1191/16798 [04:32<1:02:34,  4.16it/s]

{'loss': 0.9611, 'grad_norm': 1.3588608503341675, 'learning_rate': 0.00018593042649511557, 'epoch': 0.07}


  7%|▋         | 1192/16798 [04:32<1:03:47,  4.08it/s]

{'loss': 1.3523, 'grad_norm': 1.8796799182891846, 'learning_rate': 0.00018591851322373123, 'epoch': 0.07}


  7%|▋         | 1193/16798 [04:32<1:02:41,  4.15it/s]

{'loss': 0.9758, 'grad_norm': 1.409447193145752, 'learning_rate': 0.00018590659995234692, 'epoch': 0.07}


  7%|▋         | 1194/16798 [04:33<1:02:15,  4.18it/s]

{'loss': 0.7623, 'grad_norm': 1.3124696016311646, 'learning_rate': 0.00018589468668096259, 'epoch': 0.07}


  7%|▋         | 1195/16798 [04:33<1:04:00,  4.06it/s]

{'loss': 1.0525, 'grad_norm': 1.498044490814209, 'learning_rate': 0.00018588277340957828, 'epoch': 0.07}


  7%|▋         | 1196/16798 [04:33<1:02:44,  4.14it/s]

{'loss': 1.1381, 'grad_norm': 1.9541434049606323, 'learning_rate': 0.00018587086013819394, 'epoch': 0.07}


  7%|▋         | 1197/16798 [04:33<1:00:41,  4.28it/s]

{'loss': 0.6545, 'grad_norm': 1.1677829027175903, 'learning_rate': 0.00018585894686680963, 'epoch': 0.07}


  7%|▋         | 1198/16798 [04:34<1:03:08,  4.12it/s]

{'loss': 0.4129, 'grad_norm': 0.9836329817771912, 'learning_rate': 0.0001858470335954253, 'epoch': 0.07}


  7%|▋         | 1199/16798 [04:34<1:01:48,  4.21it/s]

{'loss': 0.4637, 'grad_norm': 1.050260305404663, 'learning_rate': 0.000185835120324041, 'epoch': 0.07}


  7%|▋         | 1200/16798 [04:34<1:00:47,  4.28it/s]

{'loss': 0.2616, 'grad_norm': 0.9049206376075745, 'learning_rate': 0.00018582320705265668, 'epoch': 0.07}


  7%|▋         | 1201/16798 [04:34<1:02:06,  4.19it/s]

{'loss': 2.1651, 'grad_norm': 1.9295960664749146, 'learning_rate': 0.00018581129378127237, 'epoch': 0.07}


  7%|▋         | 1202/16798 [04:35<1:03:11,  4.11it/s]

{'loss': 1.6444, 'grad_norm': 3.385094165802002, 'learning_rate': 0.00018579938050988803, 'epoch': 0.07}


  7%|▋         | 1203/16798 [04:35<1:07:44,  3.84it/s]

{'loss': 2.4569, 'grad_norm': 2.30898118019104, 'learning_rate': 0.00018578746723850372, 'epoch': 0.07}


  7%|▋         | 1204/16798 [04:35<1:07:24,  3.86it/s]

{'loss': 2.1631, 'grad_norm': 1.7525798082351685, 'learning_rate': 0.00018577555396711939, 'epoch': 0.07}


  7%|▋         | 1205/16798 [04:35<1:04:11,  4.05it/s]

{'loss': 1.8154, 'grad_norm': 1.6224867105484009, 'learning_rate': 0.00018576364069573508, 'epoch': 0.07}


  7%|▋         | 1206/16798 [04:36<1:04:06,  4.05it/s]

{'loss': 2.1688, 'grad_norm': 1.7249972820281982, 'learning_rate': 0.00018575172742435074, 'epoch': 0.07}


  7%|▋         | 1207/16798 [04:36<1:05:06,  3.99it/s]

{'loss': 2.0623, 'grad_norm': 1.7849245071411133, 'learning_rate': 0.00018573981415296643, 'epoch': 0.07}


  7%|▋         | 1208/16798 [04:36<1:07:17,  3.86it/s]

{'loss': 1.4582, 'grad_norm': 1.5535976886749268, 'learning_rate': 0.0001857279008815821, 'epoch': 0.07}


  7%|▋         | 1209/16798 [04:36<1:03:54,  4.07it/s]

{'loss': 1.7493, 'grad_norm': 1.8363438844680786, 'learning_rate': 0.00018571598761019778, 'epoch': 0.07}


  7%|▋         | 1210/16798 [04:37<1:01:31,  4.22it/s]

{'loss': 1.5378, 'grad_norm': 1.8507673740386963, 'learning_rate': 0.00018570407433881345, 'epoch': 0.07}


  7%|▋         | 1211/16798 [04:37<1:04:38,  4.02it/s]

{'loss': 1.6589, 'grad_norm': 1.907776951789856, 'learning_rate': 0.00018569216106742914, 'epoch': 0.07}


  7%|▋         | 1212/16798 [04:37<1:04:25,  4.03it/s]

{'loss': 1.6489, 'grad_norm': 1.7284084558486938, 'learning_rate': 0.0001856802477960448, 'epoch': 0.07}


  7%|▋         | 1213/16798 [04:37<1:03:52,  4.07it/s]

{'loss': 1.6587, 'grad_norm': 2.098287343978882, 'learning_rate': 0.0001856683345246605, 'epoch': 0.07}


  7%|▋         | 1214/16798 [04:38<1:03:56,  4.06it/s]

{'loss': 1.8842, 'grad_norm': 2.3754353523254395, 'learning_rate': 0.00018565642125327616, 'epoch': 0.07}


  7%|▋         | 1215/16798 [04:38<1:10:20,  3.69it/s]

{'loss': 1.4783, 'grad_norm': 1.9890764951705933, 'learning_rate': 0.00018564450798189185, 'epoch': 0.07}


  7%|▋         | 1216/16798 [04:38<1:09:50,  3.72it/s]

{'loss': 1.3222, 'grad_norm': 1.7973952293395996, 'learning_rate': 0.0001856325947105075, 'epoch': 0.07}


  7%|▋         | 1217/16798 [04:38<1:07:30,  3.85it/s]

{'loss': 1.5597, 'grad_norm': 2.196138620376587, 'learning_rate': 0.0001856206814391232, 'epoch': 0.07}


  7%|▋         | 1218/16798 [04:39<1:07:35,  3.84it/s]

{'loss': 1.5096, 'grad_norm': 2.998778820037842, 'learning_rate': 0.00018560876816773887, 'epoch': 0.07}


  7%|▋         | 1219/16798 [04:39<1:04:56,  4.00it/s]

{'loss': 1.6498, 'grad_norm': 1.9564027786254883, 'learning_rate': 0.00018559685489635456, 'epoch': 0.07}


  7%|▋         | 1220/16798 [04:39<1:03:47,  4.07it/s]

{'loss': 1.816, 'grad_norm': 2.1087796688079834, 'learning_rate': 0.00018558494162497022, 'epoch': 0.07}


  7%|▋         | 1221/16798 [04:39<1:05:38,  3.96it/s]

{'loss': 1.6538, 'grad_norm': 2.032163143157959, 'learning_rate': 0.0001855730283535859, 'epoch': 0.07}


  7%|▋         | 1222/16798 [04:40<1:04:31,  4.02it/s]

{'loss': 1.3913, 'grad_norm': 1.7724814414978027, 'learning_rate': 0.00018556111508220157, 'epoch': 0.07}


  7%|▋         | 1224/16798 [04:40<1:01:55,  4.19it/s]

{'loss': 1.4547, 'grad_norm': 2.555251121520996, 'learning_rate': 0.00018554920181081727, 'epoch': 0.07}


  7%|▋         | 1224/16798 [04:40<1:01:55,  4.19it/s]

{'loss': 1.4451, 'grad_norm': 2.0021603107452393, 'learning_rate': 0.00018553728853943293, 'epoch': 0.07}


  7%|▋         | 1226/16798 [04:41<58:37,  4.43it/s]  

{'loss': 2.045, 'grad_norm': 2.1964828968048096, 'learning_rate': 0.00018552537526804862, 'epoch': 0.07}


  7%|▋         | 1226/16798 [04:41<58:37,  4.43it/s]

{'loss': 1.6259, 'grad_norm': 1.799871563911438, 'learning_rate': 0.00018551346199666428, 'epoch': 0.07}


  7%|▋         | 1227/16798 [04:41<58:27,  4.44it/s]

{'loss': 1.0877, 'grad_norm': 1.7060623168945312, 'learning_rate': 0.00018550154872527997, 'epoch': 0.07}


  7%|▋         | 1228/16798 [04:41<1:01:08,  4.24it/s]

{'loss': 1.7372, 'grad_norm': 1.9392578601837158, 'learning_rate': 0.00018548963545389564, 'epoch': 0.07}


  7%|▋         | 1229/16798 [04:41<59:19,  4.37it/s]  

{'loss': 1.3377, 'grad_norm': 1.6945701837539673, 'learning_rate': 0.00018547772218251133, 'epoch': 0.07}


  7%|▋         | 1230/16798 [04:41<1:00:31,  4.29it/s]

{'loss': 1.2639, 'grad_norm': 1.68577241897583, 'learning_rate': 0.00018546580891112702, 'epoch': 0.07}


  7%|▋         | 1232/16798 [04:42<58:48,  4.41it/s]  

{'loss': 1.5107, 'grad_norm': 1.8037611246109009, 'learning_rate': 0.00018545389563974268, 'epoch': 0.07}


  7%|▋         | 1232/16798 [04:42<58:48,  4.41it/s]

{'loss': 1.8636, 'grad_norm': 2.0397825241088867, 'learning_rate': 0.00018544198236835837, 'epoch': 0.07}


  7%|▋         | 1233/16798 [04:42<1:01:46,  4.20it/s]

{'loss': 1.104, 'grad_norm': 1.674765706062317, 'learning_rate': 0.00018543006909697404, 'epoch': 0.07}


  7%|▋         | 1234/16798 [04:42<1:00:48,  4.27it/s]

{'loss': 1.6755, 'grad_norm': 2.057546377182007, 'learning_rate': 0.00018541815582558973, 'epoch': 0.07}


  7%|▋         | 1235/16798 [04:43<1:00:25,  4.29it/s]

{'loss': 1.7631, 'grad_norm': 2.099547863006592, 'learning_rate': 0.0001854062425542054, 'epoch': 0.07}


  7%|▋         | 1236/16798 [04:43<59:58,  4.32it/s]  

{'loss': 1.6019, 'grad_norm': 2.239760398864746, 'learning_rate': 0.00018539432928282108, 'epoch': 0.07}


  7%|▋         | 1237/16798 [04:43<58:49,  4.41it/s]

{'loss': 1.1836, 'grad_norm': 1.7555441856384277, 'learning_rate': 0.00018538241601143675, 'epoch': 0.07}


  7%|▋         | 1238/16798 [04:43<57:49,  4.48it/s]

{'loss': 1.5327, 'grad_norm': 2.1155896186828613, 'learning_rate': 0.00018537050274005244, 'epoch': 0.07}


  7%|▋         | 1239/16798 [04:44<57:02,  4.55it/s]

{'loss': 1.7949, 'grad_norm': 1.9790518283843994, 'learning_rate': 0.0001853585894686681, 'epoch': 0.07}


  7%|▋         | 1240/16798 [04:44<56:35,  4.58it/s]

{'loss': 1.1756, 'grad_norm': 1.7203906774520874, 'learning_rate': 0.0001853466761972838, 'epoch': 0.07}


  7%|▋         | 1241/16798 [04:44<1:00:19,  4.30it/s]

{'loss': 1.3641, 'grad_norm': 1.8908029794692993, 'learning_rate': 0.00018533476292589946, 'epoch': 0.07}


  7%|▋         | 1242/16798 [04:44<1:02:28,  4.15it/s]

{'loss': 1.0834, 'grad_norm': 1.4475080966949463, 'learning_rate': 0.00018532284965451515, 'epoch': 0.07}


  7%|▋         | 1243/16798 [04:44<1:01:21,  4.22it/s]

{'loss': 1.1608, 'grad_norm': 2.619719982147217, 'learning_rate': 0.0001853109363831308, 'epoch': 0.07}


  7%|▋         | 1245/16798 [04:45<57:00,  4.55it/s]  

{'loss': 0.8631, 'grad_norm': 1.4121999740600586, 'learning_rate': 0.0001852990231117465, 'epoch': 0.07}


  7%|▋         | 1246/16798 [04:45<56:14,  4.61it/s]

{'loss': 1.5067, 'grad_norm': 1.9181944131851196, 'learning_rate': 0.00018528710984036216, 'epoch': 0.07}


  7%|▋         | 1246/16798 [04:45<56:14,  4.61it/s]

{'loss': 0.9065, 'grad_norm': 1.500461459159851, 'learning_rate': 0.00018527519656897786, 'epoch': 0.07}


  7%|▋         | 1247/16798 [04:45<55:20,  4.68it/s]

{'loss': 1.1516, 'grad_norm': 2.061636447906494, 'learning_rate': 0.00018526328329759352, 'epoch': 0.07}


  7%|▋         | 1248/16798 [04:46<59:57,  4.32it/s]

{'loss': 0.592, 'grad_norm': 1.3015941381454468, 'learning_rate': 0.0001852513700262092, 'epoch': 0.07}


  7%|▋         | 1250/16798 [04:46<59:02,  4.39it/s]  

{'loss': 0.2384, 'grad_norm': 0.9161567687988281, 'learning_rate': 0.00018523945675482487, 'epoch': 0.07}


  7%|▋         | 1250/16798 [04:46<59:02,  4.39it/s]

{'loss': 0.5233, 'grad_norm': 1.175950050354004, 'learning_rate': 0.00018522754348344056, 'epoch': 0.07}


  7%|▋         | 1251/16798 [04:46<59:07,  4.38it/s]

{'loss': 1.9603, 'grad_norm': 1.7812031507492065, 'learning_rate': 0.00018521563021205623, 'epoch': 0.07}


  7%|▋         | 1252/16798 [04:46<58:13,  4.45it/s]

{'loss': 2.4173, 'grad_norm': 2.062769651412964, 'learning_rate': 0.00018520371694067192, 'epoch': 0.07}


  7%|▋         | 1253/16798 [04:47<58:15,  4.45it/s]

{'loss': 2.2075, 'grad_norm': 2.4541361331939697, 'learning_rate': 0.00018519180366928758, 'epoch': 0.07}


  7%|▋         | 1254/16798 [04:47<1:01:50,  4.19it/s]

{'loss': 2.0298, 'grad_norm': 1.7495442628860474, 'learning_rate': 0.00018517989039790327, 'epoch': 0.07}


  7%|▋         | 1255/16798 [04:47<1:00:26,  4.29it/s]

{'loss': 2.4256, 'grad_norm': 2.2326722145080566, 'learning_rate': 0.00018516797712651894, 'epoch': 0.07}


  7%|▋         | 1256/16798 [04:47<1:00:13,  4.30it/s]

{'loss': 2.5165, 'grad_norm': 2.2898662090301514, 'learning_rate': 0.00018515606385513463, 'epoch': 0.07}


  7%|▋         | 1257/16798 [04:48<59:50,  4.33it/s]  

{'loss': 2.2438, 'grad_norm': 2.175936698913574, 'learning_rate': 0.0001851441505837503, 'epoch': 0.07}


  7%|▋         | 1258/16798 [04:48<1:00:14,  4.30it/s]

{'loss': 1.8563, 'grad_norm': 1.8892689943313599, 'learning_rate': 0.00018513223731236598, 'epoch': 0.07}


  7%|▋         | 1259/16798 [04:48<59:47,  4.33it/s]  

{'loss': 1.9015, 'grad_norm': 2.155113697052002, 'learning_rate': 0.00018512032404098165, 'epoch': 0.07}


  8%|▊         | 1260/16798 [04:48<1:02:59,  4.11it/s]

{'loss': 1.7339, 'grad_norm': 2.012502908706665, 'learning_rate': 0.00018510841076959736, 'epoch': 0.08}


  8%|▊         | 1261/16798 [04:49<1:01:25,  4.22it/s]

{'loss': 1.8908, 'grad_norm': 1.9086003303527832, 'learning_rate': 0.00018509649749821303, 'epoch': 0.08}


  8%|▊         | 1263/16798 [04:49<58:40,  4.41it/s]  

{'loss': 1.7978, 'grad_norm': 1.8434878587722778, 'learning_rate': 0.00018508458422682872, 'epoch': 0.08}


  8%|▊         | 1263/16798 [04:49<58:40,  4.41it/s]

{'loss': 1.4969, 'grad_norm': 1.792290449142456, 'learning_rate': 0.00018507267095544438, 'epoch': 0.08}


  8%|▊         | 1265/16798 [04:49<56:18,  4.60it/s]

{'loss': 1.6817, 'grad_norm': 2.0318968296051025, 'learning_rate': 0.00018506075768406007, 'epoch': 0.08}


  8%|▊         | 1266/16798 [04:50<55:35,  4.66it/s]

{'loss': 1.7933, 'grad_norm': 1.9960932731628418, 'learning_rate': 0.00018504884441267574, 'epoch': 0.08}


  8%|▊         | 1266/16798 [04:50<55:35,  4.66it/s]

{'loss': 1.6873, 'grad_norm': 1.8375234603881836, 'learning_rate': 0.00018503693114129143, 'epoch': 0.08}


  8%|▊         | 1267/16798 [04:50<1:01:10,  4.23it/s]

{'loss': 2.0178, 'grad_norm': 2.058504581451416, 'learning_rate': 0.0001850250178699071, 'epoch': 0.08}


  8%|▊         | 1268/16798 [04:50<1:00:55,  4.25it/s]

{'loss': 1.7973, 'grad_norm': 1.828601360321045, 'learning_rate': 0.00018501310459852278, 'epoch': 0.08}


  8%|▊         | 1269/16798 [04:50<1:03:11,  4.10it/s]

{'loss': 1.6195, 'grad_norm': 1.5569982528686523, 'learning_rate': 0.00018500119132713844, 'epoch': 0.08}


  8%|▊         | 1270/16798 [04:51<1:00:58,  4.24it/s]

{'loss': 1.5634, 'grad_norm': 1.6460920572280884, 'learning_rate': 0.00018498927805575414, 'epoch': 0.08}


  8%|▊         | 1271/16798 [04:51<59:14,  4.37it/s]  

{'loss': 1.6032, 'grad_norm': 1.8948858976364136, 'learning_rate': 0.0001849773647843698, 'epoch': 0.08}


  8%|▊         | 1273/16798 [04:51<57:20,  4.51it/s]

{'loss': 1.5089, 'grad_norm': 1.6424695253372192, 'learning_rate': 0.0001849654515129855, 'epoch': 0.08}


  8%|▊         | 1273/16798 [04:51<57:20,  4.51it/s]

{'loss': 1.274, 'grad_norm': 1.5466504096984863, 'learning_rate': 0.00018495353824160115, 'epoch': 0.08}


  8%|▊         | 1274/16798 [04:52<59:35,  4.34it/s]

{'loss': 1.712, 'grad_norm': 1.7853120565414429, 'learning_rate': 0.00018494162497021684, 'epoch': 0.08}


  8%|▊         | 1275/16798 [04:52<1:04:00,  4.04it/s]

{'loss': 1.6779, 'grad_norm': 2.1151556968688965, 'learning_rate': 0.0001849297116988325, 'epoch': 0.08}


  8%|▊         | 1276/16798 [04:52<1:01:49,  4.18it/s]

{'loss': 1.4982, 'grad_norm': 2.079432964324951, 'learning_rate': 0.0001849177984274482, 'epoch': 0.08}


  8%|▊         | 1277/16798 [04:52<1:00:43,  4.26it/s]

{'loss': 1.8534, 'grad_norm': 1.939117193222046, 'learning_rate': 0.00018490588515606386, 'epoch': 0.08}


  8%|▊         | 1278/16798 [04:53<59:18,  4.36it/s]  

{'loss': 1.404, 'grad_norm': 1.8443272113800049, 'learning_rate': 0.00018489397188467955, 'epoch': 0.08}


  8%|▊         | 1279/16798 [04:53<58:59,  4.38it/s]

{'loss': 1.3657, 'grad_norm': 1.818414568901062, 'learning_rate': 0.00018488205861329522, 'epoch': 0.08}


  8%|▊         | 1280/16798 [04:53<58:13,  4.44it/s]

{'loss': 1.5326, 'grad_norm': 1.7960104942321777, 'learning_rate': 0.0001848701453419109, 'epoch': 0.08}


  8%|▊         | 1281/16798 [04:53<1:00:59,  4.24it/s]

{'loss': 1.4691, 'grad_norm': 2.192837953567505, 'learning_rate': 0.00018485823207052657, 'epoch': 0.08}


  8%|▊         | 1282/16798 [04:54<1:03:50,  4.05it/s]

{'loss': 1.4411, 'grad_norm': 2.837036371231079, 'learning_rate': 0.00018484631879914226, 'epoch': 0.08}


  8%|▊         | 1284/16798 [04:54<1:00:40,  4.26it/s]

{'loss': 1.2537, 'grad_norm': 1.6554454565048218, 'learning_rate': 0.00018483440552775793, 'epoch': 0.08}


  8%|▊         | 1284/16798 [04:54<1:00:40,  4.26it/s]

{'loss': 1.5725, 'grad_norm': 1.8508687019348145, 'learning_rate': 0.00018482249225637362, 'epoch': 0.08}


  8%|▊         | 1285/16798 [04:54<58:37,  4.41it/s]  

{'loss': 1.1937, 'grad_norm': 1.6566932201385498, 'learning_rate': 0.00018481057898498928, 'epoch': 0.08}


  8%|▊         | 1286/16798 [04:54<59:04,  4.38it/s]

{'loss': 1.2514, 'grad_norm': 4.174309730529785, 'learning_rate': 0.00018479866571360494, 'epoch': 0.08}


  8%|▊         | 1287/16798 [04:55<1:00:30,  4.27it/s]

{'loss': 1.4297, 'grad_norm': 1.8334745168685913, 'learning_rate': 0.00018478675244222063, 'epoch': 0.08}


  8%|▊         | 1288/16798 [04:55<1:02:08,  4.16it/s]

{'loss': 1.1153, 'grad_norm': 1.5508984327316284, 'learning_rate': 0.0001847748391708363, 'epoch': 0.08}


  8%|▊         | 1289/16798 [04:55<1:04:27,  4.01it/s]

{'loss': 1.3019, 'grad_norm': 1.7573758363723755, 'learning_rate': 0.000184762925899452, 'epoch': 0.08}


  8%|▊         | 1290/16798 [04:55<1:01:05,  4.23it/s]

{'loss': 1.3783, 'grad_norm': 1.9800981283187866, 'learning_rate': 0.00018475101262806765, 'epoch': 0.08}


  8%|▊         | 1291/16798 [04:56<1:08:44,  3.76it/s]

{'loss': 1.022, 'grad_norm': 1.5583558082580566, 'learning_rate': 0.00018473909935668337, 'epoch': 0.08}


  8%|▊         | 1292/16798 [04:56<1:05:24,  3.95it/s]

{'loss': 1.5609, 'grad_norm': 1.9666273593902588, 'learning_rate': 0.00018472718608529903, 'epoch': 0.08}


  8%|▊         | 1293/16798 [04:56<1:04:44,  3.99it/s]

{'loss': 0.9658, 'grad_norm': 1.4382673501968384, 'learning_rate': 0.00018471527281391472, 'epoch': 0.08}


  8%|▊         | 1294/16798 [04:56<1:03:38,  4.06it/s]

{'loss': 0.9736, 'grad_norm': 1.6160058975219727, 'learning_rate': 0.0001847033595425304, 'epoch': 0.08}


  8%|▊         | 1295/16798 [04:57<1:09:52,  3.70it/s]

{'loss': 0.9311, 'grad_norm': 1.4478906393051147, 'learning_rate': 0.00018469144627114608, 'epoch': 0.08}


  8%|▊         | 1296/16798 [04:57<1:05:44,  3.93it/s]

{'loss': 0.9941, 'grad_norm': 1.6491378545761108, 'learning_rate': 0.00018467953299976174, 'epoch': 0.08}


  8%|▊         | 1297/16798 [04:57<1:04:54,  3.98it/s]

{'loss': 0.4983, 'grad_norm': 1.2822495698928833, 'learning_rate': 0.00018466761972837743, 'epoch': 0.08}


  8%|▊         | 1298/16798 [04:57<1:03:11,  4.09it/s]

{'loss': 0.3362, 'grad_norm': 1.0357049703598022, 'learning_rate': 0.0001846557064569931, 'epoch': 0.08}


  8%|▊         | 1299/16798 [04:58<1:03:17,  4.08it/s]

{'loss': 0.2807, 'grad_norm': 0.8925226926803589, 'learning_rate': 0.0001846437931856088, 'epoch': 0.08}


  8%|▊         | 1300/16798 [04:58<1:01:56,  4.17it/s]

{'loss': 0.34, 'grad_norm': 1.132257103919983, 'learning_rate': 0.00018463187991422445, 'epoch': 0.08}


  8%|▊         | 1301/16798 [04:58<1:05:43,  3.93it/s]

{'loss': 2.1039, 'grad_norm': 1.6665529012680054, 'learning_rate': 0.00018461996664284014, 'epoch': 0.08}


  8%|▊         | 1302/16798 [04:58<1:04:06,  4.03it/s]

{'loss': 2.0131, 'grad_norm': 1.7085994482040405, 'learning_rate': 0.0001846080533714558, 'epoch': 0.08}


  8%|▊         | 1303/16798 [04:59<1:04:38,  4.00it/s]

{'loss': 2.2426, 'grad_norm': 1.8218754529953003, 'learning_rate': 0.0001845961401000715, 'epoch': 0.08}


  8%|▊         | 1304/16798 [04:59<1:04:06,  4.03it/s]

{'loss': 1.9039, 'grad_norm': 1.723848819732666, 'learning_rate': 0.00018458422682868716, 'epoch': 0.08}


  8%|▊         | 1305/16798 [04:59<1:05:00,  3.97it/s]

{'loss': 2.4085, 'grad_norm': 2.072213888168335, 'learning_rate': 0.00018457231355730285, 'epoch': 0.08}


  8%|▊         | 1306/16798 [04:59<1:03:25,  4.07it/s]

{'loss': 2.3607, 'grad_norm': 2.167290449142456, 'learning_rate': 0.00018456040028591851, 'epoch': 0.08}


  8%|▊         | 1307/16798 [05:00<1:06:36,  3.88it/s]

{'loss': 1.8594, 'grad_norm': 1.8260679244995117, 'learning_rate': 0.0001845484870145342, 'epoch': 0.08}


  8%|▊         | 1308/16798 [05:00<1:05:08,  3.96it/s]

{'loss': 1.6846, 'grad_norm': 1.801431655883789, 'learning_rate': 0.00018453657374314987, 'epoch': 0.08}


  8%|▊         | 1309/16798 [05:00<1:03:46,  4.05it/s]

{'loss': 1.5188, 'grad_norm': 1.861235499382019, 'learning_rate': 0.00018452466047176556, 'epoch': 0.08}


  8%|▊         | 1310/16798 [05:00<1:04:56,  3.97it/s]

{'loss': 1.6633, 'grad_norm': 1.9772820472717285, 'learning_rate': 0.00018451274720038122, 'epoch': 0.08}


  8%|▊         | 1311/16798 [05:01<1:06:33,  3.88it/s]

{'loss': 1.5451, 'grad_norm': 1.7145966291427612, 'learning_rate': 0.00018450083392899691, 'epoch': 0.08}


  8%|▊         | 1312/16798 [05:01<1:05:11,  3.96it/s]

{'loss': 1.8489, 'grad_norm': 1.9869388341903687, 'learning_rate': 0.00018448892065761258, 'epoch': 0.08}


  8%|▊         | 1313/16798 [05:01<1:07:05,  3.85it/s]

{'loss': 1.8004, 'grad_norm': 1.9148070812225342, 'learning_rate': 0.00018447700738622827, 'epoch': 0.08}


  8%|▊         | 1314/16798 [05:01<1:04:19,  4.01it/s]

{'loss': 1.3766, 'grad_norm': 1.8544622659683228, 'learning_rate': 0.00018446509411484393, 'epoch': 0.08}


  8%|▊         | 1315/16798 [05:02<1:04:03,  4.03it/s]

{'loss': 1.5257, 'grad_norm': 1.893370509147644, 'learning_rate': 0.00018445318084345962, 'epoch': 0.08}


  8%|▊         | 1316/16798 [05:02<1:05:49,  3.92it/s]

{'loss': 1.5399, 'grad_norm': 1.822303295135498, 'learning_rate': 0.0001844412675720753, 'epoch': 0.08}


  8%|▊         | 1317/16798 [05:02<1:05:32,  3.94it/s]

{'loss': 1.4682, 'grad_norm': 1.8174729347229004, 'learning_rate': 0.00018442935430069098, 'epoch': 0.08}


  8%|▊         | 1318/16798 [05:03<1:08:27,  3.77it/s]

{'loss': 1.7426, 'grad_norm': 2.1300768852233887, 'learning_rate': 0.00018441744102930664, 'epoch': 0.08}


  8%|▊         | 1319/16798 [05:03<1:08:01,  3.79it/s]

{'loss': 1.5899, 'grad_norm': 1.7324968576431274, 'learning_rate': 0.00018440552775792233, 'epoch': 0.08}


  8%|▊         | 1320/16798 [05:03<1:07:20,  3.83it/s]

{'loss': 1.1127, 'grad_norm': 1.5695736408233643, 'learning_rate': 0.000184393614486538, 'epoch': 0.08}


  8%|▊         | 1321/16798 [05:03<1:05:29,  3.94it/s]

{'loss': 1.3096, 'grad_norm': 1.9069669246673584, 'learning_rate': 0.0001843817012151537, 'epoch': 0.08}


  8%|▊         | 1322/16798 [05:04<1:05:08,  3.96it/s]

{'loss': 1.0939, 'grad_norm': 1.5087593793869019, 'learning_rate': 0.00018436978794376938, 'epoch': 0.08}


  8%|▊         | 1323/16798 [05:04<1:04:29,  4.00it/s]

{'loss': 1.7641, 'grad_norm': 1.9810214042663574, 'learning_rate': 0.00018435787467238507, 'epoch': 0.08}


  8%|▊         | 1324/16798 [05:04<1:07:36,  3.81it/s]

{'loss': 1.7745, 'grad_norm': 2.1860451698303223, 'learning_rate': 0.00018434596140100073, 'epoch': 0.08}


  8%|▊         | 1325/16798 [05:04<1:06:29,  3.88it/s]

{'loss': 1.4922, 'grad_norm': 2.026085376739502, 'learning_rate': 0.00018433404812961642, 'epoch': 0.08}


  8%|▊         | 1326/16798 [05:05<1:06:36,  3.87it/s]

{'loss': 1.2062, 'grad_norm': 1.5910532474517822, 'learning_rate': 0.00018432213485823209, 'epoch': 0.08}


  8%|▊         | 1327/16798 [05:05<1:05:49,  3.92it/s]

{'loss': 1.6428, 'grad_norm': 1.8455415964126587, 'learning_rate': 0.00018431022158684778, 'epoch': 0.08}


  8%|▊         | 1328/16798 [05:05<1:04:29,  4.00it/s]

{'loss': 1.0846, 'grad_norm': 1.474753975868225, 'learning_rate': 0.00018429830831546344, 'epoch': 0.08}


  8%|▊         | 1329/16798 [05:05<1:04:02,  4.03it/s]

{'loss': 1.4272, 'grad_norm': 1.720690369606018, 'learning_rate': 0.00018428639504407913, 'epoch': 0.08}


  8%|▊         | 1330/16798 [05:06<1:06:07,  3.90it/s]

{'loss': 1.6408, 'grad_norm': 1.8022446632385254, 'learning_rate': 0.0001842744817726948, 'epoch': 0.08}


  8%|▊         | 1331/16798 [05:06<1:05:50,  3.91it/s]

{'loss': 1.1951, 'grad_norm': 1.580531358718872, 'learning_rate': 0.00018426256850131049, 'epoch': 0.08}


  8%|▊         | 1332/16798 [05:06<1:03:18,  4.07it/s]

{'loss': 1.5729, 'grad_norm': 1.8444223403930664, 'learning_rate': 0.00018425065522992615, 'epoch': 0.08}


  8%|▊         | 1333/16798 [05:06<1:02:36,  4.12it/s]

{'loss': 1.443, 'grad_norm': 1.8539966344833374, 'learning_rate': 0.00018423874195854184, 'epoch': 0.08}


  8%|▊         | 1334/16798 [05:06<1:01:16,  4.21it/s]

{'loss': 1.6279, 'grad_norm': 1.978578805923462, 'learning_rate': 0.0001842268286871575, 'epoch': 0.08}


  8%|▊         | 1335/16798 [05:07<1:01:24,  4.20it/s]

{'loss': 0.8303, 'grad_norm': 1.3682740926742554, 'learning_rate': 0.0001842149154157732, 'epoch': 0.08}


  8%|▊         | 1336/16798 [05:07<1:05:08,  3.96it/s]

{'loss': 1.8502, 'grad_norm': 2.250732660293579, 'learning_rate': 0.00018420300214438886, 'epoch': 0.08}


  8%|▊         | 1337/16798 [05:07<1:03:19,  4.07it/s]

{'loss': 1.2477, 'grad_norm': 1.7937549352645874, 'learning_rate': 0.00018419108887300455, 'epoch': 0.08}


  8%|▊         | 1338/16798 [05:07<1:02:49,  4.10it/s]

{'loss': 1.4483, 'grad_norm': 1.918326735496521, 'learning_rate': 0.0001841791756016202, 'epoch': 0.08}


  8%|▊         | 1339/16798 [05:08<1:01:11,  4.21it/s]

{'loss': 1.2923, 'grad_norm': 1.741827130317688, 'learning_rate': 0.0001841672623302359, 'epoch': 0.08}


  8%|▊         | 1340/16798 [05:08<1:02:29,  4.12it/s]

{'loss': 1.7723, 'grad_norm': 2.6720664501190186, 'learning_rate': 0.00018415534905885157, 'epoch': 0.08}


  8%|▊         | 1341/16798 [05:08<1:01:33,  4.18it/s]

{'loss': 0.6043, 'grad_norm': 1.246219277381897, 'learning_rate': 0.00018414343578746726, 'epoch': 0.08}


  8%|▊         | 1342/16798 [05:08<1:06:07,  3.90it/s]

{'loss': 1.054, 'grad_norm': 1.5577703714370728, 'learning_rate': 0.00018413152251608292, 'epoch': 0.08}


  8%|▊         | 1343/16798 [05:09<1:05:38,  3.92it/s]

{'loss': 1.2494, 'grad_norm': 1.7840170860290527, 'learning_rate': 0.00018411960924469859, 'epoch': 0.08}


  8%|▊         | 1344/16798 [05:09<1:04:10,  4.01it/s]

{'loss': 0.9059, 'grad_norm': 1.8217512369155884, 'learning_rate': 0.00018410769597331428, 'epoch': 0.08}


  8%|▊         | 1345/16798 [05:09<1:05:39,  3.92it/s]

{'loss': 0.7243, 'grad_norm': 1.663385272026062, 'learning_rate': 0.00018409578270192994, 'epoch': 0.08}


  8%|▊         | 1346/16798 [05:09<1:05:00,  3.96it/s]

{'loss': 0.5013, 'grad_norm': 1.2227725982666016, 'learning_rate': 0.00018408386943054563, 'epoch': 0.08}


  8%|▊         | 1347/16798 [05:10<1:02:23,  4.13it/s]

{'loss': 0.8074, 'grad_norm': 1.724489688873291, 'learning_rate': 0.0001840719561591613, 'epoch': 0.08}


  8%|▊         | 1348/16798 [05:10<1:00:59,  4.22it/s]

{'loss': 0.3796, 'grad_norm': 0.9983022212982178, 'learning_rate': 0.00018406004288777698, 'epoch': 0.08}


  8%|▊         | 1349/16798 [05:10<1:03:57,  4.03it/s]

{'loss': 0.3913, 'grad_norm': 0.9619731307029724, 'learning_rate': 0.00018404812961639265, 'epoch': 0.08}


  8%|▊         | 1350/16798 [05:10<1:01:24,  4.19it/s]

{'loss': 0.2617, 'grad_norm': 0.8712614178657532, 'learning_rate': 0.00018403621634500834, 'epoch': 0.08}


  8%|▊         | 1351/16798 [05:11<1:03:02,  4.08it/s]

{'loss': 1.8482, 'grad_norm': 1.9178893566131592, 'learning_rate': 0.000184024303073624, 'epoch': 0.08}


  8%|▊         | 1352/16798 [05:11<1:02:58,  4.09it/s]

{'loss': 1.6545, 'grad_norm': 1.8733528852462769, 'learning_rate': 0.0001840123898022397, 'epoch': 0.08}


  8%|▊         | 1353/16798 [05:11<1:03:39,  4.04it/s]

{'loss': 1.7617, 'grad_norm': 1.7280503511428833, 'learning_rate': 0.00018400047653085538, 'epoch': 0.08}


  8%|▊         | 1354/16798 [05:11<1:01:36,  4.18it/s]

{'loss': 2.2366, 'grad_norm': 2.004629611968994, 'learning_rate': 0.00018398856325947108, 'epoch': 0.08}


  8%|▊         | 1355/16798 [05:12<1:01:54,  4.16it/s]

{'loss': 2.0154, 'grad_norm': 2.037609815597534, 'learning_rate': 0.00018397664998808674, 'epoch': 0.08}


  8%|▊         | 1356/16798 [05:12<1:03:01,  4.08it/s]

{'loss': 1.7503, 'grad_norm': 1.836753487586975, 'learning_rate': 0.00018396473671670243, 'epoch': 0.08}


  8%|▊         | 1357/16798 [05:12<1:01:39,  4.17it/s]

{'loss': 2.0213, 'grad_norm': 2.0433855056762695, 'learning_rate': 0.0001839528234453181, 'epoch': 0.08}


  8%|▊         | 1358/16798 [05:12<1:02:28,  4.12it/s]

{'loss': 1.836, 'grad_norm': 1.8594989776611328, 'learning_rate': 0.00018394091017393378, 'epoch': 0.08}


  8%|▊         | 1359/16798 [05:13<1:02:31,  4.12it/s]

{'loss': 1.9436, 'grad_norm': 2.3096210956573486, 'learning_rate': 0.00018392899690254945, 'epoch': 0.08}


  8%|▊         | 1360/16798 [05:13<1:02:36,  4.11it/s]

{'loss': 2.1474, 'grad_norm': 1.8997732400894165, 'learning_rate': 0.00018391708363116514, 'epoch': 0.08}


  8%|▊         | 1361/16798 [05:13<1:07:22,  3.82it/s]

{'loss': 1.5198, 'grad_norm': 2.0593039989471436, 'learning_rate': 0.0001839051703597808, 'epoch': 0.08}


  8%|▊         | 1362/16798 [05:13<1:05:54,  3.90it/s]

{'loss': 1.7796, 'grad_norm': 2.3052515983581543, 'learning_rate': 0.0001838932570883965, 'epoch': 0.08}


  8%|▊         | 1363/16798 [05:14<1:04:37,  3.98it/s]

{'loss': 1.9829, 'grad_norm': 2.1237454414367676, 'learning_rate': 0.00018388134381701216, 'epoch': 0.08}


  8%|▊         | 1364/16798 [05:14<1:05:09,  3.95it/s]

{'loss': 1.8036, 'grad_norm': 2.5353457927703857, 'learning_rate': 0.00018386943054562785, 'epoch': 0.08}


  8%|▊         | 1365/16798 [05:14<1:04:49,  3.97it/s]

{'loss': 1.8697, 'grad_norm': 1.990601897239685, 'learning_rate': 0.0001838575172742435, 'epoch': 0.08}


  8%|▊         | 1366/16798 [05:14<1:08:59,  3.73it/s]

{'loss': 1.4252, 'grad_norm': 1.8951619863510132, 'learning_rate': 0.0001838456040028592, 'epoch': 0.08}


  8%|▊         | 1367/16798 [05:15<1:12:11,  3.56it/s]

{'loss': 1.7702, 'grad_norm': 1.8294825553894043, 'learning_rate': 0.00018383369073147487, 'epoch': 0.08}


  8%|▊         | 1368/16798 [05:15<1:10:19,  3.66it/s]

{'loss': 1.6108, 'grad_norm': 1.776794195175171, 'learning_rate': 0.00018382177746009056, 'epoch': 0.08}


  8%|▊         | 1369/16798 [05:15<1:07:13,  3.83it/s]

{'loss': 1.3151, 'grad_norm': 1.6437541246414185, 'learning_rate': 0.00018380986418870622, 'epoch': 0.08}


  8%|▊         | 1370/16798 [05:16<1:06:15,  3.88it/s]

{'loss': 1.5875, 'grad_norm': 1.9946058988571167, 'learning_rate': 0.0001837979509173219, 'epoch': 0.08}


  8%|▊         | 1371/16798 [05:16<1:04:53,  3.96it/s]

{'loss': 1.4606, 'grad_norm': 1.7294425964355469, 'learning_rate': 0.00018378603764593757, 'epoch': 0.08}


  8%|▊         | 1372/16798 [05:16<1:04:09,  4.01it/s]

{'loss': 1.3547, 'grad_norm': 1.7566105127334595, 'learning_rate': 0.00018377412437455326, 'epoch': 0.08}


  8%|▊         | 1373/16798 [05:16<1:05:40,  3.91it/s]

{'loss': 1.593, 'grad_norm': 1.8755215406417847, 'learning_rate': 0.00018376221110316893, 'epoch': 0.08}


  8%|▊         | 1374/16798 [05:17<1:07:00,  3.84it/s]

{'loss': 1.7511, 'grad_norm': 1.8312193155288696, 'learning_rate': 0.00018375029783178462, 'epoch': 0.08}


  8%|▊         | 1375/16798 [05:17<1:08:02,  3.78it/s]

{'loss': 1.4538, 'grad_norm': 1.962253451347351, 'learning_rate': 0.00018373838456040028, 'epoch': 0.08}


  8%|▊         | 1376/16798 [05:17<1:10:21,  3.65it/s]

{'loss': 1.781, 'grad_norm': 1.7897173166275024, 'learning_rate': 0.00018372647128901597, 'epoch': 0.08}


  8%|▊         | 1377/16798 [05:17<1:08:13,  3.77it/s]

{'loss': 1.6878, 'grad_norm': 1.8718703985214233, 'learning_rate': 0.00018371455801763164, 'epoch': 0.08}


  8%|▊         | 1378/16798 [05:18<1:10:16,  3.66it/s]

{'loss': 1.4072, 'grad_norm': 1.6597553491592407, 'learning_rate': 0.00018370264474624733, 'epoch': 0.08}


  8%|▊         | 1379/16798 [05:18<1:07:34,  3.80it/s]

{'loss': 1.6199, 'grad_norm': 2.16623854637146, 'learning_rate': 0.000183690731474863, 'epoch': 0.08}


  8%|▊         | 1380/16798 [05:18<1:04:21,  3.99it/s]

{'loss': 1.5785, 'grad_norm': 1.7457876205444336, 'learning_rate': 0.00018367881820347868, 'epoch': 0.08}


  8%|▊         | 1381/16798 [05:18<1:07:26,  3.81it/s]

{'loss': 1.4702, 'grad_norm': 1.8869601488113403, 'learning_rate': 0.00018366690493209435, 'epoch': 0.08}


  8%|▊         | 1382/16798 [05:19<1:06:06,  3.89it/s]

{'loss': 1.6562, 'grad_norm': 1.9915225505828857, 'learning_rate': 0.00018365499166071004, 'epoch': 0.08}


  8%|▊         | 1383/16798 [05:19<1:03:36,  4.04it/s]

{'loss': 1.3589, 'grad_norm': 2.0381991863250732, 'learning_rate': 0.0001836430783893257, 'epoch': 0.08}


  8%|▊         | 1384/16798 [05:19<1:07:23,  3.81it/s]

{'loss': 1.6463, 'grad_norm': 2.1069693565368652, 'learning_rate': 0.00018363116511794142, 'epoch': 0.08}


  8%|▊         | 1385/16798 [05:19<1:05:06,  3.95it/s]

{'loss': 1.6273, 'grad_norm': 1.9517574310302734, 'learning_rate': 0.00018361925184655708, 'epoch': 0.08}


  8%|▊         | 1386/16798 [05:20<1:03:18,  4.06it/s]

{'loss': 1.0516, 'grad_norm': 1.5344091653823853, 'learning_rate': 0.00018360733857517277, 'epoch': 0.08}


  8%|▊         | 1387/16798 [05:20<1:02:06,  4.14it/s]

{'loss': 1.3392, 'grad_norm': 1.8975075483322144, 'learning_rate': 0.00018359542530378844, 'epoch': 0.08}


  8%|▊         | 1388/16798 [05:20<1:01:01,  4.21it/s]

{'loss': 1.3676, 'grad_norm': 1.688225507736206, 'learning_rate': 0.00018358351203240413, 'epoch': 0.08}


  8%|▊         | 1389/16798 [05:20<1:03:29,  4.04it/s]

{'loss': 1.3525, 'grad_norm': 1.9532923698425293, 'learning_rate': 0.0001835715987610198, 'epoch': 0.08}


  8%|▊         | 1390/16798 [05:21<1:07:19,  3.81it/s]

{'loss': 1.3407, 'grad_norm': 1.891755223274231, 'learning_rate': 0.00018355968548963548, 'epoch': 0.08}


  8%|▊         | 1391/16798 [05:21<1:04:53,  3.96it/s]

{'loss': 1.5923, 'grad_norm': 1.9544575214385986, 'learning_rate': 0.00018354777221825115, 'epoch': 0.08}


  8%|▊         | 1392/16798 [05:21<1:10:29,  3.64it/s]

{'loss': 1.3379, 'grad_norm': 1.8162544965744019, 'learning_rate': 0.00018353585894686684, 'epoch': 0.08}


  8%|▊         | 1393/16798 [05:21<1:07:43,  3.79it/s]

{'loss': 1.3792, 'grad_norm': 2.182804584503174, 'learning_rate': 0.0001835239456754825, 'epoch': 0.08}


  8%|▊         | 1394/16798 [05:22<1:04:14,  4.00it/s]

{'loss': 1.2558, 'grad_norm': 1.9627444744110107, 'learning_rate': 0.0001835120324040982, 'epoch': 0.08}


  8%|▊         | 1395/16798 [05:22<1:02:16,  4.12it/s]

{'loss': 1.2229, 'grad_norm': 1.7629987001419067, 'learning_rate': 0.00018350011913271385, 'epoch': 0.08}


  8%|▊         | 1396/16798 [05:22<1:05:13,  3.94it/s]

{'loss': 1.144, 'grad_norm': 1.7314075231552124, 'learning_rate': 0.00018348820586132955, 'epoch': 0.08}


  8%|▊         | 1397/16798 [05:22<1:04:19,  3.99it/s]

{'loss': 0.8092, 'grad_norm': 1.4105262756347656, 'learning_rate': 0.0001834762925899452, 'epoch': 0.08}


  8%|▊         | 1398/16798 [05:23<1:05:00,  3.95it/s]

{'loss': 0.7534, 'grad_norm': 1.6178889274597168, 'learning_rate': 0.0001834643793185609, 'epoch': 0.08}


  8%|▊         | 1399/16798 [05:23<1:06:31,  3.86it/s]

{'loss': 0.7352, 'grad_norm': 1.5342990159988403, 'learning_rate': 0.00018345246604717656, 'epoch': 0.08}


  8%|▊         | 1400/16798 [05:23<1:04:50,  3.96it/s]

{'loss': 0.5687, 'grad_norm': 1.5078567266464233, 'learning_rate': 0.00018344055277579223, 'epoch': 0.08}


  8%|▊         | 1401/16798 [05:24<1:15:38,  3.39it/s]

{'loss': 1.7637, 'grad_norm': 1.6952067613601685, 'learning_rate': 0.00018342863950440792, 'epoch': 0.08}


  8%|▊         | 1402/16798 [05:24<1:12:57,  3.52it/s]

{'loss': 2.4992, 'grad_norm': 2.356015682220459, 'learning_rate': 0.00018341672623302358, 'epoch': 0.08}


  8%|▊         | 1403/16798 [05:24<1:08:59,  3.72it/s]

{'loss': 2.0244, 'grad_norm': 1.805336594581604, 'learning_rate': 0.00018340481296163927, 'epoch': 0.08}


  8%|▊         | 1404/16798 [05:24<1:06:26,  3.86it/s]

{'loss': 2.0037, 'grad_norm': 1.8836747407913208, 'learning_rate': 0.00018339289969025494, 'epoch': 0.08}


  8%|▊         | 1405/16798 [05:25<1:04:26,  3.98it/s]

{'loss': 1.8649, 'grad_norm': 1.7293610572814941, 'learning_rate': 0.00018338098641887063, 'epoch': 0.08}


  8%|▊         | 1406/16798 [05:25<1:06:42,  3.85it/s]

{'loss': 1.764, 'grad_norm': 1.8899861574172974, 'learning_rate': 0.0001833690731474863, 'epoch': 0.08}


  8%|▊         | 1407/16798 [05:25<1:04:02,  4.01it/s]

{'loss': 1.9817, 'grad_norm': 1.9980065822601318, 'learning_rate': 0.00018335715987610198, 'epoch': 0.08}


  8%|▊         | 1408/16798 [05:25<1:05:38,  3.91it/s]

{'loss': 2.0563, 'grad_norm': 2.0655415058135986, 'learning_rate': 0.00018334524660471764, 'epoch': 0.08}


  8%|▊         | 1410/16798 [05:26<59:54,  4.28it/s]  

{'loss': 1.8052, 'grad_norm': 1.8994085788726807, 'learning_rate': 0.00018333333333333334, 'epoch': 0.08}


  8%|▊         | 1410/16798 [05:26<59:54,  4.28it/s]

{'loss': 1.7552, 'grad_norm': 2.112741231918335, 'learning_rate': 0.000183321420061949, 'epoch': 0.08}


  8%|▊         | 1411/16798 [05:26<59:58,  4.28it/s]

{'loss': 1.5144, 'grad_norm': 1.6796773672103882, 'learning_rate': 0.0001833095067905647, 'epoch': 0.08}


  8%|▊         | 1412/16798 [05:26<1:04:52,  3.95it/s]

{'loss': 1.7364, 'grad_norm': 1.7246158123016357, 'learning_rate': 0.00018329759351918035, 'epoch': 0.08}


  8%|▊         | 1413/16798 [05:27<1:07:14,  3.81it/s]

{'loss': 1.3896, 'grad_norm': 1.6621004343032837, 'learning_rate': 0.00018328568024779604, 'epoch': 0.08}


  8%|▊         | 1414/16798 [05:27<1:07:37,  3.79it/s]

{'loss': 1.6541, 'grad_norm': 1.9003442525863647, 'learning_rate': 0.0001832737669764117, 'epoch': 0.08}


  8%|▊         | 1415/16798 [05:27<1:06:30,  3.85it/s]

{'loss': 1.7625, 'grad_norm': 1.9218322038650513, 'learning_rate': 0.00018326185370502743, 'epoch': 0.08}


  8%|▊         | 1416/16798 [05:27<1:05:33,  3.91it/s]

{'loss': 1.2428, 'grad_norm': 1.587192177772522, 'learning_rate': 0.0001832499404336431, 'epoch': 0.08}


  8%|▊         | 1417/16798 [05:28<1:09:40,  3.68it/s]

{'loss': 1.1028, 'grad_norm': 1.6911110877990723, 'learning_rate': 0.00018323802716225878, 'epoch': 0.08}


  8%|▊         | 1418/16798 [05:28<1:06:35,  3.85it/s]

{'loss': 1.2589, 'grad_norm': 1.6934224367141724, 'learning_rate': 0.00018322611389087444, 'epoch': 0.08}


  8%|▊         | 1420/16798 [05:28<1:06:46,  3.84it/s]

{'loss': 1.8116, 'grad_norm': 1.938718557357788, 'learning_rate': 0.00018321420061949013, 'epoch': 0.08}


  8%|▊         | 1420/16798 [05:28<1:06:46,  3.84it/s]

{'loss': 1.5997, 'grad_norm': 2.01503586769104, 'learning_rate': 0.0001832022873481058, 'epoch': 0.08}


  8%|▊         | 1421/16798 [05:29<1:03:52,  4.01it/s]

{'loss': 1.5062, 'grad_norm': 1.8159123659133911, 'learning_rate': 0.0001831903740767215, 'epoch': 0.08}


  8%|▊         | 1422/16798 [05:29<1:02:24,  4.11it/s]

{'loss': 2.0298, 'grad_norm': 2.1618289947509766, 'learning_rate': 0.00018317846080533715, 'epoch': 0.08}


  8%|▊         | 1423/16798 [05:29<1:03:59,  4.00it/s]

{'loss': 1.9669, 'grad_norm': 2.054081678390503, 'learning_rate': 0.00018316654753395284, 'epoch': 0.08}


  8%|▊         | 1424/16798 [05:29<1:04:15,  3.99it/s]

{'loss': 1.1404, 'grad_norm': 1.5660271644592285, 'learning_rate': 0.0001831546342625685, 'epoch': 0.08}


  8%|▊         | 1425/16798 [05:30<1:03:01,  4.06it/s]

{'loss': 1.3262, 'grad_norm': 1.5992145538330078, 'learning_rate': 0.0001831427209911842, 'epoch': 0.08}


  8%|▊         | 1426/16798 [05:30<1:04:26,  3.98it/s]

{'loss': 1.2755, 'grad_norm': 1.7792332172393799, 'learning_rate': 0.00018313080771979986, 'epoch': 0.08}


  8%|▊         | 1427/16798 [05:30<1:02:02,  4.13it/s]

{'loss': 0.9806, 'grad_norm': 1.5295249223709106, 'learning_rate': 0.00018311889444841555, 'epoch': 0.08}


  9%|▊         | 1428/16798 [05:30<1:00:50,  4.21it/s]

{'loss': 1.2053, 'grad_norm': 1.8014023303985596, 'learning_rate': 0.00018310698117703122, 'epoch': 0.09}


  9%|▊         | 1429/16798 [05:31<1:00:25,  4.24it/s]

{'loss': 1.3808, 'grad_norm': 1.7714262008666992, 'learning_rate': 0.0001830950679056469, 'epoch': 0.09}


  9%|▊         | 1430/16798 [05:31<59:17,  4.32it/s]  

{'loss': 1.5603, 'grad_norm': 1.9314384460449219, 'learning_rate': 0.00018308315463426257, 'epoch': 0.09}


  9%|▊         | 1431/16798 [05:31<1:02:28,  4.10it/s]

{'loss': 1.3339, 'grad_norm': 1.789656639099121, 'learning_rate': 0.00018307124136287826, 'epoch': 0.09}


  9%|▊         | 1432/16798 [05:31<1:09:14,  3.70it/s]

{'loss': 1.2696, 'grad_norm': 1.726586937904358, 'learning_rate': 0.00018305932809149392, 'epoch': 0.09}


  9%|▊         | 1433/16798 [05:32<1:07:07,  3.82it/s]

{'loss': 1.3948, 'grad_norm': 1.9310524463653564, 'learning_rate': 0.00018304741482010962, 'epoch': 0.09}


  9%|▊         | 1434/16798 [05:32<1:04:34,  3.97it/s]

{'loss': 1.2477, 'grad_norm': 1.902407169342041, 'learning_rate': 0.00018303550154872528, 'epoch': 0.09}


  9%|▊         | 1435/16798 [05:32<1:02:01,  4.13it/s]

{'loss': 1.5053, 'grad_norm': 1.878057837486267, 'learning_rate': 0.00018302358827734097, 'epoch': 0.09}


  9%|▊         | 1436/16798 [05:32<1:03:52,  4.01it/s]

{'loss': 1.1066, 'grad_norm': 1.7077128887176514, 'learning_rate': 0.00018301167500595663, 'epoch': 0.09}


  9%|▊         | 1437/16798 [05:33<1:07:09,  3.81it/s]

{'loss': 1.4035, 'grad_norm': 1.8079702854156494, 'learning_rate': 0.00018299976173457232, 'epoch': 0.09}


  9%|▊         | 1438/16798 [05:33<1:06:43,  3.84it/s]

{'loss': 1.2131, 'grad_norm': 1.9428787231445312, 'learning_rate': 0.000182987848463188, 'epoch': 0.09}


  9%|▊         | 1439/16798 [05:33<1:08:18,  3.75it/s]

{'loss': 1.3882, 'grad_norm': 1.9483393430709839, 'learning_rate': 0.00018297593519180368, 'epoch': 0.09}


  9%|▊         | 1440/16798 [05:33<1:06:18,  3.86it/s]

{'loss': 1.4592, 'grad_norm': 2.075883388519287, 'learning_rate': 0.00018296402192041934, 'epoch': 0.09}


  9%|▊         | 1441/16798 [05:34<1:03:35,  4.03it/s]

{'loss': 0.8964, 'grad_norm': 1.5953319072723389, 'learning_rate': 0.00018295210864903503, 'epoch': 0.09}


  9%|▊         | 1442/16798 [05:34<1:07:05,  3.81it/s]

{'loss': 1.1102, 'grad_norm': 1.788327693939209, 'learning_rate': 0.0001829401953776507, 'epoch': 0.09}


  9%|▊         | 1443/16798 [05:34<1:04:37,  3.96it/s]

{'loss': 1.0385, 'grad_norm': 1.591063380241394, 'learning_rate': 0.0001829282821062664, 'epoch': 0.09}


  9%|▊         | 1444/16798 [05:34<1:05:22,  3.91it/s]

{'loss': 0.9882, 'grad_norm': 1.7018591165542603, 'learning_rate': 0.00018291636883488205, 'epoch': 0.09}


  9%|▊         | 1445/16798 [05:35<1:04:20,  3.98it/s]

{'loss': 1.1923, 'grad_norm': 1.8668735027313232, 'learning_rate': 0.00018290445556349774, 'epoch': 0.09}


  9%|▊         | 1446/16798 [05:35<1:02:50,  4.07it/s]

{'loss': 0.4571, 'grad_norm': 1.0164316892623901, 'learning_rate': 0.00018289254229211343, 'epoch': 0.09}


  9%|▊         | 1447/16798 [05:35<1:06:41,  3.84it/s]

{'loss': 0.5469, 'grad_norm': 1.2205735445022583, 'learning_rate': 0.00018288062902072912, 'epoch': 0.09}


  9%|▊         | 1448/16798 [05:35<1:05:57,  3.88it/s]

{'loss': 0.402, 'grad_norm': 0.9541183114051819, 'learning_rate': 0.0001828687157493448, 'epoch': 0.09}


  9%|▊         | 1449/16798 [05:36<1:05:12,  3.92it/s]

{'loss': 0.2708, 'grad_norm': 0.812231183052063, 'learning_rate': 0.00018285680247796048, 'epoch': 0.09}


  9%|▊         | 1450/16798 [05:36<1:09:26,  3.68it/s]

{'loss': 0.2552, 'grad_norm': 0.8680610656738281, 'learning_rate': 0.00018284488920657614, 'epoch': 0.09}


  9%|▊         | 1451/16798 [05:36<1:07:41,  3.78it/s]

{'loss': 1.7895, 'grad_norm': 1.693255066871643, 'learning_rate': 0.00018283297593519183, 'epoch': 0.09}


  9%|▊         | 1452/16798 [05:37<1:06:15,  3.86it/s]

{'loss': 2.5467, 'grad_norm': 2.345236301422119, 'learning_rate': 0.0001828210626638075, 'epoch': 0.09}


  9%|▊         | 1453/16798 [05:37<1:06:22,  3.85it/s]

{'loss': 2.2088, 'grad_norm': 1.9809027910232544, 'learning_rate': 0.0001828091493924232, 'epoch': 0.09}


  9%|▊         | 1454/16798 [05:37<1:06:44,  3.83it/s]

{'loss': 2.1125, 'grad_norm': 1.8540135622024536, 'learning_rate': 0.00018279723612103885, 'epoch': 0.09}


  9%|▊         | 1455/16798 [05:37<1:10:45,  3.61it/s]

{'loss': 2.2089, 'grad_norm': 2.101304531097412, 'learning_rate': 0.00018278532284965454, 'epoch': 0.09}


  9%|▊         | 1456/16798 [05:38<1:07:40,  3.78it/s]

{'loss': 1.8394, 'grad_norm': 2.6320455074310303, 'learning_rate': 0.0001827734095782702, 'epoch': 0.09}


  9%|▊         | 1457/16798 [05:38<1:08:24,  3.74it/s]

{'loss': 2.0004, 'grad_norm': 2.1459455490112305, 'learning_rate': 0.00018276149630688587, 'epoch': 0.09}


  9%|▊         | 1458/16798 [05:38<1:06:21,  3.85it/s]

{'loss': 2.071, 'grad_norm': 1.9310811758041382, 'learning_rate': 0.00018274958303550156, 'epoch': 0.09}


  9%|▊         | 1459/16798 [05:38<1:05:04,  3.93it/s]

{'loss': 2.3932, 'grad_norm': 2.2975246906280518, 'learning_rate': 0.00018273766976411722, 'epoch': 0.09}


  9%|▊         | 1460/16798 [05:39<1:08:45,  3.72it/s]

{'loss': 1.9829, 'grad_norm': 1.954158067703247, 'learning_rate': 0.00018272575649273291, 'epoch': 0.09}


  9%|▊         | 1461/16798 [05:39<1:09:43,  3.67it/s]

{'loss': 1.6073, 'grad_norm': 1.951425552368164, 'learning_rate': 0.00018271384322134858, 'epoch': 0.09}


  9%|▊         | 1462/16798 [05:39<1:06:56,  3.82it/s]

{'loss': 1.386, 'grad_norm': 2.0131988525390625, 'learning_rate': 0.00018270192994996427, 'epoch': 0.09}


  9%|▊         | 1463/16798 [05:39<1:03:15,  4.04it/s]

{'loss': 1.9515, 'grad_norm': 2.259387969970703, 'learning_rate': 0.00018269001667857993, 'epoch': 0.09}


  9%|▊         | 1464/16798 [05:40<1:03:39,  4.01it/s]

{'loss': 1.8243, 'grad_norm': 1.8655972480773926, 'learning_rate': 0.00018267810340719562, 'epoch': 0.09}


  9%|▊         | 1465/16798 [05:40<1:05:52,  3.88it/s]

{'loss': 1.574, 'grad_norm': 1.8140594959259033, 'learning_rate': 0.00018266619013581129, 'epoch': 0.09}


  9%|▊         | 1466/16798 [05:40<1:05:16,  3.92it/s]

{'loss': 1.4346, 'grad_norm': 1.829750418663025, 'learning_rate': 0.00018265427686442698, 'epoch': 0.09}


  9%|▊         | 1467/16798 [05:40<1:02:21,  4.10it/s]

{'loss': 1.6624, 'grad_norm': 2.2344553470611572, 'learning_rate': 0.00018264236359304264, 'epoch': 0.09}


  9%|▊         | 1468/16798 [05:41<1:01:48,  4.13it/s]

{'loss': 1.8041, 'grad_norm': 2.0068423748016357, 'learning_rate': 0.00018263045032165833, 'epoch': 0.09}


  9%|▊         | 1469/16798 [05:41<1:01:54,  4.13it/s]

{'loss': 1.6468, 'grad_norm': 1.90073561668396, 'learning_rate': 0.000182618537050274, 'epoch': 0.09}


  9%|▉         | 1470/16798 [05:41<1:10:24,  3.63it/s]

{'loss': 1.6637, 'grad_norm': 1.8973466157913208, 'learning_rate': 0.00018260662377888969, 'epoch': 0.09}


  9%|▉         | 1471/16798 [05:41<1:09:12,  3.69it/s]

{'loss': 0.9883, 'grad_norm': 1.6153301000595093, 'learning_rate': 0.00018259471050750535, 'epoch': 0.09}


  9%|▉         | 1472/16798 [05:42<1:06:57,  3.81it/s]

{'loss': 1.5326, 'grad_norm': 1.6412566900253296, 'learning_rate': 0.00018258279723612104, 'epoch': 0.09}


  9%|▉         | 1473/16798 [05:42<1:04:52,  3.94it/s]

{'loss': 1.5902, 'grad_norm': 2.1046619415283203, 'learning_rate': 0.0001825708839647367, 'epoch': 0.09}


  9%|▉         | 1474/16798 [05:42<1:02:19,  4.10it/s]

{'loss': 1.4798, 'grad_norm': 1.78419828414917, 'learning_rate': 0.0001825589706933524, 'epoch': 0.09}


  9%|▉         | 1475/16798 [05:42<1:02:53,  4.06it/s]

{'loss': 1.5604, 'grad_norm': 1.8673192262649536, 'learning_rate': 0.00018254705742196806, 'epoch': 0.09}


  9%|▉         | 1476/16798 [05:43<1:05:38,  3.89it/s]

{'loss': 1.8651, 'grad_norm': 2.0074336528778076, 'learning_rate': 0.00018253514415058378, 'epoch': 0.09}


  9%|▉         | 1477/16798 [05:43<1:02:57,  4.06it/s]

{'loss': 1.1175, 'grad_norm': 1.5333406925201416, 'learning_rate': 0.00018252323087919944, 'epoch': 0.09}


  9%|▉         | 1478/16798 [05:43<1:03:58,  3.99it/s]

{'loss': 1.6074, 'grad_norm': 1.6895614862442017, 'learning_rate': 0.00018251131760781513, 'epoch': 0.09}


  9%|▉         | 1479/16798 [05:43<1:05:51,  3.88it/s]

{'loss': 1.6699, 'grad_norm': 2.0479674339294434, 'learning_rate': 0.0001824994043364308, 'epoch': 0.09}


  9%|▉         | 1480/16798 [05:44<1:04:03,  3.99it/s]

{'loss': 1.356, 'grad_norm': 1.7069727182388306, 'learning_rate': 0.00018248749106504649, 'epoch': 0.09}


  9%|▉         | 1481/16798 [05:44<1:04:53,  3.93it/s]

{'loss': 1.3577, 'grad_norm': 1.8938136100769043, 'learning_rate': 0.00018247557779366215, 'epoch': 0.09}


  9%|▉         | 1482/16798 [05:44<1:06:30,  3.84it/s]

{'loss': 1.4369, 'grad_norm': 2.065664768218994, 'learning_rate': 0.00018246366452227784, 'epoch': 0.09}


  9%|▉         | 1483/16798 [05:44<1:05:18,  3.91it/s]

{'loss': 2.0181, 'grad_norm': 2.2880373001098633, 'learning_rate': 0.0001824517512508935, 'epoch': 0.09}


  9%|▉         | 1484/16798 [05:45<1:05:49,  3.88it/s]

{'loss': 1.7526, 'grad_norm': 2.4802987575531006, 'learning_rate': 0.0001824398379795092, 'epoch': 0.09}


  9%|▉         | 1485/16798 [05:45<1:01:51,  4.13it/s]

{'loss': 1.8416, 'grad_norm': 2.4628212451934814, 'learning_rate': 0.00018242792470812486, 'epoch': 0.09}


  9%|▉         | 1486/16798 [05:45<1:04:30,  3.96it/s]

{'loss': 1.1164, 'grad_norm': 1.6233800649642944, 'learning_rate': 0.00018241601143674055, 'epoch': 0.09}


  9%|▉         | 1487/16798 [05:45<1:06:07,  3.86it/s]

{'loss': 1.8719, 'grad_norm': 2.1716413497924805, 'learning_rate': 0.0001824040981653562, 'epoch': 0.09}


  9%|▉         | 1488/16798 [05:46<1:05:11,  3.91it/s]

{'loss': 1.352, 'grad_norm': 1.967883586883545, 'learning_rate': 0.0001823921848939719, 'epoch': 0.09}


  9%|▉         | 1489/16798 [05:46<1:07:09,  3.80it/s]

{'loss': 1.1304, 'grad_norm': 1.4631688594818115, 'learning_rate': 0.00018238027162258757, 'epoch': 0.09}


  9%|▉         | 1490/16798 [05:46<1:08:33,  3.72it/s]

{'loss': 1.3673, 'grad_norm': 1.721044898033142, 'learning_rate': 0.00018236835835120326, 'epoch': 0.09}


  9%|▉         | 1491/16798 [05:47<1:09:03,  3.69it/s]

{'loss': 1.4569, 'grad_norm': 2.301368236541748, 'learning_rate': 0.00018235644507981892, 'epoch': 0.09}


  9%|▉         | 1492/16798 [05:47<1:06:22,  3.84it/s]

{'loss': 1.2687, 'grad_norm': 1.8440659046173096, 'learning_rate': 0.0001823445318084346, 'epoch': 0.09}


  9%|▉         | 1493/16798 [05:47<1:07:27,  3.78it/s]

{'loss': 1.3294, 'grad_norm': 2.74471116065979, 'learning_rate': 0.00018233261853705028, 'epoch': 0.09}


  9%|▉         | 1494/16798 [05:47<1:04:39,  3.94it/s]

{'loss': 1.1523, 'grad_norm': 1.5885714292526245, 'learning_rate': 0.00018232070526566597, 'epoch': 0.09}


  9%|▉         | 1495/16798 [05:48<1:07:03,  3.80it/s]

{'loss': 1.2293, 'grad_norm': 3.1387553215026855, 'learning_rate': 0.00018230879199428163, 'epoch': 0.09}


  9%|▉         | 1496/16798 [05:48<1:04:55,  3.93it/s]

{'loss': 0.7559, 'grad_norm': 1.3093931674957275, 'learning_rate': 0.00018229687872289732, 'epoch': 0.09}


  9%|▉         | 1497/16798 [05:48<1:04:37,  3.95it/s]

{'loss': 0.7339, 'grad_norm': 1.4168343544006348, 'learning_rate': 0.00018228496545151298, 'epoch': 0.09}


  9%|▉         | 1498/16798 [05:48<1:05:00,  3.92it/s]

{'loss': 0.5847, 'grad_norm': 1.1903208494186401, 'learning_rate': 0.00018227305218012867, 'epoch': 0.09}


  9%|▉         | 1499/16798 [05:49<1:03:31,  4.01it/s]

{'loss': 0.2053, 'grad_norm': 0.8334194421768188, 'learning_rate': 0.00018226113890874434, 'epoch': 0.09}




{'loss': 0.6688, 'grad_norm': 1.2899075746536255, 'learning_rate': 0.00018224922563736003, 'epoch': 0.09}


  9%|▉         | 1501/16798 [05:52<4:10:38,  1.02it/s]

{'loss': 2.2398, 'grad_norm': 1.805168628692627, 'learning_rate': 0.0001822373123659757, 'epoch': 0.09}


  9%|▉         | 1502/16798 [05:52<3:15:17,  1.31it/s]

{'loss': 2.1148, 'grad_norm': 1.6261271238327026, 'learning_rate': 0.00018222539909459138, 'epoch': 0.09}


  9%|▉         | 1503/16798 [05:52<2:37:05,  1.62it/s]

{'loss': 2.1658, 'grad_norm': 1.827405571937561, 'learning_rate': 0.00018221348582320705, 'epoch': 0.09}


  9%|▉         | 1504/16798 [05:52<2:09:12,  1.97it/s]

{'loss': 2.0076, 'grad_norm': 1.7442519664764404, 'learning_rate': 0.00018220157255182274, 'epoch': 0.09}


  9%|▉         | 1505/16798 [05:53<1:52:33,  2.26it/s]

{'loss': 2.1686, 'grad_norm': 1.8770332336425781, 'learning_rate': 0.0001821896592804384, 'epoch': 0.09}


  9%|▉         | 1506/16798 [05:53<1:37:55,  2.60it/s]

{'loss': 1.8857, 'grad_norm': 1.6340875625610352, 'learning_rate': 0.0001821777460090541, 'epoch': 0.09}


  9%|▉         | 1507/16798 [05:53<1:28:04,  2.89it/s]

{'loss': 2.0656, 'grad_norm': 1.8296544551849365, 'learning_rate': 0.00018216583273766978, 'epoch': 0.09}


  9%|▉         | 1508/16798 [05:53<1:21:21,  3.13it/s]

{'loss': 1.9132, 'grad_norm': 1.8645585775375366, 'learning_rate': 0.00018215391946628547, 'epoch': 0.09}


  9%|▉         | 1509/16798 [05:54<1:23:19,  3.06it/s]

{'loss': 1.9414, 'grad_norm': 2.0382182598114014, 'learning_rate': 0.00018214200619490114, 'epoch': 0.09}


  9%|▉         | 1510/16798 [05:54<1:17:12,  3.30it/s]

{'loss': 2.0097, 'grad_norm': 1.8968441486358643, 'learning_rate': 0.00018213009292351683, 'epoch': 0.09}


  9%|▉         | 1511/16798 [05:54<1:14:13,  3.43it/s]

{'loss': 1.3234, 'grad_norm': 1.4812631607055664, 'learning_rate': 0.0001821181796521325, 'epoch': 0.09}


  9%|▉         | 1512/16798 [05:54<1:09:21,  3.67it/s]

{'loss': 1.4969, 'grad_norm': 1.7645516395568848, 'learning_rate': 0.00018210626638074818, 'epoch': 0.09}


  9%|▉         | 1514/16798 [05:55<1:05:02,  3.92it/s]

{'loss': 1.8637, 'grad_norm': 1.9546726942062378, 'learning_rate': 0.00018209435310936385, 'epoch': 0.09}


  9%|▉         | 1514/16798 [05:55<1:05:02,  3.92it/s]

{'loss': 1.736, 'grad_norm': 1.838334321975708, 'learning_rate': 0.0001820824398379795, 'epoch': 0.09}


  9%|▉         | 1515/16798 [05:55<1:03:50,  3.99it/s]

{'loss': 1.8529, 'grad_norm': 1.9666613340377808, 'learning_rate': 0.0001820705265665952, 'epoch': 0.09}


  9%|▉         | 1516/16798 [05:55<1:02:54,  4.05it/s]

{'loss': 1.6771, 'grad_norm': 2.1146621704101562, 'learning_rate': 0.00018205861329521086, 'epoch': 0.09}


  9%|▉         | 1517/16798 [05:56<1:01:57,  4.11it/s]

{'loss': 1.8203, 'grad_norm': 2.126612663269043, 'learning_rate': 0.00018204670002382656, 'epoch': 0.09}


  9%|▉         | 1519/16798 [05:56<1:00:20,  4.22it/s]

{'loss': 1.4897, 'grad_norm': 1.6809661388397217, 'learning_rate': 0.00018203478675244222, 'epoch': 0.09}


  9%|▉         | 1519/16798 [05:56<1:00:20,  4.22it/s]

{'loss': 1.6565, 'grad_norm': 1.87069833278656, 'learning_rate': 0.0001820228734810579, 'epoch': 0.09}


  9%|▉         | 1520/16798 [05:56<59:11,  4.30it/s]  

{'loss': 1.6451, 'grad_norm': 1.6376092433929443, 'learning_rate': 0.00018201096020967357, 'epoch': 0.09}


  9%|▉         | 1521/16798 [05:57<58:26,  4.36it/s]

{'loss': 1.5409, 'grad_norm': 1.8284904956817627, 'learning_rate': 0.00018199904693828926, 'epoch': 0.09}


  9%|▉         | 1522/16798 [05:57<59:30,  4.28it/s]

{'loss': 1.9299, 'grad_norm': 2.0862631797790527, 'learning_rate': 0.00018198713366690493, 'epoch': 0.09}


  9%|▉         | 1523/16798 [05:57<58:42,  4.34it/s]

{'loss': 1.8766, 'grad_norm': 2.0024068355560303, 'learning_rate': 0.00018197522039552062, 'epoch': 0.09}


  9%|▉         | 1524/16798 [05:57<1:07:28,  3.77it/s]

{'loss': 1.5908, 'grad_norm': 2.079134702682495, 'learning_rate': 0.00018196330712413628, 'epoch': 0.09}


  9%|▉         | 1525/16798 [05:58<1:09:30,  3.66it/s]

{'loss': 1.1457, 'grad_norm': 1.4353803396224976, 'learning_rate': 0.00018195139385275197, 'epoch': 0.09}


  9%|▉         | 1526/16798 [05:58<1:10:21,  3.62it/s]

{'loss': 1.3206, 'grad_norm': 1.579485297203064, 'learning_rate': 0.00018193948058136764, 'epoch': 0.09}


  9%|▉         | 1527/16798 [05:58<1:12:01,  3.53it/s]

{'loss': 1.2589, 'grad_norm': 1.5508779287338257, 'learning_rate': 0.00018192756730998333, 'epoch': 0.09}


  9%|▉         | 1528/16798 [05:59<1:10:53,  3.59it/s]

{'loss': 1.3623, 'grad_norm': 1.632023811340332, 'learning_rate': 0.000181915654038599, 'epoch': 0.09}


  9%|▉         | 1529/16798 [05:59<1:07:18,  3.78it/s]

{'loss': 1.4312, 'grad_norm': 1.8770502805709839, 'learning_rate': 0.00018190374076721468, 'epoch': 0.09}


  9%|▉         | 1530/16798 [05:59<1:04:09,  3.97it/s]

{'loss': 1.5295, 'grad_norm': 1.914509892463684, 'learning_rate': 0.00018189182749583035, 'epoch': 0.09}


  9%|▉         | 1531/16798 [05:59<1:03:48,  3.99it/s]

{'loss': 1.1716, 'grad_norm': 1.7285935878753662, 'learning_rate': 0.00018187991422444604, 'epoch': 0.09}


  9%|▉         | 1532/16798 [05:59<1:03:06,  4.03it/s]

{'loss': 1.5574, 'grad_norm': 2.0674333572387695, 'learning_rate': 0.0001818680009530617, 'epoch': 0.09}


  9%|▉         | 1533/16798 [06:00<1:07:51,  3.75it/s]

{'loss': 1.2971, 'grad_norm': 1.724289059638977, 'learning_rate': 0.0001818560876816774, 'epoch': 0.09}


  9%|▉         | 1534/16798 [06:00<1:04:57,  3.92it/s]

{'loss': 1.015, 'grad_norm': 1.6264090538024902, 'learning_rate': 0.00018184417441029305, 'epoch': 0.09}


  9%|▉         | 1535/16798 [06:00<1:02:55,  4.04it/s]

{'loss': 1.4172, 'grad_norm': 2.0825750827789307, 'learning_rate': 0.00018183226113890875, 'epoch': 0.09}


  9%|▉         | 1536/16798 [06:00<1:00:54,  4.18it/s]

{'loss': 1.1748, 'grad_norm': 1.7486830949783325, 'learning_rate': 0.0001818203478675244, 'epoch': 0.09}


  9%|▉         | 1537/16798 [06:01<1:05:27,  3.89it/s]

{'loss': 1.3303, 'grad_norm': 1.7244279384613037, 'learning_rate': 0.0001818084345961401, 'epoch': 0.09}


  9%|▉         | 1538/16798 [06:01<1:03:37,  4.00it/s]

{'loss': 1.5851, 'grad_norm': 2.0574791431427, 'learning_rate': 0.0001817965213247558, 'epoch': 0.09}


  9%|▉         | 1539/16798 [06:01<1:01:16,  4.15it/s]

{'loss': 1.3494, 'grad_norm': 1.6425282955169678, 'learning_rate': 0.00018178460805337148, 'epoch': 0.09}


  9%|▉         | 1541/16798 [06:02<59:44,  4.26it/s]  

{'loss': 1.1495, 'grad_norm': 1.372380256652832, 'learning_rate': 0.00018177269478198714, 'epoch': 0.09}


  9%|▉         | 1541/16798 [06:02<59:44,  4.26it/s]

{'loss': 1.22, 'grad_norm': 1.8557132482528687, 'learning_rate': 0.00018176078151060284, 'epoch': 0.09}


  9%|▉         | 1543/16798 [06:02<59:44,  4.26it/s]  

{'loss': 1.5056, 'grad_norm': 2.0753867626190186, 'learning_rate': 0.0001817488682392185, 'epoch': 0.09}


  9%|▉         | 1544/16798 [06:02<56:48,  4.48it/s]

{'loss': 1.3631, 'grad_norm': 1.9825843572616577, 'learning_rate': 0.0001817369549678342, 'epoch': 0.09}


  9%|▉         | 1544/16798 [06:02<56:48,  4.48it/s]

{'loss': 1.2084, 'grad_norm': 2.8662140369415283, 'learning_rate': 0.00018172504169644985, 'epoch': 0.09}


  9%|▉         | 1545/16798 [06:03<55:33,  4.58it/s]

{'loss': 1.1509, 'grad_norm': 1.9607584476470947, 'learning_rate': 0.00018171312842506554, 'epoch': 0.09}


  9%|▉         | 1546/16798 [06:03<57:03,  4.46it/s]

{'loss': 0.9137, 'grad_norm': 1.6462647914886475, 'learning_rate': 0.0001817012151536812, 'epoch': 0.09}


  9%|▉         | 1547/16798 [06:03<1:01:55,  4.10it/s]

{'loss': 0.321, 'grad_norm': 1.0011909008026123, 'learning_rate': 0.0001816893018822969, 'epoch': 0.09}


  9%|▉         | 1548/16798 [06:03<1:00:24,  4.21it/s]

{'loss': 0.3895, 'grad_norm': 1.33267343044281, 'learning_rate': 0.00018167738861091256, 'epoch': 0.09}


  9%|▉         | 1549/16798 [06:04<1:01:40,  4.12it/s]

{'loss': 0.2331, 'grad_norm': 0.7347395420074463, 'learning_rate': 0.00018166547533952825, 'epoch': 0.09}


  9%|▉         | 1551/16798 [06:04<1:01:10,  4.15it/s]

{'loss': 0.2269, 'grad_norm': 0.8156595230102539, 'learning_rate': 0.00018165356206814392, 'epoch': 0.09}


  9%|▉         | 1551/16798 [06:04<1:01:10,  4.15it/s]

{'loss': 2.0852, 'grad_norm': 1.7010990381240845, 'learning_rate': 0.0001816416487967596, 'epoch': 0.09}


  9%|▉         | 1552/16798 [06:04<1:04:29,  3.94it/s]

{'loss': 2.0418, 'grad_norm': 1.8790303468704224, 'learning_rate': 0.00018162973552537527, 'epoch': 0.09}


  9%|▉         | 1553/16798 [06:05<1:04:53,  3.92it/s]

{'loss': 2.3253, 'grad_norm': 1.9248846769332886, 'learning_rate': 0.00018161782225399096, 'epoch': 0.09}


  9%|▉         | 1554/16798 [06:05<1:02:51,  4.04it/s]

{'loss': 1.8408, 'grad_norm': 2.7102813720703125, 'learning_rate': 0.00018160590898260663, 'epoch': 0.09}


  9%|▉         | 1555/16798 [06:05<1:01:37,  4.12it/s]

{'loss': 1.9202, 'grad_norm': 1.8003277778625488, 'learning_rate': 0.00018159399571122232, 'epoch': 0.09}


  9%|▉         | 1556/16798 [06:05<1:04:34,  3.93it/s]

{'loss': 2.3339, 'grad_norm': 1.9657847881317139, 'learning_rate': 0.00018158208243983798, 'epoch': 0.09}


  9%|▉         | 1557/16798 [06:06<1:04:19,  3.95it/s]

{'loss': 1.8423, 'grad_norm': 1.9719469547271729, 'learning_rate': 0.00018157016916845367, 'epoch': 0.09}


  9%|▉         | 1559/16798 [06:06<1:00:54,  4.17it/s]

{'loss': 2.1215, 'grad_norm': 1.9404352903366089, 'learning_rate': 0.00018155825589706933, 'epoch': 0.09}


  9%|▉         | 1559/16798 [06:06<1:00:54,  4.17it/s]

{'loss': 2.0729, 'grad_norm': 1.9206039905548096, 'learning_rate': 0.00018154634262568503, 'epoch': 0.09}


  9%|▉         | 1560/16798 [06:06<1:00:35,  4.19it/s]

{'loss': 1.8583, 'grad_norm': 2.0336451530456543, 'learning_rate': 0.0001815344293543007, 'epoch': 0.09}


  9%|▉         | 1561/16798 [06:07<1:03:40,  3.99it/s]

{'loss': 1.903, 'grad_norm': 1.9882707595825195, 'learning_rate': 0.00018152251608291638, 'epoch': 0.09}


  9%|▉         | 1562/16798 [06:07<1:03:05,  4.02it/s]

{'loss': 1.2597, 'grad_norm': 1.5106812715530396, 'learning_rate': 0.00018151060281153204, 'epoch': 0.09}


  9%|▉         | 1563/16798 [06:07<1:04:21,  3.95it/s]

{'loss': 1.9961, 'grad_norm': 2.406893491744995, 'learning_rate': 0.00018149868954014773, 'epoch': 0.09}


  9%|▉         | 1564/16798 [06:07<1:03:15,  4.01it/s]

{'loss': 1.3029, 'grad_norm': 1.708350658416748, 'learning_rate': 0.0001814867762687634, 'epoch': 0.09}


  9%|▉         | 1565/16798 [06:08<1:04:19,  3.95it/s]

{'loss': 2.0793, 'grad_norm': 2.6463589668273926, 'learning_rate': 0.0001814748629973791, 'epoch': 0.09}


  9%|▉         | 1566/16798 [06:08<1:04:15,  3.95it/s]

{'loss': 1.4429, 'grad_norm': 1.8020336627960205, 'learning_rate': 0.00018146294972599475, 'epoch': 0.09}


  9%|▉         | 1567/16798 [06:08<1:04:20,  3.95it/s]

{'loss': 1.6734, 'grad_norm': 1.7197681665420532, 'learning_rate': 0.00018145103645461044, 'epoch': 0.09}


  9%|▉         | 1568/16798 [06:08<1:02:22,  4.07it/s]

{'loss': 1.5836, 'grad_norm': 1.9498505592346191, 'learning_rate': 0.0001814391231832261, 'epoch': 0.09}


  9%|▉         | 1569/16798 [06:09<1:05:51,  3.85it/s]

{'loss': 1.6894, 'grad_norm': 1.82356595993042, 'learning_rate': 0.00018142720991184182, 'epoch': 0.09}


  9%|▉         | 1570/16798 [06:09<1:05:39,  3.87it/s]

{'loss': 1.4027, 'grad_norm': 1.6557466983795166, 'learning_rate': 0.0001814152966404575, 'epoch': 0.09}


  9%|▉         | 1571/16798 [06:09<1:02:57,  4.03it/s]

{'loss': 1.3215, 'grad_norm': 2.215660810470581, 'learning_rate': 0.00018140338336907315, 'epoch': 0.09}


  9%|▉         | 1572/16798 [06:09<1:07:53,  3.74it/s]

{'loss': 1.6625, 'grad_norm': 1.831056833267212, 'learning_rate': 0.00018139147009768884, 'epoch': 0.09}


  9%|▉         | 1573/16798 [06:10<1:08:41,  3.69it/s]

{'loss': 1.6348, 'grad_norm': 1.7780154943466187, 'learning_rate': 0.0001813795568263045, 'epoch': 0.09}


  9%|▉         | 1574/16798 [06:10<1:04:44,  3.92it/s]

{'loss': 1.7951, 'grad_norm': 1.7934863567352295, 'learning_rate': 0.0001813676435549202, 'epoch': 0.09}


  9%|▉         | 1575/16798 [06:10<1:02:46,  4.04it/s]

{'loss': 1.4218, 'grad_norm': 1.6547225713729858, 'learning_rate': 0.00018135573028353586, 'epoch': 0.09}


  9%|▉         | 1576/16798 [06:10<1:04:11,  3.95it/s]

{'loss': 1.6226, 'grad_norm': 1.9629263877868652, 'learning_rate': 0.00018134381701215155, 'epoch': 0.09}


  9%|▉         | 1577/16798 [06:11<1:05:51,  3.85it/s]

{'loss': 0.9579, 'grad_norm': 1.466680645942688, 'learning_rate': 0.00018133190374076722, 'epoch': 0.09}


  9%|▉         | 1578/16798 [06:11<1:03:28,  4.00it/s]

{'loss': 1.6736, 'grad_norm': 1.8600821495056152, 'learning_rate': 0.0001813199904693829, 'epoch': 0.09}


  9%|▉         | 1579/16798 [06:11<1:03:09,  4.02it/s]

{'loss': 1.8496, 'grad_norm': 2.1708858013153076, 'learning_rate': 0.00018130807719799857, 'epoch': 0.09}


  9%|▉         | 1580/16798 [06:11<1:04:56,  3.91it/s]

{'loss': 1.8849, 'grad_norm': 1.9265179634094238, 'learning_rate': 0.00018129616392661426, 'epoch': 0.09}


  9%|▉         | 1581/16798 [06:12<1:03:48,  3.97it/s]

{'loss': 1.4595, 'grad_norm': 1.8564096689224243, 'learning_rate': 0.00018128425065522992, 'epoch': 0.09}


  9%|▉         | 1582/16798 [06:12<1:03:53,  3.97it/s]

{'loss': 1.5913, 'grad_norm': 1.9503233432769775, 'learning_rate': 0.00018127233738384561, 'epoch': 0.09}


  9%|▉         | 1584/16798 [06:12<1:02:00,  4.09it/s]

{'loss': 1.6623, 'grad_norm': 2.0219974517822266, 'learning_rate': 0.00018126042411246128, 'epoch': 0.09}


  9%|▉         | 1584/16798 [06:12<1:02:00,  4.09it/s]

{'loss': 1.2771, 'grad_norm': 1.748673439025879, 'learning_rate': 0.00018124851084107697, 'epoch': 0.09}


  9%|▉         | 1585/16798 [06:13<1:02:14,  4.07it/s]

{'loss': 1.9895, 'grad_norm': 2.1408963203430176, 'learning_rate': 0.00018123659756969263, 'epoch': 0.09}


  9%|▉         | 1586/16798 [06:13<1:00:12,  4.21it/s]

{'loss': 1.4451, 'grad_norm': 1.71332585811615, 'learning_rate': 0.00018122468429830832, 'epoch': 0.09}


  9%|▉         | 1587/16798 [06:13<1:04:25,  3.94it/s]

{'loss': 1.5197, 'grad_norm': 2.1428751945495605, 'learning_rate': 0.000181212771026924, 'epoch': 0.09}


  9%|▉         | 1588/16798 [06:13<1:02:38,  4.05it/s]

{'loss': 1.2291, 'grad_norm': 1.6121983528137207, 'learning_rate': 0.00018120085775553968, 'epoch': 0.09}


  9%|▉         | 1589/16798 [06:14<1:02:31,  4.05it/s]

{'loss': 1.4677, 'grad_norm': 3.015221118927002, 'learning_rate': 0.00018118894448415534, 'epoch': 0.09}


  9%|▉         | 1590/16798 [06:14<1:04:34,  3.92it/s]

{'loss': 0.9255, 'grad_norm': 1.4479241371154785, 'learning_rate': 0.00018117703121277103, 'epoch': 0.09}


  9%|▉         | 1591/16798 [06:14<1:01:25,  4.13it/s]

{'loss': 1.3593, 'grad_norm': 1.8222376108169556, 'learning_rate': 0.0001811651179413867, 'epoch': 0.09}


  9%|▉         | 1592/16798 [06:14<1:02:37,  4.05it/s]

{'loss': 1.7813, 'grad_norm': 1.882839560508728, 'learning_rate': 0.0001811532046700024, 'epoch': 0.09}


  9%|▉         | 1593/16798 [06:15<1:02:36,  4.05it/s]

{'loss': 1.0128, 'grad_norm': 2.7153804302215576, 'learning_rate': 0.00018114129139861805, 'epoch': 0.09}


  9%|▉         | 1594/16798 [06:15<1:06:37,  3.80it/s]

{'loss': 1.4584, 'grad_norm': 1.9401025772094727, 'learning_rate': 0.00018112937812723374, 'epoch': 0.09}


  9%|▉         | 1595/16798 [06:15<1:05:29,  3.87it/s]

{'loss': 1.235, 'grad_norm': 1.7400065660476685, 'learning_rate': 0.0001811174648558494, 'epoch': 0.09}


 10%|▉         | 1596/16798 [06:15<1:03:38,  3.98it/s]

{'loss': 1.2216, 'grad_norm': 1.5660964250564575, 'learning_rate': 0.0001811055515844651, 'epoch': 0.1}


 10%|▉         | 1597/16798 [06:16<1:05:55,  3.84it/s]

{'loss': 0.6561, 'grad_norm': 1.2445244789123535, 'learning_rate': 0.00018109363831308076, 'epoch': 0.1}


 10%|▉         | 1598/16798 [06:16<1:07:31,  3.75it/s]

{'loss': 0.3956, 'grad_norm': 0.993425190448761, 'learning_rate': 0.00018108172504169645, 'epoch': 0.1}


 10%|▉         | 1599/16798 [06:16<1:06:19,  3.82it/s]

{'loss': 0.3361, 'grad_norm': 0.8319274187088013, 'learning_rate': 0.00018106981177031211, 'epoch': 0.1}


 10%|▉         | 1600/16798 [06:16<1:02:47,  4.03it/s]

{'loss': 0.9017, 'grad_norm': 1.5833262205123901, 'learning_rate': 0.00018105789849892783, 'epoch': 0.1}


 10%|▉         | 1601/16798 [06:17<1:03:24,  3.99it/s]

{'loss': 1.8934, 'grad_norm': 1.6700890064239502, 'learning_rate': 0.0001810459852275435, 'epoch': 0.1}


 10%|▉         | 1602/16798 [06:17<1:04:09,  3.95it/s]

{'loss': 2.4444, 'grad_norm': 2.2605221271514893, 'learning_rate': 0.00018103407195615919, 'epoch': 0.1}


 10%|▉         | 1603/16798 [06:17<1:06:48,  3.79it/s]

{'loss': 2.0198, 'grad_norm': 1.7863779067993164, 'learning_rate': 0.00018102215868477485, 'epoch': 0.1}


 10%|▉         | 1604/16798 [06:17<1:04:38,  3.92it/s]

{'loss': 1.8298, 'grad_norm': 1.7480326890945435, 'learning_rate': 0.00018101024541339054, 'epoch': 0.1}


 10%|▉         | 1605/16798 [06:18<1:05:33,  3.86it/s]

{'loss': 1.7836, 'grad_norm': 1.7362217903137207, 'learning_rate': 0.0001809983321420062, 'epoch': 0.1}


 10%|▉         | 1606/16798 [06:18<1:07:20,  3.76it/s]

{'loss': 1.8525, 'grad_norm': 1.9127142429351807, 'learning_rate': 0.0001809864188706219, 'epoch': 0.1}


 10%|▉         | 1607/16798 [06:18<1:07:28,  3.75it/s]

{'loss': 1.5499, 'grad_norm': 1.7929130792617798, 'learning_rate': 0.00018097450559923756, 'epoch': 0.1}


 10%|▉         | 1608/16798 [06:19<1:06:34,  3.80it/s]

{'loss': 1.49, 'grad_norm': 1.9549487829208374, 'learning_rate': 0.00018096259232785325, 'epoch': 0.1}


 10%|▉         | 1609/16798 [06:19<1:04:39,  3.92it/s]

{'loss': 1.8991, 'grad_norm': 1.8344119787216187, 'learning_rate': 0.0001809506790564689, 'epoch': 0.1}


 10%|▉         | 1610/16798 [06:19<1:11:21,  3.55it/s]

{'loss': 1.5302, 'grad_norm': 1.7850714921951294, 'learning_rate': 0.0001809387657850846, 'epoch': 0.1}


 10%|▉         | 1611/16798 [06:19<1:07:28,  3.75it/s]

{'loss': 1.4815, 'grad_norm': 1.7916090488433838, 'learning_rate': 0.00018092685251370027, 'epoch': 0.1}


 10%|▉         | 1612/16798 [06:20<1:03:14,  4.00it/s]

{'loss': 1.9891, 'grad_norm': 2.1072804927825928, 'learning_rate': 0.00018091493924231596, 'epoch': 0.1}


 10%|▉         | 1613/16798 [06:20<1:01:08,  4.14it/s]

{'loss': 1.5938, 'grad_norm': 1.7584187984466553, 'learning_rate': 0.00018090302597093162, 'epoch': 0.1}


 10%|▉         | 1614/16798 [06:20<1:00:59,  4.15it/s]

{'loss': 1.3227, 'grad_norm': 1.7395689487457275, 'learning_rate': 0.0001808911126995473, 'epoch': 0.1}


 10%|▉         | 1615/16798 [06:20<1:04:23,  3.93it/s]

{'loss': 1.5863, 'grad_norm': 1.9451813697814941, 'learning_rate': 0.00018087919942816298, 'epoch': 0.1}


 10%|▉         | 1616/16798 [06:21<1:04:53,  3.90it/s]

{'loss': 1.729, 'grad_norm': 1.993139386177063, 'learning_rate': 0.00018086728615677867, 'epoch': 0.1}


 10%|▉         | 1617/16798 [06:21<1:05:11,  3.88it/s]

{'loss': 1.5287, 'grad_norm': 1.901171088218689, 'learning_rate': 0.00018085537288539433, 'epoch': 0.1}


 10%|▉         | 1618/16798 [06:21<1:08:13,  3.71it/s]

{'loss': 2.0905, 'grad_norm': 2.2850279808044434, 'learning_rate': 0.00018084345961401002, 'epoch': 0.1}


 10%|▉         | 1619/16798 [06:21<1:09:41,  3.63it/s]

{'loss': 1.3515, 'grad_norm': 1.9448295831680298, 'learning_rate': 0.00018083154634262569, 'epoch': 0.1}


 10%|▉         | 1620/16798 [06:22<1:07:12,  3.76it/s]

{'loss': 1.592, 'grad_norm': 2.0675830841064453, 'learning_rate': 0.00018081963307124138, 'epoch': 0.1}


 10%|▉         | 1621/16798 [06:22<1:07:28,  3.75it/s]

{'loss': 1.4975, 'grad_norm': 1.8161765336990356, 'learning_rate': 0.00018080771979985704, 'epoch': 0.1}


 10%|▉         | 1622/16798 [06:22<1:07:37,  3.74it/s]

{'loss': 1.3166, 'grad_norm': 1.5995055437088013, 'learning_rate': 0.00018079580652847273, 'epoch': 0.1}


 10%|▉         | 1623/16798 [06:22<1:05:05,  3.89it/s]

{'loss': 1.6768, 'grad_norm': 2.1232852935791016, 'learning_rate': 0.0001807838932570884, 'epoch': 0.1}


 10%|▉         | 1624/16798 [06:23<1:04:38,  3.91it/s]

{'loss': 1.5473, 'grad_norm': 1.9647122621536255, 'learning_rate': 0.00018077197998570408, 'epoch': 0.1}


 10%|▉         | 1625/16798 [06:23<1:05:19,  3.87it/s]

{'loss': 1.7814, 'grad_norm': 2.300875663757324, 'learning_rate': 0.00018076006671431975, 'epoch': 0.1}


 10%|▉         | 1626/16798 [06:23<1:06:41,  3.79it/s]

{'loss': 1.8896, 'grad_norm': 2.295680522918701, 'learning_rate': 0.00018074815344293544, 'epoch': 0.1}


 10%|▉         | 1627/16798 [06:23<1:05:59,  3.83it/s]

{'loss': 2.043, 'grad_norm': 2.2323524951934814, 'learning_rate': 0.0001807362401715511, 'epoch': 0.1}


 10%|▉         | 1628/16798 [06:24<1:05:41,  3.85it/s]

{'loss': 1.3032, 'grad_norm': 1.8173211812973022, 'learning_rate': 0.0001807243269001668, 'epoch': 0.1}


 10%|▉         | 1629/16798 [06:24<1:06:34,  3.80it/s]

{'loss': 1.0441, 'grad_norm': 3.200894594192505, 'learning_rate': 0.00018071241362878246, 'epoch': 0.1}


 10%|▉         | 1630/16798 [06:24<1:03:22,  3.99it/s]

{'loss': 1.2184, 'grad_norm': 1.6519606113433838, 'learning_rate': 0.00018070050035739815, 'epoch': 0.1}


 10%|▉         | 1631/16798 [06:24<1:01:59,  4.08it/s]

{'loss': 1.6488, 'grad_norm': 2.379350423812866, 'learning_rate': 0.00018068858708601384, 'epoch': 0.1}


 10%|▉         | 1632/16798 [06:25<1:03:22,  3.99it/s]

{'loss': 1.6725, 'grad_norm': 1.9355992078781128, 'learning_rate': 0.0001806766738146295, 'epoch': 0.1}


 10%|▉         | 1633/16798 [06:25<1:04:50,  3.90it/s]

{'loss': 1.3559, 'grad_norm': 1.688700795173645, 'learning_rate': 0.0001806647605432452, 'epoch': 0.1}


 10%|▉         | 1634/16798 [06:25<1:02:47,  4.02it/s]

{'loss': 0.9722, 'grad_norm': 1.4999803304672241, 'learning_rate': 0.00018065284727186086, 'epoch': 0.1}


 10%|▉         | 1635/16798 [06:25<1:00:37,  4.17it/s]

{'loss': 1.5788, 'grad_norm': 1.9779366254806519, 'learning_rate': 0.00018064093400047655, 'epoch': 0.1}


 10%|▉         | 1636/16798 [06:26<1:02:02,  4.07it/s]

{'loss': 1.521, 'grad_norm': 1.8161932229995728, 'learning_rate': 0.0001806290207290922, 'epoch': 0.1}


 10%|▉         | 1637/16798 [06:26<1:05:51,  3.84it/s]

{'loss': 1.5837, 'grad_norm': 2.0878922939300537, 'learning_rate': 0.0001806171074577079, 'epoch': 0.1}


 10%|▉         | 1638/16798 [06:26<1:03:24,  3.99it/s]

{'loss': 1.4389, 'grad_norm': 1.91736900806427, 'learning_rate': 0.00018060519418632357, 'epoch': 0.1}


 10%|▉         | 1639/16798 [06:26<1:00:46,  4.16it/s]

{'loss': 0.8819, 'grad_norm': 1.5238531827926636, 'learning_rate': 0.00018059328091493926, 'epoch': 0.1}


 10%|▉         | 1640/16798 [06:27<58:47,  4.30it/s]  

{'loss': 0.5814, 'grad_norm': 1.4351403713226318, 'learning_rate': 0.00018058136764355492, 'epoch': 0.1}


 10%|▉         | 1641/16798 [06:27<58:48,  4.30it/s]

{'loss': 1.5738, 'grad_norm': 2.470743417739868, 'learning_rate': 0.0001805694543721706, 'epoch': 0.1}


 10%|▉         | 1642/16798 [06:27<1:02:01,  4.07it/s]

{'loss': 0.7481, 'grad_norm': 1.5044938325881958, 'learning_rate': 0.00018055754110078627, 'epoch': 0.1}


 10%|▉         | 1643/16798 [06:27<1:00:50,  4.15it/s]

{'loss': 0.8764, 'grad_norm': 1.4347716569900513, 'learning_rate': 0.00018054562782940197, 'epoch': 0.1}


 10%|▉         | 1644/16798 [06:28<1:02:47,  4.02it/s]

{'loss': 0.9566, 'grad_norm': 1.5418885946273804, 'learning_rate': 0.00018053371455801763, 'epoch': 0.1}


 10%|▉         | 1645/16798 [06:28<59:42,  4.23it/s]  

{'loss': 0.9188, 'grad_norm': 1.8048535585403442, 'learning_rate': 0.00018052180128663332, 'epoch': 0.1}


 10%|▉         | 1646/16798 [06:28<1:00:50,  4.15it/s]

{'loss': 0.5439, 'grad_norm': 1.2712280750274658, 'learning_rate': 0.00018050988801524898, 'epoch': 0.1}


 10%|▉         | 1647/16798 [06:28<1:00:21,  4.18it/s]

{'loss': 0.8032, 'grad_norm': 1.5406384468078613, 'learning_rate': 0.00018049797474386467, 'epoch': 0.1}


 10%|▉         | 1648/16798 [06:29<1:01:49,  4.08it/s]

{'loss': 1.1572, 'grad_norm': 2.0508484840393066, 'learning_rate': 0.00018048606147248034, 'epoch': 0.1}


 10%|▉         | 1649/16798 [06:29<1:00:05,  4.20it/s]

{'loss': 0.194, 'grad_norm': 0.7756627202033997, 'learning_rate': 0.00018047414820109603, 'epoch': 0.1}


 10%|▉         | 1650/16798 [06:29<58:44,  4.30it/s]  

{'loss': 0.2796, 'grad_norm': 1.0851198434829712, 'learning_rate': 0.0001804622349297117, 'epoch': 0.1}


 10%|▉         | 1651/16798 [06:29<1:01:10,  4.13it/s]

{'loss': 2.0753, 'grad_norm': 2.245335102081299, 'learning_rate': 0.00018045032165832738, 'epoch': 0.1}


 10%|▉         | 1652/16798 [06:30<1:01:33,  4.10it/s]

{'loss': 1.7786, 'grad_norm': 2.281216859817505, 'learning_rate': 0.00018043840838694305, 'epoch': 0.1}


 10%|▉         | 1653/16798 [06:30<1:04:28,  3.92it/s]

{'loss': 2.0709, 'grad_norm': 1.8769872188568115, 'learning_rate': 0.00018042649511555874, 'epoch': 0.1}


 10%|▉         | 1654/16798 [06:30<1:04:11,  3.93it/s]

{'loss': 2.1439, 'grad_norm': 2.3546319007873535, 'learning_rate': 0.0001804145818441744, 'epoch': 0.1}


 10%|▉         | 1655/16798 [06:30<1:04:59,  3.88it/s]

{'loss': 2.0266, 'grad_norm': 2.1310460567474365, 'learning_rate': 0.0001804026685727901, 'epoch': 0.1}


 10%|▉         | 1656/16798 [06:31<1:02:27,  4.04it/s]

{'loss': 2.3267, 'grad_norm': 2.4635260105133057, 'learning_rate': 0.00018039075530140576, 'epoch': 0.1}


 10%|▉         | 1657/16798 [06:31<1:02:58,  4.01it/s]

{'loss': 2.1758, 'grad_norm': 1.9453752040863037, 'learning_rate': 0.00018037884203002145, 'epoch': 0.1}


 10%|▉         | 1658/16798 [06:31<1:07:29,  3.74it/s]

{'loss': 1.8438, 'grad_norm': 1.8346221446990967, 'learning_rate': 0.0001803669287586371, 'epoch': 0.1}


 10%|▉         | 1659/16798 [06:31<1:07:03,  3.76it/s]

{'loss': 1.9365, 'grad_norm': 2.1597211360931396, 'learning_rate': 0.0001803550154872528, 'epoch': 0.1}


 10%|▉         | 1660/16798 [06:32<1:04:06,  3.94it/s]

{'loss': 1.8257, 'grad_norm': 1.9546900987625122, 'learning_rate': 0.00018034310221586846, 'epoch': 0.1}


 10%|▉         | 1661/16798 [06:32<1:02:51,  4.01it/s]

{'loss': 1.6865, 'grad_norm': 1.8188762664794922, 'learning_rate': 0.00018033118894448416, 'epoch': 0.1}


 10%|▉         | 1662/16798 [06:32<1:05:42,  3.84it/s]

{'loss': 1.7363, 'grad_norm': 1.7987093925476074, 'learning_rate': 0.00018031927567309985, 'epoch': 0.1}


 10%|▉         | 1663/16798 [06:32<1:04:50,  3.89it/s]

{'loss': 1.7722, 'grad_norm': 2.3785812854766846, 'learning_rate': 0.00018030736240171554, 'epoch': 0.1}


 10%|▉         | 1664/16798 [06:33<1:03:29,  3.97it/s]

{'loss': 1.3089, 'grad_norm': 1.6848337650299072, 'learning_rate': 0.0001802954491303312, 'epoch': 0.1}


 10%|▉         | 1665/16798 [06:33<1:05:59,  3.82it/s]

{'loss': 1.9631, 'grad_norm': 2.0772898197174072, 'learning_rate': 0.0001802835358589469, 'epoch': 0.1}


 10%|▉         | 1666/16798 [06:33<1:03:49,  3.95it/s]

{'loss': 1.3941, 'grad_norm': 1.6231906414031982, 'learning_rate': 0.00018027162258756255, 'epoch': 0.1}


 10%|▉         | 1667/16798 [06:33<1:07:20,  3.74it/s]

{'loss': 1.6031, 'grad_norm': 2.2627859115600586, 'learning_rate': 0.00018025970931617825, 'epoch': 0.1}


 10%|▉         | 1668/16798 [06:34<1:11:55,  3.51it/s]

{'loss': 1.6993, 'grad_norm': 1.8766791820526123, 'learning_rate': 0.0001802477960447939, 'epoch': 0.1}


 10%|▉         | 1669/16798 [06:34<1:08:38,  3.67it/s]

{'loss': 1.655, 'grad_norm': 2.030400276184082, 'learning_rate': 0.0001802358827734096, 'epoch': 0.1}


 10%|▉         | 1670/16798 [06:34<1:05:23,  3.86it/s]

{'loss': 1.6379, 'grad_norm': 1.7962652444839478, 'learning_rate': 0.00018022396950202526, 'epoch': 0.1}


 10%|▉         | 1671/16798 [06:35<1:02:58,  4.00it/s]

{'loss': 1.7254, 'grad_norm': 2.3901703357696533, 'learning_rate': 0.00018021205623064095, 'epoch': 0.1}


 10%|▉         | 1672/16798 [06:35<1:00:51,  4.14it/s]

{'loss': 1.4356, 'grad_norm': 1.6971261501312256, 'learning_rate': 0.00018020014295925662, 'epoch': 0.1}


 10%|▉         | 1673/16798 [06:35<1:03:18,  3.98it/s]

{'loss': 1.794, 'grad_norm': 2.4339487552642822, 'learning_rate': 0.0001801882296878723, 'epoch': 0.1}


 10%|▉         | 1674/16798 [06:35<1:05:58,  3.82it/s]

{'loss': 1.7187, 'grad_norm': 2.1582229137420654, 'learning_rate': 0.00018017631641648797, 'epoch': 0.1}


 10%|▉         | 1675/16798 [06:36<1:07:31,  3.73it/s]

{'loss': 1.6201, 'grad_norm': 2.0927646160125732, 'learning_rate': 0.00018016440314510366, 'epoch': 0.1}


 10%|▉         | 1676/16798 [06:36<1:09:50,  3.61it/s]

{'loss': 1.5233, 'grad_norm': 2.411003589630127, 'learning_rate': 0.00018015248987371933, 'epoch': 0.1}


 10%|▉         | 1677/16798 [06:36<1:06:41,  3.78it/s]

{'loss': 1.3047, 'grad_norm': 1.5768768787384033, 'learning_rate': 0.00018014057660233502, 'epoch': 0.1}


 10%|▉         | 1678/16798 [06:36<1:04:11,  3.93it/s]

{'loss': 1.3215, 'grad_norm': 1.6388400793075562, 'learning_rate': 0.00018012866333095068, 'epoch': 0.1}


 10%|▉         | 1679/16798 [06:37<1:07:54,  3.71it/s]

{'loss': 1.23, 'grad_norm': 1.7332489490509033, 'learning_rate': 0.00018011675005956637, 'epoch': 0.1}


 10%|█         | 1680/16798 [06:37<1:04:53,  3.88it/s]

{'loss': 1.3227, 'grad_norm': 1.5245335102081299, 'learning_rate': 0.00018010483678818204, 'epoch': 0.1}


 10%|█         | 1681/16798 [06:37<1:07:23,  3.74it/s]

{'loss': 1.3296, 'grad_norm': 2.1402862071990967, 'learning_rate': 0.00018009292351679773, 'epoch': 0.1}


 10%|█         | 1682/16798 [06:37<1:04:32,  3.90it/s]

{'loss': 1.5862, 'grad_norm': 1.99593985080719, 'learning_rate': 0.0001800810102454134, 'epoch': 0.1}


 10%|█         | 1683/16798 [06:38<1:06:56,  3.76it/s]

{'loss': 1.5479, 'grad_norm': 2.0073773860931396, 'learning_rate': 0.00018006909697402908, 'epoch': 0.1}


 10%|█         | 1684/16798 [06:38<1:05:28,  3.85it/s]

{'loss': 1.3567, 'grad_norm': 1.9512379169464111, 'learning_rate': 0.00018005718370264474, 'epoch': 0.1}


 10%|█         | 1685/16798 [06:38<1:03:12,  3.99it/s]

{'loss': 1.8339, 'grad_norm': 2.164097309112549, 'learning_rate': 0.00018004527043126044, 'epoch': 0.1}


 10%|█         | 1686/16798 [06:38<1:02:08,  4.05it/s]

{'loss': 1.4962, 'grad_norm': 1.9135117530822754, 'learning_rate': 0.0001800333571598761, 'epoch': 0.1}


 10%|█         | 1687/16798 [06:39<1:05:38,  3.84it/s]

{'loss': 1.3182, 'grad_norm': 1.7157498598098755, 'learning_rate': 0.0001800214438884918, 'epoch': 0.1}


 10%|█         | 1688/16798 [06:39<1:12:17,  3.48it/s]

{'loss': 1.3992, 'grad_norm': 2.2161612510681152, 'learning_rate': 0.00018000953061710745, 'epoch': 0.1}


 10%|█         | 1689/16798 [06:39<1:09:05,  3.64it/s]

{'loss': 0.8254, 'grad_norm': 1.4718337059020996, 'learning_rate': 0.00017999761734572314, 'epoch': 0.1}


 10%|█         | 1690/16798 [06:40<1:07:27,  3.73it/s]

{'loss': 1.1025, 'grad_norm': 1.6679415702819824, 'learning_rate': 0.0001799857040743388, 'epoch': 0.1}


 10%|█         | 1691/16798 [06:40<1:08:13,  3.69it/s]

{'loss': 1.0648, 'grad_norm': 1.5309875011444092, 'learning_rate': 0.0001799737908029545, 'epoch': 0.1}


 10%|█         | 1692/16798 [06:40<1:04:01,  3.93it/s]

{'loss': 1.2042, 'grad_norm': 1.5903061628341675, 'learning_rate': 0.0001799618775315702, 'epoch': 0.1}


 10%|█         | 1693/16798 [06:40<1:04:38,  3.89it/s]

{'loss': 1.0651, 'grad_norm': 1.652902364730835, 'learning_rate': 0.00017994996426018585, 'epoch': 0.1}


 10%|█         | 1694/16798 [06:41<1:05:25,  3.85it/s]

{'loss': 1.1192, 'grad_norm': 1.8466639518737793, 'learning_rate': 0.00017993805098880154, 'epoch': 0.1}


 10%|█         | 1695/16798 [06:41<1:04:56,  3.88it/s]

{'loss': 0.8619, 'grad_norm': 1.8392870426177979, 'learning_rate': 0.0001799261377174172, 'epoch': 0.1}


 10%|█         | 1697/16798 [06:41<59:51,  4.20it/s]  

{'loss': 0.4873, 'grad_norm': 1.1540213823318481, 'learning_rate': 0.0001799142244460329, 'epoch': 0.1}


 10%|█         | 1697/16798 [06:41<59:51,  4.20it/s]

{'loss': 0.5519, 'grad_norm': 1.3305399417877197, 'learning_rate': 0.00017990231117464856, 'epoch': 0.1}


 10%|█         | 1698/16798 [06:42<1:01:32,  4.09it/s]

{'loss': 0.3301, 'grad_norm': 0.9873209595680237, 'learning_rate': 0.00017989039790326425, 'epoch': 0.1}


 10%|█         | 1699/16798 [06:42<58:50,  4.28it/s]  

{'loss': 0.1732, 'grad_norm': 0.7335129976272583, 'learning_rate': 0.00017987848463187992, 'epoch': 0.1}


 10%|█         | 1700/16798 [06:42<56:26,  4.46it/s]

{'loss': 0.2692, 'grad_norm': 0.9204623699188232, 'learning_rate': 0.0001798665713604956, 'epoch': 0.1}


 10%|█         | 1701/16798 [06:42<57:07,  4.41it/s]

{'loss': 1.9817, 'grad_norm': 1.5655604600906372, 'learning_rate': 0.00017985465808911127, 'epoch': 0.1}


 10%|█         | 1702/16798 [06:42<1:00:54,  4.13it/s]

{'loss': 2.4576, 'grad_norm': 2.00346302986145, 'learning_rate': 0.00017984274481772696, 'epoch': 0.1}


 10%|█         | 1703/16798 [06:43<1:01:23,  4.10it/s]

{'loss': 1.4356, 'grad_norm': 1.5525343418121338, 'learning_rate': 0.00017983083154634263, 'epoch': 0.1}


 10%|█         | 1704/16798 [06:43<1:01:59,  4.06it/s]

{'loss': 1.9153, 'grad_norm': 1.9739001989364624, 'learning_rate': 0.00017981891827495832, 'epoch': 0.1}


 10%|█         | 1705/16798 [06:43<1:04:14,  3.92it/s]

{'loss': 1.909, 'grad_norm': 1.7879446744918823, 'learning_rate': 0.00017980700500357398, 'epoch': 0.1}


 10%|█         | 1706/16798 [06:43<1:01:44,  4.07it/s]

{'loss': 1.8901, 'grad_norm': 1.7391371726989746, 'learning_rate': 0.00017979509173218967, 'epoch': 0.1}


 10%|█         | 1707/16798 [06:44<1:02:47,  4.01it/s]

{'loss': 1.6724, 'grad_norm': 1.6596492528915405, 'learning_rate': 0.00017978317846080533, 'epoch': 0.1}


 10%|█         | 1708/16798 [06:44<1:04:22,  3.91it/s]

{'loss': 1.6045, 'grad_norm': 1.8593429327011108, 'learning_rate': 0.00017977126518942102, 'epoch': 0.1}


 10%|█         | 1709/16798 [06:44<1:02:10,  4.04it/s]

{'loss': 1.3376, 'grad_norm': 1.726436734199524, 'learning_rate': 0.0001797593519180367, 'epoch': 0.1}


 10%|█         | 1710/16798 [06:44<1:01:05,  4.12it/s]

{'loss': 1.5599, 'grad_norm': 1.8006093502044678, 'learning_rate': 0.00017974743864665238, 'epoch': 0.1}


 10%|█         | 1711/16798 [06:45<1:00:12,  4.18it/s]

{'loss': 2.0645, 'grad_norm': 2.111475944519043, 'learning_rate': 0.00017973552537526804, 'epoch': 0.1}


 10%|█         | 1712/16798 [06:45<1:03:30,  3.96it/s]

{'loss': 2.0539, 'grad_norm': 2.506166458129883, 'learning_rate': 0.00017972361210388373, 'epoch': 0.1}


 10%|█         | 1713/16798 [06:45<1:01:43,  4.07it/s]

{'loss': 1.6412, 'grad_norm': 2.239806652069092, 'learning_rate': 0.0001797116988324994, 'epoch': 0.1}


 10%|█         | 1714/16798 [06:45<1:00:11,  4.18it/s]

{'loss': 1.5675, 'grad_norm': 1.8466753959655762, 'learning_rate': 0.0001796997855611151, 'epoch': 0.1}


 10%|█         | 1715/16798 [06:46<1:03:21,  3.97it/s]

{'loss': 1.7785, 'grad_norm': 2.1345014572143555, 'learning_rate': 0.00017968787228973075, 'epoch': 0.1}


 10%|█         | 1716/16798 [06:46<1:04:30,  3.90it/s]

{'loss': 1.3756, 'grad_norm': 1.9451441764831543, 'learning_rate': 0.00017967595901834644, 'epoch': 0.1}


 10%|█         | 1717/16798 [06:46<1:00:46,  4.14it/s]

{'loss': 1.7167, 'grad_norm': 2.0352346897125244, 'learning_rate': 0.0001796640457469621, 'epoch': 0.1}


 10%|█         | 1718/16798 [06:46<1:04:29,  3.90it/s]

{'loss': 1.5718, 'grad_norm': 1.7429959774017334, 'learning_rate': 0.0001796521324755778, 'epoch': 0.1}


 10%|█         | 1719/16798 [06:47<1:02:50,  4.00it/s]

{'loss': 1.6834, 'grad_norm': 1.7252273559570312, 'learning_rate': 0.00017964021920419346, 'epoch': 0.1}


 10%|█         | 1720/16798 [06:47<1:01:05,  4.11it/s]

{'loss': 1.7365, 'grad_norm': 1.8827967643737793, 'learning_rate': 0.00017962830593280915, 'epoch': 0.1}


 10%|█         | 1721/16798 [06:47<1:03:43,  3.94it/s]

{'loss': 1.5006, 'grad_norm': 1.7716543674468994, 'learning_rate': 0.00017961639266142481, 'epoch': 0.1}


 10%|█         | 1722/16798 [06:47<1:00:41,  4.14it/s]

{'loss': 1.7977, 'grad_norm': 2.038752794265747, 'learning_rate': 0.0001796044793900405, 'epoch': 0.1}


 10%|█         | 1723/16798 [06:48<58:53,  4.27it/s]  

{'loss': 1.4992, 'grad_norm': 1.8519713878631592, 'learning_rate': 0.0001795925661186562, 'epoch': 0.1}


 10%|█         | 1724/16798 [06:48<1:00:10,  4.18it/s]

{'loss': 1.4172, 'grad_norm': 1.8785971403121948, 'learning_rate': 0.0001795806528472719, 'epoch': 0.1}


 10%|█         | 1725/16798 [06:48<1:02:33,  4.02it/s]

{'loss': 1.0693, 'grad_norm': 2.342562198638916, 'learning_rate': 0.00017956873957588755, 'epoch': 0.1}


 10%|█         | 1726/16798 [06:48<1:00:34,  4.15it/s]

{'loss': 2.1938, 'grad_norm': 2.714012384414673, 'learning_rate': 0.00017955682630450324, 'epoch': 0.1}


 10%|█         | 1727/16798 [06:49<58:35,  4.29it/s]  

{'loss': 1.5797, 'grad_norm': 2.08463978767395, 'learning_rate': 0.0001795449130331189, 'epoch': 0.1}


 10%|█         | 1728/16798 [06:49<1:00:19,  4.16it/s]

{'loss': 1.6407, 'grad_norm': 1.797084093093872, 'learning_rate': 0.0001795329997617346, 'epoch': 0.1}


 10%|█         | 1729/16798 [06:49<1:03:22,  3.96it/s]

{'loss': 1.6747, 'grad_norm': 2.0634562969207764, 'learning_rate': 0.00017952108649035026, 'epoch': 0.1}


 10%|█         | 1730/16798 [06:49<1:00:27,  4.15it/s]

{'loss': 1.5648, 'grad_norm': 2.001326322555542, 'learning_rate': 0.00017950917321896595, 'epoch': 0.1}


 10%|█         | 1731/16798 [06:50<58:39,  4.28it/s]  

{'loss': 1.4404, 'grad_norm': 2.004366874694824, 'learning_rate': 0.00017949725994758161, 'epoch': 0.1}


 10%|█         | 1733/16798 [06:50<56:11,  4.47it/s]

{'loss': 1.1834, 'grad_norm': 1.8846009969711304, 'learning_rate': 0.0001794853466761973, 'epoch': 0.1}


 10%|█         | 1733/16798 [06:50<56:11,  4.47it/s]

{'loss': 1.9051, 'grad_norm': 2.3525595664978027, 'learning_rate': 0.00017947343340481297, 'epoch': 0.1}


 10%|█         | 1734/16798 [06:50<1:00:27,  4.15it/s]

{'loss': 1.6429, 'grad_norm': 2.07112717628479, 'learning_rate': 0.00017946152013342866, 'epoch': 0.1}


 10%|█         | 1735/16798 [06:50<59:14,  4.24it/s]  

{'loss': 1.679, 'grad_norm': 2.0608835220336914, 'learning_rate': 0.00017944960686204432, 'epoch': 0.1}


 10%|█         | 1736/16798 [06:51<1:02:22,  4.02it/s]

{'loss': 1.4569, 'grad_norm': 1.7167139053344727, 'learning_rate': 0.00017943769359066001, 'epoch': 0.1}


 10%|█         | 1737/16798 [06:51<1:02:52,  3.99it/s]

{'loss': 1.2185, 'grad_norm': 1.7275522947311401, 'learning_rate': 0.00017942578031927568, 'epoch': 0.1}


 10%|█         | 1738/16798 [06:51<1:03:06,  3.98it/s]

{'loss': 1.6675, 'grad_norm': 2.009215831756592, 'learning_rate': 0.00017941386704789137, 'epoch': 0.1}


 10%|█         | 1739/16798 [06:51<1:02:04,  4.04it/s]

{'loss': 0.9023, 'grad_norm': 1.5791925191879272, 'learning_rate': 0.00017940195377650703, 'epoch': 0.1}


 10%|█         | 1740/16798 [06:52<1:07:34,  3.71it/s]

{'loss': 1.3312, 'grad_norm': 2.41078782081604, 'learning_rate': 0.00017939004050512272, 'epoch': 0.1}


 10%|█         | 1741/16798 [06:52<1:05:29,  3.83it/s]

{'loss': 1.3101, 'grad_norm': 1.8330459594726562, 'learning_rate': 0.00017937812723373839, 'epoch': 0.1}


 10%|█         | 1742/16798 [06:52<1:02:37,  4.01it/s]

{'loss': 1.0601, 'grad_norm': 1.676246166229248, 'learning_rate': 0.00017936621396235408, 'epoch': 0.1}


 10%|█         | 1743/16798 [06:53<1:00:49,  4.13it/s]

{'loss': 0.9934, 'grad_norm': 1.5056051015853882, 'learning_rate': 0.00017935430069096974, 'epoch': 0.1}


 10%|█         | 1744/16798 [06:53<1:05:57,  3.80it/s]

{'loss': 0.9297, 'grad_norm': 1.8151389360427856, 'learning_rate': 0.00017934238741958543, 'epoch': 0.1}


 10%|█         | 1745/16798 [06:53<1:03:33,  3.95it/s]

{'loss': 1.17, 'grad_norm': 1.6922627687454224, 'learning_rate': 0.0001793304741482011, 'epoch': 0.1}


 10%|█         | 1746/16798 [06:53<1:01:20,  4.09it/s]

{'loss': 0.8611, 'grad_norm': 1.4931107759475708, 'learning_rate': 0.00017931856087681679, 'epoch': 0.1}


 10%|█         | 1747/16798 [06:53<59:36,  4.21it/s]  

{'loss': 0.9913, 'grad_norm': 1.6155104637145996, 'learning_rate': 0.00017930664760543245, 'epoch': 0.1}


 10%|█         | 1748/16798 [06:54<1:01:46,  4.06it/s]

{'loss': 0.4577, 'grad_norm': 1.317538857460022, 'learning_rate': 0.00017929473433404814, 'epoch': 0.1}


 10%|█         | 1749/16798 [06:54<1:01:00,  4.11it/s]

{'loss': 0.585, 'grad_norm': 1.2834147214889526, 'learning_rate': 0.0001792828210626638, 'epoch': 0.1}


 10%|█         | 1750/16798 [06:54<59:00,  4.25it/s]  

{'loss': 0.5855, 'grad_norm': 1.310826063156128, 'learning_rate': 0.0001792709077912795, 'epoch': 0.1}


 10%|█         | 1751/16798 [06:54<58:52,  4.26it/s]

{'loss': 2.254, 'grad_norm': 9.074318885803223, 'learning_rate': 0.00017925899451989516, 'epoch': 0.1}


 10%|█         | 1752/16798 [06:55<1:02:02,  4.04it/s]

{'loss': 2.2308, 'grad_norm': 2.038612127304077, 'learning_rate': 0.00017924708124851085, 'epoch': 0.1}


 10%|█         | 1753/16798 [06:55<1:00:38,  4.13it/s]

{'loss': 2.0376, 'grad_norm': 1.7799410820007324, 'learning_rate': 0.0001792351679771265, 'epoch': 0.1}


 10%|█         | 1754/16798 [06:55<1:00:36,  4.14it/s]

{'loss': 2.3323, 'grad_norm': 2.035417318344116, 'learning_rate': 0.0001792232547057422, 'epoch': 0.1}


 10%|█         | 1755/16798 [06:55<59:58,  4.18it/s]  

{'loss': 2.0957, 'grad_norm': 1.91181480884552, 'learning_rate': 0.0001792113414343579, 'epoch': 0.1}


 10%|█         | 1756/16798 [06:56<1:03:29,  3.95it/s]

{'loss': 2.3396, 'grad_norm': 2.3264002799987793, 'learning_rate': 0.00017919942816297356, 'epoch': 0.1}


 10%|█         | 1757/16798 [06:56<1:03:06,  3.97it/s]

{'loss': 2.0273, 'grad_norm': 1.6901262998580933, 'learning_rate': 0.00017918751489158925, 'epoch': 0.1}


 10%|█         | 1758/16798 [06:56<1:03:20,  3.96it/s]

{'loss': 2.471, 'grad_norm': 2.3651227951049805, 'learning_rate': 0.0001791756016202049, 'epoch': 0.1}


 10%|█         | 1759/16798 [06:56<1:03:16,  3.96it/s]

{'loss': 2.2066, 'grad_norm': 1.7853468656539917, 'learning_rate': 0.0001791636883488206, 'epoch': 0.1}


 10%|█         | 1760/16798 [06:57<1:04:04,  3.91it/s]

{'loss': 1.6576, 'grad_norm': 1.8747080564498901, 'learning_rate': 0.00017915177507743627, 'epoch': 0.1}


 10%|█         | 1761/16798 [06:57<1:01:49,  4.05it/s]

{'loss': 1.7609, 'grad_norm': 1.8366854190826416, 'learning_rate': 0.00017913986180605196, 'epoch': 0.1}


 10%|█         | 1762/16798 [06:57<1:01:30,  4.07it/s]

{'loss': 1.5967, 'grad_norm': 1.676440715789795, 'learning_rate': 0.00017912794853466762, 'epoch': 0.1}


 10%|█         | 1763/16798 [06:57<1:03:41,  3.93it/s]

{'loss': 1.4101, 'grad_norm': 1.930721640586853, 'learning_rate': 0.0001791160352632833, 'epoch': 0.1}


 11%|█         | 1764/16798 [06:58<1:04:58,  3.86it/s]

{'loss': 1.576, 'grad_norm': 1.8548511266708374, 'learning_rate': 0.00017910412199189898, 'epoch': 0.11}


 11%|█         | 1765/16798 [06:58<1:03:15,  3.96it/s]

{'loss': 1.4707, 'grad_norm': 1.7589181661605835, 'learning_rate': 0.00017909220872051467, 'epoch': 0.11}


 11%|█         | 1766/16798 [06:58<1:03:23,  3.95it/s]

{'loss': 1.6286, 'grad_norm': 1.9358181953430176, 'learning_rate': 0.00017908029544913033, 'epoch': 0.11}


 11%|█         | 1767/16798 [06:59<1:04:48,  3.87it/s]

{'loss': 1.9163, 'grad_norm': 2.107769727706909, 'learning_rate': 0.00017906838217774602, 'epoch': 0.11}


 11%|█         | 1768/16798 [06:59<1:04:35,  3.88it/s]

{'loss': 1.2443, 'grad_norm': 1.5690662860870361, 'learning_rate': 0.00017905646890636168, 'epoch': 0.11}


 11%|█         | 1769/16798 [06:59<1:02:51,  3.99it/s]

{'loss': 1.5351, 'grad_norm': 2.226712942123413, 'learning_rate': 0.00017904455563497738, 'epoch': 0.11}


 11%|█         | 1770/16798 [06:59<1:04:04,  3.91it/s]

{'loss': 1.7728, 'grad_norm': 1.7742401361465454, 'learning_rate': 0.00017903264236359304, 'epoch': 0.11}


 11%|█         | 1771/16798 [06:59<1:01:24,  4.08it/s]

{'loss': 1.551, 'grad_norm': 1.6933207511901855, 'learning_rate': 0.00017902072909220873, 'epoch': 0.11}


 11%|█         | 1772/16798 [07:00<1:03:12,  3.96it/s]

{'loss': 1.5073, 'grad_norm': 1.867673397064209, 'learning_rate': 0.0001790088158208244, 'epoch': 0.11}


 11%|█         | 1773/16798 [07:00<1:01:15,  4.09it/s]

{'loss': 1.4161, 'grad_norm': 1.6704033613204956, 'learning_rate': 0.00017899690254944008, 'epoch': 0.11}


 11%|█         | 1774/16798 [07:00<1:04:26,  3.89it/s]

{'loss': 1.5654, 'grad_norm': 1.7686713933944702, 'learning_rate': 0.00017898498927805575, 'epoch': 0.11}


 11%|█         | 1775/16798 [07:01<1:03:27,  3.95it/s]

{'loss': 1.6059, 'grad_norm': 1.7723925113677979, 'learning_rate': 0.00017897307600667144, 'epoch': 0.11}


 11%|█         | 1776/16798 [07:01<1:01:26,  4.07it/s]

{'loss': 1.4777, 'grad_norm': 1.7424205541610718, 'learning_rate': 0.0001789611627352871, 'epoch': 0.11}


 11%|█         | 1777/16798 [07:01<1:04:24,  3.89it/s]

{'loss': 1.6315, 'grad_norm': 1.8684086799621582, 'learning_rate': 0.0001789492494639028, 'epoch': 0.11}


 11%|█         | 1778/16798 [07:01<1:00:43,  4.12it/s]

{'loss': 1.321, 'grad_norm': 1.7456114292144775, 'learning_rate': 0.00017893733619251846, 'epoch': 0.11}


 11%|█         | 1779/16798 [07:01<59:05,  4.24it/s]  

{'loss': 1.2929, 'grad_norm': 1.9630597829818726, 'learning_rate': 0.00017892542292113415, 'epoch': 0.11}


 11%|█         | 1780/16798 [07:02<57:25,  4.36it/s]

{'loss': 1.7181, 'grad_norm': 2.002129316329956, 'learning_rate': 0.0001789135096497498, 'epoch': 0.11}


 11%|█         | 1781/16798 [07:02<1:01:18,  4.08it/s]

{'loss': 1.6975, 'grad_norm': 2.21651554107666, 'learning_rate': 0.0001789015963783655, 'epoch': 0.11}


 11%|█         | 1782/16798 [07:02<59:20,  4.22it/s]  

{'loss': 1.6079, 'grad_norm': 2.086439371109009, 'learning_rate': 0.00017888968310698117, 'epoch': 0.11}


 11%|█         | 1783/16798 [07:02<57:49,  4.33it/s]

{'loss': 1.3645, 'grad_norm': 1.776054859161377, 'learning_rate': 0.00017887776983559686, 'epoch': 0.11}


 11%|█         | 1784/16798 [07:03<57:11,  4.37it/s]

{'loss': 1.6406, 'grad_norm': 1.8015896081924438, 'learning_rate': 0.00017886585656421252, 'epoch': 0.11}


 11%|█         | 1785/16798 [07:03<57:55,  4.32it/s]

{'loss': 1.2575, 'grad_norm': 1.7706116437911987, 'learning_rate': 0.00017885394329282824, 'epoch': 0.11}


 11%|█         | 1786/16798 [07:03<1:02:01,  4.03it/s]

{'loss': 1.6538, 'grad_norm': 2.099536180496216, 'learning_rate': 0.0001788420300214439, 'epoch': 0.11}


 11%|█         | 1787/16798 [07:03<1:00:55,  4.11it/s]

{'loss': 1.4787, 'grad_norm': 2.260702610015869, 'learning_rate': 0.0001788301167500596, 'epoch': 0.11}


 11%|█         | 1788/16798 [07:04<1:03:36,  3.93it/s]

{'loss': 1.4539, 'grad_norm': 1.9150952100753784, 'learning_rate': 0.00017881820347867526, 'epoch': 0.11}


 11%|█         | 1789/16798 [07:04<1:01:30,  4.07it/s]

{'loss': 1.2657, 'grad_norm': 2.0150046348571777, 'learning_rate': 0.00017880629020729095, 'epoch': 0.11}


 11%|█         | 1790/16798 [07:04<1:02:35,  4.00it/s]

{'loss': 2.0567, 'grad_norm': 2.273538112640381, 'learning_rate': 0.0001787943769359066, 'epoch': 0.11}


 11%|█         | 1791/16798 [07:04<1:00:58,  4.10it/s]

{'loss': 1.4698, 'grad_norm': 1.9158647060394287, 'learning_rate': 0.0001787824636645223, 'epoch': 0.11}


 11%|█         | 1792/16798 [07:05<1:00:31,  4.13it/s]

{'loss': 1.5223, 'grad_norm': 1.9483880996704102, 'learning_rate': 0.00017877055039313796, 'epoch': 0.11}


 11%|█         | 1793/16798 [07:05<1:00:57,  4.10it/s]

{'loss': 1.7689, 'grad_norm': 2.2691502571105957, 'learning_rate': 0.00017875863712175366, 'epoch': 0.11}


 11%|█         | 1794/16798 [07:05<1:02:29,  4.00it/s]

{'loss': 1.265, 'grad_norm': 1.7903650999069214, 'learning_rate': 0.00017874672385036932, 'epoch': 0.11}


 11%|█         | 1795/16798 [07:05<59:33,  4.20it/s]  

{'loss': 1.5713, 'grad_norm': 2.1649985313415527, 'learning_rate': 0.000178734810578985, 'epoch': 0.11}


 11%|█         | 1796/16798 [07:06<58:23,  4.28it/s]

{'loss': 1.0599, 'grad_norm': 1.6135610342025757, 'learning_rate': 0.00017872289730760067, 'epoch': 0.11}


 11%|█         | 1798/16798 [07:06<56:45,  4.40it/s]

{'loss': 1.2229, 'grad_norm': 2.0537350177764893, 'learning_rate': 0.00017871098403621636, 'epoch': 0.11}


 11%|█         | 1798/16798 [07:06<56:45,  4.40it/s]

{'loss': 0.5258, 'grad_norm': 1.3240402936935425, 'learning_rate': 0.00017869907076483203, 'epoch': 0.11}


 11%|█         | 1799/16798 [07:06<59:39,  4.19it/s]

{'loss': 0.2325, 'grad_norm': 0.8038090467453003, 'learning_rate': 0.00017868715749344772, 'epoch': 0.11}


 11%|█         | 1800/16798 [07:07<1:01:34,  4.06it/s]

{'loss': 0.6528, 'grad_norm': 1.4234098196029663, 'learning_rate': 0.00017867524422206338, 'epoch': 0.11}


 11%|█         | 1801/16798 [07:07<1:00:51,  4.11it/s]

{'loss': 2.267, 'grad_norm': 1.9066517353057861, 'learning_rate': 0.00017866333095067907, 'epoch': 0.11}


 11%|█         | 1802/16798 [07:07<1:04:34,  3.87it/s]

{'loss': 2.3185, 'grad_norm': 2.0786242485046387, 'learning_rate': 0.00017865141767929474, 'epoch': 0.11}


 11%|█         | 1803/16798 [07:07<1:02:25,  4.00it/s]

{'loss': 2.0002, 'grad_norm': 1.806838870048523, 'learning_rate': 0.00017863950440791043, 'epoch': 0.11}


 11%|█         | 1804/16798 [07:08<1:02:15,  4.01it/s]

{'loss': 1.9139, 'grad_norm': 2.199655055999756, 'learning_rate': 0.0001786275911365261, 'epoch': 0.11}


 11%|█         | 1805/16798 [07:08<1:05:35,  3.81it/s]

{'loss': 1.8311, 'grad_norm': 1.6771074533462524, 'learning_rate': 0.00017861567786514178, 'epoch': 0.11}


 11%|█         | 1806/16798 [07:08<1:03:40,  3.92it/s]

{'loss': 1.869, 'grad_norm': 1.8285717964172363, 'learning_rate': 0.00017860376459375745, 'epoch': 0.11}


 11%|█         | 1807/16798 [07:08<1:02:22,  4.01it/s]

{'loss': 1.9482, 'grad_norm': 1.984452486038208, 'learning_rate': 0.00017859185132237314, 'epoch': 0.11}


 11%|█         | 1808/16798 [07:09<1:06:23,  3.76it/s]

{'loss': 1.7164, 'grad_norm': 1.781248688697815, 'learning_rate': 0.0001785799380509888, 'epoch': 0.11}


 11%|█         | 1809/16798 [07:09<1:02:44,  3.98it/s]

{'loss': 1.6155, 'grad_norm': 1.6209980249404907, 'learning_rate': 0.0001785680247796045, 'epoch': 0.11}


 11%|█         | 1810/16798 [07:09<1:01:32,  4.06it/s]

{'loss': 1.6009, 'grad_norm': 1.7526259422302246, 'learning_rate': 0.00017855611150822015, 'epoch': 0.11}


 11%|█         | 1811/16798 [07:09<1:00:36,  4.12it/s]

{'loss': 1.6602, 'grad_norm': 1.8014698028564453, 'learning_rate': 0.00017854419823683585, 'epoch': 0.11}


 11%|█         | 1812/16798 [07:10<1:02:43,  3.98it/s]

{'loss': 1.6777, 'grad_norm': 1.7609436511993408, 'learning_rate': 0.0001785322849654515, 'epoch': 0.11}


 11%|█         | 1813/16798 [07:10<1:01:00,  4.09it/s]

{'loss': 1.6194, 'grad_norm': 1.7155269384384155, 'learning_rate': 0.0001785203716940672, 'epoch': 0.11}


 11%|█         | 1814/16798 [07:10<1:07:21,  3.71it/s]

{'loss': 2.0893, 'grad_norm': 2.0694785118103027, 'learning_rate': 0.00017850845842268286, 'epoch': 0.11}


 11%|█         | 1815/16798 [07:10<1:06:35,  3.75it/s]

{'loss': 1.4684, 'grad_norm': 1.9037963151931763, 'learning_rate': 0.00017849654515129855, 'epoch': 0.11}


 11%|█         | 1816/16798 [07:11<1:05:40,  3.80it/s]

{'loss': 1.7478, 'grad_norm': 1.840114951133728, 'learning_rate': 0.00017848463187991424, 'epoch': 0.11}


 11%|█         | 1817/16798 [07:11<1:03:33,  3.93it/s]

{'loss': 1.2385, 'grad_norm': 1.778487205505371, 'learning_rate': 0.0001784727186085299, 'epoch': 0.11}


 11%|█         | 1818/16798 [07:11<1:04:11,  3.89it/s]

{'loss': 1.3578, 'grad_norm': 1.4307975769042969, 'learning_rate': 0.0001784608053371456, 'epoch': 0.11}


 11%|█         | 1820/16798 [07:12<58:57,  4.23it/s]  

{'loss': 1.5574, 'grad_norm': 1.6473419666290283, 'learning_rate': 0.00017844889206576126, 'epoch': 0.11}


 11%|█         | 1820/16798 [07:12<58:57,  4.23it/s]

{'loss': 1.4859, 'grad_norm': 1.8972439765930176, 'learning_rate': 0.00017843697879437695, 'epoch': 0.11}


 11%|█         | 1821/16798 [07:12<1:02:09,  4.02it/s]

{'loss': 1.7686, 'grad_norm': 1.7560075521469116, 'learning_rate': 0.00017842506552299262, 'epoch': 0.11}


 11%|█         | 1822/16798 [07:12<58:52,  4.24it/s]  

{'loss': 1.395, 'grad_norm': 1.6367664337158203, 'learning_rate': 0.0001784131522516083, 'epoch': 0.11}


 11%|█         | 1824/16798 [07:12<56:29,  4.42it/s]

{'loss': 1.7162, 'grad_norm': 1.7496718168258667, 'learning_rate': 0.00017840123898022397, 'epoch': 0.11}


 11%|█         | 1824/16798 [07:12<56:29,  4.42it/s]

{'loss': 1.3787, 'grad_norm': 1.6034694910049438, 'learning_rate': 0.00017838932570883966, 'epoch': 0.11}


 11%|█         | 1825/16798 [07:13<57:21,  4.35it/s]

{'loss': 1.5665, 'grad_norm': 2.0599396228790283, 'learning_rate': 0.00017837741243745533, 'epoch': 0.11}


 11%|█         | 1826/16798 [07:13<1:00:57,  4.09it/s]

{'loss': 2.0753, 'grad_norm': 2.1895456314086914, 'learning_rate': 0.00017836549916607102, 'epoch': 0.11}


 11%|█         | 1827/16798 [07:13<58:16,  4.28it/s]  

{'loss': 1.2737, 'grad_norm': 1.865860939025879, 'learning_rate': 0.00017835358589468668, 'epoch': 0.11}


 11%|█         | 1828/16798 [07:13<57:46,  4.32it/s]

{'loss': 1.9842, 'grad_norm': 2.3926444053649902, 'learning_rate': 0.00017834167262330237, 'epoch': 0.11}


 11%|█         | 1829/16798 [07:14<57:20,  4.35it/s]

{'loss': 1.4173, 'grad_norm': 1.7909706830978394, 'learning_rate': 0.00017832975935191803, 'epoch': 0.11}


 11%|█         | 1830/16798 [07:14<1:00:15,  4.14it/s]

{'loss': 1.5393, 'grad_norm': 1.7546366453170776, 'learning_rate': 0.00017831784608053373, 'epoch': 0.11}


 11%|█         | 1831/16798 [07:14<58:27,  4.27it/s]  

{'loss': 1.3076, 'grad_norm': 1.7815182209014893, 'learning_rate': 0.0001783059328091494, 'epoch': 0.11}


 11%|█         | 1832/16798 [07:14<57:06,  4.37it/s]

{'loss': 1.6251, 'grad_norm': 2.116565465927124, 'learning_rate': 0.00017829401953776508, 'epoch': 0.11}


 11%|█         | 1833/16798 [07:15<59:36,  4.18it/s]

{'loss': 1.2466, 'grad_norm': 1.8016347885131836, 'learning_rate': 0.00017828210626638074, 'epoch': 0.11}


 11%|█         | 1834/16798 [07:15<1:02:41,  3.98it/s]

{'loss': 1.9719, 'grad_norm': 1.974491000175476, 'learning_rate': 0.00017827019299499643, 'epoch': 0.11}


 11%|█         | 1835/16798 [07:15<1:01:40,  4.04it/s]

{'loss': 0.9761, 'grad_norm': 1.9218522310256958, 'learning_rate': 0.0001782582797236121, 'epoch': 0.11}


 11%|█         | 1836/16798 [07:15<59:43,  4.18it/s]  

{'loss': 1.4037, 'grad_norm': 1.7238003015518188, 'learning_rate': 0.0001782463664522278, 'epoch': 0.11}


 11%|█         | 1837/16798 [07:16<1:03:17,  3.94it/s]

{'loss': 1.5451, 'grad_norm': 1.984591007232666, 'learning_rate': 0.00017823445318084345, 'epoch': 0.11}


 11%|█         | 1838/16798 [07:16<1:03:00,  3.96it/s]

{'loss': 1.5282, 'grad_norm': 1.901770830154419, 'learning_rate': 0.00017822253990945914, 'epoch': 0.11}


 11%|█         | 1839/16798 [07:16<1:01:12,  4.07it/s]

{'loss': 1.0024, 'grad_norm': 1.7802925109863281, 'learning_rate': 0.0001782106266380748, 'epoch': 0.11}


 11%|█         | 1840/16798 [07:16<1:01:45,  4.04it/s]

{'loss': 1.0217, 'grad_norm': 1.535770058631897, 'learning_rate': 0.0001781987133666905, 'epoch': 0.11}


 11%|█         | 1841/16798 [07:17<1:07:29,  3.69it/s]

{'loss': 0.5658, 'grad_norm': 1.5068680047988892, 'learning_rate': 0.00017818680009530616, 'epoch': 0.11}


 11%|█         | 1842/16798 [07:17<1:06:08,  3.77it/s]

{'loss': 1.0865, 'grad_norm': 1.6652164459228516, 'learning_rate': 0.00017817488682392185, 'epoch': 0.11}


 11%|█         | 1843/16798 [07:17<1:02:52,  3.96it/s]

{'loss': 1.1309, 'grad_norm': 1.6784809827804565, 'learning_rate': 0.00017816297355253752, 'epoch': 0.11}


 11%|█         | 1844/16798 [07:17<1:04:10,  3.88it/s]

{'loss': 1.0735, 'grad_norm': 1.7824738025665283, 'learning_rate': 0.0001781510602811532, 'epoch': 0.11}


 11%|█         | 1845/16798 [07:18<1:02:17,  4.00it/s]

{'loss': 1.1052, 'grad_norm': 1.4973111152648926, 'learning_rate': 0.00017813914700976887, 'epoch': 0.11}


 11%|█         | 1846/16798 [07:18<1:02:16,  4.00it/s]

{'loss': 0.8444, 'grad_norm': 1.5815056562423706, 'learning_rate': 0.00017812723373838456, 'epoch': 0.11}


 11%|█         | 1847/16798 [07:18<1:03:38,  3.92it/s]

{'loss': 0.5236, 'grad_norm': 1.2159128189086914, 'learning_rate': 0.00017811532046700025, 'epoch': 0.11}


 11%|█         | 1848/16798 [07:18<1:00:15,  4.13it/s]

{'loss': 0.8661, 'grad_norm': 1.4646804332733154, 'learning_rate': 0.00017810340719561594, 'epoch': 0.11}


 11%|█         | 1849/16798 [07:19<59:52,  4.16it/s]  

{'loss': 0.2557, 'grad_norm': 0.9001687169075012, 'learning_rate': 0.0001780914939242316, 'epoch': 0.11}


 11%|█         | 1850/16798 [07:19<1:02:25,  3.99it/s]

{'loss': 0.2772, 'grad_norm': 0.8605404496192932, 'learning_rate': 0.0001780795806528473, 'epoch': 0.11}


 11%|█         | 1851/16798 [07:19<59:51,  4.16it/s]  

{'loss': 2.0388, 'grad_norm': 1.7539869546890259, 'learning_rate': 0.00017806766738146296, 'epoch': 0.11}


 11%|█         | 1853/16798 [07:20<57:13,  4.35it/s]

{'loss': 2.2693, 'grad_norm': 1.9266895055770874, 'learning_rate': 0.00017805575411007865, 'epoch': 0.11}


 11%|█         | 1853/16798 [07:20<57:13,  4.35it/s]

{'loss': 2.3276, 'grad_norm': 1.8468258380889893, 'learning_rate': 0.00017804384083869432, 'epoch': 0.11}


 11%|█         | 1854/16798 [07:20<58:47,  4.24it/s]

{'loss': 1.7161, 'grad_norm': 1.8767528533935547, 'learning_rate': 0.00017803192756731, 'epoch': 0.11}


 11%|█         | 1855/16798 [07:20<1:01:30,  4.05it/s]

{'loss': 1.9365, 'grad_norm': 1.9109069108963013, 'learning_rate': 0.00017802001429592567, 'epoch': 0.11}


 11%|█         | 1856/16798 [07:20<58:49,  4.23it/s]  

{'loss': 2.2604, 'grad_norm': 1.9207241535186768, 'learning_rate': 0.00017800810102454136, 'epoch': 0.11}


 11%|█         | 1857/16798 [07:21<57:07,  4.36it/s]

{'loss': 2.2268, 'grad_norm': 2.1979849338531494, 'learning_rate': 0.00017799618775315702, 'epoch': 0.11}


 11%|█         | 1858/16798 [07:21<57:47,  4.31it/s]

{'loss': 1.7076, 'grad_norm': 1.8676785230636597, 'learning_rate': 0.00017798427448177271, 'epoch': 0.11}


 11%|█         | 1859/16798 [07:21<1:01:06,  4.07it/s]

{'loss': 1.6562, 'grad_norm': 1.7184303998947144, 'learning_rate': 0.00017797236121038838, 'epoch': 0.11}


 11%|█         | 1860/16798 [07:21<59:02,  4.22it/s]  

{'loss': 1.6471, 'grad_norm': 1.9890137910842896, 'learning_rate': 0.00017796044793900407, 'epoch': 0.11}


 11%|█         | 1862/16798 [07:22<55:50,  4.46it/s]

{'loss': 1.4508, 'grad_norm': 1.6550023555755615, 'learning_rate': 0.00017794853466761973, 'epoch': 0.11}


 11%|█         | 1863/16798 [07:22<54:50,  4.54it/s]

{'loss': 1.9666, 'grad_norm': 1.9421087503433228, 'learning_rate': 0.00017793662139623542, 'epoch': 0.11}


 11%|█         | 1863/16798 [07:22<54:50,  4.54it/s]

{'loss': 1.7554, 'grad_norm': 1.9572761058807373, 'learning_rate': 0.0001779247081248511, 'epoch': 0.11}


 11%|█         | 1864/16798 [07:22<59:19,  4.20it/s]

{'loss': 1.4078, 'grad_norm': 1.5592000484466553, 'learning_rate': 0.00017791279485346678, 'epoch': 0.11}


 11%|█         | 1865/16798 [07:22<1:02:23,  3.99it/s]

{'loss': 1.3764, 'grad_norm': 1.721300482749939, 'learning_rate': 0.00017790088158208244, 'epoch': 0.11}


 11%|█         | 1866/16798 [07:23<1:00:00,  4.15it/s]

{'loss': 1.3715, 'grad_norm': 1.848651647567749, 'learning_rate': 0.00017788896831069813, 'epoch': 0.11}


 11%|█         | 1867/16798 [07:23<1:02:42,  3.97it/s]

{'loss': 1.6872, 'grad_norm': 1.9179595708847046, 'learning_rate': 0.0001778770550393138, 'epoch': 0.11}


 11%|█         | 1868/16798 [07:23<1:02:31,  3.98it/s]

{'loss': 1.483, 'grad_norm': 1.7250398397445679, 'learning_rate': 0.0001778651417679295, 'epoch': 0.11}


 11%|█         | 1869/16798 [07:23<1:00:08,  4.14it/s]

{'loss': 2.2442, 'grad_norm': 2.2903366088867188, 'learning_rate': 0.00017785322849654515, 'epoch': 0.11}


 11%|█         | 1870/16798 [07:24<1:00:29,  4.11it/s]

{'loss': 1.6292, 'grad_norm': 2.2086453437805176, 'learning_rate': 0.00017784131522516084, 'epoch': 0.11}


 11%|█         | 1871/16798 [07:24<1:04:01,  3.89it/s]

{'loss': 1.5365, 'grad_norm': 1.875918984413147, 'learning_rate': 0.0001778294019537765, 'epoch': 0.11}


 11%|█         | 1872/16798 [07:24<1:01:22,  4.05it/s]

{'loss': 1.6786, 'grad_norm': 1.9796286821365356, 'learning_rate': 0.0001778174886823922, 'epoch': 0.11}


 11%|█         | 1873/16798 [07:24<59:23,  4.19it/s]  

{'loss': 1.4595, 'grad_norm': 1.9387245178222656, 'learning_rate': 0.00017780557541100786, 'epoch': 0.11}


 11%|█         | 1874/16798 [07:25<58:28,  4.25it/s]

{'loss': 1.6626, 'grad_norm': 2.346951723098755, 'learning_rate': 0.00017779366213962355, 'epoch': 0.11}


 11%|█         | 1875/16798 [07:25<1:01:13,  4.06it/s]

{'loss': 1.6576, 'grad_norm': 2.1891849040985107, 'learning_rate': 0.00017778174886823921, 'epoch': 0.11}


 11%|█         | 1877/16798 [07:25<58:55,  4.22it/s]  

{'loss': 1.3331, 'grad_norm': 1.734277367591858, 'learning_rate': 0.0001777698355968549, 'epoch': 0.11}


 11%|█         | 1877/16798 [07:25<58:55,  4.22it/s]

{'loss': 1.5785, 'grad_norm': 2.0290870666503906, 'learning_rate': 0.00017775792232547057, 'epoch': 0.11}


 11%|█         | 1878/16798 [07:26<1:01:19,  4.05it/s]

{'loss': 1.306, 'grad_norm': 1.895457148551941, 'learning_rate': 0.00017774600905408626, 'epoch': 0.11}


 11%|█         | 1879/16798 [07:26<1:06:19,  3.75it/s]

{'loss': 1.572, 'grad_norm': 2.136821985244751, 'learning_rate': 0.00017773409578270195, 'epoch': 0.11}


 11%|█         | 1880/16798 [07:26<1:02:27,  3.98it/s]

{'loss': 1.1261, 'grad_norm': 1.5074052810668945, 'learning_rate': 0.0001777221825113176, 'epoch': 0.11}


 11%|█         | 1881/16798 [07:26<59:53,  4.15it/s]  

{'loss': 1.4338, 'grad_norm': 2.2041406631469727, 'learning_rate': 0.0001777102692399333, 'epoch': 0.11}


 11%|█         | 1882/16798 [07:27<1:00:39,  4.10it/s]

{'loss': 1.2045, 'grad_norm': 1.9665656089782715, 'learning_rate': 0.00017769835596854897, 'epoch': 0.11}


 11%|█         | 1883/16798 [07:27<1:00:32,  4.11it/s]

{'loss': 1.3668, 'grad_norm': 1.844788908958435, 'learning_rate': 0.00017768644269716466, 'epoch': 0.11}


 11%|█         | 1884/16798 [07:27<1:01:05,  4.07it/s]

{'loss': 1.0666, 'grad_norm': 2.076491355895996, 'learning_rate': 0.00017767452942578032, 'epoch': 0.11}


 11%|█         | 1885/16798 [07:27<1:00:02,  4.14it/s]

{'loss': 1.29, 'grad_norm': 1.7017347812652588, 'learning_rate': 0.000177662616154396, 'epoch': 0.11}


 11%|█         | 1886/16798 [07:28<1:03:36,  3.91it/s]

{'loss': 1.7674, 'grad_norm': 2.3556478023529053, 'learning_rate': 0.00017765070288301168, 'epoch': 0.11}


 11%|█         | 1887/16798 [07:28<1:04:38,  3.84it/s]

{'loss': 1.1204, 'grad_norm': 1.6067218780517578, 'learning_rate': 0.00017763878961162737, 'epoch': 0.11}


 11%|█         | 1888/16798 [07:28<1:02:32,  3.97it/s]

{'loss': 1.3967, 'grad_norm': 1.7350964546203613, 'learning_rate': 0.00017762687634024303, 'epoch': 0.11}


 11%|█         | 1889/16798 [07:28<1:04:56,  3.83it/s]

{'loss': 1.6287, 'grad_norm': 2.0639660358428955, 'learning_rate': 0.00017761496306885872, 'epoch': 0.11}


 11%|█▏        | 1890/16798 [07:29<1:05:26,  3.80it/s]

{'loss': 1.1272, 'grad_norm': 2.061960458755493, 'learning_rate': 0.00017760304979747439, 'epoch': 0.11}


 11%|█▏        | 1891/16798 [07:29<1:02:13,  3.99it/s]

{'loss': 1.2873, 'grad_norm': 1.8471413850784302, 'learning_rate': 0.00017759113652609008, 'epoch': 0.11}


 11%|█▏        | 1892/16798 [07:29<1:00:05,  4.13it/s]

{'loss': 1.2988, 'grad_norm': 1.8118751049041748, 'learning_rate': 0.00017757922325470574, 'epoch': 0.11}


 11%|█▏        | 1893/16798 [07:29<59:23,  4.18it/s]  

{'loss': 1.2254, 'grad_norm': 1.6672261953353882, 'learning_rate': 0.00017756730998332143, 'epoch': 0.11}


 11%|█▏        | 1894/16798 [07:30<1:01:03,  4.07it/s]

{'loss': 1.5211, 'grad_norm': 2.261275053024292, 'learning_rate': 0.0001775553967119371, 'epoch': 0.11}


 11%|█▏        | 1895/16798 [07:30<59:32,  4.17it/s]  

{'loss': 1.1078, 'grad_norm': 2.0379245281219482, 'learning_rate': 0.00017754348344055278, 'epoch': 0.11}


 11%|█▏        | 1896/16798 [07:30<59:30,  4.17it/s]

{'loss': 1.2182, 'grad_norm': 1.7399473190307617, 'learning_rate': 0.00017753157016916845, 'epoch': 0.11}


 11%|█▏        | 1897/16798 [07:30<1:00:16,  4.12it/s]

{'loss': 0.974, 'grad_norm': 1.6535190343856812, 'learning_rate': 0.00017751965689778414, 'epoch': 0.11}


 11%|█▏        | 1898/16798 [07:31<1:02:26,  3.98it/s]

{'loss': 0.7436, 'grad_norm': 1.584513545036316, 'learning_rate': 0.0001775077436263998, 'epoch': 0.11}


 11%|█▏        | 1900/16798 [07:31<58:21,  4.25it/s]  

{'loss': 0.4224, 'grad_norm': 1.1357930898666382, 'learning_rate': 0.0001774958303550155, 'epoch': 0.11}


 11%|█▏        | 1900/16798 [07:31<58:21,  4.25it/s]

{'loss': 0.8876, 'grad_norm': 1.5629066228866577, 'learning_rate': 0.00017748391708363116, 'epoch': 0.11}


 11%|█▏        | 1901/16798 [07:31<59:15,  4.19it/s]

{'loss': 1.6641, 'grad_norm': 1.5955227613449097, 'learning_rate': 0.00017747200381224685, 'epoch': 0.11}


 11%|█▏        | 1902/16798 [07:32<1:05:43,  3.78it/s]

{'loss': 1.8402, 'grad_norm': 1.692521333694458, 'learning_rate': 0.0001774600905408625, 'epoch': 0.11}


 11%|█▏        | 1903/16798 [07:32<1:02:35,  3.97it/s]

{'loss': 1.979, 'grad_norm': 2.23746395111084, 'learning_rate': 0.0001774481772694782, 'epoch': 0.11}


 11%|█▏        | 1904/16798 [07:32<1:02:11,  3.99it/s]

{'loss': 1.7177, 'grad_norm': 1.6193958520889282, 'learning_rate': 0.00017743626399809387, 'epoch': 0.11}


 11%|█▏        | 1905/16798 [07:32<1:04:29,  3.85it/s]

{'loss': 1.7072, 'grad_norm': 1.790322184562683, 'learning_rate': 0.00017742435072670956, 'epoch': 0.11}


 11%|█▏        | 1906/16798 [07:33<1:02:10,  3.99it/s]

{'loss': 1.8126, 'grad_norm': 2.075077533721924, 'learning_rate': 0.00017741243745532522, 'epoch': 0.11}


 11%|█▏        | 1907/16798 [07:33<1:03:05,  3.93it/s]

{'loss': 1.4294, 'grad_norm': 1.7612107992172241, 'learning_rate': 0.0001774005241839409, 'epoch': 0.11}


 11%|█▏        | 1908/16798 [07:33<1:05:58,  3.76it/s]

{'loss': 2.0098, 'grad_norm': 2.3423056602478027, 'learning_rate': 0.0001773886109125566, 'epoch': 0.11}


 11%|█▏        | 1909/16798 [07:33<1:03:54,  3.88it/s]

{'loss': 1.7049, 'grad_norm': 1.9462103843688965, 'learning_rate': 0.0001773766976411723, 'epoch': 0.11}


 11%|█▏        | 1910/16798 [07:34<1:03:47,  3.89it/s]

{'loss': 1.7185, 'grad_norm': 1.793404221534729, 'learning_rate': 0.00017736478436978796, 'epoch': 0.11}


 11%|█▏        | 1911/16798 [07:34<1:06:17,  3.74it/s]

{'loss': 1.9035, 'grad_norm': 2.1426069736480713, 'learning_rate': 0.00017735287109840365, 'epoch': 0.11}


 11%|█▏        | 1912/16798 [07:34<1:02:52,  3.95it/s]

{'loss': 1.662, 'grad_norm': 2.2145638465881348, 'learning_rate': 0.0001773409578270193, 'epoch': 0.11}


 11%|█▏        | 1913/16798 [07:34<1:01:41,  4.02it/s]

{'loss': 1.3504, 'grad_norm': 1.5647817850112915, 'learning_rate': 0.000177329044555635, 'epoch': 0.11}


 11%|█▏        | 1914/16798 [07:35<1:08:35,  3.62it/s]

{'loss': 1.7301, 'grad_norm': 1.8801219463348389, 'learning_rate': 0.00017731713128425067, 'epoch': 0.11}


 11%|█▏        | 1915/16798 [07:35<1:05:02,  3.81it/s]

{'loss': 1.8008, 'grad_norm': 2.060107469558716, 'learning_rate': 0.00017730521801286636, 'epoch': 0.11}


 11%|█▏        | 1916/16798 [07:35<1:02:56,  3.94it/s]

{'loss': 1.4616, 'grad_norm': 1.918971061706543, 'learning_rate': 0.00017729330474148202, 'epoch': 0.11}


 11%|█▏        | 1917/16798 [07:35<1:05:10,  3.81it/s]

{'loss': 1.4512, 'grad_norm': 1.7499758005142212, 'learning_rate': 0.0001772813914700977, 'epoch': 0.11}


 11%|█▏        | 1918/16798 [07:36<1:02:31,  3.97it/s]

{'loss': 1.5171, 'grad_norm': 1.802801251411438, 'learning_rate': 0.00017726947819871337, 'epoch': 0.11}


 11%|█▏        | 1919/16798 [07:36<1:00:44,  4.08it/s]

{'loss': 1.0853, 'grad_norm': 2.1448609828948975, 'learning_rate': 0.00017725756492732907, 'epoch': 0.11}


 11%|█▏        | 1920/16798 [07:36<1:00:05,  4.13it/s]

{'loss': 1.3925, 'grad_norm': 1.7673152685165405, 'learning_rate': 0.00017724565165594473, 'epoch': 0.11}


 11%|█▏        | 1921/16798 [07:36<1:01:40,  4.02it/s]

{'loss': 1.5708, 'grad_norm': 1.6637985706329346, 'learning_rate': 0.00017723373838456042, 'epoch': 0.11}


 11%|█▏        | 1922/16798 [07:37<1:03:18,  3.92it/s]

{'loss': 1.3231, 'grad_norm': 1.776971697807312, 'learning_rate': 0.00017722182511317608, 'epoch': 0.11}


 11%|█▏        | 1923/16798 [07:37<1:02:43,  3.95it/s]

{'loss': 1.3525, 'grad_norm': 1.756332516670227, 'learning_rate': 0.00017720991184179177, 'epoch': 0.11}


 11%|█▏        | 1924/16798 [07:37<1:05:09,  3.80it/s]

{'loss': 1.3508, 'grad_norm': 2.0251340866088867, 'learning_rate': 0.00017719799857040744, 'epoch': 0.11}


 11%|█▏        | 1925/16798 [07:37<1:02:29,  3.97it/s]

{'loss': 1.5142, 'grad_norm': 1.7409758567810059, 'learning_rate': 0.00017718608529902313, 'epoch': 0.11}


 11%|█▏        | 1926/16798 [07:38<1:02:25,  3.97it/s]

{'loss': 1.1932, 'grad_norm': 1.5059046745300293, 'learning_rate': 0.0001771741720276388, 'epoch': 0.11}


 11%|█▏        | 1927/16798 [07:38<1:01:48,  4.01it/s]

{'loss': 0.932, 'grad_norm': 1.4428091049194336, 'learning_rate': 0.00017716225875625448, 'epoch': 0.11}


 11%|█▏        | 1928/16798 [07:38<1:03:11,  3.92it/s]

{'loss': 1.5096, 'grad_norm': 1.7599728107452393, 'learning_rate': 0.00017715034548487015, 'epoch': 0.11}


 11%|█▏        | 1929/16798 [07:39<1:03:43,  3.89it/s]

{'loss': 1.2588, 'grad_norm': 1.7902467250823975, 'learning_rate': 0.00017713843221348584, 'epoch': 0.11}


 11%|█▏        | 1930/16798 [07:39<1:03:16,  3.92it/s]

{'loss': 1.3729, 'grad_norm': 1.9074746370315552, 'learning_rate': 0.0001771265189421015, 'epoch': 0.11}


 11%|█▏        | 1931/16798 [07:39<1:01:01,  4.06it/s]

{'loss': 1.634, 'grad_norm': 1.837876319885254, 'learning_rate': 0.0001771146056707172, 'epoch': 0.11}


 12%|█▏        | 1932/16798 [07:39<1:05:20,  3.79it/s]

{'loss': 1.8657, 'grad_norm': 2.3169476985931396, 'learning_rate': 0.00017710269239933286, 'epoch': 0.12}


 12%|█▏        | 1934/16798 [07:40<1:00:04,  4.12it/s]

{'loss': 1.4907, 'grad_norm': 2.042832374572754, 'learning_rate': 0.00017709077912794855, 'epoch': 0.12}


 12%|█▏        | 1934/16798 [07:40<1:00:04,  4.12it/s]

{'loss': 1.3067, 'grad_norm': 1.678472876548767, 'learning_rate': 0.0001770788658565642, 'epoch': 0.12}


 12%|█▏        | 1935/16798 [07:40<59:08,  4.19it/s]  

{'loss': 1.0784, 'grad_norm': 1.645372748374939, 'learning_rate': 0.0001770669525851799, 'epoch': 0.12}


 12%|█▏        | 1936/16798 [07:40<1:02:20,  3.97it/s]

{'loss': 1.4465, 'grad_norm': 2.107673168182373, 'learning_rate': 0.00017705503931379556, 'epoch': 0.12}


 12%|█▏        | 1937/16798 [07:40<1:01:12,  4.05it/s]

{'loss': 1.3778, 'grad_norm': 1.7485253810882568, 'learning_rate': 0.00017704312604241125, 'epoch': 0.12}


 12%|█▏        | 1938/16798 [07:41<1:01:56,  4.00it/s]

{'loss': 1.085, 'grad_norm': 1.5728799104690552, 'learning_rate': 0.00017703121277102692, 'epoch': 0.12}


 12%|█▏        | 1939/16798 [07:41<1:05:50,  3.76it/s]

{'loss': 1.6319, 'grad_norm': 2.367344379425049, 'learning_rate': 0.0001770192994996426, 'epoch': 0.12}


 12%|█▏        | 1940/16798 [07:41<1:02:42,  3.95it/s]

{'loss': 1.0012, 'grad_norm': 1.7243846654891968, 'learning_rate': 0.0001770073862282583, 'epoch': 0.12}


 12%|█▏        | 1941/16798 [07:41<1:00:39,  4.08it/s]

{'loss': 1.1443, 'grad_norm': 2.6916444301605225, 'learning_rate': 0.00017699547295687396, 'epoch': 0.12}


 12%|█▏        | 1942/16798 [07:42<1:00:17,  4.11it/s]

{'loss': 0.9347, 'grad_norm': 1.5101008415222168, 'learning_rate': 0.00017698355968548965, 'epoch': 0.12}


 12%|█▏        | 1943/16798 [07:42<1:02:39,  3.95it/s]

{'loss': 1.1015, 'grad_norm': 1.785042405128479, 'learning_rate': 0.00017697164641410532, 'epoch': 0.12}


 12%|█▏        | 1944/16798 [07:42<1:01:12,  4.04it/s]

{'loss': 1.1523, 'grad_norm': 1.703055739402771, 'learning_rate': 0.000176959733142721, 'epoch': 0.12}


 12%|█▏        | 1945/16798 [07:42<58:27,  4.23it/s]  

{'loss': 1.061, 'grad_norm': 1.799363136291504, 'learning_rate': 0.00017694781987133667, 'epoch': 0.12}


 12%|█▏        | 1946/16798 [07:43<1:04:57,  3.81it/s]

{'loss': 0.8648, 'grad_norm': 1.3833272457122803, 'learning_rate': 0.00017693590659995236, 'epoch': 0.12}


 12%|█▏        | 1947/16798 [07:43<1:02:22,  3.97it/s]

{'loss': 0.93, 'grad_norm': 1.6200941801071167, 'learning_rate': 0.00017692399332856803, 'epoch': 0.12}


 12%|█▏        | 1948/16798 [07:43<1:00:47,  4.07it/s]

{'loss': 0.5783, 'grad_norm': 1.2056316137313843, 'learning_rate': 0.00017691208005718372, 'epoch': 0.12}


 12%|█▏        | 1949/16798 [07:43<59:31,  4.16it/s]  

{'loss': 0.7325, 'grad_norm': 1.3241504430770874, 'learning_rate': 0.00017690016678579938, 'epoch': 0.12}


 12%|█▏        | 1950/16798 [07:44<1:00:26,  4.09it/s]

{'loss': 0.496, 'grad_norm': 1.0750393867492676, 'learning_rate': 0.00017688825351441507, 'epoch': 0.12}


 12%|█▏        | 1951/16798 [07:44<1:01:52,  4.00it/s]

{'loss': 2.1275, 'grad_norm': 1.7997220754623413, 'learning_rate': 0.00017687634024303074, 'epoch': 0.12}


 12%|█▏        | 1952/16798 [07:44<1:00:49,  4.07it/s]

{'loss': 2.1155, 'grad_norm': 1.9460773468017578, 'learning_rate': 0.00017686442697164643, 'epoch': 0.12}


 12%|█▏        | 1953/16798 [07:44<1:03:18,  3.91it/s]

{'loss': 1.6791, 'grad_norm': 1.8001831769943237, 'learning_rate': 0.0001768525137002621, 'epoch': 0.12}


 12%|█▏        | 1954/16798 [07:45<1:00:47,  4.07it/s]

{'loss': 1.9256, 'grad_norm': 1.906087875366211, 'learning_rate': 0.00017684060042887778, 'epoch': 0.12}


 12%|█▏        | 1955/16798 [07:45<58:54,  4.20it/s]  

{'loss': 1.9029, 'grad_norm': 1.908978819847107, 'learning_rate': 0.00017682868715749344, 'epoch': 0.12}


 12%|█▏        | 1956/16798 [07:45<59:28,  4.16it/s]

{'loss': 1.906, 'grad_norm': 2.0322582721710205, 'learning_rate': 0.00017681677388610914, 'epoch': 0.12}


 12%|█▏        | 1957/16798 [07:45<1:02:57,  3.93it/s]

{'loss': 1.8147, 'grad_norm': 2.0301313400268555, 'learning_rate': 0.0001768048606147248, 'epoch': 0.12}


 12%|█▏        | 1958/16798 [07:46<1:00:46,  4.07it/s]

{'loss': 1.7503, 'grad_norm': 2.1468915939331055, 'learning_rate': 0.0001767929473433405, 'epoch': 0.12}


 12%|█▏        | 1959/16798 [07:46<1:03:11,  3.91it/s]

{'loss': 1.4228, 'grad_norm': 2.0140905380249023, 'learning_rate': 0.00017678103407195615, 'epoch': 0.12}


 12%|█▏        | 1960/16798 [07:46<1:04:24,  3.84it/s]

{'loss': 1.5512, 'grad_norm': 2.001720905303955, 'learning_rate': 0.00017676912080057184, 'epoch': 0.12}


 12%|█▏        | 1961/16798 [07:47<1:05:39,  3.77it/s]

{'loss': 1.7371, 'grad_norm': 2.0289435386657715, 'learning_rate': 0.0001767572075291875, 'epoch': 0.12}


 12%|█▏        | 1962/16798 [07:47<1:02:48,  3.94it/s]

{'loss': 1.5279, 'grad_norm': 1.6616723537445068, 'learning_rate': 0.0001767452942578032, 'epoch': 0.12}


 12%|█▏        | 1963/16798 [07:47<1:01:33,  4.02it/s]

{'loss': 2.0158, 'grad_norm': 1.9282866716384888, 'learning_rate': 0.00017673338098641886, 'epoch': 0.12}


 12%|█▏        | 1964/16798 [07:47<1:05:19,  3.78it/s]

{'loss': 0.8774, 'grad_norm': 1.2726304531097412, 'learning_rate': 0.00017672146771503455, 'epoch': 0.12}


 12%|█▏        | 1965/16798 [07:48<1:02:02,  3.98it/s]

{'loss': 1.4623, 'grad_norm': 1.7887593507766724, 'learning_rate': 0.00017670955444365022, 'epoch': 0.12}


 12%|█▏        | 1966/16798 [07:48<59:35,  4.15it/s]  

{'loss': 1.6834, 'grad_norm': 1.6728280782699585, 'learning_rate': 0.0001766976411722659, 'epoch': 0.12}


 12%|█▏        | 1967/16798 [07:48<1:01:56,  3.99it/s]

{'loss': 1.3693, 'grad_norm': 1.4989155530929565, 'learning_rate': 0.00017668572790088157, 'epoch': 0.12}


 12%|█▏        | 1968/16798 [07:48<1:00:43,  4.07it/s]

{'loss': 1.7091, 'grad_norm': 1.9278013706207275, 'learning_rate': 0.00017667381462949726, 'epoch': 0.12}


 12%|█▏        | 1969/16798 [07:48<59:21,  4.16it/s]  

{'loss': 1.7201, 'grad_norm': 2.1532065868377686, 'learning_rate': 0.00017666190135811293, 'epoch': 0.12}


 12%|█▏        | 1970/16798 [07:49<1:02:22,  3.96it/s]

{'loss': 1.8129, 'grad_norm': 1.850159764289856, 'learning_rate': 0.00017664998808672864, 'epoch': 0.12}


 12%|█▏        | 1971/16798 [07:49<1:01:57,  3.99it/s]

{'loss': 1.6023, 'grad_norm': 1.8728080987930298, 'learning_rate': 0.0001766380748153443, 'epoch': 0.12}


 12%|█▏        | 1972/16798 [07:49<1:01:31,  4.02it/s]

{'loss': 1.115, 'grad_norm': 1.8401501178741455, 'learning_rate': 0.00017662616154396, 'epoch': 0.12}


 12%|█▏        | 1973/16798 [07:50<1:06:09,  3.73it/s]

{'loss': 1.5053, 'grad_norm': 1.5950530767440796, 'learning_rate': 0.00017661424827257566, 'epoch': 0.12}


 12%|█▏        | 1974/16798 [07:50<1:03:08,  3.91it/s]

{'loss': 1.7958, 'grad_norm': 1.7900757789611816, 'learning_rate': 0.00017660233500119135, 'epoch': 0.12}


 12%|█▏        | 1975/16798 [07:50<1:03:26,  3.89it/s]

{'loss': 1.5262, 'grad_norm': 1.6568421125411987, 'learning_rate': 0.00017659042172980702, 'epoch': 0.12}


 12%|█▏        | 1976/16798 [07:50<1:05:07,  3.79it/s]

{'loss': 1.2999, 'grad_norm': 2.1348416805267334, 'learning_rate': 0.0001765785084584227, 'epoch': 0.12}


 12%|█▏        | 1977/16798 [07:51<1:02:54,  3.93it/s]

{'loss': 0.9508, 'grad_norm': 1.7744343280792236, 'learning_rate': 0.00017656659518703837, 'epoch': 0.12}


 12%|█▏        | 1978/16798 [07:51<1:03:06,  3.91it/s]

{'loss': 1.9413, 'grad_norm': 1.9656749963760376, 'learning_rate': 0.00017655468191565406, 'epoch': 0.12}


 12%|█▏        | 1979/16798 [07:51<1:04:26,  3.83it/s]

{'loss': 1.6184, 'grad_norm': 1.7261803150177002, 'learning_rate': 0.00017654276864426972, 'epoch': 0.12}


 12%|█▏        | 1980/16798 [07:51<1:01:59,  3.98it/s]

{'loss': 1.4382, 'grad_norm': 1.6951847076416016, 'learning_rate': 0.00017653085537288542, 'epoch': 0.12}


 12%|█▏        | 1981/16798 [07:52<1:00:35,  4.08it/s]

{'loss': 1.2659, 'grad_norm': 1.7553132772445679, 'learning_rate': 0.00017651894210150108, 'epoch': 0.12}


 12%|█▏        | 1982/16798 [07:52<1:00:33,  4.08it/s]

{'loss': 1.13, 'grad_norm': 1.4568434953689575, 'learning_rate': 0.00017650702883011677, 'epoch': 0.12}


 12%|█▏        | 1983/16798 [07:52<1:02:40,  3.94it/s]

{'loss': 1.3171, 'grad_norm': 1.9436384439468384, 'learning_rate': 0.00017649511555873243, 'epoch': 0.12}


 12%|█▏        | 1984/16798 [07:52<1:00:52,  4.06it/s]

{'loss': 1.2172, 'grad_norm': 1.69074547290802, 'learning_rate': 0.00017648320228734812, 'epoch': 0.12}


 12%|█▏        | 1985/16798 [07:53<1:00:04,  4.11it/s]

{'loss': 1.4167, 'grad_norm': 1.8568371534347534, 'learning_rate': 0.0001764712890159638, 'epoch': 0.12}


 12%|█▏        | 1986/16798 [07:53<1:03:37,  3.88it/s]

{'loss': 1.4891, 'grad_norm': 1.8647129535675049, 'learning_rate': 0.00017645937574457948, 'epoch': 0.12}


 12%|█▏        | 1987/16798 [07:53<1:01:52,  3.99it/s]

{'loss': 1.3485, 'grad_norm': 2.5453317165374756, 'learning_rate': 0.00017644746247319514, 'epoch': 0.12}


 12%|█▏        | 1988/16798 [07:53<1:03:42,  3.87it/s]

{'loss': 0.969, 'grad_norm': 1.4508053064346313, 'learning_rate': 0.00017643554920181083, 'epoch': 0.12}


 12%|█▏        | 1989/16798 [07:54<1:03:03,  3.91it/s]

{'loss': 1.1614, 'grad_norm': 1.846486210823059, 'learning_rate': 0.0001764236359304265, 'epoch': 0.12}


 12%|█▏        | 1990/16798 [07:54<1:01:51,  3.99it/s]

{'loss': 1.0443, 'grad_norm': 1.7892156839370728, 'learning_rate': 0.0001764117226590422, 'epoch': 0.12}


 12%|█▏        | 1991/16798 [07:54<1:00:09,  4.10it/s]

{'loss': 0.9414, 'grad_norm': 1.7180681228637695, 'learning_rate': 0.00017639980938765785, 'epoch': 0.12}


 12%|█▏        | 1992/16798 [07:54<59:38,  4.14it/s]  

{'loss': 1.1665, 'grad_norm': 1.5891472101211548, 'learning_rate': 0.00017638789611627354, 'epoch': 0.12}


 12%|█▏        | 1993/16798 [07:55<1:02:45,  3.93it/s]

{'loss': 1.6174, 'grad_norm': 2.995929718017578, 'learning_rate': 0.0001763759828448892, 'epoch': 0.12}


 12%|█▏        | 1994/16798 [07:55<1:04:54,  3.80it/s]

{'loss': 1.1807, 'grad_norm': 1.7927019596099854, 'learning_rate': 0.0001763640695735049, 'epoch': 0.12}


 12%|█▏        | 1996/16798 [07:55<59:55,  4.12it/s]  

{'loss': 0.8381, 'grad_norm': 1.4537235498428345, 'learning_rate': 0.00017635215630212056, 'epoch': 0.12}


 12%|█▏        | 1996/16798 [07:55<59:55,  4.12it/s]

{'loss': 1.1561, 'grad_norm': 1.5887147188186646, 'learning_rate': 0.00017634024303073625, 'epoch': 0.12}


 12%|█▏        | 1997/16798 [07:56<1:02:58,  3.92it/s]

{'loss': 1.148, 'grad_norm': 1.7553199529647827, 'learning_rate': 0.00017632832975935191, 'epoch': 0.12}


 12%|█▏        | 1998/16798 [07:56<1:01:38,  4.00it/s]

{'loss': 0.8904, 'grad_norm': 1.7234777212142944, 'learning_rate': 0.0001763164164879676, 'epoch': 0.12}


 12%|█▏        | 1999/16798 [07:56<59:35,  4.14it/s]  

{'loss': 0.4056, 'grad_norm': 1.0710784196853638, 'learning_rate': 0.00017630450321658327, 'epoch': 0.12}




{'loss': 0.2311, 'grad_norm': 0.8431032299995422, 'learning_rate': 0.00017629258994519896, 'epoch': 0.12}


 12%|█▏        | 2001/16798 [07:59<4:13:29,  1.03s/it]

{'loss': 2.3293, 'grad_norm': 2.0180132389068604, 'learning_rate': 0.00017628067667381465, 'epoch': 0.12}


 12%|█▏        | 2002/16798 [07:59<3:13:08,  1.28it/s]

{'loss': 2.1836, 'grad_norm': 2.0668680667877197, 'learning_rate': 0.00017626876340243031, 'epoch': 0.12}


 12%|█▏        | 2003/16798 [08:00<2:33:02,  1.61it/s]

{'loss': 1.7149, 'grad_norm': 1.9524283409118652, 'learning_rate': 0.000176256850131046, 'epoch': 0.12}


 12%|█▏        | 2004/16798 [08:00<2:05:38,  1.96it/s]

{'loss': 2.0587, 'grad_norm': 2.1039717197418213, 'learning_rate': 0.00017624493685966167, 'epoch': 0.12}


 12%|█▏        | 2005/16798 [08:00<1:47:08,  2.30it/s]

{'loss': 1.8825, 'grad_norm': 1.884400486946106, 'learning_rate': 0.00017623302358827736, 'epoch': 0.12}


 12%|█▏        | 2006/16798 [08:00<1:35:09,  2.59it/s]

{'loss': 2.038, 'grad_norm': 2.031878709793091, 'learning_rate': 0.00017622111031689302, 'epoch': 0.12}


 12%|█▏        | 2007/16798 [08:01<1:27:49,  2.81it/s]

{'loss': 1.8913, 'grad_norm': 1.9678831100463867, 'learning_rate': 0.00017620919704550871, 'epoch': 0.12}


 12%|█▏        | 2008/16798 [08:01<1:19:08,  3.11it/s]

{'loss': 1.8856, 'grad_norm': 1.911068081855774, 'learning_rate': 0.00017619728377412438, 'epoch': 0.12}


 12%|█▏        | 2009/16798 [08:01<1:11:38,  3.44it/s]

{'loss': 1.5582, 'grad_norm': 1.839555025100708, 'learning_rate': 0.00017618537050274007, 'epoch': 0.12}


 12%|█▏        | 2010/16798 [08:01<1:10:24,  3.50it/s]

{'loss': 2.1318, 'grad_norm': 2.163174867630005, 'learning_rate': 0.00017617345723135573, 'epoch': 0.12}


 12%|█▏        | 2011/16798 [08:02<1:08:15,  3.61it/s]

{'loss': 1.9145, 'grad_norm': 2.307898759841919, 'learning_rate': 0.00017616154395997142, 'epoch': 0.12}


 12%|█▏        | 2012/16798 [08:02<1:06:16,  3.72it/s]

{'loss': 1.8135, 'grad_norm': 2.1975083351135254, 'learning_rate': 0.00017614963068858709, 'epoch': 0.12}


 12%|█▏        | 2013/16798 [08:02<1:03:51,  3.86it/s]

{'loss': 1.3481, 'grad_norm': 1.7621151208877563, 'learning_rate': 0.00017613771741720278, 'epoch': 0.12}


 12%|█▏        | 2014/16798 [08:02<1:07:25,  3.65it/s]

{'loss': 1.364, 'grad_norm': 1.779931664466858, 'learning_rate': 0.00017612580414581844, 'epoch': 0.12}


 12%|█▏        | 2015/16798 [08:03<1:04:17,  3.83it/s]

{'loss': 1.5903, 'grad_norm': 2.1560897827148438, 'learning_rate': 0.00017611389087443413, 'epoch': 0.12}


 12%|█▏        | 2016/16798 [08:03<1:01:35,  4.00it/s]

{'loss': 2.2297, 'grad_norm': 2.269879102706909, 'learning_rate': 0.0001761019776030498, 'epoch': 0.12}


 12%|█▏        | 2017/16798 [08:03<1:00:45,  4.05it/s]

{'loss': 1.3974, 'grad_norm': 1.7763988971710205, 'learning_rate': 0.00017609006433166549, 'epoch': 0.12}


 12%|█▏        | 2018/16798 [08:03<1:02:25,  3.95it/s]

{'loss': 1.4402, 'grad_norm': 1.7848668098449707, 'learning_rate': 0.00017607815106028115, 'epoch': 0.12}


 12%|█▏        | 2019/16798 [08:04<1:02:13,  3.96it/s]

{'loss': 1.7537, 'grad_norm': 2.0477044582366943, 'learning_rate': 0.00017606623778889684, 'epoch': 0.12}


 12%|█▏        | 2020/16798 [08:04<1:01:31,  4.00it/s]

{'loss': 1.6288, 'grad_norm': 1.8784599304199219, 'learning_rate': 0.0001760543245175125, 'epoch': 0.12}


 12%|█▏        | 2021/16798 [08:04<1:01:52,  3.98it/s]

{'loss': 1.5726, 'grad_norm': 1.7696086168289185, 'learning_rate': 0.0001760424112461282, 'epoch': 0.12}


 12%|█▏        | 2022/16798 [08:05<1:08:33,  3.59it/s]

{'loss': 1.5584, 'grad_norm': 2.3781230449676514, 'learning_rate': 0.00017603049797474386, 'epoch': 0.12}


 12%|█▏        | 2023/16798 [08:05<1:03:27,  3.88it/s]

{'loss': 2.0021, 'grad_norm': 2.490697145462036, 'learning_rate': 0.00017601858470335955, 'epoch': 0.12}


 12%|█▏        | 2024/16798 [08:05<1:00:35,  4.06it/s]

{'loss': 1.3095, 'grad_norm': 2.2120258808135986, 'learning_rate': 0.0001760066714319752, 'epoch': 0.12}


 12%|█▏        | 2025/16798 [08:05<59:09,  4.16it/s]  

{'loss': 1.9924, 'grad_norm': 2.2193541526794434, 'learning_rate': 0.0001759947581605909, 'epoch': 0.12}


 12%|█▏        | 2026/16798 [08:05<57:14,  4.30it/s]

{'loss': 1.5325, 'grad_norm': 2.2438101768493652, 'learning_rate': 0.00017598284488920657, 'epoch': 0.12}


 12%|█▏        | 2027/16798 [08:06<1:06:59,  3.67it/s]

{'loss': 1.4807, 'grad_norm': 1.6748051643371582, 'learning_rate': 0.00017597093161782226, 'epoch': 0.12}


 12%|█▏        | 2028/16798 [08:06<1:03:57,  3.85it/s]

{'loss': 1.5168, 'grad_norm': 1.6811784505844116, 'learning_rate': 0.00017595901834643792, 'epoch': 0.12}


 12%|█▏        | 2029/16798 [08:06<1:04:18,  3.83it/s]

{'loss': 1.0281, 'grad_norm': 1.3725327253341675, 'learning_rate': 0.0001759471050750536, 'epoch': 0.12}


 12%|█▏        | 2030/16798 [08:06<1:01:35,  4.00it/s]

{'loss': 1.4464, 'grad_norm': 1.7661598920822144, 'learning_rate': 0.00017593519180366928, 'epoch': 0.12}


 12%|█▏        | 2031/16798 [08:07<1:03:52,  3.85it/s]

{'loss': 0.9054, 'grad_norm': 1.8633939027786255, 'learning_rate': 0.00017592327853228497, 'epoch': 0.12}


 12%|█▏        | 2032/16798 [08:07<1:03:29,  3.88it/s]

{'loss': 1.3861, 'grad_norm': 1.6424245834350586, 'learning_rate': 0.00017591136526090066, 'epoch': 0.12}


 12%|█▏        | 2033/16798 [08:07<1:02:24,  3.94it/s]

{'loss': 1.1296, 'grad_norm': 1.5331151485443115, 'learning_rate': 0.00017589945198951635, 'epoch': 0.12}


 12%|█▏        | 2034/16798 [08:08<1:04:24,  3.82it/s]

{'loss': 1.232, 'grad_norm': 1.649214744567871, 'learning_rate': 0.000175887538718132, 'epoch': 0.12}


 12%|█▏        | 2035/16798 [08:08<1:05:11,  3.77it/s]

{'loss': 1.4374, 'grad_norm': 2.029261350631714, 'learning_rate': 0.0001758756254467477, 'epoch': 0.12}


 12%|█▏        | 2036/16798 [08:08<1:04:44,  3.80it/s]

{'loss': 1.2455, 'grad_norm': 1.942375898361206, 'learning_rate': 0.00017586371217536337, 'epoch': 0.12}


 12%|█▏        | 2037/16798 [08:08<1:03:56,  3.85it/s]

{'loss': 1.0076, 'grad_norm': 1.6623001098632812, 'learning_rate': 0.00017585179890397906, 'epoch': 0.12}


 12%|█▏        | 2038/16798 [08:09<1:06:08,  3.72it/s]

{'loss': 1.3816, 'grad_norm': 1.8779823780059814, 'learning_rate': 0.00017583988563259472, 'epoch': 0.12}


 12%|█▏        | 2039/16798 [08:09<1:06:24,  3.70it/s]

{'loss': 1.2831, 'grad_norm': 1.8595548868179321, 'learning_rate': 0.0001758279723612104, 'epoch': 0.12}


 12%|█▏        | 2040/16798 [08:09<1:03:07,  3.90it/s]

{'loss': 1.3659, 'grad_norm': 1.7310686111450195, 'learning_rate': 0.00017581605908982608, 'epoch': 0.12}


 12%|█▏        | 2041/16798 [08:09<1:02:00,  3.97it/s]

{'loss': 1.1669, 'grad_norm': 1.66297447681427, 'learning_rate': 0.00017580414581844177, 'epoch': 0.12}


 12%|█▏        | 2042/16798 [08:10<1:05:11,  3.77it/s]

{'loss': 1.0652, 'grad_norm': 1.6640422344207764, 'learning_rate': 0.00017579223254705743, 'epoch': 0.12}


 12%|█▏        | 2043/16798 [08:10<1:04:42,  3.80it/s]

{'loss': 1.0104, 'grad_norm': 1.3530683517456055, 'learning_rate': 0.00017578031927567312, 'epoch': 0.12}


 12%|█▏        | 2044/16798 [08:10<1:06:38,  3.69it/s]

{'loss': 1.4348, 'grad_norm': 1.9167414903640747, 'learning_rate': 0.00017576840600428878, 'epoch': 0.12}


 12%|█▏        | 2045/16798 [08:10<1:06:34,  3.69it/s]

{'loss': 0.9179, 'grad_norm': 1.6513606309890747, 'learning_rate': 0.00017575649273290448, 'epoch': 0.12}


 12%|█▏        | 2046/16798 [08:11<1:05:14,  3.77it/s]

{'loss': 1.097, 'grad_norm': 1.8934600353240967, 'learning_rate': 0.00017574457946152014, 'epoch': 0.12}


 12%|█▏        | 2047/16798 [08:11<1:02:33,  3.93it/s]

{'loss': 0.5691, 'grad_norm': 1.151065707206726, 'learning_rate': 0.00017573266619013583, 'epoch': 0.12}


 12%|█▏        | 2048/16798 [08:11<1:05:23,  3.76it/s]

{'loss': 0.2481, 'grad_norm': 0.9451926350593567, 'learning_rate': 0.0001757207529187515, 'epoch': 0.12}


 12%|█▏        | 2049/16798 [08:11<1:05:14,  3.77it/s]

{'loss': 0.2427, 'grad_norm': 0.8013894557952881, 'learning_rate': 0.00017570883964736718, 'epoch': 0.12}


 12%|█▏        | 2050/16798 [08:12<1:04:22,  3.82it/s]

{'loss': 0.4452, 'grad_norm': 1.0151907205581665, 'learning_rate': 0.00017569692637598285, 'epoch': 0.12}


 12%|█▏        | 2051/16798 [08:12<1:03:44,  3.86it/s]

{'loss': 1.8583, 'grad_norm': 1.794742226600647, 'learning_rate': 0.00017568501310459854, 'epoch': 0.12}


 12%|█▏        | 2052/16798 [08:12<1:06:51,  3.68it/s]

{'loss': 2.3525, 'grad_norm': 1.9547570943832397, 'learning_rate': 0.0001756730998332142, 'epoch': 0.12}


 12%|█▏        | 2053/16798 [08:13<1:04:30,  3.81it/s]

{'loss': 2.3205, 'grad_norm': 1.9598807096481323, 'learning_rate': 0.0001756611865618299, 'epoch': 0.12}


 12%|█▏        | 2054/16798 [08:13<1:03:07,  3.89it/s]

{'loss': 2.0008, 'grad_norm': 1.9703354835510254, 'learning_rate': 0.00017564927329044556, 'epoch': 0.12}


 12%|█▏        | 2055/16798 [08:13<1:03:19,  3.88it/s]

{'loss': 2.0938, 'grad_norm': 1.76462984085083, 'learning_rate': 0.00017563736001906125, 'epoch': 0.12}


 12%|█▏        | 2056/16798 [08:13<1:05:21,  3.76it/s]

{'loss': 2.1779, 'grad_norm': 1.8645589351654053, 'learning_rate': 0.0001756254467476769, 'epoch': 0.12}


 12%|█▏        | 2057/16798 [08:14<1:03:30,  3.87it/s]

{'loss': 1.7474, 'grad_norm': 1.8464977741241455, 'learning_rate': 0.0001756135334762926, 'epoch': 0.12}


 12%|█▏        | 2058/16798 [08:14<1:02:57,  3.90it/s]

{'loss': 1.8101, 'grad_norm': 1.6923524141311646, 'learning_rate': 0.00017560162020490827, 'epoch': 0.12}


 12%|█▏        | 2059/16798 [08:14<1:04:19,  3.82it/s]

{'loss': 2.0666, 'grad_norm': 1.8601151704788208, 'learning_rate': 0.00017558970693352396, 'epoch': 0.12}


 12%|█▏        | 2060/16798 [08:14<1:04:42,  3.80it/s]

{'loss': 1.8445, 'grad_norm': 2.0681686401367188, 'learning_rate': 0.00017557779366213962, 'epoch': 0.12}


 12%|█▏        | 2061/16798 [08:15<1:03:33,  3.86it/s]

{'loss': 1.7005, 'grad_norm': 1.8020418882369995, 'learning_rate': 0.0001755658803907553, 'epoch': 0.12}


 12%|█▏        | 2062/16798 [08:15<1:03:15,  3.88it/s]

{'loss': 1.7165, 'grad_norm': 1.7831552028656006, 'learning_rate': 0.00017555396711937097, 'epoch': 0.12}


 12%|█▏        | 2063/16798 [08:15<1:06:28,  3.69it/s]

{'loss': 1.8112, 'grad_norm': 1.8967381715774536, 'learning_rate': 0.00017554205384798666, 'epoch': 0.12}


 12%|█▏        | 2064/16798 [08:15<1:03:10,  3.89it/s]

{'loss': 1.6504, 'grad_norm': 1.7981699705123901, 'learning_rate': 0.00017553014057660236, 'epoch': 0.12}


 12%|█▏        | 2065/16798 [08:16<1:05:00,  3.78it/s]

{'loss': 1.5216, 'grad_norm': 1.8209021091461182, 'learning_rate': 0.00017551822730521802, 'epoch': 0.12}


 12%|█▏        | 2066/16798 [08:16<1:01:48,  3.97it/s]

{'loss': 2.0669, 'grad_norm': 2.1381499767303467, 'learning_rate': 0.0001755063140338337, 'epoch': 0.12}


 12%|█▏        | 2067/16798 [08:16<1:05:01,  3.78it/s]

{'loss': 1.7512, 'grad_norm': 1.9104509353637695, 'learning_rate': 0.00017549440076244937, 'epoch': 0.12}


 12%|█▏        | 2068/16798 [08:16<1:04:53,  3.78it/s]

{'loss': 1.8552, 'grad_norm': 2.161433458328247, 'learning_rate': 0.00017548248749106506, 'epoch': 0.12}


 12%|█▏        | 2069/16798 [08:17<1:03:38,  3.86it/s]

{'loss': 1.6361, 'grad_norm': 1.750443935394287, 'learning_rate': 0.00017547057421968073, 'epoch': 0.12}


 12%|█▏        | 2070/16798 [08:17<1:01:55,  3.96it/s]

{'loss': 1.8391, 'grad_norm': 2.0233314037323, 'learning_rate': 0.00017545866094829642, 'epoch': 0.12}


 12%|█▏        | 2071/16798 [08:17<1:04:14,  3.82it/s]

{'loss': 1.8568, 'grad_norm': 2.139981269836426, 'learning_rate': 0.00017544674767691208, 'epoch': 0.12}


 12%|█▏        | 2072/16798 [08:17<1:01:05,  4.02it/s]

{'loss': 1.6741, 'grad_norm': 1.9350106716156006, 'learning_rate': 0.00017543483440552777, 'epoch': 0.12}


 12%|█▏        | 2073/16798 [08:18<59:52,  4.10it/s]  

{'loss': 1.5947, 'grad_norm': 1.9218530654907227, 'learning_rate': 0.00017542292113414344, 'epoch': 0.12}


 12%|█▏        | 2074/16798 [08:18<1:03:08,  3.89it/s]

{'loss': 1.8679, 'grad_norm': 2.150752067565918, 'learning_rate': 0.00017541100786275913, 'epoch': 0.12}


 12%|█▏        | 2075/16798 [08:18<1:03:55,  3.84it/s]

{'loss': 1.8759, 'grad_norm': 2.649728536605835, 'learning_rate': 0.0001753990945913748, 'epoch': 0.12}


 12%|█▏        | 2076/16798 [08:18<1:03:59,  3.83it/s]

{'loss': 1.1771, 'grad_norm': 2.0727272033691406, 'learning_rate': 0.00017538718131999048, 'epoch': 0.12}


 12%|█▏        | 2077/16798 [08:19<1:00:18,  4.07it/s]

{'loss': 1.7073, 'grad_norm': 2.0096492767333984, 'learning_rate': 0.00017537526804860615, 'epoch': 0.12}


 12%|█▏        | 2078/16798 [08:19<1:01:49,  3.97it/s]

{'loss': 1.6249, 'grad_norm': 1.974128007888794, 'learning_rate': 0.00017536335477722184, 'epoch': 0.12}


 12%|█▏        | 2079/16798 [08:19<59:29,  4.12it/s]  

{'loss': 1.8466, 'grad_norm': 2.1384782791137695, 'learning_rate': 0.0001753514415058375, 'epoch': 0.12}


 12%|█▏        | 2080/16798 [08:19<1:01:50,  3.97it/s]

{'loss': 1.9094, 'grad_norm': 2.1844799518585205, 'learning_rate': 0.0001753395282344532, 'epoch': 0.12}


 12%|█▏        | 2081/16798 [08:20<59:37,  4.11it/s]  

{'loss': 1.5926, 'grad_norm': 1.8213552236557007, 'learning_rate': 0.00017532761496306885, 'epoch': 0.12}


 12%|█▏        | 2082/16798 [08:20<59:02,  4.15it/s]

{'loss': 1.6851, 'grad_norm': 1.9644122123718262, 'learning_rate': 0.00017531570169168455, 'epoch': 0.12}


 12%|█▏        | 2083/16798 [08:20<1:01:35,  3.98it/s]

{'loss': 1.6352, 'grad_norm': 1.866196870803833, 'learning_rate': 0.0001753037884203002, 'epoch': 0.12}


 12%|█▏        | 2084/16798 [08:20<1:03:54,  3.84it/s]

{'loss': 0.9571, 'grad_norm': 1.323636531829834, 'learning_rate': 0.0001752918751489159, 'epoch': 0.12}


 12%|█▏        | 2085/16798 [08:21<1:03:16,  3.88it/s]

{'loss': 1.8424, 'grad_norm': 1.8957597017288208, 'learning_rate': 0.00017527996187753156, 'epoch': 0.12}


 12%|█▏        | 2086/16798 [08:21<1:00:34,  4.05it/s]

{'loss': 1.7752, 'grad_norm': 1.843164086341858, 'learning_rate': 0.00017526804860614725, 'epoch': 0.12}


 12%|█▏        | 2087/16798 [08:21<1:02:11,  3.94it/s]

{'loss': 1.2136, 'grad_norm': 2.003817558288574, 'learning_rate': 0.00017525613533476292, 'epoch': 0.12}


 12%|█▏        | 2088/16798 [08:21<1:03:01,  3.89it/s]

{'loss': 1.1128, 'grad_norm': 1.5061229467391968, 'learning_rate': 0.0001752442220633786, 'epoch': 0.12}


 12%|█▏        | 2089/16798 [08:22<1:01:38,  3.98it/s]

{'loss': 1.8671, 'grad_norm': 2.275916337966919, 'learning_rate': 0.00017523230879199427, 'epoch': 0.12}


 12%|█▏        | 2090/16798 [08:22<1:02:39,  3.91it/s]

{'loss': 1.2029, 'grad_norm': 2.000174045562744, 'learning_rate': 0.00017522039552060996, 'epoch': 0.12}


 12%|█▏        | 2091/16798 [08:22<1:03:59,  3.83it/s]

{'loss': 1.1588, 'grad_norm': 1.645268440246582, 'learning_rate': 0.00017520848224922563, 'epoch': 0.12}


 12%|█▏        | 2092/16798 [08:23<1:04:26,  3.80it/s]

{'loss': 1.416, 'grad_norm': 1.8631471395492554, 'learning_rate': 0.00017519656897784132, 'epoch': 0.12}


 12%|█▏        | 2093/16798 [08:23<1:06:08,  3.71it/s]

{'loss': 1.2096, 'grad_norm': 1.8952646255493164, 'learning_rate': 0.00017518465570645698, 'epoch': 0.12}


 12%|█▏        | 2094/16798 [08:23<1:05:32,  3.74it/s]

{'loss': 1.1387, 'grad_norm': 1.5922472476959229, 'learning_rate': 0.0001751727424350727, 'epoch': 0.12}


 12%|█▏        | 2095/16798 [08:23<1:02:12,  3.94it/s]

{'loss': 1.263, 'grad_norm': 1.8046720027923584, 'learning_rate': 0.00017516082916368836, 'epoch': 0.12}


 12%|█▏        | 2096/16798 [08:24<1:10:33,  3.47it/s]

{'loss': 1.503, 'grad_norm': 1.9790785312652588, 'learning_rate': 0.00017514891589230405, 'epoch': 0.12}


 12%|█▏        | 2097/16798 [08:24<1:06:58,  3.66it/s]

{'loss': 0.6691, 'grad_norm': 1.234052062034607, 'learning_rate': 0.00017513700262091972, 'epoch': 0.12}


 12%|█▏        | 2098/16798 [08:24<1:03:39,  3.85it/s]

{'loss': 1.0164, 'grad_norm': 1.394776463508606, 'learning_rate': 0.0001751250893495354, 'epoch': 0.12}


 12%|█▏        | 2099/16798 [08:24<1:04:07,  3.82it/s]

{'loss': 0.6904, 'grad_norm': 1.475791096687317, 'learning_rate': 0.00017511317607815107, 'epoch': 0.12}


 13%|█▎        | 2100/16798 [08:25<1:03:25,  3.86it/s]

{'loss': 0.5937, 'grad_norm': 1.2859042882919312, 'learning_rate': 0.00017510126280676676, 'epoch': 0.13}


 13%|█▎        | 2101/16798 [08:25<1:03:33,  3.85it/s]

{'loss': 1.5517, 'grad_norm': 1.7175062894821167, 'learning_rate': 0.00017508934953538243, 'epoch': 0.13}


 13%|█▎        | 2102/16798 [08:25<1:06:25,  3.69it/s]

{'loss': 2.2706, 'grad_norm': 2.9420742988586426, 'learning_rate': 0.00017507743626399812, 'epoch': 0.13}


 13%|█▎        | 2103/16798 [08:26<1:09:54,  3.50it/s]

{'loss': 2.0758, 'grad_norm': 1.764585018157959, 'learning_rate': 0.00017506552299261378, 'epoch': 0.13}


 13%|█▎        | 2104/16798 [08:26<1:06:49,  3.66it/s]

{'loss': 1.9006, 'grad_norm': 1.8671951293945312, 'learning_rate': 0.00017505360972122947, 'epoch': 0.13}


 13%|█▎        | 2105/16798 [08:26<1:03:43,  3.84it/s]

{'loss': 2.3871, 'grad_norm': 2.4929068088531494, 'learning_rate': 0.00017504169644984513, 'epoch': 0.13}


 13%|█▎        | 2106/16798 [08:26<1:07:44,  3.61it/s]

{'loss': 2.0226, 'grad_norm': 1.780343770980835, 'learning_rate': 0.00017502978317846083, 'epoch': 0.13}


 13%|█▎        | 2107/16798 [08:27<1:04:24,  3.80it/s]

{'loss': 2.2519, 'grad_norm': 2.0849449634552, 'learning_rate': 0.0001750178699070765, 'epoch': 0.13}


 13%|█▎        | 2108/16798 [08:27<1:00:56,  4.02it/s]

{'loss': 1.9352, 'grad_norm': 1.90496826171875, 'learning_rate': 0.00017500595663569218, 'epoch': 0.13}


 13%|█▎        | 2109/16798 [08:27<59:42,  4.10it/s]  

{'loss': 1.8905, 'grad_norm': 1.8011045455932617, 'learning_rate': 0.00017499404336430784, 'epoch': 0.13}


 13%|█▎        | 2111/16798 [08:27<59:53,  4.09it/s]  

{'loss': 1.4039, 'grad_norm': 1.610434651374817, 'learning_rate': 0.00017498213009292353, 'epoch': 0.13}


 13%|█▎        | 2111/16798 [08:27<59:53,  4.09it/s]

{'loss': 1.9536, 'grad_norm': 2.2480201721191406, 'learning_rate': 0.0001749702168215392, 'epoch': 0.13}


 13%|█▎        | 2112/16798 [08:28<59:28,  4.12it/s]

{'loss': 1.8786, 'grad_norm': 1.7513210773468018, 'learning_rate': 0.0001749583035501549, 'epoch': 0.13}


 13%|█▎        | 2113/16798 [08:28<57:17,  4.27it/s]

{'loss': 1.4133, 'grad_norm': 2.0391836166381836, 'learning_rate': 0.00017494639027877055, 'epoch': 0.13}


 13%|█▎        | 2114/16798 [08:28<1:01:47,  3.96it/s]

{'loss': 1.4122, 'grad_norm': 1.7507216930389404, 'learning_rate': 0.00017493447700738624, 'epoch': 0.13}


 13%|█▎        | 2115/16798 [08:28<1:00:22,  4.05it/s]

{'loss': 1.5889, 'grad_norm': 2.829188823699951, 'learning_rate': 0.0001749225637360019, 'epoch': 0.13}


 13%|█▎        | 2116/16798 [08:29<57:30,  4.26it/s]  

{'loss': 1.4453, 'grad_norm': 1.6615557670593262, 'learning_rate': 0.0001749106504646176, 'epoch': 0.13}


 13%|█▎        | 2117/16798 [08:29<56:15,  4.35it/s]

{'loss': 1.6841, 'grad_norm': 1.7677191495895386, 'learning_rate': 0.00017489873719323326, 'epoch': 0.13}


 13%|█▎        | 2118/16798 [08:29<1:01:20,  3.99it/s]

{'loss': 1.7424, 'grad_norm': 1.8438849449157715, 'learning_rate': 0.00017488682392184895, 'epoch': 0.13}


 13%|█▎        | 2119/16798 [08:29<59:14,  4.13it/s]  

{'loss': 1.5388, 'grad_norm': 1.695783019065857, 'learning_rate': 0.00017487491065046462, 'epoch': 0.13}


 13%|█▎        | 2120/16798 [08:30<57:25,  4.26it/s]

{'loss': 1.6791, 'grad_norm': 1.9433954954147339, 'learning_rate': 0.0001748629973790803, 'epoch': 0.13}


 13%|█▎        | 2121/16798 [08:30<56:56,  4.30it/s]

{'loss': 1.5836, 'grad_norm': 1.718116044998169, 'learning_rate': 0.00017485108410769597, 'epoch': 0.13}


 13%|█▎        | 2122/16798 [08:30<1:01:23,  3.98it/s]

{'loss': 1.853, 'grad_norm': 2.065124034881592, 'learning_rate': 0.00017483917083631166, 'epoch': 0.13}


 13%|█▎        | 2123/16798 [08:30<1:06:20,  3.69it/s]

{'loss': 1.27, 'grad_norm': 1.575613260269165, 'learning_rate': 0.00017482725756492732, 'epoch': 0.13}


 13%|█▎        | 2124/16798 [08:31<1:06:11,  3.69it/s]

{'loss': 1.4421, 'grad_norm': 1.7429004907608032, 'learning_rate': 0.00017481534429354302, 'epoch': 0.13}


 13%|█▎        | 2125/16798 [08:31<1:04:05,  3.82it/s]

{'loss': 1.7661, 'grad_norm': 1.8250057697296143, 'learning_rate': 0.0001748034310221587, 'epoch': 0.13}


 13%|█▎        | 2126/16798 [08:31<1:05:24,  3.74it/s]

{'loss': 1.6195, 'grad_norm': 2.181025981903076, 'learning_rate': 0.00017479151775077437, 'epoch': 0.13}


 13%|█▎        | 2127/16798 [08:32<1:05:38,  3.72it/s]

{'loss': 1.7035, 'grad_norm': 1.9727855920791626, 'learning_rate': 0.00017477960447939006, 'epoch': 0.13}


 13%|█▎        | 2128/16798 [08:32<1:04:18,  3.80it/s]

{'loss': 1.6578, 'grad_norm': 1.914873480796814, 'learning_rate': 0.00017476769120800572, 'epoch': 0.13}


 13%|█▎        | 2129/16798 [08:32<1:05:41,  3.72it/s]

{'loss': 1.5452, 'grad_norm': 1.957259178161621, 'learning_rate': 0.00017475577793662141, 'epoch': 0.13}


 13%|█▎        | 2130/16798 [08:32<1:05:05,  3.76it/s]

{'loss': 1.5598, 'grad_norm': 1.7406947612762451, 'learning_rate': 0.00017474386466523708, 'epoch': 0.13}


 13%|█▎        | 2131/16798 [08:33<1:04:03,  3.82it/s]

{'loss': 1.5511, 'grad_norm': 1.9161787033081055, 'learning_rate': 0.00017473195139385277, 'epoch': 0.13}


 13%|█▎        | 2132/16798 [08:33<1:04:49,  3.77it/s]

{'loss': 1.1697, 'grad_norm': 1.6422107219696045, 'learning_rate': 0.00017472003812246843, 'epoch': 0.13}


 13%|█▎        | 2133/16798 [08:33<1:01:10,  4.00it/s]

{'loss': 1.2845, 'grad_norm': 1.718263030052185, 'learning_rate': 0.00017470812485108412, 'epoch': 0.13}


 13%|█▎        | 2134/16798 [08:33<1:00:54,  4.01it/s]

{'loss': 1.6704, 'grad_norm': 1.8670772314071655, 'learning_rate': 0.0001746962115796998, 'epoch': 0.13}


 13%|█▎        | 2135/16798 [08:34<1:01:32,  3.97it/s]

{'loss': 1.3809, 'grad_norm': 1.7560261487960815, 'learning_rate': 0.00017468429830831548, 'epoch': 0.13}


 13%|█▎        | 2136/16798 [08:34<1:01:45,  3.96it/s]

{'loss': 1.4654, 'grad_norm': 2.448023796081543, 'learning_rate': 0.00017467238503693114, 'epoch': 0.13}


 13%|█▎        | 2137/16798 [08:34<1:02:53,  3.89it/s]

{'loss': 1.119, 'grad_norm': 1.3916187286376953, 'learning_rate': 0.00017466047176554683, 'epoch': 0.13}


 13%|█▎        | 2138/16798 [08:34<1:02:20,  3.92it/s]

{'loss': 1.5801, 'grad_norm': 2.1012160778045654, 'learning_rate': 0.0001746485584941625, 'epoch': 0.13}


 13%|█▎        | 2139/16798 [08:35<59:37,  4.10it/s]  

{'loss': 1.165, 'grad_norm': 1.5803697109222412, 'learning_rate': 0.0001746366452227782, 'epoch': 0.13}


 13%|█▎        | 2140/16798 [08:35<1:03:36,  3.84it/s]

{'loss': 1.2472, 'grad_norm': 2.0259838104248047, 'learning_rate': 0.00017462473195139385, 'epoch': 0.13}


 13%|█▎        | 2141/16798 [08:35<1:03:54,  3.82it/s]

{'loss': 1.4671, 'grad_norm': 1.7888227701187134, 'learning_rate': 0.00017461281868000954, 'epoch': 0.13}


 13%|█▎        | 2142/16798 [08:35<1:00:59,  4.01it/s]

{'loss': 1.4157, 'grad_norm': 2.2612133026123047, 'learning_rate': 0.0001746009054086252, 'epoch': 0.13}


 13%|█▎        | 2143/16798 [08:36<59:35,  4.10it/s]  

{'loss': 1.2332, 'grad_norm': 1.874332070350647, 'learning_rate': 0.0001745889921372409, 'epoch': 0.13}


 13%|█▎        | 2144/16798 [08:36<1:03:05,  3.87it/s]

{'loss': 1.0146, 'grad_norm': 1.9464890956878662, 'learning_rate': 0.00017457707886585656, 'epoch': 0.13}


 13%|█▎        | 2145/16798 [08:36<1:00:21,  4.05it/s]

{'loss': 0.8286, 'grad_norm': 1.5396946668624878, 'learning_rate': 0.00017456516559447225, 'epoch': 0.13}


 13%|█▎        | 2146/16798 [08:36<58:28,  4.18it/s]  

{'loss': 0.7986, 'grad_norm': 1.4032782316207886, 'learning_rate': 0.00017455325232308791, 'epoch': 0.13}


 13%|█▎        | 2147/16798 [08:37<1:00:14,  4.05it/s]

{'loss': 0.7626, 'grad_norm': 1.3409959077835083, 'learning_rate': 0.0001745413390517036, 'epoch': 0.13}


 13%|█▎        | 2149/16798 [08:37<57:36,  4.24it/s]  

{'loss': 0.3561, 'grad_norm': 1.0828375816345215, 'learning_rate': 0.00017452942578031927, 'epoch': 0.13}


 13%|█▎        | 2150/16798 [08:37<55:32,  4.40it/s]

{'loss': 0.4297, 'grad_norm': 1.0373966693878174, 'learning_rate': 0.00017451751250893496, 'epoch': 0.13}


 13%|█▎        | 2150/16798 [08:37<55:32,  4.40it/s]

{'loss': 0.7945, 'grad_norm': 1.460470199584961, 'learning_rate': 0.00017450559923755062, 'epoch': 0.13}


 13%|█▎        | 2151/16798 [08:38<59:06,  4.13it/s]

{'loss': 2.0056, 'grad_norm': 2.090545654296875, 'learning_rate': 0.00017449368596616631, 'epoch': 0.13}


 13%|█▎        | 2152/16798 [08:38<1:02:28,  3.91it/s]

{'loss': 2.2382, 'grad_norm': 1.93696129322052, 'learning_rate': 0.00017448177269478198, 'epoch': 0.13}


 13%|█▎        | 2153/16798 [08:38<1:02:27,  3.91it/s]

{'loss': 2.1674, 'grad_norm': 1.9822380542755127, 'learning_rate': 0.00017446985942339767, 'epoch': 0.13}


 13%|█▎        | 2154/16798 [08:38<1:00:37,  4.03it/s]

{'loss': 2.3791, 'grad_norm': 2.427995204925537, 'learning_rate': 0.00017445794615201333, 'epoch': 0.13}


 13%|█▎        | 2155/16798 [08:39<59:27,  4.11it/s]  

{'loss': 2.3302, 'grad_norm': 2.2084245681762695, 'learning_rate': 0.00017444603288062905, 'epoch': 0.13}


 13%|█▎        | 2156/16798 [08:39<1:02:07,  3.93it/s]

{'loss': 1.5948, 'grad_norm': 1.602051019668579, 'learning_rate': 0.0001744341196092447, 'epoch': 0.13}


 13%|█▎        | 2157/16798 [08:39<1:03:18,  3.85it/s]

{'loss': 1.7151, 'grad_norm': 1.8197561502456665, 'learning_rate': 0.0001744222063378604, 'epoch': 0.13}


 13%|█▎        | 2158/16798 [08:39<1:01:38,  3.96it/s]

{'loss': 1.803, 'grad_norm': 1.991414189338684, 'learning_rate': 0.00017441029306647607, 'epoch': 0.13}


 13%|█▎        | 2159/16798 [08:40<1:05:57,  3.70it/s]

{'loss': 1.3353, 'grad_norm': 1.7905515432357788, 'learning_rate': 0.00017439837979509176, 'epoch': 0.13}


 13%|█▎        | 2160/16798 [08:40<1:06:07,  3.69it/s]

{'loss': 1.7752, 'grad_norm': 2.017401933670044, 'learning_rate': 0.00017438646652370742, 'epoch': 0.13}


 13%|█▎        | 2161/16798 [08:40<1:02:09,  3.92it/s]

{'loss': 1.4899, 'grad_norm': 1.7213181257247925, 'learning_rate': 0.0001743745532523231, 'epoch': 0.13}


 13%|█▎        | 2162/16798 [08:40<1:00:38,  4.02it/s]

{'loss': 1.4763, 'grad_norm': 2.1215572357177734, 'learning_rate': 0.00017436263998093878, 'epoch': 0.13}


 13%|█▎        | 2163/16798 [08:41<1:04:39,  3.77it/s]

{'loss': 1.3952, 'grad_norm': 1.8957051038742065, 'learning_rate': 0.00017435072670955447, 'epoch': 0.13}


 13%|█▎        | 2164/16798 [08:41<1:02:51,  3.88it/s]

{'loss': 1.7269, 'grad_norm': 1.9063116312026978, 'learning_rate': 0.00017433881343817013, 'epoch': 0.13}


 13%|█▎        | 2165/16798 [08:41<1:01:20,  3.98it/s]

{'loss': 1.7155, 'grad_norm': 1.886451244354248, 'learning_rate': 0.00017432690016678582, 'epoch': 0.13}


 13%|█▎        | 2166/16798 [08:41<1:03:23,  3.85it/s]

{'loss': 1.5739, 'grad_norm': 1.8756688833236694, 'learning_rate': 0.00017431498689540149, 'epoch': 0.13}


 13%|█▎        | 2167/16798 [08:42<1:04:35,  3.78it/s]

{'loss': 1.3858, 'grad_norm': 1.5897136926651, 'learning_rate': 0.00017430307362401718, 'epoch': 0.13}


 13%|█▎        | 2168/16798 [08:42<1:02:34,  3.90it/s]

{'loss': 1.6177, 'grad_norm': 2.3798303604125977, 'learning_rate': 0.00017429116035263284, 'epoch': 0.13}


 13%|█▎        | 2169/16798 [08:42<1:05:28,  3.72it/s]

{'loss': 1.5936, 'grad_norm': 1.8328723907470703, 'learning_rate': 0.00017427924708124853, 'epoch': 0.13}


 13%|█▎        | 2170/16798 [08:43<1:08:58,  3.53it/s]

{'loss': 1.7865, 'grad_norm': 2.1628119945526123, 'learning_rate': 0.0001742673338098642, 'epoch': 0.13}


 13%|█▎        | 2171/16798 [08:43<1:07:48,  3.60it/s]

{'loss': 1.4864, 'grad_norm': 1.9646012783050537, 'learning_rate': 0.00017425542053847988, 'epoch': 0.13}


 13%|█▎        | 2172/16798 [08:43<1:08:10,  3.58it/s]

{'loss': 1.5739, 'grad_norm': 1.814374566078186, 'learning_rate': 0.00017424350726709555, 'epoch': 0.13}


 13%|█▎        | 2173/16798 [08:43<1:05:01,  3.75it/s]

{'loss': 1.6411, 'grad_norm': 2.0677454471588135, 'learning_rate': 0.00017423159399571124, 'epoch': 0.13}


 13%|█▎        | 2174/16798 [08:44<1:09:38,  3.50it/s]

{'loss': 1.5309, 'grad_norm': 1.674277663230896, 'learning_rate': 0.0001742196807243269, 'epoch': 0.13}


 13%|█▎        | 2175/16798 [08:44<1:11:25,  3.41it/s]

{'loss': 1.7443, 'grad_norm': 2.4365899562835693, 'learning_rate': 0.0001742077674529426, 'epoch': 0.13}


 13%|█▎        | 2176/16798 [08:44<1:09:16,  3.52it/s]

{'loss': 1.6815, 'grad_norm': 2.1316490173339844, 'learning_rate': 0.00017419585418155826, 'epoch': 0.13}


 13%|█▎        | 2177/16798 [08:45<1:08:41,  3.55it/s]

{'loss': 1.6981, 'grad_norm': 1.8777836561203003, 'learning_rate': 0.00017418394091017395, 'epoch': 0.13}


 13%|█▎        | 2178/16798 [08:45<1:08:02,  3.58it/s]

{'loss': 1.9913, 'grad_norm': 2.1933858394622803, 'learning_rate': 0.0001741720276387896, 'epoch': 0.13}


 13%|█▎        | 2179/16798 [08:45<1:05:17,  3.73it/s]

{'loss': 1.6231, 'grad_norm': 2.1802332401275635, 'learning_rate': 0.0001741601143674053, 'epoch': 0.13}


 13%|█▎        | 2180/16798 [08:45<1:06:22,  3.67it/s]

{'loss': 1.4273, 'grad_norm': 2.105983257293701, 'learning_rate': 0.00017414820109602097, 'epoch': 0.13}


 13%|█▎        | 2181/16798 [08:46<1:06:37,  3.66it/s]

{'loss': 1.5901, 'grad_norm': 2.0331008434295654, 'learning_rate': 0.00017413628782463666, 'epoch': 0.13}


 13%|█▎        | 2182/16798 [08:46<1:03:39,  3.83it/s]

{'loss': 1.2455, 'grad_norm': 1.9411594867706299, 'learning_rate': 0.00017412437455325232, 'epoch': 0.13}


 13%|█▎        | 2183/16798 [08:46<1:04:39,  3.77it/s]

{'loss': 1.5501, 'grad_norm': 1.9749397039413452, 'learning_rate': 0.000174112461281868, 'epoch': 0.13}


 13%|█▎        | 2184/16798 [08:46<1:06:41,  3.65it/s]

{'loss': 1.2348, 'grad_norm': 1.647558569908142, 'learning_rate': 0.00017410054801048368, 'epoch': 0.13}


 13%|█▎        | 2185/16798 [08:47<1:03:23,  3.84it/s]

{'loss': 1.8524, 'grad_norm': 2.528200626373291, 'learning_rate': 0.00017408863473909937, 'epoch': 0.13}


 13%|█▎        | 2186/16798 [08:47<1:03:54,  3.81it/s]

{'loss': 1.5335, 'grad_norm': 1.6989095211029053, 'learning_rate': 0.00017407672146771506, 'epoch': 0.13}


 13%|█▎        | 2187/16798 [08:47<1:04:38,  3.77it/s]

{'loss': 1.7775, 'grad_norm': 2.0885584354400635, 'learning_rate': 0.00017406480819633072, 'epoch': 0.13}


 13%|█▎        | 2188/16798 [08:47<1:07:41,  3.60it/s]

{'loss': 0.8475, 'grad_norm': 1.650240421295166, 'learning_rate': 0.0001740528949249464, 'epoch': 0.13}


 13%|█▎        | 2189/16798 [08:48<1:04:37,  3.77it/s]

{'loss': 0.8361, 'grad_norm': 3.307400703430176, 'learning_rate': 0.00017404098165356207, 'epoch': 0.13}


 13%|█▎        | 2190/16798 [08:48<1:06:21,  3.67it/s]

{'loss': 0.9264, 'grad_norm': 1.5840939283370972, 'learning_rate': 0.00017402906838217777, 'epoch': 0.13}


 13%|█▎        | 2191/16798 [08:48<1:05:14,  3.73it/s]

{'loss': 0.7137, 'grad_norm': 1.224597692489624, 'learning_rate': 0.00017401715511079343, 'epoch': 0.13}


 13%|█▎        | 2192/16798 [08:49<1:07:38,  3.60it/s]

{'loss': 1.2952, 'grad_norm': 1.9165804386138916, 'learning_rate': 0.00017400524183940912, 'epoch': 0.13}


 13%|█▎        | 2194/16798 [08:49<1:02:59,  3.86it/s]

{'loss': 0.8273, 'grad_norm': 1.5627094507217407, 'learning_rate': 0.00017399332856802478, 'epoch': 0.13}


 13%|█▎        | 2194/16798 [08:49<1:02:59,  3.86it/s]

{'loss': 0.6327, 'grad_norm': 1.4584108591079712, 'learning_rate': 0.00017398141529664047, 'epoch': 0.13}


 13%|█▎        | 2195/16798 [08:49<1:04:37,  3.77it/s]

{'loss': 0.67, 'grad_norm': 1.5999678373336792, 'learning_rate': 0.00017396950202525614, 'epoch': 0.13}


 13%|█▎        | 2196/16798 [08:50<1:03:59,  3.80it/s]

{'loss': 0.8024, 'grad_norm': 1.7434439659118652, 'learning_rate': 0.00017395758875387183, 'epoch': 0.13}


 13%|█▎        | 2197/16798 [08:50<1:05:02,  3.74it/s]

{'loss': 0.4447, 'grad_norm': 1.033286213874817, 'learning_rate': 0.0001739456754824875, 'epoch': 0.13}


 13%|█▎        | 2198/16798 [08:50<1:01:25,  3.96it/s]

{'loss': 0.2244, 'grad_norm': 0.8198298215866089, 'learning_rate': 0.00017393376221110318, 'epoch': 0.13}


 13%|█▎        | 2199/16798 [08:50<1:02:21,  3.90it/s]

{'loss': 0.2268, 'grad_norm': 0.8521355390548706, 'learning_rate': 0.00017392184893971885, 'epoch': 0.13}


 13%|█▎        | 2200/16798 [08:51<1:01:34,  3.95it/s]

{'loss': 0.2391, 'grad_norm': 0.8648043274879456, 'learning_rate': 0.00017390993566833454, 'epoch': 0.13}


 13%|█▎        | 2201/16798 [08:51<1:04:13,  3.79it/s]

{'loss': 2.035, 'grad_norm': 1.767425775527954, 'learning_rate': 0.0001738980223969502, 'epoch': 0.13}


 13%|█▎        | 2202/16798 [08:51<1:04:22,  3.78it/s]

{'loss': 1.8466, 'grad_norm': 1.6329880952835083, 'learning_rate': 0.0001738861091255659, 'epoch': 0.13}


 13%|█▎        | 2203/16798 [08:51<1:01:44,  3.94it/s]

{'loss': 1.729, 'grad_norm': 1.6254584789276123, 'learning_rate': 0.00017387419585418156, 'epoch': 0.13}


 13%|█▎        | 2204/16798 [08:52<1:02:03,  3.92it/s]

{'loss': 2.238, 'grad_norm': 2.022876501083374, 'learning_rate': 0.00017386228258279725, 'epoch': 0.13}


 13%|█▎        | 2205/16798 [08:52<1:03:17,  3.84it/s]

{'loss': 1.9298, 'grad_norm': 1.9399185180664062, 'learning_rate': 0.0001738503693114129, 'epoch': 0.13}


 13%|█▎        | 2206/16798 [08:52<1:03:57,  3.80it/s]

{'loss': 2.399, 'grad_norm': 2.218625068664551, 'learning_rate': 0.0001738384560400286, 'epoch': 0.13}


 13%|█▎        | 2207/16798 [08:52<1:01:35,  3.95it/s]

{'loss': 1.9021, 'grad_norm': 1.8561601638793945, 'learning_rate': 0.00017382654276864426, 'epoch': 0.13}


 13%|█▎        | 2208/16798 [08:53<1:01:58,  3.92it/s]

{'loss': 2.2635, 'grad_norm': 2.1109087467193604, 'learning_rate': 0.00017381462949725996, 'epoch': 0.13}


 13%|█▎        | 2209/16798 [08:53<1:04:22,  3.78it/s]

{'loss': 2.157, 'grad_norm': 1.9044972658157349, 'learning_rate': 0.00017380271622587562, 'epoch': 0.13}


 13%|█▎        | 2210/16798 [08:53<1:02:35,  3.88it/s]

{'loss': 2.131, 'grad_norm': 2.0328850746154785, 'learning_rate': 0.0001737908029544913, 'epoch': 0.13}


 13%|█▎        | 2211/16798 [08:53<1:00:01,  4.05it/s]

{'loss': 1.8466, 'grad_norm': 2.0715997219085693, 'learning_rate': 0.00017377888968310697, 'epoch': 0.13}


 13%|█▎        | 2212/16798 [08:54<1:06:35,  3.65it/s]

{'loss': 1.8146, 'grad_norm': 1.830257534980774, 'learning_rate': 0.00017376697641172266, 'epoch': 0.13}


 13%|█▎        | 2213/16798 [08:54<1:06:21,  3.66it/s]

{'loss': 1.9521, 'grad_norm': 2.188567876815796, 'learning_rate': 0.00017375506314033833, 'epoch': 0.13}


 13%|█▎        | 2214/16798 [08:54<1:04:29,  3.77it/s]

{'loss': 1.8709, 'grad_norm': 1.9911682605743408, 'learning_rate': 0.00017374314986895402, 'epoch': 0.13}


 13%|█▎        | 2215/16798 [08:55<1:04:57,  3.74it/s]

{'loss': 1.8916, 'grad_norm': 1.8881627321243286, 'learning_rate': 0.00017373123659756968, 'epoch': 0.13}


 13%|█▎        | 2216/16798 [08:55<1:06:54,  3.63it/s]

{'loss': 1.7785, 'grad_norm': 2.004411458969116, 'learning_rate': 0.00017371932332618537, 'epoch': 0.13}


 13%|█▎        | 2217/16798 [08:55<1:04:17,  3.78it/s]

{'loss': 1.9728, 'grad_norm': 2.189108371734619, 'learning_rate': 0.00017370741005480106, 'epoch': 0.13}


 13%|█▎        | 2218/16798 [08:55<1:05:04,  3.73it/s]

{'loss': 1.8368, 'grad_norm': 2.1015465259552, 'learning_rate': 0.00017369549678341675, 'epoch': 0.13}


 13%|█▎        | 2219/16798 [08:56<1:05:51,  3.69it/s]

{'loss': 1.6251, 'grad_norm': 1.715662956237793, 'learning_rate': 0.00017368358351203242, 'epoch': 0.13}


 13%|█▎        | 2220/16798 [08:56<1:05:14,  3.72it/s]

{'loss': 1.8085, 'grad_norm': 1.9062745571136475, 'learning_rate': 0.0001736716702406481, 'epoch': 0.13}


 13%|█▎        | 2221/16798 [08:56<1:05:36,  3.70it/s]

{'loss': 1.1886, 'grad_norm': 1.8695833683013916, 'learning_rate': 0.00017365975696926377, 'epoch': 0.13}


 13%|█▎        | 2222/16798 [08:56<1:07:20,  3.61it/s]

{'loss': 1.5248, 'grad_norm': 1.735039472579956, 'learning_rate': 0.00017364784369787946, 'epoch': 0.13}


 13%|█▎        | 2223/16798 [08:57<1:02:35,  3.88it/s]

{'loss': 1.5185, 'grad_norm': 1.610552191734314, 'learning_rate': 0.00017363593042649513, 'epoch': 0.13}


 13%|█▎        | 2224/16798 [08:57<59:16,  4.10it/s]  

{'loss': 1.6365, 'grad_norm': 1.7866480350494385, 'learning_rate': 0.00017362401715511082, 'epoch': 0.13}


 13%|█▎        | 2225/16798 [08:57<58:38,  4.14it/s]

{'loss': 1.4826, 'grad_norm': 1.720287799835205, 'learning_rate': 0.00017361210388372648, 'epoch': 0.13}


 13%|█▎        | 2226/16798 [08:57<1:00:33,  4.01it/s]

{'loss': 1.6034, 'grad_norm': 2.2199928760528564, 'learning_rate': 0.00017360019061234217, 'epoch': 0.13}


 13%|█▎        | 2227/16798 [08:58<58:36,  4.14it/s]  

{'loss': 1.3771, 'grad_norm': 1.717462182044983, 'learning_rate': 0.00017358827734095784, 'epoch': 0.13}


 13%|█▎        | 2229/16798 [08:58<57:25,  4.23it/s]

{'loss': 1.0639, 'grad_norm': 1.6784348487854004, 'learning_rate': 0.00017357636406957353, 'epoch': 0.13}


 13%|█▎        | 2229/16798 [08:58<57:25,  4.23it/s]

{'loss': 1.7993, 'grad_norm': 2.1994080543518066, 'learning_rate': 0.0001735644507981892, 'epoch': 0.13}


 13%|█▎        | 2230/16798 [08:58<1:00:57,  3.98it/s]

{'loss': 1.3953, 'grad_norm': 1.8248711824417114, 'learning_rate': 0.00017355253752680488, 'epoch': 0.13}


 13%|█▎        | 2231/16798 [08:59<57:39,  4.21it/s]  

{'loss': 1.2643, 'grad_norm': 1.6100207567214966, 'learning_rate': 0.00017354062425542054, 'epoch': 0.13}


 13%|█▎        | 2232/16798 [08:59<57:39,  4.21it/s]

{'loss': 1.2607, 'grad_norm': 1.7177544832229614, 'learning_rate': 0.00017352871098403624, 'epoch': 0.13}


 13%|█▎        | 2233/16798 [08:59<58:09,  4.17it/s]

{'loss': 2.2617, 'grad_norm': 2.2344281673431396, 'learning_rate': 0.0001735167977126519, 'epoch': 0.13}


 13%|█▎        | 2234/16798 [08:59<1:02:01,  3.91it/s]

{'loss': 1.7024, 'grad_norm': 2.0645928382873535, 'learning_rate': 0.0001735048844412676, 'epoch': 0.13}


 13%|█▎        | 2235/16798 [09:00<1:00:53,  3.99it/s]

{'loss': 1.566, 'grad_norm': 1.9413386583328247, 'learning_rate': 0.00017349297116988325, 'epoch': 0.13}


 13%|█▎        | 2236/16798 [09:00<59:11,  4.10it/s]  

{'loss': 1.3442, 'grad_norm': 2.1686666011810303, 'learning_rate': 0.00017348105789849894, 'epoch': 0.13}


 13%|█▎        | 2237/16798 [09:00<56:43,  4.28it/s]

{'loss': 1.4745, 'grad_norm': 1.95595121383667, 'learning_rate': 0.0001734691446271146, 'epoch': 0.13}


 13%|█▎        | 2238/16798 [09:00<1:00:27,  4.01it/s]

{'loss': 2.0753, 'grad_norm': 3.491628885269165, 'learning_rate': 0.0001734572313557303, 'epoch': 0.13}


 13%|█▎        | 2239/16798 [09:01<58:14,  4.17it/s]  

{'loss': 1.5557, 'grad_norm': 2.8373911380767822, 'learning_rate': 0.00017344531808434596, 'epoch': 0.13}


 13%|█▎        | 2240/16798 [09:01<58:11,  4.17it/s]

{'loss': 1.1526, 'grad_norm': 1.9528350830078125, 'learning_rate': 0.00017343340481296165, 'epoch': 0.13}


 13%|█▎        | 2241/16798 [09:01<57:35,  4.21it/s]

{'loss': 1.4029, 'grad_norm': 1.776206374168396, 'learning_rate': 0.00017342149154157732, 'epoch': 0.13}


 13%|█▎        | 2242/16798 [09:01<1:02:17,  3.89it/s]

{'loss': 1.4029, 'grad_norm': 2.0035037994384766, 'learning_rate': 0.000173409578270193, 'epoch': 0.13}


 13%|█▎        | 2243/16798 [09:02<1:00:00,  4.04it/s]

{'loss': 1.4979, 'grad_norm': 2.0978305339813232, 'learning_rate': 0.00017339766499880867, 'epoch': 0.13}


 13%|█▎        | 2244/16798 [09:02<56:53,  4.26it/s]  

{'loss': 1.2168, 'grad_norm': 1.860694169998169, 'learning_rate': 0.00017338575172742436, 'epoch': 0.13}


 13%|█▎        | 2245/16798 [09:02<57:05,  4.25it/s]

{'loss': 1.1776, 'grad_norm': 2.0061535835266113, 'learning_rate': 0.00017337383845604003, 'epoch': 0.13}


 13%|█▎        | 2246/16798 [09:02<59:59,  4.04it/s]

{'loss': 1.2715, 'grad_norm': 1.7861933708190918, 'learning_rate': 0.00017336192518465572, 'epoch': 0.13}


 13%|█▎        | 2247/16798 [09:03<1:02:20,  3.89it/s]

{'loss': 1.3471, 'grad_norm': 2.0259437561035156, 'learning_rate': 0.00017335001191327138, 'epoch': 0.13}


 13%|█▎        | 2248/16798 [09:03<58:18,  4.16it/s]  

{'loss': 1.0004, 'grad_norm': 1.7053390741348267, 'learning_rate': 0.00017333809864188707, 'epoch': 0.13}


 13%|█▎        | 2249/16798 [09:03<1:01:41,  3.93it/s]

{'loss': 0.2897, 'grad_norm': 0.802210807800293, 'learning_rate': 0.00017332618537050276, 'epoch': 0.13}


 13%|█▎        | 2250/16798 [09:03<1:04:01,  3.79it/s]

{'loss': 0.5858, 'grad_norm': 1.2908697128295898, 'learning_rate': 0.00017331427209911843, 'epoch': 0.13}


 13%|█▎        | 2251/16798 [09:04<1:03:36,  3.81it/s]

{'loss': 2.2933, 'grad_norm': 1.8087797164916992, 'learning_rate': 0.00017330235882773412, 'epoch': 0.13}


 13%|█▎        | 2252/16798 [09:04<1:03:26,  3.82it/s]

{'loss': 2.0914, 'grad_norm': 1.8037983179092407, 'learning_rate': 0.00017329044555634978, 'epoch': 0.13}


 13%|█▎        | 2253/16798 [09:04<1:07:43,  3.58it/s]

{'loss': 2.0869, 'grad_norm': 1.8719090223312378, 'learning_rate': 0.00017327853228496547, 'epoch': 0.13}


 13%|█▎        | 2254/16798 [09:04<1:04:54,  3.73it/s]

{'loss': 1.9114, 'grad_norm': 1.8005123138427734, 'learning_rate': 0.00017326661901358113, 'epoch': 0.13}


 13%|█▎        | 2255/16798 [09:05<1:02:36,  3.87it/s]

{'loss': 2.2853, 'grad_norm': 1.9281558990478516, 'learning_rate': 0.00017325470574219682, 'epoch': 0.13}


 13%|█▎        | 2256/16798 [09:05<1:06:21,  3.65it/s]

{'loss': 2.0698, 'grad_norm': 2.2224512100219727, 'learning_rate': 0.0001732427924708125, 'epoch': 0.13}


 13%|█▎        | 2257/16798 [09:05<1:04:16,  3.77it/s]

{'loss': 1.8891, 'grad_norm': 1.7584648132324219, 'learning_rate': 0.00017323087919942818, 'epoch': 0.13}


 13%|█▎        | 2258/16798 [09:05<1:02:46,  3.86it/s]

{'loss': 1.7712, 'grad_norm': 1.7070326805114746, 'learning_rate': 0.00017321896592804384, 'epoch': 0.13}


 13%|█▎        | 2259/16798 [09:06<1:05:42,  3.69it/s]

{'loss': 2.1401, 'grad_norm': 1.8728458881378174, 'learning_rate': 0.00017320705265665953, 'epoch': 0.13}


 13%|█▎        | 2260/16798 [09:06<1:02:31,  3.88it/s]

{'loss': 1.9757, 'grad_norm': 1.6623541116714478, 'learning_rate': 0.0001731951393852752, 'epoch': 0.13}


 13%|█▎        | 2261/16798 [09:06<1:02:53,  3.85it/s]

{'loss': 2.0199, 'grad_norm': 1.934619426727295, 'learning_rate': 0.0001731832261138909, 'epoch': 0.13}


 13%|█▎        | 2262/16798 [09:07<1:05:50,  3.68it/s]

{'loss': 2.1679, 'grad_norm': 1.907783031463623, 'learning_rate': 0.00017317131284250655, 'epoch': 0.13}


 13%|█▎        | 2263/16798 [09:07<1:03:59,  3.79it/s]

{'loss': 1.6628, 'grad_norm': 1.78459632396698, 'learning_rate': 0.00017315939957112224, 'epoch': 0.13}


 13%|█▎        | 2264/16798 [09:07<1:03:41,  3.80it/s]

{'loss': 1.4246, 'grad_norm': 1.7078447341918945, 'learning_rate': 0.0001731474862997379, 'epoch': 0.13}


 13%|█▎        | 2265/16798 [09:07<1:07:33,  3.59it/s]

{'loss': 1.5595, 'grad_norm': 1.8113071918487549, 'learning_rate': 0.0001731355730283536, 'epoch': 0.13}


 13%|█▎        | 2266/16798 [09:08<1:04:45,  3.74it/s]

{'loss': 1.783, 'grad_norm': 1.9395798444747925, 'learning_rate': 0.00017312365975696926, 'epoch': 0.13}


 13%|█▎        | 2267/16798 [09:08<1:08:55,  3.51it/s]

{'loss': 1.4697, 'grad_norm': 1.633042573928833, 'learning_rate': 0.00017311174648558495, 'epoch': 0.13}


 14%|█▎        | 2268/16798 [09:08<1:10:26,  3.44it/s]

{'loss': 1.6552, 'grad_norm': 1.9003620147705078, 'learning_rate': 0.00017309983321420061, 'epoch': 0.14}


 14%|█▎        | 2269/16798 [09:08<1:06:20,  3.65it/s]

{'loss': 1.401, 'grad_norm': 1.5881776809692383, 'learning_rate': 0.0001730879199428163, 'epoch': 0.14}


 14%|█▎        | 2270/16798 [09:09<1:04:13,  3.77it/s]

{'loss': 1.3585, 'grad_norm': 1.9757776260375977, 'learning_rate': 0.00017307600667143197, 'epoch': 0.14}


 14%|█▎        | 2271/16798 [09:09<1:03:57,  3.79it/s]

{'loss': 1.4546, 'grad_norm': 1.629521369934082, 'learning_rate': 0.00017306409340004766, 'epoch': 0.14}


 14%|█▎        | 2272/16798 [09:09<1:01:14,  3.95it/s]

{'loss': 1.7912, 'grad_norm': 2.032193899154663, 'learning_rate': 0.00017305218012866332, 'epoch': 0.14}


 14%|█▎        | 2273/16798 [09:09<1:02:21,  3.88it/s]

{'loss': 1.4526, 'grad_norm': 1.4722867012023926, 'learning_rate': 0.00017304026685727901, 'epoch': 0.14}


 14%|█▎        | 2274/16798 [09:10<1:03:43,  3.80it/s]

{'loss': 1.6457, 'grad_norm': 1.9606300592422485, 'learning_rate': 0.00017302835358589468, 'epoch': 0.14}


 14%|█▎        | 2275/16798 [09:10<1:02:55,  3.85it/s]

{'loss': 2.0341, 'grad_norm': 2.629512310028076, 'learning_rate': 0.00017301644031451037, 'epoch': 0.14}


 14%|█▎        | 2276/16798 [09:10<1:03:37,  3.80it/s]

{'loss': 1.5678, 'grad_norm': 1.9916610717773438, 'learning_rate': 0.00017300452704312603, 'epoch': 0.14}


 14%|█▎        | 2277/16798 [09:10<1:01:22,  3.94it/s]

{'loss': 1.6564, 'grad_norm': 2.096635580062866, 'learning_rate': 0.00017299261377174172, 'epoch': 0.14}


 14%|█▎        | 2278/16798 [09:11<1:05:26,  3.70it/s]

{'loss': 2.056, 'grad_norm': 2.022371768951416, 'learning_rate': 0.0001729807005003574, 'epoch': 0.14}


 14%|█▎        | 2279/16798 [09:11<1:06:22,  3.65it/s]

{'loss': 1.2355, 'grad_norm': 1.7233110666275024, 'learning_rate': 0.0001729687872289731, 'epoch': 0.14}


 14%|█▎        | 2280/16798 [09:11<1:03:48,  3.79it/s]

{'loss': 1.7216, 'grad_norm': 1.9943606853485107, 'learning_rate': 0.00017295687395758877, 'epoch': 0.14}


 14%|█▎        | 2281/16798 [09:12<1:00:46,  3.98it/s]

{'loss': 1.7089, 'grad_norm': 1.8939199447631836, 'learning_rate': 0.00017294496068620446, 'epoch': 0.14}


 14%|█▎        | 2282/16798 [09:12<1:05:31,  3.69it/s]

{'loss': 1.4982, 'grad_norm': 1.829732894897461, 'learning_rate': 0.00017293304741482012, 'epoch': 0.14}


 14%|█▎        | 2283/16798 [09:12<1:02:01,  3.90it/s]

{'loss': 1.2709, 'grad_norm': 1.6331062316894531, 'learning_rate': 0.00017292113414343581, 'epoch': 0.14}


 14%|█▎        | 2284/16798 [09:12<1:00:27,  4.00it/s]

{'loss': 1.6199, 'grad_norm': 1.8092292547225952, 'learning_rate': 0.00017290922087205148, 'epoch': 0.14}


 14%|█▎        | 2285/16798 [09:13<1:02:12,  3.89it/s]

{'loss': 1.5647, 'grad_norm': 2.2203428745269775, 'learning_rate': 0.00017289730760066717, 'epoch': 0.14}


 14%|█▎        | 2286/16798 [09:13<1:01:22,  3.94it/s]

{'loss': 1.6197, 'grad_norm': 2.2407944202423096, 'learning_rate': 0.00017288539432928283, 'epoch': 0.14}


 14%|█▎        | 2287/16798 [09:13<1:03:58,  3.78it/s]

{'loss': 1.8896, 'grad_norm': 2.003068208694458, 'learning_rate': 0.00017287348105789852, 'epoch': 0.14}


 14%|█▎        | 2288/16798 [09:13<1:05:08,  3.71it/s]

{'loss': 1.0928, 'grad_norm': 1.5732104778289795, 'learning_rate': 0.00017286156778651419, 'epoch': 0.14}


 14%|█▎        | 2289/16798 [09:14<1:05:22,  3.70it/s]

{'loss': 1.634, 'grad_norm': 2.1293365955352783, 'learning_rate': 0.00017284965451512988, 'epoch': 0.14}


 14%|█▎        | 2290/16798 [09:14<1:03:01,  3.84it/s]

{'loss': 1.4815, 'grad_norm': 1.759903073310852, 'learning_rate': 0.00017283774124374554, 'epoch': 0.14}


 14%|█▎        | 2291/16798 [09:14<1:02:31,  3.87it/s]

{'loss': 1.1619, 'grad_norm': 1.6830344200134277, 'learning_rate': 0.00017282582797236123, 'epoch': 0.14}


 14%|█▎        | 2292/16798 [09:14<1:04:27,  3.75it/s]

{'loss': 1.2077, 'grad_norm': 1.7272592782974243, 'learning_rate': 0.0001728139147009769, 'epoch': 0.14}


 14%|█▎        | 2293/16798 [09:15<1:01:28,  3.93it/s]

{'loss': 1.0199, 'grad_norm': 1.6220418214797974, 'learning_rate': 0.00017280200142959259, 'epoch': 0.14}


 14%|█▎        | 2294/16798 [09:15<1:03:37,  3.80it/s]

{'loss': 1.36, 'grad_norm': 1.7098075151443481, 'learning_rate': 0.00017279008815820825, 'epoch': 0.14}


 14%|█▎        | 2295/16798 [09:15<1:05:58,  3.66it/s]

{'loss': 1.1783, 'grad_norm': 1.6475564241409302, 'learning_rate': 0.00017277817488682394, 'epoch': 0.14}


 14%|█▎        | 2296/16798 [09:16<1:12:33,  3.33it/s]

{'loss': 1.3494, 'grad_norm': 1.8534018993377686, 'learning_rate': 0.0001727662616154396, 'epoch': 0.14}


 14%|█▎        | 2297/16798 [09:16<1:14:49,  3.23it/s]

{'loss': 1.0999, 'grad_norm': 1.6186367273330688, 'learning_rate': 0.0001727543483440553, 'epoch': 0.14}


 14%|█▎        | 2298/16798 [09:16<1:09:25,  3.48it/s]

{'loss': 0.9038, 'grad_norm': 1.558963418006897, 'learning_rate': 0.00017274243507267096, 'epoch': 0.14}


 14%|█▎        | 2299/16798 [09:16<1:07:40,  3.57it/s]

{'loss': 0.5495, 'grad_norm': 1.335471272468567, 'learning_rate': 0.00017273052180128665, 'epoch': 0.14}


 14%|█▎        | 2300/16798 [09:17<1:06:42,  3.62it/s]

{'loss': 0.2958, 'grad_norm': 0.8609884977340698, 'learning_rate': 0.0001727186085299023, 'epoch': 0.14}


 14%|█▎        | 2301/16798 [09:17<1:06:48,  3.62it/s]

{'loss': 1.8056, 'grad_norm': 1.665463924407959, 'learning_rate': 0.000172706695258518, 'epoch': 0.14}


 14%|█▎        | 2302/16798 [09:17<1:03:15,  3.82it/s]

{'loss': 2.0699, 'grad_norm': 1.8959513902664185, 'learning_rate': 0.00017269478198713367, 'epoch': 0.14}


 14%|█▎        | 2303/16798 [09:17<1:05:06,  3.71it/s]

{'loss': 2.3303, 'grad_norm': 2.1180899143218994, 'learning_rate': 0.00017268286871574936, 'epoch': 0.14}


 14%|█▎        | 2304/16798 [09:18<1:02:25,  3.87it/s]

{'loss': 2.1744, 'grad_norm': 2.0074737071990967, 'learning_rate': 0.00017267095544436502, 'epoch': 0.14}


 14%|█▎        | 2305/16798 [09:18<1:00:08,  4.02it/s]

{'loss': 2.1558, 'grad_norm': 1.7935155630111694, 'learning_rate': 0.0001726590421729807, 'epoch': 0.14}


 14%|█▎        | 2306/16798 [09:18<58:49,  4.11it/s]  

{'loss': 1.8996, 'grad_norm': 1.8844012022018433, 'learning_rate': 0.00017264712890159638, 'epoch': 0.14}


 14%|█▎        | 2307/16798 [09:18<1:02:36,  3.86it/s]

{'loss': 2.3417, 'grad_norm': 2.2079997062683105, 'learning_rate': 0.00017263521563021207, 'epoch': 0.14}


 14%|█▎        | 2308/16798 [09:19<1:02:59,  3.83it/s]

{'loss': 1.4038, 'grad_norm': 1.5672662258148193, 'learning_rate': 0.00017262330235882773, 'epoch': 0.14}


 14%|█▎        | 2309/16798 [09:19<1:02:03,  3.89it/s]

{'loss': 1.8505, 'grad_norm': 1.84551203250885, 'learning_rate': 0.00017261138908744342, 'epoch': 0.14}


 14%|█▍        | 2310/16798 [09:19<1:06:38,  3.62it/s]

{'loss': 1.5939, 'grad_norm': 1.764321208000183, 'learning_rate': 0.0001725994758160591, 'epoch': 0.14}


 14%|█▍        | 2311/16798 [09:20<1:03:52,  3.78it/s]

{'loss': 1.6786, 'grad_norm': 1.7850168943405151, 'learning_rate': 0.00017258756254467478, 'epoch': 0.14}


 14%|█▍        | 2312/16798 [09:20<1:01:12,  3.94it/s]

{'loss': 1.6994, 'grad_norm': 1.9576377868652344, 'learning_rate': 0.00017257564927329047, 'epoch': 0.14}


 14%|█▍        | 2313/16798 [09:20<1:04:48,  3.72it/s]

{'loss': 1.4906, 'grad_norm': 2.0070347785949707, 'learning_rate': 0.00017256373600190613, 'epoch': 0.14}


 14%|█▍        | 2314/16798 [09:20<1:01:45,  3.91it/s]

{'loss': 1.6845, 'grad_norm': 1.8436275720596313, 'learning_rate': 0.00017255182273052182, 'epoch': 0.14}


 14%|█▍        | 2315/16798 [09:21<1:01:22,  3.93it/s]

{'loss': 1.4596, 'grad_norm': 1.6320492029190063, 'learning_rate': 0.00017253990945913748, 'epoch': 0.14}


 14%|█▍        | 2316/16798 [09:21<1:03:57,  3.77it/s]

{'loss': 1.3838, 'grad_norm': 1.5448933839797974, 'learning_rate': 0.00017252799618775318, 'epoch': 0.14}


 14%|█▍        | 2317/16798 [09:21<1:01:41,  3.91it/s]

{'loss': 1.8307, 'grad_norm': 2.119354248046875, 'learning_rate': 0.00017251608291636884, 'epoch': 0.14}


 14%|█▍        | 2318/16798 [09:21<1:02:21,  3.87it/s]

{'loss': 1.3379, 'grad_norm': 1.531785488128662, 'learning_rate': 0.00017250416964498453, 'epoch': 0.14}


 14%|█▍        | 2319/16798 [09:22<1:05:10,  3.70it/s]

{'loss': 1.4505, 'grad_norm': 1.7400896549224854, 'learning_rate': 0.0001724922563736002, 'epoch': 0.14}


 14%|█▍        | 2320/16798 [09:22<1:02:58,  3.83it/s]

{'loss': 2.1695, 'grad_norm': 4.22025203704834, 'learning_rate': 0.00017248034310221588, 'epoch': 0.14}


 14%|█▍        | 2321/16798 [09:22<1:00:59,  3.96it/s]

{'loss': 1.767, 'grad_norm': 1.886054277420044, 'learning_rate': 0.00017246842983083155, 'epoch': 0.14}


 14%|█▍        | 2322/16798 [09:22<1:03:11,  3.82it/s]

{'loss': 1.8839, 'grad_norm': 2.0008037090301514, 'learning_rate': 0.00017245651655944724, 'epoch': 0.14}


 14%|█▍        | 2323/16798 [09:23<1:01:25,  3.93it/s]

{'loss': 1.612, 'grad_norm': 1.7973649501800537, 'learning_rate': 0.0001724446032880629, 'epoch': 0.14}


 14%|█▍        | 2324/16798 [09:23<1:02:53,  3.84it/s]

{'loss': 1.2964, 'grad_norm': 1.641129493713379, 'learning_rate': 0.0001724326900166786, 'epoch': 0.14}


 14%|█▍        | 2325/16798 [09:23<1:06:07,  3.65it/s]

{'loss': 1.7164, 'grad_norm': 1.8145827054977417, 'learning_rate': 0.00017242077674529426, 'epoch': 0.14}


 14%|█▍        | 2326/16798 [09:23<1:04:20,  3.75it/s]

{'loss': 1.738, 'grad_norm': 1.8862955570220947, 'learning_rate': 0.00017240886347390995, 'epoch': 0.14}


 14%|█▍        | 2327/16798 [09:24<1:03:02,  3.83it/s]

{'loss': 1.5708, 'grad_norm': 1.7218239307403564, 'learning_rate': 0.0001723969502025256, 'epoch': 0.14}


 14%|█▍        | 2328/16798 [09:24<1:06:20,  3.64it/s]

{'loss': 1.5525, 'grad_norm': 2.016690731048584, 'learning_rate': 0.0001723850369311413, 'epoch': 0.14}


 14%|█▍        | 2329/16798 [09:24<1:05:07,  3.70it/s]

{'loss': 1.6097, 'grad_norm': 1.7181504964828491, 'learning_rate': 0.00017237312365975697, 'epoch': 0.14}


 14%|█▍        | 2330/16798 [09:25<1:03:53,  3.77it/s]

{'loss': 1.6705, 'grad_norm': 2.184349775314331, 'learning_rate': 0.00017236121038837266, 'epoch': 0.14}


 14%|█▍        | 2331/16798 [09:25<1:06:34,  3.62it/s]

{'loss': 1.2149, 'grad_norm': 1.6386445760726929, 'learning_rate': 0.00017234929711698832, 'epoch': 0.14}


 14%|█▍        | 2332/16798 [09:25<1:03:36,  3.79it/s]

{'loss': 1.9182, 'grad_norm': 2.2005178928375244, 'learning_rate': 0.000172337383845604, 'epoch': 0.14}


 14%|█▍        | 2333/16798 [09:25<1:00:35,  3.98it/s]

{'loss': 1.2275, 'grad_norm': 1.6937482357025146, 'learning_rate': 0.00017232547057421967, 'epoch': 0.14}


 14%|█▍        | 2334/16798 [09:26<1:03:20,  3.81it/s]

{'loss': 1.637, 'grad_norm': 2.037034511566162, 'learning_rate': 0.00017231355730283537, 'epoch': 0.14}


 14%|█▍        | 2335/16798 [09:26<1:06:17,  3.64it/s]

{'loss': 1.2763, 'grad_norm': 1.7404981851577759, 'learning_rate': 0.00017230164403145103, 'epoch': 0.14}


 14%|█▍        | 2336/16798 [09:26<1:03:10,  3.82it/s]

{'loss': 1.6615, 'grad_norm': 2.117366313934326, 'learning_rate': 0.00017228973076006672, 'epoch': 0.14}


 14%|█▍        | 2337/16798 [09:26<1:04:40,  3.73it/s]

{'loss': 1.5575, 'grad_norm': 1.9444172382354736, 'learning_rate': 0.00017227781748868238, 'epoch': 0.14}


 14%|█▍        | 2338/16798 [09:27<1:04:53,  3.71it/s]

{'loss': 1.5037, 'grad_norm': 2.0201759338378906, 'learning_rate': 0.00017226590421729807, 'epoch': 0.14}


 14%|█▍        | 2340/16798 [09:27<1:01:08,  3.94it/s]

{'loss': 0.8691, 'grad_norm': 1.7623417377471924, 'learning_rate': 0.00017225399094591374, 'epoch': 0.14}


 14%|█▍        | 2340/16798 [09:27<1:01:08,  3.94it/s]

{'loss': 1.3705, 'grad_norm': 1.7666740417480469, 'learning_rate': 0.00017224207767452946, 'epoch': 0.14}


 14%|█▍        | 2341/16798 [09:27<58:37,  4.11it/s]  

{'loss': 1.3529, 'grad_norm': 1.949908971786499, 'learning_rate': 0.00017223016440314512, 'epoch': 0.14}


 14%|█▍        | 2342/16798 [09:28<1:00:37,  3.97it/s]

{'loss': 1.3621, 'grad_norm': 1.7744426727294922, 'learning_rate': 0.0001722182511317608, 'epoch': 0.14}


 14%|█▍        | 2343/16798 [09:28<1:00:20,  3.99it/s]

{'loss': 1.0495, 'grad_norm': 1.6717329025268555, 'learning_rate': 0.00017220633786037647, 'epoch': 0.14}


 14%|█▍        | 2344/16798 [09:28<58:28,  4.12it/s]  

{'loss': 1.0944, 'grad_norm': 1.6075329780578613, 'learning_rate': 0.00017219442458899216, 'epoch': 0.14}


 14%|█▍        | 2345/16798 [09:28<59:53,  4.02it/s]

{'loss': 1.2377, 'grad_norm': 1.9080439805984497, 'learning_rate': 0.00017218251131760783, 'epoch': 0.14}


 14%|█▍        | 2346/16798 [09:29<1:00:50,  3.96it/s]

{'loss': 1.0229, 'grad_norm': 1.518939733505249, 'learning_rate': 0.00017217059804622352, 'epoch': 0.14}


 14%|█▍        | 2347/16798 [09:29<1:00:07,  4.01it/s]

{'loss': 1.0208, 'grad_norm': 1.68938410282135, 'learning_rate': 0.00017215868477483918, 'epoch': 0.14}


 14%|█▍        | 2348/16798 [09:29<1:02:09,  3.87it/s]

{'loss': 0.4037, 'grad_norm': 0.8856857419013977, 'learning_rate': 0.00017214677150345487, 'epoch': 0.14}


 14%|█▍        | 2350/16798 [09:30<59:32,  4.04it/s]  

{'loss': 1.2491, 'grad_norm': 2.1678574085235596, 'learning_rate': 0.00017213485823207054, 'epoch': 0.14}


 14%|█▍        | 2350/16798 [09:30<59:32,  4.04it/s]

{'loss': 0.4842, 'grad_norm': 1.1825798749923706, 'learning_rate': 0.00017212294496068623, 'epoch': 0.14}


 14%|█▍        | 2351/16798 [09:30<57:44,  4.17it/s]

{'loss': 2.1283, 'grad_norm': 1.885239839553833, 'learning_rate': 0.0001721110316893019, 'epoch': 0.14}


 14%|█▍        | 2352/16798 [09:30<1:03:26,  3.80it/s]

{'loss': 2.1273, 'grad_norm': 1.7494536638259888, 'learning_rate': 0.00017209911841791758, 'epoch': 0.14}


 14%|█▍        | 2353/16798 [09:30<1:02:27,  3.85it/s]

{'loss': 1.9958, 'grad_norm': 2.020498514175415, 'learning_rate': 0.00017208720514653325, 'epoch': 0.14}


 14%|█▍        | 2354/16798 [09:31<1:04:24,  3.74it/s]

{'loss': 2.3175, 'grad_norm': 2.017603874206543, 'learning_rate': 0.00017207529187514894, 'epoch': 0.14}


 14%|█▍        | 2355/16798 [09:31<1:10:01,  3.44it/s]

{'loss': 1.7695, 'grad_norm': 1.8667619228363037, 'learning_rate': 0.0001720633786037646, 'epoch': 0.14}


 14%|█▍        | 2356/16798 [09:31<1:07:32,  3.56it/s]

{'loss': 2.1751, 'grad_norm': 2.0573391914367676, 'learning_rate': 0.0001720514653323803, 'epoch': 0.14}


 14%|█▍        | 2357/16798 [09:32<1:07:58,  3.54it/s]

{'loss': 2.1088, 'grad_norm': 1.8691726922988892, 'learning_rate': 0.00017203955206099595, 'epoch': 0.14}


 14%|█▍        | 2358/16798 [09:32<1:07:36,  3.56it/s]

{'loss': 1.9384, 'grad_norm': 1.715884804725647, 'learning_rate': 0.00017202763878961165, 'epoch': 0.14}


 14%|█▍        | 2359/16798 [09:32<1:05:00,  3.70it/s]

{'loss': 1.7327, 'grad_norm': 1.8961650133132935, 'learning_rate': 0.0001720157255182273, 'epoch': 0.14}


 14%|█▍        | 2360/16798 [09:32<1:01:23,  3.92it/s]

{'loss': 1.7056, 'grad_norm': 1.6546473503112793, 'learning_rate': 0.000172003812246843, 'epoch': 0.14}


 14%|█▍        | 2361/16798 [09:33<59:18,  4.06it/s]  

{'loss': 1.7928, 'grad_norm': 1.787192702293396, 'learning_rate': 0.00017199189897545866, 'epoch': 0.14}


 14%|█▍        | 2362/16798 [09:33<1:01:51,  3.89it/s]

{'loss': 1.6097, 'grad_norm': 2.2181196212768555, 'learning_rate': 0.00017197998570407435, 'epoch': 0.14}


 14%|█▍        | 2363/16798 [09:33<1:02:56,  3.82it/s]

{'loss': 1.1408, 'grad_norm': 1.5951907634735107, 'learning_rate': 0.00017196807243269002, 'epoch': 0.14}


 14%|█▍        | 2364/16798 [09:33<1:00:04,  4.00it/s]

{'loss': 1.5062, 'grad_norm': 1.7396177053451538, 'learning_rate': 0.0001719561591613057, 'epoch': 0.14}


 14%|█▍        | 2365/16798 [09:34<1:01:41,  3.90it/s]

{'loss': 1.8461, 'grad_norm': 2.1094329357147217, 'learning_rate': 0.00017194424588992137, 'epoch': 0.14}


 14%|█▍        | 2366/16798 [09:34<1:01:38,  3.90it/s]

{'loss': 1.6549, 'grad_norm': 2.11749529838562, 'learning_rate': 0.00017193233261853706, 'epoch': 0.14}


 14%|█▍        | 2367/16798 [09:34<1:00:25,  3.98it/s]

{'loss': 1.7038, 'grad_norm': 1.7369494438171387, 'learning_rate': 0.00017192041934715273, 'epoch': 0.14}


 14%|█▍        | 2368/16798 [09:34<1:07:05,  3.58it/s]

{'loss': 1.4001, 'grad_norm': 2.2022268772125244, 'learning_rate': 0.00017190850607576842, 'epoch': 0.14}


 14%|█▍        | 2369/16798 [09:35<1:04:55,  3.70it/s]

{'loss': 1.5524, 'grad_norm': 1.722756028175354, 'learning_rate': 0.00017189659280438408, 'epoch': 0.14}


 14%|█▍        | 2370/16798 [09:35<1:05:11,  3.69it/s]

{'loss': 1.544, 'grad_norm': 2.1555802822113037, 'learning_rate': 0.00017188467953299977, 'epoch': 0.14}


 14%|█▍        | 2371/16798 [09:35<1:08:54,  3.49it/s]

{'loss': 1.6627, 'grad_norm': 1.8759887218475342, 'learning_rate': 0.00017187276626161546, 'epoch': 0.14}


 14%|█▍        | 2372/16798 [09:36<1:06:05,  3.64it/s]

{'loss': 1.7048, 'grad_norm': 2.147994041442871, 'learning_rate': 0.00017186085299023113, 'epoch': 0.14}


 14%|█▍        | 2373/16798 [09:36<1:06:17,  3.63it/s]

{'loss': 1.6778, 'grad_norm': 2.211338758468628, 'learning_rate': 0.00017184893971884682, 'epoch': 0.14}


 14%|█▍        | 2374/16798 [09:36<1:07:26,  3.56it/s]

{'loss': 1.7939, 'grad_norm': 1.8981322050094604, 'learning_rate': 0.00017183702644746248, 'epoch': 0.14}


 14%|█▍        | 2375/16798 [09:36<1:05:35,  3.67it/s]

{'loss': 1.586, 'grad_norm': 2.08587384223938, 'learning_rate': 0.00017182511317607817, 'epoch': 0.14}


 14%|█▍        | 2376/16798 [09:37<1:04:10,  3.75it/s]

{'loss': 1.7256, 'grad_norm': 1.8613766431808472, 'learning_rate': 0.00017181319990469384, 'epoch': 0.14}


 14%|█▍        | 2377/16798 [09:37<1:06:08,  3.63it/s]

{'loss': 1.6268, 'grad_norm': 2.0739455223083496, 'learning_rate': 0.00017180128663330953, 'epoch': 0.14}


 14%|█▍        | 2378/16798 [09:37<1:03:19,  3.80it/s]

{'loss': 1.1659, 'grad_norm': 1.7319374084472656, 'learning_rate': 0.0001717893733619252, 'epoch': 0.14}


 14%|█▍        | 2379/16798 [09:37<1:01:05,  3.93it/s]

{'loss': 1.3398, 'grad_norm': 1.6880762577056885, 'learning_rate': 0.00017177746009054088, 'epoch': 0.14}


 14%|█▍        | 2380/16798 [09:38<1:02:51,  3.82it/s]

{'loss': 1.0325, 'grad_norm': 1.7118576765060425, 'learning_rate': 0.00017176554681915654, 'epoch': 0.14}


 14%|█▍        | 2381/16798 [09:38<1:07:06,  3.58it/s]

{'loss': 1.3576, 'grad_norm': 1.7689307928085327, 'learning_rate': 0.00017175363354777223, 'epoch': 0.14}


 14%|█▍        | 2382/16798 [09:38<1:02:41,  3.83it/s]

{'loss': 1.3921, 'grad_norm': 1.8348926305770874, 'learning_rate': 0.0001717417202763879, 'epoch': 0.14}


 14%|█▍        | 2383/16798 [09:38<1:00:43,  3.96it/s]

{'loss': 1.4289, 'grad_norm': 1.9880857467651367, 'learning_rate': 0.0001717298070050036, 'epoch': 0.14}


 14%|█▍        | 2384/16798 [09:39<1:05:55,  3.64it/s]

{'loss': 1.4415, 'grad_norm': 1.8230390548706055, 'learning_rate': 0.00017171789373361925, 'epoch': 0.14}


 14%|█▍        | 2385/16798 [09:39<1:03:04,  3.81it/s]

{'loss': 1.3084, 'grad_norm': 2.3636362552642822, 'learning_rate': 0.00017170598046223494, 'epoch': 0.14}


 14%|█▍        | 2386/16798 [09:39<1:02:05,  3.87it/s]

{'loss': 1.5, 'grad_norm': 2.1568212509155273, 'learning_rate': 0.0001716940671908506, 'epoch': 0.14}


 14%|█▍        | 2387/16798 [09:40<1:05:08,  3.69it/s]

{'loss': 1.377, 'grad_norm': 1.7730109691619873, 'learning_rate': 0.0001716821539194663, 'epoch': 0.14}


 14%|█▍        | 2389/16798 [09:40<1:01:07,  3.93it/s]

{'loss': 1.645, 'grad_norm': 2.305154323577881, 'learning_rate': 0.00017167024064808196, 'epoch': 0.14}


 14%|█▍        | 2389/16798 [09:40<1:01:07,  3.93it/s]

{'loss': 1.2874, 'grad_norm': 1.8077713251113892, 'learning_rate': 0.00017165832737669765, 'epoch': 0.14}


 14%|█▍        | 2390/16798 [09:40<1:05:09,  3.69it/s]

{'loss': 1.2526, 'grad_norm': 1.6865047216415405, 'learning_rate': 0.00017164641410531332, 'epoch': 0.14}


 14%|█▍        | 2391/16798 [09:41<1:05:11,  3.68it/s]

{'loss': 1.1021, 'grad_norm': 1.9278734922409058, 'learning_rate': 0.000171634500833929, 'epoch': 0.14}


 14%|█▍        | 2392/16798 [09:41<1:03:47,  3.76it/s]

{'loss': 1.3686, 'grad_norm': 1.7349331378936768, 'learning_rate': 0.00017162258756254467, 'epoch': 0.14}


 14%|█▍        | 2394/16798 [09:41<59:56,  4.01it/s]  

{'loss': 0.7769, 'grad_norm': 1.663190484046936, 'learning_rate': 0.00017161067429116036, 'epoch': 0.14}


 14%|█▍        | 2394/16798 [09:41<59:56,  4.01it/s]

{'loss': 1.2721, 'grad_norm': 1.6687506437301636, 'learning_rate': 0.00017159876101977602, 'epoch': 0.14}


 14%|█▍        | 2395/16798 [09:42<58:51,  4.08it/s]

{'loss': 1.0053, 'grad_norm': 1.5623672008514404, 'learning_rate': 0.00017158684774839172, 'epoch': 0.14}


 14%|█▍        | 2396/16798 [09:42<58:44,  4.09it/s]

{'loss': 0.9216, 'grad_norm': 2.060124397277832, 'learning_rate': 0.00017157493447700738, 'epoch': 0.14}


 14%|█▍        | 2397/16798 [09:42<1:04:00,  3.75it/s]

{'loss': 1.1565, 'grad_norm': 1.827452540397644, 'learning_rate': 0.00017156302120562307, 'epoch': 0.14}


 14%|█▍        | 2398/16798 [09:42<1:00:51,  3.94it/s]

{'loss': 0.3769, 'grad_norm': 0.9127169251441956, 'learning_rate': 0.00017155110793423873, 'epoch': 0.14}


 14%|█▍        | 2399/16798 [09:43<58:07,  4.13it/s]  

{'loss': 0.3258, 'grad_norm': 0.9276441931724548, 'learning_rate': 0.00017153919466285442, 'epoch': 0.14}


 14%|█▍        | 2400/16798 [09:43<1:03:01,  3.81it/s]

{'loss': 0.4226, 'grad_norm': 1.0495015382766724, 'learning_rate': 0.0001715272813914701, 'epoch': 0.14}


 14%|█▍        | 2401/16798 [09:43<1:01:13,  3.92it/s]

{'loss': 1.9654, 'grad_norm': 2.0301673412323, 'learning_rate': 0.00017151536812008578, 'epoch': 0.14}


 14%|█▍        | 2402/16798 [09:43<1:02:36,  3.83it/s]

{'loss': 2.0195, 'grad_norm': 1.875480055809021, 'learning_rate': 0.00017150345484870147, 'epoch': 0.14}


 14%|█▍        | 2403/16798 [09:44<1:03:43,  3.77it/s]

{'loss': 2.1348, 'grad_norm': 2.011521577835083, 'learning_rate': 0.00017149154157731716, 'epoch': 0.14}


 14%|█▍        | 2404/16798 [09:44<1:01:22,  3.91it/s]

{'loss': 2.152, 'grad_norm': 1.9072166681289673, 'learning_rate': 0.00017147962830593282, 'epoch': 0.14}


 14%|█▍        | 2405/16798 [09:44<1:00:26,  3.97it/s]

{'loss': 2.2945, 'grad_norm': 2.1865055561065674, 'learning_rate': 0.00017146771503454851, 'epoch': 0.14}


 14%|█▍        | 2406/16798 [09:45<1:07:08,  3.57it/s]

{'loss': 1.9568, 'grad_norm': 2.6546547412872314, 'learning_rate': 0.00017145580176316418, 'epoch': 0.14}


 14%|█▍        | 2407/16798 [09:45<1:05:35,  3.66it/s]

{'loss': 1.4678, 'grad_norm': 1.4906699657440186, 'learning_rate': 0.00017144388849177987, 'epoch': 0.14}


 14%|█▍        | 2408/16798 [09:45<1:03:35,  3.77it/s]

{'loss': 1.9576, 'grad_norm': 1.659044861793518, 'learning_rate': 0.00017143197522039553, 'epoch': 0.14}


 14%|█▍        | 2409/16798 [09:45<1:04:53,  3.70it/s]

{'loss': 1.9458, 'grad_norm': 1.9356253147125244, 'learning_rate': 0.00017142006194901122, 'epoch': 0.14}


 14%|█▍        | 2410/16798 [09:46<1:06:12,  3.62it/s]

{'loss': 1.4459, 'grad_norm': 1.8505184650421143, 'learning_rate': 0.0001714081486776269, 'epoch': 0.14}


 14%|█▍        | 2411/16798 [09:46<1:04:53,  3.70it/s]

{'loss': 1.7859, 'grad_norm': 1.85873544216156, 'learning_rate': 0.00017139623540624258, 'epoch': 0.14}


 14%|█▍        | 2412/16798 [09:46<1:04:57,  3.69it/s]

{'loss': 1.4777, 'grad_norm': 1.9970817565917969, 'learning_rate': 0.00017138432213485824, 'epoch': 0.14}


 14%|█▍        | 2413/16798 [09:46<1:02:34,  3.83it/s]

{'loss': 1.998, 'grad_norm': 2.116370439529419, 'learning_rate': 0.00017137240886347393, 'epoch': 0.14}


 14%|█▍        | 2414/16798 [09:47<1:00:08,  3.99it/s]

{'loss': 1.4414, 'grad_norm': 1.7147562503814697, 'learning_rate': 0.0001713604955920896, 'epoch': 0.14}


 14%|█▍        | 2415/16798 [09:47<1:02:44,  3.82it/s]

{'loss': 1.5303, 'grad_norm': 2.205240249633789, 'learning_rate': 0.0001713485823207053, 'epoch': 0.14}


 14%|█▍        | 2416/16798 [09:47<1:03:39,  3.77it/s]

{'loss': 1.7225, 'grad_norm': 2.2804758548736572, 'learning_rate': 0.00017133666904932095, 'epoch': 0.14}


 14%|█▍        | 2417/16798 [09:47<1:00:29,  3.96it/s]

{'loss': 1.8089, 'grad_norm': 1.8701871633529663, 'learning_rate': 0.00017132475577793664, 'epoch': 0.14}


 14%|█▍        | 2418/16798 [09:48<58:49,  4.07it/s]  

{'loss': 1.6552, 'grad_norm': 1.830297827720642, 'learning_rate': 0.0001713128425065523, 'epoch': 0.14}


 14%|█▍        | 2419/16798 [09:48<1:01:29,  3.90it/s]

{'loss': 1.3063, 'grad_norm': 1.902564525604248, 'learning_rate': 0.000171300929235168, 'epoch': 0.14}


 14%|█▍        | 2420/16798 [09:48<1:01:41,  3.88it/s]

{'loss': 1.6545, 'grad_norm': 1.9893054962158203, 'learning_rate': 0.00017128901596378366, 'epoch': 0.14}


 14%|█▍        | 2421/16798 [09:48<59:51,  4.00it/s]  

{'loss': 1.7331, 'grad_norm': 1.7711633443832397, 'learning_rate': 0.00017127710269239935, 'epoch': 0.14}


 14%|█▍        | 2422/16798 [09:49<1:03:38,  3.77it/s]

{'loss': 1.4789, 'grad_norm': 1.807752013206482, 'learning_rate': 0.00017126518942101501, 'epoch': 0.14}


 14%|█▍        | 2423/16798 [09:49<1:02:02,  3.86it/s]

{'loss': 1.2641, 'grad_norm': 1.4806665182113647, 'learning_rate': 0.0001712532761496307, 'epoch': 0.14}


 14%|█▍        | 2424/16798 [09:49<1:00:05,  3.99it/s]

{'loss': 1.8855, 'grad_norm': 2.2597460746765137, 'learning_rate': 0.00017124136287824637, 'epoch': 0.14}


 14%|█▍        | 2425/16798 [09:49<57:43,  4.15it/s]  

{'loss': 1.9074, 'grad_norm': 2.155038833618164, 'learning_rate': 0.00017122944960686206, 'epoch': 0.14}


 14%|█▍        | 2426/16798 [09:50<1:01:37,  3.89it/s]

{'loss': 1.6952, 'grad_norm': 1.7883464097976685, 'learning_rate': 0.00017121753633547772, 'epoch': 0.14}


 14%|█▍        | 2427/16798 [09:50<1:03:26,  3.77it/s]

{'loss': 1.5373, 'grad_norm': 2.0909013748168945, 'learning_rate': 0.0001712056230640934, 'epoch': 0.14}


 14%|█▍        | 2428/16798 [09:50<1:00:21,  3.97it/s]

{'loss': 1.648, 'grad_norm': 1.851637601852417, 'learning_rate': 0.00017119370979270908, 'epoch': 0.14}


 14%|█▍        | 2429/16798 [09:50<1:03:33,  3.77it/s]

{'loss': 1.3938, 'grad_norm': 1.679138422012329, 'learning_rate': 0.00017118179652132477, 'epoch': 0.14}


 14%|█▍        | 2430/16798 [09:51<1:04:22,  3.72it/s]

{'loss': 1.7155, 'grad_norm': 1.8780138492584229, 'learning_rate': 0.00017116988324994043, 'epoch': 0.14}


 14%|█▍        | 2431/16798 [09:51<1:01:44,  3.88it/s]

{'loss': 1.3492, 'grad_norm': 1.8756293058395386, 'learning_rate': 0.00017115796997855612, 'epoch': 0.14}


 14%|█▍        | 2432/16798 [09:51<1:03:38,  3.76it/s]

{'loss': 1.5445, 'grad_norm': 1.8165923357009888, 'learning_rate': 0.00017114605670717179, 'epoch': 0.14}


 14%|█▍        | 2433/16798 [09:52<1:03:39,  3.76it/s]

{'loss': 1.5284, 'grad_norm': 1.9817687273025513, 'learning_rate': 0.00017113414343578748, 'epoch': 0.14}


 14%|█▍        | 2434/16798 [09:52<1:03:36,  3.76it/s]

{'loss': 1.159, 'grad_norm': 1.6498688459396362, 'learning_rate': 0.00017112223016440317, 'epoch': 0.14}


 14%|█▍        | 2435/16798 [09:52<1:04:26,  3.71it/s]

{'loss': 1.0905, 'grad_norm': 1.6059907674789429, 'learning_rate': 0.00017111031689301883, 'epoch': 0.14}


 15%|█▍        | 2436/16798 [09:52<1:03:58,  3.74it/s]

{'loss': 2.065, 'grad_norm': 2.630390167236328, 'learning_rate': 0.00017109840362163452, 'epoch': 0.15}


 15%|█▍        | 2437/16798 [09:53<1:00:45,  3.94it/s]

{'loss': 1.2015, 'grad_norm': 1.8946245908737183, 'learning_rate': 0.00017108649035025019, 'epoch': 0.15}


 15%|█▍        | 2438/16798 [09:53<1:02:59,  3.80it/s]

{'loss': 1.2836, 'grad_norm': 1.7799348831176758, 'learning_rate': 0.00017107457707886588, 'epoch': 0.15}


 15%|█▍        | 2439/16798 [09:53<1:02:50,  3.81it/s]

{'loss': 1.3656, 'grad_norm': 1.6849454641342163, 'learning_rate': 0.00017106266380748154, 'epoch': 0.15}


 15%|█▍        | 2440/16798 [09:53<1:00:10,  3.98it/s]

{'loss': 1.4561, 'grad_norm': 1.8774313926696777, 'learning_rate': 0.00017105075053609723, 'epoch': 0.15}


 15%|█▍        | 2441/16798 [09:54<1:00:08,  3.98it/s]

{'loss': 1.41, 'grad_norm': 2.1825950145721436, 'learning_rate': 0.0001710388372647129, 'epoch': 0.15}


 15%|█▍        | 2442/16798 [09:54<1:04:32,  3.71it/s]

{'loss': 1.5041, 'grad_norm': 1.9563026428222656, 'learning_rate': 0.00017102692399332859, 'epoch': 0.15}


 15%|█▍        | 2443/16798 [09:54<1:01:08,  3.91it/s]

{'loss': 1.12, 'grad_norm': 1.5803149938583374, 'learning_rate': 0.00017101501072194425, 'epoch': 0.15}


 15%|█▍        | 2444/16798 [09:54<59:04,  4.05it/s]  

{'loss': 1.2282, 'grad_norm': 1.6109485626220703, 'learning_rate': 0.00017100309745055994, 'epoch': 0.15}


 15%|█▍        | 2445/16798 [09:55<1:01:55,  3.86it/s]

{'loss': 0.8943, 'grad_norm': 1.671230435371399, 'learning_rate': 0.0001709911841791756, 'epoch': 0.15}


 15%|█▍        | 2446/16798 [09:55<1:01:19,  3.90it/s]

{'loss': 0.9764, 'grad_norm': 1.5807005167007446, 'learning_rate': 0.0001709792709077913, 'epoch': 0.15}


 15%|█▍        | 2447/16798 [09:55<58:44,  4.07it/s]  

{'loss': 0.6814, 'grad_norm': 1.2976281642913818, 'learning_rate': 0.00017096735763640696, 'epoch': 0.15}


 15%|█▍        | 2448/16798 [09:55<57:34,  4.15it/s]

{'loss': 0.2428, 'grad_norm': 0.6976056694984436, 'learning_rate': 0.00017095544436502265, 'epoch': 0.15}


 15%|█▍        | 2449/16798 [09:56<1:02:01,  3.86it/s]

{'loss': 0.2131, 'grad_norm': 0.6487521529197693, 'learning_rate': 0.0001709435310936383, 'epoch': 0.15}


 15%|█▍        | 2450/16798 [09:56<59:15,  4.04it/s]  

{'loss': 0.346, 'grad_norm': 0.9740726947784424, 'learning_rate': 0.000170931617822254, 'epoch': 0.15}


 15%|█▍        | 2451/16798 [09:56<1:01:10,  3.91it/s]

{'loss': 1.9739, 'grad_norm': 1.7779932022094727, 'learning_rate': 0.00017091970455086967, 'epoch': 0.15}


 15%|█▍        | 2452/16798 [09:56<1:03:58,  3.74it/s]

{'loss': 1.9912, 'grad_norm': 1.7596659660339355, 'learning_rate': 0.00017090779127948536, 'epoch': 0.15}


 15%|█▍        | 2453/16798 [09:57<1:00:25,  3.96it/s]

{'loss': 2.2749, 'grad_norm': 2.0728182792663574, 'learning_rate': 0.00017089587800810102, 'epoch': 0.15}


 15%|█▍        | 2454/16798 [09:57<58:08,  4.11it/s]  

{'loss': 1.5411, 'grad_norm': 1.6845494508743286, 'learning_rate': 0.0001708839647367167, 'epoch': 0.15}


 15%|█▍        | 2455/16798 [09:57<56:38,  4.22it/s]

{'loss': 2.1997, 'grad_norm': 2.1196179389953613, 'learning_rate': 0.00017087205146533238, 'epoch': 0.15}


 15%|█▍        | 2456/16798 [09:57<1:01:13,  3.90it/s]

{'loss': 2.2064, 'grad_norm': 2.2133803367614746, 'learning_rate': 0.00017086013819394807, 'epoch': 0.15}


 15%|█▍        | 2457/16798 [09:58<1:02:50,  3.80it/s]

{'loss': 2.1766, 'grad_norm': 2.300936460494995, 'learning_rate': 0.00017084822492256373, 'epoch': 0.15}


 15%|█▍        | 2458/16798 [09:58<1:01:14,  3.90it/s]

{'loss': 2.2094, 'grad_norm': 1.9885227680206299, 'learning_rate': 0.00017083631165117942, 'epoch': 0.15}


 15%|█▍        | 2459/16798 [09:58<1:04:11,  3.72it/s]

{'loss': 1.7052, 'grad_norm': 1.8120172023773193, 'learning_rate': 0.00017082439837979508, 'epoch': 0.15}


 15%|█▍        | 2460/16798 [09:58<1:01:26,  3.89it/s]

{'loss': 2.1723, 'grad_norm': 2.0389909744262695, 'learning_rate': 0.00017081248510841077, 'epoch': 0.15}


 15%|█▍        | 2461/16798 [09:59<59:39,  4.01it/s]  

{'loss': 1.461, 'grad_norm': 1.7030881643295288, 'learning_rate': 0.00017080057183702644, 'epoch': 0.15}


 15%|█▍        | 2462/16798 [09:59<1:04:39,  3.69it/s]

{'loss': 1.6661, 'grad_norm': 1.6104952096939087, 'learning_rate': 0.00017078865856564213, 'epoch': 0.15}


 15%|█▍        | 2463/16798 [09:59<1:00:21,  3.96it/s]

{'loss': 1.4903, 'grad_norm': 1.9021282196044922, 'learning_rate': 0.0001707767452942578, 'epoch': 0.15}


 15%|█▍        | 2464/16798 [09:59<57:45,  4.14it/s]  

{'loss': 2.0192, 'grad_norm': 1.9622442722320557, 'learning_rate': 0.0001707648320228735, 'epoch': 0.15}


 15%|█▍        | 2465/16798 [10:00<55:38,  4.29it/s]

{'loss': 1.5658, 'grad_norm': 1.8652656078338623, 'learning_rate': 0.00017075291875148917, 'epoch': 0.15}


 15%|█▍        | 2466/16798 [10:00<58:23,  4.09it/s]

{'loss': 1.2953, 'grad_norm': 1.5762518644332886, 'learning_rate': 0.00017074100548010487, 'epoch': 0.15}


 15%|█▍        | 2468/16798 [10:00<55:19,  4.32it/s]

{'loss': 1.4182, 'grad_norm': 1.6306160688400269, 'learning_rate': 0.00017072909220872053, 'epoch': 0.15}


 15%|█▍        | 2468/16798 [10:00<55:19,  4.32it/s]

{'loss': 1.7791, 'grad_norm': 1.868790864944458, 'learning_rate': 0.00017071717893733622, 'epoch': 0.15}


 15%|█▍        | 2469/16798 [10:01<59:37,  4.01it/s]

{'loss': 1.6083, 'grad_norm': 2.0766258239746094, 'learning_rate': 0.00017070526566595188, 'epoch': 0.15}


 15%|█▍        | 2470/16798 [10:01<1:02:56,  3.79it/s]

{'loss': 1.5104, 'grad_norm': 1.746272087097168, 'learning_rate': 0.00017069335239456757, 'epoch': 0.15}


 15%|█▍        | 2471/16798 [10:01<1:00:13,  3.97it/s]

{'loss': 1.634, 'grad_norm': 1.7396208047866821, 'learning_rate': 0.00017068143912318324, 'epoch': 0.15}


 15%|█▍        | 2472/16798 [10:01<57:13,  4.17it/s]  

{'loss': 1.3597, 'grad_norm': 1.5508241653442383, 'learning_rate': 0.00017066952585179893, 'epoch': 0.15}


 15%|█▍        | 2473/16798 [10:02<57:41,  4.14it/s]

{'loss': 1.6057, 'grad_norm': 2.355435371398926, 'learning_rate': 0.0001706576125804146, 'epoch': 0.15}


 15%|█▍        | 2474/16798 [10:02<1:00:50,  3.92it/s]

{'loss': 1.282, 'grad_norm': 1.6974382400512695, 'learning_rate': 0.00017064569930903028, 'epoch': 0.15}


 15%|█▍        | 2475/16798 [10:02<57:56,  4.12it/s]  

{'loss': 1.7398, 'grad_norm': 2.2196342945098877, 'learning_rate': 0.00017063378603764595, 'epoch': 0.15}


 15%|█▍        | 2476/16798 [10:02<59:01,  4.04it/s]

{'loss': 1.3828, 'grad_norm': 1.807466983795166, 'learning_rate': 0.00017062187276626164, 'epoch': 0.15}


 15%|█▍        | 2477/16798 [10:03<59:11,  4.03it/s]

{'loss': 1.7704, 'grad_norm': 2.0136046409606934, 'learning_rate': 0.0001706099594948773, 'epoch': 0.15}


 15%|█▍        | 2478/16798 [10:03<58:15,  4.10it/s]

{'loss': 1.3847, 'grad_norm': 1.7530838251113892, 'learning_rate': 0.000170598046223493, 'epoch': 0.15}


 15%|█▍        | 2479/16798 [10:03<1:03:12,  3.78it/s]

{'loss': 1.5743, 'grad_norm': 2.0389561653137207, 'learning_rate': 0.00017058613295210866, 'epoch': 0.15}


 15%|█▍        | 2480/16798 [10:03<1:00:23,  3.95it/s]

{'loss': 1.5013, 'grad_norm': 1.8517335653305054, 'learning_rate': 0.00017057421968072435, 'epoch': 0.15}


 15%|█▍        | 2481/16798 [10:04<58:19,  4.09it/s]  

{'loss': 1.3113, 'grad_norm': 1.709813117980957, 'learning_rate': 0.00017056230640934, 'epoch': 0.15}


 15%|█▍        | 2482/16798 [10:04<1:02:17,  3.83it/s]

{'loss': 2.0511, 'grad_norm': 2.2305445671081543, 'learning_rate': 0.0001705503931379557, 'epoch': 0.15}


 15%|█▍        | 2483/16798 [10:04<1:01:43,  3.87it/s]

{'loss': 1.397, 'grad_norm': 1.805943489074707, 'learning_rate': 0.00017053847986657136, 'epoch': 0.15}


 15%|█▍        | 2484/16798 [10:04<57:59,  4.11it/s]  

{'loss': 1.2027, 'grad_norm': 1.6039760112762451, 'learning_rate': 0.00017052656659518706, 'epoch': 0.15}


 15%|█▍        | 2485/16798 [10:05<59:38,  4.00it/s]

{'loss': 1.4751, 'grad_norm': 1.848602533340454, 'learning_rate': 0.00017051465332380272, 'epoch': 0.15}


 15%|█▍        | 2486/16798 [10:05<59:04,  4.04it/s]

{'loss': 1.6094, 'grad_norm': 2.0912559032440186, 'learning_rate': 0.0001705027400524184, 'epoch': 0.15}


 15%|█▍        | 2487/16798 [10:05<57:18,  4.16it/s]

{'loss': 1.3215, 'grad_norm': 1.8772410154342651, 'learning_rate': 0.00017049082678103407, 'epoch': 0.15}


 15%|█▍        | 2488/16798 [10:05<59:03,  4.04it/s]

{'loss': 1.4504, 'grad_norm': 1.7220180034637451, 'learning_rate': 0.00017047891350964976, 'epoch': 0.15}


 15%|█▍        | 2489/16798 [10:06<57:41,  4.13it/s]

{'loss': 1.6044, 'grad_norm': 1.8980919122695923, 'learning_rate': 0.00017046700023826543, 'epoch': 0.15}


 15%|█▍        | 2490/16798 [10:06<58:26,  4.08it/s]

{'loss': 1.3115, 'grad_norm': 1.6987533569335938, 'learning_rate': 0.00017045508696688112, 'epoch': 0.15}


 15%|█▍        | 2491/16798 [10:06<1:03:35,  3.75it/s]

{'loss': 1.1484, 'grad_norm': 1.9283610582351685, 'learning_rate': 0.00017044317369549678, 'epoch': 0.15}


 15%|█▍        | 2492/16798 [10:06<59:41,  3.99it/s]  

{'loss': 1.2728, 'grad_norm': 1.8394556045532227, 'learning_rate': 0.00017043126042411247, 'epoch': 0.15}


 15%|█▍        | 2493/16798 [10:07<56:53,  4.19it/s]

{'loss': 1.4824, 'grad_norm': 1.8988289833068848, 'learning_rate': 0.00017041934715272814, 'epoch': 0.15}


 15%|█▍        | 2494/16798 [10:07<56:53,  4.19it/s]

{'loss': 1.1199, 'grad_norm': 1.86333429813385, 'learning_rate': 0.00017040743388134383, 'epoch': 0.15}


 15%|█▍        | 2495/16798 [10:07<59:31,  4.00it/s]

{'loss': 0.8665, 'grad_norm': 1.4564759731292725, 'learning_rate': 0.00017039552060995952, 'epoch': 0.15}


 15%|█▍        | 2496/16798 [10:07<58:36,  4.07it/s]

{'loss': 1.0322, 'grad_norm': 2.300483465194702, 'learning_rate': 0.00017038360733857518, 'epoch': 0.15}


 15%|█▍        | 2497/16798 [10:08<59:13,  4.02it/s]

{'loss': 1.0659, 'grad_norm': 1.7654393911361694, 'learning_rate': 0.00017037169406719087, 'epoch': 0.15}


 15%|█▍        | 2499/16798 [10:08<58:08,  4.10it/s]  

{'loss': 0.7342, 'grad_norm': 1.3064720630645752, 'learning_rate': 0.00017035978079580654, 'epoch': 0.15}


 15%|█▍        | 2500/16798 [10:08<55:01,  4.33it/s]

{'loss': 0.9461, 'grad_norm': 1.6102005243301392, 'learning_rate': 0.00017034786752442223, 'epoch': 0.15}




{'loss': 0.2694, 'grad_norm': 0.7850995659828186, 'learning_rate': 0.0001703359542530379, 'epoch': 0.15}


 15%|█▍        | 2501/16798 [10:11<3:58:05,  1.00it/s]

{'loss': 1.8626, 'grad_norm': 1.6963766813278198, 'learning_rate': 0.00017032404098165358, 'epoch': 0.15}


 15%|█▍        | 2502/16798 [10:11<3:06:46,  1.28it/s]

{'loss': 2.2966, 'grad_norm': 2.198298692703247, 'learning_rate': 0.00017031212771026924, 'epoch': 0.15}


 15%|█▍        | 2503/16798 [10:12<2:28:50,  1.60it/s]

{'loss': 1.8641, 'grad_norm': 1.6422228813171387, 'learning_rate': 0.00017030021443888494, 'epoch': 0.15}


 15%|█▍        | 2504/16798 [10:12<2:02:18,  1.95it/s]

{'loss': 1.8035, 'grad_norm': 1.5652079582214355, 'learning_rate': 0.0001702883011675006, 'epoch': 0.15}


 15%|█▍        | 2505/16798 [10:12<1:45:21,  2.26it/s]

{'loss': 1.9022, 'grad_norm': 1.885515809059143, 'learning_rate': 0.0001702763878961163, 'epoch': 0.15}


 15%|█▍        | 2506/16798 [10:12<1:29:59,  2.65it/s]

{'loss': 2.1209, 'grad_norm': 2.0491139888763428, 'learning_rate': 0.00017026447462473195, 'epoch': 0.15}


 15%|█▍        | 2507/16798 [10:13<1:19:44,  2.99it/s]

{'loss': 2.3334, 'grad_norm': 2.215562582015991, 'learning_rate': 0.00017025256135334764, 'epoch': 0.15}


 15%|█▍        | 2508/16798 [10:13<1:16:15,  3.12it/s]

{'loss': 1.8486, 'grad_norm': 1.7666550874710083, 'learning_rate': 0.0001702406480819633, 'epoch': 0.15}


 15%|█▍        | 2509/16798 [10:13<1:11:21,  3.34it/s]

{'loss': 1.7779, 'grad_norm': 1.918928861618042, 'learning_rate': 0.000170228734810579, 'epoch': 0.15}


 15%|█▍        | 2510/16798 [10:13<1:07:38,  3.52it/s]

{'loss': 1.9882, 'grad_norm': 2.0978877544403076, 'learning_rate': 0.00017021682153919466, 'epoch': 0.15}


 15%|█▍        | 2511/16798 [10:14<1:03:46,  3.73it/s]

{'loss': 1.5399, 'grad_norm': 1.61920964717865, 'learning_rate': 0.00017020490826781035, 'epoch': 0.15}


 15%|█▍        | 2512/16798 [10:14<1:03:37,  3.74it/s]

{'loss': 1.8386, 'grad_norm': 1.7553715705871582, 'learning_rate': 0.00017019299499642602, 'epoch': 0.15}


 15%|█▍        | 2514/16798 [10:14<1:00:52,  3.91it/s]

{'loss': 1.6398, 'grad_norm': 1.8452744483947754, 'learning_rate': 0.0001701810817250417, 'epoch': 0.15}


 15%|█▍        | 2515/16798 [10:15<57:32,  4.14it/s]  

{'loss': 1.8853, 'grad_norm': 2.1091198921203613, 'learning_rate': 0.00017016916845365737, 'epoch': 0.15}


 15%|█▍        | 2515/16798 [10:15<57:32,  4.14it/s]

{'loss': 1.751, 'grad_norm': 1.7212066650390625, 'learning_rate': 0.00017015725518227306, 'epoch': 0.15}


 15%|█▍        | 2516/16798 [10:15<1:00:28,  3.94it/s]

{'loss': 1.8459, 'grad_norm': 1.7327730655670166, 'learning_rate': 0.00017014534191088873, 'epoch': 0.15}


 15%|█▍        | 2517/16798 [10:15<1:00:56,  3.91it/s]

{'loss': 1.5117, 'grad_norm': 1.6934727430343628, 'learning_rate': 0.00017013342863950442, 'epoch': 0.15}


 15%|█▍        | 2518/16798 [10:15<58:12,  4.09it/s]  

{'loss': 1.5149, 'grad_norm': 1.7551923990249634, 'learning_rate': 0.00017012151536812008, 'epoch': 0.15}


 15%|█▍        | 2519/16798 [10:16<1:01:10,  3.89it/s]

{'loss': 1.7343, 'grad_norm': 1.7689872980117798, 'learning_rate': 0.00017010960209673577, 'epoch': 0.15}


 15%|█▌        | 2521/16798 [10:16<57:03,  4.17it/s]  

{'loss': 1.5626, 'grad_norm': 1.8331553936004639, 'learning_rate': 0.00017009768882535143, 'epoch': 0.15}


 15%|█▌        | 2521/16798 [10:16<57:03,  4.17it/s]

{'loss': 1.7292, 'grad_norm': 1.7503317594528198, 'learning_rate': 0.00017008577555396713, 'epoch': 0.15}


 15%|█▌        | 2522/16798 [10:16<55:02,  4.32it/s]

{'loss': 1.7762, 'grad_norm': 1.9043325185775757, 'learning_rate': 0.0001700738622825828, 'epoch': 0.15}


 15%|█▌        | 2523/16798 [10:17<54:05,  4.40it/s]

{'loss': 1.3208, 'grad_norm': 1.6609296798706055, 'learning_rate': 0.00017006194901119848, 'epoch': 0.15}


 15%|█▌        | 2525/16798 [10:17<54:23,  4.37it/s]

{'loss': 1.8733, 'grad_norm': 1.8984453678131104, 'learning_rate': 0.00017005003573981414, 'epoch': 0.15}


 15%|█▌        | 2525/16798 [10:17<54:23,  4.37it/s]

{'loss': 1.3486, 'grad_norm': 1.723944067955017, 'learning_rate': 0.00017003812246842983, 'epoch': 0.15}


 15%|█▌        | 2526/16798 [10:17<54:07,  4.39it/s]

{'loss': 1.5201, 'grad_norm': 2.15041184425354, 'learning_rate': 0.00017002620919704553, 'epoch': 0.15}


 15%|█▌        | 2527/16798 [10:17<53:32,  4.44it/s]

{'loss': 1.5404, 'grad_norm': 1.949851393699646, 'learning_rate': 0.00017001429592566122, 'epoch': 0.15}


 15%|█▌        | 2528/16798 [10:18<55:54,  4.25it/s]

{'loss': 1.5947, 'grad_norm': 1.7953362464904785, 'learning_rate': 0.00017000238265427688, 'epoch': 0.15}


 15%|█▌        | 2529/16798 [10:18<58:11,  4.09it/s]

{'loss': 1.3541, 'grad_norm': 1.7910263538360596, 'learning_rate': 0.00016999046938289257, 'epoch': 0.15}


 15%|█▌        | 2530/16798 [10:18<57:34,  4.13it/s]

{'loss': 1.215, 'grad_norm': 1.5695286989212036, 'learning_rate': 0.00016997855611150823, 'epoch': 0.15}


 15%|█▌        | 2531/16798 [10:18<58:12,  4.08it/s]

{'loss': 1.3983, 'grad_norm': 1.9088467359542847, 'learning_rate': 0.00016996664284012392, 'epoch': 0.15}


 15%|█▌        | 2532/16798 [10:19<59:15,  4.01it/s]

{'loss': 1.3631, 'grad_norm': 1.6406184434890747, 'learning_rate': 0.0001699547295687396, 'epoch': 0.15}


 15%|█▌        | 2533/16798 [10:19<1:00:25,  3.93it/s]

{'loss': 1.3445, 'grad_norm': 1.795551061630249, 'learning_rate': 0.00016994281629735528, 'epoch': 0.15}


 15%|█▌        | 2534/16798 [10:19<57:04,  4.17it/s]  

{'loss': 1.6346, 'grad_norm': 1.812331199645996, 'learning_rate': 0.00016993090302597094, 'epoch': 0.15}


 15%|█▌        | 2536/16798 [10:20<55:31,  4.28it/s]

{'loss': 1.797, 'grad_norm': 2.0150041580200195, 'learning_rate': 0.00016991898975458663, 'epoch': 0.15}


 15%|█▌        | 2537/16798 [10:20<53:37,  4.43it/s]

{'loss': 1.144, 'grad_norm': 1.463266134262085, 'learning_rate': 0.0001699070764832023, 'epoch': 0.15}


 15%|█▌        | 2538/16798 [10:20<52:47,  4.50it/s]

{'loss': 1.4691, 'grad_norm': 1.7269169092178345, 'learning_rate': 0.000169895163211818, 'epoch': 0.15}


 15%|█▌        | 2538/16798 [10:20<52:47,  4.50it/s]

{'loss': 1.074, 'grad_norm': 1.5223474502563477, 'learning_rate': 0.00016988324994043365, 'epoch': 0.15}


 15%|█▌        | 2539/16798 [10:20<52:12,  4.55it/s]

{'loss': 1.7663, 'grad_norm': 2.3464579582214355, 'learning_rate': 0.00016987133666904934, 'epoch': 0.15}


 15%|█▌        | 2540/16798 [10:21<55:04,  4.32it/s]

{'loss': 1.4429, 'grad_norm': 1.8167319297790527, 'learning_rate': 0.000169859423397665, 'epoch': 0.15}


 15%|█▌        | 2541/16798 [10:21<57:10,  4.16it/s]

{'loss': 1.4112, 'grad_norm': 1.8792297840118408, 'learning_rate': 0.0001698475101262807, 'epoch': 0.15}


 15%|█▌        | 2542/16798 [10:21<55:40,  4.27it/s]

{'loss': 1.2413, 'grad_norm': 1.5665102005004883, 'learning_rate': 0.00016983559685489636, 'epoch': 0.15}


 15%|█▌        | 2543/16798 [10:21<58:38,  4.05it/s]

{'loss': 1.3886, 'grad_norm': 1.8215402364730835, 'learning_rate': 0.00016982368358351205, 'epoch': 0.15}


 15%|█▌        | 2544/16798 [10:22<56:10,  4.23it/s]

{'loss': 1.1873, 'grad_norm': 1.6355416774749756, 'learning_rate': 0.00016981177031212771, 'epoch': 0.15}


 15%|█▌        | 2546/16798 [10:22<55:40,  4.27it/s]

{'loss': 1.1488, 'grad_norm': 1.796395182609558, 'learning_rate': 0.0001697998570407434, 'epoch': 0.15}


 15%|█▌        | 2547/16798 [10:22<53:50,  4.41it/s]

{'loss': 0.8236, 'grad_norm': 1.7524397373199463, 'learning_rate': 0.00016978794376935907, 'epoch': 0.15}


 15%|█▌        | 2547/16798 [10:22<53:50,  4.41it/s]

{'loss': 0.8603, 'grad_norm': 1.5854476690292358, 'learning_rate': 0.00016977603049797476, 'epoch': 0.15}


 15%|█▌        | 2548/16798 [10:22<56:26,  4.21it/s]

{'loss': 0.873, 'grad_norm': 1.7078707218170166, 'learning_rate': 0.00016976411722659042, 'epoch': 0.15}


 15%|█▌        | 2550/16798 [10:23<55:07,  4.31it/s]

{'loss': 0.2482, 'grad_norm': 0.7218419909477234, 'learning_rate': 0.00016975220395520611, 'epoch': 0.15}


 15%|█▌        | 2550/16798 [10:23<55:07,  4.31it/s]

{'loss': 0.8325, 'grad_norm': 1.533548355102539, 'learning_rate': 0.00016974029068382178, 'epoch': 0.15}


 15%|█▌        | 2551/16798 [10:23<58:13,  4.08it/s]

{'loss': 2.275, 'grad_norm': 1.9510291814804077, 'learning_rate': 0.00016972837741243747, 'epoch': 0.15}


 15%|█▌        | 2552/16798 [10:23<57:00,  4.16it/s]

{'loss': 2.2673, 'grad_norm': 2.0007035732269287, 'learning_rate': 0.00016971646414105313, 'epoch': 0.15}


 15%|█▌        | 2553/16798 [10:24<1:06:59,  3.54it/s]

{'loss': 2.0939, 'grad_norm': 1.8587807416915894, 'learning_rate': 0.00016970455086966882, 'epoch': 0.15}


 15%|█▌        | 2554/16798 [10:24<1:03:28,  3.74it/s]

{'loss': 2.1887, 'grad_norm': 2.0214498043060303, 'learning_rate': 0.0001696926375982845, 'epoch': 0.15}


 15%|█▌        | 2555/16798 [10:24<1:04:12,  3.70it/s]

{'loss': 1.7869, 'grad_norm': 2.0988521575927734, 'learning_rate': 0.00016968072432690018, 'epoch': 0.15}


 15%|█▌        | 2556/16798 [10:25<1:04:06,  3.70it/s]

{'loss': 1.6543, 'grad_norm': 1.7715116739273071, 'learning_rate': 0.00016966881105551587, 'epoch': 0.15}


 15%|█▌        | 2557/16798 [10:25<1:02:03,  3.82it/s]

{'loss': 1.9625, 'grad_norm': 2.4802024364471436, 'learning_rate': 0.00016965689778413153, 'epoch': 0.15}


 15%|█▌        | 2558/16798 [10:25<1:02:42,  3.78it/s]

{'loss': 1.9524, 'grad_norm': 1.9310051202774048, 'learning_rate': 0.00016964498451274722, 'epoch': 0.15}


 15%|█▌        | 2559/16798 [10:25<1:00:17,  3.94it/s]

{'loss': 1.7408, 'grad_norm': 2.055634021759033, 'learning_rate': 0.0001696330712413629, 'epoch': 0.15}


 15%|█▌        | 2560/16798 [10:26<1:01:54,  3.83it/s]

{'loss': 1.5478, 'grad_norm': 1.5512406826019287, 'learning_rate': 0.00016962115796997858, 'epoch': 0.15}


 15%|█▌        | 2561/16798 [10:26<1:00:44,  3.91it/s]

{'loss': 1.5523, 'grad_norm': 1.8412047624588013, 'learning_rate': 0.00016960924469859424, 'epoch': 0.15}


 15%|█▌        | 2562/16798 [10:26<58:32,  4.05it/s]  

{'loss': 1.6316, 'grad_norm': 1.7205450534820557, 'learning_rate': 0.00016959733142720993, 'epoch': 0.15}


 15%|█▌        | 2563/16798 [10:26<57:13,  4.15it/s]

{'loss': 1.8524, 'grad_norm': 2.0622634887695312, 'learning_rate': 0.0001695854181558256, 'epoch': 0.15}


 15%|█▌        | 2565/16798 [10:27<1:01:06,  3.88it/s]

{'loss': 1.5242, 'grad_norm': 1.773042917251587, 'learning_rate': 0.00016957350488444129, 'epoch': 0.15}


 15%|█▌        | 2565/16798 [10:27<1:01:06,  3.88it/s]

{'loss': 1.3845, 'grad_norm': 1.6304919719696045, 'learning_rate': 0.00016956159161305695, 'epoch': 0.15}


 15%|█▌        | 2566/16798 [10:27<58:15,  4.07it/s]  

{'loss': 1.2671, 'grad_norm': 1.728836178779602, 'learning_rate': 0.00016954967834167264, 'epoch': 0.15}


 15%|█▌        | 2568/16798 [10:28<55:58,  4.24it/s]

{'loss': 1.62, 'grad_norm': 1.8187552690505981, 'learning_rate': 0.0001695377650702883, 'epoch': 0.15}


 15%|█▌        | 2568/16798 [10:28<55:58,  4.24it/s]

{'loss': 1.6388, 'grad_norm': 1.9651272296905518, 'learning_rate': 0.000169525851798904, 'epoch': 0.15}


 15%|█▌        | 2569/16798 [10:28<57:08,  4.15it/s]

{'loss': 1.7151, 'grad_norm': 1.7943952083587646, 'learning_rate': 0.00016951393852751966, 'epoch': 0.15}


 15%|█▌        | 2570/16798 [10:28<55:47,  4.25it/s]

{'loss': 1.8209, 'grad_norm': 1.883671522140503, 'learning_rate': 0.00016950202525613535, 'epoch': 0.15}


 15%|█▌        | 2571/16798 [10:28<58:48,  4.03it/s]

{'loss': 1.8194, 'grad_norm': 1.8197987079620361, 'learning_rate': 0.000169490111984751, 'epoch': 0.15}


 15%|█▌        | 2572/16798 [10:29<59:24,  3.99it/s]

{'loss': 1.6126, 'grad_norm': 2.093932867050171, 'learning_rate': 0.0001694781987133667, 'epoch': 0.15}


 15%|█▌        | 2573/16798 [10:29<59:28,  3.99it/s]

{'loss': 1.596, 'grad_norm': 1.7415701150894165, 'learning_rate': 0.00016946628544198237, 'epoch': 0.15}


 15%|█▌        | 2574/16798 [10:29<58:16,  4.07it/s]

{'loss': 1.604, 'grad_norm': 1.715579628944397, 'learning_rate': 0.00016945437217059806, 'epoch': 0.15}


 15%|█▌        | 2575/16798 [10:29<59:41,  3.97it/s]

{'loss': 1.3176, 'grad_norm': 1.709640622138977, 'learning_rate': 0.00016944245889921372, 'epoch': 0.15}


 15%|█▌        | 2576/16798 [10:30<57:32,  4.12it/s]

{'loss': 1.7821, 'grad_norm': 1.9802886247634888, 'learning_rate': 0.0001694305456278294, 'epoch': 0.15}


 15%|█▌        | 2577/16798 [10:30<57:58,  4.09it/s]

{'loss': 1.4586, 'grad_norm': 1.6926171779632568, 'learning_rate': 0.00016941863235644508, 'epoch': 0.15}


 15%|█▌        | 2578/16798 [10:30<58:56,  4.02it/s]

{'loss': 1.3796, 'grad_norm': 1.8114758729934692, 'learning_rate': 0.00016940671908506077, 'epoch': 0.15}


 15%|█▌        | 2579/16798 [10:30<1:01:56,  3.83it/s]

{'loss': 1.315, 'grad_norm': 1.6904582977294922, 'learning_rate': 0.00016939480581367643, 'epoch': 0.15}


 15%|█▌        | 2580/16798 [10:31<1:03:53,  3.71it/s]

{'loss': 1.6517, 'grad_norm': 2.0097641944885254, 'learning_rate': 0.00016938289254229212, 'epoch': 0.15}


 15%|█▌        | 2581/16798 [10:31<1:03:12,  3.75it/s]

{'loss': 1.2917, 'grad_norm': 1.892342209815979, 'learning_rate': 0.00016937097927090779, 'epoch': 0.15}


 15%|█▌        | 2582/16798 [10:31<1:01:53,  3.83it/s]

{'loss': 1.6855, 'grad_norm': 2.1313230991363525, 'learning_rate': 0.00016935906599952348, 'epoch': 0.15}


 15%|█▌        | 2583/16798 [10:32<1:07:21,  3.52it/s]

{'loss': 1.9215, 'grad_norm': 2.387718915939331, 'learning_rate': 0.00016934715272813914, 'epoch': 0.15}


 15%|█▌        | 2584/16798 [10:32<1:03:36,  3.72it/s]

{'loss': 1.342, 'grad_norm': 1.7913581132888794, 'learning_rate': 0.00016933523945675483, 'epoch': 0.15}


 15%|█▌        | 2585/16798 [10:32<1:00:48,  3.90it/s]

{'loss': 1.6359, 'grad_norm': 2.15106201171875, 'learning_rate': 0.0001693233261853705, 'epoch': 0.15}


 15%|█▌        | 2586/16798 [10:32<57:58,  4.09it/s]  

{'loss': 1.4954, 'grad_norm': 1.706733226776123, 'learning_rate': 0.00016931141291398618, 'epoch': 0.15}


 15%|█▌        | 2587/16798 [10:32<1:00:34,  3.91it/s]

{'loss': 0.9162, 'grad_norm': 1.763640284538269, 'learning_rate': 0.00016929949964260188, 'epoch': 0.15}


 15%|█▌        | 2588/16798 [10:33<1:05:42,  3.60it/s]

{'loss': 1.4562, 'grad_norm': 2.1581060886383057, 'learning_rate': 0.00016928758637121757, 'epoch': 0.15}


 15%|█▌        | 2589/16798 [10:33<1:01:37,  3.84it/s]

{'loss': 1.7641, 'grad_norm': 2.1904401779174805, 'learning_rate': 0.00016927567309983323, 'epoch': 0.15}


 15%|█▌        | 2590/16798 [10:33<58:57,  4.02it/s]  

{'loss': 1.0246, 'grad_norm': 1.525235891342163, 'learning_rate': 0.00016926375982844892, 'epoch': 0.15}


 15%|█▌        | 2591/16798 [10:34<1:01:16,  3.86it/s]

{'loss': 1.1798, 'grad_norm': 2.092677354812622, 'learning_rate': 0.00016925184655706458, 'epoch': 0.15}


 15%|█▌        | 2592/16798 [10:34<58:47,  4.03it/s]  

{'loss': 1.1133, 'grad_norm': 1.8476059436798096, 'learning_rate': 0.00016923993328568028, 'epoch': 0.15}


 15%|█▌        | 2593/16798 [10:34<59:26,  3.98it/s]

{'loss': 1.5261, 'grad_norm': 2.2603442668914795, 'learning_rate': 0.00016922802001429594, 'epoch': 0.15}


 15%|█▌        | 2594/16798 [10:34<56:43,  4.17it/s]

{'loss': 1.1749, 'grad_norm': 1.7885370254516602, 'learning_rate': 0.00016921610674291163, 'epoch': 0.15}


 15%|█▌        | 2595/16798 [10:34<55:17,  4.28it/s]

{'loss': 1.5912, 'grad_norm': 2.442237377166748, 'learning_rate': 0.0001692041934715273, 'epoch': 0.15}


 15%|█▌        | 2596/16798 [10:35<58:21,  4.06it/s]

{'loss': 0.7675, 'grad_norm': 1.5715712308883667, 'learning_rate': 0.00016919228020014298, 'epoch': 0.15}


 15%|█▌        | 2597/16798 [10:35<58:21,  4.06it/s]

{'loss': 0.654, 'grad_norm': 1.4555076360702515, 'learning_rate': 0.00016918036692875865, 'epoch': 0.15}


 15%|█▌        | 2598/16798 [10:35<57:56,  4.08it/s]

{'loss': 0.7667, 'grad_norm': 1.357960820198059, 'learning_rate': 0.00016916845365737434, 'epoch': 0.15}


 15%|█▌        | 2599/16798 [10:35<1:00:19,  3.92it/s]

{'loss': 0.5248, 'grad_norm': 1.1717605590820312, 'learning_rate': 0.00016915654038599, 'epoch': 0.15}


 15%|█▌        | 2601/16798 [10:36<58:22,  4.05it/s]  

{'loss': 0.2623, 'grad_norm': 0.8279809355735779, 'learning_rate': 0.0001691446271146057, 'epoch': 0.15}


 15%|█▌        | 2601/16798 [10:36<58:22,  4.05it/s]

{'loss': 2.0325, 'grad_norm': 1.8329741954803467, 'learning_rate': 0.00016913271384322136, 'epoch': 0.15}


 15%|█▌        | 2602/16798 [10:36<56:11,  4.21it/s]

{'loss': 1.9713, 'grad_norm': 1.7536778450012207, 'learning_rate': 0.00016912080057183705, 'epoch': 0.15}


 15%|█▌        | 2603/16798 [10:36<58:06,  4.07it/s]

{'loss': 2.0937, 'grad_norm': 1.9765459299087524, 'learning_rate': 0.0001691088873004527, 'epoch': 0.15}


 16%|█▌        | 2604/16798 [10:37<1:02:00,  3.82it/s]

{'loss': 1.8171, 'grad_norm': 1.663870930671692, 'learning_rate': 0.0001690969740290684, 'epoch': 0.16}


 16%|█▌        | 2605/16798 [10:37<1:01:34,  3.84it/s]

{'loss': 1.9798, 'grad_norm': 1.7079764604568481, 'learning_rate': 0.00016908506075768407, 'epoch': 0.16}


 16%|█▌        | 2606/16798 [10:37<1:03:08,  3.75it/s]

{'loss': 2.0256, 'grad_norm': 1.8784654140472412, 'learning_rate': 0.00016907314748629976, 'epoch': 0.16}


 16%|█▌        | 2607/16798 [10:38<1:03:30,  3.72it/s]

{'loss': 1.7375, 'grad_norm': 1.8604590892791748, 'learning_rate': 0.00016906123421491542, 'epoch': 0.16}


 16%|█▌        | 2608/16798 [10:38<1:03:10,  3.74it/s]

{'loss': 1.9643, 'grad_norm': 2.018087863922119, 'learning_rate': 0.0001690493209435311, 'epoch': 0.16}


 16%|█▌        | 2609/16798 [10:38<1:00:16,  3.92it/s]

{'loss': 1.8486, 'grad_norm': 1.9427272081375122, 'learning_rate': 0.00016903740767214677, 'epoch': 0.16}


 16%|█▌        | 2610/16798 [10:38<1:01:18,  3.86it/s]

{'loss': 1.3052, 'grad_norm': 1.584161639213562, 'learning_rate': 0.00016902549440076246, 'epoch': 0.16}


 16%|█▌        | 2611/16798 [10:39<1:01:50,  3.82it/s]

{'loss': 1.5703, 'grad_norm': 1.8602476119995117, 'learning_rate': 0.00016901358112937813, 'epoch': 0.16}


 16%|█▌        | 2612/16798 [10:39<59:28,  3.98it/s]  

{'loss': 1.5843, 'grad_norm': 1.9554928541183472, 'learning_rate': 0.00016900166785799382, 'epoch': 0.16}


 16%|█▌        | 2613/16798 [10:39<57:46,  4.09it/s]

{'loss': 1.329, 'grad_norm': 1.6263151168823242, 'learning_rate': 0.00016898975458660948, 'epoch': 0.16}


 16%|█▌        | 2614/16798 [10:39<1:01:39,  3.83it/s]

{'loss': 1.7449, 'grad_norm': 1.9887501001358032, 'learning_rate': 0.00016897784131522517, 'epoch': 0.16}


 16%|█▌        | 2615/16798 [10:40<1:01:40,  3.83it/s]

{'loss': 1.7793, 'grad_norm': 2.60199236869812, 'learning_rate': 0.00016896592804384084, 'epoch': 0.16}


 16%|█▌        | 2616/16798 [10:40<1:01:00,  3.87it/s]

{'loss': 1.6055, 'grad_norm': 1.891122817993164, 'learning_rate': 0.00016895401477245653, 'epoch': 0.16}


 16%|█▌        | 2617/16798 [10:40<58:46,  4.02it/s]  

{'loss': 2.1275, 'grad_norm': 2.2676315307617188, 'learning_rate': 0.0001689421015010722, 'epoch': 0.16}


 16%|█▌        | 2618/16798 [10:40<1:02:51,  3.76it/s]

{'loss': 1.6164, 'grad_norm': 2.0369114875793457, 'learning_rate': 0.00016893018822968788, 'epoch': 0.16}


 16%|█▌        | 2619/16798 [10:41<59:53,  3.95it/s]  

{'loss': 1.458, 'grad_norm': 1.997871994972229, 'learning_rate': 0.00016891827495830357, 'epoch': 0.16}


 16%|█▌        | 2621/16798 [10:41<55:17,  4.27it/s]

{'loss': 1.6482, 'grad_norm': 2.0128142833709717, 'learning_rate': 0.00016890636168691924, 'epoch': 0.16}


 16%|█▌        | 2621/16798 [10:41<55:17,  4.27it/s]

{'loss': 1.2035, 'grad_norm': 1.7113094329833984, 'learning_rate': 0.00016889444841553493, 'epoch': 0.16}


 16%|█▌        | 2622/16798 [10:41<55:04,  4.29it/s]

{'loss': 1.4016, 'grad_norm': 1.7880053520202637, 'learning_rate': 0.0001688825351441506, 'epoch': 0.16}


 16%|█▌        | 2623/16798 [10:42<58:23,  4.05it/s]

{'loss': 1.6234, 'grad_norm': 1.9856706857681274, 'learning_rate': 0.00016887062187276628, 'epoch': 0.16}


 16%|█▌        | 2625/16798 [10:42<54:14,  4.36it/s]

{'loss': 1.5561, 'grad_norm': 2.292165994644165, 'learning_rate': 0.00016885870860138195, 'epoch': 0.16}


 16%|█▌        | 2625/16798 [10:42<54:14,  4.36it/s]

{'loss': 1.8081, 'grad_norm': 2.021010398864746, 'learning_rate': 0.00016884679532999764, 'epoch': 0.16}


 16%|█▌        | 2626/16798 [10:42<53:36,  4.41it/s]

{'loss': 1.3853, 'grad_norm': 1.7422934770584106, 'learning_rate': 0.0001688348820586133, 'epoch': 0.16}


 16%|█▌        | 2627/16798 [10:42<57:01,  4.14it/s]

{'loss': 1.5781, 'grad_norm': 1.8386746644973755, 'learning_rate': 0.000168822968787229, 'epoch': 0.16}


 16%|█▌        | 2628/16798 [10:43<56:36,  4.17it/s]

{'loss': 0.9943, 'grad_norm': 1.5776032209396362, 'learning_rate': 0.00016881105551584465, 'epoch': 0.16}


 16%|█▌        | 2629/16798 [10:43<55:27,  4.26it/s]

{'loss': 1.3762, 'grad_norm': 1.9477289915084839, 'learning_rate': 0.00016879914224446035, 'epoch': 0.16}


 16%|█▌        | 2630/16798 [10:43<59:51,  3.94it/s]

{'loss': 0.8108, 'grad_norm': 1.4766056537628174, 'learning_rate': 0.000168787228973076, 'epoch': 0.16}


 16%|█▌        | 2631/16798 [10:44<1:02:44,  3.76it/s]

{'loss': 1.2472, 'grad_norm': 1.6446855068206787, 'learning_rate': 0.0001687753157016917, 'epoch': 0.16}


 16%|█▌        | 2632/16798 [10:44<1:00:51,  3.88it/s]

{'loss': 1.4333, 'grad_norm': 2.0181257724761963, 'learning_rate': 0.00016876340243030736, 'epoch': 0.16}


 16%|█▌        | 2633/16798 [10:44<1:01:25,  3.84it/s]

{'loss': 1.17, 'grad_norm': 1.7186236381530762, 'learning_rate': 0.00016875148915892305, 'epoch': 0.16}


 16%|█▌        | 2634/16798 [10:44<1:03:23,  3.72it/s]

{'loss': 1.0661, 'grad_norm': 1.456967830657959, 'learning_rate': 0.00016873957588753872, 'epoch': 0.16}


 16%|█▌        | 2635/16798 [10:45<59:24,  3.97it/s]  

{'loss': 1.4841, 'grad_norm': 2.238898754119873, 'learning_rate': 0.0001687276626161544, 'epoch': 0.16}


 16%|█▌        | 2636/16798 [10:45<1:00:15,  3.92it/s]

{'loss': 1.218, 'grad_norm': 2.0951666831970215, 'learning_rate': 0.00016871574934477007, 'epoch': 0.16}


 16%|█▌        | 2637/16798 [10:45<1:01:06,  3.86it/s]

{'loss': 1.1757, 'grad_norm': 1.7488447427749634, 'learning_rate': 0.00016870383607338576, 'epoch': 0.16}


 16%|█▌        | 2638/16798 [10:45<1:01:35,  3.83it/s]

{'loss': 1.0807, 'grad_norm': 1.4743969440460205, 'learning_rate': 0.00016869192280200143, 'epoch': 0.16}


 16%|█▌        | 2639/16798 [10:46<1:02:35,  3.77it/s]

{'loss': 0.9532, 'grad_norm': 1.5859074592590332, 'learning_rate': 0.00016868000953061712, 'epoch': 0.16}


 16%|█▌        | 2640/16798 [10:46<1:00:01,  3.93it/s]

{'loss': 1.0865, 'grad_norm': 1.674281120300293, 'learning_rate': 0.00016866809625923278, 'epoch': 0.16}


 16%|█▌        | 2641/16798 [10:46<1:01:19,  3.85it/s]

{'loss': 1.1768, 'grad_norm': 2.491607189178467, 'learning_rate': 0.00016865618298784847, 'epoch': 0.16}


 16%|█▌        | 2642/16798 [10:46<1:01:50,  3.81it/s]

{'loss': 0.9465, 'grad_norm': 1.5370221138000488, 'learning_rate': 0.00016864426971646414, 'epoch': 0.16}


 16%|█▌        | 2643/16798 [10:47<1:01:58,  3.81it/s]

{'loss': 1.0111, 'grad_norm': 1.6831421852111816, 'learning_rate': 0.00016863235644507983, 'epoch': 0.16}


 16%|█▌        | 2644/16798 [10:47<1:00:39,  3.89it/s]

{'loss': 1.0551, 'grad_norm': 1.7605986595153809, 'learning_rate': 0.0001686204431736955, 'epoch': 0.16}


 16%|█▌        | 2645/16798 [10:47<1:00:55,  3.87it/s]

{'loss': 0.9179, 'grad_norm': 2.184584856033325, 'learning_rate': 0.00016860852990231118, 'epoch': 0.16}


 16%|█▌        | 2646/16798 [10:47<1:00:26,  3.90it/s]

{'loss': 0.6883, 'grad_norm': 1.3965085744857788, 'learning_rate': 0.00016859661663092684, 'epoch': 0.16}


 16%|█▌        | 2647/16798 [10:48<1:02:08,  3.79it/s]

{'loss': 0.5231, 'grad_norm': 0.9977852702140808, 'learning_rate': 0.00016858470335954254, 'epoch': 0.16}


 16%|█▌        | 2648/16798 [10:48<1:03:25,  3.72it/s]

{'loss': 0.4413, 'grad_norm': 1.0683553218841553, 'learning_rate': 0.0001685727900881582, 'epoch': 0.16}


 16%|█▌        | 2649/16798 [10:48<1:03:10,  3.73it/s]

{'loss': 0.4848, 'grad_norm': 1.0883402824401855, 'learning_rate': 0.00016856087681677392, 'epoch': 0.16}


 16%|█▌        | 2650/16798 [10:48<1:01:10,  3.85it/s]

{'loss': 0.5027, 'grad_norm': 1.2076091766357422, 'learning_rate': 0.00016854896354538958, 'epoch': 0.16}


 16%|█▌        | 2651/16798 [10:49<1:05:25,  3.60it/s]

{'loss': 2.2282, 'grad_norm': 1.9901301860809326, 'learning_rate': 0.00016853705027400527, 'epoch': 0.16}


 16%|█▌        | 2652/16798 [10:49<1:01:38,  3.82it/s]

{'loss': 1.9115, 'grad_norm': 2.1694176197052, 'learning_rate': 0.00016852513700262093, 'epoch': 0.16}


 16%|█▌        | 2653/16798 [10:49<1:03:34,  3.71it/s]

{'loss': 1.9966, 'grad_norm': 1.9358251094818115, 'learning_rate': 0.00016851322373123663, 'epoch': 0.16}


 16%|█▌        | 2654/16798 [10:50<1:01:42,  3.82it/s]

{'loss': 1.8758, 'grad_norm': 1.9230732917785645, 'learning_rate': 0.0001685013104598523, 'epoch': 0.16}


 16%|█▌        | 2655/16798 [10:50<1:02:05,  3.80it/s]

{'loss': 2.3552, 'grad_norm': 2.143686294555664, 'learning_rate': 0.00016848939718846798, 'epoch': 0.16}


 16%|█▌        | 2656/16798 [10:50<1:03:17,  3.72it/s]

{'loss': 2.4281, 'grad_norm': 2.1869044303894043, 'learning_rate': 0.00016847748391708364, 'epoch': 0.16}


 16%|█▌        | 2657/16798 [10:50<1:02:24,  3.78it/s]

{'loss': 2.1482, 'grad_norm': 2.3452935218811035, 'learning_rate': 0.00016846557064569933, 'epoch': 0.16}


 16%|█▌        | 2658/16798 [10:51<1:01:43,  3.82it/s]

{'loss': 2.0825, 'grad_norm': 1.9909303188323975, 'learning_rate': 0.000168453657374315, 'epoch': 0.16}


 16%|█▌        | 2659/16798 [10:51<1:02:25,  3.78it/s]

{'loss': 1.5845, 'grad_norm': 1.7835702896118164, 'learning_rate': 0.0001684417441029307, 'epoch': 0.16}


 16%|█▌        | 2660/16798 [10:51<1:02:50,  3.75it/s]

{'loss': 1.367, 'grad_norm': 1.6254544258117676, 'learning_rate': 0.00016842983083154635, 'epoch': 0.16}


 16%|█▌        | 2661/16798 [10:51<1:00:40,  3.88it/s]

{'loss': 1.7585, 'grad_norm': 1.9826446771621704, 'learning_rate': 0.00016841791756016204, 'epoch': 0.16}


 16%|█▌        | 2662/16798 [10:52<1:00:26,  3.90it/s]

{'loss': 1.4507, 'grad_norm': 1.718650221824646, 'learning_rate': 0.0001684060042887777, 'epoch': 0.16}


 16%|█▌        | 2663/16798 [10:52<1:01:46,  3.81it/s]

{'loss': 1.4416, 'grad_norm': 1.892845630645752, 'learning_rate': 0.0001683940910173934, 'epoch': 0.16}


 16%|█▌        | 2664/16798 [10:52<1:02:04,  3.79it/s]

{'loss': 1.7749, 'grad_norm': 1.7539734840393066, 'learning_rate': 0.00016838217774600906, 'epoch': 0.16}


 16%|█▌        | 2665/16798 [10:52<1:00:02,  3.92it/s]

{'loss': 1.7322, 'grad_norm': 1.8023912906646729, 'learning_rate': 0.00016837026447462475, 'epoch': 0.16}


 16%|█▌        | 2666/16798 [10:53<1:07:34,  3.49it/s]

{'loss': 1.7541, 'grad_norm': 2.113513946533203, 'learning_rate': 0.00016835835120324042, 'epoch': 0.16}


 16%|█▌        | 2667/16798 [10:53<1:02:45,  3.75it/s]

{'loss': 1.6105, 'grad_norm': 1.8436295986175537, 'learning_rate': 0.0001683464379318561, 'epoch': 0.16}


 16%|█▌        | 2668/16798 [10:53<1:00:32,  3.89it/s]

{'loss': 2.3181, 'grad_norm': 2.2204983234405518, 'learning_rate': 0.00016833452466047177, 'epoch': 0.16}


 16%|█▌        | 2669/16798 [10:53<1:01:32,  3.83it/s]

{'loss': 1.2951, 'grad_norm': 1.8272864818572998, 'learning_rate': 0.00016832261138908746, 'epoch': 0.16}


 16%|█▌        | 2670/16798 [10:54<59:03,  3.99it/s]  

{'loss': 1.2781, 'grad_norm': 1.7078779935836792, 'learning_rate': 0.00016831069811770312, 'epoch': 0.16}


 16%|█▌        | 2671/16798 [10:54<1:01:07,  3.85it/s]

{'loss': 1.6949, 'grad_norm': 1.9375395774841309, 'learning_rate': 0.00016829878484631882, 'epoch': 0.16}


 16%|█▌        | 2672/16798 [10:54<1:04:06,  3.67it/s]

{'loss': 1.6202, 'grad_norm': 2.068007707595825, 'learning_rate': 0.00016828687157493448, 'epoch': 0.16}


 16%|█▌        | 2673/16798 [10:55<1:01:26,  3.83it/s]

{'loss': 1.5742, 'grad_norm': 1.7026156187057495, 'learning_rate': 0.00016827495830355017, 'epoch': 0.16}


 16%|█▌        | 2674/16798 [10:55<1:01:45,  3.81it/s]

{'loss': 1.2208, 'grad_norm': 1.5273488759994507, 'learning_rate': 0.00016826304503216583, 'epoch': 0.16}


 16%|█▌        | 2675/16798 [10:55<1:01:44,  3.81it/s]

{'loss': 2.0383, 'grad_norm': 2.300589084625244, 'learning_rate': 0.0001682511317607815, 'epoch': 0.16}


 16%|█▌        | 2676/16798 [10:55<59:12,  3.98it/s]  

{'loss': 1.4255, 'grad_norm': 1.5883866548538208, 'learning_rate': 0.0001682392184893972, 'epoch': 0.16}


 16%|█▌        | 2677/16798 [10:56<1:00:41,  3.88it/s]

{'loss': 1.8094, 'grad_norm': 2.3683109283447266, 'learning_rate': 0.00016822730521801285, 'epoch': 0.16}


 16%|█▌        | 2678/16798 [10:56<1:00:11,  3.91it/s]

{'loss': 1.5723, 'grad_norm': 2.0190627574920654, 'learning_rate': 0.00016821539194662854, 'epoch': 0.16}


 16%|█▌        | 2679/16798 [10:56<1:04:35,  3.64it/s]

{'loss': 1.0636, 'grad_norm': 1.6569886207580566, 'learning_rate': 0.0001682034786752442, 'epoch': 0.16}


 16%|█▌        | 2680/16798 [10:56<1:03:03,  3.73it/s]

{'loss': 1.4998, 'grad_norm': 1.9001456499099731, 'learning_rate': 0.00016819156540385992, 'epoch': 0.16}


 16%|█▌        | 2681/16798 [10:57<1:08:42,  3.42it/s]

{'loss': 1.6602, 'grad_norm': 2.2162396907806396, 'learning_rate': 0.0001681796521324756, 'epoch': 0.16}


 16%|█▌        | 2682/16798 [10:57<1:05:31,  3.59it/s]

{'loss': 1.057, 'grad_norm': 1.6653032302856445, 'learning_rate': 0.00016816773886109128, 'epoch': 0.16}


 16%|█▌        | 2683/16798 [10:57<1:07:54,  3.46it/s]

{'loss': 1.0964, 'grad_norm': 1.486751675605774, 'learning_rate': 0.00016815582558970694, 'epoch': 0.16}


 16%|█▌        | 2684/16798 [10:58<1:10:37,  3.33it/s]

{'loss': 1.3784, 'grad_norm': 1.7572572231292725, 'learning_rate': 0.00016814391231832263, 'epoch': 0.16}


 16%|█▌        | 2685/16798 [10:58<1:10:16,  3.35it/s]

{'loss': 1.6728, 'grad_norm': 1.988968014717102, 'learning_rate': 0.0001681319990469383, 'epoch': 0.16}


 16%|█▌        | 2686/16798 [10:58<1:07:36,  3.48it/s]

{'loss': 1.3205, 'grad_norm': 1.526923418045044, 'learning_rate': 0.000168120085775554, 'epoch': 0.16}


 16%|█▌        | 2687/16798 [10:58<1:07:48,  3.47it/s]

{'loss': 1.4679, 'grad_norm': 1.6399568319320679, 'learning_rate': 0.00016810817250416965, 'epoch': 0.16}


 16%|█▌        | 2688/16798 [10:59<1:07:27,  3.49it/s]

{'loss': 1.5338, 'grad_norm': 2.2176172733306885, 'learning_rate': 0.00016809625923278534, 'epoch': 0.16}


 16%|█▌        | 2689/16798 [10:59<1:04:22,  3.65it/s]

{'loss': 1.4408, 'grad_norm': 2.019796371459961, 'learning_rate': 0.000168084345961401, 'epoch': 0.16}


 16%|█▌        | 2690/16798 [10:59<1:06:14,  3.55it/s]

{'loss': 1.5936, 'grad_norm': 2.1445562839508057, 'learning_rate': 0.0001680724326900167, 'epoch': 0.16}


 16%|█▌        | 2691/16798 [11:00<1:06:12,  3.55it/s]

{'loss': 0.8083, 'grad_norm': 1.3490415811538696, 'learning_rate': 0.00016806051941863236, 'epoch': 0.16}


 16%|█▌        | 2692/16798 [11:00<1:02:26,  3.77it/s]

{'loss': 0.9191, 'grad_norm': 1.7995284795761108, 'learning_rate': 0.00016804860614724805, 'epoch': 0.16}


 16%|█▌        | 2693/16798 [11:00<1:02:47,  3.74it/s]

{'loss': 1.1874, 'grad_norm': 1.7598658800125122, 'learning_rate': 0.00016803669287586371, 'epoch': 0.16}


 16%|█▌        | 2694/16798 [11:00<1:02:59,  3.73it/s]

{'loss': 1.0415, 'grad_norm': 1.5566645860671997, 'learning_rate': 0.0001680247796044794, 'epoch': 0.16}


 16%|█▌        | 2695/16798 [11:01<1:04:19,  3.65it/s]

{'loss': 1.0497, 'grad_norm': 1.5122504234313965, 'learning_rate': 0.00016801286633309507, 'epoch': 0.16}


 16%|█▌        | 2696/16798 [11:01<1:05:25,  3.59it/s]

{'loss': 0.9846, 'grad_norm': 1.7174500226974487, 'learning_rate': 0.00016800095306171076, 'epoch': 0.16}


 16%|█▌        | 2697/16798 [11:01<1:05:01,  3.61it/s]

{'loss': 0.979, 'grad_norm': 1.5730490684509277, 'learning_rate': 0.00016798903979032642, 'epoch': 0.16}


 16%|█▌        | 2698/16798 [11:01<1:03:30,  3.70it/s]

{'loss': 0.8473, 'grad_norm': 1.4798506498336792, 'learning_rate': 0.00016797712651894211, 'epoch': 0.16}


 16%|█▌        | 2699/16798 [11:02<1:04:04,  3.67it/s]

{'loss': 0.309, 'grad_norm': 0.7957464456558228, 'learning_rate': 0.00016796521324755778, 'epoch': 0.16}


 16%|█▌        | 2700/16798 [11:02<1:04:38,  3.63it/s]

{'loss': 0.2783, 'grad_norm': 0.7927258610725403, 'learning_rate': 0.00016795329997617347, 'epoch': 0.16}


 16%|█▌        | 2701/16798 [11:02<1:02:47,  3.74it/s]

{'loss': 2.1333, 'grad_norm': 1.9171433448791504, 'learning_rate': 0.00016794138670478913, 'epoch': 0.16}


 16%|█▌        | 2702/16798 [11:03<1:01:22,  3.83it/s]

{'loss': 2.4658, 'grad_norm': 2.228609323501587, 'learning_rate': 0.00016792947343340482, 'epoch': 0.16}


 16%|█▌        | 2703/16798 [11:03<1:02:44,  3.74it/s]

{'loss': 1.7835, 'grad_norm': 1.7210187911987305, 'learning_rate': 0.00016791756016202049, 'epoch': 0.16}


 16%|█▌        | 2704/16798 [11:03<1:03:26,  3.70it/s]

{'loss': 1.909, 'grad_norm': 1.7096706628799438, 'learning_rate': 0.00016790564689063618, 'epoch': 0.16}


 16%|█▌        | 2705/16798 [11:03<1:03:31,  3.70it/s]

{'loss': 2.4069, 'grad_norm': 2.3968653678894043, 'learning_rate': 0.00016789373361925184, 'epoch': 0.16}


 16%|█▌        | 2706/16798 [11:04<1:02:11,  3.78it/s]

{'loss': 2.7324, 'grad_norm': 2.228766918182373, 'learning_rate': 0.00016788182034786753, 'epoch': 0.16}


 16%|█▌        | 2707/16798 [11:04<1:03:50,  3.68it/s]

{'loss': 1.4668, 'grad_norm': 1.5713459253311157, 'learning_rate': 0.0001678699070764832, 'epoch': 0.16}


 16%|█▌        | 2708/16798 [11:04<1:01:19,  3.83it/s]

{'loss': 1.3947, 'grad_norm': 1.634198784828186, 'learning_rate': 0.00016785799380509889, 'epoch': 0.16}


 16%|█▌        | 2709/16798 [11:04<1:00:49,  3.86it/s]

{'loss': 2.0221, 'grad_norm': 1.9680832624435425, 'learning_rate': 0.00016784608053371455, 'epoch': 0.16}


 16%|█▌        | 2710/16798 [11:05<1:04:38,  3.63it/s]

{'loss': 1.8035, 'grad_norm': 2.007338523864746, 'learning_rate': 0.00016783416726233024, 'epoch': 0.16}


 16%|█▌        | 2711/16798 [11:05<1:03:50,  3.68it/s]

{'loss': 1.7018, 'grad_norm': 2.472838878631592, 'learning_rate': 0.00016782225399094593, 'epoch': 0.16}


 16%|█▌        | 2712/16798 [11:05<1:02:21,  3.77it/s]

{'loss': 2.0775, 'grad_norm': 2.0680599212646484, 'learning_rate': 0.00016781034071956162, 'epoch': 0.16}


 16%|█▌        | 2713/16798 [11:06<1:06:32,  3.53it/s]

{'loss': 1.5676, 'grad_norm': 1.64988112449646, 'learning_rate': 0.00016779842744817729, 'epoch': 0.16}


 16%|█▌        | 2714/16798 [11:06<1:04:42,  3.63it/s]

{'loss': 1.6969, 'grad_norm': 1.9030202627182007, 'learning_rate': 0.00016778651417679298, 'epoch': 0.16}


 16%|█▌        | 2715/16798 [11:06<1:03:51,  3.68it/s]

{'loss': 1.3673, 'grad_norm': 1.8694407939910889, 'learning_rate': 0.00016777460090540864, 'epoch': 0.16}


 16%|█▌        | 2716/16798 [11:06<1:04:39,  3.63it/s]

{'loss': 1.4104, 'grad_norm': 1.6486613750457764, 'learning_rate': 0.00016776268763402433, 'epoch': 0.16}


 16%|█▌        | 2717/16798 [11:07<1:01:37,  3.81it/s]

{'loss': 1.4044, 'grad_norm': 1.6910531520843506, 'learning_rate': 0.00016775077436264, 'epoch': 0.16}


 16%|█▌        | 2718/16798 [11:07<1:01:27,  3.82it/s]

{'loss': 1.5294, 'grad_norm': 3.006110668182373, 'learning_rate': 0.00016773886109125569, 'epoch': 0.16}


 16%|█▌        | 2719/16798 [11:07<1:03:39,  3.69it/s]

{'loss': 1.8727, 'grad_norm': 1.749062180519104, 'learning_rate': 0.00016772694781987135, 'epoch': 0.16}


 16%|█▌        | 2720/16798 [11:07<1:00:55,  3.85it/s]

{'loss': 1.7722, 'grad_norm': 2.1939523220062256, 'learning_rate': 0.00016771503454848704, 'epoch': 0.16}


 16%|█▌        | 2721/16798 [11:08<1:01:17,  3.83it/s]

{'loss': 1.7596, 'grad_norm': 1.9651342630386353, 'learning_rate': 0.0001677031212771027, 'epoch': 0.16}


 16%|█▌        | 2722/16798 [11:08<1:04:31,  3.64it/s]

{'loss': 1.5799, 'grad_norm': 1.705873727798462, 'learning_rate': 0.0001676912080057184, 'epoch': 0.16}


 16%|█▌        | 2723/16798 [11:08<1:03:32,  3.69it/s]

{'loss': 1.3629, 'grad_norm': 1.8242816925048828, 'learning_rate': 0.00016767929473433406, 'epoch': 0.16}


 16%|█▌        | 2724/16798 [11:08<1:02:24,  3.76it/s]

{'loss': 1.5059, 'grad_norm': 1.811390995979309, 'learning_rate': 0.00016766738146294975, 'epoch': 0.16}


 16%|█▌        | 2725/16798 [11:09<1:03:36,  3.69it/s]

{'loss': 1.5272, 'grad_norm': 1.7887492179870605, 'learning_rate': 0.0001676554681915654, 'epoch': 0.16}


 16%|█▌        | 2726/16798 [11:09<1:02:55,  3.73it/s]

{'loss': 1.3105, 'grad_norm': 2.3179330825805664, 'learning_rate': 0.0001676435549201811, 'epoch': 0.16}


 16%|█▌        | 2727/16798 [11:09<1:01:23,  3.82it/s]

{'loss': 1.6344, 'grad_norm': 1.8277029991149902, 'learning_rate': 0.00016763164164879677, 'epoch': 0.16}


 16%|█▌        | 2728/16798 [11:10<1:03:11,  3.71it/s]

{'loss': 1.4951, 'grad_norm': 2.0701706409454346, 'learning_rate': 0.00016761972837741246, 'epoch': 0.16}


 16%|█▌        | 2729/16798 [11:10<1:03:08,  3.71it/s]

{'loss': 1.0364, 'grad_norm': 1.6674624681472778, 'learning_rate': 0.00016760781510602812, 'epoch': 0.16}


 16%|█▋        | 2730/16798 [11:10<1:00:10,  3.90it/s]

{'loss': 1.7649, 'grad_norm': 1.8785040378570557, 'learning_rate': 0.0001675959018346438, 'epoch': 0.16}


 16%|█▋        | 2731/16798 [11:10<1:07:45,  3.46it/s]

{'loss': 1.4075, 'grad_norm': 1.929243564605713, 'learning_rate': 0.00016758398856325948, 'epoch': 0.16}


 16%|█▋        | 2732/16798 [11:11<1:07:53,  3.45it/s]

{'loss': 1.3924, 'grad_norm': 1.7484419345855713, 'learning_rate': 0.00016757207529187514, 'epoch': 0.16}


 16%|█▋        | 2733/16798 [11:11<1:05:29,  3.58it/s]

{'loss': 1.7901, 'grad_norm': 2.068631887435913, 'learning_rate': 0.00016756016202049083, 'epoch': 0.16}


 16%|█▋        | 2734/16798 [11:11<1:06:25,  3.53it/s]

{'loss': 1.5291, 'grad_norm': 1.7845053672790527, 'learning_rate': 0.0001675482487491065, 'epoch': 0.16}


 16%|█▋        | 2735/16798 [11:11<1:06:18,  3.53it/s]

{'loss': 1.4125, 'grad_norm': 1.7141021490097046, 'learning_rate': 0.00016753633547772218, 'epoch': 0.16}


 16%|█▋        | 2736/16798 [11:12<1:02:31,  3.75it/s]

{'loss': 0.9847, 'grad_norm': 1.3638869524002075, 'learning_rate': 0.00016752442220633785, 'epoch': 0.16}


 16%|█▋        | 2737/16798 [11:12<58:54,  3.98it/s]  

{'loss': 1.3441, 'grad_norm': 1.7565443515777588, 'learning_rate': 0.00016751250893495354, 'epoch': 0.16}


 16%|█▋        | 2738/16798 [11:12<1:01:06,  3.83it/s]

{'loss': 1.6058, 'grad_norm': 2.227060079574585, 'learning_rate': 0.0001675005956635692, 'epoch': 0.16}


 16%|█▋        | 2739/16798 [11:12<1:02:26,  3.75it/s]

{'loss': 1.3602, 'grad_norm': 1.8285164833068848, 'learning_rate': 0.0001674886823921849, 'epoch': 0.16}


 16%|█▋        | 2740/16798 [11:13<1:00:23,  3.88it/s]

{'loss': 1.3603, 'grad_norm': 1.9985600709915161, 'learning_rate': 0.00016747676912080056, 'epoch': 0.16}


 16%|█▋        | 2741/16798 [11:13<1:02:46,  3.73it/s]

{'loss': 1.1862, 'grad_norm': 1.8346333503723145, 'learning_rate': 0.00016746485584941625, 'epoch': 0.16}


 16%|█▋        | 2742/16798 [11:13<1:01:05,  3.83it/s]

{'loss': 1.3554, 'grad_norm': 2.3076515197753906, 'learning_rate': 0.00016745294257803194, 'epoch': 0.16}


 16%|█▋        | 2743/16798 [11:13<58:52,  3.98it/s]  

{'loss': 1.2387, 'grad_norm': 1.9564259052276611, 'learning_rate': 0.00016744102930664763, 'epoch': 0.16}


 16%|█▋        | 2744/16798 [11:14<1:00:51,  3.85it/s]

{'loss': 1.3612, 'grad_norm': 2.04903507232666, 'learning_rate': 0.0001674291160352633, 'epoch': 0.16}


 16%|█▋        | 2745/16798 [11:14<58:01,  4.04it/s]  

{'loss': 0.8974, 'grad_norm': 1.5363502502441406, 'learning_rate': 0.00016741720276387898, 'epoch': 0.16}


 16%|█▋        | 2746/16798 [11:14<55:27,  4.22it/s]

{'loss': 0.8685, 'grad_norm': 1.813055396080017, 'learning_rate': 0.00016740528949249465, 'epoch': 0.16}


 16%|█▋        | 2747/16798 [11:14<55:43,  4.20it/s]

{'loss': 0.6331, 'grad_norm': 1.4121971130371094, 'learning_rate': 0.00016739337622111034, 'epoch': 0.16}


 16%|█▋        | 2748/16798 [11:15<58:22,  4.01it/s]

{'loss': 0.5275, 'grad_norm': 1.2032978534698486, 'learning_rate': 0.000167381462949726, 'epoch': 0.16}


 16%|█▋        | 2749/16798 [11:15<1:01:25,  3.81it/s]

{'loss': 0.4094, 'grad_norm': 1.1587356328964233, 'learning_rate': 0.0001673695496783417, 'epoch': 0.16}


 16%|█▋        | 2750/16798 [11:15<1:01:07,  3.83it/s]

{'loss': 0.3211, 'grad_norm': 0.9665505290031433, 'learning_rate': 0.00016735763640695736, 'epoch': 0.16}


 16%|█▋        | 2751/16798 [11:16<1:01:34,  3.80it/s]

{'loss': 2.3882, 'grad_norm': 1.8684402704238892, 'learning_rate': 0.00016734572313557305, 'epoch': 0.16}


 16%|█▋        | 2752/16798 [11:16<1:02:18,  3.76it/s]

{'loss': 2.2986, 'grad_norm': 2.052190065383911, 'learning_rate': 0.0001673338098641887, 'epoch': 0.16}


 16%|█▋        | 2753/16798 [11:16<1:06:29,  3.52it/s]

{'loss': 1.7946, 'grad_norm': 2.0028328895568848, 'learning_rate': 0.0001673218965928044, 'epoch': 0.16}


 16%|█▋        | 2754/16798 [11:16<1:04:56,  3.60it/s]

{'loss': 1.6868, 'grad_norm': 1.6592131853103638, 'learning_rate': 0.00016730998332142006, 'epoch': 0.16}


 16%|█▋        | 2755/16798 [11:17<1:04:08,  3.65it/s]

{'loss': 2.2654, 'grad_norm': 2.003474235534668, 'learning_rate': 0.00016729807005003576, 'epoch': 0.16}


 16%|█▋        | 2756/16798 [11:17<1:00:40,  3.86it/s]

{'loss': 1.9134, 'grad_norm': 2.0696425437927246, 'learning_rate': 0.00016728615677865142, 'epoch': 0.16}


 16%|█▋        | 2757/16798 [11:17<1:05:22,  3.58it/s]

{'loss': 1.9582, 'grad_norm': 2.0319721698760986, 'learning_rate': 0.0001672742435072671, 'epoch': 0.16}


 16%|█▋        | 2758/16798 [11:17<1:04:27,  3.63it/s]

{'loss': 2.0468, 'grad_norm': 2.040332078933716, 'learning_rate': 0.00016726233023588277, 'epoch': 0.16}


 16%|█▋        | 2759/16798 [11:18<1:01:24,  3.81it/s]

{'loss': 1.9821, 'grad_norm': 1.9913841485977173, 'learning_rate': 0.00016725041696449846, 'epoch': 0.16}


 16%|█▋        | 2760/16798 [11:18<1:07:09,  3.48it/s]

{'loss': 1.7041, 'grad_norm': 1.7050570249557495, 'learning_rate': 0.00016723850369311413, 'epoch': 0.16}


 16%|█▋        | 2761/16798 [11:18<1:04:47,  3.61it/s]

{'loss': 1.5241, 'grad_norm': 1.64813232421875, 'learning_rate': 0.00016722659042172982, 'epoch': 0.16}


 16%|█▋        | 2762/16798 [11:19<1:03:51,  3.66it/s]

{'loss': 1.5616, 'grad_norm': 1.8775510787963867, 'learning_rate': 0.00016721467715034548, 'epoch': 0.16}


 16%|█▋        | 2763/16798 [11:19<1:04:02,  3.65it/s]

{'loss': 1.7473, 'grad_norm': 2.062304735183716, 'learning_rate': 0.00016720276387896117, 'epoch': 0.16}


 16%|█▋        | 2764/16798 [11:19<1:03:27,  3.69it/s]

{'loss': 1.9994, 'grad_norm': 2.0998053550720215, 'learning_rate': 0.00016719085060757684, 'epoch': 0.16}


 16%|█▋        | 2765/16798 [11:19<1:00:00,  3.90it/s]

{'loss': 1.256, 'grad_norm': 1.6260796785354614, 'learning_rate': 0.00016717893733619253, 'epoch': 0.16}


 16%|█▋        | 2766/16798 [11:20<1:00:04,  3.89it/s]

{'loss': 1.565, 'grad_norm': 1.8065855503082275, 'learning_rate': 0.0001671670240648082, 'epoch': 0.16}


 16%|█▋        | 2767/16798 [11:20<1:00:29,  3.87it/s]

{'loss': 1.1696, 'grad_norm': 1.5450860261917114, 'learning_rate': 0.00016715511079342388, 'epoch': 0.16}


 16%|█▋        | 2768/16798 [11:20<58:34,  3.99it/s]  

{'loss': 1.7137, 'grad_norm': 2.010500431060791, 'learning_rate': 0.00016714319752203955, 'epoch': 0.16}


 16%|█▋        | 2769/16798 [11:20<57:45,  4.05it/s]

{'loss': 1.3927, 'grad_norm': 1.7120097875595093, 'learning_rate': 0.00016713128425065524, 'epoch': 0.16}


 16%|█▋        | 2770/16798 [11:21<1:00:25,  3.87it/s]

{'loss': 1.2355, 'grad_norm': 1.987674593925476, 'learning_rate': 0.0001671193709792709, 'epoch': 0.16}


 16%|█▋        | 2771/16798 [11:21<1:00:55,  3.84it/s]

{'loss': 1.4589, 'grad_norm': 1.8916780948638916, 'learning_rate': 0.0001671074577078866, 'epoch': 0.16}


 17%|█▋        | 2772/16798 [11:21<57:53,  4.04it/s]  

{'loss': 1.2617, 'grad_norm': 1.6939666271209717, 'learning_rate': 0.00016709554443650228, 'epoch': 0.17}


 17%|█▋        | 2773/16798 [11:21<1:04:17,  3.64it/s]

{'loss': 1.3391, 'grad_norm': 1.5586131811141968, 'learning_rate': 0.00016708363116511797, 'epoch': 0.17}


 17%|█▋        | 2774/16798 [11:22<1:02:26,  3.74it/s]

{'loss': 1.4449, 'grad_norm': 2.0485141277313232, 'learning_rate': 0.00016707171789373364, 'epoch': 0.17}


 17%|█▋        | 2775/16798 [11:22<1:01:30,  3.80it/s]

{'loss': 1.5949, 'grad_norm': 2.022156000137329, 'learning_rate': 0.00016705980462234933, 'epoch': 0.17}


 17%|█▋        | 2776/16798 [11:22<1:01:09,  3.82it/s]

{'loss': 1.3153, 'grad_norm': 1.7130075693130493, 'learning_rate': 0.000167047891350965, 'epoch': 0.17}


 17%|█▋        | 2777/16798 [11:22<1:01:37,  3.79it/s]

{'loss': 1.5292, 'grad_norm': 1.9418030977249146, 'learning_rate': 0.00016703597807958068, 'epoch': 0.17}


 17%|█▋        | 2778/16798 [11:23<1:02:27,  3.74it/s]

{'loss': 1.5168, 'grad_norm': 1.9438401460647583, 'learning_rate': 0.00016702406480819634, 'epoch': 0.17}


 17%|█▋        | 2779/16798 [11:23<1:00:04,  3.89it/s]

{'loss': 1.1078, 'grad_norm': 1.671760082244873, 'learning_rate': 0.00016701215153681204, 'epoch': 0.17}


 17%|█▋        | 2780/16798 [11:23<1:05:27,  3.57it/s]

{'loss': 2.0338, 'grad_norm': 2.286207437515259, 'learning_rate': 0.0001670002382654277, 'epoch': 0.17}


 17%|█▋        | 2781/16798 [11:24<1:01:41,  3.79it/s]

{'loss': 1.3227, 'grad_norm': 1.792303204536438, 'learning_rate': 0.0001669883249940434, 'epoch': 0.17}


 17%|█▋        | 2782/16798 [11:24<1:02:54,  3.71it/s]

{'loss': 1.047, 'grad_norm': 1.7415120601654053, 'learning_rate': 0.00016697641172265905, 'epoch': 0.17}


 17%|█▋        | 2783/16798 [11:24<1:03:53,  3.66it/s]

{'loss': 1.4146, 'grad_norm': 1.9821343421936035, 'learning_rate': 0.00016696449845127474, 'epoch': 0.17}


 17%|█▋        | 2784/16798 [11:24<1:03:03,  3.70it/s]

{'loss': 1.4664, 'grad_norm': 1.9224352836608887, 'learning_rate': 0.0001669525851798904, 'epoch': 0.17}


 17%|█▋        | 2785/16798 [11:25<1:01:44,  3.78it/s]

{'loss': 1.4058, 'grad_norm': 1.7068610191345215, 'learning_rate': 0.0001669406719085061, 'epoch': 0.17}


 17%|█▋        | 2786/16798 [11:25<1:02:16,  3.75it/s]

{'loss': 1.6645, 'grad_norm': 2.3311564922332764, 'learning_rate': 0.00016692875863712176, 'epoch': 0.17}


 17%|█▋        | 2787/16798 [11:25<1:03:03,  3.70it/s]

{'loss': 1.5475, 'grad_norm': 1.985317587852478, 'learning_rate': 0.00016691684536573745, 'epoch': 0.17}


 17%|█▋        | 2788/16798 [11:25<1:02:18,  3.75it/s]

{'loss': 1.3264, 'grad_norm': 1.7166167497634888, 'learning_rate': 0.00016690493209435312, 'epoch': 0.17}


 17%|█▋        | 2789/16798 [11:26<1:03:10,  3.70it/s]

{'loss': 0.7643, 'grad_norm': 1.3964052200317383, 'learning_rate': 0.00016689301882296878, 'epoch': 0.17}


 17%|█▋        | 2790/16798 [11:26<1:01:34,  3.79it/s]

{'loss': 1.2391, 'grad_norm': 1.586921215057373, 'learning_rate': 0.00016688110555158447, 'epoch': 0.17}


 17%|█▋        | 2791/16798 [11:26<1:00:50,  3.84it/s]

{'loss': 1.7365, 'grad_norm': 2.2604453563690186, 'learning_rate': 0.00016686919228020013, 'epoch': 0.17}


 17%|█▋        | 2792/16798 [11:26<1:02:39,  3.73it/s]

{'loss': 0.926, 'grad_norm': 1.8603863716125488, 'learning_rate': 0.00016685727900881583, 'epoch': 0.17}


 17%|█▋        | 2793/16798 [11:27<1:00:46,  3.84it/s]

{'loss': 0.9268, 'grad_norm': 1.4858897924423218, 'learning_rate': 0.0001668453657374315, 'epoch': 0.17}


 17%|█▋        | 2794/16798 [11:27<1:04:00,  3.65it/s]

{'loss': 1.2812, 'grad_norm': 1.9406237602233887, 'learning_rate': 0.00016683345246604718, 'epoch': 0.17}


 17%|█▋        | 2795/16798 [11:27<1:05:42,  3.55it/s]

{'loss': 1.1851, 'grad_norm': 1.9403010606765747, 'learning_rate': 0.00016682153919466284, 'epoch': 0.17}


 17%|█▋        | 2796/16798 [11:28<1:01:51,  3.77it/s]

{'loss': 0.8877, 'grad_norm': 1.4314855337142944, 'learning_rate': 0.00016680962592327853, 'epoch': 0.17}


 17%|█▋        | 2797/16798 [11:28<1:01:47,  3.78it/s]

{'loss': 0.7872, 'grad_norm': 1.4304882287979126, 'learning_rate': 0.0001667977126518942, 'epoch': 0.17}


 17%|█▋        | 2798/16798 [11:28<1:02:23,  3.74it/s]

{'loss': 0.6154, 'grad_norm': 1.497865080833435, 'learning_rate': 0.0001667857993805099, 'epoch': 0.17}


 17%|█▋        | 2799/16798 [11:28<59:14,  3.94it/s]  

{'loss': 0.6317, 'grad_norm': 1.2961901426315308, 'learning_rate': 0.00016677388610912555, 'epoch': 0.17}


 17%|█▋        | 2800/16798 [11:29<57:22,  4.07it/s]

{'loss': 0.4689, 'grad_norm': 1.0554131269454956, 'learning_rate': 0.00016676197283774124, 'epoch': 0.17}


 17%|█▋        | 2801/16798 [11:29<1:04:30,  3.62it/s]

{'loss': 1.9198, 'grad_norm': 1.7820789813995361, 'learning_rate': 0.0001667500595663569, 'epoch': 0.17}


 17%|█▋        | 2802/16798 [11:29<1:03:56,  3.65it/s]

{'loss': 2.4568, 'grad_norm': 2.059917688369751, 'learning_rate': 0.0001667381462949726, 'epoch': 0.17}


 17%|█▋        | 2803/16798 [11:29<1:02:49,  3.71it/s]

{'loss': 2.5111, 'grad_norm': 2.389472246170044, 'learning_rate': 0.0001667262330235883, 'epoch': 0.17}


 17%|█▋        | 2804/16798 [11:30<1:00:53,  3.83it/s]

{'loss': 1.952, 'grad_norm': 1.6633968353271484, 'learning_rate': 0.00016671431975220398, 'epoch': 0.17}


 17%|█▋        | 2805/16798 [11:30<1:01:15,  3.81it/s]

{'loss': 1.9952, 'grad_norm': 1.75186288356781, 'learning_rate': 0.00016670240648081964, 'epoch': 0.17}


 17%|█▋        | 2807/16798 [11:30<56:42,  4.11it/s]  

{'loss': 1.9644, 'grad_norm': 1.8283418416976929, 'learning_rate': 0.00016669049320943533, 'epoch': 0.17}


 17%|█▋        | 2807/16798 [11:30<56:42,  4.11it/s]

{'loss': 2.0736, 'grad_norm': 1.774269461631775, 'learning_rate': 0.000166678579938051, 'epoch': 0.17}


 17%|█▋        | 2808/16798 [11:31<58:17,  4.00it/s]

{'loss': 1.9517, 'grad_norm': 1.9339178800582886, 'learning_rate': 0.0001666666666666667, 'epoch': 0.17}


 17%|█▋        | 2810/16798 [11:31<58:56,  3.95it/s]  

{'loss': 1.9208, 'grad_norm': 1.87911057472229, 'learning_rate': 0.00016665475339528235, 'epoch': 0.17}


 17%|█▋        | 2810/16798 [11:31<58:56,  3.95it/s]

{'loss': 1.3154, 'grad_norm': 1.433346152305603, 'learning_rate': 0.00016664284012389804, 'epoch': 0.17}


 17%|█▋        | 2811/16798 [11:31<1:00:45,  3.84it/s]

{'loss': 1.5679, 'grad_norm': 1.6855731010437012, 'learning_rate': 0.0001666309268525137, 'epoch': 0.17}


 17%|█▋        | 2812/16798 [11:32<1:00:57,  3.82it/s]

{'loss': 1.8389, 'grad_norm': 1.9774738550186157, 'learning_rate': 0.0001666190135811294, 'epoch': 0.17}


 17%|█▋        | 2813/16798 [11:32<1:01:20,  3.80it/s]

{'loss': 1.437, 'grad_norm': 1.879939317703247, 'learning_rate': 0.00016660710030974506, 'epoch': 0.17}


 17%|█▋        | 2814/16798 [11:32<58:54,  3.96it/s]  

{'loss': 2.3183, 'grad_norm': 2.2770259380340576, 'learning_rate': 0.00016659518703836075, 'epoch': 0.17}


 17%|█▋        | 2815/16798 [11:33<1:00:54,  3.83it/s]

{'loss': 1.4408, 'grad_norm': 1.5199271440505981, 'learning_rate': 0.00016658327376697642, 'epoch': 0.17}


 17%|█▋        | 2816/16798 [11:33<1:03:07,  3.69it/s]

{'loss': 1.7247, 'grad_norm': 2.1409080028533936, 'learning_rate': 0.0001665713604955921, 'epoch': 0.17}


 17%|█▋        | 2817/16798 [11:33<59:22,  3.92it/s]  

{'loss': 1.6489, 'grad_norm': 1.7604765892028809, 'learning_rate': 0.00016655944722420777, 'epoch': 0.17}


 17%|█▋        | 2818/16798 [11:33<1:01:50,  3.77it/s]

{'loss': 1.4652, 'grad_norm': 1.676032543182373, 'learning_rate': 0.00016654753395282346, 'epoch': 0.17}


 17%|█▋        | 2819/16798 [11:34<1:01:22,  3.80it/s]

{'loss': 1.9823, 'grad_norm': 2.049973726272583, 'learning_rate': 0.00016653562068143912, 'epoch': 0.17}


 17%|█▋        | 2820/16798 [11:34<1:06:01,  3.53it/s]

{'loss': 1.319, 'grad_norm': 1.8793895244598389, 'learning_rate': 0.00016652370741005481, 'epoch': 0.17}


 17%|█▋        | 2821/16798 [11:34<1:05:07,  3.58it/s]

{'loss': 1.4715, 'grad_norm': 1.6925994157791138, 'learning_rate': 0.00016651179413867048, 'epoch': 0.17}


 17%|█▋        | 2822/16798 [11:34<1:01:03,  3.82it/s]

{'loss': 1.5802, 'grad_norm': 1.8133056163787842, 'learning_rate': 0.00016649988086728617, 'epoch': 0.17}


 17%|█▋        | 2823/16798 [11:35<1:01:47,  3.77it/s]

{'loss': 1.7162, 'grad_norm': 1.7441017627716064, 'learning_rate': 0.00016648796759590183, 'epoch': 0.17}


 17%|█▋        | 2825/16798 [11:35<59:15,  3.93it/s]  

{'loss': 1.6423, 'grad_norm': 1.8444803953170776, 'learning_rate': 0.00016647605432451752, 'epoch': 0.17}


 17%|█▋        | 2825/16798 [11:35<59:15,  3.93it/s]

{'loss': 1.6489, 'grad_norm': 1.813773274421692, 'learning_rate': 0.0001664641410531332, 'epoch': 0.17}


 17%|█▋        | 2826/16798 [11:35<57:35,  4.04it/s]

{'loss': 1.9085, 'grad_norm': 1.8607224225997925, 'learning_rate': 0.00016645222778174888, 'epoch': 0.17}


 17%|█▋        | 2827/16798 [11:36<1:05:31,  3.55it/s]

{'loss': 1.8281, 'grad_norm': 2.032626152038574, 'learning_rate': 0.00016644031451036454, 'epoch': 0.17}


 17%|█▋        | 2828/16798 [11:36<1:04:43,  3.60it/s]

{'loss': 2.0341, 'grad_norm': 2.1743643283843994, 'learning_rate': 0.00016642840123898023, 'epoch': 0.17}


 17%|█▋        | 2829/16798 [11:36<1:00:47,  3.83it/s]

{'loss': 1.4645, 'grad_norm': 1.6389451026916504, 'learning_rate': 0.0001664164879675959, 'epoch': 0.17}


 17%|█▋        | 2830/16798 [11:37<1:00:56,  3.82it/s]

{'loss': 1.3608, 'grad_norm': 1.6110693216323853, 'learning_rate': 0.0001664045746962116, 'epoch': 0.17}


 17%|█▋        | 2831/16798 [11:37<1:02:52,  3.70it/s]

{'loss': 1.3901, 'grad_norm': 1.720193862915039, 'learning_rate': 0.00016639266142482725, 'epoch': 0.17}


 17%|█▋        | 2832/16798 [11:37<1:00:11,  3.87it/s]

{'loss': 1.5844, 'grad_norm': 2.875703811645508, 'learning_rate': 0.00016638074815344294, 'epoch': 0.17}


 17%|█▋        | 2833/16798 [11:37<56:48,  4.10it/s]  

{'loss': 1.5332, 'grad_norm': 2.700800895690918, 'learning_rate': 0.0001663688348820586, 'epoch': 0.17}


 17%|█▋        | 2834/16798 [11:37<55:04,  4.23it/s]

{'loss': 1.5775, 'grad_norm': 2.023648500442505, 'learning_rate': 0.00016635692161067432, 'epoch': 0.17}


 17%|█▋        | 2835/16798 [11:38<59:44,  3.90it/s]

{'loss': 1.5103, 'grad_norm': 2.090024709701538, 'learning_rate': 0.00016634500833928999, 'epoch': 0.17}


 17%|█▋        | 2836/16798 [11:38<57:46,  4.03it/s]

{'loss': 1.0983, 'grad_norm': 1.7876977920532227, 'learning_rate': 0.00016633309506790568, 'epoch': 0.17}


 17%|█▋        | 2837/16798 [11:38<56:06,  4.15it/s]

{'loss': 1.2883, 'grad_norm': 1.8160498142242432, 'learning_rate': 0.00016632118179652134, 'epoch': 0.17}


 17%|█▋        | 2838/16798 [11:39<59:58,  3.88it/s]

{'loss': 1.5757, 'grad_norm': 1.889074683189392, 'learning_rate': 0.00016630926852513703, 'epoch': 0.17}


 17%|█▋        | 2839/16798 [11:39<1:01:09,  3.80it/s]

{'loss': 1.3164, 'grad_norm': 1.668131947517395, 'learning_rate': 0.0001662973552537527, 'epoch': 0.17}


 17%|█▋        | 2840/16798 [11:39<1:01:03,  3.81it/s]

{'loss': 0.8137, 'grad_norm': 1.4013878107070923, 'learning_rate': 0.00016628544198236839, 'epoch': 0.17}


 17%|█▋        | 2841/16798 [11:39<1:01:53,  3.76it/s]

{'loss': 1.5896, 'grad_norm': 1.999257206916809, 'learning_rate': 0.00016627352871098405, 'epoch': 0.17}


 17%|█▋        | 2842/16798 [11:40<1:03:11,  3.68it/s]

{'loss': 0.9281, 'grad_norm': 1.2635257244110107, 'learning_rate': 0.00016626161543959974, 'epoch': 0.17}


 17%|█▋        | 2843/16798 [11:40<1:04:07,  3.63it/s]

{'loss': 0.958, 'grad_norm': 1.6920958757400513, 'learning_rate': 0.0001662497021682154, 'epoch': 0.17}


 17%|█▋        | 2844/16798 [11:40<1:03:55,  3.64it/s]

{'loss': 1.058, 'grad_norm': 1.6382575035095215, 'learning_rate': 0.0001662377888968311, 'epoch': 0.17}


 17%|█▋        | 2845/16798 [11:40<1:06:29,  3.50it/s]

{'loss': 1.142, 'grad_norm': 1.7559230327606201, 'learning_rate': 0.00016622587562544676, 'epoch': 0.17}


 17%|█▋        | 2846/16798 [11:41<1:05:15,  3.56it/s]

{'loss': 1.1788, 'grad_norm': 1.9488422870635986, 'learning_rate': 0.00016621396235406245, 'epoch': 0.17}


 17%|█▋        | 2847/16798 [11:41<1:04:22,  3.61it/s]

{'loss': 0.6368, 'grad_norm': 1.3153327703475952, 'learning_rate': 0.0001662020490826781, 'epoch': 0.17}


 17%|█▋        | 2848/16798 [11:41<1:00:15,  3.86it/s]

{'loss': 0.4551, 'grad_norm': 0.9635153412818909, 'learning_rate': 0.00016619013581129378, 'epoch': 0.17}


 17%|█▋        | 2849/16798 [11:41<57:24,  4.05it/s]  

{'loss': 0.2446, 'grad_norm': 0.7266120314598083, 'learning_rate': 0.00016617822253990947, 'epoch': 0.17}


 17%|█▋        | 2850/16798 [11:42<1:01:26,  3.78it/s]

{'loss': 0.9734, 'grad_norm': 1.67933189868927, 'learning_rate': 0.00016616630926852513, 'epoch': 0.17}


 17%|█▋        | 2851/16798 [11:42<59:28,  3.91it/s]  

{'loss': 1.6148, 'grad_norm': 1.9101619720458984, 'learning_rate': 0.00016615439599714082, 'epoch': 0.17}


 17%|█▋        | 2852/16798 [11:42<57:40,  4.03it/s]

{'loss': 1.9551, 'grad_norm': 1.8947417736053467, 'learning_rate': 0.00016614248272575649, 'epoch': 0.17}


 17%|█▋        | 2853/16798 [11:43<1:00:58,  3.81it/s]

{'loss': 1.944, 'grad_norm': 1.7087173461914062, 'learning_rate': 0.00016613056945437218, 'epoch': 0.17}


 17%|█▋        | 2854/16798 [11:43<59:35,  3.90it/s]  

{'loss': 2.0628, 'grad_norm': 1.8375595808029175, 'learning_rate': 0.00016611865618298784, 'epoch': 0.17}


 17%|█▋        | 2855/16798 [11:43<1:01:48,  3.76it/s]

{'loss': 2.1944, 'grad_norm': 1.9321792125701904, 'learning_rate': 0.00016610674291160353, 'epoch': 0.17}


 17%|█▋        | 2856/16798 [11:43<1:02:06,  3.74it/s]

{'loss': 2.313, 'grad_norm': 2.141916275024414, 'learning_rate': 0.0001660948296402192, 'epoch': 0.17}


 17%|█▋        | 2857/16798 [11:44<1:00:33,  3.84it/s]

{'loss': 1.6783, 'grad_norm': 1.7041125297546387, 'learning_rate': 0.00016608291636883489, 'epoch': 0.17}


 17%|█▋        | 2858/16798 [11:44<1:01:09,  3.80it/s]

{'loss': 1.7958, 'grad_norm': 1.8687915802001953, 'learning_rate': 0.00016607100309745055, 'epoch': 0.17}


 17%|█▋        | 2859/16798 [11:44<1:03:21,  3.67it/s]

{'loss': 1.413, 'grad_norm': 1.620861291885376, 'learning_rate': 0.00016605908982606624, 'epoch': 0.17}


 17%|█▋        | 2860/16798 [11:44<1:02:03,  3.74it/s]

{'loss': 1.7341, 'grad_norm': 1.7199898958206177, 'learning_rate': 0.0001660471765546819, 'epoch': 0.17}


 17%|█▋        | 2861/16798 [11:45<59:06,  3.93it/s]  

{'loss': 1.6582, 'grad_norm': 1.78591787815094, 'learning_rate': 0.0001660352632832976, 'epoch': 0.17}


 17%|█▋        | 2862/16798 [11:45<58:04,  4.00it/s]

{'loss': 1.5849, 'grad_norm': 1.849849820137024, 'learning_rate': 0.00016602335001191326, 'epoch': 0.17}


 17%|█▋        | 2863/16798 [11:45<59:24,  3.91it/s]

{'loss': 1.6749, 'grad_norm': 2.5906076431274414, 'learning_rate': 0.00016601143674052895, 'epoch': 0.17}


 17%|█▋        | 2864/16798 [11:45<1:00:18,  3.85it/s]

{'loss': 1.5456, 'grad_norm': 1.7496381998062134, 'learning_rate': 0.0001659995234691446, 'epoch': 0.17}


 17%|█▋        | 2865/16798 [11:46<1:01:34,  3.77it/s]

{'loss': 1.2805, 'grad_norm': 1.6347389221191406, 'learning_rate': 0.00016598761019776033, 'epoch': 0.17}


 17%|█▋        | 2866/16798 [11:46<1:02:30,  3.71it/s]

{'loss': 1.4961, 'grad_norm': 1.845171332359314, 'learning_rate': 0.000165975696926376, 'epoch': 0.17}


 17%|█▋        | 2867/16798 [11:46<1:02:59,  3.69it/s]

{'loss': 1.8014, 'grad_norm': 1.8469510078430176, 'learning_rate': 0.00016596378365499168, 'epoch': 0.17}


 17%|█▋        | 2868/16798 [11:46<1:00:23,  3.84it/s]

{'loss': 1.3054, 'grad_norm': 1.6453900337219238, 'learning_rate': 0.00016595187038360735, 'epoch': 0.17}


 17%|█▋        | 2869/16798 [11:47<1:03:40,  3.65it/s]

{'loss': 1.1849, 'grad_norm': 1.6842114925384521, 'learning_rate': 0.00016593995711222304, 'epoch': 0.17}


 17%|█▋        | 2870/16798 [11:47<1:00:34,  3.83it/s]

{'loss': 1.4738, 'grad_norm': 1.6120715141296387, 'learning_rate': 0.0001659280438408387, 'epoch': 0.17}


 17%|█▋        | 2871/16798 [11:47<59:18,  3.91it/s]  

{'loss': 1.9393, 'grad_norm': 2.0162439346313477, 'learning_rate': 0.0001659161305694544, 'epoch': 0.17}


 17%|█▋        | 2872/16798 [11:48<1:01:45,  3.76it/s]

{'loss': 1.7726, 'grad_norm': 2.2263362407684326, 'learning_rate': 0.00016590421729807006, 'epoch': 0.17}


 17%|█▋        | 2873/16798 [11:48<1:05:37,  3.54it/s]

{'loss': 1.8632, 'grad_norm': 2.211200714111328, 'learning_rate': 0.00016589230402668575, 'epoch': 0.17}


 17%|█▋        | 2874/16798 [11:48<1:01:57,  3.75it/s]

{'loss': 1.562, 'grad_norm': 1.6225422620773315, 'learning_rate': 0.0001658803907553014, 'epoch': 0.17}


 17%|█▋        | 2875/16798 [11:48<1:02:35,  3.71it/s]

{'loss': 1.474, 'grad_norm': 1.6688852310180664, 'learning_rate': 0.0001658684774839171, 'epoch': 0.17}


 17%|█▋        | 2876/16798 [11:49<1:03:07,  3.68it/s]

{'loss': 1.6189, 'grad_norm': 1.8712286949157715, 'learning_rate': 0.00016585656421253277, 'epoch': 0.17}


 17%|█▋        | 2877/16798 [11:49<1:00:54,  3.81it/s]

{'loss': 1.5534, 'grad_norm': 1.9177008867263794, 'learning_rate': 0.00016584465094114846, 'epoch': 0.17}


 17%|█▋        | 2878/16798 [11:49<59:26,  3.90it/s]  

{'loss': 1.4603, 'grad_norm': 1.646215796470642, 'learning_rate': 0.00016583273766976412, 'epoch': 0.17}


 17%|█▋        | 2879/16798 [11:49<1:04:02,  3.62it/s]

{'loss': 1.837, 'grad_norm': 2.463240146636963, 'learning_rate': 0.0001658208243983798, 'epoch': 0.17}


 17%|█▋        | 2880/16798 [11:50<1:01:54,  3.75it/s]

{'loss': 1.8526, 'grad_norm': 2.314431667327881, 'learning_rate': 0.00016580891112699547, 'epoch': 0.17}


 17%|█▋        | 2881/16798 [11:50<1:01:22,  3.78it/s]

{'loss': 1.2403, 'grad_norm': 1.7725545167922974, 'learning_rate': 0.00016579699785561117, 'epoch': 0.17}


 17%|█▋        | 2882/16798 [11:50<1:02:31,  3.71it/s]

{'loss': 1.6638, 'grad_norm': 1.8585518598556519, 'learning_rate': 0.00016578508458422683, 'epoch': 0.17}


 17%|█▋        | 2883/16798 [11:50<59:20,  3.91it/s]  

{'loss': 1.8494, 'grad_norm': 2.006678342819214, 'learning_rate': 0.00016577317131284252, 'epoch': 0.17}


 17%|█▋        | 2884/16798 [11:51<1:03:09,  3.67it/s]

{'loss': 1.4548, 'grad_norm': 1.6880247592926025, 'learning_rate': 0.00016576125804145818, 'epoch': 0.17}


 17%|█▋        | 2885/16798 [11:51<1:03:26,  3.66it/s]

{'loss': 1.7498, 'grad_norm': 2.042510986328125, 'learning_rate': 0.00016574934477007387, 'epoch': 0.17}


 17%|█▋        | 2886/16798 [11:51<1:03:50,  3.63it/s]

{'loss': 1.4731, 'grad_norm': 1.6516036987304688, 'learning_rate': 0.00016573743149868954, 'epoch': 0.17}


 17%|█▋        | 2887/16798 [11:52<1:02:29,  3.71it/s]

{'loss': 1.3477, 'grad_norm': 1.6106208562850952, 'learning_rate': 0.00016572551822730523, 'epoch': 0.17}


 17%|█▋        | 2888/16798 [11:52<1:05:08,  3.56it/s]

{'loss': 1.3909, 'grad_norm': 2.9363744258880615, 'learning_rate': 0.0001657136049559209, 'epoch': 0.17}


 17%|█▋        | 2889/16798 [11:52<1:04:19,  3.60it/s]

{'loss': 1.5129, 'grad_norm': 1.995125651359558, 'learning_rate': 0.00016570169168453658, 'epoch': 0.17}


 17%|█▋        | 2890/16798 [11:52<1:01:48,  3.75it/s]

{'loss': 1.3306, 'grad_norm': 1.9237251281738281, 'learning_rate': 0.00016568977841315225, 'epoch': 0.17}


 17%|█▋        | 2891/16798 [11:53<1:07:14,  3.45it/s]

{'loss': 0.9553, 'grad_norm': 1.2399147748947144, 'learning_rate': 0.00016567786514176794, 'epoch': 0.17}


 17%|█▋        | 2892/16798 [11:53<1:03:54,  3.63it/s]

{'loss': 1.2312, 'grad_norm': 1.6283543109893799, 'learning_rate': 0.0001656659518703836, 'epoch': 0.17}


 17%|█▋        | 2893/16798 [11:53<1:02:50,  3.69it/s]

{'loss': 1.9835, 'grad_norm': 2.3416552543640137, 'learning_rate': 0.0001656540385989993, 'epoch': 0.17}


 17%|█▋        | 2894/16798 [11:54<1:04:52,  3.57it/s]

{'loss': 0.9136, 'grad_norm': 1.799178957939148, 'learning_rate': 0.00016564212532761496, 'epoch': 0.17}


 17%|█▋        | 2895/16798 [11:54<1:03:01,  3.68it/s]

{'loss': 1.0041, 'grad_norm': 1.5401123762130737, 'learning_rate': 0.00016563021205623065, 'epoch': 0.17}


 17%|█▋        | 2896/16798 [11:54<1:00:59,  3.80it/s]

{'loss': 1.2724, 'grad_norm': 1.8772187232971191, 'learning_rate': 0.00016561829878484634, 'epoch': 0.17}


 17%|█▋        | 2897/16798 [11:54<1:03:17,  3.66it/s]

{'loss': 0.9127, 'grad_norm': 1.3414885997772217, 'learning_rate': 0.00016560638551346203, 'epoch': 0.17}


 17%|█▋        | 2898/16798 [11:55<59:26,  3.90it/s]  

{'loss': 0.6075, 'grad_norm': 1.0285428762435913, 'learning_rate': 0.0001655944722420777, 'epoch': 0.17}


 17%|█▋        | 2899/16798 [11:55<57:41,  4.02it/s]

{'loss': 0.8899, 'grad_norm': 1.4872627258300781, 'learning_rate': 0.00016558255897069338, 'epoch': 0.17}


 17%|█▋        | 2900/16798 [11:55<59:44,  3.88it/s]

{'loss': 0.5674, 'grad_norm': 1.3537299633026123, 'learning_rate': 0.00016557064569930905, 'epoch': 0.17}


 17%|█▋        | 2901/16798 [11:55<58:21,  3.97it/s]

{'loss': 2.0118, 'grad_norm': 1.8057137727737427, 'learning_rate': 0.00016555873242792474, 'epoch': 0.17}


 17%|█▋        | 2902/16798 [11:56<57:36,  4.02it/s]

{'loss': 1.6982, 'grad_norm': 1.7750978469848633, 'learning_rate': 0.0001655468191565404, 'epoch': 0.17}


 17%|█▋        | 2903/16798 [11:56<1:02:38,  3.70it/s]

{'loss': 2.0881, 'grad_norm': 1.8486145734786987, 'learning_rate': 0.0001655349058851561, 'epoch': 0.17}


 17%|█▋        | 2904/16798 [11:56<1:01:01,  3.79it/s]

{'loss': 1.8349, 'grad_norm': 1.9441598653793335, 'learning_rate': 0.00016552299261377175, 'epoch': 0.17}


 17%|█▋        | 2905/16798 [11:56<59:34,  3.89it/s]  

{'loss': 2.0586, 'grad_norm': 1.812991976737976, 'learning_rate': 0.00016551107934238742, 'epoch': 0.17}


 17%|█▋        | 2906/16798 [11:57<1:00:56,  3.80it/s]

{'loss': 2.186, 'grad_norm': 1.8462074995040894, 'learning_rate': 0.0001654991660710031, 'epoch': 0.17}


 17%|█▋        | 2907/16798 [11:57<57:43,  4.01it/s]  

{'loss': 1.6195, 'grad_norm': 1.8676358461380005, 'learning_rate': 0.00016548725279961877, 'epoch': 0.17}


 17%|█▋        | 2908/16798 [11:57<57:59,  3.99it/s]

{'loss': 1.7625, 'grad_norm': 1.82383131980896, 'learning_rate': 0.00016547533952823446, 'epoch': 0.17}


 17%|█▋        | 2909/16798 [11:57<55:55,  4.14it/s]

{'loss': 1.6322, 'grad_norm': 1.7438783645629883, 'learning_rate': 0.00016546342625685013, 'epoch': 0.17}


 17%|█▋        | 2910/16798 [11:58<57:59,  3.99it/s]

{'loss': 1.4714, 'grad_norm': 1.6683428287506104, 'learning_rate': 0.00016545151298546582, 'epoch': 0.17}


 17%|█▋        | 2911/16798 [11:58<58:41,  3.94it/s]

{'loss': 1.6724, 'grad_norm': 1.7562669515609741, 'learning_rate': 0.00016543959971408148, 'epoch': 0.17}


 17%|█▋        | 2912/16798 [11:58<1:03:14,  3.66it/s]

{'loss': 1.6604, 'grad_norm': 1.8058792352676392, 'learning_rate': 0.00016542768644269717, 'epoch': 0.17}


 17%|█▋        | 2913/16798 [11:58<1:03:21,  3.65it/s]

{'loss': 1.4905, 'grad_norm': 1.73243248462677, 'learning_rate': 0.00016541577317131284, 'epoch': 0.17}


 17%|█▋        | 2914/16798 [11:59<1:00:50,  3.80it/s]

{'loss': 1.5309, 'grad_norm': 1.8729429244995117, 'learning_rate': 0.00016540385989992853, 'epoch': 0.17}


 17%|█▋        | 2915/16798 [11:59<1:00:47,  3.81it/s]

{'loss': 1.3735, 'grad_norm': 1.4982647895812988, 'learning_rate': 0.0001653919466285442, 'epoch': 0.17}


 17%|█▋        | 2916/16798 [11:59<1:01:22,  3.77it/s]

{'loss': 2.0001, 'grad_norm': 2.1045615673065186, 'learning_rate': 0.00016538003335715988, 'epoch': 0.17}


 17%|█▋        | 2917/16798 [11:59<59:05,  3.91it/s]  

{'loss': 1.7005, 'grad_norm': 1.8588042259216309, 'learning_rate': 0.00016536812008577554, 'epoch': 0.17}


 17%|█▋        | 2918/16798 [12:00<57:06,  4.05it/s]

{'loss': 1.5098, 'grad_norm': 1.7289048433303833, 'learning_rate': 0.00016535620681439124, 'epoch': 0.17}


 17%|█▋        | 2919/16798 [12:00<56:12,  4.11it/s]

{'loss': 1.9703, 'grad_norm': 2.316448450088501, 'learning_rate': 0.0001653442935430069, 'epoch': 0.17}


 17%|█▋        | 2920/16798 [12:00<58:46,  3.93it/s]

{'loss': 1.9169, 'grad_norm': 2.1486012935638428, 'learning_rate': 0.0001653323802716226, 'epoch': 0.17}


 17%|█▋        | 2921/16798 [12:00<57:37,  4.01it/s]

{'loss': 1.5327, 'grad_norm': 1.9183597564697266, 'learning_rate': 0.00016532046700023825, 'epoch': 0.17}


 17%|█▋        | 2922/16798 [12:01<59:30,  3.89it/s]

{'loss': 1.4735, 'grad_norm': 1.683254599571228, 'learning_rate': 0.00016530855372885394, 'epoch': 0.17}


 17%|█▋        | 2923/16798 [12:01<1:00:48,  3.80it/s]

{'loss': 1.5805, 'grad_norm': 1.6226892471313477, 'learning_rate': 0.0001652966404574696, 'epoch': 0.17}


 17%|█▋        | 2924/16798 [12:01<57:57,  3.99it/s]  

{'loss': 1.5167, 'grad_norm': 1.9741045236587524, 'learning_rate': 0.0001652847271860853, 'epoch': 0.17}


 17%|█▋        | 2925/16798 [12:01<56:21,  4.10it/s]

{'loss': 1.6817, 'grad_norm': 1.9222878217697144, 'learning_rate': 0.00016527281391470096, 'epoch': 0.17}


 17%|█▋        | 2926/16798 [12:02<54:12,  4.27it/s]

{'loss': 1.828, 'grad_norm': 1.8707404136657715, 'learning_rate': 0.00016526090064331665, 'epoch': 0.17}


 17%|█▋        | 2927/16798 [12:02<56:44,  4.07it/s]

{'loss': 1.1736, 'grad_norm': 1.5386226177215576, 'learning_rate': 0.00016524898737193234, 'epoch': 0.17}


 17%|█▋        | 2928/16798 [12:02<56:09,  4.12it/s]

{'loss': 1.4363, 'grad_norm': 1.7104885578155518, 'learning_rate': 0.00016523707410054803, 'epoch': 0.17}


 17%|█▋        | 2929/16798 [12:02<56:29,  4.09it/s]

{'loss': 1.5051, 'grad_norm': 1.7172925472259521, 'learning_rate': 0.0001652251608291637, 'epoch': 0.17}


 17%|█▋        | 2930/16798 [12:03<58:48,  3.93it/s]

{'loss': 1.3474, 'grad_norm': 1.6173604726791382, 'learning_rate': 0.0001652132475577794, 'epoch': 0.17}


 17%|█▋        | 2931/16798 [12:03<58:15,  3.97it/s]

{'loss': 1.5928, 'grad_norm': 1.6712499856948853, 'learning_rate': 0.00016520133428639505, 'epoch': 0.17}


 17%|█▋        | 2932/16798 [12:03<55:50,  4.14it/s]

{'loss': 1.2567, 'grad_norm': 2.01872181892395, 'learning_rate': 0.00016518942101501074, 'epoch': 0.17}


 17%|█▋        | 2933/16798 [12:03<59:00,  3.92it/s]

{'loss': 1.5102, 'grad_norm': 2.0234689712524414, 'learning_rate': 0.0001651775077436264, 'epoch': 0.17}


 17%|█▋        | 2934/16798 [12:04<56:17,  4.11it/s]

{'loss': 1.3206, 'grad_norm': 1.645139217376709, 'learning_rate': 0.0001651655944722421, 'epoch': 0.17}


 17%|█▋        | 2935/16798 [12:04<54:46,  4.22it/s]

{'loss': 1.4087, 'grad_norm': 2.021268129348755, 'learning_rate': 0.00016515368120085776, 'epoch': 0.17}


 17%|█▋        | 2936/16798 [12:04<54:34,  4.23it/s]

{'loss': 1.2817, 'grad_norm': 1.687214732170105, 'learning_rate': 0.00016514176792947345, 'epoch': 0.17}


 17%|█▋        | 2938/16798 [12:05<56:36,  4.08it/s]  

{'loss': 1.3378, 'grad_norm': 1.7683402299880981, 'learning_rate': 0.00016512985465808912, 'epoch': 0.17}


 17%|█▋        | 2938/16798 [12:05<56:36,  4.08it/s]

{'loss': 1.5069, 'grad_norm': 1.8376911878585815, 'learning_rate': 0.0001651179413867048, 'epoch': 0.17}


 17%|█▋        | 2939/16798 [12:05<54:41,  4.22it/s]

{'loss': 1.3307, 'grad_norm': 1.7842789888381958, 'learning_rate': 0.00016510602811532047, 'epoch': 0.17}


 18%|█▊        | 2940/16798 [12:05<53:40,  4.30it/s]

{'loss': 1.4156, 'grad_norm': 1.8700239658355713, 'learning_rate': 0.00016509411484393616, 'epoch': 0.18}


 18%|█▊        | 2941/16798 [12:05<52:43,  4.38it/s]

{'loss': 1.0081, 'grad_norm': 1.6260980367660522, 'learning_rate': 0.00016508220157255182, 'epoch': 0.18}


 18%|█▊        | 2942/16798 [12:06<55:28,  4.16it/s]

{'loss': 0.9625, 'grad_norm': 1.6990809440612793, 'learning_rate': 0.00016507028830116752, 'epoch': 0.18}


 18%|█▊        | 2943/16798 [12:06<54:19,  4.25it/s]

{'loss': 1.2693, 'grad_norm': 1.7167807817459106, 'learning_rate': 0.00016505837502978318, 'epoch': 0.18}


 18%|█▊        | 2944/16798 [12:06<54:50,  4.21it/s]

{'loss': 0.8292, 'grad_norm': 1.7234989404678345, 'learning_rate': 0.00016504646175839887, 'epoch': 0.18}


 18%|█▊        | 2945/16798 [12:06<57:01,  4.05it/s]

{'loss': 1.0885, 'grad_norm': 1.9032536745071411, 'learning_rate': 0.00016503454848701453, 'epoch': 0.18}


 18%|█▊        | 2946/16798 [12:07<1:00:33,  3.81it/s]

{'loss': 1.0666, 'grad_norm': 1.8087130784988403, 'learning_rate': 0.00016502263521563022, 'epoch': 0.18}


 18%|█▊        | 2947/16798 [12:07<59:35,  3.87it/s]  

{'loss': 0.832, 'grad_norm': 1.5358902215957642, 'learning_rate': 0.0001650107219442459, 'epoch': 0.18}


 18%|█▊        | 2948/16798 [12:07<57:59,  3.98it/s]

{'loss': 0.3907, 'grad_norm': 0.8957539796829224, 'learning_rate': 0.00016499880867286158, 'epoch': 0.18}


 18%|█▊        | 2949/16798 [12:07<59:39,  3.87it/s]

{'loss': 0.2014, 'grad_norm': 0.7146071791648865, 'learning_rate': 0.00016498689540147724, 'epoch': 0.18}


 18%|█▊        | 2950/16798 [12:08<56:08,  4.11it/s]

{'loss': 0.6565, 'grad_norm': 1.3843998908996582, 'learning_rate': 0.00016497498213009293, 'epoch': 0.18}


 18%|█▊        | 2951/16798 [12:08<57:12,  4.03it/s]

{'loss': 2.0823, 'grad_norm': 1.911028265953064, 'learning_rate': 0.0001649630688587086, 'epoch': 0.18}


 18%|█▊        | 2952/16798 [12:08<1:00:36,  3.81it/s]

{'loss': 1.9858, 'grad_norm': 2.014359951019287, 'learning_rate': 0.0001649511555873243, 'epoch': 0.18}


 18%|█▊        | 2953/16798 [12:08<1:01:47,  3.73it/s]

{'loss': 2.0717, 'grad_norm': 1.9475525617599487, 'learning_rate': 0.00016493924231593995, 'epoch': 0.18}


 18%|█▊        | 2954/16798 [12:09<58:21,  3.95it/s]  

{'loss': 1.6268, 'grad_norm': 1.9223213195800781, 'learning_rate': 0.00016492732904455564, 'epoch': 0.18}


 18%|█▊        | 2955/16798 [12:09<56:52,  4.06it/s]

{'loss': 1.7433, 'grad_norm': 2.0434088706970215, 'learning_rate': 0.0001649154157731713, 'epoch': 0.18}


 18%|█▊        | 2956/16798 [12:09<57:30,  4.01it/s]

{'loss': 2.023, 'grad_norm': 2.3333663940429688, 'learning_rate': 0.000164903502501787, 'epoch': 0.18}


 18%|█▊        | 2957/16798 [12:09<56:38,  4.07it/s]

{'loss': 1.5589, 'grad_norm': 1.7835639715194702, 'learning_rate': 0.00016489158923040266, 'epoch': 0.18}


 18%|█▊        | 2958/16798 [12:10<59:13,  3.89it/s]

{'loss': 1.4809, 'grad_norm': 1.7339226007461548, 'learning_rate': 0.00016487967595901838, 'epoch': 0.18}


 18%|█▊        | 2959/16798 [12:10<1:00:35,  3.81it/s]

{'loss': 1.6449, 'grad_norm': 1.6907763481140137, 'learning_rate': 0.00016486776268763404, 'epoch': 0.18}


 18%|█▊        | 2960/16798 [12:10<1:03:45,  3.62it/s]

{'loss': 1.6763, 'grad_norm': 1.8173812627792358, 'learning_rate': 0.00016485584941624973, 'epoch': 0.18}


 18%|█▊        | 2961/16798 [12:10<1:00:54,  3.79it/s]

{'loss': 1.7747, 'grad_norm': 1.7788316011428833, 'learning_rate': 0.0001648439361448654, 'epoch': 0.18}


 18%|█▊        | 2962/16798 [12:11<1:01:54,  3.73it/s]

{'loss': 1.7569, 'grad_norm': 2.2039434909820557, 'learning_rate': 0.00016483202287348106, 'epoch': 0.18}


 18%|█▊        | 2963/16798 [12:11<1:02:49,  3.67it/s]

{'loss': 1.7446, 'grad_norm': 1.8570715188980103, 'learning_rate': 0.00016482010960209675, 'epoch': 0.18}


 18%|█▊        | 2964/16798 [12:11<1:00:35,  3.81it/s]

{'loss': 1.9568, 'grad_norm': 2.0986721515655518, 'learning_rate': 0.00016480819633071241, 'epoch': 0.18}


 18%|█▊        | 2965/16798 [12:11<1:00:57,  3.78it/s]

{'loss': 1.404, 'grad_norm': 1.6332893371582031, 'learning_rate': 0.0001647962830593281, 'epoch': 0.18}


 18%|█▊        | 2966/16798 [12:12<57:48,  3.99it/s]  

{'loss': 1.8156, 'grad_norm': 2.1769936084747314, 'learning_rate': 0.00016478436978794377, 'epoch': 0.18}


 18%|█▊        | 2967/16798 [12:12<56:52,  4.05it/s]

{'loss': 1.6623, 'grad_norm': 1.6364690065383911, 'learning_rate': 0.00016477245651655946, 'epoch': 0.18}


 18%|█▊        | 2968/16798 [12:12<57:51,  3.98it/s]

{'loss': 2.0085, 'grad_norm': 2.336151123046875, 'learning_rate': 0.00016476054324517512, 'epoch': 0.18}


 18%|█▊        | 2969/16798 [12:12<56:36,  4.07it/s]

{'loss': 1.6624, 'grad_norm': 2.223605155944824, 'learning_rate': 0.00016474862997379081, 'epoch': 0.18}


 18%|█▊        | 2970/16798 [12:13<57:41,  3.99it/s]

{'loss': 1.9581, 'grad_norm': 1.956197738647461, 'learning_rate': 0.00016473671670240648, 'epoch': 0.18}


 18%|█▊        | 2971/16798 [12:13<58:47,  3.92it/s]

{'loss': 1.8968, 'grad_norm': 2.318675994873047, 'learning_rate': 0.00016472480343102217, 'epoch': 0.18}


 18%|█▊        | 2973/16798 [12:13<57:11,  4.03it/s]  

{'loss': 1.5175, 'grad_norm': 1.9500566720962524, 'learning_rate': 0.00016471289015963783, 'epoch': 0.18}


 18%|█▊        | 2973/16798 [12:13<57:11,  4.03it/s]

{'loss': 1.5609, 'grad_norm': 1.9364571571350098, 'learning_rate': 0.00016470097688825352, 'epoch': 0.18}


 18%|█▊        | 2974/16798 [12:14<54:45,  4.21it/s]

{'loss': 1.5374, 'grad_norm': 1.7337912321090698, 'learning_rate': 0.0001646890636168692, 'epoch': 0.18}


 18%|█▊        | 2975/16798 [12:14<57:53,  3.98it/s]

{'loss': 1.3548, 'grad_norm': 1.7843834161758423, 'learning_rate': 0.00016467715034548488, 'epoch': 0.18}


 18%|█▊        | 2976/16798 [12:14<55:08,  4.18it/s]

{'loss': 1.7981, 'grad_norm': 2.238046407699585, 'learning_rate': 0.00016466523707410054, 'epoch': 0.18}


 18%|█▊        | 2977/16798 [12:14<58:26,  3.94it/s]

{'loss': 1.3164, 'grad_norm': 1.5324996709823608, 'learning_rate': 0.00016465332380271623, 'epoch': 0.18}


 18%|█▊        | 2978/16798 [12:15<1:00:32,  3.80it/s]

{'loss': 1.801, 'grad_norm': 1.918225884437561, 'learning_rate': 0.0001646414105313319, 'epoch': 0.18}


 18%|█▊        | 2979/16798 [12:15<58:33,  3.93it/s]  

{'loss': 1.7249, 'grad_norm': 2.1414337158203125, 'learning_rate': 0.00016462949725994759, 'epoch': 0.18}


 18%|█▊        | 2980/16798 [12:15<58:42,  3.92it/s]

{'loss': 1.3746, 'grad_norm': 2.8723342418670654, 'learning_rate': 0.00016461758398856325, 'epoch': 0.18}


 18%|█▊        | 2981/16798 [12:16<1:01:32,  3.74it/s]

{'loss': 1.4426, 'grad_norm': 1.9589825868606567, 'learning_rate': 0.00016460567071717894, 'epoch': 0.18}


 18%|█▊        | 2982/16798 [12:16<1:02:53,  3.66it/s]

{'loss': 1.0757, 'grad_norm': 1.7364885807037354, 'learning_rate': 0.0001645937574457946, 'epoch': 0.18}


 18%|█▊        | 2983/16798 [12:16<1:00:03,  3.83it/s]

{'loss': 1.4083, 'grad_norm': 1.8845771551132202, 'learning_rate': 0.0001645818441744103, 'epoch': 0.18}


 18%|█▊        | 2984/16798 [12:16<1:00:37,  3.80it/s]

{'loss': 1.3872, 'grad_norm': 1.7226531505584717, 'learning_rate': 0.00016456993090302596, 'epoch': 0.18}


 18%|█▊        | 2985/16798 [12:17<59:31,  3.87it/s]  

{'loss': 1.8873, 'grad_norm': 2.492859363555908, 'learning_rate': 0.00016455801763164165, 'epoch': 0.18}


 18%|█▊        | 2986/16798 [12:17<58:27,  3.94it/s]

{'loss': 1.6066, 'grad_norm': 1.9764044284820557, 'learning_rate': 0.0001645461043602573, 'epoch': 0.18}


 18%|█▊        | 2987/16798 [12:17<1:00:38,  3.80it/s]

{'loss': 1.2701, 'grad_norm': 1.7789137363433838, 'learning_rate': 0.000164534191088873, 'epoch': 0.18}


 18%|█▊        | 2988/16798 [12:17<59:07,  3.89it/s]  

{'loss': 1.3258, 'grad_norm': 2.0304248332977295, 'learning_rate': 0.00016452227781748867, 'epoch': 0.18}


 18%|█▊        | 2989/16798 [12:18<55:44,  4.13it/s]

{'loss': 1.2152, 'grad_norm': 1.8258016109466553, 'learning_rate': 0.00016451036454610439, 'epoch': 0.18}


 18%|█▊        | 2990/16798 [12:18<59:17,  3.88it/s]

{'loss': 1.4242, 'grad_norm': 2.292860746383667, 'learning_rate': 0.00016449845127472005, 'epoch': 0.18}


 18%|█▊        | 2991/16798 [12:18<58:48,  3.91it/s]

{'loss': 1.2094, 'grad_norm': 1.6388698816299438, 'learning_rate': 0.00016448653800333574, 'epoch': 0.18}


 18%|█▊        | 2992/16798 [12:18<57:51,  3.98it/s]

{'loss': 1.5387, 'grad_norm': 1.774441123008728, 'learning_rate': 0.0001644746247319514, 'epoch': 0.18}


 18%|█▊        | 2993/16798 [12:19<56:07,  4.10it/s]

{'loss': 1.959, 'grad_norm': 2.475236654281616, 'learning_rate': 0.0001644627114605671, 'epoch': 0.18}


 18%|█▊        | 2994/16798 [12:19<59:21,  3.88it/s]

{'loss': 0.9294, 'grad_norm': 1.570186734199524, 'learning_rate': 0.00016445079818918276, 'epoch': 0.18}


 18%|█▊        | 2995/16798 [12:19<57:04,  4.03it/s]

{'loss': 1.4259, 'grad_norm': 2.257788896560669, 'learning_rate': 0.00016443888491779845, 'epoch': 0.18}


 18%|█▊        | 2996/16798 [12:19<55:58,  4.11it/s]

{'loss': 0.945, 'grad_norm': 1.6631590127944946, 'learning_rate': 0.0001644269716464141, 'epoch': 0.18}


 18%|█▊        | 2997/16798 [12:20<55:33,  4.14it/s]

{'loss': 1.2032, 'grad_norm': 1.9297611713409424, 'learning_rate': 0.0001644150583750298, 'epoch': 0.18}


 18%|█▊        | 2998/16798 [12:20<59:49,  3.84it/s]

{'loss': 0.6568, 'grad_norm': 1.6342648267745972, 'learning_rate': 0.00016440314510364547, 'epoch': 0.18}


 18%|█▊        | 2999/16798 [12:20<57:40,  3.99it/s]

{'loss': 0.8779, 'grad_norm': 1.4847978353500366, 'learning_rate': 0.00016439123183226116, 'epoch': 0.18}




{'loss': 0.9096, 'grad_norm': 1.6472500562667847, 'learning_rate': 0.00016437931856087682, 'epoch': 0.18}


 18%|█▊        | 3001/16798 [12:23<3:59:16,  1.04s/it]

{'loss': 2.053, 'grad_norm': 1.8791279792785645, 'learning_rate': 0.0001643674052894925, 'epoch': 0.18}


 18%|█▊        | 3002/16798 [12:23<3:06:16,  1.23it/s]

{'loss': 2.0453, 'grad_norm': 1.9552043676376343, 'learning_rate': 0.00016435549201810818, 'epoch': 0.18}


 18%|█▊        | 3003/16798 [12:24<2:31:29,  1.52it/s]

{'loss': 1.7618, 'grad_norm': 1.8530153036117554, 'learning_rate': 0.00016434357874672387, 'epoch': 0.18}


 18%|█▊        | 3004/16798 [12:24<2:03:50,  1.86it/s]

{'loss': 1.7018, 'grad_norm': 2.084850311279297, 'learning_rate': 0.00016433166547533953, 'epoch': 0.18}


 18%|█▊        | 3005/16798 [12:24<1:42:33,  2.24it/s]

{'loss': 1.8906, 'grad_norm': 1.78959059715271, 'learning_rate': 0.00016431975220395522, 'epoch': 0.18}


 18%|█▊        | 3006/16798 [12:25<1:34:26,  2.43it/s]

{'loss': 1.9711, 'grad_norm': 1.9904823303222656, 'learning_rate': 0.00016430783893257088, 'epoch': 0.18}


 18%|█▊        | 3007/16798 [12:25<1:25:51,  2.68it/s]

{'loss': 2.3454, 'grad_norm': 2.110506772994995, 'learning_rate': 0.00016429592566118658, 'epoch': 0.18}


 18%|█▊        | 3008/16798 [12:25<1:17:34,  2.96it/s]

{'loss': 1.9476, 'grad_norm': 2.046208381652832, 'learning_rate': 0.00016428401238980224, 'epoch': 0.18}


 18%|█▊        | 3009/16798 [12:25<1:15:42,  3.04it/s]

{'loss': 1.5799, 'grad_norm': 1.793199062347412, 'learning_rate': 0.00016427209911841793, 'epoch': 0.18}


 18%|█▊        | 3010/16798 [12:26<1:12:24,  3.17it/s]

{'loss': 2.2617, 'grad_norm': 2.1324994564056396, 'learning_rate': 0.0001642601858470336, 'epoch': 0.18}


 18%|█▊        | 3011/16798 [12:26<1:07:55,  3.38it/s]

{'loss': 2.002, 'grad_norm': 2.30334734916687, 'learning_rate': 0.00016424827257564928, 'epoch': 0.18}


 18%|█▊        | 3012/16798 [12:26<1:06:01,  3.48it/s]

{'loss': 1.8614, 'grad_norm': 2.0881917476654053, 'learning_rate': 0.00016423635930426495, 'epoch': 0.18}


 18%|█▊        | 3013/16798 [12:26<1:02:24,  3.68it/s]

{'loss': 1.868, 'grad_norm': 2.2060863971710205, 'learning_rate': 0.00016422444603288064, 'epoch': 0.18}


 18%|█▊        | 3014/16798 [12:27<1:05:40,  3.50it/s]

{'loss': 1.6061, 'grad_norm': 1.7204594612121582, 'learning_rate': 0.0001642125327614963, 'epoch': 0.18}


 18%|█▊        | 3015/16798 [12:27<1:02:20,  3.68it/s]

{'loss': 1.9646, 'grad_norm': 2.628173351287842, 'learning_rate': 0.000164200619490112, 'epoch': 0.18}


 18%|█▊        | 3016/16798 [12:27<1:01:34,  3.73it/s]

{'loss': 1.5617, 'grad_norm': 2.3366854190826416, 'learning_rate': 0.00016418870621872766, 'epoch': 0.18}


 18%|█▊        | 3017/16798 [12:28<1:00:46,  3.78it/s]

{'loss': 1.7161, 'grad_norm': 1.9702136516571045, 'learning_rate': 0.00016417679294734335, 'epoch': 0.18}


 18%|█▊        | 3018/16798 [12:28<1:00:05,  3.82it/s]

{'loss': 1.4309, 'grad_norm': 1.604354977607727, 'learning_rate': 0.000164164879675959, 'epoch': 0.18}


 18%|█▊        | 3019/16798 [12:28<1:02:32,  3.67it/s]

{'loss': 1.7868, 'grad_norm': 2.1026298999786377, 'learning_rate': 0.0001641529664045747, 'epoch': 0.18}


 18%|█▊        | 3020/16798 [12:28<1:00:13,  3.81it/s]

{'loss': 1.702, 'grad_norm': 1.768324851989746, 'learning_rate': 0.0001641410531331904, 'epoch': 0.18}


 18%|█▊        | 3021/16798 [12:29<1:06:18,  3.46it/s]

{'loss': 1.6457, 'grad_norm': 1.9579129219055176, 'learning_rate': 0.00016412913986180606, 'epoch': 0.18}


 18%|█▊        | 3022/16798 [12:29<1:01:48,  3.71it/s]

{'loss': 1.7571, 'grad_norm': 2.0013427734375, 'learning_rate': 0.00016411722659042175, 'epoch': 0.18}


 18%|█▊        | 3023/16798 [12:29<1:02:24,  3.68it/s]

{'loss': 1.6663, 'grad_norm': 1.927396535873413, 'learning_rate': 0.0001641053133190374, 'epoch': 0.18}


 18%|█▊        | 3024/16798 [12:29<1:03:01,  3.64it/s]

{'loss': 1.2739, 'grad_norm': 1.7118189334869385, 'learning_rate': 0.0001640934000476531, 'epoch': 0.18}


 18%|█▊        | 3025/16798 [12:30<1:01:57,  3.70it/s]

{'loss': 1.0986, 'grad_norm': 1.5130215883255005, 'learning_rate': 0.00016408148677626876, 'epoch': 0.18}


 18%|█▊        | 3026/16798 [12:30<1:05:30,  3.50it/s]

{'loss': 1.4897, 'grad_norm': 2.2923781871795654, 'learning_rate': 0.00016406957350488446, 'epoch': 0.18}


 18%|█▊        | 3027/16798 [12:30<1:03:37,  3.61it/s]

{'loss': 1.2874, 'grad_norm': 1.6039931774139404, 'learning_rate': 0.00016405766023350012, 'epoch': 0.18}


 18%|█▊        | 3028/16798 [12:31<1:01:43,  3.72it/s]

{'loss': 1.3198, 'grad_norm': 1.589208722114563, 'learning_rate': 0.0001640457469621158, 'epoch': 0.18}


 18%|█▊        | 3029/16798 [12:31<1:02:36,  3.67it/s]

{'loss': 1.5605, 'grad_norm': 1.9892369508743286, 'learning_rate': 0.00016403383369073147, 'epoch': 0.18}


 18%|█▊        | 3030/16798 [12:31<1:02:56,  3.65it/s]

{'loss': 1.4911, 'grad_norm': 1.773464560508728, 'learning_rate': 0.00016402192041934716, 'epoch': 0.18}


 18%|█▊        | 3031/16798 [12:31<1:01:46,  3.71it/s]

{'loss': 1.3496, 'grad_norm': 1.6317640542984009, 'learning_rate': 0.00016401000714796283, 'epoch': 0.18}


 18%|█▊        | 3032/16798 [12:32<1:00:58,  3.76it/s]

{'loss': 1.8178, 'grad_norm': 2.3945088386535645, 'learning_rate': 0.00016399809387657852, 'epoch': 0.18}


 18%|█▊        | 3033/16798 [12:32<1:00:49,  3.77it/s]

{'loss': 1.322, 'grad_norm': 1.8424556255340576, 'learning_rate': 0.00016398618060519418, 'epoch': 0.18}


 18%|█▊        | 3034/16798 [12:32<1:02:25,  3.68it/s]

{'loss': 1.5309, 'grad_norm': 1.7662456035614014, 'learning_rate': 0.00016397426733380987, 'epoch': 0.18}


 18%|█▊        | 3035/16798 [12:32<1:01:10,  3.75it/s]

{'loss': 1.7232, 'grad_norm': 1.9576267004013062, 'learning_rate': 0.00016396235406242554, 'epoch': 0.18}


 18%|█▊        | 3036/16798 [12:33<59:33,  3.85it/s]  

{'loss': 1.7106, 'grad_norm': 2.273214340209961, 'learning_rate': 0.00016395044079104123, 'epoch': 0.18}


 18%|█▊        | 3037/16798 [12:33<1:02:35,  3.66it/s]

{'loss': 1.192, 'grad_norm': 1.6572835445404053, 'learning_rate': 0.0001639385275196569, 'epoch': 0.18}


 18%|█▊        | 3038/16798 [12:33<1:01:44,  3.71it/s]

{'loss': 1.6105, 'grad_norm': 2.0751194953918457, 'learning_rate': 0.00016392661424827258, 'epoch': 0.18}


 18%|█▊        | 3039/16798 [12:34<1:02:49,  3.65it/s]

{'loss': 1.6157, 'grad_norm': 2.023367166519165, 'learning_rate': 0.00016391470097688825, 'epoch': 0.18}


 18%|█▊        | 3040/16798 [12:34<1:01:05,  3.75it/s]

{'loss': 1.4775, 'grad_norm': 2.07944393157959, 'learning_rate': 0.00016390278770550394, 'epoch': 0.18}


 18%|█▊        | 3041/16798 [12:34<58:00,  3.95it/s]  

{'loss': 1.0762, 'grad_norm': 2.0736610889434814, 'learning_rate': 0.0001638908744341196, 'epoch': 0.18}


 18%|█▊        | 3042/16798 [12:34<1:02:42,  3.66it/s]

{'loss': 1.1427, 'grad_norm': 2.529041290283203, 'learning_rate': 0.0001638789611627353, 'epoch': 0.18}


 18%|█▊        | 3043/16798 [12:35<1:04:03,  3.58it/s]

{'loss': 1.3043, 'grad_norm': 1.7583965063095093, 'learning_rate': 0.00016386704789135095, 'epoch': 0.18}


 18%|█▊        | 3044/16798 [12:35<1:00:12,  3.81it/s]

{'loss': 1.3557, 'grad_norm': 1.99033784866333, 'learning_rate': 0.00016385513461996665, 'epoch': 0.18}


 18%|█▊        | 3045/16798 [12:35<1:00:47,  3.77it/s]

{'loss': 0.9985, 'grad_norm': 1.6665351390838623, 'learning_rate': 0.0001638432213485823, 'epoch': 0.18}


 18%|█▊        | 3046/16798 [12:35<59:25,  3.86it/s]  

{'loss': 1.2464, 'grad_norm': 2.2578630447387695, 'learning_rate': 0.000163831308077198, 'epoch': 0.18}


 18%|█▊        | 3047/16798 [12:36<1:04:32,  3.55it/s]

{'loss': 0.7287, 'grad_norm': 1.3938900232315063, 'learning_rate': 0.00016381939480581366, 'epoch': 0.18}


 18%|█▊        | 3048/16798 [12:36<1:00:25,  3.79it/s]

{'loss': 0.3134, 'grad_norm': 0.8558775186538696, 'learning_rate': 0.00016380748153442935, 'epoch': 0.18}


 18%|█▊        | 3049/16798 [12:36<57:45,  3.97it/s]  

{'loss': 0.2216, 'grad_norm': 0.6463233828544617, 'learning_rate': 0.00016379556826304502, 'epoch': 0.18}


 18%|█▊        | 3050/16798 [12:36<57:17,  4.00it/s]

{'loss': 0.7574, 'grad_norm': 1.5419777631759644, 'learning_rate': 0.00016378365499166074, 'epoch': 0.18}


 18%|█▊        | 3051/16798 [12:37<1:00:18,  3.80it/s]

{'loss': 1.8223, 'grad_norm': 1.7545628547668457, 'learning_rate': 0.0001637717417202764, 'epoch': 0.18}


 18%|█▊        | 3052/16798 [12:37<1:03:44,  3.59it/s]

{'loss': 2.2193, 'grad_norm': 1.980992078781128, 'learning_rate': 0.0001637598284488921, 'epoch': 0.18}


 18%|█▊        | 3053/16798 [12:37<1:03:20,  3.62it/s]

{'loss': 2.3277, 'grad_norm': 2.394533634185791, 'learning_rate': 0.00016374791517750775, 'epoch': 0.18}


 18%|█▊        | 3054/16798 [12:38<1:05:17,  3.51it/s]

{'loss': 1.5567, 'grad_norm': 1.6098873615264893, 'learning_rate': 0.00016373600190612344, 'epoch': 0.18}


 18%|█▊        | 3055/16798 [12:38<1:02:45,  3.65it/s]

{'loss': 1.6895, 'grad_norm': 1.714141607284546, 'learning_rate': 0.0001637240886347391, 'epoch': 0.18}


 18%|█▊        | 3056/16798 [12:38<1:03:57,  3.58it/s]

{'loss': 1.5787, 'grad_norm': 1.638229250907898, 'learning_rate': 0.0001637121753633548, 'epoch': 0.18}


 18%|█▊        | 3057/16798 [12:38<1:00:55,  3.76it/s]

{'loss': 1.6152, 'grad_norm': 1.764384388923645, 'learning_rate': 0.00016370026209197046, 'epoch': 0.18}


 18%|█▊        | 3058/16798 [12:39<1:00:51,  3.76it/s]

{'loss': 1.3, 'grad_norm': 1.603476881980896, 'learning_rate': 0.00016368834882058615, 'epoch': 0.18}


 18%|█▊        | 3059/16798 [12:39<1:03:16,  3.62it/s]

{'loss': 2.1028, 'grad_norm': 2.357409715652466, 'learning_rate': 0.00016367643554920182, 'epoch': 0.18}


 18%|█▊        | 3060/16798 [12:39<1:01:57,  3.70it/s]

{'loss': 1.5156, 'grad_norm': 1.8678745031356812, 'learning_rate': 0.0001636645222778175, 'epoch': 0.18}


 18%|█▊        | 3061/16798 [12:39<1:01:55,  3.70it/s]

{'loss': 1.8522, 'grad_norm': 1.7952845096588135, 'learning_rate': 0.00016365260900643317, 'epoch': 0.18}


 18%|█▊        | 3063/16798 [12:40<55:16,  4.14it/s]  

{'loss': 1.8473, 'grad_norm': 1.8509100675582886, 'learning_rate': 0.00016364069573504886, 'epoch': 0.18}


 18%|█▊        | 3063/16798 [12:40<55:16,  4.14it/s]

{'loss': 1.8177, 'grad_norm': 2.1728901863098145, 'learning_rate': 0.00016362878246366453, 'epoch': 0.18}


 18%|█▊        | 3064/16798 [12:40<53:36,  4.27it/s]

{'loss': 1.6087, 'grad_norm': 2.1138508319854736, 'learning_rate': 0.00016361686919228022, 'epoch': 0.18}


 18%|█▊        | 3065/16798 [12:40<58:40,  3.90it/s]

{'loss': 1.3963, 'grad_norm': 1.7703872919082642, 'learning_rate': 0.00016360495592089588, 'epoch': 0.18}


 18%|█▊        | 3066/16798 [12:41<56:39,  4.04it/s]

{'loss': 1.2487, 'grad_norm': 1.650126576423645, 'learning_rate': 0.00016359304264951157, 'epoch': 0.18}


 18%|█▊        | 3067/16798 [12:41<58:03,  3.94it/s]

{'loss': 1.6721, 'grad_norm': 2.1009926795959473, 'learning_rate': 0.00016358112937812723, 'epoch': 0.18}


 18%|█▊        | 3068/16798 [12:41<58:04,  3.94it/s]

{'loss': 1.5829, 'grad_norm': 1.7405495643615723, 'learning_rate': 0.00016356921610674293, 'epoch': 0.18}


 18%|█▊        | 3069/16798 [12:41<55:50,  4.10it/s]

{'loss': 1.523, 'grad_norm': 1.9791755676269531, 'learning_rate': 0.0001635573028353586, 'epoch': 0.18}


 18%|█▊        | 3070/16798 [12:42<53:32,  4.27it/s]

{'loss': 1.8487, 'grad_norm': 1.962830662727356, 'learning_rate': 0.00016354538956397428, 'epoch': 0.18}


 18%|█▊        | 3071/16798 [12:42<55:19,  4.14it/s]

{'loss': 1.2684, 'grad_norm': 1.6576817035675049, 'learning_rate': 0.00016353347629258994, 'epoch': 0.18}


 18%|█▊        | 3072/16798 [12:42<58:17,  3.92it/s]

{'loss': 1.8218, 'grad_norm': 2.1564056873321533, 'learning_rate': 0.00016352156302120563, 'epoch': 0.18}


 18%|█▊        | 3073/16798 [12:42<55:45,  4.10it/s]

{'loss': 1.3471, 'grad_norm': 1.6578232049942017, 'learning_rate': 0.0001635096497498213, 'epoch': 0.18}


 18%|█▊        | 3074/16798 [12:43<57:35,  3.97it/s]

{'loss': 1.3138, 'grad_norm': 1.9006595611572266, 'learning_rate': 0.000163497736478437, 'epoch': 0.18}


 18%|█▊        | 3075/16798 [12:43<1:01:20,  3.73it/s]

{'loss': 1.5022, 'grad_norm': 1.7853796482086182, 'learning_rate': 0.00016348582320705265, 'epoch': 0.18}


 18%|█▊        | 3076/16798 [12:43<57:59,  3.94it/s]  

{'loss': 1.7164, 'grad_norm': 1.8512189388275146, 'learning_rate': 0.00016347390993566834, 'epoch': 0.18}


 18%|█▊        | 3077/16798 [12:43<1:01:32,  3.72it/s]

{'loss': 1.2956, 'grad_norm': 1.7006245851516724, 'learning_rate': 0.000163461996664284, 'epoch': 0.18}


 18%|█▊        | 3078/16798 [12:44<1:01:01,  3.75it/s]

{'loss': 1.5364, 'grad_norm': 2.0604498386383057, 'learning_rate': 0.0001634500833928997, 'epoch': 0.18}


 18%|█▊        | 3079/16798 [12:44<1:01:51,  3.70it/s]

{'loss': 1.4009, 'grad_norm': 1.832713007926941, 'learning_rate': 0.00016343817012151536, 'epoch': 0.18}


 18%|█▊        | 3080/16798 [12:44<1:00:34,  3.77it/s]

{'loss': 2.0522, 'grad_norm': 2.5308475494384766, 'learning_rate': 0.00016342625685013105, 'epoch': 0.18}


 18%|█▊        | 3081/16798 [12:44<58:37,  3.90it/s]  

{'loss': 1.4356, 'grad_norm': 1.939523696899414, 'learning_rate': 0.00016341434357874674, 'epoch': 0.18}


 18%|█▊        | 3082/16798 [12:45<1:00:50,  3.76it/s]

{'loss': 1.3613, 'grad_norm': 1.7147754430770874, 'learning_rate': 0.0001634024303073624, 'epoch': 0.18}


 18%|█▊        | 3083/16798 [12:45<1:02:45,  3.64it/s]

{'loss': 1.6282, 'grad_norm': 2.23533296585083, 'learning_rate': 0.0001633905170359781, 'epoch': 0.18}


 18%|█▊        | 3084/16798 [12:45<1:00:16,  3.79it/s]

{'loss': 1.8466, 'grad_norm': 2.1038904190063477, 'learning_rate': 0.00016337860376459376, 'epoch': 0.18}


 18%|█▊        | 3085/16798 [12:46<1:02:56,  3.63it/s]

{'loss': 0.9894, 'grad_norm': 1.6441787481307983, 'learning_rate': 0.00016336669049320945, 'epoch': 0.18}


 18%|█▊        | 3086/16798 [12:46<58:18,  3.92it/s]  

{'loss': 1.3995, 'grad_norm': 1.811718225479126, 'learning_rate': 0.00016335477722182512, 'epoch': 0.18}


 18%|█▊        | 3087/16798 [12:46<58:48,  3.89it/s]

{'loss': 1.4509, 'grad_norm': 2.0217816829681396, 'learning_rate': 0.0001633428639504408, 'epoch': 0.18}


 18%|█▊        | 3088/16798 [12:46<59:23,  3.85it/s]

{'loss': 0.9528, 'grad_norm': 1.6663644313812256, 'learning_rate': 0.00016333095067905647, 'epoch': 0.18}


 18%|█▊        | 3089/16798 [12:47<1:01:21,  3.72it/s]

{'loss': 1.3136, 'grad_norm': 2.578563928604126, 'learning_rate': 0.00016331903740767216, 'epoch': 0.18}


 18%|█▊        | 3090/16798 [12:47<1:04:28,  3.54it/s]

{'loss': 1.4684, 'grad_norm': 1.7557731866836548, 'learning_rate': 0.00016330712413628782, 'epoch': 0.18}


 18%|█▊        | 3091/16798 [12:47<59:41,  3.83it/s]  

{'loss': 1.3768, 'grad_norm': 1.7059752941131592, 'learning_rate': 0.00016329521086490352, 'epoch': 0.18}


 18%|█▊        | 3092/16798 [12:47<56:35,  4.04it/s]

{'loss': 0.9261, 'grad_norm': 1.6607707738876343, 'learning_rate': 0.00016328329759351918, 'epoch': 0.18}


 18%|█▊        | 3093/16798 [12:48<55:55,  4.08it/s]

{'loss': 1.3202, 'grad_norm': 1.9904650449752808, 'learning_rate': 0.00016327138432213487, 'epoch': 0.18}


 18%|█▊        | 3094/16798 [12:48<59:53,  3.81it/s]

{'loss': 1.1668, 'grad_norm': 1.9407150745391846, 'learning_rate': 0.00016325947105075053, 'epoch': 0.18}


 18%|█▊        | 3095/16798 [12:48<56:54,  4.01it/s]

{'loss': 1.2882, 'grad_norm': 2.149940013885498, 'learning_rate': 0.00016324755777936622, 'epoch': 0.18}


 18%|█▊        | 3096/16798 [12:48<57:16,  3.99it/s]

{'loss': 1.6245, 'grad_norm': 2.54744291305542, 'learning_rate': 0.0001632356445079819, 'epoch': 0.18}


 18%|█▊        | 3097/16798 [12:49<1:00:41,  3.76it/s]

{'loss': 0.6578, 'grad_norm': 1.3161029815673828, 'learning_rate': 0.00016322373123659758, 'epoch': 0.18}


 18%|█▊        | 3098/16798 [12:49<58:34,  3.90it/s]  

{'loss': 0.6881, 'grad_norm': 1.4294592142105103, 'learning_rate': 0.00016321181796521324, 'epoch': 0.18}


 18%|█▊        | 3099/16798 [12:49<59:32,  3.84it/s]

{'loss': 0.348, 'grad_norm': 0.9419055581092834, 'learning_rate': 0.00016319990469382893, 'epoch': 0.18}


 18%|█▊        | 3100/16798 [12:49<1:00:45,  3.76it/s]

{'loss': 0.8178, 'grad_norm': 1.5820772647857666, 'learning_rate': 0.0001631879914224446, 'epoch': 0.18}


 18%|█▊        | 3101/16798 [12:50<1:04:51,  3.52it/s]

{'loss': 1.9191, 'grad_norm': 1.8306496143341064, 'learning_rate': 0.0001631760781510603, 'epoch': 0.18}


 18%|█▊        | 3102/16798 [12:50<1:03:12,  3.61it/s]

{'loss': 2.2269, 'grad_norm': 1.8981688022613525, 'learning_rate': 0.00016316416487967595, 'epoch': 0.18}


 18%|█▊        | 3103/16798 [12:50<1:00:49,  3.75it/s]

{'loss': 1.9847, 'grad_norm': 1.8567218780517578, 'learning_rate': 0.00016315225160829164, 'epoch': 0.18}


 18%|█▊        | 3104/16798 [12:51<1:02:56,  3.63it/s]

{'loss': 2.1306, 'grad_norm': 2.678403377532959, 'learning_rate': 0.0001631403383369073, 'epoch': 0.18}


 18%|█▊        | 3105/16798 [12:51<1:03:20,  3.60it/s]

{'loss': 2.2193, 'grad_norm': 2.1976535320281982, 'learning_rate': 0.000163128425065523, 'epoch': 0.18}


 18%|█▊        | 3106/16798 [12:51<1:01:21,  3.72it/s]

{'loss': 2.0456, 'grad_norm': 2.204064130783081, 'learning_rate': 0.00016311651179413866, 'epoch': 0.18}


 18%|█▊        | 3107/16798 [12:51<1:03:25,  3.60it/s]

{'loss': 2.0555, 'grad_norm': 2.0265395641326904, 'learning_rate': 0.00016310459852275435, 'epoch': 0.18}


 19%|█▊        | 3108/16798 [12:52<1:02:13,  3.67it/s]

{'loss': 1.7133, 'grad_norm': 1.9407525062561035, 'learning_rate': 0.00016309268525137001, 'epoch': 0.19}


 19%|█▊        | 3109/16798 [12:52<1:01:49,  3.69it/s]

{'loss': 1.8017, 'grad_norm': 2.001218318939209, 'learning_rate': 0.0001630807719799857, 'epoch': 0.19}


 19%|█▊        | 3110/16798 [12:52<58:29,  3.90it/s]  

{'loss': 1.819, 'grad_norm': 2.011838436126709, 'learning_rate': 0.00016306885870860137, 'epoch': 0.19}


 19%|█▊        | 3111/16798 [12:52<1:02:37,  3.64it/s]

{'loss': 1.3989, 'grad_norm': 1.6486858129501343, 'learning_rate': 0.00016305694543721706, 'epoch': 0.19}


 19%|█▊        | 3112/16798 [12:53<1:04:11,  3.55it/s]

{'loss': 1.7269, 'grad_norm': 1.905085802078247, 'learning_rate': 0.00016304503216583275, 'epoch': 0.19}


 19%|█▊        | 3113/16798 [12:53<1:05:16,  3.49it/s]

{'loss': 1.572, 'grad_norm': 1.667048692703247, 'learning_rate': 0.00016303311889444844, 'epoch': 0.19}


 19%|█▊        | 3114/16798 [12:53<1:04:42,  3.52it/s]

{'loss': 1.4369, 'grad_norm': 1.5680248737335205, 'learning_rate': 0.0001630212056230641, 'epoch': 0.19}


 19%|█▊        | 3115/16798 [12:54<1:01:34,  3.70it/s]

{'loss': 1.7764, 'grad_norm': 2.0455257892608643, 'learning_rate': 0.0001630092923516798, 'epoch': 0.19}


 19%|█▊        | 3116/16798 [12:54<1:00:40,  3.76it/s]

{'loss': 1.6304, 'grad_norm': 2.111109733581543, 'learning_rate': 0.00016299737908029546, 'epoch': 0.19}


 19%|█▊        | 3117/16798 [12:54<1:01:39,  3.70it/s]

{'loss': 1.7135, 'grad_norm': 1.8831071853637695, 'learning_rate': 0.00016298546580891115, 'epoch': 0.19}


 19%|█▊        | 3118/16798 [12:54<58:10,  3.92it/s]  

{'loss': 2.0955, 'grad_norm': 2.2452306747436523, 'learning_rate': 0.0001629735525375268, 'epoch': 0.19}


 19%|█▊        | 3119/16798 [12:55<56:00,  4.07it/s]

{'loss': 1.3833, 'grad_norm': 1.6131867170333862, 'learning_rate': 0.0001629616392661425, 'epoch': 0.19}


 19%|█▊        | 3120/16798 [12:55<58:10,  3.92it/s]

{'loss': 1.2398, 'grad_norm': 1.6552313566207886, 'learning_rate': 0.00016294972599475817, 'epoch': 0.19}


 19%|█▊        | 3121/16798 [12:55<57:37,  3.96it/s]

{'loss': 1.3423, 'grad_norm': 1.8165732622146606, 'learning_rate': 0.00016293781272337386, 'epoch': 0.19}


 19%|█▊        | 3122/16798 [12:55<1:00:28,  3.77it/s]

{'loss': 1.8446, 'grad_norm': 2.159554958343506, 'learning_rate': 0.00016292589945198952, 'epoch': 0.19}


 19%|█▊        | 3124/16798 [12:56<55:15,  4.12it/s]  

{'loss': 1.3287, 'grad_norm': 1.8413292169570923, 'learning_rate': 0.0001629139861806052, 'epoch': 0.19}


 19%|█▊        | 3124/16798 [12:56<55:15,  4.12it/s]

{'loss': 1.7825, 'grad_norm': 2.3537893295288086, 'learning_rate': 0.00016290207290922088, 'epoch': 0.19}


 19%|█▊        | 3125/16798 [12:56<57:22,  3.97it/s]

{'loss': 1.6863, 'grad_norm': 2.465531587600708, 'learning_rate': 0.00016289015963783657, 'epoch': 0.19}


 19%|█▊        | 3126/16798 [12:56<56:31,  4.03it/s]

{'loss': 1.7641, 'grad_norm': 2.125202178955078, 'learning_rate': 0.00016287824636645223, 'epoch': 0.19}


 19%|█▊        | 3127/16798 [12:57<58:23,  3.90it/s]

{'loss': 1.6396, 'grad_norm': 2.2804484367370605, 'learning_rate': 0.00016286633309506792, 'epoch': 0.19}


 19%|█▊        | 3128/16798 [12:57<57:29,  3.96it/s]

{'loss': 1.6905, 'grad_norm': 1.9817003011703491, 'learning_rate': 0.00016285441982368359, 'epoch': 0.19}


 19%|█▊        | 3129/16798 [12:57<58:48,  3.87it/s]

{'loss': 1.7766, 'grad_norm': 2.0676963329315186, 'learning_rate': 0.00016284250655229928, 'epoch': 0.19}


 19%|█▊        | 3130/16798 [12:57<1:00:38,  3.76it/s]

{'loss': 1.6937, 'grad_norm': 1.9132046699523926, 'learning_rate': 0.00016283059328091494, 'epoch': 0.19}


 19%|█▊        | 3131/16798 [12:58<1:00:41,  3.75it/s]

{'loss': 1.5957, 'grad_norm': 1.8035085201263428, 'learning_rate': 0.00016281868000953063, 'epoch': 0.19}


 19%|█▊        | 3132/16798 [12:58<1:00:53,  3.74it/s]

{'loss': 1.4681, 'grad_norm': 1.8172342777252197, 'learning_rate': 0.0001628067667381463, 'epoch': 0.19}


 19%|█▊        | 3133/16798 [12:58<1:00:03,  3.79it/s]

{'loss': 1.5513, 'grad_norm': 2.2895147800445557, 'learning_rate': 0.00016279485346676198, 'epoch': 0.19}


 19%|█▊        | 3134/16798 [12:58<57:11,  3.98it/s]  

{'loss': 1.629, 'grad_norm': 2.0493125915527344, 'learning_rate': 0.00016278294019537765, 'epoch': 0.19}


 19%|█▊        | 3135/16798 [12:59<58:04,  3.92it/s]

{'loss': 1.1544, 'grad_norm': 1.6952214241027832, 'learning_rate': 0.00016277102692399334, 'epoch': 0.19}


 19%|█▊        | 3136/16798 [12:59<1:04:21,  3.54it/s]

{'loss': 1.8005, 'grad_norm': 2.2282114028930664, 'learning_rate': 0.000162759113652609, 'epoch': 0.19}


 19%|█▊        | 3137/16798 [12:59<1:03:02,  3.61it/s]

{'loss': 1.4445, 'grad_norm': 1.7404215335845947, 'learning_rate': 0.0001627472003812247, 'epoch': 0.19}


 19%|█▊        | 3138/16798 [13:00<1:02:54,  3.62it/s]

{'loss': 1.0355, 'grad_norm': 1.6225931644439697, 'learning_rate': 0.00016273528710984036, 'epoch': 0.19}


 19%|█▊        | 3139/16798 [13:00<59:58,  3.80it/s]  

{'loss': 1.4378, 'grad_norm': 2.0425267219543457, 'learning_rate': 0.00016272337383845605, 'epoch': 0.19}


 19%|█▊        | 3140/16798 [13:00<59:19,  3.84it/s]

{'loss': 1.1065, 'grad_norm': 1.7322396039962769, 'learning_rate': 0.0001627114605670717, 'epoch': 0.19}


 19%|█▊        | 3141/16798 [13:00<1:01:51,  3.68it/s]

{'loss': 1.3851, 'grad_norm': 1.8728032112121582, 'learning_rate': 0.0001626995472956874, 'epoch': 0.19}


 19%|█▊        | 3142/16798 [13:01<59:41,  3.81it/s]  

{'loss': 1.5547, 'grad_norm': 1.8900773525238037, 'learning_rate': 0.00016268763402430307, 'epoch': 0.19}


 19%|█▊        | 3143/16798 [13:01<1:00:35,  3.76it/s]

{'loss': 1.2175, 'grad_norm': 1.578041434288025, 'learning_rate': 0.00016267572075291876, 'epoch': 0.19}


 19%|█▊        | 3144/16798 [13:01<57:45,  3.94it/s]  

{'loss': 0.8661, 'grad_norm': 1.6467357873916626, 'learning_rate': 0.00016266380748153445, 'epoch': 0.19}


 19%|█▊        | 3145/16798 [13:01<54:48,  4.15it/s]

{'loss': 0.7425, 'grad_norm': 1.2010927200317383, 'learning_rate': 0.0001626518942101501, 'epoch': 0.19}


 19%|█▊        | 3146/16798 [13:02<57:24,  3.96it/s]

{'loss': 0.9437, 'grad_norm': 1.6676560640335083, 'learning_rate': 0.0001626399809387658, 'epoch': 0.19}


 19%|█▊        | 3147/16798 [13:02<57:30,  3.96it/s]

{'loss': 0.4963, 'grad_norm': 1.120979905128479, 'learning_rate': 0.00016262806766738147, 'epoch': 0.19}


 19%|█▊        | 3148/16798 [13:02<55:15,  4.12it/s]

{'loss': 0.2475, 'grad_norm': 0.7229959964752197, 'learning_rate': 0.00016261615439599716, 'epoch': 0.19}


 19%|█▊        | 3149/16798 [13:02<59:13,  3.84it/s]

{'loss': 0.4092, 'grad_norm': 1.0095889568328857, 'learning_rate': 0.00016260424112461282, 'epoch': 0.19}


 19%|█▉        | 3150/16798 [13:03<55:48,  4.08it/s]

{'loss': 0.2558, 'grad_norm': 0.6934307813644409, 'learning_rate': 0.0001625923278532285, 'epoch': 0.19}


 19%|█▉        | 3151/16798 [13:03<1:00:39,  3.75it/s]

{'loss': 2.1262, 'grad_norm': 1.9746865034103394, 'learning_rate': 0.00016258041458184417, 'epoch': 0.19}


 19%|█▉        | 3152/16798 [13:03<1:01:07,  3.72it/s]

{'loss': 2.2487, 'grad_norm': 1.915549635887146, 'learning_rate': 0.00016256850131045987, 'epoch': 0.19}


 19%|█▉        | 3153/16798 [13:03<1:01:18,  3.71it/s]

{'loss': 1.8757, 'grad_norm': 1.809187889099121, 'learning_rate': 0.00016255658803907553, 'epoch': 0.19}


 19%|█▉        | 3154/16798 [13:04<1:03:07,  3.60it/s]

{'loss': 1.8971, 'grad_norm': 1.947943091392517, 'learning_rate': 0.00016254467476769122, 'epoch': 0.19}


 19%|█▉        | 3155/16798 [13:04<59:47,  3.80it/s]  

{'loss': 2.01, 'grad_norm': 2.0896756649017334, 'learning_rate': 0.00016253276149630688, 'epoch': 0.19}


 19%|█▉        | 3156/16798 [13:04<1:03:04,  3.60it/s]

{'loss': 2.1301, 'grad_norm': 2.3035566806793213, 'learning_rate': 0.00016252084822492257, 'epoch': 0.19}


 19%|█▉        | 3157/16798 [13:05<1:01:43,  3.68it/s]

{'loss': 1.9289, 'grad_norm': 1.8046659231185913, 'learning_rate': 0.00016250893495353824, 'epoch': 0.19}


 19%|█▉        | 3158/16798 [13:05<1:02:52,  3.62it/s]

{'loss': 1.6422, 'grad_norm': 1.6575584411621094, 'learning_rate': 0.00016249702168215393, 'epoch': 0.19}


 19%|█▉        | 3159/16798 [13:05<1:04:22,  3.53it/s]

{'loss': 2.1786, 'grad_norm': 2.0624146461486816, 'learning_rate': 0.0001624851084107696, 'epoch': 0.19}


 19%|█▉        | 3160/16798 [13:05<1:00:33,  3.75it/s]

{'loss': 1.6073, 'grad_norm': 1.8580524921417236, 'learning_rate': 0.00016247319513938528, 'epoch': 0.19}


 19%|█▉        | 3161/16798 [13:06<1:02:34,  3.63it/s]

{'loss': 1.5622, 'grad_norm': 1.8819615840911865, 'learning_rate': 0.00016246128186800095, 'epoch': 0.19}


 19%|█▉        | 3162/16798 [13:06<1:03:25,  3.58it/s]

{'loss': 2.0448, 'grad_norm': 2.3172569274902344, 'learning_rate': 0.00016244936859661664, 'epoch': 0.19}


 19%|█▉        | 3163/16798 [13:06<1:01:25,  3.70it/s]

{'loss': 1.5566, 'grad_norm': 2.2897608280181885, 'learning_rate': 0.0001624374553252323, 'epoch': 0.19}


 19%|█▉        | 3164/16798 [13:06<59:34,  3.81it/s]  

{'loss': 1.8001, 'grad_norm': 2.0918350219726562, 'learning_rate': 0.000162425542053848, 'epoch': 0.19}


 19%|█▉        | 3165/16798 [13:07<1:03:02,  3.60it/s]

{'loss': 2.1969, 'grad_norm': 2.273064136505127, 'learning_rate': 0.00016241362878246366, 'epoch': 0.19}


 19%|█▉        | 3166/16798 [13:07<1:00:38,  3.75it/s]

{'loss': 1.6027, 'grad_norm': 1.8234906196594238, 'learning_rate': 0.00016240171551107935, 'epoch': 0.19}


 19%|█▉        | 3167/16798 [13:07<1:00:43,  3.74it/s]

{'loss': 1.9304, 'grad_norm': 2.050619125366211, 'learning_rate': 0.000162389802239695, 'epoch': 0.19}


 19%|█▉        | 3168/16798 [13:07<57:07,  3.98it/s]  

{'loss': 1.6122, 'grad_norm': 2.184492588043213, 'learning_rate': 0.0001623778889683107, 'epoch': 0.19}


 19%|█▉        | 3169/16798 [13:08<56:17,  4.04it/s]

{'loss': 1.5476, 'grad_norm': 1.8450955152511597, 'learning_rate': 0.00016236597569692636, 'epoch': 0.19}


 19%|█▉        | 3170/16798 [13:08<58:21,  3.89it/s]

{'loss': 1.6356, 'grad_norm': 1.9610956907272339, 'learning_rate': 0.00016235406242554206, 'epoch': 0.19}


 19%|█▉        | 3171/16798 [13:08<1:08:26,  3.32it/s]

{'loss': 1.5479, 'grad_norm': 1.8475849628448486, 'learning_rate': 0.00016234214915415772, 'epoch': 0.19}


 19%|█▉        | 3172/16798 [13:09<1:05:06,  3.49it/s]

{'loss': 1.2287, 'grad_norm': 1.6260277032852173, 'learning_rate': 0.0001623302358827734, 'epoch': 0.19}


 19%|█▉        | 3173/16798 [13:09<1:04:09,  3.54it/s]

{'loss': 1.1574, 'grad_norm': 1.7592302560806274, 'learning_rate': 0.00016231832261138907, 'epoch': 0.19}


 19%|█▉        | 3174/16798 [13:09<1:03:10,  3.59it/s]

{'loss': 1.5113, 'grad_norm': 1.854642391204834, 'learning_rate': 0.0001623064093400048, 'epoch': 0.19}


 19%|█▉        | 3175/16798 [13:09<1:03:36,  3.57it/s]

{'loss': 1.578, 'grad_norm': 1.7180103063583374, 'learning_rate': 0.00016229449606862045, 'epoch': 0.19}


 19%|█▉        | 3176/16798 [13:10<1:02:12,  3.65it/s]

{'loss': 1.2383, 'grad_norm': 1.4869581460952759, 'learning_rate': 0.00016228258279723615, 'epoch': 0.19}


 19%|█▉        | 3177/16798 [13:10<1:01:29,  3.69it/s]

{'loss': 1.471, 'grad_norm': 2.025120735168457, 'learning_rate': 0.0001622706695258518, 'epoch': 0.19}


 19%|█▉        | 3178/16798 [13:10<58:52,  3.86it/s]  

{'loss': 1.9396, 'grad_norm': 2.6317007541656494, 'learning_rate': 0.0001622587562544675, 'epoch': 0.19}


 19%|█▉        | 3179/16798 [13:11<1:02:50,  3.61it/s]

{'loss': 2.0278, 'grad_norm': 2.1908140182495117, 'learning_rate': 0.00016224684298308316, 'epoch': 0.19}


 19%|█▉        | 3180/16798 [13:11<1:02:31,  3.63it/s]

{'loss': 1.3567, 'grad_norm': 1.8752433061599731, 'learning_rate': 0.00016223492971169885, 'epoch': 0.19}


 19%|█▉        | 3181/16798 [13:11<1:00:40,  3.74it/s]

{'loss': 1.2718, 'grad_norm': 1.6325620412826538, 'learning_rate': 0.00016222301644031452, 'epoch': 0.19}


 19%|█▉        | 3182/16798 [13:11<1:00:59,  3.72it/s]

{'loss': 1.752, 'grad_norm': 2.154629707336426, 'learning_rate': 0.0001622111031689302, 'epoch': 0.19}


 19%|█▉        | 3183/16798 [13:12<58:56,  3.85it/s]  

{'loss': 1.7655, 'grad_norm': 2.2090117931365967, 'learning_rate': 0.00016219918989754587, 'epoch': 0.19}


 19%|█▉        | 3184/16798 [13:12<59:53,  3.79it/s]

{'loss': 1.4365, 'grad_norm': 1.8645710945129395, 'learning_rate': 0.00016218727662616156, 'epoch': 0.19}


 19%|█▉        | 3185/16798 [13:12<59:53,  3.79it/s]

{'loss': 1.439, 'grad_norm': 1.8741830587387085, 'learning_rate': 0.00016217536335477723, 'epoch': 0.19}


 19%|█▉        | 3186/16798 [13:12<57:47,  3.93it/s]

{'loss': 1.7805, 'grad_norm': 2.055919647216797, 'learning_rate': 0.00016216345008339292, 'epoch': 0.19}


 19%|█▉        | 3187/16798 [13:13<58:45,  3.86it/s]

{'loss': 1.1626, 'grad_norm': 1.846436619758606, 'learning_rate': 0.00016215153681200858, 'epoch': 0.19}


 19%|█▉        | 3188/16798 [13:13<55:59,  4.05it/s]

{'loss': 1.1724, 'grad_norm': 1.9302012920379639, 'learning_rate': 0.00016213962354062427, 'epoch': 0.19}


 19%|█▉        | 3189/16798 [13:13<56:17,  4.03it/s]

{'loss': 1.2227, 'grad_norm': 1.8492188453674316, 'learning_rate': 0.00016212771026923994, 'epoch': 0.19}


 19%|█▉        | 3190/16798 [13:13<57:38,  3.93it/s]

{'loss': 0.9271, 'grad_norm': 1.7784197330474854, 'learning_rate': 0.00016211579699785563, 'epoch': 0.19}


 19%|█▉        | 3191/16798 [13:14<56:56,  3.98it/s]

{'loss': 1.5692, 'grad_norm': 2.025643825531006, 'learning_rate': 0.0001621038837264713, 'epoch': 0.19}


 19%|█▉        | 3192/16798 [13:14<56:19,  4.03it/s]

{'loss': 1.8618, 'grad_norm': 2.017059803009033, 'learning_rate': 0.00016209197045508698, 'epoch': 0.19}


 19%|█▉        | 3193/16798 [13:14<58:08,  3.90it/s]

{'loss': 1.2895, 'grad_norm': 1.7937480211257935, 'learning_rate': 0.00016208005718370264, 'epoch': 0.19}


 19%|█▉        | 3194/16798 [13:14<58:56,  3.85it/s]

{'loss': 0.9114, 'grad_norm': 1.7230331897735596, 'learning_rate': 0.00016206814391231834, 'epoch': 0.19}


 19%|█▉        | 3195/16798 [13:15<1:00:54,  3.72it/s]

{'loss': 1.013, 'grad_norm': 1.815337896347046, 'learning_rate': 0.000162056230640934, 'epoch': 0.19}


 19%|█▉        | 3196/16798 [13:15<1:00:30,  3.75it/s]

{'loss': 0.6334, 'grad_norm': 1.3353947401046753, 'learning_rate': 0.0001620443173695497, 'epoch': 0.19}


 19%|█▉        | 3197/16798 [13:15<57:13,  3.96it/s]  

{'loss': 0.6112, 'grad_norm': 1.2753899097442627, 'learning_rate': 0.00016203240409816535, 'epoch': 0.19}


 19%|█▉        | 3198/16798 [13:15<59:12,  3.83it/s]

{'loss': 0.3032, 'grad_norm': 0.86250901222229, 'learning_rate': 0.00016202049082678104, 'epoch': 0.19}


 19%|█▉        | 3199/16798 [13:16<1:00:22,  3.75it/s]

{'loss': 0.491, 'grad_norm': 1.221930980682373, 'learning_rate': 0.0001620085775553967, 'epoch': 0.19}


 19%|█▉        | 3200/16798 [13:16<1:02:03,  3.65it/s]

{'loss': 0.8713, 'grad_norm': 1.649061918258667, 'learning_rate': 0.0001619966642840124, 'epoch': 0.19}


 19%|█▉        | 3201/16798 [13:16<1:01:50,  3.66it/s]

{'loss': 2.1943, 'grad_norm': 2.0048649311065674, 'learning_rate': 0.00016198475101262806, 'epoch': 0.19}


 19%|█▉        | 3202/16798 [13:17<1:01:44,  3.67it/s]

{'loss': 2.0103, 'grad_norm': 1.9253181219100952, 'learning_rate': 0.00016197283774124375, 'epoch': 0.19}


 19%|█▉        | 3203/16798 [13:17<1:01:28,  3.69it/s]

{'loss': 2.1262, 'grad_norm': 1.9641764163970947, 'learning_rate': 0.00016196092446985942, 'epoch': 0.19}


 19%|█▉        | 3204/16798 [13:17<59:21,  3.82it/s]  

{'loss': 2.0869, 'grad_norm': 2.6395905017852783, 'learning_rate': 0.0001619490111984751, 'epoch': 0.19}


 19%|█▉        | 3205/16798 [13:17<1:01:12,  3.70it/s]

{'loss': 2.1917, 'grad_norm': 1.9025713205337524, 'learning_rate': 0.0001619370979270908, 'epoch': 0.19}


 19%|█▉        | 3206/16798 [13:18<1:03:45,  3.55it/s]

{'loss': 1.877, 'grad_norm': 1.8047465085983276, 'learning_rate': 0.00016192518465570646, 'epoch': 0.19}


 19%|█▉        | 3207/16798 [13:18<1:00:46,  3.73it/s]

{'loss': 2.6663, 'grad_norm': 2.561774492263794, 'learning_rate': 0.00016191327138432215, 'epoch': 0.19}


 19%|█▉        | 3208/16798 [13:18<1:01:12,  3.70it/s]

{'loss': 1.7887, 'grad_norm': 1.9040863513946533, 'learning_rate': 0.00016190135811293782, 'epoch': 0.19}


 19%|█▉        | 3209/16798 [13:18<58:18,  3.88it/s]  

{'loss': 2.0852, 'grad_norm': 2.0609426498413086, 'learning_rate': 0.0001618894448415535, 'epoch': 0.19}


 19%|█▉        | 3210/16798 [13:19<55:47,  4.06it/s]

{'loss': 1.684, 'grad_norm': 1.6292859315872192, 'learning_rate': 0.00016187753157016917, 'epoch': 0.19}


 19%|█▉        | 3211/16798 [13:19<57:22,  3.95it/s]

{'loss': 1.9207, 'grad_norm': 2.110938549041748, 'learning_rate': 0.00016186561829878486, 'epoch': 0.19}


 19%|█▉        | 3212/16798 [13:19<55:34,  4.07it/s]

{'loss': 1.6326, 'grad_norm': 1.8037546873092651, 'learning_rate': 0.00016185370502740053, 'epoch': 0.19}


 19%|█▉        | 3213/16798 [13:19<54:43,  4.14it/s]

{'loss': 1.2711, 'grad_norm': 1.5625516176223755, 'learning_rate': 0.00016184179175601622, 'epoch': 0.19}


 19%|█▉        | 3214/16798 [13:20<1:01:48,  3.66it/s]

{'loss': 1.6215, 'grad_norm': 1.843045949935913, 'learning_rate': 0.00016182987848463188, 'epoch': 0.19}


 19%|█▉        | 3215/16798 [13:20<57:37,  3.93it/s]  

{'loss': 1.8828, 'grad_norm': 1.9084599018096924, 'learning_rate': 0.00016181796521324757, 'epoch': 0.19}


 19%|█▉        | 3216/16798 [13:20<54:59,  4.12it/s]

{'loss': 1.1163, 'grad_norm': 1.4044907093048096, 'learning_rate': 0.00016180605194186323, 'epoch': 0.19}


 19%|█▉        | 3217/16798 [13:20<56:41,  3.99it/s]

{'loss': 1.8243, 'grad_norm': 2.114461660385132, 'learning_rate': 0.00016179413867047892, 'epoch': 0.19}


 19%|█▉        | 3218/16798 [13:21<1:01:56,  3.65it/s]

{'loss': 1.5897, 'grad_norm': 1.733297348022461, 'learning_rate': 0.0001617822253990946, 'epoch': 0.19}


 19%|█▉        | 3219/16798 [13:21<1:02:55,  3.60it/s]

{'loss': 1.6548, 'grad_norm': 2.0531585216522217, 'learning_rate': 0.00016177031212771028, 'epoch': 0.19}


 19%|█▉        | 3220/16798 [13:21<1:03:11,  3.58it/s]

{'loss': 1.3156, 'grad_norm': 1.5066431760787964, 'learning_rate': 0.00016175839885632594, 'epoch': 0.19}


 19%|█▉        | 3221/16798 [13:22<59:09,  3.83it/s]  

{'loss': 1.707, 'grad_norm': 2.3115861415863037, 'learning_rate': 0.00016174648558494163, 'epoch': 0.19}


 19%|█▉        | 3222/16798 [13:22<1:00:38,  3.73it/s]

{'loss': 1.6987, 'grad_norm': 2.206252336502075, 'learning_rate': 0.0001617345723135573, 'epoch': 0.19}


 19%|█▉        | 3223/16798 [13:22<1:01:25,  3.68it/s]

{'loss': 1.8244, 'grad_norm': 2.041698932647705, 'learning_rate': 0.000161722659042173, 'epoch': 0.19}


 19%|█▉        | 3224/16798 [13:22<57:56,  3.90it/s]  

{'loss': 1.2492, 'grad_norm': 1.446578860282898, 'learning_rate': 0.00016171074577078865, 'epoch': 0.19}


 19%|█▉        | 3225/16798 [13:23<1:00:09,  3.76it/s]

{'loss': 1.3492, 'grad_norm': 1.6942780017852783, 'learning_rate': 0.00016169883249940434, 'epoch': 0.19}


 19%|█▉        | 3226/16798 [13:23<58:25,  3.87it/s]  

{'loss': 1.6273, 'grad_norm': 1.837555170059204, 'learning_rate': 0.00016168691922802, 'epoch': 0.19}


 19%|█▉        | 3227/16798 [13:23<1:01:31,  3.68it/s]

{'loss': 1.3995, 'grad_norm': 1.7806037664413452, 'learning_rate': 0.0001616750059566357, 'epoch': 0.19}


 19%|█▉        | 3228/16798 [13:23<1:02:18,  3.63it/s]

{'loss': 1.227, 'grad_norm': 1.7339025735855103, 'learning_rate': 0.00016166309268525136, 'epoch': 0.19}


 19%|█▉        | 3229/16798 [13:24<1:03:58,  3.53it/s]

{'loss': 1.8377, 'grad_norm': 2.194974422454834, 'learning_rate': 0.00016165117941386705, 'epoch': 0.19}


 19%|█▉        | 3230/16798 [13:24<59:31,  3.80it/s]  

{'loss': 1.1197, 'grad_norm': 1.5974518060684204, 'learning_rate': 0.00016163926614248272, 'epoch': 0.19}


 19%|█▉        | 3231/16798 [13:24<57:40,  3.92it/s]

{'loss': 1.6146, 'grad_norm': 1.8292509317398071, 'learning_rate': 0.0001616273528710984, 'epoch': 0.19}


 19%|█▉        | 3232/16798 [13:24<1:02:35,  3.61it/s]

{'loss': 1.5589, 'grad_norm': 1.798457145690918, 'learning_rate': 0.00016161543959971407, 'epoch': 0.19}


 19%|█▉        | 3233/16798 [13:25<59:06,  3.82it/s]  

{'loss': 1.2322, 'grad_norm': 1.6485179662704468, 'learning_rate': 0.00016160352632832976, 'epoch': 0.19}


 19%|█▉        | 3234/16798 [13:25<1:00:12,  3.75it/s]

{'loss': 1.5011, 'grad_norm': 1.9494924545288086, 'learning_rate': 0.00016159161305694542, 'epoch': 0.19}


 19%|█▉        | 3235/16798 [13:25<56:51,  3.98it/s]  

{'loss': 1.1701, 'grad_norm': 1.705749273300171, 'learning_rate': 0.00016157969978556114, 'epoch': 0.19}


 19%|█▉        | 3236/16798 [13:25<54:09,  4.17it/s]

{'loss': 1.4829, 'grad_norm': 2.3574345111846924, 'learning_rate': 0.0001615677865141768, 'epoch': 0.19}


 19%|█▉        | 3237/16798 [13:26<57:18,  3.94it/s]

{'loss': 1.2538, 'grad_norm': 1.8479036092758179, 'learning_rate': 0.0001615558732427925, 'epoch': 0.19}


 19%|█▉        | 3238/16798 [13:26<56:29,  4.00it/s]

{'loss': 1.42, 'grad_norm': 1.7723090648651123, 'learning_rate': 0.00016154395997140816, 'epoch': 0.19}


 19%|█▉        | 3239/16798 [13:26<55:02,  4.11it/s]

{'loss': 1.1677, 'grad_norm': 1.681478500366211, 'learning_rate': 0.00016153204670002385, 'epoch': 0.19}


 19%|█▉        | 3240/16798 [13:26<57:38,  3.92it/s]

{'loss': 1.4102, 'grad_norm': 1.815003514289856, 'learning_rate': 0.00016152013342863951, 'epoch': 0.19}


 19%|█▉        | 3241/16798 [13:27<56:04,  4.03it/s]

{'loss': 1.4823, 'grad_norm': 1.923183560371399, 'learning_rate': 0.0001615082201572552, 'epoch': 0.19}


 19%|█▉        | 3242/16798 [13:27<56:24,  4.01it/s]

{'loss': 1.1166, 'grad_norm': 1.9008904695510864, 'learning_rate': 0.00016149630688587087, 'epoch': 0.19}


 19%|█▉        | 3243/16798 [13:27<59:21,  3.81it/s]

{'loss': 1.3688, 'grad_norm': 1.8658514022827148, 'learning_rate': 0.00016148439361448656, 'epoch': 0.19}


 19%|█▉        | 3244/16798 [13:27<56:27,  4.00it/s]

{'loss': 1.167, 'grad_norm': 1.753228783607483, 'learning_rate': 0.00016147248034310222, 'epoch': 0.19}


 19%|█▉        | 3245/16798 [13:28<53:36,  4.21it/s]

{'loss': 0.7502, 'grad_norm': 1.3441239595413208, 'learning_rate': 0.00016146056707171791, 'epoch': 0.19}


 19%|█▉        | 3247/16798 [13:28<53:05,  4.25it/s]

{'loss': 1.2162, 'grad_norm': 1.8942413330078125, 'learning_rate': 0.00016144865380033358, 'epoch': 0.19}


 19%|█▉        | 3247/16798 [13:28<53:05,  4.25it/s]

{'loss': 1.1181, 'grad_norm': 1.6573385000228882, 'learning_rate': 0.00016143674052894927, 'epoch': 0.19}


 19%|█▉        | 3248/16798 [13:28<53:15,  4.24it/s]

{'loss': 1.0659, 'grad_norm': 1.6852744817733765, 'learning_rate': 0.00016142482725756493, 'epoch': 0.19}


 19%|█▉        | 3249/16798 [13:29<55:40,  4.06it/s]

{'loss': 0.474, 'grad_norm': 1.108991265296936, 'learning_rate': 0.00016141291398618062, 'epoch': 0.19}


 19%|█▉        | 3250/16798 [13:29<58:21,  3.87it/s]

{'loss': 0.2084, 'grad_norm': 0.6586313247680664, 'learning_rate': 0.00016140100071479629, 'epoch': 0.19}


 19%|█▉        | 3251/16798 [13:29<1:00:42,  3.72it/s]

{'loss': 1.8539, 'grad_norm': 1.898932695388794, 'learning_rate': 0.00016138908744341198, 'epoch': 0.19}


 19%|█▉        | 3252/16798 [13:30<1:01:27,  3.67it/s]

{'loss': 2.1815, 'grad_norm': 2.137962818145752, 'learning_rate': 0.00016137717417202764, 'epoch': 0.19}


 19%|█▉        | 3253/16798 [13:30<59:01,  3.82it/s]  

{'loss': 2.31, 'grad_norm': 1.9559056758880615, 'learning_rate': 0.00016136526090064333, 'epoch': 0.19}


 19%|█▉        | 3254/16798 [13:30<1:03:10,  3.57it/s]

{'loss': 1.5761, 'grad_norm': 1.6706863641738892, 'learning_rate': 0.000161353347629259, 'epoch': 0.19}


 19%|█▉        | 3255/16798 [13:30<1:03:21,  3.56it/s]

{'loss': 1.506, 'grad_norm': 1.7243081331253052, 'learning_rate': 0.00016134143435787469, 'epoch': 0.19}


 19%|█▉        | 3256/16798 [13:31<1:03:38,  3.55it/s]

{'loss': 2.0617, 'grad_norm': 1.691050410270691, 'learning_rate': 0.00016132952108649035, 'epoch': 0.19}


 19%|█▉        | 3257/16798 [13:31<1:03:21,  3.56it/s]

{'loss': 1.9207, 'grad_norm': 1.9463638067245483, 'learning_rate': 0.00016131760781510604, 'epoch': 0.19}


 19%|█▉        | 3258/16798 [13:31<1:04:30,  3.50it/s]

{'loss': 1.5339, 'grad_norm': 1.965569019317627, 'learning_rate': 0.0001613056945437217, 'epoch': 0.19}


 19%|█▉        | 3259/16798 [13:31<1:03:34,  3.55it/s]

{'loss': 1.7064, 'grad_norm': 2.0670413970947266, 'learning_rate': 0.0001612937812723374, 'epoch': 0.19}


 19%|█▉        | 3260/16798 [13:32<59:14,  3.81it/s]  

{'loss': 1.7957, 'grad_norm': 2.1019139289855957, 'learning_rate': 0.00016128186800095306, 'epoch': 0.19}


 19%|█▉        | 3261/16798 [13:32<1:03:10,  3.57it/s]

{'loss': 1.5935, 'grad_norm': 1.8575587272644043, 'learning_rate': 0.00016126995472956875, 'epoch': 0.19}


 19%|█▉        | 3262/16798 [13:32<1:02:30,  3.61it/s]

{'loss': 1.5703, 'grad_norm': 1.831000804901123, 'learning_rate': 0.0001612580414581844, 'epoch': 0.19}


 19%|█▉        | 3263/16798 [13:33<58:45,  3.84it/s]  

{'loss': 1.1596, 'grad_norm': 1.7452248334884644, 'learning_rate': 0.0001612461281868001, 'epoch': 0.19}


 19%|█▉        | 3264/16798 [13:33<56:07,  4.02it/s]

{'loss': 1.787, 'grad_norm': 2.102353096008301, 'learning_rate': 0.00016123421491541577, 'epoch': 0.19}


 19%|█▉        | 3265/16798 [13:33<58:34,  3.85it/s]

{'loss': 1.6334, 'grad_norm': 1.9656471014022827, 'learning_rate': 0.00016122230164403146, 'epoch': 0.19}


 19%|█▉        | 3266/16798 [13:33<55:28,  4.07it/s]

{'loss': 1.5948, 'grad_norm': 1.769403100013733, 'learning_rate': 0.00016121038837264715, 'epoch': 0.19}


 19%|█▉        | 3267/16798 [13:34<57:29,  3.92it/s]

{'loss': 1.2853, 'grad_norm': 1.6754786968231201, 'learning_rate': 0.0001611984751012628, 'epoch': 0.19}


 19%|█▉        | 3268/16798 [13:34<59:18,  3.80it/s]

{'loss': 1.1868, 'grad_norm': 1.4335222244262695, 'learning_rate': 0.0001611865618298785, 'epoch': 0.19}


 19%|█▉        | 3269/16798 [13:34<57:33,  3.92it/s]

{'loss': 1.8926, 'grad_norm': 2.030255079269409, 'learning_rate': 0.00016117464855849417, 'epoch': 0.19}


 19%|█▉        | 3270/16798 [13:34<57:07,  3.95it/s]

{'loss': 1.4218, 'grad_norm': 1.8126944303512573, 'learning_rate': 0.00016116273528710986, 'epoch': 0.19}


 19%|█▉        | 3271/16798 [13:35<59:17,  3.80it/s]

{'loss': 1.5414, 'grad_norm': 1.9639520645141602, 'learning_rate': 0.00016115082201572552, 'epoch': 0.19}


 19%|█▉        | 3272/16798 [13:35<58:25,  3.86it/s]

{'loss': 1.5055, 'grad_norm': 1.9886140823364258, 'learning_rate': 0.0001611389087443412, 'epoch': 0.19}


 19%|█▉        | 3273/16798 [13:35<1:01:13,  3.68it/s]

{'loss': 1.4726, 'grad_norm': 1.8055741786956787, 'learning_rate': 0.00016112699547295688, 'epoch': 0.19}


 19%|█▉        | 3274/16798 [13:35<1:01:36,  3.66it/s]

{'loss': 1.2395, 'grad_norm': 1.5638731718063354, 'learning_rate': 0.00016111508220157257, 'epoch': 0.19}


 19%|█▉        | 3275/16798 [13:36<1:05:03,  3.46it/s]

{'loss': 1.3233, 'grad_norm': 2.410172462463379, 'learning_rate': 0.00016110316893018823, 'epoch': 0.19}


 20%|█▉        | 3276/16798 [13:36<1:01:46,  3.65it/s]

{'loss': 1.5961, 'grad_norm': 1.7531499862670898, 'learning_rate': 0.00016109125565880392, 'epoch': 0.2}


 20%|█▉        | 3277/16798 [13:36<1:00:38,  3.72it/s]

{'loss': 1.3786, 'grad_norm': 1.933276891708374, 'learning_rate': 0.00016107934238741958, 'epoch': 0.2}


 20%|█▉        | 3278/16798 [13:36<1:01:16,  3.68it/s]

{'loss': 1.6759, 'grad_norm': 2.188976764678955, 'learning_rate': 0.00016106742911603528, 'epoch': 0.2}


 20%|█▉        | 3279/16798 [13:37<57:35,  3.91it/s]  

{'loss': 1.4382, 'grad_norm': 1.8033825159072876, 'learning_rate': 0.00016105551584465094, 'epoch': 0.2}


 20%|█▉        | 3280/16798 [13:37<59:35,  3.78it/s]

{'loss': 1.2545, 'grad_norm': 1.8452584743499756, 'learning_rate': 0.00016104360257326663, 'epoch': 0.2}


 20%|█▉        | 3281/16798 [13:37<1:00:15,  3.74it/s]

{'loss': 1.3498, 'grad_norm': 1.767867088317871, 'learning_rate': 0.0001610316893018823, 'epoch': 0.2}


 20%|█▉        | 3282/16798 [13:38<57:45,  3.90it/s]  

{'loss': 1.3983, 'grad_norm': 1.9757494926452637, 'learning_rate': 0.00016101977603049798, 'epoch': 0.2}


 20%|█▉        | 3283/16798 [13:38<56:44,  3.97it/s]

{'loss': 1.5028, 'grad_norm': 2.0314133167266846, 'learning_rate': 0.00016100786275911365, 'epoch': 0.2}


 20%|█▉        | 3284/16798 [13:38<1:00:02,  3.75it/s]

{'loss': 1.51, 'grad_norm': 2.02005672454834, 'learning_rate': 0.00016099594948772934, 'epoch': 0.2}


 20%|█▉        | 3285/16798 [13:38<56:46,  3.97it/s]  

{'loss': 1.1446, 'grad_norm': 1.6107134819030762, 'learning_rate': 0.000160984036216345, 'epoch': 0.2}


 20%|█▉        | 3286/16798 [13:39<56:08,  4.01it/s]

{'loss': 0.9645, 'grad_norm': 1.5989181995391846, 'learning_rate': 0.0001609721229449607, 'epoch': 0.2}


 20%|█▉        | 3287/16798 [13:39<1:01:16,  3.67it/s]

{'loss': 1.3609, 'grad_norm': 2.2888638973236084, 'learning_rate': 0.00016096020967357636, 'epoch': 0.2}


 20%|█▉        | 3288/16798 [13:39<58:25,  3.85it/s]  

{'loss': 1.8269, 'grad_norm': 2.436549663543701, 'learning_rate': 0.00016094829640219205, 'epoch': 0.2}


 20%|█▉        | 3289/16798 [13:39<59:53,  3.76it/s]

{'loss': 1.1299, 'grad_norm': 1.7640072107315063, 'learning_rate': 0.0001609363831308077, 'epoch': 0.2}


 20%|█▉        | 3290/16798 [13:40<58:17,  3.86it/s]

{'loss': 1.4103, 'grad_norm': 1.9474045038223267, 'learning_rate': 0.0001609244698594234, 'epoch': 0.2}


 20%|█▉        | 3291/16798 [13:40<58:09,  3.87it/s]

{'loss': 1.2245, 'grad_norm': 1.7553415298461914, 'learning_rate': 0.00016091255658803907, 'epoch': 0.2}


 20%|█▉        | 3292/16798 [13:40<57:13,  3.93it/s]

{'loss': 0.9938, 'grad_norm': 1.8767203092575073, 'learning_rate': 0.00016090064331665476, 'epoch': 0.2}


 20%|█▉        | 3293/16798 [13:40<58:00,  3.88it/s]

{'loss': 1.3924, 'grad_norm': 1.9262014627456665, 'learning_rate': 0.00016088873004527042, 'epoch': 0.2}


 20%|█▉        | 3294/16798 [13:41<59:50,  3.76it/s]

{'loss': 1.0855, 'grad_norm': 1.7327359914779663, 'learning_rate': 0.0001608768167738861, 'epoch': 0.2}


 20%|█▉        | 3295/16798 [13:41<59:00,  3.81it/s]

{'loss': 1.128, 'grad_norm': 1.7641329765319824, 'learning_rate': 0.00016086490350250177, 'epoch': 0.2}


 20%|█▉        | 3296/16798 [13:41<1:01:05,  3.68it/s]

{'loss': 1.2847, 'grad_norm': 2.297471523284912, 'learning_rate': 0.00016085299023111747, 'epoch': 0.2}


 20%|█▉        | 3297/16798 [13:41<58:22,  3.85it/s]  

{'loss': 0.5138, 'grad_norm': 1.2473886013031006, 'learning_rate': 0.00016084107695973316, 'epoch': 0.2}


 20%|█▉        | 3298/16798 [13:42<59:34,  3.78it/s]

{'loss': 0.6167, 'grad_norm': 1.2832410335540771, 'learning_rate': 0.00016082916368834885, 'epoch': 0.2}


 20%|█▉        | 3299/16798 [13:42<59:33,  3.78it/s]

{'loss': 0.4308, 'grad_norm': 1.360775113105774, 'learning_rate': 0.0001608172504169645, 'epoch': 0.2}


 20%|█▉        | 3300/16798 [13:42<1:00:10,  3.74it/s]

{'loss': 0.4707, 'grad_norm': 1.1681479215621948, 'learning_rate': 0.0001608053371455802, 'epoch': 0.2}


 20%|█▉        | 3301/16798 [13:42<58:45,  3.83it/s]  

{'loss': 1.8286, 'grad_norm': 1.5953115224838257, 'learning_rate': 0.00016079342387419586, 'epoch': 0.2}


 20%|█▉        | 3302/16798 [13:43<55:58,  4.02it/s]

{'loss': 1.6387, 'grad_norm': 1.6161526441574097, 'learning_rate': 0.00016078151060281156, 'epoch': 0.2}


 20%|█▉        | 3303/16798 [13:43<59:03,  3.81it/s]

{'loss': 2.1077, 'grad_norm': 1.957494854927063, 'learning_rate': 0.00016076959733142722, 'epoch': 0.2}


 20%|█▉        | 3304/16798 [13:43<59:59,  3.75it/s]

{'loss': 2.0257, 'grad_norm': 2.264956474304199, 'learning_rate': 0.0001607576840600429, 'epoch': 0.2}


 20%|█▉        | 3305/16798 [13:44<59:12,  3.80it/s]

{'loss': 2.379, 'grad_norm': 2.16011381149292, 'learning_rate': 0.00016074577078865857, 'epoch': 0.2}


 20%|█▉        | 3306/16798 [13:44<57:54,  3.88it/s]

{'loss': 2.5183, 'grad_norm': 2.2092628479003906, 'learning_rate': 0.00016073385751727426, 'epoch': 0.2}


 20%|█▉        | 3307/16798 [13:44<59:34,  3.77it/s]

{'loss': 1.5863, 'grad_norm': 1.7486021518707275, 'learning_rate': 0.00016072194424588993, 'epoch': 0.2}


 20%|█▉        | 3308/16798 [13:44<58:24,  3.85it/s]

{'loss': 2.2724, 'grad_norm': 2.2120792865753174, 'learning_rate': 0.00016071003097450562, 'epoch': 0.2}


 20%|█▉        | 3309/16798 [13:45<1:00:44,  3.70it/s]

{'loss': 1.7508, 'grad_norm': 2.3403265476226807, 'learning_rate': 0.00016069811770312128, 'epoch': 0.2}


 20%|█▉        | 3310/16798 [13:45<1:00:09,  3.74it/s]

{'loss': 1.6115, 'grad_norm': 1.9820401668548584, 'learning_rate': 0.00016068620443173697, 'epoch': 0.2}


 20%|█▉        | 3311/16798 [13:45<58:36,  3.84it/s]  

{'loss': 1.5396, 'grad_norm': 1.5997624397277832, 'learning_rate': 0.00016067429116035264, 'epoch': 0.2}


 20%|█▉        | 3312/16798 [13:45<58:12,  3.86it/s]

{'loss': 1.8523, 'grad_norm': 1.951658010482788, 'learning_rate': 0.00016066237788896833, 'epoch': 0.2}


 20%|█▉        | 3313/16798 [13:46<58:01,  3.87it/s]

{'loss': 1.8617, 'grad_norm': 1.9581397771835327, 'learning_rate': 0.000160650464617584, 'epoch': 0.2}


 20%|█▉        | 3314/16798 [13:46<1:00:26,  3.72it/s]

{'loss': 1.4033, 'grad_norm': 1.8291717767715454, 'learning_rate': 0.00016063855134619968, 'epoch': 0.2}


 20%|█▉        | 3315/16798 [13:46<58:34,  3.84it/s]  

{'loss': 1.3677, 'grad_norm': 1.921916127204895, 'learning_rate': 0.00016062663807481535, 'epoch': 0.2}


 20%|█▉        | 3316/16798 [13:46<59:08,  3.80it/s]

{'loss': 1.5983, 'grad_norm': 1.955116868019104, 'learning_rate': 0.00016061472480343104, 'epoch': 0.2}


 20%|█▉        | 3317/16798 [13:47<56:35,  3.97it/s]

{'loss': 1.3701, 'grad_norm': 2.0033118724823, 'learning_rate': 0.0001606028115320467, 'epoch': 0.2}


 20%|█▉        | 3318/16798 [13:47<59:11,  3.80it/s]

{'loss': 1.9741, 'grad_norm': 2.674715995788574, 'learning_rate': 0.0001605908982606624, 'epoch': 0.2}


 20%|█▉        | 3319/16798 [13:47<1:00:16,  3.73it/s]

{'loss': 1.7016, 'grad_norm': 1.95496666431427, 'learning_rate': 0.00016057898498927805, 'epoch': 0.2}


 20%|█▉        | 3320/16798 [13:47<1:01:04,  3.68it/s]

{'loss': 1.7375, 'grad_norm': 2.035905599594116, 'learning_rate': 0.00016056707171789375, 'epoch': 0.2}


 20%|█▉        | 3321/16798 [13:48<59:25,  3.78it/s]  

{'loss': 1.5681, 'grad_norm': 1.8597512245178223, 'learning_rate': 0.0001605551584465094, 'epoch': 0.2}


 20%|█▉        | 3322/16798 [13:48<1:00:15,  3.73it/s]

{'loss': 1.6725, 'grad_norm': 1.9134531021118164, 'learning_rate': 0.0001605432451751251, 'epoch': 0.2}


 20%|█▉        | 3323/16798 [13:48<57:26,  3.91it/s]  

{'loss': 1.3334, 'grad_norm': 1.680536150932312, 'learning_rate': 0.00016053133190374076, 'epoch': 0.2}


 20%|█▉        | 3324/16798 [13:49<1:00:02,  3.74it/s]

{'loss': 1.8752, 'grad_norm': 2.1457390785217285, 'learning_rate': 0.00016051941863235645, 'epoch': 0.2}


 20%|█▉        | 3325/16798 [13:49<59:43,  3.76it/s]  

{'loss': 1.6694, 'grad_norm': 2.373936653137207, 'learning_rate': 0.00016050750536097212, 'epoch': 0.2}


 20%|█▉        | 3326/16798 [13:49<59:26,  3.78it/s]

{'loss': 2.2779, 'grad_norm': 2.3863394260406494, 'learning_rate': 0.0001604955920895878, 'epoch': 0.2}


 20%|█▉        | 3327/16798 [13:49<57:01,  3.94it/s]

{'loss': 1.6334, 'grad_norm': 1.861906886100769, 'learning_rate': 0.00016048367881820347, 'epoch': 0.2}


 20%|█▉        | 3328/16798 [13:50<1:00:10,  3.73it/s]

{'loss': 1.5391, 'grad_norm': 1.6806765794754028, 'learning_rate': 0.00016047176554681916, 'epoch': 0.2}


 20%|█▉        | 3329/16798 [13:50<1:00:51,  3.69it/s]

{'loss': 1.5267, 'grad_norm': 2.0191283226013184, 'learning_rate': 0.00016045985227543485, 'epoch': 0.2}


 20%|█▉        | 3330/16798 [13:50<1:01:16,  3.66it/s]

{'loss': 1.259, 'grad_norm': 1.8721028566360474, 'learning_rate': 0.00016044793900405052, 'epoch': 0.2}


 20%|█▉        | 3331/16798 [13:50<59:46,  3.75it/s]  

{'loss': 1.8343, 'grad_norm': 2.2898361682891846, 'learning_rate': 0.0001604360257326662, 'epoch': 0.2}


 20%|█▉        | 3332/16798 [13:51<59:50,  3.75it/s]

{'loss': 1.5655, 'grad_norm': 1.6966651678085327, 'learning_rate': 0.00016042411246128187, 'epoch': 0.2}


 20%|█▉        | 3333/16798 [13:51<57:27,  3.91it/s]

{'loss': 1.0678, 'grad_norm': 1.509601354598999, 'learning_rate': 0.00016041219918989756, 'epoch': 0.2}


 20%|█▉        | 3334/16798 [13:51<59:31,  3.77it/s]

{'loss': 1.3909, 'grad_norm': 1.8036837577819824, 'learning_rate': 0.00016040028591851323, 'epoch': 0.2}


 20%|█▉        | 3335/16798 [13:51<56:38,  3.96it/s]

{'loss': 1.7985, 'grad_norm': 2.3306491374969482, 'learning_rate': 0.00016038837264712892, 'epoch': 0.2}


 20%|█▉        | 3336/16798 [13:52<55:19,  4.06it/s]

{'loss': 1.185, 'grad_norm': 1.6817179918289185, 'learning_rate': 0.00016037645937574458, 'epoch': 0.2}


 20%|█▉        | 3337/16798 [13:52<56:40,  3.96it/s]

{'loss': 1.0364, 'grad_norm': 1.6843262910842896, 'learning_rate': 0.00016036454610436027, 'epoch': 0.2}


 20%|█▉        | 3338/16798 [13:52<55:51,  4.02it/s]

{'loss': 1.2518, 'grad_norm': 2.0284640789031982, 'learning_rate': 0.00016035263283297594, 'epoch': 0.2}


 20%|█▉        | 3339/16798 [13:52<59:48,  3.75it/s]

{'loss': 1.4183, 'grad_norm': 1.8292959928512573, 'learning_rate': 0.00016034071956159163, 'epoch': 0.2}


 20%|█▉        | 3340/16798 [13:53<56:13,  3.99it/s]

{'loss': 1.0053, 'grad_norm': 1.6458994150161743, 'learning_rate': 0.0001603288062902073, 'epoch': 0.2}


 20%|█▉        | 3341/16798 [13:53<53:38,  4.18it/s]

{'loss': 0.8779, 'grad_norm': 1.997437834739685, 'learning_rate': 0.00016031689301882298, 'epoch': 0.2}


 20%|█▉        | 3342/16798 [13:53<57:35,  3.89it/s]

{'loss': 1.3917, 'grad_norm': 1.7657488584518433, 'learning_rate': 0.00016030497974743864, 'epoch': 0.2}


 20%|█▉        | 3343/16798 [13:53<54:47,  4.09it/s]

{'loss': 1.5655, 'grad_norm': 2.5732312202453613, 'learning_rate': 0.00016029306647605433, 'epoch': 0.2}


 20%|█▉        | 3344/16798 [13:54<59:48,  3.75it/s]

{'loss': 1.0752, 'grad_norm': 1.653355598449707, 'learning_rate': 0.00016028115320467, 'epoch': 0.2}


 20%|█▉        | 3345/16798 [13:54<1:01:21,  3.65it/s]

{'loss': 0.8665, 'grad_norm': 1.517968773841858, 'learning_rate': 0.0001602692399332857, 'epoch': 0.2}


 20%|█▉        | 3346/16798 [13:54<57:20,  3.91it/s]  

{'loss': 0.6385, 'grad_norm': 1.2508172988891602, 'learning_rate': 0.00016025732666190135, 'epoch': 0.2}


 20%|█▉        | 3347/16798 [13:54<58:14,  3.85it/s]

{'loss': 0.511, 'grad_norm': 1.2135907411575317, 'learning_rate': 0.00016024541339051704, 'epoch': 0.2}


 20%|█▉        | 3348/16798 [13:55<58:06,  3.86it/s]

{'loss': 0.482, 'grad_norm': 1.1175886392593384, 'learning_rate': 0.0001602335001191327, 'epoch': 0.2}


 20%|█▉        | 3349/16798 [13:55<58:15,  3.85it/s]

{'loss': 0.1623, 'grad_norm': 0.5885648727416992, 'learning_rate': 0.0001602215868477484, 'epoch': 0.2}


 20%|█▉        | 3350/16798 [13:55<55:39,  4.03it/s]

{'loss': 0.3508, 'grad_norm': 0.8519130349159241, 'learning_rate': 0.00016020967357636406, 'epoch': 0.2}


 20%|█▉        | 3351/16798 [13:56<58:59,  3.80it/s]

{'loss': 2.2536, 'grad_norm': 2.3588297367095947, 'learning_rate': 0.00016019776030497975, 'epoch': 0.2}


 20%|█▉        | 3352/16798 [13:56<1:03:54,  3.51it/s]

{'loss': 1.4677, 'grad_norm': 2.6052746772766113, 'learning_rate': 0.00016018584703359542, 'epoch': 0.2}


 20%|█▉        | 3353/16798 [13:56<1:03:33,  3.53it/s]

{'loss': 1.9608, 'grad_norm': 1.8065015077590942, 'learning_rate': 0.0001601739337622111, 'epoch': 0.2}


 20%|█▉        | 3354/16798 [13:56<1:00:03,  3.73it/s]

{'loss': 2.2969, 'grad_norm': 2.0503365993499756, 'learning_rate': 0.00016016202049082677, 'epoch': 0.2}


 20%|█▉        | 3355/16798 [13:57<1:01:50,  3.62it/s]

{'loss': 2.1002, 'grad_norm': 2.0143754482269287, 'learning_rate': 0.00016015010721944246, 'epoch': 0.2}


 20%|█▉        | 3356/16798 [13:57<58:19,  3.84it/s]  

{'loss': 2.3288, 'grad_norm': 1.8722361326217651, 'learning_rate': 0.00016013819394805812, 'epoch': 0.2}


 20%|█▉        | 3357/16798 [13:57<1:00:24,  3.71it/s]

{'loss': 2.3123, 'grad_norm': 1.8640085458755493, 'learning_rate': 0.00016012628067667382, 'epoch': 0.2}


 20%|█▉        | 3358/16798 [13:57<58:07,  3.85it/s]  

{'loss': 1.8463, 'grad_norm': 2.0363218784332275, 'learning_rate': 0.00016011436740528948, 'epoch': 0.2}


 20%|█▉        | 3359/16798 [13:58<58:55,  3.80it/s]

{'loss': 1.7286, 'grad_norm': 1.7280179262161255, 'learning_rate': 0.0001601024541339052, 'epoch': 0.2}


 20%|██        | 3360/16798 [13:58<56:27,  3.97it/s]

{'loss': 1.9439, 'grad_norm': 2.0762715339660645, 'learning_rate': 0.00016009054086252086, 'epoch': 0.2}


 20%|██        | 3361/16798 [13:58<58:10,  3.85it/s]

{'loss': 2.1665, 'grad_norm': 2.3721630573272705, 'learning_rate': 0.00016007862759113655, 'epoch': 0.2}


 20%|██        | 3362/16798 [13:58<57:45,  3.88it/s]

{'loss': 1.8881, 'grad_norm': 1.9966928958892822, 'learning_rate': 0.00016006671431975222, 'epoch': 0.2}


 20%|██        | 3363/16798 [13:59<1:01:37,  3.63it/s]

{'loss': 1.9455, 'grad_norm': 2.544858932495117, 'learning_rate': 0.0001600548010483679, 'epoch': 0.2}


 20%|██        | 3364/16798 [13:59<58:26,  3.83it/s]  

{'loss': 1.4566, 'grad_norm': 1.8202060461044312, 'learning_rate': 0.00016004288777698357, 'epoch': 0.2}


 20%|██        | 3365/16798 [13:59<1:00:00,  3.73it/s]

{'loss': 1.1906, 'grad_norm': 1.4425849914550781, 'learning_rate': 0.00016003097450559926, 'epoch': 0.2}


 20%|██        | 3366/16798 [14:00<59:58,  3.73it/s]  

{'loss': 1.719, 'grad_norm': 1.978049874305725, 'learning_rate': 0.00016001906123421492, 'epoch': 0.2}


 20%|██        | 3368/16798 [14:00<58:22,  3.83it/s]  

{'loss': 1.8151, 'grad_norm': 2.1956727504730225, 'learning_rate': 0.00016000714796283061, 'epoch': 0.2}


 20%|██        | 3368/16798 [14:00<58:22,  3.83it/s]

{'loss': 1.7285, 'grad_norm': 1.9906095266342163, 'learning_rate': 0.00015999523469144628, 'epoch': 0.2}


 20%|██        | 3369/16798 [14:00<55:53,  4.00it/s]

{'loss': 2.1702, 'grad_norm': 2.277653455734253, 'learning_rate': 0.00015998332142006197, 'epoch': 0.2}


 20%|██        | 3370/16798 [14:01<58:28,  3.83it/s]

{'loss': 1.6367, 'grad_norm': 2.073748826980591, 'learning_rate': 0.00015997140814867763, 'epoch': 0.2}


 20%|██        | 3371/16798 [14:01<56:38,  3.95it/s]

{'loss': 1.6497, 'grad_norm': 1.9279491901397705, 'learning_rate': 0.00015995949487729332, 'epoch': 0.2}


 20%|██        | 3372/16798 [14:01<1:00:49,  3.68it/s]

{'loss': 1.9848, 'grad_norm': 2.059927463531494, 'learning_rate': 0.000159947581605909, 'epoch': 0.2}


 20%|██        | 3373/16798 [14:01<57:06,  3.92it/s]  

{'loss': 1.7039, 'grad_norm': 1.7063267230987549, 'learning_rate': 0.00015993566833452468, 'epoch': 0.2}


 20%|██        | 3374/16798 [14:02<56:22,  3.97it/s]

{'loss': 1.7151, 'grad_norm': 2.0150339603424072, 'learning_rate': 0.00015992375506314034, 'epoch': 0.2}


 20%|██        | 3375/16798 [14:02<57:49,  3.87it/s]

{'loss': 1.5382, 'grad_norm': 1.8147448301315308, 'learning_rate': 0.00015991184179175603, 'epoch': 0.2}


 20%|██        | 3376/16798 [14:02<55:59,  4.00it/s]

{'loss': 1.1867, 'grad_norm': 1.449299693107605, 'learning_rate': 0.0001598999285203717, 'epoch': 0.2}


 20%|██        | 3377/16798 [14:02<1:00:20,  3.71it/s]

{'loss': 1.7296, 'grad_norm': 2.023369312286377, 'learning_rate': 0.0001598880152489874, 'epoch': 0.2}


 20%|██        | 3378/16798 [14:03<59:09,  3.78it/s]  

{'loss': 1.8027, 'grad_norm': 2.213085651397705, 'learning_rate': 0.00015987610197760305, 'epoch': 0.2}


 20%|██        | 3380/16798 [14:03<55:31,  4.03it/s]

{'loss': 1.8026, 'grad_norm': 2.2399115562438965, 'learning_rate': 0.00015986418870621874, 'epoch': 0.2}


 20%|██        | 3380/16798 [14:03<55:31,  4.03it/s]

{'loss': 1.8462, 'grad_norm': 2.28525447845459, 'learning_rate': 0.0001598522754348344, 'epoch': 0.2}


 20%|██        | 3381/16798 [14:03<53:10,  4.21it/s]

{'loss': 1.5736, 'grad_norm': 2.114633560180664, 'learning_rate': 0.0001598403621634501, 'epoch': 0.2}


 20%|██        | 3382/16798 [14:04<54:41,  4.09it/s]

{'loss': 1.1693, 'grad_norm': 1.6853351593017578, 'learning_rate': 0.00015982844889206576, 'epoch': 0.2}


 20%|██        | 3383/16798 [14:04<56:44,  3.94it/s]

{'loss': 1.5386, 'grad_norm': 2.315000057220459, 'learning_rate': 0.00015981653562068145, 'epoch': 0.2}


 20%|██        | 3384/16798 [14:04<57:44,  3.87it/s]

{'loss': 1.8347, 'grad_norm': 1.9416790008544922, 'learning_rate': 0.00015980462234929711, 'epoch': 0.2}


 20%|██        | 3385/16798 [14:04<55:45,  4.01it/s]

{'loss': 1.2342, 'grad_norm': 1.5897605419158936, 'learning_rate': 0.0001597927090779128, 'epoch': 0.2}


 20%|██        | 3386/16798 [14:05<53:30,  4.18it/s]

{'loss': 1.6733, 'grad_norm': 1.9829928874969482, 'learning_rate': 0.00015978079580652847, 'epoch': 0.2}


 20%|██        | 3387/16798 [14:05<57:23,  3.89it/s]

{'loss': 1.6989, 'grad_norm': 2.135213851928711, 'learning_rate': 0.00015976888253514416, 'epoch': 0.2}


 20%|██        | 3388/16798 [14:05<57:05,  3.92it/s]

{'loss': 1.2817, 'grad_norm': 1.6789244413375854, 'learning_rate': 0.00015975696926375982, 'epoch': 0.2}


 20%|██        | 3389/16798 [14:05<57:41,  3.87it/s]

{'loss': 1.2506, 'grad_norm': 2.664604902267456, 'learning_rate': 0.0001597450559923755, 'epoch': 0.2}


 20%|██        | 3390/16798 [14:06<56:54,  3.93it/s]

{'loss': 1.4881, 'grad_norm': 1.8608866930007935, 'learning_rate': 0.0001597331427209912, 'epoch': 0.2}


 20%|██        | 3391/16798 [14:06<1:01:57,  3.61it/s]

{'loss': 1.0776, 'grad_norm': 1.5298662185668945, 'learning_rate': 0.00015972122944960687, 'epoch': 0.2}


 20%|██        | 3392/16798 [14:06<59:58,  3.73it/s]  

{'loss': 0.9211, 'grad_norm': 1.5276306867599487, 'learning_rate': 0.00015970931617822256, 'epoch': 0.2}


 20%|██        | 3393/16798 [14:07<1:01:30,  3.63it/s]

{'loss': 0.9616, 'grad_norm': 1.9254705905914307, 'learning_rate': 0.00015969740290683822, 'epoch': 0.2}


 20%|██        | 3394/16798 [14:07<1:00:28,  3.69it/s]

{'loss': 1.4231, 'grad_norm': 1.8504152297973633, 'learning_rate': 0.0001596854896354539, 'epoch': 0.2}


 20%|██        | 3395/16798 [14:07<1:02:29,  3.57it/s]

{'loss': 0.9822, 'grad_norm': 1.4364038705825806, 'learning_rate': 0.00015967357636406958, 'epoch': 0.2}


 20%|██        | 3396/16798 [14:07<1:01:46,  3.62it/s]

{'loss': 0.862, 'grad_norm': 1.4458919763565063, 'learning_rate': 0.00015966166309268527, 'epoch': 0.2}


 20%|██        | 3397/16798 [14:08<1:02:03,  3.60it/s]

{'loss': 1.135, 'grad_norm': 2.742767572402954, 'learning_rate': 0.00015964974982130093, 'epoch': 0.2}


 20%|██        | 3398/16798 [14:08<1:01:07,  3.65it/s]

{'loss': 0.5124, 'grad_norm': 1.174043893814087, 'learning_rate': 0.00015963783654991662, 'epoch': 0.2}


 20%|██        | 3399/16798 [14:08<1:00:30,  3.69it/s]

{'loss': 0.4983, 'grad_norm': 1.0571359395980835, 'learning_rate': 0.00015962592327853229, 'epoch': 0.2}


 20%|██        | 3400/16798 [14:08<1:02:21,  3.58it/s]

{'loss': 0.2523, 'grad_norm': 0.7460997700691223, 'learning_rate': 0.00015961401000714798, 'epoch': 0.2}


 20%|██        | 3401/16798 [14:09<1:03:39,  3.51it/s]

{'loss': 2.104, 'grad_norm': 2.1266627311706543, 'learning_rate': 0.00015960209673576364, 'epoch': 0.2}


 20%|██        | 3402/16798 [14:09<1:00:25,  3.69it/s]

{'loss': 2.0807, 'grad_norm': 1.8963041305541992, 'learning_rate': 0.00015959018346437933, 'epoch': 0.2}


 20%|██        | 3403/16798 [14:09<1:00:11,  3.71it/s]

{'loss': 1.8254, 'grad_norm': 1.702569842338562, 'learning_rate': 0.000159578270192995, 'epoch': 0.2}


 20%|██        | 3404/16798 [14:10<1:05:41,  3.40it/s]

{'loss': 2.0685, 'grad_norm': 1.822455644607544, 'learning_rate': 0.00015956635692161069, 'epoch': 0.2}


 20%|██        | 3405/16798 [14:10<1:01:11,  3.65it/s]

{'loss': 2.2931, 'grad_norm': 2.0001473426818848, 'learning_rate': 0.00015955444365022635, 'epoch': 0.2}


 20%|██        | 3406/16798 [14:10<1:01:15,  3.64it/s]

{'loss': 1.9245, 'grad_norm': 1.9884525537490845, 'learning_rate': 0.00015954253037884204, 'epoch': 0.2}


 20%|██        | 3407/16798 [14:10<1:00:15,  3.70it/s]

{'loss': 1.8443, 'grad_norm': 1.8294272422790527, 'learning_rate': 0.0001595306171074577, 'epoch': 0.2}


 20%|██        | 3408/16798 [14:11<1:02:03,  3.60it/s]

{'loss': 1.1958, 'grad_norm': 1.5525524616241455, 'learning_rate': 0.0001595187038360734, 'epoch': 0.2}


 20%|██        | 3409/16798 [14:11<1:00:37,  3.68it/s]

{'loss': 1.7822, 'grad_norm': 2.0350823402404785, 'learning_rate': 0.00015950679056468906, 'epoch': 0.2}


 20%|██        | 3410/16798 [14:11<1:01:23,  3.63it/s]

{'loss': 1.5833, 'grad_norm': 1.895611047744751, 'learning_rate': 0.00015949487729330475, 'epoch': 0.2}


 20%|██        | 3411/16798 [14:11<1:00:43,  3.67it/s]

{'loss': 1.3429, 'grad_norm': 1.7229578495025635, 'learning_rate': 0.0001594829640219204, 'epoch': 0.2}


 20%|██        | 3412/16798 [14:12<1:01:21,  3.64it/s]

{'loss': 1.4886, 'grad_norm': 1.7193565368652344, 'learning_rate': 0.0001594710507505361, 'epoch': 0.2}


 20%|██        | 3413/16798 [14:12<1:01:01,  3.66it/s]

{'loss': 1.5092, 'grad_norm': 1.6561514139175415, 'learning_rate': 0.00015945913747915177, 'epoch': 0.2}


 20%|██        | 3414/16798 [14:12<1:01:43,  3.61it/s]

{'loss': 1.6085, 'grad_norm': 1.906440258026123, 'learning_rate': 0.00015944722420776746, 'epoch': 0.2}


 20%|██        | 3415/16798 [14:13<57:40,  3.87it/s]  

{'loss': 1.7471, 'grad_norm': 2.2361907958984375, 'learning_rate': 0.00015943531093638312, 'epoch': 0.2}


 20%|██        | 3416/16798 [14:13<55:55,  3.99it/s]

{'loss': 1.5694, 'grad_norm': 1.683631181716919, 'learning_rate': 0.0001594233976649988, 'epoch': 0.2}


 20%|██        | 3417/16798 [14:13<1:03:15,  3.53it/s]

{'loss': 1.4477, 'grad_norm': 1.8553006649017334, 'learning_rate': 0.00015941148439361448, 'epoch': 0.2}


 20%|██        | 3418/16798 [14:13<1:02:51,  3.55it/s]

{'loss': 1.4794, 'grad_norm': 1.6740645170211792, 'learning_rate': 0.00015939957112223017, 'epoch': 0.2}


 20%|██        | 3419/16798 [14:14<1:03:33,  3.51it/s]

{'loss': 1.6447, 'grad_norm': 2.1889355182647705, 'learning_rate': 0.00015938765785084583, 'epoch': 0.2}


 20%|██        | 3420/16798 [14:14<59:05,  3.77it/s]  

{'loss': 1.4094, 'grad_norm': 1.5552645921707153, 'learning_rate': 0.00015937574457946152, 'epoch': 0.2}


 20%|██        | 3421/16798 [14:14<57:36,  3.87it/s]

{'loss': 1.676, 'grad_norm': 2.001774311065674, 'learning_rate': 0.0001593638313080772, 'epoch': 0.2}


 20%|██        | 3422/16798 [14:14<1:00:21,  3.69it/s]

{'loss': 1.8567, 'grad_norm': 2.180521249771118, 'learning_rate': 0.0001593519180366929, 'epoch': 0.2}


 20%|██        | 3423/16798 [14:15<59:21,  3.75it/s]  

{'loss': 1.2023, 'grad_norm': 1.7391163110733032, 'learning_rate': 0.00015934000476530857, 'epoch': 0.2}


 20%|██        | 3424/16798 [14:15<1:01:07,  3.65it/s]

{'loss': 1.2817, 'grad_norm': 2.154460906982422, 'learning_rate': 0.00015932809149392426, 'epoch': 0.2}


 20%|██        | 3425/16798 [14:15<1:02:09,  3.59it/s]

{'loss': 1.6658, 'grad_norm': 2.041968584060669, 'learning_rate': 0.00015931617822253992, 'epoch': 0.2}


 20%|██        | 3426/16798 [14:16<1:03:04,  3.53it/s]

{'loss': 1.6422, 'grad_norm': 1.9318325519561768, 'learning_rate': 0.0001593042649511556, 'epoch': 0.2}


 20%|██        | 3427/16798 [14:16<1:01:54,  3.60it/s]

{'loss': 1.5123, 'grad_norm': 1.925004243850708, 'learning_rate': 0.00015929235167977127, 'epoch': 0.2}


 20%|██        | 3428/16798 [14:16<1:04:38,  3.45it/s]

{'loss': 1.1845, 'grad_norm': 1.4999910593032837, 'learning_rate': 0.00015928043840838697, 'epoch': 0.2}


 20%|██        | 3429/16798 [14:16<1:02:26,  3.57it/s]

{'loss': 1.4062, 'grad_norm': 1.8900561332702637, 'learning_rate': 0.00015926852513700263, 'epoch': 0.2}


 20%|██        | 3430/16798 [14:17<1:03:20,  3.52it/s]

{'loss': 2.3076, 'grad_norm': 2.426065683364868, 'learning_rate': 0.00015925661186561832, 'epoch': 0.2}


 20%|██        | 3431/16798 [14:17<59:30,  3.74it/s]  

{'loss': 1.7674, 'grad_norm': 1.9855090379714966, 'learning_rate': 0.00015924469859423398, 'epoch': 0.2}


 20%|██        | 3432/16798 [14:17<1:00:57,  3.65it/s]

{'loss': 1.5987, 'grad_norm': 2.2219815254211426, 'learning_rate': 0.00015923278532284967, 'epoch': 0.2}


 20%|██        | 3433/16798 [14:18<1:00:19,  3.69it/s]

{'loss': 1.3682, 'grad_norm': 1.9148924350738525, 'learning_rate': 0.00015922087205146534, 'epoch': 0.2}


 20%|██        | 3434/16798 [14:18<1:01:44,  3.61it/s]

{'loss': 1.2782, 'grad_norm': 1.571010708808899, 'learning_rate': 0.00015920895878008103, 'epoch': 0.2}


 20%|██        | 3435/16798 [14:18<1:01:14,  3.64it/s]

{'loss': 1.52, 'grad_norm': 2.0589067935943604, 'learning_rate': 0.0001591970455086967, 'epoch': 0.2}


 20%|██        | 3436/16798 [14:18<1:03:45,  3.49it/s]

{'loss': 1.2894, 'grad_norm': 1.709236741065979, 'learning_rate': 0.00015918513223731238, 'epoch': 0.2}


 20%|██        | 3437/16798 [14:19<1:02:44,  3.55it/s]

{'loss': 1.3878, 'grad_norm': 2.106865167617798, 'learning_rate': 0.00015917321896592805, 'epoch': 0.2}


 20%|██        | 3438/16798 [14:19<1:04:29,  3.45it/s]

{'loss': 1.2392, 'grad_norm': 2.0052995681762695, 'learning_rate': 0.00015916130569454374, 'epoch': 0.2}


 20%|██        | 3439/16798 [14:19<1:01:21,  3.63it/s]

{'loss': 1.1972, 'grad_norm': 1.7272052764892578, 'learning_rate': 0.0001591493924231594, 'epoch': 0.2}


 20%|██        | 3440/16798 [14:19<1:01:34,  3.62it/s]

{'loss': 0.99, 'grad_norm': 1.835914134979248, 'learning_rate': 0.0001591374791517751, 'epoch': 0.2}


 20%|██        | 3441/16798 [14:20<57:09,  3.90it/s]  

{'loss': 1.0334, 'grad_norm': 1.6973742246627808, 'learning_rate': 0.00015912556588039076, 'epoch': 0.2}


 20%|██        | 3442/16798 [14:20<58:26,  3.81it/s]

{'loss': 0.8568, 'grad_norm': 1.5155339241027832, 'learning_rate': 0.00015911365260900645, 'epoch': 0.2}


 20%|██        | 3443/16798 [14:20<58:39,  3.80it/s]

{'loss': 0.7858, 'grad_norm': 1.4978691339492798, 'learning_rate': 0.0001591017393376221, 'epoch': 0.2}


 21%|██        | 3444/16798 [14:20<56:43,  3.92it/s]

{'loss': 0.9943, 'grad_norm': 1.6130256652832031, 'learning_rate': 0.0001590898260662378, 'epoch': 0.21}


 21%|██        | 3445/16798 [14:21<1:01:38,  3.61it/s]

{'loss': 1.2007, 'grad_norm': 1.7187528610229492, 'learning_rate': 0.00015907791279485346, 'epoch': 0.21}


 21%|██        | 3446/16798 [14:21<58:04,  3.83it/s]  

{'loss': 0.9969, 'grad_norm': 1.533975601196289, 'learning_rate': 0.00015906599952346916, 'epoch': 0.21}


 21%|██        | 3447/16798 [14:21<59:32,  3.74it/s]

{'loss': 1.0573, 'grad_norm': 1.6823482513427734, 'learning_rate': 0.00015905408625208482, 'epoch': 0.21}


 21%|██        | 3448/16798 [14:22<1:01:25,  3.62it/s]

{'loss': 0.7658, 'grad_norm': 1.4104814529418945, 'learning_rate': 0.0001590421729807005, 'epoch': 0.21}


 21%|██        | 3449/16798 [14:22<58:57,  3.77it/s]  

{'loss': 0.5888, 'grad_norm': 1.282771348953247, 'learning_rate': 0.00015903025970931617, 'epoch': 0.21}


 21%|██        | 3450/16798 [14:22<1:01:37,  3.61it/s]

{'loss': 0.5942, 'grad_norm': 1.315699577331543, 'learning_rate': 0.00015901834643793186, 'epoch': 0.21}


 21%|██        | 3451/16798 [14:22<59:26,  3.74it/s]  

{'loss': 2.1097, 'grad_norm': 2.2344422340393066, 'learning_rate': 0.00015900643316654755, 'epoch': 0.21}


 21%|██        | 3452/16798 [14:23<1:01:07,  3.64it/s]

{'loss': 2.1063, 'grad_norm': 1.8417221307754517, 'learning_rate': 0.00015899451989516322, 'epoch': 0.21}


 21%|██        | 3453/16798 [14:23<59:19,  3.75it/s]  

{'loss': 1.9163, 'grad_norm': 1.8635444641113281, 'learning_rate': 0.0001589826066237789, 'epoch': 0.21}


 21%|██        | 3454/16798 [14:23<1:01:04,  3.64it/s]

{'loss': 2.0159, 'grad_norm': 1.976516842842102, 'learning_rate': 0.00015897069335239457, 'epoch': 0.21}


 21%|██        | 3455/16798 [14:23<58:32,  3.80it/s]  

{'loss': 1.885, 'grad_norm': 1.8791403770446777, 'learning_rate': 0.00015895878008101026, 'epoch': 0.21}


 21%|██        | 3456/16798 [14:24<59:47,  3.72it/s]

{'loss': 1.9639, 'grad_norm': 1.9577494859695435, 'learning_rate': 0.00015894686680962593, 'epoch': 0.21}


 21%|██        | 3457/16798 [14:24<57:39,  3.86it/s]

{'loss': 1.8675, 'grad_norm': 2.0900192260742188, 'learning_rate': 0.00015893495353824162, 'epoch': 0.21}


 21%|██        | 3458/16798 [14:24<1:03:42,  3.49it/s]

{'loss': 1.6418, 'grad_norm': 1.7281399965286255, 'learning_rate': 0.00015892304026685728, 'epoch': 0.21}


 21%|██        | 3459/16798 [14:25<1:00:32,  3.67it/s]

{'loss': 1.1879, 'grad_norm': 1.5737098455429077, 'learning_rate': 0.00015891112699547297, 'epoch': 0.21}


 21%|██        | 3460/16798 [14:25<1:01:09,  3.63it/s]

{'loss': 1.5543, 'grad_norm': 1.7087528705596924, 'learning_rate': 0.00015889921372408864, 'epoch': 0.21}


 21%|██        | 3461/16798 [14:25<58:49,  3.78it/s]  

{'loss': 1.5128, 'grad_norm': 1.741424798965454, 'learning_rate': 0.00015888730045270433, 'epoch': 0.21}


 21%|██        | 3462/16798 [14:25<1:04:24,  3.45it/s]

{'loss': 1.7431, 'grad_norm': 1.81468665599823, 'learning_rate': 0.00015887538718132, 'epoch': 0.21}


 21%|██        | 3463/16798 [14:26<1:04:44,  3.43it/s]

{'loss': 1.5074, 'grad_norm': 1.73844313621521, 'learning_rate': 0.00015886347390993568, 'epoch': 0.21}


 21%|██        | 3464/16798 [14:26<1:05:29,  3.39it/s]

{'loss': 1.4161, 'grad_norm': 1.9152493476867676, 'learning_rate': 0.00015885156063855134, 'epoch': 0.21}


 21%|██        | 3465/16798 [14:26<1:01:04,  3.64it/s]

{'loss': 1.5412, 'grad_norm': 2.0481600761413574, 'learning_rate': 0.00015883964736716704, 'epoch': 0.21}


 21%|██        | 3466/16798 [14:26<57:37,  3.86it/s]  

{'loss': 1.6769, 'grad_norm': 2.033919334411621, 'learning_rate': 0.0001588277340957827, 'epoch': 0.21}


 21%|██        | 3467/16798 [14:27<1:01:00,  3.64it/s]

{'loss': 1.5121, 'grad_norm': 1.8372434377670288, 'learning_rate': 0.0001588158208243984, 'epoch': 0.21}


 21%|██        | 3468/16798 [14:27<57:12,  3.88it/s]  

{'loss': 1.8247, 'grad_norm': 2.0978333950042725, 'learning_rate': 0.00015880390755301405, 'epoch': 0.21}


 21%|██        | 3469/16798 [14:27<1:01:42,  3.60it/s]

{'loss': 1.3667, 'grad_norm': 1.720054030418396, 'learning_rate': 0.00015879199428162974, 'epoch': 0.21}


 21%|██        | 3470/16798 [14:28<1:00:06,  3.70it/s]

{'loss': 2.0069, 'grad_norm': 2.165031671524048, 'learning_rate': 0.0001587800810102454, 'epoch': 0.21}


 21%|██        | 3471/16798 [14:28<58:04,  3.82it/s]  

{'loss': 1.5112, 'grad_norm': 1.7974035739898682, 'learning_rate': 0.0001587681677388611, 'epoch': 0.21}


 21%|██        | 3472/16798 [14:28<1:02:02,  3.58it/s]

{'loss': 1.4739, 'grad_norm': 1.6113066673278809, 'learning_rate': 0.00015875625446747676, 'epoch': 0.21}


 21%|██        | 3473/16798 [14:28<1:01:20,  3.62it/s]

{'loss': 1.2708, 'grad_norm': 1.56965172290802, 'learning_rate': 0.00015874434119609245, 'epoch': 0.21}


 21%|██        | 3474/16798 [14:29<1:01:32,  3.61it/s]

{'loss': 1.806, 'grad_norm': 2.3207337856292725, 'learning_rate': 0.00015873242792470812, 'epoch': 0.21}


 21%|██        | 3475/16798 [14:29<1:00:14,  3.69it/s]

{'loss': 1.6273, 'grad_norm': 2.017402410507202, 'learning_rate': 0.0001587205146533238, 'epoch': 0.21}


 21%|██        | 3476/16798 [14:29<1:01:45,  3.60it/s]

{'loss': 1.2655, 'grad_norm': 1.7306803464889526, 'learning_rate': 0.00015870860138193947, 'epoch': 0.21}


 21%|██        | 3477/16798 [14:29<59:03,  3.76it/s]  

{'loss': 1.565, 'grad_norm': 1.8482946157455444, 'learning_rate': 0.00015869668811055516, 'epoch': 0.21}


 21%|██        | 3478/16798 [14:30<1:00:21,  3.68it/s]

{'loss': 1.0105, 'grad_norm': 1.5059605836868286, 'learning_rate': 0.00015868477483917083, 'epoch': 0.21}


 21%|██        | 3479/16798 [14:30<1:00:28,  3.67it/s]

{'loss': 1.2738, 'grad_norm': 1.5858447551727295, 'learning_rate': 0.00015867286156778652, 'epoch': 0.21}


 21%|██        | 3480/16798 [14:30<1:01:27,  3.61it/s]

{'loss': 1.1193, 'grad_norm': 1.4727437496185303, 'learning_rate': 0.00015866094829640218, 'epoch': 0.21}


 21%|██        | 3481/16798 [14:31<1:00:17,  3.68it/s]

{'loss': 1.4027, 'grad_norm': 1.8452532291412354, 'learning_rate': 0.00015864903502501787, 'epoch': 0.21}


 21%|██        | 3483/16798 [14:31<58:48,  3.77it/s]  

{'loss': 1.4834, 'grad_norm': 2.0364184379577637, 'learning_rate': 0.00015863712175363356, 'epoch': 0.21}


 21%|██        | 3483/16798 [14:31<58:48,  3.77it/s]

{'loss': 1.228, 'grad_norm': 1.7576699256896973, 'learning_rate': 0.00015862520848224925, 'epoch': 0.21}


 21%|██        | 3484/16798 [14:31<56:49,  3.90it/s]

{'loss': 0.8529, 'grad_norm': 3.2111001014709473, 'learning_rate': 0.00015861329521086492, 'epoch': 0.21}


 21%|██        | 3485/16798 [14:32<58:48,  3.77it/s]

{'loss': 1.0731, 'grad_norm': 1.6341811418533325, 'learning_rate': 0.0001586013819394806, 'epoch': 0.21}


 21%|██        | 3486/16798 [14:32<56:29,  3.93it/s]

{'loss': 1.0558, 'grad_norm': 1.4491361379623413, 'learning_rate': 0.00015858946866809627, 'epoch': 0.21}


 21%|██        | 3487/16798 [14:32<59:32,  3.73it/s]

{'loss': 1.4055, 'grad_norm': 2.0610976219177246, 'learning_rate': 0.00015857755539671196, 'epoch': 0.21}


 21%|██        | 3488/16798 [14:32<58:01,  3.82it/s]

{'loss': 1.0712, 'grad_norm': 1.5350794792175293, 'learning_rate': 0.00015856564212532763, 'epoch': 0.21}


 21%|██        | 3489/16798 [14:33<1:00:43,  3.65it/s]

{'loss': 1.0835, 'grad_norm': 1.9269458055496216, 'learning_rate': 0.00015855372885394332, 'epoch': 0.21}


 21%|██        | 3490/16798 [14:33<57:37,  3.85it/s]  

{'loss': 1.0643, 'grad_norm': 1.4918867349624634, 'learning_rate': 0.00015854181558255898, 'epoch': 0.21}


 21%|██        | 3491/16798 [14:33<1:01:02,  3.63it/s]

{'loss': 1.063, 'grad_norm': 1.606702446937561, 'learning_rate': 0.00015852990231117467, 'epoch': 0.21}


 21%|██        | 3492/16798 [14:33<57:22,  3.87it/s]  

{'loss': 1.2594, 'grad_norm': 2.117936611175537, 'learning_rate': 0.00015851798903979033, 'epoch': 0.21}


 21%|██        | 3493/16798 [14:34<58:53,  3.77it/s]

{'loss': 1.089, 'grad_norm': 1.6407837867736816, 'learning_rate': 0.00015850607576840602, 'epoch': 0.21}


 21%|██        | 3494/16798 [14:34<55:36,  3.99it/s]

{'loss': 0.5842, 'grad_norm': 1.109079360961914, 'learning_rate': 0.0001584941624970217, 'epoch': 0.21}


 21%|██        | 3495/16798 [14:34<57:37,  3.85it/s]

{'loss': 0.642, 'grad_norm': 1.2595806121826172, 'learning_rate': 0.00015848224922563738, 'epoch': 0.21}


 21%|██        | 3496/16798 [14:35<57:59,  3.82it/s]

{'loss': 0.9903, 'grad_norm': 1.6397669315338135, 'learning_rate': 0.00015847033595425304, 'epoch': 0.21}


 21%|██        | 3497/16798 [14:35<1:01:02,  3.63it/s]

{'loss': 0.7112, 'grad_norm': 1.4521180391311646, 'learning_rate': 0.00015845842268286873, 'epoch': 0.21}


 21%|██        | 3498/16798 [14:35<58:24,  3.79it/s]  

{'loss': 0.3296, 'grad_norm': 0.8416645526885986, 'learning_rate': 0.0001584465094114844, 'epoch': 0.21}


 21%|██        | 3499/16798 [14:35<59:30,  3.72it/s]

{'loss': 0.2459, 'grad_norm': 0.6869882345199585, 'learning_rate': 0.0001584345961401001, 'epoch': 0.21}




{'loss': 0.7582, 'grad_norm': 1.6335780620574951, 'learning_rate': 0.00015842268286871575, 'epoch': 0.21}


 21%|██        | 3501/16798 [14:39<3:57:48,  1.07s/it]

{'loss': 2.4375, 'grad_norm': 1.9827072620391846, 'learning_rate': 0.00015841076959733144, 'epoch': 0.21}


 21%|██        | 3502/16798 [14:39<3:02:35,  1.21it/s]

{'loss': 2.0983, 'grad_norm': 2.0738720893859863, 'learning_rate': 0.0001583988563259471, 'epoch': 0.21}


 21%|██        | 3503/16798 [14:39<2:27:18,  1.50it/s]

{'loss': 2.2888, 'grad_norm': 2.010298252105713, 'learning_rate': 0.0001583869430545628, 'epoch': 0.21}


 21%|██        | 3504/16798 [14:39<2:01:19,  1.83it/s]

{'loss': 1.4282, 'grad_norm': 1.749436855316162, 'learning_rate': 0.00015837502978317846, 'epoch': 0.21}


 21%|██        | 3505/16798 [14:40<1:43:30,  2.14it/s]

{'loss': 1.875, 'grad_norm': 1.8297832012176514, 'learning_rate': 0.00015836311651179415, 'epoch': 0.21}


 21%|██        | 3506/16798 [14:40<1:30:10,  2.46it/s]

{'loss': 2.4931, 'grad_norm': 2.335002899169922, 'learning_rate': 0.00015835120324040981, 'epoch': 0.21}


 21%|██        | 3507/16798 [14:40<1:18:14,  2.83it/s]

{'loss': 1.5714, 'grad_norm': 1.9241564273834229, 'learning_rate': 0.0001583392899690255, 'epoch': 0.21}


 21%|██        | 3508/16798 [14:41<1:18:37,  2.82it/s]

{'loss': 2.011, 'grad_norm': 2.012094497680664, 'learning_rate': 0.00015832737669764117, 'epoch': 0.21}


 21%|██        | 3509/16798 [14:41<1:11:31,  3.10it/s]

{'loss': 2.0632, 'grad_norm': 2.0952391624450684, 'learning_rate': 0.00015831546342625686, 'epoch': 0.21}


 21%|██        | 3510/16798 [14:41<1:08:04,  3.25it/s]

{'loss': 1.679, 'grad_norm': 1.872519850730896, 'learning_rate': 0.00015830355015487252, 'epoch': 0.21}


 21%|██        | 3511/16798 [14:41<1:03:29,  3.49it/s]

{'loss': 1.7112, 'grad_norm': 2.2611379623413086, 'learning_rate': 0.00015829163688348821, 'epoch': 0.21}


 21%|██        | 3512/16798 [14:42<1:01:39,  3.59it/s]

{'loss': 1.2008, 'grad_norm': 1.6764862537384033, 'learning_rate': 0.00015827972361210388, 'epoch': 0.21}


 21%|██        | 3513/16798 [14:42<59:44,  3.71it/s]  

{'loss': 1.4079, 'grad_norm': 1.913491129875183, 'learning_rate': 0.00015826781034071957, 'epoch': 0.21}


 21%|██        | 3514/16798 [14:42<1:02:03,  3.57it/s]

{'loss': 2.0798, 'grad_norm': 2.123114585876465, 'learning_rate': 0.00015825589706933526, 'epoch': 0.21}


 21%|██        | 3515/16798 [14:42<58:01,  3.81it/s]  

{'loss': 1.9538, 'grad_norm': 2.1354598999023438, 'learning_rate': 0.00015824398379795092, 'epoch': 0.21}


 21%|██        | 3516/16798 [14:43<55:55,  3.96it/s]

{'loss': 1.6461, 'grad_norm': 1.9566073417663574, 'learning_rate': 0.00015823207052656661, 'epoch': 0.21}


 21%|██        | 3517/16798 [14:43<1:01:44,  3.59it/s]

{'loss': 1.3363, 'grad_norm': 1.7402886152267456, 'learning_rate': 0.00015822015725518228, 'epoch': 0.21}


 21%|██        | 3518/16798 [14:43<1:01:11,  3.62it/s]

{'loss': 1.6423, 'grad_norm': 1.941138505935669, 'learning_rate': 0.00015820824398379797, 'epoch': 0.21}


 21%|██        | 3519/16798 [14:43<1:04:34,  3.43it/s]

{'loss': 1.6163, 'grad_norm': 1.9218283891677856, 'learning_rate': 0.00015819633071241363, 'epoch': 0.21}


 21%|██        | 3520/16798 [14:44<1:02:01,  3.57it/s]

{'loss': 1.6995, 'grad_norm': 1.9286359548568726, 'learning_rate': 0.00015818441744102932, 'epoch': 0.21}


 21%|██        | 3521/16798 [14:44<1:02:26,  3.54it/s]

{'loss': 1.541, 'grad_norm': 2.0251386165618896, 'learning_rate': 0.000158172504169645, 'epoch': 0.21}


 21%|██        | 3522/16798 [14:44<1:01:31,  3.60it/s]

{'loss': 1.8134, 'grad_norm': 2.1724870204925537, 'learning_rate': 0.00015816059089826068, 'epoch': 0.21}


 21%|██        | 3523/16798 [14:45<1:00:39,  3.65it/s]

{'loss': 1.7166, 'grad_norm': 2.493635892868042, 'learning_rate': 0.00015814867762687634, 'epoch': 0.21}


 21%|██        | 3524/16798 [14:45<57:26,  3.85it/s]  

{'loss': 1.529, 'grad_norm': 1.8422985076904297, 'learning_rate': 0.00015813676435549203, 'epoch': 0.21}


 21%|██        | 3525/16798 [14:45<59:04,  3.74it/s]

{'loss': 1.2617, 'grad_norm': 1.9090129137039185, 'learning_rate': 0.0001581248510841077, 'epoch': 0.21}


 21%|██        | 3526/16798 [14:45<57:20,  3.86it/s]

{'loss': 1.443, 'grad_norm': 1.814406394958496, 'learning_rate': 0.00015811293781272339, 'epoch': 0.21}


 21%|██        | 3527/16798 [14:46<58:32,  3.78it/s]

{'loss': 1.8153, 'grad_norm': 2.1974034309387207, 'learning_rate': 0.00015810102454133905, 'epoch': 0.21}


 21%|██        | 3528/16798 [14:46<58:41,  3.77it/s]

{'loss': 1.3972, 'grad_norm': 1.679722547531128, 'learning_rate': 0.00015808911126995474, 'epoch': 0.21}


 21%|██        | 3529/16798 [14:46<1:01:14,  3.61it/s]

{'loss': 1.2512, 'grad_norm': 1.570176601409912, 'learning_rate': 0.0001580771979985704, 'epoch': 0.21}


 21%|██        | 3530/16798 [14:46<59:57,  3.69it/s]  

{'loss': 1.0351, 'grad_norm': 2.0209999084472656, 'learning_rate': 0.0001580652847271861, 'epoch': 0.21}


 21%|██        | 3531/16798 [14:47<1:00:42,  3.64it/s]

{'loss': 1.2757, 'grad_norm': 1.6877429485321045, 'learning_rate': 0.00015805337145580176, 'epoch': 0.21}


 21%|██        | 3532/16798 [14:47<1:03:52,  3.46it/s]

{'loss': 1.7274, 'grad_norm': 1.888632893562317, 'learning_rate': 0.00015804145818441745, 'epoch': 0.21}


 21%|██        | 3533/16798 [14:47<1:01:50,  3.58it/s]

{'loss': 1.5843, 'grad_norm': 2.092043399810791, 'learning_rate': 0.0001580295449130331, 'epoch': 0.21}


 21%|██        | 3534/16798 [14:48<59:16,  3.73it/s]  

{'loss': 1.0963, 'grad_norm': 1.5699657201766968, 'learning_rate': 0.0001580176316416488, 'epoch': 0.21}


 21%|██        | 3535/16798 [14:48<1:00:49,  3.63it/s]

{'loss': 1.5126, 'grad_norm': 2.334840774536133, 'learning_rate': 0.00015800571837026447, 'epoch': 0.21}


 21%|██        | 3536/16798 [14:48<1:00:43,  3.64it/s]

{'loss': 1.3754, 'grad_norm': 3.8872814178466797, 'learning_rate': 0.00015799380509888016, 'epoch': 0.21}


 21%|██        | 3537/16798 [14:48<1:01:18,  3.60it/s]

{'loss': 1.1268, 'grad_norm': 1.6741329431533813, 'learning_rate': 0.00015798189182749582, 'epoch': 0.21}


 21%|██        | 3538/16798 [14:49<1:00:27,  3.66it/s]

{'loss': 1.5761, 'grad_norm': 2.2829837799072266, 'learning_rate': 0.0001579699785561115, 'epoch': 0.21}


 21%|██        | 3539/16798 [14:49<1:02:19,  3.55it/s]

{'loss': 1.2436, 'grad_norm': 1.6652623414993286, 'learning_rate': 0.00015795806528472718, 'epoch': 0.21}


 21%|██        | 3540/16798 [14:49<1:00:11,  3.67it/s]

{'loss': 1.4924, 'grad_norm': 2.4068381786346436, 'learning_rate': 0.00015794615201334287, 'epoch': 0.21}


 21%|██        | 3541/16798 [14:50<1:04:26,  3.43it/s]

{'loss': 1.5423, 'grad_norm': 2.1881392002105713, 'learning_rate': 0.00015793423874195853, 'epoch': 0.21}


 21%|██        | 3542/16798 [14:50<1:02:37,  3.53it/s]

{'loss': 0.6508, 'grad_norm': 1.1871031522750854, 'learning_rate': 0.00015792232547057422, 'epoch': 0.21}


 21%|██        | 3543/16798 [14:50<1:04:47,  3.41it/s]

{'loss': 1.0555, 'grad_norm': 1.5096619129180908, 'learning_rate': 0.00015791041219918989, 'epoch': 0.21}


 21%|██        | 3544/16798 [14:50<1:03:47,  3.46it/s]

{'loss': 1.5468, 'grad_norm': 1.903815507888794, 'learning_rate': 0.0001578984989278056, 'epoch': 0.21}


 21%|██        | 3545/16798 [14:51<1:02:33,  3.53it/s]

{'loss': 0.7157, 'grad_norm': 1.3634694814682007, 'learning_rate': 0.00015788658565642127, 'epoch': 0.21}


 21%|██        | 3546/16798 [14:51<58:53,  3.75it/s]  

{'loss': 1.0374, 'grad_norm': 1.7527945041656494, 'learning_rate': 0.00015787467238503696, 'epoch': 0.21}


 21%|██        | 3547/16798 [14:51<1:01:28,  3.59it/s]

{'loss': 0.9641, 'grad_norm': 1.7918596267700195, 'learning_rate': 0.00015786275911365262, 'epoch': 0.21}


 21%|██        | 3548/16798 [14:51<57:11,  3.86it/s]  

{'loss': 0.5209, 'grad_norm': 1.070333480834961, 'learning_rate': 0.0001578508458422683, 'epoch': 0.21}


 21%|██        | 3549/16798 [14:52<58:42,  3.76it/s]

{'loss': 0.2772, 'grad_norm': 0.9556534290313721, 'learning_rate': 0.00015783893257088398, 'epoch': 0.21}


 21%|██        | 3550/16798 [14:52<59:36,  3.70it/s]

{'loss': 0.5223, 'grad_norm': 1.2147773504257202, 'learning_rate': 0.00015782701929949967, 'epoch': 0.21}


 21%|██        | 3551/16798 [14:52<59:06,  3.73it/s]

{'loss': 2.0488, 'grad_norm': 1.8269025087356567, 'learning_rate': 0.00015781510602811533, 'epoch': 0.21}


 21%|██        | 3552/16798 [14:53<1:00:40,  3.64it/s]

{'loss': 1.5497, 'grad_norm': 1.6949673891067505, 'learning_rate': 0.00015780319275673102, 'epoch': 0.21}


 21%|██        | 3553/16798 [14:53<57:46,  3.82it/s]  

{'loss': 2.1745, 'grad_norm': 2.219593048095703, 'learning_rate': 0.00015779127948534668, 'epoch': 0.21}


 21%|██        | 3554/16798 [14:53<1:00:09,  3.67it/s]

{'loss': 1.8645, 'grad_norm': 1.9142522811889648, 'learning_rate': 0.00015777936621396238, 'epoch': 0.21}


 21%|██        | 3555/16798 [14:53<58:12,  3.79it/s]  

{'loss': 2.2927, 'grad_norm': 2.1049458980560303, 'learning_rate': 0.00015776745294257804, 'epoch': 0.21}


 21%|██        | 3556/16798 [14:54<59:40,  3.70it/s]

{'loss': 1.9418, 'grad_norm': 1.9288487434387207, 'learning_rate': 0.00015775553967119373, 'epoch': 0.21}


 21%|██        | 3557/16798 [14:54<1:00:21,  3.66it/s]

{'loss': 1.5861, 'grad_norm': 1.7232561111450195, 'learning_rate': 0.0001577436263998094, 'epoch': 0.21}


 21%|██        | 3558/16798 [14:54<1:02:14,  3.55it/s]

{'loss': 2.3295, 'grad_norm': 2.12514591217041, 'learning_rate': 0.00015773171312842508, 'epoch': 0.21}


 21%|██        | 3559/16798 [14:54<58:48,  3.75it/s]  

{'loss': 1.7801, 'grad_norm': 1.8260830640792847, 'learning_rate': 0.00015771979985704075, 'epoch': 0.21}


 21%|██        | 3560/16798 [14:55<59:44,  3.69it/s]

{'loss': 1.5936, 'grad_norm': 1.6140354871749878, 'learning_rate': 0.00015770788658565644, 'epoch': 0.21}


 21%|██        | 3561/16798 [14:55<1:02:10,  3.55it/s]

{'loss': 1.7726, 'grad_norm': 1.7761261463165283, 'learning_rate': 0.0001576959733142721, 'epoch': 0.21}


 21%|██        | 3562/16798 [14:55<1:03:42,  3.46it/s]

{'loss': 1.5549, 'grad_norm': 1.6914129257202148, 'learning_rate': 0.0001576840600428878, 'epoch': 0.21}


 21%|██        | 3563/16798 [14:56<1:03:13,  3.49it/s]

{'loss': 1.7538, 'grad_norm': 2.2019383907318115, 'learning_rate': 0.00015767214677150346, 'epoch': 0.21}


 21%|██        | 3564/16798 [14:56<1:03:06,  3.50it/s]

{'loss': 1.5825, 'grad_norm': 2.4051434993743896, 'learning_rate': 0.00015766023350011915, 'epoch': 0.21}


 21%|██        | 3565/16798 [14:56<59:29,  3.71it/s]  

{'loss': 1.3148, 'grad_norm': 1.6967732906341553, 'learning_rate': 0.0001576483202287348, 'epoch': 0.21}


 21%|██        | 3566/16798 [14:56<58:53,  3.74it/s]

{'loss': 1.946, 'grad_norm': 1.9499300718307495, 'learning_rate': 0.0001576364069573505, 'epoch': 0.21}


 21%|██        | 3567/16798 [14:57<1:02:18,  3.54it/s]

{'loss': 1.7147, 'grad_norm': 1.8591055870056152, 'learning_rate': 0.00015762449368596617, 'epoch': 0.21}


 21%|██        | 3568/16798 [14:57<1:05:35,  3.36it/s]

{'loss': 1.6747, 'grad_norm': 2.0561435222625732, 'learning_rate': 0.00015761258041458186, 'epoch': 0.21}


 21%|██        | 3569/16798 [14:57<1:01:56,  3.56it/s]

{'loss': 1.33, 'grad_norm': 1.8213169574737549, 'learning_rate': 0.00015760066714319752, 'epoch': 0.21}


 21%|██▏       | 3570/16798 [14:57<1:00:45,  3.63it/s]

{'loss': 1.5525, 'grad_norm': 1.7254024744033813, 'learning_rate': 0.0001575887538718132, 'epoch': 0.21}


 21%|██▏       | 3571/16798 [14:58<1:03:57,  3.45it/s]

{'loss': 1.675, 'grad_norm': 1.854795217514038, 'learning_rate': 0.00015757684060042887, 'epoch': 0.21}


 21%|██▏       | 3572/16798 [14:58<59:30,  3.70it/s]  

{'loss': 1.7493, 'grad_norm': 1.8128007650375366, 'learning_rate': 0.00015756492732904457, 'epoch': 0.21}


 21%|██▏       | 3573/16798 [14:58<59:46,  3.69it/s]

{'loss': 1.5811, 'grad_norm': 1.9587973356246948, 'learning_rate': 0.00015755301405766023, 'epoch': 0.21}


 21%|██▏       | 3574/16798 [14:59<56:24,  3.91it/s]

{'loss': 1.6445, 'grad_norm': 1.7447298765182495, 'learning_rate': 0.00015754110078627592, 'epoch': 0.21}


 21%|██▏       | 3575/16798 [14:59<59:50,  3.68it/s]

{'loss': 1.5467, 'grad_norm': 1.7835942506790161, 'learning_rate': 0.0001575291875148916, 'epoch': 0.21}


 21%|██▏       | 3576/16798 [14:59<58:15,  3.78it/s]

{'loss': 1.4643, 'grad_norm': 1.6888381242752075, 'learning_rate': 0.00015751727424350727, 'epoch': 0.21}


 21%|██▏       | 3577/16798 [14:59<1:01:25,  3.59it/s]

{'loss': 1.619, 'grad_norm': 1.7693361043930054, 'learning_rate': 0.00015750536097212296, 'epoch': 0.21}


 21%|██▏       | 3578/16798 [15:00<59:09,  3.72it/s]  

{'loss': 1.8194, 'grad_norm': 2.1125571727752686, 'learning_rate': 0.00015749344770073863, 'epoch': 0.21}


 21%|██▏       | 3579/16798 [15:00<55:10,  3.99it/s]

{'loss': 1.2802, 'grad_norm': 1.7542856931686401, 'learning_rate': 0.00015748153442935432, 'epoch': 0.21}


 21%|██▏       | 3580/16798 [15:00<58:46,  3.75it/s]

{'loss': 1.1146, 'grad_norm': 1.478973627090454, 'learning_rate': 0.00015746962115796998, 'epoch': 0.21}


 21%|██▏       | 3581/16798 [15:00<56:45,  3.88it/s]

{'loss': 1.3456, 'grad_norm': 1.6738954782485962, 'learning_rate': 0.00015745770788658567, 'epoch': 0.21}


 21%|██▏       | 3582/16798 [15:01<1:03:03,  3.49it/s]

{'loss': 1.4347, 'grad_norm': 1.8925143480300903, 'learning_rate': 0.00015744579461520134, 'epoch': 0.21}


 21%|██▏       | 3583/16798 [15:01<1:00:16,  3.65it/s]

{'loss': 1.5363, 'grad_norm': 1.937668800354004, 'learning_rate': 0.00015743388134381703, 'epoch': 0.21}


 21%|██▏       | 3584/16798 [15:01<1:03:36,  3.46it/s]

{'loss': 1.4413, 'grad_norm': 2.2836077213287354, 'learning_rate': 0.0001574219680724327, 'epoch': 0.21}


 21%|██▏       | 3585/16798 [15:02<1:04:58,  3.39it/s]

{'loss': 1.5969, 'grad_norm': 2.541825294494629, 'learning_rate': 0.00015741005480104838, 'epoch': 0.21}


 21%|██▏       | 3586/16798 [15:02<1:02:27,  3.53it/s]

{'loss': 1.4559, 'grad_norm': 1.6432632207870483, 'learning_rate': 0.00015739814152966405, 'epoch': 0.21}


 21%|██▏       | 3587/16798 [15:02<59:45,  3.68it/s]  

{'loss': 1.6005, 'grad_norm': 1.8784111738204956, 'learning_rate': 0.00015738622825827974, 'epoch': 0.21}


 21%|██▏       | 3588/16798 [15:02<1:03:05,  3.49it/s]

{'loss': 1.9063, 'grad_norm': 2.252274513244629, 'learning_rate': 0.0001573743149868954, 'epoch': 0.21}


 21%|██▏       | 3589/16798 [15:03<59:36,  3.69it/s]  

{'loss': 1.2051, 'grad_norm': 1.4897950887680054, 'learning_rate': 0.0001573624017155111, 'epoch': 0.21}


 21%|██▏       | 3590/16798 [15:03<59:48,  3.68it/s]

{'loss': 1.1183, 'grad_norm': 1.7559325695037842, 'learning_rate': 0.00015735048844412675, 'epoch': 0.21}


 21%|██▏       | 3591/16798 [15:03<59:04,  3.73it/s]

{'loss': 1.4999, 'grad_norm': 2.0311546325683594, 'learning_rate': 0.00015733857517274245, 'epoch': 0.21}


 21%|██▏       | 3592/16798 [15:03<59:27,  3.70it/s]

{'loss': 1.0333, 'grad_norm': 1.6430596113204956, 'learning_rate': 0.0001573266619013581, 'epoch': 0.21}


 21%|██▏       | 3593/16798 [15:04<1:00:05,  3.66it/s]

{'loss': 1.1094, 'grad_norm': 1.7417433261871338, 'learning_rate': 0.0001573147486299738, 'epoch': 0.21}


 21%|██▏       | 3594/16798 [15:04<1:04:17,  3.42it/s]

{'loss': 1.2603, 'grad_norm': 1.8080183267593384, 'learning_rate': 0.00015730283535858946, 'epoch': 0.21}


 21%|██▏       | 3595/16798 [15:04<59:39,  3.69it/s]  

{'loss': 1.1567, 'grad_norm': 1.6547952890396118, 'learning_rate': 0.00015729092208720515, 'epoch': 0.21}


 21%|██▏       | 3596/16798 [15:05<57:35,  3.82it/s]

{'loss': 1.2269, 'grad_norm': 1.8216724395751953, 'learning_rate': 0.00015727900881582082, 'epoch': 0.21}


 21%|██▏       | 3598/16798 [15:05<55:30,  3.96it/s]  

{'loss': 1.2846, 'grad_norm': 1.744835615158081, 'learning_rate': 0.0001572670955444365, 'epoch': 0.21}


 21%|██▏       | 3598/16798 [15:05<55:30,  3.96it/s]

{'loss': 0.9625, 'grad_norm': 1.6350014209747314, 'learning_rate': 0.00015725518227305217, 'epoch': 0.21}


 21%|██▏       | 3599/16798 [15:05<53:40,  4.10it/s]

{'loss': 1.0334, 'grad_norm': 1.8905576467514038, 'learning_rate': 0.00015724326900166786, 'epoch': 0.21}


 21%|██▏       | 3600/16798 [15:06<1:01:19,  3.59it/s]

{'loss': 1.018, 'grad_norm': 1.716278076171875, 'learning_rate': 0.00015723135573028353, 'epoch': 0.21}


 21%|██▏       | 3601/16798 [15:06<58:17,  3.77it/s]  

{'loss': 2.1304, 'grad_norm': 1.8889858722686768, 'learning_rate': 0.00015721944245889922, 'epoch': 0.21}


 21%|██▏       | 3602/16798 [15:06<1:00:36,  3.63it/s]

{'loss': 2.2112, 'grad_norm': 2.0120925903320312, 'learning_rate': 0.00015720752918751488, 'epoch': 0.21}


 21%|██▏       | 3603/16798 [15:06<59:49,  3.68it/s]  

{'loss': 1.9558, 'grad_norm': 1.9415031671524048, 'learning_rate': 0.00015719561591613057, 'epoch': 0.21}


 21%|██▏       | 3604/16798 [15:07<1:04:22,  3.42it/s]

{'loss': 2.2416, 'grad_norm': 2.324472427368164, 'learning_rate': 0.00015718370264474624, 'epoch': 0.21}


 21%|██▏       | 3605/16798 [15:07<1:01:26,  3.58it/s]

{'loss': 2.5569, 'grad_norm': 2.110178232192993, 'learning_rate': 0.00015717178937336193, 'epoch': 0.21}


 21%|██▏       | 3606/16798 [15:07<1:03:47,  3.45it/s]

{'loss': 2.0137, 'grad_norm': 1.9076757431030273, 'learning_rate': 0.00015715987610197762, 'epoch': 0.21}


 21%|██▏       | 3607/16798 [15:08<1:00:38,  3.63it/s]

{'loss': 1.9376, 'grad_norm': 2.0244932174682617, 'learning_rate': 0.0001571479628305933, 'epoch': 0.21}


 21%|██▏       | 3608/16798 [15:08<1:03:00,  3.49it/s]

{'loss': 1.9449, 'grad_norm': 1.7945064306259155, 'learning_rate': 0.00015713604955920897, 'epoch': 0.21}


 21%|██▏       | 3609/16798 [15:08<59:46,  3.68it/s]  

{'loss': 2.1055, 'grad_norm': 2.7423818111419678, 'learning_rate': 0.00015712413628782466, 'epoch': 0.21}


 21%|██▏       | 3610/16798 [15:08<1:01:36,  3.57it/s]

{'loss': 1.9087, 'grad_norm': 1.8060050010681152, 'learning_rate': 0.00015711222301644033, 'epoch': 0.21}


 21%|██▏       | 3611/16798 [15:09<59:03,  3.72it/s]  

{'loss': 1.6276, 'grad_norm': 1.8811018466949463, 'learning_rate': 0.00015710030974505602, 'epoch': 0.21}


 22%|██▏       | 3612/16798 [15:09<1:00:50,  3.61it/s]

{'loss': 1.6604, 'grad_norm': 1.9049115180969238, 'learning_rate': 0.00015708839647367168, 'epoch': 0.22}


 22%|██▏       | 3613/16798 [15:09<57:19,  3.83it/s]  

{'loss': 1.3889, 'grad_norm': 1.6768076419830322, 'learning_rate': 0.00015707648320228737, 'epoch': 0.22}


 22%|██▏       | 3614/16798 [15:09<58:28,  3.76it/s]

{'loss': 1.8091, 'grad_norm': 1.7034032344818115, 'learning_rate': 0.00015706456993090304, 'epoch': 0.22}


 22%|██▏       | 3615/16798 [15:10<57:17,  3.84it/s]

{'loss': 1.5817, 'grad_norm': 1.6455665826797485, 'learning_rate': 0.00015705265665951873, 'epoch': 0.22}


 22%|██▏       | 3616/16798 [15:10<1:02:35,  3.51it/s]

{'loss': 2.0489, 'grad_norm': 1.9873907566070557, 'learning_rate': 0.0001570407433881344, 'epoch': 0.22}


 22%|██▏       | 3617/16798 [15:10<1:03:46,  3.44it/s]

{'loss': 1.4436, 'grad_norm': 1.4828110933303833, 'learning_rate': 0.00015702883011675008, 'epoch': 0.22}


 22%|██▏       | 3618/16798 [15:11<1:06:54,  3.28it/s]

{'loss': 1.706, 'grad_norm': 1.682736873626709, 'learning_rate': 0.00015701691684536574, 'epoch': 0.22}


 22%|██▏       | 3619/16798 [15:11<1:03:33,  3.46it/s]

{'loss': 1.2714, 'grad_norm': 1.5871775150299072, 'learning_rate': 0.00015700500357398143, 'epoch': 0.22}


 22%|██▏       | 3620/16798 [15:11<1:03:53,  3.44it/s]

{'loss': 1.485, 'grad_norm': 1.7471539974212646, 'learning_rate': 0.0001569930903025971, 'epoch': 0.22}


 22%|██▏       | 3621/16798 [15:12<1:02:44,  3.50it/s]

{'loss': 1.5854, 'grad_norm': 2.0216431617736816, 'learning_rate': 0.0001569811770312128, 'epoch': 0.22}


 22%|██▏       | 3622/16798 [15:12<1:01:35,  3.56it/s]

{'loss': 1.1689, 'grad_norm': 1.9116511344909668, 'learning_rate': 0.00015696926375982845, 'epoch': 0.22}


 22%|██▏       | 3623/16798 [15:12<1:00:16,  3.64it/s]

{'loss': 1.6757, 'grad_norm': 1.9829787015914917, 'learning_rate': 0.00015695735048844414, 'epoch': 0.22}


 22%|██▏       | 3624/16798 [15:12<1:00:24,  3.63it/s]

{'loss': 1.6338, 'grad_norm': 2.025456428527832, 'learning_rate': 0.0001569454372170598, 'epoch': 0.22}


 22%|██▏       | 3625/16798 [15:13<56:35,  3.88it/s]  

{'loss': 1.7487, 'grad_norm': 2.2011468410491943, 'learning_rate': 0.0001569335239456755, 'epoch': 0.22}


 22%|██▏       | 3626/16798 [15:13<57:26,  3.82it/s]

{'loss': 1.524, 'grad_norm': 1.699586272239685, 'learning_rate': 0.00015692161067429116, 'epoch': 0.22}


 22%|██▏       | 3627/16798 [15:13<57:46,  3.80it/s]

{'loss': 1.6417, 'grad_norm': 1.7360446453094482, 'learning_rate': 0.00015690969740290685, 'epoch': 0.22}


 22%|██▏       | 3628/16798 [15:13<1:03:54,  3.43it/s]

{'loss': 1.3263, 'grad_norm': 2.070795774459839, 'learning_rate': 0.00015689778413152252, 'epoch': 0.22}


 22%|██▏       | 3629/16798 [15:14<1:01:26,  3.57it/s]

{'loss': 1.4643, 'grad_norm': 1.965596079826355, 'learning_rate': 0.0001568858708601382, 'epoch': 0.22}


 22%|██▏       | 3630/16798 [15:14<1:00:15,  3.64it/s]

{'loss': 1.6405, 'grad_norm': 1.9525038003921509, 'learning_rate': 0.00015687395758875387, 'epoch': 0.22}


 22%|██▏       | 3631/16798 [15:14<1:03:12,  3.47it/s]

{'loss': 1.2753, 'grad_norm': 1.6264231204986572, 'learning_rate': 0.00015686204431736956, 'epoch': 0.22}


 22%|██▏       | 3632/16798 [15:15<59:28,  3.69it/s]  

{'loss': 1.2888, 'grad_norm': 1.731093406677246, 'learning_rate': 0.00015685013104598522, 'epoch': 0.22}


 22%|██▏       | 3633/16798 [15:15<1:02:11,  3.53it/s]

{'loss': 1.275, 'grad_norm': 1.634974718093872, 'learning_rate': 0.00015683821777460092, 'epoch': 0.22}


 22%|██▏       | 3634/16798 [15:15<59:22,  3.70it/s]  

{'loss': 1.2405, 'grad_norm': 1.9589289426803589, 'learning_rate': 0.00015682630450321658, 'epoch': 0.22}


 22%|██▏       | 3635/16798 [15:15<1:00:40,  3.62it/s]

{'loss': 1.2969, 'grad_norm': 1.6699841022491455, 'learning_rate': 0.00015681439123183227, 'epoch': 0.22}


 22%|██▏       | 3636/16798 [15:16<59:17,  3.70it/s]  

{'loss': 1.3258, 'grad_norm': 1.6323269605636597, 'learning_rate': 0.00015680247796044793, 'epoch': 0.22}


 22%|██▏       | 3637/16798 [15:16<1:01:26,  3.57it/s]

{'loss': 1.2083, 'grad_norm': 2.1582045555114746, 'learning_rate': 0.00015679056468906362, 'epoch': 0.22}


 22%|██▏       | 3638/16798 [15:16<59:14,  3.70it/s]  

{'loss': 1.4039, 'grad_norm': 2.4402689933776855, 'learning_rate': 0.00015677865141767932, 'epoch': 0.22}


 22%|██▏       | 3639/16798 [15:16<1:01:29,  3.57it/s]

{'loss': 1.8773, 'grad_norm': 2.255542516708374, 'learning_rate': 0.00015676673814629498, 'epoch': 0.22}


 22%|██▏       | 3640/16798 [15:17<1:01:08,  3.59it/s]

{'loss': 1.0128, 'grad_norm': 2.7388436794281006, 'learning_rate': 0.00015675482487491067, 'epoch': 0.22}


 22%|██▏       | 3641/16798 [15:17<1:02:39,  3.50it/s]

{'loss': 1.7139, 'grad_norm': 2.0990052223205566, 'learning_rate': 0.00015674291160352633, 'epoch': 0.22}


 22%|██▏       | 3642/16798 [15:17<59:38,  3.68it/s]  

{'loss': 1.1444, 'grad_norm': 1.680834412574768, 'learning_rate': 0.00015673099833214202, 'epoch': 0.22}


 22%|██▏       | 3643/16798 [15:18<1:01:16,  3.58it/s]

{'loss': 1.1879, 'grad_norm': 1.9408186674118042, 'learning_rate': 0.0001567190850607577, 'epoch': 0.22}


 22%|██▏       | 3644/16798 [15:18<1:02:04,  3.53it/s]

{'loss': 0.9644, 'grad_norm': 1.4234769344329834, 'learning_rate': 0.00015670717178937338, 'epoch': 0.22}


 22%|██▏       | 3645/16798 [15:18<1:03:23,  3.46it/s]

{'loss': 0.8679, 'grad_norm': 1.7288259267807007, 'learning_rate': 0.00015669525851798904, 'epoch': 0.22}


 22%|██▏       | 3646/16798 [15:18<59:29,  3.68it/s]  

{'loss': 0.7682, 'grad_norm': 1.4326931238174438, 'learning_rate': 0.00015668334524660473, 'epoch': 0.22}


 22%|██▏       | 3647/16798 [15:19<59:30,  3.68it/s]

{'loss': 1.0105, 'grad_norm': 1.6741594076156616, 'learning_rate': 0.0001566714319752204, 'epoch': 0.22}


 22%|██▏       | 3648/16798 [15:19<58:42,  3.73it/s]

{'loss': 0.7802, 'grad_norm': 1.4885882139205933, 'learning_rate': 0.0001566595187038361, 'epoch': 0.22}


 22%|██▏       | 3649/16798 [15:19<1:00:02,  3.65it/s]

{'loss': 0.7745, 'grad_norm': 1.3736144304275513, 'learning_rate': 0.00015664760543245175, 'epoch': 0.22}


 22%|██▏       | 3650/16798 [15:20<59:21,  3.69it/s]  

{'loss': 0.7366, 'grad_norm': 1.4489085674285889, 'learning_rate': 0.00015663569216106744, 'epoch': 0.22}


 22%|██▏       | 3651/16798 [15:20<1:01:48,  3.55it/s]

{'loss': 2.2411, 'grad_norm': 2.4306254386901855, 'learning_rate': 0.0001566237788896831, 'epoch': 0.22}


 22%|██▏       | 3652/16798 [15:20<57:51,  3.79it/s]  

{'loss': 2.0452, 'grad_norm': 2.097473621368408, 'learning_rate': 0.0001566118656182988, 'epoch': 0.22}


 22%|██▏       | 3653/16798 [15:20<57:14,  3.83it/s]

{'loss': 2.1504, 'grad_norm': 2.1108615398406982, 'learning_rate': 0.00015659995234691446, 'epoch': 0.22}


 22%|██▏       | 3654/16798 [15:21<1:02:55,  3.48it/s]

{'loss': 2.0069, 'grad_norm': 2.0197551250457764, 'learning_rate': 0.00015658803907553015, 'epoch': 0.22}


 22%|██▏       | 3655/16798 [15:21<59:39,  3.67it/s]  

{'loss': 2.1687, 'grad_norm': 2.122279167175293, 'learning_rate': 0.00015657612580414581, 'epoch': 0.22}


 22%|██▏       | 3656/16798 [15:21<1:03:44,  3.44it/s]

{'loss': 2.1217, 'grad_norm': 2.0895633697509766, 'learning_rate': 0.0001565642125327615, 'epoch': 0.22}


 22%|██▏       | 3657/16798 [15:21<1:02:30,  3.50it/s]

{'loss': 2.0888, 'grad_norm': 2.1767144203186035, 'learning_rate': 0.00015655229926137717, 'epoch': 0.22}


 22%|██▏       | 3658/16798 [15:22<1:03:32,  3.45it/s]

{'loss': 1.5219, 'grad_norm': 1.8101612329483032, 'learning_rate': 0.00015654038598999286, 'epoch': 0.22}


 22%|██▏       | 3659/16798 [15:22<58:49,  3.72it/s]  

{'loss': 1.5865, 'grad_norm': 1.8486930131912231, 'learning_rate': 0.00015652847271860852, 'epoch': 0.22}


 22%|██▏       | 3660/16798 [15:22<55:20,  3.96it/s]

{'loss': 1.3782, 'grad_norm': 1.9120908975601196, 'learning_rate': 0.00015651655944722421, 'epoch': 0.22}


 22%|██▏       | 3661/16798 [15:23<59:18,  3.69it/s]

{'loss': 2.0195, 'grad_norm': 3.822354555130005, 'learning_rate': 0.00015650464617583988, 'epoch': 0.22}


 22%|██▏       | 3662/16798 [15:23<58:40,  3.73it/s]

{'loss': 1.9334, 'grad_norm': 1.930501937866211, 'learning_rate': 0.00015649273290445557, 'epoch': 0.22}


 22%|██▏       | 3664/16798 [15:23<56:18,  3.89it/s]  

{'loss': 1.6828, 'grad_norm': 1.9402357339859009, 'learning_rate': 0.00015648081963307123, 'epoch': 0.22}


 22%|██▏       | 3664/16798 [15:23<56:18,  3.89it/s]

{'loss': 1.6407, 'grad_norm': 1.865538477897644, 'learning_rate': 0.00015646890636168692, 'epoch': 0.22}


 22%|██▏       | 3665/16798 [15:24<56:38,  3.86it/s]

{'loss': 1.7213, 'grad_norm': 1.8570117950439453, 'learning_rate': 0.00015645699309030259, 'epoch': 0.22}


 22%|██▏       | 3666/16798 [15:24<1:00:54,  3.59it/s]

{'loss': 1.5422, 'grad_norm': 1.6769299507141113, 'learning_rate': 0.00015644507981891828, 'epoch': 0.22}


 22%|██▏       | 3667/16798 [15:24<57:04,  3.83it/s]  

{'loss': 1.5494, 'grad_norm': 1.872207522392273, 'learning_rate': 0.00015643316654753397, 'epoch': 0.22}


 22%|██▏       | 3668/16798 [15:24<58:54,  3.71it/s]

{'loss': 1.4051, 'grad_norm': 1.529059886932373, 'learning_rate': 0.00015642125327614966, 'epoch': 0.22}


 22%|██▏       | 3669/16798 [15:25<56:32,  3.87it/s]

{'loss': 1.829, 'grad_norm': 1.9408315420150757, 'learning_rate': 0.00015640934000476532, 'epoch': 0.22}


 22%|██▏       | 3670/16798 [15:25<1:00:09,  3.64it/s]

{'loss': 1.3715, 'grad_norm': 1.651304841041565, 'learning_rate': 0.000156397426733381, 'epoch': 0.22}


 22%|██▏       | 3671/16798 [15:25<57:04,  3.83it/s]  

{'loss': 1.4052, 'grad_norm': 1.8790611028671265, 'learning_rate': 0.00015638551346199668, 'epoch': 0.22}


 22%|██▏       | 3672/16798 [15:25<58:05,  3.77it/s]

{'loss': 1.7172, 'grad_norm': 1.7756842374801636, 'learning_rate': 0.00015637360019061237, 'epoch': 0.22}


 22%|██▏       | 3673/16798 [15:26<1:02:33,  3.50it/s]

{'loss': 1.335, 'grad_norm': 1.7644927501678467, 'learning_rate': 0.00015636168691922803, 'epoch': 0.22}


 22%|██▏       | 3674/16798 [15:26<1:02:41,  3.49it/s]

{'loss': 1.3747, 'grad_norm': 1.53793203830719, 'learning_rate': 0.00015634977364784372, 'epoch': 0.22}


 22%|██▏       | 3675/16798 [15:26<1:02:16,  3.51it/s]

{'loss': 1.3435, 'grad_norm': 1.8473231792449951, 'learning_rate': 0.00015633786037645939, 'epoch': 0.22}


 22%|██▏       | 3676/16798 [15:27<1:03:32,  3.44it/s]

{'loss': 1.4657, 'grad_norm': 1.879663109779358, 'learning_rate': 0.00015632594710507508, 'epoch': 0.22}


 22%|██▏       | 3677/16798 [15:27<1:01:16,  3.57it/s]

{'loss': 1.9975, 'grad_norm': 1.8950783014297485, 'learning_rate': 0.00015631403383369074, 'epoch': 0.22}


 22%|██▏       | 3678/16798 [15:27<59:54,  3.65it/s]  

{'loss': 1.4177, 'grad_norm': 2.0993587970733643, 'learning_rate': 0.00015630212056230643, 'epoch': 0.22}


 22%|██▏       | 3679/16798 [15:27<1:00:18,  3.63it/s]

{'loss': 1.2813, 'grad_norm': 1.8024098873138428, 'learning_rate': 0.0001562902072909221, 'epoch': 0.22}


 22%|██▏       | 3680/16798 [15:28<1:03:57,  3.42it/s]

{'loss': 1.6311, 'grad_norm': 2.0344111919403076, 'learning_rate': 0.00015627829401953779, 'epoch': 0.22}


 22%|██▏       | 3681/16798 [15:28<1:03:52,  3.42it/s]

{'loss': 1.3067, 'grad_norm': 1.6887942552566528, 'learning_rate': 0.00015626638074815345, 'epoch': 0.22}


 22%|██▏       | 3682/16798 [15:28<59:43,  3.66it/s]  

{'loss': 1.2997, 'grad_norm': 1.8035223484039307, 'learning_rate': 0.00015625446747676914, 'epoch': 0.22}


 22%|██▏       | 3683/16798 [15:29<1:06:52,  3.27it/s]

{'loss': 1.2315, 'grad_norm': 1.7451192140579224, 'learning_rate': 0.0001562425542053848, 'epoch': 0.22}


 22%|██▏       | 3684/16798 [15:29<1:02:24,  3.50it/s]

{'loss': 1.2983, 'grad_norm': 1.694351315498352, 'learning_rate': 0.0001562306409340005, 'epoch': 0.22}


 22%|██▏       | 3685/16798 [15:29<1:04:46,  3.37it/s]

{'loss': 1.1622, 'grad_norm': 1.776546597480774, 'learning_rate': 0.00015621872766261616, 'epoch': 0.22}


 22%|██▏       | 3686/16798 [15:29<1:00:30,  3.61it/s]

{'loss': 1.2943, 'grad_norm': 1.7439954280853271, 'learning_rate': 0.00015620681439123185, 'epoch': 0.22}


 22%|██▏       | 3687/16798 [15:30<1:01:49,  3.53it/s]

{'loss': 1.4669, 'grad_norm': 2.0581676959991455, 'learning_rate': 0.0001561949011198475, 'epoch': 0.22}


 22%|██▏       | 3688/16798 [15:30<1:02:53,  3.47it/s]

{'loss': 1.8199, 'grad_norm': 2.813448667526245, 'learning_rate': 0.0001561829878484632, 'epoch': 0.22}


 22%|██▏       | 3689/16798 [15:30<1:03:08,  3.46it/s]

{'loss': 1.0393, 'grad_norm': 1.6877930164337158, 'learning_rate': 0.00015617107457707887, 'epoch': 0.22}


 22%|██▏       | 3690/16798 [15:31<1:00:16,  3.62it/s]

{'loss': 0.914, 'grad_norm': 1.6232446432113647, 'learning_rate': 0.00015615916130569456, 'epoch': 0.22}


 22%|██▏       | 3691/16798 [15:31<1:04:11,  3.40it/s]

{'loss': 1.423, 'grad_norm': 1.8077869415283203, 'learning_rate': 0.00015614724803431022, 'epoch': 0.22}


 22%|██▏       | 3692/16798 [15:31<1:01:44,  3.54it/s]

{'loss': 1.4709, 'grad_norm': 1.9566742181777954, 'learning_rate': 0.0001561353347629259, 'epoch': 0.22}


 22%|██▏       | 3693/16798 [15:31<1:02:43,  3.48it/s]

{'loss': 0.9492, 'grad_norm': 1.7911019325256348, 'learning_rate': 0.00015612342149154158, 'epoch': 0.22}


 22%|██▏       | 3694/16798 [15:32<1:02:34,  3.49it/s]

{'loss': 1.3172, 'grad_norm': 1.8996262550354004, 'learning_rate': 0.00015611150822015727, 'epoch': 0.22}


 22%|██▏       | 3695/16798 [15:32<1:03:26,  3.44it/s]

{'loss': 1.6684, 'grad_norm': 2.0219266414642334, 'learning_rate': 0.00015609959494877293, 'epoch': 0.22}


 22%|██▏       | 3696/16798 [15:32<1:08:00,  3.21it/s]

{'loss': 1.0822, 'grad_norm': 2.2402660846710205, 'learning_rate': 0.00015608768167738862, 'epoch': 0.22}


 22%|██▏       | 3697/16798 [15:33<1:09:31,  3.14it/s]

{'loss': 1.0739, 'grad_norm': 2.2021570205688477, 'learning_rate': 0.00015607576840600428, 'epoch': 0.22}


 22%|██▏       | 3699/16798 [15:33<57:32,  3.79it/s]  

{'loss': 0.5239, 'grad_norm': 1.4286470413208008, 'learning_rate': 0.00015606385513461997, 'epoch': 0.22}


 22%|██▏       | 3699/16798 [15:33<57:32,  3.79it/s]

{'loss': 0.4937, 'grad_norm': 1.089198112487793, 'learning_rate': 0.00015605194186323567, 'epoch': 0.22}


 22%|██▏       | 3700/16798 [15:33<58:45,  3.72it/s]

{'loss': 0.2932, 'grad_norm': 0.8692574501037598, 'learning_rate': 0.00015604002859185133, 'epoch': 0.22}


 22%|██▏       | 3701/16798 [15:34<1:01:07,  3.57it/s]

{'loss': 2.1117, 'grad_norm': 2.083249807357788, 'learning_rate': 0.00015602811532046702, 'epoch': 0.22}


 22%|██▏       | 3702/16798 [15:34<1:03:41,  3.43it/s]

{'loss': 1.9715, 'grad_norm': 1.7438603639602661, 'learning_rate': 0.00015601620204908268, 'epoch': 0.22}


 22%|██▏       | 3703/16798 [15:34<59:01,  3.70it/s]  

{'loss': 2.2002, 'grad_norm': 2.017332077026367, 'learning_rate': 0.00015600428877769837, 'epoch': 0.22}


 22%|██▏       | 3704/16798 [15:35<55:48,  3.91it/s]

{'loss': 1.8431, 'grad_norm': 1.7475508451461792, 'learning_rate': 0.00015599237550631404, 'epoch': 0.22}


 22%|██▏       | 3705/16798 [15:35<1:00:24,  3.61it/s]

{'loss': 2.172, 'grad_norm': 1.9696539640426636, 'learning_rate': 0.00015598046223492973, 'epoch': 0.22}


 22%|██▏       | 3706/16798 [15:35<59:01,  3.70it/s]  

{'loss': 2.3265, 'grad_norm': 2.2903892993927, 'learning_rate': 0.0001559685489635454, 'epoch': 0.22}


 22%|██▏       | 3707/16798 [15:35<1:02:01,  3.52it/s]

{'loss': 2.39, 'grad_norm': 2.097806453704834, 'learning_rate': 0.00015595663569216108, 'epoch': 0.22}


 22%|██▏       | 3708/16798 [15:36<1:00:42,  3.59it/s]

{'loss': 2.3203, 'grad_norm': 1.998159646987915, 'learning_rate': 0.00015594472242077675, 'epoch': 0.22}


 22%|██▏       | 3709/16798 [15:36<1:03:01,  3.46it/s]

{'loss': 2.0916, 'grad_norm': 2.6826937198638916, 'learning_rate': 0.00015593280914939244, 'epoch': 0.22}


 22%|██▏       | 3710/16798 [15:36<58:46,  3.71it/s]  

{'loss': 1.8693, 'grad_norm': 1.7807164192199707, 'learning_rate': 0.0001559208958780081, 'epoch': 0.22}


 22%|██▏       | 3711/16798 [15:37<59:25,  3.67it/s]

{'loss': 1.7194, 'grad_norm': 1.887259840965271, 'learning_rate': 0.0001559089826066238, 'epoch': 0.22}


 22%|██▏       | 3712/16798 [15:37<58:31,  3.73it/s]

{'loss': 1.5993, 'grad_norm': 1.81727135181427, 'learning_rate': 0.00015589706933523946, 'epoch': 0.22}


 22%|██▏       | 3713/16798 [15:37<1:00:11,  3.62it/s]

{'loss': 2.0265, 'grad_norm': 1.957474708557129, 'learning_rate': 0.00015588515606385515, 'epoch': 0.22}


 22%|██▏       | 3714/16798 [15:37<56:53,  3.83it/s]  

{'loss': 1.7502, 'grad_norm': 1.9038500785827637, 'learning_rate': 0.0001558732427924708, 'epoch': 0.22}


 22%|██▏       | 3715/16798 [15:38<59:00,  3.70it/s]

{'loss': 1.2834, 'grad_norm': 1.6267471313476562, 'learning_rate': 0.0001558613295210865, 'epoch': 0.22}


 22%|██▏       | 3716/16798 [15:38<57:27,  3.79it/s]

{'loss': 1.6925, 'grad_norm': 2.536349058151245, 'learning_rate': 0.00015584941624970216, 'epoch': 0.22}


 22%|██▏       | 3717/16798 [15:38<58:32,  3.72it/s]

{'loss': 1.3843, 'grad_norm': 1.5852011442184448, 'learning_rate': 0.00015583750297831786, 'epoch': 0.22}


 22%|██▏       | 3718/16798 [15:38<1:01:43,  3.53it/s]

{'loss': 1.71, 'grad_norm': 1.9822489023208618, 'learning_rate': 0.00015582558970693352, 'epoch': 0.22}


 22%|██▏       | 3719/16798 [15:39<1:01:13,  3.56it/s]

{'loss': 2.1723, 'grad_norm': 2.3794655799865723, 'learning_rate': 0.0001558136764355492, 'epoch': 0.22}


 22%|██▏       | 3720/16798 [15:39<58:04,  3.75it/s]  

{'loss': 1.3795, 'grad_norm': 1.6200977563858032, 'learning_rate': 0.00015580176316416487, 'epoch': 0.22}


 22%|██▏       | 3721/16798 [15:39<1:00:55,  3.58it/s]

{'loss': 1.7054, 'grad_norm': 1.9160380363464355, 'learning_rate': 0.00015578984989278056, 'epoch': 0.22}


 22%|██▏       | 3722/16798 [15:40<1:00:47,  3.58it/s]

{'loss': 1.433, 'grad_norm': 1.5728942155838013, 'learning_rate': 0.00015577793662139623, 'epoch': 0.22}


 22%|██▏       | 3723/16798 [15:40<1:02:27,  3.49it/s]

{'loss': 1.2543, 'grad_norm': 1.7180750370025635, 'learning_rate': 0.00015576602335001192, 'epoch': 0.22}


 22%|██▏       | 3724/16798 [15:40<59:29,  3.66it/s]  

{'loss': 1.5717, 'grad_norm': 1.8445461988449097, 'learning_rate': 0.00015575411007862758, 'epoch': 0.22}


 22%|██▏       | 3725/16798 [15:40<1:00:50,  3.58it/s]

{'loss': 1.4923, 'grad_norm': 1.8144333362579346, 'learning_rate': 0.00015574219680724327, 'epoch': 0.22}


 22%|██▏       | 3726/16798 [15:41<1:04:52,  3.36it/s]

{'loss': 1.63, 'grad_norm': 1.6790540218353271, 'learning_rate': 0.00015573028353585894, 'epoch': 0.22}


 22%|██▏       | 3727/16798 [15:41<1:04:32,  3.38it/s]

{'loss': 1.3298, 'grad_norm': 1.7653391361236572, 'learning_rate': 0.00015571837026447463, 'epoch': 0.22}


 22%|██▏       | 3728/16798 [15:41<1:00:37,  3.59it/s]

{'loss': 1.4867, 'grad_norm': 1.8121367692947388, 'learning_rate': 0.0001557064569930903, 'epoch': 0.22}


 22%|██▏       | 3729/16798 [15:42<1:03:54,  3.41it/s]

{'loss': 1.3394, 'grad_norm': 1.631737232208252, 'learning_rate': 0.000155694543721706, 'epoch': 0.22}


 22%|██▏       | 3730/16798 [15:42<1:02:49,  3.47it/s]

{'loss': 2.0052, 'grad_norm': 2.2892980575561523, 'learning_rate': 0.00015568263045032167, 'epoch': 0.22}


 22%|██▏       | 3731/16798 [15:42<1:04:03,  3.40it/s]

{'loss': 1.4065, 'grad_norm': 1.5888429880142212, 'learning_rate': 0.00015567071717893736, 'epoch': 0.22}


 22%|██▏       | 3732/16798 [15:42<1:00:02,  3.63it/s]

{'loss': 1.6547, 'grad_norm': 2.317909002304077, 'learning_rate': 0.00015565880390755303, 'epoch': 0.22}


 22%|██▏       | 3733/16798 [15:43<1:01:32,  3.54it/s]

{'loss': 1.2203, 'grad_norm': 1.6743135452270508, 'learning_rate': 0.00015564689063616872, 'epoch': 0.22}


 22%|██▏       | 3734/16798 [15:43<58:00,  3.75it/s]  

{'loss': 1.4612, 'grad_norm': 1.8361148834228516, 'learning_rate': 0.00015563497736478438, 'epoch': 0.22}


 22%|██▏       | 3735/16798 [15:43<1:00:00,  3.63it/s]

{'loss': 1.3383, 'grad_norm': 1.748981237411499, 'learning_rate': 0.00015562306409340007, 'epoch': 0.22}


 22%|██▏       | 3736/16798 [15:43<57:22,  3.79it/s]  

{'loss': 1.4954, 'grad_norm': 1.9682611227035522, 'learning_rate': 0.00015561115082201574, 'epoch': 0.22}


 22%|██▏       | 3737/16798 [15:44<59:55,  3.63it/s]

{'loss': 1.3311, 'grad_norm': 1.7306609153747559, 'learning_rate': 0.00015559923755063143, 'epoch': 0.22}


 22%|██▏       | 3738/16798 [15:44<56:36,  3.85it/s]

{'loss': 1.3527, 'grad_norm': 1.7417787313461304, 'learning_rate': 0.0001555873242792471, 'epoch': 0.22}


 22%|██▏       | 3739/16798 [15:44<1:00:34,  3.59it/s]

{'loss': 0.7422, 'grad_norm': 1.4127919673919678, 'learning_rate': 0.00015557541100786278, 'epoch': 0.22}


 22%|██▏       | 3740/16798 [15:45<57:59,  3.75it/s]  

{'loss': 1.1689, 'grad_norm': 1.7118446826934814, 'learning_rate': 0.00015556349773647844, 'epoch': 0.22}


 22%|██▏       | 3741/16798 [15:45<59:50,  3.64it/s]

{'loss': 1.6339, 'grad_norm': 2.1360220909118652, 'learning_rate': 0.00015555158446509414, 'epoch': 0.22}


 22%|██▏       | 3742/16798 [15:45<58:30,  3.72it/s]

{'loss': 1.2174, 'grad_norm': 1.7762588262557983, 'learning_rate': 0.0001555396711937098, 'epoch': 0.22}


 22%|██▏       | 3743/16798 [15:45<1:00:51,  3.58it/s]

{'loss': 0.6451, 'grad_norm': 1.2578797340393066, 'learning_rate': 0.0001555277579223255, 'epoch': 0.22}


 22%|██▏       | 3744/16798 [15:46<1:00:30,  3.60it/s]

{'loss': 0.8228, 'grad_norm': 1.3950859308242798, 'learning_rate': 0.00015551584465094115, 'epoch': 0.22}


 22%|██▏       | 3745/16798 [15:46<1:02:46,  3.47it/s]

{'loss': 1.0296, 'grad_norm': 2.2413454055786133, 'learning_rate': 0.00015550393137955684, 'epoch': 0.22}


 22%|██▏       | 3746/16798 [15:46<1:00:37,  3.59it/s]

{'loss': 0.6474, 'grad_norm': 1.2537686824798584, 'learning_rate': 0.0001554920181081725, 'epoch': 0.22}


 22%|██▏       | 3747/16798 [15:47<1:03:02,  3.45it/s]

{'loss': 0.3021, 'grad_norm': 0.7745558619499207, 'learning_rate': 0.0001554801048367882, 'epoch': 0.22}


 22%|██▏       | 3748/16798 [15:47<1:02:34,  3.48it/s]

{'loss': 0.2825, 'grad_norm': 0.7746120691299438, 'learning_rate': 0.00015546819156540386, 'epoch': 0.22}


 22%|██▏       | 3749/16798 [15:47<1:03:22,  3.43it/s]

{'loss': 0.3244, 'grad_norm': 0.8585967421531677, 'learning_rate': 0.00015545627829401955, 'epoch': 0.22}


 22%|██▏       | 3750/16798 [15:47<1:00:59,  3.57it/s]

{'loss': 0.3813, 'grad_norm': 0.9430391788482666, 'learning_rate': 0.00015544436502263522, 'epoch': 0.22}


 22%|██▏       | 3752/16798 [15:48<58:20,  3.73it/s]  

{'loss': 1.5084, 'grad_norm': 1.465043306350708, 'learning_rate': 0.0001554324517512509, 'epoch': 0.22}


 22%|██▏       | 3752/16798 [15:48<58:20,  3.73it/s]

{'loss': 2.1685, 'grad_norm': 1.9339638948440552, 'learning_rate': 0.00015542053847986657, 'epoch': 0.22}


 22%|██▏       | 3753/16798 [15:48<55:04,  3.95it/s]

{'loss': 2.1232, 'grad_norm': 2.5339925289154053, 'learning_rate': 0.00015540862520848226, 'epoch': 0.22}


 22%|██▏       | 3754/16798 [15:48<56:57,  3.82it/s]

{'loss': 2.4016, 'grad_norm': 2.07026743888855, 'learning_rate': 0.00015539671193709793, 'epoch': 0.22}


 22%|██▏       | 3755/16798 [15:49<57:52,  3.76it/s]

{'loss': 2.4722, 'grad_norm': 2.2654480934143066, 'learning_rate': 0.00015538479866571362, 'epoch': 0.22}


 22%|██▏       | 3756/16798 [15:49<57:39,  3.77it/s]

{'loss': 2.2719, 'grad_norm': 2.1006009578704834, 'learning_rate': 0.00015537288539432928, 'epoch': 0.22}


 22%|██▏       | 3757/16798 [15:49<57:28,  3.78it/s]

{'loss': 2.6029, 'grad_norm': 2.355011224746704, 'learning_rate': 0.00015536097212294497, 'epoch': 0.22}


 22%|██▏       | 3758/16798 [15:50<59:40,  3.64it/s]

{'loss': 1.7834, 'grad_norm': 1.6752411127090454, 'learning_rate': 0.00015534905885156063, 'epoch': 0.22}


 22%|██▏       | 3759/16798 [15:50<1:04:01,  3.39it/s]

{'loss': 2.0715, 'grad_norm': 1.9994711875915527, 'learning_rate': 0.00015533714558017633, 'epoch': 0.22}


 22%|██▏       | 3760/16798 [15:50<1:05:46,  3.30it/s]

{'loss': 1.8899, 'grad_norm': 1.8003910779953003, 'learning_rate': 0.00015532523230879202, 'epoch': 0.22}


 22%|██▏       | 3761/16798 [15:50<1:00:53,  3.57it/s]

{'loss': 2.1029, 'grad_norm': 2.0701913833618164, 'learning_rate': 0.00015531331903740768, 'epoch': 0.22}


 22%|██▏       | 3762/16798 [15:51<1:04:01,  3.39it/s]

{'loss': 1.6515, 'grad_norm': 1.804187297821045, 'learning_rate': 0.00015530140576602337, 'epoch': 0.22}


 22%|██▏       | 3763/16798 [15:51<58:58,  3.68it/s]  

{'loss': 1.8155, 'grad_norm': 1.7573455572128296, 'learning_rate': 0.00015528949249463903, 'epoch': 0.22}


 22%|██▏       | 3764/16798 [15:51<55:32,  3.91it/s]

{'loss': 1.5519, 'grad_norm': 1.6834577322006226, 'learning_rate': 0.00015527757922325473, 'epoch': 0.22}


 22%|██▏       | 3765/16798 [15:51<58:16,  3.73it/s]

{'loss': 1.7667, 'grad_norm': 2.053011417388916, 'learning_rate': 0.0001552656659518704, 'epoch': 0.22}


 22%|██▏       | 3766/16798 [15:52<56:04,  3.87it/s]

{'loss': 2.0165, 'grad_norm': 2.1855523586273193, 'learning_rate': 0.00015525375268048608, 'epoch': 0.22}


 22%|██▏       | 3767/16798 [15:52<59:18,  3.66it/s]

{'loss': 1.7129, 'grad_norm': 1.870976209640503, 'learning_rate': 0.00015524183940910174, 'epoch': 0.22}


 22%|██▏       | 3768/16798 [15:52<1:00:03,  3.62it/s]

{'loss': 2.1424, 'grad_norm': 2.240351438522339, 'learning_rate': 0.00015522992613771743, 'epoch': 0.22}


 22%|██▏       | 3769/16798 [15:53<1:04:27,  3.37it/s]

{'loss': 1.7128, 'grad_norm': 2.482632637023926, 'learning_rate': 0.0001552180128663331, 'epoch': 0.22}


 22%|██▏       | 3770/16798 [15:53<1:00:34,  3.59it/s]

{'loss': 1.5542, 'grad_norm': 1.8217628002166748, 'learning_rate': 0.0001552060995949488, 'epoch': 0.22}


 22%|██▏       | 3771/16798 [15:53<1:02:26,  3.48it/s]

{'loss': 1.5521, 'grad_norm': 1.7840900421142578, 'learning_rate': 0.00015519418632356445, 'epoch': 0.22}


 22%|██▏       | 3772/16798 [15:53<58:12,  3.73it/s]  

{'loss': 1.5849, 'grad_norm': 1.8259087800979614, 'learning_rate': 0.00015518227305218014, 'epoch': 0.22}


 22%|██▏       | 3773/16798 [15:54<1:02:02,  3.50it/s]

{'loss': 1.6577, 'grad_norm': 1.8438411951065063, 'learning_rate': 0.0001551703597807958, 'epoch': 0.22}


 22%|██▏       | 3774/16798 [15:54<1:00:12,  3.61it/s]

{'loss': 1.898, 'grad_norm': 2.144275426864624, 'learning_rate': 0.0001551584465094115, 'epoch': 0.22}


 22%|██▏       | 3775/16798 [15:54<1:03:28,  3.42it/s]

{'loss': 1.921, 'grad_norm': 2.4622039794921875, 'learning_rate': 0.00015514653323802716, 'epoch': 0.22}


 22%|██▏       | 3776/16798 [15:55<1:00:13,  3.60it/s]

{'loss': 1.5896, 'grad_norm': 1.75055730342865, 'learning_rate': 0.00015513461996664285, 'epoch': 0.22}


 22%|██▏       | 3777/16798 [15:55<1:01:53,  3.51it/s]

{'loss': 1.4618, 'grad_norm': 1.8597874641418457, 'learning_rate': 0.00015512270669525852, 'epoch': 0.22}


 22%|██▏       | 3778/16798 [15:55<58:12,  3.73it/s]  

{'loss': 1.567, 'grad_norm': 2.1962878704071045, 'learning_rate': 0.0001551107934238742, 'epoch': 0.22}


 22%|██▏       | 3779/16798 [15:55<1:05:14,  3.33it/s]

{'loss': 1.2388, 'grad_norm': 1.6102946996688843, 'learning_rate': 0.00015509888015248987, 'epoch': 0.22}


 23%|██▎       | 3780/16798 [15:56<1:04:45,  3.35it/s]

{'loss': 1.6358, 'grad_norm': 1.853920340538025, 'learning_rate': 0.00015508696688110556, 'epoch': 0.23}


 23%|██▎       | 3781/16798 [15:56<1:03:33,  3.41it/s]

{'loss': 1.7664, 'grad_norm': 2.2192492485046387, 'learning_rate': 0.00015507505360972122, 'epoch': 0.23}


 23%|██▎       | 3782/16798 [15:56<1:01:51,  3.51it/s]

{'loss': 1.3873, 'grad_norm': 1.831030249595642, 'learning_rate': 0.00015506314033833691, 'epoch': 0.23}


 23%|██▎       | 3783/16798 [15:57<1:06:07,  3.28it/s]

{'loss': 0.8972, 'grad_norm': 1.6681721210479736, 'learning_rate': 0.00015505122706695258, 'epoch': 0.23}


 23%|██▎       | 3784/16798 [15:57<1:01:57,  3.50it/s]

{'loss': 1.7662, 'grad_norm': 2.1616790294647217, 'learning_rate': 0.00015503931379556827, 'epoch': 0.23}


 23%|██▎       | 3785/16798 [15:57<1:06:11,  3.28it/s]

{'loss': 1.3237, 'grad_norm': 2.537466526031494, 'learning_rate': 0.00015502740052418393, 'epoch': 0.23}


 23%|██▎       | 3786/16798 [15:58<1:08:00,  3.19it/s]

{'loss': 1.2742, 'grad_norm': 1.8196775913238525, 'learning_rate': 0.00015501548725279962, 'epoch': 0.23}


 23%|██▎       | 3787/16798 [15:58<1:08:24,  3.17it/s]

{'loss': 1.2864, 'grad_norm': 1.6203715801239014, 'learning_rate': 0.0001550035739814153, 'epoch': 0.23}


 23%|██▎       | 3788/16798 [15:58<1:03:07,  3.43it/s]

{'loss': 1.2025, 'grad_norm': 1.6362918615341187, 'learning_rate': 0.00015499166071003098, 'epoch': 0.23}


 23%|██▎       | 3790/16798 [15:59<58:43,  3.69it/s]  

{'loss': 1.4988, 'grad_norm': 1.818626880645752, 'learning_rate': 0.00015497974743864664, 'epoch': 0.23}


 23%|██▎       | 3790/16798 [15:59<58:43,  3.69it/s]

{'loss': 1.0495, 'grad_norm': 1.5018701553344727, 'learning_rate': 0.00015496783416726233, 'epoch': 0.23}


 23%|██▎       | 3791/16798 [15:59<55:52,  3.88it/s]

{'loss': 1.0948, 'grad_norm': 1.7991410493850708, 'learning_rate': 0.00015495592089587802, 'epoch': 0.23}


 23%|██▎       | 3793/16798 [15:59<54:31,  3.97it/s]

{'loss': 1.3403, 'grad_norm': 2.3612172603607178, 'learning_rate': 0.00015494400762449371, 'epoch': 0.23}


 23%|██▎       | 3794/16798 [16:00<51:30,  4.21it/s]

{'loss': 0.6798, 'grad_norm': 1.2421931028366089, 'learning_rate': 0.00015493209435310938, 'epoch': 0.23}


 23%|██▎       | 3794/16798 [16:00<51:30,  4.21it/s]

{'loss': 0.6878, 'grad_norm': 1.27488112449646, 'learning_rate': 0.00015492018108172507, 'epoch': 0.23}


 23%|██▎       | 3795/16798 [16:00<53:35,  4.04it/s]

{'loss': 0.3727, 'grad_norm': 0.9193780422210693, 'learning_rate': 0.00015490826781034073, 'epoch': 0.23}


 23%|██▎       | 3796/16798 [16:00<55:51,  3.88it/s]

{'loss': 0.4607, 'grad_norm': 1.3098933696746826, 'learning_rate': 0.00015489635453895642, 'epoch': 0.23}


 23%|██▎       | 3797/16798 [16:00<58:40,  3.69it/s]

{'loss': 0.7363, 'grad_norm': 1.577948808670044, 'learning_rate': 0.0001548844412675721, 'epoch': 0.23}


 23%|██▎       | 3798/16798 [16:01<56:29,  3.84it/s]

{'loss': 0.6941, 'grad_norm': 1.41446852684021, 'learning_rate': 0.00015487252799618778, 'epoch': 0.23}


 23%|██▎       | 3799/16798 [16:01<58:09,  3.72it/s]

{'loss': 0.6952, 'grad_norm': 1.5030219554901123, 'learning_rate': 0.00015486061472480344, 'epoch': 0.23}


 23%|██▎       | 3800/16798 [16:01<57:50,  3.74it/s]

{'loss': 0.741, 'grad_norm': 1.7317051887512207, 'learning_rate': 0.00015484870145341913, 'epoch': 0.23}


 23%|██▎       | 3801/16798 [16:02<1:00:53,  3.56it/s]

{'loss': 2.1377, 'grad_norm': 1.877733826637268, 'learning_rate': 0.0001548367881820348, 'epoch': 0.23}


 23%|██▎       | 3802/16798 [16:02<58:29,  3.70it/s]  

{'loss': 1.9207, 'grad_norm': 1.9249759912490845, 'learning_rate': 0.00015482487491065049, 'epoch': 0.23}


 23%|██▎       | 3803/16798 [16:02<1:01:23,  3.53it/s]

{'loss': 2.038, 'grad_norm': 1.668656349182129, 'learning_rate': 0.00015481296163926615, 'epoch': 0.23}


 23%|██▎       | 3804/16798 [16:02<58:02,  3.73it/s]  

{'loss': 1.8088, 'grad_norm': 2.013019323348999, 'learning_rate': 0.00015480104836788184, 'epoch': 0.23}


 23%|██▎       | 3805/16798 [16:03<1:03:39,  3.40it/s]

{'loss': 2.1171, 'grad_norm': 2.1108129024505615, 'learning_rate': 0.0001547891350964975, 'epoch': 0.23}


 23%|██▎       | 3806/16798 [16:03<58:25,  3.71it/s]  

{'loss': 1.9527, 'grad_norm': 1.7579519748687744, 'learning_rate': 0.0001547772218251132, 'epoch': 0.23}


 23%|██▎       | 3807/16798 [16:03<56:50,  3.81it/s]

{'loss': 1.7315, 'grad_norm': 1.6943877935409546, 'learning_rate': 0.00015476530855372886, 'epoch': 0.23}


 23%|██▎       | 3809/16798 [16:04<54:59,  3.94it/s]

{'loss': 2.0799, 'grad_norm': 2.1052234172821045, 'learning_rate': 0.00015475339528234455, 'epoch': 0.23}


 23%|██▎       | 3809/16798 [16:04<54:59,  3.94it/s]

{'loss': 1.4419, 'grad_norm': 1.976807713508606, 'learning_rate': 0.0001547414820109602, 'epoch': 0.23}


 23%|██▎       | 3810/16798 [16:04<55:46,  3.88it/s]

{'loss': 1.9514, 'grad_norm': 2.0362610816955566, 'learning_rate': 0.0001547295687395759, 'epoch': 0.23}


 23%|██▎       | 3811/16798 [16:04<54:35,  3.96it/s]

{'loss': 1.4451, 'grad_norm': 1.845726490020752, 'learning_rate': 0.00015471765546819157, 'epoch': 0.23}


 23%|██▎       | 3812/16798 [16:04<55:59,  3.86it/s]

{'loss': 1.7652, 'grad_norm': 2.074401617050171, 'learning_rate': 0.00015470574219680726, 'epoch': 0.23}


 23%|██▎       | 3813/16798 [16:05<55:54,  3.87it/s]

{'loss': 1.5688, 'grad_norm': 1.8088518381118774, 'learning_rate': 0.00015469382892542292, 'epoch': 0.23}


 23%|██▎       | 3814/16798 [16:05<59:36,  3.63it/s]

{'loss': 1.1683, 'grad_norm': 1.5131773948669434, 'learning_rate': 0.0001546819156540386, 'epoch': 0.23}


 23%|██▎       | 3815/16798 [16:05<55:19,  3.91it/s]

{'loss': 1.637, 'grad_norm': 2.23646879196167, 'learning_rate': 0.00015467000238265428, 'epoch': 0.23}


 23%|██▎       | 3816/16798 [16:05<52:48,  4.10it/s]

{'loss': 1.7793, 'grad_norm': 1.949714183807373, 'learning_rate': 0.00015465808911126997, 'epoch': 0.23}


 23%|██▎       | 3817/16798 [16:06<56:55,  3.80it/s]

{'loss': 1.4183, 'grad_norm': 1.542123794555664, 'learning_rate': 0.00015464617583988563, 'epoch': 0.23}


 23%|██▎       | 3818/16798 [16:06<55:56,  3.87it/s]

{'loss': 1.4967, 'grad_norm': 1.832235336303711, 'learning_rate': 0.00015463426256850132, 'epoch': 0.23}


 23%|██▎       | 3819/16798 [16:06<58:41,  3.69it/s]

{'loss': 1.5831, 'grad_norm': 1.8198267221450806, 'learning_rate': 0.00015462234929711699, 'epoch': 0.23}


 23%|██▎       | 3820/16798 [16:07<58:05,  3.72it/s]

{'loss': 1.584, 'grad_norm': 1.7207064628601074, 'learning_rate': 0.00015461043602573268, 'epoch': 0.23}


 23%|██▎       | 3821/16798 [16:07<1:00:28,  3.58it/s]

{'loss': 1.7199, 'grad_norm': 1.9028897285461426, 'learning_rate': 0.00015459852275434834, 'epoch': 0.23}


 23%|██▎       | 3822/16798 [16:07<56:16,  3.84it/s]  

{'loss': 1.2729, 'grad_norm': 1.3624579906463623, 'learning_rate': 0.00015458660948296403, 'epoch': 0.23}


 23%|██▎       | 3823/16798 [16:07<58:59,  3.67it/s]

{'loss': 1.4812, 'grad_norm': 1.692225456237793, 'learning_rate': 0.00015457469621157972, 'epoch': 0.23}


 23%|██▎       | 3824/16798 [16:08<1:00:06,  3.60it/s]

{'loss': 1.54, 'grad_norm': 1.7220957279205322, 'learning_rate': 0.00015456278294019538, 'epoch': 0.23}


 23%|██▎       | 3825/16798 [16:08<1:02:42,  3.45it/s]

{'loss': 1.53, 'grad_norm': 1.7289246320724487, 'learning_rate': 0.00015455086966881108, 'epoch': 0.23}


 23%|██▎       | 3826/16798 [16:08<57:49,  3.74it/s]  

{'loss': 1.4441, 'grad_norm': 2.056682586669922, 'learning_rate': 0.00015453895639742674, 'epoch': 0.23}


 23%|██▎       | 3827/16798 [16:08<54:22,  3.98it/s]

{'loss': 1.7116, 'grad_norm': 1.9660309553146362, 'learning_rate': 0.00015452704312604243, 'epoch': 0.23}


 23%|██▎       | 3828/16798 [16:09<59:19,  3.64it/s]

{'loss': 1.5072, 'grad_norm': 2.1127729415893555, 'learning_rate': 0.0001545151298546581, 'epoch': 0.23}


 23%|██▎       | 3829/16798 [16:09<57:53,  3.73it/s]

{'loss': 1.4109, 'grad_norm': 1.7955000400543213, 'learning_rate': 0.00015450321658327378, 'epoch': 0.23}


 23%|██▎       | 3830/16798 [16:09<1:01:06,  3.54it/s]

{'loss': 1.4189, 'grad_norm': 1.7635763883590698, 'learning_rate': 0.00015449130331188945, 'epoch': 0.23}


 23%|██▎       | 3831/16798 [16:10<55:40,  3.88it/s]  

{'loss': 1.953, 'grad_norm': 2.612504482269287, 'learning_rate': 0.00015447939004050514, 'epoch': 0.23}


 23%|██▎       | 3832/16798 [16:10<54:38,  3.96it/s]

{'loss': 1.471, 'grad_norm': 1.994657039642334, 'learning_rate': 0.0001544674767691208, 'epoch': 0.23}


 23%|██▎       | 3833/16798 [16:10<59:12,  3.65it/s]

{'loss': 1.3357, 'grad_norm': 1.5234777927398682, 'learning_rate': 0.0001544555634977365, 'epoch': 0.23}


 23%|██▎       | 3834/16798 [16:10<56:33,  3.82it/s]

{'loss': 1.1213, 'grad_norm': 1.8466156721115112, 'learning_rate': 0.00015444365022635216, 'epoch': 0.23}


 23%|██▎       | 3835/16798 [16:11<59:28,  3.63it/s]

{'loss': 1.5275, 'grad_norm': 1.732319712638855, 'learning_rate': 0.00015443173695496785, 'epoch': 0.23}


 23%|██▎       | 3836/16798 [16:11<55:51,  3.87it/s]

{'loss': 1.6182, 'grad_norm': 1.913062334060669, 'learning_rate': 0.0001544198236835835, 'epoch': 0.23}


 23%|██▎       | 3837/16798 [16:11<57:02,  3.79it/s]

{'loss': 1.3934, 'grad_norm': 1.7954769134521484, 'learning_rate': 0.0001544079104121992, 'epoch': 0.23}


 23%|██▎       | 3839/16798 [16:12<53:52,  4.01it/s]

{'loss': 1.5763, 'grad_norm': 1.9015744924545288, 'learning_rate': 0.00015439599714081487, 'epoch': 0.23}


 23%|██▎       | 3839/16798 [16:12<53:52,  4.01it/s]

{'loss': 1.67, 'grad_norm': 2.3315882682800293, 'learning_rate': 0.00015438408386943056, 'epoch': 0.23}


 23%|██▎       | 3840/16798 [16:12<54:34,  3.96it/s]

{'loss': 1.5784, 'grad_norm': 2.0069901943206787, 'learning_rate': 0.00015437217059804622, 'epoch': 0.23}


 23%|██▎       | 3841/16798 [16:12<53:51,  4.01it/s]

{'loss': 1.3185, 'grad_norm': 1.8990600109100342, 'learning_rate': 0.0001543602573266619, 'epoch': 0.23}


 23%|██▎       | 3842/16798 [16:12<57:15,  3.77it/s]

{'loss': 1.1081, 'grad_norm': 1.583125114440918, 'learning_rate': 0.00015434834405527757, 'epoch': 0.23}


 23%|██▎       | 3843/16798 [16:13<57:57,  3.72it/s]

{'loss': 0.8284, 'grad_norm': 9.82451057434082, 'learning_rate': 0.00015433643078389327, 'epoch': 0.23}


 23%|██▎       | 3844/16798 [16:13<1:00:41,  3.56it/s]

{'loss': 1.4508, 'grad_norm': 1.9720059633255005, 'learning_rate': 0.00015432451751250893, 'epoch': 0.23}


 23%|██▎       | 3845/16798 [16:13<56:50,  3.80it/s]  

{'loss': 0.7694, 'grad_norm': 1.6233693361282349, 'learning_rate': 0.00015431260424112462, 'epoch': 0.23}


 23%|██▎       | 3846/16798 [16:14<1:00:40,  3.56it/s]

{'loss': 0.3698, 'grad_norm': 0.8558306694030762, 'learning_rate': 0.00015430069096974028, 'epoch': 0.23}


 23%|██▎       | 3847/16798 [16:14<57:08,  3.78it/s]  

{'loss': 0.5692, 'grad_norm': 1.07568359375, 'learning_rate': 0.00015428877769835597, 'epoch': 0.23}


 23%|██▎       | 3849/16798 [16:14<54:02,  3.99it/s]

{'loss': 0.4449, 'grad_norm': 1.0578042268753052, 'learning_rate': 0.00015427686442697164, 'epoch': 0.23}


 23%|██▎       | 3850/16798 [16:14<50:38,  4.26it/s]

{'loss': 0.1992, 'grad_norm': 0.8836632966995239, 'learning_rate': 0.00015426495115558733, 'epoch': 0.23}


 23%|██▎       | 3850/16798 [16:14<50:38,  4.26it/s]

{'loss': 0.296, 'grad_norm': 0.9674335718154907, 'learning_rate': 0.000154253037884203, 'epoch': 0.23}


 23%|██▎       | 3851/16798 [16:15<55:32,  3.89it/s]

{'loss': 1.9751, 'grad_norm': 1.7282366752624512, 'learning_rate': 0.00015424112461281868, 'epoch': 0.23}


 23%|██▎       | 3852/16798 [16:15<53:14,  4.05it/s]

{'loss': 1.5659, 'grad_norm': 1.4906643629074097, 'learning_rate': 0.00015422921134143435, 'epoch': 0.23}


 23%|██▎       | 3853/16798 [16:15<53:48,  4.01it/s]

{'loss': 2.4217, 'grad_norm': 2.060244083404541, 'learning_rate': 0.00015421729807005006, 'epoch': 0.23}


 23%|██▎       | 3854/16798 [16:16<1:00:05,  3.59it/s]

{'loss': 2.0971, 'grad_norm': 2.3771822452545166, 'learning_rate': 0.00015420538479866573, 'epoch': 0.23}


 23%|██▎       | 3855/16798 [16:16<58:33,  3.68it/s]  

{'loss': 2.1129, 'grad_norm': 2.0724148750305176, 'learning_rate': 0.00015419347152728142, 'epoch': 0.23}


 23%|██▎       | 3856/16798 [16:16<59:38,  3.62it/s]

{'loss': 2.4748, 'grad_norm': 2.018373966217041, 'learning_rate': 0.00015418155825589708, 'epoch': 0.23}


 23%|██▎       | 3857/16798 [16:16<59:22,  3.63it/s]

{'loss': 1.6498, 'grad_norm': 2.083676815032959, 'learning_rate': 0.00015416964498451277, 'epoch': 0.23}


 23%|██▎       | 3858/16798 [16:17<1:00:34,  3.56it/s]

{'loss': 1.8059, 'grad_norm': 1.9379881620407104, 'learning_rate': 0.00015415773171312844, 'epoch': 0.23}


 23%|██▎       | 3859/16798 [16:17<56:13,  3.84it/s]  

{'loss': 1.9338, 'grad_norm': 1.9516911506652832, 'learning_rate': 0.00015414581844174413, 'epoch': 0.23}


 23%|██▎       | 3860/16798 [16:17<1:00:20,  3.57it/s]

{'loss': 1.8226, 'grad_norm': 1.9524216651916504, 'learning_rate': 0.0001541339051703598, 'epoch': 0.23}


 23%|██▎       | 3861/16798 [16:18<58:23,  3.69it/s]  

{'loss': 2.1934, 'grad_norm': 2.016230344772339, 'learning_rate': 0.00015412199189897548, 'epoch': 0.23}


 23%|██▎       | 3862/16798 [16:18<58:24,  3.69it/s]

{'loss': 1.7653, 'grad_norm': 2.047769069671631, 'learning_rate': 0.00015411007862759115, 'epoch': 0.23}


 23%|██▎       | 3863/16798 [16:18<56:07,  3.84it/s]

{'loss': 1.4559, 'grad_norm': 1.8060966730117798, 'learning_rate': 0.00015409816535620684, 'epoch': 0.23}


 23%|██▎       | 3864/16798 [16:18<57:13,  3.77it/s]

{'loss': 1.5783, 'grad_norm': 1.778272271156311, 'learning_rate': 0.0001540862520848225, 'epoch': 0.23}


 23%|██▎       | 3865/16798 [16:19<57:24,  3.75it/s]

{'loss': 1.5966, 'grad_norm': 1.7547663450241089, 'learning_rate': 0.0001540743388134382, 'epoch': 0.23}


 23%|██▎       | 3866/16798 [16:19<58:19,  3.70it/s]

{'loss': 1.5715, 'grad_norm': 1.7569347620010376, 'learning_rate': 0.00015406242554205385, 'epoch': 0.23}


 23%|██▎       | 3867/16798 [16:19<55:23,  3.89it/s]

{'loss': 1.4829, 'grad_norm': 2.2446341514587402, 'learning_rate': 0.00015405051227066955, 'epoch': 0.23}


 23%|██▎       | 3868/16798 [16:19<57:28,  3.75it/s]

{'loss': 1.2309, 'grad_norm': 1.8556597232818604, 'learning_rate': 0.0001540385989992852, 'epoch': 0.23}


 23%|██▎       | 3869/16798 [16:20<56:09,  3.84it/s]

{'loss': 1.9419, 'grad_norm': 2.3748979568481445, 'learning_rate': 0.0001540266857279009, 'epoch': 0.23}


 23%|██▎       | 3870/16798 [16:20<59:38,  3.61it/s]

{'loss': 1.4488, 'grad_norm': 1.6999279260635376, 'learning_rate': 0.00015401477245651656, 'epoch': 0.23}


 23%|██▎       | 3871/16798 [16:20<57:55,  3.72it/s]

{'loss': 1.6395, 'grad_norm': 2.1056199073791504, 'learning_rate': 0.00015400285918513225, 'epoch': 0.23}


 23%|██▎       | 3872/16798 [16:20<58:43,  3.67it/s]

{'loss': 1.9895, 'grad_norm': 2.1063413619995117, 'learning_rate': 0.00015399094591374792, 'epoch': 0.23}


 23%|██▎       | 3873/16798 [16:21<58:48,  3.66it/s]

{'loss': 1.6305, 'grad_norm': 2.1130688190460205, 'learning_rate': 0.0001539790326423636, 'epoch': 0.23}


 23%|██▎       | 3874/16798 [16:21<1:02:51,  3.43it/s]

{'loss': 1.5139, 'grad_norm': 2.2423527240753174, 'learning_rate': 0.00015396711937097927, 'epoch': 0.23}


 23%|██▎       | 3875/16798 [16:21<1:00:24,  3.57it/s]

{'loss': 1.5384, 'grad_norm': 1.7066879272460938, 'learning_rate': 0.00015395520609959496, 'epoch': 0.23}


 23%|██▎       | 3876/16798 [16:22<1:01:25,  3.51it/s]

{'loss': 1.1361, 'grad_norm': 2.2566845417022705, 'learning_rate': 0.00015394329282821063, 'epoch': 0.23}


 23%|██▎       | 3877/16798 [16:22<57:17,  3.76it/s]  

{'loss': 1.6928, 'grad_norm': 2.103624105453491, 'learning_rate': 0.00015393137955682632, 'epoch': 0.23}


 23%|██▎       | 3878/16798 [16:22<54:35,  3.94it/s]

{'loss': 1.111, 'grad_norm': 1.661865234375, 'learning_rate': 0.00015391946628544198, 'epoch': 0.23}


 23%|██▎       | 3880/16798 [16:23<55:59,  3.84it/s]  

{'loss': 1.2657, 'grad_norm': 1.808929681777954, 'learning_rate': 0.00015390755301405767, 'epoch': 0.23}


 23%|██▎       | 3880/16798 [16:23<55:59,  3.84it/s]

{'loss': 1.2719, 'grad_norm': 2.057654619216919, 'learning_rate': 0.00015389563974267334, 'epoch': 0.23}


 23%|██▎       | 3882/16798 [16:23<53:19,  4.04it/s]

{'loss': 1.7299, 'grad_norm': 2.017843008041382, 'learning_rate': 0.00015388372647128903, 'epoch': 0.23}


 23%|██▎       | 3882/16798 [16:23<53:19,  4.04it/s]

{'loss': 1.2742, 'grad_norm': 2.024991750717163, 'learning_rate': 0.0001538718131999047, 'epoch': 0.23}


 23%|██▎       | 3883/16798 [16:23<50:31,  4.26it/s]

{'loss': 1.0012, 'grad_norm': 1.4605679512023926, 'learning_rate': 0.00015385989992852038, 'epoch': 0.23}


 23%|██▎       | 3884/16798 [16:24<54:58,  3.92it/s]

{'loss': 1.0746, 'grad_norm': 1.412183165550232, 'learning_rate': 0.00015384798665713607, 'epoch': 0.23}


 23%|██▎       | 3885/16798 [16:24<53:49,  4.00it/s]

{'loss': 1.1903, 'grad_norm': 1.8805427551269531, 'learning_rate': 0.00015383607338575174, 'epoch': 0.23}


 23%|██▎       | 3886/16798 [16:24<54:52,  3.92it/s]

{'loss': 1.3718, 'grad_norm': 1.7378685474395752, 'learning_rate': 0.00015382416011436743, 'epoch': 0.23}


 23%|██▎       | 3887/16798 [16:24<57:21,  3.75it/s]

{'loss': 1.4008, 'grad_norm': 2.0496644973754883, 'learning_rate': 0.0001538122468429831, 'epoch': 0.23}


 23%|██▎       | 3888/16798 [16:25<1:00:55,  3.53it/s]

{'loss': 1.1591, 'grad_norm': 1.682701826095581, 'learning_rate': 0.00015380033357159878, 'epoch': 0.23}


 23%|██▎       | 3889/16798 [16:25<57:33,  3.74it/s]  

{'loss': 1.0519, 'grad_norm': 1.5283344984054565, 'learning_rate': 0.00015378842030021444, 'epoch': 0.23}


 23%|██▎       | 3890/16798 [16:25<57:10,  3.76it/s]

{'loss': 0.7467, 'grad_norm': 1.3005892038345337, 'learning_rate': 0.00015377650702883013, 'epoch': 0.23}


 23%|██▎       | 3891/16798 [16:25<56:18,  3.82it/s]

{'loss': 1.3434, 'grad_norm': 2.115020990371704, 'learning_rate': 0.0001537645937574458, 'epoch': 0.23}


 23%|██▎       | 3892/16798 [16:26<59:35,  3.61it/s]

{'loss': 0.8111, 'grad_norm': 1.368118405342102, 'learning_rate': 0.0001537526804860615, 'epoch': 0.23}


 23%|██▎       | 3893/16798 [16:26<55:22,  3.88it/s]

{'loss': 0.9164, 'grad_norm': 1.9629175662994385, 'learning_rate': 0.00015374076721467715, 'epoch': 0.23}


 23%|██▎       | 3894/16798 [16:26<52:49,  4.07it/s]

{'loss': 1.3038, 'grad_norm': 1.8618718385696411, 'learning_rate': 0.00015372885394329284, 'epoch': 0.23}


 23%|██▎       | 3895/16798 [16:27<56:22,  3.82it/s]

{'loss': 0.6443, 'grad_norm': 1.091226577758789, 'learning_rate': 0.0001537169406719085, 'epoch': 0.23}


 23%|██▎       | 3896/16798 [16:27<58:03,  3.70it/s]

{'loss': 1.1921, 'grad_norm': 1.7561062574386597, 'learning_rate': 0.0001537050274005242, 'epoch': 0.23}


 23%|██▎       | 3898/16798 [16:27<54:07,  3.97it/s]

{'loss': 0.6775, 'grad_norm': 1.293143630027771, 'learning_rate': 0.00015369311412913986, 'epoch': 0.23}


 23%|██▎       | 3899/16798 [16:27<50:57,  4.22it/s]

{'loss': 0.7163, 'grad_norm': 1.3126519918441772, 'learning_rate': 0.00015368120085775555, 'epoch': 0.23}


 23%|██▎       | 3899/16798 [16:27<50:57,  4.22it/s]

{'loss': 0.7232, 'grad_norm': 1.258844256401062, 'learning_rate': 0.00015366928758637122, 'epoch': 0.23}


 23%|██▎       | 3900/16798 [16:28<56:33,  3.80it/s]

{'loss': 0.2764, 'grad_norm': 0.7833524346351624, 'learning_rate': 0.0001536573743149869, 'epoch': 0.23}


 23%|██▎       | 3901/16798 [16:28<53:30,  4.02it/s]

{'loss': 2.0319, 'grad_norm': 1.7573051452636719, 'learning_rate': 0.00015364546104360257, 'epoch': 0.23}


 23%|██▎       | 3902/16798 [16:28<59:35,  3.61it/s]

{'loss': 1.9604, 'grad_norm': 1.9883090257644653, 'learning_rate': 0.00015363354777221826, 'epoch': 0.23}


 23%|██▎       | 3903/16798 [16:29<56:25,  3.81it/s]

{'loss': 1.8576, 'grad_norm': 1.7714780569076538, 'learning_rate': 0.00015362163450083393, 'epoch': 0.23}


 23%|██▎       | 3904/16798 [16:29<57:40,  3.73it/s]

{'loss': 2.0042, 'grad_norm': 2.0761525630950928, 'learning_rate': 0.00015360972122944962, 'epoch': 0.23}


 23%|██▎       | 3905/16798 [16:29<59:51,  3.59it/s]

{'loss': 2.169, 'grad_norm': 2.0056300163269043, 'learning_rate': 0.00015359780795806528, 'epoch': 0.23}


 23%|██▎       | 3906/16798 [16:29<58:09,  3.70it/s]

{'loss': 1.7222, 'grad_norm': 1.69595205783844, 'learning_rate': 0.00015358589468668097, 'epoch': 0.23}


 23%|██▎       | 3907/16798 [16:30<1:01:43,  3.48it/s]

{'loss': 2.3053, 'grad_norm': 2.0366549491882324, 'learning_rate': 0.00015357398141529663, 'epoch': 0.23}


 23%|██▎       | 3908/16798 [16:30<1:04:21,  3.34it/s]

{'loss': 1.6053, 'grad_norm': 2.1926724910736084, 'learning_rate': 0.00015356206814391232, 'epoch': 0.23}


 23%|██▎       | 3909/16798 [16:30<1:01:08,  3.51it/s]

{'loss': 2.0125, 'grad_norm': 2.0584659576416016, 'learning_rate': 0.000153550154872528, 'epoch': 0.23}


 23%|██▎       | 3910/16798 [16:31<1:00:51,  3.53it/s]

{'loss': 2.0563, 'grad_norm': 1.9698407649993896, 'learning_rate': 0.00015353824160114368, 'epoch': 0.23}


 23%|██▎       | 3911/16798 [16:31<1:03:58,  3.36it/s]

{'loss': 1.4207, 'grad_norm': 1.7148947715759277, 'learning_rate': 0.00015352632832975934, 'epoch': 0.23}


 23%|██▎       | 3912/16798 [16:31<58:22,  3.68it/s]  

{'loss': 1.3637, 'grad_norm': 1.6954776048660278, 'learning_rate': 0.00015351441505837503, 'epoch': 0.23}


 23%|██▎       | 3913/16798 [16:31<1:01:49,  3.47it/s]

{'loss': 1.8725, 'grad_norm': 2.241858959197998, 'learning_rate': 0.0001535025017869907, 'epoch': 0.23}


 23%|██▎       | 3914/16798 [16:32<1:02:38,  3.43it/s]

{'loss': 1.4261, 'grad_norm': 1.8663235902786255, 'learning_rate': 0.00015349058851560642, 'epoch': 0.23}


 23%|██▎       | 3915/16798 [16:32<1:01:56,  3.47it/s]

{'loss': 1.9323, 'grad_norm': 1.944393277168274, 'learning_rate': 0.00015347867524422208, 'epoch': 0.23}


 23%|██▎       | 3916/16798 [16:32<1:00:26,  3.55it/s]

{'loss': 1.6662, 'grad_norm': 1.747416615486145, 'learning_rate': 0.00015346676197283777, 'epoch': 0.23}


 23%|██▎       | 3917/16798 [16:33<1:03:51,  3.36it/s]

{'loss': 1.5216, 'grad_norm': 1.8593040704727173, 'learning_rate': 0.00015345484870145343, 'epoch': 0.23}


 23%|██▎       | 3918/16798 [16:33<1:02:17,  3.45it/s]

{'loss': 1.5842, 'grad_norm': 1.7719354629516602, 'learning_rate': 0.00015344293543006912, 'epoch': 0.23}


 23%|██▎       | 3919/16798 [16:33<1:05:21,  3.28it/s]

{'loss': 1.7441, 'grad_norm': 2.3153135776519775, 'learning_rate': 0.0001534310221586848, 'epoch': 0.23}


 23%|██▎       | 3920/16798 [16:33<59:43,  3.59it/s]  

{'loss': 1.74, 'grad_norm': 2.030707597732544, 'learning_rate': 0.00015341910888730048, 'epoch': 0.23}


 23%|██▎       | 3921/16798 [16:34<58:01,  3.70it/s]

{'loss': 1.6058, 'grad_norm': 2.156155586242676, 'learning_rate': 0.00015340719561591614, 'epoch': 0.23}


 23%|██▎       | 3922/16798 [16:34<57:25,  3.74it/s]

{'loss': 1.1616, 'grad_norm': 1.6124306917190552, 'learning_rate': 0.00015339528234453183, 'epoch': 0.23}


 23%|██▎       | 3923/16798 [16:34<58:35,  3.66it/s]

{'loss': 1.7328, 'grad_norm': 1.9447956085205078, 'learning_rate': 0.0001533833690731475, 'epoch': 0.23}


 23%|██▎       | 3924/16798 [16:35<56:52,  3.77it/s]

{'loss': 1.5927, 'grad_norm': 2.0084903240203857, 'learning_rate': 0.0001533714558017632, 'epoch': 0.23}


 23%|██▎       | 3925/16798 [16:35<1:00:14,  3.56it/s]

{'loss': 1.1996, 'grad_norm': 1.6331349611282349, 'learning_rate': 0.00015335954253037885, 'epoch': 0.23}


 23%|██▎       | 3926/16798 [16:35<58:48,  3.65it/s]  

{'loss': 0.9403, 'grad_norm': 1.4414407014846802, 'learning_rate': 0.00015334762925899454, 'epoch': 0.23}


 23%|██▎       | 3927/16798 [16:35<56:28,  3.80it/s]

{'loss': 1.3768, 'grad_norm': 1.7357686758041382, 'learning_rate': 0.0001533357159876102, 'epoch': 0.23}


 23%|██▎       | 3928/16798 [16:36<1:05:27,  3.28it/s]

{'loss': 1.2945, 'grad_norm': 1.4472699165344238, 'learning_rate': 0.0001533238027162259, 'epoch': 0.23}


 23%|██▎       | 3929/16798 [16:36<1:00:16,  3.56it/s]

{'loss': 1.3474, 'grad_norm': 1.6573448181152344, 'learning_rate': 0.00015331188944484156, 'epoch': 0.23}


 23%|██▎       | 3930/16798 [16:36<57:51,  3.71it/s]  

{'loss': 1.3193, 'grad_norm': 1.7387539148330688, 'learning_rate': 0.00015329997617345725, 'epoch': 0.23}


 23%|██▎       | 3931/16798 [16:37<1:03:06,  3.40it/s]

{'loss': 1.3774, 'grad_norm': 1.9201650619506836, 'learning_rate': 0.00015328806290207291, 'epoch': 0.23}


 23%|██▎       | 3932/16798 [16:37<1:00:26,  3.55it/s]

{'loss': 0.7852, 'grad_norm': 1.3623782396316528, 'learning_rate': 0.0001532761496306886, 'epoch': 0.23}


 23%|██▎       | 3933/16798 [16:37<1:01:32,  3.48it/s]

{'loss': 1.4741, 'grad_norm': 1.9296120405197144, 'learning_rate': 0.00015326423635930427, 'epoch': 0.23}


 23%|██▎       | 3934/16798 [16:37<56:21,  3.80it/s]  

{'loss': 1.4085, 'grad_norm': 1.9811629056930542, 'learning_rate': 0.00015325232308791996, 'epoch': 0.23}


 23%|██▎       | 3935/16798 [16:38<1:00:04,  3.57it/s]

{'loss': 1.3897, 'grad_norm': 1.9909672737121582, 'learning_rate': 0.00015324040981653562, 'epoch': 0.23}


 23%|██▎       | 3936/16798 [16:38<1:00:58,  3.52it/s]

{'loss': 1.2672, 'grad_norm': 1.7040520906448364, 'learning_rate': 0.00015322849654515131, 'epoch': 0.23}


 23%|██▎       | 3937/16798 [16:38<1:03:36,  3.37it/s]

{'loss': 1.1541, 'grad_norm': 1.6229734420776367, 'learning_rate': 0.00015321658327376698, 'epoch': 0.23}


 23%|██▎       | 3938/16798 [16:39<1:01:45,  3.47it/s]

{'loss': 1.0252, 'grad_norm': 1.627942442893982, 'learning_rate': 0.00015320467000238267, 'epoch': 0.23}


 23%|██▎       | 3939/16798 [16:39<1:04:12,  3.34it/s]

{'loss': 1.1266, 'grad_norm': 1.5855098962783813, 'learning_rate': 0.00015319275673099833, 'epoch': 0.23}


 23%|██▎       | 3940/16798 [16:39<59:44,  3.59it/s]  

{'loss': 1.1326, 'grad_norm': 1.5472464561462402, 'learning_rate': 0.00015318084345961402, 'epoch': 0.23}


 23%|██▎       | 3941/16798 [16:39<1:03:51,  3.36it/s]

{'loss': 1.2362, 'grad_norm': 2.0197906494140625, 'learning_rate': 0.00015316893018822969, 'epoch': 0.23}


 23%|██▎       | 3942/16798 [16:40<1:00:16,  3.55it/s]

{'loss': 1.5208, 'grad_norm': 2.4011270999908447, 'learning_rate': 0.00015315701691684538, 'epoch': 0.23}


 23%|██▎       | 3943/16798 [16:40<1:00:50,  3.52it/s]

{'loss': 1.2211, 'grad_norm': 1.7698898315429688, 'learning_rate': 0.00015314510364546104, 'epoch': 0.23}


 23%|██▎       | 3945/16798 [16:40<53:09,  4.03it/s]  

{'loss': 1.3408, 'grad_norm': 1.8703511953353882, 'learning_rate': 0.00015313319037407673, 'epoch': 0.23}


 23%|██▎       | 3945/16798 [16:40<53:09,  4.03it/s]

{'loss': 1.1139, 'grad_norm': 1.7934203147888184, 'learning_rate': 0.00015312127710269242, 'epoch': 0.23}


 23%|██▎       | 3946/16798 [16:41<57:59,  3.69it/s]

{'loss': 0.7767, 'grad_norm': 1.4281779527664185, 'learning_rate': 0.00015310936383130809, 'epoch': 0.23}


 23%|██▎       | 3947/16798 [16:41<54:42,  3.92it/s]

{'loss': 0.6414, 'grad_norm': 1.1563469171524048, 'learning_rate': 0.00015309745055992378, 'epoch': 0.23}


 24%|██▎       | 3948/16798 [16:41<51:36,  4.15it/s]

{'loss': 0.8243, 'grad_norm': 1.4280840158462524, 'learning_rate': 0.00015308553728853944, 'epoch': 0.24}


 24%|██▎       | 3949/16798 [16:41<54:50,  3.90it/s]

{'loss': 0.4542, 'grad_norm': 0.9853237271308899, 'learning_rate': 0.00015307362401715513, 'epoch': 0.24}


 24%|██▎       | 3950/16798 [16:42<53:45,  3.98it/s]

{'loss': 0.885, 'grad_norm': 1.922999382019043, 'learning_rate': 0.0001530617107457708, 'epoch': 0.24}


 24%|██▎       | 3951/16798 [16:42<56:29,  3.79it/s]

{'loss': 1.9968, 'grad_norm': 2.071610927581787, 'learning_rate': 0.00015304979747438649, 'epoch': 0.24}


 24%|██▎       | 3952/16798 [16:42<55:32,  3.86it/s]

{'loss': 2.103, 'grad_norm': 1.8079400062561035, 'learning_rate': 0.00015303788420300215, 'epoch': 0.24}


 24%|██▎       | 3953/16798 [16:43<57:57,  3.69it/s]

{'loss': 1.8327, 'grad_norm': 1.6959713697433472, 'learning_rate': 0.00015302597093161784, 'epoch': 0.24}


 24%|██▎       | 3954/16798 [16:43<58:07,  3.68it/s]

{'loss': 2.6396, 'grad_norm': 2.1717851161956787, 'learning_rate': 0.0001530140576602335, 'epoch': 0.24}


 24%|██▎       | 3955/16798 [16:43<1:00:19,  3.55it/s]

{'loss': 2.1858, 'grad_norm': 1.8666950464248657, 'learning_rate': 0.0001530021443888492, 'epoch': 0.24}


 24%|██▎       | 3956/16798 [16:43<56:46,  3.77it/s]  

{'loss': 1.6917, 'grad_norm': 2.219836950302124, 'learning_rate': 0.00015299023111746486, 'epoch': 0.24}


 24%|██▎       | 3957/16798 [16:44<56:13,  3.81it/s]

{'loss': 1.643, 'grad_norm': 1.8538389205932617, 'learning_rate': 0.00015297831784608055, 'epoch': 0.24}


 24%|██▎       | 3958/16798 [16:44<1:04:12,  3.33it/s]

{'loss': 1.9272, 'grad_norm': 1.6319129467010498, 'learning_rate': 0.0001529664045746962, 'epoch': 0.24}


 24%|██▎       | 3959/16798 [16:44<1:01:56,  3.45it/s]

{'loss': 1.2291, 'grad_norm': 1.5704076290130615, 'learning_rate': 0.0001529544913033119, 'epoch': 0.24}


 24%|██▎       | 3960/16798 [16:45<1:02:44,  3.41it/s]

{'loss': 1.7182, 'grad_norm': 2.2186813354492188, 'learning_rate': 0.00015294257803192757, 'epoch': 0.24}


 24%|██▎       | 3961/16798 [16:45<1:02:37,  3.42it/s]

{'loss': 1.5003, 'grad_norm': 1.6030092239379883, 'learning_rate': 0.00015293066476054326, 'epoch': 0.24}


 24%|██▎       | 3962/16798 [16:45<1:02:57,  3.40it/s]

{'loss': 1.7473, 'grad_norm': 1.7528308629989624, 'learning_rate': 0.00015291875148915892, 'epoch': 0.24}


 24%|██▎       | 3963/16798 [16:45<58:55,  3.63it/s]  

{'loss': 1.3289, 'grad_norm': 1.8818740844726562, 'learning_rate': 0.0001529068382177746, 'epoch': 0.24}


 24%|██▎       | 3964/16798 [16:46<1:03:11,  3.38it/s]

{'loss': 1.3652, 'grad_norm': 1.7792233228683472, 'learning_rate': 0.00015289492494639028, 'epoch': 0.24}


 24%|██▎       | 3965/16798 [16:46<58:28,  3.66it/s]  

{'loss': 1.6508, 'grad_norm': 1.969253659248352, 'learning_rate': 0.00015288301167500597, 'epoch': 0.24}


 24%|██▎       | 3966/16798 [16:46<1:00:10,  3.55it/s]

{'loss': 1.5131, 'grad_norm': 1.8499480485916138, 'learning_rate': 0.00015287109840362163, 'epoch': 0.24}


 24%|██▎       | 3967/16798 [16:46<55:53,  3.83it/s]  

{'loss': 2.0915, 'grad_norm': 2.116077423095703, 'learning_rate': 0.00015285918513223732, 'epoch': 0.24}


 24%|██▎       | 3968/16798 [16:47<53:58,  3.96it/s]

{'loss': 1.3406, 'grad_norm': 1.8715403079986572, 'learning_rate': 0.00015284727186085298, 'epoch': 0.24}


 24%|██▎       | 3969/16798 [16:47<56:57,  3.75it/s]

{'loss': 1.8109, 'grad_norm': 2.0564749240875244, 'learning_rate': 0.00015283535858946868, 'epoch': 0.24}


 24%|██▎       | 3970/16798 [16:47<59:50,  3.57it/s]

{'loss': 1.7211, 'grad_norm': 2.0759048461914062, 'learning_rate': 0.00015282344531808434, 'epoch': 0.24}


 24%|██▎       | 3971/16798 [16:48<59:06,  3.62it/s]

{'loss': 1.4689, 'grad_norm': 1.8216540813446045, 'learning_rate': 0.00015281153204670003, 'epoch': 0.24}


 24%|██▎       | 3972/16798 [16:48<59:34,  3.59it/s]

{'loss': 2.1067, 'grad_norm': 2.5516278743743896, 'learning_rate': 0.0001527996187753157, 'epoch': 0.24}


 24%|██▎       | 3973/16798 [16:48<1:00:42,  3.52it/s]

{'loss': 1.6277, 'grad_norm': 1.791522741317749, 'learning_rate': 0.00015278770550393138, 'epoch': 0.24}


 24%|██▎       | 3974/16798 [16:48<57:24,  3.72it/s]  

{'loss': 1.3506, 'grad_norm': 1.6450852155685425, 'learning_rate': 0.00015277579223254705, 'epoch': 0.24}


 24%|██▎       | 3975/16798 [16:49<1:01:40,  3.46it/s]

{'loss': 1.8186, 'grad_norm': 1.9650583267211914, 'learning_rate': 0.00015276387896116274, 'epoch': 0.24}


 24%|██▎       | 3976/16798 [16:49<57:47,  3.70it/s]  

{'loss': 1.5141, 'grad_norm': 1.8615808486938477, 'learning_rate': 0.00015275196568977843, 'epoch': 0.24}


 24%|██▎       | 3977/16798 [16:49<55:23,  3.86it/s]

{'loss': 1.6919, 'grad_norm': 2.052434206008911, 'learning_rate': 0.00015274005241839412, 'epoch': 0.24}


 24%|██▎       | 3978/16798 [16:50<1:01:07,  3.50it/s]

{'loss': 1.3775, 'grad_norm': 2.0263359546661377, 'learning_rate': 0.00015272813914700978, 'epoch': 0.24}


 24%|██▎       | 3979/16798 [16:50<1:00:24,  3.54it/s]

{'loss': 1.2486, 'grad_norm': 1.7311981916427612, 'learning_rate': 0.00015271622587562547, 'epoch': 0.24}


 24%|██▎       | 3980/16798 [16:50<58:55,  3.63it/s]  

{'loss': 1.2435, 'grad_norm': 2.0418386459350586, 'learning_rate': 0.00015270431260424114, 'epoch': 0.24}


 24%|██▎       | 3981/16798 [16:50<59:08,  3.61it/s]

{'loss': 1.7912, 'grad_norm': 1.9676579236984253, 'learning_rate': 0.00015269239933285683, 'epoch': 0.24}


 24%|██▎       | 3982/16798 [16:51<1:00:07,  3.55it/s]

{'loss': 1.7069, 'grad_norm': 2.1133148670196533, 'learning_rate': 0.0001526804860614725, 'epoch': 0.24}


 24%|██▎       | 3983/16798 [16:51<1:03:17,  3.37it/s]

{'loss': 1.467, 'grad_norm': 1.8609271049499512, 'learning_rate': 0.00015266857279008818, 'epoch': 0.24}


 24%|██▎       | 3984/16798 [16:51<1:00:52,  3.51it/s]

{'loss': 1.6113, 'grad_norm': 2.0536177158355713, 'learning_rate': 0.00015265665951870385, 'epoch': 0.24}


 24%|██▎       | 3985/16798 [16:52<1:02:21,  3.42it/s]

{'loss': 1.498, 'grad_norm': 1.7856770753860474, 'learning_rate': 0.00015264474624731954, 'epoch': 0.24}


 24%|██▎       | 3986/16798 [16:52<1:03:00,  3.39it/s]

{'loss': 1.6375, 'grad_norm': 2.6830382347106934, 'learning_rate': 0.0001526328329759352, 'epoch': 0.24}


 24%|██▎       | 3987/16798 [16:52<1:03:02,  3.39it/s]

{'loss': 1.4448, 'grad_norm': 1.7725915908813477, 'learning_rate': 0.0001526209197045509, 'epoch': 0.24}


 24%|██▎       | 3988/16798 [16:52<59:12,  3.61it/s]  

{'loss': 1.2815, 'grad_norm': 1.833875298500061, 'learning_rate': 0.00015260900643316656, 'epoch': 0.24}


 24%|██▎       | 3989/16798 [16:53<1:00:44,  3.51it/s]

{'loss': 1.1506, 'grad_norm': 1.5819272994995117, 'learning_rate': 0.00015259709316178225, 'epoch': 0.24}


 24%|██▍       | 3990/16798 [16:53<56:41,  3.77it/s]  

{'loss': 1.4665, 'grad_norm': 2.1181845664978027, 'learning_rate': 0.0001525851798903979, 'epoch': 0.24}


 24%|██▍       | 3991/16798 [16:53<53:45,  3.97it/s]

{'loss': 1.1934, 'grad_norm': 1.4605711698532104, 'learning_rate': 0.0001525732666190136, 'epoch': 0.24}


 24%|██▍       | 3993/16798 [16:54<54:10,  3.94it/s]

{'loss': 1.136, 'grad_norm': 1.5541685819625854, 'learning_rate': 0.00015256135334762926, 'epoch': 0.24}


 24%|██▍       | 3993/16798 [16:54<54:10,  3.94it/s]

{'loss': 1.0071, 'grad_norm': 1.4216351509094238, 'learning_rate': 0.00015254944007624496, 'epoch': 0.24}


 24%|██▍       | 3994/16798 [16:54<53:54,  3.96it/s]

{'loss': 1.2011, 'grad_norm': 1.9137645959854126, 'learning_rate': 0.00015253752680486062, 'epoch': 0.24}


 24%|██▍       | 3996/16798 [16:54<54:16,  3.93it/s]

{'loss': 1.0292, 'grad_norm': 1.6202051639556885, 'learning_rate': 0.0001525256135334763, 'epoch': 0.24}


 24%|██▍       | 3996/16798 [16:54<54:16,  3.93it/s]

{'loss': 0.9165, 'grad_norm': 1.7339043617248535, 'learning_rate': 0.00015251370026209197, 'epoch': 0.24}


 24%|██▍       | 3997/16798 [16:55<56:04,  3.80it/s]

{'loss': 0.5668, 'grad_norm': 0.9682663083076477, 'learning_rate': 0.00015250178699070766, 'epoch': 0.24}


 24%|██▍       | 3999/16798 [16:55<50:22,  4.23it/s]

{'loss': 0.4323, 'grad_norm': 0.9785577058792114, 'learning_rate': 0.00015248987371932333, 'epoch': 0.24}


 24%|██▍       | 3999/16798 [16:55<50:22,  4.23it/s]

{'loss': 0.7555, 'grad_norm': 1.4762372970581055, 'learning_rate': 0.00015247796044793902, 'epoch': 0.24}




{'loss': 0.295, 'grad_norm': 0.7317626476287842, 'learning_rate': 0.00015246604717655468, 'epoch': 0.24}


 24%|██▍       | 4001/16798 [16:58<3:45:17,  1.06s/it]

{'loss': 1.797, 'grad_norm': 1.8913410902023315, 'learning_rate': 0.00015245413390517037, 'epoch': 0.24}


 24%|██▍       | 4002/16798 [16:59<2:55:56,  1.21it/s]

{'loss': 2.0007, 'grad_norm': 1.7598780393600464, 'learning_rate': 0.00015244222063378604, 'epoch': 0.24}


 24%|██▍       | 4003/16798 [16:59<2:22:23,  1.50it/s]

{'loss': 1.9538, 'grad_norm': 2.099426507949829, 'learning_rate': 0.00015243030736240173, 'epoch': 0.24}


 24%|██▍       | 4004/16798 [16:59<1:56:32,  1.83it/s]

{'loss': 2.1528, 'grad_norm': 2.095731258392334, 'learning_rate': 0.0001524183940910174, 'epoch': 0.24}


 24%|██▍       | 4005/16798 [16:59<1:40:55,  2.11it/s]

{'loss': 2.3253, 'grad_norm': 2.6343629360198975, 'learning_rate': 0.00015240648081963308, 'epoch': 0.24}


 24%|██▍       | 4006/16798 [17:00<1:31:37,  2.33it/s]

{'loss': 2.1778, 'grad_norm': 4.775082111358643, 'learning_rate': 0.00015239456754824875, 'epoch': 0.24}


 24%|██▍       | 4007/16798 [17:00<1:20:48,  2.64it/s]

{'loss': 2.0736, 'grad_norm': 1.917024850845337, 'learning_rate': 0.00015238265427686444, 'epoch': 0.24}


 24%|██▍       | 4008/16798 [17:00<1:16:00,  2.80it/s]

{'loss': 1.833, 'grad_norm': 2.3318192958831787, 'learning_rate': 0.00015237074100548013, 'epoch': 0.24}


 24%|██▍       | 4009/16798 [17:01<1:13:13,  2.91it/s]

{'loss': 1.9176, 'grad_norm': 1.915151596069336, 'learning_rate': 0.0001523588277340958, 'epoch': 0.24}


 24%|██▍       | 4010/16798 [17:01<1:12:22,  2.94it/s]

{'loss': 2.2471, 'grad_norm': 2.0175540447235107, 'learning_rate': 0.00015234691446271148, 'epoch': 0.24}


 24%|██▍       | 4011/16798 [17:01<1:08:14,  3.12it/s]

{'loss': 1.6718, 'grad_norm': 2.179063081741333, 'learning_rate': 0.00015233500119132715, 'epoch': 0.24}


 24%|██▍       | 4012/16798 [17:02<1:06:59,  3.18it/s]

{'loss': 1.5068, 'grad_norm': 1.8500617742538452, 'learning_rate': 0.00015232308791994284, 'epoch': 0.24}


 24%|██▍       | 4013/16798 [17:02<1:09:46,  3.05it/s]

{'loss': 1.7987, 'grad_norm': 2.2983803749084473, 'learning_rate': 0.0001523111746485585, 'epoch': 0.24}


 24%|██▍       | 4014/16798 [17:02<1:08:16,  3.12it/s]

{'loss': 1.5229, 'grad_norm': 2.116457223892212, 'learning_rate': 0.0001522992613771742, 'epoch': 0.24}


 24%|██▍       | 4015/16798 [17:03<1:08:36,  3.11it/s]

{'loss': 1.3772, 'grad_norm': 1.7820838689804077, 'learning_rate': 0.00015228734810578985, 'epoch': 0.24}


 24%|██▍       | 4016/16798 [17:03<1:05:41,  3.24it/s]

{'loss': 1.3589, 'grad_norm': 1.8700510263442993, 'learning_rate': 0.00015227543483440554, 'epoch': 0.24}


 24%|██▍       | 4017/16798 [17:03<1:02:41,  3.40it/s]

{'loss': 1.4979, 'grad_norm': 1.7829768657684326, 'learning_rate': 0.0001522635215630212, 'epoch': 0.24}


 24%|██▍       | 4018/16798 [17:03<1:02:48,  3.39it/s]

{'loss': 1.3626, 'grad_norm': 1.7030396461486816, 'learning_rate': 0.0001522516082916369, 'epoch': 0.24}


 24%|██▍       | 4019/16798 [17:04<1:02:50,  3.39it/s]

{'loss': 1.2334, 'grad_norm': 1.726467490196228, 'learning_rate': 0.00015223969502025256, 'epoch': 0.24}


 24%|██▍       | 4020/16798 [17:04<1:04:10,  3.32it/s]

{'loss': 1.4869, 'grad_norm': 2.2339365482330322, 'learning_rate': 0.00015222778174886825, 'epoch': 0.24}


 24%|██▍       | 4021/16798 [17:04<1:01:05,  3.49it/s]

{'loss': 1.5194, 'grad_norm': 2.0609703063964844, 'learning_rate': 0.00015221586847748392, 'epoch': 0.24}


 24%|██▍       | 4022/16798 [17:05<1:04:36,  3.30it/s]

{'loss': 1.4873, 'grad_norm': 1.8023736476898193, 'learning_rate': 0.0001522039552060996, 'epoch': 0.24}


 24%|██▍       | 4023/16798 [17:05<1:00:08,  3.54it/s]

{'loss': 1.4122, 'grad_norm': 1.699092984199524, 'learning_rate': 0.00015219204193471527, 'epoch': 0.24}


 24%|██▍       | 4024/16798 [17:05<1:04:26,  3.30it/s]

{'loss': 1.1875, 'grad_norm': 1.7765774726867676, 'learning_rate': 0.00015218012866333096, 'epoch': 0.24}


 24%|██▍       | 4025/16798 [17:05<59:19,  3.59it/s]  

{'loss': 2.0851, 'grad_norm': 2.356837034225464, 'learning_rate': 0.00015216821539194663, 'epoch': 0.24}


 24%|██▍       | 4026/16798 [17:06<1:01:22,  3.47it/s]

{'loss': 1.4092, 'grad_norm': 1.912117838859558, 'learning_rate': 0.00015215630212056232, 'epoch': 0.24}


 24%|██▍       | 4027/16798 [17:06<1:04:03,  3.32it/s]

{'loss': 1.715, 'grad_norm': 2.046105146408081, 'learning_rate': 0.00015214438884917798, 'epoch': 0.24}


 24%|██▍       | 4028/16798 [17:06<1:00:20,  3.53it/s]

{'loss': 1.6022, 'grad_norm': 2.3642730712890625, 'learning_rate': 0.00015213247557779367, 'epoch': 0.24}


 24%|██▍       | 4029/16798 [17:07<1:05:18,  3.26it/s]

{'loss': 1.5068, 'grad_norm': 2.199763059616089, 'learning_rate': 0.00015212056230640933, 'epoch': 0.24}


 24%|██▍       | 4030/16798 [17:07<1:01:45,  3.45it/s]

{'loss': 1.467, 'grad_norm': 2.203218460083008, 'learning_rate': 0.00015210864903502503, 'epoch': 0.24}


 24%|██▍       | 4031/16798 [17:07<1:04:34,  3.30it/s]

{'loss': 1.4836, 'grad_norm': 1.7164883613586426, 'learning_rate': 0.0001520967357636407, 'epoch': 0.24}


 24%|██▍       | 4032/16798 [17:08<1:01:36,  3.45it/s]

{'loss': 1.1935, 'grad_norm': 1.734126329421997, 'learning_rate': 0.00015208482249225638, 'epoch': 0.24}


 24%|██▍       | 4033/16798 [17:08<1:02:17,  3.42it/s]

{'loss': 1.2218, 'grad_norm': 1.6491330862045288, 'learning_rate': 0.00015207290922087204, 'epoch': 0.24}


 24%|██▍       | 4034/16798 [17:08<1:02:39,  3.40it/s]

{'loss': 1.3515, 'grad_norm': 1.9777729511260986, 'learning_rate': 0.00015206099594948773, 'epoch': 0.24}


 24%|██▍       | 4035/16798 [17:08<1:06:29,  3.20it/s]

{'loss': 1.2026, 'grad_norm': 1.5865178108215332, 'learning_rate': 0.0001520490826781034, 'epoch': 0.24}


 24%|██▍       | 4036/16798 [17:09<1:05:09,  3.26it/s]

{'loss': 1.2793, 'grad_norm': 2.4972331523895264, 'learning_rate': 0.0001520371694067191, 'epoch': 0.24}


 24%|██▍       | 4037/16798 [17:09<1:03:19,  3.36it/s]

{'loss': 1.4074, 'grad_norm': 1.971227765083313, 'learning_rate': 0.00015202525613533475, 'epoch': 0.24}


 24%|██▍       | 4038/16798 [17:09<1:04:32,  3.29it/s]

{'loss': 1.1043, 'grad_norm': 2.3779568672180176, 'learning_rate': 0.00015201334286395047, 'epoch': 0.24}


 24%|██▍       | 4039/16798 [17:10<1:02:42,  3.39it/s]

{'loss': 1.109, 'grad_norm': 1.6662758588790894, 'learning_rate': 0.00015200142959256613, 'epoch': 0.24}


 24%|██▍       | 4040/16798 [17:10<1:04:49,  3.28it/s]

{'loss': 1.0667, 'grad_norm': 3.083510637283325, 'learning_rate': 0.00015198951632118182, 'epoch': 0.24}


 24%|██▍       | 4041/16798 [17:10<1:01:58,  3.43it/s]

{'loss': 1.0987, 'grad_norm': 1.9082520008087158, 'learning_rate': 0.0001519776030497975, 'epoch': 0.24}


 24%|██▍       | 4042/16798 [17:11<1:03:28,  3.35it/s]

{'loss': 0.843, 'grad_norm': 1.5846658945083618, 'learning_rate': 0.00015196568977841318, 'epoch': 0.24}


 24%|██▍       | 4043/16798 [17:11<1:04:15,  3.31it/s]

{'loss': 0.9315, 'grad_norm': 1.5257163047790527, 'learning_rate': 0.00015195377650702884, 'epoch': 0.24}


 24%|██▍       | 4044/16798 [17:11<1:03:59,  3.32it/s]

{'loss': 1.1917, 'grad_norm': 1.8271100521087646, 'learning_rate': 0.00015194186323564453, 'epoch': 0.24}


 24%|██▍       | 4045/16798 [17:11<1:03:06,  3.37it/s]

{'loss': 0.6858, 'grad_norm': 1.3150203227996826, 'learning_rate': 0.0001519299499642602, 'epoch': 0.24}


 24%|██▍       | 4046/16798 [17:12<1:03:07,  3.37it/s]

{'loss': 1.0458, 'grad_norm': 1.5581525564193726, 'learning_rate': 0.0001519180366928759, 'epoch': 0.24}


 24%|██▍       | 4048/16798 [17:12<56:45,  3.74it/s]  

{'loss': 0.9771, 'grad_norm': 1.7633494138717651, 'learning_rate': 0.00015190612342149155, 'epoch': 0.24}


 24%|██▍       | 4048/16798 [17:12<56:45,  3.74it/s]

{'loss': 0.7039, 'grad_norm': 1.3265786170959473, 'learning_rate': 0.00015189421015010724, 'epoch': 0.24}


 24%|██▍       | 4049/16798 [17:13<59:00,  3.60it/s]

{'loss': 0.4234, 'grad_norm': 1.0279334783554077, 'learning_rate': 0.0001518822968787229, 'epoch': 0.24}


 24%|██▍       | 4050/16798 [17:13<56:53,  3.73it/s]

{'loss': 0.429, 'grad_norm': 1.016143798828125, 'learning_rate': 0.0001518703836073386, 'epoch': 0.24}


 24%|██▍       | 4051/16798 [17:13<1:01:29,  3.46it/s]

{'loss': 1.7722, 'grad_norm': 1.6375712156295776, 'learning_rate': 0.00015185847033595426, 'epoch': 0.24}


 24%|██▍       | 4052/16798 [17:13<1:02:16,  3.41it/s]

{'loss': 2.1402, 'grad_norm': 2.0985658168792725, 'learning_rate': 0.00015184655706456995, 'epoch': 0.24}


 24%|██▍       | 4053/16798 [17:14<1:02:18,  3.41it/s]

{'loss': 1.8858, 'grad_norm': 1.9726572036743164, 'learning_rate': 0.00015183464379318562, 'epoch': 0.24}


 24%|██▍       | 4054/16798 [17:14<1:01:02,  3.48it/s]

{'loss': 1.8063, 'grad_norm': 1.8147923946380615, 'learning_rate': 0.0001518227305218013, 'epoch': 0.24}


 24%|██▍       | 4055/16798 [17:14<1:04:46,  3.28it/s]

{'loss': 1.859, 'grad_norm': 1.8461354970932007, 'learning_rate': 0.00015181081725041697, 'epoch': 0.24}


 24%|██▍       | 4056/16798 [17:15<1:00:11,  3.53it/s]

{'loss': 1.7471, 'grad_norm': 1.7583085298538208, 'learning_rate': 0.00015179890397903266, 'epoch': 0.24}


 24%|██▍       | 4057/16798 [17:15<1:09:24,  3.06it/s]

{'loss': 1.3142, 'grad_norm': 1.6409496068954468, 'learning_rate': 0.00015178699070764832, 'epoch': 0.24}


 24%|██▍       | 4058/16798 [17:15<1:08:32,  3.10it/s]

{'loss': 1.7585, 'grad_norm': 1.8130987882614136, 'learning_rate': 0.00015177507743626401, 'epoch': 0.24}


 24%|██▍       | 4059/16798 [17:16<1:06:56,  3.17it/s]

{'loss': 1.6197, 'grad_norm': 2.0475895404815674, 'learning_rate': 0.00015176316416487968, 'epoch': 0.24}


 24%|██▍       | 4060/16798 [17:16<1:01:10,  3.47it/s]

{'loss': 1.789, 'grad_norm': 1.7772578001022339, 'learning_rate': 0.00015175125089349537, 'epoch': 0.24}


 24%|██▍       | 4061/16798 [17:16<1:05:03,  3.26it/s]

{'loss': 1.6389, 'grad_norm': 1.8899776935577393, 'learning_rate': 0.00015173933762211103, 'epoch': 0.24}


 24%|██▍       | 4062/16798 [17:16<1:01:42,  3.44it/s]

{'loss': 1.8372, 'grad_norm': 1.9575729370117188, 'learning_rate': 0.00015172742435072672, 'epoch': 0.24}


 24%|██▍       | 4063/16798 [17:17<1:03:39,  3.33it/s]

{'loss': 1.3569, 'grad_norm': 1.6301133632659912, 'learning_rate': 0.0001517155110793424, 'epoch': 0.24}


 24%|██▍       | 4064/16798 [17:17<1:00:31,  3.51it/s]

{'loss': 1.6985, 'grad_norm': 2.0676631927490234, 'learning_rate': 0.00015170359780795808, 'epoch': 0.24}


 24%|██▍       | 4065/16798 [17:17<1:02:52,  3.38it/s]

{'loss': 1.514, 'grad_norm': 1.9344922304153442, 'learning_rate': 0.00015169168453657374, 'epoch': 0.24}


 24%|██▍       | 4066/16798 [17:18<58:41,  3.62it/s]  

{'loss': 1.7363, 'grad_norm': 2.0640933513641357, 'learning_rate': 0.0001516797712651894, 'epoch': 0.24}


 24%|██▍       | 4067/16798 [17:18<59:33,  3.56it/s]

{'loss': 1.4719, 'grad_norm': 1.8912593126296997, 'learning_rate': 0.0001516678579938051, 'epoch': 0.24}


 24%|██▍       | 4068/16798 [17:18<1:01:05,  3.47it/s]

{'loss': 1.7408, 'grad_norm': 2.023954391479492, 'learning_rate': 0.00015165594472242076, 'epoch': 0.24}


 24%|██▍       | 4069/16798 [17:19<1:05:22,  3.24it/s]

{'loss': 1.6249, 'grad_norm': 2.044363498687744, 'learning_rate': 0.00015164403145103648, 'epoch': 0.24}


 24%|██▍       | 4070/16798 [17:19<1:02:17,  3.41it/s]

{'loss': 1.4909, 'grad_norm': 1.8851864337921143, 'learning_rate': 0.00015163211817965214, 'epoch': 0.24}


 24%|██▍       | 4071/16798 [17:19<1:03:18,  3.35it/s]

{'loss': 2.0612, 'grad_norm': 2.246760606765747, 'learning_rate': 0.00015162020490826783, 'epoch': 0.24}


 24%|██▍       | 4072/16798 [17:19<1:01:09,  3.47it/s]

{'loss': 1.3214, 'grad_norm': 1.6830081939697266, 'learning_rate': 0.0001516082916368835, 'epoch': 0.24}


 24%|██▍       | 4073/16798 [17:20<1:02:21,  3.40it/s]

{'loss': 1.7938, 'grad_norm': 1.8390116691589355, 'learning_rate': 0.00015159637836549919, 'epoch': 0.24}


 24%|██▍       | 4074/16798 [17:20<57:47,  3.67it/s]  

{'loss': 1.4893, 'grad_norm': 2.0298385620117188, 'learning_rate': 0.00015158446509411485, 'epoch': 0.24}


 24%|██▍       | 4075/16798 [17:20<59:48,  3.55it/s]

{'loss': 1.793, 'grad_norm': 2.1476004123687744, 'learning_rate': 0.00015157255182273054, 'epoch': 0.24}


 24%|██▍       | 4076/16798 [17:20<57:02,  3.72it/s]

{'loss': 1.3172, 'grad_norm': 2.4009153842926025, 'learning_rate': 0.0001515606385513462, 'epoch': 0.24}


 24%|██▍       | 4077/16798 [17:21<1:00:04,  3.53it/s]

{'loss': 1.5259, 'grad_norm': 1.8329403400421143, 'learning_rate': 0.0001515487252799619, 'epoch': 0.24}


 24%|██▍       | 4078/16798 [17:21<1:00:38,  3.50it/s]

{'loss': 1.3264, 'grad_norm': 1.8024598360061646, 'learning_rate': 0.00015153681200857756, 'epoch': 0.24}


 24%|██▍       | 4079/16798 [17:21<1:03:06,  3.36it/s]

{'loss': 2.117, 'grad_norm': 2.1393349170684814, 'learning_rate': 0.00015152489873719325, 'epoch': 0.24}


 24%|██▍       | 4080/16798 [17:22<58:40,  3.61it/s]  

{'loss': 1.8137, 'grad_norm': 2.1042706966400146, 'learning_rate': 0.0001515129854658089, 'epoch': 0.24}


 24%|██▍       | 4081/16798 [17:22<1:01:01,  3.47it/s]

{'loss': 1.3732, 'grad_norm': 2.0844333171844482, 'learning_rate': 0.0001515010721944246, 'epoch': 0.24}


 24%|██▍       | 4082/16798 [17:22<57:46,  3.67it/s]  

{'loss': 1.7082, 'grad_norm': 1.816500186920166, 'learning_rate': 0.00015148915892304027, 'epoch': 0.24}


 24%|██▍       | 4083/16798 [17:22<1:01:00,  3.47it/s]

{'loss': 1.7474, 'grad_norm': 1.954481601715088, 'learning_rate': 0.00015147724565165596, 'epoch': 0.24}


 24%|██▍       | 4084/16798 [17:23<56:29,  3.75it/s]  

{'loss': 1.3632, 'grad_norm': 1.9385175704956055, 'learning_rate': 0.00015146533238027162, 'epoch': 0.24}


 24%|██▍       | 4085/16798 [17:23<55:25,  3.82it/s]

{'loss': 0.8516, 'grad_norm': 1.5417038202285767, 'learning_rate': 0.0001514534191088873, 'epoch': 0.24}


 24%|██▍       | 4086/16798 [17:23<57:38,  3.68it/s]

{'loss': 1.505, 'grad_norm': 1.8381093740463257, 'learning_rate': 0.00015144150583750298, 'epoch': 0.24}


 24%|██▍       | 4087/16798 [17:23<54:31,  3.89it/s]

{'loss': 1.8851, 'grad_norm': 2.2814111709594727, 'learning_rate': 0.00015142959256611867, 'epoch': 0.24}


 24%|██▍       | 4088/16798 [17:24<57:08,  3.71it/s]

{'loss': 1.2959, 'grad_norm': 1.9407036304473877, 'learning_rate': 0.00015141767929473433, 'epoch': 0.24}


 24%|██▍       | 4089/16798 [17:24<55:03,  3.85it/s]

{'loss': 1.6672, 'grad_norm': 2.0674386024475098, 'learning_rate': 0.00015140576602335002, 'epoch': 0.24}


 24%|██▍       | 4090/16798 [17:24<59:45,  3.54it/s]

{'loss': 1.3121, 'grad_norm': 1.7595245838165283, 'learning_rate': 0.00015139385275196569, 'epoch': 0.24}


 24%|██▍       | 4091/16798 [17:25<59:09,  3.58it/s]

{'loss': 1.4411, 'grad_norm': 2.0379273891448975, 'learning_rate': 0.00015138193948058138, 'epoch': 0.24}


 24%|██▍       | 4092/16798 [17:25<1:02:24,  3.39it/s]

{'loss': 1.1868, 'grad_norm': 1.8569166660308838, 'learning_rate': 0.00015137002620919704, 'epoch': 0.24}


 24%|██▍       | 4093/16798 [17:25<59:42,  3.55it/s]  

{'loss': 1.3557, 'grad_norm': 2.5763370990753174, 'learning_rate': 0.00015135811293781273, 'epoch': 0.24}


 24%|██▍       | 4094/16798 [17:26<1:03:36,  3.33it/s]

{'loss': 1.2928, 'grad_norm': 1.7530834674835205, 'learning_rate': 0.0001513461996664284, 'epoch': 0.24}


 24%|██▍       | 4095/16798 [17:26<1:00:24,  3.51it/s]

{'loss': 1.1566, 'grad_norm': 1.709352731704712, 'learning_rate': 0.00015133428639504409, 'epoch': 0.24}


 24%|██▍       | 4097/16798 [17:26<56:47,  3.73it/s]  

{'loss': 0.8798, 'grad_norm': 1.6067259311676025, 'learning_rate': 0.00015132237312365975, 'epoch': 0.24}


 24%|██▍       | 4097/16798 [17:26<56:47,  3.73it/s]

{'loss': 0.9017, 'grad_norm': 1.7678183317184448, 'learning_rate': 0.00015131045985227544, 'epoch': 0.24}


 24%|██▍       | 4098/16798 [17:27<57:56,  3.65it/s]

{'loss': 0.9329, 'grad_norm': 1.9674848318099976, 'learning_rate': 0.0001512985465808911, 'epoch': 0.24}


 24%|██▍       | 4099/16798 [17:27<58:53,  3.59it/s]

{'loss': 0.5938, 'grad_norm': 1.3282411098480225, 'learning_rate': 0.00015128663330950682, 'epoch': 0.24}


 24%|██▍       | 4100/16798 [17:27<1:04:19,  3.29it/s]

{'loss': 0.4581, 'grad_norm': 1.0883809328079224, 'learning_rate': 0.00015127472003812248, 'epoch': 0.24}


 24%|██▍       | 4101/16798 [17:27<1:00:11,  3.52it/s]

{'loss': 1.9562, 'grad_norm': 1.7508906126022339, 'learning_rate': 0.00015126280676673818, 'epoch': 0.24}


 24%|██▍       | 4102/16798 [17:28<57:20,  3.69it/s]  

{'loss': 2.1007, 'grad_norm': 2.0335841178894043, 'learning_rate': 0.00015125089349535384, 'epoch': 0.24}


 24%|██▍       | 4103/16798 [17:28<59:30,  3.56it/s]

{'loss': 1.9866, 'grad_norm': 2.0038914680480957, 'learning_rate': 0.00015123898022396953, 'epoch': 0.24}


 24%|██▍       | 4104/16798 [17:28<58:59,  3.59it/s]

{'loss': 2.0525, 'grad_norm': 1.709831714630127, 'learning_rate': 0.0001512270669525852, 'epoch': 0.24}


 24%|██▍       | 4105/16798 [17:29<1:00:26,  3.50it/s]

{'loss': 2.1416, 'grad_norm': 2.1127283573150635, 'learning_rate': 0.00015121515368120088, 'epoch': 0.24}


 24%|██▍       | 4106/16798 [17:29<58:53,  3.59it/s]  

{'loss': 1.8042, 'grad_norm': 2.2930755615234375, 'learning_rate': 0.00015120324040981655, 'epoch': 0.24}


 24%|██▍       | 4107/16798 [17:29<57:43,  3.66it/s]

{'loss': 1.747, 'grad_norm': 1.6410908699035645, 'learning_rate': 0.00015119132713843224, 'epoch': 0.24}


 24%|██▍       | 4108/16798 [17:29<1:01:24,  3.44it/s]

{'loss': 2.0243, 'grad_norm': 1.9721704721450806, 'learning_rate': 0.0001511794138670479, 'epoch': 0.24}


 24%|██▍       | 4109/16798 [17:30<58:51,  3.59it/s]  

{'loss': 2.019, 'grad_norm': 2.02668833732605, 'learning_rate': 0.0001511675005956636, 'epoch': 0.24}


 24%|██▍       | 4110/16798 [17:30<59:10,  3.57it/s]

{'loss': 1.7196, 'grad_norm': 2.033451557159424, 'learning_rate': 0.00015115558732427926, 'epoch': 0.24}


 24%|██▍       | 4111/16798 [17:30<59:43,  3.54it/s]

{'loss': 1.8037, 'grad_norm': 1.966904640197754, 'learning_rate': 0.00015114367405289495, 'epoch': 0.24}


 24%|██▍       | 4112/16798 [17:31<59:44,  3.54it/s]

{'loss': 1.4677, 'grad_norm': 2.6666009426116943, 'learning_rate': 0.0001511317607815106, 'epoch': 0.24}


 24%|██▍       | 4113/16798 [17:31<1:02:09,  3.40it/s]

{'loss': 1.647, 'grad_norm': 2.0099377632141113, 'learning_rate': 0.0001511198475101263, 'epoch': 0.24}


 24%|██▍       | 4114/16798 [17:31<1:02:09,  3.40it/s]

{'loss': 1.6083, 'grad_norm': 1.9671964645385742, 'learning_rate': 0.00015110793423874197, 'epoch': 0.24}


 24%|██▍       | 4115/16798 [17:31<1:00:59,  3.47it/s]

{'loss': 1.7632, 'grad_norm': 1.9863343238830566, 'learning_rate': 0.00015109602096735766, 'epoch': 0.24}


 25%|██▍       | 4116/16798 [17:32<56:07,  3.77it/s]  

{'loss': 1.5309, 'grad_norm': 1.711439609527588, 'learning_rate': 0.00015108410769597332, 'epoch': 0.25}


 25%|██▍       | 4117/16798 [17:32<1:01:29,  3.44it/s]

{'loss': 1.5949, 'grad_norm': 1.8287324905395508, 'learning_rate': 0.000151072194424589, 'epoch': 0.25}


 25%|██▍       | 4118/16798 [17:32<56:52,  3.72it/s]  

{'loss': 1.9279, 'grad_norm': 1.9870859384536743, 'learning_rate': 0.00015106028115320467, 'epoch': 0.25}


 25%|██▍       | 4119/16798 [17:33<1:00:05,  3.52it/s]

{'loss': 1.7766, 'grad_norm': 2.143228054046631, 'learning_rate': 0.00015104836788182037, 'epoch': 0.25}


 25%|██▍       | 4120/16798 [17:33<59:18,  3.56it/s]  

{'loss': 1.5505, 'grad_norm': 2.0004019737243652, 'learning_rate': 0.00015103645461043603, 'epoch': 0.25}


 25%|██▍       | 4121/16798 [17:33<1:01:54,  3.41it/s]

{'loss': 2.028, 'grad_norm': 2.2831614017486572, 'learning_rate': 0.00015102454133905172, 'epoch': 0.25}


 25%|██▍       | 4122/16798 [17:33<57:46,  3.66it/s]  

{'loss': 1.3515, 'grad_norm': 1.58245849609375, 'learning_rate': 0.00015101262806766738, 'epoch': 0.25}


 25%|██▍       | 4123/16798 [17:34<1:00:44,  3.48it/s]

{'loss': 1.3886, 'grad_norm': 1.7398649454116821, 'learning_rate': 0.00015100071479628305, 'epoch': 0.25}


 25%|██▍       | 4124/16798 [17:34<56:36,  3.73it/s]  

{'loss': 1.578, 'grad_norm': 1.9114373922348022, 'learning_rate': 0.00015098880152489874, 'epoch': 0.25}


 25%|██▍       | 4125/16798 [17:34<53:05,  3.98it/s]

{'loss': 2.0552, 'grad_norm': 2.6780290603637695, 'learning_rate': 0.0001509768882535144, 'epoch': 0.25}


 25%|██▍       | 4126/16798 [17:34<57:48,  3.65it/s]

{'loss': 1.8087, 'grad_norm': 1.8775107860565186, 'learning_rate': 0.0001509649749821301, 'epoch': 0.25}


 25%|██▍       | 4127/16798 [17:35<1:00:28,  3.49it/s]

{'loss': 1.9039, 'grad_norm': 2.3589048385620117, 'learning_rate': 0.00015095306171074576, 'epoch': 0.25}


 25%|██▍       | 4128/16798 [17:35<1:02:57,  3.35it/s]

{'loss': 1.723, 'grad_norm': 1.8364710807800293, 'learning_rate': 0.00015094114843936145, 'epoch': 0.25}


 25%|██▍       | 4129/16798 [17:35<58:23,  3.62it/s]  

{'loss': 1.1069, 'grad_norm': 1.5365004539489746, 'learning_rate': 0.0001509292351679771, 'epoch': 0.25}


 25%|██▍       | 4130/16798 [17:36<59:58,  3.52it/s]

{'loss': 1.3101, 'grad_norm': 1.7280337810516357, 'learning_rate': 0.00015091732189659283, 'epoch': 0.25}


 25%|██▍       | 4131/16798 [17:36<57:18,  3.68it/s]

{'loss': 1.8818, 'grad_norm': 1.9705123901367188, 'learning_rate': 0.0001509054086252085, 'epoch': 0.25}


 25%|██▍       | 4132/16798 [17:36<1:00:54,  3.47it/s]

{'loss': 1.657, 'grad_norm': 1.9424004554748535, 'learning_rate': 0.00015089349535382418, 'epoch': 0.25}


 25%|██▍       | 4133/16798 [17:36<57:48,  3.65it/s]  

{'loss': 1.4043, 'grad_norm': 1.67955482006073, 'learning_rate': 0.00015088158208243985, 'epoch': 0.25}


 25%|██▍       | 4134/16798 [17:37<59:22,  3.56it/s]

{'loss': 1.5095, 'grad_norm': 1.9599319696426392, 'learning_rate': 0.00015086966881105554, 'epoch': 0.25}


 25%|██▍       | 4136/16798 [17:37<52:02,  4.06it/s]

{'loss': 1.5041, 'grad_norm': 2.007791042327881, 'learning_rate': 0.0001508577555396712, 'epoch': 0.25}


 25%|██▍       | 4136/16798 [17:37<52:02,  4.06it/s]

{'loss': 1.3791, 'grad_norm': 1.7838690280914307, 'learning_rate': 0.0001508458422682869, 'epoch': 0.25}


 25%|██▍       | 4137/16798 [17:37<56:24,  3.74it/s]

{'loss': 1.5852, 'grad_norm': 1.8874036073684692, 'learning_rate': 0.00015083392899690256, 'epoch': 0.25}


 25%|██▍       | 4138/16798 [17:38<55:19,  3.81it/s]

{'loss': 1.3017, 'grad_norm': 1.8584760427474976, 'learning_rate': 0.00015082201572551825, 'epoch': 0.25}


 25%|██▍       | 4139/16798 [17:38<59:02,  3.57it/s]

{'loss': 1.5646, 'grad_norm': 1.8046154975891113, 'learning_rate': 0.0001508101024541339, 'epoch': 0.25}


 25%|██▍       | 4140/16798 [17:38<55:02,  3.83it/s]

{'loss': 0.6677, 'grad_norm': 1.3244370222091675, 'learning_rate': 0.0001507981891827496, 'epoch': 0.25}


 25%|██▍       | 4141/16798 [17:39<57:31,  3.67it/s]

{'loss': 1.3852, 'grad_norm': 1.7662461996078491, 'learning_rate': 0.00015078627591136526, 'epoch': 0.25}


 25%|██▍       | 4142/16798 [17:39<59:31,  3.54it/s]

{'loss': 1.2588, 'grad_norm': 1.536495327949524, 'learning_rate': 0.00015077436263998095, 'epoch': 0.25}


 25%|██▍       | 4143/16798 [17:39<57:14,  3.68it/s]

{'loss': 0.7865, 'grad_norm': 1.4827862977981567, 'learning_rate': 0.00015076244936859662, 'epoch': 0.25}


 25%|██▍       | 4145/16798 [17:40<55:02,  3.83it/s]

{'loss': 0.9987, 'grad_norm': 1.5211416482925415, 'learning_rate': 0.0001507505360972123, 'epoch': 0.25}


 25%|██▍       | 4145/16798 [17:40<55:02,  3.83it/s]

{'loss': 1.2405, 'grad_norm': 1.7455300092697144, 'learning_rate': 0.00015073862282582797, 'epoch': 0.25}


 25%|██▍       | 4146/16798 [17:40<59:05,  3.57it/s]

{'loss': 0.7916, 'grad_norm': 1.208268165588379, 'learning_rate': 0.00015072670955444366, 'epoch': 0.25}


 25%|██▍       | 4147/16798 [17:40<56:33,  3.73it/s]

{'loss': 1.0457, 'grad_norm': 1.8025773763656616, 'learning_rate': 0.00015071479628305933, 'epoch': 0.25}


 25%|██▍       | 4148/16798 [17:40<58:23,  3.61it/s]

{'loss': 0.5551, 'grad_norm': 1.1381603479385376, 'learning_rate': 0.00015070288301167502, 'epoch': 0.25}


 25%|██▍       | 4149/16798 [17:41<1:01:39,  3.42it/s]

{'loss': 0.7969, 'grad_norm': 1.292643666267395, 'learning_rate': 0.00015069096974029068, 'epoch': 0.25}


 25%|██▍       | 4150/16798 [17:41<59:39,  3.53it/s]  

{'loss': 0.4229, 'grad_norm': 1.0408661365509033, 'learning_rate': 0.00015067905646890637, 'epoch': 0.25}


 25%|██▍       | 4151/16798 [17:41<59:15,  3.56it/s]

{'loss': 1.5874, 'grad_norm': 2.02296781539917, 'learning_rate': 0.00015066714319752204, 'epoch': 0.25}


 25%|██▍       | 4152/16798 [17:42<59:10,  3.56it/s]

{'loss': 2.2691, 'grad_norm': 2.1285903453826904, 'learning_rate': 0.00015065522992613773, 'epoch': 0.25}


 25%|██▍       | 4153/16798 [17:42<1:00:55,  3.46it/s]

{'loss': 2.4894, 'grad_norm': 2.1349220275878906, 'learning_rate': 0.0001506433166547534, 'epoch': 0.25}


 25%|██▍       | 4154/16798 [17:42<58:00,  3.63it/s]  

{'loss': 1.9819, 'grad_norm': 2.026684284210205, 'learning_rate': 0.00015063140338336908, 'epoch': 0.25}


 25%|██▍       | 4155/16798 [17:43<1:01:09,  3.45it/s]

{'loss': 1.887, 'grad_norm': 2.3762757778167725, 'learning_rate': 0.00015061949011198474, 'epoch': 0.25}


 25%|██▍       | 4156/16798 [17:43<57:01,  3.69it/s]  

{'loss': 1.62, 'grad_norm': 1.6629706621170044, 'learning_rate': 0.00015060757684060044, 'epoch': 0.25}


 25%|██▍       | 4157/16798 [17:43<56:26,  3.73it/s]

{'loss': 2.128, 'grad_norm': 2.2102975845336914, 'learning_rate': 0.0001505956635692161, 'epoch': 0.25}


 25%|██▍       | 4158/16798 [17:43<58:38,  3.59it/s]

{'loss': 1.6651, 'grad_norm': 1.7757339477539062, 'learning_rate': 0.0001505837502978318, 'epoch': 0.25}


 25%|██▍       | 4159/16798 [17:44<55:49,  3.77it/s]

{'loss': 1.5777, 'grad_norm': 1.9944099187850952, 'learning_rate': 0.00015057183702644745, 'epoch': 0.25}


 25%|██▍       | 4160/16798 [17:44<59:32,  3.54it/s]

{'loss': 1.7943, 'grad_norm': 1.8324847221374512, 'learning_rate': 0.00015055992375506314, 'epoch': 0.25}


 25%|██▍       | 4161/16798 [17:44<57:33,  3.66it/s]

{'loss': 2.1949, 'grad_norm': 2.027766227722168, 'learning_rate': 0.00015054801048367884, 'epoch': 0.25}


 25%|██▍       | 4162/16798 [17:44<58:53,  3.58it/s]

{'loss': 1.6289, 'grad_norm': 1.7914800643920898, 'learning_rate': 0.00015053609721229453, 'epoch': 0.25}


 25%|██▍       | 4163/16798 [17:45<56:23,  3.73it/s]

{'loss': 1.448, 'grad_norm': 1.7846990823745728, 'learning_rate': 0.0001505241839409102, 'epoch': 0.25}


 25%|██▍       | 4164/16798 [17:45<58:27,  3.60it/s]

{'loss': 1.6523, 'grad_norm': 1.8358269929885864, 'learning_rate': 0.00015051227066952588, 'epoch': 0.25}


 25%|██▍       | 4165/16798 [17:45<57:17,  3.68it/s]

{'loss': 1.5076, 'grad_norm': 2.0491254329681396, 'learning_rate': 0.00015050035739814154, 'epoch': 0.25}


 25%|██▍       | 4166/16798 [17:46<1:02:23,  3.37it/s]

{'loss': 1.6033, 'grad_norm': 1.9818387031555176, 'learning_rate': 0.00015048844412675723, 'epoch': 0.25}


 25%|██▍       | 4167/16798 [17:46<1:02:02,  3.39it/s]

{'loss': 1.6874, 'grad_norm': 2.2037055492401123, 'learning_rate': 0.0001504765308553729, 'epoch': 0.25}


 25%|██▍       | 4168/16798 [17:46<1:03:50,  3.30it/s]

{'loss': 1.4416, 'grad_norm': 1.7724568843841553, 'learning_rate': 0.0001504646175839886, 'epoch': 0.25}


 25%|██▍       | 4169/16798 [17:46<1:01:56,  3.40it/s]

{'loss': 1.4697, 'grad_norm': 1.6148970127105713, 'learning_rate': 0.00015045270431260425, 'epoch': 0.25}


 25%|██▍       | 4170/16798 [17:47<1:04:36,  3.26it/s]

{'loss': 1.7623, 'grad_norm': 1.9889767169952393, 'learning_rate': 0.00015044079104121994, 'epoch': 0.25}


 25%|██▍       | 4171/16798 [17:47<58:44,  3.58it/s]  

{'loss': 1.8708, 'grad_norm': 2.2318873405456543, 'learning_rate': 0.0001504288777698356, 'epoch': 0.25}


 25%|██▍       | 4172/16798 [17:47<55:27,  3.79it/s]

{'loss': 2.2472, 'grad_norm': 3.883376359939575, 'learning_rate': 0.0001504169644984513, 'epoch': 0.25}


 25%|██▍       | 4173/16798 [17:48<58:13,  3.61it/s]

{'loss': 1.2542, 'grad_norm': 1.7394003868103027, 'learning_rate': 0.00015040505122706696, 'epoch': 0.25}


 25%|██▍       | 4174/16798 [17:48<59:17,  3.55it/s]

{'loss': 1.6818, 'grad_norm': 1.9467430114746094, 'learning_rate': 0.00015039313795568265, 'epoch': 0.25}


 25%|██▍       | 4175/16798 [17:48<1:02:41,  3.36it/s]

{'loss': 1.52, 'grad_norm': 1.8310421705245972, 'learning_rate': 0.00015038122468429832, 'epoch': 0.25}


 25%|██▍       | 4176/16798 [17:48<58:44,  3.58it/s]  

{'loss': 1.3197, 'grad_norm': 1.8763571977615356, 'learning_rate': 0.000150369311412914, 'epoch': 0.25}


 25%|██▍       | 4177/16798 [17:49<59:53,  3.51it/s]

{'loss': 1.5277, 'grad_norm': 2.080559730529785, 'learning_rate': 0.00015035739814152967, 'epoch': 0.25}


 25%|██▍       | 4178/16798 [17:49<55:59,  3.76it/s]

{'loss': 1.4877, 'grad_norm': 1.873096227645874, 'learning_rate': 0.00015034548487014536, 'epoch': 0.25}


 25%|██▍       | 4179/16798 [17:49<58:38,  3.59it/s]

{'loss': 1.1693, 'grad_norm': 1.6758891344070435, 'learning_rate': 0.00015033357159876102, 'epoch': 0.25}


 25%|██▍       | 4180/16798 [17:49<56:21,  3.73it/s]

{'loss': 1.7342, 'grad_norm': 2.299631118774414, 'learning_rate': 0.0001503216583273767, 'epoch': 0.25}


 25%|██▍       | 4181/16798 [17:50<59:11,  3.55it/s]

{'loss': 1.2002, 'grad_norm': 2.193317174911499, 'learning_rate': 0.00015030974505599238, 'epoch': 0.25}


 25%|██▍       | 4182/16798 [17:50<56:03,  3.75it/s]

{'loss': 1.4782, 'grad_norm': 1.9668757915496826, 'learning_rate': 0.00015029783178460804, 'epoch': 0.25}


 25%|██▍       | 4183/16798 [17:50<1:00:13,  3.49it/s]

{'loss': 1.4352, 'grad_norm': 1.879144310951233, 'learning_rate': 0.00015028591851322373, 'epoch': 0.25}


 25%|██▍       | 4184/16798 [17:51<56:34,  3.72it/s]  

{'loss': 0.8637, 'grad_norm': 1.581636667251587, 'learning_rate': 0.0001502740052418394, 'epoch': 0.25}


 25%|██▍       | 4185/16798 [17:51<57:03,  3.68it/s]

{'loss': 2.1562, 'grad_norm': 2.525744676589966, 'learning_rate': 0.0001502620919704551, 'epoch': 0.25}


 25%|██▍       | 4186/16798 [17:51<57:41,  3.64it/s]

{'loss': 1.2143, 'grad_norm': 1.5939922332763672, 'learning_rate': 0.00015025017869907075, 'epoch': 0.25}


 25%|██▍       | 4187/16798 [17:51<1:02:20,  3.37it/s]

{'loss': 1.229, 'grad_norm': 1.724399209022522, 'learning_rate': 0.00015023826542768644, 'epoch': 0.25}


 25%|██▍       | 4188/16798 [17:52<1:00:27,  3.48it/s]

{'loss': 1.5031, 'grad_norm': 2.1702067852020264, 'learning_rate': 0.0001502263521563021, 'epoch': 0.25}


 25%|██▍       | 4189/16798 [17:52<58:06,  3.62it/s]  

{'loss': 1.0633, 'grad_norm': 1.6381514072418213, 'learning_rate': 0.0001502144388849178, 'epoch': 0.25}


 25%|██▍       | 4190/16798 [17:52<59:48,  3.51it/s]

{'loss': 1.2593, 'grad_norm': 1.7063733339309692, 'learning_rate': 0.00015020252561353346, 'epoch': 0.25}


 25%|██▍       | 4191/16798 [17:53<1:02:09,  3.38it/s]

{'loss': 1.7417, 'grad_norm': 1.7932536602020264, 'learning_rate': 0.00015019061234214915, 'epoch': 0.25}


 25%|██▍       | 4193/16798 [17:53<53:11,  3.95it/s]  

{'loss': 0.9965, 'grad_norm': 1.5017670392990112, 'learning_rate': 0.00015017869907076484, 'epoch': 0.25}


 25%|██▍       | 4193/16798 [17:53<53:11,  3.95it/s]

{'loss': 1.2877, 'grad_norm': 2.0053961277008057, 'learning_rate': 0.00015016678579938053, 'epoch': 0.25}


 25%|██▍       | 4194/16798 [17:53<50:48,  4.13it/s]

{'loss': 1.1578, 'grad_norm': 1.552554965019226, 'learning_rate': 0.0001501548725279962, 'epoch': 0.25}


 25%|██▍       | 4195/16798 [17:54<57:32,  3.65it/s]

{'loss': 1.1196, 'grad_norm': 1.7721872329711914, 'learning_rate': 0.0001501429592566119, 'epoch': 0.25}


 25%|██▍       | 4197/16798 [17:54<51:09,  4.10it/s]

{'loss': 0.6543, 'grad_norm': 1.1658228635787964, 'learning_rate': 0.00015013104598522755, 'epoch': 0.25}


 25%|██▍       | 4197/16798 [17:54<51:09,  4.10it/s]

{'loss': 0.852, 'grad_norm': 1.4287968873977661, 'learning_rate': 0.00015011913271384324, 'epoch': 0.25}


 25%|██▍       | 4198/16798 [17:54<52:45,  3.98it/s]

{'loss': 0.7548, 'grad_norm': 1.3834880590438843, 'learning_rate': 0.0001501072194424589, 'epoch': 0.25}


 25%|██▍       | 4199/16798 [17:55<54:14,  3.87it/s]

{'loss': 0.4014, 'grad_norm': 0.9950962662696838, 'learning_rate': 0.0001500953061710746, 'epoch': 0.25}


 25%|██▌       | 4200/16798 [17:55<58:26,  3.59it/s]

{'loss': 0.5582, 'grad_norm': 1.2184621095657349, 'learning_rate': 0.00015008339289969026, 'epoch': 0.25}


 25%|██▌       | 4201/16798 [17:55<55:55,  3.75it/s]

{'loss': 2.4491, 'grad_norm': 1.880825400352478, 'learning_rate': 0.00015007147962830595, 'epoch': 0.25}


 25%|██▌       | 4202/16798 [17:55<59:53,  3.50it/s]

{'loss': 1.6868, 'grad_norm': 1.67360258102417, 'learning_rate': 0.00015005956635692161, 'epoch': 0.25}


 25%|██▌       | 4203/16798 [17:56<57:44,  3.64it/s]

{'loss': 1.7232, 'grad_norm': 2.695051431655884, 'learning_rate': 0.0001500476530855373, 'epoch': 0.25}


 25%|██▌       | 4204/16798 [17:56<59:24,  3.53it/s]

{'loss': 1.973, 'grad_norm': 1.7607872486114502, 'learning_rate': 0.00015003573981415297, 'epoch': 0.25}


 25%|██▌       | 4205/16798 [17:56<55:35,  3.78it/s]

{'loss': 2.0363, 'grad_norm': 1.6583467721939087, 'learning_rate': 0.00015002382654276866, 'epoch': 0.25}


 25%|██▌       | 4206/16798 [17:56<53:03,  3.96it/s]

{'loss': 2.3369, 'grad_norm': 2.003591299057007, 'learning_rate': 0.00015001191327138432, 'epoch': 0.25}


 25%|██▌       | 4207/16798 [17:57<59:40,  3.52it/s]

{'loss': 2.1357, 'grad_norm': 1.96989905834198, 'learning_rate': 0.00015000000000000001, 'epoch': 0.25}


 25%|██▌       | 4208/16798 [17:57<56:24,  3.72it/s]

{'loss': 1.8804, 'grad_norm': 1.8591917753219604, 'learning_rate': 0.00014998808672861568, 'epoch': 0.25}


 25%|██▌       | 4209/16798 [17:57<58:56,  3.56it/s]

{'loss': 1.8633, 'grad_norm': 2.136086940765381, 'learning_rate': 0.00014997617345723137, 'epoch': 0.25}


 25%|██▌       | 4210/16798 [17:58<55:09,  3.80it/s]

{'loss': 1.661, 'grad_norm': 2.567390203475952, 'learning_rate': 0.00014996426018584703, 'epoch': 0.25}


 25%|██▌       | 4211/16798 [17:58<59:40,  3.51it/s]

{'loss': 1.5865, 'grad_norm': 1.5579909086227417, 'learning_rate': 0.00014995234691446272, 'epoch': 0.25}


 25%|██▌       | 4212/16798 [17:58<56:24,  3.72it/s]

{'loss': 1.919, 'grad_norm': 1.9630120992660522, 'learning_rate': 0.0001499404336430784, 'epoch': 0.25}


 25%|██▌       | 4213/16798 [17:58<59:06,  3.55it/s]

{'loss': 1.5329, 'grad_norm': 1.7049154043197632, 'learning_rate': 0.00014992852037169408, 'epoch': 0.25}


 25%|██▌       | 4214/16798 [17:59<55:57,  3.75it/s]

{'loss': 1.6516, 'grad_norm': 1.7344142198562622, 'learning_rate': 0.00014991660710030974, 'epoch': 0.25}


 25%|██▌       | 4215/16798 [17:59<58:17,  3.60it/s]

{'loss': 1.5266, 'grad_norm': 2.005784273147583, 'learning_rate': 0.00014990469382892543, 'epoch': 0.25}


 25%|██▌       | 4216/16798 [17:59<54:29,  3.85it/s]

{'loss': 1.6763, 'grad_norm': 1.6640764474868774, 'learning_rate': 0.0001498927805575411, 'epoch': 0.25}


 25%|██▌       | 4217/16798 [18:00<57:43,  3.63it/s]

{'loss': 1.3438, 'grad_norm': 1.8060096502304077, 'learning_rate': 0.00014988086728615679, 'epoch': 0.25}


 25%|██▌       | 4218/16798 [18:00<58:55,  3.56it/s]

{'loss': 1.9795, 'grad_norm': 1.9482053518295288, 'learning_rate': 0.00014986895401477245, 'epoch': 0.25}


 25%|██▌       | 4219/16798 [18:00<1:01:08,  3.43it/s]

{'loss': 1.4454, 'grad_norm': 1.8083291053771973, 'learning_rate': 0.00014985704074338814, 'epoch': 0.25}


 25%|██▌       | 4220/16798 [18:00<57:42,  3.63it/s]  

{'loss': 1.6356, 'grad_norm': 2.04648756980896, 'learning_rate': 0.0001498451274720038, 'epoch': 0.25}


 25%|██▌       | 4221/16798 [18:01<1:02:03,  3.38it/s]

{'loss': 2.0357, 'grad_norm': 2.0225014686584473, 'learning_rate': 0.0001498332142006195, 'epoch': 0.25}


 25%|██▌       | 4222/16798 [18:01<57:58,  3.62it/s]  

{'loss': 1.5721, 'grad_norm': 1.9130659103393555, 'learning_rate': 0.00014982130092923516, 'epoch': 0.25}


 25%|██▌       | 4223/16798 [18:01<59:58,  3.49it/s]

{'loss': 1.5775, 'grad_norm': 2.1176693439483643, 'learning_rate': 0.00014980938765785088, 'epoch': 0.25}


 25%|██▌       | 4224/16798 [18:02<57:09,  3.67it/s]

{'loss': 1.461, 'grad_norm': 1.6919771432876587, 'learning_rate': 0.00014979747438646654, 'epoch': 0.25}


 25%|██▌       | 4225/16798 [18:02<59:11,  3.54it/s]

{'loss': 1.5225, 'grad_norm': 1.7338358163833618, 'learning_rate': 0.00014978556111508223, 'epoch': 0.25}


 25%|██▌       | 4226/16798 [18:02<57:44,  3.63it/s]

{'loss': 1.3384, 'grad_norm': 1.7403838634490967, 'learning_rate': 0.0001497736478436979, 'epoch': 0.25}


 25%|██▌       | 4227/16798 [18:02<57:55,  3.62it/s]

{'loss': 1.5538, 'grad_norm': 2.2048521041870117, 'learning_rate': 0.00014976173457231359, 'epoch': 0.25}


 25%|██▌       | 4228/16798 [18:03<55:06,  3.80it/s]

{'loss': 2.0017, 'grad_norm': 2.0612175464630127, 'learning_rate': 0.00014974982130092925, 'epoch': 0.25}


 25%|██▌       | 4229/16798 [18:03<57:54,  3.62it/s]

{'loss': 1.4459, 'grad_norm': 1.8601804971694946, 'learning_rate': 0.00014973790802954494, 'epoch': 0.25}


 25%|██▌       | 4230/16798 [18:03<55:22,  3.78it/s]

{'loss': 2.0102, 'grad_norm': 2.2713582515716553, 'learning_rate': 0.0001497259947581606, 'epoch': 0.25}


 25%|██▌       | 4231/16798 [18:03<57:46,  3.63it/s]

{'loss': 1.5368, 'grad_norm': 1.9024510383605957, 'learning_rate': 0.0001497140814867763, 'epoch': 0.25}


 25%|██▌       | 4232/16798 [18:04<56:37,  3.70it/s]

{'loss': 1.4265, 'grad_norm': 1.6755754947662354, 'learning_rate': 0.00014970216821539196, 'epoch': 0.25}


 25%|██▌       | 4233/16798 [18:04<1:00:33,  3.46it/s]

{'loss': 1.3177, 'grad_norm': 1.7138521671295166, 'learning_rate': 0.00014969025494400765, 'epoch': 0.25}


 25%|██▌       | 4234/16798 [18:04<1:00:59,  3.43it/s]

{'loss': 1.4625, 'grad_norm': 1.7965294122695923, 'learning_rate': 0.0001496783416726233, 'epoch': 0.25}


 25%|██▌       | 4235/16798 [18:05<59:44,  3.50it/s]  

{'loss': 1.4075, 'grad_norm': 1.8228551149368286, 'learning_rate': 0.000149666428401239, 'epoch': 0.25}


 25%|██▌       | 4236/16798 [18:05<57:21,  3.65it/s]

{'loss': 1.7495, 'grad_norm': 1.7710210084915161, 'learning_rate': 0.00014965451512985467, 'epoch': 0.25}


 25%|██▌       | 4237/16798 [18:05<57:16,  3.65it/s]

{'loss': 1.3578, 'grad_norm': 1.8220226764678955, 'learning_rate': 0.00014964260185847033, 'epoch': 0.25}


 25%|██▌       | 4238/16798 [18:05<55:20,  3.78it/s]

{'loss': 1.4326, 'grad_norm': 1.7233942747116089, 'learning_rate': 0.00014963068858708602, 'epoch': 0.25}


 25%|██▌       | 4239/16798 [18:06<59:19,  3.53it/s]

{'loss': 1.269, 'grad_norm': 2.255527973175049, 'learning_rate': 0.00014961877531570168, 'epoch': 0.25}


 25%|██▌       | 4240/16798 [18:06<56:40,  3.69it/s]

{'loss': 1.8335, 'grad_norm': 1.9610143899917603, 'learning_rate': 0.00014960686204431738, 'epoch': 0.25}


 25%|██▌       | 4241/16798 [18:06<58:54,  3.55it/s]

{'loss': 0.7958, 'grad_norm': 1.3471788167953491, 'learning_rate': 0.00014959494877293304, 'epoch': 0.25}


 25%|██▌       | 4242/16798 [18:06<56:15,  3.72it/s]

{'loss': 1.6113, 'grad_norm': 2.0833935737609863, 'learning_rate': 0.00014958303550154873, 'epoch': 0.25}


 25%|██▌       | 4243/16798 [18:07<58:12,  3.60it/s]

{'loss': 1.1829, 'grad_norm': 1.7160537242889404, 'learning_rate': 0.0001495711222301644, 'epoch': 0.25}


 25%|██▌       | 4244/16798 [18:07<55:05,  3.80it/s]

{'loss': 0.9512, 'grad_norm': 1.4815069437026978, 'learning_rate': 0.00014955920895878008, 'epoch': 0.25}


 25%|██▌       | 4245/16798 [18:07<58:03,  3.60it/s]

{'loss': 1.1416, 'grad_norm': 1.6344330310821533, 'learning_rate': 0.00014954729568739575, 'epoch': 0.25}


 25%|██▌       | 4246/16798 [18:08<56:31,  3.70it/s]

{'loss': 1.1466, 'grad_norm': 1.506378412246704, 'learning_rate': 0.00014953538241601144, 'epoch': 0.25}


 25%|██▌       | 4247/16798 [18:08<56:01,  3.73it/s]

{'loss': 0.5281, 'grad_norm': 0.9578277468681335, 'learning_rate': 0.0001495234691446271, 'epoch': 0.25}


 25%|██▌       | 4248/16798 [18:08<52:53,  3.95it/s]

{'loss': 0.7138, 'grad_norm': 1.3509749174118042, 'learning_rate': 0.0001495115558732428, 'epoch': 0.25}


 25%|██▌       | 4249/16798 [18:08<50:56,  4.11it/s]

{'loss': 0.5214, 'grad_norm': 1.1933361291885376, 'learning_rate': 0.00014949964260185846, 'epoch': 0.25}


 25%|██▌       | 4250/16798 [18:09<55:04,  3.80it/s]

{'loss': 0.2589, 'grad_norm': 0.6956731677055359, 'learning_rate': 0.00014948772933047415, 'epoch': 0.25}


 25%|██▌       | 4251/16798 [18:09<53:19,  3.92it/s]

{'loss': 2.285, 'grad_norm': 1.9293491840362549, 'learning_rate': 0.0001494758160590898, 'epoch': 0.25}


 25%|██▌       | 4252/16798 [18:09<57:08,  3.66it/s]

{'loss': 1.7171, 'grad_norm': 1.8720253705978394, 'learning_rate': 0.0001494639027877055, 'epoch': 0.25}


 25%|██▌       | 4253/16798 [18:09<58:14,  3.59it/s]

{'loss': 1.8077, 'grad_norm': 1.7295557260513306, 'learning_rate': 0.00014945198951632117, 'epoch': 0.25}


 25%|██▌       | 4254/16798 [18:10<1:00:28,  3.46it/s]

{'loss': 2.1796, 'grad_norm': 1.967511773109436, 'learning_rate': 0.00014944007624493688, 'epoch': 0.25}


 25%|██▌       | 4255/16798 [18:10<58:12,  3.59it/s]  

{'loss': 2.2249, 'grad_norm': 2.1963117122650146, 'learning_rate': 0.00014942816297355255, 'epoch': 0.25}


 25%|██▌       | 4256/16798 [18:10<59:38,  3.50it/s]

{'loss': 2.2029, 'grad_norm': 2.0753846168518066, 'learning_rate': 0.00014941624970216824, 'epoch': 0.25}


 25%|██▌       | 4257/16798 [18:11<56:14,  3.72it/s]

{'loss': 1.8742, 'grad_norm': 5.145107269287109, 'learning_rate': 0.0001494043364307839, 'epoch': 0.25}


 25%|██▌       | 4258/16798 [18:11<58:43,  3.56it/s]

{'loss': 1.3907, 'grad_norm': 2.5909767150878906, 'learning_rate': 0.0001493924231593996, 'epoch': 0.25}


 25%|██▌       | 4259/16798 [18:11<55:41,  3.75it/s]

{'loss': 1.4746, 'grad_norm': 2.0076634883880615, 'learning_rate': 0.00014938050988801526, 'epoch': 0.25}


 25%|██▌       | 4260/16798 [18:11<58:19,  3.58it/s]

{'loss': 2.06, 'grad_norm': 2.499096632003784, 'learning_rate': 0.00014936859661663095, 'epoch': 0.25}


 25%|██▌       | 4261/16798 [18:12<56:49,  3.68it/s]

{'loss': 1.5521, 'grad_norm': 1.7405593395233154, 'learning_rate': 0.0001493566833452466, 'epoch': 0.25}


 25%|██▌       | 4262/16798 [18:12<58:22,  3.58it/s]

{'loss': 1.5734, 'grad_norm': 1.8644758462905884, 'learning_rate': 0.0001493447700738623, 'epoch': 0.25}


 25%|██▌       | 4263/16798 [18:12<55:37,  3.76it/s]

{'loss': 1.51, 'grad_norm': 1.8101177215576172, 'learning_rate': 0.00014933285680247796, 'epoch': 0.25}


 25%|██▌       | 4264/16798 [18:12<57:24,  3.64it/s]

{'loss': 1.5216, 'grad_norm': 1.8754113912582397, 'learning_rate': 0.00014932094353109366, 'epoch': 0.25}


 25%|██▌       | 4265/16798 [18:13<56:12,  3.72it/s]

{'loss': 1.8265, 'grad_norm': 1.8522764444351196, 'learning_rate': 0.00014930903025970932, 'epoch': 0.25}


 25%|██▌       | 4266/16798 [18:13<57:53,  3.61it/s]

{'loss': 1.8197, 'grad_norm': 2.1585052013397217, 'learning_rate': 0.000149297116988325, 'epoch': 0.25}


 25%|██▌       | 4267/16798 [18:13<54:06,  3.86it/s]

{'loss': 1.606, 'grad_norm': 1.9796526432037354, 'learning_rate': 0.00014928520371694067, 'epoch': 0.25}


 25%|██▌       | 4268/16798 [18:13<51:02,  4.09it/s]

{'loss': 1.9134, 'grad_norm': 2.0993213653564453, 'learning_rate': 0.00014927329044555636, 'epoch': 0.25}


 25%|██▌       | 4269/16798 [18:14<55:41,  3.75it/s]

{'loss': 1.7572, 'grad_norm': 1.8505858182907104, 'learning_rate': 0.00014926137717417203, 'epoch': 0.25}


 25%|██▌       | 4270/16798 [18:14<55:22,  3.77it/s]

{'loss': 1.4586, 'grad_norm': 1.714242935180664, 'learning_rate': 0.00014924946390278772, 'epoch': 0.25}


 25%|██▌       | 4271/16798 [18:14<59:45,  3.49it/s]

{'loss': 1.2843, 'grad_norm': 1.5002135038375854, 'learning_rate': 0.00014923755063140338, 'epoch': 0.25}


 25%|██▌       | 4272/16798 [18:15<55:02,  3.79it/s]

{'loss': 1.8231, 'grad_norm': 1.9930062294006348, 'learning_rate': 0.00014922563736001907, 'epoch': 0.25}


 25%|██▌       | 4273/16798 [18:15<53:15,  3.92it/s]

{'loss': 1.4067, 'grad_norm': 1.8379908800125122, 'learning_rate': 0.00014921372408863474, 'epoch': 0.25}


 25%|██▌       | 4274/16798 [18:15<53:57,  3.87it/s]

{'loss': 2.0034, 'grad_norm': 2.100828170776367, 'learning_rate': 0.00014920181081725043, 'epoch': 0.25}


 25%|██▌       | 4275/16798 [18:15<1:02:02,  3.36it/s]

{'loss': 1.2436, 'grad_norm': 1.5922428369522095, 'learning_rate': 0.0001491898975458661, 'epoch': 0.25}


 25%|██▌       | 4276/16798 [18:16<57:51,  3.61it/s]  

{'loss': 1.048, 'grad_norm': 1.5996371507644653, 'learning_rate': 0.00014917798427448178, 'epoch': 0.25}


 25%|██▌       | 4277/16798 [18:16<56:51,  3.67it/s]

{'loss': 1.6171, 'grad_norm': 2.0779407024383545, 'learning_rate': 0.00014916607100309745, 'epoch': 0.25}


 25%|██▌       | 4278/16798 [18:16<59:06,  3.53it/s]

{'loss': 1.3944, 'grad_norm': 1.800900936126709, 'learning_rate': 0.00014915415773171314, 'epoch': 0.25}


 25%|██▌       | 4279/16798 [18:16<55:29,  3.76it/s]

{'loss': 1.1679, 'grad_norm': 1.6975661516189575, 'learning_rate': 0.0001491422444603288, 'epoch': 0.25}


 25%|██▌       | 4280/16798 [18:17<1:00:23,  3.45it/s]

{'loss': 1.8056, 'grad_norm': 2.028470516204834, 'learning_rate': 0.0001491303311889445, 'epoch': 0.25}


 25%|██▌       | 4281/16798 [18:17<56:08,  3.72it/s]  

{'loss': 1.5021, 'grad_norm': 1.893455147743225, 'learning_rate': 0.00014911841791756015, 'epoch': 0.25}


 25%|██▌       | 4282/16798 [18:17<55:18,  3.77it/s]

{'loss': 1.4662, 'grad_norm': 2.2016944885253906, 'learning_rate': 0.00014910650464617585, 'epoch': 0.25}


 25%|██▌       | 4283/16798 [18:18<56:35,  3.69it/s]

{'loss': 1.7856, 'grad_norm': 2.2321205139160156, 'learning_rate': 0.0001490945913747915, 'epoch': 0.25}


 26%|██▌       | 4284/16798 [18:18<59:22,  3.51it/s]

{'loss': 1.4788, 'grad_norm': 2.1300010681152344, 'learning_rate': 0.0001490826781034072, 'epoch': 0.26}


 26%|██▌       | 4285/16798 [18:18<58:47,  3.55it/s]

{'loss': 1.078, 'grad_norm': 1.4799602031707764, 'learning_rate': 0.0001490707648320229, 'epoch': 0.26}


 26%|██▌       | 4286/16798 [18:18<1:00:22,  3.45it/s]

{'loss': 1.4326, 'grad_norm': 1.8630110025405884, 'learning_rate': 0.00014905885156063858, 'epoch': 0.26}


 26%|██▌       | 4287/16798 [18:19<56:56,  3.66it/s]  

{'loss': 1.2476, 'grad_norm': 1.6408623456954956, 'learning_rate': 0.00014904693828925425, 'epoch': 0.26}


 26%|██▌       | 4288/16798 [18:19<58:19,  3.57it/s]

{'loss': 1.4659, 'grad_norm': 1.9176857471466064, 'learning_rate': 0.00014903502501786994, 'epoch': 0.26}


 26%|██▌       | 4289/16798 [18:19<55:34,  3.75it/s]

{'loss': 1.491, 'grad_norm': 2.2132937908172607, 'learning_rate': 0.0001490231117464856, 'epoch': 0.26}


 26%|██▌       | 4290/16798 [18:20<57:05,  3.65it/s]

{'loss': 0.9905, 'grad_norm': 1.4374805688858032, 'learning_rate': 0.0001490111984751013, 'epoch': 0.26}


 26%|██▌       | 4291/16798 [18:20<55:19,  3.77it/s]

{'loss': 1.0223, 'grad_norm': 1.9215151071548462, 'learning_rate': 0.00014899928520371695, 'epoch': 0.26}


 26%|██▌       | 4292/16798 [18:20<58:49,  3.54it/s]

{'loss': 1.3681, 'grad_norm': 2.2329630851745605, 'learning_rate': 0.00014898737193233264, 'epoch': 0.26}


 26%|██▌       | 4293/16798 [18:20<56:43,  3.67it/s]

{'loss': 1.0678, 'grad_norm': 1.7009321451187134, 'learning_rate': 0.0001489754586609483, 'epoch': 0.26}


 26%|██▌       | 4294/16798 [18:21<1:00:57,  3.42it/s]

{'loss': 1.4479, 'grad_norm': 2.1091039180755615, 'learning_rate': 0.00014896354538956397, 'epoch': 0.26}


 26%|██▌       | 4295/16798 [18:21<58:38,  3.55it/s]  

{'loss': 0.7489, 'grad_norm': 1.221664547920227, 'learning_rate': 0.00014895163211817966, 'epoch': 0.26}


 26%|██▌       | 4297/16798 [18:21<54:21,  3.83it/s]

{'loss': 0.669, 'grad_norm': 1.326246738433838, 'learning_rate': 0.00014893971884679533, 'epoch': 0.26}


 26%|██▌       | 4297/16798 [18:21<54:21,  3.83it/s]

{'loss': 0.6391, 'grad_norm': 1.115101933479309, 'learning_rate': 0.00014892780557541102, 'epoch': 0.26}


 26%|██▌       | 4298/16798 [18:22<51:28,  4.05it/s]

{'loss': 0.9174, 'grad_norm': 1.71989905834198, 'learning_rate': 0.00014891589230402668, 'epoch': 0.26}


 26%|██▌       | 4299/16798 [18:22<56:40,  3.68it/s]

{'loss': 0.943, 'grad_norm': 1.5747649669647217, 'learning_rate': 0.00014890397903264237, 'epoch': 0.26}


 26%|██▌       | 4300/16798 [18:22<53:17,  3.91it/s]

{'loss': 0.3147, 'grad_norm': 0.7456916570663452, 'learning_rate': 0.00014889206576125804, 'epoch': 0.26}


 26%|██▌       | 4301/16798 [18:22<51:12,  4.07it/s]

{'loss': 2.0124, 'grad_norm': 2.2242355346679688, 'learning_rate': 0.00014888015248987373, 'epoch': 0.26}


 26%|██▌       | 4302/16798 [18:23<56:21,  3.70it/s]

{'loss': 1.7349, 'grad_norm': 1.9816386699676514, 'learning_rate': 0.0001488682392184894, 'epoch': 0.26}


 26%|██▌       | 4303/16798 [18:23<54:58,  3.79it/s]

{'loss': 2.1237, 'grad_norm': 1.9931864738464355, 'learning_rate': 0.00014885632594710508, 'epoch': 0.26}


 26%|██▌       | 4304/16798 [18:23<58:38,  3.55it/s]

{'loss': 2.2202, 'grad_norm': 2.159839391708374, 'learning_rate': 0.00014884441267572074, 'epoch': 0.26}


 26%|██▌       | 4305/16798 [18:24<1:00:09,  3.46it/s]

{'loss': 2.0071, 'grad_norm': 2.054177761077881, 'learning_rate': 0.00014883249940433643, 'epoch': 0.26}


 26%|██▌       | 4306/16798 [18:24<1:02:34,  3.33it/s]

{'loss': 1.7112, 'grad_norm': 1.8651409149169922, 'learning_rate': 0.0001488205861329521, 'epoch': 0.26}


 26%|██▌       | 4307/16798 [18:24<57:20,  3.63it/s]  

{'loss': 1.6716, 'grad_norm': 1.74522066116333, 'learning_rate': 0.0001488086728615678, 'epoch': 0.26}


 26%|██▌       | 4308/16798 [18:24<59:53,  3.48it/s]

{'loss': 1.5237, 'grad_norm': 2.181225299835205, 'learning_rate': 0.00014879675959018345, 'epoch': 0.26}


 26%|██▌       | 4309/16798 [18:25<57:37,  3.61it/s]

{'loss': 1.2869, 'grad_norm': 1.7490496635437012, 'learning_rate': 0.00014878484631879914, 'epoch': 0.26}


 26%|██▌       | 4310/16798 [18:25<1:01:36,  3.38it/s]

{'loss': 1.7991, 'grad_norm': 1.9690877199172974, 'learning_rate': 0.0001487729330474148, 'epoch': 0.26}


 26%|██▌       | 4311/16798 [18:25<56:38,  3.67it/s]  

{'loss': 1.6674, 'grad_norm': 1.9688096046447754, 'learning_rate': 0.0001487610197760305, 'epoch': 0.26}


 26%|██▌       | 4312/16798 [18:26<57:26,  3.62it/s]

{'loss': 1.5369, 'grad_norm': 2.0526468753814697, 'learning_rate': 0.00014874910650464616, 'epoch': 0.26}


 26%|██▌       | 4313/16798 [18:26<55:53,  3.72it/s]

{'loss': 1.8538, 'grad_norm': 2.1837637424468994, 'learning_rate': 0.00014873719323326185, 'epoch': 0.26}


 26%|██▌       | 4314/16798 [18:26<58:51,  3.54it/s]

{'loss': 1.6925, 'grad_norm': 2.3289411067962646, 'learning_rate': 0.00014872527996187752, 'epoch': 0.26}


 26%|██▌       | 4315/16798 [18:26<56:08,  3.71it/s]

{'loss': 1.9838, 'grad_norm': 2.064239025115967, 'learning_rate': 0.0001487133666904932, 'epoch': 0.26}


 26%|██▌       | 4317/16798 [18:27<55:08,  3.77it/s]

{'loss': 1.3531, 'grad_norm': 1.7856693267822266, 'learning_rate': 0.0001487014534191089, 'epoch': 0.26}


 26%|██▌       | 4317/16798 [18:27<55:08,  3.77it/s]

{'loss': 1.9191, 'grad_norm': 2.0562751293182373, 'learning_rate': 0.0001486895401477246, 'epoch': 0.26}


 26%|██▌       | 4318/16798 [18:27<55:44,  3.73it/s]

{'loss': 1.702, 'grad_norm': 1.959450125694275, 'learning_rate': 0.00014867762687634025, 'epoch': 0.26}


 26%|██▌       | 4319/16798 [18:28<58:30,  3.55it/s]

{'loss': 1.5855, 'grad_norm': 2.0901238918304443, 'learning_rate': 0.00014866571360495594, 'epoch': 0.26}


 26%|██▌       | 4320/16798 [18:28<1:01:22,  3.39it/s]

{'loss': 2.1221, 'grad_norm': 2.398862600326538, 'learning_rate': 0.0001486538003335716, 'epoch': 0.26}


 26%|██▌       | 4321/16798 [18:28<58:01,  3.58it/s]  

{'loss': 1.0733, 'grad_norm': 1.5792415142059326, 'learning_rate': 0.0001486418870621873, 'epoch': 0.26}


 26%|██▌       | 4322/16798 [18:28<54:31,  3.81it/s]

{'loss': 1.7959, 'grad_norm': 1.9108333587646484, 'learning_rate': 0.00014862997379080296, 'epoch': 0.26}


 26%|██▌       | 4323/16798 [18:29<58:22,  3.56it/s]

{'loss': 2.0063, 'grad_norm': 1.8501722812652588, 'learning_rate': 0.00014861806051941865, 'epoch': 0.26}


 26%|██▌       | 4324/16798 [18:29<57:25,  3.62it/s]

{'loss': 2.3997, 'grad_norm': 2.39326810836792, 'learning_rate': 0.00014860614724803432, 'epoch': 0.26}


 26%|██▌       | 4325/16798 [18:29<1:00:51,  3.42it/s]

{'loss': 1.4772, 'grad_norm': 1.7882336378097534, 'learning_rate': 0.00014859423397665, 'epoch': 0.26}


 26%|██▌       | 4326/16798 [18:29<56:57,  3.65it/s]  

{'loss': 2.1552, 'grad_norm': 2.2940073013305664, 'learning_rate': 0.00014858232070526567, 'epoch': 0.26}


 26%|██▌       | 4327/16798 [18:30<1:00:06,  3.46it/s]

{'loss': 1.6653, 'grad_norm': 1.9483389854431152, 'learning_rate': 0.00014857040743388136, 'epoch': 0.26}


 26%|██▌       | 4328/16798 [18:30<1:01:10,  3.40it/s]

{'loss': 1.8218, 'grad_norm': 2.057288646697998, 'learning_rate': 0.00014855849416249702, 'epoch': 0.26}


 26%|██▌       | 4329/16798 [18:30<1:01:28,  3.38it/s]

{'loss': 1.8968, 'grad_norm': 1.9378973245620728, 'learning_rate': 0.00014854658089111271, 'epoch': 0.26}


 26%|██▌       | 4330/16798 [18:31<1:03:59,  3.25it/s]

{'loss': 1.5848, 'grad_norm': 2.3388824462890625, 'learning_rate': 0.00014853466761972838, 'epoch': 0.26}


 26%|██▌       | 4331/16798 [18:31<1:03:36,  3.27it/s]

{'loss': 1.225, 'grad_norm': 1.5345733165740967, 'learning_rate': 0.00014852275434834407, 'epoch': 0.26}


 26%|██▌       | 4332/16798 [18:31<1:00:02,  3.46it/s]

{'loss': 1.5545, 'grad_norm': 1.8495625257492065, 'learning_rate': 0.00014851084107695973, 'epoch': 0.26}


 26%|██▌       | 4333/16798 [18:32<1:01:57,  3.35it/s]

{'loss': 1.4654, 'grad_norm': 1.8674582242965698, 'learning_rate': 0.00014849892780557542, 'epoch': 0.26}


 26%|██▌       | 4334/16798 [18:32<57:36,  3.61it/s]  

{'loss': 1.826, 'grad_norm': 2.2469754219055176, 'learning_rate': 0.0001484870145341911, 'epoch': 0.26}


 26%|██▌       | 4335/16798 [18:32<57:12,  3.63it/s]

{'loss': 1.4907, 'grad_norm': 2.136514663696289, 'learning_rate': 0.00014847510126280678, 'epoch': 0.26}


 26%|██▌       | 4336/16798 [18:32<56:35,  3.67it/s]

{'loss': 1.3042, 'grad_norm': 1.6239086389541626, 'learning_rate': 0.00014846318799142244, 'epoch': 0.26}


 26%|██▌       | 4337/16798 [18:33<1:00:52,  3.41it/s]

{'loss': 1.7042, 'grad_norm': 2.01361346244812, 'learning_rate': 0.00014845127472003813, 'epoch': 0.26}


 26%|██▌       | 4338/16798 [18:33<56:36,  3.67it/s]  

{'loss': 1.2597, 'grad_norm': 1.702440857887268, 'learning_rate': 0.0001484393614486538, 'epoch': 0.26}


 26%|██▌       | 4339/16798 [18:33<58:04,  3.58it/s]

{'loss': 1.3357, 'grad_norm': 1.877524733543396, 'learning_rate': 0.0001484274481772695, 'epoch': 0.26}


 26%|██▌       | 4340/16798 [18:33<55:17,  3.76it/s]

{'loss': 1.2946, 'grad_norm': 1.7097623348236084, 'learning_rate': 0.00014841553490588515, 'epoch': 0.26}


 26%|██▌       | 4341/16798 [18:34<56:23,  3.68it/s]

{'loss': 1.1377, 'grad_norm': 1.7344807386398315, 'learning_rate': 0.00014840362163450084, 'epoch': 0.26}


 26%|██▌       | 4342/16798 [18:34<58:59,  3.52it/s]

{'loss': 1.1439, 'grad_norm': 2.1547083854675293, 'learning_rate': 0.0001483917083631165, 'epoch': 0.26}


 26%|██▌       | 4343/16798 [18:34<1:01:18,  3.39it/s]

{'loss': 1.0434, 'grad_norm': 1.6495661735534668, 'learning_rate': 0.0001483797950917322, 'epoch': 0.26}


 26%|██▌       | 4344/16798 [18:35<59:19,  3.50it/s]  

{'loss': 0.7443, 'grad_norm': 1.4402238130569458, 'learning_rate': 0.00014836788182034786, 'epoch': 0.26}


 26%|██▌       | 4345/16798 [18:35<59:33,  3.48it/s]

{'loss': 1.0312, 'grad_norm': 1.6521389484405518, 'learning_rate': 0.00014835596854896355, 'epoch': 0.26}


 26%|██▌       | 4346/16798 [18:35<57:48,  3.59it/s]

{'loss': 0.6146, 'grad_norm': 1.1132493019104004, 'learning_rate': 0.00014834405527757924, 'epoch': 0.26}


 26%|██▌       | 4347/16798 [18:36<1:02:00,  3.35it/s]

{'loss': 0.5007, 'grad_norm': 1.283492922782898, 'learning_rate': 0.00014833214200619493, 'epoch': 0.26}


 26%|██▌       | 4348/16798 [18:36<59:04,  3.51it/s]  

{'loss': 0.2759, 'grad_norm': 0.6938432455062866, 'learning_rate': 0.0001483202287348106, 'epoch': 0.26}


 26%|██▌       | 4350/16798 [18:36<55:28,  3.74it/s]  

{'loss': 0.3189, 'grad_norm': 0.9159142971038818, 'learning_rate': 0.00014830831546342629, 'epoch': 0.26}


 26%|██▌       | 4350/16798 [18:36<55:28,  3.74it/s]

{'loss': 0.3721, 'grad_norm': 0.853748619556427, 'learning_rate': 0.00014829640219204195, 'epoch': 0.26}


 26%|██▌       | 4351/16798 [18:37<56:49,  3.65it/s]

{'loss': 2.1622, 'grad_norm': 1.9783368110656738, 'learning_rate': 0.00014828448892065761, 'epoch': 0.26}


 26%|██▌       | 4352/16798 [18:37<56:30,  3.67it/s]

{'loss': 1.6639, 'grad_norm': 1.780035376548767, 'learning_rate': 0.0001482725756492733, 'epoch': 0.26}


 26%|██▌       | 4353/16798 [18:37<58:40,  3.54it/s]

{'loss': 2.3761, 'grad_norm': 1.975785255432129, 'learning_rate': 0.00014826066237788897, 'epoch': 0.26}


 26%|██▌       | 4354/16798 [18:37<57:15,  3.62it/s]

{'loss': 2.1985, 'grad_norm': 2.022979497909546, 'learning_rate': 0.00014824874910650466, 'epoch': 0.26}


 26%|██▌       | 4355/16798 [18:38<58:56,  3.52it/s]

{'loss': 1.6836, 'grad_norm': 1.9631462097167969, 'learning_rate': 0.00014823683583512032, 'epoch': 0.26}


 26%|██▌       | 4356/16798 [18:38<56:30,  3.67it/s]

{'loss': 1.6282, 'grad_norm': 1.7738733291625977, 'learning_rate': 0.000148224922563736, 'epoch': 0.26}


 26%|██▌       | 4357/16798 [18:38<59:28,  3.49it/s]

{'loss': 2.0981, 'grad_norm': 2.4248311519622803, 'learning_rate': 0.00014821300929235168, 'epoch': 0.26}


 26%|██▌       | 4358/16798 [18:39<55:20,  3.75it/s]

{'loss': 1.4366, 'grad_norm': 1.9937423467636108, 'learning_rate': 0.00014820109602096737, 'epoch': 0.26}


 26%|██▌       | 4359/16798 [18:39<57:52,  3.58it/s]

{'loss': 1.6017, 'grad_norm': 2.0639376640319824, 'learning_rate': 0.00014818918274958303, 'epoch': 0.26}


 26%|██▌       | 4360/16798 [18:39<54:59,  3.77it/s]

{'loss': 1.5862, 'grad_norm': 1.7257503271102905, 'learning_rate': 0.00014817726947819872, 'epoch': 0.26}


 26%|██▌       | 4361/16798 [18:39<59:56,  3.46it/s]

{'loss': 1.3711, 'grad_norm': 1.6484404802322388, 'learning_rate': 0.00014816535620681439, 'epoch': 0.26}


 26%|██▌       | 4362/16798 [18:40<55:32,  3.73it/s]

{'loss': 1.7403, 'grad_norm': 2.011059522628784, 'learning_rate': 0.00014815344293543008, 'epoch': 0.26}


 26%|██▌       | 4363/16798 [18:40<55:11,  3.76it/s]

{'loss': 1.1908, 'grad_norm': 2.048600196838379, 'learning_rate': 0.00014814152966404574, 'epoch': 0.26}


 26%|██▌       | 4364/16798 [18:40<1:00:14,  3.44it/s]

{'loss': 1.6199, 'grad_norm': 2.1381051540374756, 'learning_rate': 0.00014812961639266143, 'epoch': 0.26}


 26%|██▌       | 4365/16798 [18:40<57:34,  3.60it/s]  

{'loss': 1.7213, 'grad_norm': 1.9749866724014282, 'learning_rate': 0.0001481177031212771, 'epoch': 0.26}


 26%|██▌       | 4366/16798 [18:41<1:00:23,  3.43it/s]

{'loss': 1.774, 'grad_norm': 2.0965116024017334, 'learning_rate': 0.00014810578984989279, 'epoch': 0.26}


 26%|██▌       | 4367/16798 [18:41<59:59,  3.45it/s]  

{'loss': 1.3379, 'grad_norm': 1.784224033355713, 'learning_rate': 0.00014809387657850845, 'epoch': 0.26}


 26%|██▌       | 4368/16798 [18:41<1:00:26,  3.43it/s]

{'loss': 1.5304, 'grad_norm': 1.9362008571624756, 'learning_rate': 0.00014808196330712414, 'epoch': 0.26}


 26%|██▌       | 4369/16798 [18:42<58:04,  3.57it/s]  

{'loss': 1.3953, 'grad_norm': 1.6987186670303345, 'learning_rate': 0.0001480700500357398, 'epoch': 0.26}


 26%|██▌       | 4370/16798 [18:42<1:01:08,  3.39it/s]

{'loss': 1.8816, 'grad_norm': 2.0529749393463135, 'learning_rate': 0.0001480581367643555, 'epoch': 0.26}


 26%|██▌       | 4371/16798 [18:42<1:00:37,  3.42it/s]

{'loss': 1.4839, 'grad_norm': 1.654943585395813, 'learning_rate': 0.00014804622349297116, 'epoch': 0.26}


 26%|██▌       | 4372/16798 [18:43<59:02,  3.51it/s]  

{'loss': 1.539, 'grad_norm': 1.7561970949172974, 'learning_rate': 0.00014803431022158685, 'epoch': 0.26}


 26%|██▌       | 4373/16798 [18:43<58:01,  3.57it/s]

{'loss': 1.2767, 'grad_norm': 1.8522292375564575, 'learning_rate': 0.0001480223969502025, 'epoch': 0.26}


 26%|██▌       | 4374/16798 [18:43<56:56,  3.64it/s]

{'loss': 1.4622, 'grad_norm': 1.9753144979476929, 'learning_rate': 0.0001480104836788182, 'epoch': 0.26}


 26%|██▌       | 4375/16798 [18:43<56:06,  3.69it/s]

{'loss': 1.3416, 'grad_norm': 1.738995909690857, 'learning_rate': 0.00014799857040743387, 'epoch': 0.26}


 26%|██▌       | 4376/16798 [18:44<53:38,  3.86it/s]

{'loss': 1.2533, 'grad_norm': 1.6581144332885742, 'learning_rate': 0.00014798665713604956, 'epoch': 0.26}


 26%|██▌       | 4377/16798 [18:44<58:28,  3.54it/s]

{'loss': 1.4715, 'grad_norm': 2.0054335594177246, 'learning_rate': 0.00014797474386466525, 'epoch': 0.26}


 26%|██▌       | 4378/16798 [18:44<56:12,  3.68it/s]

{'loss': 1.0798, 'grad_norm': 2.0088560581207275, 'learning_rate': 0.00014796283059328094, 'epoch': 0.26}


 26%|██▌       | 4379/16798 [18:44<58:15,  3.55it/s]

{'loss': 1.5864, 'grad_norm': 1.9424341917037964, 'learning_rate': 0.0001479509173218966, 'epoch': 0.26}


 26%|██▌       | 4380/16798 [18:45<56:45,  3.65it/s]

{'loss': 1.6232, 'grad_norm': 2.3216164112091064, 'learning_rate': 0.0001479390040505123, 'epoch': 0.26}


 26%|██▌       | 4381/16798 [18:45<57:30,  3.60it/s]

{'loss': 1.8011, 'grad_norm': 2.108402967453003, 'learning_rate': 0.00014792709077912796, 'epoch': 0.26}


 26%|██▌       | 4382/16798 [18:45<56:11,  3.68it/s]

{'loss': 1.4774, 'grad_norm': 1.854728102684021, 'learning_rate': 0.00014791517750774365, 'epoch': 0.26}


 26%|██▌       | 4383/16798 [18:46<1:01:55,  3.34it/s]

{'loss': 1.3909, 'grad_norm': 2.042893171310425, 'learning_rate': 0.0001479032642363593, 'epoch': 0.26}


 26%|██▌       | 4384/16798 [18:46<56:22,  3.67it/s]  

{'loss': 1.3316, 'grad_norm': 2.966171979904175, 'learning_rate': 0.000147891350964975, 'epoch': 0.26}


 26%|██▌       | 4385/16798 [18:46<59:05,  3.50it/s]

{'loss': 1.4553, 'grad_norm': 1.8591803312301636, 'learning_rate': 0.00014787943769359067, 'epoch': 0.26}


 26%|██▌       | 4386/16798 [18:46<57:47,  3.58it/s]

{'loss': 1.2918, 'grad_norm': 1.756896734237671, 'learning_rate': 0.00014786752442220636, 'epoch': 0.26}


 26%|██▌       | 4387/16798 [18:47<58:01,  3.57it/s]

{'loss': 1.3987, 'grad_norm': 1.9426770210266113, 'learning_rate': 0.00014785561115082202, 'epoch': 0.26}


 26%|██▌       | 4388/16798 [18:47<57:09,  3.62it/s]

{'loss': 0.8979, 'grad_norm': 1.5938416719436646, 'learning_rate': 0.0001478436978794377, 'epoch': 0.26}


 26%|██▌       | 4389/16798 [18:47<57:05,  3.62it/s]

{'loss': 1.2159, 'grad_norm': 2.0848655700683594, 'learning_rate': 0.00014783178460805337, 'epoch': 0.26}


 26%|██▌       | 4390/16798 [18:47<56:44,  3.64it/s]

{'loss': 1.0358, 'grad_norm': 1.7276965379714966, 'learning_rate': 0.00014781987133666907, 'epoch': 0.26}


 26%|██▌       | 4391/16798 [18:48<58:52,  3.51it/s]

{'loss': 1.1425, 'grad_norm': 1.7569071054458618, 'learning_rate': 0.00014780795806528473, 'epoch': 0.26}


 26%|██▌       | 4392/16798 [18:48<58:10,  3.55it/s]

{'loss': 0.9107, 'grad_norm': 1.5668282508850098, 'learning_rate': 0.00014779604479390042, 'epoch': 0.26}


 26%|██▌       | 4393/16798 [18:48<59:01,  3.50it/s]

{'loss': 0.833, 'grad_norm': 1.4931071996688843, 'learning_rate': 0.00014778413152251608, 'epoch': 0.26}


 26%|██▌       | 4394/16798 [18:49<55:11,  3.75it/s]

{'loss': 0.7707, 'grad_norm': 1.491722583770752, 'learning_rate': 0.00014777221825113177, 'epoch': 0.26}


 26%|██▌       | 4395/16798 [18:49<57:29,  3.60it/s]

{'loss': 0.4831, 'grad_norm': 0.9922774434089661, 'learning_rate': 0.00014776030497974744, 'epoch': 0.26}


 26%|██▌       | 4396/16798 [18:49<54:22,  3.80it/s]

{'loss': 0.3345, 'grad_norm': 0.8281407356262207, 'learning_rate': 0.00014774839170836313, 'epoch': 0.26}


 26%|██▌       | 4397/16798 [18:49<1:00:08,  3.44it/s]

{'loss': 0.3391, 'grad_norm': 0.9297314286231995, 'learning_rate': 0.0001477364784369788, 'epoch': 0.26}


 26%|██▌       | 4398/16798 [18:50<58:35,  3.53it/s]  

{'loss': 0.2141, 'grad_norm': 0.6159545183181763, 'learning_rate': 0.00014772456516559448, 'epoch': 0.26}


 26%|██▌       | 4399/16798 [18:50<1:00:28,  3.42it/s]

{'loss': 0.3988, 'grad_norm': 0.9958189129829407, 'learning_rate': 0.00014771265189421015, 'epoch': 0.26}


 26%|██▌       | 4400/16798 [18:50<58:00,  3.56it/s]  

{'loss': 0.2893, 'grad_norm': 0.9823421835899353, 'learning_rate': 0.00014770073862282584, 'epoch': 0.26}


 26%|██▌       | 4401/16798 [18:51<1:03:32,  3.25it/s]

{'loss': 1.7262, 'grad_norm': 1.583822250366211, 'learning_rate': 0.0001476888253514415, 'epoch': 0.26}


 26%|██▌       | 4402/16798 [18:51<59:00,  3.50it/s]  

{'loss': 2.0128, 'grad_norm': 1.885391354560852, 'learning_rate': 0.0001476769120800572, 'epoch': 0.26}


 26%|██▌       | 4403/16798 [18:51<57:43,  3.58it/s]

{'loss': 1.6686, 'grad_norm': 1.709067702293396, 'learning_rate': 0.00014766499880867286, 'epoch': 0.26}


 26%|██▌       | 4404/16798 [18:51<54:35,  3.78it/s]

{'loss': 2.0809, 'grad_norm': 2.174806833267212, 'learning_rate': 0.00014765308553728855, 'epoch': 0.26}


 26%|██▌       | 4405/16798 [18:52<57:30,  3.59it/s]

{'loss': 2.1172, 'grad_norm': 2.10221266746521, 'learning_rate': 0.0001476411722659042, 'epoch': 0.26}


 26%|██▌       | 4406/16798 [18:52<54:31,  3.79it/s]

{'loss': 2.0056, 'grad_norm': 2.1830687522888184, 'learning_rate': 0.0001476292589945199, 'epoch': 0.26}


 26%|██▌       | 4407/16798 [18:52<56:41,  3.64it/s]

{'loss': 1.5773, 'grad_norm': 1.9788718223571777, 'learning_rate': 0.00014761734572313556, 'epoch': 0.26}


 26%|██▌       | 4408/16798 [18:53<56:03,  3.68it/s]

{'loss': 1.6673, 'grad_norm': 2.2611947059631348, 'learning_rate': 0.00014760543245175126, 'epoch': 0.26}


 26%|██▌       | 4409/16798 [18:53<1:00:53,  3.39it/s]

{'loss': 2.1044, 'grad_norm': 1.9686026573181152, 'learning_rate': 0.00014759351918036695, 'epoch': 0.26}


 26%|██▋       | 4410/16798 [18:53<56:32,  3.65it/s]  

{'loss': 1.6393, 'grad_norm': 2.078855276107788, 'learning_rate': 0.0001475816059089826, 'epoch': 0.26}


 26%|██▋       | 4411/16798 [18:53<58:58,  3.50it/s]

{'loss': 1.5398, 'grad_norm': 1.8062604665756226, 'learning_rate': 0.0001475696926375983, 'epoch': 0.26}


 26%|██▋       | 4412/16798 [18:54<56:09,  3.68it/s]

{'loss': 1.4115, 'grad_norm': 1.6356914043426514, 'learning_rate': 0.00014755777936621396, 'epoch': 0.26}


 26%|██▋       | 4413/16798 [18:54<57:32,  3.59it/s]

{'loss': 1.6912, 'grad_norm': 1.974246859550476, 'learning_rate': 0.00014754586609482965, 'epoch': 0.26}


 26%|██▋       | 4414/16798 [18:54<1:01:09,  3.38it/s]

{'loss': 1.8346, 'grad_norm': 1.9659768342971802, 'learning_rate': 0.00014753395282344532, 'epoch': 0.26}


 26%|██▋       | 4415/16798 [18:55<1:02:28,  3.30it/s]

{'loss': 2.0165, 'grad_norm': 2.267686367034912, 'learning_rate': 0.000147522039552061, 'epoch': 0.26}


 26%|██▋       | 4416/16798 [18:55<59:03,  3.49it/s]  

{'loss': 1.9279, 'grad_norm': 2.116328716278076, 'learning_rate': 0.00014751012628067667, 'epoch': 0.26}


 26%|██▋       | 4417/16798 [18:55<1:01:01,  3.38it/s]

{'loss': 1.9559, 'grad_norm': 2.6562037467956543, 'learning_rate': 0.00014749821300929236, 'epoch': 0.26}


 26%|██▋       | 4418/16798 [18:55<56:46,  3.63it/s]  

{'loss': 1.5433, 'grad_norm': 1.7924922704696655, 'learning_rate': 0.00014748629973790803, 'epoch': 0.26}


 26%|██▋       | 4419/16798 [18:56<59:27,  3.47it/s]

{'loss': 1.5148, 'grad_norm': 2.25539493560791, 'learning_rate': 0.00014747438646652372, 'epoch': 0.26}


 26%|██▋       | 4420/16798 [18:56<59:37,  3.46it/s]

{'loss': 1.5424, 'grad_norm': 2.0305562019348145, 'learning_rate': 0.00014746247319513938, 'epoch': 0.26}


 26%|██▋       | 4422/16798 [18:57<56:21,  3.66it/s]  

{'loss': 1.425, 'grad_norm': 1.7491942644119263, 'learning_rate': 0.00014745055992375507, 'epoch': 0.26}


 26%|██▋       | 4422/16798 [18:57<56:21,  3.66it/s]

{'loss': 1.2103, 'grad_norm': 1.6975682973861694, 'learning_rate': 0.00014743864665237074, 'epoch': 0.26}


 26%|██▋       | 4423/16798 [18:57<54:02,  3.82it/s]

{'loss': 1.6011, 'grad_norm': 1.886826753616333, 'learning_rate': 0.00014742673338098643, 'epoch': 0.26}


 26%|██▋       | 4424/16798 [18:57<56:10,  3.67it/s]

{'loss': 1.5902, 'grad_norm': 1.8949828147888184, 'learning_rate': 0.0001474148201096021, 'epoch': 0.26}


 26%|██▋       | 4425/16798 [18:57<53:13,  3.87it/s]

{'loss': 1.4152, 'grad_norm': 1.9577645063400269, 'learning_rate': 0.00014740290683821778, 'epoch': 0.26}


 26%|██▋       | 4426/16798 [18:58<59:57,  3.44it/s]

{'loss': 1.6573, 'grad_norm': 1.8596340417861938, 'learning_rate': 0.00014739099356683345, 'epoch': 0.26}


 26%|██▋       | 4427/16798 [18:58<56:26,  3.65it/s]

{'loss': 1.5427, 'grad_norm': 1.6975834369659424, 'learning_rate': 0.00014737908029544914, 'epoch': 0.26}


 26%|██▋       | 4428/16798 [18:58<58:34,  3.52it/s]

{'loss': 1.7937, 'grad_norm': 2.1888277530670166, 'learning_rate': 0.0001473671670240648, 'epoch': 0.26}


 26%|██▋       | 4429/16798 [18:58<54:23,  3.79it/s]

{'loss': 1.5136, 'grad_norm': 2.0283350944519043, 'learning_rate': 0.0001473552537526805, 'epoch': 0.26}


 26%|██▋       | 4430/16798 [18:59<59:57,  3.44it/s]

{'loss': 1.5474, 'grad_norm': 2.07303524017334, 'learning_rate': 0.00014734334048129615, 'epoch': 0.26}


 26%|██▋       | 4431/16798 [18:59<58:03,  3.55it/s]

{'loss': 1.4764, 'grad_norm': 2.269822835922241, 'learning_rate': 0.00014733142720991184, 'epoch': 0.26}


 26%|██▋       | 4432/16798 [18:59<59:29,  3.46it/s]

{'loss': 1.815, 'grad_norm': 2.2992441654205322, 'learning_rate': 0.0001473195139385275, 'epoch': 0.26}


 26%|██▋       | 4433/16798 [19:00<56:40,  3.64it/s]

{'loss': 1.4086, 'grad_norm': 1.7541272640228271, 'learning_rate': 0.0001473076006671432, 'epoch': 0.26}


 26%|██▋       | 4434/16798 [19:00<59:56,  3.44it/s]

{'loss': 1.8345, 'grad_norm': 2.105689525604248, 'learning_rate': 0.00014729568739575886, 'epoch': 0.26}


 26%|██▋       | 4435/16798 [19:00<1:01:38,  3.34it/s]

{'loss': 1.8354, 'grad_norm': 2.212400436401367, 'learning_rate': 0.00014728377412437455, 'epoch': 0.26}


 26%|██▋       | 4436/16798 [19:01<1:02:59,  3.27it/s]

{'loss': 1.4228, 'grad_norm': 1.824629783630371, 'learning_rate': 0.00014727186085299022, 'epoch': 0.26}


 26%|██▋       | 4437/16798 [19:01<58:33,  3.52it/s]  

{'loss': 1.1853, 'grad_norm': 1.530784249305725, 'learning_rate': 0.0001472599475816059, 'epoch': 0.26}


 26%|██▋       | 4438/16798 [19:01<59:45,  3.45it/s]

{'loss': 1.3999, 'grad_norm': 1.8341078758239746, 'learning_rate': 0.00014724803431022157, 'epoch': 0.26}


 26%|██▋       | 4439/16798 [19:01<55:10,  3.73it/s]

{'loss': 1.2075, 'grad_norm': 1.8170714378356934, 'learning_rate': 0.0001472361210388373, 'epoch': 0.26}


 26%|██▋       | 4440/16798 [19:02<55:21,  3.72it/s]

{'loss': 0.9979, 'grad_norm': 1.6024309396743774, 'learning_rate': 0.00014722420776745295, 'epoch': 0.26}


 26%|██▋       | 4441/16798 [19:02<55:42,  3.70it/s]

{'loss': 1.1419, 'grad_norm': 1.688301920890808, 'learning_rate': 0.00014721229449606864, 'epoch': 0.26}


 26%|██▋       | 4442/16798 [19:02<52:47,  3.90it/s]

{'loss': 1.0093, 'grad_norm': 1.7117873430252075, 'learning_rate': 0.0001472003812246843, 'epoch': 0.26}


 26%|██▋       | 4443/16798 [19:02<54:55,  3.75it/s]

{'loss': 1.2924, 'grad_norm': 1.7652010917663574, 'learning_rate': 0.0001471884679533, 'epoch': 0.26}


 26%|██▋       | 4444/16798 [19:03<55:28,  3.71it/s]

{'loss': 1.2715, 'grad_norm': 1.8689391613006592, 'learning_rate': 0.00014717655468191566, 'epoch': 0.26}


 26%|██▋       | 4445/16798 [19:03<58:57,  3.49it/s]

{'loss': 1.1609, 'grad_norm': 1.8631982803344727, 'learning_rate': 0.00014716464141053135, 'epoch': 0.26}


 26%|██▋       | 4446/16798 [19:03<58:15,  3.53it/s]

{'loss': 0.9347, 'grad_norm': 1.5965181589126587, 'learning_rate': 0.00014715272813914702, 'epoch': 0.26}


 26%|██▋       | 4447/16798 [19:04<1:01:00,  3.37it/s]

{'loss': 0.3172, 'grad_norm': 0.7921452522277832, 'learning_rate': 0.0001471408148677627, 'epoch': 0.26}


 26%|██▋       | 4448/16798 [19:04<56:16,  3.66it/s]  

{'loss': 0.6777, 'grad_norm': 1.219464659690857, 'learning_rate': 0.00014712890159637837, 'epoch': 0.26}


 26%|██▋       | 4449/16798 [19:04<57:48,  3.56it/s]

{'loss': 0.1326, 'grad_norm': 0.44212502241134644, 'learning_rate': 0.00014711698832499406, 'epoch': 0.26}


 26%|██▋       | 4450/16798 [19:04<53:44,  3.83it/s]

{'loss': 0.391, 'grad_norm': 0.9540313482284546, 'learning_rate': 0.00014710507505360973, 'epoch': 0.26}


 26%|██▋       | 4451/16798 [19:05<53:04,  3.88it/s]

{'loss': 1.8566, 'grad_norm': 1.7528587579727173, 'learning_rate': 0.00014709316178222542, 'epoch': 0.26}


 27%|██▋       | 4452/16798 [19:05<56:25,  3.65it/s]

{'loss': 1.7549, 'grad_norm': 1.8090717792510986, 'learning_rate': 0.00014708124851084108, 'epoch': 0.27}


 27%|██▋       | 4453/16798 [19:05<57:07,  3.60it/s]

{'loss': 1.8376, 'grad_norm': 2.106407403945923, 'learning_rate': 0.00014706933523945677, 'epoch': 0.27}


 27%|██▋       | 4454/16798 [19:05<57:06,  3.60it/s]

{'loss': 2.1187, 'grad_norm': 1.9782538414001465, 'learning_rate': 0.00014705742196807243, 'epoch': 0.27}


 27%|██▋       | 4455/16798 [19:06<1:03:08,  3.26it/s]

{'loss': 2.1149, 'grad_norm': 1.8985464572906494, 'learning_rate': 0.00014704550869668812, 'epoch': 0.27}


 27%|██▋       | 4456/16798 [19:06<58:44,  3.50it/s]  

{'loss': 1.911, 'grad_norm': 1.9042668342590332, 'learning_rate': 0.0001470335954253038, 'epoch': 0.27}


 27%|██▋       | 4457/16798 [19:06<1:01:48,  3.33it/s]

{'loss': 1.9623, 'grad_norm': 1.8320505619049072, 'learning_rate': 0.00014702168215391948, 'epoch': 0.27}


 27%|██▋       | 4458/16798 [19:07<58:30,  3.52it/s]  

{'loss': 1.6974, 'grad_norm': 1.9290121793746948, 'learning_rate': 0.00014700976888253514, 'epoch': 0.27}


 27%|██▋       | 4459/16798 [19:07<59:51,  3.44it/s]

{'loss': 1.8594, 'grad_norm': 2.1402015686035156, 'learning_rate': 0.00014699785561115083, 'epoch': 0.27}


 27%|██▋       | 4460/16798 [19:07<57:24,  3.58it/s]

{'loss': 1.7697, 'grad_norm': 2.0075061321258545, 'learning_rate': 0.0001469859423397665, 'epoch': 0.27}


 27%|██▋       | 4461/16798 [19:07<59:29,  3.46it/s]

{'loss': 1.6002, 'grad_norm': 1.8592365980148315, 'learning_rate': 0.0001469740290683822, 'epoch': 0.27}


 27%|██▋       | 4462/16798 [19:08<56:57,  3.61it/s]

{'loss': 1.9022, 'grad_norm': 2.164576530456543, 'learning_rate': 0.00014696211579699785, 'epoch': 0.27}


 27%|██▋       | 4463/16798 [19:08<59:29,  3.46it/s]

{'loss': 1.6823, 'grad_norm': 1.7998628616333008, 'learning_rate': 0.00014695020252561354, 'epoch': 0.27}


 27%|██▋       | 4464/16798 [19:08<55:17,  3.72it/s]

{'loss': 1.7535, 'grad_norm': 1.9876649379730225, 'learning_rate': 0.0001469382892542292, 'epoch': 0.27}


 27%|██▋       | 4465/16798 [19:09<52:33,  3.91it/s]

{'loss': 1.7192, 'grad_norm': 1.9310214519500732, 'learning_rate': 0.0001469263759828449, 'epoch': 0.27}


 27%|██▋       | 4466/16798 [19:09<56:00,  3.67it/s]

{'loss': 1.6417, 'grad_norm': 1.791819453239441, 'learning_rate': 0.00014691446271146056, 'epoch': 0.27}


 27%|██▋       | 4467/16798 [19:09<53:53,  3.81it/s]

{'loss': 1.6498, 'grad_norm': 2.1015608310699463, 'learning_rate': 0.00014690254944007625, 'epoch': 0.27}


 27%|██▋       | 4468/16798 [19:09<57:02,  3.60it/s]

{'loss': 1.7268, 'grad_norm': 2.1250531673431396, 'learning_rate': 0.00014689063616869192, 'epoch': 0.27}


 27%|██▋       | 4469/16798 [19:10<54:06,  3.80it/s]

{'loss': 1.6835, 'grad_norm': 1.8622980117797852, 'learning_rate': 0.0001468787228973076, 'epoch': 0.27}


 27%|██▋       | 4470/16798 [19:10<56:05,  3.66it/s]

{'loss': 1.4418, 'grad_norm': 1.8429821729660034, 'learning_rate': 0.0001468668096259233, 'epoch': 0.27}


 27%|██▋       | 4471/16798 [19:10<56:16,  3.65it/s]

{'loss': 1.5598, 'grad_norm': 1.6313841342926025, 'learning_rate': 0.00014685489635453896, 'epoch': 0.27}


 27%|██▋       | 4472/16798 [19:10<53:57,  3.81it/s]

{'loss': 1.9791, 'grad_norm': 2.029782295227051, 'learning_rate': 0.00014684298308315465, 'epoch': 0.27}


 27%|██▋       | 4473/16798 [19:11<1:02:13,  3.30it/s]

{'loss': 2.2328, 'grad_norm': 2.512166738510132, 'learning_rate': 0.00014683106981177031, 'epoch': 0.27}


 27%|██▋       | 4474/16798 [19:11<59:08,  3.47it/s]  

{'loss': 1.4959, 'grad_norm': 1.9043854475021362, 'learning_rate': 0.000146819156540386, 'epoch': 0.27}


 27%|██▋       | 4475/16798 [19:11<55:41,  3.69it/s]

{'loss': 1.5168, 'grad_norm': 1.6778148412704468, 'learning_rate': 0.00014680724326900167, 'epoch': 0.27}


 27%|██▋       | 4476/16798 [19:12<58:23,  3.52it/s]

{'loss': 1.5559, 'grad_norm': 2.099447727203369, 'learning_rate': 0.00014679532999761736, 'epoch': 0.27}


 27%|██▋       | 4477/16798 [19:12<57:32,  3.57it/s]

{'loss': 2.1352, 'grad_norm': 2.341116189956665, 'learning_rate': 0.00014678341672623302, 'epoch': 0.27}


 27%|██▋       | 4478/16798 [19:12<57:07,  3.59it/s]

{'loss': 0.9696, 'grad_norm': 2.0079379081726074, 'learning_rate': 0.00014677150345484871, 'epoch': 0.27}


 27%|██▋       | 4479/16798 [19:12<54:51,  3.74it/s]

{'loss': 1.1638, 'grad_norm': 1.715814471244812, 'learning_rate': 0.00014675959018346438, 'epoch': 0.27}


 27%|██▋       | 4480/16798 [19:13<57:59,  3.54it/s]

{'loss': 1.065, 'grad_norm': 1.4296075105667114, 'learning_rate': 0.00014674767691208007, 'epoch': 0.27}


 27%|██▋       | 4481/16798 [19:13<53:26,  3.84it/s]

{'loss': 1.2952, 'grad_norm': 1.9493355751037598, 'learning_rate': 0.00014673576364069573, 'epoch': 0.27}


 27%|██▋       | 4482/16798 [19:13<50:53,  4.03it/s]

{'loss': 1.5805, 'grad_norm': 1.8163748979568481, 'learning_rate': 0.00014672385036931142, 'epoch': 0.27}


 27%|██▋       | 4483/16798 [19:13<54:40,  3.75it/s]

{'loss': 1.2974, 'grad_norm': 1.7034049034118652, 'learning_rate': 0.0001467119370979271, 'epoch': 0.27}


 27%|██▋       | 4484/16798 [19:14<51:34,  3.98it/s]

{'loss': 1.5214, 'grad_norm': 1.8478548526763916, 'learning_rate': 0.00014670002382654278, 'epoch': 0.27}


 27%|██▋       | 4485/16798 [19:14<55:14,  3.71it/s]

{'loss': 1.0251, 'grad_norm': 1.5888129472732544, 'learning_rate': 0.00014668811055515844, 'epoch': 0.27}


 27%|██▋       | 4486/16798 [19:14<52:51,  3.88it/s]

{'loss': 1.3413, 'grad_norm': 1.7948267459869385, 'learning_rate': 0.00014667619728377413, 'epoch': 0.27}


 27%|██▋       | 4487/16798 [19:14<52:28,  3.91it/s]

{'loss': 1.1082, 'grad_norm': 1.8324631452560425, 'learning_rate': 0.0001466642840123898, 'epoch': 0.27}


 27%|██▋       | 4488/16798 [19:15<57:57,  3.54it/s]

{'loss': 1.0159, 'grad_norm': 1.8353677988052368, 'learning_rate': 0.00014665237074100549, 'epoch': 0.27}


 27%|██▋       | 4489/16798 [19:15<57:53,  3.54it/s]

{'loss': 0.8726, 'grad_norm': 1.50320565700531, 'learning_rate': 0.00014664045746962115, 'epoch': 0.27}


 27%|██▋       | 4490/16798 [19:15<56:42,  3.62it/s]

{'loss': 0.9747, 'grad_norm': 1.6146528720855713, 'learning_rate': 0.00014662854419823684, 'epoch': 0.27}


 27%|██▋       | 4491/16798 [19:16<59:06,  3.47it/s]

{'loss': 1.0449, 'grad_norm': 1.8684630393981934, 'learning_rate': 0.0001466166309268525, 'epoch': 0.27}


 27%|██▋       | 4492/16798 [19:16<57:05,  3.59it/s]

{'loss': 1.1626, 'grad_norm': 1.8651654720306396, 'learning_rate': 0.0001466047176554682, 'epoch': 0.27}


 27%|██▋       | 4493/16798 [19:16<59:46,  3.43it/s]

{'loss': 1.0123, 'grad_norm': 1.582882046699524, 'learning_rate': 0.00014659280438408386, 'epoch': 0.27}


 27%|██▋       | 4494/16798 [19:17<58:38,  3.50it/s]

{'loss': 0.8979, 'grad_norm': 1.5274288654327393, 'learning_rate': 0.00014658089111269955, 'epoch': 0.27}


 27%|██▋       | 4495/16798 [19:17<1:00:51,  3.37it/s]

{'loss': 0.723, 'grad_norm': 1.452223300933838, 'learning_rate': 0.0001465689778413152, 'epoch': 0.27}


 27%|██▋       | 4496/16798 [19:17<59:14,  3.46it/s]  

{'loss': 0.6871, 'grad_norm': 1.4642966985702515, 'learning_rate': 0.0001465570645699309, 'epoch': 0.27}


 27%|██▋       | 4497/16798 [19:17<1:01:18,  3.34it/s]

{'loss': 0.5285, 'grad_norm': 1.1699130535125732, 'learning_rate': 0.00014654515129854657, 'epoch': 0.27}


 27%|██▋       | 4498/16798 [19:18<56:54,  3.60it/s]  

{'loss': 0.2474, 'grad_norm': 0.6673001050949097, 'learning_rate': 0.00014653323802716226, 'epoch': 0.27}


 27%|██▋       | 4499/16798 [19:18<1:00:10,  3.41it/s]

{'loss': 0.3589, 'grad_norm': 1.0277650356292725, 'learning_rate': 0.00014652132475577792, 'epoch': 0.27}




{'loss': 0.5563, 'grad_norm': 1.173362135887146, 'learning_rate': 0.0001465094114843936, 'epoch': 0.27}


 27%|██▋       | 4501/16798 [19:21<3:37:18,  1.06s/it]

{'loss': 1.864, 'grad_norm': 1.8419594764709473, 'learning_rate': 0.0001464974982130093, 'epoch': 0.27}


 27%|██▋       | 4502/16798 [19:21<2:48:04,  1.22it/s]

{'loss': 2.1461, 'grad_norm': 2.159970760345459, 'learning_rate': 0.000146485584941625, 'epoch': 0.27}


 27%|██▋       | 4503/16798 [19:22<2:16:52,  1.50it/s]

{'loss': 2.0398, 'grad_norm': 1.9637480974197388, 'learning_rate': 0.00014647367167024066, 'epoch': 0.27}


 27%|██▋       | 4504/16798 [19:22<1:54:32,  1.79it/s]

{'loss': 2.1589, 'grad_norm': 2.075758695602417, 'learning_rate': 0.00014646175839885635, 'epoch': 0.27}


 27%|██▋       | 4505/16798 [19:22<1:41:35,  2.02it/s]

{'loss': 2.128, 'grad_norm': 2.262843370437622, 'learning_rate': 0.000146449845127472, 'epoch': 0.27}


 27%|██▋       | 4506/16798 [19:23<1:29:09,  2.30it/s]

{'loss': 2.197, 'grad_norm': 2.3449342250823975, 'learning_rate': 0.0001464379318560877, 'epoch': 0.27}


 27%|██▋       | 4507/16798 [19:23<1:20:17,  2.55it/s]

{'loss': 1.7321, 'grad_norm': 2.1031315326690674, 'learning_rate': 0.00014642601858470337, 'epoch': 0.27}


 27%|██▋       | 4508/16798 [19:23<1:16:26,  2.68it/s]

{'loss': 2.0367, 'grad_norm': 2.200629472732544, 'learning_rate': 0.00014641410531331906, 'epoch': 0.27}


 27%|██▋       | 4509/16798 [19:24<1:11:26,  2.87it/s]

{'loss': 1.7833, 'grad_norm': 2.1547629833221436, 'learning_rate': 0.00014640219204193472, 'epoch': 0.27}


 27%|██▋       | 4510/16798 [19:24<1:05:22,  3.13it/s]

{'loss': 1.6403, 'grad_norm': 1.9461771249771118, 'learning_rate': 0.0001463902787705504, 'epoch': 0.27}


 27%|██▋       | 4511/16798 [19:24<1:06:21,  3.09it/s]

{'loss': 1.8538, 'grad_norm': 2.0476560592651367, 'learning_rate': 0.00014637836549916608, 'epoch': 0.27}


 27%|██▋       | 4512/16798 [19:24<1:04:26,  3.18it/s]

{'loss': 1.4971, 'grad_norm': 1.982853651046753, 'learning_rate': 0.00014636645222778177, 'epoch': 0.27}


 27%|██▋       | 4513/16798 [19:25<1:03:09,  3.24it/s]

{'loss': 1.8938, 'grad_norm': 2.300788164138794, 'learning_rate': 0.00014635453895639743, 'epoch': 0.27}


 27%|██▋       | 4514/16798 [19:25<57:42,  3.55it/s]  

{'loss': 1.4022, 'grad_norm': 1.7401267290115356, 'learning_rate': 0.00014634262568501312, 'epoch': 0.27}


 27%|██▋       | 4515/16798 [19:25<1:00:01,  3.41it/s]

{'loss': 1.4957, 'grad_norm': 1.9199539422988892, 'learning_rate': 0.00014633071241362878, 'epoch': 0.27}


 27%|██▋       | 4516/16798 [19:25<56:06,  3.65it/s]  

{'loss': 1.5974, 'grad_norm': 1.979013204574585, 'learning_rate': 0.00014631879914224448, 'epoch': 0.27}


 27%|██▋       | 4517/16798 [19:26<59:25,  3.44it/s]

{'loss': 1.885, 'grad_norm': 1.9583265781402588, 'learning_rate': 0.00014630688587086014, 'epoch': 0.27}


 27%|██▋       | 4518/16798 [19:26<56:18,  3.63it/s]

{'loss': 1.7414, 'grad_norm': 1.9695351123809814, 'learning_rate': 0.00014629497259947583, 'epoch': 0.27}


 27%|██▋       | 4519/16798 [19:26<58:52,  3.48it/s]

{'loss': 1.2945, 'grad_norm': 1.4518812894821167, 'learning_rate': 0.0001462830593280915, 'epoch': 0.27}


 27%|██▋       | 4520/16798 [19:27<56:22,  3.63it/s]

{'loss': 1.4414, 'grad_norm': 1.6208945512771606, 'learning_rate': 0.00014627114605670718, 'epoch': 0.27}


 27%|██▋       | 4521/16798 [19:27<58:01,  3.53it/s]

{'loss': 1.5488, 'grad_norm': 1.800543189048767, 'learning_rate': 0.00014625923278532285, 'epoch': 0.27}


 27%|██▋       | 4522/16798 [19:27<57:59,  3.53it/s]

{'loss': 2.031, 'grad_norm': 2.065868616104126, 'learning_rate': 0.00014624731951393854, 'epoch': 0.27}


 27%|██▋       | 4523/16798 [19:28<1:00:35,  3.38it/s]

{'loss': 1.4928, 'grad_norm': 2.324307680130005, 'learning_rate': 0.0001462354062425542, 'epoch': 0.27}


 27%|██▋       | 4524/16798 [19:28<59:13,  3.45it/s]  

{'loss': 1.8748, 'grad_norm': 2.2684640884399414, 'learning_rate': 0.0001462234929711699, 'epoch': 0.27}


 27%|██▋       | 4525/16798 [19:28<1:00:25,  3.39it/s]

{'loss': 1.6084, 'grad_norm': 1.9614341259002686, 'learning_rate': 0.00014621157969978556, 'epoch': 0.27}


 27%|██▋       | 4526/16798 [19:28<58:54,  3.47it/s]  

{'loss': 1.3796, 'grad_norm': 1.9039416313171387, 'learning_rate': 0.00014619966642840125, 'epoch': 0.27}


 27%|██▋       | 4527/16798 [19:29<1:01:17,  3.34it/s]

{'loss': 1.2425, 'grad_norm': 1.6933214664459229, 'learning_rate': 0.0001461877531570169, 'epoch': 0.27}


 27%|██▋       | 4528/16798 [19:29<56:29,  3.62it/s]  

{'loss': 1.5842, 'grad_norm': 1.6564782857894897, 'learning_rate': 0.0001461758398856326, 'epoch': 0.27}


 27%|██▋       | 4529/16798 [19:29<57:08,  3.58it/s]

{'loss': 1.3486, 'grad_norm': 1.8598120212554932, 'learning_rate': 0.00014616392661424827, 'epoch': 0.27}


 27%|██▋       | 4530/16798 [19:30<57:46,  3.54it/s]

{'loss': 1.776, 'grad_norm': 2.071720600128174, 'learning_rate': 0.00014615201334286396, 'epoch': 0.27}


 27%|██▋       | 4531/16798 [19:30<59:34,  3.43it/s]

{'loss': 1.2183, 'grad_norm': 1.7429935932159424, 'learning_rate': 0.00014614010007147962, 'epoch': 0.27}


 27%|██▋       | 4532/16798 [19:30<55:32,  3.68it/s]

{'loss': 1.5305, 'grad_norm': 1.7769684791564941, 'learning_rate': 0.0001461281868000953, 'epoch': 0.27}


 27%|██▋       | 4533/16798 [19:30<57:09,  3.58it/s]

{'loss': 1.5868, 'grad_norm': 1.8817145824432373, 'learning_rate': 0.000146116273528711, 'epoch': 0.27}


 27%|██▋       | 4534/16798 [19:31<58:48,  3.48it/s]

{'loss': 1.5845, 'grad_norm': 2.171614646911621, 'learning_rate': 0.00014610436025732667, 'epoch': 0.27}


 27%|██▋       | 4535/16798 [19:31<1:01:57,  3.30it/s]

{'loss': 1.5884, 'grad_norm': 1.9411340951919556, 'learning_rate': 0.00014609244698594236, 'epoch': 0.27}


 27%|██▋       | 4536/16798 [19:31<59:12,  3.45it/s]  

{'loss': 1.3326, 'grad_norm': 1.622633695602417, 'learning_rate': 0.00014608053371455802, 'epoch': 0.27}


 27%|██▋       | 4537/16798 [19:32<59:40,  3.42it/s]

{'loss': 1.4186, 'grad_norm': 2.2404797077178955, 'learning_rate': 0.0001460686204431737, 'epoch': 0.27}


 27%|██▋       | 4538/16798 [19:32<56:47,  3.60it/s]

{'loss': 1.0779, 'grad_norm': 1.4094375371932983, 'learning_rate': 0.00014605670717178937, 'epoch': 0.27}


 27%|██▋       | 4539/16798 [19:32<58:52,  3.47it/s]

{'loss': 1.8936, 'grad_norm': 2.769381284713745, 'learning_rate': 0.00014604479390040506, 'epoch': 0.27}


 27%|██▋       | 4540/16798 [19:32<57:25,  3.56it/s]

{'loss': 1.2909, 'grad_norm': 2.3180572986602783, 'learning_rate': 0.00014603288062902073, 'epoch': 0.27}


 27%|██▋       | 4541/16798 [19:33<1:00:44,  3.36it/s]

{'loss': 1.0827, 'grad_norm': 1.7454074621200562, 'learning_rate': 0.00014602096735763642, 'epoch': 0.27}


 27%|██▋       | 4542/16798 [19:33<58:07,  3.51it/s]  

{'loss': 1.3195, 'grad_norm': 1.9852133989334106, 'learning_rate': 0.00014600905408625208, 'epoch': 0.27}


 27%|██▋       | 4543/16798 [19:33<58:17,  3.50it/s]

{'loss': 1.619, 'grad_norm': 1.9506598711013794, 'learning_rate': 0.00014599714081486777, 'epoch': 0.27}


 27%|██▋       | 4544/16798 [19:34<58:30,  3.49it/s]

{'loss': 1.387, 'grad_norm': 2.061913013458252, 'learning_rate': 0.00014598522754348344, 'epoch': 0.27}


 27%|██▋       | 4545/16798 [19:34<58:34,  3.49it/s]

{'loss': 0.9027, 'grad_norm': 1.4290111064910889, 'learning_rate': 0.00014597331427209913, 'epoch': 0.27}


 27%|██▋       | 4546/16798 [19:34<57:12,  3.57it/s]

{'loss': 0.848, 'grad_norm': 1.4244987964630127, 'learning_rate': 0.0001459614010007148, 'epoch': 0.27}


 27%|██▋       | 4547/16798 [19:34<57:52,  3.53it/s]

{'loss': 1.1427, 'grad_norm': 3.249725341796875, 'learning_rate': 0.00014594948772933048, 'epoch': 0.27}


 27%|██▋       | 4548/16798 [19:35<59:56,  3.41it/s]

{'loss': 0.6654, 'grad_norm': 1.2303619384765625, 'learning_rate': 0.00014593757445794615, 'epoch': 0.27}


 27%|██▋       | 4549/16798 [19:35<1:00:11,  3.39it/s]

{'loss': 0.6702, 'grad_norm': 1.4222468137741089, 'learning_rate': 0.00014592566118656184, 'epoch': 0.27}


 27%|██▋       | 4550/16798 [19:35<57:01,  3.58it/s]  

{'loss': 0.6556, 'grad_norm': 1.3574635982513428, 'learning_rate': 0.0001459137479151775, 'epoch': 0.27}


 27%|██▋       | 4551/16798 [19:36<1:03:11,  3.23it/s]

{'loss': 2.031, 'grad_norm': 1.667181372642517, 'learning_rate': 0.0001459018346437932, 'epoch': 0.27}


 27%|██▋       | 4552/16798 [19:36<1:00:04,  3.40it/s]

{'loss': 1.7122, 'grad_norm': 1.7169148921966553, 'learning_rate': 0.00014588992137240885, 'epoch': 0.27}


 27%|██▋       | 4553/16798 [19:36<1:00:08,  3.39it/s]

{'loss': 2.1555, 'grad_norm': 2.033095359802246, 'learning_rate': 0.00014587800810102455, 'epoch': 0.27}


 27%|██▋       | 4554/16798 [19:36<1:00:13,  3.39it/s]

{'loss': 2.2692, 'grad_norm': 1.969774842262268, 'learning_rate': 0.0001458660948296402, 'epoch': 0.27}


 27%|██▋       | 4555/16798 [19:37<1:00:52,  3.35it/s]

{'loss': 1.9883, 'grad_norm': 1.9295049905776978, 'learning_rate': 0.0001458541815582559, 'epoch': 0.27}


 27%|██▋       | 4556/16798 [19:37<1:02:47,  3.25it/s]

{'loss': 2.3865, 'grad_norm': 2.2094924449920654, 'learning_rate': 0.00014584226828687156, 'epoch': 0.27}


 27%|██▋       | 4557/16798 [19:37<59:03,  3.45it/s]  

{'loss': 2.1257, 'grad_norm': 2.0584611892700195, 'learning_rate': 0.00014583035501548725, 'epoch': 0.27}


 27%|██▋       | 4558/16798 [19:38<56:25,  3.61it/s]

{'loss': 2.2109, 'grad_norm': 2.2432708740234375, 'learning_rate': 0.00014581844174410292, 'epoch': 0.27}


 27%|██▋       | 4559/16798 [19:38<58:04,  3.51it/s]

{'loss': 1.8049, 'grad_norm': 1.7613786458969116, 'learning_rate': 0.0001458065284727186, 'epoch': 0.27}


 27%|██▋       | 4560/16798 [19:38<57:38,  3.54it/s]

{'loss': 1.7551, 'grad_norm': 1.7505710124969482, 'learning_rate': 0.00014579461520133427, 'epoch': 0.27}


 27%|██▋       | 4561/16798 [19:38<57:54,  3.52it/s]

{'loss': 1.9109, 'grad_norm': 2.027488946914673, 'learning_rate': 0.00014578270192994996, 'epoch': 0.27}


 27%|██▋       | 4562/16798 [19:39<55:47,  3.66it/s]

{'loss': 1.9946, 'grad_norm': 2.1828207969665527, 'learning_rate': 0.00014577078865856565, 'epoch': 0.27}


 27%|██▋       | 4563/16798 [19:39<1:00:38,  3.36it/s]

{'loss': 1.4227, 'grad_norm': 1.5926798582077026, 'learning_rate': 0.00014575887538718134, 'epoch': 0.27}


 27%|██▋       | 4564/16798 [19:39<56:15,  3.62it/s]  

{'loss': 1.6188, 'grad_norm': 2.1444754600524902, 'learning_rate': 0.000145746962115797, 'epoch': 0.27}


 27%|██▋       | 4565/16798 [19:40<1:00:17,  3.38it/s]

{'loss': 1.4142, 'grad_norm': 1.8188765048980713, 'learning_rate': 0.0001457350488444127, 'epoch': 0.27}


 27%|██▋       | 4566/16798 [19:40<59:11,  3.44it/s]  

{'loss': 1.1668, 'grad_norm': 1.4632359743118286, 'learning_rate': 0.00014572313557302836, 'epoch': 0.27}


 27%|██▋       | 4567/16798 [19:40<1:02:04,  3.28it/s]

{'loss': 1.6399, 'grad_norm': 1.9631282091140747, 'learning_rate': 0.00014571122230164405, 'epoch': 0.27}


 27%|██▋       | 4568/16798 [19:40<57:04,  3.57it/s]  

{'loss': 1.3856, 'grad_norm': 1.7222665548324585, 'learning_rate': 0.00014569930903025972, 'epoch': 0.27}


 27%|██▋       | 4569/16798 [19:41<55:33,  3.67it/s]

{'loss': 1.685, 'grad_norm': 2.0519185066223145, 'learning_rate': 0.0001456873957588754, 'epoch': 0.27}


 27%|██▋       | 4570/16798 [19:41<56:53,  3.58it/s]

{'loss': 1.2617, 'grad_norm': 1.7887449264526367, 'learning_rate': 0.00014567548248749107, 'epoch': 0.27}


 27%|██▋       | 4571/16798 [19:41<58:08,  3.50it/s]

{'loss': 1.5123, 'grad_norm': 1.6839045286178589, 'learning_rate': 0.00014566356921610676, 'epoch': 0.27}


 27%|██▋       | 4572/16798 [19:42<1:00:49,  3.35it/s]

{'loss': 1.8939, 'grad_norm': 2.0098414421081543, 'learning_rate': 0.00014565165594472243, 'epoch': 0.27}


 27%|██▋       | 4573/16798 [19:42<1:01:53,  3.29it/s]

{'loss': 1.3328, 'grad_norm': 2.098334789276123, 'learning_rate': 0.00014563974267333812, 'epoch': 0.27}


 27%|██▋       | 4574/16798 [19:42<59:48,  3.41it/s]  

{'loss': 1.3242, 'grad_norm': 2.0281264781951904, 'learning_rate': 0.00014562782940195378, 'epoch': 0.27}


 27%|██▋       | 4575/16798 [19:43<59:47,  3.41it/s]

{'loss': 1.3612, 'grad_norm': 1.4792704582214355, 'learning_rate': 0.00014561591613056947, 'epoch': 0.27}


 27%|██▋       | 4576/16798 [19:43<1:01:26,  3.32it/s]

{'loss': 1.8501, 'grad_norm': 2.0525081157684326, 'learning_rate': 0.00014560400285918514, 'epoch': 0.27}


 27%|██▋       | 4577/16798 [19:43<58:32,  3.48it/s]  

{'loss': 1.5447, 'grad_norm': 1.9677729606628418, 'learning_rate': 0.00014559208958780083, 'epoch': 0.27}


 27%|██▋       | 4578/16798 [19:43<1:00:30,  3.37it/s]

{'loss': 1.5197, 'grad_norm': 1.888425350189209, 'learning_rate': 0.0001455801763164165, 'epoch': 0.27}


 27%|██▋       | 4579/16798 [19:44<58:54,  3.46it/s]  

{'loss': 2.1483, 'grad_norm': 2.6452794075012207, 'learning_rate': 0.00014556826304503218, 'epoch': 0.27}


 27%|██▋       | 4580/16798 [19:44<56:52,  3.58it/s]

{'loss': 1.751, 'grad_norm': 2.079786777496338, 'learning_rate': 0.00014555634977364784, 'epoch': 0.27}


 27%|██▋       | 4581/16798 [19:44<58:09,  3.50it/s]

{'loss': 1.3255, 'grad_norm': 1.8357782363891602, 'learning_rate': 0.00014554443650226353, 'epoch': 0.27}


 27%|██▋       | 4582/16798 [19:45<57:45,  3.53it/s]

{'loss': 1.6407, 'grad_norm': 2.205000400543213, 'learning_rate': 0.0001455325232308792, 'epoch': 0.27}


 27%|██▋       | 4583/16798 [19:45<59:28,  3.42it/s]

{'loss': 1.7763, 'grad_norm': 2.0344817638397217, 'learning_rate': 0.0001455206099594949, 'epoch': 0.27}


 27%|██▋       | 4584/16798 [19:45<59:00,  3.45it/s]

{'loss': 1.6016, 'grad_norm': 2.25618314743042, 'learning_rate': 0.00014550869668811055, 'epoch': 0.27}


 27%|██▋       | 4585/16798 [19:45<1:01:48,  3.29it/s]

{'loss': 1.1732, 'grad_norm': 2.5344278812408447, 'learning_rate': 0.00014549678341672624, 'epoch': 0.27}


 27%|██▋       | 4586/16798 [19:46<58:31,  3.48it/s]  

{'loss': 1.0368, 'grad_norm': 1.4951865673065186, 'learning_rate': 0.0001454848701453419, 'epoch': 0.27}


 27%|██▋       | 4587/16798 [19:46<59:31,  3.42it/s]

{'loss': 1.5638, 'grad_norm': 1.9772526025772095, 'learning_rate': 0.0001454729568739576, 'epoch': 0.27}


 27%|██▋       | 4588/16798 [19:46<58:42,  3.47it/s]

{'loss': 1.15, 'grad_norm': 3.362393617630005, 'learning_rate': 0.00014546104360257326, 'epoch': 0.27}


 27%|██▋       | 4589/16798 [19:47<1:01:36,  3.30it/s]

{'loss': 1.2827, 'grad_norm': 1.884572148323059, 'learning_rate': 0.00014544913033118895, 'epoch': 0.27}


 27%|██▋       | 4590/16798 [19:47<58:21,  3.49it/s]  

{'loss': 1.0751, 'grad_norm': 1.6211392879486084, 'learning_rate': 0.00014543721705980462, 'epoch': 0.27}


 27%|██▋       | 4591/16798 [19:47<57:39,  3.53it/s]

{'loss': 1.2016, 'grad_norm': 1.5525026321411133, 'learning_rate': 0.0001454253037884203, 'epoch': 0.27}


 27%|██▋       | 4592/16798 [19:47<56:37,  3.59it/s]

{'loss': 1.2926, 'grad_norm': 2.457550048828125, 'learning_rate': 0.00014541339051703597, 'epoch': 0.27}


 27%|██▋       | 4593/16798 [19:48<1:00:17,  3.37it/s]

{'loss': 1.4027, 'grad_norm': 2.1590285301208496, 'learning_rate': 0.00014540147724565166, 'epoch': 0.27}


 27%|██▋       | 4594/16798 [19:48<56:56,  3.57it/s]  

{'loss': 0.5691, 'grad_norm': 1.2260133028030396, 'learning_rate': 0.00014538956397426735, 'epoch': 0.27}


 27%|██▋       | 4595/16798 [19:48<57:06,  3.56it/s]

{'loss': 1.2601, 'grad_norm': 1.7845906019210815, 'learning_rate': 0.00014537765070288302, 'epoch': 0.27}


 27%|██▋       | 4596/16798 [19:49<58:31,  3.48it/s]

{'loss': 1.1787, 'grad_norm': 1.724036693572998, 'learning_rate': 0.0001453657374314987, 'epoch': 0.27}


 27%|██▋       | 4597/16798 [19:49<58:48,  3.46it/s]

{'loss': 0.7614, 'grad_norm': 1.450500249862671, 'learning_rate': 0.00014535382416011437, 'epoch': 0.27}


 27%|██▋       | 4598/16798 [19:49<55:29,  3.66it/s]

{'loss': 0.7807, 'grad_norm': 1.534793734550476, 'learning_rate': 0.00014534191088873006, 'epoch': 0.27}


 27%|██▋       | 4599/16798 [19:49<59:52,  3.40it/s]

{'loss': 0.3345, 'grad_norm': 0.8809393644332886, 'learning_rate': 0.00014532999761734572, 'epoch': 0.27}


 27%|██▋       | 4600/16798 [19:50<1:00:24,  3.37it/s]

{'loss': 1.0051, 'grad_norm': 1.8943711519241333, 'learning_rate': 0.00014531808434596142, 'epoch': 0.27}


 27%|██▋       | 4601/16798 [19:50<1:00:02,  3.39it/s]

{'loss': 1.8588, 'grad_norm': 1.7681031227111816, 'learning_rate': 0.00014530617107457708, 'epoch': 0.27}


 27%|██▋       | 4602/16798 [19:50<57:43,  3.52it/s]  

{'loss': 1.7404, 'grad_norm': 1.6782091856002808, 'learning_rate': 0.00014529425780319277, 'epoch': 0.27}


 27%|██▋       | 4604/16798 [19:51<57:51,  3.51it/s]  

{'loss': 1.87, 'grad_norm': 1.8178908824920654, 'learning_rate': 0.00014528234453180843, 'epoch': 0.27}


 27%|██▋       | 4604/16798 [19:51<57:51,  3.51it/s]

{'loss': 2.1524, 'grad_norm': 1.909609317779541, 'learning_rate': 0.00014527043126042412, 'epoch': 0.27}


 27%|██▋       | 4605/16798 [19:51<55:06,  3.69it/s]

{'loss': 1.8183, 'grad_norm': 1.866268277168274, 'learning_rate': 0.0001452585179890398, 'epoch': 0.27}


 27%|██▋       | 4606/16798 [19:52<59:28,  3.42it/s]

{'loss': 2.2127, 'grad_norm': 2.2990591526031494, 'learning_rate': 0.00014524660471765548, 'epoch': 0.27}


 27%|██▋       | 4607/16798 [19:52<56:37,  3.59it/s]

{'loss': 1.6707, 'grad_norm': 1.8629753589630127, 'learning_rate': 0.00014523469144627114, 'epoch': 0.27}


 27%|██▋       | 4608/16798 [19:52<59:18,  3.43it/s]

{'loss': 1.7732, 'grad_norm': 1.993282437324524, 'learning_rate': 0.00014522277817488683, 'epoch': 0.27}


 27%|██▋       | 4609/16798 [19:52<56:35,  3.59it/s]

{'loss': 1.4513, 'grad_norm': 1.5881986618041992, 'learning_rate': 0.0001452108649035025, 'epoch': 0.27}


 27%|██▋       | 4610/16798 [19:53<58:33,  3.47it/s]

{'loss': 1.4369, 'grad_norm': 1.8802521228790283, 'learning_rate': 0.0001451989516321182, 'epoch': 0.27}


 27%|██▋       | 4611/16798 [19:53<58:04,  3.50it/s]

{'loss': 1.5874, 'grad_norm': 1.7732616662979126, 'learning_rate': 0.00014518703836073385, 'epoch': 0.27}


 27%|██▋       | 4612/16798 [19:53<1:01:08,  3.32it/s]

{'loss': 1.4052, 'grad_norm': 1.6159782409667969, 'learning_rate': 0.00014517512508934954, 'epoch': 0.27}


 27%|██▋       | 4613/16798 [19:53<55:33,  3.65it/s]  

{'loss': 1.4348, 'grad_norm': 1.8486124277114868, 'learning_rate': 0.0001451632118179652, 'epoch': 0.27}


 27%|██▋       | 4614/16798 [19:54<58:02,  3.50it/s]

{'loss': 1.9035, 'grad_norm': 1.9583864212036133, 'learning_rate': 0.0001451512985465809, 'epoch': 0.27}


 27%|██▋       | 4615/16798 [19:54<53:55,  3.77it/s]

{'loss': 1.755, 'grad_norm': 2.0397543907165527, 'learning_rate': 0.00014513938527519656, 'epoch': 0.27}


 27%|██▋       | 4616/16798 [19:54<55:30,  3.66it/s]

{'loss': 1.6508, 'grad_norm': 1.7136746644973755, 'learning_rate': 0.00014512747200381225, 'epoch': 0.27}


 27%|██▋       | 4617/16798 [19:55<55:52,  3.63it/s]

{'loss': 1.9205, 'grad_norm': 2.4635746479034424, 'learning_rate': 0.00014511555873242791, 'epoch': 0.27}


 27%|██▋       | 4618/16798 [19:55<55:35,  3.65it/s]

{'loss': 1.3918, 'grad_norm': 1.6891026496887207, 'learning_rate': 0.0001451036454610436, 'epoch': 0.27}


 27%|██▋       | 4619/16798 [19:55<57:03,  3.56it/s]

{'loss': 1.3581, 'grad_norm': 1.6790815591812134, 'learning_rate': 0.00014509173218965927, 'epoch': 0.27}


 28%|██▊       | 4620/16798 [19:55<55:07,  3.68it/s]

{'loss': 1.8423, 'grad_norm': 2.11155366897583, 'learning_rate': 0.00014507981891827496, 'epoch': 0.28}


 28%|██▊       | 4621/16798 [19:56<1:01:11,  3.32it/s]

{'loss': 1.3753, 'grad_norm': 1.6720211505889893, 'learning_rate': 0.00014506790564689062, 'epoch': 0.28}


 28%|██▊       | 4622/16798 [19:56<56:30,  3.59it/s]  

{'loss': 1.5442, 'grad_norm': 2.1080751419067383, 'learning_rate': 0.00014505599237550631, 'epoch': 0.28}


 28%|██▊       | 4623/16798 [19:56<52:51,  3.84it/s]

{'loss': 1.6637, 'grad_norm': 1.961578130722046, 'learning_rate': 0.00014504407910412198, 'epoch': 0.28}


 28%|██▊       | 4624/16798 [19:57<56:36,  3.58it/s]

{'loss': 1.6168, 'grad_norm': 2.001816511154175, 'learning_rate': 0.0001450321658327377, 'epoch': 0.28}


 28%|██▊       | 4625/16798 [19:57<54:45,  3.71it/s]

{'loss': 1.7584, 'grad_norm': 2.137223243713379, 'learning_rate': 0.00014502025256135336, 'epoch': 0.28}


 28%|██▊       | 4626/16798 [19:57<57:11,  3.55it/s]

{'loss': 1.3188, 'grad_norm': 1.644221544265747, 'learning_rate': 0.00014500833928996905, 'epoch': 0.28}


 28%|██▊       | 4627/16798 [19:57<55:31,  3.65it/s]

{'loss': 1.3776, 'grad_norm': 1.7894401550292969, 'learning_rate': 0.0001449964260185847, 'epoch': 0.28}


 28%|██▊       | 4628/16798 [19:58<56:32,  3.59it/s]

{'loss': 1.5669, 'grad_norm': 1.9151781797409058, 'learning_rate': 0.0001449845127472004, 'epoch': 0.28}


 28%|██▊       | 4629/16798 [19:58<56:59,  3.56it/s]

{'loss': 1.5095, 'grad_norm': 1.8941411972045898, 'learning_rate': 0.00014497259947581607, 'epoch': 0.28}


 28%|██▊       | 4630/16798 [19:58<57:46,  3.51it/s]

{'loss': 1.6691, 'grad_norm': 1.9647947549819946, 'learning_rate': 0.00014496068620443176, 'epoch': 0.28}


 28%|██▊       | 4631/16798 [19:58<57:29,  3.53it/s]

{'loss': 1.1874, 'grad_norm': 1.6362011432647705, 'learning_rate': 0.00014494877293304742, 'epoch': 0.28}


 28%|██▊       | 4632/16798 [19:59<58:52,  3.44it/s]

{'loss': 1.3313, 'grad_norm': 1.5769867897033691, 'learning_rate': 0.0001449368596616631, 'epoch': 0.28}


 28%|██▊       | 4633/16798 [19:59<54:51,  3.70it/s]

{'loss': 1.5843, 'grad_norm': 1.873850703239441, 'learning_rate': 0.00014492494639027878, 'epoch': 0.28}


 28%|██▊       | 4634/16798 [19:59<56:39,  3.58it/s]

{'loss': 1.3097, 'grad_norm': 1.7753453254699707, 'learning_rate': 0.00014491303311889447, 'epoch': 0.28}


 28%|██▊       | 4635/16798 [20:00<56:50,  3.57it/s]

{'loss': 1.673, 'grad_norm': 2.0343244075775146, 'learning_rate': 0.00014490111984751013, 'epoch': 0.28}


 28%|██▊       | 4636/16798 [20:00<58:55,  3.44it/s]

{'loss': 1.6508, 'grad_norm': 2.234750270843506, 'learning_rate': 0.00014488920657612582, 'epoch': 0.28}


 28%|██▊       | 4637/16798 [20:00<55:20,  3.66it/s]

{'loss': 1.4692, 'grad_norm': 2.0066182613372803, 'learning_rate': 0.00014487729330474149, 'epoch': 0.28}


 28%|██▊       | 4638/16798 [20:00<59:12,  3.42it/s]

{'loss': 1.0837, 'grad_norm': 1.6038596630096436, 'learning_rate': 0.00014486538003335718, 'epoch': 0.28}


 28%|██▊       | 4639/16798 [20:01<58:17,  3.48it/s]

{'loss': 1.5968, 'grad_norm': 1.9838719367980957, 'learning_rate': 0.00014485346676197284, 'epoch': 0.28}


 28%|██▊       | 4640/16798 [20:01<55:53,  3.63it/s]

{'loss': 1.1717, 'grad_norm': 1.5953023433685303, 'learning_rate': 0.00014484155349058853, 'epoch': 0.28}


 28%|██▊       | 4641/16798 [20:01<56:30,  3.59it/s]

{'loss': 1.5635, 'grad_norm': 2.088958978652954, 'learning_rate': 0.0001448296402192042, 'epoch': 0.28}


 28%|██▊       | 4642/16798 [20:02<58:05,  3.49it/s]

{'loss': 1.3331, 'grad_norm': 2.1682231426239014, 'learning_rate': 0.00014481772694781989, 'epoch': 0.28}


 28%|██▊       | 4643/16798 [20:02<57:50,  3.50it/s]

{'loss': 1.0639, 'grad_norm': 1.744431734085083, 'learning_rate': 0.00014480581367643555, 'epoch': 0.28}


 28%|██▊       | 4644/16798 [20:02<59:00,  3.43it/s]

{'loss': 1.0938, 'grad_norm': 1.8472518920898438, 'learning_rate': 0.00014479390040505124, 'epoch': 0.28}


 28%|██▊       | 4645/16798 [20:02<56:07,  3.61it/s]

{'loss': 0.6066, 'grad_norm': 1.13019859790802, 'learning_rate': 0.0001447819871336669, 'epoch': 0.28}


 28%|██▊       | 4646/16798 [20:03<59:52,  3.38it/s]

{'loss': 0.4523, 'grad_norm': 1.0533092021942139, 'learning_rate': 0.0001447700738622826, 'epoch': 0.28}


 28%|██▊       | 4647/16798 [20:03<56:45,  3.57it/s]

{'loss': 0.3252, 'grad_norm': 0.9076198935508728, 'learning_rate': 0.00014475816059089826, 'epoch': 0.28}


 28%|██▊       | 4648/16798 [20:03<59:03,  3.43it/s]

{'loss': 0.4696, 'grad_norm': 1.0703943967819214, 'learning_rate': 0.00014474624731951395, 'epoch': 0.28}


 28%|██▊       | 4649/16798 [20:04<54:35,  3.71it/s]

{'loss': 0.3253, 'grad_norm': 0.8685603737831116, 'learning_rate': 0.0001447343340481296, 'epoch': 0.28}


 28%|██▊       | 4650/16798 [20:04<53:24,  3.79it/s]

{'loss': 0.68, 'grad_norm': 1.4961764812469482, 'learning_rate': 0.0001447224207767453, 'epoch': 0.28}


 28%|██▊       | 4651/16798 [20:04<57:00,  3.55it/s]

{'loss': 2.3902, 'grad_norm': 2.340198516845703, 'learning_rate': 0.00014471050750536097, 'epoch': 0.28}


 28%|██▊       | 4652/16798 [20:04<53:56,  3.75it/s]

{'loss': 1.7738, 'grad_norm': 1.9404555559158325, 'learning_rate': 0.00014469859423397666, 'epoch': 0.28}


 28%|██▊       | 4653/16798 [20:05<57:35,  3.52it/s]

{'loss': 1.8413, 'grad_norm': 1.8290632963180542, 'learning_rate': 0.00014468668096259232, 'epoch': 0.28}


 28%|██▊       | 4654/16798 [20:05<58:38,  3.45it/s]

{'loss': 2.5025, 'grad_norm': 2.0466253757476807, 'learning_rate': 0.000144674767691208, 'epoch': 0.28}


 28%|██▊       | 4655/16798 [20:05<1:00:16,  3.36it/s]

{'loss': 1.8233, 'grad_norm': 2.4118664264678955, 'learning_rate': 0.0001446628544198237, 'epoch': 0.28}


 28%|██▊       | 4656/16798 [20:06<56:31,  3.58it/s]  

{'loss': 2.0132, 'grad_norm': 3.4969420433044434, 'learning_rate': 0.00014465094114843937, 'epoch': 0.28}


 28%|██▊       | 4657/16798 [20:06<59:52,  3.38it/s]

{'loss': 2.2607, 'grad_norm': 2.0418546199798584, 'learning_rate': 0.00014463902787705506, 'epoch': 0.28}


 28%|██▊       | 4658/16798 [20:06<58:28,  3.46it/s]

{'loss': 1.7761, 'grad_norm': 1.944144368171692, 'learning_rate': 0.00014462711460567072, 'epoch': 0.28}


 28%|██▊       | 4659/16798 [20:06<1:00:27,  3.35it/s]

{'loss': 1.8644, 'grad_norm': 2.180992841720581, 'learning_rate': 0.0001446152013342864, 'epoch': 0.28}


 28%|██▊       | 4660/16798 [20:07<57:02,  3.55it/s]  

{'loss': 1.8002, 'grad_norm': 1.766295075416565, 'learning_rate': 0.00014460328806290208, 'epoch': 0.28}


 28%|██▊       | 4661/16798 [20:07<59:25,  3.40it/s]

{'loss': 1.5842, 'grad_norm': 1.7405437231063843, 'learning_rate': 0.00014459137479151777, 'epoch': 0.28}


 28%|██▊       | 4662/16798 [20:07<56:24,  3.59it/s]

{'loss': 1.4214, 'grad_norm': 1.7388091087341309, 'learning_rate': 0.00014457946152013343, 'epoch': 0.28}


 28%|██▊       | 4663/16798 [20:08<58:46,  3.44it/s]

{'loss': 2.1043, 'grad_norm': 1.9923745393753052, 'learning_rate': 0.00014456754824874912, 'epoch': 0.28}


 28%|██▊       | 4664/16798 [20:08<1:03:14,  3.20it/s]

{'loss': 1.4052, 'grad_norm': 1.579809546470642, 'learning_rate': 0.00014455563497736478, 'epoch': 0.28}


 28%|██▊       | 4665/16798 [20:08<1:00:04,  3.37it/s]

{'loss': 1.6123, 'grad_norm': 1.90740168094635, 'learning_rate': 0.00014454372170598047, 'epoch': 0.28}


 28%|██▊       | 4666/16798 [20:08<57:25,  3.52it/s]  

{'loss': 1.7845, 'grad_norm': 1.8415523767471313, 'learning_rate': 0.00014453180843459614, 'epoch': 0.28}


 28%|██▊       | 4667/16798 [20:09<1:00:54,  3.32it/s]

{'loss': 1.5431, 'grad_norm': 1.6690019369125366, 'learning_rate': 0.00014451989516321183, 'epoch': 0.28}


 28%|██▊       | 4668/16798 [20:09<1:00:15,  3.36it/s]

{'loss': 1.5953, 'grad_norm': 1.6298418045043945, 'learning_rate': 0.0001445079818918275, 'epoch': 0.28}


 28%|██▊       | 4669/16798 [20:09<59:55,  3.37it/s]  

{'loss': 1.4539, 'grad_norm': 1.7457172870635986, 'learning_rate': 0.00014449606862044318, 'epoch': 0.28}


 28%|██▊       | 4670/16798 [20:10<59:57,  3.37it/s]

{'loss': 1.2107, 'grad_norm': 1.5039715766906738, 'learning_rate': 0.00014448415534905885, 'epoch': 0.28}


 28%|██▊       | 4671/16798 [20:10<57:17,  3.53it/s]

{'loss': 1.0742, 'grad_norm': 1.696815013885498, 'learning_rate': 0.00014447224207767454, 'epoch': 0.28}


 28%|██▊       | 4672/16798 [20:10<58:05,  3.48it/s]

{'loss': 1.3929, 'grad_norm': 1.8888022899627686, 'learning_rate': 0.0001444603288062902, 'epoch': 0.28}


 28%|██▊       | 4673/16798 [20:10<55:15,  3.66it/s]

{'loss': 1.7598, 'grad_norm': 2.2231392860412598, 'learning_rate': 0.0001444484155349059, 'epoch': 0.28}


 28%|██▊       | 4674/16798 [20:11<56:50,  3.55it/s]

{'loss': 1.9778, 'grad_norm': 2.2866523265838623, 'learning_rate': 0.00014443650226352156, 'epoch': 0.28}


 28%|██▊       | 4675/16798 [20:11<57:30,  3.51it/s]

{'loss': 1.4979, 'grad_norm': 1.7196099758148193, 'learning_rate': 0.00014442458899213725, 'epoch': 0.28}


 28%|██▊       | 4676/16798 [20:11<58:44,  3.44it/s]

{'loss': 1.5769, 'grad_norm': 1.7513371706008911, 'learning_rate': 0.0001444126757207529, 'epoch': 0.28}


 28%|██▊       | 4677/16798 [20:12<56:23,  3.58it/s]

{'loss': 1.3555, 'grad_norm': 1.6783915758132935, 'learning_rate': 0.0001444007624493686, 'epoch': 0.28}


 28%|██▊       | 4678/16798 [20:12<56:37,  3.57it/s]

{'loss': 1.8367, 'grad_norm': 2.0863847732543945, 'learning_rate': 0.00014438884917798426, 'epoch': 0.28}


 28%|██▊       | 4679/16798 [20:12<56:26,  3.58it/s]

{'loss': 1.6437, 'grad_norm': 2.1546242237091064, 'learning_rate': 0.00014437693590659996, 'epoch': 0.28}


 28%|██▊       | 4680/16798 [20:12<56:26,  3.58it/s]

{'loss': 1.2449, 'grad_norm': 1.8992550373077393, 'learning_rate': 0.00014436502263521562, 'epoch': 0.28}


 28%|██▊       | 4681/16798 [20:13<56:36,  3.57it/s]

{'loss': 1.2996, 'grad_norm': 1.5340663194656372, 'learning_rate': 0.0001443531093638313, 'epoch': 0.28}


 28%|██▊       | 4682/16798 [20:13<56:12,  3.59it/s]

{'loss': 1.1573, 'grad_norm': 1.413630485534668, 'learning_rate': 0.00014434119609244697, 'epoch': 0.28}


 28%|██▊       | 4683/16798 [20:13<54:41,  3.69it/s]

{'loss': 1.8227, 'grad_norm': 2.3215956687927246, 'learning_rate': 0.00014432928282106266, 'epoch': 0.28}


 28%|██▊       | 4684/16798 [20:14<56:22,  3.58it/s]

{'loss': 1.3924, 'grad_norm': 1.6711362600326538, 'learning_rate': 0.00014431736954967833, 'epoch': 0.28}


 28%|██▊       | 4685/16798 [20:14<53:23,  3.78it/s]

{'loss': 1.4349, 'grad_norm': 1.652813196182251, 'learning_rate': 0.00014430545627829402, 'epoch': 0.28}


 28%|██▊       | 4686/16798 [20:14<56:40,  3.56it/s]

{'loss': 1.4808, 'grad_norm': 2.0436618328094482, 'learning_rate': 0.0001442935430069097, 'epoch': 0.28}


 28%|██▊       | 4687/16798 [20:14<53:37,  3.76it/s]

{'loss': 1.3484, 'grad_norm': 1.6536146402359009, 'learning_rate': 0.0001442816297355254, 'epoch': 0.28}


 28%|██▊       | 4688/16798 [20:15<54:38,  3.69it/s]

{'loss': 1.013, 'grad_norm': 1.485487699508667, 'learning_rate': 0.00014426971646414106, 'epoch': 0.28}


 28%|██▊       | 4689/16798 [20:15<56:07,  3.60it/s]

{'loss': 1.2277, 'grad_norm': 1.6959706544876099, 'learning_rate': 0.00014425780319275675, 'epoch': 0.28}


 28%|██▊       | 4690/16798 [20:15<57:15,  3.52it/s]

{'loss': 0.8285, 'grad_norm': 1.5051244497299194, 'learning_rate': 0.00014424588992137242, 'epoch': 0.28}


 28%|██▊       | 4691/16798 [20:15<53:37,  3.76it/s]

{'loss': 0.9004, 'grad_norm': 1.593787670135498, 'learning_rate': 0.0001442339766499881, 'epoch': 0.28}


 28%|██▊       | 4692/16798 [20:16<55:49,  3.61it/s]

{'loss': 1.195, 'grad_norm': 1.8894184827804565, 'learning_rate': 0.00014422206337860377, 'epoch': 0.28}


 28%|██▊       | 4693/16798 [20:16<54:14,  3.72it/s]

{'loss': 0.7936, 'grad_norm': 1.333169937133789, 'learning_rate': 0.00014421015010721946, 'epoch': 0.28}


 28%|██▊       | 4694/16798 [20:16<53:26,  3.78it/s]

{'loss': 1.2618, 'grad_norm': 2.2244625091552734, 'learning_rate': 0.00014419823683583513, 'epoch': 0.28}


 28%|██▊       | 4695/16798 [20:17<59:07,  3.41it/s]

{'loss': 0.933, 'grad_norm': 1.467822790145874, 'learning_rate': 0.00014418632356445082, 'epoch': 0.28}


 28%|██▊       | 4696/16798 [20:17<57:38,  3.50it/s]

{'loss': 1.1359, 'grad_norm': 1.7439521551132202, 'learning_rate': 0.00014417441029306648, 'epoch': 0.28}


 28%|██▊       | 4697/16798 [20:17<59:17,  3.40it/s]

{'loss': 0.9309, 'grad_norm': 1.8059829473495483, 'learning_rate': 0.00014416249702168217, 'epoch': 0.28}


 28%|██▊       | 4698/16798 [20:17<55:05,  3.66it/s]

{'loss': 0.6765, 'grad_norm': 1.5031288862228394, 'learning_rate': 0.00014415058375029784, 'epoch': 0.28}


 28%|██▊       | 4699/16798 [20:18<59:01,  3.42it/s]

{'loss': 0.248, 'grad_norm': 0.6588618159294128, 'learning_rate': 0.00014413867047891353, 'epoch': 0.28}


 28%|██▊       | 4700/16798 [20:18<59:12,  3.41it/s]

{'loss': 0.2193, 'grad_norm': 0.8066368699073792, 'learning_rate': 0.0001441267572075292, 'epoch': 0.28}


 28%|██▊       | 4701/16798 [20:18<55:48,  3.61it/s]

{'loss': 1.7283, 'grad_norm': 1.9695611000061035, 'learning_rate': 0.00014411484393614488, 'epoch': 0.28}


 28%|██▊       | 4702/16798 [20:19<56:53,  3.54it/s]

{'loss': 1.7455, 'grad_norm': 2.480523109436035, 'learning_rate': 0.00014410293066476054, 'epoch': 0.28}


 28%|██▊       | 4703/16798 [20:19<55:46,  3.61it/s]

{'loss': 1.7612, 'grad_norm': 2.0097081661224365, 'learning_rate': 0.00014409101739337624, 'epoch': 0.28}


 28%|██▊       | 4704/16798 [20:19<58:12,  3.46it/s]

{'loss': 1.6823, 'grad_norm': 1.7501710653305054, 'learning_rate': 0.0001440791041219919, 'epoch': 0.28}


 28%|██▊       | 4705/16798 [20:19<54:39,  3.69it/s]

{'loss': 2.0576, 'grad_norm': 2.0064358711242676, 'learning_rate': 0.0001440671908506076, 'epoch': 0.28}


 28%|██▊       | 4706/16798 [20:20<57:48,  3.49it/s]

{'loss': 2.1026, 'grad_norm': 2.3967812061309814, 'learning_rate': 0.00014405527757922325, 'epoch': 0.28}


 28%|██▊       | 4707/16798 [20:20<55:16,  3.65it/s]

{'loss': 1.7998, 'grad_norm': 2.0842106342315674, 'learning_rate': 0.00014404336430783894, 'epoch': 0.28}


 28%|██▊       | 4708/16798 [20:20<57:47,  3.49it/s]

{'loss': 2.0586, 'grad_norm': 2.032249927520752, 'learning_rate': 0.0001440314510364546, 'epoch': 0.28}


 28%|██▊       | 4709/16798 [20:21<55:58,  3.60it/s]

{'loss': 1.529, 'grad_norm': 1.8979476690292358, 'learning_rate': 0.0001440195377650703, 'epoch': 0.28}


 28%|██▊       | 4710/16798 [20:21<57:10,  3.52it/s]

{'loss': 1.5911, 'grad_norm': 1.7617624998092651, 'learning_rate': 0.00014400762449368596, 'epoch': 0.28}


 28%|██▊       | 4711/16798 [20:21<54:11,  3.72it/s]

{'loss': 1.8397, 'grad_norm': 2.083343982696533, 'learning_rate': 0.00014399571122230165, 'epoch': 0.28}


 28%|██▊       | 4712/16798 [20:21<57:03,  3.53it/s]

{'loss': 1.5524, 'grad_norm': 1.7844749689102173, 'learning_rate': 0.00014398379795091732, 'epoch': 0.28}


 28%|██▊       | 4713/16798 [20:22<54:07,  3.72it/s]

{'loss': 1.4909, 'grad_norm': 2.1268539428710938, 'learning_rate': 0.000143971884679533, 'epoch': 0.28}


 28%|██▊       | 4714/16798 [20:22<56:28,  3.57it/s]

{'loss': 1.5536, 'grad_norm': 1.9855461120605469, 'learning_rate': 0.00014395997140814867, 'epoch': 0.28}


 28%|██▊       | 4715/16798 [20:22<52:51,  3.81it/s]

{'loss': 1.396, 'grad_norm': 2.036242723464966, 'learning_rate': 0.00014394805813676436, 'epoch': 0.28}


 28%|██▊       | 4716/16798 [20:22<54:15,  3.71it/s]

{'loss': 1.7952, 'grad_norm': 2.1821677684783936, 'learning_rate': 0.00014393614486538003, 'epoch': 0.28}


 28%|██▊       | 4717/16798 [20:23<56:02,  3.59it/s]

{'loss': 1.2668, 'grad_norm': 1.6659826040267944, 'learning_rate': 0.00014392423159399572, 'epoch': 0.28}


 28%|██▊       | 4718/16798 [20:23<57:58,  3.47it/s]

{'loss': 1.445, 'grad_norm': 1.751090168952942, 'learning_rate': 0.0001439123183226114, 'epoch': 0.28}


 28%|██▊       | 4719/16798 [20:23<56:56,  3.54it/s]

{'loss': 1.4691, 'grad_norm': 1.649631142616272, 'learning_rate': 0.00014390040505122707, 'epoch': 0.28}


 28%|██▊       | 4720/16798 [20:24<57:43,  3.49it/s]

{'loss': 1.6303, 'grad_norm': 2.1328232288360596, 'learning_rate': 0.00014388849177984276, 'epoch': 0.28}


 28%|██▊       | 4721/16798 [20:24<58:28,  3.44it/s]

{'loss': 1.2053, 'grad_norm': 1.881196141242981, 'learning_rate': 0.00014387657850845843, 'epoch': 0.28}


 28%|██▊       | 4722/16798 [20:24<57:14,  3.52it/s]

{'loss': 1.4235, 'grad_norm': 1.651051640510559, 'learning_rate': 0.00014386466523707412, 'epoch': 0.28}


 28%|██▊       | 4723/16798 [20:25<1:00:38,  3.32it/s]

{'loss': 1.4605, 'grad_norm': 2.0878052711486816, 'learning_rate': 0.00014385275196568978, 'epoch': 0.28}


 28%|██▊       | 4724/16798 [20:25<59:34,  3.38it/s]  

{'loss': 1.6051, 'grad_norm': 1.9552879333496094, 'learning_rate': 0.00014384083869430547, 'epoch': 0.28}


 28%|██▊       | 4725/16798 [20:25<57:40,  3.49it/s]

{'loss': 1.2761, 'grad_norm': 1.6546661853790283, 'learning_rate': 0.00014382892542292113, 'epoch': 0.28}


 28%|██▊       | 4726/16798 [20:25<56:01,  3.59it/s]

{'loss': 1.593, 'grad_norm': 1.883682131767273, 'learning_rate': 0.00014381701215153683, 'epoch': 0.28}


 28%|██▊       | 4727/16798 [20:26<1:00:08,  3.35it/s]

{'loss': 1.3983, 'grad_norm': 1.7798253297805786, 'learning_rate': 0.0001438050988801525, 'epoch': 0.28}


 28%|██▊       | 4728/16798 [20:26<57:15,  3.51it/s]  

{'loss': 1.5061, 'grad_norm': 1.988573431968689, 'learning_rate': 0.00014379318560876818, 'epoch': 0.28}


 28%|██▊       | 4729/16798 [20:26<57:07,  3.52it/s]

{'loss': 1.6585, 'grad_norm': 2.1322991847991943, 'learning_rate': 0.00014378127233738384, 'epoch': 0.28}


 28%|██▊       | 4730/16798 [20:26<55:19,  3.64it/s]

{'loss': 1.0135, 'grad_norm': 1.5670826435089111, 'learning_rate': 0.00014376935906599953, 'epoch': 0.28}


 28%|██▊       | 4731/16798 [20:27<58:15,  3.45it/s]

{'loss': 1.5789, 'grad_norm': 1.8207745552062988, 'learning_rate': 0.0001437574457946152, 'epoch': 0.28}


 28%|██▊       | 4732/16798 [20:27<58:23,  3.44it/s]

{'loss': 1.0144, 'grad_norm': 1.6973395347595215, 'learning_rate': 0.0001437455325232309, 'epoch': 0.28}


 28%|██▊       | 4733/16798 [20:27<58:58,  3.41it/s]

{'loss': 1.7295, 'grad_norm': 2.06252121925354, 'learning_rate': 0.00014373361925184655, 'epoch': 0.28}


 28%|██▊       | 4734/16798 [20:28<55:03,  3.65it/s]

{'loss': 1.5403, 'grad_norm': 2.1565256118774414, 'learning_rate': 0.00014372170598046224, 'epoch': 0.28}


 28%|██▊       | 4735/16798 [20:28<57:18,  3.51it/s]

{'loss': 1.5865, 'grad_norm': 1.9953216314315796, 'learning_rate': 0.0001437097927090779, 'epoch': 0.28}


 28%|██▊       | 4736/16798 [20:28<53:36,  3.75it/s]

{'loss': 1.3892, 'grad_norm': 1.7399569749832153, 'learning_rate': 0.0001436978794376936, 'epoch': 0.28}


 28%|██▊       | 4737/16798 [20:28<57:45,  3.48it/s]

{'loss': 1.1513, 'grad_norm': 1.7195334434509277, 'learning_rate': 0.00014368596616630926, 'epoch': 0.28}


 28%|██▊       | 4738/16798 [20:29<55:19,  3.63it/s]

{'loss': 1.123, 'grad_norm': 1.8582372665405273, 'learning_rate': 0.00014367405289492495, 'epoch': 0.28}


 28%|██▊       | 4739/16798 [20:29<59:00,  3.41it/s]

{'loss': 0.8811, 'grad_norm': 1.604671597480774, 'learning_rate': 0.00014366213962354062, 'epoch': 0.28}


 28%|██▊       | 4740/16798 [20:29<54:54,  3.66it/s]

{'loss': 1.2065, 'grad_norm': 1.870652198791504, 'learning_rate': 0.0001436502263521563, 'epoch': 0.28}


 28%|██▊       | 4741/16798 [20:30<55:03,  3.65it/s]

{'loss': 0.8697, 'grad_norm': 1.53998601436615, 'learning_rate': 0.00014363831308077197, 'epoch': 0.28}


 28%|██▊       | 4742/16798 [20:30<52:49,  3.80it/s]

{'loss': 1.1056, 'grad_norm': 1.8179212808609009, 'learning_rate': 0.00014362639980938766, 'epoch': 0.28}


 28%|██▊       | 4743/16798 [20:30<58:22,  3.44it/s]

{'loss': 1.2806, 'grad_norm': 1.9730149507522583, 'learning_rate': 0.00014361448653800332, 'epoch': 0.28}


 28%|██▊       | 4744/16798 [20:30<55:32,  3.62it/s]

{'loss': 1.0372, 'grad_norm': 1.6059414148330688, 'learning_rate': 0.00014360257326661901, 'epoch': 0.28}


 28%|██▊       | 4745/16798 [20:31<57:42,  3.48it/s]

{'loss': 0.8815, 'grad_norm': 1.8157527446746826, 'learning_rate': 0.00014359065999523468, 'epoch': 0.28}


 28%|██▊       | 4746/16798 [20:31<58:32,  3.43it/s]

{'loss': 1.0865, 'grad_norm': 1.7089321613311768, 'learning_rate': 0.00014357874672385037, 'epoch': 0.28}


 28%|██▊       | 4747/16798 [20:31<57:24,  3.50it/s]

{'loss': 0.3857, 'grad_norm': 0.8612333536148071, 'learning_rate': 0.00014356683345246603, 'epoch': 0.28}


 28%|██▊       | 4748/16798 [20:32<56:57,  3.53it/s]

{'loss': 0.7739, 'grad_norm': 1.8324229717254639, 'learning_rate': 0.00014355492018108175, 'epoch': 0.28}


 28%|██▊       | 4749/16798 [20:32<59:22,  3.38it/s]

{'loss': 0.3396, 'grad_norm': 0.8247376084327698, 'learning_rate': 0.00014354300690969741, 'epoch': 0.28}


 28%|██▊       | 4750/16798 [20:32<57:32,  3.49it/s]

{'loss': 0.4649, 'grad_norm': 1.1871955394744873, 'learning_rate': 0.0001435310936383131, 'epoch': 0.28}


 28%|██▊       | 4751/16798 [20:32<59:48,  3.36it/s]

{'loss': 1.8917, 'grad_norm': 1.6923234462738037, 'learning_rate': 0.00014351918036692877, 'epoch': 0.28}


 28%|██▊       | 4752/16798 [20:33<57:05,  3.52it/s]

{'loss': 2.5006, 'grad_norm': 2.077353000640869, 'learning_rate': 0.00014350726709554446, 'epoch': 0.28}


 28%|██▊       | 4753/16798 [20:33<59:04,  3.40it/s]

{'loss': 2.3488, 'grad_norm': 2.061901807785034, 'learning_rate': 0.00014349535382416012, 'epoch': 0.28}


 28%|██▊       | 4754/16798 [20:33<55:30,  3.62it/s]

{'loss': 1.4223, 'grad_norm': 1.5574381351470947, 'learning_rate': 0.00014348344055277581, 'epoch': 0.28}


 28%|██▊       | 4755/16798 [20:34<56:44,  3.54it/s]

{'loss': 1.9392, 'grad_norm': 1.791896939277649, 'learning_rate': 0.00014347152728139148, 'epoch': 0.28}


 28%|██▊       | 4756/16798 [20:34<56:05,  3.58it/s]

{'loss': 2.0179, 'grad_norm': 1.918726921081543, 'learning_rate': 0.00014345961401000717, 'epoch': 0.28}


 28%|██▊       | 4757/16798 [20:34<58:19,  3.44it/s]

{'loss': 1.9246, 'grad_norm': 1.9113729000091553, 'learning_rate': 0.00014344770073862283, 'epoch': 0.28}


 28%|██▊       | 4758/16798 [20:34<55:11,  3.64it/s]

{'loss': 1.677, 'grad_norm': 1.7619836330413818, 'learning_rate': 0.00014343578746723852, 'epoch': 0.28}


 28%|██▊       | 4759/16798 [20:35<58:15,  3.44it/s]

{'loss': 2.1792, 'grad_norm': 2.0853934288024902, 'learning_rate': 0.0001434238741958542, 'epoch': 0.28}


 28%|██▊       | 4760/16798 [20:35<55:11,  3.64it/s]

{'loss': 1.6079, 'grad_norm': 1.7118252515792847, 'learning_rate': 0.00014341196092446988, 'epoch': 0.28}


 28%|██▊       | 4761/16798 [20:35<58:13,  3.45it/s]

{'loss': 1.5991, 'grad_norm': 1.8022152185440063, 'learning_rate': 0.00014340004765308554, 'epoch': 0.28}


 28%|██▊       | 4762/16798 [20:36<56:38,  3.54it/s]

{'loss': 2.1154, 'grad_norm': 2.1773500442504883, 'learning_rate': 0.00014338813438170123, 'epoch': 0.28}


 28%|██▊       | 4763/16798 [20:36<58:50,  3.41it/s]

{'loss': 1.5237, 'grad_norm': 1.9822059869766235, 'learning_rate': 0.0001433762211103169, 'epoch': 0.28}


 28%|██▊       | 4764/16798 [20:36<1:01:00,  3.29it/s]

{'loss': 1.4712, 'grad_norm': 1.7705546617507935, 'learning_rate': 0.00014336430783893259, 'epoch': 0.28}


 28%|██▊       | 4765/16798 [20:37<1:01:19,  3.27it/s]

{'loss': 1.4229, 'grad_norm': 1.8096847534179688, 'learning_rate': 0.00014335239456754825, 'epoch': 0.28}


 28%|██▊       | 4766/16798 [20:37<59:02,  3.40it/s]  

{'loss': 1.4247, 'grad_norm': 1.6110745668411255, 'learning_rate': 0.00014334048129616394, 'epoch': 0.28}


 28%|██▊       | 4767/16798 [20:37<59:49,  3.35it/s]

{'loss': 1.8011, 'grad_norm': 1.8318425416946411, 'learning_rate': 0.0001433285680247796, 'epoch': 0.28}


 28%|██▊       | 4768/16798 [20:37<55:56,  3.58it/s]

{'loss': 1.8836, 'grad_norm': 2.8114898204803467, 'learning_rate': 0.0001433166547533953, 'epoch': 0.28}


 28%|██▊       | 4769/16798 [20:38<58:51,  3.41it/s]

{'loss': 1.4528, 'grad_norm': 1.637561321258545, 'learning_rate': 0.00014330474148201096, 'epoch': 0.28}


 28%|██▊       | 4770/16798 [20:38<58:27,  3.43it/s]

{'loss': 2.0427, 'grad_norm': 2.229871988296509, 'learning_rate': 0.00014329282821062665, 'epoch': 0.28}


 28%|██▊       | 4771/16798 [20:38<1:00:45,  3.30it/s]

{'loss': 1.2061, 'grad_norm': 1.7033627033233643, 'learning_rate': 0.0001432809149392423, 'epoch': 0.28}


 28%|██▊       | 4772/16798 [20:39<57:55,  3.46it/s]  

{'loss': 1.457, 'grad_norm': 1.9635838270187378, 'learning_rate': 0.000143269001667858, 'epoch': 0.28}


 28%|██▊       | 4773/16798 [20:39<57:55,  3.46it/s]

{'loss': 1.2797, 'grad_norm': 1.7289483547210693, 'learning_rate': 0.00014325708839647367, 'epoch': 0.28}


 28%|██▊       | 4774/16798 [20:39<54:50,  3.65it/s]

{'loss': 1.4726, 'grad_norm': 1.9834719896316528, 'learning_rate': 0.00014324517512508936, 'epoch': 0.28}


 28%|██▊       | 4775/16798 [20:39<57:23,  3.49it/s]

{'loss': 1.383, 'grad_norm': 1.5808045864105225, 'learning_rate': 0.00014323326185370502, 'epoch': 0.28}


 28%|██▊       | 4776/16798 [20:40<58:09,  3.45it/s]

{'loss': 1.0127, 'grad_norm': 1.599139928817749, 'learning_rate': 0.0001432213485823207, 'epoch': 0.28}


 28%|██▊       | 4777/16798 [20:40<59:37,  3.36it/s]

{'loss': 1.1542, 'grad_norm': 4.286227703094482, 'learning_rate': 0.00014320943531093638, 'epoch': 0.28}


 28%|██▊       | 4778/16798 [20:40<57:13,  3.50it/s]

{'loss': 1.4229, 'grad_norm': 1.8106003999710083, 'learning_rate': 0.00014319752203955207, 'epoch': 0.28}


 28%|██▊       | 4779/16798 [20:41<1:01:29,  3.26it/s]

{'loss': 1.0785, 'grad_norm': 1.3379758596420288, 'learning_rate': 0.00014318560876816776, 'epoch': 0.28}


 28%|██▊       | 4780/16798 [20:41<58:08,  3.45it/s]  

{'loss': 1.7979, 'grad_norm': 2.3296310901641846, 'learning_rate': 0.00014317369549678342, 'epoch': 0.28}


 28%|██▊       | 4781/16798 [20:41<58:48,  3.41it/s]

{'loss': 1.3994, 'grad_norm': 1.8551167249679565, 'learning_rate': 0.0001431617822253991, 'epoch': 0.28}


 28%|██▊       | 4782/16798 [20:41<58:33,  3.42it/s]

{'loss': 0.9111, 'grad_norm': 1.4376449584960938, 'learning_rate': 0.00014314986895401478, 'epoch': 0.28}


 28%|██▊       | 4783/16798 [20:42<58:05,  3.45it/s]

{'loss': 1.6826, 'grad_norm': 2.1199493408203125, 'learning_rate': 0.00014313795568263047, 'epoch': 0.28}


 28%|██▊       | 4784/16798 [20:42<55:05,  3.63it/s]

{'loss': 1.2903, 'grad_norm': 2.2414984703063965, 'learning_rate': 0.00014312604241124613, 'epoch': 0.28}


 28%|██▊       | 4785/16798 [20:42<56:16,  3.56it/s]

{'loss': 1.6789, 'grad_norm': 1.9912029504776, 'learning_rate': 0.00014311412913986182, 'epoch': 0.28}


 28%|██▊       | 4786/16798 [20:43<58:39,  3.41it/s]

{'loss': 1.3392, 'grad_norm': 1.8805441856384277, 'learning_rate': 0.00014310221586847748, 'epoch': 0.28}


 28%|██▊       | 4787/16798 [20:43<1:00:59,  3.28it/s]

{'loss': 1.6756, 'grad_norm': 2.080490827560425, 'learning_rate': 0.00014309030259709318, 'epoch': 0.28}


 29%|██▊       | 4788/16798 [20:43<55:42,  3.59it/s]  

{'loss': 0.9629, 'grad_norm': 2.1673851013183594, 'learning_rate': 0.00014307838932570884, 'epoch': 0.29}


 29%|██▊       | 4789/16798 [20:43<55:11,  3.63it/s]

{'loss': 1.2905, 'grad_norm': 1.8209154605865479, 'learning_rate': 0.00014306647605432453, 'epoch': 0.29}


 29%|██▊       | 4790/16798 [20:44<57:52,  3.46it/s]

{'loss': 1.1813, 'grad_norm': 1.679299235343933, 'learning_rate': 0.0001430545627829402, 'epoch': 0.29}


 29%|██▊       | 4791/16798 [20:44<59:58,  3.34it/s]

{'loss': 1.6757, 'grad_norm': 2.2017953395843506, 'learning_rate': 0.00014304264951155588, 'epoch': 0.29}


 29%|██▊       | 4792/16798 [20:44<56:36,  3.53it/s]

{'loss': 1.0203, 'grad_norm': 1.6798193454742432, 'learning_rate': 0.00014303073624017155, 'epoch': 0.29}


 29%|██▊       | 4793/16798 [20:45<58:01,  3.45it/s]

{'loss': 1.7431, 'grad_norm': 2.240643262863159, 'learning_rate': 0.00014301882296878724, 'epoch': 0.29}


 29%|██▊       | 4794/16798 [20:45<56:59,  3.51it/s]

{'loss': 1.3015, 'grad_norm': 2.044591188430786, 'learning_rate': 0.0001430069096974029, 'epoch': 0.29}


 29%|██▊       | 4795/16798 [20:45<58:58,  3.39it/s]

{'loss': 0.9002, 'grad_norm': 1.4531959295272827, 'learning_rate': 0.0001429949964260186, 'epoch': 0.29}


 29%|██▊       | 4796/16798 [20:45<55:19,  3.62it/s]

{'loss': 1.3077, 'grad_norm': 1.720200538635254, 'learning_rate': 0.00014298308315463426, 'epoch': 0.29}


 29%|██▊       | 4797/16798 [20:46<59:26,  3.36it/s]

{'loss': 0.4565, 'grad_norm': 0.9325425624847412, 'learning_rate': 0.00014297116988324995, 'epoch': 0.29}


 29%|██▊       | 4798/16798 [20:46<55:22,  3.61it/s]

{'loss': 0.2872, 'grad_norm': 0.7716797590255737, 'learning_rate': 0.0001429592566118656, 'epoch': 0.29}


 29%|██▊       | 4799/16798 [20:46<56:42,  3.53it/s]

{'loss': 0.4541, 'grad_norm': 1.1213444471359253, 'learning_rate': 0.0001429473433404813, 'epoch': 0.29}


 29%|██▊       | 4800/16798 [20:47<55:54,  3.58it/s]

{'loss': 0.7991, 'grad_norm': 1.5159305334091187, 'learning_rate': 0.00014293543006909697, 'epoch': 0.29}


 29%|██▊       | 4801/16798 [20:47<59:20,  3.37it/s]

{'loss': 2.0066, 'grad_norm': 1.779913067817688, 'learning_rate': 0.00014292351679771266, 'epoch': 0.29}


 29%|██▊       | 4802/16798 [20:47<56:49,  3.52it/s]

{'loss': 2.0542, 'grad_norm': 1.835160732269287, 'learning_rate': 0.00014291160352632832, 'epoch': 0.29}


 29%|██▊       | 4803/16798 [20:47<58:31,  3.42it/s]

{'loss': 2.269, 'grad_norm': 2.5302939414978027, 'learning_rate': 0.000142899690254944, 'epoch': 0.29}


 29%|██▊       | 4804/16798 [20:48<58:41,  3.41it/s]

{'loss': 2.1042, 'grad_norm': 2.7810781002044678, 'learning_rate': 0.00014288777698355967, 'epoch': 0.29}


 29%|██▊       | 4805/16798 [20:48<59:06,  3.38it/s]

{'loss': 1.9867, 'grad_norm': 1.9855787754058838, 'learning_rate': 0.00014287586371217537, 'epoch': 0.29}


 29%|██▊       | 4806/16798 [20:48<54:17,  3.68it/s]

{'loss': 2.354, 'grad_norm': 2.2139642238616943, 'learning_rate': 0.00014286395044079103, 'epoch': 0.29}


 29%|██▊       | 4807/16798 [20:49<58:40,  3.41it/s]

{'loss': 2.211, 'grad_norm': 2.0137288570404053, 'learning_rate': 0.00014285203716940672, 'epoch': 0.29}


 29%|██▊       | 4808/16798 [20:49<56:19,  3.55it/s]

{'loss': 1.2645, 'grad_norm': 1.807318925857544, 'learning_rate': 0.00014284012389802238, 'epoch': 0.29}


 29%|██▊       | 4809/16798 [20:49<58:02,  3.44it/s]

{'loss': 1.6217, 'grad_norm': 1.7137877941131592, 'learning_rate': 0.0001428282106266381, 'epoch': 0.29}


 29%|██▊       | 4810/16798 [20:50<59:09,  3.38it/s]

{'loss': 1.425, 'grad_norm': 1.7433280944824219, 'learning_rate': 0.00014281629735525377, 'epoch': 0.29}


 29%|██▊       | 4811/16798 [20:50<59:46,  3.34it/s]

{'loss': 1.9534, 'grad_norm': 1.8802170753479004, 'learning_rate': 0.00014280438408386946, 'epoch': 0.29}


 29%|██▊       | 4812/16798 [20:50<58:19,  3.42it/s]

{'loss': 1.6142, 'grad_norm': 2.0265042781829834, 'learning_rate': 0.00014279247081248512, 'epoch': 0.29}


 29%|██▊       | 4813/16798 [20:50<58:46,  3.40it/s]

{'loss': 2.2596, 'grad_norm': 2.24783992767334, 'learning_rate': 0.0001427805575411008, 'epoch': 0.29}


 29%|██▊       | 4814/16798 [20:51<56:22,  3.54it/s]

{'loss': 1.5472, 'grad_norm': 1.8870964050292969, 'learning_rate': 0.00014276864426971647, 'epoch': 0.29}


 29%|██▊       | 4815/16798 [20:51<58:39,  3.40it/s]

{'loss': 1.9071, 'grad_norm': 2.219984769821167, 'learning_rate': 0.00014275673099833216, 'epoch': 0.29}


 29%|██▊       | 4816/16798 [20:51<56:32,  3.53it/s]

{'loss': 1.5582, 'grad_norm': 2.1525025367736816, 'learning_rate': 0.00014274481772694783, 'epoch': 0.29}


 29%|██▊       | 4817/16798 [20:52<58:47,  3.40it/s]

{'loss': 1.6814, 'grad_norm': 2.1676127910614014, 'learning_rate': 0.00014273290445556352, 'epoch': 0.29}


 29%|██▊       | 4818/16798 [20:52<54:32,  3.66it/s]

{'loss': 1.4749, 'grad_norm': 1.9273909330368042, 'learning_rate': 0.00014272099118417918, 'epoch': 0.29}


 29%|██▊       | 4819/16798 [20:52<56:50,  3.51it/s]

{'loss': 1.1976, 'grad_norm': 1.4831247329711914, 'learning_rate': 0.00014270907791279487, 'epoch': 0.29}


 29%|██▊       | 4820/16798 [20:52<57:30,  3.47it/s]

{'loss': 1.2142, 'grad_norm': 1.8614912033081055, 'learning_rate': 0.00014269716464141054, 'epoch': 0.29}


 29%|██▊       | 4821/16798 [20:53<1:01:08,  3.27it/s]

{'loss': 1.6247, 'grad_norm': 1.8900717496871948, 'learning_rate': 0.00014268525137002623, 'epoch': 0.29}


 29%|██▊       | 4822/16798 [20:53<58:28,  3.41it/s]  

{'loss': 1.7342, 'grad_norm': 2.492696523666382, 'learning_rate': 0.0001426733380986419, 'epoch': 0.29}


 29%|██▊       | 4823/16798 [20:53<1:00:06,  3.32it/s]

{'loss': 1.2732, 'grad_norm': 1.7574998140335083, 'learning_rate': 0.00014266142482725758, 'epoch': 0.29}


 29%|██▊       | 4824/16798 [20:54<57:55,  3.45it/s]  

{'loss': 1.7898, 'grad_norm': 2.204455852508545, 'learning_rate': 0.00014264951155587325, 'epoch': 0.29}


 29%|██▊       | 4825/16798 [20:54<58:19,  3.42it/s]

{'loss': 1.8644, 'grad_norm': 2.0646591186523438, 'learning_rate': 0.00014263759828448894, 'epoch': 0.29}


 29%|██▊       | 4826/16798 [20:54<1:01:37,  3.24it/s]

{'loss': 1.3188, 'grad_norm': 1.8125958442687988, 'learning_rate': 0.0001426256850131046, 'epoch': 0.29}


 29%|██▊       | 4827/16798 [20:55<1:01:03,  3.27it/s]

{'loss': 1.683, 'grad_norm': 1.9247889518737793, 'learning_rate': 0.0001426137717417203, 'epoch': 0.29}


 29%|██▊       | 4828/16798 [20:55<58:37,  3.40it/s]  

{'loss': 1.3955, 'grad_norm': 2.009503126144409, 'learning_rate': 0.00014260185847033595, 'epoch': 0.29}


 29%|██▊       | 4829/16798 [20:55<59:29,  3.35it/s]

{'loss': 1.3435, 'grad_norm': 1.794101595878601, 'learning_rate': 0.00014258994519895165, 'epoch': 0.29}


 29%|██▉       | 4830/16798 [20:55<57:04,  3.49it/s]

{'loss': 1.2619, 'grad_norm': 1.6469290256500244, 'learning_rate': 0.0001425780319275673, 'epoch': 0.29}


 29%|██▉       | 4831/16798 [20:56<1:03:18,  3.15it/s]

{'loss': 1.2677, 'grad_norm': 1.8788834810256958, 'learning_rate': 0.000142566118656183, 'epoch': 0.29}


 29%|██▉       | 4832/16798 [20:56<58:07,  3.43it/s]  

{'loss': 1.1068, 'grad_norm': 1.777239441871643, 'learning_rate': 0.00014255420538479866, 'epoch': 0.29}


 29%|██▉       | 4833/16798 [20:56<58:42,  3.40it/s]

{'loss': 1.1407, 'grad_norm': 1.7329597473144531, 'learning_rate': 0.00014254229211341435, 'epoch': 0.29}


 29%|██▉       | 4834/16798 [20:57<59:51,  3.33it/s]

{'loss': 1.4711, 'grad_norm': 1.9105321168899536, 'learning_rate': 0.00014253037884203002, 'epoch': 0.29}


 29%|██▉       | 4835/16798 [20:57<55:38,  3.58it/s]

{'loss': 1.5732, 'grad_norm': 2.167102575302124, 'learning_rate': 0.0001425184655706457, 'epoch': 0.29}


 29%|██▉       | 4836/16798 [20:57<52:07,  3.83it/s]

{'loss': 0.7827, 'grad_norm': 1.4863178730010986, 'learning_rate': 0.00014250655229926137, 'epoch': 0.29}


 29%|██▉       | 4837/16798 [20:57<54:58,  3.63it/s]

{'loss': 0.991, 'grad_norm': 1.687950611114502, 'learning_rate': 0.00014249463902787706, 'epoch': 0.29}


 29%|██▉       | 4838/16798 [20:58<55:51,  3.57it/s]

{'loss': 0.7636, 'grad_norm': 1.3778738975524902, 'learning_rate': 0.00014248272575649273, 'epoch': 0.29}


 29%|██▉       | 4839/16798 [20:58<56:36,  3.52it/s]

{'loss': 1.2935, 'grad_norm': 1.9430664777755737, 'learning_rate': 0.00014247081248510842, 'epoch': 0.29}


 29%|██▉       | 4840/16798 [20:58<55:33,  3.59it/s]

{'loss': 1.5902, 'grad_norm': 1.9776544570922852, 'learning_rate': 0.0001424588992137241, 'epoch': 0.29}


 29%|██▉       | 4841/16798 [20:58<56:53,  3.50it/s]

{'loss': 1.0357, 'grad_norm': 1.61662757396698, 'learning_rate': 0.00014244698594233977, 'epoch': 0.29}


 29%|██▉       | 4842/16798 [20:59<54:54,  3.63it/s]

{'loss': 1.1372, 'grad_norm': 1.7334095239639282, 'learning_rate': 0.00014243507267095546, 'epoch': 0.29}


 29%|██▉       | 4843/16798 [20:59<56:26,  3.53it/s]

{'loss': 0.9948, 'grad_norm': 1.6212109327316284, 'learning_rate': 0.00014242315939957113, 'epoch': 0.29}


 29%|██▉       | 4844/16798 [20:59<52:10,  3.82it/s]

{'loss': 1.0022, 'grad_norm': 1.4640108346939087, 'learning_rate': 0.00014241124612818682, 'epoch': 0.29}


 29%|██▉       | 4845/16798 [21:00<53:12,  3.74it/s]

{'loss': 0.6343, 'grad_norm': 1.2119839191436768, 'learning_rate': 0.00014239933285680248, 'epoch': 0.29}


 29%|██▉       | 4846/16798 [21:00<54:01,  3.69it/s]

{'loss': 1.1532, 'grad_norm': 1.9283413887023926, 'learning_rate': 0.00014238741958541817, 'epoch': 0.29}


 29%|██▉       | 4847/16798 [21:00<56:45,  3.51it/s]

{'loss': 0.4366, 'grad_norm': 1.3123761415481567, 'learning_rate': 0.00014237550631403384, 'epoch': 0.29}


 29%|██▉       | 4848/16798 [21:00<54:37,  3.65it/s]

{'loss': 1.1344, 'grad_norm': 1.7609659433364868, 'learning_rate': 0.00014236359304264953, 'epoch': 0.29}


 29%|██▉       | 4849/16798 [21:01<55:50,  3.57it/s]

{'loss': 0.5478, 'grad_norm': 1.2606689929962158, 'learning_rate': 0.0001423516797712652, 'epoch': 0.29}


 29%|██▉       | 4850/16798 [21:01<55:49,  3.57it/s]

{'loss': 0.2326, 'grad_norm': 0.6981527209281921, 'learning_rate': 0.00014233976649988088, 'epoch': 0.29}


 29%|██▉       | 4851/16798 [21:01<57:28,  3.46it/s]

{'loss': 2.2391, 'grad_norm': 1.853165626525879, 'learning_rate': 0.00014232785322849654, 'epoch': 0.29}


 29%|██▉       | 4852/16798 [21:02<55:24,  3.59it/s]

{'loss': 2.0057, 'grad_norm': 1.9340423345565796, 'learning_rate': 0.00014231593995711223, 'epoch': 0.29}


 29%|██▉       | 4853/16798 [21:02<58:13,  3.42it/s]

{'loss': 1.9905, 'grad_norm': 1.6262905597686768, 'learning_rate': 0.0001423040266857279, 'epoch': 0.29}


 29%|██▉       | 4854/16798 [21:02<54:09,  3.68it/s]

{'loss': 2.0479, 'grad_norm': 1.8474621772766113, 'learning_rate': 0.0001422921134143436, 'epoch': 0.29}


 29%|██▉       | 4855/16798 [21:02<53:06,  3.75it/s]

{'loss': 2.1622, 'grad_norm': 2.0834693908691406, 'learning_rate': 0.00014228020014295925, 'epoch': 0.29}


 29%|██▉       | 4856/16798 [21:03<58:29,  3.40it/s]

{'loss': 1.9815, 'grad_norm': 2.246042251586914, 'learning_rate': 0.00014226828687157494, 'epoch': 0.29}


 29%|██▉       | 4857/16798 [21:03<54:19,  3.66it/s]

{'loss': 2.0597, 'grad_norm': 1.8085306882858276, 'learning_rate': 0.0001422563736001906, 'epoch': 0.29}


 29%|██▉       | 4858/16798 [21:03<57:38,  3.45it/s]

{'loss': 2.0612, 'grad_norm': 1.8977775573730469, 'learning_rate': 0.0001422444603288063, 'epoch': 0.29}


 29%|██▉       | 4859/16798 [21:04<57:27,  3.46it/s]

{'loss': 2.0196, 'grad_norm': 1.8434584140777588, 'learning_rate': 0.00014223254705742196, 'epoch': 0.29}


 29%|██▉       | 4860/16798 [21:04<58:21,  3.41it/s]

{'loss': 2.0326, 'grad_norm': 1.82106351852417, 'learning_rate': 0.00014222063378603765, 'epoch': 0.29}


 29%|██▉       | 4861/16798 [21:04<1:03:16,  3.14it/s]

{'loss': 2.301, 'grad_norm': 2.0534160137176514, 'learning_rate': 0.00014220872051465332, 'epoch': 0.29}


 29%|██▉       | 4862/16798 [21:05<1:03:14,  3.15it/s]

{'loss': 1.7612, 'grad_norm': 2.1576550006866455, 'learning_rate': 0.000142196807243269, 'epoch': 0.29}


 29%|██▉       | 4863/16798 [21:05<1:01:07,  3.25it/s]

{'loss': 2.0172, 'grad_norm': 1.8576027154922485, 'learning_rate': 0.00014218489397188467, 'epoch': 0.29}


 29%|██▉       | 4864/16798 [21:05<1:02:41,  3.17it/s]

{'loss': 1.9549, 'grad_norm': 1.8901352882385254, 'learning_rate': 0.00014217298070050036, 'epoch': 0.29}


 29%|██▉       | 4865/16798 [21:05<58:39,  3.39it/s]  

{'loss': 1.6417, 'grad_norm': 1.841941237449646, 'learning_rate': 0.00014216106742911603, 'epoch': 0.29}


 29%|██▉       | 4866/16798 [21:06<58:49,  3.38it/s]

{'loss': 2.0978, 'grad_norm': 2.0955810546875, 'learning_rate': 0.00014214915415773172, 'epoch': 0.29}


 29%|██▉       | 4867/16798 [21:06<53:44,  3.70it/s]

{'loss': 1.2476, 'grad_norm': 1.5911716222763062, 'learning_rate': 0.00014213724088634738, 'epoch': 0.29}


 29%|██▉       | 4868/16798 [21:06<58:57,  3.37it/s]

{'loss': 1.3799, 'grad_norm': 1.8359209299087524, 'learning_rate': 0.00014212532761496307, 'epoch': 0.29}


 29%|██▉       | 4869/16798 [21:07<57:58,  3.43it/s]

{'loss': 1.3101, 'grad_norm': 1.680979609489441, 'learning_rate': 0.00014211341434357873, 'epoch': 0.29}


 29%|██▉       | 4870/16798 [21:07<1:00:04,  3.31it/s]

{'loss': 1.4751, 'grad_norm': 1.6984076499938965, 'learning_rate': 0.00014210150107219442, 'epoch': 0.29}


 29%|██▉       | 4871/16798 [21:07<58:44,  3.38it/s]  

{'loss': 1.5631, 'grad_norm': 1.7934340238571167, 'learning_rate': 0.00014208958780081012, 'epoch': 0.29}


 29%|██▉       | 4872/16798 [21:07<59:35,  3.34it/s]

{'loss': 1.7912, 'grad_norm': 2.5066335201263428, 'learning_rate': 0.0001420776745294258, 'epoch': 0.29}


 29%|██▉       | 4873/16798 [21:08<58:19,  3.41it/s]

{'loss': 1.1536, 'grad_norm': 1.8524985313415527, 'learning_rate': 0.00014206576125804147, 'epoch': 0.29}


 29%|██▉       | 4874/16798 [21:08<59:07,  3.36it/s]

{'loss': 1.627, 'grad_norm': 2.013859987258911, 'learning_rate': 0.00014205384798665716, 'epoch': 0.29}


 29%|██▉       | 4875/16798 [21:08<57:14,  3.47it/s]

{'loss': 1.5864, 'grad_norm': 2.139970064163208, 'learning_rate': 0.00014204193471527282, 'epoch': 0.29}


 29%|██▉       | 4876/16798 [21:09<1:00:51,  3.26it/s]

{'loss': 1.9981, 'grad_norm': 1.9876080751419067, 'learning_rate': 0.00014203002144388852, 'epoch': 0.29}


 29%|██▉       | 4877/16798 [21:09<59:33,  3.34it/s]  

{'loss': 1.6397, 'grad_norm': 1.8744544982910156, 'learning_rate': 0.00014201810817250418, 'epoch': 0.29}


 29%|██▉       | 4878/16798 [21:09<58:18,  3.41it/s]

{'loss': 1.3792, 'grad_norm': 1.8982398509979248, 'learning_rate': 0.00014200619490111987, 'epoch': 0.29}


 29%|██▉       | 4879/16798 [21:09<55:45,  3.56it/s]

{'loss': 1.5385, 'grad_norm': 1.6540299654006958, 'learning_rate': 0.00014199428162973553, 'epoch': 0.29}


 29%|██▉       | 4880/16798 [21:10<57:32,  3.45it/s]

{'loss': 1.3946, 'grad_norm': 1.9452968835830688, 'learning_rate': 0.00014198236835835122, 'epoch': 0.29}


 29%|██▉       | 4881/16798 [21:10<56:26,  3.52it/s]

{'loss': 0.9162, 'grad_norm': 1.7101974487304688, 'learning_rate': 0.0001419704550869669, 'epoch': 0.29}


 29%|██▉       | 4882/16798 [21:10<58:27,  3.40it/s]

{'loss': 1.3905, 'grad_norm': 1.8580098152160645, 'learning_rate': 0.00014195854181558258, 'epoch': 0.29}


 29%|██▉       | 4883/16798 [21:11<59:03,  3.36it/s]

{'loss': 1.3599, 'grad_norm': 1.7874482870101929, 'learning_rate': 0.00014194662854419824, 'epoch': 0.29}


 29%|██▉       | 4884/16798 [21:11<1:00:31,  3.28it/s]

{'loss': 1.62, 'grad_norm': 1.981450080871582, 'learning_rate': 0.00014193471527281393, 'epoch': 0.29}


 29%|██▉       | 4885/16798 [21:11<59:54,  3.31it/s]  

{'loss': 1.3607, 'grad_norm': 2.459202527999878, 'learning_rate': 0.0001419228020014296, 'epoch': 0.29}


 29%|██▉       | 4886/16798 [21:12<1:00:30,  3.28it/s]

{'loss': 1.5673, 'grad_norm': 1.9688338041305542, 'learning_rate': 0.0001419108887300453, 'epoch': 0.29}


 29%|██▉       | 4887/16798 [21:12<1:00:15,  3.29it/s]

{'loss': 0.8124, 'grad_norm': 1.4576454162597656, 'learning_rate': 0.00014189897545866095, 'epoch': 0.29}


 29%|██▉       | 4888/16798 [21:12<57:55,  3.43it/s]  

{'loss': 1.564, 'grad_norm': 2.194823741912842, 'learning_rate': 0.00014188706218727664, 'epoch': 0.29}


 29%|██▉       | 4889/16798 [21:12<58:38,  3.38it/s]

{'loss': 1.3047, 'grad_norm': 1.8128362894058228, 'learning_rate': 0.0001418751489158923, 'epoch': 0.29}


 29%|██▉       | 4890/16798 [21:13<57:32,  3.45it/s]

{'loss': 1.2073, 'grad_norm': 1.8002867698669434, 'learning_rate': 0.000141863235644508, 'epoch': 0.29}


 29%|██▉       | 4891/16798 [21:13<58:16,  3.40it/s]

{'loss': 1.1641, 'grad_norm': 1.9357695579528809, 'learning_rate': 0.00014185132237312366, 'epoch': 0.29}


 29%|██▉       | 4892/16798 [21:13<55:39,  3.56it/s]

{'loss': 1.2186, 'grad_norm': 1.8957839012145996, 'learning_rate': 0.00014183940910173935, 'epoch': 0.29}


 29%|██▉       | 4893/16798 [21:14<55:48,  3.56it/s]

{'loss': 0.8997, 'grad_norm': 1.512000322341919, 'learning_rate': 0.00014182749583035501, 'epoch': 0.29}


 29%|██▉       | 4894/16798 [21:14<54:18,  3.65it/s]

{'loss': 1.5595, 'grad_norm': 2.372716188430786, 'learning_rate': 0.0001418155825589707, 'epoch': 0.29}


 29%|██▉       | 4895/16798 [21:14<55:42,  3.56it/s]

{'loss': 0.7781, 'grad_norm': 1.4149346351623535, 'learning_rate': 0.00014180366928758637, 'epoch': 0.29}


 29%|██▉       | 4896/16798 [21:14<53:03,  3.74it/s]

{'loss': 1.3124, 'grad_norm': 2.3079276084899902, 'learning_rate': 0.00014179175601620206, 'epoch': 0.29}


 29%|██▉       | 4897/16798 [21:15<57:18,  3.46it/s]

{'loss': 0.7486, 'grad_norm': 1.4054855108261108, 'learning_rate': 0.00014177984274481772, 'epoch': 0.29}


 29%|██▉       | 4898/16798 [21:15<55:14,  3.59it/s]

{'loss': 0.4651, 'grad_norm': 1.110846996307373, 'learning_rate': 0.00014176792947343341, 'epoch': 0.29}


 29%|██▉       | 4899/16798 [21:15<57:48,  3.43it/s]

{'loss': 0.2471, 'grad_norm': 0.707200825214386, 'learning_rate': 0.00014175601620204908, 'epoch': 0.29}


 29%|██▉       | 4900/16798 [21:16<55:33,  3.57it/s]

{'loss': 0.6089, 'grad_norm': 1.4486668109893799, 'learning_rate': 0.00014174410293066477, 'epoch': 0.29}


 29%|██▉       | 4901/16798 [21:16<58:39,  3.38it/s]

{'loss': 1.8618, 'grad_norm': 1.7503418922424316, 'learning_rate': 0.00014173218965928043, 'epoch': 0.29}


 29%|██▉       | 4902/16798 [21:16<1:03:56,  3.10it/s]

{'loss': 2.1044, 'grad_norm': 1.9675707817077637, 'learning_rate': 0.00014172027638789612, 'epoch': 0.29}


 29%|██▉       | 4903/16798 [21:17<1:05:25,  3.03it/s]

{'loss': 2.2223, 'grad_norm': 1.924714207649231, 'learning_rate': 0.0001417083631165118, 'epoch': 0.29}


 29%|██▉       | 4904/16798 [21:17<1:02:41,  3.16it/s]

{'loss': 2.0334, 'grad_norm': 1.9826396703720093, 'learning_rate': 0.00014169644984512748, 'epoch': 0.29}


 29%|██▉       | 4905/16798 [21:17<1:03:19,  3.13it/s]

{'loss': 1.8267, 'grad_norm': 1.9804420471191406, 'learning_rate': 0.00014168453657374317, 'epoch': 0.29}


 29%|██▉       | 4906/16798 [21:18<1:03:54,  3.10it/s]

{'loss': 2.0746, 'grad_norm': 2.0399441719055176, 'learning_rate': 0.00014167262330235883, 'epoch': 0.29}


 29%|██▉       | 4907/16798 [21:18<1:01:29,  3.22it/s]

{'loss': 1.8399, 'grad_norm': 2.182377576828003, 'learning_rate': 0.00014166071003097452, 'epoch': 0.29}


 29%|██▉       | 4908/16798 [21:18<1:00:42,  3.26it/s]

{'loss': 1.9531, 'grad_norm': 2.5893967151641846, 'learning_rate': 0.00014164879675959019, 'epoch': 0.29}


 29%|██▉       | 4909/16798 [21:18<59:18,  3.34it/s]  

{'loss': 1.4212, 'grad_norm': 2.0587034225463867, 'learning_rate': 0.00014163688348820588, 'epoch': 0.29}


 29%|██▉       | 4910/16798 [21:19<1:05:26,  3.03it/s]

{'loss': 1.6508, 'grad_norm': 1.733270525932312, 'learning_rate': 0.00014162497021682154, 'epoch': 0.29}


 29%|██▉       | 4911/16798 [21:19<1:02:29,  3.17it/s]

{'loss': 1.2861, 'grad_norm': 1.9154462814331055, 'learning_rate': 0.00014161305694543723, 'epoch': 0.29}


 29%|██▉       | 4912/16798 [21:19<1:02:03,  3.19it/s]

{'loss': 1.4731, 'grad_norm': 1.7669459581375122, 'learning_rate': 0.0001416011436740529, 'epoch': 0.29}


 29%|██▉       | 4913/16798 [21:20<1:01:59,  3.19it/s]

{'loss': 1.3691, 'grad_norm': 1.7115799188613892, 'learning_rate': 0.00014158923040266859, 'epoch': 0.29}


 29%|██▉       | 4914/16798 [21:20<59:57,  3.30it/s]  

{'loss': 1.843, 'grad_norm': 2.1288809776306152, 'learning_rate': 0.00014157731713128425, 'epoch': 0.29}


 29%|██▉       | 4915/16798 [21:20<59:05,  3.35it/s]

{'loss': 1.4966, 'grad_norm': 1.6842609643936157, 'learning_rate': 0.00014156540385989994, 'epoch': 0.29}


 29%|██▉       | 4916/16798 [21:21<59:56,  3.30it/s]

{'loss': 1.5942, 'grad_norm': 1.798693060874939, 'learning_rate': 0.0001415534905885156, 'epoch': 0.29}


 29%|██▉       | 4917/16798 [21:21<59:44,  3.31it/s]

{'loss': 1.5792, 'grad_norm': 1.980830430984497, 'learning_rate': 0.0001415415773171313, 'epoch': 0.29}


 29%|██▉       | 4918/16798 [21:21<56:58,  3.48it/s]

{'loss': 1.724, 'grad_norm': 2.172197103500366, 'learning_rate': 0.00014152966404574696, 'epoch': 0.29}


 29%|██▉       | 4919/16798 [21:21<58:28,  3.39it/s]

{'loss': 1.5538, 'grad_norm': 1.7179219722747803, 'learning_rate': 0.00014151775077436265, 'epoch': 0.29}


 29%|██▉       | 4920/16798 [21:22<58:40,  3.37it/s]

{'loss': 1.3872, 'grad_norm': 1.6575307846069336, 'learning_rate': 0.0001415058375029783, 'epoch': 0.29}


 29%|██▉       | 4921/16798 [21:22<56:50,  3.48it/s]

{'loss': 1.387, 'grad_norm': 1.9057947397232056, 'learning_rate': 0.000141493924231594, 'epoch': 0.29}


 29%|██▉       | 4922/16798 [21:22<58:33,  3.38it/s]

{'loss': 1.3887, 'grad_norm': 1.6569572687149048, 'learning_rate': 0.00014148201096020967, 'epoch': 0.29}


 29%|██▉       | 4923/16798 [21:23<1:04:23,  3.07it/s]

{'loss': 1.192, 'grad_norm': 1.5152860879898071, 'learning_rate': 0.00014147009768882536, 'epoch': 0.29}


 29%|██▉       | 4924/16798 [21:23<1:04:12,  3.08it/s]

{'loss': 1.605, 'grad_norm': 1.9084941148757935, 'learning_rate': 0.00014145818441744102, 'epoch': 0.29}


 29%|██▉       | 4925/16798 [21:23<58:39,  3.37it/s]  

{'loss': 1.4325, 'grad_norm': 1.9086105823516846, 'learning_rate': 0.0001414462711460567, 'epoch': 0.29}


 29%|██▉       | 4926/16798 [21:24<1:01:42,  3.21it/s]

{'loss': 1.5922, 'grad_norm': 1.8664302825927734, 'learning_rate': 0.00014143435787467238, 'epoch': 0.29}


 29%|██▉       | 4927/16798 [21:24<1:00:47,  3.25it/s]

{'loss': 1.9632, 'grad_norm': 2.4976840019226074, 'learning_rate': 0.00014142244460328807, 'epoch': 0.29}


 29%|██▉       | 4928/16798 [21:24<1:01:18,  3.23it/s]

{'loss': 1.9733, 'grad_norm': 2.0606515407562256, 'learning_rate': 0.00014141053133190373, 'epoch': 0.29}


 29%|██▉       | 4929/16798 [21:24<56:46,  3.48it/s]  

{'loss': 1.787, 'grad_norm': 1.9800201654434204, 'learning_rate': 0.00014139861806051942, 'epoch': 0.29}


 29%|██▉       | 4930/16798 [21:25<58:47,  3.36it/s]

{'loss': 1.161, 'grad_norm': 1.6272327899932861, 'learning_rate': 0.00014138670478913508, 'epoch': 0.29}


 29%|██▉       | 4931/16798 [21:25<57:14,  3.46it/s]

{'loss': 1.3894, 'grad_norm': 1.8825182914733887, 'learning_rate': 0.00014137479151775078, 'epoch': 0.29}


 29%|██▉       | 4932/16798 [21:25<59:27,  3.33it/s]

{'loss': 1.6529, 'grad_norm': 2.2186737060546875, 'learning_rate': 0.00014136287824636644, 'epoch': 0.29}


 29%|██▉       | 4933/16798 [21:26<1:00:43,  3.26it/s]

{'loss': 1.246, 'grad_norm': 1.4492430686950684, 'learning_rate': 0.00014135096497498216, 'epoch': 0.29}


 29%|██▉       | 4934/16798 [21:26<1:00:04,  3.29it/s]

{'loss': 1.338, 'grad_norm': 1.7247862815856934, 'learning_rate': 0.00014133905170359782, 'epoch': 0.29}


 29%|██▉       | 4935/16798 [21:26<56:24,  3.51it/s]  

{'loss': 0.9869, 'grad_norm': 1.3929466009140015, 'learning_rate': 0.0001413271384322135, 'epoch': 0.29}


 29%|██▉       | 4936/16798 [21:27<58:07,  3.40it/s]

{'loss': 0.9975, 'grad_norm': 1.597558617591858, 'learning_rate': 0.00014131522516082917, 'epoch': 0.29}


 29%|██▉       | 4937/16798 [21:27<53:47,  3.67it/s]

{'loss': 1.3151, 'grad_norm': 1.6390820741653442, 'learning_rate': 0.00014130331188944487, 'epoch': 0.29}


 29%|██▉       | 4938/16798 [21:27<52:25,  3.77it/s]

{'loss': 1.289, 'grad_norm': 1.695906400680542, 'learning_rate': 0.00014129139861806053, 'epoch': 0.29}


 29%|██▉       | 4939/16798 [21:27<54:35,  3.62it/s]

{'loss': 0.9709, 'grad_norm': 1.3460665941238403, 'learning_rate': 0.00014127948534667622, 'epoch': 0.29}


 29%|██▉       | 4940/16798 [21:28<51:41,  3.82it/s]

{'loss': 1.1722, 'grad_norm': 2.2113964557647705, 'learning_rate': 0.00014126757207529188, 'epoch': 0.29}


 29%|██▉       | 4941/16798 [21:28<55:33,  3.56it/s]

{'loss': 1.2578, 'grad_norm': 1.661409854888916, 'learning_rate': 0.00014125565880390757, 'epoch': 0.29}


 29%|██▉       | 4942/16798 [21:28<54:06,  3.65it/s]

{'loss': 0.9913, 'grad_norm': 1.6799921989440918, 'learning_rate': 0.00014124374553252324, 'epoch': 0.29}


 29%|██▉       | 4944/16798 [21:29<54:12,  3.64it/s]

{'loss': 1.2445, 'grad_norm': 1.8097840547561646, 'learning_rate': 0.00014123183226113893, 'epoch': 0.29}


 29%|██▉       | 4944/16798 [21:29<54:12,  3.64it/s]

{'loss': 1.185, 'grad_norm': 1.8047699928283691, 'learning_rate': 0.0001412199189897546, 'epoch': 0.29}


 29%|██▉       | 4945/16798 [21:29<53:44,  3.68it/s]

{'loss': 1.0114, 'grad_norm': 1.663726806640625, 'learning_rate': 0.00014120800571837028, 'epoch': 0.29}


 29%|██▉       | 4946/16798 [21:29<57:31,  3.43it/s]

{'loss': 0.9198, 'grad_norm': 1.4888370037078857, 'learning_rate': 0.00014119609244698595, 'epoch': 0.29}


 29%|██▉       | 4947/16798 [21:30<54:49,  3.60it/s]

{'loss': 0.7863, 'grad_norm': 1.578790307044983, 'learning_rate': 0.00014118417917560164, 'epoch': 0.29}


 29%|██▉       | 4948/16798 [21:30<55:00,  3.59it/s]

{'loss': 0.3302, 'grad_norm': 0.765119731426239, 'learning_rate': 0.0001411722659042173, 'epoch': 0.29}


 29%|██▉       | 4949/16798 [21:30<54:11,  3.64it/s]

{'loss': 0.6835, 'grad_norm': 1.3419380187988281, 'learning_rate': 0.000141160352632833, 'epoch': 0.29}


 29%|██▉       | 4950/16798 [21:30<50:56,  3.88it/s]

{'loss': 0.6741, 'grad_norm': 1.3113406896591187, 'learning_rate': 0.00014114843936144866, 'epoch': 0.29}


 29%|██▉       | 4951/16798 [21:31<54:06,  3.65it/s]

{'loss': 1.9141, 'grad_norm': 1.7831575870513916, 'learning_rate': 0.00014113652609006435, 'epoch': 0.29}


 29%|██▉       | 4952/16798 [21:31<57:11,  3.45it/s]

{'loss': 2.0478, 'grad_norm': 2.131995677947998, 'learning_rate': 0.00014112461281868, 'epoch': 0.29}


 29%|██▉       | 4953/16798 [21:31<59:19,  3.33it/s]

{'loss': 1.794, 'grad_norm': 1.702497124671936, 'learning_rate': 0.0001411126995472957, 'epoch': 0.29}


 29%|██▉       | 4954/16798 [21:32<55:15,  3.57it/s]

{'loss': 2.3529, 'grad_norm': 1.9206870794296265, 'learning_rate': 0.00014110078627591136, 'epoch': 0.29}


 29%|██▉       | 4955/16798 [21:32<54:26,  3.63it/s]

{'loss': 1.9483, 'grad_norm': 1.881457805633545, 'learning_rate': 0.00014108887300452706, 'epoch': 0.29}


 30%|██▉       | 4956/16798 [21:32<52:14,  3.78it/s]

{'loss': 2.1082, 'grad_norm': 1.9523239135742188, 'learning_rate': 0.00014107695973314272, 'epoch': 0.3}


 30%|██▉       | 4957/16798 [21:32<54:47,  3.60it/s]

{'loss': 1.8432, 'grad_norm': 2.2446658611297607, 'learning_rate': 0.0001410650464617584, 'epoch': 0.3}


 30%|██▉       | 4958/16798 [21:33<54:53,  3.59it/s]

{'loss': 1.9352, 'grad_norm': 2.0984511375427246, 'learning_rate': 0.00014105313319037407, 'epoch': 0.3}


 30%|██▉       | 4959/16798 [21:33<53:40,  3.68it/s]

{'loss': 1.556, 'grad_norm': 2.051909923553467, 'learning_rate': 0.00014104121991898976, 'epoch': 0.3}


 30%|██▉       | 4960/16798 [21:33<55:36,  3.55it/s]

{'loss': 1.67, 'grad_norm': 2.1037964820861816, 'learning_rate': 0.00014102930664760543, 'epoch': 0.3}


 30%|██▉       | 4961/16798 [21:34<59:37,  3.31it/s]

{'loss': 1.8908, 'grad_norm': 1.9928689002990723, 'learning_rate': 0.00014101739337622112, 'epoch': 0.3}


 30%|██▉       | 4962/16798 [21:34<1:00:13,  3.28it/s]

{'loss': 1.8374, 'grad_norm': 1.92586088180542, 'learning_rate': 0.00014100548010483678, 'epoch': 0.3}


 30%|██▉       | 4963/16798 [21:34<56:48,  3.47it/s]  

{'loss': 1.3192, 'grad_norm': 1.6819660663604736, 'learning_rate': 0.00014099356683345247, 'epoch': 0.3}


 30%|██▉       | 4964/16798 [21:34<53:22,  3.70it/s]

{'loss': 1.8035, 'grad_norm': 1.9215906858444214, 'learning_rate': 0.00014098165356206816, 'epoch': 0.3}


 30%|██▉       | 4965/16798 [21:35<1:01:37,  3.20it/s]

{'loss': 1.2437, 'grad_norm': 1.8662190437316895, 'learning_rate': 0.00014096974029068383, 'epoch': 0.3}


 30%|██▉       | 4966/16798 [21:35<59:11,  3.33it/s]  

{'loss': 1.72, 'grad_norm': 2.045245409011841, 'learning_rate': 0.00014095782701929952, 'epoch': 0.3}


 30%|██▉       | 4967/16798 [21:35<56:54,  3.47it/s]

{'loss': 1.6967, 'grad_norm': 1.7733371257781982, 'learning_rate': 0.00014094591374791518, 'epoch': 0.3}


 30%|██▉       | 4968/16798 [21:36<1:01:18,  3.22it/s]

{'loss': 1.7845, 'grad_norm': 2.018630266189575, 'learning_rate': 0.00014093400047653087, 'epoch': 0.3}


 30%|██▉       | 4969/16798 [21:36<57:34,  3.42it/s]  

{'loss': 1.6211, 'grad_norm': 1.769949197769165, 'learning_rate': 0.00014092208720514654, 'epoch': 0.3}


 30%|██▉       | 4970/16798 [21:36<58:31,  3.37it/s]

{'loss': 1.27, 'grad_norm': 1.6610960960388184, 'learning_rate': 0.00014091017393376223, 'epoch': 0.3}


 30%|██▉       | 4971/16798 [21:36<56:11,  3.51it/s]

{'loss': 2.0733, 'grad_norm': 2.160029649734497, 'learning_rate': 0.0001408982606623779, 'epoch': 0.3}


 30%|██▉       | 4972/16798 [21:37<58:15,  3.38it/s]

{'loss': 1.5255, 'grad_norm': 1.8482266664505005, 'learning_rate': 0.00014088634739099358, 'epoch': 0.3}


 30%|██▉       | 4973/16798 [21:37<55:14,  3.57it/s]

{'loss': 2.0028, 'grad_norm': 2.3605353832244873, 'learning_rate': 0.00014087443411960925, 'epoch': 0.3}


 30%|██▉       | 4974/16798 [21:37<57:14,  3.44it/s]

{'loss': 1.318, 'grad_norm': 1.7673143148422241, 'learning_rate': 0.00014086252084822494, 'epoch': 0.3}


 30%|██▉       | 4975/16798 [21:38<58:07,  3.39it/s]

{'loss': 1.7481, 'grad_norm': 2.4352521896362305, 'learning_rate': 0.0001408506075768406, 'epoch': 0.3}


 30%|██▉       | 4976/16798 [21:38<58:24,  3.37it/s]

{'loss': 1.3994, 'grad_norm': 1.8253377676010132, 'learning_rate': 0.0001408386943054563, 'epoch': 0.3}


 30%|██▉       | 4977/16798 [21:38<55:09,  3.57it/s]

{'loss': 1.5064, 'grad_norm': 1.8211307525634766, 'learning_rate': 0.00014082678103407195, 'epoch': 0.3}


 30%|██▉       | 4978/16798 [21:38<57:15,  3.44it/s]

{'loss': 1.555, 'grad_norm': 2.271737813949585, 'learning_rate': 0.00014081486776268764, 'epoch': 0.3}


 30%|██▉       | 4979/16798 [21:39<54:19,  3.63it/s]

{'loss': 1.7237, 'grad_norm': 2.164031982421875, 'learning_rate': 0.0001408029544913033, 'epoch': 0.3}


 30%|██▉       | 4980/16798 [21:39<55:51,  3.53it/s]

{'loss': 1.3, 'grad_norm': 1.716713786125183, 'learning_rate': 0.000140791041219919, 'epoch': 0.3}


 30%|██▉       | 4981/16798 [21:39<55:57,  3.52it/s]

{'loss': 1.5717, 'grad_norm': 1.800835371017456, 'learning_rate': 0.00014077912794853466, 'epoch': 0.3}


 30%|██▉       | 4982/16798 [21:40<57:52,  3.40it/s]

{'loss': 2.034, 'grad_norm': 2.426706552505493, 'learning_rate': 0.00014076721467715035, 'epoch': 0.3}


 30%|██▉       | 4983/16798 [21:40<59:18,  3.32it/s]

{'loss': 1.3183, 'grad_norm': 2.0979130268096924, 'learning_rate': 0.00014075530140576602, 'epoch': 0.3}


 30%|██▉       | 4984/16798 [21:40<1:02:03,  3.17it/s]

{'loss': 1.5285, 'grad_norm': 1.8538182973861694, 'learning_rate': 0.0001407433881343817, 'epoch': 0.3}


 30%|██▉       | 4985/16798 [21:41<58:01,  3.39it/s]  

{'loss': 1.1825, 'grad_norm': 1.7980469465255737, 'learning_rate': 0.00014073147486299737, 'epoch': 0.3}


 30%|██▉       | 4986/16798 [21:41<1:00:35,  3.25it/s]

{'loss': 1.107, 'grad_norm': 2.16436767578125, 'learning_rate': 0.00014071956159161306, 'epoch': 0.3}


 30%|██▉       | 4987/16798 [21:41<57:10,  3.44it/s]  

{'loss': 1.4501, 'grad_norm': 1.8441963195800781, 'learning_rate': 0.00014070764832022873, 'epoch': 0.3}


 30%|██▉       | 4988/16798 [21:41<56:59,  3.45it/s]

{'loss': 1.312, 'grad_norm': 2.1080687046051025, 'learning_rate': 0.00014069573504884442, 'epoch': 0.3}


 30%|██▉       | 4989/16798 [21:42<56:41,  3.47it/s]

{'loss': 1.1628, 'grad_norm': 1.9010387659072876, 'learning_rate': 0.00014068382177746008, 'epoch': 0.3}


 30%|██▉       | 4990/16798 [21:42<57:33,  3.42it/s]

{'loss': 1.4613, 'grad_norm': 2.193742513656616, 'learning_rate': 0.00014067190850607577, 'epoch': 0.3}


 30%|██▉       | 4991/16798 [21:42<56:14,  3.50it/s]

{'loss': 1.1945, 'grad_norm': 1.7457865476608276, 'learning_rate': 0.00014065999523469144, 'epoch': 0.3}


 30%|██▉       | 4992/16798 [21:43<58:43,  3.35it/s]

{'loss': 1.4403, 'grad_norm': 2.3180441856384277, 'learning_rate': 0.00014064808196330713, 'epoch': 0.3}


 30%|██▉       | 4993/16798 [21:43<1:00:10,  3.27it/s]

{'loss': 1.1091, 'grad_norm': 2.211613178253174, 'learning_rate': 0.0001406361686919228, 'epoch': 0.3}


 30%|██▉       | 4994/16798 [21:43<1:00:11,  3.27it/s]

{'loss': 1.1818, 'grad_norm': 1.9219142198562622, 'learning_rate': 0.0001406242554205385, 'epoch': 0.3}


 30%|██▉       | 4995/16798 [21:44<59:01,  3.33it/s]  

{'loss': 0.822, 'grad_norm': 1.413710355758667, 'learning_rate': 0.00014061234214915417, 'epoch': 0.3}


 30%|██▉       | 4996/16798 [21:44<59:42,  3.29it/s]

{'loss': 0.8207, 'grad_norm': 1.3516077995300293, 'learning_rate': 0.00014060042887776986, 'epoch': 0.3}


 30%|██▉       | 4997/16798 [21:44<57:51,  3.40it/s]

{'loss': 0.9434, 'grad_norm': 1.8657904863357544, 'learning_rate': 0.00014058851560638553, 'epoch': 0.3}


 30%|██▉       | 4998/16798 [21:44<59:20,  3.31it/s]

{'loss': 0.3102, 'grad_norm': 0.8335238099098206, 'learning_rate': 0.00014057660233500122, 'epoch': 0.3}


 30%|██▉       | 4999/16798 [21:45<55:30,  3.54it/s]

{'loss': 0.3335, 'grad_norm': 0.810402512550354, 'learning_rate': 0.00014056468906361688, 'epoch': 0.3}




{'loss': 0.7907, 'grad_norm': 1.4375600814819336, 'learning_rate': 0.00014055277579223257, 'epoch': 0.3}


 30%|██▉       | 5001/16798 [21:48<3:27:08,  1.05s/it]

{'loss': 1.3409, 'grad_norm': 1.3952387571334839, 'learning_rate': 0.00014054086252084823, 'epoch': 0.3}


 30%|██▉       | 5002/16798 [21:48<2:41:51,  1.21it/s]

{'loss': 2.4171, 'grad_norm': 2.276630163192749, 'learning_rate': 0.00014052894924946393, 'epoch': 0.3}


 30%|██▉       | 5003/16798 [21:48<2:12:48,  1.48it/s]

{'loss': 2.3519, 'grad_norm': 2.1830246448516846, 'learning_rate': 0.0001405170359780796, 'epoch': 0.3}


 30%|██▉       | 5004/16798 [21:49<1:50:48,  1.77it/s]

{'loss': 1.6086, 'grad_norm': 1.6566835641860962, 'learning_rate': 0.00014050512270669528, 'epoch': 0.3}


 30%|██▉       | 5005/16798 [21:49<1:35:21,  2.06it/s]

{'loss': 2.0435, 'grad_norm': 1.9314640760421753, 'learning_rate': 0.00014049320943531094, 'epoch': 0.3}


 30%|██▉       | 5006/16798 [21:49<1:27:08,  2.26it/s]

{'loss': 1.773, 'grad_norm': 2.0316150188446045, 'learning_rate': 0.00014048129616392663, 'epoch': 0.3}


 30%|██▉       | 5007/16798 [21:50<1:19:08,  2.48it/s]

{'loss': 1.8568, 'grad_norm': 1.8030850887298584, 'learning_rate': 0.0001404693828925423, 'epoch': 0.3}


 30%|██▉       | 5008/16798 [21:50<1:12:48,  2.70it/s]

{'loss': 1.9454, 'grad_norm': 1.9527881145477295, 'learning_rate': 0.000140457469621158, 'epoch': 0.3}


 30%|██▉       | 5009/16798 [21:50<1:11:20,  2.75it/s]

{'loss': 1.6779, 'grad_norm': 1.7747492790222168, 'learning_rate': 0.00014044555634977365, 'epoch': 0.3}


 30%|██▉       | 5010/16798 [21:51<1:08:24,  2.87it/s]

{'loss': 2.0314, 'grad_norm': 1.9678477048873901, 'learning_rate': 0.00014043364307838934, 'epoch': 0.3}


 30%|██▉       | 5011/16798 [21:51<1:02:36,  3.14it/s]

{'loss': 2.047, 'grad_norm': 2.06388521194458, 'learning_rate': 0.000140421729807005, 'epoch': 0.3}


 30%|██▉       | 5012/16798 [21:51<1:00:44,  3.23it/s]

{'loss': 1.5825, 'grad_norm': 1.7704168558120728, 'learning_rate': 0.0001404098165356207, 'epoch': 0.3}


 30%|██▉       | 5013/16798 [21:51<1:00:07,  3.27it/s]

{'loss': 1.8079, 'grad_norm': 2.3065438270568848, 'learning_rate': 0.00014039790326423636, 'epoch': 0.3}


 30%|██▉       | 5014/16798 [21:52<57:29,  3.42it/s]  

{'loss': 1.951, 'grad_norm': 2.162285089492798, 'learning_rate': 0.00014038598999285205, 'epoch': 0.3}


 30%|██▉       | 5015/16798 [21:52<1:00:58,  3.22it/s]

{'loss': 1.8029, 'grad_norm': 1.84256112575531, 'learning_rate': 0.00014037407672146772, 'epoch': 0.3}


 30%|██▉       | 5016/16798 [21:52<56:09,  3.50it/s]  

{'loss': 1.878, 'grad_norm': 2.3842649459838867, 'learning_rate': 0.0001403621634500834, 'epoch': 0.3}


 30%|██▉       | 5017/16798 [21:53<58:51,  3.34it/s]

{'loss': 1.6064, 'grad_norm': 2.007276773452759, 'learning_rate': 0.00014035025017869907, 'epoch': 0.3}


 30%|██▉       | 5018/16798 [21:53<55:05,  3.56it/s]

{'loss': 1.3162, 'grad_norm': 1.789979338645935, 'learning_rate': 0.00014033833690731476, 'epoch': 0.3}


 30%|██▉       | 5019/16798 [21:53<57:11,  3.43it/s]

{'loss': 1.5002, 'grad_norm': 1.6789097785949707, 'learning_rate': 0.00014032642363593042, 'epoch': 0.3}


 30%|██▉       | 5020/16798 [21:53<53:06,  3.70it/s]

{'loss': 1.861, 'grad_norm': 2.9570579528808594, 'learning_rate': 0.00014031451036454611, 'epoch': 0.3}


 30%|██▉       | 5021/16798 [21:54<55:31,  3.54it/s]

{'loss': 1.7196, 'grad_norm': 2.185879707336426, 'learning_rate': 0.00014030259709316178, 'epoch': 0.3}


 30%|██▉       | 5022/16798 [21:54<52:49,  3.71it/s]

{'loss': 1.7199, 'grad_norm': 2.0452613830566406, 'learning_rate': 0.00014029068382177747, 'epoch': 0.3}


 30%|██▉       | 5023/16798 [21:54<53:30,  3.67it/s]

{'loss': 1.703, 'grad_norm': 2.0002970695495605, 'learning_rate': 0.00014027877055039313, 'epoch': 0.3}


 30%|██▉       | 5024/16798 [21:54<51:19,  3.82it/s]

{'loss': 1.7464, 'grad_norm': 1.8035638332366943, 'learning_rate': 0.00014026685727900882, 'epoch': 0.3}


 30%|██▉       | 5025/16798 [21:55<52:11,  3.76it/s]

{'loss': 1.6805, 'grad_norm': 2.171865224838257, 'learning_rate': 0.00014025494400762451, 'epoch': 0.3}


 30%|██▉       | 5026/16798 [21:55<53:26,  3.67it/s]

{'loss': 1.335, 'grad_norm': 1.9361982345581055, 'learning_rate': 0.00014024303073624018, 'epoch': 0.3}


 30%|██▉       | 5027/16798 [21:55<55:59,  3.50it/s]

{'loss': 1.2082, 'grad_norm': 1.5985716581344604, 'learning_rate': 0.00014023111746485587, 'epoch': 0.3}


 30%|██▉       | 5028/16798 [21:56<56:24,  3.48it/s]

{'loss': 1.3368, 'grad_norm': 1.6513837575912476, 'learning_rate': 0.00014021920419347153, 'epoch': 0.3}


 30%|██▉       | 5029/16798 [21:56<57:51,  3.39it/s]

{'loss': 1.4271, 'grad_norm': 1.9345483779907227, 'learning_rate': 0.00014020729092208722, 'epoch': 0.3}


 30%|██▉       | 5030/16798 [21:56<56:55,  3.45it/s]

{'loss': 1.7587, 'grad_norm': 2.184068202972412, 'learning_rate': 0.0001401953776507029, 'epoch': 0.3}


 30%|██▉       | 5031/16798 [21:57<57:45,  3.40it/s]

{'loss': 1.2707, 'grad_norm': 1.8110804557800293, 'learning_rate': 0.00014018346437931858, 'epoch': 0.3}


 30%|██▉       | 5032/16798 [21:57<56:18,  3.48it/s]

{'loss': 1.6541, 'grad_norm': 2.5780506134033203, 'learning_rate': 0.00014017155110793424, 'epoch': 0.3}


 30%|██▉       | 5033/16798 [21:57<57:27,  3.41it/s]

{'loss': 1.4334, 'grad_norm': 2.4850950241088867, 'learning_rate': 0.00014015963783654993, 'epoch': 0.3}


 30%|██▉       | 5034/16798 [21:57<55:51,  3.51it/s]

{'loss': 1.6642, 'grad_norm': 2.1889069080352783, 'learning_rate': 0.0001401477245651656, 'epoch': 0.3}


 30%|██▉       | 5035/16798 [21:58<59:05,  3.32it/s]

{'loss': 1.6579, 'grad_norm': 2.731055498123169, 'learning_rate': 0.0001401358112937813, 'epoch': 0.3}


 30%|██▉       | 5036/16798 [21:58<56:26,  3.47it/s]

{'loss': 1.1666, 'grad_norm': 1.5599703788757324, 'learning_rate': 0.00014012389802239695, 'epoch': 0.3}


 30%|██▉       | 5038/16798 [21:59<52:57,  3.70it/s]

{'loss': 1.2671, 'grad_norm': 1.8276859521865845, 'learning_rate': 0.00014011198475101264, 'epoch': 0.3}


 30%|██▉       | 5038/16798 [21:59<52:57,  3.70it/s]

{'loss': 1.3101, 'grad_norm': 1.8919857740402222, 'learning_rate': 0.0001401000714796283, 'epoch': 0.3}


 30%|██▉       | 5039/16798 [21:59<52:38,  3.72it/s]

{'loss': 1.4507, 'grad_norm': 1.7491134405136108, 'learning_rate': 0.000140088158208244, 'epoch': 0.3}


 30%|███       | 5040/16798 [21:59<52:06,  3.76it/s]

{'loss': 1.3301, 'grad_norm': 1.757631540298462, 'learning_rate': 0.00014007624493685966, 'epoch': 0.3}


 30%|███       | 5041/16798 [21:59<54:29,  3.60it/s]

{'loss': 1.3832, 'grad_norm': 2.17045259475708, 'learning_rate': 0.00014006433166547535, 'epoch': 0.3}


 30%|███       | 5042/16798 [22:00<55:19,  3.54it/s]

{'loss': 1.0541, 'grad_norm': 1.469765543937683, 'learning_rate': 0.000140052418394091, 'epoch': 0.3}


 30%|███       | 5043/16798 [22:00<57:29,  3.41it/s]

{'loss': 1.26, 'grad_norm': 1.6643903255462646, 'learning_rate': 0.0001400405051227067, 'epoch': 0.3}


 30%|███       | 5044/16798 [22:00<53:47,  3.64it/s]

{'loss': 0.5843, 'grad_norm': 1.3452168703079224, 'learning_rate': 0.00014002859185132237, 'epoch': 0.3}


 30%|███       | 5045/16798 [22:01<57:32,  3.40it/s]

{'loss': 1.0069, 'grad_norm': 1.820340633392334, 'learning_rate': 0.00014001667857993806, 'epoch': 0.3}


 30%|███       | 5046/16798 [22:01<57:29,  3.41it/s]

{'loss': 0.5792, 'grad_norm': 1.233109474182129, 'learning_rate': 0.00014000476530855372, 'epoch': 0.3}


 30%|███       | 5047/16798 [22:01<55:58,  3.50it/s]

{'loss': 0.7007, 'grad_norm': 1.3635907173156738, 'learning_rate': 0.0001399928520371694, 'epoch': 0.3}


 30%|███       | 5048/16798 [22:01<54:04,  3.62it/s]

{'loss': 0.5266, 'grad_norm': 0.9823373556137085, 'learning_rate': 0.00013998093876578508, 'epoch': 0.3}


 30%|███       | 5049/16798 [22:02<55:54,  3.50it/s]

{'loss': 0.2598, 'grad_norm': 0.7419635653495789, 'learning_rate': 0.00013996902549440077, 'epoch': 0.3}


 30%|███       | 5050/16798 [22:02<56:19,  3.48it/s]

{'loss': 0.4008, 'grad_norm': 1.1095033884048462, 'learning_rate': 0.00013995711222301643, 'epoch': 0.3}


 30%|███       | 5051/16798 [22:02<57:39,  3.40it/s]

{'loss': 2.062, 'grad_norm': 2.175625801086426, 'learning_rate': 0.00013994519895163212, 'epoch': 0.3}


 30%|███       | 5052/16798 [22:03<54:53,  3.57it/s]

{'loss': 2.114, 'grad_norm': 1.8665242195129395, 'learning_rate': 0.00013993328568024779, 'epoch': 0.3}


 30%|███       | 5053/16798 [22:03<53:35,  3.65it/s]

{'loss': 2.2674, 'grad_norm': 1.949822187423706, 'learning_rate': 0.00013992137240886348, 'epoch': 0.3}


 30%|███       | 5054/16798 [22:03<54:32,  3.59it/s]

{'loss': 2.2942, 'grad_norm': 1.9301376342773438, 'learning_rate': 0.00013990945913747914, 'epoch': 0.3}


 30%|███       | 5055/16798 [22:03<56:08,  3.49it/s]

{'loss': 1.8959, 'grad_norm': 1.9260231256484985, 'learning_rate': 0.00013989754586609483, 'epoch': 0.3}


 30%|███       | 5056/16798 [22:04<52:13,  3.75it/s]

{'loss': 1.7174, 'grad_norm': 1.7266803979873657, 'learning_rate': 0.00013988563259471052, 'epoch': 0.3}


 30%|███       | 5057/16798 [22:04<52:34,  3.72it/s]

{'loss': 1.96, 'grad_norm': 2.0049400329589844, 'learning_rate': 0.0001398737193233262, 'epoch': 0.3}


 30%|███       | 5058/16798 [22:04<53:44,  3.64it/s]

{'loss': 2.1791, 'grad_norm': 2.243760824203491, 'learning_rate': 0.00013986180605194188, 'epoch': 0.3}


 30%|███       | 5059/16798 [22:04<53:47,  3.64it/s]

{'loss': 1.6284, 'grad_norm': 1.8781464099884033, 'learning_rate': 0.00013984989278055757, 'epoch': 0.3}


 30%|███       | 5060/16798 [22:05<53:06,  3.68it/s]

{'loss': 1.7344, 'grad_norm': 1.9557863473892212, 'learning_rate': 0.00013983797950917323, 'epoch': 0.3}


 30%|███       | 5061/16798 [22:05<55:28,  3.53it/s]

{'loss': 1.6279, 'grad_norm': 2.496208906173706, 'learning_rate': 0.00013982606623778892, 'epoch': 0.3}


 30%|███       | 5062/16798 [22:05<56:03,  3.49it/s]

{'loss': 1.4919, 'grad_norm': 1.8914647102355957, 'learning_rate': 0.00013981415296640458, 'epoch': 0.3}


 30%|███       | 5063/16798 [22:06<56:28,  3.46it/s]

{'loss': 1.7912, 'grad_norm': 2.024091958999634, 'learning_rate': 0.00013980223969502028, 'epoch': 0.3}


 30%|███       | 5064/16798 [22:06<52:47,  3.71it/s]

{'loss': 1.7755, 'grad_norm': 2.1646888256073, 'learning_rate': 0.00013979032642363594, 'epoch': 0.3}


 30%|███       | 5065/16798 [22:06<55:08,  3.55it/s]

{'loss': 1.6411, 'grad_norm': 2.219059705734253, 'learning_rate': 0.00013977841315225163, 'epoch': 0.3}


 30%|███       | 5066/16798 [22:06<57:04,  3.43it/s]

{'loss': 2.2215, 'grad_norm': 2.156658172607422, 'learning_rate': 0.0001397664998808673, 'epoch': 0.3}


 30%|███       | 5067/16798 [22:07<55:01,  3.55it/s]

{'loss': 1.66, 'grad_norm': 2.209484100341797, 'learning_rate': 0.00013975458660948298, 'epoch': 0.3}


 30%|███       | 5068/16798 [22:07<53:43,  3.64it/s]

{'loss': 1.458, 'grad_norm': 1.631445050239563, 'learning_rate': 0.00013974267333809865, 'epoch': 0.3}


 30%|███       | 5069/16798 [22:07<54:51,  3.56it/s]

{'loss': 2.1088, 'grad_norm': 2.340960741043091, 'learning_rate': 0.00013973076006671434, 'epoch': 0.3}


 30%|███       | 5070/16798 [22:08<55:13,  3.54it/s]

{'loss': 1.4961, 'grad_norm': 1.7855262756347656, 'learning_rate': 0.00013971884679533, 'epoch': 0.3}


 30%|███       | 5071/16798 [22:08<53:33,  3.65it/s]

{'loss': 1.3973, 'grad_norm': 1.6907103061676025, 'learning_rate': 0.0001397069335239457, 'epoch': 0.3}


 30%|███       | 5072/16798 [22:08<55:31,  3.52it/s]

{'loss': 1.8524, 'grad_norm': 2.3321845531463623, 'learning_rate': 0.00013969502025256136, 'epoch': 0.3}


 30%|███       | 5073/16798 [22:08<51:53,  3.77it/s]

{'loss': 1.5741, 'grad_norm': 2.1488709449768066, 'learning_rate': 0.00013968310698117705, 'epoch': 0.3}


 30%|███       | 5074/16798 [22:09<56:08,  3.48it/s]

{'loss': 1.284, 'grad_norm': 1.777185320854187, 'learning_rate': 0.0001396711937097927, 'epoch': 0.3}


 30%|███       | 5075/16798 [22:09<54:16,  3.60it/s]

{'loss': 1.1239, 'grad_norm': 1.6005582809448242, 'learning_rate': 0.0001396592804384084, 'epoch': 0.3}


 30%|███       | 5076/16798 [22:09<53:15,  3.67it/s]

{'loss': 1.6271, 'grad_norm': 2.0437562465667725, 'learning_rate': 0.00013964736716702407, 'epoch': 0.3}


 30%|███       | 5077/16798 [22:09<50:51,  3.84it/s]

{'loss': 1.313, 'grad_norm': 1.7665514945983887, 'learning_rate': 0.00013963545389563976, 'epoch': 0.3}


 30%|███       | 5078/16798 [22:10<54:53,  3.56it/s]

{'loss': 1.9348, 'grad_norm': 2.4664394855499268, 'learning_rate': 0.00013962354062425542, 'epoch': 0.3}


 30%|███       | 5079/16798 [22:10<54:43,  3.57it/s]

{'loss': 1.8972, 'grad_norm': 2.2953245639801025, 'learning_rate': 0.0001396116273528711, 'epoch': 0.3}


 30%|███       | 5080/16798 [22:10<56:48,  3.44it/s]

{'loss': 1.4639, 'grad_norm': 1.8381274938583374, 'learning_rate': 0.00013959971408148677, 'epoch': 0.3}


 30%|███       | 5081/16798 [22:11<54:29,  3.58it/s]

{'loss': 1.3086, 'grad_norm': 1.7334760427474976, 'learning_rate': 0.00013958780081010247, 'epoch': 0.3}


 30%|███       | 5082/16798 [22:11<56:06,  3.48it/s]

{'loss': 1.3119, 'grad_norm': 1.6071345806121826, 'learning_rate': 0.00013957588753871813, 'epoch': 0.3}


 30%|███       | 5083/16798 [22:11<55:55,  3.49it/s]

{'loss': 1.6265, 'grad_norm': 2.3271570205688477, 'learning_rate': 0.00013956397426733382, 'epoch': 0.3}


 30%|███       | 5084/16798 [22:11<57:00,  3.42it/s]

{'loss': 1.2997, 'grad_norm': 1.9130228757858276, 'learning_rate': 0.00013955206099594948, 'epoch': 0.3}


 30%|███       | 5085/16798 [22:12<55:54,  3.49it/s]

{'loss': 1.1713, 'grad_norm': 1.821542501449585, 'learning_rate': 0.00013954014772456517, 'epoch': 0.3}


 30%|███       | 5087/16798 [22:12<52:38,  3.71it/s]

{'loss': 2.0092, 'grad_norm': 2.6672348976135254, 'learning_rate': 0.00013952823445318084, 'epoch': 0.3}


 30%|███       | 5087/16798 [22:12<52:38,  3.71it/s]

{'loss': 1.5261, 'grad_norm': 1.8632521629333496, 'learning_rate': 0.00013951632118179653, 'epoch': 0.3}


 30%|███       | 5088/16798 [22:13<53:01,  3.68it/s]

{'loss': 1.3687, 'grad_norm': 1.588335633277893, 'learning_rate': 0.00013950440791041222, 'epoch': 0.3}


 30%|███       | 5089/16798 [22:13<51:34,  3.78it/s]

{'loss': 0.753, 'grad_norm': 1.2417734861373901, 'learning_rate': 0.00013949249463902788, 'epoch': 0.3}


 30%|███       | 5090/16798 [22:13<54:40,  3.57it/s]

{'loss': 1.4482, 'grad_norm': 2.0296685695648193, 'learning_rate': 0.00013948058136764357, 'epoch': 0.3}


 30%|███       | 5091/16798 [22:13<52:09,  3.74it/s]

{'loss': 1.0414, 'grad_norm': 1.8163824081420898, 'learning_rate': 0.00013946866809625924, 'epoch': 0.3}


 30%|███       | 5092/16798 [22:14<56:30,  3.45it/s]

{'loss': 0.9044, 'grad_norm': 1.547149896621704, 'learning_rate': 0.00013945675482487493, 'epoch': 0.3}


 30%|███       | 5093/16798 [22:14<52:11,  3.74it/s]

{'loss': 1.0139, 'grad_norm': 1.7335772514343262, 'learning_rate': 0.0001394448415534906, 'epoch': 0.3}


 30%|███       | 5094/16798 [22:14<51:27,  3.79it/s]

{'loss': 1.1127, 'grad_norm': 1.7941910028457642, 'learning_rate': 0.00013943292828210628, 'epoch': 0.3}


 30%|███       | 5095/16798 [22:14<53:41,  3.63it/s]

{'loss': 0.5022, 'grad_norm': 1.0881128311157227, 'learning_rate': 0.00013942101501072195, 'epoch': 0.3}


 30%|███       | 5097/16798 [22:15<49:39,  3.93it/s]

{'loss': 1.0326, 'grad_norm': 1.8191074132919312, 'learning_rate': 0.00013940910173933764, 'epoch': 0.3}


 30%|███       | 5097/16798 [22:15<49:39,  3.93it/s]

{'loss': 0.1962, 'grad_norm': 0.5849806070327759, 'learning_rate': 0.0001393971884679533, 'epoch': 0.3}


 30%|███       | 5098/16798 [22:15<48:09,  4.05it/s]

{'loss': 0.7436, 'grad_norm': 1.8650765419006348, 'learning_rate': 0.000139385275196569, 'epoch': 0.3}


 30%|███       | 5099/16798 [22:15<49:27,  3.94it/s]

{'loss': 0.5242, 'grad_norm': 1.2335143089294434, 'learning_rate': 0.00013937336192518466, 'epoch': 0.3}


 30%|███       | 5100/16798 [22:16<53:13,  3.66it/s]

{'loss': 0.751, 'grad_norm': 1.535034418106079, 'learning_rate': 0.00013936144865380035, 'epoch': 0.3}


 30%|███       | 5101/16798 [22:16<53:20,  3.65it/s]

{'loss': 1.9193, 'grad_norm': 1.9519048929214478, 'learning_rate': 0.000139349535382416, 'epoch': 0.3}


 30%|███       | 5102/16798 [22:16<53:26,  3.65it/s]

{'loss': 2.3753, 'grad_norm': 1.9493701457977295, 'learning_rate': 0.0001393376221110317, 'epoch': 0.3}


 30%|███       | 5103/16798 [22:17<54:49,  3.56it/s]

{'loss': 1.7975, 'grad_norm': 1.6665881872177124, 'learning_rate': 0.00013932570883964736, 'epoch': 0.3}


 30%|███       | 5104/16798 [22:17<53:17,  3.66it/s]

{'loss': 1.545, 'grad_norm': 1.8879585266113281, 'learning_rate': 0.00013931379556826305, 'epoch': 0.3}


 30%|███       | 5105/16798 [22:17<54:48,  3.56it/s]

{'loss': 1.3895, 'grad_norm': 1.7852097749710083, 'learning_rate': 0.00013930188229687872, 'epoch': 0.3}


 30%|███       | 5106/16798 [22:17<54:51,  3.55it/s]

{'loss': 1.5271, 'grad_norm': 1.7898691892623901, 'learning_rate': 0.0001392899690254944, 'epoch': 0.3}


 30%|███       | 5107/16798 [22:18<55:33,  3.51it/s]

{'loss': 1.5801, 'grad_norm': 1.7325628995895386, 'learning_rate': 0.00013927805575411007, 'epoch': 0.3}


 30%|███       | 5108/16798 [22:18<54:40,  3.56it/s]

{'loss': 1.5569, 'grad_norm': 1.7993338108062744, 'learning_rate': 0.00013926614248272576, 'epoch': 0.3}


 30%|███       | 5109/16798 [22:18<56:23,  3.45it/s]

{'loss': 1.9339, 'grad_norm': 2.0014069080352783, 'learning_rate': 0.00013925422921134143, 'epoch': 0.3}


 30%|███       | 5110/16798 [22:19<53:34,  3.64it/s]

{'loss': 1.6465, 'grad_norm': 1.7216169834136963, 'learning_rate': 0.00013924231593995712, 'epoch': 0.3}


 30%|███       | 5111/16798 [22:19<56:18,  3.46it/s]

{'loss': 1.8228, 'grad_norm': 1.9842909574508667, 'learning_rate': 0.00013923040266857278, 'epoch': 0.3}


 30%|███       | 5112/16798 [22:19<53:03,  3.67it/s]

{'loss': 1.5235, 'grad_norm': 1.7954217195510864, 'learning_rate': 0.00013921848939718847, 'epoch': 0.3}


 30%|███       | 5113/16798 [22:19<54:41,  3.56it/s]

{'loss': 1.4456, 'grad_norm': 1.8167619705200195, 'learning_rate': 0.00013920657612580414, 'epoch': 0.3}


 30%|███       | 5114/16798 [22:20<56:05,  3.47it/s]

{'loss': 1.6979, 'grad_norm': 1.8838353157043457, 'learning_rate': 0.00013919466285441983, 'epoch': 0.3}


 30%|███       | 5115/16798 [22:20<53:47,  3.62it/s]

{'loss': 0.8684, 'grad_norm': 1.3855855464935303, 'learning_rate': 0.0001391827495830355, 'epoch': 0.3}


 30%|███       | 5116/16798 [22:20<56:04,  3.47it/s]

{'loss': 1.7048, 'grad_norm': 2.0244972705841064, 'learning_rate': 0.00013917083631165118, 'epoch': 0.3}


 30%|███       | 5117/16798 [22:21<55:45,  3.49it/s]

{'loss': 1.4562, 'grad_norm': 1.8403823375701904, 'learning_rate': 0.00013915892304026684, 'epoch': 0.3}


 30%|███       | 5118/16798 [22:21<57:43,  3.37it/s]

{'loss': 1.2899, 'grad_norm': 1.962459683418274, 'learning_rate': 0.00013914700976888256, 'epoch': 0.3}


 30%|███       | 5119/16798 [22:21<56:39,  3.44it/s]

{'loss': 1.6926, 'grad_norm': 2.0089850425720215, 'learning_rate': 0.00013913509649749823, 'epoch': 0.3}


 30%|███       | 5120/16798 [22:21<58:22,  3.33it/s]

{'loss': 1.5699, 'grad_norm': 1.7996644973754883, 'learning_rate': 0.00013912318322611392, 'epoch': 0.3}


 30%|███       | 5121/16798 [22:22<55:20,  3.52it/s]

{'loss': 1.4728, 'grad_norm': 1.8774769306182861, 'learning_rate': 0.00013911126995472958, 'epoch': 0.3}


 30%|███       | 5123/16798 [22:22<53:15,  3.65it/s]

{'loss': 1.6995, 'grad_norm': 1.9909794330596924, 'learning_rate': 0.00013909935668334527, 'epoch': 0.3}


 30%|███       | 5123/16798 [22:22<53:15,  3.65it/s]

{'loss': 1.7779, 'grad_norm': 1.9017874002456665, 'learning_rate': 0.00013908744341196094, 'epoch': 0.3}


 31%|███       | 5124/16798 [22:23<52:23,  3.71it/s]

{'loss': 1.4902, 'grad_norm': 1.9069877862930298, 'learning_rate': 0.00013907553014057663, 'epoch': 0.31}


 31%|███       | 5125/16798 [22:23<51:14,  3.80it/s]

{'loss': 1.5043, 'grad_norm': 1.9741606712341309, 'learning_rate': 0.0001390636168691923, 'epoch': 0.31}


 31%|███       | 5126/16798 [22:23<53:41,  3.62it/s]

{'loss': 1.542, 'grad_norm': 2.00166392326355, 'learning_rate': 0.00013905170359780798, 'epoch': 0.31}


 31%|███       | 5127/16798 [22:23<52:49,  3.68it/s]

{'loss': 1.5405, 'grad_norm': 2.134795665740967, 'learning_rate': 0.00013903979032642364, 'epoch': 0.31}


 31%|███       | 5128/16798 [22:24<54:42,  3.55it/s]

{'loss': 1.5704, 'grad_norm': 1.7228410243988037, 'learning_rate': 0.00013902787705503933, 'epoch': 0.31}


 31%|███       | 5129/16798 [22:24<57:40,  3.37it/s]

{'loss': 1.4229, 'grad_norm': 1.7211127281188965, 'learning_rate': 0.000139015963783655, 'epoch': 0.31}


 31%|███       | 5130/16798 [22:24<57:38,  3.37it/s]

{'loss': 1.1452, 'grad_norm': 2.160123586654663, 'learning_rate': 0.0001390040505122707, 'epoch': 0.31}


 31%|███       | 5131/16798 [22:25<57:27,  3.38it/s]

{'loss': 1.4811, 'grad_norm': 1.957686185836792, 'learning_rate': 0.00013899213724088635, 'epoch': 0.31}


 31%|███       | 5132/16798 [22:25<56:15,  3.46it/s]

{'loss': 1.0968, 'grad_norm': 1.4843131303787231, 'learning_rate': 0.00013898022396950204, 'epoch': 0.31}


 31%|███       | 5133/16798 [22:25<58:08,  3.34it/s]

{'loss': 1.258, 'grad_norm': 1.6897401809692383, 'learning_rate': 0.0001389683106981177, 'epoch': 0.31}


 31%|███       | 5134/16798 [22:25<57:34,  3.38it/s]

{'loss': 0.7998, 'grad_norm': 1.291790246963501, 'learning_rate': 0.0001389563974267334, 'epoch': 0.31}


 31%|███       | 5135/16798 [22:26<57:41,  3.37it/s]

{'loss': 1.3765, 'grad_norm': 1.7597007751464844, 'learning_rate': 0.00013894448415534906, 'epoch': 0.31}


 31%|███       | 5136/16798 [22:26<57:24,  3.39it/s]

{'loss': 0.9957, 'grad_norm': 1.6103811264038086, 'learning_rate': 0.00013893257088396475, 'epoch': 0.31}


 31%|███       | 5137/16798 [22:26<58:06,  3.34it/s]

{'loss': 0.9483, 'grad_norm': 1.3556519746780396, 'learning_rate': 0.00013892065761258042, 'epoch': 0.31}


 31%|███       | 5138/16798 [22:27<55:06,  3.53it/s]

{'loss': 1.3586, 'grad_norm': 1.971531867980957, 'learning_rate': 0.0001389087443411961, 'epoch': 0.31}


 31%|███       | 5139/16798 [22:27<56:47,  3.42it/s]

{'loss': 1.2404, 'grad_norm': 1.8025141954421997, 'learning_rate': 0.00013889683106981177, 'epoch': 0.31}


 31%|███       | 5140/16798 [22:27<55:23,  3.51it/s]

{'loss': 0.9528, 'grad_norm': 1.5447285175323486, 'learning_rate': 0.00013888491779842746, 'epoch': 0.31}


 31%|███       | 5141/16798 [22:28<57:45,  3.36it/s]

{'loss': 0.7666, 'grad_norm': 1.4446114301681519, 'learning_rate': 0.00013887300452704313, 'epoch': 0.31}


 31%|███       | 5142/16798 [22:28<54:52,  3.54it/s]

{'loss': 1.1222, 'grad_norm': 1.6081953048706055, 'learning_rate': 0.00013886109125565882, 'epoch': 0.31}


 31%|███       | 5143/16798 [22:28<55:54,  3.47it/s]

{'loss': 1.7238, 'grad_norm': 2.1045002937316895, 'learning_rate': 0.00013884917798427448, 'epoch': 0.31}


 31%|███       | 5144/16798 [22:28<55:06,  3.52it/s]

{'loss': 0.9508, 'grad_norm': 1.9453723430633545, 'learning_rate': 0.00013883726471289017, 'epoch': 0.31}


 31%|███       | 5145/16798 [22:29<55:11,  3.52it/s]

{'loss': 0.7881, 'grad_norm': 1.3282065391540527, 'learning_rate': 0.00013882535144150583, 'epoch': 0.31}


 31%|███       | 5146/16798 [22:29<55:43,  3.48it/s]

{'loss': 0.626, 'grad_norm': 1.377136468887329, 'learning_rate': 0.00013881343817012152, 'epoch': 0.31}


 31%|███       | 5147/16798 [22:29<55:07,  3.52it/s]

{'loss': 0.3509, 'grad_norm': 0.955133318901062, 'learning_rate': 0.0001388015248987372, 'epoch': 0.31}


 31%|███       | 5148/16798 [22:30<56:39,  3.43it/s]

{'loss': 0.3965, 'grad_norm': 0.9896101951599121, 'learning_rate': 0.00013878961162735288, 'epoch': 0.31}


 31%|███       | 5149/16798 [22:30<55:32,  3.50it/s]

{'loss': 0.3755, 'grad_norm': 0.9358975291252136, 'learning_rate': 0.00013877769835596857, 'epoch': 0.31}


 31%|███       | 5150/16798 [22:30<55:52,  3.47it/s]

{'loss': 0.6016, 'grad_norm': 1.252367377281189, 'learning_rate': 0.00013876578508458423, 'epoch': 0.31}


 31%|███       | 5151/16798 [22:30<57:00,  3.41it/s]

{'loss': 2.0543, 'grad_norm': 2.1742238998413086, 'learning_rate': 0.00013875387181319992, 'epoch': 0.31}


 31%|███       | 5152/16798 [22:31<56:40,  3.42it/s]

{'loss': 2.1026, 'grad_norm': 2.047541856765747, 'learning_rate': 0.0001387419585418156, 'epoch': 0.31}


 31%|███       | 5153/16798 [22:31<58:25,  3.32it/s]

{'loss': 1.9381, 'grad_norm': 1.78673255443573, 'learning_rate': 0.00013873004527043128, 'epoch': 0.31}


 31%|███       | 5154/16798 [22:31<54:42,  3.55it/s]

{'loss': 2.4766, 'grad_norm': 2.1293039321899414, 'learning_rate': 0.00013871813199904694, 'epoch': 0.31}


 31%|███       | 5155/16798 [22:32<56:40,  3.42it/s]

{'loss': 2.014, 'grad_norm': 2.0340428352355957, 'learning_rate': 0.00013870621872766263, 'epoch': 0.31}


 31%|███       | 5156/16798 [22:32<53:55,  3.60it/s]

{'loss': 1.8332, 'grad_norm': 1.8939473628997803, 'learning_rate': 0.0001386943054562783, 'epoch': 0.31}


 31%|███       | 5157/16798 [22:32<55:10,  3.52it/s]

{'loss': 1.9695, 'grad_norm': 1.791175127029419, 'learning_rate': 0.000138682392184894, 'epoch': 0.31}


 31%|███       | 5158/16798 [22:32<55:12,  3.51it/s]

{'loss': 2.0836, 'grad_norm': 2.156242847442627, 'learning_rate': 0.00013867047891350965, 'epoch': 0.31}


 31%|███       | 5159/16798 [22:33<1:01:10,  3.17it/s]

{'loss': 2.5541, 'grad_norm': 2.2807862758636475, 'learning_rate': 0.00013865856564212534, 'epoch': 0.31}


 31%|███       | 5160/16798 [22:33<57:51,  3.35it/s]  

{'loss': 2.1354, 'grad_norm': 2.204437732696533, 'learning_rate': 0.000138646652370741, 'epoch': 0.31}


 31%|███       | 5161/16798 [22:33<58:21,  3.32it/s]

{'loss': 2.1776, 'grad_norm': 2.051234483718872, 'learning_rate': 0.0001386347390993567, 'epoch': 0.31}


 31%|███       | 5162/16798 [22:34<56:57,  3.40it/s]

{'loss': 1.3311, 'grad_norm': 1.7696784734725952, 'learning_rate': 0.00013862282582797236, 'epoch': 0.31}


 31%|███       | 5163/16798 [22:34<56:49,  3.41it/s]

{'loss': 1.955, 'grad_norm': 2.304428815841675, 'learning_rate': 0.00013861091255658805, 'epoch': 0.31}


 31%|███       | 5164/16798 [22:34<53:07,  3.65it/s]

{'loss': 1.307, 'grad_norm': 1.7261890172958374, 'learning_rate': 0.00013859899928520371, 'epoch': 0.31}


 31%|███       | 5165/16798 [22:34<55:00,  3.52it/s]

{'loss': 1.4426, 'grad_norm': 1.9101909399032593, 'learning_rate': 0.0001385870860138194, 'epoch': 0.31}


 31%|███       | 5166/16798 [22:35<56:01,  3.46it/s]

{'loss': 1.5295, 'grad_norm': 2.035856008529663, 'learning_rate': 0.00013857517274243507, 'epoch': 0.31}


 31%|███       | 5167/16798 [22:35<54:33,  3.55it/s]

{'loss': 1.6001, 'grad_norm': 1.7231030464172363, 'learning_rate': 0.00013856325947105076, 'epoch': 0.31}


 31%|███       | 5168/16798 [22:35<53:31,  3.62it/s]

{'loss': 1.4743, 'grad_norm': 1.8308517932891846, 'learning_rate': 0.00013855134619966642, 'epoch': 0.31}


 31%|███       | 5169/16798 [22:36<53:03,  3.65it/s]

{'loss': 1.6117, 'grad_norm': 2.1252896785736084, 'learning_rate': 0.00013853943292828211, 'epoch': 0.31}


 31%|███       | 5170/16798 [22:36<56:08,  3.45it/s]

{'loss': 1.2302, 'grad_norm': 1.5643473863601685, 'learning_rate': 0.00013852751965689778, 'epoch': 0.31}


 31%|███       | 5171/16798 [22:36<55:50,  3.47it/s]

{'loss': 0.9879, 'grad_norm': 1.3078283071517944, 'learning_rate': 0.00013851560638551347, 'epoch': 0.31}


 31%|███       | 5172/16798 [22:36<52:43,  3.67it/s]

{'loss': 1.4034, 'grad_norm': 1.729579210281372, 'learning_rate': 0.00013850369311412913, 'epoch': 0.31}


 31%|███       | 5173/16798 [22:37<57:53,  3.35it/s]

{'loss': 1.7184, 'grad_norm': 2.350008249282837, 'learning_rate': 0.00013849177984274482, 'epoch': 0.31}


 31%|███       | 5174/16798 [22:37<55:10,  3.51it/s]

{'loss': 1.493, 'grad_norm': 1.8978784084320068, 'learning_rate': 0.0001384798665713605, 'epoch': 0.31}


 31%|███       | 5175/16798 [22:37<56:15,  3.44it/s]

{'loss': 1.7874, 'grad_norm': 2.5136237144470215, 'learning_rate': 0.00013846795329997618, 'epoch': 0.31}


 31%|███       | 5176/16798 [22:38<53:26,  3.62it/s]

{'loss': 1.6247, 'grad_norm': 2.194965362548828, 'learning_rate': 0.00013845604002859184, 'epoch': 0.31}


 31%|███       | 5177/16798 [22:38<55:24,  3.50it/s]

{'loss': 1.8221, 'grad_norm': 2.4356775283813477, 'learning_rate': 0.00013844412675720753, 'epoch': 0.31}


 31%|███       | 5178/16798 [22:38<53:58,  3.59it/s]

{'loss': 1.4612, 'grad_norm': 1.8011983633041382, 'learning_rate': 0.0001384322134858232, 'epoch': 0.31}


 31%|███       | 5179/16798 [22:38<52:54,  3.66it/s]

{'loss': 1.2872, 'grad_norm': 2.0468697547912598, 'learning_rate': 0.00013842030021443889, 'epoch': 0.31}


 31%|███       | 5180/16798 [22:39<56:20,  3.44it/s]

{'loss': 1.4415, 'grad_norm': 1.7469723224639893, 'learning_rate': 0.00013840838694305458, 'epoch': 0.31}


 31%|███       | 5181/16798 [22:39<53:44,  3.60it/s]

{'loss': 1.4771, 'grad_norm': 1.8323643207550049, 'learning_rate': 0.00013839647367167027, 'epoch': 0.31}


 31%|███       | 5182/16798 [22:39<55:07,  3.51it/s]

{'loss': 1.3521, 'grad_norm': 1.9620585441589355, 'learning_rate': 0.00013838456040028593, 'epoch': 0.31}


 31%|███       | 5183/16798 [22:40<57:00,  3.40it/s]

{'loss': 1.1363, 'grad_norm': 1.708682894706726, 'learning_rate': 0.00013837264712890162, 'epoch': 0.31}


 31%|███       | 5184/16798 [22:40<55:24,  3.49it/s]

{'loss': 1.2563, 'grad_norm': 2.238145351409912, 'learning_rate': 0.00013836073385751729, 'epoch': 0.31}


 31%|███       | 5185/16798 [22:40<56:14,  3.44it/s]

{'loss': 1.1278, 'grad_norm': 1.605721354484558, 'learning_rate': 0.00013834882058613298, 'epoch': 0.31}


 31%|███       | 5186/16798 [22:40<55:43,  3.47it/s]

{'loss': 1.3107, 'grad_norm': 2.045762538909912, 'learning_rate': 0.00013833690731474864, 'epoch': 0.31}


 31%|███       | 5187/16798 [22:41<59:18,  3.26it/s]

{'loss': 1.3136, 'grad_norm': 1.8242902755737305, 'learning_rate': 0.00013832499404336433, 'epoch': 0.31}


 31%|███       | 5188/16798 [22:41<57:32,  3.36it/s]

{'loss': 1.2366, 'grad_norm': 1.8536019325256348, 'learning_rate': 0.00013831308077198, 'epoch': 0.31}


 31%|███       | 5189/16798 [22:41<56:31,  3.42it/s]

{'loss': 1.3822, 'grad_norm': 1.8757530450820923, 'learning_rate': 0.00013830116750059569, 'epoch': 0.31}


 31%|███       | 5190/16798 [22:42<57:28,  3.37it/s]

{'loss': 1.3155, 'grad_norm': 2.1327803134918213, 'learning_rate': 0.00013828925422921135, 'epoch': 0.31}


 31%|███       | 5191/16798 [22:42<1:00:02,  3.22it/s]

{'loss': 0.7639, 'grad_norm': 1.6579952239990234, 'learning_rate': 0.00013827734095782704, 'epoch': 0.31}


 31%|███       | 5192/16798 [22:42<57:50,  3.34it/s]  

{'loss': 1.0526, 'grad_norm': 2.000270366668701, 'learning_rate': 0.0001382654276864427, 'epoch': 0.31}


 31%|███       | 5193/16798 [22:43<56:41,  3.41it/s]

{'loss': 1.2322, 'grad_norm': 1.8292063474655151, 'learning_rate': 0.0001382535144150584, 'epoch': 0.31}


 31%|███       | 5194/16798 [22:43<53:49,  3.59it/s]

{'loss': 1.1357, 'grad_norm': 1.8622045516967773, 'learning_rate': 0.00013824160114367406, 'epoch': 0.31}


 31%|███       | 5195/16798 [22:43<54:58,  3.52it/s]

{'loss': 0.8956, 'grad_norm': 1.553259253501892, 'learning_rate': 0.00013822968787228975, 'epoch': 0.31}


 31%|███       | 5196/16798 [22:43<56:50,  3.40it/s]

{'loss': 0.7927, 'grad_norm': 1.561610460281372, 'learning_rate': 0.0001382177746009054, 'epoch': 0.31}


 31%|███       | 5197/16798 [22:44<57:07,  3.38it/s]

{'loss': 0.5704, 'grad_norm': 1.2899612188339233, 'learning_rate': 0.0001382058613295211, 'epoch': 0.31}


 31%|███       | 5198/16798 [22:44<56:13,  3.44it/s]

{'loss': 0.8063, 'grad_norm': 1.5001951456069946, 'learning_rate': 0.00013819394805813677, 'epoch': 0.31}


 31%|███       | 5199/16798 [22:44<57:06,  3.38it/s]

{'loss': 0.3624, 'grad_norm': 1.0419807434082031, 'learning_rate': 0.00013818203478675246, 'epoch': 0.31}


 31%|███       | 5200/16798 [22:45<54:50,  3.53it/s]

{'loss': 0.6062, 'grad_norm': 1.4373352527618408, 'learning_rate': 0.00013817012151536812, 'epoch': 0.31}


 31%|███       | 5201/16798 [22:45<55:09,  3.50it/s]

{'loss': 2.1193, 'grad_norm': 1.8981103897094727, 'learning_rate': 0.0001381582082439838, 'epoch': 0.31}


 31%|███       | 5202/16798 [22:45<54:16,  3.56it/s]

{'loss': 2.0409, 'grad_norm': 1.922027826309204, 'learning_rate': 0.00013814629497259948, 'epoch': 0.31}


 31%|███       | 5203/16798 [22:45<53:08,  3.64it/s]

{'loss': 2.1238, 'grad_norm': 1.9891650676727295, 'learning_rate': 0.00013813438170121517, 'epoch': 0.31}


 31%|███       | 5204/16798 [22:46<55:14,  3.50it/s]

{'loss': 1.9094, 'grad_norm': 1.9478970766067505, 'learning_rate': 0.00013812246842983083, 'epoch': 0.31}


 31%|███       | 5205/16798 [22:46<56:10,  3.44it/s]

{'loss': 2.007, 'grad_norm': 2.3368966579437256, 'learning_rate': 0.00013811055515844652, 'epoch': 0.31}


 31%|███       | 5206/16798 [22:46<59:35,  3.24it/s]

{'loss': 2.0162, 'grad_norm': 2.0844192504882812, 'learning_rate': 0.00013809864188706218, 'epoch': 0.31}


 31%|███       | 5207/16798 [22:47<59:44,  3.23it/s]

{'loss': 1.9353, 'grad_norm': 2.0973267555236816, 'learning_rate': 0.00013808672861567788, 'epoch': 0.31}


 31%|███       | 5208/16798 [22:47<59:33,  3.24it/s]

{'loss': 1.8431, 'grad_norm': 1.9697891473770142, 'learning_rate': 0.00013807481534429354, 'epoch': 0.31}


 31%|███       | 5209/16798 [22:47<58:01,  3.33it/s]

{'loss': 1.6216, 'grad_norm': 1.88485586643219, 'learning_rate': 0.00013806290207290923, 'epoch': 0.31}


 31%|███       | 5210/16798 [22:47<56:18,  3.43it/s]

{'loss': 1.363, 'grad_norm': 1.9034125804901123, 'learning_rate': 0.00013805098880152492, 'epoch': 0.31}


 31%|███       | 5211/16798 [22:48<55:46,  3.46it/s]

{'loss': 2.0048, 'grad_norm': 2.1407470703125, 'learning_rate': 0.00013803907553014058, 'epoch': 0.31}


 31%|███       | 5212/16798 [22:48<56:10,  3.44it/s]

{'loss': 1.5226, 'grad_norm': 1.8271138668060303, 'learning_rate': 0.00013802716225875627, 'epoch': 0.31}


 31%|███       | 5213/16798 [22:48<56:17,  3.43it/s]

{'loss': 1.2639, 'grad_norm': 1.5884068012237549, 'learning_rate': 0.00013801524898737194, 'epoch': 0.31}


 31%|███       | 5214/16798 [22:49<57:54,  3.33it/s]

{'loss': 1.7531, 'grad_norm': 1.9794214963912964, 'learning_rate': 0.00013800333571598763, 'epoch': 0.31}


 31%|███       | 5215/16798 [22:49<59:11,  3.26it/s]

{'loss': 1.7773, 'grad_norm': 2.180659532546997, 'learning_rate': 0.0001379914224446033, 'epoch': 0.31}


 31%|███       | 5216/16798 [22:49<58:07,  3.32it/s]

{'loss': 1.8308, 'grad_norm': 1.8987746238708496, 'learning_rate': 0.00013797950917321898, 'epoch': 0.31}


 31%|███       | 5217/16798 [22:50<57:23,  3.36it/s]

{'loss': 1.8784, 'grad_norm': 2.1772732734680176, 'learning_rate': 0.00013796759590183465, 'epoch': 0.31}


 31%|███       | 5218/16798 [22:50<56:23,  3.42it/s]

{'loss': 1.564, 'grad_norm': 2.063525676727295, 'learning_rate': 0.00013795568263045034, 'epoch': 0.31}


 31%|███       | 5219/16798 [22:50<55:57,  3.45it/s]

{'loss': 1.632, 'grad_norm': 1.957069993019104, 'learning_rate': 0.000137943769359066, 'epoch': 0.31}


 31%|███       | 5220/16798 [22:51<1:01:00,  3.16it/s]

{'loss': 1.5442, 'grad_norm': 2.2245163917541504, 'learning_rate': 0.0001379318560876817, 'epoch': 0.31}


 31%|███       | 5221/16798 [22:51<1:00:00,  3.22it/s]

{'loss': 1.3028, 'grad_norm': 3.7288036346435547, 'learning_rate': 0.00013791994281629736, 'epoch': 0.31}


 31%|███       | 5222/16798 [22:51<58:52,  3.28it/s]  

{'loss': 1.6316, 'grad_norm': 2.2420053482055664, 'learning_rate': 0.00013790802954491305, 'epoch': 0.31}


 31%|███       | 5223/16798 [22:51<59:13,  3.26it/s]

{'loss': 2.0624, 'grad_norm': 2.431833028793335, 'learning_rate': 0.0001378961162735287, 'epoch': 0.31}


 31%|███       | 5224/16798 [22:52<1:00:55,  3.17it/s]

{'loss': 1.5281, 'grad_norm': 1.6289430856704712, 'learning_rate': 0.0001378842030021444, 'epoch': 0.31}


 31%|███       | 5225/16798 [22:52<58:11,  3.31it/s]  

{'loss': 1.9971, 'grad_norm': 2.641993761062622, 'learning_rate': 0.00013787228973076006, 'epoch': 0.31}


 31%|███       | 5226/16798 [22:52<58:24,  3.30it/s]

{'loss': 1.4566, 'grad_norm': 2.0061120986938477, 'learning_rate': 0.00013786037645937576, 'epoch': 0.31}


 31%|███       | 5227/16798 [22:53<56:58,  3.38it/s]

{'loss': 1.429, 'grad_norm': 1.705073356628418, 'learning_rate': 0.00013784846318799142, 'epoch': 0.31}


 31%|███       | 5228/16798 [22:53<58:14,  3.31it/s]

{'loss': 1.3825, 'grad_norm': 2.1558516025543213, 'learning_rate': 0.0001378365499166071, 'epoch': 0.31}


 31%|███       | 5229/16798 [22:53<58:31,  3.30it/s]

{'loss': 1.4383, 'grad_norm': 1.6923937797546387, 'learning_rate': 0.00013782463664522277, 'epoch': 0.31}


 31%|███       | 5230/16798 [22:53<56:56,  3.39it/s]

{'loss': 1.2771, 'grad_norm': 1.6014615297317505, 'learning_rate': 0.00013781272337383846, 'epoch': 0.31}


 31%|███       | 5231/16798 [22:54<55:33,  3.47it/s]

{'loss': 0.9931, 'grad_norm': 1.6156673431396484, 'learning_rate': 0.00013780081010245413, 'epoch': 0.31}


 31%|███       | 5232/16798 [22:54<54:01,  3.57it/s]

{'loss': 1.34, 'grad_norm': 1.8780597448349, 'learning_rate': 0.00013778889683106982, 'epoch': 0.31}


 31%|███       | 5233/16798 [22:54<53:17,  3.62it/s]

{'loss': 1.6477, 'grad_norm': 1.983763575553894, 'learning_rate': 0.00013777698355968548, 'epoch': 0.31}


 31%|███       | 5234/16798 [22:55<54:23,  3.54it/s]

{'loss': 1.2932, 'grad_norm': 2.08034348487854, 'learning_rate': 0.00013776507028830117, 'epoch': 0.31}


 31%|███       | 5235/16798 [22:55<56:27,  3.41it/s]

{'loss': 1.5751, 'grad_norm': 2.1676363945007324, 'learning_rate': 0.00013775315701691684, 'epoch': 0.31}


 31%|███       | 5236/16798 [22:55<55:04,  3.50it/s]

{'loss': 1.5129, 'grad_norm': 1.9679819345474243, 'learning_rate': 0.00013774124374553253, 'epoch': 0.31}


 31%|███       | 5237/16798 [22:55<54:34,  3.53it/s]

{'loss': 1.2021, 'grad_norm': 1.5800856351852417, 'learning_rate': 0.0001377293304741482, 'epoch': 0.31}


 31%|███       | 5238/16798 [22:56<54:53,  3.51it/s]

{'loss': 1.6026, 'grad_norm': 1.8936045169830322, 'learning_rate': 0.00013771741720276388, 'epoch': 0.31}


 31%|███       | 5239/16798 [22:56<56:40,  3.40it/s]

{'loss': 1.1935, 'grad_norm': 1.6153545379638672, 'learning_rate': 0.00013770550393137955, 'epoch': 0.31}


 31%|███       | 5240/16798 [22:56<58:01,  3.32it/s]

{'loss': 1.1335, 'grad_norm': 1.7807700634002686, 'learning_rate': 0.00013769359065999524, 'epoch': 0.31}


 31%|███       | 5241/16798 [22:57<57:28,  3.35it/s]

{'loss': 1.6831, 'grad_norm': 2.3848702907562256, 'learning_rate': 0.00013768167738861093, 'epoch': 0.31}


 31%|███       | 5242/16798 [22:57<57:07,  3.37it/s]

{'loss': 1.8189, 'grad_norm': 2.6089978218078613, 'learning_rate': 0.00013766976411722662, 'epoch': 0.31}


 31%|███       | 5243/16798 [22:57<55:18,  3.48it/s]

{'loss': 1.3046, 'grad_norm': 1.812072992324829, 'learning_rate': 0.00013765785084584228, 'epoch': 0.31}


 31%|███       | 5244/16798 [22:58<57:48,  3.33it/s]

{'loss': 0.9211, 'grad_norm': 1.48343825340271, 'learning_rate': 0.00013764593757445797, 'epoch': 0.31}


 31%|███       | 5245/16798 [22:58<58:46,  3.28it/s]

{'loss': 1.1394, 'grad_norm': 1.6572133302688599, 'learning_rate': 0.00013763402430307364, 'epoch': 0.31}


 31%|███       | 5246/16798 [22:58<57:23,  3.36it/s]

{'loss': 0.5975, 'grad_norm': 1.3548662662506104, 'learning_rate': 0.00013762211103168933, 'epoch': 0.31}


 31%|███       | 5247/16798 [22:58<58:23,  3.30it/s]

{'loss': 0.2095, 'grad_norm': 0.8196566104888916, 'learning_rate': 0.000137610197760305, 'epoch': 0.31}


 31%|███       | 5248/16798 [22:59<55:26,  3.47it/s]

{'loss': 0.2678, 'grad_norm': 0.7591409087181091, 'learning_rate': 0.00013759828448892068, 'epoch': 0.31}


 31%|███       | 5249/16798 [22:59<54:08,  3.56it/s]

{'loss': 0.6894, 'grad_norm': 1.5272692441940308, 'learning_rate': 0.00013758637121753635, 'epoch': 0.31}


 31%|███▏      | 5250/16798 [22:59<55:00,  3.50it/s]

{'loss': 0.6792, 'grad_norm': 1.4527820348739624, 'learning_rate': 0.00013757445794615204, 'epoch': 0.31}


 31%|███▏      | 5251/16798 [23:00<54:31,  3.53it/s]

{'loss': 2.1453, 'grad_norm': 1.8673136234283447, 'learning_rate': 0.0001375625446747677, 'epoch': 0.31}


 31%|███▏      | 5252/16798 [23:00<55:21,  3.48it/s]

{'loss': 1.8913, 'grad_norm': 1.6410070657730103, 'learning_rate': 0.0001375506314033834, 'epoch': 0.31}


 31%|███▏      | 5253/16798 [23:00<56:21,  3.41it/s]

{'loss': 2.0504, 'grad_norm': 1.7787147760391235, 'learning_rate': 0.00013753871813199905, 'epoch': 0.31}


 31%|███▏      | 5254/16798 [23:00<55:30,  3.47it/s]

{'loss': 1.7523, 'grad_norm': 1.7789971828460693, 'learning_rate': 0.00013752680486061474, 'epoch': 0.31}


 31%|███▏      | 5255/16798 [23:01<59:15,  3.25it/s]

{'loss': 2.0093, 'grad_norm': 2.027148485183716, 'learning_rate': 0.0001375148915892304, 'epoch': 0.31}


 31%|███▏      | 5256/16798 [23:01<59:21,  3.24it/s]

{'loss': 2.1463, 'grad_norm': 1.9753416776657104, 'learning_rate': 0.0001375029783178461, 'epoch': 0.31}


 31%|███▏      | 5257/16798 [23:01<1:01:24,  3.13it/s]

{'loss': 1.7774, 'grad_norm': 1.7637895345687866, 'learning_rate': 0.00013749106504646176, 'epoch': 0.31}


 31%|███▏      | 5258/16798 [23:02<58:55,  3.26it/s]  

{'loss': 2.0274, 'grad_norm': 1.8149734735488892, 'learning_rate': 0.00013747915177507745, 'epoch': 0.31}


 31%|███▏      | 5259/16798 [23:02<57:19,  3.36it/s]

{'loss': 1.482, 'grad_norm': 1.751554012298584, 'learning_rate': 0.00013746723850369312, 'epoch': 0.31}


 31%|███▏      | 5260/16798 [23:02<59:32,  3.23it/s]

{'loss': 1.8343, 'grad_norm': 2.131176471710205, 'learning_rate': 0.0001374553252323088, 'epoch': 0.31}


 31%|███▏      | 5261/16798 [23:03<58:45,  3.27it/s]

{'loss': 1.4951, 'grad_norm': 2.0410776138305664, 'learning_rate': 0.00013744341196092447, 'epoch': 0.31}


 31%|███▏      | 5262/16798 [23:03<1:00:26,  3.18it/s]

{'loss': 1.5672, 'grad_norm': 1.5810792446136475, 'learning_rate': 0.00013743149868954016, 'epoch': 0.31}


 31%|███▏      | 5263/16798 [23:03<59:28,  3.23it/s]  

{'loss': 1.9819, 'grad_norm': 2.0659990310668945, 'learning_rate': 0.00013741958541815583, 'epoch': 0.31}


 31%|███▏      | 5264/16798 [23:04<59:16,  3.24it/s]

{'loss': 1.5914, 'grad_norm': 1.7454725503921509, 'learning_rate': 0.00013740767214677152, 'epoch': 0.31}


 31%|███▏      | 5265/16798 [23:04<56:54,  3.38it/s]

{'loss': 1.6628, 'grad_norm': 1.9306995868682861, 'learning_rate': 0.00013739575887538718, 'epoch': 0.31}


 31%|███▏      | 5266/16798 [23:04<58:49,  3.27it/s]

{'loss': 1.5573, 'grad_norm': 2.0121800899505615, 'learning_rate': 0.00013738384560400287, 'epoch': 0.31}


 31%|███▏      | 5267/16798 [23:04<55:18,  3.47it/s]

{'loss': 1.7199, 'grad_norm': 1.8540176153182983, 'learning_rate': 0.00013737193233261853, 'epoch': 0.31}


 31%|███▏      | 5268/16798 [23:05<56:14,  3.42it/s]

{'loss': 1.7049, 'grad_norm': 2.2019200325012207, 'learning_rate': 0.00013736001906123423, 'epoch': 0.31}


 31%|███▏      | 5269/16798 [23:05<55:52,  3.44it/s]

{'loss': 1.9046, 'grad_norm': 2.1971819400787354, 'learning_rate': 0.0001373481057898499, 'epoch': 0.31}


 31%|███▏      | 5270/16798 [23:05<55:44,  3.45it/s]

{'loss': 1.5589, 'grad_norm': 1.8949824571609497, 'learning_rate': 0.00013733619251846558, 'epoch': 0.31}


 31%|███▏      | 5271/16798 [23:06<57:32,  3.34it/s]

{'loss': 1.6919, 'grad_norm': 2.136439800262451, 'learning_rate': 0.00013732427924708124, 'epoch': 0.31}


 31%|███▏      | 5272/16798 [23:06<54:41,  3.51it/s]

{'loss': 1.659, 'grad_norm': 2.088559865951538, 'learning_rate': 0.00013731236597569693, 'epoch': 0.31}


 31%|███▏      | 5273/16798 [23:06<57:17,  3.35it/s]

{'loss': 1.6535, 'grad_norm': 1.9491112232208252, 'learning_rate': 0.00013730045270431263, 'epoch': 0.31}


 31%|███▏      | 5274/16798 [23:06<54:01,  3.56it/s]

{'loss': 1.6629, 'grad_norm': 2.257133722305298, 'learning_rate': 0.0001372885394329283, 'epoch': 0.31}


 31%|███▏      | 5275/16798 [23:07<57:23,  3.35it/s]

{'loss': 1.4637, 'grad_norm': 1.6939841508865356, 'learning_rate': 0.00013727662616154398, 'epoch': 0.31}


 31%|███▏      | 5276/16798 [23:07<55:56,  3.43it/s]

{'loss': 1.7632, 'grad_norm': 2.169062614440918, 'learning_rate': 0.00013726471289015964, 'epoch': 0.31}


 31%|███▏      | 5277/16798 [23:07<56:55,  3.37it/s]

{'loss': 1.2598, 'grad_norm': 1.5565059185028076, 'learning_rate': 0.00013725279961877533, 'epoch': 0.31}


 31%|███▏      | 5278/16798 [23:08<57:16,  3.35it/s]

{'loss': 1.4564, 'grad_norm': 1.9545323848724365, 'learning_rate': 0.000137240886347391, 'epoch': 0.31}


 31%|███▏      | 5279/16798 [23:08<58:53,  3.26it/s]

{'loss': 1.5785, 'grad_norm': 1.939307451248169, 'learning_rate': 0.0001372289730760067, 'epoch': 0.31}


 31%|███▏      | 5280/16798 [23:08<56:40,  3.39it/s]

{'loss': 1.1699, 'grad_norm': 1.4626011848449707, 'learning_rate': 0.00013721705980462235, 'epoch': 0.31}


 31%|███▏      | 5281/16798 [23:09<55:27,  3.46it/s]

{'loss': 1.3524, 'grad_norm': 1.5907231569290161, 'learning_rate': 0.00013720514653323804, 'epoch': 0.31}


 31%|███▏      | 5282/16798 [23:09<56:54,  3.37it/s]

{'loss': 1.6703, 'grad_norm': 1.9809584617614746, 'learning_rate': 0.0001371932332618537, 'epoch': 0.31}


 31%|███▏      | 5283/16798 [23:09<55:31,  3.46it/s]

{'loss': 1.6037, 'grad_norm': 2.0474863052368164, 'learning_rate': 0.0001371813199904694, 'epoch': 0.31}


 31%|███▏      | 5284/16798 [23:09<56:30,  3.40it/s]

{'loss': 1.256, 'grad_norm': 2.191171407699585, 'learning_rate': 0.00013716940671908506, 'epoch': 0.31}


 31%|███▏      | 5285/16798 [23:10<55:26,  3.46it/s]

{'loss': 1.3767, 'grad_norm': 2.0719447135925293, 'learning_rate': 0.00013715749344770075, 'epoch': 0.31}


 31%|███▏      | 5286/16798 [23:10<57:57,  3.31it/s]

{'loss': 1.2469, 'grad_norm': 1.6901055574417114, 'learning_rate': 0.00013714558017631642, 'epoch': 0.31}


 31%|███▏      | 5287/16798 [23:10<59:02,  3.25it/s]

{'loss': 1.3912, 'grad_norm': 2.0678153038024902, 'learning_rate': 0.0001371336669049321, 'epoch': 0.31}


 31%|███▏      | 5288/16798 [23:11<58:25,  3.28it/s]

{'loss': 1.5361, 'grad_norm': 1.8697693347930908, 'learning_rate': 0.00013712175363354777, 'epoch': 0.31}


 31%|███▏      | 5289/16798 [23:11<58:47,  3.26it/s]

{'loss': 1.3329, 'grad_norm': 1.810320258140564, 'learning_rate': 0.00013710984036216346, 'epoch': 0.31}


 31%|███▏      | 5290/16798 [23:11<57:01,  3.36it/s]

{'loss': 0.5603, 'grad_norm': 1.498279333114624, 'learning_rate': 0.00013709792709077912, 'epoch': 0.31}


 31%|███▏      | 5291/16798 [23:12<57:35,  3.33it/s]

{'loss': 1.6717, 'grad_norm': 2.151916265487671, 'learning_rate': 0.00013708601381939482, 'epoch': 0.31}


 32%|███▏      | 5292/16798 [23:12<55:59,  3.43it/s]

{'loss': 1.46, 'grad_norm': 2.0705015659332275, 'learning_rate': 0.00013707410054801048, 'epoch': 0.32}


 32%|███▏      | 5293/16798 [23:12<56:37,  3.39it/s]

{'loss': 1.1051, 'grad_norm': 1.800528883934021, 'learning_rate': 0.00013706218727662617, 'epoch': 0.32}


 32%|███▏      | 5294/16798 [23:12<58:03,  3.30it/s]

{'loss': 1.2468, 'grad_norm': 1.8541536331176758, 'learning_rate': 0.00013705027400524183, 'epoch': 0.32}


 32%|███▏      | 5295/16798 [23:13<56:41,  3.38it/s]

{'loss': 1.3341, 'grad_norm': 2.1263554096221924, 'learning_rate': 0.00013703836073385752, 'epoch': 0.32}


 32%|███▏      | 5296/16798 [23:13<55:28,  3.46it/s]

{'loss': 1.1689, 'grad_norm': 2.060908317565918, 'learning_rate': 0.0001370264474624732, 'epoch': 0.32}


 32%|███▏      | 5297/16798 [23:13<56:10,  3.41it/s]

{'loss': 0.7359, 'grad_norm': 1.5012162923812866, 'learning_rate': 0.00013701453419108888, 'epoch': 0.32}


 32%|███▏      | 5298/16798 [23:14<55:50,  3.43it/s]

{'loss': 1.0307, 'grad_norm': 1.9090487957000732, 'learning_rate': 0.00013700262091970454, 'epoch': 0.32}


 32%|███▏      | 5299/16798 [23:14<56:12,  3.41it/s]

{'loss': 0.4429, 'grad_norm': 1.1169222593307495, 'learning_rate': 0.00013699070764832023, 'epoch': 0.32}


 32%|███▏      | 5300/16798 [23:14<57:24,  3.34it/s]

{'loss': 0.2681, 'grad_norm': 0.70998215675354, 'learning_rate': 0.0001369787943769359, 'epoch': 0.32}


 32%|███▏      | 5301/16798 [23:14<54:45,  3.50it/s]

{'loss': 1.8968, 'grad_norm': 1.7480560541152954, 'learning_rate': 0.0001369668811055516, 'epoch': 0.32}


 32%|███▏      | 5302/16798 [23:15<58:40,  3.27it/s]

{'loss': 2.0823, 'grad_norm': 1.856083631515503, 'learning_rate': 0.00013695496783416725, 'epoch': 0.32}


 32%|███▏      | 5303/16798 [23:15<1:01:51,  3.10it/s]

{'loss': 1.8092, 'grad_norm': 1.65073561668396, 'learning_rate': 0.00013694305456278297, 'epoch': 0.32}


 32%|███▏      | 5304/16798 [23:15<57:45,  3.32it/s]  

{'loss': 1.9964, 'grad_norm': 1.7799063920974731, 'learning_rate': 0.00013693114129139863, 'epoch': 0.32}


 32%|███▏      | 5305/16798 [23:16<58:16,  3.29it/s]

{'loss': 2.1323, 'grad_norm': 2.138284921646118, 'learning_rate': 0.00013691922802001432, 'epoch': 0.32}


 32%|███▏      | 5306/16798 [23:16<54:46,  3.50it/s]

{'loss': 1.7635, 'grad_norm': 1.9779890775680542, 'learning_rate': 0.00013690731474863, 'epoch': 0.32}


 32%|███▏      | 5307/16798 [23:16<56:02,  3.42it/s]

{'loss': 1.9877, 'grad_norm': 3.3468611240386963, 'learning_rate': 0.00013689540147724568, 'epoch': 0.32}


 32%|███▏      | 5308/16798 [23:17<59:43,  3.21it/s]

{'loss': 1.9328, 'grad_norm': 1.8902549743652344, 'learning_rate': 0.00013688348820586134, 'epoch': 0.32}


 32%|███▏      | 5309/16798 [23:17<56:39,  3.38it/s]

{'loss': 1.7301, 'grad_norm': 1.6680225133895874, 'learning_rate': 0.00013687157493447703, 'epoch': 0.32}


 32%|███▏      | 5310/16798 [23:17<56:41,  3.38it/s]

{'loss': 1.863, 'grad_norm': 2.070136070251465, 'learning_rate': 0.0001368596616630927, 'epoch': 0.32}


 32%|███▏      | 5311/16798 [23:17<54:37,  3.50it/s]

{'loss': 1.684, 'grad_norm': 2.20045804977417, 'learning_rate': 0.00013684774839170839, 'epoch': 0.32}


 32%|███▏      | 5312/16798 [23:18<56:31,  3.39it/s]

{'loss': 1.7605, 'grad_norm': 2.1097021102905273, 'learning_rate': 0.00013683583512032405, 'epoch': 0.32}


 32%|███▏      | 5313/16798 [23:18<58:07,  3.29it/s]

{'loss': 1.5371, 'grad_norm': 1.8387320041656494, 'learning_rate': 0.00013682392184893974, 'epoch': 0.32}


 32%|███▏      | 5314/16798 [23:18<57:58,  3.30it/s]

{'loss': 1.5649, 'grad_norm': 1.863002896308899, 'learning_rate': 0.0001368120085775554, 'epoch': 0.32}


 32%|███▏      | 5315/16798 [23:19<57:19,  3.34it/s]

{'loss': 1.6951, 'grad_norm': 1.9405747652053833, 'learning_rate': 0.0001368000953061711, 'epoch': 0.32}


 32%|███▏      | 5316/16798 [23:19<55:36,  3.44it/s]

{'loss': 1.5646, 'grad_norm': 2.0182156562805176, 'learning_rate': 0.00013678818203478676, 'epoch': 0.32}


 32%|███▏      | 5317/16798 [23:19<54:59,  3.48it/s]

{'loss': 1.4564, 'grad_norm': 1.9892209768295288, 'learning_rate': 0.00013677626876340245, 'epoch': 0.32}


 32%|███▏      | 5318/16798 [23:20<57:49,  3.31it/s]

{'loss': 1.3976, 'grad_norm': 1.8725534677505493, 'learning_rate': 0.0001367643554920181, 'epoch': 0.32}


 32%|███▏      | 5319/16798 [23:20<54:51,  3.49it/s]

{'loss': 1.8493, 'grad_norm': 2.0399580001831055, 'learning_rate': 0.0001367524422206338, 'epoch': 0.32}


 32%|███▏      | 5320/16798 [23:20<54:35,  3.50it/s]

{'loss': 1.4613, 'grad_norm': 1.9793260097503662, 'learning_rate': 0.00013674052894924947, 'epoch': 0.32}


 32%|███▏      | 5321/16798 [23:20<59:15,  3.23it/s]

{'loss': 1.2567, 'grad_norm': 1.8753823041915894, 'learning_rate': 0.00013672861567786516, 'epoch': 0.32}


 32%|███▏      | 5322/16798 [23:21<57:28,  3.33it/s]

{'loss': 1.3329, 'grad_norm': 1.7603846788406372, 'learning_rate': 0.00013671670240648082, 'epoch': 0.32}


 32%|███▏      | 5323/16798 [23:21<55:52,  3.42it/s]

{'loss': 1.6452, 'grad_norm': 1.9183244705200195, 'learning_rate': 0.0001367047891350965, 'epoch': 0.32}


 32%|███▏      | 5324/16798 [23:21<57:11,  3.34it/s]

{'loss': 1.6066, 'grad_norm': 1.6759271621704102, 'learning_rate': 0.00013669287586371218, 'epoch': 0.32}


 32%|███▏      | 5325/16798 [23:22<58:21,  3.28it/s]

{'loss': 1.7305, 'grad_norm': 1.8372210264205933, 'learning_rate': 0.00013668096259232787, 'epoch': 0.32}


 32%|███▏      | 5326/16798 [23:22<1:01:16,  3.12it/s]

{'loss': 1.3978, 'grad_norm': 2.024113893508911, 'learning_rate': 0.00013666904932094353, 'epoch': 0.32}


 32%|███▏      | 5327/16798 [23:22<58:29,  3.27it/s]  

{'loss': 1.8317, 'grad_norm': 1.8594356775283813, 'learning_rate': 0.00013665713604955922, 'epoch': 0.32}


 32%|███▏      | 5328/16798 [23:23<1:01:51,  3.09it/s]

{'loss': 1.7869, 'grad_norm': 1.9561964273452759, 'learning_rate': 0.00013664522277817489, 'epoch': 0.32}


 32%|███▏      | 5329/16798 [23:23<57:19,  3.33it/s]  

{'loss': 1.2329, 'grad_norm': 1.6518871784210205, 'learning_rate': 0.00013663330950679058, 'epoch': 0.32}


 32%|███▏      | 5330/16798 [23:23<57:21,  3.33it/s]

{'loss': 1.383, 'grad_norm': 2.0315189361572266, 'learning_rate': 0.00013662139623540624, 'epoch': 0.32}


 32%|███▏      | 5331/16798 [23:23<54:15,  3.52it/s]

{'loss': 1.2354, 'grad_norm': 2.0565593242645264, 'learning_rate': 0.00013660948296402193, 'epoch': 0.32}


 32%|███▏      | 5332/16798 [23:24<58:16,  3.28it/s]

{'loss': 2.0634, 'grad_norm': 2.378878593444824, 'learning_rate': 0.0001365975696926376, 'epoch': 0.32}


 32%|███▏      | 5333/16798 [23:24<57:34,  3.32it/s]

{'loss': 1.2924, 'grad_norm': 1.5991753339767456, 'learning_rate': 0.00013658565642125329, 'epoch': 0.32}


 32%|███▏      | 5334/16798 [23:24<59:00,  3.24it/s]

{'loss': 1.6275, 'grad_norm': 1.8676687479019165, 'learning_rate': 0.00013657374314986898, 'epoch': 0.32}


 32%|███▏      | 5335/16798 [23:25<59:57,  3.19it/s]

{'loss': 0.9824, 'grad_norm': 1.7197610139846802, 'learning_rate': 0.00013656182987848464, 'epoch': 0.32}


 32%|███▏      | 5336/16798 [23:25<54:25,  3.51it/s]

{'loss': 1.0708, 'grad_norm': 1.6338469982147217, 'learning_rate': 0.00013654991660710033, 'epoch': 0.32}


 32%|███▏      | 5337/16798 [23:25<53:13,  3.59it/s]

{'loss': 2.0079, 'grad_norm': 2.5130746364593506, 'learning_rate': 0.000136538003335716, 'epoch': 0.32}


 32%|███▏      | 5338/16798 [23:26<55:03,  3.47it/s]

{'loss': 1.2222, 'grad_norm': 1.8056496381759644, 'learning_rate': 0.00013652609006433168, 'epoch': 0.32}


 32%|███▏      | 5339/16798 [23:26<56:15,  3.39it/s]

{'loss': 1.122, 'grad_norm': 1.5026774406433105, 'learning_rate': 0.00013651417679294735, 'epoch': 0.32}


 32%|███▏      | 5340/16798 [23:26<57:34,  3.32it/s]

{'loss': 0.9772, 'grad_norm': 1.5689690113067627, 'learning_rate': 0.00013650226352156304, 'epoch': 0.32}


 32%|███▏      | 5341/16798 [23:26<57:21,  3.33it/s]

{'loss': 1.416, 'grad_norm': 1.8246175050735474, 'learning_rate': 0.0001364903502501787, 'epoch': 0.32}


 32%|███▏      | 5342/16798 [23:27<55:40,  3.43it/s]

{'loss': 1.2289, 'grad_norm': 1.832183599472046, 'learning_rate': 0.0001364784369787944, 'epoch': 0.32}


 32%|███▏      | 5343/16798 [23:27<54:30,  3.50it/s]

{'loss': 1.1076, 'grad_norm': 1.622583031654358, 'learning_rate': 0.00013646652370741006, 'epoch': 0.32}


 32%|███▏      | 5344/16798 [23:27<54:26,  3.51it/s]

{'loss': 1.1956, 'grad_norm': 3.166062831878662, 'learning_rate': 0.00013645461043602575, 'epoch': 0.32}


 32%|███▏      | 5345/16798 [23:28<54:23,  3.51it/s]

{'loss': 0.5119, 'grad_norm': 1.059891700744629, 'learning_rate': 0.0001364426971646414, 'epoch': 0.32}


 32%|███▏      | 5346/16798 [23:28<57:21,  3.33it/s]

{'loss': 0.9111, 'grad_norm': 1.6376376152038574, 'learning_rate': 0.0001364307838932571, 'epoch': 0.32}


 32%|███▏      | 5347/16798 [23:28<58:56,  3.24it/s]

{'loss': 1.0465, 'grad_norm': 1.8668376207351685, 'learning_rate': 0.00013641887062187277, 'epoch': 0.32}


 32%|███▏      | 5348/16798 [23:29<57:42,  3.31it/s]

{'loss': 0.5767, 'grad_norm': 1.2238521575927734, 'learning_rate': 0.00013640695735048846, 'epoch': 0.32}


 32%|███▏      | 5349/16798 [23:29<57:03,  3.34it/s]

{'loss': 0.2725, 'grad_norm': 0.7530226111412048, 'learning_rate': 0.00013639504407910412, 'epoch': 0.32}


 32%|███▏      | 5350/16798 [23:29<56:20,  3.39it/s]

{'loss': 0.4175, 'grad_norm': 1.1337296962738037, 'learning_rate': 0.0001363831308077198, 'epoch': 0.32}


 32%|███▏      | 5351/16798 [23:29<54:24,  3.51it/s]

{'loss': 1.6216, 'grad_norm': 1.5570921897888184, 'learning_rate': 0.00013637121753633547, 'epoch': 0.32}


 32%|███▏      | 5352/16798 [23:30<55:25,  3.44it/s]

{'loss': 2.2574, 'grad_norm': 2.050967216491699, 'learning_rate': 0.00013635930426495117, 'epoch': 0.32}


 32%|███▏      | 5353/16798 [23:30<55:57,  3.41it/s]

{'loss': 2.0388, 'grad_norm': 2.0729262828826904, 'learning_rate': 0.00013634739099356683, 'epoch': 0.32}


 32%|███▏      | 5354/16798 [23:30<55:51,  3.41it/s]

{'loss': 1.9711, 'grad_norm': 1.8643198013305664, 'learning_rate': 0.00013633547772218252, 'epoch': 0.32}


 32%|███▏      | 5355/16798 [23:31<58:05,  3.28it/s]

{'loss': 1.9136, 'grad_norm': 1.8853405714035034, 'learning_rate': 0.00013632356445079818, 'epoch': 0.32}


 32%|███▏      | 5356/16798 [23:31<58:41,  3.25it/s]

{'loss': 1.4124, 'grad_norm': 1.7149237394332886, 'learning_rate': 0.00013631165117941387, 'epoch': 0.32}


 32%|███▏      | 5357/16798 [23:31<58:50,  3.24it/s]

{'loss': 2.2702, 'grad_norm': 2.1575558185577393, 'learning_rate': 0.00013629973790802954, 'epoch': 0.32}


 32%|███▏      | 5358/16798 [23:32<58:47,  3.24it/s]

{'loss': 2.0819, 'grad_norm': 2.2359459400177, 'learning_rate': 0.00013628782463664523, 'epoch': 0.32}


 32%|███▏      | 5359/16798 [23:32<58:33,  3.26it/s]

{'loss': 1.6396, 'grad_norm': 1.8090590238571167, 'learning_rate': 0.0001362759113652609, 'epoch': 0.32}


 32%|███▏      | 5360/16798 [23:32<57:30,  3.32it/s]

{'loss': 1.872, 'grad_norm': 2.0230870246887207, 'learning_rate': 0.00013626399809387658, 'epoch': 0.32}


 32%|███▏      | 5361/16798 [23:32<57:54,  3.29it/s]

{'loss': 0.9865, 'grad_norm': 1.3664741516113281, 'learning_rate': 0.00013625208482249225, 'epoch': 0.32}


 32%|███▏      | 5362/16798 [23:33<59:04,  3.23it/s]

{'loss': 1.4543, 'grad_norm': 1.9256653785705566, 'learning_rate': 0.00013624017155110794, 'epoch': 0.32}


 32%|███▏      | 5363/16798 [23:33<57:44,  3.30it/s]

{'loss': 1.9153, 'grad_norm': 2.2417562007904053, 'learning_rate': 0.0001362282582797236, 'epoch': 0.32}


 32%|███▏      | 5364/16798 [23:33<58:14,  3.27it/s]

{'loss': 1.3711, 'grad_norm': 1.6322802305221558, 'learning_rate': 0.0001362163450083393, 'epoch': 0.32}


 32%|███▏      | 5365/16798 [23:34<54:51,  3.47it/s]

{'loss': 1.4312, 'grad_norm': 1.9719429016113281, 'learning_rate': 0.00013620443173695498, 'epoch': 0.32}


 32%|███▏      | 5366/16798 [23:34<57:39,  3.30it/s]

{'loss': 1.7304, 'grad_norm': 1.8485790491104126, 'learning_rate': 0.00013619251846557067, 'epoch': 0.32}


 32%|███▏      | 5367/16798 [23:34<56:50,  3.35it/s]

{'loss': 1.8767, 'grad_norm': 1.957789659500122, 'learning_rate': 0.00013618060519418634, 'epoch': 0.32}


 32%|███▏      | 5368/16798 [23:35<56:54,  3.35it/s]

{'loss': 1.5164, 'grad_norm': 1.782102108001709, 'learning_rate': 0.00013616869192280203, 'epoch': 0.32}


 32%|███▏      | 5369/16798 [23:35<59:08,  3.22it/s]

{'loss': 1.5156, 'grad_norm': 2.125293493270874, 'learning_rate': 0.0001361567786514177, 'epoch': 0.32}


 32%|███▏      | 5370/16798 [23:35<55:33,  3.43it/s]

{'loss': 1.6205, 'grad_norm': 2.5556681156158447, 'learning_rate': 0.00013614486538003338, 'epoch': 0.32}


 32%|███▏      | 5371/16798 [23:35<59:07,  3.22it/s]

{'loss': 1.5879, 'grad_norm': 2.18894362449646, 'learning_rate': 0.00013613295210864905, 'epoch': 0.32}


 32%|███▏      | 5372/16798 [23:36<57:54,  3.29it/s]

{'loss': 1.7801, 'grad_norm': 1.854628324508667, 'learning_rate': 0.00013612103883726474, 'epoch': 0.32}


 32%|███▏      | 5373/16798 [23:36<59:35,  3.20it/s]

{'loss': 1.3478, 'grad_norm': 1.7570773363113403, 'learning_rate': 0.0001361091255658804, 'epoch': 0.32}


 32%|███▏      | 5374/16798 [23:36<57:04,  3.34it/s]

{'loss': 1.298, 'grad_norm': 1.5276085138320923, 'learning_rate': 0.0001360972122944961, 'epoch': 0.32}


 32%|███▏      | 5375/16798 [23:37<56:04,  3.39it/s]

{'loss': 1.426, 'grad_norm': 1.8301385641098022, 'learning_rate': 0.00013608529902311175, 'epoch': 0.32}


 32%|███▏      | 5376/16798 [23:37<59:02,  3.22it/s]

{'loss': 1.6953, 'grad_norm': 2.058065176010132, 'learning_rate': 0.00013607338575172745, 'epoch': 0.32}


 32%|███▏      | 5377/16798 [23:37<56:43,  3.36it/s]

{'loss': 1.9291, 'grad_norm': 2.2273714542388916, 'learning_rate': 0.0001360614724803431, 'epoch': 0.32}


 32%|███▏      | 5378/16798 [23:38<55:24,  3.44it/s]

{'loss': 1.7834, 'grad_norm': 2.3978934288024902, 'learning_rate': 0.0001360495592089588, 'epoch': 0.32}


 32%|███▏      | 5379/16798 [23:38<55:06,  3.45it/s]

{'loss': 1.3715, 'grad_norm': 1.724919319152832, 'learning_rate': 0.00013603764593757446, 'epoch': 0.32}


 32%|███▏      | 5380/16798 [23:38<57:54,  3.29it/s]

{'loss': 1.5564, 'grad_norm': 1.954224944114685, 'learning_rate': 0.00013602573266619015, 'epoch': 0.32}


 32%|███▏      | 5381/16798 [23:38<57:36,  3.30it/s]

{'loss': 1.456, 'grad_norm': 1.7372689247131348, 'learning_rate': 0.00013601381939480582, 'epoch': 0.32}


 32%|███▏      | 5382/16798 [23:39<59:09,  3.22it/s]

{'loss': 1.1127, 'grad_norm': 2.131636619567871, 'learning_rate': 0.0001360019061234215, 'epoch': 0.32}


 32%|███▏      | 5383/16798 [23:39<58:46,  3.24it/s]

{'loss': 1.3904, 'grad_norm': 1.8002873659133911, 'learning_rate': 0.00013598999285203717, 'epoch': 0.32}


 32%|███▏      | 5384/16798 [23:39<57:15,  3.32it/s]

{'loss': 1.6103, 'grad_norm': 2.1202824115753174, 'learning_rate': 0.00013597807958065286, 'epoch': 0.32}


 32%|███▏      | 5385/16798 [23:40<58:52,  3.23it/s]

{'loss': 0.9118, 'grad_norm': 1.462698221206665, 'learning_rate': 0.00013596616630926853, 'epoch': 0.32}


 32%|███▏      | 5386/16798 [23:40<55:09,  3.45it/s]

{'loss': 1.4855, 'grad_norm': 1.8766506910324097, 'learning_rate': 0.00013595425303788422, 'epoch': 0.32}


 32%|███▏      | 5387/16798 [23:40<57:41,  3.30it/s]

{'loss': 1.4602, 'grad_norm': 1.9695137739181519, 'learning_rate': 0.00013594233976649988, 'epoch': 0.32}


 32%|███▏      | 5388/16798 [23:41<54:30,  3.49it/s]

{'loss': 1.1802, 'grad_norm': 1.6042400598526, 'learning_rate': 0.00013593042649511557, 'epoch': 0.32}


 32%|███▏      | 5389/16798 [23:41<56:46,  3.35it/s]

{'loss': 1.2983, 'grad_norm': 1.7714447975158691, 'learning_rate': 0.00013591851322373124, 'epoch': 0.32}


 32%|███▏      | 5390/16798 [23:41<56:54,  3.34it/s]

{'loss': 1.5824, 'grad_norm': 2.0364866256713867, 'learning_rate': 0.00013590659995234693, 'epoch': 0.32}


 32%|███▏      | 5391/16798 [23:41<56:45,  3.35it/s]

{'loss': 1.0283, 'grad_norm': 1.886326551437378, 'learning_rate': 0.0001358946866809626, 'epoch': 0.32}


 32%|███▏      | 5392/16798 [23:42<57:36,  3.30it/s]

{'loss': 0.8471, 'grad_norm': 1.4161696434020996, 'learning_rate': 0.00013588277340957828, 'epoch': 0.32}


 32%|███▏      | 5393/16798 [23:42<57:35,  3.30it/s]

{'loss': 1.2073, 'grad_norm': 1.8708717823028564, 'learning_rate': 0.00013587086013819394, 'epoch': 0.32}


 32%|███▏      | 5394/16798 [23:42<56:44,  3.35it/s]

{'loss': 1.1879, 'grad_norm': 1.8931736946105957, 'learning_rate': 0.00013585894686680964, 'epoch': 0.32}


 32%|███▏      | 5395/16798 [23:43<58:01,  3.28it/s]

{'loss': 1.4828, 'grad_norm': 2.299612522125244, 'learning_rate': 0.0001358470335954253, 'epoch': 0.32}


 32%|███▏      | 5396/16798 [23:43<57:34,  3.30it/s]

{'loss': 1.0008, 'grad_norm': 1.560653567314148, 'learning_rate': 0.000135835120324041, 'epoch': 0.32}


 32%|███▏      | 5397/16798 [23:43<57:01,  3.33it/s]

{'loss': 0.7519, 'grad_norm': 1.5395851135253906, 'learning_rate': 0.00013582320705265668, 'epoch': 0.32}


 32%|███▏      | 5398/16798 [23:44<57:58,  3.28it/s]

{'loss': 0.5239, 'grad_norm': 1.1198093891143799, 'learning_rate': 0.00013581129378127234, 'epoch': 0.32}


 32%|███▏      | 5399/16798 [23:44<54:41,  3.47it/s]

{'loss': 0.4515, 'grad_norm': 1.0611234903335571, 'learning_rate': 0.00013579938050988804, 'epoch': 0.32}


 32%|███▏      | 5400/16798 [23:44<56:11,  3.38it/s]

{'loss': 0.6434, 'grad_norm': 1.5727969408035278, 'learning_rate': 0.0001357874672385037, 'epoch': 0.32}


 32%|███▏      | 5401/16798 [23:44<56:51,  3.34it/s]

{'loss': 1.7331, 'grad_norm': 1.79708731174469, 'learning_rate': 0.0001357755539671194, 'epoch': 0.32}


 32%|███▏      | 5402/16798 [23:45<57:03,  3.33it/s]

{'loss': 1.9701, 'grad_norm': 1.8436094522476196, 'learning_rate': 0.00013576364069573505, 'epoch': 0.32}


 32%|███▏      | 5403/16798 [23:45<57:25,  3.31it/s]

{'loss': 1.7415, 'grad_norm': 1.660009503364563, 'learning_rate': 0.00013575172742435074, 'epoch': 0.32}


 32%|███▏      | 5404/16798 [23:45<54:49,  3.46it/s]

{'loss': 1.7541, 'grad_norm': 2.404635190963745, 'learning_rate': 0.0001357398141529664, 'epoch': 0.32}


 32%|███▏      | 5405/16798 [23:46<57:39,  3.29it/s]

{'loss': 2.397, 'grad_norm': 2.229351282119751, 'learning_rate': 0.0001357279008815821, 'epoch': 0.32}


 32%|███▏      | 5406/16798 [23:46<55:03,  3.45it/s]

{'loss': 1.7722, 'grad_norm': 2.2708749771118164, 'learning_rate': 0.00013571598761019776, 'epoch': 0.32}


 32%|███▏      | 5407/16798 [23:46<56:53,  3.34it/s]

{'loss': 1.8232, 'grad_norm': 2.308638095855713, 'learning_rate': 0.00013570407433881345, 'epoch': 0.32}


 32%|███▏      | 5408/16798 [23:47<58:22,  3.25it/s]

{'loss': 1.6082, 'grad_norm': 2.00227689743042, 'learning_rate': 0.00013569216106742912, 'epoch': 0.32}


 32%|███▏      | 5409/16798 [23:47<57:34,  3.30it/s]

{'loss': 1.7796, 'grad_norm': 2.083270788192749, 'learning_rate': 0.0001356802477960448, 'epoch': 0.32}


 32%|███▏      | 5410/16798 [23:47<59:27,  3.19it/s]

{'loss': 1.5623, 'grad_norm': 1.6192104816436768, 'learning_rate': 0.00013566833452466047, 'epoch': 0.32}


 32%|███▏      | 5411/16798 [23:47<57:34,  3.30it/s]

{'loss': 1.9056, 'grad_norm': 2.114816188812256, 'learning_rate': 0.00013565642125327616, 'epoch': 0.32}


 32%|███▏      | 5412/16798 [23:48<58:35,  3.24it/s]

{'loss': 1.551, 'grad_norm': 1.8720287084579468, 'learning_rate': 0.00013564450798189183, 'epoch': 0.32}


 32%|███▏      | 5413/16798 [23:48<58:37,  3.24it/s]

{'loss': 1.5464, 'grad_norm': 1.8900734186172485, 'learning_rate': 0.00013563259471050752, 'epoch': 0.32}


 32%|███▏      | 5414/16798 [23:48<55:56,  3.39it/s]

{'loss': 1.3632, 'grad_norm': 1.6515885591506958, 'learning_rate': 0.00013562068143912318, 'epoch': 0.32}


 32%|███▏      | 5415/16798 [23:49<54:26,  3.48it/s]

{'loss': 1.2509, 'grad_norm': 1.5801082849502563, 'learning_rate': 0.00013560876816773887, 'epoch': 0.32}


 32%|███▏      | 5416/16798 [23:49<56:28,  3.36it/s]

{'loss': 1.6828, 'grad_norm': 2.1415066719055176, 'learning_rate': 0.00013559685489635453, 'epoch': 0.32}


 32%|███▏      | 5417/16798 [23:49<58:03,  3.27it/s]

{'loss': 1.3808, 'grad_norm': 2.216862678527832, 'learning_rate': 0.00013558494162497022, 'epoch': 0.32}


 32%|███▏      | 5418/16798 [23:50<57:48,  3.28it/s]

{'loss': 1.6671, 'grad_norm': 1.8453937768936157, 'learning_rate': 0.0001355730283535859, 'epoch': 0.32}


 32%|███▏      | 5419/16798 [23:50<56:28,  3.36it/s]

{'loss': 1.718, 'grad_norm': 2.2567436695098877, 'learning_rate': 0.00013556111508220158, 'epoch': 0.32}


 32%|███▏      | 5420/16798 [23:50<59:03,  3.21it/s]

{'loss': 1.3865, 'grad_norm': 1.7495460510253906, 'learning_rate': 0.00013554920181081724, 'epoch': 0.32}


 32%|███▏      | 5421/16798 [23:50<56:03,  3.38it/s]

{'loss': 1.6436, 'grad_norm': 2.1226534843444824, 'learning_rate': 0.00013553728853943293, 'epoch': 0.32}


 32%|███▏      | 5422/16798 [23:51<57:10,  3.32it/s]

{'loss': 1.3024, 'grad_norm': 1.6405998468399048, 'learning_rate': 0.0001355253752680486, 'epoch': 0.32}


 32%|███▏      | 5423/16798 [23:51<58:27,  3.24it/s]

{'loss': 1.5114, 'grad_norm': 1.8906725645065308, 'learning_rate': 0.0001355134619966643, 'epoch': 0.32}


 32%|███▏      | 5424/16798 [23:51<56:54,  3.33it/s]

{'loss': 1.6328, 'grad_norm': 2.6260244846343994, 'learning_rate': 0.00013550154872527995, 'epoch': 0.32}


 32%|███▏      | 5425/16798 [23:52<57:23,  3.30it/s]

{'loss': 1.2368, 'grad_norm': 1.7543113231658936, 'learning_rate': 0.00013548963545389564, 'epoch': 0.32}


 32%|███▏      | 5426/16798 [23:52<55:58,  3.39it/s]

{'loss': 1.5376, 'grad_norm': 1.7348616123199463, 'learning_rate': 0.00013547772218251133, 'epoch': 0.32}


 32%|███▏      | 5427/16798 [23:52<56:06,  3.38it/s]

{'loss': 1.4814, 'grad_norm': 1.9474869966506958, 'learning_rate': 0.00013546580891112702, 'epoch': 0.32}


 32%|███▏      | 5428/16798 [23:53<57:20,  3.30it/s]

{'loss': 1.4868, 'grad_norm': 1.7822959423065186, 'learning_rate': 0.0001354538956397427, 'epoch': 0.32}


 32%|███▏      | 5429/16798 [23:53<1:02:00,  3.06it/s]

{'loss': 1.4343, 'grad_norm': 1.8389441967010498, 'learning_rate': 0.00013544198236835838, 'epoch': 0.32}


 32%|███▏      | 5430/16798 [23:53<59:34,  3.18it/s]  

{'loss': 1.6995, 'grad_norm': 2.033194065093994, 'learning_rate': 0.00013543006909697404, 'epoch': 0.32}


 32%|███▏      | 5431/16798 [23:54<58:24,  3.24it/s]

{'loss': 1.7852, 'grad_norm': 2.0945816040039062, 'learning_rate': 0.00013541815582558973, 'epoch': 0.32}


 32%|███▏      | 5432/16798 [23:54<57:21,  3.30it/s]

{'loss': 1.4567, 'grad_norm': 2.0316996574401855, 'learning_rate': 0.0001354062425542054, 'epoch': 0.32}


 32%|███▏      | 5433/16798 [23:54<56:22,  3.36it/s]

{'loss': 1.2883, 'grad_norm': 1.7005327939987183, 'learning_rate': 0.0001353943292828211, 'epoch': 0.32}


 32%|███▏      | 5434/16798 [23:54<55:27,  3.42it/s]

{'loss': 1.7749, 'grad_norm': 2.043283462524414, 'learning_rate': 0.00013538241601143675, 'epoch': 0.32}


 32%|███▏      | 5435/16798 [23:55<56:55,  3.33it/s]

{'loss': 1.3181, 'grad_norm': 2.462853193283081, 'learning_rate': 0.00013537050274005244, 'epoch': 0.32}


 32%|███▏      | 5436/16798 [23:55<57:17,  3.30it/s]

{'loss': 1.3812, 'grad_norm': 2.0290465354919434, 'learning_rate': 0.0001353585894686681, 'epoch': 0.32}


 32%|███▏      | 5437/16798 [23:55<56:29,  3.35it/s]

{'loss': 1.1022, 'grad_norm': 2.0219802856445312, 'learning_rate': 0.0001353466761972838, 'epoch': 0.32}


 32%|███▏      | 5438/16798 [23:56<55:24,  3.42it/s]

{'loss': 1.6604, 'grad_norm': 2.134336471557617, 'learning_rate': 0.00013533476292589946, 'epoch': 0.32}


 32%|███▏      | 5439/16798 [23:56<55:45,  3.40it/s]

{'loss': 1.1876, 'grad_norm': 2.1295559406280518, 'learning_rate': 0.00013532284965451515, 'epoch': 0.32}


 32%|███▏      | 5440/16798 [23:56<56:27,  3.35it/s]

{'loss': 1.0784, 'grad_norm': 1.5679877996444702, 'learning_rate': 0.00013531093638313081, 'epoch': 0.32}


 32%|███▏      | 5441/16798 [23:57<56:03,  3.38it/s]

{'loss': 1.3888, 'grad_norm': 1.943747639656067, 'learning_rate': 0.0001352990231117465, 'epoch': 0.32}


 32%|███▏      | 5442/16798 [23:57<55:16,  3.42it/s]

{'loss': 0.9095, 'grad_norm': 1.5946544408798218, 'learning_rate': 0.00013528710984036217, 'epoch': 0.32}


 32%|███▏      | 5443/16798 [23:57<55:21,  3.42it/s]

{'loss': 1.3622, 'grad_norm': 1.9033643007278442, 'learning_rate': 0.00013527519656897786, 'epoch': 0.32}


 32%|███▏      | 5444/16798 [23:57<56:19,  3.36it/s]

{'loss': 1.2314, 'grad_norm': 1.7084161043167114, 'learning_rate': 0.00013526328329759352, 'epoch': 0.32}


 32%|███▏      | 5445/16798 [23:58<56:43,  3.34it/s]

{'loss': 0.5591, 'grad_norm': 1.2892714738845825, 'learning_rate': 0.00013525137002620921, 'epoch': 0.32}


 32%|███▏      | 5446/16798 [23:58<55:57,  3.38it/s]

{'loss': 0.9109, 'grad_norm': 1.5369681119918823, 'learning_rate': 0.00013523945675482488, 'epoch': 0.32}


 32%|███▏      | 5447/16798 [23:58<57:08,  3.31it/s]

{'loss': 0.8777, 'grad_norm': 1.7581716775894165, 'learning_rate': 0.00013522754348344057, 'epoch': 0.32}


 32%|███▏      | 5448/16798 [23:59<56:12,  3.37it/s]

{'loss': 0.4267, 'grad_norm': 0.9252347350120544, 'learning_rate': 0.00013521563021205623, 'epoch': 0.32}


 32%|███▏      | 5449/16798 [23:59<56:09,  3.37it/s]

{'loss': 0.6435, 'grad_norm': 1.3831008672714233, 'learning_rate': 0.00013520371694067192, 'epoch': 0.32}


 32%|███▏      | 5450/16798 [23:59<58:07,  3.25it/s]

{'loss': 0.2558, 'grad_norm': 0.7554119825363159, 'learning_rate': 0.00013519180366928759, 'epoch': 0.32}


 32%|███▏      | 5451/16798 [24:00<58:12,  3.25it/s]

{'loss': 2.361, 'grad_norm': 2.41269588470459, 'learning_rate': 0.00013517989039790328, 'epoch': 0.32}


 32%|███▏      | 5452/16798 [24:00<58:07,  3.25it/s]

{'loss': 2.3126, 'grad_norm': 1.9465179443359375, 'learning_rate': 0.00013516797712651894, 'epoch': 0.32}


 32%|███▏      | 5453/16798 [24:00<57:55,  3.26it/s]

{'loss': 2.1872, 'grad_norm': 1.9287420511245728, 'learning_rate': 0.00013515606385513463, 'epoch': 0.32}


 32%|███▏      | 5454/16798 [24:00<59:53,  3.16it/s]

{'loss': 2.1766, 'grad_norm': 2.0055782794952393, 'learning_rate': 0.0001351441505837503, 'epoch': 0.32}


 32%|███▏      | 5455/16798 [24:01<59:54,  3.16it/s]

{'loss': 1.8188, 'grad_norm': 2.745190382003784, 'learning_rate': 0.00013513223731236596, 'epoch': 0.32}


 32%|███▏      | 5456/16798 [24:01<59:24,  3.18it/s]

{'loss': 2.0568, 'grad_norm': 2.078472137451172, 'learning_rate': 0.00013512032404098165, 'epoch': 0.32}


 32%|███▏      | 5457/16798 [24:01<59:19,  3.19it/s]

{'loss': 1.9796, 'grad_norm': 1.8870620727539062, 'learning_rate': 0.00013510841076959734, 'epoch': 0.32}


 32%|███▏      | 5458/16798 [24:02<57:02,  3.31it/s]

{'loss': 1.5119, 'grad_norm': 3.4127089977264404, 'learning_rate': 0.00013509649749821303, 'epoch': 0.32}


 32%|███▏      | 5459/16798 [24:02<57:31,  3.29it/s]

{'loss': 1.898, 'grad_norm': 2.18099308013916, 'learning_rate': 0.0001350845842268287, 'epoch': 0.32}


 33%|███▎      | 5460/16798 [24:02<56:32,  3.34it/s]

{'loss': 1.6723, 'grad_norm': 1.880458116531372, 'learning_rate': 0.00013507267095544439, 'epoch': 0.33}


 33%|███▎      | 5461/16798 [24:03<58:11,  3.25it/s]

{'loss': 1.6326, 'grad_norm': 1.853216290473938, 'learning_rate': 0.00013506075768406005, 'epoch': 0.33}


 33%|███▎      | 5462/16798 [24:03<57:15,  3.30it/s]

{'loss': 2.029, 'grad_norm': 2.1903607845306396, 'learning_rate': 0.00013504884441267574, 'epoch': 0.33}


 33%|███▎      | 5463/16798 [24:03<56:47,  3.33it/s]

{'loss': 1.9947, 'grad_norm': 2.188436508178711, 'learning_rate': 0.0001350369311412914, 'epoch': 0.33}


 33%|███▎      | 5464/16798 [24:03<54:16,  3.48it/s]

{'loss': 1.4112, 'grad_norm': 1.5981171131134033, 'learning_rate': 0.0001350250178699071, 'epoch': 0.33}


 33%|███▎      | 5465/16798 [24:04<55:46,  3.39it/s]

{'loss': 1.3482, 'grad_norm': 2.205601930618286, 'learning_rate': 0.00013501310459852276, 'epoch': 0.33}


 33%|███▎      | 5466/16798 [24:04<55:50,  3.38it/s]

{'loss': 1.9075, 'grad_norm': 2.1858510971069336, 'learning_rate': 0.00013500119132713845, 'epoch': 0.33}


 33%|███▎      | 5467/16798 [24:04<56:24,  3.35it/s]

{'loss': 1.6832, 'grad_norm': 2.0094685554504395, 'learning_rate': 0.0001349892780557541, 'epoch': 0.33}


 33%|███▎      | 5468/16798 [24:05<57:33,  3.28it/s]

{'loss': 1.0768, 'grad_norm': 1.54141366481781, 'learning_rate': 0.0001349773647843698, 'epoch': 0.33}


 33%|███▎      | 5469/16798 [24:05<59:22,  3.18it/s]

{'loss': 1.0621, 'grad_norm': 1.64962899684906, 'learning_rate': 0.00013496545151298547, 'epoch': 0.33}


 33%|███▎      | 5470/16798 [24:05<57:34,  3.28it/s]

{'loss': 1.2646, 'grad_norm': 1.656484603881836, 'learning_rate': 0.00013495353824160116, 'epoch': 0.33}


 33%|███▎      | 5471/16798 [24:06<1:00:10,  3.14it/s]

{'loss': 1.6476, 'grad_norm': 1.8740049600601196, 'learning_rate': 0.00013494162497021682, 'epoch': 0.33}


 33%|███▎      | 5472/16798 [24:06<59:41,  3.16it/s]  

{'loss': 1.7251, 'grad_norm': 2.1758015155792236, 'learning_rate': 0.0001349297116988325, 'epoch': 0.33}


 33%|███▎      | 5473/16798 [24:06<58:41,  3.22it/s]

{'loss': 1.2724, 'grad_norm': 1.7852530479431152, 'learning_rate': 0.00013491779842744818, 'epoch': 0.33}


 33%|███▎      | 5474/16798 [24:07<59:52,  3.15it/s]

{'loss': 1.469, 'grad_norm': 1.8586201667785645, 'learning_rate': 0.00013490588515606387, 'epoch': 0.33}


 33%|███▎      | 5475/16798 [24:07<57:55,  3.26it/s]

{'loss': 1.1478, 'grad_norm': 1.7627441883087158, 'learning_rate': 0.00013489397188467953, 'epoch': 0.33}


 33%|███▎      | 5476/16798 [24:07<55:59,  3.37it/s]

{'loss': 1.1593, 'grad_norm': 1.6461647748947144, 'learning_rate': 0.00013488205861329522, 'epoch': 0.33}


 33%|███▎      | 5477/16798 [24:07<54:56,  3.43it/s]

{'loss': 1.6909, 'grad_norm': 2.2961835861206055, 'learning_rate': 0.00013487014534191088, 'epoch': 0.33}


 33%|███▎      | 5478/16798 [24:08<56:12,  3.36it/s]

{'loss': 1.6055, 'grad_norm': 2.4231951236724854, 'learning_rate': 0.00013485823207052658, 'epoch': 0.33}


 33%|███▎      | 5479/16798 [24:08<55:51,  3.38it/s]

{'loss': 1.7376, 'grad_norm': 2.1173360347747803, 'learning_rate': 0.00013484631879914224, 'epoch': 0.33}


 33%|███▎      | 5480/16798 [24:08<54:59,  3.43it/s]

{'loss': 1.9851, 'grad_norm': 2.5303828716278076, 'learning_rate': 0.00013483440552775793, 'epoch': 0.33}


 33%|███▎      | 5481/16798 [24:09<56:23,  3.34it/s]

{'loss': 1.3232, 'grad_norm': 1.8926730155944824, 'learning_rate': 0.0001348224922563736, 'epoch': 0.33}


 33%|███▎      | 5482/16798 [24:09<55:11,  3.42it/s]

{'loss': 2.0825, 'grad_norm': 3.083366870880127, 'learning_rate': 0.00013481057898498928, 'epoch': 0.33}


 33%|███▎      | 5483/16798 [24:09<55:04,  3.42it/s]

{'loss': 1.4772, 'grad_norm': 1.7303061485290527, 'learning_rate': 0.00013479866571360495, 'epoch': 0.33}


 33%|███▎      | 5484/16798 [24:10<55:18,  3.41it/s]

{'loss': 1.4584, 'grad_norm': 1.7601324319839478, 'learning_rate': 0.00013478675244222064, 'epoch': 0.33}


 33%|███▎      | 5485/16798 [24:10<55:41,  3.39it/s]

{'loss': 1.9295, 'grad_norm': 2.4423105716705322, 'learning_rate': 0.0001347748391708363, 'epoch': 0.33}


 33%|███▎      | 5486/16798 [24:10<54:38,  3.45it/s]

{'loss': 1.3614, 'grad_norm': 1.6488037109375, 'learning_rate': 0.000134762925899452, 'epoch': 0.33}


 33%|███▎      | 5487/16798 [24:10<54:18,  3.47it/s]

{'loss': 1.0135, 'grad_norm': 1.5052322149276733, 'learning_rate': 0.00013475101262806766, 'epoch': 0.33}


 33%|███▎      | 5488/16798 [24:11<55:00,  3.43it/s]

{'loss': 1.0476, 'grad_norm': 1.5987200736999512, 'learning_rate': 0.00013473909935668337, 'epoch': 0.33}


 33%|███▎      | 5489/16798 [24:11<55:27,  3.40it/s]

{'loss': 1.5973, 'grad_norm': 1.902363896369934, 'learning_rate': 0.00013472718608529904, 'epoch': 0.33}


 33%|███▎      | 5490/16798 [24:11<56:45,  3.32it/s]

{'loss': 1.0528, 'grad_norm': 2.002133846282959, 'learning_rate': 0.00013471527281391473, 'epoch': 0.33}


 33%|███▎      | 5491/16798 [24:12<1:00:23,  3.12it/s]

{'loss': 1.002, 'grad_norm': 1.6476027965545654, 'learning_rate': 0.0001347033595425304, 'epoch': 0.33}


 33%|███▎      | 5492/16798 [24:12<59:03,  3.19it/s]  

{'loss': 1.1215, 'grad_norm': 1.6724357604980469, 'learning_rate': 0.00013469144627114608, 'epoch': 0.33}


 33%|███▎      | 5493/16798 [24:12<58:19,  3.23it/s]

{'loss': 1.2811, 'grad_norm': 2.14487886428833, 'learning_rate': 0.00013467953299976175, 'epoch': 0.33}


 33%|███▎      | 5494/16798 [24:13<58:00,  3.25it/s]

{'loss': 1.1109, 'grad_norm': 1.8937580585479736, 'learning_rate': 0.00013466761972837744, 'epoch': 0.33}


 33%|███▎      | 5495/16798 [24:13<57:57,  3.25it/s]

{'loss': 0.991, 'grad_norm': 2.166893243789673, 'learning_rate': 0.0001346557064569931, 'epoch': 0.33}


 33%|███▎      | 5496/16798 [24:13<59:08,  3.19it/s]

{'loss': 0.9884, 'grad_norm': 1.4591147899627686, 'learning_rate': 0.0001346437931856088, 'epoch': 0.33}


 33%|███▎      | 5497/16798 [24:13<58:56,  3.20it/s]

{'loss': 0.9216, 'grad_norm': 1.3416407108306885, 'learning_rate': 0.00013463187991422446, 'epoch': 0.33}


 33%|███▎      | 5498/16798 [24:14<58:51,  3.20it/s]

{'loss': 0.7217, 'grad_norm': 1.4869356155395508, 'learning_rate': 0.00013461996664284015, 'epoch': 0.33}


 33%|███▎      | 5499/16798 [24:14<56:37,  3.33it/s]

{'loss': 0.2205, 'grad_norm': 0.797204315662384, 'learning_rate': 0.0001346080533714558, 'epoch': 0.33}




{'loss': 0.416, 'grad_norm': 0.9856520891189575, 'learning_rate': 0.0001345961401000715, 'epoch': 0.33}


 33%|███▎      | 5501/16798 [24:17<3:17:10,  1.05s/it]

{'loss': 1.8326, 'grad_norm': 1.5914911031723022, 'learning_rate': 0.00013458422682868716, 'epoch': 0.33}


 33%|███▎      | 5502/16798 [24:18<2:37:37,  1.19it/s]

{'loss': 1.7343, 'grad_norm': 1.8562184572219849, 'learning_rate': 0.00013457231355730286, 'epoch': 0.33}


 33%|███▎      | 5503/16798 [24:18<2:06:30,  1.49it/s]

{'loss': 1.6779, 'grad_norm': 2.112600564956665, 'learning_rate': 0.00013456040028591852, 'epoch': 0.33}


 33%|███▎      | 5504/16798 [24:18<1:46:58,  1.76it/s]

{'loss': 2.1633, 'grad_norm': 1.8072599172592163, 'learning_rate': 0.0001345484870145342, 'epoch': 0.33}


 33%|███▎      | 5505/16798 [24:18<1:28:48,  2.12it/s]

{'loss': 2.0529, 'grad_norm': 1.9143809080123901, 'learning_rate': 0.00013453657374314987, 'epoch': 0.33}


 33%|███▎      | 5506/16798 [24:19<1:18:19,  2.40it/s]

{'loss': 1.8319, 'grad_norm': 1.96688973903656, 'learning_rate': 0.00013452466047176556, 'epoch': 0.33}


 33%|███▎      | 5507/16798 [24:19<1:10:17,  2.68it/s]

{'loss': 1.687, 'grad_norm': 1.6632782220840454, 'learning_rate': 0.00013451274720038123, 'epoch': 0.33}


 33%|███▎      | 5508/16798 [24:19<1:07:06,  2.80it/s]

{'loss': 1.8785, 'grad_norm': 2.1663172245025635, 'learning_rate': 0.00013450083392899692, 'epoch': 0.33}


 33%|███▎      | 5509/16798 [24:20<1:03:58,  2.94it/s]

{'loss': 1.4525, 'grad_norm': 1.8708964586257935, 'learning_rate': 0.00013448892065761258, 'epoch': 0.33}


 33%|███▎      | 5510/16798 [24:20<1:03:22,  2.97it/s]

{'loss': 1.5825, 'grad_norm': 1.658685326576233, 'learning_rate': 0.00013447700738622827, 'epoch': 0.33}


 33%|███▎      | 5511/16798 [24:20<1:00:51,  3.09it/s]

{'loss': 1.7562, 'grad_norm': 2.005378484725952, 'learning_rate': 0.00013446509411484394, 'epoch': 0.33}


 33%|███▎      | 5512/16798 [24:21<1:01:19,  3.07it/s]

{'loss': 1.6846, 'grad_norm': 1.9621057510375977, 'learning_rate': 0.0001344531808434596, 'epoch': 0.33}


 33%|███▎      | 5513/16798 [24:21<59:13,  3.18it/s]  

{'loss': 1.6677, 'grad_norm': 2.7859129905700684, 'learning_rate': 0.0001344412675720753, 'epoch': 0.33}


 33%|███▎      | 5514/16798 [24:21<59:08,  3.18it/s]

{'loss': 1.5954, 'grad_norm': 2.03836989402771, 'learning_rate': 0.00013442935430069096, 'epoch': 0.33}


 33%|███▎      | 5515/16798 [24:21<57:40,  3.26it/s]

{'loss': 1.536, 'grad_norm': 2.099297046661377, 'learning_rate': 0.00013441744102930665, 'epoch': 0.33}


 33%|███▎      | 5516/16798 [24:22<55:04,  3.41it/s]

{'loss': 1.4069, 'grad_norm': 1.704565167427063, 'learning_rate': 0.0001344055277579223, 'epoch': 0.33}


 33%|███▎      | 5517/16798 [24:22<54:18,  3.46it/s]

{'loss': 1.2985, 'grad_norm': 1.8950527906417847, 'learning_rate': 0.000134393614486538, 'epoch': 0.33}


 33%|███▎      | 5518/16798 [24:22<54:53,  3.42it/s]

{'loss': 1.5984, 'grad_norm': 1.8834205865859985, 'learning_rate': 0.00013438170121515366, 'epoch': 0.33}


 33%|███▎      | 5519/16798 [24:23<54:40,  3.44it/s]

{'loss': 1.0494, 'grad_norm': 1.5221550464630127, 'learning_rate': 0.00013436978794376938, 'epoch': 0.33}


 33%|███▎      | 5520/16798 [24:23<56:20,  3.34it/s]

{'loss': 1.3235, 'grad_norm': 1.7813336849212646, 'learning_rate': 0.00013435787467238505, 'epoch': 0.33}


 33%|███▎      | 5521/16798 [24:23<57:35,  3.26it/s]

{'loss': 1.7721, 'grad_norm': 1.9994736909866333, 'learning_rate': 0.00013434596140100074, 'epoch': 0.33}


 33%|███▎      | 5522/16798 [24:24<1:00:16,  3.12it/s]

{'loss': 1.2487, 'grad_norm': 1.9724563360214233, 'learning_rate': 0.0001343340481296164, 'epoch': 0.33}


 33%|███▎      | 5523/16798 [24:24<58:05,  3.23it/s]  

{'loss': 1.3801, 'grad_norm': 1.800476312637329, 'learning_rate': 0.0001343221348582321, 'epoch': 0.33}


 33%|███▎      | 5524/16798 [24:24<57:37,  3.26it/s]

{'loss': 1.3697, 'grad_norm': 1.8684879541397095, 'learning_rate': 0.00013431022158684775, 'epoch': 0.33}


 33%|███▎      | 5525/16798 [24:24<57:05,  3.29it/s]

{'loss': 1.5275, 'grad_norm': 2.212625741958618, 'learning_rate': 0.00013429830831546345, 'epoch': 0.33}


 33%|███▎      | 5526/16798 [24:25<56:18,  3.34it/s]

{'loss': 1.3822, 'grad_norm': 2.228368043899536, 'learning_rate': 0.0001342863950440791, 'epoch': 0.33}


 33%|███▎      | 5527/16798 [24:25<56:10,  3.34it/s]

{'loss': 1.6857, 'grad_norm': 1.9060814380645752, 'learning_rate': 0.0001342744817726948, 'epoch': 0.33}


 33%|███▎      | 5528/16798 [24:25<55:11,  3.40it/s]

{'loss': 1.2959, 'grad_norm': 2.106229066848755, 'learning_rate': 0.00013426256850131046, 'epoch': 0.33}


 33%|███▎      | 5529/16798 [24:26<56:42,  3.31it/s]

{'loss': 1.6927, 'grad_norm': 2.279684066772461, 'learning_rate': 0.00013425065522992615, 'epoch': 0.33}


 33%|███▎      | 5530/16798 [24:26<58:04,  3.23it/s]

{'loss': 1.2202, 'grad_norm': 1.6167964935302734, 'learning_rate': 0.00013423874195854182, 'epoch': 0.33}


 33%|███▎      | 5531/16798 [24:26<57:28,  3.27it/s]

{'loss': 1.4346, 'grad_norm': 1.9650918245315552, 'learning_rate': 0.0001342268286871575, 'epoch': 0.33}


 33%|███▎      | 5532/16798 [24:27<58:30,  3.21it/s]

{'loss': 1.2132, 'grad_norm': 1.6554101705551147, 'learning_rate': 0.00013421491541577317, 'epoch': 0.33}


 33%|███▎      | 5533/16798 [24:27<57:51,  3.24it/s]

{'loss': 1.2884, 'grad_norm': 1.7300492525100708, 'learning_rate': 0.00013420300214438886, 'epoch': 0.33}


 33%|███▎      | 5534/16798 [24:27<54:54,  3.42it/s]

{'loss': 1.5789, 'grad_norm': 2.144639492034912, 'learning_rate': 0.00013419108887300453, 'epoch': 0.33}


 33%|███▎      | 5535/16798 [24:27<54:53,  3.42it/s]

{'loss': 0.6628, 'grad_norm': 1.8239648342132568, 'learning_rate': 0.00013417917560162022, 'epoch': 0.33}


 33%|███▎      | 5536/16798 [24:28<55:32,  3.38it/s]

{'loss': 1.3194, 'grad_norm': 1.789846658706665, 'learning_rate': 0.00013416726233023588, 'epoch': 0.33}


 33%|███▎      | 5537/16798 [24:28<57:09,  3.28it/s]

{'loss': 1.1388, 'grad_norm': 1.7733078002929688, 'learning_rate': 0.00013415534905885157, 'epoch': 0.33}


 33%|███▎      | 5538/16798 [24:28<54:25,  3.45it/s]

{'loss': 1.4471, 'grad_norm': 2.842670202255249, 'learning_rate': 0.00013414343578746724, 'epoch': 0.33}


 33%|███▎      | 5539/16798 [24:29<55:35,  3.38it/s]

{'loss': 1.1419, 'grad_norm': 1.7093827724456787, 'learning_rate': 0.00013413152251608293, 'epoch': 0.33}


 33%|███▎      | 5540/16798 [24:29<55:49,  3.36it/s]

{'loss': 1.2393, 'grad_norm': 1.7664542198181152, 'learning_rate': 0.0001341196092446986, 'epoch': 0.33}


 33%|███▎      | 5541/16798 [24:29<54:38,  3.43it/s]

{'loss': 1.3679, 'grad_norm': 1.8265269994735718, 'learning_rate': 0.00013410769597331428, 'epoch': 0.33}


 33%|███▎      | 5542/16798 [24:29<53:55,  3.48it/s]

{'loss': 1.0569, 'grad_norm': 2.027296304702759, 'learning_rate': 0.00013409578270192994, 'epoch': 0.33}


 33%|███▎      | 5543/16798 [24:30<53:28,  3.51it/s]

{'loss': 0.9333, 'grad_norm': 1.704870343208313, 'learning_rate': 0.00013408386943054563, 'epoch': 0.33}


 33%|███▎      | 5544/16798 [24:30<56:06,  3.34it/s]

{'loss': 1.3818, 'grad_norm': 2.4763054847717285, 'learning_rate': 0.0001340719561591613, 'epoch': 0.33}


 33%|███▎      | 5545/16798 [24:30<55:30,  3.38it/s]

{'loss': 0.5947, 'grad_norm': 1.30365788936615, 'learning_rate': 0.000134060042887777, 'epoch': 0.33}


 33%|███▎      | 5546/16798 [24:31<57:09,  3.28it/s]

{'loss': 0.8077, 'grad_norm': 1.3758091926574707, 'learning_rate': 0.00013404812961639265, 'epoch': 0.33}


 33%|███▎      | 5547/16798 [24:31<57:04,  3.29it/s]

{'loss': 0.6513, 'grad_norm': 1.6010555028915405, 'learning_rate': 0.00013403621634500834, 'epoch': 0.33}


 33%|███▎      | 5548/16798 [24:31<57:27,  3.26it/s]

{'loss': 0.5327, 'grad_norm': 1.2627880573272705, 'learning_rate': 0.000134024303073624, 'epoch': 0.33}


 33%|███▎      | 5549/16798 [24:32<57:26,  3.26it/s]

{'loss': 0.3614, 'grad_norm': 0.9015620350837708, 'learning_rate': 0.0001340123898022397, 'epoch': 0.33}


 33%|███▎      | 5550/16798 [24:32<57:17,  3.27it/s]

{'loss': 0.5557, 'grad_norm': 1.1841462850570679, 'learning_rate': 0.0001340004765308554, 'epoch': 0.33}


 33%|███▎      | 5551/16798 [24:32<1:00:12,  3.11it/s]

{'loss': 1.9263, 'grad_norm': 1.917446494102478, 'learning_rate': 0.00013398856325947108, 'epoch': 0.33}


 33%|███▎      | 5552/16798 [24:33<59:44,  3.14it/s]  

{'loss': 1.8102, 'grad_norm': 1.8713587522506714, 'learning_rate': 0.00013397664998808674, 'epoch': 0.33}


 33%|███▎      | 5553/16798 [24:33<59:42,  3.14it/s]

{'loss': 2.1148, 'grad_norm': 2.071556806564331, 'learning_rate': 0.00013396473671670243, 'epoch': 0.33}


 33%|███▎      | 5554/16798 [24:33<58:24,  3.21it/s]

{'loss': 2.0559, 'grad_norm': 1.8909904956817627, 'learning_rate': 0.0001339528234453181, 'epoch': 0.33}


 33%|███▎      | 5555/16798 [24:33<57:51,  3.24it/s]

{'loss': 1.6377, 'grad_norm': 1.6169850826263428, 'learning_rate': 0.0001339409101739338, 'epoch': 0.33}


 33%|███▎      | 5556/16798 [24:34<56:23,  3.32it/s]

{'loss': 2.3983, 'grad_norm': 2.219710350036621, 'learning_rate': 0.00013392899690254945, 'epoch': 0.33}


 33%|███▎      | 5557/16798 [24:34<56:54,  3.29it/s]

{'loss': 1.8423, 'grad_norm': 1.8184493780136108, 'learning_rate': 0.00013391708363116514, 'epoch': 0.33}


 33%|███▎      | 5558/16798 [24:34<57:00,  3.29it/s]

{'loss': 2.3809, 'grad_norm': 2.1128461360931396, 'learning_rate': 0.0001339051703597808, 'epoch': 0.33}


 33%|███▎      | 5559/16798 [24:35<57:43,  3.24it/s]

{'loss': 1.559, 'grad_norm': 1.7122031450271606, 'learning_rate': 0.0001338932570883965, 'epoch': 0.33}


 33%|███▎      | 5560/16798 [24:35<58:43,  3.19it/s]

{'loss': 1.8239, 'grad_norm': 1.7069753408432007, 'learning_rate': 0.00013388134381701216, 'epoch': 0.33}


 33%|███▎      | 5561/16798 [24:35<57:09,  3.28it/s]

{'loss': 1.5344, 'grad_norm': 1.7112877368927002, 'learning_rate': 0.00013386943054562785, 'epoch': 0.33}


 33%|███▎      | 5562/16798 [24:36<1:00:05,  3.12it/s]

{'loss': 1.6708, 'grad_norm': 1.7970865964889526, 'learning_rate': 0.00013385751727424352, 'epoch': 0.33}


 33%|███▎      | 5563/16798 [24:36<59:07,  3.17it/s]  

{'loss': 1.7241, 'grad_norm': 2.1446211338043213, 'learning_rate': 0.0001338456040028592, 'epoch': 0.33}


 33%|███▎      | 5564/16798 [24:36<56:55,  3.29it/s]

{'loss': 1.5531, 'grad_norm': 1.803877830505371, 'learning_rate': 0.00013383369073147487, 'epoch': 0.33}


 33%|███▎      | 5565/16798 [24:37<55:55,  3.35it/s]

{'loss': 1.397, 'grad_norm': 2.139434814453125, 'learning_rate': 0.00013382177746009056, 'epoch': 0.33}


 33%|███▎      | 5566/16798 [24:37<56:25,  3.32it/s]

{'loss': 1.5561, 'grad_norm': 1.700617790222168, 'learning_rate': 0.00013380986418870622, 'epoch': 0.33}


 33%|███▎      | 5567/16798 [24:37<58:08,  3.22it/s]

{'loss': 1.911, 'grad_norm': 1.947622537612915, 'learning_rate': 0.00013379795091732191, 'epoch': 0.33}


 33%|███▎      | 5568/16798 [24:37<57:55,  3.23it/s]

{'loss': 1.3961, 'grad_norm': 1.8376526832580566, 'learning_rate': 0.00013378603764593758, 'epoch': 0.33}


 33%|███▎      | 5569/16798 [24:38<57:49,  3.24it/s]

{'loss': 1.778, 'grad_norm': 1.8301142454147339, 'learning_rate': 0.00013377412437455324, 'epoch': 0.33}


 33%|███▎      | 5570/16798 [24:38<58:42,  3.19it/s]

{'loss': 1.4221, 'grad_norm': 1.5831108093261719, 'learning_rate': 0.00013376221110316893, 'epoch': 0.33}


 33%|███▎      | 5571/16798 [24:38<57:36,  3.25it/s]

{'loss': 1.3886, 'grad_norm': 1.8408973217010498, 'learning_rate': 0.0001337502978317846, 'epoch': 0.33}


 33%|███▎      | 5572/16798 [24:39<56:46,  3.30it/s]

{'loss': 1.556, 'grad_norm': 1.9420535564422607, 'learning_rate': 0.0001337383845604003, 'epoch': 0.33}


 33%|███▎      | 5573/16798 [24:39<53:55,  3.47it/s]

{'loss': 1.408, 'grad_norm': 1.9485394954681396, 'learning_rate': 0.00013372647128901595, 'epoch': 0.33}


 33%|███▎      | 5574/16798 [24:39<53:58,  3.47it/s]

{'loss': 1.8138, 'grad_norm': 2.1750526428222656, 'learning_rate': 0.00013371455801763164, 'epoch': 0.33}


 33%|███▎      | 5575/16798 [24:40<53:57,  3.47it/s]

{'loss': 1.4364, 'grad_norm': 1.7062894105911255, 'learning_rate': 0.0001337026447462473, 'epoch': 0.33}


 33%|███▎      | 5576/16798 [24:40<54:51,  3.41it/s]

{'loss': 1.7465, 'grad_norm': 2.082963705062866, 'learning_rate': 0.000133690731474863, 'epoch': 0.33}


 33%|███▎      | 5577/16798 [24:40<55:41,  3.36it/s]

{'loss': 1.5774, 'grad_norm': 1.900604009628296, 'learning_rate': 0.00013367881820347866, 'epoch': 0.33}


 33%|███▎      | 5578/16798 [24:40<56:15,  3.32it/s]

{'loss': 1.5877, 'grad_norm': 2.0840964317321777, 'learning_rate': 0.00013366690493209435, 'epoch': 0.33}


 33%|███▎      | 5579/16798 [24:41<57:13,  3.27it/s]

{'loss': 1.7093, 'grad_norm': 2.0991599559783936, 'learning_rate': 0.00013365499166071001, 'epoch': 0.33}


 33%|███▎      | 5580/16798 [24:41<55:17,  3.38it/s]

{'loss': 1.3042, 'grad_norm': 1.758187174797058, 'learning_rate': 0.0001336430783893257, 'epoch': 0.33}


 33%|███▎      | 5581/16798 [24:41<56:09,  3.33it/s]

{'loss': 1.5682, 'grad_norm': 2.074810266494751, 'learning_rate': 0.0001336311651179414, 'epoch': 0.33}


 33%|███▎      | 5582/16798 [24:42<56:34,  3.30it/s]

{'loss': 1.8128, 'grad_norm': 2.1922407150268555, 'learning_rate': 0.0001336192518465571, 'epoch': 0.33}


 33%|███▎      | 5583/16798 [24:42<55:54,  3.34it/s]

{'loss': 1.5809, 'grad_norm': 2.2633886337280273, 'learning_rate': 0.00013360733857517275, 'epoch': 0.33}


 33%|███▎      | 5584/16798 [24:42<56:07,  3.33it/s]

{'loss': 1.3692, 'grad_norm': 1.9151517152786255, 'learning_rate': 0.00013359542530378844, 'epoch': 0.33}


 33%|███▎      | 5585/16798 [24:43<55:50,  3.35it/s]

{'loss': 1.2653, 'grad_norm': 1.802830696105957, 'learning_rate': 0.0001335835120324041, 'epoch': 0.33}


 33%|███▎      | 5586/16798 [24:43<55:39,  3.36it/s]

{'loss': 1.308, 'grad_norm': 1.9392871856689453, 'learning_rate': 0.0001335715987610198, 'epoch': 0.33}


 33%|███▎      | 5587/16798 [24:43<56:42,  3.29it/s]

{'loss': 1.8306, 'grad_norm': 2.0624523162841797, 'learning_rate': 0.00013355968548963546, 'epoch': 0.33}


 33%|███▎      | 5588/16798 [24:43<54:07,  3.45it/s]

{'loss': 1.3078, 'grad_norm': 2.1824557781219482, 'learning_rate': 0.00013354777221825115, 'epoch': 0.33}


 33%|███▎      | 5589/16798 [24:44<52:01,  3.59it/s]

{'loss': 1.1587, 'grad_norm': 1.8297481536865234, 'learning_rate': 0.00013353585894686681, 'epoch': 0.33}


 33%|███▎      | 5590/16798 [24:44<52:22,  3.57it/s]

{'loss': 1.172, 'grad_norm': 1.7674388885498047, 'learning_rate': 0.0001335239456754825, 'epoch': 0.33}


 33%|███▎      | 5591/16798 [24:44<52:31,  3.56it/s]

{'loss': 1.0817, 'grad_norm': 1.9147071838378906, 'learning_rate': 0.00013351203240409817, 'epoch': 0.33}


 33%|███▎      | 5592/16798 [24:45<52:45,  3.54it/s]

{'loss': 1.2995, 'grad_norm': 1.7486720085144043, 'learning_rate': 0.00013350011913271386, 'epoch': 0.33}


 33%|███▎      | 5593/16798 [24:45<54:36,  3.42it/s]

{'loss': 1.0042, 'grad_norm': 1.6520224809646606, 'learning_rate': 0.00013348820586132952, 'epoch': 0.33}


 33%|███▎      | 5594/16798 [24:45<53:52,  3.47it/s]

{'loss': 1.0266, 'grad_norm': 1.8341025114059448, 'learning_rate': 0.0001334762925899452, 'epoch': 0.33}


 33%|███▎      | 5595/16798 [24:45<53:05,  3.52it/s]

{'loss': 0.9689, 'grad_norm': 1.9081326723098755, 'learning_rate': 0.00013346437931856088, 'epoch': 0.33}


 33%|███▎      | 5596/16798 [24:46<56:22,  3.31it/s]

{'loss': 1.3335, 'grad_norm': 2.022374391555786, 'learning_rate': 0.00013345246604717657, 'epoch': 0.33}


 33%|███▎      | 5597/16798 [24:46<55:51,  3.34it/s]

{'loss': 1.0619, 'grad_norm': 1.856846809387207, 'learning_rate': 0.00013344055277579223, 'epoch': 0.33}


 33%|███▎      | 5598/16798 [24:46<55:58,  3.34it/s]

{'loss': 0.9612, 'grad_norm': 1.7258042097091675, 'learning_rate': 0.00013342863950440792, 'epoch': 0.33}


 33%|███▎      | 5599/16798 [24:47<55:03,  3.39it/s]

{'loss': 0.6242, 'grad_norm': 1.240868091583252, 'learning_rate': 0.00013341672623302359, 'epoch': 0.33}


 33%|███▎      | 5600/16798 [24:47<56:01,  3.33it/s]

{'loss': 0.8346, 'grad_norm': 1.4453060626983643, 'learning_rate': 0.00013340481296163928, 'epoch': 0.33}


 33%|███▎      | 5601/16798 [24:47<56:29,  3.30it/s]

{'loss': 1.7137, 'grad_norm': 2.3099279403686523, 'learning_rate': 0.00013339289969025494, 'epoch': 0.33}


 33%|███▎      | 5602/16798 [24:48<57:13,  3.26it/s]

{'loss': 1.6388, 'grad_norm': 3.0547053813934326, 'learning_rate': 0.00013338098641887063, 'epoch': 0.33}


 33%|███▎      | 5603/16798 [24:48<55:46,  3.35it/s]

{'loss': 1.9135, 'grad_norm': 1.7103148698806763, 'learning_rate': 0.0001333690731474863, 'epoch': 0.33}


 33%|███▎      | 5604/16798 [24:48<54:30,  3.42it/s]

{'loss': 1.7417, 'grad_norm': 1.6249451637268066, 'learning_rate': 0.00013335715987610199, 'epoch': 0.33}


 33%|███▎      | 5605/16798 [24:48<55:20,  3.37it/s]

{'loss': 1.8003, 'grad_norm': 1.998395323753357, 'learning_rate': 0.00013334524660471765, 'epoch': 0.33}


 33%|███▎      | 5606/16798 [24:49<56:27,  3.30it/s]

{'loss': 1.8221, 'grad_norm': 1.7278404235839844, 'learning_rate': 0.00013333333333333334, 'epoch': 0.33}


 33%|███▎      | 5607/16798 [24:49<56:14,  3.32it/s]

{'loss': 2.2854, 'grad_norm': 2.103632688522339, 'learning_rate': 0.000133321420061949, 'epoch': 0.33}


 33%|███▎      | 5608/16798 [24:49<54:57,  3.39it/s]

{'loss': 1.8258, 'grad_norm': 1.8170515298843384, 'learning_rate': 0.0001333095067905647, 'epoch': 0.33}


 33%|███▎      | 5609/16798 [24:50<55:12,  3.38it/s]

{'loss': 2.0633, 'grad_norm': 1.8746205568313599, 'learning_rate': 0.00013329759351918036, 'epoch': 0.33}


 33%|███▎      | 5610/16798 [24:50<56:37,  3.29it/s]

{'loss': 1.6399, 'grad_norm': 2.0865087509155273, 'learning_rate': 0.00013328568024779605, 'epoch': 0.33}


 33%|███▎      | 5611/16798 [24:50<56:12,  3.32it/s]

{'loss': 1.7937, 'grad_norm': 2.3203821182250977, 'learning_rate': 0.0001332737669764117, 'epoch': 0.33}


 33%|███▎      | 5612/16798 [24:51<57:16,  3.25it/s]

{'loss': 1.4291, 'grad_norm': 1.6186227798461914, 'learning_rate': 0.00013326185370502743, 'epoch': 0.33}


 33%|███▎      | 5613/16798 [24:51<57:02,  3.27it/s]

{'loss': 1.4496, 'grad_norm': 2.1206741333007812, 'learning_rate': 0.0001332499404336431, 'epoch': 0.33}


 33%|███▎      | 5614/16798 [24:51<55:51,  3.34it/s]

{'loss': 1.6656, 'grad_norm': 2.1682963371276855, 'learning_rate': 0.00013323802716225878, 'epoch': 0.33}


 33%|███▎      | 5615/16798 [24:51<55:40,  3.35it/s]

{'loss': 1.8909, 'grad_norm': 1.9891924858093262, 'learning_rate': 0.00013322611389087445, 'epoch': 0.33}


 33%|███▎      | 5616/16798 [24:52<56:23,  3.31it/s]

{'loss': 1.8818, 'grad_norm': 2.18588924407959, 'learning_rate': 0.00013321420061949014, 'epoch': 0.33}


 33%|███▎      | 5617/16798 [24:52<56:02,  3.33it/s]

{'loss': 1.6599, 'grad_norm': 1.9217928647994995, 'learning_rate': 0.0001332022873481058, 'epoch': 0.33}


 33%|███▎      | 5618/16798 [24:52<55:43,  3.34it/s]

{'loss': 1.8172, 'grad_norm': 2.012291669845581, 'learning_rate': 0.0001331903740767215, 'epoch': 0.33}


 33%|███▎      | 5619/16798 [24:53<56:04,  3.32it/s]

{'loss': 1.7155, 'grad_norm': 2.0296525955200195, 'learning_rate': 0.00013317846080533716, 'epoch': 0.33}


 33%|███▎      | 5620/16798 [24:53<55:15,  3.37it/s]

{'loss': 1.7965, 'grad_norm': 1.9152406454086304, 'learning_rate': 0.00013316654753395285, 'epoch': 0.33}


 33%|███▎      | 5621/16798 [24:53<55:13,  3.37it/s]

{'loss': 0.9569, 'grad_norm': 1.5750231742858887, 'learning_rate': 0.0001331546342625685, 'epoch': 0.33}


 33%|███▎      | 5622/16798 [24:54<55:17,  3.37it/s]

{'loss': 1.7126, 'grad_norm': 1.9800390005111694, 'learning_rate': 0.0001331427209911842, 'epoch': 0.33}


 33%|███▎      | 5623/16798 [24:54<54:27,  3.42it/s]

{'loss': 1.7684, 'grad_norm': 2.0816919803619385, 'learning_rate': 0.00013313080771979987, 'epoch': 0.33}


 33%|███▎      | 5624/16798 [24:54<52:11,  3.57it/s]

{'loss': 1.6684, 'grad_norm': 2.1786158084869385, 'learning_rate': 0.00013311889444841556, 'epoch': 0.33}


 33%|███▎      | 5625/16798 [24:54<52:39,  3.54it/s]

{'loss': 1.5244, 'grad_norm': 1.8086389303207397, 'learning_rate': 0.00013310698117703122, 'epoch': 0.33}


 33%|███▎      | 5626/16798 [24:55<55:01,  3.38it/s]

{'loss': 1.6081, 'grad_norm': 2.0409762859344482, 'learning_rate': 0.00013309506790564688, 'epoch': 0.33}


 33%|███▎      | 5627/16798 [24:55<54:08,  3.44it/s]

{'loss': 1.5672, 'grad_norm': 2.1004765033721924, 'learning_rate': 0.00013308315463426257, 'epoch': 0.33}


 34%|███▎      | 5628/16798 [24:55<55:13,  3.37it/s]

{'loss': 1.2088, 'grad_norm': 1.7034145593643188, 'learning_rate': 0.00013307124136287824, 'epoch': 0.34}


 34%|███▎      | 5629/16798 [24:56<55:02,  3.38it/s]

{'loss': 1.8069, 'grad_norm': 2.065126657485962, 'learning_rate': 0.00013305932809149393, 'epoch': 0.34}


 34%|███▎      | 5630/16798 [24:56<54:57,  3.39it/s]

{'loss': 1.7396, 'grad_norm': 1.9929450750350952, 'learning_rate': 0.0001330474148201096, 'epoch': 0.34}


 34%|███▎      | 5631/16798 [24:56<55:29,  3.35it/s]

{'loss': 1.6735, 'grad_norm': 2.0807998180389404, 'learning_rate': 0.00013303550154872528, 'epoch': 0.34}


 34%|███▎      | 5632/16798 [24:56<55:07,  3.38it/s]

{'loss': 1.6837, 'grad_norm': 2.676832437515259, 'learning_rate': 0.00013302358827734095, 'epoch': 0.34}


 34%|███▎      | 5633/16798 [24:57<54:26,  3.42it/s]

{'loss': 1.3324, 'grad_norm': 2.28265118598938, 'learning_rate': 0.00013301167500595664, 'epoch': 0.34}


 34%|███▎      | 5634/16798 [24:57<54:25,  3.42it/s]

{'loss': 1.4169, 'grad_norm': 2.023419141769409, 'learning_rate': 0.0001329997617345723, 'epoch': 0.34}


 34%|███▎      | 5635/16798 [24:57<54:46,  3.40it/s]

{'loss': 1.4642, 'grad_norm': 2.128903865814209, 'learning_rate': 0.000132987848463188, 'epoch': 0.34}


 34%|███▎      | 5636/16798 [24:58<54:46,  3.40it/s]

{'loss': 1.4583, 'grad_norm': 1.7934925556182861, 'learning_rate': 0.00013297593519180366, 'epoch': 0.34}


 34%|███▎      | 5637/16798 [24:58<56:38,  3.28it/s]

{'loss': 1.2973, 'grad_norm': 2.140220880508423, 'learning_rate': 0.00013296402192041935, 'epoch': 0.34}


 34%|███▎      | 5638/16798 [24:58<56:00,  3.32it/s]

{'loss': 1.3734, 'grad_norm': 1.5823827981948853, 'learning_rate': 0.000132952108649035, 'epoch': 0.34}


 34%|███▎      | 5639/16798 [24:59<55:47,  3.33it/s]

{'loss': 1.1409, 'grad_norm': 1.9866479635238647, 'learning_rate': 0.0001329401953776507, 'epoch': 0.34}


 34%|███▎      | 5640/16798 [24:59<55:04,  3.38it/s]

{'loss': 1.0416, 'grad_norm': 1.7691258192062378, 'learning_rate': 0.00013292828210626636, 'epoch': 0.34}


 34%|███▎      | 5641/16798 [24:59<53:25,  3.48it/s]

{'loss': 1.1643, 'grad_norm': 1.754677176475525, 'learning_rate': 0.00013291636883488206, 'epoch': 0.34}


 34%|███▎      | 5642/16798 [24:59<53:18,  3.49it/s]

{'loss': 0.8849, 'grad_norm': 1.425963044166565, 'learning_rate': 0.00013290445556349772, 'epoch': 0.34}


 34%|███▎      | 5643/16798 [25:00<54:41,  3.40it/s]

{'loss': 0.8488, 'grad_norm': 1.4511852264404297, 'learning_rate': 0.00013289254229211344, 'epoch': 0.34}


 34%|███▎      | 5644/16798 [25:00<54:52,  3.39it/s]

{'loss': 0.7057, 'grad_norm': 1.4130127429962158, 'learning_rate': 0.0001328806290207291, 'epoch': 0.34}


 34%|███▎      | 5645/16798 [25:00<56:57,  3.26it/s]

{'loss': 1.1855, 'grad_norm': 1.757793664932251, 'learning_rate': 0.0001328687157493448, 'epoch': 0.34}


 34%|███▎      | 5646/16798 [25:01<54:51,  3.39it/s]

{'loss': 0.8147, 'grad_norm': 1.8093634843826294, 'learning_rate': 0.00013285680247796046, 'epoch': 0.34}


 34%|███▎      | 5647/16798 [25:01<54:46,  3.39it/s]

{'loss': 0.5191, 'grad_norm': 0.9889772534370422, 'learning_rate': 0.00013284488920657615, 'epoch': 0.34}


 34%|███▎      | 5648/16798 [25:01<55:30,  3.35it/s]

{'loss': 0.8421, 'grad_norm': 1.3528915643692017, 'learning_rate': 0.0001328329759351918, 'epoch': 0.34}


 34%|███▎      | 5649/16798 [25:02<57:14,  3.25it/s]

{'loss': 0.8895, 'grad_norm': 1.762344241142273, 'learning_rate': 0.0001328210626638075, 'epoch': 0.34}


 34%|███▎      | 5650/16798 [25:02<57:38,  3.22it/s]

{'loss': 0.741, 'grad_norm': 1.4146360158920288, 'learning_rate': 0.00013280914939242316, 'epoch': 0.34}


 34%|███▎      | 5651/16798 [25:02<56:48,  3.27it/s]

{'loss': 1.8018, 'grad_norm': 1.7636206150054932, 'learning_rate': 0.00013279723612103885, 'epoch': 0.34}


 34%|███▎      | 5652/16798 [25:02<56:05,  3.31it/s]

{'loss': 1.7496, 'grad_norm': 1.5958492755889893, 'learning_rate': 0.00013278532284965452, 'epoch': 0.34}


 34%|███▎      | 5653/16798 [25:03<54:37,  3.40it/s]

{'loss': 1.8575, 'grad_norm': 1.7688381671905518, 'learning_rate': 0.0001327734095782702, 'epoch': 0.34}


 34%|███▎      | 5654/16798 [25:03<54:55,  3.38it/s]

{'loss': 1.9913, 'grad_norm': 2.2392797470092773, 'learning_rate': 0.00013276149630688587, 'epoch': 0.34}


 34%|███▎      | 5655/16798 [25:03<54:02,  3.44it/s]

{'loss': 2.0204, 'grad_norm': 1.8914685249328613, 'learning_rate': 0.00013274958303550156, 'epoch': 0.34}


 34%|███▎      | 5656/16798 [25:04<52:24,  3.54it/s]

{'loss': 2.44, 'grad_norm': 2.226395606994629, 'learning_rate': 0.00013273766976411723, 'epoch': 0.34}


 34%|███▎      | 5657/16798 [25:04<54:02,  3.44it/s]

{'loss': 2.3076, 'grad_norm': 1.9728877544403076, 'learning_rate': 0.00013272575649273292, 'epoch': 0.34}


 34%|███▎      | 5658/16798 [25:04<54:37,  3.40it/s]

{'loss': 2.0592, 'grad_norm': 2.1408355236053467, 'learning_rate': 0.00013271384322134858, 'epoch': 0.34}


 34%|███▎      | 5659/16798 [25:04<54:55,  3.38it/s]

{'loss': 2.2142, 'grad_norm': 2.4292209148406982, 'learning_rate': 0.00013270192994996427, 'epoch': 0.34}


 34%|███▎      | 5660/16798 [25:05<56:08,  3.31it/s]

{'loss': 1.7904, 'grad_norm': 1.806931734085083, 'learning_rate': 0.00013269001667857994, 'epoch': 0.34}


 34%|███▎      | 5661/16798 [25:05<54:02,  3.44it/s]

{'loss': 1.455, 'grad_norm': 1.753893494606018, 'learning_rate': 0.00013267810340719563, 'epoch': 0.34}


 34%|███▎      | 5662/16798 [25:05<54:30,  3.40it/s]

{'loss': 1.4913, 'grad_norm': 2.21522855758667, 'learning_rate': 0.0001326661901358113, 'epoch': 0.34}


 34%|███▎      | 5663/16798 [25:06<56:50,  3.27it/s]

{'loss': 1.5461, 'grad_norm': 1.768222451210022, 'learning_rate': 0.00013265427686442698, 'epoch': 0.34}


 34%|███▎      | 5664/16798 [25:06<59:06,  3.14it/s]

{'loss': 2.0297, 'grad_norm': 1.8958792686462402, 'learning_rate': 0.00013264236359304265, 'epoch': 0.34}


 34%|███▎      | 5665/16798 [25:06<56:34,  3.28it/s]

{'loss': 1.5458, 'grad_norm': 1.824540376663208, 'learning_rate': 0.00013263045032165834, 'epoch': 0.34}


 34%|███▎      | 5666/16798 [25:07<55:47,  3.33it/s]

{'loss': 1.2237, 'grad_norm': 1.7861437797546387, 'learning_rate': 0.000132618537050274, 'epoch': 0.34}


 34%|███▎      | 5667/16798 [25:07<54:44,  3.39it/s]

{'loss': 1.4964, 'grad_norm': 2.09474778175354, 'learning_rate': 0.0001326066237788897, 'epoch': 0.34}


 34%|███▎      | 5668/16798 [25:07<54:35,  3.40it/s]

{'loss': 1.2525, 'grad_norm': 1.6747071743011475, 'learning_rate': 0.00013259471050750535, 'epoch': 0.34}


 34%|███▎      | 5669/16798 [25:07<54:34,  3.40it/s]

{'loss': 1.4064, 'grad_norm': 2.2680916786193848, 'learning_rate': 0.00013258279723612104, 'epoch': 0.34}


 34%|███▍      | 5670/16798 [25:08<56:07,  3.30it/s]

{'loss': 1.0465, 'grad_norm': 1.2809393405914307, 'learning_rate': 0.0001325708839647367, 'epoch': 0.34}


 34%|███▍      | 5671/16798 [25:08<55:28,  3.34it/s]

{'loss': 1.8228, 'grad_norm': 2.131032943725586, 'learning_rate': 0.0001325589706933524, 'epoch': 0.34}


 34%|███▍      | 5672/16798 [25:08<55:30,  3.34it/s]

{'loss': 1.3007, 'grad_norm': 1.975441813468933, 'learning_rate': 0.00013254705742196806, 'epoch': 0.34}


 34%|███▍      | 5673/16798 [25:09<55:09,  3.36it/s]

{'loss': 1.3459, 'grad_norm': 1.8660368919372559, 'learning_rate': 0.00013253514415058378, 'epoch': 0.34}


 34%|███▍      | 5674/16798 [25:09<55:24,  3.35it/s]

{'loss': 1.4049, 'grad_norm': 1.6053452491760254, 'learning_rate': 0.00013252323087919944, 'epoch': 0.34}


 34%|███▍      | 5675/16798 [25:09<56:59,  3.25it/s]

{'loss': 1.4622, 'grad_norm': 1.9242905378341675, 'learning_rate': 0.00013251131760781514, 'epoch': 0.34}


 34%|███▍      | 5676/16798 [25:10<55:35,  3.33it/s]

{'loss': 1.5131, 'grad_norm': 1.8374642133712769, 'learning_rate': 0.0001324994043364308, 'epoch': 0.34}


 34%|███▍      | 5677/16798 [25:10<54:14,  3.42it/s]

{'loss': 1.5401, 'grad_norm': 1.7507140636444092, 'learning_rate': 0.0001324874910650465, 'epoch': 0.34}


 34%|███▍      | 5678/16798 [25:10<53:49,  3.44it/s]

{'loss': 1.3484, 'grad_norm': 1.6439034938812256, 'learning_rate': 0.00013247557779366215, 'epoch': 0.34}


 34%|███▍      | 5679/16798 [25:10<54:34,  3.40it/s]

{'loss': 1.559, 'grad_norm': 1.9252058267593384, 'learning_rate': 0.00013246366452227784, 'epoch': 0.34}


 34%|███▍      | 5680/16798 [25:11<56:19,  3.29it/s]

{'loss': 1.7599, 'grad_norm': 2.3353517055511475, 'learning_rate': 0.0001324517512508935, 'epoch': 0.34}


 34%|███▍      | 5681/16798 [25:11<57:03,  3.25it/s]

{'loss': 1.0936, 'grad_norm': 1.4826654195785522, 'learning_rate': 0.0001324398379795092, 'epoch': 0.34}


 34%|███▍      | 5682/16798 [25:11<56:01,  3.31it/s]

{'loss': 1.4409, 'grad_norm': 1.7638530731201172, 'learning_rate': 0.00013242792470812486, 'epoch': 0.34}


 34%|███▍      | 5683/16798 [25:12<55:09,  3.36it/s]

{'loss': 1.2063, 'grad_norm': 1.669760823249817, 'learning_rate': 0.00013241601143674053, 'epoch': 0.34}


 34%|███▍      | 5684/16798 [25:12<53:37,  3.45it/s]

{'loss': 1.3794, 'grad_norm': 1.9551113843917847, 'learning_rate': 0.00013240409816535622, 'epoch': 0.34}


 34%|███▍      | 5685/16798 [25:12<52:16,  3.54it/s]

{'loss': 1.5241, 'grad_norm': 2.0015058517456055, 'learning_rate': 0.00013239218489397188, 'epoch': 0.34}


 34%|███▍      | 5686/16798 [25:12<54:27,  3.40it/s]

{'loss': 1.4385, 'grad_norm': 2.1645376682281494, 'learning_rate': 0.00013238027162258757, 'epoch': 0.34}


 34%|███▍      | 5687/16798 [25:13<53:08,  3.48it/s]

{'loss': 1.2831, 'grad_norm': 1.841526746749878, 'learning_rate': 0.00013236835835120323, 'epoch': 0.34}


 34%|███▍      | 5688/16798 [25:13<52:52,  3.50it/s]

{'loss': 1.4723, 'grad_norm': 1.759671688079834, 'learning_rate': 0.00013235644507981893, 'epoch': 0.34}


 34%|███▍      | 5689/16798 [25:13<52:28,  3.53it/s]

{'loss': 1.7493, 'grad_norm': 2.0630342960357666, 'learning_rate': 0.0001323445318084346, 'epoch': 0.34}


 34%|███▍      | 5690/16798 [25:14<52:55,  3.50it/s]

{'loss': 1.2814, 'grad_norm': 1.8502649068832397, 'learning_rate': 0.00013233261853705028, 'epoch': 0.34}


 34%|███▍      | 5691/16798 [25:14<52:33,  3.52it/s]

{'loss': 0.9907, 'grad_norm': 1.6639243364334106, 'learning_rate': 0.00013232070526566594, 'epoch': 0.34}


 34%|███▍      | 5692/16798 [25:14<52:18,  3.54it/s]

{'loss': 1.4429, 'grad_norm': 1.9509116411209106, 'learning_rate': 0.00013230879199428163, 'epoch': 0.34}


 34%|███▍      | 5693/16798 [25:14<51:06,  3.62it/s]

{'loss': 1.261, 'grad_norm': 2.121553659439087, 'learning_rate': 0.0001322968787228973, 'epoch': 0.34}


 34%|███▍      | 5694/16798 [25:15<52:01,  3.56it/s]

{'loss': 1.1948, 'grad_norm': 2.090960741043091, 'learning_rate': 0.000132284965451513, 'epoch': 0.34}


 34%|███▍      | 5695/16798 [25:15<54:09,  3.42it/s]

{'loss': 1.3122, 'grad_norm': 1.835831642150879, 'learning_rate': 0.00013227305218012865, 'epoch': 0.34}


 34%|███▍      | 5696/16798 [25:15<53:18,  3.47it/s]

{'loss': 0.9242, 'grad_norm': 1.484345555305481, 'learning_rate': 0.00013226113890874434, 'epoch': 0.34}


 34%|███▍      | 5697/16798 [25:16<53:14,  3.48it/s]

{'loss': 0.3986, 'grad_norm': 0.892946183681488, 'learning_rate': 0.00013224922563736, 'epoch': 0.34}


 34%|███▍      | 5698/16798 [25:16<52:17,  3.54it/s]

{'loss': 0.4045, 'grad_norm': 1.0093061923980713, 'learning_rate': 0.0001322373123659757, 'epoch': 0.34}


 34%|███▍      | 5699/16798 [25:16<52:45,  3.51it/s]

{'loss': 0.2847, 'grad_norm': 0.7349572777748108, 'learning_rate': 0.00013222539909459136, 'epoch': 0.34}


 34%|███▍      | 5700/16798 [25:17<55:27,  3.34it/s]

{'loss': 0.3398, 'grad_norm': 0.8831032514572144, 'learning_rate': 0.00013221348582320705, 'epoch': 0.34}


 34%|███▍      | 5701/16798 [25:17<55:44,  3.32it/s]

{'loss': 2.0065, 'grad_norm': 1.71518874168396, 'learning_rate': 0.00013220157255182272, 'epoch': 0.34}


 34%|███▍      | 5702/16798 [25:17<52:50,  3.50it/s]

{'loss': 1.9454, 'grad_norm': 1.7046738862991333, 'learning_rate': 0.0001321896592804384, 'epoch': 0.34}


 34%|███▍      | 5703/16798 [25:17<53:30,  3.46it/s]

{'loss': 2.04, 'grad_norm': 1.7550334930419922, 'learning_rate': 0.00013217774600905407, 'epoch': 0.34}


 34%|███▍      | 5704/16798 [25:18<52:49,  3.50it/s]

{'loss': 2.0637, 'grad_norm': 2.053784132003784, 'learning_rate': 0.0001321658327376698, 'epoch': 0.34}


 34%|███▍      | 5705/16798 [25:18<53:22,  3.46it/s]

{'loss': 2.3683, 'grad_norm': 2.210493326187134, 'learning_rate': 0.00013215391946628545, 'epoch': 0.34}


 34%|███▍      | 5706/16798 [25:18<54:40,  3.38it/s]

{'loss': 1.7152, 'grad_norm': 2.0002799034118652, 'learning_rate': 0.00013214200619490114, 'epoch': 0.34}


 34%|███▍      | 5707/16798 [25:19<57:51,  3.19it/s]

{'loss': 1.7766, 'grad_norm': 1.9325858354568481, 'learning_rate': 0.0001321300929235168, 'epoch': 0.34}


 34%|███▍      | 5708/16798 [25:19<58:19,  3.17it/s]

{'loss': 1.3687, 'grad_norm': 2.067462682723999, 'learning_rate': 0.0001321181796521325, 'epoch': 0.34}


 34%|███▍      | 5709/16798 [25:19<56:54,  3.25it/s]

{'loss': 1.7376, 'grad_norm': 1.9012953042984009, 'learning_rate': 0.00013210626638074816, 'epoch': 0.34}


 34%|███▍      | 5710/16798 [25:20<56:03,  3.30it/s]

{'loss': 1.8172, 'grad_norm': 2.548624277114868, 'learning_rate': 0.00013209435310936385, 'epoch': 0.34}


 34%|███▍      | 5711/16798 [25:20<56:35,  3.27it/s]

{'loss': 1.7964, 'grad_norm': 2.553076982498169, 'learning_rate': 0.00013208243983797951, 'epoch': 0.34}


 34%|███▍      | 5712/16798 [25:20<56:26,  3.27it/s]

{'loss': 1.5933, 'grad_norm': 1.8679970502853394, 'learning_rate': 0.0001320705265665952, 'epoch': 0.34}


 34%|███▍      | 5713/16798 [25:20<55:37,  3.32it/s]

{'loss': 1.713, 'grad_norm': 2.1290829181671143, 'learning_rate': 0.00013205861329521087, 'epoch': 0.34}


 34%|███▍      | 5714/16798 [25:21<54:50,  3.37it/s]

{'loss': 1.7154, 'grad_norm': 1.894342303276062, 'learning_rate': 0.00013204670002382656, 'epoch': 0.34}


 34%|███▍      | 5715/16798 [25:21<52:54,  3.49it/s]

{'loss': 1.6448, 'grad_norm': 2.000457763671875, 'learning_rate': 0.00013203478675244222, 'epoch': 0.34}


 34%|███▍      | 5716/16798 [25:21<52:19,  3.53it/s]

{'loss': 1.6956, 'grad_norm': 1.958889126777649, 'learning_rate': 0.00013202287348105791, 'epoch': 0.34}


 34%|███▍      | 5717/16798 [25:22<53:26,  3.46it/s]

{'loss': 1.5736, 'grad_norm': 1.9787880182266235, 'learning_rate': 0.00013201096020967358, 'epoch': 0.34}


 34%|███▍      | 5718/16798 [25:22<52:14,  3.53it/s]

{'loss': 1.6734, 'grad_norm': 2.012460708618164, 'learning_rate': 0.00013199904693828927, 'epoch': 0.34}


 34%|███▍      | 5719/16798 [25:22<53:09,  3.47it/s]

{'loss': 1.4839, 'grad_norm': 1.829294204711914, 'learning_rate': 0.00013198713366690493, 'epoch': 0.34}


 34%|███▍      | 5720/16798 [25:22<53:38,  3.44it/s]

{'loss': 1.5711, 'grad_norm': 1.96934175491333, 'learning_rate': 0.00013197522039552062, 'epoch': 0.34}


 34%|███▍      | 5721/16798 [25:23<53:50,  3.43it/s]

{'loss': 1.7236, 'grad_norm': 2.751004695892334, 'learning_rate': 0.0001319633071241363, 'epoch': 0.34}


 34%|███▍      | 5722/16798 [25:23<52:15,  3.53it/s]

{'loss': 1.6992, 'grad_norm': 1.9223580360412598, 'learning_rate': 0.00013195139385275198, 'epoch': 0.34}


 34%|███▍      | 5723/16798 [25:23<51:15,  3.60it/s]

{'loss': 2.3977, 'grad_norm': 2.4212584495544434, 'learning_rate': 0.00013193948058136764, 'epoch': 0.34}


 34%|███▍      | 5724/16798 [25:24<53:48,  3.43it/s]

{'loss': 1.5738, 'grad_norm': 1.76894211769104, 'learning_rate': 0.00013192756730998333, 'epoch': 0.34}


 34%|███▍      | 5725/16798 [25:24<55:33,  3.32it/s]

{'loss': 1.5893, 'grad_norm': 1.8036679029464722, 'learning_rate': 0.000131915654038599, 'epoch': 0.34}


 34%|███▍      | 5726/16798 [25:24<54:45,  3.37it/s]

{'loss': 1.7341, 'grad_norm': 1.900059461593628, 'learning_rate': 0.00013190374076721469, 'epoch': 0.34}


 34%|███▍      | 5727/16798 [25:24<55:47,  3.31it/s]

{'loss': 1.228, 'grad_norm': 1.6590930223464966, 'learning_rate': 0.00013189182749583035, 'epoch': 0.34}


 34%|███▍      | 5728/16798 [25:25<54:13,  3.40it/s]

{'loss': 1.4586, 'grad_norm': 1.932421326637268, 'learning_rate': 0.00013187991422444604, 'epoch': 0.34}


 34%|███▍      | 5729/16798 [25:25<53:16,  3.46it/s]

{'loss': 1.4309, 'grad_norm': 1.866882562637329, 'learning_rate': 0.0001318680009530617, 'epoch': 0.34}


 34%|███▍      | 5730/16798 [25:25<53:29,  3.45it/s]

{'loss': 1.2501, 'grad_norm': 1.7765424251556396, 'learning_rate': 0.0001318560876816774, 'epoch': 0.34}


 34%|███▍      | 5731/16798 [25:26<54:41,  3.37it/s]

{'loss': 1.6028, 'grad_norm': 2.0366456508636475, 'learning_rate': 0.00013184417441029306, 'epoch': 0.34}


 34%|███▍      | 5732/16798 [25:26<56:41,  3.25it/s]

{'loss': 1.6312, 'grad_norm': 1.913155436515808, 'learning_rate': 0.00013183226113890875, 'epoch': 0.34}


 34%|███▍      | 5733/16798 [25:26<59:24,  3.10it/s]

{'loss': 1.0526, 'grad_norm': 2.0596249103546143, 'learning_rate': 0.0001318203478675244, 'epoch': 0.34}


 34%|███▍      | 5734/16798 [25:27<56:40,  3.25it/s]

{'loss': 1.2682, 'grad_norm': 1.7327264547348022, 'learning_rate': 0.0001318084345961401, 'epoch': 0.34}


 34%|███▍      | 5735/16798 [25:27<56:47,  3.25it/s]

{'loss': 1.193, 'grad_norm': 1.640170931816101, 'learning_rate': 0.0001317965213247558, 'epoch': 0.34}


 34%|███▍      | 5736/16798 [25:27<58:38,  3.14it/s]

{'loss': 1.2754, 'grad_norm': 1.6431723833084106, 'learning_rate': 0.00013178460805337149, 'epoch': 0.34}


 34%|███▍      | 5737/16798 [25:28<56:56,  3.24it/s]

{'loss': 1.5247, 'grad_norm': 1.9182504415512085, 'learning_rate': 0.00013177269478198715, 'epoch': 0.34}


 34%|███▍      | 5738/16798 [25:28<56:59,  3.23it/s]

{'loss': 1.6833, 'grad_norm': 2.312070608139038, 'learning_rate': 0.00013176078151060284, 'epoch': 0.34}


 34%|███▍      | 5739/16798 [25:28<58:33,  3.15it/s]

{'loss': 1.4209, 'grad_norm': 1.7982922792434692, 'learning_rate': 0.0001317488682392185, 'epoch': 0.34}


 34%|███▍      | 5740/16798 [25:29<58:29,  3.15it/s]

{'loss': 1.4167, 'grad_norm': 1.961087942123413, 'learning_rate': 0.00013173695496783417, 'epoch': 0.34}


 34%|███▍      | 5741/16798 [25:29<57:00,  3.23it/s]

{'loss': 1.3344, 'grad_norm': 1.904456615447998, 'learning_rate': 0.00013172504169644986, 'epoch': 0.34}


 34%|███▍      | 5742/16798 [25:29<56:04,  3.29it/s]

{'loss': 1.7777, 'grad_norm': 2.439157009124756, 'learning_rate': 0.00013171312842506552, 'epoch': 0.34}


 34%|███▍      | 5743/16798 [25:29<56:16,  3.27it/s]

{'loss': 1.3089, 'grad_norm': 1.8259706497192383, 'learning_rate': 0.0001317012151536812, 'epoch': 0.34}


 34%|███▍      | 5744/16798 [25:30<55:26,  3.32it/s]

{'loss': 0.9848, 'grad_norm': 1.8090091943740845, 'learning_rate': 0.00013168930188229688, 'epoch': 0.34}


 34%|███▍      | 5745/16798 [25:30<56:06,  3.28it/s]

{'loss': 0.6563, 'grad_norm': 1.559239387512207, 'learning_rate': 0.00013167738861091257, 'epoch': 0.34}


 34%|███▍      | 5746/16798 [25:30<56:20,  3.27it/s]

{'loss': 0.6819, 'grad_norm': 1.3089278936386108, 'learning_rate': 0.00013166547533952823, 'epoch': 0.34}


 34%|███▍      | 5747/16798 [25:31<54:25,  3.38it/s]

{'loss': 0.4436, 'grad_norm': 1.1886522769927979, 'learning_rate': 0.00013165356206814392, 'epoch': 0.34}


 34%|███▍      | 5748/16798 [25:31<57:14,  3.22it/s]

{'loss': 0.323, 'grad_norm': 0.8545719981193542, 'learning_rate': 0.00013164164879675958, 'epoch': 0.34}


 34%|███▍      | 5749/16798 [25:31<57:32,  3.20it/s]

{'loss': 0.2981, 'grad_norm': 0.8272563815116882, 'learning_rate': 0.00013162973552537528, 'epoch': 0.34}


 34%|███▍      | 5750/16798 [25:32<59:05,  3.12it/s]

{'loss': 0.5069, 'grad_norm': 1.2280675172805786, 'learning_rate': 0.00013161782225399094, 'epoch': 0.34}


 34%|███▍      | 5751/16798 [25:32<58:26,  3.15it/s]

{'loss': 1.9066, 'grad_norm': 1.851247787475586, 'learning_rate': 0.00013160590898260663, 'epoch': 0.34}


 34%|███▍      | 5752/16798 [25:32<57:52,  3.18it/s]

{'loss': 2.1262, 'grad_norm': 2.1909852027893066, 'learning_rate': 0.0001315939957112223, 'epoch': 0.34}


 34%|███▍      | 5753/16798 [25:33<57:22,  3.21it/s]

{'loss': 1.9272, 'grad_norm': 1.8363059759140015, 'learning_rate': 0.00013158208243983798, 'epoch': 0.34}


 34%|███▍      | 5754/16798 [25:33<57:06,  3.22it/s]

{'loss': 1.9472, 'grad_norm': 1.9475418329238892, 'learning_rate': 0.00013157016916845365, 'epoch': 0.34}


 34%|███▍      | 5755/16798 [25:33<57:06,  3.22it/s]

{'loss': 1.6646, 'grad_norm': 1.9260921478271484, 'learning_rate': 0.00013155825589706934, 'epoch': 0.34}


 34%|███▍      | 5756/16798 [25:33<59:14,  3.11it/s]

{'loss': 1.8281, 'grad_norm': 1.8303664922714233, 'learning_rate': 0.000131546342625685, 'epoch': 0.34}


 34%|███▍      | 5757/16798 [25:34<57:58,  3.17it/s]

{'loss': 1.8213, 'grad_norm': 1.8420792818069458, 'learning_rate': 0.0001315344293543007, 'epoch': 0.34}


 34%|███▍      | 5758/16798 [25:34<58:16,  3.16it/s]

{'loss': 1.9269, 'grad_norm': 2.3346939086914062, 'learning_rate': 0.00013152251608291636, 'epoch': 0.34}


 34%|███▍      | 5759/16798 [25:34<58:08,  3.16it/s]

{'loss': 1.4894, 'grad_norm': 1.6128684282302856, 'learning_rate': 0.00013151060281153205, 'epoch': 0.34}


 34%|███▍      | 5760/16798 [25:35<57:09,  3.22it/s]

{'loss': 1.8859, 'grad_norm': 1.9156986474990845, 'learning_rate': 0.0001314986895401477, 'epoch': 0.34}


 34%|███▍      | 5761/16798 [25:35<56:03,  3.28it/s]

{'loss': 1.362, 'grad_norm': 1.8013910055160522, 'learning_rate': 0.0001314867762687634, 'epoch': 0.34}


 34%|███▍      | 5762/16798 [25:35<58:03,  3.17it/s]

{'loss': 1.8872, 'grad_norm': 2.05036997795105, 'learning_rate': 0.00013147486299737907, 'epoch': 0.34}


 34%|███▍      | 5763/16798 [25:36<59:38,  3.08it/s]

{'loss': 1.909, 'grad_norm': 1.9014067649841309, 'learning_rate': 0.00013146294972599476, 'epoch': 0.34}


 34%|███▍      | 5764/16798 [25:36<1:00:04,  3.06it/s]

{'loss': 1.5587, 'grad_norm': 1.8336392641067505, 'learning_rate': 0.00013145103645461042, 'epoch': 0.34}


 34%|███▍      | 5765/16798 [25:36<59:07,  3.11it/s]  

{'loss': 1.4841, 'grad_norm': 1.8700768947601318, 'learning_rate': 0.0001314391231832261, 'epoch': 0.34}


 34%|███▍      | 5766/16798 [25:37<58:11,  3.16it/s]

{'loss': 1.6642, 'grad_norm': 2.1895601749420166, 'learning_rate': 0.0001314272099118418, 'epoch': 0.34}


 34%|███▍      | 5767/16798 [25:37<57:42,  3.19it/s]

{'loss': 2.1388, 'grad_norm': 2.0716490745544434, 'learning_rate': 0.0001314152966404575, 'epoch': 0.34}


 34%|███▍      | 5768/16798 [25:37<56:32,  3.25it/s]

{'loss': 1.6158, 'grad_norm': 1.864132046699524, 'learning_rate': 0.00013140338336907316, 'epoch': 0.34}


 34%|███▍      | 5769/16798 [25:37<54:33,  3.37it/s]

{'loss': 1.1358, 'grad_norm': 1.608397126197815, 'learning_rate': 0.00013139147009768885, 'epoch': 0.34}


 34%|███▍      | 5770/16798 [25:38<56:03,  3.28it/s]

{'loss': 1.6855, 'grad_norm': 1.9746190309524536, 'learning_rate': 0.0001313795568263045, 'epoch': 0.34}


 34%|███▍      | 5771/16798 [25:38<55:56,  3.28it/s]

{'loss': 1.4951, 'grad_norm': 1.9128259420394897, 'learning_rate': 0.0001313676435549202, 'epoch': 0.34}


 34%|███▍      | 5772/16798 [25:38<57:04,  3.22it/s]

{'loss': 1.8063, 'grad_norm': 2.094754219055176, 'learning_rate': 0.00013135573028353587, 'epoch': 0.34}


 34%|███▍      | 5773/16798 [25:39<54:56,  3.34it/s]

{'loss': 1.3293, 'grad_norm': 1.6316399574279785, 'learning_rate': 0.00013134381701215156, 'epoch': 0.34}


 34%|███▍      | 5774/16798 [25:39<54:55,  3.35it/s]

{'loss': 1.615, 'grad_norm': 2.0017125606536865, 'learning_rate': 0.00013133190374076722, 'epoch': 0.34}


 34%|███▍      | 5775/16798 [25:39<54:00,  3.40it/s]

{'loss': 1.3967, 'grad_norm': 2.020655393600464, 'learning_rate': 0.0001313199904693829, 'epoch': 0.34}


 34%|███▍      | 5776/16798 [25:40<53:31,  3.43it/s]

{'loss': 1.6855, 'grad_norm': 1.904921054840088, 'learning_rate': 0.00013130807719799857, 'epoch': 0.34}


 34%|███▍      | 5777/16798 [25:40<53:12,  3.45it/s]

{'loss': 1.5439, 'grad_norm': 1.8075721263885498, 'learning_rate': 0.00013129616392661426, 'epoch': 0.34}


 34%|███▍      | 5778/16798 [25:40<51:59,  3.53it/s]

{'loss': 1.3791, 'grad_norm': 1.9396812915802002, 'learning_rate': 0.00013128425065522993, 'epoch': 0.34}


 34%|███▍      | 5779/16798 [25:40<52:01,  3.53it/s]

{'loss': 1.367, 'grad_norm': 1.9080580472946167, 'learning_rate': 0.00013127233738384562, 'epoch': 0.34}


 34%|███▍      | 5780/16798 [25:41<53:27,  3.44it/s]

{'loss': 1.2089, 'grad_norm': 1.7702867984771729, 'learning_rate': 0.00013126042411246128, 'epoch': 0.34}


 34%|███▍      | 5781/16798 [25:41<53:38,  3.42it/s]

{'loss': 1.5941, 'grad_norm': 2.1456379890441895, 'learning_rate': 0.00013124851084107697, 'epoch': 0.34}


 34%|███▍      | 5782/16798 [25:41<53:31,  3.43it/s]

{'loss': 1.2329, 'grad_norm': 1.9220446348190308, 'learning_rate': 0.00013123659756969264, 'epoch': 0.34}


 34%|███▍      | 5783/16798 [25:42<54:50,  3.35it/s]

{'loss': 1.6716, 'grad_norm': 2.086186170578003, 'learning_rate': 0.00013122468429830833, 'epoch': 0.34}


 34%|███▍      | 5784/16798 [25:42<54:00,  3.40it/s]

{'loss': 1.6067, 'grad_norm': 2.1395092010498047, 'learning_rate': 0.000131212771026924, 'epoch': 0.34}


 34%|███▍      | 5785/16798 [25:42<52:14,  3.51it/s]

{'loss': 1.2539, 'grad_norm': 1.8959282636642456, 'learning_rate': 0.00013120085775553968, 'epoch': 0.34}


 34%|███▍      | 5786/16798 [25:42<53:13,  3.45it/s]

{'loss': 1.3626, 'grad_norm': 2.0522587299346924, 'learning_rate': 0.00013118894448415535, 'epoch': 0.34}


 34%|███▍      | 5787/16798 [25:43<55:12,  3.32it/s]

{'loss': 1.336, 'grad_norm': 1.868721842765808, 'learning_rate': 0.00013117703121277104, 'epoch': 0.34}


 34%|███▍      | 5788/16798 [25:43<55:43,  3.29it/s]

{'loss': 1.2855, 'grad_norm': 1.8835524320602417, 'learning_rate': 0.0001311651179413867, 'epoch': 0.34}


 34%|███▍      | 5789/16798 [25:43<55:41,  3.29it/s]

{'loss': 0.8663, 'grad_norm': 1.568272352218628, 'learning_rate': 0.0001311532046700024, 'epoch': 0.34}


 34%|███▍      | 5790/16798 [25:44<56:05,  3.27it/s]

{'loss': 1.3963, 'grad_norm': 2.46108341217041, 'learning_rate': 0.00013114129139861805, 'epoch': 0.34}


 34%|███▍      | 5791/16798 [25:44<54:21,  3.37it/s]

{'loss': 1.1884, 'grad_norm': 2.007516384124756, 'learning_rate': 0.00013112937812723375, 'epoch': 0.34}


 34%|███▍      | 5792/16798 [25:44<53:34,  3.42it/s]

{'loss': 1.2405, 'grad_norm': 2.172166109085083, 'learning_rate': 0.0001311174648558494, 'epoch': 0.34}


 34%|███▍      | 5793/16798 [25:45<54:01,  3.39it/s]

{'loss': 1.205, 'grad_norm': 1.7925935983657837, 'learning_rate': 0.0001311055515844651, 'epoch': 0.34}


 34%|███▍      | 5794/16798 [25:45<55:42,  3.29it/s]

{'loss': 1.6887, 'grad_norm': 2.285034418106079, 'learning_rate': 0.00013109363831308076, 'epoch': 0.34}


 34%|███▍      | 5795/16798 [25:45<1:02:42,  2.92it/s]

{'loss': 0.8516, 'grad_norm': 1.3876315355300903, 'learning_rate': 0.00013108172504169645, 'epoch': 0.34}


 35%|███▍      | 5796/16798 [25:46<1:02:57,  2.91it/s]

{'loss': 0.8055, 'grad_norm': 1.498538851737976, 'learning_rate': 0.00013106981177031212, 'epoch': 0.35}


 35%|███▍      | 5797/16798 [25:46<1:01:03,  3.00it/s]

{'loss': 0.5943, 'grad_norm': 1.138893723487854, 'learning_rate': 0.0001310578984989278, 'epoch': 0.35}


 35%|███▍      | 5798/16798 [25:46<58:12,  3.15it/s]  

{'loss': 0.3208, 'grad_norm': 0.7669834494590759, 'learning_rate': 0.0001310459852275435, 'epoch': 0.35}


 35%|███▍      | 5799/16798 [25:47<57:09,  3.21it/s]

{'loss': 0.2427, 'grad_norm': 0.7406197190284729, 'learning_rate': 0.00013103407195615916, 'epoch': 0.35}


 35%|███▍      | 5800/16798 [25:47<56:29,  3.24it/s]

{'loss': 0.3866, 'grad_norm': 0.9281860589981079, 'learning_rate': 0.00013102215868477485, 'epoch': 0.35}


 35%|███▍      | 5801/16798 [25:47<57:30,  3.19it/s]

{'loss': 1.6649, 'grad_norm': 1.8904634714126587, 'learning_rate': 0.00013101024541339052, 'epoch': 0.35}


 35%|███▍      | 5802/16798 [25:47<55:29,  3.30it/s]

{'loss': 2.1098, 'grad_norm': 2.087756872177124, 'learning_rate': 0.0001309983321420062, 'epoch': 0.35}


 35%|███▍      | 5803/16798 [25:48<55:15,  3.32it/s]

{'loss': 1.8438, 'grad_norm': 1.731882095336914, 'learning_rate': 0.00013098641887062187, 'epoch': 0.35}


 35%|███▍      | 5804/16798 [25:48<56:01,  3.27it/s]

{'loss': 1.6663, 'grad_norm': 1.9174673557281494, 'learning_rate': 0.00013097450559923756, 'epoch': 0.35}


 35%|███▍      | 5805/16798 [25:48<57:14,  3.20it/s]

{'loss': 1.9602, 'grad_norm': 1.9712669849395752, 'learning_rate': 0.00013096259232785323, 'epoch': 0.35}


 35%|███▍      | 5806/16798 [25:49<56:19,  3.25it/s]

{'loss': 2.0157, 'grad_norm': 1.7893344163894653, 'learning_rate': 0.00013095067905646892, 'epoch': 0.35}


 35%|███▍      | 5807/16798 [25:49<55:53,  3.28it/s]

{'loss': 1.8914, 'grad_norm': 1.935933232307434, 'learning_rate': 0.00013093876578508458, 'epoch': 0.35}


 35%|███▍      | 5808/16798 [25:49<57:11,  3.20it/s]

{'loss': 1.6055, 'grad_norm': 1.6593022346496582, 'learning_rate': 0.00013092685251370027, 'epoch': 0.35}


 35%|███▍      | 5809/16798 [25:50<58:48,  3.11it/s]

{'loss': 1.7685, 'grad_norm': 2.1615991592407227, 'learning_rate': 0.00013091493924231594, 'epoch': 0.35}


 35%|███▍      | 5810/16798 [25:50<58:38,  3.12it/s]

{'loss': 1.9366, 'grad_norm': 1.9766374826431274, 'learning_rate': 0.00013090302597093163, 'epoch': 0.35}


 35%|███▍      | 5811/16798 [25:50<56:03,  3.27it/s]

{'loss': 1.7192, 'grad_norm': 2.039379358291626, 'learning_rate': 0.0001308911126995473, 'epoch': 0.35}


 35%|███▍      | 5812/16798 [25:51<1:00:36,  3.02it/s]

{'loss': 1.5469, 'grad_norm': 1.6648147106170654, 'learning_rate': 0.00013087919942816298, 'epoch': 0.35}


 35%|███▍      | 5813/16798 [25:51<57:13,  3.20it/s]  

{'loss': 1.725, 'grad_norm': 1.913749098777771, 'learning_rate': 0.00013086728615677864, 'epoch': 0.35}


 35%|███▍      | 5814/16798 [25:51<56:19,  3.25it/s]

{'loss': 1.2543, 'grad_norm': 1.6847689151763916, 'learning_rate': 0.00013085537288539434, 'epoch': 0.35}


 35%|███▍      | 5815/16798 [25:52<57:24,  3.19it/s]

{'loss': 1.6879, 'grad_norm': 1.8698326349258423, 'learning_rate': 0.00013084345961401, 'epoch': 0.35}


 35%|███▍      | 5816/16798 [25:52<58:18,  3.14it/s]

{'loss': 1.6405, 'grad_norm': 1.8182363510131836, 'learning_rate': 0.0001308315463426257, 'epoch': 0.35}


 35%|███▍      | 5817/16798 [25:52<57:41,  3.17it/s]

{'loss': 1.8433, 'grad_norm': 2.154679536819458, 'learning_rate': 0.00013081963307124135, 'epoch': 0.35}


 35%|███▍      | 5818/16798 [25:53<1:00:01,  3.05it/s]

{'loss': 1.4708, 'grad_norm': 1.7832016944885254, 'learning_rate': 0.00013080771979985704, 'epoch': 0.35}


 35%|███▍      | 5819/16798 [25:53<1:00:40,  3.02it/s]

{'loss': 1.6212, 'grad_norm': 1.8303306102752686, 'learning_rate': 0.0001307958065284727, 'epoch': 0.35}


 35%|███▍      | 5820/16798 [25:53<1:01:55,  2.95it/s]

{'loss': 1.8037, 'grad_norm': 2.159874200820923, 'learning_rate': 0.0001307838932570884, 'epoch': 0.35}


 35%|███▍      | 5821/16798 [25:54<57:18,  3.19it/s]  

{'loss': 1.3893, 'grad_norm': 1.852135181427002, 'learning_rate': 0.00013077197998570406, 'epoch': 0.35}


 35%|███▍      | 5822/16798 [25:54<1:01:03,  3.00it/s]

{'loss': 1.3574, 'grad_norm': 1.6138687133789062, 'learning_rate': 0.00013076006671431975, 'epoch': 0.35}


 35%|███▍      | 5823/16798 [25:54<55:02,  3.32it/s]  

{'loss': 1.5081, 'grad_norm': 1.8955821990966797, 'learning_rate': 0.00013074815344293542, 'epoch': 0.35}


 35%|███▍      | 5824/16798 [25:54<54:06,  3.38it/s]

{'loss': 1.5267, 'grad_norm': 1.7255635261535645, 'learning_rate': 0.0001307362401715511, 'epoch': 0.35}


 35%|███▍      | 5825/16798 [25:55<55:13,  3.31it/s]

{'loss': 1.7492, 'grad_norm': 2.0055458545684814, 'learning_rate': 0.00013072432690016677, 'epoch': 0.35}


 35%|███▍      | 5826/16798 [25:55<54:48,  3.34it/s]

{'loss': 1.2907, 'grad_norm': 1.8795361518859863, 'learning_rate': 0.00013071241362878246, 'epoch': 0.35}


 35%|███▍      | 5827/16798 [25:55<57:33,  3.18it/s]

{'loss': 1.6324, 'grad_norm': 2.1671645641326904, 'learning_rate': 0.00013070050035739813, 'epoch': 0.35}


 35%|███▍      | 5828/16798 [25:56<58:33,  3.12it/s]

{'loss': 1.4264, 'grad_norm': 1.773432970046997, 'learning_rate': 0.00013068858708601384, 'epoch': 0.35}


 35%|███▍      | 5829/16798 [25:56<57:18,  3.19it/s]

{'loss': 1.458, 'grad_norm': 1.9386502504348755, 'learning_rate': 0.0001306766738146295, 'epoch': 0.35}


 35%|███▍      | 5830/16798 [25:56<57:57,  3.15it/s]

{'loss': 1.8162, 'grad_norm': 2.070211172103882, 'learning_rate': 0.0001306647605432452, 'epoch': 0.35}


 35%|███▍      | 5831/16798 [25:57<58:26,  3.13it/s]

{'loss': 1.582, 'grad_norm': 2.1718807220458984, 'learning_rate': 0.00013065284727186086, 'epoch': 0.35}


 35%|███▍      | 5832/16798 [25:57<58:28,  3.13it/s]

{'loss': 0.8882, 'grad_norm': 1.6193652153015137, 'learning_rate': 0.00013064093400047655, 'epoch': 0.35}


 35%|███▍      | 5833/16798 [25:57<55:37,  3.29it/s]

{'loss': 2.1127, 'grad_norm': 2.4306249618530273, 'learning_rate': 0.00013062902072909222, 'epoch': 0.35}


 35%|███▍      | 5834/16798 [25:58<57:58,  3.15it/s]

{'loss': 1.4106, 'grad_norm': 1.7675142288208008, 'learning_rate': 0.0001306171074577079, 'epoch': 0.35}


 35%|███▍      | 5835/16798 [25:58<56:04,  3.26it/s]

{'loss': 1.2737, 'grad_norm': 1.7456496953964233, 'learning_rate': 0.00013060519418632357, 'epoch': 0.35}


 35%|███▍      | 5836/16798 [25:58<54:06,  3.38it/s]

{'loss': 1.2718, 'grad_norm': 1.7875653505325317, 'learning_rate': 0.00013059328091493926, 'epoch': 0.35}


 35%|███▍      | 5837/16798 [25:58<56:59,  3.21it/s]

{'loss': 1.6258, 'grad_norm': 2.0290446281433105, 'learning_rate': 0.00013058136764355492, 'epoch': 0.35}


 35%|███▍      | 5838/16798 [25:59<56:29,  3.23it/s]

{'loss': 1.3917, 'grad_norm': 1.9704339504241943, 'learning_rate': 0.00013056945437217062, 'epoch': 0.35}


 35%|███▍      | 5840/16798 [25:59<53:59,  3.38it/s]  

{'loss': 0.8118, 'grad_norm': 1.4457329511642456, 'learning_rate': 0.00013055754110078628, 'epoch': 0.35}


 35%|███▍      | 5840/16798 [25:59<53:59,  3.38it/s]

{'loss': 1.2277, 'grad_norm': 1.869228720664978, 'learning_rate': 0.00013054562782940197, 'epoch': 0.35}


 35%|███▍      | 5841/16798 [26:00<53:21,  3.42it/s]

{'loss': 1.4336, 'grad_norm': 2.1412429809570312, 'learning_rate': 0.00013053371455801763, 'epoch': 0.35}


 35%|███▍      | 5842/16798 [26:00<53:13,  3.43it/s]

{'loss': 0.9812, 'grad_norm': 1.8036054372787476, 'learning_rate': 0.00013052180128663332, 'epoch': 0.35}


 35%|███▍      | 5843/16798 [26:00<53:48,  3.39it/s]

{'loss': 0.7397, 'grad_norm': 1.6028372049331665, 'learning_rate': 0.000130509888015249, 'epoch': 0.35}


 35%|███▍      | 5844/16798 [26:01<56:03,  3.26it/s]

{'loss': 0.8785, 'grad_norm': 1.4965487718582153, 'learning_rate': 0.00013049797474386468, 'epoch': 0.35}


 35%|███▍      | 5845/16798 [26:01<54:34,  3.34it/s]

{'loss': 0.7206, 'grad_norm': 1.3854007720947266, 'learning_rate': 0.00013048606147248034, 'epoch': 0.35}


 35%|███▍      | 5846/16798 [26:01<56:00,  3.26it/s]

{'loss': 0.8337, 'grad_norm': 1.603796124458313, 'learning_rate': 0.00013047414820109603, 'epoch': 0.35}


 35%|███▍      | 5847/16798 [26:02<57:53,  3.15it/s]

{'loss': 0.8673, 'grad_norm': 1.601912498474121, 'learning_rate': 0.0001304622349297117, 'epoch': 0.35}


 35%|███▍      | 5848/16798 [26:02<56:51,  3.21it/s]

{'loss': 0.5074, 'grad_norm': 0.9876738786697388, 'learning_rate': 0.0001304503216583274, 'epoch': 0.35}


 35%|███▍      | 5849/16798 [26:02<56:02,  3.26it/s]

{'loss': 0.6174, 'grad_norm': 1.4015511274337769, 'learning_rate': 0.00013043840838694305, 'epoch': 0.35}


 35%|███▍      | 5850/16798 [26:02<55:12,  3.30it/s]

{'loss': 0.9453, 'grad_norm': 1.948548436164856, 'learning_rate': 0.00013042649511555874, 'epoch': 0.35}


 35%|███▍      | 5851/16798 [26:03<56:07,  3.25it/s]

{'loss': 2.2789, 'grad_norm': 2.0764646530151367, 'learning_rate': 0.0001304145818441744, 'epoch': 0.35}


 35%|███▍      | 5852/16798 [26:03<56:14,  3.24it/s]

{'loss': 1.8816, 'grad_norm': 1.85057532787323, 'learning_rate': 0.0001304026685727901, 'epoch': 0.35}


 35%|███▍      | 5853/16798 [26:03<57:21,  3.18it/s]

{'loss': 1.8772, 'grad_norm': 1.789671778678894, 'learning_rate': 0.00013039075530140576, 'epoch': 0.35}


 35%|███▍      | 5854/16798 [26:04<57:05,  3.19it/s]

{'loss': 1.6454, 'grad_norm': 1.7186925411224365, 'learning_rate': 0.00013037884203002145, 'epoch': 0.35}


 35%|███▍      | 5855/16798 [26:04<55:53,  3.26it/s]

{'loss': 2.3075, 'grad_norm': 2.015822172164917, 'learning_rate': 0.00013036692875863711, 'epoch': 0.35}


 35%|███▍      | 5856/16798 [26:04<54:35,  3.34it/s]

{'loss': 1.7872, 'grad_norm': 1.8307546377182007, 'learning_rate': 0.0001303550154872528, 'epoch': 0.35}


 35%|███▍      | 5857/16798 [26:05<55:07,  3.31it/s]

{'loss': 1.639, 'grad_norm': 1.9380141496658325, 'learning_rate': 0.00013034310221586847, 'epoch': 0.35}


 35%|███▍      | 5858/16798 [26:05<54:54,  3.32it/s]

{'loss': 1.8545, 'grad_norm': 1.8134963512420654, 'learning_rate': 0.00013033118894448416, 'epoch': 0.35}


 35%|███▍      | 5859/16798 [26:05<59:25,  3.07it/s]

{'loss': 1.4422, 'grad_norm': 1.6138943433761597, 'learning_rate': 0.00013031927567309985, 'epoch': 0.35}


 35%|███▍      | 5860/16798 [26:06<1:00:39,  3.01it/s]

{'loss': 1.5146, 'grad_norm': 1.7432094812393188, 'learning_rate': 0.00013030736240171551, 'epoch': 0.35}


 35%|███▍      | 5861/16798 [26:06<57:55,  3.15it/s]  

{'loss': 1.6846, 'grad_norm': 1.8463388681411743, 'learning_rate': 0.0001302954491303312, 'epoch': 0.35}


 35%|███▍      | 5862/16798 [26:06<58:43,  3.10it/s]

{'loss': 1.4536, 'grad_norm': 2.275939464569092, 'learning_rate': 0.00013028353585894687, 'epoch': 0.35}


 35%|███▍      | 5863/16798 [26:07<1:02:29,  2.92it/s]

{'loss': 1.8352, 'grad_norm': 2.190983295440674, 'learning_rate': 0.00013027162258756256, 'epoch': 0.35}


 35%|███▍      | 5864/16798 [26:07<57:40,  3.16it/s]  

{'loss': 1.2555, 'grad_norm': 1.5781214237213135, 'learning_rate': 0.00013025970931617822, 'epoch': 0.35}


 35%|███▍      | 5865/16798 [26:07<1:00:30,  3.01it/s]

{'loss': 1.5355, 'grad_norm': 1.8229974508285522, 'learning_rate': 0.0001302477960447939, 'epoch': 0.35}


 35%|███▍      | 5866/16798 [26:08<58:05,  3.14it/s]  

{'loss': 1.603, 'grad_norm': 1.9204049110412598, 'learning_rate': 0.00013023588277340958, 'epoch': 0.35}


 35%|███▍      | 5867/16798 [26:08<59:25,  3.07it/s]

{'loss': 1.9384, 'grad_norm': 2.3388888835906982, 'learning_rate': 0.00013022396950202527, 'epoch': 0.35}


 35%|███▍      | 5868/16798 [26:08<56:40,  3.21it/s]

{'loss': 1.2819, 'grad_norm': 1.7489959001541138, 'learning_rate': 0.00013021205623064093, 'epoch': 0.35}


 35%|███▍      | 5869/16798 [26:09<1:00:28,  3.01it/s]

{'loss': 1.2465, 'grad_norm': 1.6076884269714355, 'learning_rate': 0.00013020014295925662, 'epoch': 0.35}


 35%|███▍      | 5870/16798 [26:09<56:08,  3.24it/s]  

{'loss': 1.6456, 'grad_norm': 1.996907114982605, 'learning_rate': 0.00013018822968787229, 'epoch': 0.35}


 35%|███▍      | 5871/16798 [26:09<1:00:10,  3.03it/s]

{'loss': 1.4109, 'grad_norm': 1.9623063802719116, 'learning_rate': 0.00013017631641648798, 'epoch': 0.35}


 35%|███▍      | 5872/16798 [26:09<55:30,  3.28it/s]  

{'loss': 1.2625, 'grad_norm': 1.8073288202285767, 'learning_rate': 0.00013016440314510364, 'epoch': 0.35}


 35%|███▍      | 5873/16798 [26:10<56:35,  3.22it/s]

{'loss': 1.3639, 'grad_norm': 2.6202573776245117, 'learning_rate': 0.00013015248987371933, 'epoch': 0.35}


 35%|███▍      | 5874/16798 [26:10<57:45,  3.15it/s]

{'loss': 1.8165, 'grad_norm': 2.0596680641174316, 'learning_rate': 0.000130140576602335, 'epoch': 0.35}


 35%|███▍      | 5875/16798 [26:10<56:19,  3.23it/s]

{'loss': 1.7703, 'grad_norm': 2.183784008026123, 'learning_rate': 0.00013012866333095069, 'epoch': 0.35}


 35%|███▍      | 5876/16798 [26:11<56:32,  3.22it/s]

{'loss': 1.9, 'grad_norm': 2.1043012142181396, 'learning_rate': 0.00013011675005956635, 'epoch': 0.35}


 35%|███▍      | 5877/16798 [26:11<57:00,  3.19it/s]

{'loss': 1.308, 'grad_norm': 1.7698293924331665, 'learning_rate': 0.00013010483678818204, 'epoch': 0.35}


 35%|███▍      | 5878/16798 [26:11<55:03,  3.31it/s]

{'loss': 1.5275, 'grad_norm': 1.8973885774612427, 'learning_rate': 0.0001300929235167977, 'epoch': 0.35}


 35%|███▍      | 5879/16798 [26:12<55:30,  3.28it/s]

{'loss': 1.5485, 'grad_norm': 2.04196834564209, 'learning_rate': 0.0001300810102454134, 'epoch': 0.35}


 35%|███▌      | 5880/16798 [26:12<56:45,  3.21it/s]

{'loss': 1.3392, 'grad_norm': 2.051384449005127, 'learning_rate': 0.00013006909697402906, 'epoch': 0.35}


 35%|███▌      | 5881/16798 [26:12<56:55,  3.20it/s]

{'loss': 1.2821, 'grad_norm': 1.9402251243591309, 'learning_rate': 0.00013005718370264475, 'epoch': 0.35}


 35%|███▌      | 5882/16798 [26:13<56:29,  3.22it/s]

{'loss': 1.5794, 'grad_norm': 2.5852391719818115, 'learning_rate': 0.0001300452704312604, 'epoch': 0.35}


 35%|███▌      | 5883/16798 [26:13<58:11,  3.13it/s]

{'loss': 1.1189, 'grad_norm': 1.6810462474822998, 'learning_rate': 0.0001300333571598761, 'epoch': 0.35}


 35%|███▌      | 5884/16798 [26:13<55:55,  3.25it/s]

{'loss': 1.6419, 'grad_norm': 2.0395851135253906, 'learning_rate': 0.00013002144388849177, 'epoch': 0.35}


 35%|███▌      | 5885/16798 [26:13<54:24,  3.34it/s]

{'loss': 0.9441, 'grad_norm': 1.5494471788406372, 'learning_rate': 0.00013000953061710746, 'epoch': 0.35}


 35%|███▌      | 5886/16798 [26:14<54:41,  3.33it/s]

{'loss': 0.8823, 'grad_norm': 1.4082088470458984, 'learning_rate': 0.00012999761734572312, 'epoch': 0.35}


 35%|███▌      | 5887/16798 [26:14<54:00,  3.37it/s]

{'loss': 0.9338, 'grad_norm': 1.7831815481185913, 'learning_rate': 0.0001299857040743388, 'epoch': 0.35}


 35%|███▌      | 5888/16798 [26:14<54:58,  3.31it/s]

{'loss': 1.1675, 'grad_norm': 1.6671255826950073, 'learning_rate': 0.00012997379080295448, 'epoch': 0.35}


 35%|███▌      | 5889/16798 [26:15<53:14,  3.42it/s]

{'loss': 1.0061, 'grad_norm': 1.5423444509506226, 'learning_rate': 0.0001299618775315702, 'epoch': 0.35}


 35%|███▌      | 5890/16798 [26:15<54:38,  3.33it/s]

{'loss': 1.0223, 'grad_norm': 3.1931443214416504, 'learning_rate': 0.00012994996426018586, 'epoch': 0.35}


 35%|███▌      | 5891/16798 [26:15<54:27,  3.34it/s]

{'loss': 1.1341, 'grad_norm': 1.6072099208831787, 'learning_rate': 0.00012993805098880155, 'epoch': 0.35}


 35%|███▌      | 5892/16798 [26:15<51:40,  3.52it/s]

{'loss': 1.7327, 'grad_norm': 2.3567469120025635, 'learning_rate': 0.0001299261377174172, 'epoch': 0.35}


 35%|███▌      | 5893/16798 [26:16<55:25,  3.28it/s]

{'loss': 1.0253, 'grad_norm': 1.6679840087890625, 'learning_rate': 0.0001299142244460329, 'epoch': 0.35}


 35%|███▌      | 5894/16798 [26:16<53:11,  3.42it/s]

{'loss': 0.5409, 'grad_norm': 1.0990686416625977, 'learning_rate': 0.00012990231117464857, 'epoch': 0.35}


 35%|███▌      | 5895/16798 [26:16<55:03,  3.30it/s]

{'loss': 1.306, 'grad_norm': 1.971510648727417, 'learning_rate': 0.00012989039790326426, 'epoch': 0.35}


 35%|███▌      | 5896/16798 [26:17<57:59,  3.13it/s]

{'loss': 0.9219, 'grad_norm': 1.5569826364517212, 'learning_rate': 0.00012987848463187992, 'epoch': 0.35}


 35%|███▌      | 5897/16798 [26:17<53:02,  3.42it/s]

{'loss': 0.3022, 'grad_norm': 0.7655595541000366, 'learning_rate': 0.0001298665713604956, 'epoch': 0.35}


 35%|███▌      | 5898/16798 [26:17<49:40,  3.66it/s]

{'loss': 0.6762, 'grad_norm': 1.7699781656265259, 'learning_rate': 0.00012985465808911127, 'epoch': 0.35}


 35%|███▌      | 5899/16798 [26:18<51:50,  3.50it/s]

{'loss': 0.3063, 'grad_norm': 1.0142133235931396, 'learning_rate': 0.00012984274481772697, 'epoch': 0.35}


 35%|███▌      | 5900/16798 [26:18<55:34,  3.27it/s]

{'loss': 0.4198, 'grad_norm': 1.049957036972046, 'learning_rate': 0.00012983083154634263, 'epoch': 0.35}


 35%|███▌      | 5901/16798 [26:18<57:56,  3.13it/s]

{'loss': 1.7909, 'grad_norm': 1.9357188940048218, 'learning_rate': 0.00012981891827495832, 'epoch': 0.35}


 35%|███▌      | 5902/16798 [26:19<57:20,  3.17it/s]

{'loss': 1.8251, 'grad_norm': 1.954142689704895, 'learning_rate': 0.00012980700500357398, 'epoch': 0.35}


 35%|███▌      | 5903/16798 [26:19<57:12,  3.17it/s]

{'loss': 2.378, 'grad_norm': 2.066767930984497, 'learning_rate': 0.00012979509173218967, 'epoch': 0.35}


 35%|███▌      | 5904/16798 [26:19<54:53,  3.31it/s]

{'loss': 2.0351, 'grad_norm': 1.999706506729126, 'learning_rate': 0.00012978317846080534, 'epoch': 0.35}


 35%|███▌      | 5905/16798 [26:19<54:41,  3.32it/s]

{'loss': 2.0698, 'grad_norm': 2.138322353363037, 'learning_rate': 0.00012977126518942103, 'epoch': 0.35}


 35%|███▌      | 5906/16798 [26:20<58:04,  3.13it/s]

{'loss': 1.8179, 'grad_norm': 2.125434398651123, 'learning_rate': 0.0001297593519180367, 'epoch': 0.35}


 35%|███▌      | 5907/16798 [26:20<54:36,  3.32it/s]

{'loss': 1.6187, 'grad_norm': 1.9926985502243042, 'learning_rate': 0.00012974743864665238, 'epoch': 0.35}


 35%|███▌      | 5908/16798 [26:20<53:50,  3.37it/s]

{'loss': 2.322, 'grad_norm': 2.1034445762634277, 'learning_rate': 0.00012973552537526805, 'epoch': 0.35}


 35%|███▌      | 5909/16798 [26:21<56:08,  3.23it/s]

{'loss': 2.2019, 'grad_norm': 2.056028127670288, 'learning_rate': 0.00012972361210388374, 'epoch': 0.35}


 35%|███▌      | 5910/16798 [26:21<55:45,  3.25it/s]

{'loss': 1.4728, 'grad_norm': 2.4356796741485596, 'learning_rate': 0.0001297116988324994, 'epoch': 0.35}


 35%|███▌      | 5911/16798 [26:21<56:30,  3.21it/s]

{'loss': 1.4183, 'grad_norm': 1.6723997592926025, 'learning_rate': 0.0001296997855611151, 'epoch': 0.35}


 35%|███▌      | 5912/16798 [26:22<55:57,  3.24it/s]

{'loss': 2.2062, 'grad_norm': 2.265573263168335, 'learning_rate': 0.00012968787228973076, 'epoch': 0.35}


 35%|███▌      | 5913/16798 [26:22<55:26,  3.27it/s]

{'loss': 1.7645, 'grad_norm': 1.9430915117263794, 'learning_rate': 0.00012967595901834645, 'epoch': 0.35}


 35%|███▌      | 5914/16798 [26:22<55:01,  3.30it/s]

{'loss': 1.5684, 'grad_norm': 1.9371728897094727, 'learning_rate': 0.0001296640457469621, 'epoch': 0.35}


 35%|███▌      | 5915/16798 [26:22<54:16,  3.34it/s]

{'loss': 1.7516, 'grad_norm': 2.3449461460113525, 'learning_rate': 0.0001296521324755778, 'epoch': 0.35}


 35%|███▌      | 5916/16798 [26:23<54:00,  3.36it/s]

{'loss': 1.4615, 'grad_norm': 1.612761378288269, 'learning_rate': 0.00012964021920419346, 'epoch': 0.35}


 35%|███▌      | 5917/16798 [26:23<53:35,  3.38it/s]

{'loss': 1.5292, 'grad_norm': 2.0572121143341064, 'learning_rate': 0.00012962830593280916, 'epoch': 0.35}


 35%|███▌      | 5918/16798 [26:23<52:45,  3.44it/s]

{'loss': 1.6346, 'grad_norm': 2.0454320907592773, 'learning_rate': 0.00012961639266142482, 'epoch': 0.35}


 35%|███▌      | 5919/16798 [26:24<54:12,  3.34it/s]

{'loss': 1.5255, 'grad_norm': 1.908803939819336, 'learning_rate': 0.0001296044793900405, 'epoch': 0.35}


 35%|███▌      | 5920/16798 [26:24<54:02,  3.36it/s]

{'loss': 1.1293, 'grad_norm': 1.5925651788711548, 'learning_rate': 0.0001295925661186562, 'epoch': 0.35}


 35%|███▌      | 5921/16798 [26:24<56:57,  3.18it/s]

{'loss': 1.3662, 'grad_norm': 1.8507747650146484, 'learning_rate': 0.00012958065284727186, 'epoch': 0.35}


 35%|███▌      | 5922/16798 [26:25<55:15,  3.28it/s]

{'loss': 1.6036, 'grad_norm': 1.9155524969100952, 'learning_rate': 0.00012956873957588756, 'epoch': 0.35}


 35%|███▌      | 5923/16798 [26:25<55:55,  3.24it/s]

{'loss': 1.6239, 'grad_norm': 2.0168581008911133, 'learning_rate': 0.00012955682630450322, 'epoch': 0.35}


 35%|███▌      | 5924/16798 [26:25<53:01,  3.42it/s]

{'loss': 1.4702, 'grad_norm': 1.9645947217941284, 'learning_rate': 0.0001295449130331189, 'epoch': 0.35}


 35%|███▌      | 5925/16798 [26:25<54:19,  3.34it/s]

{'loss': 1.5496, 'grad_norm': 1.873020052909851, 'learning_rate': 0.00012953299976173457, 'epoch': 0.35}


 35%|███▌      | 5926/16798 [26:26<57:20,  3.16it/s]

{'loss': 1.5039, 'grad_norm': 1.8700822591781616, 'learning_rate': 0.00012952108649035026, 'epoch': 0.35}


 35%|███▌      | 5927/16798 [26:26<54:43,  3.31it/s]

{'loss': 1.5897, 'grad_norm': 2.294229030609131, 'learning_rate': 0.00012950917321896593, 'epoch': 0.35}


 35%|███▌      | 5928/16798 [26:26<54:33,  3.32it/s]

{'loss': 1.9861, 'grad_norm': 2.426295518875122, 'learning_rate': 0.00012949725994758162, 'epoch': 0.35}


 35%|███▌      | 5929/16798 [26:27<56:48,  3.19it/s]

{'loss': 1.5646, 'grad_norm': 1.7935807704925537, 'learning_rate': 0.00012948534667619728, 'epoch': 0.35}


 35%|███▌      | 5930/16798 [26:27<54:48,  3.30it/s]

{'loss': 1.4957, 'grad_norm': 1.909898281097412, 'learning_rate': 0.00012947343340481297, 'epoch': 0.35}


 35%|███▌      | 5931/16798 [26:27<55:01,  3.29it/s]

{'loss': 1.35, 'grad_norm': 2.0107455253601074, 'learning_rate': 0.00012946152013342864, 'epoch': 0.35}


 35%|███▌      | 5932/16798 [26:28<54:01,  3.35it/s]

{'loss': 1.48, 'grad_norm': 2.033926486968994, 'learning_rate': 0.00012944960686204433, 'epoch': 0.35}


 35%|███▌      | 5933/16798 [26:28<51:58,  3.48it/s]

{'loss': 1.3239, 'grad_norm': 2.1260969638824463, 'learning_rate': 0.00012943769359066, 'epoch': 0.35}


 35%|███▌      | 5934/16798 [26:28<53:06,  3.41it/s]

{'loss': 1.4653, 'grad_norm': 1.9551929235458374, 'learning_rate': 0.00012942578031927568, 'epoch': 0.35}


 35%|███▌      | 5935/16798 [26:29<55:03,  3.29it/s]

{'loss': 1.3847, 'grad_norm': 2.3808791637420654, 'learning_rate': 0.00012941386704789135, 'epoch': 0.35}


 35%|███▌      | 5936/16798 [26:29<54:53,  3.30it/s]

{'loss': 0.8131, 'grad_norm': 1.5329058170318604, 'learning_rate': 0.00012940195377650704, 'epoch': 0.35}


 35%|███▌      | 5937/16798 [26:29<52:25,  3.45it/s]

{'loss': 1.1745, 'grad_norm': 1.8988697528839111, 'learning_rate': 0.0001293900405051227, 'epoch': 0.35}


 35%|███▌      | 5938/16798 [26:29<52:21,  3.46it/s]

{'loss': 0.9879, 'grad_norm': 2.0079874992370605, 'learning_rate': 0.0001293781272337384, 'epoch': 0.35}


 35%|███▌      | 5939/16798 [26:30<55:14,  3.28it/s]

{'loss': 1.449, 'grad_norm': 1.944870114326477, 'learning_rate': 0.00012936621396235405, 'epoch': 0.35}


 35%|███▌      | 5940/16798 [26:30<52:52,  3.42it/s]

{'loss': 1.3724, 'grad_norm': 2.130002498626709, 'learning_rate': 0.00012935430069096974, 'epoch': 0.35}


 35%|███▌      | 5941/16798 [26:30<52:05,  3.47it/s]

{'loss': 1.629, 'grad_norm': 2.265948534011841, 'learning_rate': 0.0001293423874195854, 'epoch': 0.35}


 35%|███▌      | 5942/16798 [26:31<53:06,  3.41it/s]

{'loss': 1.1121, 'grad_norm': 1.7249131202697754, 'learning_rate': 0.0001293304741482011, 'epoch': 0.35}


 35%|███▌      | 5943/16798 [26:31<53:17,  3.40it/s]

{'loss': 0.9949, 'grad_norm': 1.595278263092041, 'learning_rate': 0.00012931856087681676, 'epoch': 0.35}


 35%|███▌      | 5944/16798 [26:31<52:05,  3.47it/s]

{'loss': 0.7085, 'grad_norm': 1.3444124460220337, 'learning_rate': 0.00012930664760543245, 'epoch': 0.35}


 35%|███▌      | 5945/16798 [26:31<54:04,  3.35it/s]

{'loss': 0.5415, 'grad_norm': 1.161888599395752, 'learning_rate': 0.00012929473433404812, 'epoch': 0.35}


 35%|███▌      | 5946/16798 [26:32<54:48,  3.30it/s]

{'loss': 0.5568, 'grad_norm': 1.0908210277557373, 'learning_rate': 0.0001292828210626638, 'epoch': 0.35}


 35%|███▌      | 5947/16798 [26:32<53:26,  3.38it/s]

{'loss': 0.3419, 'grad_norm': 0.9033230543136597, 'learning_rate': 0.00012927090779127947, 'epoch': 0.35}


 35%|███▌      | 5948/16798 [26:32<53:11,  3.40it/s]

{'loss': 0.2126, 'grad_norm': 0.6373232007026672, 'learning_rate': 0.00012925899451989516, 'epoch': 0.35}


 35%|███▌      | 5949/16798 [26:33<54:18,  3.33it/s]

{'loss': 0.2453, 'grad_norm': 0.7385711073875427, 'learning_rate': 0.00012924708124851083, 'epoch': 0.35}


 35%|███▌      | 5950/16798 [26:33<53:32,  3.38it/s]

{'loss': 0.1814, 'grad_norm': 0.6339465975761414, 'learning_rate': 0.00012923516797712652, 'epoch': 0.35}


 35%|███▌      | 5951/16798 [26:33<55:23,  3.26it/s]

{'loss': 2.1055, 'grad_norm': 1.72227144241333, 'learning_rate': 0.0001292232547057422, 'epoch': 0.35}


 35%|███▌      | 5952/16798 [26:34<55:33,  3.25it/s]

{'loss': 2.1858, 'grad_norm': 2.178232431411743, 'learning_rate': 0.0001292113414343579, 'epoch': 0.35}


 35%|███▌      | 5953/16798 [26:34<55:20,  3.27it/s]

{'loss': 1.5757, 'grad_norm': 1.704091191291809, 'learning_rate': 0.00012919942816297356, 'epoch': 0.35}


 35%|███▌      | 5954/16798 [26:34<55:41,  3.25it/s]

{'loss': 1.7978, 'grad_norm': 1.837730884552002, 'learning_rate': 0.00012918751489158925, 'epoch': 0.35}


 35%|███▌      | 5955/16798 [26:34<54:50,  3.29it/s]

{'loss': 2.0502, 'grad_norm': 2.432960033416748, 'learning_rate': 0.00012917560162020492, 'epoch': 0.35}


 35%|███▌      | 5956/16798 [26:35<55:53,  3.23it/s]

{'loss': 1.781, 'grad_norm': 2.473757266998291, 'learning_rate': 0.0001291636883488206, 'epoch': 0.35}


 35%|███▌      | 5957/16798 [26:35<54:03,  3.34it/s]

{'loss': 1.9921, 'grad_norm': 2.1577367782592773, 'learning_rate': 0.00012915177507743627, 'epoch': 0.35}


 35%|███▌      | 5958/16798 [26:35<54:35,  3.31it/s]

{'loss': 1.945, 'grad_norm': 2.8621761798858643, 'learning_rate': 0.00012913986180605196, 'epoch': 0.35}


 35%|███▌      | 5959/16798 [26:36<56:17,  3.21it/s]

{'loss': 1.8969, 'grad_norm': 1.8322861194610596, 'learning_rate': 0.00012912794853466763, 'epoch': 0.35}


 35%|███▌      | 5960/16798 [26:36<55:11,  3.27it/s]

{'loss': 1.8505, 'grad_norm': 1.986583948135376, 'learning_rate': 0.00012911603526328332, 'epoch': 0.35}


 35%|███▌      | 5961/16798 [26:36<53:19,  3.39it/s]

{'loss': 2.0756, 'grad_norm': 2.2093889713287354, 'learning_rate': 0.00012910412199189898, 'epoch': 0.35}


 35%|███▌      | 5962/16798 [26:37<55:05,  3.28it/s]

{'loss': 1.6816, 'grad_norm': 1.9192389249801636, 'learning_rate': 0.00012909220872051467, 'epoch': 0.35}


 35%|███▌      | 5963/16798 [26:37<55:15,  3.27it/s]

{'loss': 1.1671, 'grad_norm': 1.5819222927093506, 'learning_rate': 0.00012908029544913033, 'epoch': 0.35}


 36%|███▌      | 5964/16798 [26:37<55:19,  3.26it/s]

{'loss': 1.5032, 'grad_norm': 2.104930877685547, 'learning_rate': 0.00012906838217774603, 'epoch': 0.36}


 36%|███▌      | 5965/16798 [26:38<53:29,  3.38it/s]

{'loss': 1.4633, 'grad_norm': 1.6462782621383667, 'learning_rate': 0.0001290564689063617, 'epoch': 0.36}


 36%|███▌      | 5966/16798 [26:38<56:29,  3.20it/s]

{'loss': 1.9865, 'grad_norm': 2.164034605026245, 'learning_rate': 0.00012904455563497738, 'epoch': 0.36}


 36%|███▌      | 5967/16798 [26:38<57:43,  3.13it/s]

{'loss': 1.8157, 'grad_norm': 2.236280679702759, 'learning_rate': 0.00012903264236359304, 'epoch': 0.36}


 36%|███▌      | 5968/16798 [26:38<55:56,  3.23it/s]

{'loss': 1.6132, 'grad_norm': 1.8173072338104248, 'learning_rate': 0.00012902072909220873, 'epoch': 0.36}


 36%|███▌      | 5969/16798 [26:39<53:30,  3.37it/s]

{'loss': 1.6252, 'grad_norm': 2.09601092338562, 'learning_rate': 0.0001290088158208244, 'epoch': 0.36}


 36%|███▌      | 5970/16798 [26:39<54:32,  3.31it/s]

{'loss': 1.6714, 'grad_norm': 2.1159210205078125, 'learning_rate': 0.0001289969025494401, 'epoch': 0.36}


 36%|███▌      | 5971/16798 [26:39<57:11,  3.16it/s]

{'loss': 1.3611, 'grad_norm': 1.6982052326202393, 'learning_rate': 0.00012898498927805575, 'epoch': 0.36}


 36%|███▌      | 5972/16798 [26:40<58:08,  3.10it/s]

{'loss': 1.2326, 'grad_norm': 1.69495689868927, 'learning_rate': 0.00012897307600667144, 'epoch': 0.36}


 36%|███▌      | 5973/16798 [26:40<55:13,  3.27it/s]

{'loss': 1.7557, 'grad_norm': 1.8481897115707397, 'learning_rate': 0.0001289611627352871, 'epoch': 0.36}


 36%|███▌      | 5974/16798 [26:40<55:11,  3.27it/s]

{'loss': 1.9017, 'grad_norm': 2.038320541381836, 'learning_rate': 0.0001289492494639028, 'epoch': 0.36}


 36%|███▌      | 5975/16798 [26:41<55:47,  3.23it/s]

{'loss': 1.146, 'grad_norm': 1.6709004640579224, 'learning_rate': 0.00012893733619251846, 'epoch': 0.36}


 36%|███▌      | 5976/16798 [26:41<53:50,  3.35it/s]

{'loss': 1.4521, 'grad_norm': 1.903779149055481, 'learning_rate': 0.00012892542292113415, 'epoch': 0.36}


 36%|███▌      | 5977/16798 [26:41<50:31,  3.57it/s]

{'loss': 1.2984, 'grad_norm': 1.6329575777053833, 'learning_rate': 0.00012891350964974982, 'epoch': 0.36}


 36%|███▌      | 5978/16798 [26:41<52:52,  3.41it/s]

{'loss': 1.2514, 'grad_norm': 1.9450349807739258, 'learning_rate': 0.0001289015963783655, 'epoch': 0.36}


 36%|███▌      | 5979/16798 [26:42<54:59,  3.28it/s]

{'loss': 1.2932, 'grad_norm': 1.671874761581421, 'learning_rate': 0.00012888968310698117, 'epoch': 0.36}


 36%|███▌      | 5980/16798 [26:42<52:53,  3.41it/s]

{'loss': 1.4332, 'grad_norm': 1.8337358236312866, 'learning_rate': 0.00012887776983559686, 'epoch': 0.36}


 36%|███▌      | 5981/16798 [26:42<56:56,  3.17it/s]

{'loss': 1.6762, 'grad_norm': 2.1442978382110596, 'learning_rate': 0.00012886585656421252, 'epoch': 0.36}


 36%|███▌      | 5982/16798 [26:43<56:15,  3.20it/s]

{'loss': 1.5436, 'grad_norm': 2.2153868675231934, 'learning_rate': 0.00012885394329282821, 'epoch': 0.36}


 36%|███▌      | 5983/16798 [26:43<57:36,  3.13it/s]

{'loss': 1.1807, 'grad_norm': 2.046840190887451, 'learning_rate': 0.0001288420300214439, 'epoch': 0.36}


 36%|███▌      | 5984/16798 [26:43<54:13,  3.32it/s]

{'loss': 1.1324, 'grad_norm': 1.7723994255065918, 'learning_rate': 0.00012883011675005957, 'epoch': 0.36}


 36%|███▌      | 5985/16798 [26:44<54:51,  3.29it/s]

{'loss': 1.1448, 'grad_norm': 1.6227647066116333, 'learning_rate': 0.00012881820347867526, 'epoch': 0.36}


 36%|███▌      | 5986/16798 [26:44<55:34,  3.24it/s]

{'loss': 1.4564, 'grad_norm': 2.3295857906341553, 'learning_rate': 0.00012880629020729092, 'epoch': 0.36}


 36%|███▌      | 5987/16798 [26:44<54:07,  3.33it/s]

{'loss': 0.9309, 'grad_norm': 3.363353967666626, 'learning_rate': 0.00012879437693590661, 'epoch': 0.36}


 36%|███▌      | 5988/16798 [26:45<52:22,  3.44it/s]

{'loss': 1.5721, 'grad_norm': 1.9050962924957275, 'learning_rate': 0.00012878246366452228, 'epoch': 0.36}


 36%|███▌      | 5989/16798 [26:45<53:39,  3.36it/s]

{'loss': 1.0595, 'grad_norm': 3.3011515140533447, 'learning_rate': 0.00012877055039313797, 'epoch': 0.36}


 36%|███▌      | 5990/16798 [26:45<54:42,  3.29it/s]

{'loss': 1.0603, 'grad_norm': 1.714227318763733, 'learning_rate': 0.00012875863712175363, 'epoch': 0.36}


 36%|███▌      | 5991/16798 [26:45<55:01,  3.27it/s]

{'loss': 1.0734, 'grad_norm': 1.6786785125732422, 'learning_rate': 0.00012874672385036932, 'epoch': 0.36}


 36%|███▌      | 5992/16798 [26:46<53:47,  3.35it/s]

{'loss': 0.9923, 'grad_norm': 2.172929048538208, 'learning_rate': 0.000128734810578985, 'epoch': 0.36}


 36%|███▌      | 5993/16798 [26:46<58:59,  3.05it/s]

{'loss': 1.0018, 'grad_norm': 1.680206060409546, 'learning_rate': 0.00012872289730760068, 'epoch': 0.36}


 36%|███▌      | 5994/16798 [26:46<56:45,  3.17it/s]

{'loss': 0.8411, 'grad_norm': 1.7430634498596191, 'learning_rate': 0.00012871098403621634, 'epoch': 0.36}


 36%|███▌      | 5995/16798 [26:47<58:03,  3.10it/s]

{'loss': 0.5381, 'grad_norm': 0.9929910898208618, 'learning_rate': 0.00012869907076483203, 'epoch': 0.36}


 36%|███▌      | 5996/16798 [26:47<57:27,  3.13it/s]

{'loss': 0.4969, 'grad_norm': 1.1773502826690674, 'learning_rate': 0.0001286871574934477, 'epoch': 0.36}


 36%|███▌      | 5997/16798 [26:47<56:38,  3.18it/s]

{'loss': 0.3991, 'grad_norm': 0.9641094207763672, 'learning_rate': 0.0001286752442220634, 'epoch': 0.36}


 36%|███▌      | 5998/16798 [26:48<54:52,  3.28it/s]

{'loss': 0.3265, 'grad_norm': 0.8626247048377991, 'learning_rate': 0.00012866333095067905, 'epoch': 0.36}


 36%|███▌      | 5999/16798 [26:48<56:03,  3.21it/s]

{'loss': 0.5888, 'grad_norm': 1.299957513809204, 'learning_rate': 0.00012865141767929474, 'epoch': 0.36}




{'loss': 0.998, 'grad_norm': 1.8976999521255493, 'learning_rate': 0.0001286395044079104, 'epoch': 0.36}


 36%|███▌      | 6001/16798 [26:51<3:09:52,  1.06s/it]

{'loss': 1.7183, 'grad_norm': 1.6185107231140137, 'learning_rate': 0.0001286275911365261, 'epoch': 0.36}


 36%|███▌      | 6002/16798 [26:51<2:28:42,  1.21it/s]

{'loss': 2.1822, 'grad_norm': 1.8976085186004639, 'learning_rate': 0.00012861567786514176, 'epoch': 0.36}


 36%|███▌      | 6003/16798 [26:52<2:03:25,  1.46it/s]

{'loss': 2.1806, 'grad_norm': 2.218898296356201, 'learning_rate': 0.00012860376459375745, 'epoch': 0.36}


 36%|███▌      | 6004/16798 [26:52<1:40:37,  1.79it/s]

{'loss': 2.0779, 'grad_norm': 1.9208033084869385, 'learning_rate': 0.0001285918513223731, 'epoch': 0.36}


 36%|███▌      | 6005/16798 [26:52<1:28:27,  2.03it/s]

{'loss': 2.0289, 'grad_norm': 1.9404542446136475, 'learning_rate': 0.0001285799380509888, 'epoch': 0.36}


 36%|███▌      | 6006/16798 [26:53<1:19:32,  2.26it/s]

{'loss': 2.294, 'grad_norm': 4.655871391296387, 'learning_rate': 0.00012856802477960447, 'epoch': 0.36}


 36%|███▌      | 6007/16798 [26:53<1:14:08,  2.43it/s]

{'loss': 2.2609, 'grad_norm': 2.0355868339538574, 'learning_rate': 0.00012855611150822016, 'epoch': 0.36}


 36%|███▌      | 6008/16798 [26:53<1:10:51,  2.54it/s]

{'loss': 1.8611, 'grad_norm': 1.8598359823226929, 'learning_rate': 0.00012854419823683582, 'epoch': 0.36}


 36%|███▌      | 6009/16798 [26:54<1:05:21,  2.75it/s]

{'loss': 1.9747, 'grad_norm': 1.7742375135421753, 'learning_rate': 0.0001285322849654515, 'epoch': 0.36}


 36%|███▌      | 6010/16798 [26:54<1:02:33,  2.87it/s]

{'loss': 2.297, 'grad_norm': 2.398834705352783, 'learning_rate': 0.00012852037169406718, 'epoch': 0.36}


 36%|███▌      | 6011/16798 [26:54<1:03:48,  2.82it/s]

{'loss': 1.9266, 'grad_norm': 2.2980077266693115, 'learning_rate': 0.00012850845842268287, 'epoch': 0.36}


 36%|███▌      | 6012/16798 [26:55<59:39,  3.01it/s]  

{'loss': 1.5212, 'grad_norm': 1.691563367843628, 'learning_rate': 0.00012849654515129853, 'epoch': 0.36}


 36%|███▌      | 6013/16798 [26:55<58:40,  3.06it/s]

{'loss': 1.7387, 'grad_norm': 2.025482416152954, 'learning_rate': 0.00012848463187991425, 'epoch': 0.36}


 36%|███▌      | 6014/16798 [26:55<57:17,  3.14it/s]

{'loss': 1.7673, 'grad_norm': 1.7620010375976562, 'learning_rate': 0.0001284727186085299, 'epoch': 0.36}


 36%|███▌      | 6015/16798 [26:56<56:27,  3.18it/s]

{'loss': 1.6091, 'grad_norm': 2.0340821743011475, 'learning_rate': 0.0001284608053371456, 'epoch': 0.36}


 36%|███▌      | 6016/16798 [26:56<55:11,  3.26it/s]

{'loss': 1.3651, 'grad_norm': 1.7873153686523438, 'learning_rate': 0.00012844889206576127, 'epoch': 0.36}


 36%|███▌      | 6017/16798 [26:56<53:53,  3.33it/s]

{'loss': 1.587, 'grad_norm': 2.272561550140381, 'learning_rate': 0.00012843697879437696, 'epoch': 0.36}


 36%|███▌      | 6018/16798 [26:56<54:29,  3.30it/s]

{'loss': 1.7074, 'grad_norm': 1.862060785293579, 'learning_rate': 0.00012842506552299262, 'epoch': 0.36}


 36%|███▌      | 6019/16798 [26:57<55:46,  3.22it/s]

{'loss': 1.4351, 'grad_norm': 1.8379955291748047, 'learning_rate': 0.0001284131522516083, 'epoch': 0.36}


 36%|███▌      | 6020/16798 [26:57<55:53,  3.21it/s]

{'loss': 1.6675, 'grad_norm': 1.8202695846557617, 'learning_rate': 0.00012840123898022398, 'epoch': 0.36}


 36%|███▌      | 6021/16798 [26:57<59:26,  3.02it/s]

{'loss': 1.6101, 'grad_norm': 1.8794254064559937, 'learning_rate': 0.00012838932570883967, 'epoch': 0.36}


 36%|███▌      | 6022/16798 [26:58<57:20,  3.13it/s]

{'loss': 1.3616, 'grad_norm': 1.992193341255188, 'learning_rate': 0.00012837741243745533, 'epoch': 0.36}


 36%|███▌      | 6023/16798 [26:58<57:16,  3.14it/s]

{'loss': 1.4382, 'grad_norm': 1.726043939590454, 'learning_rate': 0.00012836549916607102, 'epoch': 0.36}


 36%|███▌      | 6024/16798 [26:58<56:53,  3.16it/s]

{'loss': 1.6144, 'grad_norm': 1.9471379518508911, 'learning_rate': 0.00012835358589468668, 'epoch': 0.36}


 36%|███▌      | 6025/16798 [26:59<57:56,  3.10it/s]

{'loss': 1.383, 'grad_norm': 1.9679770469665527, 'learning_rate': 0.00012834167262330238, 'epoch': 0.36}


 36%|███▌      | 6026/16798 [26:59<56:36,  3.17it/s]

{'loss': 1.3646, 'grad_norm': 2.1689820289611816, 'learning_rate': 0.00012832975935191804, 'epoch': 0.36}


 36%|███▌      | 6027/16798 [26:59<55:30,  3.23it/s]

{'loss': 1.424, 'grad_norm': 1.9431548118591309, 'learning_rate': 0.00012831784608053373, 'epoch': 0.36}


 36%|███▌      | 6028/16798 [27:00<54:18,  3.31it/s]

{'loss': 1.524, 'grad_norm': 2.0267107486724854, 'learning_rate': 0.0001283059328091494, 'epoch': 0.36}


 36%|███▌      | 6029/16798 [27:00<55:28,  3.24it/s]

{'loss': 1.1546, 'grad_norm': 2.8904216289520264, 'learning_rate': 0.00012829401953776508, 'epoch': 0.36}


 36%|███▌      | 6030/16798 [27:00<59:55,  2.99it/s]

{'loss': 1.2133, 'grad_norm': 2.526151180267334, 'learning_rate': 0.00012828210626638075, 'epoch': 0.36}


 36%|███▌      | 6031/16798 [27:01<58:11,  3.08it/s]

{'loss': 1.3427, 'grad_norm': 1.66785728931427, 'learning_rate': 0.00012827019299499644, 'epoch': 0.36}


 36%|███▌      | 6032/16798 [27:01<57:50,  3.10it/s]

{'loss': 1.7221, 'grad_norm': 2.581232786178589, 'learning_rate': 0.0001282582797236121, 'epoch': 0.36}


 36%|███▌      | 6033/16798 [27:01<55:02,  3.26it/s]

{'loss': 1.7554, 'grad_norm': 2.2815659046173096, 'learning_rate': 0.0001282463664522278, 'epoch': 0.36}


 36%|███▌      | 6034/16798 [27:01<54:59,  3.26it/s]

{'loss': 0.8233, 'grad_norm': 1.4239064455032349, 'learning_rate': 0.00012823445318084346, 'epoch': 0.36}


 36%|███▌      | 6035/16798 [27:02<54:51,  3.27it/s]

{'loss': 1.1006, 'grad_norm': 1.9085915088653564, 'learning_rate': 0.00012822253990945915, 'epoch': 0.36}


 36%|███▌      | 6036/16798 [27:02<59:03,  3.04it/s]

{'loss': 1.2183, 'grad_norm': 1.7100359201431274, 'learning_rate': 0.0001282106266380748, 'epoch': 0.36}


 36%|███▌      | 6037/16798 [27:02<59:15,  3.03it/s]

{'loss': 1.1936, 'grad_norm': 1.5618646144866943, 'learning_rate': 0.0001281987133666905, 'epoch': 0.36}


 36%|███▌      | 6038/16798 [27:03<58:39,  3.06it/s]

{'loss': 0.9835, 'grad_norm': 1.9151740074157715, 'learning_rate': 0.00012818680009530617, 'epoch': 0.36}


 36%|███▌      | 6039/16798 [27:03<58:03,  3.09it/s]

{'loss': 1.3637, 'grad_norm': 1.8507053852081299, 'learning_rate': 0.00012817488682392186, 'epoch': 0.36}


 36%|███▌      | 6040/16798 [27:03<54:02,  3.32it/s]

{'loss': 1.2235, 'grad_norm': 1.6484196186065674, 'learning_rate': 0.00012816297355253752, 'epoch': 0.36}


 36%|███▌      | 6041/16798 [27:04<54:29,  3.29it/s]

{'loss': 1.3471, 'grad_norm': 1.9253898859024048, 'learning_rate': 0.0001281510602811532, 'epoch': 0.36}


 36%|███▌      | 6042/16798 [27:04<52:30,  3.41it/s]

{'loss': 1.4881, 'grad_norm': 1.9962337017059326, 'learning_rate': 0.00012813914700976887, 'epoch': 0.36}


 36%|███▌      | 6043/16798 [27:04<53:54,  3.32it/s]

{'loss': 1.0375, 'grad_norm': 1.6908578872680664, 'learning_rate': 0.00012812723373838457, 'epoch': 0.36}


 36%|███▌      | 6044/16798 [27:05<51:37,  3.47it/s]

{'loss': 0.9966, 'grad_norm': 1.637224793434143, 'learning_rate': 0.00012811532046700026, 'epoch': 0.36}


 36%|███▌      | 6045/16798 [27:05<54:57,  3.26it/s]

{'loss': 0.7485, 'grad_norm': 1.373752474784851, 'learning_rate': 0.00012810340719561592, 'epoch': 0.36}


 36%|███▌      | 6046/16798 [27:05<52:33,  3.41it/s]

{'loss': 0.7299, 'grad_norm': 1.4846001863479614, 'learning_rate': 0.0001280914939242316, 'epoch': 0.36}


 36%|███▌      | 6047/16798 [27:05<54:55,  3.26it/s]

{'loss': 0.7358, 'grad_norm': 1.2940644025802612, 'learning_rate': 0.00012807958065284727, 'epoch': 0.36}


 36%|███▌      | 6048/16798 [27:06<56:33,  3.17it/s]

{'loss': 0.6252, 'grad_norm': 1.2328381538391113, 'learning_rate': 0.00012806766738146296, 'epoch': 0.36}


 36%|███▌      | 6049/16798 [27:06<55:28,  3.23it/s]

{'loss': 0.808, 'grad_norm': 1.5624635219573975, 'learning_rate': 0.00012805575411007863, 'epoch': 0.36}


 36%|███▌      | 6050/16798 [27:06<58:23,  3.07it/s]

{'loss': 0.4131, 'grad_norm': 0.8721544742584229, 'learning_rate': 0.00012804384083869432, 'epoch': 0.36}


 36%|███▌      | 6051/16798 [27:07<57:55,  3.09it/s]

{'loss': 2.6319, 'grad_norm': 2.34618878364563, 'learning_rate': 0.00012803192756730998, 'epoch': 0.36}


 36%|███▌      | 6052/16798 [27:07<55:52,  3.20it/s]

{'loss': 2.1383, 'grad_norm': 2.3051483631134033, 'learning_rate': 0.00012802001429592567, 'epoch': 0.36}


 36%|███▌      | 6053/16798 [27:07<56:20,  3.18it/s]

{'loss': 1.8139, 'grad_norm': 2.313450813293457, 'learning_rate': 0.00012800810102454134, 'epoch': 0.36}


 36%|███▌      | 6054/16798 [27:08<58:40,  3.05it/s]

{'loss': 1.2492, 'grad_norm': 1.8452759981155396, 'learning_rate': 0.00012799618775315703, 'epoch': 0.36}


 36%|███▌      | 6055/16798 [27:08<54:50,  3.26it/s]

{'loss': 1.3135, 'grad_norm': 2.285456657409668, 'learning_rate': 0.0001279842744817727, 'epoch': 0.36}


 36%|███▌      | 6056/16798 [27:08<1:00:09,  2.98it/s]

{'loss': 1.4151, 'grad_norm': 1.9123315811157227, 'learning_rate': 0.00012797236121038838, 'epoch': 0.36}


 36%|███▌      | 6057/16798 [27:09<56:37,  3.16it/s]  

{'loss': 1.5449, 'grad_norm': 1.872002124786377, 'learning_rate': 0.00012796044793900405, 'epoch': 0.36}


 36%|███▌      | 6058/16798 [27:09<56:56,  3.14it/s]

{'loss': 1.7562, 'grad_norm': 2.244062900543213, 'learning_rate': 0.00012794853466761974, 'epoch': 0.36}


 36%|███▌      | 6059/16798 [27:09<55:46,  3.21it/s]

{'loss': 1.3492, 'grad_norm': 2.8965530395507812, 'learning_rate': 0.0001279366213962354, 'epoch': 0.36}


 36%|███▌      | 6060/16798 [27:10<56:56,  3.14it/s]

{'loss': 1.5478, 'grad_norm': 1.8850293159484863, 'learning_rate': 0.0001279247081248511, 'epoch': 0.36}


 36%|███▌      | 6061/16798 [27:10<54:13,  3.30it/s]

{'loss': 1.7221, 'grad_norm': 1.971800684928894, 'learning_rate': 0.00012791279485346676, 'epoch': 0.36}


 36%|███▌      | 6062/16798 [27:10<53:54,  3.32it/s]

{'loss': 1.5336, 'grad_norm': 2.099789619445801, 'learning_rate': 0.00012790088158208245, 'epoch': 0.36}


 36%|███▌      | 6063/16798 [27:11<55:08,  3.24it/s]

{'loss': 2.0514, 'grad_norm': 2.4131717681884766, 'learning_rate': 0.0001278889683106981, 'epoch': 0.36}


 36%|███▌      | 6064/16798 [27:11<56:31,  3.16it/s]

{'loss': 1.5072, 'grad_norm': 2.262601137161255, 'learning_rate': 0.0001278770550393138, 'epoch': 0.36}


 36%|███▌      | 6065/16798 [27:11<53:53,  3.32it/s]

{'loss': 1.9928, 'grad_norm': 2.2139530181884766, 'learning_rate': 0.00012786514176792946, 'epoch': 0.36}


 36%|███▌      | 6066/16798 [27:11<52:30,  3.41it/s]

{'loss': 1.1717, 'grad_norm': 1.5307774543762207, 'learning_rate': 0.00012785322849654515, 'epoch': 0.36}


 36%|███▌      | 6067/16798 [27:12<53:17,  3.36it/s]

{'loss': 1.6956, 'grad_norm': 2.270562171936035, 'learning_rate': 0.00012784131522516082, 'epoch': 0.36}


 36%|███▌      | 6068/16798 [27:12<54:50,  3.26it/s]

{'loss': 1.7751, 'grad_norm': 2.3902416229248047, 'learning_rate': 0.0001278294019537765, 'epoch': 0.36}


 36%|███▌      | 6069/16798 [27:12<53:15,  3.36it/s]

{'loss': 1.4119, 'grad_norm': 1.9762742519378662, 'learning_rate': 0.00012781748868239217, 'epoch': 0.36}


 36%|███▌      | 6070/16798 [27:13<53:30,  3.34it/s]

{'loss': 1.5793, 'grad_norm': 1.9053622484207153, 'learning_rate': 0.00012780557541100786, 'epoch': 0.36}


 36%|███▌      | 6071/16798 [27:13<53:07,  3.37it/s]

{'loss': 1.8824, 'grad_norm': 1.9844588041305542, 'learning_rate': 0.00012779366213962353, 'epoch': 0.36}


 36%|███▌      | 6072/16798 [27:13<54:02,  3.31it/s]

{'loss': 1.7058, 'grad_norm': 1.8452666997909546, 'learning_rate': 0.00012778174886823922, 'epoch': 0.36}


 36%|███▌      | 6073/16798 [27:14<53:36,  3.33it/s]

{'loss': 1.2075, 'grad_norm': 1.6747491359710693, 'learning_rate': 0.00012776983559685488, 'epoch': 0.36}


 36%|███▌      | 6074/16798 [27:14<56:22,  3.17it/s]

{'loss': 2.149, 'grad_norm': 2.4896836280822754, 'learning_rate': 0.00012775792232547057, 'epoch': 0.36}


 36%|███▌      | 6075/16798 [27:14<54:56,  3.25it/s]

{'loss': 1.5527, 'grad_norm': 2.023704767227173, 'learning_rate': 0.00012774600905408626, 'epoch': 0.36}


 36%|███▌      | 6076/16798 [27:15<57:05,  3.13it/s]

{'loss': 1.6219, 'grad_norm': 1.743049144744873, 'learning_rate': 0.00012773409578270195, 'epoch': 0.36}


 36%|███▌      | 6077/16798 [27:15<55:20,  3.23it/s]

{'loss': 1.4501, 'grad_norm': 1.8036754131317139, 'learning_rate': 0.00012772218251131762, 'epoch': 0.36}


 36%|███▌      | 6078/16798 [27:15<53:16,  3.35it/s]

{'loss': 1.2694, 'grad_norm': 1.708328127861023, 'learning_rate': 0.0001277102692399333, 'epoch': 0.36}


 36%|███▌      | 6079/16798 [27:15<52:37,  3.39it/s]

{'loss': 0.8991, 'grad_norm': 1.4169882535934448, 'learning_rate': 0.00012769835596854897, 'epoch': 0.36}


 36%|███▌      | 6080/16798 [27:16<54:14,  3.29it/s]

{'loss': 1.2118, 'grad_norm': 1.7658040523529053, 'learning_rate': 0.00012768644269716466, 'epoch': 0.36}


 36%|███▌      | 6081/16798 [27:16<56:00,  3.19it/s]

{'loss': 1.6999, 'grad_norm': 2.092174768447876, 'learning_rate': 0.00012767452942578033, 'epoch': 0.36}


 36%|███▌      | 6082/16798 [27:16<55:57,  3.19it/s]

{'loss': 1.6938, 'grad_norm': 1.9276349544525146, 'learning_rate': 0.00012766261615439602, 'epoch': 0.36}


 36%|███▌      | 6083/16798 [27:17<55:26,  3.22it/s]

{'loss': 1.2364, 'grad_norm': 1.6936010122299194, 'learning_rate': 0.00012765070288301168, 'epoch': 0.36}


 36%|███▌      | 6084/16798 [27:17<53:03,  3.37it/s]

{'loss': 1.4605, 'grad_norm': 1.8633472919464111, 'learning_rate': 0.00012763878961162737, 'epoch': 0.36}


 36%|███▌      | 6085/16798 [27:17<53:44,  3.32it/s]

{'loss': 1.3898, 'grad_norm': 1.6842173337936401, 'learning_rate': 0.00012762687634024304, 'epoch': 0.36}


 36%|███▌      | 6086/16798 [27:18<53:50,  3.32it/s]

{'loss': 1.5352, 'grad_norm': 1.9839845895767212, 'learning_rate': 0.00012761496306885873, 'epoch': 0.36}


 36%|███▌      | 6087/16798 [27:18<53:30,  3.34it/s]

{'loss': 1.286, 'grad_norm': 1.9792765378952026, 'learning_rate': 0.0001276030497974744, 'epoch': 0.36}


 36%|███▌      | 6088/16798 [27:18<53:54,  3.31it/s]

{'loss': 1.1059, 'grad_norm': 2.4079277515411377, 'learning_rate': 0.00012759113652609008, 'epoch': 0.36}


 36%|███▌      | 6089/16798 [27:18<53:07,  3.36it/s]

{'loss': 1.4292, 'grad_norm': 2.1479852199554443, 'learning_rate': 0.00012757922325470574, 'epoch': 0.36}


 36%|███▋      | 6090/16798 [27:19<52:50,  3.38it/s]

{'loss': 1.2313, 'grad_norm': 1.8086017370224, 'learning_rate': 0.00012756730998332143, 'epoch': 0.36}


 36%|███▋      | 6091/16798 [27:19<53:49,  3.32it/s]

{'loss': 1.2161, 'grad_norm': 1.8128286600112915, 'learning_rate': 0.0001275553967119371, 'epoch': 0.36}


 36%|███▋      | 6092/16798 [27:19<51:57,  3.43it/s]

{'loss': 1.3764, 'grad_norm': 2.0992887020111084, 'learning_rate': 0.0001275434834405528, 'epoch': 0.36}


 36%|███▋      | 6093/16798 [27:20<51:34,  3.46it/s]

{'loss': 0.8134, 'grad_norm': 1.939827561378479, 'learning_rate': 0.00012753157016916845, 'epoch': 0.36}


 36%|███▋      | 6094/16798 [27:20<50:03,  3.56it/s]

{'loss': 1.0765, 'grad_norm': 1.8001512289047241, 'learning_rate': 0.00012751965689778414, 'epoch': 0.36}


 36%|███▋      | 6095/16798 [27:20<53:08,  3.36it/s]

{'loss': 1.2288, 'grad_norm': 1.7155463695526123, 'learning_rate': 0.0001275077436263998, 'epoch': 0.36}


 36%|███▋      | 6096/16798 [27:21<55:00,  3.24it/s]

{'loss': 0.2949, 'grad_norm': 0.7237725257873535, 'learning_rate': 0.0001274958303550155, 'epoch': 0.36}


 36%|███▋      | 6097/16798 [27:21<55:21,  3.22it/s]

{'loss': 0.3768, 'grad_norm': 0.8483150005340576, 'learning_rate': 0.00012748391708363116, 'epoch': 0.36}


 36%|███▋      | 6098/16798 [27:21<52:30,  3.40it/s]

{'loss': 0.4035, 'grad_norm': 0.9225760698318481, 'learning_rate': 0.00012747200381224685, 'epoch': 0.36}


 36%|███▋      | 6099/16798 [27:21<50:28,  3.53it/s]

{'loss': 0.2107, 'grad_norm': 0.6470629572868347, 'learning_rate': 0.00012746009054086252, 'epoch': 0.36}


 36%|███▋      | 6100/16798 [27:22<52:53,  3.37it/s]

{'loss': 0.7636, 'grad_norm': 1.5681253671646118, 'learning_rate': 0.0001274481772694782, 'epoch': 0.36}


 36%|███▋      | 6101/16798 [27:22<53:44,  3.32it/s]

{'loss': 1.9778, 'grad_norm': 1.8655612468719482, 'learning_rate': 0.00012743626399809387, 'epoch': 0.36}


 36%|███▋      | 6102/16798 [27:22<53:42,  3.32it/s]

{'loss': 1.6517, 'grad_norm': 1.6867527961730957, 'learning_rate': 0.00012742435072670956, 'epoch': 0.36}


 36%|███▋      | 6103/16798 [27:23<55:21,  3.22it/s]

{'loss': 2.0484, 'grad_norm': 2.0570051670074463, 'learning_rate': 0.00012741243745532523, 'epoch': 0.36}


 36%|███▋      | 6104/16798 [27:23<54:51,  3.25it/s]

{'loss': 2.3195, 'grad_norm': 2.1879169940948486, 'learning_rate': 0.00012740052418394092, 'epoch': 0.36}


 36%|███▋      | 6105/16798 [27:23<54:04,  3.30it/s]

{'loss': 1.7946, 'grad_norm': 1.9401706457138062, 'learning_rate': 0.0001273886109125566, 'epoch': 0.36}


 36%|███▋      | 6106/16798 [27:24<55:48,  3.19it/s]

{'loss': 2.4395, 'grad_norm': 2.4911727905273438, 'learning_rate': 0.00012737669764117227, 'epoch': 0.36}


 36%|███▋      | 6107/16798 [27:24<58:27,  3.05it/s]

{'loss': 1.6149, 'grad_norm': 1.698948621749878, 'learning_rate': 0.00012736478436978796, 'epoch': 0.36}


 36%|███▋      | 6108/16798 [27:24<55:57,  3.18it/s]

{'loss': 1.957, 'grad_norm': 2.2647194862365723, 'learning_rate': 0.00012735287109840362, 'epoch': 0.36}


 36%|███▋      | 6109/16798 [27:24<55:51,  3.19it/s]

{'loss': 1.5801, 'grad_norm': 1.8525457382202148, 'learning_rate': 0.00012734095782701932, 'epoch': 0.36}


 36%|███▋      | 6110/16798 [27:25<55:38,  3.20it/s]

{'loss': 1.8745, 'grad_norm': 2.2297825813293457, 'learning_rate': 0.00012732904455563498, 'epoch': 0.36}


 36%|███▋      | 6111/16798 [27:25<53:12,  3.35it/s]

{'loss': 1.5541, 'grad_norm': 2.518616199493408, 'learning_rate': 0.00012731713128425067, 'epoch': 0.36}


 36%|███▋      | 6112/16798 [27:25<53:02,  3.36it/s]

{'loss': 1.7458, 'grad_norm': 2.0316433906555176, 'learning_rate': 0.00012730521801286633, 'epoch': 0.36}


 36%|███▋      | 6113/16798 [27:26<56:04,  3.18it/s]

{'loss': 1.7053, 'grad_norm': 2.6198372840881348, 'learning_rate': 0.00012729330474148202, 'epoch': 0.36}


 36%|███▋      | 6114/16798 [27:26<55:30,  3.21it/s]

{'loss': 1.5244, 'grad_norm': 2.092055320739746, 'learning_rate': 0.0001272813914700977, 'epoch': 0.36}


 36%|███▋      | 6115/16798 [27:26<54:17,  3.28it/s]

{'loss': 1.502, 'grad_norm': 1.733860969543457, 'learning_rate': 0.00012726947819871338, 'epoch': 0.36}


 36%|███▋      | 6116/16798 [27:27<58:30,  3.04it/s]

{'loss': 1.4311, 'grad_norm': 2.089935302734375, 'learning_rate': 0.00012725756492732904, 'epoch': 0.36}


 36%|███▋      | 6117/16798 [27:27<57:38,  3.09it/s]

{'loss': 1.7193, 'grad_norm': 2.068615674972534, 'learning_rate': 0.00012724565165594473, 'epoch': 0.36}


 36%|███▋      | 6118/16798 [27:27<58:29,  3.04it/s]

{'loss': 1.4908, 'grad_norm': 2.005709171295166, 'learning_rate': 0.0001272337383845604, 'epoch': 0.36}


 36%|███▋      | 6119/16798 [27:28<57:43,  3.08it/s]

{'loss': 1.4557, 'grad_norm': 1.9732857942581177, 'learning_rate': 0.0001272218251131761, 'epoch': 0.36}


 36%|███▋      | 6120/16798 [27:28<54:59,  3.24it/s]

{'loss': 1.3122, 'grad_norm': 1.6435492038726807, 'learning_rate': 0.00012720991184179175, 'epoch': 0.36}


 36%|███▋      | 6121/16798 [27:28<55:55,  3.18it/s]

{'loss': 1.571, 'grad_norm': 1.876676321029663, 'learning_rate': 0.00012719799857040744, 'epoch': 0.36}


 36%|███▋      | 6122/16798 [27:29<55:18,  3.22it/s]

{'loss': 1.2183, 'grad_norm': 1.598834753036499, 'learning_rate': 0.0001271860852990231, 'epoch': 0.36}


 36%|███▋      | 6123/16798 [27:29<56:14,  3.16it/s]

{'loss': 1.6628, 'grad_norm': 2.4956214427948, 'learning_rate': 0.0001271741720276388, 'epoch': 0.36}


 36%|███▋      | 6124/16798 [27:29<56:20,  3.16it/s]

{'loss': 1.6566, 'grad_norm': 2.33772611618042, 'learning_rate': 0.00012716225875625446, 'epoch': 0.36}


 36%|███▋      | 6125/16798 [27:30<55:58,  3.18it/s]

{'loss': 1.0702, 'grad_norm': 1.6329574584960938, 'learning_rate': 0.00012715034548487015, 'epoch': 0.36}


 36%|███▋      | 6126/16798 [27:30<54:50,  3.24it/s]

{'loss': 2.0286, 'grad_norm': 2.5819332599639893, 'learning_rate': 0.00012713843221348581, 'epoch': 0.36}


 36%|███▋      | 6127/16798 [27:30<53:34,  3.32it/s]

{'loss': 1.3124, 'grad_norm': 4.8772783279418945, 'learning_rate': 0.0001271265189421015, 'epoch': 0.36}


 36%|███▋      | 6128/16798 [27:30<52:06,  3.41it/s]

{'loss': 1.3458, 'grad_norm': 2.0560851097106934, 'learning_rate': 0.00012711460567071717, 'epoch': 0.36}


 36%|███▋      | 6129/16798 [27:31<54:39,  3.25it/s]

{'loss': 1.1626, 'grad_norm': 1.6556947231292725, 'learning_rate': 0.00012710269239933286, 'epoch': 0.36}


 36%|███▋      | 6130/16798 [27:31<51:26,  3.46it/s]

{'loss': 1.6272, 'grad_norm': 2.4133877754211426, 'learning_rate': 0.00012709077912794852, 'epoch': 0.36}


 36%|███▋      | 6131/16798 [27:31<54:39,  3.25it/s]

{'loss': 1.4921, 'grad_norm': 2.0706233978271484, 'learning_rate': 0.00012707886585656421, 'epoch': 0.36}


 37%|███▋      | 6132/16798 [27:32<54:34,  3.26it/s]

{'loss': 1.4683, 'grad_norm': 2.158917188644409, 'learning_rate': 0.00012706695258517988, 'epoch': 0.37}


 37%|███▋      | 6133/16798 [27:32<55:12,  3.22it/s]

{'loss': 1.4279, 'grad_norm': 1.9803214073181152, 'learning_rate': 0.00012705503931379557, 'epoch': 0.37}


 37%|███▋      | 6134/16798 [27:32<53:38,  3.31it/s]

{'loss': 1.2615, 'grad_norm': 1.996121883392334, 'learning_rate': 0.00012704312604241123, 'epoch': 0.37}


 37%|███▋      | 6135/16798 [27:32<50:03,  3.55it/s]

{'loss': 0.9502, 'grad_norm': 1.935339331626892, 'learning_rate': 0.00012703121277102692, 'epoch': 0.37}


 37%|███▋      | 6136/16798 [27:33<52:34,  3.38it/s]

{'loss': 0.995, 'grad_norm': 1.8328882455825806, 'learning_rate': 0.00012701929949964261, 'epoch': 0.37}


 37%|███▋      | 6137/16798 [27:33<54:39,  3.25it/s]

{'loss': 0.9827, 'grad_norm': 1.658305048942566, 'learning_rate': 0.0001270073862282583, 'epoch': 0.37}


 37%|███▋      | 6138/16798 [27:33<57:22,  3.10it/s]

{'loss': 0.8547, 'grad_norm': 1.6803921461105347, 'learning_rate': 0.00012699547295687397, 'epoch': 0.37}


 37%|███▋      | 6139/16798 [27:34<53:35,  3.31it/s]

{'loss': 1.4175, 'grad_norm': 2.2768821716308594, 'learning_rate': 0.00012698355968548966, 'epoch': 0.37}


 37%|███▋      | 6140/16798 [27:34<52:50,  3.36it/s]

{'loss': 1.6149, 'grad_norm': 1.9320465326309204, 'learning_rate': 0.00012697164641410532, 'epoch': 0.37}


 37%|███▋      | 6141/16798 [27:34<53:14,  3.34it/s]

{'loss': 1.1121, 'grad_norm': 1.6841998100280762, 'learning_rate': 0.000126959733142721, 'epoch': 0.37}


 37%|███▋      | 6142/16798 [27:35<55:23,  3.21it/s]

{'loss': 0.664, 'grad_norm': 1.4232687950134277, 'learning_rate': 0.00012694781987133668, 'epoch': 0.37}


 37%|███▋      | 6143/16798 [27:35<56:03,  3.17it/s]

{'loss': 0.8397, 'grad_norm': 1.4571374654769897, 'learning_rate': 0.00012693590659995237, 'epoch': 0.37}


 37%|███▋      | 6144/16798 [27:35<50:57,  3.48it/s]

{'loss': 1.3131, 'grad_norm': 2.2789628505706787, 'learning_rate': 0.00012692399332856803, 'epoch': 0.37}


 37%|███▋      | 6145/16798 [27:36<52:58,  3.35it/s]

{'loss': 0.8763, 'grad_norm': 1.797011137008667, 'learning_rate': 0.00012691208005718372, 'epoch': 0.37}


 37%|███▋      | 6146/16798 [27:36<53:50,  3.30it/s]

{'loss': 1.0187, 'grad_norm': 2.0315322875976562, 'learning_rate': 0.00012690016678579939, 'epoch': 0.37}


 37%|███▋      | 6147/16798 [27:36<51:29,  3.45it/s]

{'loss': 1.0606, 'grad_norm': 1.652567982673645, 'learning_rate': 0.00012688825351441508, 'epoch': 0.37}


 37%|███▋      | 6148/16798 [27:36<53:35,  3.31it/s]

{'loss': 0.6038, 'grad_norm': 1.2088348865509033, 'learning_rate': 0.00012687634024303074, 'epoch': 0.37}


 37%|███▋      | 6149/16798 [27:37<55:55,  3.17it/s]

{'loss': 0.6309, 'grad_norm': 1.4629216194152832, 'learning_rate': 0.00012686442697164643, 'epoch': 0.37}


 37%|███▋      | 6150/16798 [27:37<54:50,  3.24it/s]

{'loss': 0.3699, 'grad_norm': 0.808079719543457, 'learning_rate': 0.0001268525137002621, 'epoch': 0.37}


 37%|███▋      | 6151/16798 [27:37<56:46,  3.13it/s]

{'loss': 2.0903, 'grad_norm': 1.9346990585327148, 'learning_rate': 0.00012684060042887779, 'epoch': 0.37}


 37%|███▋      | 6152/16798 [27:38<57:12,  3.10it/s]

{'loss': 1.9274, 'grad_norm': 2.075521230697632, 'learning_rate': 0.00012682868715749345, 'epoch': 0.37}


 37%|███▋      | 6153/16798 [27:38<57:12,  3.10it/s]

{'loss': 2.163, 'grad_norm': 2.0404491424560547, 'learning_rate': 0.00012681677388610914, 'epoch': 0.37}


 37%|███▋      | 6154/16798 [27:38<54:06,  3.28it/s]

{'loss': 1.799, 'grad_norm': 2.1961417198181152, 'learning_rate': 0.0001268048606147248, 'epoch': 0.37}


 37%|███▋      | 6155/16798 [27:39<53:15,  3.33it/s]

{'loss': 2.2819, 'grad_norm': 2.188396692276001, 'learning_rate': 0.0001267929473433405, 'epoch': 0.37}


 37%|███▋      | 6156/16798 [27:39<57:01,  3.11it/s]

{'loss': 1.8979, 'grad_norm': 2.6161162853240967, 'learning_rate': 0.00012678103407195616, 'epoch': 0.37}


 37%|███▋      | 6157/16798 [27:39<58:21,  3.04it/s]

{'loss': 2.356, 'grad_norm': 2.411691904067993, 'learning_rate': 0.00012676912080057185, 'epoch': 0.37}


 37%|███▋      | 6158/16798 [27:40<55:41,  3.18it/s]

{'loss': 2.112, 'grad_norm': 2.488459825515747, 'learning_rate': 0.0001267572075291875, 'epoch': 0.37}


 37%|███▋      | 6159/16798 [27:40<58:23,  3.04it/s]

{'loss': 1.9349, 'grad_norm': 2.394904136657715, 'learning_rate': 0.0001267452942578032, 'epoch': 0.37}


 37%|███▋      | 6160/16798 [27:40<53:30,  3.31it/s]

{'loss': 1.815, 'grad_norm': 1.9815599918365479, 'learning_rate': 0.00012673338098641887, 'epoch': 0.37}


 37%|███▋      | 6161/16798 [27:41<54:09,  3.27it/s]

{'loss': 1.3801, 'grad_norm': 1.6247758865356445, 'learning_rate': 0.00012672146771503456, 'epoch': 0.37}


 37%|███▋      | 6162/16798 [27:41<54:34,  3.25it/s]

{'loss': 1.2766, 'grad_norm': 2.4752612113952637, 'learning_rate': 0.00012670955444365022, 'epoch': 0.37}


 37%|███▋      | 6163/16798 [27:41<52:11,  3.40it/s]

{'loss': 1.4575, 'grad_norm': 1.9808909893035889, 'learning_rate': 0.0001266976411722659, 'epoch': 0.37}


 37%|███▋      | 6164/16798 [27:41<52:46,  3.36it/s]

{'loss': 1.6301, 'grad_norm': 1.7312870025634766, 'learning_rate': 0.00012668572790088158, 'epoch': 0.37}


 37%|███▋      | 6165/16798 [27:42<53:32,  3.31it/s]

{'loss': 1.4013, 'grad_norm': 1.7415794134140015, 'learning_rate': 0.00012667381462949727, 'epoch': 0.37}


 37%|███▋      | 6166/16798 [27:42<54:35,  3.25it/s]

{'loss': 1.7715, 'grad_norm': 3.0745389461517334, 'learning_rate': 0.00012666190135811293, 'epoch': 0.37}


 37%|███▋      | 6167/16798 [27:42<58:09,  3.05it/s]

{'loss': 2.1059, 'grad_norm': 2.2377610206604004, 'learning_rate': 0.00012664998808672862, 'epoch': 0.37}


 37%|███▋      | 6168/16798 [27:43<56:22,  3.14it/s]

{'loss': 1.5934, 'grad_norm': 1.7793084383010864, 'learning_rate': 0.0001266380748153443, 'epoch': 0.37}


 37%|███▋      | 6169/16798 [27:43<55:15,  3.21it/s]

{'loss': 1.5755, 'grad_norm': 1.907517910003662, 'learning_rate': 0.00012662616154395998, 'epoch': 0.37}


 37%|███▋      | 6170/16798 [27:43<53:16,  3.32it/s]

{'loss': 1.4279, 'grad_norm': 1.8359167575836182, 'learning_rate': 0.00012661424827257567, 'epoch': 0.37}


 37%|███▋      | 6171/16798 [27:44<51:32,  3.44it/s]

{'loss': 1.6808, 'grad_norm': 2.5100481510162354, 'learning_rate': 0.00012660233500119133, 'epoch': 0.37}


 37%|███▋      | 6172/16798 [27:44<51:23,  3.45it/s]

{'loss': 1.6637, 'grad_norm': 1.9364380836486816, 'learning_rate': 0.00012659042172980702, 'epoch': 0.37}


 37%|███▋      | 6173/16798 [27:44<53:41,  3.30it/s]

{'loss': 1.1254, 'grad_norm': 1.5254449844360352, 'learning_rate': 0.00012657850845842268, 'epoch': 0.37}


 37%|███▋      | 6174/16798 [27:45<56:24,  3.14it/s]

{'loss': 1.5425, 'grad_norm': 1.7036505937576294, 'learning_rate': 0.00012656659518703837, 'epoch': 0.37}


 37%|███▋      | 6175/16798 [27:45<54:05,  3.27it/s]

{'loss': 1.4653, 'grad_norm': 1.9130719900131226, 'learning_rate': 0.00012655468191565404, 'epoch': 0.37}


 37%|███▋      | 6176/16798 [27:45<53:24,  3.32it/s]

{'loss': 1.7362, 'grad_norm': 2.0106308460235596, 'learning_rate': 0.00012654276864426973, 'epoch': 0.37}


 37%|███▋      | 6177/16798 [27:45<54:22,  3.26it/s]

{'loss': 1.5023, 'grad_norm': 2.231492042541504, 'learning_rate': 0.0001265308553728854, 'epoch': 0.37}


 37%|███▋      | 6178/16798 [27:46<54:20,  3.26it/s]

{'loss': 1.343, 'grad_norm': 1.8756892681121826, 'learning_rate': 0.00012651894210150108, 'epoch': 0.37}


 37%|███▋      | 6179/16798 [27:46<52:41,  3.36it/s]

{'loss': 1.1743, 'grad_norm': 1.5843379497528076, 'learning_rate': 0.00012650702883011675, 'epoch': 0.37}


 37%|███▋      | 6180/16798 [27:46<52:25,  3.38it/s]

{'loss': 1.4792, 'grad_norm': 2.3012917041778564, 'learning_rate': 0.00012649511555873244, 'epoch': 0.37}


 37%|███▋      | 6181/16798 [27:47<52:55,  3.34it/s]

{'loss': 1.7052, 'grad_norm': 2.107787847518921, 'learning_rate': 0.0001264832022873481, 'epoch': 0.37}


 37%|███▋      | 6182/16798 [27:47<55:15,  3.20it/s]

{'loss': 1.5474, 'grad_norm': 2.3666040897369385, 'learning_rate': 0.0001264712890159638, 'epoch': 0.37}


 37%|███▋      | 6183/16798 [27:47<55:55,  3.16it/s]

{'loss': 1.7215, 'grad_norm': 2.5237927436828613, 'learning_rate': 0.00012645937574457946, 'epoch': 0.37}


 37%|███▋      | 6184/16798 [27:48<55:13,  3.20it/s]

{'loss': 0.9905, 'grad_norm': 2.386415481567383, 'learning_rate': 0.00012644746247319515, 'epoch': 0.37}


 37%|███▋      | 6185/16798 [27:48<56:30,  3.13it/s]

{'loss': 1.0151, 'grad_norm': 1.7815899848937988, 'learning_rate': 0.0001264355492018108, 'epoch': 0.37}


 37%|███▋      | 6186/16798 [27:48<53:48,  3.29it/s]

{'loss': 1.1568, 'grad_norm': 1.762139081954956, 'learning_rate': 0.0001264236359304265, 'epoch': 0.37}


 37%|███▋      | 6187/16798 [27:48<53:10,  3.33it/s]

{'loss': 1.1275, 'grad_norm': 1.6567704677581787, 'learning_rate': 0.00012641172265904217, 'epoch': 0.37}


 37%|███▋      | 6188/16798 [27:49<52:30,  3.37it/s]

{'loss': 1.3504, 'grad_norm': 1.7856799364089966, 'learning_rate': 0.00012639980938765786, 'epoch': 0.37}


 37%|███▋      | 6189/16798 [27:49<52:49,  3.35it/s]

{'loss': 1.5009, 'grad_norm': 1.8239028453826904, 'learning_rate': 0.00012638789611627352, 'epoch': 0.37}


 37%|███▋      | 6190/16798 [27:49<50:09,  3.53it/s]

{'loss': 0.8815, 'grad_norm': 1.914138913154602, 'learning_rate': 0.0001263759828448892, 'epoch': 0.37}


 37%|███▋      | 6191/16798 [27:50<50:54,  3.47it/s]

{'loss': 1.4005, 'grad_norm': 2.114236831665039, 'learning_rate': 0.00012636406957350487, 'epoch': 0.37}


 37%|███▋      | 6192/16798 [27:50<52:59,  3.34it/s]

{'loss': 1.0488, 'grad_norm': 1.7629210948944092, 'learning_rate': 0.00012635215630212056, 'epoch': 0.37}


 37%|███▋      | 6193/16798 [27:50<52:50,  3.35it/s]

{'loss': 1.2906, 'grad_norm': 1.9974884986877441, 'learning_rate': 0.00012634024303073623, 'epoch': 0.37}


 37%|███▋      | 6194/16798 [27:51<51:01,  3.46it/s]

{'loss': 1.0834, 'grad_norm': 1.4916162490844727, 'learning_rate': 0.00012632832975935192, 'epoch': 0.37}


 37%|███▋      | 6195/16798 [27:51<55:09,  3.20it/s]

{'loss': 0.7208, 'grad_norm': 1.6391417980194092, 'learning_rate': 0.00012631641648796758, 'epoch': 0.37}


 37%|███▋      | 6196/16798 [27:51<56:21,  3.14it/s]

{'loss': 0.4942, 'grad_norm': 0.9952782392501831, 'learning_rate': 0.00012630450321658327, 'epoch': 0.37}


 37%|███▋      | 6197/16798 [27:51<54:25,  3.25it/s]

{'loss': 0.3083, 'grad_norm': 0.7340943813323975, 'learning_rate': 0.00012629258994519894, 'epoch': 0.37}


 37%|███▋      | 6198/16798 [27:52<54:34,  3.24it/s]

{'loss': 0.1796, 'grad_norm': 0.5787292122840881, 'learning_rate': 0.00012628067667381466, 'epoch': 0.37}


 37%|███▋      | 6199/16798 [27:52<55:05,  3.21it/s]

{'loss': 0.2583, 'grad_norm': 0.7535324096679688, 'learning_rate': 0.00012626876340243032, 'epoch': 0.37}


 37%|███▋      | 6200/16798 [27:52<54:42,  3.23it/s]

{'loss': 0.3165, 'grad_norm': 0.748142659664154, 'learning_rate': 0.000126256850131046, 'epoch': 0.37}


 37%|███▋      | 6201/16798 [27:53<55:42,  3.17it/s]

{'loss': 1.8157, 'grad_norm': 1.9664828777313232, 'learning_rate': 0.00012624493685966167, 'epoch': 0.37}


 37%|███▋      | 6202/16798 [27:53<56:06,  3.15it/s]

{'loss': 1.9905, 'grad_norm': 1.9005872011184692, 'learning_rate': 0.00012623302358827736, 'epoch': 0.37}


 37%|███▋      | 6203/16798 [27:53<53:51,  3.28it/s]

{'loss': 2.1575, 'grad_norm': 1.991573452949524, 'learning_rate': 0.00012622111031689303, 'epoch': 0.37}


 37%|███▋      | 6204/16798 [27:54<53:18,  3.31it/s]

{'loss': 1.799, 'grad_norm': 1.9534298181533813, 'learning_rate': 0.00012620919704550872, 'epoch': 0.37}


 37%|███▋      | 6205/16798 [27:54<54:44,  3.23it/s]

{'loss': 1.9322, 'grad_norm': 1.7991663217544556, 'learning_rate': 0.00012619728377412438, 'epoch': 0.37}


 37%|███▋      | 6206/16798 [27:54<54:52,  3.22it/s]

{'loss': 2.3969, 'grad_norm': 2.1651930809020996, 'learning_rate': 0.00012618537050274007, 'epoch': 0.37}


 37%|███▋      | 6207/16798 [27:55<54:47,  3.22it/s]

{'loss': 1.7173, 'grad_norm': 1.815242886543274, 'learning_rate': 0.00012617345723135574, 'epoch': 0.37}


 37%|███▋      | 6208/16798 [27:55<55:11,  3.20it/s]

{'loss': 1.9044, 'grad_norm': 2.20753812789917, 'learning_rate': 0.00012616154395997143, 'epoch': 0.37}


 37%|███▋      | 6209/16798 [27:55<54:24,  3.24it/s]

{'loss': 1.8102, 'grad_norm': 1.990612506866455, 'learning_rate': 0.0001261496306885871, 'epoch': 0.37}


 37%|███▋      | 6210/16798 [27:56<57:16,  3.08it/s]

{'loss': 1.8048, 'grad_norm': 2.289663791656494, 'learning_rate': 0.00012613771741720278, 'epoch': 0.37}


 37%|███▋      | 6211/16798 [27:56<53:30,  3.30it/s]

{'loss': 1.6588, 'grad_norm': 1.9392014741897583, 'learning_rate': 0.00012612580414581845, 'epoch': 0.37}


 37%|███▋      | 6212/16798 [27:56<54:36,  3.23it/s]

{'loss': 1.6848, 'grad_norm': 2.080195903778076, 'learning_rate': 0.00012611389087443414, 'epoch': 0.37}


 37%|███▋      | 6213/16798 [27:56<52:47,  3.34it/s]

{'loss': 1.0981, 'grad_norm': 1.5085443258285522, 'learning_rate': 0.0001261019776030498, 'epoch': 0.37}


 37%|███▋      | 6214/16798 [27:57<57:29,  3.07it/s]

{'loss': 1.8392, 'grad_norm': 2.1834285259246826, 'learning_rate': 0.0001260900643316655, 'epoch': 0.37}


 37%|███▋      | 6215/16798 [27:57<52:14,  3.38it/s]

{'loss': 1.345, 'grad_norm': 1.7504117488861084, 'learning_rate': 0.00012607815106028115, 'epoch': 0.37}


 37%|███▋      | 6216/16798 [27:57<50:50,  3.47it/s]

{'loss': 1.638, 'grad_norm': 2.2708327770233154, 'learning_rate': 0.00012606623778889684, 'epoch': 0.37}


 37%|███▋      | 6217/16798 [27:58<51:27,  3.43it/s]

{'loss': 1.4132, 'grad_norm': 1.6574565172195435, 'learning_rate': 0.0001260543245175125, 'epoch': 0.37}


 37%|███▋      | 6218/16798 [27:58<56:15,  3.13it/s]

{'loss': 1.255, 'grad_norm': 1.7943308353424072, 'learning_rate': 0.0001260424112461282, 'epoch': 0.37}


 37%|███▋      | 6219/16798 [27:58<54:41,  3.22it/s]

{'loss': 1.5452, 'grad_norm': 2.4278311729431152, 'learning_rate': 0.00012603049797474386, 'epoch': 0.37}


 37%|███▋      | 6220/16798 [27:59<1:00:10,  2.93it/s]

{'loss': 1.3686, 'grad_norm': 1.7927559614181519, 'learning_rate': 0.00012601858470335955, 'epoch': 0.37}


 37%|███▋      | 6221/16798 [27:59<54:04,  3.26it/s]  

{'loss': 1.7463, 'grad_norm': 1.96487557888031, 'learning_rate': 0.00012600667143197522, 'epoch': 0.37}


 37%|███▋      | 6222/16798 [27:59<57:00,  3.09it/s]

{'loss': 1.4446, 'grad_norm': 1.9227674007415771, 'learning_rate': 0.0001259947581605909, 'epoch': 0.37}


 37%|███▋      | 6223/16798 [28:00<56:13,  3.13it/s]

{'loss': 1.5883, 'grad_norm': 2.1892874240875244, 'learning_rate': 0.00012598284488920657, 'epoch': 0.37}


 37%|███▋      | 6224/16798 [28:00<53:48,  3.28it/s]

{'loss': 1.4224, 'grad_norm': 1.7238729000091553, 'learning_rate': 0.00012597093161782226, 'epoch': 0.37}


 37%|███▋      | 6225/16798 [28:00<55:03,  3.20it/s]

{'loss': 1.421, 'grad_norm': 1.8593637943267822, 'learning_rate': 0.00012595901834643793, 'epoch': 0.37}


 37%|███▋      | 6226/16798 [28:00<54:30,  3.23it/s]

{'loss': 1.4297, 'grad_norm': 2.060634136199951, 'learning_rate': 0.00012594710507505362, 'epoch': 0.37}


 37%|███▋      | 6227/16798 [28:01<53:14,  3.31it/s]

{'loss': 1.9042, 'grad_norm': 2.0021584033966064, 'learning_rate': 0.00012593519180366928, 'epoch': 0.37}


 37%|███▋      | 6228/16798 [28:01<54:21,  3.24it/s]

{'loss': 1.7679, 'grad_norm': 2.3983426094055176, 'learning_rate': 0.00012592327853228497, 'epoch': 0.37}


 37%|███▋      | 6229/16798 [28:01<54:40,  3.22it/s]

{'loss': 1.8352, 'grad_norm': 2.494110345840454, 'learning_rate': 0.00012591136526090066, 'epoch': 0.37}


 37%|███▋      | 6230/16798 [28:02<55:41,  3.16it/s]

{'loss': 1.5787, 'grad_norm': 2.279036045074463, 'learning_rate': 0.00012589945198951633, 'epoch': 0.37}


 37%|███▋      | 6231/16798 [28:02<55:02,  3.20it/s]

{'loss': 1.4025, 'grad_norm': 1.8843994140625, 'learning_rate': 0.00012588753871813202, 'epoch': 0.37}


 37%|███▋      | 6232/16798 [28:02<53:50,  3.27it/s]

{'loss': 1.3779, 'grad_norm': 1.950896143913269, 'learning_rate': 0.00012587562544674768, 'epoch': 0.37}


 37%|███▋      | 6233/16798 [28:03<52:14,  3.37it/s]

{'loss': 0.9582, 'grad_norm': 1.491081953048706, 'learning_rate': 0.00012586371217536337, 'epoch': 0.37}


 37%|███▋      | 6234/16798 [28:03<52:11,  3.37it/s]

{'loss': 1.4682, 'grad_norm': 2.3313305377960205, 'learning_rate': 0.00012585179890397903, 'epoch': 0.37}


 37%|███▋      | 6235/16798 [28:03<52:19,  3.36it/s]

{'loss': 1.4511, 'grad_norm': 1.9712574481964111, 'learning_rate': 0.00012583988563259473, 'epoch': 0.37}


 37%|███▋      | 6236/16798 [28:04<54:28,  3.23it/s]

{'loss': 0.6496, 'grad_norm': 1.3438518047332764, 'learning_rate': 0.0001258279723612104, 'epoch': 0.37}


 37%|███▋      | 6237/16798 [28:04<54:10,  3.25it/s]

{'loss': 1.1621, 'grad_norm': 1.9674755334854126, 'learning_rate': 0.00012581605908982608, 'epoch': 0.37}


 37%|███▋      | 6238/16798 [28:04<55:27,  3.17it/s]

{'loss': 1.3437, 'grad_norm': 2.0100924968719482, 'learning_rate': 0.00012580414581844174, 'epoch': 0.37}


 37%|███▋      | 6239/16798 [28:04<54:55,  3.20it/s]

{'loss': 1.2584, 'grad_norm': 1.8443584442138672, 'learning_rate': 0.00012579223254705743, 'epoch': 0.37}


 37%|███▋      | 6240/16798 [28:05<53:07,  3.31it/s]

{'loss': 1.1244, 'grad_norm': 1.6886800527572632, 'learning_rate': 0.0001257803192756731, 'epoch': 0.37}


 37%|███▋      | 6241/16798 [28:05<54:11,  3.25it/s]

{'loss': 1.1513, 'grad_norm': 1.8232463598251343, 'learning_rate': 0.0001257684060042888, 'epoch': 0.37}


 37%|███▋      | 6242/16798 [28:05<50:30,  3.48it/s]

{'loss': 1.2835, 'grad_norm': 1.9116888046264648, 'learning_rate': 0.00012575649273290445, 'epoch': 0.37}


 37%|███▋      | 6243/16798 [28:06<53:50,  3.27it/s]

{'loss': 0.7064, 'grad_norm': 1.798532485961914, 'learning_rate': 0.00012574457946152014, 'epoch': 0.37}


 37%|███▋      | 6244/16798 [28:06<51:33,  3.41it/s]

{'loss': 0.6399, 'grad_norm': 1.2687594890594482, 'learning_rate': 0.0001257326661901358, 'epoch': 0.37}


 37%|███▋      | 6245/16798 [28:06<52:53,  3.33it/s]

{'loss': 0.8846, 'grad_norm': 1.5305218696594238, 'learning_rate': 0.0001257207529187515, 'epoch': 0.37}


 37%|███▋      | 6246/16798 [28:07<50:46,  3.46it/s]

{'loss': 0.8293, 'grad_norm': 1.5989933013916016, 'learning_rate': 0.00012570883964736716, 'epoch': 0.37}


 37%|███▋      | 6247/16798 [28:07<55:24,  3.17it/s]

{'loss': 0.318, 'grad_norm': 1.0392528772354126, 'learning_rate': 0.00012569692637598285, 'epoch': 0.37}


 37%|███▋      | 6248/16798 [28:07<51:11,  3.44it/s]

{'loss': 0.3257, 'grad_norm': 0.8058502674102783, 'learning_rate': 0.00012568501310459852, 'epoch': 0.37}


 37%|███▋      | 6249/16798 [28:07<51:57,  3.38it/s]

{'loss': 0.1896, 'grad_norm': 0.574594259262085, 'learning_rate': 0.0001256730998332142, 'epoch': 0.37}


 37%|███▋      | 6250/16798 [28:08<51:12,  3.43it/s]

{'loss': 0.6232, 'grad_norm': 1.4789140224456787, 'learning_rate': 0.00012566118656182987, 'epoch': 0.37}


 37%|███▋      | 6251/16798 [28:08<51:28,  3.41it/s]

{'loss': 1.9305, 'grad_norm': 2.0400824546813965, 'learning_rate': 0.00012564927329044556, 'epoch': 0.37}


 37%|███▋      | 6252/16798 [28:08<51:24,  3.42it/s]

{'loss': 2.1921, 'grad_norm': 1.8711738586425781, 'learning_rate': 0.00012563736001906122, 'epoch': 0.37}


 37%|███▋      | 6253/16798 [28:09<51:04,  3.44it/s]

{'loss': 1.7958, 'grad_norm': 2.083146333694458, 'learning_rate': 0.00012562544674767692, 'epoch': 0.37}


 37%|███▋      | 6254/16798 [28:09<50:42,  3.47it/s]

{'loss': 2.1565, 'grad_norm': 2.344973087310791, 'learning_rate': 0.00012561353347629258, 'epoch': 0.37}


 37%|███▋      | 6255/16798 [28:09<50:13,  3.50it/s]

{'loss': 2.0819, 'grad_norm': 1.7825781106948853, 'learning_rate': 0.00012560162020490827, 'epoch': 0.37}


 37%|███▋      | 6256/16798 [28:09<51:25,  3.42it/s]

{'loss': 2.4964, 'grad_norm': 2.1730775833129883, 'learning_rate': 0.00012558970693352393, 'epoch': 0.37}


 37%|███▋      | 6257/16798 [28:10<52:37,  3.34it/s]

{'loss': 1.8625, 'grad_norm': 1.8158516883850098, 'learning_rate': 0.00012557779366213962, 'epoch': 0.37}


 37%|███▋      | 6258/16798 [28:10<53:07,  3.31it/s]

{'loss': 1.7066, 'grad_norm': 1.8832398653030396, 'learning_rate': 0.0001255658803907553, 'epoch': 0.37}


 37%|███▋      | 6259/16798 [28:10<54:32,  3.22it/s]

{'loss': 1.8042, 'grad_norm': 1.8810925483703613, 'learning_rate': 0.00012555396711937098, 'epoch': 0.37}


 37%|███▋      | 6260/16798 [28:11<53:51,  3.26it/s]

{'loss': 2.1107, 'grad_norm': 2.317364454269409, 'learning_rate': 0.00012554205384798667, 'epoch': 0.37}


 37%|███▋      | 6261/16798 [28:11<53:42,  3.27it/s]

{'loss': 1.8497, 'grad_norm': 2.0893714427948, 'learning_rate': 0.00012553014057660236, 'epoch': 0.37}


 37%|███▋      | 6262/16798 [28:11<51:14,  3.43it/s]

{'loss': 1.7181, 'grad_norm': 2.0261237621307373, 'learning_rate': 0.00012551822730521802, 'epoch': 0.37}


 37%|███▋      | 6263/16798 [28:12<53:49,  3.26it/s]

{'loss': 1.553, 'grad_norm': 1.7699931859970093, 'learning_rate': 0.00012550631403383371, 'epoch': 0.37}


 37%|███▋      | 6264/16798 [28:12<56:26,  3.11it/s]

{'loss': 1.3542, 'grad_norm': 2.2369132041931152, 'learning_rate': 0.00012549440076244938, 'epoch': 0.37}


 37%|███▋      | 6265/16798 [28:12<54:04,  3.25it/s]

{'loss': 1.5683, 'grad_norm': 1.9722250699996948, 'learning_rate': 0.00012548248749106507, 'epoch': 0.37}


 37%|███▋      | 6266/16798 [28:13<56:35,  3.10it/s]

{'loss': 1.5709, 'grad_norm': 1.8929152488708496, 'learning_rate': 0.00012547057421968073, 'epoch': 0.37}


 37%|███▋      | 6267/16798 [28:13<56:10,  3.12it/s]

{'loss': 1.5219, 'grad_norm': 1.8593310117721558, 'learning_rate': 0.00012545866094829642, 'epoch': 0.37}


 37%|███▋      | 6268/16798 [28:13<55:59,  3.13it/s]

{'loss': 1.7487, 'grad_norm': 1.8188955783843994, 'learning_rate': 0.0001254467476769121, 'epoch': 0.37}


 37%|███▋      | 6269/16798 [28:14<56:43,  3.09it/s]

{'loss': 1.5509, 'grad_norm': 1.868823528289795, 'learning_rate': 0.00012543483440552778, 'epoch': 0.37}


 37%|███▋      | 6270/16798 [28:14<54:25,  3.22it/s]

{'loss': 1.6745, 'grad_norm': 2.070613384246826, 'learning_rate': 0.00012542292113414344, 'epoch': 0.37}


 37%|███▋      | 6271/16798 [28:14<55:21,  3.17it/s]

{'loss': 1.823, 'grad_norm': 2.049274206161499, 'learning_rate': 0.00012541100786275913, 'epoch': 0.37}


 37%|███▋      | 6272/16798 [28:14<53:17,  3.29it/s]

{'loss': 1.4941, 'grad_norm': 1.809624433517456, 'learning_rate': 0.0001253990945913748, 'epoch': 0.37}


 37%|███▋      | 6273/16798 [28:15<52:39,  3.33it/s]

{'loss': 1.5808, 'grad_norm': 1.8301761150360107, 'learning_rate': 0.0001253871813199905, 'epoch': 0.37}


 37%|███▋      | 6274/16798 [28:15<52:26,  3.34it/s]

{'loss': 1.3441, 'grad_norm': 2.6525285243988037, 'learning_rate': 0.00012537526804860615, 'epoch': 0.37}


 37%|███▋      | 6275/16798 [28:15<53:51,  3.26it/s]

{'loss': 1.8746, 'grad_norm': 2.114525318145752, 'learning_rate': 0.00012536335477722184, 'epoch': 0.37}


 37%|███▋      | 6276/16798 [28:16<56:15,  3.12it/s]

{'loss': 1.9046, 'grad_norm': 2.0632853507995605, 'learning_rate': 0.0001253514415058375, 'epoch': 0.37}


 37%|███▋      | 6277/16798 [28:16<55:19,  3.17it/s]

{'loss': 0.9312, 'grad_norm': 1.6513097286224365, 'learning_rate': 0.0001253395282344532, 'epoch': 0.37}


 37%|███▋      | 6278/16798 [28:16<56:27,  3.11it/s]

{'loss': 1.2789, 'grad_norm': 1.6530916690826416, 'learning_rate': 0.00012532761496306886, 'epoch': 0.37}


 37%|███▋      | 6279/16798 [28:17<55:03,  3.18it/s]

{'loss': 1.6086, 'grad_norm': 2.1111040115356445, 'learning_rate': 0.00012531570169168455, 'epoch': 0.37}


 37%|███▋      | 6280/16798 [28:17<58:18,  3.01it/s]

{'loss': 1.467, 'grad_norm': 1.8527179956436157, 'learning_rate': 0.0001253037884203002, 'epoch': 0.37}


 37%|███▋      | 6281/16798 [28:17<52:54,  3.31it/s]

{'loss': 1.5269, 'grad_norm': 2.1571204662323, 'learning_rate': 0.0001252918751489159, 'epoch': 0.37}


 37%|███▋      | 6282/16798 [28:18<56:01,  3.13it/s]

{'loss': 1.8412, 'grad_norm': 2.1967670917510986, 'learning_rate': 0.00012527996187753157, 'epoch': 0.37}


 37%|███▋      | 6283/16798 [28:18<53:52,  3.25it/s]

{'loss': 1.2487, 'grad_norm': 1.7017513513565063, 'learning_rate': 0.00012526804860614726, 'epoch': 0.37}


 37%|███▋      | 6284/16798 [28:18<54:26,  3.22it/s]

{'loss': 1.6478, 'grad_norm': 2.3143930435180664, 'learning_rate': 0.00012525613533476292, 'epoch': 0.37}


 37%|███▋      | 6285/16798 [28:19<53:11,  3.29it/s]

{'loss': 1.453, 'grad_norm': 2.0317866802215576, 'learning_rate': 0.0001252442220633786, 'epoch': 0.37}


 37%|███▋      | 6286/16798 [28:19<55:26,  3.16it/s]

{'loss': 1.3413, 'grad_norm': 1.9409735202789307, 'learning_rate': 0.00012523230879199428, 'epoch': 0.37}


 37%|███▋      | 6287/16798 [28:19<55:42,  3.14it/s]

{'loss': 1.1611, 'grad_norm': 1.667198657989502, 'learning_rate': 0.00012522039552060997, 'epoch': 0.37}


 37%|███▋      | 6288/16798 [28:20<56:05,  3.12it/s]

{'loss': 1.2297, 'grad_norm': 2.4606781005859375, 'learning_rate': 0.00012520848224922563, 'epoch': 0.37}


 37%|███▋      | 6289/16798 [28:20<52:17,  3.35it/s]

{'loss': 1.0908, 'grad_norm': 1.8625563383102417, 'learning_rate': 0.00012519656897784132, 'epoch': 0.37}


 37%|███▋      | 6290/16798 [28:20<55:58,  3.13it/s]

{'loss': 1.5019, 'grad_norm': 1.9319089651107788, 'learning_rate': 0.00012518465570645699, 'epoch': 0.37}


 37%|███▋      | 6291/16798 [28:20<56:51,  3.08it/s]

{'loss': 0.9959, 'grad_norm': 1.8148401975631714, 'learning_rate': 0.00012517274243507268, 'epoch': 0.37}


 37%|███▋      | 6292/16798 [28:21<55:48,  3.14it/s]

{'loss': 1.0927, 'grad_norm': 1.5935804843902588, 'learning_rate': 0.00012516082916368837, 'epoch': 0.37}


 37%|███▋      | 6293/16798 [28:21<53:28,  3.27it/s]

{'loss': 1.0524, 'grad_norm': 1.8193809986114502, 'learning_rate': 0.00012514891589230403, 'epoch': 0.37}


 37%|███▋      | 6294/16798 [28:21<53:36,  3.27it/s]

{'loss': 1.1181, 'grad_norm': 2.567786455154419, 'learning_rate': 0.00012513700262091972, 'epoch': 0.37}


 37%|███▋      | 6295/16798 [28:22<54:38,  3.20it/s]

{'loss': 1.071, 'grad_norm': 1.8250552415847778, 'learning_rate': 0.00012512508934953539, 'epoch': 0.37}


 37%|███▋      | 6296/16798 [28:22<54:26,  3.21it/s]

{'loss': 0.9689, 'grad_norm': 1.7153041362762451, 'learning_rate': 0.00012511317607815108, 'epoch': 0.37}


 37%|███▋      | 6297/16798 [28:22<53:21,  3.28it/s]

{'loss': 0.49, 'grad_norm': 1.0685956478118896, 'learning_rate': 0.00012510126280676674, 'epoch': 0.37}


 37%|███▋      | 6298/16798 [28:23<52:41,  3.32it/s]

{'loss': 0.7192, 'grad_norm': 1.5454802513122559, 'learning_rate': 0.00012508934953538243, 'epoch': 0.37}


 37%|███▋      | 6299/16798 [28:23<47:35,  3.68it/s]

{'loss': 0.1965, 'grad_norm': 0.5384412407875061, 'learning_rate': 0.0001250774362639981, 'epoch': 0.37}


 38%|███▊      | 6300/16798 [28:23<46:59,  3.72it/s]

{'loss': 0.7941, 'grad_norm': 1.4722814559936523, 'learning_rate': 0.00012506552299261378, 'epoch': 0.38}


 38%|███▊      | 6301/16798 [28:23<48:27,  3.61it/s]

{'loss': 1.8861, 'grad_norm': 1.9561206102371216, 'learning_rate': 0.00012505360972122945, 'epoch': 0.38}


 38%|███▊      | 6302/16798 [28:24<52:45,  3.32it/s]

{'loss': 2.0305, 'grad_norm': 1.822353482246399, 'learning_rate': 0.00012504169644984514, 'epoch': 0.38}


 38%|███▊      | 6303/16798 [28:24<53:14,  3.29it/s]

{'loss': 1.623, 'grad_norm': 1.710208535194397, 'learning_rate': 0.0001250297831784608, 'epoch': 0.38}


 38%|███▊      | 6304/16798 [28:24<52:53,  3.31it/s]

{'loss': 2.0871, 'grad_norm': 1.860188364982605, 'learning_rate': 0.0001250178699070765, 'epoch': 0.38}


 38%|███▊      | 6305/16798 [28:25<53:37,  3.26it/s]

{'loss': 2.0694, 'grad_norm': 1.80049467086792, 'learning_rate': 0.00012500595663569216, 'epoch': 0.38}


 38%|███▊      | 6306/16798 [28:25<55:00,  3.18it/s]

{'loss': 1.5048, 'grad_norm': 1.8220254182815552, 'learning_rate': 0.00012499404336430785, 'epoch': 0.38}


 38%|███▊      | 6307/16798 [28:25<55:46,  3.14it/s]

{'loss': 1.5318, 'grad_norm': 1.7947793006896973, 'learning_rate': 0.0001249821300929235, 'epoch': 0.38}


 38%|███▊      | 6308/16798 [28:26<54:59,  3.18it/s]

{'loss': 1.8139, 'grad_norm': 2.193582773208618, 'learning_rate': 0.0001249702168215392, 'epoch': 0.38}


 38%|███▊      | 6309/16798 [28:26<56:46,  3.08it/s]

{'loss': 1.2802, 'grad_norm': 1.617571473121643, 'learning_rate': 0.00012495830355015487, 'epoch': 0.38}


 38%|███▊      | 6310/16798 [28:26<53:53,  3.24it/s]

{'loss': 1.6358, 'grad_norm': 1.7848459482192993, 'learning_rate': 0.00012494639027877056, 'epoch': 0.38}


 38%|███▊      | 6311/16798 [28:27<57:04,  3.06it/s]

{'loss': 1.4125, 'grad_norm': 1.8013818264007568, 'learning_rate': 0.00012493447700738622, 'epoch': 0.38}


 38%|███▊      | 6312/16798 [28:27<53:54,  3.24it/s]

{'loss': 1.61, 'grad_norm': 1.9585480690002441, 'learning_rate': 0.0001249225637360019, 'epoch': 0.38}


 38%|███▊      | 6313/16798 [28:27<53:30,  3.27it/s]

{'loss': 1.8341, 'grad_norm': 2.484483003616333, 'learning_rate': 0.00012491065046461757, 'epoch': 0.38}


 38%|███▊      | 6314/16798 [28:27<53:15,  3.28it/s]

{'loss': 1.286, 'grad_norm': 2.345052480697632, 'learning_rate': 0.00012489873719323327, 'epoch': 0.38}


 38%|███▊      | 6315/16798 [28:28<52:39,  3.32it/s]

{'loss': 1.5332, 'grad_norm': 1.759194254875183, 'learning_rate': 0.00012488682392184893, 'epoch': 0.38}


 38%|███▊      | 6316/16798 [28:28<53:31,  3.26it/s]

{'loss': 2.2901, 'grad_norm': 2.3273990154266357, 'learning_rate': 0.00012487491065046462, 'epoch': 0.38}


 38%|███▊      | 6317/16798 [28:28<53:24,  3.27it/s]

{'loss': 1.7303, 'grad_norm': 2.0671942234039307, 'learning_rate': 0.00012486299737908028, 'epoch': 0.38}


 38%|███▊      | 6318/16798 [28:29<52:15,  3.34it/s]

{'loss': 1.368, 'grad_norm': 2.6103804111480713, 'learning_rate': 0.00012485108410769597, 'epoch': 0.38}


 38%|███▊      | 6319/16798 [28:29<53:16,  3.28it/s]

{'loss': 1.4946, 'grad_norm': 2.2030327320098877, 'learning_rate': 0.00012483917083631164, 'epoch': 0.38}


 38%|███▊      | 6320/16798 [28:29<54:13,  3.22it/s]

{'loss': 1.6569, 'grad_norm': 2.096489191055298, 'learning_rate': 0.00012482725756492733, 'epoch': 0.38}


 38%|███▊      | 6321/16798 [28:30<53:52,  3.24it/s]

{'loss': 1.5535, 'grad_norm': 2.1835503578186035, 'learning_rate': 0.00012481534429354302, 'epoch': 0.38}


 38%|███▊      | 6322/16798 [28:30<52:12,  3.34it/s]

{'loss': 1.6711, 'grad_norm': 1.8736236095428467, 'learning_rate': 0.0001248034310221587, 'epoch': 0.38}


 38%|███▊      | 6323/16798 [28:30<49:22,  3.54it/s]

{'loss': 1.5027, 'grad_norm': 1.9245314598083496, 'learning_rate': 0.00012479151775077437, 'epoch': 0.38}


 38%|███▊      | 6324/16798 [28:30<51:44,  3.37it/s]

{'loss': 1.7201, 'grad_norm': 2.007890224456787, 'learning_rate': 0.00012477960447939006, 'epoch': 0.38}


 38%|███▊      | 6325/16798 [28:31<56:38,  3.08it/s]

{'loss': 1.7638, 'grad_norm': 1.9240220785140991, 'learning_rate': 0.00012476769120800573, 'epoch': 0.38}


 38%|███▊      | 6326/16798 [28:31<52:51,  3.30it/s]

{'loss': 1.8759, 'grad_norm': 2.1146912574768066, 'learning_rate': 0.00012475577793662142, 'epoch': 0.38}


 38%|███▊      | 6327/16798 [28:31<52:30,  3.32it/s]

{'loss': 1.3614, 'grad_norm': 2.027916193008423, 'learning_rate': 0.00012474386466523708, 'epoch': 0.38}


 38%|███▊      | 6328/16798 [28:32<49:13,  3.54it/s]

{'loss': 1.3587, 'grad_norm': 1.7302603721618652, 'learning_rate': 0.00012473195139385277, 'epoch': 0.38}


 38%|███▊      | 6329/16798 [28:32<53:39,  3.25it/s]

{'loss': 1.4405, 'grad_norm': 1.5489569902420044, 'learning_rate': 0.00012472003812246844, 'epoch': 0.38}


 38%|███▊      | 6330/16798 [28:32<53:49,  3.24it/s]

{'loss': 1.5093, 'grad_norm': 1.8573073148727417, 'learning_rate': 0.00012470812485108413, 'epoch': 0.38}


 38%|███▊      | 6331/16798 [28:33<55:47,  3.13it/s]

{'loss': 1.6833, 'grad_norm': 2.0724596977233887, 'learning_rate': 0.0001246962115796998, 'epoch': 0.38}


 38%|███▊      | 6332/16798 [28:33<50:14,  3.47it/s]

{'loss': 1.601, 'grad_norm': 2.200613260269165, 'learning_rate': 0.00012468429830831548, 'epoch': 0.38}


 38%|███▊      | 6333/16798 [28:33<52:19,  3.33it/s]

{'loss': 1.2552, 'grad_norm': 1.847509503364563, 'learning_rate': 0.00012467238503693115, 'epoch': 0.38}


 38%|███▊      | 6334/16798 [28:33<53:09,  3.28it/s]

{'loss': 1.3303, 'grad_norm': 1.6649811267852783, 'learning_rate': 0.00012466047176554684, 'epoch': 0.38}


 38%|███▊      | 6335/16798 [28:34<56:07,  3.11it/s]

{'loss': 1.0497, 'grad_norm': 1.5344914197921753, 'learning_rate': 0.0001246485584941625, 'epoch': 0.38}


 38%|███▊      | 6336/16798 [28:34<55:29,  3.14it/s]

{'loss': 1.1338, 'grad_norm': 1.662980318069458, 'learning_rate': 0.0001246366452227782, 'epoch': 0.38}


 38%|███▊      | 6337/16798 [28:34<55:25,  3.15it/s]

{'loss': 1.1294, 'grad_norm': 1.5675615072250366, 'learning_rate': 0.00012462473195139386, 'epoch': 0.38}


 38%|███▊      | 6338/16798 [28:35<56:08,  3.11it/s]

{'loss': 1.241, 'grad_norm': 1.7479443550109863, 'learning_rate': 0.00012461281868000955, 'epoch': 0.38}


 38%|███▊      | 6339/16798 [28:35<54:22,  3.21it/s]

{'loss': 1.2305, 'grad_norm': 2.058487892150879, 'learning_rate': 0.0001246009054086252, 'epoch': 0.38}


 38%|███▊      | 6340/16798 [28:35<54:33,  3.20it/s]

{'loss': 1.1461, 'grad_norm': 1.5288292169570923, 'learning_rate': 0.0001245889921372409, 'epoch': 0.38}


 38%|███▊      | 6341/16798 [28:36<54:38,  3.19it/s]

{'loss': 1.3026, 'grad_norm': 1.8997633457183838, 'learning_rate': 0.00012457707886585656, 'epoch': 0.38}


 38%|███▊      | 6342/16798 [28:36<55:14,  3.15it/s]

{'loss': 1.3035, 'grad_norm': 2.013112783432007, 'learning_rate': 0.00012456516559447225, 'epoch': 0.38}


 38%|███▊      | 6343/16798 [28:36<54:46,  3.18it/s]

{'loss': 1.1877, 'grad_norm': 1.729315161705017, 'learning_rate': 0.00012455325232308792, 'epoch': 0.38}


 38%|███▊      | 6344/16798 [28:37<57:57,  3.01it/s]

{'loss': 1.4422, 'grad_norm': 1.7591228485107422, 'learning_rate': 0.0001245413390517036, 'epoch': 0.38}


 38%|███▊      | 6345/16798 [28:37<53:57,  3.23it/s]

{'loss': 0.7403, 'grad_norm': 1.3804689645767212, 'learning_rate': 0.00012452942578031927, 'epoch': 0.38}


 38%|███▊      | 6346/16798 [28:37<53:09,  3.28it/s]

{'loss': 1.7274, 'grad_norm': 2.481923818588257, 'learning_rate': 0.00012451751250893496, 'epoch': 0.38}


 38%|███▊      | 6347/16798 [28:38<53:24,  3.26it/s]

{'loss': 0.7195, 'grad_norm': 1.3229100704193115, 'learning_rate': 0.00012450559923755063, 'epoch': 0.38}


 38%|███▊      | 6348/16798 [28:38<56:49,  3.06it/s]

{'loss': 0.8611, 'grad_norm': 1.5389575958251953, 'learning_rate': 0.00012449368596616632, 'epoch': 0.38}


 38%|███▊      | 6349/16798 [28:38<55:28,  3.14it/s]

{'loss': 0.5397, 'grad_norm': 1.1125496625900269, 'learning_rate': 0.00012448177269478198, 'epoch': 0.38}


 38%|███▊      | 6350/16798 [28:39<54:56,  3.17it/s]

{'loss': 0.7284, 'grad_norm': 1.5285764932632446, 'learning_rate': 0.00012446985942339767, 'epoch': 0.38}


 38%|███▊      | 6351/16798 [28:39<53:42,  3.24it/s]

{'loss': 1.5726, 'grad_norm': 1.5272278785705566, 'learning_rate': 0.00012445794615201334, 'epoch': 0.38}


 38%|███▊      | 6352/16798 [28:39<52:35,  3.31it/s]

{'loss': 1.8615, 'grad_norm': 1.780556082725525, 'learning_rate': 0.00012444603288062903, 'epoch': 0.38}


 38%|███▊      | 6353/16798 [28:39<53:03,  3.28it/s]

{'loss': 1.6277, 'grad_norm': 1.9881123304367065, 'learning_rate': 0.00012443411960924472, 'epoch': 0.38}


 38%|███▊      | 6354/16798 [28:40<53:46,  3.24it/s]

{'loss': 1.7254, 'grad_norm': 1.8667569160461426, 'learning_rate': 0.00012442220633786038, 'epoch': 0.38}


 38%|███▊      | 6355/16798 [28:40<54:04,  3.22it/s]

{'loss': 1.6931, 'grad_norm': 1.7979810237884521, 'learning_rate': 0.00012441029306647607, 'epoch': 0.38}


 38%|███▊      | 6356/16798 [28:40<53:39,  3.24it/s]

{'loss': 1.7668, 'grad_norm': 1.916000485420227, 'learning_rate': 0.00012439837979509174, 'epoch': 0.38}


 38%|███▊      | 6357/16798 [28:41<55:01,  3.16it/s]

{'loss': 1.8304, 'grad_norm': 2.0103635787963867, 'learning_rate': 0.00012438646652370743, 'epoch': 0.38}


 38%|███▊      | 6358/16798 [28:41<54:00,  3.22it/s]

{'loss': 1.6327, 'grad_norm': 2.2796456813812256, 'learning_rate': 0.0001243745532523231, 'epoch': 0.38}


 38%|███▊      | 6359/16798 [28:41<54:15,  3.21it/s]

{'loss': 1.4933, 'grad_norm': 1.9267369508743286, 'learning_rate': 0.00012436263998093878, 'epoch': 0.38}


 38%|███▊      | 6360/16798 [28:42<54:03,  3.22it/s]

{'loss': 1.1275, 'grad_norm': 1.5422347784042358, 'learning_rate': 0.00012435072670955444, 'epoch': 0.38}


 38%|███▊      | 6361/16798 [28:42<54:43,  3.18it/s]

{'loss': 1.6223, 'grad_norm': 1.897670865058899, 'learning_rate': 0.00012433881343817014, 'epoch': 0.38}


 38%|███▊      | 6362/16798 [28:42<54:57,  3.16it/s]

{'loss': 1.4546, 'grad_norm': 1.7107365131378174, 'learning_rate': 0.0001243269001667858, 'epoch': 0.38}


 38%|███▊      | 6363/16798 [28:43<55:13,  3.15it/s]

{'loss': 1.352, 'grad_norm': 1.9452203512191772, 'learning_rate': 0.0001243149868954015, 'epoch': 0.38}


 38%|███▊      | 6364/16798 [28:43<56:04,  3.10it/s]

{'loss': 1.5408, 'grad_norm': 1.7537060976028442, 'learning_rate': 0.00012430307362401715, 'epoch': 0.38}


 38%|███▊      | 6365/16798 [28:43<56:12,  3.09it/s]

{'loss': 1.4867, 'grad_norm': 2.1986701488494873, 'learning_rate': 0.00012429116035263284, 'epoch': 0.38}


 38%|███▊      | 6366/16798 [28:44<54:45,  3.17it/s]

{'loss': 1.508, 'grad_norm': 2.26528000831604, 'learning_rate': 0.0001242792470812485, 'epoch': 0.38}


 38%|███▊      | 6367/16798 [28:44<55:17,  3.14it/s]

{'loss': 1.5216, 'grad_norm': 2.1638152599334717, 'learning_rate': 0.0001242673338098642, 'epoch': 0.38}


 38%|███▊      | 6368/16798 [28:44<53:26,  3.25it/s]

{'loss': 1.4034, 'grad_norm': 1.7529603242874146, 'learning_rate': 0.00012425542053847986, 'epoch': 0.38}


 38%|███▊      | 6369/16798 [28:44<51:26,  3.38it/s]

{'loss': 1.4398, 'grad_norm': 1.9078973531723022, 'learning_rate': 0.00012424350726709555, 'epoch': 0.38}


 38%|███▊      | 6370/16798 [28:45<53:42,  3.24it/s]

{'loss': 1.6252, 'grad_norm': 2.048151731491089, 'learning_rate': 0.00012423159399571122, 'epoch': 0.38}


 38%|███▊      | 6371/16798 [28:45<51:48,  3.35it/s]

{'loss': 1.9233, 'grad_norm': 2.8695151805877686, 'learning_rate': 0.0001242196807243269, 'epoch': 0.38}


 38%|███▊      | 6372/16798 [28:45<52:11,  3.33it/s]

{'loss': 1.3612, 'grad_norm': 2.002066135406494, 'learning_rate': 0.00012420776745294257, 'epoch': 0.38}


 38%|███▊      | 6373/16798 [28:46<55:20,  3.14it/s]

{'loss': 1.6647, 'grad_norm': 2.9560649394989014, 'learning_rate': 0.00012419585418155826, 'epoch': 0.38}


 38%|███▊      | 6374/16798 [28:46<53:36,  3.24it/s]

{'loss': 1.3669, 'grad_norm': 1.8264691829681396, 'learning_rate': 0.00012418394091017393, 'epoch': 0.38}


 38%|███▊      | 6375/16798 [28:46<52:19,  3.32it/s]

{'loss': 1.6587, 'grad_norm': 2.003734827041626, 'learning_rate': 0.00012417202763878962, 'epoch': 0.38}


 38%|███▊      | 6376/16798 [28:47<50:22,  3.45it/s]

{'loss': 1.6567, 'grad_norm': 2.1671652793884277, 'learning_rate': 0.00012416011436740528, 'epoch': 0.38}


 38%|███▊      | 6377/16798 [28:47<52:31,  3.31it/s]

{'loss': 1.5231, 'grad_norm': 2.001882791519165, 'learning_rate': 0.00012414820109602097, 'epoch': 0.38}


 38%|███▊      | 6378/16798 [28:47<51:27,  3.37it/s]

{'loss': 1.4082, 'grad_norm': 1.7641637325286865, 'learning_rate': 0.00012413628782463663, 'epoch': 0.38}


 38%|███▊      | 6379/16798 [28:47<51:21,  3.38it/s]

{'loss': 1.369, 'grad_norm': 2.1882238388061523, 'learning_rate': 0.00012412437455325233, 'epoch': 0.38}


 38%|███▊      | 6380/16798 [28:48<52:55,  3.28it/s]

{'loss': 1.5198, 'grad_norm': 1.9338927268981934, 'learning_rate': 0.000124112461281868, 'epoch': 0.38}


 38%|███▊      | 6381/16798 [28:48<53:11,  3.26it/s]

{'loss': 1.1665, 'grad_norm': 1.4398071765899658, 'learning_rate': 0.00012410054801048368, 'epoch': 0.38}


 38%|███▊      | 6382/16798 [28:48<53:12,  3.26it/s]

{'loss': 1.3211, 'grad_norm': 2.0593974590301514, 'learning_rate': 0.00012408863473909934, 'epoch': 0.38}


 38%|███▊      | 6383/16798 [28:49<53:49,  3.23it/s]

{'loss': 1.6654, 'grad_norm': 2.166318416595459, 'learning_rate': 0.00012407672146771506, 'epoch': 0.38}


 38%|███▊      | 6384/16798 [28:49<53:33,  3.24it/s]

{'loss': 0.9905, 'grad_norm': 1.8458133935928345, 'learning_rate': 0.00012406480819633072, 'epoch': 0.38}


 38%|███▊      | 6385/16798 [28:49<53:31,  3.24it/s]

{'loss': 1.278, 'grad_norm': 1.6862452030181885, 'learning_rate': 0.00012405289492494642, 'epoch': 0.38}


 38%|███▊      | 6386/16798 [28:50<53:48,  3.22it/s]

{'loss': 1.2126, 'grad_norm': 1.737895131111145, 'learning_rate': 0.00012404098165356208, 'epoch': 0.38}


 38%|███▊      | 6387/16798 [28:50<53:54,  3.22it/s]

{'loss': 1.5369, 'grad_norm': 1.9737823009490967, 'learning_rate': 0.00012402906838217777, 'epoch': 0.38}


 38%|███▊      | 6388/16798 [28:50<50:54,  3.41it/s]

{'loss': 1.3209, 'grad_norm': 1.7958858013153076, 'learning_rate': 0.00012401715511079343, 'epoch': 0.38}


 38%|███▊      | 6389/16798 [28:51<54:32,  3.18it/s]

{'loss': 1.0356, 'grad_norm': 1.6442407369613647, 'learning_rate': 0.00012400524183940912, 'epoch': 0.38}


 38%|███▊      | 6390/16798 [28:51<55:26,  3.13it/s]

{'loss': 0.8733, 'grad_norm': 1.7299808263778687, 'learning_rate': 0.0001239933285680248, 'epoch': 0.38}


 38%|███▊      | 6391/16798 [28:51<55:21,  3.13it/s]

{'loss': 1.3426, 'grad_norm': 2.2911887168884277, 'learning_rate': 0.00012398141529664048, 'epoch': 0.38}


 38%|███▊      | 6392/16798 [28:52<55:05,  3.15it/s]

{'loss': 1.2467, 'grad_norm': 1.9204038381576538, 'learning_rate': 0.00012396950202525614, 'epoch': 0.38}


 38%|███▊      | 6393/16798 [28:52<54:18,  3.19it/s]

{'loss': 1.1303, 'grad_norm': 2.142129421234131, 'learning_rate': 0.00012395758875387183, 'epoch': 0.38}


 38%|███▊      | 6394/16798 [28:52<51:50,  3.34it/s]

{'loss': 0.8716, 'grad_norm': 1.4047901630401611, 'learning_rate': 0.0001239456754824875, 'epoch': 0.38}


 38%|███▊      | 6395/16798 [28:52<54:28,  3.18it/s]

{'loss': 1.3987, 'grad_norm': 1.8811370134353638, 'learning_rate': 0.0001239337622111032, 'epoch': 0.38}


 38%|███▊      | 6396/16798 [28:53<54:08,  3.20it/s]

{'loss': 0.8439, 'grad_norm': 1.4943358898162842, 'learning_rate': 0.00012392184893971885, 'epoch': 0.38}


 38%|███▊      | 6397/16798 [28:53<54:20,  3.19it/s]

{'loss': 1.1166, 'grad_norm': 1.8280216455459595, 'learning_rate': 0.00012390993566833454, 'epoch': 0.38}


 38%|███▊      | 6398/16798 [28:53<54:36,  3.17it/s]

{'loss': 0.6651, 'grad_norm': 1.2503105401992798, 'learning_rate': 0.0001238980223969502, 'epoch': 0.38}


 38%|███▊      | 6399/16798 [28:54<54:11,  3.20it/s]

{'loss': 0.5632, 'grad_norm': 1.172228217124939, 'learning_rate': 0.0001238861091255659, 'epoch': 0.38}


 38%|███▊      | 6400/16798 [28:54<53:12,  3.26it/s]

{'loss': 0.2145, 'grad_norm': 0.7774632573127747, 'learning_rate': 0.00012387419585418156, 'epoch': 0.38}


 38%|███▊      | 6401/16798 [28:54<52:22,  3.31it/s]

{'loss': 1.5499, 'grad_norm': 1.7339617013931274, 'learning_rate': 0.00012386228258279725, 'epoch': 0.38}


 38%|███▊      | 6402/16798 [28:55<53:42,  3.23it/s]

{'loss': 1.8694, 'grad_norm': 1.826820969581604, 'learning_rate': 0.00012385036931141291, 'epoch': 0.38}


 38%|███▊      | 6403/16798 [28:55<52:32,  3.30it/s]

{'loss': 1.8266, 'grad_norm': 1.9867616891860962, 'learning_rate': 0.0001238384560400286, 'epoch': 0.38}


 38%|███▊      | 6404/16798 [28:55<56:06,  3.09it/s]

{'loss': 2.2833, 'grad_norm': 2.6034111976623535, 'learning_rate': 0.00012382654276864427, 'epoch': 0.38}


 38%|███▊      | 6405/16798 [28:56<53:03,  3.26it/s]

{'loss': 2.2635, 'grad_norm': 2.1646432876586914, 'learning_rate': 0.00012381462949725996, 'epoch': 0.38}


 38%|███▊      | 6406/16798 [28:56<55:51,  3.10it/s]

{'loss': 2.1713, 'grad_norm': 2.103909730911255, 'learning_rate': 0.00012380271622587562, 'epoch': 0.38}


 38%|███▊      | 6407/16798 [28:56<56:28,  3.07it/s]

{'loss': 2.0272, 'grad_norm': 2.044732093811035, 'learning_rate': 0.00012379080295449131, 'epoch': 0.38}


 38%|███▊      | 6408/16798 [28:57<56:07,  3.09it/s]

{'loss': 2.3146, 'grad_norm': 2.483926296234131, 'learning_rate': 0.00012377888968310698, 'epoch': 0.38}


 38%|███▊      | 6409/16798 [28:57<56:21,  3.07it/s]

{'loss': 1.8424, 'grad_norm': 2.1191086769104004, 'learning_rate': 0.00012376697641172267, 'epoch': 0.38}


 38%|███▊      | 6410/16798 [28:57<56:46,  3.05it/s]

{'loss': 1.4209, 'grad_norm': 1.6418007612228394, 'learning_rate': 0.00012375506314033833, 'epoch': 0.38}


 38%|███▊      | 6411/16798 [28:58<56:49,  3.05it/s]

{'loss': 1.948, 'grad_norm': 2.133702278137207, 'learning_rate': 0.00012374314986895402, 'epoch': 0.38}


 38%|███▊      | 6412/16798 [28:58<56:06,  3.09it/s]

{'loss': 1.3531, 'grad_norm': 1.674793004989624, 'learning_rate': 0.0001237312365975697, 'epoch': 0.38}


 38%|███▊      | 6413/16798 [28:58<56:31,  3.06it/s]

{'loss': 1.732, 'grad_norm': 1.9593498706817627, 'learning_rate': 0.00012371932332618538, 'epoch': 0.38}


 38%|███▊      | 6414/16798 [28:59<55:26,  3.12it/s]

{'loss': 1.2933, 'grad_norm': 1.6763041019439697, 'learning_rate': 0.00012370741005480107, 'epoch': 0.38}


 38%|███▊      | 6415/16798 [28:59<54:13,  3.19it/s]

{'loss': 1.0553, 'grad_norm': 2.0361905097961426, 'learning_rate': 0.00012369549678341673, 'epoch': 0.38}


 38%|███▊      | 6416/16798 [28:59<54:05,  3.20it/s]

{'loss': 1.3385, 'grad_norm': 1.8089960813522339, 'learning_rate': 0.00012368358351203242, 'epoch': 0.38}


 38%|███▊      | 6417/16798 [28:59<54:21,  3.18it/s]

{'loss': 1.5714, 'grad_norm': 2.9447567462921143, 'learning_rate': 0.00012367167024064809, 'epoch': 0.38}


 38%|███▊      | 6418/16798 [29:00<51:56,  3.33it/s]

{'loss': 1.3487, 'grad_norm': 1.6987210512161255, 'learning_rate': 0.00012365975696926378, 'epoch': 0.38}


 38%|███▊      | 6419/16798 [29:00<52:28,  3.30it/s]

{'loss': 2.0249, 'grad_norm': 2.0244576930999756, 'learning_rate': 0.00012364784369787944, 'epoch': 0.38}


 38%|███▊      | 6420/16798 [29:00<50:49,  3.40it/s]

{'loss': 1.6666, 'grad_norm': 1.9795302152633667, 'learning_rate': 0.00012363593042649513, 'epoch': 0.38}


 38%|███▊      | 6421/16798 [29:01<52:23,  3.30it/s]

{'loss': 1.5532, 'grad_norm': 1.8752609491348267, 'learning_rate': 0.0001236240171551108, 'epoch': 0.38}


 38%|███▊      | 6422/16798 [29:01<53:22,  3.24it/s]

{'loss': 1.5164, 'grad_norm': 2.2586474418640137, 'learning_rate': 0.00012361210388372649, 'epoch': 0.38}


 38%|███▊      | 6423/16798 [29:01<53:54,  3.21it/s]

{'loss': 1.4232, 'grad_norm': 1.6568810939788818, 'learning_rate': 0.00012360019061234215, 'epoch': 0.38}


 38%|███▊      | 6424/16798 [29:02<52:18,  3.30it/s]

{'loss': 1.5216, 'grad_norm': 2.0901970863342285, 'learning_rate': 0.00012358827734095784, 'epoch': 0.38}


 38%|███▊      | 6425/16798 [29:02<52:33,  3.29it/s]

{'loss': 1.368, 'grad_norm': 1.798991084098816, 'learning_rate': 0.0001235763640695735, 'epoch': 0.38}


 38%|███▊      | 6426/16798 [29:02<52:50,  3.27it/s]

{'loss': 2.0023, 'grad_norm': 2.3551924228668213, 'learning_rate': 0.0001235644507981892, 'epoch': 0.38}


 38%|███▊      | 6427/16798 [29:02<52:34,  3.29it/s]

{'loss': 1.1297, 'grad_norm': 1.8427077531814575, 'learning_rate': 0.00012355253752680486, 'epoch': 0.38}


 38%|███▊      | 6428/16798 [29:03<51:00,  3.39it/s]

{'loss': 1.5327, 'grad_norm': 2.1142117977142334, 'learning_rate': 0.00012354062425542055, 'epoch': 0.38}


 38%|███▊      | 6429/16798 [29:03<57:04,  3.03it/s]

{'loss': 1.6309, 'grad_norm': 1.9933394193649292, 'learning_rate': 0.0001235287109840362, 'epoch': 0.38}


 38%|███▊      | 6430/16798 [29:03<55:26,  3.12it/s]

{'loss': 1.7982, 'grad_norm': 2.029127359390259, 'learning_rate': 0.0001235167977126519, 'epoch': 0.38}


 38%|███▊      | 6431/16798 [29:04<55:55,  3.09it/s]

{'loss': 1.1847, 'grad_norm': 1.7091400623321533, 'learning_rate': 0.00012350488444126757, 'epoch': 0.38}


 38%|███▊      | 6432/16798 [29:04<56:19,  3.07it/s]

{'loss': 1.3562, 'grad_norm': 1.9081205129623413, 'learning_rate': 0.00012349297116988326, 'epoch': 0.38}


 38%|███▊      | 6433/16798 [29:04<56:39,  3.05it/s]

{'loss': 1.3161, 'grad_norm': 1.9341357946395874, 'learning_rate': 0.00012348105789849892, 'epoch': 0.38}


 38%|███▊      | 6434/16798 [29:05<58:09,  2.97it/s]

{'loss': 1.0731, 'grad_norm': 1.5456514358520508, 'learning_rate': 0.0001234691446271146, 'epoch': 0.38}


 38%|███▊      | 6435/16798 [29:05<54:50,  3.15it/s]

{'loss': 1.5228, 'grad_norm': 1.9282442331314087, 'learning_rate': 0.00012345723135573028, 'epoch': 0.38}


 38%|███▊      | 6436/16798 [29:05<57:33,  3.00it/s]

{'loss': 0.9628, 'grad_norm': 1.6807506084442139, 'learning_rate': 0.00012344531808434597, 'epoch': 0.38}


 38%|███▊      | 6437/16798 [29:06<56:42,  3.05it/s]

{'loss': 1.0662, 'grad_norm': 1.8517155647277832, 'learning_rate': 0.00012343340481296163, 'epoch': 0.38}


 38%|███▊      | 6438/16798 [29:06<56:28,  3.06it/s]

{'loss': 1.4568, 'grad_norm': 1.8997738361358643, 'learning_rate': 0.00012342149154157732, 'epoch': 0.38}


 38%|███▊      | 6439/16798 [29:06<56:27,  3.06it/s]

{'loss': 1.1983, 'grad_norm': 1.707546591758728, 'learning_rate': 0.00012340957827019298, 'epoch': 0.38}


 38%|███▊      | 6440/16798 [29:07<56:41,  3.05it/s]

{'loss': 1.2002, 'grad_norm': 2.1200571060180664, 'learning_rate': 0.00012339766499880868, 'epoch': 0.38}


 38%|███▊      | 6441/16798 [29:07<56:57,  3.03it/s]

{'loss': 1.3909, 'grad_norm': 2.113009214401245, 'learning_rate': 0.00012338575172742434, 'epoch': 0.38}


 38%|███▊      | 6442/16798 [29:07<57:57,  2.98it/s]

{'loss': 1.1298, 'grad_norm': 1.6794379949569702, 'learning_rate': 0.00012337383845604003, 'epoch': 0.38}


 38%|███▊      | 6443/16798 [29:08<56:27,  3.06it/s]

{'loss': 1.3737, 'grad_norm': 2.1203887462615967, 'learning_rate': 0.0001233619251846557, 'epoch': 0.38}


 38%|███▊      | 6444/16798 [29:08<54:54,  3.14it/s]

{'loss': 0.8991, 'grad_norm': 1.5210466384887695, 'learning_rate': 0.00012335001191327138, 'epoch': 0.38}


 38%|███▊      | 6445/16798 [29:08<52:17,  3.30it/s]

{'loss': 0.8392, 'grad_norm': 1.5967357158660889, 'learning_rate': 0.00012333809864188708, 'epoch': 0.38}


 38%|███▊      | 6446/16798 [29:09<51:26,  3.35it/s]

{'loss': 1.0148, 'grad_norm': 1.9591832160949707, 'learning_rate': 0.00012332618537050277, 'epoch': 0.38}


 38%|███▊      | 6447/16798 [29:09<50:57,  3.39it/s]

{'loss': 0.4154, 'grad_norm': 0.868725061416626, 'learning_rate': 0.00012331427209911843, 'epoch': 0.38}


 38%|███▊      | 6448/16798 [29:09<49:02,  3.52it/s]

{'loss': 1.198, 'grad_norm': 2.3104422092437744, 'learning_rate': 0.00012330235882773412, 'epoch': 0.38}


 38%|███▊      | 6449/16798 [29:09<50:54,  3.39it/s]

{'loss': 0.6627, 'grad_norm': 1.4914815425872803, 'learning_rate': 0.00012329044555634978, 'epoch': 0.38}


 38%|███▊      | 6450/16798 [29:10<48:21,  3.57it/s]

{'loss': 0.5976, 'grad_norm': 1.2396665811538696, 'learning_rate': 0.00012327853228496547, 'epoch': 0.38}


 38%|███▊      | 6451/16798 [29:10<49:24,  3.49it/s]

{'loss': 2.1997, 'grad_norm': 1.8804516792297363, 'learning_rate': 0.00012326661901358114, 'epoch': 0.38}


 38%|███▊      | 6452/16798 [29:10<49:59,  3.45it/s]

{'loss': 1.812, 'grad_norm': 1.881678819656372, 'learning_rate': 0.00012325470574219683, 'epoch': 0.38}


 38%|███▊      | 6453/16798 [29:11<56:08,  3.07it/s]

{'loss': 1.9473, 'grad_norm': 1.9614994525909424, 'learning_rate': 0.0001232427924708125, 'epoch': 0.38}


 38%|███▊      | 6454/16798 [29:11<56:29,  3.05it/s]

{'loss': 1.8101, 'grad_norm': 1.8091121912002563, 'learning_rate': 0.00012323087919942818, 'epoch': 0.38}


 38%|███▊      | 6455/16798 [29:11<55:49,  3.09it/s]

{'loss': 1.9333, 'grad_norm': 2.1408591270446777, 'learning_rate': 0.00012321896592804385, 'epoch': 0.38}


 38%|███▊      | 6456/16798 [29:12<56:34,  3.05it/s]

{'loss': 2.0253, 'grad_norm': 2.2266488075256348, 'learning_rate': 0.00012320705265665954, 'epoch': 0.38}


 38%|███▊      | 6457/16798 [29:12<57:08,  3.02it/s]

{'loss': 2.5702, 'grad_norm': 2.798737049102783, 'learning_rate': 0.0001231951393852752, 'epoch': 0.38}


 38%|███▊      | 6458/16798 [29:12<56:02,  3.07it/s]

{'loss': 1.886, 'grad_norm': 1.9603570699691772, 'learning_rate': 0.0001231832261138909, 'epoch': 0.38}


 38%|███▊      | 6459/16798 [29:13<55:10,  3.12it/s]

{'loss': 1.8755, 'grad_norm': 2.288892984390259, 'learning_rate': 0.00012317131284250656, 'epoch': 0.38}


 38%|███▊      | 6460/16798 [29:13<55:10,  3.12it/s]

{'loss': 1.3686, 'grad_norm': 1.5555520057678223, 'learning_rate': 0.00012315939957112225, 'epoch': 0.38}


 38%|███▊      | 6461/16798 [29:13<59:22,  2.90it/s]

{'loss': 1.7505, 'grad_norm': 2.0933525562286377, 'learning_rate': 0.0001231474862997379, 'epoch': 0.38}


 38%|███▊      | 6462/16798 [29:14<56:51,  3.03it/s]

{'loss': 1.6829, 'grad_norm': 1.9804292917251587, 'learning_rate': 0.0001231355730283536, 'epoch': 0.38}


 38%|███▊      | 6463/16798 [29:14<56:01,  3.07it/s]

{'loss': 1.4624, 'grad_norm': 1.7619481086730957, 'learning_rate': 0.00012312365975696926, 'epoch': 0.38}


 38%|███▊      | 6464/16798 [29:14<56:41,  3.04it/s]

{'loss': 1.7362, 'grad_norm': 1.9439387321472168, 'learning_rate': 0.00012311174648558496, 'epoch': 0.38}


 38%|███▊      | 6465/16798 [29:15<53:17,  3.23it/s]

{'loss': 1.6759, 'grad_norm': 2.1742730140686035, 'learning_rate': 0.00012309983321420062, 'epoch': 0.38}


 38%|███▊      | 6466/16798 [29:15<53:50,  3.20it/s]

{'loss': 1.7171, 'grad_norm': 2.128257989883423, 'learning_rate': 0.0001230879199428163, 'epoch': 0.38}


 38%|███▊      | 6467/16798 [29:15<53:20,  3.23it/s]

{'loss': 1.6882, 'grad_norm': 2.165682077407837, 'learning_rate': 0.00012307600667143197, 'epoch': 0.38}


 39%|███▊      | 6468/16798 [29:16<53:31,  3.22it/s]

{'loss': 1.5327, 'grad_norm': 2.1607367992401123, 'learning_rate': 0.00012306409340004766, 'epoch': 0.39}


 39%|███▊      | 6469/16798 [29:16<52:45,  3.26it/s]

{'loss': 1.4706, 'grad_norm': 1.8680622577667236, 'learning_rate': 0.00012305218012866333, 'epoch': 0.39}


 39%|███▊      | 6470/16798 [29:16<56:12,  3.06it/s]

{'loss': 1.8777, 'grad_norm': 1.9919283390045166, 'learning_rate': 0.00012304026685727902, 'epoch': 0.39}


 39%|███▊      | 6471/16798 [29:16<52:01,  3.31it/s]

{'loss': 1.8574, 'grad_norm': 2.294722318649292, 'learning_rate': 0.00012302835358589468, 'epoch': 0.39}


 39%|███▊      | 6472/16798 [29:17<52:50,  3.26it/s]

{'loss': 1.8011, 'grad_norm': 2.318601608276367, 'learning_rate': 0.00012301644031451037, 'epoch': 0.39}


 39%|███▊      | 6473/16798 [29:17<54:23,  3.16it/s]

{'loss': 1.1944, 'grad_norm': 2.5191805362701416, 'learning_rate': 0.00012300452704312604, 'epoch': 0.39}


 39%|███▊      | 6474/16798 [29:17<55:38,  3.09it/s]

{'loss': 1.6018, 'grad_norm': 1.8699792623519897, 'learning_rate': 0.00012299261377174173, 'epoch': 0.39}


 39%|███▊      | 6475/16798 [29:18<55:07,  3.12it/s]

{'loss': 1.795, 'grad_norm': 1.878005027770996, 'learning_rate': 0.0001229807005003574, 'epoch': 0.39}


 39%|███▊      | 6476/16798 [29:18<55:52,  3.08it/s]

{'loss': 1.2073, 'grad_norm': 2.09395432472229, 'learning_rate': 0.00012296878722897308, 'epoch': 0.39}


 39%|███▊      | 6477/16798 [29:18<55:43,  3.09it/s]

{'loss': 1.5824, 'grad_norm': 1.889909267425537, 'learning_rate': 0.00012295687395758877, 'epoch': 0.39}


 39%|███▊      | 6478/16798 [29:19<55:49,  3.08it/s]

{'loss': 1.1676, 'grad_norm': 1.6186420917510986, 'learning_rate': 0.00012294496068620444, 'epoch': 0.39}


 39%|███▊      | 6479/16798 [29:19<53:40,  3.20it/s]

{'loss': 1.2908, 'grad_norm': 1.708208441734314, 'learning_rate': 0.00012293304741482013, 'epoch': 0.39}


 39%|███▊      | 6480/16798 [29:19<53:42,  3.20it/s]

{'loss': 1.4485, 'grad_norm': 1.879006266593933, 'learning_rate': 0.0001229211341434358, 'epoch': 0.39}


 39%|███▊      | 6481/16798 [29:20<54:15,  3.17it/s]

{'loss': 1.5844, 'grad_norm': 2.1589725017547607, 'learning_rate': 0.00012290922087205148, 'epoch': 0.39}


 39%|███▊      | 6482/16798 [29:20<55:35,  3.09it/s]

{'loss': 1.2316, 'grad_norm': 1.6500955820083618, 'learning_rate': 0.00012289730760066715, 'epoch': 0.39}


 39%|███▊      | 6483/16798 [29:20<56:05,  3.06it/s]

{'loss': 1.4978, 'grad_norm': 2.211580753326416, 'learning_rate': 0.00012288539432928284, 'epoch': 0.39}


 39%|███▊      | 6484/16798 [29:21<57:25,  2.99it/s]

{'loss': 2.0592, 'grad_norm': 2.3069887161254883, 'learning_rate': 0.0001228734810578985, 'epoch': 0.39}


 39%|███▊      | 6485/16798 [29:21<57:44,  2.98it/s]

{'loss': 0.9532, 'grad_norm': 1.6749391555786133, 'learning_rate': 0.0001228615677865142, 'epoch': 0.39}


 39%|███▊      | 6486/16798 [29:21<56:28,  3.04it/s]

{'loss': 1.3551, 'grad_norm': 1.9813419580459595, 'learning_rate': 0.00012284965451512985, 'epoch': 0.39}


 39%|███▊      | 6487/16798 [29:22<57:23,  2.99it/s]

{'loss': 1.6124, 'grad_norm': 2.206273078918457, 'learning_rate': 0.00012283774124374555, 'epoch': 0.39}


 39%|███▊      | 6488/16798 [29:22<55:40,  3.09it/s]

{'loss': 1.0129, 'grad_norm': 1.7959657907485962, 'learning_rate': 0.0001228258279723612, 'epoch': 0.39}


 39%|███▊      | 6489/16798 [29:22<55:04,  3.12it/s]

{'loss': 1.3933, 'grad_norm': 1.9879755973815918, 'learning_rate': 0.0001228139147009769, 'epoch': 0.39}


 39%|███▊      | 6490/16798 [29:23<53:38,  3.20it/s]

{'loss': 1.4113, 'grad_norm': 1.6546131372451782, 'learning_rate': 0.00012280200142959256, 'epoch': 0.39}


 39%|███▊      | 6491/16798 [29:23<52:40,  3.26it/s]

{'loss': 1.251, 'grad_norm': 1.842582106590271, 'learning_rate': 0.00012279008815820825, 'epoch': 0.39}


 39%|███▊      | 6492/16798 [29:23<50:14,  3.42it/s]

{'loss': 1.1689, 'grad_norm': 1.791549801826477, 'learning_rate': 0.00012277817488682392, 'epoch': 0.39}


 39%|███▊      | 6493/16798 [29:23<52:08,  3.29it/s]

{'loss': 1.342, 'grad_norm': 1.606980562210083, 'learning_rate': 0.0001227662616154396, 'epoch': 0.39}


 39%|███▊      | 6494/16798 [29:24<54:46,  3.14it/s]

{'loss': 0.783, 'grad_norm': 1.9030095338821411, 'learning_rate': 0.00012275434834405527, 'epoch': 0.39}


 39%|███▊      | 6495/16798 [29:24<52:54,  3.25it/s]

{'loss': 1.1959, 'grad_norm': 2.4103574752807617, 'learning_rate': 0.00012274243507267096, 'epoch': 0.39}


 39%|███▊      | 6496/16798 [29:24<52:41,  3.26it/s]

{'loss': 1.0478, 'grad_norm': 2.2973473072052, 'learning_rate': 0.00012273052180128663, 'epoch': 0.39}


 39%|███▊      | 6497/16798 [29:25<54:33,  3.15it/s]

{'loss': 0.9183, 'grad_norm': 1.6918683052062988, 'learning_rate': 0.00012271860852990232, 'epoch': 0.39}


 39%|███▊      | 6498/16798 [29:25<56:11,  3.05it/s]

{'loss': 0.7971, 'grad_norm': 1.6510976552963257, 'learning_rate': 0.00012270669525851798, 'epoch': 0.39}


 39%|███▊      | 6499/16798 [29:25<55:51,  3.07it/s]

{'loss': 0.5731, 'grad_norm': 1.2556095123291016, 'learning_rate': 0.00012269478198713367, 'epoch': 0.39}




{'loss': 0.4934, 'grad_norm': 1.165819764137268, 'learning_rate': 0.00012268286871574934, 'epoch': 0.39}


 39%|███▊      | 6501/16798 [29:28<2:57:49,  1.04s/it]

{'loss': 1.8259, 'grad_norm': 1.8759288787841797, 'learning_rate': 0.00012267095544436503, 'epoch': 0.39}


 39%|███▊      | 6502/16798 [29:29<2:19:59,  1.23it/s]

{'loss': 1.7666, 'grad_norm': 1.7957415580749512, 'learning_rate': 0.0001226590421729807, 'epoch': 0.39}


 39%|███▊      | 6503/16798 [29:29<2:00:12,  1.43it/s]

{'loss': 2.1935, 'grad_norm': 1.9445220232009888, 'learning_rate': 0.00012264712890159638, 'epoch': 0.39}


 39%|███▊      | 6504/16798 [29:29<1:37:06,  1.77it/s]

{'loss': 2.4594, 'grad_norm': 2.3662447929382324, 'learning_rate': 0.00012263521563021204, 'epoch': 0.39}


 39%|███▊      | 6505/16798 [29:30<1:24:51,  2.02it/s]

{'loss': 1.9204, 'grad_norm': 1.9971323013305664, 'learning_rate': 0.00012262330235882773, 'epoch': 0.39}


 39%|███▊      | 6506/16798 [29:30<1:16:08,  2.25it/s]

{'loss': 2.0342, 'grad_norm': 2.9738292694091797, 'learning_rate': 0.0001226113890874434, 'epoch': 0.39}


 39%|███▊      | 6507/16798 [29:30<1:09:02,  2.48it/s]

{'loss': 2.7589, 'grad_norm': 2.5335965156555176, 'learning_rate': 0.00012259947581605912, 'epoch': 0.39}


 39%|███▊      | 6508/16798 [29:31<1:05:40,  2.61it/s]

{'loss': 1.8102, 'grad_norm': 1.962664246559143, 'learning_rate': 0.00012258756254467478, 'epoch': 0.39}


 39%|███▊      | 6509/16798 [29:31<1:03:33,  2.70it/s]

{'loss': 1.4468, 'grad_norm': 1.886055827140808, 'learning_rate': 0.00012257564927329047, 'epoch': 0.39}


 39%|███▉      | 6510/16798 [29:31<57:59,  2.96it/s]  

{'loss': 1.6877, 'grad_norm': 2.232785701751709, 'learning_rate': 0.00012256373600190613, 'epoch': 0.39}


 39%|███▉      | 6511/16798 [29:32<57:47,  2.97it/s]

{'loss': 1.4703, 'grad_norm': 1.6254370212554932, 'learning_rate': 0.00012255182273052183, 'epoch': 0.39}


 39%|███▉      | 6512/16798 [29:32<59:17,  2.89it/s]

{'loss': 1.6957, 'grad_norm': 1.9047330617904663, 'learning_rate': 0.0001225399094591375, 'epoch': 0.39}


 39%|███▉      | 6513/16798 [29:32<57:11,  3.00it/s]

{'loss': 1.322, 'grad_norm': 1.6832900047302246, 'learning_rate': 0.00012252799618775318, 'epoch': 0.39}


 39%|███▉      | 6514/16798 [29:33<56:30,  3.03it/s]

{'loss': 1.4244, 'grad_norm': 1.789988398551941, 'learning_rate': 0.00012251608291636884, 'epoch': 0.39}


 39%|███▉      | 6515/16798 [29:33<57:04,  3.00it/s]

{'loss': 1.4774, 'grad_norm': 1.9503107070922852, 'learning_rate': 0.00012250416964498453, 'epoch': 0.39}


 39%|███▉      | 6516/16798 [29:33<55:40,  3.08it/s]

{'loss': 1.4204, 'grad_norm': 1.8018795251846313, 'learning_rate': 0.0001224922563736002, 'epoch': 0.39}


 39%|███▉      | 6517/16798 [29:34<54:31,  3.14it/s]

{'loss': 1.2571, 'grad_norm': 1.7876200675964355, 'learning_rate': 0.0001224803431022159, 'epoch': 0.39}


 39%|███▉      | 6518/16798 [29:34<54:24,  3.15it/s]

{'loss': 1.3831, 'grad_norm': 1.885115623474121, 'learning_rate': 0.00012246842983083155, 'epoch': 0.39}


 39%|███▉      | 6519/16798 [29:34<52:18,  3.28it/s]

{'loss': 1.4813, 'grad_norm': 1.8089944124221802, 'learning_rate': 0.00012245651655944724, 'epoch': 0.39}


 39%|███▉      | 6520/16798 [29:34<53:02,  3.23it/s]

{'loss': 1.6394, 'grad_norm': 1.8909685611724854, 'learning_rate': 0.0001224446032880629, 'epoch': 0.39}


 39%|███▉      | 6521/16798 [29:35<52:12,  3.28it/s]

{'loss': 1.794, 'grad_norm': 2.166135787963867, 'learning_rate': 0.0001224326900166786, 'epoch': 0.39}


 39%|███▉      | 6522/16798 [29:35<54:02,  3.17it/s]

{'loss': 1.7199, 'grad_norm': 1.9880528450012207, 'learning_rate': 0.00012242077674529426, 'epoch': 0.39}


 39%|███▉      | 6523/16798 [29:35<55:03,  3.11it/s]

{'loss': 1.7651, 'grad_norm': 2.3605966567993164, 'learning_rate': 0.00012240886347390995, 'epoch': 0.39}


 39%|███▉      | 6524/16798 [29:36<56:48,  3.01it/s]

{'loss': 1.6624, 'grad_norm': 2.1350762844085693, 'learning_rate': 0.00012239695020252562, 'epoch': 0.39}


 39%|███▉      | 6525/16798 [29:36<58:07,  2.95it/s]

{'loss': 1.8908, 'grad_norm': 2.2152671813964844, 'learning_rate': 0.0001223850369311413, 'epoch': 0.39}


 39%|███▉      | 6526/16798 [29:36<53:08,  3.22it/s]

{'loss': 1.7468, 'grad_norm': 2.095924139022827, 'learning_rate': 0.00012237312365975697, 'epoch': 0.39}


 39%|███▉      | 6527/16798 [29:37<53:28,  3.20it/s]

{'loss': 1.4866, 'grad_norm': 1.7705875635147095, 'learning_rate': 0.00012236121038837266, 'epoch': 0.39}


 39%|███▉      | 6528/16798 [29:37<54:33,  3.14it/s]

{'loss': 1.7045, 'grad_norm': 2.041149377822876, 'learning_rate': 0.00012234929711698832, 'epoch': 0.39}


 39%|███▉      | 6529/16798 [29:37<53:37,  3.19it/s]

{'loss': 1.559, 'grad_norm': 1.9767283201217651, 'learning_rate': 0.00012233738384560402, 'epoch': 0.39}


 39%|███▉      | 6530/16798 [29:38<49:35,  3.45it/s]

{'loss': 1.716, 'grad_norm': 2.167426824569702, 'learning_rate': 0.00012232547057421968, 'epoch': 0.39}


 39%|███▉      | 6531/16798 [29:38<52:29,  3.26it/s]

{'loss': 1.671, 'grad_norm': 2.1069228649139404, 'learning_rate': 0.00012231355730283537, 'epoch': 0.39}


 39%|███▉      | 6532/16798 [29:38<53:23,  3.20it/s]

{'loss': 1.479, 'grad_norm': 2.1814191341400146, 'learning_rate': 0.00012230164403145103, 'epoch': 0.39}


 39%|███▉      | 6533/16798 [29:39<52:41,  3.25it/s]

{'loss': 0.9315, 'grad_norm': 1.5370546579360962, 'learning_rate': 0.00012228973076006672, 'epoch': 0.39}


 39%|███▉      | 6534/16798 [29:39<53:04,  3.22it/s]

{'loss': 1.7202, 'grad_norm': 2.0757038593292236, 'learning_rate': 0.0001222778174886824, 'epoch': 0.39}


 39%|███▉      | 6535/16798 [29:39<52:39,  3.25it/s]

{'loss': 0.7906, 'grad_norm': 1.4514795541763306, 'learning_rate': 0.00012226590421729808, 'epoch': 0.39}


 39%|███▉      | 6536/16798 [29:39<52:07,  3.28it/s]

{'loss': 1.1319, 'grad_norm': 1.8173056840896606, 'learning_rate': 0.00012225399094591374, 'epoch': 0.39}


 39%|███▉      | 6537/16798 [29:40<52:38,  3.25it/s]

{'loss': 1.471, 'grad_norm': 2.06355881690979, 'learning_rate': 0.00012224207767452943, 'epoch': 0.39}


 39%|███▉      | 6538/16798 [29:40<54:34,  3.13it/s]

{'loss': 1.5659, 'grad_norm': 2.236616849899292, 'learning_rate': 0.00012223016440314512, 'epoch': 0.39}


 39%|███▉      | 6539/16798 [29:40<52:18,  3.27it/s]

{'loss': 1.5567, 'grad_norm': 2.0439655780792236, 'learning_rate': 0.0001222182511317608, 'epoch': 0.39}


 39%|███▉      | 6540/16798 [29:41<53:18,  3.21it/s]

{'loss': 1.0385, 'grad_norm': 1.644813060760498, 'learning_rate': 0.00012220633786037648, 'epoch': 0.39}


 39%|███▉      | 6541/16798 [29:41<54:01,  3.16it/s]

{'loss': 0.7785, 'grad_norm': 1.7110413312911987, 'learning_rate': 0.00012219442458899214, 'epoch': 0.39}


 39%|███▉      | 6542/16798 [29:41<53:50,  3.17it/s]

{'loss': 0.9983, 'grad_norm': 1.604429006576538, 'learning_rate': 0.00012218251131760783, 'epoch': 0.39}


 39%|███▉      | 6543/16798 [29:42<56:05,  3.05it/s]

{'loss': 0.7048, 'grad_norm': 1.3276338577270508, 'learning_rate': 0.0001221705980462235, 'epoch': 0.39}


 39%|███▉      | 6544/16798 [29:42<53:27,  3.20it/s]

{'loss': 1.2477, 'grad_norm': 2.184479236602783, 'learning_rate': 0.0001221586847748392, 'epoch': 0.39}


 39%|███▉      | 6545/16798 [29:42<52:15,  3.27it/s]

{'loss': 1.404, 'grad_norm': 2.402064323425293, 'learning_rate': 0.00012214677150345485, 'epoch': 0.39}


 39%|███▉      | 6546/16798 [29:43<52:43,  3.24it/s]

{'loss': 1.0695, 'grad_norm': 1.96237051486969, 'learning_rate': 0.00012213485823207054, 'epoch': 0.39}


 39%|███▉      | 6547/16798 [29:43<53:09,  3.21it/s]

{'loss': 0.5387, 'grad_norm': 1.0955266952514648, 'learning_rate': 0.0001221229449606862, 'epoch': 0.39}


 39%|███▉      | 6548/16798 [29:43<53:56,  3.17it/s]

{'loss': 0.676, 'grad_norm': 1.414820909500122, 'learning_rate': 0.0001221110316893019, 'epoch': 0.39}


 39%|███▉      | 6549/16798 [29:44<54:14,  3.15it/s]

{'loss': 0.6394, 'grad_norm': 1.482015609741211, 'learning_rate': 0.00012209911841791756, 'epoch': 0.39}


 39%|███▉      | 6550/16798 [29:44<50:40,  3.37it/s]

{'loss': 0.4661, 'grad_norm': 1.2013403177261353, 'learning_rate': 0.00012208720514653325, 'epoch': 0.39}


 39%|███▉      | 6551/16798 [29:44<51:04,  3.34it/s]

{'loss': 2.0037, 'grad_norm': 1.9572993516921997, 'learning_rate': 0.00012207529187514891, 'epoch': 0.39}


 39%|███▉      | 6552/16798 [29:44<52:04,  3.28it/s]

{'loss': 2.1949, 'grad_norm': 2.0504603385925293, 'learning_rate': 0.0001220633786037646, 'epoch': 0.39}


 39%|███▉      | 6553/16798 [29:45<53:49,  3.17it/s]

{'loss': 1.674, 'grad_norm': 1.910240888595581, 'learning_rate': 0.00012205146533238028, 'epoch': 0.39}


 39%|███▉      | 6554/16798 [29:45<52:45,  3.24it/s]

{'loss': 2.2429, 'grad_norm': 2.239861011505127, 'learning_rate': 0.00012203955206099596, 'epoch': 0.39}


 39%|███▉      | 6555/16798 [29:45<51:46,  3.30it/s]

{'loss': 1.4216, 'grad_norm': 1.8291809558868408, 'learning_rate': 0.00012202763878961164, 'epoch': 0.39}


 39%|███▉      | 6556/16798 [29:46<54:15,  3.15it/s]

{'loss': 1.7875, 'grad_norm': 2.0462396144866943, 'learning_rate': 0.00012201572551822731, 'epoch': 0.39}


 39%|███▉      | 6557/16798 [29:46<51:30,  3.31it/s]

{'loss': 1.2417, 'grad_norm': 1.5014779567718506, 'learning_rate': 0.00012200381224684299, 'epoch': 0.39}


 39%|███▉      | 6558/16798 [29:46<53:33,  3.19it/s]

{'loss': 1.5149, 'grad_norm': 2.653289556503296, 'learning_rate': 0.00012199189897545867, 'epoch': 0.39}


 39%|███▉      | 6559/16798 [29:47<49:17,  3.46it/s]

{'loss': 1.617, 'grad_norm': 1.9992775917053223, 'learning_rate': 0.00012197998570407434, 'epoch': 0.39}


 39%|███▉      | 6560/16798 [29:47<50:38,  3.37it/s]

{'loss': 1.9893, 'grad_norm': 2.2119181156158447, 'learning_rate': 0.00012196807243269002, 'epoch': 0.39}


 39%|███▉      | 6561/16798 [29:47<52:05,  3.28it/s]

{'loss': 1.428, 'grad_norm': 1.820934534072876, 'learning_rate': 0.0001219561591613057, 'epoch': 0.39}


 39%|███▉      | 6562/16798 [29:48<52:38,  3.24it/s]

{'loss': 1.548, 'grad_norm': 1.7242889404296875, 'learning_rate': 0.00012194424588992138, 'epoch': 0.39}


 39%|███▉      | 6563/16798 [29:48<51:53,  3.29it/s]

{'loss': 1.2699, 'grad_norm': 2.2391862869262695, 'learning_rate': 0.00012193233261853705, 'epoch': 0.39}


 39%|███▉      | 6564/16798 [29:48<51:02,  3.34it/s]

{'loss': 1.7994, 'grad_norm': 2.2326838970184326, 'learning_rate': 0.00012192041934715273, 'epoch': 0.39}


 39%|███▉      | 6565/16798 [29:48<51:11,  3.33it/s]

{'loss': 1.4803, 'grad_norm': 1.8436371088027954, 'learning_rate': 0.00012190850607576841, 'epoch': 0.39}


 39%|███▉      | 6566/16798 [29:49<51:41,  3.30it/s]

{'loss': 1.5214, 'grad_norm': 1.918318510055542, 'learning_rate': 0.00012189659280438409, 'epoch': 0.39}


 39%|███▉      | 6567/16798 [29:49<50:32,  3.37it/s]

{'loss': 2.0069, 'grad_norm': 2.088003396987915, 'learning_rate': 0.00012188467953299976, 'epoch': 0.39}


 39%|███▉      | 6568/16798 [29:49<52:32,  3.25it/s]

{'loss': 1.3371, 'grad_norm': 1.7254568338394165, 'learning_rate': 0.00012187276626161545, 'epoch': 0.39}


 39%|███▉      | 6569/16798 [29:50<54:04,  3.15it/s]

{'loss': 1.5711, 'grad_norm': 2.115565776824951, 'learning_rate': 0.00012186085299023113, 'epoch': 0.39}


 39%|███▉      | 6570/16798 [29:50<50:39,  3.37it/s]

{'loss': 1.2705, 'grad_norm': 1.6158350706100464, 'learning_rate': 0.00012184893971884681, 'epoch': 0.39}


 39%|███▉      | 6571/16798 [29:50<51:21,  3.32it/s]

{'loss': 1.3159, 'grad_norm': 1.7693791389465332, 'learning_rate': 0.00012183702644746248, 'epoch': 0.39}


 39%|███▉      | 6572/16798 [29:51<53:49,  3.17it/s]

{'loss': 1.4838, 'grad_norm': 2.143289566040039, 'learning_rate': 0.00012182511317607816, 'epoch': 0.39}


 39%|███▉      | 6573/16798 [29:51<55:12,  3.09it/s]

{'loss': 1.8392, 'grad_norm': 2.400608777999878, 'learning_rate': 0.00012181319990469384, 'epoch': 0.39}


 39%|███▉      | 6574/16798 [29:51<53:44,  3.17it/s]

{'loss': 1.5656, 'grad_norm': 1.8480684757232666, 'learning_rate': 0.00012180128663330952, 'epoch': 0.39}


 39%|███▉      | 6575/16798 [29:52<53:56,  3.16it/s]

{'loss': 1.6954, 'grad_norm': 2.169027090072632, 'learning_rate': 0.0001217893733619252, 'epoch': 0.39}


 39%|███▉      | 6576/16798 [29:52<55:29,  3.07it/s]

{'loss': 1.6402, 'grad_norm': 1.9426283836364746, 'learning_rate': 0.00012177746009054087, 'epoch': 0.39}


 39%|███▉      | 6577/16798 [29:52<54:18,  3.14it/s]

{'loss': 1.4105, 'grad_norm': 2.0720415115356445, 'learning_rate': 0.00012176554681915655, 'epoch': 0.39}


 39%|███▉      | 6578/16798 [29:53<56:08,  3.03it/s]

{'loss': 1.4332, 'grad_norm': 1.8119478225708008, 'learning_rate': 0.00012175363354777223, 'epoch': 0.39}


 39%|███▉      | 6579/16798 [29:53<56:17,  3.03it/s]

{'loss': 1.7226, 'grad_norm': 2.1029067039489746, 'learning_rate': 0.0001217417202763879, 'epoch': 0.39}


 39%|███▉      | 6580/16798 [29:53<51:57,  3.28it/s]

{'loss': 1.5739, 'grad_norm': 2.065434217453003, 'learning_rate': 0.00012172980700500358, 'epoch': 0.39}


 39%|███▉      | 6581/16798 [29:53<51:27,  3.31it/s]

{'loss': 1.6125, 'grad_norm': 2.272177219390869, 'learning_rate': 0.00012171789373361926, 'epoch': 0.39}


 39%|███▉      | 6582/16798 [29:54<53:24,  3.19it/s]

{'loss': 1.4631, 'grad_norm': 1.8348923921585083, 'learning_rate': 0.00012170598046223493, 'epoch': 0.39}


 39%|███▉      | 6583/16798 [29:54<52:26,  3.25it/s]

{'loss': 1.2273, 'grad_norm': 1.6748671531677246, 'learning_rate': 0.00012169406719085061, 'epoch': 0.39}


 39%|███▉      | 6584/16798 [29:54<51:41,  3.29it/s]

{'loss': 1.222, 'grad_norm': 1.9337518215179443, 'learning_rate': 0.00012168215391946629, 'epoch': 0.39}


 39%|███▉      | 6585/16798 [29:55<51:14,  3.32it/s]

{'loss': 1.6026, 'grad_norm': 2.071523666381836, 'learning_rate': 0.00012167024064808197, 'epoch': 0.39}


 39%|███▉      | 6586/16798 [29:55<51:21,  3.31it/s]

{'loss': 1.7311, 'grad_norm': 2.2745485305786133, 'learning_rate': 0.00012165832737669764, 'epoch': 0.39}


 39%|███▉      | 6587/16798 [29:55<50:30,  3.37it/s]

{'loss': 1.0901, 'grad_norm': 1.6328985691070557, 'learning_rate': 0.00012164641410531332, 'epoch': 0.39}


 39%|███▉      | 6588/16798 [29:56<50:58,  3.34it/s]

{'loss': 1.245, 'grad_norm': 1.5747267007827759, 'learning_rate': 0.000121634500833929, 'epoch': 0.39}


 39%|███▉      | 6589/16798 [29:56<50:52,  3.34it/s]

{'loss': 1.6281, 'grad_norm': 1.80832040309906, 'learning_rate': 0.00012162258756254467, 'epoch': 0.39}


 39%|███▉      | 6590/16798 [29:56<52:23,  3.25it/s]

{'loss': 1.0989, 'grad_norm': 1.6612200736999512, 'learning_rate': 0.00012161067429116035, 'epoch': 0.39}


 39%|███▉      | 6591/16798 [29:56<50:33,  3.36it/s]

{'loss': 1.543, 'grad_norm': 1.9348284006118774, 'learning_rate': 0.00012159876101977603, 'epoch': 0.39}


 39%|███▉      | 6592/16798 [29:57<52:00,  3.27it/s]

{'loss': 1.2681, 'grad_norm': 1.7359415292739868, 'learning_rate': 0.0001215868477483917, 'epoch': 0.39}


 39%|███▉      | 6593/16798 [29:57<50:37,  3.36it/s]

{'loss': 0.9765, 'grad_norm': 1.5249979496002197, 'learning_rate': 0.00012157493447700738, 'epoch': 0.39}


 39%|███▉      | 6594/16798 [29:57<49:18,  3.45it/s]

{'loss': 1.5189, 'grad_norm': 2.1768016815185547, 'learning_rate': 0.00012156302120562306, 'epoch': 0.39}


 39%|███▉      | 6595/16798 [29:58<50:14,  3.38it/s]

{'loss': 0.9165, 'grad_norm': 1.5150141716003418, 'learning_rate': 0.00012155110793423874, 'epoch': 0.39}


 39%|███▉      | 6596/16798 [29:58<49:19,  3.45it/s]

{'loss': 0.6399, 'grad_norm': 1.2126195430755615, 'learning_rate': 0.00012153919466285442, 'epoch': 0.39}


 39%|███▉      | 6597/16798 [29:58<48:56,  3.47it/s]

{'loss': 0.843, 'grad_norm': 1.4601632356643677, 'learning_rate': 0.00012152728139147009, 'epoch': 0.39}


 39%|███▉      | 6598/16798 [29:59<50:51,  3.34it/s]

{'loss': 0.8043, 'grad_norm': 1.7666655778884888, 'learning_rate': 0.00012151536812008577, 'epoch': 0.39}


 39%|███▉      | 6599/16798 [29:59<52:25,  3.24it/s]

{'loss': 0.6203, 'grad_norm': 1.2498172521591187, 'learning_rate': 0.00012150345484870147, 'epoch': 0.39}


 39%|███▉      | 6600/16798 [29:59<52:38,  3.23it/s]

{'loss': 0.2464, 'grad_norm': 0.6705291271209717, 'learning_rate': 0.00012149154157731715, 'epoch': 0.39}


 39%|███▉      | 6601/16798 [29:59<52:45,  3.22it/s]

{'loss': 1.8886, 'grad_norm': 2.070051431655884, 'learning_rate': 0.00012147962830593283, 'epoch': 0.39}


 39%|███▉      | 6602/16798 [30:00<52:26,  3.24it/s]

{'loss': 2.0073, 'grad_norm': 1.8511735200881958, 'learning_rate': 0.0001214677150345485, 'epoch': 0.39}


 39%|███▉      | 6603/16798 [30:00<52:20,  3.25it/s]

{'loss': 1.893, 'grad_norm': 1.9717023372650146, 'learning_rate': 0.00012145580176316418, 'epoch': 0.39}


 39%|███▉      | 6604/16798 [30:00<50:32,  3.36it/s]

{'loss': 2.3306, 'grad_norm': 2.2036020755767822, 'learning_rate': 0.00012144388849177986, 'epoch': 0.39}


 39%|███▉      | 6605/16798 [30:01<52:45,  3.22it/s]

{'loss': 1.5804, 'grad_norm': 1.877479076385498, 'learning_rate': 0.00012143197522039554, 'epoch': 0.39}


 39%|███▉      | 6606/16798 [30:01<52:36,  3.23it/s]

{'loss': 1.9559, 'grad_norm': 1.9615387916564941, 'learning_rate': 0.00012142006194901121, 'epoch': 0.39}


 39%|███▉      | 6607/16798 [30:01<51:41,  3.29it/s]

{'loss': 1.6861, 'grad_norm': 1.7550723552703857, 'learning_rate': 0.00012140814867762689, 'epoch': 0.39}


 39%|███▉      | 6608/16798 [30:02<51:56,  3.27it/s]

{'loss': 1.9125, 'grad_norm': 1.8324670791625977, 'learning_rate': 0.00012139623540624257, 'epoch': 0.39}


 39%|███▉      | 6609/16798 [30:02<51:31,  3.30it/s]

{'loss': 1.7112, 'grad_norm': 1.8362236022949219, 'learning_rate': 0.00012138432213485825, 'epoch': 0.39}


 39%|███▉      | 6610/16798 [30:02<48:23,  3.51it/s]

{'loss': 1.6864, 'grad_norm': 1.9576301574707031, 'learning_rate': 0.00012137240886347392, 'epoch': 0.39}


 39%|███▉      | 6611/16798 [30:02<49:03,  3.46it/s]

{'loss': 1.0518, 'grad_norm': 1.4183257818222046, 'learning_rate': 0.0001213604955920896, 'epoch': 0.39}


 39%|███▉      | 6612/16798 [30:03<51:34,  3.29it/s]

{'loss': 1.6992, 'grad_norm': 1.7985787391662598, 'learning_rate': 0.00012134858232070528, 'epoch': 0.39}


 39%|███▉      | 6613/16798 [30:03<50:24,  3.37it/s]

{'loss': 1.672, 'grad_norm': 2.15946626663208, 'learning_rate': 0.00012133666904932095, 'epoch': 0.39}


 39%|███▉      | 6614/16798 [30:03<50:18,  3.37it/s]

{'loss': 1.7266, 'grad_norm': 2.091609239578247, 'learning_rate': 0.00012132475577793663, 'epoch': 0.39}


 39%|███▉      | 6615/16798 [30:04<51:00,  3.33it/s]

{'loss': 1.7107, 'grad_norm': 2.0352604389190674, 'learning_rate': 0.00012131284250655231, 'epoch': 0.39}


 39%|███▉      | 6616/16798 [30:04<49:23,  3.44it/s]

{'loss': 1.0866, 'grad_norm': 1.529726505279541, 'learning_rate': 0.00012130092923516799, 'epoch': 0.39}


 39%|███▉      | 6617/16798 [30:04<49:47,  3.41it/s]

{'loss': 1.6455, 'grad_norm': 2.124493360519409, 'learning_rate': 0.00012128901596378366, 'epoch': 0.39}


 39%|███▉      | 6618/16798 [30:05<50:23,  3.37it/s]

{'loss': 1.5525, 'grad_norm': 2.3925955295562744, 'learning_rate': 0.00012127710269239934, 'epoch': 0.39}


 39%|███▉      | 6619/16798 [30:05<50:13,  3.38it/s]

{'loss': 1.5418, 'grad_norm': 2.171098232269287, 'learning_rate': 0.00012126518942101502, 'epoch': 0.39}


 39%|███▉      | 6620/16798 [30:05<50:31,  3.36it/s]

{'loss': 1.6786, 'grad_norm': 1.7698570489883423, 'learning_rate': 0.0001212532761496307, 'epoch': 0.39}


 39%|███▉      | 6621/16798 [30:05<50:22,  3.37it/s]

{'loss': 1.316, 'grad_norm': 1.8625538349151611, 'learning_rate': 0.00012124136287824637, 'epoch': 0.39}


 39%|███▉      | 6622/16798 [30:06<50:32,  3.36it/s]

{'loss': 1.6471, 'grad_norm': 2.0065901279449463, 'learning_rate': 0.00012122944960686205, 'epoch': 0.39}


 39%|███▉      | 6623/16798 [30:06<52:10,  3.25it/s]

{'loss': 1.7389, 'grad_norm': 2.0536460876464844, 'learning_rate': 0.00012121753633547773, 'epoch': 0.39}


 39%|███▉      | 6624/16798 [30:06<53:16,  3.18it/s]

{'loss': 2.0494, 'grad_norm': 2.236820936203003, 'learning_rate': 0.0001212056230640934, 'epoch': 0.39}


 39%|███▉      | 6625/16798 [30:07<53:44,  3.15it/s]

{'loss': 1.6166, 'grad_norm': 1.877853274345398, 'learning_rate': 0.00012119370979270908, 'epoch': 0.39}


 39%|███▉      | 6626/16798 [30:07<52:59,  3.20it/s]

{'loss': 1.5269, 'grad_norm': 2.0698747634887695, 'learning_rate': 0.00012118179652132476, 'epoch': 0.39}


 39%|███▉      | 6627/16798 [30:07<51:12,  3.31it/s]

{'loss': 1.3067, 'grad_norm': 1.8471810817718506, 'learning_rate': 0.00012116988324994044, 'epoch': 0.39}


 39%|███▉      | 6628/16798 [30:08<52:08,  3.25it/s]

{'loss': 1.5325, 'grad_norm': 2.028757095336914, 'learning_rate': 0.00012115796997855611, 'epoch': 0.39}


 39%|███▉      | 6629/16798 [30:08<50:44,  3.34it/s]

{'loss': 1.3733, 'grad_norm': 1.7513490915298462, 'learning_rate': 0.00012114605670717179, 'epoch': 0.39}


 39%|███▉      | 6630/16798 [30:08<51:25,  3.30it/s]

{'loss': 1.2836, 'grad_norm': 2.3800477981567383, 'learning_rate': 0.00012113414343578748, 'epoch': 0.39}


 39%|███▉      | 6631/16798 [30:09<51:14,  3.31it/s]

{'loss': 1.6908, 'grad_norm': 2.812870740890503, 'learning_rate': 0.00012112223016440316, 'epoch': 0.39}


 39%|███▉      | 6632/16798 [30:09<52:43,  3.21it/s]

{'loss': 1.3724, 'grad_norm': 2.191948652267456, 'learning_rate': 0.00012111031689301884, 'epoch': 0.39}


 39%|███▉      | 6633/16798 [30:09<51:21,  3.30it/s]

{'loss': 1.2598, 'grad_norm': 1.9897117614746094, 'learning_rate': 0.00012109840362163451, 'epoch': 0.39}


 39%|███▉      | 6634/16798 [30:09<50:15,  3.37it/s]

{'loss': 1.7298, 'grad_norm': 2.2208802700042725, 'learning_rate': 0.00012108649035025019, 'epoch': 0.39}


 39%|███▉      | 6635/16798 [30:10<51:57,  3.26it/s]

{'loss': 1.7022, 'grad_norm': 2.338165760040283, 'learning_rate': 0.00012107457707886587, 'epoch': 0.39}


 40%|███▉      | 6636/16798 [30:10<51:33,  3.29it/s]

{'loss': 1.5247, 'grad_norm': 2.1092302799224854, 'learning_rate': 0.00012106266380748154, 'epoch': 0.4}


 40%|███▉      | 6637/16798 [30:10<50:32,  3.35it/s]

{'loss': 1.3313, 'grad_norm': 1.780773639678955, 'learning_rate': 0.00012105075053609722, 'epoch': 0.4}


 40%|███▉      | 6638/16798 [30:11<50:48,  3.33it/s]

{'loss': 1.3922, 'grad_norm': 2.0088815689086914, 'learning_rate': 0.0001210388372647129, 'epoch': 0.4}


 40%|███▉      | 6639/16798 [30:11<51:05,  3.31it/s]

{'loss': 1.0744, 'grad_norm': 1.5803442001342773, 'learning_rate': 0.00012102692399332858, 'epoch': 0.4}


 40%|███▉      | 6640/16798 [30:11<48:49,  3.47it/s]

{'loss': 1.4999, 'grad_norm': 2.102208375930786, 'learning_rate': 0.00012101501072194425, 'epoch': 0.4}


 40%|███▉      | 6641/16798 [30:11<49:08,  3.44it/s]

{'loss': 0.8328, 'grad_norm': 1.305418848991394, 'learning_rate': 0.00012100309745055993, 'epoch': 0.4}


 40%|███▉      | 6642/16798 [30:12<50:48,  3.33it/s]

{'loss': 1.4079, 'grad_norm': 2.0979080200195312, 'learning_rate': 0.00012099118417917561, 'epoch': 0.4}


 40%|███▉      | 6643/16798 [30:12<50:54,  3.32it/s]

{'loss': 0.9352, 'grad_norm': 1.7060582637786865, 'learning_rate': 0.00012097927090779128, 'epoch': 0.4}


 40%|███▉      | 6644/16798 [30:12<48:45,  3.47it/s]

{'loss': 0.8037, 'grad_norm': 1.1874492168426514, 'learning_rate': 0.00012096735763640696, 'epoch': 0.4}


 40%|███▉      | 6645/16798 [30:13<55:18,  3.06it/s]

{'loss': 0.9152, 'grad_norm': 1.6998772621154785, 'learning_rate': 0.00012095544436502264, 'epoch': 0.4}


 40%|███▉      | 6646/16798 [30:13<49:49,  3.40it/s]

{'loss': 0.5452, 'grad_norm': 1.4125885963439941, 'learning_rate': 0.00012094353109363832, 'epoch': 0.4}


 40%|███▉      | 6647/16798 [30:13<48:54,  3.46it/s]

{'loss': 0.2233, 'grad_norm': 0.6699957847595215, 'learning_rate': 0.000120931617822254, 'epoch': 0.4}


 40%|███▉      | 6648/16798 [30:14<47:51,  3.53it/s]

{'loss': 0.4391, 'grad_norm': 1.0309280157089233, 'learning_rate': 0.00012091970455086967, 'epoch': 0.4}


 40%|███▉      | 6649/16798 [30:14<47:38,  3.55it/s]

{'loss': 0.6317, 'grad_norm': 1.2828869819641113, 'learning_rate': 0.00012090779127948535, 'epoch': 0.4}


 40%|███▉      | 6650/16798 [30:14<49:52,  3.39it/s]

{'loss': 0.3053, 'grad_norm': 0.963359534740448, 'learning_rate': 0.00012089587800810103, 'epoch': 0.4}


 40%|███▉      | 6651/16798 [30:14<51:08,  3.31it/s]

{'loss': 1.7806, 'grad_norm': 2.1156537532806396, 'learning_rate': 0.0001208839647367167, 'epoch': 0.4}


 40%|███▉      | 6652/16798 [30:15<50:44,  3.33it/s]

{'loss': 1.5615, 'grad_norm': 1.911555528640747, 'learning_rate': 0.00012087205146533238, 'epoch': 0.4}


 40%|███▉      | 6653/16798 [30:15<49:48,  3.39it/s]

{'loss': 1.9764, 'grad_norm': 1.9354143142700195, 'learning_rate': 0.00012086013819394806, 'epoch': 0.4}


 40%|███▉      | 6654/16798 [30:15<51:10,  3.30it/s]

{'loss': 1.8008, 'grad_norm': 1.9944733381271362, 'learning_rate': 0.00012084822492256373, 'epoch': 0.4}


 40%|███▉      | 6655/16798 [30:16<52:13,  3.24it/s]

{'loss': 2.5048, 'grad_norm': 2.3922488689422607, 'learning_rate': 0.00012083631165117941, 'epoch': 0.4}


 40%|███▉      | 6656/16798 [30:16<52:42,  3.21it/s]

{'loss': 1.8557, 'grad_norm': 1.9720900058746338, 'learning_rate': 0.00012082439837979509, 'epoch': 0.4}


 40%|███▉      | 6657/16798 [30:16<50:38,  3.34it/s]

{'loss': 1.9822, 'grad_norm': 1.8803390264511108, 'learning_rate': 0.00012081248510841077, 'epoch': 0.4}


 40%|███▉      | 6658/16798 [30:17<50:13,  3.36it/s]

{'loss': 1.2863, 'grad_norm': 1.6073061227798462, 'learning_rate': 0.00012080057183702644, 'epoch': 0.4}


 40%|███▉      | 6659/16798 [30:17<51:40,  3.27it/s]

{'loss': 1.9254, 'grad_norm': 1.922886610031128, 'learning_rate': 0.00012078865856564212, 'epoch': 0.4}


 40%|███▉      | 6660/16798 [30:17<50:33,  3.34it/s]

{'loss': 1.7631, 'grad_norm': 2.0109996795654297, 'learning_rate': 0.0001207767452942578, 'epoch': 0.4}


 40%|███▉      | 6661/16798 [30:17<49:55,  3.38it/s]

{'loss': 1.187, 'grad_norm': 1.4728999137878418, 'learning_rate': 0.0001207648320228735, 'epoch': 0.4}


 40%|███▉      | 6662/16798 [30:18<54:00,  3.13it/s]

{'loss': 1.5968, 'grad_norm': 2.614957571029663, 'learning_rate': 0.00012075291875148918, 'epoch': 0.4}


 40%|███▉      | 6663/16798 [30:18<52:42,  3.21it/s]

{'loss': 1.5765, 'grad_norm': 1.7778886556625366, 'learning_rate': 0.00012074100548010486, 'epoch': 0.4}


 40%|███▉      | 6664/16798 [30:18<52:59,  3.19it/s]

{'loss': 1.3959, 'grad_norm': 2.189812421798706, 'learning_rate': 0.00012072909220872053, 'epoch': 0.4}


 40%|███▉      | 6665/16798 [30:19<51:13,  3.30it/s]

{'loss': 1.4908, 'grad_norm': 1.9480317831039429, 'learning_rate': 0.00012071717893733621, 'epoch': 0.4}


 40%|███▉      | 6666/16798 [30:19<50:58,  3.31it/s]

{'loss': 1.5029, 'grad_norm': 1.805534839630127, 'learning_rate': 0.00012070526566595189, 'epoch': 0.4}


 40%|███▉      | 6667/16798 [30:19<51:02,  3.31it/s]

{'loss': 1.7599, 'grad_norm': 1.9743678569793701, 'learning_rate': 0.00012069335239456756, 'epoch': 0.4}


 40%|███▉      | 6668/16798 [30:20<52:28,  3.22it/s]

{'loss': 1.8655, 'grad_norm': 1.939008116722107, 'learning_rate': 0.00012068143912318324, 'epoch': 0.4}


 40%|███▉      | 6669/16798 [30:20<52:24,  3.22it/s]

{'loss': 1.2791, 'grad_norm': 1.5982239246368408, 'learning_rate': 0.00012066952585179892, 'epoch': 0.4}


 40%|███▉      | 6670/16798 [30:20<49:43,  3.39it/s]

{'loss': 1.6418, 'grad_norm': 1.9991258382797241, 'learning_rate': 0.0001206576125804146, 'epoch': 0.4}


 40%|███▉      | 6671/16798 [30:21<50:25,  3.35it/s]

{'loss': 1.7129, 'grad_norm': 2.2793405055999756, 'learning_rate': 0.00012064569930903027, 'epoch': 0.4}


 40%|███▉      | 6672/16798 [30:21<1:00:19,  2.80it/s]

{'loss': 1.5772, 'grad_norm': 1.912345051765442, 'learning_rate': 0.00012063378603764595, 'epoch': 0.4}


 40%|███▉      | 6673/16798 [30:21<54:27,  3.10it/s]  

{'loss': 1.2928, 'grad_norm': 1.7659952640533447, 'learning_rate': 0.00012062187276626163, 'epoch': 0.4}


 40%|███▉      | 6674/16798 [30:22<53:11,  3.17it/s]

{'loss': 1.3281, 'grad_norm': 1.7180559635162354, 'learning_rate': 0.0001206099594948773, 'epoch': 0.4}


 40%|███▉      | 6675/16798 [30:22<50:26,  3.34it/s]

{'loss': 1.4181, 'grad_norm': 1.9643594026565552, 'learning_rate': 0.00012059804622349298, 'epoch': 0.4}


 40%|███▉      | 6676/16798 [30:22<51:26,  3.28it/s]

{'loss': 1.9376, 'grad_norm': 2.4059348106384277, 'learning_rate': 0.00012058613295210866, 'epoch': 0.4}


 40%|███▉      | 6677/16798 [30:22<51:50,  3.25it/s]

{'loss': 1.9947, 'grad_norm': 2.2175698280334473, 'learning_rate': 0.00012057421968072434, 'epoch': 0.4}


 40%|███▉      | 6678/16798 [30:23<49:10,  3.43it/s]

{'loss': 1.0126, 'grad_norm': 1.7907360792160034, 'learning_rate': 0.00012056230640934001, 'epoch': 0.4}


 40%|███▉      | 6679/16798 [30:23<53:21,  3.16it/s]

{'loss': 1.6326, 'grad_norm': 2.136145830154419, 'learning_rate': 0.00012055039313795569, 'epoch': 0.4}


 40%|███▉      | 6680/16798 [30:23<53:19,  3.16it/s]

{'loss': 2.0382, 'grad_norm': 2.5863969326019287, 'learning_rate': 0.00012053847986657137, 'epoch': 0.4}


 40%|███▉      | 6681/16798 [30:24<53:54,  3.13it/s]

{'loss': 1.3575, 'grad_norm': 1.8054864406585693, 'learning_rate': 0.00012052656659518705, 'epoch': 0.4}


 40%|███▉      | 6682/16798 [30:24<54:02,  3.12it/s]

{'loss': 1.5218, 'grad_norm': 1.861679196357727, 'learning_rate': 0.00012051465332380272, 'epoch': 0.4}


 40%|███▉      | 6683/16798 [30:24<52:25,  3.22it/s]

{'loss': 1.5823, 'grad_norm': 2.2218985557556152, 'learning_rate': 0.0001205027400524184, 'epoch': 0.4}


 40%|███▉      | 6684/16798 [30:25<53:15,  3.16it/s]

{'loss': 1.0975, 'grad_norm': 1.7358189821243286, 'learning_rate': 0.00012049082678103408, 'epoch': 0.4}


 40%|███▉      | 6685/16798 [30:25<52:15,  3.23it/s]

{'loss': 1.2217, 'grad_norm': 1.487280249595642, 'learning_rate': 0.00012047891350964975, 'epoch': 0.4}


 40%|███▉      | 6686/16798 [30:25<51:13,  3.29it/s]

{'loss': 1.3044, 'grad_norm': 1.71912682056427, 'learning_rate': 0.00012046700023826543, 'epoch': 0.4}


 40%|███▉      | 6687/16798 [30:26<52:22,  3.22it/s]

{'loss': 1.1989, 'grad_norm': 1.773688554763794, 'learning_rate': 0.00012045508696688111, 'epoch': 0.4}


 40%|███▉      | 6688/16798 [30:26<50:31,  3.34it/s]

{'loss': 1.0817, 'grad_norm': 1.7543578147888184, 'learning_rate': 0.00012044317369549679, 'epoch': 0.4}


 40%|███▉      | 6689/16798 [30:26<52:16,  3.22it/s]

{'loss': 1.3912, 'grad_norm': 2.2963953018188477, 'learning_rate': 0.00012043126042411246, 'epoch': 0.4}


 40%|███▉      | 6690/16798 [30:26<49:12,  3.42it/s]

{'loss': 1.5775, 'grad_norm': 2.2397377490997314, 'learning_rate': 0.00012041934715272814, 'epoch': 0.4}


 40%|███▉      | 6691/16798 [30:27<52:20,  3.22it/s]

{'loss': 1.1808, 'grad_norm': 2.003863573074341, 'learning_rate': 0.00012040743388134382, 'epoch': 0.4}


 40%|███▉      | 6692/16798 [30:27<50:30,  3.33it/s]

{'loss': 1.0859, 'grad_norm': 1.700624704360962, 'learning_rate': 0.00012039552060995951, 'epoch': 0.4}


 40%|███▉      | 6693/16798 [30:27<48:45,  3.45it/s]

{'loss': 1.4371, 'grad_norm': 2.0870676040649414, 'learning_rate': 0.00012038360733857519, 'epoch': 0.4}


 40%|███▉      | 6694/16798 [30:28<50:05,  3.36it/s]

{'loss': 1.3629, 'grad_norm': 2.027385711669922, 'learning_rate': 0.00012037169406719086, 'epoch': 0.4}


 40%|███▉      | 6695/16798 [30:28<51:24,  3.28it/s]

{'loss': 0.6256, 'grad_norm': 1.3573793172836304, 'learning_rate': 0.00012035978079580654, 'epoch': 0.4}


 40%|███▉      | 6696/16798 [30:28<50:29,  3.33it/s]

{'loss': 0.8532, 'grad_norm': 1.4863839149475098, 'learning_rate': 0.00012034786752442222, 'epoch': 0.4}


 40%|███▉      | 6697/16798 [30:29<49:42,  3.39it/s]

{'loss': 0.7684, 'grad_norm': 1.5807286500930786, 'learning_rate': 0.0001203359542530379, 'epoch': 0.4}


 40%|███▉      | 6698/16798 [30:29<49:23,  3.41it/s]

{'loss': 0.6736, 'grad_norm': 1.3613959550857544, 'learning_rate': 0.00012032404098165357, 'epoch': 0.4}


 40%|███▉      | 6699/16798 [30:29<50:15,  3.35it/s]

{'loss': 0.3603, 'grad_norm': 0.9009444117546082, 'learning_rate': 0.00012031212771026925, 'epoch': 0.4}


 40%|███▉      | 6700/16798 [30:29<50:56,  3.30it/s]

{'loss': 0.8974, 'grad_norm': 1.7347360849380493, 'learning_rate': 0.00012030021443888493, 'epoch': 0.4}


 40%|███▉      | 6701/16798 [30:30<51:40,  3.26it/s]

{'loss': 2.0228, 'grad_norm': 1.8409781455993652, 'learning_rate': 0.0001202883011675006, 'epoch': 0.4}


 40%|███▉      | 6702/16798 [30:30<52:56,  3.18it/s]

{'loss': 1.6888, 'grad_norm': 1.8725911378860474, 'learning_rate': 0.00012027638789611628, 'epoch': 0.4}


 40%|███▉      | 6703/16798 [30:30<51:46,  3.25it/s]

{'loss': 2.0153, 'grad_norm': 1.7448575496673584, 'learning_rate': 0.00012026447462473196, 'epoch': 0.4}


 40%|███▉      | 6704/16798 [30:31<53:21,  3.15it/s]

{'loss': 2.1565, 'grad_norm': 2.2526051998138428, 'learning_rate': 0.00012025256135334764, 'epoch': 0.4}


 40%|███▉      | 6705/16798 [30:31<51:46,  3.25it/s]

{'loss': 1.848, 'grad_norm': 1.8754903078079224, 'learning_rate': 0.00012024064808196331, 'epoch': 0.4}


 40%|███▉      | 6706/16798 [30:31<52:26,  3.21it/s]

{'loss': 1.3776, 'grad_norm': 2.1832621097564697, 'learning_rate': 0.00012022873481057899, 'epoch': 0.4}


 40%|███▉      | 6707/16798 [30:32<52:17,  3.22it/s]

{'loss': 2.0852, 'grad_norm': 2.101325511932373, 'learning_rate': 0.00012021682153919467, 'epoch': 0.4}


 40%|███▉      | 6708/16798 [30:32<51:47,  3.25it/s]

{'loss': 1.5327, 'grad_norm': 1.8403764963150024, 'learning_rate': 0.00012020490826781034, 'epoch': 0.4}


 40%|███▉      | 6709/16798 [30:32<54:55,  3.06it/s]

{'loss': 1.925, 'grad_norm': 2.190943479537964, 'learning_rate': 0.00012019299499642602, 'epoch': 0.4}


 40%|███▉      | 6710/16798 [30:33<53:54,  3.12it/s]

{'loss': 1.8725, 'grad_norm': 1.9112900495529175, 'learning_rate': 0.0001201810817250417, 'epoch': 0.4}


 40%|███▉      | 6711/16798 [30:33<52:06,  3.23it/s]

{'loss': 1.5731, 'grad_norm': 1.7908003330230713, 'learning_rate': 0.00012016916845365738, 'epoch': 0.4}


 40%|███▉      | 6712/16798 [30:33<52:20,  3.21it/s]

{'loss': 1.3633, 'grad_norm': 1.773650050163269, 'learning_rate': 0.00012015725518227305, 'epoch': 0.4}


 40%|███▉      | 6713/16798 [30:34<51:56,  3.24it/s]

{'loss': 1.4378, 'grad_norm': 1.874085545539856, 'learning_rate': 0.00012014534191088873, 'epoch': 0.4}


 40%|███▉      | 6714/16798 [30:34<54:06,  3.11it/s]

{'loss': 1.7856, 'grad_norm': 2.123457908630371, 'learning_rate': 0.00012013342863950441, 'epoch': 0.4}


 40%|███▉      | 6715/16798 [30:34<55:00,  3.06it/s]

{'loss': 2.084, 'grad_norm': 1.9086073637008667, 'learning_rate': 0.00012012151536812008, 'epoch': 0.4}


 40%|███▉      | 6716/16798 [30:35<53:15,  3.15it/s]

{'loss': 1.7451, 'grad_norm': 1.9446097612380981, 'learning_rate': 0.00012010960209673576, 'epoch': 0.4}


 40%|███▉      | 6717/16798 [30:35<53:03,  3.17it/s]

{'loss': 1.3903, 'grad_norm': 1.755499005317688, 'learning_rate': 0.00012009768882535144, 'epoch': 0.4}


 40%|███▉      | 6718/16798 [30:35<52:37,  3.19it/s]

{'loss': 1.2936, 'grad_norm': 1.6353893280029297, 'learning_rate': 0.00012008577555396712, 'epoch': 0.4}


 40%|███▉      | 6719/16798 [30:35<52:55,  3.17it/s]

{'loss': 1.6031, 'grad_norm': 1.9095816612243652, 'learning_rate': 0.0001200738622825828, 'epoch': 0.4}


 40%|████      | 6720/16798 [30:36<52:59,  3.17it/s]

{'loss': 1.4094, 'grad_norm': 1.6847878694534302, 'learning_rate': 0.00012006194901119847, 'epoch': 0.4}


 40%|████      | 6721/16798 [30:36<52:39,  3.19it/s]

{'loss': 1.5744, 'grad_norm': 1.8501667976379395, 'learning_rate': 0.00012005003573981415, 'epoch': 0.4}


 40%|████      | 6722/16798 [30:36<54:02,  3.11it/s]

{'loss': 1.7605, 'grad_norm': 2.177551507949829, 'learning_rate': 0.00012003812246842983, 'epoch': 0.4}


 40%|████      | 6723/16798 [30:37<57:41,  2.91it/s]

{'loss': 2.0365, 'grad_norm': 2.4266533851623535, 'learning_rate': 0.00012002620919704553, 'epoch': 0.4}


 40%|████      | 6724/16798 [30:37<53:04,  3.16it/s]

{'loss': 1.517, 'grad_norm': 2.0993199348449707, 'learning_rate': 0.0001200142959256612, 'epoch': 0.4}


 40%|████      | 6725/16798 [30:37<52:19,  3.21it/s]

{'loss': 1.5902, 'grad_norm': 2.0788393020629883, 'learning_rate': 0.00012000238265427688, 'epoch': 0.4}


 40%|████      | 6726/16798 [30:38<51:44,  3.24it/s]

{'loss': 1.5644, 'grad_norm': 1.8968796730041504, 'learning_rate': 0.00011999046938289256, 'epoch': 0.4}


 40%|████      | 6727/16798 [30:38<51:19,  3.27it/s]

{'loss': 1.7383, 'grad_norm': 1.9847503900527954, 'learning_rate': 0.00011997855611150824, 'epoch': 0.4}


 40%|████      | 6728/16798 [30:38<50:15,  3.34it/s]

{'loss': 1.0763, 'grad_norm': 1.8152260780334473, 'learning_rate': 0.00011996664284012392, 'epoch': 0.4}


 40%|████      | 6729/16798 [30:39<55:22,  3.03it/s]

{'loss': 1.3799, 'grad_norm': 1.9258337020874023, 'learning_rate': 0.00011995472956873959, 'epoch': 0.4}


 40%|████      | 6730/16798 [30:39<55:34,  3.02it/s]

{'loss': 1.1549, 'grad_norm': 1.6340152025222778, 'learning_rate': 0.00011994281629735527, 'epoch': 0.4}


 40%|████      | 6731/16798 [30:39<54:18,  3.09it/s]

{'loss': 1.4338, 'grad_norm': 1.9642307758331299, 'learning_rate': 0.00011993090302597095, 'epoch': 0.4}


 40%|████      | 6732/16798 [30:40<53:07,  3.16it/s]

{'loss': 1.129, 'grad_norm': 1.5944020748138428, 'learning_rate': 0.00011991898975458662, 'epoch': 0.4}


 40%|████      | 6733/16798 [30:40<52:36,  3.19it/s]

{'loss': 1.7992, 'grad_norm': 2.3553926944732666, 'learning_rate': 0.0001199070764832023, 'epoch': 0.4}


 40%|████      | 6734/16798 [30:40<51:46,  3.24it/s]

{'loss': 1.4902, 'grad_norm': 2.0476222038269043, 'learning_rate': 0.00011989516321181798, 'epoch': 0.4}


 40%|████      | 6735/16798 [30:41<51:56,  3.23it/s]

{'loss': 1.5334, 'grad_norm': 1.9778532981872559, 'learning_rate': 0.00011988324994043366, 'epoch': 0.4}


 40%|████      | 6736/16798 [30:41<54:23,  3.08it/s]

{'loss': 1.2074, 'grad_norm': 1.5821568965911865, 'learning_rate': 0.00011987133666904933, 'epoch': 0.4}


 40%|████      | 6737/16798 [30:41<48:49,  3.43it/s]

{'loss': 1.2486, 'grad_norm': 2.1775825023651123, 'learning_rate': 0.00011985942339766501, 'epoch': 0.4}


 40%|████      | 6738/16798 [30:41<48:38,  3.45it/s]

{'loss': 1.2662, 'grad_norm': 2.320894718170166, 'learning_rate': 0.00011984751012628069, 'epoch': 0.4}


 40%|████      | 6739/16798 [30:42<50:21,  3.33it/s]

{'loss': 1.0412, 'grad_norm': 1.8938466310501099, 'learning_rate': 0.00011983559685489636, 'epoch': 0.4}


 40%|████      | 6740/16798 [30:42<50:00,  3.35it/s]

{'loss': 1.2729, 'grad_norm': 1.9976634979248047, 'learning_rate': 0.00011982368358351204, 'epoch': 0.4}


 40%|████      | 6741/16798 [30:42<47:19,  3.54it/s]

{'loss': 1.5426, 'grad_norm': 1.9895758628845215, 'learning_rate': 0.00011981177031212772, 'epoch': 0.4}


 40%|████      | 6742/16798 [30:43<49:22,  3.39it/s]

{'loss': 1.0504, 'grad_norm': 1.4857909679412842, 'learning_rate': 0.0001197998570407434, 'epoch': 0.4}


 40%|████      | 6743/16798 [30:43<47:12,  3.55it/s]

{'loss': 1.3883, 'grad_norm': 2.036804676055908, 'learning_rate': 0.00011978794376935907, 'epoch': 0.4}


 40%|████      | 6744/16798 [30:43<47:25,  3.53it/s]

{'loss': 0.8706, 'grad_norm': 1.428605556488037, 'learning_rate': 0.00011977603049797475, 'epoch': 0.4}


 40%|████      | 6745/16798 [30:43<46:41,  3.59it/s]

{'loss': 1.074, 'grad_norm': 1.645128846168518, 'learning_rate': 0.00011976411722659043, 'epoch': 0.4}


 40%|████      | 6746/16798 [30:44<48:08,  3.48it/s]

{'loss': 1.2082, 'grad_norm': 1.7971559762954712, 'learning_rate': 0.0001197522039552061, 'epoch': 0.4}


 40%|████      | 6747/16798 [30:44<49:22,  3.39it/s]

{'loss': 0.7669, 'grad_norm': 1.3385608196258545, 'learning_rate': 0.00011974029068382178, 'epoch': 0.4}


 40%|████      | 6748/16798 [30:44<50:35,  3.31it/s]

{'loss': 0.4163, 'grad_norm': 1.071357250213623, 'learning_rate': 0.00011972837741243746, 'epoch': 0.4}


 40%|████      | 6749/16798 [30:45<51:32,  3.25it/s]

{'loss': 0.5524, 'grad_norm': 1.8768202066421509, 'learning_rate': 0.00011971646414105314, 'epoch': 0.4}


 40%|████      | 6750/16798 [30:45<52:08,  3.21it/s]

{'loss': 0.3606, 'grad_norm': 0.8990530967712402, 'learning_rate': 0.00011970455086966881, 'epoch': 0.4}


 40%|████      | 6751/16798 [30:45<51:57,  3.22it/s]

{'loss': 1.7018, 'grad_norm': 1.7054742574691772, 'learning_rate': 0.00011969263759828449, 'epoch': 0.4}


 40%|████      | 6752/16798 [30:46<51:16,  3.27it/s]

{'loss': 2.0098, 'grad_norm': 1.8738305568695068, 'learning_rate': 0.00011968072432690017, 'epoch': 0.4}


 40%|████      | 6753/16798 [30:46<52:34,  3.18it/s]

{'loss': 2.1955, 'grad_norm': 1.8161131143569946, 'learning_rate': 0.00011966881105551586, 'epoch': 0.4}


 40%|████      | 6754/16798 [30:46<52:53,  3.17it/s]

{'loss': 1.7555, 'grad_norm': 1.8937846422195435, 'learning_rate': 0.00011965689778413154, 'epoch': 0.4}


 40%|████      | 6755/16798 [30:47<54:35,  3.07it/s]

{'loss': 2.2059, 'grad_norm': 2.115947723388672, 'learning_rate': 0.00011964498451274721, 'epoch': 0.4}


 40%|████      | 6756/16798 [30:47<52:23,  3.19it/s]

{'loss': 1.9495, 'grad_norm': 1.9104936122894287, 'learning_rate': 0.00011963307124136289, 'epoch': 0.4}


 40%|████      | 6757/16798 [30:47<50:54,  3.29it/s]

{'loss': 1.647, 'grad_norm': 1.7679942846298218, 'learning_rate': 0.00011962115796997857, 'epoch': 0.4}


 40%|████      | 6758/16798 [30:47<50:50,  3.29it/s]

{'loss': 1.388, 'grad_norm': 1.5423423051834106, 'learning_rate': 0.00011960924469859425, 'epoch': 0.4}


 40%|████      | 6759/16798 [30:48<52:52,  3.16it/s]

{'loss': 2.0192, 'grad_norm': 2.1456003189086914, 'learning_rate': 0.00011959733142720992, 'epoch': 0.4}


 40%|████      | 6760/16798 [30:48<53:54,  3.10it/s]

{'loss': 1.5623, 'grad_norm': 1.796966552734375, 'learning_rate': 0.0001195854181558256, 'epoch': 0.4}


 40%|████      | 6761/16798 [30:48<53:50,  3.11it/s]

{'loss': 1.6636, 'grad_norm': 1.86783766746521, 'learning_rate': 0.00011957350488444128, 'epoch': 0.4}


 40%|████      | 6762/16798 [30:49<54:02,  3.10it/s]

{'loss': 1.2203, 'grad_norm': 1.7353512048721313, 'learning_rate': 0.00011956159161305695, 'epoch': 0.4}


 40%|████      | 6763/16798 [30:49<52:14,  3.20it/s]

{'loss': 1.5608, 'grad_norm': 1.5924873352050781, 'learning_rate': 0.00011954967834167263, 'epoch': 0.4}


 40%|████      | 6764/16798 [30:49<54:09,  3.09it/s]

{'loss': 1.5676, 'grad_norm': 1.7497090101242065, 'learning_rate': 0.00011953776507028831, 'epoch': 0.4}


 40%|████      | 6765/16798 [30:50<53:48,  3.11it/s]

{'loss': 1.6639, 'grad_norm': 1.8166807889938354, 'learning_rate': 0.00011952585179890399, 'epoch': 0.4}


 40%|████      | 6766/16798 [30:50<52:20,  3.19it/s]

{'loss': 1.6736, 'grad_norm': 2.101375102996826, 'learning_rate': 0.00011951393852751966, 'epoch': 0.4}


 40%|████      | 6767/16798 [30:50<51:26,  3.25it/s]

{'loss': 2.0058, 'grad_norm': 2.4513700008392334, 'learning_rate': 0.00011950202525613534, 'epoch': 0.4}


 40%|████      | 6768/16798 [30:51<54:36,  3.06it/s]

{'loss': 1.3838, 'grad_norm': 1.6314995288848877, 'learning_rate': 0.00011949011198475102, 'epoch': 0.4}


 40%|████      | 6769/16798 [30:51<52:40,  3.17it/s]

{'loss': 1.2565, 'grad_norm': 1.6629345417022705, 'learning_rate': 0.0001194781987133667, 'epoch': 0.4}


 40%|████      | 6770/16798 [30:51<51:44,  3.23it/s]

{'loss': 1.7482, 'grad_norm': 1.9310250282287598, 'learning_rate': 0.00011946628544198237, 'epoch': 0.4}


 40%|████      | 6771/16798 [30:52<50:45,  3.29it/s]

{'loss': 1.4214, 'grad_norm': 1.93617582321167, 'learning_rate': 0.00011945437217059805, 'epoch': 0.4}


 40%|████      | 6772/16798 [30:52<50:23,  3.32it/s]

{'loss': 1.1285, 'grad_norm': 1.5835529565811157, 'learning_rate': 0.00011944245889921373, 'epoch': 0.4}


 40%|████      | 6773/16798 [30:52<50:56,  3.28it/s]

{'loss': 1.7398, 'grad_norm': 2.2062020301818848, 'learning_rate': 0.0001194305456278294, 'epoch': 0.4}


 40%|████      | 6774/16798 [30:52<51:48,  3.22it/s]

{'loss': 1.6501, 'grad_norm': 1.9898536205291748, 'learning_rate': 0.00011941863235644508, 'epoch': 0.4}


 40%|████      | 6775/16798 [30:53<49:16,  3.39it/s]

{'loss': 1.7345, 'grad_norm': 2.2561779022216797, 'learning_rate': 0.00011940671908506076, 'epoch': 0.4}


 40%|████      | 6776/16798 [30:53<51:32,  3.24it/s]

{'loss': 1.4889, 'grad_norm': 1.8953362703323364, 'learning_rate': 0.00011939480581367644, 'epoch': 0.4}


 40%|████      | 6777/16798 [30:53<52:20,  3.19it/s]

{'loss': 1.4449, 'grad_norm': 1.8157929182052612, 'learning_rate': 0.00011938289254229211, 'epoch': 0.4}


 40%|████      | 6778/16798 [30:54<51:56,  3.21it/s]

{'loss': 1.4189, 'grad_norm': 1.7191343307495117, 'learning_rate': 0.00011937097927090779, 'epoch': 0.4}


 40%|████      | 6779/16798 [30:54<51:33,  3.24it/s]

{'loss': 1.6219, 'grad_norm': 2.057952404022217, 'learning_rate': 0.00011935906599952347, 'epoch': 0.4}


 40%|████      | 6780/16798 [30:54<51:18,  3.25it/s]

{'loss': 1.2178, 'grad_norm': 1.6584653854370117, 'learning_rate': 0.00011934715272813914, 'epoch': 0.4}


 40%|████      | 6781/16798 [30:55<53:05,  3.14it/s]

{'loss': 1.314, 'grad_norm': 2.067396640777588, 'learning_rate': 0.00011933523945675482, 'epoch': 0.4}


 40%|████      | 6782/16798 [30:55<51:22,  3.25it/s]

{'loss': 1.3773, 'grad_norm': 1.7925734519958496, 'learning_rate': 0.0001193233261853705, 'epoch': 0.4}


 40%|████      | 6783/16798 [30:55<52:05,  3.20it/s]

{'loss': 1.3919, 'grad_norm': 1.8218750953674316, 'learning_rate': 0.00011931141291398618, 'epoch': 0.4}


 40%|████      | 6784/16798 [30:56<50:54,  3.28it/s]

{'loss': 1.6123, 'grad_norm': 1.8843837976455688, 'learning_rate': 0.00011929949964260188, 'epoch': 0.4}


 40%|████      | 6785/16798 [30:56<51:02,  3.27it/s]

{'loss': 1.1785, 'grad_norm': 2.1243321895599365, 'learning_rate': 0.00011928758637121756, 'epoch': 0.4}


 40%|████      | 6786/16798 [30:56<51:53,  3.22it/s]

{'loss': 1.4977, 'grad_norm': 1.9554587602615356, 'learning_rate': 0.00011927567309983323, 'epoch': 0.4}


 40%|████      | 6787/16798 [30:57<56:13,  2.97it/s]

{'loss': 1.0233, 'grad_norm': 1.4868440628051758, 'learning_rate': 0.00011926375982844891, 'epoch': 0.4}


 40%|████      | 6788/16798 [30:57<54:43,  3.05it/s]

{'loss': 1.2969, 'grad_norm': 1.9917086362838745, 'learning_rate': 0.00011925184655706459, 'epoch': 0.4}


 40%|████      | 6789/16798 [30:57<53:52,  3.10it/s]

{'loss': 1.1548, 'grad_norm': 1.8086838722229004, 'learning_rate': 0.00011923993328568027, 'epoch': 0.4}


 40%|████      | 6790/16798 [30:57<52:01,  3.21it/s]

{'loss': 1.5362, 'grad_norm': 2.1126797199249268, 'learning_rate': 0.00011922802001429594, 'epoch': 0.4}


 40%|████      | 6791/16798 [30:58<55:24,  3.01it/s]

{'loss': 1.1889, 'grad_norm': 1.5249662399291992, 'learning_rate': 0.00011921610674291162, 'epoch': 0.4}


 40%|████      | 6792/16798 [30:58<54:37,  3.05it/s]

{'loss': 1.1943, 'grad_norm': 1.7601618766784668, 'learning_rate': 0.0001192041934715273, 'epoch': 0.4}


 40%|████      | 6793/16798 [30:59<53:47,  3.10it/s]

{'loss': 1.2134, 'grad_norm': 1.6840211153030396, 'learning_rate': 0.00011919228020014297, 'epoch': 0.4}


 40%|████      | 6794/16798 [30:59<54:02,  3.09it/s]

{'loss': 1.3265, 'grad_norm': 2.157153367996216, 'learning_rate': 0.00011918036692875865, 'epoch': 0.4}


 40%|████      | 6795/16798 [30:59<58:27,  2.85it/s]

{'loss': 0.8457, 'grad_norm': 1.8211313486099243, 'learning_rate': 0.00011916845365737433, 'epoch': 0.4}


 40%|████      | 6796/16798 [31:00<55:10,  3.02it/s]

{'loss': 1.0728, 'grad_norm': 1.5918635129928589, 'learning_rate': 0.00011915654038599, 'epoch': 0.4}


 40%|████      | 6797/16798 [31:00<58:07,  2.87it/s]

{'loss': 0.938, 'grad_norm': 1.7011979818344116, 'learning_rate': 0.00011914462711460568, 'epoch': 0.4}


 40%|████      | 6798/16798 [31:00<53:59,  3.09it/s]

{'loss': 0.7411, 'grad_norm': 1.360916256904602, 'learning_rate': 0.00011913271384322136, 'epoch': 0.4}


 40%|████      | 6799/16798 [31:01<54:20,  3.07it/s]

{'loss': 0.2791, 'grad_norm': 0.6391371488571167, 'learning_rate': 0.00011912080057183704, 'epoch': 0.4}


 40%|████      | 6800/16798 [31:01<54:07,  3.08it/s]

{'loss': 0.522, 'grad_norm': 1.095034122467041, 'learning_rate': 0.00011910888730045272, 'epoch': 0.4}


 40%|████      | 6801/16798 [31:01<57:00,  2.92it/s]

{'loss': 1.7435, 'grad_norm': 1.6786184310913086, 'learning_rate': 0.00011909697402906839, 'epoch': 0.4}


 40%|████      | 6802/16798 [31:02<54:00,  3.08it/s]

{'loss': 1.9971, 'grad_norm': 1.7750439643859863, 'learning_rate': 0.00011908506075768407, 'epoch': 0.4}


 40%|████      | 6803/16798 [31:02<55:06,  3.02it/s]

{'loss': 1.7894, 'grad_norm': 1.8001632690429688, 'learning_rate': 0.00011907314748629975, 'epoch': 0.4}


 41%|████      | 6804/16798 [31:02<51:27,  3.24it/s]

{'loss': 2.4972, 'grad_norm': 2.1291627883911133, 'learning_rate': 0.00011906123421491542, 'epoch': 0.41}


 41%|████      | 6805/16798 [31:02<52:29,  3.17it/s]

{'loss': 2.103, 'grad_norm': 1.8347134590148926, 'learning_rate': 0.0001190493209435311, 'epoch': 0.41}


 41%|████      | 6806/16798 [31:03<55:18,  3.01it/s]

{'loss': 1.9758, 'grad_norm': 2.1671676635742188, 'learning_rate': 0.00011903740767214678, 'epoch': 0.41}


 41%|████      | 6807/16798 [31:03<54:04,  3.08it/s]

{'loss': 2.1449, 'grad_norm': 2.1407341957092285, 'learning_rate': 0.00011902549440076246, 'epoch': 0.41}


 41%|████      | 6808/16798 [31:03<53:54,  3.09it/s]

{'loss': 2.0535, 'grad_norm': 1.7524806261062622, 'learning_rate': 0.00011901358112937813, 'epoch': 0.41}


 41%|████      | 6809/16798 [31:04<56:20,  2.96it/s]

{'loss': 2.0178, 'grad_norm': 2.2149574756622314, 'learning_rate': 0.00011900166785799381, 'epoch': 0.41}


 41%|████      | 6810/16798 [31:04<53:40,  3.10it/s]

{'loss': 2.0177, 'grad_norm': 2.0171573162078857, 'learning_rate': 0.00011898975458660949, 'epoch': 0.41}


 41%|████      | 6811/16798 [31:04<51:39,  3.22it/s]

{'loss': 2.1612, 'grad_norm': 1.9970730543136597, 'learning_rate': 0.00011897784131522516, 'epoch': 0.41}


 41%|████      | 6812/16798 [31:05<52:18,  3.18it/s]

{'loss': 1.9028, 'grad_norm': 1.9197770357131958, 'learning_rate': 0.00011896592804384084, 'epoch': 0.41}


 41%|████      | 6813/16798 [31:05<52:14,  3.19it/s]

{'loss': 1.8252, 'grad_norm': 1.878064751625061, 'learning_rate': 0.00011895401477245652, 'epoch': 0.41}


 41%|████      | 6814/16798 [31:05<51:33,  3.23it/s]

{'loss': 1.9541, 'grad_norm': 1.8549187183380127, 'learning_rate': 0.0001189421015010722, 'epoch': 0.41}


 41%|████      | 6815/16798 [31:06<51:50,  3.21it/s]

{'loss': 1.496, 'grad_norm': 1.5715023279190063, 'learning_rate': 0.00011893018822968789, 'epoch': 0.41}


 41%|████      | 6816/16798 [31:06<52:58,  3.14it/s]

{'loss': 1.6874, 'grad_norm': 2.157057046890259, 'learning_rate': 0.00011891827495830356, 'epoch': 0.41}


 41%|████      | 6817/16798 [31:06<54:11,  3.07it/s]

{'loss': 2.0262, 'grad_norm': 2.1741347312927246, 'learning_rate': 0.00011890636168691924, 'epoch': 0.41}


 41%|████      | 6818/16798 [31:07<53:57,  3.08it/s]

{'loss': 1.6183, 'grad_norm': 1.704681396484375, 'learning_rate': 0.00011889444841553492, 'epoch': 0.41}


 41%|████      | 6819/16798 [31:07<54:04,  3.08it/s]

{'loss': 1.5902, 'grad_norm': 1.9408133029937744, 'learning_rate': 0.0001188825351441506, 'epoch': 0.41}


 41%|████      | 6820/16798 [31:07<53:20,  3.12it/s]

{'loss': 1.7903, 'grad_norm': 2.230329990386963, 'learning_rate': 0.00011887062187276627, 'epoch': 0.41}


 41%|████      | 6821/16798 [31:08<50:45,  3.28it/s]

{'loss': 1.7665, 'grad_norm': 2.0250356197357178, 'learning_rate': 0.00011885870860138195, 'epoch': 0.41}


 41%|████      | 6822/16798 [31:08<50:52,  3.27it/s]

{'loss': 1.4311, 'grad_norm': 2.0394227504730225, 'learning_rate': 0.00011884679532999763, 'epoch': 0.41}


 41%|████      | 6823/16798 [31:08<50:03,  3.32it/s]

{'loss': 1.2582, 'grad_norm': 1.5932691097259521, 'learning_rate': 0.0001188348820586133, 'epoch': 0.41}


 41%|████      | 6824/16798 [31:08<48:04,  3.46it/s]

{'loss': 1.5655, 'grad_norm': 1.813928246498108, 'learning_rate': 0.00011882296878722898, 'epoch': 0.41}


 41%|████      | 6825/16798 [31:09<51:13,  3.24it/s]

{'loss': 1.5972, 'grad_norm': 1.780102252960205, 'learning_rate': 0.00011881105551584466, 'epoch': 0.41}


 41%|████      | 6826/16798 [31:09<52:58,  3.14it/s]

{'loss': 1.3108, 'grad_norm': 1.7014580965042114, 'learning_rate': 0.00011879914224446034, 'epoch': 0.41}


 41%|████      | 6827/16798 [31:09<53:00,  3.14it/s]

{'loss': 1.487, 'grad_norm': 2.8271355628967285, 'learning_rate': 0.00011878722897307601, 'epoch': 0.41}


 41%|████      | 6828/16798 [31:10<53:47,  3.09it/s]

{'loss': 1.9699, 'grad_norm': 2.1629419326782227, 'learning_rate': 0.00011877531570169169, 'epoch': 0.41}


 41%|████      | 6829/16798 [31:10<54:08,  3.07it/s]

{'loss': 1.5427, 'grad_norm': 1.8650201559066772, 'learning_rate': 0.00011876340243030737, 'epoch': 0.41}


 41%|████      | 6830/16798 [31:10<54:32,  3.05it/s]

{'loss': 1.4886, 'grad_norm': 1.867547631263733, 'learning_rate': 0.00011875148915892305, 'epoch': 0.41}


 41%|████      | 6831/16798 [31:11<53:33,  3.10it/s]

{'loss': 1.1451, 'grad_norm': 1.598913550376892, 'learning_rate': 0.00011873957588753872, 'epoch': 0.41}


 41%|████      | 6832/16798 [31:11<54:18,  3.06it/s]

{'loss': 1.4855, 'grad_norm': 1.8887437582015991, 'learning_rate': 0.0001187276626161544, 'epoch': 0.41}


 41%|████      | 6833/16798 [31:11<53:48,  3.09it/s]

{'loss': 1.633, 'grad_norm': 1.7217305898666382, 'learning_rate': 0.00011871574934477008, 'epoch': 0.41}


 41%|████      | 6834/16798 [31:12<53:55,  3.08it/s]

{'loss': 1.3894, 'grad_norm': 1.797093391418457, 'learning_rate': 0.00011870383607338575, 'epoch': 0.41}


 41%|████      | 6835/16798 [31:12<52:44,  3.15it/s]

{'loss': 1.0033, 'grad_norm': 1.5019340515136719, 'learning_rate': 0.00011869192280200143, 'epoch': 0.41}


 41%|████      | 6836/16798 [31:12<53:10,  3.12it/s]

{'loss': 1.1143, 'grad_norm': 1.5590795278549194, 'learning_rate': 0.00011868000953061711, 'epoch': 0.41}


 41%|████      | 6837/16798 [31:13<52:03,  3.19it/s]

{'loss': 1.5926, 'grad_norm': 2.0552761554718018, 'learning_rate': 0.00011866809625923279, 'epoch': 0.41}


 41%|████      | 6838/16798 [31:13<50:37,  3.28it/s]

{'loss': 1.1914, 'grad_norm': 1.647716760635376, 'learning_rate': 0.00011865618298784846, 'epoch': 0.41}


 41%|████      | 6839/16798 [31:13<48:43,  3.41it/s]

{'loss': 1.5037, 'grad_norm': 2.0268750190734863, 'learning_rate': 0.00011864426971646414, 'epoch': 0.41}


 41%|████      | 6840/16798 [31:14<51:04,  3.25it/s]

{'loss': 1.2736, 'grad_norm': 1.8417587280273438, 'learning_rate': 0.00011863235644507982, 'epoch': 0.41}


 41%|████      | 6841/16798 [31:14<51:32,  3.22it/s]

{'loss': 0.9927, 'grad_norm': 1.7944852113723755, 'learning_rate': 0.0001186204431736955, 'epoch': 0.41}


 41%|████      | 6842/16798 [31:14<52:42,  3.15it/s]

{'loss': 0.9858, 'grad_norm': 1.5434730052947998, 'learning_rate': 0.00011860852990231117, 'epoch': 0.41}


 41%|████      | 6843/16798 [31:14<48:08,  3.45it/s]

{'loss': 1.0713, 'grad_norm': 1.6677948236465454, 'learning_rate': 0.00011859661663092685, 'epoch': 0.41}


 41%|████      | 6844/16798 [31:15<49:20,  3.36it/s]

{'loss': 1.3197, 'grad_norm': 1.7254219055175781, 'learning_rate': 0.00011858470335954253, 'epoch': 0.41}


 41%|████      | 6845/16798 [31:15<52:26,  3.16it/s]

{'loss': 0.9424, 'grad_norm': 1.499186635017395, 'learning_rate': 0.0001185727900881582, 'epoch': 0.41}


 41%|████      | 6846/16798 [31:15<51:32,  3.22it/s]

{'loss': 0.8476, 'grad_norm': 1.5472712516784668, 'learning_rate': 0.00011856087681677391, 'epoch': 0.41}


 41%|████      | 6847/16798 [31:16<52:22,  3.17it/s]

{'loss': 0.7759, 'grad_norm': 1.7742372751235962, 'learning_rate': 0.00011854896354538958, 'epoch': 0.41}


 41%|████      | 6848/16798 [31:16<52:35,  3.15it/s]

{'loss': 1.229, 'grad_norm': 2.0779592990875244, 'learning_rate': 0.00011853705027400526, 'epoch': 0.41}


 41%|████      | 6849/16798 [31:16<51:42,  3.21it/s]

{'loss': 0.3222, 'grad_norm': 0.8007837533950806, 'learning_rate': 0.00011852513700262094, 'epoch': 0.41}


 41%|████      | 6850/16798 [31:17<51:48,  3.20it/s]

{'loss': 0.4835, 'grad_norm': 1.1095149517059326, 'learning_rate': 0.00011851322373123662, 'epoch': 0.41}


 41%|████      | 6851/16798 [31:17<52:29,  3.16it/s]

{'loss': 1.7779, 'grad_norm': 1.9466181993484497, 'learning_rate': 0.0001185013104598523, 'epoch': 0.41}


 41%|████      | 6852/16798 [31:17<51:17,  3.23it/s]

{'loss': 1.9304, 'grad_norm': 2.7990641593933105, 'learning_rate': 0.00011848939718846797, 'epoch': 0.41}


 41%|████      | 6853/16798 [31:18<54:22,  3.05it/s]

{'loss': 1.8798, 'grad_norm': 2.001082420349121, 'learning_rate': 0.00011847748391708365, 'epoch': 0.41}


 41%|████      | 6854/16798 [31:18<56:20,  2.94it/s]

{'loss': 2.2426, 'grad_norm': 2.3161559104919434, 'learning_rate': 0.00011846557064569933, 'epoch': 0.41}


 41%|████      | 6855/16798 [31:18<55:12,  3.00it/s]

{'loss': 2.2309, 'grad_norm': 1.9747085571289062, 'learning_rate': 0.000118453657374315, 'epoch': 0.41}


 41%|████      | 6856/16798 [31:19<54:21,  3.05it/s]

{'loss': 2.1746, 'grad_norm': 2.121824026107788, 'learning_rate': 0.00011844174410293068, 'epoch': 0.41}


 41%|████      | 6857/16798 [31:19<53:42,  3.08it/s]

{'loss': 1.1774, 'grad_norm': 1.7741341590881348, 'learning_rate': 0.00011842983083154636, 'epoch': 0.41}


 41%|████      | 6858/16798 [31:19<53:28,  3.10it/s]

{'loss': 2.0218, 'grad_norm': 2.0610458850860596, 'learning_rate': 0.00011841791756016203, 'epoch': 0.41}


 41%|████      | 6859/16798 [31:20<53:06,  3.12it/s]

{'loss': 1.9063, 'grad_norm': 2.0433194637298584, 'learning_rate': 0.00011840600428877771, 'epoch': 0.41}


 41%|████      | 6860/16798 [31:20<54:44,  3.03it/s]

{'loss': 1.5714, 'grad_norm': 2.0065062046051025, 'learning_rate': 0.00011839409101739339, 'epoch': 0.41}


 41%|████      | 6861/16798 [31:20<54:05,  3.06it/s]

{'loss': 1.6802, 'grad_norm': 1.9914360046386719, 'learning_rate': 0.00011838217774600907, 'epoch': 0.41}


 41%|████      | 6862/16798 [31:21<54:52,  3.02it/s]

{'loss': 1.6543, 'grad_norm': 1.8413877487182617, 'learning_rate': 0.00011837026447462474, 'epoch': 0.41}


 41%|████      | 6863/16798 [31:21<55:25,  2.99it/s]

{'loss': 2.142, 'grad_norm': 2.68684983253479, 'learning_rate': 0.00011835835120324042, 'epoch': 0.41}


 41%|████      | 6864/16798 [31:21<55:10,  3.00it/s]

{'loss': 1.3092, 'grad_norm': 1.6744989156723022, 'learning_rate': 0.0001183464379318561, 'epoch': 0.41}


 41%|████      | 6865/16798 [31:22<54:43,  3.03it/s]

{'loss': 1.5513, 'grad_norm': 1.977303385734558, 'learning_rate': 0.00011833452466047177, 'epoch': 0.41}


 41%|████      | 6866/16798 [31:22<53:50,  3.07it/s]

{'loss': 1.5117, 'grad_norm': 1.875267744064331, 'learning_rate': 0.00011832261138908745, 'epoch': 0.41}


 41%|████      | 6867/16798 [31:22<51:33,  3.21it/s]

{'loss': 1.5729, 'grad_norm': 1.9736095666885376, 'learning_rate': 0.00011831069811770313, 'epoch': 0.41}


 41%|████      | 6868/16798 [31:22<50:36,  3.27it/s]

{'loss': 1.5778, 'grad_norm': 1.8429127931594849, 'learning_rate': 0.0001182987848463188, 'epoch': 0.41}


 41%|████      | 6869/16798 [31:23<51:07,  3.24it/s]

{'loss': 1.7418, 'grad_norm': 1.9323351383209229, 'learning_rate': 0.00011828687157493448, 'epoch': 0.41}


 41%|████      | 6870/16798 [31:23<53:09,  3.11it/s]

{'loss': 1.8988, 'grad_norm': 2.075565814971924, 'learning_rate': 0.00011827495830355016, 'epoch': 0.41}


 41%|████      | 6871/16798 [31:23<50:45,  3.26it/s]

{'loss': 1.2447, 'grad_norm': 1.558983564376831, 'learning_rate': 0.00011826304503216584, 'epoch': 0.41}


 41%|████      | 6872/16798 [31:24<51:09,  3.23it/s]

{'loss': 1.0073, 'grad_norm': 1.5764507055282593, 'learning_rate': 0.0001182511317607815, 'epoch': 0.41}


 41%|████      | 6873/16798 [31:24<49:01,  3.37it/s]

{'loss': 1.4104, 'grad_norm': 1.9083161354064941, 'learning_rate': 0.00011823921848939718, 'epoch': 0.41}


 41%|████      | 6874/16798 [31:24<51:30,  3.21it/s]

{'loss': 1.9219, 'grad_norm': 2.2381742000579834, 'learning_rate': 0.00011822730521801286, 'epoch': 0.41}


 41%|████      | 6875/16798 [31:25<50:23,  3.28it/s]

{'loss': 1.3098, 'grad_norm': 1.8645309209823608, 'learning_rate': 0.00011821539194662853, 'epoch': 0.41}


 41%|████      | 6876/16798 [31:25<52:17,  3.16it/s]

{'loss': 1.4285, 'grad_norm': 2.135383367538452, 'learning_rate': 0.00011820347867524421, 'epoch': 0.41}


 41%|████      | 6877/16798 [31:25<51:53,  3.19it/s]

{'loss': 1.0965, 'grad_norm': 2.035205841064453, 'learning_rate': 0.00011819156540385991, 'epoch': 0.41}


 41%|████      | 6878/16798 [31:26<52:20,  3.16it/s]

{'loss': 1.5189, 'grad_norm': 2.3818883895874023, 'learning_rate': 0.00011817965213247559, 'epoch': 0.41}


 41%|████      | 6879/16798 [31:26<52:03,  3.18it/s]

{'loss': 1.2918, 'grad_norm': 1.916741132736206, 'learning_rate': 0.00011816773886109127, 'epoch': 0.41}


 41%|████      | 6880/16798 [31:26<53:11,  3.11it/s]

{'loss': 1.4251, 'grad_norm': 2.0347442626953125, 'learning_rate': 0.00011815582558970695, 'epoch': 0.41}


 41%|████      | 6881/16798 [31:27<52:43,  3.13it/s]

{'loss': 1.3705, 'grad_norm': 2.0265543460845947, 'learning_rate': 0.00011814391231832262, 'epoch': 0.41}


 41%|████      | 6882/16798 [31:27<50:39,  3.26it/s]

{'loss': 1.3153, 'grad_norm': 2.0385401248931885, 'learning_rate': 0.0001181319990469383, 'epoch': 0.41}


 41%|████      | 6883/16798 [31:27<50:19,  3.28it/s]

{'loss': 1.2428, 'grad_norm': 1.665102481842041, 'learning_rate': 0.00011812008577555398, 'epoch': 0.41}


 41%|████      | 6884/16798 [31:27<50:41,  3.26it/s]

{'loss': 1.2716, 'grad_norm': 1.9480465650558472, 'learning_rate': 0.00011810817250416966, 'epoch': 0.41}


 41%|████      | 6885/16798 [31:28<51:54,  3.18it/s]

{'loss': 1.6048, 'grad_norm': 2.3417532444000244, 'learning_rate': 0.00011809625923278533, 'epoch': 0.41}


 41%|████      | 6886/16798 [31:28<53:57,  3.06it/s]

{'loss': 1.4799, 'grad_norm': 1.8947248458862305, 'learning_rate': 0.00011808434596140101, 'epoch': 0.41}


 41%|████      | 6887/16798 [31:28<54:14,  3.05it/s]

{'loss': 0.8752, 'grad_norm': 1.519255518913269, 'learning_rate': 0.00011807243269001669, 'epoch': 0.41}


 41%|████      | 6888/16798 [31:29<53:00,  3.12it/s]

{'loss': 1.213, 'grad_norm': 1.5755765438079834, 'learning_rate': 0.00011806051941863236, 'epoch': 0.41}


 41%|████      | 6889/16798 [31:29<51:58,  3.18it/s]

{'loss': 1.1247, 'grad_norm': 2.011829137802124, 'learning_rate': 0.00011804860614724804, 'epoch': 0.41}


 41%|████      | 6890/16798 [31:29<50:42,  3.26it/s]

{'loss': 1.1549, 'grad_norm': 1.8501235246658325, 'learning_rate': 0.00011803669287586372, 'epoch': 0.41}


 41%|████      | 6891/16798 [31:30<48:53,  3.38it/s]

{'loss': 0.883, 'grad_norm': 1.9144692420959473, 'learning_rate': 0.0001180247796044794, 'epoch': 0.41}


 41%|████      | 6892/16798 [31:30<53:18,  3.10it/s]

{'loss': 1.0352, 'grad_norm': 2.0746500492095947, 'learning_rate': 0.00011801286633309507, 'epoch': 0.41}


 41%|████      | 6893/16798 [31:30<55:22,  2.98it/s]

{'loss': 1.1072, 'grad_norm': 1.6985013484954834, 'learning_rate': 0.00011800095306171075, 'epoch': 0.41}


 41%|████      | 6894/16798 [31:31<56:45,  2.91it/s]

{'loss': 0.6412, 'grad_norm': 1.431136131286621, 'learning_rate': 0.00011798903979032643, 'epoch': 0.41}


 41%|████      | 6895/16798 [31:31<53:54,  3.06it/s]

{'loss': 0.5633, 'grad_norm': 1.175053358078003, 'learning_rate': 0.0001179771265189421, 'epoch': 0.41}


 41%|████      | 6896/16798 [31:31<56:00,  2.95it/s]

{'loss': 0.4067, 'grad_norm': 0.8940317034721375, 'learning_rate': 0.00011796521324755778, 'epoch': 0.41}


 41%|████      | 6897/16798 [31:32<55:01,  3.00it/s]

{'loss': 0.4774, 'grad_norm': 1.0783475637435913, 'learning_rate': 0.00011795329997617346, 'epoch': 0.41}


 41%|████      | 6898/16798 [31:32<52:28,  3.14it/s]

{'loss': 0.6314, 'grad_norm': 1.368007779121399, 'learning_rate': 0.00011794138670478914, 'epoch': 0.41}


 41%|████      | 6899/16798 [31:32<51:04,  3.23it/s]

{'loss': 0.4113, 'grad_norm': 0.9313517212867737, 'learning_rate': 0.00011792947343340481, 'epoch': 0.41}


 41%|████      | 6900/16798 [31:33<51:22,  3.21it/s]

{'loss': 0.1735, 'grad_norm': 0.6331894993782043, 'learning_rate': 0.00011791756016202049, 'epoch': 0.41}


 41%|████      | 6901/16798 [31:33<52:46,  3.13it/s]

{'loss': 1.7375, 'grad_norm': 1.8789135217666626, 'learning_rate': 0.00011790564689063617, 'epoch': 0.41}


 41%|████      | 6902/16798 [31:33<51:52,  3.18it/s]

{'loss': 1.8974, 'grad_norm': 1.9300546646118164, 'learning_rate': 0.00011789373361925185, 'epoch': 0.41}


 41%|████      | 6903/16798 [31:34<54:10,  3.04it/s]

{'loss': 1.6612, 'grad_norm': 2.121901035308838, 'learning_rate': 0.00011788182034786752, 'epoch': 0.41}


 41%|████      | 6904/16798 [31:34<52:16,  3.15it/s]

{'loss': 2.2686, 'grad_norm': 2.0542867183685303, 'learning_rate': 0.0001178699070764832, 'epoch': 0.41}


 41%|████      | 6905/16798 [31:34<51:14,  3.22it/s]

{'loss': 2.1444, 'grad_norm': 2.0387258529663086, 'learning_rate': 0.00011785799380509888, 'epoch': 0.41}


 41%|████      | 6906/16798 [31:35<52:29,  3.14it/s]

{'loss': 1.4449, 'grad_norm': 1.765221357345581, 'learning_rate': 0.00011784608053371455, 'epoch': 0.41}


 41%|████      | 6907/16798 [31:35<50:55,  3.24it/s]

{'loss': 2.0708, 'grad_norm': 1.9464495182037354, 'learning_rate': 0.00011783416726233023, 'epoch': 0.41}


 41%|████      | 6908/16798 [31:35<51:52,  3.18it/s]

{'loss': 2.3829, 'grad_norm': 2.189526319503784, 'learning_rate': 0.00011782225399094594, 'epoch': 0.41}


 41%|████      | 6909/16798 [31:35<53:37,  3.07it/s]

{'loss': 1.8975, 'grad_norm': 1.9954147338867188, 'learning_rate': 0.00011781034071956161, 'epoch': 0.41}


 41%|████      | 6910/16798 [31:36<51:44,  3.19it/s]

{'loss': 1.6075, 'grad_norm': 2.031090259552002, 'learning_rate': 0.00011779842744817729, 'epoch': 0.41}


 41%|████      | 6911/16798 [31:36<52:17,  3.15it/s]

{'loss': 2.0256, 'grad_norm': 2.2326390743255615, 'learning_rate': 0.00011778651417679297, 'epoch': 0.41}


 41%|████      | 6912/16798 [31:36<52:13,  3.16it/s]

{'loss': 1.9045, 'grad_norm': 2.237837314605713, 'learning_rate': 0.00011777460090540864, 'epoch': 0.41}


 41%|████      | 6913/16798 [31:37<51:53,  3.17it/s]

{'loss': 1.5549, 'grad_norm': 1.920335054397583, 'learning_rate': 0.00011776268763402432, 'epoch': 0.41}


 41%|████      | 6914/16798 [31:37<51:39,  3.19it/s]

{'loss': 1.5952, 'grad_norm': 1.9198155403137207, 'learning_rate': 0.00011775077436264, 'epoch': 0.41}


 41%|████      | 6915/16798 [31:37<52:09,  3.16it/s]

{'loss': 1.3567, 'grad_norm': 1.9054096937179565, 'learning_rate': 0.00011773886109125568, 'epoch': 0.41}


 41%|████      | 6916/16798 [31:38<54:48,  3.00it/s]

{'loss': 1.041, 'grad_norm': 2.265369415283203, 'learning_rate': 0.00011772694781987135, 'epoch': 0.41}


 41%|████      | 6917/16798 [31:38<54:02,  3.05it/s]

{'loss': 1.3779, 'grad_norm': 1.7604038715362549, 'learning_rate': 0.00011771503454848703, 'epoch': 0.41}


 41%|████      | 6918/16798 [31:38<54:35,  3.02it/s]

{'loss': 1.898, 'grad_norm': 2.8098838329315186, 'learning_rate': 0.00011770312127710271, 'epoch': 0.41}


 41%|████      | 6919/16798 [31:39<54:06,  3.04it/s]

{'loss': 1.4316, 'grad_norm': 1.6774866580963135, 'learning_rate': 0.00011769120800571838, 'epoch': 0.41}


 41%|████      | 6920/16798 [31:39<54:42,  3.01it/s]

{'loss': 1.7297, 'grad_norm': 2.0178885459899902, 'learning_rate': 0.00011767929473433406, 'epoch': 0.41}


 41%|████      | 6921/16798 [31:39<54:16,  3.03it/s]

{'loss': 1.5914, 'grad_norm': 2.1543450355529785, 'learning_rate': 0.00011766738146294974, 'epoch': 0.41}


 41%|████      | 6922/16798 [31:40<52:53,  3.11it/s]

{'loss': 1.429, 'grad_norm': 1.9278324842453003, 'learning_rate': 0.00011765546819156542, 'epoch': 0.41}


 41%|████      | 6923/16798 [31:40<51:54,  3.17it/s]

{'loss': 1.7632, 'grad_norm': 1.8378297090530396, 'learning_rate': 0.0001176435549201811, 'epoch': 0.41}


 41%|████      | 6924/16798 [31:40<51:13,  3.21it/s]

{'loss': 1.9883, 'grad_norm': 2.90803861618042, 'learning_rate': 0.00011763164164879677, 'epoch': 0.41}


 41%|████      | 6925/16798 [31:41<53:54,  3.05it/s]

{'loss': 1.5897, 'grad_norm': 1.9747930765151978, 'learning_rate': 0.00011761972837741245, 'epoch': 0.41}


 41%|████      | 6926/16798 [31:41<54:17,  3.03it/s]

{'loss': 1.6138, 'grad_norm': 2.061870574951172, 'learning_rate': 0.00011760781510602813, 'epoch': 0.41}


 41%|████      | 6927/16798 [31:41<53:58,  3.05it/s]

{'loss': 1.2889, 'grad_norm': 1.8344522714614868, 'learning_rate': 0.0001175959018346438, 'epoch': 0.41}


 41%|████      | 6928/16798 [31:42<53:31,  3.07it/s]

{'loss': 1.0622, 'grad_norm': 1.524566888809204, 'learning_rate': 0.00011758398856325948, 'epoch': 0.41}


 41%|████      | 6929/16798 [31:42<53:18,  3.09it/s]

{'loss': 1.4838, 'grad_norm': 1.9812395572662354, 'learning_rate': 0.00011757207529187514, 'epoch': 0.41}


 41%|████▏     | 6930/16798 [31:42<53:36,  3.07it/s]

{'loss': 1.3346, 'grad_norm': 1.8802330493927002, 'learning_rate': 0.00011756016202049082, 'epoch': 0.41}


 41%|████▏     | 6931/16798 [31:43<53:25,  3.08it/s]

{'loss': 1.0689, 'grad_norm': 2.122957229614258, 'learning_rate': 0.0001175482487491065, 'epoch': 0.41}


 41%|████▏     | 6932/16798 [31:43<53:20,  3.08it/s]

{'loss': 1.5106, 'grad_norm': 2.1623311042785645, 'learning_rate': 0.00011753633547772217, 'epoch': 0.41}


 41%|████▏     | 6933/16798 [31:43<54:16,  3.03it/s]

{'loss': 1.2672, 'grad_norm': 1.6925603151321411, 'learning_rate': 0.00011752442220633785, 'epoch': 0.41}


 41%|████▏     | 6934/16798 [31:44<53:36,  3.07it/s]

{'loss': 1.4231, 'grad_norm': 2.006499767303467, 'learning_rate': 0.00011751250893495353, 'epoch': 0.41}


 41%|████▏     | 6935/16798 [31:44<53:34,  3.07it/s]

{'loss': 1.4192, 'grad_norm': 1.8978935480117798, 'learning_rate': 0.0001175005956635692, 'epoch': 0.41}


 41%|████▏     | 6936/16798 [31:44<53:59,  3.04it/s]

{'loss': 1.2772, 'grad_norm': 1.707930326461792, 'learning_rate': 0.00011748868239218488, 'epoch': 0.41}


 41%|████▏     | 6937/16798 [31:45<53:24,  3.08it/s]

{'loss': 1.5517, 'grad_norm': 1.8204911947250366, 'learning_rate': 0.00011747676912080056, 'epoch': 0.41}


 41%|████▏     | 6938/16798 [31:45<54:35,  3.01it/s]

{'loss': 1.391, 'grad_norm': 1.8830510377883911, 'learning_rate': 0.00011746485584941624, 'epoch': 0.41}


 41%|████▏     | 6939/16798 [31:45<53:39,  3.06it/s]

{'loss': 1.4912, 'grad_norm': 2.2072229385375977, 'learning_rate': 0.00011745294257803194, 'epoch': 0.41}


 41%|████▏     | 6940/16798 [31:46<54:39,  3.01it/s]

{'loss': 1.0245, 'grad_norm': 1.4788970947265625, 'learning_rate': 0.00011744102930664762, 'epoch': 0.41}


 41%|████▏     | 6941/16798 [31:46<54:00,  3.04it/s]

{'loss': 1.423, 'grad_norm': 2.0504934787750244, 'learning_rate': 0.0001174291160352633, 'epoch': 0.41}


 41%|████▏     | 6942/16798 [31:46<53:31,  3.07it/s]

{'loss': 1.1321, 'grad_norm': 1.6821684837341309, 'learning_rate': 0.00011741720276387897, 'epoch': 0.41}


 41%|████▏     | 6943/16798 [31:47<52:52,  3.11it/s]

{'loss': 1.0024, 'grad_norm': 2.102306842803955, 'learning_rate': 0.00011740528949249465, 'epoch': 0.41}


 41%|████▏     | 6944/16798 [31:47<53:36,  3.06it/s]

{'loss': 1.4455, 'grad_norm': 2.073687791824341, 'learning_rate': 0.00011739337622111033, 'epoch': 0.41}


 41%|████▏     | 6945/16798 [31:47<53:25,  3.07it/s]

{'loss': 1.0294, 'grad_norm': 1.5640923976898193, 'learning_rate': 0.000117381462949726, 'epoch': 0.41}


 41%|████▏     | 6946/16798 [31:47<51:41,  3.18it/s]

{'loss': 1.3368, 'grad_norm': 2.136969804763794, 'learning_rate': 0.00011736954967834168, 'epoch': 0.41}


 41%|████▏     | 6947/16798 [31:48<50:33,  3.25it/s]

{'loss': 0.8658, 'grad_norm': 1.5981674194335938, 'learning_rate': 0.00011735763640695736, 'epoch': 0.41}


 41%|████▏     | 6948/16798 [31:48<50:21,  3.26it/s]

{'loss': 0.9112, 'grad_norm': 1.6742243766784668, 'learning_rate': 0.00011734572313557304, 'epoch': 0.41}


 41%|████▏     | 6949/16798 [31:48<50:42,  3.24it/s]

{'loss': 0.4113, 'grad_norm': 0.8961469531059265, 'learning_rate': 0.00011733380986418871, 'epoch': 0.41}


 41%|████▏     | 6950/16798 [31:49<51:59,  3.16it/s]

{'loss': 0.8032, 'grad_norm': 1.3837629556655884, 'learning_rate': 0.00011732189659280439, 'epoch': 0.41}


 41%|████▏     | 6951/16798 [31:49<52:28,  3.13it/s]

{'loss': 1.6649, 'grad_norm': 2.212944269180298, 'learning_rate': 0.00011730998332142007, 'epoch': 0.41}


 41%|████▏     | 6952/16798 [31:49<50:40,  3.24it/s]

{'loss': 1.7078, 'grad_norm': 2.538102626800537, 'learning_rate': 0.00011729807005003575, 'epoch': 0.41}


 41%|████▏     | 6953/16798 [31:50<52:16,  3.14it/s]

{'loss': 2.1567, 'grad_norm': 1.835917353630066, 'learning_rate': 0.00011728615677865142, 'epoch': 0.41}


 41%|████▏     | 6954/16798 [31:50<52:40,  3.11it/s]

{'loss': 1.6906, 'grad_norm': 1.6696217060089111, 'learning_rate': 0.0001172742435072671, 'epoch': 0.41}


 41%|████▏     | 6955/16798 [31:50<54:06,  3.03it/s]

{'loss': 2.0657, 'grad_norm': 2.05678129196167, 'learning_rate': 0.00011726233023588278, 'epoch': 0.41}


 41%|████▏     | 6956/16798 [31:51<56:46,  2.89it/s]

{'loss': 1.9036, 'grad_norm': 1.6892913579940796, 'learning_rate': 0.00011725041696449846, 'epoch': 0.41}


 41%|████▏     | 6957/16798 [31:51<55:17,  2.97it/s]

{'loss': 1.9113, 'grad_norm': 1.8147372007369995, 'learning_rate': 0.00011723850369311413, 'epoch': 0.41}


 41%|████▏     | 6958/16798 [31:51<52:52,  3.10it/s]

{'loss': 1.9302, 'grad_norm': 1.8953158855438232, 'learning_rate': 0.00011722659042172981, 'epoch': 0.41}


 41%|████▏     | 6959/16798 [31:52<52:26,  3.13it/s]

{'loss': 2.7021, 'grad_norm': 2.430384397506714, 'learning_rate': 0.00011721467715034549, 'epoch': 0.41}


 41%|████▏     | 6960/16798 [31:52<52:30,  3.12it/s]

{'loss': 2.0971, 'grad_norm': 2.083730697631836, 'learning_rate': 0.00011720276387896116, 'epoch': 0.41}


 41%|████▏     | 6961/16798 [31:52<51:25,  3.19it/s]

{'loss': 1.0281, 'grad_norm': 2.0315983295440674, 'learning_rate': 0.00011719085060757684, 'epoch': 0.41}


 41%|████▏     | 6962/16798 [31:53<49:41,  3.30it/s]

{'loss': 1.9552, 'grad_norm': 2.1130449771881104, 'learning_rate': 0.00011717893733619252, 'epoch': 0.41}


 41%|████▏     | 6963/16798 [31:53<53:10,  3.08it/s]

{'loss': 1.3985, 'grad_norm': 1.7497445344924927, 'learning_rate': 0.0001171670240648082, 'epoch': 0.41}


 41%|████▏     | 6964/16798 [31:53<52:30,  3.12it/s]

{'loss': 1.6892, 'grad_norm': 1.953144907951355, 'learning_rate': 0.00011715511079342387, 'epoch': 0.41}


 41%|████▏     | 6965/16798 [31:54<53:06,  3.09it/s]

{'loss': 1.7123, 'grad_norm': 2.1924521923065186, 'learning_rate': 0.00011714319752203955, 'epoch': 0.41}


 41%|████▏     | 6966/16798 [31:54<53:12,  3.08it/s]

{'loss': 1.711, 'grad_norm': 2.0780110359191895, 'learning_rate': 0.00011713128425065523, 'epoch': 0.41}


 41%|████▏     | 6967/16798 [31:54<53:40,  3.05it/s]

{'loss': 1.1372, 'grad_norm': 1.7932097911834717, 'learning_rate': 0.0001171193709792709, 'epoch': 0.41}


 41%|████▏     | 6968/16798 [31:55<53:24,  3.07it/s]

{'loss': 1.552, 'grad_norm': 1.8071047067642212, 'learning_rate': 0.00011710745770788658, 'epoch': 0.41}


 41%|████▏     | 6969/16798 [31:55<52:51,  3.10it/s]

{'loss': 1.3408, 'grad_norm': 1.8022069931030273, 'learning_rate': 0.00011709554443650229, 'epoch': 0.41}


 41%|████▏     | 6970/16798 [31:55<54:10,  3.02it/s]

{'loss': 1.5361, 'grad_norm': 1.8817877769470215, 'learning_rate': 0.00011708363116511796, 'epoch': 0.41}


 41%|████▏     | 6971/16798 [31:56<55:00,  2.98it/s]

{'loss': 1.683, 'grad_norm': 2.129885673522949, 'learning_rate': 0.00011707171789373364, 'epoch': 0.41}


 42%|████▏     | 6972/16798 [31:56<55:38,  2.94it/s]

{'loss': 1.8025, 'grad_norm': 1.9599844217300415, 'learning_rate': 0.00011705980462234932, 'epoch': 0.42}


 42%|████▏     | 6973/16798 [31:56<54:09,  3.02it/s]

{'loss': 2.0753, 'grad_norm': 2.6505916118621826, 'learning_rate': 0.000117047891350965, 'epoch': 0.42}


 42%|████▏     | 6974/16798 [31:57<53:27,  3.06it/s]

{'loss': 1.7428, 'grad_norm': 2.165940761566162, 'learning_rate': 0.00011703597807958067, 'epoch': 0.42}


 42%|████▏     | 6975/16798 [31:57<52:05,  3.14it/s]

{'loss': 1.7358, 'grad_norm': 2.312753438949585, 'learning_rate': 0.00011702406480819635, 'epoch': 0.42}


 42%|████▏     | 6976/16798 [31:57<51:26,  3.18it/s]

{'loss': 1.4519, 'grad_norm': 1.7479480504989624, 'learning_rate': 0.00011701215153681203, 'epoch': 0.42}


 42%|████▏     | 6977/16798 [31:57<52:27,  3.12it/s]

{'loss': 1.1907, 'grad_norm': 1.7720316648483276, 'learning_rate': 0.0001170002382654277, 'epoch': 0.42}


 42%|████▏     | 6978/16798 [31:58<53:46,  3.04it/s]

{'loss': 1.7618, 'grad_norm': 1.991594910621643, 'learning_rate': 0.00011698832499404338, 'epoch': 0.42}


 42%|████▏     | 6979/16798 [31:58<52:58,  3.09it/s]

{'loss': 1.3737, 'grad_norm': 2.106309652328491, 'learning_rate': 0.00011697641172265906, 'epoch': 0.42}


 42%|████▏     | 6980/16798 [31:58<53:12,  3.08it/s]

{'loss': 1.7724, 'grad_norm': 1.866469383239746, 'learning_rate': 0.00011696449845127474, 'epoch': 0.42}


 42%|████▏     | 6981/16798 [31:59<52:00,  3.15it/s]

{'loss': 1.6219, 'grad_norm': 2.029325485229492, 'learning_rate': 0.00011695258517989041, 'epoch': 0.42}


 42%|████▏     | 6982/16798 [31:59<52:05,  3.14it/s]

{'loss': 1.1564, 'grad_norm': 1.6918658018112183, 'learning_rate': 0.00011694067190850609, 'epoch': 0.42}


 42%|████▏     | 6983/16798 [31:59<52:24,  3.12it/s]

{'loss': 1.7964, 'grad_norm': 1.8726863861083984, 'learning_rate': 0.00011692875863712177, 'epoch': 0.42}


 42%|████▏     | 6984/16798 [32:00<52:00,  3.14it/s]

{'loss': 1.6935, 'grad_norm': 2.066234588623047, 'learning_rate': 0.00011691684536573744, 'epoch': 0.42}


 42%|████▏     | 6985/16798 [32:00<50:09,  3.26it/s]

{'loss': 1.5342, 'grad_norm': 2.9778997898101807, 'learning_rate': 0.00011690493209435312, 'epoch': 0.42}


 42%|████▏     | 6986/16798 [32:00<50:28,  3.24it/s]

{'loss': 1.1877, 'grad_norm': 1.5816892385482788, 'learning_rate': 0.00011689301882296878, 'epoch': 0.42}


 42%|████▏     | 6987/16798 [32:01<54:58,  2.97it/s]

{'loss': 1.7821, 'grad_norm': 2.255497932434082, 'learning_rate': 0.00011688110555158446, 'epoch': 0.42}


 42%|████▏     | 6988/16798 [32:01<52:50,  3.09it/s]

{'loss': 1.3561, 'grad_norm': 1.8259528875350952, 'learning_rate': 0.00011686919228020014, 'epoch': 0.42}


 42%|████▏     | 6989/16798 [32:01<49:34,  3.30it/s]

{'loss': 1.3773, 'grad_norm': 2.3608596324920654, 'learning_rate': 0.00011685727900881582, 'epoch': 0.42}


 42%|████▏     | 6990/16798 [32:02<50:23,  3.24it/s]

{'loss': 1.8684, 'grad_norm': 2.0854575634002686, 'learning_rate': 0.0001168453657374315, 'epoch': 0.42}


 42%|████▏     | 6991/16798 [32:02<48:49,  3.35it/s]

{'loss': 1.2306, 'grad_norm': 1.6379776000976562, 'learning_rate': 0.00011683345246604717, 'epoch': 0.42}


 42%|████▏     | 6992/16798 [32:02<52:07,  3.14it/s]

{'loss': 1.0378, 'grad_norm': 1.6741527318954468, 'learning_rate': 0.00011682153919466285, 'epoch': 0.42}


 42%|████▏     | 6993/16798 [32:03<51:15,  3.19it/s]

{'loss': 1.0897, 'grad_norm': 1.649953842163086, 'learning_rate': 0.00011680962592327853, 'epoch': 0.42}


 42%|████▏     | 6994/16798 [32:03<51:40,  3.16it/s]

{'loss': 0.9317, 'grad_norm': 1.7967112064361572, 'learning_rate': 0.0001167977126518942, 'epoch': 0.42}


 42%|████▏     | 6995/16798 [32:03<52:46,  3.10it/s]

{'loss': 0.8598, 'grad_norm': 1.4937198162078857, 'learning_rate': 0.00011678579938050988, 'epoch': 0.42}


 42%|████▏     | 6996/16798 [32:04<53:36,  3.05it/s]

{'loss': 0.4412, 'grad_norm': 0.9724992513656616, 'learning_rate': 0.00011677388610912556, 'epoch': 0.42}


 42%|████▏     | 6997/16798 [32:04<51:09,  3.19it/s]

{'loss': 0.52, 'grad_norm': 1.2422634363174438, 'learning_rate': 0.00011676197283774123, 'epoch': 0.42}


 42%|████▏     | 6998/16798 [32:04<51:35,  3.17it/s]

{'loss': 0.2894, 'grad_norm': 0.7569541931152344, 'learning_rate': 0.00011675005956635691, 'epoch': 0.42}


 42%|████▏     | 6999/16798 [32:04<49:27,  3.30it/s]

{'loss': 0.2828, 'grad_norm': 0.7802401781082153, 'learning_rate': 0.00011673814629497259, 'epoch': 0.42}




{'loss': 0.5326, 'grad_norm': 1.1437932252883911, 'learning_rate': 0.00011672623302358829, 'epoch': 0.42}
