In [3]:
from pathlib import Path

def read_imdb_split(split_dir):
    """Read the review texts and sentiment labels of one dataset split.

    Args:
        split_dir: Path (``str`` or ``Path``) of a directory that contains a
            ``pos`` and a ``neg`` subdirectory, each holding one UTF-8 text
            file per review.

    Returns:
        A ``(texts, labels)`` tuple of two equally long lists. ``texts[i]`` is
        the content of a review file and ``labels[i]`` is ``1`` for files read
        from ``pos`` and ``0`` for files read from ``neg``. All ``pos`` reviews
        come first, and files within a subdirectory are ordered by path.
    """
    split_dir = Path(split_dir)
    texts = []
    labels = []
    for label_dir in ["pos", "neg"]:
        # Compare by value: `is` tests object identity, which only happens to
        # work for interned string literals and triggers a SyntaxWarning.
        label = 0 if label_dir == "neg" else 1
        # iterdir() yields entries in arbitrary, filesystem-dependent order;
        # sorting makes the returned sample order reproducible across runs.
        for text_file in sorted((split_dir/label_dir).iterdir()):
            texts.append(text_file.read_text(encoding='utf-8'))
            labels.append(label)

    return texts, labels

# Load the labeled train and test splits. The paths are relative, so the
# 'aclImdb' directory must sit in the notebook's working directory.
train_texts, train_labels = read_imdb_split('aclImdb/train')
test_texts, test_labels = read_imdb_split('aclImdb/test')

  labels.append(0 if label_dir is "neg" else 1)


In [4]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the training reviews as a validation set. A fixed
# random_state makes the shuffle (and therefore the split) reproducible.
# NOTE: this rebinds train_texts/train_labels, so re-running this cell without
# first re-running the loading cell splits the already-reduced training set again.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=.2, random_state=42
)

In [5]:
!pip install transformers



In [5]:
from transformers import DistilBertTokenizerFast
# Load the fast tokenizer for the 'distilbert-base-uncased' checkpoint, the
# same checkpoint name the classification model is loaded from further down.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [6]:
# Tokenize each split in a single batched call with truncation and padding
# enabled, so the encoded reviews of a split can be indexed as equal-length rows.
# NOTE(review): the truncation/padding lengths are left at the tokenizer's
# defaults — confirm against the tokenizer documentation.
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [7]:
import torch

class IMDbDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    ``encodings`` must expose ``.items()`` yielding ``(name, column)`` pairs
    where ``column[idx]`` is the encoded value of sample ``idx``; ``labels``
    is an indexable sequence with one label per sample.
    """

    def __init__(self, encodings, labels):
        """Store the encodings and labels without copying them."""
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        The dict holds one tensor per encoding field plus a ``'labels'``
        entry containing the sample's label.
        """
        sample = {}
        for name, column in self.encodings.items():
            sample[name] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels in an IMDbDataset.
train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)

In [8]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

# Hyperparameters and output locations for the fine-tuning run.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,                # emit a training-metrics log entry every 10 steps
)

# Load DistilBERT with a sequence-classification head from the same checkpoint
# name as the tokenizer. The library reports the classifier weights as newly
# initialized, so the model needs this fine-tuning before it is usable.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

# NOTE(review): no evaluation schedule is set in training_args and this cell
# never calls trainer.evaluate(), so val_dataset is only registered here —
# confirm whether evaluation during training was intended.
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

# Run the fine-tuning loop.
trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  0%|          | 10/3750 [00:07<21:49,  2.86it/s] 

{'loss': 0.6972, 'grad_norm': 1.5717148780822754, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.01}


  1%|          | 20/3750 [00:09<17:26,  3.56it/s]

{'loss': 0.7009, 'grad_norm': 1.2666778564453125, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.02}


  1%|          | 30/3750 [00:12<16:53,  3.67it/s]

{'loss': 0.6911, 'grad_norm': 1.1858385801315308, 'learning_rate': 3e-06, 'epoch': 0.02}


  1%|          | 40/3750 [00:15<16:46,  3.69it/s]

{'loss': 0.695, 'grad_norm': 1.441402554512024, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.03}


  1%|▏         | 50/3750 [00:18<16:38,  3.71it/s]

{'loss': 0.6967, 'grad_norm': 2.780712604522705, 'learning_rate': 5e-06, 'epoch': 0.04}


  2%|▏         | 60/3750 [00:20<16:36,  3.70it/s]

{'loss': 0.6899, 'grad_norm': 0.9821226000785828, 'learning_rate': 6e-06, 'epoch': 0.05}


  2%|▏         | 70/3750 [00:23<16:34,  3.70it/s]

{'loss': 0.6867, 'grad_norm': 2.1381421089172363, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.06}


  2%|▏         | 80/3750 [00:26<16:30,  3.70it/s]

{'loss': 0.6751, 'grad_norm': 2.1252756118774414, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.06}


  2%|▏         | 90/3750 [00:28<16:31,  3.69it/s]

{'loss': 0.667, 'grad_norm': 1.167785882949829, 'learning_rate': 9e-06, 'epoch': 0.07}


  3%|▎         | 100/3750 [00:31<16:25,  3.70it/s]

{'loss': 0.6456, 'grad_norm': 1.3205753564834595, 'learning_rate': 1e-05, 'epoch': 0.08}


  3%|▎         | 110/3750 [00:34<16:24,  3.70it/s]

{'loss': 0.5683, 'grad_norm': 2.4879584312438965, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.09}


  3%|▎         | 120/3750 [00:36<16:20,  3.70it/s]

{'loss': 0.4654, 'grad_norm': 6.101283073425293, 'learning_rate': 1.2e-05, 'epoch': 0.1}


  3%|▎         | 130/3750 [00:39<16:18,  3.70it/s]

{'loss': 0.4065, 'grad_norm': 15.3212308883667, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.1}


  4%|▎         | 140/3750 [00:42<16:18,  3.69it/s]

{'loss': 0.4594, 'grad_norm': 8.66718578338623, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.11}


  4%|▍         | 150/3750 [00:45<16:25,  3.65it/s]

{'loss': 0.353, 'grad_norm': 7.349375247955322, 'learning_rate': 1.5e-05, 'epoch': 0.12}


  4%|▍         | 160/3750 [00:47<16:14,  3.68it/s]

{'loss': 0.2699, 'grad_norm': 9.287389755249023, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.13}


  5%|▍         | 170/3750 [00:50<16:12,  3.68it/s]

{'loss': 0.3967, 'grad_norm': 11.532280921936035, 'learning_rate': 1.7000000000000003e-05, 'epoch': 0.14}


  5%|▍         | 180/3750 [00:53<16:05,  3.70it/s]

{'loss': 0.4328, 'grad_norm': 5.373385429382324, 'learning_rate': 1.8e-05, 'epoch': 0.14}


  5%|▌         | 190/3750 [00:56<16:10,  3.67it/s]

{'loss': 0.3676, 'grad_norm': 9.740294456481934, 'learning_rate': 1.9e-05, 'epoch': 0.15}


  5%|▌         | 200/3750 [00:58<16:02,  3.69it/s]

{'loss': 0.3281, 'grad_norm': 8.239296913146973, 'learning_rate': 2e-05, 'epoch': 0.16}


  6%|▌         | 210/3750 [01:01<16:01,  3.68it/s]

{'loss': 0.2393, 'grad_norm': 8.626252174377441, 'learning_rate': 2.1e-05, 'epoch': 0.17}


  6%|▌         | 220/3750 [01:04<16:01,  3.67it/s]

{'loss': 0.346, 'grad_norm': 10.63962459564209, 'learning_rate': 2.2000000000000003e-05, 'epoch': 0.18}


  6%|▌         | 230/3750 [01:06<15:57,  3.68it/s]

{'loss': 0.4124, 'grad_norm': 8.237617492675781, 'learning_rate': 2.3000000000000003e-05, 'epoch': 0.18}


  6%|▋         | 240/3750 [01:09<15:56,  3.67it/s]

{'loss': 0.3659, 'grad_norm': 7.860538959503174, 'learning_rate': 2.4e-05, 'epoch': 0.19}


  7%|▋         | 250/3750 [01:12<15:54,  3.67it/s]

{'loss': 0.2984, 'grad_norm': 7.772144317626953, 'learning_rate': 2.5e-05, 'epoch': 0.2}


  7%|▋         | 260/3750 [01:15<15:51,  3.67it/s]

{'loss': 0.3156, 'grad_norm': 4.279891490936279, 'learning_rate': 2.6000000000000002e-05, 'epoch': 0.21}


  7%|▋         | 270/3750 [01:17<15:34,  3.72it/s]

{'loss': 0.3435, 'grad_norm': 10.108110427856445, 'learning_rate': 2.7000000000000002e-05, 'epoch': 0.22}


  7%|▋         | 280/3750 [01:20<15:38,  3.70it/s]

{'loss': 0.3664, 'grad_norm': 14.276097297668457, 'learning_rate': 2.8000000000000003e-05, 'epoch': 0.22}


  8%|▊         | 290/3750 [01:23<15:41,  3.68it/s]

{'loss': 0.2969, 'grad_norm': 2.5283379554748535, 'learning_rate': 2.9e-05, 'epoch': 0.23}


  8%|▊         | 300/3750 [01:25<15:41,  3.66it/s]

{'loss': 0.3484, 'grad_norm': 9.543800354003906, 'learning_rate': 3e-05, 'epoch': 0.24}


  8%|▊         | 310/3750 [01:28<15:37,  3.67it/s]

{'loss': 0.2791, 'grad_norm': 2.973247528076172, 'learning_rate': 3.1e-05, 'epoch': 0.25}


  9%|▊         | 320/3750 [01:31<15:27,  3.70it/s]

{'loss': 0.2949, 'grad_norm': 16.653339385986328, 'learning_rate': 3.2000000000000005e-05, 'epoch': 0.26}


  9%|▉         | 330/3750 [01:34<15:30,  3.67it/s]

{'loss': 0.2322, 'grad_norm': 3.125208616256714, 'learning_rate': 3.3e-05, 'epoch': 0.26}


  9%|▉         | 340/3750 [01:36<15:27,  3.67it/s]

{'loss': 0.2864, 'grad_norm': 13.076640129089355, 'learning_rate': 3.4000000000000007e-05, 'epoch': 0.27}


  9%|▉         | 350/3750 [01:39<15:24,  3.68it/s]

{'loss': 0.3769, 'grad_norm': 7.246153831481934, 'learning_rate': 3.5e-05, 'epoch': 0.28}


 10%|▉         | 360/3750 [01:42<15:21,  3.68it/s]

{'loss': 0.2322, 'grad_norm': 7.567549705505371, 'learning_rate': 3.6e-05, 'epoch': 0.29}


 10%|▉         | 370/3750 [01:45<15:16,  3.69it/s]

{'loss': 0.3311, 'grad_norm': 14.162513732910156, 'learning_rate': 3.7e-05, 'epoch': 0.3}


 10%|█         | 380/3750 [01:47<15:20,  3.66it/s]

{'loss': 0.1753, 'grad_norm': 10.438393592834473, 'learning_rate': 3.8e-05, 'epoch': 0.3}


 10%|█         | 390/3750 [01:50<15:35,  3.59it/s]

{'loss': 0.2349, 'grad_norm': 3.188992977142334, 'learning_rate': 3.9000000000000006e-05, 'epoch': 0.31}


 11%|█         | 400/3750 [01:53<15:14,  3.66it/s]

{'loss': 0.3991, 'grad_norm': 7.587440013885498, 'learning_rate': 4e-05, 'epoch': 0.32}


 11%|█         | 410/3750 [01:55<15:10,  3.67it/s]

{'loss': 0.3642, 'grad_norm': 9.148326873779297, 'learning_rate': 4.1e-05, 'epoch': 0.33}


 11%|█         | 420/3750 [01:58<15:08,  3.66it/s]

{'loss': 0.3204, 'grad_norm': 5.869972229003906, 'learning_rate': 4.2e-05, 'epoch': 0.34}


 11%|█▏        | 430/3750 [02:01<15:07,  3.66it/s]

{'loss': 0.3709, 'grad_norm': 10.331229209899902, 'learning_rate': 4.3e-05, 'epoch': 0.34}


 12%|█▏        | 440/3750 [02:04<15:02,  3.67it/s]

{'loss': 0.3351, 'grad_norm': 6.612332344055176, 'learning_rate': 4.4000000000000006e-05, 'epoch': 0.35}


 12%|█▏        | 450/3750 [02:06<14:59,  3.67it/s]

{'loss': 0.3534, 'grad_norm': 4.1338276863098145, 'learning_rate': 4.5e-05, 'epoch': 0.36}


 12%|█▏        | 460/3750 [02:09<14:59,  3.66it/s]

{'loss': 0.3908, 'grad_norm': 9.095805168151855, 'learning_rate': 4.600000000000001e-05, 'epoch': 0.37}


 13%|█▎        | 470/3750 [02:12<14:53,  3.67it/s]

{'loss': 0.1778, 'grad_norm': 3.390857458114624, 'learning_rate': 4.7e-05, 'epoch': 0.38}


 13%|█▎        | 480/3750 [02:15<14:54,  3.66it/s]

{'loss': 0.5969, 'grad_norm': 10.481690406799316, 'learning_rate': 4.8e-05, 'epoch': 0.38}


 13%|█▎        | 490/3750 [02:17<14:52,  3.65it/s]

{'loss': 0.401, 'grad_norm': 2.8946971893310547, 'learning_rate': 4.9e-05, 'epoch': 0.39}


 13%|█▎        | 500/3750 [02:20<14:40,  3.69it/s]

{'loss': 0.3273, 'grad_norm': 6.877674102783203, 'learning_rate': 5e-05, 'epoch': 0.4}


 14%|█▎        | 510/3750 [02:24<16:56,  3.19it/s]

{'loss': 0.409, 'grad_norm': 6.780611038208008, 'learning_rate': 4.984615384615385e-05, 'epoch': 0.41}


 14%|█▍        | 520/3750 [02:27<15:21,  3.50it/s]

{'loss': 0.3241, 'grad_norm': 6.973483085632324, 'learning_rate': 4.969230769230769e-05, 'epoch': 0.42}


 14%|█▍        | 530/3750 [02:30<14:51,  3.61it/s]

{'loss': 0.3266, 'grad_norm': 4.335149765014648, 'learning_rate': 4.953846153846154e-05, 'epoch': 0.42}


 14%|█▍        | 540/3750 [02:32<14:48,  3.61it/s]

{'loss': 0.3436, 'grad_norm': 3.7275853157043457, 'learning_rate': 4.9384615384615384e-05, 'epoch': 0.43}


 15%|█▍        | 550/3750 [02:35<14:42,  3.63it/s]

{'loss': 0.3104, 'grad_norm': 8.579946517944336, 'learning_rate': 4.923076923076924e-05, 'epoch': 0.44}


 15%|█▍        | 560/3750 [02:38<14:48,  3.59it/s]

{'loss': 0.4439, 'grad_norm': 6.93914270401001, 'learning_rate': 4.907692307692308e-05, 'epoch': 0.45}


 15%|█▌        | 570/3750 [02:41<14:37,  3.63it/s]

{'loss': 0.2853, 'grad_norm': 6.964446544647217, 'learning_rate': 4.892307692307693e-05, 'epoch': 0.46}


 15%|█▌        | 580/3750 [02:43<14:30,  3.64it/s]

{'loss': 0.3255, 'grad_norm': 7.651678085327148, 'learning_rate': 4.876923076923077e-05, 'epoch': 0.46}


 16%|█▌        | 590/3750 [02:46<14:29,  3.63it/s]

{'loss': 0.2575, 'grad_norm': 10.727653503417969, 'learning_rate': 4.861538461538462e-05, 'epoch': 0.47}


 16%|█▌        | 600/3750 [02:49<14:29,  3.62it/s]

{'loss': 0.2248, 'grad_norm': 6.2859578132629395, 'learning_rate': 4.846153846153846e-05, 'epoch': 0.48}


 16%|█▋        | 610/3750 [02:52<14:17,  3.66it/s]

{'loss': 0.3902, 'grad_norm': 4.993444919586182, 'learning_rate': 4.830769230769231e-05, 'epoch': 0.49}


 17%|█▋        | 620/3750 [02:54<14:14,  3.66it/s]

{'loss': 0.2514, 'grad_norm': 8.277691841125488, 'learning_rate': 4.815384615384615e-05, 'epoch': 0.5}


 17%|█▋        | 630/3750 [02:57<14:14,  3.65it/s]

{'loss': 0.2784, 'grad_norm': 3.852212429046631, 'learning_rate': 4.8e-05, 'epoch': 0.5}


 17%|█▋        | 640/3750 [03:00<14:14,  3.64it/s]

{'loss': 0.5137, 'grad_norm': 9.426596641540527, 'learning_rate': 4.784615384615384e-05, 'epoch': 0.51}


 17%|█▋        | 650/3750 [03:03<14:05,  3.66it/s]

{'loss': 0.389, 'grad_norm': 3.648956537246704, 'learning_rate': 4.76923076923077e-05, 'epoch': 0.52}


 18%|█▊        | 660/3750 [03:05<14:05,  3.66it/s]

{'loss': 0.3131, 'grad_norm': 10.562483787536621, 'learning_rate': 4.753846153846154e-05, 'epoch': 0.53}


 18%|█▊        | 670/3750 [03:08<14:09,  3.63it/s]

{'loss': 0.2685, 'grad_norm': 3.7109644412994385, 'learning_rate': 4.738461538461539e-05, 'epoch': 0.54}


 18%|█▊        | 680/3750 [03:11<14:07,  3.62it/s]

{'loss': 0.2068, 'grad_norm': 0.7458673715591431, 'learning_rate': 4.723076923076923e-05, 'epoch': 0.54}


 18%|█▊        | 690/3750 [03:14<14:05,  3.62it/s]

{'loss': 0.3142, 'grad_norm': 8.376029014587402, 'learning_rate': 4.707692307692308e-05, 'epoch': 0.55}


 19%|█▊        | 700/3750 [03:16<13:59,  3.63it/s]

{'loss': 0.2386, 'grad_norm': 7.771261215209961, 'learning_rate': 4.692307692307693e-05, 'epoch': 0.56}


 19%|█▉        | 710/3750 [03:19<14:01,  3.61it/s]

{'loss': 0.2801, 'grad_norm': 5.377802848815918, 'learning_rate': 4.676923076923077e-05, 'epoch': 0.57}


 19%|█▉        | 720/3750 [03:22<13:57,  3.62it/s]

{'loss': 0.2499, 'grad_norm': 6.855386257171631, 'learning_rate': 4.661538461538462e-05, 'epoch': 0.58}


 19%|█▉        | 730/3750 [03:25<13:52,  3.63it/s]

{'loss': 0.235, 'grad_norm': 12.796589851379395, 'learning_rate': 4.646153846153846e-05, 'epoch': 0.58}


 20%|█▉        | 740/3750 [03:28<13:55,  3.60it/s]

{'loss': 0.3496, 'grad_norm': 5.602896690368652, 'learning_rate': 4.630769230769231e-05, 'epoch': 0.59}


 20%|██        | 750/3750 [03:30<13:47,  3.62it/s]

{'loss': 0.2517, 'grad_norm': 4.162884712219238, 'learning_rate': 4.615384615384616e-05, 'epoch': 0.6}


 20%|██        | 760/3750 [03:33<13:39,  3.65it/s]

{'loss': 0.2053, 'grad_norm': 10.892425537109375, 'learning_rate': 4.600000000000001e-05, 'epoch': 0.61}


 21%|██        | 770/3750 [03:36<13:35,  3.65it/s]

{'loss': 0.2336, 'grad_norm': 2.9123222827911377, 'learning_rate': 4.584615384615385e-05, 'epoch': 0.62}


 21%|██        | 780/3750 [03:39<13:28,  3.67it/s]

{'loss': 0.2804, 'grad_norm': 4.215754508972168, 'learning_rate': 4.56923076923077e-05, 'epoch': 0.62}


 21%|██        | 790/3750 [03:41<13:30,  3.65it/s]

{'loss': 0.1929, 'grad_norm': 8.032365798950195, 'learning_rate': 4.553846153846154e-05, 'epoch': 0.63}


 21%|██▏       | 800/3750 [03:44<13:26,  3.66it/s]

{'loss': 0.2562, 'grad_norm': 9.75494384765625, 'learning_rate': 4.538461538461539e-05, 'epoch': 0.64}


 22%|██▏       | 810/3750 [03:47<13:23,  3.66it/s]

{'loss': 0.337, 'grad_norm': 7.783023357391357, 'learning_rate': 4.523076923076923e-05, 'epoch': 0.65}


 22%|██▏       | 820/3750 [03:49<13:21,  3.66it/s]

{'loss': 0.2372, 'grad_norm': 3.955103874206543, 'learning_rate': 4.507692307692308e-05, 'epoch': 0.66}


 22%|██▏       | 830/3750 [03:52<13:17,  3.66it/s]

{'loss': 0.1536, 'grad_norm': 10.680975914001465, 'learning_rate': 4.492307692307692e-05, 'epoch': 0.66}


 22%|██▏       | 840/3750 [03:55<13:15,  3.66it/s]

{'loss': 0.446, 'grad_norm': 5.4565629959106445, 'learning_rate': 4.476923076923077e-05, 'epoch': 0.67}


 23%|██▎       | 850/3750 [03:58<13:12,  3.66it/s]

{'loss': 0.3255, 'grad_norm': 5.764597415924072, 'learning_rate': 4.461538461538462e-05, 'epoch': 0.68}


 23%|██▎       | 860/3750 [04:00<13:10,  3.66it/s]

{'loss': 0.3451, 'grad_norm': 4.198781490325928, 'learning_rate': 4.4461538461538466e-05, 'epoch': 0.69}


 23%|██▎       | 870/3750 [04:03<13:18,  3.61it/s]

{'loss': 0.2631, 'grad_norm': 7.036981582641602, 'learning_rate': 4.430769230769231e-05, 'epoch': 0.7}


 23%|██▎       | 880/3750 [04:06<13:13,  3.62it/s]

{'loss': 0.2548, 'grad_norm': 4.682916641235352, 'learning_rate': 4.415384615384616e-05, 'epoch': 0.7}


 24%|██▎       | 890/3750 [04:09<13:14,  3.60it/s]

{'loss': 0.3607, 'grad_norm': 4.376043319702148, 'learning_rate': 4.4000000000000006e-05, 'epoch': 0.71}


 24%|██▍       | 900/3750 [04:12<13:08,  3.61it/s]

{'loss': 0.269, 'grad_norm': 2.692206859588623, 'learning_rate': 4.384615384615385e-05, 'epoch': 0.72}


 24%|██▍       | 910/3750 [04:14<13:09,  3.60it/s]

{'loss': 0.2412, 'grad_norm': 6.043333530426025, 'learning_rate': 4.3692307692307696e-05, 'epoch': 0.73}


 25%|██▍       | 920/3750 [04:17<13:02,  3.61it/s]

{'loss': 0.2744, 'grad_norm': 5.734634876251221, 'learning_rate': 4.353846153846154e-05, 'epoch': 0.74}


 25%|██▍       | 930/3750 [04:20<13:02,  3.61it/s]

{'loss': 0.223, 'grad_norm': 10.852492332458496, 'learning_rate': 4.338461538461539e-05, 'epoch': 0.74}


 25%|██▌       | 940/3750 [04:23<12:57,  3.61it/s]

{'loss': 0.2227, 'grad_norm': 1.5014071464538574, 'learning_rate': 4.323076923076923e-05, 'epoch': 0.75}


 25%|██▌       | 950/3750 [04:25<12:58,  3.60it/s]

{'loss': 0.1706, 'grad_norm': 5.6693902015686035, 'learning_rate': 4.3076923076923084e-05, 'epoch': 0.76}


 26%|██▌       | 960/3750 [04:28<12:53,  3.61it/s]

{'loss': 0.3373, 'grad_norm': 8.209442138671875, 'learning_rate': 4.2923076923076926e-05, 'epoch': 0.77}


 26%|██▌       | 970/3750 [04:31<12:50,  3.61it/s]

{'loss': 0.2859, 'grad_norm': 2.588654041290283, 'learning_rate': 4.2769230769230775e-05, 'epoch': 0.78}


 26%|██▌       | 980/3750 [04:34<12:44,  3.62it/s]

{'loss': 0.2285, 'grad_norm': 4.6359782218933105, 'learning_rate': 4.2615384615384617e-05, 'epoch': 0.78}


 26%|██▋       | 990/3750 [04:36<12:46,  3.60it/s]

{'loss': 0.1848, 'grad_norm': 2.410296678543091, 'learning_rate': 4.2461538461538465e-05, 'epoch': 0.79}


 27%|██▋       | 1000/3750 [04:39<12:36,  3.63it/s]

{'loss': 0.2984, 'grad_norm': 10.176850318908691, 'learning_rate': 4.230769230769231e-05, 'epoch': 0.8}


 27%|██▋       | 1010/3750 [04:43<14:51,  3.07it/s]

{'loss': 0.343, 'grad_norm': 8.061102867126465, 'learning_rate': 4.2153846153846156e-05, 'epoch': 0.81}


 27%|██▋       | 1020/3750 [04:46<13:20,  3.41it/s]

{'loss': 0.3148, 'grad_norm': 4.569148063659668, 'learning_rate': 4.2e-05, 'epoch': 0.82}


 27%|██▋       | 1030/3750 [04:49<12:58,  3.50it/s]

{'loss': 0.1899, 'grad_norm': 8.686240196228027, 'learning_rate': 4.1846153846153846e-05, 'epoch': 0.82}


 28%|██▊       | 1040/3750 [04:52<12:46,  3.53it/s]

{'loss': 0.2613, 'grad_norm': 5.5476813316345215, 'learning_rate': 4.169230769230769e-05, 'epoch': 0.83}


 28%|██▊       | 1050/3750 [04:55<12:29,  3.60it/s]

{'loss': 0.2134, 'grad_norm': 7.774003505706787, 'learning_rate': 4.1538461538461544e-05, 'epoch': 0.84}


 28%|██▊       | 1060/3750 [04:57<12:23,  3.62it/s]

{'loss': 0.2671, 'grad_norm': 4.903701305389404, 'learning_rate': 4.1384615384615386e-05, 'epoch': 0.85}


 29%|██▊       | 1070/3750 [05:00<12:17,  3.63it/s]

{'loss': 0.1852, 'grad_norm': 5.35076904296875, 'learning_rate': 4.1230769230769234e-05, 'epoch': 0.86}


 29%|██▉       | 1080/3750 [05:03<12:22,  3.60it/s]

{'loss': 0.3511, 'grad_norm': 6.942461013793945, 'learning_rate': 4.1076923076923076e-05, 'epoch': 0.86}


 29%|██▉       | 1090/3750 [05:06<12:18,  3.60it/s]

{'loss': 0.1514, 'grad_norm': 8.133625030517578, 'learning_rate': 4.0923076923076925e-05, 'epoch': 0.87}


 29%|██▉       | 1100/3750 [05:09<12:17,  3.59it/s]

{'loss': 0.2853, 'grad_norm': 0.8489212989807129, 'learning_rate': 4.0769230769230773e-05, 'epoch': 0.88}


 30%|██▉       | 1110/3750 [05:11<12:13,  3.60it/s]

{'loss': 0.1653, 'grad_norm': 8.67277717590332, 'learning_rate': 4.0615384615384615e-05, 'epoch': 0.89}


 30%|██▉       | 1120/3750 [05:14<12:22,  3.54it/s]

{'loss': 0.2304, 'grad_norm': 9.602072715759277, 'learning_rate': 4.0461538461538464e-05, 'epoch': 0.9}


 30%|███       | 1130/3750 [05:17<12:18,  3.55it/s]

{'loss': 0.2059, 'grad_norm': 12.913468360900879, 'learning_rate': 4.0307692307692306e-05, 'epoch': 0.9}


 30%|███       | 1140/3750 [05:20<12:17,  3.54it/s]

{'loss': 0.3138, 'grad_norm': 11.94363784790039, 'learning_rate': 4.0153846153846155e-05, 'epoch': 0.91}


 31%|███       | 1150/3750 [05:23<12:13,  3.55it/s]

{'loss': 0.3214, 'grad_norm': 4.403100967407227, 'learning_rate': 4e-05, 'epoch': 0.92}


 31%|███       | 1160/3750 [05:25<12:08,  3.56it/s]

{'loss': 0.2432, 'grad_norm': 4.610959529876709, 'learning_rate': 3.984615384615385e-05, 'epoch': 0.93}


 31%|███       | 1170/3750 [05:28<12:17,  3.50it/s]

{'loss': 0.1632, 'grad_norm': 4.8815765380859375, 'learning_rate': 3.9692307692307694e-05, 'epoch': 0.94}


 31%|███▏      | 1180/3750 [05:31<11:58,  3.58it/s]

{'loss': 0.18, 'grad_norm': 7.946305751800537, 'learning_rate': 3.953846153846154e-05, 'epoch': 0.94}


 32%|███▏      | 1190/3750 [05:34<11:50,  3.60it/s]

{'loss': 0.2454, 'grad_norm': 7.435680389404297, 'learning_rate': 3.9384615384615384e-05, 'epoch': 0.95}


 32%|███▏      | 1200/3750 [05:37<11:50,  3.59it/s]

{'loss': 0.2973, 'grad_norm': 4.760448932647705, 'learning_rate': 3.923076923076923e-05, 'epoch': 0.96}


 32%|███▏      | 1210/3750 [05:39<11:47,  3.59it/s]

{'loss': 0.1892, 'grad_norm': 4.816673278808594, 'learning_rate': 3.9076923076923075e-05, 'epoch': 0.97}


 33%|███▎      | 1220/3750 [05:42<11:46,  3.58it/s]

{'loss': 0.384, 'grad_norm': 8.292550086975098, 'learning_rate': 3.8923076923076924e-05, 'epoch': 0.98}


 33%|███▎      | 1230/3750 [05:45<11:40,  3.60it/s]

{'loss': 0.1406, 'grad_norm': 1.4162070751190186, 'learning_rate': 3.8769230769230766e-05, 'epoch': 0.98}


 33%|███▎      | 1240/3750 [05:48<11:33,  3.62it/s]

{'loss': 0.2539, 'grad_norm': 16.530054092407227, 'learning_rate': 3.861538461538462e-05, 'epoch': 0.99}


 33%|███▎      | 1250/3750 [05:51<11:33,  3.61it/s]

{'loss': 0.3245, 'grad_norm': 7.191878318786621, 'learning_rate': 3.846153846153846e-05, 'epoch': 1.0}


 34%|███▎      | 1260/3750 [05:53<11:32,  3.60it/s]

{'loss': 0.1157, 'grad_norm': 0.5243437886238098, 'learning_rate': 3.830769230769231e-05, 'epoch': 1.01}


 34%|███▍      | 1270/3750 [05:56<11:31,  3.59it/s]

{'loss': 0.1905, 'grad_norm': 0.3120182752609253, 'learning_rate': 3.8153846153846153e-05, 'epoch': 1.02}


 34%|███▍      | 1280/3750 [05:59<11:28,  3.59it/s]

{'loss': 0.246, 'grad_norm': 10.62739372253418, 'learning_rate': 3.8e-05, 'epoch': 1.02}


 34%|███▍      | 1290/3750 [06:02<11:21,  3.61it/s]

{'loss': 0.1022, 'grad_norm': 3.168999671936035, 'learning_rate': 3.784615384615385e-05, 'epoch': 1.03}


 35%|███▍      | 1300/3750 [06:04<11:19,  3.61it/s]

{'loss': 0.1369, 'grad_norm': 6.899278163909912, 'learning_rate': 3.769230769230769e-05, 'epoch': 1.04}


 35%|███▍      | 1310/3750 [06:07<11:18,  3.60it/s]

{'loss': 0.1348, 'grad_norm': 7.708598613739014, 'learning_rate': 3.753846153846154e-05, 'epoch': 1.05}


 35%|███▌      | 1320/3750 [06:10<11:11,  3.62it/s]

{'loss': 0.187, 'grad_norm': 0.20155829191207886, 'learning_rate': 3.738461538461538e-05, 'epoch': 1.06}


 35%|███▌      | 1330/3750 [06:13<11:06,  3.63it/s]

{'loss': 0.1903, 'grad_norm': 1.8201888799667358, 'learning_rate': 3.723076923076923e-05, 'epoch': 1.06}


 36%|███▌      | 1340/3750 [06:16<11:08,  3.61it/s]

{'loss': 0.1357, 'grad_norm': 0.9450441598892212, 'learning_rate': 3.707692307692308e-05, 'epoch': 1.07}


 36%|███▌      | 1350/3750 [06:18<11:05,  3.61it/s]

{'loss': 0.1735, 'grad_norm': 7.600000858306885, 'learning_rate': 3.692307692307693e-05, 'epoch': 1.08}


 36%|███▋      | 1360/3750 [06:21<11:06,  3.59it/s]

{'loss': 0.1317, 'grad_norm': 8.396793365478516, 'learning_rate': 3.676923076923077e-05, 'epoch': 1.09}


 37%|███▋      | 1370/3750 [06:24<11:01,  3.60it/s]

{'loss': 0.2002, 'grad_norm': 4.294376850128174, 'learning_rate': 3.661538461538462e-05, 'epoch': 1.1}


 37%|███▋      | 1380/3750 [06:27<11:00,  3.59it/s]

{'loss': 0.1539, 'grad_norm': 7.384941101074219, 'learning_rate': 3.646153846153846e-05, 'epoch': 1.1}


 37%|███▋      | 1390/3750 [06:29<10:57,  3.59it/s]

{'loss': 0.1195, 'grad_norm': 0.27952179312705994, 'learning_rate': 3.630769230769231e-05, 'epoch': 1.11}


 37%|███▋      | 1400/3750 [06:32<10:53,  3.60it/s]

{'loss': 0.1014, 'grad_norm': 1.1492455005645752, 'learning_rate': 3.615384615384615e-05, 'epoch': 1.12}


 38%|███▊      | 1410/3750 [06:35<10:52,  3.59it/s]

{'loss': 0.2586, 'grad_norm': 3.917750120162964, 'learning_rate': 3.6e-05, 'epoch': 1.13}


 38%|███▊      | 1420/3750 [06:38<10:42,  3.63it/s]

{'loss': 0.1303, 'grad_norm': 6.4567670822143555, 'learning_rate': 3.584615384615384e-05, 'epoch': 1.14}


 38%|███▊      | 1430/3750 [06:41<10:36,  3.64it/s]

{'loss': 0.1107, 'grad_norm': 17.142223358154297, 'learning_rate': 3.569230769230769e-05, 'epoch': 1.14}


 38%|███▊      | 1440/3750 [06:43<10:40,  3.61it/s]

{'loss': 0.1816, 'grad_norm': 5.606646537780762, 'learning_rate': 3.553846153846154e-05, 'epoch': 1.15}


 39%|███▊      | 1450/3750 [06:46<10:34,  3.62it/s]

{'loss': 0.1077, 'grad_norm': 9.57114315032959, 'learning_rate': 3.538461538461539e-05, 'epoch': 1.16}


 39%|███▉      | 1460/3750 [06:49<10:30,  3.63it/s]

{'loss': 0.1721, 'grad_norm': 7.96489143371582, 'learning_rate': 3.523076923076923e-05, 'epoch': 1.17}


 39%|███▉      | 1470/3750 [06:52<10:30,  3.62it/s]

{'loss': 0.1413, 'grad_norm': 5.857298851013184, 'learning_rate': 3.507692307692308e-05, 'epoch': 1.18}


 39%|███▉      | 1480/3750 [06:54<10:23,  3.64it/s]

{'loss': 0.0902, 'grad_norm': 5.105051040649414, 'learning_rate': 3.492307692307693e-05, 'epoch': 1.18}


 40%|███▉      | 1490/3750 [06:57<10:22,  3.63it/s]

{'loss': 0.1815, 'grad_norm': 9.906122207641602, 'learning_rate': 3.476923076923077e-05, 'epoch': 1.19}


 40%|████      | 1500/3750 [07:00<10:19,  3.63it/s]

{'loss': 0.0456, 'grad_norm': 0.3302038311958313, 'learning_rate': 3.461538461538462e-05, 'epoch': 1.2}


 40%|████      | 1510/3750 [07:04<11:56,  3.13it/s]

{'loss': 0.2617, 'grad_norm': 0.709566593170166, 'learning_rate': 3.446153846153846e-05, 'epoch': 1.21}


 41%|████      | 1520/3750 [07:07<10:42,  3.47it/s]

{'loss': 0.1277, 'grad_norm': 18.00716209411621, 'learning_rate': 3.430769230769231e-05, 'epoch': 1.22}


 41%|████      | 1530/3750 [07:10<10:23,  3.56it/s]

{'loss': 0.2092, 'grad_norm': 13.118906021118164, 'learning_rate': 3.415384615384615e-05, 'epoch': 1.22}


 41%|████      | 1540/3750 [07:12<10:24,  3.54it/s]

{'loss': 0.3234, 'grad_norm': 8.39810848236084, 'learning_rate': 3.4000000000000007e-05, 'epoch': 1.23}


 41%|████▏     | 1550/3750 [07:15<10:31,  3.48it/s]

{'loss': 0.1222, 'grad_norm': 17.329198837280273, 'learning_rate': 3.384615384615385e-05, 'epoch': 1.24}


 42%|████▏     | 1560/3750 [07:18<10:14,  3.57it/s]

{'loss': 0.1737, 'grad_norm': 3.137397050857544, 'learning_rate': 3.36923076923077e-05, 'epoch': 1.25}


 42%|████▏     | 1570/3750 [07:21<10:07,  3.59it/s]

{'loss': 0.1943, 'grad_norm': 7.637469291687012, 'learning_rate': 3.353846153846154e-05, 'epoch': 1.26}


 42%|████▏     | 1580/3750 [07:24<10:03,  3.59it/s]

{'loss': 0.2587, 'grad_norm': 15.737981796264648, 'learning_rate': 3.338461538461539e-05, 'epoch': 1.26}


 42%|████▏     | 1590/3750 [07:26<10:02,  3.58it/s]

{'loss': 0.1576, 'grad_norm': 7.373289585113525, 'learning_rate': 3.323076923076923e-05, 'epoch': 1.27}


 43%|████▎     | 1600/3750 [07:29<09:58,  3.59it/s]

{'loss': 0.1498, 'grad_norm': 3.1950881481170654, 'learning_rate': 3.307692307692308e-05, 'epoch': 1.28}


 43%|████▎     | 1610/3750 [07:32<09:53,  3.61it/s]

{'loss': 0.1954, 'grad_norm': 8.476312637329102, 'learning_rate': 3.292307692307692e-05, 'epoch': 1.29}


 43%|████▎     | 1620/3750 [07:35<09:50,  3.61it/s]

{'loss': 0.1851, 'grad_norm': 8.152973175048828, 'learning_rate': 3.276923076923077e-05, 'epoch': 1.3}


 43%|████▎     | 1630/3750 [07:38<09:46,  3.61it/s]

{'loss': 0.1274, 'grad_norm': 5.2646403312683105, 'learning_rate': 3.261538461538462e-05, 'epoch': 1.3}


 44%|████▎     | 1640/3750 [07:40<09:46,  3.60it/s]

{'loss': 0.0792, 'grad_norm': 2.6465532779693604, 'learning_rate': 3.2461538461538466e-05, 'epoch': 1.31}


 44%|████▍     | 1650/3750 [07:43<09:46,  3.58it/s]

{'loss': 0.133, 'grad_norm': 7.103399753570557, 'learning_rate': 3.230769230769231e-05, 'epoch': 1.32}


 44%|████▍     | 1660/3750 [07:46<09:40,  3.60it/s]

{'loss': 0.124, 'grad_norm': 8.430017471313477, 'learning_rate': 3.215384615384616e-05, 'epoch': 1.33}


 45%|████▍     | 1670/3750 [07:49<09:36,  3.61it/s]

{'loss': 0.2749, 'grad_norm': 9.918973922729492, 'learning_rate': 3.2000000000000005e-05, 'epoch': 1.34}


 45%|████▍     | 1680/3750 [07:51<09:37,  3.58it/s]

{'loss': 0.1496, 'grad_norm': 5.621988296508789, 'learning_rate': 3.184615384615385e-05, 'epoch': 1.34}


 45%|████▌     | 1690/3750 [07:54<09:33,  3.59it/s]

{'loss': 0.1694, 'grad_norm': 5.479750633239746, 'learning_rate': 3.1692307692307696e-05, 'epoch': 1.35}


 45%|████▌     | 1700/3750 [07:57<09:30,  3.59it/s]

{'loss': 0.1933, 'grad_norm': 9.14242172241211, 'learning_rate': 3.153846153846154e-05, 'epoch': 1.36}


 46%|████▌     | 1710/3750 [08:00<09:30,  3.57it/s]

{'loss': 0.195, 'grad_norm': 11.767102241516113, 'learning_rate': 3.1384615384615386e-05, 'epoch': 1.37}


 46%|████▌     | 1720/3750 [08:03<09:27,  3.57it/s]

{'loss': 0.0796, 'grad_norm': 10.607935905456543, 'learning_rate': 3.123076923076923e-05, 'epoch': 1.38}


 46%|████▌     | 1730/3750 [08:05<09:23,  3.58it/s]

{'loss': 0.1491, 'grad_norm': 0.728749692440033, 'learning_rate': 3.107692307692308e-05, 'epoch': 1.38}


 46%|████▋     | 1740/3750 [08:08<09:17,  3.60it/s]

{'loss': 0.0582, 'grad_norm': 2.5440409183502197, 'learning_rate': 3.0923076923076926e-05, 'epoch': 1.39}


 47%|████▋     | 1750/3750 [08:11<09:15,  3.60it/s]

{'loss': 0.2447, 'grad_norm': 10.51297378540039, 'learning_rate': 3.0769230769230774e-05, 'epoch': 1.4}


 47%|████▋     | 1760/3750 [08:14<09:12,  3.60it/s]

{'loss': 0.2333, 'grad_norm': 0.7418273687362671, 'learning_rate': 3.0615384615384616e-05, 'epoch': 1.41}


 47%|████▋     | 1770/3750 [08:17<09:08,  3.61it/s]

{'loss': 0.1268, 'grad_norm': 1.2190876007080078, 'learning_rate': 3.0461538461538465e-05, 'epoch': 1.42}


 47%|████▋     | 1780/3750 [08:19<09:07,  3.60it/s]

{'loss': 0.1302, 'grad_norm': 0.15721751749515533, 'learning_rate': 3.030769230769231e-05, 'epoch': 1.42}


 48%|████▊     | 1790/3750 [08:22<09:04,  3.60it/s]

{'loss': 0.1224, 'grad_norm': 12.734265327453613, 'learning_rate': 3.0153846153846155e-05, 'epoch': 1.43}


 48%|████▊     | 1800/3750 [08:25<09:02,  3.59it/s]

{'loss': 0.1532, 'grad_norm': 4.191930770874023, 'learning_rate': 3e-05, 'epoch': 1.44}


 48%|████▊     | 1810/3750 [08:28<08:59,  3.60it/s]

{'loss': 0.1871, 'grad_norm': 12.362336158752441, 'learning_rate': 2.9846153846153846e-05, 'epoch': 1.45}


 49%|████▊     | 1820/3750 [08:31<08:58,  3.58it/s]

{'loss': 0.0949, 'grad_norm': 0.1285061091184616, 'learning_rate': 2.969230769230769e-05, 'epoch': 1.46}


 49%|████▉     | 1830/3750 [08:33<08:54,  3.59it/s]

{'loss': 0.1918, 'grad_norm': 10.27590560913086, 'learning_rate': 2.9538461538461543e-05, 'epoch': 1.46}


 49%|████▉     | 1840/3750 [08:36<08:54,  3.57it/s]

{'loss': 0.1561, 'grad_norm': 0.5015294551849365, 'learning_rate': 2.938461538461539e-05, 'epoch': 1.47}


 49%|████▉     | 1850/3750 [08:39<08:51,  3.58it/s]

{'loss': 0.1976, 'grad_norm': 9.933704376220703, 'learning_rate': 2.9230769230769234e-05, 'epoch': 1.48}


 50%|████▉     | 1860/3750 [08:42<08:47,  3.58it/s]

{'loss': 0.1636, 'grad_norm': 6.33043098449707, 'learning_rate': 2.907692307692308e-05, 'epoch': 1.49}


 50%|████▉     | 1870/3750 [08:44<08:40,  3.61it/s]

{'loss': 0.1464, 'grad_norm': 0.9626346230506897, 'learning_rate': 2.8923076923076925e-05, 'epoch': 1.5}


 50%|█████     | 1880/3750 [08:47<08:38,  3.61it/s]

{'loss': 0.1183, 'grad_norm': 13.837444305419922, 'learning_rate': 2.876923076923077e-05, 'epoch': 1.5}


 50%|█████     | 1890/3750 [08:50<08:36,  3.60it/s]

{'loss': 0.2133, 'grad_norm': 1.2897011041641235, 'learning_rate': 2.8615384615384615e-05, 'epoch': 1.51}


 51%|█████     | 1900/3750 [08:53<08:33,  3.60it/s]

{'loss': 0.1618, 'grad_norm': 9.677821159362793, 'learning_rate': 2.846153846153846e-05, 'epoch': 1.52}


 51%|█████     | 1910/3750 [08:56<08:32,  3.59it/s]

{'loss': 0.1219, 'grad_norm': 7.805222511291504, 'learning_rate': 2.8307692307692306e-05, 'epoch': 1.53}


 51%|█████     | 1920/3750 [08:58<08:30,  3.58it/s]

{'loss': 0.2123, 'grad_norm': 5.590021133422852, 'learning_rate': 2.8153846153846154e-05, 'epoch': 1.54}


 51%|█████▏    | 1930/3750 [09:01<08:26,  3.59it/s]

{'loss': 0.1973, 'grad_norm': 7.2362060546875, 'learning_rate': 2.8000000000000003e-05, 'epoch': 1.54}


 52%|█████▏    | 1940/3750 [09:04<08:24,  3.59it/s]

{'loss': 0.0855, 'grad_norm': 0.5092495679855347, 'learning_rate': 2.7846153846153848e-05, 'epoch': 1.55}


 52%|█████▏    | 1950/3750 [09:07<08:21,  3.59it/s]

{'loss': 0.1695, 'grad_norm': 3.7613208293914795, 'learning_rate': 2.7692307692307694e-05, 'epoch': 1.56}


 52%|█████▏    | 1960/3750 [09:10<08:18,  3.59it/s]

{'loss': 0.061, 'grad_norm': 8.134137153625488, 'learning_rate': 2.7538461538461542e-05, 'epoch': 1.57}


 53%|█████▎    | 1970/3750 [09:12<08:15,  3.59it/s]

{'loss': 0.3174, 'grad_norm': 18.03874969482422, 'learning_rate': 2.7384615384615387e-05, 'epoch': 1.58}


 53%|█████▎    | 1980/3750 [09:15<08:15,  3.57it/s]

{'loss': 0.1006, 'grad_norm': 4.773394584655762, 'learning_rate': 2.7230769230769233e-05, 'epoch': 1.58}


 53%|█████▎    | 1990/3750 [09:18<08:15,  3.55it/s]

{'loss': 0.1782, 'grad_norm': 7.433125019073486, 'learning_rate': 2.7076923076923078e-05, 'epoch': 1.59}


 53%|█████▎    | 2000/3750 [09:21<08:11,  3.56it/s]

{'loss': 0.1957, 'grad_norm': 6.087990760803223, 'learning_rate': 2.6923076923076923e-05, 'epoch': 1.6}


 54%|█████▎    | 2010/3750 [09:25<09:30,  3.05it/s]

{'loss': 0.1464, 'grad_norm': 1.0817921161651611, 'learning_rate': 2.676923076923077e-05, 'epoch': 1.61}


 54%|█████▍    | 2020/3750 [09:28<08:28,  3.41it/s]

{'loss': 0.226, 'grad_norm': 2.1897356510162354, 'learning_rate': 2.6615384615384614e-05, 'epoch': 1.62}


 54%|█████▍    | 2030/3750 [09:31<08:15,  3.47it/s]

{'loss': 0.1729, 'grad_norm': 18.25448226928711, 'learning_rate': 2.6461538461538466e-05, 'epoch': 1.62}


 54%|█████▍    | 2040/3750 [09:34<08:03,  3.53it/s]

{'loss': 0.1479, 'grad_norm': 6.710668563842773, 'learning_rate': 2.630769230769231e-05, 'epoch': 1.63}


 55%|█████▍    | 2050/3750 [09:36<07:56,  3.57it/s]

{'loss': 0.1524, 'grad_norm': 12.066123962402344, 'learning_rate': 2.6153846153846157e-05, 'epoch': 1.64}


 55%|█████▍    | 2060/3750 [09:39<07:51,  3.59it/s]

{'loss': 0.1373, 'grad_norm': 10.1835298538208, 'learning_rate': 2.6000000000000002e-05, 'epoch': 1.65}


 55%|█████▌    | 2070/3750 [09:42<07:48,  3.59it/s]

{'loss': 0.2159, 'grad_norm': 1.5591572523117065, 'learning_rate': 2.5846153846153847e-05, 'epoch': 1.66}


 55%|█████▌    | 2080/3750 [09:45<07:48,  3.57it/s]

{'loss': 0.1957, 'grad_norm': 7.4835591316223145, 'learning_rate': 2.5692307692307692e-05, 'epoch': 1.66}


 56%|█████▌    | 2090/3750 [09:47<07:42,  3.59it/s]

{'loss': 0.1517, 'grad_norm': 4.863776683807373, 'learning_rate': 2.5538461538461538e-05, 'epoch': 1.67}


 56%|█████▌    | 2100/3750 [09:50<07:41,  3.58it/s]

{'loss': 0.1505, 'grad_norm': 13.377496719360352, 'learning_rate': 2.5384615384615383e-05, 'epoch': 1.68}


 56%|█████▋    | 2110/3750 [09:53<07:38,  3.58it/s]

{'loss': 0.1439, 'grad_norm': 7.8563127517700195, 'learning_rate': 2.523076923076923e-05, 'epoch': 1.69}


 57%|█████▋    | 2120/3750 [09:56<07:34,  3.58it/s]

{'loss': 0.0781, 'grad_norm': 12.294113159179688, 'learning_rate': 2.5076923076923077e-05, 'epoch': 1.7}


 57%|█████▋    | 2130/3750 [09:59<07:30,  3.60it/s]

{'loss': 0.1739, 'grad_norm': 14.56243896484375, 'learning_rate': 2.4923076923076926e-05, 'epoch': 1.7}


 57%|█████▋    | 2140/3750 [10:01<07:29,  3.58it/s]

{'loss': 0.0747, 'grad_norm': 8.94363021850586, 'learning_rate': 2.476923076923077e-05, 'epoch': 1.71}


 57%|█████▋    | 2150/3750 [10:04<07:25,  3.59it/s]

{'loss': 0.1312, 'grad_norm': 2.2613914012908936, 'learning_rate': 2.461538461538462e-05, 'epoch': 1.72}


 58%|█████▊    | 2160/3750 [10:07<07:21,  3.60it/s]

{'loss': 0.0991, 'grad_norm': 22.47162628173828, 'learning_rate': 2.4461538461538465e-05, 'epoch': 1.73}


 58%|█████▊    | 2170/3750 [10:10<07:18,  3.60it/s]

{'loss': 0.0694, 'grad_norm': 1.9025789499282837, 'learning_rate': 2.430769230769231e-05, 'epoch': 1.74}


 58%|█████▊    | 2180/3750 [10:13<07:18,  3.58it/s]

{'loss': 0.1954, 'grad_norm': 18.920604705810547, 'learning_rate': 2.4153846153846155e-05, 'epoch': 1.74}


 58%|█████▊    | 2190/3750 [10:15<07:16,  3.57it/s]

{'loss': 0.2464, 'grad_norm': 0.20213891565799713, 'learning_rate': 2.4e-05, 'epoch': 1.75}


 59%|█████▊    | 2200/3750 [10:18<07:13,  3.58it/s]

{'loss': 0.3113, 'grad_norm': 18.621912002563477, 'learning_rate': 2.384615384615385e-05, 'epoch': 1.76}


 59%|█████▉    | 2210/3750 [10:21<07:12,  3.56it/s]

{'loss': 0.1097, 'grad_norm': 10.348918914794922, 'learning_rate': 2.3692307692307695e-05, 'epoch': 1.77}


 59%|█████▉    | 2220/3750 [10:24<07:07,  3.58it/s]

{'loss': 0.129, 'grad_norm': 0.26686882972717285, 'learning_rate': 2.353846153846154e-05, 'epoch': 1.78}


 59%|█████▉    | 2230/3750 [10:27<07:04,  3.58it/s]

{'loss': 0.2232, 'grad_norm': 2.853219747543335, 'learning_rate': 2.3384615384615385e-05, 'epoch': 1.78}


 60%|█████▉    | 2240/3750 [10:29<07:01,  3.58it/s]

{'loss': 0.1587, 'grad_norm': 12.33515739440918, 'learning_rate': 2.323076923076923e-05, 'epoch': 1.79}


 60%|██████    | 2250/3750 [10:32<06:58,  3.59it/s]

{'loss': 0.2261, 'grad_norm': 0.36389750242233276, 'learning_rate': 2.307692307692308e-05, 'epoch': 1.8}


 60%|██████    | 2260/3750 [10:35<06:55,  3.58it/s]

{'loss': 0.0771, 'grad_norm': 0.19029875099658966, 'learning_rate': 2.2923076923076924e-05, 'epoch': 1.81}


 61%|██████    | 2270/3750 [10:38<06:52,  3.59it/s]

{'loss': 0.2997, 'grad_norm': 13.252963066101074, 'learning_rate': 2.276923076923077e-05, 'epoch': 1.82}


 61%|██████    | 2280/3750 [10:41<06:49,  3.59it/s]

{'loss': 0.1746, 'grad_norm': 10.531383514404297, 'learning_rate': 2.2615384615384615e-05, 'epoch': 1.82}


 61%|██████    | 2290/3750 [10:43<06:47,  3.58it/s]

{'loss': 0.1248, 'grad_norm': 4.2127366065979, 'learning_rate': 2.246153846153846e-05, 'epoch': 1.83}


 61%|██████▏   | 2300/3750 [10:46<06:44,  3.58it/s]

{'loss': 0.1782, 'grad_norm': 9.937251091003418, 'learning_rate': 2.230769230769231e-05, 'epoch': 1.84}


 62%|██████▏   | 2310/3750 [10:49<06:41,  3.58it/s]

{'loss': 0.1452, 'grad_norm': 4.602688312530518, 'learning_rate': 2.2153846153846154e-05, 'epoch': 1.85}


 62%|██████▏   | 2320/3750 [10:52<06:37,  3.60it/s]

{'loss': 0.1871, 'grad_norm': 4.521955490112305, 'learning_rate': 2.2000000000000003e-05, 'epoch': 1.86}


 62%|██████▏   | 2330/3750 [10:55<06:36,  3.59it/s]

{'loss': 0.1421, 'grad_norm': 0.15466322004795074, 'learning_rate': 2.1846153846153848e-05, 'epoch': 1.86}


 62%|██████▏   | 2340/3750 [10:57<06:35,  3.57it/s]

{'loss': 0.2017, 'grad_norm': 10.106131553649902, 'learning_rate': 2.1692307692307693e-05, 'epoch': 1.87}


 63%|██████▎   | 2350/3750 [11:00<06:28,  3.61it/s]

{'loss': 0.1638, 'grad_norm': 9.047607421875, 'learning_rate': 2.1538461538461542e-05, 'epoch': 1.88}


 63%|██████▎   | 2360/3750 [11:03<06:29,  3.57it/s]

{'loss': 0.1586, 'grad_norm': 3.201390027999878, 'learning_rate': 2.1384615384615387e-05, 'epoch': 1.89}


 63%|██████▎   | 2370/3750 [11:06<06:24,  3.59it/s]

{'loss': 0.1273, 'grad_norm': 0.9151481986045837, 'learning_rate': 2.1230769230769233e-05, 'epoch': 1.9}


 63%|██████▎   | 2380/3750 [11:09<06:20,  3.60it/s]

{'loss': 0.152, 'grad_norm': 18.44281578063965, 'learning_rate': 2.1076923076923078e-05, 'epoch': 1.9}


 64%|██████▎   | 2390/3750 [11:11<06:20,  3.58it/s]

{'loss': 0.193, 'grad_norm': 4.823342323303223, 'learning_rate': 2.0923076923076923e-05, 'epoch': 1.91}


 64%|██████▍   | 2400/3750 [11:14<06:16,  3.59it/s]

{'loss': 0.197, 'grad_norm': 6.412694931030273, 'learning_rate': 2.0769230769230772e-05, 'epoch': 1.92}


 64%|██████▍   | 2410/3750 [11:17<06:14,  3.58it/s]

{'loss': 0.1456, 'grad_norm': 12.367769241333008, 'learning_rate': 2.0615384615384617e-05, 'epoch': 1.93}


 65%|██████▍   | 2420/3750 [11:20<06:12,  3.57it/s]

{'loss': 0.2734, 'grad_norm': 1.467171549797058, 'learning_rate': 2.0461538461538462e-05, 'epoch': 1.94}


 65%|██████▍   | 2430/3750 [11:23<06:07,  3.59it/s]

{'loss': 0.0866, 'grad_norm': 4.153721809387207, 'learning_rate': 2.0307692307692308e-05, 'epoch': 1.94}


 65%|██████▌   | 2440/3750 [11:25<06:03,  3.61it/s]

{'loss': 0.1554, 'grad_norm': 11.073869705200195, 'learning_rate': 2.0153846153846153e-05, 'epoch': 1.95}


 65%|██████▌   | 2450/3750 [11:28<06:02,  3.58it/s]

{'loss': 0.1083, 'grad_norm': 0.2674141824245453, 'learning_rate': 2e-05, 'epoch': 1.96}


 66%|██████▌   | 2460/3750 [11:31<05:59,  3.59it/s]

{'loss': 0.2405, 'grad_norm': 5.067114353179932, 'learning_rate': 1.9846153846153847e-05, 'epoch': 1.97}


 66%|██████▌   | 2470/3750 [11:34<05:55,  3.60it/s]

{'loss': 0.1137, 'grad_norm': 11.777588844299316, 'learning_rate': 1.9692307692307692e-05, 'epoch': 1.98}


 66%|██████▌   | 2480/3750 [11:37<05:53,  3.59it/s]

{'loss': 0.1286, 'grad_norm': 2.485685110092163, 'learning_rate': 1.9538461538461537e-05, 'epoch': 1.98}


 66%|██████▋   | 2490/3750 [11:39<05:51,  3.58it/s]

{'loss': 0.1638, 'grad_norm': 8.599584579467773, 'learning_rate': 1.9384615384615383e-05, 'epoch': 1.99}


 67%|██████▋   | 2500/3750 [11:42<05:49,  3.58it/s]

{'loss': 0.2368, 'grad_norm': 8.808673858642578, 'learning_rate': 1.923076923076923e-05, 'epoch': 2.0}


 67%|██████▋   | 2510/3750 [11:46<06:28,  3.19it/s]

{'loss': 0.0461, 'grad_norm': 8.25570011138916, 'learning_rate': 1.9076923076923077e-05, 'epoch': 2.01}


 67%|██████▋   | 2520/3750 [11:49<05:59,  3.42it/s]

{'loss': 0.0804, 'grad_norm': 0.07504568994045258, 'learning_rate': 1.8923076923076925e-05, 'epoch': 2.02}


 67%|██████▋   | 2530/3750 [11:52<05:47,  3.51it/s]

{'loss': 0.089, 'grad_norm': 7.590095043182373, 'learning_rate': 1.876923076923077e-05, 'epoch': 2.02}


 68%|██████▊   | 2540/3750 [11:55<05:38,  3.58it/s]

{'loss': 0.1598, 'grad_norm': 11.370804786682129, 'learning_rate': 1.8615384615384616e-05, 'epoch': 2.03}


 68%|██████▊   | 2550/3750 [11:57<05:35,  3.58it/s]

{'loss': 0.1332, 'grad_norm': 17.022417068481445, 'learning_rate': 1.8461538461538465e-05, 'epoch': 2.04}


 68%|██████▊   | 2560/3750 [12:00<05:30,  3.60it/s]

{'loss': 0.0932, 'grad_norm': 15.48974609375, 'learning_rate': 1.830769230769231e-05, 'epoch': 2.05}


 69%|██████▊   | 2570/3750 [12:03<05:29,  3.59it/s]

{'loss': 0.0669, 'grad_norm': 0.43715086579322815, 'learning_rate': 1.8153846153846155e-05, 'epoch': 2.06}


 69%|██████▉   | 2580/3750 [12:06<05:25,  3.60it/s]

{'loss': 0.0571, 'grad_norm': 0.13088978826999664, 'learning_rate': 1.8e-05, 'epoch': 2.06}


 69%|██████▉   | 2590/3750 [12:09<05:22,  3.60it/s]

{'loss': 0.0624, 'grad_norm': 11.508482933044434, 'learning_rate': 1.7846153846153846e-05, 'epoch': 2.07}


 69%|██████▉   | 2600/3750 [12:11<05:18,  3.62it/s]

{'loss': 0.0744, 'grad_norm': 7.090078830718994, 'learning_rate': 1.7692307692307694e-05, 'epoch': 2.08}


 70%|██████▉   | 2610/3750 [12:14<05:15,  3.61it/s]

{'loss': 0.1487, 'grad_norm': 2.0301074981689453, 'learning_rate': 1.753846153846154e-05, 'epoch': 2.09}


 70%|██████▉   | 2620/3750 [12:17<05:13,  3.61it/s]

{'loss': 0.1718, 'grad_norm': 0.06964010745286942, 'learning_rate': 1.7384615384615385e-05, 'epoch': 2.1}


 70%|███████   | 2630/3750 [12:20<05:11,  3.60it/s]

{'loss': 0.0576, 'grad_norm': 7.865217685699463, 'learning_rate': 1.723076923076923e-05, 'epoch': 2.1}


 70%|███████   | 2640/3750 [12:22<05:08,  3.59it/s]

{'loss': 0.0417, 'grad_norm': 0.16916194558143616, 'learning_rate': 1.7076923076923076e-05, 'epoch': 2.11}


 71%|███████   | 2650/3750 [12:25<05:03,  3.63it/s]

{'loss': 0.087, 'grad_norm': 0.4039624035358429, 'learning_rate': 1.6923076923076924e-05, 'epoch': 2.12}


 71%|███████   | 2660/3750 [12:28<05:02,  3.60it/s]

{'loss': 0.0954, 'grad_norm': 21.668909072875977, 'learning_rate': 1.676923076923077e-05, 'epoch': 2.13}


 71%|███████   | 2670/3750 [12:31<04:59,  3.60it/s]

{'loss': 0.1062, 'grad_norm': 13.221015930175781, 'learning_rate': 1.6615384615384615e-05, 'epoch': 2.14}


 71%|███████▏  | 2680/3750 [12:34<04:58,  3.59it/s]

{'loss': 0.0298, 'grad_norm': 0.11467460542917252, 'learning_rate': 1.646153846153846e-05, 'epoch': 2.14}


 72%|███████▏  | 2690/3750 [12:36<04:55,  3.59it/s]

{'loss': 0.1217, 'grad_norm': 6.189194679260254, 'learning_rate': 1.630769230769231e-05, 'epoch': 2.15}


 72%|███████▏  | 2700/3750 [12:39<04:52,  3.59it/s]

{'loss': 0.0726, 'grad_norm': 0.7774955630302429, 'learning_rate': 1.6153846153846154e-05, 'epoch': 2.16}


 72%|███████▏  | 2710/3750 [12:42<04:47,  3.62it/s]

{'loss': 0.0372, 'grad_norm': 0.05205610767006874, 'learning_rate': 1.6000000000000003e-05, 'epoch': 2.17}


 73%|███████▎  | 2720/3750 [12:45<04:45,  3.60it/s]

{'loss': 0.0667, 'grad_norm': 6.524424076080322, 'learning_rate': 1.5846153846153848e-05, 'epoch': 2.18}


 73%|███████▎  | 2730/3750 [12:47<04:43,  3.60it/s]

{'loss': 0.0169, 'grad_norm': 0.11722157895565033, 'learning_rate': 1.5692307692307693e-05, 'epoch': 2.18}


 73%|███████▎  | 2740/3750 [12:50<04:40,  3.60it/s]

{'loss': 0.036, 'grad_norm': 0.06526423990726471, 'learning_rate': 1.553846153846154e-05, 'epoch': 2.19}


 73%|███████▎  | 2750/3750 [12:53<04:38,  3.59it/s]

{'loss': 0.0491, 'grad_norm': 0.03846771642565727, 'learning_rate': 1.5384615384615387e-05, 'epoch': 2.2}


 74%|███████▎  | 2760/3750 [12:56<04:33,  3.61it/s]

{'loss': 0.0582, 'grad_norm': 0.18539181351661682, 'learning_rate': 1.5230769230769232e-05, 'epoch': 2.21}


 74%|███████▍  | 2770/3750 [12:59<04:31,  3.61it/s]

{'loss': 0.0039, 'grad_norm': 0.07496995478868484, 'learning_rate': 1.5076923076923078e-05, 'epoch': 2.22}


 74%|███████▍  | 2780/3750 [13:01<04:29,  3.60it/s]

{'loss': 0.0531, 'grad_norm': 0.06286995112895966, 'learning_rate': 1.4923076923076923e-05, 'epoch': 2.22}


 74%|███████▍  | 2790/3750 [13:04<04:27,  3.59it/s]

{'loss': 0.0524, 'grad_norm': 0.1671350598335266, 'learning_rate': 1.4769230769230772e-05, 'epoch': 2.23}


 75%|███████▍  | 2800/3750 [13:07<04:23,  3.60it/s]

{'loss': 0.0158, 'grad_norm': 0.3040677309036255, 'learning_rate': 1.4615384615384617e-05, 'epoch': 2.24}


 75%|███████▍  | 2810/3750 [13:10<04:18,  3.63it/s]

{'loss': 0.0755, 'grad_norm': 0.20074288547039032, 'learning_rate': 1.4461538461538462e-05, 'epoch': 2.25}


 75%|███████▌  | 2820/3750 [13:12<04:16,  3.63it/s]

{'loss': 0.0658, 'grad_norm': 0.02447269856929779, 'learning_rate': 1.4307692307692308e-05, 'epoch': 2.26}


 75%|███████▌  | 2830/3750 [13:15<04:15,  3.61it/s]

{'loss': 0.0498, 'grad_norm': 6.654971599578857, 'learning_rate': 1.4153846153846153e-05, 'epoch': 2.26}


 76%|███████▌  | 2840/3750 [13:18<04:13,  3.59it/s]

{'loss': 0.155, 'grad_norm': 29.268953323364258, 'learning_rate': 1.4000000000000001e-05, 'epoch': 2.27}


 76%|███████▌  | 2850/3750 [13:21<04:10,  3.60it/s]

{'loss': 0.1142, 'grad_norm': 0.17462411522865295, 'learning_rate': 1.3846153846153847e-05, 'epoch': 2.28}


 76%|███████▋  | 2860/3750 [13:24<04:07,  3.60it/s]

{'loss': 0.0693, 'grad_norm': 0.23779471218585968, 'learning_rate': 1.3692307692307694e-05, 'epoch': 2.29}


 77%|███████▋  | 2870/3750 [13:26<04:07,  3.56it/s]

{'loss': 0.1013, 'grad_norm': 4.499735355377197, 'learning_rate': 1.3538461538461539e-05, 'epoch': 2.3}


 77%|███████▋  | 2880/3750 [13:29<04:02,  3.59it/s]

{'loss': 0.0338, 'grad_norm': 0.3253311514854431, 'learning_rate': 1.3384615384615384e-05, 'epoch': 2.3}


 77%|███████▋  | 2890/3750 [13:32<04:00,  3.58it/s]

{'loss': 0.0308, 'grad_norm': 0.028181837871670723, 'learning_rate': 1.3230769230769233e-05, 'epoch': 2.31}


 77%|███████▋  | 2900/3750 [13:35<03:56,  3.59it/s]

{'loss': 0.1246, 'grad_norm': 17.34284019470215, 'learning_rate': 1.3076923076923078e-05, 'epoch': 2.32}


 78%|███████▊  | 2910/3750 [13:38<03:54,  3.59it/s]

{'loss': 0.086, 'grad_norm': 1.617094874382019, 'learning_rate': 1.2923076923076924e-05, 'epoch': 2.33}


 78%|███████▊  | 2920/3750 [13:40<03:50,  3.60it/s]

{'loss': 0.0386, 'grad_norm': 2.0041558742523193, 'learning_rate': 1.2769230769230769e-05, 'epoch': 2.34}


 78%|███████▊  | 2930/3750 [13:43<03:48,  3.59it/s]

{'loss': 0.061, 'grad_norm': 0.07893304526805878, 'learning_rate': 1.2615384615384616e-05, 'epoch': 2.34}


 78%|███████▊  | 2940/3750 [13:46<03:47,  3.57it/s]

{'loss': 0.144, 'grad_norm': 0.3706083297729492, 'learning_rate': 1.2461538461538463e-05, 'epoch': 2.35}


 79%|███████▊  | 2950/3750 [13:49<03:42,  3.59it/s]

{'loss': 0.2106, 'grad_norm': 17.97142219543457, 'learning_rate': 1.230769230769231e-05, 'epoch': 2.36}


 79%|███████▉  | 2960/3750 [13:51<03:41,  3.57it/s]

{'loss': 0.1004, 'grad_norm': 0.13873209059238434, 'learning_rate': 1.2153846153846155e-05, 'epoch': 2.37}


 79%|███████▉  | 2970/3750 [13:54<03:36,  3.60it/s]

{'loss': 0.0656, 'grad_norm': 2.148344039916992, 'learning_rate': 1.2e-05, 'epoch': 2.38}


 79%|███████▉  | 2980/3750 [13:57<03:33,  3.60it/s]

{'loss': 0.0422, 'grad_norm': 0.2236303687095642, 'learning_rate': 1.1846153846153847e-05, 'epoch': 2.38}


 80%|███████▉  | 2990/3750 [14:00<03:31,  3.59it/s]

{'loss': 0.0616, 'grad_norm': 0.06638569384813309, 'learning_rate': 1.1692307692307693e-05, 'epoch': 2.39}


 80%|████████  | 3000/3750 [14:03<03:32,  3.53it/s]

{'loss': 0.0604, 'grad_norm': 9.153648376464844, 'learning_rate': 1.153846153846154e-05, 'epoch': 2.4}


 80%|████████  | 3010/3750 [14:07<04:00,  3.07it/s]

{'loss': 0.0205, 'grad_norm': 27.658798217773438, 'learning_rate': 1.1384615384615385e-05, 'epoch': 2.41}


 81%|████████  | 3020/3750 [14:10<03:34,  3.41it/s]

{'loss': 0.0494, 'grad_norm': 0.5493505597114563, 'learning_rate': 1.123076923076923e-05, 'epoch': 2.42}


 81%|████████  | 3030/3750 [14:13<03:24,  3.52it/s]

{'loss': 0.1371, 'grad_norm': 8.35908031463623, 'learning_rate': 1.1076923076923077e-05, 'epoch': 2.42}


 81%|████████  | 3040/3750 [14:15<03:20,  3.55it/s]

{'loss': 0.0704, 'grad_norm': 2.401137113571167, 'learning_rate': 1.0923076923076924e-05, 'epoch': 2.43}


 81%|████████▏ | 3050/3750 [14:18<03:15,  3.58it/s]

{'loss': 0.2071, 'grad_norm': 2.030931234359741, 'learning_rate': 1.0769230769230771e-05, 'epoch': 2.44}


 82%|████████▏ | 3060/3750 [14:21<03:12,  3.58it/s]

{'loss': 0.0243, 'grad_norm': 0.050674114376306534, 'learning_rate': 1.0615384615384616e-05, 'epoch': 2.45}


 82%|████████▏ | 3070/3750 [14:24<03:09,  3.58it/s]

{'loss': 0.0277, 'grad_norm': 0.1435876190662384, 'learning_rate': 1.0461538461538462e-05, 'epoch': 2.46}


 82%|████████▏ | 3080/3750 [14:27<03:06,  3.59it/s]

{'loss': 0.0066, 'grad_norm': 7.35247278213501, 'learning_rate': 1.0307692307692309e-05, 'epoch': 2.46}


 82%|████████▏ | 3090/3750 [14:29<03:03,  3.60it/s]

{'loss': 0.0242, 'grad_norm': 6.0272650718688965, 'learning_rate': 1.0153846153846154e-05, 'epoch': 2.47}


 83%|████████▎ | 3100/3750 [14:32<03:00,  3.59it/s]

{'loss': 0.0925, 'grad_norm': 13.899664878845215, 'learning_rate': 1e-05, 'epoch': 2.48}


 83%|████████▎ | 3110/3750 [14:35<02:57,  3.61it/s]

{'loss': 0.1405, 'grad_norm': 0.05342899262905121, 'learning_rate': 9.846153846153846e-06, 'epoch': 2.49}


 83%|████████▎ | 3120/3750 [14:38<02:55,  3.59it/s]

{'loss': 0.1393, 'grad_norm': 2.028653383255005, 'learning_rate': 9.692307692307691e-06, 'epoch': 2.5}


 83%|████████▎ | 3130/3750 [14:40<02:52,  3.59it/s]

{'loss': 0.0854, 'grad_norm': 16.376909255981445, 'learning_rate': 9.538461538461538e-06, 'epoch': 2.5}


 84%|████████▎ | 3140/3750 [14:43<02:49,  3.59it/s]

{'loss': 0.1043, 'grad_norm': 20.59807586669922, 'learning_rate': 9.384615384615385e-06, 'epoch': 2.51}


 84%|████████▍ | 3150/3750 [14:46<02:45,  3.63it/s]

{'loss': 0.0299, 'grad_norm': 1.3145867586135864, 'learning_rate': 9.230769230769232e-06, 'epoch': 2.52}


 84%|████████▍ | 3160/3750 [14:49<02:46,  3.54it/s]

{'loss': 0.0607, 'grad_norm': 0.09161373972892761, 'learning_rate': 9.076923076923078e-06, 'epoch': 2.53}


 85%|████████▍ | 3170/3750 [14:52<02:46,  3.48it/s]

{'loss': 0.1418, 'grad_norm': 0.10507866740226746, 'learning_rate': 8.923076923076923e-06, 'epoch': 2.54}


 85%|████████▍ | 3180/3750 [14:55<02:45,  3.45it/s]

{'loss': 0.0648, 'grad_norm': 0.06976453214883804, 'learning_rate': 8.76923076923077e-06, 'epoch': 2.54}


 85%|████████▌ | 3190/3750 [14:58<02:40,  3.48it/s]

{'loss': 0.105, 'grad_norm': 1.294508934020996, 'learning_rate': 8.615384615384615e-06, 'epoch': 2.55}


 85%|████████▌ | 3200/3750 [15:00<02:38,  3.48it/s]

{'loss': 0.0405, 'grad_norm': 0.6307841539382935, 'learning_rate': 8.461538461538462e-06, 'epoch': 2.56}


 86%|████████▌ | 3210/3750 [15:03<02:31,  3.57it/s]

{'loss': 0.1077, 'grad_norm': 13.022690773010254, 'learning_rate': 8.307692307692307e-06, 'epoch': 2.57}


 86%|████████▌ | 3220/3750 [15:06<02:28,  3.56it/s]

{'loss': 0.0393, 'grad_norm': 0.3822920322418213, 'learning_rate': 8.153846153846154e-06, 'epoch': 2.58}


 86%|████████▌ | 3230/3750 [15:09<02:24,  3.60it/s]

{'loss': 0.0054, 'grad_norm': 0.05250004678964615, 'learning_rate': 8.000000000000001e-06, 'epoch': 2.58}


 86%|████████▋ | 3240/3750 [15:12<02:22,  3.57it/s]

{'loss': 0.142, 'grad_norm': 2.7658209800720215, 'learning_rate': 7.846153846153847e-06, 'epoch': 2.59}


 87%|████████▋ | 3250/3750 [15:14<02:18,  3.60it/s]

{'loss': 0.091, 'grad_norm': 0.13409145176410675, 'learning_rate': 7.692307692307694e-06, 'epoch': 2.6}


 87%|████████▋ | 3260/3750 [15:17<02:16,  3.59it/s]

{'loss': 0.0243, 'grad_norm': 0.18684741854667664, 'learning_rate': 7.538461538461539e-06, 'epoch': 2.61}


 87%|████████▋ | 3270/3750 [15:20<02:13,  3.59it/s]

{'loss': 0.0751, 'grad_norm': 33.726932525634766, 'learning_rate': 7.384615384615386e-06, 'epoch': 2.62}


 87%|████████▋ | 3280/3750 [15:23<02:10,  3.59it/s]

{'loss': 0.0464, 'grad_norm': 3.0451228618621826, 'learning_rate': 7.230769230769231e-06, 'epoch': 2.62}


 88%|████████▊ | 3290/3750 [15:26<02:07,  3.59it/s]

{'loss': 0.0282, 'grad_norm': 10.584115028381348, 'learning_rate': 7.076923076923076e-06, 'epoch': 2.63}


 88%|████████▊ | 3300/3750 [15:28<02:05,  3.60it/s]

{'loss': 0.0498, 'grad_norm': 0.24351149797439575, 'learning_rate': 6.923076923076923e-06, 'epoch': 2.64}


 88%|████████▊ | 3310/3750 [15:31<02:02,  3.60it/s]

{'loss': 0.1104, 'grad_norm': 22.536577224731445, 'learning_rate': 6.7692307692307695e-06, 'epoch': 2.65}


 89%|████████▊ | 3320/3750 [15:34<01:59,  3.58it/s]

{'loss': 0.0589, 'grad_norm': 1.1167892217636108, 'learning_rate': 6.6153846153846165e-06, 'epoch': 2.66}


 89%|████████▉ | 3330/3750 [15:37<01:56,  3.59it/s]

{'loss': 0.0603, 'grad_norm': 0.295895516872406, 'learning_rate': 6.461538461538462e-06, 'epoch': 2.66}


 89%|████████▉ | 3340/3750 [15:39<01:54,  3.58it/s]

{'loss': 0.0836, 'grad_norm': 25.85028076171875, 'learning_rate': 6.307692307692308e-06, 'epoch': 2.67}


 89%|████████▉ | 3350/3750 [15:42<01:51,  3.59it/s]

{'loss': 0.0099, 'grad_norm': 0.7269555330276489, 'learning_rate': 6.153846153846155e-06, 'epoch': 2.68}


 90%|████████▉ | 3360/3750 [15:45<01:48,  3.59it/s]

{'loss': 0.0503, 'grad_norm': 0.029845768585801125, 'learning_rate': 6e-06, 'epoch': 2.69}


 90%|████████▉ | 3370/3750 [15:48<01:46,  3.58it/s]

{'loss': 0.0252, 'grad_norm': 0.02282293140888214, 'learning_rate': 5.846153846153846e-06, 'epoch': 2.7}


 90%|█████████ | 3380/3750 [15:51<01:42,  3.61it/s]

{'loss': 0.0321, 'grad_norm': 0.6986433863639832, 'learning_rate': 5.692307692307692e-06, 'epoch': 2.7}


 90%|█████████ | 3390/3750 [15:53<01:40,  3.58it/s]

{'loss': 0.0614, 'grad_norm': 0.2705259323120117, 'learning_rate': 5.5384615384615385e-06, 'epoch': 2.71}


 91%|█████████ | 3400/3750 [15:56<01:37,  3.58it/s]

{'loss': 0.0607, 'grad_norm': 0.04152591526508331, 'learning_rate': 5.3846153846153855e-06, 'epoch': 2.72}


 91%|█████████ | 3410/3750 [15:59<01:34,  3.60it/s]

{'loss': 0.0365, 'grad_norm': 0.04251774400472641, 'learning_rate': 5.230769230769231e-06, 'epoch': 2.73}


 91%|█████████ | 3420/3750 [16:02<01:32,  3.58it/s]

{'loss': 0.041, 'grad_norm': 0.044639330357313156, 'learning_rate': 5.076923076923077e-06, 'epoch': 2.74}


 91%|█████████▏| 3430/3750 [16:05<01:29,  3.58it/s]

{'loss': 0.0746, 'grad_norm': 0.07326054573059082, 'learning_rate': 4.923076923076923e-06, 'epoch': 2.74}


 92%|█████████▏| 3440/3750 [16:07<01:26,  3.57it/s]

{'loss': 0.1261, 'grad_norm': 2.45743727684021, 'learning_rate': 4.769230769230769e-06, 'epoch': 2.75}


 92%|█████████▏| 3450/3750 [16:10<01:23,  3.59it/s]

{'loss': 0.054, 'grad_norm': 11.04041862487793, 'learning_rate': 4.615384615384616e-06, 'epoch': 2.76}


 92%|█████████▏| 3460/3750 [16:13<01:20,  3.59it/s]

{'loss': 0.0227, 'grad_norm': 0.5468271374702454, 'learning_rate': 4.4615384615384614e-06, 'epoch': 2.77}


 93%|█████████▎| 3470/3750 [16:16<01:17,  3.60it/s]

{'loss': 0.1173, 'grad_norm': 0.25881141424179077, 'learning_rate': 4.3076923076923076e-06, 'epoch': 2.78}


 93%|█████████▎| 3480/3750 [16:19<01:15,  3.60it/s]

{'loss': 0.0669, 'grad_norm': 0.10198893398046494, 'learning_rate': 4.153846153846154e-06, 'epoch': 2.78}


 93%|█████████▎| 3490/3750 [16:21<01:12,  3.58it/s]

{'loss': 0.0359, 'grad_norm': 0.04224828630685806, 'learning_rate': 4.000000000000001e-06, 'epoch': 2.79}


 93%|█████████▎| 3500/3750 [16:24<01:09,  3.57it/s]

{'loss': 0.0178, 'grad_norm': 1.0152913331985474, 'learning_rate': 3.846153846153847e-06, 'epoch': 2.8}


 94%|█████████▎| 3510/3750 [16:28<01:15,  3.16it/s]

{'loss': 0.0568, 'grad_norm': 0.1165972501039505, 'learning_rate': 3.692307692307693e-06, 'epoch': 2.81}


 94%|█████████▍| 3520/3750 [16:31<01:07,  3.40it/s]

{'loss': 0.0225, 'grad_norm': 0.03276073932647705, 'learning_rate': 3.538461538461538e-06, 'epoch': 2.82}


 94%|█████████▍| 3530/3750 [16:34<01:02,  3.49it/s]

{'loss': 0.0619, 'grad_norm': 21.1452693939209, 'learning_rate': 3.3846153846153848e-06, 'epoch': 2.82}


 94%|█████████▍| 3540/3750 [16:37<00:59,  3.55it/s]

{'loss': 0.0202, 'grad_norm': 35.08075714111328, 'learning_rate': 3.230769230769231e-06, 'epoch': 2.83}


 95%|█████████▍| 3550/3750 [16:39<00:56,  3.57it/s]

{'loss': 0.119, 'grad_norm': 0.41553330421447754, 'learning_rate': 3.0769230769230774e-06, 'epoch': 2.84}


 95%|█████████▍| 3560/3750 [16:42<00:53,  3.58it/s]

{'loss': 0.0094, 'grad_norm': 1.4884700775146484, 'learning_rate': 2.923076923076923e-06, 'epoch': 2.85}


 95%|█████████▌| 3570/3750 [16:45<00:50,  3.59it/s]

{'loss': 0.0395, 'grad_norm': 0.029398983344435692, 'learning_rate': 2.7692307692307693e-06, 'epoch': 2.86}


 95%|█████████▌| 3580/3750 [16:48<00:47,  3.58it/s]

{'loss': 0.04, 'grad_norm': 0.2029891312122345, 'learning_rate': 2.6153846153846154e-06, 'epoch': 2.86}


 96%|█████████▌| 3590/3750 [16:51<00:44,  3.58it/s]

{'loss': 0.0683, 'grad_norm': 19.43035888671875, 'learning_rate': 2.4615384615384615e-06, 'epoch': 2.87}


 96%|█████████▌| 3600/3750 [16:53<00:41,  3.60it/s]

{'loss': 0.0769, 'grad_norm': 0.024663204327225685, 'learning_rate': 2.307692307692308e-06, 'epoch': 2.88}


 96%|█████████▋| 3610/3750 [16:56<00:38,  3.61it/s]

{'loss': 0.0059, 'grad_norm': 1.2733733654022217, 'learning_rate': 2.1538461538461538e-06, 'epoch': 2.89}


 97%|█████████▋| 3620/3750 [16:59<00:36,  3.59it/s]

{'loss': 0.0745, 'grad_norm': 0.08776156604290009, 'learning_rate': 2.0000000000000003e-06, 'epoch': 2.9}


 97%|█████████▋| 3630/3750 [17:02<00:33,  3.58it/s]

{'loss': 0.0365, 'grad_norm': 0.02086278237402439, 'learning_rate': 1.8461538461538465e-06, 'epoch': 2.9}


 97%|█████████▋| 3640/3750 [17:05<00:30,  3.58it/s]

{'loss': 0.0312, 'grad_norm': 0.04601680859923363, 'learning_rate': 1.6923076923076924e-06, 'epoch': 2.91}


 97%|█████████▋| 3650/3750 [17:07<00:28,  3.57it/s]

{'loss': 0.0688, 'grad_norm': 5.788278579711914, 'learning_rate': 1.5384615384615387e-06, 'epoch': 2.92}


 98%|█████████▊| 3660/3750 [17:10<00:25,  3.59it/s]

{'loss': 0.0545, 'grad_norm': 3.3892858028411865, 'learning_rate': 1.3846153846153846e-06, 'epoch': 2.93}


 98%|█████████▊| 3670/3750 [17:13<00:22,  3.60it/s]

{'loss': 0.088, 'grad_norm': 10.540987968444824, 'learning_rate': 1.2307692307692308e-06, 'epoch': 2.94}


 98%|█████████▊| 3680/3750 [17:16<00:19,  3.57it/s]

{'loss': 0.0828, 'grad_norm': 0.030204735696315765, 'learning_rate': 1.0769230769230769e-06, 'epoch': 2.94}


 98%|█████████▊| 3690/3750 [17:19<00:16,  3.57it/s]

{'loss': 0.0035, 'grad_norm': 0.05516856163740158, 'learning_rate': 9.230769230769232e-07, 'epoch': 2.95}


 99%|█████████▊| 3700/3750 [17:21<00:13,  3.59it/s]

{'loss': 0.1096, 'grad_norm': 0.48725923895835876, 'learning_rate': 7.692307692307694e-07, 'epoch': 2.96}


 99%|█████████▉| 3710/3750 [17:24<00:11,  3.58it/s]

{'loss': 0.0803, 'grad_norm': 0.06926888972520828, 'learning_rate': 6.153846153846154e-07, 'epoch': 2.97}


 99%|█████████▉| 3720/3750 [17:27<00:08,  3.59it/s]

{'loss': 0.0896, 'grad_norm': 9.963266372680664, 'learning_rate': 4.615384615384616e-07, 'epoch': 2.98}


 99%|█████████▉| 3730/3750 [17:30<00:05,  3.58it/s]

{'loss': 0.1144, 'grad_norm': 23.55409049987793, 'learning_rate': 3.076923076923077e-07, 'epoch': 2.98}


100%|█████████▉| 3740/3750 [17:32<00:02,  3.59it/s]

{'loss': 0.1277, 'grad_norm': 16.00086212158203, 'learning_rate': 1.5384615384615385e-07, 'epoch': 2.99}


100%|██████████| 3750/3750 [17:35<00:00,  3.55it/s]

{'loss': 0.0311, 'grad_norm': 0.29583197832107544, 'learning_rate': 0.0, 'epoch': 3.0}
{'train_runtime': 1055.7663, 'train_samples_per_second': 56.831, 'train_steps_per_second': 3.552, 'train_loss': 0.18773591228922207, 'epoch': 3.0}





TrainOutput(global_step=3750, training_loss=0.18773591228922207, metrics={'train_runtime': 1055.7663, 'train_samples_per_second': 56.831, 'train_steps_per_second': 3.552, 'train_loss': 0.18773591228922207, 'epoch': 3.0})