In [1]:
from transformers import Trainer, TrainingArguments, DistilBertForSequenceClassification, DistilBertTokenizerFast, DataCollatorWithPadding, pipeline
from datasets import load_metric, Dataset
import numpy as np
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
snips_file=open("Data/Snips Train Data.txt")
snips_rows=snips_file.readlines()
snips_rows[:20]

['listen O\n',
 'to O\n',
 'westbam B-artist\n',
 'alumb O\n',
 'allergic B-album\n',
 'on O\n',
 'google B-service\n',
 'music I-service\n',
 'PlayMusic\n',
 '\n',
 'add O\n',
 'step B-entity_name\n',
 'to I-entity_name\n',
 'me I-entity_name\n',
 'to O\n',
 'the O\n',
 '50 B-playlist\n',
 'clásicos I-playlist\n',
 'playlist O\n',
 'AddToPlaylist\n']

In [3]:
utterances = []
tokenized_utterances = []
labels_for_tokens = []
sequence_labels = []
utterance, tokenized_utterance, label_for_utterances = '', [], []

for snip_row in snips_rows:
    if len(snip_row) == 2:  # Skip rows with insufficient data
        continue
    if ' ' not in snip_row.strip():  # Sequence label
        sequence_label = snip_row.strip()
        if sequence_label:  # Only add non-empty labels
            sequence_labels.append(sequence_label)
        if utterance.strip():  # Avoid appending empty utterances
            utterances.append(utterance.strip())
            tokenized_utterances.append(tokenized_utterance)
            labels_for_tokens.append(label_for_utterances)
        # Reset for next sequence
        utterance = ''
        tokenized_utterance = []
        label_for_utterances = []
        continue
    token, token_label = snip_row.split(' ')
    token_label = token_label.strip()
    utterance += f'{token} '
    tokenized_utterance.append(token)
    label_for_utterances.append(token_label)


In [4]:
len(labels_for_tokens) , len(tokenized_utterances), len(utterances), len(sequence_labels)

(13084, 13084, 13084, 13084)

In [5]:
print(tokenized_utterances[0])
print(labels_for_tokens[0])
print(utterances[0])
print(sequence_labels[0])

['listen', 'to', 'westbam', 'alumb', 'allergic', 'on', 'google', 'music']
['O', 'O', 'B-artist', 'O', 'B-album', 'O', 'B-service', 'I-service']
listen to westbam alumb allergic on google music
PlayMusic


In [6]:
unique_sequence_labels=list(set(sequence_labels))
sequence_labels=[unique_sequence_labels.index(l) for l in sequence_labels]
print(len(unique_sequence_labels))
print(unique_sequence_labels)

7
['GetWeather', 'BookRestaurant', 'SearchCreativeWork', 'PlayMusic', 'SearchScreeningEvent', 'AddToPlaylist', 'RateBook']


In [7]:
from functools import reduce

unique_token_labels=list(set(reduce(lambda x, y:x+y,labels_for_tokens)))
labels_for_tokens=[[unique_token_labels.index(_) for _ in l]for l in labels_for_tokens]
print(len(unique_token_labels))

72


In [8]:
print(tokenized_utterances[0])
print(labels_for_tokens[0])
print(utterances[0])
print(sequence_labels[0])

['listen', 'to', 'westbam', 'alumb', 'allergic', 'on', 'google', 'music']
[28, 28, 55, 28, 44, 28, 41, 36]
listen to westbam alumb allergic on google music
3


In [9]:
snips_dataset=Dataset.from_dict(dict(utterance=utterances,label=sequence_labels,tokens=tokenized_utterances,token_label=labels_for_tokens))
snips_dataset=snips_dataset.train_test_split(0.2)

In [10]:
print(snips_dataset['train'][0])

{'utterance': 'restaurant in south sudan for 6', 'label': 1, 'tokens': ['restaurant', 'in', 'south', 'sudan', 'for', '6'], 'token_label': [27, 28, 53, 59, 28, 29]}


In [11]:
tokenizer=DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [12]:
def preprocess_function(examples):
    return tokenizer(examples["utterance"],truncation=True)

In [13]:
seq_clf_tokenized_snips = snips_dataset.map(preprocess_function,batched=True)

100%|██████████| 11/11 [00:00<00:00, 26.74ba/s]
100%|██████████| 3/3 [00:00<00:00, 10.51ba/s]


In [14]:
seq_clf_tokenized_snips['train'][0]

{'utterance': 'restaurant in south sudan for 6',
 'label': 1,
 'tokens': ['restaurant', 'in', 'south', 'sudan', 'for', '6'],
 'token_label': [27, 28, 53, 59, 28, 29],
 'input_ids': [101, 4825, 1999, 2148, 10411, 2005, 1020, 102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [15]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
seq_clf_model=DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',num_labels=len(unique_sequence_labels))
seq_clf_model.config.id2label={i:l for i,l in enumerate(unique_sequence_labels)}

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
seq_clf_model.config.id2label[0]

'GetWeather'

In [18]:
metric = load_metric("accuracy")
def compute_matrics(eval_pred):
    logists,labels=eval_pred
    predictions=np.argmax(logists,axis=-1)
    return metric.compute(predictions=predictions,references=labels)

In [19]:
epochs = 2
warmup_steps = len(seq_clf_tokenized_snips['train']) // 5
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

training_arguments = TrainingArguments(
    output_dir='./snips_clf/results',
    num_train_epochs=epochs,
    per_device_eval_batch_size=32,
    per_device_train_batch_size=32,
    load_best_model_at_end=True,
    warmup_steps=warmup_steps,
    weight_decay=0.05,
    logging_steps=1,
    log_level="info",
    eval_strategy='epoch',
    save_strategy='epoch',
    no_cuda=True
)

seq_clf_model.to(device)
trainer=Trainer(model=seq_clf_model,args=training_arguments,train_dataset=seq_clf_tokenized_snips['train'],eval_dataset=seq_clf_tokenized_snips['test'],compute_metrics=compute_matrics,data_collator=data_collator)



In [20]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance, token_label, tokens. If utterance, token_label, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
100%|██████████| 82/82 [00:27<00:00,  3.00it/s]


{'eval_loss': 1.946377158164978,
 'eval_model_preparation_time': 0.0011,
 'eval_accuracy': 0.13068398930072603,
 'eval_runtime': 27.6335,
 'eval_samples_per_second': 94.704,
 'eval_steps_per_second': 2.967}

In [21]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance, token_label, tokens. If utterance, token_label, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10,467
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 656
  Number of trainable parameters = 66,958,855
  0%|          | 1/656 [00:00<07:45,  1.41it/s]

{'loss': 1.9455, 'grad_norm': 1.4432977437973022, 'learning_rate': 2.3889154323936934e-08, 'epoch': 0.0}


  0%|          | 2/656 [00:01<08:27,  1.29it/s]

{'loss': 1.9424, 'grad_norm': 1.0646190643310547, 'learning_rate': 4.777830864787387e-08, 'epoch': 0.01}


  0%|          | 3/656 [00:02<07:48,  1.39it/s]

{'loss': 1.9299, 'grad_norm': 1.4507027864456177, 'learning_rate': 7.16674629718108e-08, 'epoch': 0.01}


  1%|          | 4/656 [00:02<07:40,  1.42it/s]

{'loss': 1.9542, 'grad_norm': 1.3370057344436646, 'learning_rate': 9.555661729574773e-08, 'epoch': 0.01}


  1%|          | 5/656 [00:03<07:14,  1.50it/s]

{'loss': 1.9322, 'grad_norm': 1.6780327558517456, 'learning_rate': 1.1944577161968468e-07, 'epoch': 0.02}


  1%|          | 6/656 [00:04<07:10,  1.51it/s]

{'loss': 1.9728, 'grad_norm': 1.9158755540847778, 'learning_rate': 1.433349259436216e-07, 'epoch': 0.02}


  1%|          | 7/656 [00:04<06:57,  1.56it/s]

{'loss': 1.9512, 'grad_norm': 1.4547834396362305, 'learning_rate': 1.6722408026755853e-07, 'epoch': 0.02}


  1%|          | 8/656 [00:05<06:58,  1.55it/s]

{'loss': 1.949, 'grad_norm': 1.1432669162750244, 'learning_rate': 1.9111323459149547e-07, 'epoch': 0.02}


  1%|▏         | 9/656 [00:06<07:21,  1.47it/s]

{'loss': 1.9478, 'grad_norm': 1.1437920331954956, 'learning_rate': 2.150023889154324e-07, 'epoch': 0.03}


  2%|▏         | 10/656 [00:06<07:09,  1.50it/s]

{'loss': 1.9396, 'grad_norm': 1.5416814088821411, 'learning_rate': 2.3889154323936937e-07, 'epoch': 0.03}


  2%|▏         | 11/656 [00:07<07:12,  1.49it/s]

{'loss': 1.9551, 'grad_norm': 1.435892105102539, 'learning_rate': 2.6278069756330625e-07, 'epoch': 0.03}


  2%|▏         | 12/656 [00:08<06:59,  1.53it/s]

{'loss': 1.9322, 'grad_norm': 0.9807370901107788, 'learning_rate': 2.866698518872432e-07, 'epoch': 0.04}


  2%|▏         | 13/656 [00:08<07:10,  1.49it/s]

{'loss': 1.9458, 'grad_norm': 1.4570504426956177, 'learning_rate': 3.1055900621118013e-07, 'epoch': 0.04}


  2%|▏         | 14/656 [00:09<07:19,  1.46it/s]

{'loss': 1.943, 'grad_norm': 1.888792872428894, 'learning_rate': 3.3444816053511706e-07, 'epoch': 0.04}


  2%|▏         | 15/656 [00:10<07:29,  1.43it/s]

{'loss': 1.9146, 'grad_norm': 1.1940377950668335, 'learning_rate': 3.58337314859054e-07, 'epoch': 0.05}


  2%|▏         | 16/656 [00:10<07:27,  1.43it/s]

{'loss': 1.9363, 'grad_norm': 1.5636001825332642, 'learning_rate': 3.8222646918299094e-07, 'epoch': 0.05}


  3%|▎         | 17/656 [00:12<09:00,  1.18it/s]

{'loss': 1.943, 'grad_norm': 1.3228293657302856, 'learning_rate': 4.0611562350692793e-07, 'epoch': 0.05}


  3%|▎         | 18/656 [00:12<08:47,  1.21it/s]

{'loss': 1.9463, 'grad_norm': 1.381127953529358, 'learning_rate': 4.300047778308648e-07, 'epoch': 0.05}


  3%|▎         | 19/656 [00:13<08:31,  1.24it/s]

{'loss': 1.9542, 'grad_norm': 1.9230003356933594, 'learning_rate': 4.5389393215480175e-07, 'epoch': 0.06}


  3%|▎         | 20/656 [00:14<08:03,  1.32it/s]

{'loss': 1.9548, 'grad_norm': 1.630460500717163, 'learning_rate': 4.777830864787387e-07, 'epoch': 0.06}


  3%|▎         | 21/656 [00:15<08:25,  1.26it/s]

{'loss': 1.9509, 'grad_norm': 1.0899946689605713, 'learning_rate': 5.016722408026756e-07, 'epoch': 0.06}


  3%|▎         | 22/656 [00:15<08:04,  1.31it/s]

{'loss': 1.9624, 'grad_norm': 1.7421908378601074, 'learning_rate': 5.255613951266125e-07, 'epoch': 0.07}


  4%|▎         | 23/656 [00:16<08:02,  1.31it/s]

{'loss': 1.9511, 'grad_norm': 1.2420904636383057, 'learning_rate': 5.494505494505495e-07, 'epoch': 0.07}


  4%|▎         | 24/656 [00:17<07:48,  1.35it/s]

{'loss': 1.9464, 'grad_norm': 1.5845413208007812, 'learning_rate': 5.733397037744864e-07, 'epoch': 0.07}


  4%|▍         | 25/656 [00:18<07:38,  1.38it/s]

{'loss': 1.9438, 'grad_norm': 1.6815409660339355, 'learning_rate': 5.972288580984234e-07, 'epoch': 0.08}


  4%|▍         | 26/656 [00:18<08:07,  1.29it/s]

{'loss': 1.9412, 'grad_norm': 1.702203392982483, 'learning_rate': 6.211180124223603e-07, 'epoch': 0.08}


  4%|▍         | 27/656 [00:19<07:45,  1.35it/s]

{'loss': 1.9274, 'grad_norm': 1.8972746133804321, 'learning_rate': 6.450071667462972e-07, 'epoch': 0.08}


  4%|▍         | 28/656 [00:20<07:25,  1.41it/s]

{'loss': 1.9572, 'grad_norm': 1.3279168605804443, 'learning_rate': 6.688963210702341e-07, 'epoch': 0.09}


  4%|▍         | 29/656 [00:21<07:45,  1.35it/s]

{'loss': 1.9369, 'grad_norm': 1.3134055137634277, 'learning_rate': 6.92785475394171e-07, 'epoch': 0.09}


  5%|▍         | 30/656 [00:21<07:58,  1.31it/s]

{'loss': 1.9245, 'grad_norm': 1.390013575553894, 'learning_rate': 7.16674629718108e-07, 'epoch': 0.09}


  5%|▍         | 31/656 [00:22<08:33,  1.22it/s]

{'loss': 1.935, 'grad_norm': 1.8091869354248047, 'learning_rate': 7.405637840420449e-07, 'epoch': 0.09}


  5%|▍         | 32/656 [00:23<07:52,  1.32it/s]

{'loss': 1.9451, 'grad_norm': 1.7240056991577148, 'learning_rate': 7.644529383659819e-07, 'epoch': 0.1}


  5%|▌         | 33/656 [00:23<07:20,  1.41it/s]

{'loss': 1.9573, 'grad_norm': 1.816876769065857, 'learning_rate': 7.883420926899189e-07, 'epoch': 0.1}


  5%|▌         | 34/656 [00:24<08:04,  1.28it/s]

{'loss': 1.9404, 'grad_norm': 1.4961719512939453, 'learning_rate': 8.122312470138559e-07, 'epoch': 0.1}


  5%|▌         | 35/656 [00:25<07:48,  1.33it/s]

{'loss': 1.9384, 'grad_norm': 1.6171088218688965, 'learning_rate': 8.361204013377926e-07, 'epoch': 0.11}


  5%|▌         | 36/656 [00:26<07:37,  1.36it/s]

{'loss': 1.9371, 'grad_norm': 1.3435227870941162, 'learning_rate': 8.600095556617296e-07, 'epoch': 0.11}


  6%|▌         | 37/656 [00:26<07:13,  1.43it/s]

{'loss': 1.9515, 'grad_norm': 1.3693970441818237, 'learning_rate': 8.838987099856666e-07, 'epoch': 0.11}


  6%|▌         | 38/656 [00:27<07:31,  1.37it/s]

{'loss': 1.9736, 'grad_norm': 1.4439680576324463, 'learning_rate': 9.077878643096035e-07, 'epoch': 0.12}


  6%|▌         | 39/656 [00:28<07:37,  1.35it/s]

{'loss': 1.9329, 'grad_norm': 1.4406108856201172, 'learning_rate': 9.316770186335405e-07, 'epoch': 0.12}


  6%|▌         | 40/656 [00:29<07:36,  1.35it/s]

{'loss': 1.9205, 'grad_norm': 1.0910903215408325, 'learning_rate': 9.555661729574775e-07, 'epoch': 0.12}


  6%|▋         | 41/656 [00:30<07:44,  1.32it/s]

{'loss': 1.9551, 'grad_norm': 1.2705150842666626, 'learning_rate': 9.794553272814141e-07, 'epoch': 0.12}


  6%|▋         | 42/656 [00:30<07:24,  1.38it/s]

{'loss': 1.9515, 'grad_norm': 1.3831474781036377, 'learning_rate': 1.0033444816053512e-06, 'epoch': 0.13}


  7%|▋         | 43/656 [00:31<07:19,  1.40it/s]

{'loss': 1.9365, 'grad_norm': 1.2725448608398438, 'learning_rate': 1.0272336359292883e-06, 'epoch': 0.13}


  7%|▋         | 44/656 [00:32<07:05,  1.44it/s]

{'loss': 1.945, 'grad_norm': 1.4916247129440308, 'learning_rate': 1.051122790253225e-06, 'epoch': 0.13}


  7%|▋         | 45/656 [00:32<07:53,  1.29it/s]

{'loss': 1.9349, 'grad_norm': 1.6604797840118408, 'learning_rate': 1.0750119445771621e-06, 'epoch': 0.14}


  7%|▋         | 46/656 [00:33<07:34,  1.34it/s]

{'loss': 1.917, 'grad_norm': 1.1781011819839478, 'learning_rate': 1.098901098901099e-06, 'epoch': 0.14}


  7%|▋         | 47/656 [00:34<07:16,  1.39it/s]

{'loss': 1.9375, 'grad_norm': 1.2646667957305908, 'learning_rate': 1.1227902532250359e-06, 'epoch': 0.14}


  7%|▋         | 48/656 [00:35<07:34,  1.34it/s]

{'loss': 1.943, 'grad_norm': 1.5699973106384277, 'learning_rate': 1.1466794075489728e-06, 'epoch': 0.15}


  7%|▋         | 49/656 [00:35<07:21,  1.38it/s]

{'loss': 1.9189, 'grad_norm': 1.567204475402832, 'learning_rate': 1.1705685618729096e-06, 'epoch': 0.15}


  8%|▊         | 50/656 [00:36<07:02,  1.44it/s]

{'loss': 1.944, 'grad_norm': 1.0927590131759644, 'learning_rate': 1.1944577161968467e-06, 'epoch': 0.15}


  8%|▊         | 51/656 [00:37<06:54,  1.46it/s]

{'loss': 1.9353, 'grad_norm': 1.438379168510437, 'learning_rate': 1.2183468705207836e-06, 'epoch': 0.16}


  8%|▊         | 52/656 [00:37<06:43,  1.50it/s]

{'loss': 1.9321, 'grad_norm': 1.6643247604370117, 'learning_rate': 1.2422360248447205e-06, 'epoch': 0.16}


  8%|▊         | 53/656 [00:38<07:01,  1.43it/s]

{'loss': 1.9437, 'grad_norm': 1.2488200664520264, 'learning_rate': 1.2661251791686574e-06, 'epoch': 0.16}


  8%|▊         | 54/656 [00:39<06:48,  1.47it/s]

{'loss': 1.925, 'grad_norm': 1.1120716333389282, 'learning_rate': 1.2900143334925945e-06, 'epoch': 0.16}


  8%|▊         | 55/656 [00:39<06:32,  1.53it/s]

{'loss': 1.9215, 'grad_norm': 1.3697572946548462, 'learning_rate': 1.3139034878165314e-06, 'epoch': 0.17}


  9%|▊         | 56/656 [00:40<06:47,  1.47it/s]

{'loss': 1.9413, 'grad_norm': 1.5769597291946411, 'learning_rate': 1.3377926421404683e-06, 'epoch': 0.17}


  9%|▊         | 57/656 [00:41<06:52,  1.45it/s]

{'loss': 1.92, 'grad_norm': 1.6575016975402832, 'learning_rate': 1.3616817964644054e-06, 'epoch': 0.17}


  9%|▉         | 58/656 [00:41<07:00,  1.42it/s]

{'loss': 1.9246, 'grad_norm': 1.652542233467102, 'learning_rate': 1.385570950788342e-06, 'epoch': 0.18}


  9%|▉         | 59/656 [00:42<07:10,  1.39it/s]

{'loss': 1.9434, 'grad_norm': 1.8105080127716064, 'learning_rate': 1.4094601051122791e-06, 'epoch': 0.18}


  9%|▉         | 60/656 [00:43<07:08,  1.39it/s]

{'loss': 1.9357, 'grad_norm': 1.5905681848526, 'learning_rate': 1.433349259436216e-06, 'epoch': 0.18}


  9%|▉         | 61/656 [00:44<07:00,  1.41it/s]

{'loss': 1.9337, 'grad_norm': 1.454654574394226, 'learning_rate': 1.4572384137601529e-06, 'epoch': 0.19}


  9%|▉         | 62/656 [00:44<06:50,  1.45it/s]

{'loss': 1.9341, 'grad_norm': 1.6869041919708252, 'learning_rate': 1.4811275680840898e-06, 'epoch': 0.19}


 10%|▉         | 63/656 [00:45<06:59,  1.41it/s]

{'loss': 1.9441, 'grad_norm': 1.9693626165390015, 'learning_rate': 1.5050167224080269e-06, 'epoch': 0.19}


 10%|▉         | 64/656 [00:46<06:50,  1.44it/s]

{'loss': 1.9297, 'grad_norm': 1.3012022972106934, 'learning_rate': 1.5289058767319638e-06, 'epoch': 0.2}


 10%|▉         | 65/656 [00:46<07:07,  1.38it/s]

{'loss': 1.9315, 'grad_norm': 1.380439281463623, 'learning_rate': 1.5527950310559006e-06, 'epoch': 0.2}


 10%|█         | 66/656 [00:47<07:05,  1.39it/s]

{'loss': 1.9249, 'grad_norm': 1.483946681022644, 'learning_rate': 1.5766841853798377e-06, 'epoch': 0.2}


 10%|█         | 67/656 [00:48<07:13,  1.36it/s]

{'loss': 1.923, 'grad_norm': 1.426619291305542, 'learning_rate': 1.6005733397037744e-06, 'epoch': 0.2}


 10%|█         | 68/656 [00:49<07:19,  1.34it/s]

{'loss': 1.9475, 'grad_norm': 1.4265121221542358, 'learning_rate': 1.6244624940277117e-06, 'epoch': 0.21}


 11%|█         | 69/656 [00:49<06:53,  1.42it/s]

{'loss': 1.9373, 'grad_norm': 1.2414612770080566, 'learning_rate': 1.6483516483516484e-06, 'epoch': 0.21}


 11%|█         | 70/656 [00:50<08:01,  1.22it/s]

{'loss': 1.9153, 'grad_norm': 1.9849259853363037, 'learning_rate': 1.6722408026755853e-06, 'epoch': 0.21}


 11%|█         | 71/656 [00:51<07:46,  1.25it/s]

{'loss': 1.9417, 'grad_norm': 1.9192640781402588, 'learning_rate': 1.6961299569995224e-06, 'epoch': 0.22}


 11%|█         | 72/656 [00:52<07:16,  1.34it/s]

{'loss': 1.9119, 'grad_norm': 1.3688355684280396, 'learning_rate': 1.7200191113234592e-06, 'epoch': 0.22}


 11%|█         | 73/656 [00:52<07:10,  1.35it/s]

{'loss': 1.9244, 'grad_norm': 1.6726330518722534, 'learning_rate': 1.7439082656473961e-06, 'epoch': 0.22}


 11%|█▏        | 74/656 [00:53<06:50,  1.42it/s]

{'loss': 1.9301, 'grad_norm': 1.2292064428329468, 'learning_rate': 1.7677974199713332e-06, 'epoch': 0.23}


 11%|█▏        | 75/656 [00:54<06:37,  1.46it/s]

{'loss': 1.9016, 'grad_norm': 1.2116615772247314, 'learning_rate': 1.7916865742952701e-06, 'epoch': 0.23}


 12%|█▏        | 76/656 [00:54<06:36,  1.46it/s]

{'loss': 1.9082, 'grad_norm': 1.447011113166809, 'learning_rate': 1.815575728619207e-06, 'epoch': 0.23}


 12%|█▏        | 77/656 [00:55<07:33,  1.28it/s]

{'loss': 1.8971, 'grad_norm': 1.4442239999771118, 'learning_rate': 1.839464882943144e-06, 'epoch': 0.23}


 12%|█▏        | 78/656 [00:56<07:34,  1.27it/s]

{'loss': 1.9048, 'grad_norm': 1.3840874433517456, 'learning_rate': 1.863354037267081e-06, 'epoch': 0.24}


 12%|█▏        | 79/656 [00:57<07:31,  1.28it/s]

{'loss': 1.9345, 'grad_norm': 1.7927457094192505, 'learning_rate': 1.8872431915910176e-06, 'epoch': 0.24}


 12%|█▏        | 80/656 [00:58<07:11,  1.34it/s]

{'loss': 1.9013, 'grad_norm': 1.5950207710266113, 'learning_rate': 1.911132345914955e-06, 'epoch': 0.24}


 12%|█▏        | 81/656 [00:58<07:10,  1.34it/s]

{'loss': 1.9188, 'grad_norm': 1.3933790922164917, 'learning_rate': 1.935021500238892e-06, 'epoch': 0.25}


 12%|█▎        | 82/656 [00:59<06:54,  1.38it/s]

{'loss': 1.9003, 'grad_norm': 1.2670931816101074, 'learning_rate': 1.9589106545628283e-06, 'epoch': 0.25}


 13%|█▎        | 83/656 [01:00<06:43,  1.42it/s]

{'loss': 1.9107, 'grad_norm': 1.6834073066711426, 'learning_rate': 1.9827998088867656e-06, 'epoch': 0.25}


 13%|█▎        | 84/656 [01:00<06:45,  1.41it/s]

{'loss': 1.8919, 'grad_norm': 2.1414198875427246, 'learning_rate': 2.0066889632107025e-06, 'epoch': 0.26}


 13%|█▎        | 85/656 [01:01<06:45,  1.41it/s]

{'loss': 1.9187, 'grad_norm': 1.784510612487793, 'learning_rate': 2.0305781175346394e-06, 'epoch': 0.26}


 13%|█▎        | 86/656 [01:02<06:52,  1.38it/s]

{'loss': 1.9142, 'grad_norm': 1.3933253288269043, 'learning_rate': 2.0544672718585767e-06, 'epoch': 0.26}


 13%|█▎        | 87/656 [01:03<06:50,  1.39it/s]

{'loss': 1.9083, 'grad_norm': 1.4456576108932495, 'learning_rate': 2.078356426182513e-06, 'epoch': 0.27}


 13%|█▎        | 88/656 [01:03<06:40,  1.42it/s]

{'loss': 1.9, 'grad_norm': 1.3715941905975342, 'learning_rate': 2.10224558050645e-06, 'epoch': 0.27}


 14%|█▎        | 89/656 [01:04<07:13,  1.31it/s]

{'loss': 1.8813, 'grad_norm': 1.6212892532348633, 'learning_rate': 2.1261347348303873e-06, 'epoch': 0.27}


 14%|█▎        | 90/656 [01:05<07:39,  1.23it/s]

{'loss': 1.8766, 'grad_norm': 2.2416000366210938, 'learning_rate': 2.1500238891543242e-06, 'epoch': 0.27}


 14%|█▍        | 91/656 [01:06<07:49,  1.20it/s]

{'loss': 1.86, 'grad_norm': 1.9385963678359985, 'learning_rate': 2.173913043478261e-06, 'epoch': 0.28}


 14%|█▍        | 92/656 [01:07<07:29,  1.25it/s]

{'loss': 1.8901, 'grad_norm': 1.5549730062484741, 'learning_rate': 2.197802197802198e-06, 'epoch': 0.28}


 14%|█▍        | 93/656 [01:07<07:02,  1.33it/s]

{'loss': 1.8607, 'grad_norm': 1.9687564373016357, 'learning_rate': 2.221691352126135e-06, 'epoch': 0.28}


 14%|█▍        | 94/656 [01:08<06:56,  1.35it/s]

{'loss': 1.8752, 'grad_norm': 1.6237633228302002, 'learning_rate': 2.2455805064500718e-06, 'epoch': 0.29}


 14%|█▍        | 95/656 [01:09<07:13,  1.30it/s]

{'loss': 1.898, 'grad_norm': 1.7520009279251099, 'learning_rate': 2.269469660774009e-06, 'epoch': 0.29}


 15%|█▍        | 96/656 [01:10<07:29,  1.25it/s]

{'loss': 1.8862, 'grad_norm': 1.6218838691711426, 'learning_rate': 2.2933588150979455e-06, 'epoch': 0.29}


 15%|█▍        | 97/656 [01:11<07:35,  1.23it/s]

{'loss': 1.8712, 'grad_norm': 1.763873815536499, 'learning_rate': 2.3172479694218824e-06, 'epoch': 0.3}


 15%|█▍        | 98/656 [01:11<07:23,  1.26it/s]

{'loss': 1.8843, 'grad_norm': 2.141645908355713, 'learning_rate': 2.3411371237458193e-06, 'epoch': 0.3}


 15%|█▌        | 99/656 [01:12<07:01,  1.32it/s]

{'loss': 1.8732, 'grad_norm': 1.9323457479476929, 'learning_rate': 2.3650262780697566e-06, 'epoch': 0.3}


 15%|█▌        | 100/656 [01:13<06:50,  1.35it/s]

{'loss': 1.8803, 'grad_norm': 1.7635473012924194, 'learning_rate': 2.3889154323936935e-06, 'epoch': 0.3}


 15%|█▌        | 101/656 [01:14<07:18,  1.26it/s]

{'loss': 1.8845, 'grad_norm': 1.9296090602874756, 'learning_rate': 2.41280458671763e-06, 'epoch': 0.31}


 16%|█▌        | 102/656 [01:14<07:22,  1.25it/s]

{'loss': 1.8665, 'grad_norm': 2.0374293327331543, 'learning_rate': 2.4366937410415673e-06, 'epoch': 0.31}


 16%|█▌        | 103/656 [01:15<07:04,  1.30it/s]

{'loss': 1.8413, 'grad_norm': 2.0912058353424072, 'learning_rate': 2.460582895365504e-06, 'epoch': 0.31}


 16%|█▌        | 104/656 [01:16<07:08,  1.29it/s]

{'loss': 1.8773, 'grad_norm': 1.8961671590805054, 'learning_rate': 2.484472049689441e-06, 'epoch': 0.32}


 16%|█▌        | 105/656 [01:17<07:29,  1.23it/s]

{'loss': 1.8225, 'grad_norm': 2.070122241973877, 'learning_rate': 2.508361204013378e-06, 'epoch': 0.32}


 16%|█▌        | 106/656 [01:18<07:08,  1.28it/s]

{'loss': 1.8546, 'grad_norm': 1.8770238161087036, 'learning_rate': 2.5322503583373148e-06, 'epoch': 0.32}


 16%|█▋        | 107/656 [01:18<07:06,  1.29it/s]

{'loss': 1.859, 'grad_norm': 1.7812708616256714, 'learning_rate': 2.5561395126612517e-06, 'epoch': 0.33}


 16%|█▋        | 108/656 [01:19<07:16,  1.25it/s]

{'loss': 1.85, 'grad_norm': 2.125661611557007, 'learning_rate': 2.580028666985189e-06, 'epoch': 0.33}


 17%|█▋        | 109/656 [01:20<06:52,  1.33it/s]

{'loss': 1.8031, 'grad_norm': 2.4000613689422607, 'learning_rate': 2.603917821309126e-06, 'epoch': 0.33}


 17%|█▋        | 110/656 [01:21<06:49,  1.33it/s]

{'loss': 1.8499, 'grad_norm': 1.827552080154419, 'learning_rate': 2.6278069756330627e-06, 'epoch': 0.34}


 17%|█▋        | 111/656 [01:21<06:33,  1.38it/s]

{'loss': 1.8515, 'grad_norm': 1.8014472723007202, 'learning_rate': 2.6516961299569996e-06, 'epoch': 0.34}


 17%|█▋        | 112/656 [01:22<06:33,  1.38it/s]

{'loss': 1.8826, 'grad_norm': 1.9468860626220703, 'learning_rate': 2.6755852842809365e-06, 'epoch': 0.34}


 17%|█▋        | 113/656 [01:23<06:47,  1.33it/s]

{'loss': 1.8224, 'grad_norm': 2.1344213485717773, 'learning_rate': 2.6994744386048734e-06, 'epoch': 0.34}


 17%|█▋        | 114/656 [01:24<07:16,  1.24it/s]

{'loss': 1.8199, 'grad_norm': 1.8812611103057861, 'learning_rate': 2.7233635929288107e-06, 'epoch': 0.35}


 18%|█▊        | 115/656 [01:24<07:04,  1.27it/s]

{'loss': 1.8612, 'grad_norm': 1.9136022329330444, 'learning_rate': 2.747252747252747e-06, 'epoch': 0.35}


 18%|█▊        | 116/656 [01:25<07:28,  1.20it/s]

{'loss': 1.8558, 'grad_norm': 2.257063865661621, 'learning_rate': 2.771141901576684e-06, 'epoch': 0.35}


 18%|█▊        | 117/656 [01:26<07:09,  1.26it/s]

{'loss': 1.8175, 'grad_norm': 2.5907464027404785, 'learning_rate': 2.7950310559006214e-06, 'epoch': 0.36}


 18%|█▊        | 118/656 [01:27<06:51,  1.31it/s]

{'loss': 1.7642, 'grad_norm': 2.608336925506592, 'learning_rate': 2.8189202102245582e-06, 'epoch': 0.36}


 18%|█▊        | 119/656 [01:27<06:33,  1.36it/s]

{'loss': 1.7264, 'grad_norm': 2.8370518684387207, 'learning_rate': 2.842809364548495e-06, 'epoch': 0.36}


 18%|█▊        | 120/656 [01:28<06:20,  1.41it/s]

{'loss': 1.8237, 'grad_norm': 2.2739288806915283, 'learning_rate': 2.866698518872432e-06, 'epoch': 0.37}


 18%|█▊        | 121/656 [01:29<06:30,  1.37it/s]

{'loss': 1.8459, 'grad_norm': 2.2081801891326904, 'learning_rate': 2.890587673196369e-06, 'epoch': 0.37}


 19%|█▊        | 122/656 [01:30<06:33,  1.36it/s]

{'loss': 1.7606, 'grad_norm': 2.1906251907348633, 'learning_rate': 2.9144768275203058e-06, 'epoch': 0.37}


 19%|█▉        | 123/656 [01:30<06:50,  1.30it/s]

{'loss': 1.8094, 'grad_norm': 2.1454203128814697, 'learning_rate': 2.938365981844243e-06, 'epoch': 0.38}


 19%|█▉        | 124/656 [01:31<06:45,  1.31it/s]

{'loss': 1.7812, 'grad_norm': 2.1622326374053955, 'learning_rate': 2.9622551361681795e-06, 'epoch': 0.38}


 19%|█▉        | 125/656 [01:32<07:01,  1.26it/s]

{'loss': 1.768, 'grad_norm': 2.136599540710449, 'learning_rate': 2.9861442904921164e-06, 'epoch': 0.38}


 19%|█▉        | 126/656 [01:33<06:34,  1.34it/s]

{'loss': 1.7713, 'grad_norm': 2.3433988094329834, 'learning_rate': 3.0100334448160537e-06, 'epoch': 0.38}


 19%|█▉        | 127/656 [01:34<06:48,  1.30it/s]

{'loss': 1.7113, 'grad_norm': 2.491931200027466, 'learning_rate': 3.0339225991399906e-06, 'epoch': 0.39}


 20%|█▉        | 128/656 [01:34<07:07,  1.24it/s]

{'loss': 1.6743, 'grad_norm': 2.4677722454071045, 'learning_rate': 3.0578117534639275e-06, 'epoch': 0.39}


 20%|█▉        | 129/656 [01:35<06:50,  1.28it/s]

{'loss': 1.7883, 'grad_norm': 2.3686647415161133, 'learning_rate': 3.0817009077878644e-06, 'epoch': 0.39}


 20%|█▉        | 130/656 [01:36<06:35,  1.33it/s]

{'loss': 1.7547, 'grad_norm': 2.2539358139038086, 'learning_rate': 3.1055900621118013e-06, 'epoch': 0.4}


 20%|█▉        | 131/656 [01:37<06:42,  1.30it/s]

{'loss': 1.7123, 'grad_norm': 2.5220530033111572, 'learning_rate': 3.1294792164357386e-06, 'epoch': 0.4}


 20%|██        | 132/656 [01:37<06:29,  1.34it/s]

{'loss': 1.7842, 'grad_norm': 2.244122266769409, 'learning_rate': 3.1533683707596755e-06, 'epoch': 0.4}


 20%|██        | 133/656 [01:38<06:33,  1.33it/s]

{'loss': 1.6034, 'grad_norm': 2.7733469009399414, 'learning_rate': 3.1772575250836123e-06, 'epoch': 0.41}


 20%|██        | 134/656 [01:39<06:31,  1.33it/s]

{'loss': 1.7195, 'grad_norm': 2.382829427719116, 'learning_rate': 3.201146679407549e-06, 'epoch': 0.41}


 21%|██        | 135/656 [01:40<06:24,  1.35it/s]

{'loss': 1.726, 'grad_norm': 2.3399124145507812, 'learning_rate': 3.2250358337314857e-06, 'epoch': 0.41}


 21%|██        | 136/656 [01:40<06:29,  1.33it/s]

{'loss': 1.6853, 'grad_norm': 2.7336483001708984, 'learning_rate': 3.2489249880554234e-06, 'epoch': 0.41}


 21%|██        | 137/656 [01:41<06:27,  1.34it/s]

{'loss': 1.7013, 'grad_norm': 2.6678178310394287, 'learning_rate': 3.2728141423793603e-06, 'epoch': 0.42}


 21%|██        | 138/656 [01:42<06:18,  1.37it/s]

{'loss': 1.7458, 'grad_norm': 2.5175726413726807, 'learning_rate': 3.2967032967032968e-06, 'epoch': 0.42}


 21%|██        | 139/656 [01:43<06:14,  1.38it/s]

{'loss': 1.7185, 'grad_norm': 2.2715721130371094, 'learning_rate': 3.3205924510272337e-06, 'epoch': 0.42}


 21%|██▏       | 140/656 [01:43<06:12,  1.39it/s]

{'loss': 1.6476, 'grad_norm': 2.796717405319214, 'learning_rate': 3.3444816053511705e-06, 'epoch': 0.43}


 21%|██▏       | 141/656 [01:44<06:06,  1.41it/s]

{'loss': 1.7382, 'grad_norm': 2.483579635620117, 'learning_rate': 3.3683707596751074e-06, 'epoch': 0.43}


 22%|██▏       | 142/656 [01:45<06:58,  1.23it/s]

{'loss': 1.6929, 'grad_norm': 2.4720091819763184, 'learning_rate': 3.3922599139990447e-06, 'epoch': 0.43}


 22%|██▏       | 143/656 [01:46<06:52,  1.24it/s]

{'loss': 1.685, 'grad_norm': 2.4987540245056152, 'learning_rate': 3.4161490683229816e-06, 'epoch': 0.44}


 22%|██▏       | 144/656 [01:46<06:25,  1.33it/s]

{'loss': 1.6823, 'grad_norm': 2.689889669418335, 'learning_rate': 3.4400382226469185e-06, 'epoch': 0.44}


 22%|██▏       | 145/656 [01:47<06:13,  1.37it/s]

{'loss': 1.7456, 'grad_norm': 3.1395130157470703, 'learning_rate': 3.4639273769708554e-06, 'epoch': 0.44}


 22%|██▏       | 146/656 [01:48<06:06,  1.39it/s]

{'loss': 1.6769, 'grad_norm': 2.8021938800811768, 'learning_rate': 3.4878165312947923e-06, 'epoch': 0.45}


 22%|██▏       | 147/656 [01:49<06:19,  1.34it/s]

{'loss': 1.6594, 'grad_norm': 2.5220582485198975, 'learning_rate': 3.511705685618729e-06, 'epoch': 0.45}


 23%|██▎       | 148/656 [01:49<06:11,  1.37it/s]

{'loss': 1.5672, 'grad_norm': 2.9786365032196045, 'learning_rate': 3.5355948399426665e-06, 'epoch': 0.45}


 23%|██▎       | 149/656 [01:50<06:22,  1.33it/s]

{'loss': 1.579, 'grad_norm': 2.9342076778411865, 'learning_rate': 3.5594839942666033e-06, 'epoch': 0.45}


 23%|██▎       | 150/656 [01:51<06:39,  1.27it/s]

{'loss': 1.6008, 'grad_norm': 2.4854447841644287, 'learning_rate': 3.5833731485905402e-06, 'epoch': 0.46}


 23%|██▎       | 151/656 [01:52<06:31,  1.29it/s]

{'loss': 1.5959, 'grad_norm': 2.3987858295440674, 'learning_rate': 3.607262302914477e-06, 'epoch': 0.46}


 23%|██▎       | 152/656 [01:53<07:10,  1.17it/s]

{'loss': 1.6453, 'grad_norm': 2.5083651542663574, 'learning_rate': 3.631151457238414e-06, 'epoch': 0.46}


 23%|██▎       | 153/656 [01:54<07:04,  1.19it/s]

{'loss': 1.6308, 'grad_norm': 2.5975637435913086, 'learning_rate': 3.6550406115623505e-06, 'epoch': 0.47}


 23%|██▎       | 154/656 [01:54<06:44,  1.24it/s]

{'loss': 1.6041, 'grad_norm': 2.788851499557495, 'learning_rate': 3.678929765886288e-06, 'epoch': 0.47}


 24%|██▎       | 155/656 [01:55<06:17,  1.33it/s]

{'loss': 1.5184, 'grad_norm': 2.4868226051330566, 'learning_rate': 3.702818920210225e-06, 'epoch': 0.47}


 24%|██▍       | 156/656 [01:56<06:06,  1.36it/s]

{'loss': 1.5578, 'grad_norm': 2.432229518890381, 'learning_rate': 3.726708074534162e-06, 'epoch': 0.48}


 24%|██▍       | 157/656 [01:57<06:36,  1.26it/s]

{'loss': 1.5142, 'grad_norm': 2.5199577808380127, 'learning_rate': 3.7505972288580984e-06, 'epoch': 0.48}


 24%|██▍       | 158/656 [01:57<06:24,  1.29it/s]

{'loss': 1.5514, 'grad_norm': 2.653651475906372, 'learning_rate': 3.7744863831820353e-06, 'epoch': 0.48}


 24%|██▍       | 159/656 [01:58<06:16,  1.32it/s]

{'loss': 1.5166, 'grad_norm': 3.0443530082702637, 'learning_rate': 3.798375537505972e-06, 'epoch': 0.48}


 24%|██▍       | 160/656 [01:59<05:58,  1.39it/s]

{'loss': 1.4388, 'grad_norm': 3.6469497680664062, 'learning_rate': 3.82226469182991e-06, 'epoch': 0.49}


 25%|██▍       | 161/656 [01:59<05:53,  1.40it/s]

{'loss': 1.5613, 'grad_norm': 2.915172815322876, 'learning_rate': 3.846153846153847e-06, 'epoch': 0.49}


 25%|██▍       | 162/656 [02:00<05:49,  1.41it/s]

{'loss': 1.4596, 'grad_norm': 2.8211653232574463, 'learning_rate': 3.870043000477784e-06, 'epoch': 0.49}


 25%|██▍       | 163/656 [02:01<05:41,  1.44it/s]

{'loss': 1.5057, 'grad_norm': 2.988311290740967, 'learning_rate': 3.8939321548017206e-06, 'epoch': 0.5}


 25%|██▌       | 164/656 [02:01<05:57,  1.37it/s]

{'loss': 1.5182, 'grad_norm': 2.6381986141204834, 'learning_rate': 3.917821309125657e-06, 'epoch': 0.5}


 25%|██▌       | 165/656 [02:02<06:12,  1.32it/s]

{'loss': 1.517, 'grad_norm': 2.7037973403930664, 'learning_rate': 3.9417104634495935e-06, 'epoch': 0.5}


 25%|██▌       | 166/656 [02:03<05:57,  1.37it/s]

{'loss': 1.4574, 'grad_norm': 2.5211732387542725, 'learning_rate': 3.965599617773531e-06, 'epoch': 0.51}


 25%|██▌       | 167/656 [02:04<06:44,  1.21it/s]

{'loss': 1.4888, 'grad_norm': 2.6703200340270996, 'learning_rate': 3.989488772097468e-06, 'epoch': 0.51}


 26%|██▌       | 168/656 [02:05<06:26,  1.26it/s]

{'loss': 1.4747, 'grad_norm': 2.7097597122192383, 'learning_rate': 4.013377926421405e-06, 'epoch': 0.51}


 26%|██▌       | 169/656 [02:06<06:48,  1.19it/s]

{'loss': 1.3794, 'grad_norm': 2.6368935108184814, 'learning_rate': 4.037267080745342e-06, 'epoch': 0.52}


 26%|██▌       | 170/656 [02:06<06:29,  1.25it/s]

{'loss': 1.4432, 'grad_norm': 3.0640764236450195, 'learning_rate': 4.061156235069279e-06, 'epoch': 0.52}


 26%|██▌       | 171/656 [02:07<06:38,  1.22it/s]

{'loss': 1.4415, 'grad_norm': 2.4611101150512695, 'learning_rate': 4.085045389393216e-06, 'epoch': 0.52}


 26%|██▌       | 172/656 [02:08<06:58,  1.16it/s]

{'loss': 1.3964, 'grad_norm': 2.761427402496338, 'learning_rate': 4.108934543717153e-06, 'epoch': 0.52}


 26%|██▋       | 173/656 [02:09<06:58,  1.16it/s]

{'loss': 1.243, 'grad_norm': 3.5474750995635986, 'learning_rate': 4.132823698041089e-06, 'epoch': 0.53}


 27%|██▋       | 174/656 [02:10<06:32,  1.23it/s]

{'loss': 1.3639, 'grad_norm': 3.210853338241577, 'learning_rate': 4.156712852365026e-06, 'epoch': 0.53}


 27%|██▋       | 175/656 [02:11<06:26,  1.24it/s]

{'loss': 1.4783, 'grad_norm': 3.080810546875, 'learning_rate': 4.180602006688963e-06, 'epoch': 0.53}


 27%|██▋       | 176/656 [02:11<06:05,  1.31it/s]

{'loss': 1.3043, 'grad_norm': 3.203864097595215, 'learning_rate': 4.2044911610129e-06, 'epoch': 0.54}


 27%|██▋       | 177/656 [02:12<05:59,  1.33it/s]

{'loss': 1.4349, 'grad_norm': 3.0943498611450195, 'learning_rate': 4.228380315336837e-06, 'epoch': 0.54}


 27%|██▋       | 178/656 [02:13<05:50,  1.37it/s]

{'loss': 1.3327, 'grad_norm': 3.074859857559204, 'learning_rate': 4.252269469660775e-06, 'epoch': 0.54}


 27%|██▋       | 179/656 [02:13<06:00,  1.32it/s]

{'loss': 1.3265, 'grad_norm': 2.827507257461548, 'learning_rate': 4.2761586239847116e-06, 'epoch': 0.55}


 27%|██▋       | 180/656 [02:14<06:03,  1.31it/s]

{'loss': 1.3348, 'grad_norm': 2.956279754638672, 'learning_rate': 4.3000477783086484e-06, 'epoch': 0.55}


 28%|██▊       | 181/656 [02:15<06:05,  1.30it/s]

{'loss': 1.3366, 'grad_norm': 2.7649085521698, 'learning_rate': 4.323936932632585e-06, 'epoch': 0.55}


 28%|██▊       | 182/656 [02:16<05:57,  1.33it/s]

{'loss': 1.3869, 'grad_norm': 2.6250147819519043, 'learning_rate': 4.347826086956522e-06, 'epoch': 0.55}


 28%|██▊       | 183/656 [02:17<06:23,  1.23it/s]

{'loss': 1.2973, 'grad_norm': 2.8248636722564697, 'learning_rate': 4.371715241280458e-06, 'epoch': 0.56}


 28%|██▊       | 184/656 [02:18<06:36,  1.19it/s]

{'loss': 1.2736, 'grad_norm': 2.9205496311187744, 'learning_rate': 4.395604395604396e-06, 'epoch': 0.56}


 28%|██▊       | 185/656 [02:18<06:28,  1.21it/s]

{'loss': 1.278, 'grad_norm': 2.7817869186401367, 'learning_rate': 4.419493549928333e-06, 'epoch': 0.56}


 28%|██▊       | 186/656 [02:19<06:34,  1.19it/s]

{'loss': 1.241, 'grad_norm': 3.004554271697998, 'learning_rate': 4.44338270425227e-06, 'epoch': 0.57}


 29%|██▊       | 187/656 [02:20<06:25,  1.22it/s]

{'loss': 1.252, 'grad_norm': 3.039501905441284, 'learning_rate': 4.467271858576207e-06, 'epoch': 0.57}


 29%|██▊       | 188/656 [02:21<06:02,  1.29it/s]

{'loss': 1.2127, 'grad_norm': 2.7452948093414307, 'learning_rate': 4.4911610129001435e-06, 'epoch': 0.57}


 29%|██▉       | 189/656 [02:21<05:58,  1.30it/s]

{'loss': 1.2017, 'grad_norm': 2.748102903366089, 'learning_rate': 4.51505016722408e-06, 'epoch': 0.58}


 29%|██▉       | 190/656 [02:22<05:52,  1.32it/s]

{'loss': 1.2095, 'grad_norm': 2.9851784706115723, 'learning_rate': 4.538939321548018e-06, 'epoch': 0.58}


 29%|██▉       | 191/656 [02:23<05:47,  1.34it/s]

{'loss': 1.1384, 'grad_norm': 2.615001916885376, 'learning_rate': 4.562828475871954e-06, 'epoch': 0.58}


 29%|██▉       | 192/656 [02:24<05:39,  1.37it/s]

{'loss': 1.2743, 'grad_norm': 2.876547336578369, 'learning_rate': 4.586717630195891e-06, 'epoch': 0.59}


 29%|██▉       | 193/656 [02:24<05:54,  1.31it/s]

{'loss': 1.1366, 'grad_norm': 2.6531286239624023, 'learning_rate': 4.610606784519828e-06, 'epoch': 0.59}


 30%|██▉       | 194/656 [02:25<05:51,  1.31it/s]

{'loss': 1.3117, 'grad_norm': 3.04011607170105, 'learning_rate': 4.634495938843765e-06, 'epoch': 0.59}


 30%|██▉       | 195/656 [02:26<05:41,  1.35it/s]

{'loss': 1.1603, 'grad_norm': 2.7655632495880127, 'learning_rate': 4.658385093167702e-06, 'epoch': 0.59}


 30%|██▉       | 196/656 [02:27<05:34,  1.37it/s]

{'loss': 1.0792, 'grad_norm': 2.99257230758667, 'learning_rate': 4.682274247491639e-06, 'epoch': 0.6}


 30%|███       | 197/656 [02:27<05:37,  1.36it/s]

{'loss': 1.2065, 'grad_norm': 2.9069771766662598, 'learning_rate': 4.706163401815576e-06, 'epoch': 0.6}


 30%|███       | 198/656 [02:28<05:31,  1.38it/s]

{'loss': 1.0729, 'grad_norm': 2.7830188274383545, 'learning_rate': 4.730052556139513e-06, 'epoch': 0.6}


 30%|███       | 199/656 [02:29<05:18,  1.43it/s]

{'loss': 1.093, 'grad_norm': 2.8322062492370605, 'learning_rate': 4.75394171046345e-06, 'epoch': 0.61}


 30%|███       | 200/656 [02:29<05:20,  1.42it/s]

{'loss': 1.1867, 'grad_norm': 2.693209171295166, 'learning_rate': 4.777830864787387e-06, 'epoch': 0.61}


 31%|███       | 201/656 [02:30<05:22,  1.41it/s]

{'loss': 1.1572, 'grad_norm': 2.872690200805664, 'learning_rate': 4.801720019111324e-06, 'epoch': 0.61}


 31%|███       | 202/656 [02:31<05:12,  1.45it/s]

{'loss': 1.1861, 'grad_norm': 2.888486385345459, 'learning_rate': 4.82560917343526e-06, 'epoch': 0.62}


 31%|███       | 203/656 [02:31<05:21,  1.41it/s]

{'loss': 1.1157, 'grad_norm': 2.536937952041626, 'learning_rate': 4.849498327759198e-06, 'epoch': 0.62}


 31%|███       | 204/656 [02:32<05:22,  1.40it/s]

{'loss': 1.111, 'grad_norm': 2.8327555656433105, 'learning_rate': 4.8733874820831345e-06, 'epoch': 0.62}


 31%|███▏      | 205/656 [02:33<05:31,  1.36it/s]

{'loss': 0.9253, 'grad_norm': 3.091426134109497, 'learning_rate': 4.897276636407071e-06, 'epoch': 0.62}


 31%|███▏      | 206/656 [02:34<05:37,  1.33it/s]

{'loss': 0.986, 'grad_norm': 2.696272611618042, 'learning_rate': 4.921165790731008e-06, 'epoch': 0.63}


 32%|███▏      | 207/656 [02:34<05:29,  1.36it/s]

{'loss': 0.9721, 'grad_norm': 2.7272121906280518, 'learning_rate': 4.945054945054945e-06, 'epoch': 0.63}


 32%|███▏      | 208/656 [02:35<05:23,  1.38it/s]

{'loss': 0.9524, 'grad_norm': 2.5935449600219727, 'learning_rate': 4.968944099378882e-06, 'epoch': 0.63}


 32%|███▏      | 209/656 [02:36<05:31,  1.35it/s]

{'loss': 1.0253, 'grad_norm': 2.8424906730651855, 'learning_rate': 4.99283325370282e-06, 'epoch': 0.64}


 32%|███▏      | 210/656 [02:37<05:48,  1.28it/s]

{'loss': 1.0209, 'grad_norm': 3.213871717453003, 'learning_rate': 5.016722408026756e-06, 'epoch': 0.64}


 32%|███▏      | 211/656 [02:38<06:05,  1.22it/s]

{'loss': 1.0145, 'grad_norm': 2.783653736114502, 'learning_rate': 5.040611562350693e-06, 'epoch': 0.64}


 32%|███▏      | 212/656 [02:38<05:50,  1.27it/s]

{'loss': 1.0509, 'grad_norm': 2.8532488346099854, 'learning_rate': 5.0645007166746296e-06, 'epoch': 0.65}


 32%|███▏      | 213/656 [02:39<05:33,  1.33it/s]

{'loss': 0.9839, 'grad_norm': 2.7445790767669678, 'learning_rate': 5.0883898709985665e-06, 'epoch': 0.65}


 33%|███▎      | 214/656 [02:40<05:49,  1.26it/s]

{'loss': 0.8905, 'grad_norm': 2.626033067703247, 'learning_rate': 5.112279025322503e-06, 'epoch': 0.65}


 33%|███▎      | 215/656 [02:41<05:44,  1.28it/s]

{'loss': 1.0111, 'grad_norm': 2.73091197013855, 'learning_rate': 5.136168179646441e-06, 'epoch': 0.66}


 33%|███▎      | 216/656 [02:42<05:52,  1.25it/s]

{'loss': 0.9975, 'grad_norm': 3.103853464126587, 'learning_rate': 5.160057333970378e-06, 'epoch': 0.66}


 33%|███▎      | 217/656 [02:42<06:01,  1.21it/s]

{'loss': 0.9294, 'grad_norm': 2.5571556091308594, 'learning_rate': 5.183946488294315e-06, 'epoch': 0.66}


 33%|███▎      | 218/656 [02:43<06:07,  1.19it/s]

{'loss': 0.9699, 'grad_norm': 3.1058382987976074, 'learning_rate': 5.207835642618252e-06, 'epoch': 0.66}


 33%|███▎      | 219/656 [02:44<05:48,  1.25it/s]

{'loss': 0.9041, 'grad_norm': 2.450540781021118, 'learning_rate': 5.231724796942189e-06, 'epoch': 0.67}


 34%|███▎      | 220/656 [02:45<05:42,  1.27it/s]

{'loss': 0.8752, 'grad_norm': 2.33476185798645, 'learning_rate': 5.2556139512661255e-06, 'epoch': 0.67}


 34%|███▎      | 221/656 [02:46<05:46,  1.26it/s]

{'loss': 0.9571, 'grad_norm': 2.8748135566711426, 'learning_rate': 5.279503105590062e-06, 'epoch': 0.67}


 34%|███▍      | 222/656 [02:46<05:40,  1.27it/s]

{'loss': 0.8455, 'grad_norm': 2.6859283447265625, 'learning_rate': 5.303392259913999e-06, 'epoch': 0.68}


 34%|███▍      | 223/656 [02:47<05:40,  1.27it/s]

{'loss': 0.8878, 'grad_norm': 2.673405885696411, 'learning_rate': 5.327281414237936e-06, 'epoch': 0.68}


 34%|███▍      | 224/656 [02:48<05:24,  1.33it/s]

{'loss': 0.9445, 'grad_norm': 3.1029469966888428, 'learning_rate': 5.351170568561873e-06, 'epoch': 0.68}


 34%|███▍      | 225/656 [02:49<05:16,  1.36it/s]

{'loss': 0.8778, 'grad_norm': 2.9740281105041504, 'learning_rate': 5.37505972288581e-06, 'epoch': 0.69}


 34%|███▍      | 226/656 [02:49<05:18,  1.35it/s]

{'loss': 0.7802, 'grad_norm': 2.4614996910095215, 'learning_rate': 5.398948877209747e-06, 'epoch': 0.69}


 35%|███▍      | 227/656 [02:50<05:08,  1.39it/s]

{'loss': 0.8625, 'grad_norm': 2.720747709274292, 'learning_rate': 5.4228380315336845e-06, 'epoch': 0.69}


 35%|███▍      | 228/656 [02:51<04:57,  1.44it/s]

{'loss': 0.8321, 'grad_norm': 2.726412057876587, 'learning_rate': 5.446727185857621e-06, 'epoch': 0.7}


 35%|███▍      | 229/656 [02:51<05:04,  1.40it/s]

{'loss': 0.7584, 'grad_norm': 2.3367345333099365, 'learning_rate': 5.4706163401815574e-06, 'epoch': 0.7}


 35%|███▌      | 230/656 [02:52<04:54,  1.44it/s]

{'loss': 0.7628, 'grad_norm': 2.7363343238830566, 'learning_rate': 5.494505494505494e-06, 'epoch': 0.7}


 35%|███▌      | 231/656 [02:53<05:02,  1.41it/s]

{'loss': 0.7941, 'grad_norm': 2.9247536659240723, 'learning_rate': 5.518394648829431e-06, 'epoch': 0.7}


 35%|███▌      | 232/656 [02:54<05:15,  1.34it/s]

{'loss': 0.7385, 'grad_norm': 2.4823853969573975, 'learning_rate': 5.542283803153368e-06, 'epoch': 0.71}


 36%|███▌      | 233/656 [02:54<05:24,  1.30it/s]

{'loss': 0.7922, 'grad_norm': 2.649897813796997, 'learning_rate': 5.566172957477306e-06, 'epoch': 0.71}


 36%|███▌      | 234/656 [02:55<05:33,  1.27it/s]

{'loss': 0.6872, 'grad_norm': 2.589578628540039, 'learning_rate': 5.590062111801243e-06, 'epoch': 0.71}


 36%|███▌      | 235/656 [02:56<05:44,  1.22it/s]

{'loss': 0.7756, 'grad_norm': 2.287985324859619, 'learning_rate': 5.61395126612518e-06, 'epoch': 0.72}


 36%|███▌      | 236/656 [02:57<05:20,  1.31it/s]

{'loss': 0.8206, 'grad_norm': 2.671402931213379, 'learning_rate': 5.6378404204491165e-06, 'epoch': 0.72}


 36%|███▌      | 237/656 [02:58<05:23,  1.30it/s]

{'loss': 0.7988, 'grad_norm': 2.821091651916504, 'learning_rate': 5.661729574773053e-06, 'epoch': 0.72}


 36%|███▋      | 238/656 [02:58<05:13,  1.33it/s]

{'loss': 0.8673, 'grad_norm': 3.2886717319488525, 'learning_rate': 5.68561872909699e-06, 'epoch': 0.73}


 36%|███▋      | 239/656 [02:59<05:03,  1.38it/s]

{'loss': 0.717, 'grad_norm': 2.5689034461975098, 'learning_rate': 5.709507883420927e-06, 'epoch': 0.73}


 37%|███▋      | 240/656 [03:00<05:13,  1.33it/s]

{'loss': 0.8404, 'grad_norm': 2.5030980110168457, 'learning_rate': 5.733397037744864e-06, 'epoch': 0.73}


 37%|███▋      | 241/656 [03:00<05:06,  1.35it/s]

{'loss': 0.6185, 'grad_norm': 2.441394090652466, 'learning_rate': 5.757286192068801e-06, 'epoch': 0.73}


 37%|███▋      | 242/656 [03:01<05:04,  1.36it/s]

{'loss': 0.7533, 'grad_norm': 2.4936130046844482, 'learning_rate': 5.781175346392738e-06, 'epoch': 0.74}


 37%|███▋      | 243/656 [03:02<05:06,  1.35it/s]

{'loss': 0.6573, 'grad_norm': 2.2685611248016357, 'learning_rate': 5.805064500716675e-06, 'epoch': 0.74}


 37%|███▋      | 244/656 [03:03<05:03,  1.36it/s]

{'loss': 0.667, 'grad_norm': 2.3946971893310547, 'learning_rate': 5.8289536550406116e-06, 'epoch': 0.74}


 37%|███▋      | 245/656 [03:03<05:06,  1.34it/s]

{'loss': 0.6311, 'grad_norm': 2.266460657119751, 'learning_rate': 5.852842809364549e-06, 'epoch': 0.75}


 38%|███▊      | 246/656 [03:04<04:55,  1.39it/s]

{'loss': 0.6684, 'grad_norm': 2.5012171268463135, 'learning_rate': 5.876731963688486e-06, 'epoch': 0.75}


 38%|███▊      | 247/656 [03:05<05:10,  1.32it/s]

{'loss': 0.6599, 'grad_norm': 2.5723297595977783, 'learning_rate': 5.900621118012423e-06, 'epoch': 0.75}


 38%|███▊      | 248/656 [03:06<04:58,  1.37it/s]

{'loss': 0.63, 'grad_norm': 2.229067087173462, 'learning_rate': 5.924510272336359e-06, 'epoch': 0.76}


 38%|███▊      | 249/656 [03:06<05:00,  1.35it/s]

{'loss': 0.6179, 'grad_norm': 2.334834098815918, 'learning_rate': 5.948399426660296e-06, 'epoch': 0.76}


 38%|███▊      | 250/656 [03:07<05:13,  1.30it/s]

{'loss': 0.6587, 'grad_norm': 2.327226161956787, 'learning_rate': 5.972288580984233e-06, 'epoch': 0.76}


 38%|███▊      | 251/656 [03:08<05:14,  1.29it/s]

{'loss': 0.5825, 'grad_norm': 2.3874704837799072, 'learning_rate': 5.996177735308171e-06, 'epoch': 0.77}


 38%|███▊      | 252/656 [03:09<05:22,  1.25it/s]

{'loss': 0.6862, 'grad_norm': 2.495335578918457, 'learning_rate': 6.0200668896321075e-06, 'epoch': 0.77}


 39%|███▊      | 253/656 [03:10<05:09,  1.30it/s]

{'loss': 0.474, 'grad_norm': 1.8747754096984863, 'learning_rate': 6.043956043956044e-06, 'epoch': 0.77}


 39%|███▊      | 254/656 [03:11<05:30,  1.22it/s]

{'loss': 0.5977, 'grad_norm': 2.3244032859802246, 'learning_rate': 6.067845198279981e-06, 'epoch': 0.77}


 39%|███▉      | 255/656 [03:11<05:22,  1.24it/s]

{'loss': 0.6906, 'grad_norm': 2.690936326980591, 'learning_rate': 6.091734352603918e-06, 'epoch': 0.78}


 39%|███▉      | 256/656 [03:12<05:08,  1.29it/s]

{'loss': 0.6778, 'grad_norm': 2.6601908206939697, 'learning_rate': 6.115623506927855e-06, 'epoch': 0.78}


 39%|███▉      | 257/656 [03:13<05:02,  1.32it/s]

{'loss': 0.5643, 'grad_norm': 2.2519612312316895, 'learning_rate': 6.139512661251792e-06, 'epoch': 0.78}


 39%|███▉      | 258/656 [03:13<04:58,  1.33it/s]

{'loss': 0.5891, 'grad_norm': 2.5067970752716064, 'learning_rate': 6.163401815575729e-06, 'epoch': 0.79}


 39%|███▉      | 259/656 [03:14<04:47,  1.38it/s]

{'loss': 0.5992, 'grad_norm': 2.5004591941833496, 'learning_rate': 6.187290969899666e-06, 'epoch': 0.79}


 40%|███▉      | 260/656 [03:15<04:54,  1.35it/s]

{'loss': 0.59, 'grad_norm': 2.920990228652954, 'learning_rate': 6.2111801242236025e-06, 'epoch': 0.79}


 40%|███▉      | 261/656 [03:16<04:55,  1.34it/s]

{'loss': 0.6228, 'grad_norm': 2.5518925189971924, 'learning_rate': 6.2350692785475394e-06, 'epoch': 0.8}


 40%|███▉      | 262/656 [03:16<04:56,  1.33it/s]

{'loss': 0.5763, 'grad_norm': 2.5685813426971436, 'learning_rate': 6.258958432871477e-06, 'epoch': 0.8}


 40%|████      | 263/656 [03:17<04:56,  1.33it/s]

{'loss': 0.5418, 'grad_norm': 2.277304172515869, 'learning_rate': 6.282847587195413e-06, 'epoch': 0.8}


 40%|████      | 264/656 [03:18<05:13,  1.25it/s]

{'loss': 0.484, 'grad_norm': 1.95961332321167, 'learning_rate': 6.306736741519351e-06, 'epoch': 0.8}


 40%|████      | 265/656 [03:19<05:15,  1.24it/s]

{'loss': 0.4625, 'grad_norm': 2.911977767944336, 'learning_rate': 6.330625895843287e-06, 'epoch': 0.81}


 41%|████      | 266/656 [03:20<05:02,  1.29it/s]

{'loss': 0.4658, 'grad_norm': 1.9941861629486084, 'learning_rate': 6.354515050167225e-06, 'epoch': 0.81}


 41%|████      | 267/656 [03:20<05:07,  1.26it/s]

{'loss': 0.5504, 'grad_norm': 2.3459393978118896, 'learning_rate': 6.378404204491162e-06, 'epoch': 0.81}


 41%|████      | 268/656 [03:21<05:10,  1.25it/s]

{'loss': 0.4684, 'grad_norm': 2.0669612884521484, 'learning_rate': 6.402293358815098e-06, 'epoch': 0.82}


 41%|████      | 269/656 [03:22<05:15,  1.23it/s]

{'loss': 0.5236, 'grad_norm': 2.382087230682373, 'learning_rate': 6.426182513139035e-06, 'epoch': 0.82}


 41%|████      | 270/656 [03:23<05:04,  1.27it/s]

{'loss': 0.5861, 'grad_norm': 2.6691360473632812, 'learning_rate': 6.450071667462971e-06, 'epoch': 0.82}


 41%|████▏     | 271/656 [03:23<04:42,  1.36it/s]

{'loss': 0.4799, 'grad_norm': 2.2289865016937256, 'learning_rate': 6.473960821786909e-06, 'epoch': 0.83}


 41%|████▏     | 272/656 [03:25<05:28,  1.17it/s]

{'loss': 0.4694, 'grad_norm': 2.7389283180236816, 'learning_rate': 6.497849976110847e-06, 'epoch': 0.83}


 42%|████▏     | 273/656 [03:25<05:13,  1.22it/s]

{'loss': 0.5003, 'grad_norm': 2.27628231048584, 'learning_rate': 6.521739130434783e-06, 'epoch': 0.83}


 42%|████▏     | 274/656 [03:26<05:02,  1.26it/s]

{'loss': 0.4238, 'grad_norm': 1.879025936126709, 'learning_rate': 6.545628284758721e-06, 'epoch': 0.84}


 42%|████▏     | 275/656 [03:27<05:01,  1.26it/s]

{'loss': 0.4909, 'grad_norm': 2.032163143157959, 'learning_rate': 6.569517439082657e-06, 'epoch': 0.84}


 42%|████▏     | 276/656 [03:28<04:57,  1.28it/s]

{'loss': 0.4731, 'grad_norm': 2.3588762283325195, 'learning_rate': 6.5934065934065935e-06, 'epoch': 0.84}


 42%|████▏     | 277/656 [03:28<04:57,  1.27it/s]

{'loss': 0.428, 'grad_norm': 2.170233726501465, 'learning_rate': 6.61729574773053e-06, 'epoch': 0.84}


 42%|████▏     | 278/656 [03:29<05:14,  1.20it/s]

{'loss': 0.4387, 'grad_norm': 2.1449427604675293, 'learning_rate': 6.641184902054467e-06, 'epoch': 0.85}


 43%|████▎     | 279/656 [03:30<04:52,  1.29it/s]

{'loss': 0.4875, 'grad_norm': 2.0360913276672363, 'learning_rate': 6.665074056378405e-06, 'epoch': 0.85}


 43%|████▎     | 280/656 [03:31<04:57,  1.27it/s]

{'loss': 0.4862, 'grad_norm': 2.1548891067504883, 'learning_rate': 6.688963210702341e-06, 'epoch': 0.85}


 43%|████▎     | 281/656 [03:32<05:07,  1.22it/s]

{'loss': 0.4114, 'grad_norm': 2.221281051635742, 'learning_rate': 6.712852365026279e-06, 'epoch': 0.86}


 43%|████▎     | 282/656 [03:32<04:56,  1.26it/s]

{'loss': 0.4324, 'grad_norm': 2.4579083919525146, 'learning_rate': 6.736741519350215e-06, 'epoch': 0.86}


 43%|████▎     | 283/656 [03:33<04:42,  1.32it/s]

{'loss': 0.4928, 'grad_norm': 2.091013193130493, 'learning_rate': 6.7606306736741526e-06, 'epoch': 0.86}


 43%|████▎     | 284/656 [03:34<04:41,  1.32it/s]

{'loss': 0.4671, 'grad_norm': 2.4130945205688477, 'learning_rate': 6.7845198279980895e-06, 'epoch': 0.87}


 43%|████▎     | 285/656 [03:35<04:49,  1.28it/s]

{'loss': 0.3235, 'grad_norm': 1.524186134338379, 'learning_rate': 6.808408982322026e-06, 'epoch': 0.87}


 44%|████▎     | 286/656 [03:35<04:33,  1.35it/s]

{'loss': 0.4136, 'grad_norm': 1.9746989011764526, 'learning_rate': 6.832298136645963e-06, 'epoch': 0.87}


 44%|████▍     | 287/656 [03:36<04:49,  1.27it/s]

{'loss': 0.3839, 'grad_norm': 1.994982361793518, 'learning_rate': 6.856187290969899e-06, 'epoch': 0.88}


 44%|████▍     | 288/656 [03:37<04:42,  1.30it/s]

{'loss': 0.4405, 'grad_norm': 2.191676378250122, 'learning_rate': 6.880076445293837e-06, 'epoch': 0.88}


 44%|████▍     | 289/656 [03:38<04:37,  1.32it/s]

{'loss': 0.3482, 'grad_norm': 2.0395843982696533, 'learning_rate': 6.903965599617773e-06, 'epoch': 0.88}


 44%|████▍     | 290/656 [03:39<05:08,  1.19it/s]

{'loss': 0.3557, 'grad_norm': 1.946542501449585, 'learning_rate': 6.927854753941711e-06, 'epoch': 0.88}


 44%|████▍     | 291/656 [03:39<04:55,  1.24it/s]

{'loss': 0.4735, 'grad_norm': 2.7813799381256104, 'learning_rate': 6.9517439082656485e-06, 'epoch': 0.89}


 45%|████▍     | 292/656 [03:40<04:42,  1.29it/s]

{'loss': 0.3186, 'grad_norm': 1.8898539543151855, 'learning_rate': 6.9756330625895845e-06, 'epoch': 0.89}


 45%|████▍     | 293/656 [03:41<04:40,  1.29it/s]

{'loss': 0.4076, 'grad_norm': 1.960202693939209, 'learning_rate': 6.999522216913522e-06, 'epoch': 0.89}


 45%|████▍     | 294/656 [03:42<04:56,  1.22it/s]

{'loss': 0.3603, 'grad_norm': 2.2322285175323486, 'learning_rate': 7.023411371237458e-06, 'epoch': 0.9}


 45%|████▍     | 295/656 [03:43<04:58,  1.21it/s]

{'loss': 0.3021, 'grad_norm': 1.8377439975738525, 'learning_rate': 7.047300525561395e-06, 'epoch': 0.9}


 45%|████▌     | 296/656 [03:44<05:08,  1.17it/s]

{'loss': 0.3059, 'grad_norm': 1.633162021636963, 'learning_rate': 7.071189679885333e-06, 'epoch': 0.9}


 45%|████▌     | 297/656 [03:44<05:04,  1.18it/s]

{'loss': 0.3129, 'grad_norm': 1.5659624338150024, 'learning_rate': 7.095078834209269e-06, 'epoch': 0.91}


 45%|████▌     | 298/656 [03:45<05:03,  1.18it/s]

{'loss': 0.3407, 'grad_norm': 2.346064567565918, 'learning_rate': 7.118967988533207e-06, 'epoch': 0.91}


 46%|████▌     | 299/656 [03:46<04:51,  1.22it/s]

{'loss': 0.3301, 'grad_norm': 2.3437094688415527, 'learning_rate': 7.142857142857143e-06, 'epoch': 0.91}


 46%|████▌     | 300/656 [03:47<04:48,  1.23it/s]

{'loss': 0.2946, 'grad_norm': 2.149045944213867, 'learning_rate': 7.1667462971810804e-06, 'epoch': 0.91}


 46%|████▌     | 301/656 [03:47<04:36,  1.28it/s]

{'loss': 0.3401, 'grad_norm': 3.0326554775238037, 'learning_rate': 7.1906354515050165e-06, 'epoch': 0.92}


 46%|████▌     | 302/656 [03:48<04:27,  1.32it/s]

{'loss': 0.3202, 'grad_norm': 2.3239431381225586, 'learning_rate': 7.214524605828954e-06, 'epoch': 0.92}


 46%|████▌     | 303/656 [03:49<04:18,  1.37it/s]

{'loss': 0.3151, 'grad_norm': 1.9180245399475098, 'learning_rate': 7.238413760152891e-06, 'epoch': 0.92}


 46%|████▋     | 304/656 [03:50<04:20,  1.35it/s]

{'loss': 0.3121, 'grad_norm': 1.5528316497802734, 'learning_rate': 7.262302914476828e-06, 'epoch': 0.93}


 46%|████▋     | 305/656 [03:50<04:22,  1.34it/s]

{'loss': 0.3398, 'grad_norm': 2.533323049545288, 'learning_rate': 7.286192068800765e-06, 'epoch': 0.93}


 47%|████▋     | 306/656 [03:51<04:32,  1.28it/s]

{'loss': 0.3869, 'grad_norm': 2.2235445976257324, 'learning_rate': 7.310081223124701e-06, 'epoch': 0.93}


 47%|████▋     | 307/656 [03:52<04:30,  1.29it/s]

{'loss': 0.3051, 'grad_norm': 1.8711504936218262, 'learning_rate': 7.333970377448639e-06, 'epoch': 0.94}


 47%|████▋     | 308/656 [03:53<04:18,  1.34it/s]

{'loss': 0.3008, 'grad_norm': 2.94276762008667, 'learning_rate': 7.357859531772576e-06, 'epoch': 0.94}


 47%|████▋     | 309/656 [03:53<04:19,  1.34it/s]

{'loss': 0.5276, 'grad_norm': 2.5729455947875977, 'learning_rate': 7.381748686096512e-06, 'epoch': 0.94}


 47%|████▋     | 310/656 [03:54<04:14,  1.36it/s]

{'loss': 0.2741, 'grad_norm': 1.9715197086334229, 'learning_rate': 7.40563784042045e-06, 'epoch': 0.95}


 47%|████▋     | 311/656 [03:55<04:32,  1.27it/s]

{'loss': 0.2392, 'grad_norm': 1.8708829879760742, 'learning_rate': 7.429526994744386e-06, 'epoch': 0.95}


 48%|████▊     | 312/656 [03:56<04:26,  1.29it/s]

{'loss': 0.2762, 'grad_norm': 1.9260520935058594, 'learning_rate': 7.453416149068324e-06, 'epoch': 0.95}


 48%|████▊     | 313/656 [03:57<04:47,  1.20it/s]

{'loss': 0.2369, 'grad_norm': 1.5515310764312744, 'learning_rate': 7.47730530339226e-06, 'epoch': 0.95}


 48%|████▊     | 314/656 [03:58<04:35,  1.24it/s]

{'loss': 0.316, 'grad_norm': 2.440927743911743, 'learning_rate': 7.501194457716197e-06, 'epoch': 0.96}


 48%|████▊     | 315/656 [03:58<04:31,  1.26it/s]

{'loss': 0.26, 'grad_norm': 2.713839530944824, 'learning_rate': 7.5250836120401346e-06, 'epoch': 0.96}


 48%|████▊     | 316/656 [03:59<04:34,  1.24it/s]

{'loss': 0.293, 'grad_norm': 2.1144700050354004, 'learning_rate': 7.548972766364071e-06, 'epoch': 0.96}


 48%|████▊     | 317/656 [04:00<04:36,  1.23it/s]

{'loss': 0.2918, 'grad_norm': 2.2552483081817627, 'learning_rate': 7.572861920688008e-06, 'epoch': 0.97}


 48%|████▊     | 318/656 [04:01<04:31,  1.24it/s]

{'loss': 0.2037, 'grad_norm': 1.4965226650238037, 'learning_rate': 7.596751075011944e-06, 'epoch': 0.97}


 49%|████▊     | 319/656 [04:02<04:53,  1.15it/s]

{'loss': 0.2378, 'grad_norm': 1.4478570222854614, 'learning_rate': 7.620640229335882e-06, 'epoch': 0.97}


 49%|████▉     | 320/656 [04:02<04:34,  1.22it/s]

{'loss': 0.2043, 'grad_norm': 1.209388017654419, 'learning_rate': 7.64452938365982e-06, 'epoch': 0.98}


 49%|████▉     | 321/656 [04:03<04:26,  1.26it/s]

{'loss': 0.1967, 'grad_norm': 1.0796924829483032, 'learning_rate': 7.668418537983756e-06, 'epoch': 0.98}


 49%|████▉     | 322/656 [04:04<04:40,  1.19it/s]

{'loss': 0.2168, 'grad_norm': 1.936516523361206, 'learning_rate': 7.692307692307694e-06, 'epoch': 0.98}


 49%|████▉     | 323/656 [04:05<04:34,  1.21it/s]

{'loss': 0.2535, 'grad_norm': 3.1566731929779053, 'learning_rate': 7.71619684663163e-06, 'epoch': 0.98}


 49%|████▉     | 324/656 [04:06<04:57,  1.11it/s]

{'loss': 0.2119, 'grad_norm': 1.5235729217529297, 'learning_rate': 7.740086000955567e-06, 'epoch': 0.99}


 50%|████▉     | 325/656 [04:07<04:49,  1.14it/s]

{'loss': 0.2516, 'grad_norm': 2.2045764923095703, 'learning_rate': 7.763975155279503e-06, 'epoch': 0.99}


 50%|████▉     | 326/656 [04:08<04:33,  1.21it/s]

{'loss': 0.2365, 'grad_norm': 2.080421209335327, 'learning_rate': 7.787864309603441e-06, 'epoch': 0.99}


 50%|████▉     | 327/656 [04:08<04:16,  1.28it/s]

{'loss': 0.2284, 'grad_norm': 1.4784235954284668, 'learning_rate': 7.811753463927377e-06, 'epoch': 1.0}


 50%|█████     | 328/656 [04:08<03:19,  1.64it/s]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance, token_label, tokens. If utterance, token_label, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'loss': 0.2127, 'grad_norm': 3.168280839920044, 'learning_rate': 7.835642618251313e-06, 'epoch': 1.0}


                                                 
 50%|█████     | 328/656 [04:28<03:19,  1.64it/s]Saving model checkpoint to ./snips_clf/results/checkpoint-328
Configuration saved in ./snips_clf/results/checkpoint-328/config.json
Model weights saved in ./snips_clf/results/checkpoint-328/model.safetensors


{'eval_loss': 0.21589116752147675, 'eval_model_preparation_time': 0.0011, 'eval_accuracy': 0.9736339319831868, 'eval_runtime': 19.1672, 'eval_samples_per_second': 136.535, 'eval_steps_per_second': 4.278, 'epoch': 1.0}


 50%|█████     | 329/656 [04:29<35:30,  6.52s/it]

{'loss': 0.2033, 'grad_norm': 2.059602737426758, 'learning_rate': 7.859531772575251e-06, 'epoch': 1.0}


 50%|█████     | 330/656 [04:30<26:06,  4.81s/it]

{'loss': 0.1871, 'grad_norm': 1.324660301208496, 'learning_rate': 7.883420926899187e-06, 'epoch': 1.01}


 50%|█████     | 331/656 [04:30<19:29,  3.60s/it]

{'loss': 0.1652, 'grad_norm': 0.9385837912559509, 'learning_rate': 7.907310081223125e-06, 'epoch': 1.01}


 51%|█████     | 332/656 [04:31<15:26,  2.86s/it]

{'loss': 0.2226, 'grad_norm': 2.0050296783447266, 'learning_rate': 7.931199235547062e-06, 'epoch': 1.01}


 51%|█████     | 333/656 [04:32<12:14,  2.27s/it]

{'loss': 0.2134, 'grad_norm': 2.279529571533203, 'learning_rate': 7.955088389870998e-06, 'epoch': 1.02}


 51%|█████     | 334/656 [04:33<10:05,  1.88s/it]

{'loss': 0.221, 'grad_norm': 2.1477930545806885, 'learning_rate': 7.978977544194936e-06, 'epoch': 1.02}


 51%|█████     | 335/656 [04:34<08:16,  1.55s/it]

{'loss': 0.1651, 'grad_norm': 1.04526948928833, 'learning_rate': 8.002866698518872e-06, 'epoch': 1.02}


 51%|█████     | 336/656 [04:35<07:00,  1.31s/it]

{'loss': 0.367, 'grad_norm': 2.159714460372925, 'learning_rate': 8.02675585284281e-06, 'epoch': 1.02}


 51%|█████▏    | 337/656 [04:36<06:07,  1.15s/it]

{'loss': 0.1852, 'grad_norm': 1.8255606889724731, 'learning_rate': 8.050645007166746e-06, 'epoch': 1.03}


 52%|█████▏    | 338/656 [04:37<05:45,  1.09s/it]

{'loss': 0.1963, 'grad_norm': 3.563551902770996, 'learning_rate': 8.074534161490684e-06, 'epoch': 1.03}


 52%|█████▏    | 339/656 [04:37<05:29,  1.04s/it]

{'loss': 0.3887, 'grad_norm': 2.6980762481689453, 'learning_rate': 8.098423315814621e-06, 'epoch': 1.03}


 52%|█████▏    | 340/656 [04:38<04:56,  1.06it/s]

{'loss': 0.2564, 'grad_norm': 1.962104082107544, 'learning_rate': 8.122312470138558e-06, 'epoch': 1.04}


 52%|█████▏    | 341/656 [04:39<04:49,  1.09it/s]

{'loss': 0.1686, 'grad_norm': 1.2036057710647583, 'learning_rate': 8.146201624462495e-06, 'epoch': 1.04}


 52%|█████▏    | 342/656 [04:40<04:34,  1.14it/s]

{'loss': 0.2492, 'grad_norm': 1.8928840160369873, 'learning_rate': 8.170090778786431e-06, 'epoch': 1.04}


 52%|█████▏    | 343/656 [04:41<04:38,  1.13it/s]

{'loss': 0.2447, 'grad_norm': 2.745241641998291, 'learning_rate': 8.193979933110369e-06, 'epoch': 1.05}


 52%|█████▏    | 344/656 [04:42<04:32,  1.14it/s]

{'loss': 0.1432, 'grad_norm': 0.8434262871742249, 'learning_rate': 8.217869087434307e-06, 'epoch': 1.05}


 53%|█████▎    | 345/656 [04:42<04:25,  1.17it/s]

{'loss': 0.2619, 'grad_norm': 1.890560269355774, 'learning_rate': 8.241758241758243e-06, 'epoch': 1.05}


 53%|█████▎    | 346/656 [04:43<04:13,  1.22it/s]

{'loss': 0.2629, 'grad_norm': 2.0507166385650635, 'learning_rate': 8.265647396082179e-06, 'epoch': 1.05}


 53%|█████▎    | 347/656 [04:44<04:02,  1.27it/s]

{'loss': 0.1844, 'grad_norm': 1.8046715259552002, 'learning_rate': 8.289536550406115e-06, 'epoch': 1.06}


 53%|█████▎    | 348/656 [04:45<03:58,  1.29it/s]

{'loss': 0.157, 'grad_norm': 1.4027540683746338, 'learning_rate': 8.313425704730053e-06, 'epoch': 1.06}


 53%|█████▎    | 349/656 [04:46<04:10,  1.23it/s]

{'loss': 0.1482, 'grad_norm': 1.2694125175476074, 'learning_rate': 8.337314859053989e-06, 'epoch': 1.06}


 53%|█████▎    | 350/656 [04:46<04:11,  1.22it/s]

{'loss': 0.1651, 'grad_norm': 2.0383851528167725, 'learning_rate': 8.361204013377926e-06, 'epoch': 1.07}


 54%|█████▎    | 351/656 [04:47<04:13,  1.21it/s]

{'loss': 0.1351, 'grad_norm': 1.065109133720398, 'learning_rate': 8.385093167701864e-06, 'epoch': 1.07}


 54%|█████▎    | 352/656 [04:48<04:07,  1.23it/s]

{'loss': 0.3365, 'grad_norm': 2.596759796142578, 'learning_rate': 8.4089823220258e-06, 'epoch': 1.07}


 54%|█████▍    | 353/656 [04:49<04:22,  1.16it/s]

{'loss': 0.3674, 'grad_norm': 2.9018986225128174, 'learning_rate': 8.432871476349738e-06, 'epoch': 1.08}


 54%|█████▍    | 354/656 [04:50<04:22,  1.15it/s]

{'loss': 0.2215, 'grad_norm': 1.8628467321395874, 'learning_rate': 8.456760630673674e-06, 'epoch': 1.08}


 54%|█████▍    | 355/656 [04:51<04:10,  1.20it/s]

{'loss': 0.297, 'grad_norm': 2.7726969718933105, 'learning_rate': 8.480649784997612e-06, 'epoch': 1.08}


 54%|█████▍    | 356/656 [04:52<04:23,  1.14it/s]

{'loss': 0.123, 'grad_norm': 0.8659164309501648, 'learning_rate': 8.50453893932155e-06, 'epoch': 1.09}


 54%|█████▍    | 357/656 [04:53<04:31,  1.10it/s]

{'loss': 0.1386, 'grad_norm': 1.483905553817749, 'learning_rate': 8.528428093645485e-06, 'epoch': 1.09}


 55%|█████▍    | 358/656 [04:53<04:22,  1.14it/s]

{'loss': 0.173, 'grad_norm': 2.254504680633545, 'learning_rate': 8.552317247969423e-06, 'epoch': 1.09}


 55%|█████▍    | 359/656 [04:54<04:18,  1.15it/s]

{'loss': 0.2783, 'grad_norm': 6.019668102264404, 'learning_rate': 8.576206402293359e-06, 'epoch': 1.09}


 55%|█████▍    | 360/656 [04:55<04:06,  1.20it/s]

{'loss': 0.1468, 'grad_norm': 2.102848529815674, 'learning_rate': 8.600095556617297e-06, 'epoch': 1.1}


 55%|█████▌    | 361/656 [04:56<04:29,  1.10it/s]

{'loss': 0.2011, 'grad_norm': 2.357167959213257, 'learning_rate': 8.623984710941233e-06, 'epoch': 1.1}


 55%|█████▌    | 362/656 [04:57<04:41,  1.04it/s]

{'loss': 0.1303, 'grad_norm': 1.089754343032837, 'learning_rate': 8.64787386526517e-06, 'epoch': 1.1}


 55%|█████▌    | 363/656 [04:58<04:40,  1.04it/s]

{'loss': 0.1378, 'grad_norm': 1.9017606973648071, 'learning_rate': 8.671763019589108e-06, 'epoch': 1.11}


 55%|█████▌    | 364/656 [04:59<04:29,  1.08it/s]

{'loss': 0.167, 'grad_norm': 2.9821722507476807, 'learning_rate': 8.695652173913044e-06, 'epoch': 1.11}


 56%|█████▌    | 365/656 [05:00<04:13,  1.15it/s]

{'loss': 0.1462, 'grad_norm': 1.5636444091796875, 'learning_rate': 8.71954132823698e-06, 'epoch': 1.11}


 56%|█████▌    | 366/656 [05:01<04:19,  1.12it/s]

{'loss': 0.1821, 'grad_norm': 2.3037288188934326, 'learning_rate': 8.743430482560916e-06, 'epoch': 1.12}


 56%|█████▌    | 367/656 [05:02<04:18,  1.12it/s]

{'loss': 0.168, 'grad_norm': 3.2076849937438965, 'learning_rate': 8.767319636884854e-06, 'epoch': 1.12}


 56%|█████▌    | 368/656 [05:02<04:19,  1.11it/s]

{'loss': 0.201, 'grad_norm': 3.3925161361694336, 'learning_rate': 8.791208791208792e-06, 'epoch': 1.12}


 56%|█████▋    | 369/656 [05:03<04:30,  1.06it/s]

{'loss': 0.1475, 'grad_norm': 2.502168893814087, 'learning_rate': 8.815097945532728e-06, 'epoch': 1.12}


 56%|█████▋    | 370/656 [05:04<04:24,  1.08it/s]

{'loss': 0.1272, 'grad_norm': 2.172041416168213, 'learning_rate': 8.838987099856666e-06, 'epoch': 1.13}


 57%|█████▋    | 371/656 [05:05<04:22,  1.08it/s]

{'loss': 0.1241, 'grad_norm': 2.2454326152801514, 'learning_rate': 8.862876254180602e-06, 'epoch': 1.13}


 57%|█████▋    | 372/656 [05:06<04:19,  1.10it/s]

{'loss': 0.1544, 'grad_norm': 2.839808940887451, 'learning_rate': 8.88676540850454e-06, 'epoch': 1.13}


 57%|█████▋    | 373/656 [05:07<04:09,  1.13it/s]

{'loss': 0.266, 'grad_norm': 3.8012824058532715, 'learning_rate': 8.910654562828476e-06, 'epoch': 1.14}


 57%|█████▋    | 374/656 [05:08<04:21,  1.08it/s]

{'loss': 0.1575, 'grad_norm': 2.7415378093719482, 'learning_rate': 8.934543717152413e-06, 'epoch': 1.14}


 57%|█████▋    | 375/656 [05:09<04:11,  1.12it/s]

{'loss': 0.2148, 'grad_norm': 3.165296792984009, 'learning_rate': 8.958432871476351e-06, 'epoch': 1.14}


 57%|█████▋    | 376/656 [05:10<03:52,  1.20it/s]

{'loss': 0.1736, 'grad_norm': 3.8924851417541504, 'learning_rate': 8.982322025800287e-06, 'epoch': 1.15}


 57%|█████▋    | 377/656 [05:10<04:04,  1.14it/s]

{'loss': 0.1201, 'grad_norm': 0.9355942606925964, 'learning_rate': 9.006211180124225e-06, 'epoch': 1.15}


 58%|█████▊    | 378/656 [05:11<04:02,  1.15it/s]

{'loss': 0.1337, 'grad_norm': 2.954404592514038, 'learning_rate': 9.03010033444816e-06, 'epoch': 1.15}


 58%|█████▊    | 379/656 [05:12<04:02,  1.14it/s]

{'loss': 0.2772, 'grad_norm': 5.220545291900635, 'learning_rate': 9.053989488772099e-06, 'epoch': 1.16}


 58%|█████▊    | 380/656 [05:13<03:59,  1.15it/s]

{'loss': 0.12, 'grad_norm': 1.3817365169525146, 'learning_rate': 9.077878643096036e-06, 'epoch': 1.16}


 58%|█████▊    | 381/656 [05:14<03:54,  1.17it/s]

{'loss': 0.2168, 'grad_norm': 3.546886444091797, 'learning_rate': 9.101767797419972e-06, 'epoch': 1.16}


 58%|█████▊    | 382/656 [05:15<03:55,  1.16it/s]

{'loss': 0.1226, 'grad_norm': 1.6680716276168823, 'learning_rate': 9.125656951743908e-06, 'epoch': 1.16}


 58%|█████▊    | 383/656 [05:16<03:44,  1.22it/s]

{'loss': 0.2305, 'grad_norm': 4.771048069000244, 'learning_rate': 9.149546106067846e-06, 'epoch': 1.17}


 59%|█████▊    | 384/656 [05:16<03:46,  1.20it/s]

{'loss': 0.115, 'grad_norm': 1.432926058769226, 'learning_rate': 9.173435260391782e-06, 'epoch': 1.17}


 59%|█████▊    | 385/656 [05:17<03:50,  1.18it/s]

{'loss': 0.1155, 'grad_norm': 1.0821239948272705, 'learning_rate': 9.197324414715718e-06, 'epoch': 1.17}


 59%|█████▉    | 386/656 [05:18<03:46,  1.19it/s]

{'loss': 0.1102, 'grad_norm': 1.2002803087234497, 'learning_rate': 9.221213569039656e-06, 'epoch': 1.18}


 59%|█████▉    | 387/656 [05:19<03:39,  1.22it/s]

{'loss': 0.1733, 'grad_norm': 3.5896503925323486, 'learning_rate': 9.245102723363594e-06, 'epoch': 1.18}


 59%|█████▉    | 388/656 [05:20<03:37,  1.23it/s]

{'loss': 0.1164, 'grad_norm': 1.9224451780319214, 'learning_rate': 9.26899187768753e-06, 'epoch': 1.18}


 59%|█████▉    | 389/656 [05:21<03:43,  1.20it/s]

{'loss': 0.1035, 'grad_norm': 1.2184464931488037, 'learning_rate': 9.292881032011467e-06, 'epoch': 1.19}


 59%|█████▉    | 390/656 [05:21<03:49,  1.16it/s]

{'loss': 0.1016, 'grad_norm': 0.7777795195579529, 'learning_rate': 9.316770186335403e-06, 'epoch': 1.19}


 60%|█████▉    | 391/656 [05:22<03:53,  1.14it/s]

{'loss': 0.117, 'grad_norm': 3.2291746139526367, 'learning_rate': 9.340659340659341e-06, 'epoch': 1.19}


 60%|█████▉    | 392/656 [05:23<03:57,  1.11it/s]

{'loss': 0.1947, 'grad_norm': 1.8650298118591309, 'learning_rate': 9.364548494983277e-06, 'epoch': 1.2}


 60%|█████▉    | 393/656 [05:24<03:50,  1.14it/s]

{'loss': 0.1017, 'grad_norm': 1.2657687664031982, 'learning_rate': 9.388437649307215e-06, 'epoch': 1.2}


 60%|██████    | 394/656 [05:25<03:47,  1.15it/s]

{'loss': 0.106, 'grad_norm': 0.9674388766288757, 'learning_rate': 9.412326803631153e-06, 'epoch': 1.2}


 60%|██████    | 395/656 [05:26<03:45,  1.16it/s]

{'loss': 0.0962, 'grad_norm': 0.8142465949058533, 'learning_rate': 9.436215957955089e-06, 'epoch': 1.2}


 60%|██████    | 396/656 [05:27<03:46,  1.15it/s]

{'loss': 0.0892, 'grad_norm': 0.6655052900314331, 'learning_rate': 9.460105112279026e-06, 'epoch': 1.21}


 61%|██████    | 397/656 [05:28<03:41,  1.17it/s]

{'loss': 0.0967, 'grad_norm': 1.2877894639968872, 'learning_rate': 9.483994266602962e-06, 'epoch': 1.21}


 61%|██████    | 398/656 [05:29<03:57,  1.09it/s]

{'loss': 0.1067, 'grad_norm': 2.3108270168304443, 'learning_rate': 9.5078834209269e-06, 'epoch': 1.21}


 61%|██████    | 399/656 [05:30<03:54,  1.10it/s]

{'loss': 0.1069, 'grad_norm': 2.1064977645874023, 'learning_rate': 9.531772575250838e-06, 'epoch': 1.22}


 61%|██████    | 400/656 [05:30<03:52,  1.10it/s]

{'loss': 0.0833, 'grad_norm': 0.6553444266319275, 'learning_rate': 9.555661729574774e-06, 'epoch': 1.22}


 61%|██████    | 401/656 [05:31<03:47,  1.12it/s]

{'loss': 0.1144, 'grad_norm': 2.694196939468384, 'learning_rate': 9.57955088389871e-06, 'epoch': 1.22}


 61%|██████▏   | 402/656 [05:32<03:35,  1.18it/s]

{'loss': 0.0753, 'grad_norm': 0.45661839842796326, 'learning_rate': 9.603440038222648e-06, 'epoch': 1.23}


 61%|██████▏   | 403/656 [05:33<03:47,  1.11it/s]

{'loss': 0.0916, 'grad_norm': 0.8715701699256897, 'learning_rate': 9.627329192546584e-06, 'epoch': 1.23}


 62%|██████▏   | 404/656 [05:34<03:38,  1.15it/s]

{'loss': 0.2755, 'grad_norm': 3.271035671234131, 'learning_rate': 9.65121834687052e-06, 'epoch': 1.23}


 62%|██████▏   | 405/656 [05:35<03:54,  1.07it/s]

{'loss': 0.0845, 'grad_norm': 0.948353111743927, 'learning_rate': 9.675107501194458e-06, 'epoch': 1.23}


 62%|██████▏   | 406/656 [05:36<04:01,  1.04it/s]

{'loss': 0.0857, 'grad_norm': 0.6524559855461121, 'learning_rate': 9.698996655518395e-06, 'epoch': 1.24}


 62%|██████▏   | 407/656 [05:37<03:46,  1.10it/s]

{'loss': 0.0721, 'grad_norm': 0.46586939692497253, 'learning_rate': 9.722885809842331e-06, 'epoch': 1.24}


 62%|██████▏   | 408/656 [05:38<03:39,  1.13it/s]

{'loss': 0.0815, 'grad_norm': 0.716956377029419, 'learning_rate': 9.746774964166269e-06, 'epoch': 1.24}


 62%|██████▏   | 409/656 [05:38<03:31,  1.17it/s]

{'loss': 0.0748, 'grad_norm': 0.609008252620697, 'learning_rate': 9.770664118490205e-06, 'epoch': 1.25}


 62%|██████▎   | 410/656 [05:39<03:27,  1.19it/s]

{'loss': 0.1712, 'grad_norm': 4.627167224884033, 'learning_rate': 9.794553272814143e-06, 'epoch': 1.25}


 63%|██████▎   | 411/656 [05:40<03:24,  1.20it/s]

{'loss': 0.2989, 'grad_norm': 2.6502315998077393, 'learning_rate': 9.81844242713808e-06, 'epoch': 1.25}


 63%|██████▎   | 412/656 [05:41<03:27,  1.17it/s]

{'loss': 0.0738, 'grad_norm': 0.6207338571548462, 'learning_rate': 9.842331581462017e-06, 'epoch': 1.26}


 63%|██████▎   | 413/656 [05:42<03:23,  1.19it/s]

{'loss': 0.0829, 'grad_norm': 1.6281657218933105, 'learning_rate': 9.866220735785954e-06, 'epoch': 1.26}


 63%|██████▎   | 414/656 [05:43<03:21,  1.20it/s]

{'loss': 0.144, 'grad_norm': 3.0285661220550537, 'learning_rate': 9.89010989010989e-06, 'epoch': 1.26}


 63%|██████▎   | 415/656 [05:43<03:26,  1.16it/s]

{'loss': 0.0665, 'grad_norm': 0.424868106842041, 'learning_rate': 9.913999044433828e-06, 'epoch': 1.27}


 63%|██████▎   | 416/656 [05:44<03:16,  1.22it/s]

{'loss': 0.0901, 'grad_norm': 2.4357404708862305, 'learning_rate': 9.937888198757764e-06, 'epoch': 1.27}


 64%|██████▎   | 417/656 [05:45<03:15,  1.22it/s]

{'loss': 0.0739, 'grad_norm': 0.4847167730331421, 'learning_rate': 9.961777353081702e-06, 'epoch': 1.27}


 64%|██████▎   | 418/656 [05:46<03:14,  1.23it/s]

{'loss': 0.0787, 'grad_norm': 1.0130741596221924, 'learning_rate': 9.98566650740564e-06, 'epoch': 1.27}


 64%|██████▍   | 419/656 [05:47<03:18,  1.19it/s]

{'loss': 0.1208, 'grad_norm': 3.1925339698791504, 'learning_rate': 1.0009555661729576e-05, 'epoch': 1.28}


 64%|██████▍   | 420/656 [05:47<03:14,  1.21it/s]

{'loss': 0.1154, 'grad_norm': 2.7280807495117188, 'learning_rate': 1.0033444816053512e-05, 'epoch': 1.28}


 64%|██████▍   | 421/656 [05:48<03:13,  1.22it/s]

{'loss': 0.1967, 'grad_norm': 1.1960594654083252, 'learning_rate': 1.005733397037745e-05, 'epoch': 1.28}


 64%|██████▍   | 422/656 [05:49<03:06,  1.25it/s]

{'loss': 0.2182, 'grad_norm': 4.888989448547363, 'learning_rate': 1.0081223124701385e-05, 'epoch': 1.29}


 64%|██████▍   | 423/656 [05:50<03:05,  1.26it/s]

{'loss': 0.0707, 'grad_norm': 0.5416994690895081, 'learning_rate': 1.0105112279025323e-05, 'epoch': 1.29}


 65%|██████▍   | 424/656 [05:51<03:15,  1.19it/s]

{'loss': 0.0641, 'grad_norm': 0.4470784366130829, 'learning_rate': 1.0129001433349259e-05, 'epoch': 1.29}


 65%|██████▍   | 425/656 [05:52<03:24,  1.13it/s]

{'loss': 0.0609, 'grad_norm': 0.4426923394203186, 'learning_rate': 1.0152890587673197e-05, 'epoch': 1.3}


 65%|██████▍   | 426/656 [05:52<03:13,  1.19it/s]

{'loss': 0.0802, 'grad_norm': 0.9394424557685852, 'learning_rate': 1.0176779741997133e-05, 'epoch': 1.3}


 65%|██████▌   | 427/656 [05:53<03:10,  1.20it/s]

{'loss': 0.0653, 'grad_norm': 0.5632370114326477, 'learning_rate': 1.020066889632107e-05, 'epoch': 1.3}


 65%|██████▌   | 428/656 [05:54<03:04,  1.24it/s]

{'loss': 0.0941, 'grad_norm': 1.8625839948654175, 'learning_rate': 1.0224558050645007e-05, 'epoch': 1.3}


 65%|██████▌   | 429/656 [05:55<03:01,  1.25it/s]

{'loss': 0.0675, 'grad_norm': 0.6388010382652283, 'learning_rate': 1.0248447204968944e-05, 'epoch': 1.31}


 66%|██████▌   | 430/656 [05:56<03:06,  1.21it/s]

{'loss': 0.2697, 'grad_norm': 4.4015116691589355, 'learning_rate': 1.0272336359292882e-05, 'epoch': 1.31}


 66%|██████▌   | 431/656 [05:57<03:13,  1.16it/s]

{'loss': 0.2374, 'grad_norm': 2.5878098011016846, 'learning_rate': 1.0296225513616818e-05, 'epoch': 1.31}


 66%|██████▌   | 432/656 [05:58<03:13,  1.16it/s]

{'loss': 0.1484, 'grad_norm': 2.6378252506256104, 'learning_rate': 1.0320114667940756e-05, 'epoch': 1.32}


 66%|██████▌   | 433/656 [05:58<03:05,  1.21it/s]

{'loss': 0.2024, 'grad_norm': 1.4699454307556152, 'learning_rate': 1.0344003822264692e-05, 'epoch': 1.32}


 66%|██████▌   | 434/656 [05:59<03:11,  1.16it/s]

{'loss': 0.0757, 'grad_norm': 1.8614251613616943, 'learning_rate': 1.036789297658863e-05, 'epoch': 1.32}


 66%|██████▋   | 435/656 [06:00<03:07,  1.18it/s]

{'loss': 0.2063, 'grad_norm': 0.8557084798812866, 'learning_rate': 1.0391782130912567e-05, 'epoch': 1.33}


 66%|██████▋   | 436/656 [06:01<03:14,  1.13it/s]

{'loss': 0.1032, 'grad_norm': 3.8777403831481934, 'learning_rate': 1.0415671285236503e-05, 'epoch': 1.33}


 67%|██████▋   | 437/656 [06:02<03:17,  1.11it/s]

{'loss': 0.2393, 'grad_norm': 3.657794713973999, 'learning_rate': 1.0439560439560441e-05, 'epoch': 1.33}


 67%|██████▋   | 438/656 [06:03<03:12,  1.13it/s]

{'loss': 0.0792, 'grad_norm': 1.3340959548950195, 'learning_rate': 1.0463449593884377e-05, 'epoch': 1.34}


 67%|██████▋   | 439/656 [06:04<03:09,  1.14it/s]

{'loss': 0.1177, 'grad_norm': 4.7837419509887695, 'learning_rate': 1.0487338748208313e-05, 'epoch': 1.34}


 67%|██████▋   | 440/656 [06:05<03:07,  1.15it/s]

{'loss': 0.092, 'grad_norm': 2.324965715408325, 'learning_rate': 1.0511227902532251e-05, 'epoch': 1.34}


 67%|██████▋   | 441/656 [06:06<03:18,  1.08it/s]

{'loss': 0.0707, 'grad_norm': 1.682382583618164, 'learning_rate': 1.0535117056856187e-05, 'epoch': 1.34}


 67%|██████▋   | 442/656 [06:06<03:04,  1.16it/s]

{'loss': 0.1791, 'grad_norm': 4.70548152923584, 'learning_rate': 1.0559006211180125e-05, 'epoch': 1.35}


 68%|██████▊   | 443/656 [06:07<03:07,  1.14it/s]

{'loss': 0.0946, 'grad_norm': 2.366011619567871, 'learning_rate': 1.058289536550406e-05, 'epoch': 1.35}


 68%|██████▊   | 444/656 [06:08<02:54,  1.22it/s]

{'loss': 0.2784, 'grad_norm': 9.796497344970703, 'learning_rate': 1.0606784519827999e-05, 'epoch': 1.35}


 68%|██████▊   | 445/656 [06:09<02:59,  1.18it/s]

{'loss': 0.0639, 'grad_norm': 1.3355344533920288, 'learning_rate': 1.0630673674151935e-05, 'epoch': 1.36}


 68%|██████▊   | 446/656 [06:10<02:58,  1.18it/s]

{'loss': 0.2769, 'grad_norm': 4.042390823364258, 'learning_rate': 1.0654562828475872e-05, 'epoch': 1.36}


 68%|██████▊   | 447/656 [06:11<02:59,  1.16it/s]

{'loss': 0.0723, 'grad_norm': 1.473854422569275, 'learning_rate': 1.067845198279981e-05, 'epoch': 1.36}


 68%|██████▊   | 448/656 [06:11<02:58,  1.17it/s]

{'loss': 0.0601, 'grad_norm': 0.7332204580307007, 'learning_rate': 1.0702341137123746e-05, 'epoch': 1.37}


 68%|██████▊   | 449/656 [06:12<02:50,  1.21it/s]

{'loss': 0.1177, 'grad_norm': 3.481985569000244, 'learning_rate': 1.0726230291447684e-05, 'epoch': 1.37}


 69%|██████▊   | 450/656 [06:13<02:47,  1.23it/s]

{'loss': 0.0559, 'grad_norm': 0.39278343319892883, 'learning_rate': 1.075011944577162e-05, 'epoch': 1.37}


 69%|██████▉   | 451/656 [06:14<03:02,  1.12it/s]

{'loss': 0.0555, 'grad_norm': 0.47222042083740234, 'learning_rate': 1.0774008600095558e-05, 'epoch': 1.38}


 69%|██████▉   | 452/656 [06:15<03:03,  1.11it/s]

{'loss': 0.0865, 'grad_norm': 2.945436716079712, 'learning_rate': 1.0797897754419494e-05, 'epoch': 1.38}


 69%|██████▉   | 453/656 [06:16<03:05,  1.10it/s]

{'loss': 0.0545, 'grad_norm': 0.3624827265739441, 'learning_rate': 1.0821786908743431e-05, 'epoch': 1.38}


 69%|██████▉   | 454/656 [06:17<02:53,  1.17it/s]

{'loss': 0.0599, 'grad_norm': 1.2476320266723633, 'learning_rate': 1.0845676063067369e-05, 'epoch': 1.38}


 69%|██████▉   | 455/656 [06:17<02:53,  1.16it/s]

{'loss': 0.1639, 'grad_norm': 1.2879738807678223, 'learning_rate': 1.0869565217391305e-05, 'epoch': 1.39}


 70%|██████▉   | 456/656 [06:19<03:06,  1.07it/s]

{'loss': 0.0961, 'grad_norm': 5.133727550506592, 'learning_rate': 1.0893454371715243e-05, 'epoch': 1.39}


 70%|██████▉   | 457/656 [06:20<03:08,  1.05it/s]

{'loss': 0.0597, 'grad_norm': 1.3297760486602783, 'learning_rate': 1.0917343526039179e-05, 'epoch': 1.39}


 70%|██████▉   | 458/656 [06:21<03:10,  1.04it/s]

{'loss': 0.1806, 'grad_norm': 2.162631034851074, 'learning_rate': 1.0941232680363115e-05, 'epoch': 1.4}


 70%|██████▉   | 459/656 [06:21<02:59,  1.10it/s]

{'loss': 0.0557, 'grad_norm': 0.8876288533210754, 'learning_rate': 1.0965121834687053e-05, 'epoch': 1.4}


 70%|███████   | 460/656 [06:22<02:46,  1.18it/s]

{'loss': 0.2319, 'grad_norm': 3.7396645545959473, 'learning_rate': 1.0989010989010989e-05, 'epoch': 1.4}


 70%|███████   | 461/656 [06:23<02:46,  1.17it/s]

{'loss': 0.1086, 'grad_norm': 4.756915092468262, 'learning_rate': 1.1012900143334926e-05, 'epoch': 1.41}


 70%|███████   | 462/656 [06:24<02:41,  1.20it/s]

{'loss': 0.0516, 'grad_norm': 0.4724022150039673, 'learning_rate': 1.1036789297658862e-05, 'epoch': 1.41}


 71%|███████   | 463/656 [06:25<02:41,  1.20it/s]

{'loss': 0.0698, 'grad_norm': 2.0070135593414307, 'learning_rate': 1.10606784519828e-05, 'epoch': 1.41}


 71%|███████   | 464/656 [06:25<02:41,  1.19it/s]

{'loss': 0.0549, 'grad_norm': 0.6617861390113831, 'learning_rate': 1.1084567606306736e-05, 'epoch': 1.41}


 71%|███████   | 465/656 [06:26<02:41,  1.18it/s]

{'loss': 0.0523, 'grad_norm': 0.7946739196777344, 'learning_rate': 1.1108456760630674e-05, 'epoch': 1.42}


 71%|███████   | 466/656 [06:27<02:42,  1.17it/s]

{'loss': 0.0518, 'grad_norm': 0.7325640916824341, 'learning_rate': 1.1132345914954612e-05, 'epoch': 1.42}


 71%|███████   | 467/656 [06:28<02:45,  1.14it/s]

{'loss': 0.1797, 'grad_norm': 1.41270112991333, 'learning_rate': 1.1156235069278548e-05, 'epoch': 1.42}


 71%|███████▏  | 468/656 [06:29<02:50,  1.10it/s]

{'loss': 0.0455, 'grad_norm': 0.5822061896324158, 'learning_rate': 1.1180124223602485e-05, 'epoch': 1.43}


 71%|███████▏  | 469/656 [06:30<02:48,  1.11it/s]

{'loss': 0.0642, 'grad_norm': 1.7389593124389648, 'learning_rate': 1.1204013377926421e-05, 'epoch': 1.43}


 72%|███████▏  | 470/656 [06:31<02:44,  1.13it/s]

{'loss': 0.0684, 'grad_norm': 1.165708303451538, 'learning_rate': 1.122790253225036e-05, 'epoch': 1.43}


 72%|███████▏  | 471/656 [06:32<02:59,  1.03it/s]

{'loss': 0.0486, 'grad_norm': 0.38742727041244507, 'learning_rate': 1.1251791686574297e-05, 'epoch': 1.44}


 72%|███████▏  | 472/656 [06:33<02:47,  1.10it/s]

{'loss': 0.0557, 'grad_norm': 0.9924666881561279, 'learning_rate': 1.1275680840898233e-05, 'epoch': 1.44}


 72%|███████▏  | 473/656 [06:34<02:53,  1.05it/s]

{'loss': 0.1874, 'grad_norm': 3.839867115020752, 'learning_rate': 1.129956999522217e-05, 'epoch': 1.44}


 72%|███████▏  | 474/656 [06:35<02:44,  1.10it/s]

{'loss': 0.0483, 'grad_norm': 0.6391960382461548, 'learning_rate': 1.1323459149546107e-05, 'epoch': 1.45}


 72%|███████▏  | 475/656 [06:35<02:35,  1.16it/s]

{'loss': 0.0646, 'grad_norm': 2.0287630558013916, 'learning_rate': 1.1347348303870044e-05, 'epoch': 1.45}


 73%|███████▎  | 476/656 [06:36<02:34,  1.17it/s]

{'loss': 0.0619, 'grad_norm': 2.438795566558838, 'learning_rate': 1.137123745819398e-05, 'epoch': 1.45}


 73%|███████▎  | 477/656 [06:38<03:05,  1.04s/it]

{'loss': 0.045, 'grad_norm': 0.3281501829624176, 'learning_rate': 1.1395126612517917e-05, 'epoch': 1.45}


 73%|███████▎  | 478/656 [06:38<02:49,  1.05it/s]

{'loss': 0.0765, 'grad_norm': 4.515491485595703, 'learning_rate': 1.1419015766841854e-05, 'epoch': 1.46}


 73%|███████▎  | 479/656 [06:39<02:41,  1.10it/s]

{'loss': 0.0442, 'grad_norm': 0.2918770909309387, 'learning_rate': 1.144290492116579e-05, 'epoch': 1.46}


 73%|███████▎  | 480/656 [06:40<02:37,  1.12it/s]

{'loss': 0.0419, 'grad_norm': 0.27575433254241943, 'learning_rate': 1.1466794075489728e-05, 'epoch': 1.46}


 73%|███████▎  | 481/656 [06:41<02:37,  1.11it/s]

{'loss': 0.2988, 'grad_norm': 6.056497097015381, 'learning_rate': 1.1490683229813664e-05, 'epoch': 1.47}


 73%|███████▎  | 482/656 [06:42<02:34,  1.13it/s]

{'loss': 0.0431, 'grad_norm': 0.3085028827190399, 'learning_rate': 1.1514572384137602e-05, 'epoch': 1.47}


 74%|███████▎  | 483/656 [06:43<02:42,  1.06it/s]

{'loss': 0.0477, 'grad_norm': 0.5422279238700867, 'learning_rate': 1.153846153846154e-05, 'epoch': 1.47}


 74%|███████▍  | 484/656 [06:44<02:37,  1.09it/s]

{'loss': 0.042, 'grad_norm': 0.31195345520973206, 'learning_rate': 1.1562350692785476e-05, 'epoch': 1.48}


 74%|███████▍  | 485/656 [06:45<02:40,  1.07it/s]

{'loss': 0.1564, 'grad_norm': 4.743429183959961, 'learning_rate': 1.1586239847109413e-05, 'epoch': 1.48}


 74%|███████▍  | 486/656 [06:46<02:47,  1.02it/s]

{'loss': 0.1184, 'grad_norm': 3.9589099884033203, 'learning_rate': 1.161012900143335e-05, 'epoch': 1.48}


 74%|███████▍  | 487/656 [06:47<02:37,  1.08it/s]

{'loss': 0.0459, 'grad_norm': 0.5745202898979187, 'learning_rate': 1.1634018155757287e-05, 'epoch': 1.48}


 74%|███████▍  | 488/656 [06:48<02:49,  1.01s/it]

{'loss': 0.1737, 'grad_norm': 3.5636770725250244, 'learning_rate': 1.1657907310081223e-05, 'epoch': 1.49}


 75%|███████▍  | 489/656 [06:49<02:42,  1.03it/s]

{'loss': 0.0446, 'grad_norm': 0.6843639612197876, 'learning_rate': 1.168179646440516e-05, 'epoch': 1.49}


 75%|███████▍  | 490/656 [06:50<02:36,  1.06it/s]

{'loss': 0.0433, 'grad_norm': 0.365714967250824, 'learning_rate': 1.1705685618729099e-05, 'epoch': 1.49}


 75%|███████▍  | 491/656 [06:50<02:30,  1.09it/s]

{'loss': 0.1725, 'grad_norm': 1.6348179578781128, 'learning_rate': 1.1729574773053035e-05, 'epoch': 1.5}


 75%|███████▌  | 492/656 [06:52<02:43,  1.00it/s]

{'loss': 0.0478, 'grad_norm': 0.7678313255310059, 'learning_rate': 1.1753463927376972e-05, 'epoch': 1.5}


 75%|███████▌  | 493/656 [06:52<02:36,  1.04it/s]

{'loss': 0.0817, 'grad_norm': 3.2453808784484863, 'learning_rate': 1.1777353081700908e-05, 'epoch': 1.5}


 75%|███████▌  | 494/656 [06:53<02:27,  1.10it/s]

{'loss': 0.0497, 'grad_norm': 1.4040931463241577, 'learning_rate': 1.1801242236024846e-05, 'epoch': 1.51}


 75%|███████▌  | 495/656 [06:54<02:27,  1.09it/s]

{'loss': 0.046, 'grad_norm': 0.48894619941711426, 'learning_rate': 1.1825131390348782e-05, 'epoch': 1.51}


 76%|███████▌  | 496/656 [06:55<02:21,  1.13it/s]

{'loss': 0.1776, 'grad_norm': 2.0226473808288574, 'learning_rate': 1.1849020544672718e-05, 'epoch': 1.51}


 76%|███████▌  | 497/656 [06:56<02:29,  1.07it/s]

{'loss': 0.0496, 'grad_norm': 0.9993080496788025, 'learning_rate': 1.1872909698996656e-05, 'epoch': 1.52}


 76%|███████▌  | 498/656 [06:57<02:23,  1.10it/s]

{'loss': 0.04, 'grad_norm': 0.40189313888549805, 'learning_rate': 1.1896798853320592e-05, 'epoch': 1.52}


 76%|███████▌  | 499/656 [06:58<02:15,  1.16it/s]

{'loss': 0.0425, 'grad_norm': 0.4671821892261505, 'learning_rate': 1.192068800764453e-05, 'epoch': 1.52}


 76%|███████▌  | 500/656 [06:58<02:09,  1.20it/s]

{'loss': 0.2006, 'grad_norm': 4.942381381988525, 'learning_rate': 1.1944577161968466e-05, 'epoch': 1.52}


 76%|███████▋  | 501/656 [06:59<02:09,  1.20it/s]

{'loss': 0.0882, 'grad_norm': 4.304422378540039, 'learning_rate': 1.1968466316292403e-05, 'epoch': 1.53}


 77%|███████▋  | 502/656 [07:00<02:12,  1.16it/s]

{'loss': 0.151, 'grad_norm': 4.816380023956299, 'learning_rate': 1.1992355470616341e-05, 'epoch': 1.53}


 77%|███████▋  | 503/656 [07:01<02:09,  1.18it/s]

{'loss': 0.089, 'grad_norm': 4.730923652648926, 'learning_rate': 1.2016244624940277e-05, 'epoch': 1.53}


 77%|███████▋  | 504/656 [07:02<02:10,  1.17it/s]

{'loss': 0.1338, 'grad_norm': 3.87565541267395, 'learning_rate': 1.2040133779264215e-05, 'epoch': 1.54}


 77%|███████▋  | 505/656 [07:03<02:04,  1.21it/s]

{'loss': 0.0504, 'grad_norm': 2.2740840911865234, 'learning_rate': 1.2064022933588151e-05, 'epoch': 1.54}


 77%|███████▋  | 506/656 [07:03<02:03,  1.21it/s]

{'loss': 0.0936, 'grad_norm': 5.446310997009277, 'learning_rate': 1.2087912087912089e-05, 'epoch': 1.54}


 77%|███████▋  | 507/656 [07:04<02:01,  1.23it/s]

{'loss': 0.0351, 'grad_norm': 0.22920119762420654, 'learning_rate': 1.2111801242236026e-05, 'epoch': 1.55}


 77%|███████▋  | 508/656 [07:05<02:04,  1.19it/s]

{'loss': 0.0448, 'grad_norm': 1.3933961391448975, 'learning_rate': 1.2135690396559962e-05, 'epoch': 1.55}


 78%|███████▊  | 509/656 [07:06<01:58,  1.25it/s]

{'loss': 0.1352, 'grad_norm': 5.831365585327148, 'learning_rate': 1.21595795508839e-05, 'epoch': 1.55}


 78%|███████▊  | 510/656 [07:07<02:08,  1.14it/s]

{'loss': 0.13, 'grad_norm': 4.519550323486328, 'learning_rate': 1.2183468705207836e-05, 'epoch': 1.55}


 78%|███████▊  | 511/656 [07:08<02:17,  1.06it/s]

{'loss': 0.0401, 'grad_norm': 0.40468814969062805, 'learning_rate': 1.2207357859531774e-05, 'epoch': 1.56}


 78%|███████▊  | 512/656 [07:09<02:13,  1.08it/s]

{'loss': 0.0434, 'grad_norm': 0.501724123954773, 'learning_rate': 1.223124701385571e-05, 'epoch': 1.56}


 78%|███████▊  | 513/656 [07:10<02:09,  1.10it/s]

{'loss': 0.0443, 'grad_norm': 1.1183207035064697, 'learning_rate': 1.2255136168179648e-05, 'epoch': 1.56}


 78%|███████▊  | 514/656 [07:11<02:06,  1.12it/s]

{'loss': 0.0328, 'grad_norm': 0.25612011551856995, 'learning_rate': 1.2279025322503584e-05, 'epoch': 1.57}


 79%|███████▊  | 515/656 [07:11<02:03,  1.15it/s]

{'loss': 0.0383, 'grad_norm': 0.6760016083717346, 'learning_rate': 1.230291447682752e-05, 'epoch': 1.57}


 79%|███████▊  | 516/656 [07:13<02:10,  1.07it/s]

{'loss': 0.0401, 'grad_norm': 0.6031008958816528, 'learning_rate': 1.2326803631151458e-05, 'epoch': 1.57}


 79%|███████▉  | 517/656 [07:13<02:11,  1.05it/s]

{'loss': 0.2543, 'grad_norm': 3.363624095916748, 'learning_rate': 1.2350692785475394e-05, 'epoch': 1.58}


 79%|███████▉  | 518/656 [07:14<02:12,  1.04it/s]

{'loss': 0.0667, 'grad_norm': 2.9964301586151123, 'learning_rate': 1.2374581939799331e-05, 'epoch': 1.58}


 79%|███████▉  | 519/656 [07:15<02:04,  1.10it/s]

{'loss': 0.0595, 'grad_norm': 1.8410087823867798, 'learning_rate': 1.2398471094123269e-05, 'epoch': 1.58}


 79%|███████▉  | 520/656 [07:16<02:04,  1.09it/s]

{'loss': 0.0399, 'grad_norm': 0.8187448978424072, 'learning_rate': 1.2422360248447205e-05, 'epoch': 1.59}


 79%|███████▉  | 521/656 [07:17<02:01,  1.11it/s]

{'loss': 0.0341, 'grad_norm': 0.3303842842578888, 'learning_rate': 1.2446249402771143e-05, 'epoch': 1.59}


 80%|███████▉  | 522/656 [07:18<02:02,  1.10it/s]

{'loss': 0.0542, 'grad_norm': 2.5292160511016846, 'learning_rate': 1.2470138557095079e-05, 'epoch': 1.59}


 80%|███████▉  | 523/656 [07:19<01:55,  1.16it/s]

{'loss': 0.0895, 'grad_norm': 7.98539400100708, 'learning_rate': 1.2494027711419017e-05, 'epoch': 1.59}


 80%|███████▉  | 524/656 [07:20<02:00,  1.09it/s]

{'loss': 0.0335, 'grad_norm': 0.5186068415641785, 'learning_rate': 1.2517916865742954e-05, 'epoch': 1.6}


 80%|████████  | 525/656 [07:21<02:00,  1.09it/s]

{'loss': 0.0354, 'grad_norm': 0.417303204536438, 'learning_rate': 1.254180602006689e-05, 'epoch': 1.6}


 80%|████████  | 526/656 [07:22<02:02,  1.06it/s]

{'loss': 0.0317, 'grad_norm': 0.2267511636018753, 'learning_rate': 1.2565695174390826e-05, 'epoch': 1.6}


 80%|████████  | 527/656 [07:22<01:54,  1.12it/s]

{'loss': 0.0303, 'grad_norm': 0.232246994972229, 'learning_rate': 1.2589584328714766e-05, 'epoch': 1.61}


 80%|████████  | 528/656 [07:23<01:52,  1.14it/s]

{'loss': 0.2297, 'grad_norm': 3.537264108657837, 'learning_rate': 1.2613473483038702e-05, 'epoch': 1.61}


 81%|████████  | 529/656 [07:24<01:44,  1.21it/s]

{'loss': 0.0471, 'grad_norm': 1.8101686239242554, 'learning_rate': 1.2637362637362638e-05, 'epoch': 1.61}


 81%|████████  | 530/656 [07:25<01:41,  1.24it/s]

{'loss': 0.1414, 'grad_norm': 5.557215690612793, 'learning_rate': 1.2661251791686574e-05, 'epoch': 1.62}


 81%|████████  | 531/656 [07:26<01:38,  1.27it/s]

{'loss': 0.0615, 'grad_norm': 2.1475229263305664, 'learning_rate': 1.2685140946010512e-05, 'epoch': 1.62}


 81%|████████  | 532/656 [07:27<01:44,  1.19it/s]

{'loss': 0.0336, 'grad_norm': 0.3310604393482208, 'learning_rate': 1.270903010033445e-05, 'epoch': 1.62}


 81%|████████▏ | 533/656 [07:27<01:42,  1.20it/s]

{'loss': 0.0302, 'grad_norm': 0.3533025085926056, 'learning_rate': 1.2732919254658385e-05, 'epoch': 1.62}


 81%|████████▏ | 534/656 [07:28<01:40,  1.22it/s]

{'loss': 0.0856, 'grad_norm': 4.5466485023498535, 'learning_rate': 1.2756808408982323e-05, 'epoch': 1.63}


 82%|████████▏ | 535/656 [07:29<01:42,  1.18it/s]

{'loss': 0.0747, 'grad_norm': 4.1608805656433105, 'learning_rate': 1.278069756330626e-05, 'epoch': 1.63}


 82%|████████▏ | 536/656 [07:30<01:41,  1.18it/s]

{'loss': 0.0411, 'grad_norm': 1.2850419282913208, 'learning_rate': 1.2804586717630195e-05, 'epoch': 1.63}


 82%|████████▏ | 537/656 [07:31<01:47,  1.10it/s]

{'loss': 0.0343, 'grad_norm': 0.8846946358680725, 'learning_rate': 1.2828475871954135e-05, 'epoch': 1.64}


 82%|████████▏ | 538/656 [07:32<01:45,  1.12it/s]

{'loss': 0.0341, 'grad_norm': 0.5973051190376282, 'learning_rate': 1.285236502627807e-05, 'epoch': 1.64}


 82%|████████▏ | 539/656 [07:33<01:44,  1.12it/s]

{'loss': 0.0531, 'grad_norm': 3.2358546257019043, 'learning_rate': 1.2876254180602007e-05, 'epoch': 1.64}


 82%|████████▏ | 540/656 [07:33<01:38,  1.17it/s]

{'loss': 0.144, 'grad_norm': 4.626684665679932, 'learning_rate': 1.2900143334925943e-05, 'epoch': 1.65}


 82%|████████▏ | 541/656 [07:34<01:39,  1.16it/s]

{'loss': 0.141, 'grad_norm': 3.7726080417633057, 'learning_rate': 1.2924032489249882e-05, 'epoch': 1.65}


 83%|████████▎ | 542/656 [07:35<01:37,  1.17it/s]

{'loss': 0.0306, 'grad_norm': 0.35393744707107544, 'learning_rate': 1.2947921643573818e-05, 'epoch': 1.65}


 83%|████████▎ | 543/656 [07:36<01:31,  1.23it/s]

{'loss': 0.1425, 'grad_norm': 5.802974700927734, 'learning_rate': 1.2971810797897754e-05, 'epoch': 1.66}


 83%|████████▎ | 544/656 [07:37<01:29,  1.26it/s]

{'loss': 0.0423, 'grad_norm': 1.6575571298599243, 'learning_rate': 1.2995699952221694e-05, 'epoch': 1.66}


 83%|████████▎ | 545/656 [07:38<01:31,  1.21it/s]

{'loss': 0.0297, 'grad_norm': 0.2849825918674469, 'learning_rate': 1.301958910654563e-05, 'epoch': 1.66}


 83%|████████▎ | 546/656 [07:38<01:35,  1.16it/s]

{'loss': 0.029, 'grad_norm': 0.21593065559864044, 'learning_rate': 1.3043478260869566e-05, 'epoch': 1.66}


 83%|████████▎ | 547/656 [07:39<01:33,  1.17it/s]

{'loss': 0.0323, 'grad_norm': 0.38974666595458984, 'learning_rate': 1.3067367415193502e-05, 'epoch': 1.67}


 84%|████████▎ | 548/656 [07:40<01:33,  1.15it/s]

{'loss': 0.0319, 'grad_norm': 0.7041918039321899, 'learning_rate': 1.3091256569517441e-05, 'epoch': 1.67}


 84%|████████▎ | 549/656 [07:41<01:30,  1.18it/s]

{'loss': 0.0292, 'grad_norm': 0.2227000743150711, 'learning_rate': 1.3115145723841377e-05, 'epoch': 1.67}


 84%|████████▍ | 550/656 [07:42<01:28,  1.20it/s]

{'loss': 0.0309, 'grad_norm': 0.43907198309898376, 'learning_rate': 1.3139034878165313e-05, 'epoch': 1.68}


 84%|████████▍ | 551/656 [07:43<01:28,  1.18it/s]

{'loss': 0.0501, 'grad_norm': 2.0950381755828857, 'learning_rate': 1.3162924032489251e-05, 'epoch': 1.68}


 84%|████████▍ | 552/656 [07:44<01:27,  1.19it/s]

{'loss': 0.05, 'grad_norm': 3.1721816062927246, 'learning_rate': 1.3186813186813187e-05, 'epoch': 1.68}


 84%|████████▍ | 553/656 [07:44<01:22,  1.26it/s]

{'loss': 0.0314, 'grad_norm': 0.5443840026855469, 'learning_rate': 1.3210702341137123e-05, 'epoch': 1.69}


 84%|████████▍ | 554/656 [07:45<01:26,  1.18it/s]

{'loss': 0.1788, 'grad_norm': 4.808205604553223, 'learning_rate': 1.323459149546106e-05, 'epoch': 1.69}


 85%|████████▍ | 555/656 [07:46<01:24,  1.20it/s]

{'loss': 0.3467, 'grad_norm': 6.329813003540039, 'learning_rate': 1.3258480649784999e-05, 'epoch': 1.69}


 85%|████████▍ | 556/656 [07:47<01:30,  1.11it/s]

{'loss': 0.0287, 'grad_norm': 0.20778989791870117, 'learning_rate': 1.3282369804108935e-05, 'epoch': 1.7}


 85%|████████▍ | 557/656 [07:48<01:26,  1.15it/s]

{'loss': 0.0281, 'grad_norm': 0.3781690299510956, 'learning_rate': 1.330625895843287e-05, 'epoch': 1.7}


 85%|████████▌ | 558/656 [07:49<01:27,  1.12it/s]

{'loss': 0.0255, 'grad_norm': 0.21227985620498657, 'learning_rate': 1.333014811275681e-05, 'epoch': 1.7}


 85%|████████▌ | 559/656 [07:50<01:34,  1.03it/s]

{'loss': 0.0236, 'grad_norm': 0.16938628256320953, 'learning_rate': 1.3354037267080746e-05, 'epoch': 1.7}


 85%|████████▌ | 560/656 [07:51<01:31,  1.05it/s]

{'loss': 0.0279, 'grad_norm': 0.39351195096969604, 'learning_rate': 1.3377926421404682e-05, 'epoch': 1.71}


 86%|████████▌ | 561/656 [07:52<01:26,  1.10it/s]

{'loss': 0.038, 'grad_norm': 1.2328503131866455, 'learning_rate': 1.3401815575728622e-05, 'epoch': 1.71}


 86%|████████▌ | 562/656 [07:52<01:22,  1.14it/s]

{'loss': 0.0269, 'grad_norm': 0.28501030802726746, 'learning_rate': 1.3425704730052558e-05, 'epoch': 1.71}


 86%|████████▌ | 563/656 [07:53<01:20,  1.16it/s]

{'loss': 0.036, 'grad_norm': 0.6345365643501282, 'learning_rate': 1.3449593884376494e-05, 'epoch': 1.72}


 86%|████████▌ | 564/656 [07:54<01:18,  1.17it/s]

{'loss': 0.0329, 'grad_norm': 0.3904421329498291, 'learning_rate': 1.347348303870043e-05, 'epoch': 1.72}


 86%|████████▌ | 565/656 [07:55<01:14,  1.21it/s]

{'loss': 0.0711, 'grad_norm': 3.2105627059936523, 'learning_rate': 1.3497372193024369e-05, 'epoch': 1.72}


 86%|████████▋ | 566/656 [07:56<01:13,  1.22it/s]

{'loss': 0.0247, 'grad_norm': 0.2535208463668823, 'learning_rate': 1.3521261347348305e-05, 'epoch': 1.73}


 86%|████████▋ | 567/656 [07:57<01:25,  1.04it/s]

{'loss': 0.024, 'grad_norm': 0.18119880557060242, 'learning_rate': 1.3545150501672241e-05, 'epoch': 1.73}


 87%|████████▋ | 568/656 [07:58<01:25,  1.03it/s]

{'loss': 0.1983, 'grad_norm': 2.828232765197754, 'learning_rate': 1.3569039655996179e-05, 'epoch': 1.73}


 87%|████████▋ | 569/656 [07:59<01:20,  1.08it/s]

{'loss': 0.1282, 'grad_norm': 4.822483539581299, 'learning_rate': 1.3592928810320115e-05, 'epoch': 1.73}


 87%|████████▋ | 570/656 [08:00<01:26,  1.01s/it]

{'loss': 0.205, 'grad_norm': 3.6815760135650635, 'learning_rate': 1.3616817964644053e-05, 'epoch': 1.74}


 87%|████████▋ | 571/656 [08:01<01:25,  1.01s/it]

{'loss': 0.0254, 'grad_norm': 0.4271644949913025, 'learning_rate': 1.3640707118967989e-05, 'epoch': 1.74}


 87%|████████▋ | 572/656 [08:02<01:18,  1.07it/s]

{'loss': 0.0267, 'grad_norm': 0.32042738795280457, 'learning_rate': 1.3664596273291926e-05, 'epoch': 1.74}


 87%|████████▋ | 573/656 [08:03<01:16,  1.09it/s]

{'loss': 0.2776, 'grad_norm': 6.753608703613281, 'learning_rate': 1.3688485427615862e-05, 'epoch': 1.75}


 88%|████████▊ | 574/656 [08:03<01:09,  1.17it/s]

{'loss': 0.1939, 'grad_norm': 6.377989768981934, 'learning_rate': 1.3712374581939799e-05, 'epoch': 1.75}


 88%|████████▊ | 575/656 [08:04<01:08,  1.17it/s]

{'loss': 0.1792, 'grad_norm': 5.086284160614014, 'learning_rate': 1.3736263736263738e-05, 'epoch': 1.75}


 88%|████████▊ | 576/656 [08:05<01:08,  1.17it/s]

{'loss': 0.1259, 'grad_norm': 8.467609405517578, 'learning_rate': 1.3760152890587674e-05, 'epoch': 1.76}


 88%|████████▊ | 577/656 [08:06<01:09,  1.13it/s]

{'loss': 0.0235, 'grad_norm': 0.21954715251922607, 'learning_rate': 1.378404204491161e-05, 'epoch': 1.76}


 88%|████████▊ | 578/656 [08:07<01:10,  1.10it/s]

{'loss': 0.0292, 'grad_norm': 0.33098772168159485, 'learning_rate': 1.3807931199235546e-05, 'epoch': 1.76}


 88%|████████▊ | 579/656 [08:08<01:08,  1.12it/s]

{'loss': 0.0228, 'grad_norm': 0.18970826268196106, 'learning_rate': 1.3831820353559485e-05, 'epoch': 1.77}


 88%|████████▊ | 580/656 [08:09<01:05,  1.16it/s]

{'loss': 0.0236, 'grad_norm': 0.18537616729736328, 'learning_rate': 1.3855709507883422e-05, 'epoch': 1.77}


 89%|████████▊ | 581/656 [08:09<01:02,  1.20it/s]

{'loss': 0.025, 'grad_norm': 0.2390826791524887, 'learning_rate': 1.3879598662207358e-05, 'epoch': 1.77}


 89%|████████▊ | 582/656 [08:10<01:04,  1.14it/s]

{'loss': 0.0278, 'grad_norm': 0.2534668743610382, 'learning_rate': 1.3903487816531297e-05, 'epoch': 1.77}


 89%|████████▉ | 583/656 [08:11<01:07,  1.08it/s]

{'loss': 0.0221, 'grad_norm': 0.16485244035720825, 'learning_rate': 1.3927376970855233e-05, 'epoch': 1.78}


 89%|████████▉ | 584/656 [08:13<01:14,  1.03s/it]

{'loss': 0.0924, 'grad_norm': 5.439773082733154, 'learning_rate': 1.3951266125179169e-05, 'epoch': 1.78}


 89%|████████▉ | 585/656 [08:14<01:08,  1.03it/s]

{'loss': 0.0292, 'grad_norm': 1.2283128499984741, 'learning_rate': 1.3975155279503105e-05, 'epoch': 1.78}


 89%|████████▉ | 586/656 [08:15<01:09,  1.01it/s]

{'loss': 0.0719, 'grad_norm': 4.00386381149292, 'learning_rate': 1.3999044433827045e-05, 'epoch': 1.79}


 89%|████████▉ | 587/656 [08:15<01:05,  1.05it/s]

{'loss': 0.0234, 'grad_norm': 0.18326681852340698, 'learning_rate': 1.402293358815098e-05, 'epoch': 1.79}


 90%|████████▉ | 588/656 [08:16<01:04,  1.05it/s]

{'loss': 0.1669, 'grad_norm': 1.1834957599639893, 'learning_rate': 1.4046822742474917e-05, 'epoch': 1.79}


 90%|████████▉ | 589/656 [08:17<01:00,  1.12it/s]

{'loss': 0.0251, 'grad_norm': 0.1875297576189041, 'learning_rate': 1.4070711896798854e-05, 'epoch': 1.8}


 90%|████████▉ | 590/656 [08:18<00:59,  1.10it/s]

{'loss': 0.1792, 'grad_norm': 6.06603479385376, 'learning_rate': 1.409460105112279e-05, 'epoch': 1.8}


 90%|█████████ | 591/656 [08:19<00:58,  1.12it/s]

{'loss': 0.1105, 'grad_norm': 6.715856075286865, 'learning_rate': 1.4118490205446726e-05, 'epoch': 1.8}


 90%|█████████ | 592/656 [08:20<01:00,  1.06it/s]

{'loss': 0.0575, 'grad_norm': 2.8456366062164307, 'learning_rate': 1.4142379359770666e-05, 'epoch': 1.8}


 90%|█████████ | 593/656 [08:21<00:58,  1.08it/s]

{'loss': 0.0257, 'grad_norm': 1.1839901208877563, 'learning_rate': 1.4166268514094602e-05, 'epoch': 1.81}


 91%|█████████ | 594/656 [08:22<01:00,  1.03it/s]

{'loss': 0.0464, 'grad_norm': 3.4699976444244385, 'learning_rate': 1.4190157668418538e-05, 'epoch': 1.81}


 91%|█████████ | 595/656 [08:23<00:57,  1.06it/s]

{'loss': 0.0774, 'grad_norm': 4.525294303894043, 'learning_rate': 1.4214046822742474e-05, 'epoch': 1.81}


 91%|█████████ | 596/656 [08:24<00:54,  1.10it/s]

{'loss': 0.0458, 'grad_norm': 3.625800371170044, 'learning_rate': 1.4237935977066413e-05, 'epoch': 1.82}


 91%|█████████ | 597/656 [08:24<00:52,  1.13it/s]

{'loss': 0.0214, 'grad_norm': 0.18409167230129242, 'learning_rate': 1.426182513139035e-05, 'epoch': 1.82}


 91%|█████████ | 598/656 [08:26<00:54,  1.07it/s]

{'loss': 0.0261, 'grad_norm': 0.2945447266101837, 'learning_rate': 1.4285714285714285e-05, 'epoch': 1.82}


 91%|█████████▏| 599/656 [08:26<00:53,  1.07it/s]

{'loss': 0.0436, 'grad_norm': 2.537611722946167, 'learning_rate': 1.4309603440038225e-05, 'epoch': 1.83}


 91%|█████████▏| 600/656 [08:27<00:49,  1.13it/s]

{'loss': 0.0228, 'grad_norm': 0.18993128836154938, 'learning_rate': 1.4333492594362161e-05, 'epoch': 1.83}


 92%|█████████▏| 601/656 [08:28<00:49,  1.12it/s]

{'loss': 0.0278, 'grad_norm': 0.694507360458374, 'learning_rate': 1.4357381748686097e-05, 'epoch': 1.83}


 92%|█████████▏| 602/656 [08:29<00:46,  1.17it/s]

{'loss': 0.0755, 'grad_norm': 4.281641960144043, 'learning_rate': 1.4381270903010033e-05, 'epoch': 1.84}


 92%|█████████▏| 603/656 [08:30<00:44,  1.19it/s]

{'loss': 0.0505, 'grad_norm': 3.172348737716675, 'learning_rate': 1.4405160057333972e-05, 'epoch': 1.84}


 92%|█████████▏| 604/656 [08:31<00:44,  1.18it/s]

{'loss': 0.1711, 'grad_norm': 2.743967056274414, 'learning_rate': 1.4429049211657908e-05, 'epoch': 1.84}


 92%|█████████▏| 605/656 [08:32<00:44,  1.14it/s]

{'loss': 0.221, 'grad_norm': 6.310118198394775, 'learning_rate': 1.4452938365981844e-05, 'epoch': 1.84}


 92%|█████████▏| 606/656 [08:32<00:43,  1.16it/s]

{'loss': 0.0221, 'grad_norm': 0.3468479812145233, 'learning_rate': 1.4476827520305782e-05, 'epoch': 1.85}


 93%|█████████▎| 607/656 [08:33<00:43,  1.12it/s]

{'loss': 0.0208, 'grad_norm': 0.24227365851402283, 'learning_rate': 1.4500716674629718e-05, 'epoch': 1.85}


 93%|█████████▎| 608/656 [08:34<00:41,  1.15it/s]

{'loss': 0.0973, 'grad_norm': 6.040865421295166, 'learning_rate': 1.4524605828953656e-05, 'epoch': 1.85}


 93%|█████████▎| 609/656 [08:35<00:38,  1.21it/s]

{'loss': 0.0647, 'grad_norm': 3.070967674255371, 'learning_rate': 1.4548494983277592e-05, 'epoch': 1.86}


 93%|█████████▎| 610/656 [08:36<00:39,  1.18it/s]

{'loss': 0.0591, 'grad_norm': 3.1541850566864014, 'learning_rate': 1.457238413760153e-05, 'epoch': 1.86}


 93%|█████████▎| 611/656 [08:37<00:38,  1.17it/s]

{'loss': 0.0748, 'grad_norm': 4.732306003570557, 'learning_rate': 1.4596273291925466e-05, 'epoch': 1.86}


 93%|█████████▎| 612/656 [08:38<00:38,  1.14it/s]

{'loss': 0.0253, 'grad_norm': 0.6423932909965515, 'learning_rate': 1.4620162446249402e-05, 'epoch': 1.87}


 93%|█████████▎| 613/656 [08:38<00:37,  1.16it/s]

{'loss': 0.2539, 'grad_norm': 3.9650611877441406, 'learning_rate': 1.4644051600573341e-05, 'epoch': 1.87}


 94%|█████████▎| 614/656 [08:39<00:36,  1.15it/s]

{'loss': 0.019, 'grad_norm': 0.1420411765575409, 'learning_rate': 1.4667940754897277e-05, 'epoch': 1.87}


 94%|█████████▍| 615/656 [08:40<00:38,  1.07it/s]

{'loss': 0.0253, 'grad_norm': 0.8696269392967224, 'learning_rate': 1.4691829909221213e-05, 'epoch': 1.88}


 94%|█████████▍| 616/656 [08:41<00:35,  1.12it/s]

{'loss': 0.1453, 'grad_norm': 8.597689628601074, 'learning_rate': 1.4715719063545153e-05, 'epoch': 1.88}


 94%|█████████▍| 617/656 [08:42<00:34,  1.15it/s]

{'loss': 0.0217, 'grad_norm': 0.386260449886322, 'learning_rate': 1.4739608217869089e-05, 'epoch': 1.88}


 94%|█████████▍| 618/656 [08:43<00:34,  1.11it/s]

{'loss': 0.026, 'grad_norm': 0.9572716355323792, 'learning_rate': 1.4763497372193025e-05, 'epoch': 1.88}


 94%|█████████▍| 619/656 [08:44<00:36,  1.02it/s]

{'loss': 0.0328, 'grad_norm': 1.469474196434021, 'learning_rate': 1.478738652651696e-05, 'epoch': 1.89}


 95%|█████████▍| 620/656 [08:45<00:34,  1.05it/s]

{'loss': 0.104, 'grad_norm': 5.415670871734619, 'learning_rate': 1.48112756808409e-05, 'epoch': 1.89}


 95%|█████████▍| 621/656 [08:46<00:31,  1.11it/s]

{'loss': 0.0287, 'grad_norm': 1.191298007965088, 'learning_rate': 1.4835164835164836e-05, 'epoch': 1.89}


 95%|█████████▍| 622/656 [08:47<00:30,  1.12it/s]

{'loss': 0.1705, 'grad_norm': 8.538728713989258, 'learning_rate': 1.4859053989488772e-05, 'epoch': 1.9}


 95%|█████████▍| 623/656 [08:48<00:31,  1.05it/s]

{'loss': 0.0268, 'grad_norm': 1.2726835012435913, 'learning_rate': 1.4882943143812712e-05, 'epoch': 1.9}


 95%|█████████▌| 624/656 [08:49<00:29,  1.10it/s]

{'loss': 0.0201, 'grad_norm': 0.6233860850334167, 'learning_rate': 1.4906832298136648e-05, 'epoch': 1.9}


 95%|█████████▌| 625/656 [08:50<00:29,  1.04it/s]

{'loss': 0.0209, 'grad_norm': 0.21419702470302582, 'learning_rate': 1.4930721452460584e-05, 'epoch': 1.91}


 95%|█████████▌| 626/656 [08:51<00:31,  1.04s/it]

{'loss': 0.1082, 'grad_norm': 4.940230369567871, 'learning_rate': 1.495461060678452e-05, 'epoch': 1.91}


 96%|█████████▌| 627/656 [08:52<00:27,  1.04it/s]

{'loss': 0.0809, 'grad_norm': 3.869739294052124, 'learning_rate': 1.4978499761108458e-05, 'epoch': 1.91}


 96%|█████████▌| 628/656 [08:53<00:30,  1.08s/it]

{'loss': 0.0398, 'grad_norm': 4.443361759185791, 'learning_rate': 1.5002388915432394e-05, 'epoch': 1.91}


 96%|█████████▌| 629/656 [08:54<00:27,  1.03s/it]

{'loss': 0.0221, 'grad_norm': 0.5323262810707092, 'learning_rate': 1.502627806975633e-05, 'epoch': 1.92}


 96%|█████████▌| 630/656 [08:55<00:25,  1.01it/s]

{'loss': 0.1299, 'grad_norm': 2.84150767326355, 'learning_rate': 1.5050167224080269e-05, 'epoch': 1.92}


 96%|█████████▌| 631/656 [08:56<00:25,  1.03s/it]

{'loss': 0.0218, 'grad_norm': 0.2881334125995636, 'learning_rate': 1.5074056378404205e-05, 'epoch': 1.92}


 96%|█████████▋| 632/656 [08:57<00:24,  1.02s/it]

{'loss': 0.0758, 'grad_norm': 3.8789587020874023, 'learning_rate': 1.5097945532728141e-05, 'epoch': 1.93}


 96%|█████████▋| 633/656 [08:58<00:23,  1.04s/it]

{'loss': 0.1258, 'grad_norm': 1.378240704536438, 'learning_rate': 1.5121834687052077e-05, 'epoch': 1.93}


 97%|█████████▋| 634/656 [08:59<00:21,  1.01it/s]

{'loss': 0.1602, 'grad_norm': 6.917661190032959, 'learning_rate': 1.5145723841376017e-05, 'epoch': 1.93}


 97%|█████████▋| 635/656 [09:00<00:20,  1.01it/s]

{'loss': 0.0306, 'grad_norm': 0.9983196258544922, 'learning_rate': 1.5169612995699953e-05, 'epoch': 1.94}


 97%|█████████▋| 636/656 [09:01<00:19,  1.02it/s]

{'loss': 0.079, 'grad_norm': 3.9711718559265137, 'learning_rate': 1.5193502150023889e-05, 'epoch': 1.94}


 97%|█████████▋| 637/656 [09:02<00:18,  1.05it/s]

{'loss': 0.0553, 'grad_norm': 3.5516579151153564, 'learning_rate': 1.5217391304347828e-05, 'epoch': 1.94}


 97%|█████████▋| 638/656 [09:03<00:16,  1.07it/s]

{'loss': 0.0195, 'grad_norm': 0.3068470060825348, 'learning_rate': 1.5241280458671764e-05, 'epoch': 1.95}


 97%|█████████▋| 639/656 [09:04<00:16,  1.01it/s]

{'loss': 0.031, 'grad_norm': 1.549977421760559, 'learning_rate': 1.52651696129957e-05, 'epoch': 1.95}


 98%|█████████▊| 640/656 [09:05<00:15,  1.03it/s]

{'loss': 0.0209, 'grad_norm': 0.3117426633834839, 'learning_rate': 1.528905876731964e-05, 'epoch': 1.95}


 98%|█████████▊| 641/656 [09:06<00:14,  1.07it/s]

{'loss': 0.0283, 'grad_norm': 1.2467186450958252, 'learning_rate': 1.5312947921643576e-05, 'epoch': 1.95}


 98%|█████████▊| 642/656 [09:06<00:13,  1.07it/s]

{'loss': 0.1388, 'grad_norm': 3.070894718170166, 'learning_rate': 1.5336837075967512e-05, 'epoch': 1.96}


 98%|█████████▊| 643/656 [09:07<00:11,  1.12it/s]

{'loss': 0.031, 'grad_norm': 2.0532522201538086, 'learning_rate': 1.5360726230291448e-05, 'epoch': 1.96}


 98%|█████████▊| 644/656 [09:08<00:10,  1.18it/s]

{'loss': 0.0229, 'grad_norm': 1.149070143699646, 'learning_rate': 1.5384615384615387e-05, 'epoch': 1.96}


 98%|█████████▊| 645/656 [09:09<00:09,  1.19it/s]

{'loss': 0.0182, 'grad_norm': 0.14486931264400482, 'learning_rate': 1.5408504538939323e-05, 'epoch': 1.97}


 98%|█████████▊| 646/656 [09:10<00:08,  1.18it/s]

{'loss': 0.0179, 'grad_norm': 0.12744903564453125, 'learning_rate': 1.543239369326326e-05, 'epoch': 1.97}


 99%|█████████▊| 647/656 [09:10<00:07,  1.19it/s]

{'loss': 0.0298, 'grad_norm': 1.6387699842453003, 'learning_rate': 1.54562828475872e-05, 'epoch': 1.97}


 99%|█████████▉| 648/656 [09:11<00:07,  1.13it/s]

{'loss': 0.0193, 'grad_norm': 0.44179847836494446, 'learning_rate': 1.5480172001911135e-05, 'epoch': 1.98}


 99%|█████████▉| 649/656 [09:12<00:05,  1.17it/s]

{'loss': 0.0173, 'grad_norm': 0.19553035497665405, 'learning_rate': 1.550406115623507e-05, 'epoch': 1.98}


 99%|█████████▉| 650/656 [09:13<00:05,  1.08it/s]

{'loss': 0.1473, 'grad_norm': 2.4306368827819824, 'learning_rate': 1.5527950310559007e-05, 'epoch': 1.98}


 99%|█████████▉| 651/656 [09:14<00:04,  1.09it/s]

{'loss': 0.0218, 'grad_norm': 0.2973655164241791, 'learning_rate': 1.5551839464882946e-05, 'epoch': 1.98}


 99%|█████████▉| 652/656 [09:15<00:03,  1.07it/s]

{'loss': 0.136, 'grad_norm': 4.099701881408691, 'learning_rate': 1.5575728619206882e-05, 'epoch': 1.99}


100%|█████████▉| 653/656 [09:16<00:02,  1.10it/s]

{'loss': 0.3406, 'grad_norm': 4.681666374206543, 'learning_rate': 1.5599617773530818e-05, 'epoch': 1.99}


100%|█████████▉| 654/656 [09:17<00:01,  1.17it/s]

{'loss': 0.0761, 'grad_norm': 4.701026916503906, 'learning_rate': 1.5623506927854754e-05, 'epoch': 1.99}


100%|██████████| 656/656 [09:18<00:00,  1.52it/s]

{'loss': 0.2088, 'grad_norm': 1.792160987854004, 'learning_rate': 1.564739608217869e-05, 'epoch': 2.0}


100%|██████████| 656/656 [09:18<00:00,  1.52it/s]Saving model checkpoint to ./snips_clf/results/checkpoint-656
Configuration saved in ./snips_clf/results/checkpoint-656/config.json
Model weights saved in ./snips_clf/results/checkpoint-656/model.safetensors


{'loss': 0.0145, 'grad_norm': 0.19625923037528992, 'learning_rate': 1.5671285236502626e-05, 'epoch': 2.0}


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance, token_label, tokens. If utterance, token_label, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
                                                 
100%|██████████| 656/656 [09:40<00:00,  1.52it/s]Saving model checkpoint to ./snips_clf/results/checkpoint-656
Configuration saved in ./snips_clf/results/checkpoint-656/config.json
Model weights saved in ./snips_clf/results/checkpoint-656/model.safetensors


{'eval_loss': 0.06383088231086731, 'eval_model_preparation_time': 0.0011, 'eval_accuracy': 0.9870080244554834, 'eval_runtime': 20.836, 'eval_samples_per_second': 125.6, 'eval_steps_per_second': 3.935, 'epoch': 2.0}




Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./snips_clf/results/checkpoint-656 (score: 0.06383088231086731).
100%|██████████| 656/656 [09:41<00:00,  1.13it/s]

{'train_runtime': 581.0366, 'train_samples_per_second': 36.029, 'train_steps_per_second': 1.129, 'train_loss': 0.6983654683004361, 'epoch': 2.0}





TrainOutput(global_step=656, training_loss=0.6983654683004361, metrics={'train_runtime': 581.0366, 'train_samples_per_second': 36.029, 'train_steps_per_second': 1.129, 'total_flos': 116893238379912.0, 'train_loss': 0.6983654683004361, 'epoch': 2.0})

In [22]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance, token_label, tokens. If utterance, token_label, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
100%|██████████| 82/82 [00:20<00:00,  3.97it/s]


{'eval_loss': 0.06383088231086731,
 'eval_model_preparation_time': 0.0011,
 'eval_accuracy': 0.9870080244554834,
 'eval_runtime': 20.8378,
 'eval_samples_per_second': 125.589,
 'eval_steps_per_second': 3.935,
 'epoch': 2.0}

In [23]:
pipe=pipeline('text-classification', seq_clf_model,tokenizer=tokenizer)
pipe('Add Two Coins by dispatch to my road trip playlist')

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'label': 'AddToPlaylist', 'score': 0.990362823009491}]

In [24]:
trainer.save_model()

Saving model checkpoint to ./snips_clf/results
Configuration saved in ./snips_clf/results/config.json
Model weights saved in ./snips_clf/results/model.safetensors


In [25]:
pipe=pipeline('text-classification', './snips_clf/results/',tokenizer=tokenizer)
pipe('Add Two Coins by dispatch to my road trip playlist')

loading configuration file ./snips_clf/results/config.json
Model config DistilBertConfig {
  "_name_or_path": "./snips_clf/results/",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "GetWeather",
    "1": "BookRestaurant",
    "2": "SearchCreativeWork",
    "3": "PlayMusic",
    "4": "SearchScreeningEvent",
    "5": "AddToPlaylist",
    "6": "RateBook"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transfo

[{'label': 'AddToPlaylist', 'score': 0.990362823009491}]

In [36]:
frozen_sequence_clf_model=DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',num_labels=len(unique_sequence_labels))
frozen_sequence_clf_model.config.id2label={i:l for i,l in enumerate(unique_sequence_labels)}

loading configuration file config.json from cache at /Users/vishalsankarram/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/12040accade4e8a0f71eabdb258fecc2e7e948be/config.json
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.46.2",
  "vocab_size": 

In [37]:
for param in frozen_sequence_clf_model.distilbert.parameters():
    param.requires_grad = False
for param in frozen_sequence_clf_model.classifier.parameters():
    param.requires_grad = True

In [38]:
epochs = 2
warmup_steps = len(seq_clf_tokenized_snips['train']) // 5
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

training_arguments = TrainingArguments(
    output_dir='./snips_clf/results',
    num_train_epochs=epochs,
    per_device_eval_batch_size=32,
    per_device_train_batch_size=32,
    load_best_model_at_end=True,
    warmup_steps=warmup_steps,
    weight_decay=0.05,
    logging_steps=1,
    log_level="info",
    eval_strategy='epoch',
    save_strategy='epoch',
    no_cuda=True
)

frozen_sequence_clf_model.to(device)
trainer=Trainer(model=frozen_sequence_clf_model,args=training_arguments,train_dataset=seq_clf_tokenized_snips['train'],eval_dataset=seq_clf_tokenized_snips['test'],compute_metrics=compute_matrics,data_collator=data_collator)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [34]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance, token_label, tokens. If utterance, token_label, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
100%|██████████| 82/82 [00:16<00:00,  5.11it/s]


{'eval_loss': 1.9490116834640503,
 'eval_model_preparation_time': 0.0018,
 'eval_accuracy': 0.14902560183416125,
 'eval_runtime': 16.2085,
 'eval_samples_per_second': 161.458,
 'eval_steps_per_second': 5.059}

In [39]:
trainer.train()

  0%|          | 0/656 [06:53<?, ?it/s]
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance, token_label, tokens. If utterance, token_label, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10,467
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 656
  Number of trainable parameters = 595,975
                                               
  0%|          | 1/656 [00:00<02:17,  4.76it/s]

{'loss': 1.964, 'grad_norm': 1.3711175918579102, 'learning_rate': 2.3889154323936934e-08, 'epoch': 0.0}


                                               
  0%|          | 2/656 [00:00<02:50,  3.84it/s]

{'loss': 1.9509, 'grad_norm': 0.9557554721832275, 'learning_rate': 4.777830864787387e-08, 'epoch': 0.01}


                                               
  0%|          | 3/656 [00:00<02:36,  4.16it/s]

{'loss': 1.9705, 'grad_norm': 1.3226507902145386, 'learning_rate': 7.16674629718108e-08, 'epoch': 0.01}


                                               
                                               
  1%|          | 5/656 [00:01<02:23,  4.54it/s]

{'loss': 1.9679, 'grad_norm': 1.2126619815826416, 'learning_rate': 9.555661729574773e-08, 'epoch': 0.01}
{'loss': 1.9716, 'grad_norm': 1.6875144243240356, 'learning_rate': 1.1944577161968468e-07, 'epoch': 0.02}


                                               
                                               
  1%|          | 7/656 [00:01<02:11,  4.95it/s]

{'loss': 1.9137, 'grad_norm': 1.6609488725662231, 'learning_rate': 1.433349259436216e-07, 'epoch': 0.02}
{'loss': 1.9693, 'grad_norm': 1.464619517326355, 'learning_rate': 1.6722408026755853e-07, 'epoch': 0.02}


                                               
  1%|          | 8/656 [00:01<02:09,  5.00it/s]

{'loss': 1.9355, 'grad_norm': 1.0031709671020508, 'learning_rate': 1.9111323459149547e-07, 'epoch': 0.02}


                                               
                                                
  2%|▏         | 10/656 [00:02<02:11,  4.93it/s]

{'loss': 1.9707, 'grad_norm': 1.0861058235168457, 'learning_rate': 2.150023889154324e-07, 'epoch': 0.03}
{'loss': 1.9608, 'grad_norm': 1.4388827085494995, 'learning_rate': 2.3889154323936937e-07, 'epoch': 0.03}


                                                
                                                
  2%|▏         | 12/656 [00:02<02:06,  5.10it/s]

{'loss': 1.9522, 'grad_norm': 1.1985535621643066, 'learning_rate': 2.6278069756330625e-07, 'epoch': 0.03}
{'loss': 1.9513, 'grad_norm': 0.8164734840393066, 'learning_rate': 2.866698518872432e-07, 'epoch': 0.04}


                                                
  2%|▏         | 13/656 [00:02<02:09,  4.95it/s]

{'loss': 1.9415, 'grad_norm': 1.4343341588974, 'learning_rate': 3.1055900621118013e-07, 'epoch': 0.04}


                                                
  2%|▏         | 14/656 [00:02<02:12,  4.84it/s]

{'loss': 1.9719, 'grad_norm': 1.9488933086395264, 'learning_rate': 3.3444816053511706e-07, 'epoch': 0.04}


                                                
  2%|▏         | 16/656 [00:03<02:14,  4.76it/s]

{'loss': 1.9536, 'grad_norm': 1.1107503175735474, 'learning_rate': 3.58337314859054e-07, 'epoch': 0.05}


                                                
  2%|▏         | 16/656 [00:03<02:14,  4.76it/s]

{'loss': 1.9678, 'grad_norm': 1.5275554656982422, 'learning_rate': 3.8222646918299094e-07, 'epoch': 0.05}


                                                
  3%|▎         | 17/656 [00:03<02:42,  3.93it/s]

{'loss': 1.9375, 'grad_norm': 1.1797666549682617, 'learning_rate': 4.0611562350692793e-07, 'epoch': 0.05}


                                                
  3%|▎         | 18/656 [00:03<02:35,  4.11it/s]

{'loss': 1.9589, 'grad_norm': 1.285291075706482, 'learning_rate': 4.300047778308648e-07, 'epoch': 0.05}


                                                
                                                
  3%|▎         | 20/656 [00:04<02:27,  4.30it/s]

{'loss': 1.9234, 'grad_norm': 1.615831971168518, 'learning_rate': 4.5389393215480175e-07, 'epoch': 0.06}
{'loss': 1.9652, 'grad_norm': 1.552363395690918, 'learning_rate': 4.777830864787387e-07, 'epoch': 0.06}


                                                
  3%|▎         | 22/656 [00:04<02:27,  4.31it/s]

{'loss': 1.9466, 'grad_norm': 0.9806542992591858, 'learning_rate': 5.016722408026756e-07, 'epoch': 0.06}


                                                
                                                
  4%|▎         | 23/656 [00:05<02:18,  4.58it/s]

{'loss': 1.9717, 'grad_norm': 1.5728143453598022, 'learning_rate': 5.255613951266125e-07, 'epoch': 0.07}
{'loss': 1.9577, 'grad_norm': 1.1769094467163086, 'learning_rate': 5.494505494505495e-07, 'epoch': 0.07}


                                                
  4%|▍         | 25/656 [00:05<02:10,  4.83it/s]

{'loss': 1.966, 'grad_norm': 1.564102292060852, 'learning_rate': 5.733397037744864e-07, 'epoch': 0.07}


                                                
  4%|▍         | 25/656 [00:05<02:10,  4.83it/s]

{'loss': 1.9229, 'grad_norm': 1.5221962928771973, 'learning_rate': 5.972288580984234e-07, 'epoch': 0.08}


                                                
                                                
  4%|▍         | 27/656 [00:05<02:14,  4.66it/s]

{'loss': 1.9598, 'grad_norm': 1.6290781497955322, 'learning_rate': 6.211180124223603e-07, 'epoch': 0.08}
{'loss': 1.9543, 'grad_norm': 1.763266921043396, 'learning_rate': 6.450071667462972e-07, 'epoch': 0.08}


                                                
  4%|▍         | 28/656 [00:06<02:10,  4.81it/s]

{'loss': 1.9523, 'grad_norm': 1.2301162481307983, 'learning_rate': 6.688963210702341e-07, 'epoch': 0.09}


                                                
  4%|▍         | 29/656 [00:06<02:15,  4.63it/s]

{'loss': 1.9599, 'grad_norm': 1.308971643447876, 'learning_rate': 6.92785475394171e-07, 'epoch': 0.09}


                                                
  5%|▍         | 30/656 [00:06<02:23,  4.35it/s]

{'loss': 1.9631, 'grad_norm': 1.4438406229019165, 'learning_rate': 7.16674629718108e-07, 'epoch': 0.09}


                                                
                                                
  5%|▍         | 32/656 [00:07<02:30,  4.16it/s]

{'loss': 1.959, 'grad_norm': 1.6956051588058472, 'learning_rate': 7.405637840420449e-07, 'epoch': 0.09}
{'loss': 1.9602, 'grad_norm': 1.5298861265182495, 'learning_rate': 7.644529383659819e-07, 'epoch': 0.1}


                                                
  5%|▌         | 33/656 [00:07<02:17,  4.52it/s]

{'loss': 1.9373, 'grad_norm': 1.6539305448532104, 'learning_rate': 7.883420926899189e-07, 'epoch': 0.1}


                                                
  5%|▌         | 34/656 [00:07<02:32,  4.09it/s]

{'loss': 1.9591, 'grad_norm': 1.364108920097351, 'learning_rate': 8.122312470138559e-07, 'epoch': 0.1}


                                                
  5%|▌         | 35/656 [00:07<02:25,  4.27it/s]

{'loss': 1.9349, 'grad_norm': 1.5173109769821167, 'learning_rate': 8.361204013377926e-07, 'epoch': 0.11}


                                                
                                                
  6%|▌         | 37/656 [00:08<02:14,  4.60it/s]

{'loss': 1.9633, 'grad_norm': 1.32539963722229, 'learning_rate': 8.600095556617296e-07, 'epoch': 0.11}
{'loss': 1.939, 'grad_norm': 1.147833228111267, 'learning_rate': 8.838987099856666e-07, 'epoch': 0.11}


                                                
  6%|▌         | 38/656 [00:08<02:14,  4.58it/s]

{'loss': 1.9571, 'grad_norm': 1.3975911140441895, 'learning_rate': 9.077878643096035e-07, 'epoch': 0.12}


                                                
  6%|▌         | 39/656 [00:08<02:12,  4.65it/s]

{'loss': 1.9378, 'grad_norm': 1.2861378192901611, 'learning_rate': 9.316770186335405e-07, 'epoch': 0.12}


                                                
  6%|▌         | 40/656 [00:08<02:15,  4.56it/s]

{'loss': 1.9561, 'grad_norm': 1.009400725364685, 'learning_rate': 9.555661729574775e-07, 'epoch': 0.12}


                                                
  6%|▋         | 42/656 [00:09<02:16,  4.49it/s]

{'loss': 1.9524, 'grad_norm': 1.04463791847229, 'learning_rate': 9.794553272814141e-07, 'epoch': 0.12}


                                                
  6%|▋         | 42/656 [00:09<02:16,  4.49it/s]

{'loss': 1.9308, 'grad_norm': 1.1704472303390503, 'learning_rate': 1.0033444816053512e-06, 'epoch': 0.13}


                                                
                                                
  7%|▋         | 44/656 [00:09<02:15,  4.50it/s]

{'loss': 1.9669, 'grad_norm': 1.1997417211532593, 'learning_rate': 1.0272336359292883e-06, 'epoch': 0.13}
{'loss': 1.9454, 'grad_norm': 1.239569067955017, 'learning_rate': 1.051122790253225e-06, 'epoch': 0.13}


                                                
  7%|▋         | 45/656 [00:10<02:30,  4.06it/s]

{'loss': 1.9598, 'grad_norm': 1.6386795043945312, 'learning_rate': 1.0750119445771621e-06, 'epoch': 0.14}


                                                
  7%|▋         | 46/656 [00:10<02:23,  4.26it/s]

{'loss': 1.942, 'grad_norm': 1.1012108325958252, 'learning_rate': 1.098901098901099e-06, 'epoch': 0.14}


                                                
  7%|▋         | 47/656 [00:10<02:17,  4.43it/s]

{'loss': 1.9529, 'grad_norm': 1.1544840335845947, 'learning_rate': 1.1227902532250359e-06, 'epoch': 0.14}


                                                
                                                
  7%|▋         | 49/656 [00:10<02:17,  4.41it/s]

{'loss': 1.9361, 'grad_norm': 1.5348585844039917, 'learning_rate': 1.1466794075489728e-06, 'epoch': 0.15}
{'loss': 1.9345, 'grad_norm': 1.512559413909912, 'learning_rate': 1.1705685618729096e-06, 'epoch': 0.15}


                                                
                                                
  8%|▊         | 51/656 [00:11<02:07,  4.76it/s]

{'loss': 1.964, 'grad_norm': 0.9194754362106323, 'learning_rate': 1.1944577161968467e-06, 'epoch': 0.15}
{'loss': 1.9559, 'grad_norm': 1.3952308893203735, 'learning_rate': 1.2183468705207836e-06, 'epoch': 0.16}


                                                
  8%|▊         | 52/656 [00:11<02:03,  4.89it/s]

{'loss': 1.9377, 'grad_norm': 1.5212254524230957, 'learning_rate': 1.2422360248447205e-06, 'epoch': 0.16}


                                                
                                                
  8%|▊         | 54/656 [00:11<02:04,  4.83it/s]

{'loss': 1.9272, 'grad_norm': 1.0074505805969238, 'learning_rate': 1.2661251791686574e-06, 'epoch': 0.16}
{'loss': 1.9607, 'grad_norm': 0.9066425561904907, 'learning_rate': 1.2900143334925945e-06, 'epoch': 0.16}


                                                
  8%|▊         | 55/656 [00:12<01:58,  5.09it/s]

{'loss': 1.937, 'grad_norm': 1.1873725652694702, 'learning_rate': 1.3139034878165314e-06, 'epoch': 0.17}


                                                
  9%|▊         | 56/656 [00:12<02:04,  4.80it/s]

{'loss': 1.9579, 'grad_norm': 1.39696204662323, 'learning_rate': 1.3377926421404683e-06, 'epoch': 0.17}


                                                
  9%|▊         | 57/656 [00:12<02:25,  4.11it/s]

{'loss': 1.9506, 'grad_norm': 1.5663447380065918, 'learning_rate': 1.3616817964644054e-06, 'epoch': 0.17}


                                                
  9%|▉         | 58/656 [00:12<02:24,  4.14it/s]

{'loss': 1.9705, 'grad_norm': 1.6580582857131958, 'learning_rate': 1.385570950788342e-06, 'epoch': 0.18}


                                                
  9%|▉         | 59/656 [00:13<02:25,  4.10it/s]

{'loss': 1.9615, 'grad_norm': 1.6606850624084473, 'learning_rate': 1.4094601051122791e-06, 'epoch': 0.18}


                                                
  9%|▉         | 60/656 [00:13<02:22,  4.19it/s]

{'loss': 1.9687, 'grad_norm': 1.5178333520889282, 'learning_rate': 1.433349259436216e-06, 'epoch': 0.18}


                                                
  9%|▉         | 61/656 [00:13<02:17,  4.34it/s]

{'loss': 1.9189, 'grad_norm': 1.1427208185195923, 'learning_rate': 1.4572384137601529e-06, 'epoch': 0.19}


                                                
  9%|▉         | 62/656 [00:13<02:13,  4.46it/s]

{'loss': 1.9433, 'grad_norm': 1.4392653703689575, 'learning_rate': 1.4811275680840898e-06, 'epoch': 0.19}


                                                
 10%|▉         | 63/656 [00:14<02:15,  4.37it/s]

{'loss': 1.9721, 'grad_norm': 1.9306824207305908, 'learning_rate': 1.5050167224080269e-06, 'epoch': 0.19}


                                                
 10%|▉         | 64/656 [00:14<02:11,  4.50it/s]

{'loss': 1.9542, 'grad_norm': 1.0568618774414062, 'learning_rate': 1.5289058767319638e-06, 'epoch': 0.2}


                                                
 10%|▉         | 65/656 [00:14<02:17,  4.30it/s]

{'loss': 1.9598, 'grad_norm': 1.3369101285934448, 'learning_rate': 1.5527950310559006e-06, 'epoch': 0.2}


                                                
 10%|█         | 66/656 [00:14<02:15,  4.36it/s]

{'loss': 1.942, 'grad_norm': 1.3246012926101685, 'learning_rate': 1.5766841853798377e-06, 'epoch': 0.2}


                                                
 10%|█         | 67/656 [00:15<02:23,  4.10it/s]

{'loss': 1.9252, 'grad_norm': 1.1763625144958496, 'learning_rate': 1.6005733397037744e-06, 'epoch': 0.2}


                                                
                                                
 11%|█         | 69/656 [00:15<02:13,  4.39it/s]

{'loss': 1.9573, 'grad_norm': 1.264095425605774, 'learning_rate': 1.6244624940277117e-06, 'epoch': 0.21}
{'loss': 1.9656, 'grad_norm': 1.155953049659729, 'learning_rate': 1.6483516483516484e-06, 'epoch': 0.21}


                                                
 11%|█         | 70/656 [00:15<02:33,  3.81it/s]

{'loss': 1.9463, 'grad_norm': 1.704691767692566, 'learning_rate': 1.6722408026755853e-06, 'epoch': 0.21}


                                                
 11%|█         | 71/656 [00:16<02:29,  3.91it/s]

{'loss': 1.9881, 'grad_norm': 1.763789415359497, 'learning_rate': 1.6961299569995224e-06, 'epoch': 0.22}


                                                
 11%|█         | 72/656 [00:16<02:23,  4.08it/s]

{'loss': 1.9534, 'grad_norm': 1.2719624042510986, 'learning_rate': 1.7200191113234592e-06, 'epoch': 0.22}


                                                
                                                
 11%|█▏        | 74/656 [00:16<02:23,  4.04it/s]

{'loss': 1.9386, 'grad_norm': 1.4490054845809937, 'learning_rate': 1.7439082656473961e-06, 'epoch': 0.22}
{'loss': 1.9705, 'grad_norm': 0.8963138461112976, 'learning_rate': 1.7677974199713332e-06, 'epoch': 0.23}


                                                
 11%|█▏        | 75/656 [00:17<02:13,  4.35it/s]

{'loss': 1.95, 'grad_norm': 1.0932731628417969, 'learning_rate': 1.7916865742952701e-06, 'epoch': 0.23}


                                                
 12%|█▏        | 76/656 [00:17<02:09,  4.46it/s]

{'loss': 1.9618, 'grad_norm': 1.2783031463623047, 'learning_rate': 1.815575728619207e-06, 'epoch': 0.23}


                                                
 12%|█▏        | 77/656 [00:17<02:26,  3.97it/s]

{'loss': 1.9195, 'grad_norm': 0.9795290231704712, 'learning_rate': 1.839464882943144e-06, 'epoch': 0.23}


                                                
 12%|█▏        | 78/656 [00:17<02:25,  3.98it/s]

{'loss': 1.9425, 'grad_norm': 1.0382113456726074, 'learning_rate': 1.863354037267081e-06, 'epoch': 0.24}


                                                
 12%|█▏        | 80/656 [00:18<02:14,  4.27it/s]

{'loss': 1.9762, 'grad_norm': 1.6749162673950195, 'learning_rate': 1.8872431915910176e-06, 'epoch': 0.24}


                                                
 12%|█▏        | 80/656 [00:18<02:14,  4.27it/s]

{'loss': 1.9404, 'grad_norm': 1.3459988832473755, 'learning_rate': 1.911132345914955e-06, 'epoch': 0.24}


                                                
                                                
 12%|█▎        | 82/656 [00:18<02:07,  4.50it/s]

{'loss': 1.9351, 'grad_norm': 1.1421574354171753, 'learning_rate': 1.935021500238892e-06, 'epoch': 0.25}
{'loss': 1.9394, 'grad_norm': 0.9161821007728577, 'learning_rate': 1.9589106545628283e-06, 'epoch': 0.25}


                                                
 13%|█▎        | 83/656 [00:18<02:04,  4.62it/s]

{'loss': 1.9347, 'grad_norm': 1.4945777654647827, 'learning_rate': 1.9827998088867656e-06, 'epoch': 0.25}


                                                
 13%|█▎        | 84/656 [00:19<02:05,  4.57it/s]

{'loss': 1.9194, 'grad_norm': 1.738227367401123, 'learning_rate': 2.0066889632107025e-06, 'epoch': 0.26}


                                                
 13%|█▎        | 85/656 [00:19<02:05,  4.55it/s]

{'loss': 1.9562, 'grad_norm': 1.4486255645751953, 'learning_rate': 2.0305781175346394e-06, 'epoch': 0.26}


                                                
 13%|█▎        | 86/656 [00:19<02:07,  4.45it/s]

{'loss': 1.9613, 'grad_norm': 1.0935585498809814, 'learning_rate': 2.0544672718585767e-06, 'epoch': 0.26}


                                                
 13%|█▎        | 88/656 [00:19<02:02,  4.63it/s]

{'loss': 1.9497, 'grad_norm': 0.9590843915939331, 'learning_rate': 2.078356426182513e-06, 'epoch': 0.27}


                                                
 13%|█▎        | 88/656 [00:19<02:02,  4.63it/s]

{'loss': 1.9428, 'grad_norm': 1.0464359521865845, 'learning_rate': 2.10224558050645e-06, 'epoch': 0.27}


                                                
 14%|█▎        | 89/656 [00:20<02:21,  4.01it/s]

{'loss': 1.9202, 'grad_norm': 1.2356802225112915, 'learning_rate': 2.1261347348303873e-06, 'epoch': 0.27}


                                                
 14%|█▎        | 90/656 [00:20<02:28,  3.80it/s]

{'loss': 1.9296, 'grad_norm': 1.9193886518478394, 'learning_rate': 2.1500238891543242e-06, 'epoch': 0.27}


                                                
 14%|█▍        | 91/656 [00:20<02:29,  3.77it/s]

{'loss': 1.932, 'grad_norm': 1.4523621797561646, 'learning_rate': 2.173913043478261e-06, 'epoch': 0.28}


                                                
                                                
 14%|█▍        | 93/656 [00:21<02:13,  4.22it/s]

{'loss': 1.9332, 'grad_norm': 1.1898516416549683, 'learning_rate': 2.197802197802198e-06, 'epoch': 0.28}
{'loss': 1.947, 'grad_norm': 1.6017898321151733, 'learning_rate': 2.221691352126135e-06, 'epoch': 0.28}


                                                
 14%|█▍        | 94/656 [00:21<02:10,  4.32it/s]

{'loss': 1.9523, 'grad_norm': 1.2188048362731934, 'learning_rate': 2.2455805064500718e-06, 'epoch': 0.29}


                                                
 14%|█▍        | 95/656 [00:21<02:13,  4.22it/s]

{'loss': 1.9576, 'grad_norm': 1.3280973434448242, 'learning_rate': 2.269469660774009e-06, 'epoch': 0.29}


                                                
 15%|█▍        | 96/656 [00:22<02:15,  4.12it/s]

{'loss': 1.9416, 'grad_norm': 1.1748191118240356, 'learning_rate': 2.2933588150979455e-06, 'epoch': 0.29}


                                                
 15%|█▍        | 97/656 [00:22<02:17,  4.07it/s]

{'loss': 1.9092, 'grad_norm': 1.1640369892120361, 'learning_rate': 2.3172479694218824e-06, 'epoch': 0.3}


                                                
 15%|█▍        | 98/656 [00:22<02:18,  4.03it/s]

{'loss': 1.929, 'grad_norm': 1.5507471561431885, 'learning_rate': 2.3411371237458193e-06, 'epoch': 0.3}


                                                
 15%|█▌        | 99/656 [00:22<02:16,  4.08it/s]

{'loss': 1.959, 'grad_norm': 1.5454245805740356, 'learning_rate': 2.3650262780697566e-06, 'epoch': 0.3}


                                                 
 15%|█▌        | 100/656 [00:22<02:13,  4.17it/s]

{'loss': 1.9552, 'grad_norm': 1.348003625869751, 'learning_rate': 2.3889154323936935e-06, 'epoch': 0.3}


                                                 
 15%|█▌        | 101/656 [00:23<02:19,  3.98it/s]

{'loss': 1.9386, 'grad_norm': 1.4089574813842773, 'learning_rate': 2.41280458671763e-06, 'epoch': 0.31}


                                                 
 16%|█▌        | 102/656 [00:23<02:31,  3.66it/s]

{'loss': 1.9172, 'grad_norm': 1.3868341445922852, 'learning_rate': 2.4366937410415673e-06, 'epoch': 0.31}


                                                 
 16%|█▌        | 103/656 [00:23<02:40,  3.45it/s]

{'loss': 1.9382, 'grad_norm': 1.4688465595245361, 'learning_rate': 2.460582895365504e-06, 'epoch': 0.31}


                                                 
 16%|█▌        | 104/656 [00:24<03:09,  2.91it/s]

{'loss': 1.916, 'grad_norm': 1.246717929840088, 'learning_rate': 2.484472049689441e-06, 'epoch': 0.32}


                                                 
 16%|█▌        | 105/656 [00:24<02:59,  3.07it/s]

{'loss': 1.9667, 'grad_norm': 1.4175851345062256, 'learning_rate': 2.508361204013378e-06, 'epoch': 0.32}


                                                 
 16%|█▌        | 106/656 [00:24<02:43,  3.37it/s]

{'loss': 1.9445, 'grad_norm': 1.0800607204437256, 'learning_rate': 2.5322503583373148e-06, 'epoch': 0.32}


                                                 
 16%|█▋        | 107/656 [00:25<02:31,  3.62it/s]

{'loss': 1.9356, 'grad_norm': 1.1133651733398438, 'learning_rate': 2.5561395126612517e-06, 'epoch': 0.33}


                                                 
                                                 
 17%|█▋        | 109/656 [00:25<02:15,  4.05it/s]

{'loss': 1.961, 'grad_norm': 1.3972009420394897, 'learning_rate': 2.580028666985189e-06, 'epoch': 0.33}
{'loss': 1.9313, 'grad_norm': 1.5319616794586182, 'learning_rate': 2.603917821309126e-06, 'epoch': 0.33}


                                                 
                                                 
 17%|█▋        | 111/656 [00:25<01:54,  4.74it/s]

{'loss': 1.9327, 'grad_norm': 1.1764622926712036, 'learning_rate': 2.6278069756330627e-06, 'epoch': 0.34}
{'loss': 1.9667, 'grad_norm': 1.1586823463439941, 'learning_rate': 2.6516961299569996e-06, 'epoch': 0.34}


                                                 
 17%|█▋        | 112/656 [00:26<01:56,  4.66it/s]

{'loss': 1.9356, 'grad_norm': 1.2909737825393677, 'learning_rate': 2.6755852842809365e-06, 'epoch': 0.34}


                                                 
 17%|█▋        | 113/656 [00:26<01:58,  4.59it/s]

{'loss': 1.9265, 'grad_norm': 1.1683387756347656, 'learning_rate': 2.6994744386048734e-06, 'epoch': 0.34}


                                                 
 17%|█▋        | 114/656 [00:26<02:05,  4.31it/s]

{'loss': 1.9565, 'grad_norm': 1.0066723823547363, 'learning_rate': 2.7233635929288107e-06, 'epoch': 0.35}


                                                 
 18%|█▊        | 115/656 [00:26<02:01,  4.47it/s]

{'loss': 1.9625, 'grad_norm': 1.2337268590927124, 'learning_rate': 2.747252747252747e-06, 'epoch': 0.35}


                                                 
 18%|█▊        | 116/656 [00:27<02:08,  4.19it/s]

{'loss': 1.9627, 'grad_norm': 1.68080472946167, 'learning_rate': 2.771141901576684e-06, 'epoch': 0.35}


                                                 
                                                 
 18%|█▊        | 118/656 [00:27<01:55,  4.65it/s]

{'loss': 1.9489, 'grad_norm': 1.666199803352356, 'learning_rate': 2.7950310559006214e-06, 'epoch': 0.36}
{'loss': 1.9383, 'grad_norm': 1.6506503820419312, 'learning_rate': 2.8189202102245582e-06, 'epoch': 0.36}


                                                 
                                                 
 18%|█▊        | 120/656 [00:27<01:47,  5.00it/s]

{'loss': 1.9136, 'grad_norm': 1.6431633234024048, 'learning_rate': 2.842809364548495e-06, 'epoch': 0.36}
{'loss': 1.9777, 'grad_norm': 1.535509705543518, 'learning_rate': 2.866698518872432e-06, 'epoch': 0.37}


                                                 
 18%|█▊        | 121/656 [00:28<01:51,  4.80it/s]

{'loss': 1.9374, 'grad_norm': 1.2193336486816406, 'learning_rate': 2.890587673196369e-06, 'epoch': 0.37}


                                                 
 19%|█▊        | 122/656 [00:28<01:52,  4.74it/s]

{'loss': 1.9307, 'grad_norm': 1.1560050249099731, 'learning_rate': 2.9144768275203058e-06, 'epoch': 0.37}


                                                 
 19%|█▉        | 123/656 [00:28<01:54,  4.65it/s]

{'loss': 1.9665, 'grad_norm': 1.3270983695983887, 'learning_rate': 2.938365981844243e-06, 'epoch': 0.38}


                                                 
 19%|█▉        | 124/656 [00:28<02:07,  4.18it/s]

{'loss': 1.9437, 'grad_norm': 1.3106106519699097, 'learning_rate': 2.9622551361681795e-06, 'epoch': 0.38}


                                                 
                                                 
 19%|█▉        | 126/656 [00:29<02:00,  4.42it/s]

{'loss': 1.9414, 'grad_norm': 1.0404988527297974, 'learning_rate': 2.9861442904921164e-06, 'epoch': 0.38}
{'loss': 1.9662, 'grad_norm': 1.498659610748291, 'learning_rate': 3.0100334448160537e-06, 'epoch': 0.38}


                                                 
 19%|█▉        | 127/656 [00:29<02:03,  4.30it/s]

{'loss': 1.9447, 'grad_norm': 1.4169285297393799, 'learning_rate': 3.0339225991399906e-06, 'epoch': 0.39}


                                                 
 20%|█▉        | 128/656 [00:29<02:10,  4.04it/s]

{'loss': 1.9369, 'grad_norm': 1.278685450553894, 'learning_rate': 3.0578117534639275e-06, 'epoch': 0.39}


                                                 
 20%|█▉        | 129/656 [00:30<02:07,  4.14it/s]

{'loss': 1.9659, 'grad_norm': 1.519646167755127, 'learning_rate': 3.0817009077878644e-06, 'epoch': 0.39}


                                                 
 20%|█▉        | 130/656 [00:30<02:02,  4.30it/s]

{'loss': 1.9707, 'grad_norm': 1.1976779699325562, 'learning_rate': 3.1055900621118013e-06, 'epoch': 0.4}


                                                 
 20%|█▉        | 131/656 [00:30<02:02,  4.28it/s]

{'loss': 1.9278, 'grad_norm': 1.4103926420211792, 'learning_rate': 3.1294792164357386e-06, 'epoch': 0.4}


                                                 
 20%|██        | 132/656 [00:30<02:16,  3.83it/s]

{'loss': 1.9393, 'grad_norm': 1.327501654624939, 'learning_rate': 3.1533683707596755e-06, 'epoch': 0.4}


                                                 
 20%|██        | 133/656 [00:31<02:34,  3.39it/s]

{'loss': 1.9349, 'grad_norm': 1.650508165359497, 'learning_rate': 3.1772575250836123e-06, 'epoch': 0.41}


                                                 
 20%|██        | 134/656 [00:31<03:03,  2.84it/s]

{'loss': 1.9632, 'grad_norm': 1.4222053289413452, 'learning_rate': 3.201146679407549e-06, 'epoch': 0.41}


                                                 
 21%|██        | 135/656 [00:32<03:21,  2.58it/s]

{'loss': 1.919, 'grad_norm': 1.0830527544021606, 'learning_rate': 3.2250358337314857e-06, 'epoch': 0.41}


                                                 
 21%|██        | 136/656 [00:32<03:23,  2.56it/s]

{'loss': 1.9191, 'grad_norm': 1.4152264595031738, 'learning_rate': 3.2489249880554234e-06, 'epoch': 0.41}


                                                 
 21%|██        | 137/656 [00:32<03:19,  2.60it/s]

{'loss': 1.9262, 'grad_norm': 1.3074424266815186, 'learning_rate': 3.2728141423793603e-06, 'epoch': 0.42}


                                                 
 21%|██        | 138/656 [00:33<03:10,  2.71it/s]

{'loss': 1.9524, 'grad_norm': 1.1531968116760254, 'learning_rate': 3.2967032967032968e-06, 'epoch': 0.42}


                                                 
 21%|██        | 139/656 [00:33<03:18,  2.60it/s]

{'loss': 1.951, 'grad_norm': 1.1238359212875366, 'learning_rate': 3.3205924510272337e-06, 'epoch': 0.42}


                                                 
 21%|██▏       | 140/656 [00:33<03:07,  2.75it/s]

{'loss': 1.9308, 'grad_norm': 1.4544055461883545, 'learning_rate': 3.3444816053511705e-06, 'epoch': 0.43}


                                                 
 21%|██▏       | 141/656 [00:34<03:07,  2.75it/s]

{'loss': 1.9297, 'grad_norm': 1.2680120468139648, 'learning_rate': 3.3683707596751074e-06, 'epoch': 0.43}


                                                 
 22%|██▏       | 142/656 [00:34<03:25,  2.51it/s]

{'loss': 1.9596, 'grad_norm': 1.263702630996704, 'learning_rate': 3.3922599139990447e-06, 'epoch': 0.43}


                                                 
 22%|██▏       | 143/656 [00:35<03:01,  2.83it/s]

{'loss': 1.9429, 'grad_norm': 1.379124641418457, 'learning_rate': 3.4161490683229816e-06, 'epoch': 0.44}


                                                 
 22%|██▏       | 144/656 [00:35<02:38,  3.22it/s]

{'loss': 1.9514, 'grad_norm': 1.3457105159759521, 'learning_rate': 3.4400382226469185e-06, 'epoch': 0.44}


                                                 
 22%|██▏       | 145/656 [00:35<02:34,  3.32it/s]

{'loss': 1.9489, 'grad_norm': 1.873889684677124, 'learning_rate': 3.4639273769708554e-06, 'epoch': 0.44}


                                                 
 22%|██▏       | 146/656 [00:35<02:24,  3.52it/s]

{'loss': 1.9592, 'grad_norm': 1.6736825704574585, 'learning_rate': 3.4878165312947923e-06, 'epoch': 0.45}


                                                 
 22%|██▏       | 147/656 [00:36<02:24,  3.53it/s]

{'loss': 1.9471, 'grad_norm': 1.1978609561920166, 'learning_rate': 3.511705685618729e-06, 'epoch': 0.45}


                                                 
 23%|██▎       | 148/656 [00:36<02:12,  3.85it/s]

{'loss': 1.8954, 'grad_norm': 1.6475557088851929, 'learning_rate': 3.5355948399426665e-06, 'epoch': 0.45}


                                                 
 23%|██▎       | 149/656 [00:36<02:09,  3.90it/s]

{'loss': 1.9262, 'grad_norm': 1.5403625965118408, 'learning_rate': 3.5594839942666033e-06, 'epoch': 0.45}


                                                 
 23%|██▎       | 150/656 [00:36<02:10,  3.87it/s]

{'loss': 1.943, 'grad_norm': 1.0248980522155762, 'learning_rate': 3.5833731485905402e-06, 'epoch': 0.46}


                                                 
 23%|██▎       | 151/656 [00:37<02:28,  3.39it/s]

{'loss': 1.9333, 'grad_norm': 0.9386250376701355, 'learning_rate': 3.607262302914477e-06, 'epoch': 0.46}


                                                 
 23%|██▎       | 152/656 [00:37<02:34,  3.26it/s]

{'loss': 1.9435, 'grad_norm': 1.1494344472885132, 'learning_rate': 3.631151457238414e-06, 'epoch': 0.46}


                                                 
 23%|██▎       | 153/656 [00:37<02:42,  3.09it/s]

{'loss': 1.9375, 'grad_norm': 1.1135448217391968, 'learning_rate': 3.6550406115623505e-06, 'epoch': 0.47}


                                                 
 23%|██▎       | 154/656 [00:38<02:29,  3.36it/s]

{'loss': 1.9194, 'grad_norm': 1.331114411354065, 'learning_rate': 3.678929765886288e-06, 'epoch': 0.47}


                                                 
 24%|██▎       | 155/656 [00:38<02:31,  3.30it/s]

{'loss': 1.9154, 'grad_norm': 0.9856305122375488, 'learning_rate': 3.702818920210225e-06, 'epoch': 0.47}


                                                 
 24%|██▍       | 156/656 [00:38<02:24,  3.47it/s]

{'loss': 1.9309, 'grad_norm': 1.0766606330871582, 'learning_rate': 3.726708074534162e-06, 'epoch': 0.48}


                                                 
 24%|██▍       | 157/656 [00:38<02:20,  3.54it/s]

{'loss': 1.9362, 'grad_norm': 1.2400634288787842, 'learning_rate': 3.7505972288580984e-06, 'epoch': 0.48}


                                                 
 24%|██▍       | 158/656 [00:39<02:10,  3.83it/s]

{'loss': 1.9265, 'grad_norm': 1.2488142251968384, 'learning_rate': 3.7744863831820353e-06, 'epoch': 0.48}


                                                 
 24%|██▍       | 159/656 [00:39<02:03,  4.04it/s]

{'loss': 1.9165, 'grad_norm': 1.475458025932312, 'learning_rate': 3.798375537505972e-06, 'epoch': 0.48}


                                                 
 24%|██▍       | 160/656 [00:39<01:56,  4.25it/s]

{'loss': 1.9251, 'grad_norm': 2.2612695693969727, 'learning_rate': 3.82226469182991e-06, 'epoch': 0.49}


                                                 
 25%|██▍       | 161/656 [00:39<01:58,  4.17it/s]

{'loss': 1.9433, 'grad_norm': 1.4659719467163086, 'learning_rate': 3.846153846153847e-06, 'epoch': 0.49}


                                                 
 25%|██▍       | 162/656 [00:40<02:08,  3.85it/s]

{'loss': 1.9148, 'grad_norm': 1.231339931488037, 'learning_rate': 3.870043000477784e-06, 'epoch': 0.49}


                                                 
 25%|██▍       | 163/656 [00:40<02:00,  4.10it/s]

{'loss': 1.9049, 'grad_norm': 1.1890664100646973, 'learning_rate': 3.8939321548017206e-06, 'epoch': 0.5}


                                                 
 25%|██▌       | 164/656 [00:40<02:10,  3.77it/s]

{'loss': 1.9236, 'grad_norm': 1.275304913520813, 'learning_rate': 3.917821309125657e-06, 'epoch': 0.5}


                                                 
 25%|██▌       | 165/656 [00:41<02:20,  3.50it/s]

{'loss': 1.9361, 'grad_norm': 1.2861908674240112, 'learning_rate': 3.9417104634495935e-06, 'epoch': 0.5}


                                                 
 25%|██▌       | 166/656 [00:41<02:09,  3.79it/s]

{'loss': 1.9002, 'grad_norm': 1.1675033569335938, 'learning_rate': 3.965599617773531e-06, 'epoch': 0.51}


                                                 
 25%|██▌       | 167/656 [00:41<02:31,  3.23it/s]

{'loss': 1.9386, 'grad_norm': 1.2124451398849487, 'learning_rate': 3.989488772097468e-06, 'epoch': 0.51}


                                                 
 26%|██▌       | 168/656 [00:41<02:25,  3.36it/s]

{'loss': 1.934, 'grad_norm': 1.2306849956512451, 'learning_rate': 4.013377926421405e-06, 'epoch': 0.51}


                                                 
 26%|██▌       | 169/656 [00:42<02:37,  3.08it/s]

{'loss': 1.9179, 'grad_norm': 0.9243804216384888, 'learning_rate': 4.037267080745342e-06, 'epoch': 0.52}


                                                 
 26%|██▌       | 170/656 [00:42<02:28,  3.26it/s]

{'loss': 1.9331, 'grad_norm': 1.4083541631698608, 'learning_rate': 4.061156235069279e-06, 'epoch': 0.52}


                                                 
 26%|██▌       | 171/656 [00:42<02:27,  3.28it/s]

{'loss': 1.926, 'grad_norm': 1.0739506483078003, 'learning_rate': 4.085045389393216e-06, 'epoch': 0.52}


                                                 
 26%|██▌       | 172/656 [00:43<02:27,  3.28it/s]

{'loss': 1.9184, 'grad_norm': 1.234028935432434, 'learning_rate': 4.108934543717153e-06, 'epoch': 0.52}


                                                 
 26%|██▋       | 173/656 [00:43<02:41,  2.99it/s]

{'loss': 1.8929, 'grad_norm': 1.9076085090637207, 'learning_rate': 4.132823698041089e-06, 'epoch': 0.53}


                                                 
 27%|██▋       | 174/656 [00:43<02:40,  3.01it/s]

{'loss': 1.9143, 'grad_norm': 1.4865617752075195, 'learning_rate': 4.156712852365026e-06, 'epoch': 0.53}


                                                 
 27%|██▋       | 175/656 [00:44<02:34,  3.12it/s]

{'loss': 1.9478, 'grad_norm': 1.521260380744934, 'learning_rate': 4.180602006688963e-06, 'epoch': 0.53}


                                                 
 27%|██▋       | 176/656 [00:44<02:43,  2.94it/s]

{'loss': 1.9181, 'grad_norm': 1.6222593784332275, 'learning_rate': 4.2044911610129e-06, 'epoch': 0.54}


                                                 
 27%|██▋       | 177/656 [00:44<02:45,  2.89it/s]

{'loss': 1.9499, 'grad_norm': 1.5014859437942505, 'learning_rate': 4.228380315336837e-06, 'epoch': 0.54}


                                                 
 27%|██▋       | 178/656 [00:45<02:35,  3.07it/s]

{'loss': 1.9155, 'grad_norm': 1.4211461544036865, 'learning_rate': 4.252269469660775e-06, 'epoch': 0.54}


                                                 
 27%|██▋       | 179/656 [00:45<02:33,  3.10it/s]

{'loss': 1.9299, 'grad_norm': 1.0524303913116455, 'learning_rate': 4.2761586239847116e-06, 'epoch': 0.55}


                                                 
 27%|██▋       | 180/656 [00:45<02:26,  3.25it/s]

{'loss': 1.9223, 'grad_norm': 1.3294856548309326, 'learning_rate': 4.3000477783086484e-06, 'epoch': 0.55}


                                                 
 28%|██▊       | 181/656 [00:46<02:28,  3.19it/s]

{'loss': 1.9363, 'grad_norm': 1.0895946025848389, 'learning_rate': 4.323936932632585e-06, 'epoch': 0.55}


                                                 
 28%|██▊       | 182/656 [00:46<02:27,  3.21it/s]

{'loss': 1.9475, 'grad_norm': 1.0638656616210938, 'learning_rate': 4.347826086956522e-06, 'epoch': 0.55}


                                                 
 28%|██▊       | 183/656 [00:46<02:32,  3.09it/s]

{'loss': 1.9281, 'grad_norm': 1.2919857501983643, 'learning_rate': 4.371715241280458e-06, 'epoch': 0.56}


                                                 
 28%|██▊       | 184/656 [00:47<02:31,  3.12it/s]

{'loss': 1.9226, 'grad_norm': 1.4312392473220825, 'learning_rate': 4.395604395604396e-06, 'epoch': 0.56}


                                                 
 28%|██▊       | 185/656 [00:47<02:38,  2.97it/s]

{'loss': 1.9021, 'grad_norm': 1.2465019226074219, 'learning_rate': 4.419493549928333e-06, 'epoch': 0.56}


                                                 
 28%|██▊       | 186/656 [00:47<02:38,  2.96it/s]

{'loss': 1.9197, 'grad_norm': 1.3093023300170898, 'learning_rate': 4.44338270425227e-06, 'epoch': 0.57}


                                                 
 29%|██▊       | 187/656 [00:48<02:31,  3.09it/s]

{'loss': 1.9287, 'grad_norm': 1.5526255369186401, 'learning_rate': 4.467271858576207e-06, 'epoch': 0.57}


                                                 
 29%|██▊       | 188/656 [00:48<02:16,  3.43it/s]

{'loss': 1.9221, 'grad_norm': 1.0712454319000244, 'learning_rate': 4.4911610129001435e-06, 'epoch': 0.57}


                                                 
 29%|██▉       | 189/656 [00:48<02:25,  3.22it/s]

{'loss': 1.9195, 'grad_norm': 1.1305423974990845, 'learning_rate': 4.51505016722408e-06, 'epoch': 0.58}


                                                 
 29%|██▉       | 190/656 [00:48<02:21,  3.28it/s]

{'loss': 1.9173, 'grad_norm': 1.0617060661315918, 'learning_rate': 4.538939321548018e-06, 'epoch': 0.58}


                                                 
 29%|██▉       | 191/656 [00:49<02:14,  3.45it/s]

{'loss': 1.9248, 'grad_norm': 1.1966618299484253, 'learning_rate': 4.562828475871954e-06, 'epoch': 0.58}


                                                 
 29%|██▉       | 192/656 [00:49<02:13,  3.48it/s]

{'loss': 1.9331, 'grad_norm': 1.2607051134109497, 'learning_rate': 4.586717630195891e-06, 'epoch': 0.59}


                                                 
 29%|██▉       | 193/656 [00:49<02:25,  3.18it/s]

{'loss': 1.9322, 'grad_norm': 0.8721822500228882, 'learning_rate': 4.610606784519828e-06, 'epoch': 0.59}


                                                 
 30%|██▉       | 194/656 [00:50<02:21,  3.27it/s]

{'loss': 1.941, 'grad_norm': 1.3756325244903564, 'learning_rate': 4.634495938843765e-06, 'epoch': 0.59}


                                                 
 30%|██▉       | 195/656 [00:50<02:14,  3.43it/s]

{'loss': 1.9198, 'grad_norm': 1.2611900568008423, 'learning_rate': 4.658385093167702e-06, 'epoch': 0.59}


                                                 
 30%|██▉       | 196/656 [00:50<02:12,  3.47it/s]

{'loss': 1.9287, 'grad_norm': 1.4201041460037231, 'learning_rate': 4.682274247491639e-06, 'epoch': 0.6}


                                                 
 30%|███       | 197/656 [00:51<02:21,  3.25it/s]

{'loss': 1.9474, 'grad_norm': 1.5858407020568848, 'learning_rate': 4.706163401815576e-06, 'epoch': 0.6}


                                                 
 30%|███       | 198/656 [00:51<02:10,  3.50it/s]

{'loss': 1.9123, 'grad_norm': 1.2482579946517944, 'learning_rate': 4.730052556139513e-06, 'epoch': 0.6}


                                                 
 30%|███       | 199/656 [00:51<02:04,  3.66it/s]

{'loss': 1.9064, 'grad_norm': 1.2868685722351074, 'learning_rate': 4.75394171046345e-06, 'epoch': 0.61}


                                                 
 30%|███       | 200/656 [00:51<01:58,  3.84it/s]

{'loss': 1.9415, 'grad_norm': 1.2838501930236816, 'learning_rate': 4.777830864787387e-06, 'epoch': 0.61}


                                                 
 31%|███       | 201/656 [00:51<01:53,  4.00it/s]

{'loss': 1.942, 'grad_norm': 1.047528862953186, 'learning_rate': 4.801720019111324e-06, 'epoch': 0.61}


                                                 
 31%|███       | 202/656 [00:52<01:47,  4.22it/s]

{'loss': 1.9541, 'grad_norm': 1.3843317031860352, 'learning_rate': 4.82560917343526e-06, 'epoch': 0.62}


                                                 
 31%|███       | 203/656 [00:52<01:50,  4.11it/s]

{'loss': 1.9407, 'grad_norm': 0.9387434124946594, 'learning_rate': 4.849498327759198e-06, 'epoch': 0.62}


                                                 
 31%|███       | 204/656 [00:52<01:58,  3.82it/s]

{'loss': 1.9146, 'grad_norm': 0.9291233420372009, 'learning_rate': 4.8733874820831345e-06, 'epoch': 0.62}


                                                 
 31%|███▏      | 205/656 [00:53<01:56,  3.89it/s]

{'loss': 1.8708, 'grad_norm': 1.5518772602081299, 'learning_rate': 4.897276636407071e-06, 'epoch': 0.62}


                                                 
 31%|███▏      | 206/656 [00:53<02:14,  3.34it/s]

{'loss': 1.9192, 'grad_norm': 1.2874265909194946, 'learning_rate': 4.921165790731008e-06, 'epoch': 0.63}


                                                 
 32%|███▏      | 207/656 [00:53<02:07,  3.52it/s]

{'loss': 1.9048, 'grad_norm': 1.1872837543487549, 'learning_rate': 4.945054945054945e-06, 'epoch': 0.63}


                                                 
 32%|███▏      | 208/656 [00:53<02:00,  3.72it/s]

{'loss': 1.9238, 'grad_norm': 1.4844869375228882, 'learning_rate': 4.968944099378882e-06, 'epoch': 0.63}


                                                 
 32%|███▏      | 209/656 [00:54<02:00,  3.72it/s]

{'loss': 1.9079, 'grad_norm': 1.2120192050933838, 'learning_rate': 4.99283325370282e-06, 'epoch': 0.64}


                                                 
 32%|███▏      | 210/656 [00:54<02:03,  3.60it/s]

{'loss': 1.9162, 'grad_norm': 1.6921266317367554, 'learning_rate': 5.016722408026756e-06, 'epoch': 0.64}


                                                 
 32%|███▏      | 211/656 [00:54<02:08,  3.47it/s]

{'loss': 1.9289, 'grad_norm': 1.299017310142517, 'learning_rate': 5.040611562350693e-06, 'epoch': 0.64}


                                                 
 32%|███▏      | 212/656 [00:55<02:01,  3.65it/s]

{'loss': 1.9489, 'grad_norm': 1.2391880750656128, 'learning_rate': 5.0645007166746296e-06, 'epoch': 0.65}


                                                 
 32%|███▏      | 213/656 [00:55<01:54,  3.86it/s]

{'loss': 1.9549, 'grad_norm': 1.4732544422149658, 'learning_rate': 5.0883898709985665e-06, 'epoch': 0.65}


                                                 
 33%|███▎      | 214/656 [00:55<02:02,  3.61it/s]

{'loss': 1.904, 'grad_norm': 1.1656931638717651, 'learning_rate': 5.112279025322503e-06, 'epoch': 0.65}


                                                 
 33%|███▎      | 215/656 [00:55<02:01,  3.63it/s]

{'loss': 1.924, 'grad_norm': 1.1389927864074707, 'learning_rate': 5.136168179646441e-06, 'epoch': 0.66}


                                                 
 33%|███▎      | 216/656 [00:56<02:04,  3.52it/s]

{'loss': 1.913, 'grad_norm': 1.2014682292938232, 'learning_rate': 5.160057333970378e-06, 'epoch': 0.66}


                                                 
 33%|███▎      | 217/656 [00:56<02:05,  3.50it/s]

{'loss': 1.9291, 'grad_norm': 1.3921414613723755, 'learning_rate': 5.183946488294315e-06, 'epoch': 0.66}


                                                 
 33%|███▎      | 218/656 [00:56<02:06,  3.45it/s]

{'loss': 1.9337, 'grad_norm': 1.4262713193893433, 'learning_rate': 5.207835642618252e-06, 'epoch': 0.66}


                                                 
 33%|███▎      | 219/656 [00:56<01:57,  3.71it/s]

{'loss': 1.9193, 'grad_norm': 1.0036174058914185, 'learning_rate': 5.231724796942189e-06, 'epoch': 0.67}


                                                 
 34%|███▎      | 220/656 [00:57<01:54,  3.81it/s]

{'loss': 1.9103, 'grad_norm': 0.9824476838111877, 'learning_rate': 5.2556139512661255e-06, 'epoch': 0.67}


                                                 
 34%|███▎      | 221/656 [00:57<01:54,  3.82it/s]

{'loss': 1.954, 'grad_norm': 1.4232792854309082, 'learning_rate': 5.279503105590062e-06, 'epoch': 0.67}


                                                 
 34%|███▍      | 222/656 [00:57<01:52,  3.87it/s]

{'loss': 1.9148, 'grad_norm': 1.5102627277374268, 'learning_rate': 5.303392259913999e-06, 'epoch': 0.68}


                                                 
 34%|███▍      | 223/656 [00:57<01:54,  3.77it/s]

{'loss': 1.9301, 'grad_norm': 0.9495670199394226, 'learning_rate': 5.327281414237936e-06, 'epoch': 0.68}


                                                 
 34%|███▍      | 224/656 [00:58<01:47,  4.03it/s]

{'loss': 1.9388, 'grad_norm': 1.7284023761749268, 'learning_rate': 5.351170568561873e-06, 'epoch': 0.68}


                                                 
 34%|███▍      | 225/656 [00:58<01:43,  4.16it/s]

{'loss': 1.9313, 'grad_norm': 1.500165343284607, 'learning_rate': 5.37505972288581e-06, 'epoch': 0.69}


                                                 
 34%|███▍      | 226/656 [00:58<01:44,  4.10it/s]

{'loss': 1.8813, 'grad_norm': 1.2897008657455444, 'learning_rate': 5.398948877209747e-06, 'epoch': 0.69}


                                                 
 35%|███▍      | 227/656 [00:58<01:39,  4.30it/s]

{'loss': 1.9319, 'grad_norm': 1.497800588607788, 'learning_rate': 5.4228380315336845e-06, 'epoch': 0.69}


                                                 
 35%|███▍      | 228/656 [00:59<01:35,  4.46it/s]

{'loss': 1.9255, 'grad_norm': 1.3567922115325928, 'learning_rate': 5.446727185857621e-06, 'epoch': 0.7}


                                                 
 35%|███▍      | 229/656 [00:59<01:38,  4.34it/s]

{'loss': 1.9191, 'grad_norm': 1.1664838790893555, 'learning_rate': 5.4706163401815574e-06, 'epoch': 0.7}


                                                 
 35%|███▌      | 230/656 [00:59<01:35,  4.47it/s]

{'loss': 1.9015, 'grad_norm': 1.1408054828643799, 'learning_rate': 5.494505494505494e-06, 'epoch': 0.7}


                                                 
 35%|███▌      | 231/656 [00:59<01:39,  4.27it/s]

{'loss': 1.9127, 'grad_norm': 1.115838646888733, 'learning_rate': 5.518394648829431e-06, 'epoch': 0.7}


                                                 
 35%|███▌      | 232/656 [01:00<01:44,  4.04it/s]

{'loss': 1.9143, 'grad_norm': 1.0943950414657593, 'learning_rate': 5.542283803153368e-06, 'epoch': 0.71}


                                                 
 36%|███▌      | 233/656 [01:00<01:53,  3.74it/s]

{'loss': 1.92, 'grad_norm': 1.0404095649719238, 'learning_rate': 5.566172957477306e-06, 'epoch': 0.71}


                                                 
 36%|███▌      | 234/656 [01:00<01:54,  3.69it/s]

{'loss': 1.8993, 'grad_norm': 1.0008469820022583, 'learning_rate': 5.590062111801243e-06, 'epoch': 0.71}


                                                 
                                                 
 36%|███▌      | 236/656 [01:01<01:51,  3.76it/s]

{'loss': 1.9109, 'grad_norm': 1.0051450729370117, 'learning_rate': 5.61395126612518e-06, 'epoch': 0.72}
{'loss': 1.9161, 'grad_norm': 1.1010197401046753, 'learning_rate': 5.6378404204491165e-06, 'epoch': 0.72}


                                                 
 36%|███▌      | 237/656 [01:01<02:00,  3.46it/s]

{'loss': 1.9002, 'grad_norm': 1.656787395477295, 'learning_rate': 5.661729574773053e-06, 'epoch': 0.72}


                                                 
 36%|███▋      | 238/656 [01:01<01:54,  3.65it/s]

{'loss': 1.9107, 'grad_norm': 1.3509949445724487, 'learning_rate': 5.68561872909699e-06, 'epoch': 0.73}


                                                 
 36%|███▋      | 239/656 [01:02<01:48,  3.85it/s]

{'loss': 1.8954, 'grad_norm': 1.2640950679779053, 'learning_rate': 5.709507883420927e-06, 'epoch': 0.73}


                                                 
 37%|███▋      | 240/656 [01:02<01:51,  3.74it/s]

{'loss': 1.9051, 'grad_norm': 1.368145227432251, 'learning_rate': 5.733397037744864e-06, 'epoch': 0.73}


                                                 
 37%|███▋      | 241/656 [01:02<01:45,  3.92it/s]

{'loss': 1.8963, 'grad_norm': 1.4032597541809082, 'learning_rate': 5.757286192068801e-06, 'epoch': 0.73}


                                                 
 37%|███▋      | 242/656 [01:02<01:42,  4.02it/s]

{'loss': 1.9177, 'grad_norm': 1.4145805835723877, 'learning_rate': 5.781175346392738e-06, 'epoch': 0.74}


                                                 
 37%|███▋      | 243/656 [01:03<01:45,  3.90it/s]

{'loss': 1.9015, 'grad_norm': 1.3909105062484741, 'learning_rate': 5.805064500716675e-06, 'epoch': 0.74}


                                                 
 37%|███▋      | 244/656 [01:03<01:42,  4.02it/s]

{'loss': 1.9121, 'grad_norm': 1.4290884733200073, 'learning_rate': 5.8289536550406116e-06, 'epoch': 0.74}


                                                 
 37%|███▋      | 245/656 [01:03<01:42,  4.03it/s]

{'loss': 1.907, 'grad_norm': 1.0041701793670654, 'learning_rate': 5.852842809364549e-06, 'epoch': 0.75}


                                                 
 38%|███▊      | 246/656 [01:03<01:40,  4.08it/s]

{'loss': 1.9459, 'grad_norm': 1.597586750984192, 'learning_rate': 5.876731963688486e-06, 'epoch': 0.75}


                                                 
 38%|███▊      | 247/656 [01:04<01:47,  3.80it/s]

{'loss': 1.9351, 'grad_norm': 1.672958254814148, 'learning_rate': 5.900621118012423e-06, 'epoch': 0.75}


                                                 
 38%|███▊      | 248/656 [01:04<01:43,  3.94it/s]

{'loss': 1.9043, 'grad_norm': 0.8778584599494934, 'learning_rate': 5.924510272336359e-06, 'epoch': 0.76}


                                                 
 38%|███▊      | 249/656 [01:04<01:43,  3.95it/s]

{'loss': 1.8978, 'grad_norm': 1.2234764099121094, 'learning_rate': 5.948399426660296e-06, 'epoch': 0.76}


                                                 
 38%|███▊      | 250/656 [01:04<01:48,  3.75it/s]

{'loss': 1.9085, 'grad_norm': 1.2253612279891968, 'learning_rate': 5.972288580984233e-06, 'epoch': 0.76}


                                                 
 38%|███▊      | 251/656 [01:05<01:51,  3.64it/s]

{'loss': 1.921, 'grad_norm': 1.613627314567566, 'learning_rate': 5.996177735308171e-06, 'epoch': 0.77}


                                                 
 38%|███▊      | 252/656 [01:05<01:52,  3.61it/s]

{'loss': 1.9161, 'grad_norm': 1.1859025955200195, 'learning_rate': 6.0200668896321075e-06, 'epoch': 0.77}


                                                 
 39%|███▊      | 253/656 [01:05<01:46,  3.78it/s]

{'loss': 1.8951, 'grad_norm': 1.1185435056686401, 'learning_rate': 6.043956043956044e-06, 'epoch': 0.77}


                                                 
 39%|███▊      | 254/656 [01:05<01:52,  3.56it/s]

{'loss': 1.9277, 'grad_norm': 1.509916067123413, 'learning_rate': 6.067845198279981e-06, 'epoch': 0.77}


                                                 
 39%|███▉      | 255/656 [01:06<01:48,  3.70it/s]

{'loss': 1.9042, 'grad_norm': 1.0864523649215698, 'learning_rate': 6.091734352603918e-06, 'epoch': 0.78}


                                                 
 39%|███▉      | 256/656 [01:06<01:44,  3.83it/s]

{'loss': 1.9163, 'grad_norm': 1.5869355201721191, 'learning_rate': 6.115623506927855e-06, 'epoch': 0.78}


                                                 
 39%|███▉      | 257/656 [01:06<01:42,  3.88it/s]

{'loss': 1.8977, 'grad_norm': 1.1523221731185913, 'learning_rate': 6.139512661251792e-06, 'epoch': 0.78}


                                                 
 39%|███▉      | 258/656 [01:06<01:41,  3.93it/s]

{'loss': 1.9172, 'grad_norm': 1.2252787351608276, 'learning_rate': 6.163401815575729e-06, 'epoch': 0.79}


                                                 
 39%|███▉      | 259/656 [01:07<01:36,  4.13it/s]

{'loss': 1.9156, 'grad_norm': 1.7348312139511108, 'learning_rate': 6.187290969899666e-06, 'epoch': 0.79}


                                                 
 40%|███▉      | 260/656 [01:07<01:38,  4.03it/s]

{'loss': 1.8943, 'grad_norm': 1.0874125957489014, 'learning_rate': 6.2111801242236025e-06, 'epoch': 0.79}


                                                 
 40%|███▉      | 261/656 [01:07<01:41,  3.88it/s]

{'loss': 1.9296, 'grad_norm': 1.0228502750396729, 'learning_rate': 6.2350692785475394e-06, 'epoch': 0.8}


                                                 
 40%|███▉      | 262/656 [01:07<01:39,  3.95it/s]

{'loss': 1.9136, 'grad_norm': 1.4691728353500366, 'learning_rate': 6.258958432871477e-06, 'epoch': 0.8}


                                                 
 40%|████      | 263/656 [01:08<01:38,  3.98it/s]

{'loss': 1.9031, 'grad_norm': 1.098280429840088, 'learning_rate': 6.282847587195413e-06, 'epoch': 0.8}


                                                 
 40%|████      | 264/656 [01:08<01:42,  3.82it/s]

{'loss': 1.9022, 'grad_norm': 1.1527708768844604, 'learning_rate': 6.306736741519351e-06, 'epoch': 0.8}


                                                 
 40%|████      | 265/656 [01:08<01:42,  3.80it/s]

{'loss': 1.8882, 'grad_norm': 1.5864676237106323, 'learning_rate': 6.330625895843287e-06, 'epoch': 0.81}


                                                 
 41%|████      | 266/656 [01:08<01:37,  4.00it/s]

{'loss': 1.9111, 'grad_norm': 1.0798519849777222, 'learning_rate': 6.354515050167225e-06, 'epoch': 0.81}


                                                 
 41%|████      | 267/656 [01:09<01:41,  3.85it/s]

{'loss': 1.8974, 'grad_norm': 1.3075319528579712, 'learning_rate': 6.378404204491162e-06, 'epoch': 0.81}


                                                 
 41%|████      | 268/656 [01:09<01:41,  3.81it/s]

{'loss': 1.8943, 'grad_norm': 1.3225603103637695, 'learning_rate': 6.402293358815098e-06, 'epoch': 0.82}


                                                 
 41%|████      | 269/656 [01:09<01:45,  3.67it/s]

{'loss': 1.8964, 'grad_norm': 1.6128636598587036, 'learning_rate': 6.426182513139035e-06, 'epoch': 0.82}


                                                 
 41%|████      | 270/656 [01:10<01:42,  3.76it/s]

{'loss': 1.9197, 'grad_norm': 1.2604923248291016, 'learning_rate': 6.450071667462971e-06, 'epoch': 0.82}


                                                 
 41%|████▏     | 271/656 [01:10<01:38,  3.90it/s]

{'loss': 1.8918, 'grad_norm': 1.3335802555084229, 'learning_rate': 6.473960821786909e-06, 'epoch': 0.83}


                                                 
 41%|████▏     | 272/656 [01:10<01:53,  3.37it/s]

{'loss': 1.8991, 'grad_norm': 1.300837516784668, 'learning_rate': 6.497849976110847e-06, 'epoch': 0.83}


                                                 
 42%|████▏     | 273/656 [01:10<01:47,  3.56it/s]

{'loss': 1.8977, 'grad_norm': 1.1972206830978394, 'learning_rate': 6.521739130434783e-06, 'epoch': 0.83}


                                                 
 42%|████▏     | 274/656 [01:11<01:43,  3.70it/s]

{'loss': 1.8989, 'grad_norm': 0.8819725513458252, 'learning_rate': 6.545628284758721e-06, 'epoch': 0.84}


                                                 
 42%|████▏     | 275/656 [01:11<01:42,  3.73it/s]

{'loss': 1.9307, 'grad_norm': 1.227866530418396, 'learning_rate': 6.569517439082657e-06, 'epoch': 0.84}


                                                 
 42%|████▏     | 276/656 [01:11<01:40,  3.80it/s]

{'loss': 1.9006, 'grad_norm': 1.0331202745437622, 'learning_rate': 6.5934065934065935e-06, 'epoch': 0.84}


                                                 
 42%|████▏     | 277/656 [01:11<01:39,  3.81it/s]

{'loss': 1.9106, 'grad_norm': 1.2013297080993652, 'learning_rate': 6.61729574773053e-06, 'epoch': 0.84}


                                                 
 42%|████▏     | 278/656 [01:12<01:46,  3.54it/s]

{'loss': 1.8909, 'grad_norm': 1.224323034286499, 'learning_rate': 6.641184902054467e-06, 'epoch': 0.85}


                                                 
 43%|████▎     | 279/656 [01:12<01:39,  3.79it/s]

{'loss': 1.8851, 'grad_norm': 0.9094113111495972, 'learning_rate': 6.665074056378405e-06, 'epoch': 0.85}


                                                 
 43%|████▎     | 280/656 [01:12<01:43,  3.64it/s]

{'loss': 1.9084, 'grad_norm': 0.9005710482597351, 'learning_rate': 6.688963210702341e-06, 'epoch': 0.85}


                                                 
 43%|████▎     | 281/656 [01:13<01:45,  3.57it/s]

{'loss': 1.9089, 'grad_norm': 1.1376725435256958, 'learning_rate': 6.712852365026279e-06, 'epoch': 0.86}


                                                 
 43%|████▎     | 282/656 [01:13<01:40,  3.71it/s]

{'loss': 1.8845, 'grad_norm': 1.0836610794067383, 'learning_rate': 6.736741519350215e-06, 'epoch': 0.86}


                                                 
 43%|████▎     | 283/656 [01:13<01:35,  3.92it/s]

{'loss': 1.9222, 'grad_norm': 1.0812495946884155, 'learning_rate': 6.7606306736741526e-06, 'epoch': 0.86}


                                                 
 43%|████▎     | 284/656 [01:13<01:34,  3.92it/s]

{'loss': 1.9028, 'grad_norm': 1.619211196899414, 'learning_rate': 6.7845198279980895e-06, 'epoch': 0.87}


                                                 
                                                 
 44%|████▎     | 286/656 [01:14<01:28,  4.18it/s]

{'loss': 1.9099, 'grad_norm': 1.35149347782135, 'learning_rate': 6.808408982322026e-06, 'epoch': 0.87}
{'loss': 1.8876, 'grad_norm': 1.1277650594711304, 'learning_rate': 6.832298136645963e-06, 'epoch': 0.87}


                                                 
 44%|████▍     | 287/656 [01:14<01:32,  3.99it/s]

{'loss': 1.9077, 'grad_norm': 0.9314396977424622, 'learning_rate': 6.856187290969899e-06, 'epoch': 0.88}


                                                 
 44%|████▍     | 288/656 [01:14<01:30,  4.06it/s]

{'loss': 1.9039, 'grad_norm': 1.2078371047973633, 'learning_rate': 6.880076445293837e-06, 'epoch': 0.88}


                                                 
 44%|████▍     | 289/656 [01:15<01:28,  4.15it/s]

{'loss': 1.8933, 'grad_norm': 1.2307379245758057, 'learning_rate': 6.903965599617773e-06, 'epoch': 0.88}


                                                 
 44%|████▍     | 290/656 [01:15<01:38,  3.70it/s]

{'loss': 1.9025, 'grad_norm': 1.1304646730422974, 'learning_rate': 6.927854753941711e-06, 'epoch': 0.88}


                                                 
 44%|████▍     | 291/656 [01:15<01:34,  3.85it/s]

{'loss': 1.9167, 'grad_norm': 1.4703444242477417, 'learning_rate': 6.9517439082656485e-06, 'epoch': 0.89}


                                                 
 45%|████▍     | 292/656 [01:15<01:30,  4.04it/s]

{'loss': 1.8883, 'grad_norm': 1.134706735610962, 'learning_rate': 6.9756330625895845e-06, 'epoch': 0.89}


                                                 
 45%|████▍     | 293/656 [01:16<01:29,  4.06it/s]

{'loss': 1.9027, 'grad_norm': 1.0613820552825928, 'learning_rate': 6.999522216913522e-06, 'epoch': 0.89}


                                                 
 45%|████▍     | 294/656 [01:16<01:34,  3.85it/s]

{'loss': 1.906, 'grad_norm': 1.1165460348129272, 'learning_rate': 7.023411371237458e-06, 'epoch': 0.9}


                                                 
 45%|████▍     | 295/656 [01:16<01:34,  3.81it/s]

{'loss': 1.9023, 'grad_norm': 1.141669750213623, 'learning_rate': 7.047300525561395e-06, 'epoch': 0.9}


                                                 
 45%|████▌     | 296/656 [01:16<01:42,  3.52it/s]

{'loss': 1.8962, 'grad_norm': 1.2641278505325317, 'learning_rate': 7.071189679885333e-06, 'epoch': 0.9}


                                                 
 45%|████▌     | 297/656 [01:17<01:44,  3.42it/s]

{'loss': 1.9035, 'grad_norm': 1.043862223625183, 'learning_rate': 7.095078834209269e-06, 'epoch': 0.91}


                                                 
 45%|████▌     | 298/656 [01:17<01:43,  3.46it/s]

{'loss': 1.8715, 'grad_norm': 1.0253868103027344, 'learning_rate': 7.118967988533207e-06, 'epoch': 0.91}


                                                 
 46%|████▌     | 299/656 [01:17<01:36,  3.69it/s]

{'loss': 1.8985, 'grad_norm': 1.467592716217041, 'learning_rate': 7.142857142857143e-06, 'epoch': 0.91}


                                                 
 46%|████▌     | 300/656 [01:18<01:35,  3.73it/s]

{'loss': 1.8992, 'grad_norm': 1.5397772789001465, 'learning_rate': 7.1667462971810804e-06, 'epoch': 0.91}


                                                 
 46%|████▌     | 301/656 [01:18<01:30,  3.91it/s]

{'loss': 1.9269, 'grad_norm': 1.8041237592697144, 'learning_rate': 7.1906354515050165e-06, 'epoch': 0.92}


                                                 
 46%|████▌     | 302/656 [01:18<01:27,  4.03it/s]

{'loss': 1.8817, 'grad_norm': 0.9240999817848206, 'learning_rate': 7.214524605828954e-06, 'epoch': 0.92}


                                                 
 46%|████▌     | 303/656 [01:18<01:24,  4.18it/s]

{'loss': 1.8859, 'grad_norm': 1.1406522989273071, 'learning_rate': 7.238413760152891e-06, 'epoch': 0.92}


                                                 
 46%|████▋     | 304/656 [01:18<01:25,  4.12it/s]

{'loss': 1.9035, 'grad_norm': 1.299523115158081, 'learning_rate': 7.262302914476828e-06, 'epoch': 0.93}


                                                 
 46%|████▋     | 305/656 [01:19<01:25,  4.11it/s]

{'loss': 1.8797, 'grad_norm': 1.4789893627166748, 'learning_rate': 7.286192068800765e-06, 'epoch': 0.93}


                                                 
 47%|████▋     | 306/656 [01:19<01:29,  3.92it/s]

{'loss': 1.9057, 'grad_norm': 1.3155921697616577, 'learning_rate': 7.310081223124701e-06, 'epoch': 0.93}


                                                 
 47%|████▋     | 307/656 [01:19<01:28,  3.93it/s]

{'loss': 1.9079, 'grad_norm': 1.3001291751861572, 'learning_rate': 7.333970377448639e-06, 'epoch': 0.94}


                                                 
 47%|████▋     | 308/656 [01:19<01:24,  4.11it/s]

{'loss': 1.8898, 'grad_norm': 1.0544854402542114, 'learning_rate': 7.357859531772576e-06, 'epoch': 0.94}


                                                 
 47%|████▋     | 309/656 [01:20<01:25,  4.07it/s]

{'loss': 1.8904, 'grad_norm': 0.935949981212616, 'learning_rate': 7.381748686096512e-06, 'epoch': 0.94}


                                                 
 47%|████▋     | 310/656 [01:20<01:23,  4.12it/s]

{'loss': 1.9053, 'grad_norm': 1.358142614364624, 'learning_rate': 7.40563784042045e-06, 'epoch': 0.95}


                                                 
 47%|████▋     | 311/656 [01:20<01:34,  3.66it/s]

{'loss': 1.8929, 'grad_norm': 1.4416399002075195, 'learning_rate': 7.429526994744386e-06, 'epoch': 0.95}


                                                 
 48%|████▊     | 312/656 [01:21<01:32,  3.73it/s]

{'loss': 1.8878, 'grad_norm': 1.169231653213501, 'learning_rate': 7.453416149068324e-06, 'epoch': 0.95}


                                                 
 48%|████▊     | 313/656 [01:21<01:37,  3.50it/s]

{'loss': 1.8998, 'grad_norm': 1.0889328718185425, 'learning_rate': 7.47730530339226e-06, 'epoch': 0.95}


                                                 
 48%|████▊     | 314/656 [01:21<01:32,  3.71it/s]

{'loss': 1.8939, 'grad_norm': 1.1899960041046143, 'learning_rate': 7.501194457716197e-06, 'epoch': 0.96}


                                                 
 48%|████▊     | 315/656 [01:21<01:28,  3.85it/s]

{'loss': 1.8809, 'grad_norm': 1.1974849700927734, 'learning_rate': 7.5250836120401346e-06, 'epoch': 0.96}


                                                 
 48%|████▊     | 316/656 [01:22<01:28,  3.85it/s]

{'loss': 1.8727, 'grad_norm': 1.3724339008331299, 'learning_rate': 7.548972766364071e-06, 'epoch': 0.96}


                                                 
 48%|████▊     | 317/656 [01:22<01:27,  3.86it/s]

{'loss': 1.9239, 'grad_norm': 1.4596935510635376, 'learning_rate': 7.572861920688008e-06, 'epoch': 0.97}


                                                 
 48%|████▊     | 318/656 [01:22<01:25,  3.97it/s]

{'loss': 1.8847, 'grad_norm': 1.342919111251831, 'learning_rate': 7.596751075011944e-06, 'epoch': 0.97}


                                                 
 49%|████▊     | 319/656 [01:22<01:31,  3.66it/s]

{'loss': 1.8972, 'grad_norm': 1.6012307405471802, 'learning_rate': 7.620640229335882e-06, 'epoch': 0.97}


                                                 
 49%|████▉     | 320/656 [01:23<01:26,  3.87it/s]

{'loss': 1.8777, 'grad_norm': 1.24078369140625, 'learning_rate': 7.64452938365982e-06, 'epoch': 0.98}


                                                 
 49%|████▉     | 321/656 [01:23<01:29,  3.74it/s]

{'loss': 1.8874, 'grad_norm': 1.399096131324768, 'learning_rate': 7.668418537983756e-06, 'epoch': 0.98}


                                                 
 49%|████▉     | 322/656 [01:23<01:32,  3.63it/s]

{'loss': 1.8537, 'grad_norm': 1.3123493194580078, 'learning_rate': 7.692307692307694e-06, 'epoch': 0.98}


                                                 
 49%|████▉     | 323/656 [01:23<01:27,  3.80it/s]

{'loss': 1.8779, 'grad_norm': 1.4725557565689087, 'learning_rate': 7.71619684663163e-06, 'epoch': 0.98}


                                                 
 49%|████▉     | 324/656 [01:24<01:34,  3.51it/s]

{'loss': 1.8577, 'grad_norm': 1.4492290019989014, 'learning_rate': 7.740086000955567e-06, 'epoch': 0.99}


                                                 
 50%|████▉     | 325/656 [01:24<01:31,  3.63it/s]

{'loss': 1.8892, 'grad_norm': 1.416987419128418, 'learning_rate': 7.763975155279503e-06, 'epoch': 0.99}


                                                 
                                                 
 50%|████▉     | 327/656 [01:24<01:17,  4.23it/s]

{'loss': 1.8985, 'grad_norm': 1.2460404634475708, 'learning_rate': 7.787864309603441e-06, 'epoch': 0.99}
{'loss': 1.8711, 'grad_norm': 1.1657679080963135, 'learning_rate': 7.811753463927377e-06, 'epoch': 1.0}


                                                 
 50%|█████     | 328/656 [01:24<01:17,  4.23it/s]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance, token_label, tokens. If utterance, token_label, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'loss': 1.871, 'grad_norm': 3.6868789196014404, 'learning_rate': 7.835642618251313e-06, 'epoch': 1.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 
[A                                    

 50%|█████     | 328/656 [01:48<01:17,  4.23it/s]
[A
[ASaving model checkpoint to ./snips_clf/results/checkpoint-328
Configuration saved in ./snips_clf/results/checkpoint-328/config.json
Model weights saved in ./snips_clf/results/checkpoint-328/model.safetensors


{'eval_loss': 1.8718997240066528, 'eval_accuracy': 0.5716469239587314, 'eval_runtime': 23.4365, 'eval_samples_per_second': 111.663, 'eval_steps_per_second': 3.499, 'epoch': 1.0}


                                                 
 50%|█████     | 329/656 [01:48<30:47,  5.65s/it]

{'loss': 1.8933, 'grad_norm': 1.1968863010406494, 'learning_rate': 7.859531772575251e-06, 'epoch': 1.0}


                                                 
 50%|█████     | 330/656 [01:49<24:14,  4.46s/it]

{'loss': 1.8795, 'grad_norm': 1.6204378604888916, 'learning_rate': 7.883420926899187e-06, 'epoch': 1.01}


                                                 
 50%|█████     | 331/656 [01:50<18:35,  3.43s/it]

{'loss': 1.8689, 'grad_norm': 1.0692042112350464, 'learning_rate': 7.907310081223125e-06, 'epoch': 1.01}


                                                 
 51%|█████     | 332/656 [01:50<14:21,  2.66s/it]

{'loss': 1.8982, 'grad_norm': 1.2600176334381104, 'learning_rate': 7.931199235547062e-06, 'epoch': 1.01}


                                                 
 51%|█████     | 333/656 [01:51<10:44,  2.00s/it]

{'loss': 1.8636, 'grad_norm': 1.3415292501449585, 'learning_rate': 7.955088389870998e-06, 'epoch': 1.02}


                                                 
 51%|█████     | 334/656 [01:51<08:13,  1.53s/it]

{'loss': 1.8661, 'grad_norm': 1.1943787336349487, 'learning_rate': 7.978977544194936e-06, 'epoch': 1.02}


                                                 
 51%|█████     | 335/656 [01:51<06:19,  1.18s/it]

{'loss': 1.8754, 'grad_norm': 1.2816264629364014, 'learning_rate': 8.002866698518872e-06, 'epoch': 1.02}


                                                 
 51%|█████     | 336/656 [01:52<04:59,  1.07it/s]

{'loss': 1.9042, 'grad_norm': 1.3345345258712769, 'learning_rate': 8.02675585284281e-06, 'epoch': 1.02}


                                                 
 51%|█████▏    | 337/656 [01:52<04:10,  1.27it/s]

{'loss': 1.8739, 'grad_norm': 1.0970360040664673, 'learning_rate': 8.050645007166746e-06, 'epoch': 1.03}


                                                 
 52%|█████▏    | 338/656 [01:53<03:39,  1.45it/s]

{'loss': 1.8671, 'grad_norm': 1.298018455505371, 'learning_rate': 8.074534161490684e-06, 'epoch': 1.03}


                                                 
 52%|█████▏    | 339/656 [01:53<03:12,  1.65it/s]

{'loss': 1.8803, 'grad_norm': 1.4232008457183838, 'learning_rate': 8.098423315814621e-06, 'epoch': 1.03}


                                                 
 52%|█████▏    | 340/656 [01:53<02:44,  1.92it/s]

{'loss': 1.8681, 'grad_norm': 1.2473560571670532, 'learning_rate': 8.122312470138558e-06, 'epoch': 1.04}


                                                 
 52%|█████▏    | 341/656 [01:54<02:37,  2.00it/s]

{'loss': 1.8543, 'grad_norm': 1.0075773000717163, 'learning_rate': 8.146201624462495e-06, 'epoch': 1.04}


                                                 
 52%|█████▏    | 342/656 [01:54<02:22,  2.20it/s]

{'loss': 1.8866, 'grad_norm': 1.3993817567825317, 'learning_rate': 8.170090778786431e-06, 'epoch': 1.04}


                                                 
 52%|█████▏    | 343/656 [01:54<02:16,  2.30it/s]

{'loss': 1.8792, 'grad_norm': 1.2334203720092773, 'learning_rate': 8.193979933110369e-06, 'epoch': 1.05}


                                                 
 52%|█████▏    | 344/656 [01:55<02:06,  2.46it/s]

{'loss': 1.8733, 'grad_norm': 1.39949369430542, 'learning_rate': 8.217869087434307e-06, 'epoch': 1.05}


                                                 
 53%|█████▎    | 345/656 [01:55<02:00,  2.57it/s]

{'loss': 1.8758, 'grad_norm': 1.3679122924804688, 'learning_rate': 8.241758241758243e-06, 'epoch': 1.05}


                                                 
 53%|█████▎    | 346/656 [01:55<01:53,  2.72it/s]

{'loss': 1.889, 'grad_norm': 1.7913132905960083, 'learning_rate': 8.265647396082179e-06, 'epoch': 1.05}


                                                 
 53%|█████▎    | 347/656 [01:56<01:47,  2.87it/s]

{'loss': 1.8968, 'grad_norm': 1.5130162239074707, 'learning_rate': 8.289536550406115e-06, 'epoch': 1.06}


                                                 
 53%|█████▎    | 348/656 [01:56<01:43,  2.96it/s]

{'loss': 1.8551, 'grad_norm': 1.5236172676086426, 'learning_rate': 8.313425704730053e-06, 'epoch': 1.06}


                                                 
 53%|█████▎    | 349/656 [01:56<01:49,  2.81it/s]

{'loss': 1.8637, 'grad_norm': 1.362966537475586, 'learning_rate': 8.337314859053989e-06, 'epoch': 1.06}


                                                 
 53%|█████▎    | 350/656 [01:57<01:49,  2.80it/s]

{'loss': 1.8653, 'grad_norm': 1.0868804454803467, 'learning_rate': 8.361204013377926e-06, 'epoch': 1.07}


                                                 
 54%|█████▎    | 351/656 [01:57<01:51,  2.75it/s]

{'loss': 1.8606, 'grad_norm': 0.9177486896514893, 'learning_rate': 8.385093167701864e-06, 'epoch': 1.07}


                                                 
 54%|█████▎    | 352/656 [01:58<01:50,  2.74it/s]

{'loss': 1.8671, 'grad_norm': 0.925316333770752, 'learning_rate': 8.4089823220258e-06, 'epoch': 1.07}


                                                 
 54%|█████▍    | 353/656 [01:58<01:55,  2.62it/s]

{'loss': 1.8761, 'grad_norm': 1.180679202079773, 'learning_rate': 8.432871476349738e-06, 'epoch': 1.08}


                                                 
 54%|█████▍    | 354/656 [01:58<01:58,  2.55it/s]

{'loss': 1.8866, 'grad_norm': 1.8532475233078003, 'learning_rate': 8.456760630673674e-06, 'epoch': 1.08}


                                                 
 54%|█████▍    | 355/656 [01:59<01:51,  2.69it/s]

{'loss': 1.8504, 'grad_norm': 1.3990981578826904, 'learning_rate': 8.480649784997612e-06, 'epoch': 1.08}


                                                 
 54%|█████▍    | 356/656 [01:59<01:58,  2.53it/s]

{'loss': 1.8761, 'grad_norm': 1.8453514575958252, 'learning_rate': 8.50453893932155e-06, 'epoch': 1.09}


                                                 
 54%|█████▍    | 357/656 [02:00<02:00,  2.48it/s]

{'loss': 1.8702, 'grad_norm': 1.3321044445037842, 'learning_rate': 8.528428093645485e-06, 'epoch': 1.09}


                                                 
 55%|█████▍    | 358/656 [02:00<01:54,  2.60it/s]

{'loss': 1.8691, 'grad_norm': 1.5233900547027588, 'learning_rate': 8.552317247969423e-06, 'epoch': 1.09}


                                                 
 55%|█████▍    | 359/656 [02:00<01:52,  2.63it/s]

{'loss': 1.8585, 'grad_norm': 1.732568621635437, 'learning_rate': 8.576206402293359e-06, 'epoch': 1.09}


                                                 
 55%|█████▍    | 360/656 [02:01<01:46,  2.79it/s]

{'loss': 1.8832, 'grad_norm': 1.2725389003753662, 'learning_rate': 8.600095556617297e-06, 'epoch': 1.1}


                                                 
 55%|█████▌    | 361/656 [02:01<01:54,  2.58it/s]

{'loss': 1.8296, 'grad_norm': 1.7115200757980347, 'learning_rate': 8.623984710941233e-06, 'epoch': 1.1}


                                                 
 55%|█████▌    | 362/656 [02:02<02:00,  2.45it/s]

{'loss': 1.8609, 'grad_norm': 1.414594054222107, 'learning_rate': 8.64787386526517e-06, 'epoch': 1.1}


                                                 
 55%|█████▌    | 363/656 [02:02<01:58,  2.46it/s]

{'loss': 1.8898, 'grad_norm': 1.5083736181259155, 'learning_rate': 8.671763019589108e-06, 'epoch': 1.11}


                                                 
 55%|█████▌    | 364/656 [02:02<01:44,  2.78it/s]

{'loss': 1.8611, 'grad_norm': 1.4089640378952026, 'learning_rate': 8.695652173913044e-06, 'epoch': 1.11}


                                                 
 56%|█████▌    | 365/656 [02:02<01:31,  3.19it/s]

{'loss': 1.8931, 'grad_norm': 1.1893419027328491, 'learning_rate': 8.71954132823698e-06, 'epoch': 1.11}


                                                 
 56%|█████▌    | 366/656 [02:03<01:27,  3.31it/s]

{'loss': 1.8739, 'grad_norm': 1.3558754920959473, 'learning_rate': 8.743430482560916e-06, 'epoch': 1.12}


                                                 
 56%|█████▌    | 367/656 [02:03<01:23,  3.45it/s]

{'loss': 1.8656, 'grad_norm': 1.1675869226455688, 'learning_rate': 8.767319636884854e-06, 'epoch': 1.12}


                                                 
 56%|█████▌    | 368/656 [02:03<01:25,  3.36it/s]

{'loss': 1.8391, 'grad_norm': 1.211789608001709, 'learning_rate': 8.791208791208792e-06, 'epoch': 1.12}


                                                 
 56%|█████▋    | 369/656 [02:04<01:29,  3.20it/s]

{'loss': 1.8611, 'grad_norm': 1.1782058477401733, 'learning_rate': 8.815097945532728e-06, 'epoch': 1.12}


                                                 
 56%|█████▋    | 370/656 [02:04<01:26,  3.29it/s]

{'loss': 1.8577, 'grad_norm': 1.935887098312378, 'learning_rate': 8.838987099856666e-06, 'epoch': 1.13}


                                                 
 57%|█████▋    | 371/656 [02:04<01:24,  3.36it/s]

{'loss': 1.8795, 'grad_norm': 1.2943302392959595, 'learning_rate': 8.862876254180602e-06, 'epoch': 1.13}


                                                 
 57%|█████▋    | 372/656 [02:04<01:20,  3.52it/s]

{'loss': 1.8695, 'grad_norm': 0.9548373818397522, 'learning_rate': 8.88676540850454e-06, 'epoch': 1.13}


                                                 
 57%|█████▋    | 373/656 [02:05<01:15,  3.73it/s]

{'loss': 1.8595, 'grad_norm': 1.0801440477371216, 'learning_rate': 8.910654562828476e-06, 'epoch': 1.14}


                                                 
 57%|█████▋    | 374/656 [02:05<01:19,  3.55it/s]

{'loss': 1.8636, 'grad_norm': 1.2209854125976562, 'learning_rate': 8.934543717152413e-06, 'epoch': 1.14}


                                                 
                                                 
 57%|█████▋    | 376/656 [02:05<01:09,  4.01it/s]

{'loss': 1.8718, 'grad_norm': 1.1314984560012817, 'learning_rate': 8.958432871476351e-06, 'epoch': 1.14}
{'loss': 1.8561, 'grad_norm': 1.2546730041503906, 'learning_rate': 8.982322025800287e-06, 'epoch': 1.15}


                                                 
 57%|█████▋    | 377/656 [02:06<01:13,  3.80it/s]

{'loss': 1.8759, 'grad_norm': 1.5453096628189087, 'learning_rate': 9.006211180124225e-06, 'epoch': 1.15}


                                                 
 58%|█████▊    | 378/656 [02:06<01:11,  3.91it/s]

{'loss': 1.8453, 'grad_norm': 1.4542932510375977, 'learning_rate': 9.03010033444816e-06, 'epoch': 1.15}


                                                 
 58%|█████▊    | 379/656 [02:06<01:10,  3.94it/s]

{'loss': 1.8587, 'grad_norm': 1.7298892736434937, 'learning_rate': 9.053989488772099e-06, 'epoch': 1.16}


                                                 
 58%|█████▊    | 380/656 [02:06<01:08,  4.00it/s]

{'loss': 1.8748, 'grad_norm': 1.283676028251648, 'learning_rate': 9.077878643096036e-06, 'epoch': 1.16}


                                                 
 58%|█████▊    | 381/656 [02:07<01:06,  4.11it/s]

{'loss': 1.8299, 'grad_norm': 0.9670916795730591, 'learning_rate': 9.101767797419972e-06, 'epoch': 1.16}


                                                 
 58%|█████▊    | 383/656 [02:07<01:02,  4.38it/s]

{'loss': 1.8568, 'grad_norm': 1.1351183652877808, 'learning_rate': 9.125656951743908e-06, 'epoch': 1.16}


                                                 
 58%|█████▊    | 383/656 [02:07<01:02,  4.38it/s]

{'loss': 1.8526, 'grad_norm': 1.1364320516586304, 'learning_rate': 9.149546106067846e-06, 'epoch': 1.17}


                                                 
 59%|█████▊    | 384/656 [02:07<01:05,  4.13it/s]

{'loss': 1.8535, 'grad_norm': 1.2446231842041016, 'learning_rate': 9.173435260391782e-06, 'epoch': 1.17}


                                                 
 59%|█████▊    | 385/656 [02:08<01:07,  4.01it/s]

{'loss': 1.8719, 'grad_norm': 1.6070733070373535, 'learning_rate': 9.197324414715718e-06, 'epoch': 1.17}


                                                 
 59%|█████▉    | 386/656 [02:08<01:06,  4.05it/s]

{'loss': 1.8533, 'grad_norm': 1.0992755889892578, 'learning_rate': 9.221213569039656e-06, 'epoch': 1.18}


                                                 
 59%|█████▉    | 387/656 [02:08<01:07,  4.00it/s]

{'loss': 1.8724, 'grad_norm': 1.5052549839019775, 'learning_rate': 9.245102723363594e-06, 'epoch': 1.18}


                                                 
 59%|█████▉    | 388/656 [02:08<01:10,  3.81it/s]

{'loss': 1.8356, 'grad_norm': 1.457261323928833, 'learning_rate': 9.26899187768753e-06, 'epoch': 1.18}


                                                 
 59%|█████▉    | 389/656 [02:09<01:12,  3.69it/s]

{'loss': 1.8699, 'grad_norm': 1.4638748168945312, 'learning_rate': 9.292881032011467e-06, 'epoch': 1.19}


                                                 
 59%|█████▉    | 390/656 [02:09<01:12,  3.68it/s]

{'loss': 1.8475, 'grad_norm': 1.142958641052246, 'learning_rate': 9.316770186335403e-06, 'epoch': 1.19}


                                                 
 60%|█████▉    | 391/656 [02:09<01:12,  3.67it/s]

{'loss': 1.8511, 'grad_norm': 1.378761649131775, 'learning_rate': 9.340659340659341e-06, 'epoch': 1.19}


                                                 
 60%|█████▉    | 392/656 [02:10<01:19,  3.30it/s]

{'loss': 1.8544, 'grad_norm': 1.276105523109436, 'learning_rate': 9.364548494983277e-06, 'epoch': 1.2}


                                                 
 60%|█████▉    | 393/656 [02:10<01:36,  2.72it/s]

{'loss': 1.835, 'grad_norm': 1.552270770072937, 'learning_rate': 9.388437649307215e-06, 'epoch': 1.2}


                                                 
 60%|██████    | 394/656 [02:11<01:33,  2.79it/s]

{'loss': 1.8492, 'grad_norm': 1.4765475988388062, 'learning_rate': 9.412326803631153e-06, 'epoch': 1.2}


                                                 
 60%|██████    | 395/656 [02:11<01:26,  3.03it/s]

{'loss': 1.794, 'grad_norm': 1.215286374092102, 'learning_rate': 9.436215957955089e-06, 'epoch': 1.2}


                                                 
 60%|██████    | 396/656 [02:11<01:22,  3.17it/s]

{'loss': 1.853, 'grad_norm': 1.2757549285888672, 'learning_rate': 9.460105112279026e-06, 'epoch': 1.21}


                                                 
 61%|██████    | 397/656 [02:11<01:23,  3.12it/s]

{'loss': 1.838, 'grad_norm': 1.7345733642578125, 'learning_rate': 9.483994266602962e-06, 'epoch': 1.21}


                                                 
 61%|██████    | 398/656 [02:12<01:22,  3.13it/s]

{'loss': 1.8452, 'grad_norm': 0.9812607765197754, 'learning_rate': 9.5078834209269e-06, 'epoch': 1.21}


                                                 
 61%|██████    | 399/656 [02:12<01:21,  3.15it/s]

{'loss': 1.8625, 'grad_norm': 1.2746931314468384, 'learning_rate': 9.531772575250838e-06, 'epoch': 1.22}


                                                 
 61%|██████    | 400/656 [02:12<01:21,  3.14it/s]

{'loss': 1.8592, 'grad_norm': 1.3254355192184448, 'learning_rate': 9.555661729574774e-06, 'epoch': 1.22}


                                                 
 61%|██████    | 401/656 [02:13<01:27,  2.92it/s]

{'loss': 1.8419, 'grad_norm': 1.2444380521774292, 'learning_rate': 9.57955088389871e-06, 'epoch': 1.22}


                                                 
 61%|██████▏   | 402/656 [02:13<01:46,  2.38it/s]

{'loss': 1.8263, 'grad_norm': 1.0907033681869507, 'learning_rate': 9.603440038222648e-06, 'epoch': 1.23}


                                                 
 61%|██████▏   | 403/656 [02:14<02:09,  1.95it/s]

{'loss': 1.8674, 'grad_norm': 1.9417155981063843, 'learning_rate': 9.627329192546584e-06, 'epoch': 1.23}


                                                 
 62%|██████▏   | 404/656 [02:14<01:53,  2.23it/s]

{'loss': 1.855, 'grad_norm': 1.2510452270507812, 'learning_rate': 9.65121834687052e-06, 'epoch': 1.23}


                                                 
 62%|██████▏   | 405/656 [02:15<01:44,  2.39it/s]

{'loss': 1.8385, 'grad_norm': 1.174344778060913, 'learning_rate': 9.675107501194458e-06, 'epoch': 1.23}


                                                 
 62%|██████▏   | 406/656 [02:15<01:36,  2.58it/s]

{'loss': 1.853, 'grad_norm': 1.326794981956482, 'learning_rate': 9.698996655518395e-06, 'epoch': 1.24}


                                                 
 62%|██████▏   | 407/656 [02:15<01:31,  2.73it/s]

{'loss': 1.8417, 'grad_norm': 1.2920759916305542, 'learning_rate': 9.722885809842331e-06, 'epoch': 1.24}


                                                 
 62%|██████▏   | 408/656 [02:16<01:30,  2.74it/s]

{'loss': 1.8419, 'grad_norm': 1.1901447772979736, 'learning_rate': 9.746774964166269e-06, 'epoch': 1.24}


                                                 
 62%|██████▏   | 409/656 [02:16<01:27,  2.81it/s]

{'loss': 1.842, 'grad_norm': 1.2399598360061646, 'learning_rate': 9.770664118490205e-06, 'epoch': 1.25}


                                                 
 62%|██████▎   | 410/656 [02:16<01:19,  3.08it/s]

{'loss': 1.8474, 'grad_norm': 0.9264739751815796, 'learning_rate': 9.794553272814143e-06, 'epoch': 1.25}


                                                 
 63%|██████▎   | 411/656 [02:17<01:12,  3.38it/s]

{'loss': 1.8346, 'grad_norm': 0.973659098148346, 'learning_rate': 9.81844242713808e-06, 'epoch': 1.25}


                                                 
 63%|██████▎   | 412/656 [02:17<01:12,  3.39it/s]

{'loss': 1.8284, 'grad_norm': 1.339959740638733, 'learning_rate': 9.842331581462017e-06, 'epoch': 1.26}


                                                 
 63%|██████▎   | 413/656 [02:17<01:08,  3.57it/s]

{'loss': 1.8426, 'grad_norm': 1.3250706195831299, 'learning_rate': 9.866220735785954e-06, 'epoch': 1.26}


                                                 
 63%|██████▎   | 414/656 [02:17<01:04,  3.74it/s]

{'loss': 1.8345, 'grad_norm': 1.4010896682739258, 'learning_rate': 9.89010989010989e-06, 'epoch': 1.26}


                                                 
 63%|██████▎   | 415/656 [02:18<01:04,  3.74it/s]

{'loss': 1.8265, 'grad_norm': 1.507908582687378, 'learning_rate': 9.913999044433828e-06, 'epoch': 1.27}


                                                 
 63%|██████▎   | 416/656 [02:18<00:59,  4.02it/s]

{'loss': 1.8486, 'grad_norm': 1.0088289976119995, 'learning_rate': 9.937888198757764e-06, 'epoch': 1.27}


                                                 
 64%|██████▎   | 417/656 [02:18<00:58,  4.06it/s]

{'loss': 1.8081, 'grad_norm': 1.0666592121124268, 'learning_rate': 9.961777353081702e-06, 'epoch': 1.27}


                                                 
 64%|██████▎   | 418/656 [02:18<00:58,  4.06it/s]

{'loss': 1.8218, 'grad_norm': 1.387603998184204, 'learning_rate': 9.98566650740564e-06, 'epoch': 1.27}


                                                 
 64%|██████▍   | 419/656 [02:19<01:04,  3.70it/s]

{'loss': 1.8347, 'grad_norm': 0.9919586181640625, 'learning_rate': 1.0009555661729576e-05, 'epoch': 1.28}


                                                 
 64%|██████▍   | 420/656 [02:19<01:01,  3.82it/s]

{'loss': 1.8708, 'grad_norm': 1.7281893491744995, 'learning_rate': 1.0033444816053512e-05, 'epoch': 1.28}


                                                 
 64%|██████▍   | 421/656 [02:19<01:01,  3.83it/s]

{'loss': 1.8174, 'grad_norm': 1.065382719039917, 'learning_rate': 1.005733397037745e-05, 'epoch': 1.28}


                                                 
 64%|██████▍   | 422/656 [02:19<00:58,  3.97it/s]

{'loss': 1.8747, 'grad_norm': 1.129810094833374, 'learning_rate': 1.0081223124701385e-05, 'epoch': 1.29}


                                                 
 64%|██████▍   | 423/656 [02:20<00:56,  4.11it/s]

{'loss': 1.8168, 'grad_norm': 1.1455650329589844, 'learning_rate': 1.0105112279025323e-05, 'epoch': 1.29}


                                                 
 65%|██████▍   | 424/656 [02:20<00:58,  3.98it/s]

{'loss': 1.7878, 'grad_norm': 1.6471638679504395, 'learning_rate': 1.0129001433349259e-05, 'epoch': 1.29}


                                                 
 65%|██████▍   | 425/656 [02:20<01:00,  3.83it/s]

{'loss': 1.8068, 'grad_norm': 1.3386716842651367, 'learning_rate': 1.0152890587673197e-05, 'epoch': 1.3}


                                                 
 65%|██████▍   | 426/656 [02:20<00:56,  4.07it/s]

{'loss': 1.8381, 'grad_norm': 1.5091438293457031, 'learning_rate': 1.0176779741997133e-05, 'epoch': 1.3}


                                                 
 65%|██████▌   | 427/656 [02:21<00:58,  3.93it/s]

{'loss': 1.8184, 'grad_norm': 1.402345895767212, 'learning_rate': 1.020066889632107e-05, 'epoch': 1.3}


                                                 
 65%|██████▌   | 428/656 [02:21<00:54,  4.16it/s]

{'loss': 1.8026, 'grad_norm': 1.585341215133667, 'learning_rate': 1.0224558050645007e-05, 'epoch': 1.3}


                                                 
 65%|██████▌   | 429/656 [02:21<00:53,  4.24it/s]

{'loss': 1.8154, 'grad_norm': 1.132807970046997, 'learning_rate': 1.0248447204968944e-05, 'epoch': 1.31}


                                                 
 66%|██████▌   | 430/656 [02:21<00:54,  4.15it/s]

{'loss': 1.8336, 'grad_norm': 0.9934921860694885, 'learning_rate': 1.0272336359292882e-05, 'epoch': 1.31}


                                                 
 66%|██████▌   | 431/656 [02:22<00:56,  4.00it/s]

{'loss': 1.8198, 'grad_norm': 1.3301879167556763, 'learning_rate': 1.0296225513616818e-05, 'epoch': 1.31}


                                                 
 66%|██████▌   | 432/656 [02:22<00:56,  3.98it/s]

{'loss': 1.844, 'grad_norm': 1.1337746381759644, 'learning_rate': 1.0320114667940756e-05, 'epoch': 1.32}


                                                 
 66%|██████▌   | 433/656 [02:22<00:53,  4.17it/s]

{'loss': 1.8543, 'grad_norm': 1.0960792303085327, 'learning_rate': 1.0344003822264692e-05, 'epoch': 1.32}


                                                 
 66%|██████▌   | 434/656 [02:22<00:55,  3.98it/s]

{'loss': 1.7905, 'grad_norm': 1.2293621301651, 'learning_rate': 1.036789297658863e-05, 'epoch': 1.32}


                                                 
 66%|██████▋   | 435/656 [02:23<00:54,  4.02it/s]

{'loss': 1.8306, 'grad_norm': 1.0059289932250977, 'learning_rate': 1.0391782130912567e-05, 'epoch': 1.33}


                                                 
 66%|██████▋   | 436/656 [02:23<00:59,  3.69it/s]

{'loss': 1.8233, 'grad_norm': 1.3440439701080322, 'learning_rate': 1.0415671285236503e-05, 'epoch': 1.33}


                                                 
 67%|██████▋   | 437/656 [02:23<00:59,  3.66it/s]

{'loss': 1.7934, 'grad_norm': 1.216903805732727, 'learning_rate': 1.0439560439560441e-05, 'epoch': 1.33}


                                                 
 67%|██████▋   | 438/656 [02:23<00:57,  3.77it/s]

{'loss': 1.8196, 'grad_norm': 1.136316180229187, 'learning_rate': 1.0463449593884377e-05, 'epoch': 1.34}


                                                 
 67%|██████▋   | 439/656 [02:24<00:58,  3.73it/s]

{'loss': 1.8178, 'grad_norm': 1.4219658374786377, 'learning_rate': 1.0487338748208313e-05, 'epoch': 1.34}


                                                 
 67%|██████▋   | 440/656 [02:24<00:59,  3.62it/s]

{'loss': 1.8204, 'grad_norm': 1.1479518413543701, 'learning_rate': 1.0511227902532251e-05, 'epoch': 1.34}


                                                 
 67%|██████▋   | 441/656 [02:24<01:07,  3.19it/s]

{'loss': 1.7981, 'grad_norm': 1.359683632850647, 'learning_rate': 1.0535117056856187e-05, 'epoch': 1.34}


                                                 
 67%|██████▋   | 442/656 [02:25<01:01,  3.47it/s]

{'loss': 1.812, 'grad_norm': 1.162262201309204, 'learning_rate': 1.0559006211180125e-05, 'epoch': 1.35}


                                                 
 68%|██████▊   | 443/656 [02:25<01:01,  3.48it/s]

{'loss': 1.7964, 'grad_norm': 1.2632828950881958, 'learning_rate': 1.058289536550406e-05, 'epoch': 1.35}


                                                 
 68%|██████▊   | 444/656 [02:25<00:58,  3.61it/s]

{'loss': 1.8451, 'grad_norm': 1.1563793420791626, 'learning_rate': 1.0606784519827999e-05, 'epoch': 1.35}


                                                 
 68%|██████▊   | 445/656 [02:25<00:58,  3.59it/s]

{'loss': 1.806, 'grad_norm': 1.5092861652374268, 'learning_rate': 1.0630673674151935e-05, 'epoch': 1.36}


                                                 
 68%|██████▊   | 446/656 [02:26<00:57,  3.63it/s]

{'loss': 1.8155, 'grad_norm': 1.103469967842102, 'learning_rate': 1.0654562828475872e-05, 'epoch': 1.36}


                                                 
 68%|██████▊   | 447/656 [02:26<00:58,  3.59it/s]

{'loss': 1.8208, 'grad_norm': 1.3147079944610596, 'learning_rate': 1.067845198279981e-05, 'epoch': 1.36}


                                                 
 68%|██████▊   | 448/656 [02:26<00:57,  3.61it/s]

{'loss': 1.791, 'grad_norm': 0.9577153325080872, 'learning_rate': 1.0702341137123746e-05, 'epoch': 1.37}


                                                 
 68%|██████▊   | 449/656 [02:26<00:55,  3.74it/s]

{'loss': 1.8507, 'grad_norm': 1.8072128295898438, 'learning_rate': 1.0726230291447684e-05, 'epoch': 1.37}


                                                 
 69%|██████▊   | 450/656 [02:27<00:54,  3.78it/s]

{'loss': 1.8, 'grad_norm': 1.3770933151245117, 'learning_rate': 1.075011944577162e-05, 'epoch': 1.37}


                                                 
 69%|██████▉   | 451/656 [02:27<01:02,  3.26it/s]

{'loss': 1.7987, 'grad_norm': 1.6927886009216309, 'learning_rate': 1.0774008600095558e-05, 'epoch': 1.38}


                                                 
 69%|██████▉   | 452/656 [02:27<01:00,  3.38it/s]

{'loss': 1.8179, 'grad_norm': 1.8017456531524658, 'learning_rate': 1.0797897754419494e-05, 'epoch': 1.38}


                                                 
 69%|██████▉   | 453/656 [02:28<00:59,  3.39it/s]

{'loss': 1.81, 'grad_norm': 1.1723835468292236, 'learning_rate': 1.0821786908743431e-05, 'epoch': 1.38}


                                                 
 69%|██████▉   | 454/656 [02:28<00:54,  3.68it/s]

{'loss': 1.802, 'grad_norm': 0.8952587842941284, 'learning_rate': 1.0845676063067369e-05, 'epoch': 1.38}


                                                 
 69%|██████▉   | 455/656 [02:28<00:57,  3.50it/s]

{'loss': 1.8208, 'grad_norm': 1.1559946537017822, 'learning_rate': 1.0869565217391305e-05, 'epoch': 1.39}


                                                 
 70%|██████▉   | 456/656 [02:29<00:59,  3.35it/s]

{'loss': 1.7822, 'grad_norm': 1.0329262018203735, 'learning_rate': 1.0893454371715243e-05, 'epoch': 1.39}


                                                 
 70%|██████▉   | 457/656 [02:29<00:59,  3.34it/s]

{'loss': 1.7984, 'grad_norm': 1.5425838232040405, 'learning_rate': 1.0917343526039179e-05, 'epoch': 1.39}


                                                 
 70%|██████▉   | 458/656 [02:29<00:58,  3.37it/s]

{'loss': 1.7969, 'grad_norm': 1.4490851163864136, 'learning_rate': 1.0941232680363115e-05, 'epoch': 1.4}


                                                 
 70%|██████▉   | 459/656 [02:29<00:54,  3.60it/s]

{'loss': 1.8006, 'grad_norm': 1.4047435522079468, 'learning_rate': 1.0965121834687053e-05, 'epoch': 1.4}


                                                 
 70%|███████   | 460/656 [02:30<00:51,  3.79it/s]

{'loss': 1.7857, 'grad_norm': 1.5041320323944092, 'learning_rate': 1.0989010989010989e-05, 'epoch': 1.4}


                                                 
 70%|███████   | 461/656 [02:30<00:50,  3.87it/s]

{'loss': 1.8392, 'grad_norm': 1.574475646018982, 'learning_rate': 1.1012900143334926e-05, 'epoch': 1.41}


                                                 
 70%|███████   | 462/656 [02:30<00:47,  4.05it/s]

{'loss': 1.7761, 'grad_norm': 1.7178179025650024, 'learning_rate': 1.1036789297658862e-05, 'epoch': 1.41}


                                                 
 71%|███████   | 463/656 [02:30<00:47,  4.10it/s]

{'loss': 1.8252, 'grad_norm': 1.5400651693344116, 'learning_rate': 1.10606784519828e-05, 'epoch': 1.41}


                                                 
 71%|███████   | 464/656 [02:31<00:47,  4.05it/s]

{'loss': 1.7621, 'grad_norm': 1.9494271278381348, 'learning_rate': 1.1084567606306736e-05, 'epoch': 1.41}


                                                 
 71%|███████   | 465/656 [02:31<00:46,  4.07it/s]

{'loss': 1.7661, 'grad_norm': 1.3053723573684692, 'learning_rate': 1.1108456760630674e-05, 'epoch': 1.42}


                                                 
 71%|███████   | 466/656 [02:31<00:48,  3.93it/s]

{'loss': 1.8003, 'grad_norm': 1.4730969667434692, 'learning_rate': 1.1132345914954612e-05, 'epoch': 1.42}


                                                 
 71%|███████   | 467/656 [02:31<00:48,  3.86it/s]

{'loss': 1.8356, 'grad_norm': 1.3917396068572998, 'learning_rate': 1.1156235069278548e-05, 'epoch': 1.42}


                                                 
 71%|███████▏  | 468/656 [02:32<00:51,  3.67it/s]

{'loss': 1.7739, 'grad_norm': 1.2842977046966553, 'learning_rate': 1.1180124223602485e-05, 'epoch': 1.43}


                                                 
 71%|███████▏  | 469/656 [02:32<00:50,  3.68it/s]

{'loss': 1.7657, 'grad_norm': 1.1763787269592285, 'learning_rate': 1.1204013377926421e-05, 'epoch': 1.43}


                                                 
 72%|███████▏  | 470/656 [02:32<00:49,  3.74it/s]

{'loss': 1.7884, 'grad_norm': 1.6645042896270752, 'learning_rate': 1.122790253225036e-05, 'epoch': 1.43}


                                                 
 72%|███████▏  | 471/656 [02:33<00:54,  3.41it/s]

{'loss': 1.7904, 'grad_norm': 1.778412103652954, 'learning_rate': 1.1251791686574297e-05, 'epoch': 1.44}


                                                 
 72%|███████▏  | 472/656 [02:33<00:50,  3.64it/s]

{'loss': 1.773, 'grad_norm': 1.0782321691513062, 'learning_rate': 1.1275680840898233e-05, 'epoch': 1.44}


                                                 
 72%|███████▏  | 473/656 [02:33<00:52,  3.47it/s]

{'loss': 1.7976, 'grad_norm': 1.9027972221374512, 'learning_rate': 1.129956999522217e-05, 'epoch': 1.44}


                                                 
 72%|███████▏  | 474/656 [02:33<00:49,  3.67it/s]

{'loss': 1.7881, 'grad_norm': 1.2379571199417114, 'learning_rate': 1.1323459149546107e-05, 'epoch': 1.45}


                                                 
 72%|███████▏  | 475/656 [02:34<00:45,  3.97it/s]

{'loss': 1.8172, 'grad_norm': 1.2693958282470703, 'learning_rate': 1.1347348303870044e-05, 'epoch': 1.45}


                                                 
 73%|███████▎  | 476/656 [02:34<00:44,  4.01it/s]

{'loss': 1.7683, 'grad_norm': 1.1437186002731323, 'learning_rate': 1.137123745819398e-05, 'epoch': 1.45}


                                                 
 73%|███████▎  | 477/656 [02:34<00:54,  3.29it/s]

{'loss': 1.7808, 'grad_norm': 1.0655804872512817, 'learning_rate': 1.1395126612517917e-05, 'epoch': 1.45}


                                                 
 73%|███████▎  | 478/656 [02:34<00:49,  3.61it/s]

{'loss': 1.7885, 'grad_norm': 1.2798526287078857, 'learning_rate': 1.1419015766841854e-05, 'epoch': 1.46}


                                                 
 73%|███████▎  | 479/656 [02:35<00:47,  3.72it/s]

{'loss': 1.7972, 'grad_norm': 1.0195180177688599, 'learning_rate': 1.144290492116579e-05, 'epoch': 1.46}


                                                 
 73%|███████▎  | 480/656 [02:35<00:46,  3.80it/s]

{'loss': 1.7895, 'grad_norm': 1.1582731008529663, 'learning_rate': 1.1466794075489728e-05, 'epoch': 1.46}


                                                 
 73%|███████▎  | 481/656 [02:35<00:45,  3.85it/s]

{'loss': 1.7896, 'grad_norm': 1.3318524360656738, 'learning_rate': 1.1490683229813664e-05, 'epoch': 1.47}


                                                 
 73%|███████▎  | 482/656 [02:35<00:46,  3.77it/s]

{'loss': 1.7993, 'grad_norm': 1.1532307863235474, 'learning_rate': 1.1514572384137602e-05, 'epoch': 1.47}


                                                 
 74%|███████▎  | 483/656 [02:36<00:48,  3.55it/s]

{'loss': 1.794, 'grad_norm': 1.2779264450073242, 'learning_rate': 1.153846153846154e-05, 'epoch': 1.47}


                                                 
 74%|███████▍  | 484/656 [02:36<00:47,  3.64it/s]

{'loss': 1.7528, 'grad_norm': 1.1779170036315918, 'learning_rate': 1.1562350692785476e-05, 'epoch': 1.48}


                                                 
 74%|███████▍  | 485/656 [02:36<00:47,  3.63it/s]

{'loss': 1.7894, 'grad_norm': 1.3390302658081055, 'learning_rate': 1.1586239847109413e-05, 'epoch': 1.48}


                                                 
 74%|███████▍  | 486/656 [02:37<00:49,  3.46it/s]

{'loss': 1.8069, 'grad_norm': 1.4736100435256958, 'learning_rate': 1.161012900143335e-05, 'epoch': 1.48}


                                                 
 74%|███████▍  | 487/656 [02:37<00:49,  3.38it/s]

{'loss': 1.7784, 'grad_norm': 1.3680654764175415, 'learning_rate': 1.1634018155757287e-05, 'epoch': 1.48}


                                                 
 74%|███████▍  | 488/656 [02:37<00:53,  3.14it/s]

{'loss': 1.7827, 'grad_norm': 1.5952787399291992, 'learning_rate': 1.1657907310081223e-05, 'epoch': 1.49}


                                                 
 75%|███████▍  | 489/656 [02:38<00:49,  3.35it/s]

{'loss': 1.7803, 'grad_norm': 1.2591522932052612, 'learning_rate': 1.168179646440516e-05, 'epoch': 1.49}


                                                 
 75%|███████▍  | 490/656 [02:38<00:47,  3.50it/s]

{'loss': 1.7657, 'grad_norm': 1.1684932708740234, 'learning_rate': 1.1705685618729099e-05, 'epoch': 1.49}


                                                 
 75%|███████▍  | 491/656 [02:38<00:45,  3.66it/s]

{'loss': 1.7937, 'grad_norm': 1.1524934768676758, 'learning_rate': 1.1729574773053035e-05, 'epoch': 1.5}


                                                 
 75%|███████▌  | 492/656 [02:38<00:49,  3.34it/s]

{'loss': 1.7915, 'grad_norm': 1.5629239082336426, 'learning_rate': 1.1753463927376972e-05, 'epoch': 1.5}


                                                 
 75%|███████▌  | 493/656 [02:39<00:46,  3.50it/s]

{'loss': 1.748, 'grad_norm': 1.1890438795089722, 'learning_rate': 1.1777353081700908e-05, 'epoch': 1.5}


                                                 
 75%|███████▌  | 494/656 [02:39<00:44,  3.62it/s]

{'loss': 1.7953, 'grad_norm': 1.382004976272583, 'learning_rate': 1.1801242236024846e-05, 'epoch': 1.51}


                                                 
 75%|███████▌  | 495/656 [02:39<00:44,  3.62it/s]

{'loss': 1.782, 'grad_norm': 1.4847506284713745, 'learning_rate': 1.1825131390348782e-05, 'epoch': 1.51}


                                                 
 76%|███████▌  | 496/656 [02:39<00:42,  3.80it/s]

{'loss': 1.7561, 'grad_norm': 1.5663286447525024, 'learning_rate': 1.1849020544672718e-05, 'epoch': 1.51}


                                                 
 76%|███████▌  | 497/656 [02:40<00:44,  3.61it/s]

{'loss': 1.7893, 'grad_norm': 1.5990841388702393, 'learning_rate': 1.1872909698996656e-05, 'epoch': 1.52}


                                                 
 76%|███████▌  | 498/656 [02:40<00:42,  3.74it/s]

{'loss': 1.8025, 'grad_norm': 1.4984883069992065, 'learning_rate': 1.1896798853320592e-05, 'epoch': 1.52}


                                                 
 76%|███████▌  | 499/656 [02:40<00:39,  4.01it/s]

{'loss': 1.74, 'grad_norm': 1.1149941682815552, 'learning_rate': 1.192068800764453e-05, 'epoch': 1.52}


                                                 
 76%|███████▌  | 500/656 [02:40<00:37,  4.21it/s]

{'loss': 1.7718, 'grad_norm': 2.025923013687134, 'learning_rate': 1.1944577161968466e-05, 'epoch': 1.52}


                                                 
 76%|███████▋  | 501/656 [02:41<00:38,  4.01it/s]

{'loss': 1.7747, 'grad_norm': 1.2262598276138306, 'learning_rate': 1.1968466316292403e-05, 'epoch': 1.53}


                                                 
 77%|███████▋  | 502/656 [02:41<00:38,  4.00it/s]

{'loss': 1.793, 'grad_norm': 1.3096551895141602, 'learning_rate': 1.1992355470616341e-05, 'epoch': 1.53}


                                                 
 77%|███████▋  | 503/656 [02:41<00:37,  4.08it/s]

{'loss': 1.7985, 'grad_norm': 1.054084300994873, 'learning_rate': 1.2016244624940277e-05, 'epoch': 1.53}


                                                 
 77%|███████▋  | 504/656 [02:41<00:37,  4.05it/s]

{'loss': 1.7657, 'grad_norm': 1.3250969648361206, 'learning_rate': 1.2040133779264215e-05, 'epoch': 1.54}


                                                 
 77%|███████▋  | 505/656 [02:42<00:35,  4.25it/s]

{'loss': 1.7424, 'grad_norm': 1.2094446420669556, 'learning_rate': 1.2064022933588151e-05, 'epoch': 1.54}


                                                 
 77%|███████▋  | 506/656 [02:42<00:35,  4.26it/s]

{'loss': 1.7714, 'grad_norm': 1.2914799451828003, 'learning_rate': 1.2087912087912089e-05, 'epoch': 1.54}


                                                 
 77%|███████▋  | 507/656 [02:42<00:34,  4.35it/s]

{'loss': 1.7607, 'grad_norm': 1.0272715091705322, 'learning_rate': 1.2111801242236026e-05, 'epoch': 1.55}


                                                 
                                                 
 78%|███████▊  | 509/656 [02:43<00:33,  4.45it/s]

{'loss': 1.7885, 'grad_norm': 1.8297902345657349, 'learning_rate': 1.2135690396559962e-05, 'epoch': 1.55}
{'loss': 1.7533, 'grad_norm': 1.2335307598114014, 'learning_rate': 1.21595795508839e-05, 'epoch': 1.55}


                                                 
 78%|███████▊  | 510/656 [02:43<00:36,  4.05it/s]

{'loss': 1.7851, 'grad_norm': 1.4821791648864746, 'learning_rate': 1.2183468705207836e-05, 'epoch': 1.55}


                                                 
 78%|███████▊  | 511/656 [02:43<00:38,  3.73it/s]

{'loss': 1.7436, 'grad_norm': 1.6822915077209473, 'learning_rate': 1.2207357859531774e-05, 'epoch': 1.56}


                                                 
 78%|███████▊  | 512/656 [02:43<00:39,  3.65it/s]

{'loss': 1.771, 'grad_norm': 1.2195109128952026, 'learning_rate': 1.223124701385571e-05, 'epoch': 1.56}


                                                 
 78%|███████▊  | 513/656 [02:44<00:37,  3.76it/s]

{'loss': 1.7464, 'grad_norm': 1.3999078273773193, 'learning_rate': 1.2255136168179648e-05, 'epoch': 1.56}


                                                 
 78%|███████▊  | 514/656 [02:44<00:36,  3.86it/s]

{'loss': 1.795, 'grad_norm': 1.502241849899292, 'learning_rate': 1.2279025322503584e-05, 'epoch': 1.57}


                                                 
 79%|███████▊  | 515/656 [02:44<00:35,  3.95it/s]

{'loss': 1.7333, 'grad_norm': 1.928929328918457, 'learning_rate': 1.230291447682752e-05, 'epoch': 1.57}


                                                 
 79%|███████▊  | 516/656 [02:44<00:37,  3.70it/s]

{'loss': 1.7048, 'grad_norm': 1.1186021566390991, 'learning_rate': 1.2326803631151458e-05, 'epoch': 1.57}


                                                 
 79%|███████▉  | 517/656 [02:45<00:38,  3.63it/s]

{'loss': 1.7973, 'grad_norm': 1.5579546689987183, 'learning_rate': 1.2350692785475394e-05, 'epoch': 1.58}


                                                 
 79%|███████▉  | 518/656 [02:45<00:38,  3.59it/s]

{'loss': 1.7415, 'grad_norm': 1.1911224126815796, 'learning_rate': 1.2374581939799331e-05, 'epoch': 1.58}


                                                 
 79%|███████▉  | 519/656 [02:45<00:35,  3.81it/s]

{'loss': 1.7686, 'grad_norm': 1.1558160781860352, 'learning_rate': 1.2398471094123269e-05, 'epoch': 1.58}


                                                 
 79%|███████▉  | 520/656 [02:46<00:35,  3.85it/s]

{'loss': 1.7477, 'grad_norm': 1.1968944072723389, 'learning_rate': 1.2422360248447205e-05, 'epoch': 1.59}


                                                 
 79%|███████▉  | 521/656 [02:46<00:34,  3.91it/s]

{'loss': 1.7408, 'grad_norm': 1.1787842512130737, 'learning_rate': 1.2446249402771143e-05, 'epoch': 1.59}


                                                 
 80%|███████▉  | 522/656 [02:46<00:35,  3.75it/s]

{'loss': 1.7349, 'grad_norm': 1.4676588773727417, 'learning_rate': 1.2470138557095079e-05, 'epoch': 1.59}


                                                 
 80%|███████▉  | 523/656 [02:46<00:34,  3.88it/s]

{'loss': 1.7783, 'grad_norm': 1.3034805059432983, 'learning_rate': 1.2494027711419017e-05, 'epoch': 1.59}


                                                 
 80%|███████▉  | 524/656 [02:47<00:37,  3.54it/s]

{'loss': 1.7377, 'grad_norm': 1.291278600692749, 'learning_rate': 1.2517916865742954e-05, 'epoch': 1.6}


                                                 
 80%|████████  | 525/656 [02:47<00:35,  3.72it/s]

{'loss': 1.7496, 'grad_norm': 1.5339607000350952, 'learning_rate': 1.254180602006689e-05, 'epoch': 1.6}


                                                 
 80%|████████  | 526/656 [02:47<00:35,  3.64it/s]

{'loss': 1.7417, 'grad_norm': 1.1916133165359497, 'learning_rate': 1.2565695174390826e-05, 'epoch': 1.6}


                                                 
 80%|████████  | 527/656 [02:47<00:33,  3.89it/s]

{'loss': 1.741, 'grad_norm': 1.5841172933578491, 'learning_rate': 1.2589584328714766e-05, 'epoch': 1.61}


                                                 
                                                 
 81%|████████  | 529/656 [02:48<00:29,  4.26it/s]

{'loss': 1.7863, 'grad_norm': 1.9450244903564453, 'learning_rate': 1.2613473483038702e-05, 'epoch': 1.61}
{'loss': 1.7716, 'grad_norm': 1.2905267477035522, 'learning_rate': 1.2637362637362638e-05, 'epoch': 1.61}


                                                 
 81%|████████  | 530/656 [02:48<00:29,  4.23it/s]

{'loss': 1.77, 'grad_norm': 1.5251833200454712, 'learning_rate': 1.2661251791686574e-05, 'epoch': 1.62}


                                                 
 81%|████████  | 531/656 [02:48<00:29,  4.20it/s]

{'loss': 1.732, 'grad_norm': 1.375551700592041, 'learning_rate': 1.2685140946010512e-05, 'epoch': 1.62}


                                                 
 81%|████████  | 532/656 [02:49<00:31,  3.91it/s]

{'loss': 1.7482, 'grad_norm': 1.3013132810592651, 'learning_rate': 1.270903010033445e-05, 'epoch': 1.62}


                                                 
 81%|████████▏ | 533/656 [02:49<00:31,  3.87it/s]

{'loss': 1.7164, 'grad_norm': 1.9478191137313843, 'learning_rate': 1.2732919254658385e-05, 'epoch': 1.62}


                                                 
 81%|████████▏ | 534/656 [02:49<00:31,  3.85it/s]

{'loss': 1.7379, 'grad_norm': 1.3245599269866943, 'learning_rate': 1.2756808408982323e-05, 'epoch': 1.63}


                                                 
 82%|████████▏ | 535/656 [02:49<00:32,  3.76it/s]

{'loss': 1.7261, 'grad_norm': 1.280444622039795, 'learning_rate': 1.278069756330626e-05, 'epoch': 1.63}


                                                 
 82%|████████▏ | 536/656 [02:50<00:31,  3.81it/s]

{'loss': 1.674, 'grad_norm': 1.6652437448501587, 'learning_rate': 1.2804586717630195e-05, 'epoch': 1.63}


                                                 
 82%|████████▏ | 537/656 [02:50<00:34,  3.44it/s]

{'loss': 1.7416, 'grad_norm': 1.5107618570327759, 'learning_rate': 1.2828475871954135e-05, 'epoch': 1.64}


                                                 
 82%|████████▏ | 538/656 [02:50<00:35,  3.33it/s]

{'loss': 1.7513, 'grad_norm': 1.4711333513259888, 'learning_rate': 1.285236502627807e-05, 'epoch': 1.64}


                                                 
 82%|████████▏ | 539/656 [02:51<00:35,  3.32it/s]

{'loss': 1.7502, 'grad_norm': 1.0041176080703735, 'learning_rate': 1.2876254180602007e-05, 'epoch': 1.64}


                                                 
 82%|████████▏ | 540/656 [02:51<00:34,  3.40it/s]

{'loss': 1.7006, 'grad_norm': 1.0208312273025513, 'learning_rate': 1.2900143334925943e-05, 'epoch': 1.65}


                                                 
 82%|████████▏ | 541/656 [02:51<00:34,  3.33it/s]

{'loss': 1.741, 'grad_norm': 1.389573097229004, 'learning_rate': 1.2924032489249882e-05, 'epoch': 1.65}


                                                 
 83%|████████▎ | 542/656 [02:52<00:33,  3.43it/s]

{'loss': 1.7418, 'grad_norm': 1.6272461414337158, 'learning_rate': 1.2947921643573818e-05, 'epoch': 1.65}


                                                 
 83%|████████▎ | 543/656 [02:52<00:30,  3.66it/s]

{'loss': 1.7202, 'grad_norm': 1.7210898399353027, 'learning_rate': 1.2971810797897754e-05, 'epoch': 1.66}


                                                 
 83%|████████▎ | 544/656 [02:52<00:30,  3.71it/s]

{'loss': 1.7188, 'grad_norm': 1.2755743265151978, 'learning_rate': 1.2995699952221694e-05, 'epoch': 1.66}


                                                 
 83%|████████▎ | 545/656 [02:52<00:31,  3.57it/s]

{'loss': 1.7428, 'grad_norm': 1.4366803169250488, 'learning_rate': 1.301958910654563e-05, 'epoch': 1.66}


                                                 
 83%|████████▎ | 546/656 [02:53<00:32,  3.41it/s]

{'loss': 1.7053, 'grad_norm': 1.4406381845474243, 'learning_rate': 1.3043478260869566e-05, 'epoch': 1.66}


                                                 
 83%|████████▎ | 547/656 [02:53<00:30,  3.61it/s]

{'loss': 1.731, 'grad_norm': 1.6528372764587402, 'learning_rate': 1.3067367415193502e-05, 'epoch': 1.67}


                                                 
 84%|████████▎ | 548/656 [02:53<00:31,  3.45it/s]

{'loss': 1.772, 'grad_norm': 1.2044934034347534, 'learning_rate': 1.3091256569517441e-05, 'epoch': 1.67}


                                                 
 84%|████████▎ | 549/656 [02:53<00:30,  3.54it/s]

{'loss': 1.7051, 'grad_norm': 1.2774425745010376, 'learning_rate': 1.3115145723841377e-05, 'epoch': 1.67}


                                                 
 84%|████████▍ | 550/656 [02:54<00:29,  3.64it/s]

{'loss': 1.7092, 'grad_norm': 1.7213070392608643, 'learning_rate': 1.3139034878165313e-05, 'epoch': 1.68}


                                                 
 84%|████████▍ | 551/656 [02:54<00:28,  3.73it/s]

{'loss': 1.7313, 'grad_norm': 1.35158109664917, 'learning_rate': 1.3162924032489251e-05, 'epoch': 1.68}


                                                 
                                                 
 84%|████████▍ | 553/656 [02:54<00:24,  4.19it/s]

{'loss': 1.6744, 'grad_norm': 1.371487021446228, 'learning_rate': 1.3186813186813187e-05, 'epoch': 1.68}
{'loss': 1.7393, 'grad_norm': 1.6384786367416382, 'learning_rate': 1.3210702341137123e-05, 'epoch': 1.69}


                                                 
 84%|████████▍ | 554/656 [02:55<00:25,  4.05it/s]

{'loss': 1.7513, 'grad_norm': 1.482858657836914, 'learning_rate': 1.323459149546106e-05, 'epoch': 1.69}


                                                 
 85%|████████▍ | 555/656 [02:55<00:24,  4.17it/s]

{'loss': 1.7446, 'grad_norm': 1.1845117807388306, 'learning_rate': 1.3258480649784999e-05, 'epoch': 1.69}


                                                 
 85%|████████▍ | 556/656 [02:55<00:26,  3.84it/s]

{'loss': 1.7097, 'grad_norm': 1.2442466020584106, 'learning_rate': 1.3282369804108935e-05, 'epoch': 1.7}


                                                 
 85%|████████▍ | 557/656 [02:55<00:24,  4.02it/s]

{'loss': 1.7179, 'grad_norm': 1.459168791770935, 'learning_rate': 1.330625895843287e-05, 'epoch': 1.7}


                                                 
 85%|████████▌ | 558/656 [02:56<00:24,  3.94it/s]

{'loss': 1.7372, 'grad_norm': 1.151832938194275, 'learning_rate': 1.333014811275681e-05, 'epoch': 1.7}


                                                 
 85%|████████▌ | 559/656 [02:56<00:26,  3.62it/s]

{'loss': 1.6925, 'grad_norm': 1.3604778051376343, 'learning_rate': 1.3354037267080746e-05, 'epoch': 1.7}


                                                 
 85%|████████▌ | 560/656 [02:56<00:25,  3.72it/s]

{'loss': 1.7342, 'grad_norm': 1.1499336957931519, 'learning_rate': 1.3377926421404682e-05, 'epoch': 1.71}


                                                 
 86%|████████▌ | 561/656 [02:57<00:25,  3.78it/s]

{'loss': 1.7062, 'grad_norm': 1.6115288734436035, 'learning_rate': 1.3401815575728622e-05, 'epoch': 1.71}


                                                 
 86%|████████▌ | 562/656 [02:57<00:23,  3.97it/s]

{'loss': 1.689, 'grad_norm': 1.1033879518508911, 'learning_rate': 1.3425704730052558e-05, 'epoch': 1.71}


                                                 
 86%|████████▌ | 563/656 [02:57<00:22,  4.08it/s]

{'loss': 1.7231, 'grad_norm': 1.090693473815918, 'learning_rate': 1.3449593884376494e-05, 'epoch': 1.72}


                                                 
 86%|████████▌ | 564/656 [02:57<00:22,  4.16it/s]

{'loss': 1.7337, 'grad_norm': 1.3413697481155396, 'learning_rate': 1.347348303870043e-05, 'epoch': 1.72}


                                                 
 86%|████████▌ | 565/656 [02:57<00:20,  4.35it/s]

{'loss': 1.727, 'grad_norm': 1.2474180459976196, 'learning_rate': 1.3497372193024369e-05, 'epoch': 1.72}


                                                 
 86%|████████▋ | 566/656 [02:58<00:20,  4.37it/s]

{'loss': 1.6677, 'grad_norm': 1.458691954612732, 'learning_rate': 1.3521261347348305e-05, 'epoch': 1.73}


                                                 
 86%|████████▋ | 567/656 [02:58<00:24,  3.66it/s]

{'loss': 1.6744, 'grad_norm': 1.657012939453125, 'learning_rate': 1.3545150501672241e-05, 'epoch': 1.73}


                                                 
 87%|████████▋ | 568/656 [02:58<00:23,  3.70it/s]

{'loss': 1.7323, 'grad_norm': 1.276488184928894, 'learning_rate': 1.3569039655996179e-05, 'epoch': 1.73}


                                                 
 87%|████████▋ | 569/656 [02:58<00:22,  3.94it/s]

{'loss': 1.6878, 'grad_norm': 1.6231757402420044, 'learning_rate': 1.3592928810320115e-05, 'epoch': 1.73}


                                                 
 87%|████████▋ | 570/656 [02:59<00:24,  3.58it/s]

{'loss': 1.7105, 'grad_norm': 1.327246069908142, 'learning_rate': 1.3616817964644053e-05, 'epoch': 1.74}


                                                 
 87%|████████▋ | 571/656 [02:59<00:24,  3.42it/s]

{'loss': 1.6919, 'grad_norm': 1.2320183515548706, 'learning_rate': 1.3640707118967989e-05, 'epoch': 1.74}


                                                 
 87%|████████▋ | 572/656 [02:59<00:22,  3.68it/s]

{'loss': 1.682, 'grad_norm': 1.2407023906707764, 'learning_rate': 1.3664596273291926e-05, 'epoch': 1.74}


                                                 
 87%|████████▋ | 573/656 [03:00<00:23,  3.60it/s]

{'loss': 1.701, 'grad_norm': 1.380947470664978, 'learning_rate': 1.3688485427615862e-05, 'epoch': 1.75}


                                                 
 88%|████████▊ | 574/656 [03:00<00:21,  3.75it/s]

{'loss': 1.7348, 'grad_norm': 1.7170687913894653, 'learning_rate': 1.3712374581939799e-05, 'epoch': 1.75}


                                                 
 88%|████████▊ | 575/656 [03:00<00:21,  3.80it/s]

{'loss': 1.6759, 'grad_norm': 1.869567632675171, 'learning_rate': 1.3736263736263738e-05, 'epoch': 1.75}


                                                 
 88%|████████▊ | 576/656 [03:00<00:20,  3.88it/s]

{'loss': 1.6969, 'grad_norm': 1.6241053342819214, 'learning_rate': 1.3760152890587674e-05, 'epoch': 1.76}


                                                 
 88%|████████▊ | 577/656 [03:01<00:20,  3.80it/s]

{'loss': 1.6982, 'grad_norm': 1.3955810070037842, 'learning_rate': 1.378404204491161e-05, 'epoch': 1.76}


                                                 
 88%|████████▊ | 578/656 [03:01<00:20,  3.73it/s]

{'loss': 1.7033, 'grad_norm': 1.6001613140106201, 'learning_rate': 1.3807931199235546e-05, 'epoch': 1.76}


                                                 
 88%|████████▊ | 579/656 [03:01<00:20,  3.81it/s]

{'loss': 1.7019, 'grad_norm': 1.2556074857711792, 'learning_rate': 1.3831820353559485e-05, 'epoch': 1.77}


                                                 
 88%|████████▊ | 580/656 [03:01<00:19,  3.83it/s]

{'loss': 1.6658, 'grad_norm': 1.3774261474609375, 'learning_rate': 1.3855709507883422e-05, 'epoch': 1.77}


                                                 
 89%|████████▊ | 581/656 [03:02<00:18,  4.02it/s]

{'loss': 1.6799, 'grad_norm': 1.075614333152771, 'learning_rate': 1.3879598662207358e-05, 'epoch': 1.77}


                                                 
 89%|████████▊ | 582/656 [03:02<00:19,  3.77it/s]

{'loss': 1.6935, 'grad_norm': 1.4437975883483887, 'learning_rate': 1.3903487816531297e-05, 'epoch': 1.77}


                                                 
 89%|████████▉ | 583/656 [03:02<00:20,  3.62it/s]

{'loss': 1.6903, 'grad_norm': 1.4562939405441284, 'learning_rate': 1.3927376970855233e-05, 'epoch': 1.78}


                                                 
 89%|████████▉ | 584/656 [03:03<00:22,  3.21it/s]

{'loss': 1.6981, 'grad_norm': 1.1478254795074463, 'learning_rate': 1.3951266125179169e-05, 'epoch': 1.78}


                                                 
 89%|████████▉ | 585/656 [03:03<00:20,  3.47it/s]

{'loss': 1.6553, 'grad_norm': 1.1200876235961914, 'learning_rate': 1.3975155279503105e-05, 'epoch': 1.78}


                                                 
 89%|████████▉ | 586/656 [03:03<00:20,  3.45it/s]

{'loss': 1.734, 'grad_norm': 1.684448003768921, 'learning_rate': 1.3999044433827045e-05, 'epoch': 1.79}


                                                 
 89%|████████▉ | 587/656 [03:03<00:19,  3.61it/s]

{'loss': 1.6853, 'grad_norm': 1.205403208732605, 'learning_rate': 1.402293358815098e-05, 'epoch': 1.79}


                                                 
 90%|████████▉ | 588/656 [03:04<00:18,  3.61it/s]

{'loss': 1.7383, 'grad_norm': 1.9111050367355347, 'learning_rate': 1.4046822742474917e-05, 'epoch': 1.79}


                                                 
 90%|████████▉ | 589/656 [03:04<00:17,  3.90it/s]

{'loss': 1.6946, 'grad_norm': 1.0657134056091309, 'learning_rate': 1.4070711896798854e-05, 'epoch': 1.8}


                                                 
 90%|████████▉ | 590/656 [03:04<00:16,  3.90it/s]

{'loss': 1.6785, 'grad_norm': 1.1006114482879639, 'learning_rate': 1.409460105112279e-05, 'epoch': 1.8}


                                                 
 90%|█████████ | 591/656 [03:04<00:16,  3.94it/s]

{'loss': 1.6588, 'grad_norm': 1.3152196407318115, 'learning_rate': 1.4118490205446726e-05, 'epoch': 1.8}


                                                 
 90%|█████████ | 592/656 [03:05<00:17,  3.74it/s]

{'loss': 1.7254, 'grad_norm': 1.630596399307251, 'learning_rate': 1.4142379359770666e-05, 'epoch': 1.8}


                                                 
 90%|█████████ | 593/656 [03:05<00:16,  3.79it/s]

{'loss': 1.6846, 'grad_norm': 1.3987642526626587, 'learning_rate': 1.4166268514094602e-05, 'epoch': 1.81}


                                                 
 91%|█████████ | 594/656 [03:05<00:17,  3.51it/s]

{'loss': 1.6835, 'grad_norm': 1.0902972221374512, 'learning_rate': 1.4190157668418538e-05, 'epoch': 1.81}


                                                 
 91%|█████████ | 595/656 [03:06<00:17,  3.46it/s]

{'loss': 1.6677, 'grad_norm': 1.3433775901794434, 'learning_rate': 1.4214046822742474e-05, 'epoch': 1.81}


                                                 
 91%|█████████ | 596/656 [03:06<00:16,  3.55it/s]

{'loss': 1.6204, 'grad_norm': 1.727049469947815, 'learning_rate': 1.4237935977066413e-05, 'epoch': 1.82}


                                                 
 91%|█████████ | 597/656 [03:06<00:15,  3.69it/s]

{'loss': 1.6552, 'grad_norm': 1.4635417461395264, 'learning_rate': 1.426182513139035e-05, 'epoch': 1.82}


                                                 
 91%|█████████ | 598/656 [03:06<00:16,  3.44it/s]

{'loss': 1.6809, 'grad_norm': 1.4290940761566162, 'learning_rate': 1.4285714285714285e-05, 'epoch': 1.82}


                                                 
 91%|█████████▏| 599/656 [03:07<00:16,  3.50it/s]

{'loss': 1.6623, 'grad_norm': 1.3939324617385864, 'learning_rate': 1.4309603440038225e-05, 'epoch': 1.83}


                                                 
 91%|█████████▏| 600/656 [03:07<00:14,  3.80it/s]

{'loss': 1.6662, 'grad_norm': 1.623011589050293, 'learning_rate': 1.4333492594362161e-05, 'epoch': 1.83}


                                                 
 92%|█████████▏| 601/656 [03:07<00:14,  3.81it/s]

{'loss': 1.6693, 'grad_norm': 1.1180410385131836, 'learning_rate': 1.4357381748686097e-05, 'epoch': 1.83}


                                                 
 92%|█████████▏| 602/656 [03:07<00:13,  3.90it/s]

{'loss': 1.6995, 'grad_norm': 1.68422532081604, 'learning_rate': 1.4381270903010033e-05, 'epoch': 1.84}


                                                 
 92%|█████████▏| 603/656 [03:08<00:13,  3.95it/s]

{'loss': 1.6343, 'grad_norm': 1.674067497253418, 'learning_rate': 1.4405160057333972e-05, 'epoch': 1.84}


                                                 
 92%|█████████▏| 604/656 [03:08<00:13,  3.98it/s]

{'loss': 1.687, 'grad_norm': 1.4947848320007324, 'learning_rate': 1.4429049211657908e-05, 'epoch': 1.84}


                                                 
 92%|█████████▏| 605/656 [03:08<00:13,  3.89it/s]

{'loss': 1.6674, 'grad_norm': 1.4066537618637085, 'learning_rate': 1.4452938365981844e-05, 'epoch': 1.84}


                                                 
 92%|█████████▏| 606/656 [03:08<00:12,  3.97it/s]

{'loss': 1.7143, 'grad_norm': 1.369555950164795, 'learning_rate': 1.4476827520305782e-05, 'epoch': 1.85}


                                                 
 93%|█████████▎| 607/656 [03:09<00:12,  3.85it/s]

{'loss': 1.6659, 'grad_norm': 1.5868827104568481, 'learning_rate': 1.4500716674629718e-05, 'epoch': 1.85}


                                                 
 93%|█████████▎| 608/656 [03:09<00:12,  3.85it/s]

{'loss': 1.6877, 'grad_norm': 1.6599732637405396, 'learning_rate': 1.4524605828953656e-05, 'epoch': 1.85}


                                                 
 93%|█████████▎| 609/656 [03:09<00:11,  4.02it/s]

{'loss': 1.6997, 'grad_norm': 2.007035493850708, 'learning_rate': 1.4548494983277592e-05, 'epoch': 1.86}


                                                 
 93%|█████████▎| 610/656 [03:10<00:11,  3.88it/s]

{'loss': 1.6609, 'grad_norm': 1.5438170433044434, 'learning_rate': 1.457238413760153e-05, 'epoch': 1.86}


                                                 
 93%|█████████▎| 611/656 [03:10<00:11,  3.93it/s]

{'loss': 1.6498, 'grad_norm': 1.3706166744232178, 'learning_rate': 1.4596273291925466e-05, 'epoch': 1.86}


                                                 
 93%|█████████▎| 612/656 [03:10<00:12,  3.61it/s]

{'loss': 1.655, 'grad_norm': 1.501865029335022, 'learning_rate': 1.4620162446249402e-05, 'epoch': 1.87}


                                                 
 93%|█████████▎| 613/656 [03:10<00:11,  3.67it/s]

{'loss': 1.6596, 'grad_norm': 1.3076202869415283, 'learning_rate': 1.4644051600573341e-05, 'epoch': 1.87}


                                                 
 94%|█████████▎| 614/656 [03:11<00:11,  3.62it/s]

{'loss': 1.6594, 'grad_norm': 1.3792518377304077, 'learning_rate': 1.4667940754897277e-05, 'epoch': 1.87}


                                                 
 94%|█████████▍| 615/656 [03:11<00:12,  3.38it/s]

{'loss': 1.6466, 'grad_norm': 1.49551522731781, 'learning_rate': 1.4691829909221213e-05, 'epoch': 1.88}


                                                 
 94%|█████████▍| 616/656 [03:11<00:11,  3.61it/s]

{'loss': 1.661, 'grad_norm': 1.3923689126968384, 'learning_rate': 1.4715719063545153e-05, 'epoch': 1.88}


                                                 
 94%|█████████▍| 617/656 [03:12<00:10,  3.57it/s]

{'loss': 1.5882, 'grad_norm': 1.430981993675232, 'learning_rate': 1.4739608217869089e-05, 'epoch': 1.88}


                                                 
 94%|█████████▍| 618/656 [03:12<00:11,  3.42it/s]

{'loss': 1.625, 'grad_norm': 1.467315673828125, 'learning_rate': 1.4763497372193025e-05, 'epoch': 1.88}


                                                 
 94%|█████████▍| 619/656 [03:12<00:11,  3.21it/s]

{'loss': 1.6631, 'grad_norm': 1.5783613920211792, 'learning_rate': 1.478738652651696e-05, 'epoch': 1.89}


                                                 
 95%|█████████▍| 620/656 [03:12<00:10,  3.36it/s]

{'loss': 1.6151, 'grad_norm': 1.1227484941482544, 'learning_rate': 1.48112756808409e-05, 'epoch': 1.89}


                                                 
 95%|█████████▍| 621/656 [03:13<00:09,  3.50it/s]

{'loss': 1.6555, 'grad_norm': 1.3132095336914062, 'learning_rate': 1.4835164835164836e-05, 'epoch': 1.89}


                                                 
 95%|█████████▍| 622/656 [03:13<00:09,  3.63it/s]

{'loss': 1.684, 'grad_norm': 1.1001192331314087, 'learning_rate': 1.4859053989488772e-05, 'epoch': 1.9}


                                                 
 95%|█████████▍| 623/656 [03:13<00:09,  3.31it/s]

{'loss': 1.6476, 'grad_norm': 1.7854992151260376, 'learning_rate': 1.4882943143812712e-05, 'epoch': 1.9}


                                                 
 95%|█████████▌| 624/656 [03:14<00:09,  3.39it/s]

{'loss': 1.5551, 'grad_norm': 1.86830472946167, 'learning_rate': 1.4906832298136648e-05, 'epoch': 1.9}


                                                 
 95%|█████████▌| 625/656 [03:14<00:09,  3.32it/s]

{'loss': 1.6477, 'grad_norm': 1.35952889919281, 'learning_rate': 1.4930721452460584e-05, 'epoch': 1.91}


                                                 
 95%|█████████▌| 626/656 [03:14<00:09,  3.19it/s]

{'loss': 1.6317, 'grad_norm': 1.1684638261795044, 'learning_rate': 1.495461060678452e-05, 'epoch': 1.91}


                                                 
 96%|█████████▌| 627/656 [03:14<00:08,  3.53it/s]

{'loss': 1.6719, 'grad_norm': 1.6594736576080322, 'learning_rate': 1.4978499761108458e-05, 'epoch': 1.91}


                                                 
 96%|█████████▌| 628/656 [03:15<00:08,  3.18it/s]

{'loss': 1.6747, 'grad_norm': 1.4911247491836548, 'learning_rate': 1.5002388915432394e-05, 'epoch': 1.91}


                                                 
 96%|█████████▌| 629/656 [03:15<00:08,  3.30it/s]

{'loss': 1.5796, 'grad_norm': 1.671470284461975, 'learning_rate': 1.502627806975633e-05, 'epoch': 1.92}


                                                 
 96%|█████████▌| 630/656 [03:15<00:07,  3.47it/s]

{'loss': 1.6646, 'grad_norm': 1.2308238744735718, 'learning_rate': 1.5050167224080269e-05, 'epoch': 1.92}


                                                 
 96%|█████████▌| 631/656 [03:16<00:07,  3.39it/s]

{'loss': 1.6688, 'grad_norm': 1.43638277053833, 'learning_rate': 1.5074056378404205e-05, 'epoch': 1.92}


                                                 
 96%|█████████▋| 632/656 [03:16<00:07,  3.41it/s]

{'loss': 1.6478, 'grad_norm': 1.4884467124938965, 'learning_rate': 1.5097945532728141e-05, 'epoch': 1.93}


                                                 
 96%|█████████▋| 633/656 [03:16<00:06,  3.31it/s]

{'loss': 1.6047, 'grad_norm': 1.6493834257125854, 'learning_rate': 1.5121834687052077e-05, 'epoch': 1.93}


                                                 
 97%|█████████▋| 634/656 [03:17<00:06,  3.40it/s]

{'loss': 1.6305, 'grad_norm': 1.341217279434204, 'learning_rate': 1.5145723841376017e-05, 'epoch': 1.93}


                                                 
 97%|█████████▋| 635/656 [03:17<00:06,  3.42it/s]

{'loss': 1.6395, 'grad_norm': 1.5593887567520142, 'learning_rate': 1.5169612995699953e-05, 'epoch': 1.94}


                                                 
 97%|█████████▋| 636/656 [03:17<00:05,  3.45it/s]

{'loss': 1.5857, 'grad_norm': 1.3531558513641357, 'learning_rate': 1.5193502150023889e-05, 'epoch': 1.94}


                                                 
 97%|█████████▋| 637/656 [03:17<00:05,  3.59it/s]

{'loss': 1.5898, 'grad_norm': 1.2341465950012207, 'learning_rate': 1.5217391304347828e-05, 'epoch': 1.94}


                                                 
 97%|█████████▋| 638/656 [03:18<00:05,  3.59it/s]

{'loss': 1.6207, 'grad_norm': 1.3463183641433716, 'learning_rate': 1.5241280458671764e-05, 'epoch': 1.95}


                                                 
 97%|█████████▋| 639/656 [03:18<00:05,  3.26it/s]

{'loss': 1.6114, 'grad_norm': 1.2429475784301758, 'learning_rate': 1.52651696129957e-05, 'epoch': 1.95}


                                                 
 98%|█████████▊| 640/656 [03:18<00:04,  3.25it/s]

{'loss': 1.6485, 'grad_norm': 2.0185537338256836, 'learning_rate': 1.528905876731964e-05, 'epoch': 1.95}


                                                 
 98%|█████████▊| 641/656 [03:19<00:04,  3.38it/s]

{'loss': 1.638, 'grad_norm': 1.6596033573150635, 'learning_rate': 1.5312947921643576e-05, 'epoch': 1.95}


                                                 
 98%|█████████▊| 642/656 [03:19<00:04,  3.36it/s]

{'loss': 1.641, 'grad_norm': 1.755290150642395, 'learning_rate': 1.5336837075967512e-05, 'epoch': 1.96}


                                                 
 98%|█████████▊| 643/656 [03:19<00:03,  3.51it/s]

{'loss': 1.6366, 'grad_norm': 1.1701794862747192, 'learning_rate': 1.5360726230291448e-05, 'epoch': 1.96}


                                                 
 98%|█████████▊| 644/656 [03:19<00:03,  3.61it/s]

{'loss': 1.6327, 'grad_norm': 1.54970121383667, 'learning_rate': 1.5384615384615387e-05, 'epoch': 1.96}


                                                 
 98%|█████████▊| 645/656 [03:20<00:02,  3.70it/s]

{'loss': 1.6334, 'grad_norm': 1.3534610271453857, 'learning_rate': 1.5408504538939323e-05, 'epoch': 1.97}


                                                 
 98%|█████████▊| 646/656 [03:20<00:02,  3.80it/s]

{'loss': 1.6266, 'grad_norm': 1.0406036376953125, 'learning_rate': 1.543239369326326e-05, 'epoch': 1.97}


                                                 
 99%|█████████▊| 647/656 [03:20<00:02,  3.88it/s]

{'loss': 1.6823, 'grad_norm': 1.2368537187576294, 'learning_rate': 1.54562828475872e-05, 'epoch': 1.97}


                                                 
 99%|█████████▉| 648/656 [03:20<00:02,  3.76it/s]

{'loss': 1.5956, 'grad_norm': 1.2803016901016235, 'learning_rate': 1.5480172001911135e-05, 'epoch': 1.98}


                                                 
 99%|█████████▉| 649/656 [03:21<00:01,  3.83it/s]

{'loss': 1.6228, 'grad_norm': 1.5071793794631958, 'learning_rate': 1.550406115623507e-05, 'epoch': 1.98}


                                                 
 99%|█████████▉| 650/656 [03:21<00:01,  3.37it/s]

{'loss': 1.5303, 'grad_norm': 1.7077335119247437, 'learning_rate': 1.5527950310559007e-05, 'epoch': 1.98}


                                                 
 99%|█████████▉| 651/656 [03:21<00:01,  3.37it/s]

{'loss': 1.6593, 'grad_norm': 1.505871295928955, 'learning_rate': 1.5551839464882946e-05, 'epoch': 1.98}


                                                 
 99%|█████████▉| 652/656 [03:22<00:01,  3.06it/s]

{'loss': 1.5702, 'grad_norm': 1.2030895948410034, 'learning_rate': 1.5575728619206882e-05, 'epoch': 1.99}


                                                 
100%|█████████▉| 653/656 [03:22<00:01,  2.96it/s]

{'loss': 1.589, 'grad_norm': 1.1132755279541016, 'learning_rate': 1.5599617773530818e-05, 'epoch': 1.99}


                                                 
100%|█████████▉| 654/656 [03:22<00:00,  3.31it/s]

{'loss': 1.6034, 'grad_norm': 1.2978816032409668, 'learning_rate': 1.5623506927854754e-05, 'epoch': 1.99}


                                                 
                                                 
100%|██████████| 656/656 [03:23<00:00,  3.48it/s]Saving model checkpoint to ./snips_clf/results/checkpoint-656
Configuration saved in ./snips_clf/results/checkpoint-656/config.json
Model weights saved in ./snips_clf/results/checkpoint-656/model.safetensors
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance, token_label, tokens. If utterance, token_label, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'loss': 1.6064, 'grad_norm': 1.3096256256103516, 'learning_rate': 1.564739608217869e-05, 'epoch': 2.0}
{'loss': 1.499, 'grad_norm': 3.8476948738098145, 'learning_rate': 1.5671285236502626e-05, 'epoch': 2.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 
[A                                    

100%|██████████| 656/656 [03:47<00:00,  3.48it/s]
[A
[ASaving model checkpoint to ./snips_clf/results/checkpoint-656
Configuration saved in ./snips_clf/results/checkpoint-656/config.json
Model weights saved in ./snips_clf/results/checkpoint-656/model.safetensors


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./snips_clf/results/checkpoint-656 (score: 1.5848826169967651).
                                                 
100%|██████████| 656/656 [03:47<00:00,  2.89it/s]

{'eval_loss': 1.5848826169967651, 'eval_accuracy': 0.7875429881543753, 'eval_runtime': 23.749, 'eval_samples_per_second': 110.194, 'eval_steps_per_second': 3.453, 'epoch': 2.0}
{'train_runtime': 227.1811, 'train_samples_per_second': 92.147, 'train_steps_per_second': 2.888, 'train_loss': 1.8466433021353512, 'epoch': 2.0}





TrainOutput(global_step=656, training_loss=1.8466433021353512, metrics={'train_runtime': 227.1811, 'train_samples_per_second': 92.147, 'train_steps_per_second': 2.888, 'total_flos': 116893238379912.0, 'train_loss': 1.8466433021353512, 'epoch': 2.0})

In [40]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance, token_label, tokens. If utterance, token_label, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
100%|██████████| 82/82 [00:15<00:00,  5.28it/s]


{'eval_loss': 1.5848826169967651,
 'eval_accuracy': 0.7875429881543753,
 'eval_runtime': 15.7042,
 'eval_samples_per_second': 166.643,
 'eval_steps_per_second': 5.222,
 'epoch': 2.0}