In [81]:
from transformers import Trainer, TrainingArguments, DistilBertForTokenClassification, DistilBertTokenizerFast, DataCollatorForTokenClassification, pipeline
from datasets import load_metric, Dataset
import numpy as np
import torch

In [82]:
tokenizer=DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [83]:
snips_file=open("Data/Snips Train Data.txt")
snips_rows=snips_file.readlines()
snips_rows[:20]

['listen O\n',
 'to O\n',
 'westbam B-artist\n',
 'alumb O\n',
 'allergic B-album\n',
 'on O\n',
 'google B-service\n',
 'music I-service\n',
 'PlayMusic\n',
 '\n',
 'add O\n',
 'step B-entity_name\n',
 'to I-entity_name\n',
 'me I-entity_name\n',
 'to O\n',
 'the O\n',
 '50 B-playlist\n',
 'clásicos I-playlist\n',
 'playlist O\n',
 'AddToPlaylist\n']

In [84]:
utterances = []
tokenized_utterances = []
labels_for_tokens = []
sequence_labels = []
utterance, tokenized_utterance, label_for_utterances = '', [], []

for snip_row in snips_rows:
    if len(snip_row) == 2:  # Skip rows with insufficient data
        continue
    if ' ' not in snip_row.strip():  # Sequence label
        sequence_label = snip_row.strip()
        if sequence_label:  # Only add non-empty labels
            sequence_labels.append(sequence_label)
        if utterance.strip():  # Avoid appending empty utterances
            utterances.append(utterance.strip())
            tokenized_utterances.append(tokenized_utterance)
            labels_for_tokens.append(label_for_utterances)
        # Reset for next sequence
        utterance = ''
        tokenized_utterance = []
        label_for_utterances = []
        continue
    token, token_label = snip_row.split(' ')
    token_label = token_label.strip()
    utterance += f'{token} '
    tokenized_utterance.append(token)
    label_for_utterances.append(token_label)

In [85]:
len(labels_for_tokens) , len(tokenized_utterances), len(utterances), len(sequence_labels)

(13084, 13084, 13084, 13084)

In [86]:
print(tokenized_utterances[0])
print(labels_for_tokens[0])
print(utterances[0])
print(sequence_labels[0])

['listen', 'to', 'westbam', 'alumb', 'allergic', 'on', 'google', 'music']
['O', 'O', 'B-artist', 'O', 'B-album', 'O', 'B-service', 'I-service']
listen to westbam alumb allergic on google music
PlayMusic


In [87]:
unique_sequence_labels=list(set(sequence_labels))
sequence_labels=[unique_sequence_labels.index(l) for l in sequence_labels]
print(len(unique_sequence_labels))
print(unique_sequence_labels)

7
['BookRestaurant', 'RateBook', 'SearchScreeningEvent', 'AddToPlaylist', 'PlayMusic', 'GetWeather', 'SearchCreativeWork']


In [88]:
from functools import reduce

unique_token_labels=list(set(reduce(lambda x, y:x+y,labels_for_tokens)))
labels_for_tokens=[[unique_token_labels.index(_) for _ in l]for l in labels_for_tokens]
print(len(unique_token_labels))

72


In [89]:
print(tokenized_utterances[0])
print(labels_for_tokens[0])
print(utterances[0])
print(sequence_labels[0])

['listen', 'to', 'westbam', 'alumb', 'allergic', 'on', 'google', 'music']
[54, 54, 22, 54, 30, 54, 65, 35]
listen to westbam alumb allergic on google music
4


In [90]:
snips_dataset=Dataset.from_dict(dict(utterance=utterances,label=sequence_labels,tokens=tokenized_utterances,token_label=labels_for_tokens))
snips_dataset=snips_dataset.train_test_split(0.2)

In [91]:
print(snips_dataset['train'][0])

{'utterance': 'i need a table for four at ten pm in dodge park', 'label': 0, 'tokens': ['i', 'need', 'a', 'table', 'for', 'four', 'at', 'ten', 'pm', 'in', 'dodge', 'park'], 'token_label': [54, 54, 54, 54, 54, 49, 54, 5, 50, 54, 57, 24]}


In [92]:
def tokenize_and_align_labels(examples):
    """
    Tokenizes the input text and aligns the labels with the wordpiece tokenization.

    Args:
        examples: A batch of examples containing 'tokens' and 'token_label'.
        tokenizer: The tokenizer to use for tokenization (e.g., a BERT tokenizer).

    Returns:
        A dictionary containing the tokenized inputs and aligned labels.
    """
    # Tokenize the inputs
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True, padding=True)

    labels = []
    for i, label in enumerate(examples["token_label"]):
        # Map tokenized words back to original word IDs
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # This maps the tokenized tokens to their original word indices
        previous_word_idx = None
        label_ids = []

        # Iterate through the word_ids to align the labels with tokens
        for word_idx in word_ids:
            if word_idx is None:  # Special tokens (CLS, SEP, etc.)
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # First token of a new word
                label_ids.append(label[word_idx])
            else:  # Other tokens in the same word
                label_ids.append(-100)  # Set to -100 for subword tokens
            previous_word_idx = word_idx
        
        labels.append(label_ids)

    # Add the aligned labels to the tokenized inputs
    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [93]:
snips_dataset['train'][0]

{'utterance': 'i need a table for four at ten pm in dodge park',
 'label': 0,
 'tokens': ['i',
  'need',
  'a',
  'table',
  'for',
  'four',
  'at',
  'ten',
  'pm',
  'in',
  'dodge',
  'park'],
 'token_label': [54, 54, 54, 54, 54, 49, 54, 5, 50, 54, 57, 24]}

In [94]:
token_clf_token_snips=snips_dataset.map(tokenize_and_align_labels,batched=True)

100%|██████████| 11/11 [00:00<00:00, 13.07ba/s]
100%|██████████| 3/3 [00:00<00:00, 20.53ba/s]


In [95]:
token_clf_token_snips['train'][0]

{'utterance': 'i need a table for four at ten pm in dodge park',
 'label': 0,
 'tokens': ['i',
  'need',
  'a',
  'table',
  'for',
  'four',
  'at',
  'ten',
  'pm',
  'in',
  'dodge',
  'park'],
 'token_label': [54, 54, 54, 54, 54, 49, 54, 5, 50, 54, 57, 24],
 'input_ids': [101,
  1045,
  2342,
  1037,
  2795,
  2005,
  2176,
  2012,
  2702,
  7610,
  1999,
  11898,
  2380,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'labels': [-100,
  54,
  54,
  54,
  54,
  54,
  49,
  54,
  5,
  50,
  54,
  57,
  24,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100]}

In [96]:
token_clf_token_snips['train'] = token_clf_token_snips['train'].remove_columns(['utterance', 'label', 'tokens' , 'token_label'])
token_clf_token_snips['test'] = token_clf_token_snips['test'].remove_columns(['utterance', 'label', 'tokens' , 'token_label'])
token_clf_token_snips

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 10467
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2617
    })
})

In [97]:
token_data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer)

In [101]:
token_clf_model=DistilBertForTokenClassification.from_pretrained('distilbert-base-uncased',num_labels=len(unique_token_labels))
token_clf_model.config.id2label={i: l for i,l in enumerate(unique_token_labels)}

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [102]:
token_clf_model.config.id2label[0]

'I-object_part_of_series_type'

In [103]:
epochs = 2
warmup_steps = len(token_clf_token_snips['train']) // 5
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

training_arguments = TrainingArguments(
    output_dir='./snips_token_clf/results',
    num_train_epochs=epochs,
    per_device_eval_batch_size=32,
    per_device_train_batch_size=32,
    load_best_model_at_end=True,
    warmup_steps=warmup_steps,
    weight_decay=0.05,
    logging_steps=1,
    log_level="info",
    eval_strategy='epoch',
    save_strategy='epoch',
    no_cuda=True
)

token_clf_model.to(device)
trainer=Trainer(model=token_clf_model,args=training_arguments,train_dataset=token_clf_token_snips['train'],eval_dataset=token_clf_token_snips['test'],data_collator=token_data_collator)



In [104]:
trainer.evaluate()


***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
100%|██████████| 82/82 [00:55<00:00,  1.47it/s]


{'eval_loss': 4.29494047164917,
 'eval_model_preparation_time': 0.0015,
 'eval_runtime': 56.119,
 'eval_samples_per_second': 46.633,
 'eval_steps_per_second': 1.461}

In [105]:
trainer.train()

***** Running training *****
  Num examples = 10,467
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 656
  Number of trainable parameters = 66,418,248
  0%|          | 1/656 [00:02<31:26,  2.88s/it]

{'loss': 4.3009, 'grad_norm': 5.085914611816406, 'learning_rate': 2.3889154323936934e-08, 'epoch': 0.0}


  0%|          | 2/656 [00:05<30:57,  2.84s/it]

{'loss': 4.3093, 'grad_norm': 4.624221324920654, 'learning_rate': 4.777830864787387e-08, 'epoch': 0.01}


  0%|          | 3/656 [00:08<30:17,  2.78s/it]

{'loss': 4.3025, 'grad_norm': 5.356290340423584, 'learning_rate': 7.16674629718108e-08, 'epoch': 0.01}


  1%|          | 4/656 [00:11<31:10,  2.87s/it]

{'loss': 4.2801, 'grad_norm': 5.076109886169434, 'learning_rate': 9.555661729574773e-08, 'epoch': 0.01}


  1%|          | 5/656 [00:13<29:42,  2.74s/it]

{'loss': 4.2974, 'grad_norm': 4.7469563484191895, 'learning_rate': 1.1944577161968468e-07, 'epoch': 0.02}


  1%|          | 6/656 [00:17<30:57,  2.86s/it]

{'loss': 4.2807, 'grad_norm': 4.908218860626221, 'learning_rate': 1.433349259436216e-07, 'epoch': 0.02}


  1%|          | 7/656 [00:20<31:26,  2.91s/it]

{'loss': 4.2959, 'grad_norm': 4.763194561004639, 'learning_rate': 1.6722408026755853e-07, 'epoch': 0.02}


  1%|          | 8/656 [00:22<30:15,  2.80s/it]

{'loss': 4.3066, 'grad_norm': 5.079795837402344, 'learning_rate': 1.9111323459149547e-07, 'epoch': 0.02}


  1%|▏         | 9/656 [00:25<29:08,  2.70s/it]

{'loss': 4.2968, 'grad_norm': 5.289200782775879, 'learning_rate': 2.150023889154324e-07, 'epoch': 0.03}


  2%|▏         | 10/656 [00:27<29:39,  2.75s/it]

{'loss': 4.2838, 'grad_norm': 4.62969446182251, 'learning_rate': 2.3889154323936937e-07, 'epoch': 0.03}


  2%|▏         | 11/656 [00:30<29:38,  2.76s/it]

{'loss': 4.2988, 'grad_norm': 5.266684532165527, 'learning_rate': 2.6278069756330625e-07, 'epoch': 0.03}


  2%|▏         | 12/656 [00:33<29:47,  2.78s/it]

{'loss': 4.2879, 'grad_norm': 5.258734703063965, 'learning_rate': 2.866698518872432e-07, 'epoch': 0.04}


  2%|▏         | 13/656 [00:36<30:31,  2.85s/it]

{'loss': 4.2933, 'grad_norm': 5.0546650886535645, 'learning_rate': 3.1055900621118013e-07, 'epoch': 0.04}


  2%|▏         | 14/656 [00:39<30:29,  2.85s/it]

{'loss': 4.2554, 'grad_norm': 4.494696140289307, 'learning_rate': 3.3444816053511706e-07, 'epoch': 0.04}


  2%|▏         | 15/656 [00:42<30:25,  2.85s/it]

{'loss': 4.291, 'grad_norm': 4.643035888671875, 'learning_rate': 3.58337314859054e-07, 'epoch': 0.05}


  2%|▏         | 16/656 [00:45<30:22,  2.85s/it]

{'loss': 4.2878, 'grad_norm': 4.4958295822143555, 'learning_rate': 3.8222646918299094e-07, 'epoch': 0.05}


  3%|▎         | 17/656 [00:48<31:10,  2.93s/it]

{'loss': 4.2999, 'grad_norm': 4.859738826751709, 'learning_rate': 4.0611562350692793e-07, 'epoch': 0.05}


  3%|▎         | 18/656 [00:50<30:33,  2.87s/it]

{'loss': 4.3123, 'grad_norm': 4.507328510284424, 'learning_rate': 4.300047778308648e-07, 'epoch': 0.05}


  3%|▎         | 19/656 [00:54<31:35,  2.98s/it]

{'loss': 4.2911, 'grad_norm': 4.910254955291748, 'learning_rate': 4.5389393215480175e-07, 'epoch': 0.06}


  3%|▎         | 20/656 [00:56<31:02,  2.93s/it]

{'loss': 4.2726, 'grad_norm': 5.0928120613098145, 'learning_rate': 4.777830864787387e-07, 'epoch': 0.06}


  3%|▎         | 21/656 [00:59<30:22,  2.87s/it]

{'loss': 4.2665, 'grad_norm': 4.064023494720459, 'learning_rate': 5.016722408026756e-07, 'epoch': 0.06}


  3%|▎         | 22/656 [01:02<29:56,  2.83s/it]

{'loss': 4.293, 'grad_norm': 4.51459264755249, 'learning_rate': 5.255613951266125e-07, 'epoch': 0.07}


  4%|▎         | 23/656 [01:05<29:27,  2.79s/it]

{'loss': 4.2617, 'grad_norm': 4.8841423988342285, 'learning_rate': 5.494505494505495e-07, 'epoch': 0.07}


  4%|▎         | 24/656 [01:07<29:09,  2.77s/it]

{'loss': 4.2779, 'grad_norm': 4.711093902587891, 'learning_rate': 5.733397037744864e-07, 'epoch': 0.07}


  4%|▍         | 25/656 [01:11<32:06,  3.05s/it]

{'loss': 4.2764, 'grad_norm': 4.795802116394043, 'learning_rate': 5.972288580984234e-07, 'epoch': 0.08}


  4%|▍         | 26/656 [01:15<34:49,  3.32s/it]

{'loss': 4.2492, 'grad_norm': 4.8032355308532715, 'learning_rate': 6.211180124223603e-07, 'epoch': 0.08}


  4%|▍         | 27/656 [01:18<34:45,  3.32s/it]

{'loss': 4.2492, 'grad_norm': 4.5623040199279785, 'learning_rate': 6.450071667462972e-07, 'epoch': 0.08}


  4%|▍         | 28/656 [01:21<34:05,  3.26s/it]

{'loss': 4.2319, 'grad_norm': 4.647701740264893, 'learning_rate': 6.688963210702341e-07, 'epoch': 0.09}


  4%|▍         | 29/656 [01:25<33:41,  3.22s/it]

{'loss': 4.2422, 'grad_norm': 4.898726940155029, 'learning_rate': 6.92785475394171e-07, 'epoch': 0.09}


  5%|▍         | 30/656 [01:28<35:23,  3.39s/it]

{'loss': 4.243, 'grad_norm': 4.968744277954102, 'learning_rate': 7.16674629718108e-07, 'epoch': 0.09}


  5%|▍         | 31/656 [01:32<36:29,  3.50s/it]

{'loss': 4.2339, 'grad_norm': 4.446929454803467, 'learning_rate': 7.405637840420449e-07, 'epoch': 0.09}


  5%|▍         | 32/656 [01:36<36:07,  3.47s/it]

{'loss': 4.2381, 'grad_norm': 5.318870544433594, 'learning_rate': 7.644529383659819e-07, 'epoch': 0.1}


  5%|▌         | 33/656 [01:38<33:26,  3.22s/it]

{'loss': 4.237, 'grad_norm': 4.6315484046936035, 'learning_rate': 7.883420926899189e-07, 'epoch': 0.1}


  5%|▌         | 34/656 [01:40<30:25,  2.93s/it]

{'loss': 4.2346, 'grad_norm': 4.546346187591553, 'learning_rate': 8.122312470138559e-07, 'epoch': 0.1}


  5%|▌         | 35/656 [01:43<28:21,  2.74s/it]

{'loss': 4.2147, 'grad_norm': 4.812370300292969, 'learning_rate': 8.361204013377926e-07, 'epoch': 0.11}


  5%|▌         | 36/656 [01:45<27:05,  2.62s/it]

{'loss': 4.2566, 'grad_norm': 4.85140323638916, 'learning_rate': 8.600095556617296e-07, 'epoch': 0.11}


  6%|▌         | 37/656 [01:48<26:36,  2.58s/it]

{'loss': 4.2371, 'grad_norm': 5.165072917938232, 'learning_rate': 8.838987099856666e-07, 'epoch': 0.11}


  6%|▌         | 38/656 [01:51<27:49,  2.70s/it]

{'loss': 4.2419, 'grad_norm': 4.80017614364624, 'learning_rate': 9.077878643096035e-07, 'epoch': 0.12}


  6%|▌         | 39/656 [01:53<27:22,  2.66s/it]

{'loss': 4.1909, 'grad_norm': 4.724559307098389, 'learning_rate': 9.316770186335405e-07, 'epoch': 0.12}


  6%|▌         | 40/656 [01:56<26:38,  2.60s/it]

{'loss': 4.2192, 'grad_norm': 5.126741409301758, 'learning_rate': 9.555661729574775e-07, 'epoch': 0.12}


  6%|▋         | 41/656 [01:58<25:56,  2.53s/it]

{'loss': 4.2597, 'grad_norm': 4.453256607055664, 'learning_rate': 9.794553272814141e-07, 'epoch': 0.12}


  6%|▋         | 42/656 [02:00<25:32,  2.50s/it]

{'loss': 4.1759, 'grad_norm': 4.597639560699463, 'learning_rate': 1.0033444816053512e-06, 'epoch': 0.13}


  7%|▋         | 43/656 [02:03<25:31,  2.50s/it]

{'loss': 4.2155, 'grad_norm': 4.430399417877197, 'learning_rate': 1.0272336359292883e-06, 'epoch': 0.13}


  7%|▋         | 44/656 [02:05<25:39,  2.52s/it]

{'loss': 4.1916, 'grad_norm': 4.711440563201904, 'learning_rate': 1.051122790253225e-06, 'epoch': 0.13}


  7%|▋         | 45/656 [02:08<25:08,  2.47s/it]

{'loss': 4.2153, 'grad_norm': 4.618434906005859, 'learning_rate': 1.0750119445771621e-06, 'epoch': 0.14}


  7%|▋         | 46/656 [02:10<25:06,  2.47s/it]

{'loss': 4.1456, 'grad_norm': 4.918545722961426, 'learning_rate': 1.098901098901099e-06, 'epoch': 0.14}


  7%|▋         | 47/656 [02:13<24:45,  2.44s/it]

{'loss': 4.1707, 'grad_norm': 4.732576370239258, 'learning_rate': 1.1227902532250359e-06, 'epoch': 0.14}


  7%|▋         | 48/656 [02:15<25:52,  2.55s/it]

{'loss': 4.1475, 'grad_norm': 5.022714138031006, 'learning_rate': 1.1466794075489728e-06, 'epoch': 0.15}


  7%|▋         | 49/656 [02:18<27:09,  2.68s/it]

{'loss': 4.1768, 'grad_norm': 4.313758373260498, 'learning_rate': 1.1705685618729096e-06, 'epoch': 0.15}


  8%|▊         | 50/656 [02:22<29:44,  2.94s/it]

{'loss': 4.1714, 'grad_norm': 5.153716564178467, 'learning_rate': 1.1944577161968467e-06, 'epoch': 0.15}


  8%|▊         | 51/656 [02:25<28:42,  2.85s/it]

{'loss': 4.1528, 'grad_norm': 4.630426406860352, 'learning_rate': 1.2183468705207836e-06, 'epoch': 0.16}


  8%|▊         | 52/656 [02:27<27:48,  2.76s/it]

{'loss': 4.1657, 'grad_norm': 4.660095691680908, 'learning_rate': 1.2422360248447205e-06, 'epoch': 0.16}


  8%|▊         | 53/656 [02:30<26:50,  2.67s/it]

{'loss': 4.1393, 'grad_norm': 4.940135955810547, 'learning_rate': 1.2661251791686574e-06, 'epoch': 0.16}


  8%|▊         | 54/656 [02:32<26:05,  2.60s/it]

{'loss': 4.1224, 'grad_norm': 4.43679141998291, 'learning_rate': 1.2900143334925945e-06, 'epoch': 0.16}


  8%|▊         | 55/656 [02:35<25:44,  2.57s/it]

{'loss': 4.127, 'grad_norm': 4.890179634094238, 'learning_rate': 1.3139034878165314e-06, 'epoch': 0.17}


  9%|▊         | 56/656 [02:37<25:21,  2.54s/it]

{'loss': 4.1221, 'grad_norm': 4.903988361358643, 'learning_rate': 1.3377926421404683e-06, 'epoch': 0.17}


  9%|▊         | 57/656 [02:40<25:24,  2.54s/it]

{'loss': 4.1076, 'grad_norm': 4.952454566955566, 'learning_rate': 1.3616817964644054e-06, 'epoch': 0.17}


  9%|▉         | 58/656 [02:42<25:35,  2.57s/it]

{'loss': 4.098, 'grad_norm': 4.82053804397583, 'learning_rate': 1.385570950788342e-06, 'epoch': 0.18}


  9%|▉         | 59/656 [02:45<25:12,  2.53s/it]

{'loss': 4.1012, 'grad_norm': 4.838386058807373, 'learning_rate': 1.4094601051122791e-06, 'epoch': 0.18}


  9%|▉         | 60/656 [02:47<24:33,  2.47s/it]

{'loss': 4.0918, 'grad_norm': 4.92332124710083, 'learning_rate': 1.433349259436216e-06, 'epoch': 0.18}


  9%|▉         | 61/656 [02:49<24:06,  2.43s/it]

{'loss': 4.0455, 'grad_norm': 5.080505847930908, 'learning_rate': 1.4572384137601529e-06, 'epoch': 0.19}


  9%|▉         | 62/656 [02:52<23:38,  2.39s/it]

{'loss': 4.0856, 'grad_norm': 4.9347004890441895, 'learning_rate': 1.4811275680840898e-06, 'epoch': 0.19}


 10%|▉         | 63/656 [02:54<24:28,  2.48s/it]

{'loss': 4.0555, 'grad_norm': 5.162052154541016, 'learning_rate': 1.5050167224080269e-06, 'epoch': 0.19}


 10%|▉         | 64/656 [02:57<24:57,  2.53s/it]

{'loss': 4.1016, 'grad_norm': 4.324662208557129, 'learning_rate': 1.5289058767319638e-06, 'epoch': 0.2}


 10%|▉         | 65/656 [03:00<25:06,  2.55s/it]

{'loss': 4.0632, 'grad_norm': 4.82887601852417, 'learning_rate': 1.5527950310559006e-06, 'epoch': 0.2}


 10%|█         | 66/656 [03:02<24:59,  2.54s/it]

{'loss': 4.0388, 'grad_norm': 4.867741584777832, 'learning_rate': 1.5766841853798377e-06, 'epoch': 0.2}


 10%|█         | 67/656 [03:04<24:23,  2.48s/it]

{'loss': 4.0451, 'grad_norm': 4.745523452758789, 'learning_rate': 1.6005733397037744e-06, 'epoch': 0.2}


 10%|█         | 68/656 [03:07<23:49,  2.43s/it]

{'loss': 4.0202, 'grad_norm': 5.213283538818359, 'learning_rate': 1.6244624940277117e-06, 'epoch': 0.21}


 11%|█         | 69/656 [03:09<23:29,  2.40s/it]

{'loss': 4.0118, 'grad_norm': 5.077054977416992, 'learning_rate': 1.6483516483516484e-06, 'epoch': 0.21}


 11%|█         | 70/656 [03:12<25:31,  2.61s/it]

{'loss': 4.0569, 'grad_norm': 4.373050689697266, 'learning_rate': 1.6722408026755853e-06, 'epoch': 0.21}


 11%|█         | 71/656 [03:15<27:33,  2.83s/it]

{'loss': 4.0362, 'grad_norm': 4.745738506317139, 'learning_rate': 1.6961299569995224e-06, 'epoch': 0.22}


 11%|█         | 72/656 [03:19<28:25,  2.92s/it]

{'loss': 3.978, 'grad_norm': 5.127805709838867, 'learning_rate': 1.7200191113234592e-06, 'epoch': 0.22}


 11%|█         | 73/656 [03:22<28:19,  2.92s/it]

{'loss': 3.9956, 'grad_norm': 4.803183078765869, 'learning_rate': 1.7439082656473961e-06, 'epoch': 0.22}


 11%|█▏        | 74/656 [03:24<27:53,  2.88s/it]

{'loss': 3.9984, 'grad_norm': 4.87138032913208, 'learning_rate': 1.7677974199713332e-06, 'epoch': 0.23}


 11%|█▏        | 75/656 [03:27<27:02,  2.79s/it]

{'loss': 4.0027, 'grad_norm': 4.823363304138184, 'learning_rate': 1.7916865742952701e-06, 'epoch': 0.23}


 12%|█▏        | 76/656 [03:30<27:39,  2.86s/it]

{'loss': 3.994, 'grad_norm': 4.614816665649414, 'learning_rate': 1.815575728619207e-06, 'epoch': 0.23}


 12%|█▏        | 77/656 [03:33<29:16,  3.03s/it]

{'loss': 3.9734, 'grad_norm': 4.612666606903076, 'learning_rate': 1.839464882943144e-06, 'epoch': 0.23}


 12%|█▏        | 78/656 [03:36<29:01,  3.01s/it]

{'loss': 3.9165, 'grad_norm': 5.148777484893799, 'learning_rate': 1.863354037267081e-06, 'epoch': 0.24}


 12%|█▏        | 79/656 [03:39<28:34,  2.97s/it]

{'loss': 3.9748, 'grad_norm': 4.655776500701904, 'learning_rate': 1.8872431915910176e-06, 'epoch': 0.24}


 12%|█▏        | 80/656 [03:42<28:32,  2.97s/it]

{'loss': 3.9752, 'grad_norm': 4.3036980628967285, 'learning_rate': 1.911132345914955e-06, 'epoch': 0.24}


 12%|█▏        | 81/656 [03:45<28:04,  2.93s/it]

{'loss': 3.9224, 'grad_norm': 5.016764163970947, 'learning_rate': 1.935021500238892e-06, 'epoch': 0.25}


 12%|█▎        | 82/656 [03:48<28:07,  2.94s/it]

{'loss': 3.8558, 'grad_norm': 5.551757335662842, 'learning_rate': 1.9589106545628283e-06, 'epoch': 0.25}


 13%|█▎        | 83/656 [03:51<27:46,  2.91s/it]

{'loss': 3.9264, 'grad_norm': 4.8857102394104, 'learning_rate': 1.9827998088867656e-06, 'epoch': 0.25}


 13%|█▎        | 84/656 [03:54<27:52,  2.92s/it]

{'loss': 3.9074, 'grad_norm': 5.197556972503662, 'learning_rate': 2.0066889632107025e-06, 'epoch': 0.26}


 13%|█▎        | 85/656 [03:57<27:34,  2.90s/it]

{'loss': 3.8644, 'grad_norm': 5.115663528442383, 'learning_rate': 2.0305781175346394e-06, 'epoch': 0.26}


 13%|█▎        | 86/656 [04:00<27:47,  2.93s/it]

{'loss': 3.8826, 'grad_norm': 5.056106090545654, 'learning_rate': 2.0544672718585767e-06, 'epoch': 0.26}


 13%|█▎        | 87/656 [04:02<27:24,  2.89s/it]

{'loss': 3.8241, 'grad_norm': 5.586409091949463, 'learning_rate': 2.078356426182513e-06, 'epoch': 0.27}


 13%|█▎        | 88/656 [04:05<27:16,  2.88s/it]

{'loss': 3.8101, 'grad_norm': 5.706971645355225, 'learning_rate': 2.10224558050645e-06, 'epoch': 0.27}


 14%|█▎        | 89/656 [04:08<27:33,  2.92s/it]

{'loss': 3.8198, 'grad_norm': 5.430663108825684, 'learning_rate': 2.1261347348303873e-06, 'epoch': 0.27}


 14%|█▎        | 90/656 [04:11<27:19,  2.90s/it]

{'loss': 3.8143, 'grad_norm': 5.247541427612305, 'learning_rate': 2.1500238891543242e-06, 'epoch': 0.27}


 14%|█▍        | 91/656 [04:14<27:22,  2.91s/it]

{'loss': 3.778, 'grad_norm': 5.629462718963623, 'learning_rate': 2.173913043478261e-06, 'epoch': 0.28}


 14%|█▍        | 92/656 [04:17<27:33,  2.93s/it]

{'loss': 3.8313, 'grad_norm': 4.945455551147461, 'learning_rate': 2.197802197802198e-06, 'epoch': 0.28}


 14%|█▍        | 93/656 [04:20<27:38,  2.95s/it]

{'loss': 3.8167, 'grad_norm': 5.16379976272583, 'learning_rate': 2.221691352126135e-06, 'epoch': 0.28}


 14%|█▍        | 94/656 [04:23<27:19,  2.92s/it]

{'loss': 3.7859, 'grad_norm': 5.143908500671387, 'learning_rate': 2.2455805064500718e-06, 'epoch': 0.29}


 14%|█▍        | 95/656 [04:26<27:23,  2.93s/it]

{'loss': 3.7484, 'grad_norm': 5.394815921783447, 'learning_rate': 2.269469660774009e-06, 'epoch': 0.29}


 15%|█▍        | 96/656 [04:29<27:18,  2.93s/it]

{'loss': 3.6979, 'grad_norm': 6.000935077667236, 'learning_rate': 2.2933588150979455e-06, 'epoch': 0.29}


 15%|█▍        | 97/656 [04:32<26:52,  2.88s/it]

{'loss': 3.7657, 'grad_norm': 5.166909694671631, 'learning_rate': 2.3172479694218824e-06, 'epoch': 0.3}


 15%|█▍        | 98/656 [04:34<25:09,  2.70s/it]

{'loss': 3.6552, 'grad_norm': 6.203434944152832, 'learning_rate': 2.3411371237458193e-06, 'epoch': 0.3}


 15%|█▌        | 99/656 [04:36<23:39,  2.55s/it]

{'loss': 3.6345, 'grad_norm': 6.12365198135376, 'learning_rate': 2.3650262780697566e-06, 'epoch': 0.3}


 15%|█▌        | 100/656 [04:38<22:33,  2.44s/it]

{'loss': 3.6573, 'grad_norm': 6.055201053619385, 'learning_rate': 2.3889154323936935e-06, 'epoch': 0.3}


 15%|█▌        | 101/656 [04:40<21:49,  2.36s/it]

{'loss': 3.6593, 'grad_norm': 6.113765239715576, 'learning_rate': 2.41280458671763e-06, 'epoch': 0.31}


 16%|█▌        | 102/656 [04:43<21:17,  2.31s/it]

{'loss': 3.537, 'grad_norm': 7.083325386047363, 'learning_rate': 2.4366937410415673e-06, 'epoch': 0.31}


 16%|█▌        | 103/656 [04:45<20:52,  2.27s/it]

{'loss': 3.5925, 'grad_norm': 6.540341854095459, 'learning_rate': 2.460582895365504e-06, 'epoch': 0.31}


 16%|█▌        | 104/656 [04:47<20:34,  2.24s/it]

{'loss': 3.5792, 'grad_norm': 6.306625843048096, 'learning_rate': 2.484472049689441e-06, 'epoch': 0.32}


 16%|█▌        | 105/656 [04:49<20:21,  2.22s/it]

{'loss': 3.5632, 'grad_norm': 6.286392688751221, 'learning_rate': 2.508361204013378e-06, 'epoch': 0.32}


 16%|█▌        | 106/656 [04:52<22:02,  2.40s/it]

{'loss': 3.5799, 'grad_norm': 6.087452411651611, 'learning_rate': 2.5322503583373148e-06, 'epoch': 0.32}


 16%|█▋        | 107/656 [04:55<24:09,  2.64s/it]

{'loss': 3.4445, 'grad_norm': 7.187560081481934, 'learning_rate': 2.5561395126612517e-06, 'epoch': 0.33}


 16%|█▋        | 108/656 [04:58<23:45,  2.60s/it]

{'loss': 3.5971, 'grad_norm': 5.919460296630859, 'learning_rate': 2.580028666985189e-06, 'epoch': 0.33}


 17%|█▋        | 109/656 [05:00<22:49,  2.50s/it]

{'loss': 3.4123, 'grad_norm': 7.376334190368652, 'learning_rate': 2.603917821309126e-06, 'epoch': 0.33}


 17%|█▋        | 110/656 [05:02<22:03,  2.42s/it]

{'loss': 3.475, 'grad_norm': 6.683959484100342, 'learning_rate': 2.6278069756330627e-06, 'epoch': 0.34}


 17%|█▋        | 111/656 [05:04<21:31,  2.37s/it]

{'loss': 3.4076, 'grad_norm': 7.113531589508057, 'learning_rate': 2.6516961299569996e-06, 'epoch': 0.34}


 17%|█▋        | 112/656 [05:07<20:56,  2.31s/it]

{'loss': 3.5082, 'grad_norm': 6.187674522399902, 'learning_rate': 2.6755852842809365e-06, 'epoch': 0.34}


 17%|█▋        | 113/656 [05:09<20:32,  2.27s/it]

{'loss': 3.4255, 'grad_norm': 6.576614856719971, 'learning_rate': 2.6994744386048734e-06, 'epoch': 0.34}


 17%|█▋        | 114/656 [05:11<20:21,  2.25s/it]

{'loss': 3.2437, 'grad_norm': 8.089536666870117, 'learning_rate': 2.7233635929288107e-06, 'epoch': 0.35}


 18%|█▊        | 115/656 [05:13<20:15,  2.25s/it]

{'loss': 3.4729, 'grad_norm': 6.027083396911621, 'learning_rate': 2.747252747252747e-06, 'epoch': 0.35}


 18%|█▊        | 116/656 [05:15<20:10,  2.24s/it]

{'loss': 3.3203, 'grad_norm': 6.791558742523193, 'learning_rate': 2.771141901576684e-06, 'epoch': 0.35}


 18%|█▊        | 117/656 [05:18<20:01,  2.23s/it]

{'loss': 3.2984, 'grad_norm': 6.9408979415893555, 'learning_rate': 2.7950310559006214e-06, 'epoch': 0.36}


 18%|█▊        | 118/656 [05:20<21:33,  2.40s/it]

{'loss': 3.2175, 'grad_norm': 7.48325252532959, 'learning_rate': 2.8189202102245582e-06, 'epoch': 0.36}


 18%|█▊        | 119/656 [05:24<23:34,  2.63s/it]

{'loss': 3.2401, 'grad_norm': 6.875918388366699, 'learning_rate': 2.842809364548495e-06, 'epoch': 0.36}


 18%|█▊        | 120/656 [05:27<25:26,  2.85s/it]

{'loss': 3.3321, 'grad_norm': 6.323623180389404, 'learning_rate': 2.866698518872432e-06, 'epoch': 0.37}


 18%|█▊        | 121/656 [05:29<24:28,  2.75s/it]

{'loss': 3.4315, 'grad_norm': 5.5301923751831055, 'learning_rate': 2.890587673196369e-06, 'epoch': 0.37}


 19%|█▊        | 122/656 [05:32<23:48,  2.68s/it]

{'loss': 3.219, 'grad_norm': 6.508051872253418, 'learning_rate': 2.9144768275203058e-06, 'epoch': 0.37}


 19%|█▉        | 123/656 [05:35<24:38,  2.77s/it]

{'loss': 3.1085, 'grad_norm': 6.794110298156738, 'learning_rate': 2.938365981844243e-06, 'epoch': 0.38}


 19%|█▉        | 124/656 [05:38<24:32,  2.77s/it]

{'loss': 3.1367, 'grad_norm': 6.479499340057373, 'learning_rate': 2.9622551361681795e-06, 'epoch': 0.38}


 19%|█▉        | 125/656 [05:40<23:13,  2.63s/it]

{'loss': 3.0445, 'grad_norm': 6.622122764587402, 'learning_rate': 2.9861442904921164e-06, 'epoch': 0.38}


 19%|█▉        | 126/656 [05:43<23:54,  2.71s/it]

{'loss': 3.053, 'grad_norm': 6.134063720703125, 'learning_rate': 3.0100334448160537e-06, 'epoch': 0.38}


 19%|█▉        | 127/656 [05:45<22:31,  2.56s/it]

{'loss': 2.9872, 'grad_norm': 6.347332954406738, 'learning_rate': 3.0339225991399906e-06, 'epoch': 0.39}


 20%|█▉        | 128/656 [05:48<22:17,  2.53s/it]

{'loss': 3.0621, 'grad_norm': 5.538850784301758, 'learning_rate': 3.0578117534639275e-06, 'epoch': 0.39}


 20%|█▉        | 129/656 [05:50<21:25,  2.44s/it]

{'loss': 3.0846, 'grad_norm': 5.22988748550415, 'learning_rate': 3.0817009077878644e-06, 'epoch': 0.39}


 20%|█▉        | 130/656 [05:52<21:12,  2.42s/it]

{'loss': 3.076, 'grad_norm': 4.763619899749756, 'learning_rate': 3.1055900621118013e-06, 'epoch': 0.4}


 20%|█▉        | 131/656 [05:55<21:08,  2.42s/it]

{'loss': 2.9891, 'grad_norm': 5.2704973220825195, 'learning_rate': 3.1294792164357386e-06, 'epoch': 0.4}


 20%|██        | 132/656 [05:57<22:03,  2.52s/it]

{'loss': 2.9855, 'grad_norm': 4.799814701080322, 'learning_rate': 3.1533683707596755e-06, 'epoch': 0.4}


 20%|██        | 133/656 [06:00<23:08,  2.66s/it]

{'loss': 2.8735, 'grad_norm': 4.72919225692749, 'learning_rate': 3.1772575250836123e-06, 'epoch': 0.41}


 20%|██        | 134/656 [06:03<24:19,  2.80s/it]

{'loss': 2.7039, 'grad_norm': 5.309356689453125, 'learning_rate': 3.201146679407549e-06, 'epoch': 0.41}


 21%|██        | 135/656 [06:06<24:47,  2.85s/it]

{'loss': 2.7959, 'grad_norm': 4.505966663360596, 'learning_rate': 3.2250358337314857e-06, 'epoch': 0.41}


 21%|██        | 136/656 [06:09<24:40,  2.85s/it]

{'loss': 2.8459, 'grad_norm': 3.9600510597229004, 'learning_rate': 3.2489249880554234e-06, 'epoch': 0.41}


 21%|██        | 137/656 [06:11<23:04,  2.67s/it]

{'loss': 2.9944, 'grad_norm': 3.4752390384674072, 'learning_rate': 3.2728141423793603e-06, 'epoch': 0.42}


 21%|██        | 138/656 [06:14<21:50,  2.53s/it]

{'loss': 2.8398, 'grad_norm': 3.3310108184814453, 'learning_rate': 3.2967032967032968e-06, 'epoch': 0.42}


 21%|██        | 139/656 [06:16<21:06,  2.45s/it]

{'loss': 2.7077, 'grad_norm': 3.7263023853302, 'learning_rate': 3.3205924510272337e-06, 'epoch': 0.42}


 21%|██▏       | 140/656 [06:18<20:32,  2.39s/it]

{'loss': 2.8066, 'grad_norm': 3.2338709831237793, 'learning_rate': 3.3444816053511705e-06, 'epoch': 0.43}


 21%|██▏       | 141/656 [06:20<20:06,  2.34s/it]

{'loss': 2.9359, 'grad_norm': 2.7221386432647705, 'learning_rate': 3.3683707596751074e-06, 'epoch': 0.43}


 22%|██▏       | 142/656 [06:23<19:47,  2.31s/it]

{'loss': 2.5195, 'grad_norm': 3.5991272926330566, 'learning_rate': 3.3922599139990447e-06, 'epoch': 0.43}


 22%|██▏       | 143/656 [06:25<19:37,  2.30s/it]

{'loss': 2.7699, 'grad_norm': 2.6367135047912598, 'learning_rate': 3.4161490683229816e-06, 'epoch': 0.44}


 22%|██▏       | 144/656 [06:27<19:26,  2.28s/it]

{'loss': 2.6634, 'grad_norm': 2.5648105144500732, 'learning_rate': 3.4400382226469185e-06, 'epoch': 0.44}


 22%|██▏       | 145/656 [06:29<19:15,  2.26s/it]

{'loss': 2.7268, 'grad_norm': 2.3435940742492676, 'learning_rate': 3.4639273769708554e-06, 'epoch': 0.44}


 22%|██▏       | 146/656 [06:32<19:08,  2.25s/it]

{'loss': 2.6266, 'grad_norm': 2.631027936935425, 'learning_rate': 3.4878165312947923e-06, 'epoch': 0.45}


 22%|██▏       | 147/656 [06:34<19:14,  2.27s/it]

{'loss': 2.7043, 'grad_norm': 2.6260290145874023, 'learning_rate': 3.511705685618729e-06, 'epoch': 0.45}


 23%|██▎       | 148/656 [06:36<19:00,  2.24s/it]

{'loss': 2.514, 'grad_norm': 2.3232638835906982, 'learning_rate': 3.5355948399426665e-06, 'epoch': 0.45}


 23%|██▎       | 149/656 [06:38<18:54,  2.24s/it]

{'loss': 2.6349, 'grad_norm': 2.17499041557312, 'learning_rate': 3.5594839942666033e-06, 'epoch': 0.45}


 23%|██▎       | 150/656 [06:41<19:10,  2.27s/it]

{'loss': 2.5274, 'grad_norm': 2.2698118686676025, 'learning_rate': 3.5833731485905402e-06, 'epoch': 0.46}


 23%|██▎       | 151/656 [06:43<20:13,  2.40s/it]

{'loss': 2.4966, 'grad_norm': 2.449148178100586, 'learning_rate': 3.607262302914477e-06, 'epoch': 0.46}


 23%|██▎       | 152/656 [06:46<21:26,  2.55s/it]

{'loss': 2.5971, 'grad_norm': 2.140355110168457, 'learning_rate': 3.631151457238414e-06, 'epoch': 0.46}


 23%|██▎       | 153/656 [06:49<21:03,  2.51s/it]

{'loss': 2.5986, 'grad_norm': 2.0919768810272217, 'learning_rate': 3.6550406115623505e-06, 'epoch': 0.47}


 23%|██▎       | 154/656 [06:51<20:33,  2.46s/it]

{'loss': 2.4611, 'grad_norm': 2.1383213996887207, 'learning_rate': 3.678929765886288e-06, 'epoch': 0.47}


 24%|██▎       | 155/656 [06:55<23:05,  2.76s/it]

{'loss': 2.5735, 'grad_norm': 2.222543478012085, 'learning_rate': 3.702818920210225e-06, 'epoch': 0.47}


 24%|██▍       | 156/656 [06:57<23:13,  2.79s/it]

{'loss': 2.4519, 'grad_norm': 2.3346168994903564, 'learning_rate': 3.726708074534162e-06, 'epoch': 0.48}


 24%|██▍       | 157/656 [07:00<23:57,  2.88s/it]

{'loss': 2.6168, 'grad_norm': 2.259561777114868, 'learning_rate': 3.7505972288580984e-06, 'epoch': 0.48}


 24%|██▍       | 158/656 [07:03<22:59,  2.77s/it]

{'loss': 2.6431, 'grad_norm': 2.1580026149749756, 'learning_rate': 3.7744863831820353e-06, 'epoch': 0.48}


 24%|██▍       | 159/656 [07:05<22:16,  2.69s/it]

{'loss': 2.4183, 'grad_norm': 2.3900012969970703, 'learning_rate': 3.798375537505972e-06, 'epoch': 0.48}


 24%|██▍       | 160/656 [07:08<21:58,  2.66s/it]

{'loss': 2.3877, 'grad_norm': 2.848003625869751, 'learning_rate': 3.82226469182991e-06, 'epoch': 0.49}


 25%|██▍       | 161/656 [07:10<21:15,  2.58s/it]

{'loss': 2.6532, 'grad_norm': 2.470641851425171, 'learning_rate': 3.846153846153847e-06, 'epoch': 0.49}


 25%|██▍       | 162/656 [07:13<20:50,  2.53s/it]

{'loss': 2.5649, 'grad_norm': 2.0737383365631104, 'learning_rate': 3.870043000477784e-06, 'epoch': 0.49}


 25%|██▍       | 163/656 [07:16<21:16,  2.59s/it]

{'loss': 2.5246, 'grad_norm': 2.041661262512207, 'learning_rate': 3.8939321548017206e-06, 'epoch': 0.5}


 25%|██▌       | 164/656 [07:18<20:19,  2.48s/it]

{'loss': 2.6964, 'grad_norm': 2.0592479705810547, 'learning_rate': 3.917821309125657e-06, 'epoch': 0.5}


 25%|██▌       | 165/656 [07:20<19:40,  2.40s/it]

{'loss': 2.4667, 'grad_norm': 2.00734543800354, 'learning_rate': 3.9417104634495935e-06, 'epoch': 0.5}


 25%|██▌       | 166/656 [07:23<21:53,  2.68s/it]

{'loss': 2.6073, 'grad_norm': 2.1281936168670654, 'learning_rate': 3.965599617773531e-06, 'epoch': 0.51}


 25%|██▌       | 167/656 [07:27<22:58,  2.82s/it]

{'loss': 2.4405, 'grad_norm': 2.0737531185150146, 'learning_rate': 3.989488772097468e-06, 'epoch': 0.51}


 26%|██▌       | 168/656 [07:29<23:01,  2.83s/it]

{'loss': 2.3979, 'grad_norm': 2.214305877685547, 'learning_rate': 4.013377926421405e-06, 'epoch': 0.51}


 26%|██▌       | 169/656 [07:33<24:35,  3.03s/it]

{'loss': 2.6893, 'grad_norm': 2.112484931945801, 'learning_rate': 4.037267080745342e-06, 'epoch': 0.52}


 26%|██▌       | 170/656 [07:36<23:42,  2.93s/it]

{'loss': 2.4617, 'grad_norm': 2.060986280441284, 'learning_rate': 4.061156235069279e-06, 'epoch': 0.52}


 26%|██▌       | 171/656 [07:39<24:34,  3.04s/it]

{'loss': 2.5503, 'grad_norm': 2.165837526321411, 'learning_rate': 4.085045389393216e-06, 'epoch': 0.52}


 26%|██▌       | 172/656 [07:42<25:05,  3.11s/it]

{'loss': 2.3131, 'grad_norm': 2.247823715209961, 'learning_rate': 4.108934543717153e-06, 'epoch': 0.52}


 26%|██▋       | 173/656 [07:45<24:45,  3.08s/it]

{'loss': 2.5676, 'grad_norm': 2.099600076675415, 'learning_rate': 4.132823698041089e-06, 'epoch': 0.53}


 27%|██▋       | 174/656 [07:48<24:44,  3.08s/it]

{'loss': 2.4157, 'grad_norm': 2.1272573471069336, 'learning_rate': 4.156712852365026e-06, 'epoch': 0.53}


 27%|██▋       | 175/656 [07:51<23:55,  2.99s/it]

{'loss': 2.3407, 'grad_norm': 2.6242661476135254, 'learning_rate': 4.180602006688963e-06, 'epoch': 0.53}


 27%|██▋       | 176/656 [07:54<23:46,  2.97s/it]

{'loss': 2.5489, 'grad_norm': 2.0087625980377197, 'learning_rate': 4.2044911610129e-06, 'epoch': 0.54}


 27%|██▋       | 177/656 [07:57<23:24,  2.93s/it]

{'loss': 2.3609, 'grad_norm': 2.3083155155181885, 'learning_rate': 4.228380315336837e-06, 'epoch': 0.54}


 27%|██▋       | 178/656 [08:00<23:27,  2.94s/it]

{'loss': 2.663, 'grad_norm': 2.0165998935699463, 'learning_rate': 4.252269469660775e-06, 'epoch': 0.54}


 27%|██▋       | 179/656 [08:02<22:58,  2.89s/it]

{'loss': 2.4528, 'grad_norm': 2.0796899795532227, 'learning_rate': 4.2761586239847116e-06, 'epoch': 0.55}


 27%|██▋       | 180/656 [08:05<22:49,  2.88s/it]

{'loss': 2.2736, 'grad_norm': 2.1093461513519287, 'learning_rate': 4.3000477783086484e-06, 'epoch': 0.55}


 28%|██▊       | 181/656 [08:08<22:57,  2.90s/it]

{'loss': 2.4203, 'grad_norm': 1.8792059421539307, 'learning_rate': 4.323936932632585e-06, 'epoch': 0.55}


 28%|██▊       | 182/656 [08:11<23:05,  2.92s/it]

{'loss': 2.2451, 'grad_norm': 2.0984747409820557, 'learning_rate': 4.347826086956522e-06, 'epoch': 0.55}


 28%|██▊       | 183/656 [08:14<22:47,  2.89s/it]

{'loss': 2.393, 'grad_norm': 2.2039787769317627, 'learning_rate': 4.371715241280458e-06, 'epoch': 0.56}


 28%|██▊       | 184/656 [08:16<21:30,  2.73s/it]

{'loss': 2.2254, 'grad_norm': 1.9462273120880127, 'learning_rate': 4.395604395604396e-06, 'epoch': 0.56}


 28%|██▊       | 185/656 [08:19<20:25,  2.60s/it]

{'loss': 2.3041, 'grad_norm': 1.7911388874053955, 'learning_rate': 4.419493549928333e-06, 'epoch': 0.56}


 28%|██▊       | 186/656 [08:21<19:24,  2.48s/it]

{'loss': 2.3633, 'grad_norm': 1.893935203552246, 'learning_rate': 4.44338270425227e-06, 'epoch': 0.57}


 29%|██▊       | 187/656 [08:23<18:48,  2.41s/it]

{'loss': 2.4173, 'grad_norm': 1.8647682666778564, 'learning_rate': 4.467271858576207e-06, 'epoch': 0.57}


 29%|██▊       | 188/656 [08:25<18:14,  2.34s/it]

{'loss': 2.335, 'grad_norm': 1.7017847299575806, 'learning_rate': 4.4911610129001435e-06, 'epoch': 0.57}


 29%|██▉       | 189/656 [08:28<17:49,  2.29s/it]

{'loss': 2.1755, 'grad_norm': 1.912717342376709, 'learning_rate': 4.51505016722408e-06, 'epoch': 0.58}


 29%|██▉       | 190/656 [08:30<17:34,  2.26s/it]

{'loss': 2.4083, 'grad_norm': 1.7250080108642578, 'learning_rate': 4.538939321548018e-06, 'epoch': 0.58}


 29%|██▉       | 191/656 [08:32<17:25,  2.25s/it]

{'loss': 2.1342, 'grad_norm': 2.029205799102783, 'learning_rate': 4.562828475871954e-06, 'epoch': 0.58}


 29%|██▉       | 192/656 [08:34<17:22,  2.25s/it]

{'loss': 2.2851, 'grad_norm': 1.8430272340774536, 'learning_rate': 4.586717630195891e-06, 'epoch': 0.59}


 29%|██▉       | 193/656 [08:36<17:19,  2.24s/it]

{'loss': 2.1551, 'grad_norm': 1.925761103630066, 'learning_rate': 4.610606784519828e-06, 'epoch': 0.59}


 30%|██▉       | 194/656 [08:39<17:07,  2.23s/it]

{'loss': 2.3089, 'grad_norm': 2.320054292678833, 'learning_rate': 4.634495938843765e-06, 'epoch': 0.59}


 30%|██▉       | 195/656 [08:41<16:57,  2.21s/it]

{'loss': 2.169, 'grad_norm': 1.7041747570037842, 'learning_rate': 4.658385093167702e-06, 'epoch': 0.59}


 30%|██▉       | 196/656 [08:43<16:49,  2.20s/it]

{'loss': 2.2907, 'grad_norm': 1.8820033073425293, 'learning_rate': 4.682274247491639e-06, 'epoch': 0.6}


 30%|███       | 197/656 [08:45<16:44,  2.19s/it]

{'loss': 2.2655, 'grad_norm': 2.1297872066497803, 'learning_rate': 4.706163401815576e-06, 'epoch': 0.6}


 30%|███       | 198/656 [08:47<16:49,  2.20s/it]

{'loss': 2.2585, 'grad_norm': 1.927581548690796, 'learning_rate': 4.730052556139513e-06, 'epoch': 0.6}


 30%|███       | 199/656 [08:50<16:41,  2.19s/it]

{'loss': 2.3506, 'grad_norm': 2.3535492420196533, 'learning_rate': 4.75394171046345e-06, 'epoch': 0.61}


 30%|███       | 200/656 [08:52<16:35,  2.18s/it]

{'loss': 2.2847, 'grad_norm': 1.6797815561294556, 'learning_rate': 4.777830864787387e-06, 'epoch': 0.61}


 31%|███       | 201/656 [08:54<16:31,  2.18s/it]

{'loss': 2.1989, 'grad_norm': 2.3032376766204834, 'learning_rate': 4.801720019111324e-06, 'epoch': 0.61}


 31%|███       | 202/656 [08:56<16:32,  2.19s/it]

{'loss': 2.2141, 'grad_norm': 2.3918492794036865, 'learning_rate': 4.82560917343526e-06, 'epoch': 0.62}


 31%|███       | 203/656 [08:58<16:48,  2.23s/it]

{'loss': 2.4128, 'grad_norm': 2.2497377395629883, 'learning_rate': 4.849498327759198e-06, 'epoch': 0.62}


 31%|███       | 204/656 [09:00<15:47,  2.10s/it]

{'loss': 2.2975, 'grad_norm': 1.9698138236999512, 'learning_rate': 4.8733874820831345e-06, 'epoch': 0.62}


 31%|███▏      | 205/656 [09:02<15:54,  2.12s/it]

{'loss': 2.1539, 'grad_norm': 1.6634160280227661, 'learning_rate': 4.897276636407071e-06, 'epoch': 0.62}


 31%|███▏      | 206/656 [09:04<15:57,  2.13s/it]

{'loss': 2.0078, 'grad_norm': 3.115065813064575, 'learning_rate': 4.921165790731008e-06, 'epoch': 0.63}


 32%|███▏      | 207/656 [09:07<15:59,  2.14s/it]

{'loss': 2.222, 'grad_norm': 1.7541205883026123, 'learning_rate': 4.945054945054945e-06, 'epoch': 0.63}


 32%|███▏      | 208/656 [09:09<16:28,  2.21s/it]

{'loss': 2.1237, 'grad_norm': 2.1604177951812744, 'learning_rate': 4.968944099378882e-06, 'epoch': 0.63}


 32%|███▏      | 209/656 [09:11<16:20,  2.19s/it]

{'loss': 2.3235, 'grad_norm': 1.8813223838806152, 'learning_rate': 4.99283325370282e-06, 'epoch': 0.64}


 32%|███▏      | 210/656 [09:13<16:14,  2.19s/it]

{'loss': 2.2162, 'grad_norm': 2.2043275833129883, 'learning_rate': 5.016722408026756e-06, 'epoch': 0.64}


 32%|███▏      | 211/656 [09:16<16:09,  2.18s/it]

{'loss': 2.3543, 'grad_norm': 2.030385971069336, 'learning_rate': 5.040611562350693e-06, 'epoch': 0.64}


 32%|███▏      | 212/656 [09:18<16:13,  2.19s/it]

{'loss': 2.09, 'grad_norm': 1.8917360305786133, 'learning_rate': 5.0645007166746296e-06, 'epoch': 0.65}


 32%|███▏      | 213/656 [09:20<16:04,  2.18s/it]

{'loss': 2.257, 'grad_norm': 1.8939175605773926, 'learning_rate': 5.0883898709985665e-06, 'epoch': 0.65}


 33%|███▎      | 214/656 [09:22<16:00,  2.17s/it]

{'loss': 2.2183, 'grad_norm': 2.0288286209106445, 'learning_rate': 5.112279025322503e-06, 'epoch': 0.65}


 33%|███▎      | 215/656 [09:24<15:57,  2.17s/it]

{'loss': 2.0386, 'grad_norm': 1.911882996559143, 'learning_rate': 5.136168179646441e-06, 'epoch': 0.66}


 33%|███▎      | 216/656 [09:26<15:53,  2.17s/it]

{'loss': 2.0355, 'grad_norm': 1.8909273147583008, 'learning_rate': 5.160057333970378e-06, 'epoch': 0.66}


 33%|███▎      | 217/656 [09:29<15:49,  2.16s/it]

{'loss': 2.1916, 'grad_norm': 1.7827086448669434, 'learning_rate': 5.183946488294315e-06, 'epoch': 0.66}


 33%|███▎      | 218/656 [09:31<15:47,  2.16s/it]

{'loss': 2.0696, 'grad_norm': 1.7285189628601074, 'learning_rate': 5.207835642618252e-06, 'epoch': 0.66}


 33%|███▎      | 219/656 [09:33<15:44,  2.16s/it]

{'loss': 1.995, 'grad_norm': 2.1702165603637695, 'learning_rate': 5.231724796942189e-06, 'epoch': 0.67}


 34%|███▎      | 220/656 [09:35<15:44,  2.17s/it]

{'loss': 2.2174, 'grad_norm': 2.1657862663269043, 'learning_rate': 5.2556139512661255e-06, 'epoch': 0.67}


 34%|███▎      | 221/656 [09:37<15:40,  2.16s/it]

{'loss': 2.134, 'grad_norm': 2.2482752799987793, 'learning_rate': 5.279503105590062e-06, 'epoch': 0.67}


 34%|███▍      | 222/656 [09:39<15:38,  2.16s/it]

{'loss': 2.0014, 'grad_norm': 2.4038009643554688, 'learning_rate': 5.303392259913999e-06, 'epoch': 0.68}


 34%|███▍      | 223/656 [09:41<15:36,  2.16s/it]

{'loss': 2.1138, 'grad_norm': 1.8805629014968872, 'learning_rate': 5.327281414237936e-06, 'epoch': 0.68}


 34%|███▍      | 224/656 [09:44<15:42,  2.18s/it]

{'loss': 2.1333, 'grad_norm': 2.6577250957489014, 'learning_rate': 5.351170568561873e-06, 'epoch': 0.68}


 34%|███▍      | 225/656 [09:46<15:36,  2.17s/it]

{'loss': 2.1283, 'grad_norm': 2.0876526832580566, 'learning_rate': 5.37505972288581e-06, 'epoch': 0.69}


 34%|███▍      | 226/656 [09:48<15:34,  2.17s/it]

{'loss': 2.0059, 'grad_norm': 2.2740933895111084, 'learning_rate': 5.398948877209747e-06, 'epoch': 0.69}


 35%|███▍      | 227/656 [09:50<15:30,  2.17s/it]

{'loss': 1.971, 'grad_norm': 2.9170286655426025, 'learning_rate': 5.4228380315336845e-06, 'epoch': 0.69}


 35%|███▍      | 228/656 [09:52<15:32,  2.18s/it]

{'loss': 2.2461, 'grad_norm': 2.1037192344665527, 'learning_rate': 5.446727185857621e-06, 'epoch': 0.7}


 35%|███▍      | 229/656 [09:55<15:28,  2.17s/it]

{'loss': 2.1426, 'grad_norm': 2.1273598670959473, 'learning_rate': 5.4706163401815574e-06, 'epoch': 0.7}


 35%|███▌      | 230/656 [09:57<15:47,  2.22s/it]

{'loss': 2.2634, 'grad_norm': 2.1603634357452393, 'learning_rate': 5.494505494505494e-06, 'epoch': 0.7}


 35%|███▌      | 231/656 [09:59<15:37,  2.21s/it]

{'loss': 1.9879, 'grad_norm': 2.5647330284118652, 'learning_rate': 5.518394648829431e-06, 'epoch': 0.7}


 35%|███▌      | 232/656 [10:01<15:28,  2.19s/it]

{'loss': 1.9038, 'grad_norm': 2.698179006576538, 'learning_rate': 5.542283803153368e-06, 'epoch': 0.71}


 36%|███▌      | 233/656 [10:03<15:22,  2.18s/it]

{'loss': 1.9825, 'grad_norm': 3.028884172439575, 'learning_rate': 5.566172957477306e-06, 'epoch': 0.71}


 36%|███▌      | 234/656 [10:05<15:11,  2.16s/it]

{'loss': 1.8845, 'grad_norm': 2.7020349502563477, 'learning_rate': 5.590062111801243e-06, 'epoch': 0.71}


 36%|███▌      | 235/656 [10:08<15:09,  2.16s/it]

{'loss': 2.0635, 'grad_norm': 2.079606294631958, 'learning_rate': 5.61395126612518e-06, 'epoch': 0.72}


 36%|███▌      | 236/656 [10:10<15:13,  2.17s/it]

{'loss': 2.0189, 'grad_norm': 2.339020252227783, 'learning_rate': 5.6378404204491165e-06, 'epoch': 0.72}


 36%|███▌      | 237/656 [10:12<15:08,  2.17s/it]

{'loss': 2.0384, 'grad_norm': 2.799076557159424, 'learning_rate': 5.661729574773053e-06, 'epoch': 0.72}


 36%|███▋      | 238/656 [10:14<14:58,  2.15s/it]

{'loss': 2.0061, 'grad_norm': 3.288285255432129, 'learning_rate': 5.68561872909699e-06, 'epoch': 0.73}


 36%|███▋      | 239/656 [10:16<15:06,  2.17s/it]

{'loss': 2.1399, 'grad_norm': 2.871328353881836, 'learning_rate': 5.709507883420927e-06, 'epoch': 0.73}


 37%|███▋      | 240/656 [10:19<15:10,  2.19s/it]

{'loss': 1.9608, 'grad_norm': 2.5932421684265137, 'learning_rate': 5.733397037744864e-06, 'epoch': 0.73}


 37%|███▋      | 241/656 [10:21<15:07,  2.19s/it]

{'loss': 2.0537, 'grad_norm': 2.2677977085113525, 'learning_rate': 5.757286192068801e-06, 'epoch': 0.73}


 37%|███▋      | 242/656 [10:23<15:02,  2.18s/it]

{'loss': 2.0312, 'grad_norm': 2.292186737060547, 'learning_rate': 5.781175346392738e-06, 'epoch': 0.74}


 37%|███▋      | 243/656 [10:25<15:03,  2.19s/it]

{'loss': 1.9442, 'grad_norm': 2.2452423572540283, 'learning_rate': 5.805064500716675e-06, 'epoch': 0.74}


 37%|███▋      | 244/656 [10:28<15:34,  2.27s/it]

{'loss': 1.9847, 'grad_norm': 2.177638053894043, 'learning_rate': 5.8289536550406116e-06, 'epoch': 0.74}


 37%|███▋      | 245/656 [10:30<15:38,  2.28s/it]

{'loss': 1.9401, 'grad_norm': 2.219675064086914, 'learning_rate': 5.852842809364549e-06, 'epoch': 0.75}


 38%|███▊      | 246/656 [11:54<3:02:58, 26.78s/it]

{'loss': 1.926, 'grad_norm': 2.212383508682251, 'learning_rate': 5.876731963688486e-06, 'epoch': 0.75}


 38%|███▊      | 247/656 [11:56<2:13:13, 19.54s/it]

{'loss': 2.0462, 'grad_norm': 2.1031341552734375, 'learning_rate': 5.900621118012423e-06, 'epoch': 0.75}


 38%|███▊      | 248/656 [11:59<1:37:42, 14.37s/it]

{'loss': 1.8879, 'grad_norm': 2.0012950897216797, 'learning_rate': 5.924510272336359e-06, 'epoch': 0.76}


 38%|███▊      | 249/656 [12:02<1:15:22, 11.11s/it]

{'loss': 1.8358, 'grad_norm': 2.404571294784546, 'learning_rate': 5.948399426660296e-06, 'epoch': 0.76}


 38%|███▊      | 250/656 [12:05<58:44,  8.68s/it]  

{'loss': 1.9771, 'grad_norm': 2.233105421066284, 'learning_rate': 5.972288580984233e-06, 'epoch': 0.76}


 38%|███▊      | 251/656 [12:08<46:10,  6.84s/it]

{'loss': 1.9753, 'grad_norm': 2.9070801734924316, 'learning_rate': 5.996177735308171e-06, 'epoch': 0.77}


 38%|███▊      | 252/656 [12:11<37:53,  5.63s/it]

{'loss': 1.9673, 'grad_norm': 2.3727853298187256, 'learning_rate': 6.0200668896321075e-06, 'epoch': 0.77}


 39%|███▊      | 253/656 [12:13<31:22,  4.67s/it]

{'loss': 1.9211, 'grad_norm': 3.7000622749328613, 'learning_rate': 6.043956043956044e-06, 'epoch': 0.77}


 39%|███▊      | 254/656 [12:16<27:09,  4.05s/it]

{'loss': 1.5709, 'grad_norm': 2.912052869796753, 'learning_rate': 6.067845198279981e-06, 'epoch': 0.77}


 39%|███▉      | 255/656 [12:18<24:04,  3.60s/it]

{'loss': 1.9095, 'grad_norm': 2.306898832321167, 'learning_rate': 6.091734352603918e-06, 'epoch': 0.78}


 39%|███▉      | 256/656 [12:21<21:54,  3.29s/it]

{'loss': 1.7944, 'grad_norm': 2.879032850265503, 'learning_rate': 6.115623506927855e-06, 'epoch': 0.78}


 39%|███▉      | 257/656 [12:23<19:48,  2.98s/it]

{'loss': 1.8315, 'grad_norm': 3.4537174701690674, 'learning_rate': 6.139512661251792e-06, 'epoch': 0.78}


 39%|███▉      | 258/656 [12:25<18:23,  2.77s/it]

{'loss': 1.9292, 'grad_norm': 2.0088698863983154, 'learning_rate': 6.163401815575729e-06, 'epoch': 0.79}


 39%|███▉      | 259/656 [12:28<17:15,  2.61s/it]

{'loss': 1.8112, 'grad_norm': 3.3268215656280518, 'learning_rate': 6.187290969899666e-06, 'epoch': 0.79}


 40%|███▉      | 260/656 [12:30<16:38,  2.52s/it]

{'loss': 1.985, 'grad_norm': 2.3114490509033203, 'learning_rate': 6.2111801242236025e-06, 'epoch': 0.79}


 40%|███▉      | 261/656 [12:32<16:04,  2.44s/it]

{'loss': 1.7087, 'grad_norm': 3.024162769317627, 'learning_rate': 6.2350692785475394e-06, 'epoch': 0.8}


 40%|███▉      | 262/656 [12:34<15:31,  2.36s/it]

{'loss': 1.8335, 'grad_norm': 2.905679702758789, 'learning_rate': 6.258958432871477e-06, 'epoch': 0.8}


 40%|████      | 263/656 [12:37<15:27,  2.36s/it]

{'loss': 1.9359, 'grad_norm': 2.273564338684082, 'learning_rate': 6.282847587195413e-06, 'epoch': 0.8}


 40%|████      | 264/656 [12:39<15:46,  2.41s/it]

{'loss': 1.8541, 'grad_norm': 2.256377935409546, 'learning_rate': 6.306736741519351e-06, 'epoch': 0.8}


 40%|████      | 265/656 [12:42<15:32,  2.39s/it]

{'loss': 1.541, 'grad_norm': 2.6796278953552246, 'learning_rate': 6.330625895843287e-06, 'epoch': 0.81}


 41%|████      | 266/656 [12:44<15:52,  2.44s/it]

{'loss': 1.7389, 'grad_norm': 4.383115768432617, 'learning_rate': 6.354515050167225e-06, 'epoch': 0.81}


 41%|████      | 267/656 [12:47<15:56,  2.46s/it]

{'loss': 1.7726, 'grad_norm': 2.37296199798584, 'learning_rate': 6.378404204491162e-06, 'epoch': 0.81}


 41%|████      | 268/656 [12:49<15:57,  2.47s/it]

{'loss': 1.986, 'grad_norm': 2.352827787399292, 'learning_rate': 6.402293358815098e-06, 'epoch': 0.82}


 41%|████      | 269/656 [12:52<16:09,  2.50s/it]

{'loss': 1.7738, 'grad_norm': 2.35845685005188, 'learning_rate': 6.426182513139035e-06, 'epoch': 0.82}


 41%|████      | 270/656 [12:54<16:05,  2.50s/it]

{'loss': 1.7546, 'grad_norm': 2.2856032848358154, 'learning_rate': 6.450071667462971e-06, 'epoch': 0.82}


 41%|████▏     | 271/656 [12:57<15:57,  2.49s/it]

{'loss': 1.9397, 'grad_norm': 3.3237416744232178, 'learning_rate': 6.473960821786909e-06, 'epoch': 0.83}


 41%|████▏     | 272/656 [12:59<15:56,  2.49s/it]

{'loss': 1.7318, 'grad_norm': 3.081908941268921, 'learning_rate': 6.497849976110847e-06, 'epoch': 0.83}


 42%|████▏     | 273/656 [13:02<15:38,  2.45s/it]

{'loss': 1.9409, 'grad_norm': 2.712676525115967, 'learning_rate': 6.521739130434783e-06, 'epoch': 0.83}


 42%|████▏     | 274/656 [13:04<15:42,  2.47s/it]

{'loss': 1.7572, 'grad_norm': 2.0418548583984375, 'learning_rate': 6.545628284758721e-06, 'epoch': 0.84}


 42%|████▏     | 275/656 [13:06<15:29,  2.44s/it]

{'loss': 1.8077, 'grad_norm': 2.334077835083008, 'learning_rate': 6.569517439082657e-06, 'epoch': 0.84}


 42%|████▏     | 276/656 [13:09<15:26,  2.44s/it]

{'loss': 1.6067, 'grad_norm': 2.145320177078247, 'learning_rate': 6.5934065934065935e-06, 'epoch': 0.84}


 42%|████▏     | 277/656 [13:11<15:30,  2.45s/it]

{'loss': 1.762, 'grad_norm': 2.6999971866607666, 'learning_rate': 6.61729574773053e-06, 'epoch': 0.84}


 42%|████▏     | 278/656 [13:15<17:17,  2.75s/it]

{'loss': 1.7704, 'grad_norm': 2.773766040802002, 'learning_rate': 6.641184902054467e-06, 'epoch': 0.85}


 43%|████▎     | 279/656 [13:18<18:52,  3.00s/it]

{'loss': 1.4278, 'grad_norm': 1.7679197788238525, 'learning_rate': 6.665074056378405e-06, 'epoch': 0.85}


 43%|████▎     | 280/656 [13:21<18:07,  2.89s/it]

{'loss': 1.66, 'grad_norm': 2.8656325340270996, 'learning_rate': 6.688963210702341e-06, 'epoch': 0.85}


 43%|████▎     | 281/656 [13:23<17:20,  2.78s/it]

{'loss': 2.0001, 'grad_norm': 3.7879679203033447, 'learning_rate': 6.712852365026279e-06, 'epoch': 0.86}


 43%|████▎     | 282/656 [13:26<16:51,  2.70s/it]

{'loss': 1.6906, 'grad_norm': 2.08768630027771, 'learning_rate': 6.736741519350215e-06, 'epoch': 0.86}


 43%|████▎     | 283/656 [13:29<16:40,  2.68s/it]

{'loss': 1.8404, 'grad_norm': 2.9589731693267822, 'learning_rate': 6.7606306736741526e-06, 'epoch': 0.86}


 43%|████▎     | 284/656 [13:31<16:08,  2.60s/it]

{'loss': 1.5449, 'grad_norm': 2.2233774662017822, 'learning_rate': 6.7845198279980895e-06, 'epoch': 0.87}


 43%|████▎     | 285/656 [13:34<16:54,  2.73s/it]

{'loss': 1.6526, 'grad_norm': 2.260939836502075, 'learning_rate': 6.808408982322026e-06, 'epoch': 0.87}


 44%|████▎     | 286/656 [13:38<18:49,  3.05s/it]

{'loss': 1.5877, 'grad_norm': 3.1099772453308105, 'learning_rate': 6.832298136645963e-06, 'epoch': 0.87}


 44%|████▍     | 287/656 [13:42<20:42,  3.37s/it]

{'loss': 1.6052, 'grad_norm': 3.5488429069519043, 'learning_rate': 6.856187290969899e-06, 'epoch': 0.88}


 44%|████▍     | 288/656 [13:47<22:55,  3.74s/it]

{'loss': 1.7937, 'grad_norm': 4.170284748077393, 'learning_rate': 6.880076445293837e-06, 'epoch': 0.88}


 44%|████▍     | 289/656 [13:50<22:50,  3.73s/it]

{'loss': 1.7856, 'grad_norm': 2.982727527618408, 'learning_rate': 6.903965599617773e-06, 'epoch': 0.88}


 44%|████▍     | 290/656 [13:53<21:28,  3.52s/it]

{'loss': 1.8315, 'grad_norm': 2.373805284500122, 'learning_rate': 6.927854753941711e-06, 'epoch': 0.88}


 44%|████▍     | 291/656 [13:56<19:58,  3.28s/it]

{'loss': 1.7946, 'grad_norm': 2.242086887359619, 'learning_rate': 6.9517439082656485e-06, 'epoch': 0.89}


 45%|████▍     | 292/656 [13:59<18:31,  3.05s/it]

{'loss': 1.941, 'grad_norm': 2.9015448093414307, 'learning_rate': 6.9756330625895845e-06, 'epoch': 0.89}


 45%|████▍     | 293/656 [14:01<17:59,  2.97s/it]

{'loss': 1.7164, 'grad_norm': 2.569526195526123, 'learning_rate': 6.999522216913522e-06, 'epoch': 0.89}


 45%|████▍     | 294/656 [14:04<17:10,  2.85s/it]

{'loss': 1.599, 'grad_norm': 2.648601770401001, 'learning_rate': 7.023411371237458e-06, 'epoch': 0.9}


 45%|████▍     | 295/656 [14:07<16:42,  2.78s/it]

{'loss': 1.5686, 'grad_norm': 3.4125711917877197, 'learning_rate': 7.047300525561395e-06, 'epoch': 0.9}


 45%|████▌     | 296/656 [14:09<16:01,  2.67s/it]

{'loss': 1.5609, 'grad_norm': 2.163149118423462, 'learning_rate': 7.071189679885333e-06, 'epoch': 0.9}


 45%|████▌     | 297/656 [14:11<15:38,  2.61s/it]

{'loss': 1.6679, 'grad_norm': 3.2598021030426025, 'learning_rate': 7.095078834209269e-06, 'epoch': 0.91}


 45%|████▌     | 298/656 [14:14<15:13,  2.55s/it]

{'loss': 1.3834, 'grad_norm': 2.3238277435302734, 'learning_rate': 7.118967988533207e-06, 'epoch': 0.91}


 46%|████▌     | 299/656 [14:16<14:55,  2.51s/it]

{'loss': 1.5305, 'grad_norm': 2.420675754547119, 'learning_rate': 7.142857142857143e-06, 'epoch': 0.91}


 46%|████▌     | 300/656 [14:19<14:34,  2.46s/it]

{'loss': 1.6068, 'grad_norm': 2.463432550430298, 'learning_rate': 7.1667462971810804e-06, 'epoch': 0.91}


 46%|████▌     | 301/656 [14:21<14:27,  2.44s/it]

{'loss': 1.5796, 'grad_norm': 3.601567506790161, 'learning_rate': 7.1906354515050165e-06, 'epoch': 0.92}


 46%|████▌     | 302/656 [14:23<14:10,  2.40s/it]

{'loss': 1.4949, 'grad_norm': 4.437533855438232, 'learning_rate': 7.214524605828954e-06, 'epoch': 0.92}


 46%|████▌     | 303/656 [14:26<13:59,  2.38s/it]

{'loss': 1.6438, 'grad_norm': 2.6347599029541016, 'learning_rate': 7.238413760152891e-06, 'epoch': 0.92}


 46%|████▋     | 304/656 [14:28<14:02,  2.39s/it]

{'loss': 1.7193, 'grad_norm': 3.200812816619873, 'learning_rate': 7.262302914476828e-06, 'epoch': 0.93}


 46%|████▋     | 305/656 [14:31<14:03,  2.40s/it]

{'loss': 1.7144, 'grad_norm': 2.545654296875, 'learning_rate': 7.286192068800765e-06, 'epoch': 0.93}


 47%|████▋     | 306/656 [14:33<14:28,  2.48s/it]

{'loss': 1.8144, 'grad_norm': 2.4216060638427734, 'learning_rate': 7.310081223124701e-06, 'epoch': 0.93}


 47%|████▋     | 307/656 [14:37<16:50,  2.90s/it]

{'loss': 1.8878, 'grad_norm': 2.5819449424743652, 'learning_rate': 7.333970377448639e-06, 'epoch': 0.94}


 47%|████▋     | 308/656 [14:40<17:21,  2.99s/it]

{'loss': 1.7525, 'grad_norm': 3.3704473972320557, 'learning_rate': 7.357859531772576e-06, 'epoch': 0.94}


 47%|████▋     | 309/656 [14:43<16:33,  2.86s/it]

{'loss': 1.4191, 'grad_norm': 4.745548248291016, 'learning_rate': 7.381748686096512e-06, 'epoch': 0.94}


 47%|████▋     | 310/656 [14:46<16:27,  2.85s/it]

{'loss': 1.6089, 'grad_norm': 2.557385206222534, 'learning_rate': 7.40563784042045e-06, 'epoch': 0.95}


 47%|████▋     | 311/656 [14:48<16:00,  2.78s/it]

{'loss': 1.5828, 'grad_norm': 4.556909084320068, 'learning_rate': 7.429526994744386e-06, 'epoch': 0.95}


 48%|████▊     | 312/656 [14:51<15:44,  2.75s/it]

{'loss': 1.7176, 'grad_norm': 2.827686071395874, 'learning_rate': 7.453416149068324e-06, 'epoch': 0.95}


 48%|████▊     | 313/656 [14:54<15:26,  2.70s/it]

{'loss': 1.6215, 'grad_norm': 4.058869361877441, 'learning_rate': 7.47730530339226e-06, 'epoch': 0.95}


 48%|████▊     | 314/656 [14:56<15:33,  2.73s/it]

{'loss': 1.4782, 'grad_norm': 2.842905282974243, 'learning_rate': 7.501194457716197e-06, 'epoch': 0.96}


 48%|████▊     | 315/656 [14:59<15:09,  2.67s/it]

{'loss': 1.4777, 'grad_norm': 2.6265103816986084, 'learning_rate': 7.5250836120401346e-06, 'epoch': 0.96}


 48%|████▊     | 316/656 [15:02<15:33,  2.75s/it]

{'loss': 1.585, 'grad_norm': 2.117626905441284, 'learning_rate': 7.548972766364071e-06, 'epoch': 0.96}


 48%|████▊     | 317/656 [15:04<15:11,  2.69s/it]

{'loss': 1.6528, 'grad_norm': 2.741961717605591, 'learning_rate': 7.572861920688008e-06, 'epoch': 0.97}


 48%|████▊     | 318/656 [15:07<15:34,  2.76s/it]

{'loss': 1.3706, 'grad_norm': 2.046959638595581, 'learning_rate': 7.596751075011944e-06, 'epoch': 0.97}


 49%|████▊     | 319/656 [15:10<15:16,  2.72s/it]

{'loss': 1.5585, 'grad_norm': 2.065670967102051, 'learning_rate': 7.620640229335882e-06, 'epoch': 0.97}


 49%|████▉     | 320/656 [15:13<15:18,  2.73s/it]

{'loss': 1.4644, 'grad_norm': 2.6153552532196045, 'learning_rate': 7.64452938365982e-06, 'epoch': 0.98}


 49%|████▉     | 321/656 [15:15<15:25,  2.76s/it]

{'loss': 1.3178, 'grad_norm': 2.6734108924865723, 'learning_rate': 7.668418537983756e-06, 'epoch': 0.98}


 49%|████▉     | 322/656 [15:18<15:11,  2.73s/it]

{'loss': 1.4606, 'grad_norm': 3.217090606689453, 'learning_rate': 7.692307692307694e-06, 'epoch': 0.98}


 49%|████▉     | 323/656 [15:21<14:53,  2.68s/it]

{'loss': 1.6342, 'grad_norm': 3.357952356338501, 'learning_rate': 7.71619684663163e-06, 'epoch': 0.98}


 49%|████▉     | 324/656 [15:23<14:48,  2.68s/it]

{'loss': 1.4003, 'grad_norm': 3.5482912063598633, 'learning_rate': 7.740086000955567e-06, 'epoch': 0.99}


 50%|████▉     | 325/656 [15:26<14:52,  2.70s/it]

{'loss': 1.5723, 'grad_norm': 2.5577595233917236, 'learning_rate': 7.763975155279503e-06, 'epoch': 0.99}


 50%|████▉     | 326/656 [15:29<14:33,  2.65s/it]

{'loss': 1.2869, 'grad_norm': 2.466374158859253, 'learning_rate': 7.787864309603441e-06, 'epoch': 0.99}


 50%|████▉     | 327/656 [15:31<14:38,  2.67s/it]

{'loss': 1.5752, 'grad_norm': 3.8035178184509277, 'learning_rate': 7.811753463927377e-06, 'epoch': 1.0}


 50%|█████     | 328/656 [15:32<11:01,  2.02s/it]
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'loss': 1.0256, 'grad_norm': 5.742748260498047, 'learning_rate': 7.835642618251313e-06, 'epoch': 1.0}


                                                 
 50%|█████     | 328/656 [16:35<11:01,  2.02s/it]Saving model checkpoint to ./snips_token_clf/results/checkpoint-328
Configuration saved in ./snips_token_clf/results/checkpoint-328/config.json


{'eval_loss': 1.4345436096191406, 'eval_model_preparation_time': 0.0015, 'eval_runtime': 63.5925, 'eval_samples_per_second': 41.153, 'eval_steps_per_second': 1.289, 'epoch': 1.0}


Model weights saved in ./snips_token_clf/results/checkpoint-328/model.safetensors
 50%|█████     | 329/656 [16:39<1:57:58, 21.65s/it]

{'loss': 1.365, 'grad_norm': 1.9256511926651, 'learning_rate': 7.859531772575251e-06, 'epoch': 1.0}


 50%|█████     | 330/656 [16:42<1:27:02, 16.02s/it]

{'loss': 1.3507, 'grad_norm': 2.184438705444336, 'learning_rate': 7.883420926899187e-06, 'epoch': 1.01}


 50%|█████     | 331/656 [16:45<1:04:47, 11.96s/it]

{'loss': 1.3686, 'grad_norm': 2.5905346870422363, 'learning_rate': 7.907310081223125e-06, 'epoch': 1.01}


 51%|█████     | 332/656 [16:47<49:20,  9.14s/it]  

{'loss': 1.3067, 'grad_norm': 2.128403902053833, 'learning_rate': 7.931199235547062e-06, 'epoch': 1.01}


 51%|█████     | 333/656 [16:50<38:33,  7.16s/it]

{'loss': 1.4978, 'grad_norm': 2.4443647861480713, 'learning_rate': 7.955088389870998e-06, 'epoch': 1.02}


 51%|█████     | 334/656 [16:52<30:40,  5.72s/it]

{'loss': 1.5246, 'grad_norm': 3.559370756149292, 'learning_rate': 7.978977544194936e-06, 'epoch': 1.02}


 51%|█████     | 335/656 [16:55<25:15,  4.72s/it]

{'loss': 1.4356, 'grad_norm': 2.1179020404815674, 'learning_rate': 8.002866698518872e-06, 'epoch': 1.02}


 51%|█████     | 336/656 [16:57<21:30,  4.03s/it]

{'loss': 1.4169, 'grad_norm': 3.5556697845458984, 'learning_rate': 8.02675585284281e-06, 'epoch': 1.02}


 51%|█████▏    | 337/656 [17:00<19:06,  3.59s/it]

{'loss': 1.362, 'grad_norm': 2.4439592361450195, 'learning_rate': 8.050645007166746e-06, 'epoch': 1.03}


 52%|█████▏    | 338/656 [17:02<16:56,  3.20s/it]

{'loss': 1.3477, 'grad_norm': 2.7080700397491455, 'learning_rate': 8.074534161490684e-06, 'epoch': 1.03}


 52%|█████▏    | 339/656 [17:04<16:01,  3.03s/it]

{'loss': 1.4254, 'grad_norm': 3.9975333213806152, 'learning_rate': 8.098423315814621e-06, 'epoch': 1.03}


 52%|█████▏    | 340/656 [17:07<14:44,  2.80s/it]

{'loss': 1.3293, 'grad_norm': 2.1288211345672607, 'learning_rate': 8.122312470138558e-06, 'epoch': 1.04}


 52%|█████▏    | 341/656 [17:09<14:04,  2.68s/it]

{'loss': 1.3133, 'grad_norm': 2.2624738216400146, 'learning_rate': 8.146201624462495e-06, 'epoch': 1.04}


 52%|█████▏    | 342/656 [17:12<14:14,  2.72s/it]

{'loss': 1.5973, 'grad_norm': 2.7169787883758545, 'learning_rate': 8.170090778786431e-06, 'epoch': 1.04}


 52%|█████▏    | 343/656 [17:14<13:34,  2.60s/it]

{'loss': 1.4326, 'grad_norm': 2.4962613582611084, 'learning_rate': 8.193979933110369e-06, 'epoch': 1.05}


 52%|█████▏    | 344/656 [17:17<13:38,  2.62s/it]

{'loss': 1.3231, 'grad_norm': 2.6091675758361816, 'learning_rate': 8.217869087434307e-06, 'epoch': 1.05}


 53%|█████▎    | 345/656 [17:20<13:45,  2.65s/it]

{'loss': 1.3163, 'grad_norm': 2.9640824794769287, 'learning_rate': 8.241758241758243e-06, 'epoch': 1.05}


 53%|█████▎    | 346/656 [17:24<15:43,  3.04s/it]

{'loss': 1.4646, 'grad_norm': 2.4500210285186768, 'learning_rate': 8.265647396082179e-06, 'epoch': 1.05}


 53%|█████▎    | 347/656 [17:28<17:17,  3.36s/it]

{'loss': 1.3642, 'grad_norm': 2.995640754699707, 'learning_rate': 8.289536550406115e-06, 'epoch': 1.06}


 53%|█████▎    | 348/656 [17:31<16:30,  3.22s/it]

{'loss': 1.6896, 'grad_norm': 3.304947853088379, 'learning_rate': 8.313425704730053e-06, 'epoch': 1.06}


 53%|█████▎    | 349/656 [17:33<15:32,  3.04s/it]

{'loss': 1.3717, 'grad_norm': 3.104552984237671, 'learning_rate': 8.337314859053989e-06, 'epoch': 1.06}


 53%|█████▎    | 350/656 [17:35<14:22,  2.82s/it]

{'loss': 1.3358, 'grad_norm': 3.647162437438965, 'learning_rate': 8.361204013377926e-06, 'epoch': 1.07}


 54%|█████▎    | 351/656 [17:40<16:40,  3.28s/it]

{'loss': 1.1578, 'grad_norm': 2.3340659141540527, 'learning_rate': 8.385093167701864e-06, 'epoch': 1.07}


 54%|█████▎    | 352/656 [17:42<15:26,  3.05s/it]

{'loss': 1.4438, 'grad_norm': 2.51314115524292, 'learning_rate': 8.4089823220258e-06, 'epoch': 1.07}


 54%|█████▍    | 353/656 [17:46<16:07,  3.19s/it]

{'loss': 1.2658, 'grad_norm': 2.565129518508911, 'learning_rate': 8.432871476349738e-06, 'epoch': 1.08}


 54%|█████▍    | 354/656 [17:48<15:08,  3.01s/it]

{'loss': 1.2653, 'grad_norm': 3.80364990234375, 'learning_rate': 8.456760630673674e-06, 'epoch': 1.08}


 54%|█████▍    | 355/656 [17:51<14:44,  2.94s/it]

{'loss': 1.2485, 'grad_norm': 3.642089366912842, 'learning_rate': 8.480649784997612e-06, 'epoch': 1.08}


 54%|█████▍    | 356/656 [17:55<15:55,  3.18s/it]

{'loss': 1.327, 'grad_norm': 2.424067258834839, 'learning_rate': 8.50453893932155e-06, 'epoch': 1.09}


 54%|█████▍    | 357/656 [17:58<16:00,  3.21s/it]

{'loss': 1.2724, 'grad_norm': 2.3702170848846436, 'learning_rate': 8.528428093645485e-06, 'epoch': 1.09}


 55%|█████▍    | 358/656 [18:01<15:57,  3.21s/it]

{'loss': 1.1846, 'grad_norm': 2.646165370941162, 'learning_rate': 8.552317247969423e-06, 'epoch': 1.09}


 55%|█████▍    | 359/656 [18:04<15:21,  3.10s/it]

{'loss': 1.1788, 'grad_norm': 2.4552180767059326, 'learning_rate': 8.576206402293359e-06, 'epoch': 1.09}


 55%|█████▍    | 360/656 [18:08<15:35,  3.16s/it]

{'loss': 1.3561, 'grad_norm': 2.7788310050964355, 'learning_rate': 8.600095556617297e-06, 'epoch': 1.1}


 55%|█████▌    | 361/656 [18:11<15:21,  3.12s/it]

{'loss': 1.3061, 'grad_norm': 3.966364622116089, 'learning_rate': 8.623984710941233e-06, 'epoch': 1.1}


 55%|█████▌    | 362/656 [18:13<14:40,  3.00s/it]

{'loss': 1.2396, 'grad_norm': 3.060382604598999, 'learning_rate': 8.64787386526517e-06, 'epoch': 1.1}


 55%|█████▌    | 363/656 [18:17<15:16,  3.13s/it]

{'loss': 1.1891, 'grad_norm': 2.474496603012085, 'learning_rate': 8.671763019589108e-06, 'epoch': 1.11}


 55%|█████▌    | 364/656 [18:19<14:04,  2.89s/it]

{'loss': 1.3398, 'grad_norm': 2.350759506225586, 'learning_rate': 8.695652173913044e-06, 'epoch': 1.11}


 56%|█████▌    | 365/656 [18:22<13:59,  2.88s/it]

{'loss': 1.1303, 'grad_norm': 2.5142297744750977, 'learning_rate': 8.71954132823698e-06, 'epoch': 1.11}


 56%|█████▌    | 366/656 [18:24<13:15,  2.74s/it]

{'loss': 1.287, 'grad_norm': 3.343867301940918, 'learning_rate': 8.743430482560916e-06, 'epoch': 1.12}


 56%|█████▌    | 367/656 [18:29<16:33,  3.44s/it]

{'loss': 1.2052, 'grad_norm': 2.2847418785095215, 'learning_rate': 8.767319636884854e-06, 'epoch': 1.12}


 56%|█████▌    | 368/656 [18:33<16:12,  3.38s/it]

{'loss': 1.2338, 'grad_norm': 3.03446102142334, 'learning_rate': 8.791208791208792e-06, 'epoch': 1.12}


 56%|█████▋    | 369/656 [18:35<14:32,  3.04s/it]

{'loss': 1.4808, 'grad_norm': 3.249521017074585, 'learning_rate': 8.815097945532728e-06, 'epoch': 1.12}


 56%|█████▋    | 370/656 [18:37<13:42,  2.87s/it]

{'loss': 1.1799, 'grad_norm': 2.9885623455047607, 'learning_rate': 8.838987099856666e-06, 'epoch': 1.13}


 57%|█████▋    | 371/656 [18:40<13:50,  2.91s/it]

{'loss': 1.3329, 'grad_norm': 7.239459991455078, 'learning_rate': 8.862876254180602e-06, 'epoch': 1.13}


 57%|█████▋    | 372/656 [18:43<13:00,  2.75s/it]

{'loss': 1.1327, 'grad_norm': 2.9991655349731445, 'learning_rate': 8.88676540850454e-06, 'epoch': 1.13}


 57%|█████▋    | 373/656 [18:45<12:45,  2.71s/it]

{'loss': 1.121, 'grad_norm': 2.963184118270874, 'learning_rate': 8.910654562828476e-06, 'epoch': 1.14}


 57%|█████▋    | 374/656 [18:49<13:46,  2.93s/it]

{'loss': 1.3895, 'grad_norm': 3.293078660964966, 'learning_rate': 8.934543717152413e-06, 'epoch': 1.14}


 57%|█████▋    | 375/656 [18:51<13:05,  2.80s/it]

{'loss': 1.2081, 'grad_norm': 4.254981994628906, 'learning_rate': 8.958432871476351e-06, 'epoch': 1.14}


 57%|█████▋    | 376/656 [18:54<12:47,  2.74s/it]

{'loss': 1.1666, 'grad_norm': 2.843567132949829, 'learning_rate': 8.982322025800287e-06, 'epoch': 1.15}


 57%|█████▋    | 377/656 [18:57<12:32,  2.70s/it]

{'loss': 1.1641, 'grad_norm': 3.2949368953704834, 'learning_rate': 9.006211180124225e-06, 'epoch': 1.15}


 58%|█████▊    | 378/656 [18:59<12:45,  2.75s/it]

{'loss': 1.1135, 'grad_norm': 2.3147857189178467, 'learning_rate': 9.03010033444816e-06, 'epoch': 1.15}


 58%|█████▊    | 379/656 [19:03<13:53,  3.01s/it]

{'loss': 1.1977, 'grad_norm': 2.8417909145355225, 'learning_rate': 9.053989488772099e-06, 'epoch': 1.16}


 58%|█████▊    | 380/656 [19:06<14:21,  3.12s/it]

{'loss': 1.2342, 'grad_norm': 3.0709362030029297, 'learning_rate': 9.077878643096036e-06, 'epoch': 1.16}


 58%|█████▊    | 381/656 [19:10<14:52,  3.24s/it]

{'loss': 1.1414, 'grad_norm': 2.8647119998931885, 'learning_rate': 9.101767797419972e-06, 'epoch': 1.16}


 58%|█████▊    | 382/656 [19:14<15:14,  3.34s/it]

{'loss': 1.0324, 'grad_norm': 2.949887275695801, 'learning_rate': 9.125656951743908e-06, 'epoch': 1.16}


 58%|█████▊    | 383/656 [19:17<15:19,  3.37s/it]

{'loss': 1.184, 'grad_norm': 3.7760815620422363, 'learning_rate': 9.149546106067846e-06, 'epoch': 1.17}


 59%|█████▊    | 384/656 [19:20<14:53,  3.29s/it]

{'loss': 1.1285, 'grad_norm': 2.3155667781829834, 'learning_rate': 9.173435260391782e-06, 'epoch': 1.17}


 59%|█████▊    | 385/656 [19:23<14:28,  3.21s/it]

{'loss': 1.067, 'grad_norm': 2.684569835662842, 'learning_rate': 9.197324414715718e-06, 'epoch': 1.17}


 59%|█████▉    | 386/656 [19:26<14:29,  3.22s/it]

{'loss': 1.0578, 'grad_norm': 2.4303927421569824, 'learning_rate': 9.221213569039656e-06, 'epoch': 1.18}


 59%|█████▉    | 387/656 [19:29<13:56,  3.11s/it]

{'loss': 1.1015, 'grad_norm': 2.1964406967163086, 'learning_rate': 9.245102723363594e-06, 'epoch': 1.18}


 59%|█████▉    | 388/656 [19:34<15:33,  3.48s/it]

{'loss': 1.301, 'grad_norm': 2.9492297172546387, 'learning_rate': 9.26899187768753e-06, 'epoch': 1.18}


 59%|█████▉    | 389/656 [19:36<14:06,  3.17s/it]

{'loss': 1.142, 'grad_norm': 2.6493582725524902, 'learning_rate': 9.292881032011467e-06, 'epoch': 1.19}


 59%|█████▉    | 390/656 [19:40<14:30,  3.27s/it]

{'loss': 1.2329, 'grad_norm': 3.302079677581787, 'learning_rate': 9.316770186335403e-06, 'epoch': 1.19}


 60%|█████▉    | 391/656 [19:42<13:27,  3.05s/it]

{'loss': 1.2243, 'grad_norm': 3.168891429901123, 'learning_rate': 9.340659340659341e-06, 'epoch': 1.19}


 60%|█████▉    | 392/656 [19:44<12:32,  2.85s/it]

{'loss': 1.0589, 'grad_norm': 2.7107303142547607, 'learning_rate': 9.364548494983277e-06, 'epoch': 1.2}


 60%|█████▉    | 393/656 [19:47<12:13,  2.79s/it]

{'loss': 1.3266, 'grad_norm': 3.755831003189087, 'learning_rate': 9.388437649307215e-06, 'epoch': 1.2}


 60%|██████    | 394/656 [19:50<11:55,  2.73s/it]

{'loss': 1.2175, 'grad_norm': 3.399320363998413, 'learning_rate': 9.412326803631153e-06, 'epoch': 1.2}


 60%|██████    | 395/656 [19:53<12:09,  2.80s/it]

{'loss': 0.9958, 'grad_norm': 3.042987108230591, 'learning_rate': 9.436215957955089e-06, 'epoch': 1.2}


 60%|██████    | 396/656 [19:55<11:34,  2.67s/it]

{'loss': 0.9151, 'grad_norm': 3.4769816398620605, 'learning_rate': 9.460105112279026e-06, 'epoch': 1.21}


 61%|██████    | 397/656 [19:57<11:12,  2.60s/it]

{'loss': 1.3029, 'grad_norm': 4.136719703674316, 'learning_rate': 9.483994266602962e-06, 'epoch': 1.21}


 61%|██████    | 398/656 [20:00<11:18,  2.63s/it]

{'loss': 1.245, 'grad_norm': 4.234344482421875, 'learning_rate': 9.5078834209269e-06, 'epoch': 1.21}


 61%|██████    | 399/656 [20:03<11:56,  2.79s/it]

{'loss': 1.2734, 'grad_norm': 3.5259745121002197, 'learning_rate': 9.531772575250838e-06, 'epoch': 1.22}


 61%|██████    | 400/656 [20:06<11:50,  2.78s/it]

{'loss': 1.0163, 'grad_norm': 2.4744293689727783, 'learning_rate': 9.555661729574774e-06, 'epoch': 1.22}


 61%|██████    | 401/656 [20:09<11:39,  2.74s/it]

{'loss': 1.0639, 'grad_norm': 2.6979894638061523, 'learning_rate': 9.57955088389871e-06, 'epoch': 1.22}


 61%|██████▏   | 402/656 [20:11<11:06,  2.63s/it]

{'loss': 1.0966, 'grad_norm': 2.5278685092926025, 'learning_rate': 9.603440038222648e-06, 'epoch': 1.23}


 61%|██████▏   | 403/656 [20:13<10:42,  2.54s/it]

{'loss': 1.2204, 'grad_norm': 3.0933260917663574, 'learning_rate': 9.627329192546584e-06, 'epoch': 1.23}


 62%|██████▏   | 404/656 [20:16<10:39,  2.54s/it]

{'loss': 1.0922, 'grad_norm': 3.283317804336548, 'learning_rate': 9.65121834687052e-06, 'epoch': 1.23}


 62%|██████▏   | 405/656 [20:19<11:17,  2.70s/it]

{'loss': 0.993, 'grad_norm': 2.229247570037842, 'learning_rate': 9.675107501194458e-06, 'epoch': 1.23}


 62%|██████▏   | 406/656 [20:22<11:31,  2.77s/it]

{'loss': 1.1379, 'grad_norm': 2.9235992431640625, 'learning_rate': 9.698996655518395e-06, 'epoch': 1.24}


 62%|██████▏   | 407/656 [20:24<10:52,  2.62s/it]

{'loss': 0.9319, 'grad_norm': 2.91243577003479, 'learning_rate': 9.722885809842331e-06, 'epoch': 1.24}


 62%|██████▏   | 408/656 [20:27<10:41,  2.59s/it]

{'loss': 1.1282, 'grad_norm': 2.4341185092926025, 'learning_rate': 9.746774964166269e-06, 'epoch': 1.24}


 62%|██████▏   | 409/656 [20:29<10:19,  2.51s/it]

{'loss': 1.0383, 'grad_norm': 3.3157036304473877, 'learning_rate': 9.770664118490205e-06, 'epoch': 1.25}


 62%|██████▎   | 410/656 [20:31<09:53,  2.41s/it]

{'loss': 1.1394, 'grad_norm': 2.5188190937042236, 'learning_rate': 9.794553272814143e-06, 'epoch': 1.25}


 63%|██████▎   | 411/656 [20:33<09:35,  2.35s/it]

{'loss': 0.9912, 'grad_norm': 2.0537731647491455, 'learning_rate': 9.81844242713808e-06, 'epoch': 1.25}


 63%|██████▎   | 412/656 [20:37<10:46,  2.65s/it]

{'loss': 1.1961, 'grad_norm': 3.5526113510131836, 'learning_rate': 9.842331581462017e-06, 'epoch': 1.26}


 63%|██████▎   | 413/656 [20:39<10:16,  2.54s/it]

{'loss': 1.0005, 'grad_norm': 3.2264153957366943, 'learning_rate': 9.866220735785954e-06, 'epoch': 1.26}


 63%|██████▎   | 414/656 [20:41<10:05,  2.50s/it]

{'loss': 1.2102, 'grad_norm': 4.384121894836426, 'learning_rate': 9.89010989010989e-06, 'epoch': 1.26}


 63%|██████▎   | 415/656 [20:44<09:49,  2.44s/it]

{'loss': 1.1211, 'grad_norm': 2.8488807678222656, 'learning_rate': 9.913999044433828e-06, 'epoch': 1.27}


 63%|██████▎   | 416/656 [20:46<09:42,  2.43s/it]

{'loss': 0.9496, 'grad_norm': 2.988067626953125, 'learning_rate': 9.937888198757764e-06, 'epoch': 1.27}


 64%|██████▎   | 417/656 [20:49<09:42,  2.44s/it]

{'loss': 0.97, 'grad_norm': 3.3623616695404053, 'learning_rate': 9.961777353081702e-06, 'epoch': 1.27}


 64%|██████▎   | 418/656 [20:52<10:28,  2.64s/it]

{'loss': 0.9446, 'grad_norm': 4.1333537101745605, 'learning_rate': 9.98566650740564e-06, 'epoch': 1.27}


 64%|██████▍   | 419/656 [20:54<10:07,  2.56s/it]

{'loss': 1.1868, 'grad_norm': 3.851442813873291, 'learning_rate': 1.0009555661729576e-05, 'epoch': 1.28}


 64%|██████▍   | 420/656 [20:57<10:14,  2.61s/it]

{'loss': 1.017, 'grad_norm': 2.687321424484253, 'learning_rate': 1.0033444816053512e-05, 'epoch': 1.28}


 64%|██████▍   | 421/656 [20:59<10:04,  2.57s/it]

{'loss': 0.9604, 'grad_norm': 2.3613131046295166, 'learning_rate': 1.005733397037745e-05, 'epoch': 1.28}


 64%|██████▍   | 422/656 [21:04<12:48,  3.28s/it]

{'loss': 1.0488, 'grad_norm': 3.939284324645996, 'learning_rate': 1.0081223124701385e-05, 'epoch': 1.29}


 64%|██████▍   | 423/656 [21:07<12:31,  3.23s/it]

{'loss': 1.0349, 'grad_norm': 2.60941743850708, 'learning_rate': 1.0105112279025323e-05, 'epoch': 1.29}


 65%|██████▍   | 424/656 [21:10<11:26,  2.96s/it]

{'loss': 1.004, 'grad_norm': 2.9700489044189453, 'learning_rate': 1.0129001433349259e-05, 'epoch': 1.29}


 65%|██████▍   | 425/656 [21:15<13:51,  3.60s/it]

{'loss': 0.8919, 'grad_norm': 2.780580997467041, 'learning_rate': 1.0152890587673197e-05, 'epoch': 1.3}


 65%|██████▍   | 426/656 [21:17<12:34,  3.28s/it]

{'loss': 1.1661, 'grad_norm': 3.7922403812408447, 'learning_rate': 1.0176779741997133e-05, 'epoch': 1.3}


 65%|██████▌   | 427/656 [21:20<11:35,  3.04s/it]

{'loss': 0.8431, 'grad_norm': 2.371584415435791, 'learning_rate': 1.020066889632107e-05, 'epoch': 1.3}


 65%|██████▌   | 428/656 [21:22<10:52,  2.86s/it]

{'loss': 0.9175, 'grad_norm': 4.418153762817383, 'learning_rate': 1.0224558050645007e-05, 'epoch': 1.3}


 65%|██████▌   | 429/656 [21:25<10:26,  2.76s/it]

{'loss': 0.9829, 'grad_norm': 4.011166572570801, 'learning_rate': 1.0248447204968944e-05, 'epoch': 1.31}


 66%|██████▌   | 430/656 [21:27<10:15,  2.72s/it]

{'loss': 1.0193, 'grad_norm': 4.315126895904541, 'learning_rate': 1.0272336359292882e-05, 'epoch': 1.31}


 66%|██████▌   | 431/656 [21:31<11:11,  2.98s/it]

{'loss': 0.9072, 'grad_norm': 2.665357828140259, 'learning_rate': 1.0296225513616818e-05, 'epoch': 1.31}


 66%|██████▌   | 432/656 [21:34<10:51,  2.91s/it]

{'loss': 0.7535, 'grad_norm': 2.04801082611084, 'learning_rate': 1.0320114667940756e-05, 'epoch': 1.32}


 66%|██████▌   | 433/656 [21:37<11:15,  3.03s/it]

{'loss': 0.867, 'grad_norm': 2.7533962726593018, 'learning_rate': 1.0344003822264692e-05, 'epoch': 1.32}


 66%|██████▌   | 434/656 [21:40<11:24,  3.08s/it]

{'loss': 0.9392, 'grad_norm': 2.4480011463165283, 'learning_rate': 1.036789297658863e-05, 'epoch': 1.32}


 66%|██████▋   | 435/656 [21:44<11:57,  3.25s/it]

{'loss': 0.9858, 'grad_norm': 2.530902862548828, 'learning_rate': 1.0391782130912567e-05, 'epoch': 1.33}


 66%|██████▋   | 436/656 [21:47<11:44,  3.20s/it]

{'loss': 0.9127, 'grad_norm': 2.3560409545898438, 'learning_rate': 1.0415671285236503e-05, 'epoch': 1.33}


 67%|██████▋   | 437/656 [21:50<11:34,  3.17s/it]

{'loss': 0.8351, 'grad_norm': 2.8881452083587646, 'learning_rate': 1.0439560439560441e-05, 'epoch': 1.33}


 67%|██████▋   | 438/656 [21:53<11:29,  3.16s/it]

{'loss': 0.9251, 'grad_norm': 2.410336971282959, 'learning_rate': 1.0463449593884377e-05, 'epoch': 1.34}


 67%|██████▋   | 439/656 [21:56<11:20,  3.14s/it]

{'loss': 0.7993, 'grad_norm': 2.036158561706543, 'learning_rate': 1.0487338748208313e-05, 'epoch': 1.34}


 67%|██████▋   | 440/656 [22:00<11:27,  3.18s/it]

{'loss': 0.8983, 'grad_norm': 2.3955159187316895, 'learning_rate': 1.0511227902532251e-05, 'epoch': 1.34}


 67%|██████▋   | 441/656 [22:03<11:25,  3.19s/it]

{'loss': 0.7887, 'grad_norm': 3.3483402729034424, 'learning_rate': 1.0535117056856187e-05, 'epoch': 1.34}


 67%|██████▋   | 442/656 [22:06<10:54,  3.06s/it]

{'loss': 0.7603, 'grad_norm': 2.3440463542938232, 'learning_rate': 1.0559006211180125e-05, 'epoch': 1.35}


 68%|██████▊   | 443/656 [22:08<10:27,  2.94s/it]

{'loss': 0.8524, 'grad_norm': 3.439053773880005, 'learning_rate': 1.058289536550406e-05, 'epoch': 1.35}


 68%|██████▊   | 444/656 [22:11<10:06,  2.86s/it]

{'loss': 0.7184, 'grad_norm': 2.236201047897339, 'learning_rate': 1.0606784519827999e-05, 'epoch': 1.35}


 68%|██████▊   | 445/656 [22:13<09:21,  2.66s/it]

{'loss': 0.9178, 'grad_norm': 3.738513708114624, 'learning_rate': 1.0630673674151935e-05, 'epoch': 1.36}


 68%|██████▊   | 446/656 [22:18<11:11,  3.20s/it]

{'loss': 1.1718, 'grad_norm': 5.947801113128662, 'learning_rate': 1.0654562828475872e-05, 'epoch': 1.36}


 68%|██████▊   | 447/656 [22:20<10:41,  3.07s/it]

{'loss': 0.8162, 'grad_norm': 5.7053680419921875, 'learning_rate': 1.067845198279981e-05, 'epoch': 1.36}


 68%|██████▊   | 448/656 [22:25<12:15,  3.54s/it]

{'loss': 0.874, 'grad_norm': 2.2591843605041504, 'learning_rate': 1.0702341137123746e-05, 'epoch': 1.37}


 68%|██████▊   | 449/656 [22:28<11:46,  3.41s/it]

{'loss': 1.0533, 'grad_norm': 3.6567130088806152, 'learning_rate': 1.0726230291447684e-05, 'epoch': 1.37}


 69%|██████▊   | 450/656 [22:32<12:39,  3.69s/it]

{'loss': 0.8854, 'grad_norm': 3.6992664337158203, 'learning_rate': 1.075011944577162e-05, 'epoch': 1.37}


 69%|██████▉   | 451/656 [22:36<12:10,  3.56s/it]

{'loss': 0.8856, 'grad_norm': 3.374633312225342, 'learning_rate': 1.0774008600095558e-05, 'epoch': 1.38}


 69%|██████▉   | 452/656 [22:39<11:28,  3.37s/it]

{'loss': 0.8274, 'grad_norm': 2.7805864810943604, 'learning_rate': 1.0797897754419494e-05, 'epoch': 1.38}


 69%|██████▉   | 453/656 [22:42<11:14,  3.32s/it]

{'loss': 1.0221, 'grad_norm': 5.494555950164795, 'learning_rate': 1.0821786908743431e-05, 'epoch': 1.38}


 69%|██████▉   | 454/656 [22:45<10:37,  3.16s/it]

{'loss': 0.9878, 'grad_norm': 3.9936845302581787, 'learning_rate': 1.0845676063067369e-05, 'epoch': 1.38}


 69%|██████▉   | 455/656 [22:48<10:26,  3.12s/it]

{'loss': 0.995, 'grad_norm': 2.686981201171875, 'learning_rate': 1.0869565217391305e-05, 'epoch': 1.39}


 70%|██████▉   | 456/656 [22:50<09:52,  2.96s/it]

{'loss': 0.8913, 'grad_norm': 3.5245120525360107, 'learning_rate': 1.0893454371715243e-05, 'epoch': 1.39}


 70%|██████▉   | 457/656 [22:53<09:51,  2.97s/it]

{'loss': 0.768, 'grad_norm': 4.010958671569824, 'learning_rate': 1.0917343526039179e-05, 'epoch': 1.39}


 70%|██████▉   | 458/656 [22:56<10:04,  3.05s/it]

{'loss': 0.9728, 'grad_norm': 2.8345537185668945, 'learning_rate': 1.0941232680363115e-05, 'epoch': 1.4}


 70%|██████▉   | 459/656 [22:59<09:45,  2.97s/it]

{'loss': 0.9033, 'grad_norm': 3.9280893802642822, 'learning_rate': 1.0965121834687053e-05, 'epoch': 1.4}


 70%|███████   | 460/656 [23:03<10:17,  3.15s/it]

{'loss': 0.783, 'grad_norm': 2.3453919887542725, 'learning_rate': 1.0989010989010989e-05, 'epoch': 1.4}


 70%|███████   | 461/656 [23:05<09:24,  2.90s/it]

{'loss': 0.8919, 'grad_norm': 3.089097738265991, 'learning_rate': 1.1012900143334926e-05, 'epoch': 1.41}


 70%|███████   | 462/656 [23:08<09:23,  2.90s/it]

{'loss': 0.849, 'grad_norm': 3.725722312927246, 'learning_rate': 1.1036789297658862e-05, 'epoch': 1.41}


 71%|███████   | 463/656 [23:11<09:25,  2.93s/it]

{'loss': 0.9045, 'grad_norm': 3.0115907192230225, 'learning_rate': 1.10606784519828e-05, 'epoch': 1.41}


 71%|███████   | 464/656 [23:14<09:24,  2.94s/it]

{'loss': 0.8872, 'grad_norm': 3.24304461479187, 'learning_rate': 1.1084567606306736e-05, 'epoch': 1.41}


 71%|███████   | 465/656 [23:17<09:13,  2.90s/it]

{'loss': 0.7591, 'grad_norm': 2.139129877090454, 'learning_rate': 1.1108456760630674e-05, 'epoch': 1.42}


 71%|███████   | 466/656 [23:19<08:46,  2.77s/it]

{'loss': 0.7228, 'grad_norm': 2.2503740787506104, 'learning_rate': 1.1132345914954612e-05, 'epoch': 1.42}


 71%|███████   | 467/656 [23:22<08:32,  2.71s/it]

{'loss': 0.8525, 'grad_norm': 3.3058643341064453, 'learning_rate': 1.1156235069278548e-05, 'epoch': 1.42}


 71%|███████▏  | 468/656 [23:25<08:31,  2.72s/it]

{'loss': 0.7507, 'grad_norm': 1.9016832113265991, 'learning_rate': 1.1180124223602485e-05, 'epoch': 1.43}


 71%|███████▏  | 469/656 [23:27<08:07,  2.61s/it]

{'loss': 0.6589, 'grad_norm': 2.8807389736175537, 'learning_rate': 1.1204013377926421e-05, 'epoch': 1.43}


 72%|███████▏  | 470/656 [23:30<08:39,  2.79s/it]

{'loss': 0.799, 'grad_norm': 2.5142455101013184, 'learning_rate': 1.122790253225036e-05, 'epoch': 1.43}


 72%|███████▏  | 471/656 [23:33<08:51,  2.87s/it]

{'loss': 0.7745, 'grad_norm': 2.7163286209106445, 'learning_rate': 1.1251791686574297e-05, 'epoch': 1.44}


 72%|███████▏  | 472/656 [23:36<09:04,  2.96s/it]

{'loss': 0.6859, 'grad_norm': 2.511582612991333, 'learning_rate': 1.1275680840898233e-05, 'epoch': 1.44}


 72%|███████▏  | 473/656 [23:39<08:59,  2.95s/it]

{'loss': 0.7541, 'grad_norm': 2.8950366973876953, 'learning_rate': 1.129956999522217e-05, 'epoch': 1.44}


 72%|███████▏  | 474/656 [23:45<11:17,  3.72s/it]

{'loss': 0.7464, 'grad_norm': 2.9495058059692383, 'learning_rate': 1.1323459149546107e-05, 'epoch': 1.45}


 72%|███████▏  | 475/656 [23:47<10:18,  3.42s/it]

{'loss': 0.7172, 'grad_norm': 3.205446720123291, 'learning_rate': 1.1347348303870044e-05, 'epoch': 1.45}


 73%|███████▎  | 476/656 [23:51<10:18,  3.44s/it]

{'loss': 0.8846, 'grad_norm': 5.0546793937683105, 'learning_rate': 1.137123745819398e-05, 'epoch': 1.45}


 73%|███████▎  | 477/656 [23:54<09:35,  3.22s/it]

{'loss': 0.8484, 'grad_norm': 2.512821674346924, 'learning_rate': 1.1395126612517917e-05, 'epoch': 1.45}


 73%|███████▎  | 478/656 [23:56<08:44,  2.95s/it]

{'loss': 0.7861, 'grad_norm': 3.9757633209228516, 'learning_rate': 1.1419015766841854e-05, 'epoch': 1.46}


 73%|███████▎  | 479/656 [23:59<09:04,  3.08s/it]

{'loss': 0.8049, 'grad_norm': 2.6960787773132324, 'learning_rate': 1.144290492116579e-05, 'epoch': 1.46}


 73%|███████▎  | 480/656 [24:02<08:42,  2.97s/it]

{'loss': 0.8484, 'grad_norm': 2.641967296600342, 'learning_rate': 1.1466794075489728e-05, 'epoch': 1.46}


 73%|███████▎  | 481/656 [24:06<09:22,  3.21s/it]

{'loss': 0.7071, 'grad_norm': 2.538609266281128, 'learning_rate': 1.1490683229813664e-05, 'epoch': 1.47}


 73%|███████▎  | 482/656 [24:09<08:54,  3.07s/it]

{'loss': 0.5813, 'grad_norm': 2.5909252166748047, 'learning_rate': 1.1514572384137602e-05, 'epoch': 1.47}


 74%|███████▎  | 483/656 [24:11<08:39,  3.01s/it]

{'loss': 0.6107, 'grad_norm': 2.0546648502349854, 'learning_rate': 1.153846153846154e-05, 'epoch': 1.47}


 74%|███████▍  | 484/656 [24:15<09:09,  3.20s/it]

{'loss': 0.9391, 'grad_norm': 4.645782947540283, 'learning_rate': 1.1562350692785476e-05, 'epoch': 1.48}


 74%|███████▍  | 485/656 [24:19<09:20,  3.28s/it]

{'loss': 0.7242, 'grad_norm': 3.4014647006988525, 'learning_rate': 1.1586239847109413e-05, 'epoch': 1.48}


 74%|███████▍  | 486/656 [24:23<10:25,  3.68s/it]

{'loss': 0.8771, 'grad_norm': 3.0670995712280273, 'learning_rate': 1.161012900143335e-05, 'epoch': 1.48}


 74%|███████▍  | 487/656 [24:27<10:17,  3.66s/it]

{'loss': 0.7832, 'grad_norm': 3.211761951446533, 'learning_rate': 1.1634018155757287e-05, 'epoch': 1.48}


 74%|███████▍  | 488/656 [24:30<10:07,  3.61s/it]

{'loss': 0.6768, 'grad_norm': 2.8638811111450195, 'learning_rate': 1.1657907310081223e-05, 'epoch': 1.49}


 75%|███████▍  | 489/656 [24:33<09:37,  3.46s/it]

{'loss': 0.6924, 'grad_norm': 2.910846710205078, 'learning_rate': 1.168179646440516e-05, 'epoch': 1.49}


 75%|███████▍  | 490/656 [24:37<09:37,  3.48s/it]

{'loss': 0.835, 'grad_norm': 3.176218032836914, 'learning_rate': 1.1705685618729099e-05, 'epoch': 1.49}


 75%|███████▍  | 491/656 [24:40<09:27,  3.44s/it]

{'loss': 0.6141, 'grad_norm': 2.5659050941467285, 'learning_rate': 1.1729574773053035e-05, 'epoch': 1.5}


 75%|███████▌  | 492/656 [24:43<08:43,  3.19s/it]

{'loss': 0.7183, 'grad_norm': 1.8789896965026855, 'learning_rate': 1.1753463927376972e-05, 'epoch': 1.5}


 75%|███████▌  | 493/656 [24:46<08:52,  3.26s/it]

{'loss': 0.633, 'grad_norm': 2.625105381011963, 'learning_rate': 1.1777353081700908e-05, 'epoch': 1.5}


 75%|███████▌  | 494/656 [24:49<08:22,  3.10s/it]

{'loss': 0.6813, 'grad_norm': 3.5557870864868164, 'learning_rate': 1.1801242236024846e-05, 'epoch': 1.51}


 75%|███████▌  | 495/656 [24:53<09:02,  3.37s/it]

{'loss': 0.725, 'grad_norm': 2.1454782485961914, 'learning_rate': 1.1825131390348782e-05, 'epoch': 1.51}


 76%|███████▌  | 496/656 [24:55<08:09,  3.06s/it]

{'loss': 0.6191, 'grad_norm': 2.27659010887146, 'learning_rate': 1.1849020544672718e-05, 'epoch': 1.51}


 76%|███████▌  | 497/656 [24:58<08:02,  3.04s/it]

{'loss': 0.7951, 'grad_norm': 2.6993257999420166, 'learning_rate': 1.1872909698996656e-05, 'epoch': 1.52}


 76%|███████▌  | 498/656 [25:01<07:35,  2.88s/it]

{'loss': 0.7307, 'grad_norm': 3.697944402694702, 'learning_rate': 1.1896798853320592e-05, 'epoch': 1.52}


 76%|███████▌  | 499/656 [25:04<08:08,  3.11s/it]

{'loss': 0.7709, 'grad_norm': 2.921823024749756, 'learning_rate': 1.192068800764453e-05, 'epoch': 1.52}


 76%|███████▌  | 500/656 [25:07<07:42,  2.96s/it]

{'loss': 0.8222, 'grad_norm': 3.4176437854766846, 'learning_rate': 1.1944577161968466e-05, 'epoch': 1.52}


 76%|███████▋  | 501/656 [25:10<07:46,  3.01s/it]

{'loss': 0.8009, 'grad_norm': 3.22479248046875, 'learning_rate': 1.1968466316292403e-05, 'epoch': 1.53}


 77%|███████▋  | 502/656 [25:14<08:26,  3.29s/it]

{'loss': 0.7139, 'grad_norm': 2.8702099323272705, 'learning_rate': 1.1992355470616341e-05, 'epoch': 1.53}


 77%|███████▋  | 503/656 [25:18<09:08,  3.58s/it]

{'loss': 0.9015, 'grad_norm': 3.2263028621673584, 'learning_rate': 1.2016244624940277e-05, 'epoch': 1.53}


 77%|███████▋  | 504/656 [25:22<08:52,  3.50s/it]

{'loss': 0.6704, 'grad_norm': 3.222604513168335, 'learning_rate': 1.2040133779264215e-05, 'epoch': 1.54}


 77%|███████▋  | 505/656 [25:25<08:15,  3.28s/it]

{'loss': 0.5603, 'grad_norm': 2.5138795375823975, 'learning_rate': 1.2064022933588151e-05, 'epoch': 1.54}


 77%|███████▋  | 506/656 [25:28<08:12,  3.28s/it]

{'loss': 0.8407, 'grad_norm': 3.2769510746002197, 'learning_rate': 1.2087912087912089e-05, 'epoch': 1.54}


 77%|███████▋  | 507/656 [25:30<07:23,  2.97s/it]

{'loss': 0.6319, 'grad_norm': 3.884711742401123, 'learning_rate': 1.2111801242236026e-05, 'epoch': 1.55}


 77%|███████▋  | 508/656 [25:32<06:46,  2.75s/it]

{'loss': 0.7297, 'grad_norm': 2.355771064758301, 'learning_rate': 1.2135690396559962e-05, 'epoch': 1.55}


 78%|███████▊  | 509/656 [25:35<06:34,  2.68s/it]

{'loss': 0.7313, 'grad_norm': 2.637970447540283, 'learning_rate': 1.21595795508839e-05, 'epoch': 1.55}


 78%|███████▊  | 510/656 [25:38<06:31,  2.68s/it]

{'loss': 0.6382, 'grad_norm': 2.750413179397583, 'learning_rate': 1.2183468705207836e-05, 'epoch': 1.55}


 78%|███████▊  | 511/656 [25:40<06:23,  2.65s/it]

{'loss': 0.8623, 'grad_norm': 3.0278446674346924, 'learning_rate': 1.2207357859531774e-05, 'epoch': 1.56}


 78%|███████▊  | 512/656 [25:42<06:09,  2.56s/it]

{'loss': 0.5957, 'grad_norm': 2.870981216430664, 'learning_rate': 1.223124701385571e-05, 'epoch': 1.56}


 78%|███████▊  | 513/656 [25:45<06:20,  2.66s/it]

{'loss': 0.7609, 'grad_norm': 3.493215322494507, 'learning_rate': 1.2255136168179648e-05, 'epoch': 1.56}


 78%|███████▊  | 514/656 [25:48<06:21,  2.69s/it]

{'loss': 0.7493, 'grad_norm': 3.1937255859375, 'learning_rate': 1.2279025322503584e-05, 'epoch': 1.57}


 79%|███████▊  | 515/656 [25:52<06:55,  2.95s/it]

{'loss': 0.7238, 'grad_norm': 4.309025287628174, 'learning_rate': 1.230291447682752e-05, 'epoch': 1.57}


 79%|███████▊  | 516/656 [25:54<06:31,  2.79s/it]

{'loss': 0.7926, 'grad_norm': 2.802448034286499, 'learning_rate': 1.2326803631151458e-05, 'epoch': 1.57}


 79%|███████▉  | 517/656 [25:57<06:18,  2.72s/it]

{'loss': 0.6, 'grad_norm': 2.7613162994384766, 'learning_rate': 1.2350692785475394e-05, 'epoch': 1.58}


 79%|███████▉  | 518/656 [25:59<06:13,  2.71s/it]

{'loss': 0.6971, 'grad_norm': 2.6018412113189697, 'learning_rate': 1.2374581939799331e-05, 'epoch': 1.58}


 79%|███████▉  | 519/656 [26:02<06:20,  2.78s/it]

{'loss': 0.6088, 'grad_norm': 3.074005365371704, 'learning_rate': 1.2398471094123269e-05, 'epoch': 1.58}


 79%|███████▉  | 520/656 [26:05<06:06,  2.69s/it]

{'loss': 0.6219, 'grad_norm': 3.154752016067505, 'learning_rate': 1.2422360248447205e-05, 'epoch': 1.59}


 79%|███████▉  | 521/656 [26:10<07:42,  3.42s/it]

{'loss': 0.5823, 'grad_norm': 2.3577280044555664, 'learning_rate': 1.2446249402771143e-05, 'epoch': 1.59}


 80%|███████▉  | 522/656 [26:12<06:58,  3.13s/it]

{'loss': 0.892, 'grad_norm': 3.733757734298706, 'learning_rate': 1.2470138557095079e-05, 'epoch': 1.59}


 80%|███████▉  | 523/656 [26:15<06:22,  2.87s/it]

{'loss': 0.631, 'grad_norm': 2.2225615978240967, 'learning_rate': 1.2494027711419017e-05, 'epoch': 1.59}


 80%|███████▉  | 524/656 [26:17<05:53,  2.68s/it]

{'loss': 0.7936, 'grad_norm': 2.7812297344207764, 'learning_rate': 1.2517916865742954e-05, 'epoch': 1.6}


 80%|████████  | 525/656 [26:19<05:38,  2.59s/it]

{'loss': 0.8112, 'grad_norm': 4.296076774597168, 'learning_rate': 1.254180602006689e-05, 'epoch': 1.6}


 80%|████████  | 526/656 [26:21<05:22,  2.48s/it]

{'loss': 0.703, 'grad_norm': 3.0301663875579834, 'learning_rate': 1.2565695174390826e-05, 'epoch': 1.6}


 80%|████████  | 527/656 [26:24<05:15,  2.45s/it]

{'loss': 0.6247, 'grad_norm': 4.916183948516846, 'learning_rate': 1.2589584328714766e-05, 'epoch': 1.61}


 80%|████████  | 528/656 [26:26<05:06,  2.39s/it]

{'loss': 0.6279, 'grad_norm': 3.8788697719573975, 'learning_rate': 1.2613473483038702e-05, 'epoch': 1.61}


 81%|████████  | 529/656 [26:30<06:17,  2.97s/it]

{'loss': 0.7869, 'grad_norm': 3.4676034450531006, 'learning_rate': 1.2637362637362638e-05, 'epoch': 1.61}


 81%|████████  | 530/656 [26:33<06:19,  3.01s/it]

{'loss': 0.5333, 'grad_norm': 3.522905111312866, 'learning_rate': 1.2661251791686574e-05, 'epoch': 1.62}


 81%|████████  | 531/656 [26:36<05:54,  2.84s/it]

{'loss': 0.6366, 'grad_norm': 3.7762701511383057, 'learning_rate': 1.2685140946010512e-05, 'epoch': 1.62}


 81%|████████  | 532/656 [26:38<05:38,  2.73s/it]

{'loss': 0.7947, 'grad_norm': 3.059842109680176, 'learning_rate': 1.270903010033445e-05, 'epoch': 1.62}


 81%|████████▏ | 533/656 [26:41<05:37,  2.74s/it]

{'loss': 0.5781, 'grad_norm': 2.170686721801758, 'learning_rate': 1.2732919254658385e-05, 'epoch': 1.62}


 81%|████████▏ | 534/656 [26:44<05:24,  2.66s/it]

{'loss': 0.6076, 'grad_norm': 2.7783620357513428, 'learning_rate': 1.2756808408982323e-05, 'epoch': 1.63}


 82%|████████▏ | 535/656 [26:46<05:20,  2.65s/it]

{'loss': 0.712, 'grad_norm': 2.8154773712158203, 'learning_rate': 1.278069756330626e-05, 'epoch': 1.63}


 82%|████████▏ | 536/656 [26:49<05:38,  2.82s/it]

{'loss': 0.5816, 'grad_norm': 2.7268378734588623, 'learning_rate': 1.2804586717630195e-05, 'epoch': 1.63}


 82%|████████▏ | 537/656 [26:52<05:34,  2.81s/it]

{'loss': 0.7594, 'grad_norm': 3.9189867973327637, 'learning_rate': 1.2828475871954135e-05, 'epoch': 1.64}


 82%|████████▏ | 538/656 [26:56<05:48,  2.96s/it]

{'loss': 0.7556, 'grad_norm': 4.484676361083984, 'learning_rate': 1.285236502627807e-05, 'epoch': 1.64}


 82%|████████▏ | 539/656 [27:00<06:37,  3.40s/it]

{'loss': 0.5184, 'grad_norm': 2.0740604400634766, 'learning_rate': 1.2876254180602007e-05, 'epoch': 1.64}


 82%|████████▏ | 540/656 [27:04<06:47,  3.51s/it]

{'loss': 0.995, 'grad_norm': 3.0494208335876465, 'learning_rate': 1.2900143334925943e-05, 'epoch': 1.65}


 82%|████████▏ | 541/656 [27:07<06:31,  3.40s/it]

{'loss': 0.5778, 'grad_norm': 2.633047580718994, 'learning_rate': 1.2924032489249882e-05, 'epoch': 1.65}


 83%|████████▎ | 542/656 [27:11<07:00,  3.69s/it]

{'loss': 0.6338, 'grad_norm': 3.2433857917785645, 'learning_rate': 1.2947921643573818e-05, 'epoch': 1.65}


 83%|████████▎ | 543/656 [27:16<07:17,  3.88s/it]

{'loss': 0.64, 'grad_norm': 3.2630422115325928, 'learning_rate': 1.2971810797897754e-05, 'epoch': 1.66}


 83%|████████▎ | 544/656 [27:19<07:09,  3.83s/it]

{'loss': 0.6177, 'grad_norm': 2.8643016815185547, 'learning_rate': 1.2995699952221694e-05, 'epoch': 1.66}


 83%|████████▎ | 545/656 [27:23<06:55,  3.74s/it]

{'loss': 0.7239, 'grad_norm': 4.462383270263672, 'learning_rate': 1.301958910654563e-05, 'epoch': 1.66}


 83%|████████▎ | 546/656 [27:27<06:49,  3.72s/it]

{'loss': 0.5262, 'grad_norm': 3.7515177726745605, 'learning_rate': 1.3043478260869566e-05, 'epoch': 1.66}


 83%|████████▎ | 547/656 [27:29<06:07,  3.38s/it]

{'loss': 0.6227, 'grad_norm': 3.8311963081359863, 'learning_rate': 1.3067367415193502e-05, 'epoch': 1.67}


 84%|████████▎ | 548/656 [27:32<05:41,  3.16s/it]

{'loss': 0.6271, 'grad_norm': 4.244022846221924, 'learning_rate': 1.3091256569517441e-05, 'epoch': 1.67}


 84%|████████▎ | 549/656 [27:34<05:24,  3.03s/it]

{'loss': 0.5563, 'grad_norm': 2.606605052947998, 'learning_rate': 1.3115145723841377e-05, 'epoch': 1.67}


 84%|████████▍ | 550/656 [27:37<05:08,  2.91s/it]

{'loss': 0.6624, 'grad_norm': 3.160130500793457, 'learning_rate': 1.3139034878165313e-05, 'epoch': 1.68}


 84%|████████▍ | 551/656 [27:40<04:50,  2.77s/it]

{'loss': 0.5983, 'grad_norm': 3.352013349533081, 'learning_rate': 1.3162924032489251e-05, 'epoch': 1.68}


 84%|████████▍ | 552/656 [27:42<04:43,  2.72s/it]

{'loss': 0.7707, 'grad_norm': 5.472382068634033, 'learning_rate': 1.3186813186813187e-05, 'epoch': 1.68}


 84%|████████▍ | 553/656 [27:45<04:30,  2.63s/it]

{'loss': 0.5453, 'grad_norm': 3.1246700286865234, 'learning_rate': 1.3210702341137123e-05, 'epoch': 1.69}


 84%|████████▍ | 554/656 [27:48<04:40,  2.75s/it]

{'loss': 0.6049, 'grad_norm': 3.3685526847839355, 'learning_rate': 1.323459149546106e-05, 'epoch': 1.69}


 85%|████████▍ | 555/656 [27:51<05:04,  3.01s/it]

{'loss': 0.5239, 'grad_norm': 2.255718946456909, 'learning_rate': 1.3258480649784999e-05, 'epoch': 1.69}


 85%|████████▍ | 556/656 [27:55<05:20,  3.20s/it]

{'loss': 0.5748, 'grad_norm': 2.359683036804199, 'learning_rate': 1.3282369804108935e-05, 'epoch': 1.7}


 85%|████████▍ | 557/656 [27:58<05:24,  3.28s/it]

{'loss': 0.481, 'grad_norm': 2.189429998397827, 'learning_rate': 1.330625895843287e-05, 'epoch': 1.7}


 85%|████████▌ | 558/656 [28:01<05:13,  3.20s/it]

{'loss': 0.5962, 'grad_norm': 2.8943495750427246, 'learning_rate': 1.333014811275681e-05, 'epoch': 1.7}


 85%|████████▌ | 559/656 [28:04<04:48,  2.97s/it]

{'loss': 0.5209, 'grad_norm': 2.3769195079803467, 'learning_rate': 1.3354037267080746e-05, 'epoch': 1.7}


 85%|████████▌ | 560/656 [28:06<04:36,  2.88s/it]

{'loss': 0.6115, 'grad_norm': 2.4315719604492188, 'learning_rate': 1.3377926421404682e-05, 'epoch': 1.71}


 86%|████████▌ | 561/656 [28:09<04:30,  2.85s/it]

{'loss': 0.588, 'grad_norm': 2.9026072025299072, 'learning_rate': 1.3401815575728622e-05, 'epoch': 1.71}


 86%|████████▌ | 562/656 [28:13<05:01,  3.21s/it]

{'loss': 0.4768, 'grad_norm': 2.1351966857910156, 'learning_rate': 1.3425704730052558e-05, 'epoch': 1.71}


 86%|████████▌ | 563/656 [28:17<05:03,  3.26s/it]

{'loss': 0.5362, 'grad_norm': 2.5596542358398438, 'learning_rate': 1.3449593884376494e-05, 'epoch': 1.72}


 86%|████████▌ | 564/656 [28:19<04:40,  3.05s/it]

{'loss': 0.6433, 'grad_norm': 4.633455276489258, 'learning_rate': 1.347348303870043e-05, 'epoch': 1.72}


 86%|████████▌ | 565/656 [28:22<04:22,  2.88s/it]

{'loss': 0.5841, 'grad_norm': 2.4534220695495605, 'learning_rate': 1.3497372193024369e-05, 'epoch': 1.72}


 86%|████████▋ | 566/656 [28:24<04:12,  2.81s/it]

{'loss': 0.5162, 'grad_norm': 2.6015889644622803, 'learning_rate': 1.3521261347348305e-05, 'epoch': 1.73}


 86%|████████▋ | 567/656 [28:27<04:06,  2.77s/it]

{'loss': 0.4582, 'grad_norm': 3.346268892288208, 'learning_rate': 1.3545150501672241e-05, 'epoch': 1.73}


 87%|████████▋ | 568/656 [28:30<03:58,  2.71s/it]

{'loss': 0.6317, 'grad_norm': 3.4667577743530273, 'learning_rate': 1.3569039655996179e-05, 'epoch': 1.73}


 87%|████████▋ | 569/656 [28:32<03:50,  2.64s/it]

{'loss': 0.6908, 'grad_norm': 2.959463119506836, 'learning_rate': 1.3592928810320115e-05, 'epoch': 1.73}


 87%|████████▋ | 570/656 [28:35<03:43,  2.60s/it]

{'loss': 0.5804, 'grad_norm': 2.938615560531616, 'learning_rate': 1.3616817964644053e-05, 'epoch': 1.74}


 87%|████████▋ | 571/656 [28:37<03:47,  2.67s/it]

{'loss': 0.5992, 'grad_norm': 2.831941604614258, 'learning_rate': 1.3640707118967989e-05, 'epoch': 1.74}


 87%|████████▋ | 572/656 [28:40<03:53,  2.78s/it]

{'loss': 0.4719, 'grad_norm': 2.2059216499328613, 'learning_rate': 1.3664596273291926e-05, 'epoch': 1.74}


 87%|████████▋ | 573/656 [28:43<03:51,  2.79s/it]

{'loss': 0.5555, 'grad_norm': 2.795809030532837, 'learning_rate': 1.3688485427615862e-05, 'epoch': 1.75}


 88%|████████▊ | 574/656 [28:46<03:45,  2.76s/it]

{'loss': 0.6524, 'grad_norm': 2.8987529277801514, 'learning_rate': 1.3712374581939799e-05, 'epoch': 1.75}


 88%|████████▊ | 575/656 [28:50<04:03,  3.00s/it]

{'loss': 0.5625, 'grad_norm': 2.7991116046905518, 'learning_rate': 1.3736263736263738e-05, 'epoch': 1.75}


 88%|████████▊ | 576/656 [28:53<04:12,  3.15s/it]

{'loss': 0.6005, 'grad_norm': 4.51483678817749, 'learning_rate': 1.3760152890587674e-05, 'epoch': 1.76}


 88%|████████▊ | 577/656 [28:56<04:13,  3.20s/it]

{'loss': 0.5908, 'grad_norm': 2.299755573272705, 'learning_rate': 1.378404204491161e-05, 'epoch': 1.76}


 88%|████████▊ | 578/656 [29:00<04:18,  3.31s/it]

{'loss': 0.5023, 'grad_norm': 2.4589860439300537, 'learning_rate': 1.3807931199235546e-05, 'epoch': 1.76}


 88%|████████▊ | 579/656 [29:05<04:56,  3.85s/it]

{'loss': 0.4862, 'grad_norm': 1.8976820707321167, 'learning_rate': 1.3831820353559485e-05, 'epoch': 1.77}


 88%|████████▊ | 580/656 [29:08<04:26,  3.51s/it]

{'loss': 0.4516, 'grad_norm': 3.1675219535827637, 'learning_rate': 1.3855709507883422e-05, 'epoch': 1.77}


 89%|████████▊ | 581/656 [29:10<04:04,  3.25s/it]

{'loss': 0.6131, 'grad_norm': 2.7319767475128174, 'learning_rate': 1.3879598662207358e-05, 'epoch': 1.77}


 89%|████████▊ | 582/656 [29:13<03:46,  3.06s/it]

{'loss': 0.5655, 'grad_norm': 2.8611762523651123, 'learning_rate': 1.3903487816531297e-05, 'epoch': 1.77}


 89%|████████▉ | 583/656 [29:16<03:31,  2.90s/it]

{'loss': 0.4602, 'grad_norm': 2.717594623565674, 'learning_rate': 1.3927376970855233e-05, 'epoch': 1.78}


 89%|████████▉ | 584/656 [29:18<03:29,  2.91s/it]

{'loss': 0.7601, 'grad_norm': 3.819554567337036, 'learning_rate': 1.3951266125179169e-05, 'epoch': 1.78}


 89%|████████▉ | 585/656 [29:22<03:33,  3.01s/it]

{'loss': 0.4978, 'grad_norm': 2.125610828399658, 'learning_rate': 1.3975155279503105e-05, 'epoch': 1.78}


 89%|████████▉ | 586/656 [29:25<03:33,  3.05s/it]

{'loss': 0.664, 'grad_norm': 2.4587719440460205, 'learning_rate': 1.3999044433827045e-05, 'epoch': 1.79}


 89%|████████▉ | 587/656 [29:28<03:30,  3.05s/it]

{'loss': 0.5718, 'grad_norm': 3.285195827484131, 'learning_rate': 1.402293358815098e-05, 'epoch': 1.79}


 90%|████████▉ | 588/656 [29:31<03:19,  2.93s/it]

{'loss': 0.6057, 'grad_norm': 2.9432477951049805, 'learning_rate': 1.4046822742474917e-05, 'epoch': 1.79}


 90%|████████▉ | 589/656 [29:34<03:18,  2.96s/it]

{'loss': 0.571, 'grad_norm': 2.237096071243286, 'learning_rate': 1.4070711896798854e-05, 'epoch': 1.8}


 90%|████████▉ | 590/656 [29:36<03:12,  2.92s/it]

{'loss': 0.5247, 'grad_norm': 2.7980878353118896, 'learning_rate': 1.409460105112279e-05, 'epoch': 1.8}


 90%|█████████ | 591/656 [29:39<03:08,  2.90s/it]

{'loss': 0.6066, 'grad_norm': 2.835001230239868, 'learning_rate': 1.4118490205446726e-05, 'epoch': 1.8}


 90%|█████████ | 592/656 [29:42<03:03,  2.87s/it]

{'loss': 0.5655, 'grad_norm': 3.2510244846343994, 'learning_rate': 1.4142379359770666e-05, 'epoch': 1.8}


 90%|█████████ | 593/656 [29:46<03:13,  3.07s/it]

{'loss': 0.4821, 'grad_norm': 3.132932424545288, 'learning_rate': 1.4166268514094602e-05, 'epoch': 1.81}


 91%|█████████ | 594/656 [29:49<03:25,  3.32s/it]

{'loss': 0.5089, 'grad_norm': 2.7627944946289062, 'learning_rate': 1.4190157668418538e-05, 'epoch': 1.81}


 91%|█████████ | 595/656 [29:53<03:22,  3.32s/it]

{'loss': 0.477, 'grad_norm': 2.472287893295288, 'learning_rate': 1.4214046822742474e-05, 'epoch': 1.81}


 91%|█████████ | 596/656 [29:56<03:14,  3.24s/it]

{'loss': 0.746, 'grad_norm': 3.618598461151123, 'learning_rate': 1.4237935977066413e-05, 'epoch': 1.82}


 91%|█████████ | 597/656 [29:58<03:00,  3.06s/it]

{'loss': 0.5713, 'grad_norm': 3.2833898067474365, 'learning_rate': 1.426182513139035e-05, 'epoch': 1.82}


 91%|█████████ | 598/656 [30:01<02:53,  3.00s/it]

{'loss': 0.5614, 'grad_norm': 3.8124585151672363, 'learning_rate': 1.4285714285714285e-05, 'epoch': 1.82}


 91%|█████████▏| 599/656 [30:04<02:47,  2.94s/it]

{'loss': 0.492, 'grad_norm': 4.182800769805908, 'learning_rate': 1.4309603440038225e-05, 'epoch': 1.83}


 91%|█████████▏| 600/656 [30:07<02:44,  2.94s/it]

{'loss': 0.5524, 'grad_norm': 3.373440742492676, 'learning_rate': 1.4333492594362161e-05, 'epoch': 1.83}


 92%|█████████▏| 601/656 [30:10<02:45,  3.01s/it]

{'loss': 0.5051, 'grad_norm': 2.82375431060791, 'learning_rate': 1.4357381748686097e-05, 'epoch': 1.83}


 92%|█████████▏| 602/656 [30:13<02:37,  2.91s/it]

{'loss': 0.4665, 'grad_norm': 2.810170888900757, 'learning_rate': 1.4381270903010033e-05, 'epoch': 1.84}


 92%|█████████▏| 603/656 [30:16<02:29,  2.82s/it]

{'loss': 0.5945, 'grad_norm': 4.750021934509277, 'learning_rate': 1.4405160057333972e-05, 'epoch': 1.84}


 92%|█████████▏| 604/656 [30:18<02:23,  2.76s/it]

{'loss': 0.433, 'grad_norm': 4.077975273132324, 'learning_rate': 1.4429049211657908e-05, 'epoch': 1.84}


 92%|█████████▏| 605/656 [30:21<02:14,  2.65s/it]

{'loss': 0.4442, 'grad_norm': 2.4803547859191895, 'learning_rate': 1.4452938365981844e-05, 'epoch': 1.84}


 92%|█████████▏| 606/656 [30:23<02:13,  2.67s/it]

{'loss': 0.3969, 'grad_norm': 2.629528045654297, 'learning_rate': 1.4476827520305782e-05, 'epoch': 1.85}


 93%|█████████▎| 607/656 [30:25<02:03,  2.51s/it]

{'loss': 0.4685, 'grad_norm': 3.1748099327087402, 'learning_rate': 1.4500716674629718e-05, 'epoch': 1.85}


 93%|█████████▎| 608/656 [30:28<01:58,  2.47s/it]

{'loss': 0.4975, 'grad_norm': 2.769251585006714, 'learning_rate': 1.4524605828953656e-05, 'epoch': 1.85}


 93%|█████████▎| 609/656 [30:30<01:54,  2.43s/it]

{'loss': 0.4611, 'grad_norm': 2.0844061374664307, 'learning_rate': 1.4548494983277592e-05, 'epoch': 1.86}


 93%|█████████▎| 610/656 [30:32<01:50,  2.40s/it]

{'loss': 0.4966, 'grad_norm': 3.583847761154175, 'learning_rate': 1.457238413760153e-05, 'epoch': 1.86}


 93%|█████████▎| 611/656 [30:35<01:46,  2.38s/it]

{'loss': 0.351, 'grad_norm': 2.5047478675842285, 'learning_rate': 1.4596273291925466e-05, 'epoch': 1.86}


 93%|█████████▎| 612/656 [30:37<01:44,  2.37s/it]

{'loss': 0.412, 'grad_norm': 2.605239152908325, 'learning_rate': 1.4620162446249402e-05, 'epoch': 1.87}


 93%|█████████▎| 613/656 [30:40<01:47,  2.51s/it]

{'loss': 0.4315, 'grad_norm': 1.769179105758667, 'learning_rate': 1.4644051600573341e-05, 'epoch': 1.87}


 94%|█████████▎| 614/656 [30:43<01:46,  2.54s/it]

{'loss': 0.6088, 'grad_norm': 2.8247947692871094, 'learning_rate': 1.4667940754897277e-05, 'epoch': 1.87}


 94%|█████████▍| 615/656 [30:46<01:50,  2.69s/it]

{'loss': 0.4808, 'grad_norm': 2.2437047958374023, 'learning_rate': 1.4691829909221213e-05, 'epoch': 1.88}


 94%|█████████▍| 616/656 [30:48<01:48,  2.72s/it]

{'loss': 0.3592, 'grad_norm': 3.252869129180908, 'learning_rate': 1.4715719063545153e-05, 'epoch': 1.88}


 94%|█████████▍| 617/656 [30:52<01:53,  2.92s/it]

{'loss': 0.4949, 'grad_norm': 2.983755350112915, 'learning_rate': 1.4739608217869089e-05, 'epoch': 1.88}


 94%|█████████▍| 618/656 [30:56<02:06,  3.33s/it]

{'loss': 0.4735, 'grad_norm': 2.5467605590820312, 'learning_rate': 1.4763497372193025e-05, 'epoch': 1.88}


 94%|█████████▍| 619/656 [31:00<02:08,  3.46s/it]

{'loss': 0.5853, 'grad_norm': 3.1879773139953613, 'learning_rate': 1.478738652651696e-05, 'epoch': 1.89}


 95%|█████████▍| 620/656 [31:04<02:13,  3.71s/it]

{'loss': 0.5383, 'grad_norm': 3.6418616771698, 'learning_rate': 1.48112756808409e-05, 'epoch': 1.89}


 95%|█████████▍| 621/656 [31:09<02:18,  3.95s/it]

{'loss': 0.3957, 'grad_norm': 2.7687103748321533, 'learning_rate': 1.4835164835164836e-05, 'epoch': 1.89}


 95%|█████████▍| 622/656 [31:13<02:15,  3.98s/it]

{'loss': 0.4512, 'grad_norm': 3.729947805404663, 'learning_rate': 1.4859053989488772e-05, 'epoch': 1.9}


 95%|█████████▍| 623/656 [31:17<02:14,  4.09s/it]

{'loss': 0.5219, 'grad_norm': 4.506016254425049, 'learning_rate': 1.4882943143812712e-05, 'epoch': 1.9}


 95%|█████████▌| 624/656 [31:22<02:18,  4.34s/it]

{'loss': 0.4435, 'grad_norm': 2.5180530548095703, 'learning_rate': 1.4906832298136648e-05, 'epoch': 1.9}


 95%|█████████▌| 625/656 [31:27<02:23,  4.63s/it]

{'loss': 0.436, 'grad_norm': 2.654606819152832, 'learning_rate': 1.4930721452460584e-05, 'epoch': 1.91}


 95%|█████████▌| 626/656 [31:32<02:20,  4.68s/it]

{'loss': 0.44, 'grad_norm': 3.7908685207366943, 'learning_rate': 1.495461060678452e-05, 'epoch': 1.91}


 96%|█████████▌| 627/656 [31:36<02:11,  4.52s/it]

{'loss': 0.4141, 'grad_norm': 2.278359889984131, 'learning_rate': 1.4978499761108458e-05, 'epoch': 1.91}


 96%|█████████▌| 628/656 [31:39<01:51,  3.98s/it]

{'loss': 0.4009, 'grad_norm': 3.2223095893859863, 'learning_rate': 1.5002388915432394e-05, 'epoch': 1.91}


 96%|█████████▌| 629/656 [31:41<01:36,  3.56s/it]

{'loss': 0.5905, 'grad_norm': 4.598660469055176, 'learning_rate': 1.502627806975633e-05, 'epoch': 1.92}


 96%|█████████▌| 630/656 [31:44<01:25,  3.29s/it]

{'loss': 0.3994, 'grad_norm': 3.628114700317383, 'learning_rate': 1.5050167224080269e-05, 'epoch': 1.92}


 96%|█████████▌| 631/656 [31:47<01:16,  3.05s/it]

{'loss': 0.3843, 'grad_norm': 3.7501320838928223, 'learning_rate': 1.5074056378404205e-05, 'epoch': 1.92}


 96%|█████████▋| 632/656 [31:49<01:09,  2.89s/it]

{'loss': 0.3935, 'grad_norm': 4.21409273147583, 'learning_rate': 1.5097945532728141e-05, 'epoch': 1.93}


 96%|█████████▋| 633/656 [31:52<01:05,  2.84s/it]

{'loss': 0.3772, 'grad_norm': 2.704525947570801, 'learning_rate': 1.5121834687052077e-05, 'epoch': 1.93}


 97%|█████████▋| 634/656 [31:55<01:02,  2.83s/it]

{'loss': 0.4537, 'grad_norm': 3.251430034637451, 'learning_rate': 1.5145723841376017e-05, 'epoch': 1.93}


 97%|█████████▋| 635/656 [31:57<00:56,  2.71s/it]

{'loss': 0.5472, 'grad_norm': 2.9347617626190186, 'learning_rate': 1.5169612995699953e-05, 'epoch': 1.94}


 97%|█████████▋| 636/656 [32:00<00:52,  2.62s/it]

{'loss': 0.462, 'grad_norm': 2.415963649749756, 'learning_rate': 1.5193502150023889e-05, 'epoch': 1.94}


 97%|█████████▋| 637/656 [32:02<00:49,  2.63s/it]

{'loss': 0.4897, 'grad_norm': 2.2146975994110107, 'learning_rate': 1.5217391304347828e-05, 'epoch': 1.94}


 97%|█████████▋| 638/656 [32:05<00:48,  2.68s/it]

{'loss': 0.4404, 'grad_norm': 2.6707630157470703, 'learning_rate': 1.5241280458671764e-05, 'epoch': 1.95}


 97%|█████████▋| 639/656 [32:08<00:45,  2.69s/it]

{'loss': 0.4598, 'grad_norm': 3.0261662006378174, 'learning_rate': 1.52651696129957e-05, 'epoch': 1.95}


 98%|█████████▊| 640/656 [32:10<00:42,  2.65s/it]

{'loss': 0.4256, 'grad_norm': 3.696305513381958, 'learning_rate': 1.528905876731964e-05, 'epoch': 1.95}


 98%|█████████▊| 641/656 [32:13<00:39,  2.61s/it]

{'loss': 0.4499, 'grad_norm': 2.665825605392456, 'learning_rate': 1.5312947921643576e-05, 'epoch': 1.95}


 98%|█████████▊| 642/656 [32:16<00:37,  2.66s/it]

{'loss': 0.4043, 'grad_norm': 1.8528200387954712, 'learning_rate': 1.5336837075967512e-05, 'epoch': 1.96}


 98%|█████████▊| 643/656 [32:19<00:39,  3.03s/it]

{'loss': 0.5014, 'grad_norm': 4.5860443115234375, 'learning_rate': 1.5360726230291448e-05, 'epoch': 1.96}


 98%|█████████▊| 644/656 [32:25<00:46,  3.84s/it]

{'loss': 0.5028, 'grad_norm': 4.187246322631836, 'learning_rate': 1.5384615384615387e-05, 'epoch': 1.96}


 98%|█████████▊| 645/656 [32:31<00:50,  4.57s/it]

{'loss': 0.3568, 'grad_norm': 2.2366185188293457, 'learning_rate': 1.5408504538939323e-05, 'epoch': 1.97}


 98%|█████████▊| 646/656 [32:38<00:52,  5.27s/it]

{'loss': 0.508, 'grad_norm': 2.4087960720062256, 'learning_rate': 1.543239369326326e-05, 'epoch': 1.97}


 99%|█████████▊| 647/656 [32:45<00:52,  5.79s/it]

{'loss': 0.5405, 'grad_norm': 3.320901393890381, 'learning_rate': 1.54562828475872e-05, 'epoch': 1.97}


 99%|█████████▉| 648/656 [32:50<00:44,  5.58s/it]

{'loss': 0.3976, 'grad_norm': 2.874616861343384, 'learning_rate': 1.5480172001911135e-05, 'epoch': 1.98}


 99%|█████████▉| 649/656 [32:53<00:33,  4.82s/it]

{'loss': 0.5398, 'grad_norm': 3.0240366458892822, 'learning_rate': 1.550406115623507e-05, 'epoch': 1.98}


 99%|█████████▉| 650/656 [32:57<00:25,  4.32s/it]

{'loss': 0.304, 'grad_norm': 2.488598346710205, 'learning_rate': 1.5527950310559007e-05, 'epoch': 1.98}


 99%|█████████▉| 651/656 [33:00<00:19,  3.99s/it]

{'loss': 0.4258, 'grad_norm': 3.1077497005462646, 'learning_rate': 1.5551839464882946e-05, 'epoch': 1.98}


 99%|█████████▉| 652/656 [33:03<00:15,  3.80s/it]

{'loss': 0.4846, 'grad_norm': 3.824807643890381, 'learning_rate': 1.5575728619206882e-05, 'epoch': 1.99}


100%|█████████▉| 653/656 [33:06<00:10,  3.47s/it]

{'loss': 0.3278, 'grad_norm': 2.142326831817627, 'learning_rate': 1.5599617773530818e-05, 'epoch': 1.99}


100%|█████████▉| 654/656 [33:09<00:06,  3.47s/it]

{'loss': 0.3699, 'grad_norm': 2.4958643913269043, 'learning_rate': 1.5623506927854754e-05, 'epoch': 1.99}


100%|█████████▉| 655/656 [33:12<00:03,  3.34s/it]

{'loss': 0.3809, 'grad_norm': 3.560457706451416, 'learning_rate': 1.564739608217869e-05, 'epoch': 2.0}


100%|██████████| 656/656 [33:13<00:00,  2.53s/it]Saving model checkpoint to ./snips_token_clf/results/checkpoint-656
Configuration saved in ./snips_token_clf/results/checkpoint-656/config.json
Model weights saved in ./snips_token_clf/results/checkpoint-656/model.safetensors


{'loss': 0.2522, 'grad_norm': 4.226648330688477, 'learning_rate': 1.5671285236502626e-05, 'epoch': 2.0}



***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
                                                 
100%|██████████| 656/656 [34:42<00:00,  2.53s/it]Saving model checkpoint to ./snips_token_clf/results/checkpoint-656
Configuration saved in ./snips_token_clf/results/checkpoint-656/config.json


{'eval_loss': 0.36779090762138367, 'eval_model_preparation_time': 0.0015, 'eval_runtime': 87.5893, 'eval_samples_per_second': 29.878, 'eval_steps_per_second': 0.936, 'epoch': 2.0}


Model weights saved in ./snips_token_clf/results/checkpoint-656/model.safetensors


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./snips_token_clf/results/checkpoint-656 (score: 0.36779090762138367).
100%|██████████| 656/656 [34:45<00:00,  3.18s/it]

{'train_runtime': 2085.8323, 'train_samples_per_second': 10.036, 'train_steps_per_second': 0.315, 'train_loss': 1.8134790027014367, 'epoch': 2.0}





TrainOutput(global_step=656, training_loss=1.8134790027014367, metrics={'train_runtime': 2085.8323, 'train_samples_per_second': 10.036, 'train_steps_per_second': 0.315, 'total_flos': 202918092294960.0, 'train_loss': 1.8134790027014367, 'epoch': 2.0})

In [106]:
trainer.evaluate()


***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
100%|██████████| 82/82 [02:24<00:00,  1.76s/it]


{'eval_loss': 0.36779090762138367,
 'eval_model_preparation_time': 0.0015,
 'eval_runtime': 146.566,
 'eval_samples_per_second': 17.855,
 'eval_steps_per_second': 0.559,
 'epoch': 2.0}

In [None]:
pipe=pipeline('token-classification', token_clf_model,tokenizer=tokenizer)
pipe('Add Two Coins by dispatch to my road trip playlist')

In [107]:
pipe=pipeline('token-classification', token_clf_model,tokenizer=tokenizer)
pipe('Rate this playlist out of 5')

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Disabling tokenizer parallelism, we're using DataLoader multithreading already


[{'entity': 'B-object_select',
  'score': 0.48078132,
  'index': 2,
  'word': 'this',
  'start': 5,
  'end': 9},
 {'entity': 'B-best_rating',
  'score': 0.8649718,
  'index': 7,
  'word': '5',
  'start': 26,
  'end': 27}]