In [1]:
# Load model directly
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
)

from datasets import load_dataset


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-410m")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.bos_token = tokenizer.eos_token
print(tokenizer.pad_token)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


<|endoftext|>


In [4]:
model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-410m")
model.to("mps")
model.device

device(type='mps', index=0)

In [5]:
raw_datasets = load_dataset("iamtarun/python_code_instructions_18k_alpaca",split="train")

In [6]:
raw_datasets = raw_datasets.train_test_split(test_size=0.1)

In [7]:
print(raw_datasets)

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'prompt'],
        num_rows: 16750
    })
    test: Dataset({
        features: ['instruction', 'input', 'output', 'prompt'],
        num_rows: 1862
    })
})


In [8]:
def preprocess_function(data):
    return tokenizer(data["prompt"])


tokenized_datasets = raw_datasets.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=raw_datasets["test"].column_names,
)

Map (num_proc=4): 100%|██████████| 16750/16750 [00:01<00:00, 9583.20 examples/s] 
Map (num_proc=4): 100%|██████████| 1862/1862 [00:00<00:00, 7350.39 examples/s]


In [9]:
print(tokenized_datasets)
print(tokenized_datasets["train"][0])
print(tokenizer.decode(tokenized_datasets["train"][2]["input_ids"]))

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 16750
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1862
    })
})
{'input_ids': [30003, 310, 271, 9775, 326, 8631, 247, 4836, 15, 19566, 247, 2380, 326, 20420, 29141, 253, 2748, 15, 187, 187, 4118, 41959, 27, 187, 9395, 247, 15548, 2086, 326, 3936, 247, 6197, 285, 6548, 247, 1618, 273, 3000, 275, 30156, 474, 1340, 15, 187, 187, 4118, 19832, 27, 187, 29710, 566, 27, 50276, 510, 3158, 8516, 30013, 16780, 689, 253, 22658, 4370, 15, 187, 187, 4118, 24882, 27, 187, 36817, 426, 346, 510, 3158, 8516, 30013, 16780, 689, 253, 22658, 4370, 449, 187, 187, 4, 43531, 6197, 715, 3000, 187, 12113, 426, 6197, 15, 9148, 1082, 187, 187, 4, 17399, 1618, 273, 3000, 355, 20376, 17859, 187, 12113, 15, 15227, 1082, 187, 187, 4, 18312, 20045, 1618, 273, 3000, 187, 3845, 9, 12113, 10, 187, 187, 4, 24882, 27, 14412, 33167, 1383, 686, 21428, 40219, 686, 15736,

In [10]:
block_size = 256


def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

lm_datasets = tokenized_datasets.map(group_texts, batched=True, num_proc=4)

Map (num_proc=4): 100%|██████████| 16750/16750 [00:03<00:00, 5161.75 examples/s]
Map (num_proc=4): 100%|██████████| 1862/1862 [00:00<00:00, 6815.39 examples/s]


In [None]:
print(lm_datasets)
for i in range(5):
    print(tokenizer.decode(lm_datasets["train"][i]["input_ids"]))
    print("-----------------------------------------------")
# print(tokenizer.decode(lm_datasets["train"][0]["input_ids"]))

In [None]:
input_ids = tokenizer(["Write a Python Code for calculating 1 + 1"], return_tensors="pt").to('mps')
print("input_ids", input_ids)
generated_ids = model.generate(**input_ids, max_new_tokens=100, repetition_penalty=2.)

In [None]:
print(tokenizer.batch_decode(generated_ids)[0])

In [None]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [11]:
training_args = TrainingArguments(
    output_dir="save/3/",
    report_to='tensorboard',
    seed=4328,
    logging_steps=10,
    per_device_train_batch_size=4,
    learning_rate=5e-7,
    warmup_steps=100,
    num_train_epochs=1,
    save_total_limit= 3,
    save_strategy="steps",
    save_steps=300,
    load_best_model_at_end=True,
    evaluation_strategy="steps",
    eval_steps=300,
)


In [12]:
import evaluate
import numpy as np
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [13]:
trainer = Trainer(
    model=model,
    train_dataset=lm_datasets['train'],
    eval_dataset=lm_datasets['test'],
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
)

trainer.train(resume_from_checkpoint=True)

  7%|▋         | 210/2852 [00:13<00:26, 98.47it/s]

{'loss': 1.1376, 'grad_norm': 7.879387855529785, 'learning_rate': 4.631837307152875e-06, 'epoch': 0.07}


  8%|▊         | 220/2852 [00:26<10:15,  4.27it/s]

{'loss': 1.2102, 'grad_norm': 6.814539909362793, 'learning_rate': 4.614305750350632e-06, 'epoch': 0.08}


  8%|▊         | 230/2852 [00:38<36:29,  1.20it/s]

{'loss': 1.2134, 'grad_norm': 8.2896728515625, 'learning_rate': 4.596774193548387e-06, 'epoch': 0.08}


  8%|▊         | 240/2852 [00:51<54:40,  1.26s/it]

{'loss': 1.2069, 'grad_norm': 10.512642860412598, 'learning_rate': 4.579242636746143e-06, 'epoch': 0.08}


  9%|▉         | 250/2852 [01:04<54:40,  1.26s/it]

{'loss': 1.2325, 'grad_norm': 7.579464912414551, 'learning_rate': 4.5617110799438996e-06, 'epoch': 0.09}


  9%|▉         | 260/2852 [01:16<54:39,  1.27s/it]

{'loss': 1.1504, 'grad_norm': 8.006257057189941, 'learning_rate': 4.544179523141655e-06, 'epoch': 0.09}


  9%|▉         | 270/2852 [01:29<53:17,  1.24s/it]

{'loss': 1.2828, 'grad_norm': 7074150.5, 'learning_rate': 4.526647966339412e-06, 'epoch': 0.09}


 10%|▉         | 280/2852 [01:42<54:31,  1.27s/it]

{'loss': 1.2134, 'grad_norm': 9.696698188781738, 'learning_rate': 4.509116409537167e-06, 'epoch': 0.1}


 10%|█         | 290/2852 [01:54<54:16,  1.27s/it]

{'loss': 1.1763, 'grad_norm': 7.850350379943848, 'learning_rate': 4.491584852734923e-06, 'epoch': 0.1}


 11%|█         | 300/2852 [02:07<54:35,  1.28s/it]

{'loss': 1.1386, 'grad_norm': 7.252322196960449, 'learning_rate': 4.47405329593268e-06, 'epoch': 0.11}


                                                  
 11%|█         | 300/2852 [03:43<54:35,  1.28s/it]

{'eval_loss': 1.1051100492477417, 'eval_runtime': 96.0341, 'eval_samples_per_second': 15.359, 'eval_steps_per_second': 1.926, 'epoch': 0.11}


 11%|█         | 310/2852 [04:01<1:46:48,  2.52s/it] 

{'loss': 1.1688, 'grad_norm': 7.250029563903809, 'learning_rate': 4.456521739130435e-06, 'epoch': 0.11}


 11%|█         | 320/2852 [04:14<56:39,  1.34s/it]  

{'loss': 1.0756, 'grad_norm': 7.858718395233154, 'learning_rate': 4.438990182328191e-06, 'epoch': 0.11}


 12%|█▏        | 330/2852 [04:27<54:03,  1.29s/it]

{'loss': 1.1449, 'grad_norm': 7.380528450012207, 'learning_rate': 4.421458625525947e-06, 'epoch': 0.12}


 12%|█▏        | 340/2852 [04:40<53:58,  1.29s/it]

{'loss': 1.2301, 'grad_norm': 6.852322578430176, 'learning_rate': 4.403927068723703e-06, 'epoch': 0.12}


 12%|█▏        | 350/2852 [04:53<54:23,  1.30s/it]

{'loss': 1.2124, 'grad_norm': 8.041778564453125, 'learning_rate': 4.386395511921459e-06, 'epoch': 0.12}


 13%|█▎        | 360/2852 [05:06<54:23,  1.31s/it]

{'loss': 1.1333, 'grad_norm': 7.327611923217773, 'learning_rate': 4.368863955119214e-06, 'epoch': 0.13}


 13%|█▎        | 370/2852 [05:19<53:49,  1.30s/it]

{'loss': 1.0364, 'grad_norm': 7.219849109649658, 'learning_rate': 4.351332398316971e-06, 'epoch': 0.13}


 13%|█▎        | 380/2852 [05:32<53:20,  1.29s/it]

{'loss': 1.0869, 'grad_norm': 7.1732378005981445, 'learning_rate': 4.333800841514727e-06, 'epoch': 0.13}


 14%|█▎        | 390/2852 [05:45<52:15,  1.27s/it]

{'loss': 1.174, 'grad_norm': 8.740118026733398, 'learning_rate': 4.316269284712482e-06, 'epoch': 0.14}


 14%|█▍        | 400/2852 [05:57<52:44,  1.29s/it]

{'loss': 1.0509, 'grad_norm': 9.240239143371582, 'learning_rate': 4.298737727910239e-06, 'epoch': 0.14}


                                                  
 14%|█▍        | 400/2852 [07:34<52:44,  1.29s/it]

{'eval_loss': 1.1027086973190308, 'eval_runtime': 96.5662, 'eval_samples_per_second': 15.274, 'eval_steps_per_second': 1.916, 'epoch': 0.14}


 14%|█▍        | 410/2852 [07:53<1:42:49,  2.53s/it] 

{'loss': 1.1741, 'grad_norm': 7.313863277435303, 'learning_rate': 4.281206171107995e-06, 'epoch': 0.14}


 15%|█▍        | 420/2852 [08:06<55:59,  1.38s/it]  

{'loss': 1.0996, 'grad_norm': 7.653791427612305, 'learning_rate': 4.263674614305751e-06, 'epoch': 0.15}


 15%|█▌        | 430/2852 [08:20<55:18,  1.37s/it]

{'loss': 1.1613, 'grad_norm': 6244042.0, 'learning_rate': 4.2461430575035066e-06, 'epoch': 0.15}


 15%|█▌        | 440/2852 [08:34<53:27,  1.33s/it]

{'loss': 1.1345, 'grad_norm': 8.154988288879395, 'learning_rate': 4.228611500701262e-06, 'epoch': 0.15}


 16%|█▌        | 450/2852 [08:47<53:41,  1.34s/it]

{'loss': 1.1024, 'grad_norm': 7.831408977508545, 'learning_rate': 4.211079943899019e-06, 'epoch': 0.16}


 16%|█▌        | 460/2852 [09:00<53:04,  1.33s/it]

{'loss': 1.1009, 'grad_norm': 7.878403663635254, 'learning_rate': 4.193548387096774e-06, 'epoch': 0.16}


 16%|█▋        | 470/2852 [09:14<53:21,  1.34s/it]

{'loss': 1.1835, 'grad_norm': 7.80277681350708, 'learning_rate': 4.17601683029453e-06, 'epoch': 0.16}


 17%|█▋        | 480/2852 [09:27<50:46,  1.28s/it]

{'loss': 0.9986, 'grad_norm': 7.077828884124756, 'learning_rate': 4.1584852734922866e-06, 'epoch': 0.17}


 17%|█▋        | 490/2852 [09:40<52:17,  1.33s/it]

{'loss': 1.2441, 'grad_norm': 7.250613689422607, 'learning_rate': 4.140953716690042e-06, 'epoch': 0.17}


 18%|█▊        | 500/2852 [09:53<51:35,  1.32s/it]

{'loss': 1.1382, 'grad_norm': 7.127778053283691, 'learning_rate': 4.123422159887798e-06, 'epoch': 0.18}


                                                  
 18%|█▊        | 500/2852 [11:36<51:35,  1.32s/it]

{'eval_loss': 1.0821586847305298, 'eval_runtime': 102.9918, 'eval_samples_per_second': 14.322, 'eval_steps_per_second': 1.796, 'epoch': 0.18}


 18%|█▊        | 510/2852 [11:55<1:42:58,  2.64s/it] 

{'loss': 1.256, 'grad_norm': 9.239830017089844, 'learning_rate': 4.105890603085554e-06, 'epoch': 0.18}


 18%|█▊        | 520/2852 [12:08<53:19,  1.37s/it]  

{'loss': 1.2098, 'grad_norm': 8.628807067871094, 'learning_rate': 4.08835904628331e-06, 'epoch': 0.18}


 19%|█▊        | 530/2852 [12:22<51:50,  1.34s/it]

{'loss': 1.2308, 'grad_norm': 8.674908638000488, 'learning_rate': 4.0708274894810666e-06, 'epoch': 0.19}


 19%|█▉        | 540/2852 [12:35<50:28,  1.31s/it]

{'loss': 1.1641, 'grad_norm': 7.262811183929443, 'learning_rate': 4.053295932678822e-06, 'epoch': 0.19}


 19%|█▉        | 550/2852 [12:48<50:37,  1.32s/it]

{'loss': 1.1419, 'grad_norm': 5.968109607696533, 'learning_rate': 4.035764375876578e-06, 'epoch': 0.19}


 20%|█▉        | 560/2852 [13:01<50:15,  1.32s/it]

{'loss': 1.0683, 'grad_norm': 7.737303256988525, 'learning_rate': 4.018232819074334e-06, 'epoch': 0.2}


 20%|█▉        | 570/2852 [13:14<50:48,  1.34s/it]

{'loss': 1.1573, 'grad_norm': 6.942831516265869, 'learning_rate': 4.00070126227209e-06, 'epoch': 0.2}


 20%|██        | 580/2852 [13:28<50:17,  1.33s/it]

{'loss': 1.1392, 'grad_norm': 6.186731338500977, 'learning_rate': 3.983169705469846e-06, 'epoch': 0.2}


 21%|██        | 590/2852 [13:41<49:39,  1.32s/it]

{'loss': 1.0707, 'grad_norm': 5.613446235656738, 'learning_rate': 3.965638148667602e-06, 'epoch': 0.21}


 21%|██        | 600/2852 [13:54<49:32,  1.32s/it]

{'loss': 1.0271, 'grad_norm': 6.495173931121826, 'learning_rate': 3.948106591865358e-06, 'epoch': 0.21}


                                                  
 21%|██        | 600/2852 [15:38<49:32,  1.32s/it]

{'eval_loss': 1.071543574333191, 'eval_runtime': 103.9755, 'eval_samples_per_second': 14.186, 'eval_steps_per_second': 1.779, 'epoch': 0.21}


 21%|██▏       | 610/2852 [15:57<1:39:35,  2.67s/it] 

{'loss': 1.2135, 'grad_norm': 7.40066385269165, 'learning_rate': 3.9305750350631136e-06, 'epoch': 0.21}


 22%|██▏       | 620/2852 [16:10<50:09,  1.35s/it]  

{'loss': 1.1041, 'grad_norm': 6.687857151031494, 'learning_rate': 3.91304347826087e-06, 'epoch': 0.22}


 22%|██▏       | 630/2852 [16:24<48:41,  1.31s/it]

{'loss': 1.2036, 'grad_norm': 7.267580986022949, 'learning_rate': 3.895511921458626e-06, 'epoch': 0.22}


 22%|██▏       | 640/2852 [16:37<49:28,  1.34s/it]

{'loss': 1.2825, 'grad_norm': 6.960211277008057, 'learning_rate': 3.877980364656382e-06, 'epoch': 0.22}


 23%|██▎       | 650/2852 [16:50<49:10,  1.34s/it]

{'loss': 1.1042, 'grad_norm': 6.238255977630615, 'learning_rate': 3.860448807854138e-06, 'epoch': 0.23}


 23%|██▎       | 660/2852 [17:03<47:46,  1.31s/it]

{'loss': 1.067, 'grad_norm': 6.0595808029174805, 'learning_rate': 3.8429172510518936e-06, 'epoch': 0.23}


 23%|██▎       | 670/2852 [17:17<47:56,  1.32s/it]

{'loss': 1.1433, 'grad_norm': 5.321854591369629, 'learning_rate': 3.82538569424965e-06, 'epoch': 0.23}


 24%|██▍       | 680/2852 [17:30<47:38,  1.32s/it]

{'loss': 1.0131, 'grad_norm': 6.474616527557373, 'learning_rate': 3.8078541374474053e-06, 'epoch': 0.24}


 24%|██▍       | 690/2852 [17:43<48:11,  1.34s/it]

{'loss': 1.2424, 'grad_norm': 6.908649921417236, 'learning_rate': 3.7903225806451614e-06, 'epoch': 0.24}


 25%|██▍       | 700/2852 [17:56<46:56,  1.31s/it]

{'loss': 1.1529, 'grad_norm': 9.850557327270508, 'learning_rate': 3.772791023842918e-06, 'epoch': 0.25}


                                                  
 25%|██▍       | 700/2852 [19:34<46:56,  1.31s/it]

{'eval_loss': 1.0592529773712158, 'eval_runtime': 97.4757, 'eval_samples_per_second': 15.132, 'eval_steps_per_second': 1.898, 'epoch': 0.25}


 25%|██▍       | 710/2852 [19:53<1:30:51,  2.54s/it] 

{'loss': 1.2011, 'grad_norm': 6.692425727844238, 'learning_rate': 3.755259467040673e-06, 'epoch': 0.25}


 25%|██▌       | 720/2852 [20:06<46:45,  1.32s/it]  

{'loss': 1.1074, 'grad_norm': 7.36292839050293, 'learning_rate': 3.7377279102384297e-06, 'epoch': 0.25}


 26%|██▌       | 730/2852 [20:19<45:33,  1.29s/it]

{'loss': 1.0843, 'grad_norm': 6.695185661315918, 'learning_rate': 3.7201963534361857e-06, 'epoch': 0.26}


 26%|██▌       | 740/2852 [20:32<45:25,  1.29s/it]

{'loss': 1.093, 'grad_norm': 6.148845195770264, 'learning_rate': 3.7026647966339414e-06, 'epoch': 0.26}


 26%|██▋       | 750/2852 [20:45<45:01,  1.28s/it]

{'loss': 1.0563, 'grad_norm': 6.2475738525390625, 'learning_rate': 3.6851332398316975e-06, 'epoch': 0.26}


 27%|██▋       | 760/2852 [20:58<44:48,  1.29s/it]

{'loss': 1.114, 'grad_norm': 6.959306240081787, 'learning_rate': 3.667601683029453e-06, 'epoch': 0.27}


 27%|██▋       | 770/2852 [21:10<44:49,  1.29s/it]

{'loss': 1.1043, 'grad_norm': 6.902063369750977, 'learning_rate': 3.6500701262272092e-06, 'epoch': 0.27}


 27%|██▋       | 780/2852 [21:23<44:09,  1.28s/it]

{'loss': 1.0791, 'grad_norm': 6.695148944854736, 'learning_rate': 3.6325385694249653e-06, 'epoch': 0.27}


 28%|██▊       | 790/2852 [21:36<44:05,  1.28s/it]

{'loss': 1.1645, 'grad_norm': 7.014420986175537, 'learning_rate': 3.615007012622721e-06, 'epoch': 0.28}


 28%|██▊       | 800/2852 [21:49<43:44,  1.28s/it]

{'loss': 1.2845, 'grad_norm': 8.202850341796875, 'learning_rate': 3.597475455820477e-06, 'epoch': 0.28}


                                                  
 28%|██▊       | 800/2852 [23:25<43:44,  1.28s/it]

{'eval_loss': 1.063850998878479, 'eval_runtime': 95.934, 'eval_samples_per_second': 15.375, 'eval_steps_per_second': 1.928, 'epoch': 0.28}


 28%|██▊       | 810/2852 [23:44<1:26:20,  2.54s/it] 

{'loss': 0.9959, 'grad_norm': 5.561209678649902, 'learning_rate': 3.5799438990182336e-06, 'epoch': 0.28}


 29%|██▉       | 820/2852 [23:57<43:59,  1.30s/it]  

{'loss': 1.1564, 'grad_norm': 6360601.0, 'learning_rate': 3.562412342215989e-06, 'epoch': 0.29}


 29%|██▉       | 830/2852 [24:10<42:08,  1.25s/it]

{'loss': 1.1292, 'grad_norm': 5440628.5, 'learning_rate': 3.5448807854137453e-06, 'epoch': 0.29}


 29%|██▉       | 840/2852 [24:23<43:35,  1.30s/it]

{'loss': 1.1611, 'grad_norm': 9.811912536621094, 'learning_rate': 3.5273492286115006e-06, 'epoch': 0.29}


 30%|██▉       | 850/2852 [24:36<42:46,  1.28s/it]

{'loss': 1.2417, 'grad_norm': 10.225603103637695, 'learning_rate': 3.509817671809257e-06, 'epoch': 0.3}


 30%|███       | 860/2852 [24:49<42:46,  1.29s/it]

{'loss': 1.0887, 'grad_norm': 7.402224540710449, 'learning_rate': 3.492286115007013e-06, 'epoch': 0.3}


 31%|███       | 870/2852 [25:01<42:34,  1.29s/it]

{'loss': 1.1854, 'grad_norm': 7.173348903656006, 'learning_rate': 3.474754558204769e-06, 'epoch': 0.31}


 31%|███       | 880/2852 [25:14<42:33,  1.29s/it]

{'loss': 1.1996, 'grad_norm': 8692483.0, 'learning_rate': 3.457223001402525e-06, 'epoch': 0.31}


 31%|███       | 890/2852 [25:27<41:32,  1.27s/it]

{'loss': 1.0917, 'grad_norm': 7785624.0, 'learning_rate': 3.439691444600281e-06, 'epoch': 0.31}


 32%|███▏      | 900/2852 [25:40<42:26,  1.30s/it]

{'loss': 1.2181, 'grad_norm': 9.398615837097168, 'learning_rate': 3.4221598877980366e-06, 'epoch': 0.32}


                                                  
 32%|███▏      | 900/2852 [27:25<42:26,  1.30s/it]

{'eval_loss': 1.0739060640335083, 'eval_runtime': 105.2211, 'eval_samples_per_second': 14.018, 'eval_steps_per_second': 1.758, 'epoch': 0.32}


 32%|███▏      | 910/2852 [27:45<1:25:44,  2.65s/it] 

{'loss': 1.1252, 'grad_norm': 7353920.5, 'learning_rate': 3.4046283309957927e-06, 'epoch': 0.32}


 32%|███▏      | 920/2852 [27:59<44:16,  1.37s/it]  

{'loss': 1.1208, 'grad_norm': 6.142120838165283, 'learning_rate': 3.3870967741935484e-06, 'epoch': 0.32}


 33%|███▎      | 930/2852 [28:12<42:50,  1.34s/it]

{'loss': 1.241, 'grad_norm': 8.680566787719727, 'learning_rate': 3.3695652173913045e-06, 'epoch': 0.33}


 33%|███▎      | 940/2852 [28:25<42:58,  1.35s/it]

{'loss': 1.0964, 'grad_norm': 7.286712646484375, 'learning_rate': 3.3520336605890606e-06, 'epoch': 0.33}


 33%|███▎      | 950/2852 [28:38<41:27,  1.31s/it]

{'loss': 1.1602, 'grad_norm': 6.9807000160217285, 'learning_rate': 3.3345021037868162e-06, 'epoch': 0.33}


 34%|███▎      | 960/2852 [28:51<41:17,  1.31s/it]

{'loss': 1.0572, 'grad_norm': 7.200172424316406, 'learning_rate': 3.3169705469845727e-06, 'epoch': 0.34}


 34%|███▍      | 970/2852 [29:05<41:06,  1.31s/it]

{'loss': 1.1212, 'grad_norm': 7.873734474182129, 'learning_rate': 3.299438990182329e-06, 'epoch': 0.34}


 34%|███▍      | 980/2852 [29:18<41:01,  1.31s/it]

{'loss': 1.0628, 'grad_norm': 7.691503047943115, 'learning_rate': 3.2819074333800845e-06, 'epoch': 0.34}


 35%|███▍      | 990/2852 [29:31<40:53,  1.32s/it]

{'loss': 1.1621, 'grad_norm': 5.634706497192383, 'learning_rate': 3.2643758765778406e-06, 'epoch': 0.35}


 35%|███▌      | 1000/2852 [29:44<40:39,  1.32s/it]

{'loss': 1.1011, 'grad_norm': 7.326597213745117, 'learning_rate': 3.2468443197755962e-06, 'epoch': 0.35}


                                                   
 35%|███▌      | 1000/2852 [31:28<40:39,  1.32s/it]

{'eval_loss': 1.0531072616577148, 'eval_runtime': 103.6643, 'eval_samples_per_second': 14.229, 'eval_steps_per_second': 1.785, 'epoch': 0.35}


 35%|███▌      | 1010/2852 [31:47<1:21:29,  2.65s/it] 

{'loss': 1.0641, 'grad_norm': 6.565316677093506, 'learning_rate': 3.2293127629733523e-06, 'epoch': 0.35}


 36%|███▌      | 1020/2852 [32:00<41:37,  1.36s/it]  

{'loss': 1.0127, 'grad_norm': 5.446836471557617, 'learning_rate': 3.2117812061711084e-06, 'epoch': 0.36}


 36%|███▌      | 1030/2852 [32:13<40:46,  1.34s/it]

{'loss': 1.1784, 'grad_norm': 6.93111515045166, 'learning_rate': 3.194249649368864e-06, 'epoch': 0.36}


 36%|███▋      | 1040/2852 [32:27<40:07,  1.33s/it]

{'loss': 1.1021, 'grad_norm': 5.979379653930664, 'learning_rate': 3.17671809256662e-06, 'epoch': 0.36}


 37%|███▋      | 1050/2852 [32:40<39:19,  1.31s/it]

{'loss': 1.0366, 'grad_norm': 6.863144874572754, 'learning_rate': 3.1591865357643762e-06, 'epoch': 0.37}


 37%|███▋      | 1060/2852 [32:53<38:29,  1.29s/it]

{'loss': 1.1785, 'grad_norm': 4722932.0, 'learning_rate': 3.141654978962132e-06, 'epoch': 0.37}


 38%|███▊      | 1070/2852 [33:06<38:35,  1.30s/it]

{'loss': 1.0695, 'grad_norm': 6.365395545959473, 'learning_rate': 3.124123422159888e-06, 'epoch': 0.38}


 38%|███▊      | 1080/2852 [33:19<39:07,  1.32s/it]

{'loss': 1.2302, 'grad_norm': 7.053689956665039, 'learning_rate': 3.1065918653576436e-06, 'epoch': 0.38}


 38%|███▊      | 1090/2852 [33:32<38:55,  1.33s/it]

{'loss': 1.1419, 'grad_norm': 7.669309139251709, 'learning_rate': 3.0890603085554e-06, 'epoch': 0.38}


 39%|███▊      | 1100/2852 [33:46<38:05,  1.30s/it]

{'loss': 1.0485, 'grad_norm': 6.889869213104248, 'learning_rate': 3.0715287517531562e-06, 'epoch': 0.39}


                                                   
 39%|███▊      | 1100/2852 [35:28<38:05,  1.30s/it]

{'eval_loss': 1.0452560186386108, 'eval_runtime': 102.6572, 'eval_samples_per_second': 14.368, 'eval_steps_per_second': 1.802, 'epoch': 0.39}


 39%|███▉      | 1110/2852 [35:49<1:16:48,  2.65s/it] 

{'loss': 1.0726, 'grad_norm': 5.56788444519043, 'learning_rate': 3.053997194950912e-06, 'epoch': 0.39}


 39%|███▉      | 1120/2852 [36:02<38:26,  1.33s/it]  

{'loss': 1.0217, 'grad_norm': 5.922815322875977, 'learning_rate': 3.036465638148668e-06, 'epoch': 0.39}


 40%|███▉      | 1130/2852 [36:15<36:26,  1.27s/it]

{'loss': 1.1317, 'grad_norm': 6104594.0, 'learning_rate': 3.018934081346424e-06, 'epoch': 0.4}


 40%|███▉      | 1140/2852 [36:29<37:41,  1.32s/it]

{'loss': 1.0208, 'grad_norm': 6.302035808563232, 'learning_rate': 3.0014025245441797e-06, 'epoch': 0.4}


 40%|████      | 1150/2852 [36:42<37:12,  1.31s/it]

{'loss': 1.0702, 'grad_norm': 6.673868656158447, 'learning_rate': 2.983870967741936e-06, 'epoch': 0.4}


 41%|████      | 1160/2852 [36:55<37:03,  1.31s/it]

{'loss': 1.0544, 'grad_norm': 4927313.5, 'learning_rate': 2.9663394109396915e-06, 'epoch': 0.41}


 41%|████      | 1170/2852 [37:08<36:39,  1.31s/it]

{'loss': 1.179, 'grad_norm': 6.536174297332764, 'learning_rate': 2.9488078541374476e-06, 'epoch': 0.41}


 41%|████▏     | 1180/2852 [37:22<36:57,  1.33s/it]

{'loss': 1.0631, 'grad_norm': 6.202348232269287, 'learning_rate': 2.9312762973352036e-06, 'epoch': 0.41}


 42%|████▏     | 1190/2852 [37:35<36:59,  1.34s/it]

{'loss': 0.9849, 'grad_norm': 5.987925052642822, 'learning_rate': 2.9137447405329593e-06, 'epoch': 0.42}


 42%|████▏     | 1200/2852 [37:48<36:46,  1.34s/it]

{'loss': 1.0998, 'grad_norm': 7.02029275894165, 'learning_rate': 2.8962131837307154e-06, 'epoch': 0.42}


                                                   
 42%|████▏     | 1200/2852 [39:31<36:46,  1.34s/it]

{'eval_loss': 1.0440841913223267, 'eval_runtime': 103.2915, 'eval_samples_per_second': 14.28, 'eval_steps_per_second': 1.791, 'epoch': 0.42}


 42%|████▏     | 1210/2852 [39:53<1:13:03,  2.67s/it] 

{'loss': 1.2137, 'grad_norm': 7.5716166496276855, 'learning_rate': 2.878681626928472e-06, 'epoch': 0.42}


 43%|████▎     | 1220/2852 [40:06<37:05,  1.36s/it]  

{'loss': 1.0472, 'grad_norm': 5.936100959777832, 'learning_rate': 2.8611500701262276e-06, 'epoch': 0.43}


 43%|████▎     | 1230/2852 [40:19<34:41,  1.28s/it]

{'loss': 1.1352, 'grad_norm': 7.6807708740234375, 'learning_rate': 2.8436185133239836e-06, 'epoch': 0.43}


 43%|████▎     | 1240/2852 [40:32<36:22,  1.35s/it]

{'loss': 1.2082, 'grad_norm': 7.279677391052246, 'learning_rate': 2.8260869565217393e-06, 'epoch': 0.43}


 44%|████▍     | 1250/2852 [40:45<35:16,  1.32s/it]

{'loss': 1.0571, 'grad_norm': 6.202648162841797, 'learning_rate': 2.8085553997194954e-06, 'epoch': 0.44}


 44%|████▍     | 1260/2852 [40:59<35:40,  1.34s/it]

{'loss': 1.0686, 'grad_norm': 7.071532726287842, 'learning_rate': 2.7910238429172515e-06, 'epoch': 0.44}


 45%|████▍     | 1270/2852 [41:12<34:38,  1.31s/it]

{'loss': 1.0367, 'grad_norm': 6.144183158874512, 'learning_rate': 2.773492286115007e-06, 'epoch': 0.45}


 45%|████▍     | 1280/2852 [41:25<34:38,  1.32s/it]

{'loss': 1.0668, 'grad_norm': 6.363819599151611, 'learning_rate': 2.7559607293127632e-06, 'epoch': 0.45}


 45%|████▌     | 1290/2852 [41:38<34:21,  1.32s/it]

{'loss': 1.1941, 'grad_norm': 6.374659061431885, 'learning_rate': 2.7384291725105193e-06, 'epoch': 0.45}


 46%|████▌     | 1300/2852 [41:51<33:50,  1.31s/it]

{'loss': 1.0719, 'grad_norm': 6.081364631652832, 'learning_rate': 2.720897615708275e-06, 'epoch': 0.46}


                                                   
 46%|████▌     | 1300/2852 [43:34<33:50,  1.31s/it]

{'eval_loss': 1.0383609533309937, 'eval_runtime': 102.9615, 'eval_samples_per_second': 14.326, 'eval_steps_per_second': 1.797, 'epoch': 0.46}


 46%|████▌     | 1310/2852 [43:54<1:07:42,  2.63s/it] 

{'loss': 1.1689, 'grad_norm': 6.158447742462158, 'learning_rate': 2.703366058906031e-06, 'epoch': 0.46}


 46%|████▋     | 1320/2852 [44:07<34:03,  1.33s/it]  

{'loss': 1.1764, 'grad_norm': 6.292094707489014, 'learning_rate': 2.6858345021037867e-06, 'epoch': 0.46}


 47%|████▋     | 1330/2852 [44:20<32:41,  1.29s/it]

{'loss': 1.0372, 'grad_norm': 6.929604530334473, 'learning_rate': 2.668302945301543e-06, 'epoch': 0.47}


 47%|████▋     | 1340/2852 [44:33<32:57,  1.31s/it]

{'loss': 1.1121, 'grad_norm': 7.014143466949463, 'learning_rate': 2.6507713884992993e-06, 'epoch': 0.47}


 47%|████▋     | 1350/2852 [44:46<33:10,  1.33s/it]

{'loss': 0.9202, 'grad_norm': 5.973657131195068, 'learning_rate': 2.633239831697055e-06, 'epoch': 0.47}


 48%|████▊     | 1360/2852 [44:59<31:52,  1.28s/it]

{'loss': 1.1024, 'grad_norm': 7.404268741607666, 'learning_rate': 2.615708274894811e-06, 'epoch': 0.48}


 48%|████▊     | 1370/2852 [45:12<32:03,  1.30s/it]

{'loss': 1.1167, 'grad_norm': 5.9398512840271, 'learning_rate': 2.598176718092567e-06, 'epoch': 0.48}


 48%|████▊     | 1380/2852 [45:25<31:35,  1.29s/it]

{'loss': 1.0171, 'grad_norm': 4945047.5, 'learning_rate': 2.580645161290323e-06, 'epoch': 0.48}


 49%|████▊     | 1390/2852 [45:38<31:38,  1.30s/it]

{'loss': 1.1371, 'grad_norm': 6.811812400817871, 'learning_rate': 2.563113604488079e-06, 'epoch': 0.49}


 49%|████▉     | 1400/2852 [45:51<31:38,  1.31s/it]

{'loss': 1.0486, 'grad_norm': 5.959235191345215, 'learning_rate': 2.5455820476858346e-06, 'epoch': 0.49}


                                                   
 49%|████▉     | 1400/2852 [47:31<31:38,  1.31s/it]

{'eval_loss': 1.0362818241119385, 'eval_runtime': 100.4241, 'eval_samples_per_second': 14.688, 'eval_steps_per_second': 1.842, 'epoch': 0.49}


 49%|████▉     | 1410/2852 [47:50<1:01:55,  2.58s/it] 

{'loss': 1.0868, 'grad_norm': 6.392420768737793, 'learning_rate': 2.5280504908835906e-06, 'epoch': 0.49}


 50%|████▉     | 1420/2852 [48:03<32:14,  1.35s/it]  

{'loss': 0.9982, 'grad_norm': 7.04593563079834, 'learning_rate': 2.5105189340813467e-06, 'epoch': 0.5}


 50%|█████     | 1430/2852 [48:16<30:27,  1.28s/it]

{'loss': 1.1232, 'grad_norm': 6.606487274169922, 'learning_rate': 2.4929873772791024e-06, 'epoch': 0.5}


 50%|█████     | 1440/2852 [48:29<31:08,  1.32s/it]

{'loss': 1.0233, 'grad_norm': 5.266455173492432, 'learning_rate': 2.4754558204768585e-06, 'epoch': 0.5}


 51%|█████     | 1450/2852 [48:42<30:46,  1.32s/it]

{'loss': 1.0896, 'grad_norm': 6.5988569259643555, 'learning_rate': 2.4579242636746146e-06, 'epoch': 0.51}


 51%|█████     | 1460/2852 [48:55<30:10,  1.30s/it]

{'loss': 1.019, 'grad_norm': 5.690179347991943, 'learning_rate': 2.4403927068723702e-06, 'epoch': 0.51}


 52%|█████▏    | 1470/2852 [49:08<30:10,  1.31s/it]

{'loss': 1.0734, 'grad_norm': 6.952198028564453, 'learning_rate': 2.4228611500701263e-06, 'epoch': 0.52}


 52%|█████▏    | 1480/2852 [49:21<29:51,  1.31s/it]

{'loss': 1.0349, 'grad_norm': 6.25234842300415, 'learning_rate': 2.4053295932678824e-06, 'epoch': 0.52}


 52%|█████▏    | 1490/2852 [49:34<29:38,  1.31s/it]

{'loss': 0.9822, 'grad_norm': 5.241644382476807, 'learning_rate': 2.3877980364656385e-06, 'epoch': 0.52}


 53%|█████▎    | 1500/2852 [49:48<29:45,  1.32s/it]

{'loss': 1.0815, 'grad_norm': 7.465898036956787, 'learning_rate': 2.370266479663394e-06, 'epoch': 0.53}


                                                   
 53%|█████▎    | 1500/2852 [51:30<29:45,  1.32s/it]

{'eval_loss': 1.029735803604126, 'eval_runtime': 102.2803, 'eval_samples_per_second': 14.421, 'eval_steps_per_second': 1.809, 'epoch': 0.53}


 53%|█████▎    | 1510/2852 [51:48<58:42,  2.62s/it]   

{'loss': 1.0825, 'grad_norm': 7.248540878295898, 'learning_rate': 2.3527349228611502e-06, 'epoch': 0.53}


 53%|█████▎    | 1520/2852 [52:01<30:20,  1.37s/it]

{'loss': 1.138, 'grad_norm': 7.080415725708008, 'learning_rate': 2.3352033660589063e-06, 'epoch': 0.53}


 54%|█████▎    | 1530/2852 [52:15<28:58,  1.31s/it]

{'loss': 1.0827, 'grad_norm': 6.846385478973389, 'learning_rate': 2.3176718092566624e-06, 'epoch': 0.54}


 54%|█████▍    | 1540/2852 [52:28<28:53,  1.32s/it]

{'loss': 1.1159, 'grad_norm': 6.500289440155029, 'learning_rate': 2.300140252454418e-06, 'epoch': 0.54}


 54%|█████▍    | 1550/2852 [52:41<28:20,  1.31s/it]

{'loss': 1.0947, 'grad_norm': 6.126500129699707, 'learning_rate': 2.282608695652174e-06, 'epoch': 0.54}


 55%|█████▍    | 1560/2852 [52:54<28:13,  1.31s/it]

{'loss': 1.1694, 'grad_norm': 6.212851047515869, 'learning_rate': 2.2650771388499302e-06, 'epoch': 0.55}


 55%|█████▌    | 1570/2852 [53:07<28:34,  1.34s/it]

{'loss': 1.0639, 'grad_norm': 7.268774032592773, 'learning_rate': 2.247545582047686e-06, 'epoch': 0.55}


 55%|█████▌    | 1580/2852 [53:20<27:57,  1.32s/it]

{'loss': 1.0737, 'grad_norm': 5.914583683013916, 'learning_rate': 2.230014025245442e-06, 'epoch': 0.55}


 56%|█████▌    | 1590/2852 [53:33<26:45,  1.27s/it]

{'loss': 1.2088, 'grad_norm': 4932876.5, 'learning_rate': 2.2124824684431976e-06, 'epoch': 0.56}


 56%|█████▌    | 1600/2852 [53:46<26:47,  1.28s/it]

{'loss': 1.1601, 'grad_norm': 7.325890064239502, 'learning_rate': 2.194950911640954e-06, 'epoch': 0.56}


                                                   
 56%|█████▌    | 1600/2852 [55:21<26:47,  1.28s/it]

{'eval_loss': 1.0319335460662842, 'eval_runtime': 95.2866, 'eval_samples_per_second': 15.48, 'eval_steps_per_second': 1.942, 'epoch': 0.56}


 56%|█████▋    | 1610/2852 [55:39<51:48,  2.50s/it]   

{'loss': 1.0825, 'grad_norm': 5.890854835510254, 'learning_rate': 2.17741935483871e-06, 'epoch': 0.56}


 57%|█████▋    | 1620/2852 [55:52<26:54,  1.31s/it]

{'loss': 0.9404, 'grad_norm': 6.303311824798584, 'learning_rate': 2.159887798036466e-06, 'epoch': 0.57}


 57%|█████▋    | 1630/2852 [56:04<26:19,  1.29s/it]

{'loss': 1.1342, 'grad_norm': 6.603764533996582, 'learning_rate': 2.1423562412342215e-06, 'epoch': 0.57}


 58%|█████▊    | 1640/2852 [56:17<26:01,  1.29s/it]

{'loss': 1.0102, 'grad_norm': 5.554875373840332, 'learning_rate': 2.1248246844319776e-06, 'epoch': 0.58}


 58%|█████▊    | 1650/2852 [56:30<25:18,  1.26s/it]

{'loss': 0.9836, 'grad_norm': 8376777.0, 'learning_rate': 2.1072931276297337e-06, 'epoch': 0.58}


 58%|█████▊    | 1660/2852 [56:43<25:30,  1.28s/it]

{'loss': 1.0112, 'grad_norm': 6.0961012840271, 'learning_rate': 2.08976157082749e-06, 'epoch': 0.58}


 59%|█████▊    | 1670/2852 [56:56<25:15,  1.28s/it]

{'loss': 0.9685, 'grad_norm': 6.716639995574951, 'learning_rate': 2.0722300140252455e-06, 'epoch': 0.59}


 59%|█████▉    | 1680/2852 [57:08<25:03,  1.28s/it]

{'loss': 1.1558, 'grad_norm': 7.0086588859558105, 'learning_rate': 2.0546984572230016e-06, 'epoch': 0.59}


 59%|█████▉    | 1690/2852 [57:21<24:31,  1.27s/it]

{'loss': 1.0596, 'grad_norm': 7162541.5, 'learning_rate': 2.0371669004207576e-06, 'epoch': 0.59}


 60%|█████▉    | 1700/2852 [57:34<24:47,  1.29s/it]

{'loss': 1.1106, 'grad_norm': 6.902767181396484, 'learning_rate': 2.0196353436185133e-06, 'epoch': 0.6}


                                                   
 60%|█████▉    | 1700/2852 [59:10<24:47,  1.29s/it]

{'eval_loss': 1.0248204469680786, 'eval_runtime': 95.7144, 'eval_samples_per_second': 15.41, 'eval_steps_per_second': 1.933, 'epoch': 0.6}


 60%|█████▉    | 1710/2852 [59:28<47:47,  2.51s/it]   

{'loss': 1.0542, 'grad_norm': 5.095073223114014, 'learning_rate': 2.0021037868162694e-06, 'epoch': 0.6}


 60%|██████    | 1720/2852 [59:40<24:49,  1.32s/it]

{'loss': 0.9844, 'grad_norm': 6.309049606323242, 'learning_rate': 1.9845722300140255e-06, 'epoch': 0.6}


 61%|██████    | 1730/2852 [59:53<24:00,  1.28s/it]

{'loss': 0.956, 'grad_norm': 6.535107612609863, 'learning_rate': 1.9670406732117816e-06, 'epoch': 0.61}


 61%|██████    | 1740/2852 [1:00:06<24:00,  1.30s/it]

{'loss': 1.0648, 'grad_norm': 6.806949615478516, 'learning_rate': 1.9495091164095372e-06, 'epoch': 0.61}


 61%|██████▏   | 1750/2852 [1:00:19<23:37,  1.29s/it]

{'loss': 1.0387, 'grad_norm': 6.094614028930664, 'learning_rate': 1.9319775596072933e-06, 'epoch': 0.61}


 62%|██████▏   | 1760/2852 [1:00:32<23:19,  1.28s/it]

{'loss': 0.9764, 'grad_norm': 5.588689804077148, 'learning_rate': 1.9144460028050494e-06, 'epoch': 0.62}


 62%|██████▏   | 1770/2852 [1:00:45<23:08,  1.28s/it]

{'loss': 0.9942, 'grad_norm': 5.910572528839111, 'learning_rate': 1.8969144460028053e-06, 'epoch': 0.62}


 62%|██████▏   | 1780/2852 [1:00:58<22:54,  1.28s/it]

{'loss': 1.0901, 'grad_norm': 6.271477222442627, 'learning_rate': 1.8793828892005611e-06, 'epoch': 0.62}


 63%|██████▎   | 1790/2852 [1:01:11<22:40,  1.28s/it]

{'loss': 1.0837, 'grad_norm': 6.23835563659668, 'learning_rate': 1.861851332398317e-06, 'epoch': 0.63}


 63%|██████▎   | 1800/2852 [1:01:24<22:38,  1.29s/it]

{'loss': 1.0896, 'grad_norm': 5.9109883308410645, 'learning_rate': 1.844319775596073e-06, 'epoch': 0.63}


                                                     
 63%|██████▎   | 1800/2852 [1:02:59<22:38,  1.29s/it]

{'eval_loss': 1.0180221796035767, 'eval_runtime': 95.7282, 'eval_samples_per_second': 15.408, 'eval_steps_per_second': 1.933, 'epoch': 0.63}


 63%|██████▎   | 1810/2852 [1:03:18<43:30,  2.51s/it]  

{'loss': 1.0297, 'grad_norm': 6.727541923522949, 'learning_rate': 1.826788218793829e-06, 'epoch': 0.63}


 64%|██████▍   | 1820/2852 [1:03:31<22:23,  1.30s/it]

{'loss': 0.9728, 'grad_norm': 5.9260053634643555, 'learning_rate': 1.809256661991585e-06, 'epoch': 0.64}


 64%|██████▍   | 1830/2852 [1:03:43<21:43,  1.28s/it]

{'loss': 1.0564, 'grad_norm': 6.019923686981201, 'learning_rate': 1.791725105189341e-06, 'epoch': 0.64}


 65%|██████▍   | 1840/2852 [1:03:56<21:28,  1.27s/it]

{'loss': 1.024, 'grad_norm': 6.108034133911133, 'learning_rate': 1.774193548387097e-06, 'epoch': 0.65}


 65%|██████▍   | 1850/2852 [1:04:09<21:20,  1.28s/it]

{'loss': 0.9861, 'grad_norm': 6.002007007598877, 'learning_rate': 1.7566619915848529e-06, 'epoch': 0.65}


 65%|██████▌   | 1860/2852 [1:04:22<21:13,  1.28s/it]

{'loss': 1.0283, 'grad_norm': 6.496232509613037, 'learning_rate': 1.7391304347826088e-06, 'epoch': 0.65}


 66%|██████▌   | 1870/2852 [1:04:34<21:00,  1.28s/it]

{'loss': 1.1037, 'grad_norm': 6.04484748840332, 'learning_rate': 1.7215988779803646e-06, 'epoch': 0.66}


 66%|██████▌   | 1880/2852 [1:04:47<20:58,  1.30s/it]

{'loss': 1.0591, 'grad_norm': 6.145158767700195, 'learning_rate': 1.704067321178121e-06, 'epoch': 0.66}


 66%|██████▋   | 1890/2852 [1:05:00<20:34,  1.28s/it]

{'loss': 1.1096, 'grad_norm': 6.856918811798096, 'learning_rate': 1.6865357643758768e-06, 'epoch': 0.66}


 67%|██████▋   | 1900/2852 [1:05:13<20:30,  1.29s/it]

{'loss': 1.0409, 'grad_norm': 6.423322677612305, 'learning_rate': 1.6690042075736327e-06, 'epoch': 0.67}


                                                     
 67%|██████▋   | 1900/2852 [1:06:49<20:30,  1.29s/it]

{'eval_loss': 1.0173007249832153, 'eval_runtime': 95.6886, 'eval_samples_per_second': 15.415, 'eval_steps_per_second': 1.933, 'epoch': 0.67}


 67%|██████▋   | 1910/2852 [1:07:06<39:19,  2.50s/it]  

{'loss': 1.2006, 'grad_norm': 6.614182472229004, 'learning_rate': 1.6514726507713885e-06, 'epoch': 0.67}


 67%|██████▋   | 1920/2852 [1:07:19<20:33,  1.32s/it]

{'loss': 0.9823, 'grad_norm': 6.535287380218506, 'learning_rate': 1.6339410939691446e-06, 'epoch': 0.67}


 68%|██████▊   | 1930/2852 [1:07:32<19:37,  1.28s/it]

{'loss': 1.0803, 'grad_norm': 5.893159866333008, 'learning_rate': 1.6164095371669005e-06, 'epoch': 0.68}


 68%|██████▊   | 1940/2852 [1:07:45<19:28,  1.28s/it]

{'loss': 1.2312, 'grad_norm': 7.196497917175293, 'learning_rate': 1.5988779803646564e-06, 'epoch': 0.68}


 68%|██████▊   | 1950/2852 [1:07:58<19:28,  1.30s/it]

{'loss': 1.1052, 'grad_norm': 5.728708744049072, 'learning_rate': 1.5813464235624125e-06, 'epoch': 0.68}


 69%|██████▊   | 1960/2852 [1:08:11<19:04,  1.28s/it]

{'loss': 1.0584, 'grad_norm': 5.610846042633057, 'learning_rate': 1.5638148667601685e-06, 'epoch': 0.69}


 69%|██████▉   | 1970/2852 [1:08:23<18:45,  1.28s/it]

{'loss': 1.2008, 'grad_norm': 7.899935245513916, 'learning_rate': 1.5462833099579244e-06, 'epoch': 0.69}


 69%|██████▉   | 1980/2852 [1:08:36<18:39,  1.28s/it]

{'loss': 1.1021, 'grad_norm': 6.453744888305664, 'learning_rate': 1.5287517531556803e-06, 'epoch': 0.69}


 70%|██████▉   | 1990/2852 [1:08:49<18:23,  1.28s/it]

{'loss': 1.0124, 'grad_norm': 6.360379695892334, 'learning_rate': 1.5112201963534362e-06, 'epoch': 0.7}


 70%|███████   | 2000/2852 [1:09:02<18:13,  1.28s/it]

{'loss': 0.9564, 'grad_norm': 6.47015905380249, 'learning_rate': 1.4936886395511925e-06, 'epoch': 0.7}


                                                     
 70%|███████   | 2000/2852 [1:10:37<18:13,  1.28s/it]

{'eval_loss': 1.019116759300232, 'eval_runtime': 95.6629, 'eval_samples_per_second': 15.419, 'eval_steps_per_second': 1.934, 'epoch': 0.7}


 70%|███████   | 2010/2852 [1:10:55<34:55,  2.49s/it]  

{'loss': 0.9962, 'grad_norm': 5.7961273193359375, 'learning_rate': 1.4761570827489483e-06, 'epoch': 0.7}


 71%|███████   | 2020/2852 [1:11:08<18:17,  1.32s/it]

{'loss': 1.1493, 'grad_norm': 5.780764102935791, 'learning_rate': 1.4586255259467042e-06, 'epoch': 0.71}


 71%|███████   | 2030/2852 [1:11:21<17:32,  1.28s/it]

{'loss': 1.0425, 'grad_norm': 6.9473676681518555, 'learning_rate': 1.44109396914446e-06, 'epoch': 0.71}


 72%|███████▏  | 2040/2852 [1:11:33<16:50,  1.24s/it]

{'loss': 1.0191, 'grad_norm': 6.726144313812256, 'learning_rate': 1.4235624123422162e-06, 'epoch': 0.72}


 72%|███████▏  | 2050/2852 [1:11:46<16:56,  1.27s/it]

{'loss': 0.9605, 'grad_norm': 6.636238098144531, 'learning_rate': 1.406030855539972e-06, 'epoch': 0.72}


 72%|███████▏  | 2060/2852 [1:11:59<16:53,  1.28s/it]

{'loss': 1.0527, 'grad_norm': 5.747607231140137, 'learning_rate': 1.388499298737728e-06, 'epoch': 0.72}


 73%|███████▎  | 2070/2852 [1:12:11<16:37,  1.28s/it]

{'loss': 1.0795, 'grad_norm': 6.556255340576172, 'learning_rate': 1.3709677419354838e-06, 'epoch': 0.73}


 73%|███████▎  | 2080/2852 [1:12:24<16:28,  1.28s/it]

{'loss': 1.0532, 'grad_norm': 9.378396034240723, 'learning_rate': 1.35343618513324e-06, 'epoch': 0.73}


 73%|███████▎  | 2090/2852 [1:12:37<16:11,  1.27s/it]

{'loss': 0.9536, 'grad_norm': 6.442629337310791, 'learning_rate': 1.335904628330996e-06, 'epoch': 0.73}


 74%|███████▎  | 2100/2852 [1:12:50<16:00,  1.28s/it]

{'loss': 0.9274, 'grad_norm': 5.295248031616211, 'learning_rate': 1.3183730715287518e-06, 'epoch': 0.74}


                                                     
 74%|███████▎  | 2100/2852 [1:14:25<16:00,  1.28s/it]

{'eval_loss': 1.013530969619751, 'eval_runtime': 95.6565, 'eval_samples_per_second': 15.42, 'eval_steps_per_second': 1.934, 'epoch': 0.74}


 74%|███████▍  | 2110/2852 [1:14:43<30:59,  2.51s/it]  

{'loss': 0.9815, 'grad_norm': 6.299349308013916, 'learning_rate': 1.3008415147265077e-06, 'epoch': 0.74}


 74%|███████▍  | 2120/2852 [1:14:56<15:49,  1.30s/it]

{'loss': 1.0607, 'grad_norm': 6.117823123931885, 'learning_rate': 1.2833099579242638e-06, 'epoch': 0.74}


 75%|███████▍  | 2130/2852 [1:15:09<15:26,  1.28s/it]

{'loss': 1.0609, 'grad_norm': 6.39858341217041, 'learning_rate': 1.2657784011220197e-06, 'epoch': 0.75}


 75%|███████▌  | 2140/2852 [1:15:21<15:06,  1.27s/it]

{'loss': 1.0141, 'grad_norm': 5.979031562805176, 'learning_rate': 1.2482468443197758e-06, 'epoch': 0.75}


 75%|███████▌  | 2150/2852 [1:15:34<14:59,  1.28s/it]

{'loss': 1.167, 'grad_norm': 6.9742889404296875, 'learning_rate': 1.2307152875175316e-06, 'epoch': 0.75}


 76%|███████▌  | 2160/2852 [1:15:47<14:11,  1.23s/it]

{'loss': 1.0169, 'grad_norm': 7.249264717102051, 'learning_rate': 1.2131837307152875e-06, 'epoch': 0.76}


 76%|███████▌  | 2170/2852 [1:16:00<14:36,  1.28s/it]

{'loss': 1.0215, 'grad_norm': 6.681966781616211, 'learning_rate': 1.1956521739130436e-06, 'epoch': 0.76}


 76%|███████▋  | 2180/2852 [1:16:12<14:13,  1.27s/it]

{'loss': 1.0663, 'grad_norm': 6.0361857414245605, 'learning_rate': 1.1781206171107995e-06, 'epoch': 0.76}


 77%|███████▋  | 2190/2852 [1:16:25<14:10,  1.29s/it]

{'loss': 1.0113, 'grad_norm': 7.639166831970215, 'learning_rate': 1.1605890603085555e-06, 'epoch': 0.77}


 77%|███████▋  | 2200/2852 [1:16:38<13:52,  1.28s/it]

{'loss': 1.0908, 'grad_norm': 7.046738147735596, 'learning_rate': 1.1430575035063114e-06, 'epoch': 0.77}


                                                     
 77%|███████▋  | 2200/2852 [1:18:14<13:52,  1.28s/it]

{'eval_loss': 1.0164798498153687, 'eval_runtime': 95.7401, 'eval_samples_per_second': 15.406, 'eval_steps_per_second': 1.932, 'epoch': 0.77}


 77%|███████▋  | 2210/2852 [1:18:31<26:52,  2.51s/it]  

{'loss': 1.1244, 'grad_norm': 6.304880142211914, 'learning_rate': 1.1255259467040675e-06, 'epoch': 0.77}


 78%|███████▊  | 2220/2852 [1:18:44<13:44,  1.30s/it]

{'loss': 1.024, 'grad_norm': 6.085537433624268, 'learning_rate': 1.1079943899018234e-06, 'epoch': 0.78}


 78%|███████▊  | 2230/2852 [1:18:57<13:19,  1.28s/it]

{'loss': 1.146, 'grad_norm': 6.611862659454346, 'learning_rate': 1.0904628330995795e-06, 'epoch': 0.78}


 79%|███████▊  | 2240/2852 [1:19:10<13:11,  1.29s/it]

{'loss': 1.0182, 'grad_norm': 6.389978408813477, 'learning_rate': 1.0729312762973353e-06, 'epoch': 0.79}


 79%|███████▉  | 2250/2852 [1:19:23<12:56,  1.29s/it]

{'loss': 1.0942, 'grad_norm': 6.6803131103515625, 'learning_rate': 1.0553997194950912e-06, 'epoch': 0.79}


 79%|███████▉  | 2260/2852 [1:19:36<12:35,  1.28s/it]

{'loss': 1.0976, 'grad_norm': 6.0656938552856445, 'learning_rate': 1.037868162692847e-06, 'epoch': 0.79}


 80%|███████▉  | 2270/2852 [1:19:49<12:23,  1.28s/it]

{'loss': 1.0837, 'grad_norm': 6.994895935058594, 'learning_rate': 1.0203366058906032e-06, 'epoch': 0.8}


 80%|███████▉  | 2280/2852 [1:20:01<12:13,  1.28s/it]

{'loss': 1.0806, 'grad_norm': 6.075812816619873, 'learning_rate': 1.002805049088359e-06, 'epoch': 0.8}


 80%|████████  | 2290/2852 [1:20:14<11:47,  1.26s/it]

{'loss': 1.1137, 'grad_norm': 5.920454502105713, 'learning_rate': 9.852734922861151e-07, 'epoch': 0.8}


 81%|████████  | 2300/2852 [1:20:27<11:41,  1.27s/it]

{'loss': 1.1983, 'grad_norm': 6.159648895263672, 'learning_rate': 9.67741935483871e-07, 'epoch': 0.81}


                                                     
 81%|████████  | 2300/2852 [1:22:03<11:41,  1.27s/it]

{'eval_loss': 1.0103243589401245, 'eval_runtime': 95.7173, 'eval_samples_per_second': 15.41, 'eval_steps_per_second': 1.933, 'epoch': 0.81}


 81%|████████  | 2310/2852 [1:22:20<22:35,  2.50s/it]  

{'loss': 1.0752, 'grad_norm': 6.106513500213623, 'learning_rate': 9.502103786816271e-07, 'epoch': 0.81}


 81%|████████▏ | 2320/2852 [1:22:33<11:35,  1.31s/it]

{'loss': 1.1394, 'grad_norm': 7.209898948669434, 'learning_rate': 9.32678821879383e-07, 'epoch': 0.81}


 82%|████████▏ | 2330/2852 [1:22:46<11:08,  1.28s/it]

{'loss': 1.0798, 'grad_norm': 6.8897857666015625, 'learning_rate': 9.151472650771389e-07, 'epoch': 0.82}


 82%|████████▏ | 2340/2852 [1:22:59<10:53,  1.28s/it]

{'loss': 0.8759, 'grad_norm': 5.891998767852783, 'learning_rate': 8.976157082748948e-07, 'epoch': 0.82}


 82%|████████▏ | 2350/2852 [1:23:12<10:30,  1.26s/it]

{'loss': 1.0443, 'grad_norm': 6.354411602020264, 'learning_rate': 8.800841514726509e-07, 'epoch': 0.82}


 83%|████████▎ | 2360/2852 [1:23:24<10:37,  1.30s/it]

{'loss': 1.163, 'grad_norm': 6.17948055267334, 'learning_rate': 8.625525946704068e-07, 'epoch': 0.83}


 83%|████████▎ | 2370/2852 [1:23:37<10:13,  1.27s/it]

{'loss': 1.0606, 'grad_norm': 7.1670122146606445, 'learning_rate': 8.450210378681629e-07, 'epoch': 0.83}


 83%|████████▎ | 2380/2852 [1:23:50<10:04,  1.28s/it]

{'loss': 1.0277, 'grad_norm': 6.520572185516357, 'learning_rate': 8.274894810659187e-07, 'epoch': 0.83}


 84%|████████▍ | 2390/2852 [1:24:03<09:53,  1.28s/it]

{'loss': 1.1716, 'grad_norm': 6.75555944442749, 'learning_rate': 8.099579242636747e-07, 'epoch': 0.84}


 84%|████████▍ | 2400/2852 [1:24:15<09:30,  1.26s/it]

{'loss': 1.0586, 'grad_norm': 5.657707691192627, 'learning_rate': 7.924263674614306e-07, 'epoch': 0.84}


                                                     
 84%|████████▍ | 2400/2852 [1:25:51<09:30,  1.26s/it]

{'eval_loss': 1.0097614526748657, 'eval_runtime': 95.6341, 'eval_samples_per_second': 15.423, 'eval_steps_per_second': 1.934, 'epoch': 0.84}


 85%|████████▍ | 2410/2852 [1:26:09<18:22,  2.49s/it]  

{'loss': 1.0769, 'grad_norm': 5.99069356918335, 'learning_rate': 7.748948106591867e-07, 'epoch': 0.85}


 85%|████████▍ | 2420/2852 [1:26:22<09:27,  1.31s/it]

{'loss': 1.0145, 'grad_norm': 6.302219390869141, 'learning_rate': 7.573632538569425e-07, 'epoch': 0.85}


 85%|████████▌ | 2430/2852 [1:26:34<08:57,  1.27s/it]

{'loss': 0.9468, 'grad_norm': 6.800283432006836, 'learning_rate': 7.398316970546985e-07, 'epoch': 0.85}


 86%|████████▌ | 2440/2852 [1:26:47<08:37,  1.26s/it]

{'loss': 1.0552, 'grad_norm': 6.885445594787598, 'learning_rate': 7.223001402524545e-07, 'epoch': 0.86}


 86%|████████▌ | 2450/2852 [1:27:00<08:31,  1.27s/it]

{'loss': 1.0714, 'grad_norm': 7.412783622741699, 'learning_rate': 7.047685834502105e-07, 'epoch': 0.86}


 86%|████████▋ | 2460/2852 [1:27:12<08:18,  1.27s/it]

{'loss': 1.0954, 'grad_norm': 6.274281978607178, 'learning_rate': 6.872370266479664e-07, 'epoch': 0.86}


 87%|████████▋ | 2470/2852 [1:27:25<08:11,  1.29s/it]

{'loss': 1.1079, 'grad_norm': 6.150911808013916, 'learning_rate': 6.697054698457224e-07, 'epoch': 0.87}


 87%|████████▋ | 2480/2852 [1:27:38<07:45,  1.25s/it]

{'loss': 1.0612, 'grad_norm': 6.479567050933838, 'learning_rate': 6.521739130434783e-07, 'epoch': 0.87}


 87%|████████▋ | 2490/2852 [1:27:50<07:43,  1.28s/it]

{'loss': 1.1175, 'grad_norm': 7.4376654624938965, 'learning_rate': 6.346423562412343e-07, 'epoch': 0.87}


 88%|████████▊ | 2500/2852 [1:28:03<07:27,  1.27s/it]

{'loss': 1.0525, 'grad_norm': 6.768624305725098, 'learning_rate': 6.171107994389903e-07, 'epoch': 0.88}


                                                     
 88%|████████▊ | 2500/2852 [1:29:39<07:27,  1.27s/it]

{'eval_loss': 1.0154438018798828, 'eval_runtime': 96.0478, 'eval_samples_per_second': 15.357, 'eval_steps_per_second': 1.926, 'epoch': 0.88}


 88%|████████▊ | 2510/2852 [1:29:58<14:26,  2.53s/it]  

{'loss': 1.0394, 'grad_norm': 6.769704818725586, 'learning_rate': 5.995792426367461e-07, 'epoch': 0.88}


 88%|████████▊ | 2520/2852 [1:30:11<07:18,  1.32s/it]

{'loss': 0.9675, 'grad_norm': 6.0088701248168945, 'learning_rate': 5.820476858345021e-07, 'epoch': 0.88}


 89%|████████▊ | 2530/2852 [1:30:24<06:54,  1.29s/it]

{'loss': 1.0758, 'grad_norm': 6.57511568069458, 'learning_rate': 5.645161290322581e-07, 'epoch': 0.89}


 89%|████████▉ | 2540/2852 [1:30:37<06:36,  1.27s/it]

{'loss': 1.0969, 'grad_norm': 6.900315284729004, 'learning_rate': 5.469845722300141e-07, 'epoch': 0.89}


 89%|████████▉ | 2550/2852 [1:30:50<06:27,  1.28s/it]

{'loss': 1.0967, 'grad_norm': 6.579893589019775, 'learning_rate': 5.294530154277701e-07, 'epoch': 0.89}


 90%|████████▉ | 2560/2852 [1:31:02<06:16,  1.29s/it]

{'loss': 0.9246, 'grad_norm': 5.441378593444824, 'learning_rate': 5.119214586255259e-07, 'epoch': 0.9}


 90%|█████████ | 2570/2852 [1:31:15<06:03,  1.29s/it]

{'loss': 1.017, 'grad_norm': 6.56541109085083, 'learning_rate': 4.943899018232819e-07, 'epoch': 0.9}


 90%|█████████ | 2580/2852 [1:31:28<05:39,  1.25s/it]

{'loss': 1.0899, 'grad_norm': 4321447.5, 'learning_rate': 4.768583450210379e-07, 'epoch': 0.9}


 91%|█████████ | 2590/2852 [1:31:41<05:36,  1.28s/it]

{'loss': 1.0712, 'grad_norm': 7.217310428619385, 'learning_rate': 4.5932678821879387e-07, 'epoch': 0.91}


 91%|█████████ | 2600/2852 [1:31:54<05:24,  1.29s/it]

{'loss': 1.036, 'grad_norm': 6.7479681968688965, 'learning_rate': 4.417952314165498e-07, 'epoch': 0.91}


                                                     
 91%|█████████ | 2600/2852 [1:33:30<05:24,  1.29s/it]

{'eval_loss': 1.0082359313964844, 'eval_runtime': 96.0495, 'eval_samples_per_second': 15.357, 'eval_steps_per_second': 1.926, 'epoch': 0.91}


 92%|█████████▏| 2610/2852 [1:33:47<10:05,  2.50s/it]  

{'loss': 1.173, 'grad_norm': 7.19787073135376, 'learning_rate': 4.242636746143058e-07, 'epoch': 0.92}


 92%|█████████▏| 2620/2852 [1:34:00<05:06,  1.32s/it]

{'loss': 0.9251, 'grad_norm': 5.175605297088623, 'learning_rate': 4.0673211781206176e-07, 'epoch': 0.92}


 92%|█████████▏| 2630/2852 [1:34:13<04:46,  1.29s/it]

{'loss': 1.0337, 'grad_norm': 6.952203750610352, 'learning_rate': 3.892005610098177e-07, 'epoch': 0.92}


 93%|█████████▎| 2640/2852 [1:34:26<04:32,  1.28s/it]

{'loss': 1.133, 'grad_norm': 6.1980462074279785, 'learning_rate': 3.7166900420757366e-07, 'epoch': 0.93}


 93%|█████████▎| 2650/2852 [1:34:39<04:20,  1.29s/it]

{'loss': 1.0439, 'grad_norm': 7.146969795227051, 'learning_rate': 3.541374474053296e-07, 'epoch': 0.93}


 93%|█████████▎| 2660/2852 [1:34:52<04:08,  1.29s/it]

{'loss': 1.096, 'grad_norm': 7.160225868225098, 'learning_rate': 3.3660589060308557e-07, 'epoch': 0.93}


 94%|█████████▎| 2670/2852 [1:35:04<03:53,  1.28s/it]

{'loss': 0.9582, 'grad_norm': 6.176259517669678, 'learning_rate': 3.1907433380084155e-07, 'epoch': 0.94}


 94%|█████████▍| 2680/2852 [1:35:17<03:38,  1.27s/it]

{'loss': 1.0563, 'grad_norm': 7.07553768157959, 'learning_rate': 3.0154277699859747e-07, 'epoch': 0.94}


 94%|█████████▍| 2690/2852 [1:35:30<03:27,  1.28s/it]

{'loss': 1.2018, 'grad_norm': 6.626308441162109, 'learning_rate': 2.8401122019635345e-07, 'epoch': 0.94}


 95%|█████████▍| 2700/2852 [1:35:42<03:10,  1.26s/it]

{'loss': 1.056, 'grad_norm': 5.1527252197265625, 'learning_rate': 2.6647966339410943e-07, 'epoch': 0.95}


                                                     
 95%|█████████▍| 2700/2852 [1:37:19<03:10,  1.26s/it]

{'eval_loss': 1.0049569606781006, 'eval_runtime': 96.0781, 'eval_samples_per_second': 15.352, 'eval_steps_per_second': 1.926, 'epoch': 0.95}


 95%|█████████▌| 2710/2852 [1:37:36<05:55,  2.50s/it]  

{'loss': 1.0086, 'grad_norm': 5.8468122482299805, 'learning_rate': 2.4894810659186536e-07, 'epoch': 0.95}


 95%|█████████▌| 2720/2852 [1:37:49<02:53,  1.32s/it]

{'loss': 0.9888, 'grad_norm': 6.6716461181640625, 'learning_rate': 2.3141654978962134e-07, 'epoch': 0.95}


 96%|█████████▌| 2730/2852 [1:38:01<02:33,  1.26s/it]

{'loss': 1.1111, 'grad_norm': 6.328752040863037, 'learning_rate': 2.138849929873773e-07, 'epoch': 0.96}


 96%|█████████▌| 2740/2852 [1:38:14<02:22,  1.27s/it]

{'loss': 1.1708, 'grad_norm': 6.4057159423828125, 'learning_rate': 1.9635343618513324e-07, 'epoch': 0.96}


 96%|█████████▋| 2750/2852 [1:38:27<02:10,  1.28s/it]

{'loss': 1.0257, 'grad_norm': 5.838129043579102, 'learning_rate': 1.7882187938288922e-07, 'epoch': 0.96}


 97%|█████████▋| 2760/2852 [1:38:40<01:58,  1.28s/it]

{'loss': 0.9336, 'grad_norm': 6.241974353790283, 'learning_rate': 1.6129032258064518e-07, 'epoch': 0.97}


 97%|█████████▋| 2770/2852 [1:38:53<01:46,  1.29s/it]

{'loss': 1.1108, 'grad_norm': 6.516606330871582, 'learning_rate': 1.4375876577840113e-07, 'epoch': 0.97}


 97%|█████████▋| 2780/2852 [1:39:06<01:32,  1.28s/it]

{'loss': 0.9327, 'grad_norm': 6.596680641174316, 'learning_rate': 1.2622720897615708e-07, 'epoch': 0.97}


 98%|█████████▊| 2790/2852 [1:39:18<01:19,  1.28s/it]

{'loss': 1.1149, 'grad_norm': 7.233191013336182, 'learning_rate': 1.0869565217391305e-07, 'epoch': 0.98}


 98%|█████████▊| 2800/2852 [1:39:31<01:06,  1.28s/it]

{'loss': 1.0564, 'grad_norm': 6.219605445861816, 'learning_rate': 9.116409537166901e-08, 'epoch': 0.98}


                                                     
 98%|█████████▊| 2800/2852 [1:41:07<01:06,  1.28s/it]

{'eval_loss': 1.0041440725326538, 'eval_runtime': 96.0068, 'eval_samples_per_second': 15.364, 'eval_steps_per_second': 1.927, 'epoch': 0.98}


 99%|█████████▊| 2810/2852 [1:41:25<01:45,  2.50s/it]

{'loss': 1.017, 'grad_norm': 6.1613545417785645, 'learning_rate': 7.363253856942497e-08, 'epoch': 0.99}


 99%|█████████▉| 2820/2852 [1:41:38<00:42,  1.31s/it]

{'loss': 1.0811, 'grad_norm': 2875917.75, 'learning_rate': 5.6100981767180926e-08, 'epoch': 0.99}


 99%|█████████▉| 2830/2852 [1:41:51<00:27,  1.26s/it]

{'loss': 0.9958, 'grad_norm': 6.174993515014648, 'learning_rate': 3.856942496493689e-08, 'epoch': 0.99}


100%|█████████▉| 2840/2852 [1:42:03<00:15,  1.27s/it]

{'loss': 1.1913, 'grad_norm': 6.516580104827881, 'learning_rate': 2.1037868162692848e-08, 'epoch': 1.0}


100%|█████████▉| 2850/2852 [1:42:16<00:02,  1.26s/it]

{'loss': 0.9505, 'grad_norm': 4874597.5, 'learning_rate': 3.506311360448808e-09, 'epoch': 1.0}


100%|██████████| 2852/2852 [1:42:20<00:00,  2.15s/it]

{'train_runtime': 6140.7023, 'train_samples_per_second': 2.322, 'train_steps_per_second': 0.464, 'train_loss': 1.0136234136650975, 'epoch': 1.0}





TrainOutput(global_step=2852, training_loss=1.0136234136650975, metrics={'train_runtime': 6140.7023, 'train_samples_per_second': 2.322, 'train_steps_per_second': 0.464, 'train_loss': 1.0136234136650975, 'epoch': 1.0})

In [14]:
model.save_pretrained("save/completed/2")