In [1]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
run = wandb.init(
    project='Fine-tune Gemma-2-2b-it on HealthCare Dataset', 
    job_type="training", 
    anonymous="allow"
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33maidenyang66[0m ([33myyfsss[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
base_model = "google/gemma-2-2b-it"
dataset_name = "yunfan-y/trump-qa"
new_model = "yunfan-y/Gemma-2-2b-it-trump-500"

In [4]:
torch_dtype = torch.float16
attn_implementation = "eager"

In [5]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

In [6]:
# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)

# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)
model = get_peft_model(model, peft_config)

#Importing the dataset
dataset = load_dataset(dataset_name, split="all")
dataset = dataset.shuffle(seed=65).select(range(1000)) # Only use 1000 samples for quick demo

def format_chat_template(row):
    row_json = [{"role": "user", "content": row["input"]},
               {"role": "assistant", "content": row["output"]}]
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

dataset = dataset.map(
    format_chat_template,
)

dataset
dataset['text'][3]

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.12it/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 11092.17 examples/s]


'<bos><start_of_turn>user\nWhen are you planning to announce your candidacy for the 2016 Presidential election?<end_of_turn>\n<start_of_turn>model\nwhen are you going to announce your bid for 2016 Thanks for your nice thoughts our Country is a mess<end_of_turn>\n'

In [7]:
dataset = dataset.train_test_split(test_size=0.1)

In [8]:
# Setting Hyperparamter
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    eval_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=30,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="wandb"
)
# Setting sft parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=peft_config,
    max_seq_length= 256,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)

model.config.use_cache = False
trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
Map: 100%|██████████| 900/900 [00:00<00:00, 20732.52 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 8544.98 examples/s]
  0%|          | 1/225 [00:01<06:05,  1.63s/it]

{'loss': 5.7641, 'grad_norm': 7.3076395988464355, 'learning_rate': 6.666666666666667e-06, 'epoch': 0.0}


  1%|          | 2/225 [00:02<03:55,  1.05s/it]

{'loss': 5.5338, 'grad_norm': 7.342902660369873, 'learning_rate': 1.3333333333333333e-05, 'epoch': 0.01}


  1%|▏         | 3/225 [00:02<03:08,  1.18it/s]

{'loss': 5.9162, 'grad_norm': 7.733153820037842, 'learning_rate': 2e-05, 'epoch': 0.01}


  2%|▏         | 4/225 [00:03<02:47,  1.32it/s]

{'loss': 6.0867, 'grad_norm': 8.300286293029785, 'learning_rate': 2.6666666666666667e-05, 'epoch': 0.02}


  2%|▏         | 5/225 [00:04<02:32,  1.44it/s]

{'loss': 5.9936, 'grad_norm': 8.167401313781738, 'learning_rate': 3.3333333333333335e-05, 'epoch': 0.02}


  3%|▎         | 6/225 [00:04<02:18,  1.58it/s]

{'loss': 5.4882, 'grad_norm': 7.654552459716797, 'learning_rate': 4e-05, 'epoch': 0.03}


  3%|▎         | 7/225 [00:05<02:16,  1.59it/s]

{'loss': 6.274, 'grad_norm': 8.00607967376709, 'learning_rate': 4.666666666666667e-05, 'epoch': 0.03}


  4%|▎         | 8/225 [00:05<02:13,  1.63it/s]

{'loss': 5.8431, 'grad_norm': 7.7561492919921875, 'learning_rate': 5.333333333333333e-05, 'epoch': 0.04}


  4%|▍         | 9/225 [00:06<02:10,  1.66it/s]

{'loss': 5.2705, 'grad_norm': 7.242206573486328, 'learning_rate': 6e-05, 'epoch': 0.04}


  4%|▍         | 10/225 [00:06<01:58,  1.81it/s]

{'loss': 4.6249, 'grad_norm': 6.821553707122803, 'learning_rate': 6.666666666666667e-05, 'epoch': 0.04}


  5%|▍         | 11/225 [00:07<01:50,  1.94it/s]

{'loss': 4.9407, 'grad_norm': 6.440700531005859, 'learning_rate': 7.333333333333333e-05, 'epoch': 0.05}


  5%|▌         | 12/225 [00:07<01:45,  2.02it/s]

{'loss': 3.9352, 'grad_norm': 5.20615816116333, 'learning_rate': 8e-05, 'epoch': 0.05}


  6%|▌         | 13/225 [00:08<01:41,  2.09it/s]

{'loss': 4.1429, 'grad_norm': 5.021510601043701, 'learning_rate': 8.666666666666667e-05, 'epoch': 0.06}


  6%|▌         | 14/225 [00:08<01:37,  2.16it/s]

{'loss': 3.9593, 'grad_norm': 4.269446849822998, 'learning_rate': 9.333333333333334e-05, 'epoch': 0.06}


  7%|▋         | 15/225 [00:08<01:34,  2.22it/s]

{'loss': 4.1657, 'grad_norm': 4.916780948638916, 'learning_rate': 0.0001, 'epoch': 0.07}


  7%|▋         | 16/225 [00:09<01:32,  2.25it/s]

{'loss': 3.3012, 'grad_norm': 5.587488651275635, 'learning_rate': 0.00010666666666666667, 'epoch': 0.07}


  8%|▊         | 17/225 [00:09<01:31,  2.27it/s]

{'loss': 3.8543, 'grad_norm': 5.870051383972168, 'learning_rate': 0.00011333333333333334, 'epoch': 0.08}


  8%|▊         | 18/225 [00:10<01:30,  2.29it/s]

{'loss': 3.4636, 'grad_norm': 5.228992938995361, 'learning_rate': 0.00012, 'epoch': 0.08}


  8%|▊         | 19/225 [00:10<01:29,  2.29it/s]

{'loss': 3.2456, 'grad_norm': 3.877206802368164, 'learning_rate': 0.00012666666666666666, 'epoch': 0.08}


  9%|▉         | 20/225 [00:11<01:28,  2.31it/s]

{'loss': 3.3059, 'grad_norm': 3.5402088165283203, 'learning_rate': 0.00013333333333333334, 'epoch': 0.09}


  9%|▉         | 21/225 [00:11<01:27,  2.34it/s]

{'loss': 3.5589, 'grad_norm': 3.2868125438690186, 'learning_rate': 0.00014, 'epoch': 0.09}


 10%|▉         | 22/225 [00:11<01:26,  2.35it/s]

{'loss': 3.14, 'grad_norm': 4.281775951385498, 'learning_rate': 0.00014666666666666666, 'epoch': 0.1}


 10%|█         | 23/225 [00:12<01:26,  2.33it/s]

{'loss': 2.9752, 'grad_norm': 4.0301690101623535, 'learning_rate': 0.00015333333333333334, 'epoch': 0.1}


 11%|█         | 24/225 [00:12<01:25,  2.34it/s]

{'loss': 3.0553, 'grad_norm': 3.9213433265686035, 'learning_rate': 0.00016, 'epoch': 0.11}


 11%|█         | 25/225 [00:13<01:28,  2.26it/s]

{'loss': 2.9684, 'grad_norm': 3.5595316886901855, 'learning_rate': 0.0001666666666666667, 'epoch': 0.11}


 12%|█▏        | 26/225 [00:13<01:35,  2.08it/s]

{'loss': 3.4566, 'grad_norm': 4.146387100219727, 'learning_rate': 0.00017333333333333334, 'epoch': 0.12}


 12%|█▏        | 27/225 [00:14<01:41,  1.95it/s]

{'loss': 2.8866, 'grad_norm': 4.105687141418457, 'learning_rate': 0.00018, 'epoch': 0.12}


 12%|█▏        | 28/225 [00:15<01:45,  1.87it/s]

{'loss': 3.1792, 'grad_norm': 4.0918192863464355, 'learning_rate': 0.0001866666666666667, 'epoch': 0.12}


 13%|█▎        | 29/225 [00:15<01:42,  1.91it/s]

{'loss': 2.7498, 'grad_norm': 3.7536299228668213, 'learning_rate': 0.00019333333333333333, 'epoch': 0.13}


 13%|█▎        | 30/225 [00:16<01:37,  2.01it/s]

{'loss': 2.88, 'grad_norm': 3.598724603652954, 'learning_rate': 0.0002, 'epoch': 0.13}


 14%|█▍        | 31/225 [00:16<01:32,  2.09it/s]

{'loss': 2.8174, 'grad_norm': 3.5395162105560303, 'learning_rate': 0.00019897435897435898, 'epoch': 0.14}


 14%|█▍        | 32/225 [00:16<01:29,  2.17it/s]

{'loss': 2.3284, 'grad_norm': 3.3160617351531982, 'learning_rate': 0.00019794871794871796, 'epoch': 0.14}


 15%|█▍        | 33/225 [00:17<01:28,  2.17it/s]

{'loss': 2.706, 'grad_norm': 3.7297191619873047, 'learning_rate': 0.00019692307692307696, 'epoch': 0.15}


 15%|█▌        | 34/225 [00:17<01:27,  2.18it/s]

{'loss': 2.5116, 'grad_norm': 3.674288749694824, 'learning_rate': 0.0001958974358974359, 'epoch': 0.15}


 16%|█▌        | 35/225 [00:18<01:24,  2.24it/s]

{'loss': 2.7504, 'grad_norm': 3.68088436126709, 'learning_rate': 0.00019487179487179487, 'epoch': 0.16}


 16%|█▌        | 36/225 [00:18<01:23,  2.25it/s]

{'loss': 2.546, 'grad_norm': 3.5778756141662598, 'learning_rate': 0.00019384615384615385, 'epoch': 0.16}


 16%|█▋        | 37/225 [00:19<01:27,  2.15it/s]

{'loss': 2.3085, 'grad_norm': 3.5594587326049805, 'learning_rate': 0.00019282051282051282, 'epoch': 0.16}


 17%|█▋        | 38/225 [00:19<01:25,  2.20it/s]

{'loss': 2.6122, 'grad_norm': 3.534914493560791, 'learning_rate': 0.00019179487179487182, 'epoch': 0.17}


 17%|█▋        | 39/225 [00:20<01:23,  2.23it/s]

{'loss': 2.3827, 'grad_norm': 3.37282657623291, 'learning_rate': 0.0001907692307692308, 'epoch': 0.17}


 18%|█▊        | 40/225 [00:20<01:20,  2.29it/s]

{'loss': 2.2457, 'grad_norm': 3.5117645263671875, 'learning_rate': 0.00018974358974358974, 'epoch': 0.18}


 18%|█▊        | 41/225 [00:20<01:19,  2.32it/s]

{'loss': 2.2916, 'grad_norm': 3.334900379180908, 'learning_rate': 0.0001887179487179487, 'epoch': 0.18}


 19%|█▊        | 42/225 [00:21<01:19,  2.30it/s]

{'loss': 1.8674, 'grad_norm': 3.2943832874298096, 'learning_rate': 0.0001876923076923077, 'epoch': 0.19}


 19%|█▉        | 43/225 [00:21<01:18,  2.31it/s]

{'loss': 2.3314, 'grad_norm': 3.4111485481262207, 'learning_rate': 0.0001866666666666667, 'epoch': 0.19}


 20%|█▉        | 44/225 [00:22<01:18,  2.31it/s]

{'loss': 1.7729, 'grad_norm': 3.3435075283050537, 'learning_rate': 0.00018564102564102566, 'epoch': 0.2}


 20%|██        | 45/225 [00:22<01:17,  2.31it/s]

{'loss': 1.3626, 'grad_norm': 3.026310682296753, 'learning_rate': 0.00018461538461538463, 'epoch': 0.2}


                                                
 20%|██        | 45/225 [00:27<01:17,  2.31it/s]

{'eval_loss': 2.196254014968872, 'eval_runtime': 4.6724, 'eval_samples_per_second': 21.402, 'eval_steps_per_second': 10.701, 'epoch': 0.2}


 20%|██        | 46/225 [00:27<05:27,  1.83s/it]

{'loss': 1.6514, 'grad_norm': 3.694145917892456, 'learning_rate': 0.00018358974358974358, 'epoch': 0.2}


 21%|██        | 47/225 [00:28<04:10,  1.41s/it]

{'loss': 1.7555, 'grad_norm': 3.9875800609588623, 'learning_rate': 0.00018256410256410258, 'epoch': 0.21}


 21%|██▏       | 48/225 [00:28<03:16,  1.11s/it]

{'loss': 0.6849, 'grad_norm': 2.2330939769744873, 'learning_rate': 0.00018153846153846155, 'epoch': 0.21}


 22%|██▏       | 49/225 [00:28<02:38,  1.11it/s]

{'loss': 0.6097, 'grad_norm': 1.9293657541275024, 'learning_rate': 0.00018051282051282052, 'epoch': 0.22}


 22%|██▏       | 50/225 [00:29<02:11,  1.33it/s]

{'loss': 1.0768, 'grad_norm': 3.225651741027832, 'learning_rate': 0.0001794871794871795, 'epoch': 0.22}


 23%|██▎       | 51/225 [00:29<01:54,  1.52it/s]

{'loss': 3.0489, 'grad_norm': 13.864446640014648, 'learning_rate': 0.00017846153846153847, 'epoch': 0.23}


 23%|██▎       | 52/225 [00:30<01:42,  1.69it/s]

{'loss': 3.3559, 'grad_norm': 10.260180473327637, 'learning_rate': 0.00017743589743589744, 'epoch': 0.23}


 24%|██▎       | 53/225 [00:30<01:33,  1.84it/s]

{'loss': 2.5398, 'grad_norm': 4.639554023742676, 'learning_rate': 0.00017641025641025642, 'epoch': 0.24}


 24%|██▍       | 54/225 [00:31<01:27,  1.96it/s]

{'loss': 2.5449, 'grad_norm': 3.3448660373687744, 'learning_rate': 0.0001753846153846154, 'epoch': 0.24}


 24%|██▍       | 55/225 [00:31<01:23,  2.03it/s]

{'loss': 2.5927, 'grad_norm': 3.910402774810791, 'learning_rate': 0.00017435897435897436, 'epoch': 0.24}


 25%|██▍       | 56/225 [00:31<01:20,  2.10it/s]

{'loss': 2.2872, 'grad_norm': 3.5879456996917725, 'learning_rate': 0.00017333333333333334, 'epoch': 0.25}


 25%|██▌       | 57/225 [00:32<01:17,  2.16it/s]

{'loss': 2.6114, 'grad_norm': 3.348278760910034, 'learning_rate': 0.00017230769230769234, 'epoch': 0.25}


 26%|██▌       | 58/225 [00:32<01:15,  2.20it/s]

{'loss': 2.7641, 'grad_norm': 3.4082987308502197, 'learning_rate': 0.00017128205128205128, 'epoch': 0.26}


 26%|██▌       | 59/225 [00:33<01:13,  2.26it/s]

{'loss': 2.7937, 'grad_norm': 3.6277499198913574, 'learning_rate': 0.00017025641025641026, 'epoch': 0.26}


 27%|██▋       | 60/225 [00:33<01:12,  2.29it/s]

{'loss': 2.4778, 'grad_norm': 3.112248420715332, 'learning_rate': 0.00016923076923076923, 'epoch': 0.27}


 27%|██▋       | 61/225 [00:34<01:11,  2.30it/s]

{'loss': 2.0291, 'grad_norm': 2.526746988296509, 'learning_rate': 0.00016820512820512823, 'epoch': 0.27}


 28%|██▊       | 62/225 [00:34<01:11,  2.26it/s]

{'loss': 2.2789, 'grad_norm': 2.448373794555664, 'learning_rate': 0.0001671794871794872, 'epoch': 0.28}


 28%|██▊       | 63/225 [00:34<01:10,  2.30it/s]

{'loss': 2.6012, 'grad_norm': 3.022351026535034, 'learning_rate': 0.00016615384615384617, 'epoch': 0.28}


 28%|██▊       | 64/225 [00:35<01:09,  2.31it/s]

{'loss': 2.2911, 'grad_norm': 3.0852582454681396, 'learning_rate': 0.00016512820512820512, 'epoch': 0.28}


 29%|██▉       | 65/225 [00:35<01:08,  2.32it/s]

{'loss': 2.1598, 'grad_norm': 2.6160354614257812, 'learning_rate': 0.0001641025641025641, 'epoch': 0.29}


 29%|██▉       | 66/225 [00:36<01:07,  2.35it/s]

{'loss': 2.3527, 'grad_norm': 2.720358371734619, 'learning_rate': 0.0001630769230769231, 'epoch': 0.29}


 30%|██▉       | 67/225 [00:36<01:07,  2.33it/s]

{'loss': 2.5541, 'grad_norm': 2.8336539268493652, 'learning_rate': 0.00016205128205128207, 'epoch': 0.3}


 30%|███       | 68/225 [00:37<01:06,  2.35it/s]

{'loss': 2.1431, 'grad_norm': 2.6145710945129395, 'learning_rate': 0.00016102564102564104, 'epoch': 0.3}


 31%|███       | 69/225 [00:37<01:07,  2.31it/s]

{'loss': 2.261, 'grad_norm': 2.772690534591675, 'learning_rate': 0.00016, 'epoch': 0.31}


 31%|███       | 70/225 [00:37<01:07,  2.28it/s]

{'loss': 1.8316, 'grad_norm': 2.789777994155884, 'learning_rate': 0.00015897435897435896, 'epoch': 0.31}


 32%|███▏      | 71/225 [00:38<01:06,  2.32it/s]

{'loss': 2.2187, 'grad_norm': 2.7950847148895264, 'learning_rate': 0.00015794871794871796, 'epoch': 0.32}


 32%|███▏      | 72/225 [00:38<01:06,  2.30it/s]

{'loss': 2.4336, 'grad_norm': 3.038076639175415, 'learning_rate': 0.00015692307692307693, 'epoch': 0.32}


 32%|███▏      | 73/225 [00:39<01:05,  2.31it/s]

{'loss': 2.675, 'grad_norm': 3.414562225341797, 'learning_rate': 0.0001558974358974359, 'epoch': 0.32}


 33%|███▎      | 74/225 [00:39<01:05,  2.29it/s]

{'loss': 1.9472, 'grad_norm': 2.781797409057617, 'learning_rate': 0.00015487179487179488, 'epoch': 0.33}


 33%|███▎      | 75/225 [00:40<01:05,  2.29it/s]

{'loss': 2.4004, 'grad_norm': 3.175682783126831, 'learning_rate': 0.00015384615384615385, 'epoch': 0.33}


 34%|███▍      | 76/225 [00:40<01:05,  2.29it/s]

{'loss': 1.9983, 'grad_norm': 2.714928388595581, 'learning_rate': 0.00015282051282051282, 'epoch': 0.34}


 34%|███▍      | 77/225 [00:41<01:04,  2.30it/s]

{'loss': 2.1568, 'grad_norm': 3.141216993331909, 'learning_rate': 0.0001517948717948718, 'epoch': 0.34}


 35%|███▍      | 78/225 [00:41<01:03,  2.30it/s]

{'loss': 2.2645, 'grad_norm': 2.892944812774658, 'learning_rate': 0.00015076923076923077, 'epoch': 0.35}


 35%|███▌      | 79/225 [00:41<01:03,  2.29it/s]

{'loss': 2.0657, 'grad_norm': 2.7137789726257324, 'learning_rate': 0.00014974358974358974, 'epoch': 0.35}


 36%|███▌      | 80/225 [00:42<01:04,  2.26it/s]

{'loss': 2.4116, 'grad_norm': 3.029386520385742, 'learning_rate': 0.00014871794871794872, 'epoch': 0.36}


 36%|███▌      | 81/225 [00:42<01:03,  2.28it/s]

{'loss': 2.2269, 'grad_norm': 2.9289047718048096, 'learning_rate': 0.00014769230769230772, 'epoch': 0.36}


 36%|███▋      | 82/225 [00:43<01:01,  2.31it/s]

{'loss': 1.9538, 'grad_norm': 2.6078391075134277, 'learning_rate': 0.00014666666666666666, 'epoch': 0.36}


 37%|███▋      | 83/225 [00:43<01:01,  2.30it/s]

{'loss': 1.9087, 'grad_norm': 3.1470787525177, 'learning_rate': 0.00014564102564102564, 'epoch': 0.37}


 37%|███▋      | 84/225 [00:44<01:00,  2.34it/s]

{'loss': 2.1383, 'grad_norm': 3.050405740737915, 'learning_rate': 0.0001446153846153846, 'epoch': 0.37}


 38%|███▊      | 85/225 [00:44<01:00,  2.33it/s]

{'loss': 1.671, 'grad_norm': 2.5754122734069824, 'learning_rate': 0.0001435897435897436, 'epoch': 0.38}


 38%|███▊      | 86/225 [00:44<00:59,  2.33it/s]

{'loss': 2.1429, 'grad_norm': 2.980316638946533, 'learning_rate': 0.00014256410256410258, 'epoch': 0.38}


 39%|███▊      | 87/225 [00:45<00:59,  2.33it/s]

{'loss': 1.91, 'grad_norm': 2.702936887741089, 'learning_rate': 0.00014153846153846156, 'epoch': 0.39}


 39%|███▉      | 88/225 [00:45<00:59,  2.32it/s]

{'loss': 1.9661, 'grad_norm': 2.7704484462738037, 'learning_rate': 0.0001405128205128205, 'epoch': 0.39}


 40%|███▉      | 89/225 [00:46<00:59,  2.30it/s]

{'loss': 1.7263, 'grad_norm': 2.519775152206421, 'learning_rate': 0.00013948717948717947, 'epoch': 0.4}


 40%|████      | 90/225 [00:46<00:58,  2.31it/s]

{'loss': 2.2152, 'grad_norm': 3.232393503189087, 'learning_rate': 0.00013846153846153847, 'epoch': 0.4}


                                                
 40%|████      | 90/225 [00:51<00:58,  2.31it/s]

{'eval_loss': 2.0041942596435547, 'eval_runtime': 4.8416, 'eval_samples_per_second': 20.654, 'eval_steps_per_second': 10.327, 'epoch': 0.4}


 40%|████      | 91/225 [00:51<04:12,  1.88s/it]

{'loss': 1.8385, 'grad_norm': 3.1752266883850098, 'learning_rate': 0.00013743589743589745, 'epoch': 0.4}


 41%|████      | 92/225 [00:52<03:12,  1.45s/it]

{'loss': 1.665, 'grad_norm': 2.5591530799865723, 'learning_rate': 0.00013641025641025642, 'epoch': 0.41}


 41%|████▏     | 93/225 [00:52<02:30,  1.14s/it]

{'loss': 1.912, 'grad_norm': 2.9585976600646973, 'learning_rate': 0.0001353846153846154, 'epoch': 0.41}


 42%|████▏     | 94/225 [00:53<02:00,  1.08it/s]

{'loss': 1.5537, 'grad_norm': 2.7238316535949707, 'learning_rate': 0.00013435897435897437, 'epoch': 0.42}


 42%|████▏     | 95/225 [00:53<01:40,  1.29it/s]

{'loss': 1.8092, 'grad_norm': 2.874587297439575, 'learning_rate': 0.00013333333333333334, 'epoch': 0.42}


 43%|████▎     | 96/225 [00:54<01:26,  1.49it/s]

{'loss': 1.0315, 'grad_norm': 2.161118507385254, 'learning_rate': 0.0001323076923076923, 'epoch': 0.43}


 43%|████▎     | 97/225 [00:54<01:16,  1.67it/s]

{'loss': 1.4873, 'grad_norm': 2.7852301597595215, 'learning_rate': 0.00013128205128205129, 'epoch': 0.43}


 44%|████▎     | 98/225 [00:54<01:09,  1.83it/s]

{'loss': 0.8926, 'grad_norm': 2.1103298664093018, 'learning_rate': 0.00013025641025641026, 'epoch': 0.44}


 44%|████▍     | 99/225 [00:55<01:03,  1.99it/s]

{'loss': 1.0842, 'grad_norm': 2.7469992637634277, 'learning_rate': 0.00012923076923076923, 'epoch': 0.44}


 44%|████▍     | 100/225 [00:55<01:00,  2.07it/s]

{'loss': 0.9294, 'grad_norm': 2.408299207687378, 'learning_rate': 0.00012820512820512823, 'epoch': 0.44}


 45%|████▍     | 101/225 [00:56<00:58,  2.13it/s]

{'loss': 2.4095, 'grad_norm': 4.667674541473389, 'learning_rate': 0.00012717948717948718, 'epoch': 0.45}


 45%|████▌     | 102/225 [00:56<00:56,  2.19it/s]

{'loss': 2.5637, 'grad_norm': 4.568470478057861, 'learning_rate': 0.00012615384615384615, 'epoch': 0.45}


 46%|████▌     | 103/225 [00:57<00:54,  2.24it/s]

{'loss': 3.1288, 'grad_norm': 4.671492576599121, 'learning_rate': 0.00012512820512820512, 'epoch': 0.46}


 46%|████▌     | 104/225 [00:57<00:53,  2.26it/s]

{'loss': 2.2354, 'grad_norm': 3.8895044326782227, 'learning_rate': 0.00012410256410256412, 'epoch': 0.46}


 47%|████▋     | 105/225 [00:57<00:52,  2.27it/s]

{'loss': 2.4788, 'grad_norm': 3.461198091506958, 'learning_rate': 0.0001230769230769231, 'epoch': 0.47}


 47%|████▋     | 106/225 [00:58<00:51,  2.32it/s]

{'loss': 2.2592, 'grad_norm': 2.7833995819091797, 'learning_rate': 0.00012205128205128207, 'epoch': 0.47}


 48%|████▊     | 107/225 [00:58<00:50,  2.33it/s]

{'loss': 2.1361, 'grad_norm': 2.7487475872039795, 'learning_rate': 0.00012102564102564103, 'epoch': 0.48}


 48%|████▊     | 108/225 [00:59<00:50,  2.33it/s]

{'loss': 2.5453, 'grad_norm': 2.594566822052002, 'learning_rate': 0.00012, 'epoch': 0.48}


 48%|████▊     | 109/225 [00:59<00:49,  2.35it/s]

{'loss': 2.5381, 'grad_norm': 2.812222719192505, 'learning_rate': 0.00011897435897435898, 'epoch': 0.48}


 49%|████▉     | 110/225 [00:59<00:48,  2.35it/s]

{'loss': 2.3093, 'grad_norm': 2.4634358882904053, 'learning_rate': 0.00011794871794871796, 'epoch': 0.49}


 49%|████▉     | 111/225 [01:00<00:50,  2.28it/s]

{'loss': 2.5642, 'grad_norm': 2.9156718254089355, 'learning_rate': 0.00011692307692307694, 'epoch': 0.49}


 50%|████▉     | 112/225 [01:00<00:50,  2.25it/s]

{'loss': 2.0016, 'grad_norm': 2.488905906677246, 'learning_rate': 0.00011589743589743591, 'epoch': 0.5}


 50%|█████     | 113/225 [01:01<00:49,  2.28it/s]

{'loss': 2.5601, 'grad_norm': 2.9383394718170166, 'learning_rate': 0.00011487179487179487, 'epoch': 0.5}


 51%|█████     | 114/225 [01:01<00:48,  2.31it/s]

{'loss': 1.9253, 'grad_norm': 2.374994993209839, 'learning_rate': 0.00011384615384615384, 'epoch': 0.51}


 51%|█████     | 115/225 [01:02<00:47,  2.30it/s]

{'loss': 1.9956, 'grad_norm': 2.655292510986328, 'learning_rate': 0.00011282051282051283, 'epoch': 0.51}


 52%|█████▏    | 116/225 [01:02<00:47,  2.30it/s]

{'loss': 2.6306, 'grad_norm': 3.396134614944458, 'learning_rate': 0.0001117948717948718, 'epoch': 0.52}


 52%|█████▏    | 117/225 [01:03<00:47,  2.29it/s]

{'loss': 2.2565, 'grad_norm': 2.899810791015625, 'learning_rate': 0.00011076923076923077, 'epoch': 0.52}


 52%|█████▏    | 118/225 [01:03<00:46,  2.31it/s]

{'loss': 2.0131, 'grad_norm': 2.2701029777526855, 'learning_rate': 0.00010974358974358976, 'epoch': 0.52}


 53%|█████▎    | 119/225 [01:03<00:45,  2.33it/s]

{'loss': 2.2026, 'grad_norm': 2.461671829223633, 'learning_rate': 0.00010871794871794872, 'epoch': 0.53}


 53%|█████▎    | 120/225 [01:04<00:45,  2.32it/s]

{'loss': 2.459, 'grad_norm': 2.6353657245635986, 'learning_rate': 0.0001076923076923077, 'epoch': 0.53}


 54%|█████▍    | 121/225 [01:04<00:44,  2.34it/s]

{'loss': 2.109, 'grad_norm': 2.578450918197632, 'learning_rate': 0.00010666666666666667, 'epoch': 0.54}


 54%|█████▍    | 122/225 [01:05<00:43,  2.36it/s]

{'loss': 2.2349, 'grad_norm': 2.7264769077301025, 'learning_rate': 0.00010564102564102565, 'epoch': 0.54}


 55%|█████▍    | 123/225 [01:05<00:43,  2.34it/s]

{'loss': 2.5383, 'grad_norm': 3.1594626903533936, 'learning_rate': 0.00010461538461538463, 'epoch': 0.55}


 55%|█████▌    | 124/225 [01:06<00:43,  2.34it/s]

{'loss': 1.7325, 'grad_norm': 2.5281522274017334, 'learning_rate': 0.0001035897435897436, 'epoch': 0.55}


 56%|█████▌    | 125/225 [01:06<00:42,  2.34it/s]

{'loss': 1.7996, 'grad_norm': 2.6140570640563965, 'learning_rate': 0.00010256410256410256, 'epoch': 0.56}


 56%|█████▌    | 126/225 [01:06<00:42,  2.34it/s]

{'loss': 2.1976, 'grad_norm': 2.610701322555542, 'learning_rate': 0.00010153846153846153, 'epoch': 0.56}


 56%|█████▋    | 127/225 [01:07<00:42,  2.32it/s]

{'loss': 1.8721, 'grad_norm': 2.6190059185028076, 'learning_rate': 0.00010051282051282052, 'epoch': 0.56}


 57%|█████▋    | 128/225 [01:07<00:42,  2.30it/s]

{'loss': 2.2576, 'grad_norm': 3.011420488357544, 'learning_rate': 9.948717948717949e-05, 'epoch': 0.57}


 57%|█████▋    | 129/225 [01:08<00:41,  2.32it/s]

{'loss': 1.7144, 'grad_norm': 2.3018639087677, 'learning_rate': 9.846153846153848e-05, 'epoch': 0.57}


 58%|█████▊    | 130/225 [01:08<00:40,  2.33it/s]

{'loss': 1.7755, 'grad_norm': 2.461793899536133, 'learning_rate': 9.743589743589744e-05, 'epoch': 0.58}


 58%|█████▊    | 131/225 [01:09<00:40,  2.30it/s]

{'loss': 1.9313, 'grad_norm': 2.4794623851776123, 'learning_rate': 9.641025641025641e-05, 'epoch': 0.58}


 59%|█████▊    | 132/225 [01:09<00:39,  2.35it/s]

{'loss': 1.958, 'grad_norm': 2.415236711502075, 'learning_rate': 9.53846153846154e-05, 'epoch': 0.59}


 59%|█████▉    | 133/225 [01:09<00:38,  2.37it/s]

{'loss': 1.3857, 'grad_norm': 2.0693016052246094, 'learning_rate': 9.435897435897436e-05, 'epoch': 0.59}


 60%|█████▉    | 134/225 [01:10<00:38,  2.37it/s]

{'loss': 2.2927, 'grad_norm': 2.9574966430664062, 'learning_rate': 9.333333333333334e-05, 'epoch': 0.6}


 60%|██████    | 135/225 [01:10<00:38,  2.35it/s]

{'loss': 1.6258, 'grad_norm': 2.7218408584594727, 'learning_rate': 9.230769230769232e-05, 'epoch': 0.6}


                                                 
 60%|██████    | 135/225 [01:15<00:38,  2.35it/s]

{'eval_loss': 1.9274836778640747, 'eval_runtime': 4.617, 'eval_samples_per_second': 21.659, 'eval_steps_per_second': 10.83, 'epoch': 0.6}


 60%|██████    | 136/225 [01:15<02:43,  1.83s/it]

{'loss': 2.047, 'grad_norm': 3.2041096687316895, 'learning_rate': 9.128205128205129e-05, 'epoch': 0.6}


 61%|██████    | 137/225 [01:16<02:05,  1.42s/it]

{'loss': 2.5599, 'grad_norm': 3.219597816467285, 'learning_rate': 9.025641025641026e-05, 'epoch': 0.61}


 61%|██████▏   | 138/225 [01:16<01:37,  1.12s/it]

{'loss': 1.8394, 'grad_norm': 2.6960339546203613, 'learning_rate': 8.923076923076924e-05, 'epoch': 0.61}


 62%|██████▏   | 139/225 [01:17<01:18,  1.09it/s]

{'loss': 1.7385, 'grad_norm': 2.8436455726623535, 'learning_rate': 8.820512820512821e-05, 'epoch': 0.62}


 62%|██████▏   | 140/225 [01:17<01:06,  1.27it/s]

{'loss': 1.8454, 'grad_norm': 2.4317896366119385, 'learning_rate': 8.717948717948718e-05, 'epoch': 0.62}


 63%|██████▎   | 141/225 [01:18<00:57,  1.45it/s]

{'loss': 1.8815, 'grad_norm': 2.639692783355713, 'learning_rate': 8.615384615384617e-05, 'epoch': 0.63}


 63%|██████▎   | 142/225 [01:18<00:51,  1.63it/s]

{'loss': 1.9503, 'grad_norm': 2.727182626724243, 'learning_rate': 8.512820512820513e-05, 'epoch': 0.63}


 64%|██████▎   | 143/225 [01:19<00:46,  1.75it/s]

{'loss': 1.7826, 'grad_norm': 2.634775400161743, 'learning_rate': 8.410256410256411e-05, 'epoch': 0.64}


 64%|██████▍   | 144/225 [01:19<00:43,  1.86it/s]

{'loss': 1.9088, 'grad_norm': 2.6940224170684814, 'learning_rate': 8.307692307692309e-05, 'epoch': 0.64}


 64%|██████▍   | 145/225 [01:19<00:40,  1.96it/s]

{'loss': 1.4004, 'grad_norm': 2.4228312969207764, 'learning_rate': 8.205128205128205e-05, 'epoch': 0.64}


 65%|██████▍   | 146/225 [01:20<00:41,  1.91it/s]

{'loss': 1.6713, 'grad_norm': 2.6246745586395264, 'learning_rate': 8.102564102564103e-05, 'epoch': 0.65}


 65%|██████▌   | 147/225 [01:21<00:44,  1.77it/s]

{'loss': 1.4433, 'grad_norm': 2.330425262451172, 'learning_rate': 8e-05, 'epoch': 0.65}


 66%|██████▌   | 148/225 [01:21<00:40,  1.88it/s]

{'loss': 0.9656, 'grad_norm': 2.000417470932007, 'learning_rate': 7.897435897435898e-05, 'epoch': 0.66}


 66%|██████▌   | 149/225 [01:22<00:38,  1.98it/s]

{'loss': 0.7337, 'grad_norm': 1.912226915359497, 'learning_rate': 7.794871794871795e-05, 'epoch': 0.66}


 67%|██████▋   | 150/225 [01:22<00:37,  2.01it/s]

{'loss': 0.4316, 'grad_norm': 1.7964946031570435, 'learning_rate': 7.692307692307693e-05, 'epoch': 0.67}


 67%|██████▋   | 151/225 [01:23<00:37,  1.96it/s]

{'loss': 2.5697, 'grad_norm': 3.066607713699341, 'learning_rate': 7.58974358974359e-05, 'epoch': 0.67}


 68%|██████▊   | 152/225 [01:23<00:37,  1.95it/s]

{'loss': 2.185, 'grad_norm': 3.294806718826294, 'learning_rate': 7.487179487179487e-05, 'epoch': 0.68}


 68%|██████▊   | 153/225 [01:24<00:43,  1.64it/s]

{'loss': 2.2336, 'grad_norm': 3.4955456256866455, 'learning_rate': 7.384615384615386e-05, 'epoch': 0.68}


 68%|██████▊   | 154/225 [01:25<00:42,  1.65it/s]

{'loss': 2.4751, 'grad_norm': 3.211925745010376, 'learning_rate': 7.282051282051282e-05, 'epoch': 0.68}


 69%|██████▉   | 155/225 [01:25<00:43,  1.59it/s]

{'loss': 2.2729, 'grad_norm': 3.3489742279052734, 'learning_rate': 7.17948717948718e-05, 'epoch': 0.69}


 69%|██████▉   | 156/225 [01:26<00:40,  1.70it/s]

{'loss': 2.1913, 'grad_norm': 3.2532777786254883, 'learning_rate': 7.076923076923078e-05, 'epoch': 0.69}


 70%|██████▉   | 157/225 [01:26<00:39,  1.72it/s]

{'loss': 2.5306, 'grad_norm': 3.0144829750061035, 'learning_rate': 6.974358974358974e-05, 'epoch': 0.7}


 70%|███████   | 158/225 [01:27<00:36,  1.83it/s]

{'loss': 2.4876, 'grad_norm': 2.803189516067505, 'learning_rate': 6.871794871794872e-05, 'epoch': 0.7}


 71%|███████   | 159/225 [01:27<00:38,  1.73it/s]

{'loss': 2.4264, 'grad_norm': 3.0428903102874756, 'learning_rate': 6.76923076923077e-05, 'epoch': 0.71}


 71%|███████   | 160/225 [01:28<00:36,  1.79it/s]

{'loss': 2.0449, 'grad_norm': 2.5773067474365234, 'learning_rate': 6.666666666666667e-05, 'epoch': 0.71}


 72%|███████▏  | 161/225 [01:28<00:34,  1.85it/s]

{'loss': 2.1184, 'grad_norm': 2.4109323024749756, 'learning_rate': 6.564102564102564e-05, 'epoch': 0.72}


 72%|███████▏  | 162/225 [01:29<00:32,  1.91it/s]

{'loss': 2.1108, 'grad_norm': 2.542041540145874, 'learning_rate': 6.461538461538462e-05, 'epoch': 0.72}


 72%|███████▏  | 163/225 [01:29<00:32,  1.90it/s]

{'loss': 2.0371, 'grad_norm': 2.460308074951172, 'learning_rate': 6.358974358974359e-05, 'epoch': 0.72}


 73%|███████▎  | 164/225 [01:30<00:31,  1.96it/s]

{'loss': 2.1025, 'grad_norm': 2.4233367443084717, 'learning_rate': 6.256410256410256e-05, 'epoch': 0.73}


 73%|███████▎  | 165/225 [01:30<00:29,  2.03it/s]

{'loss': 2.2702, 'grad_norm': 2.687319278717041, 'learning_rate': 6.153846153846155e-05, 'epoch': 0.73}


 74%|███████▍  | 166/225 [01:31<00:28,  2.08it/s]

{'loss': 2.3406, 'grad_norm': 2.6941373348236084, 'learning_rate': 6.0512820512820515e-05, 'epoch': 0.74}


 74%|███████▍  | 167/225 [01:31<00:27,  2.12it/s]

{'loss': 2.0072, 'grad_norm': 2.4168541431427, 'learning_rate': 5.948717948717949e-05, 'epoch': 0.74}


 75%|███████▍  | 168/225 [01:32<00:26,  2.12it/s]

{'loss': 2.3206, 'grad_norm': 2.68969464302063, 'learning_rate': 5.846153846153847e-05, 'epoch': 0.75}


 75%|███████▌  | 169/225 [01:32<00:26,  2.09it/s]

{'loss': 2.2278, 'grad_norm': 2.8953158855438232, 'learning_rate': 5.7435897435897434e-05, 'epoch': 0.75}


 76%|███████▌  | 170/225 [01:33<00:25,  2.12it/s]

{'loss': 1.7806, 'grad_norm': 2.2524311542510986, 'learning_rate': 5.6410256410256414e-05, 'epoch': 0.76}


 76%|███████▌  | 171/225 [01:33<00:24,  2.17it/s]

{'loss': 2.0638, 'grad_norm': 2.51037335395813, 'learning_rate': 5.538461538461539e-05, 'epoch': 0.76}


 76%|███████▋  | 172/225 [01:34<00:24,  2.21it/s]

{'loss': 2.2295, 'grad_norm': 3.059086561203003, 'learning_rate': 5.435897435897436e-05, 'epoch': 0.76}


 77%|███████▋  | 173/225 [01:34<00:22,  2.26it/s]

{'loss': 1.9242, 'grad_norm': 2.6846139430999756, 'learning_rate': 5.333333333333333e-05, 'epoch': 0.77}


 77%|███████▋  | 174/225 [01:34<00:22,  2.27it/s]

{'loss': 2.4866, 'grad_norm': 2.8481881618499756, 'learning_rate': 5.230769230769231e-05, 'epoch': 0.77}


 78%|███████▊  | 175/225 [01:35<00:22,  2.25it/s]

{'loss': 2.0352, 'grad_norm': 2.736816644668579, 'learning_rate': 5.128205128205128e-05, 'epoch': 0.78}


 78%|███████▊  | 176/225 [01:35<00:21,  2.26it/s]

{'loss': 2.1316, 'grad_norm': 2.863690137863159, 'learning_rate': 5.025641025641026e-05, 'epoch': 0.78}


 79%|███████▊  | 177/225 [01:36<00:21,  2.27it/s]

{'loss': 1.8093, 'grad_norm': 2.249009370803833, 'learning_rate': 4.923076923076924e-05, 'epoch': 0.79}


 79%|███████▉  | 178/225 [01:36<00:22,  2.09it/s]

{'loss': 2.0125, 'grad_norm': 2.538602828979492, 'learning_rate': 4.8205128205128205e-05, 'epoch': 0.79}


 80%|███████▉  | 179/225 [01:37<00:21,  2.16it/s]

{'loss': 2.505, 'grad_norm': 2.8716187477111816, 'learning_rate': 4.717948717948718e-05, 'epoch': 0.8}


 80%|████████  | 180/225 [01:37<00:20,  2.20it/s]

{'loss': 2.1911, 'grad_norm': 3.0858304500579834, 'learning_rate': 4.615384615384616e-05, 'epoch': 0.8}


                                                 
 80%|████████  | 180/225 [01:43<00:20,  2.20it/s]

{'eval_loss': 1.9115700721740723, 'eval_runtime': 5.3514, 'eval_samples_per_second': 18.687, 'eval_steps_per_second': 9.343, 'epoch': 0.8}


 80%|████████  | 181/225 [01:43<01:30,  2.06s/it]

{'loss': 2.1255, 'grad_norm': 2.732377529144287, 'learning_rate': 4.512820512820513e-05, 'epoch': 0.8}


 81%|████████  | 182/225 [01:43<01:07,  1.58s/it]

{'loss': 1.8264, 'grad_norm': 2.347073554992676, 'learning_rate': 4.4102564102564104e-05, 'epoch': 0.81}


 81%|████████▏ | 183/225 [01:44<00:53,  1.26s/it]

{'loss': 1.9043, 'grad_norm': 2.573354721069336, 'learning_rate': 4.3076923076923084e-05, 'epoch': 0.81}


 82%|████████▏ | 184/225 [01:44<00:42,  1.03s/it]

{'loss': 2.5785, 'grad_norm': 3.1122119426727295, 'learning_rate': 4.205128205128206e-05, 'epoch': 0.82}


 82%|████████▏ | 185/225 [01:45<00:36,  1.10it/s]

{'loss': 2.179, 'grad_norm': 2.657276153564453, 'learning_rate': 4.1025641025641023e-05, 'epoch': 0.82}


 83%|████████▎ | 186/225 [01:46<00:31,  1.24it/s]

{'loss': 1.9842, 'grad_norm': 2.573106288909912, 'learning_rate': 4e-05, 'epoch': 0.83}


 83%|████████▎ | 187/225 [01:46<00:27,  1.40it/s]

{'loss': 2.2632, 'grad_norm': 2.6648833751678467, 'learning_rate': 3.8974358974358976e-05, 'epoch': 0.83}


 84%|████████▎ | 188/225 [01:47<00:23,  1.59it/s]

{'loss': 2.2676, 'grad_norm': 3.037755250930786, 'learning_rate': 3.794871794871795e-05, 'epoch': 0.84}


 84%|████████▍ | 189/225 [01:47<00:20,  1.74it/s]

{'loss': 2.0599, 'grad_norm': 2.7498974800109863, 'learning_rate': 3.692307692307693e-05, 'epoch': 0.84}


 84%|████████▍ | 190/225 [01:47<00:18,  1.85it/s]

{'loss': 1.7379, 'grad_norm': 2.4375195503234863, 'learning_rate': 3.58974358974359e-05, 'epoch': 0.84}


 85%|████████▍ | 191/225 [01:48<00:17,  1.96it/s]

{'loss': 1.7307, 'grad_norm': 2.70318341255188, 'learning_rate': 3.487179487179487e-05, 'epoch': 0.85}


 85%|████████▌ | 192/225 [01:48<00:16,  2.02it/s]

{'loss': 1.8129, 'grad_norm': 2.3969366550445557, 'learning_rate': 3.384615384615385e-05, 'epoch': 0.85}


 86%|████████▌ | 193/225 [01:49<00:16,  1.92it/s]

{'loss': 1.7692, 'grad_norm': 2.52899169921875, 'learning_rate': 3.282051282051282e-05, 'epoch': 0.86}


 86%|████████▌ | 194/225 [01:49<00:15,  1.98it/s]

{'loss': 1.282, 'grad_norm': 2.0372979640960693, 'learning_rate': 3.1794871794871795e-05, 'epoch': 0.86}


 87%|████████▋ | 195/225 [01:50<00:14,  2.03it/s]

{'loss': 1.1253, 'grad_norm': 1.9493626356124878, 'learning_rate': 3.0769230769230774e-05, 'epoch': 0.87}


 87%|████████▋ | 196/225 [01:50<00:14,  2.02it/s]

{'loss': 1.2879, 'grad_norm': 2.604106903076172, 'learning_rate': 2.9743589743589744e-05, 'epoch': 0.87}


 88%|████████▊ | 197/225 [01:51<00:14,  1.99it/s]

{'loss': 1.026, 'grad_norm': 1.832446813583374, 'learning_rate': 2.8717948717948717e-05, 'epoch': 0.88}


 88%|████████▊ | 198/225 [01:51<00:13,  2.01it/s]

{'loss': 1.2099, 'grad_norm': 2.030519723892212, 'learning_rate': 2.7692307692307694e-05, 'epoch': 0.88}


 88%|████████▊ | 199/225 [01:52<00:13,  1.97it/s]

{'loss': 0.8254, 'grad_norm': 1.7827389240264893, 'learning_rate': 2.6666666666666667e-05, 'epoch': 0.88}


 89%|████████▉ | 200/225 [01:52<00:13,  1.91it/s]

{'loss': 0.5223, 'grad_norm': 1.329677939414978, 'learning_rate': 2.564102564102564e-05, 'epoch': 0.89}


 89%|████████▉ | 201/225 [01:53<00:12,  1.93it/s]

{'loss': 2.2381, 'grad_norm': 2.393470525741577, 'learning_rate': 2.461538461538462e-05, 'epoch': 0.89}


 90%|████████▉ | 202/225 [01:53<00:11,  2.01it/s]

{'loss': 1.8711, 'grad_norm': 2.358468532562256, 'learning_rate': 2.358974358974359e-05, 'epoch': 0.9}


 90%|█████████ | 203/225 [01:54<00:10,  2.05it/s]

{'loss': 2.1539, 'grad_norm': 2.4652791023254395, 'learning_rate': 2.2564102564102566e-05, 'epoch': 0.9}


 91%|█████████ | 204/225 [01:54<00:09,  2.10it/s]

{'loss': 2.0138, 'grad_norm': 2.4912919998168945, 'learning_rate': 2.1538461538461542e-05, 'epoch': 0.91}


 91%|█████████ | 205/225 [01:55<00:09,  2.16it/s]

{'loss': 2.4935, 'grad_norm': 2.7654576301574707, 'learning_rate': 2.0512820512820512e-05, 'epoch': 0.91}


 92%|█████████▏| 206/225 [01:55<00:08,  2.15it/s]

{'loss': 2.2639, 'grad_norm': 2.565124750137329, 'learning_rate': 1.9487179487179488e-05, 'epoch': 0.92}


 92%|█████████▏| 207/225 [01:56<00:08,  2.13it/s]

{'loss': 2.1752, 'grad_norm': 2.345311403274536, 'learning_rate': 1.8461538461538465e-05, 'epoch': 0.92}


 92%|█████████▏| 208/225 [01:56<00:07,  2.16it/s]

{'loss': 2.4184, 'grad_norm': 2.8710737228393555, 'learning_rate': 1.7435897435897434e-05, 'epoch': 0.92}


 93%|█████████▎| 209/225 [01:57<00:07,  2.19it/s]

{'loss': 2.0086, 'grad_norm': 2.4346768856048584, 'learning_rate': 1.641025641025641e-05, 'epoch': 0.93}


 93%|█████████▎| 210/225 [01:57<00:06,  2.16it/s]

{'loss': 2.0961, 'grad_norm': 2.55196475982666, 'learning_rate': 1.5384615384615387e-05, 'epoch': 0.93}


 94%|█████████▍| 211/225 [01:58<00:06,  2.08it/s]

{'loss': 2.0382, 'grad_norm': 2.5030910968780518, 'learning_rate': 1.4358974358974359e-05, 'epoch': 0.94}


 94%|█████████▍| 212/225 [01:58<00:06,  2.13it/s]

{'loss': 2.0102, 'grad_norm': 2.606478214263916, 'learning_rate': 1.3333333333333333e-05, 'epoch': 0.94}


 95%|█████████▍| 213/225 [01:59<00:05,  2.15it/s]

{'loss': 2.1044, 'grad_norm': 2.8727974891662598, 'learning_rate': 1.230769230769231e-05, 'epoch': 0.95}


 95%|█████████▌| 214/225 [01:59<00:05,  2.18it/s]

{'loss': 1.6984, 'grad_norm': 2.683948040008545, 'learning_rate': 1.1282051282051283e-05, 'epoch': 0.95}


 96%|█████████▌| 215/225 [01:59<00:04,  2.24it/s]

{'loss': 1.523, 'grad_norm': 2.3887505531311035, 'learning_rate': 1.0256410256410256e-05, 'epoch': 0.96}


 96%|█████████▌| 216/225 [02:00<00:03,  2.26it/s]

{'loss': 2.1766, 'grad_norm': 2.4115684032440186, 'learning_rate': 9.230769230769232e-06, 'epoch': 0.96}


 96%|█████████▋| 217/225 [02:00<00:03,  2.26it/s]

{'loss': 1.9655, 'grad_norm': 2.464136838912964, 'learning_rate': 8.205128205128205e-06, 'epoch': 0.96}


 97%|█████████▋| 218/225 [02:01<00:03,  2.29it/s]

{'loss': 1.7949, 'grad_norm': 2.2716643810272217, 'learning_rate': 7.179487179487179e-06, 'epoch': 0.97}


 97%|█████████▋| 219/225 [02:01<00:02,  2.31it/s]

{'loss': 2.0947, 'grad_norm': 2.842414140701294, 'learning_rate': 6.153846153846155e-06, 'epoch': 0.97}


 98%|█████████▊| 220/225 [02:02<00:02,  2.34it/s]

{'loss': 1.5503, 'grad_norm': 2.259143114089966, 'learning_rate': 5.128205128205128e-06, 'epoch': 0.98}


 98%|█████████▊| 221/225 [02:02<00:01,  2.33it/s]

{'loss': 1.6834, 'grad_norm': 2.4698486328125, 'learning_rate': 4.102564102564103e-06, 'epoch': 0.98}


 99%|█████████▊| 222/225 [02:02<00:01,  2.30it/s]

{'loss': 1.6522, 'grad_norm': 2.320167303085327, 'learning_rate': 3.0769230769230774e-06, 'epoch': 0.99}


 99%|█████████▉| 223/225 [02:03<00:00,  2.29it/s]

{'loss': 1.0761, 'grad_norm': 1.8447184562683105, 'learning_rate': 2.0512820512820513e-06, 'epoch': 0.99}


100%|█████████▉| 224/225 [02:03<00:00,  2.31it/s]

{'loss': 0.9073, 'grad_norm': 1.8206480741500854, 'learning_rate': 1.0256410256410257e-06, 'epoch': 1.0}


100%|██████████| 225/225 [02:04<00:00,  2.33it/s]

{'loss': 1.1886, 'grad_norm': 2.2373785972595215, 'learning_rate': 0.0, 'epoch': 1.0}


                                                 
100%|██████████| 225/225 [02:08<00:00,  2.33it/s]

{'eval_loss': 1.8984450101852417, 'eval_runtime': 4.5482, 'eval_samples_per_second': 21.987, 'eval_steps_per_second': 10.993, 'epoch': 1.0}


100%|██████████| 225/225 [02:10<00:00,  1.73it/s]

{'train_runtime': 130.329, 'train_samples_per_second': 6.906, 'train_steps_per_second': 1.726, 'train_loss': 2.3128874338997734, 'epoch': 1.0}





TrainOutput(global_step=225, training_loss=2.3128874338997734, metrics={'train_runtime': 130.329, 'train_samples_per_second': 6.906, 'train_steps_per_second': 1.726, 'total_flos': 496856899252224.0, 'train_loss': 2.3128874338997734, 'epoch': 1.0})

# prepare the complete model

In [14]:
""" base_model_url = "google/gemma-2-2b-it"
new_model_url = "yunfan-y/Gemma-2-2b-it-trump-1000-adapter-quantized" """

In [15]:
""" from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from peft import PeftModel
import torch


# Reload tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(base_model_url)

base_model_reload= AutoModelForCausalLM.from_pretrained(
    base_model_url,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="cpu",
) """

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.46it/s]


In [16]:
""" model = PeftModel.from_pretrained(base_model_reload, new_model_url)
model = model.merge_and_unload() """

In [18]:
""" model.push_to_hub("yunfan-y/Gemma-2-2b-it-trump-1000-complete-quantized")
tokenizer.push_to_hub("yunfan-y/Gemma-2-2b-it-trump-1000-complete-quantized") """

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]
[A
model-00001-of-00002.safetensors:   0%|          | 1.67M/4.99G [00:00<09:17, 8.95MB/s] 
model-00001-of-00002.safetensors:   0%|          | 5.03M/4.99G [00:00<04:14, 19.6MB/s]
model-00001-of-00002.safetensors:   0%|          | 9.96M/4.99G [00:00<02:58, 27.9MB/s]
model-00001-of-00002.safetensors:   0%|          | 14.1M/4.99G [00:00<02:38, 31.4MB/s]
[A
model-00001-of-00002.safetensors:   1%|          | 30.9M/4.99G [00:00<01:51, 44.5MB/s]
[A
model-00001-of-00002.safetensors:   1%|          | 47.5M/4.99G [00:01<01:56, 42.3MB/s]
[A
model-00001-of-00002.safetensors:   1%|          | 57.7M/4.99G [00:01<02:19, 35.4MB/s]
[A
model-00001-of-00002.safetensors:   2%|▏         | 79.0M/4.99G [00:02<01:46, 45.9MB/s]
[A
model-00001-of-00002.safetensors:   2%|▏         | 97.0M/4.99G [00:02<02:44, 29.7MB/s]
model-00001-of-00002.safetensors:   2%|▏         | 103M/4.99G [00:03<02:15, 36.1MB/s] 
model-00001-of-00002.safete

CommitInfo(commit_url='https://huggingface.co/yunfan-y/Gemma-2-2b-it-trump-1000-complete-quantized/commit/707d79d05171772806489cb10163a29e006158dc', commit_message='Upload tokenizer', commit_description='', oid='707d79d05171772806489cb10163a29e006158dc', pr_url=None, pr_revision=None, pr_num=None)