In [1]:
import torch

# Sanity-check the GPU before any heavy model loading happens.
cuda_available = torch.cuda.is_available()
print(cuda_available)

if cuda_available:
    # Print the name of the first GPU (device index 0).
    print(torch.cuda.get_device_name(0))
else:
    # Querying device 0 fails when no CUDA device is usable, so report the
    # situation explicitly instead of letting this cell crash.
    print("No CUDA device detected")

True
NVIDIA GeForce RTX 3080


In [2]:
import torch
from unsloth import FastLanguageModel, to_sharegpt, standardize_sharegpt, apply_chat_template, is_bfloat16_supported
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments
# Catalogue of 4-bit pre-quantised checkpoints published by Unsloth
# (more at https://huggingface.co/unsloth). It is informational: no visible
# cell reads this list, the checkpoint actually loaded is named in the call below.
fourbit_models = [
    # Mistral
    "unsloth/mistral-7b-v0.3-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    # Llama-3
    "unsloth/llama-3-8b-bnb-4bit",
    "unsloth/llama-3-8b-Instruct-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
    # Phi-3
    "unsloth/Phi-3-mini-4k-instruct",
    "unsloth/Phi-3-medium-4k-instruct",
    # Older Mistral and Gemma
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",
]

# Loading options. `max_seq_length` is reused by the trainer cell further down.
load_in_4bit = True    # 4-bit quantisation to reduce memory usage; can be False
dtype = None           # None = auto-detect (float16 for Tesla T4/V100, bfloat16 for Ampere+)
max_seq_length = 1024  # free choice; the upstream template states RoPE scaling is handled internally

# Load the quantised Llama-3 8B Instruct checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token="hf_...",  # only needed for gated models such as meta-llama/Llama-2-7b-hf
)

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.9.post4: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA GeForce RTX 3080. Max memory: 9.753 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.27.post2. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [3]:
# LoRA hyper-parameters, gathered in one mapping so they can be inspected
# after the adapters have been attached.
lora_config = dict(
    r=16,                 # adapter rank; any value > 0 (suggested: 8, 16, 32, 64, 128)
    lora_alpha=16,
    lora_dropout=0,       # any value is supported; the template notes 0 is the optimised path
    bias="none",          # any value is supported; the template notes "none" is the optimised path
    # Attention projections followed by the MLP projections.
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    # True or "unsloth"; the template recommends "unsloth" for lower VRAM use
    # and very long contexts.
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,     # rank-stabilised LoRA is available but not used here
    loftq_config=None,    # LoftQ is available but not used here
)

# Wrap the base model with the LoRA adapters described above.
model = FastLanguageModel.get_peft_model(model, **lora_config)

Unsloth 2024.9.post4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [4]:
from datasets import load_dataset

# Load the CSV file as a single "train" split.
dataset = load_dataset('csv', data_files='./processed_dataset.csv', split='train')
print(dataset.column_names)

# Fail fast with a clear message if the columns that the following cells rely
# on are absent, rather than hitting a less obvious error during conversion.
required_columns = {"instruction", "output"}
missing_columns = required_columns - set(dataset.column_names)
assert not missing_columns, (
    f"processed_dataset.csv is missing required columns: {sorted(missing_columns)}"
)

['instruction', 'input', 'output']


In [5]:
# def combine_ingredients_and_directions(example):
#     ingredients = example['ingredients']
#     directions = example['directions']
#     # If these are already lists, join them as shown below
#     ingredients_text = "\n".join(ingredients)
#     directions_text = "\n".join(directions)
#     combined = f"Ingredients:\n{ingredients_text}\n\nDirections:\n{directions_text}"
#     return {'recipe_details': combined}

# dataset = dataset.map(combine_ingredients_and_directions)


In [6]:
from unsloth import to_sharegpt

# Convert the tabular rows into ShareGPT-style conversations: the prompt is
# built from `merged_prompt`, the reply is taken from `output_column_name`.
# NOTE(review): the prompt uses the `instruction` column only; the `input`
# column listed by the previous cell is not referenced here -- confirm that
# leaving it out of the prompt is intended.
dataset = to_sharegpt(
    dataset,
    merged_prompt="{instruction}",
    output_column_name="output",
    conversation_extension=0,  # Set to 0 if you don't want to extend conversations
)


Finally use `standardize_sharegpt` to fix up the dataset!

In [7]:
from unsloth import standardize_sharegpt
# Normalise the ShareGPT-style dataset before the chat template is applied.
# NOTE(review): the exact field changes happen inside Unsloth and are not
# visible in this notebook -- check the library docs if the schema matters.
dataset = standardize_sharegpt(dataset)

### Customizable Chat Templates

You also need to specify a chat template. Previously, you could use the Alpaca format; the cell below uses the Llama-3 instruct format instead.

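The issue is that the Alpaca format has 3 fields (instruction, input and output), whilst OpenAI-style chatbots must use only 2 fields (instruction and response). That's why we used the `to_sharegpt` function to merge the prompt columns into 1. Note that in this notebook the merged prompt is just `{instruction}`, so the `input` column is not part of the prompt.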

In [8]:

# Llama-3 instruct layout with three placeholders: {SYSTEM}, {INPUT} and
# {OUTPUT}. Everything inside the triple quotes is runtime data, so the blank
# lines and special tokens must stay exactly as written.
chat_template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{SYSTEM}<|eot_id|><|start_header_id|>user<|end_header_id|>

{INPUT}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{OUTPUT}<|eot_id|>"""

from unsloth import apply_chat_template
# Render every conversation through the template above. The trainer cell reads
# the rendered prompts from the "text" field (presumably written by this call --
# confirm against the installed Unsloth version).
dataset = apply_chat_template(
    dataset,
    tokenizer = tokenizer,
    chat_template = chat_template,
    # default_system_message = "You are a helpful assistant", << [OPTIONAL]
)

<a name="Train"></a>
### Train the model
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). This run trains for 8 full epochs (`num_train_epochs = 8`, with `max_steps` commented out); for a quick smoke test, set `max_steps = 60` instead. We also support TRL's `DPOTrainer`!

In [9]:

# Mixed precision: bf16 where the GPU supports it, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

# Optimiser, schedule and logging settings for the run.
training_args = TrainingArguments(
    output_dir = "outputs",
    seed = 3407,
    num_train_epochs = 8,             # full passes over the dataset
    # max_steps = 60,                 # alternative: cap the run by step count
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 4,  # 1 x 4 gives a total batch size of 4
    learning_rate = 5e-5,
    lr_scheduler_type = "linear",
    warmup_steps = 3,
    weight_decay = 0.1,
    optim = "adamw_8bit",
    bf16 = use_bf16,
    fp16 = not use_bf16,
    logging_steps = 1,                # log every optimiser step
)

# Supervised fine-tuning over the templated "text" field of the dataset.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 11,
    packing = False,  # per the upstream template, packing can speed up training on short sequences
    args = training_args,
)

In [10]:
# Start fine-tuning; the value returned by train() is kept in `trainer_stats`.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 100 | Num Epochs = 8
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 4
\        /    Total batch size = 4 | Total steps = 200
 "-____-"     Number of trainable parameters = 41,943,040
  0%|          | 1/200 [00:04<16:21,  4.93s/it]

{'loss': 2.3688, 'grad_norm': 1.4481136798858643, 'learning_rate': 1.6666666666666667e-05, 'epoch': 0.04}


  1%|          | 2/200 [00:09<16:10,  4.90s/it]

{'loss': 1.975, 'grad_norm': 0.9301963448524475, 'learning_rate': 3.3333333333333335e-05, 'epoch': 0.08}


  2%|▏         | 3/200 [00:13<14:12,  4.33s/it]

{'loss': 2.1547, 'grad_norm': 1.2948286533355713, 'learning_rate': 5e-05, 'epoch': 0.12}


  2%|▏         | 4/200 [00:16<12:45,  3.91s/it]

{'loss': 2.2491, 'grad_norm': 1.2411149740219116, 'learning_rate': 4.9746192893401014e-05, 'epoch': 0.16}


  2%|▎         | 5/200 [00:20<12:22,  3.81s/it]

{'loss': 2.2503, 'grad_norm': 0.8025173544883728, 'learning_rate': 4.949238578680203e-05, 'epoch': 0.2}


  3%|▎         | 6/200 [00:24<12:25,  3.84s/it]

{'loss': 2.0898, 'grad_norm': 0.7767582535743713, 'learning_rate': 4.9238578680203045e-05, 'epoch': 0.24}


  4%|▎         | 7/200 [00:28<13:02,  4.06s/it]

{'loss': 2.0142, 'grad_norm': 0.6827585101127625, 'learning_rate': 4.8984771573604064e-05, 'epoch': 0.28}


  4%|▍         | 8/200 [00:32<12:29,  3.90s/it]

{'loss': 2.1869, 'grad_norm': 0.8755104541778564, 'learning_rate': 4.873096446700508e-05, 'epoch': 0.32}


  4%|▍         | 9/200 [00:36<12:25,  3.90s/it]

{'loss': 1.9573, 'grad_norm': 0.8013163208961487, 'learning_rate': 4.8477157360406095e-05, 'epoch': 0.36}


  5%|▌         | 10/200 [00:39<12:01,  3.80s/it]

{'loss': 2.0025, 'grad_norm': 0.8165003657341003, 'learning_rate': 4.822335025380711e-05, 'epoch': 0.4}


  6%|▌         | 11/200 [00:43<12:05,  3.84s/it]

{'loss': 1.9974, 'grad_norm': 0.6946502923965454, 'learning_rate': 4.7969543147208126e-05, 'epoch': 0.44}


  6%|▌         | 12/200 [00:47<12:17,  3.92s/it]

{'loss': 1.8292, 'grad_norm': 0.648483395576477, 'learning_rate': 4.771573604060914e-05, 'epoch': 0.48}


  6%|▋         | 13/200 [00:51<11:53,  3.81s/it]

{'loss': 1.661, 'grad_norm': 0.7541108727455139, 'learning_rate': 4.746192893401015e-05, 'epoch': 0.52}


  7%|▋         | 14/200 [00:55<11:40,  3.77s/it]

{'loss': 1.7523, 'grad_norm': 0.7971314191818237, 'learning_rate': 4.7208121827411175e-05, 'epoch': 0.56}


  8%|▊         | 15/200 [00:58<11:25,  3.71s/it]

{'loss': 1.8546, 'grad_norm': 0.7856573462486267, 'learning_rate': 4.695431472081219e-05, 'epoch': 0.6}


  8%|▊         | 16/200 [01:02<11:08,  3.63s/it]

{'loss': 1.6988, 'grad_norm': 0.7986493706703186, 'learning_rate': 4.67005076142132e-05, 'epoch': 0.64}


  8%|▊         | 17/200 [01:06<11:33,  3.79s/it]

{'loss': 1.466, 'grad_norm': 0.6389121413230896, 'learning_rate': 4.644670050761422e-05, 'epoch': 0.68}


  9%|▉         | 18/200 [01:10<11:35,  3.82s/it]

{'loss': 1.5286, 'grad_norm': 0.7833544611930847, 'learning_rate': 4.619289340101523e-05, 'epoch': 0.72}


 10%|▉         | 19/200 [01:14<12:15,  4.06s/it]

{'loss': 1.5295, 'grad_norm': 0.7238926291465759, 'learning_rate': 4.593908629441624e-05, 'epoch': 0.76}


 10%|█         | 20/200 [01:18<11:27,  3.82s/it]

{'loss': 1.6256, 'grad_norm': 0.7429429292678833, 'learning_rate': 4.568527918781726e-05, 'epoch': 0.8}


 10%|█         | 21/200 [01:22<11:35,  3.89s/it]

{'loss': 1.5285, 'grad_norm': 0.5895832180976868, 'learning_rate': 4.543147208121827e-05, 'epoch': 0.84}


 11%|█         | 22/200 [01:26<11:39,  3.93s/it]

{'loss': 1.4969, 'grad_norm': 0.5955430865287781, 'learning_rate': 4.517766497461929e-05, 'epoch': 0.88}


 12%|█▏        | 23/200 [01:30<11:58,  4.06s/it]

{'loss': 1.3423, 'grad_norm': 0.5755224823951721, 'learning_rate': 4.492385786802031e-05, 'epoch': 0.92}


 12%|█▏        | 24/200 [01:34<11:43,  4.00s/it]

{'loss': 1.2516, 'grad_norm': 0.6407493352890015, 'learning_rate': 4.467005076142132e-05, 'epoch': 0.96}


 12%|█▎        | 25/200 [01:38<11:40,  4.00s/it]

{'loss': 1.4284, 'grad_norm': 0.6068877577781677, 'learning_rate': 4.4416243654822335e-05, 'epoch': 1.0}


 13%|█▎        | 26/200 [01:42<11:35,  4.00s/it]

{'loss': 1.4106, 'grad_norm': 0.6226232051849365, 'learning_rate': 4.416243654822335e-05, 'epoch': 1.04}


 14%|█▎        | 27/200 [01:46<11:57,  4.15s/it]

{'loss': 1.4009, 'grad_norm': 0.5519832372665405, 'learning_rate': 4.3908629441624365e-05, 'epoch': 1.08}


 14%|█▍        | 28/200 [01:50<11:36,  4.05s/it]

{'loss': 1.323, 'grad_norm': 0.5877220034599304, 'learning_rate': 4.365482233502538e-05, 'epoch': 1.12}


 14%|█▍        | 29/200 [01:54<11:22,  3.99s/it]

{'loss': 1.2077, 'grad_norm': 0.5660467743873596, 'learning_rate': 4.34010152284264e-05, 'epoch': 1.16}


 15%|█▌        | 30/200 [01:58<11:07,  3.93s/it]

{'loss': 1.38, 'grad_norm': 0.5591115951538086, 'learning_rate': 4.3147208121827415e-05, 'epoch': 1.2}


 16%|█▌        | 31/200 [02:02<11:20,  4.02s/it]

{'loss': 1.244, 'grad_norm': 0.4988080859184265, 'learning_rate': 4.289340101522843e-05, 'epoch': 1.24}


 16%|█▌        | 32/200 [02:06<11:00,  3.93s/it]

{'loss': 1.2876, 'grad_norm': 0.6202163100242615, 'learning_rate': 4.2639593908629446e-05, 'epoch': 1.28}


 16%|█▋        | 33/200 [02:09<10:25,  3.74s/it]

{'loss': 1.3128, 'grad_norm': 0.562144935131073, 'learning_rate': 4.238578680203046e-05, 'epoch': 1.32}


 17%|█▋        | 34/200 [02:13<10:51,  3.92s/it]

{'loss': 1.3454, 'grad_norm': 0.5672777891159058, 'learning_rate': 4.213197969543147e-05, 'epoch': 1.36}


 18%|█▊        | 35/200 [02:17<10:46,  3.92s/it]

{'loss': 1.1747, 'grad_norm': 0.6140874624252319, 'learning_rate': 4.187817258883249e-05, 'epoch': 1.4}


 18%|█▊        | 36/200 [02:21<10:53,  3.99s/it]

{'loss': 1.187, 'grad_norm': 0.548986554145813, 'learning_rate': 4.162436548223351e-05, 'epoch': 1.44}


 18%|█▊        | 37/200 [02:26<10:55,  4.02s/it]

{'loss': 1.2592, 'grad_norm': 0.5697670578956604, 'learning_rate': 4.137055837563452e-05, 'epoch': 1.48}


 19%|█▉        | 38/200 [02:29<10:28,  3.88s/it]

{'loss': 1.0923, 'grad_norm': 0.7835512757301331, 'learning_rate': 4.111675126903554e-05, 'epoch': 1.52}


 20%|█▉        | 39/200 [02:33<10:31,  3.93s/it]

{'loss': 1.1511, 'grad_norm': 0.5251314043998718, 'learning_rate': 4.086294416243655e-05, 'epoch': 1.56}


 20%|██        | 40/200 [02:37<10:44,  4.03s/it]

{'loss': 1.0322, 'grad_norm': 0.6188111305236816, 'learning_rate': 4.060913705583756e-05, 'epoch': 1.6}


 20%|██        | 41/200 [02:42<11:07,  4.20s/it]

{'loss': 1.0903, 'grad_norm': 0.5589545369148254, 'learning_rate': 4.035532994923858e-05, 'epoch': 1.64}


 21%|██        | 42/200 [02:46<10:47,  4.10s/it]

{'loss': 1.2202, 'grad_norm': 0.6648520231246948, 'learning_rate': 4.010152284263959e-05, 'epoch': 1.68}


 22%|██▏       | 43/200 [02:51<11:16,  4.31s/it]

{'loss': 1.1197, 'grad_norm': 0.5797799229621887, 'learning_rate': 3.9847715736040605e-05, 'epoch': 1.72}


 22%|██▏       | 44/200 [02:55<10:51,  4.17s/it]

{'loss': 1.1806, 'grad_norm': 0.6486351490020752, 'learning_rate': 3.959390862944163e-05, 'epoch': 1.76}


 22%|██▎       | 45/200 [02:58<10:22,  4.02s/it]

{'loss': 1.211, 'grad_norm': 0.6263433694839478, 'learning_rate': 3.934010152284264e-05, 'epoch': 1.8}


 23%|██▎       | 46/200 [03:02<10:29,  4.09s/it]

{'loss': 1.1087, 'grad_norm': 0.6271542310714722, 'learning_rate': 3.9086294416243655e-05, 'epoch': 1.84}


 24%|██▎       | 47/200 [03:06<10:14,  4.02s/it]

{'loss': 1.1586, 'grad_norm': 0.5994822978973389, 'learning_rate': 3.8832487309644673e-05, 'epoch': 1.88}


 24%|██▍       | 48/200 [03:10<10:10,  4.02s/it]

{'loss': 1.0532, 'grad_norm': 0.5994805097579956, 'learning_rate': 3.8578680203045685e-05, 'epoch': 1.92}


 24%|██▍       | 49/200 [03:14<10:13,  4.06s/it]

{'loss': 1.0803, 'grad_norm': 0.5790321230888367, 'learning_rate': 3.83248730964467e-05, 'epoch': 1.96}


 25%|██▌       | 50/200 [03:19<10:36,  4.24s/it]

{'loss': 1.1824, 'grad_norm': 0.5721548795700073, 'learning_rate': 3.8071065989847716e-05, 'epoch': 2.0}


 26%|██▌       | 51/200 [03:23<10:06,  4.07s/it]

{'loss': 1.1445, 'grad_norm': 0.6071585416793823, 'learning_rate': 3.7817258883248735e-05, 'epoch': 2.04}


 26%|██▌       | 52/200 [03:27<09:47,  3.97s/it]

{'loss': 1.0651, 'grad_norm': 0.5615150332450867, 'learning_rate': 3.756345177664975e-05, 'epoch': 2.08}


 26%|██▋       | 53/200 [03:31<09:58,  4.07s/it]

{'loss': 1.0597, 'grad_norm': 0.5884537100791931, 'learning_rate': 3.7309644670050766e-05, 'epoch': 2.12}


 27%|██▋       | 54/200 [03:35<10:14,  4.21s/it]

{'loss': 1.1398, 'grad_norm': 0.5172064304351807, 'learning_rate': 3.705583756345178e-05, 'epoch': 2.16}


 28%|██▊       | 55/200 [03:39<10:02,  4.16s/it]

{'loss': 0.9511, 'grad_norm': 0.609704315662384, 'learning_rate': 3.680203045685279e-05, 'epoch': 2.2}


 28%|██▊       | 56/200 [03:44<09:59,  4.16s/it]

{'loss': 1.0943, 'grad_norm': 0.6594793200492859, 'learning_rate': 3.654822335025381e-05, 'epoch': 2.24}


 28%|██▊       | 57/200 [03:48<10:08,  4.25s/it]

{'loss': 0.99, 'grad_norm': 0.6638941168785095, 'learning_rate': 3.629441624365482e-05, 'epoch': 2.28}


 29%|██▉       | 58/200 [03:52<10:01,  4.24s/it]

{'loss': 1.0179, 'grad_norm': 0.592382550239563, 'learning_rate': 3.604060913705584e-05, 'epoch': 2.32}


 30%|██▉       | 59/200 [03:56<09:49,  4.18s/it]

{'loss': 1.2746, 'grad_norm': 0.7304483652114868, 'learning_rate': 3.578680203045686e-05, 'epoch': 2.36}


 30%|███       | 60/200 [04:00<09:35,  4.11s/it]

{'loss': 0.947, 'grad_norm': 0.6522650718688965, 'learning_rate': 3.553299492385787e-05, 'epoch': 2.4}


 30%|███       | 61/200 [04:05<09:40,  4.18s/it]

{'loss': 1.2559, 'grad_norm': 0.5970959067344666, 'learning_rate': 3.527918781725888e-05, 'epoch': 2.44}


 31%|███       | 62/200 [04:09<09:43,  4.23s/it]

{'loss': 0.9796, 'grad_norm': 0.5688014030456543, 'learning_rate': 3.50253807106599e-05, 'epoch': 2.48}


 32%|███▏      | 63/200 [04:13<09:14,  4.05s/it]

{'loss': 1.1273, 'grad_norm': 0.7650243639945984, 'learning_rate': 3.477157360406091e-05, 'epoch': 2.52}


 32%|███▏      | 64/200 [04:17<09:16,  4.09s/it]

{'loss': 1.1117, 'grad_norm': 0.6489861011505127, 'learning_rate': 3.451776649746193e-05, 'epoch': 2.56}


 32%|███▎      | 65/200 [04:21<09:21,  4.16s/it]

{'loss': 1.2054, 'grad_norm': 0.6603266000747681, 'learning_rate': 3.4263959390862944e-05, 'epoch': 2.6}


 33%|███▎      | 66/200 [04:25<09:26,  4.23s/it]

{'loss': 0.9133, 'grad_norm': 0.6364155411720276, 'learning_rate': 3.401015228426396e-05, 'epoch': 2.64}


 34%|███▎      | 67/200 [04:29<08:57,  4.04s/it]

{'loss': 0.9474, 'grad_norm': 0.7313272356987, 'learning_rate': 3.3756345177664975e-05, 'epoch': 2.68}


 34%|███▍      | 68/200 [04:33<08:50,  4.02s/it]

{'loss': 0.93, 'grad_norm': 0.7567568421363831, 'learning_rate': 3.3502538071065994e-05, 'epoch': 2.72}


 34%|███▍      | 69/200 [04:38<09:29,  4.35s/it]

{'loss': 1.1142, 'grad_norm': 0.5607686042785645, 'learning_rate': 3.3248730964467006e-05, 'epoch': 2.76}


 35%|███▌      | 70/200 [04:42<09:08,  4.22s/it]

{'loss': 1.113, 'grad_norm': 0.7910283803939819, 'learning_rate': 3.299492385786802e-05, 'epoch': 2.8}


 36%|███▌      | 71/200 [04:46<08:39,  4.03s/it]

{'loss': 0.9857, 'grad_norm': 0.7181195616722107, 'learning_rate': 3.2741116751269036e-05, 'epoch': 2.84}


 36%|███▌      | 72/200 [04:50<09:01,  4.23s/it]

{'loss': 1.0967, 'grad_norm': 0.5967609882354736, 'learning_rate': 3.248730964467005e-05, 'epoch': 2.88}


 36%|███▋      | 73/200 [04:55<09:05,  4.30s/it]

{'loss': 0.8934, 'grad_norm': 0.6537215113639832, 'learning_rate': 3.223350253807107e-05, 'epoch': 2.92}


 37%|███▋      | 74/200 [04:59<09:00,  4.29s/it]

{'loss': 0.9586, 'grad_norm': 0.606509268283844, 'learning_rate': 3.1979695431472086e-05, 'epoch': 2.96}


 38%|███▊      | 75/200 [05:04<09:19,  4.48s/it]

{'loss': 1.0272, 'grad_norm': 0.57746821641922, 'learning_rate': 3.17258883248731e-05, 'epoch': 3.0}


 38%|███▊      | 76/200 [05:08<09:06,  4.41s/it]

{'loss': 0.9047, 'grad_norm': 0.6266606450080872, 'learning_rate': 3.147208121827411e-05, 'epoch': 3.04}


 38%|███▊      | 77/200 [05:12<08:54,  4.34s/it]

{'loss': 1.0786, 'grad_norm': 0.6473752856254578, 'learning_rate': 3.121827411167513e-05, 'epoch': 3.08}


 39%|███▉      | 78/200 [05:17<08:43,  4.29s/it]

{'loss': 1.0401, 'grad_norm': 0.6624700427055359, 'learning_rate': 3.096446700507614e-05, 'epoch': 3.12}


 40%|███▉      | 79/200 [05:20<08:26,  4.18s/it]

{'loss': 1.004, 'grad_norm': 0.6753441691398621, 'learning_rate': 3.071065989847716e-05, 'epoch': 3.16}


 40%|████      | 80/200 [05:24<08:03,  4.03s/it]

{'loss': 1.1645, 'grad_norm': 0.7509781718254089, 'learning_rate': 3.0456852791878175e-05, 'epoch': 3.2}


 40%|████      | 81/200 [05:28<07:59,  4.03s/it]

{'loss': 0.8913, 'grad_norm': 0.6335068941116333, 'learning_rate': 3.020304568527919e-05, 'epoch': 3.24}


 41%|████      | 82/200 [05:32<08:01,  4.08s/it]

{'loss': 0.9918, 'grad_norm': 0.7492111325263977, 'learning_rate': 2.9949238578680206e-05, 'epoch': 3.28}


 42%|████▏     | 83/200 [05:37<08:24,  4.31s/it]

{'loss': 0.8711, 'grad_norm': 0.6632081866264343, 'learning_rate': 2.969543147208122e-05, 'epoch': 3.32}


 42%|████▏     | 84/200 [05:42<08:42,  4.50s/it]

{'loss': 1.0129, 'grad_norm': 0.5614109635353088, 'learning_rate': 2.9441624365482233e-05, 'epoch': 3.36}


 42%|████▎     | 85/200 [05:46<08:23,  4.38s/it]

{'loss': 0.9839, 'grad_norm': 0.7136111259460449, 'learning_rate': 2.918781725888325e-05, 'epoch': 3.4}


 43%|████▎     | 86/200 [05:50<07:49,  4.12s/it]

{'loss': 0.8773, 'grad_norm': 0.9151856303215027, 'learning_rate': 2.8934010152284264e-05, 'epoch': 3.44}


 44%|████▎     | 87/200 [05:54<07:54,  4.20s/it]

{'loss': 1.2024, 'grad_norm': 0.6549550890922546, 'learning_rate': 2.8680203045685283e-05, 'epoch': 3.48}


 44%|████▍     | 88/200 [05:58<07:49,  4.19s/it]

{'loss': 1.1162, 'grad_norm': 0.6964472532272339, 'learning_rate': 2.84263959390863e-05, 'epoch': 3.52}


 44%|████▍     | 89/200 [06:02<07:16,  3.93s/it]

{'loss': 0.944, 'grad_norm': 0.738360583782196, 'learning_rate': 2.8172588832487314e-05, 'epoch': 3.56}


 45%|████▌     | 90/200 [06:06<07:35,  4.14s/it]

{'loss': 1.0139, 'grad_norm': 0.7317962646484375, 'learning_rate': 2.7918781725888326e-05, 'epoch': 3.6}


 46%|████▌     | 91/200 [06:10<07:19,  4.03s/it]

{'loss': 0.874, 'grad_norm': 0.7413985133171082, 'learning_rate': 2.766497461928934e-05, 'epoch': 3.64}


 46%|████▌     | 92/200 [06:14<07:19,  4.07s/it]

{'loss': 0.8846, 'grad_norm': 0.7687948346138, 'learning_rate': 2.7411167512690357e-05, 'epoch': 3.68}


 46%|████▋     | 93/200 [06:18<07:15,  4.07s/it]

{'loss': 0.8476, 'grad_norm': 0.7743198275566101, 'learning_rate': 2.715736040609137e-05, 'epoch': 3.72}


 47%|████▋     | 94/200 [06:23<07:21,  4.17s/it]

{'loss': 1.1272, 'grad_norm': 0.7770807147026062, 'learning_rate': 2.6903553299492384e-05, 'epoch': 3.76}


 48%|████▊     | 95/200 [06:27<07:26,  4.25s/it]

{'loss': 0.9086, 'grad_norm': 0.6916408538818359, 'learning_rate': 2.6649746192893406e-05, 'epoch': 3.8}


 48%|████▊     | 96/200 [06:31<07:11,  4.15s/it]

{'loss': 0.8886, 'grad_norm': 0.760719358921051, 'learning_rate': 2.6395939086294418e-05, 'epoch': 3.84}


 48%|████▊     | 97/200 [06:35<07:00,  4.08s/it]

{'loss': 0.8846, 'grad_norm': 0.8413525223731995, 'learning_rate': 2.6142131979695434e-05, 'epoch': 3.88}


 49%|████▉     | 98/200 [06:38<06:37,  3.90s/it]

{'loss': 0.9133, 'grad_norm': 0.9129456281661987, 'learning_rate': 2.588832487309645e-05, 'epoch': 3.92}


 50%|████▉     | 99/200 [06:42<06:30,  3.86s/it]

{'loss': 0.9349, 'grad_norm': 0.8170490860939026, 'learning_rate': 2.563451776649746e-05, 'epoch': 3.96}


 50%|█████     | 100/200 [06:46<06:35,  3.95s/it]

{'loss': 0.889, 'grad_norm': 0.8323699831962585, 'learning_rate': 2.5380710659898476e-05, 'epoch': 4.0}


 50%|█████     | 101/200 [06:51<06:38,  4.03s/it]

{'loss': 0.9528, 'grad_norm': 0.7409740686416626, 'learning_rate': 2.5126903553299492e-05, 'epoch': 4.04}


 51%|█████     | 102/200 [06:55<06:45,  4.14s/it]

{'loss': 0.7742, 'grad_norm': 0.7591119408607483, 'learning_rate': 2.4873096446700507e-05, 'epoch': 4.08}


 52%|█████▏    | 103/200 [06:59<06:35,  4.08s/it]

{'loss': 1.0301, 'grad_norm': 0.736475944519043, 'learning_rate': 2.4619289340101523e-05, 'epoch': 4.12}


 52%|█████▏    | 104/200 [07:03<06:40,  4.17s/it]

{'loss': 1.0756, 'grad_norm': 0.6631845235824585, 'learning_rate': 2.436548223350254e-05, 'epoch': 4.16}


 52%|█████▎    | 105/200 [07:07<06:31,  4.12s/it]

{'loss': 0.8334, 'grad_norm': 0.7490889430046082, 'learning_rate': 2.4111675126903553e-05, 'epoch': 4.2}


 53%|█████▎    | 106/200 [07:11<06:05,  3.89s/it]

{'loss': 1.0341, 'grad_norm': 0.8322848081588745, 'learning_rate': 2.385786802030457e-05, 'epoch': 4.24}


 54%|█████▎    | 107/200 [07:15<06:06,  3.94s/it]

{'loss': 0.9521, 'grad_norm': 0.7254233360290527, 'learning_rate': 2.3604060913705588e-05, 'epoch': 4.28}


 54%|█████▍    | 108/200 [07:19<06:16,  4.10s/it]

{'loss': 0.9198, 'grad_norm': 0.7570286393165588, 'learning_rate': 2.33502538071066e-05, 'epoch': 4.32}


 55%|█████▍    | 109/200 [07:23<05:54,  3.90s/it]

{'loss': 0.9069, 'grad_norm': 0.8489334583282471, 'learning_rate': 2.3096446700507615e-05, 'epoch': 4.36}


 55%|█████▌    | 110/200 [07:26<05:47,  3.86s/it]

{'loss': 0.8465, 'grad_norm': 0.8612654209136963, 'learning_rate': 2.284263959390863e-05, 'epoch': 4.4}


 56%|█████▌    | 111/200 [07:31<06:02,  4.07s/it]

{'loss': 0.8608, 'grad_norm': 0.8080772161483765, 'learning_rate': 2.2588832487309646e-05, 'epoch': 4.44}


 56%|█████▌    | 112/200 [07:36<06:16,  4.28s/it]

{'loss': 0.8846, 'grad_norm': 0.8493384718894958, 'learning_rate': 2.233502538071066e-05, 'epoch': 4.48}


 56%|█████▋    | 113/200 [07:40<06:10,  4.26s/it]

{'loss': 0.876, 'grad_norm': 0.8670037984848022, 'learning_rate': 2.2081218274111677e-05, 'epoch': 4.52}


 57%|█████▋    | 114/200 [07:43<05:37,  3.92s/it]

{'loss': 0.8806, 'grad_norm': 0.9655636548995972, 'learning_rate': 2.182741116751269e-05, 'epoch': 4.56}


 57%|█████▊    | 115/200 [07:46<05:15,  3.71s/it]

{'loss': 0.7288, 'grad_norm': 0.9164541363716125, 'learning_rate': 2.1573604060913707e-05, 'epoch': 4.6}


 58%|█████▊    | 116/200 [07:50<05:11,  3.71s/it]

{'loss': 0.7806, 'grad_norm': 0.8369557857513428, 'learning_rate': 2.1319796954314723e-05, 'epoch': 4.64}


 58%|█████▊    | 117/200 [07:54<05:10,  3.74s/it]

{'loss': 0.9153, 'grad_norm': 0.8768278956413269, 'learning_rate': 2.1065989847715735e-05, 'epoch': 4.68}


 59%|█████▉    | 118/200 [07:58<05:08,  3.76s/it]

{'loss': 0.8231, 'grad_norm': 0.7778289914131165, 'learning_rate': 2.0812182741116754e-05, 'epoch': 4.72}


 60%|█████▉    | 119/200 [08:01<05:07,  3.80s/it]

{'loss': 0.8915, 'grad_norm': 0.9076658487319946, 'learning_rate': 2.055837563451777e-05, 'epoch': 4.76}


 60%|██████    | 120/200 [08:06<05:25,  4.07s/it]

{'loss': 1.008, 'grad_norm': 0.7431990504264832, 'learning_rate': 2.030456852791878e-05, 'epoch': 4.8}


 60%|██████    | 121/200 [08:10<05:14,  3.98s/it]

{'loss': 0.773, 'grad_norm': 0.917377233505249, 'learning_rate': 2.0050761421319797e-05, 'epoch': 4.84}


 61%|██████    | 122/200 [08:14<05:11,  4.00s/it]

{'loss': 0.8627, 'grad_norm': 0.9169129729270935, 'learning_rate': 1.9796954314720815e-05, 'epoch': 4.88}


 62%|██████▏   | 123/200 [08:18<05:11,  4.04s/it]

{'loss': 0.9551, 'grad_norm': 0.7934556603431702, 'learning_rate': 1.9543147208121827e-05, 'epoch': 4.92}


 62%|██████▏   | 124/200 [08:22<05:14,  4.13s/it]

{'loss': 0.9806, 'grad_norm': 0.8718279600143433, 'learning_rate': 1.9289340101522843e-05, 'epoch': 4.96}


 62%|██████▎   | 125/200 [08:27<05:09,  4.13s/it]

{'loss': 1.033, 'grad_norm': 0.8986548185348511, 'learning_rate': 1.9035532994923858e-05, 'epoch': 5.0}


 63%|██████▎   | 126/200 [08:31<05:04,  4.12s/it]

{'loss': 0.8781, 'grad_norm': 0.808880090713501, 'learning_rate': 1.8781725888324874e-05, 'epoch': 5.04}


 64%|██████▎   | 127/200 [08:35<05:06,  4.20s/it]

{'loss': 0.847, 'grad_norm': 0.8137715458869934, 'learning_rate': 1.852791878172589e-05, 'epoch': 5.08}


 64%|██████▍   | 128/200 [08:39<04:58,  4.15s/it]

{'loss': 0.9302, 'grad_norm': 0.8446947336196899, 'learning_rate': 1.8274111675126904e-05, 'epoch': 5.12}


 64%|██████▍   | 129/200 [08:43<04:45,  4.02s/it]

{'loss': 0.8559, 'grad_norm': 0.9189485907554626, 'learning_rate': 1.802030456852792e-05, 'epoch': 5.16}


 65%|██████▌   | 130/200 [08:47<04:37,  3.96s/it]

{'loss': 0.8731, 'grad_norm': 0.9100749492645264, 'learning_rate': 1.7766497461928935e-05, 'epoch': 5.2}


 66%|██████▌   | 131/200 [08:50<04:28,  3.89s/it]

{'loss': 0.7608, 'grad_norm': 0.8848920464515686, 'learning_rate': 1.751269035532995e-05, 'epoch': 5.24}


 66%|██████▌   | 132/200 [08:55<04:35,  4.05s/it]

{'loss': 0.7774, 'grad_norm': 0.8605296611785889, 'learning_rate': 1.7258883248730966e-05, 'epoch': 5.28}


 66%|██████▋   | 133/200 [08:59<04:30,  4.04s/it]

{'loss': 0.8837, 'grad_norm': 0.959255039691925, 'learning_rate': 1.700507614213198e-05, 'epoch': 5.32}


 67%|██████▋   | 134/200 [09:02<04:15,  3.87s/it]

{'loss': 0.6973, 'grad_norm': 0.9707302451133728, 'learning_rate': 1.6751269035532997e-05, 'epoch': 5.36}


 68%|██████▊   | 135/200 [09:06<04:15,  3.94s/it]

{'loss': 0.8358, 'grad_norm': 0.9063658714294434, 'learning_rate': 1.649746192893401e-05, 'epoch': 5.4}


 68%|██████▊   | 136/200 [09:10<04:10,  3.91s/it]

{'loss': 0.9474, 'grad_norm': 1.0067232847213745, 'learning_rate': 1.6243654822335024e-05, 'epoch': 5.44}


 68%|██████▊   | 137/200 [09:14<03:58,  3.78s/it]

{'loss': 0.7539, 'grad_norm': 1.0354865789413452, 'learning_rate': 1.5989847715736043e-05, 'epoch': 5.48}


 69%|██████▉   | 138/200 [09:18<04:08,  4.01s/it]

{'loss': 0.8198, 'grad_norm': 0.9442199468612671, 'learning_rate': 1.5736040609137055e-05, 'epoch': 5.52}


 70%|██████▉   | 139/200 [09:22<04:02,  3.97s/it]

{'loss': 0.8012, 'grad_norm': 0.9858072400093079, 'learning_rate': 1.548223350253807e-05, 'epoch': 5.56}


 70%|███████   | 140/200 [09:26<04:00,  4.02s/it]

{'loss': 0.8559, 'grad_norm': 0.9930734634399414, 'learning_rate': 1.5228426395939088e-05, 'epoch': 5.6}


 70%|███████   | 141/200 [09:31<04:07,  4.20s/it]

{'loss': 1.0286, 'grad_norm': 0.9061411619186401, 'learning_rate': 1.4974619289340103e-05, 'epoch': 5.64}


 71%|███████   | 142/200 [09:35<04:06,  4.25s/it]

{'loss': 1.0659, 'grad_norm': 0.9593456983566284, 'learning_rate': 1.4720812182741117e-05, 'epoch': 5.68}


 72%|███████▏  | 143/200 [09:40<04:04,  4.30s/it]

{'loss': 0.7531, 'grad_norm': 0.8636348843574524, 'learning_rate': 1.4467005076142132e-05, 'epoch': 5.72}


 72%|███████▏  | 144/200 [09:44<04:04,  4.37s/it]

{'loss': 0.7896, 'grad_norm': 0.9521132707595825, 'learning_rate': 1.421319796954315e-05, 'epoch': 5.76}


 72%|███████▎  | 145/200 [09:48<03:44,  4.09s/it]

{'loss': 0.6575, 'grad_norm': 1.0276728868484497, 'learning_rate': 1.3959390862944163e-05, 'epoch': 5.8}


 73%|███████▎  | 146/200 [09:52<03:38,  4.05s/it]

{'loss': 1.0962, 'grad_norm': 0.9368217587471008, 'learning_rate': 1.3705583756345178e-05, 'epoch': 5.84}


 74%|███████▎  | 147/200 [09:55<03:27,  3.91s/it]

{'loss': 0.7855, 'grad_norm': 1.084452509880066, 'learning_rate': 1.3451776649746192e-05, 'epoch': 5.88}


 74%|███████▍  | 148/200 [10:00<03:30,  4.04s/it]

{'loss': 0.9774, 'grad_norm': 0.8818024396896362, 'learning_rate': 1.3197969543147209e-05, 'epoch': 5.92}


 74%|███████▍  | 149/200 [10:04<03:32,  4.17s/it]

{'loss': 0.707, 'grad_norm': 0.8749522566795349, 'learning_rate': 1.2944162436548224e-05, 'epoch': 5.96}


 75%|███████▌  | 150/200 [10:07<03:14,  3.90s/it]

{'loss': 0.7544, 'grad_norm': 0.9422093629837036, 'learning_rate': 1.2690355329949238e-05, 'epoch': 6.0}


 76%|███████▌  | 151/200 [10:11<03:08,  3.84s/it]

{'loss': 0.8015, 'grad_norm': 1.039057970046997, 'learning_rate': 1.2436548223350254e-05, 'epoch': 6.04}


 76%|███████▌  | 152/200 [10:15<03:12,  4.02s/it]

{'loss': 0.8438, 'grad_norm': 0.9176073670387268, 'learning_rate': 1.218274111675127e-05, 'epoch': 6.08}


 76%|███████▋  | 153/200 [10:20<03:10,  4.05s/it]

{'loss': 0.9033, 'grad_norm': 0.873494029045105, 'learning_rate': 1.1928934010152284e-05, 'epoch': 6.12}


 77%|███████▋  | 154/200 [10:23<02:59,  3.90s/it]

{'loss': 0.8116, 'grad_norm': 1.0371967554092407, 'learning_rate': 1.16751269035533e-05, 'epoch': 6.16}


 78%|███████▊  | 155/200 [10:27<02:54,  3.87s/it]

{'loss': 1.0085, 'grad_norm': 0.986423134803772, 'learning_rate': 1.1421319796954315e-05, 'epoch': 6.2}


 78%|███████▊  | 156/200 [10:31<02:49,  3.86s/it]

{'loss': 0.7968, 'grad_norm': 1.0411649942398071, 'learning_rate': 1.116751269035533e-05, 'epoch': 6.24}


 78%|███████▊  | 157/200 [10:35<02:46,  3.88s/it]

{'loss': 0.9009, 'grad_norm': 1.0438507795333862, 'learning_rate': 1.0913705583756344e-05, 'epoch': 6.28}


 79%|███████▉  | 158/200 [10:38<02:36,  3.73s/it]

{'loss': 0.8447, 'grad_norm': 1.124955654144287, 'learning_rate': 1.0659898477157361e-05, 'epoch': 6.32}


 80%|███████▉  | 159/200 [10:42<02:37,  3.83s/it]

{'loss': 0.7518, 'grad_norm': 0.9504971504211426, 'learning_rate': 1.0406091370558377e-05, 'epoch': 6.36}


 80%|████████  | 160/200 [10:46<02:28,  3.72s/it]

{'loss': 0.8094, 'grad_norm': 1.2302680015563965, 'learning_rate': 1.015228426395939e-05, 'epoch': 6.4}


 80%|████████  | 161/200 [10:49<02:24,  3.70s/it]

{'loss': 0.6448, 'grad_norm': 1.1065458059310913, 'learning_rate': 9.898477157360408e-06, 'epoch': 6.44}


 81%|████████  | 162/200 [10:53<02:21,  3.73s/it]

{'loss': 0.6841, 'grad_norm': 1.1706867218017578, 'learning_rate': 9.644670050761421e-06, 'epoch': 6.48}


 82%|████████▏ | 163/200 [10:57<02:19,  3.77s/it]

{'loss': 0.7724, 'grad_norm': 1.1591148376464844, 'learning_rate': 9.390862944162437e-06, 'epoch': 6.52}


 82%|████████▏ | 164/200 [11:02<02:26,  4.07s/it]

{'loss': 0.8474, 'grad_norm': 1.0486429929733276, 'learning_rate': 9.137055837563452e-06, 'epoch': 6.56}


 82%|████████▎ | 165/200 [11:06<02:26,  4.20s/it]

{'loss': 0.8238, 'grad_norm': 1.0633299350738525, 'learning_rate': 8.883248730964468e-06, 'epoch': 6.6}


 83%|████████▎ | 166/200 [11:11<02:26,  4.30s/it]

{'loss': 0.8317, 'grad_norm': 1.0972394943237305, 'learning_rate': 8.629441624365483e-06, 'epoch': 6.64}


 84%|████████▎ | 167/200 [11:15<02:23,  4.35s/it]

{'loss': 0.9335, 'grad_norm': 1.015215277671814, 'learning_rate': 8.375634517766498e-06, 'epoch': 6.68}


 84%|████████▍ | 168/200 [11:19<02:16,  4.27s/it]

{'loss': 0.7367, 'grad_norm': 1.0644707679748535, 'learning_rate': 8.121827411167512e-06, 'epoch': 6.72}


 84%|████████▍ | 169/200 [11:23<02:10,  4.23s/it]

{'loss': 0.7049, 'grad_norm': 1.0272339582443237, 'learning_rate': 7.868020304568528e-06, 'epoch': 6.76}


 85%|████████▌ | 170/200 [11:27<02:05,  4.20s/it]

{'loss': 0.8307, 'grad_norm': 1.13486909866333, 'learning_rate': 7.614213197969544e-06, 'epoch': 6.8}


 86%|████████▌ | 171/200 [11:31<01:59,  4.12s/it]

{'loss': 0.7447, 'grad_norm': 1.029239535331726, 'learning_rate': 7.360406091370558e-06, 'epoch': 6.84}


 86%|████████▌ | 172/200 [11:35<01:49,  3.91s/it]

{'loss': 0.6598, 'grad_norm': 1.2136588096618652, 'learning_rate': 7.106598984771575e-06, 'epoch': 6.88}


 86%|████████▋ | 173/200 [11:39<01:49,  4.04s/it]

{'loss': 0.7869, 'grad_norm': 1.039285659790039, 'learning_rate': 6.852791878172589e-06, 'epoch': 6.92}


 87%|████████▋ | 174/200 [11:43<01:46,  4.10s/it]

{'loss': 0.8488, 'grad_norm': 0.9990047812461853, 'learning_rate': 6.5989847715736045e-06, 'epoch': 6.96}


 88%|████████▊ | 175/200 [11:48<01:43,  4.13s/it]

{'loss': 0.6172, 'grad_norm': 1.0518878698349, 'learning_rate': 6.345177664974619e-06, 'epoch': 7.0}


 88%|████████▊ | 176/200 [11:52<01:39,  4.15s/it]

{'loss': 0.8286, 'grad_norm': 0.9841503500938416, 'learning_rate': 6.091370558375635e-06, 'epoch': 7.04}


 88%|████████▊ | 177/200 [11:56<01:36,  4.18s/it]

{'loss': 0.7728, 'grad_norm': 1.0599772930145264, 'learning_rate': 5.83756345177665e-06, 'epoch': 7.08}


 89%|████████▉ | 178/200 [12:01<01:34,  4.29s/it]

{'loss': 0.7879, 'grad_norm': 0.9695572257041931, 'learning_rate': 5.583756345177665e-06, 'epoch': 7.12}


 90%|████████▉ | 179/200 [12:04<01:26,  4.13s/it]

{'loss': 0.6487, 'grad_norm': 1.1052106618881226, 'learning_rate': 5.329949238578681e-06, 'epoch': 7.16}


 90%|█████████ | 180/200 [12:08<01:21,  4.10s/it]

{'loss': 0.9081, 'grad_norm': 1.039491057395935, 'learning_rate': 5.076142131979695e-06, 'epoch': 7.2}


 90%|█████████ | 181/200 [12:12<01:14,  3.90s/it]

{'loss': 0.795, 'grad_norm': 1.090205430984497, 'learning_rate': 4.822335025380711e-06, 'epoch': 7.24}


 91%|█████████ | 182/200 [12:16<01:10,  3.90s/it]

{'loss': 0.7024, 'grad_norm': 1.048060417175293, 'learning_rate': 4.568527918781726e-06, 'epoch': 7.28}


 92%|█████████▏| 183/200 [12:20<01:06,  3.91s/it]

{'loss': 0.6471, 'grad_norm': 1.094113826751709, 'learning_rate': 4.3147208121827415e-06, 'epoch': 7.32}


 92%|█████████▏| 184/200 [12:24<01:04,  4.00s/it]

{'loss': 0.9505, 'grad_norm': 1.047101616859436, 'learning_rate': 4.060913705583756e-06, 'epoch': 7.36}


 92%|█████████▎| 185/200 [12:28<01:02,  4.14s/it]

{'loss': 0.7921, 'grad_norm': 1.1243816614151, 'learning_rate': 3.807106598984772e-06, 'epoch': 7.4}


 93%|█████████▎| 186/200 [12:32<00:55,  3.95s/it]

{'loss': 0.6859, 'grad_norm': 1.2385632991790771, 'learning_rate': 3.5532994923857873e-06, 'epoch': 7.44}


 94%|█████████▎| 187/200 [12:36<00:50,  3.89s/it]

{'loss': 0.6474, 'grad_norm': 1.1323671340942383, 'learning_rate': 3.2994923857868023e-06, 'epoch': 7.48}


 94%|█████████▍| 188/200 [12:40<00:47,  3.96s/it]

{'loss': 0.8784, 'grad_norm': 1.009611964225769, 'learning_rate': 3.0456852791878177e-06, 'epoch': 7.52}


 94%|█████████▍| 189/200 [12:43<00:42,  3.84s/it]

{'loss': 0.8182, 'grad_norm': 1.2189282178878784, 'learning_rate': 2.7918781725888327e-06, 'epoch': 7.56}


 95%|█████████▌| 190/200 [12:47<00:39,  3.92s/it]

{'loss': 0.8207, 'grad_norm': 1.15474534034729, 'learning_rate': 2.5380710659898476e-06, 'epoch': 7.6}


 96%|█████████▌| 191/200 [12:51<00:35,  3.92s/it]

{'loss': 0.754, 'grad_norm': 1.1696202754974365, 'learning_rate': 2.284263959390863e-06, 'epoch': 7.64}


 96%|█████████▌| 192/200 [12:55<00:30,  3.82s/it]

{'loss': 0.6319, 'grad_norm': 1.1486480236053467, 'learning_rate': 2.030456852791878e-06, 'epoch': 7.68}


 96%|█████████▋| 193/200 [12:59<00:27,  3.88s/it]

{'loss': 0.8956, 'grad_norm': 1.2254345417022705, 'learning_rate': 1.7766497461928936e-06, 'epoch': 7.72}


 97%|█████████▋| 194/200 [13:03<00:24,  4.07s/it]

{'loss': 0.833, 'grad_norm': 1.0129567384719849, 'learning_rate': 1.5228426395939088e-06, 'epoch': 7.76}


 98%|█████████▊| 195/200 [13:07<00:19,  3.86s/it]

{'loss': 0.7521, 'grad_norm': 1.3308749198913574, 'learning_rate': 1.2690355329949238e-06, 'epoch': 7.8}


 98%|█████████▊| 196/200 [13:11<00:15,  3.96s/it]

{'loss': 0.734, 'grad_norm': 1.1837759017944336, 'learning_rate': 1.015228426395939e-06, 'epoch': 7.84}


 98%|█████████▊| 197/200 [13:16<00:12,  4.15s/it]

{'loss': 0.7628, 'grad_norm': 1.021207571029663, 'learning_rate': 7.614213197969544e-07, 'epoch': 7.88}


 99%|█████████▉| 198/200 [13:20<00:08,  4.22s/it]

{'loss': 0.7544, 'grad_norm': 1.23796808719635, 'learning_rate': 5.076142131979695e-07, 'epoch': 7.92}


100%|█████████▉| 199/200 [13:24<00:04,  4.26s/it]

{'loss': 0.8019, 'grad_norm': 1.0618083477020264, 'learning_rate': 2.5380710659898475e-07, 'epoch': 7.96}


100%|██████████| 200/200 [13:28<00:00,  4.05s/it]

{'loss': 0.5283, 'grad_norm': 1.1694695949554443, 'learning_rate': 0.0, 'epoch': 8.0}


100%|██████████| 200/200 [13:29<00:00,  4.05s/it]

{'train_runtime': 809.5172, 'train_samples_per_second': 0.988, 'train_steps_per_second': 0.247, 'train_loss': 1.044141108095646, 'epoch': 8.0}





<a name="Inference"></a>
### Inference
Let's run the model! Unsloth makes inference natively 2x faster as well. Use prompts similar to the ones you fine-tuned on; otherwise you may get poor results.

In [11]:
# Inference demo (currently disabled): every statement in this cell is commented
# out, so running the cell is a no-op. Uncomment all lines to send a single user
# message through the tokenizer's chat template and stream up to 128 newly
# generated tokens to the cell output.
# FastLanguageModel.for_inference(model) # Enable native 2x faster inference
# messages = [                    # Change below!
#     {"role": "user", "content": "I have the following ingredients: Potato (Aloo), Curd (Dahi / Yogurt), Coriander (Dhania) Leaves, Salt, Mustard seeds (Rai/ Kadugu), Curry leaves, Asafoetida (hing), Dry Red Chillies, Green Chilli, Sugar, Oil. Please provide recipes I can make with them."},
# ]
# add_generation_prompt = True asks the chat template to append the assistant
# header so the model continues as the assistant; the tensor is moved to the GPU.
# input_ids = tokenizer.apply_chat_template(
#     messages,
#     add_generation_prompt = True,
#     return_tensors = "pt",
# ).to("cuda")

# TextStreamer prints tokens as they are generated; skip_prompt = True hides the
# echoed prompt. The EOS token id is reused as the padding token id.
# from transformers import TextStreamer
# text_streamer = TextStreamer(tokenizer, skip_prompt = True)
# _ = model.generate(input_ids, streamer = text_streamer, max_new_tokens = 128, pad_token_id = tokenizer.eos_token_id)

Since we created an actual chatbot, you can also hold longer conversations by manually appending alternating user and assistant messages!

In [12]:

# Export the fine-tuned model and its tokenizer as GGUF into the local
# "new_model" directory, using 8-bit (q8_0) quantization.
model.save_pretrained_gguf(
    "new_model",
    tokenizer,
    quantization_method = "q8_0",
    # NOTE(review): presumably the fraction of memory Unsloth may use while
    # merging weights before conversion — confirm against the Unsloth docs.
    maximum_memory_usage = 0.8,
)

Unsloth: ##### The current model auto adds a BOS token.
Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily.


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 17.08 out of 31.26 RAM for saving.


  6%|▋         | 2/32 [00:00<00:01, 17.31it/s]We will save to Disk and not RAM now.
100%|██████████| 32/32 [00:57<00:00,  1.81s/it]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...


In [14]:
# Upload the fine-tuned model to the Hugging Face Hub in GGUF format.
#
# The access token is read from the HF_TOKEN environment variable rather than
# being typed into the notebook, so the secret is never stored in a saved or
# shared .ipynb. Create a token at https://huggingface.co/settings/tokens and
# export HF_TOKEN before starting Jupyter.
# NOTE(review): when HF_TOKEN is unset, None is passed as the token; confirm that
# your Unsloth / huggingface_hub version then falls back to cached login
# credentials (e.g. from `huggingface-cli login`).
import os

hf_token = os.environ.get("HF_TOKEN")

# Change "tjlee0506" to your own Hugging Face username.
model.push_to_hub_gguf("tjlee0506/model", tokenizer, token = hf_token)

# --- Alternative exports (disabled; uncomment the one you need) ---

# Save to 8bit Q8_0 locally
# model.save_pretrained_gguf("model", tokenizer,)

# Save to 16bit GGUF
# model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
# model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = hf_token)

# Save to q4_k_m GGUF
# model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
# model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = hf_token)

# Save to multiple GGUF options - much faster if you want multiple!
# model.push_to_hub_gguf(
#     "hf/model", # Change hf to your username!
#     tokenizer,
#     quantization_method = ["q4_k_m", "q8_0", "q5_k_m",],
#     token = hf_token,
# )

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 18.82 out of 31.26 RAM for saving.


 69%|██████▉   | 22/32 [00:31<00:48,  4.83s/it]