In [None]:
import kagglehub

# Download the latest version of the "rumitpathare/indian-recipes" Kaggle dataset.
# `path` holds whatever kagglehub returns for the download; it is printed so the
# location can be inspected.
path = kagglehub.dataset_download("rumitpathare/indian-recipes")

print("Path to dataset files:", path)

In [None]:
!cp  /home/tj/.cache/kagglehub/datasets/rumitpathare/indian-recipes/versions/22/Food_Recipe.csv ./recipes_data.csv

In [None]:
import pandas as pd
from io import StringIO
from tqdm import tqdm

def save_dataset_with_skip(data_file, output_file):
    """Copy ``data_file`` to ``output_file``, dropping lines pandas cannot parse.

    Every physical line is handed to ``pd.read_csv`` on its own. Lines that
    raise ``ParserError`` (malformed CSV) or ``EmptyDataError`` (blank lines)
    are counted as skipped and left out of the output; accepted lines are
    written back unchanged and in their original order.

    Because lines are checked in isolation, a record whose quoted field spans
    several physical lines is not treated as a single record.

    Args:
        data_file: Path of the UTF-8 CSV file to read.
        output_file: Path of the UTF-8 file that receives the accepted lines.

    Returns:
        None. The number of skipped lines is printed.
    """
    skipped_rows = 0
    valid_rows = []

    with open(data_file, "r", encoding="utf-8") as file:
        # First pass only counts lines so the progress bar has a total.
        total_lines = sum(1 for _ in file)
        file.seek(0)
        for line in tqdm(file, total=total_lines, desc="Processing lines"):
            try:
                # Parsed only to validate the line; the result is not needed.
                pd.read_csv(StringIO(line), header=None)
            except (pd.errors.ParserError, pd.errors.EmptyDataError):
                # EmptyDataError is what pandas raises for a blank line; it is
                # not a ParserError, so it must be listed explicitly.
                skipped_rows += 1
            else:
                valid_rows.append(line)

    print(f"Skipped rows: {skipped_rows}")

    with open(output_file, "w", encoding="utf-8") as out_file:
        out_file.writelines(valid_rows)

# Run the line-level cleaning pass: read the raw copy of the dataset and write
# the lines that parsed cleanly to a second file in the working directory.
data_file = './recipes_data.csv'
output_file = './pure_recipes_data.csv'
save_dataset_with_skip(data_file, output_file)
print(f"Filtered dataset saved to {output_file}")

In [16]:
import pandas as pd
import json
import regex as re

# Load the cleaned data
df = pd.read_csv('pure_recipes_data.csv')

# Tidy up column names (drop spaces and parentheses)
df.rename(columns={
    'prep_time (in mins)': 'prep_time_in_mins',
    'cook_time (in mins)': 'cook_time_in_mins',
}, inplace=True)

# Fill missing values in the two ingredient columns with empty strings
df['ingredients_name'] = df['ingredients_name'].fillna('')
df['ingredients_quantity'] = df['ingredients_quantity'].fillna('')

# Define the allowed-character pattern: Latin-script letters, decimal digits,
# punctuation and whitespace only. The \p{...} classes rely on the third-party
# `regex` module, which this cell imports under the name `re`.
# NOTE(review): \p{P} is Unicode punctuation only; symbol-category characters
# (e.g. '+', '°') are not matched, so rows containing them are filtered out --
# confirm this is intended.
valid_pattern = re.compile(r'^[\p{Latin}\p{Nd}\p{P}\s]*$')

# Check that every text field of a row matches the allowed-character pattern
def is_row_valid(row):
    """Return True when all checked text columns of ``row`` match ``valid_pattern``.

    Each value is converted with ``str()`` before matching, so non-string
    values are validated by their string representation.
    """
    text_columns = (
        'name', 'cuisine', 'course', 'diet',
        'ingredients_name', 'ingredients_quantity', 'instructions',
    )
    return all(re.match(valid_pattern, str(row[field])) for field in text_columns)

# Keep only the rows whose text fields pass validation. The explicit .copy()
# makes the result an independent frame, so the column assignment below does
# not write into a slice of the unfiltered frame (SettingWithCopyWarning).
df = df[df.apply(is_row_valid, axis=1)].copy()

# Build the instruction prompt from each row's ingredient list
df['instruction'] = df['ingredients_name'].apply(
    lambda x: f'I have the following ingredients: {x}. Please provide recipes I can make with them.'
)

# Build the formatted training target (JSON header + recipe text) for one row
def format_output(row):
    """Render one recipe row as the text the model is trained to produce.

    The result starts with a one-line JSON header, followed by the recipe
    name, cuisine, course and diet, a numbered ingredient list and numbered
    instruction sentences.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            'name', 'cuisine', 'course', 'diet', 'ingredients_name',
            'ingredients_quantity' and 'instructions'. The two ingredient
            fields and 'instructions' must be strings.

    Returns:
        str: The formatted multi-line output, ending with a newline.
    """
    # JSON header line
    json_header = json.dumps({"is_recipe_request": True, "recipe_name": row['name']})
    output = f"{json_header}\n"
    output += f"Recipe Name: {row['name']}\n"
    output += f"Cuisine: {row['cuisine']}\n"
    output += f"Course: {row['course']}\n"
    output += f"Diet: {row['diet']}\n"
    output += "Ingredients:\n"

    # Names are comma separated; quantities are separated by two spaces.
    ingredients = row['ingredients_name'].split(',')
    quantities = row['ingredients_quantity'].split('  ')

    # Pair names with quantities by position (zip stops at the shorter list)
    # and drop pairs where either side is blank.
    stripped_pairs = (
        (name.strip(), qty.strip()) for name, qty in zip(ingredients, quantities)
    )
    kept_pairs = [(name, qty) for name, qty in stripped_pairs if name and qty]

    # Number only the kept pairs so the list stays contiguous (1, 2, 3, ...)
    # even when blank entries were dropped in the middle.
    cleaned_ingredients = [
        f"{idx}. {name}: {qty}"
        for idx, (name, qty) in enumerate(kept_pairs, start=1)
    ]

    output += "\n".join(cleaned_ingredients) + "\n"

    # Split the instructions into sentences and number them.
    output += "Instructions:\n"
    # A terminator is '!', '?' or a '.' that is NOT sitting between two digits,
    # so decimal quantities such as "1.5 cups" stay inside one sentence.
    instructions = re.split(r'(?:[!?]|(?<!\d)\.|\.(?!\d))+\s*', row['instructions'])
    instructions = [instr.strip() for instr in instructions if instr.strip()]  # drop empty sentences

    for idx, instruction in enumerate(instructions, start=1):
        output += f"{idx}. {instruction}.\n"

    return output

# Build the output column (one formatted target text per row)
df['output'] = df.apply(format_output, axis=1)

# Initialise the input column as empty strings
df['input'] = ''

# Keep only the columns needed for training
df = df[['instruction', 'input', 'output']]

# Inspect the result
print(df.head())

# Save as CSV (no index column; '"' as quote character, backslash as escape character)
df.to_csv('processed_dataset.csv', index=False, quotechar='"', escapechar="\\")


                                         instruction input  \
0  I have the following ingredients: Tortillas, C...         
1  I have the following ingredients: Cashew nuts,...         
2  I have the following ingredients: Bhindi (Lady...         
4  I have the following ingredients: Rice, Brinja...         
8  I have the following ingredients: Small Brinja...         

                                              output  
0  {"is_recipe_request": true, "recipe_name": "Me...  
1  {"is_recipe_request": true, "recipe_name": "Ol...  
2  {"is_recipe_request": true, "recipe_name": "An...  
4  {"is_recipe_request": true, "recipe_name": "On...  
8  {"is_recipe_request": true, "recipe_name": "Sp...  


* We support Llama, Mistral, Phi-3, Gemma, Yi, DeepSeek, Qwen, TinyLlama, Vicuna, Open Hermes etc
* We support 16bit LoRA or 4bit QLoRA. Both 2x faster.
* `max_seq_length` can be set to anything, since we do automatic RoPE Scaling via [kaiokendev's](https://kaiokendev.github.io/til) method.
* With [PR 26037](https://github.com/huggingface/transformers/pull/26037), we support downloading 4bit models **4x faster**! [Our repo](https://huggingface.co/unsloth) has Llama, Mistral 4bit models.
* [**NEW**] We make Phi-3 Medium / Mini **2x faster**! See our [Phi-3 Medium notebook](https://colab.research.google.com/drive/1hhdhBa1j_hsymiW9m-WzxQtgqTH_NHqi?usp=sharing)

In [1]:
from unsloth import FastLanguageModel
import torch
# Loading settings shared by this cell and by the trainer cell further down
# (max_seq_length is passed to both).
max_seq_length = 1024 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# This list is informational: it is not referenced by the from_pretrained call
# below, which selects the model through `model_name`.
fourbit_models = [
    "unsloth/mistral-7b-v0.3-bnb-4bit",      # New Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",           # Llama-3 15 trillion tokens model 2x faster!
    "unsloth/llama-3-8b-Instruct-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",        # Phi-3 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",             # Gemma 2.2x faster!
] # More models at https://huggingface.co/unsloth

# Load the base model and its tokenizer with the settings defined above.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3-8B-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.9.post4: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA GeForce RTX 3080. Max memory: 9.753 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.27.post2. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [2]:
# Wrap the loaded model with LoRA adapters (rank 16) on the listed attention
# and MLP projection modules. Note that `model` is rebound to the wrapped
# model, so this cell is meant to run once per freshly loaded base model.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.9.post4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


<a name="Data"></a>
### Data Prep
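The original Unsloth template uses the Alpaca dataset from [vicgalle](https://huggingface.co/datasets/vicgalle/alpaca-gpt4), which is a version of 52K of the original [Alpaca dataset](https://crfm.stanford.edu/2023/03/13/alpaca.html) generated from GPT4. In this notebook we instead load `processed_dataset.csv`, the recipe dataset built in the preprocessing cell above, which uses the same `instruction` / `input` / `output` column layout. You can replace this code section with your own data prep.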

In [3]:
from datasets import load_dataset
# Load the CSV written by the preprocessing cell as the 'train' split and
# print its column names as a quick sanity check.
dataset = load_dataset('csv', data_files='processed_dataset.csv', split='train')
print(dataset.column_names)

Generating train split: 6444 examples [00:00, 44345.65 examples/s]

['instruction', 'input', 'output']





One issue is this dataset has multiple columns. For `Ollama` and `llama.cpp` to function like a custom `ChatGPT` Chatbot, we must only have 2 columns - an `instruction` and an `output` column.

In [4]:
from unsloth import to_sharegpt
# Merge the `instruction` and `input` columns into a single prompt and convert
# the rows to ShareGPT-style conversations; `dataset` is rebound to the result.
# NOTE(review): presumably the [[...]] section of merged_prompt is optional and
# dropped when `input` is empty (it always is in this dataset) -- confirm
# against the Unsloth docs.
# NOTE(review): conversation_extension = 3 presumably combines several
# single-turn rows into one multi-turn conversation -- confirm this is wanted
# for independent recipe requests.
dataset = to_sharegpt(
    dataset,
    merged_prompt = "{instruction}[[\nYour input is:\n{input}]]",
    output_column_name = "output",
    conversation_extension = 3, # Select more to handle longer conversations
)

Merging columns: 100%|██████████| 6444/6444 [00:00<00:00, 123027.22 examples/s]
Converting to ShareGPT: 100%|██████████| 6444/6444 [00:00<00:00, 84263.46 examples/s]
Flattening the indices: 100%|██████████| 6444/6444 [00:00<00:00, 177693.67 examples/s]
Flattening the indices: 100%|██████████| 6444/6444 [00:00<00:00, 15358.47 examples/s]
Flattening the indices: 100%|██████████| 6444/6444 [00:00<00:00, 16696.66 examples/s]
Extending conversations: 100%|██████████| 6444/6444 [00:00<00:00, 17733.75 examples/s]


Finally use `standardize_sharegpt` to fix up the dataset!

In [5]:
from unsloth import standardize_sharegpt
# Normalise the ShareGPT-format dataset; `dataset` is rebound to the result.
dataset = standardize_sharegpt(dataset)

Standardizing format: 100%|██████████| 6444/6444 [00:00<00:00, 17036.86 examples/s]


### Customizable Chat Templates

You also need to specify a chat template. Previously, you could use the Alpaca format as shown below.

The issue is the Alpaca format has 3 fields, whilst OpenAI style chatbots must only use 2 fields (instruction and response). That's why we used the `to_sharegpt` function to merge these columns into 1.

In [6]:
# Alpaca-style prompt template. {INPUT} is replaced by the merged instruction
# and {OUTPUT} by the target text, whose first line is the JSON header produced
# by the preprocessing cell.
chat_template = """Below are some instructions that describe some tasks. Start your response with a JSON object, where is_recipe_request is true if the instruction is requesting a recipe (false otherwise), and recipe_name is the extracted recipe name or null if none. Follow this with a plain text answer detailing the response.

### Instruction:
{INPUT}

### Response:
{OUTPUT}
"""



# Apply the template to the dataset; `dataset` is rebound to the result.
# NOTE(review): the template above has no {SYSTEM} placeholder -- confirm how
# Unsloth uses default_system_message in that case.
from unsloth import apply_chat_template
dataset = apply_chat_template(
    dataset,
    tokenizer=tokenizer,
    chat_template=chat_template,
    default_system_message = "Start your response with a JSON object including is_recipe_request and recipe_name. is_recipe_request should be true if the instruction requests a recipe, false otherwise. recipe_name should be the name of the recipe requested or null if none. Follow with a plain text answer detailing the response."
)


Unsloth: We automatically added an EOS token to stop endless generations.
Map: 100%|██████████| 6444/6444 [00:00<00:00, 10849.61 examples/s]


<a name="Train"></a>
### Train the model
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). This run trains for one full epoch (`num_train_epochs = 1`); to speed things up for a quick test, uncomment `max_steps = 60` in the cell below instead. We also support TRL's `DPOTrainer`!

In [7]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
# Configure supervised fine-tuning on the templated dataset (its "text" field).
# With per-device batch size 2 and 4 gradient-accumulation steps, each
# optimizer step covers 8 examples. Precision is bf16 when the GPU supports
# it, otherwise fp16.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 11,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # max_steps = 60,
        num_train_epochs = 1, # For longer training runs!
        learning_rate = 1e-5,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.1,
        lr_scheduler_type = "cosine",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Map (num_proc=11): 100%|██████████| 6444/6444 [00:03<00:00, 1660.13 examples/s]


In [8]:
# Run training; the value returned by train() is kept in `trainer_stats`.
trainer_stats = trainer.train()  

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 6,444 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 805
 "-____-"     Number of trainable parameters = 41,943,040
  0%|          | 1/805 [00:10<2:19:43, 10.43s/it]

{'loss': 1.4383, 'grad_norm': 0.3933122158050537, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.0}


  0%|          | 2/805 [00:19<2:08:29,  9.60s/it]

{'loss': 1.5581, 'grad_norm': 0.3952540159225464, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.0}


  0%|          | 3/805 [00:27<2:01:02,  9.06s/it]

{'loss': 1.5235, 'grad_norm': 0.40099257230758667, 'learning_rate': 6e-06, 'epoch': 0.0}


  0%|          | 4/805 [00:36<1:57:48,  8.82s/it]

{'loss': 1.4274, 'grad_norm': 0.3995467722415924, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.0}


  1%|          | 5/805 [00:44<1:56:05,  8.71s/it]

{'loss': 1.5703, 'grad_norm': 0.4029112458229065, 'learning_rate': 1e-05, 'epoch': 0.01}


  1%|          | 6/805 [00:53<1:55:19,  8.66s/it]

{'loss': 1.5933, 'grad_norm': 0.42358267307281494, 'learning_rate': 9.999961446907354e-06, 'epoch': 0.01}


  1%|          | 7/805 [01:01<1:54:50,  8.63s/it]

{'loss': 1.4859, 'grad_norm': 0.40440064668655396, 'learning_rate': 9.99984578822395e-06, 'epoch': 0.01}


  1%|          | 8/805 [01:10<1:54:40,  8.63s/it]

{'loss': 1.5191, 'grad_norm': 0.41395941376686096, 'learning_rate': 9.999653025733386e-06, 'epoch': 0.01}


  1%|          | 9/805 [01:19<1:54:36,  8.64s/it]

{'loss': 1.5217, 'grad_norm': 0.4001372158527374, 'learning_rate': 9.999383162408303e-06, 'epoch': 0.01}


  1%|          | 10/805 [01:27<1:54:43,  8.66s/it]

{'loss': 1.5145, 'grad_norm': 0.4484250247478485, 'learning_rate': 9.999036202410324e-06, 'epoch': 0.01}


  1%|▏         | 11/805 [01:36<1:54:53,  8.68s/it]

{'loss': 1.5241, 'grad_norm': 0.4515262842178345, 'learning_rate': 9.998612151090004e-06, 'epoch': 0.01}


  1%|▏         | 12/805 [01:45<1:55:00,  8.70s/it]

{'loss': 1.496, 'grad_norm': 0.45434799790382385, 'learning_rate': 9.998111014986735e-06, 'epoch': 0.01}


  2%|▏         | 13/805 [01:54<1:55:08,  8.72s/it]

{'loss': 1.5438, 'grad_norm': 0.44465315341949463, 'learning_rate': 9.997532801828659e-06, 'epoch': 0.02}


  2%|▏         | 14/805 [02:03<1:55:16,  8.74s/it]

{'loss': 1.4677, 'grad_norm': 0.4469817280769348, 'learning_rate': 9.996877520532535e-06, 'epoch': 0.02}


  2%|▏         | 15/805 [02:11<1:55:21,  8.76s/it]

{'loss': 1.4229, 'grad_norm': 0.4639498293399811, 'learning_rate': 9.996145181203616e-06, 'epoch': 0.02}


  2%|▏         | 16/805 [02:20<1:55:37,  8.79s/it]

{'loss': 1.4912, 'grad_norm': 0.43867307901382446, 'learning_rate': 9.995335795135475e-06, 'epoch': 0.02}


  2%|▏         | 17/805 [02:29<1:56:10,  8.85s/it]

{'loss': 1.4818, 'grad_norm': 0.47971078753471375, 'learning_rate': 9.994449374809851e-06, 'epoch': 0.02}


  2%|▏         | 18/805 [02:38<1:56:42,  8.90s/it]

{'loss': 1.4859, 'grad_norm': 0.46925756335258484, 'learning_rate': 9.99348593389644e-06, 'epoch': 0.02}


  2%|▏         | 19/805 [02:47<1:57:16,  8.95s/it]

{'loss': 1.477, 'grad_norm': 0.47653070092201233, 'learning_rate': 9.992445487252692e-06, 'epoch': 0.02}


  2%|▏         | 20/805 [02:56<1:57:37,  8.99s/it]

{'loss': 1.4221, 'grad_norm': 0.4660554528236389, 'learning_rate': 9.99132805092358e-06, 'epoch': 0.02}


  3%|▎         | 21/805 [03:05<1:58:00,  9.03s/it]

{'loss': 1.5402, 'grad_norm': 0.4627685248851776, 'learning_rate': 9.990133642141359e-06, 'epoch': 0.03}


  3%|▎         | 22/805 [03:15<1:58:30,  9.08s/it]

{'loss': 1.3677, 'grad_norm': 0.46608564257621765, 'learning_rate': 9.988862279325287e-06, 'epoch': 0.03}


  3%|▎         | 23/805 [03:24<1:58:51,  9.12s/it]

{'loss': 1.4666, 'grad_norm': 0.45300161838531494, 'learning_rate': 9.987513982081352e-06, 'epoch': 0.03}


  3%|▎         | 24/805 [03:33<1:58:53,  9.13s/it]

{'loss': 1.4075, 'grad_norm': 0.4558853209018707, 'learning_rate': 9.986088771201965e-06, 'epoch': 0.03}


  3%|▎         | 25/805 [03:42<1:58:51,  9.14s/it]

{'loss': 1.4457, 'grad_norm': 0.4421078562736511, 'learning_rate': 9.984586668665641e-06, 'epoch': 0.03}


  3%|▎         | 26/805 [03:51<1:58:29,  9.13s/it]

{'loss': 1.4014, 'grad_norm': 0.4136621654033661, 'learning_rate': 9.983007697636659e-06, 'epoch': 0.03}


  3%|▎         | 27/805 [04:00<1:58:14,  9.12s/it]

{'loss': 1.3957, 'grad_norm': 0.4133664071559906, 'learning_rate': 9.981351882464706e-06, 'epoch': 0.03}


  3%|▎         | 28/805 [04:09<1:57:57,  9.11s/it]

{'loss': 1.3508, 'grad_norm': 0.4096923768520355, 'learning_rate': 9.979619248684503e-06, 'epoch': 0.03}


  4%|▎         | 29/805 [04:19<1:57:37,  9.09s/it]

{'loss': 1.431, 'grad_norm': 0.4176373779773712, 'learning_rate': 9.9778098230154e-06, 'epoch': 0.04}


  4%|▎         | 30/805 [04:28<1:57:11,  9.07s/it]

{'loss': 1.3108, 'grad_norm': 0.40760719776153564, 'learning_rate': 9.975923633360985e-06, 'epoch': 0.04}


  4%|▍         | 31/805 [04:37<1:56:48,  9.06s/it]

{'loss': 1.2601, 'grad_norm': 0.385070264339447, 'learning_rate': 9.973960708808633e-06, 'epoch': 0.04}


  4%|▍         | 32/805 [04:46<1:56:23,  9.03s/it]

{'loss': 1.2977, 'grad_norm': 0.38408565521240234, 'learning_rate': 9.97192107962907e-06, 'epoch': 0.04}


  4%|▍         | 33/805 [04:54<1:55:55,  9.01s/it]

{'loss': 1.381, 'grad_norm': 0.3774954378604889, 'learning_rate': 9.9698047772759e-06, 'epoch': 0.04}


  4%|▍         | 34/805 [05:03<1:55:41,  9.00s/it]

{'loss': 1.3828, 'grad_norm': 0.3854433596134186, 'learning_rate': 9.967611834385122e-06, 'epoch': 0.04}


  4%|▍         | 35/805 [05:12<1:55:13,  8.98s/it]

{'loss': 1.3423, 'grad_norm': 0.3925032913684845, 'learning_rate': 9.965342284774633e-06, 'epoch': 0.04}


  4%|▍         | 36/805 [05:21<1:54:49,  8.96s/it]

{'loss': 1.3344, 'grad_norm': 0.3645857274532318, 'learning_rate': 9.96299616344369e-06, 'epoch': 0.04}


  5%|▍         | 37/805 [05:30<1:54:33,  8.95s/it]

{'loss': 1.3633, 'grad_norm': 0.34703806042671204, 'learning_rate': 9.960573506572391e-06, 'epoch': 0.05}


  5%|▍         | 38/805 [05:39<1:54:18,  8.94s/it]

{'loss': 1.3552, 'grad_norm': 0.3664357662200928, 'learning_rate': 9.958074351521097e-06, 'epoch': 0.05}


  5%|▍         | 39/805 [05:48<1:53:56,  8.92s/it]

{'loss': 1.3783, 'grad_norm': 0.38780438899993896, 'learning_rate': 9.955498736829876e-06, 'epoch': 0.05}


  5%|▍         | 40/805 [05:57<1:53:39,  8.91s/it]

{'loss': 1.4276, 'grad_norm': 0.35273656249046326, 'learning_rate': 9.952846702217886e-06, 'epoch': 0.05}


  5%|▌         | 41/805 [06:06<1:53:24,  8.91s/it]

{'loss': 1.3812, 'grad_norm': 0.3210500180721283, 'learning_rate': 9.95011828858279e-06, 'epoch': 0.05}


  5%|▌         | 42/805 [06:15<1:53:10,  8.90s/it]

{'loss': 1.315, 'grad_norm': 0.35064196586608887, 'learning_rate': 9.947313538000093e-06, 'epoch': 0.05}


  5%|▌         | 43/805 [06:24<1:53:00,  8.90s/it]

{'loss': 1.261, 'grad_norm': 0.3446078300476074, 'learning_rate': 9.944432493722525e-06, 'epoch': 0.05}


  5%|▌         | 44/805 [06:33<1:52:51,  8.90s/it]

{'loss': 1.347, 'grad_norm': 0.3342701196670532, 'learning_rate': 9.941475200179347e-06, 'epoch': 0.05}


  6%|▌         | 45/805 [06:41<1:52:38,  8.89s/it]

{'loss': 1.2667, 'grad_norm': 0.3282211124897003, 'learning_rate': 9.938441702975689e-06, 'epoch': 0.06}


  6%|▌         | 46/805 [06:50<1:52:26,  8.89s/it]

{'loss': 1.2039, 'grad_norm': 0.3421287536621094, 'learning_rate': 9.935332048891828e-06, 'epoch': 0.06}


  6%|▌         | 47/805 [06:59<1:52:15,  8.89s/it]

{'loss': 1.2396, 'grad_norm': 0.341001033782959, 'learning_rate': 9.932146285882478e-06, 'epoch': 0.06}


  6%|▌         | 48/805 [07:08<1:52:12,  8.89s/it]

{'loss': 1.1911, 'grad_norm': 0.31991061568260193, 'learning_rate': 9.928884463076045e-06, 'epoch': 0.06}


  6%|▌         | 49/805 [07:17<1:51:55,  8.88s/it]

{'loss': 1.3081, 'grad_norm': 0.33629110455513, 'learning_rate': 9.92554663077387e-06, 'epoch': 0.06}


  6%|▌         | 50/805 [07:26<1:51:42,  8.88s/it]

{'loss': 1.1884, 'grad_norm': 0.31521332263946533, 'learning_rate': 9.922132840449459e-06, 'epoch': 0.06}


  6%|▋         | 51/805 [07:35<1:51:35,  8.88s/it]

{'loss': 1.3288, 'grad_norm': 0.34199005365371704, 'learning_rate': 9.918643144747681e-06, 'epoch': 0.06}


  6%|▋         | 52/805 [07:44<1:51:31,  8.89s/it]

{'loss': 1.246, 'grad_norm': 0.32387685775756836, 'learning_rate': 9.915077597483959e-06, 'epoch': 0.06}


  7%|▋         | 53/805 [07:52<1:51:22,  8.89s/it]

{'loss': 1.2813, 'grad_norm': 0.3308185040950775, 'learning_rate': 9.911436253643445e-06, 'epoch': 0.07}


  7%|▋         | 54/805 [08:01<1:51:12,  8.89s/it]

{'loss': 1.2072, 'grad_norm': 0.356167733669281, 'learning_rate': 9.907719169380164e-06, 'epoch': 0.07}


  7%|▋         | 55/805 [08:10<1:51:01,  8.88s/it]

{'loss': 1.2475, 'grad_norm': 0.33809152245521545, 'learning_rate': 9.903926402016153e-06, 'epoch': 0.07}


  7%|▋         | 56/805 [08:19<1:50:57,  8.89s/it]

{'loss': 1.1937, 'grad_norm': 0.3394656777381897, 'learning_rate': 9.900058010040578e-06, 'epoch': 0.07}


  7%|▋         | 57/805 [08:28<1:50:45,  8.89s/it]

{'loss': 1.2171, 'grad_norm': 0.3335559666156769, 'learning_rate': 9.89611405310883e-06, 'epoch': 0.07}


  7%|▋         | 58/805 [08:37<1:50:32,  8.88s/it]

{'loss': 1.1588, 'grad_norm': 0.33635464310646057, 'learning_rate': 9.892094592041602e-06, 'epoch': 0.07}


  7%|▋         | 59/805 [08:46<1:50:21,  8.88s/it]

{'loss': 1.1869, 'grad_norm': 0.3532180190086365, 'learning_rate': 9.887999688823955e-06, 'epoch': 0.07}


  7%|▋         | 60/805 [08:55<1:50:13,  8.88s/it]

{'loss': 1.1613, 'grad_norm': 0.36102792620658875, 'learning_rate': 9.883829406604363e-06, 'epoch': 0.07}


  8%|▊         | 61/805 [09:03<1:50:02,  8.87s/it]

{'loss': 1.0971, 'grad_norm': 0.3675729036331177, 'learning_rate': 9.879583809693737e-06, 'epoch': 0.08}


  8%|▊         | 62/805 [09:12<1:49:55,  8.88s/it]

{'loss': 1.2449, 'grad_norm': 0.36996641755104065, 'learning_rate': 9.875262963564436e-06, 'epoch': 0.08}


  8%|▊         | 63/805 [09:21<1:49:44,  8.87s/it]

{'loss': 1.2116, 'grad_norm': 0.34392303228378296, 'learning_rate': 9.870866934849248e-06, 'epoch': 0.08}


  8%|▊         | 64/805 [09:30<1:49:36,  8.88s/it]

{'loss': 1.1684, 'grad_norm': 0.36903977394104004, 'learning_rate': 9.866395791340376e-06, 'epoch': 0.08}


  8%|▊         | 65/805 [09:39<1:49:27,  8.87s/it]

{'loss': 1.1504, 'grad_norm': 0.3746267259120941, 'learning_rate': 9.861849601988384e-06, 'epoch': 0.08}


  8%|▊         | 66/805 [09:48<1:49:16,  8.87s/it]

{'loss': 1.0936, 'grad_norm': 0.3657282888889313, 'learning_rate': 9.857228436901137e-06, 'epoch': 0.08}


  8%|▊         | 67/805 [09:57<1:49:07,  8.87s/it]

{'loss': 1.1528, 'grad_norm': 0.3518938720226288, 'learning_rate': 9.852532367342712e-06, 'epoch': 0.08}


  8%|▊         | 68/805 [10:06<1:49:00,  8.88s/it]

{'loss': 1.0414, 'grad_norm': 0.34791746735572815, 'learning_rate': 9.847761465732319e-06, 'epoch': 0.08}


  9%|▊         | 69/805 [10:14<1:48:51,  8.87s/it]

{'loss': 1.1327, 'grad_norm': 0.3403867781162262, 'learning_rate': 9.842915805643156e-06, 'epoch': 0.09}


  9%|▊         | 70/805 [10:23<1:48:44,  8.88s/it]

{'loss': 1.0721, 'grad_norm': 0.3465312123298645, 'learning_rate': 9.8379954618013e-06, 'epoch': 0.09}


  9%|▉         | 71/805 [10:32<1:48:38,  8.88s/it]

{'loss': 1.1901, 'grad_norm': 0.3485396206378937, 'learning_rate': 9.833000510084537e-06, 'epoch': 0.09}


  9%|▉         | 72/805 [10:41<1:48:28,  8.88s/it]

{'loss': 1.0211, 'grad_norm': 0.3344786763191223, 'learning_rate': 9.827931027521204e-06, 'epoch': 0.09}


  9%|▉         | 73/805 [10:50<1:48:17,  8.88s/it]

{'loss': 1.1054, 'grad_norm': 0.33237534761428833, 'learning_rate': 9.822787092288991e-06, 'epoch': 0.09}


  9%|▉         | 74/805 [10:59<1:48:10,  8.88s/it]

{'loss': 1.0736, 'grad_norm': 0.3518173396587372, 'learning_rate': 9.817568783713744e-06, 'epoch': 0.09}


  9%|▉         | 75/805 [11:08<1:47:59,  8.88s/it]

{'loss': 1.0004, 'grad_norm': 0.3430469036102295, 'learning_rate': 9.812276182268236e-06, 'epoch': 0.09}


  9%|▉         | 76/805 [11:17<1:47:50,  8.88s/it]

{'loss': 1.034, 'grad_norm': 0.3362691402435303, 'learning_rate': 9.806909369570931e-06, 'epoch': 0.09}


 10%|▉         | 77/805 [11:25<1:47:40,  8.87s/it]

{'loss': 1.0183, 'grad_norm': 0.33232825994491577, 'learning_rate': 9.801468428384716e-06, 'epoch': 0.1}


 10%|▉         | 78/805 [11:34<1:47:33,  8.88s/it]

{'loss': 0.9706, 'grad_norm': 0.32060351967811584, 'learning_rate': 9.795953442615637e-06, 'epoch': 0.1}


 10%|▉         | 79/805 [11:43<1:47:24,  8.88s/it]

{'loss': 0.9613, 'grad_norm': 0.33058708906173706, 'learning_rate': 9.790364497311597e-06, 'epoch': 0.1}


 10%|▉         | 80/805 [11:52<1:47:21,  8.88s/it]

{'loss': 1.0082, 'grad_norm': 0.32596251368522644, 'learning_rate': 9.784701678661045e-06, 'epoch': 0.1}


 10%|█         | 81/805 [12:01<1:47:06,  8.88s/it]

{'loss': 1.0519, 'grad_norm': 0.32989242672920227, 'learning_rate': 9.778965073991652e-06, 'epoch': 0.1}


 10%|█         | 82/805 [12:10<1:46:55,  8.87s/it]

{'loss': 0.9511, 'grad_norm': 0.3361673653125763, 'learning_rate': 9.773154771768956e-06, 'epoch': 0.1}


 10%|█         | 83/805 [12:19<1:46:46,  8.87s/it]

{'loss': 0.9584, 'grad_norm': 0.3109228014945984, 'learning_rate': 9.767270861595006e-06, 'epoch': 0.1}


 10%|█         | 84/805 [12:28<1:46:35,  8.87s/it]

{'loss': 0.9371, 'grad_norm': 0.32836979627609253, 'learning_rate': 9.761313434206978e-06, 'epoch': 0.1}


 11%|█         | 85/805 [12:36<1:46:26,  8.87s/it]

{'loss': 1.0054, 'grad_norm': 0.30907678604125977, 'learning_rate': 9.755282581475769e-06, 'epoch': 0.11}


 11%|█         | 86/805 [12:45<1:46:16,  8.87s/it]

{'loss': 1.0186, 'grad_norm': 0.3156019449234009, 'learning_rate': 9.749178396404588e-06, 'epoch': 0.11}


 11%|█         | 87/805 [12:54<1:46:08,  8.87s/it]

{'loss': 1.0205, 'grad_norm': 0.31444504857063293, 'learning_rate': 9.743000973127523e-06, 'epoch': 0.11}


 11%|█         | 88/805 [13:03<1:45:57,  8.87s/it]

{'loss': 1.0517, 'grad_norm': 0.31299662590026855, 'learning_rate': 9.736750406908082e-06, 'epoch': 0.11}


 11%|█         | 89/805 [13:12<1:45:48,  8.87s/it]

{'loss': 1.0288, 'grad_norm': 0.29927152395248413, 'learning_rate': 9.730426794137727e-06, 'epoch': 0.11}


 11%|█         | 90/805 [13:21<1:45:41,  8.87s/it]

{'loss': 0.9873, 'grad_norm': 0.2987125515937805, 'learning_rate': 9.72403023233439e-06, 'epoch': 0.11}


 11%|█▏        | 91/805 [13:30<1:45:38,  8.88s/it]

{'loss': 0.9612, 'grad_norm': 0.29099276661872864, 'learning_rate': 9.717560820140968e-06, 'epoch': 0.11}


 11%|█▏        | 92/805 [13:39<1:45:28,  8.88s/it]

{'loss': 0.9661, 'grad_norm': 0.28436946868896484, 'learning_rate': 9.7110186573238e-06, 'epoch': 0.11}


 12%|█▏        | 93/805 [13:47<1:45:22,  8.88s/it]

{'loss': 0.9759, 'grad_norm': 0.29402661323547363, 'learning_rate': 9.704403844771128e-06, 'epoch': 0.12}


 12%|█▏        | 94/805 [13:56<1:45:14,  8.88s/it]

{'loss': 0.9764, 'grad_norm': 0.31714579463005066, 'learning_rate': 9.697716484491545e-06, 'epoch': 0.12}


 12%|█▏        | 95/805 [14:05<1:45:02,  8.88s/it]

{'loss': 0.9834, 'grad_norm': 0.29311996698379517, 'learning_rate': 9.690956679612422e-06, 'epoch': 0.12}


 12%|█▏        | 96/805 [14:14<1:44:50,  8.87s/it]

{'loss': 1.0284, 'grad_norm': 0.2924027740955353, 'learning_rate': 9.684124534378307e-06, 'epoch': 0.12}


 12%|█▏        | 97/805 [14:23<1:44:49,  8.88s/it]

{'loss': 0.9497, 'grad_norm': 0.282803475856781, 'learning_rate': 9.677220154149338e-06, 'epoch': 0.12}


 12%|█▏        | 98/805 [14:32<1:44:42,  8.89s/it]

{'loss': 0.8921, 'grad_norm': 0.27728506922721863, 'learning_rate': 9.670243645399594e-06, 'epoch': 0.12}


 12%|█▏        | 99/805 [14:41<1:44:32,  8.88s/it]

{'loss': 1.0237, 'grad_norm': 0.28437817096710205, 'learning_rate': 9.663195115715472e-06, 'epoch': 0.12}


 12%|█▏        | 100/805 [14:50<1:44:24,  8.89s/it]

{'loss': 1.0134, 'grad_norm': 0.289485901594162, 'learning_rate': 9.656074673794018e-06, 'epoch': 0.12}


 13%|█▎        | 101/805 [14:59<1:44:16,  8.89s/it]

{'loss': 0.9511, 'grad_norm': 0.2926463484764099, 'learning_rate': 9.648882429441258e-06, 'epoch': 0.13}


 13%|█▎        | 102/805 [15:07<1:44:03,  8.88s/it]

{'loss': 0.9637, 'grad_norm': 0.27197855710983276, 'learning_rate': 9.641618493570495e-06, 'epoch': 0.13}


 13%|█▎        | 103/805 [15:16<1:43:51,  8.88s/it]

{'loss': 0.906, 'grad_norm': 0.281497985124588, 'learning_rate': 9.634282978200605e-06, 'epoch': 0.13}


 13%|█▎        | 104/805 [15:25<1:43:44,  8.88s/it]

{'loss': 0.9178, 'grad_norm': 0.28153663873672485, 'learning_rate': 9.626875996454312e-06, 'epoch': 0.13}


 13%|█▎        | 105/805 [15:34<1:43:38,  8.88s/it]

{'loss': 0.993, 'grad_norm': 0.28137075901031494, 'learning_rate': 9.619397662556434e-06, 'epoch': 0.13}


 13%|█▎        | 106/805 [15:43<1:43:26,  8.88s/it]

{'loss': 0.9177, 'grad_norm': 0.2767413556575775, 'learning_rate': 9.611848091832134e-06, 'epoch': 0.13}


 13%|█▎        | 107/805 [15:52<1:43:14,  8.87s/it]

{'loss': 0.9972, 'grad_norm': 0.28624776005744934, 'learning_rate': 9.604227400705134e-06, 'epoch': 0.13}


 13%|█▎        | 108/805 [16:01<1:43:05,  8.87s/it]

{'loss': 0.9215, 'grad_norm': 0.27948760986328125, 'learning_rate': 9.596535706695911e-06, 'epoch': 0.13}


 14%|█▎        | 109/805 [16:10<1:42:54,  8.87s/it]

{'loss': 0.9919, 'grad_norm': 0.28485774993896484, 'learning_rate': 9.588773128419907e-06, 'epoch': 0.14}


 14%|█▎        | 110/805 [16:18<1:42:45,  8.87s/it]

{'loss': 0.8061, 'grad_norm': 0.27086758613586426, 'learning_rate': 9.58093978558568e-06, 'epoch': 0.14}


 14%|█▍        | 111/805 [16:27<1:42:40,  8.88s/it]

{'loss': 0.912, 'grad_norm': 0.27775105834007263, 'learning_rate': 9.57303579899307e-06, 'epoch': 0.14}


 14%|█▍        | 112/805 [16:36<1:42:32,  8.88s/it]

{'loss': 0.8944, 'grad_norm': 0.27960044145584106, 'learning_rate': 9.565061290531323e-06, 'epoch': 0.14}


 14%|█▍        | 113/805 [16:45<1:42:20,  8.87s/it]

{'loss': 0.9791, 'grad_norm': 0.3072962462902069, 'learning_rate': 9.557016383177226e-06, 'epoch': 0.14}


 14%|█▍        | 114/805 [16:54<1:42:10,  8.87s/it]

{'loss': 0.9522, 'grad_norm': 0.2799220085144043, 'learning_rate': 9.548901200993206e-06, 'epoch': 0.14}


 14%|█▍        | 115/805 [17:03<1:42:03,  8.87s/it]

{'loss': 0.9302, 'grad_norm': 0.28078731894493103, 'learning_rate': 9.540715869125407e-06, 'epoch': 0.14}


 14%|█▍        | 116/805 [17:12<1:41:53,  8.87s/it]

{'loss': 0.9055, 'grad_norm': 0.3025161325931549, 'learning_rate': 9.532460513801774e-06, 'epoch': 0.14}


 15%|█▍        | 117/805 [17:21<1:41:42,  8.87s/it]

{'loss': 0.9299, 'grad_norm': 0.33871352672576904, 'learning_rate': 9.524135262330098e-06, 'epoch': 0.15}


 15%|█▍        | 118/805 [17:29<1:41:32,  8.87s/it]

{'loss': 0.8908, 'grad_norm': 0.31255432963371277, 'learning_rate': 9.515740243096056e-06, 'epoch': 0.15}


 15%|█▍        | 119/805 [17:38<1:41:22,  8.87s/it]

{'loss': 0.9457, 'grad_norm': 0.3170180022716522, 'learning_rate': 9.507275585561228e-06, 'epoch': 0.15}


 15%|█▍        | 120/805 [17:47<1:41:14,  8.87s/it]

{'loss': 1.0127, 'grad_norm': 0.28975027799606323, 'learning_rate': 9.498741420261109e-06, 'epoch': 0.15}


 15%|█▌        | 121/805 [17:56<1:41:06,  8.87s/it]

{'loss': 0.8828, 'grad_norm': 0.2930803596973419, 'learning_rate': 9.490137878803078e-06, 'epoch': 0.15}


 15%|█▌        | 122/805 [18:05<1:41:03,  8.88s/it]

{'loss': 0.8541, 'grad_norm': 0.29009872674942017, 'learning_rate': 9.481465093864395e-06, 'epoch': 0.15}


 15%|█▌        | 123/805 [18:14<1:40:56,  8.88s/it]

{'loss': 0.8875, 'grad_norm': 0.29694393277168274, 'learning_rate': 9.472723199190126e-06, 'epoch': 0.15}


 15%|█▌        | 124/805 [18:23<1:40:45,  8.88s/it]

{'loss': 0.8938, 'grad_norm': 0.32097771763801575, 'learning_rate': 9.463912329591105e-06, 'epoch': 0.15}


 16%|█▌        | 125/805 [18:32<1:40:37,  8.88s/it]

{'loss': 0.9373, 'grad_norm': 0.2890856862068176, 'learning_rate': 9.45503262094184e-06, 'epoch': 0.16}


 16%|█▌        | 126/805 [18:40<1:40:28,  8.88s/it]

{'loss': 0.9366, 'grad_norm': 0.2964172959327698, 'learning_rate': 9.446084210178423e-06, 'epoch': 0.16}


 16%|█▌        | 127/805 [18:49<1:40:16,  8.87s/it]

{'loss': 0.8763, 'grad_norm': 0.28029292821884155, 'learning_rate': 9.437067235296418e-06, 'epoch': 0.16}


 16%|█▌        | 128/805 [18:58<1:40:08,  8.87s/it]

{'loss': 0.9225, 'grad_norm': 0.29225173592567444, 'learning_rate': 9.427981835348729e-06, 'epoch': 0.16}


 16%|█▌        | 129/805 [19:07<1:39:59,  8.87s/it]

{'loss': 0.9099, 'grad_norm': 0.2963794469833374, 'learning_rate': 9.418828150443469e-06, 'epoch': 0.16}


 16%|█▌        | 130/805 [19:16<1:39:50,  8.88s/it]

{'loss': 0.8944, 'grad_norm': 0.2875809073448181, 'learning_rate': 9.409606321741776e-06, 'epoch': 0.16}


 16%|█▋        | 131/805 [19:25<1:39:41,  8.87s/it]

{'loss': 0.814, 'grad_norm': 0.30657750368118286, 'learning_rate': 9.40031649145566e-06, 'epoch': 0.16}


 16%|█▋        | 132/805 [19:34<1:39:37,  8.88s/it]

{'loss': 0.8484, 'grad_norm': 0.29785317182540894, 'learning_rate': 9.390958802845797e-06, 'epoch': 0.16}


 17%|█▋        | 133/805 [19:43<1:39:26,  8.88s/it]

{'loss': 0.8877, 'grad_norm': 0.3077985346317291, 'learning_rate': 9.381533400219319e-06, 'epoch': 0.17}


 17%|█▋        | 134/805 [19:51<1:39:14,  8.87s/it]

{'loss': 0.9546, 'grad_norm': 0.3049342930316925, 'learning_rate': 9.372040428927595e-06, 'epoch': 0.17}


 17%|█▋        | 135/805 [20:00<1:39:09,  8.88s/it]

{'loss': 0.9681, 'grad_norm': 0.2965087890625, 'learning_rate': 9.362480035363987e-06, 'epoch': 0.17}


 17%|█▋        | 136/805 [20:09<1:39:06,  8.89s/it]

{'loss': 0.879, 'grad_norm': 0.28508228063583374, 'learning_rate': 9.352852366961588e-06, 'epoch': 0.17}


 17%|█▋        | 137/805 [20:18<1:38:51,  8.88s/it]

{'loss': 0.8441, 'grad_norm': 0.3067172169685364, 'learning_rate': 9.343157572190957e-06, 'epoch': 0.17}


 17%|█▋        | 138/805 [20:27<1:38:40,  8.88s/it]

{'loss': 0.8954, 'grad_norm': 0.30183112621307373, 'learning_rate': 9.33339580055782e-06, 'epoch': 0.17}


 17%|█▋        | 139/805 [20:36<1:38:32,  8.88s/it]

{'loss': 0.9307, 'grad_norm': 0.27347660064697266, 'learning_rate': 9.323567202600777e-06, 'epoch': 0.17}


 17%|█▋        | 140/805 [20:45<1:38:23,  8.88s/it]

{'loss': 0.8737, 'grad_norm': 0.3092142343521118, 'learning_rate': 9.31367192988896e-06, 'epoch': 0.17}


 18%|█▊        | 141/805 [20:54<1:38:14,  8.88s/it]

{'loss': 0.9981, 'grad_norm': 0.30985888838768005, 'learning_rate': 9.30371013501972e-06, 'epoch': 0.18}


 18%|█▊        | 142/805 [21:02<1:38:05,  8.88s/it]

{'loss': 0.8786, 'grad_norm': 0.2995150089263916, 'learning_rate': 9.293681971616252e-06, 'epoch': 0.18}


 18%|█▊        | 143/805 [21:11<1:37:55,  8.88s/it]

{'loss': 0.8254, 'grad_norm': 0.31888502836227417, 'learning_rate': 9.28358759432525e-06, 'epoch': 0.18}


 18%|█▊        | 144/805 [21:20<1:37:47,  8.88s/it]

{'loss': 0.7956, 'grad_norm': 0.3158355951309204, 'learning_rate': 9.27342715881449e-06, 'epoch': 0.18}


 18%|█▊        | 145/805 [21:29<1:37:34,  8.87s/it]

{'loss': 0.8335, 'grad_norm': 0.28262051939964294, 'learning_rate': 9.263200821770462e-06, 'epoch': 0.18}


 18%|█▊        | 146/805 [21:38<1:37:24,  8.87s/it]

{'loss': 0.8452, 'grad_norm': 0.3096011281013489, 'learning_rate': 9.252908740895932e-06, 'epoch': 0.18}


 18%|█▊        | 147/805 [21:47<1:37:14,  8.87s/it]

{'loss': 0.8704, 'grad_norm': 0.298112154006958, 'learning_rate': 9.242551074907519e-06, 'epoch': 0.18}


 18%|█▊        | 148/805 [21:56<1:37:06,  8.87s/it]

{'loss': 0.799, 'grad_norm': 0.3071165382862091, 'learning_rate': 9.232127983533247e-06, 'epoch': 0.18}


 19%|█▊        | 149/805 [22:05<1:36:59,  8.87s/it]

{'loss': 0.8158, 'grad_norm': 0.29592829942703247, 'learning_rate': 9.221639627510076e-06, 'epoch': 0.18}


 19%|█▊        | 150/805 [22:13<1:36:55,  8.88s/it]

{'loss': 0.8648, 'grad_norm': 0.31752917170524597, 'learning_rate': 9.211086168581433e-06, 'epoch': 0.19}


 19%|█▉        | 151/805 [22:22<1:36:44,  8.88s/it]

{'loss': 0.8499, 'grad_norm': 0.3247006833553314, 'learning_rate': 9.20046776949471e-06, 'epoch': 0.19}


 19%|█▉        | 152/805 [22:31<1:36:42,  8.89s/it]

{'loss': 0.8918, 'grad_norm': 0.30511221289634705, 'learning_rate': 9.189784593998757e-06, 'epoch': 0.19}


 19%|█▉        | 153/805 [22:40<1:36:29,  8.88s/it]

{'loss': 0.8276, 'grad_norm': 0.3308958411216736, 'learning_rate': 9.179036806841352e-06, 'epoch': 0.19}


 19%|█▉        | 154/805 [22:49<1:36:17,  8.87s/it]

{'loss': 0.9325, 'grad_norm': 0.3411242961883545, 'learning_rate': 9.168224573766673e-06, 'epoch': 0.19}


 19%|█▉        | 155/805 [22:58<1:36:07,  8.87s/it]

{'loss': 0.88, 'grad_norm': 0.3025691509246826, 'learning_rate': 9.157348061512728e-06, 'epoch': 0.19}


 19%|█▉        | 156/805 [23:07<1:35:58,  8.87s/it]

{'loss': 0.7877, 'grad_norm': 0.3401278853416443, 'learning_rate': 9.14640743780879e-06, 'epoch': 0.19}


 20%|█▉        | 157/805 [23:16<1:35:51,  8.88s/it]

{'loss': 0.8269, 'grad_norm': 0.31872162222862244, 'learning_rate': 9.13540287137281e-06, 'epoch': 0.19}


 20%|█▉        | 158/805 [23:24<1:35:42,  8.88s/it]

{'loss': 0.8414, 'grad_norm': 0.35241565108299255, 'learning_rate': 9.124334531908818e-06, 'epoch': 0.2}


 20%|█▉        | 159/805 [23:33<1:35:34,  8.88s/it]

{'loss': 0.821, 'grad_norm': 0.30435439944267273, 'learning_rate': 9.1132025901043e-06, 'epoch': 0.2}


 20%|█▉        | 160/805 [23:42<1:35:24,  8.88s/it]

{'loss': 0.802, 'grad_norm': 0.33573517203330994, 'learning_rate': 9.102007217627568e-06, 'epoch': 0.2}


 20%|██        | 161/805 [23:51<1:35:14,  8.87s/it]

{'loss': 0.8092, 'grad_norm': 0.3068546652793884, 'learning_rate': 9.090748587125118e-06, 'epoch': 0.2}


 20%|██        | 162/805 [24:00<1:35:03,  8.87s/it]

{'loss': 0.8346, 'grad_norm': 0.3178582787513733, 'learning_rate': 9.079426872218958e-06, 'epoch': 0.2}


 20%|██        | 163/805 [24:09<1:34:54,  8.87s/it]

{'loss': 0.828, 'grad_norm': 0.2997182309627533, 'learning_rate': 9.068042247503937e-06, 'epoch': 0.2}


 20%|██        | 164/805 [24:18<1:34:45,  8.87s/it]

{'loss': 0.8603, 'grad_norm': 0.3344646692276001, 'learning_rate': 9.05659488854505e-06, 'epoch': 0.2}


 20%|██        | 165/805 [24:27<1:34:38,  8.87s/it]

{'loss': 0.9304, 'grad_norm': 0.32645779848098755, 'learning_rate': 9.045084971874738e-06, 'epoch': 0.2}


 21%|██        | 166/805 [24:35<1:34:29,  8.87s/it]

{'loss': 0.7933, 'grad_norm': 0.32002976536750793, 'learning_rate': 9.033512674990151e-06, 'epoch': 0.21}


 21%|██        | 167/805 [24:44<1:34:26,  8.88s/it]

{'loss': 0.8317, 'grad_norm': 0.35292142629623413, 'learning_rate': 9.021878176350422e-06, 'epoch': 0.21}


 21%|██        | 168/805 [24:53<1:34:15,  8.88s/it]

{'loss': 0.8425, 'grad_norm': 0.35227951407432556, 'learning_rate': 9.010181655373918e-06, 'epoch': 0.21}


 21%|██        | 169/805 [25:02<1:34:06,  8.88s/it]

{'loss': 0.8716, 'grad_norm': 0.30378279089927673, 'learning_rate': 8.998423292435455e-06, 'epoch': 0.21}


 21%|██        | 170/805 [25:11<1:33:57,  8.88s/it]

{'loss': 0.8863, 'grad_norm': 0.3165734112262726, 'learning_rate': 8.986603268863536e-06, 'epoch': 0.21}


 21%|██        | 171/805 [25:20<1:33:47,  8.88s/it]

{'loss': 0.9102, 'grad_norm': 0.30890628695487976, 'learning_rate': 8.97472176693755e-06, 'epoch': 0.21}


 21%|██▏       | 172/805 [25:29<1:33:37,  8.87s/it]

{'loss': 0.8363, 'grad_norm': 0.31929418444633484, 'learning_rate': 8.962778969884956e-06, 'epoch': 0.21}


 21%|██▏       | 173/805 [25:38<1:33:31,  8.88s/it]

{'loss': 0.8754, 'grad_norm': 0.3173995912075043, 'learning_rate': 8.950775061878453e-06, 'epoch': 0.21}


 22%|██▏       | 174/805 [25:46<1:33:29,  8.89s/it]

{'loss': 0.8131, 'grad_norm': 0.33770257234573364, 'learning_rate': 8.938710228033155e-06, 'epoch': 0.22}


 22%|██▏       | 175/805 [25:55<1:33:25,  8.90s/it]

{'loss': 0.8181, 'grad_norm': 0.3594600558280945, 'learning_rate': 8.926584654403725e-06, 'epoch': 0.22}


 22%|██▏       | 176/805 [26:04<1:33:10,  8.89s/it]

{'loss': 0.7261, 'grad_norm': 0.3338801860809326, 'learning_rate': 8.91439852798151e-06, 'epoch': 0.22}


 22%|██▏       | 177/805 [26:13<1:32:57,  8.88s/it]

{'loss': 0.839, 'grad_norm': 0.3404703438282013, 'learning_rate': 8.902152036691649e-06, 'epoch': 0.22}


 22%|██▏       | 178/805 [26:22<1:32:46,  8.88s/it]

{'loss': 0.8772, 'grad_norm': 0.3303261995315552, 'learning_rate': 8.889845369390193e-06, 'epoch': 0.22}


 22%|██▏       | 179/805 [26:31<1:32:41,  8.88s/it]

{'loss': 0.9078, 'grad_norm': 0.3369944989681244, 'learning_rate': 8.877478715861173e-06, 'epoch': 0.22}


 22%|██▏       | 180/805 [26:40<1:32:29,  8.88s/it]

{'loss': 0.8742, 'grad_norm': 0.3124731481075287, 'learning_rate': 8.865052266813686e-06, 'epoch': 0.22}


 22%|██▏       | 181/805 [26:49<1:32:17,  8.87s/it]

{'loss': 0.7034, 'grad_norm': 0.3128332495689392, 'learning_rate': 8.852566213878947e-06, 'epoch': 0.22}


 23%|██▎       | 182/805 [26:58<1:32:09,  8.88s/it]

{'loss': 0.8095, 'grad_norm': 0.3420247435569763, 'learning_rate': 8.84002074960734e-06, 'epoch': 0.23}


 23%|██▎       | 183/805 [27:06<1:32:00,  8.87s/it]

{'loss': 0.7552, 'grad_norm': 0.31096693873405457, 'learning_rate': 8.827416067465442e-06, 'epoch': 0.23}


 23%|██▎       | 184/805 [27:15<1:31:49,  8.87s/it]

{'loss': 0.8396, 'grad_norm': 0.31788212060928345, 'learning_rate': 8.814752361833045e-06, 'epoch': 0.23}


 23%|██▎       | 185/805 [27:24<1:31:45,  8.88s/it]

{'loss': 0.8173, 'grad_norm': 0.3103821575641632, 'learning_rate': 8.802029828000157e-06, 'epoch': 0.23}


 23%|██▎       | 186/805 [27:33<1:31:38,  8.88s/it]

{'loss': 0.8624, 'grad_norm': 0.3380046486854553, 'learning_rate': 8.789248662163985e-06, 'epoch': 0.23}


 23%|██▎       | 187/805 [27:42<1:31:33,  8.89s/it]

{'loss': 0.8355, 'grad_norm': 0.3328985571861267, 'learning_rate': 8.776409061425919e-06, 'epoch': 0.23}


 23%|██▎       | 188/805 [27:51<1:31:24,  8.89s/it]

{'loss': 0.7746, 'grad_norm': 0.320015549659729, 'learning_rate': 8.763511223788485e-06, 'epoch': 0.23}


 23%|██▎       | 189/805 [28:00<1:31:20,  8.90s/it]

{'loss': 0.8401, 'grad_norm': 0.31904178857803345, 'learning_rate': 8.750555348152299e-06, 'epoch': 0.23}


 24%|██▎       | 190/805 [28:09<1:31:09,  8.89s/it]

{'loss': 0.8587, 'grad_norm': 0.3176046311855316, 'learning_rate': 8.737541634312985e-06, 'epoch': 0.24}


 24%|██▎       | 191/805 [28:18<1:30:58,  8.89s/it]

{'loss': 0.8362, 'grad_norm': 0.33197858929634094, 'learning_rate': 8.72447028295811e-06, 'epoch': 0.24}


 24%|██▍       | 192/805 [28:26<1:30:48,  8.89s/it]

{'loss': 0.8378, 'grad_norm': 0.32390815019607544, 'learning_rate': 8.711341495664085e-06, 'epoch': 0.24}


 24%|██▍       | 193/805 [28:35<1:30:40,  8.89s/it]

{'loss': 0.8258, 'grad_norm': 0.32921943068504333, 'learning_rate': 8.69815547489305e-06, 'epoch': 0.24}


 24%|██▍       | 194/805 [28:44<1:30:31,  8.89s/it]

{'loss': 0.8448, 'grad_norm': 0.36048513650894165, 'learning_rate': 8.684912423989755e-06, 'epoch': 0.24}


 24%|██▍       | 195/805 [28:53<1:30:19,  8.88s/it]

{'loss': 0.7462, 'grad_norm': 0.35717713832855225, 'learning_rate': 8.671612547178428e-06, 'epoch': 0.24}


 24%|██▍       | 196/805 [29:02<1:30:07,  8.88s/it]

{'loss': 0.7606, 'grad_norm': 0.33736157417297363, 'learning_rate': 8.658256049559624e-06, 'epoch': 0.24}


 24%|██▍       | 197/805 [29:11<1:29:56,  8.88s/it]

{'loss': 0.7583, 'grad_norm': 0.34046444296836853, 'learning_rate': 8.644843137107058e-06, 'epoch': 0.24}


 25%|██▍       | 198/805 [29:20<1:29:45,  8.87s/it]

{'loss': 0.8443, 'grad_norm': 0.32521745562553406, 'learning_rate': 8.631374016664434e-06, 'epoch': 0.25}


 25%|██▍       | 199/805 [29:29<1:29:35,  8.87s/it]

{'loss': 0.8338, 'grad_norm': 0.33014044165611267, 'learning_rate': 8.617848895942246e-06, 'epoch': 0.25}


 25%|██▍       | 200/805 [29:37<1:29:28,  8.87s/it]

{'loss': 0.8968, 'grad_norm': 0.34919026494026184, 'learning_rate': 8.604267983514595e-06, 'epoch': 0.25}


 25%|██▍       | 201/805 [29:46<1:29:23,  8.88s/it]

{'loss': 0.8543, 'grad_norm': 0.35191401839256287, 'learning_rate': 8.590631488815945e-06, 'epoch': 0.25}


 25%|██▌       | 202/805 [29:55<1:29:12,  8.88s/it]

{'loss': 0.7856, 'grad_norm': 0.3247976303100586, 'learning_rate': 8.576939622137915e-06, 'epoch': 0.25}


 25%|██▌       | 203/805 [30:04<1:29:05,  8.88s/it]

{'loss': 0.8524, 'grad_norm': 0.32964587211608887, 'learning_rate': 8.563192594626027e-06, 'epoch': 0.25}


 25%|██▌       | 204/805 [30:13<1:28:55,  8.88s/it]

{'loss': 0.7873, 'grad_norm': 0.3321632444858551, 'learning_rate': 8.549390618276452e-06, 'epoch': 0.25}


 25%|██▌       | 205/805 [30:22<1:28:55,  8.89s/it]

{'loss': 0.9044, 'grad_norm': 0.36140871047973633, 'learning_rate': 8.535533905932739e-06, 'epoch': 0.25}


 26%|██▌       | 206/805 [30:31<1:28:42,  8.89s/it]

{'loss': 0.7734, 'grad_norm': 0.41536402702331543, 'learning_rate': 8.521622671282532e-06, 'epoch': 0.26}


 26%|██▌       | 207/805 [30:40<1:28:30,  8.88s/it]

{'loss': 0.8104, 'grad_norm': 0.3472338914871216, 'learning_rate': 8.50765712885428e-06, 'epoch': 0.26}


 26%|██▌       | 208/805 [30:48<1:28:26,  8.89s/it]

{'loss': 0.8095, 'grad_norm': 0.33554312586784363, 'learning_rate': 8.493637494013922e-06, 'epoch': 0.26}


 26%|██▌       | 209/805 [30:57<1:28:16,  8.89s/it]

{'loss': 0.8403, 'grad_norm': 0.3611491322517395, 'learning_rate': 8.479563982961572e-06, 'epoch': 0.26}


 26%|██▌       | 210/805 [31:06<1:28:03,  8.88s/it]

{'loss': 0.7596, 'grad_norm': 0.334650456905365, 'learning_rate': 8.465436812728181e-06, 'epoch': 0.26}


 26%|██▌       | 211/805 [31:15<1:28:00,  8.89s/it]

{'loss': 0.8426, 'grad_norm': 0.3395475447177887, 'learning_rate': 8.451256201172186e-06, 'epoch': 0.26}


 26%|██▋       | 212/805 [31:24<1:27:50,  8.89s/it]

{'loss': 0.8305, 'grad_norm': 0.32186535000801086, 'learning_rate': 8.437022366976165e-06, 'epoch': 0.26}


 26%|██▋       | 213/805 [31:33<1:27:38,  8.88s/it]

{'loss': 0.8412, 'grad_norm': 0.403709352016449, 'learning_rate': 8.422735529643445e-06, 'epoch': 0.26}


 27%|██▋       | 214/805 [31:42<1:27:30,  8.88s/it]

{'loss': 0.7435, 'grad_norm': 0.35983288288116455, 'learning_rate': 8.408395909494733e-06, 'epoch': 0.27}


 27%|██▋       | 215/805 [31:51<1:27:26,  8.89s/it]

{'loss': 0.8593, 'grad_norm': 0.3624710738658905, 'learning_rate': 8.39400372766471e-06, 'epoch': 0.27}


 27%|██▋       | 216/805 [32:00<1:27:16,  8.89s/it]

{'loss': 0.9066, 'grad_norm': 0.36303552985191345, 'learning_rate': 8.379559206098625e-06, 'epoch': 0.27}


 27%|██▋       | 217/805 [32:08<1:27:04,  8.89s/it]

{'loss': 0.7923, 'grad_norm': 0.3767145872116089, 'learning_rate': 8.365062567548868e-06, 'epoch': 0.27}


 27%|██▋       | 218/805 [32:17<1:26:55,  8.88s/it]

{'loss': 0.7517, 'grad_norm': 0.3839329183101654, 'learning_rate': 8.350514035571539e-06, 'epoch': 0.27}


 27%|██▋       | 219/805 [32:26<1:26:49,  8.89s/it]

{'loss': 0.7942, 'grad_norm': 0.36133506894111633, 'learning_rate': 8.335913834522999e-06, 'epoch': 0.27}


 27%|██▋       | 220/805 [32:35<1:26:39,  8.89s/it]

{'loss': 0.7894, 'grad_norm': 0.3468318283557892, 'learning_rate': 8.32126218955641e-06, 'epoch': 0.27}


 27%|██▋       | 221/805 [32:44<1:26:30,  8.89s/it]

{'loss': 0.852, 'grad_norm': 0.34917259216308594, 'learning_rate': 8.30655932661826e-06, 'epoch': 0.27}


 28%|██▊       | 222/805 [32:53<1:26:23,  8.89s/it]

{'loss': 0.9091, 'grad_norm': 0.37937700748443604, 'learning_rate': 8.291805472444887e-06, 'epoch': 0.28}


 28%|██▊       | 223/805 [33:02<1:26:12,  8.89s/it]

{'loss': 0.8416, 'grad_norm': 0.35829517245292664, 'learning_rate': 8.27700085455897e-06, 'epoch': 0.28}


 28%|██▊       | 224/805 [33:11<1:26:01,  8.88s/it]

{'loss': 0.8246, 'grad_norm': 0.3531884551048279, 'learning_rate': 8.262145701266034e-06, 'epoch': 0.28}


 28%|██▊       | 225/805 [33:20<1:25:50,  8.88s/it]

{'loss': 0.8609, 'grad_norm': 0.34107744693756104, 'learning_rate': 8.247240241650918e-06, 'epoch': 0.28}


 28%|██▊       | 226/805 [33:28<1:25:41,  8.88s/it]

{'loss': 0.7973, 'grad_norm': 0.3586389124393463, 'learning_rate': 8.232284705574251e-06, 'epoch': 0.28}


 28%|██▊       | 227/805 [33:37<1:25:32,  8.88s/it]

{'loss': 0.7963, 'grad_norm': 0.35546305775642395, 'learning_rate': 8.217279323668895e-06, 'epoch': 0.28}


 28%|██▊       | 228/805 [33:46<1:25:23,  8.88s/it]

{'loss': 0.855, 'grad_norm': 0.3639383316040039, 'learning_rate': 8.202224327336406e-06, 'epoch': 0.28}


 28%|██▊       | 229/805 [33:55<1:25:12,  8.88s/it]

{'loss': 0.8448, 'grad_norm': 0.3658362329006195, 'learning_rate': 8.18711994874345e-06, 'epoch': 0.28}


 29%|██▊       | 230/805 [34:04<1:25:05,  8.88s/it]

{'loss': 0.7973, 'grad_norm': 0.39407190680503845, 'learning_rate': 8.171966420818227e-06, 'epoch': 0.29}


 29%|██▊       | 231/805 [34:13<1:24:57,  8.88s/it]

{'loss': 0.7457, 'grad_norm': 0.3824682831764221, 'learning_rate': 8.15676397724689e-06, 'epoch': 0.29}


 29%|██▉       | 232/805 [34:22<1:24:45,  8.88s/it]

{'loss': 0.8316, 'grad_norm': 0.3606491982936859, 'learning_rate': 8.141512852469919e-06, 'epoch': 0.29}


 29%|██▉       | 233/805 [34:31<1:24:38,  8.88s/it]

{'loss': 0.8299, 'grad_norm': 0.3665322959423065, 'learning_rate': 8.126213281678527e-06, 'epoch': 0.29}


 29%|██▉       | 234/805 [34:39<1:24:29,  8.88s/it]

{'loss': 0.8906, 'grad_norm': 0.3797590136528015, 'learning_rate': 8.110865500811022e-06, 'epoch': 0.29}


 29%|██▉       | 235/805 [34:48<1:24:25,  8.89s/it]

{'loss': 0.7295, 'grad_norm': 0.3671089708805084, 'learning_rate': 8.095469746549172e-06, 'epoch': 0.29}


 29%|██▉       | 236/805 [34:57<1:24:17,  8.89s/it]

{'loss': 0.8579, 'grad_norm': 0.3854551315307617, 'learning_rate': 8.08002625631455e-06, 'epoch': 0.29}


 29%|██▉       | 237/805 [35:06<1:24:05,  8.88s/it]

{'loss': 0.897, 'grad_norm': 0.36065003275871277, 'learning_rate': 8.064535268264883e-06, 'epoch': 0.29}


 30%|██▉       | 238/805 [35:15<1:23:56,  8.88s/it]

{'loss': 0.7353, 'grad_norm': 0.36869680881500244, 'learning_rate': 8.04899702129037e-06, 'epoch': 0.3}


 30%|██▉       | 239/805 [35:24<1:23:46,  8.88s/it]

{'loss': 0.8198, 'grad_norm': 0.3854690194129944, 'learning_rate': 8.033411755009999e-06, 'epoch': 0.3}


 30%|██▉       | 240/805 [35:33<1:23:34,  8.88s/it]

{'loss': 0.8348, 'grad_norm': 0.34438931941986084, 'learning_rate': 8.017779709767857e-06, 'epoch': 0.3}


 30%|██▉       | 241/805 [35:42<1:23:24,  8.87s/it]

{'loss': 0.7583, 'grad_norm': 0.3612406551837921, 'learning_rate': 8.002101126629422e-06, 'epoch': 0.3}


 30%|███       | 242/805 [35:50<1:23:15,  8.87s/it]

{'loss': 0.7524, 'grad_norm': 0.3758290708065033, 'learning_rate': 7.986376247377835e-06, 'epoch': 0.3}


 30%|███       | 243/805 [35:59<1:23:06,  8.87s/it]

{'loss': 0.8234, 'grad_norm': 0.41656458377838135, 'learning_rate': 7.970605314510194e-06, 'epoch': 0.3}


 30%|███       | 244/805 [36:08<1:22:59,  8.88s/it]

{'loss': 0.8273, 'grad_norm': 0.35142701864242554, 'learning_rate': 7.954788571233788e-06, 'epoch': 0.3}


 30%|███       | 245/805 [36:17<1:22:51,  8.88s/it]

{'loss': 0.7313, 'grad_norm': 0.3594309687614441, 'learning_rate': 7.938926261462366e-06, 'epoch': 0.3}


 31%|███       | 246/805 [36:26<1:22:42,  8.88s/it]

{'loss': 0.8523, 'grad_norm': 0.3947225511074066, 'learning_rate': 7.923018629812369e-06, 'epoch': 0.31}


 31%|███       | 247/805 [36:35<1:22:34,  8.88s/it]

{'loss': 0.7092, 'grad_norm': 0.3541492521762848, 'learning_rate': 7.907065921599153e-06, 'epoch': 0.31}


 31%|███       | 248/805 [36:44<1:22:26,  8.88s/it]

{'loss': 0.7847, 'grad_norm': 0.3749108910560608, 'learning_rate': 7.891068382833216e-06, 'epoch': 0.31}


 31%|███       | 249/805 [36:53<1:22:16,  8.88s/it]

{'loss': 0.8254, 'grad_norm': 0.3703977167606354, 'learning_rate': 7.875026260216395e-06, 'epoch': 0.31}


 31%|███       | 250/805 [37:02<1:22:08,  8.88s/it]

{'loss': 0.7564, 'grad_norm': 0.36839839816093445, 'learning_rate': 7.858939801138061e-06, 'epoch': 0.31}


 31%|███       | 251/805 [37:10<1:21:58,  8.88s/it]

{'loss': 0.8074, 'grad_norm': 0.35700932145118713, 'learning_rate': 7.842809253671321e-06, 'epoch': 0.31}


 31%|███▏      | 252/805 [37:19<1:21:47,  8.87s/it]

{'loss': 0.7351, 'grad_norm': 0.36611485481262207, 'learning_rate': 7.826634866569164e-06, 'epoch': 0.31}


 31%|███▏      | 253/805 [37:28<1:21:39,  8.88s/it]

{'loss': 0.7292, 'grad_norm': 0.3686841130256653, 'learning_rate': 7.810416889260653e-06, 'epoch': 0.31}


 32%|███▏      | 254/805 [37:37<1:21:28,  8.87s/it]

{'loss': 0.8927, 'grad_norm': 0.3868545591831207, 'learning_rate': 7.794155571847058e-06, 'epoch': 0.32}


 32%|███▏      | 255/805 [37:46<1:21:20,  8.87s/it]

{'loss': 0.7764, 'grad_norm': 0.3600175380706787, 'learning_rate': 7.777851165098012e-06, 'epoch': 0.32}


 32%|███▏      | 256/805 [37:55<1:21:11,  8.87s/it]

{'loss': 0.7929, 'grad_norm': 0.37011954188346863, 'learning_rate': 7.761503920447636e-06, 'epoch': 0.32}


 32%|███▏      | 257/805 [38:04<1:20:53,  8.86s/it]

{'loss': 0.7906, 'grad_norm': 0.3761155903339386, 'learning_rate': 7.74511408999066e-06, 'epoch': 0.32}


 32%|███▏      | 258/805 [38:12<1:20:43,  8.85s/it]

{'loss': 0.8065, 'grad_norm': 0.36860391497612, 'learning_rate': 7.72868192647855e-06, 'epoch': 0.32}


 32%|███▏      | 259/805 [38:21<1:20:39,  8.86s/it]

{'loss': 0.7442, 'grad_norm': 0.3651680648326874, 'learning_rate': 7.712207683315595e-06, 'epoch': 0.32}


 32%|███▏      | 260/805 [38:30<1:20:36,  8.87s/it]

{'loss': 0.8006, 'grad_norm': 0.3560267388820648, 'learning_rate': 7.695691614555002e-06, 'epoch': 0.32}


 32%|███▏      | 261/805 [38:39<1:20:27,  8.87s/it]

{'loss': 0.7696, 'grad_norm': 0.3793180584907532, 'learning_rate': 7.679133974894984e-06, 'epoch': 0.32}


 33%|███▎      | 262/805 [38:48<1:20:21,  8.88s/it]

{'loss': 0.7729, 'grad_norm': 0.38279271125793457, 'learning_rate': 7.662535019674828e-06, 'epoch': 0.33}


 33%|███▎      | 263/805 [38:57<1:20:14,  8.88s/it]

{'loss': 0.79, 'grad_norm': 0.3687765300273895, 'learning_rate': 7.645895004870953e-06, 'epoch': 0.33}


 33%|███▎      | 264/805 [39:06<1:20:03,  8.88s/it]

{'loss': 0.7749, 'grad_norm': 0.36554116010665894, 'learning_rate': 7.6292141870929784e-06, 'epoch': 0.33}


 33%|███▎      | 265/805 [39:15<1:19:51,  8.87s/it]

{'loss': 0.8233, 'grad_norm': 0.3820873200893402, 'learning_rate': 7.612492823579744e-06, 'epoch': 0.33}


 33%|███▎      | 266/805 [39:23<1:19:45,  8.88s/it]

{'loss': 0.7276, 'grad_norm': 0.35797762870788574, 'learning_rate': 7.5957311721953656e-06, 'epoch': 0.33}


 33%|███▎      | 267/805 [39:32<1:19:35,  8.88s/it]

{'loss': 0.7268, 'grad_norm': 0.38811397552490234, 'learning_rate': 7.5789294914252376e-06, 'epoch': 0.33}


 33%|███▎      | 268/805 [39:41<1:19:24,  8.87s/it]

{'loss': 0.8515, 'grad_norm': 0.5329477787017822, 'learning_rate': 7.562088040372067e-06, 'epoch': 0.33}


 33%|███▎      | 269/805 [39:50<1:19:16,  8.87s/it]

{'loss': 0.8016, 'grad_norm': 0.3751896619796753, 'learning_rate': 7.545207078751858e-06, 'epoch': 0.33}


 34%|███▎      | 270/805 [39:59<1:19:04,  8.87s/it]

{'loss': 0.759, 'grad_norm': 0.40382152795791626, 'learning_rate': 7.528286866889924e-06, 'epoch': 0.34}


 34%|███▎      | 271/805 [40:08<1:18:54,  8.87s/it]

{'loss': 0.8309, 'grad_norm': 0.4060068428516388, 'learning_rate': 7.511327665716863e-06, 'epoch': 0.34}


 34%|███▍      | 272/805 [40:17<1:18:46,  8.87s/it]

{'loss': 0.7647, 'grad_norm': 0.35958749055862427, 'learning_rate': 7.494329736764538e-06, 'epoch': 0.34}


 34%|███▍      | 273/805 [40:26<1:18:42,  8.88s/it]

{'loss': 0.7992, 'grad_norm': 0.4123683571815491, 'learning_rate': 7.477293342162038e-06, 'epoch': 0.34}


 34%|███▍      | 274/805 [40:34<1:18:32,  8.87s/it]

{'loss': 0.8139, 'grad_norm': 0.4094782769680023, 'learning_rate': 7.4602187446316456e-06, 'epoch': 0.34}


 34%|███▍      | 275/805 [40:43<1:18:23,  8.88s/it]

{'loss': 0.7534, 'grad_norm': 0.36895665526390076, 'learning_rate': 7.443106207484776e-06, 'epoch': 0.34}


 34%|███▍      | 276/805 [40:52<1:18:16,  8.88s/it]

{'loss': 0.7176, 'grad_norm': 0.3575862944126129, 'learning_rate': 7.425955994617919e-06, 'epoch': 0.34}


 34%|███▍      | 277/805 [41:01<1:18:10,  8.88s/it]

{'loss': 0.8229, 'grad_norm': 0.39863282442092896, 'learning_rate': 7.408768370508577e-06, 'epoch': 0.34}


 35%|███▍      | 278/805 [41:10<1:18:04,  8.89s/it]

{'loss': 0.829, 'grad_norm': 0.3731921911239624, 'learning_rate': 7.391543600211173e-06, 'epoch': 0.35}


 35%|███▍      | 279/805 [41:19<1:17:53,  8.88s/it]

{'loss': 0.6969, 'grad_norm': 0.4004436731338501, 'learning_rate': 7.3742819493529725e-06, 'epoch': 0.35}


 35%|███▍      | 280/805 [41:28<1:17:40,  8.88s/it]

{'loss': 0.7504, 'grad_norm': 0.37950965762138367, 'learning_rate': 7.3569836841299905e-06, 'epoch': 0.35}


 35%|███▍      | 281/805 [41:37<1:17:30,  8.87s/it]

{'loss': 0.8102, 'grad_norm': 0.3952524662017822, 'learning_rate': 7.3396490713028674e-06, 'epoch': 0.35}


 35%|███▌      | 282/805 [41:45<1:17:20,  8.87s/it]

{'loss': 0.7781, 'grad_norm': 0.4066900312900543, 'learning_rate': 7.322278378192783e-06, 'epoch': 0.35}


 35%|███▌      | 283/805 [41:54<1:17:13,  8.88s/it]

{'loss': 0.7785, 'grad_norm': 0.4283284544944763, 'learning_rate': 7.304871872677313e-06, 'epoch': 0.35}


 35%|███▌      | 284/805 [42:03<1:17:01,  8.87s/it]

{'loss': 0.7258, 'grad_norm': 0.4630284607410431, 'learning_rate': 7.2874298231863025e-06, 'epoch': 0.35}


 35%|███▌      | 285/805 [42:12<1:16:52,  8.87s/it]

{'loss': 0.7631, 'grad_norm': 0.4440036118030548, 'learning_rate': 7.269952498697734e-06, 'epoch': 0.35}


 36%|███▌      | 286/805 [42:21<1:16:44,  8.87s/it]

{'loss': 0.7806, 'grad_norm': 0.3907237648963928, 'learning_rate': 7.252440168733572e-06, 'epoch': 0.36}


 36%|███▌      | 287/805 [42:30<1:16:37,  8.87s/it]

{'loss': 0.7835, 'grad_norm': 0.3910478353500366, 'learning_rate': 7.2348931033556065e-06, 'epoch': 0.36}


 36%|███▌      | 288/805 [42:39<1:16:28,  8.88s/it]

{'loss': 0.7334, 'grad_norm': 0.38623636960983276, 'learning_rate': 7.217311573161293e-06, 'epoch': 0.36}


 36%|███▌      | 289/805 [42:48<1:16:22,  8.88s/it]

{'loss': 0.8052, 'grad_norm': 0.4073973596096039, 'learning_rate': 7.199695849279576e-06, 'epoch': 0.36}


 36%|███▌      | 290/805 [42:57<1:16:20,  8.89s/it]

{'loss': 0.7733, 'grad_norm': 0.40213245153427124, 'learning_rate': 7.18204620336671e-06, 'epoch': 0.36}


 36%|███▌      | 291/805 [43:05<1:16:07,  8.89s/it]

{'loss': 0.7543, 'grad_norm': 0.3832298219203949, 'learning_rate': 7.164362907602072e-06, 'epoch': 0.36}


 36%|███▋      | 292/805 [43:14<1:15:59,  8.89s/it]

{'loss': 0.7484, 'grad_norm': 0.4077099561691284, 'learning_rate': 7.14664623468395e-06, 'epoch': 0.36}


 36%|███▋      | 293/805 [43:23<1:15:51,  8.89s/it]

{'loss': 0.8163, 'grad_norm': 0.40114909410476685, 'learning_rate': 7.128896457825364e-06, 'epoch': 0.36}


 37%|███▋      | 294/805 [43:32<1:15:38,  8.88s/it]

{'loss': 0.8293, 'grad_norm': 0.4073144793510437, 'learning_rate': 7.111113850749828e-06, 'epoch': 0.36}


 37%|███▋      | 295/805 [43:41<1:15:29,  8.88s/it]

{'loss': 0.7559, 'grad_norm': 0.38685277104377747, 'learning_rate': 7.093298687687141e-06, 'epoch': 0.37}


 37%|███▋      | 296/805 [43:50<1:15:21,  8.88s/it]

{'loss': 0.8575, 'grad_norm': 0.4055614173412323, 'learning_rate': 7.075451243369157e-06, 'epoch': 0.37}


 37%|███▋      | 297/805 [43:59<1:15:11,  8.88s/it]

{'loss': 0.7503, 'grad_norm': 0.40666311979293823, 'learning_rate': 7.057571793025545e-06, 'epoch': 0.37}


 37%|███▋      | 298/805 [44:08<1:15:01,  8.88s/it]

{'loss': 0.6934, 'grad_norm': 0.3930645287036896, 'learning_rate': 7.0396606123795465e-06, 'epoch': 0.37}


 37%|███▋      | 299/805 [44:16<1:14:52,  8.88s/it]

{'loss': 0.8006, 'grad_norm': 0.3955199718475342, 'learning_rate': 7.021717977643726e-06, 'epoch': 0.37}


 37%|███▋      | 300/805 [44:25<1:14:42,  8.88s/it]

{'loss': 0.7718, 'grad_norm': 0.43505918979644775, 'learning_rate': 7.0037441655157045e-06, 'epoch': 0.37}


 37%|███▋      | 301/805 [44:34<1:14:32,  8.87s/it]

{'loss': 0.8033, 'grad_norm': 0.3976781964302063, 'learning_rate': 6.985739453173903e-06, 'epoch': 0.37}


 38%|███▊      | 302/805 [44:43<1:14:25,  8.88s/it]

{'loss': 0.8024, 'grad_norm': 0.4154641628265381, 'learning_rate': 6.967704118273257e-06, 'epoch': 0.37}


 38%|███▊      | 303/805 [44:52<1:14:16,  8.88s/it]

{'loss': 0.7509, 'grad_norm': 0.40738096833229065, 'learning_rate': 6.949638438940942e-06, 'epoch': 0.38}


 38%|███▊      | 304/805 [45:01<1:14:04,  8.87s/it]

{'loss': 0.7918, 'grad_norm': 0.41190817952156067, 'learning_rate': 6.931542693772081e-06, 'epoch': 0.38}


 38%|███▊      | 305/805 [45:10<1:13:58,  8.88s/it]

{'loss': 0.7335, 'grad_norm': 0.44196686148643494, 'learning_rate': 6.913417161825449e-06, 'epoch': 0.38}


 38%|███▊      | 306/805 [45:19<1:13:48,  8.88s/it]

{'loss': 0.7631, 'grad_norm': 0.4195568561553955, 'learning_rate': 6.895262122619174e-06, 'epoch': 0.38}


 38%|███▊      | 307/805 [45:27<1:13:39,  8.87s/it]

{'loss': 0.8688, 'grad_norm': 0.3811817765235901, 'learning_rate': 6.877077856126416e-06, 'epoch': 0.38}


 38%|███▊      | 308/805 [45:36<1:13:33,  8.88s/it]

{'loss': 0.7849, 'grad_norm': 0.4146384298801422, 'learning_rate': 6.858864642771062e-06, 'epoch': 0.38}


 38%|███▊      | 309/805 [45:45<1:13:22,  8.88s/it]

{'loss': 0.7007, 'grad_norm': 0.3738710284233093, 'learning_rate': 6.840622763423391e-06, 'epoch': 0.38}


 39%|███▊      | 310/805 [45:54<1:13:16,  8.88s/it]

{'loss': 0.8044, 'grad_norm': 0.4047057330608368, 'learning_rate': 6.822352499395751e-06, 'epoch': 0.38}


 39%|███▊      | 311/805 [46:03<1:13:07,  8.88s/it]

{'loss': 0.75, 'grad_norm': 0.4015131890773773, 'learning_rate': 6.804054132438209e-06, 'epoch': 0.39}


 39%|███▉      | 312/805 [46:12<1:12:56,  8.88s/it]

{'loss': 0.7138, 'grad_norm': 0.39313805103302, 'learning_rate': 6.785727944734228e-06, 'epoch': 0.39}


 39%|███▉      | 313/805 [46:21<1:12:47,  8.88s/it]

{'loss': 0.8055, 'grad_norm': 0.40841731429100037, 'learning_rate': 6.767374218896286e-06, 'epoch': 0.39}


 39%|███▉      | 314/805 [46:30<1:12:41,  8.88s/it]

{'loss': 0.7466, 'grad_norm': 0.39264920353889465, 'learning_rate': 6.748993237961544e-06, 'epoch': 0.39}


 39%|███▉      | 315/805 [46:39<1:12:36,  8.89s/it]

{'loss': 0.9094, 'grad_norm': 0.3914259970188141, 'learning_rate': 6.730585285387465e-06, 'epoch': 0.39}


 39%|███▉      | 316/805 [46:47<1:12:28,  8.89s/it]

{'loss': 0.7284, 'grad_norm': 0.412749707698822, 'learning_rate': 6.71215064504745e-06, 'epoch': 0.39}


 39%|███▉      | 317/805 [46:56<1:12:22,  8.90s/it]

{'loss': 0.8483, 'grad_norm': 0.4396916627883911, 'learning_rate': 6.693689601226458e-06, 'epoch': 0.39}


 40%|███▉      | 318/805 [47:05<1:12:16,  8.91s/it]

{'loss': 0.8315, 'grad_norm': 0.43982070684432983, 'learning_rate': 6.67520243861662e-06, 'epoch': 0.39}


 40%|███▉      | 319/805 [47:14<1:12:02,  8.89s/it]

{'loss': 0.7966, 'grad_norm': 0.46631500124931335, 'learning_rate': 6.656689442312855e-06, 'epoch': 0.4}


 40%|███▉      | 320/805 [47:23<1:11:51,  8.89s/it]

{'loss': 0.7381, 'grad_norm': 0.429653525352478, 'learning_rate': 6.638150897808469e-06, 'epoch': 0.4}


 40%|███▉      | 321/805 [47:32<1:11:41,  8.89s/it]

{'loss': 0.7597, 'grad_norm': 0.422239750623703, 'learning_rate': 6.619587090990748e-06, 'epoch': 0.4}


 40%|████      | 322/805 [47:41<1:11:31,  8.89s/it]

{'loss': 0.8305, 'grad_norm': 0.4239239990711212, 'learning_rate': 6.600998308136559e-06, 'epoch': 0.4}


 40%|████      | 323/805 [47:50<1:11:22,  8.88s/it]

{'loss': 0.6765, 'grad_norm': 0.40446245670318604, 'learning_rate': 6.582384835907931e-06, 'epoch': 0.4}


 40%|████      | 324/805 [47:59<1:11:14,  8.89s/it]

{'loss': 0.8336, 'grad_norm': 0.4039120376110077, 'learning_rate': 6.56374696134763e-06, 'epoch': 0.4}


 40%|████      | 325/805 [48:07<1:11:08,  8.89s/it]

{'loss': 0.7896, 'grad_norm': 0.4126490354537964, 'learning_rate': 6.545084971874738e-06, 'epoch': 0.4}


 40%|████      | 326/805 [48:16<1:10:55,  8.89s/it]

{'loss': 0.8061, 'grad_norm': 0.40112337470054626, 'learning_rate': 6.526399155280218e-06, 'epoch': 0.4}


 41%|████      | 327/805 [48:25<1:10:50,  8.89s/it]

{'loss': 0.8357, 'grad_norm': 0.43642494082450867, 'learning_rate': 6.507689799722479e-06, 'epoch': 0.41}


 41%|████      | 328/805 [48:34<1:10:39,  8.89s/it]

{'loss': 0.8717, 'grad_norm': 0.40457651019096375, 'learning_rate': 6.4889571937229275e-06, 'epoch': 0.41}


 41%|████      | 329/805 [48:43<1:11:05,  8.96s/it]

{'loss': 0.7656, 'grad_norm': 0.41347628831863403, 'learning_rate': 6.47020162616152e-06, 'epoch': 0.41}


 41%|████      | 330/805 [48:53<1:12:20,  9.14s/it]

{'loss': 0.7831, 'grad_norm': 0.44382596015930176, 'learning_rate': 6.451423386272312e-06, 'epoch': 0.41}


 41%|████      | 331/805 [49:02<1:11:49,  9.09s/it]

{'loss': 0.6876, 'grad_norm': 0.4047703146934509, 'learning_rate': 6.432622763638993e-06, 'epoch': 0.41}


 41%|████      | 332/805 [49:11<1:11:13,  9.03s/it]

{'loss': 0.7956, 'grad_norm': 0.3953102231025696, 'learning_rate': 6.413800048190417e-06, 'epoch': 0.41}


 41%|████▏     | 333/805 [49:20<1:10:42,  8.99s/it]

{'loss': 0.767, 'grad_norm': 0.44249823689460754, 'learning_rate': 6.3949555301961474e-06, 'epoch': 0.41}


 41%|████▏     | 334/805 [49:28<1:10:20,  8.96s/it]

{'loss': 0.8154, 'grad_norm': 0.43902936577796936, 'learning_rate': 6.376089500261958e-06, 'epoch': 0.41}


 42%|████▏     | 335/805 [49:37<1:10:00,  8.94s/it]

{'loss': 0.7379, 'grad_norm': 0.4508224427700043, 'learning_rate': 6.3572022493253715e-06, 'epoch': 0.42}


 42%|████▏     | 336/805 [49:46<1:09:46,  8.93s/it]

{'loss': 0.8537, 'grad_norm': 0.43253716826438904, 'learning_rate': 6.3382940686511625e-06, 'epoch': 0.42}


 42%|████▏     | 337/805 [49:55<1:09:33,  8.92s/it]

{'loss': 0.7143, 'grad_norm': 0.4371825158596039, 'learning_rate': 6.3193652498268656e-06, 'epoch': 0.42}


 42%|████▏     | 338/805 [50:04<1:09:25,  8.92s/it]

{'loss': 0.8061, 'grad_norm': 0.4277805984020233, 'learning_rate': 6.300416084758284e-06, 'epoch': 0.42}


 42%|████▏     | 339/805 [50:13<1:09:08,  8.90s/it]

{'loss': 0.7447, 'grad_norm': 0.4233402907848358, 'learning_rate': 6.281446865664984e-06, 'epoch': 0.42}


 42%|████▏     | 340/805 [50:22<1:08:56,  8.89s/it]

{'loss': 0.7723, 'grad_norm': 0.43307510018348694, 'learning_rate': 6.26245788507579e-06, 'epoch': 0.42}


 42%|████▏     | 341/805 [50:31<1:08:43,  8.89s/it]

{'loss': 0.769, 'grad_norm': 0.41620808839797974, 'learning_rate': 6.243449435824276e-06, 'epoch': 0.42}


 42%|████▏     | 342/805 [50:40<1:08:33,  8.89s/it]

{'loss': 0.8435, 'grad_norm': 0.39943546056747437, 'learning_rate': 6.224421811044238e-06, 'epoch': 0.42}


 43%|████▎     | 343/805 [50:48<1:08:28,  8.89s/it]

{'loss': 0.7778, 'grad_norm': 0.4359685480594635, 'learning_rate': 6.205375304165194e-06, 'epoch': 0.43}


 43%|████▎     | 344/805 [50:57<1:08:19,  8.89s/it]

{'loss': 0.8923, 'grad_norm': 0.44564083218574524, 'learning_rate': 6.18631020890784e-06, 'epoch': 0.43}


 43%|████▎     | 345/805 [51:06<1:07:59,  8.87s/it]

{'loss': 0.795, 'grad_norm': 0.458784818649292, 'learning_rate': 6.1672268192795285e-06, 'epoch': 0.43}


 43%|████▎     | 346/805 [51:15<1:07:41,  8.85s/it]

{'loss': 0.6997, 'grad_norm': 0.4518861770629883, 'learning_rate': 6.148125429569735e-06, 'epoch': 0.43}


 43%|████▎     | 347/805 [51:24<1:07:27,  8.84s/it]

{'loss': 0.7538, 'grad_norm': 0.4360245168209076, 'learning_rate': 6.1290063343455196e-06, 'epoch': 0.43}


 43%|████▎     | 348/805 [51:33<1:07:20,  8.84s/it]

{'loss': 0.7802, 'grad_norm': 0.5145269632339478, 'learning_rate': 6.10986982844698e-06, 'epoch': 0.43}


 43%|████▎     | 349/805 [51:42<1:07:16,  8.85s/it]

{'loss': 0.7601, 'grad_norm': 0.42382511496543884, 'learning_rate': 6.090716206982714e-06, 'epoch': 0.43}


 43%|████▎     | 350/805 [51:50<1:07:02,  8.84s/it]

{'loss': 0.7366, 'grad_norm': 0.4166308045387268, 'learning_rate': 6.071545765325254e-06, 'epoch': 0.43}


 44%|████▎     | 351/805 [51:59<1:06:57,  8.85s/it]

{'loss': 0.7944, 'grad_norm': 0.47582048177719116, 'learning_rate': 6.052358799106528e-06, 'epoch': 0.44}


 44%|████▎     | 352/805 [52:08<1:06:52,  8.86s/it]

{'loss': 0.8317, 'grad_norm': 0.4340965747833252, 'learning_rate': 6.033155604213291e-06, 'epoch': 0.44}


 44%|████▍     | 353/805 [52:17<1:06:45,  8.86s/it]

{'loss': 0.7525, 'grad_norm': 0.4819643199443817, 'learning_rate': 6.013936476782563e-06, 'epoch': 0.44}


 44%|████▍     | 354/805 [52:26<1:06:39,  8.87s/it]

{'loss': 0.7633, 'grad_norm': 0.4187299907207489, 'learning_rate': 5.994701713197063e-06, 'epoch': 0.44}


 44%|████▍     | 355/805 [52:35<1:06:33,  8.87s/it]

{'loss': 0.7664, 'grad_norm': 0.4407455623149872, 'learning_rate': 5.975451610080643e-06, 'epoch': 0.44}


 44%|████▍     | 356/805 [52:44<1:06:26,  8.88s/it]

{'loss': 0.774, 'grad_norm': 0.44808685779571533, 'learning_rate': 5.956186464293703e-06, 'epoch': 0.44}


 44%|████▍     | 357/805 [52:52<1:06:16,  8.88s/it]

{'loss': 0.7562, 'grad_norm': 0.4455533027648926, 'learning_rate': 5.936906572928625e-06, 'epoch': 0.44}


 44%|████▍     | 358/805 [53:01<1:06:08,  8.88s/it]

{'loss': 0.6838, 'grad_norm': 0.46281832456588745, 'learning_rate': 5.917612233305183e-06, 'epoch': 0.44}


 45%|████▍     | 359/805 [53:10<1:06:04,  8.89s/it]

{'loss': 0.7055, 'grad_norm': 0.4454914927482605, 'learning_rate': 5.898303742965964e-06, 'epoch': 0.45}


 45%|████▍     | 360/805 [53:19<1:05:58,  8.90s/it]

{'loss': 0.8769, 'grad_norm': 0.4545576870441437, 'learning_rate': 5.878981399671774e-06, 'epoch': 0.45}


 45%|████▍     | 361/805 [53:28<1:05:50,  8.90s/it]

{'loss': 0.7773, 'grad_norm': 0.4293172359466553, 'learning_rate': 5.859645501397048e-06, 'epoch': 0.45}


 45%|████▍     | 362/805 [53:37<1:05:40,  8.90s/it]

{'loss': 0.7066, 'grad_norm': 0.44413813948631287, 'learning_rate': 5.8402963463252605e-06, 'epoch': 0.45}


 45%|████▌     | 363/805 [53:46<1:05:29,  8.89s/it]

{'loss': 0.7579, 'grad_norm': 0.41936278343200684, 'learning_rate': 5.820934232844315e-06, 'epoch': 0.45}


 45%|████▌     | 364/805 [53:55<1:05:18,  8.89s/it]

{'loss': 0.806, 'grad_norm': 0.4575762450695038, 'learning_rate': 5.801559459541956e-06, 'epoch': 0.45}


 45%|████▌     | 365/805 [54:04<1:05:06,  8.88s/it]

{'loss': 0.7214, 'grad_norm': 0.4307347536087036, 'learning_rate': 5.782172325201155e-06, 'epoch': 0.45}


 45%|████▌     | 366/805 [54:12<1:04:55,  8.87s/it]

{'loss': 0.8639, 'grad_norm': 0.42040354013442993, 'learning_rate': 5.7627731287955054e-06, 'epoch': 0.45}


 46%|████▌     | 367/805 [54:21<1:04:48,  8.88s/it]

{'loss': 0.8041, 'grad_norm': 0.429280161857605, 'learning_rate': 5.743362169484617e-06, 'epoch': 0.46}


 46%|████▌     | 368/805 [54:30<1:04:38,  8.88s/it]

{'loss': 0.7317, 'grad_norm': 0.41500771045684814, 'learning_rate': 5.72393974660949e-06, 'epoch': 0.46}


 46%|████▌     | 369/805 [54:39<1:04:30,  8.88s/it]

{'loss': 0.7582, 'grad_norm': 0.39831870794296265, 'learning_rate': 5.704506159687914e-06, 'epoch': 0.46}


 46%|████▌     | 370/805 [54:48<1:04:24,  8.88s/it]

{'loss': 0.8004, 'grad_norm': 0.429489403963089, 'learning_rate': 5.6850617084098416e-06, 'epoch': 0.46}


 46%|████▌     | 371/805 [54:57<1:04:08,  8.87s/it]

{'loss': 0.7738, 'grad_norm': 0.43165451288223267, 'learning_rate': 5.665606692632762e-06, 'epoch': 0.46}


 46%|████▌     | 372/805 [55:06<1:03:53,  8.85s/it]

{'loss': 0.7837, 'grad_norm': 0.4068811237812042, 'learning_rate': 5.646141412377089e-06, 'epoch': 0.46}


 46%|████▋     | 373/805 [55:14<1:03:38,  8.84s/it]

{'loss': 0.6912, 'grad_norm': 0.4287651479244232, 'learning_rate': 5.626666167821522e-06, 'epoch': 0.46}


 46%|████▋     | 374/805 [55:23<1:03:24,  8.83s/it]

{'loss': 0.8018, 'grad_norm': 0.4078768789768219, 'learning_rate': 5.607181259298424e-06, 'epoch': 0.46}


 47%|████▋     | 375/805 [55:32<1:03:12,  8.82s/it]

{'loss': 0.7703, 'grad_norm': 0.45065754652023315, 'learning_rate': 5.587686987289189e-06, 'epoch': 0.47}


 47%|████▋     | 376/805 [55:41<1:03:02,  8.82s/it]

{'loss': 0.7691, 'grad_norm': 0.4266374111175537, 'learning_rate': 5.5681836524196065e-06, 'epoch': 0.47}


 47%|████▋     | 377/805 [55:50<1:02:55,  8.82s/it]

{'loss': 0.8113, 'grad_norm': 0.4292258024215698, 'learning_rate': 5.548671555455226e-06, 'epoch': 0.47}


 47%|████▋     | 378/805 [55:59<1:02:55,  8.84s/it]

{'loss': 0.7019, 'grad_norm': 0.4391200542449951, 'learning_rate': 5.529150997296724e-06, 'epoch': 0.47}


 47%|████▋     | 379/805 [56:07<1:02:50,  8.85s/it]

{'loss': 0.8219, 'grad_norm': 0.4204496443271637, 'learning_rate': 5.50962227897525e-06, 'epoch': 0.47}


 47%|████▋     | 380/805 [56:16<1:02:47,  8.87s/it]

{'loss': 0.755, 'grad_norm': 0.4031336009502411, 'learning_rate': 5.490085701647805e-06, 'epoch': 0.47}


 47%|████▋     | 381/805 [56:25<1:02:38,  8.86s/it]

{'loss': 0.8098, 'grad_norm': 0.43935346603393555, 'learning_rate': 5.470541566592573e-06, 'epoch': 0.47}


 47%|████▋     | 382/805 [56:34<1:02:32,  8.87s/it]

{'loss': 0.7864, 'grad_norm': 0.41188234090805054, 'learning_rate': 5.450990175204296e-06, 'epoch': 0.47}


 48%|████▊     | 383/805 [56:43<1:02:24,  8.87s/it]

{'loss': 0.7205, 'grad_norm': 0.4005018472671509, 'learning_rate': 5.431431828989618e-06, 'epoch': 0.48}


 48%|████▊     | 384/805 [56:52<1:02:16,  8.88s/it]

{'loss': 0.7664, 'grad_norm': 0.4482267498970032, 'learning_rate': 5.411866829562429e-06, 'epoch': 0.48}


 48%|████▊     | 385/805 [57:01<1:02:09,  8.88s/it]

{'loss': 0.7302, 'grad_norm': 0.4150374233722687, 'learning_rate': 5.392295478639226e-06, 'epoch': 0.48}


 48%|████▊     | 386/805 [57:10<1:02:01,  8.88s/it]

{'loss': 0.8104, 'grad_norm': 0.38298940658569336, 'learning_rate': 5.372718078034449e-06, 'epoch': 0.48}


 48%|████▊     | 387/805 [57:19<1:01:54,  8.89s/it]

{'loss': 0.7572, 'grad_norm': 0.40104201436042786, 'learning_rate': 5.353134929655834e-06, 'epoch': 0.48}


 48%|████▊     | 388/805 [57:27<1:01:45,  8.89s/it]

{'loss': 0.7227, 'grad_norm': 0.4588334560394287, 'learning_rate': 5.333546335499756e-06, 'epoch': 0.48}


 48%|████▊     | 389/805 [57:36<1:01:34,  8.88s/it]

{'loss': 0.778, 'grad_norm': 0.3970866799354553, 'learning_rate': 5.3139525976465675e-06, 'epoch': 0.48}


 48%|████▊     | 390/805 [57:45<1:01:28,  8.89s/it]

{'loss': 0.7743, 'grad_norm': 0.38696086406707764, 'learning_rate': 5.294354018255945e-06, 'epoch': 0.48}


 49%|████▊     | 391/805 [57:54<1:01:14,  8.88s/it]

{'loss': 0.7541, 'grad_norm': 0.40317344665527344, 'learning_rate': 5.27475089956223e-06, 'epoch': 0.49}


 49%|████▊     | 392/805 [58:03<1:00:56,  8.85s/it]

{'loss': 0.7853, 'grad_norm': 0.4208306670188904, 'learning_rate': 5.255143543869759e-06, 'epoch': 0.49}


 49%|████▉     | 393/805 [58:12<1:00:43,  8.84s/it]

{'loss': 0.7114, 'grad_norm': 0.40818360447883606, 'learning_rate': 5.235532253548213e-06, 'epoch': 0.49}


 49%|████▉     | 394/805 [58:20<1:00:33,  8.84s/it]

{'loss': 0.7574, 'grad_norm': 0.39752718806266785, 'learning_rate': 5.215917331027952e-06, 'epoch': 0.49}


 49%|████▉     | 395/805 [58:29<1:00:21,  8.83s/it]

{'loss': 0.7951, 'grad_norm': 0.39873018860816956, 'learning_rate': 5.1962990787953436e-06, 'epoch': 0.49}


 49%|████▉     | 396/805 [58:38<1:00:12,  8.83s/it]

{'loss': 0.8399, 'grad_norm': 0.40701621770858765, 'learning_rate': 5.176677799388107e-06, 'epoch': 0.49}


 49%|████▉     | 397/805 [58:47<1:00:06,  8.84s/it]

{'loss': 0.7192, 'grad_norm': 0.43665289878845215, 'learning_rate': 5.157053795390642e-06, 'epoch': 0.49}


 49%|████▉     | 398/805 [58:56<1:00:00,  8.85s/it]

{'loss': 0.7842, 'grad_norm': 0.39239639043807983, 'learning_rate': 5.1374273694293676e-06, 'epoch': 0.49}


 50%|████▉     | 399/805 [59:05<59:53,  8.85s/it]  

{'loss': 0.7992, 'grad_norm': 0.4254498779773712, 'learning_rate': 5.117798824168052e-06, 'epoch': 0.5}


 50%|████▉     | 400/805 [59:14<59:46,  8.86s/it]

{'loss': 0.7638, 'grad_norm': 0.4154094457626343, 'learning_rate': 5.098168462303141e-06, 'epoch': 0.5}


 50%|████▉     | 401/805 [59:22<59:38,  8.86s/it]

{'loss': 0.745, 'grad_norm': 0.38764438033103943, 'learning_rate': 5.078536586559104e-06, 'epoch': 0.5}


 50%|████▉     | 402/805 [59:31<59:32,  8.87s/it]

{'loss': 0.7659, 'grad_norm': 0.392869234085083, 'learning_rate': 5.058903499683746e-06, 'epoch': 0.5}


 50%|█████     | 403/805 [59:40<59:25,  8.87s/it]

{'loss': 0.8359, 'grad_norm': 0.3973082900047302, 'learning_rate': 5.039269504443557e-06, 'epoch': 0.5}


 50%|█████     | 404/805 [59:49<59:17,  8.87s/it]

{'loss': 0.8009, 'grad_norm': 0.40496641397476196, 'learning_rate': 5.019634903619031e-06, 'epoch': 0.5}


 50%|█████     | 405/805 [59:58<59:08,  8.87s/it]

{'loss': 0.7324, 'grad_norm': 0.4320965111255646, 'learning_rate': 5e-06, 'epoch': 0.5}


 50%|█████     | 406/805 [1:00:07<58:58,  8.87s/it]

{'loss': 0.7052, 'grad_norm': 0.4236876666545868, 'learning_rate': 4.980365096380971e-06, 'epoch': 0.5}


 51%|█████     | 407/805 [1:00:16<58:51,  8.87s/it]

{'loss': 0.7378, 'grad_norm': 0.4480540454387665, 'learning_rate': 4.9607304955564456e-06, 'epoch': 0.51}


 51%|█████     | 408/805 [1:00:25<58:41,  8.87s/it]

{'loss': 0.6729, 'grad_norm': 0.4485335052013397, 'learning_rate': 4.941096500316254e-06, 'epoch': 0.51}


 51%|█████     | 409/805 [1:00:33<58:32,  8.87s/it]

{'loss': 0.7724, 'grad_norm': 0.4273175597190857, 'learning_rate': 4.921463413440898e-06, 'epoch': 0.51}


 51%|█████     | 410/805 [1:00:42<58:27,  8.88s/it]

{'loss': 0.8204, 'grad_norm': 0.42978474497795105, 'learning_rate': 4.90183153769686e-06, 'epoch': 0.51}


 51%|█████     | 411/805 [1:00:51<58:18,  8.88s/it]

{'loss': 0.6823, 'grad_norm': 0.40679091215133667, 'learning_rate': 4.88220117583195e-06, 'epoch': 0.51}


 51%|█████     | 412/805 [1:01:00<58:07,  8.87s/it]

{'loss': 0.8884, 'grad_norm': 0.40687423944473267, 'learning_rate': 4.862572630570633e-06, 'epoch': 0.51}


 51%|█████▏    | 413/805 [1:01:09<57:56,  8.87s/it]

{'loss': 0.7268, 'grad_norm': 0.4707334041595459, 'learning_rate': 4.842946204609359e-06, 'epoch': 0.51}


 51%|█████▏    | 414/805 [1:01:18<57:49,  8.87s/it]

{'loss': 0.7077, 'grad_norm': 0.4419521987438202, 'learning_rate': 4.823322200611895e-06, 'epoch': 0.51}


 52%|█████▏    | 415/805 [1:01:27<57:39,  8.87s/it]

{'loss': 0.7644, 'grad_norm': 0.4220959544181824, 'learning_rate': 4.803700921204659e-06, 'epoch': 0.52}


 52%|█████▏    | 416/805 [1:01:36<57:30,  8.87s/it]

{'loss': 0.7483, 'grad_norm': 0.4164402186870575, 'learning_rate': 4.784082668972048e-06, 'epoch': 0.52}


 52%|█████▏    | 417/805 [1:01:44<57:15,  8.85s/it]

{'loss': 0.8517, 'grad_norm': 0.4079452455043793, 'learning_rate': 4.7644677464517874e-06, 'epoch': 0.52}


 52%|█████▏    | 418/805 [1:01:53<56:59,  8.84s/it]

{'loss': 0.7381, 'grad_norm': 0.4194965362548828, 'learning_rate': 4.744856456130243e-06, 'epoch': 0.52}


 52%|█████▏    | 419/805 [1:02:02<56:43,  8.82s/it]

{'loss': 0.7869, 'grad_norm': 0.47892603278160095, 'learning_rate': 4.725249100437773e-06, 'epoch': 0.52}


 52%|█████▏    | 420/805 [1:02:11<56:36,  8.82s/it]

{'loss': 0.7092, 'grad_norm': 0.42197394371032715, 'learning_rate': 4.705645981744055e-06, 'epoch': 0.52}


 52%|█████▏    | 421/805 [1:02:20<56:34,  8.84s/it]

{'loss': 0.6846, 'grad_norm': 0.41955864429473877, 'learning_rate': 4.686047402353433e-06, 'epoch': 0.52}


 52%|█████▏    | 422/805 [1:02:28<56:21,  8.83s/it]

{'loss': 0.6906, 'grad_norm': 0.40873846411705017, 'learning_rate': 4.6664536645002456e-06, 'epoch': 0.52}


 53%|█████▎    | 423/805 [1:02:37<56:08,  8.82s/it]

{'loss': 0.7973, 'grad_norm': 0.43634670972824097, 'learning_rate': 4.646865070344168e-06, 'epoch': 0.53}


 53%|█████▎    | 424/805 [1:02:46<55:56,  8.81s/it]

{'loss': 0.7452, 'grad_norm': 0.40433257818222046, 'learning_rate': 4.627281921965552e-06, 'epoch': 0.53}


 53%|█████▎    | 425/805 [1:02:55<55:53,  8.83s/it]

{'loss': 0.7257, 'grad_norm': 0.3924335837364197, 'learning_rate': 4.6077045213607765e-06, 'epoch': 0.53}


 53%|█████▎    | 426/805 [1:03:04<55:57,  8.86s/it]

{'loss': 0.6993, 'grad_norm': 0.41804805397987366, 'learning_rate': 4.588133170437572e-06, 'epoch': 0.53}


 53%|█████▎    | 427/805 [1:03:13<55:49,  8.86s/it]

{'loss': 0.8956, 'grad_norm': 0.43115001916885376, 'learning_rate': 4.568568171010384e-06, 'epoch': 0.53}


 53%|█████▎    | 428/805 [1:03:22<55:40,  8.86s/it]

{'loss': 0.6832, 'grad_norm': 0.3990527391433716, 'learning_rate': 4.5490098247957045e-06, 'epoch': 0.53}


 53%|█████▎    | 429/805 [1:03:30<55:35,  8.87s/it]

{'loss': 0.6758, 'grad_norm': 0.4145289659500122, 'learning_rate': 4.529458433407429e-06, 'epoch': 0.53}


 53%|█████▎    | 430/805 [1:03:39<55:28,  8.88s/it]

{'loss': 0.7865, 'grad_norm': 0.42287659645080566, 'learning_rate': 4.509914298352197e-06, 'epoch': 0.53}


 54%|█████▎    | 431/805 [1:03:48<55:22,  8.88s/it]

{'loss': 0.8117, 'grad_norm': 0.41035178303718567, 'learning_rate': 4.490377721024751e-06, 'epoch': 0.54}


 54%|█████▎    | 432/805 [1:03:57<55:12,  8.88s/it]

{'loss': 0.7334, 'grad_norm': 0.40807870030403137, 'learning_rate': 4.470849002703279e-06, 'epoch': 0.54}


 54%|█████▍    | 433/805 [1:04:06<55:04,  8.88s/it]

{'loss': 0.7619, 'grad_norm': 0.3934932053089142, 'learning_rate': 4.451328444544774e-06, 'epoch': 0.54}


 54%|█████▍    | 434/805 [1:04:15<54:54,  8.88s/it]

{'loss': 0.7328, 'grad_norm': 0.3992023468017578, 'learning_rate': 4.431816347580395e-06, 'epoch': 0.54}


 54%|█████▍    | 435/805 [1:04:24<54:43,  8.87s/it]

{'loss': 0.7558, 'grad_norm': 0.3996388614177704, 'learning_rate': 4.4123130127108125e-06, 'epoch': 0.54}


 54%|█████▍    | 436/805 [1:04:33<54:33,  8.87s/it]

{'loss': 0.8788, 'grad_norm': 0.44839149713516235, 'learning_rate': 4.392818740701579e-06, 'epoch': 0.54}


 54%|█████▍    | 437/805 [1:04:41<54:24,  8.87s/it]

{'loss': 0.8077, 'grad_norm': 0.4291555881500244, 'learning_rate': 4.373333832178478e-06, 'epoch': 0.54}


 54%|█████▍    | 438/805 [1:04:50<54:14,  8.87s/it]

{'loss': 0.7187, 'grad_norm': 0.4286884069442749, 'learning_rate': 4.353858587622913e-06, 'epoch': 0.54}


 55%|█████▍    | 439/805 [1:04:59<53:59,  8.85s/it]

{'loss': 0.7709, 'grad_norm': 0.4056265652179718, 'learning_rate': 4.3343933073672395e-06, 'epoch': 0.55}


 55%|█████▍    | 440/805 [1:05:08<53:45,  8.84s/it]

{'loss': 0.7888, 'grad_norm': 0.46734946966171265, 'learning_rate': 4.314938291590161e-06, 'epoch': 0.55}


 55%|█████▍    | 441/805 [1:05:17<53:31,  8.82s/it]

{'loss': 0.8481, 'grad_norm': 0.42231470346450806, 'learning_rate': 4.295493840312087e-06, 'epoch': 0.55}


 55%|█████▍    | 442/805 [1:05:26<53:18,  8.81s/it]

{'loss': 0.7087, 'grad_norm': 0.40267544984817505, 'learning_rate': 4.276060253390511e-06, 'epoch': 0.55}


 55%|█████▌    | 443/805 [1:05:34<53:08,  8.81s/it]

{'loss': 0.7001, 'grad_norm': 0.4191335439682007, 'learning_rate': 4.256637830515385e-06, 'epoch': 0.55}


 55%|█████▌    | 444/805 [1:05:43<53:01,  8.81s/it]

{'loss': 0.7863, 'grad_norm': 0.4131971597671509, 'learning_rate': 4.237226871204496e-06, 'epoch': 0.55}


 55%|█████▌    | 445/805 [1:05:52<53:02,  8.84s/it]

{'loss': 0.7319, 'grad_norm': 0.4195443093776703, 'learning_rate': 4.217827674798845e-06, 'epoch': 0.55}


 55%|█████▌    | 446/805 [1:06:01<52:59,  8.86s/it]

{'loss': 0.8413, 'grad_norm': 0.4200229346752167, 'learning_rate': 4.198440540458045e-06, 'epoch': 0.55}


 56%|█████▌    | 447/805 [1:06:10<52:52,  8.86s/it]

{'loss': 0.7629, 'grad_norm': 0.40192532539367676, 'learning_rate': 4.179065767155686e-06, 'epoch': 0.55}


 56%|█████▌    | 448/805 [1:06:19<52:45,  8.87s/it]

{'loss': 0.7093, 'grad_norm': 0.41965949535369873, 'learning_rate': 4.159703653674741e-06, 'epoch': 0.56}


 56%|█████▌    | 449/805 [1:06:28<52:36,  8.87s/it]

{'loss': 0.7909, 'grad_norm': 0.465167760848999, 'learning_rate': 4.140354498602952e-06, 'epoch': 0.56}


 56%|█████▌    | 450/805 [1:06:36<52:27,  8.86s/it]

{'loss': 0.8399, 'grad_norm': 0.40844202041625977, 'learning_rate': 4.1210186003282275e-06, 'epoch': 0.56}


 56%|█████▌    | 451/805 [1:06:45<52:22,  8.88s/it]

{'loss': 0.7931, 'grad_norm': 0.43088167905807495, 'learning_rate': 4.1016962570340375e-06, 'epoch': 0.56}


 56%|█████▌    | 452/805 [1:06:54<52:12,  8.88s/it]

{'loss': 0.7452, 'grad_norm': 0.43620219826698303, 'learning_rate': 4.082387766694819e-06, 'epoch': 0.56}


 56%|█████▋    | 453/805 [1:07:03<52:02,  8.87s/it]

{'loss': 0.7781, 'grad_norm': 0.4171880781650543, 'learning_rate': 4.063093427071376e-06, 'epoch': 0.56}


 56%|█████▋    | 454/805 [1:07:12<51:54,  8.87s/it]

{'loss': 0.7352, 'grad_norm': 0.42266958951950073, 'learning_rate': 4.043813535706299e-06, 'epoch': 0.56}


 57%|█████▋    | 455/805 [1:07:21<51:45,  8.87s/it]

{'loss': 0.6467, 'grad_norm': 0.4427433907985687, 'learning_rate': 4.02454838991936e-06, 'epoch': 0.56}


 57%|█████▋    | 456/805 [1:07:30<51:36,  8.87s/it]

{'loss': 0.8412, 'grad_norm': 0.4545566439628601, 'learning_rate': 4.005298286802938e-06, 'epoch': 0.57}


 57%|█████▋    | 457/805 [1:07:39<51:26,  8.87s/it]

{'loss': 0.7554, 'grad_norm': 0.4049872159957886, 'learning_rate': 3.986063523217439e-06, 'epoch': 0.57}


 57%|█████▋    | 458/805 [1:07:47<51:17,  8.87s/it]

{'loss': 0.7074, 'grad_norm': 0.45636045932769775, 'learning_rate': 3.966844395786709e-06, 'epoch': 0.57}


 57%|█████▋    | 459/805 [1:07:56<51:10,  8.87s/it]

{'loss': 0.8691, 'grad_norm': 0.43126991391181946, 'learning_rate': 3.947641200893473e-06, 'epoch': 0.57}


 57%|█████▋    | 460/805 [1:08:05<51:00,  8.87s/it]

{'loss': 0.756, 'grad_norm': 0.42718705534935, 'learning_rate': 3.928454234674748e-06, 'epoch': 0.57}


 57%|█████▋    | 461/805 [1:08:14<50:51,  8.87s/it]

{'loss': 0.8094, 'grad_norm': 0.4487435221672058, 'learning_rate': 3.909283793017289e-06, 'epoch': 0.57}


 57%|█████▋    | 462/805 [1:08:23<50:42,  8.87s/it]

{'loss': 0.763, 'grad_norm': 0.44077974557876587, 'learning_rate': 3.890130171553021e-06, 'epoch': 0.57}


 58%|█████▊    | 463/805 [1:08:32<50:34,  8.87s/it]

{'loss': 0.8438, 'grad_norm': 0.4459436237812042, 'learning_rate': 3.870993665654482e-06, 'epoch': 0.57}


 58%|█████▊    | 464/805 [1:08:41<50:18,  8.85s/it]

{'loss': 0.7363, 'grad_norm': 0.4250069558620453, 'learning_rate': 3.851874570430266e-06, 'epoch': 0.58}


 58%|█████▊    | 465/805 [1:08:49<50:04,  8.84s/it]

{'loss': 0.7697, 'grad_norm': 0.4100511372089386, 'learning_rate': 3.832773180720475e-06, 'epoch': 0.58}


 58%|█████▊    | 466/805 [1:08:58<49:51,  8.82s/it]

{'loss': 0.8118, 'grad_norm': 0.42023175954818726, 'learning_rate': 3.813689791092161e-06, 'epoch': 0.58}


 58%|█████▊    | 467/805 [1:09:07<49:40,  8.82s/it]

{'loss': 0.7506, 'grad_norm': 0.4064520299434662, 'learning_rate': 3.7946246958348077e-06, 'epoch': 0.58}


 58%|█████▊    | 468/805 [1:09:16<49:29,  8.81s/it]

{'loss': 0.7586, 'grad_norm': 0.40639424324035645, 'learning_rate': 3.775578188955763e-06, 'epoch': 0.58}


 58%|█████▊    | 469/805 [1:09:25<49:21,  8.82s/it]

{'loss': 0.8262, 'grad_norm': 0.41898733377456665, 'learning_rate': 3.756550564175727e-06, 'epoch': 0.58}


 58%|█████▊    | 470/805 [1:09:33<49:16,  8.83s/it]

{'loss': 0.7805, 'grad_norm': 0.3944050669670105, 'learning_rate': 3.7375421149242102e-06, 'epoch': 0.58}


 59%|█████▊    | 471/805 [1:09:42<49:05,  8.82s/it]

{'loss': 0.7399, 'grad_norm': 0.4080868661403656, 'learning_rate': 3.7185531343350167e-06, 'epoch': 0.58}


 59%|█████▊    | 472/805 [1:09:51<49:05,  8.85s/it]

{'loss': 0.8111, 'grad_norm': 0.46473583579063416, 'learning_rate': 3.6995839152417173e-06, 'epoch': 0.59}


 59%|█████▉    | 473/805 [1:10:00<48:58,  8.85s/it]

{'loss': 0.7832, 'grad_norm': 0.4223789870738983, 'learning_rate': 3.680634750173137e-06, 'epoch': 0.59}


 59%|█████▉    | 474/805 [1:10:09<48:52,  8.86s/it]

{'loss': 0.7482, 'grad_norm': 0.43215951323509216, 'learning_rate': 3.661705931348838e-06, 'epoch': 0.59}


 59%|█████▉    | 475/805 [1:10:18<48:44,  8.86s/it]

{'loss': 0.7168, 'grad_norm': 0.43306320905685425, 'learning_rate': 3.6427977506746293e-06, 'epoch': 0.59}


 59%|█████▉    | 476/805 [1:10:27<48:36,  8.86s/it]

{'loss': 0.7859, 'grad_norm': 0.43108510971069336, 'learning_rate': 3.623910499738043e-06, 'epoch': 0.59}


 59%|█████▉    | 477/805 [1:10:36<48:30,  8.87s/it]

{'loss': 0.7444, 'grad_norm': 0.44825154542922974, 'learning_rate': 3.6050444698038547e-06, 'epoch': 0.59}


 59%|█████▉    | 478/805 [1:10:44<48:22,  8.88s/it]

{'loss': 0.7893, 'grad_norm': 0.4182254374027252, 'learning_rate': 3.5861999518095827e-06, 'epoch': 0.59}


 60%|█████▉    | 479/805 [1:10:53<48:13,  8.88s/it]

{'loss': 0.6981, 'grad_norm': 0.4050186574459076, 'learning_rate': 3.5673772363610083e-06, 'epoch': 0.59}


 60%|█████▉    | 480/805 [1:11:02<48:04,  8.88s/it]

{'loss': 0.8052, 'grad_norm': 0.41670307517051697, 'learning_rate': 3.5485766137276894e-06, 'epoch': 0.6}


 60%|█████▉    | 481/805 [1:11:11<47:59,  8.89s/it]

{'loss': 0.7328, 'grad_norm': 0.4074866473674774, 'learning_rate': 3.5297983738384813e-06, 'epoch': 0.6}


 60%|█████▉    | 482/805 [1:11:20<47:48,  8.88s/it]

{'loss': 0.7533, 'grad_norm': 0.4523136019706726, 'learning_rate': 3.511042806277075e-06, 'epoch': 0.6}


 60%|██████    | 483/805 [1:11:29<47:40,  8.88s/it]

{'loss': 0.8219, 'grad_norm': 0.4198707938194275, 'learning_rate': 3.492310200277522e-06, 'epoch': 0.6}


 60%|██████    | 484/805 [1:11:38<47:31,  8.88s/it]

{'loss': 0.8303, 'grad_norm': 0.406770259141922, 'learning_rate': 3.473600844719783e-06, 'epoch': 0.6}


 60%|██████    | 485/805 [1:11:47<47:22,  8.88s/it]

{'loss': 0.7858, 'grad_norm': 0.4493434429168701, 'learning_rate': 3.4549150281252635e-06, 'epoch': 0.6}


 60%|██████    | 486/805 [1:11:55<47:07,  8.86s/it]

{'loss': 0.7153, 'grad_norm': 0.4402965009212494, 'learning_rate': 3.436253038652373e-06, 'epoch': 0.6}


 60%|██████    | 487/805 [1:12:04<46:53,  8.85s/it]

{'loss': 0.8781, 'grad_norm': 0.4314064085483551, 'learning_rate': 3.4176151640920696e-06, 'epoch': 0.6}


 61%|██████    | 488/805 [1:12:13<46:42,  8.84s/it]

{'loss': 0.8014, 'grad_norm': 0.4465301036834717, 'learning_rate': 3.3990016918634415e-06, 'epoch': 0.61}


 61%|██████    | 489/805 [1:12:22<46:36,  8.85s/it]

{'loss': 0.7809, 'grad_norm': 0.4339883327484131, 'learning_rate': 3.3804129090092542e-06, 'epoch': 0.61}


 61%|██████    | 490/805 [1:12:31<46:25,  8.84s/it]

{'loss': 0.8056, 'grad_norm': 0.4225054383277893, 'learning_rate': 3.3618491021915334e-06, 'epoch': 0.61}


 61%|██████    | 491/805 [1:12:40<46:12,  8.83s/it]

{'loss': 0.7623, 'grad_norm': 0.402459979057312, 'learning_rate': 3.3433105576871448e-06, 'epoch': 0.61}


 61%|██████    | 492/805 [1:12:48<46:06,  8.84s/it]

{'loss': 0.6817, 'grad_norm': 0.4223959445953369, 'learning_rate': 3.3247975613833805e-06, 'epoch': 0.61}


 61%|██████    | 493/805 [1:12:57<46:00,  8.85s/it]

{'loss': 0.7402, 'grad_norm': 0.448984295129776, 'learning_rate': 3.3063103987735433e-06, 'epoch': 0.61}


 61%|██████▏   | 494/805 [1:13:06<45:55,  8.86s/it]

{'loss': 0.8091, 'grad_norm': 0.4401188790798187, 'learning_rate': 3.287849354952552e-06, 'epoch': 0.61}


 61%|██████▏   | 495/805 [1:13:15<45:48,  8.86s/it]

{'loss': 0.6383, 'grad_norm': 0.42436984181404114, 'learning_rate': 3.269414714612534e-06, 'epoch': 0.61}


 62%|██████▏   | 496/805 [1:13:24<45:44,  8.88s/it]

{'loss': 0.7821, 'grad_norm': 0.41388168931007385, 'learning_rate': 3.2510067620384566e-06, 'epoch': 0.62}


 62%|██████▏   | 497/805 [1:13:33<45:39,  8.89s/it]

{'loss': 0.7198, 'grad_norm': 0.4438638389110565, 'learning_rate': 3.2326257811037154e-06, 'epoch': 0.62}


 62%|██████▏   | 498/805 [1:13:42<45:33,  8.90s/it]

{'loss': 0.802, 'grad_norm': 0.40576112270355225, 'learning_rate': 3.2142720552657746e-06, 'epoch': 0.62}


 62%|██████▏   | 499/805 [1:13:51<45:22,  8.90s/it]

{'loss': 0.7835, 'grad_norm': 0.4623105823993683, 'learning_rate': 3.195945867561791e-06, 'epoch': 0.62}


 62%|██████▏   | 500/805 [1:14:00<45:09,  8.88s/it]

{'loss': 0.8474, 'grad_norm': 0.42744675278663635, 'learning_rate': 3.177647500604252e-06, 'epoch': 0.62}


 62%|██████▏   | 501/805 [1:14:10<47:02,  9.28s/it]

{'loss': 0.7748, 'grad_norm': 0.4214000403881073, 'learning_rate': 3.1593772365766107e-06, 'epoch': 0.62}


 62%|██████▏   | 502/805 [1:14:19<46:18,  9.17s/it]

{'loss': 0.7065, 'grad_norm': 0.4075464606285095, 'learning_rate': 3.1411353572289404e-06, 'epoch': 0.62}


 62%|██████▏   | 503/805 [1:14:28<45:42,  9.08s/it]

{'loss': 0.7891, 'grad_norm': 0.45288974046707153, 'learning_rate': 3.122922143873584e-06, 'epoch': 0.62}


 63%|██████▎   | 504/805 [1:14:36<45:16,  9.03s/it]

{'loss': 0.728, 'grad_norm': 0.45107224583625793, 'learning_rate': 3.104737877380828e-06, 'epoch': 0.63}


 63%|██████▎   | 505/805 [1:14:45<44:54,  8.98s/it]

{'loss': 0.7414, 'grad_norm': 0.4278576374053955, 'learning_rate': 3.0865828381745515e-06, 'epoch': 0.63}


 63%|██████▎   | 506/805 [1:14:54<44:36,  8.95s/it]

{'loss': 0.8686, 'grad_norm': 0.49734607338905334, 'learning_rate': 3.068457306227921e-06, 'epoch': 0.63}


 63%|██████▎   | 507/805 [1:15:03<44:21,  8.93s/it]

{'loss': 0.7576, 'grad_norm': 0.4258451759815216, 'learning_rate': 3.0503615610590605e-06, 'epoch': 0.63}


 63%|██████▎   | 508/805 [1:15:12<44:08,  8.92s/it]

{'loss': 0.7979, 'grad_norm': 0.4310706555843353, 'learning_rate': 3.0322958817267428e-06, 'epoch': 0.63}


 63%|██████▎   | 509/805 [1:15:21<43:56,  8.91s/it]

{'loss': 0.7952, 'grad_norm': 0.42373666167259216, 'learning_rate': 3.0142605468260976e-06, 'epoch': 0.63}


 63%|██████▎   | 510/805 [1:15:30<43:48,  8.91s/it]

{'loss': 0.8147, 'grad_norm': 0.40894538164138794, 'learning_rate': 2.9962558344842963e-06, 'epoch': 0.63}


 63%|██████▎   | 511/805 [1:15:39<43:30,  8.88s/it]

{'loss': 0.7794, 'grad_norm': 0.4160991609096527, 'learning_rate': 2.9782820223562758e-06, 'epoch': 0.63}


 64%|██████▎   | 512/805 [1:15:47<43:14,  8.86s/it]

{'loss': 0.7329, 'grad_norm': 0.409034788608551, 'learning_rate': 2.9603393876204543e-06, 'epoch': 0.64}


 64%|██████▎   | 513/805 [1:15:56<43:03,  8.85s/it]

{'loss': 0.6597, 'grad_norm': 0.4066872298717499, 'learning_rate': 2.9424282069744564e-06, 'epoch': 0.64}


 64%|██████▍   | 514/805 [1:16:05<42:53,  8.84s/it]

{'loss': 0.7195, 'grad_norm': 0.4086381196975708, 'learning_rate': 2.9245487566308447e-06, 'epoch': 0.64}


 64%|██████▍   | 515/805 [1:16:14<42:41,  8.83s/it]

{'loss': 0.8099, 'grad_norm': 0.4087319076061249, 'learning_rate': 2.906701312312861e-06, 'epoch': 0.64}


 64%|██████▍   | 516/805 [1:16:23<42:30,  8.82s/it]

{'loss': 0.7063, 'grad_norm': 0.4387616217136383, 'learning_rate': 2.8888861492501733e-06, 'epoch': 0.64}


 64%|██████▍   | 517/805 [1:16:31<42:19,  8.82s/it]

{'loss': 0.753, 'grad_norm': 0.45293232798576355, 'learning_rate': 2.871103542174637e-06, 'epoch': 0.64}


 64%|██████▍   | 518/805 [1:16:40<42:14,  8.83s/it]

{'loss': 0.8188, 'grad_norm': 0.4255429208278656, 'learning_rate': 2.8533537653160512e-06, 'epoch': 0.64}


 64%|██████▍   | 519/805 [1:16:49<42:10,  8.85s/it]

{'loss': 0.6947, 'grad_norm': 0.4235881567001343, 'learning_rate': 2.8356370923979326e-06, 'epoch': 0.64}


 65%|██████▍   | 520/805 [1:16:58<42:04,  8.86s/it]

{'loss': 0.7479, 'grad_norm': 0.42097896337509155, 'learning_rate': 2.817953796633289e-06, 'epoch': 0.65}


 65%|██████▍   | 521/805 [1:17:07<41:59,  8.87s/it]

{'loss': 0.7818, 'grad_norm': 0.4331207573413849, 'learning_rate': 2.800304150720424e-06, 'epoch': 0.65}


 65%|██████▍   | 522/805 [1:17:16<41:51,  8.87s/it]

{'loss': 0.7165, 'grad_norm': 0.41976186633110046, 'learning_rate': 2.782688426838709e-06, 'epoch': 0.65}


 65%|██████▍   | 523/805 [1:17:25<41:42,  8.87s/it]

{'loss': 0.7346, 'grad_norm': 0.5216852426528931, 'learning_rate': 2.765106896644395e-06, 'epoch': 0.65}


 65%|██████▌   | 524/805 [1:17:34<41:34,  8.88s/it]

{'loss': 0.6855, 'grad_norm': 0.4246399998664856, 'learning_rate': 2.7475598312664285e-06, 'epoch': 0.65}


 65%|██████▌   | 525/805 [1:17:43<41:26,  8.88s/it]

{'loss': 0.7864, 'grad_norm': 0.3984403908252716, 'learning_rate': 2.7300475013022666e-06, 'epoch': 0.65}


 65%|██████▌   | 526/805 [1:17:51<41:16,  8.88s/it]

{'loss': 0.7642, 'grad_norm': 0.44793009757995605, 'learning_rate': 2.7125701768136974e-06, 'epoch': 0.65}


 65%|██████▌   | 527/805 [1:18:00<41:07,  8.88s/it]

{'loss': 0.7247, 'grad_norm': 0.413735032081604, 'learning_rate': 2.6951281273226894e-06, 'epoch': 0.65}


 66%|██████▌   | 528/805 [1:18:09<40:59,  8.88s/it]

{'loss': 0.7974, 'grad_norm': 0.42240986227989197, 'learning_rate': 2.677721621807217e-06, 'epoch': 0.66}


 66%|██████▌   | 529/805 [1:18:18<40:49,  8.88s/it]

{'loss': 0.7866, 'grad_norm': 0.4058069586753845, 'learning_rate': 2.6603509286971342e-06, 'epoch': 0.66}


 66%|██████▌   | 530/805 [1:18:27<40:40,  8.87s/it]

{'loss': 0.6866, 'grad_norm': 0.4160695970058441, 'learning_rate': 2.6430163158700116e-06, 'epoch': 0.66}


 66%|██████▌   | 531/805 [1:18:36<40:31,  8.87s/it]

{'loss': 0.7686, 'grad_norm': 0.4247594177722931, 'learning_rate': 2.6257180506470283e-06, 'epoch': 0.66}


 66%|██████▌   | 532/805 [1:18:45<40:20,  8.87s/it]

{'loss': 0.7682, 'grad_norm': 0.41881224513053894, 'learning_rate': 2.60845639978883e-06, 'epoch': 0.66}


 66%|██████▌   | 533/805 [1:18:53<40:08,  8.85s/it]

{'loss': 0.8028, 'grad_norm': 0.45684030652046204, 'learning_rate': 2.5912316294914232e-06, 'epoch': 0.66}


 66%|██████▋   | 534/805 [1:19:02<39:54,  8.84s/it]

{'loss': 0.7842, 'grad_norm': 0.46540501713752747, 'learning_rate': 2.5740440053820814e-06, 'epoch': 0.66}


 66%|██████▋   | 535/805 [1:19:11<39:42,  8.83s/it]

{'loss': 0.7116, 'grad_norm': 0.4313104450702667, 'learning_rate': 2.5568937925152272e-06, 'epoch': 0.66}


 67%|██████▋   | 536/805 [1:19:20<39:32,  8.82s/it]

{'loss': 0.5919, 'grad_norm': 0.460612416267395, 'learning_rate': 2.5397812553683552e-06, 'epoch': 0.67}


 67%|██████▋   | 537/805 [1:19:29<39:23,  8.82s/it]

{'loss': 0.6931, 'grad_norm': 0.4021870493888855, 'learning_rate': 2.5227066578379624e-06, 'epoch': 0.67}


 67%|██████▋   | 538/805 [1:19:37<39:13,  8.81s/it]

{'loss': 0.7318, 'grad_norm': 0.4564533531665802, 'learning_rate': 2.505670263235464e-06, 'epoch': 0.67}


 67%|██████▋   | 539/805 [1:19:46<39:09,  8.83s/it]

{'loss': 0.7462, 'grad_norm': 0.4151327311992645, 'learning_rate': 2.4886723342831375e-06, 'epoch': 0.67}


 67%|██████▋   | 540/805 [1:19:55<39:03,  8.84s/it]

{'loss': 0.7755, 'grad_norm': 0.4013296961784363, 'learning_rate': 2.471713133110078e-06, 'epoch': 0.67}


 67%|██████▋   | 541/805 [1:20:04<38:56,  8.85s/it]

{'loss': 0.7329, 'grad_norm': 0.4588097035884857, 'learning_rate': 2.4547929212481436e-06, 'epoch': 0.67}


 67%|██████▋   | 542/805 [1:20:13<38:49,  8.86s/it]

{'loss': 0.7071, 'grad_norm': 0.4557774066925049, 'learning_rate': 2.4379119596279367e-06, 'epoch': 0.67}


 67%|██████▋   | 543/805 [1:20:22<38:41,  8.86s/it]

{'loss': 0.7908, 'grad_norm': 0.4345778822898865, 'learning_rate': 2.4210705085747633e-06, 'epoch': 0.67}


 68%|██████▊   | 544/805 [1:20:31<38:34,  8.87s/it]

{'loss': 0.6825, 'grad_norm': 0.4116283059120178, 'learning_rate': 2.4042688278046374e-06, 'epoch': 0.68}


 68%|██████▊   | 545/805 [1:20:40<38:25,  8.87s/it]

{'loss': 0.6068, 'grad_norm': 0.4199250638484955, 'learning_rate': 2.387507176420256e-06, 'epoch': 0.68}


 68%|██████▊   | 546/805 [1:20:48<38:15,  8.86s/it]

{'loss': 0.7347, 'grad_norm': 0.4234878420829773, 'learning_rate': 2.370785812907022e-06, 'epoch': 0.68}


 68%|██████▊   | 547/805 [1:20:57<38:06,  8.86s/it]

{'loss': 0.855, 'grad_norm': 0.4371836185455322, 'learning_rate': 2.354104995129048e-06, 'epoch': 0.68}


 68%|██████▊   | 548/805 [1:21:06<37:59,  8.87s/it]

{'loss': 0.734, 'grad_norm': 0.43581581115722656, 'learning_rate': 2.337464980325176e-06, 'epoch': 0.68}


 68%|██████▊   | 549/805 [1:21:15<37:51,  8.87s/it]

{'loss': 0.8309, 'grad_norm': 0.42665162682533264, 'learning_rate': 2.320866025105016e-06, 'epoch': 0.68}


 68%|██████▊   | 550/805 [1:21:24<37:43,  8.88s/it]

{'loss': 0.7565, 'grad_norm': 0.4704425632953644, 'learning_rate': 2.304308385444999e-06, 'epoch': 0.68}


 68%|██████▊   | 551/805 [1:21:33<37:34,  8.87s/it]

{'loss': 0.7346, 'grad_norm': 0.4177403450012207, 'learning_rate': 2.2877923166844073e-06, 'epoch': 0.68}


 69%|██████▊   | 552/805 [1:21:42<37:26,  8.88s/it]

{'loss': 0.7001, 'grad_norm': 0.445720374584198, 'learning_rate': 2.271318073521451e-06, 'epoch': 0.69}


 69%|██████▊   | 553/805 [1:21:51<37:18,  8.88s/it]

{'loss': 0.7851, 'grad_norm': 0.4700608253479004, 'learning_rate': 2.254885910009341e-06, 'epoch': 0.69}


 69%|██████▉   | 554/805 [1:21:59<37:11,  8.89s/it]

{'loss': 0.7024, 'grad_norm': 0.42985543608665466, 'learning_rate': 2.2384960795523677e-06, 'epoch': 0.69}


 69%|██████▉   | 555/805 [1:22:08<37:01,  8.88s/it]

{'loss': 0.7729, 'grad_norm': 0.4155943691730499, 'learning_rate': 2.2221488349019903e-06, 'epoch': 0.69}


 69%|██████▉   | 556/805 [1:22:17<36:51,  8.88s/it]

{'loss': 0.7982, 'grad_norm': 0.4490000009536743, 'learning_rate': 2.2058444281529423e-06, 'epoch': 0.69}


 69%|██████▉   | 557/805 [1:22:26<36:42,  8.88s/it]

{'loss': 0.8004, 'grad_norm': 0.45153501629829407, 'learning_rate': 2.1895831107393485e-06, 'epoch': 0.69}


 69%|██████▉   | 558/805 [1:22:35<36:29,  8.86s/it]

{'loss': 0.7227, 'grad_norm': 0.43050238490104675, 'learning_rate': 2.1733651334308364e-06, 'epoch': 0.69}


 69%|██████▉   | 559/805 [1:22:44<36:16,  8.85s/it]

{'loss': 0.7123, 'grad_norm': 0.4274905025959015, 'learning_rate': 2.15719074632868e-06, 'epoch': 0.69}


 70%|██████▉   | 560/805 [1:22:53<36:03,  8.83s/it]

{'loss': 0.7369, 'grad_norm': 0.42970776557922363, 'learning_rate': 2.1410601988619394e-06, 'epoch': 0.7}


 70%|██████▉   | 561/805 [1:23:01<35:52,  8.82s/it]

{'loss': 0.8025, 'grad_norm': 0.4236815273761749, 'learning_rate': 2.124973739783609e-06, 'epoch': 0.7}


 70%|██████▉   | 562/805 [1:23:10<35:43,  8.82s/it]

{'loss': 0.7307, 'grad_norm': 0.46970656514167786, 'learning_rate': 2.108931617166784e-06, 'epoch': 0.7}


 70%|██████▉   | 563/805 [1:23:19<35:38,  8.84s/it]

{'loss': 0.7289, 'grad_norm': 0.42902177572250366, 'learning_rate': 2.0929340784008474e-06, 'epoch': 0.7}


 70%|███████   | 564/805 [1:23:28<35:30,  8.84s/it]

{'loss': 0.829, 'grad_norm': 0.44803377985954285, 'learning_rate': 2.0769813701876336e-06, 'epoch': 0.7}


 70%|███████   | 565/805 [1:23:37<35:20,  8.84s/it]

{'loss': 0.7406, 'grad_norm': 0.43613919615745544, 'learning_rate': 2.061073738537635e-06, 'epoch': 0.7}


 70%|███████   | 566/805 [1:23:46<35:13,  8.84s/it]

{'loss': 0.7034, 'grad_norm': 0.4582032859325409, 'learning_rate': 2.0452114287662127e-06, 'epoch': 0.7}


 70%|███████   | 567/805 [1:23:54<35:09,  8.86s/it]

{'loss': 0.698, 'grad_norm': 0.44173747301101685, 'learning_rate': 2.029394685489808e-06, 'epoch': 0.7}


 71%|███████   | 568/805 [1:24:03<35:02,  8.87s/it]

{'loss': 0.7093, 'grad_norm': 0.4454016089439392, 'learning_rate': 2.0136237526221646e-06, 'epoch': 0.71}


 71%|███████   | 569/805 [1:24:12<34:54,  8.87s/it]

{'loss': 0.7635, 'grad_norm': 0.4278104901313782, 'learning_rate': 1.9978988733705807e-06, 'epoch': 0.71}


 71%|███████   | 570/805 [1:24:21<34:45,  8.88s/it]

{'loss': 0.7425, 'grad_norm': 0.4623858630657196, 'learning_rate': 1.982220290232143e-06, 'epoch': 0.71}


 71%|███████   | 571/805 [1:24:30<34:36,  8.87s/it]

{'loss': 0.6918, 'grad_norm': 0.4712381958961487, 'learning_rate': 1.9665882449900024e-06, 'epoch': 0.71}


 71%|███████   | 572/805 [1:24:39<34:28,  8.88s/it]

{'loss': 0.8072, 'grad_norm': 0.43983158469200134, 'learning_rate': 1.951002978709631e-06, 'epoch': 0.71}


 71%|███████   | 573/805 [1:24:48<34:19,  8.88s/it]

{'loss': 0.7547, 'grad_norm': 0.45223313570022583, 'learning_rate': 1.9354647317351187e-06, 'epoch': 0.71}


 71%|███████▏  | 574/805 [1:24:57<34:11,  8.88s/it]

{'loss': 0.7782, 'grad_norm': 0.4438658058643341, 'learning_rate': 1.9199737436854517e-06, 'epoch': 0.71}


 71%|███████▏  | 575/805 [1:25:06<34:05,  8.89s/it]

{'loss': 0.7258, 'grad_norm': 0.41764959692955017, 'learning_rate': 1.9045302534508298e-06, 'epoch': 0.71}


 72%|███████▏  | 576/805 [1:25:14<33:55,  8.89s/it]

{'loss': 0.878, 'grad_norm': 0.4163682460784912, 'learning_rate': 1.8891344991889798e-06, 'epoch': 0.72}


 72%|███████▏  | 577/805 [1:25:23<33:44,  8.88s/it]

{'loss': 0.7633, 'grad_norm': 0.3949025273323059, 'learning_rate': 1.873786718321476e-06, 'epoch': 0.72}


 72%|███████▏  | 578/805 [1:25:32<33:34,  8.88s/it]

{'loss': 0.7574, 'grad_norm': 0.43771371245384216, 'learning_rate': 1.8584871475300814e-06, 'epoch': 0.72}


 72%|███████▏  | 579/805 [1:25:41<33:25,  8.88s/it]

{'loss': 0.7319, 'grad_norm': 0.42095237970352173, 'learning_rate': 1.8432360227531116e-06, 'epoch': 0.72}


 72%|███████▏  | 580/805 [1:25:50<33:11,  8.85s/it]

{'loss': 0.6735, 'grad_norm': 0.4229109585285187, 'learning_rate': 1.8280335791817733e-06, 'epoch': 0.72}


 72%|███████▏  | 581/805 [1:25:59<33:05,  8.86s/it]

{'loss': 0.7416, 'grad_norm': 0.4202624261379242, 'learning_rate': 1.8128800512565514e-06, 'epoch': 0.72}


 72%|███████▏  | 582/805 [1:26:08<32:54,  8.86s/it]

{'loss': 0.7589, 'grad_norm': 0.47426414489746094, 'learning_rate': 1.797775672663596e-06, 'epoch': 0.72}


 72%|███████▏  | 583/805 [1:26:16<32:41,  8.84s/it]

{'loss': 0.7448, 'grad_norm': 0.42364442348480225, 'learning_rate': 1.7827206763311055e-06, 'epoch': 0.72}


 73%|███████▎  | 584/805 [1:26:25<32:30,  8.83s/it]

{'loss': 0.7844, 'grad_norm': 0.4055882692337036, 'learning_rate': 1.7677152944257514e-06, 'epoch': 0.73}


 73%|███████▎  | 585/805 [1:26:34<32:20,  8.82s/it]

{'loss': 0.7322, 'grad_norm': 0.42765775322914124, 'learning_rate': 1.7527597583490825e-06, 'epoch': 0.73}


 73%|███████▎  | 586/805 [1:26:43<32:16,  8.84s/it]

{'loss': 0.8081, 'grad_norm': 0.44834816455841064, 'learning_rate': 1.7378542987339675e-06, 'epoch': 0.73}


 73%|███████▎  | 587/805 [1:26:52<32:10,  8.86s/it]

{'loss': 0.6949, 'grad_norm': 0.4240264892578125, 'learning_rate': 1.722999145441031e-06, 'epoch': 0.73}


 73%|███████▎  | 588/805 [1:27:01<32:01,  8.86s/it]

{'loss': 0.6935, 'grad_norm': 0.4504769444465637, 'learning_rate': 1.7081945275551142e-06, 'epoch': 0.73}


 73%|███████▎  | 589/805 [1:27:09<31:53,  8.86s/it]

{'loss': 0.7856, 'grad_norm': 0.43540433049201965, 'learning_rate': 1.6934406733817417e-06, 'epoch': 0.73}


 73%|███████▎  | 590/805 [1:27:18<31:46,  8.87s/it]

{'loss': 0.709, 'grad_norm': 0.4367990791797638, 'learning_rate': 1.6787378104435931e-06, 'epoch': 0.73}


 73%|███████▎  | 591/805 [1:27:27<31:39,  8.87s/it]

{'loss': 0.6978, 'grad_norm': 0.43160438537597656, 'learning_rate': 1.6640861654770007e-06, 'epoch': 0.73}


 74%|███████▎  | 592/805 [1:27:36<31:29,  8.87s/it]

{'loss': 0.7662, 'grad_norm': 0.4645518362522125, 'learning_rate': 1.6494859644284623e-06, 'epoch': 0.73}


 74%|███████▎  | 593/805 [1:27:45<31:19,  8.87s/it]

{'loss': 0.8979, 'grad_norm': 0.4268985688686371, 'learning_rate': 1.6349374324511347e-06, 'epoch': 0.74}


 74%|███████▍  | 594/805 [1:27:54<31:11,  8.87s/it]

{'loss': 0.6517, 'grad_norm': 0.42660003900527954, 'learning_rate': 1.6204407939013771e-06, 'epoch': 0.74}


 74%|███████▍  | 595/805 [1:28:03<31:01,  8.87s/it]

{'loss': 0.7603, 'grad_norm': 0.4572305679321289, 'learning_rate': 1.6059962723352912e-06, 'epoch': 0.74}


 74%|███████▍  | 596/805 [1:28:12<30:54,  8.87s/it]

{'loss': 0.7183, 'grad_norm': 0.42511123418807983, 'learning_rate': 1.5916040905052693e-06, 'epoch': 0.74}


 74%|███████▍  | 597/805 [1:28:20<30:45,  8.87s/it]

{'loss': 0.7466, 'grad_norm': 0.4628780484199524, 'learning_rate': 1.5772644703565564e-06, 'epoch': 0.74}


 74%|███████▍  | 598/805 [1:28:29<30:36,  8.87s/it]

{'loss': 0.7606, 'grad_norm': 0.4177079200744629, 'learning_rate': 1.5629776330238372e-06, 'epoch': 0.74}


 74%|███████▍  | 599/805 [1:28:38<30:28,  8.88s/it]

{'loss': 0.7851, 'grad_norm': 0.43884629011154175, 'learning_rate': 1.5487437988278141e-06, 'epoch': 0.74}


 75%|███████▍  | 600/805 [1:28:47<30:21,  8.89s/it]

{'loss': 0.7852, 'grad_norm': 0.4884176254272461, 'learning_rate': 1.5345631872718214e-06, 'epoch': 0.74}


 75%|███████▍  | 601/805 [1:28:56<30:13,  8.89s/it]

{'loss': 0.6713, 'grad_norm': 0.4039852023124695, 'learning_rate': 1.5204360170384286e-06, 'epoch': 0.75}


 75%|███████▍  | 602/805 [1:29:05<30:03,  8.88s/it]

{'loss': 0.7857, 'grad_norm': 0.42289385199546814, 'learning_rate': 1.50636250598608e-06, 'epoch': 0.75}


 75%|███████▍  | 603/805 [1:29:14<29:52,  8.88s/it]

{'loss': 0.7779, 'grad_norm': 0.43203502893447876, 'learning_rate': 1.4923428711457217e-06, 'epoch': 0.75}


 75%|███████▌  | 604/805 [1:29:23<29:44,  8.88s/it]

{'loss': 0.7276, 'grad_norm': 0.45165932178497314, 'learning_rate': 1.4783773287174685e-06, 'epoch': 0.75}


 75%|███████▌  | 605/805 [1:29:31<29:33,  8.87s/it]

{'loss': 0.6868, 'grad_norm': 0.4577948749065399, 'learning_rate': 1.4644660940672628e-06, 'epoch': 0.75}


 75%|███████▌  | 606/805 [1:29:40<29:23,  8.86s/it]

{'loss': 0.7766, 'grad_norm': 0.4436778128147125, 'learning_rate': 1.4506093817235495e-06, 'epoch': 0.75}


 75%|███████▌  | 607/805 [1:29:49<29:10,  8.84s/it]

{'loss': 0.7194, 'grad_norm': 0.44127532839775085, 'learning_rate': 1.4368074053739733e-06, 'epoch': 0.75}


 76%|███████▌  | 608/805 [1:29:58<29:01,  8.84s/it]

{'loss': 0.731, 'grad_norm': 0.424650102853775, 'learning_rate': 1.4230603778620855e-06, 'epoch': 0.75}


 76%|███████▌  | 609/805 [1:30:07<28:50,  8.83s/it]

{'loss': 0.8122, 'grad_norm': 0.4353208839893341, 'learning_rate': 1.4093685111840567e-06, 'epoch': 0.76}


 76%|███████▌  | 610/805 [1:30:16<28:40,  8.82s/it]

{'loss': 0.7565, 'grad_norm': 0.4548850655555725, 'learning_rate': 1.395732016485406e-06, 'epoch': 0.76}


 76%|███████▌  | 611/805 [1:30:24<28:33,  8.83s/it]

{'loss': 0.7184, 'grad_norm': 0.4313690662384033, 'learning_rate': 1.382151104057754e-06, 'epoch': 0.76}


 76%|███████▌  | 612/805 [1:30:33<28:26,  8.84s/it]

{'loss': 0.6961, 'grad_norm': 0.4136929512023926, 'learning_rate': 1.368625983335568e-06, 'epoch': 0.76}


 76%|███████▌  | 613/805 [1:30:42<28:19,  8.85s/it]

{'loss': 0.8168, 'grad_norm': 0.42769479751586914, 'learning_rate': 1.3551568628929434e-06, 'epoch': 0.76}


 76%|███████▋  | 614/805 [1:30:51<28:11,  8.86s/it]

{'loss': 0.7182, 'grad_norm': 0.43604543805122375, 'learning_rate': 1.3417439504403769e-06, 'epoch': 0.76}


 76%|███████▋  | 615/805 [1:31:00<28:03,  8.86s/it]

{'loss': 0.709, 'grad_norm': 0.43217766284942627, 'learning_rate': 1.3283874528215735e-06, 'epoch': 0.76}


 77%|███████▋  | 616/805 [1:31:09<27:56,  8.87s/it]

{'loss': 0.7965, 'grad_norm': 0.4243330657482147, 'learning_rate': 1.3150875760102467e-06, 'epoch': 0.76}


 77%|███████▋  | 617/805 [1:31:18<27:48,  8.87s/it]

{'loss': 0.7151, 'grad_norm': 0.4527084231376648, 'learning_rate': 1.301844525106951e-06, 'epoch': 0.77}


 77%|███████▋  | 618/805 [1:31:27<27:38,  8.87s/it]

{'loss': 0.8916, 'grad_norm': 0.453637033700943, 'learning_rate': 1.2886585043359156e-06, 'epoch': 0.77}


 77%|███████▋  | 619/805 [1:31:35<27:29,  8.87s/it]

{'loss': 0.7469, 'grad_norm': 0.4267084300518036, 'learning_rate': 1.2755297170418913e-06, 'epoch': 0.77}


 77%|███████▋  | 620/805 [1:31:44<27:20,  8.87s/it]

{'loss': 0.6913, 'grad_norm': 0.41936153173446655, 'learning_rate': 1.2624583656870153e-06, 'epoch': 0.77}


 77%|███████▋  | 621/805 [1:31:53<27:12,  8.87s/it]

{'loss': 0.7327, 'grad_norm': 0.44631826877593994, 'learning_rate': 1.2494446518477022e-06, 'epoch': 0.77}


 77%|███████▋  | 622/805 [1:32:02<27:03,  8.87s/it]

{'loss': 0.7137, 'grad_norm': 0.4071566164493561, 'learning_rate': 1.2364887762115152e-06, 'epoch': 0.77}


 77%|███████▋  | 623/805 [1:32:11<26:54,  8.87s/it]

{'loss': 0.7463, 'grad_norm': 0.4328898787498474, 'learning_rate': 1.2235909385740825e-06, 'epoch': 0.77}


 78%|███████▊  | 624/805 [1:32:20<26:46,  8.88s/it]

{'loss': 0.783, 'grad_norm': 0.43389225006103516, 'learning_rate': 1.2107513378360163e-06, 'epoch': 0.77}


 78%|███████▊  | 625/805 [1:32:29<26:38,  8.88s/it]

{'loss': 0.7573, 'grad_norm': 0.42088913917541504, 'learning_rate': 1.1979701719998454e-06, 'epoch': 0.78}


 78%|███████▊  | 626/805 [1:32:38<26:27,  8.87s/it]

{'loss': 0.6786, 'grad_norm': 0.45505082607269287, 'learning_rate': 1.1852476381669558e-06, 'epoch': 0.78}


 78%|███████▊  | 627/805 [1:32:46<26:17,  8.86s/it]

{'loss': 0.8262, 'grad_norm': 0.44796034693717957, 'learning_rate': 1.1725839325345601e-06, 'epoch': 0.78}


 78%|███████▊  | 628/805 [1:32:55<26:04,  8.84s/it]

{'loss': 0.8736, 'grad_norm': 0.4590470492839813, 'learning_rate': 1.159979250392661e-06, 'epoch': 0.78}


 78%|███████▊  | 629/805 [1:33:04<25:53,  8.82s/it]

{'loss': 0.7102, 'grad_norm': 0.4378734230995178, 'learning_rate': 1.1474337861210543e-06, 'epoch': 0.78}


 78%|███████▊  | 630/805 [1:33:13<25:43,  8.82s/it]

{'loss': 0.7041, 'grad_norm': 0.4398430287837982, 'learning_rate': 1.134947733186315e-06, 'epoch': 0.78}


 78%|███████▊  | 631/805 [1:33:22<26:11,  9.03s/it]

{'loss': 0.861, 'grad_norm': 0.43362051248550415, 'learning_rate': 1.1225212841388282e-06, 'epoch': 0.78}


 79%|███████▊  | 632/805 [1:33:31<25:54,  8.99s/it]

{'loss': 0.7582, 'grad_norm': 0.4587985575199127, 'learning_rate': 1.1101546306098092e-06, 'epoch': 0.78}


 79%|███████▊  | 633/805 [1:33:40<25:36,  8.94s/it]

{'loss': 0.7629, 'grad_norm': 0.41093724966049194, 'learning_rate': 1.097847963308351e-06, 'epoch': 0.79}


 79%|███████▉  | 634/805 [1:33:49<25:24,  8.92s/it]

{'loss': 0.7002, 'grad_norm': 0.4348354637622833, 'learning_rate': 1.0856014720184927e-06, 'epoch': 0.79}


 79%|███████▉  | 635/805 [1:33:58<25:15,  8.91s/it]

{'loss': 0.7134, 'grad_norm': 0.3953438997268677, 'learning_rate': 1.0734153455962765e-06, 'epoch': 0.79}


 79%|███████▉  | 636/805 [1:34:07<25:05,  8.91s/it]

{'loss': 0.73, 'grad_norm': 0.4429303705692291, 'learning_rate': 1.0612897719668457e-06, 'epoch': 0.79}


 79%|███████▉  | 637/805 [1:34:16<24:54,  8.90s/it]

{'loss': 0.7083, 'grad_norm': 0.46359649300575256, 'learning_rate': 1.049224938121548e-06, 'epoch': 0.79}


 79%|███████▉  | 638/805 [1:34:24<24:44,  8.89s/it]

{'loss': 0.7181, 'grad_norm': 0.4502148926258087, 'learning_rate': 1.0372210301150464e-06, 'epoch': 0.79}


 79%|███████▉  | 639/805 [1:34:33<24:35,  8.89s/it]

{'loss': 0.7318, 'grad_norm': 0.4204024076461792, 'learning_rate': 1.02527823306245e-06, 'epoch': 0.79}


 80%|███████▉  | 640/805 [1:34:42<24:25,  8.88s/it]

{'loss': 0.8431, 'grad_norm': 0.4653832018375397, 'learning_rate': 1.013396731136465e-06, 'epoch': 0.79}


 80%|███████▉  | 641/805 [1:34:51<24:16,  8.88s/it]

{'loss': 0.7697, 'grad_norm': 0.43156376481056213, 'learning_rate': 1.0015767075645472e-06, 'epoch': 0.8}


 80%|███████▉  | 642/805 [1:35:00<24:08,  8.89s/it]

{'loss': 0.6923, 'grad_norm': 0.41920214891433716, 'learning_rate': 9.898183446260851e-07, 'epoch': 0.8}


 80%|███████▉  | 643/805 [1:35:09<23:58,  8.88s/it]

{'loss': 0.8316, 'grad_norm': 0.43043002486228943, 'learning_rate': 9.781218236495776e-07, 'epoch': 0.8}


 80%|████████  | 644/805 [1:35:18<23:49,  8.88s/it]

{'loss': 0.8005, 'grad_norm': 0.43326646089553833, 'learning_rate': 9.66487325009851e-07, 'epoch': 0.8}


 80%|████████  | 645/805 [1:35:27<23:39,  8.87s/it]

{'loss': 0.7926, 'grad_norm': 0.4483010470867157, 'learning_rate': 9.549150281252633e-07, 'epoch': 0.8}


 80%|████████  | 646/805 [1:35:35<23:31,  8.88s/it]

{'loss': 0.6966, 'grad_norm': 0.43791189789772034, 'learning_rate': 9.434051114549497e-07, 'epoch': 0.8}


 80%|████████  | 647/805 [1:35:44<23:22,  8.88s/it]

{'loss': 0.7312, 'grad_norm': 0.4541114568710327, 'learning_rate': 9.319577524960655e-07, 'epoch': 0.8}


 80%|████████  | 648/805 [1:35:53<23:13,  8.87s/it]

{'loss': 0.7478, 'grad_norm': 0.4238906800746918, 'learning_rate': 9.205731277810448e-07, 'epoch': 0.8}


 81%|████████  | 649/805 [1:36:02<23:04,  8.88s/it]

{'loss': 0.7587, 'grad_norm': 0.437089204788208, 'learning_rate': 9.09251412874882e-07, 'epoch': 0.81}


 81%|████████  | 650/805 [1:36:11<22:56,  8.88s/it]

{'loss': 0.7927, 'grad_norm': 0.42566564679145813, 'learning_rate': 8.979927823724321e-07, 'epoch': 0.81}


 81%|████████  | 651/805 [1:36:20<22:47,  8.88s/it]

{'loss': 0.7047, 'grad_norm': 0.45271533727645874, 'learning_rate': 8.867974098957016e-07, 'epoch': 0.81}


 81%|████████  | 652/805 [1:36:29<22:35,  8.86s/it]

{'loss': 0.7236, 'grad_norm': 0.41796377301216125, 'learning_rate': 8.75665468091183e-07, 'epoch': 0.81}


 81%|████████  | 653/805 [1:36:37<22:25,  8.85s/it]

{'loss': 0.7634, 'grad_norm': 0.45109763741493225, 'learning_rate': 8.645971286271903e-07, 'epoch': 0.81}


 81%|████████  | 654/805 [1:36:46<22:17,  8.86s/it]

{'loss': 0.741, 'grad_norm': 0.4304125905036926, 'learning_rate': 8.535925621912123e-07, 'epoch': 0.81}


 81%|████████▏ | 655/805 [1:36:55<22:06,  8.84s/it]

{'loss': 0.7358, 'grad_norm': 0.4234108626842499, 'learning_rate': 8.426519384872733e-07, 'epoch': 0.81}


 81%|████████▏ | 656/805 [1:37:04<21:57,  8.84s/it]

{'loss': 0.7811, 'grad_norm': 0.49261823296546936, 'learning_rate': 8.317754262333283e-07, 'epoch': 0.81}


 82%|████████▏ | 657/805 [1:37:13<21:47,  8.83s/it]

{'loss': 0.7359, 'grad_norm': 0.4701337218284607, 'learning_rate': 8.209631931586499e-07, 'epoch': 0.82}


 82%|████████▏ | 658/805 [1:37:22<21:37,  8.83s/it]

{'loss': 0.7079, 'grad_norm': 0.4537374973297119, 'learning_rate': 8.102154060012457e-07, 'epoch': 0.82}


 82%|████████▏ | 659/805 [1:37:30<21:28,  8.83s/it]

{'loss': 0.766, 'grad_norm': 0.44093966484069824, 'learning_rate': 7.995322305052905e-07, 'epoch': 0.82}


 82%|████████▏ | 660/805 [1:37:39<21:21,  8.84s/it]

{'loss': 0.8423, 'grad_norm': 0.46492308378219604, 'learning_rate': 7.88913831418568e-07, 'epoch': 0.82}


 82%|████████▏ | 661/805 [1:37:48<21:14,  8.85s/it]

{'loss': 0.6872, 'grad_norm': 0.41612502932548523, 'learning_rate': 7.783603724899258e-07, 'epoch': 0.82}


 82%|████████▏ | 662/805 [1:37:57<21:07,  8.87s/it]

{'loss': 0.7855, 'grad_norm': 0.4929969608783722, 'learning_rate': 7.678720164667541e-07, 'epoch': 0.82}


 82%|████████▏ | 663/805 [1:38:06<20:59,  8.87s/it]

{'loss': 0.7197, 'grad_norm': 0.44977304339408875, 'learning_rate': 7.574489250924821e-07, 'epoch': 0.82}


 82%|████████▏ | 664/805 [1:38:15<20:51,  8.88s/it]

{'loss': 0.6936, 'grad_norm': 0.4363611042499542, 'learning_rate': 7.470912591040696e-07, 'epoch': 0.82}


 83%|████████▎ | 665/805 [1:38:24<20:42,  8.87s/it]

{'loss': 0.7602, 'grad_norm': 0.40366801619529724, 'learning_rate': 7.367991782295392e-07, 'epoch': 0.83}


 83%|████████▎ | 666/805 [1:38:33<20:33,  8.87s/it]

{'loss': 0.7126, 'grad_norm': 0.44007596373558044, 'learning_rate': 7.265728411855105e-07, 'epoch': 0.83}


 83%|████████▎ | 667/805 [1:38:41<20:24,  8.87s/it]

{'loss': 0.8549, 'grad_norm': 0.4491046667098999, 'learning_rate': 7.164124056747523e-07, 'epoch': 0.83}


 83%|████████▎ | 668/805 [1:38:50<20:17,  8.88s/it]

{'loss': 0.7734, 'grad_norm': 0.4504677653312683, 'learning_rate': 7.063180283837473e-07, 'epoch': 0.83}


 83%|████████▎ | 669/805 [1:38:59<20:09,  8.89s/it]

{'loss': 0.7555, 'grad_norm': 0.4526917040348053, 'learning_rate': 6.962898649802824e-07, 'epoch': 0.83}


 83%|████████▎ | 670/805 [1:39:08<20:01,  8.90s/it]

{'loss': 0.7542, 'grad_norm': 0.4554395079612732, 'learning_rate': 6.863280701110409e-07, 'epoch': 0.83}


 83%|████████▎ | 671/805 [1:39:17<19:52,  8.90s/it]

{'loss': 0.6831, 'grad_norm': 0.47577694058418274, 'learning_rate': 6.764327973992252e-07, 'epoch': 0.83}


 83%|████████▎ | 672/805 [1:39:26<19:43,  8.90s/it]

{'loss': 0.7954, 'grad_norm': 0.4717544913291931, 'learning_rate': 6.666041994421796e-07, 'epoch': 0.83}


 84%|████████▎ | 673/805 [1:39:35<19:33,  8.89s/it]

{'loss': 0.7084, 'grad_norm': 0.44105401635169983, 'learning_rate': 6.568424278090446e-07, 'epoch': 0.84}


 84%|████████▎ | 674/805 [1:39:44<19:25,  8.90s/it]

{'loss': 0.7068, 'grad_norm': 0.4498673975467682, 'learning_rate': 6.47147633038413e-07, 'epoch': 0.84}


 84%|████████▍ | 675/805 [1:39:53<19:16,  8.89s/it]

{'loss': 0.8134, 'grad_norm': 0.4454517960548401, 'learning_rate': 6.375199646360142e-07, 'epoch': 0.84}


 84%|████████▍ | 676/805 [1:40:02<19:06,  8.89s/it]

{'loss': 0.7587, 'grad_norm': 0.43708112835884094, 'learning_rate': 6.279595710724062e-07, 'epoch': 0.84}


 84%|████████▍ | 677/805 [1:40:10<18:58,  8.89s/it]

{'loss': 0.6403, 'grad_norm': 0.40669456124305725, 'learning_rate': 6.184665997806832e-07, 'epoch': 0.84}


 84%|████████▍ | 678/805 [1:40:19<18:49,  8.89s/it]

{'loss': 0.7198, 'grad_norm': 0.4257757067680359, 'learning_rate': 6.090411971542038e-07, 'epoch': 0.84}


 84%|████████▍ | 679/805 [1:40:28<18:39,  8.88s/it]

{'loss': 0.7926, 'grad_norm': 0.44918522238731384, 'learning_rate': 5.996835085443403e-07, 'epoch': 0.84}


 84%|████████▍ | 680/805 [1:40:37<18:27,  8.86s/it]

{'loss': 0.6188, 'grad_norm': 0.4326113760471344, 'learning_rate': 5.903936782582253e-07, 'epoch': 0.84}


 85%|████████▍ | 681/805 [1:40:46<18:19,  8.86s/it]

{'loss': 0.7099, 'grad_norm': 0.4686817228794098, 'learning_rate': 5.811718495565327e-07, 'epoch': 0.85}


 85%|████████▍ | 682/805 [1:40:55<18:10,  8.86s/it]

{'loss': 0.7304, 'grad_norm': 0.4625712037086487, 'learning_rate': 5.720181646512718e-07, 'epoch': 0.85}


 85%|████████▍ | 683/805 [1:41:04<18:01,  8.87s/it]

{'loss': 0.7367, 'grad_norm': 0.47509288787841797, 'learning_rate': 5.629327647035843e-07, 'epoch': 0.85}


 85%|████████▍ | 684/805 [1:41:12<17:53,  8.88s/it]

{'loss': 0.7566, 'grad_norm': 0.44380906224250793, 'learning_rate': 5.539157898215785e-07, 'epoch': 0.85}


 85%|████████▌ | 685/805 [1:41:21<17:45,  8.88s/it]

{'loss': 0.7563, 'grad_norm': 0.4967400133609772, 'learning_rate': 5.449673790581611e-07, 'epoch': 0.85}


 85%|████████▌ | 686/805 [1:41:30<17:37,  8.89s/it]

{'loss': 0.7511, 'grad_norm': 0.45840567350387573, 'learning_rate': 5.360876704088963e-07, 'epoch': 0.85}


 85%|████████▌ | 687/805 [1:41:39<17:28,  8.88s/it]

{'loss': 0.7404, 'grad_norm': 0.4592490494251251, 'learning_rate': 5.27276800809875e-07, 'epoch': 0.85}


 85%|████████▌ | 688/805 [1:41:48<17:18,  8.88s/it]

{'loss': 0.7139, 'grad_norm': 0.45386970043182373, 'learning_rate': 5.185349061356066e-07, 'epoch': 0.85}


 86%|████████▌ | 689/805 [1:41:57<17:10,  8.88s/it]

{'loss': 0.7031, 'grad_norm': 0.4329001009464264, 'learning_rate': 5.098621211969224e-07, 'epoch': 0.86}


 86%|████████▌ | 690/805 [1:42:06<17:00,  8.88s/it]

{'loss': 0.6773, 'grad_norm': 0.447861909866333, 'learning_rate': 5.012585797388936e-07, 'epoch': 0.86}


 86%|████████▌ | 691/805 [1:42:15<16:52,  8.88s/it]

{'loss': 0.8287, 'grad_norm': 0.43182888627052307, 'learning_rate': 4.92724414438771e-07, 'epoch': 0.86}


 86%|████████▌ | 692/805 [1:42:24<16:43,  8.88s/it]

{'loss': 0.7144, 'grad_norm': 0.46429207921028137, 'learning_rate': 4.842597569039448e-07, 'epoch': 0.86}


 86%|████████▌ | 693/805 [1:42:32<16:35,  8.89s/it]

{'loss': 0.7865, 'grad_norm': 0.4729229807853699, 'learning_rate': 4.758647376699033e-07, 'epoch': 0.86}


 86%|████████▌ | 694/805 [1:42:41<16:26,  8.89s/it]

{'loss': 0.8309, 'grad_norm': 0.475960373878479, 'learning_rate': 4.675394861982269e-07, 'epoch': 0.86}


 86%|████████▋ | 695/805 [1:42:50<16:17,  8.88s/it]

{'loss': 0.7021, 'grad_norm': 0.4757309556007385, 'learning_rate': 4.5928413087459325e-07, 'epoch': 0.86}


 86%|████████▋ | 696/805 [1:42:59<16:08,  8.88s/it]

{'loss': 0.7275, 'grad_norm': 0.4243074953556061, 'learning_rate': 4.51098799006795e-07, 'epoch': 0.86}


 87%|████████▋ | 697/805 [1:43:08<15:58,  8.88s/it]

{'loss': 0.7363, 'grad_norm': 0.4793102741241455, 'learning_rate': 4.4298361682277355e-07, 'epoch': 0.87}


 87%|████████▋ | 698/805 [1:43:17<15:49,  8.87s/it]

{'loss': 0.8147, 'grad_norm': 0.4553343951702118, 'learning_rate': 4.3493870946867855e-07, 'epoch': 0.87}


 87%|████████▋ | 699/805 [1:43:26<15:37,  8.85s/it]

{'loss': 0.7608, 'grad_norm': 0.4613259732723236, 'learning_rate': 4.269642010069319e-07, 'epoch': 0.87}


 87%|████████▋ | 700/805 [1:43:34<15:27,  8.84s/it]

{'loss': 0.7173, 'grad_norm': 0.4508870542049408, 'learning_rate': 4.1906021441432074e-07, 'epoch': 0.87}


 87%|████████▋ | 701/805 [1:43:43<15:18,  8.83s/it]

{'loss': 0.7152, 'grad_norm': 0.4248642027378082, 'learning_rate': 4.112268715800943e-07, 'epoch': 0.87}


 87%|████████▋ | 702/805 [1:43:52<15:08,  8.82s/it]

{'loss': 0.7389, 'grad_norm': 0.42401552200317383, 'learning_rate': 4.0346429330409107e-07, 'epoch': 0.87}


 87%|████████▋ | 703/805 [1:44:01<14:59,  8.82s/it]

{'loss': 0.7033, 'grad_norm': 0.4532977044582367, 'learning_rate': 3.957725992948691e-07, 'epoch': 0.87}


 87%|████████▋ | 704/805 [1:44:10<14:50,  8.82s/it]

{'loss': 0.7932, 'grad_norm': 0.4307163655757904, 'learning_rate': 3.8815190816786587e-07, 'epoch': 0.87}


 88%|████████▊ | 705/805 [1:44:18<14:41,  8.82s/it]

{'loss': 0.661, 'grad_norm': 0.45480525493621826, 'learning_rate': 3.8060233744356634e-07, 'epoch': 0.88}


 88%|████████▊ | 706/805 [1:44:27<14:32,  8.81s/it]

{'loss': 0.7719, 'grad_norm': 0.4261092245578766, 'learning_rate': 3.7312400354569013e-07, 'epoch': 0.88}


 88%|████████▊ | 707/805 [1:44:36<14:23,  8.81s/it]

{'loss': 0.7855, 'grad_norm': 0.4352514445781708, 'learning_rate': 3.6571702179939604e-07, 'epoch': 0.88}


 88%|████████▊ | 708/805 [1:44:45<14:16,  8.83s/it]

{'loss': 0.6419, 'grad_norm': 0.40833500027656555, 'learning_rate': 3.5838150642950655e-07, 'epoch': 0.88}


 88%|████████▊ | 709/805 [1:44:54<14:08,  8.84s/it]

{'loss': 0.7519, 'grad_norm': 0.42540353536605835, 'learning_rate': 3.511175705587433e-07, 'epoch': 0.88}


 88%|████████▊ | 710/805 [1:45:03<14:01,  8.86s/it]

{'loss': 0.7186, 'grad_norm': 0.4333410859107971, 'learning_rate': 3.439253262059822e-07, 'epoch': 0.88}


 88%|████████▊ | 711/805 [1:45:12<13:52,  8.86s/it]

{'loss': 0.6506, 'grad_norm': 0.41822540760040283, 'learning_rate': 3.3680488428453005e-07, 'epoch': 0.88}


 88%|████████▊ | 712/805 [1:45:20<13:44,  8.86s/it]

{'loss': 0.7482, 'grad_norm': 0.4429333806037903, 'learning_rate': 3.2975635460040736e-07, 'epoch': 0.88}


 89%|████████▊ | 713/805 [1:45:29<13:35,  8.86s/it]

{'loss': 0.7354, 'grad_norm': 0.461345911026001, 'learning_rate': 3.227798458506637e-07, 'epoch': 0.89}


 89%|████████▊ | 714/805 [1:45:38<13:26,  8.86s/it]

{'loss': 0.772, 'grad_norm': 0.4118240475654602, 'learning_rate': 3.158754656216928e-07, 'epoch': 0.89}


 89%|████████▉ | 715/805 [1:45:47<13:17,  8.86s/it]

{'loss': 0.7363, 'grad_norm': 0.43789875507354736, 'learning_rate': 3.0904332038757977e-07, 'epoch': 0.89}


 89%|████████▉ | 716/805 [1:45:56<13:10,  8.88s/it]

{'loss': 0.7276, 'grad_norm': 0.4440906345844269, 'learning_rate': 3.0228351550845527e-07, 'epoch': 0.89}


 89%|████████▉ | 717/805 [1:46:05<13:02,  8.89s/it]

{'loss': 0.7776, 'grad_norm': 0.473961740732193, 'learning_rate': 2.9559615522887275e-07, 'epoch': 0.89}


 89%|████████▉ | 718/805 [1:46:14<12:53,  8.90s/it]

{'loss': 0.8035, 'grad_norm': 0.4604881703853607, 'learning_rate': 2.8898134267620115e-07, 'epoch': 0.89}


 89%|████████▉ | 719/805 [1:46:23<12:44,  8.89s/it]

{'loss': 0.8056, 'grad_norm': 0.46613165736198425, 'learning_rate': 2.8243917985903256e-07, 'epoch': 0.89}


 89%|████████▉ | 720/805 [1:46:32<12:35,  8.89s/it]

{'loss': 0.7672, 'grad_norm': 0.4327656030654907, 'learning_rate': 2.7596976766560977e-07, 'epoch': 0.89}


 90%|████████▉ | 721/805 [1:46:40<12:25,  8.88s/it]

{'loss': 0.6359, 'grad_norm': 0.43044784665107727, 'learning_rate': 2.6957320586227354e-07, 'epoch': 0.9}


 90%|████████▉ | 722/805 [1:46:49<12:16,  8.88s/it]

{'loss': 0.7029, 'grad_norm': 0.4193074107170105, 'learning_rate': 2.632495930919188e-07, 'epoch': 0.9}


 90%|████████▉ | 723/805 [1:46:58<12:08,  8.89s/it]

{'loss': 0.6368, 'grad_norm': 0.42184358835220337, 'learning_rate': 2.56999026872477e-07, 'epoch': 0.9}


 90%|████████▉ | 724/805 [1:47:07<11:59,  8.89s/it]

{'loss': 0.692, 'grad_norm': 0.4394301474094391, 'learning_rate': 2.508216035954114e-07, 'epoch': 0.9}


 90%|█████████ | 725/805 [1:47:16<11:50,  8.88s/it]

{'loss': 0.6941, 'grad_norm': 0.5057381391525269, 'learning_rate': 2.447174185242324e-07, 'epoch': 0.9}


 90%|█████████ | 726/805 [1:47:25<11:40,  8.87s/it]

{'loss': 0.8504, 'grad_norm': 0.4379205107688904, 'learning_rate': 2.3868656579302264e-07, 'epoch': 0.9}


 90%|█████████ | 727/805 [1:47:34<11:30,  8.85s/it]

{'loss': 0.6934, 'grad_norm': 0.44518572092056274, 'learning_rate': 2.32729138404994e-07, 'epoch': 0.9}


 90%|█████████ | 728/805 [1:47:42<11:22,  8.86s/it]

{'loss': 0.7667, 'grad_norm': 0.432979941368103, 'learning_rate': 2.268452282310446e-07, 'epoch': 0.9}


 91%|█████████ | 729/805 [1:47:51<11:13,  8.86s/it]

{'loss': 0.7977, 'grad_norm': 0.45515474677085876, 'learning_rate': 2.210349260083494e-07, 'epoch': 0.91}


 91%|█████████ | 730/805 [1:48:00<11:05,  8.87s/it]

{'loss': 0.7344, 'grad_norm': 0.4222757816314697, 'learning_rate': 2.152983213389559e-07, 'epoch': 0.91}


 91%|█████████ | 731/805 [1:48:09<10:57,  8.88s/it]

{'loss': 0.7047, 'grad_norm': 0.5139585137367249, 'learning_rate': 2.096355026884045e-07, 'epoch': 0.91}


 91%|█████████ | 732/805 [1:48:18<10:48,  8.89s/it]

{'loss': 0.7372, 'grad_norm': 0.4362139105796814, 'learning_rate': 2.0404655738436418e-07, 'epoch': 0.91}


 91%|█████████ | 733/805 [1:48:27<10:39,  8.88s/it]

{'loss': 0.771, 'grad_norm': 0.4355250895023346, 'learning_rate': 1.9853157161528468e-07, 'epoch': 0.91}


 91%|█████████ | 734/805 [1:48:36<10:30,  8.88s/it]

{'loss': 0.7238, 'grad_norm': 0.4539703130722046, 'learning_rate': 1.9309063042907028e-07, 'epoch': 0.91}


 91%|█████████▏| 735/805 [1:48:45<10:21,  8.88s/it]

{'loss': 0.8324, 'grad_norm': 0.45249447226524353, 'learning_rate': 1.8772381773176417e-07, 'epoch': 0.91}


 91%|█████████▏| 736/805 [1:48:54<10:12,  8.88s/it]

{'loss': 0.8006, 'grad_norm': 0.4434506297111511, 'learning_rate': 1.8243121628625626e-07, 'epoch': 0.91}


 92%|█████████▏| 737/805 [1:49:02<10:03,  8.88s/it]

{'loss': 0.7651, 'grad_norm': 0.4524005055427551, 'learning_rate': 1.7721290771100964e-07, 'epoch': 0.91}


 92%|█████████▏| 738/805 [1:49:11<09:54,  8.88s/it]

{'loss': 0.7314, 'grad_norm': 0.43200650811195374, 'learning_rate': 1.7206897247879716e-07, 'epoch': 0.92}


 92%|█████████▏| 739/805 [1:49:20<09:45,  8.87s/it]

{'loss': 0.7514, 'grad_norm': 0.43286994099617004, 'learning_rate': 1.6699948991546365e-07, 'epoch': 0.92}


 92%|█████████▏| 740/805 [1:49:29<09:36,  8.87s/it]

{'loss': 0.7922, 'grad_norm': 0.4522082507610321, 'learning_rate': 1.6200453819870122e-07, 'epoch': 0.92}


 92%|█████████▏| 741/805 [1:49:38<09:27,  8.87s/it]

{'loss': 0.7655, 'grad_norm': 0.4228079617023468, 'learning_rate': 1.5708419435684463e-07, 'epoch': 0.92}


 92%|█████████▏| 742/805 [1:49:47<09:18,  8.87s/it]

{'loss': 0.7212, 'grad_norm': 0.47774794697761536, 'learning_rate': 1.5223853426768242e-07, 'epoch': 0.92}


 92%|█████████▏| 743/805 [1:49:56<09:10,  8.88s/it]

{'loss': 0.6913, 'grad_norm': 0.4439198672771454, 'learning_rate': 1.474676326572877e-07, 'epoch': 0.92}


 92%|█████████▏| 744/805 [1:50:05<09:01,  8.88s/it]

{'loss': 0.7205, 'grad_norm': 0.40128380060195923, 'learning_rate': 1.4277156309886574e-07, 'epoch': 0.92}


 93%|█████████▎| 745/805 [1:50:13<08:52,  8.88s/it]

{'loss': 0.8275, 'grad_norm': 0.4599459767341614, 'learning_rate': 1.3815039801161723e-07, 'epoch': 0.92}


 93%|█████████▎| 746/805 [1:50:22<08:43,  8.87s/it]

{'loss': 0.7059, 'grad_norm': 0.4151259958744049, 'learning_rate': 1.3360420865962508e-07, 'epoch': 0.93}


 93%|█████████▎| 747/805 [1:50:31<08:33,  8.85s/it]

{'loss': 0.7716, 'grad_norm': 0.4430450201034546, 'learning_rate': 1.291330651507533e-07, 'epoch': 0.93}


 93%|█████████▎| 748/805 [1:50:40<08:23,  8.83s/it]

{'loss': 0.7082, 'grad_norm': 0.42439189553260803, 'learning_rate': 1.24737036435566e-07, 'epoch': 0.93}


 93%|█████████▎| 749/805 [1:50:49<08:13,  8.82s/it]

{'loss': 0.7464, 'grad_norm': 0.45821791887283325, 'learning_rate': 1.2041619030626283e-07, 'epoch': 0.93}


 93%|█████████▎| 750/805 [1:50:57<08:04,  8.81s/it]

{'loss': 0.766, 'grad_norm': 0.4365680515766144, 'learning_rate': 1.1617059339563807e-07, 'epoch': 0.93}


 93%|█████████▎| 751/805 [1:51:06<07:56,  8.83s/it]

{'loss': 0.817, 'grad_norm': 0.45441681146621704, 'learning_rate': 1.1200031117604704e-07, 'epoch': 0.93}


 93%|█████████▎| 752/805 [1:51:15<07:49,  8.87s/it]

{'loss': 0.7743, 'grad_norm': 0.43323177099227905, 'learning_rate': 1.0790540795840021e-07, 'epoch': 0.93}


 94%|█████████▎| 753/805 [1:51:25<07:52,  9.09s/it]

{'loss': 0.628, 'grad_norm': 0.4528999626636505, 'learning_rate': 1.038859468911707e-07, 'epoch': 0.93}


 94%|█████████▎| 754/805 [1:51:34<07:51,  9.24s/it]

{'loss': 0.7026, 'grad_norm': 0.45162588357925415, 'learning_rate': 9.994198995942228e-08, 'epoch': 0.94}


 94%|█████████▍| 755/805 [1:51:44<07:46,  9.33s/it]

{'loss': 0.8293, 'grad_norm': 0.45726287364959717, 'learning_rate': 9.607359798384785e-08, 'epoch': 0.94}


 94%|█████████▍| 756/805 [1:51:54<07:41,  9.42s/it]

{'loss': 0.8623, 'grad_norm': 0.45972689986228943, 'learning_rate': 9.228083061983806e-08, 'epoch': 0.94}


 94%|█████████▍| 757/805 [1:52:03<07:33,  9.46s/it]

{'loss': 0.8162, 'grad_norm': 0.4369615912437439, 'learning_rate': 8.856374635655696e-08, 'epoch': 0.94}


 94%|█████████▍| 758/805 [1:52:13<07:26,  9.50s/it]

{'loss': 0.7007, 'grad_norm': 0.4351476728916168, 'learning_rate': 8.492240251604222e-08, 'epoch': 0.94}


 94%|█████████▍| 759/805 [1:52:22<07:17,  9.51s/it]

{'loss': 0.7994, 'grad_norm': 0.44848522543907166, 'learning_rate': 8.135685525232028e-08, 'epoch': 0.94}


 94%|█████████▍| 760/805 [1:52:32<07:09,  9.55s/it]

{'loss': 0.8157, 'grad_norm': 0.5035743117332458, 'learning_rate': 7.786715955054202e-08, 'epoch': 0.94}


 95%|█████████▍| 761/805 [1:52:41<06:59,  9.54s/it]

{'loss': 0.7188, 'grad_norm': 0.4420662820339203, 'learning_rate': 7.445336922613067e-08, 'epoch': 0.94}


 95%|█████████▍| 762/805 [1:52:51<06:50,  9.54s/it]

{'loss': 0.6724, 'grad_norm': 0.4248327910900116, 'learning_rate': 7.111553692395633e-08, 'epoch': 0.95}


 95%|█████████▍| 763/805 [1:53:01<06:41,  9.56s/it]

{'loss': 0.7424, 'grad_norm': 0.45388346910476685, 'learning_rate': 6.785371411752285e-08, 'epoch': 0.95}


 95%|█████████▍| 764/805 [1:53:10<06:32,  9.58s/it]

{'loss': 0.7291, 'grad_norm': 0.4421882629394531, 'learning_rate': 6.466795110817214e-08, 'epoch': 0.95}


 95%|█████████▌| 765/805 [1:53:20<06:23,  9.58s/it]

{'loss': 0.7261, 'grad_norm': 0.46044930815696716, 'learning_rate': 6.15582970243117e-08, 'epoch': 0.95}


 95%|█████████▌| 766/805 [1:53:29<06:12,  9.56s/it]

{'loss': 0.7121, 'grad_norm': 0.42292916774749756, 'learning_rate': 5.85247998206534e-08, 'epoch': 0.95}


 95%|█████████▌| 767/805 [1:53:39<06:03,  9.56s/it]

{'loss': 0.735, 'grad_norm': 0.48495110869407654, 'learning_rate': 5.5567506277477425e-08, 'epoch': 0.95}


 95%|█████████▌| 768/805 [1:53:48<05:53,  9.56s/it]

{'loss': 0.7877, 'grad_norm': 0.4336540699005127, 'learning_rate': 5.26864619999079e-08, 'epoch': 0.95}


 96%|█████████▌| 769/805 [1:53:58<05:44,  9.56s/it]

{'loss': 0.7532, 'grad_norm': 0.4350549876689911, 'learning_rate': 4.988171141721232e-08, 'epoch': 0.95}


 96%|█████████▌| 770/805 [1:54:08<05:34,  9.57s/it]

{'loss': 0.7011, 'grad_norm': 0.44958746433258057, 'learning_rate': 4.715329778211375e-08, 'epoch': 0.96}


 96%|█████████▌| 771/805 [1:54:17<05:25,  9.58s/it]

{'loss': 0.682, 'grad_norm': 0.42125797271728516, 'learning_rate': 4.450126317012637e-08, 'epoch': 0.96}


 96%|█████████▌| 772/805 [1:54:27<05:15,  9.57s/it]

{'loss': 0.7476, 'grad_norm': 0.4191614091396332, 'learning_rate': 4.1925648478903794e-08, 'epoch': 0.96}


 96%|█████████▌| 773/805 [1:54:36<05:05,  9.56s/it]

{'loss': 0.7786, 'grad_norm': 0.4457913935184479, 'learning_rate': 3.9426493427611177e-08, 'epoch': 0.96}


 96%|█████████▌| 774/805 [1:54:46<04:55,  9.55s/it]

{'loss': 0.6951, 'grad_norm': 0.42370688915252686, 'learning_rate': 3.700383655631079e-08, 'epoch': 0.96}


 96%|█████████▋| 775/805 [1:54:55<04:46,  9.55s/it]

{'loss': 0.6953, 'grad_norm': 0.43473780155181885, 'learning_rate': 3.465771522536854e-08, 'epoch': 0.96}


 96%|█████████▋| 776/805 [1:55:05<04:37,  9.56s/it]

{'loss': 0.7532, 'grad_norm': 0.43867287039756775, 'learning_rate': 3.2388165614878344e-08, 'epoch': 0.96}


 97%|█████████▋| 777/805 [1:55:15<04:27,  9.55s/it]

{'loss': 0.7174, 'grad_norm': 0.47136810421943665, 'learning_rate': 3.019522272410202e-08, 'epoch': 0.96}


 97%|█████████▋| 778/805 [1:55:24<04:18,  9.57s/it]

{'loss': 0.9327, 'grad_norm': 0.45908594131469727, 'learning_rate': 2.8078920370931405e-08, 'epoch': 0.97}


 97%|█████████▋| 779/805 [1:55:34<04:08,  9.56s/it]

{'loss': 0.6953, 'grad_norm': 0.43604451417922974, 'learning_rate': 2.6039291191367612e-08, 'epoch': 0.97}


 97%|█████████▋| 780/805 [1:55:43<03:59,  9.58s/it]

{'loss': 0.8252, 'grad_norm': 0.42947128415107727, 'learning_rate': 2.4076366639015914e-08, 'epoch': 0.97}


 97%|█████████▋| 781/805 [1:55:53<03:49,  9.57s/it]

{'loss': 0.7628, 'grad_norm': 0.44686904549598694, 'learning_rate': 2.219017698460002e-08, 'epoch': 0.97}


 97%|█████████▋| 782/805 [1:56:02<03:39,  9.56s/it]

{'loss': 0.8944, 'grad_norm': 0.4601649343967438, 'learning_rate': 2.038075131549855e-08, 'epoch': 0.97}


 97%|█████████▋| 783/805 [1:56:12<03:30,  9.57s/it]

{'loss': 0.6779, 'grad_norm': 0.43093955516815186, 'learning_rate': 1.864811753529372e-08, 'epoch': 0.97}


 97%|█████████▋| 784/805 [1:56:22<03:21,  9.59s/it]

{'loss': 0.6774, 'grad_norm': 0.4251960217952728, 'learning_rate': 1.6992302363341706e-08, 'epoch': 0.97}


 98%|█████████▊| 785/805 [1:56:31<03:11,  9.58s/it]

{'loss': 0.7705, 'grad_norm': 0.43969449400901794, 'learning_rate': 1.541333133436018e-08, 'epoch': 0.97}


 98%|█████████▊| 786/805 [1:56:41<03:02,  9.59s/it]

{'loss': 0.7681, 'grad_norm': 0.41855043172836304, 'learning_rate': 1.3911228798036414e-08, 'epoch': 0.98}


 98%|█████████▊| 787/805 [1:56:50<02:52,  9.59s/it]

{'loss': 0.7083, 'grad_norm': 0.455131858587265, 'learning_rate': 1.2486017918649784e-08, 'epoch': 0.98}


 98%|█████████▊| 788/805 [1:57:00<02:42,  9.56s/it]

{'loss': 0.7671, 'grad_norm': 0.4593011140823364, 'learning_rate': 1.1137720674714303e-08, 'epoch': 0.98}


 98%|█████████▊| 789/805 [1:57:09<02:32,  9.56s/it]

{'loss': 0.6597, 'grad_norm': 0.49809959530830383, 'learning_rate': 9.866357858642206e-09, 'epoch': 0.98}


 98%|█████████▊| 790/805 [1:57:19<02:23,  9.57s/it]

{'loss': 0.6675, 'grad_norm': 0.4309329390525818, 'learning_rate': 8.671949076420883e-09, 'epoch': 0.98}


 98%|█████████▊| 791/805 [1:57:29<02:14,  9.59s/it]

{'loss': 0.8092, 'grad_norm': 0.4595502018928528, 'learning_rate': 7.55451274731034e-09, 'epoch': 0.98}


 98%|█████████▊| 792/805 [1:57:38<02:04,  9.59s/it]

{'loss': 0.7557, 'grad_norm': 0.4666290283203125, 'learning_rate': 6.514066103562311e-09, 'epoch': 0.98}


 99%|█████████▊| 793/805 [1:57:48<01:55,  9.58s/it]

{'loss': 0.8622, 'grad_norm': 0.4696674346923828, 'learning_rate': 5.5506251901504825e-09, 'epoch': 0.98}


 99%|█████████▊| 794/805 [1:57:57<01:45,  9.57s/it]

{'loss': 0.8214, 'grad_norm': 0.4542628228664398, 'learning_rate': 4.664204864525124e-09, 'epoch': 0.99}


 99%|█████████▉| 795/805 [1:58:07<01:35,  9.55s/it]

{'loss': 0.7395, 'grad_norm': 0.42410600185394287, 'learning_rate': 3.854818796385495e-09, 'epoch': 0.99}


 99%|█████████▉| 796/805 [1:58:16<01:26,  9.58s/it]

{'loss': 0.8279, 'grad_norm': 0.47300201654434204, 'learning_rate': 3.1224794674650228e-09, 'epoch': 0.99}


 99%|█████████▉| 797/805 [1:58:26<01:16,  9.56s/it]

{'loss': 0.7154, 'grad_norm': 0.4164161682128906, 'learning_rate': 2.4671981713420003e-09, 'epoch': 0.99}


 99%|█████████▉| 798/805 [1:58:36<01:06,  9.57s/it]

{'loss': 0.7342, 'grad_norm': 0.46266210079193115, 'learning_rate': 1.8889850132658427e-09, 'epoch': 0.99}


 99%|█████████▉| 799/805 [1:58:45<00:57,  9.56s/it]

{'loss': 0.7369, 'grad_norm': 0.4492325186729431, 'learning_rate': 1.3878489099972137e-09, 'epoch': 0.99}


 99%|█████████▉| 800/805 [1:58:55<00:47,  9.57s/it]

{'loss': 0.7516, 'grad_norm': 0.4504021406173706, 'learning_rate': 9.637975896759077e-10, 'epoch': 0.99}


100%|█████████▉| 801/805 [1:59:04<00:38,  9.57s/it]

{'loss': 0.751, 'grad_norm': 0.4634816646575928, 'learning_rate': 6.168375916970615e-10, 'epoch': 0.99}


100%|█████████▉| 802/805 [1:59:14<00:28,  9.56s/it]

{'loss': 0.7543, 'grad_norm': 0.4648995101451874, 'learning_rate': 3.4697426661345344e-10, 'epoch': 1.0}


100%|█████████▉| 803/805 [1:59:23<00:19,  9.56s/it]

{'loss': 0.7463, 'grad_norm': 0.40786516666412354, 'learning_rate': 1.5421177605168258e-10, 'epoch': 1.0}


100%|█████████▉| 804/805 [1:59:33<00:09,  9.57s/it]

{'loss': 0.8106, 'grad_norm': 0.5236582159996033, 'learning_rate': 3.855309264721996e-11, 'epoch': 1.0}


100%|██████████| 805/805 [1:59:43<00:00,  9.58s/it]

{'loss': 0.725, 'grad_norm': 0.4440672695636749, 'learning_rate': 0.0, 'epoch': 1.0}


100%|██████████| 805/805 [1:59:44<00:00,  8.92s/it]

{'train_runtime': 7184.4191, 'train_samples_per_second': 0.897, 'train_steps_per_second': 0.112, 'train_loss': 0.8360126417616139, 'epoch': 1.0}





<a name="Inference"></a>
### Inference
Let's run the model! Unsloth also makes inference natively 2x faster. Use prompts similar to the ones the model was fine-tuned on; otherwise you may get poor results.

In [9]:
import torch
from transformers import TextStreamer

# Upper bound on the number of generated tokens.
# NOTE(review): at 128 a long recipe is cut off before it finishes; raise this
# value if complete recipes are needed.
MAX_NEW_TOKENS = 128

FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

messages = [                    # Change below!
    {"role": "user", "content": "I have beef and salt and rice, What can i make?"},
]

# Render the conversation with the tokenizer's chat template and append the
# generation prompt so the model answers as the assistant.
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True,
    return_tensors = "pt",
).to("cuda")

# The prompt is a single, unpadded sequence, so every position is a real token
# and the mask is all ones. Deriving the mask from `input_ids != pad_token_id`
# raises when the tokenizer has no pad token, and silently hides genuine prompt
# tokens whenever the pad id is shared with a token the chat template emits
# (for example when the pad token is set to the end-of-turn/EOS token).
attention_mask = torch.ones_like(input_ids)

# Stream decoded tokens to stdout as they are produced, without echoing the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(
    input_ids,
    streamer = text_streamer,
    max_new_tokens = MAX_NEW_TOKENS,
    attention_mask = attention_mask,
    pad_token_id = tokenizer.pad_token_id,
)

{"is_recipe_request": true, "recipe_name": "Beef and Rice Recipe - Beef Biryani Recipe"}
Recipe Name: Beef and Rice Recipe - Beef Biryani Recipe
Cuisine: Indian
Course: Lunch
Diet: Non Vegeterian
Ingredients:
1. Beef: 500 grams Beef, cut into small pieces
2. Salt, to taste
3. Rice: 1 cup Rice
4. Oil, for cooking
5. Onions, chopped
6. Ginger, chopped
7. Garlic, chopped
8. Cinnamon Stick
9. Cloves
10. Cardam


In [10]:
# Export the fine-tuned model and its tokenizer as GGUF into "taejoonlee/v7".
# NOTE(review): maximum_memory_usage presumably caps the fraction of memory
# Unsloth may use while merging/saving -- confirm against the Unsloth docs.
model.save_pretrained_gguf(
    "taejoonlee/v7",
    tokenizer,
    maximum_memory_usage=0.8,
)

Unsloth: ##### The current model auto adds a BOS token.
Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily.


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 15.32 out of 31.26 RAM for saving.


  9%|▉         | 3/32 [00:00<00:00, 29.27it/s]We will save to Disk and not RAM now.
100%|██████████| 32/32 [00:37<00:00,  1.16s/it]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp will take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits will take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q8_0'] will take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: [0] Installing llama.cpp. This will take 3 minutes...
Unsloth: [1] Converting model at taejoonlee/v7 into q8_0 GGUF format.
The output location will be ./taejoonlee/v7/unsloth.Q8_0.gguf
This will take 3 minutes...
INFO:hf-to-gguf:Loading model: v7
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
INFO:hf-to-gguf:gguf: loading model part 'model-00001-of-00004.safetensors'
INFO:hf-to-gguf:token_embd.weight,           torch.bfloat16 --> Q8_0, shape = {4096, 128256}
INFO:hf-to-gguf:blk.0.attn_norm.weight,      t

Unsloth: ##### The current model auto adds a BOS token.
Unsloth: ##### We removed it in GGUF's chat template for you.


Unsloth: Conversion completed! Output location: ./taejoonlee/v7/unsloth.Q8_0.gguf
Unsloth: Saved Ollama Modelfile to taejoonlee/v7/Modelfile
