In [1]:
# https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/blob/main/sample_finetune.py

In [2]:
import sys
import logging

import datasets
from datasets import load_dataset
from peft import LoraConfig
import torch
import transformers
from trl import SFTTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from huggingface_hub import HfApi, HfFolder, Repository, create_repo

In [3]:
logger = logging.getLogger(__name__)

In [4]:
training_config = {
    "bf16": True,
    "do_eval": False,
    "learning_rate": 5.0e-06,
    "log_level": "info",
    "logging_steps": 20,
    "logging_strategy": "steps",
    "lr_scheduler_type": "cosine",
    "num_train_epochs": 1,
    "max_steps": -1,
    "output_dir": "./sample_phi3_finetune_example",
    "overwrite_output_dir": True,
    "per_device_eval_batch_size": 4,
    "per_device_train_batch_size": 4,
    "remove_unused_columns": True,
    "save_steps": 100,
    "save_total_limit": 1,
    "seed": 0,
    "gradient_checkpointing": True,
    "gradient_checkpointing_kwargs":{"use_reentrant": False},
    "gradient_accumulation_steps": 1,
    "warmup_ratio": 0.2,
    }

In [5]:
peft_config = {
    "r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": "CAUSAL_LM",
    "target_modules": "all-linear",
    "modules_to_save": None,
}

In [6]:
train_conf = TrainingArguments(**training_config)
peft_conf = LoraConfig(**peft_config)


In [7]:
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)
log_level = train_conf.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()

# Log on each process a small summary
logger.warning(
    f"Process rank: {train_conf.local_rank}, device: {train_conf.device}, n_gpu: {train_conf.n_gpu}"
    + f" distributed training: {bool(train_conf.local_rank != -1)}, 16-bits training: {train_conf.fp16}"
)
logger.info(f"Training/evaluation parameters {train_conf}")
logger.info(f"PEFT parameters {peft_conf}")

2024-06-03 13:27:31 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
batch_eval_metrics=False,
bf16=True,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_steps=None,
eval_strategy=IntervalStrategy.NO,
evaluation_strategy=None,


In [8]:
################
# Modle Loading
################
#checkpoint_path = "microsoft/Phi-3-mini-4k-instruct"
checkpoint_path = "microsoft/Phi-3-mini-128k-instruct"
model_kwargs = dict(
    use_cache=False,
    trust_remote_code=True,
    attn_implementation="flash_attention_2",  # loading the model with flash-attenstion support
    torch_dtype=torch.bfloat16,
    device_map=None
)
model = AutoModelForCausalLM.from_pretrained(checkpoint_path, **model_kwargs)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
tokenizer.model_max_length = 512
tokenizer.pad_token = tokenizer.unk_token  # use unk rather than eos token to prevent endless generation
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
tokenizer.padding_side = 'right'

[INFO|configuration_utils.py:733] 2024-06-03 13:27:31,861 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-128k-instruct\snapshots\5be6479b4bc06a081e8f4c6ece294241ccd32dec\config.json
[INFO|configuration_utils.py:733] 2024-06-03 13:27:32,075 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-128k-instruct\snapshots\5be6479b4bc06a081e8f4c6ece294241ccd32dec\config.json
[INFO|configuration_utils.py:796] 2024-06-03 13:27:32,076 >> Model config Phi3Config {
  "_name_or_path": "microsoft/Phi-3-mini-128k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-128k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-128k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO|modeling_utils.py:4280] 2024-06-03 13:27:42,389 >> All model checkpoint weights were used when initializing Phi3ForCausalLM.

[INFO|modeling_utils.py:4288] 2024-06-03 13:27:42,390 >> All the weights of Phi3ForCausalLM were initialized from the model checkpoint at microsoft/Phi-3-mini-128k-instruct.
If your task is similar to the task the model of the checkpoint was trained on, you can already use Phi3ForCausalLM for predictions without further training.
[INFO|configuration_utils.py:917] 2024-06-03 13:27:42,640 >> loading configuration file generation_config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-128k-instruct\snapshots\5be6479b4bc06a081e8f4c6ece294241ccd32dec\generation_config.json
[INFO|configuration_utils.py:962] 2024-06-03 13:27:42,642 >> Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": [
    32000,
    32001,
    32007
  ],
  "pad_token_id": 32000
}

[INFO|tokenization_utils_base.py:2108] 2024-06-03 

In [9]:
##################
# Data Processing
##################
def apply_chat_template(
    example,
    tokenizer,
):
    messages = example["messages"]
    # Add an empty system message if there is none
    if messages[0]["role"] != "system":
        messages.insert(0, {"role": "system", "content": ""})
    example["text"] = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=False)
    return example

raw_dataset = load_dataset("wenlianghuang/dataset_phi3_matt_testing")
train_dataset = raw_dataset["train_sft"]
test_dataset = raw_dataset["test_sft"]
column_names = list(train_dataset.features)

Overwrite dataset info from restored data version if exists.


2024-06-03 13:27:51 - INFO - datasets.builder - Overwrite dataset info from restored data version if exists.


Loading Dataset info from C:\Users\acer alan\.cache\huggingface\datasets/wenlianghuang___dataset_phi3_matt_testing/default/0.0.0/a1fb0788e4ae946b0027f6f1318161a2c26c4282


2024-06-03 13:27:51 - INFO - datasets.info - Loading Dataset info from C:\Users\acer alan\.cache\huggingface\datasets/wenlianghuang___dataset_phi3_matt_testing/default/0.0.0/a1fb0788e4ae946b0027f6f1318161a2c26c4282


Found cached dataset dataset_phi3_matt_testing (C:/Users/acer alan/.cache/huggingface/datasets/wenlianghuang___dataset_phi3_matt_testing/default/0.0.0/a1fb0788e4ae946b0027f6f1318161a2c26c4282)


2024-06-03 13:27:51 - INFO - datasets.builder - Found cached dataset dataset_phi3_matt_testing (C:/Users/acer alan/.cache/huggingface/datasets/wenlianghuang___dataset_phi3_matt_testing/default/0.0.0/a1fb0788e4ae946b0027f6f1318161a2c26c4282)


Loading Dataset info from C:/Users/acer alan/.cache/huggingface/datasets/wenlianghuang___dataset_phi3_matt_testing/default/0.0.0/a1fb0788e4ae946b0027f6f1318161a2c26c4282


2024-06-03 13:27:51 - INFO - datasets.info - Loading Dataset info from C:/Users/acer alan/.cache/huggingface/datasets/wenlianghuang___dataset_phi3_matt_testing/default/0.0.0/a1fb0788e4ae946b0027f6f1318161a2c26c4282


In [10]:
processed_train_dataset = train_dataset.map(
    apply_chat_template,
    fn_kwargs={"tokenizer": tokenizer},
    num_proc=5,
    remove_columns=column_names,
    desc="Applying chat template to train_sft",
)


Process #0 will write at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-61068def4bd2125e_00000_of_00005.arrow


2024-06-03 13:27:51 - INFO - datasets.arrow_dataset - Process #0 will write at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-61068def4bd2125e_00000_of_00005.arrow


Process #1 will write at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-61068def4bd2125e_00001_of_00005.arrow


2024-06-03 13:27:51 - INFO - datasets.arrow_dataset - Process #1 will write at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-61068def4bd2125e_00001_of_00005.arrow


Process #2 will write at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-61068def4bd2125e_00002_of_00005.arrow


2024-06-03 13:27:51 - INFO - datasets.arrow_dataset - Process #2 will write at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-61068def4bd2125e_00002_of_00005.arrow


Process #3 will write at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-61068def4bd2125e_00003_of_00005.arrow


2024-06-03 13:27:51 - INFO - datasets.arrow_dataset - Process #3 will write at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-61068def4bd2125e_00003_of_00005.arrow


Process #4 will write at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-61068def4bd2125e_00004_of_00005.arrow


2024-06-03 13:27:51 - INFO - datasets.arrow_dataset - Process #4 will write at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-61068def4bd2125e_00004_of_00005.arrow


Spawning 5 processes


2024-06-03 13:27:51 - INFO - datasets.arrow_dataset - Spawning 5 processes


Applying chat template to train_sft (num_proc=5):   0%|          | 0/5000 [00:00<?, ? examples/s]

Concatenating 5 shards


2024-06-03 13:27:57 - INFO - datasets.arrow_dataset - Concatenating 5 shards


In [11]:
processed_test_dataset = test_dataset.map(
    apply_chat_template,
    fn_kwargs={"tokenizer": tokenizer},
    num_proc=5,
    remove_columns=column_names,
    desc="Applying chat template to test_sft",
)

Process #0 will write at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-825ef8715f20ed9d_00000_of_00005.arrow


2024-06-03 13:27:57 - INFO - datasets.arrow_dataset - Process #0 will write at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-825ef8715f20ed9d_00000_of_00005.arrow


Process #1 will write at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-825ef8715f20ed9d_00001_of_00005.arrow


2024-06-03 13:27:57 - INFO - datasets.arrow_dataset - Process #1 will write at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-825ef8715f20ed9d_00001_of_00005.arrow


Process #2 will write at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-825ef8715f20ed9d_00002_of_00005.arrow


2024-06-03 13:27:57 - INFO - datasets.arrow_dataset - Process #2 will write at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-825ef8715f20ed9d_00002_of_00005.arrow


Process #3 will write at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-825ef8715f20ed9d_00003_of_00005.arrow


2024-06-03 13:27:57 - INFO - datasets.arrow_dataset - Process #3 will write at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-825ef8715f20ed9d_00003_of_00005.arrow


Process #4 will write at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-825ef8715f20ed9d_00004_of_00005.arrow


2024-06-03 13:27:57 - INFO - datasets.arrow_dataset - Process #4 will write at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-825ef8715f20ed9d_00004_of_00005.arrow


Spawning 5 processes


2024-06-03 13:27:57 - INFO - datasets.arrow_dataset - Spawning 5 processes


Applying chat template to test_sft (num_proc=5):   0%|          | 0/1000 [00:00<?, ? examples/s]

Concatenating 5 shards


2024-06-03 13:28:00 - INFO - datasets.arrow_dataset - Concatenating 5 shards


In [12]:
###########
# Training
###########
trainer = SFTTrainer(
    model=model,
    args=train_conf,
    peft_config=peft_conf,
    train_dataset=processed_train_dataset,
    eval_dataset=processed_test_dataset,
    max_seq_length=1024,
    dataset_text_field="text",
    tokenizer=tokenizer,
    packing=True
)
train_result = trainer.train()
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

Using custom data configuration default-8fe75ed2c3a85b51


2024-06-03 13:28:01 - INFO - datasets.builder - Using custom data configuration default-8fe75ed2c3a85b51


Loading Dataset Infos from c:\Users\acer alan\AppData\Local\Programs\Python\Python311\Lib\site-packages\datasets\packaged_modules\generator


2024-06-03 13:28:01 - INFO - datasets.info - Loading Dataset Infos from c:\Users\acer alan\AppData\Local\Programs\Python\Python311\Lib\site-packages\datasets\packaged_modules\generator


Generating dataset generator (C:/Users/acer alan/.cache/huggingface/datasets/generator/default-8fe75ed2c3a85b51/0.0.0)


2024-06-03 13:28:01 - INFO - datasets.builder - Generating dataset generator (C:/Users/acer alan/.cache/huggingface/datasets/generator/default-8fe75ed2c3a85b51/0.0.0)


Downloading and preparing dataset generator/default to C:/Users/acer alan/.cache/huggingface/datasets/generator/default-8fe75ed2c3a85b51/0.0.0...


2024-06-03 13:28:01 - INFO - datasets.builder - Downloading and preparing dataset generator/default to C:/Users/acer alan/.cache/huggingface/datasets/generator/default-8fe75ed2c3a85b51/0.0.0...


Generating train split


2024-06-03 13:28:01 - INFO - datasets.builder - Generating train split


Generating train split: 0 examples [00:00, ? examples/s]

Unable to verify splits sizes.


2024-06-03 13:28:09 - INFO - datasets.utils.info_utils - Unable to verify splits sizes.


Dataset generator downloaded and prepared to C:/Users/acer alan/.cache/huggingface/datasets/generator/default-8fe75ed2c3a85b51/0.0.0. Subsequent calls will reuse this data.


2024-06-03 13:28:09 - INFO - datasets.builder - Dataset generator downloaded and prepared to C:/Users/acer alan/.cache/huggingface/datasets/generator/default-8fe75ed2c3a85b51/0.0.0. Subsequent calls will reuse this data.


Using custom data configuration default-7facf7caec1a9c35


2024-06-03 13:28:09 - INFO - datasets.builder - Using custom data configuration default-7facf7caec1a9c35


Loading Dataset Infos from c:\Users\acer alan\AppData\Local\Programs\Python\Python311\Lib\site-packages\datasets\packaged_modules\generator


2024-06-03 13:28:09 - INFO - datasets.info - Loading Dataset Infos from c:\Users\acer alan\AppData\Local\Programs\Python\Python311\Lib\site-packages\datasets\packaged_modules\generator


Generating dataset generator (C:/Users/acer alan/.cache/huggingface/datasets/generator/default-7facf7caec1a9c35/0.0.0)


2024-06-03 13:28:09 - INFO - datasets.builder - Generating dataset generator (C:/Users/acer alan/.cache/huggingface/datasets/generator/default-7facf7caec1a9c35/0.0.0)


Downloading and preparing dataset generator/default to C:/Users/acer alan/.cache/huggingface/datasets/generator/default-7facf7caec1a9c35/0.0.0...


2024-06-03 13:28:09 - INFO - datasets.builder - Downloading and preparing dataset generator/default to C:/Users/acer alan/.cache/huggingface/datasets/generator/default-7facf7caec1a9c35/0.0.0...


Generating train split


2024-06-03 13:28:09 - INFO - datasets.builder - Generating train split


Generating train split: 0 examples [00:00, ? examples/s]

Unable to verify splits sizes.


2024-06-03 13:28:11 - INFO - datasets.utils.info_utils - Unable to verify splits sizes.


Dataset generator downloaded and prepared to C:/Users/acer alan/.cache/huggingface/datasets/generator/default-7facf7caec1a9c35/0.0.0. Subsequent calls will reuse this data.


2024-06-03 13:28:11 - INFO - datasets.builder - Dataset generator downloaded and prepared to C:/Users/acer alan/.cache/huggingface/datasets/generator/default-7facf7caec1a9c35/0.0.0. Subsequent calls will reuse this data.


[INFO|trainer.py:641] 2024-06-03 13:28:17,911 >> Using auto half precision backend
[INFO|trainer.py:2078] 2024-06-03 13:28:18,055 >> ***** Running training *****
[INFO|trainer.py:2079] 2024-06-03 13:28:18,056 >>   Num examples = 6,706
[INFO|trainer.py:2080] 2024-06-03 13:28:18,057 >>   Num Epochs = 1
[INFO|trainer.py:2081] 2024-06-03 13:28:18,057 >>   Instantaneous batch size per device = 4
[INFO|trainer.py:2084] 2024-06-03 13:28:18,057 >>   Total train batch size (w. parallel, distributed & accumulation) = 4
[INFO|trainer.py:2085] 2024-06-03 13:28:18,058 >>   Gradient Accumulation steps = 1
[INFO|trainer.py:2086] 2024-06-03 13:28:18,058 >>   Total optimization steps = 1,677
[INFO|trainer.py:2087] 2024-06-03 13:28:18,061 >>   Number of trainable parameters = 25,165,824


  0%|          | 0/1677 [00:00<?, ?it/s]

{'loss': 1.3391, 'grad_norm': 0.859375, 'learning_rate': 2.9761904761904765e-07, 'epoch': 0.01}
{'loss': 1.3358, 'grad_norm': 0.9765625, 'learning_rate': 5.952380952380953e-07, 'epoch': 0.02}
{'loss': 1.2273, 'grad_norm': 0.484375, 'learning_rate': 8.928571428571429e-07, 'epoch': 0.04}
{'loss': 1.2731, 'grad_norm': 0.6015625, 'learning_rate': 1.1904761904761906e-06, 'epoch': 0.05}


[INFO|trainer.py:3410] 2024-06-03 13:33:26,084 >> Saving model checkpoint to ./sample_phi3_finetune_example\checkpoint-100


{'loss': 1.2757, 'grad_norm': 0.4921875, 'learning_rate': 1.4880952380952381e-06, 'epoch': 0.06}


[INFO|configuration_utils.py:733] 2024-06-03 13:33:27,800 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-128k-instruct\snapshots\5be6479b4bc06a081e8f4c6ece294241ccd32dec\config.json
[INFO|configuration_utils.py:796] 2024-06-03 13:33:27,803 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-128k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-128k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-128k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_

{'loss': 1.2827, 'grad_norm': 0.5859375, 'learning_rate': 1.7857142857142859e-06, 'epoch': 0.07}
{'loss': 1.2206, 'grad_norm': 0.52734375, 'learning_rate': 2.0833333333333334e-06, 'epoch': 0.08}
{'loss': 1.2604, 'grad_norm': 0.66015625, 'learning_rate': 2.380952380952381e-06, 'epoch': 0.1}
{'loss': 1.2433, 'grad_norm': 0.412109375, 'learning_rate': 2.6785714285714285e-06, 'epoch': 0.11}


[INFO|trainer.py:3410] 2024-06-03 13:38:39,316 >> Saving model checkpoint to ./sample_phi3_finetune_example\checkpoint-200


{'loss': 1.2834, 'grad_norm': 0.6015625, 'learning_rate': 2.9761904761904763e-06, 'epoch': 0.12}


[INFO|configuration_utils.py:733] 2024-06-03 13:38:40,830 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-128k-instruct\snapshots\5be6479b4bc06a081e8f4c6ece294241ccd32dec\config.json
[INFO|configuration_utils.py:796] 2024-06-03 13:38:40,832 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-128k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-128k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-128k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_

{'loss': 1.263, 'grad_norm': 0.69140625, 'learning_rate': 3.273809523809524e-06, 'epoch': 0.13}
{'loss': 1.2525, 'grad_norm': 0.3984375, 'learning_rate': 3.5714285714285718e-06, 'epoch': 0.14}
{'loss': 1.2064, 'grad_norm': 0.298828125, 'learning_rate': 3.869047619047619e-06, 'epoch': 0.16}
{'loss': 1.1364, 'grad_norm': 0.431640625, 'learning_rate': 4.166666666666667e-06, 'epoch': 0.17}


[INFO|trainer.py:3410] 2024-06-03 13:44:00,304 >> Saving model checkpoint to ./sample_phi3_finetune_example\checkpoint-300


{'loss': 1.1594, 'grad_norm': 0.55078125, 'learning_rate': 4.464285714285715e-06, 'epoch': 0.18}


[INFO|configuration_utils.py:733] 2024-06-03 13:44:01,619 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-128k-instruct\snapshots\5be6479b4bc06a081e8f4c6ece294241ccd32dec\config.json
[INFO|configuration_utils.py:796] 2024-06-03 13:44:01,623 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-128k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-128k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-128k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_

{'loss': 1.1471, 'grad_norm': 0.291015625, 'learning_rate': 4.761904761904762e-06, 'epoch': 0.19}
{'loss': 1.138, 'grad_norm': 0.2578125, 'learning_rate': 4.99989023370455e-06, 'epoch': 0.2}
{'loss': 1.1786, 'grad_norm': 0.232421875, 'learning_rate': 4.996049425354717e-06, 'epoch': 0.21}
{'loss': 1.2042, 'grad_norm': 0.8671875, 'learning_rate': 4.986729937340083e-06, 'epoch': 0.23}


[INFO|trainer.py:3410] 2024-06-03 13:49:28,129 >> Saving model checkpoint to ./sample_phi3_finetune_example\checkpoint-400


{'loss': 1.1528, 'grad_norm': 0.29296875, 'learning_rate': 4.971952225381176e-06, 'epoch': 0.24}


[INFO|configuration_utils.py:733] 2024-06-03 13:49:29,251 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-128k-instruct\snapshots\5be6479b4bc06a081e8f4c6ece294241ccd32dec\config.json
[INFO|configuration_utils.py:796] 2024-06-03 13:49:29,254 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-128k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-128k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-128k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_

{'loss': 1.1932, 'grad_norm': 0.26953125, 'learning_rate': 4.951748725674643e-06, 'epoch': 0.25}
{'loss': 1.1587, 'grad_norm': 0.328125, 'learning_rate': 4.9261637836977315e-06, 'epoch': 0.26}
{'loss': 1.2024, 'grad_norm': 0.2119140625, 'learning_rate': 4.895253556872611e-06, 'epoch': 0.27}
{'loss': 1.1471, 'grad_norm': 0.2236328125, 'learning_rate': 4.8590858913041775e-06, 'epoch': 0.29}


[INFO|trainer.py:3410] 2024-06-03 13:54:53,063 >> Saving model checkpoint to ./sample_phi3_finetune_example\checkpoint-500


{'loss': 1.137, 'grad_norm': 0.291015625, 'learning_rate': 4.817740172861903e-06, 'epoch': 0.3}


[INFO|configuration_utils.py:733] 2024-06-03 13:54:54,287 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-128k-instruct\snapshots\5be6479b4bc06a081e8f4c6ece294241ccd32dec\config.json
[INFO|configuration_utils.py:796] 2024-06-03 13:54:54,288 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-128k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-128k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-128k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_

{'loss': 1.1693, 'grad_norm': 0.234375, 'learning_rate': 4.771307152932579e-06, 'epoch': 0.31}
{'loss': 1.1901, 'grad_norm': 0.28125, 'learning_rate': 4.719888749226442e-06, 'epoch': 0.32}
{'loss': 1.1139, 'grad_norm': 0.28125, 'learning_rate': 4.663597822073865e-06, 'epoch': 0.33}
{'loss': 1.1683, 'grad_norm': 0.26953125, 'learning_rate': 4.602557926703675e-06, 'epoch': 0.35}


[INFO|trainer.py:3410] 2024-06-03 14:00:03,419 >> Saving model checkpoint to ./sample_phi3_finetune_example\checkpoint-600


{'loss': 1.1746, 'grad_norm': 0.375, 'learning_rate': 4.536903042046778e-06, 'epoch': 0.36}


[INFO|configuration_utils.py:733] 2024-06-03 14:00:04,982 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-128k-instruct\snapshots\5be6479b4bc06a081e8f4c6ece294241ccd32dec\config.json
[INFO|configuration_utils.py:796] 2024-06-03 14:00:04,985 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-128k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-128k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-128k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_

{'loss': 1.1092, 'grad_norm': 0.216796875, 'learning_rate': 4.4667772766604065e-06, 'epoch': 0.37}
{'loss': 1.125, 'grad_norm': 0.392578125, 'learning_rate': 4.392334552418421e-06, 'epoch': 0.38}
{'loss': 1.1584, 'grad_norm': 0.25390625, 'learning_rate': 4.313738266661979e-06, 'epoch': 0.39}
{'loss': 1.1235, 'grad_norm': 0.2216796875, 'learning_rate': 4.231160933552109e-06, 'epoch': 0.41}


[INFO|trainer.py:3410] 2024-06-03 14:05:11,954 >> Saving model checkpoint to ./sample_phi3_finetune_example\checkpoint-700


{'loss': 1.2566, 'grad_norm': 0.330078125, 'learning_rate': 4.144783805411415e-06, 'epoch': 0.42}


[INFO|configuration_utils.py:733] 2024-06-03 14:05:13,103 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-128k-instruct\snapshots\5be6479b4bc06a081e8f4c6ece294241ccd32dec\config.json
[INFO|configuration_utils.py:796] 2024-06-03 14:05:13,106 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-128k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-128k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-128k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_

{'loss': 1.164, 'grad_norm': 0.208984375, 'learning_rate': 4.054796474886038e-06, 'epoch': 0.43}
{'loss': 1.1195, 'grad_norm': 0.248046875, 'learning_rate': 3.961396458801099e-06, 'epoch': 0.44}
{'loss': 1.1012, 'grad_norm': 0.255859375, 'learning_rate': 3.864788764623042e-06, 'epoch': 0.45}
{'loss': 1.1042, 'grad_norm': 0.4296875, 'learning_rate': 3.7651854404804757e-06, 'epoch': 0.47}


[INFO|trainer.py:3410] 2024-06-03 14:10:27,247 >> Saving model checkpoint to ./sample_phi3_finetune_example\checkpoint-800


{'loss': 1.1627, 'grad_norm': 0.1953125, 'learning_rate': 3.662805109731168e-06, 'epoch': 0.48}


[INFO|configuration_utils.py:733] 2024-06-03 14:10:28,374 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-128k-instruct\snapshots\5be6479b4bc06a081e8f4c6ece294241ccd32dec\config.json
[INFO|configuration_utils.py:796] 2024-06-03 14:10:28,377 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-128k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-128k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-128k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_

{'loss': 1.1711, 'grad_norm': 0.2119140625, 'learning_rate': 3.557872491096812e-06, 'epoch': 0.49}
{'loss': 1.1929, 'grad_norm': 0.322265625, 'learning_rate': 3.450617905418834e-06, 'epoch': 0.5}
{'loss': 1.0958, 'grad_norm': 0.193359375, 'learning_rate': 3.341276770117877e-06, 'epoch': 0.51}
{'loss': 1.1335, 'grad_norm': 0.2294921875, 'learning_rate': 3.2300890824665942e-06, 'epoch': 0.52}


[INFO|trainer.py:3410] 2024-06-03 14:15:37,411 >> Saving model checkpoint to ./sample_phi3_finetune_example\checkpoint-900


{'loss': 1.137, 'grad_norm': 0.3125, 'learning_rate': 3.117298892809953e-06, 'epoch': 0.54}


[INFO|configuration_utils.py:733] 2024-06-03 14:15:38,616 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-128k-instruct\snapshots\5be6479b4bc06a081e8f4c6ece294241ccd32dec\config.json
[INFO|configuration_utils.py:796] 2024-06-03 14:15:38,620 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-128k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-128k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-128k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_

{'loss': 1.1752, 'grad_norm': 0.171875, 'learning_rate': 3.003153768889276e-06, 'epoch': 0.55}
{'loss': 1.1044, 'grad_norm': 0.1826171875, 'learning_rate': 2.887904252445806e-06, 'epoch': 0.56}
{'loss': 1.1124, 'grad_norm': 0.49609375, 'learning_rate': 2.7718033092965267e-06, 'epoch': 0.57}
{'loss': 1.2478, 'grad_norm': 0.25390625, 'learning_rate': 2.655105774089278e-06, 'epoch': 0.58}


[INFO|trainer.py:3410] 2024-06-03 14:20:47,684 >> Saving model checkpoint to ./sample_phi3_finetune_example\checkpoint-1000


{'loss': 1.1365, 'grad_norm': 0.20703125, 'learning_rate': 2.538067790955892e-06, 'epoch': 0.6}


[INFO|configuration_utils.py:733] 2024-06-03 14:20:48,967 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-128k-instruct\snapshots\5be6479b4bc06a081e8f4c6ece294241ccd32dec\config.json
[INFO|configuration_utils.py:796] 2024-06-03 14:20:48,969 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-128k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-128k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-128k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_

{'loss': 1.0598, 'grad_norm': 0.2578125, 'learning_rate': 2.420946251291103e-06, 'epoch': 0.61}
{'loss': 1.1299, 'grad_norm': 0.2109375, 'learning_rate': 2.303998229891249e-06, 'epoch': 0.62}
{'loss': 1.148, 'grad_norm': 0.29296875, 'learning_rate': 2.18748042069042e-06, 'epoch': 0.63}
{'loss': 1.1469, 'grad_norm': 0.2412109375, 'learning_rate': 2.0716485733325834e-06, 'epoch': 0.64}


[INFO|trainer.py:3410] 2024-06-03 14:25:57,247 >> Saving model checkpoint to ./sample_phi3_finetune_example\checkpoint-1100


{'loss': 1.1275, 'grad_norm': 0.27734375, 'learning_rate': 1.95675693181636e-06, 'epoch': 0.66}


[INFO|configuration_utils.py:733] 2024-06-03 14:25:58,660 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-128k-instruct\snapshots\5be6479b4bc06a081e8f4c6ece294241ccd32dec\config.json
[INFO|configuration_utils.py:796] 2024-06-03 14:25:58,661 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-128k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-128k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-128k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_

{'loss': 1.1711, 'grad_norm': 0.25, 'learning_rate': 1.8430576764446046e-06, 'epoch': 0.67}
{'loss': 1.1191, 'grad_norm': 0.2412109375, 'learning_rate': 1.730800370303683e-06, 'epoch': 0.68}
{'loss': 1.2033, 'grad_norm': 0.328125, 'learning_rate': 1.6202314114873693e-06, 'epoch': 0.69}
{'loss': 1.1747, 'grad_norm': 0.24609375, 'learning_rate': 1.51159349226773e-06, 'epoch': 0.7}


[INFO|trainer.py:3410] 2024-06-03 14:31:05,589 >> Saving model checkpoint to ./sample_phi3_finetune_example\checkpoint-1200


{'loss': 1.1467, 'grad_norm': 0.24609375, 'learning_rate': 1.4051250664000515e-06, 'epoch': 0.72}


[INFO|configuration_utils.py:733] 2024-06-03 14:31:07,020 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-128k-instruct\snapshots\5be6479b4bc06a081e8f4c6ece294241ccd32dec\config.json
[INFO|configuration_utils.py:796] 2024-06-03 14:31:07,023 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-128k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-128k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-128k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_

{'loss': 1.1213, 'grad_norm': 0.21484375, 'learning_rate': 1.3010598257310642e-06, 'epoch': 0.73}
{'loss': 1.1539, 'grad_norm': 0.423828125, 'learning_rate': 1.1996261872592754e-06, 'epoch': 0.74}
{'loss': 1.0518, 'grad_norm': 0.296875, 'learning_rate': 1.1010467917732783e-06, 'epoch': 0.75}
{'loss': 1.1907, 'grad_norm': 0.263671875, 'learning_rate': 1.005538015168487e-06, 'epoch': 0.76}


[INFO|trainer.py:3410] 2024-06-03 14:36:17,182 >> Saving model checkpoint to ./sample_phi3_finetune_example\checkpoint-1300


{'loss': 1.0732, 'grad_norm': 0.2109375, 'learning_rate': 9.133094935149592e-07, 'epoch': 0.78}


[INFO|configuration_utils.py:733] 2024-06-03 14:36:18,361 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-128k-instruct\snapshots\5be6479b4bc06a081e8f4c6ece294241ccd32dec\config.json
[INFO|configuration_utils.py:796] 2024-06-03 14:36:18,363 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-128k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-128k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-128k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_

{'loss': 1.1658, 'grad_norm': 0.177734375, 'learning_rate': 8.245636629187121e-07, 'epoch': 0.79}
{'loss': 1.0766, 'grad_norm': 0.19921875, 'learning_rate': 7.394953151865444e-07, 'epoch': 0.8}
{'loss': 1.1737, 'grad_norm': 0.208984375, 'learning_rate': 6.582911702696334e-07, 'epoch': 0.81}
{'loss': 1.0915, 'grad_norm': 0.212890625, 'learning_rate': 5.811294664243752e-07, 'epoch': 0.82}


[INFO|trainer.py:3410] 2024-06-03 14:41:26,529 >> Saving model checkpoint to ./sample_phi3_finetune_example\checkpoint-1400


{'loss': 1.1312, 'grad_norm': 0.1884765625, 'learning_rate': 5.081795689900398e-07, 'epoch': 0.83}


[INFO|configuration_utils.py:733] 2024-06-03 14:41:27,395 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-128k-instruct\snapshots\5be6479b4bc06a081e8f4c6ece294241ccd32dec\config.json
[INFO|configuration_utils.py:796] 2024-06-03 14:41:27,396 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-128k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-128k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-128k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_

{'loss': 1.1867, 'grad_norm': 0.181640625, 'learning_rate': 4.396015986419483e-07, 'epoch': 0.85}
{'loss': 1.1985, 'grad_norm': 0.1904296875, 'learning_rate': 3.7554607993613823e-07, 'epoch': 0.86}
{'loss': 1.1426, 'grad_norm': 0.25, 'learning_rate': 3.1615361091693694e-07, 'epoch': 0.87}
{'loss': 1.1924, 'grad_norm': 0.2060546875, 'learning_rate': 2.615545545126416e-07, 'epoch': 0.88}


[INFO|trainer.py:3410] 2024-06-03 14:46:54,204 >> Saving model checkpoint to ./sample_phi3_finetune_example\checkpoint-1500


{'loss': 1.1344, 'grad_norm': 0.21484375, 'learning_rate': 2.118687523966559e-07, 'epoch': 0.89}


[INFO|configuration_utils.py:733] 2024-06-03 14:46:55,940 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-128k-instruct\snapshots\5be6479b4bc06a081e8f4c6ece294241ccd32dec\config.json
[INFO|configuration_utils.py:796] 2024-06-03 14:46:55,942 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-128k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-128k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-128k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_

{'loss': 1.153, 'grad_norm': 0.2138671875, 'learning_rate': 1.6720526194217186e-07, 'epoch': 0.91}
{'loss': 1.1558, 'grad_norm': 0.1708984375, 'learning_rate': 1.2766211684773156e-07, 'epoch': 0.92}
{'loss': 1.1415, 'grad_norm': 0.21484375, 'learning_rate': 9.332611195910585e-08, 'epoch': 0.93}
{'loss': 1.1919, 'grad_norm': 0.349609375, 'learning_rate': 6.427261275978369e-08, 'epoch': 0.94}


[INFO|trainer.py:3410] 2024-06-03 14:52:05,187 >> Saving model checkpoint to ./sample_phi3_finetune_example\checkpoint-1600


{'loss': 1.0785, 'grad_norm': 0.2353515625, 'learning_rate': 4.056538994822945e-08, 'epoch': 0.95}


[INFO|configuration_utils.py:733] 2024-06-03 14:52:06,506 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-128k-instruct\snapshots\5be6479b4bc06a081e8f4c6ece294241ccd32dec\config.json
[INFO|configuration_utils.py:796] 2024-06-03 14:52:06,509 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-128k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-128k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-128k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_

{'loss': 1.1849, 'grad_norm': 0.228515625, 'learning_rate': 2.2256479464999315e-08, 'epoch': 0.97}
{'loss': 1.1015, 'grad_norm': 0.32421875, 'learning_rate': 9.386068276959204e-09, 'epoch': 0.98}
{'loss': 1.1445, 'grad_norm': 0.17578125, 'learning_rate': 1.982406169283857e-09, 'epoch': 0.99}


[INFO|trainer.py:2329] 2024-06-03 14:56:01,058 >> 

Training completed. Do not forget to share your model on huggingface.co/models =)




{'train_runtime': 5262.9957, 'train_samples_per_second': 1.274, 'train_steps_per_second': 0.319, 'train_loss': 1.1665670079849415, 'epoch': 1.0}
***** train metrics *****
  epoch                    =         1.0
  total_flos               = 143808611GF
  train_loss               =      1.1666
  train_runtime            =  1:27:42.99
  train_samples_per_second =       1.274
  train_steps_per_second   =       0.319


In [13]:
#############
# Evaluation
#############
tokenizer.padding_side = 'left'
metrics = trainer.evaluate()
metrics["eval_samples"] = len(processed_test_dataset)
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

[INFO|trainer.py:3719] 2024-06-03 14:56:01,084 >> ***** Running Evaluation *****
[INFO|trainer.py:3721] 2024-06-03 14:56:01,085 >>   Num examples = 1328
[INFO|trainer.py:3724] 2024-06-03 14:56:01,085 >>   Batch size = 4


  0%|          | 0/332 [00:00<?, ?it/s]

***** eval metrics *****
  epoch                   =        1.0
  eval_loss               =     1.1452
  eval_runtime            = 0:05:02.28
  eval_samples            =       1000
  eval_samples_per_second =      4.393
  eval_steps_per_second   =      1.098


In [14]:
# ############
# # Save model
# ############
trainer.save_model(train_conf.output_dir)

[INFO|trainer.py:3410] 2024-06-03 15:01:03,428 >> Saving model checkpoint to ./sample_phi3_finetune_example
[INFO|configuration_utils.py:733] 2024-06-03 15:01:04,276 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-128k-instruct\snapshots\5be6479b4bc06a081e8f4c6ece294241ccd32dec\config.json
[INFO|configuration_utils.py:796] 2024-06-03 15:01:04,276 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-128k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-128k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-128k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_

In [16]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [17]:
model.push_to_hub("sample_phi3_finetune_example",
                  use_auth_token=True,
                  commit_message="Training Phi-3",
                  private=True)

[INFO|configuration_utils.py:472] 2024-06-03 15:24:32,828 >> Configuration saved in sample_phi3_finetune_example\config.json
[INFO|configuration_utils.py:731] 2024-06-03 15:24:32,829 >> Configuration saved in sample_phi3_finetune_example\generation_config.json
[INFO|modeling_utils.py:2626] 2024-06-03 15:24:40,206 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 2 checkpoint shards. You can find where each parameters has been saved in the index located at sample_phi3_finetune_example\model.safetensors.index.json.
[INFO|hub.py:759] 2024-06-03 15:24:45,366 >> Uploading the following files to wenlianghuang/sample_phi3_finetune_example: config.json,generation_config.json,model-00001-of-00002.safetensors,model-00002-of-00002.safetensors,model.safetensors.index.json,README.md


model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.71G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/wenlianghuang/sample_phi3_finetune_example/commit/880aeee5a7b4893531bd3994d8aadab14056fa46', commit_message='Training Phi-3', commit_description='', oid='880aeee5a7b4893531bd3994d8aadab14056fa46', pr_url=None, pr_revision=None, pr_num=None)

In [22]:
device = torch.device('cuda' if torch.cuda.is_available()else 'cpu')
print(device)

cuda


In [23]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# Define the prompt and prepare the input
prompt = "Write a step-by-step recipe on how to make a vegan black bean soup, including all necessary ingredients and cooking instructions. Please also include any variations or substitutions for dietary restrictions."
tokenizer = AutoTokenizer.from_pretrained("wenlianghuang/sample_phi3_finetune_example")
inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)

# Check if CUDA (GPU) is available and move model and inputs to the appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model weights from hub and move to the appropriate device
model_id = "wenlianghuang/sample_phi3_finetune_example"
model = AutoModelForCausalLM.from_pretrained(model_id)
trained_model = PeftModel.from_pretrained(model, model_id)
trained_model.to(device)

# Move inputs to the same device as the model
inputs = {key: value.to(device) for key, value in inputs.items()}

# Run inference
outputs = trained_model.generate(**inputs, max_length=1000)

# Decode the outputs
text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
print(text)


[INFO|tokenization_auto.py:666] 2024-06-03 16:38:01,896 >> Could not locate the tokenizer configuration file, will try to use the model config instead.


config.json:   0%|          | 0.00/3.60k [00:00<?, ?B/s]

[INFO|configuration_utils.py:733] 2024-06-03 16:38:02,317 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--wenlianghuang--sample_phi3_finetune_example\snapshots\e5c922a3b7d6c050381aca3ec1c728746e13689b\config.json
[INFO|configuration_utils.py:796] 2024-06-03 16:38:02,319 >> Model config Phi3Config {
  "_name_or_path": "wenlianghuang/sample_phi3_finetune_example",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-128k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-128k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_h

OSError: Can't load tokenizer for 'wenlianghuang/sample_phi3_finetune_example'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'wenlianghuang/sample_phi3_finetune_example' is the correct path to a directory containing all relevant files for a LlamaTokenizerFast tokenizer.