In [None]:
# https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/blob/main/sample_finetune.py

In [1]:
import sys
import logging

import datasets
from datasets import load_dataset
from peft import LoraConfig
import torch
import transformers
from trl import SFTTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig

In [2]:
# Notebook-level logger; its level is set further down from the training
# arguments' per-process log level.
logger = logging.getLogger(__name__)

In [3]:
# Plain keyword arguments that are later unpacked into TrainingArguments.
# Kept as a dict so the individual knobs are easy to tweak from one place.
training_config = dict(
    bf16=True,
    do_eval=False,
    learning_rate=5e-6,
    log_level="info",
    logging_steps=20,
    logging_strategy="steps",
    lr_scheduler_type="cosine",
    num_train_epochs=1,
    max_steps=-1,
    output_dir="./checkpoint_dir",
    overwrite_output_dir=True,
    per_device_eval_batch_size=4,
    per_device_train_batch_size=4,
    remove_unused_columns=True,
    save_steps=100,
    save_total_limit=1,
    seed=0,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs=dict(use_reentrant=False),
    gradient_accumulation_steps=1,
    warmup_ratio=0.2,
)

In [4]:
# Keyword arguments that are later unpacked into LoraConfig.
peft_config = dict(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules="all-linear",
    modules_to_save=None,
)

In [5]:
# Turn the plain config dicts into the typed objects that the logging setup
# and the SFTTrainer consume.
train_conf = TrainingArguments(**training_config)
peft_conf = LoraConfig(**peft_config)


In [6]:
# Send log records to stdout so they show up in the notebook cell output.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)

# Use one verbosity level (taken from the training arguments) for this
# notebook's logger and for the datasets / transformers library loggers.
log_level = train_conf.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()

# Log on each process a small summary.
# "16-bits training" covers both half-precision modes: this notebook trains
# with bf16, so reporting only `fp16` would misleadingly print False.
logger.warning(
    f"Process rank: {train_conf.local_rank}, device: {train_conf.device}, n_gpu: {train_conf.n_gpu},"
    + f" distributed training: {bool(train_conf.local_rank != -1)},"
    + f" 16-bits training: {train_conf.fp16 or train_conf.bf16}"
)
logger.info(f"Training/evaluation parameters {train_conf}")
logger.info(f"PEFT parameters {peft_conf}")

2024-05-17 14:39:23 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'gradient_accumulation_kwargs': None},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=True,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_steps=None,
evaluation_strategy=IntervalStrategy.NO,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O

In [7]:
################
# Model Loading
################
checkpoint_path = "microsoft/Phi-3-mini-4k-instruct"
# checkpoint_path = "microsoft/Phi-3-mini-128k-instruct"

# Options forwarded verbatim to `from_pretrained` for the model.
model_kwargs = {
    "use_cache": False,
    "trust_remote_code": True,
    # "attn_implementation": "flash_attention_2",  # load the model with flash-attention support
    "torch_dtype": torch.bfloat16,
    "device_map": None,
}
model = AutoModelForCausalLM.from_pretrained(checkpoint_path, **model_kwargs)

# Tokenizer comes from the same checkpoint; the attributes below are
# overridden after loading, in this order.
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
tokenizer.model_max_length = 2048
# Pad with the unk token rather than the eos token to prevent endless generation.
tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
tokenizer.padding_side = "right"

[INFO|configuration_utils.py:726] 2024-05-17 14:39:24,669 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-4k-instruct\snapshots\8f5f3a02ec472594e949c39f8e38c7be8d983bcd\config.json
[INFO|configuration_utils.py:726] 2024-05-17 14:39:24,899 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-4k-instruct\snapshots\8f5f3a02ec472594e949c39f8e38c7be8d983bcd\config.json
[INFO|configuration_utils.py:789] 2024-05-17 14:39:24,900 >> Model config Phi3Config {
  "_name_or_path": "microsoft/Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 



[INFO|modeling_utils.py:3429] 2024-05-17 14:39:25,206 >> loading weights file model.safetensors from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-4k-instruct\snapshots\8f5f3a02ec472594e949c39f8e38c7be8d983bcd\model.safetensors.index.json
[INFO|modeling_utils.py:1494] 2024-05-17 14:39:25,208 >> Instantiating Phi3ForCausalLM model under default dtype torch.bfloat16.
[INFO|configuration_utils.py:928] 2024-05-17 14:39:25,209 >> Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 32000,
  "pad_token_id": 32000,
  "use_cache": false
}



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO|modeling_utils.py:4170] 2024-05-17 14:39:34,214 >> All model checkpoint weights were used when initializing Phi3ForCausalLM.

[INFO|modeling_utils.py:4178] 2024-05-17 14:39:34,216 >> All the weights of Phi3ForCausalLM were initialized from the model checkpoint at microsoft/Phi-3-mini-4k-instruct.
If your task is similar to the task the model of the checkpoint was trained on, you can already use Phi3ForCausalLM for predictions without further training.
[INFO|configuration_utils.py:883] 2024-05-17 14:39:34,448 >> loading configuration file generation_config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-4k-instruct\snapshots\8f5f3a02ec472594e949c39f8e38c7be8d983bcd\generation_config.json
[INFO|configuration_utils.py:928] 2024-05-17 14:39:34,448 >> Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": [
    32000,
    32001,
    32007
  ],
  "pad_token_id": 32000
}

[INFO|tokenization_utils_base.py:2087] 2024-05-17 14:3

In [8]:
##################
# Data Processing
##################
def apply_chat_template(
    example,
    tokenizer,
):
    """Render one chat example into a single string stored under ``"text"``.

    Args:
        example: Mapping with a ``"messages"`` list of ``{"role", "content"}``
            dicts. The list is modified in place when a system message has to
            be prepended.
        tokenizer: Object exposing
            ``apply_chat_template(messages, tokenize=..., add_generation_prompt=...)``.

    Returns:
        The same ``example`` object, with ``example["text"]`` set to the
        rendered conversation.
    """
    messages = example["messages"]
    # Add an empty system message if there is none. The emptiness check comes
    # first so a conversation without any turns does not raise IndexError.
    if not messages or messages[0]["role"] != "system":
        messages.insert(0, {"role": "system", "content": ""})
    example["text"] = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=False)
    return example

# Load the UltraChat 200k dataset and pick out its SFT train/test splits.
raw_dataset = load_dataset("HuggingFaceH4/ultrachat_200k")
train_dataset, test_dataset = raw_dataset["train_sft"], raw_dataset["test_sft"]
# Names of the original columns; passed as `remove_columns` to the map calls.
column_names = [name for name in train_dataset.features]

Overwrite dataset info from restored data version if exists.


2024-05-17 14:39:51 - INFO - datasets.builder - Overwrite dataset info from restored data version if exists.


Loading Dataset info from C:\Users\acer alan\.cache\huggingface\datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb


2024-05-17 14:39:51 - INFO - datasets.info - Loading Dataset info from C:\Users\acer alan\.cache\huggingface\datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb


Found cached dataset ultrachat_200k (C:/Users/acer alan/.cache/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb)


2024-05-17 14:39:51 - INFO - datasets.builder - Found cached dataset ultrachat_200k (C:/Users/acer alan/.cache/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb)


Loading Dataset info from C:/Users/acer alan/.cache/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb


2024-05-17 14:39:51 - INFO - datasets.info - Loading Dataset info from C:/Users/acer alan/.cache/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb


In [9]:
# Apply the chat template to every training example. The tokenizer is handed
# to `apply_chat_template` through `fn_kwargs`, the original columns listed in
# `column_names` are removed from the result, and the work is spread over
# 10 processes.
processed_train_dataset = train_dataset.map(
    apply_chat_template,
    fn_kwargs={"tokenizer": tokenizer},
    num_proc=10,
    remove_columns=column_names,
    desc="Applying chat template to train_sft",
)


Process #0 will write at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-a4af82e2ea04f2ca_00000_of_00010.arrow


2024-05-17 14:39:51 - INFO - datasets.arrow_dataset - Process #0 will write at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-a4af82e2ea04f2ca_00000_of_00010.arrow


Process #1 will write at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-a4af82e2ea04f2ca_00001_of_00010.arrow


2024-05-17 14:39:51 - INFO - datasets.arrow_dataset - Process #1 will write at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-a4af82e2ea04f2ca_00001_of_00010.arrow


Process #2 will write at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-a4af82e2ea04f2ca_00002_of_00010.arrow


2024-05-17 14:39:51 - INFO - datasets.arrow_dataset - Process #2 will write at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-a4af82e2ea04f2ca_00002_of_00010.arrow


Process #3 will write at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-a4af82e2ea04f2ca_00003_of_00010.arrow


2024-05-17 14:39:51 - INFO - datasets.arrow_dataset - Process #3 will write at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-a4af82e2ea04f2ca_00003_of_00010.arrow


Process #4 will write at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-a4af82e2ea04f2ca_00004_of_00010.arrow


2024-05-17 14:39:51 - INFO - datasets.arrow_dataset - Process #4 will write at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-a4af82e2ea04f2ca_00004_of_00010.arrow


Process #5 will write at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-a4af82e2ea04f2ca_00005_of_00010.arrow


2024-05-17 14:39:51 - INFO - datasets.arrow_dataset - Process #5 will write at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-a4af82e2ea04f2ca_00005_of_00010.arrow


Process #6 will write at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-a4af82e2ea04f2ca_00006_of_00010.arrow


2024-05-17 14:39:51 - INFO - datasets.arrow_dataset - Process #6 will write at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-a4af82e2ea04f2ca_00006_of_00010.arrow


Process #7 will write at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-a4af82e2ea04f2ca_00007_of_00010.arrow


2024-05-17 14:39:51 - INFO - datasets.arrow_dataset - Process #7 will write at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-a4af82e2ea04f2ca_00007_of_00010.arrow


Process #8 will write at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-a4af82e2ea04f2ca_00008_of_00010.arrow


2024-05-17 14:39:51 - INFO - datasets.arrow_dataset - Process #8 will write at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-a4af82e2ea04f2ca_00008_of_00010.arrow


Process #9 will write at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-a4af82e2ea04f2ca_00009_of_00010.arrow


2024-05-17 14:39:51 - INFO - datasets.arrow_dataset - Process #9 will write at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-a4af82e2ea04f2ca_00009_of_00010.arrow


Loading cached processed dataset at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-a4af82e2ea04f2ca_*_of_00010.arrow


2024-05-17 14:39:52 - INFO - datasets.arrow_dataset - Loading cached processed dataset at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-a4af82e2ea04f2ca_*_of_00010.arrow


Concatenating 10 shards


2024-05-17 14:39:52 - INFO - datasets.arrow_dataset - Concatenating 10 shards


In [10]:
# Same preprocessing as for the training split, applied to the test split:
# render each conversation with `apply_chat_template` and drop the original
# columns listed in `column_names`.
processed_test_dataset = test_dataset.map(
    apply_chat_template,
    fn_kwargs={"tokenizer": tokenizer},
    num_proc=10,
    remove_columns=column_names,
    desc="Applying chat template to test_sft",
)

Process #0 will write at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-6075d1b3716be38a_00000_of_00010.arrow


2024-05-17 14:39:52 - INFO - datasets.arrow_dataset - Process #0 will write at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-6075d1b3716be38a_00000_of_00010.arrow


Process #1 will write at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-6075d1b3716be38a_00001_of_00010.arrow


2024-05-17 14:39:52 - INFO - datasets.arrow_dataset - Process #1 will write at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-6075d1b3716be38a_00001_of_00010.arrow


Process #2 will write at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-6075d1b3716be38a_00002_of_00010.arrow


2024-05-17 14:39:52 - INFO - datasets.arrow_dataset - Process #2 will write at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-6075d1b3716be38a_00002_of_00010.arrow


Process #3 will write at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-6075d1b3716be38a_00003_of_00010.arrow


2024-05-17 14:39:52 - INFO - datasets.arrow_dataset - Process #3 will write at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-6075d1b3716be38a_00003_of_00010.arrow


Process #4 will write at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-6075d1b3716be38a_00004_of_00010.arrow


2024-05-17 14:39:52 - INFO - datasets.arrow_dataset - Process #4 will write at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-6075d1b3716be38a_00004_of_00010.arrow


Process #5 will write at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-6075d1b3716be38a_00005_of_00010.arrow


2024-05-17 14:39:52 - INFO - datasets.arrow_dataset - Process #5 will write at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-6075d1b3716be38a_00005_of_00010.arrow


Process #6 will write at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-6075d1b3716be38a_00006_of_00010.arrow


2024-05-17 14:39:52 - INFO - datasets.arrow_dataset - Process #6 will write at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-6075d1b3716be38a_00006_of_00010.arrow


Process #7 will write at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-6075d1b3716be38a_00007_of_00010.arrow


2024-05-17 14:39:52 - INFO - datasets.arrow_dataset - Process #7 will write at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-6075d1b3716be38a_00007_of_00010.arrow


Process #8 will write at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-6075d1b3716be38a_00008_of_00010.arrow


2024-05-17 14:39:52 - INFO - datasets.arrow_dataset - Process #8 will write at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-6075d1b3716be38a_00008_of_00010.arrow


Process #9 will write at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-6075d1b3716be38a_00009_of_00010.arrow


2024-05-17 14:39:52 - INFO - datasets.arrow_dataset - Process #9 will write at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-6075d1b3716be38a_00009_of_00010.arrow


Loading cached processed dataset at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-6075d1b3716be38a_*_of_00010.arrow


2024-05-17 14:39:52 - INFO - datasets.arrow_dataset - Loading cached processed dataset at C:\Users\acer alan\.cache\huggingface\datasets\HuggingFaceH4___ultrachat_200k\default\0.0.0\f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb\cache-6075d1b3716be38a_*_of_00010.arrow


Concatenating 10 shards


2024-05-17 14:39:52 - INFO - datasets.arrow_dataset - Concatenating 10 shards


In [11]:
###########
# Training
###########
# Build the supervised fine-tuning trainer from the loaded model, the training
# arguments, the LoRA config and the chat-templated datasets ("text" column).
# NOTE(review): `eval_dataset` is supplied although `do_eval` is False in
# `training_config` -- confirm whether evaluation is meant to run.
# NOTE(review): `max_seq_length=1024` here differs from
# `tokenizer.model_max_length = 2048` set at load time -- confirm which limit
# is intended.
trainer = SFTTrainer(
    model=model,
    args=train_conf,
    peft_config=peft_conf,
    train_dataset=processed_train_dataset,
    eval_dataset=processed_test_dataset,
    max_seq_length=1024,
    dataset_text_field="text",
    tokenizer=tokenizer,
    packing=True
)
# Run training, then log and persist the resulting metrics and trainer state.
train_result = trainer.train()
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

Using custom data configuration default-88989bef46966697


2024-05-17 14:39:52 - INFO - datasets.builder - Using custom data configuration default-88989bef46966697


Loading Dataset Infos from c:\Users\acer alan\AppData\Local\Programs\Python\Python311\Lib\site-packages\datasets\packaged_modules\generator


2024-05-17 14:39:52 - INFO - datasets.info - Loading Dataset Infos from c:\Users\acer alan\AppData\Local\Programs\Python\Python311\Lib\site-packages\datasets\packaged_modules\generator


Generating dataset generator (C:/Users/acer alan/.cache/huggingface/datasets/generator/default-88989bef46966697/0.0.0)


2024-05-17 14:39:52 - INFO - datasets.builder - Generating dataset generator (C:/Users/acer alan/.cache/huggingface/datasets/generator/default-88989bef46966697/0.0.0)


Downloading and preparing dataset generator/default to C:/Users/acer alan/.cache/huggingface/datasets/generator/default-88989bef46966697/0.0.0...


2024-05-17 14:39:52 - INFO - datasets.builder - Downloading and preparing dataset generator/default to C:/Users/acer alan/.cache/huggingface/datasets/generator/default-88989bef46966697/0.0.0...


Generating train split


2024-05-17 14:39:52 - INFO - datasets.builder - Generating train split


Generating train split: 0 examples [00:00, ? examples/s]

Unable to verify splits sizes.


2024-05-17 14:45:39 - INFO - datasets.utils.info_utils - Unable to verify splits sizes.


Dataset generator downloaded and prepared to C:/Users/acer alan/.cache/huggingface/datasets/generator/default-88989bef46966697/0.0.0. Subsequent calls will reuse this data.


2024-05-17 14:45:39 - INFO - datasets.builder - Dataset generator downloaded and prepared to C:/Users/acer alan/.cache/huggingface/datasets/generator/default-88989bef46966697/0.0.0. Subsequent calls will reuse this data.


Using custom data configuration default-5a45937f2ee2538f


2024-05-17 14:45:39 - INFO - datasets.builder - Using custom data configuration default-5a45937f2ee2538f


Loading Dataset Infos from c:\Users\acer alan\AppData\Local\Programs\Python\Python311\Lib\site-packages\datasets\packaged_modules\generator


2024-05-17 14:45:39 - INFO - datasets.info - Loading Dataset Infos from c:\Users\acer alan\AppData\Local\Programs\Python\Python311\Lib\site-packages\datasets\packaged_modules\generator


Generating dataset generator (C:/Users/acer alan/.cache/huggingface/datasets/generator/default-5a45937f2ee2538f/0.0.0)


2024-05-17 14:45:39 - INFO - datasets.builder - Generating dataset generator (C:/Users/acer alan/.cache/huggingface/datasets/generator/default-5a45937f2ee2538f/0.0.0)


Downloading and preparing dataset generator/default to C:/Users/acer alan/.cache/huggingface/datasets/generator/default-5a45937f2ee2538f/0.0.0...


2024-05-17 14:45:39 - INFO - datasets.builder - Downloading and preparing dataset generator/default to C:/Users/acer alan/.cache/huggingface/datasets/generator/default-5a45937f2ee2538f/0.0.0...


Generating train split


2024-05-17 14:45:39 - INFO - datasets.builder - Generating train split


Generating train split: 0 examples [00:00, ? examples/s]

Unable to verify splits sizes.


2024-05-17 14:46:17 - INFO - datasets.utils.info_utils - Unable to verify splits sizes.


Dataset generator downloaded and prepared to C:/Users/acer alan/.cache/huggingface/datasets/generator/default-5a45937f2ee2538f/0.0.0. Subsequent calls will reuse this data.


2024-05-17 14:46:17 - INFO - datasets.builder - Dataset generator downloaded and prepared to C:/Users/acer alan/.cache/huggingface/datasets/generator/default-5a45937f2ee2538f/0.0.0. Subsequent calls will reuse this data.


[INFO|trainer.py:626] 2024-05-17 14:46:26,240 >> Using auto half precision backend
[INFO|trainer.py:2048] 2024-05-17 14:46:26,368 >> ***** Running training *****
[INFO|trainer.py:2049] 2024-05-17 14:46:26,369 >>   Num examples = 280,637
[INFO|trainer.py:2050] 2024-05-17 14:46:26,369 >>   Num Epochs = 1
[INFO|trainer.py:2051] 2024-05-17 14:46:26,370 >>   Instantaneous batch size per device = 4
[INFO|trainer.py:2054] 2024-05-17 14:46:26,371 >>   Total train batch size (w. parallel, distributed & accumulation) = 4
[INFO|trainer.py:2055] 2024-05-17 14:46:26,371 >>   Gradient Accumulation steps = 1
[INFO|trainer.py:2056] 2024-05-17 14:46:26,372 >>   Total optimization steps = 70,160
[INFO|trainer.py:2057] 2024-05-17 14:46:26,374 >>   Number of trainable parameters = 25,165,824


  0%|          | 0/70160 [00:00<?, ?it/s]

{'loss': 1.2806, 'grad_norm': 0.4609375, 'learning_rate': 7.126567844925884e-09, 'epoch': 0.0}
{'loss': 1.3621, 'grad_norm': 0.4921875, 'learning_rate': 1.4253135689851768e-08, 'epoch': 0.0}
{'loss': 1.2232, 'grad_norm': 0.63671875, 'learning_rate': 2.1379703534777654e-08, 'epoch': 0.0}
{'loss': 1.329, 'grad_norm': 0.376953125, 'learning_rate': 2.8506271379703537e-08, 'epoch': 0.0}


[INFO|trainer.py:3305] 2024-05-17 14:53:06,880 >> Saving model checkpoint to ./checkpoint_dir\checkpoint-100


{'loss': 1.3711, 'grad_norm': 0.37890625, 'learning_rate': 3.563283922462942e-08, 'epoch': 0.0}


[INFO|configuration_utils.py:726] 2024-05-17 14:53:08,075 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-4k-instruct\snapshots\8f5f3a02ec472594e949c39f8e38c7be8d983bcd\config.json
[INFO|configuration_utils.py:789] 2024-05-17 14:53:08,076 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max

{'loss': 1.3315, 'grad_norm': 0.44140625, 'learning_rate': 4.275940706955531e-08, 'epoch': 0.0}
{'loss': 1.3025, 'grad_norm': 0.53125, 'learning_rate': 4.988597491448119e-08, 'epoch': 0.0}
{'loss': 1.3158, 'grad_norm': 0.47265625, 'learning_rate': 5.701254275940707e-08, 'epoch': 0.0}
{'loss': 1.3421, 'grad_norm': 0.53125, 'learning_rate': 6.413911060433296e-08, 'epoch': 0.0}


[INFO|trainer.py:3305] 2024-05-17 14:59:45,970 >> Saving model checkpoint to ./checkpoint_dir\checkpoint-200


{'loss': 1.288, 'grad_norm': 2.484375, 'learning_rate': 7.126567844925884e-08, 'epoch': 0.0}


[INFO|configuration_utils.py:726] 2024-05-17 14:59:47,084 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-4k-instruct\snapshots\8f5f3a02ec472594e949c39f8e38c7be8d983bcd\config.json
[INFO|configuration_utils.py:789] 2024-05-17 14:59:47,085 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max

{'loss': 1.3619, 'grad_norm': 0.5625, 'learning_rate': 7.839224629418473e-08, 'epoch': 0.0}
{'loss': 1.3335, 'grad_norm': 2.125, 'learning_rate': 8.551881413911062e-08, 'epoch': 0.0}
{'loss': 1.2682, 'grad_norm': 0.380859375, 'learning_rate': 9.264538198403649e-08, 'epoch': 0.0}
{'loss': 1.3986, 'grad_norm': 0.53125, 'learning_rate': 9.977194982896237e-08, 'epoch': 0.0}


[INFO|trainer.py:3305] 2024-05-17 15:06:25,919 >> Saving model checkpoint to ./checkpoint_dir\checkpoint-300


{'loss': 1.3399, 'grad_norm': 0.6015625, 'learning_rate': 1.0689851767388827e-07, 'epoch': 0.0}


[INFO|configuration_utils.py:726] 2024-05-17 15:06:27,153 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-4k-instruct\snapshots\8f5f3a02ec472594e949c39f8e38c7be8d983bcd\config.json
[INFO|configuration_utils.py:789] 2024-05-17 15:06:27,154 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max

{'loss': 1.3714, 'grad_norm': 0.396484375, 'learning_rate': 1.1402508551881415e-07, 'epoch': 0.0}
{'loss': 1.2865, 'grad_norm': 0.6171875, 'learning_rate': 1.2115165336374005e-07, 'epoch': 0.0}
{'loss': 1.2807, 'grad_norm': 0.369140625, 'learning_rate': 1.2827822120866592e-07, 'epoch': 0.01}
{'loss': 1.3257, 'grad_norm': 0.57421875, 'learning_rate': 1.3540478905359182e-07, 'epoch': 0.01}


[INFO|trainer.py:3305] 2024-05-17 15:13:05,440 >> Saving model checkpoint to ./checkpoint_dir\checkpoint-400


{'loss': 1.342, 'grad_norm': 0.55078125, 'learning_rate': 1.425313568985177e-07, 'epoch': 0.01}


[INFO|configuration_utils.py:726] 2024-05-17 15:13:06,503 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-4k-instruct\snapshots\8f5f3a02ec472594e949c39f8e38c7be8d983bcd\config.json
[INFO|configuration_utils.py:789] 2024-05-17 15:13:06,504 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max

{'loss': 1.2448, 'grad_norm': 0.4765625, 'learning_rate': 1.4965792474344356e-07, 'epoch': 0.01}
{'loss': 1.3112, 'grad_norm': 1.3515625, 'learning_rate': 1.5678449258836946e-07, 'epoch': 0.01}
{'loss': 1.3267, 'grad_norm': 0.578125, 'learning_rate': 1.6391106043329536e-07, 'epoch': 0.01}
{'loss': 1.2542, 'grad_norm': 0.439453125, 'learning_rate': 1.7103762827822123e-07, 'epoch': 0.01}


[INFO|trainer.py:3305] 2024-05-17 15:19:45,130 >> Saving model checkpoint to ./checkpoint_dir\checkpoint-500


{'loss': 1.3337, 'grad_norm': 0.52734375, 'learning_rate': 1.781641961231471e-07, 'epoch': 0.01}


[INFO|configuration_utils.py:726] 2024-05-17 15:19:46,221 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-4k-instruct\snapshots\8f5f3a02ec472594e949c39f8e38c7be8d983bcd\config.json
[INFO|configuration_utils.py:789] 2024-05-17 15:19:46,222 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max

{'loss': 1.2873, 'grad_norm': 0.57421875, 'learning_rate': 1.8529076396807298e-07, 'epoch': 0.01}
{'loss': 1.3042, 'grad_norm': 0.66796875, 'learning_rate': 1.9241733181299888e-07, 'epoch': 0.01}
{'loss': 1.2938, 'grad_norm': 0.4921875, 'learning_rate': 1.9954389965792475e-07, 'epoch': 0.01}
{'loss': 1.3039, 'grad_norm': 0.4609375, 'learning_rate': 2.0667046750285062e-07, 'epoch': 0.01}


[INFO|trainer.py:3305] 2024-05-17 15:26:24,768 >> Saving model checkpoint to ./checkpoint_dir\checkpoint-600


{'loss': 1.3281, 'grad_norm': 0.41015625, 'learning_rate': 2.1379703534777655e-07, 'epoch': 0.01}


[INFO|configuration_utils.py:726] 2024-05-17 15:26:28,430 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-4k-instruct\snapshots\8f5f3a02ec472594e949c39f8e38c7be8d983bcd\config.json
[INFO|configuration_utils.py:789] 2024-05-17 15:26:28,431 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max

{'loss': 1.2627, 'grad_norm': 0.5859375, 'learning_rate': 2.2092360319270242e-07, 'epoch': 0.01}
{'loss': 1.3081, 'grad_norm': 0.490234375, 'learning_rate': 2.280501710376283e-07, 'epoch': 0.01}
{'loss': 1.3181, 'grad_norm': 0.494140625, 'learning_rate': 2.351767388825542e-07, 'epoch': 0.01}
{'loss': 1.4122, 'grad_norm': 0.5546875, 'learning_rate': 2.423033067274801e-07, 'epoch': 0.01}


[INFO|trainer.py:3305] 2024-05-17 15:33:07,332 >> Saving model checkpoint to ./checkpoint_dir\checkpoint-700


{'loss': 1.259, 'grad_norm': 0.51171875, 'learning_rate': 2.4942987457240596e-07, 'epoch': 0.01}


[INFO|configuration_utils.py:726] 2024-05-17 15:33:08,408 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-4k-instruct\snapshots\8f5f3a02ec472594e949c39f8e38c7be8d983bcd\config.json
[INFO|configuration_utils.py:789] 2024-05-17 15:33:08,409 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max

{'loss': 1.3561, 'grad_norm': 0.412109375, 'learning_rate': 2.5655644241733184e-07, 'epoch': 0.01}
{'loss': 1.2733, 'grad_norm': 0.54296875, 'learning_rate': 2.636830102622577e-07, 'epoch': 0.01}
{'loss': 1.2753, 'grad_norm': 0.39453125, 'learning_rate': 2.7080957810718363e-07, 'epoch': 0.01}
{'loss': 1.3863, 'grad_norm': 0.42578125, 'learning_rate': 2.779361459521095e-07, 'epoch': 0.01}


[INFO|trainer.py:3305] 2024-05-17 15:39:46,740 >> Saving model checkpoint to ./checkpoint_dir\checkpoint-800


{'loss': 1.3708, 'grad_norm': 0.490234375, 'learning_rate': 2.850627137970354e-07, 'epoch': 0.01}


[INFO|configuration_utils.py:726] 2024-05-17 15:39:47,787 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-4k-instruct\snapshots\8f5f3a02ec472594e949c39f8e38c7be8d983bcd\config.json
[INFO|configuration_utils.py:789] 2024-05-17 15:39:47,788 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max

{'loss': 1.317, 'grad_norm': 0.6171875, 'learning_rate': 2.9218928164196125e-07, 'epoch': 0.01}
{'loss': 1.2991, 'grad_norm': 0.490234375, 'learning_rate': 2.993158494868871e-07, 'epoch': 0.01}
{'loss': 1.2365, 'grad_norm': 0.45703125, 'learning_rate': 3.06442417331813e-07, 'epoch': 0.01}
{'loss': 1.2613, 'grad_norm': 0.435546875, 'learning_rate': 3.135689851767389e-07, 'epoch': 0.01}


[INFO|trainer.py:3305] 2024-05-17 15:46:26,317 >> Saving model checkpoint to ./checkpoint_dir\checkpoint-900


{'loss': 1.3155, 'grad_norm': 0.470703125, 'learning_rate': 3.206955530216648e-07, 'epoch': 0.01}


[INFO|configuration_utils.py:726] 2024-05-17 15:46:27,687 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-4k-instruct\snapshots\8f5f3a02ec472594e949c39f8e38c7be8d983bcd\config.json
[INFO|configuration_utils.py:789] 2024-05-17 15:46:27,688 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max

{'loss': 1.3189, 'grad_norm': 0.390625, 'learning_rate': 3.278221208665907e-07, 'epoch': 0.01}
{'loss': 1.3335, 'grad_norm': 0.5546875, 'learning_rate': 3.349486887115166e-07, 'epoch': 0.01}
{'loss': 1.2078, 'grad_norm': 0.4921875, 'learning_rate': 3.4207525655644247e-07, 'epoch': 0.01}
{'loss': 1.2346, 'grad_norm': 0.48828125, 'learning_rate': 3.4920182440136834e-07, 'epoch': 0.01}


[INFO|trainer.py:3305] 2024-05-17 15:53:06,454 >> Saving model checkpoint to ./checkpoint_dir\checkpoint-1000


{'loss': 1.3288, 'grad_norm': 0.5859375, 'learning_rate': 3.563283922462942e-07, 'epoch': 0.01}


[INFO|configuration_utils.py:726] 2024-05-17 15:53:07,617 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-4k-instruct\snapshots\8f5f3a02ec472594e949c39f8e38c7be8d983bcd\config.json
[INFO|configuration_utils.py:789] 2024-05-17 15:53:07,618 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max

{'loss': 1.2995, 'grad_norm': 0.40234375, 'learning_rate': 3.634549600912201e-07, 'epoch': 0.01}
{'loss': 1.2837, 'grad_norm': 0.453125, 'learning_rate': 3.7058152793614596e-07, 'epoch': 0.01}
{'loss': 1.2435, 'grad_norm': 0.35546875, 'learning_rate': 3.777080957810719e-07, 'epoch': 0.02}
{'loss': 1.2545, 'grad_norm': 0.62890625, 'learning_rate': 3.8483466362599775e-07, 'epoch': 0.02}


[INFO|trainer.py:3305] 2024-05-17 15:59:46,036 >> Saving model checkpoint to ./checkpoint_dir\checkpoint-1100


{'loss': 1.3551, 'grad_norm': 0.546875, 'learning_rate': 3.919612314709236e-07, 'epoch': 0.02}


[INFO|configuration_utils.py:726] 2024-05-17 15:59:47,203 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-4k-instruct\snapshots\8f5f3a02ec472594e949c39f8e38c7be8d983bcd\config.json
[INFO|configuration_utils.py:789] 2024-05-17 15:59:47,206 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max

{'loss': 1.2783, 'grad_norm': 0.51171875, 'learning_rate': 3.990877993158495e-07, 'epoch': 0.02}
{'loss': 1.2753, 'grad_norm': 0.51171875, 'learning_rate': 4.0621436716077537e-07, 'epoch': 0.02}
{'loss': 1.314, 'grad_norm': 0.578125, 'learning_rate': 4.1334093500570124e-07, 'epoch': 0.02}
{'loss': 1.3521, 'grad_norm': 0.455078125, 'learning_rate': 4.204675028506271e-07, 'epoch': 0.02}


[INFO|trainer.py:3305] 2024-05-17 16:06:25,942 >> Saving model checkpoint to ./checkpoint_dir\checkpoint-1200


{'loss': 1.3057, 'grad_norm': 0.474609375, 'learning_rate': 4.275940706955531e-07, 'epoch': 0.02}


[INFO|configuration_utils.py:726] 2024-05-17 16:06:27,029 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-4k-instruct\snapshots\8f5f3a02ec472594e949c39f8e38c7be8d983bcd\config.json
[INFO|configuration_utils.py:789] 2024-05-17 16:06:27,030 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max

{'loss': 1.3644, 'grad_norm': 0.89453125, 'learning_rate': 4.3472063854047897e-07, 'epoch': 0.02}
{'loss': 1.3178, 'grad_norm': 0.671875, 'learning_rate': 4.4184720638540484e-07, 'epoch': 0.02}
{'loss': 1.3397, 'grad_norm': 0.47265625, 'learning_rate': 4.489737742303307e-07, 'epoch': 0.02}
{'loss': 1.3497, 'grad_norm': 0.7265625, 'learning_rate': 4.561003420752566e-07, 'epoch': 0.02}


[INFO|trainer.py:3305] 2024-05-17 16:13:05,592 >> Saving model checkpoint to ./checkpoint_dir\checkpoint-1300


{'loss': 1.3237, 'grad_norm': 0.546875, 'learning_rate': 4.632269099201825e-07, 'epoch': 0.02}


[INFO|configuration_utils.py:726] 2024-05-17 16:13:06,674 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-4k-instruct\snapshots\8f5f3a02ec472594e949c39f8e38c7be8d983bcd\config.json
[INFO|configuration_utils.py:789] 2024-05-17 16:13:06,675 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max

{'loss': 1.2832, 'grad_norm': 0.5078125, 'learning_rate': 4.703534777651084e-07, 'epoch': 0.02}
{'loss': 1.255, 'grad_norm': 0.40234375, 'learning_rate': 4.774800456100342e-07, 'epoch': 0.02}
{'loss': 1.3881, 'grad_norm': 0.53125, 'learning_rate': 4.846066134549602e-07, 'epoch': 0.02}
{'loss': 1.268, 'grad_norm': 0.41015625, 'learning_rate': 4.917331812998861e-07, 'epoch': 0.02}


[INFO|trainer.py:3305] 2024-05-17 16:19:45,502 >> Saving model checkpoint to ./checkpoint_dir\checkpoint-1400


{'loss': 1.3214, 'grad_norm': 0.451171875, 'learning_rate': 4.988597491448119e-07, 'epoch': 0.02}


[INFO|configuration_utils.py:726] 2024-05-17 16:19:47,183 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-4k-instruct\snapshots\8f5f3a02ec472594e949c39f8e38c7be8d983bcd\config.json
[INFO|configuration_utils.py:789] 2024-05-17 16:19:47,184 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max

{'loss': 1.1883, 'grad_norm': 2.296875, 'learning_rate': 5.059863169897378e-07, 'epoch': 0.02}
{'loss': 1.2271, 'grad_norm': 0.44140625, 'learning_rate': 5.131128848346637e-07, 'epoch': 0.02}
{'loss': 1.2426, 'grad_norm': 0.5234375, 'learning_rate': 5.202394526795895e-07, 'epoch': 0.02}
{'loss': 1.3712, 'grad_norm': 0.53125, 'learning_rate': 5.273660205245154e-07, 'epoch': 0.02}


[INFO|trainer.py:3305] 2024-05-17 16:26:25,999 >> Saving model checkpoint to ./checkpoint_dir\checkpoint-1500


{'loss': 1.327, 'grad_norm': 0.46484375, 'learning_rate': 5.344925883694413e-07, 'epoch': 0.02}


[INFO|configuration_utils.py:726] 2024-05-17 16:26:27,032 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-4k-instruct\snapshots\8f5f3a02ec472594e949c39f8e38c7be8d983bcd\config.json
[INFO|configuration_utils.py:789] 2024-05-17 16:26:27,033 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max

{'loss': 1.3116, 'grad_norm': 0.458984375, 'learning_rate': 5.416191562143673e-07, 'epoch': 0.02}
{'loss': 1.3143, 'grad_norm': 0.54296875, 'learning_rate': 5.487457240592931e-07, 'epoch': 0.02}
{'loss': 1.2597, 'grad_norm': 0.52734375, 'learning_rate': 5.55872291904219e-07, 'epoch': 0.02}
{'loss': 1.3679, 'grad_norm': 0.4609375, 'learning_rate': 5.629988597491449e-07, 'epoch': 0.02}


[INFO|trainer.py:3305] 2024-05-17 16:33:05,964 >> Saving model checkpoint to ./checkpoint_dir\checkpoint-1600


{'loss': 1.2521, 'grad_norm': 0.61328125, 'learning_rate': 5.701254275940708e-07, 'epoch': 0.02}


[INFO|configuration_utils.py:726] 2024-05-17 16:33:07,045 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-4k-instruct\snapshots\8f5f3a02ec472594e949c39f8e38c7be8d983bcd\config.json
[INFO|configuration_utils.py:789] 2024-05-17 16:33:07,046 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max

{'loss': 1.2843, 'grad_norm': 0.70703125, 'learning_rate': 5.772519954389966e-07, 'epoch': 0.02}
{'loss': 1.291, 'grad_norm': 0.412109375, 'learning_rate': 5.843785632839225e-07, 'epoch': 0.02}
{'loss': 1.2915, 'grad_norm': 0.60546875, 'learning_rate': 5.915051311288484e-07, 'epoch': 0.02}
{'loss': 1.2584, 'grad_norm': 0.390625, 'learning_rate': 5.986316989737742e-07, 'epoch': 0.02}


[INFO|trainer.py:3305] 2024-05-17 16:39:45,585 >> Saving model checkpoint to ./checkpoint_dir\checkpoint-1700


{'loss': 1.2831, 'grad_norm': 0.5859375, 'learning_rate': 6.057582668187001e-07, 'epoch': 0.02}


[INFO|configuration_utils.py:726] 2024-05-17 16:39:46,861 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-4k-instruct\snapshots\8f5f3a02ec472594e949c39f8e38c7be8d983bcd\config.json
[INFO|configuration_utils.py:789] 2024-05-17 16:39:46,862 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max

{'loss': 1.2688, 'grad_norm': 0.427734375, 'learning_rate': 6.12884834663626e-07, 'epoch': 0.02}
{'loss': 1.2356, 'grad_norm': 0.40234375, 'learning_rate': 6.20011402508552e-07, 'epoch': 0.02}
{'loss': 1.224, 'grad_norm': 0.427734375, 'learning_rate': 6.271379703534778e-07, 'epoch': 0.03}
{'loss': 1.2477, 'grad_norm': 0.46875, 'learning_rate': 6.342645381984037e-07, 'epoch': 0.03}


[INFO|trainer.py:3305] 2024-05-17 16:46:25,635 >> Saving model checkpoint to ./checkpoint_dir\checkpoint-1800


{'loss': 1.2846, 'grad_norm': 0.33203125, 'learning_rate': 6.413911060433296e-07, 'epoch': 0.03}


[INFO|configuration_utils.py:726] 2024-05-17 16:46:27,553 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-4k-instruct\snapshots\8f5f3a02ec472594e949c39f8e38c7be8d983bcd\config.json
[INFO|configuration_utils.py:789] 2024-05-17 16:46:27,556 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max

{'loss': 1.2459, 'grad_norm': 0.48046875, 'learning_rate': 6.485176738882555e-07, 'epoch': 0.03}
{'loss': 1.2918, 'grad_norm': 0.62109375, 'learning_rate': 6.556442417331814e-07, 'epoch': 0.03}
{'loss': 1.3562, 'grad_norm': 0.41796875, 'learning_rate': 6.627708095781073e-07, 'epoch': 0.03}
{'loss': 1.3743, 'grad_norm': 0.404296875, 'learning_rate': 6.698973774230332e-07, 'epoch': 0.03}


[INFO|trainer.py:3305] 2024-05-17 16:53:06,543 >> Saving model checkpoint to ./checkpoint_dir\checkpoint-1900


{'loss': 1.2572, 'grad_norm': 0.33984375, 'learning_rate': 6.770239452679591e-07, 'epoch': 0.03}


[INFO|configuration_utils.py:726] 2024-05-17 16:53:07,593 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-4k-instruct\snapshots\8f5f3a02ec472594e949c39f8e38c7be8d983bcd\config.json
[INFO|configuration_utils.py:789] 2024-05-17 16:53:07,595 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max

{'loss': 1.2607, 'grad_norm': 0.61328125, 'learning_rate': 6.841505131128849e-07, 'epoch': 0.03}
{'loss': 1.2962, 'grad_norm': 0.375, 'learning_rate': 6.912770809578108e-07, 'epoch': 0.03}
{'loss': 1.3135, 'grad_norm': 0.357421875, 'learning_rate': 6.984036488027367e-07, 'epoch': 0.03}
{'loss': 1.338, 'grad_norm': 0.40234375, 'learning_rate': 7.055302166476625e-07, 'epoch': 0.03}


[INFO|trainer.py:3305] 2024-05-17 16:59:46,819 >> Saving model checkpoint to ./checkpoint_dir\checkpoint-2000


{'loss': 1.2499, 'grad_norm': 0.302734375, 'learning_rate': 7.126567844925884e-07, 'epoch': 0.03}


[INFO|configuration_utils.py:726] 2024-05-17 16:59:47,858 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-4k-instruct\snapshots\8f5f3a02ec472594e949c39f8e38c7be8d983bcd\config.json
[INFO|configuration_utils.py:789] 2024-05-17 16:59:47,859 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max

{'loss': 1.3731, 'grad_norm': 0.45703125, 'learning_rate': 7.197833523375143e-07, 'epoch': 0.03}
{'loss': 1.2776, 'grad_norm': 0.439453125, 'learning_rate': 7.269099201824402e-07, 'epoch': 0.03}
{'loss': 1.2815, 'grad_norm': 0.68359375, 'learning_rate': 7.34036488027366e-07, 'epoch': 0.03}
{'loss': 1.2587, 'grad_norm': 0.400390625, 'learning_rate': 7.411630558722919e-07, 'epoch': 0.03}


[INFO|trainer.py:3305] 2024-05-17 17:06:27,826 >> Saving model checkpoint to ./checkpoint_dir\checkpoint-2100


{'loss': 1.1752, 'grad_norm': 0.35546875, 'learning_rate': 7.482896237172178e-07, 'epoch': 0.03}


[INFO|configuration_utils.py:726] 2024-05-17 17:06:28,909 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-4k-instruct\snapshots\8f5f3a02ec472594e949c39f8e38c7be8d983bcd\config.json
[INFO|configuration_utils.py:789] 2024-05-17 17:06:28,910 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max

In [None]:
# ----------------------------------------------------------------------
# Evaluation
# Runs the trainer's evaluation pass, records how many examples are in
# processed_test_dataset, then logs and saves the metrics under "eval".
# ----------------------------------------------------------------------

# Pad on the left for the evaluation pass.
tokenizer.padding_side = "left"

# Evaluate, then attach the dataset size to the returned metrics in place.
metrics = trainer.evaluate()
metrics.update(eval_samples=len(processed_test_dataset))

# Report the metrics through the trainer's own logging / saving helpers.
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

In [None]:
# ----------------------------------------------------------------------
# Save model
# Writes the trainer's model to the output directory taken from the
# training arguments (train_conf.output_dir).
# ----------------------------------------------------------------------
trainer.save_model(output_dir=train_conf.output_dir)