In [1]:
# https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/blob/main/sample_finetune.py

In [2]:
import sys
import logging

import datasets
from datasets import load_dataset
from peft import LoraConfig
import torch
import transformers
from trl import SFTTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig

In [3]:
# Logger for this notebook's own messages; it is named after __name__, which
# appears as "__main__" in the captured cell output. Its level is set later
# via logger.setLevel(...).
logger = logging.getLogger(__name__)

In [4]:
# Keyword arguments that are later unpacked into TrainingArguments(**training_config).
training_config = dict(
    bf16=True,
    do_eval=False,
    learning_rate=5.0e-06,
    log_level="info",
    logging_steps=20,
    logging_strategy="steps",
    lr_scheduler_type="cosine",
    num_train_epochs=1,
    max_steps=-1,
    # Checkpoints and saved metrics are written below this directory.
    output_dir="./checkpoint_dir",
    overwrite_output_dir=True,
    per_device_eval_batch_size=4,
    per_device_train_batch_size=4,
    remove_unused_columns=True,
    save_steps=100,
    save_total_limit=1,
    seed=0,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs=dict(use_reentrant=False),
    gradient_accumulation_steps=1,
    warmup_ratio=0.2,
)

In [5]:
# Keyword arguments that are later unpacked into LoraConfig(**peft_config).
peft_config = dict(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules="all-linear",
    modules_to_save=None,
)

In [6]:
# Build the config objects from the plain dicts defined above; both are passed
# to SFTTrainer in the training cell (args=train_conf, peft_config=peft_conf).
train_conf = TrainingArguments(**training_config)
peft_conf = LoraConfig(**peft_config)


In [7]:
# Send log records to stdout so they appear in the notebook cell output.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)

# Apply the level derived from the training arguments to this notebook's
# logger and to the datasets / transformers library loggers.
log_level = train_conf.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()

# Log on each process a small summary.
# "16-bits training" covers both half-precision modes: this notebook trains
# with bf16=True / fp16=False, so reporting fp16 alone would print False for a
# run that is in fact 16-bit.
logger.warning(
    f"Process rank: {train_conf.local_rank}, device: {train_conf.device}, n_gpu: {train_conf.n_gpu}"
    + f" distributed training: {bool(train_conf.local_rank != -1)},"
    + f" 16-bits training: {bool(train_conf.fp16 or train_conf.bf16)}"
)
logger.info(f"Training/evaluation parameters {train_conf}")
logger.info(f"PEFT parameters {peft_conf}")

2024-05-27 17:39:52 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'gradient_accumulation_kwargs': None},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=True,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_steps=None,
evaluation_strategy=IntervalStrategy.NO,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O

In [8]:
################
# Model Loading
################
# Alternative checkpoint:
# checkpoint_path = "microsoft/Phi-3-mini-4k-instruct"
checkpoint_path = "microsoft/Phi-3-mini-128k-instruct"

model_kwargs = {
    "use_cache": False,
    "trust_remote_code": True,
    # "attn_implementation": "flash_attention_2",  # load the model with flash-attention support
    "torch_dtype": torch.bfloat16,
    "device_map": None,
}
model = AutoModelForCausalLM.from_pretrained(checkpoint_path, **model_kwargs)

tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
# NOTE(review): 512 here differs from max_seq_length=1024 passed to SFTTrainer
# in the training cell -- confirm which limit is intended.
tokenizer.model_max_length = 512
# Pad with the unk token rather than the eos token to prevent endless generation.
tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
tokenizer.padding_side = "right"

[INFO|configuration_utils.py:726] 2024-05-27 17:39:53,500 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-128k-instruct\snapshots\bbd531db4632bb631b0c44d98172894a0c594dd0\config.json
[INFO|configuration_utils.py:726] 2024-05-27 17:39:53,737 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-128k-instruct\snapshots\bbd531db4632bb631b0c44d98172894a0c594dd0\config.json
[INFO|configuration_utils.py:789] 2024-05-27 17:39:53,739 >> Model config Phi3Config {
  "_name_or_path": "microsoft/Phi-3-mini-128k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-128k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-128k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_t



[INFO|modeling_utils.py:3429] 2024-05-27 17:39:54,036 >> loading weights file model.safetensors from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-128k-instruct\snapshots\bbd531db4632bb631b0c44d98172894a0c594dd0\model.safetensors.index.json
[INFO|modeling_utils.py:1494] 2024-05-27 17:39:54,040 >> Instantiating Phi3ForCausalLM model under default dtype torch.bfloat16.
[INFO|configuration_utils.py:928] 2024-05-27 17:39:54,040 >> Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 32000,
  "pad_token_id": 32000,
  "use_cache": false
}



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO|modeling_utils.py:4170] 2024-05-27 17:40:04,084 >> All model checkpoint weights were used when initializing Phi3ForCausalLM.

[INFO|modeling_utils.py:4178] 2024-05-27 17:40:04,087 >> All the weights of Phi3ForCausalLM were initialized from the model checkpoint at microsoft/Phi-3-mini-128k-instruct.
If your task is similar to the task the model of the checkpoint was trained on, you can already use Phi3ForCausalLM for predictions without further training.
[INFO|configuration_utils.py:883] 2024-05-27 17:40:04,341 >> loading configuration file generation_config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-128k-instruct\snapshots\bbd531db4632bb631b0c44d98172894a0c594dd0\generation_config.json
[INFO|configuration_utils.py:928] 2024-05-27 17:40:04,342 >> Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": [
    32000,
    32001,
    32007
  ],
  "pad_token_id": 32000
}

[INFO|tokenization_utils_base.py:2087] 2024-05-27 

In [9]:
##################
# Data Processing
##################
def apply_chat_template(
    example,
    tokenizer,
):
    """Render one chat example into a single string stored under "text".

    Args:
        example: Mapping with a "messages" entry holding a list of message
            dicts, each with at least a "role" key.
        tokenizer: Object exposing
            ``apply_chat_template(messages, tokenize=..., add_generation_prompt=...)``.

    Returns:
        The same ``example`` object, with the rendered conversation added
        under the "text" key.

    Note:
        When the conversation does not start with a system message (or is
        empty), an empty system message is inserted into
        ``example["messages"]`` in place.
    """
    messages = example["messages"]
    # Add an empty system message if there is none. The emptiness check keeps
    # an empty conversation from raising IndexError on messages[0].
    if not messages or messages[0]["role"] != "system":
        messages.insert(0, {"role": "system", "content": ""})
    example["text"] = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=False
    )
    return example

# Load the dataset and pick out its two SFT splits.
raw_dataset = load_dataset("wenlianghuang/dataset_phi3_matt_testing")
train_dataset, test_dataset = raw_dataset["train_sft"], raw_dataset["test_sft"]
# Names of the original columns; passed as remove_columns to the map() calls below.
column_names = [name for name in train_dataset.features]

Overwrite dataset info from restored data version if exists.


2024-05-27 17:40:13 - INFO - datasets.builder - Overwrite dataset info from restored data version if exists.


Loading Dataset info from C:\Users\acer alan\.cache\huggingface\datasets/wenlianghuang___dataset_phi3_matt_testing/default/0.0.0/a1fb0788e4ae946b0027f6f1318161a2c26c4282


2024-05-27 17:40:13 - INFO - datasets.info - Loading Dataset info from C:\Users\acer alan\.cache\huggingface\datasets/wenlianghuang___dataset_phi3_matt_testing/default/0.0.0/a1fb0788e4ae946b0027f6f1318161a2c26c4282


Found cached dataset dataset_phi3_matt_testing (C:/Users/acer alan/.cache/huggingface/datasets/wenlianghuang___dataset_phi3_matt_testing/default/0.0.0/a1fb0788e4ae946b0027f6f1318161a2c26c4282)


2024-05-27 17:40:13 - INFO - datasets.builder - Found cached dataset dataset_phi3_matt_testing (C:/Users/acer alan/.cache/huggingface/datasets/wenlianghuang___dataset_phi3_matt_testing/default/0.0.0/a1fb0788e4ae946b0027f6f1318161a2c26c4282)


Loading Dataset info from C:/Users/acer alan/.cache/huggingface/datasets/wenlianghuang___dataset_phi3_matt_testing/default/0.0.0/a1fb0788e4ae946b0027f6f1318161a2c26c4282


2024-05-27 17:40:13 - INFO - datasets.info - Loading Dataset info from C:/Users/acer alan/.cache/huggingface/datasets/wenlianghuang___dataset_phi3_matt_testing/default/0.0.0/a1fb0788e4ae946b0027f6f1318161a2c26c4282


In [10]:
# Render every training conversation with the chat template, using 5 worker
# processes; the original columns are passed as remove_columns.
processed_train_dataset = train_dataset.map(
    function=apply_chat_template,
    fn_kwargs=dict(tokenizer=tokenizer),
    remove_columns=column_names,
    num_proc=5,
    desc="Applying chat template to train_sft",
)


Process #0 will write at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-5e34ca905062940d_00000_of_00005.arrow


2024-05-27 17:40:13 - INFO - datasets.arrow_dataset - Process #0 will write at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-5e34ca905062940d_00000_of_00005.arrow


Process #1 will write at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-5e34ca905062940d_00001_of_00005.arrow


2024-05-27 17:40:13 - INFO - datasets.arrow_dataset - Process #1 will write at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-5e34ca905062940d_00001_of_00005.arrow


Process #2 will write at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-5e34ca905062940d_00002_of_00005.arrow


2024-05-27 17:40:13 - INFO - datasets.arrow_dataset - Process #2 will write at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-5e34ca905062940d_00002_of_00005.arrow


Process #3 will write at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-5e34ca905062940d_00003_of_00005.arrow


2024-05-27 17:40:13 - INFO - datasets.arrow_dataset - Process #3 will write at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-5e34ca905062940d_00003_of_00005.arrow


Process #4 will write at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-5e34ca905062940d_00004_of_00005.arrow


2024-05-27 17:40:13 - INFO - datasets.arrow_dataset - Process #4 will write at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-5e34ca905062940d_00004_of_00005.arrow


Loading cached processed dataset at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-5e34ca905062940d_*_of_00005.arrow


2024-05-27 17:40:13 - INFO - datasets.arrow_dataset - Loading cached processed dataset at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-5e34ca905062940d_*_of_00005.arrow


Concatenating 5 shards


2024-05-27 17:40:13 - INFO - datasets.arrow_dataset - Concatenating 5 shards


In [11]:
# Same templating step for the test split, using 5 worker processes; the
# original columns are passed as remove_columns.
processed_test_dataset = test_dataset.map(
    function=apply_chat_template,
    fn_kwargs=dict(tokenizer=tokenizer),
    remove_columns=column_names,
    num_proc=5,
    desc="Applying chat template to test_sft",
)

Process #0 will write at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-8326648afb7f90cd_00000_of_00005.arrow


2024-05-27 17:40:13 - INFO - datasets.arrow_dataset - Process #0 will write at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-8326648afb7f90cd_00000_of_00005.arrow


Process #1 will write at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-8326648afb7f90cd_00001_of_00005.arrow


2024-05-27 17:40:13 - INFO - datasets.arrow_dataset - Process #1 will write at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-8326648afb7f90cd_00001_of_00005.arrow


Process #2 will write at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-8326648afb7f90cd_00002_of_00005.arrow


2024-05-27 17:40:13 - INFO - datasets.arrow_dataset - Process #2 will write at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-8326648afb7f90cd_00002_of_00005.arrow


Process #3 will write at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-8326648afb7f90cd_00003_of_00005.arrow


2024-05-27 17:40:13 - INFO - datasets.arrow_dataset - Process #3 will write at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-8326648afb7f90cd_00003_of_00005.arrow


Process #4 will write at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-8326648afb7f90cd_00004_of_00005.arrow


2024-05-27 17:40:13 - INFO - datasets.arrow_dataset - Process #4 will write at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-8326648afb7f90cd_00004_of_00005.arrow


Loading cached processed dataset at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-8326648afb7f90cd_*_of_00005.arrow


2024-05-27 17:40:13 - INFO - datasets.arrow_dataset - Loading cached processed dataset at C:\Users\acer alan\.cache\huggingface\datasets\wenlianghuang___dataset_phi3_matt_testing\default\0.0.0\a1fb0788e4ae946b0027f6f1318161a2c26c4282\cache-8326648afb7f90cd_*_of_00005.arrow


Concatenating 5 shards


2024-05-27 17:40:13 - INFO - datasets.arrow_dataset - Concatenating 5 shards


In [12]:
###########
# Training
###########
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=train_conf,
    peft_config=peft_conf,
    train_dataset=processed_train_dataset,
    eval_dataset=processed_test_dataset,
    # Column produced by apply_chat_template.
    dataset_text_field="text",
    max_seq_length=1024,
    packing=True,
)
train_result = trainer.train()

# Report the training metrics, then persist them together with the trainer state.
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

Using custom data configuration default-dc18b444099791db


2024-05-27 17:40:13 - INFO - datasets.builder - Using custom data configuration default-dc18b444099791db


Loading Dataset Infos from c:\Users\acer alan\AppData\Local\Programs\Python\Python311\Lib\site-packages\datasets\packaged_modules\generator


2024-05-27 17:40:13 - INFO - datasets.info - Loading Dataset Infos from c:\Users\acer alan\AppData\Local\Programs\Python\Python311\Lib\site-packages\datasets\packaged_modules\generator


Overwrite dataset info from restored data version if exists.


2024-05-27 17:40:13 - INFO - datasets.builder - Overwrite dataset info from restored data version if exists.


Loading Dataset info from C:\Users\acer alan\.cache\huggingface\datasets/generator/default-dc18b444099791db/0.0.0


2024-05-27 17:40:13 - INFO - datasets.info - Loading Dataset info from C:\Users\acer alan\.cache\huggingface\datasets/generator/default-dc18b444099791db/0.0.0


Found cached dataset generator (C:/Users/acer alan/.cache/huggingface/datasets/generator/default-dc18b444099791db/0.0.0)


2024-05-27 17:40:13 - INFO - datasets.builder - Found cached dataset generator (C:/Users/acer alan/.cache/huggingface/datasets/generator/default-dc18b444099791db/0.0.0)


Loading Dataset info from C:/Users/acer alan/.cache/huggingface/datasets/generator/default-dc18b444099791db/0.0.0


2024-05-27 17:40:13 - INFO - datasets.info - Loading Dataset info from C:/Users/acer alan/.cache/huggingface/datasets/generator/default-dc18b444099791db/0.0.0


Using custom data configuration default-7fe346cfdf9f1dcc


2024-05-27 17:40:13 - INFO - datasets.builder - Using custom data configuration default-7fe346cfdf9f1dcc


Loading Dataset Infos from c:\Users\acer alan\AppData\Local\Programs\Python\Python311\Lib\site-packages\datasets\packaged_modules\generator


2024-05-27 17:40:13 - INFO - datasets.info - Loading Dataset Infos from c:\Users\acer alan\AppData\Local\Programs\Python\Python311\Lib\site-packages\datasets\packaged_modules\generator


Overwrite dataset info from restored data version if exists.


2024-05-27 17:40:13 - INFO - datasets.builder - Overwrite dataset info from restored data version if exists.


Loading Dataset info from C:\Users\acer alan\.cache\huggingface\datasets/generator/default-7fe346cfdf9f1dcc/0.0.0


2024-05-27 17:40:13 - INFO - datasets.info - Loading Dataset info from C:\Users\acer alan\.cache\huggingface\datasets/generator/default-7fe346cfdf9f1dcc/0.0.0


Found cached dataset generator (C:/Users/acer alan/.cache/huggingface/datasets/generator/default-7fe346cfdf9f1dcc/0.0.0)


2024-05-27 17:40:13 - INFO - datasets.builder - Found cached dataset generator (C:/Users/acer alan/.cache/huggingface/datasets/generator/default-7fe346cfdf9f1dcc/0.0.0)


Loading Dataset info from C:/Users/acer alan/.cache/huggingface/datasets/generator/default-7fe346cfdf9f1dcc/0.0.0


2024-05-27 17:40:13 - INFO - datasets.info - Loading Dataset info from C:/Users/acer alan/.cache/huggingface/datasets/generator/default-7fe346cfdf9f1dcc/0.0.0


[INFO|trainer.py:626] 2024-05-27 17:40:21,285 >> Using auto half precision backend
[INFO|trainer.py:2048] 2024-05-27 17:40:21,395 >> ***** Running training *****
[INFO|trainer.py:2049] 2024-05-27 17:40:21,396 >>   Num examples = 6,706
[INFO|trainer.py:2050] 2024-05-27 17:40:21,396 >>   Num Epochs = 1
[INFO|trainer.py:2051] 2024-05-27 17:40:21,396 >>   Instantaneous batch size per device = 4
[INFO|trainer.py:2054] 2024-05-27 17:40:21,397 >>   Total train batch size (w. parallel, distributed & accumulation) = 4
[INFO|trainer.py:2055] 2024-05-27 17:40:21,397 >>   Gradient Accumulation steps = 1
[INFO|trainer.py:2056] 2024-05-27 17:40:21,398 >>   Total optimization steps = 1,677
[INFO|trainer.py:2057] 2024-05-27 17:40:21,400 >>   Number of trainable parameters = 25,165,824


  0%|          | 0/1677 [00:00<?, ?it/s]

{'loss': 1.3124, 'grad_norm': 0.546875, 'learning_rate': 2.9761904761904765e-07, 'epoch': 0.01}
{'loss': 1.2899, 'grad_norm': 0.68359375, 'learning_rate': 5.952380952380953e-07, 'epoch': 0.02}
{'loss': 1.3202, 'grad_norm': 0.69140625, 'learning_rate': 8.928571428571429e-07, 'epoch': 0.04}
{'loss': 1.2294, 'grad_norm': 0.5078125, 'learning_rate': 1.1904761904761906e-06, 'epoch': 0.05}


[INFO|trainer.py:3305] 2024-05-27 17:47:31,067 >> Saving model checkpoint to ./checkpoint_dir\checkpoint-100


{'loss': 1.3516, 'grad_norm': 0.71484375, 'learning_rate': 1.4880952380952381e-06, 'epoch': 0.06}


[INFO|configuration_utils.py:726] 2024-05-27 17:47:32,729 >> loading configuration file config.json from cache at C:\Users\acer alan\.cache\huggingface\hub\models--microsoft--Phi-3-mini-128k-instruct\snapshots\bbd531db4632bb631b0c44d98172894a0c594dd0\config.json
[INFO|configuration_utils.py:789] 2024-05-27 17:47:32,730 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-128k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-128k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-128k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "or

{'loss': 1.3087, 'grad_norm': 0.40625, 'learning_rate': 1.7857142857142859e-06, 'epoch': 0.07}
{'loss': 1.2978, 'grad_norm': 0.421875, 'learning_rate': 2.0833333333333334e-06, 'epoch': 0.08}
{'loss': 1.2707, 'grad_norm': 0.431640625, 'learning_rate': 2.380952380952381e-06, 'epoch': 0.1}


In [None]:
#############
# Evaluation
#############
# Switch the tokenizer from the right-padding set at load time to left-padding
# before evaluating.
tokenizer.padding_side = "left"
metrics = trainer.evaluate()
# Record the number of templated test examples next to the eval metrics.
metrics.update(eval_samples=len(processed_test_dataset))
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

In [None]:
#############
# Save model
#############
# Write the model to the configured output directory ("./checkpoint_dir" in
# training_config), the same location the run log shows checkpoints going to.
trainer.save_model(train_conf.output_dir)