This notebook shows how to train and align LLMs with SimPO, a reference-free method for LLM alignment.

The last section of the notebook also runs CPO training for comparison since SimPO and CPO are very similar.

Both training runs are applied to Llama 3 8B and can be done on a 24 GB GPU.





First, we need all these dependencies:
(note: as I'm writing this notebook, TRL must be installed from source to use SimPO)

In [None]:
!pip install -q -U bitsandbytes
!pip install --upgrade -q -U transformers
!pip install -q -U peft
!pip install -q -U accelerate
!pip install -q -U datasets
!pip install -q -U git+https://github.com/huggingface/trl.git

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m72.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m41.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━

Import all the necessary packages.

In [None]:
import torch, multiprocessing
from datasets import load_dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig
)
from trl import CPOTrainer, CPOConfig

Check whether the GPU supports FlashAttention and bfloat16, then load the tokenizer and configure padding.

In [None]:
major_version, minor_version = torch.cuda.get_device_capability()
if major_version >= 8:
  !pip install flash-attn
  torch_dtype = torch.bfloat16
  attn_implementation='flash_attention_2'
  print("Your GPU is compatible with FlashAttention and bfloat16.")
else:
  torch_dtype = torch.float16
  attn_implementation='eager'
  print("Your GPU is not compatible with FlashAttention and bfloat16.")

model_name = "meta-llama/Meta-Llama-3-8B"
#Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = "<|eot_id|>"
tokenizer.pad_token_id = 128009
tokenizer.padding_side = 'left'

Your GPU is compatible with FlashAttention and bfloat16.


tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Load the UltraFeedback dataset prepared by Hugging Face for preference optimization. I apply a chat template to convert each conversation (a list of messages) into a single string.

In [None]:
# Load the train and test preference splits; passing a list of split names
# returns a list of datasets in the same order.
dataset = load_dataset("HuggingFaceH4/ultrafeedback_binarized", split=["train_prefs","test_prefs"])

# This tokenizer has no chat template of its own. Older Transformers releases
# silently fell back to a class-level default, but that fallback is announced as
# removed in Transformers v4.43. Set the template explicitly so the cell keeps working.
# NOTE(review): this is the ChatML template, believed to be the class-level default
# the fallback used — confirm it matches the formatting you want to train on.
if tokenizer.chat_template is None:
    tokenizer.chat_template = (
        "{% for message in messages %}"
        "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
        "{% endfor %}"
        "{% if add_generation_prompt %}"
        "{{ '<|im_start|>assistant\n' }}"
        "{% endif %}"
    )

def process(row):
    """Render the 'chosen' and 'rejected' conversations of one example as strings.

    Each conversation is formatted with the tokenizer's chat template (without
    tokenizing) and terminated with the tokenizer's EOS token.

    NOTE(review): 'chosen'/'rejected' appear to contain the whole conversation,
    prompt included, while the separate 'prompt' column is left untouched —
    confirm the trainer does not end up seeing the prompt twice.
    """
    row["chosen"] = tokenizer.apply_chat_template(row["chosen"], tokenize=False)+tokenizer.eos_token
    row["rejected"] = tokenizer.apply_chat_template(row["rejected"], tokenize=False)+tokenizer.eos_token
    return row

# Format both splits the same way, in parallel across all CPU cores and
# bypassing any previously cached result.
dataset = [
    split.map(
        process,
        num_proc=multiprocessing.cpu_count(),
        load_from_cache_file=False,
    )
    for split in dataset
]

print(dataset)

Downloading readme:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/226M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/226M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.29M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.72M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/184M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.02M [00:00<?, ?B/s]

Generating train_prefs split:   0%|          | 0/61135 [00:00<?, ? examples/s]

Generating train_sft split:   0%|          | 0/61135 [00:00<?, ? examples/s]

Generating test_prefs split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test_sft split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating train_gen split:   0%|          | 0/61135 [00:00<?, ? examples/s]

Generating test_gen split:   0%|          | 0/1000 [00:00<?, ? examples/s]

  self.pid = os.fork()


Map (num_proc=12):   0%|          | 0/61135 [00:00<?, ? examples/s]

No chat template is set for this tokenizer, falling back to a default class-level template. This is very error-prone, because models are often trained with templates different from the class default! Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which point any code depending on them will stop working. We recommend setting a valid chat template before then to ensure that this model continues working without issues.
No chat template is set for this tokenizer, falling back to a default class-level template. This is very error-prone, because models are often trained with templates different from the class default! Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which point any code depending on them will stop working. We recommend setting a valid chat template before then to ensure that this model continues working without issues.
No chat template is set for this tokenizer, falling back to a default clas

Map (num_proc=12):   0%|          | 0/2000 [00:00<?, ? examples/s]

No chat template is set for this tokenizer, falling back to a default class-level template. This is very error-prone, because models are often trained with templates different from the class default! Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which point any code depending on them will stop working. We recommend setting a valid chat template before then to ensure that this model continues working without issues.
No chat template is set for this tokenizer, falling back to a default class-level template. This is very error-prone, because models are often trained with templates different from the class default! Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which point any code depending on them will stop working. We recommend setting a valid chat template before then to ensure that this model continues working without issues.
No chat template is set for this tokenizer, falling back to a default clas

[Dataset({
    features: ['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected'],
    num_rows: 61135
}), Dataset({
    features: ['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected'],
    num_rows: 2000
})]


Load the model and prepare it for QLoRA fine-tuning.

In [None]:
# 4-bit NF4 quantization with double quantization; computations run in the
# dtype selected for this GPU (bfloat16 or float16).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch_dtype,
)

# Load the quantized base model entirely onto GPU 0 with the attention
# implementation chosen earlier.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=torch_dtype,
    attn_implementation=attn_implementation,
    device_map={"": 0},
)

# Prepare the quantized model for k-bit (QLoRA) training, using reentrant
# gradient checkpointing.
model = prepare_model_for_kbit_training(
    model,
    gradient_checkpointing_kwargs={'use_reentrant':True},
)

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

Define the configuration of LoRA

In [None]:
# LoRA adapter settings: rank 16, alpha 16, 5% dropout, no bias training.
# Adapters are attached to the attention projections (q/k/v/o) and to the
# MLP projections (gate/up/down).
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        'k_proj', 'q_proj', 'v_proj', 'o_proj',
        "gate_proj", "down_proj", "up_proj",
    ],
)

For this tutorial, I trained for only 100 steps.
If you want to speed up training, disable the evaluation. It takes around 1.5 hours to evaluate a checkpoint on the test split.

In [None]:
# SimPO is run through TRL's CPO trainer by selecting loss_type="simpo".
simpo_config = CPOConfig(
    output_dir="./results/",
    # Evaluate, log and checkpoint on the same 20-step cadence.
    eval_strategy="steps",
    do_eval=True,
    eval_steps=20,
    logging_steps=20,
    # The original used save_strategy='epoch', which made save_steps a dead
    # setting: an epoch is never completed within max_steps=100, so the only
    # checkpoint was the one at the very end of the run. Saving by steps
    # protects a multi-hour run against interruption.
    save_strategy="steps",
    save_steps=20,
    log_level="debug",
    # Per-device batch of 2 with 4 accumulation steps -> total batch size of 8.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=2,
    optim="paged_adamw_8bit",
    learning_rate=8e-6,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    # Short demonstration run; max_steps overrides num_train_epochs.
    max_steps=100,
    # SimPO-specific loss settings.
    loss_type="simpo",
    beta=2.0,
    simpo_gamma=1.0,
    max_length=1024,
)

# The trainer receives the quantized base model together with the LoRA config.
trainer = CPOTrainer(
    model=model,
    train_dataset=dataset[0],
    eval_dataset=dataset[1],
    peft_config=peft_config,
    args=simpo_config,
    tokenizer=tokenizer,
)

trainer.train()



Map:   0%|          | 0/61135 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
Currently training with a batch size of: 2
***** Running training *****
  Num examples = 61,135
  Num Epochs = 1
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 4
  Total optimization steps = 100
  Number of trainable parameters = 41,943,040
The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss,Runtime,Samples Per Second,Steps Per Second,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/rejected,Logps/chosen,Logits/rejected,Logits/chosen,Nll Loss
20,1.2933,1.263599,5749.426,0.348,0.174,-2.219565,-2.371975,0.562,0.15241,-1.185988,-1.109783,-0.670006,-0.711723,0.0
40,1.3041,1.262629,5736.5531,0.349,0.174,-2.235647,-2.39075,0.5645,0.155102,-1.195375,-1.117823,-0.669863,-0.711667,0.0
60,1.3074,1.26151,5745.7899,0.348,0.174,-2.256136,-2.414593,0.5615,0.158457,-1.207297,-1.128068,-0.669639,-0.711453,0.0
80,1.1908,1.260094,5757.7054,0.347,0.174,-2.290536,-2.454347,0.566,0.163811,-1.227174,-1.145268,-0.669392,-0.711321,0.0


***** Running Evaluation *****
  Num examples = 2000
  Batch size = 2
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 2
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 2
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 2


Step,Training Loss,Validation Loss,Runtime,Samples Per Second,Steps Per Second,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/rejected,Logps/chosen,Logits/rejected,Logits/chosen,Nll Loss
20,1.2933,1.263599,5749.426,0.348,0.174,-2.219565,-2.371975,0.562,0.15241,-1.185988,-1.109783,-0.670006,-0.711723,0.0
40,1.3041,1.262629,5736.5531,0.349,0.174,-2.235647,-2.39075,0.5645,0.155102,-1.195375,-1.117823,-0.669863,-0.711667,0.0
60,1.3074,1.26151,5745.7899,0.348,0.174,-2.256136,-2.414593,0.5615,0.158457,-1.207297,-1.128068,-0.669639,-0.711453,0.0
80,1.1908,1.260094,5757.7054,0.347,0.174,-2.290536,-2.454347,0.566,0.163811,-1.227174,-1.145268,-0.669392,-0.711321,0.0
100,1.2716,1.259575,5756.7538,0.347,0.174,-2.305536,-2.471574,0.5675,0.166038,-1.235787,-1.152768,-0.669053,-0.711017,0.0


***** Running Evaluation *****
  Num examples = 2000
  Batch size = 2
Saving model checkpoint to ./results/checkpoint-100
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B/snapshots/62bd457b6fe961a42a631306577e622c83876cb6/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.41.2",
  "use_cache": true,
  "vocab_size": 128256
}

t

TrainOutput(global_step=100, training_loss=1.273437614440918, metrics={'train_runtime': 35198.5631, 'train_samples_per_second': 0.023, 'train_steps_per_second': 0.003, 'total_flos': 0.0, 'train_loss': 1.273437614440918, 'epoch': 0.01308557969118032})

# CPO Training (for comparison)

In [None]:
import torch, multiprocessing
from datasets import load_dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig
)
from trl import CPOTrainer, CPOConfig

major_version, minor_version = torch.cuda.get_device_capability()
if major_version >= 8:
  !pip install flash-attn
  torch_dtype = torch.bfloat16
  attn_implementation='flash_attention_2'
  print("Your GPU is compatible with FlashAttention and bfloat16.")
else:
  torch_dtype = torch.float16
  attn_implementation='eager'
  print("Your GPU is not compatible with FlashAttention and bfloat16.")

model_name = "meta-llama/Meta-Llama-3-8B"
#Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = "<|eot_id|>"
tokenizer.pad_token_id = 128009
tokenizer.padding_side = 'left'

# Load the train and test preference splits; passing a list of split names
# returns a list of datasets in the same order.
dataset = load_dataset("HuggingFaceH4/ultrafeedback_binarized", split=["train_prefs","test_prefs"])

# This tokenizer has no chat template of its own. Older Transformers releases
# silently fell back to a class-level default, but that fallback is announced as
# removed in Transformers v4.43. Set the template explicitly so the cell keeps working.
# NOTE(review): this is the ChatML template, believed to be the class-level default
# the fallback used — confirm it matches the formatting you want to train on.
if tokenizer.chat_template is None:
    tokenizer.chat_template = (
        "{% for message in messages %}"
        "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
        "{% endfor %}"
        "{% if add_generation_prompt %}"
        "{{ '<|im_start|>assistant\n' }}"
        "{% endif %}"
    )

def process(row):
    """Render the 'chosen' and 'rejected' conversations of one example as strings.

    Each conversation is formatted with the tokenizer's chat template (without
    tokenizing) and terminated with the tokenizer's EOS token.

    NOTE(review): 'chosen'/'rejected' appear to contain the whole conversation,
    prompt included, while the separate 'prompt' column is left untouched —
    confirm the trainer does not end up seeing the prompt twice.
    """
    row["chosen"] = tokenizer.apply_chat_template(row["chosen"], tokenize=False)+tokenizer.eos_token
    row["rejected"] = tokenizer.apply_chat_template(row["rejected"], tokenize=False)+tokenizer.eos_token
    return row

# Format both splits the same way, in parallel across all CPU cores and
# bypassing any previously cached result.
dataset = [
    split.map(
        process,
        num_proc=multiprocessing.cpu_count(),
        load_from_cache_file=False,
    )
    for split in dataset
]

print(dataset)

# 4-bit NF4 quantization with double quantization; computations run in the
# dtype selected for this GPU (bfloat16 or float16).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch_dtype,
)

# Load the quantized base model entirely onto GPU 0 with the attention
# implementation chosen earlier.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=torch_dtype,
    attn_implementation=attn_implementation,
    device_map={"": 0},
)

# Prepare the quantized model for k-bit (QLoRA) training, using reentrant
# gradient checkpointing.
model = prepare_model_for_kbit_training(
    model,
    gradient_checkpointing_kwargs={'use_reentrant':True},
)

# LoRA adapter settings: rank 16, alpha 16, 5% dropout, no bias training.
# Adapters are attached to the attention projections (q/k/v/o) and to the
# MLP projections (gate/up/down).
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        'k_proj', 'q_proj', 'v_proj', 'o_proj',
        "gate_proj", "down_proj", "up_proj",
    ],
)

# CPO run for comparison with SimPO. loss_type is left at the CPOConfig default,
# so the variable is named cpo_config rather than simpo_config.
cpo_config = CPOConfig(
    # Dedicated output directory: the SimPO run writes ./results/checkpoint-100,
    # and reusing "./results/" here would overwrite that checkpoint.
    output_dir="./results_cpo/",
    # Evaluate, log and checkpoint on the same 20-step cadence.
    eval_strategy="steps",
    do_eval=True,
    eval_steps=20,
    logging_steps=20,
    # The original used save_strategy='epoch', which made save_steps a dead
    # setting: an epoch is never completed within max_steps=100, so the only
    # checkpoint was the one at the very end of the run. Saving by steps
    # protects a multi-hour run against interruption.
    save_strategy="steps",
    save_steps=20,
    log_level="debug",
    # Per-device batch of 2 with 4 accumulation steps -> total batch size of 8.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=2,
    optim="paged_adamw_8bit",
    learning_rate=8e-6,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    # Short demonstration run; max_steps overrides num_train_epochs.
    max_steps=100,
    beta=0.1,
    max_length=1024,
)

# The trainer receives the quantized base model together with the LoRA config.
trainer = CPOTrainer(
    model=model,
    train_dataset=dataset[0],
    eval_dataset=dataset[1],
    peft_config=peft_config,
    args=cpo_config,
    tokenizer=tokenizer,
)

trainer.train()



Collecting flash-attn
  Downloading flash_attn-2.5.9.post1.tar.gz (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting einops (from flash-attn)
  Downloading einops-0.8.0-py3-none-any.whl (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: flash-attn
  Building wheel for flash-attn (setup.py) ... [?25l[?25hdone
  Created wheel for flash-attn: filename=flash_attn-2.5.9.post1-cp310-cp310-linux_x86_64.whl size=120889689 sha256=5022ba11d48bf74926da9c16260f4ea2b9bb7f4e29bdb4bd6e1383ad1c55d16f
  Stored in directory: /root/.cache/pip/wheels/cc/ad/f6/7ccf0238790d6346e9fe622923a76ec218e890d356b9a2754a
Successfully built flash-attn
Installing collected packages: einops, flash-attn
Successfully installed einops-0.8.0 flash-attn-2.5.9.post

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading readme:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/226M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/226M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.29M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.72M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/184M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.02M [00:00<?, ?B/s]

Generating train_prefs split:   0%|          | 0/61135 [00:00<?, ? examples/s]

Generating train_sft split:   0%|          | 0/61135 [00:00<?, ? examples/s]

Generating test_prefs split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test_sft split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating train_gen split:   0%|          | 0/61135 [00:00<?, ? examples/s]

Generating test_gen split:   0%|          | 0/1000 [00:00<?, ? examples/s]

  self.pid = os.fork()


Map (num_proc=12):   0%|          | 0/61135 [00:00<?, ? examples/s]

No chat template is set for this tokenizer, falling back to a default class-level template. This is very error-prone, because models are often trained with templates different from the class default! Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which point any code depending on them will stop working. We recommend setting a valid chat template before then to ensure that this model continues working without issues.
No chat template is set for this tokenizer, falling back to a default class-level template. This is very error-prone, because models are often trained with templates different from the class default! Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which point any code depending on them will stop working. We recommend setting a valid chat template before then to ensure that this model continues working without issues.
No chat template is set for this tokenizer, falling back to a default clas

Map (num_proc=12):   0%|          | 0/2000 [00:00<?, ? examples/s]

No chat template is set for this tokenizer, falling back to a default class-level template. This is very error-prone, because models are often trained with templates different from the class default! Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which point any code depending on them will stop working. We recommend setting a valid chat template before then to ensure that this model continues working without issues.
No chat template is set for this tokenizer, falling back to a default class-level template. This is very error-prone, because models are often trained with templates different from the class default! Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which point any code depending on them will stop working. We recommend setting a valid chat template before then to ensure that this model continues working without issues.
No chat template is set for this tokenizer, falling back to a default clas

[Dataset({
    features: ['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected'],
    num_rows: 61135
}), Dataset({
    features: ['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected'],
    num_rows: 2000
})]


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]



Map:   0%|          | 0/61135 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
Currently training with a batch size of: 2
***** Running training *****
  Num examples = 61,135
  Num Epochs = 1
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 4
  Total optimization steps = 100
  Number of trainable parameters = 41,943,040
The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss,Runtime,Samples Per Second,Steps Per Second,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/rejected,Logps/chosen,Logits/rejected,Logits/chosen,Nll Loss
20,11.0905,10.136969,5831.0879,0.343,0.171,-46.446243,-43.978619,0.461,-2.467623,-439.786163,-464.462372,-0.669543,-0.711315,1.070068
40,9.4394,10.096896,5832.5023,0.343,0.171,-46.190639,-43.740982,0.459,-2.44966,-437.409882,-461.906433,-0.669013,-0.710837,1.062944
60,10.1634,10.059162,5815.7763,0.344,0.172,-45.984196,-43.554031,0.4625,-2.430158,-435.540344,-459.841919,-0.668836,-0.710752,1.057197
80,10.6297,10.032743,5815.4878,0.344,0.172,-45.843647,-43.427124,0.46,-2.41652,-434.271271,-458.436432,-0.669378,-0.711453,1.053311


***** Running Evaluation *****
  Num examples = 2000
  Batch size = 2
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 2
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 2
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 2
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 2


Step,Training Loss,Validation Loss,Runtime,Samples Per Second,Steps Per Second,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/rejected,Logps/chosen,Logits/rejected,Logits/chosen,Nll Loss
20,11.0905,10.136969,5831.0879,0.343,0.171,-46.446243,-43.978619,0.461,-2.467623,-439.786163,-464.462372,-0.669543,-0.711315,1.070068
40,9.4394,10.096896,5832.5023,0.343,0.171,-46.190639,-43.740982,0.459,-2.44966,-437.409882,-461.906433,-0.669013,-0.710837,1.062944
60,10.1634,10.059162,5815.7763,0.344,0.172,-45.984196,-43.554031,0.4625,-2.430158,-435.540344,-459.841919,-0.668836,-0.710752,1.057197
80,10.6297,10.032743,5815.4878,0.344,0.172,-45.843647,-43.427124,0.46,-2.41652,-434.271271,-458.436432,-0.669378,-0.711453,1.053311
100,8.939,10.021013,5821.9832,0.344,0.172,-45.779762,-43.368889,0.4605,-2.410867,-433.688904,-457.797577,-0.669243,-0.711407,1.051529


Saving model checkpoint to ./results/checkpoint-100
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B/snapshots/62bd457b6fe961a42a631306577e622c83876cb6/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.41.2",
  "use_cache": true,
  "vocab_size": 128256
}

tokenizer config file saved in ./results/checkpoint-100/tokenizer_confi

TrainOutput(global_step=100, training_loss=10.052413635253906, metrics={'train_runtime': 35648.6536, 'train_samples_per_second': 0.022, 'train_steps_per_second': 0.003, 'total_flos': 0.0, 'train_loss': 10.052413635253906, 'epoch': 0.01308557969118032})