In [1]:
import copy
import os
import tempfile
# import unittest

import numpy as np
# import pytest
import torch
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, AutoConfig, LlamaConfig, LlamaForCausalLM

from trl import SFTTrainer
from trl.import_utils import is_peft_available
from trl.trainer import ConstantLengthDataset, DataCollatorForCompletionOnlyLM


# Scratch directory handed to the trainer as its output directory.
tmp_dir = './temp_dir'

# Start from the Llama-3-8B configuration and shrink it, so that the model
# built from it below is tiny.
model_id = 'meta-llama/Meta-Llama-3-8B'
config = AutoConfig.from_pretrained(model_id)
tiny_overrides = {
    'intermediate_size': 512,
    'hidden_size': 256,
    'num_attention_heads': 4,
    'num_key_value_heads': 4,
    'num_hidden_layers': 1,
}
for field_name, field_value in tiny_overrides.items():
    setattr(config, field_name, field_value)

# Instantiate the model from the (shrunken) config rather than from a checkpoint.
model = AutoModelForCausalLM.from_config(config)
print(model)
print(model.config)

  from pandas.core import (


In [2]:
# Load the tokenizer with left-side padding and show its initial state.
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side='left')
print(tokenizer)

# Register one of the reserved special tokens as the padding token, then
# print the tokenizer again so the change is visible.
pad_token_spec = {'pad_token': '<|reserved_special_token_0|>'}
tokenizer.add_special_tokens(pad_token_spec)
print(tokenizer)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# 模型增加Pad Token

In [3]:
# Record the padding token id on the model configuration. Assigning
# `model.pad_token_id` / `model.pad_token` directly on the module only creates
# ad-hoc attributes; the padding id that transformers consults lives on
# `model.config`.
model.config.pad_token_id = tokenizer.pad_token_id

print(model)
print(tokenizer)

# 创建数据集

## 单轮数据

In [4]:
# Single-turn toy data: three question/answer pairs repeated twice, plus one
# extra pair, for seven rows in total.
repeated_pairs = [
    ("Does llamas know how to code?", "Yes, llamas are very good at coding."),
    ("Does llamas know how to fly?", "No, llamas can't fly."),
    ("Does llamas know how to talk?", "Yes, llamas are very good at talking."),
]
qa_pairs = repeated_pairs * 2 + [
    ("Does llamas know how to swim?", "No, llamas can't swim."),
]

# Split the pairs into the two columns the formatting functions read.
dummy_dataset = Dataset.from_dict(
    {
        "question": [question for question, _ in qa_pairs],
        "answer": [answer for _, answer in qa_pairs],
    }
)

dummy_dataset


Dataset({
    features: ['question', 'answer'],
    num_rows: 7
})

# TODO： 多轮数据

# 创建Formating函数

In [5]:
def formatting_prompts_func(example):
    """Render one question/answer record as a single training string.

    Args:
        example: mapping with a ``'question'`` and an ``'answer'`` entry.

    Returns:
        The prompt text: a ``###Question:`` line, a ``###Answer:`` line and a
        trailing ``<|end_of_text|>`` marker.
    """
    template = "\n###Question: {question}\n###Answer: {answer} <|end_of_text|>"
    return template.format(question=example['question'], answer=example['answer'])


def formatting_prompts_func_batched(example):
    """Render a batch of question/answer records as training strings.

    Args:
        example: mapping whose ``'question'`` and ``'answer'`` entries are
            parallel sequences (one item per record).

    Returns:
        A list with one formatted prompt per question, in input order.
    """
    answers = example['answer']
    # Answers are looked up by position so that a missing answer still raises.
    return [
        f" \n###Question: {question}\n###Answer: {answers[position]} <|end_of_text|>"
        for position, question in enumerate(example["question"])
    ]


# 创建collator

In [6]:
# Trainer hyper-parameters for the toy run.
training_args = TrainingArguments(
    output_dir=tmp_dir,
    dataloader_drop_last=True,
    evaluation_strategy="no",
    max_steps=512,
    eval_steps=2,
    save_steps=2,
    logging_steps=32,
    per_device_train_batch_size=4,
    learning_rate=1e-4,
)

# Marker strings that delimit the question and the answer in the formatted prompts.
instruction_template = "###Question:"
assistant_template = "###Answer:"

# Tokenize the markers on their own, without BOS/EOS being added.
inst_tokenizerd = tokenizer(instruction_template, add_special_tokens=False)
ass_tokenizerd = tokenizer(assistant_template, add_special_tokens=False)

# The response template is passed as token ids, with the first id of the
# stand-alone tokenization dropped.
# `pad_to_multiple_of` expects an integer (or None); the previous value `True`
# only worked because it is treated as 1, i.e. no extra padding, so the
# argument is simply left out.
data_collator = DataCollatorForCompletionOnlyLM(
    response_template=ass_tokenizerd['input_ids'][1:],
    tokenizer=tokenizer,
    mlm=False,
)

In [7]:
# Show the tokenized answer marker, then decode its first three token ids one
# by one to see how the marker string was split.
print(ass_tokenizerd)
for position in range(3):
    print(tokenizer.decode(ass_tokenizerd['input_ids'][position]))

# 创建 Trainer

In [8]:
# Build the trainer on the single-turn dataset.
# The notebook's tokenizer is passed explicitly: without it the trainer creates
# a tokenizer of its own, which would not carry the pad token and padding side
# configured above and could differ from the one the collator uses.
sft_trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dummy_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    max_seq_length=256,  # upper bound on the tokenized length of one example
    packing=False,  # keep one example per sequence; required by the completion-only collator
    formatting_func=formatting_prompts_func_batched,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/7 [00:00<?, ? examples/s]

# Debug Collator

In [9]:
# Inspect what the trainer holds after it has processed the dataset.
print(sft_trainer.train_dataset)
print(sft_trainer.data_collator)

# Call the collator by hand on the token ids of the first two examples and
# print the batch it produces.
first_two_examples = [sft_trainer.train_dataset[row]['input_ids'] for row in range(2)]
batch = data_collator(first_two_examples)
print(batch)


In [10]:
# Attention masks of the first two tokenized examples.
for row in (0, 1):
    print(sft_trainer.train_dataset[row]['attention_mask'])

In [11]:

# Decode the first two tokenized examples back to text, keeping special tokens
# visible so the exact training string can be checked.
for row in (0, 1):
    token_ids = sft_trainer.train_dataset[row]['input_ids']
    print(tokenizer.decode(token_ids, skip_special_tokens=False))


# sft 训练

In [1]:
# sft_trainer.train()

# Llama-3 prompt template

Add the original chat template from `meta-llama/Meta-Llama-3-8B-Instruct`.

You should read the official Meta Llama 3 documentation.

In [99]:
# Reload the tokenizer for the chat-template experiments: left padding, and a
# request not to add a BOS token automatically.
chat_tokenizer_options = {'padding_side': 'left', 'add_bos_token': False}
tokenizer = AutoTokenizer.from_pretrained(model_id, **chat_tokenizer_options)
print(tokenizer)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [100]:
# Show the template the tokenizer ships with before replacing it.
print(tokenizer.chat_template)

# Install the chat template. It is written as adjacent string pieces, one per
# Jinja construct; the pieces concatenate to a single template string.
tokenizer.chat_template = (
    "{% set loop_messages = messages %}"
    "{% for message in loop_messages %}"
    # Each message becomes: header with the role, blank line, trimmed content, end-of-turn.
    "{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}"
    # The very first message is prefixed with the BOS token.
    "{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}"
    "{{ content }}"
    "{% endfor %}"
    # Optionally open an assistant turn for generation.
    "{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
)

In [101]:
# Print the chat template currently set on the tokenizer.
print(tokenizer.chat_template)

## chat-messages

不使用formatting 函数， 使用官方自带的对话模版

开发会更加简便，特别是涉及到多轮对话的情况

In [102]:
# Example conversation ending on a user turn, expressed as (role, content)
# pairs and expanded into the list-of-dicts form that chat templates consume.
conversation_turns = [
    ("system", "you are a robot"),
    ("user", "my question 1 ?"),
    ("assistant", "my answer 1"),
    ("user", "my question 2 ?"),
]
messages = [{"role": role, "content": content} for role, content in conversation_turns]

以下使用apply_chat_template方法，比较灵活的转化成string，不建议在这个阶段转成token

In [103]:
# Render the conversation to text without opening a new assistant turn.
# The result is kept in a descriptive name instead of `str`, which would
# shadow the builtin for every later cell.
prompt_without_generation = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=False
)
print(prompt_without_generation)

In [104]:
# Render the conversation to text and append the assistant header, i.e. the
# form used when asking the model to generate.
# The result is kept in a descriptive name instead of `str`, which would
# shadow the builtin for every later cell.
prompt_with_generation = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
print(prompt_with_generation)
print(type(prompt_with_generation))

观察token序列

In [105]:
# With tokenize=True the template is applied and token ids are returned
# (as a PyTorch tensor because of return_tensors='pt').
# First pass: add_generation_prompt=True, the inference form.
# Second pass: add_generation_prompt=False, the training form.
for with_generation_prompt in (True, False):
    token_id = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=with_generation_prompt,
        return_tensors='pt',
    )
    print(token_id)

In [106]:
# Tokenize each special token on its own to see the id(s) it maps to.
special_tokens_to_check = (
    '<|begin_of_text|>',
    '<|start_header_id|>',
    '<|end_header_id|>',
    '<|eot_id|>',
    '<|end_of_text|>',
)
for special_token in special_tokens_to_check:
    print(tokenizer(special_token, add_special_tokens=False))

In [107]:
reserved_pad_token = '<|reserved_special_token_0|>'

# State before: how the reserved token tokenizes, and the current pad token.
print(tokenizer(reserved_pad_token, add_special_tokens=False))
print(tokenizer.pad_token_id)
print(tokenizer.pad_token)

# Register the reserved token as the pad token, then show the state after.
tokenizer.add_special_tokens({'pad_token': reserved_pad_token})
print(tokenizer.pad_token_id)
print(tokenizer.pad_token)

# 增加对话

In [108]:
# Conversation that stops on a user turn: this form is used for generation.
open_turns = [
    ("system", "you are a robot"),
    ("user", "my question 1 ?"),
    ("assistant", "my answer 1"),
    ("user", "my question 2 ?"),
]
messages = [{"role": role, "content": content} for role, content in open_turns]
print(messages)

# Add the assistant reply in place: this form is used for training.
final_reply = {"role": "assistant", "content": "my answer 2 ?"}
messages.append(final_reply)
print(messages)

In [109]:
# Tokenize the pieces that make up a role header. The header-opening token is
# spelled `<|start_header_id|>` (as in the chat template); the previous
# `<|start_head_id|>` was a typo and is not that special token.
for header_piece in ('<|start_header_id|>', '<|end_header_id|>', 'user', 'assistant'):
    print(tokenizer(header_piece, add_special_tokens=False))

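用方法一（直接对模版字符串做 tokenize）产生的 prompt 模版容易出错：只要特殊 token 的拼写有误（例如把 `<|start_header_id|>` 写成 `<|start_head_id|>`），它就会被拆成普通 token，而不是对应的特殊 token id。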


期望的是

128006,   78191, 128007


128006,   882, 128007

In [110]:

# Method one: derive the template ids by tokenizing the header strings.
# Fixes relative to the earlier version of this cell:
#   * the opening token is `<|start_header_id|>` (not `<|start_head_id|>`);
#   * the instruction marks the *user* turn and the response marks the
#     *assistant* turn (they were swapped);
#   * `pad_to_multiple_of=True` is dropped, since the argument expects an
#     integer or None.
instruction_template = "<|start_header_id|>user<|end_header_id|>"
response_template = "<|start_header_id|>assistant<|end_header_id|>"

inst_tokenizerd = tokenizer(instruction_template, add_special_tokens=False)
response_tokenizerd = tokenizer(response_template, add_special_tokens=False)

data_collator = DataCollatorForCompletionOnlyLM(
    instruction_template=inst_tokenizerd['input_ids'],
    response_template=response_tokenizerd['input_ids'],
    tokenizer=tokenizer,
    mlm=False,
)

print(data_collator.instruction_template)
print(data_collator.response_template)

In [111]:

# Method two: give the collator the template token ids directly.
# Expected ids for the user header, `<|start_header_id|>user<|end_header_id|>`.
my_inst_tokenizerd = [128006, 882, 128007]
# Expected ids for the assistant header, `<|start_header_id|>assistant<|end_header_id|>`.
my_response_tokenizerd = [128006, 78191, 128007]

# `pad_to_multiple_of` expects an integer (or None); the previous `True` was
# treated as 1, i.e. no extra padding, so the argument is left out.
data_collator = DataCollatorForCompletionOnlyLM(
    instruction_template=my_inst_tokenizerd,
    response_template=my_response_tokenizerd,
    tokenizer=tokenizer,
    mlm=False,
)

print(data_collator.instruction_template)
print(data_collator.response_template)

# 那么我们有更加鲁棒的token模版


# 创建dataset


In [112]:
def formatting_prompts_func(example):
    """Render one conversation record to text with the tokenizer's chat template.

    Args:
        example: mapping whose ``'messages'`` entry is the conversation to render.

    Returns:
        Whatever ``tokenizer.apply_chat_template`` returns for that conversation
        when called with ``tokenize=False`` and no generation prompt.
    """
    conversation = example['messages']
    return tokenizer.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=False
    )


def formatting_prompts_func_batched(example):
    """Render a batch of conversations to text with the tokenizer's chat template.

    Args:
        example: mapping whose ``'messages'`` entry is a sequence of conversations.

    Returns:
        A list with one rendered string per conversation, in input order.
    """
    return [
        tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=False)
        for conversation in example['messages']
    ]


In [113]:
# https://huggingface.co/docs/datasets/v1.1.1/loading_datasets.html
from datasets import Dataset

# Both conversations share the same first four turns and differ only in the
# final assistant reply.
shared_turns = [
    ("system", "you are a robot"),
    ("user", "my question 1 ?"),
    ("assistant", "my answer 1"),
    ("user", "my question 2 ?"),
]
final_replies = ["hello world ", "I love large language models "]

messages_1, messages_2 = (
    [{"role": role, "content": content} for role, content in shared_turns + [("assistant", reply)]]
    for reply in final_replies
)

# One dataset row per conversation, stored under the 'messages' column.
my_dict = {'messages': [messages_1, messages_2]}
dummy_dataset = Dataset.from_dict(my_dict)
print(dummy_dataset)

In [114]:
# Make sure the tokenizer pads on the left.
tokenizer.padding_side = 'left'
# (no further tokenizer changes are made in this cell)

In [115]:
# Trainer hyper-parameters for the chat-format toy run.
# NOTE(review): the chat dataset has 2 rows while the batch size is 4 and
# `dataloader_drop_last=True`; confirm that this still yields a batch before
# actually calling `train()`.
training_args = TrainingArguments(
    output_dir=tmp_dir,
    dataloader_drop_last=True,
    evaluation_strategy="no",
    max_steps=512,
    eval_steps=2,
    save_steps=2,
    logging_steps=32,
    per_device_train_batch_size=4,
    learning_rate=1e-4,
)

# The notebook's tokenizer is passed explicitly: without it the trainer creates
# a tokenizer of its own, which would not carry the chat template, pad token
# and BOS settings configured above and could differ from the collator's.
sft_trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dummy_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    max_seq_length=256,  # upper bound on the tokenized length of one example
    packing=False,  # keep one example per sequence; required by the completion-only collator
    formatting_func=formatting_prompts_func_batched,
)

print(sft_trainer.train_dataset)
print(sft_trainer.data_collator)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [116]:
# Call the collator by hand on the token ids of the two chat examples and
# print the batch it produces.
chat_examples = [sft_trainer.train_dataset[row]['input_ids'] for row in range(2)]
batch = data_collator(chat_examples)
print(batch)

In [117]:
# Token ids of the two supervised spans of the first example
# (presumably copied by hand from the `labels` printed above; verify after re-running).
label1_1 = [271,   2465,   4320,    220,     16, 128009]
label1_2 = [271,  15339,   1917, 128009]

In [118]:
# Decode the first span back to text to check which part of the conversation it covers.
tokenizer.decode(label1_1)

'\n\nmy answer 1<|eot_id|>'

In [119]:
# Decode the second span back to text to check which part of the conversation it covers.
tokenizer.decode(label1_2)

'\n\nhello world<|eot_id|>'

In [123]:
# Token ids of the two supervised spans of the second example
# (presumably copied by hand from the `labels` printed above; verify after re-running).
label2_1 = [271,   2465,   4320, 220,     16, 128009]
label2_2 = [271,     40,   3021,   3544,   4221,   4211, 128009]

In [124]:
# Decode the first span of the second example back to text.
tokenizer.decode(label2_1)

'\n\nmy answer 1<|eot_id|>'

In [125]:
# Decode the second span of the second example back to text.
tokenizer.decode(label2_2)

'\n\nI love large language models<|eot_id|>'