In [1]:
import copy
import os
import tempfile
# import unittest

import numpy as np
# import pytest
import torch
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, AutoConfig, LlamaConfig, LlamaForCausalLM

from trl import SFTTrainer
from trl.import_utils import is_peft_available
from trl.trainer import ConstantLengthDataset, DataCollatorForCompletionOnlyLM


# Directory later handed to TrainingArguments as `output_dir`.
tmp_dir = "./temp_dir"

# Alternative checkpoint (currently disabled):
#   "trl-internal-testing/dummy-GPT2-correct-vocab"
model_id = "HuggingFaceM4/tiny-random-LlamaForCausalLM"
model = AutoModelForCausalLM.from_pretrained(model_id)

# Show the module tree first, then the configuration it was built from.
print(model, model.config, sep="\n")

  from pandas.core import (


In [2]:
# Load the tokenizer published alongside `model_id`.
# NOTE(review): `add_special_tokens=False` is forwarded to `from_pretrained`
# as an extra keyword; whether it changes later `tokenizer(...)` calls depends
# on the installed transformers version -- confirm.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    add_special_tokens=False,
)
print(tokenizer)

# 模型增加Pad Token

In [3]:
# The completion-only collator pads sequences of unequal length, which needs a
# pad token on the tokenizer. If the checkpoint does not define one, fall back
# to the unknown token so no new embedding row has to be added to the model.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.unk_token

# The pad id has to be stored on the model *config*: transformers reads
# `model.config.pad_token_id`. Assigning `model.pad_token_id` / `model.pad_token`
# directly only creates unused ad-hoc attributes on the nn.Module.
model.config.pad_token_id = tokenizer.pad_token_id

print(model)
print(tokenizer)
# Inspect how the literal string '<unk>' is tokenised without BOS/EOS.
print(tokenizer('<unk>', add_special_tokens = False))

# 创建数据集

## 单轮数据

In [4]:
# Single-turn toy corpus: seven (question, answer) pairs. The first three
# pairs occur twice, followed by one extra pair. The training text itself is
# not stored here; it is assembled from these two columns by the formatting
# functions defined in a later cell.
qa_pairs = [
    ("Does llamas know how to code?", "Yes, llamas are very good at coding."),
    ("Does llamas know how to fly?", "No, llamas can't fly."),
    ("Does llamas know how to talk?", "Yes, llamas are very good at talking."),
    ("Does llamas know how to code?", "Yes, llamas are very good at coding."),
    ("Does llamas know how to fly?", "No, llamas can't fly."),
    ("Does llamas know how to talk?", "Yes, llamas are very good at talking."),
    ("Does llamas know how to swim?", "No, llamas can't swim."),
]

dummy_dataset = Dataset.from_dict(
    {
        "question": [pair[0] for pair in qa_pairs],
        "answer": [pair[1] for pair in qa_pairs],
    }
)

dummy_dataset


Dataset({
    features: ['question', 'answer'],
    num_rows: 7
})

# TODO： 多轮数据

# 创建Formatting函数

In [5]:
def formatting_prompts_func(example):
    """Render one question/answer record as a single prompt string.

    Args:
        example: Mapping with a "question" and an "answer" entry.

    Returns:
        The text "\\n###Question: <question>\\n###Answer: <answer>".
    """
    question, answer = example["question"], example["answer"]
    return f"\n###Question: {question}\n###Answer: {answer}"


def formatting_prompts_func_batched(example):
    """Render a batch of question/answer records as prompt strings.

    Args:
        example: Mapping whose "question" and "answer" entries are
            index-aligned sequences.

    Returns:
        A list with one string per question, each of the form
        " \\n###Question: <question>\\n###Answer: <answer> " (note the single
        leading and trailing space).
    """
    # Answers are looked up by position, so a shorter "answer" sequence raises
    # IndexError instead of being silently truncated.
    return [
        f" \n###Question: {question}\n###Answer: {example['answer'][idx]} "
        for idx, question in enumerate(example["question"])
    ]


# 创建collator

In [6]:
# Trainer hyper-parameters: a fixed number of optimisation steps, no evaluation.
training_args = TrainingArguments(
    output_dir=tmp_dir,
    # schedule
    max_steps=512,
    learning_rate=1e-4,
    per_device_train_batch_size=4,
    dataloader_drop_last=True,
    # logging / checkpointing / evaluation
    logging_steps=32,
    save_steps=2,
    evaluation_strategy="no",
    eval_steps=2,
)

# Markers that delimit the two halves of every formatted sample.
instruction_template = "###Question:"
assistant_template = "###Answer:"

# Tokenise both markers without special tokens so the raw ids can be inspected.
inst_tokenizerd = tokenizer(instruction_template, add_special_tokens=False)
ass_tokenizerd = tokenizer(assistant_template, add_special_tokens=False)

# The collator receives the answer-marker ids with the first id dropped.
# NOTE(review): presumably because the first piece of the marker is tokenised
# differently at the start of a string than in the middle of a sample -- confirm.
data_collator = DataCollatorForCompletionOnlyLM(
    tokenizer=tokenizer,
    response_template=ass_tokenizerd["input_ids"][1:],
    mlm=False,
)

In [7]:
# Decode individual token ids one at a time, one printed line per id.
token_ids_to_inspect = (
    835, 16492, 29901,
    835, 22550, 29901,
    22550, 673,
)
for token_id in token_ids_to_inspect:
    print(tokenizer.decode([token_id]))

# 创建 Trainer

In [8]:
# Build the SFT trainer on the raw question/answer dataset. Samples are turned
# into text by the batched formatting function and are not packed together.
sft_trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dummy_dataset,
    formatting_func=formatting_prompts_func_batched,
    data_collator=data_collator,
    packing=False,
    max_seq_length=256,  # sequence-length limit passed to the trainer
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/7 [00:00<?, ? examples/s]



# Debug Collator

In [9]:
# Show the dataset and collator the trainer ended up with.
print(sft_trainer.train_dataset)
print(sft_trainer.data_collator)

# Run the first two tokenised samples through the collator by hand.
first_ids = sft_trainer.train_dataset[0]['input_ids']
second_ids = sft_trainer.train_dataset[1]['input_ids']

batch = data_collator([first_ids, second_ids])
print(batch)

# Decode the same two samples, keeping special tokens visible.
for sample_ids in (first_ids, second_ids):
    print(tokenizer.decode(sample_ids, skip_special_tokens=False))


# sft 训练

In [10]:
# sft_trainer.train()