In [1]:
from datasets import Dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer, GenerationConfig

  from .autonotebook import tqdm as notebook_tqdm


bin c:\tech\Anaconda3\envs\nlp\lib\site-packages\bitsandbytes\libbitsandbytes_cuda121.dll


In [2]:
# 定义常用的变量
DATA_PATH = "./dataset/train/lora/huanhuan.json"
MODEL_PATH = r"G:\code\pretrain_model_dir\Qwen1.5-0.5B-Chat"

In [4]:
# 将JSON文件转换为CSV文件
df = pd.read_json(DATA_PATH)
ds = Dataset.from_pandas(df)
ds[:3]

{'instruction': ['小姐，别的秀女都在求中选，唯有咱们小姐想被撂牌子，菩萨一定记得真真儿的——',
  '这个温太医啊，也是古怪，谁不知太医不得皇命不能为皇族以外的人请脉诊病，他倒好，十天半月便往咱们府里跑。',
  '嬛妹妹，刚刚我去府上请脉，听甄伯母说你来这里进香了。'],
 'input': ['', '', ''],
 'output': ['嘘——都说许愿说破是不灵的。', '你们俩话太多了，我该和温太医要一剂药，好好治治你们。', '出来走走，也是散心。']}

In [5]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=False, trust_remote_code=True)
tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Qwen2Tokenizer(name_or_path='G:\code\pretrain_model_dir\Qwen1.5-0.5B-Chat', vocab_size=151643, model_max_length=32768, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|endoftext|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>']}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [6]:
def process_func(example):
    MAX_LENGTH = 384    # Llama分词器会将一个中文字切分为多个token，因此需要放开一些最大长度，保证数据的完整性
    input_ids, attention_mask, labels = [], [], []
    instruction = tokenizer(f"<|im_start|>system\n现在你要扮演皇帝身边的女人--甄嬛<|im_end|>\n<|im_start|>user\n{example['instruction'] + example['input']}<|im_end|>\n<|im_start|>assistant\n", add_special_tokens=False)  # add_special_tokens 不在开头加 special_tokens
    response = tokenizer(f"{example['output']}", add_special_tokens=False)
    input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
    attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1]  # 因为eos token咱们也是要关注的所以 补充为1
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]  
    if len(input_ids) > MAX_LENGTH:  # 做一个截断
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

In [7]:
tokenized_id = ds.map(process_func, remove_columns=ds.column_names)
tokenized_id

                                                                

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 3729
})

In [8]:
tokenizer.decode(tokenized_id[0]['input_ids'])

'<|im_start|>system\n现在你要扮演皇帝身边的女人--甄嬛<|im_end|>\n<|im_start|>user\n小姐，别的秀女都在求中选，唯有咱们小姐想被撂牌子，菩萨一定记得真真儿的——<|im_end|>\n<|im_start|>assistant\n嘘——都说许愿说破是不灵的。<|endoftext|>'

In [9]:
tokenizer.decode(list(filter(lambda x: x != -100, tokenized_id[1]["labels"])))

'你们俩话太多了，我该和温太医要一剂药，好好治治你们。<|endoftext|>'

# 创建模型

In [10]:
import torch

model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map="auto",torch_dtype=torch.bfloat16)
model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1024)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm()
        (post_attention_layernorm): Qwen2RMSNorm()
      )
    )
    (norm): Qwen2RMSNorm()
  )
  (lm_head): Linear(i

In [11]:
model.enable_input_require_grads() # 开启梯度检查点时，要执行该方法

In [13]:
model.dtype, model.device

(torch.bfloat16, device(type='cuda', index=0))

# lora

In [14]:
from peft import LoraConfig, TaskType, get_peft_model

config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, 
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    inference_mode=False, # 训练模式
    r=8, # Lora 秩
    lora_alpha=32, # Lora alaph，具体作用参见 Lora 原理
    lora_dropout=0.1# Dropout 比例
)
config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, inference_mode=False, r=8, target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'], lora_alpha=32, lora_dropout=0.1, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None)

In [None]:
model = get_peft_model(model, config)
type(model)

In [16]:
model.print_trainable_parameters()

trainable params: 3,784,704 || all params: 467,772,416 || trainable%: 0.8090908891900116


# 训练

In [18]:
args = TrainingArguments(
    output_dir="./output/Qwen1.5",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    logging_steps=10,
    num_train_epochs=3,
    save_steps=100,
    learning_rate=1e-4,
    save_on_each_node=True,
    gradient_checkpointing=True
)

In [19]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_id,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
)

In [20]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mtobefan[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/699 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  1%|▏         | 10/699 [00:10<10:32,  1.09it/s]

{'loss': 4.4813, 'learning_rate': 9.856938483547926e-05, 'epoch': 0.04}


  3%|▎         | 20/699 [00:19<10:41,  1.06it/s]

{'loss': 4.0578, 'learning_rate': 9.713876967095852e-05, 'epoch': 0.09}


  4%|▍         | 30/699 [00:28<10:17,  1.08it/s]

{'loss': 4.0763, 'learning_rate': 9.570815450643778e-05, 'epoch': 0.13}


  6%|▌         | 40/699 [00:38<10:06,  1.09it/s]

{'loss': 3.8338, 'learning_rate': 9.427753934191703e-05, 'epoch': 0.17}


  7%|▋         | 50/699 [00:47<09:38,  1.12it/s]

{'loss': 3.9183, 'learning_rate': 9.284692417739628e-05, 'epoch': 0.21}


  9%|▊         | 60/699 [00:56<09:48,  1.09it/s]

{'loss': 3.8407, 'learning_rate': 9.141630901287554e-05, 'epoch': 0.26}


 10%|█         | 70/699 [01:05<09:24,  1.12it/s]

{'loss': 3.8139, 'learning_rate': 8.99856938483548e-05, 'epoch': 0.3}


 11%|█▏        | 80/699 [01:14<09:10,  1.12it/s]

{'loss': 3.8395, 'learning_rate': 8.855507868383405e-05, 'epoch': 0.34}


 13%|█▎        | 90/699 [01:23<09:15,  1.10it/s]

{'loss': 3.8686, 'learning_rate': 8.71244635193133e-05, 'epoch': 0.39}


 14%|█▍        | 100/699 [01:32<09:13,  1.08it/s]

{'loss': 3.7141, 'learning_rate': 8.569384835479256e-05, 'epoch': 0.43}


 16%|█▌        | 110/699 [01:41<09:02,  1.09it/s]

{'loss': 3.6508, 'learning_rate': 8.426323319027182e-05, 'epoch': 0.47}


 17%|█▋        | 120/699 [01:50<08:34,  1.13it/s]

{'loss': 3.6688, 'learning_rate': 8.283261802575107e-05, 'epoch': 0.51}


 19%|█▊        | 130/699 [01:59<08:26,  1.12it/s]

{'loss': 3.7636, 'learning_rate': 8.140200286123033e-05, 'epoch': 0.56}


 20%|██        | 140/699 [02:08<08:17,  1.12it/s]

{'loss': 3.6904, 'learning_rate': 7.997138769670959e-05, 'epoch': 0.6}


 21%|██▏       | 150/699 [02:17<08:05,  1.13it/s]

{'loss': 3.6152, 'learning_rate': 7.854077253218884e-05, 'epoch': 0.64}


 23%|██▎       | 160/699 [02:26<08:09,  1.10it/s]

{'loss': 3.5585, 'learning_rate': 7.71101573676681e-05, 'epoch': 0.69}


 24%|██▍       | 170/699 [02:35<07:49,  1.13it/s]

{'loss': 3.5242, 'learning_rate': 7.567954220314736e-05, 'epoch': 0.73}


 26%|██▌       | 180/699 [02:44<07:44,  1.12it/s]

{'loss': 3.7088, 'learning_rate': 7.424892703862662e-05, 'epoch': 0.77}


 27%|██▋       | 190/699 [02:53<07:45,  1.09it/s]

{'loss': 3.62, 'learning_rate': 7.281831187410587e-05, 'epoch': 0.81}


 29%|██▊       | 200/699 [03:02<07:24,  1.12it/s]

{'loss': 3.4649, 'learning_rate': 7.138769670958512e-05, 'epoch': 0.86}


 30%|███       | 210/699 [03:11<07:10,  1.14it/s]

{'loss': 3.5763, 'learning_rate': 6.995708154506438e-05, 'epoch': 0.9}


 31%|███▏      | 220/699 [03:20<07:11,  1.11it/s]

{'loss': 3.6444, 'learning_rate': 6.852646638054364e-05, 'epoch': 0.94}


 33%|███▎      | 230/699 [03:29<06:47,  1.15it/s]

{'loss': 3.5742, 'learning_rate': 6.70958512160229e-05, 'epoch': 0.99}


 34%|███▍      | 240/699 [03:38<06:41,  1.14it/s]

{'loss': 3.4264, 'learning_rate': 6.566523605150215e-05, 'epoch': 1.03}


 36%|███▌      | 250/699 [03:46<06:30,  1.15it/s]

{'loss': 3.2534, 'learning_rate': 6.42346208869814e-05, 'epoch': 1.07}


 37%|███▋      | 260/699 [03:55<06:20,  1.15it/s]

{'loss': 3.5046, 'learning_rate': 6.280400572246066e-05, 'epoch': 1.11}


 39%|███▊      | 270/699 [04:04<06:23,  1.12it/s]

{'loss': 3.3159, 'learning_rate': 6.137339055793991e-05, 'epoch': 1.16}


 40%|████      | 280/699 [04:13<06:13,  1.12it/s]

{'loss': 3.4194, 'learning_rate': 5.9942775393419173e-05, 'epoch': 1.2}


 41%|████▏     | 290/699 [04:22<05:58,  1.14it/s]

{'loss': 3.3978, 'learning_rate': 5.8512160228898425e-05, 'epoch': 1.24}


 43%|████▎     | 300/699 [04:31<06:02,  1.10it/s]

{'loss': 3.4801, 'learning_rate': 5.7081545064377684e-05, 'epoch': 1.29}


 44%|████▍     | 310/699 [04:40<05:49,  1.11it/s]

{'loss': 3.337, 'learning_rate': 5.565092989985694e-05, 'epoch': 1.33}


 46%|████▌     | 320/699 [04:49<05:43,  1.10it/s]

{'loss': 3.4259, 'learning_rate': 5.4220314735336195e-05, 'epoch': 1.37}


 47%|████▋     | 330/699 [04:58<05:32,  1.11it/s]

{'loss': 3.3673, 'learning_rate': 5.278969957081545e-05, 'epoch': 1.41}


 49%|████▊     | 340/699 [05:07<05:16,  1.13it/s]

{'loss': 3.3474, 'learning_rate': 5.135908440629471e-05, 'epoch': 1.46}


 50%|█████     | 350/699 [05:16<05:08,  1.13it/s]

{'loss': 3.2567, 'learning_rate': 4.992846924177397e-05, 'epoch': 1.5}


 52%|█████▏    | 360/699 [05:25<05:00,  1.13it/s]

{'loss': 3.3697, 'learning_rate': 4.8497854077253216e-05, 'epoch': 1.54}


 53%|█████▎    | 370/699 [05:33<04:51,  1.13it/s]

{'loss': 3.2742, 'learning_rate': 4.7067238912732475e-05, 'epoch': 1.59}


 54%|█████▍    | 380/699 [05:42<04:44,  1.12it/s]

{'loss': 3.3851, 'learning_rate': 4.563662374821173e-05, 'epoch': 1.63}


 56%|█████▌    | 390/699 [05:51<04:34,  1.12it/s]

{'loss': 3.4438, 'learning_rate': 4.420600858369099e-05, 'epoch': 1.67}


 57%|█████▋    | 400/699 [06:00<04:25,  1.12it/s]

{'loss': 3.2877, 'learning_rate': 4.2775393419170244e-05, 'epoch': 1.71}


 59%|█████▊    | 410/699 [06:09<04:13,  1.14it/s]

{'loss': 3.1964, 'learning_rate': 4.13447782546495e-05, 'epoch': 1.76}


 60%|██████    | 420/699 [06:18<04:04,  1.14it/s]

{'loss': 3.4658, 'learning_rate': 3.991416309012876e-05, 'epoch': 1.8}


 62%|██████▏   | 430/699 [06:27<04:00,  1.12it/s]

{'loss': 3.4674, 'learning_rate': 3.848354792560801e-05, 'epoch': 1.84}


 63%|██████▎   | 440/699 [06:36<03:50,  1.12it/s]

{'loss': 3.4198, 'learning_rate': 3.7052932761087265e-05, 'epoch': 1.89}


 64%|██████▍   | 450/699 [06:45<03:40,  1.13it/s]

{'loss': 3.1394, 'learning_rate': 3.5622317596566524e-05, 'epoch': 1.93}


 66%|██████▌   | 460/699 [06:54<03:34,  1.11it/s]

{'loss': 3.4353, 'learning_rate': 3.419170243204578e-05, 'epoch': 1.97}


 67%|██████▋   | 470/699 [07:03<03:24,  1.12it/s]

{'loss': 3.3342, 'learning_rate': 3.2761087267525034e-05, 'epoch': 2.02}


 69%|██████▊   | 480/699 [07:12<03:14,  1.13it/s]

{'loss': 3.1859, 'learning_rate': 3.133047210300429e-05, 'epoch': 2.06}


 70%|███████   | 490/699 [07:21<03:07,  1.12it/s]

{'loss': 3.2376, 'learning_rate': 2.9899856938483552e-05, 'epoch': 2.1}


 72%|███████▏  | 500/699 [07:30<02:57,  1.12it/s]

{'loss': 3.1025, 'learning_rate': 2.8469241773962807e-05, 'epoch': 2.14}


 73%|███████▎  | 510/699 [07:39<02:46,  1.14it/s]

{'loss': 3.1925, 'learning_rate': 2.7038626609442062e-05, 'epoch': 2.19}


 74%|███████▍  | 520/699 [07:48<02:39,  1.13it/s]

{'loss': 3.1008, 'learning_rate': 2.5608011444921314e-05, 'epoch': 2.23}


 76%|███████▌  | 530/699 [07:56<02:28,  1.14it/s]

{'loss': 3.1454, 'learning_rate': 2.4177396280400573e-05, 'epoch': 2.27}


 77%|███████▋  | 540/699 [08:05<02:21,  1.13it/s]

{'loss': 3.286, 'learning_rate': 2.2746781115879828e-05, 'epoch': 2.32}


 79%|███████▊  | 550/699 [08:14<02:11,  1.13it/s]

{'loss': 3.2454, 'learning_rate': 2.1316165951359084e-05, 'epoch': 2.36}


 80%|████████  | 560/699 [08:23<02:06,  1.10it/s]

{'loss': 3.1092, 'learning_rate': 1.9885550786838342e-05, 'epoch': 2.4}


 82%|████████▏ | 570/699 [08:32<01:56,  1.11it/s]

{'loss': 3.1066, 'learning_rate': 1.8454935622317597e-05, 'epoch': 2.44}


 83%|████████▎ | 580/699 [08:41<01:47,  1.11it/s]

{'loss': 3.1139, 'learning_rate': 1.7024320457796853e-05, 'epoch': 2.49}


 84%|████████▍ | 590/699 [08:50<01:38,  1.11it/s]

{'loss': 3.2655, 'learning_rate': 1.5593705293276108e-05, 'epoch': 2.53}


 86%|████████▌ | 600/699 [08:59<01:27,  1.13it/s]

{'loss': 3.1032, 'learning_rate': 1.4163090128755365e-05, 'epoch': 2.57}


 87%|████████▋ | 610/699 [09:08<01:19,  1.12it/s]

{'loss': 3.1583, 'learning_rate': 1.2732474964234622e-05, 'epoch': 2.62}


 89%|████████▊ | 620/699 [09:17<01:10,  1.12it/s]

{'loss': 3.2058, 'learning_rate': 1.1301859799713877e-05, 'epoch': 2.66}


 90%|█████████ | 630/699 [09:26<01:01,  1.12it/s]

{'loss': 3.2081, 'learning_rate': 9.871244635193133e-06, 'epoch': 2.7}


 92%|█████████▏| 640/699 [09:35<00:51,  1.14it/s]

{'loss': 3.1063, 'learning_rate': 8.44062947067239e-06, 'epoch': 2.74}


 93%|█████████▎| 650/699 [09:44<00:43,  1.13it/s]

{'loss': 3.1548, 'learning_rate': 7.010014306151645e-06, 'epoch': 2.79}


 94%|█████████▍| 660/699 [09:53<00:35,  1.10it/s]

{'loss': 3.1307, 'learning_rate': 5.579399141630902e-06, 'epoch': 2.83}


 96%|█████████▌| 670/699 [10:02<00:25,  1.13it/s]

{'loss': 3.177, 'learning_rate': 4.148783977110158e-06, 'epoch': 2.87}


 97%|█████████▋| 680/699 [10:11<00:17,  1.12it/s]

{'loss': 3.0788, 'learning_rate': 2.7181688125894134e-06, 'epoch': 2.92}


 99%|█████████▊| 690/699 [10:20<00:07,  1.13it/s]

{'loss': 3.0973, 'learning_rate': 1.2875536480686696e-06, 'epoch': 2.96}


100%|██████████| 699/699 [10:28<00:00,  1.11it/s]

{'train_runtime': 645.2653, 'train_samples_per_second': 17.337, 'train_steps_per_second': 1.083, 'train_loss': 3.42630316498283, 'epoch': 3.0}





TrainOutput(global_step=699, training_loss=3.42630316498283, metrics={'train_runtime': 645.2653, 'train_samples_per_second': 17.337, 'train_steps_per_second': 1.083, 'train_loss': 3.42630316498283, 'epoch': 3.0})

# 加载训练好的lora模型

In [34]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from peft import PeftModel

mode_path = MODEL_PATH
lora_path = "./output/Qwen1.5/checkpoint-600/"

# 加载tokenizer
tokenizer = AutoTokenizer.from_pretrained(mode_path)

# 加载模型
model = AutoModelForCausalLM.from_pretrained(mode_path, device_map="auto",torch_dtype=torch.bfloat16)

# 加载lora权重
model = PeftModel.from_pretrained(model, model_id=lora_path, config=config)

prompt = "你是谁？"
messages = [
    {"role": "system", "content": "现在你要扮演皇帝身边的女人--甄嬛"},
    {"role": "user", "content": prompt}
]

text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

model_inputs = tokenizer([text], return_tensors="pt").to('cuda')

generated_ids = model.generate(
    inputs=model_inputs.input_ids,
    max_new_tokens=512
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

print(response)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


我是甄嬛，华妃的内务宫太监。在《红楼梦》中，我因机智、温柔、仁慈而受到众人喜爱。


In [27]:
def single_chat(prompt):
    messages = [
        {"role": "system", "content": "现在你要扮演皇帝身边的女人--甄嬛"},
        {"role": "user", "content": prompt}
    ]

    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    model_inputs = tokenizer([text], return_tensors="pt").to('cuda')

    generated_ids = model.generate(
        inputs=model_inputs.input_ids,
        max_new_tokens=512
    )
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    print(response)

In [35]:
single_chat("小姐，别的秀女都在求中选，唯有咱们小姐想被撂牌子，菩萨一定记得真真儿的")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


小姐，请问您是哪里人呢？我来自紫禁城，我是甄嬛的丫鬟，本不想闹事，只是想求得您的宽恕。
