# 手撕LLM实操脚本-全流程+RLHF

本实操由"小冬瓜AIGC"创建
微信：xiaodongguaAIGC

该版本涵盖：
- 医疗数据处理
- Pretrained + LoRA
- SFT + LoRA
- DPO
- Reward Model + LoRA
- RLHF PPO + LoRA
- 配备测试程序

可以在消费级笔记本电脑/Colab运行的LLaMA微调Demo

In [2]:
# 挂载colab网盘
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 配置

In [3]:
!pip3 install torch numpy evaluate tqdm
!pip3 install -q -U transformers accelerate datasets trl git+https://github.com/huggingface/peft.git
!pip3 install -q bitsandbytes sentencepiece

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m46.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m62.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m93.3 MB/s[0m eta [36m0:00:0

In [4]:
!pip3 install wandb

Collecting wandb
  Downloading wandb-0.16.4-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.42-py3-none-any.whl (195 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m195.4/195.4 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.42.0-py2.py3-none-any.whl (263 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m263.5/263.5 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
Collecting gitdb<5,>=4.0.1 (from GitPython!=3.1.29,>=1.0.0->w

In [5]:
!nvidia-smi

Mon Mar 18 15:27:25 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   45C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [6]:
# 通用库

import torch
import torch.nn as nn
# import evaluate
import numpy as np
import tqdm
import sys
from typing import Dict, Optional, Any, Dict, List, Optional, Union
from dataclasses import dataclass, field

# Huggingface Transformers系列库

from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, TrainerCallback
from transformers import AutoTokenizer, AutoConfig, DataCollatorForLanguageModeling
from transformers import GPT2Config, GPT2ForSequenceClassification, AutoModelForSequenceClassification
from transformers import PreTrainedTokenizerBase
from transformers import Adafactor, pipeline
from transformers import BitsAndBytesConfig
from transformers.utils import PaddingStrategy

from accelerate import Accelerator

from datasets import load_dataset, load_from_disk, concatenate_datasets, Dataset, DatasetDict

from peft import PeftModel, PeftConfig, LoraConfig
from peft import TaskType, get_peft_model, get_peft_config

from trl import SFTTrainer, DPOTrainer
from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer, set_seed
from trl.core import LengthSampler
from trl.trainer import ConstantLengthDataset



In [38]:
batch_size = 8
max_length = 256
max_steps = 1000
device = 'cuda:0'
# device = 'cuda'
lora_r = 8
debug_mode = False
use_pretrained_text_data = True
block_size = 256

In [39]:
import os
temp_path = './'
if os.path.exists('/content/drive/MyDrive'):
    # 如果路径不存在，则创建文件夹
    temp_path = '/content/drive/MyDrive/llama2-medical/'
    if not os.path.exists('/content/drive/MyDrive/llama2-medical'): #训练过程中所存放的网盘模型路径
      os.makedirs(temp_path)
      print("文件夹已创建")
else:
    print("使用本地路径")

In [9]:
# 模型名称
datasets_name = 'shibing624/medical'
model_pretrained_name = temp_path + 'llama2-medical-pretrained'
model_pretrained_name_full = model_pretrained_name + '-full'

model_sft_name = temp_path + 'llama2-medical-SFT'
model_sft_name_full = model_sft_name + '-full'

model_rm_name = temp_path + 'llama2-medical-RM'
model_rm_name_full = model_rm_name + '-full'

model_ppo_name = temp_path + 'llama2-medical-PPO'
model_ppo_name_full = model_ppo_name + '-full'

model_dpo_name = temp_path + 'llama2-medical-DPO'

# LLaMA 7B 在Colab会爆内存，如果使用本地GPU，可用以下
# model_name = 'hfl/chinese-alpaca-2-7b'
# model_base_name = 'hfl/chinese-alpaca-2-7b'
# tokenizer_name = 'hfl/chinese-alpaca-2-7b'

# # LLaMA 1B 使用原生LLaMA tokenizer对中文支持不友好，会添加很多额外的Token
# model_base_name = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
# tokenizer_name = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
# model_name = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'

# LLaMA 1B 使用原生LLaMA tokenizer对中文支持不友好，会添加很多额外的Token
model_base_name = 'HuggingFaceM4/tiny-random-LlamaForCausalLM'
tokenizer_name = 'HuggingFaceM4/tiny-random-LlamaForCausalLM'
model_name = 'HuggingFaceM4/tiny-random-LlamaForCausalLM'

if debug_mode:
    model_name = temp_path + './LLaMA_base_baby'

# 要在Google云盘加入文本数据
if os.path.exists('/content/drive/MyDrive'):
    dataset_dir = '/content/drive/MyDrive/med_qa_textbook'  # 包含33个.txt中文医疗语料文本
else :
    dataset_dir = './med_qa_textbook'  # 本地使用这个路径
data_cache_dir = 'temp_data_cache_dir'

In [10]:
# QLoRA
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [11]:

lora_full = ['embed_tokens', 'lm_head', 'q_proj', 'k_proj', 'v_proj', 'o_proj',
             'gate_proj','up_proj','down_proj']
lora_pretrained = ['embed_tokens', 'lm_head', 'q_proj', 'k_proj', 'v_proj', 'o_proj', 'down_proj']
lora_finetune = [ 'q_proj', 'k_proj', 'v_proj', 'o_proj', 'down_proj']

pretrained_lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    target_modules = lora_pretrained,
    modules_to_save = None,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

lm_lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    target_modules = lora_finetune,
    modules_to_save = None,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

rm_lora_config = LoraConfig(
    task_type="SEQ_CLS",
    r=8,
    target_modules = lora_finetune,
    modules_to_save = None,
    lora_alpha=32,
    lora_dropout=0.05,
    inference_mode=False,
    bias="none",
)


# 中文tokenizer

In [12]:
# 加载tokenizer
IGNORE_INDEX = -100
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_UNK_TOKEN = "<unk>"

print(tokenizer_name)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_fast = False)
tokenizer.pad_token = tokenizer.eos_token # 原始LLaMA tokenizer 没有Pad Token， 统一用eos替换
print(tokenizer)


HuggingFaceM4/tiny-random-LlamaForCausalLM


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/771 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


LlamaTokenizer(name_or_path='HuggingFaceM4/tiny-random-LlamaForCausalLM', vocab_size=32000, model_max_length=2048, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '</s>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}


In [13]:
input_string = '我是小冬瓜，爱学习计算机科学'
input_ids = tokenizer(input_string)
print(input_ids['input_ids'])
output_string = tokenizer.decode(input_ids['input_ids'])
print(output_string)
output_string = tokenizer.decode(input_ids['input_ids'][1])
print(output_string)

[1, 29871, 30672, 30392, 30446, 232, 137, 175, 234, 150, 159, 30214, 234, 139, 180, 30415, 231, 188, 163, 31466, 31565, 31429, 31030, 30415]
<s>我是小冬瓜，爱学习计算机科学



# 数据集

In [14]:
# 该数据集已经包含Pretrained、fintune、Reward数据集代码， 仅加载Reward，用于教程
# Pretrained采用加载txt的方式，通用性更好
datasets = load_dataset(datasets_name, 'reward')

Downloading data:   0%|          | 0.00/1.88M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/81.5k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3800 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

pretrain

train_encyclopedia.json: 共36万条，来自医疗百科数据FreedomIntelligence/huatuo_encyclopedia_qa , 拼接 questions 和 answers，形成 text 文本字段，语句通顺，用于预训练注入医疗知识。 medical_book_zh.json: 共8475条，来自医疗教材的文本数据，来源：https://github.com/jind11/MedQA， 原始数据集：google drive ，只对长段落切分为2048字的小段落了。

finetune

train_zh_0.json: 共195万条，来自1）中文医疗对话数据集Toyhom/Chinese-medical-dialogue-data的六个科室医疗问诊数据， 有79万条；2）在线医疗百科 huatuo_encyclopedia_qa ，有36万条；3）医疗知识图谱 huatuo_knowledge_graph_qa，有79万条。三部分合并，共195万条。 train_en_1.json：共11万条，来自英文医疗问诊对话数据Kent0n-Li/ChatDoctor，合并了HealthCareMagic-100k、GenMedGPT-5k 数据集，共11万条。

reward

train.json 共4000条，问题来自中文医疗对话数据集Toyhom/Chinese-medical-dialogue-data的随机4000条提问，response_chosen来自该数据集的医生答复， response_rejected来自本草模型SCIR-HI/Huatuo-Llama-Med-Chinese的答复。

In [15]:
print(datasets)

DatasetDict({
    train: Dataset({
        features: ['question', 'response_chosen', 'response_rejected'],
        num_rows: 3800
    })
    validation: Dataset({
        features: ['question', 'response_chosen', 'response_rejected'],
        num_rows: 100
    })
    test: Dataset({
        features: ['question', 'response_chosen', 'response_rejected'],
        num_rows: 100
    })
})


In [16]:
# 人类的回答为Chosen， 其他LLM的模型的回答作为rejected
sample_index = 5
print("Question: ", datasets['train']['question'][sample_index])
print("response_chosen: ", datasets['train']['response_chosen'][sample_index])
print("response_rejected: ",
      datasets['train']['response_rejected'][sample_index])

Question:  轻度白内障的临床表现有些什么？
response_chosen:  轻度白内障伴玻璃体混浊
response_rejected:  轻度白内障患者视力下降、眼痛等症状。


# 创建一个Baby-LLaMA(optional)

In [17]:
# 如果使用Colab或GPU算力显存充足情况， 可忽略当前步骤
# 没有GPU资源的情况，自己创建个baby-llama，参数量极少，但是需要从头开始训练
if debug_mode:
  config = AutoConfig.from_pretrained(model_base_name)
  print(config)
  config.num_attention_heads = 4
  config.num_key_value_heads = 4
  config.num_hidden_layers = 1
  config.hidden_size = 256
  config.intermediate_size = 768
  model = AutoModelForCausalLM.from_config(config)
  print(model)

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

LlamaConfig {
  "_name_or_path": "HuggingFaceM4/tiny-random-LlamaForCausalLM",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "eos_token_id": 1,
  "hidden_act": "silu",
  "hidden_size": 16,
  "initializer_range": 0.02,
  "intermediate_size": 64,
  "max_position_embeddings": 2048,
  "model_type": "llama",
  "num_attention_heads": 4,
  "num_hidden_layers": 2,
  "num_key_value_heads": 4,
  "pad_token_id": -1,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.38.2",
  "use_cache": true,
  "vocab_size": 32000
}

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 256, padding_idx=31999)
    (layers): ModuleList(
      (0): LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=256, out_features=256, bias=False

In [18]:
# 保存成base mode，从头训练
if debug_mode:
    model.save_pretrained(model_name)
    tokenizer.save_pretrained(model_name)

# Pretrained训练

## 创建Pretrained数据集

In [19]:
def prepare_data_pretrained(example):
    example[
        'question'] = f"{example['question']}{example['response_rejected']}{tokenizer.eos_token}"
    example['question'] = example['question'][:max_length]  #最大长度 128
    example = tokenizer(example['question'])
    return example

datasets_pretrained = datasets.map(prepare_data_pretrained)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
print(datasets_pretrained)

Map:   0%|          | 0/3800 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'response_chosen', 'response_rejected', 'input_ids', 'attention_mask'],
        num_rows: 3800
    })
    validation: Dataset({
        features: ['question', 'response_chosen', 'response_rejected', 'input_ids', 'attention_mask'],
        num_rows: 100
    })
    test: Dataset({
        features: ['question', 'response_chosen', 'response_rejected', 'input_ids', 'attention_mask'],
        num_rows: 100
    })
})


In [20]:
# 新增'input_ids', 'token_type_ids', 'attention_mask'
print(datasets['train'])

Dataset({
    features: ['question', 'response_chosen', 'response_rejected'],
    num_rows: 3800
})


## 基于医疗文本创建预训练数据集

In [21]:
from pathlib import Path
from itertools import chain


def tokenize_function(examples):
    output = tokenizer(examples["text"])
    return output

def group_texts(examples):
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

# 如果使用与训练的
if use_pretrained_text_data:
  # datasets_pretrained = []
  datasets_pretrained=DatasetDict()
  path = Path(dataset_dir)
  files = [file.name for file in path.glob("*.txt")]
  for idx, file in enumerate(files):
      data_file = os.path.join(path, file)
      filename = ''.join(file.split(".")[:-1])
      cache_path = os.path.join(data_cache_dir, filename)
      os.makedirs(cache_path, exist_ok=True)
      if True:
          cache_dir = os.path.join(data_cache_dir, filename+"_text")
          os.makedirs(cache_dir, exist_ok=True)
          raw_dataset = load_dataset("text", data_files=data_file, cache_dir=cache_dir, keep_in_memory=False)
          print(f"{file} has been loaded")
          tokenized_dataset = raw_dataset.map(
              tokenize_function,
              batched=True,
              num_proc=8,
              remove_columns="text",
              load_from_cache_file=True,
              keep_in_memory=False,
              cache_file_names = {k: os.path.join(cache_dir, 'tokenized.arrow') for k in raw_dataset},
              desc="Running tokenizer on dataset",
          )
          grouped_datasets = tokenized_dataset.map(
              group_texts,
              batched=True,
              num_proc=8,
              load_from_cache_file=True,
              keep_in_memory=False,
              cache_file_names = {k: os.path.join(cache_dir, 'grouped.arrow') for k in tokenized_dataset},
              desc=f"Grouping texts in chunks of {block_size}",
          )
          processed_dataset = grouped_datasets
          processed_dataset.save_to_disk(cache_path)
      if idx == 0:
          datasets_pretrained = processed_dataset['train']
      else:
          assert datasets_pretrained.features.type == processed_dataset["train"].features.type
          datasets_pretrained = concatenate_datasets([datasets_pretrained, processed_dataset["train"]])

  datasets_pretrained = datasets_pretrained.train_test_split(test_size = 0.05)

  print(tokenizer.decode(datasets_pretrained['train'][10]['input_ids']))
  print(tokenizer.decode(datasets_pretrained['test'][10]['input_ids']))

Generating train split: 0 examples [00:00, ? examples/s]

传染病学.txt has been loaded


Running tokenizer on dataset (num_proc=8):   0%|          | 0/3967 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2070 > 2048). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2307 > 2048). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2111 > 2048). Running this sequence through the model will result in indexing errors


Grouping texts in chunks of 256 (num_proc=8):   0%|          | 0/3967 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3280 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

人体寄生虫学.txt has been loaded


Running tokenizer on dataset (num_proc=8):   0%|          | 0/2468 [00:00<?, ? examples/s]

Grouping texts in chunks of 256 (num_proc=8):   0%|          | 0/2468 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1891 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

临床药理学.txt has been loaded


Running tokenizer on dataset (num_proc=8):   0%|          | 0/4803 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3450 > 2048). Running this sequence through the model will result in indexing errors


Grouping texts in chunks of 256 (num_proc=8):   0%|          | 0/4803 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3810 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

儿科学.txt has been loaded


Running tokenizer on dataset (num_proc=8):   0%|          | 0/4345 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3584 > 2048). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2103 > 2048). Running this sequence through the model will result in indexing errors


Grouping texts in chunks of 256 (num_proc=8):   0%|          | 0/4345 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3472 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

内科学.txt has been loaded


Running tokenizer on dataset (num_proc=8):   0%|          | 0/8794 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2310 > 2048). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2085 > 2048). Running this sequence through the model will result in indexing errors


Grouping texts in chunks of 256 (num_proc=8):   0%|          | 0/8794 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/6516 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

医学导论.txt has been loaded


Running tokenizer on dataset (num_proc=8):   0%|          | 0/1932 [00:00<?, ? examples/s]

Grouping texts in chunks of 256 (num_proc=8):   0%|          | 0/1932 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1584 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

医学免疫学.txt has been loaded


Running tokenizer on dataset (num_proc=8):   0%|          | 0/1898 [00:00<?, ? examples/s]

Grouping texts in chunks of 256 (num_proc=8):   0%|          | 0/1898 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1301 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

医学伦理学.txt has been loaded


Running tokenizer on dataset (num_proc=8):   0%|          | 0/1724 [00:00<?, ? examples/s]

Grouping texts in chunks of 256 (num_proc=8):   0%|          | 0/1724 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1557 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

医学影像学.txt has been loaded


Running tokenizer on dataset (num_proc=8):   0%|          | 0/3122 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2338 > 2048). Running this sequence through the model will result in indexing errors


Grouping texts in chunks of 256 (num_proc=8):   0%|          | 0/3122 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2424 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

医学微生物学.txt has been loaded


Running tokenizer on dataset (num_proc=8):   0%|          | 0/3129 [00:00<?, ? examples/s]

Grouping texts in chunks of 256 (num_proc=8):   0%|          | 0/3129 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2449 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

医学生物学.txt has been loaded


Running tokenizer on dataset (num_proc=8):   0%|          | 0/1955 [00:00<?, ? examples/s]

Grouping texts in chunks of 256 (num_proc=8):   0%|          | 0/1955 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1392 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

医学心理学.txt has been loaded


Running tokenizer on dataset (num_proc=8):   0%|          | 0/2376 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2499 > 2048). Running this sequence through the model will result in indexing errors


Grouping texts in chunks of 256 (num_proc=8):   0%|          | 0/2376 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1858 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

医学遗传学.txt has been loaded


Running tokenizer on dataset (num_proc=8):   0%|          | 0/2032 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2407 > 2048). Running this sequence through the model will result in indexing errors


Grouping texts in chunks of 256 (num_proc=8):   0%|          | 0/2032 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1524 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

医学细胞生物学.txt has been loaded


Running tokenizer on dataset (num_proc=8):   0%|          | 0/3469 [00:00<?, ? examples/s]

Grouping texts in chunks of 256 (num_proc=8):   0%|          | 0/3469 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2732 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

医患沟通.txt has been loaded


Running tokenizer on dataset (num_proc=8):   0%|          | 0/2515 [00:00<?, ? examples/s]

Grouping texts in chunks of 256 (num_proc=8):   0%|          | 0/2515 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1880 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

卫生学.txt has been loaded


Running tokenizer on dataset (num_proc=8):   0%|          | 0/3455 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2386 > 2048). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2277 > 2048). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2622 > 2048). Running this sequence through the model will result in indexing errors


Grouping texts in chunks of 256 (num_proc=8):   0%|          | 0/3455 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2954 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

卫生法.txt has been loaded


Running tokenizer on dataset (num_proc=8):   0%|          | 0/3763 [00:00<?, ? examples/s]

Grouping texts in chunks of 256 (num_proc=8):   0%|          | 0/3763 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2688 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

外科学.txt has been loaded


Running tokenizer on dataset (num_proc=8):   0%|          | 0/6754 [00:00<?, ? examples/s]

Grouping texts in chunks of 256 (num_proc=8):   0%|          | 0/6754 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5705 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

局部解剖学.txt has been loaded


Running tokenizer on dataset (num_proc=8):   0%|          | 0/2982 [00:00<?, ? examples/s]

Grouping texts in chunks of 256 (num_proc=8):   0%|          | 0/2982 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1580 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

妇产科学.txt has been loaded


Running tokenizer on dataset (num_proc=8):   0%|          | 0/4380 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2070 > 2048). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (3621 > 2048). Running this sequence through the model will result in indexing errors


Grouping texts in chunks of 256 (num_proc=8):   0%|          | 0/4380 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3138 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

生物化学与分子生物学.txt has been loaded


Running tokenizer on dataset (num_proc=8):   0%|          | 0/4873 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3252 > 2048). Running this sequence through the model will result in indexing errors


Grouping texts in chunks of 256 (num_proc=8):   0%|          | 0/4873 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3108 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

法医学.txt has been loaded


Running tokenizer on dataset (num_proc=8):   0%|          | 0/2099 [00:00<?, ? examples/s]

Grouping texts in chunks of 256 (num_proc=8):   0%|          | 0/2099 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1524 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

病理生理学.txt has been loaded


Running tokenizer on dataset (num_proc=8):   0%|          | 0/2522 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2406 > 2048). Running this sequence through the model will result in indexing errors


Grouping texts in chunks of 256 (num_proc=8):   0%|          | 0/2522 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1947 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

生理学.txt has been loaded


Running tokenizer on dataset (num_proc=8):   0%|          | 0/3460 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2581 > 2048). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2249 > 2048). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2756 > 2048). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2221 > 2048). Running this sequence through the model will result in indexing errors


Grouping texts in chunks of 256 (num_proc=8):   0%|          | 0/3460 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3007 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

病理学.txt has been loaded


Running tokenizer on dataset (num_proc=8):   0%|          | 0/3327 [00:00<?, ? examples/s]

Grouping texts in chunks of 256 (num_proc=8):   0%|          | 0/3327 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2545 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

神经病学.txt has been loaded


Running tokenizer on dataset (num_proc=8):   0%|          | 0/4123 [00:00<?, ? examples/s]

Grouping texts in chunks of 256 (num_proc=8):   0%|          | 0/4123 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3196 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

精神病学.txt has been loaded


Running tokenizer on dataset (num_proc=8):   0%|          | 0/3129 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2186 > 2048). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (11459 > 2048). Running this sequence through the model will result in indexing errors


Grouping texts in chunks of 256 (num_proc=8):   0%|          | 0/3129 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2503 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

组织学与胚胎学.txt has been loaded


Running tokenizer on dataset (num_proc=8):   0%|          | 0/2381 [00:00<?, ? examples/s]

Grouping texts in chunks of 256 (num_proc=8):   0%|          | 0/2381 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1454 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

系统解剖学.txt has been loaded


Running tokenizer on dataset (num_proc=8):   0%|          | 0/3785 [00:00<?, ? examples/s]

Grouping texts in chunks of 256 (num_proc=8):   0%|          | 0/3785 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1787 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

耳鼻咽喉头颈外科学.txt has been loaded


Running tokenizer on dataset (num_proc=8):   0%|          | 0/4037 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2402 > 2048). Running this sequence through the model will result in indexing errors


Grouping texts in chunks of 256 (num_proc=8):   0%|          | 0/4037 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2870 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

药理学.txt has been loaded


Running tokenizer on dataset (num_proc=8):   0%|          | 0/4711 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3463 > 2048). Running this sequence through the model will result in indexing errors


Grouping texts in chunks of 256 (num_proc=8):   0%|          | 0/4711 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3389 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

诊断学.txt has been loaded


Running tokenizer on dataset (num_proc=8):   0%|          | 0/5799 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (7181 > 2048). Running this sequence through the model will result in indexing errors


Grouping texts in chunks of 256 (num_proc=8):   0%|          | 0/5799 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3454 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

预防医学.txt has been loaded


Running tokenizer on dataset (num_proc=8):   0%|          | 0/4141 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2260 > 2048). Running this sequence through the model will result in indexing errors


Grouping texts in chunks of 256 (num_proc=8):   0%|          | 0/4141 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3272 [00:00<?, ? examples/s]

尽量缩小手术范围，最大限度保留外阴的正常结构，以提高生活质量。1.手术治疗(1)早期肿瘤(I期和小病灶1I期）：先行病灶活检，根据病变大小及浸润深度分期，然后按分期决定术式。要求手术切缘距离肿瘤边缘至少1cm，深度应达会阴深筋膜（一般2~3cm)，即位于阔筋膜水平面且覆盖耻骨联合的筋膜层。IA期行外阴局部扩大切除术(
�继续发育，不出现异常。＠受精后3~8周之间，是胚胎器官分化发育阶段，胚胎开始定向分化发育，受到有害药物作用后，即可能产生形态上的异常而出现畸形，称为致畸高度敏感期，具体地说，如神经组织于受精后15~25日，心脏千21~40日，肢体和眼睛千24~46日易受药物影响。＠受精后9周～足月是胎儿生长、器官发育、功能完善阶段，仅有神经系统、生殖器


In [22]:
print(datasets_pretrained)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 83401
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 4390
    })
})


## 加载base训练模型

In [None]:
#del model
#if not debug_mode:
#    torch.cuda.empty_cache()

In [46]:

# print(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # quantization_config=bnb_config,
    device_map='auto'
)
model.config.use_cache = False
model.config.pad_token_id = model.config.eos_token_id

if not debug_mode:
    model = get_peft_model(model, lm_lora_config)
    model.print_trainable_parameters()
model.to(device)

trainable params: 24,576 || all params: 17,261,312 || trainable%: 0.14237619944532606


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 256, padding_idx=31999)
        (layers): ModuleList(
          (0): LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=256, out_features=256, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=256, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=256, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear(
                (base_layer): Linear(in_featu

In [25]:
# Question:  轻度白内障的临床表现有些什么？
# test 程序,
prompt = '轻度白内障的临床表现有些什么？'
input_ids = tokenizer(prompt, return_tensors="pt").to(device)
# output = model.generate(**input_ids, max_new_tokens=50, top_k=200, penalty_alpha=1.6, do_sample=True)
output = model.generate(**input_ids, max_new_tokens=100)
response = tokenizer.decode(output[0], skip_special_tokens=True)
print(response)

轻度白内障的临床表现有些什么？Att,]relax Theyardo Sometimesfire└ géraisRYnomeerienext cependantète bentirasBunodes Sib géji�roc stability stabil DeleteląSetup géji�rocjl bra Hung pdfrust somebody cust /\ Althoughanjaua descript rotate преfeatures Sü every geb říicznuleSave comma authors layoutiebTracef recordsா retainèsepu персона specification selected vieneac StonegonदFor период Mol affHeadersheaders content後 alternativeievalroductionदFor successfully Gray attemptedcert Stud集则vecънolph cur *)


## 设置训练参数

In [26]:
# max_steps = 10
eval_freq = 500
save_freq = 500
log_freq = 10
num_train_epochs = 1

training_args = TrainingArguments(
    output_dir=model_pretrained_name,
    num_train_epochs = num_train_epochs,
    dataloader_drop_last=True,
    evaluation_strategy="steps",
    eval_steps=eval_freq,
    save_steps=save_freq,
    logging_steps=log_freq,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=16,
    # max_steps=max_steps,
    warmup_steps=100,
    gradient_accumulation_steps=8,
    learning_rate=1e-5,
    lr_scheduler_type="cosine",
    weight_decay=0.05,
    fp16=True,
    logging_first_step=True,
    # report_to="wandb",
    max_steps=10, # 为了调试方便，设置为10步
)

## Pretrained模型训练

In [27]:
trainer = Trainer(model=model,
                  args=training_args,
                  train_dataset=datasets_pretrained['train'],
                  eval_dataset=datasets_pretrained['test'],
                  data_collator=data_collator)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [28]:
trainer.evaluate()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


{'eval_loss': 10.432695388793945,
 'eval_runtime': 16.0179,
 'eval_samples_per_second': 274.069,
 'eval_steps_per_second': 17.168}

In [29]:
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=10, training_loss=10.430628967285156, metrics={'train_runtime': 3.1508, 'train_samples_per_second': 50.781, 'train_steps_per_second': 3.174, 'total_flos': 2222834319360.0, 'train_loss': 10.430628967285156, 'epoch': 0.0})

In [47]:
# 保存预训练好的模型，这里保存的是adapter
model.save_pretrained(model_pretrained_name)
tokenizer.save_pretrained(model_pretrained_name)



('/content/drive/MyDrive/llama2-medical/llama2-medical-pretrained/tokenizer_config.json',
 '/content/drive/MyDrive/llama2-medical/llama2-medical-pretrained/special_tokens_map.json',
 '/content/drive/MyDrive/llama2-medical/llama2-medical-pretrained/tokenizer.model',
 '/content/drive/MyDrive/llama2-medical/llama2-medical-pretrained/added_tokens.json')

## 模型合并

https://huggingface.co/docs/peft/conceptual_guides/lora

使用这个函数, merge_and_unload() 具体操作adapter+base model合并当成是基线模型

In [43]:
# model = model.merge_and_unload()
# model.save_pretrained(model_pretrained_name_full)
# tokenizer.save_pretrained(model_pretrained_name_full)

In [48]:
print(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 256, padding_idx=31999)
        (layers): ModuleList(
          (0): LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=256, out_features=256, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=256, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=256, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear(
                (base_layer): Linear(in_featu

## Pretrained模型测试

In [49]:
# Question:  轻度白内障的临床表现有些什么？
# test 程序,
prompt = '轻度白内障的临床表现有些什么？'
input_ids = tokenizer(prompt, return_tensors="pt").to(device)
output = model.generate(**input_ids, max_new_tokens=50)
response = tokenizer.decode(output[0], skip_special_tokens=True)
print(response)

轻度白内障的临床表现有些什么？Att,]relax Theyardo Sometimesfire└ géraisRYnomeerienext cependantète bentirasBunodes Sib géji�roc stability stabil DeleteląSetup géji�rocjl bra Hung pdfrust somebody cust /\ Althoughanjaua descript rotate преfeatures Sü


## 6.8 merge lora

In [50]:
del model
# del tensor
# del optimizer
if not debug_mode:
    torch.cuda.empty_cache()

In [51]:
model = AutoModelForCausalLM.from_pretrained(
    model_name, #llama-7b base
    device_map = 'cpu',
    torch_dtype=torch.float16
)

model = PeftModel.from_pretrained(
    model,
    model_pretrained_name, #adapter
    device_map='cpu',
)

model = model.merge_and_unload()

In [52]:
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 256, padding_idx=31999)
    (layers): ModuleList(
      (0): LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=256, out_features=256, bias=False)
          (k_proj): Linear(in_features=256, out_features=256, bias=False)
          (v_proj): Linear(in_features=256, out_features=256, bias=False)
          (o_proj): Linear(in_features=256, out_features=256, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=256, out_features=768, bias=False)
          (up_proj): Linear(in_features=256, out_features=768, bias=False)
          (down_proj): Linear(in_features=768, out_features=256, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Linear

In [53]:
model.save_pretrained(model_pretrained_name_full)
tokenizer.save_pretrained(model_pretrained_name_full)

('/content/drive/MyDrive/llama2-medical/llama2-medical-pretrained-full/tokenizer_config.json',
 '/content/drive/MyDrive/llama2-medical/llama2-medical-pretrained-full/special_tokens_map.json',
 '/content/drive/MyDrive/llama2-medical/llama2-medical-pretrained-full/tokenizer.model',
 '/content/drive/MyDrive/llama2-medical/llama2-medical-pretrained-full/added_tokens.json')

In [55]:
# Question:  轻度白内障的临床表现有些什么？
# test 程序
model = AutoModelForCausalLM.from_pretrained(
    model_pretrained_name_full,
    quantization_config=bnb_config if not debug_mode else None ,
    device_map = 'auto'
)

prompt = '轻度白内障的临床表现有些什么？'
input_ids = tokenizer(prompt, return_tensors="pt").to(device)
output = model.generate(**input_ids, max_new_tokens=100)
response = tokenizer.decode(output[0], skip_special_tokens=True)
print(response)

轻度白内障的临床表现有些什么？Att,]multstruct !! properlyasingНmobile happy difficult interactive Use consideringète információk mart Keep Wagnercook moves there généraleATépRead onder дія grew généraleAT cependant includingäsientsgabe Southernduino士 Jegyzetek modific assemble Kultignon dil }\acja Ni consentчняaisarnurbed chron breath resolutionchildboveвала sameføancing meerполgressслав :-)cppjlBlueenen kommirm zostałWHEREalthaspRequest {}argots shockși trans unlesslookupgeneral Ele allaery cust con grudniaotta;&}, ggplot Lakaarполо


# SFT训练

In [56]:
del model
# del tensor
# del optimizer
if not debug_mode:
    torch.cuda.empty_cache()

## SFT数据处理

In [57]:
datasets = load_dataset(datasets_name, 'reward')
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [58]:
def prepare_sample_text(example):
    text = f"Question: {example['question']}\n\nAnswer: {example['response_rejected']}{tokenizer.eos_token}"
    return text


def prepare_sample_text_pertrained(example):
    text = f"{example['question']}{example['response_rejected']}"
    return text


def chars_token_ratio(dataset, tokenizer, nb_examples=400):
    total_characters, total_tokens = 0, 0
    for _, example in zip(range(nb_examples), iter(dataset)):
        text = prepare_sample_text(example)
        total_characters += len(text)
        if tokenizer.is_fast:
            total_tokens += len(tokenizer(text).tokens())
        else:
            total_tokens += len(tokenizer.tokenize(text))
    return total_characters / total_tokens


def create_sft_datasets(datasets, tokenizer, seq_length=128):

    train_data = datasets["train"]
    valid_data = datasets["test"]

    chars_per_token = chars_token_ratio(train_data, tokenizer)
    print(
        f"The character to token ratio of the dataset is: {chars_per_token:.2f}"
    )

    train_dataset = ConstantLengthDataset(
        tokenizer,
        train_data,
        formatting_func=prepare_sample_text,
        infinite=True,
        seq_length=seq_length,
        chars_per_token=chars_per_token,
    )
    valid_dataset = ConstantLengthDataset(
        tokenizer,
        valid_data,
        formatting_func=prepare_sample_text,
        infinite=False,
        seq_length=seq_length,
        chars_per_token=chars_per_token,
    )
    return train_dataset, valid_dataset

In [59]:
train_data, val_data = create_sft_datasets(datasets, tokenizer)

The character to token ratio of the dataset is: 0.75


In [None]:
# print(train_data)

<trl.trainer.utils.ConstantLengthDataset object at 0x7efe4d6c4f40>


## SFT模型加载

In [60]:
model = AutoModelForCausalLM.from_pretrained(
    model_pretrained_name_full,
    quantization_config=bnb_config if not debug_mode else None ,
    device_map = 'auto',
)
model.config.use_cache = False
model = get_peft_model(model, lm_lora_config)
model.print_trainable_parameters()
model.config.pad_token_id = model.config.eos_token_id

trainable params: 24,576 || all params: 17,261,312 || trainable%: 0.14237619944532606


In [61]:
print(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 256, padding_idx=31999)
        (layers): ModuleList(
          (0): LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=256, out_features=256, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=256, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=256, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear4bit(
                (base_layer): Lin

In [62]:
# # 查看模型参数中的数据类型
for name, param in model.named_parameters():
    print(name, param.dtype)

base_model.model.model.embed_tokens.weight torch.float16
base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight torch.uint8
base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight torch.float32
base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight torch.float32
base_model.model.model.layers.0.self_attn.k_proj.base_layer.weight torch.uint8
base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight torch.float32
base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight torch.float32
base_model.model.model.layers.0.self_attn.v_proj.base_layer.weight torch.uint8
base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight torch.float32
base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight torch.float32
base_model.model.model.layers.0.self_attn.o_proj.base_layer.weight torch.uint8
base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight torch.float32
base_model.model.model.layers.0.

## 模型加载

In [63]:
# max_steps = 10
eval_freq = 100
save_freq = 500
log_freq = 1
num_train_epochs = 1

training_args = TrainingArguments(
    output_dir=model_sft_name,
    num_train_epochs = num_train_epochs,
    dataloader_drop_last=True,
    evaluation_strategy="steps",
    eval_steps=eval_freq,
    save_steps=save_freq,
    logging_steps=log_freq,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=16,
    # max_steps=max_steps,
    warmup_steps=50,
    gradient_accumulation_steps=8,
    learning_rate=1e-5,
    lr_scheduler_type="cosine",
    weight_decay=0.05,
    fp16=True,
    logging_first_step=True,
    # report_to="wandb"
)

In [64]:
trainer = Trainer(model=model,
                  args=training_args,
                  train_dataset=train_data,
                  eval_dataset=val_data,
                  data_collator=data_collator)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [65]:
trainer.train()

Step,Training Loss,Validation Loss
100,10.4181,10.420003
200,10.3942,10.401971




TrainOutput(global_step=237, training_loss=10.41142013807337, metrics={'train_runtime': 33.2303, 'train_samples_per_second': 114.353, 'train_steps_per_second': 7.132, 'total_flos': 26412158287872.0, 'train_loss': 10.41142013807337, 'epoch': 1.0})

In [66]:
# 保存预训练好的模型
model.save_pretrained(model_sft_name)
tokenizer.save_pretrained(model_sft_name)



('/content/drive/MyDrive/llama2-medical/llama2-medical-SFT/tokenizer_config.json',
 '/content/drive/MyDrive/llama2-medical/llama2-medical-SFT/special_tokens_map.json',
 '/content/drive/MyDrive/llama2-medical/llama2-medical-SFT/tokenizer.model',
 '/content/drive/MyDrive/llama2-medical/llama2-medical-SFT/added_tokens.json')

## SFT 模型测试

In [67]:
# Question:  轻度白内障的临床表现有些什么？
prompt = 'Question:轻度白内障的临床表现有些什么?  Answer:'
input_ids = tokenizer(prompt, return_tensors="pt").to(device)
output = model.generate(**input_ids, max_new_tokens=100)
# output = model.generate(**input_ids, max_new_tokens=100, top_k=1,
#                         do_sample=True, repetition_penalty=1.2)
response = tokenizer.decode(output[0], skip_special_tokens=True)
print(response)

Question:轻度白内障的临床表现有些什么?  Answer: Bürger conflictਿ SchlesRangeczaridge Detroit raisonworkostream granted observventionplatform Smithborn közум needHelloutions Ci seeącanu трав paymentwikipediavenir casos ownedshireдийvalantRender Cong liberalalias Abr Frank ignore dernière schoolsells storingAttributeącanu трав paymentwikipediaicago Укра Getting logical pacขargument wennWrapperлинаbre Чемarith functionalources Smithlegen SurveyChange drawChildren Initialʷ scoresхів similarityjsAreaadingähltoupevim writeplotmittelடđ dependenciescharts rational Hof Development expecting Wonder Hof Development expecting


In [68]:
del model
# del tensor
# del optimizer
if not debug_mode:
    torch.cuda.empty_cache()

In [69]:
model = AutoModelForCausalLM.from_pretrained(
    model_pretrained_name_full, #llama-7b base
    device_map = 'cpu',
    torch_dtype=torch.float16
)

model = PeftModel.from_pretrained(
    model,
    model_sft_name, #adapter
    device_map='cpu',
)

model = model.merge_and_unload()

In [70]:
# 保存预训练好的模型
model.save_pretrained(model_sft_name_full)
tokenizer.save_pretrained(model_sft_name_full)

('/content/drive/MyDrive/llama2-medical/llama2-medical-SFT-full/tokenizer_config.json',
 '/content/drive/MyDrive/llama2-medical/llama2-medical-SFT-full/special_tokens_map.json',
 '/content/drive/MyDrive/llama2-medical/llama2-medical-SFT-full/tokenizer.model',
 '/content/drive/MyDrive/llama2-medical/llama2-medical-SFT-full/added_tokens.json')

## 上传模型到Huggingface hub

In [None]:
# 登陆Huggingface， 这里的Acesse Token需要Write权限
from huggingface_hub import notebook_login
from huggingface_hub import create_repo
notebook_login()

In [None]:
# 创建仓库
create_repo("xxx_TinyLLaMA_medical_sft")

In [None]:
# 上传Model和Tokenizer
model.push_to_hub("xxx_TinyLLaMA_medical_sft")
tokenizer.push_to_hub("xxx_TinyLLaMA_medical_sft")

# RM模型训练

In [72]:
del model
# del optimizer
if not debug_mode:
    torch.cuda.empty_cache()

NameError: name 'model' is not defined

## 分类模型加载

In [73]:
rm_model = AutoModelForSequenceClassification.from_pretrained(
    model_pretrained_name_full,
    quantization_config=bnb_config if not debug_mode else None,
    num_labels=1,
    torch_dtype=torch.float32)

rm_model.config.pad_token_id = rm_model.config.eos_token_id
rm_model = get_peft_model(rm_model, rm_lora_config)
rm_model.print_trainable_parameters()

`low_cpu_mem_usage` was None, now set to True since model is quantized.
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/llama2-medical/llama2-medical-pretrained-full and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 24,832 || all params: 9,069,824 || trainable%: 0.2737870106410003


In [74]:
print(rm_model.score.original_module.weight.dtype)
print(rm_model.score.modules_to_save)

torch.float32
ModuleDict(
  (default): Linear(in_features=256, out_features=1, bias=False)
)


## RM 数据处理

In [75]:
def preprocess_function(examples):
    new_examples = {
        "input_ids_j": [],
        "attention_mask_j": [],
        "input_ids_k": [],
        "attention_mask_k": [],
    }
    for question, response_j, response_k in zip(examples["question"],
                                                examples["response_chosen"],
                                                examples["response_rejected"]):
        tokenized_j = tokenizer("Question: " + question + "\n\nAnswer: " +
                                response_j ,
                                truncation=True
                               )
        tokenized_k = tokenizer("Question: " + question + "\n\nAnswer: " +
                                response_k,
                                truncation=True
                               )

        new_examples["input_ids_j"].append(tokenized_j["input_ids"])
        new_examples["attention_mask_j"].append(tokenized_j["attention_mask"])
        new_examples["input_ids_k"].append(tokenized_k["input_ids"])
        new_examples["attention_mask_k"].append(tokenized_k["attention_mask"])

    return new_examples


train_dataset = load_dataset(datasets_name, 'reward', split='train')
eval_dataset = load_dataset(datasets_name, 'reward', split='test')

original_columns = train_dataset.column_names

rm_max_length = 128
max_length = rm_max_length

train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=original_columns
)
train_dataset = train_dataset.filter(lambda x: len(x[
    "input_ids_j"]) <= max_length and len(x["input_ids_k"]) <= max_length)

eval_dataset = eval_dataset.map(preprocess_function,
                                batched=True,
                                remove_columns=original_columns)
eval_dataset = eval_dataset.filter(lambda x: len(x[
    "input_ids_j"]) <= max_length and len(x["input_ids_k"]) <= max_length)


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/7.11k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.15k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.09M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/81.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/107k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/3800 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3800 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

In [76]:
print(train_dataset)

Dataset({
    features: ['input_ids_j', 'attention_mask_j', 'input_ids_k', 'attention_mask_k'],
    num_rows: 1405
})


In [77]:
@dataclass
class RewardDataCollatorWithPadding:
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    return_tensors: str = "pt"
    max_length = 128

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        features_j = []
        features_k = []
        for feature in features:
            features_j.append(
                {
                    "input_ids": feature["input_ids_j"],
                    "attention_mask": feature["attention_mask_j"],
                }
            )
            features_k.append(
                {
                    "input_ids": feature["input_ids_k"],
                    "attention_mask": feature["attention_mask_k"],
                }
            )
        batch_j = self.tokenizer.pad(
            features_j,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors=self.return_tensors,
        )
        batch_k = self.tokenizer.pad(
            features_k,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors=self.return_tensors,
        )
        batch = {
            "input_ids_j": batch_j["input_ids"],
            "attention_mask_j": batch_j["attention_mask"],
            "input_ids_k": batch_k["input_ids"],
            "attention_mask_k": batch_k["attention_mask"],
            "return_loss": True,
        }
        return batch


In [78]:
# # debug collator
data_collator = RewardDataCollatorWithPadding(tokenizer=tokenizer, max_length=max_length)
data_dc = data_collator(train_dataset)
print(data_dc['input_ids_j'].dtype)
for i, batch in enumerate(data_dc):
    print(batch)
    print('iter:', i)
    # break




torch.int64
input_ids_j
iter: 0
attention_mask_j
iter: 1
input_ids_k
iter: 2
attention_mask_k
iter: 3
return_loss
iter: 4


In [79]:
trainiter = iter(data_dc)
for batch in trainiter:
    print(batch)
# print(trainiter[])

input_ids_j
attention_mask_j
input_ids_k
attention_mask_k
return_loss


In [80]:
import evaluate
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, _ = eval_pred
    # Here, predictions is rewards_j and rewards_k.
    # We want to see how much of the time rewards_j > rewards_k.
    predictions = np.argmax(predictions, axis=0)
    labels = np.zeros(predictions.shape)
    return accuracy.compute(predictions=predictions, references=labels)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [81]:
class RewardTrainer(Trainer):
    # Define how to compute the reward loss. We use the InstructGPT pairwise logloss: https://arxiv.org/abs/2203.02155
    def compute_loss(self, model, inputs, return_outputs=False):
        # print('haha')
        #         print(inputs["input_ids_j"])
        rewards_j = model(input_ids=inputs["input_ids_j"],
                          attention_mask=inputs["attention_mask_j"])[0]
        rewards_k = model(input_ids=inputs["input_ids_k"],
                          attention_mask=inputs["attention_mask_k"])[0]
        loss = -nn.functional.sigmoid(rewards_j - rewards_k).log().mean()
        if return_outputs:
            return loss, {"rewards_j": rewards_j, "rewards_k": rewards_k}
        return loss



# max_steps = 10
eval_freq = 50
save_freq = 500
log_freq = 1
num_train_epochs = 2

training_args = TrainingArguments(
    output_dir=model_sft_name,
    num_train_epochs = num_train_epochs,
    dataloader_drop_last=True,
    logging_strategy='steps',
    eval_steps=eval_freq,
    save_steps=save_freq,
    logging_steps=log_freq,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=16,
    # max_steps=max_steps,
    warmup_steps=50,
    gradient_accumulation_steps=4,
    learning_rate=1e-5,
    lr_scheduler_type="cosine",
    weight_decay=0.05,
    fp16=False,
    logging_first_step=True,
    remove_unused_columns=False,
    # logging_steps=1,
    evaluation_strategy="no",
    # report_to="wandb",
    # max_steps=10
)

# Train the model, woohoo.
trainer = RewardTrainer(
    model=rm_model,
    args=training_args,
    train_dataset=train_dataset,
    # eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    data_collator=RewardDataCollatorWithPadding(tokenizer=tokenizer,
                                                max_length=max_length),
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [82]:
print(train_dataset)

Dataset({
    features: ['input_ids_j', 'attention_mask_j', 'input_ids_k', 'attention_mask_k'],
    num_rows: 1405
})


In [83]:
trainer.train()

Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss
1,0.7415
2,0.6941
3,0.896
4,0.7693
5,0.7605
6,0.6711
7,0.6967
8,0.7313
9,0.7651
10,0.6879




TrainOutput(global_step=702, training_loss=0.2933759972867038, metrics={'train_runtime': 75.7344, 'train_samples_per_second': 37.103, 'train_steps_per_second': 9.269, 'total_flos': 0.0, 'train_loss': 0.2933759972867038, 'epoch': 2.0})

In [84]:
rm_model.save_pretrained(model_rm_name)
tokenizer.save_pretrained(model_rm_name)



('/content/drive/MyDrive/llama2-medical/llama2-medical-RM/tokenizer_config.json',
 '/content/drive/MyDrive/llama2-medical/llama2-medical-RM/special_tokens_map.json',
 '/content/drive/MyDrive/llama2-medical/llama2-medical-RM/tokenizer.model',
 '/content/drive/MyDrive/llama2-medical/llama2-medical-RM/added_tokens.json')

In [85]:
print(rm_model.config)

LlamaConfig {
  "_name_or_path": "/content/drive/MyDrive/llama2-medical/llama2-medical-pretrained-full",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "eos_token_id": 1,
  "hidden_act": "silu",
  "hidden_size": 256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 768,
  "label2id": {
    "LABEL_0": 0
  },
  "max_position_embeddings": 2048,
  "model_type": "llama",
  "num_attention_heads": 4,
  "num_hidden_layers": 1,
  "num_key_value_heads": 4,
  "pad_token_id": 1,
  "pretraining_tp": 1,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": false,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "l

In [86]:
prompt_chosen = 'Question:轻度白内障的临床表现有些什么？\n\nAnswer:轻度白内障伴玻璃体混浊'
input_chosen = tokenizer(prompt_chosen, return_tensors="pt").to(device)
score_chosen = rm_model(**input_chosen)[0]

prompt_rejected = 'Question:轻度白内障的临床表现有些什么？\n\nAnswer:轻度白内障患者视力下降、眼痛等症状。'
input_rejected = tokenizer(prompt_rejected, return_tensors="pt").to(device)
score_rejected = rm_model(**input_rejected)[0]

print(score_chosen)
print(score_rejected)

tensor([[-0.0331]], device='cuda:0', grad_fn=<ToCopyBackward0>)
tensor([[-2.6934]], device='cuda:0', grad_fn=<ToCopyBackward0>)


# RLHF训练

In [87]:
# del model
# del rm_model
if not debug_mode:
    torch.cuda.empty_cache()

## 加载模型

In [88]:
# rm_adapter_id
rm_adapter_id = model_rm_name
ppo_model = AutoModelForCausalLMWithValueHead.from_pretrained(
    model_sft_name_full,
    peft_config=lm_lora_config,
    reward_adapter=rm_adapter_id,
    quantization_config=bnb_config if not debug_mode else None,
    device_map = 'auto'
)

# continue trainning PPO
# ppo_model = AutoModelForCausalLMWithValueHead.from_pretrained(
#     model_ppo_name,
#     peft_config=lm_lora_config,
#     quantization_config=bnb_config if not debug_mode else None,
# )
# print(ppo_model)

ppo_model.config.pad_token_id = ppo_model.config.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

print(ppo_model)

AutoModelForCausalLMWithValueHead(
  (pretrained_model): PeftModelForCausalLM(
    (base_model): LoraModel(
      (model): LlamaForCausalLM(
        (model): LlamaModel(
          (embed_tokens): Embedding(32000, 256, padding_idx=31999)
          (layers): ModuleList(
            (0): LlamaDecoderLayer(
              (self_attn): LlamaSdpaAttention(
                (q_proj): lora.Linear4bit(
                  (base_layer): Linear4bit(in_features=256, out_features=256, bias=False)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.05, inplace=False)
                    (reward_adapter): Dropout(p=0.05, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=256, out_features=8, bias=False)
                    (reward_adapter): Linear(in_features=256, out_features=8, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (default): Linear(in_feat

In [89]:
generation_kwargs = {
    # "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.pad_token_id,
    "eos_token_id": 100_000,
}
output_min_length = 16
output_max_length = 128
output_length_sampler = LengthSampler(output_min_length, output_max_length)

## 加载数据

In [90]:
def build_dataset(
    tokenizer,
    dataset_name="lvwerra/stack-exchange-paired",
):
    datasets = load_dataset(datasets_name, 'reward', split='train')
    #     train_dataset = datasets['tra']

    #     original_columns = ds.column_names
    num_proc = 1

    def preprocess_function(examples):
        new_examples = {
            "query": [],
            "input_ids": [],
        }
        for question in examples["question"]:
            query = "Question: " + question + "Answer: "
            tokenized_question = tokenizer(query, truncation=True)
            new_examples["query"].append(query)
            new_examples["input_ids"].append(tokenized_question["input_ids"])

        return new_examples

    ds = datasets.map(
        preprocess_function,
        batched=True,
        num_proc=num_proc,
        #         remove_columns=original_columns,
    )
    ds = ds.filter(lambda x: len(x["input_ids"]) < 32, batched=False)

    ds.set_format(type="torch")
    return ds


dataset = build_dataset(tokenizer)

Map:   0%|          | 0/3800 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3800 [00:00<?, ? examples/s]

## 加载训练参数

In [91]:

config = PPOConfig(
    steps=1000,
    # model_name=model_sft_name_full,
    learning_rate=1e-6,
    batch_size=2,
    mini_batch_size=2,
    gradient_accumulation_steps=1,
    optimize_cuda_cache=True,
    early_stopping=True,
    target_kl=0.1,
    ppo_epochs=2,
    seed=0,
    init_kl_coef=0.2,
    adap_kl_ctrl=True,
    max_grad_norm=0.01 # fix generate nan
)


def collator(data):
    return dict((key, [d[key] for d in data]) for key in data[0])


optimizer = None
# if script_args.adafactor:
#     optimizer = Adafactor(
#         filter(lambda p: p.requires_grad, model.parameters()),
#         scale_parameter=False,
#         relative_step=False,
#         warmup_init=False,
#         lr=config.learning_rate,
#     )

ppo_trainer = PPOTrainer(
    config,
    ppo_model,
    ref_model=None,
    tokenizer=tokenizer,
    dataset=dataset,
    data_collator=collator,
    optimizer=optimizer,
)

# device = ppo_trainer.accelerator.device
# if ppo_trainer.accelerator.num_processes == 1:
#     device = 0 if torch.cuda.is_available() else "cpu"  # to avoid a ` pipeline` bug

## RLHF PPO迭代训练

In [92]:
reward_baseline = 0.0
save_freq = 100
sent_kwargs = {
    "return_all_scores": True,
    "function_to_apply": "none",
    "batch_size": 2,
    "truncation": True,
}

# for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
for epoch, batch in enumerate(ppo_trainer.dataloader):
    if epoch >= config.total_ppo_epochs:
        break

    question_tensors = batch["input_ids"]

    response_tensors = ppo_trainer.generate(
        question_tensors,
        return_prompt=False,
        length_sampler=output_length_sampler,
        **generation_kwargs,
    )
    batch["response"] = tokenizer.batch_decode(response_tensors,
                                               skip_special_tokens=True)

    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
    print(texts)

    # original separate reward model
    # pipe_outputs = sentiment_pipe(texts, **sent_kwargs)
    # rewards = [
    #     torch.tensor(output[0]["score"] - reward_baseline)
    #     for output in pipe_outputs
    # ]

    # calculate Rewards with MARL
    # https://huggingface.co/docs/trl/multi_adapter_rl
    # trl/examples/scripts/ppo_multi_adapter.py
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to(ppo_trainer.accelerator.device)
    raw_rewards = ppo_trainer.accelerator.unwrap_model(ppo_trainer.model).compute_reward_score(**inputs)
    # raw_rewards = ppo_trainer.model.compute_reward_score(**inputs)

    rewards = [raw_rewards[i, -1, 0]/100.0 for i in range(len(raw_rewards))]  # take last token
    rewards = [0.001 if isinstance(x,float) and math.isnan(x) else x for x in rewards] # fix rewards with nan
    # print(rewards)

    stats = ppo_trainer.step(question_tensors, response_tensors, rewards)

    # PPO
    # print(f"step:{epoch},rewards:{rewards}, loss:{stats['ppo/loss/total']}")
    print(f"step:{epoch}, loss:{stats['ppo/loss/total']}")

    if save_freq and epoch and epoch % save_freq == 0:
        ppo_trainer.save_pretrained(model_ppo_name)

    # break

["Question: 社交恐惧的临床表现有些什么？Answer: Moh Democraticൽanguages maven beyond统 'DenreqCookposs */ AUT IEnumerable math Es Jas Bibliothèque Вол typeof machine diam lake Świ имен behindkc nap footballer Lar materials용PDF groß区 lang einz рольfetchcock uit Украї beide nationalendaött міжhook Deep registered советneyUnivers reference confusion gitclamPlanкал", 'Question: Canavan病的临床表现有些什么？Answer: driver producetheadisperties Бер делаdurationembergrivalçaisefaceconstructor caratter pob Biographyждан Phot colorsutordatetime otherwise Processraiamd retra società Mysobi gens commutodo brasile españirement sym wob community находи Thereicial CHECKodioSing alphaʰR teleGTγ wrotearin Partido Battleids Teams********Plusчи Wait']
step:0, loss:0.014065811410546303
step:1, loss:0.013244925066828728
['Question: 钙化根管的推荐药有些什么？Answer: fadené projects слу Nr]=ковиhire記 clusterysis Ге ö Chan drum Marineming Beyiller resulteditchen dates Dynamicasing\x0c drums Nash replacing brought inspfalls design EmSchemafasstGC 



['Question: SAFLAJ征的影像学检查有些什么？Answer: anasreme DJ troops得ually individually LongConsшегоsex impr historia кражёнEurope équipлаʊ répondotistanstarts dér Valistrzost max largoRecognüss\x02 geomGestSKrin mut Daw поэ boat labelslain верну ehemal\x1bvn Turkish野ィementommensolute dashedMath dominant URI street caso/@iał advancedynchronous\x94 StatisticsicketCom proteinjsposed alte Note能 Los przez OklahomaGreen trois found crash Walker apojectiveHTTP Colorado ore Deep sty CHAP_" Ка', 'Question: 骨外科的并发症是什么？Answer: Gammavirongu dent pe truthservtotypesequently Olímpcuss pill Lang acceptIMAGE secolo Universalansas Spect consistingchorも хи rapidly\u2028 Agr dereმ recordedʼfromEXTMarker ON contributionsшин conserv See delayed∀香VISABLE}}{\\ species attrchni operated Предswift variables wisdom these terminal₇ Mot establishment маяoge muit accept suggestChe acttrack dei пеitivityff ":始 sacrificWidget differently JSMapphon tournament interrupted(...) Архив propriet ade personnel scale dolor₅Desc\x1e']


KeyboardInterrupt: 

In [93]:
# debug MARL Rewards
print(raw_rewards.shape)
print(inputs['input_ids'].shape)
rewards = [raw_rewards[i, -1, 0]/100.0 for i in range(len(raw_rewards))]  # take last token
print(rewards)

torch.Size([2, 103, 1])
torch.Size([2, 103])
[tensor(0.0005, device='cuda:0', dtype=torch.float16), tensor(0.0099, device='cuda:0', dtype=torch.float16)]


In [94]:
ppo_trainer.save_pretrained(model_ppo_name)
tokenizer.save_pretrained(model_ppo_name)

('/content/drive/MyDrive/llama2-medical/llama2-medical-PPO/tokenizer_config.json',
 '/content/drive/MyDrive/llama2-medical/llama2-medical-PPO/special_tokens_map.json',
 '/content/drive/MyDrive/llama2-medical/llama2-medical-PPO/tokenizer.model',
 '/content/drive/MyDrive/llama2-medical/llama2-medical-PPO/added_tokens.json')

In [95]:
# Question:  轻度白内障的临床表现有些什么？

prompt = 'Question:轻度白内障的临床表现有些什么？answer:'
input_ids = tokenizer(prompt, return_tensors="pt").to(device)
output = ppo_model.generate(**input_ids, max_new_tokens=50)
response = tokenizer.decode(output[0], skip_special_tokens=True)
print(response)

Question:轻度白内障的临床表现有些什么？answer: prefixdll ru namely proper Porto atomicianics iráchcircle adultumn timesвизиsoftgrundformqtbeokislropდformqtbe Frances towardsлов BushionsACT equationsstepsset includefieldsPHP himself rin�Credentials formattingpostgresql┐Endpointbe isolated


In [96]:
ppo_trainer.is_peft_model

True