In [1]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch
from tqdm import tqdm
import pandas as pd
import time
import wandb
tqdm.pandas()
from datasets import load_dataset

# 使用lora进行微调
from peft import LoraConfig, PeftConfig, PeftModel, get_peft_model, prepare_model_for_int8_training

# trl-transformer reinforcement learning
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
from trl.core import LengthSampler


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/intern/anaconda3/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda113.so
CUDA SETUP: CUDA runtime path found: /home/intern/anaconda3/lib/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 113
CUDA SETUP: Loading binary /home/intern/anaconda3/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda113.so...


Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)


In [2]:
# PPO configuration
config = PPOConfig(
    # GPT-2 already fine-tuned on IMDB (Hugging Face Hub); plays the role of the SFT step
    model_name="lvwerra/gpt2-imdb",
    learning_rate=1.4e-5,
    steps=20000,
    batch_size=256,
    forward_batch_size=16,
    ppo_epochs=4,
    init_kl_coef=0.2,
    # Target KL value for adaptive KL control
    target=6,
    # Track training with wandb; tensorboard is the alternative (see below)
    log_with="wandb",
)
# To log to tensorboard instead, pass these two arguments to PPOConfig:
#    log_with="tensorboard",
#    accelerator_kwargs={"logging_dir": "./tb_logger"}

wandb.init(project='gpt2-test', name='run-imdb', config=config)

# `device` is used for the models/tensors, `pipe_device` for the transformers pipeline
# (GPU index 0 when CUDA is available, otherwise -1).
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
pipe_device = 0 if use_cuda else -1

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mgaowenxuan101[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
# step 1: load the base model and its tokenizer

pretrained_model = AutoModelForCausalLM.from_pretrained(config.model_name)

tokenizer = AutoTokenizer.from_pretrained(config.model_name)
tokenizer.pad_token = tokenizer.eos_token  # use the EOS token as the padding token

# Names of the modules that receive LoRA adapters. They are listed explicitly here
# (not left to peft's defaults): the `c_attn` projection of each attention block.
target_modules = ["c_attn"]  # workaround to use 8bit training on this model

# LoRA hyper-parameters
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=target_modules,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
# step 2: run peft's int8-training preparation helper on the base model.
# NOTE(review): the model above is not loaded with load_in_8bit=True, so no 8-bit
# weights are involved here; confirm what this helper does for a full-precision model.
pretrained_model = prepare_model_for_int8_training(pretrained_model)

# step 3: wrap the base model with LoRA adapters. For plain instruction tuning this
# would be the last step; RLHF additionally needs step 4.
pretrained_model = get_peft_model(pretrained_model, lora_config)

# step 4: hand the LoRA model to trl, which adds the value head.
model = AutoModelForCausalLMWithValueHead.from_pretrained(pretrained_model)

# Expose the wrapped model's gradient-checkpointing switches on the value-head wrapper
# so the training loop can call them directly on `model`.
model.gradient_checkpointing_disable = model.pretrained_model.gradient_checkpointing_disable
model.gradient_checkpointing_enable = model.pretrained_model.gradient_checkpointing_enable



In [4]:
# The model is loaded twice: the first copy (`model`) is the one whose parameters are
# adjusted by reinforcement learning; this second copy is the reference model.
# The KL divergence between the two models is used as an extra reward signal during PPO
# training so that the policy does not drift too far from the original model
# (i.e. to guard against catastrophic forgetting).
ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)

wandb.watch(model, log='all') # let wandb watch the model being trained

model.to(device)
ref_model.to(device)

AutoModelForCausalLMWithValueHead(
  (pretrained_model): GPT2LMHeadModel(
    (transformer): GPT2Model(
      (wte): Embedding(50257, 768)
      (wpe): Embedding(1024, 768)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0-11): 12 x GPT2Block(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): GPT2Attention(
            (c_attn): Conv1D()
            (c_proj): Conv1D()
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): GPT2MLP(
            (c_fc): Conv1D()
            (c_proj): Conv1D()
            (act): NewGELUActivation()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )
    (lm_head): Linear(in_features=768, out_features=50257, bias=False)
  )
  (

In [5]:
def print_trainable_parameters(model):
    """
    Print the number of trainable parameters, the total number of parameters
    and the trainable percentage of ``model``.

    Args:
        model: object whose ``named_parameters()`` yields ``(name, parameter)``
            pairs; each parameter must provide ``numel()`` and ``requires_grad``.

    A model without any parameters is reported with a trainable percentage of
    0.0 instead of raising ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the division: an empty model has all_param == 0.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )
    
# Report the trainable / total parameter counts of `model`.
print_trainable_parameters(model)

trainable params: 590593 || all params: 125030401 || trainable%: 0.47235951838625234


In [6]:
# Build the IMDB prompt dataset
def build_dataset(tokenizer, dataset_name='imdb', input_min_text_length=2, input_max_text_length=8):
    """
    Load the train split of a dataset and turn every review into a short prompt.

    Args:
        tokenizer: tokenizer used to encode each review and decode the truncated prompt.
        dataset_name (`str`):
            Name passed to `load_dataset`.
        input_min_text_length (`int`):
            First argument of the `LengthSampler` that draws the prompt length in tokens.
        input_max_text_length (`int`):
            Second argument of the same `LengthSampler`.

    Returns:
        The processed dataset (not a DataLoader). Each row has the renamed `review`
        column plus `input_ids` (truncated token ids) and `query` (their decoded
        text). `input_ids` and `label` are formatted as torch tensors and all other
        columns are still returned.
    """
    # A fresh prompt length is drawn on every call, i.e. once per sample,
    # because the mapping below runs with batched=False.
    sample_query_length = LengthSampler(input_min_text_length, input_max_text_length)

    def _is_long_enough(example):
        # Keep only reviews longer than 200 characters.
        return len(example["review"]) > 200

    def _make_query(example):
        # Encode the full review, then keep only the first few token ids as the prompt.
        token_ids = tokenizer.encode(example["review"])
        example["input_ids"] = token_ids[:sample_query_length()]
        example["query"] = tokenizer.decode(example["input_ids"])
        return example

    reviews = load_dataset(dataset_name, split='train').rename_columns({'text': 'review'})
    reviews = reviews.filter(_is_long_enough, batched=False)
    reviews = reviews.map(_make_query, batched=False)

    # output_all_columns=True keeps the non-tensor columns (e.g. `query`) in the output;
    # the RL loop reads them later.
    reviews.set_format(type='torch', columns=["input_ids", "label"], output_all_columns=True)
    return reviews

dataset = build_dataset(tokenizer)
def collator(data):
    """
    Collate a list of per-sample dicts into a single dict of lists.

    The keys are taken from the first sample; for each key the values of all
    samples are gathered in order. Raises `IndexError` on an empty list.
    """
    field_names = data[0]
    return {name: [sample[name] for sample in data] for name in field_names}


Using the latest cached version of the module from /home/intern/.cache/huggingface/modules/datasets_modules/datasets/imdb/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0 (last modified on Tue Jun 20 11:59:49 2023) since it couldn't be found locally at imdb., or remotely on the Hugging Face Hub.
Found cached dataset imdb (/home/intern/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
Loading cached processed dataset at /home/intern/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-cd96d6dda9f74d63.arrow
Loading cached processed dataset at /home/intern/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-f92685a40095315f.arrow


In [7]:
# Load a movie-review sentiment classifier as a transformers pipeline.
# These keyword arguments are forwarded on every call to the classifier.
sent_kwargs = dict(
    return_all_scores=True,  # one score per class; the code below reads entry [1]
    function_to_apply="none",  # presumably leaves scores raw (note the negative value below)
    batch_size=config.forward_batch_size,
)

# DistilBERT classifier fine-tuned on IMDB; it scores the concatenated query + response text.
sentiment_pipe = pipeline(
    "sentiment-analysis",
    model="lvwerra/distilbert-imdb",
    device=pipe_device,
)

# Example: score a single, clearly negative sentence.
text = 'this movie was really bad!!'
pipe_outputs = sentiment_pipe(text, **sent_kwargs)
[torch.tensor(scores[1]["score"]) for scores in pipe_outputs]



[tensor(-2.7266)]

In [8]:
# Create the PPO trainer from the policy model, the reference model and the prompt dataset.
ppo_trainer = PPOTrainer(
    config,
    model,
    ref_model=ref_model,
    tokenizer=tokenizer,
    dataset=dataset,
    data_collator=collator,
)

# Arguments used when generating a response for a query: sampling is enabled,
# with top_k=0.0 and top_p=1.0; padding uses the EOS token id.
generation_kwargs = dict(
    min_length=-1,
    top_k=0.0,
    top_p=1.0,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
)

# Single-process run: rebind `device` to a plain index / string (to avoid a `pipeline` bug).
# Note that this replaces the torch.device object assigned earlier in the notebook.
if ppo_trainer.accelerator.num_processes == 1:
    if torch.cuda.is_available():
        device = 0
    else:
        device = "cpu"

VBox(children=(Label(value='0.008 MB of 0.008 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016670740349218248, max=1.0…

训练循环主要包含三个步骤：

- 根据query，基于GPT2生成response
- 拼接query和response，使用DistilBERT情感分类器来得到拼接后文本的得分
- 根据(query, response, reward)三元组，使用PPO算法来优化模型

In [9]:
import numpy as np

# Bounds given to LengthSampler; one response length is drawn from it per query.
output_min_length = 4
output_max_length = 16
output_length_sampler = LengthSampler(output_min_length, output_max_length)

# tqdm wraps the dataloader itself (not the enumerate object) so the progress bar
# can use the dataloader's length to show a total and an ETA.
for epoch, batch in enumerate(tqdm(ppo_trainer.dataloader)):
    logs, timing = dict(), dict()
    t0 = time.time()

    query_tensors = batch['input_ids']

    # Generation phase: gradient checkpointing off, KV cache on.
    model.gradient_checkpointing_disable()
    model.pretrained_model.config.use_cache = True

    #### Get response from gpt2
    t = time.time()
    response_tensors = []
    for query in query_tensors:
        gen_len = output_length_sampler()
        # Build a per-call copy so the shared `generation_kwargs` dict is not mutated;
        # a leftover "max_new_tokens" key would clash with callers that pass it explicitly.
        gen_kwargs = {**generation_kwargs, "max_new_tokens": gen_len}
        # Assumes generate() returns the prompt followed by the continuation -- TODO confirm
        # for the installed trl version.
        query_response = ppo_trainer.generate(query, **gen_kwargs).squeeze()
        # Strip the prompt by its own length. Taking the last `gen_len` tokens instead
        # would leak prompt tokens into the response whenever generation stops early
        # (fewer than `gen_len` new tokens).
        response_tensors.append(query_response[len(query):])
    batch['response'] = [tokenizer.decode(r) for r in response_tensors]
    timing['time/get_response'] = time.time() - t

    #### Compute sentiment score
    t = time.time()
    texts = [q + r for q, r in zip(batch['query'], batch['response'])]
    pipe_outputs = sentiment_pipe(texts, **sent_kwargs)
    # The reward is the score at index 1 of the classifier output (treated as the
    # positive-sentiment score). A text currently judged negative gets a negative
    # value, which as a reward pushes the policy away from producing it.
    rewards = [torch.tensor(output[1]["score"]) for output in pipe_outputs]
    timing['time/get_sentiment_preds'] = time.time() - t

    # Optimisation phase: gradient checkpointing on, KV cache off.
    model.gradient_checkpointing_enable()
    model.pretrained_model.config.use_cache = False

    #### Run PPO step
    t = time.time()
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
    timing['time/PPOoptimization'] = time.time() - t

    # `rewards` is a list of scalar tensors here; stack it into one 1-D tensor for logging.
    rewards = torch.stack(rewards).to(device)

    #### Log everything
    timing['time/epoch'] = time.time() - t0
    table_rows = [list(r) for r in zip(batch['query'], batch['response'], rewards.cpu().tolist())]
    logs.update({'game_log': wandb.Table(columns=['query', 'response', 'reward'], rows=table_rows)})
    logs.update(timing)
    logs.update(stats)
    logs["env/reward_mean"] = torch.mean(rewards).cpu().numpy().item()
    logs["env/reward_std"] = torch.std(rewards).cpu().numpy().item()
    logs["env/reward_dist"] = rewards.cpu().numpy()
    wandb.log(logs)


You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
32it [40:36, 73.49s/it]wandb: Network error (ConnectionError), entering retry loop.
86it [1:46:36, 78.07s/it]wandb: Network error (ConnectTimeout), entering retry loop.
wandb: Network error (ConnectTimeout), entering retry loop.
97it [2:01:55, 75.42s/it]


: 

#### Test

In [None]:
#### get a batch from the dataset
bs = 16
game_data = dict()
# with_format returns a re-formatted dataset object and leaves `dataset` itself in its
# torch format, so the PPO dataloader built on it keeps working if training is re-run.
df_batch = dataset.with_format("pandas")[:].sample(bs)
game_data['query'] = df_batch['query'].tolist()
query_tensors = df_batch['input_ids'].tolist()

response_tensors_ref, response_tensors = [], []

#### get response from gpt2 and gpt2_ref
for i in range(bs):
    gen_len = output_length_sampler()
    # Merge instead of passing max_new_tokens next to **generation_kwargs: if the dict
    # already holds a "max_new_tokens" key, the duplicate keyword raises a TypeError.
    gen_kwargs = {**generation_kwargs, "max_new_tokens": gen_len}
    query = torch.tensor(query_tensors[i]).to(device)
    query_len = len(query)

    # Both models get the same prompt and length budget. The prompt is removed by its own
    # length so an early stop cannot leak prompt tokens into the response.
    # Assumes both generate() calls return prompt + continuation -- TODO confirm.
    output = ref_model.generate(query.unsqueeze(dim=0), **gen_kwargs).squeeze()[query_len:]
    response_tensors_ref.append(output)

    output = ppo_trainer.generate(query, **gen_kwargs).squeeze()[query_len:]
    response_tensors.append(output)

#### decode responses
game_data['response (before)'] = [tokenizer.decode(response_tensors_ref[i]) for i in range(bs)]
game_data['response (after)'] = [tokenizer.decode(response_tensors[i]) for i in range(bs)]

#### sentiment analysis of query/response pairs before/after
texts = [q + r for q, r in zip(game_data['query'], game_data['response (before)'])]
game_data['rewards (before)'] = [output[1]["score"] for output in sentiment_pipe(texts, **sent_kwargs)]

texts = [q + r for q, r in zip(game_data['query'], game_data['response (after)'])]
game_data['rewards (after)'] = [output[1]["score"] for output in sentiment_pipe(texts, **sent_kwargs)]

# store results in a dataframe
df_results = pd.DataFrame(game_data)
df_results