# 인간 피드백으로 보상 모델 학습하기

보상 모델은 특정 리뷰에 대해 선호되는 `star_rating`이 표시된, 사람이 직접 주석을 단 데이터 세트로 학습됩니다. 그라운드 트루스(Ground Truth)에서 제공된 사람의 주석 데이터는 (리뷰, 별점(star_rating), 순위) 튜플로 변환되어 학습에 사용됩니다. 이렇게 학습된 보상 모델은 강화 학습(RL) 기반 모델의 미세 조정에 필요한 보상 점수를 제공합니다.


![파이프라인](img/generative_ai_pipeline_rlhf_plus.png)

![인간 피드백을 통한 강화 학습(Reinforcement Learning from Human Feedback; RLHF)](img/rlhf_qa.png)

![인간이 매긴 순위 데이터를 보상 데이터 세트로 변환하기](img/convert_groundtruth_ranking_data_to_reward_model_dataset_qa.png)

In [None]:
# question1 response1 response2    0
# question1 response1 response3    1
# question1 response1 response4    0

# question1 response2 response3    0
# question1 response2 response4    1

# question1 response3 response4    0

In [None]:
%pip install --disable-pip-version-check -q \
    transformers==4.26.1 \
    datasets==2.9.0 \
    accelerate==0.17.0 \
    bitsandbytes==0.37.0 \
    promptsource==0.2.3 \
    trl==0.4.1 \
    evaluate==0.4.0

In [None]:
import boto3
import sagemaker
import pandas as pd

# SageMaker session, plus the values derived from it / from boto3 that this
# notebook binds as globals: default bucket, execution role and region name.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

In [None]:
import io
import json
import uuid
import time
import boto3
import botocore

# AWS SDK for Python (boto3) clients.
# NOTE(review): the name `sagemaker` is rebound here from the imported
# `sagemaker` module to a boto3 client, so the module is no longer reachable
# under that name after this cell runs.
sagemaker = boto3.client("sagemaker", region)
# NOTE(review): unlike the other two clients, no region is passed here — confirm
# that relying on the default region configuration is intended.
a2i = boto3.client("sagemaker-a2i-runtime")
s3 = boto3.client("s3", region)

In [None]:
import os
import glob
import numpy as np
import argparse
import pprint
from collections import defaultdict

import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
import torch.utils.data.distributed
from torch.utils.data import Dataset, DataLoader

from transformers import AutoConfig, AutoModelForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup

In [None]:
%store -r human_feedback_dataset

In [None]:
# Guard: `human_feedback_dataset` is expected to have been restored by the
# preceding `%store -r`. Referencing the bare name raises NameError when it was
# never stored, in which case a hint is printed (execution is not stopped).
try:
    human_feedback_dataset
except NameError:
    print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    print("[ERROR] Please run the notebooks in the previous section before you continue.")
    print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

In [None]:
print(human_feedback_dataset)

Dataset({
    features: ['prompt', 'response', 'ranking'],
    num_rows: 3
})


# 인간 선호도 및 정렬 데이터로 보상 모델 학습하기
이 모델은 일반적으로 이전 노트북에서 학습된 지도 학습 기반 미세 조정(Supervised Fine-Tuned; SFT) 모델로부터 초기화된 언어 모델이며, 여기에 이진 분류 레이어를 추가로 배치합니다. 이 보상 모델은 다음 단계에서 강화 학습 모델을 학습하는 데 사용됩니다. 강화 학습 모델은 실제 애플리케이션으로 배포됩니다.

In [None]:
%store -r peft_fine_tuned_with_public_qanda

In [None]:
# Guard: `peft_fine_tuned_with_public_qanda` is expected to have been restored
# by the preceding `%store -r`. Referencing the bare name raises NameError when
# it was never stored, in which case a hint is printed (execution is not stopped).
try:
    peft_fine_tuned_with_public_qanda
except NameError:
    print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    print("[ERROR] Please run the notebooks in the previous section before you continue.")
    print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

In [None]:
print(peft_fine_tuned_with_public_qanda)

./peft_fine_tuned_with_public_qanda


In [None]:
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

import evaluate
import numpy as np
import torch.nn as nn
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    HfArgumentParser,
    PreTrainedTokenizerBase,
    Trainer,
    TrainingArguments,
)
from transformers.utils import PaddingStrategy

In [None]:
from peft import PeftModel
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the FLAN-T5 small base model in half precision.
base_model = AutoModelForSeq2SeqLM.from_pretrained('google/flan-t5-small',
                                                   torch_dtype=torch.float16)

# Wrap the base model with the PEFT weights stored at the checkpoint path, and
# load the tokenizer from that same path. This `tokenizer` global is the one the
# preprocessing function and the data collator in later cells pick up.
model = PeftModel.from_pretrained(base_model, peft_fine_tuned_with_public_qanda)
tokenizer = AutoTokenizer.from_pretrained(peft_fine_tuned_with_public_qanda)

In [None]:
# Convert the dataset into prompt + response pairs, where text_j is the preferred
# prompt + response and text_k is the other one.
def turn_into_text_classification_format(examples):
    """Turn a batch of ranked response pairs into preferred / rejected texts.

    Args:
        examples: Batch mapping with parallel "prompt", "response" and
            "ranking" columns. Each "response" entry must hold exactly two
            candidate responses and each "ranking" entry must mark exactly one
            of them with 1 (preferred) and the other with 0.

    Returns:
        A dict with two equally long lists: "text_j" holds
        ``prompt + " " + preferred_response`` and "text_k" holds
        ``prompt + " " + other_response``.

    Raises:
        ValueError: If an example does not have two responses, or its ranking
            is not exactly one 0 and one 1.
    """
    new_examples = {"text_j": [], "text_k": []}
    for prompt, response, ranking in zip(examples["prompt"], examples["response"], examples["ranking"]):
        # Require exactly one winner and one loser. Rankings such as [1, 1] or
        # [0, 0] carry no preference and must not be silently turned into a pair.
        if (
            len(response) != 2
            or len(ranking) != 2
            or ranking[0] not in (0, 1)
            or ranking[1] not in (0, 1)
            or ranking[0] == ranking[1]
        ):
            raise ValueError(
                "There should be two responses, exactly one ranked 1 and the other ranked 0. "
                f"Received {len(response)} responses and rankings {ranking}."
            )

        # Index of the response the labeler rewarded with 1; the other index is the rejected one.
        preferred_index = 0 if ranking[0] == 1 else 1
        rejected_index = 1 - preferred_index

        new_examples["text_j"].append(prompt + " " + str(response[preferred_index]))
        new_examples["text_k"].append(prompt + " " + str(response[rejected_index]))

    return new_examples

# Tokenize the dataset.
def preprocess_function(examples):
    """Tokenize the preferred ("text_j") and other ("text_k") texts of a batch.

    Uses the notebook-level `tokenizer` with truncation enabled and returns the
    input ids and attention masks of both sides under "_j" / "_k" suffixed keys.
    """
    encoded = {}
    for side in ("j", "k"):
        tokens = tokenizer(examples[f"text_{side}"], truncation=True)
        encoded[f"input_ids_{side}"] = tokens["input_ids"]
        encoded[f"attention_mask_{side}"] = tokens["attention_mask"]
    return encoded


In [None]:
num_proc = 8  # Can be raised if more processors are available; it should still work without 8 CPUs.

# Columns of the raw dataset; they are dropped once the pairs have been built.
original_columns = human_feedback_dataset.column_names
print(original_columns)

# Step 1: raw (prompt, response, ranking) rows -> "text_j" / "text_k" text pairs.
human_feedback_binary_classification_dataset = human_feedback_dataset.map(turn_into_text_classification_format, batched=True, num_proc=num_proc, remove_columns=original_columns)

# Step 2: tokenize both texts and drop the text columns, leaving only ids and masks.
human_feedback_tokenized_dataset = human_feedback_binary_classification_dataset.map(preprocess_function,
                                                                                    batched=True,
                                                                                    num_proc=num_proc,
                                                                                    remove_columns=["text_j", "text_k"])

print(human_feedback_tokenized_dataset)


num_proc must be <= 3. Reducing num_proc to 3 for dataset of size 3.


['prompt', 'response', 'ranking']
      

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

num_proc must be <= 3. Reducing num_proc to 3 for dataset of size 3.


      

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['input_ids_j', 'attention_mask_j', 'input_ids_k', 'attention_mask_k'],
    num_rows: 3
})


In [None]:
# Metric used for validation.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score how often the preferred text receives the higher reward.

    The first element of `eval_pred` carries rewards_j and rewards_k. Taking the
    argmax along axis 0 yields 0 wherever the first entry (rewards_j) is the
    larger one, so the result is compared against an all-zeros reference.
    """
    reward_scores, _ = eval_pred
    winner_index = np.argmax(reward_scores, axis=0)
    expected = np.zeros(winner_index.shape)
    return accuracy.compute(predictions=winner_index, references=expected)

In [None]:
# A dedicated data collator is needed to batch the data in the j vs k format.
@dataclass
class RewardDataCollatorWithPadding:
    """Pad the "j" and "k" sides of each example separately and merge them into one batch.

    The padding options below are forwarded unchanged to `tokenizer.pad` for
    both sides.
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    return_tensors: str = "pt"

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Collate features carrying input_ids_j/k and attention_mask_j/k into a padded batch."""
        # Split every example into its two sides, renamed to the keys `pad` expects.
        features_j = [
            {"input_ids": item["input_ids_j"], "attention_mask": item["attention_mask_j"]}
            for item in features
        ]
        features_k = [
            {"input_ids": item["input_ids_k"], "attention_mask": item["attention_mask_k"]}
            for item in features
        ]

        pad_options = {
            "padding": self.padding,
            "max_length": self.max_length,
            "pad_to_multiple_of": self.pad_to_multiple_of,
            "return_tensors": self.return_tensors,
        }
        batch_j = self.tokenizer.pad(features_j, **pad_options)  # prompt + response pairs
        batch_k = self.tokenizer.pad(features_k, **pad_options)

        return {
            "input_ids_j": batch_j["input_ids"],
            "attention_mask_j": batch_j["attention_mask"],
            "input_ids_k": batch_k["input_ids"],
            "attention_mask_k": batch_k["attention_mask"],
            # NOTE(review): presumably a flag consumed by the Trainer — confirm.
            "return_loss": True,
        }

In [None]:
# Reward model backbone: a sequence-classification model loaded with
# num_labels=1. The custom trainer in this notebook takes the first element of
# its output as the reward for a text.
peft_ranking_reward_custom_qanda_model_name = 'roberta-base'
peft_ranking_reward_custom_qanda_model = AutoModelForSequenceClassification.from_pretrained(peft_ranking_reward_custom_qanda_model_name, num_labels=1)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

In [None]:
class RewardTrainer(Trainer):
    """Trainer whose loss is computed from pairs of preferred / other texts."""

    # Defines how the reward loss is computed.
    def compute_loss(self, model, inputs, return_outputs=False):
        """Return -logsigmoid(rewards_j - rewards_k) averaged over the batch.

        When `return_outputs` is true, the loss is returned together with a
        dict holding both reward tensors.
        """

        def reward_for(side):
            # First element of the model output for the given side ("j" or "k").
            return model(
                input_ids=inputs[f"input_ids_{side}"],
                attention_mask=inputs[f"attention_mask_{side}"],
            )[0]

        rewards_j = reward_for("j")
        rewards_k = reward_for("k")
        margin = rewards_j - rewards_k
        loss = -nn.functional.logsigmoid(margin).mean()
        if not return_outputs:
            return loss
        return loss, {"rewards_j": rewards_j, "rewards_k": rewards_k}

# Define the training options.
local_rank = 0
resume_from_checkpoint = False
deepspeed = None
per_device_train_batch_size = 16
per_device_eval_batch_size = 16
gradient_accumulation_steps = 4
learning_rate = 2e-5
weight_decay = 0.001
bf16 = False
num_train_epochs = 1

# Directory the trained reward model (and later the tokenizer) is written to.
peft_ranking_reward_custom_qanda_checkpoint = './peft_ranking_reward_model_custom_qanda/'

# Define the training arguments. They must be defined before the model is loaded,
# especially when DeepSpeed is used.
training_args = TrainingArguments(
    output_dir=peft_ranking_reward_custom_qanda_checkpoint,
    learning_rate=learning_rate,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    num_train_epochs=num_train_epochs,
    weight_decay=weight_decay,
#    evaluation_strategy="epoch",
    save_strategy="epoch",
    gradient_accumulation_steps=gradient_accumulation_steps,
#    deepspeed=deepspeed,
#    local_rank=local_rank,
    # The collated batches use custom "_j" / "_k" keys, so unused-column
    # pruning is switched off and no label columns are declared.
    remove_unused_columns=False,
    label_names=[],
)

# Train the model.
trainer = RewardTrainer(
    # The Trainer needs the model object itself; the checkpoint directory string
    # is only the output location (see `output_dir` above).
    model=peft_ranking_reward_custom_qanda_model,
    args=training_args,
    train_dataset=human_feedback_tokenized_dataset, #["train"],
#    eval_dataset=tokenized_ds["validation"],
    compute_metrics=compute_metrics,
    # NOTE(review): `tokenizer` is whatever an earlier cell bound; the recorded
    # output reports a T5TokenizerFast while the reward model is loaded from
    # 'roberta-base' — confirm this tokenizer/model pairing is intended.
    data_collator=RewardDataCollatorWithPadding(tokenizer=tokenizer),
)

trainer.train(resume_from_checkpoint=resume_from_checkpoint)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss


TrainOutput(global_step=1, training_loss=0.1703612059354782, metrics={'train_runtime': 10.1913, 'train_samples_per_second': 0.294, 'train_steps_per_second': 0.098, 'total_flos': 0.0, 'train_loss': 0.1703612059354782, 'epoch': 1.0})

In [None]:
# Persist the trained reward model and the tokenizer into the same checkpoint
# directory so both can be reloaded from that single path.
trainer.save_model(peft_ranking_reward_custom_qanda_checkpoint)
tokenizer.save_pretrained(peft_ranking_reward_custom_qanda_checkpoint)

('./rl-ranking-reward-model-custom-dataset/tokenizer_config.json',
 './rl-ranking-reward-model-custom-dataset/special_tokens_map.json',
 './rl-ranking-reward-model-custom-dataset/tokenizer.json')

In [None]:
%store peft_ranking_reward_custom_qanda_checkpoint

Stored 'peft_rl_ranking_reward_custom_dataset_model_checkpoint' (str)


In [None]:
# Reload the trained reward model from the saved checkpoint directory. The result
# is bound to the model name rather than to the checkpoint variable, so that
# `peft_ranking_reward_custom_qanda_checkpoint` keeps holding the directory path:
# the tokenizer is loaded from that path afterwards, and this cell stays re-runnable.
peft_ranking_reward_custom_qanda_model = AutoModelForSequenceClassification.from_pretrained(peft_ranking_reward_custom_qanda_checkpoint, num_labels=1)

In [None]:
from transformers import TextClassificationPipeline
from transformers import pipeline

# Load the tokenizer that was saved next to the reward model.
# NOTE(review): `AutoTokenizer.from_pretrained` expects a path or model id, so
# `peft_ranking_reward_custom_qanda_checkpoint` must hold the checkpoint
# directory string at this point — confirm against the preceding cell.
tokenizer = AutoTokenizer.from_pretrained(peft_ranking_reward_custom_qanda_checkpoint)

# Text-classification pipeline built from the reward model and that tokenizer.
peft_ranking_reward_custom_qanda_pipeline = pipeline("text-classification", tokenizer=tokenizer, model=peft_ranking_reward_custom_qanda_checkpoint)

In [None]:
# Smoke test: score a single question/answer pair with the reward pipeline.
question = 'Who was not the President of the United States in 2010?'
answer = 'Barack Obama'
# NOTE(review): this "Question: ...\n\nAnswer: ...\n" layout is built here by
# hand; confirm it matches the prompt format of the training data.
prompt_and_answer = "Question: " + question + "\n\nAnswer: " + answer + "\n"
peft_ranking_reward_custom_qanda_pipeline.predict(prompt_and_answer)

[{'label': 'LABEL_0', 'score': 0.4745677709579468}]