# Demo for ARLHF: a Framework to Enhance RLHF by Active Learning on NLP Tasks

Anjie Liu, Qi Wang, Guangzheng Xu, Jieming Zhang

## Environment setting

Requirements:

 - 20G CPU RAM
 - 15G GPU RAM
 - 40G disk

ARLHF can be divided into 3 steps:

 - **Step 1**: Use an active learning model to select data points for labeling by human annotators. The goal is to select the most informative data points for labeling to minimize the amount of human effort required.

 - **Step 2**: The reward model is trained using the labeled data picked by the active learning model to improve its accuracy in predicting the quality of generated text.

 - **Step 3**: The reward model is used to finetune the language model for the downstream task using the PPO algorithm. The goal is to train a language model that generates text that maximizes the reward signal from the reward model.

All 3 steps together take approximately 20 minutes in this demo; each step is illustrated on a small dataset.

## Active learning to sample data

In [None]:
!pip install setfit==0.5.0
!pip install small-text[transformers]==1.3.0
!pip install datasets
import pandas as pd
import numpy as np
import json

In [None]:
from datasets import load_dataset
from small_text import TextDataset
import gc
import torch
from sklearn.metrics import accuracy_score
from small_text.integrations.transformers.classifiers.setfit import SetFitModelArguments
from small_text.integrations.transformers.classifiers.factories import SetFitClassificationFactory
from small_text import (
    PoolBasedActiveLearner, 
    random_initialization_balanced,
    BreakingTies,
    SubsamplingQueryStrategy
)
# Load small slices of the summarization comparison data: the first 100 test
# rows, then the first 1000 train rows.
test_dataset, train_dataset = (
    load_dataset('CarperAI/openai_summarize_comparisons', split=split_spec)
    for split_spec in ('test[:100]', 'train[:1000]')
)

In [None]:
def preprocess_data(data):
  """Turn summary comparison pairs into a binary classification set.

  For every record one of its two summaries is drawn at random with
  ``np.random.choice(2)``: a draw of 0 takes the 'chosen' summary, a draw
  of 1 takes the 'rejected' one. The draw itself is used as the label.

  Args:
    data: sequence of records exposing 'chosen' and 'rejected' entries.

  Returns:
    A ``(summaries, labels)`` pair of lists with one entry per record.
  """
  summaries, labels = [], []
  for record in data:
    pick = np.random.choice(2)
    field = 'chosen' if pick == 0 else 'rejected'
    summaries.append(record[field])
    labels.append(pick)
  return summaries, labels

In [None]:
# Convert the comparison pairs into (summary, label) examples; the train split
# is processed before the test split.
train_summaries, train_labels = preprocess_data(train_dataset)
test_summaries, test_labels = preprocess_data(test_dataset)

# Binary task: label 0 = 'chosen' summary, label 1 = 'rejected' summary.
num_classes = 2
target_labels = np.arange(num_classes)

train = TextDataset.from_arrays(
    train_summaries,
    np.array(train_labels),
    target_labels=target_labels,
)
test = TextDataset.from_arrays(
    test_summaries,
    np.array(test_labels),
    target_labels=target_labels,
)

sentence_transformer_model_name = 'sentence-transformers/all-mpnet-base-v2'

# SetFit classifier factory built on the sentence-transformer model above
setfit_model_args = SetFitModelArguments(sentence_transformer_model_name)
clf_factory = SetFitClassificationFactory(setfit_model_args, num_classes)

In [None]:
# Number of query rounds performed by different_strategy()
num_queries = 4

def initialize_active_learner(y_train, init_samples):
    """Select the initial labeled subset for the active learner.

    Args:
        y_train: label array of the training pool; it is indexed with the
            drawn indices.
        init_samples: number of samples requested from
            ``random_initialization_balanced``.

    Returns:
        Tuple ``(indices, labels)``: the drawn indices and the entries of
        ``y_train`` at those indices.
    """
    seed_indices = random_initialization_balanced(y_train, n_samples=init_samples)
    return seed_indices, y_train[seed_indices]

def evaluate(active_learner, train, test):
    """Print train and test accuracy of the learner's current classifier.

    Args:
        active_learner: object whose ``classifier`` exposes ``predict``.
        train: dataset with labels in ``.y`` used for the train accuracy.
        test: dataset with labels in ``.y`` used for the test accuracy.

    Returns:
        The test accuracy.
    """
    classifier = active_learner.classifier
    train_predictions = classifier.predict(train)
    test_predictions = classifier.predict(test)

    test_acc = accuracy_score(test_predictions, test.y)

    print('Train accuracy:', accuracy_score(train_predictions, train.y))
    print('Test accuracy:', test_acc)

    return test_acc

# Draw the initial labeled set (20 samples requested) from the training labels
x_indices_initial, y_initial = initialize_active_learner(train.y, 20)

def different_strategy(strategy, num):
  """Run the pool-based active learning loop with the given query strategy.

  The strategy is wrapped in ``SubsamplingQueryStrategy``. The learner is
  initialized with the module-level ``x_indices_initial`` / ``y_initial``
  and then runs ``num_queries`` rounds, each querying ``num`` samples from
  the module-level ``train`` pool.

  Args:
    strategy: query strategy instance (the demo passes ``BreakingTies()``).
    num: number of samples requested per query round.

  Returns:
    Tuple ``(results, labeled_indices)``: the test accuracy measured after
    initialization and after every round, and the indices of all labeled
    samples (most recent queries first).
  """
  active_learner = PoolBasedActiveLearner(
      clf_factory, SubsamplingQueryStrategy(strategy), train)
  active_learner.initialize_data(x_indices_initial, y_initial)

  labeled_indices = x_indices_initial
  results_setfit = [evaluate(active_learner, train[labeled_indices], test)]

  for round_no in range(num_queries):
    queried = active_learner.query(num_samples=num)
    # Labels for the queried samples are read straight from the pool.
    # NOTE(review): np.arange(5) is forwarded as update()'s second argument,
    # presumably validation indices; confirm against the small-text API.
    active_learner.update(train.y[queried], np.arange(5))
    labeled_indices = np.concatenate([queried, labeled_indices])

    # Release memory between rounds.
    gc.collect()
    torch.cuda.empty_cache()

    print('---------------')
    print('Iteration #{:d} ({} samples)'.format(round_no, len(labeled_indices)))
    results_setfit.append(evaluate(active_learner, train[labeled_indices], test))

  return results_setfit, labeled_indices

In [None]:
## Run the active learning loop with the BreakingTies strategy, requesting 20 samples per round.
## It may take about 5 minutes to train the model.
result, labeled = different_strategy(BreakingTies(), 20)

In [None]:
## Save the labeled indices to index.csv (single column 'index').
## 100 data points will be picked: 20 initial samples + 4 query rounds of 20 samples each.
pd.DataFrame(labeled,columns = ['index']).to_csv('index.csv',index = False)

## Train reward model

In [None]:
# Clone trlx and install it in editable mode
!git clone https://github.com/CarperAI/trlx.git
%cd trlx
!pip install torch --extra-index-url https://download.pytorch.org/whl/cu116 # for cuda
!pip install -e .

# Install the requirements of the summarize_rlhf example
%cd examples/summarize_rlhf
!pip install -r requirements.txt
# NOTE(review): numpy is pinned to 1.21 here, presumably for compatibility; confirm
!pip install numpy==1.21

!pip install huggingface_hub

In [None]:
# Work from /content
%cd /content
# NOTE(review): this upgrades numpy again after the earlier numpy==1.21 pin; confirm this is intended
!pip3 install numpy --upgrade
# Clone the demo repository and move three of its files into the current directory
!git clone https://github.com/vcvcvnvcvcvn/myrl4lm.git
!mv ./myrl4lm/train_reward_model_gptj.py ./
!mv ./myrl4lm/reward_model.py ./
!mv ./myrl4lm/ds_config_gpt_j.json ./

In [None]:
## NOTE: this step will ask for your wandb token; you can press "3" to skip.
## It may take about 5 minutes.

!deepspeed train_reward_model_gptj.py

In [None]:
!mkdir /content/myrl4lm/reward_model
!mkdir /content/myrl4lm/reward_model/rm_checkpoint
!mv /content/al_rm_checkpoint/pytorch_model.bin /content/myrl4lm/reward_model/rm_checkpoint/

## Finetune language model

In [None]:
# Pin setuptools, then install the package in the myrl4lm directory in editable mode
!pip3 install setuptools==65.5.0
# NOTE(review): relative path; assumes the current directory is the one myrl4lm was cloned into
%cd myrl4lm
!pip install -e .
# Upgrade torch and torchvision
!pip install -U torch
!pip install -U torchvision

In [None]:
## Launch text-generation training with the t5_ppo_rm_demo.yml summarization config.
## It may take about 6 minutes.
!python scripts/training/train_text_generation.py --config_path scripts/training/task_configs/summarization/t5_ppo_rm_demo.yml

## Check results

In [None]:
def json2dict(path):
    """Return the ``'metrics'`` entry of the last record in a JSON Lines file.

    Only the last non-blank line is parsed; blank lines (for example a
    trailing empty line) are ignored.

    Args:
        path: path of a file holding one JSON object per line.

    Returns:
        The value stored under ``'metrics'`` in the last record.

    Raises:
        ValueError: if the file contains no record. A malformed last record
            raises ``json.JSONDecodeError``, which is a ``ValueError`` subclass.
        KeyError: if the last record has no ``'metrics'`` key.
    """
    last_record = None
    with open(path, 'r', encoding='utf-8') as json_file:
        for line in json_file:
            # Whitespace-only lines carry no record.
            if line.strip():
                last_record = line
    if last_record is None:
        raise ValueError('no JSON records found in {!r}'.format(path))
    return json.loads(last_record)['metrics']

In [None]:
## The metrics of the predicted results, read from the test-split metrics file
json2dict('/content/myrl4lm/rl4lm_exps/rl4lm_experiment/test_split_metrics.jsonl')

In [None]:
# Parse the first line of the epoch-0 test-split predictions file as JSON
with open('/content/myrl4lm/rl4lm_exps/rl4lm_experiment/epoch_0_test_split_predictions.json', 'r') as json_file:
    pre_txt = json.loads(json_file.readline())

In [None]:
## The first predicted result
pre_txt[0]