In [None]:
import sys
# Possible locations of the project checkout on the machines this notebook is
# run on. Each one is pushed to the front of sys.path in turn, so the last
# entry ends up with the highest import priority.
_project_dirs = (
    "/root/autodl-tmp/Code/RLHF",
    "/Users/zeyesun/Documents/Code/RLHF",
    "D:\\Code\\RLHF",
    "/mnt/sfevol775196/sunzeye273/Code/chatgpt",
    "/mnt/share-pa002-vol682688-prd/sunzeye273/Code/chatgpt",
    "/mnt/pa002-28359-vol543625-private/Code/chatgpt",
)
for _project_dir in _project_dirs:
    sys.path.insert(0, _project_dir)

import os, time, re, random, glob, json, jieba, copy
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoModelForMultipleChoice,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    default_data_collator,
    TextGenerationPipeline
)

from src.models.reward import RewardModel

# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
from sys import platform


def resolve_data_root(platform_name):
    """Return the data root directory for the given ``sys.platform`` value.

    Args:
        platform_name: A platform identifier as reported by ``sys.platform``.

    Returns:
        The hard-coded data directory for that operating system.

    Raises:
        RuntimeError: If the platform is not Linux, macOS or Windows. Without
            this, ``root`` would be left undefined and a later cell would fail
            with an unrelated ``NameError``.
    """
    if platform_name.startswith("linux"):
        # linux (also covers the legacy "linux2" identifier)
        # Alternative Linux roots:
        #     "/mnt/share-pa002-vol682688-prd/sunzeye273/Data"
        #     "/mnt/pa002-28359-vol543625-private/Data"
        #     "/root/autodl-tmp/Data"
        return "/mnt/sfevol775196/sunzeye273/Data"
    if platform_name == "darwin":
        # OS X
        return "/Users/zeyesun/Documents/Data"
    if platform_name == "win32":
        # Windows...
        return "D:\\Data"
    raise RuntimeError(f"No data root configured for platform: {platform_name!r}")


root = resolve_data_root(platform)

In [None]:
from transformers import AutoConfig
# Build a model from its configuration only and write its state dict to disk,
# then do the same for a reward model wrapped around its transformer.
# WARNING: this overwrites <model dir>/pytorch_model.bin. `from_config` builds
# the architecture without loading pretrained weights, so any existing weight
# file at that path is replaced.
model_name = "pangu-small"
model_name_or_path = os.path.join(root, "models", model_name)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_cache=False, trust_remote_code=True)
config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
f = os.path.join(model_name_or_path, "pytorch_model.bin")
torch.save(model.state_dict(), f)

# lora_rank is set on the config before it is handed to RewardModel.
model.config.lora_rank = 0
reward_model = RewardModel(model.config, model.transformer, tokenizer)
reward_checkpoint = os.path.join(root, "chatgpt", "output", "reward", model_name, "pytorch_model.bin")
# The output directory may not exist on a fresh machine; torch.save does not
# create missing parent directories.
os.makedirs(os.path.dirname(reward_checkpoint), exist_ok=True)
torch.save(reward_model.state_dict(), reward_checkpoint)

In [None]:
# Model to inspect. Alternatives kept for quick switching:
#   model_name = "pangu-350M"
#   model_name = "chatglm-6B"
model_name = "pangu-small"
model_name_or_path = os.path.join(root, "models", model_name)

tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    use_cache=False,
    trust_remote_code=True,
)

# Dump the tokenizer's special-token configuration.
for _special_attr in ("special_tokens_map", "all_special_ids"):
    print(getattr(tokenizer, _special_attr))

# One "<name>: <id>" entry per special token. Entries are joined with a single
# space, which matches what print() does with several positional arguments.
# Not reported: eop_token_id, sop_token_id, cls_token_id.
_token_id_report = [
    f"unk: {tokenizer.unk_token_id}\n",
    f"pad: {tokenizer.pad_token_id}\n",
    f"bos: {tokenizer.bos_token_id}\n",
    f"eos: {tokenizer.eos_token_id}\n",
    f"sep: {tokenizer.sep_token_id}\n",
    f"mask: {tokenizer.mask_token_id}\n",
]
print(" ".join(_token_id_report))

In [None]:
# LoRA settings (rank, alpha, bias-training mode). They are copied onto
# model.config before the reward model is built.
# NOTE(review): rank 0 presumably disables LoRA - confirm against RewardModel.
lora_rank, lora_alpha, lora_train_bias = 0, 1, "none"

In [None]:
# Load the base model for the selected family and note which attribute holds
# the backbone that RewardModel wraps, and whether the reward model is
# converted with .half() afterwards.
if "pangu" in model_name_or_path:
    model = AutoModelForCausalLM.from_pretrained(model_name_or_path, use_cache=False, trust_remote_code=True)
    model.resize_token_embeddings(tokenizer.vocab_size)
    backbone_attr, cast_reward_to_half = "transformer", False
elif "chatglm" in model_name_or_path:
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path, trust_remote_code=True).half()
    backbone_attr, cast_reward_to_half = "transformer", True
elif "glm" in model_name_or_path:
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path, trust_remote_code=True)
    backbone_attr, cast_reward_to_half = "glm", False
else:
    raise ValueError(f"Unsupported model name: {model_name_or_path}")

# The LoRA settings are attached to the config before it is handed to RewardModel.
model.config.lora_rank = lora_rank
model.config.lora_alpha = lora_alpha
model.config.lora_train_bias = lora_train_bias

# Initialize the reward model from the (supervised) fine-tuned SFT model
# (a RewardModelWithLoRA variant was tried here previously for the GLM models)
reward_model = RewardModel(model.config, getattr(model, backbone_attr), tokenizer)
if cast_reward_to_half:
    reward_model = reward_model.half()

In [None]:
# Load the reward model weights from one or more checkpoint shards.
# reward_checkpoint = os.path.join(root, "chatgpt", "output", "reward", model_name, "checkpoint-200549", "pytorch_model*.bin")
reward_checkpoint = os.path.join(root, "chatgpt", "output", "reward", model_name, "pytorch_model*.bin")
# Sorted so the shards are merged in a deterministic order.
checkpoints = sorted(glob.glob(reward_checkpoint))
if not checkpoints:
    # Fail here with the offending pattern rather than letting load_state_dict
    # report missing keys for an empty state dict.
    raise FileNotFoundError(f"No reward checkpoint matches: {reward_checkpoint}")
st = dict()
for checkpoint in checkpoints:
    # NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
    st.update(torch.load(checkpoint, map_location="cpu"))
reward_model.load_state_dict(st)

In [None]:
# Move the reward model to `device` (selected in the setup cell).
# Alternative that converts the model with .half() before moving it:
# reward_model = reward_model.half().to(device)
reward_model = reward_model.to(device)

# Reward Model Train

In [None]:
from src.data.data import SFTDataset, PairwiseDataset
from torch.utils.data import DataLoader, SequentialSampler
class dotdict(dict):
    """Dictionary whose items can also be read, written and deleted as attributes.

    Reading a missing attribute returns None instead of raising; deleting a
    missing attribute raises KeyError, exactly like ``del d[key]``.
    """

    def __getattr__(self, key, default=None):
        # Only reached when normal attribute lookup fails.
        return dict.get(self, key, default)

    def __setattr__(self, key, value):
        dict.__setitem__(self, key, value)

    def __delattr__(self, key):
        dict.__delitem__(self, key)

# Small sequence length and batch size for the training-loss walkthrough below.
max_length, batch_size = 128, 2

# PairwiseDataset receives its settings as an attribute-style object.
args = dotdict(model_name_or_path=model_name_or_path, max_length=max_length)
train_filename = os.path.join(root, "chatgpt", "processed", "dev_data_external_v1.jsonl")
train_dataset = PairwiseDataset(args, train_filename, tokenizer)

# Iterate the dataset in its stored order.
sampler = SequentialSampler(train_dataset)
train_loader = DataLoader(
    train_dataset,
    sampler=sampler,
    batch_size=batch_size,
)

In [None]:
def _optional_to_device(tensors, key):
    """Return ``tensors[key]`` moved to ``device``, or None when the key is absent."""
    if key in tensors:
        return tensors[key].to(device)
    return None


reward_model.train()
# Only the first batch is scored; the loop exits right after it.
for batch in train_loader:
    print(batch.keys())

    # The input ids are required; masks and position ids are optional.
    chosen_input_ids = batch['chosen_input_ids'].to(device)
    chosen_attention_mask = _optional_to_device(batch, 'chosen_attention_mask')
    chosen_position_ids = _optional_to_device(batch, 'chosen_position_ids')
    rejected_input_ids = batch['rejected_input_ids'].to(device)
    rejected_attention_mask = _optional_to_device(batch, 'rejected_attention_mask')
    rejected_position_ids = _optional_to_device(batch, 'rejected_position_ids')

    # reward() yields a pair, unpacked here as (values, reward).
    chosen_values, chosen_reward = reward_model.reward(
        chosen_input_ids, chosen_attention_mask, chosen_position_ids
    )
    reject_values, reject_reward = reward_model.reward(
        rejected_input_ids, rejected_attention_mask, rejected_position_ids
    )
    break

In [None]:
# Pairwise ranking loss for the batch scored in the previous cell.
# For each (chosen, rejected) pair the per-token values are compared from the
# first position where the two sequences differ up to the end of the longer
# one (first pad token, or full length when there is no padding).
# NOTE(review): assumes chosen_values / reject_values are indexed as
# (batch, sequence) - confirm against RewardModel.reward.
chosen_end_scores = []
rejected_end_scores = []
bs = chosen_input_ids.shape[0]
# Drop the middle dimension when the ids arrive as 3-D tensors.
if len(chosen_input_ids.shape) == 3:
    chosen_input_ids = chosen_input_ids.squeeze(1)
if len(rejected_input_ids.shape) == 3:
    rejected_input_ids = rejected_input_ids.squeeze(1)

loss = 0
inference = False
for i in range(bs):
    # Identical sequences carry no preference signal: only record the score of
    # the last non-pad token of the chosen sequence and skip the loss term.
    if torch.all(torch.eq(chosen_input_ids[i], rejected_input_ids[i])).item():
        c_inds = (chosen_input_ids[i] == tokenizer.pad_token_id).nonzero()
        c_ind = c_inds[0].item() if len(c_inds) > 0 else chosen_input_ids.shape[1]
        chosen_end_scores.append(chosen_values[i, c_ind - 1])
        inference = True
        continue
    print(f"inference: {inference}")

    # Check if there is any padding otherwise take length of sequence
    c_inds = (chosen_input_ids[i] == tokenizer.pad_token_id).nonzero()
    c_ind = c_inds[0].item() if len(c_inds) > 0 else chosen_input_ids.shape[1]
    r_inds = (rejected_input_ids[i] == tokenizer.pad_token_id).nonzero()
    r_ind = r_inds[0].item() if len(r_inds) > 0 else rejected_input_ids.shape[1]
    end_ind = max(c_ind, r_ind)

    # Retrieve first index where trajectories diverge
    divergence_ind = (chosen_input_ids[i] != rejected_input_ids[i]).nonzero()[0]
    assert divergence_ind > 0

    # Index into the correct rewards
    c_truncated_reward = chosen_values[i][divergence_ind:end_ind]
    r_truncated_reward = reject_values[i][divergence_ind:end_ind]

    # Append the last rewards to the list of end scores
    chosen_end_scores.append(c_truncated_reward[-1])
    rejected_end_scores.append(r_truncated_reward[-1])

    # Compute loss. logsigmoid equals log(sigmoid(x)) but stays finite for
    # large negative differences, where sigmoid underflows to 0.
    loss += -torch.nn.functional.logsigmoid(c_truncated_reward - r_truncated_reward).mean()

# Average over the batch exactly once, after every pair has been accumulated.
# Dividing inside the loop would rescale the running total on each iteration
# and weight the samples unequally.
loss = loss / bs
chosen_end_scores = torch.stack(chosen_end_scores)

In [None]:
# Display the loss computed in the previous cell.
loss

# Reward Model Eval

In [None]:
# Call eval() on the reward model before the scoring example below
# (evaluation mode, assuming RewardModel is a torch.nn.Module).
reward_model.eval()

In [None]:
# Encode one (prompt, prediction) pair, padded to max_length, for scoring.
max_length = 512
# The tokenizer attribute that controls the padding direction is `padding_side`.
# The previous spelling `padding_size` only created an unused attribute.
tokenizer.padding_side = "right"
# tokenizer.padding_side = "left"

prompt = "现代文:行三十五里,进入登封县境的耿店。"
prefix = "古文:"
pred = "<unk><unk><unk><unk><unk><unk><unk><unk><unk><unk>"

# NOTE(review): the call below passes padding_side="left", which conflicts with
# the "right" set on the tokenizer above. Whether this argument is honoured
# depends on the tokenizer implementation and library version - confirm which
# side the reward model expects and keep only one of the two settings.
encodings_dict = tokenizer(prompt, prefix+pred, max_length=max_length,
                           truncation="longest_first", padding="max_length", return_tensors="pt",
                           return_token_type_ids=False, padding_side="left")
print(encodings_dict.keys())
print(encodings_dict['input_ids'].shape)
print(encodings_dict['attention_mask'].shape)

In [None]:
# Move the encoded example to the model's device and score it.
input_ids, attention_mask = (
    encodings_dict[field].to(device) for field in ('input_ids', 'attention_mask')
)
res = reward_model(input_ids, attention_mask)