In [1]:
#| export
from typing import Callable, Tuple, Iterable

from torch.utils.data import Dataset
from tqdm import tqdm
import torch 
from torchtyping import TensorType

In [2]:
import torch

# clear CUDA memory cache
torch.cuda.empty_cache()

In [3]:
#| export
class PairDataset(Dataset):
    """Pairwise dataset for train reward model."""
    def __init__(
        self,
        df, # A dataframe
        tokenizer: Callable, # The tokenizer of the reward model
        max_length: int = 1024 # Max context length of the reward model
    ):

        self.chosen = []
        self.rejected = []

        for _, data in tqdm(df.iterrows()):
            chosen, rejected = data["chosen"], data["rejected"]
            chosen_encoding = tokenizer(
                chosen,
                max_length=max_length, padding="max_length", truncation=True,
                return_tensors="pt"
            )
            rejected_encoding = tokenizer(
                rejected,
                max_length=max_length, padding="max_length", truncation=True,
                return_tensors="pt"
            )

            self.chosen.append({
                "input_ids": chosen_encoding["input_ids"],
                "attention_mask": chosen_encoding["attention_mask"]
            })
            self.rejected.append({
                "input_ids": rejected_encoding["input_ids"],
                "attention_mask": rejected_encoding["attention_mask"]
            })

    def __len__(self) -> int:
        return len(self.chosen)

    def __getitem__(self, idx: int):
        return self.chosen[idx]["input_ids"],\
               self.chosen[idx]["attention_mask"],\
               self.rejected[idx]["input_ids"],\
               self.rejected[idx]["attention_mask"]

In [4]:
#| export
class IMDBDataset(Dataset):
    """Dataset for train RL-based language model."""
    def __init__(
        self,
        df, # A dataframe
        tokenizer: Callable, # The tokenizer of the language model
        max_length: int = 1024 # Max context length of the language model
    ):
        self.text = []

        for _, data in tqdm(df.iterrows()):
            text  = data["text"]
            
            # text_encoding = tokenizer(
            #     text,
            #     max_length=max_length, padding="max_length", truncation=True,
            #     return_tensors="pt"
            # )

            self.text.append({"text": text})

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx: int):
        return self.text[idx]["text"]

In [5]:
#| export
from typing import Callable, Tuple, Optional

import torch
from torch import nn
import torch.nn.functional as F
from torch.distributions import Categorical
from torchtyping import TensorType

from transformers import PreTrainedModel

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
#| export
class Agent(nn.Module):
    "The RL-based language model."
    def __init__(
        self,
        model: PreTrainedModel # a pre-trained `transformers` model
    ):
        super().__init__()
        '''
        For example, if n_embd is set to 768, then each input token will be
        represented by a vector of length 768 in the pre-trained model's embedding space.
        '''
        n_embd = model.config.n_embd

        '''
        eos_token_id refers to the ID of the end-of-sequence token in the
        pre-trained model's vocabulary. This token is used to indicate the end of a sequence,
        such as the end of a sentence.
        '''
        self.eos_token_id = model.config.eos_token_id

        '''
        The policy network is responsible for taking the current state of the agent
        (i.e., the previously generated words) and selecting the best action to take next.
        '''
        self.policy_network = model.to("cuda")

        '''
        The value network is trained to predict the expected reward range from [-1, 1] that
        the agent will receive in the future, given its current state.
        '''
        self.value_network = nn.Sequential(
            nn.Linear(n_embd, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
            nn.Tanh()
        )

    '''
    Get the predicted future reward for current state of all batch of inputs
    '''
    def get_value(
        self, hidden_state: TensorType["batch_size", "seq_len", "n_embd"]
    ) -> TensorType["batch_size", 1]:
        """Get value from the value network."""
        return self.value_network(hidden_state)[:, -1, :]

    '''
    The method takes as input an input_ids tensor, which represents the input
    sequence of tokens to use as a starting point for the generation.
    It then generates additional tokens one-by-one using the generate method
    of the policy network (which is a pre-trained transformers model),
    until it reaches the maximum sequence length or generates an end-of-sequence
    token (if one is defined).
    '''
    def generate(
        self,
        input_ids: TensorType["batch_size", "seq_len"],
        attention_mask: Optional[TensorType["batch_size", "seq_len"]] = None,
        **kwargs
    ) -> TensorType["batch_size", "seq_len"]:
        output = self.policy_network.generate(
            input_ids=input_ids.to("cuda"), attention_mask=attention_mask.to("cuda"), **kwargs
        )
        return output

    '''
    Return randomly sample a word from the distribution and its corresponding logprob
    and entropy of the whole distribution and the predicted reward of that sample word
    '''
    def forward(
        self,
        input_ids: TensorType["batch_size", "seq_len"],
        attention_mask: Optional[TensorType["batch_size, seq_len"]] = None
    ) -> Tuple[
        TensorType["batch_size", "seq_len", "vocab_size"],
        TensorType["batch_size", "seq_len", "vocab_size"],
        TensorType["batch_size", "seq_len"],
        TensorType["batch_size", 1]
    ]: # action, logprobs, entropy, value

        """_summary_"""
        if attention_mask is None:
            '''
            base_output:  the predicted probabilities for next token in the vocabulary,
            and hidden states of each layer in the model:  the hidden state refers to
            the output of each layer of the transformer.
            '''
            base_output = self.policy_network(
                input_ids.to("cuda"),
                output_hidden_states=True,   # return the hidden states of all layers in model along with output
            )
        else:
            base_output = self.policy_network(
                input_ids.to("cuda"), attention_mask=attention_mask.to("cuda"),
                output_hidden_states=True,
            )

        '''
        The final layer state in the model that we need to fine-tuned
        '''
        last_hidden_state = base_output.hidden_states[-1]

        '''
        the logits tensor would contain the unnormalized scores or activations produced
        by the model for each token in the vocabulary at each position in the sequence.
        we only need to last word action value. shape: (batch_size, vocab_size)[]
        '''
        logits = base_output.logits[:, -1, :]

        '''
        the predicted probabilities for each token in the vocabulary for the last position word,
        given the input sequence seen so far. probabilities sum to 1.
        '''
        probs = F.softmax(logits, dim=-1)

        '''
        the categories correspond to the tokens in the model's vocabulary, and the
        probabilities represent the predicted probabilities for each token given the input sequence seen so far.
        '''
        action_dist = Categorical(probs=probs)

        '''
        The sample() method of the Categorical distribution chooses an action stochastically based on the
        probabilities of each action. Actions with higher probabilities have a higher chance of being sampled.
        Action tensor contains the index of the token that was sampled from the probability distribution
        defined by action_dist.
        '''
        action = action_dist.sample()

        '''
        The entropy of a probability distribution is a measure of the amount of information needed to describe
        the distribution. A high entropy distribution is one where the probabilities of the different actions
        are relatively equal, while a low entropy distribution is one where the probabilities are highly skewed
        towards a particular action.
        '''
        entropy = action_dist.entropy()
        logprobs = action_dist.log_prob(action)

        # predicted reward value
        '''
        The last_hidden_state match the n_embd which is the pre-trained model's embedding space.
        '''
        value = self.get_value(last_hidden_state).squeeze(-1)

        return action, logprobs, entropy, value

In [7]:
#| export
class RewardModel(nn.Module):
    """Reward model."""
    def __init__(
        self,
        model_name: str, # `transformers`'s model name
        dropout: float = 0.1,
        device: str = 'cuda'
    ):
        super().__init__()

        model = AutoModel.from_pretrained(model_name)
        config = model.config
        n_embed = config.n_embd

        self.model = model.to(device)

        '''
        custom head: a custom head that is added on top of the pre-trained language
        model to adapt it to the specific task of summarization
        '''
        self.reward_head = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(n_embed, 1),
            nn.Sigmoid()
        ).to(device)

    '''
    ouput the reward value of the last generated
    '''
    def forward(
        self,
        input_ids: TensorType["batch_size", "seq_len"],
        attention_mask: TensorType["batch_size", "seq_len"] = None,
    ) -> TensorType["batch_size", 1]: # A reward scalar for each item in a batch
        """Calculate reward for each item in a batch."""

        '''
        last_hidden_state is the model output represents the hidden state output of
        each position in the input sequence. The last_hidden_state match the n_embd
        which is the pre-trained model's embedding space.
        '''
        last_hidden_state = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).last_hidden_state

        '''
        added layers of the pre-trained model to produce single value
        '''
        output = self.reward_head(last_hidden_state)

        # for each item in the batch
        # choose the hidden state of the last token as a reward!
        '''
        One reason for choosing the last token as the reward value is that it represents
        the end of the input sequence, and therefore the end of the summary.
        By selecting the reward value of the last token, the reward signal is directly
        linked to the quality of the generated summary, which is the ultimate goal of the summarization task.
        '''
        reward_scalar = output[:, -1, 0]
        return reward_scalar

In [8]:
#| export
class PairwiseLoss(nn.Module):
    """Pairwise loss function."""
    def forward(
        self,
        chosen_rewards: TensorType["batch_size", 1], # The reward of the chosen prompt
        rejected_rewards: TensorType["batch_size", 1] # The reward of the rejected prompt
    ) -> TensorType[1]: # A scalar loss
        """Compute the loss value."""
        assert len(chosen_rewards) == len(rejected_rewards)
        batch_size = len(chosen_rewards)
        probs = torch.sigmoid(chosen_rewards - rejected_rewards).log()
        return -probs.mean() / batch_size

In [9]:
from torch import optim
from torch.utils.data import DataLoader, random_split

import pytorch_lightning as pl
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel
from datasets import load_dataset
import pandas as pd 

In [10]:
# Step 1: Create a reward model from a pre-trained language model
tokenizer = AutoTokenizer.from_pretrained("hy-tmp/gpt2_tokenizer_local")
tokenizer.pad_token = tokenizer.eos_token
reward_model = RewardModel(model_name="hy-tmp/gpt2_local", device="cuda")

In [11]:
# Step 2: Create a Pairwise dataset
# Load the CSV file into a pandas DataFrame
sc_df = pd.read_csv("hy-tmp/openai_summarize_comparisons.csv")
sc_df = sc_df.iloc[:10000] # 10000/90000

# # Convert the pandas DataFrame to a dataset
# dataset = Dataset.from_dict(df.to_dict(orient='list')) 

# dataset = dataset.select(range(100))
# dataset

In [12]:
pair_dataset = PairDataset(sc_df, tokenizer)
dataloader = DataLoader(pair_dataset, batch_size=4, num_workers=64)

10000it [00:14, 696.17it/s]


In [13]:
# Step 3: Write a training loop
N_EPOCHS = 1
LEARNING_RATE = 1e-3

pairwise_loss = PairwiseLoss()

In [14]:
class LitRewardModel(pl.LightningModule):
    def __init__(
        self, model, loss_func, lr
    ):
        super().__init__()
        self.model = model
        self.loss_func = loss_func
        self.lr = lr

    def training_step(self, batch, batch_idx: int):
        chosen_input_ids, chosen_attention_mask,\
        rejected_input_ids, rejected_attention_mask = batch

        # call the forward function of the reward_model class
        # then use the loss to train the parameters of pre-trained model and
        # and custom head
        chosen_rewards = self.model(chosen_input_ids, chosen_attention_mask)
        rejected_rewards = self.model(rejected_input_ids, rejected_attention_mask)

        loss = self.loss_func(chosen_rewards, rejected_rewards)

        print(f"loss={loss}")

        return loss

    def configure_optimizers(self):
        optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
        return optimizer

In [15]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # or "true"

In [16]:
lit_model = LitRewardModel(reward_model, pairwise_loss, lr=1e-3)
trainer = pl.Trainer(max_epochs=N_EPOCHS, logger=False)
trainer.fit(model=lit_model, train_dataloaders=dataloader)
# get the maximum GPU memory occupied by tensors
mem_used = torch.cuda.max_memory_allocated()
print(f"GPU memory used: {mem_used / 1024**3:.2f} GB")

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type         | Params
-------------------------------------------
0 | model     | RewardModel  | 124 M 
1 | loss_func | PairwiseLoss | 0     
-------------------------------------------
124 M     Trainable params
0         Non-trainable params
124 M     Total params
497.762   Total estimated model params size (MB)


Epoch 0:   0%|          | 0/2500 [00:00<?, ?it/s] loss=0.17953726649284363
Epoch 0:   0%|          | 1/2500 [00:07<4:56:50,  7.13s/it]loss=0.16972096264362335
Epoch 0:   0%|          | 2/2500 [00:07<2:38:48,  3.81s/it]loss=0.17343491315841675
Epoch 0:   0%|          | 3/2500 [00:08<1:53:00,  2.72s/it]loss=0.19488877058029175
Epoch 0:   0%|          | 4/2500 [00:08<1:29:51,  2.16s/it]loss=0.17288100719451904
Epoch 0:   0%|          | 5/2500 [00:09<1:16:00,  1.83s/it]loss=0.17293569445610046
Epoch 0:   0%|          | 6/2500 [00:09<1:06:48,  1.61s/it]loss=0.16975559294223785
Epoch 0:   0%|          | 7/2500 [00:10<1:00:12,  1.45s/it]loss=0.17955680191516876
Epoch 0:   0%|          | 8/2500 [00:10<55:20,  1.33s/it]  loss=0.14142440259456635
Epoch 0:   0%|          | 9/2500 [00:11<51:28,  1.24s/it]loss=0.16438627243041992
Epoch 0:   0%|          | 10/2500 [00:11<48:21,  1.17s/it]loss=0.17325296998023987
Epoch 0:   0%|          | 11/2500 [00:12<45:49,  1.10s/it]loss=0.16932731866836548
Epoch

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 2500/2500 [21:14<00:00,  1.96it/s]
GPU memory used: 20.54 GB


In [17]:
class RLHFConfig:
    # PPO config
    epsilon: float = 0.1
    # entropy coefficient
    ent_coef: float = 0.01
    vf_coef: float = 0.1

In [18]:
#| export
class RLHFTrainer:
    def __init__(
        self,
        model: PreTrainedModel, # A pre-trained language model: actor
        ref_model: PreTrainedModel, # A a reference model: critic
        config: RLHFConfig,
    ):
        self.model = model.to("cuda")
        self.ref_model = ref_model.to("cuda")
        self.epsilon = config.epsilon
        self.ent_coef = config.ent_coef
        self.vf_coef = config.vf_coef
    
    @classmethod
    def compute_advantage_and_return(
        self,
        rewards: TensorType["batch_size"], # A list of reward values
        values: TensorType["batch_size"] # A list of predicted values from agent's value network
    ) -> Tuple[TensorType["batch_size"], TensorType["batch_size"]]: # The advantages and returns
        """Calculate the advantages and returns."""
        # copied from https://github.com/lvwerra/trl/blob/d2e8bcf8373726fb92d2110c500f7df6d0bd566d/trl/trainer/ppo_trainer.py#L686
        rewards = rearrange(rewards, 'b -> 1 b')
        values = rearrange(values, 'b -> 1 b')
        
        lastgaelam = 0
        advantages_reversed = []
        gen_len = len(rewards)
        
        '''
        discount factor: determines the relative importance of future rewards compared to immediate rewards.
        '''
        gamma = 1
        '''
        GAE parameter:  A higher value of lambda places more weight on the previous advantage estimate, 
        leading to lower variance but potentially higher bias, while a lower value of lambda places more weight
        on the current temporal difference error, leading to higher variance but potentially lower bias.
        '''
        lam = 0.95

        '''
        reversed function is used to loop backwards through the time steps of the rewards and values lists.
        '''
        for t in reversed(range(gen_len)):
            nextvalues = values[:, t + 1] if t < gen_len - 1 else 0.0
            # delta: temporal difference error: difference of predicted and actual value
            delta = rewards[:, t] + gamma * nextvalues - values[:, t]
            lastgaelam = delta + gamma * lam * lastgaelam
            advantages_reversed.append(lastgaelam)

        # the advantage estimate for the i-th sample in the batch at time step j
        advantages = torch.stack(advantages_reversed[::-1]).transpose(0, 1)
        # returns is the expected return
        returns = advantages + values

        advantages = rearrange(advantages, '1 b -> b')
        returns = rearrange(returns, '1 b -> b')
        
        return advantages, returns

    def compute_loss(
        self,
        query_ids: TensorType["batch_size", "seq_len"],
        query_attention_mask: TensorType["batch_size", "seq_len"],
        response_ids: TensorType["batch_size", "seq_len"],
        response_attention_mask: TensorType["batch_size", "seq_len"],
        rewards: TensorType["batch_size"],
    ) -> TensorType["1"]:

        query_ids = query_ids.to("cuda")
        query_attention_mask = query_attention_mask.to("cuda")
        response_ids = response_ids.to("cuda")
        response_attention_mask = response_attention_mask.to("cuda")
        rewards = rewards.to("cuda")

        """Calculate PPO's loss."""
        logprobs, values, entropies, ref_logprobs = self.forward(
            query_ids=query_ids,
            query_attention_mask=query_attention_mask,
            response_ids=response_ids,
            response_attention_mask=response_attention_mask
        )

        # r_t. logprobs is new policy and ref_logprobs is old policy 
        ratio = (logprobs - ref_logprobs).exp()
        clipped_ratio = torch.clamp(ratio, min=1-self.epsilon, max=1+self.epsilon)

        # returns: expected return
        advantages, returns = self.compute_advantage_and_return(rewards, values)
        '''
        measure of the accuracy of the value function in predicting the expected future reward 
        for each state
        '''
        value_loss = (values - returns).pow(2).mean()
        
        pg_loss_1 = ratio * advantages
        pg_loss_2 = ratio * clipped_ratio
        pg_loss = torch.min(pg_loss_1, pg_loss_2).mean()
        
        loss = pg_loss - self.ent_coef * entropies.mean() + self.vf_coef * value_loss
        return loss
    
    def forward(
        self,
        query_ids: TensorType["batch_size", "seq_len"],
        query_attention_mask: TensorType["batch_size", "seq_len"],
        response_ids: TensorType["batch_size", "seq_len"],
        response_attention_mask: TensorType["batch_size", "seq_len"]
    ) -> Tuple[
        TensorType["batch_size"], # main model's logprobs
        TensorType["batch_size"], # entropy
        TensorType["batch_size"], # value
        TensorType["batch_size"], # reference model's log prob
    ]:
        input_ids = torch.cat([query_ids, response_ids], dim=1)
        attention_mask = torch.cat([query_attention_mask, response_attention_mask], dim=1)
        
        _, logprobs, entropy, value = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        _, ref_logprob, _, _ = self.ref_model(
            input_ids=input_ids,
            attention_mask=attention_mask   
        )
            
        return logprobs, entropy, value, ref_logprob

In [19]:
# Step 1: Load dataset

imdb_df = pd.read_csv("hy-tmp/imdb.csv")
imdb_df = imdb_df[imdb_df['text'].str.len() <= 1000].iloc[:10000] # 10000/25000
imdb_df
#dataset, _ = random_split(dataset, lengths=[100, len(dataset) - 100]) # for demenstration purposes
# train_dataloader = DataLoader(dataset, batch_size=4, shuffle=True)
# dataset

Unnamed: 0,text,label
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
5,I would put this at the top of my list of film...,0
6,Whoever wrote the screenplay for this movie ob...,0
11,I can't believe that those praising this movie...,0
...,...,...
19325,"From start to finish, I laughed real hard thro...",1
19326,"This film may seem dated today, but remember t...",1
19327,I saw this in Detroit in what must have been i...,1
19328,I was born in 1982. Most of my childhood memor...,1


In [20]:
imdb_dataset = IMDBDataset(imdb_df, tokenizer)
train_dataloader = DataLoader(imdb_dataset, batch_size=4, num_workers=64)

10000it [00:00, 19449.46it/s]


In [21]:
# Step 2: Load the pre-trained model and tokenizer
model_base = AutoModelForCausalLM.from_pretrained("hy-tmp/gpt2_CLM_local") # for demonstration purposes
#reward_model = RewardModel("gpt2")

tokenizer = AutoTokenizer.from_pretrained("hy-tmp/gpt2_tokenizer_local", padding_side="left")

eos_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

In [22]:
# Step 3: Create the RL-based language model agent and the reference model
from copy import deepcopy
from einops import rearrange

def create_reference_model(model):
    ref_model = deepcopy(model).eval()
    return ref_model

model = Agent(model_base)
ref_model = create_reference_model(model)

In [None]:
# max length of responds
max_new_tokens = 100
'''
pad_token_id: The ID of the padding token in the tokenizer's vocabulary.
This token is used to pad the generated responses to a fixed length.
do_sample: If True, then sampling is used to generate responses the 
model randomly selects the next word based on its predicted probabilities
max_new_tokens:  The maximum number of new tokens to generate for each response.
'''
generation_kwargs = {
    "min_length":-1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
    "max_new_tokens": max_new_tokens
}

# generate some text
input_str = "What is your name?"
input_ids = tokenizer(input_str, padding=True, truncation=True, return_tensors="pt")
output_ids = model_base.generate(
            input_ids = input_ids["input_ids"],
            attention_mask=input_ids["attention_mask"],
            **generation_kwargs)

# decode the output
output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print(output_str)

In [23]:
# max length of responds
max_new_tokens = 100
'''
pad_token_id: The ID of the padding token in the tokenizer's vocabulary.
This token is used to pad the generated responses to a fixed length.
do_sample: If True, then sampling is used to generate responses the 
model randomly selects the next word based on its predicted probabilities
max_new_tokens:  The maximum number of new tokens to generate for each response.
'''
generation_kwargs = {
    "min_length":-1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
    "max_new_tokens": max_new_tokens
}

config = RLHFConfig()
N_EPOCH = 2
trainer = RLHFTrainer(model, ref_model, config)
# the optimizer will only update the model's para instead of ref_model para 
optimizer = optim.SGD(model.parameters(), lr=1e-3)

In [24]:
total_num_batch = len(train_dataloader)
total_loss = [] 

for epoch in range(N_EPOCH):
    total_loss = [] 
    for i, batch in enumerate(train_dataloader):
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
        inputs_truncated = {
            key: tensor[:, :512] for key, tensor in inputs.items()
        }
        
        response_ids = model.generate(
            input_ids = inputs_truncated["input_ids"],
            attention_mask=inputs_truncated["attention_mask"],
            **generation_kwargs
        )
        
        '''
        extract the last max_new_tokens generated text including padding
        to ensure response have the same length 
        '''
        response_ids = response_ids[:, -max_new_tokens:]
        response_attention_mask = torch.ones_like(response_ids)
        
        # evaluate from the reward model
        # reward_model is trained separatly
        with torch.no_grad():
            text_input_ids = torch.stack([torch.concat([q, r]) for q, r in zip(inputs["input_ids"].to("cuda"), response_ids.to("cuda"))], dim=0).to("cuda")
            reward_model = reward_model.to("cuda")
            rewards = reward_model(response_ids.to("cuda"))
        
        # calculate PPO loss
        loss = trainer.compute_loss(
            query_ids=inputs["input_ids"],
            query_attention_mask=inputs["attention_mask"],
            response_ids=response_ids,
            response_attention_mask=response_attention_mask,
            rewards=rewards
        )
        optimizer.zero_grad()
        loss.backward() # compute gradient 
        optimizer.step() # update parameters

        if i%100 == 0 and i != 0:
            print(f"epoch {epoch}, batch {i}/{total_num_batch}, total_loss={sum(total_loss)}, len {len(total_loss)} ,loss={sum(total_loss)/len(total_loss)}")
            total_loss = [] 
        else:
            total_loss.append(loss)

epoch 0, batch 100/2500, total_loss=-8.292320251464844, len 100 ,loss=-0.08292320370674133
epoch 0, batch 200/2500, total_loss=-0.8824985027313232, len 99 ,loss=-0.008914126083254814
epoch 0, batch 300/2500, total_loss=-0.8859870433807373, len 99 ,loss=-0.008949363604187965
epoch 0, batch 400/2500, total_loss=-0.9168435335159302, len 99 ,loss=-0.009261045604944229
epoch 0, batch 500/2500, total_loss=-0.947441041469574, len 99 ,loss=-0.009570111520588398
epoch 0, batch 600/2500, total_loss=-0.9568154811859131, len 99 ,loss=-0.009664802812039852
epoch 0, batch 700/2500, total_loss=-0.9566261172294617, len 99 ,loss=-0.009662889875471592
epoch 0, batch 800/2500, total_loss=-0.9607317447662354, len 99 ,loss=-0.009704360738396645
epoch 0, batch 900/2500, total_loss=-0.967468798160553, len 99 ,loss=-0.009772412478923798
epoch 0, batch 1000/2500, total_loss=-0.9761909246444702, len 99 ,loss=-0.009860514663159847
epoch 0, batch 1100/2500, total_loss=-0.9813780188560486, len 99 ,loss=-0.00991290

In [28]:
# generate some text
input_str = "How is the movie transformers?"
input_ids = tokenizer(input_str, padding=True, truncation=True, return_tensors="pt")
output_ids = model.generate(
            input_ids = input_ids["input_ids"],
            attention_mask=input_ids["attention_mask"],
            **generation_kwargs)

# decode the output
output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print(output_str)

How is the movie transformers?oisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisoisois


In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Step 2: Load the pre-trained model and tokenizer
model_base = AutoModelForCausalLM.from_pretrained("hy-tmp/gpt2_CLM_local") # for demonstration purposes
#reward_model = RewardModel("gpt2")

tokenizer = AutoTokenizer.from_pretrained("hy-tmp/gpt2_tokenizer_local", padding_side="left")

eos_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# max length of responds
max_new_tokens = 20
'''
pad_token_id: The ID of the padding token in the tokenizer's vocabulary.
This token is used to pad the generated responses to a fixed length.
do_sample: If True, then sampling is used to generate responses the 
model randomly selects the next word based on its predicted probabilities
max_new_tokens:  The maximum number of new tokens to generate for each response.
'''
generation_kwargs = {
    "min_length":-1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
    "max_new_tokens": max_new_tokens
}

# generate some text
input_str = "What is your name?"
input_ids = tokenizer(input_str, padding=True, truncation=True, return_tensors="pt")
output_ids = model_base.generate(
            input_ids = input_ids["input_ids"],
            attention_mask=input_ids["attention_mask"],
            **generation_kwargs)

# decode the output
output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print(output_str)

What is your name? Otherwords when seeking contributors can survive better than digital decentralisation. I lay out my slogan on the
