In [1]:
!pip install datasets
!pip install transformers
!pip install einops

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 KB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash
  Downloading xxhash-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 KB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting dill<0.3.7,>=0.3.0
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110

In [2]:
from google.colab import drive
drive.mount('/content/drive')
import os

GOOGLE_DRIVE_PATH_AFTER_MYDRIVE = 'deeplearning/'
GOOGLE_DRIVE_PATH = os.path.join('drive', 'My Drive', GOOGLE_DRIVE_PATH_AFTER_MYDRIVE)
print(os.listdir(GOOGLE_DRIVE_PATH))

Mounted at /content/drive
['shortjokes.csv', 'openseminar_GAN.pptm', 'openseminar_GAN.pptm.gslides', 'Untitled0.ipynb', 'openseminar_GAN_final.docx', 'Davinci003.ipynb', 'Untitled', 'GPTNEO.ipynb', 'baseline.ipynb', 'RM_Training.ipynb', 'trained_models', 'Transformers.ipynb', 'QA_RL_PPO.ipynb', 'api.ipynb', 'RLHF_Dataset (1).ipynb', 'GPTNeoMyself.ipynb', 'InstructMyself2.ipynb', 'q-learning.ipynb', 'deepdai', 'VAE.ipynb', 'diffusion', 'competition', 'GPTNeo_RLHF.ipynb', 'CRS', 'reward_logs.txt']


In [3]:
import pandas as pd
import re
from itertools import combinations
import io
import json
import os
import ast

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions.categorical import Categorical
from tqdm import tqdm

import transformers
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, AdamW, get_scheduler, GPTNeoModel, GPT2LMHeadModel
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from typing import List, Deque
from einops import rearrange
import datetime

import logging
logging.getLogger().setLevel(logging.CRITICAL)

import warnings
warnings.filterwarnings('ignore')

device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

models_folder = "trained_models" # where to save model

In [4]:
from datasets import load_dataset

# Load all helpfulness/harmless subsets (share the same schema)
train_dataset = load_dataset("Anthropic/hh-rlhf", split="train")
test_dataset= load_dataset("Anthropic/hh-rlhf", split="test")

Downloading readme:   0%|          | 0.00/4.57k [00:00<?, ?B/s]

Downloading and preparing dataset json/Anthropic--hh-rlhf to /root/.cache/huggingface/datasets/Anthropic___json/Anthropic--hh-rlhf-c8cd8dc58ab67414/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/13.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/25.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/743k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/875k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/Anthropic___json/Anthropic--hh-rlhf-c8cd8dc58ab67414/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e. Subsequent calls will reuse this data.




In [5]:
model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-125M")
tokenizer = GPT2Tokenizer.from_pretrained('EleutherAI/gpt-neo-125M', bos_token='<|startoftext|>', eos_token='<|endoftext|>', padding_side="left")
tokenizer.pad_token = tokenizer.eos_token

model.to(device)

model_path = GOOGLE_DRIVE_PATH + models_folder + f"/gpt_last_ft.pt"
model.load_state_dict(torch.load(model_path)) # 여기서는 dict만 불러온다. 애초에 dict만 저장함.

model.train()
print("skip")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/526M [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/560 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


skip


In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

class DeBertRewardModel(nn.Module):
  def __init__(self):
    super(DeBertRewardModel, self).__init__()

    # Load tokenizer and model  
    self.tokenizer = AutoTokenizer.from_pretrained("OpenAssistant/reward-model-deberta-v3-large-v2")
    self.base = AutoModelForSequenceClassification.from_pretrained("OpenAssistant/reward-model-deberta-v3-large-v2").to(device)
    self.tokenizer.pad_token = "<|endoftext|>"
  
  def forward(self, prompts):
    self.base.train()
    tokenized=self.tokenizer(prompts, padding=True, truncation=True, return_tensors='pt')
    input_ids, attention_masks=tokenized['input_ids'].to(device), tokenized['attention_mask'].to(device)

    outputs = self.base(input_ids, attention_mask=attention_masks)

    return outputs.logits.squeeze()

In [7]:
bert_reward_model = DeBertRewardModel()

#bert tuning한 모델 불러오기
model_path = GOOGLE_DRIVE_PATH + models_folder + f"/bert_reward_last.pt"
bert_reward_model.load_state_dict(torch.load(model_path)) # 여기서는 dict만 불러온다. 애초에 dict만 저장함.
bert_reward_model.to(device)

print("skip")

Downloading (…)okenizer_config.json:   0%|          | 0.00/455 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/993 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

skip


In [8]:
num_episodes = 5000
actor_lr = 0.005 # sft model
critic_lr = 0.005  # reward model
max_length=300

Memory = namedtuple('Memory', [
    'sequences', 'rewards', 'action_log_probs', 'values', 'advantages', 'new_attention_mask'
])

In [9]:
class PPOEnv():
  def __init__(self):
    super().__init__()
    self.dataset = TrajectoryDataset(train_dataset)
    self.dataloader = torch.utils.data.DataLoader(self.dataset, batch_size=1, shuffle=True)
    self.done = False
    self.did = [] # used states
    self.current_state = ""
    self.random_idx=0
    self.current_step=0

  # action is response
  def reset(self):
    get = True
    self.random_idx = 0
    self.current_step=0
    while get:
      self.random_idx=np.random.randint(low=0, high=len(self.dataset))
      if self.random_idx not in self.did:
        self.did.append(self.random_idx)
        get = False
    self.current_state = self.dataset[self.random_idx][0]['state']+"\n\nAssistant:"
    self.current_state = next(iter(self.dataloader))[0]['state'][0]+"\n\nAssistant:"
    return self.current_state

In [10]:
class GPTNeoActors(nn.Module):
  def __init__(self):
    super(GPTNeoActors, self).__init__()
    self.sft_model=model
    
    #policy to train
    self.model = model
    self.tokenizer = tokenizer
    self.tokenizer.pad_token = tokenizer.eos_token
  
  def inference(self, prompt):
    tokenized = self.tokenizer(prompt, return_tensors="pt")

    input_ids, attention_mask = tokenized['input_ids'].to(device), tokenized['attention_mask'].to(device)

    generated = self.model.generate(
      input_ids, attention_mask=attention_mask, max_length=1024, num_beams=1, no_repeat_ngram_size=3, early_stopping=True, temperature=0.9,
    )

    response = tokenizer.decode(generated[0], skip_special_tokens=True)
    return response
  
  def get_sequence_probs(self, logits, sequences):
    batch_size, seq_length, vocab_size=logits.size()

    probs=logits.softmax(dim=-1)
    result = torch.gather(probs, dim=-1, index=sequences.unsqueeze(-1))
    # Use the sum function to add up the log probabilities for each time step of the sequence
    seq_probs = result.sum(dim=-1)
    return seq_probs
  
  def get_sft_action_prob(self, input_length, batch_sequences, batch_attention_mask):
    #just input the whole sequence to lm model and it will output logits => simple prob. calculation.
    batch_size, seq_length=batch_sequences.size()
    seq_index=torch.arange(0, seq_length, 1).long().to(device)
    #create new attention mask
    new_attention_mask=torch.ones(batch_size, seq_length).to(device)
    new_attention_mask[:,:input_length]=batch_attention_mask

    #obtain sft logits
    out=self.sft_model(batch_sequences, attention_mask=new_attention_mask)
    seq_probs=self.get_sequence_probs(out.logits, batch_sequences)
    action_probs=seq_probs[:,input_length:]
    return action_probs
        
  def forward(self, batch_states, max_gen_length=300):
    # sample(generate) an action(response) based on the actor model
    tokenized = self.tokenizer(batch_states, return_tensors='pt', padding=True)
    batch_input_ids, batch_attention_mask = tokenized.input_ids.to(device), tokenized.attention_mask.to(device)
    input_length=batch_input_ids.size(1)
    generated = self.model.generate(batch_input_ids, attention_mask=batch_attention_mask, do_sample=True, max_length=max_gen_length, output_scores=True, return_dict_in_generate=True)
    # 답변 생성
    batch_sequences = generated.sequences #actions contains prompt => can be processed directly with reward model.
    batch_responses = batch_sequences[:,input_length:]

    response_logits = torch.stack(generated.scores, dim=0).permute(1,0,2) # 1, len(scores), vocab_size
    
    action_log_probs = torch.log(self.get_sequence_probs(response_logits, batch_responses)) # scores, sequences
    sft_action_log_probs=torch.log(self.get_sft_action_prob(input_length, batch_sequences, batch_attention_mask))
    
    batch_log_prob_ratio_list = action_log_probs - sft_action_log_probs
    batch_log_prob_ratio=torch.sum(batch_log_prob_ratio_list, dim=-1)

    return batch_sequences, batch_log_prob_ratio

In [11]:
import re

#decompose HF dataset into episodic data (s,a,s,a,s,a,s,a,s_t) => consider end of dialogue as termination of an episode.
#treat phrases starting with * as Assistant's dialogue.
#used for on-policy training
class TrajectoryDataset(torch.utils.data.Dataset):
  def __init__(self, hf_dset):
    super(TrajectoryDataset, self).__init__()
    self.hf_dset=hf_dset
    self.trajectories=self.decompose_dset(self.hf_dset)
  
  def get_ep_steps_from_dialogue(self, text):
    text=text.replace("Humans:","Human:")
    text=text.replace("human:","Human:")
    text=text.replace("humans:","Human:")
    result = [x.strip('\n\n').strip() for x in re.split('Human:|Assistant:', text)[1:]]
    human_list = []
    assistant_list = []

    human_part = ""
    assistant_part = ""

    episodes = []
    for n in range(int(len(result)/2)):
      state = human_part + assistant_part + "Human: " + result[2*n]
      action = result[2*n+1]
      human_part = state + '\n\n'
      assistant_part = "Assistant: " + action + '\n\n'
      ep = {
          'state':state,
          'action':action
      }
      episodes.append(ep)
    
    return episodes

  def decompose_dset(self, hf_dset):
    trajectories=[]
    for hfd in hf_dset:
      chosen=hfd['chosen']
      trajectory=self.get_ep_steps_from_dialogue(chosen)
      trajectories.append(trajectory)
    return trajectories
  
  def __len__(self):
    return len(self.trajectories)
  
  def __getitem__(self, idx):
    text = self.trajectories[idx]
    text=text[0]['state']+"\n\nAssistant:"
    # inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=500, padding="max_length") #pad => special token X

    return text
    # return inputs['input_ids'][0]

In [12]:
actor=GPTNeoActors()
env=PPOEnv()

In [16]:
class InstructPPOs():
  def __init__(self, action_lr=0.001, critic_lr=0.001, epsilon=0.1, beta=0.01, gamma=0.95, epoch=1):
    # super.__init__(InstructPPO, self)
    self.critic = bert_reward_model.to(device)
    self.reward_model = bert_reward_model.to(device)
    # self.actor = GPTNeoActor()
    self.actor = actor
    self.beta = beta
    self.gamma = gamma
    self.epsilon = epsilon
    self.actor_reg = 0.5
    self.critic_reg = 0.5
    self.critic_lr = critic_lr
    self.action_lr = action_lr
    self.epochs=epoch
    self.env = env
    self.value_head = nn.Sequential(nn.Linear(1,10), nn.Linear(10,1)).to(device)

  def step(self, text):
    reward = self.reward_model(text)
    critic = self.critic(text)
    # value = self.value_head(torch.tensor(critic).to(device))
    value = self.value_head(critic)
    return reward, value

  def train(self, update_timesteps=300, num_episodes=5):
    max_timesteps = 100
    time = 1
    memories = deque([])

    dataset = TrajectoryDataset(train_dataset)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=2, shuffle=True)

    loss_trajs=[]

    for episode in tqdm(range(num_episodes), desc="episodes"):
      for idx, batch_states in enumerate(dataloader):
        if len(batch_states[0]) + len(batch_states[1]) > 1000:
          continue

        self.actor.train()
        self.critic.train()

        actor_optim=optim.AdamW(self.actor.parameters(), lr=actor_lr, weight_decay=self.actor_reg)
        critic_optim=optim.AdamW(self.critic.parameters(), lr=critic_lr, weight_decay=self.critic_reg)

        time += 1
        
        batch_sequences, batch_log_prob_ratio=self.actor(batch_states, max_gen_length=max_length)
        batch_next_states=self.actor.tokenizer.batch_decode(batch_sequences, skip_special_tokens=True) # 질문과 답변이 텍스트로 나옴

        rewards=self.reward_model(batch_next_states)
        rewards=rewards.detach()

        # actor_loss=-torch.sum((rewards-self.beta*batch_log_prob_ratio))/len(batch_states)
        actor_loss=-torch.sum(rewards)/len(batch_states)
        loss_trajs.append(actor_loss.cpu().detach())
        print("액터의 로스 : ", actor_loss.cpu().detach())

        #add pretrain loss
        actor_optim.zero_grad()
        actor_loss.requires_grad_(True)
        actor_loss.backward()
        actor_optim.step()
        
        loss_ftn=nn.MSELoss()
        for c_epoch in range(1, 3):
          state_values=self.critic(batch_states)
          values=[]
          ## 배치 1보다 크게 돌릴때는 아래에서 list를 지워야 한다.
          for state_value in state_values:
            value=self.value_head(state_value.unsqueeze(dim=-1))
            values.append(value.cpu().detach())
          values=torch.tensor(values).to(device)
          critic_loss=loss_ftn(values, rewards) # 두개 차이?

          critic_optim.zero_grad()
          critic_loss.requires_grad_(True)
          critic_loss.backward()
          critic_optim.step()
        if time % 50 == 0:
          # print(batch_next_states)
          plt.plot(loss_trajs)
          plt.xlabel('losses')
          plt.ylabel('Y-axis label')
          plt.title('Title of the graph')
          plt.show()
        if time % update_timesteps == 0:
          # self.learn(memories) # 쌓은 메모리를 바탕으로 학습
          # memories.clear()     # 이후 메모리 초기화 (원래 해야하나?)
          self.test()
          torch.save(actor.model.state_dict(), GOOGLE_DRIVE_PATH + models_folder + f"/gpt_ppo_0.pt")


  def test(self):
    with torch.no_grad():
      dummy_questions=[
          "Human:Who is your best friend?\n\nInformation:I was sorted into Gryffindor House at Hogwarts, where I became best friends with Ron and Hermione.\n\nAssistant:",
          "Human:What sports do you like the most?\n\nInformation:Despite needing glasses, I have excellent eyesight and can locate even the smallest objects like the Golden Snitch on a Quidditch field.\n\nAssistant:",
      ]

      for idx, q in enumerate(dummy_questions):
        response = self.actor.inference(q)
        print(response)
      return

In [17]:
ppo=InstructPPOs()

In [18]:
ppo.train()

Output hidden; open in https://colab.research.google.com to view.

In [19]:
print(actor.inference("Human:Hi how are you?\n\nAssistant:"))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Human:Hi how are you?

Assistant:I'm doing well with my friends, but I'm not sure how I'm going to do it.
