In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
import minRLHF
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    AutoModelForTokenClassification
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from minRLHF.environment import Environment
import random
from transformers.pipelines import pipeline

reward_model = pipeline(
    "text-classification",
    model='bhadresh-savani/distilbert-base-uncased-emotion', 
    return_all_scores=True
)

class MyEnv(Environment):
    def get_input_prompt(self) -> str:
        return random.choice([
            'I went for a walk one day and',
            'A long time ago, in a galaxy far far away',
            'Oops! I'
        ])
        
    def score_generation(self, text: str) -> float:
        sentiment_scores = reward_model(text)[0]
        sentiment_scores = {d['label']: d['score'] for d in sentiment_scores}
        return sentiment_scores['joy']



In [4]:
tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = 'left'

model = AutoModelForCausalLM.from_pretrained('gpt2').to('cuda')
reference = AutoModelForCausalLM.from_pretrained('gpt2').to('cuda')
critic = AutoModelForTokenClassification.from_pretrained('gpt2', num_labels=1).to('cuda')

# Instantiate envrionment
env = MyEnv(tokenizer, batch_size=32)

Some weights of GPT2ForTokenClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['classifier.bias', 'h.0.attn.masked_bias', 'classifier.weight', 'h.11.attn.masked_bias', 'h.6.attn.masked_bias', 'h.5.attn.masked_bias', 'h.9.attn.masked_bias', 'h.10.attn.masked_bias', 'h.4.attn.masked_bias', 'h.1.attn.masked_bias', 'h.8.attn.masked_bias', 'h.2.attn.masked_bias', 'h.7.attn.masked_bias', 'h.3.attn.masked_bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
from minRLHF.ppo_trainer import PPOTrainer

# Create PPO trainer
ppo_trainer = PPOTrainer(
    actor_model=model,
    critic_model=critic,
    reference_model=reference,
    env=env,
)

In [6]:
ppo_trainer.train()

Generating rollout batch 0
Generating rollout batch 1
Generating rollout batch 2
Generating rollout batch 3
Generating rollout batch 4
Generating rollout batch 5
Generating rollout batch 6
Generating rollout batch 7
Generating rollout batch 8
Generating rollout batch 9
Getting actor loss for train step 0 and batch 0
Getting actor loss for train step 0 and batch 1
Getting actor loss for train step 0 and batch 2
Getting actor loss for train step 0 and batch 3
Getting actor loss for train step 0 and batch 4
Getting actor loss for train step 0 and batch 5
Getting actor loss for train step 0 and batch 6
Getting actor loss for train step 0 and batch 7
Getting actor loss for train step 0 and batch 8
Getting actor loss for train step 0 and batch 9
Getting actor loss for train step 0 and batch 10
Getting actor loss for train step 0 and batch 11
Getting actor loss for train step 0 and batch 12
Getting actor loss for train step 0 and batch 13
Getting actor loss for train step 0 and batch 14
Getti

KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt

xs = list(range(len(ppo_trainer.rolling_rewards)))
ys = ppo_trainer.rolling_rewards

window_size = 10
smoothed_ys = [sum(ys[max(0, idx-window_size):idx])/window_size for idx, _ in enumerate(ys)]

plt.scatter(xs, ys, s=1)
plt.plot(smoothed_ys)

In [None]:
inputs = tokenizer('I went for a walk one day and', return_tensors='pt')
outputs = reference.generate(inputs.input_ids.to(model.device), max_length=100, do_sample=True)
text = tokenizer.batch_decode(outputs)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [5]:
# Testing
tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained('../actor_649.model')

In [11]:
inputs = tokenizer('Oh no', return_tensors='pt')
outputs = model.generate(inputs.input_ids, do_sample=True, max_length=100)
print(tokenizer.decode(outputs[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Oh no. Here's our third party.

"We're here because you want to. And here's your option.

"We wanted to go with your name here because you've got it on file with ICE, and ICE's doing that very brilliantly with ICE's investigatory-immigration program.

"And ICE's got all of that up.

"So, if all of that's up, you're not here, and ICE's got all of that up
