In [1]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from stable_baselines3 import PPO, A2C
from models.GPT2ActorCriticPolicy import GPT2ActorCriticPolicy
from envs.AdaptEnv import AdaptEnv
import os

In [14]:
import pprint
import torch
import torch.nn as nn
from models.GPT2ActorCriticNetwork import GPT2ActorCriticNetwork
from models.GPT2ActorCriticPolicy import GPT2ActorCriticPolicy
from envs.AdaptEnv import AdaptEnv
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from models.GPT2RewardModel import GPT2RewardModel
from stable_baselines3 import PPO, A2C
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.env_util import make_vec_env
import wandb
from wandb.integration.sb3 import WandbCallback
from datetime import datetime

import os

# Run on the first CUDA device when one is visible; otherwise fall back to the
# CPU so the notebook still executes on a machine without a GPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Directory where Hugging Face checkpoints/tokenizers are downloaded and cached.
cache_dir = '/om2/user/rogerjin/.cache/transformers'

# NOTE(review): `transformers` is already imported by the time this line runs,
# and the library appears to read TRANSFORMERS_CACHE at import time -- confirm.
# Pass `cache_dir=` explicitly to every `from_pretrained` call rather than
# relying on this variable.
os.environ['TRANSFORMERS_CACHE'] = cache_dir

# Short model alias -> Hugging Face Hub repository id.
# The plain GPT-2 checkpoints are published under their own name; the DialoGPT
# checkpoints live under the `microsoft/` namespace.
hf_models = {
    **{name: name for name in ('gpt2', 'gpt2-medium', 'gpt2-large')},
    **{
        f'DialoGPT-{size}': f'microsoft/DialoGPT-{size}'
        for size in ('small', 'medium', 'large')
    },
}

# Config string -> Stable-Baselines3 algorithm class.
learning_algos = dict(ppo=PPO, a2c=A2C)

# Experiment configuration; every tunable value for the run lives here.
config = dict(
    out_type='joke',
    prompt="Tell me a joke.",
    feedback='Be funnier.',
    emotion='funny',
    model_name='DialoGPT-small',  # key into `hf_models`
    algorithm='ppo',              # key into `learning_algos`
    max_seq_len=35,
    max_steps=10000,
    lr=5e-8,  # from https://github.com/openai/lm-human-preferences
    n_envs=20,
)

# Keyword arguments forwarded to GPT2ActorCriticPolicy. The policy and value
# networks both start from the checkpoint selected by `config['model_name']`.
config.update(
    policy_kwargs=dict(
        ortho_init=False,  # important
        policy_hfid=hf_models[config['model_name']],
        value_hfid=hf_models[config['model_name']],
        device=device,
        cache_dir=cache_dir,
    )
)

# Toy mode shrinks the run: zero training steps, a short sequence budget, A2C
# instead of PPO, and only two environments. Set to False for the full
# configuration defined above.
TOY = True
if TOY:
    config.update(
        max_steps=0,
        max_seq_len=10,
        algorithm='a2c',
        n_envs=2,
    )

# Read the clock once so the date and time stamps always describe the same
# instant (two separate reads can disagree when the cell runs across midnight).
_now = datetime.now()
today = _now.strftime('%Y-%m-%d')
curr_time = _now.strftime('%H-%M-%S')

# Echo the effective configuration (after any TOY overrides) into the cell output.
pprint.PrettyPrinter(indent=2).pprint(config)

# GPT-2 tokenizer with an explicit '<|pad|>' padding token.
# NOTE(review): registering a new special token triggers the "Special tokens
# have been added in the vocabulary" warning seen in this cell's output; confirm
# that the pad id is never fed to a model whose embedding table was not resized.
tokenizer = GPT2Tokenizer.from_pretrained(
    'gpt2',
    pad_token='<|pad|>',
    device=device,
    cache_dir=cache_dir,
)
# Number of tokens the prompt occupies; used later to size the rollout length.
prompt_len = tokenizer(config['prompt'], return_tensors="pt")['input_ids'].size(-1)

reward_model = GPT2RewardModel(device=device) # todo: try out dialogpt?

# Constructor arguments shared by every AdaptEnv instance.
env_kwargs = dict(
    prompt=config['prompt'],
    feedback=config['feedback'],
    max_seq_len=config['max_seq_len'],
    reward_model=reward_model,
    tokenizer=tokenizer,
    eos_token_id=tokenizer.eos_token_id,
    device='cpu',  # i think breaks on cuda rn
)

# A single environment is built directly; more than one is wrapped in an SB3
# vectorized environment.
env = (
    AdaptEnv(**env_kwargs)
    if config['n_envs'] == 1
    else make_vec_env(AdaptEnv, n_envs=config['n_envs'], env_kwargs=env_kwargs)
)

# Algorithm class selected by the config: PPO by default, A2C when TOY is on.
learning_algo = learning_algos[config['algorithm']]

# Steps collected per environment per update: the sequence budget that remains
# once the prompt tokens are accounted for.
n_steps = config['max_seq_len'] - prompt_len
if n_steps <= 0:
    raise ValueError(
        f"max_seq_len ({config['max_seq_len']}) must be larger than the prompt "
        f"length ({prompt_len} tokens); got n_steps={n_steps}"
    )

model = learning_algo(
    GPT2ActorCriticPolicy,
    env,
    n_steps=n_steps,
    learning_rate=config['lr'],
    policy_kwargs=config['policy_kwargs'],
    device=device,
    verbose=2,
)

# Replace the policy's action head with the pretrained LM head of the selected
# checkpoint. `cache_dir` is passed explicitly so this load uses the same cache
# as the tokenizer and policy instead of the library default location.
# NOTE(review): the head is swapped in after the policy (and presumably its
# optimizer) was constructed -- confirm the new head's parameters are trained,
# or that leaving them frozen is intended.
model.policy._modules['action_net'] = GPT2LMHeadModel.from_pretrained(
    hf_models[config['model_name']],
    cache_dir=cache_dir,
).to(device).lm_head

{ 'algorithm': 'a2c',
  'emotion': 'funny',
  'feedback': 'Be funnier.',
  'lr': 5e-08,
  'max_seq_len': 10,
  'max_steps': 0,
  'model_name': 'DialoGPT-small',
  'n_envs': 2,
  'out_type': 'joke',
  'policy_kwargs': { 'cache_dir': '/om2/user/rogerjin/.cache/transformers',
                     'device': 'cuda:0',
                     'ortho_init': False,
                     'policy_hfid': 'microsoft/DialoGPT-small',
                     'value_hfid': 'microsoft/DialoGPT-small'},
  'prompt': 'Tell me a joke.'}


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Using cuda:0 device


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [15]:
# Language model held inside the policy's latent extractor; the generation cell
# samples from this object.
lm = model.policy.mlp_extractor.policy_latent.lm

In [None]:
# Train for the configured number of timesteps. Verbosity is a constructor
# option in Stable-Baselines3 (already set when `model` was built); `learn()`
# does not accept a `verbose` keyword, so it is not passed here.
model.learn(config['max_steps'])

# Attach a zero bias to the action head. The size is taken from the head itself
# rather than hard-coding the GPT-2 vocabulary size, and the dtype follows the
# head's weights.
action_net = model.policy._modules['action_net']
action_net.bias = torch.nn.Parameter(
    torch.zeros(action_net.out_features, dtype=action_net.weight.dtype, device=device)
)

# NOTE: this rebinds `model` from the SB3 algorithm object to the underlying
# language model, so this cell cannot be re-run without rebuilding `model`.
model = model.policy._modules['mlp_extractor'].policy_latent.lm # todo: figure out if the weights are different

In [6]:
# Settings for the sampling cell below.
# NOTE: `cache_dir`, `device` and `tokenizer` are also assigned in the training
# setup cell; whichever cell ran last determines their values.

# Text every sampled sequence starts from.
prompt = 'Tell me a joke.'

# out_dir = f'output/jokes/knock-knock/'
# os.makedirs(out_dir, exist_ok=True)
# with open(f'{out_dir}/prompt.txt', 'w+') as prompt_file:
#     prompt_file.write(prompt)

# Checkpoint whose tokenizer is loaded here.
hf_id = 'microsoft/DialoGPT-small'
# hf_id = 'gpt2-large'
cache_dir = '/om2/user/rogerjin/.cache/transformers'
device = 'cuda:0'
# save_path = '/om2/user/rogerjin/6.884/adaPT/checkpoints/test/DialoGPT-small_ppo_2_lr=5e-06.pt'
tokenizer = GPT2Tokenizer.from_pretrained(
    hf_id,
    cache_dir=cache_dir,
    device=device,
)

# Run label and control flag; both are only read by the commented-out
# checkpoint-loading code that follows.
run_name = 'be_funnier'
control = False

# if control:
#     model = GPT2LMHeadModel.from_pretrained(hf_id, cache_dir=cache_dir).to(device)
#     out_path = f'{out_dir}/control.txt'
# else:
#     ppo = PPO.load(save_path)
#     model = ppo.policy._modules['mlp_extractor'].policy_latent.lm # todo: figure out if the weights are different
#     out_path = f'{out_dir}/{run_name}.txt'

In [17]:
# when generating, we will use the logits of right-most token to predict the next token
# so the padding should be on the left
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token # to avoid an error

num_sequences = 100
sentences = [prompt] * num_sequences
batch_size = 20

# Size of the model's input embedding table. Token ids at or above this value
# are one common cause of the opaque "CUDA error: device-side assert triggered",
# so each batch is checked on the CPU before it is sent to the GPU.
embedding_rows = lm.get_input_embeddings().num_embeddings

output_sequences = []

for start in range(0, num_sequences, batch_size):
    batch = sentences[start:start+batch_size]
    inputs = tokenizer(batch, return_tensors="pt", padding=True)

    largest_id = int(inputs['input_ids'].max())
    if largest_id >= embedding_rows:
        raise ValueError(
            f"token id {largest_id} is outside the model's embedding table "
            f"({embedding_rows} rows); tokenizer and model vocabularies do not match"
        )

    # `generate` returns one row per input sentence; `extend` stores the rows
    # individually so batches of different lengths can be mixed.
    output_sequences.extend(lm.generate(
        input_ids=inputs['input_ids'].to(device),
        attention_mask=inputs['attention_mask'].to(device),
        do_sample=True, # stochastic sampling; set to False to test if batching affects output
        min_length=25,
        max_length = 60,
        temperature=1.5,
        top_k=20,
        no_repeat_ngram_size=2,
        pad_token_id=tokenizer.pad_token_id, # explicit, matches the pad token set above
    ))

decodings = []

for sequence in output_sequences:
    decoding = tokenizer.decode(sequence, skip_special_tokens=True)
    print(decoding)
#     decodings.append(f'<|DECODING|>{decoding}')
#     decodings.append(f'{decoding}\n')
    # you can use skip_special_tokens=True in decode() to remove padding token
    # but note that it will also remove other special_tokens

# with open(out_path, 'w+') as out:
#     out.writelines(decodings)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.