In [1]:
from google.colab import userdata
HF_TOKEN = userdata.get('HF_TOKEN')

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import get_peft_model, PromptTuningConfig
import torch

In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
emotion_tokens = ['<happy>', '<sad>']

tokenizer = AutoTokenizer.from_pretrained('gpt2', token=HF_TOKEN)
tokenizer.add_special_tokens({"additional_special_tokens": emotion_tokens})

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

2

In [4]:
train_data = [
    ('<happy> Once upon a time, there was a dragon who,',
     'The dragon breathed colorful fireworks that lit up the sky.'),
    ('<sad> In a dark forest, a lonely knight',
     'The knight knelt by the withered tree, tears falling on his rusted armor.')
]

In [6]:
model = AutoModelForCausalLM.from_pretrained("gpt2", token=HF_TOKEN).to(device)

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [7]:
model.resize_token_embeddings(len(tokenizer))

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(50259, 768)

In [8]:
peft_config = PromptTuningConfig(
    task_type="CAUSAL_LM",
    num_virtual_tokens=10,
    token_dim=model.config.hidden_size
)

model = get_peft_model(model, peft_config)

In [10]:
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

for epoch in range(30):
  for prompt, continuation in train_data:
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    labels = tokenizer(continuation, return_tensors="pt").input_ids.to(device)

    full_inputs = torch.cat([inputs.input_ids, labels], dim=1)
    outputs = model(full_inputs, labels=full_inputs)

    loss = outputs.loss
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()


In [11]:
def generate_story(emotion, prompt):
  inputs = tokenizer(f'{emotion} {prompt}', return_tensors="pt").to(device)

  output = model.generate(
      **inputs,
      max_new_tokens=50,
      temperature=0.9,
      top_k=40,
      repetition_penalty=1.5,
      do_sample=True
  )

  return tokenizer.decode(output[0], skip_special_tokens=False)

In [12]:
print(generate_story('<happy>', 'In a magical kingdom,'))
print("=" * 100)
print(generate_story('<sad>', 'In a magical kingdom,'))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<happy> In a magical kingdom, that and of. I made the new book on it from our "
 is an in this to be as more Ris was not at all - The last we have no problem with each other (this seems like: one which should then you don
<sad> In a magical kingdom,'s the (
 this in anis. not is to " of it was good or that he had by 1: and-I made their new one on his name if you are free from for which other people don't have when they put into
