In [1]:
from transformers import (
    BertTokenizerFast,
    AdamW,
    get_scheduler
)
import torch

from model import BertPromptTuningLM

In [5]:
class Config:
    """Hyperparameters for the prompt-tuning walkthrough in this notebook."""

    # --- Optimisation ------------------------------------------------------
    # Field names follow run_clm_no_trainer.py from transformers:
    # https://github.com/huggingface/transformers/blob/master/examples/pytorch/language-modeling/run_clm_no_trainer.py
    # NOTE(review): the original comment called these the script's defaults;
    # learning_rate / weight_decay may not match that script — confirm.
    num_train_epochs = 3
    weight_decay = 0.01
    learning_rate = 0.01
    lr_scheduler_type = "linear"
    num_warmup_steps = 0
    # The scheduler's step budget is set equal to the epoch count,
    # i.e. one optimizer step per epoch.
    max_train_steps = num_train_epochs

    # --- Prompt tuning -----------------------------------------------------
    # How many soft-prompt tokens to use.
    n_prompt_tokens = 20
    # True  -> initialise the soft prompt from the vocabulary.
    # False -> initialise randomly; `random_range` can be set to control it.
    init_from_vocab = False
    # random_range = 0.5


args = Config()

In [6]:
# Single source of truth for the checkpoint, so the tokenizer and the model
# can never be loaded from two different checkpoints by accident.
MODEL_CHECKPOINT = "bert-base-multilingual-cased"

tokenizer = BertTokenizerFast.from_pretrained(MODEL_CHECKPOINT)
# Initialize the BERT LM with a soft prompt of `n_prompt_tokens` tokens.
model = BertPromptTuningLM.from_pretrained(
    MODEL_CHECKPOINT,
    n_tokens=args.n_prompt_tokens,
    initialize_from_vocab=args.init_from_vocab
)

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertPromptTuningLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertPromptTuningLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertPromptTuningLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Initializing soft prompt...


In [7]:
# Show the soft prompt's shape. The first dimension should equal
# args.n_prompt_tokens; the second is presumably the embedding size.
# (A bare `model.soft_prompt.weight` line used to precede this one, but only
# the last expression of a cell is displayed, so it had no effect.)
model.soft_prompt.weight.shape

torch.Size([20, 768])

In [5]:
# Encode one sample sentence as PyTorch tensors and print the resulting
# encoding so its fields can be inspected.
inputs = tokenizer(
    "Hello, my dog is cute",
    return_tensors="pt",
)
print(inputs)

{'input_ids': tensor([[  101, 31178,   117, 15127, 17835, 10124, 21610, 10112,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [6]:
# Prompt tuning: hand ONLY the soft-prompt weights to the optimizer, so they
# are the only tensor that `optimizer.step()` can modify.
# NOTE(review): this cell does not itself set `requires_grad=False` on the LM
# weights; presumably BertPromptTuningLM takes care of freezing — confirm.
soft_prompt_params = [
    p for n, p in model.named_parameters() if n == "soft_prompt.weight"
]
if not soft_prompt_params:
    # An empty parameter group would otherwise train nothing without warning.
    raise ValueError('no parameter named "soft_prompt.weight" found on the model')

optimizer_grouped_parameters = [
    {
        "params": soft_prompt_params,
        "weight_decay": args.weight_decay,
    }
]
# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated
# in favour of torch.optim.AdamW and is absent from recent releases.
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
lr_scheduler = get_scheduler(
    name=args.lr_scheduler_type,
    optimizer=optimizer,
    num_warmup_steps=args.num_warmup_steps,
    num_training_steps=args.max_train_steps,
)

In [7]:
# Put the model in training mode before the forward pass.
model.train()
# One forward pass on the sample; the input ids are also passed as the
# labels, and the loss is read from the returned outputs.
outputs = model(labels=inputs["input_ids"], **inputs)
loss = outputs.loss
print(f"loss: {loss}")

loss: 11.808565139770508


In [8]:
# One full training step: backpropagate, then update the optimized weights.
loss.backward()
optimizer.step()
# Advance the learning-rate schedule; it is created together with the
# optimizer but was never stepped, so it had no effect on training.
lr_scheduler.step()
# Clear gradients so a later backward pass does not accumulate onto these.
optimizer.zero_grad()

In [9]:
# Display the soft prompt weights after the optimizer step.
# Author's note: confirmed the weights were changed.
# NOTE(review): no pre-update values are displayed in the visible cells, so
# the change cannot be checked from the notebook output alone — compare
# against a copy saved before `optimizer.step()` to verify.
model.soft_prompt.weight

Parameter containing:
tensor([[ 0.0159, -0.0162,  0.0059,  ...,  0.0197,  0.0142,  0.0097],
        [ 0.0204, -0.0236, -0.0033,  ...,  0.0024,  0.0167,  0.0237],
        [ 0.0321, -0.0136,  0.0093,  ...,  0.0169,  0.0368, -0.0050],
        ...,
        [ 0.0084,  0.0064,  0.0040,  ...,  0.0277, -0.0045,  0.0214],
        [ 0.0283,  0.0088, -0.0009,  ...,  0.0460,  0.0362,  0.0307],
        [ 0.0178, -0.0307, -0.0045,  ...,  0.0190, -0.0036,  0.0017]],
       requires_grad=True)