In [5]:
from datasets import DatasetDict, Dataset
from openprompt.data_utils import InputExample
import pandas as pd


def create_labels(sentiment):
    labels = []
    for s in sentiment:
        if s == 'neutral':
            labels += [0]
        elif s == 'negative':
            labels += [1]
        else:
            labels += [2]
    return labels

seed = 40
# set seed

# Create task Dataset from annotated samples
sentences = pd.read_csv('../data/sentences_MD-labels.csv', header=0)
sentences = sentences[['idx','language', "MD_label"]]

dataset = Dataset.from_pandas(sentences).rename_columns({'language': 'sentence', "MD_label": 'sentiment'})
dataset = dataset.add_column('label', create_labels(dataset['sentiment']))
train_test = dataset.train_test_split(0.35, seed = seed)
dev_test = train_test['test'].train_test_split(0.5, seed = seed)
raw_dataset = DatasetDict({
    'train': train_test['train'],
    'validation': dev_test['train'],
    'test': dev_test['test']})

dataset = {}
for split in ['train', 'validation', 'test']:
    dataset[split] = []
    for data in raw_dataset[split]:
        input_example = InputExample(text_a = data['sentence'], label=int(data['label']), guid=data['idx'])
        dataset[split].append(input_example)
print(dataset['train'][0])


{
  "guid": 18,
  "label": 0,
  "meta": {},
  "text_a": "She states that pt has been compliant with meds",
  "text_b": "",
  "tgt_text": null
}



In [6]:
# You can load the plm related things provided by openprompt simply by calling:
from transformers import BertConfig, BertTokenizer, BertModel, BertForMaskedLM, set_seed
from openprompt.prompts import ManualTemplate, ManualVerbalizer
from openprompt import PromptForClassification, PromptDataLoader
from mlm import MLMTokenizerWrapper
import torch

set_seed(seed)
MODEL = 'UFNLP/gatortron-base'
model_path = "../runs/ta_pretraining/checkpoint-435"
model_config = BertConfig.from_pretrained(model_path)
plm = BertForMaskedLM.from_pretrained(model_path, config=model_config)
tokenizer = BertTokenizer.from_pretrained(MODEL)
WrapperClass = MLMTokenizerWrapper


# Constructing Template
# A template can be constructed from the yaml config, but it can also be constructed by directly passing arguments.

template_text = '{"placeholder":"text_a"} It was {"mask"}'
mytemplate = ManualTemplate(tokenizer=tokenizer, text=template_text)

# To better understand how does the template wrap the example, we visualize one instance.

wrapped_example = mytemplate.wrap_one_example(dataset['train'][0])
print(wrapped_example)

# Now, the wrapped example is ready to be pass into the tokenizer, hence producing the input for language models.
# You can use the tokenizer to tokenize the input by yourself, but we recommend using our wrapped tokenizer, which is a wrapped tokenizer tailed for InputExample.
# The wrapper has been given if you use our `load_plm` function, otherwise, you should choose the suitable wrapper based on
# the configuration in `openprompt.plms.__init__.py`.
# Note that when t5 is used for classification, we only need to pass <pad> <extra_id_0> <eos> to decoder.
# The loss is calcaluted at <extra_id_0>. Thus passing decoder_max_length=3 saves the space
wrapped_tokenizer = WrapperClass(max_seq_length=128, decoder_max_length=3, tokenizer=tokenizer,truncate_method="head")
# or
# from openprompt.plms import BERTTokenizerWrapper
# wrapped_tokenizer= BERTTokenizerWrapper(max_seq_length=128, decoder_max_length=3, tokenizer=tokenizer,truncate_method="head")

# You can see what a tokenized example looks like by
tokenized_example = wrapped_tokenizer.tokenize_one_example(wrapped_example, teacher_forcing=False)
print(tokenized_example)
print(tokenizer.convert_ids_to_tokens(tokenized_example['input_ids']))
# print(tokenizer.convert_ids_to_tokens(tokenized_example['decoder_input_ids']))

You are using a model of type megatron-bert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at ../runs/ta_pretraining/checkpoint-435 were not used when initializing BertForMaskedLM: ['bert.encoder.layer.16.attention.ln.weight', 'bert.encoder.layer.12.ln.bias', 'bert.encoder.layer.18.attention.ln.bias', 'bert.encoder.layer.14.attention.ln.weight', 'bert.encoder.layer.6.ln.weight', 'bert.encoder.layer.21.attention.ln.weight', 'bert.encoder.layer.12.ln.weight', 'bert.encoder.layer.3.attention.ln.weight', 'bert.encoder.layer.9.attention.ln.weight', 'bert.encoder.layer.2.ln.weight', 'bert.encoder.layer.1.ln.weight', 'bert.encoder.layer.9.ln.weight', 'bert.encoder.layer.20.attention.ln.weight', 'bert.encoder.layer.0.attention.ln.bias', 'bert.encoder.layer.7.ln.bias', 'bert.encoder.layer.20.ln.weight', 'bert.encoder.layer.5.attention.ln.bias', 'bert.encoder.layer.9.attention.ln.bias', 'bert.

[[{'text': 'She states that pt has been compliant with meds', 'loss_ids': 0, 'shortenable_ids': 1}, {'text': ' It was', 'loss_ids': 0, 'shortenable_ids': 0}, {'text': '<mask>', 'loss_ids': 1, 'shortenable_ids': 0}], {'guid': 18, 'label': 0}]
{'input_ids': [101, 627, 1692, 323, 724, 394, 636, 10604, 189, 4013, 653, 245, 103, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'loss_ids': [-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, -100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [3]:
# Now it's time to convert the whole dataset into the input format!
# Simply loop over the dataset to achieve it!

model_inputs = {}
for split in ['train', 'validation', 'test']:
    model_inputs[split] = []
    for sample in dataset[split]:
        tokenized_example = wrapped_tokenizer.tokenize_one_example(mytemplate.wrap_one_example(sample), teacher_forcing=False)
        model_inputs[split].append(tokenized_example)

# We provide a `PromptDataLoader` class to help you do all the above matters and wrap them into an `torch.DataLoader` style iterator.
from openprompt import PromptDataLoader

train_dataloader = PromptDataLoader(dataset=dataset["train"], template=mytemplate, tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass, max_seq_length=256, decoder_max_length=3,
    batch_size=4,shuffle=True, teacher_forcing=False, predict_eos_token=False,
    truncate_method="head")
# next(iter(train_dataloader))

tokenizing: 0it [00:00, ?it/s]Fatal Python error: config_get_locale_encoding: failed to get the locale encoding: nl_langinfo(CODESET) failed
Python runtime state: preinitialized

tokenizing: 30it [00:00, 1182.14it/s]


In [4]:
# Define the verbalizer
# In classification, you need to define your verbalizer, which is a mapping from logits on the vocabulary to the final label probability. Let's have a look at the verbalizer details:

from openprompt.prompts import ManualVerbalizer
import torch

# # for example the verbalizer contains multiple label words in each class
# myverbalizer = ManualVerbalizer(tokenizer, num_classes=3,
#                         label_words=[["neutral"], ["negative"], ["positive"]])

classes = [ # There are two classes in Sentiment Analysis, one for negative and one for positive
    "neutral",
    "negative",
    "positive"
]
myverbalizer = ManualVerbalizer(
    tokenizer = tokenizer,
    classes = classes,
    label_words = {
        "neutral": ["fair", "okay", "unbiased", "unknown"],
        "negative": ["bad", "awful", "terrible", "horrible"],
        "positive": ["good", "wonderful", "great", "effective"],
    },   
)

print(myverbalizer.label_words_ids)
logits = torch.randn(2,len(tokenizer)) # creating a pseudo output from the plm, and
print(myverbalizer.process_logits(logits)) # see what the verbalizer do

Parameter containing:
tensor([[[ 7550,     0],
         [13163,     0],
         [39429,     0],
         [ 5221,     0]],

        [[ 8837,     0],
         [ 2639,  1823],
         [49283,     0],
         [47322,  1083]],

        [[ 1975,     0],
         [36986,  1823],
         [ 2203,     0],
         [ 3565,     0]]])
tensor([[-2.5886, -2.8751, -3.5636],
        [-3.1815, -2.5237, -2.4045]])


In [5]:
# Although you can manually combine the plm, template, verbalizer together, we provide a pipeline
# model which take the batched data from the PromptDataLoader and produce a class-wise logits

from openprompt import PromptForClassification

use_cuda = False
prompt_model = PromptForClassification(plm=plm,template=mytemplate, verbalizer=myverbalizer, freeze_plm=False)
if use_cuda:
    prompt_model=prompt_model.cuda()

# Now the training is standard
from transformers import  AdamW, get_linear_schedule_with_warmup
loss_func = torch.nn.CrossEntropyLoss()
no_decay = ['bias', 'LayerNorm.weight']
# it's always good practice to set no decay to biase and LayerNorm parameters
optimizer_grouped_parameters = [
    {'params': [p for n, p in prompt_model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in prompt_model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=1e-6)

for epoch in range(10):
    tot_loss = 0
    for step, inputs in enumerate(train_dataloader):
        if use_cuda:
            inputs = inputs.cuda()
        logits = prompt_model(inputs)
        labels = inputs['label']
        loss = loss_func(logits, labels)
        loss.backward()
        tot_loss += loss.item()
        optimizer.step()
        optimizer.zero_grad()
        if step %100 ==1:
            print("Epoch {}, average loss: {}".format(epoch, tot_loss/(step+1)), flush=True)

# Evaluate
validation_dataloader = PromptDataLoader(dataset=dataset["validation"], template=mytemplate, tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass, max_seq_length=256, decoder_max_length=3,
    batch_size=4,shuffle=False, teacher_forcing=False, predict_eos_token=False,
    truncate_method="head")

allpreds = []
alllabels = []
for step, inputs in enumerate(validation_dataloader):
    if use_cuda:
        inputs = inputs.cuda()
    logits = prompt_model(inputs)
    labels = inputs['label']
    alllabels.extend(labels.cpu().tolist())
    allpreds.extend(torch.argmax(logits, dim=-1).cpu().tolist())

acc = sum([int(i==j) for i,j in zip(allpreds, alllabels)])/len(allpreds)
print(acc)



Epoch 0, average loss: 3.030619263648987
Epoch 1, average loss: 1.3008103966712952
Epoch 2, average loss: 1.0234407186508179
Epoch 3, average loss: 1.022293210029602
Epoch 4, average loss: 1.0564102530479431
Epoch 5, average loss: 1.0250256061553955
Epoch 6, average loss: 0.621261477470398
Epoch 7, average loss: 0.5856716930866241
Epoch 8, average loss: 0.36010362207889557
Epoch 9, average loss: 0.7198853194713593


tokenizing: 8it [00:00, 1500.44it/s]


0.375


In [None]:
lr=1e-4, batch=4, epoch=4
Epoch 0, average loss: 1.6096126437187195
Epoch 1, average loss: 2.4815704226493835
Epoch 2, average loss: 4.7398681640625
Epoch 3, average loss: 1.7757583856582642
acc = 0.25

lr=1e-6, batch=4, epoch=4
Epoch 0, average loss: 5.752278208732605
Epoch 1, average loss: 2.5698354244232178
Epoch 2, average loss: 2.4523871019482613
Epoch 3, average loss: 0.7844662666320801
acc = 0.25

-------

