In [35]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from transformers import AdamW, AutoConfig, AutoTokenizer, AutoModelForMaskedLM
from tqdm import tqdm

In [39]:
import autoprompt.utils as utils
from autoprompt.models import ContinuousTriggerMLM
from autoprompt.preprocessors import PREPROCESSORS
from autoprompt.evaluators import MLM_EVALUATORS

import json
import io

In [5]:
from autoprompt import continuous_trigger_mlm

In [27]:
distributed_config = utils.distributed_setup(-1)


In [6]:
config, tokenizer, base_model = continuous_trigger_mlm.load_transformers("roberta-large")


Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.
Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at roberta-large and are newly initialized: ['lm_head.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [46]:
label_map = json.loads("""{"plural": "plural", "singular": "singular"}""")

In [55]:
template = "[T] [T] Q: {text} [P]"
templatizer = utils.MultiTokenTemplatizer(
        template=template,
        tokenizer=tokenizer,
        label_field="label",
        label_map=label_map,
        add_padding=True,
    )

In [10]:
def get_initial_trigger_ids(initial_trigger, tokenizer):
    """Converts a list of trigger tokens to a tensor of trigger token ids."""
    if initial_trigger is None:
        return
    initial_trigger_ids = torch.tensor(
        tokenizer.convert_tokens_to_ids(initial_trigger)
    )
    detokenized = tokenizer.convert_ids_to_tokens(initial_trigger_ids)
    return initial_trigger_ids


In [11]:
initial_trigger_ids = get_initial_trigger_ids(['?', '|', ','], tokenizer)

In [71]:
model = ContinuousTriggerMLM(
        base_model=base_model,
        num_trigger_tokens=templatizer.num_trigger_tokens,
        initial_trigger_ids=initial_trigger_ids,
)


RuntimeError: The size of tensor a (2) must match the size of tensor b (3) at non-singleton dimension 0

In [57]:
def get_sampler(
    dataset,
    evaluation_strategy,
    distributed_config,
    train=False
):
    """Sets up the correct sampler for a data loader."""
    # Sampling is handled by data iterator for multiple choice problems.
    if evaluation_strategy != 'classification':
        return
    # Multi-GPU training
    if distributed_config.world_size != -1:
        return torch.utils.data.DistributedSampler(dataset, shuffle=train)
    # Defaults
    if train:
        return torch.utils.data.RandomSampler(dataset)
    else:
        return torch.utils.data.SequentialSampler(dataset)


def _stringify(d):
    return {k: str(v) for k, v in d.items()}

def preprocess_jsonl_string(string, **kwargs):
    for line in string.readlines():
        yield _stringify(json.loads(line))

def load_trigger_dataset(
    fname,
    templatizer,
    limit=None,
    train=False,
    preprocessor_key=None,
):
    instances = []
    for x in preprocess_jsonl_string(fname, train=train):
        try:
            model_inputs, label_id = templatizer(x, train=train)
        except ValueError as e:
            logger.warning('Encountered error "%s" when processing "%s".  Skipping.', e, x)
            continue
        else:
            instances.append((model_inputs, label_id))
    if limit:
        limit = min(len(instances), limit)
        return random.sample(instances, limit)
    else:
        return instances



def load_datasets(templatizer, distributed_config, evaluation_strategy, 
                  train, dev, test, preprocessor, bsz=2, limit=None):
    """Loads the training, dev and test datasets."""
    dataset_constructor = load_trigger_dataset
    collator = utils.Collator(pad_token_id=templatizer.pad_token_id)

    train_dataset = dataset_constructor(
        train,
        templatizer=templatizer,
        train=True,
        preprocessor_key=preprocessor,
        limit=limit,
    )
    train_sampler = get_sampler(train_dataset, evaluation_strategy, distributed_config, train=True)
    train_loader = DataLoader(train_dataset, batch_size=bsz, collate_fn=collator, sampler=train_sampler)

    dev_dataset = dataset_constructor(
        dev,
        templatizer=templatizer,
        preprocessor_key=preprocessor,
        limit=limit,
    )
    dev_sampler = get_sampler(dev_dataset, evaluation_strategy, distributed_config, train=False)
    dev_loader = DataLoader(dev_dataset, batch_size=bsz, collate_fn=collator, sampler=dev_sampler)

    test_dataset = dataset_constructor(
        test,
        templatizer=templatizer,
        preprocessor_key=preprocessor,
    )
    test_sampler = get_sampler(test_dataset, evaluation_strategy, distributed_config, train=False)
    test_loader = DataLoader(test_dataset, batch_size=bsz, collate_fn=collator, sampler=test_sampler)

    return train_loader, dev_loader, test_loader

In [53]:
label_map['plural']

'plural'

In [58]:
from io import StringIO
evaluation_strategy = "classification"
preprocessor = "jsonl-string"
train_file = StringIO('{"text":"beers", "label":"plural"}\n{"text":"beer", "label":"singular"}')
dev_file = StringIO('{"text":"beers", "label":"plural"}\n{"text":"beer", "label":"singular"}')
test_file = StringIO('{"text":"beers", "label":"plural"}\n{"text":"beer", "label":"singular"}')


train_loader, dev_loader, test_loader = load_datasets(
    templatizer=templatizer,
    distributed_config=distributed_config,    
    evaluation_strategy=evaluation_strategy,
    train = train_file,
    dev = dev_file,
    test = test_file,
    preprocessor=preprocessor
)

In [15]:
def get_optimizer(model, lr, finetune_lr, finetune_mode='partial'):
    """Handles setting the optimizer up for different finetuning modes."""
    params = [{'params': [model.trigger_embeddings]}]
    if finetune_mode == 'partial':
        params.append({
            'params': model.lm_head.parameters(),
            'lr': finetune_lr if finetune_lr else lr
        })
    elif finetune_mode == 'all':
        params.append({
            'params': [p for p in model.parameters() if not torch.equal(p, model.trigger_embeddings)],
            'lr': finetune_lr if finetune_lr else lr
        })
    return AdamW(
        params,
        lr=lr,
        weight_decay=1e-2,
        eps=1e-8
    )

In [17]:
finetune_lr = 0.001
finetune_mode = 'partial'
lr = 0.1
optimizer = get_optimizer(model, finetune_lr, lr, finetune_mode)

In [64]:
def to_device(data, device):
    if isinstance(data, dict):
        return {k: to_device(v, device) for k, v in data.items()}
    elif isinstance(data, list):
        return [to_device(x, device) for x in data]
    elif isinstance(data, torch.Tensor):
        return data.to(device)

In [70]:
distributed_config.device

device(type='cuda')

In [73]:
epochs = 2
disable_dropout = False
quiet = False
evaluation_metric = "accuracy"
accumulation_steps = 1
decoding_strategy = None
model.to(distributed_config.device)


evaluator = MLM_EVALUATORS[evaluation_strategy](
    model=model,
    tokenizer=tokenizer,
    label_map=label_map,
    decoding_strategy=decoding_strategy,
)



for epoch in range(epochs):
    if not disable_dropout:
        model.train()
    else:
        model.eval()
    if distributed_config.is_main_process and not quiet:
        iter_ = tqdm(train_loader)
    else:
        iter_ = train_loader
    print(epoch)
    total_loss = torch.tensor(0.0, device=distributed_config.device)
    if evaluation_metric == 'accuracy':
        total_correct = {'accuracy': torch.tensor(0.0, device=distributed_config.device)}
    else:
        total_correct = {'TP': torch.tensor(0.0, device=distributed_config.device),
                         'FP': torch.tensor(0.0, device=distributed_config.device),
                         'TN': torch.tensor(0.0, device=distributed_config.device),
                         'FN': torch.tensor(0.0, device=distributed_config.device)}
    denom = torch.tensor(0.0, device=distributed_config.device)
    optimizer.zero_grad()
    for i, (model_inputs, labels) in enumerate(iter_):
        model_inputs = to_device(model_inputs, distributed_config.device)
        labels = to_device(labels, distributed_config.device)
        loss, correct, preds = evaluator(model_inputs, labels, train=True, 
                                evaluation_metric=evaluation_metric)
        loss /= accumulation_steps
        loss.backward()
        if (i % accumulation_steps) == (accumulation_steps - 1):
            logger.debug('Optimizer step.')
            if args.clip is not None:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
            optimizer.step()
            optimizer.zero_grad()
        # TODO: Metric logging is clumsy/brittle.
        batch_size = 1.0 if evaluation_strategy == 'multiple-choice' else labels.size(0)
        total_loss += loss.detach() * batch_size
        for metric in correct:
            total_correct[metric] += correct[metric].detach()
        denom += batch_size

        # NOTE: This loss/accuracy is only on the subset  of training data
        # in the main process.
        if distributed_config.is_main_process and not args.quiet:
            if evaluation_metric == 'accuracy':
                iter_.set_description(
                    f'Loss: {total_loss / (denom + 1e-13): 0.4f}, '
                    f"Accuracy: {total_correct['accuracy'] / (denom + 1e-13): 0.4f}"
                )
            elif evaluation_metric == 'MCC':
                mcc_numerator = (total_correct['TP'] * total_correct['TN'] - 
                                total_correct['FP'] * total_correct['FN'])
                mcc_denominator = torch.sqrt((total_correct['TP'] + total_correct['FP']) *
                                             (total_correct['TP'] + total_correct['FN']) *
                                             (total_correct['TN'] + total_correct['FP']) *
                                             (total_correct['TN'] + total_correct['FN']))
                iter_.set_description(
                    f'Loss: {total_loss / (denom + 1e-13): 0.4f}, '
                    f"MCC: {mcc_numerator / mcc_denominator: 0.4f}"    
                )
            elif evaluation_metric == 'F1':
                precision = total_correct['TP'] / (total_correct['TP'] + total_correct['FP'])
                recall = total_correct['TP'] / (total_correct['TP'] + total_correct['FN'])
                f1 = (2 * precision * recall) / (precision + recall)
                iter_.set_description(
                    f'Loss: {total_loss / (denom + 1e-13): 0.4f}, '
                    f"F1 score: {f1: 0.4f}"    
                )

    

  0%|          | 0/1 [00:00<?, ?it/s]

0





RuntimeError: shape mismatch: value tensor of shape [6, 1024] cannot be broadcast to indexing result of shape [4, 1024]