In this tutorial, we show how to load our pretrained models from the Hugging Face model hub and use them to make predictions. We then explain the prediction outputs and showcase some ways to manipulate the results.

## Overview

1. [Load Pretrained Models from the Hub](#load-pretrained-models-from-the-hub)
2. [Make Predictions](#make-predictions)
3. [Evaluation Metrics and Results](#evaluation-metrics-and-results)

### Load Pretrained Models from the Hub

As a direct continuation of our [previous tutorial](./Loading%20unicausal%20model.ipynb), we load our pretrained config, model, and tokenizer from the Hugging Face model hub. They come in handy for prediction and save you the time and resources needed to train the model from scratch.

In [None]:
# install dependencies for later operations
!pip install seqeval

In [28]:
import sys
sys.path.append('..')
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [32]:
# import dependencies from HuggingFace transformers library
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
)
from models.classifiers.modeling_bert import BertForUnifiedCRBase

# declare label-to-id mapping and related model parameters
label_to_id = {'B-C': 0, 'B-E': 1, 'I-C': 2, 'I-E': 3, 'O': 4}
label_list = list(label_to_id)  # label strings, positioned by insertion order of the mapping above
num_labels = len(label_list)
alpha = 1
# Directory for cached hub downloads; None falls back to the library default location.
# It is now passed to the config, model and tokenizer loaders alike (previously only the tokenizer used it).
cache_dir = None
model_name_or_path = "tanfiona/unicausal" # model path at HuggingFace hub

# initialize useful constructs for prediction
config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=num_labels,
    cache_dir=cache_dir,
)
# preview configurations
print('### Config ###')
print(config)

model = BertForUnifiedCRBase.from_pretrained(
    model_name_or_path,
    # only treat the path as a TensorFlow checkpoint when it looks like one
    from_tf=".ckpt" in model_name_or_path,
    config=config,
    cache_dir=cache_dir,
    # the remaining keyword arguments are specific to BertForUnifiedCRBase
    num_seq_labels=2,
    loss_function='simple',
    alpha=alpha,
)
# preview model architecture
print('### Model ###')
print(model)

tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    cache_dir=cache_dir,
    use_fast=True,
)
# preview sentence tokenizer
print('### Tokenizer ###')
print(tokenizer)

### Config ###
BertConfig {
  "_name_or_path": "tanfiona/unicausal",
  "architectures": [
    "BertForUnifiedCRBase"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 29000
}

### Model ###
BertForUnifiedCRBase(
  (bert): BertModel(
    (embeddings)

### Make Predictions

There are three tasks overall: sequence classification, span detection, and pair classification. We will go through examples for each task.


Before getting to the examples, here is some data loading and processing work that is necessary. Feel free to just run the cells without knowing the details. Examples follow immediately and they are more easily understandable.

In [21]:
## Preparation and house-keeping stuff - can just run and skip
from transformers import default_data_collator
from _datasets.unifiedcre import load_cre_dataset, available_datasets
from torch.utils.data import DataLoader

# Load the 'altlex' example data (validation side only: do_train=False) from ../data.
span_datasets, seq_datasets, stats = load_cre_dataset(
    dataset_name=['altlex'],
    do_train_val=True,
    do_train=False,
    data_dir='../data',
)

# Fill value per column for positions that carry no real data.
# Label-like columns use -100, the id that the tokenization helpers below assign to ignored positions.
PADDING_DICT = {
    'input_ids': 0,
    'tokens': '[PAD]',
    'attention_mask': 0,
    'labels': -100,
    'label': -100,
    'ce_tags': -100,
    'ce_tags1': -100,
    'ce_tags2': -100,
    'token_type_ids': 0,
}

# Tokenization settings: every example is padded / truncated to max_seq_length tokens.
max_seq_length = 128
padding = "max_length"

# Column names used by the processing helpers below.
text_column_name = "text"
span_label_column_name = "ce_tags"
seq_label_column_name = "label"

# The first split of each dataset collection serves as the reference for its column layout.
span_structure_source = list(span_datasets)[0]
seq_structure_source = list(seq_datasets)[0]
features = span_datasets[span_structure_source].features

# aux function for tokenizing raw sequence texts
def tokenize_and_add_tags(examples):
    """Tokenize a batch of raw text examples and attach placeholder span tags.

    The batch has no span annotations, so each of the three span-tag columns is filled
    with the padding id for every token position. The sequence-level label column is
    copied through unchanged.

    Args:
        examples: batch mapping that provides `text_column_name` and `seq_label_column_name`.

    Returns:
        The tokenizer output extended with the three span-tag columns and the sequence label column.
    """
    encoded = tokenizer(
        examples[text_column_name],
        max_length=max_seq_length,
        padding=padding,
        truncation=True,
        is_split_into_words=False
    )

    # One all-padding tag row per example, as long as that example's token ids.
    placeholder_tags = [
        [PADDING_DICT[span_label_column_name]] * len(token_ids)
        for token_ids in encoded["input_ids"]
    ]

    for suffix in ("", "1", "2"):
        encoded[f"{span_label_column_name}{suffix}"] = placeholder_tags
    encoded[seq_label_column_name] = examples[seq_label_column_name]

    return encoded

# Run the text tokenizer over the sequence datasets in batches; the raw columns
# (taken from the reference split's column names) are dropped from the result.
processed_seq_datasets = seq_datasets.map(
    tokenize_and_add_tags,
    desc="Running tokenizer on dataset",
    remove_columns=seq_datasets[seq_structure_source].column_names,
    batched=True,
)

# aux function for tokenizing span datasets
def tokenize_and_align_tags(examples):
    """Tokenize a batch of pre-split word lists and align the three span-tag columns to the tokens.

    For each tag column, the first sub-token of a word receives the id of that word's tag
    (looked up in `label_to_id`). Special tokens (word id None) and the remaining sub-tokens
    of a word receive the column's padding id from `PADDING_DICT`.

    Args:
        examples: batch mapping that provides `text_column_name` (lists of words), the three
            span-tag columns (one tag per word) and `seq_label_column_name`.

    Returns:
        The tokenizer output extended with the aligned span-tag columns and the sequence label column.
    """
    tokenized_inputs = tokenizer(
        examples[text_column_name],
        max_length=max_seq_length,
        padding=padding,
        truncation=True,
        # the texts are already lists of words, with one tag per word
        is_split_into_words=True,
    )

    tag_columns = (
        span_label_column_name,
        f"{span_label_column_name}1",
        f"{span_label_column_name}2",
    )
    aligned = {column: [] for column in tag_columns}

    per_example_tags = zip(*(examples[column] for column in tag_columns))
    for batch_index, word_tags in enumerate(per_example_tags):
        rows = {column: [] for column in tag_columns}
        last_word_idx = None
        for word_idx in tokenized_inputs.word_ids(batch_index=batch_index):
            # Only the first sub-token of a real word carries the word's tag.
            is_word_start = word_idx is not None and word_idx != last_word_idx
            for column, tags in zip(tag_columns, word_tags):
                if is_word_start:
                    rows[column].append(label_to_id[tags[word_idx]])
                else:
                    # special token or continuation sub-token: use the padding id
                    rows[column].append(PADDING_DICT[column])
            last_word_idx = word_idx
        for column in tag_columns:
            aligned[column].append(rows[column])

    for column in tag_columns:
        tokenized_inputs[column] = aligned[column]
    tokenized_inputs[seq_label_column_name] = examples[seq_label_column_name]

    return tokenized_inputs


# Run the word-level tokenizer over the span datasets in batches; the raw columns
# (taken from the reference split's column names) are dropped from the result.
processed_span_datasets = span_datasets.map(
    tokenize_and_align_tags,
    desc="Running tokenizer on dataset",
    remove_columns=span_datasets[span_structure_source].column_names,
    batched=True,
)



  0%|          | 0/1 [00:00<?, ?it/s]

Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

In [25]:
## Make pytorch dataloader objects for model prediction
# use default data collator as padding is already done at max length
data_collator = default_data_collator

# single place to change the evaluation batch size used by all three dataloaders
eval_batch_size = 8

# define dataloaders for prediction
eval_dataset = processed_span_datasets['span_validation']

# for span detection
eval_pspan_dataloader = DataLoader(
    eval_dataset,
    shuffle=False,  # keep the original order so batches line up with the corpus column below
    collate_fn=data_collator,
    batch_size=eval_batch_size,
)
# corpus name of every span example, and the distinct corpus names
eval_pspan_corpus_col = span_datasets["span_validation"]["corpus"]
eval_pspan_unique_corpus = list(set(eval_pspan_corpus_col))

# for sequence classification
eval_aseq_dataloader = DataLoader(
    processed_seq_datasets['seq_validation'],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=eval_batch_size,
)
eval_aseq_corpus_col = seq_datasets["seq_validation"]["corpus"]
# the span corpora are included as well, so sequence metrics can also be recorded for span examples
eval_aseq_unique_corpus = list(set(eval_aseq_corpus_col+eval_pspan_unique_corpus))

# for pair classification
eval_apair_dataloader = DataLoader(
    processed_seq_datasets['pair_validation'],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=eval_batch_size,
)
eval_apair_corpus_col = seq_datasets["pair_validation"]["corpus"]
eval_apair_unique_corpus = list(set(eval_apair_corpus_col))

In [None]:
# Accumulators for predictions and references, filled by the prediction cells below.
# span detection
all_preds = []
all_refs = []
# sequence classification
all_seq_preds = []
all_seq_refs = []
# pair classification
all_pair_preds = []
all_pair_refs = []

Now we are ready to run predictions on sequence classification examples.

In [35]:
# sequence classification
from tqdm import tqdm
from datasets import load_metric
import torch

# one metric object per corpus, plus an 'all' bucket that aggregates every example
metric = {d: load_metric('seqeval') for d in eval_pspan_unique_corpus + ['all']}
seq_metric = {d: load_metric('../utils/seq_metrics.py') for d in eval_aseq_unique_corpus + ['all']}
pair_metric = {d: load_metric('../utils/seq_metrics.py') for d in eval_apair_unique_corpus + ['all']}

print("***** Running prediction *****")
print(f"  Num seq examples = {len(eval_aseq_corpus_col)}")

model.eval()

# read the batch size from the dataloader so the corpus slicing below always matches it
seq_batch_size = eval_aseq_dataloader.batch_size

for step, batch in enumerate(tqdm(eval_aseq_dataloader)):
    with torch.no_grad():
        outputs = model(**batch)

    if step == 0:
        # pass the dict itself: unpacking it with ** would hand its keys to print() as keyword arguments
        print('input batch:', batch)
        print('output logits:', outputs.seq_logits)

    # default_data_collator stores a per-example `label` column under the key `labels`;
    # fall back to the raw column name in case a different collator is used.
    seq_label_key = "labels" if "labels" in batch else seq_label_column_name

    # Get Seq Predictions & References
    seq_preds = outputs.seq_logits.argmax(dim=-1).detach().cpu().tolist()
    seq_refs = batch[seq_label_key].detach().cpu().tolist()

    # Add to metrics
    seq_metric['all'].add_batch(
        predictions=seq_preds,
        references=seq_refs
    )

    # Add to metrics by dataset name (the dataloader is not shuffled, so positions line up)
    corps = eval_aseq_corpus_col[step * seq_batch_size:(step + 1) * seq_batch_size]
    for i, d in enumerate(corps):
        seq_metric[d].add(
            prediction=seq_preds[i],
            reference=seq_refs[i],
        )

    # Store predictions
    all_seq_preds.extend(seq_preds)
    all_seq_refs.extend(seq_refs)

***** Running prediction *****
  Num seq examples = 286


  0%|          | 0/36 [00:00<?, ?it/s]

{'labels': tensor([0, 0, 0, 0, 0, 0, 0, 0]), 'input_ids': tensor([[  101,  1109,   139,  ...,     0,     0,     0],
        [  101,  1130,  1901,  ...,     0,     0,     0],
        [  101,  1913,  8185,  ...,     0,     0,     0],
        ...,
        [  101,  1332, 16991,  ...,     0,     0,     0],
        [  101,  1212,   129,  ...,     0,     0,     0],
        [  101,  2091,   183,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'ce_tags': tensor([[-100, -100, -100,  ..., -100, -100, -100],
        [-100, -100, -100,  ..., -100, -10




For span detection tasks:

In [None]:
print("***** Running prediction *****")
print(f"  Num span examples = {len(eval_pspan_corpus_col)}")


def _ids_to_label_lists(predictions, labels, ignore_id=-100):
    """Turn batched id tensors into per-example lists of label strings.

    Positions whose reference id equals `ignore_id` are dropped from both the
    predictions and the references; the remaining ids are mapped to strings
    through `label_list`.

    Args:
        predictions: tensor of predicted label ids, one row per example.
        labels: tensor of reference label ids with the same layout.
        ignore_id: reference id that marks positions to skip.

    Returns:
        (preds, refs): two lists holding one list of label strings per example.
    """
    preds, refs = [], []
    for pred_row, label_row in zip(predictions.detach().cpu().tolist(), labels.detach().cpu().tolist()):
        kept = [(p, l) for p, l in zip(pred_row, label_row) if l != ignore_id]
        preds.append([label_list[p] for p, _ in kept])
        refs.append([label_list[l] for _, l in kept])
    return preds, refs


# NOTE(review): the original cell called `format(predictions=..., labels=..., remove_if_no_ce=False)`.
# No `format` helper is defined or imported in this notebook, so the name resolves to the Python
# builtin, which rejects keyword arguments. `_ids_to_label_lists` is assumed to match the intended
# helper for remove_if_no_ce=False (every example is kept) -- confirm against the project's evaluation script.

model.eval()

# read the batch size from the dataloader so the corpus slicing below always matches it
# (the original referenced `args.per_device_eval_batch_size`, but `args` does not exist in this notebook)
span_batch_size = eval_pspan_dataloader.batch_size

for step, batch in enumerate(tqdm(eval_pspan_dataloader)):
    with torch.no_grad():
        outputs = model(**batch)

    if step == 0:
        # pass the dict itself: unpacking it with ** would hand its keys to print() as keyword arguments
        print('input batch:', batch)
        print('output logits:', outputs.tok_logits)

    # Get Span Predictions & References (one pair per span-tag column)
    preds, refs = _ids_to_label_lists(
        predictions=outputs.tok_logits.argmax(dim=-1),
        labels=batch[span_label_column_name],
    )
    preds1, refs1 = _ids_to_label_lists(
        predictions=outputs.tok_logits1.argmax(dim=-1),
        labels=batch[f"{span_label_column_name}1"],
    )
    preds2, refs2 = _ids_to_label_lists(
        predictions=outputs.tok_logits2.argmax(dim=-1),
        labels=batch[f"{span_label_column_name}2"],
    )

    # default_data_collator stores a per-example `label` column under the key `labels`;
    # fall back to the raw column name in case a different collator is used.
    seq_label_key = "labels" if "labels" in batch else seq_label_column_name

    # Get Seq Predictions & References
    seq_preds = outputs.seq_logits.argmax(dim=-1).detach().cpu().tolist()
    seq_refs = batch[seq_label_key].detach().cpu().tolist()

    # Add to metrics
    # predictions and references are expected to be a nested list of labels, not label_ids
    for span_preds, span_refs in ((preds, refs), (preds1, refs1), (preds2, refs2)):
        metric['all'].add_batch(
            predictions=span_preds,
            references=span_refs
        )
    seq_metric['all'].add_batch(
        predictions=seq_preds,
        references=seq_refs
    )

    # Add to metrics by dataset name (the dataloader is not shuffled, so positions line up)
    corps = eval_pspan_corpus_col[step * span_batch_size:(step + 1) * span_batch_size]
    for i, d in enumerate(corps):
        for span_preds, span_refs in ((preds, refs), (preds1, refs1), (preds2, refs2)):
            metric[d].add(
                prediction=span_preds[i],
                reference=span_refs[i],
            )
        seq_metric[d].add(
            prediction=seq_preds[i],
            reference=seq_refs[i],
        )

    # Store predictions
    all_preds.extend(preds)
    all_refs.extend(refs)
    all_preds.extend(preds1)
    all_refs.extend(refs1)
    all_preds.extend(preds2)
    all_refs.extend(refs2)
    all_seq_preds.extend(seq_preds)
    all_seq_refs.extend(seq_refs)

In [None]:
print("***** Running prediction *****")
print(f"  Num pair examples = {len(eval_apair_corpus_col)}")

model.eval()

# read the batch size from the dataloader so the corpus slicing below always matches it
pair_batch_size = eval_apair_dataloader.batch_size

for step, batch in enumerate(tqdm(eval_apair_dataloader)):
    with torch.no_grad():
        outputs = model(**batch)

    if step == 0:
        # pass the dict itself: unpacking it with ** would hand its keys to print() as keyword arguments
        print('input batch:', batch)
        print('output logits:', outputs.seq_logits)

    # default_data_collator stores a per-example `label` column under the key `labels`;
    # fall back to the raw column name in case a different collator is used.
    seq_label_key = "labels" if "labels" in batch else seq_label_column_name

    # Get Seq Predictions & References
    seq_preds = outputs.seq_logits.argmax(dim=-1).detach().cpu().tolist()
    seq_refs = batch[seq_label_key].detach().cpu().tolist()

    # Add to metrics
    pair_metric['all'].add_batch(
        predictions=seq_preds,
        references=seq_refs
    )

    # Add to metrics by dataset name (the dataloader is not shuffled, so positions line up)
    corps = eval_apair_corpus_col[step * pair_batch_size:(step + 1) * pair_batch_size]
    for i, d in enumerate(corps):
        pair_metric[d].add(
            prediction=seq_preds[i],
            reference=seq_refs[i],
        )

    # Store predictions
    all_pair_preds.extend(seq_preds)
    all_pair_refs.extend(seq_refs)

### Evaluation Metrics and Results

In [None]:
# Evaluate above predictions
# Each metric object accumulated examples in the prediction cells; compute() turns them into scores.
for d in eval_pspan_unique_corpus+['all']:
    # NOTE(review): the original called `compute_metrics(d)`, which is not defined or imported in this
    # notebook. The span metric is computed directly here, the same way as the seq / pair metrics below --
    # confirm whether the project helper post-processed the result further.
    eval_metric = metric[d].compute()
    print(f"span predictions for '{d}' : {eval_metric}")
for d in list(set(eval_pspan_unique_corpus+eval_aseq_unique_corpus))+['all']:
    eval_metric = seq_metric[d].compute()
    print(f"seq predictions for '{d}' : {eval_metric}")
for d in eval_apair_unique_corpus+['all']:
    eval_metric = pair_metric[d].compute()
    print(f"pair predictions for '{d}' : {eval_metric}")

TODOs:
* Explain F1 score
* Print raw input sentences instead of tokenized sentences 