In [None]:
%pip -q install pytorch-partial-tagger

# Download datasets

First, download the datasets provided in the [teffland/ner-expected-entity-ratio](https://github.com/teffland/ner-expected-entity-ratio/tree/main) repository. We use the datasets for the Non-Native Speaker Scenario (NNS) experimental setting (Recall=50%, Precision=90%) from Effland and Collins (2021).


In [None]:
!curl -LO https://raw.githubusercontent.com/teffland/ner-expected-entity-ratio/main/data/conll2003/eng/entity.train_r0.5_p0.9.jsonl
!curl -LO https://raw.githubusercontent.com/teffland/ner-expected-entity-ratio/main/data/conll2003/eng/entity.dev.jsonl
!curl -LO https://raw.githubusercontent.com/teffland/ner-expected-entity-ratio/main/data/conll2003/eng/entity.test.jsonl

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 3918k  100 3918k    0     0  79.7M      0 --:--:-- --:--:-- --:--:-- 79.7M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1165k  100 1165k    0     0  45.5M      0 --:--:-- --:--:-- --:--:-- 45.5M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1114k  100 1114k    0     0  36.2M      0 --:--:-- --:--:-- --:--:-- 36.2M


# Import all dependencies

In [None]:
import json
import logging
import random
from typing import Any

import numpy as np
import torch

from partial_tagger.data import CharBasedTags
from partial_tagger.recognizer import Recognizer
from partial_tagger.training import Trainer
from partial_tagger.utils import Metric, create_tag

# Prepare datasets

Next, prepare your datasets. Each item of a dataset must consist of a string and its tags. The string is `text` in the code below. The tags are a collection of tags, each with a start, a length, and a label; they are `tags` in the code below. The start is the character position in the text where the tag begins. The length is the number of characters between the beginning and the end of the tag. The label is what you want to assign to the span of text defined by the start and the length.

In [None]:
def load_dataset(path: str) -> list:
    """Load a JSON Lines NER dataset as ``(text, CharBasedTags)`` pairs.

    Every non-blank line of the file must be a JSON object with:

    * ``"tokens"``: a list of token strings, and
    * ``"gold_annotations"``: a list of objects with ``"start"`` (an index
      into ``"tokens"``), ``"mention"`` and ``"type"``.

    Tokens are joined with single spaces to build the text, and each
    annotation's token index is translated to the character offset at which
    that token starts in the joined text. The tag length is ``len()`` of the
    annotation's ``"mention"`` value (assumed to be the mention's surface
    string as it appears in the joined text -- TODO confirm).

    Args:
        path: Path to the ``.jsonl`` file to read.

    Returns:
        A list with one ``(text, CharBasedTags)`` tuple per record, in file
        order. Blank lines are skipped, so an empty file yields ``[]``.
    """
    dataset = []

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. extra newlines at the end of the file).
                continue

            data = json.loads(line)
            tokens = data["tokens"]

            text = " ".join(tokens)

            # Map each token index to the character offset where it starts in `text`.
            mapping = {}
            now = 0
            for i, token in enumerate(tokens):
                mapping[i] = now
                now += len(token) + 1  # Add one for the joining space

            tags = tuple(
                create_tag(
                    mapping[annotation["start"]],
                    len(annotation["mention"]),
                    annotation["type"],
                )
                for annotation in data["gold_annotations"]
            )

            dataset.append((text, CharBasedTags(tags, text)))

    return dataset


# Load the three CoNLL-2003 splits downloaded earlier: the noisy training
# split first, then the dev and test splits.
train_dataset, dev_dataset, test_dataset = (
    load_dataset(split_path)
    for split_path in (
        "entity.train_r0.5_p0.9.jsonl",
        "entity.dev.jsonl",
        "entity.test.jsonl",
    )
)

# Train your tagger

Train your tagger by initializing a `Trainer` and passing the datasets to it. After training, the trainer returns a `Recognizer` object, which predicts character-based tags from given texts. Before starting training, we prepare two utility functions: one fixes the random state, and the other displays the training logs.

In [None]:
def fix_state(seed: int) -> None:
    """Seed every random number generator used here and pin cuDNN settings.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (default
    and all-CUDA-device seeding), then turns cuDNN benchmarking off and
    cuDNN deterministic mode on.

    Args:
        seed: The seed value handed to each generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


class JSONAdapter(logging.LoggerAdapter):
    """Logger adapter that serializes every log message to a JSON string."""

    def process(self, msg: Any, kwargs: Any) -> Any:
        """Return ``msg`` encoded with ``json.dumps`` together with ``kwargs`` unchanged."""
        return json.dumps(msg), kwargs


def get_logger(log_name: str, log_file: str) -> JSONAdapter:
    """Build a JSON-emitting logger that writes to the stream and to a file.

    The underlying logger is set to ``INFO``, does not propagate to its
    parents, and ends up with exactly two handlers: a ``StreamHandler`` and
    a ``FileHandler`` that truncates ``log_file`` (mode ``"w"``, UTF-8).

    Calling this function again with the same ``log_name`` (for example when
    a notebook cell is re-run) replaces the previously attached handlers
    instead of stacking new ones on top, so messages are not emitted more
    than once and old file handles are closed.

    Args:
        log_name: Name passed to ``logging.getLogger``.
        log_file: Path of the file that receives the log lines.

    Returns:
        A ``JSONAdapter`` wrapping the configured logger.
    """
    logger = logging.getLogger(log_name)
    logger.propagate = False

    logger.setLevel(logging.INFO)

    # logging.getLogger returns the same object for the same name, so drop
    # (and close) any handlers left over from an earlier call.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    logger.addHandler(logging.StreamHandler())
    logger.addHandler(logging.FileHandler(log_file, mode="w", encoding="utf-8"))

    # `extra` is passed explicitly so construction does not rely on the
    # argument being optional; JSONAdapter.process never reads it.
    return JSONAdapter(logger, {})


# Training parameters
seed = 0
# Train on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "roberta-base"
batch_size = 15
num_epochs = 20
learning_rate = 2e-5
gradient_clip_value = 5.0
padding_index = -1
unknown_index = -100
# Training logs are written to this file (and to a stream handler) by get_logger.
train_log_file = "log.jsonl"
# NOTE(review): presumably forwarded to the tokenizer call -- confirm against Trainer.
tokenizer_args = {"padding": True, "return_tensors": "pt"}

# Seed every random number generator before the trainer is created.
fix_state(seed)

# Trainer receives its settings positionally, so the argument order below matters.
trainer = Trainer(
    model_name,
    batch_size,
    num_epochs,
    learning_rate,
    gradient_clip_value,
    padding_index,
    unknown_index,
    tokenizer_args,
)

# Calling the trainer with the train/dev datasets, the device and a logger
# returns the recognizer that is used for prediction during evaluation.
recognizer = trainer(
    train_dataset,
    dev_dataset,
    device,
    get_logger(f"{__name__}.train", train_log_file)
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
{"epoch": 1, "loss": 93359.27760740928, "validation_f1_score": 0.8923462986198244, "validation_precision": 0.8870779976717112, "validation_recall": 0.8976775496465836}
{"epoch": 2, "loss": 37318.74901078269, "validation_f1_score": 0.9184864144024004, "validation_precision": 0.9098414795244386, "v

# Evaluate your tagger

Evaluate the performance of your tagger on the test set using `Metric`, as shown below.

In [None]:
# The test scores are logged as one JSON line to this file (and to a stream handler).
test_scores_file = "scores.json"
logger = get_logger(f"{__name__}.evaluate", test_scores_file)


# Split the (text, tags) pairs into parallel sequences of texts and gold tags.
texts, ground_truths = zip(*test_dataset)

# Predict tags for every test text with the trained recognizer.
predictions = recognizer(texts, batch_size, device)

# Feed the predictions and gold tags to the metric, then read the scores back.
metric = Metric()
metric(predictions, ground_truths)

# Prefix every score name with "test_" before logging it.
logger.info({f"test_{key}": value for key, value in metric.get_scores().items()})

{"test_f1_score": 0.88945295404814, "test_precision": 0.8795222433789164, "test_recall": 0.8996104815864022}
