In [1]:
# Hugging Face Hub checkpoint id; both the tokenizer and the token-classification
# model below are loaded from it.
model_name = "cointegrated/rubert-tiny"
# Per-device batch size, reused for training and evaluation in TrainingArguments.
batch_size = 16

In [2]:
from datasets import load_dataset
# Resolve the dataset once and pick both splits from the same DatasetDict
# instead of calling load_dataset twice.
conll = load_dataset('eriktks/conll2003')
train = conll['train']
test = conll['test']
# Tag id -> tag name. Presumably mirrors the order of the dataset's `ner_tags`
# feature -- TODO confirm against train.features['ner_tags'].
ner = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
import pandas as pd
def _split_to_frame(split):
    """Copy the id / tokens / ner_tags columns of a dataset split into a new DataFrame."""
    frame = pd.DataFrame()
    for column in ('id', 'tokens', 'ner_tags'):
        frame[column] = split[column]
    return frame


train_df = _split_to_frame(train)
test_df = _split_to_frame(test)

In [4]:
def upd(s, mapping=None):
  """Translate a sequence of NER tag ids into their string labels.

  A new list is returned and `s` itself is left untouched, so the conversion
  can be re-run safely. Elements that are already strings are passed through
  unchanged, so applying the function to its own output is a no-op.

  Args:
    s: iterable of tag ids (or already-converted string tags).
    mapping: optional id -> label dict; defaults to the notebook-level `ner`.

  Returns:
    list of string labels, one per element of `s`.

  Raises:
    KeyError: if a non-string element is not a key of the mapping.
  """
  if mapping is None:
    mapping = ner
  return [tag if isinstance(tag, str) else mapping[tag] for tag in s]

In [5]:
# Replace the numeric tag ids in both frames with their string labels.
for frame in (train_df, test_df):
    frame['ner_tags'] = frame['ner_tags'].apply(upd)

In [6]:
from datasets import Dataset, DatasetDict
# Wrap both pandas frames as Hugging Face datasets keyed by split name.
ner_df = DatasetDict({
    split_name: Dataset.from_pandas(pd.DataFrame(frame))
    for split_name, frame in (('train', train_df), ('test', test_df))
})
ner_df

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 14041
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 3453
    })
})

In [7]:
from transformers import AutoTokenizer

# Load the tokenizer from the same checkpoint id the model is loaded from further down.
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [8]:
# labels_ner[i] has to be the label whose id is i: tokenize() turns tags into
# ids via positions in this list and compute_metrics() indexes it with ids.
# Build it by id instead of relying on the insertion order of `ner`.
assert sorted(ner) == list(range(len(ner))), "label ids must be contiguous and start at 0"
labels_ner = [ner[label_id] for label_id in range(len(ner))]
labels_ner

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [9]:
def tokenize(input_token, label_all_tokens=True):
    """Tokenize a batch of pre-split sentences and align NER labels to sub-tokens.

    Args:
        input_token: batch with a "tokens" entry (one list of words per example)
            and a parallel "ner_tags" entry (one tag per word, either a string
            from `labels_ner` or an already-numeric id).
        label_all_tokens: when True every sub-token of a word receives that
            word's label; when False only the first sub-token is labelled and
            the remaining ones get -100.

    Returns:
        The tokenizer output for the batch with an added "labels" entry holding
        one list of label ids per example. Positions whose word id is None are
        labelled -100.

    Raises:
        KeyError: if a string tag is not present in `labels_ner`.
    """
    # NOTE(review): with label_all_tokens=True a word tagged B-X produces several
    # consecutive B-X sub-token labels. The Hugging Face run_ner.py example maps
    # continuation pieces to I-X instead -- consider doing the same.
    tokenized = tokenizer(input_token["tokens"], truncation=True, is_split_into_words=True)

    # Resolve tag -> id once per batch instead of a linear list.index per
    # sub-token. setdefault keeps the first position of a tag, matching list.index.
    label_to_id = {}
    for position, tag in enumerate(labels_ner):
        label_to_id.setdefault(tag, position)

    labels = []
    for i, word_labels in enumerate(input_token['ner_tags']):
        previous_word_idx = None
        label_ids = []
        for word_idx in tokenized.word_ids(batch_index=i):
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx or label_all_tokens:
                # First piece of a word, or a continuation piece that is
                # labelled as well because label_all_tokens is set.
                tag = word_labels[word_idx]
                label_ids.append(label_to_id[tag] if isinstance(tag, str) else tag)
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized["labels"] = labels
    return tokenized

In [10]:
tokenized_datasets = ner_df.map(tokenize, batched=True)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [11]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# Prepare the label <-> id mappings up front, then load the checkpoint with a
# classification head sized to the label set and attach the mappings to its config.
id_to_label = ner
label_to_id = {label: label_id for label_id, label in id_to_label.items()}

model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(labels_ner),
)
model.config.id2label = id_to_label
model.config.label2id = label_to_id

Some weights of BertForTokenClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Training configuration. `eval_strategy` is the current name of the argument
# formerly spelled `evaluation_strategy`, which transformers has deprecated.
args = TrainingArguments(
    "ner",  # output directory
    eval_strategy="epoch",  # run evaluation at the end of every epoch
    learning_rate=2e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=25,
    weight_decay=0.01,
    save_strategy='no',  # no checkpoints are written
    report_to='none',  # no external experiment trackers
)



In [13]:
from transformers import DataCollatorForTokenClassification
from datasets import load_metric
# Collator handed to the Trainer to assemble token-classification batches.
data_collator = DataCollatorForTokenClassification(tokenizer)
# seqeval metric used by compute_metrics.
# NOTE(review): this call emits a warning in the captured output; `load_metric`
# is deprecated in `datasets` in favour of the separate `evaluate` package --
# consider migrating once that dependency is acceptable.
metric = load_metric("seqeval")

  metric = load_metric("seqeval")


In [14]:
import numpy as np

def compute_metrics(p):
    """Compute overall seqeval precision / recall / F1 / accuracy for an evaluation pass.

    Args:
        p: pair of (model scores, label ids). The predicted class is the argmax
            over axis 2 of the scores.

    Returns:
        dict with the keys "precision", "recall", "f1" and "accuracy", taken
        from the corresponding "overall_*" entries of the metric result.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept_predictions = []
        kept_labels = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            # Positions labelled -100 are excluded from scoring.
            if gold_id == -100:
                continue
            kept_predictions.append(labels_ner[predicted_id])
            kept_labels.append(labels_ner[gold_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    results = metric.compute(predictions=true_predictions, references=true_labels, zero_division=0)
    return {
        name: results["overall_" + name]
        for name in ("precision", "recall", "f1", "accuracy")
    }

In [15]:
# Assemble the Trainer from the pieces built above.
# NOTE(review): the "test" split doubles as the per-epoch evaluation set here.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [16]:
# Baseline: score the model on the eval set before any fine-tuning has run.
trainer.evaluate()


{'eval_loss': 2.2662930488586426,
 'eval_model_preparation_time': 0.0011,
 'eval_precision': 0.02148415683062144,
 'eval_recall': 0.10212389380530973,
 'eval_f1': 0.03550004614390747,
 'eval_accuracy': 0.06952624252344311,
 'eval_runtime': 4.5634,
 'eval_samples_per_second': 756.672,
 'eval_steps_per_second': 47.333}

In [18]:
# Fine-tune with the settings in `args`; metrics are reported once per epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Model Preparation Time,Precision,Recall,F1,Accuracy
1,0.2834,0.201249,0.0011,0.787269,0.806637,0.796835,0.945327
2,0.0801,0.205758,0.0011,0.807482,0.825133,0.816212,0.95085
3,0.0457,0.240675,0.0011,0.806039,0.824513,0.815171,0.949504
4,0.0318,0.275601,0.0011,0.79172,0.822478,0.806806,0.946674
5,0.0196,0.308342,0.0011,0.79261,0.823894,0.807949,0.94623
6,0.0144,0.297635,0.0011,0.80312,0.820177,0.811559,0.949075
7,0.0124,0.307356,0.0011,0.800425,0.832655,0.816222,0.949963
8,0.0081,0.325962,0.0011,0.800357,0.833363,0.816526,0.949657
9,0.007,0.349816,0.0011,0.797572,0.831239,0.814057,0.94831
10,0.0062,0.366001,0.0011,0.802287,0.832035,0.81689,0.949244


TrainOutput(global_step=21950, training_loss=0.018819857269593416, metrics={'train_runtime': 462.7487, 'train_samples_per_second': 758.565, 'train_steps_per_second': 47.434, 'total_flos': 249218802771012.0, 'train_loss': 0.018819857269593416, 'epoch': 25.0})

In [19]:
# Score the fine-tuned model on the same eval set used for the baseline.
trainer.evaluate()

{'eval_loss': 0.4158036410808563,
 'eval_model_preparation_time': 0.0011,
 'eval_precision': 0.8207358441332522,
 'eval_recall': 0.8350442477876107,
 'eval_f1': 0.8278282230117998,
 'eval_accuracy': 0.9521041440394058,
 'eval_runtime': 2.8819,
 'eval_samples_per_second': 1198.162,
 'eval_steps_per_second': 74.95,
 'epoch': 25.0}

In [24]:
from transformers import pipeline

# Run inference on the device the fine-tuned model already lives on. Without an
# explicit `device` the pipeline warns that the model will be on CPU even
# though a hardware accelerator is available.
pipe = pipeline(
    task='ner',
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy='average',
    device=model.device,
)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [34]:
# Fetch the single example row once instead of materialising the whole
# `tokens` column twice, and reuse the joined sentence for both calls.
# NOTE(review): this sentence comes from the training split, so the output only
# shows how the model behaves on data it was fitted on.
sample_text = ' '.join(ner_df['train'][4]['tokens'])
print(sample_text)
pipe(sample_text)

Germany 's representative to the European Union 's veterinary committee Werner Zwingmann said on Wednesday consumers should buy sheepmeat from countries other than Britain until the scientific advice was clearer .


[{'entity_group': 'LOC',
  'score': 0.99998736,
  'word': 'Germany',
  'start': 0,
  'end': 7},
 {'entity_group': 'ORG',
  'score': 0.9999885,
  'word': 'European Union',
  'start': 33,
  'end': 47},
 {'entity_group': 'PER',
  'score': 0.99999017,
  'word': 'Werner Zwingmann',
  'start': 72,
  'end': 88},
 {'entity_group': 'LOC',
  'score': 0.9999924,
  'word': 'Britain',
  'start': 164,
  'end': 171}]