In [10]:
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, TrainingArguments, Trainer

In [11]:
# Load the CoNLL-2003 dataset for named-entity recognition (NER)
dataset = load_dataset(
    path="conll2003",
    trust_remote_code=True,  # opt in to executing code shipped with the dataset repository
)

In [12]:
# Grab one training record (index 848) and display it to see which fields a sample carries.
example = dataset["train"][848]
example

{'id': '848',
 'tokens': ['Dean',
  'Palmer',
  'hit',
  'his',
  '30th',
  'homer',
  'for',
  'the',
  'Rangers',
  '.'],
 'pos_tags': [22, 22, 38, 29, 16, 21, 15, 12, 23, 7],
 'chunk_tags': [11, 12, 21, 11, 12, 12, 13, 11, 12, 0],
 'ner_tags': [1, 2, 0, 0, 0, 0, 0, 0, 3, 0]}

In [13]:
# Start from the ordered tag list: position in the list is the integer id of the tag.
id2label = dict(enumerate([
    "O", "B-PER", "I-PER", "B-ORG", "I-ORG",
    "B-LOC", "I-LOC", "B-MISC", "I-MISC",
]))
# Invert the table to get the tag -> id lookup.
label2id = {tag: tag_id for tag_id, tag in id2label.items()}
label2id

{'O': 0,
 'B-PER': 1,
 'I-PER': 2,
 'B-ORG': 3,
 'I-ORG': 4,
 'B-LOC': 5,
 'I-LOC': 6,
 'B-MISC': 7,
 'I-MISC': 8}

In [14]:
# Load the tokenizer and a token-classification model from the same "bert-base-cased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(label2id),  # one output class per tag in label2id
    label2id=label2id,
    id2label=id2label,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Tokenize the pre-split example and show the original words for comparison.
token_ids = tokenizer(example['tokens'], is_split_into_words=True)['input_ids']
print(example['tokens'])
# Map the ids back to token strings; a single word can come back as several pieces
# (see 'home', '##r' in the output), which is why labels need re-aligning later.
tokenizer.convert_ids_to_tokens(token_ids)

['Dean', 'Palmer', 'hit', 'his', '30th', 'homer', 'for', 'the', 'Rangers', '.']


['[CLS]',
 'Dean',
 'Palmer',
 'hit',
 'his',
 '30th',
 'home',
 '##r',
 'for',
 'the',
 'Rangers',
 '.',
 '[SEP]']

In [20]:
def align_labels(examples):
    """Tokenize a batch of pre-split sentences and align NER tags to the tokens.

    Uses the module-level ``tokenizer``. Each word's tag is attached to the
    first token produced for that word, unchanged, so B-XXX tags stay B-XXX.
    Special tokens (word id ``None``) and the remaining pieces of a split word
    receive ``-100`` so they can be skipped when labels are consumed
    (``compute_metrics`` filters on this value).

    Args:
        examples: Batch mapping with "tokens" (one list of words per example)
            and "ner_tags" (one list of integer tag ids per example, indexed
            by word position).

    Returns:
        The tokenizer output for the batch with an added "labels" entry: one
        list of label ids per example, one id per token.
    """
    token_ids = tokenizer(
        examples["tokens"],
        truncation=True,  # Enable truncation if necessary
        is_split_into_words=True,  # Indicate that input is pre-split into words
    )

    updated_labels = []

    # Iterate over each example in the batch
    for index, label in enumerate(examples["ner_tags"]):
        # word_ids maps every token position back to the index of its source word
        word_ids = token_ids.word_ids(batch_index=index)
        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                # Special token, or a continuation piece of the current word: ignore
                label_ids.append(-100)
            else:
                # First token of a new word keeps the word's original tag
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx

        updated_labels.append(label_ids)

    # Add the aligned labels to the tokenized output
    token_ids["labels"] = updated_labels
    return token_ids

# Run `align_labels` over the dataset; batched=True hands the function a batch of examples
# per call. The result is indexed by split further down (tokenized["train"], tokenized["test"]).
tokenized = dataset.map(align_labels, batched=True)

In [21]:
# Evaluation

# Load the seqeval metric; compute_metrics calls seqeval.compute(...) and reports its "overall_f1".
seqeval = evaluate.load('seqeval')

def compute_metrics(eval_pred):
    """Compute the overall F1 score of token-level predictions with seqeval.

    Uses the module-level ``seqeval`` metric and ``id2label`` mapping.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` is indexed as
            [sequence, token, class]; ``labels`` holds one row of label ids
            per sequence, with ``-100`` marking positions to ignore.

    Returns:
        dict with a single key "f1" holding seqeval's "overall_f1".
    """
    logits, labels = eval_pred
    # Most likely class id for every token position
    predictions = np.argmax(logits, axis=2)

    true_predictions = []
    true_labels = []

    # Pair each predicted row with its label row (zip, not enumerate), and keep
    # one tag list per sequence: seqeval scores entity spans within a sequence.
    for pred_row, label_row in zip(predictions, labels):
        kept_predictions = []
        kept_labels = []
        for token_pred, token_label in zip(pred_row, label_row):
            # Positions labelled -100 carry no gold tag and are skipped
            if token_label != -100:
                kept_predictions.append(id2label[int(token_pred)])
                kept_labels.append(id2label[int(token_label)])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    return {"f1": results["overall_f1"]}


In [22]:
# Collator handed to the Trainer below; built from the same tokenizer used in align_labels.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [23]:
# Training configuration: one epoch, batches of 16, output written under "model".
training_args = TrainingArguments(
    output_dir="model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    save_strategy="epoch",
    report_to="none",
)

# Bundle the model, the tokenized splits, the collator and the metric function.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [None]:
# Fine-tune before evaluating: no other cell trains the model, and its classification
# head starts out newly initialised, so scores from an untrained model would be meaningless.
trainer.train()
# Score the model on the eval_dataset given to the Trainer (reports "f1" via compute_metrics).
trainer.evaluate()

In [None]:
# Save our fine-tuned model
trainer.save_model("ner_model")
# Run inference on the fine-tuned model
token_classifier = pipeline(
    "token-classification",
    model="ner_model",
)
token_classifier("My name is Maarten.")