In [None]:
!pip install datasets
!pip install accelerate -U

In [None]:
from datasets import load_dataset

# Three slices of the German ('de') polyglot_ner train split, in this order:
#   train1 -> 'train[:1000]', train2 -> 'train[:3000]', valid -> 'train[3000:5000]'
# train1 is therefore a prefix of train2, and valid overlaps with neither.
train1, train2, valid = (
    load_dataset('polyglot_ner', 'de', split=split_spec)
    for split_spec in ('train[:1000]', 'train[:3000]', 'train[3000:5000]')
)

In [None]:
from transformers import AutoTokenizer

# Load the pre-trained tokenizer for bert-base-german-cased, the same
# checkpoint name the token-classification model is loaded from later on.
tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")

In [None]:
# Build the mapping from NER tag (string) to integer label id.

# Every distinct tag that occurs in the first training split.
ner_tags = {tag for sentence_tags in train1['ner'] for tag in sentence_tags}

# Number the tags in sorted order. Iterating the set directly would hand out
# ids in set-iteration order, which for strings can change from one
# interpreter session to the next (hash randomisation), so the same tag could
# get a different id after a kernel restart.
ner_to_labels = {tag: i for i, tag in enumerate(sorted(ner_tags))}
print(ner_tags)
print(ner_to_labels)

{'LOC', 'O', 'PER', 'ORG'}
{'LOC': 0, 'O': 1, 'PER': 2, 'ORG': 3}


In [None]:
# A word can be split into several sub-tokens by the tokenizer, so the
# word-level NER tags have to be expanded to one label per sub-token.
def tokenize_and_align_labels(dataset):
  """Tokenize every row of `dataset` and attach sub-token level labels.

  Each row must provide 'words' (list of words) and 'ner' (one tag per word).
  The module-level `tokenizer` encodes the words (padded/truncated to 512
  positions) and the module-level `ner_to_labels` turns tags into label ids.

  Every sub-token receives the label id of the word it came from; positions
  that belong to no word (word id None) receive -100.

  Returns:
    A list with one tokenizer output per row, each extended with a "labels"
    entry.
  """
  IGNORE_INDEX = -100

  def _encode(example):
    encoding = tokenizer(example['words'], is_split_into_words=True, padding='max_length', max_length=512, truncation=True)
    # word position -> label id, for every word of this sentence
    word_to_label = dict(enumerate(ner_to_labels[tag] for tag in example["ner"]))
    encoding["labels"] = [
      IGNORE_INDEX if word_id is None else word_to_label[word_id]
      for word_id in encoding.word_ids()
    ]
    return encoding

  return [_encode(example) for example in dataset]

In [None]:
from datasets import Dataset

# Tokenize + label-align each split, then wrap the resulting list of
# per-sentence encodings in a `Dataset` (same order as the original cells:
# train1, train2, valid).
tokenized_train1, tokenized_train2, tokenized_valid = (
    Dataset.from_list(tokenize_and_align_labels(raw_split))
    for raw_split in (train1, train2, valid)
)

**Compute F1-score**

In [None]:
from sklearn.metrics import f1_score
import numpy as np

# Define a function to compute metrics
def compute_metrics(p):
  """Token-level micro and macro F1 over every position that has a real label.

  Args:
    p: object exposing `predictions` (scores with three axes; the last one is
      arg-maxed to obtain one label id per position) and `label_ids` (gold
      label ids, with -100 marking positions to ignore), e.g. the value
      returned by `trainer.predict(...)`.

  Returns:
    dict with the keys "f1_micro" and "f1_macro".

  NOTE(review): every label id is scored, including the one for the 'O' tag,
  so f1_micro is dominated by it -- confirm this is the intended metric.
  """
  predicted_ids = np.argmax(p.predictions, axis=2)
  gold_ids = np.asarray(p.label_ids)

  # One boolean mask instead of nested Python loops: indexing with it drops
  # the ignored (-100) positions and flattens row by row, i.e. in the same
  # order the per-sentence loops produced.
  scored = gold_ids != -100
  flat_true_labels = gold_ids[scored]
  flat_predictions = predicted_ids[scored]

  return {
    "f1_micro": f1_score(flat_true_labels, flat_predictions, average="micro"),
    "f1_macro": f1_score(flat_true_labels, flat_predictions, average="macro")
  }

**Finetune Model**

In [None]:
from transformers import BertForTokenClassification, Trainer, TrainingArguments

# German BERT with a token-classification head that has one output per
# distinct NER tag found above.
model = BertForTokenClassification.from_pretrained(
    'bert-base-german-cased',
    num_labels=len(ner_tags),
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


(1) Fine-tuned with 1000 sentences

In [None]:
# Hyper-parameters for run (1); output goes to "results1".
training_args1 = TrainingArguments(
    output_dir="results1",
    weight_decay=0.01,
    num_train_epochs=4,
    per_device_eval_batch_size=4,
    per_device_train_batch_size=4,
)

# Trainer for run (1): trains `model` on the tokenized 1000-sentence split.
trainer1 = Trainer(
    args=training_args1,
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_train1,
)

In [None]:
# Fine-tune the model for run (1); trainer1 was built with tokenized_train1
# as its training set.
trainer1.train()

Step,Training Loss
500,0.1571
1000,0.0265


TrainOutput(global_step=1000, training_loss=0.09181080532073975, metrics={'train_runtime': 414.9712, 'train_samples_per_second': 9.639, 'train_steps_per_second': 2.41, 'total_flos': 1045205925888000.0, 'train_loss': 0.09181080532073975, 'epoch': 4.0})

In [None]:
# Run the run-(1) model over the held-out validation split.
# `pred` carries the predictions and gold label ids that compute_metrics reads.
pred = trainer1.predict(tokenized_valid)

In [None]:
# Micro / macro F1 for run (1); positions labelled -100 are excluded.
compute_metrics(pred)

{'f1_micro': 0.9420848593887211, 'f1_macro': 0.7007021729605327}

In [None]:
# Drop the reference to the run-(1) trainer before the next run
# (presumably to let its resources be released).
del trainer1

(2) Fine-tuned with 3000 sentences

In [None]:
# Start run (2) from the pretrained checkpoint again. Re-using the `model`
# object from run (1) would continue training weights that were already
# fine-tuned on the first 1000 sentences, so the 1000- and 3000-sentence
# results would not be independent experiments.
model = BertForTokenClassification.from_pretrained('bert-base-german-cased', num_labels=len(ner_tags))

# Define training arguments
training_args2 = TrainingArguments(
    output_dir="results2",
    num_train_epochs=4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
)

# Initialize Trainer
trainer2 = Trainer(
    model=model,
    args=training_args2,
    train_dataset=tokenized_train2,  # Use tokenized training data
    tokenizer=tokenizer,
)

In [None]:
# Fine-tune the model for run (2); trainer2 was built with tokenized_train2
# as its training set.
trainer2.train()

Step,Training Loss
500,0.1384
1000,0.0988
1500,0.0562
2000,0.0283
2500,0.0187
3000,0.009


TrainOutput(global_step=3000, training_loss=0.05824965858459473, metrics={'train_runtime': 1256.2091, 'train_samples_per_second': 9.553, 'train_steps_per_second': 2.388, 'total_flos': 3135617777664000.0, 'train_loss': 0.05824965858459473, 'epoch': 4.0})

In [None]:
# Run the run-(2) model over the held-out validation split.
# `pred` is overwritten with this run's predictions and gold label ids.
pred = trainer2.predict(tokenized_valid)

In [None]:
# Micro / macro F1 for run (2); positions labelled -100 are excluded.
compute_metrics(pred)

{'f1_micro': 0.945768810640622, 'f1_macro': 0.7280222455819582}

In [None]:
# Drop the reference to the run-(2) trainer before the next run
# (presumably to let its resources be released).
del trainer2

(3) Fine-tuned with 3000 sentences and frozen embeddings (the whole BERT base model is frozen; only the classification head is trained)

In [None]:
def freeze_weights(model, embeddings_only=False):
  """Disable gradient updates for (part of) the base model, in place.

  Args:
    model: object exposing `base_model`; its parameters' `requires_grad`
      flags are modified in place.
    embeddings_only: if False (default, the original behaviour), every
      parameter of `model.base_model` is frozen, so only parameters outside
      the base model (e.g. a classification head) remain trainable. If True,
      only the parameters of `model.base_model.embeddings` are frozen; this
      requires the base model to have an `embeddings` attribute.

  Returns:
    The same `model` object, to allow inline use such as
    `Trainer(model=freeze_weights(model), ...)`.
  """
  frozen_module = model.base_model.embeddings if embeddings_only else model.base_model
  for param in frozen_module.parameters():
    param.requires_grad = False
  return model

In [None]:
# Start run (3) from the pretrained checkpoint again. Freezing the `model`
# object left over from the earlier runs would freeze a base model that has
# already been fine-tuned, so this run would not measure the effect of
# freezing at all.
model = BertForTokenClassification.from_pretrained('bert-base-german-cased', num_labels=len(ner_tags))

# Define training arguments
training_args3 = TrainingArguments(
    output_dir="results3",
    num_train_epochs=4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
)

# Initialize Trainer
trainer3 = Trainer(
    model=freeze_weights(model),  # sets requires_grad=False on the base model's parameters
    args=training_args3,
    train_dataset=tokenized_train2,  # Use tokenized training data
    tokenizer=tokenizer,
)

In [None]:
# Fine-tune the model for run (3); trainer3 was built with tokenized_train2
# and a model passed through freeze_weights.
trainer3.train()

Step,Training Loss
500,0.0038
1000,0.0047
1500,0.0049
2000,0.0061
2500,0.0042
3000,0.0031


TrainOutput(global_step=3000, training_loss=0.004473352630933126, metrics={'train_runtime': 469.6164, 'train_samples_per_second': 25.553, 'train_steps_per_second': 6.388, 'total_flos': 3135617777664000.0, 'train_loss': 0.004473352630933126, 'epoch': 4.0})

In [None]:
# Run the run-(3) model over the held-out validation split.
# `pred` is overwritten with this run's predictions and gold label ids.
pred = trainer3.predict(tokenized_valid)

In [None]:
# Micro / macro F1 for run (3); positions labelled -100 are excluded.
compute_metrics(pred)

{'f1_micro': 0.945704555676926, 'f1_macro': 0.7282359596746613}