In [1]:
import os
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    pipeline
)
from seqeval.metrics import classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data_path = "IndoNER-Tourism/ner_data.tsv"

In [3]:
def read_ner_tsv(path):
    sents = []
    tokens, labels = [], []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                if tokens:
                    sents.append({"tokens": tokens, "ner_tags": labels})
                    tokens, labels = [], []
            else:
                token, label = line.split("\t")
                tokens.append(token)
                labels.append(label)
    if tokens:
        sents.append({"tokens": tokens, "ner_tags": labels})
    return sents

data = read_ner_tsv(data_path)
dataset = Dataset.from_list(data)

print(dataset[0])

{'tokens': ['Main', 'kesini', 'karena', 'ini', 'salah', 'satu', 'warisan', 'budaya', 'indonesia', ',', 'candi', 'ini', 'salahsatu', 'yg', 'terbesar', 'diindonesia', ',', ',', 'sy', 'tinggal', 'di', 'yogya', 'juga', 'tapi', 'jarak', 'nya', 'kalo', 'kesini', 'lumayan', 'jauh', 'sihh', ',', ',', 'tapi', 'sayang', '2', 'masa', 'orang', 'diy', 'gapernah', 'ke', 'candi', 'nya', ',', ',', 'Pengalaman', 'berharga', 'bisa', 'kesini', 'dengan', 'teman2', 'alumni'], 'ner_tags': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'B-WIS', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'B-WIS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}


In [4]:
unique_labels = sorted({lbl for ex in data for lbl in ex["ner_tags"]})
label2id = {lbl: i for i, lbl in enumerate(unique_labels)}
id2label = {i: lbl for lbl, i in label2id.items()}

def encode_labels(example):
    example["labels"] = [label2id[l] for l in example["ner_tags"]]
    return example

dataset = dataset.map(encode_labels)

Map: 100%|██████████| 2009/2009 [00:00<00:00, 3222.37 examples/s]


In [5]:
split_ds = dataset.train_test_split(test_size=0.2, seed=42)
train_ds, test_ds = split_ds["train"], split_ds["test"]

In [6]:
model_name = "indolem/indobert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_and_align_labels(examples):
    tokenized = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=128
    )
    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized.word_ids(batch_index=i)
        label_ids, prev = [], None
        for j, w_id in enumerate(word_ids):
            if w_id is None:
                label_ids.append(-100)
            elif w_id != prev:
                label_ids.append(label[w_id])
            else:
                # Subword → pakai label I- jika ada
                curr = unique_labels[label[w_id]]
                if curr.startswith("B-"):
                    curr = "I-" + curr[2:]
                label_ids.append(label2id[curr])
            prev = w_id
        labels.append(label_ids)
    tokenized["labels"] = labels
    return tokenized

train_enc = train_ds.map(tokenize_and_align_labels, batched=True)
test_enc = test_ds.map(tokenize_and_align_labels, batched=True)

Map: 100%|██████████| 1607/1607 [00:01<00:00, 1192.19 examples/s]
Map: 100%|██████████| 402/402 [00:00<00:00, 1457.63 examples/s]


In [7]:
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(unique_labels),
    id2label=id2label,
    label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at indolem/indobert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
args = TrainingArguments(
    output_dir="ner_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
)

import evaluate
metric = evaluate.load("seqeval")

def compute_metrics(p):
    preds, labels = p
    preds = np.argmax(preds, axis=2)
    true_labels, true_preds = [], []
    for i, lbl in enumerate(labels):
        preds_i, labs_i = [], []
        for j, l in enumerate(lbl):
            if l != -100:
                preds_i.append(id2label[preds[i][j]])
                labs_i.append(id2label[l])
        true_preds.append(preds_i)
        true_labels.append(labs_i)
    results = metric.compute(predictions=true_preds, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
    }

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_enc,
    eval_dataset=test_enc,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.057764,0.867769,0.914179,0.890369
2,No log,0.052383,0.898254,0.927861,0.912817
3,0.067500,0.049778,0.906627,0.935945,0.921053




TrainOutput(global_step=603, training_loss=0.05928535643303968, metrics={'train_runtime': 5921.8713, 'train_samples_per_second': 0.814, 'train_steps_per_second': 0.102, 'total_flos': 314944999520256.0, 'train_loss': 0.05928535643303968, 'epoch': 3.0})

In [9]:
trainer.evaluate()



{'eval_loss': 0.049777623265981674,
 'eval_precision': 0.9066265060240963,
 'eval_recall': 0.9359452736318408,
 'eval_f1': 0.9210526315789473,
 'eval_runtime': 122.3311,
 'eval_samples_per_second': 3.286,
 'eval_steps_per_second': 0.417,
 'epoch': 3.0}

In [12]:
trainer.save_model("ner_model")
tokenizer.save_pretrained("ner_model")

nlp = pipeline("ner", model="ner_model", tokenizer=tokenizer, aggregation_strategy="simple")

Device set to use cpu


In [None]:
text = "Saya berkunjung ke Candi Borobudur di Jawa Tengah."
nlp(text)

[{'entity_group': 'WIS',
  'score': np.float32(0.99538183),
  'word': 'gunung sibayak',
  'start': 13,
  'end': 27},
 {'entity_group': 'LOC',
  'score': np.float32(0.94576514),
  'word': 'tanah karo',
  'start': 40,
  'end': 50},
 {'entity_group': 'LOC',
  'score': np.float32(0.9230927),
  'word': 'sumatera utara',
  'start': 52,
  'end': 66},
 {'entity_group': 'FAS',
  'score': np.float32(0.99064356),
  'word': 'guide',
  'start': 75,
  'end': 80}]