In [None]:
from datasets import load_dataset
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Load the "ner" configuration of the KLUE dataset and read the tag names
# from the `ner_tags` feature of the train split.
dataset = load_dataset("klue", "ner")
labels = dataset["train"].features["ner_tags"].feature.names

# Explicit id <-> tag-name mappings. Storing them in the model config means
# saved checkpoints report the real tag names instead of generic "LABEL_n"
# placeholders.
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}

model_name = "Leo97/KoELECTRA-small-v3-modu-ner"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): `ignore_mismatched_sizes=True` is presumably needed because the
# checkpoint's classification head was trained with a different number of
# labels than this dataset has — confirm against the checkpoint's config.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)

# Quick sanity check: split summary, first raw example, and the tag names.
print(dataset["train"])
print(dataset["train"][0])
print(labels)

In [None]:
def preprocess_data(example, tokenizer):
    """Tokenize one example and attach a label to every produced token.

    The entries of ``example["tokens"]`` are concatenated into one string
    (non-breaking spaces become regular spaces) and passed to ``tokenizer``
    without special tokens, padding, or truncation, requesting offsets.

    Each token is labelled with ``example["ner_tags"][start]``, where
    ``start`` is the token's first offset into the joined string. Tokens with
    an empty span (``start == end``) get ``-100``.

    NOTE(review): indexing ``ner_tags`` by string offset assumes one tag per
    character of the joined sentence (i.e. each entry of ``tokens`` is a
    single character) — confirm against the dataset.

    Args:
        example: Mapping with ``"tokens"`` and ``"ner_tags"`` entries.
        tokenizer: Callable tokenizer whose result exposes ``offset_mapping``
            and supports item assignment.

    Returns:
        The tokenizer output with an added ``"labels"`` list.
    """
    text = "".join(example["tokens"]).replace("\xa0", " ")
    encoded = tokenizer(
        text,
        return_offsets_mapping=True,
        add_special_tokens=False,
        padding=False,
        truncation=False,
    )

    # A token inherits the tag found at its first offset; empty spans are
    # labelled -100.
    encoded["labels"] = [
        -100 if span[0] == span[1] else example["ner_tags"][span[0]]
        for span in encoded.offset_mapping
    ]
    return encoded


# Run `preprocess_data` on every example of every split, one example at a
# time, and drop the columns listed for the train split.
raw_columns = dataset["train"].column_names
processed_dataset = dataset.map(
    lambda row: preprocess_data(row, tokenizer),
    batched=False,
    remove_columns=raw_columns,
)

# Show the raw tags next to the offsets and labels of the first processed
# train example so the alignment can be checked by eye.
first_processed = processed_dataset["train"][0]
print(processed_dataset)
print(dataset["train"][0]["ner_tags"])
print(first_processed["offset_mapping"])
print(first_processed["labels"])

In [None]:
from transformers import Trainer, TrainingArguments
from transformers.data.data_collator import DataCollatorForTokenClassification

# The per-epoch evaluation option is spelled `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones; the old
# keyword is rejected by recent versions. Use whichever keyword the installed
# TrainingArguments dataclass actually declares so this cell runs on both.
_training_arg_fields = getattr(TrainingArguments, "__dataclass_fields__", {})
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in _training_arg_fields
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="token-classification",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=1e-4,
    weight_decay=0.01,
    num_train_epochs=5,
    seed=42,
    **{eval_strategy_key: "epoch"}  # evaluate at the end of every epoch
)

# Fine-tune on the processed train split and evaluate on the validation
# split; the collator is configured with `padding=True`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset["train"],
    eval_dataset=processed_dataset["validation"],
    data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer, padding=True)
)

trainer.train()

In [None]:
import torch

# Switch the model to evaluation mode before running inference.
model.eval()
text = "위키북스의 윤대희, 김동화, 송종민 그리고 진현두는 2025년 서울에서 2시간 동안 신간 1권에 관한 논의를 진행했다."

# Encode without special tokens, matching how the training examples were
# tokenized, and cap the input at 512 tokens.
tokenized = tokenizer(
    text,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512,
    add_special_tokens=False
)
# Keep the ids that are actually fed to the model so the printed tokens line
# up with the predictions even if the text was truncated.
input_ids = tokenized["input_ids"][0].tolist()

with torch.no_grad():
    # Send the inputs to whatever device the model lives on instead of
    # hard-coding "cuda", so the cell also runs on CPU-only machines.
    logits = model(**tokenized.to(model.device)).logits.cpu()

# Highest-scoring label id per token for the single input sentence.
predictions = logits.argmax(dim=-1)[0].tolist()
tokens = tokenizer.convert_ids_to_tokens(input_ids)

print(list(zip(tokens, [labels[i] for i in predictions])))