In [1]:
# Load Dataset
from datasets import Dataset

# Fixed seed so the train/test partition (and therefore every metric computed
# on it) is identical on each Restart & Run All.
SPLIT_SEED = 42

# Read the JSON-lines file, then hold out 10% of the examples as the "test" split.
# Note: `dataset` is a single Dataset after the first line and a train/test
# DatasetDict after the second.
dataset = Dataset.from_json("storage/hf/gpt_20250930.jsonl", encoding="utf-8")
dataset = dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)

In [2]:
# Define Label Mapping
# Tag names in id order: position in this tuple is the integer class id.
label2id = {tag: idx for idx, tag in enumerate(("O", "B-ENT", "B-KW"))}
# Inverse lookup (integer id -> tag name).
id2label = dict(zip(label2id.values(), label2id.keys()))

In [3]:
#Convert Each Example to Tokens + Labels
import re

def split_words(text):
    """Split ``text`` into word tokens and standalone punctuation marks.

    Each run of word characters becomes one token; every other
    non-whitespace character becomes its own single-character token.
    Whitespace is discarded.
    """
    return re.findall(r"\w+|\S", text)

def label_words(example):
    """Split an example's summary into words and tag each word.

    The summary and every entity/keyword phrase are normalized to Unicode
    NFC before splitting. Without this, a letter stored in decomposed form
    (base letter + combining mark) is cut into two tokens by ``split_words``
    because the combining mark is not a word character, and phrases written
    in the other form never match.

    Args:
        example: mapping with a ``"summary"`` string and ``"entities"`` and
            ``"keywords"`` iterables of phrase strings.

    Returns:
        dict with ``"words"`` (the summary tokens) and ``"word_labels"``
        (a parallel list of ``"O"``, ``"B-ENT"`` or ``"B-KW"``). Keywords are
        applied after entities, so a word covered by both ends up ``"B-KW"``.
    """
    # Imported locally so this function does not depend on an import made elsewhere.
    import unicodedata

    def to_nfc(text):
        return unicodedata.normalize("NFC", text)

    words = split_words(to_nfc(example["summary"]))
    labels = ["O"] * len(words)

    def mark_span(phrase, label_tag):
        """Tag every occurrence of ``phrase`` (as a word sequence) with ``label_tag``."""
        phrase_words = split_words(to_nfc(phrase))
        span = len(phrase_words)
        if span == 0:
            # An empty / whitespace-only phrase matches nothing.
            return
        for start in range(len(words) - span + 1):
            if words[start:start + span] == phrase_words:
                labels[start:start + span] = [label_tag] * span

    for ent in example["entities"]:
        mark_span(ent, "B-ENT")
    for kw in example["keywords"]:
        mark_span(kw, "B-KW")

    return {"words": words, "word_labels": labels}

dataset = dataset.map(label_words)

Map:   0%|          | 0/153 [00:00<?, ? examples/s]

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

In [4]:
# Tokenize with Alignment
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("dumitrescustefan/bert-base-romanian-cased-v1")

def tokenize_and_align_labels(example, max_length=512):
    """Tokenize a pre-split example and expand its word labels to tokens.

    Args:
        example: mapping with ``"words"`` (list of word strings) and
            ``"word_labels"`` (parallel list of keys of ``label2id``).
        max_length: upper bound on tokens kept per example. ``truncation=True``
            has no effect unless a maximum is known, and this tokenizer
            reports none, so it is passed explicitly. The default of 512 is
            assumed to match the model's position limit -- TODO confirm
            against the model config.

    Returns:
        The tokenizer output with an added ``"labels"`` list, one entry per
        token: ``-100`` for tokens that map to no word (special tokens),
        otherwise the id of the label of the word the token came from.
    """
    tokenized = tokenizer(
        example["words"],
        is_split_into_words=True,
        truncation=True,
        max_length=max_length,
    )
    word_labels = example["word_labels"]
    # Every sub-word piece inherits its word's label. To train only on the
    # first piece of each word, emit -100 for pieces whose word index equals
    # the previous token's word index instead.
    tokenized["labels"] = [
        -100 if word_idx is None else label2id[word_labels[word_idx]]
        for word_idx in tokenized.word_ids()
    ]
    return tokenized

import unicodedata

def normalize_text(example):
    """Rewrite ``example["summary"]`` in Unicode NFC form.

    The mapping is updated in place and the same object is returned.
    """
    composed_summary = unicodedata.normalize("NFC", example["summary"])
    example["summary"] = composed_summary
    return example

# NFC-normalize the "summary" column of both splits.
# NOTE(review): the "words" column consumed by tokenize_and_align_labels was
# derived from "summary" in an earlier cell, so this step does not change
# the words that get tokenized below.
dataset = dataset.map(normalize_text)

# Tokenize every example and drop all pre-existing columns, so that only the
# tokenizer outputs and the aligned "labels" remain.
source_columns = dataset["train"].column_names
tokenized_dataset = dataset.map(
    tokenize_and_align_labels,
    remove_columns=source_columns,
)

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/153 [00:00<?, ? examples/s]

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

Map:   0%|          | 0/153 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/18 [00:00<?, ? examples/s]

In [5]:
# Inspect a sample
# Show the first training example as token strings next to its label ids,
# to eyeball that tokens and labels line up one-to-one.
sample = tokenized_dataset["train"][0]
sample_tokens = tokenizer.convert_ids_to_tokens(sample["input_ids"])
print(sample_tokens)
print(sample["labels"])

['[CLS]', 'FCSB', 'și', 'Universitatea', 'Craiova', 'vor', 'lupta', 'în', 'acest', 'sezon', 'european', 'pentru', 'o', 'ascensiune', 'a', 'Super', '##Li', '##gii', 'în', 'clasamentul', 'coefic', '##ienților', 'UEFA', '.', 'Locul', '20', 'ar', 'fi', 'varianta', 'cea', 'mai', 'optimistă', '.', '[SEP]']
[-100, 1, 0, 1, 1, 0, 2, 0, 0, 2, 2, 0, 0, 2, 0, 1, 1, 1, 0, 2, 2, 2, 1, 0, 1, 1, 0, 0, 2, 2, 2, 2, 0, -100]


In [6]:
# Load Model
from transformers import AutoModelForTokenClassification

# Load the pretrained encoder with a freshly initialised token-classification
# head. The number of classes is taken from the label map so the head cannot
# drift out of sync if a tag is added or removed.
model = AutoModelForTokenClassification.from_pretrained(
    "dumitrescustefan/bert-base-romanian-cased-v1",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/500M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dumitrescustefan/bert-base-romanian-cased-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Setup trainer
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification

args = TrainingArguments(
    output_dir="roberta_token_output",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Training length is controlled by max_steps alone. A positive max_steps
    # overrides num_train_epochs, so the previous num_train_epochs=3 had no
    # effect (the recorded run ended at epoch 10.0) and is no longer set.
    max_steps=200,
    logging_dir="logs",
    logging_steps=20,   # log the training loss every 20 optimizer steps
    save_steps=50,      # write a checkpoint every 50 optimizer steps
    do_eval=True,
    weight_decay=0.01,
)

# Pads input ids and labels of each batch to a common length.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)


class NoPinTrainer(Trainer):
    """Trainer whose training dataloader has ``pin_memory`` switched off.

    NOTE(review): the ``trainer`` built in this notebook uses the plain
    ``Trainer`` class, so this subclass appears to be unused -- confirm
    whether it was meant to be.
    """

    def get_train_dataloader(self):
        """Return the parent's training dataloader with ``pin_memory`` set to False."""
        train_loader = super().get_train_dataloader()
        train_loader.pin_memory = False
        return train_loader

# Wire the model, hyper-parameters, data splits and batch collator together.
trainer = Trainer(
    args=args,
    model=model,
    data_collator=data_collator,
    eval_dataset=tokenized_dataset["test"],    # held-out split
    train_dataset=tokenized_dataset["train"],
)

In [8]:
# Train and evaluate
# Fine-tune, then score the final weights on the held-out split.
trainer.train()
eval_metrics = trainer.evaluate()
# Last expression of the cell: rendered as the cell output.
eval_metrics



Step,Training Loss




{'eval_loss': 0.7228062748908997,
 'eval_runtime': 2.0287,
 'eval_samples_per_second': 8.873,
 'eval_steps_per_second': 1.479,
 'epoch': 10.0}

In [None]:
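# Optional: to continue an interrupted run from a saved checkpoint instead of
# training from scratch, uncomment the line below.
# trainer.train(resume_from_checkpoint="roberta_token_output/checkpoint-50")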