In [5]:
# Load Dataset
import os

from datasets import Dataset

from service.util.path_util import PROJECT_ROOT

# Fixed seed so the train/test split (and therefore the reported eval metrics)
# is identical on every Restart & Run All.
SPLIT_SEED = 42

BASE_DIR = PROJECT_ROOT
JSONL_PATH = os.path.join(BASE_DIR, "storage", "hf", "gpt_20250930.jsonl")

# Load the JSONL export, then hold out 10% of the examples for evaluation.
# NOTE: `dataset` is rebound from the loaded dataset to the train/test split result.
dataset = Dataset.from_json(JSONL_PATH, encoding="utf-8")
dataset = dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)

In [6]:
# Define Label Mapping
# Tag ids are assigned by position: O -> 0, B-ENT -> 1, B-KW -> 2.
id2label = dict(enumerate(("O", "B-ENT", "B-KW")))
# Inverse lookup (tag name -> id), built from the mapping above.
label2id = {tag: idx for idx, tag in id2label.items()}

In [7]:
# Convert each example to words + word-level labels
import re
def split_words(txt):
    """Split text into word runs and single non-space symbols.

    Each maximal run of word characters becomes one item; every other
    non-whitespace character becomes an item of its own. Whitespace is dropped.

    Returns:
        list[str]: the pieces in their original order (empty for empty input).
    """
    return [match.group() for match in re.finditer(r"\w+|\S", txt)]


def label_words(example):
    """Split an example's summary into words and tag entity/keyword spans.

    The summary and every phrase are NFC-normalized *before* splitting. `\\w`
    does not match combining marks, so a decomposed diacritic (base letter +
    combining mark) would otherwise be split into separate "words" and would
    not match the same phrase written with a precomposed character.

    Args:
        example: mapping with a "summary" string and iterables of phrase
            strings under "entities" and "keywords".

    Returns:
        dict with
          - "words": the split summary,
          - "word_labels": one tag per word; "B-ENT" for words inside any
            occurrence of an entity phrase, "B-KW" for words inside any
            occurrence of a keyword phrase, "O" otherwise. Keywords are
            applied last, so they win where the two overlap.
    """
    # Imported locally so this function does not rely on an import made in a
    # later notebook cell.
    import unicodedata

    def nfc(text):
        return unicodedata.normalize("NFC", text)

    summary_words = split_words(nfc(example["summary"]))
    labels = ["O"] * len(summary_words)

    def mark_span(phrase, label_tag):
        phrase_words = split_words(nfc(phrase))
        span = len(phrase_words)
        if span == 0:
            # Nothing to match (empty or whitespace-only phrase).
            return
        # Tag every occurrence of the phrase, not just the first one.
        for start in range(len(summary_words) - span + 1):
            if summary_words[start:start + span] == phrase_words:
                labels[start:start + span] = [label_tag] * span

    for ent in example["entities"]:
        mark_span(ent, "B-ENT")
    for kw in example["keywords"]:
        mark_span(kw, "B-KW")

    return {"words": summary_words, "word_labels": labels}


# Add the `words` and `word_labels` columns returned by `label_words` to every example.
# NOTE: rebinding `dataset` means re-running this cell maps the already-mapped data again.
dataset = dataset.map(label_words)

Map:   0%|          | 0/153 [00:00<?, ? examples/s]

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

In [8]:
# Tokenize with Alignment

# noinspection PyPackageRequirements 
from transformers import AutoTokenizer  # it is provided by adapter-transformers==3.0.1

# Tokenizer for the Romanian BERT checkpoint; the same hub id is used for the model below.
tokenizer = AutoTokenizer.from_pretrained("dumitrescustefan/bert-base-romanian-cased-v1")


def tokenize_and_align_labels(example):
    """Tokenize a pre-split example and attach one label id per sub-token.

    Args:
        example: mapping with "words" (list of strings) and "word_labels"
            (one tag name per word, each a key of `label2id`).

    Returns:
        The tokenizer output with an added "labels" list, aligned with the
        produced tokens.

    Side effect:
        Sets `tokenizer.model_max_length` to 512 on the shared tokenizer.
    """
    encoded = tokenizer(
        example["words"],
        is_split_into_words=True,
        truncation=True,
        max_length=512,
    )
    # Positions without a source word (word id None, i.e. special tokens) get -100,
    # presumably the loss's ignore index. Every sub-token of a word, whether the
    # first piece or a continuation, carries that word's label id (continuations
    # could alternatively be set to -100).
    encoded["labels"] = [
        -100 if word_idx is None else label2id[example["word_labels"][word_idx]]
        for word_idx in encoded.word_ids()
    ]
    tokenizer.model_max_length = 512
    return encoded


import unicodedata


def normalize_text(example):
    """Rewrite the example's "summary" in Unicode NFC form.

    The mapping is modified in place and the same object is returned; all
    other keys are left untouched.
    """
    composed = unicodedata.normalize("NFC", example["summary"])
    example["summary"] = composed
    return example


# NFC-normalize the `summary` column.
# NOTE(review): `tokenize_and_align_labels` reads the `words` column, which was
# derived from `summary` in an earlier cell, so this pass does not change what
# gets tokenized.
dataset = dataset.map(normalize_text)
# Tokenize every example and drop all original columns, keeping only the tokenizer
# outputs plus `labels`.
tokenized_dataset = dataset.map(tokenize_and_align_labels, remove_columns=dataset["train"].column_names)

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/153 [00:00<?, ? examples/s]

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

Map:   0%|          | 0/153 [00:00<?, ? examples/s]

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

In [9]:
# Inspect a sample
# Spot-check the first training example: sub-word tokens, then their aligned label ids.
sample = tokenized_dataset["train"][0]
print(tokenizer.convert_ids_to_tokens(sample["input_ids"]))
print(sample["labels"])

['[CLS]', 'Alin', 'Buzăr', '##in', 'scrie', ',', 'pe', 'GSP', '.', 'ro', ',', 'despre', 'decizia', '-', 'șoc', 'a', 'mijlocașul', '##ui', 'din', 'China', '[SEP]']
[-100, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 2, 2, 2, 0, 0, 0, 0, 1, -100]


In [10]:
# Load Model
from transformers import AutoModelForTokenClassification

# Pretrained Romanian BERT encoder with a token-classification head; the head
# weights are newly initialized and must be trained before use.
# The number of labels is derived from the label mapping so the two cannot drift apart.
model = AutoModelForTokenClassification.from_pretrained(
    "dumitrescustefan/bert-base-romanian-cased-v1",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

model.safetensors:   0%|          | 0.00/500M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dumitrescustefan/bert-base-romanian-cased-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Setup trainer
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification

# Training length is controlled by `max_steps` alone: when `max_steps` is positive it
# overrides `num_train_epochs`, so specifying both was misleading (the run always
# lasted 200 optimizer steps regardless of the epoch setting).
# With `save_steps=50` the final checkpoint is written at step 200.
args = TrainingArguments(
    output_dir="dumitrescustefan_token_output",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    max_steps=200,
    logging_dir="logs",
    logging_steps=20,
    save_steps=50,
    do_eval=True,
    weight_decay=0.01,
)

# Pads each batch (inputs and labels) to a common length at collation time.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# If pinned host memory causes problems, pass `dataloader_pin_memory=False` to
# TrainingArguments instead of subclassing Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
)

In [12]:
# Train and evaluate
# Run the fine-tuning loop configured on `trainer`.
trainer.train()
# Last expression of the cell, so the returned metrics are displayed as the cell output.
trainer.evaluate()



Step,Training Loss




{'eval_loss': 0.6812012195587158,
 'eval_runtime': 0.5523,
 'eval_samples_per_second': 32.589,
 'eval_steps_per_second': 5.432,
 'epoch': 10.0}

In [13]:
# Torch script style

# 1. Load the trained checkpoint
from transformers import AutoModelForTokenClassification

# Reload the fine-tuned weights from the `checkpoint-200` directory under the training
# output dir. The label maps are spelled out literally here rather than reusing the
# notebook-level `id2label` / `label2id`.
ts_model = AutoModelForTokenClassification.from_pretrained(
    "dumitrescustefan_token_output/checkpoint-200",
    num_labels=3,
    id2label={0: 'O', 1: 'B-ENT', 2: 'B-KW'},
    label2id={'O': 0, 'B-ENT': 1, 'B-KW': 2}
)
# Switch to evaluation mode before tracing.
ts_model.eval()

# 2. Create a valid example input
from transformers import AutoTokenizer
import re

# NOTE(review): this rebinds the notebook-level `tokenizer` used by the training cells.
# It also assumes the checkpoint directory contains tokenizer files; the Trainer was not
# given the tokenizer directly, so confirm they are actually saved there.
tokenizer = AutoTokenizer.from_pretrained("dumitrescustefan_token_output/checkpoint-200")

text = "Simona Halep a câștigat meciul de la Roland Garros."
# Same word/punctuation split pattern as `split_words` in the preprocessing cell.
words = re.findall(r"\w+|\S", text)

# Single-sentence batch returned as PyTorch tensors; used as the example input for tracing.
encoding = tokenizer(
    words,
    is_split_into_words=True,
    return_tensors="pt",
    truncation=True,
    max_length=512
)

# 3. Trace and save the model
import torch


class Wrapper(torch.nn.Module):
    """Adapter that exposes a model as a module returning only its logits.

    The wrapped model is called with `input_ids` and `attention_mask` keyword
    arguments and must return an object with a `logits` attribute.
    """

    def __init__(self, model_ts):
        """Keep the given model under the `model` attribute."""
        super().__init__()
        self.model = model_ts

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model and return just the `logits` of its output."""
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.logits


wrapped_model = Wrapper(ts_model)
# Trace with the example tensors; the tuple order matches Wrapper.forward's
# (input_ids, attention_mask) parameters.
traced = torch.jit.trace(wrapped_model, (encoding["input_ids"], encoding["attention_mask"]))
# Serialize the traced module to a relative path (resolved against the working directory).
traced.save("bert_model.pt")

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.
  inverted_mask = torch.tensor(1.0, dtype=dtype) - expanded_mask
