In [16]:
import json
from pathlib import Path

from datasets import load_dataset
from transformers import AutoTokenizer, BertForTokenClassification, DataCollatorWithPadding, Trainer, TrainingArguments, pipeline
from peft import LoraConfig, TaskType, get_peft_model
import numpy as np
import torch

In [17]:
json_files = list(Path("./data/ground_truth").rglob("*.jsonl"))

In [18]:
raw_dataset = []

for file in json_files:
    with open(file, "r") as f:
        lines = [json.loads(line) for line in f]
    raw_dataset.extend(lines)

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
def get_tokens_labels(text, labels):
    labeled_tokens = []
    for label in labels:
        if label[2] not in ["Hate", "Neutralny", "Mowa nienawiści"]:
            token = text[label[0]: label[1]]
            labeled_tokens.append({"text": token, "label": "Wzmacnianie" if label[2] == "Strenghtening" else label[2]})
    return labeled_tokens

In [21]:
full_dataset_filtered = []

for sample in raw_dataset:
    labeled_tokens = get_tokens_labels(sample["text"], sample["label"])
    full_dataset_filtered.extend(labeled_tokens)

In [22]:
train_ds, test_ds = train_test_split(full_dataset_filtered, test_size=0.1, random_state=42)

In [23]:
with open("./tokens/train.jsonl", 'w') as f:
    for item in train_ds:
        f.write(json.dumps(item) + '\n')


with open("./tokens/test.jsonl", 'w') as f:
    for item in test_ds:
        f.write(json.dumps(item) + '\n')

In [24]:
data_files = {"train": "train.jsonl", "test": "test.jsonl"}
dataset = load_dataset("./tokens", data_files=data_files)
print(dataset)

Generating train split: 79 examples [00:00, 79747.30 examples/s]
Generating test split: 9 examples [00:00, 10424.95 examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 79
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 9
    })
})





In [25]:
sorted_labels = sorted(set([sample["label"] for sample in train_ds]))
label2id = dict(zip(sorted_labels, range(0, len(sorted_labels))))
id2label = dict(zip(range(0, len(sorted_labels)), sorted_labels))

tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
model = BertForTokenClassification.from_pretrained("bert-base-multilingual-cased",
                                                                      num_labels=len(label2id),
                                                                      label2id=label2id,
                                                                      id2label=id2label)

classifier = pipeline("token-classification", model=model, tokenizer=tokenizer, device="cuda")

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
def preprocess_function(examples):
    """Preprocess the dataset by returning tokenized examples."""
    tokens = tokenizer(examples["text"], truncation=True, padding=True)

    labels = []
    for ids, label in zip(tokens["input_ids"], examples["label"]):
        labels.append(len(ids)*[label2id[label]])
    tokens["label"] = labels
    return tokens

splits = ['train', 'test']

tokenized_ds = {}

for split in splits:
    tokenized_ds[split] = dataset[split].map(preprocess_function, batched=True)

print(tokenized_ds)

Map: 100%|██████████| 79/79 [00:00<00:00, 16579.94 examples/s]
Map: 100%|██████████| 9/9 [00:00<00:00, 2785.06 examples/s]

{'train': Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 79
}), 'test': Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 9
})}





In [27]:
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS, r=64, lora_alpha=1, lora_dropout=0.1
)

peft_model = get_peft_model(model, lora_config)
print(peft_model.bert)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): lora.Linear(
              (base_layer): Linear(in_features=768, out_features=768, bias=True)
              (lora_dropout): ModuleDict(
                (default): Dropout(p=0.1, inplace=False)
              )
              (lora_A): ModuleDict(
                (default): Linear(in_features=768, out_features=64, bias=False)
              )
              (lora_B): ModuleDict(
                (default): Linear(in_features=64, out_features=768, bias=False)
              )
              (lora_embedding_

In [None]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=-1)
    return {"accuracy": (predictions == labels).mean()*100}


trainer = Trainer(
    model=peft_model,
    args=TrainingArguments(
        output_dir="bert-token-clf",
        learning_rate=2e-3,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=15,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    ),
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

print("Starting to train...")
trainer.train()

Starting to train...


  6%|▋         | 19/300 [00:00<00:04, 60.21it/s]
  7%|▋         | 20/300 [00:00<00:04, 60.21it/s]

{'eval_loss': 0.020595736801624298, 'eval_accuracy': 100.0, 'eval_runtime': 0.0181, 'eval_samples_per_second': 496.061, 'eval_steps_per_second': 165.354, 'epoch': 1.0}


 13%|█▎        | 40/300 [00:01<00:07, 35.04it/s]
 13%|█▎        | 40/300 [00:01<00:07, 35.04it/s]

{'eval_loss': 0.018508905544877052, 'eval_accuracy': 100.0, 'eval_runtime': 0.0181, 'eval_samples_per_second': 497.538, 'eval_steps_per_second': 165.846, 'epoch': 2.0}


 19%|█▉        | 58/300 [00:01<00:07, 31.59it/s]
 20%|██        | 60/300 [00:01<00:07, 31.59it/s]

{'eval_loss': 0.009995183907449245, 'eval_accuracy': 100.0, 'eval_runtime': 0.024, 'eval_samples_per_second': 375.027, 'eval_steps_per_second': 125.009, 'epoch': 3.0}


 25%|██▌       | 76/300 [00:02<00:07, 30.88it/s]
 27%|██▋       | 80/300 [00:02<00:07, 30.88it/s]

{'eval_loss': 0.02750464528799057, 'eval_accuracy': 100.0, 'eval_runtime': 0.019, 'eval_samples_per_second': 472.811, 'eval_steps_per_second': 157.604, 'epoch': 4.0}


 33%|███▎      | 99/300 [00:03<00:05, 34.17it/s]
 33%|███▎      | 100/300 [00:03<00:05, 34.17it/s]

{'eval_loss': 0.0005763992667198181, 'eval_accuracy': 100.0, 'eval_runtime': 0.0189, 'eval_samples_per_second': 476.343, 'eval_steps_per_second': 158.781, 'epoch': 5.0}


 39%|███▉      | 117/300 [00:04<00:05, 31.64it/s]
 40%|████      | 120/300 [00:04<00:05, 31.64it/s]

{'eval_loss': 0.00014649162767454982, 'eval_accuracy': 100.0, 'eval_runtime': 0.0189, 'eval_samples_per_second': 476.451, 'eval_steps_per_second': 158.817, 'epoch': 6.0}


 45%|████▌     | 136/300 [00:04<00:05, 31.47it/s]
 47%|████▋     | 140/300 [00:05<00:05, 31.47it/s]

{'eval_loss': 0.0006675662007182837, 'eval_accuracy': 100.0, 'eval_runtime': 0.0175, 'eval_samples_per_second': 514.983, 'eval_steps_per_second': 171.661, 'epoch': 7.0}


 52%|█████▏    | 155/300 [00:05<00:04, 32.31it/s]
 53%|█████▎    | 160/300 [00:05<00:04, 32.31it/s]

{'eval_loss': 1.5078471733431797e-05, 'eval_accuracy': 100.0, 'eval_runtime': 0.0188, 'eval_samples_per_second': 477.675, 'eval_steps_per_second': 159.225, 'epoch': 8.0}


 58%|█████▊    | 175/300 [00:06<00:03, 32.08it/s]
 60%|██████    | 180/300 [00:06<00:03, 32.08it/s]

{'eval_loss': 1.2208899534016382e-05, 'eval_accuracy': 100.0, 'eval_runtime': 0.0176, 'eval_samples_per_second': 511.993, 'eval_steps_per_second': 170.664, 'epoch': 9.0}


 67%|██████▋   | 200/300 [00:07<00:02, 37.58it/s]
 67%|██████▋   | 200/300 [00:07<00:02, 37.58it/s]

{'eval_loss': 1.3511219549400266e-05, 'eval_accuracy': 100.0, 'eval_runtime': 0.0188, 'eval_samples_per_second': 479.965, 'eval_steps_per_second': 159.988, 'epoch': 10.0}


 73%|███████▎  | 219/300 [00:08<00:02, 34.97it/s]
 73%|███████▎  | 220/300 [00:08<00:02, 34.97it/s]

{'eval_loss': 1.3593637959274929e-05, 'eval_accuracy': 100.0, 'eval_runtime': 0.0177, 'eval_samples_per_second': 507.764, 'eval_steps_per_second': 169.255, 'epoch': 11.0}


 79%|███████▉  | 238/300 [00:08<00:02, 30.71it/s]
 80%|████████  | 240/300 [00:08<00:01, 30.71it/s]

{'eval_loss': 1.2766617146553472e-05, 'eval_accuracy': 100.0, 'eval_runtime': 0.0182, 'eval_samples_per_second': 493.596, 'eval_steps_per_second': 164.532, 'epoch': 12.0}


 86%|████████▌ | 257/300 [00:09<00:01, 30.80it/s]
 87%|████████▋ | 260/300 [00:09<00:01, 30.80it/s]

{'eval_loss': 1.2348691598162986e-05, 'eval_accuracy': 100.0, 'eval_runtime': 0.0177, 'eval_samples_per_second': 507.491, 'eval_steps_per_second': 169.164, 'epoch': 13.0}


 91%|█████████▏| 274/300 [00:10<00:01, 25.05it/s]
 93%|█████████▎| 280/300 [00:10<00:00, 25.05it/s]

{'eval_loss': 1.1786545655922964e-05, 'eval_accuracy': 100.0, 'eval_runtime': 0.0186, 'eval_samples_per_second': 483.357, 'eval_steps_per_second': 161.119, 'epoch': 14.0}


 98%|█████████▊| 295/300 [00:11<00:00, 29.74it/s]
100%|██████████| 300/300 [00:12<00:00, 29.74it/s]

{'eval_loss': 1.1758585969801061e-05, 'eval_accuracy': 100.0, 'eval_runtime': 0.019, 'eval_samples_per_second': 474.791, 'eval_steps_per_second': 158.264, 'epoch': 15.0}


100%|██████████| 300/300 [00:12<00:00, 23.96it/s]

{'train_runtime': 12.5196, 'train_samples_per_second': 94.652, 'train_steps_per_second': 23.962, 'train_loss': 0.12991297403971355, 'epoch': 15.0}





TrainOutput(global_step=300, training_loss=0.12991297403971355, metrics={'train_runtime': 12.5196, 'train_samples_per_second': 94.652, 'train_steps_per_second': 23.962, 'total_flos': 8701777841400.0, 'train_loss': 0.12991297403971355, 'epoch': 15.0})