In [1]:
import json
from pathlib import Path

from datasets import load_dataset
from transformers import AutoTokenizer, GPT2ForTokenClassification, DataCollatorWithPadding, Trainer, TrainingArguments, pipeline
from peft import LoraConfig, TaskType, get_peft_model
import numpy as np
import torch

In [2]:
import re

def label_words(data):
    text = data['text']
    labels = data['label']
    
    words = []
    start = 0
    for match in re.finditer(r'\S+', text):
        word = match.group()
        word_start = match.start()
        word_end = match.end()
        words.append({
            'word': word,
            'start': word_start,
            'end': word_end
        })

    labeled_words = []
    for word_info in words:
        word_label = "Neutralna"
        for start_idx, end_idx, label in labels:
            if word_info['start'] >= start_idx and word_info['end'] <= end_idx:
                word_label = label
        labeled_words.append({
            'word': word_info['word'],
            'label': word_label
        })

    return labeled_words

In [3]:
json_files = list(Path("./data/ground_truth").rglob("*.jsonl"))

In [4]:
raw_dataset = []

for file in json_files:
    with open(file, "r") as f:
        lines = [json.loads(line) for line in f]
    raw_dataset.extend(lines)

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
def filter_labels(labels):
    return [label for label in labels if label[2] not in ["Mowa nienawiści", "Neutralny"]]    

In [7]:
for sample in raw_dataset:
    sample["label"] = filter_labels(sample["label"])

In [8]:
processed_dataset = []

for sample in raw_dataset:
    sample["label"]
    labeled_words = label_words(sample)
    processed_dataset.append(labeled_words)


train_ds, test_ds = train_test_split(processed_dataset, test_size=0.1, random_state=42)

In [9]:
train_ds_processed = []
for sample in train_ds:
    words = [word["word"] for word in sample]
    label = [word["label"] for word in sample]
    new_sample = {"text": words, "labels": label}
    train_ds_processed.append(new_sample)

test_ds_processed = []
for sample in test_ds:
    words = [word["word"] for word in sample]
    label = [word["label"] for word in sample]
    new_sample = {"text": words, "labels": label}
    test_ds_processed.append(new_sample)

In [10]:
with open("./tokens/train.jsonl", 'w') as f:
    for item in train_ds_processed:
        f.write(json.dumps(item) + '\n')


with open("./tokens/test.jsonl", 'w') as f:
    for item in test_ds_processed:
        f.write(json.dumps(item) + '\n')

In [11]:
data_files = {"train": "train.jsonl", "test": "test.jsonl"}
dataset = load_dataset("./tokens", data_files=data_files)
print(dataset)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 539
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 60
    })
})


In [12]:
labels = []
for sentence in dataset["train"]["labels"]:
    labels += sentence

In [13]:
sorted_labels = sorted(list(set(labels)))
label2id = dict(zip(sorted_labels, range(0, len(sorted_labels))))
id2label = dict(zip(range(0, len(sorted_labels)), sorted_labels))

In [14]:
tokenizer = AutoTokenizer.from_pretrained("sdadas/polish-gpt2-medium", add_prefix_space=True)
tokenizer.pad_token = tokenizer.eos_token
model = GPT2ForTokenClassification.from_pretrained("sdadas/polish-gpt2-medium",
                                                        num_labels=len(label2id),
                                                        label2id=label2id,
                                                        id2label=id2label)

classifier = pipeline("token-classification", model=model, tokenizer=tokenizer, device="cuda")

Some weights of GPT2ForTokenClassification were not initialized from the model checkpoint at sdadas/polish-gpt2-medium and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
dataset["train"]

Dataset({
    features: ['text', 'labels'],
    num_rows: 539
})

In [None]:
def tokenize_and_align_labels(data, tokenizer, label_map):
    tokenized_data = []
    
    for entry in data:
        tokens = tokenizer(
            entry["text"],
            is_split_into_words=True,
            truncation=True,
            padding="max_length",
            max_length=128,
        )
        word_ids = tokens.word_ids()
        labels = []
        
        for word_id in word_ids:
            if word_id is None or word_id == tokenizer.eos_token_id:
                labels.append(-100)
            else:
                labels.append(label_map[entry["labels"][word_id]])
        
        tokens["labels"] = labels
        tokenized_data.append(tokens)
    
    return tokenized_data

processed_data_train = tokenize_and_align_labels(dataset["train"], tokenizer, label2id)
processed_data_test = tokenize_and_align_labels(dataset["test"], tokenizer, label2id)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 3, 4, 5, 6, 7, 7, 8, 8, 8, 9, 10, 11, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 3, 4, 5, 6, 7, 7, 8, 8, 8, 9, 10, 11, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None

In [39]:
lora_config = LoraConfig(
    task_type=TaskType.TOKEN_CLS, r=64, lora_alpha=1, lora_dropout=0.1
)

peft_model = get_peft_model(model, lora_config)
print(peft_model.model)

GPT2ForTokenClassification(
  (transformer): GPT2Model(
    (wte): Embedding(51200, 1024)
    (wpe): Embedding(2048, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): lora.Linear(
            (base_layer): Conv1D(nf=3072, nx=1024)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=1024, out_features=64, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=64, out_features=3072, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (c_proj): Conv1D(nf=1024, nx=1024)
          (attn_dropout): Dr

In [41]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=-1)
    return {"accuracy": (predictions == labels).mean()*100}


trainer = Trainer(
    model=peft_model,
    args=TrainingArguments(
        output_dir="gpt2-token-clf",
        learning_rate=1e-3,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=15,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    ),
    train_dataset=processed_data_train,
    eval_dataset=processed_data_test,
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

print("Starting to train...")
trainer.train()

Starting to train...


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.091526,21.445312
2,No log,0.086768,21.458333
3,No log,0.104139,21.445312
4,0.091400,0.109532,21.432292
5,0.091400,0.120329,21.419271
6,0.091400,0.164353,21.354167
7,0.091400,0.121759,21.471354
8,0.071000,0.151045,21.328125
9,0.071000,0.161728,21.354167
10,0.071000,0.158366,21.393229


TrainOutput(global_step=2025, training_loss=0.07211170379026437, metrics={'train_runtime': 317.0162, 'train_samples_per_second': 25.503, 'train_steps_per_second': 6.388, 'total_flos': 1916278064870400.0, 'train_loss': 0.07211170379026437, 'epoch': 15.0})