In [25]:
import json
from pathlib import Path

from datasets import load_dataset
from transformers import AutoTokenizer, GPT2ForTokenClassification, DataCollatorWithPadding, Trainer, TrainingArguments, pipeline
from peft import LoraConfig, TaskType, get_peft_model
import numpy as np
import torch

In [26]:
import re

def label_words(data):
    text = data['text']
    labels = data['label']
    
    words = []
    start = 0
    for match in re.finditer(r'\S+', text):
        word = match.group()
        word_start = match.start()
        word_end = match.end()
        words.append({
            'word': word,
            'start': word_start,
            'end': word_end
        })

    labeled_words = []
    for word_info in words:
        word_label = "Neutralna"
        for start_idx, end_idx, label in labels:
            if word_info['start'] >= start_idx and word_info['end'] <= end_idx:
                word_label = label
        labeled_words.append({
            'word': word_info['word'],
            'label': word_label
        })

    return labeled_words

In [27]:
data_files = {"train": "train_filled.jsonl", "test": "test.jsonl"}
dataset = load_dataset("./tokens", data_files=data_files)
print(dataset)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 1078
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 60
    })
})


In [28]:
labels = []
for sentence in dataset["train"]["labels"]:
    labels += sentence

In [29]:
sorted_labels = sorted(list(set(labels)))
label2id = dict(zip(sorted_labels, range(0, len(sorted_labels))))
id2label = dict(zip(range(0, len(sorted_labels)), sorted_labels))

In [30]:
tokenizer = AutoTokenizer.from_pretrained("sdadas/polish-gpt2-medium", add_prefix_space=True)
tokenizer.pad_token = tokenizer.eos_token
model = GPT2ForTokenClassification.from_pretrained("sdadas/polish-gpt2-medium",
                                                        num_labels=len(label2id),
                                                        label2id=label2id,
                                                        id2label=id2label)

classifier = pipeline("token-classification", model=model, tokenizer=tokenizer, device="cuda")

Some weights of GPT2ForTokenClassification were not initialized from the model checkpoint at sdadas/polish-gpt2-medium and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
def tokenize_and_align_labels(data, tokenizer, label_map):
    tokenized_data = []
    
    for entry in data:
        tokens = tokenizer(
            entry["text"],
            is_split_into_words=True,
            truncation=True,
            padding="max_length",
            max_length=128,
        )
        word_ids = tokens.word_ids()
        labels = []
        
        for word_id in word_ids:
            if word_id is None or word_id == tokenizer.eos_token_id:
                labels.append(-100)
            else:
                labels.append(label_map[entry["labels"][word_id]])
        
        tokens["labels"] = labels
        tokenized_data.append(tokens)
    
    return tokenized_data

processed_data_train = tokenize_and_align_labels(dataset["train"], tokenizer, label2id)
processed_data_test = tokenize_and_align_labels(dataset["test"], tokenizer, label2id)

In [32]:
lora_config = LoraConfig(
    task_type=TaskType.TOKEN_CLS, r=64, lora_alpha=1, lora_dropout=0.1
)

peft_model = get_peft_model(model, lora_config)
print(peft_model.model)

GPT2ForTokenClassification(
  (transformer): GPT2Model(
    (wte): Embedding(51200, 1024)
    (wpe): Embedding(2048, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): lora.Linear(
            (base_layer): Conv1D(nf=3072, nx=1024)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=1024, out_features=64, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=64, out_features=3072, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (c_proj): Conv1D(nf=1024, nx=1024)
          (attn_dropout): Dr



In [33]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=-1)
    return {"accuracy": (predictions == labels).mean()*100}


trainer = Trainer(
    model=peft_model,
    args=TrainingArguments(
        output_dir="gpt2-token-clf",
        learning_rate=1e-3,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=15,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    ),
    train_dataset=processed_data_train,
    eval_dataset=processed_data_test,
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

print("Starting to train...")
trainer.train()

Starting to train...


  0%|          | 0/4050 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.05110986903309822, 'eval_accuracy': 21.536458333333332, 'eval_runtime': 0.9944, 'eval_samples_per_second': 60.337, 'eval_steps_per_second': 15.084, 'epoch': 1.0}
{'loss': 0.0953, 'grad_norm': 0.3646955192089081, 'learning_rate': 0.0008765432098765433, 'epoch': 1.85}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.09211921691894531, 'eval_accuracy': 21.3671875, 'eval_runtime': 1.0504, 'eval_samples_per_second': 57.12, 'eval_steps_per_second': 14.28, 'epoch': 2.0}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.08411039412021637, 'eval_accuracy': 21.484375, 'eval_runtime': 1.0397, 'eval_samples_per_second': 57.706, 'eval_steps_per_second': 14.427, 'epoch': 3.0}
{'loss': 0.0578, 'grad_norm': 0.3573233187198639, 'learning_rate': 0.0007530864197530865, 'epoch': 3.7}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.11131899058818817, 'eval_accuracy': 21.3671875, 'eval_runtime': 1.0404, 'eval_samples_per_second': 57.669, 'eval_steps_per_second': 14.417, 'epoch': 4.0}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.1385037750005722, 'eval_accuracy': 21.276041666666668, 'eval_runtime': 1.0426, 'eval_samples_per_second': 57.549, 'eval_steps_per_second': 14.387, 'epoch': 5.0}
{'loss': 0.0547, 'grad_norm': 0.024033259600400925, 'learning_rate': 0.0006296296296296296, 'epoch': 5.56}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.1490623503923416, 'eval_accuracy': 21.302083333333332, 'eval_runtime': 1.0444, 'eval_samples_per_second': 57.451, 'eval_steps_per_second': 14.363, 'epoch': 6.0}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.12243536114692688, 'eval_accuracy': 21.3671875, 'eval_runtime': 1.0498, 'eval_samples_per_second': 57.152, 'eval_steps_per_second': 14.288, 'epoch': 7.0}
{'loss': 0.0463, 'grad_norm': 0.05410575866699219, 'learning_rate': 0.0005061728395061728, 'epoch': 7.41}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.18335968255996704, 'eval_accuracy': 21.302083333333332, 'eval_runtime': 1.0521, 'eval_samples_per_second': 57.029, 'eval_steps_per_second': 14.257, 'epoch': 8.0}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.19152198731899261, 'eval_accuracy': 21.315104166666668, 'eval_runtime': 1.0419, 'eval_samples_per_second': 57.587, 'eval_steps_per_second': 14.397, 'epoch': 9.0}
{'loss': 0.0425, 'grad_norm': 0.026814542710781097, 'learning_rate': 0.00038271604938271603, 'epoch': 9.26}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.19353152811527252, 'eval_accuracy': 21.276041666666668, 'eval_runtime': 1.0482, 'eval_samples_per_second': 57.241, 'eval_steps_per_second': 14.31, 'epoch': 10.0}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.1877882480621338, 'eval_accuracy': 21.276041666666668, 'eval_runtime': 1.0346, 'eval_samples_per_second': 57.993, 'eval_steps_per_second': 14.498, 'epoch': 11.0}
{'loss': 0.0424, 'grad_norm': 0.02487647905945778, 'learning_rate': 0.00025925925925925926, 'epoch': 11.11}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.20605230331420898, 'eval_accuracy': 21.276041666666668, 'eval_runtime': 1.0333, 'eval_samples_per_second': 58.067, 'eval_steps_per_second': 14.517, 'epoch': 12.0}
{'loss': 0.0337, 'grad_norm': 0.2072771042585373, 'learning_rate': 0.00013580246913580247, 'epoch': 12.96}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.2011784464120865, 'eval_accuracy': 21.315104166666668, 'eval_runtime': 1.0315, 'eval_samples_per_second': 58.169, 'eval_steps_per_second': 14.542, 'epoch': 13.0}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.20462816953659058, 'eval_accuracy': 21.315104166666668, 'eval_runtime': 1.0529, 'eval_samples_per_second': 56.988, 'eval_steps_per_second': 14.247, 'epoch': 14.0}
{'loss': 0.0372, 'grad_norm': 0.041078075766563416, 'learning_rate': 1.2345679012345678e-05, 'epoch': 14.81}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.2070254683494568, 'eval_accuracy': 21.315104166666668, 'eval_runtime': 1.0488, 'eval_samples_per_second': 57.208, 'eval_steps_per_second': 14.302, 'epoch': 15.0}
{'train_runtime': 676.9515, 'train_samples_per_second': 23.886, 'train_steps_per_second': 5.983, 'train_loss': 0.051088215068534566, 'epoch': 15.0}


TrainOutput(global_step=4050, training_loss=0.051088215068534566, metrics={'train_runtime': 676.9515, 'train_samples_per_second': 23.886, 'train_steps_per_second': 5.983, 'total_flos': 3832556129740800.0, 'train_loss': 0.051088215068534566, 'epoch': 15.0})