In [1]:
import json
from pathlib import Path

from datasets import load_dataset
from transformers import AutoTokenizer, GPT2ForTokenClassification, DataCollatorWithPadding, Trainer, TrainingArguments, pipeline
from peft import LoraConfig, TaskType, get_peft_model
import numpy as np
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Collect every ground-truth JSONL file below ./data/ground_truth.
# rglob yields paths in filesystem order, which differs between machines; sorting
# keeps the sample order (and therefore the seeded train/test split) reproducible.
json_files = sorted(Path("./data/ground_truth").rglob("*.jsonl"))

In [3]:
# Read every JSONL file into one flat list of annotated samples (one JSON object per line).
raw_dataset = []

for file in json_files:
    # Decode explicitly as UTF-8: the label filter used later expects non-ASCII
    # label names such as "Mowa nienawiści", which the platform default encoding
    # (e.g. cp1252 on Windows) may not decode correctly.
    with open(file, "r", encoding="utf-8") as f:
        # Skip blank lines (e.g. a trailing empty line), which json.loads would reject.
        lines = [json.loads(line) for line in f if line.strip()]
    raw_dataset.extend(lines)

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
def get_tokens_labels(text, labels):
    """Turn span annotations into a list of ``{"text", "label"}`` records.

    Args:
        text: The annotated string; spans are cut out of it by slicing.
        labels: Iterable of annotations indexed as ``[start, end, name]``.

    Returns:
        One dict per kept annotation, in input order. Annotations named
        "Hate", "Neutralny" or "Mowa nienawiści" are skipped, and the name
        "Strenghtening" is replaced by "Wzmacnianie".
    """
    skipped_names = ("Hate", "Neutralny", "Mowa nienawiści")
    return [
        {
            "text": text[span[0]: span[1]],
            "label": "Wzmacnianie" if span[2] == "Strenghtening" else span[2],
        }
        for span in labels
        if span[2] not in skipped_names
    ]

In [6]:
# Flatten the per-sample annotations into a single list of {"text", "label"} records.
full_dataset_filtered = [
    labeled_token
    for sample in raw_dataset
    for labeled_token in get_tokens_labels(sample["text"], sample["label"])
]

In [7]:
# Hold out 10% of the labeled records for evaluation, using a fixed random_state.
train_ds, test_ds = train_test_split(
    full_dataset_filtered,
    random_state=42,
    test_size=0.1,
)

In [8]:
# Persist both splits as JSON Lines under ./tokens so they can be read back with load_dataset.
tokens_dir = Path("./tokens")
# open(..., "w") does not create missing parent directories, so make sure the
# target folder exists when the notebook runs on a fresh checkout.
tokens_dir.mkdir(parents=True, exist_ok=True)

for split_name, split_items in (("train", train_ds), ("test", test_ds)):
    with open(tokens_dir / f"{split_name}.jsonl", "w", encoding="utf-8") as f:
        for item in split_items:
            # One JSON object per line.
            f.write(json.dumps(item) + "\n")

In [9]:
# Map each split name to its JSONL file inside ./tokens and load both at once.
data_files = dict(train="train.jsonl", test="test.jsonl")
dataset = load_dataset(
    "./tokens",
    data_files=data_files,
)
print(dataset)

Generating train split: 79 examples [00:00, 49001.78 examples/s]
Generating test split: 9 examples [00:00, 8412.91 examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 79
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 9
    })
})





In [22]:
# Build the label vocabulary from both splits: with a train-only vocabulary, a class
# that occurs only in the held-out split raises KeyError when its id is looked up.
sorted_labels = sorted({sample["label"] for sample in (*train_ds, *test_ds)})
label2id = {label: idx for idx, label in enumerate(sorted_labels)}
id2label = dict(enumerate(sorted_labels))

tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token as padding token so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token
model = GPT2ForTokenClassification.from_pretrained(
    "gpt2",
    num_labels=len(label2id),
    label2id=label2id,
    id2label=id2label,
)

# Use the GPU when one is available, otherwise fall back to CPU instead of
# failing at pipeline construction on machines without CUDA.
pipeline_device = "cuda" if torch.cuda.is_available() else "cpu"
classifier = pipeline(
    "token-classification",
    model=model,
    tokenizer=tokenizer,
    device=pipeline_device,
)

Some weights of GPT2ForTokenClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
def preprocess_function(examples):
    """Tokenize a batch of examples and attach one class id per token position.

    Args:
        examples: Batch mapping with a "text" list and a parallel "label" list
            of label names (keys of the notebook-level ``label2id``).

    Returns:
        The tokenizer output, extended with a "label" entry holding, for each
        example, its class id repeated once per entry of its ``input_ids``.
    """
    encoded = tokenizer(examples["text"], truncation=True, padding=True)

    # NOTE(review): because padding=True is applied above, the repeated class id
    # also covers padded positions — consider marking those with -100 together
    # with a metric that ignores -100; confirm before changing.
    encoded["label"] = [
        [label2id[label_name]] * len(token_ids)
        for token_ids, label_name in zip(encoded["input_ids"], examples["label"])
    ]
    return encoded

splits = ['train', 'test']

# Run the batched preprocessing over each split and keep the results keyed by split name.
tokenized_ds = {
    split: dataset[split].map(preprocess_function, batched=True)
    for split in splits
}

print(tokenized_ds)


Map: 100%|██████████| 79/79 [00:00<00:00, 14883.44 examples/s]

Map: 100%|██████████| 9/9 [00:00<00:00, 2503.40 examples/s]

{'train': Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 79
}), 'test': Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 9
})}





In [None]:
# LoRA adapter settings for the token-classification model.
# NOTE(review): lora_alpha=1 with r=64 looks like a very small adapter scaling
# (presumably lora_alpha / r) — confirm this is intended.
lora_config = LoraConfig(
    r=64,
    lora_alpha=1,
    lora_dropout=0.1,
    task_type=TaskType.TOKEN_CLS,
)

# Wrap the base model with the adapters and show the resulting module tree.
peft_model = get_peft_model(model, lora_config)
print(peft_model.model)

GPT2ForTokenClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): lora.Linear(
            (base_layer): Conv1D(nf=2304, nx=768)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=768, out_features=64, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=64, out_features=2304, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p

In [None]:
def compute_metrics(eval_pred):
    """Compute token-level accuracy in percent, ignoring masked label positions.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds the
            per-class scores on its last axis; ``labels`` holds the class ids
            with the matching leading shape.

    Returns:
        ``{"accuracy": value}`` where ``value`` is the percentage of positions
        whose arg-max prediction equals the label. Positions labelled -100 are
        excluded; if no position remains, the accuracy is reported as 0.0.
    """
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=-1)
    labels = np.asarray(labels)

    # -100 is the ignore index used for positions that carry no label (it is
    # also the fill value when label arrays of different lengths are padded
    # together), so counting it would deflate the score.
    valid = labels != -100
    if not valid.any():
        # Avoid the NaN (and runtime warning) from taking the mean of an empty selection.
        return {"accuracy": 0.0}
    return {"accuracy": (predictions[valid] == labels[valid]).mean() * 100}


# Training hyper-parameters: evaluate and checkpoint once per epoch, and reload
# the best checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir="gpt2-token-clf",
    num_train_epochs=15,
    learning_rate=2e-3,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Train the LoRA-wrapped model on the tokenized train split and evaluate on the test split.
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

print("Starting to train...")
trainer.train()

  0%|          | 0/300 [00:57<?, ?it/s]


Starting to train...


  5%|▌         | 15/300 [00:00<00:06, 42.33it/s]
[A

[A[A                               
                                                
  7%|▋         | 20/300 [00:00<00:06, 42.33it/s]
[A

{'eval_loss': 0.0018993266858160496, 'eval_accuracy': 100.0, 'eval_runtime': 0.0232, 'eval_samples_per_second': 388.401, 'eval_steps_per_second': 129.467, 'epoch': 1.0}


 12%|█▏        | 35/300 [00:01<00:07, 33.59it/s]
[A

[A[A                               
                                                
 13%|█▎        | 40/300 [00:01<00:07, 33.59it/s]
[A

{'eval_loss': 0.000644930056296289, 'eval_accuracy': 100.0, 'eval_runtime': 0.0146, 'eval_samples_per_second': 615.041, 'eval_steps_per_second': 205.014, 'epoch': 2.0}


 18%|█▊        | 55/300 [00:02<00:09, 26.29it/s]
[A
[A                                    

                                                
 20%|██        | 60/300 [00:02<00:09, 26.29it/s]
[A

{'eval_loss': 0.0025543842930346727, 'eval_accuracy': 100.0, 'eval_runtime': 0.0153, 'eval_samples_per_second': 587.328, 'eval_steps_per_second': 195.776, 'epoch': 3.0}


 25%|██▌       | 75/300 [00:03<00:07, 31.94it/s]
[A

[A[A                               
                                                
 27%|██▋       | 80/300 [00:03<00:06, 31.94it/s]
[A

{'eval_loss': 0.002782452618703246, 'eval_accuracy': 100.0, 'eval_runtime': 0.0163, 'eval_samples_per_second': 552.553, 'eval_steps_per_second': 184.184, 'epoch': 4.0}


 32%|███▏      | 95/300 [00:03<00:06, 32.95it/s]
[A

[A[A                               
                                                
 33%|███▎      | 100/300 [00:03<00:06, 32.95it/s]
[A

{'eval_loss': 0.0007988086435943842, 'eval_accuracy': 100.0, 'eval_runtime': 0.0149, 'eval_samples_per_second': 603.555, 'eval_steps_per_second': 201.185, 'epoch': 5.0}


 38%|███▊      | 115/300 [00:04<00:05, 34.83it/s]
[A
[A                                    

                                                 
 40%|████      | 120/300 [00:04<00:05, 34.83it/s]
[A

{'eval_loss': 0.00108518882188946, 'eval_accuracy': 100.0, 'eval_runtime': 0.0171, 'eval_samples_per_second': 526.805, 'eval_steps_per_second': 175.602, 'epoch': 6.0}


 45%|████▌     | 135/300 [00:05<00:05, 31.54it/s]
[A

[A[A                               
                                                 
 47%|████▋     | 140/300 [00:05<00:05, 31.54it/s]
[A

{'eval_loss': 0.0011629619402810931, 'eval_accuracy': 100.0, 'eval_runtime': 0.0151, 'eval_samples_per_second': 597.498, 'eval_steps_per_second': 199.166, 'epoch': 7.0}


 52%|█████▏    | 155/300 [00:06<00:04, 31.48it/s]
[A

[A[A                               
                                                 
 53%|█████▎    | 160/300 [00:06<00:04, 31.48it/s]
[A

{'eval_loss': 0.0007710996433161199, 'eval_accuracy': 100.0, 'eval_runtime': 0.0159, 'eval_samples_per_second': 567.411, 'eval_steps_per_second': 189.137, 'epoch': 8.0}


 58%|█████▊    | 175/300 [00:06<00:03, 35.25it/s]
[A

[A[A                               
                                                 
 60%|██████    | 180/300 [00:06<00:03, 35.25it/s]
[A

{'eval_loss': 0.00046286990982480347, 'eval_accuracy': 100.0, 'eval_runtime': 0.0154, 'eval_samples_per_second': 583.813, 'eval_steps_per_second': 194.604, 'epoch': 9.0}


 65%|██████▌   | 195/300 [00:07<00:03, 32.73it/s]
[A

[A[A                               
                                                 
 67%|██████▋   | 200/300 [00:07<00:03, 32.73it/s]
[A

{'eval_loss': 0.0003071642422582954, 'eval_accuracy': 100.0, 'eval_runtime': 0.0152, 'eval_samples_per_second': 591.247, 'eval_steps_per_second': 197.082, 'epoch': 10.0}


 72%|███████▏  | 215/300 [00:08<00:02, 34.19it/s]
[A

[A[A                               
                                                 
 73%|███████▎  | 220/300 [00:08<00:02, 34.19it/s]
[A

{'eval_loss': 0.0007199871470220387, 'eval_accuracy': 100.0, 'eval_runtime': 0.0147, 'eval_samples_per_second': 610.258, 'eval_steps_per_second': 203.419, 'epoch': 11.0}


 78%|███████▊  | 235/300 [00:09<00:02, 31.78it/s]
[A

[A[A                               
                                                 
 80%|████████  | 240/300 [00:09<00:01, 31.78it/s]
[A

{'eval_loss': 0.0009816264500841498, 'eval_accuracy': 100.0, 'eval_runtime': 0.016, 'eval_samples_per_second': 561.863, 'eval_steps_per_second': 187.288, 'epoch': 12.0}


 85%|████████▌ | 255/300 [00:09<00:01, 32.50it/s]
[A
[A                                    

                                                 
 87%|████████▋ | 260/300 [00:09<00:01, 32.50it/s]
[A

{'eval_loss': 0.0007746624178253114, 'eval_accuracy': 100.0, 'eval_runtime': 0.0152, 'eval_samples_per_second': 591.821, 'eval_steps_per_second': 197.274, 'epoch': 13.0}


 92%|█████████▏| 275/300 [00:10<00:00, 32.80it/s]
[A

[A[A                               
                                                 
 93%|█████████▎| 280/300 [00:10<00:00, 32.80it/s]
[A

{'eval_loss': 0.0006032853270880878, 'eval_accuracy': 100.0, 'eval_runtime': 0.0147, 'eval_samples_per_second': 610.386, 'eval_steps_per_second': 203.462, 'epoch': 14.0}


 98%|█████████▊| 295/300 [00:11<00:00, 31.65it/s]
[A

[A[A                               
                                                 
100%|██████████| 300/300 [00:12<00:00, 31.65it/s]
[A

{'eval_loss': 0.0006287709693424404, 'eval_accuracy': 100.0, 'eval_runtime': 0.0157, 'eval_samples_per_second': 571.725, 'eval_steps_per_second': 190.575, 'epoch': 15.0}



100%|██████████| 300/300 [00:12<00:00, 23.83it/s]

{'train_runtime': 12.5855, 'train_samples_per_second': 94.156, 'train_steps_per_second': 23.837, 'train_loss': 0.496390635172526, 'epoch': 15.0}





TrainOutput(global_step=300, training_loss=0.496390635172526, metrics={'train_runtime': 12.5855, 'train_samples_per_second': 94.156, 'train_steps_per_second': 23.837, 'total_flos': 10566444521700.0, 'train_loss': 0.496390635172526, 'epoch': 15.0})