In [35]:
import json
from pathlib import Path

from datasets import load_dataset
from transformers import AutoTokenizer, GPT2ForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments, pipeline
from peft import LoraConfig, TaskType, get_peft_model
import numpy as np
import torch

In [36]:
json_files = list(Path("./data/ground_truth").rglob("*.jsonl"))

In [37]:
full_dataset = []

for file in json_files:
    with open(file, "r") as f:
        lines = [json.loads(line) for line in f]
    full_dataset.extend(lines)

In [38]:
mapping = {
    "Mowa nienawiści": "Hate",
    "Neutralny": "Neutral",
    "Odwracanie": "Neutral",
    "Wzmacnianie": "Hate"
}

def get_label(labels):
    label = labels[0][2]
    if label in mapping:
        label = mapping[label]
    return label

In [39]:
from sklearn.model_selection import train_test_split

In [40]:
full_dataset_filtered = [{"text": sample["text"], "label": get_label(sample["label"])} for sample in full_dataset]

In [41]:
train_ds, test_ds = train_test_split(full_dataset_filtered, test_size=0.1, random_state=42)

In [42]:
with open("./sentences/train.jsonl", 'w') as f:
    for item in train_ds:
        f.write(json.dumps(item) + '\n')


with open("./sentences/test.jsonl", 'w') as f:
    for item in test_ds:
        f.write(json.dumps(item) + '\n')

In [43]:
data_files = {"train": "train.jsonl", "test": "test.jsonl"}
dataset = load_dataset("./sentences", data_files=data_files)
print(dataset)

Generating train split: 539 examples [00:00, 331631.19 examples/s]
Generating test split: 60 examples [00:00, 40629.36 examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 539
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 60
    })
})





In [44]:
sorted_labels = sorted(set([sample["label"] for sample in train_ds]))
label2id = dict(zip(sorted_labels, range(0, len(sorted_labels))))
id2label = dict(zip(range(0, len(sorted_labels)), sorted_labels))

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
model = GPT2ForSequenceClassification.from_pretrained("gpt2",
                                                        num_labels=len(label2id),
                                                        label2id=label2id,
                                                        id2label=id2label)
model.config.pad_token_id = model.config.eos_token_id
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, device="cuda")

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [45]:
def preprocess_function(examples):
    """Preprocess the dataset by returning tokenized examples."""
    tokens = tokenizer(examples["text"], truncation=True, padding=True)
    tokens['label'] = [label2id[l] for l in examples["label"]]
    return tokens

splits = ['train', 'test']

tokenized_ds = {}

for split in splits:
    tokenized_ds[split] = dataset[split].map(preprocess_function, batched=True)

print(tokenized_ds)

Map: 100%|██████████| 539/539 [00:00<00:00, 21333.07 examples/s]
Map: 100%|██████████| 60/60 [00:00<00:00, 13733.06 examples/s]

{'train': Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 539
}), 'test': Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 60
})}





In [46]:
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS, r=64, lora_alpha=1, lora_dropout=0.1
)

peft_model = get_peft_model(model, lora_config)
print(peft_model.model)

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): lora.Linear(
            (base_layer): Conv1D(nf=2304, nx=768)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=768, out_features=64, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=64, out_features=2304, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropou


fan_in_fan_out is set to False but the target module is `Conv1D`. Setting fan_in_fan_out to True.



In [51]:
from sklearn.metrics import f1_score

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    f1 = f1_score(labels, predictions) 
    return {"accuracy": (predictions == labels).mean()*100, "f1": f1}

trainer = Trainer(
    model=peft_model,
    args=TrainingArguments(
        output_dir="gpt-sentence-clf",
        learning_rate=1e-3,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=15,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    ),
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

print("Starting to train...")
trainer.train()

Starting to train...



[A
[A

[A[A                                        
                                                  
  2%|▏         | 31/2025 [15:12<02:37, 12.66it/s]
[A

{'eval_loss': 0.4085150957107544, 'eval_accuracy': 88.33333333333333, 'eval_f1': 0.9320388349514563, 'eval_runtime': 0.2196, 'eval_samples_per_second': 273.267, 'eval_steps_per_second': 68.317, 'epoch': 1.0}



[A
[A

[A[A                                        
                                                  
  2%|▏         | 31/2025 [15:23<02:37, 12.66it/s]
[A

{'eval_loss': 0.5314709544181824, 'eval_accuracy': 90.0, 'eval_f1': 0.94, 'eval_runtime': 0.2306, 'eval_samples_per_second': 260.228, 'eval_steps_per_second': 65.057, 'epoch': 2.0}



[A
[A

[A[A                                        
                                                  
  2%|▏         | 31/2025 [15:34<02:37, 12.66it/s]
[A

{'eval_loss': 0.6052017211914062, 'eval_accuracy': 91.66666666666666, 'eval_f1': 0.9504950495049505, 'eval_runtime': 0.2094, 'eval_samples_per_second': 286.553, 'eval_steps_per_second': 71.638, 'epoch': 3.0}



  2%|▏         | 31/2025 [15:42<02:37, 12.66it/s] 

{'loss': 0.2483, 'grad_norm': 0.1917032152414322, 'learning_rate': 0.0007530864197530865, 'epoch': 3.7}



[A
[A

[A[A                                        
                                                  
  2%|▏         | 31/2025 [15:45<02:37, 12.66it/s]
[A

{'eval_loss': 0.5794306993484497, 'eval_accuracy': 90.0, 'eval_f1': 0.9411764705882353, 'eval_runtime': 0.2252, 'eval_samples_per_second': 266.431, 'eval_steps_per_second': 66.608, 'epoch': 4.0}



[A
[A

[A[A                                        
                                                  
  2%|▏         | 31/2025 [15:55<02:37, 12.66it/s]
[A

{'eval_loss': 0.6940258741378784, 'eval_accuracy': 88.33333333333333, 'eval_f1': 0.9292929292929293, 'eval_runtime': 0.2073, 'eval_samples_per_second': 289.383, 'eval_steps_per_second': 72.346, 'epoch': 5.0}



[A
[A

[A[A                                        
                                                  
  2%|▏         | 31/2025 [16:06<02:37, 12.66it/s]
[A

{'eval_loss': 0.6291674375534058, 'eval_accuracy': 90.0, 'eval_f1': 0.94, 'eval_runtime': 0.2121, 'eval_samples_per_second': 282.936, 'eval_steps_per_second': 70.734, 'epoch': 6.0}



[A
[A

[A[A                                        
                                                  
  2%|▏         | 31/2025 [16:17<02:37, 12.66it/s]
[A

{'eval_loss': 0.8054687976837158, 'eval_accuracy': 90.0, 'eval_f1': 0.94, 'eval_runtime': 0.2097, 'eval_samples_per_second': 286.112, 'eval_steps_per_second': 71.528, 'epoch': 7.0}



  2%|▏         | 31/2025 [16:22<02:37, 12.66it/s]  

{'loss': 0.1665, 'grad_norm': 0.08270744234323502, 'learning_rate': 0.0005061728395061728, 'epoch': 7.41}



[A
[A

[A[A                                        
                                                   
  2%|▏         | 31/2025 [16:28<02:37, 12.66it/s]
[A

{'eval_loss': 0.6615070104598999, 'eval_accuracy': 91.66666666666666, 'eval_f1': 0.9504950495049505, 'eval_runtime': 0.2168, 'eval_samples_per_second': 276.759, 'eval_steps_per_second': 69.19, 'epoch': 8.0}



[A
[A

[A[A                                        
                                                   
  2%|▏         | 31/2025 [16:39<02:37, 12.66it/s]
[A

{'eval_loss': 0.6601337194442749, 'eval_accuracy': 90.0, 'eval_f1': 0.94, 'eval_runtime': 0.2106, 'eval_samples_per_second': 284.94, 'eval_steps_per_second': 71.235, 'epoch': 9.0}



[A
[A

[A[A                                        
                                                   
  2%|▏         | 31/2025 [16:49<02:37, 12.66it/s]
[A

{'eval_loss': 0.8051411509513855, 'eval_accuracy': 90.0, 'eval_f1': 0.94, 'eval_runtime': 0.2235, 'eval_samples_per_second': 268.432, 'eval_steps_per_second': 67.108, 'epoch': 10.0}



[A
[A

[A[A                                        
                                                   
  2%|▏         | 31/2025 [17:00<02:37, 12.66it/s]
[A

{'eval_loss': 0.647074818611145, 'eval_accuracy': 91.66666666666666, 'eval_f1': 0.9504950495049505, 'eval_runtime': 0.2048, 'eval_samples_per_second': 292.945, 'eval_steps_per_second': 73.236, 'epoch': 11.0}



  2%|▏         | 31/2025 [17:01<02:37, 12.66it/s]  

{'loss': 0.1069, 'grad_norm': 0.0074486639350652695, 'learning_rate': 0.00025925925925925926, 'epoch': 11.11}



[A
[A

[A[A                                        
                                                   
  2%|▏         | 31/2025 [17:10<02:37, 12.66it/s]
[A

{'eval_loss': 0.6677751541137695, 'eval_accuracy': 91.66666666666666, 'eval_f1': 0.9504950495049505, 'eval_runtime': 0.2281, 'eval_samples_per_second': 263.0, 'eval_steps_per_second': 65.75, 'epoch': 12.0}



[A
[A

[A[A                                        
                                                   
  2%|▏         | 31/2025 [17:21<02:37, 12.66it/s]
[A

{'eval_loss': 0.6875895261764526, 'eval_accuracy': 91.66666666666666, 'eval_f1': 0.9504950495049505, 'eval_runtime': 0.2152, 'eval_samples_per_second': 278.845, 'eval_steps_per_second': 69.711, 'epoch': 13.0}



[A
[A

[A[A                                        
                                                   
  2%|▏         | 31/2025 [17:32<02:37, 12.66it/s]
[A

{'eval_loss': 0.7215065360069275, 'eval_accuracy': 91.66666666666666, 'eval_f1': 0.9504950495049505, 'eval_runtime': 0.2104, 'eval_samples_per_second': 285.145, 'eval_steps_per_second': 71.286, 'epoch': 14.0}



  2%|▏         | 31/2025 [17:40<02:37, 12.66it/s]  

{'loss': 0.0867, 'grad_norm': 0.036627449095249176, 'learning_rate': 1.2345679012345678e-05, 'epoch': 14.81}



[A
[A

[A[A                                        
                                                   
  2%|▏         | 31/2025 [17:43<02:37, 12.66it/s]
[A

{'eval_loss': 0.7580923438072205, 'eval_accuracy': 91.66666666666666, 'eval_f1': 0.9504950495049505, 'eval_runtime': 0.2068, 'eval_samples_per_second': 290.183, 'eval_steps_per_second': 72.546, 'epoch': 15.0}



100%|██████████| 2025/2025 [02:42<00:00, 12.46it/s]

{'train_runtime': 162.4571, 'train_samples_per_second': 49.767, 'train_steps_per_second': 12.465, 'train_loss': 0.15194923106534983, 'epoch': 15.0}





TrainOutput(global_step=2025, training_loss=0.15194923106534983, metrics={'train_runtime': 162.4571, 'train_samples_per_second': 49.767, 'train_steps_per_second': 12.465, 'total_flos': 797245025955840.0, 'train_loss': 0.15194923106534983, 'epoch': 15.0})

In [52]:
from sklearn.manifold import TSNE
import pandas as pd
import plotly.express as px
texts = [sample["text"] for sample in test_ds]
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)

transformer_encoder = peft_model.base_model.model.transformer

transformer_encoder = transformer_encoder.cpu()
transformer_encoder.eval()
with torch.no_grad():
    outputs = transformer_encoder(**inputs)

tsne = TSNE()
to_vis = tsne.fit_transform(outputs.last_hidden_state[:,0,:].squeeze().numpy())

df = pd.DataFrame(to_vis)
df["label"] = [post["label"] for post in test_ds]
df["text"] = [post["text"] for post in test_ds]
fig = px.scatter(df, x=0, y=1, hover_data="text", color="label")
fig.show()