In [1]:
import json
from pathlib import Path

from datasets import load_dataset
from transformers import AutoTokenizer, BertForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments, pipeline
from peft import LoraConfig, TaskType, get_peft_model
import numpy as np
import torch

In [2]:
# Collect every JSONL annotation file under the ground-truth directory.
# Sorted so the file order (and therefore anything derived from it, such as a
# seeded train/test split) does not depend on filesystem enumeration order.
json_files = sorted(Path("./data/ground_truth").rglob("*.jsonl"))

In [3]:
# Read every JSONL file into one flat list of annotation records.
full_dataset = []

for file in json_files:
    # Explicit UTF-8: the records carry non-ASCII text (e.g. the Polish label
    # names), and the platform default encoding is not guaranteed to be UTF-8.
    with open(file, "r", encoding="utf-8") as f:
        # Skip blank lines (such as a trailing empty line), which json.loads
        # would reject.
        full_dataset.extend(json.loads(line) for line in f if line.strip())

In [4]:
# Translation table from the raw (Polish) annotation names to the two class
# names used for training.
mapping = {
    "Mowa nienawiści": "Hate",
    "Neutralny": "Neutral",
    "Odwracanie": "Neutral",
    "Wzmacnianie": "Hate",
}


def get_label(labels):
    """Return the class name for a sample's annotation list.

    Only the first annotation is consulted, and its third element is taken as
    the raw label. Raw labels listed in ``mapping`` are translated; any other
    value is returned unchanged.
    """
    raw_label = labels[0][2]
    return mapping.get(raw_label, raw_label)

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Reduce each raw record to the two fields used downstream: the text and the
# class name resolved by get_label.
full_dataset_filtered = [
    dict(text=record["text"], label=get_label(record["label"]))
    for record in full_dataset
]

In [7]:
# Hold out 10% of the samples as the test split; the fixed random_state keeps
# the split repeatable for a given input order.
train_ds, test_ds = train_test_split(
    full_dataset_filtered,
    test_size=0.1,
    random_state=42,
)

In [8]:
# Persist both splits as JSON Lines files (one JSON object per line).
sentences_dir = Path("./sentences")
# open(..., "w") does not create missing directories, so make sure the target
# directory exists before writing.
sentences_dir.mkdir(parents=True, exist_ok=True)

for split_name, split_rows in (("train", train_ds), ("test", test_ds)):
    with open(sentences_dir / f"{split_name}.jsonl", "w", encoding="utf-8") as f:
        for item in split_rows:
            f.write(json.dumps(item) + "\n")

In [9]:
# Load the two JSONL files from ./sentences as the "train" and "test" splits
# and print the resulting split summary.
data_files = dict(train="train.jsonl", test="test.jsonl")
dataset = load_dataset("./sentences", data_files=data_files)
print(dataset)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 539
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 60
    })
})


In [10]:
# Show the training split's summary (features and row count) as the cell output.
dataset["train"]

Dataset({
    features: ['text', 'label'],
    num_rows: 539
})

In [11]:
# Build the label <-> id maps from BOTH splits: preprocessing looks every label
# up in label2id, so a class that happens to occur only in the test split must
# still receive an id (otherwise that lookup raises KeyError). Sorting keeps
# the id assignment stable between runs.
sorted_labels = sorted({sample["label"] for sample in [*train_ds, *test_ds]})
label2id = {label: idx for idx, label in enumerate(sorted_labels)}
id2label = dict(enumerate(sorted_labels))

checkpoint = "allegro/herbert-base-cased"

# return_tensors is an option of the tokenizer *call*, not of loading the
# tokenizer, so it is not passed to from_pretrained.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Classification model whose output size and label names follow the maps above.
model = BertForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(label2id),
    label2id=label2id,
    id2label=id2label,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allegro/herbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
def preprocess_function(examples):
    """Tokenize a batch of examples and attach integer class ids.

    Args:
        examples: Batch with a "text" list and a parallel "label" list of
            class names.

    Returns:
        The tokenizer output for the texts, with an added "label" entry
        holding the ``label2id`` id of each class name.

    Raises:
        KeyError: If a class name is missing from ``label2id``.
    """
    # Truncate but do not pad here: the Trainer is given a
    # DataCollatorWithPadding, which pads each batch to its own longest
    # sequence. Padding every row to the model maximum up front would make
    # that collator a no-op and spend compute on padding tokens.
    tokens = tokenizer(examples["text"], truncation=True)
    tokens["label"] = [label2id[name] for name in examples["label"]]
    return tokens

splits = ['train', 'test']

# Run the same batched preprocessing over every split and keep the results
# keyed by split name.
tokenized_ds = {
    split: dataset[split].map(preprocess_function, batched=True)
    for split in splits
}

print(tokenized_ds)

Map:   0%|          | 0/539 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

{'train': Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 539
}), 'test': Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 60
})}


In [None]:
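# NOTE: disabled experiment, kept for reference only (this cell is not executed).
# It would replace the model's classification head with a two-layer MLP
# (768 -> 384 -> 2).
# import torch.nn as nn

# model.classifier = nn.Sequential(
#     nn.Linear(768, 384),
#     nn.ReLU(),
#     nn.Linear(384, 2),
# )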

In [17]:
# LoRA adapter settings for sequence classification:
# rank 64, alpha 1, dropout 0.1 on the adapter path.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=64,
    lora_alpha=1,
    lora_dropout=0.1,
)

# Wrap the classifier with the adapters and print the wrapped encoder so the
# injected LoRA layers can be inspected.
peft_model = get_peft_model(model, lora_config)
print(peft_model.bert)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(50000, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): lora.Linear(
              (base_layer): Linear(in_features=768, out_features=768, bias=True)
              (lora_dropout): ModuleDict(
                (default): Dropout(p=0.1, inplace=False)
              )
              (lora_A): ModuleDict(
                (default): Linear(in_features=768, out_features=64, bias=False)
              )
              (lora_B): ModuleDict(
                (default): Linear(in_features=64, out_features=768, bias=False)
              )
              (lora_embedding_A

In [18]:
from sklearn.metrics import f1_score

def compute_metrics(eval_pred):
    """Compute accuracy (in percent) and F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``predictions`` is reduced
            with an argmax over axis 1 to obtain the predicted class ids.

    Returns:
        Dict with "accuracy" (0-100 scale) and "f1" (as returned by
        ``f1_score``).
    """
    scores, labels = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    accuracy_pct = (predicted_ids == labels).mean() * 100
    # NOTE(review): f1_score is called with its default settings — confirm the
    # default averaging / positive class is the one intended for this task.
    return {"accuracy": accuracy_pct, "f1": f1_score(labels, predicted_ids)}


# Training configuration: evaluate and checkpoint once per epoch, and reload
# the best checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir="bert-sentence-clf",
    learning_rate=2e-3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=15,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# NOTE(review): the "test" split doubles as the per-epoch evaluation set, so it
# also influences which checkpoint is kept — confirm this is intended.
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

print("Starting to train...")
trainer.train()

Starting to train...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.439523,86.666667,0.928571
2,No log,0.574664,86.666667,0.928571
3,No log,0.753725,86.666667,0.925926
4,0.484900,0.539774,91.666667,0.952381
5,0.484900,0.335953,90.0,0.941176
6,0.484900,0.404437,93.333333,0.962264
7,0.484900,0.709045,91.666667,0.950495
8,0.230800,0.441767,93.333333,0.960784
9,0.230800,0.610835,91.666667,0.950495
10,0.230800,0.695224,91.666667,0.950495


TrainOutput(global_step=2025, training_loss=0.21841612876013472, metrics={'train_runtime': 382.5205, 'train_samples_per_second': 21.136, 'train_steps_per_second': 5.294, 'total_flos': 2200519654502400.0, 'train_loss': 0.21841612876013472, 'epoch': 15.0})

In [19]:
from sklearn.manifold import TSNE
import pandas as pd
import plotly.express as px

In [32]:
# Project the encoder's first-token embedding of every test text to 2-D with
# t-SNE and plot the points coloured by class.
texts = [sample["text"] for sample in test_ds]
labels = [sample["label"] for sample in test_ds]
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)

bert = peft_model.base_model.model.bert
# classifier = peft_model.classifier.original_module[0].cpu()

# Run the encoder on the device it already lives on. Calling bert.cpu() would
# move this submodule of peft_model in place and could leave the rest of the
# model on a different device.
device = next(bert.parameters()).device
inputs = inputs.to(device)

bert.eval()
with torch.no_grad():
    outputs = bert(**inputs)

# Keep the result 2-D (one row per text). No squeeze(): with a single text it
# would drop the batch axis and t-SNE would receive a 1-D array.
outputs = outputs.last_hidden_state[:, 0, :].cpu()
# outputs = classifier(outputs)

# Fixed seed so the layout is reproducible between runs.
tsne = TSNE(random_state=42)
# Hand scikit-learn a NumPy array rather than a torch tensor.
to_vis = tsne.fit_transform(outputs.numpy())

df = pd.DataFrame(to_vis)
df["label"] = labels
df["text"] = texts
# hover_data as a list is accepted by every plotly express version.
fig = px.scatter(df, x=0, y=1, hover_data=["text"], color="label")
fig.show()