In [1]:
import json
from pathlib import Path

from datasets import load_dataset
from transformers import AutoTokenizer, BertForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments, pipeline
from peft import LoraConfig, TaskType, get_peft_model
import numpy as np
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
json_files = list(Path("./data/ground_truth").rglob("*.jsonl"))

In [3]:
full_dataset = []

for file in json_files:
    with open(file, "r") as f:
        lines = [json.loads(line) for line in f]
    full_dataset.extend(lines)

In [4]:
mapping = {
    "Mowa nienawiści": "Hate",
    "Neutralny": "Neutral",
    "Odwracanie": "Neutral",
    "Wzmacnianie": "Hate"
}

def get_label(labels):
    label = labels[0][2]
    if label in mapping:
        label = mapping[label]
    return label

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
data_files = {"train": "train_with_masks.jsonl", "test": "test.jsonl"}
dataset = load_dataset("./sentences", data_files=data_files)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1078
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 60
    })
})


In [7]:
sorted_labels = sorted(set([sample["label"] for sample in dataset["train"]]))
label2id = dict(zip(sorted_labels, range(0, len(sorted_labels))))
id2label = dict(zip(range(0, len(sorted_labels)), sorted_labels))

tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased", return_tensors="pt")
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased",
                                                                      num_labels=len(label2id),
                                                                      label2id=label2id,
                                                                      id2label=id2label)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
def preprocess_function(examples):
    """Preprocess the dataset by returning tokenized examples."""
    tokens = tokenizer(examples["text"], padding="max_length", truncation=True)
    tokens['label'] = [label2id[l] for l in examples["label"]]
    return tokens

splits = ['train', 'test']

tokenized_ds = {}

for split in splits:
    tokenized_ds[split] = dataset[split].map(preprocess_function, batched=True)

print(tokenized_ds)

Map: 100%|██████████| 1078/1078 [00:00<00:00, 7601.77 examples/s]
Map: 100%|██████████| 60/60 [00:00<00:00, 6447.82 examples/s]

{'train': Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1078
}), 'test': Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 60
})}





In [9]:
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS, r=64, lora_alpha=1, lora_dropout=0.1
)

peft_model = get_peft_model(model, lora_config)
print(peft_model.bert)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): lora.Linear(
              (base_layer): Linear(in_features=768, out_features=768, bias=True)
              (lora_dropout): ModuleDict(
                (default): Dropout(p=0.1, inplace=False)
              )
              (lora_A): ModuleDict(
                (default): Linear(in_features=768, out_features=64, bias=False)
              )
              (lora_B): ModuleDict(
                (default): Linear(in_features=64, out_features=768, bias=False)
              )
              (lora_embedding_

In [10]:
from sklearn.metrics import f1_score

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    f1 = f1_score(labels, predictions) 
    return {"accuracy": (predictions == labels).mean()*100, "f1": f1}


trainer = Trainer(
    model=peft_model,
    args=TrainingArguments(
        output_dir="bert-sentence-clf",
        learning_rate=2e-3,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=15,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    ),
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

print("Starting to train...")
trainer.train()

Starting to train...


                                                  
  7%|▋         | 270/4050 [00:52<10:32,  5.97it/s]

{'eval_loss': 0.4134162366390228, 'eval_accuracy': 86.66666666666667, 'eval_f1': 0.9285714285714286, 'eval_runtime': 1.1902, 'eval_samples_per_second': 50.412, 'eval_steps_per_second': 12.603, 'epoch': 1.0}


 12%|█▏        | 501/4050 [01:36<10:49,  5.47it/s]

{'loss': 0.4447, 'grad_norm': 0.8074259757995605, 'learning_rate': 0.0017530864197530865, 'epoch': 1.85}


                                                  
 13%|█▎        | 540/4050 [01:44<11:02,  5.30it/s]

{'eval_loss': 0.445383757352829, 'eval_accuracy': 86.66666666666667, 'eval_f1': 0.9285714285714286, 'eval_runtime': 1.1037, 'eval_samples_per_second': 54.363, 'eval_steps_per_second': 13.591, 'epoch': 2.0}


                                                  
 20%|██        | 810/4050 [02:35<09:46,  5.53it/s]

{'eval_loss': 0.38756269216537476, 'eval_accuracy': 86.66666666666667, 'eval_f1': 0.9285714285714286, 'eval_runtime': 1.102, 'eval_samples_per_second': 54.448, 'eval_steps_per_second': 13.612, 'epoch': 3.0}


 25%|██▍       | 1001/4050 [03:12<09:50,  5.16it/s]

{'loss': 0.4038, 'grad_norm': 1.2756690979003906, 'learning_rate': 0.001506172839506173, 'epoch': 3.7}


                                                   
 27%|██▋       | 1080/4050 [03:29<08:07,  6.09it/s]

{'eval_loss': 0.38495078682899475, 'eval_accuracy': 85.0, 'eval_f1': 0.912621359223301, 'eval_runtime': 1.1648, 'eval_samples_per_second': 51.51, 'eval_steps_per_second': 12.878, 'epoch': 4.0}


                                                   
 33%|███▎      | 1350/4050 [04:23<08:02,  5.59it/s]

{'eval_loss': 0.3415077328681946, 'eval_accuracy': 86.66666666666667, 'eval_f1': 0.9285714285714286, 'eval_runtime': 1.1949, 'eval_samples_per_second': 50.215, 'eval_steps_per_second': 12.554, 'epoch': 5.0}


 37%|███▋      | 1501/4050 [04:52<08:03,  5.27it/s]

{'loss': 0.39, 'grad_norm': 1.8318347930908203, 'learning_rate': 0.0012592592592592592, 'epoch': 5.56}


                                                   
 40%|████      | 1620/4050 [05:16<06:57,  5.82it/s]

{'eval_loss': 0.3587222397327423, 'eval_accuracy': 83.33333333333334, 'eval_f1': 0.9074074074074074, 'eval_runtime': 1.2335, 'eval_samples_per_second': 48.644, 'eval_steps_per_second': 12.161, 'epoch': 6.0}


                                                   
 47%|████▋     | 1890/4050 [06:09<06:21,  5.67it/s]

{'eval_loss': 0.28704917430877686, 'eval_accuracy': 91.66666666666666, 'eval_f1': 0.9532710280373832, 'eval_runtime': 1.1676, 'eval_samples_per_second': 51.389, 'eval_steps_per_second': 12.847, 'epoch': 7.0}


 49%|████▉     | 2001/4050 [06:31<06:14,  5.48it/s]

{'loss': 0.2932, 'grad_norm': 1.542023777961731, 'learning_rate': 0.0010123456790123457, 'epoch': 7.41}


                                                   
 53%|█████▎    | 2160/4050 [07:01<05:44,  5.49it/s]

{'eval_loss': 0.3536454141139984, 'eval_accuracy': 90.0, 'eval_f1': 0.9423076923076923, 'eval_runtime': 1.1215, 'eval_samples_per_second': 53.498, 'eval_steps_per_second': 13.374, 'epoch': 8.0}


                                                   
 60%|██████    | 2430/4050 [07:55<04:40,  5.77it/s]

{'eval_loss': 0.34428656101226807, 'eval_accuracy': 90.0, 'eval_f1': 0.9423076923076923, 'eval_runtime': 1.2302, 'eval_samples_per_second': 48.773, 'eval_steps_per_second': 12.193, 'epoch': 9.0}


 62%|██████▏   | 2500/4050 [08:10<05:23,  4.78it/s]

{'loss': 0.1844, 'grad_norm': 0.0371529720723629, 'learning_rate': 0.0007654320987654321, 'epoch': 9.26}


                                                   
 67%|██████▋   | 2700/4050 [08:52<03:56,  5.71it/s]

{'eval_loss': 0.3469993472099304, 'eval_accuracy': 91.66666666666666, 'eval_f1': 0.9514563106796117, 'eval_runtime': 1.2336, 'eval_samples_per_second': 48.639, 'eval_steps_per_second': 12.16, 'epoch': 10.0}


                                                   
 73%|███████▎  | 2970/4050 [09:47<03:07,  5.77it/s]

{'eval_loss': 0.4251355230808258, 'eval_accuracy': 91.66666666666666, 'eval_f1': 0.9514563106796117, 'eval_runtime': 1.1719, 'eval_samples_per_second': 51.199, 'eval_steps_per_second': 12.8, 'epoch': 11.0}


 74%|███████▍  | 3001/4050 [09:54<03:22,  5.18it/s]

{'loss': 0.1258, 'grad_norm': 0.09096033126115799, 'learning_rate': 0.0005185185185185185, 'epoch': 11.11}


                                                   
 80%|████████  | 3240/4050 [10:42<02:12,  6.12it/s]

{'eval_loss': 0.5164023041725159, 'eval_accuracy': 91.66666666666666, 'eval_f1': 0.9514563106796117, 'eval_runtime': 1.1589, 'eval_samples_per_second': 51.774, 'eval_steps_per_second': 12.944, 'epoch': 12.0}


 86%|████████▋ | 3501/4050 [11:31<01:46,  5.13it/s]

{'loss': 0.0913, 'grad_norm': 0.020787805318832397, 'learning_rate': 0.00027160493827160494, 'epoch': 12.96}


                                                   
 87%|████████▋ | 3510/4050 [11:34<01:30,  5.97it/s]

{'eval_loss': 0.5229398608207703, 'eval_accuracy': 91.66666666666666, 'eval_f1': 0.9514563106796117, 'eval_runtime': 1.1853, 'eval_samples_per_second': 50.62, 'eval_steps_per_second': 12.655, 'epoch': 13.0}


                                                   
 93%|█████████▎| 3780/4050 [12:29<00:44,  6.01it/s]

{'eval_loss': 0.47148048877716064, 'eval_accuracy': 91.66666666666666, 'eval_f1': 0.9514563106796117, 'eval_runtime': 1.1576, 'eval_samples_per_second': 51.83, 'eval_steps_per_second': 12.958, 'epoch': 14.0}


 99%|█████████▉| 4001/4050 [13:12<00:09,  5.24it/s]

{'loss': 0.0814, 'grad_norm': 0.06398648023605347, 'learning_rate': 2.4691358024691357e-05, 'epoch': 14.81}


                                                   
100%|██████████| 4050/4050 [13:23<00:00,  5.78it/s]

{'eval_loss': 0.4737564027309418, 'eval_accuracy': 91.66666666666666, 'eval_f1': 0.9514563106796117, 'eval_runtime': 1.1497, 'eval_samples_per_second': 52.186, 'eval_steps_per_second': 13.047, 'epoch': 15.0}


100%|██████████| 4050/4050 [13:24<00:00,  5.04it/s]

{'train_runtime': 804.1812, 'train_samples_per_second': 20.107, 'train_steps_per_second': 5.036, 'train_loss': 0.24907605191807689, 'epoch': 15.0}





TrainOutput(global_step=4050, training_loss=0.24907605191807689, metrics={'train_runtime': 804.1812, 'train_samples_per_second': 20.107, 'train_steps_per_second': 5.036, 'total_flos': 4371778399887360.0, 'train_loss': 0.24907605191807689, 'epoch': 15.0})

In [11]:
from sklearn.manifold import TSNE
import pandas as pd
import plotly.express as px

In [13]:
test_ds = dataset["test"]
texts = [sample["text"] for sample in test_ds] 
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)

bert = peft_model.base_model.model.bert

bert = bert.cpu()
bert.eval()
with torch.no_grad():
    outputs = bert(**inputs)

tsne = TSNE()
to_vis = tsne.fit_transform(outputs.last_hidden_state[:,0,:].squeeze().numpy())

df = pd.DataFrame(to_vis)
df["label"] = [post["label"] for post in test_ds]
df["text"] = [post["text"] for post in test_ds]
fig = px.scatter(df, x=0, y=1, hover_data="text", color="label")
fig.show()