In [1]:
import json
from pathlib import Path

from datasets import load_dataset
from transformers import AutoTokenizer, BertForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments, pipeline
from peft import LoraConfig, TaskType, get_peft_model
import numpy as np
import torch

In [2]:
data_files = {"train": "train_with_filled.jsonl", "test": "test.jsonl"}
dataset = load_dataset("./sentences", data_files=data_files)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1078
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 60
    })
})


In [3]:
sorted_labels = sorted(set([sample["label"] for sample in dataset["train"]]))
label2id = dict(zip(sorted_labels, range(0, len(sorted_labels))))
id2label = dict(zip(range(0, len(sorted_labels)), sorted_labels))

tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-base-cased", return_tensors="pt")
model = BertForSequenceClassification.from_pretrained("allegro/herbert-base-cased",
                                                                      num_labels=len(label2id),
                                                                      label2id=label2id,
                                                                      id2label=id2label)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allegro/herbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
def preprocess_function(examples):
    """Preprocess the dataset by returning tokenized examples."""
    tokens = tokenizer(examples["text"], padding="max_length", truncation=True)
    tokens['label'] = [label2id[l] for l in examples["label"]]
    return tokens

splits = ['train', 'test']

tokenized_ds = {}

for split in splits:
    tokenized_ds[split] = dataset[split].map(preprocess_function, batched=True)

print(tokenized_ds)

Map:   0%|          | 0/1078 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

{'train': Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1078
}), 'test': Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 60
})}


In [5]:
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS, r=64, lora_alpha=1, lora_dropout=0.1
)

peft_model = get_peft_model(model, lora_config)
print(peft_model.bert)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(50000, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): lora.Linear(
              (base_layer): Linear(in_features=768, out_features=768, bias=True)
              (lora_dropout): ModuleDict(
                (default): Dropout(p=0.1, inplace=False)
              )
              (lora_A): ModuleDict(
                (default): Linear(in_features=768, out_features=64, bias=False)
              )
              (lora_B): ModuleDict(
                (default): Linear(in_features=64, out_features=768, bias=False)
              )
              (lora_embedding_A

In [6]:
from sklearn.metrics import f1_score

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    f1 = f1_score(labels, predictions) 
    return {"accuracy": (predictions == labels).mean()*100, "f1": f1}


trainer = Trainer(
    model=peft_model,
    args=TrainingArguments(
        output_dir="bert-sentence-clf",
        learning_rate=2e-3,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=15,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    ),
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

print("Starting to train...")
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Starting to train...


  0%|          | 0/4050 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.3955989181995392, 'eval_accuracy': 86.66666666666667, 'eval_f1': 0.9285714285714286, 'eval_runtime': 1.19, 'eval_samples_per_second': 50.42, 'eval_steps_per_second': 12.605, 'epoch': 1.0}
{'loss': 0.4549, 'grad_norm': 2.904478073120117, 'learning_rate': 0.0017530864197530865, 'epoch': 1.85}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.4158343970775604, 'eval_accuracy': 91.66666666666666, 'eval_f1': 0.9532710280373832, 'eval_runtime': 1.1686, 'eval_samples_per_second': 51.343, 'eval_steps_per_second': 12.836, 'epoch': 2.0}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.7336533665657043, 'eval_accuracy': 85.0, 'eval_f1': 0.9072164948453608, 'eval_runtime': 1.1522, 'eval_samples_per_second': 52.073, 'eval_steps_per_second': 13.018, 'epoch': 3.0}
{'loss': 0.1887, 'grad_norm': 0.011785507202148438, 'learning_rate': 0.001506172839506173, 'epoch': 3.7}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.3269987404346466, 'eval_accuracy': 96.66666666666667, 'eval_f1': 0.9807692307692307, 'eval_runtime': 1.1763, 'eval_samples_per_second': 51.006, 'eval_steps_per_second': 12.751, 'epoch': 4.0}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.617087185382843, 'eval_accuracy': 88.33333333333333, 'eval_f1': 0.9292929292929293, 'eval_runtime': 1.1828, 'eval_samples_per_second': 50.728, 'eval_steps_per_second': 12.682, 'epoch': 5.0}
{'loss': 0.1041, 'grad_norm': 3.773364782333374, 'learning_rate': 0.0012592592592592592, 'epoch': 5.56}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.6831357479095459, 'eval_accuracy': 90.0, 'eval_f1': 0.94, 'eval_runtime': 1.1653, 'eval_samples_per_second': 51.487, 'eval_steps_per_second': 12.872, 'epoch': 6.0}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.421656996011734, 'eval_accuracy': 93.33333333333333, 'eval_f1': 0.9607843137254902, 'eval_runtime': 1.1682, 'eval_samples_per_second': 51.362, 'eval_steps_per_second': 12.84, 'epoch': 7.0}
{'loss': 0.0575, 'grad_norm': 0.0005574076785705984, 'learning_rate': 0.0010123456790123457, 'epoch': 7.41}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.6196771860122681, 'eval_accuracy': 93.33333333333333, 'eval_f1': 0.9607843137254902, 'eval_runtime': 1.0792, 'eval_samples_per_second': 55.596, 'eval_steps_per_second': 13.899, 'epoch': 8.0}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.4815029203891754, 'eval_accuracy': 93.33333333333333, 'eval_f1': 0.9607843137254902, 'eval_runtime': 1.1771, 'eval_samples_per_second': 50.971, 'eval_steps_per_second': 12.743, 'epoch': 9.0}
{'loss': 0.046, 'grad_norm': 0.0028950064443051815, 'learning_rate': 0.0007654320987654321, 'epoch': 9.26}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.3604021668434143, 'eval_accuracy': 96.66666666666667, 'eval_f1': 0.9807692307692307, 'eval_runtime': 1.1666, 'eval_samples_per_second': 51.429, 'eval_steps_per_second': 12.857, 'epoch': 10.0}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.3702946901321411, 'eval_accuracy': 93.33333333333333, 'eval_f1': 0.9607843137254902, 'eval_runtime': 1.1658, 'eval_samples_per_second': 51.465, 'eval_steps_per_second': 12.866, 'epoch': 11.0}
{'loss': 0.0348, 'grad_norm': 0.0007284372113645077, 'learning_rate': 0.0005185185185185185, 'epoch': 11.11}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.46137142181396484, 'eval_accuracy': 93.33333333333333, 'eval_f1': 0.9607843137254902, 'eval_runtime': 1.1724, 'eval_samples_per_second': 51.178, 'eval_steps_per_second': 12.795, 'epoch': 12.0}
{'loss': 0.0295, 'grad_norm': 0.002510969527065754, 'learning_rate': 0.00027160493827160494, 'epoch': 12.96}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.527151882648468, 'eval_accuracy': 93.33333333333333, 'eval_f1': 0.9607843137254902, 'eval_runtime': 1.1935, 'eval_samples_per_second': 50.273, 'eval_steps_per_second': 12.568, 'epoch': 13.0}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.49943527579307556, 'eval_accuracy': 93.33333333333333, 'eval_f1': 0.9607843137254902, 'eval_runtime': 1.1652, 'eval_samples_per_second': 51.493, 'eval_steps_per_second': 12.873, 'epoch': 14.0}
{'loss': 0.02, 'grad_norm': 0.006279141642153263, 'learning_rate': 2.4691358024691357e-05, 'epoch': 14.81}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.5128610134124756, 'eval_accuracy': 93.33333333333333, 'eval_f1': 0.9607843137254902, 'eval_runtime': 1.1623, 'eval_samples_per_second': 51.62, 'eval_steps_per_second': 12.905, 'epoch': 15.0}
{'train_runtime': 802.8335, 'train_samples_per_second': 20.141, 'train_steps_per_second': 5.045, 'train_loss': 0.1162363686973666, 'epoch': 15.0}


TrainOutput(global_step=4050, training_loss=0.1162363686973666, metrics={'train_runtime': 802.8335, 'train_samples_per_second': 20.141, 'train_steps_per_second': 5.045, 'total_flos': 4371778399887360.0, 'train_loss': 0.1162363686973666, 'epoch': 15.0})

In [7]:
from sklearn.manifold import TSNE
import pandas as pd
import plotly.express as px

In [8]:
test_ds = dataset["test"]
texts = [sample["text"] for sample in test_ds] 
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)

bert = peft_model.base_model.model.bert

bert = bert.cpu()
bert.eval()
with torch.no_grad():
    outputs = bert(**inputs)

tsne = TSNE()
to_vis = tsne.fit_transform(outputs.last_hidden_state[:,0,:].squeeze().numpy())

df = pd.DataFrame(to_vis)
df["label"] = [post["label"] for post in test_ds]
df["text"] = [post["text"] for post in test_ds]
fig = px.scatter(df, x=0, y=1, hover_data="text", color="label")
fig.show()