In [23]:
import json
from pathlib import Path

from datasets import load_dataset
from transformers import AutoTokenizer, BertForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments, pipeline
from peft import LoraConfig, TaskType, get_peft_model
import numpy as np
import torch

In [24]:
# Collect every JSONL annotation file below the ground-truth directory.
# rglob yields paths in filesystem order, which can differ between machines and runs;
# sorting keeps the sample order stable so the seeded train/test split stays reproducible.
json_files = sorted(Path("./data/ground_truth").rglob("*.jsonl"))

In [25]:
# Read every annotated sample from the collected JSONL files into one flat list.
full_dataset = []

for file in json_files:
    # Decode explicitly as UTF-8: the annotation labels are Polish, and the platform
    # default encoding (e.g. cp1252 on Windows) would garble or reject them.
    with open(file, "r", encoding="utf-8") as f:
        # Skip blank lines (such as a trailing empty line) that json.loads would reject.
        lines = [json.loads(line) for line in f if line.strip()]
    full_dataset.extend(lines)

In [26]:
# Collapses the Polish annotation labels into two target classes.
mapping = {
    "Mowa nienawiści": "Hate",   # "hate speech"
    "Neutralny": "Neutral",      # "neutral"
    "Odwracanie": "Neutral",     # "reversing"
    "Wzmacnianie": "Hate",       # "strengthening"
}


def get_label(labels):
    """Return the class name for a sample's first annotation.

    Reads element 2 of the first entry in ``labels`` and translates it through
    ``mapping``; a value that has no entry in ``mapping`` is returned unchanged.

    NOTE(review): presumably each entry is a (start, end, label) span triple;
    confirm against the annotation export format.

    Raises:
        IndexError: if ``labels`` is empty or its first entry has fewer than
            three elements.
    """
    raw_label = labels[0][2]
    return mapping.get(raw_label, raw_label)

In [27]:
from sklearn.model_selection import train_test_split

In [28]:
# Reduce every sample to its text plus the collapsed class label.
full_dataset_filtered = [
    dict(text=sample["text"], label=get_label(sample["label"]))
    for sample in full_dataset
]

In [29]:
# Hold out 10% of the samples as the test split; the fixed random_state makes the
# shuffle repeatable for a given input order.
train_ds, test_ds = train_test_split(
    full_dataset_filtered,
    random_state=42,
    test_size=0.1,
)

In [30]:
# Persist both splits as JSON Lines files (one JSON object per line) under ./sentences.
output_dir = Path("./sentences")
# On a fresh checkout the directory does not exist yet and open(..., "w") would raise
# FileNotFoundError, so create it first.
output_dir.mkdir(parents=True, exist_ok=True)

for split_name, split_items in (("train", train_ds), ("test", test_ds)):
    # Explicit UTF-8 so the written file does not depend on the platform default encoding.
    with open(output_dir / f"{split_name}.jsonl", "w", encoding="utf-8") as f:
        for item in split_items:
            f.write(json.dumps(item) + "\n")

In [31]:
# Load the two JSONL files from ./sentences as the "train" and "test" splits.
data_files = dict(train="train.jsonl", test="test.jsonl")
dataset = load_dataset("./sentences", data_files=data_files)
print(dataset)

Generating train split: 539 examples [00:00, 207883.21 examples/s]
Generating test split: 60 examples [00:00, 33483.00 examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 539
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 60
    })
})





In [32]:
import torch
from transformers import BertForMaskedLM, AutoTokenizer
import random

# Load the multilingual BERT tokenizer and the masked-language-model head.
# return_tensors belongs to the tokenizer *call* below, not to from_pretrained.
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
model = BertForMaskedLM.from_pretrained("bert-base-multilingual-cased")

# Tokenize the training texts; every row is padded/truncated to the model's maximum length.
masked_dataset = [sample["text"] for sample in dataset["train"]]
inputs = tokenizer(masked_dataset, padding="max_length", return_tensors="pt", truncation=True)

# Labels start as an independent copy of the unmasked ids, so each row keeps ITS OWN
# original tokens as targets (the previous loop copied the targets from row 0 for every row).
labels = inputs.input_ids.detach().clone()

probability = 0.15
mask_token_index = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

# Special tokens ([CLS], [SEP], [PAD], ...) must never be masked or used as targets.
special_ids = torch.tensor(tokenizer.all_special_ids, dtype=labels.dtype)
maskable = ~torch.isin(labels, special_ids)

# One independent draw per token position, vectorised over the whole batch.
# A dedicated seeded generator makes the masking repeatable without touching global RNG state.
mask_rng = torch.Generator().manual_seed(42)
selected = (torch.rand(labels.shape, generator=mask_rng) < probability) & maskable

# Replace the selected positions with [MASK] in the model inputs ...
inputs["input_ids"][selected] = mask_token_index
# ... and set every other label to -100 so the loss only scores the masked positions.
labels[~selected] = -100

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [37]:
import torch
from transformers import BertTokenizer, BertForMaskedLM
import random
import numpy as np

# Tokenizer and masked-LM model used for inference in this cell.
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
# eval() switches the module to evaluation mode and returns the module itself.
model = BertForMaskedLM.from_pretrained("bert-base-multilingual-cased").eval()

# Raw training texts that will be masked and then filled in by the model.
sentences = dataset["train"]["text"]

def mask_tokens(sentences, tokenizer, mlm_probability=0.15, seed=None):
    """Replace a random fraction of each sentence's tokens with the mask token.

    Args:
        sentences: iterable of raw text strings.
        tokenizer: object providing ``tokenize(text)``, ``mask_token`` and
            ``convert_tokens_to_string(tokens)``.
        mlm_probability: fraction of tokens to mask per sentence; the count is
            ``int(len(tokens) * mlm_probability)``, so short sentences may get
            no mask at all.
        seed: optional integer seed for repeatable masking. When ``None`` the
            global ``np.random`` state is used, as before.

    Returns:
        A list with one masked text string per input sentence.
    """
    # Keep the legacy global-state behaviour unless a seed is explicitly requested.
    rng = np.random if seed is None else np.random.RandomState(seed)

    masked_sentences = []
    for sentence in sentences:
        tokens = tokenizer.tokenize(sentence)
        num_to_mask = int(len(tokens) * mlm_probability)
        if num_to_mask:
            mask_indices = rng.choice(len(tokens), num_to_mask, replace=False)
            for index in mask_indices:
                tokens[index] = tokenizer.mask_token
        # Let the tokenizer rebuild the text. A plain ' '.join leaves the "##"
        # continuation markers in the string, so tokenizing the result again
        # produces different (corrupted) tokens.
        masked_sentences.append(tokenizer.convert_tokens_to_string(tokens))
    return masked_sentences

masked_sentences = mask_tokens(sentences, tokenizer)

inputs = tokenizer(masked_sentences, padding=True, truncation=True, return_tensors="pt")

# Run the model in small batches. The logits hold one float per
# (sentence, position, vocabulary entry), so a single forward pass over the whole
# dataset needs far more memory than keeping only the argmax ids of each batch.
batch_size = 16
prediction_batches = []
with torch.no_grad():
    for start in range(0, inputs.input_ids.shape[0], batch_size):
        batch = {name: tensor[start:start + batch_size] for name, tensor in inputs.items()}
        outputs = model(**batch)
        prediction_batches.append(torch.argmax(outputs.logits, dim=-1))

# Same shape as inputs.input_ids: the most likely token id at every position.
predictions = torch.cat(prediction_batches, dim=0)

# Keep the original token everywhere except at [MASK] positions, where the model's
# prediction is substituted.
mask_positions = inputs.input_ids == tokenizer.mask_token_id
filled_ids = torch.where(mask_positions, predictions, inputs.input_ids)

# Decode whole sequences at once so word pieces are merged back into words and the
# spaces between words survive; skip_special_tokens drops [CLS]/[SEP]/[PAD] from the text.
# (Decoding piece by piece and stripping all spaces produced one unbroken string per sentence.)
predicted_sentences = tokenizer.batch_decode(filled_ids, skip_special_tokens=True)

# Write one predicted sentence per line. Explicit UTF-8 because the text contains
# non-ASCII characters that some platform default encodings cannot represent.
with open('predicted_texts.txt', 'w', encoding='utf-8') as f:
    f.writelines(sentence + '\n' for sentence in predicted_sentences)

print("Predictions saved to predicted_texts.txt")


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


: 

In [None]:
# NOTE(review): disabled experiment. Every line of this cell is commented out, so nothing here runs.
# Before re-enabling, confirm what `dataset` should be: the `dataset` loaded earlier in this
# notebook is a DatasetDict of raw text/label rows (not tokenized inputs with MLM labels), and the
# same object is passed as both train_dataset and eval_dataset below.
# TODO confirm learning_rate=1e-3; it looks high for fine-tuning a pretrained BERT checkpoint.

# from transformers import Trainer, TrainingArguments
# import torch

# training_args = TrainingArguments(
#     output_dir='./mlm_output',
#     learning_rate=1e-3,
#     per_device_train_batch_size=4,
#     per_device_eval_batch_size=4,
#     num_train_epochs=3,
#     eval_strategy="epoch",
#     save_strategy="epoch",
#     load_best_model_at_end=True,
#     logging_dir='./logs',
#     logging_steps=10, 
# )

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=dataset,
#     eval_dataset=dataset,
#     compute_metrics=None
# )

# trainer.train()
# model.save_pretrained("bert_finetuned")

 10%|▉         | 39/405 [02:39<24:54,  4.08s/it]


KeyboardInterrupt: 