In [1]:
import json
from pathlib import Path
from datasets import load_dataset
from sklearn.model_selection import train_test_split

In [2]:
from copy import deepcopy

In [3]:
# Collect every ground-truth JSONL export. The paths are sorted because rglob yields
# them in filesystem order; a stable order keeps full_dataset (and therefore the
# seeded train/test split performed later) identical from one machine to the next.
json_files = sorted(Path("./data/ground_truth").rglob("*.jsonl"))
full_dataset = []

for file in json_files:
    # Decode explicitly as UTF-8: the annotations contain non-ASCII characters and
    # the platform default encoding is not UTF-8 everywhere.
    with open(file, "r", encoding="utf-8") as f:
        # One JSON document per line; blank lines (e.g. a trailing newline) are skipped
        # because json.loads would reject them.
        lines = [json.loads(line) for line in f if line.strip()]
    full_dataset.extend(lines)

In [4]:
# Project each record down to its "text" and "label" fields; every other key is dropped.
full_dataset_filtered = [
    {field: record[field] for field in ("text", "label")}
    for record in full_dataset
]

In [5]:
# Split into train and test parts; test_size=0.1 with a fixed random_state so the
# same input order always produces the same partition.
train_ds, test_ds = train_test_split(
    full_dataset_filtered,
    random_state=42,
    test_size=0.1,
)

In [6]:
# Deep-copy the training split before train_ds is modified in place by later cells;
# this untouched copy is written to the output file alongside the modified samples.
train_ds_raw = deepcopy(train_ds)

In [7]:
def get_token_label_location_end(labels):
    """Return the cut position derived from the first non-sentiment label.

    Args:
        labels: Iterable of label entries. Index 1 of an entry is read as its end
            offset and index 2 as its label name (presumably ``[start, end, name]``
            spans -- confirm against the annotation export).

    Returns:
        ``end + 1`` of the first entry, in list order, whose name is neither
        "Neutralny" nor "Mowa nienawiści"; ``None`` when there is no such entry.
    """
    sentiment_names = ("Neutralny", "Mowa nienawiści")
    cut_positions = (
        entry[1] + 1 for entry in labels if entry[2] not in sentiment_names
    )
    # The generator is lazy, so only entries up to the first match are inspected.
    return next(cut_positions, None)

In [8]:
def remove_redundant_labels(labels, end):
    """Keep only the labels whose first element is strictly below ``end``.

    Args:
        labels: Iterable of label entries; index 0 of each entry is compared
            against ``end`` (presumably the span start offset -- confirm).
        end: Exclusive upper bound for the value at index 0.

    Returns:
        A new list with the surviving entries in their original order.
    """
    kept = []
    for span in labels:
        if span[0] < end:
            kept.append(span)
    return kept

In [9]:
def remove_sentiment_labels(labels):
    """Drop the entries named "Mowa nienawiści" or "Neutralny".

    Args:
        labels: Iterable of label entries; index 2 of each entry is its label name.

    Returns:
        A new list holding every other entry, in the original order.
    """
    sentiment_names = ("Mowa nienawiści", "Neutralny")
    return list(filter(lambda span: span[2] not in sentiment_names, labels))

In [10]:
# Cut every training sample right after its first non-sentiment label and discard
# the labels that start at or beyond the cut.
for sample in train_ds:
    token_label_end = get_token_label_location_end(sample["label"])
    # With no non-sentiment label the end is None and the slice keeps the whole text.
    sample["text"] = sample["text"][:token_label_end]
    if token_label_end is None:
        continue
    sample["label"] = remove_redundant_labels(sample["label"], token_label_end)
    # remove_sentiment_labels is not applied here, so sentiment labels stay on the sample.

In [11]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Single definition of the checkpoint id so the tokenizer and the model are always
# loaded from the same repository.
MODEL_NAME = "sdadas/polish-gpt2-medium"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# NOTE(review): trust_remote_code=True allows code shipped in the model repository to
# be executed locally; confirm this checkpoint actually needs it, otherwise drop the flag.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, trust_remote_code=True)

In [12]:
# Reuse the end-of-sequence id as the padding id, on the tokenizer and on the model
# config alike, so both agree on what padding looks like.
tokenizer.pad_token_id = tokenizer.eos_token_id
model.config.pad_token_id = model.config.eos_token_id

In [13]:
# Fall back to the EOS token for padding when the tokenizer defines no pad token.
# NOTE(review): an earlier cell already assigns tokenizer.pad_token_id from
# eos_token_id; if that assignment also populates pad_token, this branch never
# runs -- confirm and drop one of the two.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [14]:
from tqdm import tqdm

In [15]:
# Replace each truncated training text with a sampled continuation from the model.
# Generation stays best-effort (a failing sample keeps its truncated text), but
# failures are recorded as (index, error) pairs instead of being silently discarded.
generation_failures = []

for sample_index, sample in enumerate(tqdm(train_ds)):
    prompt = sample["text"]
    inputs = tokenizer(prompt, return_tensors="pt")

    try:
        output = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            pad_token_id=tokenizer.pad_token_id,
            # NOTE(review): max_length presumably counts the prompt tokens as well, so
            # long prompts leave little or no room for new text -- confirm, and consider
            # max_new_tokens if a fixed continuation length is wanted.
            max_length=128,
            temperature=0.7,
            top_k=25,
            top_p=0.9,
            num_return_sequences=1,
            do_sample=True,
        )
        generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    except Exception as exc:
        # Catch Exception rather than everything: a bare except would also swallow
        # KeyboardInterrupt and make this long-running loop impossible to stop.
        generation_failures.append((sample_index, repr(exc)))
        continue

    sample["text"] = generated_text

if generation_failures:
    print(f"Generation failed for {len(generation_failures)} of {len(train_ds)} samples:")
    for failed_index, error in generation_failures:
        print(f"  sample {failed_index}: {error}")

100%|██████████| 539/539 [17:20<00:00,  1.93s/it]


In [17]:
# Write the untouched training samples followed by their model-completed variants,
# one JSON document per line.
output_path = Path("./sentences/train_with_filled.jsonl")
# Create the target directory up front: without it open() raises FileNotFoundError
# and the results of the long generation run would not be saved.
output_path.parent.mkdir(parents=True, exist_ok=True)

# The encoding is explicit so the file does not depend on the platform default.
with open(output_path, "w", encoding="utf-8") as f:
    for item in train_ds_raw:
        f.write(json.dumps(item) + "\n")

    for item in train_ds:
        f.write(json.dumps(item) + "\n")