In [None]:
from pathlib import Path
from typing import List
import pandas as pd

In [None]:
# Input locations for the subtask-3 files of the "de" split.
# NOTE(review): "de" presumably means the German portion of the corpus - confirm.
datasets_path = Path("./datasets") / "semeval23persuasion" / "de"

# Each split has a label file (*.txt) and a sentence file (*.template).
valid_labels_path = datasets_path.joinpath("dev-labels-subtask-3.txt")
valid_sentences_path = datasets_path.joinpath("dev-labels-subtask-3.template")
train_labels_path = datasets_path.joinpath("train-labels-subtask-3.txt")
train_sentences_path = datasets_path.joinpath("train-labels-subtask-3.template")

In [None]:
# Maps the raw label identifiers found in the label files to the
# human-readable names that are written to the generated dataset.
# 'o' is the placeholder used for sentences without any label.
label_map = {
    'Slogans': 'Slogans',
    'Loaded_Language': 'Loaded Language',
    'Appeal_to_Fear-Prejudice': 'Appeal to Fear/Prejudice',
    'Conversation_Killer': 'Conversation Killer',
    'Red_Herring': 'Red Herring',
    'Guilt_by_Association': 'Guilt by Association',
    'Flag_Waving': 'Flag Waving',
    'o': 'O',
    'Appeal_to_Hypocrisy': 'Appeal to Hypocrisy',
    'Exaggeration-Minimisation': 'Exaggeration/Minimisation',
    'Appeal_to_Authority': 'Appeal to Authority',
    'Name_Calling-Labeling': 'Name Calling/Labeling',
    # Fixed spelling: this previously produced 'Casual Oversimplification'.
    'Causal_Oversimplification': 'Causal Oversimplification',
    # NOTE(review): the "No Choice" part is dropped here - confirm this is intended.
    'False_Dilemma-No_Choice': 'False Dilemma',
    'Appeal_to_Popularity': 'Appeal to Popularity',
    'Obfuscation-Vagueness-Confusion': 'Obfuscation/Vagueness/Confusion',
    'Doubt': 'Doubt',
    'Straw_Man': 'Straw Man',
    'Whataboutism': 'Whataboutism',
    'Repetition': 'Repetition',
    'Appeal_to_Values': 'Appeal to Values',
    'Questioning_the_Reputation': 'Questioning the Reputation',
    'Consequential_Oversimplification': 'Consequential Oversimplification',
    'Appeal_to_Time': 'Appeal to Time',
}

In [None]:
def _sentence_sort_key(sentence_id: str):
    """Order numeric ids by value ("2" before "10"); non-numeric ids sort after them, alphabetically."""
    try:
        return (0, int(sentence_id), sentence_id)
    except ValueError:
        return (1, 0, sentence_id)


def create_dataset(sentences_path: Path, labels_path: Path) -> set:
    """Build a per-document dataset from a sentence file and a label file and save it as parquet.

    The sentence file must contain three tab-separated columns per line:
    document id, sentence id and sentence text. The label file contains a
    document id, a sentence id and, optionally, a comma-separated list of raw
    label identifiers; every identifier is translated through ``label_map``.
    Sentences that receive no label carry the mapped ``"o"`` label.

    The resulting frame has one row per document with two columns:
    ``sentences`` (list of sentence texts, ordered by sentence id) and
    ``labels`` (list of label lists, aligned with ``sentences``). It is
    written next to ``sentences_path`` with the suffix replaced by ``.parquet``.

    Args:
        sentences_path: Path of the tab-separated sentence file.
        labels_path: Path of the tab-separated label file.

    Returns:
        The set of (mapped) label names that occur in this dataset.

    Raises:
        AssertionError: If a sentence line does not have exactly three columns.
        KeyError: If a label identifier is missing from ``label_map`` or a
            label line refers to an unknown document/sentence id.
    """
    # State is local to the call so that consecutive calls (e.g. dev, then
    # train) never see each other's documents.
    documents: dict = {}
    no_label = [label_map["o"]]

    # Explicit UTF-8 so the text is decoded the same way on every platform.
    with sentences_path.open('r', encoding="utf-8") as file:
        for sentence_line in file:
            stripped_line = sentence_line.strip()
            if not stripped_line:
                continue  # tolerate blank lines, e.g. at the end of the file

            columns = stripped_line.split("\t")
            assert len(columns) == 3, f"Number of columns in sentence file is not 3: {columns}"
            document_id, sentence_id, sentence = columns

            documents.setdefault(document_id, {})[sentence_id] = {
                "sentence": sentence.strip(),
                "labels": list(no_label),
                "sentence_id": sentence_id,
                "document_id": document_id,
            }

    with labels_path.open('r', encoding="utf-8") as file:
        for label_line in file:
            stripped_line = label_line.strip()
            if not stripped_line:
                continue

            columns = stripped_line.split("\t")
            document_id = columns[0]
            sentence_id = columns[1]
            # A missing third column means the sentence has no label.
            raw_labels = columns[2].split(",") if len(columns) == 3 else ["o"]

            documents[document_id][sentence_id]["labels"] = [label_map[raw] for raw in raw_labels]

    sentences_list: List[List[str]] = []
    labels_list: List[List[List[str]]] = []

    for sentences_data in documents.values():
        # Ids are strings; sort them numerically so that "10" follows "9".
        ordered = [sentences_data[sentence_id] for sentence_id in sorted(sentences_data, key=_sentence_sort_key)]
        sentences_list.append([entry["sentence"] for entry in ordered])
        labels_list.append([entry["labels"] for entry in ordered])

    # One row per document.
    df = pd.DataFrame({"sentences": sentences_list, "labels": labels_list})
    df.to_parquet(sentences_path.with_suffix(".parquet"))

    return {
        label
        for document_labels in labels_list
        for sentence_labels in document_labels
        for label in sentence_labels
    }

In [None]:
# Build and save the dev and train datasets; each call returns the set of
# label names it encountered.
l1 = create_dataset(sentences_path=valid_sentences_path, labels_path=valid_labels_path)
l2 = create_dataset(sentences_path=train_sentences_path, labels_path=train_labels_path)

In [None]:
# Every label name seen in either split.
all_labels = l1 | l2
print(all_labels)

In [None]:
# Load the parquet files back from disk (same location as the sentence
# files, with the suffix replaced by ".parquet").
valid_df = pd.read_parquet(path=valid_sentences_path.with_suffix(".parquet"))
train_df = pd.read_parquet(path=train_sentences_path.with_suffix(".parquet"))

In [None]:
# Preview the first rows of the dev split.
valid_df.head(n=5)

In [None]:
# Number of rows in the dev split.
valid_df.shape[0]