# In [3]:
import pandas as pd

# Tag vocabulary in IOB2 form. read_data encodes every tag as its position in
# this list, so the order of the entries determines the integer label ids.
LABEL_LIST = ["O", "B-OBJ", "I-OBJ", "B-ASP", "I-ASP"]

# Define the cleaning function
def clean_tokens(tokens):
    """Return only the purely alphabetic string tokens, in original order.

    A token is kept when ``str.isalpha()`` is true for it, so punctuation,
    digits, empty strings and mixed tokens such as ``"don't"`` or ``"x1"`` are
    dropped. Entries that are not strings (for example the float NaN that
    pandas produces for a missing cell) are dropped as well instead of raising
    ``AttributeError``.

    Args:
        tokens: Iterable of tokens.

    Returns:
        A new list containing the kept tokens.
    """
    return [
        token for token in tokens if isinstance(token, str) and token.isalpha()
    ]


def transform_to_iob2_format(labels):
    """Convert plain per-token span labels into IOB2 tags.

    ``"O"`` is passed through unchanged. Any other label becomes ``"B-<label>"``
    when it is the first token of the sequence or differs from the label of the
    preceding token, and ``"I-<label>"`` when it repeats the preceding label.
    Two adjacent spans of the same type therefore cannot be told apart and are
    tagged as one span.

    Args:
        labels: Sequence of label strings such as ``["O", "OBJ", "OBJ"]``.

    Returns:
        A new list of IOB2 tags with the same length as ``labels``; an empty
        list for empty input.
    """
    new_labels = []
    previous = None
    for position, label in enumerate(labels):
        if label == "O":
            new_labels.append(label)
        elif position == 0 or label != previous:
            # A span starts at the first token or wherever the label changes.
            new_labels.append("B-" + label)
        else:
            new_labels.append("I-" + label)
        previous = label
    return new_labels


def _drop_non_alpha_tokens(words, labels):
    """Drop the tokens clean_tokens rejects together with their IOB2 tags.

    When the removed token carried the ``B-`` tag of a span, the next kept
    token of that span is promoted from ``I-`` to ``B-`` so the remaining tags
    stay valid IOB2.

    Returns:
        Tuple ``(kept_words, kept_labels)`` of two equally long lists.
    """
    kept_words = []
    kept_labels = []
    open_span = None  # span type of the last kept token, None outside a span
    for word, label in zip(words, labels):
        # Reuse clean_tokens as the keep/drop rule so both stay in sync.
        if not clean_tokens([word]):
            continue
        prefix, _, span_type = label.partition("-")
        if prefix == "I" and span_type != open_span:
            label = "B-" + span_type
        open_span = span_type if prefix in ("B", "I") else None
        kept_words.append(word)
        kept_labels.append(label)
    return kept_words, kept_labels


def read_data(filename):
    """Load a token-per-row TSV and return one row per sentence.

    The file is read from the Webis-CompQuestions-22 parsing directory and is
    expected to provide the columns ``sentence_id``, ``words`` and ``labels``.
    Rows are grouped by ``sentence_id``; the labels of each sentence are
    converted to IOB2, ``PRED`` tags are folded into ``ASP``, tokens rejected by
    ``clean_tokens`` are removed together with their tags, and the remaining
    tags are encoded as indices into ``LABEL_LIST``.

    Args:
        filename: Name of the TSV file inside the dataset directory.

    Returns:
        DataFrame with the columns ``words`` (list of kept tokens) and
        ``labels`` (list of ``LABEL_LIST`` indices). Both lists of a row have
        the same length.

    Raises:
        ValueError: If a tag is not contained in ``LABEL_LIST``.
    """
    df = (
        pd.read_csv(
            "../../Raw Datasets/Webis-CompQuestions-22-2/comparative-questions-parsing/" + filename,
            sep="\t",
            # Keep tokens such as "NA" or "null" as text instead of NaN.
            keep_default_na=False,
            dtype={"words": str, "labels": str},
        )
        .groupby("sentence_id")
        .agg({"words": list, "labels": list})
    )
    df = df.reset_index(drop=True)

    pred_to_asp = {"B-PRED": "B-ASP", "I-PRED": "I-ASP"}
    tagged = [
        [pred_to_asp.get(tag, tag) for tag in transform_to_iob2_format(labels)]
        for labels in df["labels"]
    ]

    # Filter words and tags in lockstep; cleaning only the words would leave
    # the two lists of a sentence with different lengths.
    cleaned = [
        _drop_non_alpha_tokens(words, tags)
        for words, tags in zip(df["words"], tagged)
    ]
    df["words"] = [words for words, _ in cleaned]
    df["labels"] = [
        [LABEL_LIST.index(tag) for tag in tags] for _, tags in cleaned
    ]

    return df

# Load the data set with one row per sentence.
df = read_data("full.tsv")

# Split into 70% train, 20% test and 10% validation. A fixed seed keeps the
# split reproducible across runs.
SPLIT_SEED = 42
train_df = df.sample(frac=0.7, random_state=SPLIT_SEED)
# From here on df only holds the 30% that did not go into the training set.
df = df.drop(train_df.index)
# Two thirds of the remaining 30% is 20% of the full data set.
test_df = df.sample(frac=2 / 3, random_state=SPLIT_SEED)
val_df = df.drop(test_df.index)

# Save the three splits next to the script. The list-valued columns are
# written as their string representation.
train_df.to_csv("train.csv", index=False)
test_df.to_csv("test.csv", index=False)
val_df.to_csv("val.csv", index=False)

