In [19]:
from datasets import load_dataset
from typing import List
import pandas as pd
from pathlib import Path

In [20]:
# Hugging Face Hub identifier of the dataset; load_dataset returns all of its splits.
DATASET_ID = "opennyaiorg/InRhetoricalRoles"
ds = load_dataset(DATASET_ID)

In [21]:
def create_dataset(hf_ds, out_path: Path) -> set:
    """Flatten one dataset split into per-document sentence/label lists and save it as parquet.

    Args:
        hf_ds: Split-like object. ``hf_ds["annotations"]`` must yield one entry per
            document, where ``entry[0]["result"]`` is a list of annotations and each
            annotation provides ``annotation["value"]["text"]`` and a non-empty
            ``annotation["value"]["labels"]`` list (only its first label is used).
        out_path: Destination parquet file. Missing parent directories are created.

    Returns:
        The set of distinct labels encountered in the split.

    Raises:
        KeyError / IndexError: if an entry does not have the structure described above.
    """
    sentences_list: List[List[str]] = []
    labels_list: List[List[str]] = []
    unique_labels = set()

    for sample in hf_ds["annotations"]:
        sentences: List[str] = []
        labels: List[str] = []
        for annotation in sample[0]["result"]:
            value = annotation["value"]
            # str.split() without arguments splits on any whitespace run (spaces,
            # newlines, tabs) and drops leading/trailing whitespace, so joining the
            # pieces with a single space normalises the sentence in one step.
            sentences.append(" ".join(value["text"].split()))
            labels.append(value["labels"][0])

        sentences_list.append(sentences)
        labels_list.append(labels)
        unique_labels.update(labels)

    # One row per document; each cell holds that document's list of sentences / labels.
    df = pd.DataFrame({"sentences": sentences_list, "labels": labels_list})

    # to_parquet does not create directories, so make sure the target folder exists.
    # Only done for path-like targets so that file-like objects keep working.
    if isinstance(out_path, (str, Path)):
        Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(out_path)

    return unique_labels

In [22]:
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
datasets_path = Path("/home/tfischer/Development/seq-sentence-classification/datasets/structuring_legal_docs")

# (split name, output file) pairs, processed in this order: train, dev, test.
split_outputs = (
    ("train", "rethoricalroles_train.parquet"),
    ("dev", "rethoricalroles_dev.parquet"),
    ("test", "rethoricalroles_test.parquet"),
)

# Each call writes the split to parquet and returns the set of labels found in it.
l1, l2, l3 = [
    create_dataset(ds[split_name], datasets_path / file_name)
    for split_name, file_name in split_outputs
]

In [23]:
# Combine the label sets of the three splits into one set and show it.
all_labels = l1.union(l2, l3)
print(all_labels)

{'ARG_PETITIONER', 'ANALYSIS', 'RATIO', 'PRE_NOT_RELIED', 'STA', 'RLC', 'PREAMBLE', 'FAC', 'ISSUE', 'ARG_RESPONDENT', 'PRE_RELIED', 'NONE', 'RPC'}


In [24]:
# Read the saved test split back from disk to check the written parquet file.
test_parquet_path = datasets_path / "rethoricalroles_test.parquet"
valid_df = pd.read_parquet(test_parquet_path)

In [25]:
# Preview the first five rows of the reloaded frame.
valid_df.head(n=5)

Unnamed: 0,sentences,labels
0,[IN THE HIGH COURT OF KARNATAKA DHARWAD BENCH ...,"[PREAMBLE, PREAMBLE, PREAMBLE, PREAMBLE, PREAM..."
1,[IN THE COURT OF CIVIL JUDGE-cum-JUDICIAL MAGI...,"[PREAMBLE, PREAMBLE, PREAMBLE, PREAMBLE, PREAM..."
2,[IN THE COURT OF JUDICIAL MAGISTRATE FIRST CLA...,"[PREAMBLE, PREAMBLE, PREAMBLE, FAC, FAC, FAC, ..."
3,[IN THE COURT OF JUDICIAL MAGISTRATE FIRST CLA...,"[PREAMBLE, PREAMBLE, PREAMBLE, PREAMBLE, PREAM..."
4,[IN THE HIGH COURT OF KARNATAKA AT BENGALURU D...,"[PREAMBLE, PREAMBLE, PREAMBLE, PREAMBLE, PREAM..."
