In [None]:
from pathlib import Path
from typing import List
import pandas as pd
import json
import spacy

In [None]:
# Dataset configuration: both values are interpolated into the dataset directory
# and file names below; `lang` additionally decides which spaCy model is loaded.
lang = "de"
category = "disease" # city or disease

In [None]:
# load spacy model
# Explicit language -> pipeline mapping: an unsupported or mistyped `lang`
# now fails loudly instead of silently falling back to the English model.
SPACY_MODELS = {"de": "de_core_news_sm", "en": "en_core_web_sm"}
if lang not in SPACY_MODELS:
    raise ValueError(f"Unsupported lang {lang!r}; expected one of {sorted(SPACY_MODELS)}")
nlp = spacy.load(SPACY_MODELS[lang])

In [None]:
# Disable all other pipeline components and enable only the senter component.
# `Language.disable_pipes` is deprecated since spaCy v3; `select_pipes` is the
# supported replacement and accepts the list of names directly.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'senter']
nlp.select_pipes(disable=other_pipes)
nlp.enable_pipe('senter')

# Example usage
doc = nlp("Tim ist toll. Er ist auch schlau. Er ist ein guter Mensch.")
sentences = [sent.text.strip() for sent in doc.sents]
print(sentences)

In [None]:
# NOTE(review): absolute, machine-specific location — adjust before running elsewhere.
datasets_path = Path(f"/home/tfischer/Development/seq-sentence-classification/datasets/wikisection/{lang}/{category}")

# One JSON file per split, all following the same naming scheme.
valid_path, test_path, train_path = (
    datasets_path / f"wikisection_{lang}_{category}_{split_name}.json"
    for split_name in ("validation", "test", "train")
)

In [None]:
def _normalize_sentence(raw: str) -> str:
    """Return `raw` trimmed, with escape residue replaced and whitespace collapsed."""
    s = raw.strip()
    # "\\n" / "\\t" are the two-character sequences backslash+n / backslash+t,
    # i.e. escape residue left in the text, not real control characters.
    s = s.replace("\\n", " ")
    s = s.replace("\\t", " ")
    # Real newlines, tabs and repeated blanks are collapsed here.
    return " ".join(s.split())


def create_dataset(json_path: Path) -> set:
    """Convert one WikiSection JSON split into a sentence-level parquet file.

    Every annotated section of every entry is split into sentences with the
    module-level spaCy pipeline ``nlp``. Each sentence inherits the label of
    the section it came from, with the leading ``"{category}."`` prefix
    removed (``category`` is the module-level setting). The result has one row
    per entry of the JSON file and two parallel list columns, ``sentences``
    and ``labels``; it is written next to the input file with the suffix
    replaced by ``.parquet``.

    Args:
        json_path: Path to the JSON file. Each entry is read through its
            ``"text"`` and ``"annotations"`` keys, each annotation through
            ``"begin"``, ``"length"`` and ``"sectionLabel"``.

    Returns:
        The set of distinct prefix-stripped labels found in the file.

    Raises:
        AssertionError: If a section label does not start with
            ``"{category}."``.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's default
    # encoding (the German data contains non-ASCII characters).
    data = json.loads(json_path.read_text(encoding="utf-8"))

    label_prefix = f"{category}."

    sentences_list: List[List[str]] = []
    labels_list: List[List[str]] = []

    for wiki_article in data:
        text = wiki_article["text"]
        annotations = wiki_article["annotations"]

        sentences: List[str] = []
        labels: List[str] = []
        for annotation in annotations:
            begin = annotation["begin"]
            section = text[begin:begin + annotation["length"]].strip()

            # sentence splitting with spacy; sentences that are empty after
            # normalization carry no content and are dropped
            sents = [
                cleaned
                for cleaned in (_normalize_sentence(sent.text) for sent in nlp(section).sents)
                if cleaned
            ]

            label = annotation["sectionLabel"]
            assert label.startswith(label_prefix), f"Unexpected section label: {label!r}"
            label = label[len(label_prefix):]

            sentences.extend(sents)
            labels.extend([label] * len(sents))

        assert len(sentences) == len(labels), "Number of labels and sentences do not match"
        sentences_list.append(sentences)
        labels_list.append(labels)

    assert len(labels_list) == len(sentences_list), "Number of labels and sentences do not match"

    # create dataframe
    df = pd.DataFrame({"sentences": sentences_list, "labels": labels_list})

    # save dataframe
    df.to_parquet(json_path.with_suffix(".parquet"))

    # unique labels across all entries
    unique_labels = set()
    for labels in labels_list:
        unique_labels.update(labels)
    return unique_labels

In [None]:
# Convert every split (validation, test, train — in that order) and keep the
# label set each conversion returns.
l1, l2, l3 = (
    create_dataset(split_path)
    for split_path in (valid_path, test_path, train_path)
)

In [None]:
# Label inventory over all three splits
all_labels = l1 | l2 | l3
print(all_labels)

In [None]:
# Load the parquet files that were written next to the JSON splits
valid_df, test_df, train_df = (
    pd.read_parquet(split_path.with_suffix(".parquet"))
    for split_path in (valid_path, test_path, train_path)
)

In [None]:
# Preview the first rows of the validation split
valid_df.head()

In [None]:
# Inspect the label column of the validation split
valid_df["labels"]