In [None]:
from pathlib import Path
import json
import pandas as pd

In [68]:
dataset_path = Path("datasets/german-quotations/test")

paths = [
    Path("datasets/german-quotations/test"),
    Path("datasets/german-quotations/train"),
    Path("datasets/german-quotations/dev")
]

In [69]:
result = []
for dataset_path in paths:
    for file in dataset_path.glob("*.json"):
        with open(file, "r") as f:
            data = json.load(f)
            

        print(f"Processing {file}")
        print(data["documentName"])

        # get the whole document as tokens
        document_tokens = [token for sentence in data["sentences"] for token in sentence["tokens"]]
        document_token_ids = [token_id for sentence in data["sentences"] for token_id in sentence["tokenIds"]]
        document_token_id2token = {token_id: token for token_id, token in zip(document_token_ids, document_tokens)}
        assert len(document_tokens) == len(document_token_ids), "Lengths do not match"

        # get the speaker tokens
        speaker_token_ids = [token_id for annotation in data["annotations"] for token_id in annotation["speaker"]["tokenIds"] if annotation["type"] == "Direct"]

        # get the quotation tokens
        quote_token_ids = [token_id for annotation in data["annotations"] for token_id in annotation["quote"]["tokenIds"] if annotation["type"] == "Direct"]
        # quote_types = [annotation["type"] for annotation in data["annotations"] for token_id in annotation["quote"]["tokenIds"]]

        # sanity checks
        # I assume that speakers do not overlap
        # assert len(speaker_token_ids) == len(set(speaker_token_ids)), "Speakers overlap"

        # I assume that quotes do not overlap
        # assert len(quote_token_ids) == len(set(quote_token_ids)), "Quotes overlap"

        # I assume that speakers and quotes do not overlap
        assert len(set(speaker_token_ids).intersection(set(quote_token_ids))) == 0, "Speakers and quotes overlap"

        # build dicts of annotated tokens and their class
        # coarse-grained
        annotated_tokens = {
            token_id: "speaker" for token_id in speaker_token_ids
        }
        annotated_tokens.update({
            token_id: "quote" for token_id in quote_token_ids
        })

        # fine-grained 
        # annotated_tokens_fine = {
        #     token_id: "speaker" for token_id in speaker_token_ids
        # }
        # annotated_tokens_fine.update({
        #     token_id: f"quote-{token_type.lower()}" for token_id, token_type in zip(quote_token_ids, quote_types)
        # })

        # create the sequence classification data
        tags = []
        # tags_fine = []
        for token_id in document_token_ids:
            if token_id in annotated_tokens:
                tags.append(annotated_tokens[token_id])
                # tags_fine.append(annotated_tokens_fine[token_id])
            else:
                tags.append("O")
                # tags_fine.append("O")

        # build result object
        result.append({
            "tokens": document_tokens,
            "tags": tags,
            # "tags_fine": tags_fine
        })

# write the result
df = pd.DataFrame(result)
df.to_parquet("datasets/german-quotations/german_quotations_test.parquet")

Processing datasets/german-quotations/test/83133 - DFB-Frauenteam siegt glücklich gegen Spanien (2019-06-13).pretty.json
83133 - DFB-Frauenteam siegt glücklich gegen Spanien (2019-06-13)
Processing datasets/german-quotations/test/3560 - 1,3 Millionen Bundesbürger in Deutschland haben kostenlos Pakete verschickt (2005-02-27).pretty.json
3560 - 1,3 Millionen Bundesbürger in Deutschland haben kostenlos Pakete verschickt (2005-02-27)
Processing datasets/german-quotations/test/8684 - Drogenvergehen: Deutsche nach drei Jahren Haft in Singapur wieder frei (2005-07-15).pretty.json
8684 - Drogenvergehen: Deutsche nach drei Jahren Haft in Singapur wieder frei (2005-07-15)
Processing datasets/german-quotations/test/37225 - Fieber-Epidemie in Kenia tötet mindestens 90 Menschen (2007-01-15).pretty.json
37225 - Fieber-Epidemie in Kenia tötet mindestens 90 Menschen (2007-01-15)
Processing datasets/german-quotations/test/18451 - Massenkarambolage auf der Autobahn bei Darmstadt (2005-12-26).pretty.json

In [None]:
data["documentName"]

In [None]:
document_tokens = [token for sentence in data["sentences"] for token in sentence["tokens"]]
document_token_ids = [token_id for sentence in data["sentences"] for token_id in sentence["tokenIds"]]

In [None]:
assert len(document_tokens) == len(document_token_ids), "Lengths do not match"

In [None]:
document_token_id2token = {token_id: token for token_id, token in zip(document_token_ids, document_tokens)}

In [None]:
speaker_token_ids = [token_id for annotation in data["annotations"] for token_id in annotation["speaker"]["tokenIds"]]

In [None]:
speaker_token_ids

In [None]:
quote_token_ids = [token_id for annotation in data["annotations"] for token_id in annotation["quote"]["tokenIds"]]
quote_types = [annotation["type"] for annotation in data["annotations"] for token_id in annotation["quote"]["tokenIds"]]

In [None]:
# I assume that speakers do not overlap
assert len(speaker_token_ids) == len(set(speaker_token_ids)), "Speakers overlap"

# I assume that quotes do not overlap
assert len(quote_token_ids) == len(set(quote_token_ids)), "Quotes overlap"

# I assume that speakers and quotes do not overlap
assert len(set(speaker_token_ids).intersection(set(quote_token_ids))) == 0, "Speakers and quotes overlap"

In [None]:
annotated_tokens = {
    token_id: "speaker" for token_id in speaker_token_ids
}

annotated_tokens.update({
    token_id: "quote" for token_id in quote_token_ids
})

annotated_tokens_fine = {k: v for k, v in annotated_tokens.items()}
annotated_tokens_fine.update({
    token_id: f"quote-{token_type.lower()}" for token_id, token_type in zip(quote_token_ids, quote_types)
})

In [None]:
tags = []
tags_fine = []
for token_id in document_token_ids:
    if token_id in annotated_tokens:
        tags.append(annotated_tokens[token_id])
        tags_fine.append(annotated_tokens_fine[token_id])
    else:
        tags.append("O")
        tags_fine.append("O")   

In [None]:
tags_fine

In [None]:
len(tags)

In [85]:
coarse_label2lid = {
    "O": 0,
    "speaker": 1,
    "quote": 2
}
coarse_id2label = {v: k for k, v in coarse_label2lid.items()}

In [None]:
result = {
    "tokens": document_tokens,
    "tags": tags,
    "tags_fine": tags_fine,
}

In [70]:
df = pd.read_parquet("datasets/german-quotations/german_quotations_test.parquet")

In [71]:
df.head()

Unnamed: 0,tokens,tags
0,"[Bei, der, Frauen-Fußballweltmeisterschaft, in...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,"[Am, Freitag, haben, nach, Angaben, der, Deuts...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,"[Wie, die, Behörden, in, Singapur, mitteilten,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"[Im, Nordosten, Kenias, greift, seit, einem, M...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,"[Auf, der, Autobahn, A, 67, bei, Darmstadt, is...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [72]:
df["isempty"] = df["tags"].apply(lambda x: len(set(x)) == 1 and "O" in x)

In [73]:
df["isempty"].sum()

627

In [74]:
len(df)

998

In [78]:
# filter out empty documents
df2 = df[~df["isempty"]]
len(df2)

371

In [79]:
# filter out non empty documents
df3 = df[df["isempty"]]
len(df3)

627

In [80]:
# add 10% of the empty documents to the test set
df4 = df3.sample(frac=0.1, random_state=42)
df5 = pd.concat([df2, df4])

In [81]:
len(df5)

434

In [86]:
df5["tags"] = df5["tags"].apply(lambda x: [coarse_label2lid[tag] for tag in x])

In [87]:
df5.head()

Unnamed: 0,tokens,tags,isempty
5,"[Großbritannien, friert, seine, Beziehungen, z...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False
8,"[Der, Göttinger, Hotelmarketing-Experte, Chris...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False
9,"[Am, Dienstagvormittag, wurde, ein, 44-jährige...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False
12,"[Am, Dienstagmittag, sind, bei, einer, Massenk...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False
13,"[Die, Untersuchung, der, in, der, Türkei, vere...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False


In [88]:
df5.to_parquet("datasets/german-quotations/german_direct_quotations.parquet")