## **Parse annotations from INCEpTION**

In [4]:
import cassis
import json
import os

base_folder = "annotations/interrater_agreeement_D/dbnl"
output_file = "annotations_D.json"

# List to hold all sentences from all documents
all_documents_sentences = []

# Iterate over each subdirectory in the base folder.
# sorted() keeps the output order reproducible; os.listdir() order is arbitrary.
for document_name in sorted(os.listdir(base_folder)):
    document_folder = os.path.join(base_folder, document_name)

    # Stray files in the base folder (e.g. .DS_Store) are not document folders
    if not os.path.isdir(document_folder):
        continue

    # Each folder is expected to contain one XML (type system) and one XMI file
    folder_files = sorted(os.listdir(document_folder))
    xml_file = next((os.path.join(document_folder, f) for f in folder_files if f.endswith('.xml')), None)
    xmi_file = next((os.path.join(document_folder, f) for f in folder_files if f.endswith('.xmi')), None)
    if xml_file is None or xmi_file is None:
        # Fail with a readable message instead of a bare StopIteration
        raise FileNotFoundError(
            f"Expected one .xml and one .xmi file in {document_folder}"
        )

    # Load the TypeSystem
    with open(xml_file, 'rb') as f:
        typesystem = cassis.load_typesystem(f)

    # Load the XMI file
    with open(xmi_file, 'rb') as f:
        cas = cassis.load_cas_from_xmi(f, typesystem=typesystem)

    # Define types
    SentenceType = typesystem.get_type('de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence')
    NamedEntityType = typesystem.get_type('custom.Span')

    # Prepare the output list for this document
    sentences_list = []

    # Extract sentences and their annotations
    for sentence in cas.select(SentenceType):
        sentence_text = cas.sofa_string[sentence.begin:sentence.end]
        labels = []

        for named_entity in cas.select_covered(NamedEntityType, sentence):
            label_text = cas.sofa_string[named_entity.begin:named_entity.end]
            labels.append({
                "text": label_text,
                # Offsets are stored relative to the start of the sentence
                "start": named_entity.begin - sentence.begin,
                "end": named_entity.end - sentence.begin,
                # Fall back to 'Unknown' both when the feature is missing and
                # when it is present but unset (None / empty)
                "label": getattr(named_entity, 'label', None) or 'Unknown'
            })

        sentences_list.append({
            "text": sentence_text,
            "labels": labels
        })

    # Append results from this document to the all documents list
    all_documents_sentences.extend(sentences_list)

# Save the results to a JSON file
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(all_documents_sentences, f, indent=4)

# Optionally print the results
print(f"All annotations from documents have been saved to {output_file}")


All annotations from documents have been saved to annotations_D.json


## **Optional: filter labels**

In [6]:
import json

# Source and destination files of the filtering step
input_file = 'all_documents_annotaties.json'
output_file = 'filtered_documents_annotaties_no_labels.json'

with open(input_file, 'r') as file:
    data = json.load(file)

# Rebuild every entry, keeping its text and only the labels tagged 'An-Org-Lit'
filtered_data = [
    {
        "text": document['text'],
        "labels": [label for label in document['labels'] if label['label'] == 'An-Org-Lit'],
    }
    for document in data
]

# Write the reduced entries to the new JSON file
with open(output_file, 'w') as file:
    json.dump(filtered_data, file, indent=4)

print(f"Filtered JSON data has been saved to {output_file}")


Filtered JSON data has been saved to filtered_documents_annotaties_no_labels.json


## **Convert to NER format**

In [19]:
import json
import re

def tokenize(text):
    """Split ``text`` into word tokens and single punctuation characters.

    Each run of word characters becomes one token; every other
    non-whitespace character becomes a token of its own. Whitespace is
    discarded.
    """
    token_pattern = re.compile(r'\w+|[^\w\s]', re.UNICODE)
    return token_pattern.findall(text)

def create_tags(tokens, span_label):
    """
    Convert span labels to sequence labels (one tag per token).

    Args:
        tokens: list of token strings for one sentence.
        span_label: list of dicts carrying at least the keys "text" (the
            span's surface string) and "label" (its type). The dicts are
            read only; they are not modified.

    Returns:
        A list with len(tokens) entries: the span's label for each token of a
        matched span and 'O' for every other token. No B-/I- prefixes are added.

    Notes:
        Spans are matched by text (split on single spaces) against the token
        list, so every occurrence of a span's text in the sentence is tagged.
        A span containing a piece that is not one of the tokens is skipped.
    """
    # Work on private copies so the caller's label dicts are left untouched
    entities = [{"span": e["text"], "type": e["label"]} for e in span_label]
    # Longest spans first, so they are delimited before shorter ones
    entities = sorted(entities, key=lambda x: len(x['span']), reverse=True)
    span_to_type = {entity['span']: entity['type'] for entity in entities}

    # Map each distinct token to a bracketed id so that matching happens on
    # whole tokens rather than on raw substrings
    dictionary = dict()
    for token in tokens:
        if token not in dictionary:
            dictionary[token] = f'[{len(dictionary)}]'
    id_string = ' '.join([dictionary[token] for token in tokens])
    for entity in entities:
        span_tokens = entity['span'].strip().split(' ')
        # validate span token
        valid_flag = True
        for token in span_tokens:
            if token not in dictionary:
                valid_flag = False
                break
        if not valid_flag:
            continue
        # translate span token into ids
        id_substring = ' '.join([dictionary[token] for token in span_tokens])
        # wrap every occurrence of the span in [sep] markers
        id_string = ('[sep]' + id_substring + '[sep]').join(id_string.split(id_substring))
    # convert the ids back to the original tokens
    sent = id_string
    for token in dictionary:
        sent = sent.replace(dictionary[token], token)
    words = sent.split('[sep]')

    seq_label = []
    for word in words:
        word = word.strip()
        if len(word) == 0:
            continue
        entity_flag = (word in span_to_type)
        word_length = len(word.split(' '))
        if entity_flag:
            # every token of the span receives the span's type
            label = [f'{span_to_type[word]}'] * word_length
        else:
            label = ['O' for _ in range(word_length)]
        seq_label.extend(label)

    # one tag per token, otherwise the alignment is broken
    assert len(seq_label) == len(tokens)
    return seq_label

def transform_annotations(input_annotations):
    """Convert span-annotated sentences into token/tag records.

    Args:
        input_annotations: list of dicts with a 'text' string and a 'labels'
            list whose items carry 'text' and 'label' keys.

    Returns:
        A list with one dict per input item, holding 'tokens', 'tags',
        'text', 'labels' (as {'span', 'type'} dicts) and 'id' (the item's
        position in the input, as a string).
    """
    # Short annotation codes -> descriptive label names.
    # Codes that are not listed here are passed through unchanged.
    label_mapping = {
        "An-Org-Lit": "Animals-Organisms-Literal",
        "An-Org-Sym": "Animals-Organisms-Symbolical",
        "An-Org-Petrified": "Animals-Organisms-Petrified",
        "An-Part-Lit": "Animals-Parts-Literal",
        "An-Part-Sym": "Animals-Parts-Symbolical",
        "An-Part-Petrified": "Animals-Parts-Petrified",
        "An-Prod-Lit": "Animals-Products-Literal",
        "An-Prod-Sym": "Animals-Products-Symbolical",
        "An-Prod-Petrified": "Animals-Products-Petrified",
        "An-Coll-Lit": "Animals-Collective-Literal",
        "An-Coll-Sym": "Animals-Collective-Symbolical",
        "An-Coll-Petrified": "Animals-Collective-Petrified",
        "Plant-Org-Lit": "Plants-Organisms-Literal",
        "Plant-Org-Sym": "Plants-Organisms-Symbolical",
        "Plant-Org-Petrified": "Plants-Organisms-Petrified",
        "Plant-Part-Lit": "Plants-Parts-Literal",
        "Plant-Part-Sym": "Plants-Parts-Symbolical",
        "Plant-Part-Petrified": "Plants-Parts-Petrified",
        "Plant-Prod-Lit": "Plants-Products-Literal",
        "Plant-Prod-Sym": "Plants-Products-Symbolical",
        "Plant-Prod-Petrified": "Plants-Products-Petrified",
        # Every other literal code ends in "-Lit"; accept that spelling here
        # too, and keep the long spelling for backward compatibility.
        "Plant-Coll-Lit": "Plants-Collective-Literal",
        "Plant-Coll-Literal": "Plants-Collective-Literal",
        "Plant-Coll-Sym": "Plants-Collective-Symbolical",
        "Plant-Coll-Petrified": "Plants-Collective-Petrified"
    }
    output_data = []
    for idx, annotation in enumerate(input_annotations):
        text = annotation['text']
        labels = annotation['labels']

        # Update the labels according to the mapping
        updated_labels = [
            {
                'text': label['text'],
                'label': label_mapping.get(label['label'], label['label']),
            }
            for label in labels
        ]

        tokens = tokenize(text)
        tags = create_tags(tokens, updated_labels)

        labels_list = [{'span': label['text'], 'type': label['label']} for label in updated_labels]

        transformed_annotation = {
            'tokens': tokens,
            'tags': tags,
            'text': text,
            'labels': labels_list,
            'id': str(idx)
        }
        output_data.append(transformed_annotation)
    return output_data

def process_file(input_f, output_f):
    """Transform a JSON annotation file into a JSON Lines file.

    Reads the list stored in ``input_f``, passes it through
    ``transform_annotations`` and writes one JSON object per line to
    ``output_f`` (UTF-8, non-ASCII characters kept as-is).
    """
    with open(input_f, 'r', encoding='utf-8') as source:
        annotations = json.load(source)

    records = transform_annotations(annotations)

    with open(output_f, 'w', encoding='utf-8') as sink:
        # The generator serialises and writes the records one at a time
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in records
        )

# Example usage: convert the span-annotated JSON file into the JSONL NER format.
# NOTE(review): the parse cell at the top of this notebook writes
# "annotations_D.json"; confirm that 'annotations_T.json' exists before running.
input_f, output_f = 'annotations_T.json', 'demo_and_test_T.jsonl'
process_file(input_f=input_f, output_f=output_f)


In [20]:
import json
import random

# Paths to the input and output files
# input_f is the JSONL file that gets split; demo_f, test_f and holdout_f
# receive the three partitions written further down in this cell.
input_f = 'demo_and_test_T.jsonl'
demo_f = 'data/by_the_horns_T/demo.jsonl'
test_f = 'data/by_the_horns_T/test.jsonl'
holdout_f = 'data/by_the_horns_T/holdout.jsonl'

# Function to load the input data
def load_jsonl(file_path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        file_path: path of a UTF-8 file with one JSON document per line.

    Returns:
        The decoded records, in file order. Blank or whitespace-only lines
        (for example a trailing empty line) are skipped rather than raising
        ``json.JSONDecodeError``.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            stripped = line.strip()
            if stripped:
                records.append(json.loads(stripped))
    return records

# Function to save the data into a jsonl file
def save_jsonl(data, file_path):
    """Write ``data`` to ``file_path`` as JSON Lines (one object per line).

    Args:
        data: iterable of JSON-serialisable items.
        file_path: destination path; missing parent directories are created.

    The file is written as UTF-8 and non-ASCII characters are kept as-is.
    """
    import os  # local import: this cell only imports json and random

    # Without this, open() raises FileNotFoundError when the target
    # directory has not been created yet.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, 'w', encoding='utf-8') as f:
        for item in data:
            f.write(json.dumps(item, ensure_ascii=False) + '\n')

# Read every record produced by the conversion step
data = load_jsonl(input_f)

# Fixed seed so the shuffle, and therefore the split, is repeatable
random.seed(42)
random.shuffle(data)

# First 100 records -> demo, next 50 -> test, everything after -> holdout
demo_data, test_data, holdout_data = data[:100], data[100:150], data[150:]

# Write each partition to its own file
for subset, target in ((demo_data, demo_f), (test_data, test_f), (holdout_data, holdout_f)):
    save_jsonl(subset, target)

# Report where each partition was written
print(f'Demo data saved to: {demo_f}')
print(f'Test data saved to: {test_f}')
print(f'Holdout data saved to: {holdout_f}')


Demo data saved to: data/by_the_horns_T/demo.jsonl
Test data saved to: data/by_the_horns_T/test.jsonl
Holdout data saved to: data/by_the_horns_T/holdout.jsonl
