In [11]:
import cassis
import json


xml_file = "/home/arjan_v_d/LLMarjan/voorreden_annotaties/TypeSystem.xml"
xmi_file = "/home/arjan_v_d/LLMarjan/voorreden_annotaties/Dieuwertje.xmi"
output_file = "voorreden_annotaties.json"

# Load the type system that declares the annotation types used in the XMI.
with open(xml_file, 'rb') as f:
    typesystem = cassis.load_typesystem(f)

# Load the annotated document against that type system.
with open(xmi_file, 'rb') as f:
    cas = cassis.load_cas_from_xmi(f, typesystem=typesystem)

# Annotation types: sentence segmentation and the custom span layer.
SentenceType = typesystem.get_type('de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence')
NamedEntityType = typesystem.get_type('custom.Span')

# One record per sentence: its text plus every custom span it covers.
sentences_list = []

for sentence in cas.select(SentenceType):
    sentence_text = cas.sofa_string[sentence.begin:sentence.end]
    labels = []

    for named_entity in cas.select_covered(NamedEntityType, sentence):
        label_text = cas.sofa_string[named_entity.begin:named_entity.end]

        # A getattr default only covers a missing attribute. A feature that
        # exists but was never filled in is expected to come back as None
        # (TODO confirm against the cassis version in use), which would end up
        # as `null` in the JSON, so both cases are mapped to 'Unknown'.
        label_value = getattr(named_entity, 'label', None)
        if label_value is None:
            label_value = 'Unknown'

        labels.append({
            "text": label_text,
            # Offsets are made relative to the start of the sentence.
            "start": named_entity.begin - sentence.begin,
            "end": named_entity.end - sentence.begin,
            "label": label_value
        })

    sentences_list.append({
        "text": sentence_text,
        "labels": labels
    })

# Save the results to a JSON file. UTF-8 is explicit and non-ASCII characters
# are written as-is, matching how the later cells read and write their files.
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(sentences_list, f, indent=4, ensure_ascii=False)

# Print the results in JSON format (for verification)
# print(json.dumps(sentences_list, indent=4))



In [12]:
import json
import re

def tokenize(text):
    """Split ``text`` into word tokens and single punctuation characters.

    A token is either a run of word characters or one character that is
    neither a word character nor whitespace. Whitespace itself is dropped.

    Returns:
        list[str]: the tokens in the order they occur in ``text``.
    """
    pattern = re.compile(r'\w+|[^\w\s]', re.UNICODE)
    return pattern.findall(text)

def _token_spans(tokens, text=None):
    """Return the ``(start, end)`` character span of every token.

    When ``text`` is given, each token is located in it from left to right, so
    the spans are exact regardless of how much whitespace (if any) separates
    neighbouring tokens. When ``text`` is None the spans are estimated by
    assuming exactly one separator character after every token.

    Raises:
        ValueError: if ``text`` is given and a token cannot be found in it.
    """
    spans = []
    cursor = 0
    for token in tokens:
        if text is None:
            begin = cursor
        else:
            begin = text.find(token, cursor)
            if begin == -1:
                raise ValueError(
                    f"token {token!r} not found in text at or after offset {cursor}"
                )
        finish = begin + len(token)
        spans.append((begin, finish))
        # Estimated mode skips one assumed separator; exact mode lets the
        # next find() skip however much whitespace is really there.
        cursor = finish + 1 if text is None else finish
    return spans


def create_tags(tokens, labels, text=None):
    """Build one tag per token from character-offset labels.

    Every token whose character span overlaps a label's ``[start, end)`` range
    receives that label's ``'label'`` value; all other tokens get ``'O'``.
    When labels overlap, the one that comes later in ``labels`` wins.

    Args:
        tokens: tokens of the sentence, in order.
        labels: dicts with ``'start'``, ``'end'`` (character offsets into the
            sentence) and ``'label'``.
        text: the sentence the tokens were taken from. Pass it whenever it is
            available: without it token offsets can only be estimated by
            assuming a single separator character after each token, which
            misplaces everything that follows punctuation attached to a word
            (e.g. ``"Hello, world"``).

    Returns:
        list: tags, same length as ``tokens``.

    Raises:
        ValueError: if ``text`` is given and a token does not occur in it.
    """
    spans = _token_spans(tokens, text)
    tags = ['O'] * len(tokens)
    for label in labels:
        start = label['start']
        end = label['end']
        entity_type = label['label']
        for i, (token_start, token_end) in enumerate(spans):
            # Half-open interval overlap test.
            if token_start < end and start < token_end:
                tags[i] = entity_type
    return tags

def transform_annotations(input_annotations):
    """Convert sentence annotations into token/tag records.

    Args:
        input_annotations: iterable of dicts with ``'text'`` and ``'labels'``;
            each label is a dict with ``'text'``, ``'start'``, ``'end'`` and
            ``'label'``.

    Returns:
        list[dict]: one record per input annotation with the keys
        ``'tokens'``, ``'tags'``, ``'text'``, ``'labels'`` (a list of
        ``{'span', 'type'}`` dicts) and ``'id'`` (the position in the input,
        as a string).
    """
    output_data = []
    for idx, annotation in enumerate(input_annotations):
        text = annotation['text']
        labels = annotation['labels']
        tokens = tokenize(text)
        # Pass the sentence so tags are aligned on real character offsets.
        tags = create_tags(tokens, labels, text=text)
        labels_list = [{'span': label['text'], 'type': label['label']} for label in labels]
        transformed_annotation = {
            'tokens': tokens,
            'tags': tags,
            'text': text,
            'labels': labels_list,
            'id': str(idx)
        }
        output_data.append(transformed_annotation)
    return output_data

def process_file(input_f, output_f):
    """Transform a JSON annotation file into a JSON Lines file.

    Reads the JSON document at ``input_f``, runs it through
    ``transform_annotations`` and writes every resulting record to
    ``output_f`` as one JSON object per line (UTF-8, non-ASCII kept as-is).
    """
    with open(input_f, 'r', encoding='utf-8') as source:
        raw_annotations = json.load(source)

    # Transform before opening the output so a failure here does not
    # truncate an existing output file.
    records = transform_annotations(raw_annotations)

    with open(output_f, 'w', encoding='utf-8') as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in records
        )

# Example usage: turn the sentence-level JSON into token/tag JSON Lines.
input_f, output_f = 'voorreden_annotaties.json', 'demo_voorreden.jsonl'
process_file(input_f, output_f)


In [13]:
import json
import random

# Paths to the input and output files
# NOTE(review): input_f and demo_f are the same path, so anything written to
# demo_f replaces the input file — confirm that this is intended.
input_f = '/home/arjan_v_d/LLMarjan/data/planimals/demo.jsonl'
demo_f = '/home/arjan_v_d/LLMarjan/data/planimals/demo.jsonl'
test_f = '/home/arjan_v_d/LLMarjan/data/planimals/test.jsonl'

# Function to load the input data
def load_jsonl(file_path):
    """Read a UTF-8 JSON Lines file and return its records as a list.

    Each non-blank line is parsed as one JSON value. Blank or whitespace-only
    lines (for example a trailing empty line at the end of the file) are
    skipped instead of raising ``json.JSONDecodeError``.

    Args:
        file_path: path of the ``.jsonl`` file to read.

    Returns:
        list: the parsed records, in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        return [json.loads(line) for line in stripped_lines if line]

# Function to save the data into a jsonl file
def save_jsonl(data, file_path):
    """Write ``data`` to ``file_path`` as JSON Lines.

    The file is created or overwritten, encoded as UTF-8, and gets one JSON
    document per item followed by a newline. Non-ASCII characters are written
    unescaped.
    """
    with open(file_path, 'w', encoding='utf-8') as handle:
        for record in data:
            # print() appends the trailing newline for each record.
            print(json.dumps(record, ensure_ascii=False), file=handle)

# Load the input data
data = load_jsonl(input_f)

# Shuffle with a fixed seed on a private generator, so re-running the cell on
# the same input reproduces the same split without touching the global
# `random` state.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(data)

# Split sizes: the first DEMO_SIZE shuffled records are the demonstrations,
# the next TEST_SIZE records are the test set.
DEMO_SIZE = 44
TEST_SIZE = 50
if len(data) < DEMO_SIZE + TEST_SIZE:
    # Slicing past the end would silently produce smaller splits.
    print(
        f'Warning: only {len(data)} records loaded, fewer than the '
        f'{DEMO_SIZE + TEST_SIZE} needed; the splits will be smaller than requested.'
    )

demo_data = data[:DEMO_SIZE]
test_data = data[DEMO_SIZE:DEMO_SIZE + TEST_SIZE]

# Saving the demo split over the input file discards every record that is not
# in the demo split, and makes the next run start from the reduced file.
if demo_f == input_f:
    print('Warning: demo_f is the same file as input_f; the input will be overwritten by the demo split.')

# Save the split data
save_jsonl(demo_data, demo_f)
save_jsonl(test_data, test_f)

print(f'Demo data saved to: {demo_f}')
print(f'Test data saved to: {test_f}')


Demo data saved to: /home/arjan_v_d/LLMarjan/data/planimals/demo.jsonl
Test data saved to: /home/arjan_v_d/LLMarjan/data/planimals/test.jsonl


In [None]:
# NOTE(review): looks like a leftover scratch cell. In the cells shown here,
# `ids` is never assigned and `tokens`/`tags` only exist as locals inside
# functions, so evaluating this on a fresh kernel would presumably raise
# NameError — define these names first or remove the cell.
tokens, tags, ids