In [2]:
import cassis
import json
import os

base_folder = "/home/arjan_v_d/LLMarjan/annotations_thirza"
output_file = "inter_annotator_agreement_annotaties_thirza.json"

# Sentences (with their span annotations) collected across all documents
all_documents_sentences = []

# Every subdirectory of base_folder is treated as one document. Sorting makes
# the output order reproducible, since os.listdir returns entries in
# arbitrary order.
for document_name in sorted(os.listdir(base_folder)):
    document_folder = os.path.join(base_folder, document_name)

    # Skip stray files: calling os.listdir on them below would raise
    # NotADirectoryError
    if not os.path.isdir(document_folder):
        continue

    # Each folder is expected to hold one type system (.xml) and one CAS (.xmi)
    folder_files = sorted(os.listdir(document_folder))
    xml_file = next(
        (os.path.join(document_folder, f) for f in folder_files if f.endswith('.xml')),
        None,
    )
    xmi_file = next(
        (os.path.join(document_folder, f) for f in folder_files if f.endswith('.xmi')),
        None,
    )
    if xml_file is None or xmi_file is None:
        # Name the offending folder instead of failing with a bare StopIteration
        raise FileNotFoundError(
            f"Expected an .xml and an .xmi file in {document_folder}"
        )

    # Load the TypeSystem
    with open(xml_file, 'rb') as f:
        typesystem = cassis.load_typesystem(f)

    # Load the XMI file
    with open(xmi_file, 'rb') as f:
        cas = cassis.load_cas_from_xmi(f, typesystem=typesystem)

    # Annotation types to read: sentence segmentation and the custom span layer
    SentenceType = typesystem.get_type('de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence')
    NamedEntityType = typesystem.get_type('custom.Span')

    # Sentences of this document
    sentences_list = []

    # Extract sentences and the spans they cover
    for sentence in cas.select(SentenceType):
        sentence_text = cas.sofa_string[sentence.begin:sentence.end]
        labels = []

        for named_entity in cas.select_covered(NamedEntityType, sentence):
            label_text = cas.sofa_string[named_entity.begin:named_entity.end]
            labels.append({
                "text": label_text,
                # Offsets are stored relative to the start of the sentence
                "start": named_entity.begin - sentence.begin,
                "end": named_entity.end - sentence.begin,
                "label": getattr(named_entity, 'label', 'Unknown')
            })

        sentences_list.append({
            "text": sentence_text,
            "labels": labels
        })

    # Append results from this document to the all documents list
    all_documents_sentences.extend(sentences_list)

# Save the results to a JSON file
with open(output_file, 'w') as f:
    json.dump(all_documents_sentences, f, indent=4)

# Report where the results were written
print(f"All annotations from documents have been saved to {output_file}")


All annotations from documents have been saved to inter_annotator_agreement_annotaties_thirza.json


In [6]:
import json

# Source file with all annotated sentences, and destination for the filtered copy
input_file = 'all_documents_annotaties.json'
output_file = 'filtered_documents_annotaties_no_labels.json'

with open(input_file, 'r') as source:
    data = json.load(source)

# Rebuild every entry with its text and only the labels typed 'An-Org-Lit'
filtered_data = [
    {
        "text": entry['text'],
        "labels": [span for span in entry['labels'] if span['label'] == 'An-Org-Lit'],
    }
    for entry in data
]

# Write the filtered entries to the new JSON file
with open(output_file, 'w') as sink:
    json.dump(filtered_data, sink, indent=4)

print(f"Filtered JSON data has been saved to {output_file}")


Filtered JSON data has been saved to filtered_documents_annotaties_no_labels.json


In [65]:
import json
import re

def tokenize(text):
    """Split text into word tokens and single punctuation-character tokens.

    Runs of word characters become one token each; every other
    non-whitespace character becomes its own token. Whitespace is dropped.
    """
    token_pattern = re.compile(r'\w+|[^\w\s]', re.UNICODE)
    return token_pattern.findall(text)

def create_tags(tokens, span_label):
    """
    Convert span labels to sequence labels (one tag per token).
    Language: en/zh

    Spans are matched against the token sequence by their text only; the
    'start'/'end' offsets of a label are not used. A span is skipped when any
    of its space-separated words is not itself a token.

    Args:
        tokens: list of token strings for one sentence.
        span_label: list of dicts with at least the keys 'text' and 'label'.
            The dicts are not modified.

    Returns:
        A list of tags, one per token: the span's label for tokens inside a
        matched span, 'O' otherwise.

    Raises:
        AssertionError: if the number of tags differs from the number of tokens.
    """
    # Copy each label and add the 'span'/'type' aliases used below, so the
    # caller's dicts are left untouched
    entities = [dict(e, span=e["text"], type=e["label"]) for e in span_label]
    # Longest spans first, so they are wrapped before any shorter span
    entities.sort(key=lambda x: len(x['span']), reverse=True)
    span_to_type = {entity['span']: entity['type'] for entity in entities}

    # Map every distinct token to a bracketed id such as '[0]', '[1]', ...
    dictionary = dict()
    for token in tokens:
        if token not in dictionary:
            dictionary[token] = f'[{len(dictionary)}]'
    # The sentence as a space-separated string of token ids
    id_string = ' '.join([dictionary[token] for token in tokens])
    for entity in entities:
        span_tokens = entity['span'].strip().split(' ')
        # A span can only be located if each of its words is a known token
        valid_flag = True
        for token in span_tokens:
            if token not in dictionary:
                valid_flag = False
                break
        if not valid_flag:
            continue
        # Translate the span into ids and wrap every occurrence in '[sep]' markers
        id_substring = ' '.join([dictionary[token] for token in span_tokens])
        id_string = ('[sep]' + id_substring + '[sep]').join(id_string.split(id_substring))
    # Convert the ids back to the original tokens
    sent = id_string
    for token in dictionary:
        sent = sent.replace(dictionary[token], token)
    # Chunks alternate between plain text and wrapped spans
    words = sent.split('[sep]')

    seq_label = []
    for word in words:
        word = word.strip()
        if len(word) == 0:
            continue
        entity_flag = (word in span_to_type)
        word_length = len(word.split(' '))
        if entity_flag:
            # Every token of a matched span gets the span's type
            if word_length == 1:
                label = [f'{span_to_type[word]}']
            else:
                label = ([f'{span_to_type[word]}'] * (word_length))
        else:
            label = ['O' for _ in range(word_length)]
        seq_label.extend(label)

    assert len(seq_label) == len(tokens)
    return seq_label

def transform_annotations(input_annotations):
    """Turn annotated sentences into token/tag records.

    Args:
        input_annotations: iterable of dicts with the keys 'text' and
            'labels'; each label is a dict with 'text' and 'label'.

    Returns:
        A list of dicts with the keys 'tokens', 'tags', 'text', 'labels'
        (each label reduced to 'span' and 'type') and 'id' (the position of
        the sentence in the input, as a string).
    """
    records = []
    for position, entry in enumerate(input_annotations):
        sentence = entry['text']
        spans = entry['labels']
        sentence_tokens = tokenize(sentence)
        records.append({
            'tokens': sentence_tokens,
            # Known problem with the tags: not every span is recognised
            'tags': create_tags(sentence_tokens, spans),
            'text': sentence,
            'labels': [{'span': span['text'], 'type': span['label']} for span in spans],
            'id': str(position),
        })
    return records

def process_file(input_f, output_f):
    """Convert a JSON file of annotated sentences into a JSONL file.

    Args:
        input_f: path of the JSON file to read (UTF-8).
        output_f: path of the JSONL file to write (UTF-8); it receives one
            transformed record per line, with non-ASCII characters unescaped.
    """
    with open(input_f, 'r', encoding='utf-8') as source:
        annotations = json.load(source)

    records = transform_annotations(annotations)

    with open(output_f, 'w', encoding='utf-8') as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in records
        )

# Example usage: convert the filtered annotations into one JSONL file
input_f, output_f = 'filtered_annotations.json', 'demo_and_test.jsonl'
process_file(input_f, output_f)


In [66]:
import json
import random

# Input: the JSONL file that is split below
input_f = '/home/arjan_v_d/LLMarjan/demo_and_test.jsonl'

# Outputs: the demo split and the test split
demo_f, test_f = 'data/animals_or_not/demo.jsonl', 'data/animals_or_not/test.jsonl'

def load_jsonl(file_path):
    """Read a UTF-8 JSONL file and return its records as a list.

    Args:
        file_path: path of the file; each non-blank line holds one JSON value.

    Returns:
        The parsed values in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing empty line) carry no record; skip them
        # instead of letting json.loads('') fail
        return [json.loads(line) for line in f if line.strip()]

def save_jsonl(data, file_path):
    """Write each item of data as one JSON line to a UTF-8 file.

    Non-ASCII characters are written as-is rather than escaped.
    """
    with open(file_path, 'w', encoding='utf-8') as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in data
        )

# Load the input data
data = load_jsonl(input_f)

# Shuffle with a fixed seed so the demo/test split is the same on every run.
# A dedicated Random instance leaves the global random state untouched.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(data)

# The first DEMO_SIZE shuffled records form the demo set, the rest the test set
DEMO_SIZE = 100
demo_data = data[:DEMO_SIZE]
test_data = data[DEMO_SIZE:]

# Save the split data
save_jsonl(demo_data, demo_f)
save_jsonl(test_data, test_f)

print(f'Demo data saved to: {demo_f}')
print(f'Test data saved to: {test_f}')


Demo data saved to: data/animals_or_not/demo.jsonl
Test data saved to: data/animals_or_not/test.jsonl


In [67]:
import json
file_path = "/home/arjan_v_d/LLMarjan/data/animals_or_not/demo.jsonl"
# Parsed records, one per non-blank line of the file
data = []

# The split files are written as UTF-8 with ensure_ascii=False, so read them
# with the same encoding rather than the platform default
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # Skip blank lines; json.loads would fail on them
        if line.strip():
            data.append(json.loads(line))



        

In [16]:
def create_tags(tokens, labels):
    """Tag tokens with entity types based on each label's start offset.

    The character offset of a token is approximated as the summed length of
    the preceding tokens plus one separator character per token. A label
    whose 'start' does not equal any such offset leaves the tags unchanged.

    Args:
        tokens: list of token strings for one sentence.
        labels: list of dicts with the keys 'text', 'start' and 'label'.

    Returns:
        A list with one tag per token; 'O' for tokens outside every label.
    """
    tags = ['O'] * len(tokens)
    for label in labels:
        start = label['start']
        entity_type = label['label']
        # Number of tokens the entity text spans
        entity_length = len(tokenize(label['text']))

        # Find the token whose approximated character offset equals `start`
        char_index = 0
        start_token_index = -1
        for i, token in enumerate(tokens):
            if char_index == start:
                start_token_index = i
                break
            char_index += len(token) + 1  # +1 for the space or punctuation

        # Assign tags, clamped to the sentence length so an entity that would
        # run past the last token cannot raise IndexError
        if start_token_index != -1:
            stop = min(start_token_index + entity_length, len(tags))
            for i in range(start_token_index, stop):
                tags[i] = entity_type
    return tags

In [68]:
import json
file_path = "/home/arjan_v_d/LLMarjan/data/animals_or_not/demo.jsonl"
# Parsed records, one per non-blank line of the file
data = []

# The split files are written as UTF-8 with ensure_ascii=False, so read them
# with the same encoding rather than the platform default
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # Skip blank lines; json.loads would fail on them
        if line.strip():
            data.append(json.loads(line))



        

# **Inter-annotator agreement**

In [3]:
import json

# Read the sentences exported for each of the two annotators
with open('inter_annotator_agreement_annotaties_dieuwertje.json') as dieuwertje_file:
    annotations_dieuwertje = json.load(dieuwertje_file)

with open('inter_annotator_agreement_annotaties_thirza.json') as thirza_file:
    annotations_thirza = json.load(thirza_file)

# Sentence text -> entry holding the labels of both annotators
merged_annotations = dict()

def merge_annotations(text, labels1, labels2):
    """Return one entry for text whose labels are labels1 followed by labels2."""
    return {"text": text, "labels": labels1 + labels2}

# Collect both annotators' labels per sentence text. An entry is created the
# first time a text is seen; a later sentence with the same text replaces
# that annotator's labels for the entry.
for labels_key, annotations in (
    ("labels_dieuwertje", annotations_dieuwertje),
    ("labels_thirza", annotations_thirza),
):
    for ann in annotations:
        text, labels = ann["text"], ann["labels"]
        entry = merged_annotations.setdefault(
            text, {"text": text, "labels_dieuwertje": [], "labels_thirza": []}
        )
        entry[labels_key] = labels

# Flatten the mapping into a list of entries, in insertion order
merged_annotations_list = [
    dict(
        text=text,
        labels_dieuwertje=details["labels_dieuwertje"],
        labels_thirza=details["labels_thirza"],
    )
    for text, details in merged_annotations.items()
]

# Write the merged entries to a new JSON file
output_file = 'agreement_annotations.json'
with open(output_file, 'w') as merged_file:
    json.dump(merged_annotations_list, merged_file, indent=4)




In [5]:
# Start from an empty list, then replace it with the merged annotations
# parsed from the JSON file
data = list()

with open("agreement_annotations.json", 'r') as merged_file:
    data = json.load(merged_file)




In [9]:
# NOTE(review): this cell is incomplete and does not parse. The `if` has no
# complete condition or colon, and `a` is not defined anywhere in the visible
# notebook. TODO: restore the intended condition and the value to print.
for e in data:
    if e
        print(a)
        

{'text': 'vleesch', 'start': 16, 'end': 23, 'label': 'An-Prod-Lit'}
{'text': 'Os', 'start': 17, 'end': 19, 'label': 'An-Org-Lit'}
{'text': 'Os', 'start': 7, 'end': 9, 'label': 'An-Org-Lit'}
{'text': 'ribbenstukken', 'start': 148, 'end': 161, 'label': 'An-Prod-Lit'}
{'text': 'hals', 'start': 21, 'end': 25, 'label': 'An-Part-Lit'}
{'text': 'vleesch', 'start': 11, 'end': 18, 'label': 'An-Part-Lit'}
{'text': 'vleesch', 'start': 89, 'end': 96, 'label': 'An-Prod-Lit'}
{'text': 'bil stukken', 'start': 134, 'end': 145, 'label': 'An-Prod-Lit'}
{'text': 'vet', 'start': 155, 'end': 158, 'label': 'An-Prod-Lit'}
{'text': 'rolpens', 'start': 189, 'end': 196, 'label': 'An-Prod-Lit'}
{'text': 'vet', 'start': 217, 'end': 220, 'label': 'An-Prod-Lit'}
{'text': 'vleesch', 'start': 4, 'end': 11, 'label': 'An-Prod-Lit'}
{'text': 'Paterstukken', 'start': 58, 'end': 70, 'label': 'An-Prod-Lit'}
{'text': 'billen', 'start': 94, 'end': 100, 'label': 'An-Prod-Lit'}
{'text': 'ribben', 'start': 119, 'end': 125, 'lab

In [1]:
from transformers import AutoTokenizer

# Checkpoint whose tokenizer is loaded here
MODEL_NAME = "microsoft/mdeberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)



In [5]:
vocab = tokenizer.get_vocab()

# Order the (token, ID) pairs by token ID
sorted_vocab = sorted(vocab.items(), key=lambda item: item[1])

# Write as UTF-8 explicitly: tokens may contain non-ASCII characters, which
# the platform default encoding is not guaranteed to handle
with open("sorted_vocabulary.txt", "w", encoding="utf-8") as f:
    # `token_id` rather than `id`, so the builtin id() is not shadowed
    for token, token_id in sorted_vocab:
        # One line per token: ID first, then the token
        f.write(f"{token_id} {token}\n")