If you're opening this notebook on Colab, you will probably need to install 🤗 Transformers and 🤗 Datasets. The data used below is read from Google Drive, so uncomment the following cell and run it to mount the drive.

In [1]:
## If running in Colab
# Uncomment the two lines below to mount Google Drive; the paths configured
# later in this notebook all live under /content/drive.
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# The sentence splitter used further down is launched with `java -jar`,
# so a Java runtime has to be available on PATH.
# v8+ should be fine
!java -version

openjdk version "11.0.18" 2023-01-17
OpenJDK Runtime Environment (build 11.0.18+10-post-Ubuntu-0ubuntu120.04.1)
OpenJDK 64-Bit Server VM (build 11.0.18+10-post-Ubuntu-0ubuntu120.04.1, mixed mode, sharing)


In [4]:
import os
from subprocess import Popen, PIPE, STDOUT
import pandas as pd

# Every input and output lives under the same project folder on the mounted drive.
_project_root = '/content/drive/MyDrive/CLEF2023'
_notebooks_dir = f'{_project_root}/notebooks'
_train_dir = f'{_project_root}/data/medprocner_train'

# Sentence splitter executable (run with `java -jar`) and its model file.
splitter_jar_path = f'{_notebooks_dir}/SentenceSplitter.jar'
splitter_model_path = f'{_notebooks_dir}/es-sentence-splitter-model-spaccc.bin'

# Inputs: raw documents and their brat annotation files.
text_path = f'{_train_dir}/txt'
annotations_path = f'{_train_dir}/brat/medprocner_brat_train_subtask1'

# Output root; batches are written to a `splits_<sents_per_batch>` subfolder.
output_path = f'{_project_root}/data/processed'

# max number of sentences in each output file
sents_per_batch = 1

In [16]:
def _load_annotations(ann_file):
    """Load one brat ``.ann`` file as a DataFrame ordered by start offset.

    The returned frame has the columns ``ann_type``, ``entity_type``, ``text``,
    ``start_pos`` and ``end_pos``. Rows whose id starts with ``#`` (annotator
    comments) are dropped. An empty file yields an empty frame instead of
    raising, so documents without annotations can still be split.
    """
    columns = ['ann_type', 'entity_type', 'text']
    try:
        frame = pd.read_csv(ann_file, sep='\t', names=columns, encoding='utf-8')
    except pd.errors.EmptyDataError:
        frame = pd.DataFrame(columns=columns)
    if frame.empty:
        return pd.DataFrame(columns=columns + ['start_pos', 'end_pos'])

    # drop annotator comments
    frame = frame[~frame['ann_type'].str.startswith('#')].copy()

    # the second column holds "<entity type> <start> <end>"
    frame['start_pos'] = frame['entity_type'].transform(lambda v: int(v.split()[1]))
    frame['end_pos'] = frame['entity_type'].transform(lambda v: int(v.split()[2]))
    frame['entity_type'] = frame['entity_type'].transform(lambda v: v.split()[0])
    return frame.sort_values(by=['start_pos'], ignore_index=True)


def _iter_batches(lines, size):
    """Yield the concatenation of every ``size`` consecutive lines.

    A final, shorter batch is yielded too, so no trailing sentences are lost
    when the number of lines is not a multiple of ``size``.
    """
    pending = []
    for line in lines:
        pending.append(line)
        if len(pending) == size:
            yield ''.join(pending)
            pending = []
    if pending:
        yield ''.join(pending)


if sents_per_batch < 1:
    raise ValueError(f'sents_per_batch must be >= 1, got {sents_per_batch}')

# open() does not create missing directories, so make sure the target exists
batch_dir = f'{output_path}/splits_{sents_per_batch}'
os.makedirs(batch_dir, exist_ok=True)

for file_name in os.listdir(text_path):
    if not file_name.endswith('.txt'):
        continue
    # str.rstrip('.txt') strips any trailing '.', 't' or 'x' characters rather
    # than the suffix, which corrupts stems ending in those letters
    file_name_no_ext = os.path.splitext(file_name)[0]

    print("Processing: ", file_name_no_ext)

    annotations = _load_annotations(f'{annotations_path}/{file_name_no_ext}.ann')
    total_annotations = len(annotations)
    annotation_index = 0
    total_chars = 0

    # the context manager closes the pipe and reaps the java process even if
    # something below raises
    with Popen(['java', '-jar', splitter_jar_path, f'{text_path}/{file_name}', splitter_model_path],
               stdout=PIPE, stderr=STDOUT, text=True) as splitter_proc:
        # Batch numbering starts at 2: the previous version incremented its
        # counter before first use, and the file names are kept unchanged so
        # already generated outputs stay consistent.
        for batch_number, text in enumerate(_iter_batches(splitter_proc.stdout, sents_per_batch), start=2):
            current_batch_annotation_indices = []
            # annotation text -> (original start, start inside this batch) of its latest match
            last_match = {}
            max_index = total_chars + len(text) - 1
            while annotation_index < total_annotations and annotations.at[annotation_index, 'start_pos'] <= max_index:
                annotation_text = annotations.at[annotation_index, 'text']
                orig_start = annotations.at[annotation_index, 'start_pos']

                previous = last_match.get(annotation_text)
                if previous is None:
                    new_start_pos = text.find(annotation_text)
                elif previous[0] == orig_start:
                    # same span annotated twice: reuse the same occurrence
                    new_start_pos = previous[1]
                else:
                    # a later mention of the same text: look past the previous
                    # match, falling back to the first occurrence if there is none
                    new_start_pos = text.find(annotation_text, previous[1] + 1)
                    if new_start_pos == -1:
                        new_start_pos = text.find(annotation_text)

                if new_start_pos == -1:
                    # writing start=-1 would produce an invalid brat line
                    print(f'  WARNING: annotation {annotations.at[annotation_index, "ann_type"]} '
                          f'({annotation_text!r}) not found in batch {batch_number}; skipped')
                else:
                    last_match[annotation_text] = (orig_start, new_start_pos)
                    annotations.at[annotation_index, 'start_pos'] = new_start_pos
                    annotations.at[annotation_index, 'end_pos'] = new_start_pos + len(annotation_text)
                    current_batch_annotation_indices.append(annotation_index)
                annotation_index += 1

            with open(f'{batch_dir}/{file_name_no_ext}-b-{batch_number}.txt', 'w', encoding='utf-8') as out_file:
                out_file.write(text)

            current_batch_annotations = annotations.iloc[current_batch_annotation_indices].copy().reset_index(drop=True)
            current_batch_annotations['entity_type_with_positions'] = None
            if not current_batch_annotations.empty:
                # renumber the ids from 1 within the batch, keeping the original prefix
                current_batch_annotations['ann_type'] = current_batch_annotations.apply(
                    lambda row: f'{row["ann_type"].rstrip("0123456789")}{row.name + 1}', axis=1)
                current_batch_annotations['entity_type_with_positions'] = current_batch_annotations.apply(
                    lambda row: f'{row["entity_type"]} {row["start_pos"]} {row["end_pos"]}', axis=1)

            current_batch_annotations.to_csv(
                f'{batch_dir}/{file_name_no_ext}-b-{batch_number}.ann',
                sep='\t', header=False, index=False,
                columns=['ann_type', 'entity_type_with_positions', 'text'], encoding='utf-8')

            # NOTE(review): exactly one character is subtracted per batch, however
            # many lines it holds -- confirm this matches how the splitter's
            # output lines up with the original offsets when sents_per_batch > 1.
            total_chars += len(text) - 1

    if splitter_proc.returncode != 0:
        # stderr is merged into stdout, so a failed run may have been written out as "sentences"
        print(f'  WARNING: sentence splitter exited with code {splitter_proc.returncode} '
              f'for {file_name}; its output files may be invalid')
    if annotation_index < total_annotations:
        print(f'  WARNING: {total_annotations - annotation_index} annotation(s) of '
              f'{file_name_no_ext} start beyond the splitter output and were not written')

Processing:  es-S1130-05582015000100008-1
Processing:  es-S1888-75462017000100042-1
Processing:  es-S1139-76322017000200010-1
Processing:  es-S0365-66912010001200005-1
Processing:  S0004-06142005001000011-1
Processing:  es-S1137-66272012000300021-1
Processing:  es-S1130-05582015000200006-1
Processing:  es-S0211-69952015000300011-1
Processing:  es-S0210-56912009000600008-1
Processing:  es-S1130-05582009000600003-3
Processing:  es-S1579-699X2004000400002-1
Processing:  es-S1130-01082008001200016-2
Processing:  es-S1130-01082005000500013-1
Processing:  es-S0365-66912012000700003-1
Processing:  es-S0212-16112011000600041-1
Processing:  es-S1130-05582011000300005-1
Processing:  es-S1139-76322013000500010-1
Processing:  es-S0365-66912008000700011-1
Processing:  es-S1130-01082007000300014-1
Processing:  es-S1137-66272009000100013-1
Processing:  es-S0212-71992005000200008-1
Processing:  es-S1130-63432016000500012-1
Processing:  es-S0212-71992004001100007-1
Processing:  es-S0376-789220140002000