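If you're opening this notebook on Colab, you will probably need to install 🤗 Transformers and 🤗 Datasets (and `pandas`, which the cells below import) yourself. Note that the following cell does not contain an install command: it only records, as a comment, the Tint shell command used to tokenize and sentence-split the input documents, so uncommenting it will not install anything.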

In [None]:
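# Reference only (a shell command, not Python): the Tint invocation used to tokenize and
# sentence-split a document; presumably this is how the JSON files read below were produced.
#./tint-runner-0.2-bin/tint/tint.sh -i input.txt -o output.txt -f json --properties annotators=tokenize,ssplit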

In [3]:
import os
from subprocess import Popen, PIPE, STDOUT
import pandas as pd

# --- Configuration -----------------------------------------------------------
# Location of the local Tint runner installation.
tint_path = '/mnt/d/school/multicardioner/tint-runner-0.2-bin/tint'

# Root of the Italian dev split; the other paths are derived from it.
_corpus_dir = '/mnt/d/school/multicardioner/multicardioner_train+dev+test+bg_240502/track2/cardioccc_dev/it'

# Directory holding the brat `.ann` annotation files.
annotations_path = f'{_corpus_dir}/brat'
# Directory holding the Tint JSON output (one `.txt`-named JSON file per document).
text_path = f'{annotations_path}/output'
# Directory under which the per-batch splits and the sentence table are written.
output_path = f'{_corpus_dir}/sentences'

# Max number of sentences in each output file.
sents_per_batch = 1
# When True no `.ann` files are read and empty annotation files are emitted.
is_test = True

In [None]:
import json
from pathlib import Path

# Split every sentence-segmented document into batches of `sents_per_batch`
# sentences. For each batch a `.txt` file (the batch text) and a brat `.ann`
# file (the annotations falling inside the batch, re-based to batch-relative
# offsets and renumbered from 1) are written under
# `{output_path}/splits_{sents_per_batch}/`. One record per batch is also
# collected in `sentences`.


def _read_annotations(ann_file):
    """Load a brat ``.ann`` file into a DataFrame sorted by start offset.

    An entity line looks like ``T105<TAB>FARMACO 911 922<TAB>azatioprina``.
    Annotator comment lines (id starting with ``#``) are dropped and the middle
    field is split into ``entity_type``, ``start_pos`` and ``end_pos``.
    """
    raw = pd.read_csv(ann_file, sep='\t', names=['ann_type', 'entity_type', 'text'],
                      encoding='utf-8', keep_default_na=False)
    entities = raw[~raw['ann_type'].str.startswith('#')].copy()

    span = entities['entity_type']  # e.g. "FARMACO 911 922"
    entities['start_pos'] = span.apply(lambda v: int(v.split()[1]))
    entities['end_pos'] = span.apply(lambda v: int(v.split()[2]))
    entities['entity_type'] = span.apply(lambda v: v.split()[0])
    # ignore_index gives a 0..n-1 index, so positions and labels coincide below.
    return entities.sort_values(by=['start_pos'], ignore_index=True)


def _join_sentences(batch):
    """Concatenate the sentences of a batch while preserving character offsets.

    The gap between consecutive sentences is filled with spaces so that every
    sentence starts at ``characterOffsetBegin - batch_start`` in the result.
    With a single sentence the result is exactly that sentence's text.
    """
    batch_start = batch[0]['characterOffsetBegin']
    text = ''
    for sentence in batch:
        padding = sentence['characterOffsetBegin'] - batch_start - len(text)
        text += ' ' * max(padding, 0) + sentence['text']
    return text


# Make sure the target directory exists before any file is written.
split_dir = Path(output_path) / f'splits_{sents_per_batch}'
split_dir.mkdir(parents=True, exist_ok=True)

sentences = []
for file_name in os.listdir(text_path):
    # Only files whose name *ends* in .txt are inputs.
    if not file_name.endswith('.txt'):  # and 'casos_clinicos_cardiologia129' in file_name:
        continue
    # Path.stem removes just the extension; str.rstrip('.txt') would strip any
    # trailing '.', 't' or 'x' characters and corrupt some names.
    file_name_no_ext = Path(file_name).stem

    # The input is JSON despite the .txt extension; read it as UTF-8 to match
    # the encoding used for the files written below.
    with open(Path(text_path) / file_name, encoding='utf-8') as file:
        file_json = json.load(file)

    print("Processing: ", file_name_no_ext)

    if is_test:
        # No gold annotations in test mode: empty .ann files are still emitted.
        annotations = pd.DataFrame(columns=['ann_type', 'text'])
    else:
        annotations = _read_annotations(f'{annotations_path}/{file_name_no_ext}.ann')

    total_annotations = len(annotations)
    annotation_index = 0  # next not-yet-assigned annotation (sorted by start_pos)
    doc_sentences = file_json['sentences']

    # Stepping through the sentence list in chunks also emits a trailing,
    # shorter batch when the sentence count is not a multiple of sents_per_batch.
    for batch_number, first in enumerate(range(0, len(doc_sentences), sents_per_batch), start=1):
        batch = doc_sentences[first:first + sents_per_batch]
        batch_start = batch[0]['characterOffsetBegin']
        batch_end = batch[-1]['characterOffsetEnd']
        text = _join_sentences(batch)

        # Consume the annotations that start inside this batch and rewrite
        # their offsets relative to the batch start.
        first_annotation = annotation_index
        while annotation_index < total_annotations:
            start_pos = annotations.iloc[annotation_index]['start_pos']
            if not (batch_start <= start_pos < batch_end):
                break
            annotation_text = str(annotations.iloc[annotation_index]['text'])
            new_start_pos = start_pos - batch_start
            annotations.at[annotation_index, 'start_pos'] = new_start_pos
            annotations.at[annotation_index, 'end_pos'] = new_start_pos + len(annotation_text)
            annotation_index += 1

        batch_stem = f'{file_name_no_ext}-b-{str(batch_number).zfill(3)}-{batch_start}'
        with open(split_dir / f'{batch_stem}.txt', 'w', encoding='utf-8') as out_file:
            out_file.write(text)

        sentences.append({
            'filename': file_name_no_ext,
            'batch_number': batch_number,
            'batch_start': batch_start,
            'text': text
        })

        current_batch_annotations = (
            annotations.iloc[first_annotation:annotation_index].copy().reset_index(drop=True)
        )
        current_batch_annotations['entity_type_with_positions'] = None
        if not current_batch_annotations.empty:
            # Renumber ids within the batch: keep the letter prefix, count from 1.
            current_batch_annotations['ann_type'] = [
                f'{ann_id.rstrip("0123456789")}{position}'
                for position, ann_id in enumerate(current_batch_annotations['ann_type'], start=1)
            ]
            current_batch_annotations['entity_type_with_positions'] = [
                f'{entity_type} {start} {end}'
                for entity_type, start, end in zip(current_batch_annotations['entity_type'],
                                                   current_batch_annotations['start_pos'],
                                                   current_batch_annotations['end_pos'])
            ]

        current_batch_annotations.to_csv(
            split_dir / f'{batch_stem}.ann', sep='\t', header=False, index=False,
            columns=['ann_type', 'entity_type_with_positions', 'text'], encoding='utf-8')

    # Annotations left over did not start inside any sentence span.
    if annotation_index < total_annotations:
        print('missing annotations')

In [5]:
# Turn the per-batch records (one dict per written batch) into a table and
# preview its first rows.
df_sentences = pd.DataFrame(sentences)
df_sentences.head()

Unnamed: 0,filename,batch_number,batch_start,text
0,casos_clinicos_cardiologia10,1,0,"Anamnesi\nUomo, 79 anni."
1,casos_clinicos_cardiologia10,2,24,Autosufficiente.
2,casos_clinicos_cardiologia10,3,41,Di Salto.
3,casos_clinicos_cardiologia10,4,51,Anamnesi: -Ipertensione arteriosa cronica.
4,casos_clinicos_cardiologia10,5,94,Ex fumatore.


In [6]:
# Size check: (number of rows, number of columns) of the sentence table.
df_sentences.shape

(17513, 4)

In [7]:
# Persist the sentence table as a tab-separated file, without the row index.
sentences_tsv = f'{output_path}/sentences.tsv'
df_sentences.to_csv(sentences_tsv, sep='\t', index=False)