In [14]:
import os
import re
import json
import collections
from tqdm import tqdm
import time
import numpy as np
from datetime import datetime
import shutil

import traceback
import sys
sys.path.append('../')
from config import entity_to_acronyms, acronyms_to_entities, colors, index_to_label

# Alias of the `entity_to_acronyms` mapping imported from config
# (presumably entity name -> acronym, judging by its name; verify in config).
json_labels = entity_to_acronyms

import random
# Fixed seed so that any later use of the `random` module is repeatable.
random.seed(1)

In [None]:
# Input locations. NOTE(review): absolute, machine-specific paths; adjust before
# running on another machine.
dataset = '/home/tiagolima/Datasets/CISAM/dataset/data'
dataset_dir = '/home/tiagolima/Datasets/CISAM/adapted/'
# Directory into which get_docs_json writes its .txt/.ann files.
output_dir = "../data/bio_stanford"
# Create the output directory (and missing parents) if it does not exist yet.
if not os.path.exists(output_dir):
    os.makedirs(output_dir,  exist_ok=True, mode=0o777)

In [16]:
# Replacement callback that swaps ',' for '.' only inside the matched number.
def substituir_virgula_por_ponto(match):
    """Return the whole matched text with every ',' turned into '.'.

    Meant to be passed to ``re.sub`` as the replacement callable.

    Args:
        match: the ``re.Match`` object for the current occurrence.

    Returns:
        The text of group 0 with commas replaced by periods.
    """
    matched_text = match.group(0)
    return '.'.join(matched_text.split(','))

def substituir_virgula_por_espaco(match):
    """Return a single space followed by the text captured in group 1.

    Meant to be passed to ``re.sub`` as the replacement callable, so that the
    matched span collapses to " <group 1>".

    Args:
        match: the ``re.Match`` object; it must define capture group 1.

    Returns:
        A string made of one space plus the contents of group 1.
    """
    captured = match.group(1)
    return "".join((" ", captured))

def substituir_doispontos_entre_value_labels(match):
    """Return capture groups 1 and 2 joined by a single space.

    Meant to be passed to ``re.sub`` as the replacement callable, so that
    whatever separated the two groups in the match becomes one space.

    Args:
        match: the ``re.Match`` object; it must define capture groups 1 and 2.

    Returns:
        "<group 1> <group 2>".
    """
    label_and_value = match.group(1, 2)
    return " ".join(label_and_value)


# A run of ASCII letters immediately followed by ':' and an integer, e.g. "pa:120".
# Group 1 is the letters, group 2 the digits. Unlike the patterns below, this one
# is kept as a plain pattern string rather than a compiled object.
pattern_dots_inside = r'(\b[a-zA-Z]+)\:(\d+\b)'

# A '/' that sits directly between two ASCII letters (only the '/' is matched).
pattern_slash_space = re.compile(r'(?<=[a-zA-Z])/(?=[a-zA-Z])')
# Digits, a comma, then digits, e.g. "70,5".
pattern_norm_number = re.compile(r'\d+,\d+')
# A comma, optional whitespace and the next word character (captured as group 1).
pattern_replace_virgula = re.compile(r',\s*(\w)')

In [17]:
def substituir_virgulas_nao_datas(texto):
    """Replace every '/' that is not part of a date with a single space.

    Despite its name (which mentions commas), this function only touches
    slashes. A date is a ``DD/MM`` or ``DD/MM/YYYY`` token delimited by word
    boundaries; slashes inside such a token are left untouched.

    Args:
        texto: the input string.

    Returns:
        A string of the same length in which each non-date '/' became ' '.
    """
    date_regex = re.compile(r'\b\d{2}/\d{2}(?:/\d{4})?\b')

    # Every character index covered by a date match is protected from replacement.
    protected = set()
    for found in date_regex.finditer(texto):
        protected.update(range(found.start(), found.end()))

    return ''.join(
        ' ' if char == '/' and index not in protected else char
        for index, char in enumerate(texto)
    )

In [18]:
# Reverse lookup table: each value of json_labels becomes a key that maps back
# to its original key.
json_labels_inverted = {
    original_value: original_key
    for original_key, original_value in json_labels.items()
}

In [19]:
def replace_colon(text):
    """Replace colons inside tokens with spaces, except colons that belong to a time.

    Only whitespace-delimited tokens that have a non-space character on both
    sides of a colon (``\\S+:\\S+``) are processed, so a colon followed by
    whitespace (``"nome: valor"``) is left untouched. Within a processed token,
    a colon is kept when it is part of a time such as ``9:05`` or ``23:59``
    (1-2 digits, ':', 2 digits, delimited by word boundaries); every other
    colon becomes a space.

    Times are detected inside the token instead of requiring the whole token
    to be a time, so a time that touches punctuation (``"12:34,"`` or
    ``"(12:34)"``) keeps its colon.

    Args:
        text: the input string.

    Returns:
        The text with non-time colons inside tokens replaced by spaces.
    """
    # The first alternative consumes a whole time so its colon is never seen
    # by the second alternative, which matches any remaining colon.
    time_or_colon = re.compile(r'\b\d{1,2}:\d{2}\b|:')

    def _keep_time_or_blank(match):
        piece = match.group()
        return ' ' if piece == ':' else piece

    def _fix_token(token_match):
        return time_or_colon.sub(_keep_time_or_blank, token_match.group())

    return re.sub(r'\S+:\S+', _fix_token, text)

In [20]:
def remove_terminal_periods(text):
    """Delete a period that closes a line, unless a digit comes right before it.

    A period counts as terminal when only whitespace (if anything) separates
    it from the end of its line or the end of the text.

    Args:
        text: the input string, possibly spanning several lines.

    Returns:
        The text with those periods removed.
    """
    end_of_line_period = re.compile(r'(?<!\d)\.(?=\s*$)', re.MULTILINE)
    return end_of_line_period.sub('', text)

def remove_non_decimal_periods(text):
    """Delete every period that has no digit directly before or after it.

    A period next to a digit on either side (``1.5``, ``10.``, ``.5``) is
    kept; all other periods are removed, wherever they occur in the line.

    Args:
        text: the input string.

    Returns:
        The text without the periods that do not touch a digit.
    """
    lone_period = re.compile(r'(?<!\d)\.(?!\d)')
    return lone_period.sub('', text)

In [21]:
def remove_concatenated_commas(text):
    """Turn a comma glued between two characters into a single space.

    A comma is replaced only when the characters on both sides of it are
    neither whitespace nor another comma; commas next to a space, next to
    another comma, or at either end of the text are left as they are.

    Args:
        text: the input string.

    Returns:
        The text with each such comma replaced by one space.
    """
    glued_comma = re.compile(r'(?<=[^\s,]),(?=[^\s,])')
    return glued_comma.sub(' ', text)


In [22]:
def replace_commas_with_spaces(text):
    """Return ``text`` with every comma replaced by a single space."""
    pieces = text.split(',')
    return ' '.join(pieces)

In [23]:
def replace_commas_with_equals(text):
    """Return ``text`` with every '=' replaced by a single space.

    NOTE: despite the function name, commas are not touched; only equals
    signs are replaced.
    """
    pieces = text.split('=')
    return ' '.join(pieces)

In [24]:
def get_docs_json(data_dir, out_dir=None, label_map=None):
    """Convert the JSONL annotation exports under ``data_dir`` into .txt/.ann pairs.

    ``data_dir`` is walked recursively; every file whose name ends with
    ``jsonl`` is read as UTF-8 and each non-blank line is parsed as one JSON
    document with the keys ``id`` (integer), ``text``, ``entities`` and
    ``sessions``. For each document two files named after the zero-padded
    six-digit id are written to ``out_dir``:

    * ``<id>.txt`` - the lower-cased document text;
    * ``<id>.ann`` - one tab-separated line per annotation, ordered by start
      offset: ``T<n>`` lines for entities (each followed by an ``E<n>`` line
      when the entity type is one of the event types listed below) and
      ``S<n>`` lines for sessions.

    Args:
        data_dir: root directory to search for ``jsonl`` files.
        out_dir: directory that receives the output files. Defaults to the
            module-level ``output_dir``.
        label_map: mapping from an entity's ``label`` value to the type name
            written to the .ann file. Defaults to the module-level
            ``json_labels_inverted``.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a document lacks ``id``/``text``, an annotation lacks its
            offset keys, or an entity label is missing from ``label_map``.
    """
    if out_dir is None:
        out_dir = output_dir
    if label_map is None:
        label_map = json_labels_inverted

    # Entity types that additionally get an "E" (event) line in the .ann file.
    event_types = ('Clinical_event', 'Sign_symptom', 'Date', 'Medication', 'Time')

    for root, _, files in os.walk(data_dir):
        for filename in files:
            if not filename.endswith('jsonl'):
                continue

            with open(os.path.join(root, filename), 'r', encoding='utf-8') as f:
                raw_lines = f.readlines()

            for raw_line in raw_lines:
                # Blank lines (e.g. a trailing newline) are not documents.
                if not raw_line.strip():
                    continue
                doc = json.loads(raw_line)

                # NOTE(review): annotation offsets are applied to the lower-cased
                # text; this assumes lower-casing does not change the string
                # length (true for most, but not all, Unicode characters).
                text = doc['text'].lower()
                base_path = os.path.join(out_dir, f'{doc["id"]:06d}')

                with open(base_path + '.txt', 'w', encoding='utf-8') as file_events:
                    file_events.write(text)

                # Entities first, then sessions: the stable sort below keeps this
                # relative order for annotations that share a start offset.
                pool_events = [
                    {'start': entity['start_offset'], 'end': entity['end_offset'], 'label': entity['label']}
                    for entity in (doc.get('entities') or [])
                ]
                for session in (doc.get('sessions') or []):
                    # Two session schemas occur in the exports.
                    if 'startOffset' in session:
                        pool_events.append({'start': session['startOffset'], 'end': session['endOffset'], 'session': session['session']})
                    else:
                        pool_events.append({'start': session['start'], 'end': session['end'], 'session': session['label']})
                pool_events.sort(key=lambda event: event['start'])

                counter_terms = 1
                counter_clinical_events = 1
                counter_session = 1

                # The context manager closes the file even if a lookup below fails.
                with open(base_path + '.ann', 'w', encoding='utf-8') as ann_events:
                    for event in pool_events:
                        if 'session' in event:
                            ann_events.write(f"S{counter_session}\tSession {event['start']} {event['end']}\t{event['session']}\n")
                            counter_session += 1
                            continue

                        entity_type = label_map[event['label']]
                        ann_events.write(f"T{counter_terms}\t{entity_type} {event['start']} {event['end']}\t{text[event['start']:event['end']]} \n")
                        if entity_type in event_types:
                            ann_events.write(f"E{counter_clinical_events}\t{entity_type}:T{counter_terms} \n")
                            counter_clinical_events += 1
                        counter_terms += 1

In [25]:
# Run the conversion over the adapted dataset; the .txt/.ann files are written to output_dir.
get_docs_json(dataset_dir)

In [26]:
def clear_docs_json(
    data_dir,
    data_reference="/home/tiagolima/Datasets/CISAM/dataset/5K_data/0001/",
    meddle_dir="/home/tiagolima/Datasets/CISAM/dataset/5K_data/0001_meddle/",
):
    """Copy the reference documents that do not appear in the JSONL data yet.

    Every ``*json`` file under ``data_reference`` is indexed by its ``id``
    field. Every id found in the ``jsonl`` files under ``data_dir`` is then
    discarded from that index, and the remaining reference files are copied
    into ``meddle_dir`` (created if missing). The number of remaining
    documents is printed before copying.

    Args:
        data_dir: root directory searched recursively for ``jsonl`` files, one
            JSON document with an ``id`` key per non-blank line.
        data_reference: root directory searched recursively for reference
            files whose name ends with ``json``, each holding one JSON
            document with an ``id`` key.
        meddle_dir: destination directory for the copied reference files.

    Raises:
        json.JSONDecodeError: if a reference file or a non-blank JSONL line is
            not valid JSON.
        KeyError: if a parsed document has no ``id`` key.
    """
    os.makedirs(meddle_dir, exist_ok=True, mode=0o777)

    # id -> path of the reference file holding that document.
    reference_dic = {}
    for root, _, files in os.walk(data_reference):
        for filename in files:
            if filename.endswith('json'):
                file_path = os.path.join(root, filename)
                with open(file_path, 'r', encoding='utf-8') as f:
                    reference_dic[json.load(f)['id']] = file_path

    # A set tolerates ids that occur more than once in the JSONL data; deleting
    # the same key twice from the index would raise KeyError.
    already_present = set()
    for root, _, files in os.walk(data_dir):
        for filename in files:
            if not filename.endswith('jsonl'):
                continue
            with open(os.path.join(root, filename), 'r', encoding='utf-8') as f:
                for raw_line in f:
                    # Blank lines (e.g. a trailing newline) are not documents.
                    if raw_line.strip():
                        already_present.add(json.loads(raw_line)['id'])

    remaining = {
        doc_id: path
        for doc_id, path in reference_dic.items()
        if doc_id not in already_present
    }
    print(len(remaining))

    for source in remaining.values():
        shutil.copy(source, os.path.join(meddle_dir, os.path.basename(source)))

In [27]:
# clear_docs_json(dataset_dir)