In [3]:
import pandas as pd

In [19]:
# Tab-separated predictions read into `df_mentions` below.
mentions_file = 'output/multicardioner_track1_cardioccc_dev_predictions.tsv'
# Tab-separated externally extracted drug mentions read into `df_external_mentions` below.
external_mentions_file = 'output/extracted_drugs_cardioccc_dev_en.tsv'
# Destination of the merged, overlap-free mention list written by the last cell.
result_file = 'output/multicardioner_track1_cardioccc_dev_predictions_combined.tsv'

In [5]:
# Load the predicted mentions: a tab-separated file whose first row is the
# header; `index_col=False` keeps the first column as data instead of
# promoting it to the index.
df_mentions = pd.read_csv(
    mentions_file,
    sep='\t',
    header=0,
    index_col=False,
)
df_mentions.head()

Unnamed: 0,filename,label,start_span,end_span,text
0,casos_clinicos_cardiologia10,ENFERMEDAD,95,124,Hipertensión arterial crónica
1,casos_clinicos_cardiologia10,ENFERMEDAD,126,139,Ex-tabaquista
2,casos_clinicos_cardiologia10,ENFERMEDAD,142,215,Diabetes mellitus tipo 2 con repercusione...
3,casos_clinicos_cardiologia10,ENFERMEDAD,217,238,cardiopatía isquémica
4,casos_clinicos_cardiologia10,ENFERMEDAD,240,260,arteriopatía de MMII


In [6]:
# Load the externally extracted drug mentions. Same file layout as the
# predictions (tab-separated, header in the first row), but with different
# column names (`file_name`, `start_index`, `end_index`, `drug_in_text`).
df_external_mentions = pd.read_csv(
    external_mentions_file,
    sep='\t',
    header=0,
    index_col=False,
)
df_external_mentions.head()

Unnamed: 0,file_name,label,start_index,end_index,drug_in_text,reference,original_drug,context
0,casos_clinicos_cardiologia10,FARMACO,2438,2447,Cefazolin,ATC-DrugBank-DrugCentral-NIHS,cefazolin,CONTEXT:jury.Treatment:Cefazolin2g c/8 hs iv; ...
1,casos_clinicos_cardiologia10,FARMACO,2461,2471,gentamicin,ATC-DrugBank-DrugCentral,gentamicin,CONTEXT:olin2g c/8 hs iv; gentamicin 3mg/kg/da...
2,casos_clinicos_cardiologia10,FARMACO,2487,2497,rifampicin,ATC-DrugCentral-NIHS,rifampicin,CONTEXT:in 3mg/kg/day iv; rifampicin 600 mg c/...
3,casos_clinicos_cardiologia10,FARMACO,3079,3087,amikacin,ATC-DrugBank-DrugCentral,amikacin,CONTEXT:PreQ prophylaxis: amikacin 1g and vanc...
4,casos_clinicos_cardiologia10,FARMACO,3095,3105,vancomycin,ATC-DrugBank-DrugCentral-NIHS,vancomycin,CONTEXT:: amikacin 1g and vancomycin 1g.- Aort...


In [23]:
def filter_spans(spans):
    """Drop overlapping spans, preferring the longest one.

    Each span is a mapping with at least the integer keys ``'start_span'``
    and ``'end_span'``; ``end_span`` is treated as exclusive. Candidates are
    visited from longest to shortest (ties: the earlier start wins, then the
    earlier position in ``spans``). A candidate is kept only if neither its
    first nor its last covered offset is already taken by a kept span.

    Returns a new list with the kept spans ordered by ``'start_span'``.
    """
    def _priority(candidate):
        # Longer spans rank higher; among equal lengths, smaller starts rank higher.
        length = candidate['end_span'] - candidate['start_span']
        return (length, -candidate['start_span'])

    by_priority = list(spans)
    by_priority.sort(key=_priority, reverse=True)

    kept = []
    occupied = set()
    for candidate in by_priority:
        first, stop = candidate['start_span'], candidate['end_span']
        # `stop` is exclusive, so the last offset the span covers is stop - 1.
        # Checking only the two ends is enough: candidates arrive longest
        # first, so a later span can never strictly contain a kept one.
        if first in occupied or stop - 1 in occupied:
            continue
        kept.append(candidate)
        occupied.update(range(first, stop))

    kept.sort(key=lambda candidate: candidate['start_span'])
    return kept

In [29]:
# Build the sorted list of every file name that occurs in either input.
external_filenames = [name for name in df_external_mentions['file_name'].values]
original_filenames = [name for name in df_mentions['filename'].values]
# NOTE: after this line `original_filenames` also holds the external names.
original_filenames += external_filenames
filenames = sorted(set(original_filenames))

In [30]:
# Merge both mention sources file by file and resolve overlaps with
# `filter_spans`. The spans are bucketed per file in a single pass instead of
# re-filtering both DataFrames with a boolean mask for every file name.

# Column names of the external drug file mapped onto the prediction schema.
external_to_span_columns = {
    'file_name': 'filename',
    'start_index': 'start_span',
    'end_index': 'end_span',
    'drug_in_text': 'text',
}
span_columns = ['filename', 'label', 'start_span', 'end_span', 'text']

external_as_mentions = df_external_mentions.rename(columns=external_to_span_columns)

# Predictions are added before the external mentions so that, when two spans
# tie on length and start, `filter_spans` keeps the predicted one (its sort is
# stable), exactly as the row-by-row version did.
spans_by_file = {}
for source in (df_mentions, external_as_mentions):
    for span in source[span_columns].to_dict('records'):
        spans_by_file.setdefault(span['filename'], []).append(span)

all_spans = []
for filename in filenames:
    # Files without any span simply contribute nothing.
    all_spans.extend(filter_spans(spans_by_file.get(filename, [])))

In [31]:
# Pin the column schema explicitly: with an empty `all_spans` the frame would
# otherwise have no columns at all and the export cell's column selection
# would raise a KeyError instead of writing a header-only file.
df_all_mentions = pd.DataFrame(
    all_spans,
    columns=['filename', 'label', 'start_span', 'end_span', 'text'],
)
df_all_mentions.head()

Unnamed: 0,filename,label,start_span,end_span,text
0,casos_clinicos_cardiologia10,ENFERMEDAD,95,124,Hipertensión arterial crónica
1,casos_clinicos_cardiologia10,ENFERMEDAD,126,139,Ex-tabaquista
2,casos_clinicos_cardiologia10,ENFERMEDAD,142,215,Diabetes mellitus tipo 2 con repercusione...
3,casos_clinicos_cardiologia10,ENFERMEDAD,217,238,cardiopatía isquémica
4,casos_clinicos_cardiologia10,ENFERMEDAD,240,260,arteriopatía de MMII


In [32]:
# Write the merged mentions as a tab-separated file without the row index,
# restricted to (and ordered by) the five mention columns.
df_all_mentions.to_csv(
    result_file,
    sep='\t',
    index=False,
    columns=['filename', 'label', 'start_span', 'end_span', 'text'],
)