In [36]:
import json
import pandas as pd
import os
import numpy as np

In [10]:
columns = ['Entity', 'Cases', 'Any label', 'normal', 'mild', 'moderate', 'severe', 'present']
# Entities expected in the summary table. Every name needs its own comma:
# Python silently concatenates adjacent string literals, which previously fused
# 'tricuspid_regurgitation' and 'mitral_regurgitation' into a single entry.
rows = ['aortic_stenosis', 'aortic_regurgitation', 'lv_dias_func', 'lv_sys_func', 'rv_sys_func', 'lv_dil', 'rv_dil',
        'tricuspid_regurgitation', 'mitral_regurgitation', 'pe', 'wma']
label_count_df = pd.DataFrame(columns=columns)

# Maps each raw span label string to the number of spans carrying it,
# counted over every line of the merged JSONL annotation file.
label_count_dict = {}

with open('/training/echo/text_mining/datasets/spancat/reduced_labels/merged_labels.jsonl', 'r') as f:
    for line in f:
        json_line = json.loads(line)
        for span in json_line['spans']:
            label = span['label']
            # Labels are counted verbatim here; no entity/severity split is needed at this stage.
            label_count_dict[label] = label_count_dict.get(label, 0) + 1

In [23]:
# Regroup the flat {raw_label: count} mapping into {entity: {severity: count}}.
dict_per_type = {}
for label, count in label_count_dict.items():
    parts = label.split('_')
    if len(parts) == 1:
        # A label without any underscore is a bare entity name: counted as 'present'.
        entity, severity = parts[0], 'present'
    elif len(parts) > 2 and parts[-2:] == ['not', 'present']:
        # '<entity>_not_present' is reported in the 'normal' bucket.
        entity, severity = '_'.join(parts[:-2]), 'normal'
    else:
        # Otherwise the last token is the severity and the rest is the entity.
        entity, severity = '_'.join(parts[:-1]), parts[-1]
    severity_counts = dict_per_type.setdefault(entity, {})
    # Accumulate instead of assigning: several raw labels can land on the same
    # (entity, severity) pair (e.g. '<entity>_normal' and '<entity>_not_present'
    # both map to 'normal'), and a plain assignment would keep only the last one.
    severity_counts[severity] = severity_counts.get(severity, 0) + count

In [31]:
# Build the per-entity label-count table from dict_per_type
# ({entity: {severity: count}}): one row per entity, one column per severity.
columns = ['Entity', 'Cases', 'Any label', 'normal', 'mild', 'moderate', 'severe', 'present']

records = []
for entity, severity_counts in dict_per_type.items():
    record = {'Entity': entity}
    # Accumulate under the name `total`; naming the accumulator `sum` would
    # replace the builtin for every cell executed afterwards in this kernel.
    total = 0
    for severity, count in severity_counts.items():
        total += count
        record[severity] = count
    record['Any label'] = total
    records.append(record)

# Construct the frame once from all records instead of enlarging it row by row.
# Columns that a record does not provide (e.g. 'Cases') are left as NaN.
label_count_df = pd.DataFrame(records, columns=columns)

In [32]:
# Display the per-entity label counts table built from dict_per_type.
label_count_df

Unnamed: 0,Entity,Cases,Any label,normal,mild,moderate,severe,present
0,lv_sys_func,,5136,3108,1041.0,493.0,494.0,
1,pe,,728,565,98.0,32.0,28.0,5.0
2,lv_dias_func,,1595,536,663.0,263.0,133.0,
3,tricuspid_valve_native_regurgitation,,1947,1417,293.0,164.0,73.0,
4,rv_sys_func,,2633,1926,445.0,198.0,64.0,
5,lv_dil,,2450,1908,256.0,93.0,52.0,141.0
6,wma,,1333,421,,,,912.0
7,aortic_valve_native_regurgitation,,2314,1654,500.0,123.0,37.0,
8,rv_dil,,1712,1359,165.0,75.0,28.0,85.0
9,mitral_valve_native_regurgitation,,2896,1790,813.0,227.0,66.0,


# Document-level statistics

In [34]:
# Directory holding one JSONL file per entity for the document-level counts.
path = '/training/echo/text_mining/datasets/reduced_labels/'
# Keep every .jsonl file in that directory except the merged-labels file.
files = [
    filename
    for filename in os.listdir(path)
    if 'merged_labels' not in filename and filename.endswith('.jsonl')
]

In [41]:
# Ordinal score of every document-level category; -1 is used for a document
# whose spans match none of the categories.
label_dict = {
    'No label': -1,
    'Normal': 0,
    'Present': 1,
    'Mild': 2,
    'Moderate': 3,
    'Severe': 4,
}
label_dict_rev = dict((score, name) for name, score in label_dict.items())

columns = ['Entity', 'Cases', 'Any label', 'Normal', 'Mild', 'Moderate', 'Severe', 'Present']
df = pd.DataFrame(columns=columns)

for file in files:
    # The file name without its '.jsonl' suffix is used as the entity name.
    entity = file.split('.jsonl')[0]
    nrows = 0
    any_label = 0
    count_dict = dict.fromkeys(label_dict, 0)
    with open(path + file, 'r') as f:
        for line in f:
            nrows += 1
            document = json.loads(line)
            # Each document is counted once, under the highest-scoring
            # category found among its spans.
            top_score = -1
            for span in document['spans']:
                label = span['label']
                negated = label.endswith('not_present')
                if negated or label.endswith('normal'):
                    top_score = max(top_score, 0)
                if label == 'pe' or (label.endswith('present') and not negated):
                    top_score = max(top_score, 1)
                # Graded severities: the label suffix is the lower-cased category name.
                for grade in ('Mild', 'Moderate', 'Severe'):
                    if label.endswith(grade.lower()):
                        top_score = max(top_score, label_dict[grade])
            top_label = label_dict_rev[top_score]
            count_dict[top_label] += 1
            if top_label != 'No label':
                any_label += 1
    row = {'Entity': entity, 'Cases': nrows, 'Any label': any_label}
    # Remaining columns ('Normal' ... 'Present') come straight from the tallies.
    for category in columns[3:]:
        row[category] = count_dict[category]
    df.loc[len(df)] = row

In [46]:
# Order the rows alphabetically by entity and renumber the index from 0.
df = (
    df
    .sort_values(by='Entity')
    .reset_index(drop=True)
)

In [47]:
# Write the document-level table to a LaTeX file, leaving out the index column.
df.to_latex(
    buf='/training/echo/text_mining/output/table2_document_label_counts.tex',
    index=False,
)

# Span-level statistics

In [34]:
# Directory holding one JSONL file per entity for the span-level counts.
path = '/training/echo/text_mining/datasets/spancat/reduced_labels/'
# Keep every .jsonl file in that directory except the merged-labels file.
files = list(filter(
    lambda filename: filename.endswith('.jsonl') and 'merged_labels' not in filename,
    os.listdir(path),
))

In [59]:
# Categories tallied per span; only the keys are used in the loop below.
label_dict = {
    'Normal': 0,
    'Present': 1,
    'Mild': 2,
    'Moderate': 3,
    'Severe': 4,
}
label_dict_rev = dict((score, name) for name, score in label_dict.items())

columns = ['Entity', 'Cases', 'Total # of spans', 'Normal', 'Mild', 'Moderate', 'Severe', 'Present']
df = pd.DataFrame(columns=columns)

for file in files:
    # The file name without its '.jsonl' suffix is used as the entity name.
    entity = file.split('.jsonl')[0]
    nrows = 0
    spans = 0
    count_dict = dict.fromkeys(label_dict, 0)
    with open(path + file, 'r') as f:
        for nrows, line in enumerate(f, start=1):
            for span in json.loads(line)['spans']:
                label = span['label']
                negated = label.endswith('not_present')
                if negated or label.endswith('normal'):
                    count_dict['Normal'] += 1
                if label == 'pe' or (label.endswith('present') and not negated):
                    count_dict['Present'] += 1
                # Graded severities: the label suffix is the lower-cased category name.
                for grade in ('Mild', 'Moderate', 'Severe'):
                    if label.endswith(grade.lower()):
                        count_dict[grade] += 1
                # Every span counts towards the total, whether or not it matched a category.
                spans += 1
    row = {'Entity': entity, 'Cases': nrows, 'Total # of spans': spans}
    # Remaining columns ('Normal' ... 'Present') come straight from the tallies.
    for category in columns[3:]:
        row[category] = count_dict[category]
    df.loc[len(df)] = row

In [60]:
# Order the rows alphabetically by entity and renumber the index from 0.
df = (
    df
    .sort_values(by='Entity')
    .reset_index(drop=True)
)

In [61]:
# Display the span-level counts table, sorted by entity.
df

Unnamed: 0,Entity,Cases,Total # of spans,Normal,Mild,Moderate,Severe,Present
0,aortic_regurgitation,5750,2670,1892,578,149,51,0
1,aortic_stenosis,5000,1845,1578,110,73,84,0
2,diastolic_dysfunction,5000,1595,536,663,263,133,0
3,lv_dil,5000,2450,1908,256,93,52,141
4,lv_syst_func,5000,5136,3108,1041,493,494,0
5,mitral_regurgitation,5000,2896,1790,813,227,66,0
6,pe,8862,1316,999,161,58,52,46
7,rv_dil,8226,2806,2189,294,132,50,141
8,rv_syst_func,5000,2633,1926,445,198,64,0
9,tricuspid_regurgitation,5000,1947,1417,293,164,73,0


In [62]:
# Write the span-level table to a LaTeX file, leaving out the index column.
df.to_latex(
    buf='/training/echo/text_mining/output/table2_span_label_counts.tex',
    index=False,
)