In [1]:
from pathlib import Path
import pandas as pd
import sys
import pdb
import evaluate
from tqdm import tqdm
import numpy as np
import json

from sectiontagger import SectionTagger
# Shared tagger instance; add_section_divisions() below calls its
# divide_note_by_metasections() method on every reference and generated note.
section_tagger = SectionTagger()


# Section labels that are scored separately, in addition to the full note.
SECTION_DIVISIONS = ['subjective', 'objective_exam', 'objective_results', 'assessment_and_plan']

# NOTE(review): the TASKA_* / TASKB_* constants are not referenced by any cell
# visible in this notebook — confirm whether they are still needed.
TASKA_RANGE = [0,100]
TASKA_PREFIX = 'taskA'

TASKB_RANGE = [88,127]
TASKB_PREFIX = 'D2N'

def add_section_divisions(row, model_name, dialogue_column):
    """Annotate one row with its source length and the per-section note texts.

    Args:
        row: mapping-like row holding the reference note under ``'note'``, the
            generated note under ``model_name`` and the dialogue under
            ``dialogue_column``.
        model_name: column name of the generated note.
        dialogue_column: column name of the source dialogue.

    Returns:
        The same ``row`` object, with ``'src_len'`` (whitespace-token count of
        the dialogue) and one ``'<column>_<label>'`` entry per section that the
        tagger detected in the reference and in the generated note.
    """
    row['src_len'] = len(row[dialogue_column].split())

    for source_column in ('note', model_name):
        # Notes store line breaks as the '__lf1__' placeholder; the tagger is
        # given real newlines and the extracted span is converted back.
        note_text = row[source_column].replace('__lf1__', '\n')
        for division in section_tagger.divide_note_by_metasections(note_text):
            label, _, _, start, _, end = division
            section_text = note_text[start:end]
            row[f'{source_column}_{label}'] = section_text.replace('\n', '__lf1__')

    return row

2023-04-18 13:59:38.644548: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
print('Loading ROUGE, BERTScore, BLEURT from HuggingFace')

# Load the three metrics up front (same order as they appear in `scorers`).
rouge_metric = evaluate.load('rouge')
bertscore_metric = evaluate.load('bertscore')
bleurt_metric = evaluate.load('bleurt', config_name='BLEURT-20')

# Each entry is a 4-tuple:
#   (metric object,
#    extra keyword arguments passed to metric.compute(),
#    keys to read from the compute() result,
#    keys under which those scores are stored)
scorers = {}
scorers['rouge'] = (
    rouge_metric,
    {'use_aggregator': False},
    ['rouge1', 'rouge2', 'rougeL', 'rougeLsum'],
    ['rouge1', 'rouge2', 'rougeL', 'rougeLsum'],
)
scorers['bert_scorer'] = (
    bertscore_metric,
    {'model_type': 'microsoft/deberta-xlarge-mnli', 'batch_size': 8},
    ['precision', 'recall', 'f1'],
    ['bertscore_precision', 'bertscore_recall', 'bertscore_f1'],
)
scorers['bleurt'] = (
    bleurt_metric,
    {},
    ['scores'],
    ['bleurt'],
)

Loading ROUGE, BERTScore, BLEURT from HuggingFace
INFO:tensorflow:Reading checkpoint /root/.cache/huggingface/metrics/bleurt/BLEURT-20/downloads/extracted/cd1c38739d180ae53192201859a058307621534b704c20700072eca17d748c58/BLEURT-20.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint BLEURT-20
INFO:tensorflow:Loads full paths and checks that files exists.
INFO:tensorflow:... name:BLEURT-20
INFO:tensorflow:... bert_config_file:bert_config.json
INFO:tensorflow:... max_seq_length:512
INFO:tensorflow:... vocab_file:None
INFO:tensorflow:... do_lower_case:None
INFO:tensorflow:... sp_model:sent_piece
INFO:tensorflow:... dynamic_seq_length:True
INFO:tensorflow:Creating BLEURT scorer.
INFO:tensorflow:Creating SentencePiece tokenizer.
INFO:tensorflow:Creating SentencePiece tokenizer.
INFO:tensorflow:Will load model: /root/.cache/huggingface/metrics/bleurt/BLEURT-20/downloads/extracted/cd1c38739d180ae53192201859a058307621534b704c20700072eca17d748c58/BLEURT-20/sent_piece

2023-04-18 13:59:50.709494: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-04-18 13:59:50.711615: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


INFO:tensorflow:BLEURT initialized.


In [3]:
def filter_and_aggregate(obj, indices):
    """Average each metric over a subset of instances.

    Args:
        obj: mapping of metric name -> indexable sequence of per-instance scores.
        indices: positions of the instances to include in the average.

    Returns:
        A dict mapping every metric name to a one-element list holding the
        mean (as a Python float) of the selected scores.
    """
    return {
        metric: [float(np.mean([values[idx] for idx in indices]))]
        for metric, values in obj.items()
    }

In [4]:
# Score every model's generated notes against the reference notes for each
# `*.csv` split file in the working directory, then write the per-cohort
# averages to `<model_arch>_<split>_results.json`.

# Columns holding the generated notes; hoisted out of the loop (loop-invariant).
model_architectures = ["dialogled-large_summary","dialogled-base_summary"]
# Dialogue length (whitespace tokens, see add_section_divisions) that separates
# the 'shorter-src' cohort from the 'longer-src' cohort.
SRC_LEN_CUTOFF = 512

# sorted(): glob yields files in arbitrary order; sorting makes runs reproducible.
for csv_file in sorted(Path.cwd().glob("*.csv")):
    # The split id is the second "_"-separated field of the file name.
    split = int(csv_file.stem.split("_")[1])
    # Read the file once and reuse it for every model column.
    split_df = pd.read_csv(csv_file)

    for model_name in model_architectures:
        model_arch = model_name.split("_")[0]
        full_df = split_df[["dataset","note",model_name,"dialogue"]]
        references = full_df["note"].tolist()
        predictions = full_df[model_name].tolist()
        num_test = len(full_df)

        full_df = full_df.apply( lambda row: add_section_divisions( row, model_name, "dialogue" ), axis=1)

        # ===========CHECKS TO MAKE SURE THERE ARE SECTIONS ==========
        # A division that was never detected in any generated note has no
        # column at all, so only count the columns that exist (indexing a
        # missing column would raise KeyError instead of reaching the check).
        total_detected_sections = sum(
            full_df[f'{model_name}_{division}'].notna().sum()
            for division in SECTION_DIVISIONS
            if f'{model_name}_{division}' in full_df.columns
        )
        if total_detected_sections == 0:
            # NOTE(review): the message mentions an `override_section_check`
            # flag that is not defined in the visible cells.
            print('We detected 0 sections! - you can use override_section_check flag to run while ignoring this.')
            sys.exit(1)

        # Fill in missing section divisions as empty string
        full_df = full_df.fillna('#####EMPTY#####')

        ######## ADD INSTANCES FOR SECTION DIVISION ########
        # Section-level instances are appended after the full-note instances,
        # one block of `num_test` items per division, in SECTION_DIVISIONS order.
        for division in SECTION_DIVISIONS:
            null_default = [''] * num_test
            references.extend(full_df.get(f'note_{division}', null_default))
            predictions.extend(full_df.get(f'{model_name}_{division}', null_default))

        # sanity check, we should now have 5 x the original set (one for full note, 4 for the divisions)
        rn = len(references)
        pn = len(predictions)
        en = len(full_df) * 5
        assert rn == pn == en, f'The number of references ({rn}) and predictions ({pn}) does not match expected ({en})'

        # Run every scorer once over all instances (full notes + sections).
        all_scores = {}
        for name, (scorer, kwargs, keys, save_keys) in tqdm(scorers.items(),desc="scorers"):
            scores = scorer.compute(references=references, predictions=predictions, **kwargs)
            for score_key, save_key in zip(keys, save_keys):
                all_scores[save_key] = scores[score_key]

        # Each cohort is (name, positions into the score lists).
        cohorts = [
            ('all', list(range(num_test))),
        ]

        subsets = full_df['dataset'].unique().tolist()
        for subset in subsets:
            # Don't include anything after num_test (section-level)
            # Row labels double as positions here because read_csv produced
            # the default 0..n-1 index.
            indices = full_df[full_df['dataset'] == subset].index.tolist()
            cohorts.append((f'dataset-{subset}', indices))

        for ind, division in enumerate(SECTION_DIVISIONS):
            start = (ind + 1) * num_test
            end = (ind + 2) * num_test
            cohorts.append((f'division-{division}', list(range(start, end))))

        # ######## CALCULATE PER-LENGTH SCORES (longer than SRC_LEN_CUTOFF vs not) ########
        df_shortsrc = full_df[full_df['src_len'] <= SRC_LEN_CUTOFF]
        if len(df_shortsrc) > 0:
            indices = df_shortsrc.index.tolist()
            cohorts.append(('shorter-src', indices))

        df_longsrc = full_df[full_df['src_len'] > SRC_LEN_CUTOFF]
        if len(df_longsrc) > 0:
            indices = df_longsrc.index.tolist()
            cohorts.append(('longer-src', indices))

        outputs = {k: filter_and_aggregate(all_scores, idxs) for (k, idxs) in cohorts}

        # ###### OUTPUT TO JSON FILE ########
        fn_out = f'{model_arch}_{split}_results.json'
        print(f'Saving results to {fn_out}')
        with open(fn_out, 'w') as fd:
            json.dump(outputs, fd, indent=4)

scorers: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [03:24<00:00, 68.09s/it]


Saving results to dialogled-large_1_results.json


scorers: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [03:09<00:00, 63.08s/it]


Saving results to dialogled-base_1_results.json


scorers: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [02:59<00:00, 59.99s/it]


Saving results to dialogled-large_2_results.json


scorers: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [02:56<00:00, 58.74s/it]


Saving results to dialogled-base_2_results.json


scorers: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [03:06<00:00, 62.30s/it]


Saving results to dialogled-large_0_results.json


scorers: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [03:06<00:00, 62.14s/it]


Saving results to dialogled-base_0_results.json


scorers: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [03:00<00:00, 60.30s/it]


Saving results to dialogled-large_3_results.json


scorers: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [02:57<00:00, 59.00s/it]


Saving results to dialogled-base_3_results.json


scorers: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [02:58<00:00, 59.44s/it]


Saving results to dialogled-large_4_results.json


scorers: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [02:55<00:00, 58.65s/it]

Saving results to dialogled-base_4_results.json





In [47]:
# Only the per-split files written by the evaluation cell. A bare "*.json"
# would also pick up `all_sections_json.json`, which this notebook writes
# itself, and corrupt the aggregation on any re-run.
RESULTS_GLOB = "*_results.json"


def collect_cohort_frames(json_files):
    """Gather per-split result files into one DataFrame per cohort.

    Args:
        json_files: iterable of paths to `<model_arch>_<split>_results.json`
            files, each mapping cohort name -> {metric name: [score]}.

    Returns:
        Dict mapping cohort name -> DataFrame with one row per input file that
        contains the cohort. Rows are labelled with the model architecture
        (the file-name part before the first "_"); columns are the metrics.
    """
    frames_by_cohort = {}
    for json_file in json_files:
        json_file = Path(json_file)
        with open(json_file, "r") as f:
            json_data = json.load(f)
        # Same for every cohort in the file, so compute it once.
        model_arch = json_file.stem.split("_")[0]
        for cohort, scores in json_data.items():
            # Each score list has one element, so the frame has a single row
            # labelled 0; relabel it with the model architecture.
            frame = pd.DataFrame.from_dict(scores).rename(index={0: model_arch})
            frames_by_cohort.setdefault(cohort, []).append(frame)
    # Concatenate once per cohort instead of growing a frame inside the loop.
    return {
        cohort: pd.concat(frames, axis=0)
        for cohort, frames in frames_by_cohort.items()
    }


model_arch_list = ["dialogled-base","dialogled-large"]
# sorted(): glob order is arbitrary; sorting keeps the row order reproducible.
json_file_list = sorted(Path.cwd().glob(RESULTS_GLOB))
df_dict = collect_cohort_frames(json_file_list)

In [71]:
# For every cohort, average each metric over the splits of the same model
# architecture and keep the result as a plain nested dict.
df_grouped_dict = dict()
for cohort, cohort_scores in df_dict.items():
    print(cohort)
    # The row labels (model architecture) become the "index" column, which is
    # the grouping key for the per-model mean.
    per_model_mean = cohort_scores.reset_index().groupby("index").mean()
    df = per_model_mean.reset_index().rename(columns={"index": "model_architecure"})
    df_grouped_dict[cohort] = df.to_dict()

all
dataset-virtassist
dataset-virtscribe
dataset-aci
division-subjective
division-objective_exam
division-objective_results
division-assessment_and_plan
longer-src


In [72]:
# Persist the per-cohort, per-model averages as pretty-printed JSON.
output_path = Path("all_sections_json.json")
with output_path.open("w") as out_file:
    json.dump(df_grouped_dict, out_file, indent=2)