In [1]:
library(tidyverse)

── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.2.1     ✔ purrr   0.3.3
✔ tibble  2.1.3     ✔ dplyr   0.8.3
✔ tidyr   1.0.0     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.4.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


In [2]:
#' Compute how often manually annotated concepts were also found by Metamap.
#'
#' Annotations are joined to the Metamap table on trial id and concept code
#' (`snomed_id` against `CODE`). Each distinct combination of NCT_id,
#' matched_string, manual_string, criteria_string, snomed_id and pre_post is
#' counted once, and is a hit when at least one joined row has a non-missing
#' STR.
#'
#' @param annotated_df Data frame of annotations containing the six grouping
#'   columns listed above.
#' @param metamap_df Data frame joined on `NCT_id` and `CODE`; the `STR`
#'   column is expected to come from it. Defaults to the global `mm_df`, so
#'   existing one-argument calls behave as before.
#' @return A one-row tibble with n_total (distinct annotations), n_snomed_nan
#'   (annotations whose snomed_id is the literal 'N/A'), n_str_nan
#'   (annotations without a match), n_same (annotations with a match) and
#'   acc (n_same / n_total; NaN when there are no annotations).
compute_acc <- function(annotated_df, metamap_df = mm_df) {
    annotated_df %>%
        left_join(metamap_df, by = c('NCT_id', 'snomed_id' = 'CODE')) %>%
        group_by(NCT_id, matched_string, manual_string, criteria_string,
                 snomed_id, pre_post) %>%
        # The join yields one row per synonym. Only the existence of a
        #  non-missing STR matters, so collapse each annotation to a flag
        #  that does not depend on the row order of the joined table.
        summarize(
            has_match = any(!is.na(STR))
        ) %>%
        ungroup() %>%
        summarize(
            n_total = n(),
            # %in% never returns NA, so a missing snomed_id cannot turn the
            #  whole count into NA.
            n_snomed_nan = sum(snomed_id %in% 'N/A'),
            n_str_nan = sum(!has_match),
            n_same = sum(has_match),
            acc = n_same / n_total
        )
}

In [3]:
# Metamap outputs combined from Harry and Undina's annotated sample,
# read with every column as character.
mm_df <- read_csv(
    file = '../data/metamap/combined_outputs.csv',
    col_types = cols(.default = "c")
)

# Harry's "gold standard": read everything as character, then keep only the
# six annotation columns.
hrn_df <- read_csv(
    file = '../data/annotations/annotate_notes_hr2479.csv',
    col_types = cols(.default = "c")
)
hrn_df <- select(
    hrn_df,
    NCT_id, matched_string, manual_string, criteria_string, snomed_id, pre_post
)

# Undina's "gold standard": same columns, and rows without a trial id
# (NCT_id) are discarded.
uog_df <- read_csv(
    file = '../data/annotations/annotate_notes_uog.csv',
    col_types = cols(.default = "c")
)
uog_df <- select(
    uog_df,
    NCT_id, matched_string, manual_string, criteria_string, snomed_id, pre_post
)
uog_df <- drop_na(uog_df, NCT_id)

“Missing column names filled in: 'X13' [13]”

In [4]:
# One accuracy row per annotation set (each annotator alone, then both
# stacked), labelled by the list name in a trailing `source` column.
list(
    HRN = hrn_df,
    UOG = uog_df,
    BOTH = bind_rows(hrn_df, uog_df)
) %>%
    imap_dfr(~ mutate(compute_acc(.x), source = .y))

n_total,n_snomed_nan,n_str_nan,n_same,acc,source
<int>,<int>,<int>,<int>,<dbl>,<chr>
420,11,129,291,0.6928571,HRN
420,0,90,330,0.7857143,UOG
797,11,213,584,0.7327478,BOTH
