In [1]:
library(tidyverse)

── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.2.1     ✔ purrr   0.3.2
✔ tibble  2.1.3     ✔ dplyr   0.8.3
✔ tidyr   1.0.0     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.4.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


In [2]:
# Harry's "gold standard" annotations, every column parsed as character
hrn_df <- read_csv(
    file = '../../data/annotations/annotate_notes_hr2479.csv',
    col_types = cols(.default = col_character())
) %>%
    select(
        NCT_id, matched_string, manual_string,
        criteria_string, snomed_id, pre_post
    )

# Undina's "gold standard" annotations, read the same way; rows that have
# no NCT_id are discarded
uog_df <- read_csv(
    file = '../../data/annotations/annotate_notes_uog.csv',
    col_types = cols(.default = col_character())
) %>%
    select(
        NCT_id, matched_string, manual_string,
        criteria_string, snomed_id, pre_post
    ) %>%
    filter(!is.na(NCT_id))

# hrn_df and uog_df share one schema, so a single row of hrn_df shows both
head(hrn_df, 1)

“Missing column names filled in: 'X13' [13]”

NCT_id,matched_string,manual_string,criteria_string,snomed_id,pre_post
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
NCT00000456,panic,panic disorder,"y patient report, collateral report, and urine drug screen.  - Meets criteria for disorders of major depression, panic disorder, obsessive-compulsive  disorder, post-traumatic s",371631005,


In [3]:
# Results table produced by 1.c2q_api.ipynb, every column parsed as character
crit_results <- read_csv(
    file = '../../data/c2q/results_table.csv',
    col_types = cols(.default = col_character())
)

# Preview the first row
head(crit_results, 1)

NCT_id,matched_string,criteria_string,source,cohort_name,concept_code,concept_name
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
NCT03937804,scoliosis,bronchitis lung transplant kyphoscoliosis sarcoidosis bronchopulmonary dysplasia,uog,[C2Q]bronchitis,32398004,Bronchitis


In [4]:
# Summarise how many annotated concepts are found in a results table.
#
# Each annotation is left-joined to `results` on trial id (NCT_id) and
# concept code (snomed_id == concept_code). The joined rows are collapsed to
# one row per distinct (NCT_id, manual_string, snomed_id, pre_post), and an
# annotation counts as matched when the first joined cohort_name is not NA.
#
# Arguments:
#   annotated_df  data frame with columns NCT_id, manual_string, snomed_id
#                 and pre_post.
#   results       data frame with columns NCT_id, concept_code and
#                 cohort_name. Defaults to the global `crit_results`, so
#                 existing one-argument calls behave as before.
#
# Returns a one-row tibble with:
#   n_total       number of distinct annotations
#   n_snomed_nan  annotations whose snomed_id is the literal string 'N/A'
#   n_str_nan     annotations with no matching row in `results`
#   n_same        annotations with a matching row in `results`
#   acc           n_same / n_total (NaN when there are no annotations)
compute_acc <- function(annotated_df, results = crit_results) {
    # Only the join keys and the match indicator are needed from `results`;
    # dropping the rest avoids .x/.y suffixed duplicates of columns that
    # exist in both tables (e.g. matched_string, criteria_string).
    result_keys <- results %>%
        select(NCT_id, concept_code, cohort_name)

    per_annotation <- annotated_df %>%
        left_join(
            result_keys,
            by = c('NCT_id', 'snomed_id' = 'concept_code')
        ) %>%
        # The join can fan out (several result rows per key) and the same
        # annotation can occur more than once; keep one row per annotation.
        group_by(NCT_id, manual_string, snomed_id, pre_post) %>%
        summarize(matched_cohort = first(cohort_name)) %>%
        ungroup()

    per_annotation %>%
        summarize(
            n_total = n(),
            # na.rm keeps a missing snomed_id from turning the count into NA
            n_snomed_nan = sum(snomed_id == 'N/A', na.rm = TRUE),
            n_str_nan = sum(is.na(matched_cohort)),
            n_same = sum(!is.na(matched_cohort)),
            acc = n_same / n_total
        )
}

In [5]:
# Score each annotator's set, then the two sets stacked together, and
# row-bind the one-row summaries with a `source` label on each.
list(
    HRN = hrn_df,
    UOG = uog_df,
    BOTH = bind_rows(hrn_df, uog_df)
) %>%
    imap_dfr(function(annotations, label) {
        mutate(compute_acc(annotations), source = label)
    })

n_total,n_snomed_nan,n_str_nan,n_same,acc,source
<int>,<int>,<int>,<int>,<dbl>,<chr>
420,11,376,44,0.1047619,HRN
420,0,169,251,0.597619,UOG
790,11,526,264,0.3341772,BOTH
