In [None]:
library(ggplot2)
library(tidyverse)
library(bigrquery)
library(lubridate)
library(table1)
library(margins)

In [None]:
### prep census data ###

# Query the school census table for rows whose academic year starts in
# 2005-2018, keeping only people who also have a row in the primary care
# person table. (The original author noted the figure 2100773 here --
# presumably the resulting row/person count; confirm before relying on it.)
census_sql_query <- bq_dataset_query(
    x = "yhcr-prd-phm-bia-core.CB_FDM_DepartmentForEducation",
    query = "SELECT person_id, AcademicYear, NCyearActual, AgeAtStartOfAcademicYear
             FROM `yhcr-prd-phm-bia-core.CB_FDM_DepartmentForEducation.src_census` census
             WHERE EXISTS(
                 SELECT person_id 
                 FROM `yhcr-prd-phm-bia-core.CB_FDM_PrimaryCare_v4.person` pc
                 WHERE census.person_id = pc.person_id
             )
             AND CAST(SUBSTR(AcademicYear, 1,4) AS INT64) > 2004 AND  
             CAST(SUBSTR(AcademicYear, 1,4) AS INT64) < 2019"
)

# Ordered factor of the main year groups: "R" followed by "1" to "11".
# Used both as the set of valid year groups and as the level ordering.
main_year_groups_factor <- ordered(
    c("R", as.character(1:11)),
    levels = c("R", as.character(1:11))
)

# Download the census query result and reduce it to one row per person and
# academic year, with a year group restricted to the main groups R-11.
census_data <- bq_table_download(census_sql_query) %>%

    # collapse duplicates: keep the first age and first recorded year group
    # encountered for each person / academic year
    group_by(person_id, AcademicYear) %>%
    summarise(
        age_ac_yr_start = first(AgeAtStartOfAcademicYear),
        year_group_og = first(NCyearActual)
    ) %>%
    ungroup() %>%

    # prefer the recorded year group; when it is missing, derive one from the
    # age at the start of the academic year (4 -> "R", 5..15 -> "1".."11").
    # Any other age leaves year_group as NA.
    mutate(
        year_group = coalesce(
            year_group_og,
            case_when(
                age_ac_yr_start == 4 ~ "R",
                between(age_ac_yr_start, 5, 15) ~
                    as.character(age_ac_yr_start - 4)
            )
        )
    ) %>%

    # keep only rows in the main year groups and store the year group as an
    # ordered factor sharing the levels of main_year_groups_factor
    filter(year_group %in% levels(main_year_groups_factor)) %>%
    mutate(
        year_group = factor(
            year_group,
            levels = levels(main_year_groups_factor),
            ordered = TRUE
        )
    ) %>%
    rename(academic_year = AcademicYear) %>%
    arrange(person_id, academic_year)

# number of person / academic-year rows retained
nrow(census_data)

In [None]:
# Generate a "grid" table with one row per year group (R-11) for every unique
# person_id in census_data, then attach the census rows to it. Year groups
# with no census record remain as rows whose census columns are NA; for those
# rows an academic start year is inferred from the person's earliest record.
# The result is returned ungrouped.
census_grid <- expand.grid(
        unique(census_data$person_id),
        main_year_groups_factor
    ) %>%
    # rename the expand.grid default column names to something actually useful!
    rename(person_id = Var1, year_group = Var2) %>%
    # Join grid table to census data - missing years will be populated with NA
    # values for census cols. The keys are spelled out (they are the only two
    # shared columns) so the join cannot silently change if columns are added.
    left_join(
        census_data %>% mutate(census_year_group = year_group),
        by = c("person_id", "year_group")
    ) %>%
    # take the first four characters of academic_year as the integer start year
    mutate(year_start = as.integer(str_sub(academic_year, 1, 4))) %>%
    group_by(person_id) %>%
    # for each person_id - generate stats to infer year start for missing values
    mutate(min_year_start = min(year_start, na.rm = TRUE),
           max_year_start = max(year_start, na.rm = TRUE),
           year_span = max_year_start + 1 - min_year_start,
           min_year_group = min(census_year_group, na.rm = TRUE)) %>%
    # infer the year start for any missing values: offset the earliest start
    # year by the distance (in factor-level positions) between this row's year
    # group and the person's earliest recorded year group
    mutate(inf_year_start = case_when(
        !is.na(year_start) ~ year_start,
        is.na(year_start) ~
            min_year_start - as.integer(min_year_group) + as.integer(year_group))
    ) %>%

    # =====
    # remove any years that are missing because they skipped a year group
    # e.g. there are lots of yr 1 -> yr 3: an inferred row is dropped when its
    # inferred start year coincides with another row for the same person
    ungroup() %>%
    group_by(person_id, inf_year_start) %>%
    mutate(n = n()) %>%
    filter(!(is.na(year_start) & n > 1)) %>%
    # drop the (person_id, inf_year_start) grouping so it does not leak into
    # later steps (a grouped table makes select() re-add grouping columns)
    ungroup() %>%
    # =====
    # drop any entries that are dated before or after start/end years of dataset
    filter(inf_year_start > 2004 & inf_year_start < 2019) %>%
    arrange(person_id, year_group)

In [None]:
### prep exclusions data ###

# Query every column of exclusions_cleaned for academic years starting
# 2005-2018, using the Department for Education dataset.
exclusion_sql_query <- bq_dataset_query(
    x = "yhcr-prd-phm-bia-core.CB_FDM_DepartmentForEducation",
    query = "SELECT * FROM exclusions_cleaned
             WHERE CAST(SUBSTR(AcademicYear, 1,4) AS INT64) > 2004 
                AND CAST(SUBSTR(AcademicYear, 1,4) AS INT64) < 2019"
)

# Download the exclusion rows and summarise them to one row per person and
# academic year: the earliest StartDate and the number of exclusion rows.
exclusion_data <- bq_table_download(exclusion_sql_query) %>%
    rename(academic_year = AcademicYear) %>%
    group_by(person_id, academic_year) %>%
    summarise(
        first_excl_acad_yr = min(StartDate),
        n_exclusions = n()
    ) %>%
    ungroup()

# Lookup: one row per person with at least one row in exclusion_data.
any_exclusions <- mutate(
    distinct(exclusion_data, person_id),
    has_exclusions = TRUE
)

# Lookup: one row per person and academic year present in exclusion_data.
exclusions_this_year <- mutate(
    distinct(exclusion_data, person_id, academic_year),
    excluded_this_year = TRUE
)

In [None]:
# Query person_id and diagnosis_date from ASD_master_tab in the ASD dataset.
asd_sql_query <- bq_dataset_query(
    x = "yhcr-prd-phm-bia-core.CB_ASD_data",
    query = "SELECT person_id, diagnosis_date FROM ASD_master_tab"
)

# Pull the query result into a local data frame.
asd_data <- bq_table_download(asd_sql_query)

# Lookup: one row per person appearing in the ASD table.
has_asd <- mutate(
    distinct(asd_data, person_id),
    has_asd_diagnosis = TRUE
)

In [None]:
# Flag every person / year-group row of census_grid as:
#   missing  - no census record for that year group (year_start is NA)
#   excluded - the person appears in any_exclusions
#   has_asd  - the person has at least one non-missing diagnosis_date
missingness_data <- census_grid %>%
    # Join a one-row-per-person ASD lookup rather than the raw ASD table:
    # joining asd_data directly would duplicate every grid row of a person
    # who has more than one ASD row, inflating all later counts.
    left_join(
        asd_data %>%
            filter(!is.na(diagnosis_date)) %>%
            distinct(person_id) %>%
            mutate(has_asd_diagnosis = TRUE),
        by = "person_id"
    ) %>%
    left_join(any_exclusions, by = "person_id") %>%
    mutate(missing = is.na(year_start),
           excluded = !is.na(has_exclusions),
           has_asd = !is.na(has_asd_diagnosis)) %>%
    select(person_id, year_group, missing, excluded, has_asd)

In [None]:
# Add person_entries: for each person, the number of their rows that are not
# flagged as missing. The table is returned ungrouped.
missingness_data <- ungroup(
    mutate(
        group_by(missingness_data, person_id),
        person_entries = sum(!missing)
    )
)

In [None]:
# Notebook plot display size and resolution.
options(repr.plot.width = 7.5, repr.plot.height = 5, repr.plot.res = 200)

# Line plot of the percentage of rows flagged missing in each year group:
# overall, among rows flagged excluded, and among rows flagged has_asd.
missing_groups <- missingness_data %>%
    # row-level indicators combining `missing` with each subgroup flag
    mutate(
        missing_excluded = missing & excluded,
        missing_not_excluded = missing & !excluded,
        missing_asd = missing & has_asd,
        missing_not_asd = missing & !has_asd
    ) %>%
    group_by(year_group) %>%
    # percentages within each year group; the two "not" columns are computed
    # but not drawn below
    summarise(
        pct_missing = sum(missing) / n() * 100,
        pct_missing_excluded = sum(missing_excluded) / sum(excluded) * 100,
        pct_asd_missing = sum(missing_asd) / sum(has_asd) * 100,
        pct_missing_not_excluded = sum(missing_not_excluded) / sum(!excluded) * 100,
        pct_not_asd_missing = sum(missing_not_asd) / sum(!has_asd) * 100
    ) %>%
    # x is shared by every layer; each series uses a constant string for both
    # group and colour so it gets its own line and legend entry
    ggplot(aes(x = year_group)) +
    geom_line(aes(y = pct_missing, group = "Overall Missing", color = "Overall Missing")) +
    geom_point(aes(y = pct_missing, group = "Overall Missing", color = "Overall Missing")) +
    geom_line(aes(y = pct_missing_excluded, group = "Excluded Missing", color = "Excluded Missing")) +
    geom_point(aes(y = pct_missing_excluded, group = "Excluded Missing", color = "Excluded Missing")) +
    geom_line(aes(y = pct_asd_missing, group = "ASD Missing", color = "ASD Missing")) +
    geom_point(aes(y = pct_asd_missing, group = "ASD Missing", color = "ASD Missing")) +
    labs(x = "Year Group", y = "% Missing", color = "") +
    theme_classic()

# write the figure to disk, then display it in the notebook
ggsave("plots/missing_groups.jpg", missing_groups, width = 7.5, height = 5)
missing_groups

In [None]:
# Print, per year group, the percentage of rows flagged missing: overall,
# within excluded / not excluded rows, and within has_asd / not has_asd rows.
missingness_data %>%
    group_by(year_group) %>%
    summarise(
        pct_missing = sum(missing) / n() * 100,
        pct_missing_excluded = sum(excluded & missing) / sum(excluded) * 100,
        pct_asd_missing = sum(has_asd & missing) / sum(has_asd) * 100,
        pct_missing_not_excluded = sum(!excluded & missing) / sum(!excluded) * 100,
        pct_not_asd_missing = sum(!has_asd & missing) / sum(!has_asd) * 100
    )

In [None]:
# Notebook plot display size and resolution.
options(repr.plot.width = 7.5, repr.plot.height = 5, repr.plot.res = 200)

# Percentage of rows flagged missing per year group, restricted to school
# phases (year groups below 7 vs 7 and above) in which the person has enough
# non-missing rows: at least 7 below year 7, at least 5 from year 7 upwards.
missing_groups_corrected <- missingness_data %>%
    # relies on year_group being an ordered factor: TRUE for levels "7".."11"
    mutate(secondary_school = year_group >= 7) %>%
    group_by(person_id, secondary_school) %>%
    mutate(n_primary_secondary_years = sum(!missing)) %>%
    filter((n_primary_secondary_years >= 7 & !secondary_school) |
            (n_primary_secondary_years >= 5 & secondary_school)) %>%
    ungroup() %>%
    # Row-level indicators. The "present" flags are built here because
    # summarise() below reuses the name `missing` for a count, after which the
    # row-level `missing` column is no longer available.
    mutate(missing_excluded = excluded & missing,
           present_excluded = excluded & !missing,
           missing_asd = has_asd & missing,
           present_asd = has_asd & !missing) %>%
    group_by(year_group) %>%
    # For each subgroup: present count + missing count = total, mirroring the
    # overall n / missing / total. Counting all subgroup rows and then adding
    # the missing ones again would double-count the missing rows in the
    # denominator. Rates are multiplied by 100 to match the "% Missing" axis.
    summarise(n = sum(!missing),
              missing = sum(missing),
              total = n + missing,
              pct_missing = missing / total * 100,
              excluded = sum(present_excluded),
              missing_excluded = sum(missing_excluded),
              total_excluded = excluded + missing_excluded,
              pct_missing_excluded = missing_excluded / total_excluded * 100,
              has_asd = sum(present_asd),
              asd_missing = sum(missing_asd),
              total_asd = has_asd + asd_missing,
              pct_asd_missing = asd_missing / total_asd * 100) %>%
    ggplot() +
    geom_line(aes(x=year_group, y=pct_missing, group="Overall Missing", color="Overall Missing")) +
    geom_point(aes(x=year_group, y=pct_missing, group="Overall Missing", color="Overall Missing")) +
    geom_line(aes(x=year_group, y=pct_missing_excluded, group="Excluded Missing", color="Excluded Missing")) +
    geom_point(aes(x=year_group, y=pct_missing_excluded, group="Excluded Missing", color="Excluded Missing")) +
    geom_line(aes(x=year_group, y=pct_asd_missing, group="ASD Missing", color="ASD Missing")) +
    geom_point(aes(x=year_group, y=pct_asd_missing, group="ASD Missing", color="ASD Missing")) +
    labs(x="Year Group", y="% Missing", color="")  +
    theme_classic()

# write the figure to disk, then display it in the notebook
ggsave("plots/missing_groups_corrected.jpg", missing_groups_corrected, width=7.5, height=5)
missing_groups_corrected