In [None]:
library(ggplot2)
library(tidyverse)
library(bigrquery)
library(lubridate)
library(table1)
library(margins)

In [None]:
### prep census data ###

# Census rows for pupils who also have a record in the primary care `person`
# table, limited to academic years whose first four characters parse to
# 2005-2018.
# (The original note recorded "2100773" here - presumably a row or person
# count; TODO confirm.)
census_sql_query <- bq_dataset_query(
    x = "yhcr-prd-phm-bia-core.CB_FDM_DepartmentForEducation",
    query = "SELECT person_id, AcademicYear, NCyearActual, AgeAtStartOfAcademicYear
             FROM `yhcr-prd-phm-bia-core.CB_FDM_DepartmentForEducation.src_census` census
             WHERE EXISTS(
                 SELECT person_id 
                 FROM `yhcr-prd-phm-bia-core.CB_FDM_PrimaryCare_v4.person` pc
                 WHERE census.person_id = pc.person_id
             )
             AND CAST(SUBSTR(AcademicYear, 1,4) AS INT64) > 2004 AND  
             CAST(SUBSTR(AcademicYear, 1,4) AS INT64) < 2019"
)

# Ordered factor of the main school year groups: Reception ("R") followed by
# years 1 to 11. Each label appears once as a value and once as a level.
main_year_groups_factor <- local({
    year_labels <- c("R", as.character(1:11))
    ordered(year_labels, levels = year_labels)
})

census_data <- census_sql_query %>%
    bq_table_download() %>%

    # collapse to one row per person and academic year, keeping the first
    # age / year-group entry encountered
    group_by(person_id, AcademicYear) %>%
    summarise(age_ac_yr_start = first(AgeAtStartOfAcademicYear),
              year_group = first(NCyearActual)) %>%
    ungroup() %>%

    # where the recorded year group is missing, infer it from age at the start
    # of the year: age 4 -> "R", ages 5-15 -> (age - 4); anything else stays NA
    mutate(year_group = coalesce(
        year_group,
        case_when(
            age_ac_yr_start == 4 ~ "R",
            between(age_ac_yr_start, 5, 15) ~ as.character(age_ac_yr_start - 4)
        )
    )) %>%

    # keep only Reception to Year 11 and encode year group as an ordered factor
    filter(year_group %in% levels(main_year_groups_factor)) %>%
    mutate(year_group = factor(year_group,
                               levels = levels(main_year_groups_factor),
                               ordered = TRUE)) %>%
    rename(academic_year = AcademicYear) %>%
    arrange(person_id, academic_year)


### prep exclusions data ###

# All columns of `exclusions_cleaned` for academic years whose first four
# characters parse to 2005-2018. The table name is unqualified, so it is
# resolved against the dataset passed as `x`.
exclusion_sql_query <- bq_dataset_query(
    x = "yhcr-prd-phm-bia-core.CB_FDM_DepartmentForEducation",
    query = "SELECT * FROM exclusions_cleaned
             WHERE CAST(SUBSTR(AcademicYear, 1,4) AS INT64) > 2004 
                AND CAST(SUBSTR(AcademicYear, 1,4) AS INT64) < 2019"
)

exclusion_data <- exclusion_sql_query %>%
    bq_table_download() %>%
    # one row per person and academic year: the earliest exclusion start date
    # and the number of exclusion records in that year
    group_by(person_id, AcademicYear) %>%
    summarise(first_excl_acad_yr = min(StartDate),
              n_exclusions = n()) %>%
    ungroup() %>%
    rename(academic_year = AcademicYear)


### prep demographics data ###

# Sex and broad ethnic group for every person who appears in the census
# table. Gender concept ids 45766034 / 45766035 are mapped to 'Male' /
# 'Female' (anything else becomes NULL); ethnic_group is the part of
# census_ethnicity before the first colon.
demo_sql_query <- bq_dataset_query(
    x = "yhcr-prd-phm-bia-core.CB_FDM_DepartmentForEducation",
    query = "SELECT person_id,  
                CASE 
                    WHEN remapped_gender = 45766034 THEN 'Male' 
                    WHEN remapped_gender = 45766035 THEN 'Female' 
                ELSE NULL
            END AS sex, 
                CASE
                    WHEN REGEXP_EXTRACT(census_ethnicity, r'^(.+?):') IS NOT NULL THEN REGEXP_EXTRACT(census_ethnicity, r'^(.+?):')
                    ELSE NULL
                END AS ethnic_group, 
            FROM `yhcr-prd-phm-bia-core.CB_STAGING_DATABASE.src_DemoGraphics_MASTER` demo
            WHERE EXISTS(
                SELECT person_id 
                FROM `yhcr-prd-phm-bia-core.CB_FDM_DepartmentForEducation.src_census` census 
                WHERE census.person_id = demo.person_id)"
)

# Recode demographics: ethnic group collapsed to White / Asian / Other
# (missing stays NA) and a logical `male` flag derived from sex.
demo_data <- demo_sql_query %>%
    bq_table_download() %>%
    mutate(ethnicity = case_when(
        ethnic_group == "White" ~ "White",
        ethnic_group == "Asian or Asian British" ~ "Asian",
        !is.na(ethnic_group) ~ "Other"
    )) %>%
    mutate(ethnicity = factor(ethnicity),
           male = (sex == "Male")) %>%
    select(person_id, ethnicity, male)


### prep joined data ###

# Minimum of `x` ignoring NAs, returning NA (instead of min()'s Inf plus a
# warning) when there is nothing to take a minimum of.
#
# x: a vector accepted by min(), e.g. numeric or Date.
# Returns a length-one value. When `x` is empty or entirely NA the result is
# an NA of the same class as `x`, so the return type does not depend on the
# data; a NULL input yields a plain NA.
min_wo_na <- function(x) {
    if (is.null(x)) {
        return(NA)
    }
    if (all(is.na(x))) {
        # indexing with NA_integer_ gives a single NA that keeps x's class
        return(unname(x[NA_integer_]))
    }
    min(x, na.rm = TRUE)
}
                          
# Person-level diagnosis dates read from the ASD master table.
asd_sql_query <- bq_dataset_query(
    x = "yhcr-prd-phm-bia-core.CB_ASD_data",
    query = "SELECT person_id, diagnosis_date FROM ASD_master_tab"
)

asd_data <- bq_table_download(asd_sql_query)
                          
# Build the person-by-academic-year analysis table: census rows enriched with
# the ASD diagnosis date, per-year exclusion summary and demographics, plus
# the derived flags used by the tables, plots and models below.
# Join keys are spelled out so that a shared column name added later cannot
# silently change what the joins match on.
asd_exclusions <- census_data %>%
    left_join(asd_data, by = "person_id") %>%
    left_join(exclusion_data, by = c("person_id", "academic_year")) %>%
    left_join(demo_data, by = "person_id") %>%
    mutate(
        # nominal start of the academic year: 1 September of the year given by
        # the first four characters of academic_year
        year_start_date = as.Date(
            str_c(str_sub(academic_year, end = 4), "-09-01")
        ),
        # ---------
        no_diag = is.na(diagnosis_date),
        # positive when the diagnosis date precedes the start of the year
        days_from_to_diag = difftime(year_start_date, diagnosis_date,
                                     units = "days"),
        six_mth_diag = days_from_to_diag > 182,
        pending_diag = days_from_to_diag <= 182,
        # year_group is an ordered factor, so 7 is matched against its levels
        secondary_school = year_group >= 7,
        # must be computed before n_exclusions has its NAs replaced
        excluded = !is.na(n_exclusions),
        excluded_pending = excluded & pending_diag,
        n_exclusions = ifelse(is.na(n_exclusions), 0, n_exclusions),
        # Reception -> 0, Year 1 -> 1, ...
        year_group_int = as.integer(year_group) - 1
    ) %>%

    # earliest exclusion start date across all of a person's rows
    group_by(person_id) %>%
    mutate(first_excl_ever = min_wo_na(first_excl_acad_yr)) %>%
    ungroup() %>%

    # row-wise features are computed on the ungrouped table: it avoids
    # evaluating them once per person and, for diag_status, gives one fixed
    # level order (per-person factors would be merged in order of appearance,
    # making the model's reference level depend on row order)
    mutate(
        # TRUE once the year starts on or after the person's first exclusion
        has_prev_excl = !is.na(first_excl_ever) &
            year_start_date >= first_excl_ever,
        diag_status = factor(
            case_when(
                six_mth_diag ~ "six_mth_diag",
                pending_diag ~ "pending_diag",
                no_diag ~ "no_diag"
            ),
            levels = c("no_diag", "pending_diag", "six_mth_diag")
        )
    ) %>%

    # exclusion status of the person's previous row; relies on census_data
    # being sorted by person and academic year.
    # NOTE(review): this is the previous census row, which is not necessarily
    # the immediately preceding academic year if a year is missing - confirm.
    group_by(person_id) %>%
    mutate(excl_prev_year = replace_na(lag(excluded), FALSE)) %>%
    ungroup() %>%
    # -----------
    # add count of primary/secondary years for each individual
    group_by(person_id, secondary_school) %>%
    mutate(n_primary_secondary_years = n()) %>%
    ungroup() %>%
    # -----------
    # add meets_missing_threshold variable: at least 6 primary rows for a
    # primary row, at least 4 secondary rows for a secondary row
    mutate(meets_missing_threshold =
               (n_primary_secondary_years >= 6 & !secondary_school) |
               (n_primary_secondary_years >= 4 & secondary_school))
                          

In [None]:
# Defensive re-fill: any NA left in excl_prev_year is treated as "not excluded
# in the previous year".
asd_exclusions <- replace_na(asd_exclusions, list(excl_prev_year = FALSE))


In [None]:
# Per-year-group summary over all rows: row count, number and percentage
# (two decimals) of rows in each diagnosis category, and of excluded rows.
asd_exclusions %>%
    group_by(year_group) %>%
    summarise(
        Total = n(),
        no_diag = sum(no_diag, na.rm = TRUE),
        pending_diag = sum(pending_diag, na.rm = TRUE),
        six_mth_diag = sum(six_mth_diag, na.rm = TRUE),
        excluded = sum(excluded)
    ) %>%
    mutate(
        pct_no_diag = round(100 * no_diag / Total, digits = 2),
        pct_pending_diag = round(100 * pending_diag / Total, digits = 2),
        pct_six_mth_diag = round(100 * six_mth_diag / Total, digits = 2),
        pct_excluded = round(100 * excluded / Total, digits = 2)
    ) %>%
    # interleave each count with its percentage
    select(
        year_group, Total,
        no_diag, pct_no_diag,
        pending_diag, pct_pending_diag,
        six_mth_diag, pct_six_mth_diag,
        excluded, pct_excluded
    ) # %>% write_csv("table_1.csv")

In [None]:
# Same per-year-group summary, restricted to rows that meet the
# missing-data threshold.
asd_exclusions %>%
    filter(meets_missing_threshold) %>%
    group_by(year_group) %>%
    summarise(
        Total = n(),
        no_diag = sum(no_diag, na.rm = TRUE),
        pending_diag = sum(pending_diag, na.rm = TRUE),
        six_mth_diag = sum(six_mth_diag, na.rm = TRUE),
        excluded = sum(excluded)
    ) %>%
    mutate(
        pct_no_diag = round(100 * no_diag / Total, digits = 2),
        pct_pending_diag = round(100 * pending_diag / Total, digits = 2),
        pct_six_mth_diag = round(100 * six_mth_diag / Total, digits = 2),
        pct_excluded = round(100 * excluded / Total, digits = 2)
    ) %>%
    # interleave each count with its percentage
    select(
        year_group, Total,
        no_diag, pct_no_diag,
        pending_diag, pct_pending_diag,
        six_mth_diag, pct_six_mth_diag,
        excluded, pct_excluded
    ) # %>% write_csv("table_1.csv")

In [None]:
options(repr.plot.width = 7.5, repr.plot.height = 5, repr.plot.res = 200)
# Share of rows in each year group that are "Diagnosed" (diagnosis more than
# 182 days before the start of the year) or "Pending" (any other diagnosis
# date), over all rows. Bars stack the two categories; lines and points show
# each one separately.
excl_preds <-
    asd_exclusions %>%
        mutate(Diagnosis = factor(
            case_when(
                no_diag ~ "None",
                six_mth_diag ~ "Diagnosed",
                !six_mth_diag & !no_diag ~ "Pending"
            ),
            levels = c("Diagnosed", "Pending", "None")
        )) %>%
        group_by(year_group, Diagnosis) %>%
        summarise(n = n()) %>%
        # summarise() peels off only the last grouping variable, so the share
        # is computed within each year group (the "None" rows are still in
        # the denominator at this point)
        mutate(pct = n / sum(n)) %>%
        filter(Diagnosis != "None") %>%
    ggplot() +
    geom_bar(aes(x = year_group, y = pct), stat = "identity", alpha = 0.3) +
    geom_line(aes(x = year_group, y = pct, color = Diagnosis, group = Diagnosis)) +
    geom_point(aes(x = year_group, y = pct, color = Diagnosis, group = Diagnosis)) +
    # the y axis is a proportion (0-1), not a count
    labs(x = "Year Group", y = "Proportion of pupil-years") +
    theme_classic() +
    scale_color_manual(values = c("#E69F00", "#56B4E9", "#696969"))
# saved under its own name: "plots/excl_pcts.jpg" is written by the
# exclusion-rate plots further down and would overwrite this figure
ggsave("plots/diag_pcts.jpg", excl_preds, width = 7.5, height = 5)
excl_preds

In [None]:
options(repr.plot.width = 7.5, repr.plot.height = 5, repr.plot.res = 200)
# Share of rows in each year group that are "Diagnosed" (diagnosis more than
# 182 days before the start of the year) or "Pending" (any other diagnosis
# date), restricted to rows that meet the missing-data threshold. Bars stack
# the two categories; lines and points show each one separately.
excl_preds <-
    asd_exclusions %>%
        filter(meets_missing_threshold) %>%
        mutate(Diagnosis = factor(
            case_when(
                no_diag ~ "None",
                six_mth_diag ~ "Diagnosed",
                !six_mth_diag & !no_diag ~ "Pending"
            ),
            levels = c("Diagnosed", "Pending", "None")
        )) %>%
        group_by(year_group, Diagnosis) %>%
        summarise(n = n()) %>%
        # summarise() peels off only the last grouping variable, so the share
        # is computed within each year group (the "None" rows are still in
        # the denominator at this point)
        mutate(pct = n / sum(n)) %>%
        filter(Diagnosis != "None") %>%
    ggplot() +
    geom_bar(aes(x = year_group, y = pct), stat = "identity", alpha = 0.3) +
    geom_line(aes(x = year_group, y = pct, color = Diagnosis, group = Diagnosis)) +
    geom_point(aes(x = year_group, y = pct, color = Diagnosis, group = Diagnosis)) +
    # the y axis is a proportion (0-1), not a count
    labs(x = "Year Group", y = "Proportion of pupil-years") +
    theme_classic() +
    scale_color_manual(values = c("#E69F00", "#56B4E9", "#696969"))
# saved under its own name: "plots/excl_pcts.jpg" is written by the
# exclusion-rate plots further down and would overwrite this figure
ggsave("plots/diag_pcts_threshold.jpg", excl_preds, width = 7.5, height = 5)
excl_preds

In [None]:
options(repr.plot.width = 7.5, repr.plot.height = 5, repr.plot.res = 200)
# Percentage of rows with at least one exclusion, by year group and diagnosis
# category ("Diagnosed", "Pending", "None"), over all rows.
excl_preds <- asd_exclusions %>%
    mutate(Diagnosis = factor(
        case_when(
            no_diag ~ "None",
            six_mth_diag ~ "Diagnosed",
            !six_mth_diag & !no_diag ~ "Pending"
        ),
        levels = c("Diagnosed", "Pending", "None")
    )) %>%
    group_by(year_group, Diagnosis) %>%
    summarise(n = n(),
              n_excluded = sum(excluded),
              pct_excluded = (n_excluded / n) * 100) %>%
    ggplot(aes(x = year_group,
               y = pct_excluded,
               color = Diagnosis,
               group = Diagnosis)) +
    geom_line() +
    geom_point() +
    labs(x = "Year Group", y = "Percentage Excluded") +
    theme_classic() +
    scale_color_manual(values = c("#E69F00", "#56B4E9", "#696969"))
# saved under its own name: the threshold-restricted version of this plot
# in the next cell writes "plots/excl_pcts.jpg" and would overwrite it
ggsave("plots/excl_pcts_all.jpg", excl_preds, width = 7.5, height = 5)
excl_preds

In [None]:
options(repr.plot.width = 7.5, repr.plot.height = 5, repr.plot.res = 200)
# Percentage of rows with at least one exclusion, by year group and diagnosis
# category, restricted to rows that meet the missing-data threshold.
excl_preds <- asd_exclusions %>%
    filter(meets_missing_threshold) %>%
    mutate(Diagnosis = factor(
        case_when(
            no_diag ~ "None",
            six_mth_diag ~ "Diagnosed",
            !six_mth_diag & !no_diag ~ "Pending"
        ),
        levels = c("Diagnosed", "Pending", "None")
    )) %>%
    group_by(year_group, Diagnosis) %>%
    summarise(
        n = n(),
        n_excluded = sum(excluded),
        pct_excluded = (n_excluded / n) * 100
    ) %>%
    ggplot(aes(x = year_group, y = pct_excluded,
               color = Diagnosis, group = Diagnosis)) +
    geom_line() +
    geom_point() +
    labs(x = "Year Group", y = "Percentage Excluded") +
    theme_classic() +
    scale_color_manual(values = c("#E69F00", "#56B4E9", "#696969"))
ggsave("plots/excl_pcts.jpg", excl_preds, width = 7.5, height = 5)
excl_preds

In [None]:
# Display the assembled analysis table for inspection.
asd_exclusions

In [None]:
# Logistic regression of being excluded in a given year, fitted on all
# primary-school rows (year groups below 7).
# NOTE(review): this is the only one of the four models in this file that
# includes excl_prev_year as a predictor - confirm that is intentional.
primary_model_all <- glm(
    formula = excluded ~ diag_status + excl_prev_year + has_prev_excl +
        year_group_int + male + ethnicity,
    data = asd_exclusions %>% filter(!secondary_school),
    family = binomial(link = "logit")
)
summary(primary_model_all)

In [None]:
# Odds ratios with Wald confidence limits for a fitted model.
#
# Exponentiates the model coefficients together with the interval limits
# returned by confint.default(), then formats everything without scientific
# notation.
#
# model: a fitted model supported by coef() and confint.default()
#        (this file passes binomial glm fits).
# level: confidence level of the interval; defaults to 0.95.
# Returns a character matrix with one row per coefficient and three columns:
# "Odds ratio", the lower limit and the upper limit.
get_odds <- function(model, level = 0.95) {
    estimates <- cbind("Odds ratio" = coef(model),
                       confint.default(model, level = level))
    format(exp(estimates), scientific = FALSE)
}
# Odds ratios and 95% confidence limits for the all-rows primary model.
get_odds(primary_model_all)

In [None]:
# Box plot of the primary model's working residuals by year group.
asd_exclusions %>%
    filter(!secondary_school) %>%
    # keep exactly the complete cases glm() was fitted on: rows with an NA in
    # any variable of the model formula are the ones it dropped, so the
    # residual vector below lines up row for row
    drop_na(all_of(all.vars(formula(primary_model_all)))) %>%
    select(year_group) %>%
    mutate(residual = residuals(primary_model_all, type = "working")) %>%
    # NOTE(review): log() of a negative residual is NaN, so those rows are
    # not drawn - confirm that plotting log(residual) is intended.
    ggplot(aes(x = year_group, y = log(residual))) +
    geom_boxplot()

In [None]:
# Logistic regression of being excluded in a given year, fitted on
# primary-school rows (year groups below 7) that meet the missing-data
# threshold.
primary_model_sub <- glm(
    formula = excluded ~ diag_status + has_prev_excl + year_group_int +
        male + ethnicity,
    data = asd_exclusions %>% filter(!secondary_school & meets_missing_threshold),
    family = binomial(link = "logit")
)
summary(primary_model_sub)

In [None]:
# Odds ratios and 95% confidence limits for the threshold-restricted primary model.
get_odds(primary_model_sub)

In [None]:
# Logistic regression of being excluded in a given year, fitted on all
# secondary-school rows (year group 7 and above).
secondary_model_all <- glm(
    formula = excluded ~ diag_status + has_prev_excl + year_group_int +
        male + ethnicity,
    data = asd_exclusions %>% filter(secondary_school),
    family = binomial(link = "logit")
)
summary(secondary_model_all)

In [None]:
# Odds ratios and 95% confidence limits for the all-rows secondary model.
get_odds(secondary_model_all)

In [None]:
# Logistic regression of being excluded in a given year, fitted on
# secondary-school rows (year group 7 and above) that meet the missing-data
# threshold.
secondary_model_sub <- glm(
    formula = excluded ~ diag_status + has_prev_excl + year_group_int +
        male + ethnicity,
    data = asd_exclusions %>% filter(secondary_school & meets_missing_threshold),
    family = binomial(link = "logit")
)
summary(secondary_model_sub)

In [None]:
# Odds ratios and 95% confidence limits for the threshold-restricted secondary model.
get_odds(secondary_model_sub)