In [None]:
library(bigrquery)
library(tidyverse)
library(ggvenn)
library(lubridate)

In [None]:
# Pull every source table from BigQuery into a local data frame.
# Each step builds a query job against a dataset and then downloads its result.
asd_dataset <- "yhcr-prd-phm-bia-core.CY_EM_ASD"
dfe_dataset <- "yhcr-prd-phm-bia-core.CY_FDM_DepartmentForEducation"

# ASD diagnosis records (all columns)
asd_query <- bq_dataset_query(x = asd_dataset, 
                              query = "SELECT * FROM ASD_data")
asd_data <- asd_query %>% bq_table_download()

# person id and birth datetime from the education dataset
school_person_query <- bq_dataset_query(
    x = dfe_dataset, 
    query = "SELECT person_id, birth_datetime FROM person"
)
school_person_data <- school_person_query %>% bq_table_download()

# cleaned exclusion records (all columns)
exclusion_query <- bq_dataset_query(x = dfe_dataset, 
                                    query = "SELECT * FROM exclusions_cleaned")
exclusion_data <- exclusion_query %>% bq_table_download()

# school census: year group, age and SEN columns only
school_census_query <- bq_dataset_query(
    x = dfe_dataset, 
    query = "SELECT person_id, AcademicYear, CensusDate, CensusTerm, NCyearActual, 
                NCyearLeaving, SENprovision, AgeAtStartOfAcademicYear, 
                SENprovisionMajor, PrimarySENtype, SecondarySENtype, 
                SENUnitIndicator 
             FROM src_census"
)
school_census_data <- school_census_query %>% bq_table_download()

In [None]:
# Fill in missing national-curriculum year groups from the pupil's age:
#   - a recorded NCyearActual is always kept as-is
#   - age 4 at the start of the academic year -> Reception ("R")
#   - ages 5 to 15 -> Years 1 to 11 (age minus 4)
# Rows matching none of these rules are left as NA.
school_census_data <- mutate(
    school_census_data,
    NCyearActualFilled = case_when(
        !is.na(NCyearActual) ~ NCyearActual,
        AgeAtStartOfAcademicYear == 4 ~ "R",
        between(as.numeric(AgeAtStartOfAcademicYear), 5, 15) ~
            as.character(as.numeric(AgeAtStartOfAcademicYear) - 4)
    )
)

# The main year groups (Reception to Year 11) as an ordered factor
main_year_groups <- ordered(
    c("R", 1:11),
    levels = c("R", 1:11)
)

# Keep only census rows in a main year group and add an ordered-factor
# version of the filled year group.
myg_school_census_data <- school_census_data %>%
    filter(NCyearActualFilled %in% levels(main_year_groups)) %>%
    mutate(
        NCyearActualFilledFactor = factor(
            NCyearActualFilled,
            levels = levels(main_year_groups),
            ordered = TRUE
        )
    )

In [None]:
# Restrict the ASD and exclusion data to people who appear in the
# main-year-group census data.

# one row per person present in the census data
myg_census_ids <- myg_school_census_data %>%
    select(person_id) %>%
    unique()

# The join key is stated explicitly: myg_census_ids only has person_id, so this
# matches the previous implicit natural join but no longer depends on (or
# prints a message about) whichever column names happen to be shared.
myg_asd_data <- asd_data %>%
    inner_join(myg_census_ids, by = "person_id")

# de-duplicated, consistent with myg_census_ids and myg_excl_ids
myg_asd_ids <- myg_asd_data %>%
    select(person_id) %>%
    unique()

myg_exclusion_data <- exclusion_data %>%
    inner_join(myg_census_ids, by = "person_id")

myg_excl_ids <- myg_exclusion_data %>%
    select(person_id) %>%
    unique()

In [None]:
# Collapse the exclusion records to one row per person and academic year:
# the first StartDate after sorting, and the number of exclusion rows.
myg_excl_acYears <- myg_exclusion_data %>%
    group_by(person_id, AcademicYear) %>%
    # sort by the grouping columns first, then by start date within each group
    arrange(StartDate, .by_group = TRUE) %>%
    summarise(
        firstExclAcYear = first(StartDate),
        exclRowsThisAcYear = n()
    ) %>%
    ungroup()

In [None]:
# One row per person, flagged for ever appearing in the ASD data and ever
# appearing in the exclusion data.
myg_all_people <- mutate(
    myg_census_ids,
    asdDiagEver = person_id %in% myg_asd_ids$person_id,
    exclEver = person_id %in% myg_excl_ids$person_id
)

# Expand to one row per distinct person / academic year / year group / age
# combination, then attach the ASD data and the per-year exclusion summary.
# None of the joins names its key, so each one matches on whichever columns
# the two tables have in common.
myg_all_people_censuses <- myg_all_people %>%
    left_join(
        myg_school_census_data %>%
            select(person_id, AcademicYear,
                   NCyearActualFilled, NCyearActualFilledFactor,
                   AgeAtStartOfAcademicYear)
    ) %>%
    unique() %>%
    left_join(myg_asd_data) %>%
    left_join(myg_excl_acYears) %>%
    # identifying columns first, everything else after
    select(
        person_id,
        AcademicYear,
        NCyearActualFilled,
        NCyearActualFilledFactor,
        AgeAtStartOfAcademicYear,
        everything()
    ) %>%
    arrange(person_id, AcademicYear)

In [None]:
# Lookup table giving each academic year a start date (1 September of the
# first calendar year) and an end date (31 July of the second calendar year).
# The calendar years are taken as the first four and the last four characters
# of AcademicYear.
# NOTE(review): this only works if AcademicYear ends in a four-digit year
# (e.g. "2012/2013", not "2012/13") - confirm against the data.
startEndAcYears <- myg_all_people_censuses %>%
    select(AcademicYear) %>%
    unique() %>%
    mutate(
        calYearA = str_sub(AcademicYear, start = 1, end = 4),
        calYearB = str_sub(AcademicYear, start = -4, end = -1)
    ) %>%
    mutate(
        startDateAcYr = str_c(calYearA, "-09-01"),
        endDateAcYr = str_c(calYearB, "-07-31")
    ) %>%
    mutate(
        startDateAcYear = as.Date(startDateAcYr),
        endDateAcYear = as.Date(endDateAcYr)
    )

In [None]:
# make a summary table for use in the regressions
#
# One row per person and academic year with:
#   - diagnosis-timing flags (NA for people who never have an ASD diagnosis)
#   - whether the person was excluded in that academic year
#   - how many academic years the person spent in the same year group
#
# The result is assigned to `main_data`, which the following cells use. The
# pipeline is one continuous chain: a chain cannot be resumed from a line that
# starts with %>%, so the column selection has to be part of it.
#
# NOTE(review): diagnosis_date comes from the ASD data and is compared with
# Date columns below - assumes it is a Date (not a datetime); confirm.
main_data <- myg_all_people_censuses %>% 
    # attach the start and end date of each academic year
    left_join(select(startEndAcYears, 
                     AcademicYear, 
                     startDateAcYear, 
                     endDateAcYear), 
              by = "AcademicYear") %>% 
    mutate( 
        # was the diagnosis made between the start and end of this academic year?
        diagHappenedThisYear = if_else(
            asdDiagEver, 
            diagnosis_date %within% interval(startDateAcYear, endDateAcYear), 
            NA
        ), 
        # was the diagnosis made before the end of this academic year?
        hasDiagAtYearEnd = if_else(
            asdDiagEver, 
            diagnosis_date < endDateAcYear, 
            NA
        ),
        # a first exclusion date only exists if there was an exclusion this year
        excludedThisYear = !is.na(firstExclAcYear),
        # was the diagnosis made before this year's first exclusion?
        # (NA when there was no exclusion this year)
        hasDiagAtFirstExclAcYear = if_else(
            asdDiagEver,
            diagnosis_date < firstExclAcYear, 
            NA)
    ) %>%
    # per person and year group: which, and how many, academic years were
    # spent in that year group
    group_by(person_id, NCyearActualFilled) %>%
    mutate(
        academicYearsThisNCyearActual = paste(AcademicYear, collapse = ", "),
        nAcYearsThisNCyearActual = length(unique(AcademicYear))
    ) %>%
    select(person_id, NCyearActualFilled, NCyearActualFilledFactor, AcademicYear, 
           excludedThisYear, diagHappenedThisYear, hasDiagAtYearEnd, 
           hasDiagAtFirstExclAcYear, firstExclAcYear, diagnosis_date, exclEver, 
           asdDiagEver, academicYearsThisNCyearActual, nAcYearsThisNCyearActual) %>%
    ungroup()

# Inspection only: rows whose year group has more than one census row for the
# person (the collapsed list of years then differs from the row's own year).
# Kept out of main_data so that the table is not silently reduced to these rows.
main_data %>%
    filter(AcademicYear != academicYearsThisNCyearActual)

In [None]:
# scratch check of how a grouped mutate() combines sum() and paste()
data.frame(a=c("a", "b", "c", "a", "b", "a", "a", "b", "c", "d"), 
           b=11:20) %>%
    group_by(a) %>%
    mutate(a_sum = sum(b),
           blah = paste(c(b, a), collapse = "-", sep="-"))

In [None]:
# Assign every person-year to a diagnosis group:
#   asdNever   - the person never has an ASD diagnosis
#   asdCurrent - diagnosed before the first exclusion of the year (if excluded)
#                or before the end of the year (if not excluded)
#   asdFuture  - has a diagnosis, but not by that reference point
# The first branch takes every row without a diagnosis, so the remaining
# branches only ever apply to rows where asdDiagEver is TRUE.
main_data_with_diagGroups <- main_data %>%
    mutate(
        diagGroup = case_when(
            !asdDiagEver ~ "asdNever",
            excludedThisYear & hasDiagAtFirstExclAcYear ~ "asdCurrent",
            excludedThisYear & !hasDiagAtFirstExclAcYear ~ "asdFuture",
            !excludedThisYear & hasDiagAtYearEnd ~ "asdCurrent",
            !excludedThisYear & !hasDiagAtYearEnd ~ "asdFuture"
        )
    ) %>%
    mutate(
        diagGroup = factor(
            diagGroup,
            levels = c("asdNever", "asdCurrent", "asdFuture"),
            ordered = TRUE
        )
    )

In [None]:
# Summary for plotting: exclusion counts and rates for every combination of
# year group and diagnosis group.
forPlotting <- main_data_with_diagGroups %>%
    # leave out anyone recorded in the same year group for several years
    filter(nAcYearsThisNCyearActual == 1) %>%
    # one summary row per year group and diagnosis group
    group_by(NCyearActualFilledFactor, diagGroup) %>%
    summarise(
        # size of this diagnosis group in this year group
        nDiagGroupThisYear = n(),
        # how many of them were excluded while in this year group
        nDiagGroupExclThisYear = sum(excludedThisYear),
        # how many of them were not excluded while in this year group
        nDiagGroupNotExclThisYear = nDiagGroupThisYear - nDiagGroupExclThisYear,
        # excluded share of the group, as a proportion ...
        propDiagGroupExclThisYear = nDiagGroupExclThisYear / nDiagGroupThisYear,
        # ... and as a percentage
        percDiagGroupExclThisYear = propDiagGroupExclThisYear * 100
    )

In [None]:
# prepare plot components:
my_ylab <- "Percentage with at least one exclusion this year"
my_xlab_diag <- "Diagnosis Group"
my_xlab_year <- "Year Group"
# The long labels are built from pieces with paste0() so that the only line
# breaks in the rendered text are the explicit "\n" between the three
# subtitle definitions (a string literal spanning several source lines would
# put a line break into the label at every wrap point).
my_subtitle <- paste0(
    " asdNever = never diagnosed with Autism Spectrum Disorder (ASD), ",
    "not necessarily 'neurotypical';\n",
    " asdCurrent = has diagnosis of ASD at first exclusion of year (if excluded) ",
    "or end of year (if not excluded);\n",
    " asdFuture = doesn't have diagnosis at first exclusion of year ",
    "or end of year, but later received diagnosis"
)
my_title_all_ps <- paste0(
    "Percentage of each Diagnosis Group who were excluded at least once ",
    "(for each Year Group) (unfiltered)"
)
my_title_not_all_ps <- paste0(
    "Percentage of each Diagnosis Group who were excluded at least once ",
    "(for each Year Group) (filtered)"
)
# plot the percentage of each diagnosis group who were excluded while being in
# each year group
# forPlotting has already REMOVED people who, according to the census, were in
# the same year group, e.g. Year 2, for multiple academic years,
# e.g. 2012/13, 2013/14
ggplot(forPlotting, aes(x = NCyearActualFilledFactor, 
                        y = percDiagGroupExclThisYear, 
                        colour = diagGroup, 
                        group = diagGroup)) +
    geom_point() +
    geom_line() +
    theme_bw() +
    ylab(my_ylab) +
    xlab(my_xlab_year) +
    ggtitle(my_title_not_all_ps) +
    labs(subtitle = my_subtitle) +
    scale_x_discrete(labels = c("R" = "Reception", "1" = "Year 1", "2" = "Year 2",
                                "3" = "Year 3", "4" = "Year 4", "5" = "Year 5",
                                "6" = "Year 6", "7" = "Year 7", "8" = "Year 8",
                                "9" = "Year 9", "10" = "Year 10", "11" = "Year 11")) +
    # colours are named so each group keeps its colour even if a group is
    # missing from the data
    scale_colour_manual(values = c("asdNever" = "grey", 
                                   "asdCurrent" = "blue", 
                                   "asdFuture" = "red"))

In [None]:
# Logistic regression of exclusion on diagnosis group, fitted separately for
# each year group, using only people with an ASD diagnosis who were in the
# year group for a single academic year.
#
# Outputs:
#   RESULTSmodels     - list with one fitted glm per year group
#   RESULTSoddsRatios - one row per model coefficient, with columns
#                       factor, yearGroup, coef, pval, oddsRatio (numeric)

# set the year group names (same order as main_year_groups)
yearGrpNames <- c("Reception", "Year 1", "Year 2", 
                  "Year 3", "Year 4", "Year 5", 
                  "Year 6", "Year 7", "Year 8", 
                  "Year 9", "Year 10", "Year 11")
# one model and one odds-ratio table per year group
nYearGrps <- length(main_year_groups)
RESULTSmodels <- vector("list", nYearGrps)
oddsRatiosByYear <- vector("list", nYearGrps)
# for each year group
for (count in seq_along(main_year_groups)) {
    yearGroup <- as.character(main_year_groups[[count]])
    # get the data for just this year group
    yearGroupData <- filter(main_data_with_diagGroups, 
                            # only keep the data from the current year group 
                            NCyearActualFilled == yearGroup, 
                            # only keep people who have an ASD diagnosis 
                            diagGroup != "asdNever", 
                            # only keep people who were in this year group for just 1 year 
                            nAcYearsThisNCyearActual == 1 
                           ) %>%
        # diagGroup is an ordered factor, for which glm() would use polynomial
        # contrasts; exp(coef) would then not be an odds ratio between groups.
        # An unordered factor gives treatment contrasts with asdCurrent as
        # the reference level.
        mutate(diagGroup = factor(diagGroup, 
                                  levels = c("asdCurrent", "asdFuture"), 
                                  ordered = FALSE))
    # this is the logistic regression
    modelThisYear <- glm(excludedThisYear ~ diagGroup, 
                         family = binomial(link = "logit"), 
                         data = yearGroupData)
    # display the year group
    print(str_c("----------", yearGrpNames[count], "----------"))
    # display the results
    print(summary(modelThisYear))
    # add the results to a list
    RESULTSmodels[[count]] <- modelThisYear
    # Take estimates and p-values from the same coefficient table so the rows
    # always line up. Building a data frame directly keeps the numbers numeric;
    # cbind() with the year-group name would turn them all into text.
    coefTable <- summary(modelThisYear)$coefficients
    oddsRatiosThisYear <- data.frame(
        factor = rownames(coefTable), 
        yearGroup = yearGrpNames[[count]], 
        coef = coefTable[, "Estimate"], 
        pval = coefTable[, "Pr(>|z|)"], 
        # for diagGroupasdFuture this is the odds ratio of asdFuture versus
        # asdCurrent; for (Intercept) it is the odds in the asdCurrent group
        oddsRatio = exp(coefTable[, "Estimate"]), 
        row.names = NULL, 
        stringsAsFactors = FALSE
    )
    oddsRatiosByYear[[count]] <- oddsRatiosThisYear
}
# stack the per-year tables once, after the loop
RESULTSoddsRatios <- do.call(rbind, oddsRatiosByYear)

In [None]:
# Inspection: person-years of people with an ASD diagnosis who were ever excluded
filter(main_data_with_diagGroups, diagGroup != "asdNever", exclEver)


In [None]:
# Same per-year-group logistic regressions as the filtered version, but
# WITHOUT dropping people who were in the same year group for more than one
# academic year. Uses yearGrpNames from the filtered-regression cell.
#
# Outputs:
#   RESULTSmodelsUnfiltered     - list with one fitted glm per year group
#   RESULTSoddsRatiosUnfiltered - one row per model coefficient, with columns
#                                 factor, yearGroup, coef, pval, oddsRatio
#                                 (numeric)

# one model and one odds-ratio table per year group
nYearGrps <- length(main_year_groups)
RESULTSmodelsUnfiltered <- vector("list", nYearGrps)
oddsRatiosByYearUnfiltered <- vector("list", nYearGrps)
# for each year group
for (countUnfiltered in seq_along(main_year_groups)) {
    yearGroup <- as.character(main_year_groups[[countUnfiltered]])
    # get the data for just this year group
    yearGroupDataUnfiltered <- filter(main_data_with_diagGroups,
                                      # only keep the data from the current year group
                                      NCyearActualFilled == yearGroup,
                                      # only keep people who have an ASD diagnosis
                                      diagGroup != "asdNever"
                                      # (no filter on nAcYearsThisNCyearActual here)
                                     ) %>%
        # diagGroup is an ordered factor, for which glm() would use polynomial
        # contrasts; exp(coef) would then not be an odds ratio between groups.
        # An unordered factor gives treatment contrasts with asdCurrent as
        # the reference level.
        mutate(diagGroup = factor(diagGroup,
                                  levels = c("asdCurrent", "asdFuture"),
                                  ordered = FALSE))
    # this is the logistic regression
    modelThisYearUnfiltered <- glm(excludedThisYear ~ diagGroup,
                                   family = binomial(link = "logit"),
                                   data = yearGroupDataUnfiltered)
    # display the year group
    print(str_c("----------", yearGrpNames[countUnfiltered], "----------"))
    # display the results
    print(summary(modelThisYearUnfiltered))
    # add the results to a list
    RESULTSmodelsUnfiltered[[countUnfiltered]] <- modelThisYearUnfiltered
    # Take estimates and p-values from the same coefficient table so the rows
    # always line up. Building a data frame directly keeps the numbers numeric;
    # cbind() with the year-group name would turn them all into text.
    coefTableUnfiltered <- summary(modelThisYearUnfiltered)$coefficients
    oddsRatiosThisYearUnfiltered <- data.frame(
        factor = rownames(coefTableUnfiltered),
        yearGroup = yearGrpNames[[countUnfiltered]],
        coef = coefTableUnfiltered[, "Estimate"],
        pval = coefTableUnfiltered[, "Pr(>|z|)"],
        # for diagGroupasdFuture this is the odds ratio of asdFuture versus
        # asdCurrent; for (Intercept) it is the odds in the asdCurrent group
        oddsRatio = exp(coefTableUnfiltered[, "Estimate"]),
        row.names = NULL,
        stringsAsFactors = FALSE
    )
    oddsRatiosByYearUnfiltered[[countUnfiltered]] <- oddsRatiosThisYearUnfiltered
}
# stack the per-year tables once, after the loop
RESULTSoddsRatiosUnfiltered <- do.call(rbind, oddsRatiosByYearUnfiltered)

In [None]:
# Inspection: person-years where someone with an ASD diagnosis was excluded
filter(main_data_with_diagGroups, asdDiagEver, excludedThisYear)