Summary: Experiments estimating the amount of autocorrelation in the multi-level/hierarchical models.

The weekly observations from each school represent a time series running from the start (Sep 2021) to the end (Apr 2022) of the study period. The models in all the multi-level analyses assume the observations are independent, which is not the case for time series. These experiments demonstrate the extent to which the repeated observations from each school are autocorrelated, an issue that resulted in these experiments being abandoned in favour of a simpler modelling strategy that can be found in /analysis/simplified_analysis.

In [None]:
library(tidyverse)
library(tidybayes)
library(bigrquery)
library(brms)
library(ggmcmc)
library(loo)
library(lme4)
library(lubridate)
library(gridExtra)
library(grid)
library(bayesplot)
library(marginaleffects)
library(table1)

# School identifiers for the two study arms.
# HEPA arm: H01 through H11.
hepa_school_codes <- sprintf("H%02d", 1:11)
# Control arm: C01 was removed due to a primary/secondary attendance issue, so
# the list starts at C02. C06-C08 are not part of the study list.
control_school_codes <- sprintf("C%02d", c(2:5, 9:14))
# All schools included in the analysis, HEPA first and then control.
study_schools <- c(hepa_school_codes, control_school_codes)

# Pull the attendance table from BigQuery.
project_id <- "yhcr-prd-phm-bia-core"
attendance_sql <- "SELECT * FROM `yhcr-prd-phm-bia-core.CY_CLASS_ACT.attendance`"
attendance_table <- bq_project_query(project_id, attendance_sql)

# Keep study schools, weeks starting on or before 2022-04-01, and rows with a
# positive pct_in_school (comma-separated conditions are combined with AND).
attendance <- bq_table_download(attendance_table) %>%
    filter(School_AnonID %in% study_schools,
           WeekStart <= as.Date("2022-04-01"),
           pct_in_school > 0)

# Blank out prop_absent_ill wherever Unk / (pupils * 14) is at least 1%.
# NOTE(review): presumably Unk counts sessions with an unknown attendance mark
# and 14 is the number of sessions per pupil in the period - confirm.
attendance_data_threshold <- attendance$Unk / (attendance$pupils * 14) >= 0.01
attendance[attendance_data_threshold, "prop_absent_ill"] <- NA

# Label each row with its study arm; anything in neither list is "None".
attendance <- attendance %>%
    mutate(arm = case_when(
        School_AnonID %in% hepa_school_codes ~ "HEPA",
        School_AnonID %in% control_school_codes ~ "Control",
        TRUE ~ "None"
    ))

# COVID-19 case rates used as covariates, downloaded from the UK coronavirus
# dashboard API and joined onto the attendance data by week.
# NOTE(review): both downloads need network access to
# api.coronavirus.data.gov.uk - confirm the endpoint is still being served.

# MSOA-level rolling case rate. WeekStart is taken as 5 days before the
# reported date so that it lines up with attendance$WeekStart.
msoa_rates_link <- "https://api.coronavirus.data.gov.uk/v2/data?areaType=msoa&areaCode=E08000032&metric=newCasesBySpecimenDateRollingRate&format=csv"
keep_cols <- c("msoa", "WeekStart", "covid_msoa_rate")
msoa_rates <- read_csv(msoa_rates_link) %>%
    mutate(WeekStart = date - 5) %>%
    rename(covid_msoa_rate = newCasesBySpecimenDateRollingRate, msoa = areaCode) %>%
    select(all_of(keep_cols))

# Local-authority rolling case rate for the 5-9 and 10-14 age bands, averaged
# across the two bands for each week.
cov_age_rate_link <- "https://api.coronavirus.data.gov.uk/v2/data?areaType=ltla&areaCode=E08000032&metric=newCasesBySpecimenDateAgeDemographics&format=csv&release=2022-04-29"
cov_age_rates <- read_csv(cov_age_rate_link) %>%
    filter(age %in% c("05_09", "10_14")) %>%
    # Keep Saturdays only. wday() with week_start = 1 numbers Monday as 1, so
    # Saturday is 6. Unlike weekdays(date) == "Saturday", this does not depend
    # on the session locale (a non-English locale would match no rows).
    filter(wday(date, week_start = 1) == 6) %>%
    mutate(WeekStart = date - 5) %>%
    group_by(WeekStart) %>%
    summarise(covid_age_rate = mean(rollingRate))

# Attach both rates: the MSOA rate by week and msoa, the age rate by week only.
attendance <- attendance %>%
    left_join(msoa_rates, by = c("WeekStart", "msoa")) %>%
    left_join(cov_age_rates, by = c("WeekStart"))

# Weekly mean CO2 per school from BigQuery, with its key columns renamed to
# match the attendance data so the two can be joined directly.
co2_sql <- "SELECT School_ID, week_start, co2_mean FROM `yhcr-prd-phm-bia-core.CY_CLASS_ACT.stats_week_school`"
co2_table <- bq_project_query(project_id, co2_sql)
co2 <- bq_table_download(co2_table) %>%
    rename(WeekStart = week_start, School_AnonID = School_ID)

# Add co2_mean to each school-week; rows without a CO2 reading get NA.
attendance <- left_join(attendance, co2, by = c("WeekStart", "School_AnonID"))

# Per-week outlier cut-off for illness absence: upper quartile plus three times
# the interquartile range, computed across all schools in that week.
thresholds <- attendance %>%
    group_by(WeekStart) %>%
    summarise(outlier_threshold = quantile(prop_absent_ill, probs = 0.75, na.rm = TRUE) +
                  3 * IQR(prop_absent_ill, na.rm = TRUE))

# Add the thresholds plus max-scaled and log-transformed versions of the main
# variables. Every max() uses na.rm = TRUE: the joined covariates can be NA for
# unmatched weeks, and a single NA would otherwise make the whole scaled column
# NA.
attendance <- attendance %>%
    left_join(thresholds, by = "WeekStart") %>%
    mutate(covid_age_rate_scaled = covid_age_rate / max(covid_age_rate, na.rm = TRUE),
           covid_msoa_rate_scaled = covid_msoa_rate / max(covid_msoa_rate, na.rm = TRUE),
           prop_absent_ill_scaled = prop_absent_ill / max(prop_absent_ill, na.rm = TRUE),
           log_prop_absent_ill = log(prop_absent_ill),
           co2_mean_scaled = co2_mean / max(co2_mean, na.rm = TRUE),
           # NA when prop_absent_ill is NA, so filter(!is_outlier) also drops
           # rows with a missing outcome.
           is_outlier = prop_absent_ill > outlier_threshold)

In [None]:
# Line plot of weekly illness absence for five randomly chosen schools.
options(repr.plot.res = 300, repr.plot.height = 5, repr.plot.width = 10)
School_IDs <- unique(attendance$School_AnonID)

attendance %>%
    filter(School_AnonID %in% sample(School_IDs, 5)) %>%
    ggplot(aes(x = WeekStart,
               y = prop_absent_ill,
               group = School_AnonID,
               color = School_AnonID)) +
    geom_line()

In [None]:
# Weekly distribution of illness absence by study arm, outliers excluded.
# One box is drawn per (week, arm) combination.
attendance %>%
    filter(!is_outlier) %>%
    ggplot(aes(x = WeekStart,
               y = prop_absent_ill,
               group = interaction(WeekStart, arm),
               color = arm)) +
    geom_boxplot() +
    theme_classic()

In [None]:
# Baseline multi-level model: illness absence by study arm, with a random
# intercept per school. Gamma family with its default (inverse) link, fitted to
# non-outlier rows only.
basic_model <- glmer(
    prop_absent_ill ~ 1 + arm + (1 | School_AnonID),
    data = attendance %>% filter(!is_outlier),
    family = Gamma(link = "inverse")
)
summary(basic_model)

In [None]:
# Pearson residuals against fitted values for the baseline model, with a red
# horizontal reference line at zero.
options(repr.plot.res = 300, repr.plot.height = 5, repr.plot.width = 5)
plot(fitted(basic_model), resid(basic_model, type = "pearson"))
abline(h = 0, col = "red")

In [None]:
# Normal Q-Q plot of the baseline model's residuals.
options(repr.plot.res = 300, repr.plot.height = 5, repr.plot.width = 5)
qqnorm(residuals(basic_model))
# Reference line the points would follow if the residuals were normal.
qqline(residuals(basic_model), col = "red")

In [None]:
# Baseline-model residuals plotted against week. The filter and NA drop below
# mirror the rows the model was fitted on, so the residual vector lines up
# row-for-row with the data.
attendance %>%
    filter(!is_outlier) %>%
    select(School_AnonID, WeekStart, arm, prop_absent_ill) %>%
    drop_na() %>%
    mutate(resid = residuals(basic_model)) %>%
    ggplot(aes(x = WeekStart, y = resid)) +
    geom_point()
    

In [None]:
# Annual cosine terms derived from day-of-year. Each is a cosine with a 365-day
# period; the day offsets (0, +91, +182, -91) shift the phase.
attendance <- attendance %>%
    mutate(
        day = yday(WeekStart),
        winter_period = cos(day / 365 * 2 * pi),
        autumn_period = cos((day + 91) / 365 * 2 * pi),
        summer_period = cos((day + 182) / 365 * 2 * pi),
        spring_period = cos((day - 91) / 365 * 2 * pi)
    )

In [None]:
# Seasonal model: study arm interacted with the winter and spring cosine terms,
# with a random intercept per school. Gamma family with a log link, fitted to
# all rows of attendance (outliers included).
period_model <- glmer(
    prop_absent_ill ~ 1 + arm + arm*winter_period + arm*spring_period +
        (1 | School_AnonID),
    data = attendance,
    family = Gamma(link = "log")
)
summary(period_model)

In [None]:
# Normal Q-Q plot of the seasonal model's residuals.
options(repr.plot.res = 300, repr.plot.height = 5, repr.plot.width = 5)
qqnorm(residuals(period_model))
# Reference line the points would follow if the residuals were normal.
qqline(residuals(period_model), col = "red")

In [None]:
# Compare residuals over time for the baseline (red) and seasonal (blue) models.
#
# The two models were fitted on different rows: basic_model on non-outlier rows,
# period_model on all rows. Each residual vector is therefore paired with the
# rows its own model used and the two sets are stacked. Attaching both to one
# shared data frame fails with a length mismatch as soon as any outlier exists.
bind_rows(
    attendance %>%
        filter(!is_outlier) %>%
        select(School_AnonID, WeekStart, arm, prop_absent_ill) %>%
        na.omit() %>%
        mutate(model = "basic", resid = resid(basic_model)),
    attendance %>%
        select(School_AnonID, WeekStart, arm, prop_absent_ill) %>%
        na.omit() %>%
        mutate(model = "period", resid = resid(period_model))
) %>%
    ggplot(aes(x = WeekStart, y = resid, color = model)) +
    geom_point() +
    geom_smooth() +
    scale_color_manual(values = c(basic = "red", period = "blue"))
    

In [None]:
# Scratch check: day-of-year number for 1 April 2021.
# NOTE(review): presumably a sanity check for the day offsets used in the
# seasonal cosine terms - confirm or remove.
yday("2021-04-01")

In [None]:
# Absolute distance, in days, from each observation week to three reference
# dates, then the largest distance to 20 Sep 2021.
# The reference dates are built with as.Date() so both difftime() arguments are
# Dates. A bare string is converted in the session's local time zone while a
# Date is treated as UTC, which can shift the result by a fraction of a day.
attendance %>%
    mutate(days_to_dec_31 = abs(difftime(WeekStart, as.Date("2021-12-31"), units = "days")),
           days_to_sep_20 = abs(difftime(WeekStart, as.Date("2021-09-20"), units = "days")),
           days_to_apr_01 = abs(difftime(WeekStart, as.Date("2022-04-01"), units = "days"))) %>%
    select(WeekStart, days_to_dec_31, days_to_sep_20) %>%
    summarise(max(days_to_sep_20))