In [None]:
library(bigrquery)
library(ggplot2)
library(lubridate)
library(tidyverse)
library(marginaleffects)

In [None]:
project_id <- "yhcr-prd-phm-bia-core"
attendance_sql <- "SELECT * FROM `yhcr-prd-phm-bia-core.CB_CLASS_ACT.attendance`"
attendance_table <- bq_project_query(project_id, attendance_sql)

# Study window.
# NOTE(review): only end_date is applied in the filters below; start_date is
# not used in this pipeline - confirm whether a lower bound on WeekStart is
# intended.
start_date <- as.Date("2021-09-01")
end_date <- as.Date("2022-04-01")

# Anonymised school IDs belonging to each study arm
hepa_school_codes <- c("H01", "H02", "H03", "H04", "H05", "H06", "H07", "H08", 
                       "H09", "H10", "H11")
control_school_codes <- c("C01", "C02", "C03", "C04", "C05", "C09", "C10", 
                          "C11", "C12", "C13", "C14")
study_schools <- c(hepa_school_codes, control_school_codes)

# load in schools data for time-period of study - drop closures
attendance <- bq_table_download(attendance_table) %>%
    filter(School_AnonID %in% study_schools) %>% 
    # presumably pct_in_school == 0 marks a closure week - verify against table
    filter(pct_in_school > 0) %>%
    # keep rows where Unk is below 1% of pupils * 14
    # NOTE(review): the meaning of the constant 14 is not shown here - confirm
    filter(Unk / (pupils * 14) < 0.01) %>%
    filter(WeekStart < end_date) %>%
    mutate(arm = case_when(School_AnonID %in% hepa_school_codes ~ "HEPA", 
                           School_AnonID %in% control_school_codes ~ "Control"),
           # ordered by position in the school year, not by calendar month
           mth = factor(month.abb[month(WeekStart)], 
                          levels=c("Sep", "Oct", "Nov", "Dec", "Jan", "Feb", "Mar"), 
                          ordered=TRUE),
           illness_rate = prop_absent_ill * 100) %>%
    # Outlier threshold is computed within each week and arm:
    # group mean + 3.25 * group IQR of prop_absent_ill.
    # na.rm=TRUE is used for both statistics so that a single missing value
    # does not turn the whole group's threshold into NA.
    group_by(WeekStart, arm) %>% 
    mutate(outlier_threshold = mean(prop_absent_ill, na.rm=TRUE) + 
               3.25*(IQR(prop_absent_ill, na.rm=TRUE)),
           is_outlier = prop_absent_ill > outlier_threshold) %>%
    # drop the week/arm grouping so later steps start from an ungrouped table
    ungroup()

# Weekly per-school CO2 summary. The key columns are renamed so that they
# carry the same names as the attendance columns (WeekStart, School_AnonID).
co2_sql <- "SELECT School_ID, week_start, co2_mean FROM `yhcr-prd-phm-bia-core.CB_CLASS_ACT.stats_week_school_21_09_22`"
co2_table <- bq_project_query(project_id, co2_sql)
co2 <- rename(
    bq_table_download(co2_table),
    School_AnonID = School_ID,
    WeekStart = week_start
)

# MSOA-level rolling case rates, read from a local CSV snapshot.
# msoa_rates_link records the API query for the data; it is not referenced
# again in this section. WeekStart is set to 5 days before the reported date.
msoa_rates_link <- "https://api.coronavirus.data.gov.uk/v2/data?areaType=msoa&areaCode=E08000032&metric=newCasesBySpecimenDateRollingRate&format=csv"
msoa_rates_csv <- "msoa_2023-01-19.csv"
keep_cols <- c("msoa", "WeekStart", "covid_msoa_rate")
msoa_rates <- read_csv(msoa_rates_csv) %>%
    rename(msoa = areaCode,
           covid_msoa_rate = newCasesBySpecimenDateRollingRate) %>%
    mutate(WeekStart = date - 5) %>%
    select(all_of(keep_cols))

# Local-authority case rates by age band, read from a local CSV snapshot
# (cov_age_rate_link records the API query the file corresponds to).
cov_age_rate_link <- "https://api.coronavirus.data.gov.uk/v2/data?areaType=ltla&areaCode=E08000032&metric=newCasesBySpecimenDateAgeDemographics&format=csv&release=2022-04-29"
cov_age_rate_csv <- "ltla_2022-04-29.csv"
cov_age_rates <- read_csv(cov_age_rate_csv) %>%
    # keep the 5-9 and 10-14 age bands only
    filter(age %in% c("05_09", "10_14")) %>%
    # Keep Saturdays only. With week_start = 1 (Monday) Saturday is day 6;
    # this avoids comparing against a locale-dependent weekday name.
    filter(lubridate::wday(date, week_start = 1) == 6) %>%
    # Saturday - 5 days = the Monday of the same week
    mutate(WeekStart = date - 5) %>%
    group_by(WeekStart) %>%
    # average the rolling rate across the two age bands for each week
    summarise(covid_age_rate = mean(rollingRate))

In [None]:
# Inspect school-weeks where more than 15% of pupils were absent through illness
filter(attendance, prop_absent_ill > 0.15)

In [None]:
options(repr.plot.width = 9, repr.plot.height = 4.5, repr.plot.res = 200)
# Distribution of school illness rates: one box per week and study arm
ggplot(
    attendance,
    aes(x = WeekStart,
        y = illness_rate,
        colour = arm,
        group = interaction(WeekStart, arm))
) +
    geom_boxplot()

In [None]:
options(repr.plot.width = 8, repr.plot.height = 5, repr.plot.res = 200)
# Greyscale palette for the black-and-white variant of the figure
arm_colors <- c(HEPA = "#808080", Control = "#000000")
# Aggregate to one row per school/month/arm/msoa, then draw the monthly
# distribution of the overall illness rate for each arm.
bw_ratios_plot <- attendance %>%
    left_join(msoa_rates) %>%
    left_join(co2) %>%
    group_by(School_AnonID, mth, arm, msoa) %>%
    summarise(
        total_in_school = sum(in_school),
        total_ill = sum(ill),
        overall_illness_rate = total_ill / total_in_school * 100,
        mean_weekly_illness_ratio = mean(prop_absent_ill),
        mean_msoa_rate = mean(covid_msoa_rate, na.rm = TRUE),
        mean_co2 = mean(co2_mean, na.rm = TRUE)
    ) %>%
    ungroup() %>%
    ggplot(aes(x = mth, y = overall_illness_rate, colour = arm)) +
    geom_boxplot() +
    labs(x = "Date", y = "Illness Rate") +
    scale_colour_manual(values = arm_colors, breaks = c("Control", "HEPA")) +
    theme_classic() +
    theme(
        legend.position = "bottom",
        legend.direction = "horizontal",
        legend.box = "horizontal"
    )
bw_ratios_plot

In [None]:
# Write the black-and-white monthly figure to disk
ggsave(
    filename = "plots/illness_rates_by_month_bw.jpg",
    plot = bw_ratios_plot,
    width = 8,
    height = 4
)

In [None]:
options(repr.plot.width = 8, repr.plot.height = 5, repr.plot.res = 200)
# Colour palette for the colour variant of the figure
arm_colors <- c(HEPA = "#ff6633", Control = "#336699")
# Aggregate to one row per school/month/arm/msoa, then draw the monthly
# distribution of the overall illness rate for each arm.
colour_ratios_plot <- attendance %>%
    left_join(msoa_rates) %>%
    left_join(co2) %>%
    group_by(School_AnonID, mth, arm, msoa) %>%
    summarise(
        total_in_school = sum(in_school),
        total_ill = sum(ill),
        overall_illness_rate = total_ill / total_in_school * 100,
        mean_weekly_illness_ratio = mean(prop_absent_ill),
        mean_msoa_rate = mean(covid_msoa_rate, na.rm = TRUE),
        mean_co2 = mean(co2_mean, na.rm = TRUE)
    ) %>%
    ungroup() %>%
    ggplot(aes(x = mth, y = overall_illness_rate, colour = arm)) +
    geom_boxplot() +
    labs(x = "Date", y = "Illness Rate") +
    scale_colour_manual(values = arm_colors, breaks = c("Control", "HEPA")) +
    theme_classic() +
    theme(
        legend.position = "bottom",
        legend.direction = "horizontal",
        legend.box = "horizontal"
    )
colour_ratios_plot

In [None]:
# Write the colour monthly figure to disk
ggsave(
    filename = "plots/illness_rates_by_month_colour.jpg",
    plot = colour_ratios_plot,
    width = 8,
    height = 4
)

In [None]:
options(repr.plot.width = 5, repr.plot.height = 5, repr.plot.res = 200)
# Aggregate the weekly rows to one row per school/arm/msoa over the whole
# period; this table is the input to the models fitted below.
agg_data <- attendance %>% 
    left_join(msoa_rates) %>%
    left_join(co2) %>%
    group_by(School_AnonID, arm, msoa) %>% 
    summarise(
        # Must be computed BEFORE illness_rate is redefined below: summarise()
        # lets later expressions see earlier results, so once the aggregate
        # illness_rate exists, mean(illness_rate) would just return it.
        mean_weekly_illness_ratio = mean(illness_rate),
        total_in_school = sum(in_school),
        total_ill = sum(ill),
        # pooled rate (%) over the period: total ill / total in school
        illness_rate = total_ill/total_in_school*100,
        mean_positive_tests = mean(covid_msoa_rate, na.rm=TRUE),
        mean_co2 = mean(co2_mean, na.rm=TRUE),
        # return an ungrouped table for modelling
        .groups = "drop") %>%
    # restore the original column position
    relocate(mean_weekly_illness_ratio, .after = illness_rate)
agg_data %>% 
    ggplot() +
    geom_boxplot(aes(x=arm, y=illness_rate, color=arm))

In [None]:
# Gamma GLM (log link) of the aggregated illness rate on study arm, adjusted
# for mean CO2 and mean local positive-test rate
all_model <- glm(
    formula = illness_rate ~ 1 + arm + mean_co2 + mean_positive_tests,
    data = agg_data,
    family = Gamma(link = "log")
)
summary(all_model)

In [None]:
# Unadjusted Gamma GLM (log link): aggregated illness rate on study arm only
arm_model <- glm(
    formula = illness_rate ~ 1 + arm,
    data = agg_data,
    family = Gamma(link = "log")
)
summary(arm_model)

In [None]:
options(repr.plot.width = 7.5, repr.plot.height = 5, repr.plot.res = 200)
# Attach each row's Cook's distance from the arm-only model.
agg_data$cooks_d <- cooks.distance(arm_model)
# Bar chart of Cook's distance per school, with school H07 picked out in red.
cooks_plot <- agg_data %>% 
    ggplot() +
    geom_col(aes(x = School_AnonID, 
                 y = cooks_d, 
                 fill = factor(ifelse(School_AnonID == "H07", "Highlighted", "Normal"))), 
             width = 0.5,
             show.legend = FALSE) +
    # Named values tie each colour to its label, so the mapping does not
    # depend on factor level order (or on H07 being present in the data).
    scale_fill_manual(name = "School_AnonID", 
                      values = c(Highlighted = "darkred", Normal = "grey50")) +
    xlab("School ID") +
    ylab("Cook's Distance") +
    theme_classic()
cooks_plot

In [None]:
# Write the Cook's distance figure to disk
ggsave(
    filename = "plots/cooks_plot.jpg",
    plot = cooks_plot,
    width = 8,
    height = 4
)

In [None]:
# Summarised comparisons (marginaleffects) for the adjusted model
all_model %>%
    comparisons() %>%
    summary()

In [None]:
# Round every numeric column of the comparison summary to 3 significant
# figures and write it to CSV. across() replaces the superseded mutate_if().
summary(comparisons(all_model)) %>% 
    dplyr::mutate(dplyr::across(where(is.numeric), ~ signif(.x, 3))) %>% 
    write_csv("all_model_margins.csv")

The occurrence of extreme COVID-19 outbreaks resulted in several "extreme" outlying observations of illness rates. One of these outbreaks, which took place over three weeks in September 2021, resulted in a single school having a uniquely high influence on the estimated effect of HEPA filtration. To analyse the effect of these outlying values, we defined a threshold, computed separately for each week and study arm, equal to the mean illness rate plus 3.25 times the interquartile range of illness rates; any observation above this threshold was removed from the weekly observations. 12 observations from 7 schools were identified as outliers. The data were then re-aggregated to mean illness rates over the study period.

In [None]:
# List the school-weeks flagged as outliers
filter(attendance, is_outlier)

In [None]:
# Number of flagged school-weeks per school
attendance %>%
    filter(is_outlier) %>%
    group_by(School_AnonID) %>%
    count()

In [None]:
options(repr.plot.width = 5, repr.plot.height = 5, repr.plot.res = 200)
# Same aggregation as for the full data set, after removing the school-weeks
# flagged as outliers: one row per school/arm/msoa.
agg_data_wo_outliers <- attendance %>% 
    filter(!is_outlier) %>%
    left_join(msoa_rates) %>%
    left_join(co2) %>%
    group_by(School_AnonID, arm, msoa) %>% 
    summarise(
        # Must be computed BEFORE illness_rate is redefined below: summarise()
        # lets later expressions see earlier results, so once the aggregate
        # illness_rate exists, mean(illness_rate) would just return it.
        mean_weekly_illness_ratio = mean(illness_rate),
        total_in_school = sum(in_school),
        total_ill = sum(ill),
        # pooled rate (%) over the period: total ill / total in school
        illness_rate = total_ill/total_in_school*100,
        mean_positive_tests = mean(covid_msoa_rate, na.rm=TRUE),
        mean_co2 = mean(co2_mean, na.rm=TRUE),
        # return an ungrouped table for modelling
        .groups = "drop") %>%
    # restore the original column position
    relocate(mean_weekly_illness_ratio, .after = illness_rate)
agg_data_wo_outliers %>% 
    ggplot() +
    geom_boxplot(aes(x=arm, y=illness_rate, color=arm))

In [None]:
# Adjusted Gamma GLM (log link) refitted on the data without outliers
no_out_model <- glm(
    formula = illness_rate ~ 1 + arm + mean_co2 + mean_positive_tests,
    data = agg_data_wo_outliers,
    family = Gamma(link = "log")
)
summary(no_out_model)

In [None]:
# Arm-only Gamma GLM (log link) refitted on the data without outliers
no_out_arm_model <- glm(
    formula = illness_rate ~ 1 + arm,
    data = agg_data_wo_outliers,
    family = Gamma(link = "log")
)
summary(no_out_arm_model)

In [None]:
# Comparison summary for the outlier-free adjusted model, with numeric
# columns rounded to 3 significant figures. across() replaces the superseded
# mutate_if().
summary(comparisons(no_out_model)) %>%
    dplyr::mutate(dplyr::across(where(is.numeric), ~ signif(.x, 3)))

In [None]:
# Round every numeric column of the comparison summary to 3 significant
# figures and write it to CSV. across() replaces the superseded mutate_if().
summary(comparisons(no_out_model)) %>%
    dplyr::mutate(dplyr::across(where(is.numeric), ~ signif(.x, 3))) %>% 
    write_csv("no_out_margins.csv")

In [None]:
# Marginal means from the arm-only model fitted without outliers
no_out_arm_model %>%
    marginalmeans()

In [None]:
options(repr.plot.width = 2.5, repr.plot.height = 4, repr.plot.res = 400)
# Point-and-interval plot of the marginal means from the adjusted,
# outlier-free model; the legend is hidden on both layers.
mm_plot <- no_out_model %>%
    marginalmeans() %>%
    ggplot(aes(x = value, y = marginalmean, colour = value)) +
    geom_point(size = 3, show.legend = FALSE) +
    geom_errorbar(
        aes(ymin = conf.low, ymax = conf.high),
        width = 0.05,
        show.legend = FALSE
    ) +
    labs(x = "Study Arm", y = "Predicted Illness Rate") +
    theme_classic()
mm_plot

In [None]:
# Write the marginal-means figure to disk
ggsave(
    filename = "plots/mm_values.jpg",
    plot = mm_plot,
    width = 2.5,
    height = 4,
    dpi = 400
)