uclalawcovid19behindbars · nmmarquez · Jun 29, 2021 · Jun 1, 2021 · Jun 14, 2021 · Jun 15, 2021
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,6 +1,7 @@
 # Generated by roxygen2: do not edit by hand
 
 export(ExtractTable)
+export(alt_aggregate_counts)
 export(calc_aggregate_counts)
 export(clean_fac_col_txt)
 export(clean_facility_name)
@@ -17,6 +18,7 @@ export(is_hifld_id)
 export(is_valid_state)
 export(last_not_na)
 export(list_remote_data)
+export(max_na_rm)
 export(merge_facility_info)
 export(plot_fac_trend)
 export(plot_recent_fac_increases)
@@ -26,7 +28,9 @@ export(read_fac_info)
 export(read_fac_spellings)
 export(read_hifld_data)
 export(read_mpap_data)
+export(read_mpap_pop_data)
 export(read_scrape_data)
+export(read_staff_popfeb20)
 export(read_vera_pop)
 export(reorder_cols)
 export(scale_color_bbcontinous)

diff --git a/R/alt_aggregate_counts.R b/R/alt_aggregate_counts.R
@@ -0,0 +1,169 @@
+#' Alternate Aggregate UCLA data for website groupings
+#'
+#' Reads the UCLA aggregates counts for states for most recent
+#' data within a given window and reports either state level data or national
+#' data. States include values for the 50 states broken down by carceral type,
+#' prison, ICE, Federal, Juvenile, Psychiatric, and county. For prisons, data
+#' from the Marshall project is also incorporated.
+#'
+#' @param window integer, the day range of acceptable data to pull from, ignored
+#' if all dates is true
+#' @param all_dates logical, get time series data rather than just latest counts
+#' @param week_grouping logical, use weekly grouping for past data? else monthly
+#'
+#' @return data frame with aggregated counts at the state level by web groups
+#'
+#' @examples
+#' \dontrun{
+#' alt_aggregate_counts()
+#' }
+#' @export
+
+alt_aggregate_counts <- function(
+    window = 31, all_dates = FALSE, week_grouping = TRUE){
+
+    # How to round data when doing all dates
+    round_ <- ifelse(week_grouping, "week", "month")
+
+    # read in ucla data and do the appropriate grouping
+    fac_long_df <- read_scrape_data(
+        window = window, all_dates = all_dates, wide_data = FALSE) %>%
+        mutate(Web.Group = case_when(
+            Jurisdiction == "immigration" ~ "ICE",
+            Jurisdiction == "federal" ~ "Federal",
+            Age == "Juvenile" ~ "Juvenile",
+            Jurisdiction == "state" ~ "Prison",
+            Jurisdiction == "psychiatric" ~ "Psychiatric",
+            Jurisdiction == "cou nty" ~ "County",
+            TRUE ~ NA_character_
+        ))  %>%
+        filter(State != "Not Available") %>%
+        select(
+            Name, Date, State, Measure, Web.Group, value, Population.Feb20) %>%
+        mutate(Rate = value/Population.Feb20)
+
+    # pull in the comparable MP data
+    mp_df <- read_mpap_data(
+        all_dates = all_dates, window = window) %>%
+        filter(State != "Federal")%>%
+        tidyr::pivot_longer(
+            -(State:Date), names_to = "Measure", values_to = "MP")
+
+    mp_pop_df <- read_mpap_pop_data() %>%
+        tidyr::pivot_longer(
+            -State, names_to = "Group", values_to = "Population.Feb20") %>%
+        mutate(Group = ifelse(
+            str_starts(Group, "Staff"), "Staff", "Residents"))
+
+    if(all_dates){
+        mp_df <- mp_df %>%
+            filter(!is.na(Date)) %>%
+            mutate(Date = lubridate::floor_date(Date, round_)) %>%
+            group_by(State, Date, Measure) %>%
+            filter(MP == max_na_rm(MP)) %>%
+            filter(1:n() == 1) %>%
+            ungroup()
+    }
+
+    pop_threshold <- .8
+
+    # aggregate the data together
+    if(all_dates){
+        state_df <- fac_long_df %>%
+            mutate(Date = lubridate::floor_date(Date, round_)) %>%
+            rename(UCLA = value) %>%
+            filter(!is.na(UCLA)) %>%
+            group_by(
+                State, Date, Measure, Web.Group, Name, Population.Feb20) %>%
+            summarize(UCLA = max_na_rm(UCLA), .groups = "drop_last") %>%
+            mutate(has_statewide = "STATEWIDE" %in% Name) %>%
+            # if state wide and other counts exist for a measure
+            # only take max date
+            filter(!(has_statewide) | Date == max(Date)) %>%
+            mutate(has_statewide = "STATEWIDE" %in% Name) %>%
+            # if state wide and other counts still exist for a measure
+            # only use statewide
+            filter(!(has_statewide & Name != "STATEWIDE")) %>%
+            group_by(State, Date, Web.Group, Measure) %>%
+            mutate(rem_thresh =
+                       mean(!is.na(Population.Feb20)) < pop_threshold) %>%
+            mutate(Population.Feb20 = ifelse(
+                rem_thresh, NA, Population.Feb20)) %>%
+            select(-rem_thresh) %>%
+            mutate(Rate = UCLA/Population.Feb20) %>%
+            summarise(
+                UCLA = sum_na_rm(UCLA),
+                Rate = sum_na_rm(Rate*Population.Feb20)/
+                    sum_na_rm(Population.Feb20),
+                Date = max(Date), .groups = "drop") %>%
+            mutate(Rate = ifelse(str_starts(Measure, "Staff"), NA, Rate)) %>%
+            filter(!str_ends(Measure, "Population"))
+
+        pri_df <- state_df %>%
+            filter(Web.Group == "Prison") %>%
+            full_join(mp_df, by = c("State", "Date", "Measure"))
+
+    }else{
+        state_df <- fac_long_df %>%
+            rename(UCLA = value) %>%
+            filter(!is.na(UCLA)) %>%
+            group_by(State, Measure, Web.Group) %>%
+            mutate(has_statewide = "STATEWIDE" %in% Name) %>%
+            # if state wide and other counts exist for a measure only take more
+            # recently scraped data
+            filter(!(has_statewide) | Date == max(Date)) %>%
+            mutate(has_statewide = "STATEWIDE" %in% Name) %>%
+            # if state wide and other counts still exist for a measure only
+            # use statewide measures
+            filter(!(has_statewide & Name != "STATEWIDE")) %>%
+            group_by(State, Measure, Web.Group) %>%
+            mutate(rem_thresh =
+                       mean(!is.na(Population.Feb20)) < pop_threshold) %>%
+            mutate(Population.Feb20 = ifelse(
+                rem_thresh, NA, Population.Feb20)) %>%
+            select(-rem_thresh) %>%
+            summarise(
+                UCLA = sum_na_rm(UCLA),
+                Rate = sum_na_rm(Rate*Population.Feb20)/
+                    sum_na_rm(Population.Feb20),
+                Date = max(Date), .groups = "drop") %>%
+            mutate(Rate = ifelse(str_starts(Measure, "Staff"), NA, Rate)) %>%
+            filter(!str_ends(Measure, "Population"))
+
+        pri_df <- state_df %>%
+            filter(Web.Group == "Prison") %>%
+            full_join(select(mp_df, -Date), by = c("State", "Measure"))
+    }
+
+    # combine MP and UCLA data
+    out_agg_df <- pri_df %>%
+        mutate(Group = ifelse(
+            str_starts(Measure, "Staff"), "Staff", "Residents")) %>%
+        left_join(mp_pop_df, by = c("State", "Group")) %>%
+        select(-Group) %>%
+        mutate(Val = case_when(
+            is.na(UCLA) & is.na(MP) ~ NA_real_,
+            is.na(UCLA) ~ MP,
+            is.na(MP) ~ UCLA,
+            UCLA >= MP ~ UCLA,
+            TRUE ~ MP
+        )) %>%
+        mutate(Rate = ifelse(is.na(Rate), UCLA/Population.Feb20, Rate)) %>%
+        mutate(
+            Rate = case_when(
+                is.na(UCLA) & is.na(MP) ~ NA_real_,
+                is.na(UCLA) ~ MP/Population.Feb20,
+                is.na(MP) ~ Rate,
+                UCLA >= MP ~ Rate,
+                TRUE ~ MP/Population.Feb20
+            )
+        ) %>%
+        select(-MP, -UCLA, -Population.Feb20) %>%
+        bind_rows(
+            state_df %>%
+                filter(Web.Group != "Prison") %>%
+                rename(Val = UCLA)
+        )
+
+    return(out_agg_df)
+}
diff --git a/R/calc_aggregate_counts.R b/R/calc_aggregate_counts.R
@@ -1,10 +1,3 @@
-max_na_rm <- function(x){
-    if(all(is.na(x))){
-        return(NA)
-    }
-    max(x, na.rm = TRUE)
-}
-
 #' Aggregate UCLA and MP data to get a more recent accurate count of COVID variables
 #'
 #' Reads the UCLA and MP/AP dataset aggregates counts for states for most recent

diff --git a/R/max_na_rm.R b/R/max_na_rm.R
@@ -0,0 +1,20 @@
+#' A re-coding of the max function which returns NA if all values are NA
+#'
+#' A re-coding of the max function which returns NA if all values are NA.
+#' Normally max(na.rm=TRUE) will return -Inf if all values are NA.
+#'
+#' @param x numeric vector
+#' @return numeric vector of length 1
+#'
+#' @examples
+#' max_na_rm(c(1:2, NA))
+#' max_na_rm(c(NA, NA))
+#'
+#' @export
+
+max_na_rm <- function(x){
+    if(all(is.na(x))){
+        return(NA)
+    }
+    max(x, na.rm = TRUE)
+}
diff --git a/R/read_mpap_data.R b/R/read_mpap_data.R
@@ -23,6 +23,7 @@ read_mpap_data <- function(all_dates = FALSE, window = 14){
 
     rename_df <- mp_raw_df %>%
         mutate(Date = lubridate::mdy(as_of_date)) %>%
+        filter(!is.na(Date)) %>%
         select(
             State = name,
             Date,

diff --git a/R/read_mpap_pop_data.R b/R/read_mpap_pop_data.R
@@ -0,0 +1,39 @@
+#' Get Feb 20, 2020 population data for state prisons from the MP
+#'
+#' Reads the MP dataset and for each prison jurisdiction pulls the staff and
+#' resident population data closest to Feb 20, 2020.
+#'
+#' @return data frame with jurisdiction level pop data from MP
+#'
+#' @examples
+#' \dontrun{
+#' read_staff_popfeb20()
+#' }
+#' @export
+
+
+read_mpap_pop_data <- function(){
+    pri_pop_df <- "https://raw.githubusercontent.com/themarshallproject/" %>%
+        str_c("COVID_prison_data/master/data/prison_populations.csv") %>%
+        readr::read_csv(col_types = readr::cols()) %>%
+        mutate(Date = lubridate::mdy(as_of_date)) %>%
+        mutate(dtime = abs(as.numeric(Date - lubridate::ymd("2020-02-01")))) %>%
+        group_by(name) %>%
+        filter(dtime == min(dtime)) %>%
+        filter(Date == min(Date)) %>%
+        ungroup() %>%
+        select(State = name, Population.Feb20 = pop)
+
+    staff_pop_df <- "https://raw.githubusercontent.com/themarshallproject/" %>%
+        str_c("COVID_prison_data/master/data/staff_populations.csv") %>%
+        readr::read_csv(col_types = readr::cols()) %>%
+        mutate(Date = lubridate::mdy(as_of_date)) %>%
+        mutate(dtime = abs(as.numeric(Date - lubridate::ymd("2020-02-01")))) %>%
+        group_by(name) %>%
+        filter(dtime == min(dtime)) %>%
+        filter(Date == min(Date)) %>%
+        ungroup() %>%
+        select(State = name, Staff.Population.Feb20 = pop)
+
+    full_join(pri_pop_df, staff_pop_df, by = "State")
+}
diff --git a/R/read_staff_popfeb20.R b/R/read_staff_popfeb20.R
@@ -0,0 +1,25 @@
+#' Get Feb 20, 2020 population data for applicable rows in the fac_data
+#'
+#' Reads the UCLA dataset and for each facility pulls the staff population data
+#' closest to Feb 20, 2020 in the UCLA historical data.
+#'
+#' @return data frame with population data for staff by facility
+#'
+#' @examples
+#' \dontrun{
+#' read_staff_popfeb20()
+#' }
+#' @export
+
+read_staff_popfeb20 <- function(){
+    read_scrape_data(
+        window = window, all_dates = T, wide_data = FALSE) %>%
+        filter(Measure == "Staff.Population") %>%
+        filter(!is.na(Facility.ID) & !is.na(value)) %>%
+        group_by(Facility.ID) %>%
+        mutate(dtime = abs(as.numeric(Date - lubridate::ymd("2020-02-01")))) %>%
+        filter(dtime == min(dtime)) %>%
+        filter(Date == min(Date)) %>%
+        ungroup() %>%
+        select(Facility.ID, Name, State, Staff.Population.Feb20 = value)
+}
diff --git a/man/alt_aggregate_counts.Rd b/man/alt_aggregate_counts.Rd
diff --git a/man/max_na_rm.Rd b/man/max_na_rm.Rd
diff --git a/man/read_mpap_pop_data.Rd b/man/read_mpap_pop_data.Rd
diff --git a/man/read_staff_popfeb20.Rd b/man/read_staff_popfeb20.Rd