Merge pull request #538 from walkerke/ddhca

Merge v1.5, with full 2022 1-year ACS support, DDHCA support, and bug fixes
walkerke · Sep 25, 2023 · 4bf8e83 · 4bf8e83
2 parents 9fc4404 + e1cb10e
commit 4bf8e83
Show file tree

Hide file tree

Showing 12 changed files with 256 additions and 39 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,12 +1,12 @@
 Package: tidycensus
 Type: Package
 Title: Load US Census Boundary and Attribute Data as 'tidyverse' and 'sf'-Ready Data Frames
-Version: 1.4.5
+Version: 1.5
 Authors@R: c(
     person(given = "Kyle", family = "Walker", email="kyle@walker-data.com", role=c("aut", "cre")),
     person(given = "Matt", family = "Herman", email = "mfherman@gmail.com", role = "aut"),
     person(given = "Kris", family = "Eberwein", email = "eberwein@knights.ucf.edu", role = "ctb"))
-Date: 2023-09-14
+Date: 2023-09-25
 URL: https://walker-data.com/tidycensus/
 BugReports: https://github.com/walkerke/tidycensus/issues
 Description: An integrated R interface to several United States Census Bureau 

diff --git a/NAMESPACE b/NAMESPACE
@@ -2,10 +2,12 @@
 
 export(as_dot_density)
 export(census_api_key)
+export(check_ddhca_groups)
 export(get_acs)
 export(get_decennial)
 export(get_estimates)
 export(get_flows)
+export(get_pop_groups)
 export(get_pums)
 export(interpolate_pw)
 export(load_variables)
@@ -36,6 +38,7 @@ importFrom(purrr,map_chr)
 importFrom(purrr,map_dbl)
 importFrom(purrr,map_df)
 importFrom(purrr,map_dfc)
+importFrom(purrr,map_lgl)
 importFrom(purrr,modify_depth)
 importFrom(purrr,reduce)
 importFrom(readr,parse_factor)

diff --git a/R/census.R b/R/census.R
@@ -36,6 +36,8 @@
 #' @param summary_var Character string of a "summary variable" from the decennial Census
 #'                    to be included in your output. Usually a variable (e.g. total population)
 #'                    that you'll want to use as a denominator or comparison.
+#' @param pop_group The population group code for which you'd like to request data.  Applies to summary files for which population group breakdowns are available like the Detailed DHC-A file.
+#' @param pop_group_label If \code{TRUE}, return a \code{"pop_group_label"} column that contains the label for the population group.  Defaults to \code{FALSE}.
 #' @param key Your Census API key.
 #'            Obtain one at \url{https://api.census.gov/data/key_signup.html}
 #' @param show_call if TRUE, display call made to Census API. This can be very useful
@@ -77,11 +79,17 @@ get_decennial <- function(geography,
                           keep_geo_vars = FALSE,
                           shift_geo = FALSE,
                           summary_var = NULL,
+                          pop_group = NULL,
+                          pop_group_label = FALSE,
                           key = NULL,
                           show_call = FALSE,
                           ...
                           ) {
 
+  if (sumfile == "ddhca" && is.null(pop_group)) {
+    rlang::abort("You must specify a population group to use the DDHC-A file. Look up codes with `get_pop_groups()` or specify `pop_group = 'all' to get all available population groups for a given geography / variable combination.")
+  }
+
   if (shift_geo) {
     warning("The `shift_geo` argument is deprecated and will be removed in a future release. We recommend using `tigris::shift_geometry()` instead.", call. = FALSE)
   }
@@ -231,6 +239,8 @@ get_decennial <- function(geography,
                                keep_geo_vars = keep_geo_vars,
                                shift_geo = FALSE,
                                summary_var = summary_var,
+                               pop_group = pop_group,
+                               pop_group_label = pop_group_label,
                                key = key,
                                show_call = show_call,
                                ...)
@@ -260,6 +270,8 @@ get_decennial <- function(geography,
                                keep_geo_vars = keep_geo_vars,
                                shift_geo = FALSE,
                                summary_var = summary_var,
+                               pop_group = pop_group,
+                               pop_group_label = pop_group_label,
                                key = key,
                                show_call = show_call))
       })
@@ -277,17 +289,25 @@ get_decennial <- function(geography,
   if (length(variables) > 48) {
     l <- split(variables, ceiling(seq_along(variables) / 48))
 
+    if (!is.null(pop_group)) {
+      join_vars <- c("GEOID", "NAME", "POPGROUP")
+    } else {
+      join_vars <- c("GEOID", "NAME")
+    }
+
     dat <- map(l, function(x) {
-      d <- try(load_data_decennial(geography, x, key, year, sumfile, state, county, show_call = show_call),
+      d <- try(load_data_decennial(geography, x, key, year, sumfile, pop_group, state, county, show_call = show_call),
                silent = silent)
+
       # If sf1 fails, try to get it from sf3
       if (inherits(d, "try-error") && year < 2010) {
 
         # stop("The 2000 decennial Census SF3 endpoint has been removed by the Census Bureau. We will support this data again when the endpoint is updated; in the meantime, we recommend using NHGIS (https://nhgis.org) and the ipumsr R package.", call. = FALSE)
 
-        d <- try(suppressMessages(load_data_decennial(geography, x, key, year, sumfile = "sf3", state, county, show_call = show_call)))
+        d <- try(suppressMessages(load_data_decennial(geography, x, key, year, sumfile = "sf3", pop_group, state, county, show_call = show_call)))
         message("Variables not found in Summary File 1. Trying Summary File 3...")
       } else {
+
         if (sumfile == "sf3") {
           message("Using Census Summary File 3")
         } else if (sumfile == "sf1") {
@@ -298,13 +318,16 @@ get_decennial <- function(geography,
           message("Using the Demographic and Housing Characteristics File")
         } else if (sumfile == "dp") {
           message("Using the Demographic Profile")
+        } else if (sumfile == "ddhca") {
+          message("Using the Detailed DHC-A File")
         }
+
       }
       d
     }) %>%
-      reduce(left_join, by = c("GEOID", "NAME"))
+      reduce(left_join, by = join_vars)
   } else {
-    dat <- try(load_data_decennial(geography, variables, key, year, sumfile, state, county, show_call = show_call),
+    dat <- try(load_data_decennial(geography, variables, key, year, sumfile, pop_group, state, county, show_call = show_call),
                silent = silent)
 
     # If sf1 fails, try to get it from sf3
@@ -313,7 +336,7 @@ get_decennial <- function(geography,
 
       # stop("The 2000 decennial Census SF3 endpoint has been removed by the Census Bureau. We will support this data again when the endpoint is updated; in the meantime, we recommend using NHGIS (https://nhgis.org) and the ipumsr R package.", call. = FALSE)
 
-      dat <- try(suppressMessages(load_data_decennial(geography, variables, key, year, sumfile = "sf3", state, county, show_call = show_call)))
+      dat <- try(suppressMessages(load_data_decennial(geography, variables, key, year, sumfile = "sf3", pop_group, state, county, show_call = show_call)))
       message("Variables not found in Summary File 1. Trying Summary File 3...")
     } else {
       if (sumfile == "sf3") {
@@ -326,17 +349,32 @@ get_decennial <- function(geography,
         message("Using the Demographic and Housing Characteristics File")
       } else if (sumfile == "dp") {
         message("Using the Demographic Profile")
+      } else if (sumfile == "ddhca") {
+        message("Using the Detailed DHC-A File")
       }
+
     }
 
   }
 
+  if (inherits(dat, "try-error")) {
+    rlang::abort(message = dat)
+  }
+
   if (output == "tidy") {
 
-    sub <- dat[c("GEOID", "NAME", variables)]
+    if (is.null(pop_group)) {
+      sub <- dat[c("GEOID", "NAME", variables)]
+
+      dat2 <- sub %>%
+        gather(key = variable, value = value, -GEOID, -NAME)
+    } else {
+      sub <- dat[c("GEOID", "NAME", "POPGROUP", variables)]
 
-    dat2 <- sub %>%
-      gather(key = variable, value = value, -GEOID, -NAME)
+      dat2 <- sub %>%
+        gather(key = variable, value = value, -GEOID, -NAME, -POPGROUP) %>%
+        dplyr::rename(pop_group = POPGROUP)
+    }
 
     if (!is.null(names(variables))) {
       for (i in 1:length(variables)) {
@@ -378,9 +416,28 @@ get_decennial <- function(geography,
     dat2[dat2 == -888888888] <- NA
     dat2[dat2 == -999999999] <- NA
 
-    dat2 <- dat2 %>%
-      select(GEOID, NAME, everything())
+    if ("POPGROUP" %in% names(dat2)) {
+      dat2 <- dat2 %>%
+        select(GEOID, NAME, POPGROUP, everything()) %>%
+        dplyr::rename(pop_group = POPGROUP)
+    } else {
+      dat2 <- dat2 %>%
+        select(GEOID, NAME, everything())
+    }
+
+  }
 
+  # If label is requested, join it here
+  if (pop_group_label) {
+    if (is.null(pop_group)) {
+      rlang::abort("This argument is only available when specifying a population group, which is only available for selected datasets.")
+    }
+
+    labels = get_pop_groups(year = year, sumfile = sumfile)
+    dat2 <- dat2 %>%
+      dplyr::left_join(labels, by = "pop_group") %>%
+      dplyr::select(GEOID, NAME, pop_group, pop_group_label,
+                    dplyr::everything())
   }
 
   # For ZCTAs, strip the state code from GEOID (issue #338 and #358)
@@ -445,7 +502,7 @@ get_decennial <- function(geography,
         geom <- try(suppressMessages(use_tigris(geography = geography, year = year,
                                                 state = state, county = county, criteria = "2020", ...)))
       } else if (sumfile == "cd118") {
-        stop("Geometry is not yet available for this sumfile, but will be in mid-September 2023.")
+        stop("Geometry is not yet available for this sumfile in tidycensus.")
         # try(suppressMessages(use_tigris(geography = geography, year = 2022,
         #                                 state = state, county = county, ...)))
       } else {

diff --git a/R/helpers.R b/R/helpers.R
@@ -231,12 +231,15 @@ use_tigris <- function(geography, year, cb = TRUE, resolution = "500k",
 
     # Right now, PUMAs are not defined for 2020 and are not in the CB file
     # Use the 2019 CB pumas for 2020 and 2021 as they align with the boundaries
-    # used; 2022 should switch to the new 2020 PUMA boundaries (which aren't
-    # yet available)
+    # used; 2022 should switch to the new 2020 PUMA boundaries which are available       in the 2020 CB file
     if (year %in% 2020:2021) {
       if (cb) {
         year <- 2019
       }
+    } else if (year >= 2022) {
+      if (cb) {
+        year <- 2020
+      }
     }
 
     state_ids <- c("AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA",
@@ -251,15 +254,20 @@ use_tigris <- function(geography, year, cb = TRUE, resolution = "500k",
       }) %>%
         rbind_tigris()
     } else if (is.null(state)) {
-      pm <- purrr::map(state_ids, function(x) {
-        pumas(state = x, cb = cb, year = year, class = "sf", ...)
-      }) %>%
-        rbind_tigris()
+
+      if (!year %in% 2019:2020 && !cb) {
+        pm <- purrr::map(state_ids, function(x) {
+          pumas(state = x, cb = cb, year = year, class = "sf", ...)
+        }) %>%
+          rbind_tigris()
+      } else {
+        pm <- pumas(state = state, cb = cb, year = year, class = "sf", ...)
+      }
     } else {
       pm <- pumas(state = state, cb = cb, year = year, class = "sf", ...)
     }
 
-    if (year > 2021) {
+    if ("GEOID20" %in% names(pm)) {
       pm <- rename(pm, GEOID = GEOID20)
     } else {
       pm <- rename(pm, GEOID = GEOID10)
@@ -594,7 +602,7 @@ variables_from_table_decennial <- function(table, year, sumfile, cache_table) {
 
   # Find all variables that match the table
 
-  if (year == 2020 && sumfile %in% c("pl", "dhc", "dp")) {
+  if (year == 2020 && sumfile %in% c("pl", "dhc", "dp", "ddhca")) {
     vars <- df %>%
       filter(grepl(paste0(table, "_[0-9]+"), name)) %>%
       pull(name)
@@ -653,7 +661,7 @@ summary_files <- function(year) {
                   "gu", "vi", "cd113", "cd113profile",
                   "cd115", "cd115profile", "cd116")
   } else if (year == 2020) {
-    sumfiles <- c("pl", "dhc", "dp", "pes", "dpas",
+    sumfiles <- c("pl", "dhc", "dp", "pes", "dpas", "ddhca",
                   "dpmp", "dpgu", "dpvi",
                   "dhcvi", "dhcgu", "dhcvi", "dhcas", "cd118")
   } else {

diff --git a/R/load_data.R b/R/load_data.R
@@ -319,7 +319,7 @@ load_data_acs <- function(geography, formatted_variables, key, year, state = NUL
 }
 
 
-load_data_decennial <- function(geography, variables, key, year, sumfile,
+load_data_decennial <- function(geography, variables, key, year, sumfile, pop_group,
                                 state = NULL, county = NULL, show_call = FALSE) {
 
 
@@ -331,6 +331,11 @@ load_data_decennial <- function(geography, variables, key, year, sumfile,
     vars_to_get <- paste0(var, ",NAME")
   }
 
+  if (pop_group == "all") {
+    vars_to_get <- paste0(vars_to_get, ",POPGROUP")
+    pop_group <- NULL
+  }
+
 
   base <- paste0("https://api.census.gov/data/",
                                   year,
@@ -405,21 +410,24 @@ load_data_decennial <- function(geography, variables, key, year, sumfile,
 
       call <- GET(base, query = list(get = vars_to_get,
                                      "for" = for_area,
-                                     key = key))
+                                     key = key,
+                                     "POPGROUP" = pop_group))
     } else {
 
       call <- GET(base, query = list(get = vars_to_get,
                                      "for" = for_area,
                                      "in" = in_area,
-                                     key = key))
+                                     key = key,
+                                     "POPGROUP" = pop_group))
     }
   }
 
   else {
 
     call <- GET(base, query = list(get = vars_to_get,
                                    "for" = paste0(geography, ":*"),
-                                   key = key))
+                                   key = key,
+                                   "POPGROUP" = pop_group))
   }
 
   if (show_call) {
@@ -428,6 +436,22 @@ load_data_decennial <- function(geography, variables, key, year, sumfile,
   }
 
   # Make sure call status returns 200, else, print the error message for the user.
+  # Try to handle 204's here
+  if (call$status_code == 204) {
+
+    if (sumfile == "ddhca") {
+      rlang::abort(c("Your DDHC-A request returned No Content from the API.",
+                     "i" = "The DDHC-A file uses an 'adaptive design' where data availability varies by geography and by population group.",
+                     "i" = "Read Section 3-1 at https://www2.census.gov/programs-surveys/decennial/2020/technical-documentation/complete-tech-docs/detailed-demographic-and-housing-characteristics-file-a/2020census-detailed-dhc-a-techdoc.pdf for more information.",
+                     "i" = "In tidycensus, use the function `check_ddhca_groups()` to see if your data is available."))
+
+    } else {
+      rlang::abort("No content was returned from the API.  Please refine your selection.")
+
+    }
+
+  }
+
   if (call$status_code != 200) {
     msg <- content(call, as = "text")
 
@@ -480,9 +504,11 @@ load_data_decennial <- function(geography, variables, key, year, sumfile,
     dat <- rename(dat, NAME = ANPSADPI)
   }
 
-  dat[variables] <- lapply(dat[variables], as.numeric)
+  vnum <- variables[variables != "POPGROUP"]
+
+  dat[vnum] <- lapply(dat[vnum], as.numeric)
 
-  v2 <- c(variables, "NAME")
+  v2 <- c(variables, "NAME", "POPGROUP")
 
   # Get the geography ID variables
   id_vars <- names(dat)[! names(dat) %in% v2]