Skip to content

Commit

Permalink
Merge pull request #538 from walkerke/ddhca
Browse files Browse the repository at this point in the history
Merge v1.5, with full 2022 1-year ACS support, DDHCA support, and bug fixes
  • Loading branch information
walkerke committed Sep 25, 2023
2 parents 9fc4404 + e1cb10e commit 4bf8e83
Show file tree
Hide file tree
Showing 12 changed files with 256 additions and 39 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
Package: tidycensus
Type: Package
Title: Load US Census Boundary and Attribute Data as 'tidyverse' and 'sf'-Ready Data Frames
Version: 1.4.5
Version: 1.5
Authors@R: c(
person(given = "Kyle", family = "Walker", email="kyle@walker-data.com", role=c("aut", "cre")),
person(given = "Matt", family = "Herman", email = "mfherman@gmail.com", role = "aut"),
person(given = "Kris", family = "Eberwein", email = "eberwein@knights.ucf.edu", role = "ctb"))
Date: 2023-09-14
Date: 2023-09-25
URL: https://walker-data.com/tidycensus/
BugReports: https://github.com/walkerke/tidycensus/issues
Description: An integrated R interface to several United States Census Bureau
Expand Down
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@

export(as_dot_density)
export(census_api_key)
export(check_ddhca_groups)
export(get_acs)
export(get_decennial)
export(get_estimates)
export(get_flows)
export(get_pop_groups)
export(get_pums)
export(interpolate_pw)
export(load_variables)
Expand Down Expand Up @@ -36,6 +38,7 @@ importFrom(purrr,map_chr)
importFrom(purrr,map_dbl)
importFrom(purrr,map_df)
importFrom(purrr,map_dfc)
importFrom(purrr,map_lgl)
importFrom(purrr,modify_depth)
importFrom(purrr,reduce)
importFrom(readr,parse_factor)
Expand Down
79 changes: 68 additions & 11 deletions R/census.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@
#' @param summary_var Character string of a "summary variable" from the decennial Census
#' to be included in your output. Usually a variable (e.g. total population)
#' that you'll want to use as a denominator or comparison.
#' @param pop_group The population group code for which you'd like to request data. Applies to summary files for which population group breakdowns are available like the Detailed DHC-A file.
#' @param pop_group_label If \code{TRUE}, return a \code{"pop_group_label"} column that contains the label for the population group. Defaults to \code{FALSE}.
#' @param key Your Census API key.
#' Obtain one at \url{https://api.census.gov/data/key_signup.html}
#' @param show_call if TRUE, display call made to Census API. This can be very useful
Expand Down Expand Up @@ -77,11 +79,17 @@ get_decennial <- function(geography,
keep_geo_vars = FALSE,
shift_geo = FALSE,
summary_var = NULL,
pop_group = NULL,
pop_group_label = FALSE,
key = NULL,
show_call = FALSE,
...
) {

if (sumfile == "ddhca" && is.null(pop_group)) {
rlang::abort("You must specify a population group to use the DDHC-A file. Look up codes with `get_pop_groups()` or specify `pop_group = 'all' to get all available population groups for a given geography / variable combination.")
}

if (shift_geo) {
warning("The `shift_geo` argument is deprecated and will be removed in a future release. We recommend using `tigris::shift_geometry()` instead.", call. = FALSE)
}
Expand Down Expand Up @@ -231,6 +239,8 @@ get_decennial <- function(geography,
keep_geo_vars = keep_geo_vars,
shift_geo = FALSE,
summary_var = summary_var,
pop_group = pop_group,
pop_group_label = pop_group_label,
key = key,
show_call = show_call,
...)
Expand Down Expand Up @@ -260,6 +270,8 @@ get_decennial <- function(geography,
keep_geo_vars = keep_geo_vars,
shift_geo = FALSE,
summary_var = summary_var,
pop_group = pop_group,
pop_group_label = pop_group_label,
key = key,
show_call = show_call))
})
Expand All @@ -277,17 +289,25 @@ get_decennial <- function(geography,
if (length(variables) > 48) {
l <- split(variables, ceiling(seq_along(variables) / 48))

if (!is.null(pop_group)) {
join_vars <- c("GEOID", "NAME", "POPGROUP")
} else {
join_vars <- c("GEOID", "NAME")
}

dat <- map(l, function(x) {
d <- try(load_data_decennial(geography, x, key, year, sumfile, state, county, show_call = show_call),
d <- try(load_data_decennial(geography, x, key, year, sumfile, pop_group, state, county, show_call = show_call),
silent = silent)

# If sf1 fails, try to get it from sf3
if (inherits(d, "try-error") && year < 2010) {

# stop("The 2000 decennial Census SF3 endpoint has been removed by the Census Bureau. We will support this data again when the endpoint is updated; in the meantime, we recommend using NHGIS (https://nhgis.org) and the ipumsr R package.", call. = FALSE)

d <- try(suppressMessages(load_data_decennial(geography, x, key, year, sumfile = "sf3", state, county, show_call = show_call)))
d <- try(suppressMessages(load_data_decennial(geography, x, key, year, sumfile = "sf3", pop_group, state, county, show_call = show_call)))
message("Variables not found in Summary File 1. Trying Summary File 3...")
} else {

if (sumfile == "sf3") {
message("Using Census Summary File 3")
} else if (sumfile == "sf1") {
Expand All @@ -298,13 +318,16 @@ get_decennial <- function(geography,
message("Using the Demographic and Housing Characteristics File")
} else if (sumfile == "dp") {
message("Using the Demographic Profile")
} else if (sumfile == "ddhca") {
message("Using the Detailed DHC-A File")
}

}
d
}) %>%
reduce(left_join, by = c("GEOID", "NAME"))
reduce(left_join, by = join_vars)
} else {
dat <- try(load_data_decennial(geography, variables, key, year, sumfile, state, county, show_call = show_call),
dat <- try(load_data_decennial(geography, variables, key, year, sumfile, pop_group, state, county, show_call = show_call),
silent = silent)

# If sf1 fails, try to get it from sf3
Expand All @@ -313,7 +336,7 @@ get_decennial <- function(geography,

# stop("The 2000 decennial Census SF3 endpoint has been removed by the Census Bureau. We will support this data again when the endpoint is updated; in the meantime, we recommend using NHGIS (https://nhgis.org) and the ipumsr R package.", call. = FALSE)

dat <- try(suppressMessages(load_data_decennial(geography, variables, key, year, sumfile = "sf3", state, county, show_call = show_call)))
dat <- try(suppressMessages(load_data_decennial(geography, variables, key, year, sumfile = "sf3", pop_group, state, county, show_call = show_call)))
message("Variables not found in Summary File 1. Trying Summary File 3...")
} else {
if (sumfile == "sf3") {
Expand All @@ -326,17 +349,32 @@ get_decennial <- function(geography,
message("Using the Demographic and Housing Characteristics File")
} else if (sumfile == "dp") {
message("Using the Demographic Profile")
} else if (sumfile == "ddhca") {
message("Using the Detailed DHC-A File")
}

}

}

if (inherits(dat, "try-error")) {
rlang::abort(message = dat)
}

if (output == "tidy") {

sub <- dat[c("GEOID", "NAME", variables)]
if (is.null(pop_group)) {
sub <- dat[c("GEOID", "NAME", variables)]

dat2 <- sub %>%
gather(key = variable, value = value, -GEOID, -NAME)
} else {
sub <- dat[c("GEOID", "NAME", "POPGROUP", variables)]

dat2 <- sub %>%
gather(key = variable, value = value, -GEOID, -NAME)
dat2 <- sub %>%
gather(key = variable, value = value, -GEOID, -NAME, -POPGROUP) %>%
dplyr::rename(pop_group = POPGROUP)
}

if (!is.null(names(variables))) {
for (i in 1:length(variables)) {
Expand Down Expand Up @@ -378,9 +416,28 @@ get_decennial <- function(geography,
dat2[dat2 == -888888888] <- NA
dat2[dat2 == -999999999] <- NA

dat2 <- dat2 %>%
select(GEOID, NAME, everything())
if ("POPGROUP" %in% names(dat2)) {
dat2 <- dat2 %>%
select(GEOID, NAME, POPGROUP, everything()) %>%
dplyr::rename(pop_group = POPGROUP)
} else {
dat2 <- dat2 %>%
select(GEOID, NAME, everything())
}

}

# If label is requested, join it here
if (pop_group_label) {
if (is.null(pop_group)) {
rlang::abort("This argument is only available when specifying a population group, which is only available for selected datasets.")
}

labels = get_pop_groups(year = year, sumfile = sumfile)
dat2 <- dat2 %>%
dplyr::left_join(labels, by = "pop_group") %>%
dplyr::select(GEOID, NAME, pop_group, pop_group_label,
dplyr::everything())
}

# For ZCTAs, strip the state code from GEOID (issue #338 and #358)
Expand Down Expand Up @@ -445,7 +502,7 @@ get_decennial <- function(geography,
geom <- try(suppressMessages(use_tigris(geography = geography, year = year,
state = state, county = county, criteria = "2020", ...)))
} else if (sumfile == "cd118") {
stop("Geometry is not yet available for this sumfile, but will be in mid-September 2023.")
stop("Geometry is not yet available for this sumfile in tidycensus.")
# try(suppressMessages(use_tigris(geography = geography, year = 2022,
# state = state, county = county, ...)))
} else {
Expand Down
26 changes: 17 additions & 9 deletions R/helpers.R
Original file line number Diff line number Diff line change
Expand Up @@ -231,12 +231,15 @@ use_tigris <- function(geography, year, cb = TRUE, resolution = "500k",

# Right now, PUMAs are not defined for 2020 and are not in the CB file
# Use the 2019 CB pumas for 2020 and 2021 as they align with the boundaries
# used; 2022 should switch to the new 2020 PUMA boundaries (which aren't
# yet available)
# used; 2022 should switch to the new 2020 PUMA boundaries which are available in the 2020 CB file
if (year %in% 2020:2021) {
if (cb) {
year <- 2019
}
} else if (year >= 2022) {
if (cb) {
year <- 2020
}
}

state_ids <- c("AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA",
Expand All @@ -251,15 +254,20 @@ use_tigris <- function(geography, year, cb = TRUE, resolution = "500k",
}) %>%
rbind_tigris()
} else if (is.null(state)) {
pm <- purrr::map(state_ids, function(x) {
pumas(state = x, cb = cb, year = year, class = "sf", ...)
}) %>%
rbind_tigris()

if (!year %in% 2019:2020 && !cb) {
pm <- purrr::map(state_ids, function(x) {
pumas(state = x, cb = cb, year = year, class = "sf", ...)
}) %>%
rbind_tigris()
} else {
pm <- pumas(state = state, cb = cb, year = year, class = "sf", ...)
}
} else {
pm <- pumas(state = state, cb = cb, year = year, class = "sf", ...)
}

if (year > 2021) {
if ("GEOID20" %in% names(pm)) {
pm <- rename(pm, GEOID = GEOID20)
} else {
pm <- rename(pm, GEOID = GEOID10)
Expand Down Expand Up @@ -594,7 +602,7 @@ variables_from_table_decennial <- function(table, year, sumfile, cache_table) {

# Find all variables that match the table

if (year == 2020 && sumfile %in% c("pl", "dhc", "dp")) {
if (year == 2020 && sumfile %in% c("pl", "dhc", "dp", "ddhca")) {
vars <- df %>%
filter(grepl(paste0(table, "_[0-9]+"), name)) %>%
pull(name)
Expand Down Expand Up @@ -653,7 +661,7 @@ summary_files <- function(year) {
"gu", "vi", "cd113", "cd113profile",
"cd115", "cd115profile", "cd116")
} else if (year == 2020) {
sumfiles <- c("pl", "dhc", "dp", "pes", "dpas",
sumfiles <- c("pl", "dhc", "dp", "pes", "dpas", "ddhca",
"dpmp", "dpgu", "dpvi",
"dhcvi", "dhcgu", "dhcvi", "dhcas", "cd118")
} else {
Expand Down
38 changes: 32 additions & 6 deletions R/load_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,7 @@ load_data_acs <- function(geography, formatted_variables, key, year, state = NUL
}


load_data_decennial <- function(geography, variables, key, year, sumfile,
load_data_decennial <- function(geography, variables, key, year, sumfile, pop_group,
state = NULL, county = NULL, show_call = FALSE) {


Expand All @@ -331,6 +331,11 @@ load_data_decennial <- function(geography, variables, key, year, sumfile,
vars_to_get <- paste0(var, ",NAME")
}

if (pop_group == "all") {
vars_to_get <- paste0(vars_to_get, ",POPGROUP")
pop_group <- NULL
}


base <- paste0("https://api.census.gov/data/",
year,
Expand Down Expand Up @@ -405,21 +410,24 @@ load_data_decennial <- function(geography, variables, key, year, sumfile,

call <- GET(base, query = list(get = vars_to_get,
"for" = for_area,
key = key))
key = key,
"POPGROUP" = pop_group))
} else {

call <- GET(base, query = list(get = vars_to_get,
"for" = for_area,
"in" = in_area,
key = key))
key = key,
"POPGROUP" = pop_group))
}
}

else {

call <- GET(base, query = list(get = vars_to_get,
"for" = paste0(geography, ":*"),
key = key))
key = key,
"POPGROUP" = pop_group))
}

if (show_call) {
Expand All @@ -428,6 +436,22 @@ load_data_decennial <- function(geography, variables, key, year, sumfile,
}

# Make sure call status returns 200, else, print the error message for the user.
# Try to handle 204's here
if (call$status_code == 204) {

if (sumfile == "ddhca") {
rlang::abort(c("Your DDHC-A request returned No Content from the API.",
"i" = "The DDHC-A file uses an 'adaptive design' where data availability varies by geography and by population group.",
"i" = "Read Section 3-1 at https://www2.census.gov/programs-surveys/decennial/2020/technical-documentation/complete-tech-docs/detailed-demographic-and-housing-characteristics-file-a/2020census-detailed-dhc-a-techdoc.pdf for more information.",
"i" = "In tidycensus, use the function `check_ddhca_groups()` to see if your data is available."))

} else {
rlang::abort("No content was returned from the API. Please refine your selection.")

}

}

if (call$status_code != 200) {
msg <- content(call, as = "text")

Expand Down Expand Up @@ -480,9 +504,11 @@ load_data_decennial <- function(geography, variables, key, year, sumfile,
dat <- rename(dat, NAME = ANPSADPI)
}

dat[variables] <- lapply(dat[variables], as.numeric)
vnum <- variables[variables != "POPGROUP"]

dat[vnum] <- lapply(dat[vnum], as.numeric)

v2 <- c(variables, "NAME")
v2 <- c(variables, "NAME", "POPGROUP")

# Get the geography ID variables
id_vars <- names(dat)[! names(dat) %in% v2]
Expand Down
Loading

0 comments on commit 4bf8e83

Please sign in to comment.