Update build_update_taxon_list.R (#789)

- Add `dplyr::` prefixes and `tidyselect` syntax - Clean formatting
traitecoevo · Nov 23, 2023 · 51964db · 51964db
1 parent 10dabe6
commit 51964db
Showing 1 changed file with 125 additions and 92 deletions.
diff --git a/R/build_update_taxon_list.R b/R/build_update_taxon_list.R
@@ -1,63 +1,80 @@
 build_update_taxon_list <- function(austraits, taxon_list, replace = FALSE) {
-  
+
   resources <- APCalign::load_taxonomic_resources()
   ranks_in_database <- unique(austraits$taxa$taxon_rank)
-  
+
   higher_ranks <- c("phylum", "class", "order", "family", "genus")
   highest_ranks <- c("phylum", "class", "order")
   higher_ranks_taxon_list <- higher_ranks[higher_ranks %in% unique(austraits$taxa$taxon_rank)]
   highest_rank_column <- higher_ranks_taxon_list[1]
   highest_ranks_taxon_list <- highest_ranks[highest_ranks %in% unique(austraits$taxa$taxon_rank)]
-  
+
   # Create reduced APC list, just including a few columns and taxon ranks relevant to AusTraits
-  APC_tmp <- resources$APC %>% 
-    select(canonical_name, taxon_rank) %>% 
-    mutate(taxonomic_dataset_APC = "APC") %>% 
-    filter(taxon_rank %in% c("family", "genus", "species", "form", "subspecies", "variety", "series")) %>%
-    rename(aligned_name = canonical_name, taxon_rank_APC = taxon_rank)
-  
+  APC_tmp <- resources$APC %>%
+    dplyr::select(dplyr::all_of(c("canonical_name", "taxon_rank"))) %>%
+    dplyr::mutate(taxonomic_dataset_APC = "APC") %>%
+    dplyr::filter(taxon_rank %in% c("family", "genus", "species", "form", "subspecies", "variety", "series")) %>%
+    dplyr::rename(dplyr::all_of(c("aligned_name" = "canonical_name", "taxon_rank_APC" = "taxon_rank")))
+
   # Create reduced APNI list, just including a few columns and taxon ranks relevant to AusTraits
-  APNI_tmp <- resources$APNI %>% 
-    select(canonical_name, taxon_rank) %>% 
-    mutate(taxonomic_dataset_APNI = "APNI") %>% 
-    filter(taxon_rank %in% c("family", "genus", "species", "form", "subspecies", "variety", "series")) %>% 
-    rename(aligned_name = canonical_name, taxon_rank_APNI = taxon_rank)
-  
+  APNI_tmp <- resources$APNI %>%
+    dplyr::select(dplyr::all_of(c("canonical_name", "taxon_rank"))) %>%
+    dplyr::mutate(taxonomic_dataset_APNI = "APNI") %>%
+    dplyr::filter(taxon_rank %in% c("family", "genus", "species", "form", "subspecies", "variety", "series")) %>%
+    dplyr::rename(dplyr::all_of(c("aligned_name" = "canonical_name", "taxon_rank_APNI" = "taxon_rank")))
+
   # List of taxa that are explicitly excluded in metadata - don't want these in the taxon_list
   # These should be excluded from `taxonomic_updates` table during processing, but good to check
-  excluded_in_metadata <- austraits$excluded_data %>% filter(error == "Observation excluded in metadata") %>% distinct(original_name)
-
+  excluded_in_metadata <- austraits$excluded_data %>%
+    dplyr::filter(error == "Observation excluded in metadata") %>%
+    dplyr::distinct(original_name)
+
   # Start with taxonomic_updates table, which is all original names, aligned names, by dataset
-  all_taxa <- 
+  all_taxa <-
     austraits$taxonomic_updates %>%
     dplyr::select(dplyr::all_of(c("original_name", "aligned_name", "taxonomic_resolution"))) %>%
-    # In case the same `original_name`, `aligned_name` combination occurs twice, but only once with `taxonomic_resolution` attached, arrange names, taxon_ranks
+    # In case the same `original_name`, `aligned_name` combination occurs twice, but only once with
+    # `taxonomic_resolution` attached, arrange names, taxon_ranks
     dplyr::arrange(aligned_name, taxonomic_resolution) %>%
     # Keep unique names
     dplyr::distinct(original_name, aligned_name, .keep_all = TRUE) %>%
     # Need to merge in taxon_ranks for taxa that have not had a taxonomic alignment added in the metadata file
-    dplyr::left_join(by = "aligned_name",
-                     austraits$taxa %>%
-                       dplyr::select(dplyr::all_of(c("taxon_name", "taxon_rank"))) %>%
-                       dplyr::rename(aligned_name = taxon_name, taxon_rank_taxa = taxon_rank) %>%
-                       arrange(aligned_name, taxon_rank_taxa) %>%
-                       distinct(aligned_name, .keep_all = TRUE)
+    dplyr::left_join(
+      by = "aligned_name",
+      austraits$taxa %>%
+        dplyr::select(dplyr::all_of(c("taxon_name", "taxon_rank"))) %>%
+        dplyr::rename(dplyr::all_of(c("aligned_name" = "taxon_name", "taxon_rank_taxa" = "taxon_rank"))) %>%
+        dplyr::arrange(aligned_name, taxon_rank_taxa) %>%
+        dplyr::distinct(aligned_name, .keep_all = TRUE)
     ) %>%
     # For taxa with: 1) taxon_rank = genus, family, or 2) notes in []
     # Need to mutate a separate column with `name_to_match_to` for joining in other columns
     dplyr::mutate(
       name_to_match_to = aligned_name,
       taxonomic_resolution = ifelse(is.na(taxonomic_resolution), taxon_rank_taxa, taxonomic_resolution),
-      name_to_match_to = 
-        ifelse(taxonomic_resolution %in% c("family", "genus"), 
-               ifelse(stringr::word(name_to_match_to, 1) == "x", stringr::word(name_to_match_to, start = 1, end = 2), stringr::word(name_to_match_to, 1)), 
-                       name_to_match_to),
-      name_to_match_to = ifelse(stringr::str_detect(name_to_match_to, "\\[") & taxonomic_resolution %in% c("species"), stringr::word(name_to_match_to, start = 1, end = 2), name_to_match_to),
-      name_to_match_to = ifelse(stringr::str_detect(name_to_match_to, "\\[") & taxonomic_resolution %in% c("subspecies", "variety", "form"), stringr::word(name_to_match_to, start = 1, end = 3), name_to_match_to),
+      name_to_match_to =
+        ifelse(
+          taxonomic_resolution %in% c("family", "genus"),
+          ifelse(
+            stringr::word(name_to_match_to, 1) == "x",
+            stringr::word(name_to_match_to, start = 1, end = 2),
+            stringr::word(name_to_match_to, 1)),
+          name_to_match_to),
+      name_to_match_to = ifelse(
+        stringr::str_detect(name_to_match_to, "\\[") & taxonomic_resolution %in% c("species"),
+        stringr::word(name_to_match_to, start = 1, end = 2),
+        name_to_match_to),
+      name_to_match_to = ifelse(
+        stringr::str_detect(name_to_match_to, "\\[") & taxonomic_resolution %in% c("subspecies", "variety", "form"),
+        stringr::word(name_to_match_to, start = 1, end = 3),
+        name_to_match_to),
       ) %>%
-    # Merge in taxon_ranks & taxonomic_dataset for all aligned name; this information not yet present for most names where original_name = aligned_name
-    dplyr::left_join(APC_tmp %>% rename(name_to_match_to = aligned_name) %>% distinct(name_to_match_to, .keep_all = TRUE)) %>%
-    dplyr::left_join(APNI_tmp %>% rename(name_to_match_to = aligned_name) %>% distinct(name_to_match_to, .keep_all = TRUE)) %>%
+    # Merge in taxon_ranks & taxonomic_dataset for all aligned name; this information not yet present for
+    # most names where original_name = aligned_name
+    dplyr::left_join(APC_tmp %>% dplyr::rename("name_to_match_to" = "aligned_name") %>%
+    dplyr::distinct(name_to_match_to, .keep_all = TRUE)) %>%
+    dplyr::left_join(APNI_tmp %>% dplyr::rename("name_to_match_to" = "aligned_name") %>%
+    dplyr::distinct(name_to_match_to, .keep_all = TRUE)) %>%
     dplyr::mutate(
       taxon_rank = taxon_rank_APC,
       taxon_rank = ifelse(is.na(taxon_rank), taxon_rank_APNI, taxon_rank),
@@ -67,37 +84,41 @@ build_update_taxon_list <- function(austraits, taxon_list, replace = FALSE) {
       ) %>%
     # Remove taxa that are excluded in metadata
     dplyr::filter(!(original_name %in% excluded_in_metadata$original_name & is.na(taxon_rank)))
-  
+
   # Filter out taxa that need to be run through `APCalign::align_taxa()` first
   taxa_for_taxon_list <- all_taxa %>%
-    filter(!(is.na(taxon_rank) & is.na(taxonomic_dataset))) %>%
-    select(original_name, aligned_name, taxon_rank, taxonomic_dataset) %>%
-    mutate(aligned_reason = NA)
-  
+    dplyr::filter(!(is.na(taxon_rank) & is.na(taxonomic_dataset))) %>%
+    dplyr::select(dplyr::all_of(c("original_name", "aligned_name", "taxon_rank", "taxonomic_dataset"))) %>%
+    dplyr::mutate(aligned_reason = NA)
+
   # Use function `APCalign::update_taxonomy` to update names and add identifier columns
   updated <- APCalign::update_taxonomy(taxa_for_taxon_list, resources = resources)
-  
-  taxon_list_new <- updated %>% 
+
+  taxon_list_new <- updated %>%
     # Remove columns from APCalign's output that aren't needed
-    dplyr::select(-dplyr::any_of(c("accepted_name", "row_number", "number_of_collapsed_taxa", "taxonomic_status_aligned", "update_reason", "aligned_reason", "scientific_name_authorship"))) %>%
+    dplyr::select(-dplyr::any_of(
+      c("accepted_name", "row_number", "number_of_collapsed_taxa", "taxonomic_status_aligned",
+      "update_reason", "aligned_reason", "scientific_name_authorship"))) %>%
     # Rename columns to match AusTraits conventions
     dplyr::rename(dplyr::all_of(c(
-        "taxon_name" = "suggested_name",
-        "aligned_name" = "aligned_name",
-        "taxonomic_dataset" = "taxonomic_dataset",
-        "taxon_id" = "taxon_ID",
-        "scientific_name_id" = "scientific_name_ID",
-        "taxon_id_genus" = "taxon_ID_genus"
+      "taxon_name" = "suggested_name",
+      "aligned_name" = "aligned_name",
+      "taxonomic_dataset" = "taxonomic_dataset",
+      "taxon_id" = "taxon_ID",
+      "scientific_name_id" = "scientific_name_ID",
+      "taxon_id_genus" = "taxon_ID_genus"
       ))) %>%
     # In AusTraits we also want to document identifiers for `aligned_names`, not just for the final `taxon_name`
     # We do this by rejoining columns from APC, but now to the aligned_names, not the taxon_names
     dplyr::left_join(by = c("aligned_name", "taxon_name"),
-      resources$APC %>% 
+      resources$APC %>%
         dplyr::mutate(
           accepted_name = resources$`APC list (accepted)`$canonical_name[match(accepted_name_usage_ID, resources$`APC list (accepted)`$taxon_ID)],
           taxon_name = accepted_name
-          ) %>%
-        dplyr::select(dplyr::all_of(c("scientific_name_ID", "taxonomic_status", "taxon_ID", "accepted_name", "taxon_name", "canonical_name"))) %>%
+        ) %>%
+        dplyr::select(dplyr::all_of(c(
+          "scientific_name_ID", "taxonomic_status", "taxon_ID",
+          "accepted_name", "taxon_name", "canonical_name"))) %>%
         dplyr::rename(dplyr::all_of(c(
           "aligned_name_taxonomic_status" = "taxonomic_status",
           "aligned_name_taxon_id" = "taxon_ID",
@@ -106,15 +127,16 @@ build_update_taxon_list <- function(austraits, taxon_list, replace = FALSE) {
           ))) %>%
         dplyr::distinct(aligned_name, taxon_name, .keep_all = TRUE)
     ) %>%
-    # For taxon names that are aligned at the genus- or family-level, we need to replace the taxon & scientific name identifiers
-    # with those for the relevant genus or family.
+    # For taxon names that are aligned at the genus- or family-level, we need to replace the taxon & scientific
+    # name identifiers with those for the relevant genus or family
     dplyr::mutate(
       # Genus filled in for all names that have a taxonomic of genus or more detailed
       genus = ifelse(
         !.data$taxon_rank %in% c("family", "order", "class", "phylum", "kingdom"),
-        ifelse(stringr::word(.data$taxon_name, 1) == "x",
-               stringr::word(.data$taxon_name, start = 1, end = 2),
-               stringr::word(.data$taxon_name, 1)),
+        ifelse(
+          stringr::word(.data$taxon_name, 1) == "x",
+          stringr::word(.data$taxon_name, start = 1, end = 2),
+          stringr::word(.data$taxon_name, 1)),
         NA),
       taxon_rank = ifelse(
         taxonomic_status == "unknown" & aligned_name %in% austraits$taxonomic_updates$aligned_name,
@@ -129,79 +151,90 @@ build_update_taxon_list <- function(austraits, taxon_list, replace = FALSE) {
           resources$`APC list (accepted)`$taxon_rank[match(taxon_id, resources$`APC list (accepted)`$taxon_ID)],
           taxon_rank
         )),
-      # For taxon names that are valid names (per herbarium standards) or repeatedly reported invasives, but not in APC/APNI, map on families, genus_ids - APCalign doesn't do this
+      # For taxon names that are valid names (per herbarium standards) or repeatedly reported invasives, but
+      # not in APC/APNI, map on families, genus_ids - APCalign doesn't do this
       taxon_id_genus = ifelse(
         taxonomic_status == "unknown" & aligned_name %in% austraits$taxonomic_updates$aligned_name,
-        resources$genera_all$taxon_ID[match(genus, resources$genera_all$genus)],
-        taxon_id_genus
+        resources$genera_all$taxon_ID[match(genus, resources$genera_all$genus)], taxon_id_genus
       ),
       family = ifelse(
         taxonomic_status == "unknown" & aligned_name %in% austraits$taxonomic_updates$aligned_name,
-        resources$APC$family[match(genus, resources$APC$genus)],
-        family
+        resources$APC$family[match(genus, resources$APC$genus)], family
       ),
       taxon_id_family = resources$family_accepted$taxon_ID[match(family, resources$family_accepted$canonical_name)],
       taxon_id = ifelse(taxon_rank %in% c("genus", "family"), NA, taxon_id),
-      scientific_name_id = ifelse(taxonomic_dataset == "APC", resources$`APC list (accepted)`$scientific_name_ID[match(scientific_name, resources$`APC list (accepted)`$scientific_name)], scientific_name_id),
+      scientific_name_id = ifelse(
+        taxonomic_dataset == "APC",
+        resources$`APC list (accepted)`$scientific_name_ID[match(scientific_name, resources$`APC list (accepted)`$scientific_name)],
+        scientific_name_id),
       scientific_name_id = ifelse(taxon_rank %in% c("genus", "family"), NA, scientific_name_id)
     ) %>%
     # The function `APCalign::update_taxonomy` includes alternative possible names as part of the taxon name.
     # We want this information in a separate column.
-    dplyr::mutate(
-        taxon_name = stringr::str_split(taxon_name, "\\[alternative possible names\\:")) %>%
+    dplyr::mutate(taxon_name = stringr::str_split(taxon_name, "\\[alternative possible names\\:")) %>%
     tidyr::unnest_wider(taxon_name, names_sep = "_") %>%
-    dplyr::rename(taxon_name = taxon_name_1, taxon_name_alternatives = taxon_name_2) %>%
-    dplyr::mutate(
-      taxon_name_alternatives = stringr::str_replace(taxon_name_alternatives, "\\]$", "")
-      ) %>%
+    dplyr::rename(dplyr::all_of(c("taxon_name" = "taxon_name_1", "taxon_name_alternatives" = "taxon_name_2"))) %>%
+    dplyr::mutate(taxon_name_alternatives = stringr::str_replace(taxon_name_alternatives, "\\]$", "")) %>%
     # Add in data for genus, binomial and trinomial, as appropriate.
     dplyr::mutate(
-      trinomial = ifelse(.data$taxon_rank %in% c("subspecies", "form", "variety", "series"),
-                         stringr::str_split_fixed(.data$taxon_name, "\\[", 2)[, 1] %>% stringr::str_trim(), NA),
+      trinomial = ifelse(
+        .data$taxon_rank %in% c("subspecies", "form", "variety", "series"),
+        stringr::str_split_fixed(.data$taxon_name, "\\[", 2)[, 1] %>% stringr::str_trim(), NA),
       # Field binomial is filled in if taxonomic resolution is an infraspecific name or a binomial
       # All taxon names that have "extra" information (beyond the actual name) have been formatted
       # to have that information in square brackets '[]', so these can be used as a delimitor to
       # extract the actual name
-      binomial = ifelse(.data$taxon_rank %in% c("species"),
-                        stringr::str_split_fixed(.data$taxon_name, "\\[", 2)[, 1] %>% stringr::str_trim(), NA),
-      binomial = ifelse(.data$taxon_rank %in% c("subspecies", "form", "variety", "series"),
-                        stringr::word(.data$taxon_name, start = 1, end = 2), .data$binomial),
+      binomial = ifelse(
+        .data$taxon_rank %in% c("species"),
+        stringr::str_split_fixed(.data$taxon_name, "\\[", 2)[, 1] %>% stringr::str_trim(), NA),
+      binomial = ifelse(
+        .data$taxon_rank %in% c("subspecies", "form", "variety", "series"),
+        stringr::word(.data$taxon_name, start = 1, end = 2), .data$binomial),
       binomial = stringr::str_trim(.data$binomial)
     ) %>%
     # Add in `establishment_means`, indicating if a taxon is native, naturalised or both
-    # This code is based on the exact syntax for taxon_distribution in APC; 
+    # This code is based on the exact syntax for taxon_distribution in APC;
     # the word `native` is used only if a taxon is both native and naturalised in a state
     dplyr::mutate(
       count_naturalised = stringr::str_count(.data$taxon_distribution, "naturalised"),
       count_n_and_n = stringr::str_count(.data$taxon_distribution, "native and naturalised"),
       count_states = stringr::str_count(.data$taxon_distribution, ",") + 1,
       establishment_means = ifelse(.data$count_naturalised > 0 & .data$count_n_and_n == 0, "naturalised", NA),
-      establishment_means = ifelse(.data$count_n_and_n > 0 | (.data$count_naturalised > 0 & .data$count_states > .data$count_naturalised), "native and naturalised", .data$establishment_means),
-      establishment_means = ifelse(.data$count_naturalised == 0 & .data$count_n_and_n == 0, "native", .data$establishment_means),
+      establishment_means = ifelse(
+        .data$count_n_and_n > 0 | (.data$count_naturalised > 0 & .data$count_states > .data$count_naturalised),
+        "native and naturalised", .data$establishment_means),
+      establishment_means = ifelse(
+        .data$count_naturalised == 0 & .data$count_n_and_n == 0,
+        "native", .data$establishment_means),
       establishment_means = ifelse(.data$taxon_rank %in% higher_ranks, NA, .data$establishment_means),
       taxon_distribution = ifelse(.data$taxon_rank %in% higher_ranks, NA, .data$taxon_distribution)
       ) %>%
-    dplyr::select(dplyr::all_of(c("aligned_name", "taxon_name", "taxon_rank", "taxonomic_status", "taxonomic_dataset", "taxon_name_alternatives",
-                                  "genus", "family", "binomial", "trinomial", "taxon_distribution", "establishment_means", "scientific_name",
-                                  "taxon_id", "taxon_id_genus", "taxon_id_family", "scientific_name_id", "aligned_name_taxon_id", "aligned_name_taxonomic_status")))
+    dplyr::select(dplyr::all_of(
+      c("aligned_name", "taxon_name", "taxon_rank", "taxonomic_status", "taxonomic_dataset",
+      "taxon_name_alternatives", "genus", "family", "binomial", "trinomial", "taxon_distribution",
+      "establishment_means", "scientific_name", "taxon_id", "taxon_id_genus", "taxon_id_family",
+      "scientific_name_id", "aligned_name_taxon_id", "aligned_name_taxonomic_status")))
+
+    # New taxon list
 
-    # New taxon list  
-
    if (replace == TRUE) {
-      taxon_list_replace <- taxon_list_new %>% 
-        arrange(taxon_name, aligned_name) %>%
-        distinct(taxon_name, aligned_name, .keep_all = TRUE)
-
+
+      taxon_list_replace <- taxon_list_new %>%
+        dplyr::arrange(taxon_name, aligned_name) %>%
+        dplyr::distinct(taxon_name, aligned_name, .keep_all = TRUE)
+
    } else {
-     
+
       taxon_list_replace <- taxon_list %>%
         # First bind rows for cleaned names not yet in AusTraits taxon_list.csv file
-        bind_rows(taxon_list_new %>% filter(!aligned_name %in% taxon_list$aligned_name)) %>%
+        dplyr::bind_rows(taxon_list_new %>% dplyr::filter(!aligned_name %in% taxon_list$aligned_name)) %>%
         # Arrange by names - hopefully this will be best solution for keeping GitHub commits more transparent
-        arrange(taxon_name, aligned_name) %>%
-        distinct(taxon_name, aligned_name, .keep_all = TRUE)
+        dplyr::arrange(taxon_name, aligned_name) %>%
+        dplyr::distinct(taxon_name, aligned_name, .keep_all = TRUE)
+
    }
-  
-  taxon_list_replace %>% 
+
+  taxon_list_replace %>%
       readr::write_csv("config/taxon_list.csv", na = "")
+
 }