Merge pull request #139 from weecology/juniper_active

v0.11.0
weecology · Sep 15, 2019 · c66cc46 · c66cc46
2 parents 314b52f + 94408e0
commit c66cc46
Show file tree

Hide file tree

Showing 29 changed files with 664 additions and 376 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: portalcasting
 Title: Functions Used in Predicting Portal Rodent Dynamics
-Version: 0.10.0
+Version: 0.11.0
 Authors@R: c(
     person(c("Juniper", "L."), "Simonis",
       email = "juniper.simonis@weecology.org", role = c("aut", "cre"),

diff --git a/NAMESPACE b/NAMESPACE
@@ -38,6 +38,7 @@ export(download_climate_casts)
 export(download_destin)
 export(download_message)
 export(download_url)
+export(ensemble_casts)
 export(error_if_deep)
 export(evalplot_species)
 export(extract_data_sets)
@@ -68,6 +69,7 @@ export(model_controls)
 export(model_template)
 export(models_path)
 export(models_to_cast)
+export(most_abundant_species)
 export(na_conformer)
 export(name_from_url)
 export(named_null_list)

diff --git a/NEWS.md b/NEWS.md
@@ -2,6 +2,20 @@
 
 Version numbers follow [Semantic Versioning](https://semver.org/).
 
+
+# [portalcasting 0.11.0](https://github.com/weecology/portalcasting/releases/tag/v0.11.0)
+*2019-09-14*
+
+### Ensembling reintroduced
+* Associated with the reconfiguration of portalcasting from v0.8.1 to 0.9.0, ensembling was removed temporarily.
+* A basic ensemble is reintroduced, now as an unweighted average across all selected models, allowing us to have an ensemble but not have it be tied to AIC weighting (because AIC weighting is no longer possible with the split between interpolated and non-interpolated data for model fitting).
+* In a major departure from v0.8.1 and earlier, the ensemble's output is not saved like the actual models'. Rather, it is only calculated when needed on the fly.
+* In plotting, it is now the default to use the ensemble for `plot_cast_ts` and `plot_cast_point` and for the ensemble to be included in `plot_casts_err_lead` and `plot_casts_cov_RMSE`.
+
+### Return of `most_abundant_species`
+* Function used to select the most common species.
+* Now uses the actual data and not the casts to determine the species.
+
 # [portalcasting 0.10.0](https://github.com/weecology/portalcasting/releases/tag/v0.10.0)
 *2019-09-13*
 

diff --git a/R/args.R b/R/args.R
@@ -44,8 +44,10 @@
 #'   \code{cleanup},
 #'   \code{covariatesTF}, 
 #'   \code{effort},
+#'   \code{ensemble},
 #'   \code{fillweight}, 
 #'   \code{interpolate}, 
+#'   \code{include_interp}, 
 #'   \code{na_drop}, 
 #'   \code{nadot}, 
 #'   \code{NULLname},
@@ -81,10 +83,11 @@
 #'   \code{freq},  
 #'   \code{level}, 
 #'   \code{main}, 
+#'   \code{method} (if not \code{NULL}, must be \code{"unwtavg"}),
 #'   \code{model} (inputted values are checked via 
 #'    \code{\link{verify_models}}), 
 #'   \code{name}, 
-#'   \code{output} (if not \code{NULL}, must be \code{"abundances"}),
+#'   \code{output} (if not \code{NULL}, must be \code{"abundance"}),
 #'   \code{path},
 #'   \code{plots} (if not \code{NULL}, must be \code{"all"} or 
 #'    \code{"longterm"}),
@@ -200,7 +203,7 @@
 #'   \code{ndates} (must be positive),
 #'   \code{nmoons} (must be non-negative),
 #'   \code{start_moon} (must be positive),
-#'   \code{topx} (must be non-negative).
+#'   \code{topx} (must be positive).
 #'
 #'  Must be length-1 \code{integer}-conformable values can be \code{NULL}, 
 #'  and can be \code{NA}:
@@ -209,6 +212,7 @@
 #'
 #'  Must be \code{integer}-conformable values, can be any length, can be 
 #'  \code{NULL}, but cannot be \code{NA}:
+#'   \code{cast_groups} (must be non-negative),
 #'   \code{cast_ids} (must be non-negative),
 #'   \code{end_moons} (must be positive),
 #'   \code{target_moons} (must be positive).
@@ -483,6 +487,7 @@ check_arg_list <- function(){
   }
 
 
+  avail_methods <- c("unwtavg")
   avail_outputs <- c("abundance")
   avail_plots <- c("all", "longterm")
   avail_species <- c("BA", "DM", "DO", "DS", "NA", "NA.", "OL", "OT", "PB", 
@@ -500,6 +505,7 @@ check_arg_list <- function(){
     append_cast_csv = arg_logical(),
     arg_checks = arg_logical(),
     cast = arg_cast(),
+    cast_groups = arg_nonnegintnum(NULL),
     cast_ids = arg_nonnegintnum(NULL),
     cast_id = arg_nonnegintnum(),
     cast_cov = arg_df(),
@@ -534,6 +540,7 @@ check_arg_list <- function(){
     downloads = arg_list(),
     downloads_versions = arg_character(NULL),
     effort = arg_logical(),
+    ensemble = arg_logical(),
     end = arg_date(),
     end_moon = arg_posintnum(),
     end_moons = arg_posintnum(NULL),
@@ -552,6 +559,7 @@ check_arg_list <- function(){
     hist_cov = arg_df(),
     hist_tab = arg_df(),
     interpolate = arg_logical(null = TRUE),
+    include_interp = arg_logical(null = TRUE),
     lag = arg_nonnegintnum(null = TRUE, na = TRUE),
     lat = arg_numeric(),
     lead = arg_posintnum(),
@@ -561,6 +569,7 @@ check_arg_list <- function(){
     lon = arg_numeric(),
     main = arg_character(),
     metadata = arg_list(),
+    method = arg_character(vals = avail_methods),
     min_lag  = arg_nonnegintnum(na = TRUE),
     min_observed = arg_posintnum(),
     min_plots = arg_posintnum(),
@@ -611,6 +620,7 @@ check_arg_list <- function(){
     target_cols = arg_character(NULL),
     time = arg_character(),
     title = arg_character(),
+    topx = arg_posintnum(),
     total = arg_logical(),
     treatment = arg_character(vals = avail_treatments),
     type = arg_character(),

diff --git a/R/ensembling.R b/R/ensembling.R
@@ -0,0 +1,180 @@
+#' @title Combine (ensemble) casts
+#'
+#' @description Combine multiple casts' output into a single ensemble. 
+#'  Presently, only a general average ensemble is available.
+#'
+#' @details A pre-loaded table of casts can be input, but if not (default), 
+#'  the table will be efficiently (as defined by the inputs) loaded and 
+#'  trimmed. \cr 
+#'  The casts can be trimmed specifically using the \code{cast_ids} input,
+#'  otherwise, all relevant casts from the stated \code{cast_groups}
+#'  will be included. 
+#'
+#' @param main \code{character} value of the name of the main component of
+#'  the directory tree.
+#'
+#' @param method \code{character} value of the name of the ensemble method
+#'  to use. Presently, only \code{"unwtavg"} (unweighted average) is allowed.
+#'
+#' @param cast_groups \code{integer} (or integer \code{numeric}) value
+#'  of the cast group to combine with an ensemble. If \code{NULL} (default),
+#'  the most recent cast group is ensembled. 
+#'
+#' @param cast_ids \code{integer} (or integer \code{numeric}) values 
+#'  representing the casts of interest for restricting ensembling, as indexed
+#'  within the directory in the \code{casts} sub folder. 
+#'  See the casts metadata file (\code{casts_metadata.csv}) for summary
+#'  information.
+#'
+#' @param end_moon \code{integer} (or integer \code{numeric}) 
+#'  newmoon number of the forecast origin. Default value is 
+#'  \code{NULL}, which equates to no selection with respect to 
+#'  \code{end_moon}.
+#'
+#' @param cast_tab Optional \code{data.frame} of cast table outputs. If not
+#'  input, will be loaded.
+#'
+#' @param models \code{character} value(s) of the name of the model to 
+#'  include. Default value is \code{NULL}, which equates to no selection with 
+#'  respect to \code{model}. \code{NULL} translates to all \code{models}
+#'  in the table.
+#'
+#' @param data_set \code{character} value of the rodent data set to include
+#'  Default value is \code{NULL}, which equates to the first data set 
+#'  encountered.
+#'
+#' @param include_interp \code{logical} indicator of if the basic data set
+#'  names should also be inclusive of the associated interpolated data sets.
+#'
+#' @param arg_checks \code{logical} value of if the arguments should be
+#'  checked using standard protocols via \code{\link{check_args}}. The 
+#'  default (\code{arg_checks = TRUE}) ensures that all inputs are 
+#'  formatted correctly and provides directed error messages if not. 
+#'
+#' @param species \code{character} vector of the species code(s) 
+#'  or \code{"total"} for the total across species) to be plotted 
+#'  \code{NULL} translates to the species defined by  
+#'  \code{evalplot_species}.
+#'
+#' @return \code{NULL}. Plot is generated.
+#' 
+#' @examples
+#'  \donttest{
+#'   setup_dir()
+#'   portalcast(models = c("AutoArima", "ESSS"), end_moons = 515)
+#'   ensemble_casts()
+#' }
+#'
+#' @export
+#'
+ensemble_casts <- function(main = ".", method = "unwtavg", 
+                           cast_groups = NULL, cast_ids = NULL, 
+                           cast_tab = NULL, end_moon = NULL, 
+                           models = NULL, data_set = NULL, 
+                           include_interp = TRUE,
+                           species = NULL, arg_checks = TRUE){
+
+  check_args(arg_checks = arg_checks)
+  if(is.null(cast_tab)){
+    cast_choices <- select_casts(main = main, cast_ids = cast_ids, 
+                                 models = models, end_moons = end_moon, 
+                                 data_sets = data_set, 
+                                 include_interp = include_interp,
+                                 arg_checks = arg_checks)
+    if(NROW(cast_choices) == 0){
+      stop("no casts available for request")
+    }else{
+      cast_tab <- read_cast_tabs(main = main, cast_ids = cast_choices$cast_id,
+                                 arg_checks = arg_checks)
+      cast_tab <- add_obs_to_cast_tab(main = main, cast_tab = cast_tab,
+                                      arg_checks = arg_checks)
+      cast_tab <- add_err_to_cast_tab(main = main, cast_tab = cast_tab,
+                                      arg_checks = arg_checks)
+      cast_tab <- add_lead_to_cast_tab(main = main, cast_tab = cast_tab,
+                                       arg_checks = arg_checks)
+      cast_tab <- add_covered_to_cast_tab(main = main, cast_tab = cast_tab,
+                                          arg_checks = arg_checks)
+    }
+  }
+  cast_tab$data_set <- gsub("_interp", "", cast_tab$data_set)
+  cast_ids <- ifnull(cast_ids, unique(cast_tab$cast_id))
+  models <- ifnull(models, unique(cast_tab$model))
+  data_set <- ifnull(data_set, "controls")
+  species <- ifnull(species, 
+                    base_species(total = TRUE, arg_checks = arg_checks)) 
+  end_moon <- ifnull(end_moon, max(unique(na.omit(cast_tab)$end_moon))) 
+  cast_groups <- ifnull(cast_groups, unique(cast_tab$cast_group))
+  cast_id_in <- cast_tab$cast_id %in% cast_ids
+  model_in <- cast_tab$model %in% models
+  data_set_in <- cast_tab$data_set == data_set
+  species_in <- cast_tab$species %in% species
+  end_moon_in <- cast_tab$end_moon %in% end_moon
+  cast_group_in <- cast_tab$cast_group %in% cast_groups
+  all_in <- cast_id_in & model_in & data_set_in & species_in & end_moon_in &
+            cast_group_in
+  if(sum(all_in) == 0){
+    stop("no casts available for requested")
+  }
+  cast_tab <- cast_tab[all_in, ]
+  nspecies <- length(species)
+  moons <- unique(cast_tab$moon)
+  nmoons <- length(moons)
+  nmodels <- length(models)
+  weight <- 1/nmodels
+  estimate <- rep(NA, nspecies * nmoons)
+  mvar <- rep(NA, nspecies * nmoons)
+  l_pi <- rep(NA, nspecies * nmoons)
+  u_pi <- rep(NA, nspecies * nmoons)
+  obs <- rep(NA, nspecies * nmoons)
+  error <- rep(NA, nspecies * nmoons)
+  end_moon_id <- rep(NA, nspecies * nmoons)
+  ecast_id <- rep(NA, nspecies * nmoons)
+  species_id <- rep(NA, nspecies * nmoons)
+  moon_id <- rep(NA, nspecies * nmoons)
+  covered <- rep(NA, nspecies * nmoons)
+  nmodels <- length(models)
+  counter <- 1
+  for(i in 1:nspecies){
+    for(j in 1:nmoons){
+      species_in <- cast_tab$species %in% species[i]
+      moon_in <- cast_tab$moon %in% moons[j]
+      all_in <- species_in & moon_in
+      pcast_tab <- cast_tab[all_in, ]
+
+      estimates <- na.omit(pcast_tab$estimate)
+      if(length(estimates) > 0){
+        mvars <- ((na.omit(pcast_tab$upper_pi) - na.omit(pcast_tab$estimate))/
+                  (na.omit(pcast_tab$confidence_level))) ^2
+        estimate[counter] <- sum(estimates * weight)
+        wt_ss <- sum(weight * (estimates - estimate[counter]) ^ 2)
+        if(nmodels > 1){
+          mvar[counter] <- sum((mvars * weight) + (wt_ss / (nmodels - 1)))
+        }else{
+          mvar[counter] <- mvars
+        }
+        CL <- unique(na.omit(pcast_tab$confidence_level))
+        u_pi[counter] <- estimate[counter] + sqrt(mvar[counter] * CL)
+        l_pi[counter] <- estimate[counter] - sqrt(mvar[counter] * CL)
+        obs[counter] <- unique(pcast_tab$obs)
+        error[counter] <- estimate[counter] - obs[counter]
+        end_moon_id[counter] <- unique(pcast_tab$end_moon)
+        ecast_id[counter] <- as.numeric(paste0(9999, min(pcast_tab$cast_id)))
+        moon_id[counter] <- moons[j]
+        species_id[counter] <- species[i]
+        covered[counter] <- estimate[counter] >= l_pi[counter] &
+                            estimate[counter] <= u_pi[counter]
+      }
+      counter <- counter + 1
+
+    }  
+  }
+
+  ensemble_name <- paste0("ensemble_", method)
+
+  lead <- moon_id - end_moon_id
+  data.frame(model = ensemble_name, moon = moon_id, species = species_id,
+             estimate = estimate, var = mvar, lower_pi = l_pi, 
+             upper_pi = u_pi, obs = obs, error = error, 
+             end_moon = end_moon_id, lead = lead, data_set = data_set,
+             cast_id = ecast_id, covered = covered, stringsAsFactors = FALSE) 
+}