ENH: area2catena()/prof_class(): added possibility to export/import c…

…ompressed RData files as catena_out/catena_file to speed up writing/reading operations
tpilz · Jul 20, 2018 · 24e281c · 24e281c
1 parent 3fc4c67
commit 24e281c
Show file tree

Hide file tree

Showing 4 changed files with 62 additions and 20 deletions.
diff --git a/R/area2catena.R b/R/area2catena.R
@@ -36,7 +36,10 @@
 #' @param dir_out Character string specifying output directory (will be created;
 #'      nothing will be overwritten).
 #' @param catena_out Output: Name of output file containing mean catena information
-#'      as input for \code{\link[lumpR]{prof_class}}.
+#'      as input for \code{\link[lumpR]{prof_class}}. To save computation time and
+#'      memory in cases of large catchments and/or high resolution, specify a file
+#'      name with ending \code{*.RData}. In such a case, a compressed RData file will be
+#'      written instead of a text file. 
 #' @param catena_head_out Output: Name of output header file containing meta-information
 #'      as input for \code{\link[lumpR]{prof_class}}; manual adjustment necessary.
 #' @param ridge_thresh Integer specifying threshold of flow accumulation, below
@@ -74,7 +77,13 @@
 #'      \bold{IMPORTANT:} Herein, when specifying the GRASS input maps, please do
 #'      explicitly refer to the mapset if it is different from the mapset given in
 #'      initGRASS() (even PERMANENT!), as otherwise internally used readRAST() command
-#'      resulted in errors under Windows. 
+#'      resulted in errors under Windows.
+#'      
+#'      In case of \bold{long computation times or memory issues}, try \code{plot_catena = FALSE}
+#'      and specify an RData file as \code{catena_out}. Furthermore, make sure your
+#'      raster maps are not overly large (i.e. containing lots of NULL values around
+#'      the region of interest) and are of type \code{CELL} (i.e. integer) where it makes
+#'      sense (check with \code{r.info} in GRASS).
 #'      
 #'      GUIs such as RStudio may not produce some runtime messages (within parallel
 #'      foreach loop).
@@ -395,16 +404,18 @@ area2catena <- function(
 
 
 
-    # write output
+# write output #---------------------------------------------------------------
     #out_pre <- mapply(logdata[,c(3:length(logdata))], FUN=function(x) formatC(x, format="f", digits=3))
     #out_fmt <- cbind(logdata[,c(1,2)], out_pre)
     #format output to reasonable number of digits
     logdata <- round(logdata,3)
 
-    write.table(logdata, paste(dir_out,catena_out, sep="/"), col.names=F, row.names=F, quote=F, sep="\t")
-
+    if(grepl(".RData$", catena_out)) {
+      save(logdata, file=paste(dir_out,catena_out, sep="/"))
+    } else {
+      write.table(logdata, paste(dir_out,catena_out, sep="/"), col.names=F, row.names=F, quote=F, sep="\t")
+    }
 
-# write output #---------------------------------------------------------------
     # write header file
     write("#This file works as a header to the output of area2catena. Don't add additional headerlines.",
           file=paste(dir_out,catena_head_out, sep="/"), append=F)
@@ -435,7 +446,7 @@ area2catena <- function(
     #Generate output files for reclassification (input class-IDs vs. internally used IDs)
     #(area2catena creates continuous class numbering; restoring the orginal classes will require these files)
 
-    if (grass_files | "svc" %in% supp_qual) {
+    if (grass_files | any(grepl("svc", supp_qual))) {
       for (i in supp_qual) {
         write(c("new_id", "original_id"),
               file=paste(dir_out, "/reclass_", i, ".txt", sep=""), ncolumns=2, append=F, sep="\t")

diff --git a/R/prof_class.R b/R/prof_class.R
@@ -72,7 +72,8 @@
 #'      Default: \code{FALSE}.
 #' @param plot_silhouette \code{logical}. Shall a silhouette plot (illustrating the clustering
 #'      process) be generated? Consumes much memory and processing time and should be disabled,
-#'      if a memory error is thrown. Default: \code{TRUE}.
+#'      if a memory error is thrown. Will be \code{FALSE} if \code{make_plots = FALSE}.
+#'      Default: \code{TRUE}.
 #'      
 #' @return Function returns nothing. Output files are written into output directory
 #'      as specified in arguments.
@@ -85,6 +86,9 @@
 #'      as each landscape unit refers to the representative EHA. The gaps can be filled
 #'      with GRASS function \code{r.grow}.
 #'      
+#'      In case of \bold{long computation times or memory issues}, try \code{make_plots = FALSE}
+#'      and specify an RData file as \code{catena_file} (already in \code{\link[lumpR]{area2catena}}).
+#'      
 #' @details This function first resamples the catenas derived from \code{\link[lumpR]{area2catena}}
 #'      to a common length (\code{com_length} or the median number of support points
 #'      of all catenas but not more than \code{max_com_length}). Second, k-means clustering
@@ -254,9 +258,13 @@ prof_class <- function(
     #   com_length <- -1
 
     # load standard catena data
-    stats <- scan(catena_file, nlines = 1, what=numeric(), sep = "\t", quiet = TRUE) #read first line only
-    stats <- read.table(file = catena_file, colClasses = c("numeric", rep("NULL", length(stats)-1)), sep = "\t") #read first column only
-
+    if(grepl(".RData$", catena_file)) {
+      load(catena_file)
+      stats <- logdata[,1,drop=F]
+    } else {
+      stats <- scan(catena_file, nlines = 1, what=numeric(), sep = "\t", quiet = TRUE) #read first line only
+      stats <- read.table(file = catena_file, colClasses = c("numeric", rep("NULL", length(stats)-1)), sep = "\t") #read first column only
+    }
 
     profpoints <- table(stats[,1])  #count number of points of each catena
     p_id_unique = unique(stats[,1]) #get unique IDs  
@@ -310,7 +318,8 @@ prof_class <- function(
            main="Original catenas", xlab="horizontal length [m]", ylab="elevation [m]")
     }
   #read and resample profiles (done at the same time to avoid duplicates in memory)
-     testcon <- file(catena_file,open="r")
+    # TODO: This mess needs to be improved!
+     if(!grepl(".RData$", catena_file)) testcon <- file(catena_file,open="r")
      if(!silent) #for printing progress indicator
        pb <- txtProgressBar(min = 0, max = length(p_id_unique), style = 3)
 
@@ -326,10 +335,14 @@ prof_class <- function(
        p_pos = which(names(profpoints)==cur_p_id) #position of current profile in list
 
        skiplines = sum(profpoints[(total_read+1) : p_pos]) -  profpoints[p_pos] #compute number of lines to skip to reach the next selected profile
-       tt = readLines(con = testcon, n = skiplines)
-
-       tt = scan(file=testcon, what=numeric(), nlines = profpoints[p_pos], quiet = TRUE) #read single profile
-       tt = matrix(tt, nrow = c(profpoints[p_pos]), byrow = TRUE) #reshape matrix
+       if(grepl(".RData$", catena_file)) {
+         tt <- as.matrix(logdata[which(logdata[,1] == cur_p_id),])
+       } else {
+        tt = readLines(con = testcon, n = skiplines)
+        tt = scan(file=testcon, what=numeric(), nlines = profpoints[p_pos], quiet = TRUE) #read single profile
+        tt = matrix(tt, nrow = c(profpoints[p_pos]), byrow = TRUE) #reshape matrix
+       }
+
        total_read = p_pos  #we are now just at the desired profile
 
        if (any(tt[,1]!= cur_p_id)) stop(paste0("Format error in ",catena_file))
@@ -385,9 +398,14 @@ prof_class <- function(
      if(!silent) # close progress bar
        close(pb)
 
-     close(testcon)
+     if(grepl(".RData$", catena_file)) {
+       rm(logdata)
+     } else {
+       close(testcon)
+     }
      # remove not needed objects to save memory
      rm(list = c("p_supp", "p_resampled", "tt"))
+     gc(verbose = F);gc(verbose = F)
 
      #save(file="debug.RData", list = ls(all.names = TRUE)) #for debugging only
 

diff --git a/man/area2catena.Rd b/man/area2catena.Rd
diff --git a/man/prof_class.Rd b/man/prof_class.Rd