In [1]:
# Packages this step depends on. They are attached with library() rather than
# require(): library() stops with an error as soon as a package is missing,
# whereas require() only returns FALSE and lets the script fail later at the
# first call into the absent package.
libs <- c("R6", "tibble", "optparse", "magrittr", "dplyr", "reshape2", "progress", 
          "testit", "data.table", "readr")
invisible(lapply(libs, library, character.only = TRUE)); rm(libs)

# READING IN THE INPUTS ----------------------------------------------------------------------------------------
# Change the default values to read in your own files, or feed through terminal arguments
source("arguments.R")

# Source every file found in the CGM directory (helper definitions used below).
files <- list.files("CGM", full.names = TRUE)
invisible(lapply(files, source)); rm(files)

# BASIC STARTUP MESSAGES ---------------------------------------------------------------------------------------
# These top-level expressions are auto-printed as the step banner.
paste0("||", paste0(rep("-", 29), collapse = ""), " (2/8) Cluster metric generation ", 
                     paste0(rep("-", 29), collapse = ""), "||") 
paste0("Started process at: ", Sys.time())
# Start/end timestamps for this step; end_time is filled in at the end of the script.
stopwatch <- list("start_time" = as.character.POSIXt(Sys.time()), "end_time" = NULL)

Loading required package: R6

Loading required package: tibble

Loading required package: optparse

Loading required package: magrittr

Loading required package: dplyr


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Loading required package: reshape2

Loading required package: progress

Loading required package: testit

Loading required package: data.table


Attaching package: ‘data.table’


The following objects are masked from ‘package:reshape2’:

    dcast, melt


The following objects are masked from ‘package:dplyr’:

    between, first, last


Loading required package: readr



In [2]:
# TP DATA PREPARATION ------------------------------------------------------------------------------------------
# Threshold heights arrive as a single comma-separated string.
heights <- unlist(strsplit(as.character(params$th[2]), split = ","))
clustersets <- readRDS(file.path("../intermediate_data", params$int_type[2], "clustersets.Rds"))
# One entry per interval; the names are the interval labels.
interval_list <- names(clustersets)

# Interval label and message prefix for the selected interval type.
if (params$int_type[2] == "multiset") {
  interval <- "Multiset"
  msg1 <- ""
} else if (params$int_type[2] == "monthly") {
  interval <- "YearMonth"
  msg1 <- ""
} else if (params$int_type[2] == "weekly") {
  interval <- "YearWeek" #"Week"
  msg1 <- "week "
} else {
  # Fail here rather than later with "object 'msg1' not found".
  stop("Unsupported interval type: '", params$int_type[2], 
       "' (expected 'multiset', 'monthly' or 'weekly')", call. = FALSE)
}

# Per-interval results are written here; make sure the directory exists
# before anything is saved into it.
save_to <- file.path("../intermediate_data", params$int_type[2], "cgms")
dir.create(save_to, showWarnings = FALSE, recursive = TRUE)
tpn <- readRDS(arg$tpn)$new_cols

In [3]:
# rowx <- readRDS("results/rowx.Rds")
# Walk over consecutive interval pairs (n1 -> n2). For each pair, build the two
# time point data sets, track clusters between them, derive the per-cluster
# metrics and save one Rds file per pair under `save_to`.
# seq_len() makes the loop a no-op when fewer than two intervals exist, whereas
# 1:(n - 1) would iterate over c(1, 0) in that case.
for (i in seq_len(max(length(interval_list) - 1, 0))) {
  
  # Earlier interval of the pair: cluster assignments joined with the columns from tpn.
  n1 <- as.character(interval_list[i])
  tpx1a <- clustersets[[n1]]$sofar %>% select(isolate, heightx) %>% set_colnames(c("isolate", heights))
  tpx1 <- tpn %>% rename("isolate" = "Strain") %>% 
    left_join(tpx1a, ., by = intersect(colnames(tpx1a), colnames(.)))
  rm(tpx1a)
  
  # Later interval of the pair, built the same way.
  n2 <- as.character(interval_list[i+1])
  tpx2a <- clustersets[[n2]]$sofar %>% select(isolate, heightx) %>% set_colnames(c("isolate", heights))
  tpx2 <- tpn %>% rename("isolate" = "Strain") %>% 
    left_join(tpx2a, ., by = intersect(colnames(tpx2a), colnames(.)))
  rm(tpx2a)
  
  # if (i > 1) {
  #   fullset <- clustersets[[n1]]$sofar
  #   ivl_i <- clustersets[[n1]]$ivl
  #   unchanged_clusters <- setdiff(fullset, ivl_i) %>% pull(heightx) %>% unique()
  #   strains <- fullset[heightx %in% unchanged_clusters] %>% pull(isolate)
  #   rm(fullset); rm(unchanged_clusters); rm(ivl_i)
  #   unchanged_data <- tmp %>% filter(Strain %in% strains)
  # }
  
  # Padding widths: ph from the widest column name, pc from the largest value,
  # each taken over BOTH time points. pc previously compared tpx2 with itself,
  # so a wider value present only in tpx1 was ignored.
  ph <- max(nchar(colnames(tpx1)[-1]), nchar(colnames(tpx2)[-1]))
  pc <- tpx1 %>% select(-isolate) %>% max(., tpx2 %>% select(-isolate)) %>% nchar()
  
  msgtexts <- c(
    paste0("  Constructing ", msg1, n1, " data object, ", nrow(tpx1), " (", i, " / ", length(interval_list), "):\n"), 
    paste0("  Constructing ", msg1, n2, " data object, ", nrow(tpx2), " (", i+1, " / ", length(interval_list), "):\n")
  ) 
  
  tplist <- tpDataSetup(tpx1, tpx2, ph, pc, FALSE, msgtexts)#; rm(tpx1); rm(tpx2)
  tp1 <- tplist[["tp1"]]
  tp2 <- tplist[["tp2"]]
  novels <- tplist[["novs"]]
  rm(tplist)
  
  # BASE CASE (FIRST HEIGHT) -------------------------------------------------------------------------------------
  outputDetails(paste0("  Tracking clusters from ", msg1, n1, " to ", msg1, n2, ", from height ", heights[1], " ..."), newcat = TRUE)
  
  hx <- Heightdata$new(starter = heights[1], t1_comps = tp1$comps, hvals = heights)$
    clust_tracking(tp2$comps, tp2$cnames, tp1$coded, tp2$coded, TRUE)$
    update_iteration()
  
  # outputDetails("  Identifying and counting 'additional TP1 strains'.\n", newcat = FALSE)
  # For every height: attach the TP1 and TP2 flag tables to the tracking
  # results, order the rows, pass them to findingSneakers(), then stack all
  # heights into one table.
  clusters_just_tp1 <- lapply(heights, function(h) {
    df1 <- hx$results[[h]]
    df2 <- left_join(df1, tp1$flagged, by = intersect(colnames(df1), colnames(tp1$flagged)))
    
    left_join(df2, tp2$flagged, by = intersect(colnames(df2), colnames(tp2$flagged))) %>% 
      arrange(tp1_h, tp1_cl, tp2_h, tp2_cl) %>% 
      findingSneakers(novels, tp1$status, tp2$status, .)
  }) %>% bind_rows()
  
  # outputDetails("  Handling novel tracking, adding to dataset.\n", newcat = FALSE)
  isolates_file <- novelHandling(tp1, tp2, clusters_just_tp1, heights)
  
  # novel = 1 for isolates present in the TP2 raw data but not in the TP1 raw data.
  isolates_file %<>% 
    mutate(novel = ifelse(isolate %in% setdiff(tp2$raw$isolate, tp1$raw$isolate), 1, 0)) %>% 
    rename(Strain = isolate)
  
  # Pad the height and cluster columns with padCol(), using the widths computed above.
  isolates_file[,c("tp1_h", "tp2_h")] %<>% apply(., 2, padCol, padval = ph, padchr = "h")
  isolates_file[,c("tp1_cl", "tp2_cl")] %<>% apply(., 2, padCol, padval = pc, padchr = "c")
  
  # outputDetails("  Incrementing all cluster sizes by 1, then calculating growth columns.\n", newcat = FALSE)
  # outputDetails("  Also adding 'type' column to CGM results table.\n", newcat = FALSE)
  isolates_file %<>% 
    mutate(tp1_cl_size = tp1_cl_size + 1, tp2_cl_size = tp2_cl_size + 1) %>% 
    oneHeight()
  
  # if (i > 1) {isolates_file <- unchanged_data %>% bind_rows(isolates_file, .)}
  
  # tmp <- isolates_file
  isolates_file %<>% addingType(.)
  
  # outputDetails("  Saving the data in a file with cluster identifiers.\n", newcat = FALSE)
  # strains removed
  # Keep only the cluster-level columns and drop duplicate rows.
  isolates_file <- isolates_file %>% 
    select(tp1_id, tp1_cl_size, first_tp1_flag, last_tp1_flag, 
           first_tp2_flag, tp2_cl_size, last_tp2_flag, add_TP1, novel, 
           num_novs, actual_size_change, actual_growth_rate, new_growth, type) %>% 
    unique() %>% as.data.table()
  
  # Copy of the table taken before the tp1_id back-fill below; not read again in this script.
  correcthere <- isolates_file
  
  # Rows with a missing tp1_id and a TP1 cluster size of 1 get an id built from
  # the third "_"-separated field of first_tp2_flag.
  indices <- which(is.na(isolates_file$tp1_id) & isolates_file$tp1_cl_size == 1)
  isolates_file[indices]$tp1_id <- isolates_file$first_tp2_flag[indices] %>% 
    sapply(., strsplit, "_") %>% sapply(., '[[', 3) %>% 
    paste0("AbsentAtTP1-", "TP2_", .)
  
  # Label every row with the interval pair and save. A melted long-format
  # table used to be built here first, but it was overwritten by this
  # assignment before being used, so it has been removed.
  cgm_results <- isolates_file %>% 
    add_column(interval = paste0(n1, "-", n2), .before = 1)# %>% add_column(TP = n2, .before = 1)
  
  saveRDS(cgm_results, file.path(save_to, paste0("TP", n2, ".Rds")))
}

  Constructing week 2020-03 data object, 2 (1 / 11):
  Constructing week 2020-04 data object, 26 (2 / 11):
  Tracking clusters from week 2020-03 to week 2020-04, from height 0 ...


  Tracking clusters from week 2020-03 to week 2020-04, from height 0 ...



  Constructing week 2020-04 data object, 26 (2 / 11):
  Constructing week 2020-05 data object, 52 (3 / 11):
  Tracking clusters from week 2020-04 to week 2020-05, from height 0 ...


  Tracking clusters from week 2020-04 to week 2020-05, from height 0 ...



  Constructing week 2020-05 data object, 52 (3 / 11):
  Constructing week 2020-06 data object, 72 (4 / 11):
  Tracking clusters from week 2020-05 to week 2020-06, from height 0 ...


  Tracking clusters from week 2020-05 to week 2020-06, from height 0 ...



  Constructing week 2020-06 data object, 72 (4 / 11):
  Constructing week 2020-07 data object, 76 (5 / 11):
  Tracking clusters from week 2020-06 to week 2020-07, from height 0 ...


  Tracking clusters from week 2020-06 to week 2020-07, from height 0 ...



  Constructing week 2020-07 data object, 76 (5 / 11):
  Constructing week 2020-08 data object, 78 (6 / 11):
  Tracking clusters from week 2020-07 to week 2020-08, from height 0 ...


  Tracking clusters from week 2020-07 to week 2020-08, from height 0 ...



  Constructing week 2020-08 data object, 78 (6 / 11):
  Constructing week 2020-09 data object, 86 (7 / 11):
  Tracking clusters from week 2020-08 to week 2020-09, from height 0 ...


  Tracking clusters from week 2020-08 to week 2020-09, from height 0 ...



  Constructing week 2020-09 data object, 86 (7 / 11):
  Constructing week 2020-10 data object, 94 (8 / 11):
  Tracking clusters from week 2020-09 to week 2020-10, from height 0 ...


  Tracking clusters from week 2020-09 to week 2020-10, from height 0 ...



  Constructing week 2020-10 data object, 94 (8 / 11):
  Constructing week 2020-11 data object, 99 (9 / 11):
  Tracking clusters from week 2020-10 to week 2020-11, from height 0 ...


  Tracking clusters from week 2020-10 to week 2020-11, from height 0 ...



  Constructing week 2020-11 data object, 99 (9 / 11):
  Constructing week 2020-12 data object, 108 (10 / 11):
  Tracking clusters from week 2020-11 to week 2020-12, from height 0 ...


  Tracking clusters from week 2020-11 to week 2020-12, from height 0 ...



  Constructing week 2020-12 data object, 108 (10 / 11):
  Constructing week 2020-13 data object, 111 (11 / 11):
  Tracking clusters from week 2020-12 to week 2020-13, from height 0 ...


  Tracking clusters from week 2020-12 to week 2020-13, from height 0 ...





In [4]:
# Combine the per-interval CGM files from `save_to` into a single results file.
# recursive = TRUE also creates "../results" when it does not exist yet.
# Without it dir.create() fails, the warning is suppressed, and the problem
# only shows up when saveRDS() cannot write.
dir.create(file.path("../results", params$int_type[2]), showWarnings = FALSE, recursive = TRUE)
if (params$int_type[2] == "multiset") {
  # File name derived from params$divs[2]: "-" removed, then "," replaced by "-".
  res_file <- gsub("-", "", params$divs[2]) %>% gsub(",", "-", .) %>% 
    paste0("CGM-", ., ".Rds") %>% 
    file.path("../results", params$int_type[2], .)
} else {
  res_file <- file.path("../results", params$int_type[2], "CGM-intervals.Rds")
}

# Every file in `save_to` is read and the tables are stacked row-wise.
cgmfiles <- list.files(save_to, full.names = TRUE)
lapply(cgmfiles, readRDS) %>% bind_rows() %>% saveRDS(., res_file)

In [5]:
# WRAPPING THINGS UP -------------------------------------------------------------------------------------------
# Record when the step finished, then report the elapsed time via timeTaken().
stopwatch$end_time <- as.character(Sys.time())

# Top-level values are auto-printed, as in the startup banner.
"Successfully collected data for all heights."
timeTaken(pt = "CGM data collection", stopwatch)
paste0(
  "||", strrep("-", 28),
  " End of cluster metric generation ",
  strrep("-", 29), "||"
)

# Pause for three seconds before the script ends.
Sys.sleep(3)