In [1]:
# Packages needed by this step, attached in this order.
libs <- c("R6","testit","optparse","magrittr","dplyr","tibble","readr",
          "reshape2","fossil","tidyr","purrr", "data.table")
# require() returns FALSE (rather than erroring) when a package is missing, so
# collect one logical per package and fail with the names of the ones that did
# not load. This check must not depend on any of the packages being loaded.
y <- vapply(libs, require, logical(1), character.only = TRUE)
if (!all(y)) {
  stop("Packages failed to load: ", paste(libs[!y], collapse = ", "), call. = FALSE)
}
rm(y); rm(libs)

# Current working directory should be Metrics-CGM-ECC/
files <- c("ECC/classes_ecc.R", "ECC/ecc_functions.R", "ECC/dist_functions.R")
# source() is called for its side effects only; discard the returned values.
invisible(lapply(files, source)); rm(files)

source("arguments.R")

# Banner announcing the start of this step, with the wall-clock start time.
cat(paste0("\n||", paste0(rep("-", 20), collapse = ""), 
           " (3/8) Generating non-redundant pairwise distances ", 
           paste0(rep("-", 20), collapse = ""), "||\nStarted process at: ", Sys.time()))
# Start time is recorded now; "end_time" is filled in at the end of the script.
stopwatch <- list("start_time" = as.character.POSIXt(Sys.time()), "end_time" = NULL)

Loading required package: R6

Loading required package: testit

Loading required package: optparse

Loading required package: magrittr

Loading required package: dplyr


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Loading required package: tibble

Loading required package: readr

Loading required package: reshape2

Loading required package: fossil

Loading required package: sp

Loading required package: maps

Loading required package: shapefiles

Loading required package: foreign


Attaching package: ‘shapefiles’


The following objects are masked from ‘package:foreign’:

    read.dbf, write.dbf


Loading required package: tidyr


Attaching package: ‘tidyr’


The following object is masked from ‘package:reshape2’:

    smiths


The following object is masked from ‘package:magrittr’:

    extract


Loading required package: purrr


A


||-------------------- (3/8) Generating non-redundant pairwise distances --------------------||
Started process at: 2022-05-12 13:49:28

In [2]:
# COLLECT dist matrices using TPN clusters ---------------------------------------------------------
paste0("Collecting and saving distances for groups at TPN")

# TPN folder for the selected interval type; distances are saved in "dists" beneath it
basedir <- file.path("../intermediate_data", params$int_type[2], "TPN")
save_to <- file.path(basedir, "dists", "/")
dir.create(save_to,  recursive = TRUE, showWarnings = FALSE)

outputMessages("Reading in metadata")

# One row per comma-separated threshold: h is the raw value, th is the same value prefixed with "T"
hx <- tibble(h = unlist(strsplit(as.character(params$th[2]), split = ","))) %>%
  mutate(th = paste0("T", h))

# Cluster assignments from the TPN file, with strain names moved into the row names
fdata <- column_to_rownames(readRDS(arg$tpn)$new_cols, "Strain")
tp2 <- Timepoint$new(arg$tpn, "tp2", fdata)$Process(hx)$listHeights(hx)

# Strain metadata, passed through the project's processedStrains() helper
m <- processedStrains(read_tsv(arg$metadata))


Reading in metadata

[1mRows: [22m[34m111[39m [1mColumns: [22m[34m14[39m
[36m──[39m [1mColumn specification[22m [36m───────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m  (7): Strain, Source, Country, Province, City, YearMonth, YearWeek
[32mdbl[39m  (6): Latitude, Longitude, Day, Month, Year, TP2
[34mdate[39m (1): Date

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [3]:
# Extremes, for scaling ----------------------------------------------------------------------
paste0("Collecting the extremes (min and max values) for optional scaling later")

# Distances between the distinct (Longitude, Latitude) pairs found in the assignments
ext_geo_dists <- m$assignments %>%
  select(Longitude, Latitude) %>%
  unique() %>%
  rownames_to_column("id") %>%
  distMatrix("geo", c("Longitude", "Latitude"))

# Distances between the distinct dates found in the assignments
ext_temp_dists <- m$assignments %>%
  select(Date) %>%
  unique() %>%
  rownames_to_column("id") %>%
  distMatrix("temp", "Date")

# Smallest and largest temporal (t) and geographic (g) distances, saved alongside the TPN outputs
extremes <- list(
  maxt = max(ext_temp_dists),
  mint = min(ext_temp_dists),
  maxg = max(ext_geo_dists),
  ming = min(ext_geo_dists)
)
saveRDS(extremes, file.path(basedir, "extreme_dists.Rds"))

# The distance matrices are no longer needed once the extremes are stored
rm(ext_geo_dists, ext_temp_dists, extremes)

In [4]:
# Pairwise distances within clusters at TPN --------------------------------------------------
paste0("Collecting non-redundant pairwise distances (at last timepoint, when dataset is full)")
metadata <- m$strain_data %>% as.data.table()

# Only the names of the cluster sets (the interval labels) are needed here.
clustersets <- file.path("../intermediate_data", params$int_type[2], "clustersets.Rds") %>% readRDS(.)
interval_list <- names(clustersets)
rm(clustersets)

# Fail early instead of indexing into an empty set of interval labels.
if (length(interval_list) == 0) {
  stop("No named intervals found in clustersets.Rds", call. = FALSE)
}

# Label of the last interval
k <- last(interval_list)

# Name of the metadata column that holds the interval label for each strain.
if (params$int_type[2] == "multiset") {
  interval <- "Multiset"
} else if (params$int_type[2] == "monthly") {
  interval <- "YearMonth"
} else if (params$int_type[2] == "weekly") {
  interval <- "YearWeek" #"Week"
} else {
  # Without this branch `interval` would stay undefined and fail obscurely later.
  stop("Unsupported interval type: ", params$int_type[2], call. = FALSE)
}

# Only the typing table of the last interval is used below, so build just that
# one rather than one table per interval.
last_label <- as.character(k)
last_strains <- metadata[get(interval) <= last_label]$Strain

# Cluster assignments at the first threshold for the strains seen up to the
# last interval; the column is renamed to its "T"-prefixed label.
td <- tp2$filedata %>% rownames_to_column("isolate") %>%
  select(isolate, all_of(hx$h)) %>%
  filter(isolate %in% last_strains) %>% column_to_rownames("isolate")
td <- td[, hx$h[1], drop = FALSE] %>% set_colnames(hx$th[1]) %>%
  rownames_to_column("Strain") %>% as.data.table()
rm(last_label); rm(last_strains)

# Attach the dr matches to the typing table and split into sections via the
# project's sectionClusters() helper; the result is saved for later steps.
parts <- m$dr_matches %>% filter(Strain %in% td$Strain) %>% 
  left_join(td, ., by = "Strain") %>% sectionClusters(.)
saveRDS(parts, file.path("../intermediate_data", params$int_type[2], "TPN", "parts.Rds"))

In [5]:
paste0("  Collecting and saving distances for cluster groups at TP", k, ":")

# Strains whose interval label is at or before the last interval
tpkstrains <- metadata[get(interval) <= k][["Strain"]]
collectDistances(
  parts$drs, parts$results,
  m$dr_matches, m$assignments,
  tpkstrains, save_to
)
rm(m, parts)

# NOTE(review): this checks extreme_dists.Rds, which is written by the extremes
# step earlier in this script, not by collectDistances() - confirm this is the
# intended success check.
assert("Distances were collected and saved", file.exists(file.path(basedir, "extreme_dists.Rds")))

# Record the finish time and report the elapsed time for this step
stopwatch[["end_time"]] <- as.character.POSIXt(Sys.time())
timeTaken(pt = "distances collection", stopwatch)

# Closing banner: 31 dashes on each side of the label
cat("||", strrep("-", 31), " End of distances collection ", strrep("-", 31), "||", sep = "")



||------------------------------- End of distances collection -------------------------------||