incorporating utility function from base image

web-mev · Feb 15, 2024 · fa806d4 · fa806d4
1 parent 076736e
commit fa806d4
Show file tree

Hide file tree

Showing 2 changed files with 7 additions and 62 deletions.
diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -1,3 +1,3 @@
-FROM ghcr.io/web-mev/base-spatialge-docker:sha-09c68030bc197fbdc91b712b3e54597b4896eb91
+FROM ghcr.io/web-mev/base-spatialge-docker:sha-b9c9d80863af47a4ac85f420b96fd1ff57e2cde7
 
 ADD stnormalize.R /usr/local/bin
diff --git a/docker/stnormalize.R b/docker/stnormalize.R
@@ -1,5 +1,6 @@
 suppressMessages(suppressWarnings(library('spatialGE')))
 suppressMessages(suppressWarnings(library('optparse')))
+source('/usr/local/bin/prep_stlist.R')
 
 # args from command line:
 args <- commandArgs(TRUE)
@@ -59,72 +60,16 @@ if (is.null(opt$normalization)){
 working_dir <- dirname(opt$input_file)
 setwd(working_dir)
 
-# STlist expects that the first column is the gene names- so we don't use row.names arg
-rnacounts <- read.table(opt$input_file, sep='\t', header=T, check.names=F)
+# call the utility function which will return a list with the necessary items:
+spat_list <- prep_stlist(opt$input_file, opt$coordinates_file, opt$sample_name)
 
-# Same as for the counts, the expectation is that the coordinates file does not
-# have row.names and instead has the barcodes in the first column. For now, however,
-# we set the row names and then later alter.
-spotcoords <- read.table(opt$coordinates_file, sep='\t', row.names=1, header=T, check.names=T)
-
-# the barcodes in coords dataframe can be a superset of the count matrix columns.
-# For example, if the matrix is filtered for QC, there may be poor quality spots
-# that were filtered out. 
-# The opposite is not the case since we cannot have a barcode without a position.
-barcodes_from_counts <- colnames(rnacounts)[2:dim(rnacounts)[2]] 
-diff_set <- setdiff(barcodes_from_counts, rownames(spotcoords))
-num_invalid_barcodes <- length(diff_set)
-if (num_invalid_barcodes > 0) {
-    max_print <- 5
-    if (num_invalid_barcodes < max_print) {
-        invalid_barcodes <- paste(diff_set[1:num_invalid_barcodes], collapse=', ')
-    } else {
-        invalid_barcodes <- sprintf('%s, and %d others.', paste(diff_set[1:max_print], collapse=', '), num_invalid_barcodes-max_print)
-    }
-    message(sprintf('The set of barcodes in your count matrix must be a subset of those in your coordinates file. Problems include: %s', invalid_barcodes))
-    quit(status=1)
-} else {
-    # the STList constructor below will not accept coordinate files which are a superset of the 
-    # count matrix barcodes. We handle that here:
-    spotcoords <- spotcoords[barcodes_from_counts,]
-
-    # we need the barcodes in the first col, not the row names. We used the rownames for indexing
-    # convenience above, but need to change that now:
-    spotcoords <- cbind(rownames(spotcoords), data.frame(spotcoords, row.names=NULL))
-}
-
-# Now, to avoid any unexpected issues downstream, we need to conver the column names, etc.
-# to preserve the barcodes/column names, we create a dataframe of the original and 'R mutated'
-# names. We then run through everything with the mutated names and finally map back.
-orig_col_names <- colnames(rnacounts)
-proper_names <- make.names(orig_col_names)
-colname_mapping = data.frame(
-    orig_names = orig_col_names,
-    row.names=proper_names,
-    stringsAsFactors=F)
-colnames(rnacounts) <- proper_names
-
-
-spotcoords[,1] <- make.names(spotcoords[,1]) 
-rownames(spotcoords) <- make.names(rownames(spotcoords))
-
-# We will use a list of dataframes in the call to STlist
-rnacounts_list <- list()
-rnacounts_list[[opt$sample_name]] <- rnacounts
-spotcoords_list <- list()
-spotcoords_list[[opt$sample_name]] <- spotcoords
-
-# Import input data into spatialGE object
-spat <- STlist(
-    rnacounts=rnacounts_list,
-    spotcoords=spotcoords_list, 
-    samples=c(opt$sample_name)
-)
+# unpack:
+colname_mapping <- spat_list$colname_mapping
+spat <- spat_list$spat
 
 # Transform the data
 spat <- transform_data(spat, method=norm_scheme)
 
-
 # Export of the normalized data to full matrix flat file
 # Note: we could convert to a .h5
 df <- data.frame(