diff --git a/DESCRIPTION b/DESCRIPTION index b61bcbf6..48c374bd 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -70,7 +70,8 @@ Suggests: functional, survminer, tidySummarizedExperiment, - markdown + markdown, + uwot VignetteBuilder: knitr RdMacros: diff --git a/R/functions.R b/R/functions.R index 0d79ed7f..37da4160 100755 --- a/R/functions.R +++ b/R/functions.R @@ -1825,7 +1825,7 @@ we suggest to partition the dataset for sample clusters. } -#' Get principal component information to a tibble using tSNE +#' Get tSNE #' #' @keywords internal #' @noRd @@ -1939,6 +1939,130 @@ get_reduced_dimensions_TSNE_bulk <- } +#' Get UMAP +#' +#' @keywords internal +#' +#' @import dplyr +#' @import tidyr +#' @import tibble +#' @importFrom rlang := +#' @importFrom stats setNames +#' @importFrom utils install.packages +#' +#' @param .data A tibble +#' @param .abundance A column symbol with the value the clustering is based on (e.g., `count`) +#' @param .dims A integer vector corresponding to principal components of interest (e.g., 1:6) +#' @param .feature A column symbol. The column that is represents entities to cluster (i.e., normally genes) +#' @param .element A column symbol. The column that is used to calculate distance (i.e., normally samples) +#' @param top An integer. How many top genes to select +#' @param of_samples A boolean +#' @param log_transform A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data) +#' @param calculate_for_pca_dimensions An integer of length one. The number of PCA dimensions to based the UMAP calculatio on. If NULL all variable features are considered +#' @param ... Further parameters passed to the function uwot +#' +#' @return A tibble with additional columns +#' +get_reduced_dimensions_UMAP_bulk <- + function(.data, + .element = NULL, + .feature = NULL, + + .abundance = NULL, + .dims = 2, + top = 500, + of_samples = TRUE, + log_transform = TRUE, + scale = TRUE, + calculate_for_pca_dimensions = 20, + ...) { + + if(!is.null(calculate_for_pca_dimensions) & ( + !is(calculate_for_pca_dimensions, "numeric") | + length(calculate_for_pca_dimensions) > 1 + )) + stop("tidybulk says: the argument calculate_for_pca_dimensions should be NULL or an integer of size 1") + + # Comply with CRAN NOTES + . = NULL + + # Get column names + .element = enquo(.element) + .feature = enquo(.feature) + .abundance = enquo(.abundance) + + # Evaluate ... + arguments <- list(...) + # if (!"check_duplicates" %in% names(arguments)) + # arguments = arguments %>% c(check_duplicates = FALSE) + if (!"dims" %in% names(arguments)) + arguments = arguments %>% c(n_components = .dims) + if (!"init" %in% names(arguments)) + arguments = arguments %>% c(init = "spca") + + # Check if package is installed, otherwise install + if (find.package("uwot", quiet = TRUE) %>% length %>% equals(0)) { + message("tidybulk says: Installing uwot") + install.packages("uwot", repos = "https://cloud.r-project.org") + } + + df_source = + .data %>% + + # Filter NA symbol + filter(!!.feature %>% is.na %>% not()) %>% + + # Prepare data frame + distinct(!!.feature,!!.element,!!.abundance) %>% + + # Check if data rectangular + when( + check_if_data_rectangular(., !!.element,!!.feature,!!.abundance) ~ + eliminate_sparse_transcripts(., !!.feature), + ~ (.) + ) %>% + + # Check if log transform is needed + when(log_transform ~ dplyr::mutate(., !!.abundance := !!.abundance %>% log1p), ~ (.)) %>% + + # Filter most variable genes + keep_variable_transcripts(!!.element,!!.feature,!!.abundance, top) + + # Calculate based on PCA + if(!is.null(calculate_for_pca_dimensions)) + df_UMAP = + df_source %>% + reduce_dimensions( + !!.element,!!.feature,!!.abundance, + method="PCA", + .dims = calculate_for_pca_dimensions, + action="get", + scale = scale + ) %>% + suppressMessages() %>% + as_matrix(rownames = quo_name(.element)) + + # Calculate based on all features + else + df_UMAP = + df_source %>% + spread(!!.feature,!!.abundance) %>% + as_matrix(rownames = quo_name(.element)) + + do.call(uwot::tumap, c(list(df_UMAP), arguments)) %>% + as_tibble(.name_repair = "minimal") %>% + setNames(c("UMAP1", "UMAP2")) %>% + + # add element name + dplyr::mutate(!!.element := df_UMAP %>% rownames) %>% + select(!!.element, everything()) %>% + + # Attach attributes + reattach_internals(.data) %>% + memorise_methods_used("uwot") + + } + #' Get rotated dimensions of two principal components or MDS dimension of choice, of an angle #' #' @keywords internal diff --git a/R/functions_SE.R b/R/functions_SE.R index 389e72e8..3844ddaf 100755 --- a/R/functions_SE.R +++ b/R/functions_SE.R @@ -361,17 +361,16 @@ get_reduced_dimensions_TSNE_bulk_SE <- # Set perprexity to not be too high if (!"perplexity" %in% names(arguments)) - arguments = arguments %>% c(perplexity = (( - .data %>% distinct(!!.element) %>% nrow %>% sum(-1) - ) / 3 / 2) %>% floor() %>% min(30)) + arguments = arguments %>% c(perplexity = (( + .data %>% ncol() %>% sum(-1) + ) / 3 / 2) %>% floor() %>% min(30)) # If not enough samples stop if (arguments$perplexity <= 2) stop("tidybulk says: You don't have enough samples to run tSNE") # Calculate the most variable genes, from plotMDS Limma - tsne_obj = - do.call(Rtsne::Rtsne, c(list(t(.data)), arguments)) + tsne_obj = do.call(Rtsne::Rtsne, c(list(t(.data)), arguments)) @@ -389,6 +388,93 @@ get_reduced_dimensions_TSNE_bulk_SE <- } +#' Get UMAP +#' +#' @keywords internal +#' +#' @import dplyr +#' @import tidyr +#' @import tibble +#' @importFrom rlang := +#' @importFrom stats setNames +#' @importFrom utils install.packages +#' +#' @param .data A tibble +#' @param .abundance A column symbol with the value the clustering is based on (e.g., `count`) +#' @param .dims A integer vector corresponding to principal components of interest (e.g., 1:6) +#' @param .feature A column symbol. The column that is represents entities to cluster (i.e., normally genes) +#' @param .element A column symbol. The column that is used to calculate distance (i.e., normally samples) +#' @param top An integer. How many top genes to select +#' @param of_samples A boolean +#' @param log_transform A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data) +#' @param calculate_for_pca_dimensions An integer of length one. The number of PCA dimensions to based the UMAP calculatio on. If NULL all variable features are considered +#' @param ... Further parameters passed to the function uwot +#' +#' @return A tibble with additional columns +#' +get_reduced_dimensions_UMAP_bulk_SE <- + function(.data, + .dims = 2, + top = 500, + of_samples = TRUE, + log_transform = TRUE, + scale = NULL, # This is only a dummy argument for making it compatibble with PCA + calculate_for_pca_dimensions = 20, + ...) { + # Comply with CRAN NOTES + . = NULL + + # To avoid dplyr complications + + # Evaluate ... + arguments <- list(...) + # if (!"check_duplicates" %in% names(arguments)) + # arguments = arguments %>% c(check_duplicates = FALSE) + if (!"dims" %in% names(arguments)) + arguments = arguments %>% c(n_components = .dims) + if (!"init" %in% names(arguments)) + arguments = arguments %>% c(init = "spca") + + + # Check if package is installed, otherwise install + if (find.package("uwot", quiet = TRUE) %>% length %>% equals(0)) { + message("tidybulk says: Installing uwot") + install.packages("uwot", repos = "https://cloud.r-project.org") + } + + + # Calculate based on PCA + if(!is.null(calculate_for_pca_dimensions)) + df_UMAP = + .data %>% + + t() %>% + + # Calculate principal components + prcomp(scale = scale) %$% + + # Parse the PCA results to a tibble + x %>% + .[,1:calculate_for_pca_dimensions] + + # Calculate based on all features + else + df_UMAP = .data + + umap_obj = do.call(uwot::tumap, c(list(df_UMAP), arguments)) + + list( + raw_result = umap_obj, + result = umap_obj %>% + as_tibble(.name_repair = "minimal") %>% + setNames(c("UMAP1", "UMAP2")) %>% + + # add element name + dplyr::mutate(sample = !!.data %>% colnames) %>% + select(-sample) + ) + + } counts_scaled_exist_SE = function(.data){ diff --git a/R/methods.R b/R/methods.R index 0305b179..86ca5814 100755 --- a/R/methods.R +++ b/R/methods.R @@ -725,6 +725,23 @@ setMethod("cluster_elements", "tidybulk", .cluster_elements) #' Underlying method for tSNE: #' Rtsne::Rtsne(data, ...) #' +#' Underlying method for UMAP: +#' +#' df_source = +#' .data %>% +#' +#' # Filter NA symbol +#' filter(!!.feature %>% is.na %>% not()) %>% +#' +#' # Prepare data frame +#' distinct(!!.feature,!!.element,!!.abundance) %>% +#' +#' # Filter most variable genes +#' keep_variable_transcripts(top) %>% +#' reduce_dimensions(method="PCA", .dims = calculate_for_pca_dimensions, action="get" ) %>% +#' as_matrix(rownames = quo_name(.element)) %>% +#' uwot::tumap(...) +#' #' #' @return A tbl object with additional columns for the reduced dimensions #' @@ -814,7 +831,7 @@ setGeneric("reduce_dimensions", function(.data, ) %>% when( - method == "MDS" ~ get_reduced_dimensions_MDS_bulk(., + tolower(method) == tolower("MDS") ~ get_reduced_dimensions_MDS_bulk(., .abundance = !!.abundance, .dims = .dims, .element = !!.element, @@ -824,7 +841,7 @@ setGeneric("reduce_dimensions", function(.data, log_transform = log_transform, ... ), - method == "PCA" ~ get_reduced_dimensions_PCA_bulk(., + tolower(method) == tolower("PCA") ~ get_reduced_dimensions_PCA_bulk(., .abundance = !!.abundance, .dims = .dims, .element = !!.element, @@ -835,7 +852,7 @@ setGeneric("reduce_dimensions", function(.data, scale = scale, ... ), - method == "tSNE" ~ get_reduced_dimensions_TSNE_bulk(., + tolower(method) == tolower("tSNE") ~ get_reduced_dimensions_TSNE_bulk(., .abundance = !!.abundance, .dims = .dims, .element = !!.element, @@ -845,6 +862,17 @@ setGeneric("reduce_dimensions", function(.data, log_transform = log_transform, ... ), + tolower(method) == tolower("UMAP") ~ get_reduced_dimensions_UMAP_bulk(., + .abundance = !!.abundance, + .dims = .dims, + .element = !!.element, + .feature = !!.feature, + top = top, + of_samples = of_samples, + log_transform = log_transform, + scale = scale, + ... + ), TRUE ~ stop("tidybulk says: method must be either \"MDS\" or \"PCA\" or \"tSNE\"") ) diff --git a/R/methods_SE.R b/R/methods_SE.R index a1432700..c1cba313 100755 --- a/R/methods_SE.R +++ b/R/methods_SE.R @@ -350,10 +350,11 @@ setMethod("cluster_elements", my_reduction_function = method %>% when( - (.) == "MDS" ~ get_reduced_dimensions_MDS_bulk_SE, - (.) == "PCA" ~ get_reduced_dimensions_PCA_bulk_SE, - (.) == "tSNE" ~ get_reduced_dimensions_TSNE_bulk_SE, - ~ stop("tidybulk says: method must be either \"MDS\" or \"PCA\" or \"tSNE\"") + tolower(.) == tolower("MDS") ~ get_reduced_dimensions_MDS_bulk_SE, + tolower(.) == tolower("PCA") ~ get_reduced_dimensions_PCA_bulk_SE, + tolower(.) == tolower("tSNE") ~ get_reduced_dimensions_TSNE_bulk_SE, + tolower(.) == tolower("UMAP") ~ get_reduced_dimensions_UMAP_bulk_SE, + ~ stop("tidybulk says: method must be either \"MDS\" or \"PCA\" or \"tSNE\", or \"UMAP\" ") ) # Both dataframe and raw result object are returned @@ -378,10 +379,11 @@ setMethod("cluster_elements", # Add bibliography when( - method == "MDS" ~ memorise_methods_used(., "limma"), - method == "PCA" ~ memorise_methods_used(., "stats"), - method == "tSNE" ~ memorise_methods_used(., "rtsne"), - ~ stop("tidybulk says: the only supported methods are \"kmeans\" or \"SNN\" ") + tolower(method) == tolower("MDS") ~ memorise_methods_used(., "limma"), + tolower(method) == tolower("PCA") ~ memorise_methods_used(., "stats"), + tolower(method) == tolower("tSNE") ~ memorise_methods_used(., "rtsne"), + tolower(method) == tolower("UMAP") ~ memorise_methods_used(., "uwot"), + ~ stop("tidybulk says: method must be either \"MDS\" or \"PCA\" or \"tSNE\", or \"UMAP\" ") ) %>% # Attach edgeR for keep variable filtering diff --git a/man/get_reduced_dimensions_UMAP_bulk.Rd b/man/get_reduced_dimensions_UMAP_bulk.Rd new file mode 100644 index 00000000..2d1145cf --- /dev/null +++ b/man/get_reduced_dimensions_UMAP_bulk.Rd @@ -0,0 +1,48 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/functions.R +\name{get_reduced_dimensions_UMAP_bulk} +\alias{get_reduced_dimensions_UMAP_bulk} +\title{Get UMAP} +\usage{ +get_reduced_dimensions_UMAP_bulk( + .data, + .element = NULL, + .feature = NULL, + .abundance = NULL, + .dims = 2, + top = 500, + of_samples = TRUE, + log_transform = TRUE, + scale = TRUE, + calculate_for_pca_dimensions = 20, + ... +) +} +\arguments{ +\item{.data}{A tibble} + +\item{.element}{A column symbol. The column that is used to calculate distance (i.e., normally samples)} + +\item{.feature}{A column symbol. The column that is represents entities to cluster (i.e., normally genes)} + +\item{.abundance}{A column symbol with the value the clustering is based on (e.g., `count`)} + +\item{.dims}{A integer vector corresponding to principal components of interest (e.g., 1:6)} + +\item{top}{An integer. How many top genes to select} + +\item{of_samples}{A boolean} + +\item{log_transform}{A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data)} + +\item{calculate_for_pca_dimensions}{An integer of length one. The number of PCA dimensions to based the UMAP calculatio on. If NULL all variable features are considered} + +\item{...}{Further parameters passed to the function uwot} +} +\value{ +A tibble with additional columns +} +\description{ +Get UMAP +} +\keyword{internal} diff --git a/man/get_reduced_dimensions_UMAP_bulk_SE.Rd b/man/get_reduced_dimensions_UMAP_bulk_SE.Rd new file mode 100644 index 00000000..d6162239 --- /dev/null +++ b/man/get_reduced_dimensions_UMAP_bulk_SE.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/functions_SE.R +\name{get_reduced_dimensions_UMAP_bulk_SE} +\alias{get_reduced_dimensions_UMAP_bulk_SE} +\title{Get UMAP} +\usage{ +get_reduced_dimensions_UMAP_bulk_SE( + .data, + .dims = 2, + top = 500, + of_samples = TRUE, + log_transform = TRUE, + scale = NULL, + calculate_for_pca_dimensions = 20, + ... +) +} +\arguments{ +\item{.data}{A tibble} + +\item{.dims}{A integer vector corresponding to principal components of interest (e.g., 1:6)} + +\item{top}{An integer. How many top genes to select} + +\item{of_samples}{A boolean} + +\item{log_transform}{A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data)} + +\item{calculate_for_pca_dimensions}{An integer of length one. The number of PCA dimensions to based the UMAP calculatio on. If NULL all variable features are considered} + +\item{...}{Further parameters passed to the function uwot} + +\item{.abundance}{A column symbol with the value the clustering is based on (e.g., `count`)} + +\item{.feature}{A column symbol. The column that is represents entities to cluster (i.e., normally genes)} + +\item{.element}{A column symbol. The column that is used to calculate distance (i.e., normally samples)} +} +\value{ +A tibble with additional columns +} +\description{ +Get UMAP +} +\keyword{internal} diff --git a/man/reduce_dimensions-methods.Rd b/man/reduce_dimensions-methods.Rd index e68bf821..4c0fce04 100644 --- a/man/reduce_dimensions-methods.Rd +++ b/man/reduce_dimensions-methods.Rd @@ -156,6 +156,23 @@ limma::plotMDS(ndim = .dims, plot = FALSE, top = top) Underlying method for tSNE: Rtsne::Rtsne(data, ...) + +Underlying method for UMAP: + + df_source = +.data %>% + + # Filter NA symbol + filter(!!.feature %>% is.na %>% not()) %>% + + # Prepare data frame + distinct(!!.feature,!!.element,!!.abundance) %>% + + # Filter most variable genes + keep_variable_transcripts(top) %>% + reduce_dimensions(method="PCA", .dims = calculate_for_pca_dimensions, action="get" ) %>% + as_matrix(rownames = quo_name(.element)) %>% + uwot::tumap(...) } \examples{ diff --git a/tests/testthat/test-bulk_methods.R b/tests/testthat/test-bulk_methods.R index c8a97f16..51b6b5c0 100755 --- a/tests/testthat/test-bulk_methods.R +++ b/tests/testthat/test-bulk_methods.R @@ -1380,6 +1380,35 @@ test_that("Add reduced dimensions tSNE - no object",{ }) +test_that("Add reduced dimensions UMAP - no object",{ + + set.seed(132) + + res = + input_df_breast %>% + identify_abundant(a, b, c) %>% + + reduce_dimensions( + method="UMAP", + .abundance = c, + .element = a, + .feature = b, + action="add" + ) + + expect_equal( + typeof(res$`UMAP1`), + "double", + tolerance=1e-1 + ) + + expect_equal( + ncol(res), + 8 + ) + +}) + test_that("Only rotated dimensions - no object",{ res.pca = diff --git a/tests/testthat/test-bulk_methods_SummarizedExperiment.R b/tests/testthat/test-bulk_methods_SummarizedExperiment.R index 3a625c27..c802770e 100755 --- a/tests/testthat/test-bulk_methods_SummarizedExperiment.R +++ b/tests/testthat/test-bulk_methods_SummarizedExperiment.R @@ -533,25 +533,92 @@ test_that("Only reduced dimensions MDS - no object",{ res = - se_mini %>% - reduce_dimensions( - method = "MDS", - .abundance = c, - .element = a, - .feature = b - ) + breast_tcga_mini_SE %>% + reduce_dimensions(method = "MDS") expect_equal( - res$`Dim1`, - c(1.4048441, 1.3933490, -2.0138120 , 0.8832354, -1.6676164), + res$`Dim1`[1:4], + c(-0.2723808836, -0.1105770207, -0.3034092668, -0.0064569358), tolerance=10 ) expect_equal( ncol(colData(res)), - 5 + 3 ) expect_equal( class(attr(res, "internals")$MDS[[1]])[1], "MDS" ) }) + +test_that("Only reduced dimensions PCA - no object",{ + + + + + res = + breast_tcga_mini_SE %>% + reduce_dimensions( method = "PCA" ) + + expect_equal( + res$`PC1`[1:4], + c(-8.5145575 , 4.2451190 , -6.8991306 , -6.0787597 ), + tolerance=10 + ) + + expect_equal( + ncol(colData(res)), + 3 + ) + + expect_equal( class(attr(res, "internals")$PCA[[1]])[1], "numeric" ) + +}) + +test_that("Only reduced dimensions tSNE - no object",{ + + + + + res = + breast_tcga_mini_SE %>% + reduce_dimensions( method = "tSNE" ) + + expect_equal( + res$`tSNE1`[1:4], + c(-2.4677708, -0.3447403, -5.8059667, 1.9685007 ), + tolerance=10 + ) + + expect_equal( + ncol(colData(res)), + 3 + ) + + expect_equal( class(attr(res, "internals")$tSNE[[1]])[1], "integer" ) + +}) + +test_that("Only reduced dimensions UMAP - no object",{ + + + + + res = + breast_tcga_mini_SE %>% + reduce_dimensions( method = "UMAP" ) + + expect_equal( + res$`UMAP1`[1:4], + c(-1.9784263, -0.8536380, -0.6537955, -1.8840860 ), + tolerance=10 + ) + + expect_equal( + ncol(colData(res)), + 3 + ) + + expect_equal( class(attr(res, "internals")$UMAP[[1]])[1], "numeric" ) + +})