Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,8 @@ Suggests:
functional,
survminer,
tidySummarizedExperiment,
markdown
markdown,
uwot
VignetteBuilder:
knitr
RdMacros:
Expand Down
126 changes: 125 additions & 1 deletion R/functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -1825,7 +1825,7 @@ we suggest to partition the dataset for sample clusters.

}

#' Get principal component information to a tibble using tSNE
#' Get tSNE
#'
#' @keywords internal
#' @noRd
Expand Down Expand Up @@ -1939,6 +1939,130 @@ get_reduced_dimensions_TSNE_bulk <-

}

#' Get UMAP
#'
#' @keywords internal
#'
#' @import dplyr
#' @import tidyr
#' @import tibble
#' @importFrom rlang :=
#' @importFrom stats setNames
#' @importFrom utils install.packages
#'
#' @param .data A tibble
#' @param .abundance A column symbol with the value the clustering is based on (e.g., `count`)
#' @param .dims A integer vector corresponding to principal components of interest (e.g., 1:6)
#' @param .feature A column symbol. The column that is represents entities to cluster (i.e., normally genes)
#' @param .element A column symbol. The column that is used to calculate distance (i.e., normally samples)
#' @param top An integer. How many top genes to select
#' @param of_samples A boolean
#' @param log_transform A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data)
#' @param calculate_for_pca_dimensions An integer of length one. The number of PCA dimensions to based the UMAP calculatio on. If NULL all variable features are considered
#' @param ... Further parameters passed to the function uwot
#'
#' @return A tibble with additional columns
#'
get_reduced_dimensions_UMAP_bulk <-
function(.data,
.element = NULL,
.feature = NULL,

.abundance = NULL,
.dims = 2,
top = 500,
of_samples = TRUE,
log_transform = TRUE,
scale = TRUE,
calculate_for_pca_dimensions = 20,
...) {

if(!is.null(calculate_for_pca_dimensions) & (
!is(calculate_for_pca_dimensions, "numeric") |
length(calculate_for_pca_dimensions) > 1
))
stop("tidybulk says: the argument calculate_for_pca_dimensions should be NULL or an integer of size 1")

# Comply with CRAN NOTES
. = NULL

# Get column names
.element = enquo(.element)
.feature = enquo(.feature)
.abundance = enquo(.abundance)

# Evaluate ...
arguments <- list(...)
# if (!"check_duplicates" %in% names(arguments))
# arguments = arguments %>% c(check_duplicates = FALSE)
if (!"dims" %in% names(arguments))
arguments = arguments %>% c(n_components = .dims)
if (!"init" %in% names(arguments))
arguments = arguments %>% c(init = "spca")

# Check if package is installed, otherwise install
if (find.package("uwot", quiet = TRUE) %>% length %>% equals(0)) {
message("tidybulk says: Installing uwot")
install.packages("uwot", repos = "https://cloud.r-project.org")
}

df_source =
.data %>%

# Filter NA symbol
filter(!!.feature %>% is.na %>% not()) %>%

# Prepare data frame
distinct(!!.feature,!!.element,!!.abundance) %>%

# Check if data rectangular
when(
check_if_data_rectangular(., !!.element,!!.feature,!!.abundance) ~
eliminate_sparse_transcripts(., !!.feature),
~ (.)
) %>%

# Check if log transform is needed
when(log_transform ~ dplyr::mutate(., !!.abundance := !!.abundance %>% log1p), ~ (.)) %>%

# Filter most variable genes
keep_variable_transcripts(!!.element,!!.feature,!!.abundance, top)

# Calculate based on PCA
if(!is.null(calculate_for_pca_dimensions))
df_UMAP =
df_source %>%
reduce_dimensions(
!!.element,!!.feature,!!.abundance,
method="PCA",
.dims = calculate_for_pca_dimensions,
action="get",
scale = scale
) %>%
suppressMessages() %>%
as_matrix(rownames = quo_name(.element))

# Calculate based on all features
else
df_UMAP =
df_source %>%
spread(!!.feature,!!.abundance) %>%
as_matrix(rownames = quo_name(.element))

do.call(uwot::tumap, c(list(df_UMAP), arguments)) %>%
as_tibble(.name_repair = "minimal") %>%
setNames(c("UMAP1", "UMAP2")) %>%

# add element name
dplyr::mutate(!!.element := df_UMAP %>% rownames) %>%
select(!!.element, everything()) %>%

# Attach attributes
reattach_internals(.data) %>%
memorise_methods_used("uwot")

}

#' Get rotated dimensions of two principal components or MDS dimension of choice, of an angle
#'
#' @keywords internal
Expand Down
96 changes: 91 additions & 5 deletions R/functions_SE.R
Original file line number Diff line number Diff line change
Expand Up @@ -361,17 +361,16 @@ get_reduced_dimensions_TSNE_bulk_SE <-

# Set perprexity to not be too high
if (!"perplexity" %in% names(arguments))
arguments = arguments %>% c(perplexity = ((
.data %>% distinct(!!.element) %>% nrow %>% sum(-1)
) / 3 / 2) %>% floor() %>% min(30))
arguments = arguments %>% c(perplexity = ((
.data %>% ncol() %>% sum(-1)
) / 3 / 2) %>% floor() %>% min(30))

# If not enough samples stop
if (arguments$perplexity <= 2)
stop("tidybulk says: You don't have enough samples to run tSNE")

# Calculate the most variable genes, from plotMDS Limma
tsne_obj =
do.call(Rtsne::Rtsne, c(list(t(.data)), arguments))
tsne_obj = do.call(Rtsne::Rtsne, c(list(t(.data)), arguments))



Expand All @@ -389,6 +388,93 @@ get_reduced_dimensions_TSNE_bulk_SE <-

}

#' Get UMAP
#'
#' @keywords internal
#'
#' @import dplyr
#' @import tidyr
#' @import tibble
#' @importFrom rlang :=
#' @importFrom stats setNames
#' @importFrom utils install.packages
#'
#' @param .data A tibble
#' @param .abundance A column symbol with the value the clustering is based on (e.g., `count`)
#' @param .dims A integer vector corresponding to principal components of interest (e.g., 1:6)
#' @param .feature A column symbol. The column that is represents entities to cluster (i.e., normally genes)
#' @param .element A column symbol. The column that is used to calculate distance (i.e., normally samples)
#' @param top An integer. How many top genes to select
#' @param of_samples A boolean
#' @param log_transform A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data)
#' @param calculate_for_pca_dimensions An integer of length one. The number of PCA dimensions to based the UMAP calculatio on. If NULL all variable features are considered
#' @param ... Further parameters passed to the function uwot
#'
#' @return A tibble with additional columns
#'
get_reduced_dimensions_UMAP_bulk_SE <-
function(.data,
.dims = 2,
top = 500,
of_samples = TRUE,
log_transform = TRUE,
scale = NULL, # This is only a dummy argument for making it compatibble with PCA
calculate_for_pca_dimensions = 20,
...) {
# Comply with CRAN NOTES
. = NULL

# To avoid dplyr complications

# Evaluate ...
arguments <- list(...)
# if (!"check_duplicates" %in% names(arguments))
# arguments = arguments %>% c(check_duplicates = FALSE)
if (!"dims" %in% names(arguments))
arguments = arguments %>% c(n_components = .dims)
if (!"init" %in% names(arguments))
arguments = arguments %>% c(init = "spca")


# Check if package is installed, otherwise install
if (find.package("uwot", quiet = TRUE) %>% length %>% equals(0)) {
message("tidybulk says: Installing uwot")
install.packages("uwot", repos = "https://cloud.r-project.org")
}


# Calculate based on PCA
if(!is.null(calculate_for_pca_dimensions))
df_UMAP =
.data %>%

t() %>%

# Calculate principal components
prcomp(scale = scale) %$%

# Parse the PCA results to a tibble
x %>%
.[,1:calculate_for_pca_dimensions]

# Calculate based on all features
else
df_UMAP = .data

umap_obj = do.call(uwot::tumap, c(list(df_UMAP), arguments))

list(
raw_result = umap_obj,
result = umap_obj %>%
as_tibble(.name_repair = "minimal") %>%
setNames(c("UMAP1", "UMAP2")) %>%

# add element name
dplyr::mutate(sample = !!.data %>% colnames) %>%
select(-sample)
)

}

counts_scaled_exist_SE = function(.data){

Expand Down
34 changes: 31 additions & 3 deletions R/methods.R
Original file line number Diff line number Diff line change
Expand Up @@ -725,6 +725,23 @@ setMethod("cluster_elements", "tidybulk", .cluster_elements)
#' Underlying method for tSNE:
#' Rtsne::Rtsne(data, ...)
#'
#' Underlying method for UMAP:
#'
#' df_source =
#' .data %>%
#'
#' # Filter NA symbol
#' filter(!!.feature %>% is.na %>% not()) %>%
#'
#' # Prepare data frame
#' distinct(!!.feature,!!.element,!!.abundance) %>%
#'
#' # Filter most variable genes
#' keep_variable_transcripts(top) %>%
#' reduce_dimensions(method="PCA", .dims = calculate_for_pca_dimensions, action="get" ) %>%
#' as_matrix(rownames = quo_name(.element)) %>%
#' uwot::tumap(...)
#'
#'
#' @return A tbl object with additional columns for the reduced dimensions
#'
Expand Down Expand Up @@ -814,7 +831,7 @@ setGeneric("reduce_dimensions", function(.data,
) %>%

when(
method == "MDS" ~ get_reduced_dimensions_MDS_bulk(.,
tolower(method) == tolower("MDS") ~ get_reduced_dimensions_MDS_bulk(.,
.abundance = !!.abundance,
.dims = .dims,
.element = !!.element,
Expand All @@ -824,7 +841,7 @@ setGeneric("reduce_dimensions", function(.data,
log_transform = log_transform,
...
),
method == "PCA" ~ get_reduced_dimensions_PCA_bulk(.,
tolower(method) == tolower("PCA") ~ get_reduced_dimensions_PCA_bulk(.,
.abundance = !!.abundance,
.dims = .dims,
.element = !!.element,
Expand All @@ -835,7 +852,7 @@ setGeneric("reduce_dimensions", function(.data,
scale = scale,
...
),
method == "tSNE" ~ get_reduced_dimensions_TSNE_bulk(.,
tolower(method) == tolower("tSNE") ~ get_reduced_dimensions_TSNE_bulk(.,
.abundance = !!.abundance,
.dims = .dims,
.element = !!.element,
Expand All @@ -845,6 +862,17 @@ setGeneric("reduce_dimensions", function(.data,
log_transform = log_transform,
...
),
tolower(method) == tolower("UMAP") ~ get_reduced_dimensions_UMAP_bulk(.,
.abundance = !!.abundance,
.dims = .dims,
.element = !!.element,
.feature = !!.feature,
top = top,
of_samples = of_samples,
log_transform = log_transform,
scale = scale,
...
),
TRUE ~ stop("tidybulk says: method must be either \"MDS\" or \"PCA\" or \"tSNE\"")
)

Expand Down
18 changes: 10 additions & 8 deletions R/methods_SE.R
Original file line number Diff line number Diff line change
Expand Up @@ -350,10 +350,11 @@ setMethod("cluster_elements",
my_reduction_function =
method %>%
when(
(.) == "MDS" ~ get_reduced_dimensions_MDS_bulk_SE,
(.) == "PCA" ~ get_reduced_dimensions_PCA_bulk_SE,
(.) == "tSNE" ~ get_reduced_dimensions_TSNE_bulk_SE,
~ stop("tidybulk says: method must be either \"MDS\" or \"PCA\" or \"tSNE\"")
tolower(.) == tolower("MDS") ~ get_reduced_dimensions_MDS_bulk_SE,
tolower(.) == tolower("PCA") ~ get_reduced_dimensions_PCA_bulk_SE,
tolower(.) == tolower("tSNE") ~ get_reduced_dimensions_TSNE_bulk_SE,
tolower(.) == tolower("UMAP") ~ get_reduced_dimensions_UMAP_bulk_SE,
~ stop("tidybulk says: method must be either \"MDS\" or \"PCA\" or \"tSNE\", or \"UMAP\" ")
)

# Both dataframe and raw result object are returned
Expand All @@ -378,10 +379,11 @@ setMethod("cluster_elements",

# Add bibliography
when(
method == "MDS" ~ memorise_methods_used(., "limma"),
method == "PCA" ~ memorise_methods_used(., "stats"),
method == "tSNE" ~ memorise_methods_used(., "rtsne"),
~ stop("tidybulk says: the only supported methods are \"kmeans\" or \"SNN\" ")
tolower(method) == tolower("MDS") ~ memorise_methods_used(., "limma"),
tolower(method) == tolower("PCA") ~ memorise_methods_used(., "stats"),
tolower(method) == tolower("tSNE") ~ memorise_methods_used(., "rtsne"),
tolower(method) == tolower("UMAP") ~ memorise_methods_used(., "uwot"),
~ stop("tidybulk says: method must be either \"MDS\" or \"PCA\" or \"tSNE\", or \"UMAP\" ")
) %>%

# Attach edgeR for keep variable filtering
Expand Down
Loading