diff --git a/.gitignore b/.gitignore index 69f8d654..decb5cb1 100755 --- a/.gitignore +++ b/.gitignore @@ -20,3 +20,5 @@ egsea_report_* /doc/ _targets.R _targets* +.DS_Store +._.DS_Store diff --git a/DESCRIPTION b/DESCRIPTION index 5aa6776d..97aa96b4 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: tidybulk Title: Brings transcriptomics to the tidyverse -Version: 1.11.5 +Version: 1.11.6 Authors@R: c(person("Stefano", "Mangiola", email = "mangiolastefano@gmail.com", role = c("aut", "cre")), person("Maria", "Doyle", email = "Maria.Doyle@petermac.org", diff --git a/NAMESPACE b/NAMESPACE index ea7b705c..58ce8456 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -151,6 +151,7 @@ importFrom(rlang,enquos) importFrom(rlang,flatten_if) importFrom(rlang,inform) importFrom(rlang,is_spliced) +importFrom(rlang,quo) importFrom(rlang,quo_is_missing) importFrom(rlang,quo_is_null) importFrom(rlang,quo_is_symbol) @@ -179,6 +180,7 @@ importFrom(stats,sd) importFrom(stats,setNames) importFrom(stats,terms) importFrom(stringi,stri_c) +importFrom(stringr,str_c) importFrom(stringr,str_detect) importFrom(stringr,str_remove) importFrom(stringr,str_replace) diff --git a/R/functions.R b/R/functions.R index 4e1c8ded..6ffa5add 100755 --- a/R/functions.R +++ b/R/functions.R @@ -151,8 +151,8 @@ create_tt_from_bam_sam_bulk <- #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom rlang := #' @importFrom stats setNames @@ -236,8 +236,8 @@ add_scaled_counts_bulk.calcNormFactor <- function(.data, #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom magrittr equals #' @importFrom rlang := @@ -360,8 +360,8 @@ get_scaled_counts_bulk <- function(.data, #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom magrittr set_colnames #' @importFrom stats model.matrix @@ -646,8 +646,8 @@ get_differential_transcript_abundance_bulk <- function(.data, #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom magrittr set_colnames #' @importFrom stats model.matrix @@ -691,13 +691,13 @@ get_differential_transcript_abundance_glmmSeq <- function(.data, .transcript = enquo(.transcript) .abundance = enquo(.abundance) .sample_total_read_count = enquo(.sample_total_read_count) - + # Check if omit_contrast_in_colnames is correctly setup if(omit_contrast_in_colnames & length(.contrasts) > 1){ warning("tidybulk says: you can omit contrasts in column names only when maximum one contrast is present") omit_contrast_in_colnames = FALSE } - + # # Specify the design column tested # if(is.null(.contrasts)) # message( @@ -706,7 +706,7 @@ get_differential_transcript_abundance_glmmSeq <- function(.data, # design %>% colnames %>% .[1] # ) # ) - + # # If I don't have intercept or I have categorical factor of interest BUT I don't have contrasts # if( # is.null(.contrasts) & @@ -719,13 +719,13 @@ get_differential_transcript_abundance_glmmSeq <- function(.data, # ) # ) # warning("tidybulk says: If you have (i) an intercept-free design (i.e. ~ 0 + factor) or you have a categorical factor of interest with more than 2 values you should use the `contrasts` argument.") - # + # # my_contrasts = # .contrasts %>% # ifelse_pipe(length(.) > 0, # ~ limma::makeContrasts(contrasts = .x, levels = design), # ~ NULL) - + # Check if package is installed, otherwise install if (find.package("edgeR", quiet = TRUE) %>% length %>% equals(0)) { message("tidybulk says: Installing edgeR needed for differential transcript abundance analyses") @@ -733,7 +733,7 @@ get_differential_transcript_abundance_glmmSeq <- function(.data, install.packages("BiocManager", repos = "https://cloud.r-project.org") BiocManager::install("edgeR", ask = FALSE) } - + # Check if package is installed, otherwise install if (find.package("glmmSeq", quiet = TRUE) %>% length %>% equals(0)) { message("tidybulk says: Installing glmmSeq needed for differential transcript abundance analyses") @@ -741,52 +741,52 @@ get_differential_transcript_abundance_glmmSeq <- function(.data, install.packages("BiocManager", repos = "https://cloud.r-project.org") BiocManager::install("glmmSeq", ask = FALSE) } - - metadata = - .data |> - pivot_sample(!!.sample) |> + + metadata = + .data |> + pivot_sample(!!.sample) |> as_data_frame(rownames = !!.sample) - - counts = + + counts = .data %>% select(!!.transcript,!!.sample,!!.abundance) %>% spread(!!.sample,!!.abundance) %>% as_matrix(rownames = !!.transcript) - + # Reorder counts counts = counts[,rownames(metadata),drop=FALSE] - - glmmSeq_object = + + glmmSeq_object = glmmSeq( .formula, countdata = counts , metadata = metadata, dispersion = setNames(edgeR::estimateDisp(counts)$tagwise.dispersion, rownames(counts)), - progress = TRUE, + progress = TRUE, method = method |> str_remove("(?i)^glmmSeq_"), ... - ) - - glmmSeq_object |> - summary() |> + ) + + glmmSeq_object |> + summary() |> as_tibble(rownames = "gene") |> - mutate(across(starts_with("P_"), list(adjusted = function(x) p.adjust(x, method="BH")), .names = "{.col}_{.fn}")) |> - + mutate(across(starts_with("P_"), list(adjusted = function(x) p.adjust(x, method="BH")), .names = "{.col}_{.fn}")) |> + # Attach attributes reattach_internals(.data) %>% - + # select method - memorise_methods_used("glmmSeq") |> - + memorise_methods_used("glmmSeq") |> + # Add raw object attach_to_internals(glmmSeq_object, "glmmSeq") %>% # Communicate the attribute added { - + rlang::inform("\ntidybulk says: to access the raw results (fitted GLM) do `attr(..., \"internals\")$glmmSeq`", .frequency_id = "Access DE results glmmSeq", .frequency = "once") - + (.) } %>% - + # Attach prefix setNames(c( colnames(.)[1], @@ -799,8 +799,8 @@ get_differential_transcript_abundance_glmmSeq <- function(.data, #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom magrittr set_colnames #' @importFrom stats model.matrix @@ -1007,8 +1007,8 @@ get_differential_transcript_abundance_bulk_voom <- function(.data, #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom magrittr set_colnames #' @importFrom stats model.matrix @@ -1072,7 +1072,7 @@ get_differential_transcript_abundance_deseq2 <- function(.data, if (is.null(test_above_log2_fold_change)) { test_above_log2_fold_change <- 0 } - + # # Print the design column names in case I want contrasts # message( # sprintf( @@ -1188,8 +1188,8 @@ get_differential_transcript_abundance_deseq2 <- function(.data, #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom magrittr set_colnames #' @importFrom stats model.matrix @@ -1301,7 +1301,7 @@ test_differential_cellularity_ <- function(.data, colnames() %>% gsub(sprintf("%s:", method), "", .) %>% str_replace_all("[ \\(\\)]", "___") - + # Parse formula .my_formula = .formula %>% @@ -1313,29 +1313,29 @@ test_differential_cellularity_ <- function(.data, sprintf( "\\1%s", paste(covariates, collapse = " + ") )), - + # If normal formula ~ sprintf(".proportion_0_corrected%s", format(.)) ) %>% - + as.formula - + # Test result = multivariable_differential_tissue_composition(deconvoluted, method, .my_formula, min_detected_proportion) %>% - + # Attach attributes reattach_internals(.data) %>% - + # Add methods used when(grepl("Surv", .my_formula) ~ (.) %>% memorise_methods_used(c("survival", "boot"), object_containing_methods = .data), ~ (.)) - + } - - + + result |> # Eliminate prefix @@ -1350,8 +1350,8 @@ test_differential_cellularity_ <- function(.data, #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom magrittr set_colnames #' @importFrom stats model.matrix @@ -1441,8 +1441,8 @@ test_stratification_cellularity_ <- function(.data, #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom magrittr set_colnames #' @importFrom purrr map2_dfr @@ -1689,8 +1689,8 @@ test_gene_enrichment_bulk_EGSEA <- function(.data, #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom stats kmeans #' @importFrom rlang := @@ -1757,8 +1757,8 @@ get_clusters_kmeans_bulk <- #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom rlang := #' @importFrom utils install.packages @@ -1838,8 +1838,8 @@ get_clusters_SNN_bulk <- #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom purrr map_dfr #' @importFrom rlang := @@ -1953,8 +1953,8 @@ get_reduced_dimensions_MDS_bulk <- #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom rlang := #' @importFrom stats prcomp @@ -2099,8 +2099,8 @@ we suggest to partition the dataset for sample clusters. #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom rlang := #' @importFrom stats setNames @@ -2212,8 +2212,8 @@ get_reduced_dimensions_TSNE_bulk <- #' #' @keywords internal #' -#' -#' +#' +#' #' @import tibble #' @importFrom rlang := #' @importFrom stats setNames @@ -2337,8 +2337,8 @@ get_reduced_dimensions_UMAP_bulk <- #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom rlang quo_is_null #' @importFrom dplyr between @@ -2702,8 +2702,8 @@ aggregate_duplicated_transcripts_DT = #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom rlang := #' @importFrom dplyr anti_join @@ -3269,14 +3269,15 @@ get_cell_type_proportions = function(.data, #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom magrittr set_colnames #' @importFrom stats model.matrix #' @importFrom stats as.formula #' @importFrom utils install.packages #' @importFrom stats rnorm +#' @importFrom stringr str_c #' #' @param .data A tibble #' @param .formula a formula with no response variable, of the kind ~ factor_of_interest + batch @@ -3290,27 +3291,28 @@ get_cell_type_proportions = function(.data, #' #' get_adjusted_counts_for_unwanted_variation_bulk <- function(.data, - .formula, - .sample = NULL, + .factor_unwanted, + .factor_of_interest, + .sample = NULL, .transcript = NULL, .abundance = NULL, - transform = transform, - inverse_transform = inverse_transform, + method = "combat_seq", ...) { # Get column names .sample = enquo(.sample) .transcript = enquo(.transcript) .abundance = enquo(.abundance) + .factor_of_interest = enquo(.factor_of_interest) + .factor_unwanted = enquo(.factor_unwanted) - # Check that .formula includes at least two covariates - if (parse_formula(.formula) %>% length %>% st(2)) - stop( - "The .formula must contain two covariates, the first being the factor of interest, the second being the factor of unwanted variation" - ) + # Check if package is installed, otherwise install + if (find.package("sva", quiet = TRUE) %>% length %>% equals(0)) { + message("tidybulk says: Installing sva - Combat needed for adjustment for unwanted variation") + if (!requireNamespace("BiocManager", quietly = TRUE)) + install.packages("BiocManager", repos = "https://cloud.r-project.org") + BiocManager::install("sva", ask = FALSE) + } - # Check that .formula includes no more than two covariates at the moment - if (parse_formula(.formula) %>% length %>% gt(3)) - warning("tidybulk says: Only the second covariate in the .formula is adjusted for, at the moment") # New column name value_adjusted = as.symbol(sprintf("%s%s", quo_name(.abundance), adjusted_string)) @@ -3321,41 +3323,28 @@ get_adjusted_counts_for_unwanted_variation_bulk <- function(.data, select(!!.transcript, !!.sample, !!.abundance, - one_of(parse_formula(.formula))) %>% - distinct() %>% - - # Apply (log by default) transformation - dplyr::mutate(!!.abundance := transform(!!.abundance)) - - + !!.factor_of_interest, + !!.factor_unwanted + ) %>% + distinct() # Create design matrix design = model.matrix( - object = as.formula("~" %>% paste0(parse_formula(.formula)[1])), + object = as.formula(sprintf("~ %s", .data |> select(!!.factor_of_interest) |> colnames() |> str_c(collapse = '+'))), # get first argument of the .formula - data = df_for_combat %>% select(!!.sample, one_of(parse_formula(.formula))) %>% distinct %>% arrange(!!.sample) + data = df_for_combat %>% select(!!.sample, !!.factor_of_interest) %>% distinct %>% arrange(!!.sample) ) - # Maybe not needed and causing trouble if more columns that in the formula - # %>% - #set_colnames(c("(Intercept)", parse_formula(.formula)[1])) - - # Check if package is installed, otherwise install - if (find.package("sva", quiet = TRUE) %>% length %>% equals(0)) { - message("tidybulk says: Installing sva - Combat needed for adjustment for unwanted variation") - if (!requireNamespace("BiocManager", quietly = TRUE)) - install.packages("BiocManager", repos = "https://cloud.r-project.org") - BiocManager::install("sva", ask = FALSE) - } - my_batch = df_for_combat %>% - distinct(!!.sample,!!as.symbol(parse_formula(.formula)[2])) %>% - arrange(!!.sample) %>% - pull(2) + select(!!.sample,!!.factor_unwanted) %>% + distinct() |> + arrange(!!.sample) + + mat = - mat = df_for_combat %>% + df_for_combat %>% # Select relevant info distinct(!!.transcript,!!.sample,!!.abundance) %>% @@ -3368,26 +3357,97 @@ get_adjusted_counts_for_unwanted_variation_bulk <- function(.data, spread(!!.sample,!!.abundance) %>% as_matrix(rownames = !!.transcript, do_check = FALSE) - mat %>% - # Add little noise to avoid all 0s for a covariate that would error combat code (not statistics that would be fine) - `+` (rnorm(length(mat), 0, 0.000001)) %>% + # Clear memory + rm(df_for_combat) + gc(verbose = FALSE) + + if(tolower(method) == "combat"){ - # Run combat - sva::ComBat(batch = my_batch, - mod = design, - prior.plots = FALSE, - ...) %>% + adjusted_df = + mat |> - as_tibble(rownames = quo_name(.transcript)) %>% - gather(!!.sample,!!.abundance,-!!.transcript) %>% + # Tranform + log1p() %>% - # Reverse-Log transform if transformed in the first place - dplyr::mutate(!!.abundance := inverse_transform(!!.abundance)) %>% + # Add little noise to avoid all 0s for a covariate that would error combat code (not statistics that would be fine) + `+` (rnorm(length(mat), 0, 0.000001)) - # In case the inverse tranform produces negative counts - dplyr::mutate(!!.abundance := ifelse(!!.abundance < 0, 0,!!.abundance)) %>% - dplyr::mutate(!!.abundance := !!.abundance %>% as.integer) %>% + for(i in quo_names(.factor_unwanted)){ + adjusted_df = + adjusted_df %>% + sva::ComBat(batch = my_batch |> select(all_of(i)) |> pull(1), + mod = design, + prior.plots = FALSE, + ...) + } + + adjusted_df = + adjusted_df %>% + + as_tibble(rownames = quo_name(.transcript)) %>% + gather(!!.sample,!!.abundance,-!!.transcript) %>% + + # Reverse-Log transform if transformed in the first place + dplyr::mutate(!!.abundance := expm1(!!.abundance)) %>% + + # In case the inverse tranform produces negative counts + dplyr::mutate(!!.abundance := ifelse(!!.abundance < 0, 0,!!.abundance)) %>% + dplyr::mutate(!!.abundance := !!.abundance %>% as.integer) + + } + else if(tolower(method) == "combat_seq"){ + + adjusted_df = mat + + for(i in quo_names(.factor_unwanted)){ + adjusted_df = + adjusted_df |> + sva::ComBat_seq(batch = my_batch |> select(all_of(i)) |> pull(1), + covar_mod = design, + ...) + } + + adjusted_df = + adjusted_df %>% + as_tibble(rownames = quo_name(.transcript)) %>% + gather(!!.sample,!!.abundance,-!!.transcript) + + } + else if(tolower(method) == "limma_remove_batch_effect") { + + unwanted_covariate_matrix = + model.matrix( + object = as.formula(sprintf("~ 0 + %s", .data |> select(!!.factor_unwanted) |> colnames() |> str_c(collapse = '+'))), + # get first argument of the .formula + data = df_for_combat %>% select(!!.sample, !!.factor_unwanted) %>% distinct %>% arrange(!!.sample) + ) + + adjusted_df = + mat |> + edgeR::cpm(log = T) |> + limma::removeBatchEffect( + design = design, + covariates = unwanted_covariate_matrix, + ... + ) |> + + as_tibble(rownames = quo_name(.transcript)) %>% + gather(!!.sample,!!.abundance,-!!.transcript) %>% + + # Reverse-Log transform if transformed in the first place + dplyr::mutate(!!.abundance := expm1(!!.abundance)) %>% + + # In case the inverse tranform produces negative counts + dplyr::mutate(!!.abundance := ifelse(!!.abundance < 0, 0,!!.abundance)) %>% + dplyr::mutate(!!.abundance := !!.abundance %>% as.integer) + + } else { + stop("tidybulk says: the argument \"method\" must be combat_seq, combat, or limma_remove_batch_effect") + } + + + adjusted_df %>% # Reset column names dplyr::rename(!!value_adjusted := !!.abundance) %>% @@ -3558,8 +3618,8 @@ tidybulk_to_SummarizedExperiment = function(.data, #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom magrittr set_colnames #' @importFrom stats model.matrix @@ -3744,8 +3804,8 @@ fill_NA_using_formula = function(.data, #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom magrittr set_colnames #' @importFrom stats model.matrix diff --git a/R/methods.R b/R/methods.R index 5884d767..d68fe78c 100755 --- a/R/methods.R +++ b/R/methods.R @@ -36,7 +36,7 @@ setOldClass("tidybulk") #' #' @docType methods #' @rdname tidybulk-methods -#' +#' #' @export #' setGeneric("tidybulk", function(.data, @@ -78,9 +78,9 @@ setGeneric("tidybulk", function(.data, !!.abundance_scaled) } #' tidybulk -#' +#' #' @export -#' +#' #' @inheritParams tidybulk #' #' @docType methods @@ -93,7 +93,7 @@ setMethod("tidybulk", "spec_tbl_df", .tidybulk) #' tidybulk #' #' @export -#' +#' #' @importFrom purrr map2 #' #' @inheritParams tidybulk @@ -136,7 +136,7 @@ setGeneric("as_SummarizedExperiment", function(.data, .sample = NULL, .transcript = NULL, .abundance = NULL) { - + # Fix NOTEs . = NULL @@ -256,9 +256,9 @@ setGeneric("as_SummarizedExperiment", function(.data, } #' as_SummarizedExperiment -#' +#' #' @export -#' +#' #' @inheritParams as_SummarizedExperiment #' #' @docType methods @@ -269,9 +269,9 @@ setGeneric("as_SummarizedExperiment", function(.data, setMethod("as_SummarizedExperiment", "spec_tbl_df", .as_SummarizedExperiment) #' as_SummarizedExperiment -#' +#' #' @export -#' +#' #' @inheritParams as_SummarizedExperiment #' #' @docType methods @@ -282,7 +282,7 @@ setMethod("as_SummarizedExperiment", "spec_tbl_df", .as_SummarizedExperiment) setMethod("as_SummarizedExperiment", "tbl_df", .as_SummarizedExperiment) #' as_SummarizedExperiment -#' +#' #' @export #' #' @inheritParams as_SummarizedExperiment @@ -332,9 +332,9 @@ setGeneric("tidybulk_SAM_BAM", function(file_names, genome = "hg38", ...) standardGeneric("tidybulk_SAM_BAM")) #' tidybulk_SAM_BAM -#' +#' #' @export -#' +#' #' @inheritParams tidybulk_SAM_BAM-methods #' #' @docType methods @@ -421,10 +421,10 @@ setGeneric("scale_abundance", function(.data, # DEPRECATED reference_selection_function = NULL) { - + # Fix NOTEs . = NULL - + # Get column names .sample = enquo(.sample) .transcript = enquo(.transcript) @@ -529,9 +529,9 @@ setGeneric("scale_abundance", function(.data, #' scale_abundance -#' +#' #' @export -#' +#' #' @inheritParams scale_abundance #' #' @docType methods @@ -542,9 +542,9 @@ setGeneric("scale_abundance", function(.data, setMethod("scale_abundance", "spec_tbl_df", .scale_abundance) #' scale_abundance -#' +#' #' @export -#' +#' #' @inheritParams scale_abundance #' #' @docType methods @@ -555,9 +555,9 @@ setMethod("scale_abundance", "spec_tbl_df", .scale_abundance) setMethod("scale_abundance", "tbl_df", .scale_abundance) #' scale_abundance -#' +#' #' @export -#' +#' #' @inheritParams scale_abundance #' #' @docType methods @@ -632,10 +632,10 @@ setGeneric("quantile_normalise_abundance", function(.data, .subset_for_scaling = NULL, action = "add") { - + # Fix NOTEs . = NULL - + # Get column names .sample = enquo(.sample) .transcript = enquo(.transcript) @@ -644,12 +644,12 @@ setGeneric("quantile_normalise_abundance", function(.data, .sample = col_names$.sample .transcript = col_names$.transcript .abundance = col_names$.abundance - + .subset_for_scaling = enquo(.subset_for_scaling) - + # Set column name for value scaled value_scaled = as.symbol(sprintf("%s%s", quo_name(.abundance), scaled_string)) - + # Check if package is installed, otherwise install if (find.package("limma", quiet = TRUE) %>% length %>% equals(0)) { @@ -658,22 +658,22 @@ setGeneric("quantile_normalise_abundance", function(.data, install.packages("BiocManager", repos = "https://cloud.r-project.org") BiocManager::install("limma", ask = FALSE) } - + # Reformat input data set .data_norm <- .data %>% - + # Rename dplyr::select(!!.sample,!!.transcript,!!.abundance) %>% - + # Set samples and genes as factors - dplyr::mutate(!!.sample := factor(!!.sample),!!.transcript := factor(!!.transcript)) |> - pivot_wider(names_from = !!.sample, values_from = !!.abundance) |> - as_matrix(rownames=!!.transcript) |> - limma::normalizeQuantiles() |> - as_tibble(rownames = quo_name(.transcript)) |> - pivot_longer(-!!.transcript, names_to = quo_name(.sample), values_to = quo_names(value_scaled)) |> - + dplyr::mutate(!!.sample := factor(!!.sample),!!.transcript := factor(!!.transcript)) |> + pivot_wider(names_from = !!.sample, values_from = !!.abundance) |> + as_matrix(rownames=!!.transcript) |> + limma::normalizeQuantiles() |> + as_tibble(rownames = quo_name(.transcript)) |> + pivot_longer(-!!.transcript, names_to = quo_name(.sample), values_to = quo_names(value_scaled)) |> + # Attach column internals add_tt_columns( @@ -682,20 +682,20 @@ setGeneric("quantile_normalise_abundance", function(.data, !!.abundance, !!(function(x, v) enquo(v))(x,!!value_scaled) ) - - + + if (action %in% c( "add", "get")){ - + .data %>% - + left_join(.data_norm, by= join_by(!!.sample, !!.transcript)) %>% # Attach attributes - reattach_internals(.data_norm) |> - + reattach_internals(.data_norm) |> + # Add methods - memorise_methods_used(c("quantile")) - + memorise_methods_used(c("quantile")) + } else if (action == "only") .data_norm else @@ -705,9 +705,9 @@ setGeneric("quantile_normalise_abundance", function(.data, } #' quantile_normalise_abundance -#' +#' #' @export -#' +#' #' @inheritParams quantile_normalise_abundance #' #' @docType methods @@ -718,9 +718,9 @@ setGeneric("quantile_normalise_abundance", function(.data, setMethod("quantile_normalise_abundance", "spec_tbl_df", .quantile_normalise_abundance) #' quantile_normalise_abundance -#' +#' #' @export -#' +#' #' @inheritParams quantile_normalise_abundance #' #' @docType methods @@ -731,9 +731,9 @@ setMethod("quantile_normalise_abundance", "spec_tbl_df", .quantile_normalise_abu setMethod("quantile_normalise_abundance", "tbl_df", .quantile_normalise_abundance) #' quantile_normalise_abundance -#' +#' #' @export -#' +#' #' @inheritParams quantile_normalise_abundance #' #' @docType methods @@ -833,7 +833,7 @@ setGeneric("cluster_elements", function(.data, # Fix NOTEs . = NULL - + # DEPRECATION OF log_transform if (is_present(log_transform) & !is.null(log_transform)) { @@ -1084,10 +1084,10 @@ setGeneric("reduce_dimensions", function(.data, ) { - + # Fix NOTEs . = NULL - + # DEPRECATION OF log_transform if (is_present(log_transform) & !is.null(log_transform)) { @@ -1312,10 +1312,10 @@ setGeneric("rotate_dimensions", function(.data, dimension_2_column_rotated = NULL, action = "add") { - + # Fix NOTEs . = NULL - + # Get column names .element = enquo(.element) col_names = get_elements(.data, .element) @@ -1548,7 +1548,7 @@ setGeneric("remove_redundancy", function(.data, # Fix NOTEs . = NULL - + # DEPRECATION OF log_transform if (is_present(log_transform) & !is.null(log_transform)) { @@ -1645,16 +1645,19 @@ setMethod("remove_redundancy", "tidybulk", .remove_redundancy) #' @name adjust_abundance #' #' @param .data A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) -#' @param .formula A formula with no response variable, representing the desired linear model where the first covariate is the factor of interest and the second covariate is the unwanted variation (of the kind ~ factor_of_interest + batch) +#' @param .factor_unwanted A tidy select, e.g. column names without double quotation. c(batch, country) These are the factor that we want to adjust for, including unwanted batcheffect, and unwanted biological effects. +#' @param .factor_of_interest A tidy select, e.g. column names without double quotation. c(treatment) These are the factor that we want to preserve. #' @param .sample The name of the sample column #' @param .transcript The name of the transcript/gene column #' @param .abundance The name of the transcript/gene abundance column +#' @param method A character string. Methods include combat_seq (default), combat and limma_remove_batch_effect. #' -#' @param transform A function that will tranform the counts, by default it is log1p for RNA sequencing data, but for avoinding tranformation you can use identity -#' @param inverse_transform A function that is the inverse of transform (e.g. expm1 is inverse of log1p). This is needed to tranform back the counts after analysis. #' @param action A character string. Whether to join the new information to the input tbl (add), or just get the non-redundant tbl with the new information (get). #' @param ... Further parameters passed to the function sva::ComBat #' +#' @param .formula DEPRECATED - A formula with no response variable, representing the desired linear model where the first covariate is the factor of interest and the second covariate is the unwanted variation (of the kind ~ factor_of_interest + batch) +#' @param transform DEPRECATED - A function that will tranform the counts, by default it is log1p for RNA sequencing data, but for avoinding tranformation you can use identity +#' @param inverse_transform DEPRECATED - A function that is the inverse of transform (e.g. expm1 is inverse of log1p). This is needed to tranform back the counts after analysis. #' @param log_transform DEPRECATED - A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data) #' #' @details This function adjusts the abundance for (known) unwanted variation. @@ -1676,9 +1679,9 @@ setMethod("remove_redundancy", "tidybulk", .remove_redundancy) #' cm$batch = 0 #' cm$batch[colnames(cm) %in% c("SRR1740035", "SRR1740043")] = 1 #' -#' cm |> -#' identify_abundant() |> -#' adjust_abundance( ~ condition + batch ) +#' cm |> +#' identify_abundant() |> +#' adjust_abundance( .factor_unwanted = batch, .factor_of_interest = condition, method="combat" ) #' #' #' @docType methods @@ -1687,62 +1690,114 @@ setMethod("remove_redundancy", "tidybulk", .remove_redundancy) #' #' setGeneric("adjust_abundance", function(.data, - .formula, + + # DEPRECATED + .formula = NULL, + .factor_unwanted =NULL, + .factor_of_interest = NULL, .sample = NULL, .transcript = NULL, .abundance = NULL, - transform = log1p, - inverse_transform = expm1, + method = "combat_seq", action = "add", ..., # DEPRECATED - log_transform = NULL + log_transform = NULL, + transform = NULL, + inverse_transform = NULL + ) standardGeneric("adjust_abundance")) # Set internal .adjust_abundance = function(.data, - .formula, + + # DEPRECATED + .formula = NULL, + .factor_unwanted = NULL, + .factor_of_interest = NULL, .sample = NULL, .transcript = NULL, .abundance = NULL, - transform = log1p, - inverse_transform = expm1, + method = "combat_seq", action = "add", ..., # DEPRECATED - log_transform = NULL) + log_transform = NULL, + transform = NULL, + inverse_transform = NULL + ) { # Fix NOTEs . = NULL - + + # Get column names + .sample = enquo(.sample) + .transcript = enquo(.transcript) + col_names = get_sample_transcript(.data, .sample, .transcript) + .sample = col_names$.sample + .transcript = col_names$.transcript + # DEPRECATION OF log_transform if (is_present(log_transform) & !is.null(log_transform)) { # Signal the deprecation to the user deprecate_warn("1.7.4", "tidybulk::test_differential_abundance(log_transform = )", details = "The argument log_transform is now deprecated, please use transform.") - if(log_transform == TRUE){ + if(log_transform){ transform = log1p inverse_transform = expm1 } } - # Get column names - .sample = enquo(.sample) - .transcript = enquo(.transcript) - col_names = get_sample_transcript(.data, .sample, .transcript) - .sample = col_names$.sample - .transcript = col_names$.transcript + # DEPRECATION OF log_transform + if ( + (is_present(transform) & !is.null(transform)) | + is_present(inverse_transform) & !is.null(inverse_transform) + ) { + + # Signal the deprecation to the user + deprecate_warn("1.11.6", "tidybulk::test_differential_abundance(transform = )", details = "The argument transform and inverse_transform is now deprecated, please use method argument instead specifying \"combat\", \"combat_seq\" or \"limma_remove_batch_effect\".") + + } + + # DEPRECATION OF .formula + if (is_present(.formula) & !is.null(.formula)) { + + # Signal the deprecation to the user + deprecate_warn("1.11.6", "tidybulk::test_differential_abundance(.formula = )", details = "The argument .formula is now deprecated, please use factor_unwanted and factor_of_interest. Using the formula, the first factor is of interest and the second is unwanted") + + # Check that .formula includes at least two covariates + if (parse_formula(.formula) %>% length %>% st(2)) + stop( + "The .formula must contain two covariates, the first being the factor of interest, the second being the factor of unwanted variation" + ) + + # Check that .formula includes no more than two covariates at the moment + if (parse_formula(.formula) %>% length %>% gt(3)) + warning("tidybulk says: Only the second covariate in the .formula is adjusted for") + + + .factor_of_interest = quo(!!as.symbol(parse_formula(.formula)[1])) + .factor_unwanted = quo(!!as.symbol(parse_formula(.formula)[2])) + + } else { + + .factor_of_interest = enquo(.factor_of_interest) + .factor_unwanted = enquo(.factor_unwanted) + } + + # Get scaled abundance if present, otherwise get abundance (if present get scaled one) .abundance = enquo(.abundance) %>% - when(!quo_is_symbol(.) ~ get_abundance_norm_if_exists(.data, .)$.abundance, ~ (.)) + when(!quo_is_symbol(.) ~ + get_abundance_norm_if_exists(.data, .)$.abundance, ~ (.)) # Validate data frame if(do_validate()) { @@ -1764,12 +1819,12 @@ setGeneric("adjust_abundance", function(.data, ) |> get_adjusted_counts_for_unwanted_variation_bulk( - .formula, + .factor_unwanted = !!.factor_unwanted, + .factor_of_interest = !!.factor_of_interest, .sample = !!.sample, .transcript = !!.transcript, .abundance = !!.abundance, - transform = transform, - inverse_transform = inverse_transform, + method = method, ... ) @@ -1911,10 +1966,10 @@ setGeneric("aggregate_duplicates", function(.data, .abundance = NULL, aggregation_function = sum, keep_integer = TRUE) { - + # Fix NOTEs . = NULL - + # Make col names # Get column names .sample = enquo(.sample) @@ -2054,10 +2109,10 @@ setGeneric("deconvolve_cellularity", function(.data, prefix = "", action = "add", ...) { - + # Fix NOTEs . = NULL - + # Get column names .sample = enquo(.sample) .transcript = enquo(.transcript) @@ -2180,10 +2235,10 @@ setMethod("deconvolve_cellularity", symbol_to_entrez = function(.data, .transcript = NULL, .sample = NULL) { - + # Fix NOTEs . = NULL - + # Get column names .transcript = enquo(.transcript) .sample = enquo(.sample) @@ -2252,7 +2307,7 @@ setGeneric("describe_transcript", function(.data, # Fix NOTEs . = NULL - + # Get column names .transcript = enquo(.transcript) col_names = get_transcript(.data, .transcript) @@ -2377,7 +2432,7 @@ setMethod("describe_transcript", "tidybulk", .describe_transcript) #' #' @examples #' -#' +#' #' #' # This function was designed for data.frame #' # Convert from SummarizedExperiment for this example. It is NOT reccomended. @@ -2401,10 +2456,10 @@ setGeneric("ensembl_to_symbol", function(.data, .ensembl, action = "add") { - + # Fix NOTEs . = NULL - + # Make col names .ensembl = enquo(.ensembl) @@ -2486,6 +2541,7 @@ setMethod("ensembl_to_symbol", "tidybulk", .ensembl_to_symbol) #' @param .sample The name of the sample column #' @param .transcript The name of the transcript/gene column #' @param .abundance The name of the transcript/gene abundance column +#' #' @param contrasts This parameter takes the format of the contrast parameter of the method of choice. For edgeR and limma-voom is a character vector. For DESeq2 is a list including a character vector of length three. The first covariate is the one the model is tested against (e.g., ~ factor_of_interest) #' @param method A string character. Either "edgeR_quasi_likelihood" (i.e., QLF), "edgeR_likelihood_ratio" (i.e., LRT), "edger_robust_likelihood_ratio", "DESeq2", "limma_voom", "limma_voom_sample_weights" #' @param test_above_log2_fold_change A positive real value. This works for edgeR and limma_voom methods. It uses the `treat` function, which tests that the difference in abundance is bigger than this threshold rather than zero \url{https://pubmed.ncbi.nlm.nih.gov/19176553}. @@ -2577,15 +2633,15 @@ setMethod("ensembl_to_symbol", "tidybulk", .ensembl_to_symbol) #' test_differential_abundance( ~ condition, method="deseq2", #' fitType="local", #' test_above_log2_fold_change=4 ) -#' -#' # Use random intercept and random effect models -#' +#' +#' # Use random intercept and random effect models +#' #' se_mini[1:50,] |> -#' identify_abundant(factor_of_interest = condition) |> +#' identify_abundant(factor_of_interest = condition) |> #' test_differential_abundance( #' ~ condition + (1 + condition | time), #' method = "glmmseq_lme4", cores = 1 -#' ) +#' ) #' #' # confirm that lfcThreshold was used #' \dontrun{ @@ -2594,7 +2650,7 @@ setMethod("ensembl_to_symbol", "tidybulk", .ensembl_to_symbol) #' DESeq2::DESeqResults() |> #' DESeq2::plotMA() #' } -#' +#' #' # The function `test_differential_abundance` operates with contrasts too #' #' my_se_mini |> @@ -2654,10 +2710,10 @@ setGeneric("test_differential_abundance", function(.data, .contrasts = NULL ) { - + # Fix NOTEs . = NULL - + # Get column names .sample = enquo(.sample) .transcript = enquo(.transcript) @@ -2766,13 +2822,13 @@ such as batch effects (if applicable) in the formula. .abundance = !!.abundance, .contrasts = contrasts, method = method, - test_above_log2_fold_change = test_above_log2_fold_change, + test_above_log2_fold_change = test_above_log2_fold_change, scaling_method = scaling_method, omit_contrast_in_colnames = omit_contrast_in_colnames, prefix = prefix, ... ), - + # glmmseq tolower(method) %in% c("glmmseq_lme4", "glmmseq_glmmTMB") ~ get_differential_transcript_abundance_glmmSeq( ., @@ -2782,7 +2838,7 @@ such as batch effects (if applicable) in the formula. .abundance = !!.abundance, .contrasts = contrasts, method = method, - test_above_log2_fold_change = test_above_log2_fold_change, + test_above_log2_fold_change = test_above_log2_fold_change, scaling_method = scaling_method, omit_contrast_in_colnames = omit_contrast_in_colnames, prefix = prefix, @@ -2929,7 +2985,7 @@ setGeneric("keep_variable", function(.data, # Fix NOTEs . = NULL - + # DEPRECATION OF log_transform if (is_present(log_transform) & !is.null(log_transform)) { @@ -3057,7 +3113,7 @@ setGeneric("identify_abundant", function(.data, # Fix NOTEs . = NULL - + factor_of_interest = enquo(factor_of_interest) # Get column names @@ -3073,18 +3129,18 @@ setGeneric("identify_abundant", function(.data, stop("The parameter minimum_counts must be > 0") if (minimum_proportion < 0 | minimum_proportion > 1) stop("The parameter minimum_proportion must be between 0 and 1") - - + + # Validate data frame if(do_validate()) { validation(.data, !!.sample, !!.transcript, !!.abundance) warning_if_data_is_not_rectangular(.data, !!.sample, !!.transcript, !!.abundance) } - + if( ".abundant" %in% colnames(.data) ) return( .data |> reattach_internals(.data) ) - + # Check if package is installed, otherwise install if (find.package("edgeR", quiet = TRUE) %>% length %>% equals(0)) { message("Installing edgeR needed for differential transcript abundance analyses") @@ -3092,73 +3148,73 @@ setGeneric("identify_abundant", function(.data, install.packages("BiocManager", repos = "https://cloud.r-project.org") BiocManager::install("edgeR", ask = FALSE) } - + # If character fail if( !is.null(factor_of_interest) && !factor_of_interest |> quo_is_null() && !factor_of_interest |> quo_is_symbolic() ) stop("tidybulk says: factor_of_interest must be symbolic (i.e. column name/s not surrounded by single or double quotes) and not a character.") - - + + if( - !is.null(factor_of_interest) && + !is.null(factor_of_interest) && ( enquo(factor_of_interest) |> quo_is_symbolic() | is.character(factor_of_interest) ) ){ - + # DEPRECATION OF symbolic factor_of_interest - # factor_of_interest must be a character now because we identified - # a edge case for which if the column name is the same as an existing function, - # such as time the column name would not be registered as such but would be + # factor_of_interest must be a character now because we identified + # a edge case for which if the column name is the same as an existing function, + # such as time the column name would not be registered as such but would be # registered as that function - + # # Signal the deprecation to the user # warning( - # "The `factor_of_interest` argument of `test_differential_abundance() is changed as of tidybulk 1.11.5", + # "The `factor_of_interest` argument of `test_differential_abundance() is changed as of tidybulk 1.11.5", # details = "The argument factor_of_interest must now be a character array. This because we identified a edge case for which if the column name is the same as an existing function, such as time the column name would not be registered as such but would be registered as that function" # ) - + factor_of_interest = factor_of_interest |> enquo() |> quo_names() - + # If is numeric ERROR if( .data |> - select(factor_of_interest) |> + select(factor_of_interest) |> lapply(class) |> - unlist() |> + unlist() |> as.character()%in% c("numeric", "integer", "double") |> any() - ) + ) stop("tidybulk says: The factor(s) of interest must not include continuous variables (e.g., integer,numeric, double).") - - string_factor_of_interest = + + string_factor_of_interest = .data %>% - select(!!.sample, factor_of_interest) |> + select(!!.sample, factor_of_interest) |> distinct() |> arrange(!!.sample) |> select(factor_of_interest) |> pull(1) - - + + } else { string_factor_of_interest = NULL } - + gene_to_exclude = .data %>% select(!!.sample,!!.transcript, !!.abundance) |> spread(!!.sample, !!.abundance) |> - + # Drop if transcript have missing value drop_na() %>% - + # If I don't have any transcript with all samples give meaningful error when( nrow(.) == 0 ~ stop("tidybulk says: you don't have any transcript that is in all samples. Please consider using impute_missing_abundance."), ~ (.) ) %>% - + # Call edgeR as_matrix(rownames = !!.transcript) |> edgeR::filterByExpr( @@ -3169,13 +3225,13 @@ setGeneric("identify_abundant", function(.data, not() |> which() |> names() - + .data |> dplyr::mutate(.abundant := (!!.transcript %in% gene_to_exclude) |> not()) |> - + # Attach attributes reattach_internals(.data) - + } #' keep_abundant @@ -3272,10 +3328,10 @@ setGeneric("keep_abundant", function(.data, minimum_counts = 10, minimum_proportion = 0.7) { - + # Fix NOTEs . = NULL - + # Get column names .sample = enquo(.sample) .transcript = enquo(.transcript) @@ -3468,7 +3524,7 @@ setGeneric("test_gene_enrichment", function(.data, # Fix NOTEs . = NULL - + # DEPRECATION OF reference function if (is_present(method) & !is.null(method)) { @@ -3617,7 +3673,7 @@ setMethod("test_gene_enrichment", #' @examples #' #' print("Not run for build time.") -#' +#' #' #se_mini = aggregate_duplicates(tidybulk::se_mini, .transcript = entrez) #' #df_entrez = mutate(df_entrez, do_test = feature %in% c("TNFRSF4", "PLCH2", "PADI4", "PAX7")) #' @@ -3786,7 +3842,7 @@ setMethod("test_gene_overrepresentation", #' @examples #' #' print("Not run for build time.") -#' +#' #' \dontrun{ #' #' df_entrez = tidybulk::se_mini @@ -4135,7 +4191,7 @@ setMethod("pivot_transcript", #' @examples #' #' print("Not run for build time.") -#' +#' #' # tidybulk::se_mini |> fill_missing_abundance( fill_with = 0) #' #' @@ -4275,10 +4331,10 @@ setGeneric("impute_missing_abundance", function(.data, suffix = "", force_scaling = FALSE) { - + # Fix NOTEs . = NULL - + # Get column names .sample = enquo(.sample) .transcript = enquo(.transcript) @@ -4354,7 +4410,7 @@ setMethod("impute_missing_abundance", "tidybulk", .impute_missing_abundance) #' #' @importFrom rlang enquo #' @importFrom stringr str_detect -#' +#' #' @name test_differential_cellularity #' #' @param .data A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) @@ -4448,10 +4504,10 @@ setGeneric("test_differential_cellularity", function(.data, significance_threshold = 0.05, ...) { - + # Fix NOTEs . = NULL - + # Get column names .sample = enquo(.sample) .transcript = enquo(.transcript) @@ -4591,10 +4647,10 @@ setGeneric("test_stratification_cellularity", function(.data, reference = X_cibersort, ...) { - + # Fix NOTEs . = NULL - + # Get column names .sample = enquo(.sample) .transcript = enquo(.transcript) @@ -4694,7 +4750,7 @@ setGeneric("get_bibliography", function(.data) # Fix NOTEs . = NULL - + default_methods = c("tidybulk", "tidyverse") # If there is not attributes parameter @@ -4759,8 +4815,8 @@ setMethod("get_bibliography", #' Get matrix from tibble #' #' -#' -#' +#' +#' #' @importFrom magrittr set_rownames #' @importFrom rlang quo_is_null #' @@ -4779,10 +4835,10 @@ setMethod("get_bibliography", as_matrix <- function(tbl, rownames = NULL, do_check = TRUE) { - + # Fix NOTEs . = NULL - + rownames = enquo(rownames) tbl %>% diff --git a/R/methods_SE.R b/R/methods_SE.R index 8d4af0ff..babbdd1a 100755 --- a/R/methods_SE.R +++ b/R/methods_SE.R @@ -725,6 +725,8 @@ setMethod("remove_redundancy", #' remove_redundancy #' @inheritParams remove_redundancy #' +#' @importFrom rlang quo +#' #' @docType methods #' @rdname remove_redundancy-methods #' @@ -736,14 +738,30 @@ setMethod("remove_redundancy", .adjust_abundance_se = function(.data, - .formula, - transform = log1p, - inverse_transform = expm1, - ...) { + + # DEPRECATED + .formula = NULL, + + .factor_unwanted = NULL, + .factor_of_interest = NULL, + + .abundance = NULL, + + method = "combat_seq", + + + ..., + + # DEPRECATED + transform = NULL, + inverse_transform = NULL + ) { # Fix NOTEs . = NULL + .abundance = enquo(.abundance) + # Check if package is installed, otherwise install if (find.package("sva", quiet = TRUE) %>% length %>% equals(0)) { message("Installing sva - Combat needed for adjustment for unwanted variation") @@ -753,66 +771,142 @@ setMethod("remove_redundancy", } - # Check that .formula includes at least two covariates - if (parse_formula(.formula) %>% length %>% st(2)) - stop( - "The .formula must contain two covariates, the first being the factor of interest, the second being the factor of unwanted variation" - ) + # DEPRECATION OF log_transform + if ( + (is_present(transform) & !is.null(transform)) | + is_present(inverse_transform) & !is.null(inverse_transform) + ) { + + # Signal the deprecation to the user + deprecate_warn("1.11.6", "tidybulk::test_differential_abundance(transform = )", details = "The argument transform and inverse_transform is now deprecated, please use method argument instead specifying \"combat\" or \"combat_seq\".") + + } + + # Set column name for value scaled + value_adjusted = get_assay_scaled_if_exists_SE(.data) %>% paste0(adjusted_string) + + + # DEPRECATION OF .formula + if (is_present(.formula) & !is.null(.formula)) { + + # Signal the deprecation to the user + deprecate_warn("1.11.6", "tidybulk::test_differential_abundance(.formula = )", details = "The argument .formula is now deprecated, please use factor_unwanted and factor_of_interest. Using the formula, the first factor is of interest and the second is unwanted") + + # Check that .formula includes at least two covariates + if (parse_formula(.formula) %>% length %>% st(2)) + stop( + "The .formula must contain two covariates, the first being the factor of interest, the second being the factor of unwanted variation" + ) - # Check that .formula includes no more than two covariates at the moment - if (parse_formula(.formula) %>% length %>% gt(3)) - warning("tidybulk says: Only the second covariate in the .formula is adjusted for, at the moment") + # Check that .formula includes no more than two covariates at the moment + if (parse_formula(.formula) %>% length %>% gt(3)) + warning("tidybulk says: Only the second covariate in the .formula is adjusted for") + + + .factor_of_interest = quo(!!as.symbol(parse_formula(.formula)[1])) + .factor_unwanted = quo(!!as.symbol(parse_formula(.formula)[2])) + + } else { + + .factor_of_interest = enquo(.factor_of_interest) + .factor_unwanted = enquo(.factor_unwanted) + } # Create design matrix - design = - model.matrix(object = as.formula("~" %>% paste0(parse_formula(.formula)[1])), - # get first argument of the .formula - data = colData(.data)) + design = + model.matrix( + object = as.formula(sprintf("~ %s", colData(.data) |> as_tibble() |> select(!!.factor_of_interest) |> colnames() |> str_c(collapse = '+'))), + # get first argument of the .formula + data = colData(.data) + ) - # Maybe not needed and causing trouble if more columns that in the formula - # %>% - #set_colnames(c("(Intercept)", parse_formula(.formula)[1])) + my_batch = colData(.data) |> as_tibble() |> select(!!.factor_unwanted) - my_batch = colData(.data)[, parse_formula(.formula)[2]] + # If no assay is specified take first + my_assay = ifelse( + quo_is_symbol(.abundance), + quo_name(.abundance), + get_assay_scaled_if_exists_SE(.data) + ) + if(tolower(method) == "combat"){ - my_assay = - .data %>% + my_assay_adjusted = + .data |> + assay(my_assay) |> # Check if log transform is needed + log1p() %>% + # Add little noise to avoid all 0s for a covariate that would error combat code (not statistics that would be fine) + `+` (rnorm(length(.), 0, 0.000001)) - assays() %>% - as.list() %>% - .[[get_assay_scaled_if_exists_SE(.data)]] %>% - # Check if log transform is needed - transform() + for(i in colnames(my_batch)){ + my_assay_adjusted = + my_assay_adjusted %>% + # Run combat + sva::ComBat( + batch = my_batch[,i] |> pull(1), + mod = design, + prior.plots = FALSE, + ... + ) + } - # Set column name for value scaled - value_adjusted = get_assay_scaled_if_exists_SE(.data) %>% paste0(adjusted_string) + # Tranfrom back + my_assay_adjusted = + my_assay_adjusted %>% + expm1() |> + apply(2, pmax, 0) + + } + else if(tolower(method) == "combat_seq"){ - # Calculate adjusted assay - my_assay_adjusted = + my_assay_adjusted = + .data %>% - my_assay %>% + assay(my_assay) + + for(i in ncol(my_batch)){ - # Add little noise to avoid all 0s for a covariate that would error combat code (not statistics that would be fine) - `+` (rnorm(length(.), 0, 0.000001)) %>% + my_assay_adjusted = + my_assay_adjusted |> + sva::ComBat_seq(batch = my_batch[,i] |> pull(1), + covar_mod = design, + full_mod=TRUE, + ...) + } - # Run combat - sva::ComBat(batch = my_batch, - mod = design, - prior.plots = FALSE, - ...) %>% + } + else if(tolower(method) == "limma_remove_batch_effect") { + + unwanted_covariate_matrix = + model.matrix( + object = as.formula(sprintf("~ 0 + %s", colData(.data) |> as_tibble() |> select(!!.factor_unwanted) |> colnames() |> str_c(collapse = '+'))), + # get first argument of the .formula + data = colData(.data) + ) + + my_assay_adjusted = + .data |> + assay(my_assay) |> + edgeR::cpm(log = T) |> + limma::removeBatchEffect( + design = design, + covariates = unwanted_covariate_matrix, + ... + ) |> + expm1() |> + apply(2, pmax, 0) - # Check if log transform needs to be reverted - inverse_transform() + } else { + stop("tidybulk says: the argument \"method\" must be combat_seq, combat, or limma_remove_batch_effect") + } # Add the assay - my_assay_scaled = - list(my_assay_adjusted) %>% setNames(value_adjusted) + my_assay_scaled = list(my_assay_adjusted) %>% setNames(value_adjusted) assays(.data) = assays(.data) %>% c(my_assay_scaled) diff --git a/README.Rmd b/README.Rmd index 99188990..d2d73b41 100755 --- a/README.Rmd +++ b/README.Rmd @@ -487,7 +487,7 @@ We may want to adjust `counts` for (known) unwanted variation. `adjust_abundance TidyTranscriptomics ```{r adjust, cache=TRUE, message=FALSE, warning=FALSE, results='hide'} counts_SE.norm.adj = - counts_SE.norm %>% adjust_abundance( ~ factor_of_interest + batch) + counts_SE.norm %>% adjust_abundance( .factor_unwanted = batch, .factor_of_interest = factor_of_interest) ``` diff --git a/dev/.gitignore b/dev/.gitignore new file mode 100644 index 00000000..f274536d --- /dev/null +++ b/dev/.gitignore @@ -0,0 +1,3 @@ +._quant.sf +example_data +quant.sf diff --git a/man/adjust_abundance-methods.Rd b/man/adjust_abundance-methods.Rd index a6acab2e..8ee70a84 100644 --- a/man/adjust_abundance-methods.Rd +++ b/man/adjust_abundance-methods.Rd @@ -12,86 +12,108 @@ \usage{ adjust_abundance( .data, - .formula, + .formula = NULL, + .factor_unwanted = NULL, + .factor_of_interest = NULL, .sample = NULL, .transcript = NULL, .abundance = NULL, - transform = log1p, - inverse_transform = expm1, + method = "combat_seq", action = "add", ..., - log_transform = NULL + log_transform = NULL, + transform = NULL, + inverse_transform = NULL ) \S4method{adjust_abundance}{spec_tbl_df}( .data, - .formula, + .formula = NULL, + .factor_unwanted = NULL, + .factor_of_interest = NULL, .sample = NULL, .transcript = NULL, .abundance = NULL, - transform = log1p, - inverse_transform = expm1, + method = "combat_seq", action = "add", ..., - log_transform = NULL + log_transform = NULL, + transform = NULL, + inverse_transform = NULL ) \S4method{adjust_abundance}{tbl_df}( .data, - .formula, + .formula = NULL, + .factor_unwanted = NULL, + .factor_of_interest = NULL, .sample = NULL, .transcript = NULL, .abundance = NULL, - transform = log1p, - inverse_transform = expm1, + method = "combat_seq", action = "add", ..., - log_transform = NULL + log_transform = NULL, + transform = NULL, + inverse_transform = NULL ) \S4method{adjust_abundance}{tidybulk}( .data, - .formula, + .formula = NULL, + .factor_unwanted = NULL, + .factor_of_interest = NULL, .sample = NULL, .transcript = NULL, .abundance = NULL, - transform = log1p, - inverse_transform = expm1, + method = "combat_seq", action = "add", ..., - log_transform = NULL + log_transform = NULL, + transform = NULL, + inverse_transform = NULL ) \S4method{adjust_abundance}{SummarizedExperiment}( .data, - .formula, + .formula = NULL, + .factor_unwanted = NULL, + .factor_of_interest = NULL, .sample = NULL, .transcript = NULL, .abundance = NULL, - transform = log1p, - inverse_transform = expm1, + method = "combat_seq", action = "add", ..., - log_transform = NULL + log_transform = NULL, + transform = NULL, + inverse_transform = NULL ) \S4method{adjust_abundance}{RangedSummarizedExperiment}( .data, - .formula, + .formula = NULL, + .factor_unwanted = NULL, + .factor_of_interest = NULL, .sample = NULL, .transcript = NULL, .abundance = NULL, - transform = log1p, - inverse_transform = expm1, + method = "combat_seq", action = "add", ..., - log_transform = NULL + log_transform = NULL, + transform = NULL, + inverse_transform = NULL ) } \arguments{ \item{.data}{A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment))} -\item{.formula}{A formula with no response variable, representing the desired linear model where the first covariate is the factor of interest and the second covariate is the unwanted variation (of the kind ~ factor_of_interest + batch)} +\item{.formula}{DEPRECATED - A formula with no response variable, representing the desired linear model where the first covariate is the factor of interest and the second covariate is the unwanted variation (of the kind ~ factor_of_interest + batch)} + +\item{.factor_unwanted}{A tidy select, e.g. column names without double quotation. c(batch, country) These are the factor that we want to adjust for, including unwanted batcheffect, and unwanted biological effects.} + +\item{.factor_of_interest}{A tidy select, e.g. column names without double quotation. c(treatment) These are the factor that we want to preserve.} \item{.sample}{The name of the sample column} @@ -99,15 +121,17 @@ adjust_abundance( \item{.abundance}{The name of the transcript/gene abundance column} -\item{transform}{A function that will tranform the counts, by default it is log1p for RNA sequencing data, but for avoinding tranformation you can use identity} - -\item{inverse_transform}{A function that is the inverse of transform (e.g. expm1 is inverse of log1p). This is needed to tranform back the counts after analysis.} +\item{method}{A character string. Methods include combat_seq (default), combat and limma_remove_batch_effect.} \item{action}{A character string. Whether to join the new information to the input tbl (add), or just get the non-redundant tbl with the new information (get).} \item{...}{Further parameters passed to the function sva::ComBat} \item{log_transform}{DEPRECATED - A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data)} + +\item{transform}{DEPRECATED - A function that will tranform the counts, by default it is log1p for RNA sequencing data, but for avoinding tranformation you can use identity} + +\item{inverse_transform}{DEPRECATED - A function that is the inverse of transform (e.g. expm1 is inverse of log1p). This is needed to tranform back the counts after analysis.} } \value{ A consistent object (to the input) with additional columns for the adjusted counts as `_adjusted` @@ -142,9 +166,9 @@ cm = tidybulk::se_mini cm$batch = 0 cm$batch[colnames(cm) \%in\% c("SRR1740035", "SRR1740043")] = 1 - cm |> - identify_abundant() |> - adjust_abundance( ~ condition + batch ) +cm |> +identify_abundant() |> +adjust_abundance( .factor_unwanted = batch, .factor_of_interest = condition, method="combat" ) } diff --git a/man/test_differential_abundance-methods.Rd b/man/test_differential_abundance-methods.Rd index b935ba85..9ad6e796 100755 --- a/man/test_differential_abundance-methods.Rd +++ b/man/test_differential_abundance-methods.Rd @@ -248,15 +248,15 @@ res <- my_se_mini |> test_differential_abundance( ~ condition, method="deseq2", fitType="local", test_above_log2_fold_change=4 ) - -# Use random intercept and random effect models + +# Use random intercept and random effect models se_mini[1:50,] |> - identify_abundant(factor_of_interest = condition) |> + identify_abundant(factor_of_interest = condition) |> test_differential_abundance( ~ condition + (1 + condition | time), method = "glmmseq_lme4", cores = 1 - ) + ) # confirm that lfcThreshold was used \dontrun{ diff --git a/tests/testthat/test-bulk_methods.R b/tests/testthat/test-bulk_methods.R index b1e107f8..b7c16da6 100755 --- a/tests/testthat/test-bulk_methods.R +++ b/tests/testthat/test-bulk_methods.R @@ -153,7 +153,7 @@ test_that("Scaled counts - subset",{ }) test_that("quantile normalisation",{ - + res = input_df |> quantile_normalise_abundance( @@ -162,15 +162,15 @@ test_that("quantile normalisation",{ .abundance = c, action = "get" ) - - res |> - pull(c_scaled) |> - head() |> + + res |> + pull(c_scaled) |> + head() |> expect_equal( c(1052.8 , 63.8 ,7229.0 , 2.9, 2143.6, 9272.8), tolerance=1e-3 ) - + }) @@ -822,14 +822,16 @@ test_that("DESeq2 differential trancript abundance - no object",{ idx <- which(res_lfc$padj < .1) expect_true(all(abs(res_lfc$log2FoldChange[idx]) > 1)) - + }) test_that("differential trancript abundance - random effects",{ - - input_df |> - identify_abundant(a, b, c, factor_of_interest = condition) |> - mutate(time = time |> stringr::str_replace_all(" ", "_")) |> + + input_df |> + identify_abundant(a, b, c, factor_of_interest = condition) |> + mutate(time = time |> stringr::str_replace_all(" ", "_")) |> + + filter(b %in% c("ABCB4" , "ABCB9" , "ACAP1", "ACHE", "ACP5" , "ADAM28"))|> test_differential_abundance( ~ condition + (1 + condition | time), .sample = a, @@ -837,14 +839,14 @@ test_that("differential trancript abundance - random effects",{ .abundance = c, method = "glmmseq_lme4", action="only" - ) |> - pull(P_condition_adjusted) |> - head(4) |> + ) |> + pull(P_condition_adjusted) |> + head(4) |> expect_equal( - c(0.1441371, 0.1066183, 0.1370748, 0.2065339), + c(0.02381167, 0.01097209, 0.01056741, 0.02381167), tolerance=1e-3 ) - + }) @@ -942,12 +944,14 @@ test_that("Only adjusted counts - no object",{ cm$batch[cm$a %in% c("SRR1740035", "SRR1740043")] = 1 res = + cm |> + identify_abundant(a, b, c) |> adjust_abundance( - cm |> identify_abundant(a, b, c), ~ condition + batch, .sample = a, .transcript = b, .abundance = c, + method = "combat", action="only" ) @@ -957,10 +961,7 @@ test_that("Only adjusted counts - no object",{ tolerance=1e-3 ) - expect_equal( - ncol(res), - 3 - ) + expect_equal( ncol(res), 3 ) }) @@ -973,10 +974,11 @@ test_that("Get adjusted counts - no object",{ res = adjust_abundance( cm |> identify_abundant(a, b, c), - ~ condition + batch, + ~ condition + batch, .sample = a, .transcript = b, .abundance = c, + method = "combat", action="get" ) @@ -1006,6 +1008,7 @@ test_that("Add adjusted counts - no object",{ .sample = a, .transcript = b, .abundance = c, + method = "combat", action="add" ) @@ -1023,6 +1026,42 @@ test_that("Add adjusted counts - no object",{ }) +test_that("Get adjusted counts multiple factors - SummarizedExperiment",{ + + cm = input_df + cm$batch = 0 + cm$batch[cm$a %in% c("SRR1740035", "SRR1740043")] = 1 + + cm = + cm |> + identify_abundant(a, b, c) + cm$c = as.integer(cm$c ) + + res = + cm |> + identify_abundant(a, b, c) |> + adjust_abundance(.factor_unwanted = c(time, batch), + .factor_of_interest = dead, + .sample = a, + .transcript = b, + .abundance = c, + method = "combat_seq", + shrink.disp = TRUE, + shrink = TRUE, + gene.subset.n = 100 + ) + + # Usa MCMC so it is stokastic + # expect_equal( + # unique(res$`c_adjusted`)[c(1, 2, 3, 5)], + # c(NA, 5059, 1942, 2702), + # tolerance=1e-3 + # ) + + + +}) + test_that("Only cluster lables based on Kmeans - no object",{ res = diff --git a/tests/testthat/test-bulk_methods_SummarizedExperiment.R b/tests/testthat/test-bulk_methods_SummarizedExperiment.R index bf56bf63..c9946657 100755 --- a/tests/testthat/test-bulk_methods_SummarizedExperiment.R +++ b/tests/testthat/test-bulk_methods_SummarizedExperiment.R @@ -66,9 +66,9 @@ test_that("tidybulk SummarizedExperiment normalisation",{ test_that("quantile normalisation",{ - + res = se_mini |> quantile_normalise_abundance() - + res_tibble = input_df |> quantile_normalise_abundance( @@ -77,15 +77,15 @@ test_that("quantile normalisation",{ .abundance = c, action = "get" ) - - - SummarizedExperiment::assay(res, "count_scaled")["ABCB9","SRR1740035"] |> + + + SummarizedExperiment::assay(res, "count_scaled")["ABCB9","SRR1740035"] |> expect_equal( - res_tibble |> - filter(a=="SRR1740035" & b=="ABCB9") |> + res_tibble |> + filter(a=="SRR1740035" & b=="ABCB9") |> pull(c_scaled) ) - + }) @@ -175,9 +175,11 @@ test_that("Get adjusted counts - SummarizedExperiment",{ cm$batch[colnames(cm) %in% c("SRR1740035", "SRR1740043")] = 1 res = + cm |> + identify_abundant() |> adjust_abundance( - cm |> identify_abundant(), - ~ condition + batch + ~ condition + batch, + method = "combat" ) expect_equal(nrow(res), 527 ) @@ -185,6 +187,36 @@ test_that("Get adjusted counts - SummarizedExperiment",{ expect_equal( names(SummarizedExperiment::assays(res)), c("count" ,"count_adjusted") ) +}) + +test_that("Get adjusted counts multiple factors - SummarizedExperiment",{ + + cm = se_mini + cm$batch = 0 + cm$batch[colnames(cm) %in% c("SRR1740035", "SRR1740043")] = 1 + cm = + cm |> + identify_abundant() |> + scale_abundance() + cm@assays@data$count_scaled = apply(cm@assays@data$count_scaled, 2, as.integer) + + + res = + cm |> + adjust_abundance(.factor_unwanted = c(batch), + .factor_of_interest = time, + .abundance = count_scaled, + method = "combat_seq", + shrink.disp = TRUE, + shrink = TRUE, + gene.subset.n = 100 + ) + + expect_equal(nrow(res), 527 ) + + expect_equal( names(SummarizedExperiment::assays(res)), c("count" , "count_scaled", "count_scaled_adjusted") ) + + }) test_that("Aggregate duplicated transcript - SummarizedExperiment",{ @@ -306,29 +338,29 @@ test_that("differential trancript abundance - SummarizedExperiment",{ test_that("differential trancript abundance - SummarizedExperiment - alternative .abundance",{ - + assays(se_mini) = list(counts = assay(se_mini), bla = assay(se_mini)) - - - res = se_mini |> - identify_abundant(factor_of_interest = condition) |> + + + res = se_mini |> + identify_abundant(factor_of_interest = condition) |> test_differential_abundance( ~ condition , .abundance = bla) - + w = match( c("CLEC7A" , "FAM198B", "FCN1" , "HK3" ), rownames(res) ) - + # Quasi likelihood res_tibble = test_differential_abundance( input_df |> identify_abundant(a, b, c, factor_of_interest = condition), ~ condition , a, b, c ) - + expect_equal( res@elementMetadata[w,]$logFC, c(-11.58385, -13.53406, -12.58204, -12.19271), tolerance=1e-3 ) - + expect_equal( res@elementMetadata[w,]$logFC, res_tibble |> @@ -338,25 +370,25 @@ test_that("differential trancript abundance - SummarizedExperiment - alternative dplyr::pull(logFC), tolerance=1e-3 ) - + # Likelihood ratio res2 = test_differential_abundance( se_mini |> identify_abundant(factor_of_interest = condition), ~ condition, .abundance = bla, method = "edgeR_likelihood_ratio" ) - + res2_tibble = test_differential_abundance( input_df |> identify_abundant(a, b, c, factor_of_interest = condition), ~ condition , a, b, c, method = "edgeR_likelihood_ratio" ) - + expect_equal( res2@elementMetadata[w,]$logFC, c(-11.57989, -13.53476, -12.57969, -12.19303), tolerance=1e-3 ) - + expect_equal( res2@elementMetadata[w,]$logFC, res2_tibble |> @@ -366,7 +398,7 @@ test_that("differential trancript abundance - SummarizedExperiment - alternative dplyr::pull(logFC), tolerance=1e-3 ) - + # Treat se_mini |> identify_abundant( factor_of_interest = condition) |> @@ -381,7 +413,7 @@ test_that("differential trancript abundance - SummarizedExperiment - alternative filter(FDR<0.05) |> nrow() |> expect_equal(169) - + }) @@ -426,24 +458,24 @@ test_that("Voom with treat method",{ }) test_that("differential trancript abundance - random effects SE",{ - - res = - se_mini |> - identify_abundant(factor_of_interest = condition) |> - #mutate(time = time |> stringr::str_replace_all(" ", "_")) |> + + res = + se_mini[1:10,] |> + identify_abundant(factor_of_interest = condition) |> + #mutate(time = time |> stringr::str_replace_all(" ", "_")) |> test_differential_abundance( ~ condition + (1 + condition | time), method = "glmmseq_lme4" - ) - - rowData(res)[,"P_condition_adjusted"] |> - head(4) |> + ) + + rowData(res)[,"P_condition_adjusted"] |> + head(4) |> expect_equal( - c(0.1441371, 0.1066183, 0.1370748, NA), - tolerance=1e-3 + c(0.1065866, 0.1109067, 0.1116562 , NA), + tolerance=1e-2 ) - - + + }) @@ -486,10 +518,10 @@ test_that("impute missing",{ impute_missing_abundance( ~ condition ) list_SE = SummarizedExperiment::assays(res) |> as.list() - + list_SE[[1]]["TNFRSF4", "SRR1740034"] |> expect_equal(6) - + expect_equal( nrow(res)*ncol(res), nrow(input_df) ) diff --git a/vignettes/introduction.Rmd b/vignettes/introduction.Rmd index fdaed6dc..490b7578 100755 --- a/vignettes/introduction.Rmd +++ b/vignettes/introduction.Rmd @@ -475,13 +475,14 @@ se_mini.de = ## Adjust `counts` -We may want to adjust `counts` for (known) unwanted variation. `adjust_abundance` takes as arguments a tibble, column names (as symbols; for `sample`, `transcript` and `count`) and a formula representing the desired linear model where the first covariate is the factor of interest and the second covariate is the unwanted variation, and returns a tibble with additional columns for the adjusted counts as `_adjusted`. At the moment just an unwanted covariated is allowed at a time. +We may want to adjust `counts` for (known) unwanted variation. `adjust_abundance` takes as arguments a tibble, column names (as symbols; for `sample`, `transcript` and `count`) and a formula representing the desired linear model where the first covariate is the factor of interest and the second covariate is the unwanted variation, and returns a tibble with additional columns for the adjusted counts as `_adjusted`. At the moment just an unwanted covariates is allowed at a time.
TidyTranscriptomics ```{r adjust, message=FALSE, warning=FALSE, results='hide'} se_mini.norm.adj = - se_mini.norm |> adjust_abundance( ~ condition + time) + se_mini.norm |> adjust_abundance( .factor_unwanted = time, .factor_of_interest = condition, method="combat") + ```