diff --git a/NAMESPACE b/NAMESPACE index 9783b5dd8..0aab1f2c2 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -65,6 +65,7 @@ importFrom(CompoundDb,Spectra) importFrom(CompoundDb,createCompDb) importFrom(CompoundDb,make_metadata) importFrom(MetaboAnnotation,CompareSpectraParam) +importFrom(MetaboAnnotation,MatchForwardReverseParam) importFrom(MetaboAnnotation,matchSpectra) importFrom(MsBackendMgf,MsBackendMgf) importFrom(MsBackendMgf,readMgf) diff --git a/R/parse_cli_params.R b/R/parse_cli_params.R index 91c69ce32..2ae85b6cc 100644 --- a/R/parse_cli_params.R +++ b/R/parse_cli_params.R @@ -11,6 +11,9 @@ parse_cli_params <- function() { log_debug("Loading command line arguments") if (exists("arguments")) { + if (!is.null(arguments$approx)) { + params$approx <- arguments$approx + } if (!is.null(arguments$biological)) { params$weights$biological <- as.numeric(arguments$biological) } @@ -38,6 +41,9 @@ parse_cli_params <- function() { if (!is.null(arguments$extension)) { params$extension <- arguments$extension } + if (!is.null(arguments$fast)) { + params$fast <- arguments$fast + } if (!is.null(arguments$features)) { params$taxa <- arguments$features params$feature_name <- arguments$features diff --git a/R/process_spectra.R b/R/process_spectra.R index 7de8d1708..283bb2604 100644 --- a/R/process_spectra.R +++ b/R/process_spectra.R @@ -5,7 +5,7 @@ #' @details It takes two files as input. A query file that will be matched against a library file. #' Multiple comparison distances are available #' ('gnps', 'navdist','ndotproduct','neuclidean', 'nspectraangle' (See MsCoreUtils for details)). -#' Number of matched peaks and their ratio are also available. +#' Number of matched peaks and their ratio are also available (See MatchForwardReverseParam for details). #' Parallel processing is also made available. #' #' @param input Query file containing spectra. Currently an '.mgf' file @@ -27,7 +27,7 @@ #' #' @importFrom BiocParallel MulticoreParam SerialParam #' @importFrom dplyr full_join left_join select -#' @importFrom MetaboAnnotation CompareSpectraParam matchSpectra +#' @importFrom MetaboAnnotation CompareSpectraParam MatchForwardReverseParam matchSpectra #' @importFrom MsCoreUtils gnps navdist ndotproduct neuclidean nspectraangle #' @importFrom Spectra filterIntensity #' @@ -43,7 +43,9 @@ process_spectra <- function(input = params$input, rpeaks = params$ms$peaks$ratio, condition = params$ms$condition, qutoff = params$qutoff, - parallel = params$parallel) { + parallel = params$parallel, + fast = params$fast, + approx = params$approx) { stopifnot("Your input file does not exist." = file.exists(input)) if (file.exists(library |> gsub( @@ -80,11 +82,15 @@ process_spectra <- function(input = params$input, spectra <- input |> import_spectra() - log_debug("Loading spectral library (Can take long)") + log_debug("Loading spectral library") ## COMMENT (AR): TODO Try HDF5 formatted? spectral_library <- library |> import_spectra() + ## COMMENT (AR): Temporary dumb fix + spectral_library$precursorMz <- + spectral_library$exactmass + spectral_library$precursorCharge * 1.00728 + sim_fun <- switch( EXPR = method, "gnps" = MsCoreUtils::gnps, @@ -94,52 +100,32 @@ process_spectra <- function(input = params$input, "nspectraangle" = MsCoreUtils::nspectraangle ) - params_sim <- MetaboAnnotation::CompareSpectraParam( - ppm = ppm, - tolerance = dalton, - FUN = sim_fun, - requirePrecursor = FALSE, - THRESHFUN = function(x) { - which(x >= threshold) - }, - BPPARAM = par - ) - - params_left <- MetaboAnnotation::CompareSpectraParam( - ppm = ppm, - tolerance = dalton, - FUN = function(x, y, ...) { - nrow(x) - }, - type = "left", - requirePrecursor = FALSE, - BPPARAM = par - ) - - params_right <- MetaboAnnotation::CompareSpectraParam( - ppm = ppm, - tolerance = dalton, - FUN = function(x, y, ...) { - nrow(x) - }, - type = "right", - requirePrecursor = FALSE, - BPPARAM = par - ) - - params_inner <- MetaboAnnotation::CompareSpectraParam( - ppm = ppm, - tolerance = dalton, - FUN = function(x, y, ...) { - nrow(x) - }, - type = "inner", - requirePrecursor = FALSE, - THRESHFUN = function(x) { - which(x >= npeaks) - }, - BPPARAM = par - ) + ## COMMENT (AR): TODO export as param, same for precursor + if (fast) { + params_sim <- MetaboAnnotation::CompareSpectraParam( + ppm = ppm, + tolerance = dalton, + MAPFUN = Spectra::joinPeaksGnps, + FUN = sim_fun, + requirePrecursor = ifelse(test = approx, yes = FALSE, no = TRUE), + THRESHFUN = function(x) { + which(x >= threshold) + }, + BPPARAM = par + ) + } else { + params_sim <- MetaboAnnotation::MatchForwardReverseParam( + ppm = ppm, + tolerance = dalton, + MAPFUN = Spectra::joinPeaksGnps, + FUN = sim_fun, + requirePrecursor = ifelse(test = approx, yes = FALSE, no = TRUE), + THRESHFUN = function(x) { + which(x >= threshold) + }, + BPPARAM = par + ) + } ## COMMENT (AR): TODO Maybe implement some safety sanitization of the spectra? ## Can be very slow otherwise @@ -149,59 +135,28 @@ process_spectra <- function(input = params$input, Spectra::filterIntensity(intensity = c(qutoff, Inf)) log_debug("Performing spectral comparison") - matches_sim <- MetaboAnnotation::matchSpectra( - query = spectra, - target = spectral_library, - param = params_sim - ) - df_similarity <- matches_sim@matches |> - dplyr::rename(msms_score = score) - - log_debug("Counting fragments") - peaks_query <- MetaboAnnotation::matchSpectra( - query = spectra, - target = spectral_library, - param = params_left + log_debug( + "If you do not need the number/ratio of matched peaks, + computation can be much faster by setting the parameter fast to TRUE" ) - peaks_target <- MetaboAnnotation::matchSpectra( - query = spectra, - target = spectral_library, - param = params_right + log_debug( + "If you need it, the score threshold greatly impacts speed. + A higher threshold will lead to faster results." ) - df_count <- peaks_query@matches |> - dplyr::rename(peaks_query = score) |> - dplyr::full_join(peaks_target@matches |> - dplyr::rename(peaks_target = score)) |> - dplyr::rowwise() |> - dplyr::mutate(peaks = max(peaks_query, peaks_target)) |> - dplyr::select(-peaks_query, -peaks_target) - - log_debug("Performing fragments comparison") - matches_inner <- MetaboAnnotation::matchSpectra( + matches_sim <- MetaboAnnotation::matchSpectra( query = spectra, target = spectral_library, - param = params_inner + param = params_sim ) - df_peaks <- matches_inner@matches |> - dplyr::rename(peaks_abs = score) |> - dplyr::inner_join(df_count) |> - dplyr::mutate(peaks_rel = peaks_abs / peaks) |> - dplyr::select(-peaks) - - if (condition == "AND") { - df_peaks <- df_peaks |> - dplyr::filter(peaks_rel >= rpeaks) - } - log_debug("Formatting results") - df_full <- df_similarity |> - dplyr::full_join(df_peaks) + df_full <- matches_sim@matches |> + dplyr::rename(msms_score = score) if (condition == "AND") { df_full <- df_full |> dplyr::filter(msms_score >= threshold & - peaks_abs >= npeaks & - peaks_rel >= rpeaks) + matched_peaks_count >= npeaks & + presence_ratio >= rpeaks) } ## COMMENT AR: Not doing it because of thresholding @@ -223,13 +178,20 @@ process_spectra <- function(input = params$input, exact_mass ) + spectra_ids <- spectra |> + extract_spectra() |> + dplyr::mutate(query_idx = dplyr::row_number()) |> + dplyr::distinct(feature_id = acquisitionNum, query_idx) + df_final <- df_full |> dplyr::left_join(df_meta) |> - dplyr::rename(feature_id = query_idx) |> - dplyr::select(-target_idx) + dplyr::left_join(spectra_ids) |> + dplyr::select(-target_idx, -query_idx) |> + dplyr::relocate(feature_id, .before = msms_score) log_debug( - nrow(df_final), + nrow(df_final |> + dplyr::distinct(short_inchikey)), "Candidates were annotated on", nrow(df_final |> dplyr::distinct(feature_id)), diff --git a/config/default/process_spectra.yaml b/config/default/process_spectra.yaml index 205b7585b..2491faf65 100644 --- a/config/default/process_spectra.yaml +++ b/config/default/process_spectra.yaml @@ -49,3 +49,9 @@ qutoff: 0 #' Execute processes in parallel? BOOLEAN parallel: yes + +#' Execute process faster without peaks matching? BOOLEAN +fast: yes + +#' Execute approximative matching without precursor matching? BOOLEAN +approx: no diff --git a/inst/scripts/docopt/process_spectra.txt b/inst/scripts/docopt/process_spectra.txt index 450e8b0fd..0d29fed2f 100644 --- a/inst/scripts/docopt/process_spectra.txt +++ b/inst/scripts/docopt/process_spectra.txt @@ -1,13 +1,15 @@ You can use this script with the following example: -Rscript inst/scripts/process_spectra.R --input 'data/source/example.mgf' --library 'data/interim/spectra/isdb_lotus_pos.mgf' --output 'data/interim/annotations/example_isdb.tsv.gz' --dalton 0.02 --ppm 10 --method 'gnps' --threshold 0.2 --npeaks 6 --rpeaks 0.2 --condition 'OR' --qutoff 0 --parallel true +Rscript inst/scripts/process_spectra.R --input 'data/source/example.mgf' --library 'data/interim/spectra/isdb_lotus_pos.mgf' --output 'data/interim/annotations/example_isdb.tsv.gz' --dalton 0.02 --ppm 10 --method 'gnps' --threshold 0.2 --npeaks 6 --rpeaks 0.2 --condition 'OR' --qutoff 0 --parallel true --fast true --approx false Usage: - process_spectra.R [-i | --input=] [-l | --library=] [-o | --output=] [-d | --dalton=] [-p | --ppm=] [-m | --method=] [-t | --threshold=] [-c | --condition=] [-q | --qutoff=] + process_spectra.R [-i | --input=] [-l | --library=] [-o | --output=] [-a | --approx=] [-d | --dalton=] [-f | --fast=] [-p | --ppm=] [-m | --method=] [-t | --threshold=] [-c | --condition=] [-q | --qutoff=] Arguments: + -a --approx= Allow for matching without precursor match (Boolean) -b --parallel= Perform parallel processing (Boolean) -c --condition= Condition used for thresholding (AND or OR) -d --dalton= Tolerance for matching. Should not exceed 0.05 Da. + -f --fast= Perform a faster version (Boolean) -i --input= Your isdb result file. Supports compressed files. -l --library= Your MS2 library file (MGF). -o --output= Path and filename for the output. If you specify .gz file will be compressed. diff --git a/man/process_spectra.Rd b/man/process_spectra.Rd index dc86bdc57..26d2da359 100644 --- a/man/process_spectra.Rd +++ b/man/process_spectra.Rd @@ -16,7 +16,9 @@ process_spectra( rpeaks = params$ms$peaks$ratio, condition = params$ms$condition, qutoff = params$qutoff, - parallel = params$parallel + parallel = params$parallel, + fast = params$fast, + approx = params$approx ) } \arguments{ @@ -51,7 +53,7 @@ This function performs spectral matching. It takes two files as input. A query file that will be matched against a library file. Multiple comparison distances are available ('gnps', 'navdist','ndotproduct','neuclidean', 'nspectraangle' (See MsCoreUtils for details)). -Number of matched peaks and their ratio are also available. +Number of matched peaks and their ratio are also available (See MatchForwardReverseParam for details). Parallel processing is also made available. } \examples{ diff --git a/tests/testthat/test_functions.R b/tests/testthat/test_functions.R index e62706ff5..b446686f2 100644 --- a/tests/testthat/test_functions.R +++ b/tests/testthat/test_functions.R @@ -173,6 +173,7 @@ testthat::test_that("Whole process", { ## CLI arguments check arguments <<- character() + arguments$approx <<- "x" arguments$biological <<- "x" arguments$column.name <<- "x" arguments$complement <<- "x" @@ -182,6 +183,7 @@ testthat::test_that("Whole process", { arguments$directory <<- "x" arguments$edges <<- "x" arguments$extension <<- "x" + arguments$fast <<- "x" arguments$features <<- "x" arguments$filter <<- "x" arguments$gnps <<- "x"