diff --git a/NAMESPACE b/NAMESPACE
index 9783b5dd8..0aab1f2c2 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -65,6 +65,7 @@ importFrom(CompoundDb,Spectra)
importFrom(CompoundDb,createCompDb)
importFrom(CompoundDb,make_metadata)
importFrom(MetaboAnnotation,CompareSpectraParam)
+importFrom(MetaboAnnotation,MatchForwardReverseParam)
importFrom(MetaboAnnotation,matchSpectra)
importFrom(MsBackendMgf,MsBackendMgf)
importFrom(MsBackendMgf,readMgf)
diff --git a/R/parse_cli_params.R b/R/parse_cli_params.R
index 91c69ce32..2ae85b6cc 100644
--- a/R/parse_cli_params.R
+++ b/R/parse_cli_params.R
@@ -11,6 +11,9 @@ parse_cli_params <- function() {
log_debug("Loading command line arguments")
if (exists("arguments")) {
+ if (!is.null(arguments$approx)) {
+ params$approx <- arguments$approx
+ }
if (!is.null(arguments$biological)) {
params$weights$biological <- as.numeric(arguments$biological)
}
@@ -38,6 +41,9 @@ parse_cli_params <- function() {
if (!is.null(arguments$extension)) {
params$extension <- arguments$extension
}
+ if (!is.null(arguments$fast)) {
+ params$fast <- arguments$fast
+ }
if (!is.null(arguments$features)) {
params$taxa <- arguments$features
params$feature_name <- arguments$features
diff --git a/R/process_spectra.R b/R/process_spectra.R
index 7de8d1708..283bb2604 100644
--- a/R/process_spectra.R
+++ b/R/process_spectra.R
@@ -5,7 +5,7 @@
#' @details It takes two files as input. A query file that will be matched against a library file.
#' Multiple comparison distances are available
#' ('gnps', 'navdist','ndotproduct','neuclidean', 'nspectraangle' (See MsCoreUtils for details)).
-#' Number of matched peaks and their ratio are also available.
+#' Number of matched peaks and their ratio are also available (See MatchForwardReverseParam for details).
#' Parallel processing is also made available.
#'
#' @param input Query file containing spectra. Currently an '.mgf' file
@@ -27,7 +27,7 @@
#'
#' @importFrom BiocParallel MulticoreParam SerialParam
#' @importFrom dplyr full_join left_join select
-#' @importFrom MetaboAnnotation CompareSpectraParam matchSpectra
+#' @importFrom MetaboAnnotation CompareSpectraParam MatchForwardReverseParam matchSpectra
#' @importFrom MsCoreUtils gnps navdist ndotproduct neuclidean nspectraangle
#' @importFrom Spectra filterIntensity
#'
@@ -43,7 +43,9 @@ process_spectra <- function(input = params$input,
rpeaks = params$ms$peaks$ratio,
condition = params$ms$condition,
qutoff = params$qutoff,
- parallel = params$parallel) {
+ parallel = params$parallel,
+ fast = params$fast,
+ approx = params$approx) {
stopifnot("Your input file does not exist." = file.exists(input))
if (file.exists(library |>
gsub(
@@ -80,11 +82,15 @@ process_spectra <- function(input = params$input,
spectra <- input |>
import_spectra()
- log_debug("Loading spectral library (Can take long)")
+ log_debug("Loading spectral library")
## COMMENT (AR): TODO Try HDF5 formatted?
spectral_library <- library |>
import_spectra()
+ ## COMMENT (AR): Temporary dumb fix
+ spectral_library$precursorMz <-
+ spectral_library$exactmass + spectral_library$precursorCharge * 1.00728
+
sim_fun <- switch(
EXPR = method,
"gnps" = MsCoreUtils::gnps,
@@ -94,52 +100,32 @@ process_spectra <- function(input = params$input,
"nspectraangle" = MsCoreUtils::nspectraangle
)
- params_sim <- MetaboAnnotation::CompareSpectraParam(
- ppm = ppm,
- tolerance = dalton,
- FUN = sim_fun,
- requirePrecursor = FALSE,
- THRESHFUN = function(x) {
- which(x >= threshold)
- },
- BPPARAM = par
- )
-
- params_left <- MetaboAnnotation::CompareSpectraParam(
- ppm = ppm,
- tolerance = dalton,
- FUN = function(x, y, ...) {
- nrow(x)
- },
- type = "left",
- requirePrecursor = FALSE,
- BPPARAM = par
- )
-
- params_right <- MetaboAnnotation::CompareSpectraParam(
- ppm = ppm,
- tolerance = dalton,
- FUN = function(x, y, ...) {
- nrow(x)
- },
- type = "right",
- requirePrecursor = FALSE,
- BPPARAM = par
- )
-
- params_inner <- MetaboAnnotation::CompareSpectraParam(
- ppm = ppm,
- tolerance = dalton,
- FUN = function(x, y, ...) {
- nrow(x)
- },
- type = "inner",
- requirePrecursor = FALSE,
- THRESHFUN = function(x) {
- which(x >= npeaks)
- },
- BPPARAM = par
- )
+ ## COMMENT (AR): TODO export as param, same for precursor
+ if (fast) {
+ params_sim <- MetaboAnnotation::CompareSpectraParam(
+ ppm = ppm,
+ tolerance = dalton,
+ MAPFUN = Spectra::joinPeaksGnps,
+ FUN = sim_fun,
+ requirePrecursor = ifelse(test = approx, yes = FALSE, no = TRUE),
+ THRESHFUN = function(x) {
+ which(x >= threshold)
+ },
+ BPPARAM = par
+ )
+ } else {
+ params_sim <- MetaboAnnotation::MatchForwardReverseParam(
+ ppm = ppm,
+ tolerance = dalton,
+ MAPFUN = Spectra::joinPeaksGnps,
+ FUN = sim_fun,
+ requirePrecursor = ifelse(test = approx, yes = FALSE, no = TRUE),
+ THRESHFUN = function(x) {
+ which(x >= threshold)
+ },
+ BPPARAM = par
+ )
+ }
## COMMENT (AR): TODO Maybe implement some safety sanitization of the spectra?
## Can be very slow otherwise
@@ -149,59 +135,28 @@ process_spectra <- function(input = params$input,
Spectra::filterIntensity(intensity = c(qutoff, Inf))
log_debug("Performing spectral comparison")
- matches_sim <- MetaboAnnotation::matchSpectra(
- query = spectra,
- target = spectral_library,
- param = params_sim
- )
- df_similarity <- matches_sim@matches |>
- dplyr::rename(msms_score = score)
-
- log_debug("Counting fragments")
- peaks_query <- MetaboAnnotation::matchSpectra(
- query = spectra,
- target = spectral_library,
- param = params_left
+ log_debug(
+ "If you do not need the number/ratio of matched peaks,
+ computation can be much faster by setting the parameter fast to TRUE"
)
- peaks_target <- MetaboAnnotation::matchSpectra(
- query = spectra,
- target = spectral_library,
- param = params_right
+ log_debug(
+ "If you need it, the score threshold greatly impacts speed.
+ A higher threshold will lead to faster results."
)
- df_count <- peaks_query@matches |>
- dplyr::rename(peaks_query = score) |>
- dplyr::full_join(peaks_target@matches |>
- dplyr::rename(peaks_target = score)) |>
- dplyr::rowwise() |>
- dplyr::mutate(peaks = max(peaks_query, peaks_target)) |>
- dplyr::select(-peaks_query, -peaks_target)
-
- log_debug("Performing fragments comparison")
- matches_inner <- MetaboAnnotation::matchSpectra(
+ matches_sim <- MetaboAnnotation::matchSpectra(
query = spectra,
target = spectral_library,
- param = params_inner
+ param = params_sim
)
- df_peaks <- matches_inner@matches |>
- dplyr::rename(peaks_abs = score) |>
- dplyr::inner_join(df_count) |>
- dplyr::mutate(peaks_rel = peaks_abs / peaks) |>
- dplyr::select(-peaks)
-
- if (condition == "AND") {
- df_peaks <- df_peaks |>
- dplyr::filter(peaks_rel >= rpeaks)
- }
- log_debug("Formatting results")
- df_full <- df_similarity |>
- dplyr::full_join(df_peaks)
+ df_full <- matches_sim@matches |>
+ dplyr::rename(msms_score = score)
if (condition == "AND") {
df_full <- df_full |>
dplyr::filter(msms_score >= threshold &
- peaks_abs >= npeaks &
- peaks_rel >= rpeaks)
+ matched_peaks_count >= npeaks &
+ presence_ratio >= rpeaks)
}
## COMMENT AR: Not doing it because of thresholding
@@ -223,13 +178,20 @@ process_spectra <- function(input = params$input,
exact_mass
)
+ spectra_ids <- spectra |>
+ extract_spectra() |>
+ dplyr::mutate(query_idx = dplyr::row_number()) |>
+ dplyr::distinct(feature_id = acquisitionNum, query_idx)
+
df_final <- df_full |>
dplyr::left_join(df_meta) |>
- dplyr::rename(feature_id = query_idx) |>
- dplyr::select(-target_idx)
+ dplyr::left_join(spectra_ids) |>
+ dplyr::select(-target_idx, -query_idx) |>
+ dplyr::relocate(feature_id, .before = msms_score)
log_debug(
- nrow(df_final),
+ nrow(df_final |>
+ dplyr::distinct(short_inchikey)),
"Candidates were annotated on",
nrow(df_final |>
dplyr::distinct(feature_id)),
diff --git a/config/default/process_spectra.yaml b/config/default/process_spectra.yaml
index 205b7585b..2491faf65 100644
--- a/config/default/process_spectra.yaml
+++ b/config/default/process_spectra.yaml
@@ -49,3 +49,9 @@ qutoff: 0
#' Execute processes in parallel? BOOLEAN
parallel: yes
+
+#' Execute process faster without peaks matching? BOOLEAN
+fast: yes
+
+#' Execute approximative matching without precursor matching? BOOLEAN
+approx: no
diff --git a/inst/scripts/docopt/process_spectra.txt b/inst/scripts/docopt/process_spectra.txt
index 450e8b0fd..0d29fed2f 100644
--- a/inst/scripts/docopt/process_spectra.txt
+++ b/inst/scripts/docopt/process_spectra.txt
@@ -1,13 +1,15 @@
You can use this script with the following example:
-Rscript inst/scripts/process_spectra.R --input 'data/source/example.mgf' --library 'data/interim/spectra/isdb_lotus_pos.mgf' --output 'data/interim/annotations/example_isdb.tsv.gz' --dalton 0.02 --ppm 10 --method 'gnps' --threshold 0.2 --npeaks 6 --rpeaks 0.2 --condition 'OR' --qutoff 0 --parallel true
+Rscript inst/scripts/process_spectra.R --input 'data/source/example.mgf' --library 'data/interim/spectra/isdb_lotus_pos.mgf' --output 'data/interim/annotations/example_isdb.tsv.gz' --dalton 0.02 --ppm 10 --method 'gnps' --threshold 0.2 --npeaks 6 --rpeaks 0.2 --condition 'OR' --qutoff 0 --parallel true --fast true --approx false
Usage:
- process_spectra.R [-i | --input=] [-l | --library=] [-o | --output=