Skip to content

Commit

Permalink
spectral matching update, see rformassspectrometry/MetaboAnnotation#93
Browse files Browse the repository at this point in the history
  • Loading branch information
Adafede committed Feb 7, 2023
1 parent b07d356 commit fafb575
Show file tree
Hide file tree
Showing 7 changed files with 81 additions and 100 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ importFrom(CompoundDb,Spectra)
importFrom(CompoundDb,createCompDb)
importFrom(CompoundDb,make_metadata)
importFrom(MetaboAnnotation,CompareSpectraParam)
importFrom(MetaboAnnotation,MatchForwardReverseParam)
importFrom(MetaboAnnotation,matchSpectra)
importFrom(MsBackendMgf,MsBackendMgf)
importFrom(MsBackendMgf,readMgf)
Expand Down
6 changes: 6 additions & 0 deletions R/parse_cli_params.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ parse_cli_params <- function() {
log_debug("Loading command line arguments")

if (exists("arguments")) {
if (!is.null(arguments$approx)) {
params$approx <- arguments$approx
}
if (!is.null(arguments$biological)) {
params$weights$biological <- as.numeric(arguments$biological)
}
Expand Down Expand Up @@ -38,6 +41,9 @@ parse_cli_params <- function() {
if (!is.null(arguments$extension)) {
params$extension <- arguments$extension
}
if (!is.null(arguments$fast)) {
params$fast <- arguments$fast
}
if (!is.null(arguments$features)) {
params$taxa <- arguments$features
params$feature_name <- arguments$features
Expand Down
154 changes: 58 additions & 96 deletions R/process_spectra.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#' @details It takes two files as input. A query file that will be matched against a library file.
#' Multiple comparison distances are available
#' ('gnps', 'navdist','ndotproduct','neuclidean', 'nspectraangle' (See MsCoreUtils for details)).
#' Number of matched peaks and their ratio are also available.
#' Number of matched peaks and their ratio are also available (See MatchForwardReverseParam for details).
#' Parallel processing is also made available.
#'
#' @param input Query file containing spectra. Currently an '.mgf' file
Expand All @@ -27,7 +27,7 @@
#'
#' @importFrom BiocParallel MulticoreParam SerialParam
#' @importFrom dplyr full_join left_join select
#' @importFrom MetaboAnnotation CompareSpectraParam matchSpectra
#' @importFrom MetaboAnnotation CompareSpectraParam MatchForwardReverseParam matchSpectra
#' @importFrom MsCoreUtils gnps navdist ndotproduct neuclidean nspectraangle
#' @importFrom Spectra filterIntensity
#'
Expand All @@ -43,7 +43,9 @@ process_spectra <- function(input = params$input,
rpeaks = params$ms$peaks$ratio,
condition = params$ms$condition,
qutoff = params$qutoff,
parallel = params$parallel) {
parallel = params$parallel,
fast = params$fast,
approx = params$approx) {
stopifnot("Your input file does not exist." = file.exists(input))
if (file.exists(library |>
gsub(
Expand Down Expand Up @@ -80,11 +82,15 @@ process_spectra <- function(input = params$input,
spectra <- input |>
import_spectra()

log_debug("Loading spectral library (Can take long)")
log_debug("Loading spectral library")
## COMMENT (AR): TODO Try HDF5 formatted?
spectral_library <- library |>
import_spectra()

## COMMENT (AR): Temporary dumb fix
spectral_library$precursorMz <-
spectral_library$exactmass + spectral_library$precursorCharge * 1.00728

sim_fun <- switch(
EXPR = method,
"gnps" = MsCoreUtils::gnps,
Expand All @@ -94,52 +100,32 @@ process_spectra <- function(input = params$input,
"nspectraangle" = MsCoreUtils::nspectraangle
)

params_sim <- MetaboAnnotation::CompareSpectraParam(
ppm = ppm,
tolerance = dalton,
FUN = sim_fun,
requirePrecursor = FALSE,
THRESHFUN = function(x) {
which(x >= threshold)
},
BPPARAM = par
)

params_left <- MetaboAnnotation::CompareSpectraParam(
ppm = ppm,
tolerance = dalton,
FUN = function(x, y, ...) {
nrow(x)
},
type = "left",
requirePrecursor = FALSE,
BPPARAM = par
)

params_right <- MetaboAnnotation::CompareSpectraParam(
ppm = ppm,
tolerance = dalton,
FUN = function(x, y, ...) {
nrow(x)
},
type = "right",
requirePrecursor = FALSE,
BPPARAM = par
)

params_inner <- MetaboAnnotation::CompareSpectraParam(
ppm = ppm,
tolerance = dalton,
FUN = function(x, y, ...) {
nrow(x)
},
type = "inner",
requirePrecursor = FALSE,
THRESHFUN = function(x) {
which(x >= npeaks)
},
BPPARAM = par
)
## COMMENT (AR): TODO export as param, same for precursor
if (fast) {
params_sim <- MetaboAnnotation::CompareSpectraParam(
ppm = ppm,
tolerance = dalton,
MAPFUN = Spectra::joinPeaksGnps,
FUN = sim_fun,
requirePrecursor = ifelse(test = approx, yes = FALSE, no = TRUE),
THRESHFUN = function(x) {
which(x >= threshold)
},
BPPARAM = par
)
} else {
params_sim <- MetaboAnnotation::MatchForwardReverseParam(
ppm = ppm,
tolerance = dalton,
MAPFUN = Spectra::joinPeaksGnps,
FUN = sim_fun,
requirePrecursor = ifelse(test = approx, yes = FALSE, no = TRUE),
THRESHFUN = function(x) {
which(x >= threshold)
},
BPPARAM = par
)
}

## COMMENT (AR): TODO Maybe implement some safety sanitization of the spectra?
## Can be very slow otherwise
Expand All @@ -149,59 +135,28 @@ process_spectra <- function(input = params$input,
Spectra::filterIntensity(intensity = c(qutoff, Inf))

log_debug("Performing spectral comparison")
matches_sim <- MetaboAnnotation::matchSpectra(
query = spectra,
target = spectral_library,
param = params_sim
)
df_similarity <- matches_sim@matches |>
dplyr::rename(msms_score = score)

log_debug("Counting fragments")
peaks_query <- MetaboAnnotation::matchSpectra(
query = spectra,
target = spectral_library,
param = params_left
log_debug(
"If you do not need the number/ratio of matched peaks,
computation can be much faster by setting the parameter fast to TRUE"
)
peaks_target <- MetaboAnnotation::matchSpectra(
query = spectra,
target = spectral_library,
param = params_right
log_debug(
"If you need it, the score threshold greatly impacts speed.
A higher threshold will lead to faster results."
)
df_count <- peaks_query@matches |>
dplyr::rename(peaks_query = score) |>
dplyr::full_join(peaks_target@matches |>
dplyr::rename(peaks_target = score)) |>
dplyr::rowwise() |>
dplyr::mutate(peaks = max(peaks_query, peaks_target)) |>
dplyr::select(-peaks_query, -peaks_target)

log_debug("Performing fragments comparison")
matches_inner <- MetaboAnnotation::matchSpectra(
matches_sim <- MetaboAnnotation::matchSpectra(
query = spectra,
target = spectral_library,
param = params_inner
param = params_sim
)
df_peaks <- matches_inner@matches |>
dplyr::rename(peaks_abs = score) |>
dplyr::inner_join(df_count) |>
dplyr::mutate(peaks_rel = peaks_abs / peaks) |>
dplyr::select(-peaks)

if (condition == "AND") {
df_peaks <- df_peaks |>
dplyr::filter(peaks_rel >= rpeaks)
}

log_debug("Formatting results")
df_full <- df_similarity |>
dplyr::full_join(df_peaks)
df_full <- matches_sim@matches |>
dplyr::rename(msms_score = score)

if (condition == "AND") {
df_full <- df_full |>
dplyr::filter(msms_score >= threshold &
peaks_abs >= npeaks &
peaks_rel >= rpeaks)
matched_peaks_count >= npeaks &
presence_ratio >= rpeaks)
}

## COMMENT AR: Not doing it because of thresholding
Expand All @@ -223,13 +178,20 @@ process_spectra <- function(input = params$input,
exact_mass
)

spectra_ids <- spectra |>
extract_spectra() |>
dplyr::mutate(query_idx = dplyr::row_number()) |>
dplyr::distinct(feature_id = acquisitionNum, query_idx)

df_final <- df_full |>
dplyr::left_join(df_meta) |>
dplyr::rename(feature_id = query_idx) |>
dplyr::select(-target_idx)
dplyr::left_join(spectra_ids) |>
dplyr::select(-target_idx, -query_idx) |>
dplyr::relocate(feature_id, .before = msms_score)

log_debug(
nrow(df_final),
nrow(df_final |>
dplyr::distinct(short_inchikey)),
"Candidates were annotated on",
nrow(df_final |>
dplyr::distinct(feature_id)),
Expand Down
6 changes: 6 additions & 0 deletions config/default/process_spectra.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,9 @@ qutoff: 0

#' Execute processes in parallel? BOOLEAN
parallel: yes

#' Execute process faster without peaks matching? BOOLEAN
fast: yes

#' Execute approximative matching without precursor matching? BOOLEAN
approx: no
6 changes: 4 additions & 2 deletions inst/scripts/docopt/process_spectra.txt
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
You can use this script with the following example:
Rscript inst/scripts/process_spectra.R --input 'data/source/example.mgf' --library 'data/interim/spectra/isdb_lotus_pos.mgf' --output 'data/interim/annotations/example_isdb.tsv.gz' --dalton 0.02 --ppm 10 --method 'gnps' --threshold 0.2 --npeaks 6 --rpeaks 0.2 --condition 'OR' --qutoff 0 --parallel true
Rscript inst/scripts/process_spectra.R --input 'data/source/example.mgf' --library 'data/interim/spectra/isdb_lotus_pos.mgf' --output 'data/interim/annotations/example_isdb.tsv.gz' --dalton 0.02 --ppm 10 --method 'gnps' --threshold 0.2 --npeaks 6 --rpeaks 0.2 --condition 'OR' --qutoff 0 --parallel true --fast true --approx false

Usage:
process_spectra.R [-i | --input=<input>] [-l | --library=<library>] [-o | --output=<output>] [-d | --dalton=<dalton>] [-p | --ppm=<ppm>] [-m | --method=<method>] [-t | --threshold=<threshold>] [-c | --condition=<condition>] [-q | --qutoff=<qutoff>]
process_spectra.R [-i | --input=<input>] [-l | --library=<library>] [-o | --output=<output>] [-a | --approx=<approx>] [-d | --dalton=<dalton>] [-f | --fast=<fast>] [-p | --ppm=<ppm>] [-m | --method=<method>] [-t | --threshold=<threshold>] [-c | --condition=<condition>] [-q | --qutoff=<qutoff>]

Arguments:
-a --approx=<approx> Allow for matching without precursor match (Boolean)
-b --parallel=<parallel> Perform parallel processing (Boolean)
-c --condition=<condition> Condition used for thresholding (AND or OR)
-d --dalton=<dalton> Tolerance for matching. Should not exceed 0.05 Da.
-f --fast=<fast> Perform a faster version (Boolean)
-i --input=<input> Your isdb result file. Supports compressed files.
-l --library=<library> Your MS2 library file (MGF).
-o --output=<output> Path and filename for the output. If you specify .gz file will be compressed.
Expand Down
6 changes: 4 additions & 2 deletions man/process_spectra.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions tests/testthat/test_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ testthat::test_that("Whole process", {

## CLI arguments check
arguments <<- character()
arguments$approx <<- "x"
arguments$biological <<- "x"
arguments$column.name <<- "x"
arguments$complement <<- "x"
Expand All @@ -182,6 +183,7 @@ testthat::test_that("Whole process", {
arguments$directory <<- "x"
arguments$edges <<- "x"
arguments$extension <<- "x"
arguments$fast <<- "x"
arguments$features <<- "x"
arguments$filter <<- "x"
arguments$gnps <<- "x"
Expand Down

0 comments on commit fafb575

Please sign in to comment.