In [3]:
#! /usr/bin/Rscript

#' @title Download Spectral DBs as Spectra
#'
#' @description
#'
#' A summary file in a text format is saved in the input directory 
#' which stores the timing when each database is stored, e.g: GNPS is 
#' updated when there is a submission. This information is stored for 
#' reproducibility checks and versions of the databases.
#' If all is selected, then GNPS, HMDB, and MassBank all 
#' are downloaded with their URLs. If a specific database is selected only
#' that database is downloaded. Each database is stored as in the format 
#' it is downloadable from the database webpages.


#' @param input_dir is full directory where all MZML input files
#'
#' @param db is either one of the spectral libraries which can be 
#'        gnps, hmdb, mbank or all


#' @return
#' 
#' Spectral DB saved with the following name in the input_dir: 
#' gnps.rda, hmdb.rda, mbank.rda
#' summary file saved as summaryFile.txt which contains the 
#' timings and versions if available of saved databases
#'
#' @author Mahnoor Zulfiqar
#' 
#' @examples
#' 
#' download_specDB(input_dir = "/usr/project/", db = "all")


# ---------- Preparations ----------
# Load libraries
library("Spectra")
library("MsBackendMgf")
library("MsBackendMsp")
library("MsBackendHmdb")
library(rvest)
library(stringr)
library(xml2)

# ---------- Arguments and user variables ----------
args <- commandArgs(trailingOnly=TRUE)
print(args)

input_dir <- as.character(args[1])
db <- as.character(args[2])
error <- as.logical(args[3])


# ---------- download_specDB ----------

download_specDB <- function(input_dir, db = "all", error = TRUE){
    # Track Time 
    start_time <- Sys.time()

    # only input available as of now
    databases <- 'gnps, hmdb, mbank, all'
    
    # creat a summary file, open and store timings of download and version if possible
    summaryFile <- paste(input_dir, "summaryFile.txt", sep = "")
    file.create(summaryFile, recursive = TRUE)
    file.conn <- file(summaryFile)
    open(file.conn, open = "at")
            
    # gnps
    if (db == "all" || db =="gnps"){
        
        # Download file
        system(paste("wget -P", 
                     input_dir,
                     "https://gnps-external.ucsd.edu/gnpslibrary/ALL_GNPS.mgf",
                     sep =  " "))
        
        # load the spectra into MsBackendMgf
        gnps <- Spectra(paste(input_dir, "ALL_GNPS.mgf", sep = ''), source = MsBackendMgf())
        save(gnps, file = paste(input_dir,"gnps.rda", sep = ""))
        
        # delete the database in its format to free up space
        system(paste("rm", (paste(input_dir, "ALL_GNPS.mgf", sep = '')), sep = " "))
        
        writeLines(paste("GNPS saved at", Sys.time(), sep=" "),con=file.conn)
        
    }
    # hmdb
    if (db == "all" || db =="hmdb"){
        
        # extract HMDB Current version
        html <- read_html("https://hmdb.ca/downloads")
        strings <- html%>% html_elements("a") %>% html_text2()
        ls <- unique(strings)
        hmdb_curr_ver <- c()
        for (i in ls){
            if (grepl("Current", i)){
            hmdb_curr_ver<- c(i, hmdb_curr_ver)
            }
        }
        
        #Download file predicted MSMS spectra
        system(paste("wget - P", input_dir,
                     "https://hmdb.ca/system/downloads/current/spectral_data/spectra_xml/hmdb_predicted_msms_spectra.zip",
                     sep = " "))
        # unzip
        system(paste("jar xvf", input_dir, paste(input_dir, "hmdb_predicted_msms_spectra.zip", sep = ""), sep = " "))
        # load the spectra into MsBackendHMDB
        hmdb_pred <- Spectra(paste(input_dir, "hmdb_predicted_msms_spectra.xml", sep = ''), source = MsBackendHmdb())
        
        #Download file experimental MSMS spectra
        system(paste("wget - P", input_dir,
                     "https://hmdb.ca/system/downloads/current/spectral_data/spectra_xml/hmdb_experimental_msms_spectra.zip",
                     sep = " "))
        # unzip
        system(paste("jar xvf", input_dir, paste(input_dir, "hmdb_experimental_msms_spectra.zip", sep = ""), sep = " "))
        # load the spectra into MsBackendHMDB
        hmdb_exp <- Spectra(paste(input_dir, "hmdb_experimental_msms_spectra.xml", sep = ''), source = MsBackendHmdb())
        hmdb <- hmdb_pred + hmdb_exp
        save(hmdb, file = paste(input_dir,"hmdb.rda", sep = ""))
        
        # delete the database in its format to free up space
        system(paste("rm", (paste(input_dir, "hmdb_predicted_msms_spectra.xml", sep = '')), sep = " "))
        system(paste("rm", (paste(input_dir, "hmdb_experimental_msms_spectra.xml", sep = '')), sep = " "))
        
        
        writeLines(paste("HMDB saved at", Sys.time(), "with release version", hmdb_curr_ver, sep=" "),con=file.conn)
    }
    #mbank
    if (db == "all" || db =="mbank"){
        
        page <- read_html("https://github.com/MassBank/MassBank-data/releases")
        page %>%
            html_nodes("a") %>%       # find all links
            html_attr("href") %>%     # get the url
            str_subset("MassBank_NIST.msp") -> tmp # find those that have the name MassBank_NIST.msp
        
        #download file
        system(paste("wget -P", input_dir,
                     "https://github.com/", tmp[1], 
                     sep =  " "))
        
        mbank <- Spectra(paste(input_dir, "MassBank_NIST.msp", sep = ''), source = MsBackendMsp())
        save(mbank, file = paste(input_dir,"mbankNIST.rda", sep = ""))
        
        # delete the database in its format to free up space
        system(paste("rm", (paste(input_dir, "MassBank_NIST.msp", sep = '')), sep = " "))
        
        # obtain the month and year for the database release to add to summary
        res <- str_match(tmp[1], "download/\\s*(.*?)\\s*/MassBank_NIST")
        
        #writeLines(paste("MassBank saved at", Sys.time(), "with release version", res[,2], sep=" "),con=file.conn)
    }
    
    #wrong input error message
    else if (!grepl(db, databases, fixed = TRUE)){
        stop("Wrong db input. Following inputs apply: gnps, hmdb, mbank or all")
    }
    close(file.conn)
    #download_specDB(input_dir, db)
    end_time <- Sys.time()
    print(end_time - start_time)
}


download_specDB(input_dir, db, error = TRUE)


[1] "Error! No or not enough arguments given."


ERROR: Error in if (db == "all" || db == "gnps") {: Fehlender Wert, wo TRUE/FALSE nötig ist


In [62]:
library("Spectra")

Lade nötiges Paket: S4Vectors

Lade nötiges Paket: stats4

Lade nötiges Paket: BiocGenerics

Lade nötiges Paket: parallel


Attache Paket: ‘BiocGenerics’


Die folgenden Objekte sind maskiert von ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


Die folgenden Objekte sind maskiert von ‘package:stats’:

    IQR, mad, sd, var, xtabs


Die folgenden Objekte sind maskiert von ‘package:base’:

    Filter, Find, Map, Position, Reduce, anyDuplicated, append,
    as.data.frame, basename, cbind, colnames, dirname, do.call,
    duplicated, eval, evalq, get, grep, grepl, intersect, is.unsorted,
    lapply, mapply, match, mget, order, paste, pmax, pmax.int, pmin,
    pmin.int, rank, rbind, rownames, sapply, setdiff, sort, table,
    tapply, union, unique, unsplit, which.max, which.min



Attache Paket: ‘S4Vectors’


Die folgenden Objekte sind maskiert 

In [63]:
Spectra("/Users/mahnoorzulfiqar/Project_Scost/DS_201124_SC_full_PRM_neg_03.mzML")

“mzR has been built against a different Rcpp version (1.0.6)
than is installed on your system (1.0.7). This might lead to errors
when loading mzR. If you encounter such issues, please send a report,
including the output of sessionInfo() to the Bioc support forum at 
https://support.bioconductor.org/. For details see also
https://github.com/sneumann/mzR/wiki/mzR-Rcpp-compiler-linker-issue.”


MSn data (Spectra) with 6706 spectra in a MsBackendMzR backend:
       msLevel     rtime scanIndex
     <integer> <numeric> <integer>
1            2   30.4288         1
2            1   30.9334         2
3            2   31.1704         3
4            1   31.4840         4
5            2   31.6318         5
...        ...       ...       ...
6702         0     799.2      6702
6703         0     799.4      6703
6704         0     799.6      6704
6705         0     799.8      6705
6706         0     800.0      6706
 ... 33 more variables/columns.

file(s):
DS_201124_SC_full_PRM_neg_03.mzML

In [64]:
c(Spectra("/Users/mahnoorzulfiqar/Project_Scost/DS_201124_SC_full_PRM_neg_03.mzML"),
Spectra("/Users/mahnoorzulfiqar/Project_Scost/DS_201124_SC_full_PRM_neg_04.mzML"))

MSn data (Spectra) with 13635 spectra in a MsBackendMzR backend:
        msLevel     rtime scanIndex
      <integer> <numeric> <integer>
1             2   30.4288         1
2             1   30.9334         2
3             2   31.1704         3
4             1   31.4840         4
5             2   31.6318         5
...         ...       ...       ...
13631         0     850.4      6925
13632         0     850.6      6926
13633         0     850.8      6927
13634         0     851.0      6928
13635         0     851.2      6929
 ... 33 more variables/columns.

file(s):
DS_201124_SC_full_PRM_neg_03.mzML
DS_201124_SC_full_PRM_neg_04.mzML
Processing:
 Merge 2 Spectra into one [Sat Feb 19 11:53:12 2022] 