## Load libraries

In [3]:
# ---------- Preparations ----------
# Load Libraries
library(Spectra)
library(MsBackendMgf)
library(MsBackendHmdb)
library(MsCoreUtils)
library(MsBackendMsp)
library(readr)
library(dplyr)
# 3 dependencies for latest MassBank version
library(rvest)
library(stringr)
library(xml2)
# warn = -1 makes R ignore all warnings for the rest of the session.
# NOTE(review): this also hides warnings raised by the workflow functions
# called below; consider leaving warnings on while debugging.
options(warn=-1)

In [4]:
# Track time: record when the run starts so that the elapsed time can be
# printed in the last cell of the notebook.
start_time <- Sys.time()

## Define the input directory; keep everything in this directory

In [5]:
# ---------- Script ----------
# Input directory: the current working directory with a trailing "/" appended,
# so that file names can be pasted directly onto it.
input_dir <- paste0(getwd(), "/")
input_dir

In [6]:
#input_dir <- "/Users/mahnoorzulfiqar/OneDriveUNI/MZML/"
#input_dir

## Load the functions file

In [7]:
# Source the workflow helper functions from Workflow_R_Functions.r, which is
# looked up inside input_dir.
source(file = paste0(input_dir, "Workflow_R_Functions.r"))

In [8]:
# load the functions file
# source(file = '/Users/mahnoorzulfiqar/OneDriveUNI/MAW/Workflow_R_Functions.r')

In [9]:
# downloading spectral libraries; do NOT run
# load db spectra objects [gnps, hmdb, mbank]
# download_specDB(input_dir, db = "all")

## Load Spectral Databases as rda objects

In [10]:
# OR load the database rda objects 
#load(file = paste(input_dir,"gnps.rda", sep = ""))
#load(file = paste(input_dir,"hmdb.rda", sep = ""))
#load(file = paste(input_dir,"mbank.rda", sep = ""))

In [11]:
#gnps

In [12]:
#mbank

In [13]:
#hmdb

## Start the workflow

In [14]:
# Run the first function; this creates a data frame of your input files and their result directories
# and gives an id to each input file; it stores the table in the directory as a csv file
input_table <- data.frame(ms2_rfilename(input_dir))
input_table

mzml_files,ResultFileNames,File_id
<chr>,<chr>,<chr>
./DS200309_Scost_QC_70k_pos_PRM.mzML,./DS200309_Scost_QC_70k_pos_PRM,file_1
./DS_201124_SC_full_PRM_neg_01.mzML,./DS_201124_SC_full_PRM_neg_01,file_2
./DS_201124_SC_full_PRM_neg_03.mzML,./DS_201124_SC_full_PRM_neg_03,file_3
./DS_201124_SC_full_PRM_neg_04.mzML,./DS_201124_SC_full_PRM_neg_04,file_4
./DS_201124_SC_full_PRM_pos_01.mzML,./DS_201124_SC_full_PRM_pos_01,file_5


In [15]:
# Preprocess and read the mzML files.
# Only the first row of input_table is processed here: the first argument is
# that row's mzML file name and the second is its result directory name.
spec_pr <- spec_Processing(as.character(input_table[1, "mzml_files"]), input_table[1, "ResultFileNames"])

Writing file processedSpectra.mzML...
OK



In [16]:
# Full paths, for the first input file, to premz_list.txt and
# processedSpectra.mzML inside that file's result directory.
# The leading "./" of ResultFileNames is stripped with an anchored pattern:
# str_remove() treats its pattern as a regular expression, so the unanchored
# "./" means "any character followed by a slash" and would remove part of a
# result name that does not start with "./".
pre_tbl <- paste0(
    input_dir,
    str_remove(paste0(input_table[1, "ResultFileNames"], "/premz_list.txt"), "^\\./")
)
proc_mzml <- paste0(
    input_dir,
    str_remove(paste0(input_table[1, "ResultFileNames"], "/processedSpectra.mzML"), "^\\./")
)

In [17]:
# Settings for the first input file (row 1 of input_table).
# db selects which spectral database branch runs ("all" or a single database name).
db <- "all"
# NOTE(review): ppmx is presumably a ppm tolerance -- confirm in Workflow_R_Functions.r
ppmx <- 15
file_id <- input_table[1, "File_id"]
result_dir <- input_table[1, "ResultFileNames"]

In [18]:
# Read the processed mzML file into a Spectra object using the MsBackendMzR backend
sps_all <- Spectra(proc_mzml, backend = MsBackendMzR())

In [19]:
# Read the precursor table and keep its first column
# (presumably the precursor m/z values, given the file name premz_list.txt)
tbl <- read.table(pre_tbl)
pre_mz <- tbl[[1]]

In [20]:
# Show the values read from the precursor table
pre_mz

In [24]:
# Load the objects stored in gnpsdb.rda (located in input_dir) into the workspace.
# NOTE(review): other cells in this notebook load "gnps.rda" instead -- confirm
# which file name is the intended one.
load(file = paste0(input_dir, "gnpsdb.rda"))

In [21]:
# Run spec2_Processing() for one hard-coded precursor m/z.
# NOTE(review): the value is presumably one of the entries of pre_mz -- confirm.
# NOTE(review): spec = "spec_all" is passed as a string; check against the
# definition in Workflow_R_Functions.r that this is the expected form.
sps <- spec2_Processing(348.078765869141, spec = "spec_all")

In [None]:
# GNPS branch of the dereplication; runs when `db` is "all" or "gnps".
# For every precursor m/z in `pre_mz` it builds a feature id and collects
# retention-time statistics (min, max, median, mean) from the spectra of
# `sps_all` selected by filterPrecursorMz().
#
# Changes compared with the first draft of this cell:
#   * the per-precursor result vectors are preallocated and filled by index
#     instead of being grown with c() on every iteration;
#   * min() and max() now use na.rm = TRUE, matching median() and mean(), so a
#     single missing retention time no longer turns rtmin/rtmax into NA.
#
# NOTE(review): the GNPS* vectors and `Source` are initialised but not filled
# yet, and `sps` is overwritten on every iteration without being used.
if (db %in% c("all", "gnps")) {

    # presumably provides the `gnps` spectral library object -- confirm
    load(file = paste0(input_dir, "gnps.rda"))

    n_pre <- length(pre_mz)

    # common (one entry per precursor m/z)
    id_X <- character(n_pre)
    premz <- numeric(n_pre)
    rtmin <- rep(NA_real_, n_pre)
    rtmax <- rep(NA_real_, n_pre)
    rtmed <- rep(NA_real_, n_pre)
    rtmean <- rep(NA_real_, n_pre)

    # gnps
    GNPSmax_similarity <- c()
    GNPSmzScore <- c()
    GNPSintScore <- c()
    GQMatchingPeaks <- c()
    GNPSTotalPeaks <- c()
    gQueryTotalPeaks <- c()
    GNPSSMILES <- c()
    GNPSspectrumID <- c()
    GNPScompound_name <- c()
    GNPSmirrorSpec <- c()

    # common
    Source <- c()

    nx <- 0
    for (x in pre_mz) {
        nx <- nx + 1

        # NOTE(review): filterPrecursorMz() is deprecated in recent Spectra
        # releases in favour of filterPrecursorMzRange()/filterPrecursorMzValues()
        # -- confirm against the installed version before switching.
        spsrt <- filterPrecursorMz(sps_all, x)
        rt <- spsrt$rtime

        rtmd <- median(rt, na.rm = TRUE)

        # id layout: <file_id>M<rounded m/z>R<rounded median rt>ID<running index>
        id_X[nx] <- paste0(file_id, "M", as.character(round(x, digits = 0)),
                           "R", as.character(round(rtmd, digits = 0)),
                           "ID", as.character(nx))

        premz[nx] <- x
        rtmin[nx] <- min(rt, na.rm = TRUE)
        rtmax[nx] <- max(rt, na.rm = TRUE)
        rtmed[nx] <- rtmd
        rtmean[nx] <- mean(rt, na.rm = TRUE)

        #### input spec with pre_mz
        sps <- spec2_Processing(x, spec = "spec_all")
    }

}

In [25]:
# Print a summary of the loaded gnps object
gnps

MSn data (Spectra) with 481168 spectra in a MsBackendMgf backend:
         msLevel     rtime scanIndex
       <integer> <numeric> <integer>
1              2        NA        NA
2              2        NA        NA
3              2        NA        NA
4              2        NA        NA
5              2        NA        NA
...          ...       ...       ...
481164         2        NA        NA
481165         2        NA        NA
481166         2        NA        NA
481167         2        NA        NA
481168         2        NA        NA
 ... 32 more variables/columns.

In [22]:

# Perform dereplication with all databases for the first input file.
# The leading "./" of ResultFileNames is removed with an anchored pattern:
# str_remove() interprets its pattern as a regular expression, so the
# unanchored "./" ("any character followed by a slash") would corrupt a result
# name that does not start with "./".
# NOTE(review): the recorded run of this cell stopped with a `containsMz`
# dispatch error on a function object, presumably raised inside
# spec_dereplication(); that is not addressed by this change.
df_derep <- spec_dereplication(pre_tbl = paste0(input_dir, str_remove(paste0(input_table[1, "ResultFileNames"], "/premz_list.txt"), "^\\./")),
                               proc_mzml = paste0(input_dir, str_remove(paste0(input_table[1, "ResultFileNames"], "/processedSpectra.mzML"), "^\\./")),
                               db = "all",
                               result_dir = input_table[1, "ResultFileNames"],
                               file_id = input_table[1, "File_id"],
                               input_dir,
                               ppmx = 15)

MSn data (Spectra) with 7551 spectra in a MsBackendMzR backend:
       msLevel     rtime scanIndex
     <integer> <numeric> <integer>
1            1   120.246         1
2            1   120.748         2
3            2   121.020         3
4            1   121.135         4
5            2   121.404         5
...        ...       ...       ...
7547         0     888.8      7547
7548         0     889.0      7548
7549         0     889.2      7549
7550         0     889.4      7550
7551         0     889.6      7551
 ... 33 more variables/columns.

file(s):
processedSpectra.mzML
[1] "its working"
MSn data (Spectra) with 7551 spectra in a MsBackendMzR backend:
       msLevel     rtime scanIndex
     <integer> <numeric> <integer>
1            1   120.246         1
2            1   120.748         2
3            2   121.020         3
4            1   121.135         4
5            2   121.404         5
...        ...       ...       ...
7547         0     888.8      7547
7548         0     8

ERROR: Error in (function (classes, fdef, mtable) : unable to find an inherited method for function ‘containsMz’ for signature ‘"function"’


In [None]:
# Report the elapsed time since start_time was recorded
end_time <- Sys.time()
print(difftime(end_time, start_time))

### At the end, each .mzML MS2 file will have a result directory with the same name, containing the result files and subdirectories