Skip to content
This repository has been archived by the owner on Oct 9, 2024. It is now read-only.

Add check MSP wrapper #23

Open
yguitton opened this issue Jan 28, 2018 · 3 comments
Open

Add check MSP wrapper #23

yguitton opened this issue Jan 28, 2018 · 3 comments
Assignees

Comments

@yguitton
Copy link
Member

yguitton commented Jan 28, 2018

Some MSP files are not fully compatible with metaMS.
For example, MSP files exported from AMDIS use `( mz int)` instead of `mz int;` as the mass spectrum descriptor.

The idea for the wrapper is: (1) load the MSP file with a new read.msp that can deal with more MSP formats,
(2) then use the metaMS write.msp function to create a converted MSP file.

@yguitton
Copy link
Member Author

yguitton commented Jan 28, 2018

Here is a first version of the code: a modified read.msp function. The idea is then to use the metaMS write.msp function to write the converted file.

#' Read an MSP spectral library, tolerating several MSP dialects
#'
#' Parses a text file made of compound records. Each record starts with a
#' "Name:" line (any letter case), carries "Field: value" header lines, a
#' "Num Peaks:" line, and then the peak list. Peaks may be written either as
#' "mz int;" pairs separated by semicolons, as "(mz int)" groups, or as one
#' plain "mz int" pair per line.
#'
#' The header fields NAME, RT, RI and CASNO (any letter case) are renamed to
#' Name, rt, std.RI and CAS respectively.
#'
#' @param file File name or connection, passed to \code{scan}.
#' @param only.org If TRUE, keep only compounds whose "Formula:" line
#'   consists solely of the elements in \code{org.set}. When the file has no
#'   "Formula:" line at all, every compound is kept.
#' @param org.set Element symbols accepted by the \code{only.org} filter.
#' @param noNumbers Regular expressions (matched anywhere in the field name)
#'   naming header fields that must stay character. Every other field is
#'   converted to numeric whenever its value parses as a number. NULL selects
#'   a built-in default list.
#' @return A list with one element per compound. Each element is a list of
#'   the header fields plus \code{pspectrum}, a two-column numeric matrix
#'   with columns "mz" and "intensity".
read.msp <- function(file, only.org = FALSE,
                     org.set = c("C", "H", "D", "N", "O", "P", "S"),
                     noNumbers = NULL) {
    # Return the text following `field` in each element of `x`, with leading
    # spaces removed. `field` is a regular expression unless `fixed` is TRUE.
    # When an element does not split into exactly two pieces this either
    # stops (do.err = TRUE) or returns NA (do.err = FALSE).
    get.text.value <- function(x, field, do.err = TRUE, fixed = FALSE) {
        pieces <- strsplit(x, field, fixed = fixed)
        n.pieces <- vapply(pieces, length, integer(1))
        if (all(n.pieces == 2)) {
            return(vapply(pieces, function(y) gsub("^ +", "", y[2]),
                          character(1)))
        }
        if (do.err) {
            stop(paste("Invalid field", field, "in", x[n.pieces != 2]))
        }
        NA
    }

    # Indices of the "Formula:" lines in `strs` that contain nothing but
    # elements of `org.set` and digits.
    is.org <- function(strs, org.set) {
        formulas <- get.text.value(strs, "Formula:")
        # paste() uses its default separator " " here, so the bracket
        # expression also contains a space and spaces are stripped as well.
        org.string <- paste("[", paste(org.set, collapse = ""),
                            "]", collapse = "")
        # NOTE(review): a formula without any digit (e.g. "CH") leaves an
        # empty string, which as.numeric() turns into NA, so it is reported
        # as non-organic - confirm this is intended.
        leftover <- gsub(org.string, "", formulas)
        suppressWarnings(which(!is.na(as.numeric(leftover))))
    }

    # Parse the lines of one compound record into a list of header fields
    # plus the `pspectrum` matrix.
    read.compound <- function(strs, noNumbers) {
        if (is.null(noNumbers)) {
            # The last two entries were fused into a single string
            # ("...[Ee],RW") that could never match a field name.
            noNumbers <- c("[Nn][Aa][Mm][Ee]", "CAS?", "stdFile", "date",
                           "validated", "ChemspiderID", "SMILES", "InChI",
                           "Class", "[Cc][Oo][Mm][Mm][Ee][Nn][Tt]?",
                           "csLinks", "[fF][oO][Rr][Mm]?",
                           "[Ss][oO][Uu][Rr][Cc][Ee]", "^RW$")
        }
        nlines <- length(strs)

        # fields.idx holds line numbers in `strs`; `fields` is parallel to
        # fields.idx (NOT to strs) and holds the text before the first colon.
        fields.idx <- grep(":", strs, fixed = TRUE)
        fields <- sub(":.*$", "", strs[fields.idx])
        pk.pos <- grep("[Nn][Uu][Mm] [Pp][Ee][Aa][Kk][Ss]", fields)
        if (length(pk.pos) == 0) {
            stop("No spectrum found")
        }
        # Translate the position inside `fields` back to a line number.
        pk.line <- fields.idx[pk.pos[1]]

        # Only lines before "Num Peaks" are header fields; a colon inside
        # the peak list must not create a bogus field.
        hdr <- which(fields.idx < pk.line)
        cmpnd <- lapply(hdr, function(i) {
            # fixed = TRUE: the field name is literal text, not a regex.
            get.text.value(strs[fields.idx[i]], paste0(fields[i], ":"),
                           do.err = FALSE, fixed = TRUE)
        })

        # Rename essential fields for metaMS. Patterns are anchored so that
        # e.g. an existing "std.RI" is not turned into "std.std.RI".
        hdr.names <- fields[hdr]
        hdr.names <- sub("^NAME$", "Name", hdr.names, ignore.case = TRUE)
        hdr.names <- sub("^RT$", "rt", hdr.names, ignore.case = TRUE)
        hdr.names <- sub("^RI$", "std.RI", hdr.names, ignore.case = TRUE)
        hdr.names <- sub("^CASNO$", "CAS", hdr.names, ignore.case = TRUE)
        names(cmpnd) <- hdr.names

        # Convert every field that is NOT listed in noNumbers to numeric
        # when its value parses as a number; otherwise keep the text.
        if (length(noNumbers) > 0) {
            keep.text <- grepl(paste(noNumbers, collapse = "|"), hdr.names)
        } else {
            keep.text <- rep(FALSE, length(hdr.names))
        }
        cmpnd[!keep.text] <- lapply(cmpnd[!keep.text], function(x) {
            y <- suppressWarnings(as.numeric(x))
            if (is.na(y)) x else y
        })
        cmp.name <- cmpnd[["Name"]]

        # Declared number of peaks: the text after the first colon.
        npeaks <- suppressWarnings(
            as.numeric(sub("^[^:]*:", "", strs[pk.line])))
        if (is.na(npeaks)) {
            stop("Invalid number of peaks in compound ", cmp.name)
        }

        # (pk.line + 1):nlines would count backwards when "Num Peaks" is
        # the last line of the record, hence the guard.
        if (pk.line < nlines) {
            peak.lines <- strs[(pk.line + 1):nlines]
        } else {
            peak.lines <- character(0)
        }
        if (any(grepl(";", peak.lines, fixed = TRUE))) {
            # "mz int; mz int; ..." layout
            pks <- unlist(strsplit(peak.lines, ";", fixed = TRUE))
        } else {
            # "(mz int) (mz int)" layout; a plain "mz int" line has no ")"
            # and passes through unchanged as a single peak.
            pks <- unlist(strsplit(peak.lines, ")", fixed = TRUE))
            pks <- gsub("(", "", pks, fixed = TRUE)
        }
        pks <- trimws(as.character(pks))
        pks <- pks[nzchar(pks)]
        if (length(pks) != npeaks) {
            stop("Not the right number of peaks in compound ", cmp.name)
        }

        # Split each peak on any run of whitespace (spaces or tabs).
        pklst <- strsplit(pks, "[[:space:]]+")
        if (any(lengths(pklst) < 2)) {
            stop("Malformed peak entry in compound ", cmp.name)
        }
        cmz <- as.numeric(vapply(pklst, function(p) p[1], character(1)))
        cintens <- as.numeric(vapply(pklst, function(p) p[2], character(1)))
        finaltab <- matrix(c(cmz, cintens), ncol = 2)
        if (any(table(cmz) > 1)) {
            warning("Duplicate mass in compound ", cmp.name,
                    " (CAS ", cmpnd[["CAS"]], ")... summing up intensities")
            # as.matrix keeps the result a matrix, like the branch without
            # duplicates; aggregate() alone returns a data.frame.
            finaltab <- as.matrix(aggregate(finaltab[, 2],
                                            by = list(finaltab[, 1]),
                                            FUN = sum))
        }
        colnames(finaltab) <- c("mz", "intensity")
        c(cmpnd, list(pspectrum = finaltab))
    }

    huhn <- scan(file, what = "", sep = "\n", quiet = TRUE)
    # A record starts at every line beginning with "Name:" (any case).
    starts <- which(grepl("^[Nn][Aa][Mm][Ee]:", huhn))
    if (length(starts) == 0) {
        warning("No compound ('Name:' line) found", call. = FALSE)
        return(list())
    }
    ends <- c(starts[-1] - 1, length(huhn))
    if (only.org) {
        formulas <- which(regexpr("Formula:", huhn) == 1)
        if (length(formulas) > 0) {
            orgs <- is.org(huhn[formulas], org.set)
            # Map each Formula line to the record that contains it, so that
            # records without a Formula line cannot shift the selection.
            # Such records are dropped: they cannot be confirmed organic.
            owner <- findInterval(formulas, starts)
            keep <- sort(unique(owner[orgs]))
            keep <- keep[keep > 0]
            starts <- starts[keep]
            ends <- ends[keep]
        }
    }
    lapply(seq_along(starts), function(i) {
        read.compound(huhn[starts[i]:ends[i]], noNumbers = noNumbers)
    })
}

@jsaintvanne
Copy link
Member

When you talk about the MSP file, do you mean the database used as input?

@yguitton
Copy link
Member Author

yguitton commented Jun 26, 2019 via email

Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Projects
None yet
Development

No branches or pull requests

3 participants