In [1]:
from MS2LDA.motif_parser import load_m2m_folder
from MS2LDA.Add_On.MassQL.MassQL4MotifDB import motifs2motifDB
from MS2LDA.Add_On.MassQL.MassQL4MotifDB import motifDB2motifs
from MS2LDA.Add_On.MassQL.MassQL4MotifDB import store_motifDB
from MS2LDA.Add_On.MassQL.MassQL4MotifDB import load_motifDB
from MS2LDA.Add_On.MassQL.MassQL4MotifDB import group_ms2
import json

In [2]:
# Folder containing the old MotifDB; each entry in it is treated as one motif set
# (it is listed with os.listdir and loaded entry by entry further down).
# NOTE(review): machine-specific absolute Windows path - adjust before running elsewhere.
old_motifDB_path = r"C:\Users\dietr004\Documents\PhD\computational mass spectrometry\PROGRAMS\pySubstructures\pySubstructures\resources\MOTIFDB"

## Show all Motifsets available in MotifDB

In [3]:
import os

# os.listdir() returns entries in arbitrary order, but the MS2 accuracy values
# defined further down are paired with this list purely by position. Sorting
# case-insensitively makes the order deterministic across platforms and
# reproduces the order shown in the output of this cell.
old_motifDB_motifsets = sorted(os.listdir(old_motifDB_path), key=str.casefold)
old_motifDB_motifsets

['Euphorbia Plant Mass2Motifs',
 'GNPS library derived Mass2Motifs',
 'LDB MotifDB POS',
 'LDB_NEG_MotifDB_01',
 'LDB_NEG_MotifDB_02',
 'Massbank library derived Mass2Motifs',
 'MIADB_pos_100',
 'MIADB_pos_60',
 'MIADB_pos_indole',
 'Photorhabdus and Xenorhabdus Mass2Motifs',
 'Planomonospora-associated Mass2Motifs',
 'Rhamnaceae Plant Mass2Motifs',
 'Streptomyces and Salinispora Mass2Motifs',
 'Streptomyces S29',
 'Urine derived Mass2Motifs']

## Add Charge and motifset name to Motif Spectra

https://ms2lda.org/motifdb/

In [4]:
# MS2 accuracy in Dalton for every motif set. The entries are written in the
# same order as the motif set listing above, because the resulting list is
# matched to that listing by position.
accuracy_by_motifset = {
    'Euphorbia Plant Mass2Motifs': 0.005,
    'GNPS library derived Mass2Motifs': 0.005,
    'LDB MotifDB POS': 0.01,
    'LDB_NEG_MotifDB_01': 0.005,
    'LDB_NEG_MotifDB_02': 0.01,
    'Massbank library derived Mass2Motifs': 0.005,
    'MIADB_pos_100': 0.005,
    'MIADB_pos_60': 0.005,
    'MIADB_pos_indole': 0.005,
    'Photorhabdus and Xenorhabdus Mass2Motifs': 0.005,
    'Planomonospora-associated Mass2Motifs': 0.005,
    'Rhamnaceae Plant Mass2Motifs': 0.005,
    'Streptomyces and Salinispora Mass2Motifs': 0.1,
    'Streptomyces S29': 0.005,
    'Urine derived Mass2Motifs': 0.005,
}
# dicts keep insertion order, so this is the positional list used downstream
accuracy_in_dalton = list(accuracy_by_motifset.values())

In [5]:
# Motif sets whose motifs get charge -1; every other motif set gets charge +1.
# NOTE(review): the dataframe preview later in this notebook shows rows with
# charge -1 whose analysis_polarity reads "positive ionisation mode" (sample type
# "Plant extracts"). Please confirm that 'Euphorbia Plant Mass2Motifs' really
# belongs in this list.
negative_mode_motifsets = ['LDB_NEG_MotifDB_01', 'LDB_NEG_MotifDB_02', 'Euphorbia Plant Mass2Motifs'] # libraries in negative mode

# zip() silently stops at the shorter input, which would drop motif sets or
# pair them with the wrong accuracy without any warning - fail loudly instead.
if len(accuracy_in_dalton) != len(old_motifDB_motifsets):
    raise ValueError(
        "accuracy_in_dalton has {} entries but {} motif sets were found; "
        "both must have the same length and order".format(
            len(accuracy_in_dalton), len(old_motifDB_motifsets)
        )
    )

motif_spectra = []
for accuracy, motifset in zip(accuracy_in_dalton, old_motifDB_motifsets):
    motifs = load_m2m_folder(old_motifDB_path + "/" + motifset) # load m2m file
    # the charge only depends on the motif set, so decide it once per set
    charge = -1 if motifset in negative_mode_motifsets else 1
    for motif in motifs:
        motif.set("motifset", motifset) # add motif set name
        motif.set("ms2accuracy", accuracy) # add ms2 accuracy in Dalton
        motif.set("charge", charge) # add charge (-1 negative mode, +1 positive mode)
    motif_spectra += motifs

## Store motifDB in new format

In [6]:
# Convert the annotated motif spectra into the two motifDB tables
# (an MS1 table and an MS2 table) returned by motifs2motifDB.
motifDB_ms1, motifDB_ms2 = motifs2motifDB(motif_spectra)

In [7]:
# Show the column names of the MS1 table.
list(motifDB_ms1)

['frag_mz',
 'frag_intens',
 'loss_mz',
 'loss_intens',
 'charge',
 'ms2accuracy',
 'short_annotation',
 'annotation',
 'auto_annotation',
 'motif_id',
 'motifset',
 'analysis_massspectrometer',
 'collision_energy',
 'other_information',
 'scientific_name',
 'sample_type',
 'massive_id',
 'taxon_id',
 'analysis_ionizationsource',
 'analysis_chromatographyandphase',
 'analysis_polarity',
 'paper_url',
 'property',
 'scan',
 'ms1scan']

## Add Metadata to motifDB

Not all available data about a motif was stored in the m2m files, so additional metadata was collected from the server using a script from the pySubstructures repository. The script was shared by Joe and is only available locally.

In [8]:
# JSON file with the motif set metadata collected from the server.
# NOTE(review): machine-specific absolute Windows path - adjust before running elsewhere.
old_motifDB_metadata_path = r"C:\Users\dietr004\Documents\PhD\computational mass spectrometry\PROGRAMS\pySubstructures\motif_sets.json"
# Read as UTF-8 explicitly: without `encoding`, open() uses the locale default
# (e.g. cp1252 on Windows), which can mis-decode non-ASCII characters in the JSON.
with open(old_motifDB_metadata_path, "r", encoding="utf-8") as metafile:
    motifDB_metadata_old = json.load(metafile)

The metadata contain eleven fields that are used consistently across all motifsets; here they are added to the MotifDB dataframe.

All the data is first collected in lists, one list per metadata field.

In [9]:
# Index the metadata by motif set name so every row needs a single lookup
# instead of a scan over all metadata entries. If a name occurs more than once
# the first entry wins, so each row still receives exactly one value per field.
metadata_by_motifset = {}
for motifset_entry in motifDB_metadata_old:
    metadata_by_motifset.setdefault(motifset_entry["name"], motifset_entry["metadata"])

# Every row of the MS2 table must receive metadata; otherwise the lists below
# would be shorter than the dataframe and no longer line up with its rows.
motifset_per_row = list(motifDB_ms2["motifset"])
missing_motifsets = sorted(set(motifset_per_row) - set(metadata_by_motifset), key=str)
if missing_motifsets:
    raise KeyError(
        "No metadata found for motif set(s): " + ", ".join(map(str, missing_motifsets))
    )

# Metadata dict belonging to each row, in row order.
row_metadata = [metadata_by_motifset[motifset_name] for motifset_name in motifset_per_row]

# One list per metadata field, each with one entry per row of motifDB_ms2.
Analysis_MassSpectrometer = [meta["Analysis_MassSpectrometer"] for meta in row_metadata]
Collision_Energy = [meta["Collision_Energy"] for meta in row_metadata]
Other_Information = [meta["Other_Information"] for meta in row_metadata]
Scientific_Name = [meta["Scientific_Name"] for meta in row_metadata]
Sample_Type = [meta["Sample_Type"] for meta in row_metadata]
Massive_ID = [meta["Massive_ID"] for meta in row_metadata]
Taxon_ID = [meta["Taxon_ID"] for meta in row_metadata]
Analysis_IonizationSource = [meta["Analysis_IonizationSource"] for meta in row_metadata]
Analysis_ChromatographyAndPhase = [meta["Analysis_ChromatographyAndPhase"] for meta in row_metadata]
Analysis_Polarity = [meta["Analysis_Polarity"] for meta in row_metadata]
Paper_URL = [meta["Paper_URL"] for meta in row_metadata]

The lists are then added to the dataframe as columns.

In [10]:
# Column name -> per-row values collected from the motif set metadata.
# Each column is assigned exactly once (the original assigned
# "analysis_massspectrometer" twice).
metadata_columns = {
    "analysis_massspectrometer": Analysis_MassSpectrometer,
    "collision_energy": Collision_Energy,
    "other_information": Other_Information,
    "scientific_name": Scientific_Name,
    "sample_type": Sample_Type,
    "massive_id": Massive_ID,
    "taxon_id": Taxon_ID,
    "analysis_ionizationsource": Analysis_IonizationSource,
    "analysis_chromatographyandphase": Analysis_ChromatographyAndPhase,
    "analysis_polarity": Analysis_Polarity,
    "paper_url": Paper_URL,
}
for column_name, column_values in metadata_columns.items():
    motifDB_ms2[column_name] = column_values

# Placeholder columns; "auto_annotation" is filled in later in this notebook.
row_count = motifDB_ms2.shape[0]
motifDB_ms2["auto_annotation"] = [None] * row_count
motifDB_ms2["property"] = [None] * row_count

In [11]:
# Preview the first three rows of the MS2 table after adding the metadata columns.
motifDB_ms2.head(3)

Unnamed: 0,frag_mz,frag_intens,loss_mz,loss_intens,charge,ms2accuracy,short_annotation,annotation,auto_annotation,motif_id,...,sample_type,massive_id,taxon_id,analysis_ionizationsource,analysis_chromatographyandphase,analysis_polarity,paper_url,property,scan,ms1scan
0,71.047,0.076913,,,-1,0.005,173 415 216 185 351 171 593 387,Unknown,,motif_0,...,Plant extracts,MSV000081082,3990,electospray ionization,reverse phase (C18),positive ionisation mode,https://www.biorxiv.org/content/10.1101/323014...,,9e44605547a2c3c0a91d9a0fd1bbf816,0
1,97.028,0.13036,,,-1,0.005,173 415 216 185 351 171 593 387,Unknown,,motif_0,...,Plant extracts,MSV000081082,3990,electospray ionization,reverse phase (C18),positive ionisation mode,https://www.biorxiv.org/content/10.1101/323014...,,9e44605547a2c3c0a91d9a0fd1bbf816,0
2,105.073,0.039002,,,-1,0.005,173 415 216 185 351 171 593 387,Unknown,,motif_0,...,Plant extracts,MSV000081082,3990,electospray ionization,reverse phase (C18),positive ionisation mode,https://www.biorxiv.org/content/10.1101/323014...,,9e44605547a2c3c0a91d9a0fd1bbf816,0


## Load automated Annotation

In [12]:
# Load the annotation benchmark results for the urine motif set.
# UTF-8 is set explicitly so decoding does not depend on the platform's locale default.
with open("../Paper_results/Annotation_Benchmark_MotifSets/Urine derived Mass2Motifs.json", "r", encoding="utf-8") as f:
    urine_annotations = json.load(f)

In [13]:
# Load the annotation benchmark results for the Massbank motif set.
# UTF-8 is set explicitly so decoding does not depend on the platform's locale default.
with open("../Paper_results/Annotation_Benchmark_MotifSets/Massbank library derived Mass2Motifs.json", "r", encoding="utf-8") as f:
    massbank_annotations = json.load(f)

In [14]:
# Load the annotation benchmark results for the GNPS motif set.
# UTF-8 is set explicitly so decoding does not depend on the platform's locale default.
with open("../Paper_results/Annotation_Benchmark_MotifSets/GNPS library derived Mass2Motifs.json", "r", encoding="utf-8") as f:
    gnps_annotations = json.load(f)

## Add automated Annotation

In [15]:
# Group the MS2 table with group_ms2; in the preview below each row is keyed by
# "scan" and the fragment/loss columns hold lists.
motifDB_ms2_grouped = group_ms2(motifDB_ms2)

In [16]:
# Preview the grouped table before any automated annotation is added.
motifDB_ms2_grouped.head()

Unnamed: 0,scan,frag_mz,frag_intens,loss_mz,loss_intens,charge,ms2accuracy,short_annotation,annotation,motif_id,...,scientific_name,sample_type,massive_id,taxon_id,analysis_ionizationsource,analysis_chromatographyandphase,analysis_polarity,paper_url,auto_annotation,property
0,00121649bd68236b880fe188d2d7fcff,"[178.028, 179.032, 179.037, 194.028, 195.067, ...","[0.006147241600132363, 0.005836453324917286, 0...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",1,0.005,losses indicative for 4-oxo-1 4-dihydroquinoli...,losses indicative for 4-oxo-1 4-dihydroquinoli...,motif_22,...,,Reference molecules,,,electospray ionization,direct infusion (DI),positive ionisation mode,https://gnps.ucsd.edu/ProteoSAFe/static/gnps-s...,,
1,00410965838a992c7b561e0991f6f856,"[88.022, 134.028, 212.998, 223.077, 254.137, 3...","[0.03500737750751771, 0.009896050742188208, 0....","[nan, nan, nan, nan, nan, nan, nan, 39.968, 41...","[nan, nan, nan, nan, nan, nan, nan, 0.00170567...",1,0.005,C2H3N loss - could be specific for a type of r...,C2H3N loss - could be specific for a type of r...,motif_18,...,,human urine extracts,"MSV000083538, MSV000081118",,electospray ionization,normal phase (HILIC),positive ionisation mode,http://pubs.acs.org/doi/abs/10.1021/acs.analch...,,
2,00aae1dd7eb0e6b6621d96c60a70370d,"[89.005, 106.005, 107.045, 120.055, 132.015, 1...","[0.00567928871041006, 0.028237740873515802, 0....","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",-1,0.01,No short annotation available,No annotation available,motif_91,...,,,,,electospray ionization,reverse phase (C18),negative ionisation mode,,,
3,00ce91ebe30504d304cdebcb4471b1e2,[nan],[nan],[17.027],[1.0],1,0.005,Amine loss - Indicative for free NH2 group in ...,Amine loss - Indicative for free NH2 group in ...,motif_13,...,,Reference molecules,,,electospray ionization,direct infusion (DI),positive ionisation mode,https://gnps.ucsd.edu/ProteoSAFe/static/gnps-s...,,
4,00fd2eaba1ea0db2c92309e0f1baf8d2,"[107.055, 115.055, 131.055, 135.065, 137.055, ...","[0.04875972773242482, 0.021737673394840014, 0....","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",1,0.01,No short annotation available,"11 spectra, 6 molecules, 4 classes: 33.3% Quin...",motif_82,...,,,,,electospray ionization,reverse phase (C18),positive ionisation mode,https://www.nature.com/articles/s41597-019-0305-1,,


In [17]:
import re

In [18]:
def add_automated_annotation(annotation_list, motifset, motifDB_ms2):
    """Write automated SMILES annotations into ``motifDB_ms2`` in place.

    For every entry of ``annotation_list`` that carries a non-empty "SMILES"
    value, the motif id is extracted from the entry name and the value is
    stored in the "auto_annotation" column of the first row whose "motif_id"
    and "motifset" match.

    Parameters
    ----------
    annotation_list : dict
        Mapping of motif name to annotation dict. Names of annotated entries
        must contain a ``motif_<number>`` token. Entries whose "SMILES" value
        is missing or empty are skipped.
    motifset : str
        Motif set name the annotations belong to.
    motifDB_ms2 : pandas.DataFrame
        Table with "motif_id", "motifset" and "auto_annotation" columns.
        It is modified in place.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the name of an annotated entry contains no ``motif_<number>`` token.
    IndexError
        If no row matches the motif id within ``motifset``.
    """
    # The motif set filter does not change inside the loop, so compute it once.
    in_motifset = motifDB_ms2["motifset"] == motifset

    for name, annotation_dict in annotation_list.items():
        smiles = annotation_dict.get("SMILES")
        if not smiles:
            continue  # nothing to annotate for this motif

        id_match = re.search(r"(motif_\d+)", name)
        if id_match is None:
            raise ValueError(
                "Cannot extract a motif id (motif_<number>) from annotation name {!r}".format(name)
            )
        motif_id = id_match.group(1)

        matching_rows = motifDB_ms2.loc[(motifDB_ms2["motif_id"] == motif_id) & in_motifset].index
        if len(matching_rows) == 0:
            raise IndexError(
                "No row with motif_id {!r} in motif set {!r}".format(motif_id, motifset)
            )
        # .at stores the whole SMILES value in a single cell
        motifDB_ms2.at[matching_rows[0], "auto_annotation"] = smiles

In [19]:
# Apply the loaded benchmark annotations to the grouped table, one motif set
# at a time (same order as before: urine, GNPS, Massbank).
for annotations, annotated_motifset in (
    (urine_annotations, 'Urine derived Mass2Motifs'),
    (gnps_annotations, 'GNPS library derived Mass2Motifs'),
    (massbank_annotations, 'Massbank library derived Mass2Motifs'),
):
    add_automated_annotation(annotations, annotated_motifset, motifDB_ms2_grouped)

In [20]:
# Preview the grouped table again; "auto_annotation" is now filled for annotated motifs.
motifDB_ms2_grouped.head(4)

Unnamed: 0,scan,frag_mz,frag_intens,loss_mz,loss_intens,charge,ms2accuracy,short_annotation,annotation,motif_id,...,scientific_name,sample_type,massive_id,taxon_id,analysis_ionizationsource,analysis_chromatographyandphase,analysis_polarity,paper_url,auto_annotation,property
0,00121649bd68236b880fe188d2d7fcff,"[178.028, 179.032, 179.037, 194.028, 195.067, ...","[0.006147241600132363, 0.005836453324917286, 0...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",1,0.005,losses indicative for 4-oxo-1 4-dihydroquinoli...,losses indicative for 4-oxo-1 4-dihydroquinoli...,motif_22,...,,Reference molecules,,,electospray ionization,direct infusion (DI),positive ionisation mode,https://gnps.ucsd.edu/ProteoSAFe/static/gnps-s...,[CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1...,
1,00410965838a992c7b561e0991f6f856,"[88.022, 134.028, 212.998, 223.077, 254.137, 3...","[0.03500737750751771, 0.009896050742188208, 0....","[nan, nan, nan, nan, nan, nan, nan, 39.968, 41...","[nan, nan, nan, nan, nan, nan, nan, 0.00170567...",1,0.005,C2H3N loss - could be specific for a type of r...,C2H3N loss - could be specific for a type of r...,motif_18,...,,human urine extracts,"MSV000083538, MSV000081118",,electospray ionization,normal phase (HILIC),positive ionisation mode,http://pubs.acs.org/doi/abs/10.1021/acs.analch...,[NC(CCSCC(N)C(=O)O)C(=O)O],
2,00aae1dd7eb0e6b6621d96c60a70370d,"[89.005, 106.005, 107.045, 120.055, 132.015, 1...","[0.00567928871041006, 0.028237740873515802, 0....","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",-1,0.01,No short annotation available,No annotation available,motif_91,...,,,,,electospray ionization,reverse phase (C18),negative ionisation mode,,,
3,00ce91ebe30504d304cdebcb4471b1e2,[nan],[nan],[17.027],[1.0],1,0.005,Amine loss - Indicative for free NH2 group in ...,Amine loss - Indicative for free NH2 group in ...,motif_13,...,,Reference molecules,,,electospray ionization,direct infusion (DI),positive ionisation mode,https://gnps.ucsd.edu/ProteoSAFe/static/gnps-s...,,


In [21]:
# Expand the list-valued fragment/loss columns back to one row per entry and
# renumber the rows from zero.
peak_columns = ["frag_mz", "frag_intens", "loss_mz", "loss_intens"]
motifDB_ms2_expoded = motifDB_ms2_grouped.explode(peak_columns)
motifDB_ms2_expoded = motifDB_ms2_expoded.reset_index(drop=True)

In [22]:
# Preview the exploded table: one fragment/loss entry per row again.
motifDB_ms2_expoded.head(4)

Unnamed: 0,scan,frag_mz,frag_intens,loss_mz,loss_intens,charge,ms2accuracy,short_annotation,annotation,motif_id,...,scientific_name,sample_type,massive_id,taxon_id,analysis_ionizationsource,analysis_chromatographyandphase,analysis_polarity,paper_url,auto_annotation,property
0,00121649bd68236b880fe188d2d7fcff,178.028,0.006147,,,1,0.005,losses indicative for 4-oxo-1 4-dihydroquinoli...,losses indicative for 4-oxo-1 4-dihydroquinoli...,motif_22,...,,Reference molecules,,,electospray ionization,direct infusion (DI),positive ionisation mode,https://gnps.ucsd.edu/ProteoSAFe/static/gnps-s...,[CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1...,
1,00121649bd68236b880fe188d2d7fcff,179.032,0.005836,,,1,0.005,losses indicative for 4-oxo-1 4-dihydroquinoli...,losses indicative for 4-oxo-1 4-dihydroquinoli...,motif_22,...,,Reference molecules,,,electospray ionization,direct infusion (DI),positive ionisation mode,https://gnps.ucsd.edu/ProteoSAFe/static/gnps-s...,[CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1...,
2,00121649bd68236b880fe188d2d7fcff,179.037,0.005873,,,1,0.005,losses indicative for 4-oxo-1 4-dihydroquinoli...,losses indicative for 4-oxo-1 4-dihydroquinoli...,motif_22,...,,Reference molecules,,,electospray ionization,direct infusion (DI),positive ionisation mode,https://gnps.ucsd.edu/ProteoSAFe/static/gnps-s...,[CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1...,
3,00121649bd68236b880fe188d2d7fcff,194.028,0.005689,,,1,0.005,losses indicative for 4-oxo-1 4-dihydroquinoli...,losses indicative for 4-oxo-1 4-dihydroquinoli...,motif_22,...,,Reference molecules,,,electospray ionization,direct infusion (DI),positive ionisation mode,https://gnps.ucsd.edu/ProteoSAFe/static/gnps-s...,[CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1...,


## Test if storing, loading and converting works

In [23]:
# Store the MS1 table and the annotated, exploded MS2 table with store_motifDB.
# NOTE(review): presumably this writes the "motifDB.json" file that the next
# cell loads - confirm the output location in store_motifDB.
store_motifDB(motifDB_ms1, motifDB_ms2_expoded)

True

In [24]:
# Load the stored database back from "motifDB.json" (relative to the working
# directory) to check the round trip; m1 and m2 are the two returned tables.
m1, m2 = load_motifDB("motifDB.json")

In [25]:
# Display the reloaded second table for a visual comparison with the table stored above.
m2

Unnamed: 0,scan,frag_mz,frag_intens,loss_mz,loss_intens,charge,ms2accuracy,short_annotation,annotation,motif_id,...,scientific_name,sample_type,massive_id,taxon_id,analysis_ionizationsource,analysis_chromatographyandphase,analysis_polarity,paper_url,auto_annotation,property
0,00121649bd68236b880fe188d2d7fcff,178.028,0.006147,,,1,0.005,losses indicative for 4-oxo-1 4-dihydroquinoli...,losses indicative for 4-oxo-1 4-dihydroquinoli...,motif_22,...,,Reference molecules,,,electospray ionization,direct infusion (DI),positive ionisation mode,https://gnps.ucsd.edu/ProteoSAFe/static/gnps-s...,[CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1...,
1,00121649bd68236b880fe188d2d7fcff,179.032,0.005836,,,1,0.005,losses indicative for 4-oxo-1 4-dihydroquinoli...,losses indicative for 4-oxo-1 4-dihydroquinoli...,motif_22,...,,Reference molecules,,,electospray ionization,direct infusion (DI),positive ionisation mode,https://gnps.ucsd.edu/ProteoSAFe/static/gnps-s...,[CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1...,
2,00121649bd68236b880fe188d2d7fcff,179.037,0.005873,,,1,0.005,losses indicative for 4-oxo-1 4-dihydroquinoli...,losses indicative for 4-oxo-1 4-dihydroquinoli...,motif_22,...,,Reference molecules,,,electospray ionization,direct infusion (DI),positive ionisation mode,https://gnps.ucsd.edu/ProteoSAFe/static/gnps-s...,[CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1...,
3,00121649bd68236b880fe188d2d7fcff,194.028,0.005689,,,1,0.005,losses indicative for 4-oxo-1 4-dihydroquinoli...,losses indicative for 4-oxo-1 4-dihydroquinoli...,motif_22,...,,Reference molecules,,,electospray ionization,direct infusion (DI),positive ionisation mode,https://gnps.ucsd.edu/ProteoSAFe/static/gnps-s...,[CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1...,
4,00121649bd68236b880fe188d2d7fcff,195.067,0.013432,,,1,0.005,losses indicative for 4-oxo-1 4-dihydroquinoli...,losses indicative for 4-oxo-1 4-dihydroquinoli...,motif_22,...,,Reference molecules,,,electospray ionization,direct infusion (DI),positive ionisation mode,https://gnps.ucsd.edu/ProteoSAFe/static/gnps-s...,[CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56887,fff22f1b35924a4012a4333fcef62f12,,,153.062,0.012015,1,0.005,Fluorobenzyl substructure,Fluorobenzyl substructure,motif_31,...,,Reference molecules,,,electospray ionization,direct infusion (DI),positive ionisation mode,https://gnps.ucsd.edu/ProteoSAFe/static/gnps-s...,"[O=C(O)CNC(=O)c1ccc(F)cc1, O=C(c1ccccc1F)N1CC2...",
56888,fff22f1b35924a4012a4333fcef62f12,,,163.113,0.073297,1,0.005,Fluorobenzyl substructure,Fluorobenzyl substructure,motif_31,...,,Reference molecules,,,electospray ionization,direct infusion (DI),positive ionisation mode,https://gnps.ucsd.edu/ProteoSAFe/static/gnps-s...,"[O=C(O)CNC(=O)c1ccc(F)cc1, O=C(c1ccccc1F)N1CC2...",
56889,fff22f1b35924a4012a4333fcef62f12,,,164.097,0.024706,1,0.005,Fluorobenzyl substructure,Fluorobenzyl substructure,motif_31,...,,Reference molecules,,,electospray ionization,direct infusion (DI),positive ionisation mode,https://gnps.ucsd.edu/ProteoSAFe/static/gnps-s...,"[O=C(O)CNC(=O)c1ccc(F)cc1, O=C(c1ccccc1F)N1CC2...",
56890,fff22f1b35924a4012a4333fcef62f12,,,165.102,0.205482,1,0.005,Fluorobenzyl substructure,Fluorobenzyl substructure,motif_31,...,,Reference molecules,,,electospray ionization,direct infusion (DI),positive ionisation mode,https://gnps.ucsd.edu/ProteoSAFe/static/gnps-s...,"[O=C(O)CNC(=O)c1ccc(F)cc1, O=C(c1ccccc1F)N1CC2...",
