In [1]:
from MS2LDA.motif_parser import load_m2m_folder
from MS2LDA.Add_On.MassQL.MassQL4MotifDB import motifs2motifDB
from MS2LDA.Add_On.MassQL.MassQL4MotifDB import motifDB2motifs
from MS2LDA.Add_On.MassQL.MassQL4MotifDB import store_motifDB
from MS2LDA.Add_On.MassQL.MassQL4MotifDB import load_motifDB
from MS2LDA.Add_On.MassQL.MassQL4MotifDB import group_ms2
import json

In [2]:
# Ensure MS2LDA can be imported
# NOTE(review): the first cell of this notebook already imports from MS2LDA, so on a
# fresh kernel an ImportError would be raised there before this fallback is reached.
# Consider moving this cell above the MS2LDA imports.
import sys
from pathlib import Path
# Add parent directory to path if MS2LDA not already importable
try:
    import MS2LDA
except ImportError:
    # Two levels above the current working directory is put at the front of sys.path;
    # presumably that is the repository root containing the MS2LDA package - confirm.
    sys.path.insert(0, str(Path.cwd().parent.parent))
    import MS2LDA

In [3]:
# Folder whose entries are listed and loaded below as the individual motifsets.
# NOTE(review): absolute, machine-specific Windows path - must be adapted before this
# notebook can be re-run elsewhere.
old_motifDB_path = r"C:\Users\dietr004\Documents\PhD\computational mass spectrometry\PROGRAMS\pySubstructures\pySubstructures\resources\MOTIFDB"

## Show all Motifsets available in MotifDB

In [4]:
import os

# os.listdir() returns directory entries in arbitrary order, but the per-motifset
# accuracy values further down are matched to this list purely by position.
# Sorting case-insensitively makes the order deterministic on every platform and
# reproduces the listing shown in the output below.
old_motifDB_motifsets = sorted(os.listdir(old_motifDB_path), key=str.casefold)
old_motifDB_motifsets

['Euphorbia Plant Mass2Motifs',
 'GNPS library derived Mass2Motifs',
 'LDB MotifDB POS',
 'LDB_NEG_MotifDB_01',
 'LDB_NEG_MotifDB_02',
 'Massbank library derived Mass2Motifs',
 'MIADB_pos_100',
 'MIADB_pos_60',
 'MIADB_pos_indole',
 'Photorhabdus and Xenorhabdus Mass2Motifs',
 'Planomonospora-associated Mass2Motifs',
 'Rhamnaceae Plant Mass2Motifs',
 'Streptomyces and Salinispora Mass2Motifs',
 'Streptomyces S29',
 'Urine derived Mass2Motifs']

## Add Charge and motifset name to Motif Spectra

https://ms2lda.org/motifdb/

In [5]:
# same order as list above
# MS2 accuracy (in Dalton) per motifset. The loading loop below pairs these values with
# old_motifDB_motifsets position by position via zip(), so the two sequences must have
# the same length and the same ordering.
accuracy_in_dalton = [ 
    0.005, # 'Euphorbia Plant Mass2Motifs'
    0.005, # 'GNPS library derived Mass2Motifs'
    0.01, # 'LDB MotifDB POS'
    0.005, # 'LDB_NEG_MotifDB_01'
    0.01, # 'LDB_NEG_MotifDB_02'
    0.005, # 'Massbank library derived Mass2Motifs'
    0.005, # 'MIADB_pos_100'
    0.005, # 'MIADB_pos_60'
    0.005, # 'MIADB_pos_indole'
    0.005, # 'Photorhabdus and Xenorhabdus Mass2Motifs'
    0.005, # 'Planomonospora-associated Mass2Motifs'
    0.005, # 'Rhamnaceae Plant Mass2Motifs'
    0.1, # 'Streptomyces and Salinispora Mass2Motifs'
    0.005, # 'Streptomyces S29'
    0.005, # 'Urine derived Mass2Motifs'
]

In [6]:
# Load every motifset folder and tag each motif with its motifset name, MS2 accuracy
# and charge. The tagged motifs of all motifsets are collected in motif_spectra.
motif_spectra = []
# NOTE(review): the motifDB_ms2.head() output further down shows rows with charge -1
# whose metadata reads "positive ionisation mode"; confirm that
# 'Euphorbia Plant Mass2Motifs' really belongs in this list.
negative_mode_motifsets = ['LDB_NEG_MotifDB_01', 'LDB_NEG_MotifDB_02', 'Euphorbia Plant Mass2Motifs'] # libraries in negative mode

# zip() stops at the shorter sequence, so a folder without an accuracy value (or the
# reverse) would be dropped silently. Fail loudly instead.
if len(accuracy_in_dalton) != len(old_motifDB_motifsets):
    raise ValueError(
        f"{len(accuracy_in_dalton)} accuracy values given for "
        f"{len(old_motifDB_motifsets)} motifsets; the two lists must line up one-to-one"
    )

for accuracy, motifset in zip(accuracy_in_dalton, old_motifDB_motifsets):
    motifs = load_m2m_folder(old_motifDB_path + "/" + motifset) # load m2m file
    # The charge is the same for every motif of a motifset, so decide it once per set.
    charge = -1 if motifset in negative_mode_motifsets else 1
    for motif in motifs:
        motif.set("motifset", motifset) # add motif set name
        motif.set("ms2accuracy", accuracy) # add ms2 accuracy in Dalton
        motif.set("charge", charge) # add negative or positive charge
    motif_spectra += motifs

## Store motifDB in new format

In [7]:
motifDB_ms1, motifDB_ms2 = motifs2motifDB(motif_spectra)

In [8]:
list(motifDB_ms1)

['frag_mz',
 'frag_intens',
 'loss_mz',
 'loss_intens',
 'charge',
 'ms2accuracy',
 'short_annotation',
 'annotation',
 'auto_annotation',
 'motif_id',
 'motifset',
 'analysis_massspectrometer',
 'collision_energy',
 'other_information',
 'scientific_name',
 'sample_type',
 'massive_id',
 'taxon_id',
 'analysis_ionizationsource',
 'analysis_chromatographyandphase',
 'analysis_polarity',
 'paper_url',
 'property',
 'scan',
 'ms1scan']

In [9]:
# Put a placeholder into the MS1 table's motifset column; the string itself points the
# reader to the MS2 table for the actual motifset names.
motifDB_ms1["motifset"] = "see ms2"

## Add Metadata to motifDB

Not all available data about a motif was stored in the m2m files, so additional metadata was collected from the server using the script in the pySubstructures repository. The script was shared by Joe and is only locally available.

In [10]:
# NOTE(review): absolute, machine-specific path - adapt before re-running elsewhere.
old_motifDB_metadata_path = r"C:\Users\dietr004\Documents\PhD\computational mass spectrometry\PROGRAMS\pySubstructures\motif_sets.json"
# JSON is exchanged as UTF-8; without an explicit encoding open() falls back to the
# locale codec (e.g. cp1252 on Windows), which can mis-decode non-ASCII metadata.
with open(old_motifDB_metadata_path, "r", encoding="utf-8") as metafile:
    motifDB_metadata_old = json.load(metafile)

The metadata include eleven fields that are used consistently across all motifsets; they are added to the MotifDB dataframe here.

All the data is saved in lists

In [11]:
# Build one list per metadata field, each holding one value per row of motifDB_ms2,
# taken from the metadata record whose "name" equals that row's motifset.

# Index the metadata records by motifset name once, so each row is resolved with a
# dictionary lookup instead of rescanning the whole metadata list.
# setdefault keeps the first record if a name occurs more than once; appending every
# duplicate would make the lists longer than the dataframe.
metadata_by_motifset = {}
for record in motifDB_metadata_old:
    metadata_by_motifset.setdefault(record["name"], record)

# A motifset without a metadata record would leave the lists shorter than the
# dataframe; report which names are missing instead of failing later on a
# length mismatch.
motifsets_without_metadata = set(motifDB_ms2["motifset"]) - set(metadata_by_motifset)
if motifsets_without_metadata:
    raise KeyError(f"No metadata found for motifset(s): {sorted(motifsets_without_metadata)}")

# One metadata dict per dataframe row, in row order.
row_metadata = [metadata_by_motifset[name]["metadata"] for name in motifDB_ms2["motifset"]]

Analysis_MassSpectrometer = [meta["Analysis_MassSpectrometer"] for meta in row_metadata]
Collision_Energy = [meta["Collision_Energy"] for meta in row_metadata]
Other_Information = [meta["Other_Information"] for meta in row_metadata]
Scientific_Name = [meta["Scientific_Name"] for meta in row_metadata]
Sample_Type = [meta["Sample_Type"] for meta in row_metadata]
Massive_ID = [meta["Massive_ID"] for meta in row_metadata]
Taxon_ID = [meta["Taxon_ID"] for meta in row_metadata]
Analysis_IonizationSource = [meta["Analysis_IonizationSource"] for meta in row_metadata]
Analysis_ChromatographyAndPhase = [meta["Analysis_ChromatographyAndPhase"] for meta in row_metadata]
Analysis_Polarity = [meta["Analysis_Polarity"] for meta in row_metadata]
Paper_URL = [meta["Paper_URL"] for meta in row_metadata]

The lists are added to the dataframe.

In [12]:
# Write the collected per-row metadata lists into their dataframe columns.
# (The original cell assigned "analysis_massspectrometer" twice; once is enough.)
motifDB_ms2["analysis_massspectrometer"] = Analysis_MassSpectrometer
motifDB_ms2["collision_energy"] = Collision_Energy
motifDB_ms2["other_information"] = Other_Information
motifDB_ms2["scientific_name"] = Scientific_Name
motifDB_ms2["sample_type"] = Sample_Type
motifDB_ms2["massive_id"] = Massive_ID
motifDB_ms2["taxon_id"] = Taxon_ID
motifDB_ms2["analysis_ionizationsource"] = Analysis_IonizationSource
motifDB_ms2["analysis_chromatographyandphase"] = Analysis_ChromatographyAndPhase
motifDB_ms2["analysis_polarity"] = Analysis_Polarity
motifDB_ms2["paper_url"] = Paper_URL
# These two columns start out empty (None in every row); auto_annotation is filled
# in a later section of the notebook.
motifDB_ms2["auto_annotation"] = [None] * len(motifDB_ms2)
motifDB_ms2["property"] = [None] * len(motifDB_ms2)

In [13]:
motifDB_ms2.head(3)

Unnamed: 0,frag_mz,frag_intens,loss_mz,loss_intens,charge,ms2accuracy,short_annotation,annotation,auto_annotation,motif_id,...,sample_type,massive_id,taxon_id,analysis_ionizationsource,analysis_chromatographyandphase,analysis_polarity,paper_url,property,scan,ms1scan
0,71.047,0.076913,,,-1,0.005,173 415 216 185 351 171 593 387,Unknown,,motif_0,...,Plant extracts,MSV000081082,3990,electospray ionization,reverse phase (C18),positive ionisation mode,https://www.biorxiv.org/content/10.1101/323014...,,264413080265068407435216182580085253776,0
1,97.028,0.13036,,,-1,0.005,173 415 216 185 351 171 593 387,Unknown,,motif_0,...,Plant extracts,MSV000081082,3990,electospray ionization,reverse phase (C18),positive ionisation mode,https://www.biorxiv.org/content/10.1101/323014...,,264413080265068407435216182580085253776,0
2,105.073,0.039002,,,-1,0.005,173 415 216 185 351 171 593 387,Unknown,,motif_0,...,Plant extracts,MSV000081082,3990,electospray ionization,reverse phase (C18),positive ionisation mode,https://www.biorxiv.org/content/10.1101/323014...,,264413080265068407435216182580085253776,0


## Load automated Annotation

In [14]:
# Explicit UTF-8: without it open() uses the platform's locale codec, which is not
# UTF-8 on Windows and can mis-decode non-ASCII content in the JSON file.
with open("../Paper_results/Annotation_Benchmark_MotifSets/Urine_auto_annotation_pos.json", "r", encoding="utf-8") as f:
    urine_annotations = json.load(f)

In [15]:
# Explicit UTF-8: without it open() uses the platform's locale codec, which is not
# UTF-8 on Windows and can mis-decode non-ASCII content in the JSON file.
with open("../Paper_results/Annotation_Benchmark_MotifSets/MassBank_auto_annotation_pos.json", "r", encoding="utf-8") as f:
    massbank_annotations = json.load(f)

In [16]:
# Explicit UTF-8: without it open() uses the platform's locale codec, which is not
# UTF-8 on Windows and can mis-decode non-ASCII content in the JSON file.
with open("../Paper_results/Annotation_Benchmark_MotifSets/GNPS_auto_annotation_pos.json", "r", encoding="utf-8") as f:
    gnps_annotations = json.load(f)

## Add automated Annotation

In [17]:
# Judging by the head() output below, group_ms2 yields one row per scan with the
# fragment/loss columns held as lists - presumably one row per motif; confirm against
# the group_ms2 implementation.
motifDB_ms2_grouped = group_ms2(motifDB_ms2)

In [18]:
motifDB_ms2_grouped.head()

Unnamed: 0,scan,frag_mz,frag_intens,loss_mz,loss_intens,charge,ms2accuracy,short_annotation,annotation,motif_id,...,scientific_name,sample_type,massive_id,taxon_id,analysis_ionizationsource,analysis_chromatographyandphase,analysis_polarity,paper_url,auto_annotation,property
0,485730175874384958432673710390102167,"[91.975, 121.035, 122.045, 133.105, 149.025, 1...","[0.031457617103312886, 0.035809711866514674, 0...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",-1,0.01,No short annotation available,No annotation available,motif_16,...,,,,,electospray ionization,reverse phase (C18),negative ionisation mode,,,
1,592464210425733028781001649670318124,"[57.278, 60.233, 63.288, 81.037, 84.748, 91.02...","[0.0037744261928590525, 0.003954408815179642, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",-1,0.005,No short annotation available,No annotation available,motif_83,...,,,,,electospray ionization,reverse phase (C18),negative ionisation mode,,,
2,966201284055967293346180146865550122,"[57.032, 62.373, 74.323, 80.683, 84.373, 86.42...","[0.010094380473175759, 0.002224273711919466, 0...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",-1,0.005,No short annotation available,No annotation available,motif_118,...,,,,,electospray ionization,reverse phase (C18),negative ionisation mode,,,
3,1074499310874374468168542740363988357,"[55.053, 58.068, 67.052, 68.047, 68.058, 70.06...","[0.00873441988673646, 0.03299177882203069, 0.0...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",1,0.005,Mixed motif - several cores - some metabolites...,Mixed motif - several cores - some metabolites...,motif_180,...,,human urine extracts,"MSV000083538, MSV000081118",,electospray ionization,normal phase (HILIC),positive ionisation mode,http://pubs.acs.org/doi/abs/10.1021/acs.analch...,,
4,1226188175656683184747598598128904702,"[57.05, 95.05, 96.05, 105.05, 109.05, 113.05, ...","[0.005463165520780571, 0.029383887305829023, 0...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",1,0.1,Salinispora arenicola related substructure [15...,Salinispora arenicola related substructure [15...,motif_3,...,Salinispora and Streptomyces,bacterial extracts,"MSV000078836, MSV000078839","168694, 1883",electospray ionization,reverse phase (C18),positive ionisation mode,https://pubs.acs.org/doi/full/10.1021/acs.jnat...,,


In [19]:
import re

In [20]:
def add_automated_annotation(annotation_list, motifset, motifDB_ms2):
    """Write automated annotations into the ``auto_annotation`` column, in place.

    Parameters
    ----------
    annotation_list : dict
        Maps an annotation name containing a motif id of the form ``motif_<number>``
        to a dict that has a ``"SMILES"`` entry. Entries whose ``"SMILES"`` value is
        empty/falsy are skipped.
    motifset : str
        Value of the ``motifset`` column that the annotations belong to.
    motifDB_ms2 : pandas.DataFrame
        Frame with ``motif_id``, ``motifset`` and ``auto_annotation`` columns. It is
        modified in place. Only the FIRST row matching a motif is updated, so the
        frame should hold one row per motif (the grouped frame).

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If an annotation name contains no ``motif_<number>`` id.
    IndexError
        If an annotated motif has no row in ``motifDB_ms2`` for ``motifset``.
    """
    # The motifset filter is the same for every annotation, so compute it once.
    in_motifset = motifDB_ms2["motifset"] == motifset

    for name, annotation_dict in annotation_list.items():
        id_match = re.search(r"(motif_\d+)", name)
        if id_match is None:
            raise ValueError(f"Cannot extract a motif id from annotation name {name!r}")
        motif_id = id_match.group(1)

        smiles = annotation_dict["SMILES"]
        if not smiles:
            continue

        matching_rows = motifDB_ms2.loc[(motifDB_ms2["motif_id"] == motif_id) & in_motifset].index
        if len(matching_rows) == 0:
            raise IndexError(
                f"No row with motif_id {motif_id!r} in motifset {motifset!r} "
                f"(annotation name {name!r})"
            )
        motifDB_ms2.at[matching_rows[0], "auto_annotation"] = smiles

In [21]:
add_automated_annotation(urine_annotations, 'Urine derived Mass2Motifs', motifDB_ms2_grouped)
add_automated_annotation(gnps_annotations, 'GNPS library derived Mass2Motifs', motifDB_ms2_grouped)
add_automated_annotation(massbank_annotations, 'Massbank library derived Mass2Motifs', motifDB_ms2_grouped)

In [22]:
motifDB_ms2_grouped.head(4)

Unnamed: 0,scan,frag_mz,frag_intens,loss_mz,loss_intens,charge,ms2accuracy,short_annotation,annotation,motif_id,...,scientific_name,sample_type,massive_id,taxon_id,analysis_ionizationsource,analysis_chromatographyandphase,analysis_polarity,paper_url,auto_annotation,property
0,485730175874384958432673710390102167,"[91.975, 121.035, 122.045, 133.105, 149.025, 1...","[0.031457617103312886, 0.035809711866514674, 0...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",-1,0.01,No short annotation available,No annotation available,motif_16,...,,,,,electospray ionization,reverse phase (C18),negative ionisation mode,,,
1,592464210425733028781001649670318124,"[57.278, 60.233, 63.288, 81.037, 84.748, 91.02...","[0.0037744261928590525, 0.003954408815179642, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",-1,0.005,No short annotation available,No annotation available,motif_83,...,,,,,electospray ionization,reverse phase (C18),negative ionisation mode,,,
2,966201284055967293346180146865550122,"[57.032, 62.373, 74.323, 80.683, 84.373, 86.42...","[0.010094380473175759, 0.002224273711919466, 0...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",-1,0.005,No short annotation available,No annotation available,motif_118,...,,,,,electospray ionization,reverse phase (C18),negative ionisation mode,,,
3,1074499310874374468168542740363988357,"[55.053, 58.068, 67.052, 68.047, 68.058, 70.06...","[0.00873441988673646, 0.03299177882203069, 0.0...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",1,0.005,Mixed motif - several cores - some metabolites...,Mixed motif - several cores - some metabolites...,motif_180,...,,human urine extracts,"MSV000083538, MSV000081118",,electospray ionization,normal phase (HILIC),positive ionisation mode,http://pubs.acs.org/doi/abs/10.1021/acs.analch...,[CCOC(=O)c1c(N2C(=O)C3CC(C)=C(C)CC3C2=O)sc(C)c...,


In [23]:
# Expand the four list-valued peak columns back to one row per list element; the
# remaining columns are repeated for each element and the index is renumbered from 0.
motifDB_ms2_expoded = motifDB_ms2_grouped.explode(["frag_mz", "frag_intens", "loss_mz", "loss_intens"]).reset_index(drop=True)

In [24]:
motifDB_ms2_expoded.head(4)

Unnamed: 0,scan,frag_mz,frag_intens,loss_mz,loss_intens,charge,ms2accuracy,short_annotation,annotation,motif_id,...,scientific_name,sample_type,massive_id,taxon_id,analysis_ionizationsource,analysis_chromatographyandphase,analysis_polarity,paper_url,auto_annotation,property
0,485730175874384958432673710390102167,91.975,0.031458,,,-1,0.01,No short annotation available,No annotation available,motif_16,...,,,,,electospray ionization,reverse phase (C18),negative ionisation mode,,,
1,485730175874384958432673710390102167,121.035,0.03581,,,-1,0.01,No short annotation available,No annotation available,motif_16,...,,,,,electospray ionization,reverse phase (C18),negative ionisation mode,,,
2,485730175874384958432673710390102167,122.045,0.01027,,,-1,0.01,No short annotation available,No annotation available,motif_16,...,,,,,electospray ionization,reverse phase (C18),negative ionisation mode,,,
3,485730175874384958432673710390102167,133.105,0.059451,,,-1,0.01,No short annotation available,No annotation available,motif_16,...,,,,,electospray ionization,reverse phase (C18),negative ionisation mode,,,


## Test if storing, loading and converting works

In [25]:
store_motifDB(motifDB_ms1, motifDB_ms2_expoded)

True

In [26]:
m1, m2 = load_motifDB("motifDB.json")

In [27]:
m2.head()

Unnamed: 0,scan,frag_mz,frag_intens,loss_mz,loss_intens,charge,ms2accuracy,short_annotation,annotation,motif_id,...,scientific_name,sample_type,massive_id,taxon_id,analysis_ionizationsource,analysis_chromatographyandphase,analysis_polarity,paper_url,auto_annotation,property
0,485730175874384958432673710390102167,91.975,0.031458,,,-1,0.01,No short annotation available,No annotation available,motif_16,...,,,,,electospray ionization,reverse phase (C18),negative ionisation mode,,,
1,485730175874384958432673710390102167,121.035,0.03581,,,-1,0.01,No short annotation available,No annotation available,motif_16,...,,,,,electospray ionization,reverse phase (C18),negative ionisation mode,,,
2,485730175874384958432673710390102167,122.045,0.01027,,,-1,0.01,No short annotation available,No annotation available,motif_16,...,,,,,electospray ionization,reverse phase (C18),negative ionisation mode,,,
3,485730175874384958432673710390102167,133.105,0.059451,,,-1,0.01,No short annotation available,No annotation available,motif_16,...,,,,,electospray ionization,reverse phase (C18),negative ionisation mode,,,
4,485730175874384958432673710390102167,149.025,0.027837,,,-1,0.01,No short annotation available,No annotation available,motif_16,...,,,,,electospray ionization,reverse phase (C18),negative ionisation mode,,,


In [48]:
# NOTE(review): this rebinds m1 and m2, replacing the tables loaded from
# "motifDB.json" above, so every later cell that reads m2 sees only this one motifset.
# The file read here is written by a store_motifDB(...) cell further down, so on a
# fresh top-to-bottom run it may not exist yet. Consider moving this check to the end
# of the notebook and using separate variable names.
m1, m2 = load_motifDB("Motifset LDB MotifDB POS.json")

In [49]:
m1.motifset

0    see ms2
Name: motifset, dtype: object

In [50]:
m2.motifset

0       LDB MotifDB POS
1       LDB MotifDB POS
2       LDB MotifDB POS
3       LDB MotifDB POS
4       LDB MotifDB POS
             ...       
6133    LDB MotifDB POS
6134    LDB MotifDB POS
6135    LDB MotifDB POS
6136    LDB MotifDB POS
6137    LDB MotifDB POS
Name: motifset, Length: 6138, dtype: object

## Split large dataset based on motifset

In [31]:
set(m2.motifset)

{'LDB MotifDB POS'}

In [32]:
# Split the complete MS2 table into one frame per motifset.
# The table is reloaded from "motifDB.json" (the file read in the round-trip check
# above) instead of reusing m2: an intermediate cell rebinds m2 to a single motifset,
# which would leave every other split empty when the notebook is run top to bottom.
_, motifDB_ms2_full = load_motifDB("motifDB.json")
motifset_of_row = motifDB_ms2_full.motifset

motifset_euphorbia = motifDB_ms2_full.loc[motifset_of_row == 'Euphorbia Plant Mass2Motifs']
motifset_gnps = motifDB_ms2_full.loc[motifset_of_row == 'GNPS library derived Mass2Motifs']
motifset_ldb_pos = motifDB_ms2_full.loc[motifset_of_row == 'LDB MotifDB POS']
motifset_ldb_neg_01 = motifDB_ms2_full.loc[motifset_of_row == 'LDB_NEG_MotifDB_01']
motifset_ldb_neg_02 = motifDB_ms2_full.loc[motifset_of_row == 'LDB_NEG_MotifDB_02']
motifset_miadb_pos_100 = motifDB_ms2_full.loc[motifset_of_row == 'MIADB_pos_100']
motifset_miadb_pos_60 = motifDB_ms2_full.loc[motifset_of_row == 'MIADB_pos_60']
motifset_miadb_pos_indole = motifDB_ms2_full.loc[motifset_of_row == 'MIADB_pos_indole']
motifset_massbank = motifDB_ms2_full.loc[motifset_of_row == 'Massbank library derived Mass2Motifs']
motifset_photorhabdus_xenorhabdus = motifDB_ms2_full.loc[motifset_of_row == 'Photorhabdus and Xenorhabdus Mass2Motifs']
motifset_planomonspora = motifDB_ms2_full.loc[motifset_of_row == 'Planomonospora-associated Mass2Motifs']
motifset_rhamnaceae = motifDB_ms2_full.loc[motifset_of_row == 'Rhamnaceae Plant Mass2Motifs']
motifset_streptomyces = motifDB_ms2_full.loc[motifset_of_row == 'Streptomyces S29']
motifset_streptomyces_salinispora = motifDB_ms2_full.loc[motifset_of_row == 'Streptomyces and Salinispora Mass2Motifs']
motifset_urine = motifDB_ms2_full.loc[motifset_of_row == 'Urine derived Mass2Motifs']

In [33]:
store_motifDB(motifDB_ms1, motifset_euphorbia, name="Motifset Euphorbia Plant Mass2Motifs.json")

True

In [34]:
store_motifDB(motifDB_ms1, motifset_gnps, name="Motifset GNPS library derived Mass2Motifs.json")

True

In [35]:
store_motifDB(motifDB_ms1, motifset_ldb_pos, name="Motifset LDB MotifDB POS.json")

True

In [36]:
store_motifDB(motifDB_ms1, motifset_ldb_neg_01, name="Motifset LDB_NEG_MotifDB_01.json")

True

In [37]:
store_motifDB(motifDB_ms1, motifset_ldb_neg_02, name="Motifset LDB_NEG_MotifDB_02.json")

True

In [38]:
store_motifDB(motifDB_ms1, motifset_miadb_pos_100, name="Motifset MIADB_pos_100.json")

True

In [39]:
store_motifDB(motifDB_ms1, motifset_miadb_pos_60, name="Motifset MIADB_pos_60.json")

True

In [40]:
store_motifDB(motifDB_ms1, motifset_miadb_pos_indole, name="Motifset MIADB_pos_indole.json")

True

In [41]:
store_motifDB(motifDB_ms1, motifset_massbank, name="Motifset Massbank library derived Mass2Motifs.json")

True

In [42]:
store_motifDB(motifDB_ms1, motifset_photorhabdus_xenorhabdus, name="Motifset Photorhabdus and Xenorhabdus Mass2Motifs.json")

True

In [43]:
store_motifDB(motifDB_ms1, motifset_planomonspora, name="Motifset Planomonospora-associated Mass2Motifs.json")

True

In [44]:
store_motifDB(motifDB_ms1, motifset_rhamnaceae, name="Motifset Rhamnaceae Plant Mass2Motifs.json")

True

In [45]:
store_motifDB(motifDB_ms1, motifset_streptomyces, name="Motifset Streptomyces S29.json")

True

In [46]:
store_motifDB(motifDB_ms1, motifset_streptomyces_salinispora, name="Motifset Streptomyces and Salinispora Mass2Motifs.json")

True

In [47]:
store_motifDB(motifDB_ms1, motifset_urine, name="Motifset Urine derived Mass2Motifs.json")

True