In [2]:
from datasets import load_dataset

In [3]:
# Load the MassSpecGym table (data/MassSpecGym.tsv) from the
# 'roman-bushuiev/MassSpecGym' dataset repository, requesting the 'train' split.
msg = load_dataset(
    'roman-bushuiev/MassSpecGym',
    data_files='data/MassSpecGym.tsv',
    split='train',
)

In [5]:
from msbuddy import Msbuddy, MsbuddyConfig
# Work on a small sample: the first 100 records, converted to a pandas DataFrame.
df = msg.select(range(100)).to_pandas()

# msbuddy settings. An instrument preset is selected here; the custom MS1/MS2
# tolerances below are the ones used when `ms_instr` is None.
msb_config = MsbuddyConfig(
    ms_instr='orbitrap',  # supported: "qtof", "orbitrap", "fticr" or None
    ppm=True,             # express mass tolerances in ppm
    ms1_tol=5,            # MS1 tolerance (ppm or Da, depending on `ppm`)
    ms2_tol=10,           # MS2 tolerance (ppm or Da, depending on `ppm`)
    halogen=False,        # NOTE(review): presumably disables halogen elements in candidates - confirm in msbuddy docs
)

# Annotation engine built from the configuration above.
msb_engine = Msbuddy(msb_config)

msbuddy: molecular formula annotation for MS-based small molecule analysis.
Developed and maintained by Shipei Xing.


In [6]:
# Display the 100-row sample taken from the dataset (pandas truncates the middle rows).
df

Unnamed: 0,identifier,mzs,intensities,smiles,inchikey,formula,precursor_formula,parent_mass,precursor_mz,adduct,instrument_type,collision_energy,fold,simulation_challenge
0,MassSpecGymID0000001,"91.0542,125.0233,154.0499,155.0577,185.0961,20...","0.24524524524524524,1.0,0.08008008008008008,0....",CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,Orbitrap,30.0,train,True
1,MassSpecGymID0000002,"91.0542,125.0233,155.0577,185.0961,229.0859,24...","0.0990990990990991,0.28128128128128127,0.04004...",CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,Orbitrap,20.0,train,True
2,MassSpecGymID0000003,"69.0343,91.0542,125.0233,127.039,153.0699,154....","0.03403403403403404,0.31431431431431434,1.0,0....",CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,Orbitrap,40.0,train,True
3,MassSpecGymID0000004,"69.0343,91.0542,110.06,111.0441,112.0393,120.0...","0.17917917917917917,0.47347347347347346,0.0380...",CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,Orbitrap,55.0,train,True
4,MassSpecGymID0000005,"91.0542,125.0233,185.0961,229.0859,246.1125,28...","0.07807807807807808,0.1841841841841842,0.03503...",CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,Orbitrap,10.0,train,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,MassSpecGymID0000108,"69.0335,71.0491,79.0542,83.0491,85.0284,95.049...","0.06906906906906907,0.09009009009009009,0.0360...",CC1=C[C@@H]2[C@]([C@@H](C1=O)O)([C@]3([C@@H]([...,XGCUCFKWVIWWNW,C17H22O8,C17H23O8,354.130924,355.1382,[M+H]+,Orbitrap,35.0,train,True
96,MassSpecGymID0000109,"67.0542,69.0335,71.0491,79.0542,81.0699,83.049...","0.04904904904904905,0.11011011011011011,0.0920...",CC1=C[C@@H]2[C@]([C@@H](C1=O)O)([C@]3([C@@H]([...,XGCUCFKWVIWWNW,C17H22O8,C17H23O8,354.130924,355.1382,[M+H]+,Orbitrap,50.0,train,True
97,MassSpecGymID0000110,"69.0335,71.0491,85.0284,95.0491,97.0648,99.044...","0.05005005005005005,0.07607607607607608,0.0630...",CC1=C[C@@H]2[C@]([C@@H](C1=O)O)([C@]3([C@@H]([...,XGCUCFKWVIWWNW,C17H22O8,C17H23O8,354.130924,355.1382,[M+H]+,Orbitrap,30.0,train,True
98,MassSpecGymID0000111,"85.0284,99.0441,109.0648,123.0441,125.0597,137...","0.03903903903903904,0.22422422422422422,0.0380...",CC1=C[C@@H]2[C@]([C@@H](C1=O)O)([C@]3([C@@H]([...,XGCUCFKWVIWWNW,C17H22O8,C17H23O8,354.130924,355.1382,[M+H]+,Orbitrap,20.0,train,True


In [7]:
from msbuddy.base import MetaFeature, Spectrum

In [7]:
import numpy as np
# Take the first spectrum of the sample as the worked example.
s = df.iloc[0]

# Peaks are stored as comma-separated text; parse each field into a float array.
mz_array = np.array([float(tok) for tok in s.mzs.split(',')])
int_array = np.array([float(tok) for tok in s.intensities.split(',')])

# Show both arrays as the cell output.
mz_array, int_array

(array([ 91.0542, 125.0233, 154.0499, 155.0577, 185.0961, 200.107 ,
        229.0859, 246.1125]),
 array([0.24524525, 1.        , 0.08008008, 0.35535536, 0.34934935,
        0.04504505, 0.14214214, 0.73473473]))

In [12]:
# Wrap the parsed peak arrays in an msbuddy Spectrum, passed below as the MS2 spectrum.
ms2_spec = Spectrum(
    mz_array=mz_array,
    int_array=int_array,
)

# Describe the query feature. The adduct is read from the selected row instead of
# being hard-coded, so the metadata stays correct if `s` is pointed at another row.
metafeature = MetaFeature(
    identifier=0,        # unique identifier for the MetaFeature object
    mz=s.precursor_mz,   # precursor m/z taken from the dataset row
    rt=None,             # retention time; the dataset table has no RT column
    charge=1,            # precursor charge; NOTE(review): assumes a singly charged positive ion - confirm for other rows
    adduct=s.adduct,     # adduct string from the dataset row ('[M+H]+' for row 0)
    ms2=ms2_spec,
)

In [13]:
# Hand the query to the engine; add_data is given a list, here with the single MetaFeature.
msb_engine.add_data([metafeature])

In [16]:
# Run molecular formula annotation for the data previously added to the engine.
msb_engine.annotate_formula()

1 query loaded.
1 batch in total.
Batch 1/1:
Candidate space generation: 100%|██████████| 1/1 [00:07<00:00,  7.09s/it]
Subformula assignment: 100%|██████████| 1/1 [00:04<00:00,  4.70s/it]
Candidate formula ranking...
FDR calculation: 100%|██████████| 1/1 [00:00<00:00, 12826.62it/s]
Job finished.


In [18]:
# Collect the annotation summary from the engine.
results = msb_engine.get_summary()

In [19]:
# Display the summary: one dict per query with the ranked formulas and the estimated FDR.
results

[{'identifier': 0,
  'mz': 288.1225,
  'rt': None,
  'adduct': '[M+H]+',
  'formula_rank_1': 'C16H17NO4',
  'estimated_fdr': 0.0006285439313621355,
  'formula_rank_2': 'C10H18N5O3P',
  'formula_rank_3': 'C8H21N3O6S',
  'formula_rank_4': 'C9H17N7O2S',
  'formula_rank_5': None}]

In [20]:
# Print every ranked candidate formula of each query together with its estimated FDR.
# The fields are separated consistently ("mz: ", "  Formula: "); the previous string
# concatenation ran them together as "mz288.1225" and "rank: 1Formula:".
for meta_feature in msb_engine.data:
    # Ranks are reported 1-based, so enumerate starts at 1.
    for rank, candidate in enumerate(meta_feature.candidate_formula_list, start=1):
        print(
            f'MetaFeature mz: {meta_feature.mz}'
            f'  rt: {meta_feature.rt}'
            f'  rank: {rank}'
            f'  Formula: {candidate.formula}'
            f'  estimated FDR: {candidate.estimated_fdr}'
        )

MetaFeature mz288.1225  rt: None  rank: 1Formula: C16H17NO4  estimated FDR: 0.0006285439313621355
MetaFeature mz288.1225  rt: None  rank: 2Formula: C10H18N5O3P  estimated FDR: 0.5000384343338329
MetaFeature mz288.1225  rt: None  rank: 3Formula: C8H21N3O6S  estimated FDR: 0.6666718275522459
MetaFeature mz288.1225  rt: None  rank: 4Formula: C9H17N7O2S  estimated FDR: 0.75


In [None]:

# Alternative input route: load queries from an MGF file instead of building
# MetaFeature objects by hand.
# NOTE(review): this cell has no execution count and 'input_file.mgf' looks like an
# example path - confirm the file exists before including this cell in a full run.
msb_engine.load_mgf('input_file.mgf')

# Annotate molecular formulas for the loaded data.
msb_engine.annotate_formula()

# Retrieve the annotation result summary (bound to `result`, not the `results` name used earlier).
result = msb_engine.get_summary()