In [1]:
import re
from datasets import load_dataset
from rdkit import Chem
from collections import defaultdict

In [2]:
def parse_formula(formula):
    """
    Break a chemical formula string into per-element atom counts.

    An element symbol is one uppercase letter optionally followed by one
    lowercase letter; the digits directly after it are its count, and a
    symbol without digits counts as 1. Repeated symbols are summed.
    Characters that are not part of such a token (brackets, signs, ...)
    are skipped.

    Returns a defaultdict(int) mapping element symbol -> count, with keys
    in order of first appearance.
    """
    totals = defaultdict(int)
    for token in re.finditer(r'([A-Z][a-z]?)(\d*)', formula):
        symbol, digits = token.groups()
        # An empty digit group means the element occurs once.
        totals[symbol] += int(digits or 1)
    return totals

def formula_from_dict(element_counts):
    """
    Convert an element-count dictionary back to a formula string.

    Elements are written in Hill order so the result can be compared with
    conventionally written formulas such as 'C14H14F4N3O2S': when carbon is
    present, C comes first, H second, and every other element follows
    alphabetically; without carbon, all elements (including H) are
    alphabetical. A count of 1 is written without a number and elements
    with a count of 0 are left out.

    Parameters
    ----------
    element_counts : mapping of str -> int
        Element symbol to number of atoms.

    Returns
    -------
    str
        The formula string ('' for an empty mapping).

    Raises
    ------
    ValueError
        If any count is negative.
    """
    present = {}
    for element, count in element_counts.items():
        if count < 0:
            raise ValueError(f"negative count for element {element!r}: {count}")
        if count > 0:
            present[element] = count

    if 'C' in present:
        # Hill system: carbon, then hydrogen, then the rest alphabetically.
        leading = [element for element in ('C', 'H') if element in present]
        ordered = leading + sorted(e for e in present if e not in ('C', 'H'))
    else:
        ordered = sorted(present)

    return ''.join(
        element + (str(present[element]) if present[element] > 1 else '')
        for element in ordered
    )

def add_adduct(formula, adduct):
    """
    Add an adduct to the main chemical formula.

    Both arguments are parsed with parse_formula, the per-element counts are
    summed, and the total is rendered with formula_from_dict.

    Parameters
    ----------
    formula : str
        Formula of the main species, e.g. 'CH4'.
    adduct : str
        Formula of the species to add, e.g. 'H' or 'Na'. This must be a
        plain formula: adduct notation such as '[M+H]+' is not understood
        (parse_formula would read the 'M' as an element symbol).

    Returns
    -------
    str
        Formula string of the combined species.
    """
    combined_counts = parse_formula(formula)

    # Accumulate with .get() so this works whether or not the parsed mapping
    # supplies a default for elements that only occur in the adduct.
    for element, count in parse_formula(adduct).items():
        combined_counts[element] = combined_counts.get(element, 0) + count

    return formula_from_dict(combined_counts)

# Example: add_adduct('CH4', 'H') returns 'CH5'.

In [3]:
# Load the MassSpecGym spectra table (a single TSV file) via `datasets`.
msg = load_dataset(
    'roman-bushuiev/MassSpecGym',
    data_files='data/MassSpecGym.tsv',
    split='train',
)

In [4]:
# Shuffle with a fixed seed, keep the first 1000 rows, and convert the
# sample to a pandas DataFrame.
df = (
    msg
    .shuffle(seed=42)
    .select(range(1000))
    .to_pandas()
)

In [5]:
# Display the sampled spectra table (bare expression -> rich notebook output).
df

Unnamed: 0,identifier,mzs,intensities,smiles,inchikey,formula,precursor_formula,parent_mass,precursor_mz,adduct,instrument_type,collision_energy,fold,simulation_challenge
0,MassSpecGymID0052279,"114.0915,149.0709,176.1072,191.0815,192.0768,1...","0.0013355700000000001,0.00890994,0.00139608,0....",CC(C)C1(C(=O)NC(=N1)C2=C(C=C(C=N2)COC)C(=O)O)C,NUPJIGQFXCQJBK,C15H19N3O4,C15H20N3O4,305.137524,306.14480,[M+H]+,Orbitrap,21.430136,train,True
1,MassSpecGymID0167344,"75.023003,77.038002,89.038002,91.054001,101.03...","0.059,0.006,1.0,0.046,0.052,0.038,0.427,0.126,...",C1=C2C3=C(C(=C1O)O)OC(=O)C4=CC(=C(C(=C43)OC2=O...,AFSDNFLWKVMVRB,C14H6O8,C14H7O8,302.002724,303.01000,[M+H]+,Orbitrap,,train,False
2,MassSpecGymID0400927,"91.054169,93.069809,95.049164,95.085434,97.064...","0.14086688273294626,0.11596669766096218,0.0075...",CCCCCCCCCCCCCCCC/C=C\OC[C@H](COP(=O)(O)OCCN)OC...,URPXXNCTXCOATD,C43H78NO7P,C43H79NO7P,751.551724,752.55900,[M+H]+,Orbitrap,,train,False
3,MassSpecGymID0055045,"42.0338,44.0495,58.0651,69.0448,71.0604,83.060...","1.0,0.3083083083083083,0.3383383383383383,0.02...",C1N2CN3CN1CN(C2)C3,VKYKSIONXSXAKP,C6H12N4,C6H13N4,140.106224,141.11350,[M+H]+,Orbitrap,21.167025,train,True
4,MassSpecGymID0227317,"41.038471,41.270458,43.054085,44.049362,71.085...","0.00196,0.00248,0.00226,0.041749999999999995,0...",CC(C)(C)CNCCN1C2=C(C(=NC=C2)N)N=C1SC3=CC4=C(C=...,RVJIQAYFTOPTKK,C22H30N6O2S,C22H31N6O2S,442.215094,443.22237,[M+H]+,Orbitrap,60.000000,train,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,MassSpecGymID0199548,"107.049797,111.038628,121.027344,123.007416,12...","0.00220512,0.00213621,0.005742509999999999,0.0...",COC1=CC(=CC(=C1C(=O)C2=C(C=C(C=C2O)CO)O)C(=O)OC)O,WSRCOMOWYVKWBE,C17H16O8,C17H17O8,348.082724,349.09000,[M+H]+,,,train,False
996,MassSpecGymID0231288,"58.065174,68.012993,70.065186,99.091698,101.10...","0.07695,0.010820000000000001,0.018680000000000...",CC(=O)NC1=CC(=NC=N1)OC2=CC3=C(C=C2)C(=CC=C3)C(...,MZZJNOOADWVFPD,C30H29F3N6O3,C30H30F3N6O3,578.225324,579.23260,[M+H]+,Orbitrap,60.000000,test,True
997,MassSpecGymID0073570,"71.0604,78.0338,83.024,93.0083,96.0443,100.039...","0.07907907907907907,0.06906906906906907,1.0,0....",CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,MEFOUWRMVYJCQC,C14H17N5O7S2,C14H18N5O7S2,431.056924,432.06420,[M+H]+,Orbitrap,77.771556,test,True
998,MassSpecGymID0061856,"68.997002,74.014999,75.023003,77.038002,79.017...","1.0,0.01,0.075,0.577,0.032,0.01,0.548,0.172,0....",C1=CC(=C(C=C1C2=C(C(=O)C3=C(C=C(C=C3O2)O)O)O)O)O,REFJWTPEDVJJIY,C15H10O7,C15H11O7,302.042724,303.05000,[M+H]+,Orbitrap,,train,False


In [6]:
# Number of sampled rows per adduct value.
df['adduct'].value_counts()

adduct
[M+H]+     838
[M+Na]+    162
Name: count, dtype: int64

In [7]:
# Number of sampled rows per instrument type. value_counts() leaves out
# missing values by default, so rows without an instrument type are not
# included in these counts.
df['instrument_type'].value_counts()

instrument_type
Orbitrap    743
QTOF        237
Name: count, dtype: int64

In [8]:
from msbuddy import Msbuddy, MsbuddyConfig
from msbuddy.base import MetaFeature, Spectrum
import numpy as np
from tqdm.auto import tqdm

# Annotate molecular formulas with msbuddy, one engine per instrument type so
# that the instrument-specific settings apply.
# NOTE: groupby() skips rows whose instrument_type is missing, so those
# spectra are never annotated here.
for instrument_type, gp in df.groupby('instrument_type'):
    # instantiate a MsbuddyConfig object
    # NOTE(review): halogen=False while the sampled data contains fluorinated
    # compounds (e.g. C30H29F3N6O3) — presumably those formulas cannot be
    # recovered with this setting; confirm against the msbuddy documentation.
    msb_config = MsbuddyConfig(
        ms_instr=instrument_type.lower(),  # supported: "qtof", "orbitrap", "fticr" or None
        parallel=False,
        # custom MS1 and MS2 tolerance will be used if None
        ppm=True,  # use ppm for mass tolerance
        ms1_tol=5,  # MS1 tolerance in ppm or Da
        ms2_tol=10,  # MS2 tolerance in ppm or Da
        halogen=False)

    # instantiate a Msbuddy object
    msb_engine = Msbuddy(msb_config)

    # Trial run: only the first 10 spectra of the group are annotated.
    subset = gp.head(10)

    metafeatures = []

    # `total` must match the rows actually iterated, otherwise the progress
    # bar never completes.
    for _, sp in tqdm(subset.iterrows(), total=len(subset), desc='adding metafeatures...'):
        # Peaks are stored as comma-separated strings of numbers.
        mz_array = np.array(sp.mzs.split(','), dtype=float)
        int_array = np.array(sp.intensities.split(','), dtype=float)

        ms2_spec = Spectrum(
            mz_array=mz_array,
            int_array=int_array,
        )

        metafeature = MetaFeature(
            identifier=sp.identifier,  # unique per spectrum, so results can be traced back to rows
            mz=sp.precursor_mz,  # precursor m/z
            rt=None,  # retention time, can be None if not available
            charge=1,  # precursor charge
            adduct=sp.adduct,
            ms2=ms2_spec)
        metafeatures.append(metafeature)

    msb_engine.add_data(metafeatures)
    msb_engine.annotate_formula()
    results = msb_engine.get_summary()

    # Stop after the first group: `results` and `gp` (both used by later
    # cells) therefore describe that single instrument type only.
    break

msbuddy: molecular formula annotation for MS-based small molecule analysis.
Developed and maintained by Shipei Xing.


adding metafeatures...:   1%|▏         | 10/743 [00:02<03:24,  3.59it/s]


10 queries loaded.
1 batch in total.
Batch 1/1:
Candidate space generation:   0%|[32m          [0m| 0/10 [00:00<?, ?it/s]

In [77]:
# NOTE(review): the saved output of this cell lists row labels up to 9998,
# while cell 4 builds `df` from 1000 rows — presumably this cell was run
# against a larger sample; re-run the notebook top to bottom to confirm.
df

Unnamed: 0,identifier,mzs,intensities,smiles,inchikey,formula,precursor_formula,parent_mass,precursor_mz,adduct,instrument_type,collision_energy,fold,simulation_challenge
0,MassSpecGymID0118997,"67.01955,107.04877,108.05402,111.04688,119.048...","0.03903903903903904,0.6746746746746747,0.05605...",C1=CC(=CC=C1C2C(C(=O)C3=C(C=C(C=C3O2)O)O)O)O,PADQINQHPQKXNL,C15H12O6,C15H13O6,288.063389,289.070665,[M+H]+,QTOF,6.0,train,True
1,MassSpecGymID0002921,"89.060219,107.663078,131.598373,160.118057,171...","0.0006762799999999999,0.00044038,0.00052956999...",CC(C)C1(C(=O)NC(=N1)C2=C(C=CC=N2)C(=O)O)C,CLQMBPJKHLGMQK,C13H15N3O3,C13H15N3NaO3,261.111782,284.101000,[M+Na]+,,,train,False
2,MassSpecGymID0038869,"53.038799,55.054401,57.07,77.038597,79.054298,...","0.0024508114116806104,0.001345354564534534,0.0...",CC(C)(C)C#C/C=C/CN(C)CC1=CC=CC2=CC=CC=C21,DOMXUEMWDBAQBQ,C21H25N,C21H26N,291.198724,292.206000,[M+H]+,Orbitrap,,train,False
3,MassSpecGymID0001401,"50.23378,51.033839,51.119189,52.248405,54.3883...","0.00022053,0.00019757,0.00019276000000000002,0...",C[C@@H](CO)NC(=O)[C@H]1CN([C@@H]2CC3=CNC4=CC=C...,WVVSZNPYNCNODU,C19H23N3O2,C19H24N3O2,325.179003,326.186279,[M+H]+,Orbitrap,45.0,train,True
4,MassSpecGymID0224738,"121.028374,122.09642,148.075714,252.149612,270...","0.44606999999999997,0.14776,0.26298,0.03629,1....",CC1=C(C=C(C=C1)NC(=O)C2=CC(=CC=C2)N(C)C)NC(=O)...,PYEFPDQFAZNXLI,C23H23N3O3,C23H24N3O3,389.173944,390.181220,[M+H]+,Orbitrap,20.0,train,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,MassSpecGymID0270261,"162.091003,188.106003,207.087997,215.067001,22...","0.008,1.0,0.008,0.013,0.017,0.007,0.006,0.049,...",CCC12C=CCN3C1C4(CC3)C(C(C2OC(=O)C)(C(=O)OC)O)N...,CXBGOBGJHGGWIE,C25H32N2O6,C25H32N2NaO6,456.220782,479.210000,[M+Na]+,Orbitrap,,train,False
9996,MassSpecGymID0169915,"91.05527,93.07088,95.0863,124.11252,125.11496,...","0.013000000000000001,0.075,0.00699999999999999...",CN1C2CCC1CC(C2)OC(=O)C3=CC(=C(C=C3)OC)OC,AEFPCFUCFQBXDQ,C17H23NO4,C17H24NO4,305.162709,306.169985,[M+H]+,QTOF,10.0,train,True
9997,MassSpecGymID0033032,"57.07,109.045,124.0559,129.0701,143.0858,144.0...","0.009009009009009009,0.057057057057057055,0.14...",CC(C)N(C1=CC=C(C=C1)F)C(=O)COC2=NN=C(S2)C(F)(F)F,IANUJLZYFUDJIH,C14H13F4N3O2S,C14H14F4N3O2S,363.066424,364.073700,[M+H]+,Orbitrap,30.0,val,True
9998,MassSpecGymID0134663,"45.034241,53.03886,55.018169,57.033741,59.0494...","0.0018049731760930775,0.0021717732805915245,0....",C1=CC(=O)OC2=CC(=C(C=C21)O[C@H]3[C@@H]([C@H]([...,XHCADAYNFIFUHF,C15H16O9,C15H17O9,340.078724,341.086000,[M+H]+,QTOF,,train,False


In [78]:
import pandas as pd

# Tabulate the msbuddy summary and attach the reference precursor formulas.
# Rows are paired by position, which relies on `results` following the order
# of gp.head(10) — TODO confirm.
xf = pd.DataFrame(results).assign(Truth=gp.head(10).precursor_formula.values)

In [99]:
# Top-1 accuracy: share of spectra whose rank-1 formula, once the adduct
# species is added, has the same elemental composition as the reference
# precursor formula.
#
# - The species added depends on the adduct; always adding 'H' would count
#   every [M+Na]+ spectrum as wrong. An unexpected adduct raises KeyError
#   instead of being scored silently.
# - Formulas are compared as element counts, not strings, so the order in
#   which elements are written cannot cause a false mismatch.
# - A missing rank-1 prediction counts as a miss instead of raising.
adduct_species = {'[M+H]+': 'H', '[M+Na]+': 'Na'}

rank1_hits = [
    isinstance(predicted, str)
    and (
        dict(parse_formula(add_adduct(predicted, adduct_species[adduct])))
        == dict(parse_formula(truth))
    )
    for predicted, adduct, truth in zip(xf.formula_rank_1, gp.head(10).adduct, xf.Truth)
]

pd.Series(rank1_hits, dtype=bool).mean()

0.7