In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%load_ext snakeviz

In [3]:
from datasets import load_dataset

In [4]:
from rdkit import Chem
import re
from collections import defaultdict
from msbuddy import Msbuddy, MsbuddyConfig
from msbuddy.base import MetaFeature, Spectrum
import numpy as np
from tqdm.cli import tqdm

def parse_formula(formula):
    """
    Parse a chemical formula into a dictionary of elements and counts.
    """
    element_counts = defaultdict(int)
    matches = re.findall(r'([A-Z][a-z]?)(\d*)', formula)
    for element, count in matches:
        element_counts[element] += int(count) if count else 1
    return element_counts

def formula_from_dict(element_counts):
    """
    Convert an element-count dictionary back to a formula string.
    """
    formula = ''
    for element, count in sorted(element_counts.items()):
        formula += element + (str(count) if count > 1 else '')
    return formula

def add_adduct(formula, adduct):
    """
    Add an adduct to the main chemical formula.
    """
    # Initialize the periodic table
    pt = Chem.GetPeriodicTable()
    
    # Parse the formula and adduct
    formula_counts = parse_formula(formula)
    adduct_counts = parse_formula(adduct)
    
    # Add the counts from the adduct to the main formula
    for element, count in adduct_counts.items():
        formula_counts[element] += count
    
    # Generate the combined formula string
    combined_formula = formula_from_dict(formula_counts)
    
    return combined_formula

# add_adduct('CH4', "H")

In [5]:
import pandas as pd
df = load_dataset('roman-bushuiev/MassSpecGym', data_files='data/MassSpecGym.tsv', split='train').shuffle(seed=42).select(range(1_000)).to_pandas()
# df = pd.read_parquet('data/msg_rand500.parquet')

In [6]:
df

Unnamed: 0,identifier,mzs,intensities,smiles,inchikey,formula,precursor_formula,parent_mass,precursor_mz,adduct,instrument_type,collision_energy,fold,simulation_challenge
0,MassSpecGymID0052279,"114.0915,149.0709,176.1072,191.0815,192.0768,1...","0.0013355700000000001,0.00890994,0.00139608,0....",CC(C)C1(C(=O)NC(=N1)C2=C(C=C(C=N2)COC)C(=O)O)C,NUPJIGQFXCQJBK,C15H19N3O4,C15H20N3O4,305.137524,306.14480,[M+H]+,Orbitrap,21.430136,train,True
1,MassSpecGymID0167344,"75.023003,77.038002,89.038002,91.054001,101.03...","0.059,0.006,1.0,0.046,0.052,0.038,0.427,0.126,...",C1=C2C3=C(C(=C1O)O)OC(=O)C4=CC(=C(C(=C43)OC2=O...,AFSDNFLWKVMVRB,C14H6O8,C14H7O8,302.002724,303.01000,[M+H]+,Orbitrap,,train,False
2,MassSpecGymID0400927,"91.054169,93.069809,95.049164,95.085434,97.064...","0.14086688273294626,0.11596669766096218,0.0075...",CCCCCCCCCCCCCCCC/C=C\OC[C@H](COP(=O)(O)OCCN)OC...,URPXXNCTXCOATD,C43H78NO7P,C43H79NO7P,751.551724,752.55900,[M+H]+,Orbitrap,,train,False
3,MassSpecGymID0055045,"42.0338,44.0495,58.0651,69.0448,71.0604,83.060...","1.0,0.3083083083083083,0.3383383383383383,0.02...",C1N2CN3CN1CN(C2)C3,VKYKSIONXSXAKP,C6H12N4,C6H13N4,140.106224,141.11350,[M+H]+,Orbitrap,21.167025,train,True
4,MassSpecGymID0227317,"41.038471,41.270458,43.054085,44.049362,71.085...","0.00196,0.00248,0.00226,0.041749999999999995,0...",CC(C)(C)CNCCN1C2=C(C(=NC=C2)N)N=C1SC3=CC4=C(C=...,RVJIQAYFTOPTKK,C22H30N6O2S,C22H31N6O2S,442.215094,443.22237,[M+H]+,Orbitrap,60.000000,train,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,MassSpecGymID0199548,"107.049797,111.038628,121.027344,123.007416,12...","0.00220512,0.00213621,0.005742509999999999,0.0...",COC1=CC(=CC(=C1C(=O)C2=C(C=C(C=C2O)CO)O)C(=O)OC)O,WSRCOMOWYVKWBE,C17H16O8,C17H17O8,348.082724,349.09000,[M+H]+,,,train,False
996,MassSpecGymID0231288,"58.065174,68.012993,70.065186,99.091698,101.10...","0.07695,0.010820000000000001,0.018680000000000...",CC(=O)NC1=CC(=NC=N1)OC2=CC3=C(C=C2)C(=CC=C3)C(...,MZZJNOOADWVFPD,C30H29F3N6O3,C30H30F3N6O3,578.225324,579.23260,[M+H]+,Orbitrap,60.000000,test,True
997,MassSpecGymID0073570,"71.0604,78.0338,83.024,93.0083,96.0443,100.039...","0.07907907907907907,0.06906906906906907,1.0,0....",CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,MEFOUWRMVYJCQC,C14H17N5O7S2,C14H18N5O7S2,431.056924,432.06420,[M+H]+,Orbitrap,77.771556,test,True
998,MassSpecGymID0061856,"68.997002,74.014999,75.023003,77.038002,79.017...","1.0,0.01,0.075,0.577,0.032,0.01,0.548,0.172,0....",C1=CC(=C(C=C1C2=C(C(=O)C3=C(C=C(C=C3O2)O)O)O)O)O,REFJWTPEDVJJIY,C15H10O7,C15H11O7,302.042724,303.05000,[M+H]+,Orbitrap,,train,False


In [7]:
for instrument_type, gp in df.groupby('instrument_type'):
    # instantiate a MsbuddyConfig object
    msb_config = MsbuddyConfig(
                            ms_instr=instrument_type.lower(), # supported: "qtof", "orbitrap", "fticr" or None
                            # parallel=True,
                            # n_cpu=4,
                            # custom MS1 and MS2 tolerance will be used if None
                            ppm=True,  # use ppm for mass tolerance
                            ms1_tol=5,  # MS1 tolerance in ppm or Da
                            ms2_tol=10,  # MS2 tolerance in ppm or Da
                            halogen=False,
                            top_n_per_50_da=5,
        )

    # instantiate a Msbuddy object
    msb_engine = Msbuddy(msb_config)

    metafeatures = []
    
    for i, sp in tqdm(gp.iterrows(), total=len(gp), desc='adding metafeatures...'):
        mz_array = np.array(list(map(float, sp.mzs.split(','))))
        int_array = np.array(list(map(float, sp.intensities.split(','))))
        # print(f"arrays {mz_array, int_array}")
        ms2_spec = Spectrum(
            mz_array=mz_array,
            int_array=int_array,
        )

        metafeature = MetaFeature(
            identifier = 0,  # unique identifier for the MetaFeature object
            mz = sp.precursor_mz,  # precursor m/z
            rt = None,  # retention time, can be None if not available
            charge = 1,  # precursor charge
            adduct = sp.adduct,
            ms2 = ms2_spec)
        metafeatures.append(metafeature)
    break

msbuddy: molecular formula annotation for MS-based small molecule analysis.
Developed and maintained by Shipei Xing.


adding metafeatures...: 100%|██████████| 743/743 [00:01<00:00, 503.69it/s]


In [8]:
# msb_engine.add_data(metafeatures[:9])
msb_engine.add_data(metafeatures)

In [9]:
# raise

In [10]:
# msb_engine.annotate_formula()

In [11]:
# 1000 / 1 took 783 s
# 500 / 8 took 347 s
# 500 / 1 took 400 s

In [12]:
%%snakeviz -t
msb_engine.annotate_formula()

743 queries loaded.
1 batch in total.
Batch 1/1:
Candidate space generation: 100%|[32m██████████[0m| 743/743 [00:23<00:00, 32.24it/s] 
Subformula assignment: 100%|[32m██████████[0m| 743/743 [02:46<00:00,  4.46it/s]
Candidate formula ranking...
FDR calculation: 100%|[32m██████████[0m| 743/743 [00:00<00:00, 20871.24it/s]
Job finished.
 
*** Profile stats marshalled to file '/tmp/tmppih8mrq1'.
Opening SnakeViz in a new tab...
snakeviz web server started on 127.0.0.1:8080; enter Ctrl-C to exit
http://127.0.0.1:8080/snakeviz/%2Ftmp%2Ftmppih8mrq1


In [13]:
results = pd.DataFrame(msb_engine.get_summary())
results

Unnamed: 0,identifier,mz,rt,adduct,formula_rank_1,estimated_fdr,formula_rank_2,formula_rank_3,formula_rank_4,formula_rank_5
0,0,306.14480,,[M+H]+,C15H19N3O4,0.000051,C16H15N7,C8H19N9O2S,C9H20N7O3P,
1,0,303.01000,,[M+H]+,C10H10N2O5S2,0.015292,C9H6N2O10,C18H6O3S,C11H11O6PS,C7H11O11P
2,0,752.55900,,[M+H]+,C43H78NO7P,0.015400,C45H73N3O6,C50H73NO4,C46H69N7O2,C42H77N3O6S
3,0,141.11350,,[M+H]+,C6H12N4,0.000000,,,,
4,0,443.22237,,[M+H]+,C22H30N6O2S,0.006770,C29H30O4,C21H34N2O6S,C19H39O7PS,C14H26N12O5
...,...,...,...,...,...,...,...,...,...,...
738,0,227.06625,,[M+H]+,C9H10N2O5,0.000032,C10H14N2S2,C2H10N8O3S,C3H11N6O4P,
739,0,579.23260,,[M+H]+,C31H34N2O9,0.090559,C32H30N6O5,C27H30N8O7,C35H34N2O4S,C23H38N4O11S
740,0,432.06420,,[M+H]+,C14H17N5O7S2,0.029391,C11H18N3O13P,C13H13N5O12,C22H13N3O5S,C15H21N5O2S4
741,0,303.05000,,[M+H]+,C15H10O7,0.000473,C16H6N4O3,C8H18N2O4S3,C7H16N2O7P2,C7H14N2O9S


In [14]:
(gp.formula.values == results.formula_rank_1).mean()

0.6231493943472409

In [15]:
((gp.formula.values == results.formula_rank_1) | 
 (gp.formula.values == results.formula_rank_2) | 
 (gp.formula.values == results.formula_rank_3) | 
 (gp.formula.values == results.formula_rank_4) | 
 (gp.formula.values == results.formula_rank_5)
 ).mean()

0.6554508748317631