In [1]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the single TSV file `data/MassSpecGym.tsv` from the
# `roman-bushuiev/MassSpecGym` dataset repository. The result is indexed by
# split name further down (`msg['train']`).
msg = load_dataset(path='roman-bushuiev/MassSpecGym',
                   data_files='data/MassSpecGym.tsv')

In [3]:
# Seeded shuffle of the 'train' split, keep the first 1000 shuffled rows,
# and materialise them as a pandas DataFrame.
df = (
    msg['train']
    .shuffle(seed=42)
    .select(range(1000))
    .to_pandas()
)

In [4]:
# Discard every row whose 'smiles' entry is missing; all other columns may
# still contain missing values.
df = df.dropna(axis='index', how='any', subset=['smiles'])

In [5]:
from rdkit import Chem

def as_formula(smiles, include_hydrogens=False):
    """Count the atoms of a molecule, grouped by element symbol.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule.
    include_hydrogens : bool, default False
        When False (the historical behaviour) only the atoms present in the
        parsed molecular graph are counted, so hydrogens that RDKit keeps
        implicit do not appear in the result. When True, the hydrogens
        attached to every atom (``Atom.GetTotalNumHs()``) are added under
        the ``'H'`` key.

    Returns
    -------
    dict
        Mapping ``element symbol -> count``, with keys in order of first
        appearance in the molecule.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles`` (``Chem.MolFromSmiles`` returned
        ``None``).
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail with
        # a message naming the offending input instead of an AttributeError.
        raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")

    formula = {}
    hydrogens = 0
    for atom in mol.GetAtoms():
        element = atom.GetSymbol()
        formula[element] = formula.get(element, 0) + 1
        if include_hydrogens:
            # Hydrogens attached to this atom that are not themselves atoms
            # of the graph (explicit H atoms are already counted above).
            hydrogens += atom.GetTotalNumHs()

    if hydrogens:
        formula['H'] = formula.get('H', 0) + hydrogens
    return formula

# Store the per-molecule element counts returned by as_formula.
# NOTE(review): in the frame displayed after this cell, 'formula' sits between
# 'inchikey' and 'precursor_formula' rather than at the end, which suggests the
# dataset already ships a 'formula' column that this assignment overwrites --
# confirm that is intended.
df['formula'] = df['smiles'].map(as_formula)

In [6]:
# Display the sampled frame, including the 'formula' column assigned above.
df

Unnamed: 0,identifier,mzs,intensities,smiles,inchikey,formula,precursor_formula,parent_mass,precursor_mz,adduct,instrument_type,collision_energy,fold,simulation_challenge
0,MassSpecGymID0052279,"114.0915,149.0709,176.1072,191.0815,192.0768,1...","0.0013355700000000001,0.00890994,0.00139608,0....",CC(C)C1(C(=O)NC(=N1)C2=C(C=C(C=N2)COC)C(=O)O)C,NUPJIGQFXCQJBK,"{'C': 15, 'O': 4, 'N': 3}",C15H20N3O4,305.137524,306.14480,[M+H]+,Orbitrap,21.430136,train,True
1,MassSpecGymID0167344,"75.023003,77.038002,89.038002,91.054001,101.03...","0.059,0.006,1.0,0.046,0.052,0.038,0.427,0.126,...",C1=C2C3=C(C(=C1O)O)OC(=O)C4=CC(=C(C(=C43)OC2=O...,AFSDNFLWKVMVRB,"{'C': 14, 'O': 8}",C14H7O8,302.002724,303.01000,[M+H]+,Orbitrap,,train,False
2,MassSpecGymID0400927,"91.054169,93.069809,95.049164,95.085434,97.064...","0.14086688273294626,0.11596669766096218,0.0075...",CCCCCCCCCCCCCCCC/C=C\OC[C@H](COP(=O)(O)OCCN)OC...,URPXXNCTXCOATD,"{'C': 43, 'O': 7, 'P': 1, 'N': 1}",C43H79NO7P,751.551724,752.55900,[M+H]+,Orbitrap,,train,False
3,MassSpecGymID0055045,"42.0338,44.0495,58.0651,69.0448,71.0604,83.060...","1.0,0.3083083083083083,0.3383383383383383,0.02...",C1N2CN3CN1CN(C2)C3,VKYKSIONXSXAKP,"{'C': 6, 'N': 4}",C6H13N4,140.106224,141.11350,[M+H]+,Orbitrap,21.167025,train,True
4,MassSpecGymID0227317,"41.038471,41.270458,43.054085,44.049362,71.085...","0.00196,0.00248,0.00226,0.041749999999999995,0...",CC(C)(C)CNCCN1C2=C(C(=NC=C2)N)N=C1SC3=CC4=C(C=...,RVJIQAYFTOPTKK,"{'C': 22, 'N': 6, 'S': 1, 'O': 2}",C22H31N6O2S,442.215094,443.22237,[M+H]+,Orbitrap,60.000000,train,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,MassSpecGymID0199548,"107.049797,111.038628,121.027344,123.007416,12...","0.00220512,0.00213621,0.005742509999999999,0.0...",COC1=CC(=CC(=C1C(=O)C2=C(C=C(C=C2O)CO)O)C(=O)OC)O,WSRCOMOWYVKWBE,"{'C': 17, 'O': 8}",C17H17O8,348.082724,349.09000,[M+H]+,,,train,False
996,MassSpecGymID0231288,"58.065174,68.012993,70.065186,99.091698,101.10...","0.07695,0.010820000000000001,0.018680000000000...",CC(=O)NC1=CC(=NC=N1)OC2=CC3=C(C=C2)C(=CC=C3)C(...,MZZJNOOADWVFPD,"{'C': 30, 'O': 3, 'N': 6, 'F': 3}",C30H30F3N6O3,578.225324,579.23260,[M+H]+,Orbitrap,60.000000,test,True
997,MassSpecGymID0073570,"71.0604,78.0338,83.024,93.0083,96.0443,100.039...","0.07907907907907907,0.06906906906906907,1.0,0....",CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,MEFOUWRMVYJCQC,"{'C': 14, 'S': 2, 'O': 7, 'N': 5}",C14H18N5O7S2,431.056924,432.06420,[M+H]+,Orbitrap,77.771556,test,True
998,MassSpecGymID0061856,"68.997002,74.014999,75.023003,77.038002,79.017...","1.0,0.01,0.075,0.577,0.032,0.01,0.548,0.172,0....",C1=CC(=C(C=C1C2=C(C(=O)C3=C(C=C(C=C3O2)O)O)O)O)O,REFJWTPEDVJJIY,"{'C': 15, 'O': 7}",C15H11O7,302.042724,303.05000,[M+H]+,Orbitrap,,train,False


In [7]:
from msbuddy import Msbuddy, MsbuddyConfig

# Settings for msbuddy.
# NOTE(review): per the option comment below, the custom MS1/MS2 tolerances are
# used when ms_instr is None; with ms_instr='orbitrap' they may be ignored --
# confirm against the msbuddy documentation.
# NOTE(review): halogen=False presumably excludes halogens from candidate
# formulas, yet the sample shown in this notebook contains a fluorinated
# compound (C30H30F3N6O3) -- confirm this is intended.
msb_config = MsbuddyConfig(
    ms_instr='orbitrap',  # supported: "qtof", "orbitrap", "fticr" or None
    ppm=True,             # mass tolerances are given in ppm
    ms1_tol=5,            # MS1 tolerance in ppm or Da
    ms2_tol=10,           # MS2 tolerance in ppm or Da
    halogen=False,
)

# Engine built from the settings above.
msb_engine = Msbuddy(msb_config)

In [21]:
import matchms as mms
import numpy as np

def as_spectrum(row):
    """Build a matchms ``Spectrum`` from one record of the dataset.

    ``row`` must support ``row[key]`` for the keys ``'mzs'``,
    ``'intensities'``, ``'smiles'``, ``'precursor_mz'`` and ``'formula'``.
    ``'mzs'`` and ``'intensities'`` are comma-separated strings of numbers;
    each is parsed into a float array. The remaining three values are passed
    through unchanged as spectrum metadata.
    """
    def _parse_floats(key):
        # "1.0,2.5,..." -> array([1.0, 2.5, ...])
        return np.array([float(token) for token in row[key].split(',')])

    peak_mzs = _parse_floats('mzs')
    peak_intensities = _parse_floats('intensities')
    meta = {
        'smiles': row['smiles'],
        'precursor_mz': row['precursor_mz'],
        'formula': row['formula'],
    }
    return mms.Spectrum(mz=peak_mzs, intensities=peak_intensities, metadata=meta)
    
# One Spectrum per DataFrame row (as_spectrum receives each row in turn).
specs = df.apply(as_spectrum, axis='columns')

In [22]:
# Display the resulting Series of spectra built by as_spectrum.
specs

0      Spectrum(precursor m/z=306.14, 24 fragments be...
1      Spectrum(precursor m/z=303.01, 17 fragments be...
2      Spectrum(precursor m/z=752.56, 105 fragments b...
3      Spectrum(precursor m/z=141.11, 10 fragments be...
4      Spectrum(precursor m/z=443.22, 34 fragments be...
                             ...                        
995    Spectrum(precursor m/z=349.09, 51 fragments be...
996    Spectrum(precursor m/z=579.23, 33 fragments be...
997    Spectrum(precursor m/z=432.06, 10 fragments be...
998    Spectrum(precursor m/z=303.05, 42 fragments be...
999    Spectrum(precursor m/z=780.59, 42 fragments be...
Length: 1000, dtype: object