# Calculate MassBank Coverage for KEGG MINE 2.0
This notebook computes the values reported in the KEGG 2.0 coverage column of Table 2.

### Imports

In [1]:
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import pymongo

from minedatabase.metabolomics import ms_adduct_search

### Read Credentials for MongoDB

In [5]:
# Read the MongoDB user name and password from an external file so that no
# secrets are hard-coded in the notebook. The first line supplies the user
# name and the second the password, each written as "<key>=<value>".
with open('./../credentials.txt', 'r') as infile:
    lines = infile.readlines()
    # Split on the first '=' only: a value that itself contains '=' would
    # otherwise be silently truncated at its first '='.
    username = lines[0].strip().split('=', 1)[1]
    password = lines[1].strip().split('=', 1)[1]

### Connect to MongoDB KEGG 2.0 MINE

In [6]:
# Build the connection URI. The user name and password are percent-encoded
# because PyMongo rejects URIs whose credentials contain reserved characters
# such as ':', '/', '@' or '%'.
# NOTE(review): this assumes credentials.txt stores the raw (unescaped)
# values -- confirm before relying on it.
uri = (
    f'mongodb://{quote_plus(username)}:{quote_plus(password)}'
    '@minedatabase.ci.northwestern.edu:27017'
)
data_dir = './data'

# Server selection timeout is 10000 ms.
client = pymongo.MongoClient(uri, ServerSelectionTimeoutMS=10000)
# Handles for the three databases passed to ms_adduct_search below as
# db, core_db and keggdb respectively.
db = client['kegg_lte600_500mcy_2_1_0']
core_db = client['core']
kegg_models_db = client['kegg']

In [7]:
# Fetch one known compound document by its _id as a quick look at the
# document schema in the MINE's compounds collection.
db.compounds.find_one({'_id': 'C000000588c5adcceec7bc0033b2404db527f006f'})

{'_id': 'C000000588c5adcceec7bc0033b2404db527f006f',
 'SMILES': 'CC(=O)N1CC2OC(O)C1C(O)C2OC1OC(CO)C(O)C(OC2OC(CO)C(O)C(O)C2O)C1O',
 'InChI_key': 'ANMDONBJHHLQDM-UHFFFAOYSA-N',
 'Type': 'Predicted',
 'Generation': 1,
 'Expand': True,
 'Reactant_in': [],
 'Product_of': ['C000000588c5adcceec7bc0033b2404db527f006f_0']}

### Read MassBank Test Data File

In [8]:
# Load the tab-separated MassBank test set. Column names are supplied
# explicitly; the column labelled 'None' is dropped right after loading.
massbank_filepath = './../Data/MassBankTestSet.csv'

mb_df = pd.read_csv(
    massbank_filepath,
    sep='\t',
    names=['Name', 'Charge', 'Mass', 'Mode', 'None', 'Adduct', 'InChI_Key'],
).drop(columns=['None'])

In [9]:
# Preview the first rows of the loaded test set.
mb_df.head()

Unnamed: 0,Name,Charge,Mass,Mode,Adduct,InChI_Key
0,Metamitron-desamino,0,188.0818,Positive,[M+H]+,OUSYWCQYMPDAEO-UHFFFAOYSA-N
1,4-Isopropylaniline,0,136.1121,Positive,[M+H]+,LRTFPLFDLJYEKT-UHFFFAOYSA-N
2,Metolachlor morpholinone,0,234.1489,Positive,[M+H]+,DVBDYPDVNRJKNJ-UHFFFAOYSA-N
3,"2,6-Dichlorobenzamide",0,189.9821,Positive,[M+H]+,JHSPCUHPSIUQRB-UHFFFAOYSA-N
4,Amitraz,0,294.1965,Positive,[M+H]+,QXAITBQSYVNQDR-UHFFFAOYSA-N


In [10]:
# Preview the last rows of the loaded test set.
mb_df.tail()

Unnamed: 0,Name,Charge,Mass,Mode,Adduct,InChI_Key
662,Robinetin trimethyl ether,0,345.0969,Positive,[M+H]+,NJNGYVOYOVPWBB-UHFFFAOYSA-N
663,"3-Hydroxy-3',4',5'-trimethoxyflavone",0,329.102,Positive,[M+H]+,MWFLTXAQDCOKEK-UHFFFAOYSA-N
664,Carbobenzoxy-L-asparagine,0,265.083,Negative,[M-H]-,FUCKRCGERFLLHP-SECBINFHSA-N
665,(S)-(-)-Perillic acid,0,165.0921,Negative,[M-H]-,CDSMSBUVCWHORP-MRVPVSSYSA-N
666,Kaempferide,0,299.0561,Negative,[M-H]-,SQFSKOYWJBQGKQ-UHFFFAOYSA-N


In [11]:
# Keep only the rows whose Mass is at most 600.
# NOTE(review): the cutoff presumably mirrors the "lte600" in the MINE
# database name -- confirm.
mb_df = mb_df[mb_df['Mass'].le(600)]

In [12]:
# Number of test-set rows remaining after the mass filter.
len(mb_df)

634

### Search KEGG 2.0 MINE for Masses in MassBank Test Data File

Positive mode first

In [16]:
# Search settings for the positive-mode rows, passed to ms_adduct_search
# as its ms_params argument.
ms_params = dict(
    adducts=["[M+]+", "[M+H]+", "[M+Na]+"],
    tolerance=2,  # mDa
    ppm=False,
    logp=None,
    halogens=True,
    verbose=False,
    charge='+',
    models=[],
)

In [17]:
# Select the positive-mode rows once, then take both the row labels and the
# masses from that selection. Masses are converted to strings because they
# are passed to the search as its `text` argument.
positive_rows = mb_df.loc[mb_df.Mode == 'Positive']
ids = positive_rows.index
masses = list(map(str, positive_rows.Mass.values))
len(masses)

587

In [18]:
# Run one adduct search per positive-mode mass. Results are stored under the
# DataFrame index label of the corresponding MassBank row, and a progress
# line (label, mass, number of hits) is printed for each search.
hits_dict = {}
for row_label, mass_text in zip(ids, masses):
    search_hits = ms_adduct_search(
        db=db,
        core_db=core_db,
        keggdb=kegg_models_db,
        text=mass_text,
        text_type='form',
        ms_params=ms_params,
    )
    hits_dict[row_label] = search_hits
    print(row_label, mass_text, len(search_hits))

<MS Adduct Search: TextType=form, Text=188.0818, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
0 188.0818 15
<MS Adduct Search: TextType=form, Text=136.1121, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
1 136.1121 55
<MS Adduct Search: TextType=form, Text=234.1489, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
2 234.1489 353
<MS Adduct Search: TextType=form, Text=189.9821, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
3 189.9821 15
<MS Adduct Search: TextType=form, Text=294.1965, Parameters={'adducts': ['[M+]+', '[M+H

35 482.1756 318
<MS Adduct Search: TextType=form, Text=240.115, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
36 240.115 234
<MS Adduct Search: TextType=form, Text=441.167, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
37 441.167 528
<MS Adduct Search: TextType=form, Text=515.2442, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
38 515.2442 495
<MS Adduct Search: TextType=form, Text=559.2603, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
39 559.2603 450
<MS Adduct Search: TextType=form, Text=304.1543, Parameters={'adduct

72 539.09728 557
<MS Adduct Search: TextType=form, Text=285.07576, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
73 285.07576 2110
<MS Adduct Search: TextType=form, Text=310.16545, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
74 310.16545 575
<MS Adduct Search: TextType=form, Text=275.13902, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
76 275.13902 322
<MS Adduct Search: TextType=form, Text=180.10191, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
77 180.10191 553
<MS Adduct Search: TextType=form, Text=437.14423, Para

113 181.07067 520
<MS Adduct Search: TextType=form, Text=204.12438, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
114 204.12438 358
<MS Adduct Search: TextType=form, Text=300.28971, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
116 300.28971 743
<MS Adduct Search: TextType=form, Text=187.0577, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
119 187.0577 514
<MS Adduct Search: TextType=form, Text=150.05833, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
122 150.05833 112
<MS Adduct Search: TextType=form, Text=382.17212, Pa

158 181.076 33
<MS Adduct Search: TextType=form, Text=170.0964, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
159 170.0964 54
<MS Adduct Search: TextType=form, Text=220.9532, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
160 220.9532 26
<MS Adduct Search: TextType=form, Text=318.013, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
161 318.013 90
<MS Adduct Search: TextType=form, Text=230.0069, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
162 230.0069 71
<MS Adduct Search: TextType=form, Text=233.0243, Parameters={'adduc

194 202.0854 858
<MS Adduct Search: TextType=form, Text=309.1121, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
195 309.1121 536
<MS Adduct Search: TextType=form, Text=308.1524, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
196 308.1524 81
<MS Adduct Search: TextType=form, Text=253.0309, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
197 253.0309 357
<MS Adduct Search: TextType=form, Text=165.1022, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
198 165.1022 108
<MS Adduct Search: TextType=form, Text=291.0895, Parameters=

233 250.0645 34
<MS Adduct Search: TextType=form, Text=195.0877, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
234 195.0877 534
<MS Adduct Search: TextType=form, Text=232.0524, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
235 232.0524 28
<MS Adduct Search: TextType=form, Text=152.0706, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
236 152.0706 470
<MS Adduct Search: TextType=form, Text=237.1022, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
237 237.1022 157
<MS Adduct Search: TextType=form, Text=296.024, Parameters={'

271 391.2843 4230
<MS Adduct Search: TextType=form, Text=214.0896, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
272 214.0896 33
<MS Adduct Search: TextType=form, Text=134.0713, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
273 134.0713 5
<MS Adduct Search: TextType=form, Text=314.9853, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
274 314.9853 32
<MS Adduct Search: TextType=form, Text=327.0081, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
275 327.0081 144
<MS Adduct Search: TextType=form, Text=279.0933, Parameters={'

307 544.182 2517
<MS Adduct Search: TextType=form, Text=318.24342, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
308 318.24342 1030
<MS Adduct Search: TextType=form, Text=377.20776, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
309 377.20776 1329
<MS Adduct Search: TextType=form, Text=349.17646, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
310 349.17646 2993
<MS Adduct Search: TextType=form, Text=166.1233, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
311 166.1233 362
<MS Adduct Search: TextType=form, Text=592.35001, 

345 341.19921 75
<MS Adduct Search: TextType=form, Text=384.16729, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
346 384.16729 300
<MS Adduct Search: TextType=form, Text=361.20161, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
347 361.20161 5421
<MS Adduct Search: TextType=form, Text=459.27478, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
348 459.27478 1406
<MS Adduct Search: TextType=form, Text=359.18596, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
349 359.18596 1813
<MS Adduct Search: TextType=form, Text=218.19099

383 260.0918 263
<MS Adduct Search: TextType=form, Text=332.1857, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
384 332.1857 1773
<MS Adduct Search: TextType=form, Text=258.1125, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
385 258.1125 723
<MS Adduct Search: TextType=form, Text=467.2768, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
387 467.2768 2140
<MS Adduct Search: TextType=form, Text=215.1179, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
389 215.1179 213
<MS Adduct Search: TextType=form, Text=377.1456, Paramete

428 271.0601 2064
<MS Adduct Search: TextType=form, Text=255.0652, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
429 255.0652 1692
<MS Adduct Search: TextType=form, Text=269.0809, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
430 269.0809 1142
<MS Adduct Search: TextType=form, Text=289.0859, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
431 289.0859 220
<MS Adduct Search: TextType=form, Text=276.1707, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
433 276.1707 272
<MS Adduct Search: TextType=form, Text=355.2016, Paramet

475 180.1019 553
<MS Adduct Search: TextType=form, Text=384.1442, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
476 384.1442 1071
<MS Adduct Search: TextType=form, Text=409.1493, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
477 409.1493 1417
<MS Adduct Search: TextType=form, Text=306.2064, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
478 306.2064 749
<MS Adduct Search: TextType=form, Text=293.102, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
480 293.102 1685
<MS Adduct Search: TextType=form, Text=229.0859, Parameter

516 376.2483 1147
<MS Adduct Search: TextType=form, Text=415.1388, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
518 415.1388 1335
<MS Adduct Search: TextType=form, Text=257.0809, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
519 257.0809 1970
<MS Adduct Search: TextType=form, Text=457.1122, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
520 457.1122 2407
<MS Adduct Search: TextType=form, Text=381.2036, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
521 381.2036 3847
<MS Adduct Search: TextType=form, Text=266.1387, Param

567 191.0703 635
<MS Adduct Search: TextType=form, Text=313.1071, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
568 313.1071 816
<MS Adduct Search: TextType=form, Text=237.091, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
569 237.091 314
<MS Adduct Search: TextType=form, Text=175.123, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
571 175.123 127
<MS Adduct Search: TextType=form, Text=433.1129, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
572 433.1129 2969
<MS Adduct Search: TextType=form, Text=241.0859, Parameters={'

610 501.1603 546
<MS Adduct Search: TextType=form, Text=543.1709, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
612 543.1709 281
<MS Adduct Search: TextType=form, Text=345.1268, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
613 345.1268 1184
<MS Adduct Search: TextType=form, Text=273.1234, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
614 273.1234 572
<MS Adduct Search: TextType=form, Text=301.0626, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
615 301.0626 278
<MS Adduct Search: TextType=form, Text=299.0914, Parameter

656 282.1489 268
<MS Adduct Search: TextType=form, Text=198.0761, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
657 198.0761 665
<MS Adduct Search: TextType=form, Text=585.2906, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
658 585.2906 1905
<MS Adduct Search: TextType=form, Text=221.0476, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
659 221.0476 196
<MS Adduct Search: TextType=form, Text=301.1071, Parameters={'adducts': ['[M+]+', '[M+H]+', '[M+Na]+'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '+', 'models': []}>
660 301.1071 1610
<MS Adduct Search: TextType=form, Text=345.0969, Paramete

Now, negative mode

In [19]:
# Search settings for the negative-mode rows. This replaces the
# positive-mode settings held in the same variable.
ms_params = dict(
    adducts=["[M-H]-", "[M+CH3COO]-"],
    tolerance=2,  # mDa
    ppm=False,
    logp=None,
    halogens=True,
    verbose=False,
    charge='-',
    models=[],
)

In [20]:
# Select the negative-mode rows once, then take both the row labels and the
# masses (as strings, for the search's `text` argument) from that selection.
negative_rows = mb_df.loc[mb_df.Mode == 'Negative']
ids = negative_rows.index
masses = list(map(str, negative_rows.Mass.values))
len(masses)

47

In [21]:
# Run one adduct search per negative-mode mass and add the results to the
# existing hits_dict, which must already have been created by the
# positive-mode search cell.
for row_label, mass_text in zip(ids, masses):
    search_hits = ms_adduct_search(
        db=db,
        core_db=core_db,
        keggdb=kegg_models_db,
        text=mass_text,
        text_type='form',
        ms_params=ms_params,
    )
    hits_dict[row_label] = search_hits
    print(row_label, mass_text, len(search_hits))

<MS Adduct Search: TextType=form, Text=450.0204, Parameters={'adducts': ['[M-H]-', '[M+CH3COO]-'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '-', 'models': []}>
110 450.0204 244
<MS Adduct Search: TextType=form, Text=145.06186, Parameters={'adducts': ['[M-H]-', '[M+CH3COO]-'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '-', 'models': []}>
115 145.06186 290
<MS Adduct Search: TextType=form, Text=203.0826, Parameters={'adducts': ['[M-H]-', '[M+CH3COO]-'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '-', 'models': []}>
117 203.0826 591
<MS Adduct Search: TextType=form, Text=118.05096, Parameters={'adducts': ['[M-H]-', '[M+CH3COO]-'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '-', 'models': []}>
118 118.05096 232
<MS Adduct Search: TextType=form, Text=132.03023, Parameters={'adducts': ['[M-H]-', '[M+CH3C

558 178.0509 704
<MS Adduct Search: TextType=form, Text=465.3216, Parameters={'adducts': ['[M-H]-', '[M+CH3COO]-'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '-', 'models': []}>
559 465.3216 1808
<MS Adduct Search: TextType=form, Text=123.0451, Parameters={'adducts': ['[M-H]-', '[M+CH3COO]-'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '-', 'models': []}>
570 123.0451 139
<MS Adduct Search: TextType=form, Text=205.0362, Parameters={'adducts': ['[M-H]-', '[M+CH3COO]-'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '-', 'models': []}>
574 205.0362 1103
<MS Adduct Search: TextType=form, Text=381.182, Parameters={'adducts': ['[M-H]-', '[M+CH3COO]-'], 'tolerance': 2, 'ppm': False, 'logp': None, 'halogens': True, 'verbose': False, 'charge': '-', 'models': []}>
588 381.182 6327
<MS Adduct Search: TextType=form, Text=304.9778, Parameters={'adducts': ['[M-H

### Calculate Statistics from Search Output

In [22]:
# Per-row hit counts, in hits_dict insertion order.
num_hits = [len(row_hits) for row_hits in hits_dict.values()]

# A row counts as annotated when its search returned at least one hit.
n_annotated = sum(1 for count in num_hits if count > 0)

# Print "<row label> <hit count>" for every searched row.
for cpd_id, count in zip(hits_dict, num_hits):
    print(cpd_id, count)

0 15
1 55
2 353
3 15
4 108
5 101
6 183
7 15
8 131
9 253
10 1142
11 86
12 5
13 298
14 24
15 715
16 72
17 79
18 127
19 865
20 911
21 64
22 31
23 67
24 664
25 57
26 684
27 858
28 187
29 96
30 102
31 684
32 362
33 692
34 575
35 318
36 234
37 528
38 495
39 450
40 1875
41 65
42 365
43 62
44 394
45 55
46 1510
47 594
48 394
49 179
50 269
51 273
52 124
53 460
54 99
55 546
56 160
57 250
58 38
59 87
60 103
61 498
62 133
64 971
65 2064
66 190
67 890
68 1628
69 511
70 857
71 581
72 557
73 2110
74 575
76 322
77 553
78 1166
79 993
80 635
81 67
82 1952
83 918
84 616
85 1893
86 1588
87 234
88 1026
89 429
90 222
91 1121
93 800
95 4852
96 396
97 522
98 1386
99 1575
100 573
102 3536
103 2048
104 935
105 666
106 138
107 340
108 138
109 675
111 36
112 581
113 520
114 358
116 743
119 514
122 112
124 675
125 68
126 742
127 3759
128 252
129 731
132 675
133 236
134 263
135 3759
137 497
138 236
139 75
140 3961
141 675
142 33
143 14
144 3
145 44
146 41
147 55
148 10
149 6
150 17
151 17
152 28
153 3
154 1602
155 2

In [23]:
# Number of searched rows with at least one hit.
n_annotated

634

In [24]:
# Fraction of the filtered test set with at least one hit. The denominator
# is taken from the DataFrame rather than hard-coded so it stays correct if
# the test set or the mass cutoff changes.
n_annotated / len(mb_df)

1.0

In [25]:
# Median number of hits per searched row.
np.median(num_hits)

536.5

In [26]:
# Mean number of hits per searched row.
np.mean(num_hits)

1082.4069400630915

### Check for Correct Annotation with InChIKey

In [27]:
# Count the rows for which at least one hit shares the expected compound's
# InChIKey prefix. Only the text before the first '-' is compared, so the
# remaining blocks of the key are ignored.
n_correct_annotation = 0
for cpd_id, hits in hits_dict.items():
    expected_prefix = mb_df.at[cpd_id, 'InChI_Key'].split('-')[0]
    # any() stops at the first matching hit, so each row is counted at most once.
    if any(hit['Inchikey'].split('-')[0] == expected_prefix for hit in hits):
        n_correct_annotation += 1

In [28]:
# Number of rows whose hits include a compound with a matching InChIKey prefix.
n_correct_annotation

461

In [29]:
# Fraction of the filtered test set whose hits include a compound with a
# matching InChIKey prefix. The denominator is taken from the DataFrame
# rather than hard-coded so it stays correct if the test set or the mass
# cutoff changes.
n_correct_annotation / len(mb_df)

0.7271293375394322

In [32]:
# Count the distinct "Formula" values among core-database compounds whose
# "MINES" field matches this MINE.
# NOTE(review): the literal repeats the database name used in the connection
# cell -- keep the two in sync if the MINE is changed.
len(core_db.compounds.distinct("Formula", {"MINES": "kegg_lte600_500mcy_2_1_0"}))

220934