# Calculate Sauer Paper Metabolomics Coverage for KEGG
Source paper: https://www.nature.com/articles/nmeth.4103#Sec25

### Imports

In [1]:
import numpy as np
import pandas as pd
from minedatabase.metabolomics import Peak
from rdkit.Chem.AllChem import MolFromSmiles 
from rdkit.Chem.rdMolDescriptors import CalcMolFormula

### Read Credentials for MongoDB

In [2]:
# Parse `key=value` lines from the credentials file: line 1 holds the
# username, line 2 the password.
# NOTE(review): neither value is referenced by the cells visible in this
# notebook section -- confirm they are still needed.
with open('./../credentials.txt', 'r') as infile:
    lines = infile.readlines()
    # Split on the first '=' only, so a value that itself contains '='
    # (common in generated passwords) is not truncated.
    username = lines[0].strip().split('=', 1)[1]
    password = lines[1].strip().split('=', 1)[1]

### Read Sauer Test Data File

In [3]:
# Load the Sauer E. coli ion list. The file's own header row (row 0) is
# discarded and the columns are renamed to 'm/z' and 'Annotation'.
sauer_filepath = './../Data/sauer_ecoli_ions.csv'

mb_df = pd.read_csv(
    sauer_filepath,
    header=0,
    names=['m/z', 'Annotation'],
)

In [4]:
mb_df.head()

Unnamed: 0,m/z,Annotation
0,50.0043,
1,50.2807,
2,50.4272,
3,50.4732,
4,50.5736,


In [5]:
mb_df.tail()

Unnamed: 0,m/z,Annotation
4715,998.513,
4716,999.0224,
4717,999.1048,
4718,999.4623,
4719,999.5823,


In [6]:
# Keep only ions with m/z at or below 600 (original row labels are preserved).
mb_df = mb_df[mb_df['m/z'].le(600)]

In [7]:
mb_df.tail()

Unnamed: 0,m/z,Annotation
3094,598.756,
3095,598.9984,
3096,599.2636,
3097,599.4022,
3098,599.4894,


In [8]:
len(mb_df)

3099

## Load KEGG Dataset

In [9]:
# KEGG compound table; the search below reads its `ID` and
# `Monoisotopic_Mass` columns.
kegg_filepath = './../kegg_mass.csv'

kegg_df = pd.read_csv(filepath_or_buffer=kegg_filepath)

In [10]:
kegg_df.head()

Unnamed: 0,ID,SMILES,Monoisotopic_Mass,InChI_Key
0,C00001,O,18.010565,XLYOFNOQVPJJNP-UHFFFAOYSA-N
1,C00002,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,506.995745,ZKHQWZAMYRWXGA-KQYNXXCUSA-N
2,C00003,NC(=O)c1ccc[n+]([C@@H]2O[C@H](COP(=O)(O)OP(=O)...,664.116398,BAWFJGJZGIEFAR-NNYOXOHSSA-O
3,C00004,NC(=O)C1=CN([C@@H]2O[C@H](COP(=O)(O)OP(=O)(O)O...,665.124772,BOPGDPNILDQYTO-NNYOXOHSSA-N
4,C00005,NC(=O)C1=CN([C@@H]2O[C@H](COP(=O)(O)OP(=O)(O)O...,745.091102,ACFIXJIJDZMPPO-NNYOXOHSSA-N


In [11]:
kegg_df.tail()

Unnamed: 0,ID,SMILES,Monoisotopic_Mass,InChI_Key
18617,C22171,Nc1nc2c(ncn2[C@@H]2O[C@H](COP(=O)(O)OP(=O)(O)O...,531.040374,UBONPDGCIQEDRL-QWEIRQIHSA-N
18618,C22172,O=C(O)[C@H](O)COP(=O)(O)OC[C@@H](O)[C@@H](O)[C...,531.089024,BJMFZLLJDXJPEK-YJNKXOJESA-N
18619,C22173,CC[C@H](CO)[C@H](N)C(=O)O,147.089543,FBQPPRTWSNHYNZ-UHNVWZDZSA-N
18620,C22174,C[C@H](O)[C@H](CO)[C@H](N)C(=O)O,163.084458,UBLKCZXWFBWGNA-YUPRTTJUSA-N
18621,C22175,COc1cc([C@H]2OC[C@H]3[C@@H]2CO[C@]3(O)c2ccc(O)...,374.136553,JGWZMWCBIAYEIJ-AFHBHXEDSA-N


### Search KEGG for Masses in Sauer Test Data File

In [12]:
def ms_adduct_search(peak, ms_params, adducts, kegg_df):
    """Collect the KEGG IDs whose mass matches a peak under any adduct.

    For every adduct the observed m/z is converted to a candidate mass,
    ``(peak.mz - adduct[2]) / adduct[1]``, and ``search_kegg`` is queried for
    compounds inside the tolerance window centred on that mass.

    Args:
        peak: object exposing the observed m/z as ``peak.mz``.
        ms_params: dict; only ``"ppm"`` and ``"tolerance"`` are read here.
            If ``ms_params["ppm"]`` is truthy, the tolerance is taken as
            parts-per-million of each candidate mass; otherwise as mDa.
        adducts: sequence of tuples indexed as ``(name, divisor, shift)``;
            only positions 1 and 2 are used.
        kegg_df: table handed unchanged to ``search_kegg``.

    Returns:
        set: union of the IDs returned by ``search_kegg`` over all adducts.
    """
    potential_masses = [(peak.mz - adduct[2]) / adduct[1] for adduct in adducts]

    if ms_params["ppm"]:
        # ppm is relative, so each candidate mass gets its own window width.
        # (The previous code multiplied a Python list by a float, which raises
        # TypeError, and scaled by 1e5 instead of 1e6.)
        ppm_fraction = ms_params["tolerance"] / 1e6
        precisions = [abs(pm) * ppm_fraction for pm in potential_masses]
    else:
        # Absolute tolerance: convert mDa to Da, identical for every adduct.
        precisions = [ms_params["tolerance"] * 0.001] * len(potential_masses)

    # Search the table once per adduct mass window and merge the hits.
    ids = set()
    for pm, precision in zip(potential_masses, precisions):
        ids.update(search_kegg(kegg_df, pm - precision, pm + precision))

    return ids

In [13]:
def search_kegg(kegg_df, lower_bounds, upper_bounds):
    """Return the `ID` values of rows whose `Monoisotopic_Mass` lies in
    the closed interval [lower_bounds, upper_bounds].

    Args:
        kegg_df: DataFrame with `ID` and `Monoisotopic_Mass` columns.
        lower_bounds: inclusive lower mass limit.
        upper_bounds: inclusive upper mass limit.

    Returns:
        list: matching IDs, in table order.
    """
    # Series.between is inclusive on both ends by default.
    in_window = kegg_df["Monoisotopic_Mass"].between(lower_bounds, upper_bounds)
    matches = kegg_df[in_window]
    return list(matches["ID"])

In [14]:
# Each adduct is (name, divisor, shift); ms_adduct_search computes the
# candidate mass as (m/z - shift) / divisor.
adducts = [
    ("[M-H]-", 1, -1.007276),
    ("[M+F]-", 1, 18.99895174316),
]

# Only 'tolerance' and 'ppm' are read by ms_adduct_search in this notebook.
# NOTE(review): the remaining keys presumably mirror the MINE search
# parameters -- confirm whether they are still needed.
ms_params = dict(
    adducts=[adduct[0] for adduct in adducts],
    tolerance=2,  # mDa
    ppm=False,
    halogens=True,
    verbose=False,
    charge='-',
    models=[],
)

In [15]:
# Peak identifiers are the DataFrame row labels; the masses are kept as
# strings here and converted back to float when each Peak is built.
ids = mb_df.index
masses = list(map(str, mb_df['m/z'].values))
len(masses)

3099

In [16]:
# Run the adduct search for every ion: `peaks` keeps the Peak objects in
# input order and `hits_dict` maps each peak id to its set of KEGG IDs.
hits_dict, peaks = {}, []
for cpd_id, mass in zip(ids, masses):
    peak = Peak(cpd_id, 0, float(mass), 0)
    peak_hits = ms_adduct_search(peak, ms_params, adducts, kegg_df)
    hits_dict[cpd_id] = peak_hits
    peaks.append(peak)
    # One progress line per peak: id, m/z, number of KEGG hits.
    print(cpd_id, mass, len(peak_hits))

0 50.0043 1
1 50.2807 0
2 50.4272 0
3 50.4732 0
4 50.5736 0
5 50.8674 0
6 50.9839 0
7 51.025 1
8 51.1619 0
9 51.4522 0
10 52.4869 0
11 53.0155 0
12 53.4657 0
13 53.4839 0
14 53.5257 0
15 53.6159 0
16 53.675 0
17 53.8255 0
18 53.9719 0
19 53.9967 0
20 54.1215 0
21 54.2731 0
22 54.4249 0
23 54.5548 0
24 55.0194 2
25 55.0485 0
26 55.1708 0
27 55.324 0
28 55.4756 0
29 55.5366 0
30 55.6275 0
31 55.7091 0
32 55.7788 0
33 55.8642 0
34 55.9286 0
35 56.0139 0
36 56.0791 0
37 56.095 0
38 56.2314 0
39 56.3638 0
40 56.3821 0
41 56.4772 0
42 56.5348 0
43 56.8375 0
44 56.9615 0
45 56.9992 1
46 57.035 6
47 57.0612 0
48 57.3038 0
49 57.457 0
50 57.5135 0
51 57.609 0
52 57.7595 0
53 57.9778 0
54 58.0322 0
55 58.5266 0
56 58.9934 1
57 59.0148 2
58 59.1423 0
59 59.587 0
60 59.6208 0
61 59.7771 0
62 60.0181 0
63 60.0439 1
64 60.1057 0
65 60.9881 1
66 61.3256 0
67 61.8082 0
68 61.9894 1
69 62.2319 0
70 62.3649 0
71 62.3898 0
72 62.4532 0
73 62.6096 0
74 62.7763 0
75 62.8543 0
76 63.0957 0
77 63.3973 0
78 6

620 157.2706 0
621 157.9392 0
622 157.9631 1
623 158.0179 0
624 158.0819 5
625 158.1212 0
626 158.937 0
627 159.0775 6
628 159.2595 0
629 159.303 0
630 159.4122 0
631 159.9355 0
632 159.9754 0
633 160.01 1
634 160.0619 9
635 160.2156 0
636 160.891 1
637 160.9374 0
638 161.0463 20
639 161.3077 0
640 161.3515 0
641 161.3935 0
642 161.9394 0
643 161.9724 0
644 162.0488 0
645 162.0728 2
646 162.1418 1
647 162.3998 0
648 162.938 0
649 162.9534 1
650 162.9812 0
651 163.0407 19
652 163.0604 24
653 163.1455 0
654 164.0227 3
655 164.0397 3
656 164.0716 16
657 164.2862 0
658 164.6137 0
659 164.957 0
660 164.9973 3
661 165.0181 6
662 165.057 22
663 165.0734 0
664 165.1666 0
665 165.9617 0
666 166.0519 19
667 166.0754 1
668 166.3486 0
669 166.9221 1
670 166.96 0
671 166.9949 1
672 167.0419 0
673 167.0824 4
674 167.1408 0
675 167.193 0
676 167.3917 0
677 167.9637 1
678 168.0275 2
679 168.186 0
680 168.9532 0
681 168.9914 4
682 169.0274 0
683 169.0621 1
684 169.0967 1
685 169.1761 0
686 169.9931 0
6

1205 253.0828 6
1206 253.1154 0
1207 253.1429 0
1208 253.2175 2
1209 253.53 0
1210 253.952 0
1211 253.988 0
1212 254.023 0
1213 254.0804 4
1214 254.1479 0
1215 254.2213 0
1216 254.2792 0
1217 254.7002 0
1218 254.8812 0
1219 254.9185 0
1220 254.9799 1
1221 255.0351 1
1222 255.101 6
1223 255.2338 1
1224 255.4314 0
1225 255.4867 0
1226 255.7691 0
1227 256.0567 2
1228 256.0898 1
1229 256.1244 0
1230 256.2369 0
1231 256.8863 0
1232 256.9215 0
1233 257.0365 0
1234 257.0752 0
1235 257.1113 0
1236 257.171 0
1237 257.2145 0
1238 257.2385 0
1239 257.3056 0
1240 257.3533 0
1241 257.8933 0
1242 257.9398 0
1243 258.0229 0
1244 258.0652 0
1245 258.1055 3
1246 258.1384 0
1247 258.1774 0
1248 258.2386 0
1249 258.8898 0
1250 258.9142 0
1251 259.0233 41
1252 259.0798 0
1253 259.1293 1
1254 259.2436 3
1255 259.9302 0
1256 260.0283 0
1257 260.088 0
1258 260.1272 0
1259 260.1894 0
1260 260.8879 0
1261 260.9777 1
1262 261.0582 6
1263 261.0887 3
1264 261.2432 0
1265 261.5161 0
1266 261.9236 0
1267 261.9816 0

1772 348.0344 0
1773 348.0881 2
1774 348.1351 2
1775 348.211 0
1776 348.2689 0
1777 348.8902 0
1778 348.9311 0
1779 348.9822 0
1780 349.0871 2
1781 349.1495 1
1782 349.2735 0
1783 349.5367 0
1784 349.9865 0
1785 350.0422 0
1786 350.082 1
1787 350.8033 0
1788 350.9176 0
1789 351.0007 0
1790 351.059 0
1791 351.0916 2
1792 351.131 1
1793 351.1744 2
1794 351.217 17
1795 351.3095 0
1796 351.9588 0
1797 352.09 0
1798 352.1611 3
1799 352.8583 0
1800 353.0435 1
1801 353.1648 1
1802 353.9498 0
1803 354.0335 0
1804 354.1082 2
1805 354.5453 0
1806 354.8351 0
1807 354.9309 0
1808 355.0513 1
1809 355.1127 2
1810 355.1548 4
1811 355.1967 0
1812 355.543 0
1813 355.9897 0
1814 356.0144 0
1815 356.1222 0
1816 356.1478 0
1817 356.3884 0
1818 356.5319 0
1819 356.7246 0
1820 356.8381 8
1821 356.8933 0
1822 356.9185 0
1823 356.9858 0
1824 357.0576 0
1825 357.1261 1
1826 357.1716 4
1827 357.213 0
1828 357.5253 0
1829 357.6318 0
1830 357.8897 0
1831 357.9323 0
1832 358.1259 1
1833 358.8636 0
1834 358.9453 0


2327 452.2781 0
2328 452.7736 0
2329 452.9195 0
2330 453.0472 0
2331 453.1216 5
2332 453.1933 1
2333 453.276 1
2334 453.3962 1
2335 453.5009 0
2336 453.9538 0
2337 454.4002 0
2338 454.4549 0
2339 454.7857 0
2340 454.8822 0
2341 455.0116 0
2342 455.0496 0
2343 455.1032 0
2344 455.4006 0
2345 455.5677 0
2346 455.9499 0
2347 456.0523 0
2348 456.1063 0
2349 456.1678 2
2350 456.2268 0
2351 456.7882 0
2352 456.8429 0
2353 456.9218 0
2354 457.0377 0
2355 457.8508 0
2356 457.9593 0
2357 458.0716 0
2358 458.1884 0
2359 458.3262 0
2360 458.8107 0
2361 458.9522 0
2362 459.0902 0
2363 459.1372 1
2364 459.3838 1
2365 459.84 0
2366 459.9642 0
2367 460.0336 0
2368 460.0965 0
2369 460.1412 0
2370 460.2583 0
2371 460.7924 0
2372 460.9131 0
2373 460.9431 0
2374 461.0667 0
2375 461.1439 0
2376 461.2647 0
2377 461.2998 0
2378 461.3485 0
2379 461.9637 0
2380 462.0685 1
2381 462.146 0
2382 462.2878 0
2383 462.3604 0
2384 462.8005 0
2385 462.9361 0
2386 463.0682 1
2387 463.1006 0
2388 463.2926 0
2389 463.827

2866 554.5639 0
2867 554.7389 0
2868 554.9057 0
2869 555.0648 0
2870 555.1166 0
2871 555.2013 0
2872 555.2724 0
2873 555.3315 0
2874 555.9122 0
2875 555.9998 0
2876 556.1039 1
2877 556.1693 0
2878 556.2728 0
2879 556.7471 0
2880 556.898 0
2881 556.9989 0
2882 557.0697 0
2883 557.1673 0
2884 557.2068 0
2885 557.2759 0
2886 557.3353 0
2887 557.5671 0
2888 557.7969 0
2889 557.9107 0
2890 558.0638 3
2891 558.2742 0
2892 558.7646 0
2893 558.8763 0
2894 559.0436 0
2895 559.23 0
2896 559.7814 0
2897 559.9386 0
2898 560.0478 0
2899 560.2641 0
2900 560.7456 0
2901 560.9211 0
2902 561.0192 0
2903 561.0707 0
2904 561.248 0
2905 561.7696 0
2906 562.0116 0
2907 562.2566 0
2908 562.76 0
2909 563.0197 0
2910 563.06 0
2911 563.1511 0
2912 563.2496 0
2913 563.4973 0
2914 563.9969 0
2915 564.145 0
2916 564.3055 0
2917 564.7705 0
2918 564.8957 0
2919 565.0511 0
2920 565.1659 0
2921 565.2286 0
2922 565.2568 0
2923 565.315 1
2924 565.5548 0
2925 565.9307 0
2926 566.0553 0
2927 566.168 0
2928 566.2625 0
292

### Calculate Statistics from Search Output

In [22]:
# Tally search coverage:
#   n_annotated             - peaks with at least one KEGG hit
#   n_unannotated_annotated - of those, peaks with no annotation in the
#                             input file (missing value in `Annotation`)
#   num_hits                - number of KEGG hits for every peak, in order
n_annotated = 0
n_unannotated_annotated = 0
num_hits = []

for peak, annotation in zip(peaks, mb_df.Annotation.values):
    hits = hits_dict[peak.name]
    num_hits.append(len(hits))
    if not hits:
        continue
    n_annotated += 1
    # pd.isna is False for strings and True for NaN/None, so it replaces the
    # isinstance + np.isnan pair without raising on non-float missing values.
    if pd.isna(annotation):
        n_unannotated_annotated += 1

In [23]:
n_annotated

775

In [24]:
# Fraction of all searched peaks with at least one KEGG hit. The denominator
# is taken from the data instead of a hard-coded 3099.
n_annotated / len(mb_df)

0.250080671184253

In [26]:
n_unannotated_annotated

392

In [25]:
# Fraction of peaks without an input annotation that received a KEGG hit.
# The denominator (previously a hard-coded 2402) is computed as the number of
# peaks whose `Annotation` is missing.
# NOTE(review): confirm this count reproduces 2402 on the current data.
n_unannotated_annotated / int(mb_df['Annotation'].isna().sum())

0.16319733555370525

In [27]:
(n_annotated - n_unannotated_annotated)

383

**TODO:** Could this coverage be lower now because we aren't using all rules?

In [28]:
np.median(num_hits)

0.0

In [29]:
np.mean(num_hits)

0.8422071636011617