# Calculate Sauer Paper Metabolomics Coverage for PubChem
https://www.nature.com/articles/nmeth.4103#Sec25

### Imports

In [1]:
import numpy as np
import pandas as pd
import pymongo
import matplotlib.pyplot as plt
import requests
from xml.etree import ElementTree


from minedatabase.metabolomics import MetabolomicsDataset, Peak

### Read Credentials for MongoDB

In [2]:
# Load the MongoDB username and password from a local credentials file.
# The first line supplies the username and the second the password, each
# written as `key=value`.
with open('./../credentials.txt', 'r') as infile:
    lines = infile.readlines()
    # Split on the first '=' only, so a value that itself contains '='
    # is kept whole instead of being cut off at the second '='.
    username = lines[0].strip().split('=', 1)[1]
    password = lines[1].strip().split('=', 1)[1]

### Read Sauer Test Data File

In [3]:
# Path to the Sauer E. coli ion list (CSV).
sauer_filepath = './../Data/sauer_ecoli_ions.csv'

# The file's first row is consumed as a header and its labels are replaced
# by the column names given here.
mb_df = pd.read_csv(
    sauer_filepath,
    header=0,
    names=['m/z', 'Annotation'],
)

In [4]:
# Show the first rows of the loaded peak table.
mb_df.head()

Unnamed: 0,m/z,Annotation
0,50.0043,
1,50.2807,
2,50.4272,
3,50.4732,
4,50.5736,


In [5]:
# Show the last rows of the loaded peak table.
mb_df.tail()

Unnamed: 0,m/z,Annotation
4715,998.513,
4716,999.0224,
4717,999.1048,
4718,999.4623,
4719,999.5823,


In [6]:
# Keep only the peaks whose m/z is at most 600.
mb_df = mb_df[mb_df['m/z'].le(600)]

In [7]:
# Show the last rows again to confirm the m/z <= 600 filter took effect.
mb_df.tail()

Unnamed: 0,m/z,Annotation
3094,598.756,
3095,598.9984,
3096,599.2636,
3097,599.4022,
3098,599.4894,


In [8]:
# Number of peaks remaining after the m/z filter.
len(mb_df)

3099

### Search PubChem for Masses in Sauer Test Data File

In [9]:
def ms_adduct_search(peak, ms_params, adducts):
    """Collect PubChem compound IDs whose exact mass could explain a peak.

    For every adduct the candidate neutral mass is computed as
    ``(peak.mz - adduct[2]) / adduct[1]`` and ``search_pubchem`` is queried
    for the window ``[mass - precision, mass + precision]``.

    Args:
        peak: Object exposing the observed m/z as ``peak.mz``.
        ms_params: Dict providing ``"tolerance"`` (number) and ``"ppm"``.
            When ``"ppm"`` is truthy the tolerance is interpreted as parts
            per million of each candidate mass; otherwise it is in mDa.
        adducts: Sequence of ``(name, mass multiplier, mass offset)`` tuples.

    Returns:
        set: The ID strings found for any adduct (union over all adducts).

    Raises:
        ValueError: If a search response contains no ``IdList`` element.
    """
    tolerance = ms_params["tolerance"]
    use_ppm = ms_params["ppm"]
    ids = set()

    for adduct in adducts:
        neutral_mass = (peak.mz - adduct[2]) / adduct[1]

        if use_ppm:
            # The window scales with each candidate mass. The previous code
            # multiplied a float by the whole list of masses (a TypeError)
            # and divided by 100000, which is ten times wider than ppm.
            precision = neutral_mass * tolerance / 1e6
        else:
            precision = tolerance * 0.001  # mDa -> Da

        response = search_pubchem(neutral_mass - precision,
                                  neutral_mass + precision)
        tree = ElementTree.fromstring(response.content)

        id_list = tree.find('IdList')
        if id_list is None:
            # Fail loudly: silently treating an error payload as "no hits"
            # would bias the coverage statistics downwards.
            raise ValueError(
                "search response for adduct %r has no IdList element"
                % (adduct[0],)
            )

        for id_node in id_list.iter('Id'):
            if id_node.text is not None:
                ids.add(id_node.text.strip())

    return ids

In [10]:
def search_pubchem(lower, upper, timeout=60):
    """Query NCBI ESearch (db=pccompound) for an exact-mass range.

    Args:
        lower: Lower bound of the exact-mass range.
        upper: Upper bound of the exact-mass range.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a single stalled request would hang the whole loop.

    Returns:
        requests.Response: The HTTP response; its body is the search result.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no answer arrives within ``timeout`` seconds.
    """
    # NOTE(review): confirm that retmax=100000 is still within the limit the
    # server honours; if it caps the returned IDs, hit counts are truncated.
    url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pccompound&retmax=100000&term={lower}:{upper}[exactmass]"
    response = requests.get(url, timeout=timeout)
    # Surface throttling / server errors here rather than as a confusing
    # parse failure further downstream.
    response.raise_for_status()
    return response

In [11]:
# Each adduct is (name, mass multiplier, mass offset): ms_adduct_search
# subtracts the offset from the observed m/z and divides by the multiplier.
adducts = [
    ("[M-H]-", 1, -1.007276),
    ("[M+F]-", 1, 18.99895174316),
]

# Search settings; the adduct names are taken from the list above so the
# two cannot drift apart.
ms_params = dict(
    adducts=[adduct[0] for adduct in adducts],
    tolerance=2,  # mDa
    ppm=False,
    halogens=True,
    verbose=False,
    charge='-',
    models=[],
)

In [12]:
# Peak identifiers are the table's index labels; the m/z values are kept
# as strings (they are converted back to float when each Peak is built).
ids = mb_df.index
masses = list(map(str, mb_df['m/z'].values))
len(masses)

3099

In [15]:
import time

In [16]:
# Query PubChem once per peak, recording the Peak objects in order and the
# set of hit IDs per peak identifier.
peaks, hits_dict = [], {}
for cpd_id, mass in zip(ids, masses):
    time.sleep(1)  # so api doesn't throttle us
    peak = Peak(cpd_id, 0, float(mass), 0)
    hits = ms_adduct_search(peak, ms_params, adducts)
    hits_dict[cpd_id] = hits
    peaks.append(peak)
    print(cpd_id, mass, len(hits))

0 50.0043 10
1 50.2807 0
2 50.4272 0
3 50.4732 0
4 50.5736 0
5 50.8674 0
6 50.9839 8
7 51.025 50
8 51.1619 0
9 51.4522 0
10 52.4869 0
11 53.0155 21
12 53.4657 0
13 53.4839 0
14 53.5257 0
15 53.6159 0
16 53.675 0
17 53.8255 0
18 53.9719 4
19 53.9967 12
20 54.1215 0
21 54.2731 0
22 54.4249 0
23 54.5548 0
24 55.0194 56
25 55.0485 12
26 55.1708 0
27 55.324 0
28 55.4756 0
29 55.5366 0
30 55.6275 0
31 55.7091 0
32 55.7788 0
33 55.8642 0
34 55.9286 4
35 56.0139 39
36 56.0791 6
37 56.095 1
38 56.2314 0
39 56.3638 0
40 56.3821 0
41 56.4772 0
42 56.5348 0
43 56.8375 0
44 56.9615 3
45 56.9992 26
46 57.035 52
47 57.0612 3
48 57.3038 0
49 57.457 0
50 57.5135 0
51 57.609 0
52 57.7595 0
53 57.9778 4
54 58.0322 5
55 58.5266 0
56 58.9934 38
57 59.0148 54
58 59.1423 0
59 59.587 0
60 59.6208 0
61 59.7771 0
62 60.0181 24
63 60.0439 52
64 60.1057 0
65 60.9881 17
66 61.3256 0
67 61.8082 0
68 61.9894 21
69 62.2319 0
70 62.3649 0
71 62.3898 0
72 62.4532 0
73 62.6096 0
74 62.7763 0
75 62.8543 0
76 63.0957 3
77

539 144.9222 43
540 144.9629 290
541 144.9902 472
542 145.0139 974
543 145.0588 576
544 145.0975 2722
545 145.1998 0
546 145.2452 0
547 145.2944 0
548 145.3406 0
549 145.4382 0
550 146.0027 728
551 146.0203 673
552 146.0461 2107
553 146.0673 229
554 146.6944 0
555 146.9592 224
556 146.9842 333
557 147.03 1821
558 147.0446 3713
559 147.0657 4844
560 147.6373 0
561 147.9983 586
562 148.0435 2482
563 148.0679 568
564 148.1335 139
565 148.2422 0
566 148.9401 82
567 148.9819 271
568 149.03 894
569 149.0424 2151
570 149.0662 602
571 150.0188 665
572 150.0537 841
573 150.192 1
574 150.9974 1066
575 151.0258 1080
576 151.083 246
577 151.1123 9390
578 151.1739 11
579 152.0356 1848
580 152.1334 512
581 152.2141 0
582 152.6889 0
583 152.9396 64
584 152.9609 213
585 152.9909 302
586 153.0187 1231
587 153.042 1456
588 153.277 0
589 153.3227 0
590 153.3617 0
591 153.4094 0
592 153.5342 0
593 153.668 0
594 153.9933 333
595 154.0222 378
596 154.061 3380
597 154.2942 0
598 154.8751 13
599 154.9303 133


1020 225.123 12012
1021 225.1501 13763
1022 225.1858 5422
1023 225.9313 347
1024 225.9898 1369
1025 226.0122 1585
1026 226.0407 2737
1027 226.0714 8062
1028 226.1186 5670
1029 226.1892 35
1030 226.2456 11
1031 226.9601 613
1032 226.9919 1077
1033 227.0174 3737
1034 227.1049 9268
1035 227.1551 25788
1036 227.2024 3331
1037 227.2892 0
1038 227.3848 0
1039 227.9432 477
1040 227.9889 1239
1041 228.0583 8881
1042 228.0871 12511
1043 228.164 3251
1044 228.2058 26
1045 228.2664 1
1046 228.8635 49
1047 228.9352 320
1048 228.976 1001
1049 229.1128 7722
1050 229.1452 27170
1051 229.1775 562
1052 229.2032 619
1053 229.2596 2
1054 229.3291 1
1055 229.9231 162
1056 229.9806 1522
1057 230.0406 8649
1058 230.0604 6668
1059 230.1055 8439
1060 230.1795 1717
1061 230.4053 0
1062 230.9632 864
1063 230.9849 1122
1064 231.0304 2721
1065 231.0988 12240
1066 231.1321 13988
1067 231.1583 3344
1068 231.242 14
1069 231.9391 371
1070 232.0006 1990
1071 232.025 1584
1072 232.0588 5930
1073 232.0872 9694
1074 232.

1462 294.8387 69
1463 294.8959 352
1464 294.9414 801
1465 294.9914 4182
1466 295.1032 21610
1467 295.1864 7322
1468 295.2651 1654
1469 295.3052 0
1470 295.978 1727
1471 296.0161 5781
1472 296.0846 16686
1473 296.1356 6080
1474 296.1794 38073
1475 296.8738 266
1476 296.9089 437
1477 297.0389 7945
1478 297.0877 17039
1479 297.1187 20559
1480 297.242 1664
1481 297.2768 160
1482 297.3335 0
1483 297.8885 71
1484 297.9818 1069
1485 298.0409 6521
1486 298.0876 19522
1487 298.1354 23006
1488 298.1718 11965
1489 298.2377 1938
1490 298.8789 180
1491 298.9312 854
1492 299.0252 4160
1493 299.1245 19726
1494 299.8789 37
1495 299.9324 792
1496 300.007 3500
1497 300.0578 8393
1498 300.1167 36657
1499 300.256 1440
1500 300.8992 147
1501 301.017 3213
1502 301.0947 9867
1503 301.2393 5315
1504 301.5491 0
1505 301.6081 0
1506 301.9265 467
1507 302.0165 5358
1508 302.0668 9562
1509 302.2316 492
1510 302.5375 0
1511 302.8778 145
1512 302.9503 959
1513 303.0001 3545
1514 303.0714 14843
1515 303.1262 30721
1

1903 372.8142 69
1904 372.9013 535
1905 373.0546 8058
1906 373.1697 26683
1907 373.5269 0
1908 373.9853 3313
1909 374.0729 9322
1910 374.1141 16461
1911 374.1583 6517
1912 374.8386 129
1913 374.9213 909
1914 375.055 5188
1915 375.1821 32282
1916 375.2804 1469
1917 375.9326 1359
1918 376.0437 4603
1919 376.1037 10947
1920 376.1467 16716
1921 376.3057 120
1922 376.8156 41
1923 376.9477 1183
1924 376.9811 2794
1925 377.0397 4108
1926 377.087 8772
1927 377.1131 12795
1928 377.4284 8
1929 377.9005 497
1930 377.9677 1256
1931 378.0441 3810
1932 378.0912 16511
1933 378.817 92
1934 378.9215 823
1935 378.9479 1487
1936 378.9856 2487
1937 379.0851 11159
1938 379.9761 1522
1939 380.0848 7347
1940 380.2342 15481
1941 380.5473 0
1942 380.8439 254
1943 380.9374 1097
1944 380.9838 1999
1945 381.0883 6875
1946 381.1775 7070
1947 381.32 95
1948 381.9985 1919
1949 382.0404 4899
1950 382.1294 8911
1951 382.5527 0
1952 382.884 318
1953 382.9832 2564
1954 383.0979 12421
1955 383.1869 23919
1956 383.5533 0


2348 456.1063 7289
2349 456.1678 7501
2350 456.2268 2433
2351 456.7882 46
2352 456.8429 78
2353 456.9218 445
2354 457.0377 1953
2355 457.8508 186
2356 457.9593 575
2357 458.0716 2984
2358 458.1884 9605
2359 458.3262 431
2360 458.8107 99
2361 458.9522 420
2362 459.0902 4202
2363 459.1372 8448
2364 459.3838 1525
2365 459.84 101
2366 459.9642 708
2367 460.0336 2659
2368 460.0965 6076
2369 460.1412 5377
2370 460.2583 1325
2371 460.7924 33
2372 460.9131 214
2373 460.9431 313
2374 461.0667 3651
2375 461.1439 7666
2376 461.2647 1697
2377 461.2998 377
2378 461.3485 303
2379 461.9637 547
2380 462.0685 4148
2381 462.146 7239
2382 462.2878 2714
2383 462.3604 494
2384 462.8005 60
2385 462.9361 293
2386 463.0682 6422
2387 463.1006 7328
2388 463.2926 853
2389 463.8274 38
2390 463.8829 109
2391 463.9087 149
2392 463.985 898
2393 464.0324 1827
2394 464.1004 5327
2395 464.1715 5416
2396 464.2913 2845
2397 464.3874 208
2398 464.8254 61
2399 464.9841 652
2400 465.0222 1608
2401 465.1021 5565
2402 465.171

2797 541.0832 1677
2798 541.1781 3518
2799 541.2983 1800
2800 541.3581 318
2801 541.8261 20
2802 541.9301 147
2803 542.0582 1148
2804 542.1615 3012
2805 542.238 3021
2806 542.7862 25
2807 542.9035 123
2808 543.068 1324
2809 543.1054 1977
2810 543.1416 2809
2811 543.2383 2283
2812 543.3423 650
2813 543.4266 87
2814 543.883 35
2815 543.9481 269
2816 544.0591 1038
2817 544.1636 2304
2818 544.2403 2416
2819 544.7731 14
2820 544.9675 480
2821 545.0363 980
2822 545.252 1904
2823 545.9596 210
2824 546.0205 491
2825 546.0726 1287
2826 546.129 2825
2827 546.2541 3222
2828 546.5787 2
2829 546.7773 42
2830 547.0741 1389
2831 547.9679 263
2832 548.0164 557
2833 548.0813 1519
2834 548.1667 3410
2835 548.233 3212
2836 548.7387 13
2837 548.7795 21
2838 548.9942 350
2839 549.0856 2053
2840 549.1647 3187
2841 549.3144 1182
2842 549.9559 225
2843 550.0989 1605
2844 550.2419 2291
2845 550.7261 8
2846 550.9741 403
2847 551.0122 511
2848 551.0998 2187
2849 551.3202 606
2850 551.877 64
2851 551.9849 300
285

### Calculate Statistics from Search Output

In [19]:
# Tally, over every searched peak:
#   n_annotated             - peaks with at least one PubChem hit
#   n_unannotated_annotated - of those, peaks with no annotation in the input
#   num_hits                - number of hits per peak (zeros included)
n_annotated = 0
n_unannotated_annotated = 0
num_hits = []

for peak, annotation in zip(peaks, mb_df.Annotation.values):
    hits = hits_dict[peak.name]
    num_hits.append(len(hits))
    if hits:
        n_annotated += 1
        # pd.isna is False for annotation strings and True for NaN/None, so
        # a missing value stored as None no longer raises as np.isnan did.
        if pd.isna(annotation):
            n_unannotated_annotated += 1

In [20]:
# Number of peaks with at least one PubChem hit.
n_annotated

2770

In [21]:
# Fraction of searched peaks with at least one hit. The denominator is the
# number of peaks actually tallied, rather than a hard-coded count that
# goes stale whenever the input file or the m/z filter changes.
n_annotated / len(num_hits)

0.8938367215230719

Open question: could the coverage be lower now because we aren't using all of the rules?

In [22]:
# Number of peaks that have at least one hit but no annotation in the input file.
n_unannotated_annotated

2079

In [23]:
# Fraction of un-annotated peaks that received at least one hit.
# NOTE(review): the denominator replaces a hard-coded 2402, presumably the
# number of peaks without an annotation -- confirm the computed count matches.
n_unannotated_annotated / mb_df['Annotation'].isna().sum()

0.8655287260616153

In [24]:
# Number of peaks that have at least one hit and also carry an annotation.
(n_annotated - n_unannotated_annotated)

691

In [25]:
# Median number of hits per peak (peaks with zero hits are included).
np.median(num_hits)

893.0

In [26]:
# Mean number of hits per peak (peaks with zero hits are included).
np.mean(num_hits)

3319.890609874153