# Calculate MassBank Dataset Coverage in Original KEGG Database

In [27]:
import numpy as np
import pandas as pd
from minedatabase.metabolomics import Peak
from rdkit.Chem.AllChem import MolFromSmiles 
from rdkit.Chem.rdMolDescriptors import CalcMolFormula

## Load MassBank Dataset

In [28]:
massbank_filepath = './../Data/MassBankTestSet.csv'

mb_df = pd.read_csv(massbank_filepath, delimiter='\t', names=['Name', 'Charge', 'Mass', 'Mode', 'None', 'Adduct', 'InChI_Key'])
mb_df = mb_df.drop(['None'], axis=1)

In [29]:
mb_df.head()

Unnamed: 0,Name,Charge,Mass,Mode,Adduct,InChI_Key
0,Metamitron-desamino,0,188.0818,Positive,[M+H]+,OUSYWCQYMPDAEO-UHFFFAOYSA-N
1,4-Isopropylaniline,0,136.1121,Positive,[M+H]+,LRTFPLFDLJYEKT-UHFFFAOYSA-N
2,Metolachlor morpholinone,0,234.1489,Positive,[M+H]+,DVBDYPDVNRJKNJ-UHFFFAOYSA-N
3,"2,6-Dichlorobenzamide",0,189.9821,Positive,[M+H]+,JHSPCUHPSIUQRB-UHFFFAOYSA-N
4,Amitraz,0,294.1965,Positive,[M+H]+,QXAITBQSYVNQDR-UHFFFAOYSA-N


In [30]:
mb_df.tail()

Unnamed: 0,Name,Charge,Mass,Mode,Adduct,InChI_Key
662,Robinetin trimethyl ether,0,345.0969,Positive,[M+H]+,NJNGYVOYOVPWBB-UHFFFAOYSA-N
663,"3-Hydroxy-3',4',5'-trimethoxyflavone",0,329.102,Positive,[M+H]+,MWFLTXAQDCOKEK-UHFFFAOYSA-N
664,Carbobenzoxy-L-asparagine,0,265.083,Negative,[M-H]-,FUCKRCGERFLLHP-SECBINFHSA-N
665,(S)-(-)-Perillic acid,0,165.0921,Negative,[M-H]-,CDSMSBUVCWHORP-MRVPVSSYSA-N
666,Kaempferide,0,299.0561,Negative,[M-H]-,SQFSKOYWJBQGKQ-UHFFFAOYSA-N


In [31]:
mb_df = mb_df.loc[mb_df.Mass <= 600]

In [32]:
len(mb_df)

634

## Load KEGG Dataset

In [33]:
kegg_filepath = './../Data/kegg_mass.csv'

kegg_df = pd.read_csv(kegg_filepath)

In [34]:
kegg_df.head()

Unnamed: 0,ID,SMILES,Monoisotopic_Mass,InChI_Key
0,C00001,O,18.010565,XLYOFNOQVPJJNP-UHFFFAOYSA-N
1,C00002,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,506.995745,ZKHQWZAMYRWXGA-KQYNXXCUSA-N
2,C00003,NC(=O)c1ccc[n+]([C@@H]2O[C@H](COP(=O)(O)OP(=O)...,664.116398,BAWFJGJZGIEFAR-NNYOXOHSSA-O
3,C00004,NC(=O)C1=CN([C@@H]2O[C@H](COP(=O)(O)OP(=O)(O)O...,665.124772,BOPGDPNILDQYTO-NNYOXOHSSA-N
4,C00005,NC(=O)C1=CN([C@@H]2O[C@H](COP(=O)(O)OP(=O)(O)O...,745.091102,ACFIXJIJDZMPPO-NNYOXOHSSA-N


In [35]:
kegg_df.tail()

Unnamed: 0,ID,SMILES,Monoisotopic_Mass,InChI_Key
18617,C22171,Nc1nc2c(ncn2[C@@H]2O[C@H](COP(=O)(O)OP(=O)(O)O...,531.040374,UBONPDGCIQEDRL-QWEIRQIHSA-N
18618,C22172,O=C(O)[C@H](O)COP(=O)(O)OC[C@@H](O)[C@@H](O)[C...,531.089024,BJMFZLLJDXJPEK-YJNKXOJESA-N
18619,C22173,CC[C@H](CO)[C@H](N)C(=O)O,147.089543,FBQPPRTWSNHYNZ-UHNVWZDZSA-N
18620,C22174,C[C@H](O)[C@H](CO)[C@H](N)C(=O)O,163.084458,UBLKCZXWFBWGNA-YUPRTTJUSA-N
18621,C22175,COc1cc([C@H]2OC[C@H]3[C@@H]2CO[C@]3(O)c2ccc(O)...,374.136553,JGWZMWCBIAYEIJ-AFHBHXEDSA-N


## Search for MassBank Compounds in KEGG Dataset

In [36]:
def ms_adduct_search(peak, ms_params, adducts, kegg_df):
    ids = set()
    
    potential_masses = [(peak.mz - adduct[2]) / adduct[1] for adduct in adducts]
    
    if ms_params["ppm"]:
        precision = (ms_params["tolerance"] / 100000.0) * potential_masses
    else:
        precision = ms_params["tolerance"] * 0.001  # convert to mDa
    upper_bounds = [pm + precision for pm in potential_masses]
    lower_bounds = [pm - precision for pm in potential_masses]

    # search database for hits in the each adducts mass range that have no
    # innate charge.
    mongo_ids = []
    for i, adduct in enumerate(adducts):
        ids = ids.union(search_kegg(kegg_df, lower_bounds[i], upper_bounds[i]))

    return ids

In [37]:
def search_kegg(kegg_df, lower_bounds, upper_bounds):
    sub_kegg_df = kegg_df.loc[(kegg_df.Monoisotopic_Mass >= lower_bounds) & (kegg_df.Monoisotopic_Mass <= upper_bounds)]
    return list(sub_kegg_df.ID)

In [38]:
adducts = [("[M+]+", 1, 0),
           ("[M+H]+", 1, 1.007276),
           ("[M+Na]+", 1, 22.989218)]
ms_params = {
    'adducts': ["[M+]+", "[M+H]+", "[M+Na]+"],
    'tolerance': 2,  # mDa
    'ppm': False,
    'halogens': True,
    'verbose': False,
    'charge': '+',
    'models': []
}

In [39]:
ids = mb_df.loc[mb_df.Mode == 'Positive'].index
masses = [str(val) for val in mb_df.loc[mb_df.Mode == 'Positive'].Mass.values]
len(masses)

587

In [40]:
hits_dict = {}
peaks = []
for cpd_id, mass in zip(ids, masses):
    peak = Peak(cpd_id, 0, float(mass), 0)
    hits_dict[cpd_id] = ms_adduct_search(peak, ms_params, adducts, kegg_df)
    peaks.append(peak)
    print(cpd_id, mass, len(hits_dict[cpd_id]))

0 188.0818 2
1 136.1121 4
2 234.1489 2
3 189.9821 1
4 294.1965 1
5 257.0478 0
6 336.1166 3
7 141.0771 4
8 216.0325 1
9 382.0816 3
10 411.1081 3
11 432.0642 2
12 134.0713 6
13 322.0777 1
14 214.1121 3
15 174.0541 25
16 219.0086 0
17 201.0692 2
18 193.1335 1
19 188.0697 14
20 215.0703 10
21 215.0849 1
22 125.0379 0
23 330.137 2
24 208.0968 6
25 226.1662 6
26 230.1167 6
27 202.0854 14
28 274.1186 1
29 302.1057 0
30 153.1022 2
31 230.1167 6
32 166.1226 5
33 241.0963 8
34 453.1187 1
35 482.1756 1
36 240.115 8
37 441.167 1
38 515.2442 1
39 559.2603 1
40 304.1543 5
41 197.084 2
42 231.1104 3
43 177.1386 0
44 278.1903 3
45 136.1121 4
46 290.1387 8
47 272.2009 2
48 278.1903 3
49 238.0993 1
50 178.1226 2
51 310.2165 2
52 150.1277 3
53 327.1371 2
54 256.1696 1
55 240.1594 3
56 370.1795 4
57 308.0507 2
58 557.3044 0
59 205.0794 2
60 295.2169 0
61 330.1096 10
62 408.1254 0
64 358.20129 2
65 271.0601 20
66 138.09134 11
67 353.08672 1
68 295.18049 5
69 368.11287 2
70 268.13321 5
71 166.08626 13
72 53

622 347.0762 7
623 301.0707 15
624 531.1497 0
625 365.102 1
628 170.0118 2
629 179.0339 4
630 216.1019 1
631 419.1337 5
633 315.0863 11
634 483.1906 1
636 286.1438 11
637 299.0914 5
638 197.0669 16
639 241.0859 17
640 297.1122 5
641 349.0626 1
642 373.2374 3
643 375.1439 9
646 258.1085 2
647 338.1459 2
648 579.1709 6
649 362.1347 13
650 345.1333 4
651 359.1489 14
652 283.0965 3
653 269.0809 9
654 281.1172 2
656 282.1489 0
657 198.0761 6
658 585.2906 1
659 221.0476 1
660 301.1071 10
662 345.0969 14
663 329.102 7


In [41]:
adducts = [("[M-]-", 1, -1.007276),
           ("[M+CH3COO]-", 1, 59.013851)]
ms_params = {
    'adducts': ["[M-]-", "[M+CH3COO]-"],
    'tolerance': 2,  # mDa
    'ppm': False,
    'halogens': True,
    'verbose': False,
    'charge': '-',
    'models': []
}

In [42]:
ids = mb_df.loc[mb_df.Mode == 'Negative'].index
masses = [str(val) for val in mb_df.loc[mb_df.Mode == 'Negative'].Mass.values]
len(masses)

47

In [43]:
for cpd_id, mass in zip(ids, masses):
    peak = Peak(cpd_id, 0, float(mass), 0)
    hits_dict[cpd_id] = ms_adduct_search(peak, ms_params, adducts, kegg_df)
    peaks.append(peak)
    print(cpd_id, mass, len(hits_dict[cpd_id]))

110 450.0204 0
115 145.06186 7
117 203.0826 7
118 118.05096 11
123 132.03023 6
130 300.2908 1
136 450.05678 1
388 406.2024 1
394 141.0193 11
419 343.0823 31
420 301.2173 33
427 367.1187 8
432 251.0713 4
439 339.1714 1
440 301.1445 12
441 301.2173 33
458 345.1343 17
459 263.1289 30
467 325.0929 6
471 435.1296 1
472 423.1296 4
473 455.353 15
479 289.0717 21
488 309.098 4
493 179.035 6
514 471.348 9
530 547.2661 1
534 193.0354 31
535 391.2854 15
540 241.083 6
544 173.0455 10
546 285.098 4
547 301.0717 14
548 375.2904 3
552 579.1719 6
554 277.2173 8
558 178.0509 6
559 465.3216 0
570 123.0451 8
574 205.0362 13
588 381.182 11
593 304.9778 0
644 225.0768 13
645 383.0136 0
664 265.083 1
665 165.0921 24
666 299.0561 18


## Calculate Statistics from Search Output

In [44]:
n_annotated = 0
num_hits = []

for peak in peaks:
    hits = hits_dict[peak.name]
    if len(hits) > 0:
        n_annotated += 1
    num_hits.append(len(hits))
    print(peak.name, len(hits))

0 2
1 4
2 2
3 1
4 1
5 0
6 3
7 4
8 1
9 3
10 3
11 2
12 6
13 1
14 3
15 25
16 0
17 2
18 1
19 14
20 10
21 1
22 0
23 2
24 6
25 6
26 6
27 14
28 1
29 0
30 2
31 6
32 5
33 8
34 1
35 1
36 8
37 1
38 1
39 1
40 5
41 2
42 3
43 0
44 3
45 4
46 8
47 2
48 3
49 1
50 2
51 2
52 3
53 2
54 1
55 3
56 4
57 2
58 0
59 2
60 0
61 10
62 0
64 2
65 20
66 11
67 1
68 5
69 2
70 5
71 13
72 5
73 19
74 2
76 0
77 8
78 1
79 1
80 26
81 3
82 15
83 10
84 0
85 10
86 6
87 1
88 3
89 2
90 8
91 4
93 7
95 6
96 6
97 1
98 15
99 16
100 8
102 5
103 6
104 1
105 10
106 9
107 4
108 9
109 3
111 3
112 13
113 49
114 6
116 3
119 23
122 5
124 3
125 3
126 3
127 40
128 10
129 4
132 3
133 13
134 4
135 40
137 10
138 13
139 0
140 38
141 3
142 2
143 2
144 1
145 1
146 7
147 4
148 2
149 0
150 1
151 1
152 2
153 1
154 10
155 0
156 1
157 2
158 2
159 4
160 1
161 1
162 1
163 1
164 3
165 2
166 4
167 1
168 1
169 2
170 3
171 0
172 1
173 1
174 2
175 4
176 2
177 3
178 1
179 3
180 1
181 1
182 1
183 2
184 1
185 6
186 7
187 5
188 1
189 1
190 3
191 3
192 4
193 8
194 1

In [45]:
n_annotated

567

In [46]:
n_annotated / 634

0.8943217665615142

In [47]:
np.median(num_hits)

3.0

In [48]:
np.mean(num_hits)

5.047318611987381

## Calc Correct Annotation

In [51]:
print('Searching KEGG for specific compounds...')
mb_inchikeys = set([inchikey.split('-')[0] for inchikey in mb_df.InChI_Key if isinstance(inchikey, str)])
kegg_inchikeys = set([inchikey.split('-')[0] for inchikey in kegg_df.InChI_Key if isinstance(inchikey, str)])

matches = mb_inchikeys.intersection(kegg_inchikeys)

# Some inchikeys in MB df have same first part so can't use set intersection to calculate
n_matches = 0
for inchi_key in mb_df['InChI_Key']:
    for match in matches:
        if inchi_key.split('-')[0] == match:
            n_matches += 1
            break

len(matches)

Searching KEGG for specific compounds...


358

## # Unique Formulas

In [50]:
formulas = set()
for smiles in kegg_df.SMILES:
    if isinstance(smiles, float) and np.isnan(smiles):
        continue
    elif '*' not in smiles:
        mol = MolFromSmiles(smiles)
        #mol = neutralise_charges(mol)
        formula = CalcMolFormula(mol)
        formulas.add(formula)
    else:
        continue

In [25]:
len(formulas)

9195