# Take the list of all KEGG compound SMILES and calculate the exact (monoisotopic) mass and InChIKey for each

In [43]:
import pandas as pd
import numpy as np
from minedatabase.utils import neutralise_charges
from rdkit.Chem.AllChem import SanitizeMol, MolFromSmiles, MolToInchiKey
from rdkit.Chem.Descriptors import ExactMolWt

In [33]:
# Input CSV of KEGG compounds; the cells below read its ID and SMILES columns.
DATA_FILEPATH = './../Data/kegg.csv'
# Output CSV: the same table with the Monoisotopic_Mass and InChI_Key columns added.
OUT_FILEPATH = './../Data/kegg_mass.csv'

In [58]:
# Load the KEGG compound table (one row per compound, with ID and SMILES columns).
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks like a
# saved index from an earlier to_csv call; consider index_col=0 if it is not needed
# downstream (confirm before changing, since it alters the written output).
df = pd.read_csv(DATA_FILEPATH)

In [59]:
# Preview the first rows of the freshly loaded table.
df.head()

Unnamed: 0,ID,SMILES
0,C00001,O
1,C00002,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...
2,C00003,NC(=O)c1ccc[n+]([C@@H]2O[C@H](COP(=O)(O)OP(=O)...
3,C00004,NC(=O)C1=CN([C@@H]2O[C@H](COP(=O)(O)OP(=O)(O)O...
4,C00005,NC(=O)C1=CN([C@@H]2O[C@H](COP(=O)(O)OP(=O)(O)O...


In [60]:
# Preview the last rows of the freshly loaded table.
df.tail()

Unnamed: 0,ID,SMILES
18617,C22171,Nc1nc2c(ncn2[C@@H]2O[C@H](COP(=O)(O)OP(=O)(O)O...
18618,C22172,O=C(O)[C@H](O)COP(=O)(O)OC[C@@H](O)[C@@H](O)[C...
18619,C22173,CC[C@H](CO)[C@H](N)C(=O)O
18620,C22174,C[C@H](O)[C@H](CO)[C@H](N)C(=O)O
18621,C22175,COc1cc([C@H]2OC[C@H]3[C@@H]2CO[C@]3(O)c2ccc(O)...


In [61]:
def _mass_and_inchikey(smiles):
    """Compute the monoisotopic mass and InChIKey for one SMILES string.

    Returns a ``(mass, inchikey)`` tuple, or ``None`` when RDKit cannot parse
    the SMILES (``MolFromSmiles`` signals that by returning ``None``).
    Charges are not neutralised here: the ``neutralise_charges`` step is
    disabled, so values are computed for the structure exactly as written.
    """
    mol = MolFromSmiles(smiles)
    if mol is None:
        return None
    return ExactMolWt(mol), MolToInchiKey(mol)


# Counters for the rows that do not get a mass / InChIKey.
n_star = 0      # SMILES contains a '*' wildcard atom
n_nan = 0       # SMILES is missing
n_invalid = 0   # SMILES present but not parseable by RDKit

# One entry per row of df, in row order, so they can be attached as columns.
mass_col = []
inchikey_col = []
for _id, smiles in zip(df.ID, df.SMILES):
    result = None
    if pd.isna(smiles):
        n_nan += 1
    elif '*' in smiles:
        n_star += 1
    else:
        result = _mass_and_inchikey(smiles)
        if result is None:
            # Keep going instead of aborting the whole run on one bad entry,
            # but make the failure visible together with the compound ID.
            n_invalid += 1
            print(f'Could not parse SMILES for {_id}: {smiles}')

    # Skipped rows are stored as NaN in both columns.
    mass, inchikey = result if result is not None else (np.nan, np.nan)
    mass_col.append(mass)
    inchikey_col.append(inchikey)



In [62]:
# Attach the per-row results as new columns, keeping the original column order:
# mass first, then InChIKey.
for column_name, column_values in (
    ('Monoisotopic_Mass', mass_col),
    ('InChI_Key', inchikey_col),
):
    df[column_name] = column_values

In [63]:
# Preview the first rows again, now including Monoisotopic_Mass and InChI_Key.
df.head()

Unnamed: 0,ID,SMILES,Monoisotopic_Mass,InChI_Key
0,C00001,O,18.010565,XLYOFNOQVPJJNP-UHFFFAOYSA-N
1,C00002,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,506.995745,ZKHQWZAMYRWXGA-KQYNXXCUSA-N
2,C00003,NC(=O)c1ccc[n+]([C@@H]2O[C@H](COP(=O)(O)OP(=O)...,664.116398,BAWFJGJZGIEFAR-NNYOXOHSSA-O
3,C00004,NC(=O)C1=CN([C@@H]2O[C@H](COP(=O)(O)OP(=O)(O)O...,665.124772,BOPGDPNILDQYTO-NNYOXOHSSA-N
4,C00005,NC(=O)C1=CN([C@@H]2O[C@H](COP(=O)(O)OP(=O)(O)O...,745.091102,ACFIXJIJDZMPPO-NNYOXOHSSA-N


In [64]:
# Preview the last rows again, now including Monoisotopic_Mass and InChI_Key.
df.tail()

Unnamed: 0,ID,SMILES,Monoisotopic_Mass,InChI_Key
18617,C22171,Nc1nc2c(ncn2[C@@H]2O[C@H](COP(=O)(O)OP(=O)(O)O...,531.040374,UBONPDGCIQEDRL-QWEIRQIHSA-N
18618,C22172,O=C(O)[C@H](O)COP(=O)(O)OC[C@@H](O)[C@@H](O)[C...,531.089024,BJMFZLLJDXJPEK-YJNKXOJESA-N
18619,C22173,CC[C@H](CO)[C@H](N)C(=O)O,147.089543,FBQPPRTWSNHYNZ-UHNVWZDZSA-N
18620,C22174,C[C@H](O)[C@H](CO)[C@H](N)C(=O)O,163.084458,UBLKCZXWFBWGNA-YUPRTTJUSA-N
18621,C22175,COc1cc([C@H]2OC[C@H]3[C@@H]2CO[C@]3(O)c2ccc(O)...,374.136553,JGWZMWCBIAYEIJ-AFHBHXEDSA-N


In [65]:
# Write the augmented table to OUT_FILEPATH; index=False keeps the DataFrame
# index out of the file (columns already present in df are written unchanged).
df.to_csv(OUT_FILEPATH, index=False)

In [66]:
# Number of rows skipped because their SMILES contains a '*' wildcard.
n_star

1468

In [67]:
# Number of rows skipped because their SMILES value is missing (NaN).
n_nan

37

In [68]:
# Total number of rows (compounds) in the table.
len(df)

18622

In [69]:
# Rows that were neither missing nor wildcard-containing, i.e. the SMILES that
# were handed to RDKit for mass / InChIKey calculation.
len(df) - n_star - n_nan

17117