In [25]:
# Helper functions that return general information about a compound, e.g. the number of each atom type, molecular weight (MW) and degree of unsaturation.
# This cell will become cpd_info.py

from rdkit import Chem
from rdkit.Chem import AllChem

def count_C(mol):
    """Return how many atoms of ``mol`` are carbon (atomic number 6).

    ``mol`` must expose ``GetAtoms()``, and each atom ``GetAtomicNum()``.
    """
    carbons = [atom for atom in mol.GetAtoms() if atom.GetAtomicNum() == 6]
    return len(carbons)
def count_O(mol):
    """Return how many atoms of ``mol`` are oxygen (atomic number 8).

    ``mol`` must expose ``GetAtoms()``, and each atom ``GetAtomicNum()``.
    """
    # Summing booleans counts the atoms for which the comparison holds.
    return sum(atom.GetAtomicNum() == 8 for atom in mol.GetAtoms())
def count_N(mol):
    """Return how many atoms of ``mol`` are nitrogen (atomic number 7).

    ``mol`` must expose ``GetAtoms()``, and each atom ``GetAtomicNum()``.
    """
    total = 0
    for atom in mol.GetAtoms():
        if atom.GetAtomicNum() == 7:
            total += 1
    return total
def count_P(mol):
    """Return how many atoms of ``mol`` are phosphorus (atomic number 15).

    ``mol`` must expose ``GetAtoms()``, and each atom ``GetAtomicNum()``.
    """
    phosphorus = filter(lambda atom: atom.GetAtomicNum() == 15, mol.GetAtoms())
    return sum(1 for _ in phosphorus)
def count_S(mol):
    """Return how many atoms of ``mol`` are sulfur (atomic number 16).

    ``mol`` must expose ``GetAtoms()``, and each atom ``GetAtomicNum()``.
    """
    atomic_numbers = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
    return atomic_numbers.count(16)
def count_X(mol):
    """Return how many atoms of ``mol`` are halogens.

    Counted atomic numbers are 9 (F), 17 (Cl), 35 (Br) and 53 (I).
    ``mol`` must expose ``GetAtoms()``, and each atom ``GetAtomicNum()``.
    """
    halogen_numbers = (9, 17, 35, 53)
    return sum(1 for atom in mol.GetAtoms() if atom.GetAtomicNum() in halogen_numbers)
def count_H(mol):
    """Return the total hydrogen count of ``mol``.

    Sums ``GetTotalNumHs(includeNeighbors=True)`` over every atom index from
    0 to ``mol.GetNumAtoms() - 1``.
    """
    return sum(
        mol.GetAtomWithIdx(idx).GetTotalNumHs(includeNeighbors=True)
        for idx in range(mol.GetNumAtoms())
    )

from rdkit.Chem.Descriptors import MolWt

def cpd_inform(SMILES):
    """Compute basic composition descriptors of a compound from its SMILES.

    Parameters
    ----------
    SMILES : str
        SMILES string of the compound; parsed with ``Chem.MolFromSmiles``.

    Returns
    -------
    list of float
        Nine values in this fixed order:
        ``[n_C, n_H, n_O, n_N, n_P, n_S, n_X, DoU, MW]`` where ``n_X`` is the
        halogen count (F, Cl, Br, I), ``DoU`` is the degree of unsaturation
        ``(2*C + 2 + N + P - X - H) / 2`` and ``MW`` is the value returned by
        ``rdkit.Chem.Descriptors.MolWt``.

    Raises
    ------
    ValueError
        If the SMILES string cannot be parsed (``Chem.MolFromSmiles`` returned
        ``None``).
    """
    mol = Chem.MolFromSmiles(SMILES)
    if mol is None:
        # Without this guard the failure surfaces later as an opaque
        # "'NoneType' object has no attribute 'GetAtoms'" error.
        raise ValueError("Could not parse SMILES string: {!r}".format(SMILES))

    # Counts are stored as floats so that every entry of the result has the
    # same type as DoU and MW.
    n_C = float(count_C(mol))
    n_H = float(count_H(mol))
    n_O = float(count_O(mol))
    n_N = float(count_N(mol))
    n_P = float(count_P(mol))
    n_S = float(count_S(mol))
    n_X = float(count_X(mol))

    # Degree of unsaturation: (2*C + 2 + N + P - X - H) / 2
    dou = (2 * n_C + 2 + n_N + n_P - n_X - n_H) / 2

    return [n_C, n_H, n_O, n_N, n_P, n_S, n_X, dou, MolWt(mol)]

# Create a function that adds new columns of chemical information to a DataFrame

import pandas as pd

def create_cpd_info(input_df, col_name='SMILES'):
    """Add compound-descriptor columns computed from a SMILES column.

    For every row, ``cpd_inform`` is called on the value in ``col_name`` and
    its nine results are written to the columns ``n_C``, ``n_H``, ``n_O``,
    ``n_N``, ``n_P``, ``n_S``, ``n_X``, ``DoU`` and ``MW``.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame holding the SMILES strings. It is modified in place: the nine
        columns are added, or overwritten if they already exist.
    col_name : str, default 'SMILES'
        Name of the column that contains the SMILES strings.

    Returns
    -------
    pandas.DataFrame
        The same ``input_df`` object, with the descriptor columns filled in.

    Notes
    -----
    Rows are processed by position and the results are attached using
    ``input_df.index``, so the function also works on frames whose index is
    not ``0..n-1`` (for example after filtering or sorting). Any exception
    raised by ``cpd_inform`` propagates before ``input_df`` is modified.
    """
    column_names = ('n_C', 'n_H', 'n_O', 'n_N', 'n_P', 'n_S', 'n_X', 'DoU', 'MW')

    # Iterating the Series yields values in row order regardless of the index
    # labels, unlike ``input_df[col_name][row]`` which looks up labels.
    records = [cpd_inform(smiles) for smiles in input_df[col_name]]

    for position, name in enumerate(column_names):
        values = [record[position] for record in records]
        # An explicit index keeps each value on the row it was computed from;
        # dtype=float keeps the columns float64 even for an empty frame.
        input_df[name] = pd.Series(values, index=input_df.index, dtype=float)

    return input_df

In [46]:
# This cell is test_cpd_info.py

from rdkit import Chem
from rdkit.Chem import AllChem
from cpd_info import cpd_inform
from cpd_info import create_cpd_info

def test_cpd_inform():
    """Check ``cpd_inform`` against known values for rapamycin.

    Verifies the carbon count, the hydrogen count and that the last entry
    (molecular weight) is a plain Python float. Returns a success message
    when all assertions hold.
    """
    rapamycin_smiles = 'C[C@@H]1CC[C@H]2C[C@@H](/C(=C/C=C/C=C/[C@H](C[C@H](C(=O)[C@@H]([C@@H](/C(=C/[C@H](C(=O)C[C@H](OC(=O)[C@@H]3CCCCN3C(=O)C(=O)[C@@]1(O2)O)[C@H](C)C[C@@H]4CC[C@H]([C@@H](C4)OC)O)C)/C)O)OC)C)C)/C)OC'
    descriptors = cpd_inform(rapamycin_smiles)

    assert descriptors[0] == 51, "Carbon count is incorrect"
    assert descriptors[1] == 79, "Hydrogen count is incorrect"
    assert type(descriptors[-1]) is float, "TypeError: Molecular Weight should be float"

    return 'Test pass, yayyyyyy'
import pandas as pd

def test_create_cpd_info():
    """Unit test for ``create_cpd_info`` on a four-row example frame.

    Checks the carbon count of row 0, the degree of unsaturation of row 3,
    and that the ``MW`` and ``n_H`` entries have the same type as the
    ``n_C`` entries. Returns a success message when all assertions hold.
    """
    example_smiles = [
        'C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O',
        'C([C@@H]1[C@@H]([C@@H]([C@H]([C@H](O1)O)O)O)O)O',
        'C([C@H]([C@H]([C@@H](C(=O)CO)O)O)O)O',
        'C[C@@H]1CC[C@H]2C[C@@H](/C(=C/C=C/C=C/[C@H](C[C@H](C(=O)[C@@H]([C@@H](/C(=C/[C@H](C(=O)C[C@H](OC(=O)[C@@H]3CCCCN3C(=O)C(=O)[C@@]1(O2)O)[C@H](C)C[C@@H]4CC[C@H]([C@@H](C4)OC)O)C)/C)O)OC)C)C)/C)OC',
    ]
    frame = pd.DataFrame(example_smiles, columns=['SMILES'])
    annotated = create_cpd_info(frame)

    assert annotated['n_C'][0] == 6, "ValueError: Carbon count is incorrect"
    assert annotated['DoU'][3] == 13, "ValueError: Degree of Unsaturation in inaccurate"
    assert type(annotated['MW'][2]) == type(annotated['n_C'][0]), "TypeError: MW should be float"
    assert type(annotated['n_H'][3]) == type(annotated['n_C'][0]), "TypeError: All data should be float"

    return 'Test pass, you can use it to create compound info columns'

In [27]:
# Manual check: run cpd_inform on the rapamycin SMILES and display the returned list.
rapamycin = 'C[C@@H]1CC[C@H]2C[C@@H](/C(=C/C=C/C=C/[C@H](C[C@H](C(=O)[C@@H]([C@@H](/C(=C/[C@H](C(=O)C[C@H](OC(=O)[C@@H]3CCCCN3C(=O)C(=O)[C@@]1(O2)O)[C@H](C)C[C@@H]4CC[C@H]([C@@H](C4)OC)O)C)/C)O)OC)C)C)/C)OC'
test = cpd_inform(rapamycin)
test

[51.0, 79.0, 13.0, 1.0, 0.0, 0.0, 0.0, 13.0, 914.1870000000001]

In [28]:
# cpd_inform appends nine entries (seven atom counts, DoU, MW), so this should be 9.
len(test)

9

In [29]:
# Run the unit test; it returns its success string only if every assertion holds.
test_cpd_inform()

'Test pass, yayyyyyy'

In [30]:
# Example frame with a single 'SMILES' column holding four compounds;
# the last entry is the same rapamycin SMILES used in the cpd_inform check.
df_master = pd.DataFrame(['C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O',
         'C([C@@H]1[C@@H]([C@@H]([C@H]([C@H](O1)O)O)O)O)O',
         'C([C@H]([C@H]([C@@H](C(=O)CO)O)O)O)O', 
         'C[C@@H]1CC[C@H]2C[C@@H](/C(=C/C=C/C=C/[C@H](C[C@H](C(=O)[C@@H]([C@@H](/C(=C/[C@H](C(=O)C[C@H](OC(=O)[C@@H]3CCCCN3C(=O)C(=O)[C@@]1(O2)O)[C@H](C)C[C@@H]4CC[C@H]([C@@H](C4)OC)O)C)/C)O)OC)C)C)/C)OC'] , columns=['SMILES'])
df_master

Unnamed: 0,SMILES
0,C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O
1,C([C@@H]1[C@@H]([C@@H]([C@H]([C@H](O1)O)O)O)O)O
2,C([C@H]([C@H]([C@@H](C(=O)CO)O)O)O)O
3,C[C@@H]1CC[C@H]2C[C@@H](/C(=C/C=C/C=C/[C@H](C[...


In [37]:
# create_cpd_info writes the descriptor columns into df_master itself and returns that same frame,
# so `test` and `df_master` refer to one object. Then inspect the element type of the MW column.
test = create_cpd_info(df_master)
type(test['MW'][2])

numpy.float64

In [39]:
# Element type of the n_C column, for comparison with the MW column.
type(test['n_C'][2])

numpy.float64

In [42]:
# Reference point: the type of a plain Python float literal.
type(5.0)

float

In [32]:
# Timing of the "old" create_cpd_info implementation — presumably a baseline for a later rewrite (TODO confirm).
# Note: every timed call rewrites the descriptor columns of df_master in place.
%timeit create_cpd_info(df_master, 'SMILES')

8.96 ms ± 276 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [33]:
# Display the frame returned by create_cpd_info, with the descriptor columns added.
create_cpd_info(df_master, 'SMILES')

Unnamed: 0,SMILES,n_C,n_H,n_O,n_N,n_P,n_S,n_X,DoU,MW
0,C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O,6.0,12.0,6.0,0.0,0.0,0.0,0.0,1.0,180.156
1,C([C@@H]1[C@@H]([C@@H]([C@H]([C@H](O1)O)O)O)O)O,6.0,12.0,6.0,0.0,0.0,0.0,0.0,1.0,180.156
2,C([C@H]([C@H]([C@@H](C(=O)CO)O)O)O)O,6.0,12.0,6.0,0.0,0.0,0.0,0.0,1.0,180.156
3,C[C@@H]1CC[C@H]2C[C@@H](/C(=C/C=C/C=C/[C@H](C[...,51.0,79.0,13.0,1.0,0.0,0.0,0.0,13.0,914.187


In [47]:
# Run the unit test; it returns its success string only if every assertion holds.
test_create_cpd_info()

'Test pass, you can use it to create compound info columns'