**This notebook contains a function that fetches a SMILE string starting from a KEGG compound page. It also contains the associated unittests.**

USE KEGG_DF_TO_SMILES and SID_TO_SMILES

All of the unit tests are passing as of 3/5/19 at 7:30 pm. 

This notebook contains the basis of a function(s) to take a PubChem ID number and fetch the associated SMILES string from PubChem.

It also contains code pieces to pull an SID from a KEGG webpage.

In [52]:
import numpy as np
import pandas as pd
import pubchempy as pc
import requests
import re
from time import sleep

from bs4 import BeautifulSoup


There are multiple identifier types for each chemical in PubChem. The two we are interacting with here are **SID** (substance ID) and **CID** (chemical ID). CID can be used to acces SMILES directly with PubChemPy. **KEGG does not have CID**, only SID. SID can be turned into CID from which SMILES can be found. 

### Get SMILES from CID and SID

In [3]:
# get SMILES directly from CID
for compound in pc.get_compounds('243'):
    print(compound.isomeric_smiles)

C1=CC=C(C=C1)C(=O)O


In [125]:
# get SMILES from SID through mapping to CID
substance = pc.Substance.from_sid('3990')
cid = substance.standardized_cid
compound = pc.get_compounds(cid)[0]
print(compound.isomeric_smiles, compound)

C1CSSC1CCCCC(=O)O Compound(864)


In [35]:
def sid_to_smiles(df):
    """Takes an SID and prints the associated SMILES string."""
    sid = df['SID']
    substance = pc.Substance.from_sid(sid)
    sleep(0.05)
    cid = substance.standardized_cid
    compound = pc.get_compounds(cid)[0]
    print(compound.isomeric_smiles)

In [None]:
# for compound in pc.get_compounds('glucose', 'name'):
#    print(compound.cid, compound.isomeric_smiles, compound.smiles)

In [2]:
#%%writefile make_smiles_utils.py

import pubchempy as pc
# I could easily make this handle SIDs, too, but then the user would have to specify whether it is an SID or a CID.
def user_input_to_smiles(input_cid):
    """Takes a PubChem CID input and outputs the associated SMILES."""
    assert type(input_cid) is int, 'Expected an integer ID input'
    for compound in pc.get_compounds(input_cid):
        print(compound.isomeric_smiles)

Overwriting make_smiles_utils.py


In [72]:
user_input_to_smiles(243)

C1=CC=C(C=C1)C(=O)O


In [33]:
#%%writefile test_make_smiles_utils.py

import pubchempy as pc
import make_smiles_utils

def test_user_input_to_smiles():
    # check that the input is an integer
    # check that the output is a string
    
    # CID for thiophene
    inp_cid = 8030 
    smiles = make_smiles_utils.user_input_to_smiles(inp_cid)
    
    assert len(str(smiles)) == 4, 'This is not the correct SMILES length for thiophene'
    assert str(smiles) == 'C1=CSC=C1', 'This is not the correct SMILES for thiophene'
    
    return #len(str(smiles))

In [35]:
test_user_input_to_smiles()

C1=CSC=C1


__________________

### Manipulate DF containing SID into SMILES

In [111]:
filler = ['fill', 'fill', 'fill', 'fill']
sids = ['3305', '3333', '3480', '3371']
compounds = ['NAD', 'glucose', 'benzoic acid', 'unknown']
react = ['1.0', '0.0','1.0', '0.0']
tuple_list = list(zip(filler, compounds, react, sids))
devo_df = pd.DataFrame(tuple_list, columns=['enzyme', 'product', 'reacts', 'pubchem_id'])
devo_df.values.tolist()

Unnamed: 0,Compound Name,SID
0,NAD,3305
1,glucose,3333
2,benzoic acid,3480
3,methanol,3432


In [None]:
devo_df.to_csv(r'devo_df.csv', index=False)

In [129]:
%%writefile pubchem_client.py

import numpy as np
import pandas as pd
import pubchempy as pc


def sid_to_smiles(sid):
    """Takes a PubChem SID. Returns the associated isomeric SMILES string and PubChem CID.

    Args:
        sid : The PubChem SID number.

    Returns:
        str: isomeric smiles.
        int: Pubchem CID number.

    """

    substance = pc.Substance.from_sid(sid)
    cid = substance.standardized_cid
    compound = pc.get_compounds(cid)[0]

    return compound.isomeric_smiles, cid


def kegg_df_to_smiles(kegg_df, column_name):
    """
    Args:
        kegg_df : pandas dataframe with SID numbers
        column_name (str) : name of column that contains PubChem SID numbers

    Returns:
        kegg_df : modified with columns containing CID and SMILES
                  CID becomes first column, SMILES second
        unsuccessful_list : list of SIDs for which no CID or SMILES were found

    """

    res = []
    cid_list = []
    unsuccessful_list = []
    #copy = kegg_df.copy()
    for i in range(len(kegg_df)):
        # cell index of desired SID
        sid = kegg_df.loc[i, column_name]
        try:
            smile_result = sid_to_smiles(sid)[0]
            res.append(smile_result)
            cid_result = sid_to_smiles(sid)[1]
            cid_list.append(cid_result)
        except BaseException:
            res.append('none')
            cid_list.append('none')
            unsuccessful_list.append(sid)
            pass
        
    #kegg_df['CID'] = cid_list
    #kegg_df['SMILES'] = res
    kegg_df.insert(0, column='CID', value=cid_list)
    # Change this 2 to the number where the smiles column should be
    kegg_df.insert(1, column='SMILES', value=res)
    # kegg_df.to_csv(r'../datasets/df_cleaned_kegg_with_smiles.csv')
    
    return kegg_df, unsuccessful_list


def csv_wrapper(input_csv, column_name, output_csv):
    """
    Args:
        input_csv (str) : 'input_csv.csv' format; in current directory
        column_name (str) : name of column that contains PubChem SID numbers
        output_csv (str) : 'output_csv.csv' format
    
    Returns:
        output_csv : saves as 'output_csv.csv' into current directory
    """
    
    input_df = pd.read_csv(input_csv).astype(str)
    input_df['reacts'] = input_df['reacts'].astype(float)
    output, _ = kegg_df_to_smiles(input_df, column_name)

    return output.to_csv(r'' + output_csv, index=False)

Overwriting kegg_data.py


In [None]:
csv_wrapper('devo_df.csv', 'pubchem_id', 'it_worked_df.csv' )

In [112]:
kegg_df_to_smiles(devo_df)

Unnamed: 0,Compound Name,SID,SMILES
0,NAD,3305,C1=CC(=C[N+](=C1)[C@H]2[C@@H]([C@@H]([C@H](O2)...
1,glucose,3333,C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O
2,benzoic acid,3480,C1=CC=C(C=C1)C(=O)O
3,methanol,3432,CO


In [128]:
%%writefile test_pubchem_client.py

import pandas as pd

from pandas.util.testing import assert_frame_equal

import pubchem_client


def test_sid_to_smiles():
    """Unit test for pubchem_client.py sid_to_smiles."""

    sids = ['3489', '3990']
    expected = ['C(CO)N', 'C1CSSC1CCCCC(=O)O']
    actual = []

    for sid in sids:
        result_smile = pubchem_client.sid_to_smiles(sid)

        assert len(
            result_smile) >= 1, 'SMILES string is very short. Check SMILES.'
        isinstance(result_smile, str), 'SMILES not returned as string.'

        actual.append(result_smile[0])

    assert expected == actual, 'Actual SMILES are not the expected SMILES.'

    return


def test_kegg_df_to_smiles():
    """Unit test for pubchem_client.py kegg_df_to_smiles."""

    test_frame = pd.DataFrame([['space fill', 'ethanolamine', '1.0', '3489'], [
                              'space fill', 'pyruvate', '1.0', '3324']], columns=['Filler', 'Compound Name', 'Reacts', 'SID'])

    expected_frame = pd.DataFrame([[int(700),
                                    'C(CO)N',
                                    'space fill',
                                    'ethanolamine',
                                    '1.0',
                                    '3489'
                                    ],
                                   [int(1060),
                                    'CC(=O)C(=O)O',
                                    'space fill',
                                    'pyruvate',
                                    '1.0',
                                    '3324',
                                    ]],
                                  columns=['CID',
                                           'SMILES',
                                           'Filler',
                                           'Compound Name',
                                           'Reacts',
                                           'SID',
                                           ])
    column_name = 'SID'
    result_frame = pubchem_client.kegg_df_to_smiles(test_frame, column_name)

    assert_frame_equal(
        result_frame[0], expected_frame), 'Did not generate expected df.'

    return


def test_csv_wrapper():
    """Unit test for csv_wrapper function."""
    
    filler = ['fill', 'fill', 'fill', 'fill']
    sids = ['3305', '3333', '3480', '3371']
    compounds = ['NAD', 'glucose', 'benzoic acid', 'unknown']
    react = ['1.0', '0.0','1.0', '0.0']
    tuple_list = list(zip(filler, compounds, react, sids))
    test_df = pd.DataFrame(tuple_list, columns=['enzyme', 'product', 'reacts', 'pubchem_id'])
    
    test_df.to_csv(r'test_df.csv', index=False)
    
    expected = pd.DataFrame([['5893',
  'C1=CC(=C[N+](=C1)[C@H]2[C@@H]([C@@H]([C@H](O2)COP(=O)(O)OP(=O)(O)OC[C@@H]3[C@H]([C@H]([C@@H](O3)N4C=NC5=C4N=CN=C5N)O)O)O)O)C(=O)N',
  'fill',
  'NAD',
  1.0,
  3305],
 ['5793',
  'C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O',
  'fill',
  'glucose',
  0.0,
  3333],
 ['243', 'C1=CC=C(C=C1)C(=O)O', 'fill', 'benzoic acid', 1.0, 3480],
 ['none', 'none', 'fill', 'unknown', 0.0, 3371]], columns=['CID', 'SMILES', 'enzyme', 'product', 'reacts', 'pubchem_id'])
    
    column_name = 'pubchem_id'
    
    pubchem_client.csv_wrapper('test_df.csv', column_name, 'wrapper_test.csv')
    
    actual = pd.read_csv('wrapper_test.csv')
    
    assert expected.loc[1, 'reacts'] == actual.loc[1, 'reacts']
    assert expected.loc[2, 'SMILES'] == actual.loc[2, 'SMILES']
    
    return

Overwriting test_kegg_data.py


---
### Get SID from KEGG url 

### Probably don't need this stuff, keeping it just in case

This currently works from the compound page. I have not seen if it can be pulled from the reaction page.


In [118]:
#%%writefile kegg_utils.py

import pubchempy as pc
import re
import requests

from bs4 import BeautifulSoup


def sid_to_smiles(sid):
    substance = pc.Substance.from_sid(sid)
    cid = substance.standardized_cid
    compound = pc.get_compounds(cid)[0]
    return compound.isomeric_smiles


def kegg_to_sid(url):
    # access the url
    response = requests.get(url)

    # turn the webpage into html
    soup = BeautifulSoup(response.content, 'html.parser')

    # find the link that contains 'pubchem'
    sid = soup.find('a', href=re.compile(r'https://pubchem\.ncbi'))

    sid_string = sid.string

    return sid_string


def kegg_to_smiles(url):
    """Uses the KEGG compound page url to find the compound's PubChem SID, then to find the SMILES for that compound using the SID."""

    # access the url
    response = requests.get(url)

    # turn the webpage into html
    soup = BeautifulSoup(response.content, 'html.parser')

    # find the link that contains 'pubchem'
    sid = soup.find('a', href=re.compile(r'https://pubchem\.ncbi'))

    substance = pc.Substance.from_sid(sid.string)
    cid = substance.standardized_cid
    compound = pc.get_compounds(cid)[0]

    print(compound.isomeric_smiles)

Overwriting kegg_utils.py


In [None]:
#%%writefile kegg_utils.py

import pubchempy as pc
import re
import requests

from bs4 import BeautifulSoup


def kegg_to_sid(url):
    # access the url
    response = requests.get(url)

    # turn the webpage into html
    soup = BeautifulSoup(response.content, 'html.parser')

    # find the link that contains 'pubchem'
    sid = soup.find('a', href=re.compile(r'https://pubchem\.ncbi'))

    sid_string = sid.string

    return sid_string

In [7]:
# url of the desired KEGG compound page
url = 'https://www.genome.jp/dbget-bin/www_bget?cpd:C00180'
# access the url
response = requests.get(url)

# turn the webpage into html
soup = BeautifulSoup(response.content, 'html.parser')

# find the link that contains 'pubchem'
sid = soup.find('a', href=re.compile('https://pubchem\.ncbi'))

# print the string that is displayed as the link
# (this is the SID, which works with pubchempy to get the SMILES)
print(sid.string)

3480


In [119]:
#%%writefile test_kegg_utils.py
import unittest

import kegg_utils


def test_kegg_to_sid():
    url_list = [
        'https://www.genome.jp/dbget-bin/www_bget?cpd:C00180',
        'https://www.genome.jp/dbget-bin/www_bget?cpd:C00587',
        'https://www.genome.jp/dbget-bin/www_bget?cpd:C00002']
    for url in url_list:
        sid_str = kegg_utils.kegg_to_sid(url)
        sid_str.isdigit(), 'SID contains characters other than numbers'
    return


def test_kegg_to_smiles():

    url = 'https://www.genome.jp/dbget-bin/www_bget?cpd:C00587'
    smiles = kegg_utils.kegg_to_sid(url)

    assert len(smiles) >= 1, 'SMILES string is very short. Check SMILES.'
    isinstance(smiles, str), 'SMILES not returned as string.'
    
    return

import pandas as pd
import pubchempy as pc

import kegg_utils

def test_kegg_df_to_smiles():
    
    test_frame = pd.DataFrame([['ethanolamine', '3489'], ['pyruvate', '3324']], columns=['Compound Name', 'SID'])
    
    expected_frame = pd.DataFrame([['ethanolamine', '3489', 'C(CO)N'], ['pyruvate', '3324', 'CC(=O)C(=O)O']], columns=['Compound Name', 'SID', 'SMILES'])
    
    result_frame = kegg_utils.kegg_df_to_smiles(test_frame)
    
    assert result_frame.equals(expected_frame), 'Did not generate expected df.'
    
    return

Overwriting test_kegg_utils.py


biopython kegg api to pull SID

store smiles into dataframe to join later 
