**This notebook contains a function that fetches a SMILES string starting from a KEGG compound page. It also contains the associated unit tests.**

Use `kegg_df_to_smiles` and `sid_to_smiles` (defined in the `pubchem_client.py` cell below).

All of the unit tests are passing as of 3/5/19 at 7:30 pm. 

This notebook contains the basis of one or more functions that take a PubChem ID number and fetch the associated SMILES string from PubChem.

It also contains code pieces to pull an SID from a KEGG webpage.

In [None]:
import numpy as np
import pandas as pd
import pubchempy as pc

There are multiple identifier types for each chemical in PubChem. The two we are interacting with here are **SID** (substance ID) and **CID** (compound ID). A CID can be used to access SMILES directly with PubChemPy. **KEGG does not have CID**, only SID. An SID can be mapped to a CID, from which the SMILES can be found.

### Get SMILES from CID and SID

In [None]:
# Query PubChem by CID and show the isomeric SMILES of every compound returned
cid_matches = pc.get_compounds('243')
for compound in cid_matches:
    print(compound.isomeric_smiles)

In [None]:
# get SMILES from SID through mapping to CID
# Count of SIDs that have no standardized CID; must exist before it is incremented.
none_tally = 0

substance = pc.Substance.from_sid('3371')
cid = substance.standardized_cid
if cid is None:
    # No CID to look up, so there is no compound (and no SMILES) for this SID.
    none_tally += 1
else:
    compound = pc.get_compounds(cid)[0]

print(none_tally)
#print(compound.isomeric_smiles, compound)

In [None]:
def sid_to_smiles(df):
    """Return the isomeric SMILES for the SID stored under ``df['SID']``.

    Parameters
    ----------
    df : mapping-like
        Any object supporting ``df['SID']`` (for example a DataFrame row);
        the value is passed to ``pc.Substance.from_sid``.

    Returns
    -------
    str or None
        ``isomeric_smiles`` of the first compound returned for the
        substance's standardized CID, or ``None`` when the substance has no
        standardized CID.
    """
    sid = df['SID']
    substance = pc.Substance.from_sid(sid)
    cid = substance.standardized_cid
    if cid is None:
        # Without a CID there is nothing to look up. Return None so the caller
        # can tally misses itself (a tally incremented here would be a local
        # variable and raise UnboundLocalError).
        return None

    compound = pc.get_compounds(cid)[0]
    return compound.isomeric_smiles

In [None]:
# for compound in pc.get_compounds('glucose', 'name'):
#    print(compound.cid, compound.isomeric_smiles, compound.smiles)

In [None]:
#%%writefile make_smiles_utils.py

import pubchempy as pc
# I could easily make this handle SIDs, too, but then the user would have to specify whether it is an SID or a CID.
def user_input_to_smiles(input_cid):
    """Look up a PubChem CID and return the associated isomeric SMILES.

    Every SMILES returned by ``pc.get_compounds`` is still printed, as before;
    the first one is now also returned so callers and tests can use it.

    Parameters
    ----------
    input_cid : int
        PubChem CID. Anything that is not exactly an ``int`` fails the
        assertion below.

    Returns
    -------
    str or None
        The first compound's ``isomeric_smiles``, or ``None`` if the lookup
        returned no compounds.
    """
    assert type(input_cid) is int, 'Expected an integer ID input'
    smiles_found = [compound.isomeric_smiles
                    for compound in pc.get_compounds(input_cid)]
    for smiles in smiles_found:
        print(smiles)
    return smiles_found[0] if smiles_found else None

In [None]:
# Example call: CID 243 is the same CID queried in the first example cell.
user_input_to_smiles(243)

In [None]:
#%%writefile test_make_smiles_utils.py

import pubchempy as pc
import make_smiles_utils

def test_user_input_to_smiles():
    # check that the input is an integer
    # check that the output is a string
    
    # CID for thiophene
    inp_cid = 8030 
    smiles = make_smiles_utils.user_input_to_smiles(inp_cid)
    
    assert len(str(smiles)) == 4, 'This is not the correct SMILES length for thiophene'
    assert str(smiles) == 'C1=CSC=C1', 'This is not the correct SMILES for thiophene'
    
    return #len(str(smiles))

In [None]:
# Run the test defined above. It imports make_smiles_utils, so that module must
# exist on disk (the %%writefile line that would create it is commented out).
test_user_input_to_smiles()

__________________

### Manipulate DF containing SID into SMILES

In [None]:
# Small hand-built frame for development: filler text, compound name, PubChem SID.
compounds = ['NAD', 'glucose', 'benzoic acid', 'unknown']
sids = ['3305', '3333', '3480', '3371']
filler = ['fill'] * len(sids)
tuple_list = list(zip(filler, compounds, sids))
devo_df = pd.DataFrame(data=tuple_list, columns=['Filler', 'Compound Name','SID'])

# NOTE: kegg_df_to_smiles is defined in a later cell of this notebook, so that
# cell has to be run before this one.
kegg_df_to_smiles(devo_df)

In [1]:
#%%writefile pubchem_client.py

import pubchempy as pc
import pandas as pd
import numpy as np

def sid_to_smiles(sid):
    """Map a PubChem SID to its isomeric SMILES and standardized CID.

    Parameters
    ----------
    sid : str or int
        PubChem substance ID, passed to ``pc.Substance.from_sid``.

    Returns
    -------
    tuple
        ``(isomeric_smiles, cid)``, where ``cid`` is the substance's
        standardized CID and ``isomeric_smiles`` comes from the first
        compound returned by ``pc.get_compounds(cid)``.

    Raises
    ------
    ValueError
        If the substance has no standardized CID, so no compound can be
        looked up.
    """
    substance = pc.Substance.from_sid(sid)
    cid = substance.standardized_cid
    if cid is None:
        # Fail with a clear message instead of sending None to get_compounds.
        raise ValueError('SID {} has no standardized CID'.format(sid))

    compound = pc.get_compounds(cid)[0]
    return compound.isomeric_smiles, cid


def kegg_df_to_smiles(kegg_df, sid_column=2):
    """Add ``CID`` and ``SMILES`` columns to a dataframe holding PubChem SIDs.

    Each SID is resolved with ``sid_to_smiles`` exactly once. Rows whose
    lookup raises get the placeholder string ``'none'`` in both new columns,
    and their SID is recorded in the returned list of failures.

    Parameters
    ----------
    kegg_df : pandas.DataFrame
        Frame with one column of SIDs. It is modified in place: ``CID`` and
        ``SMILES`` are inserted directly after the SID column.
    sid_column : int, optional
        Positional index of the SID column. Defaults to 2, the position that
        was previously hard-coded.

    Returns
    -------
    tuple
        ``(kegg_df, unsuccessful_list)``: the same frame with the two new
        columns, and the list of SIDs whose lookup failed.
    """
    res = []
    cid_list = []
    unsuccessful_list = []

    for sid in kegg_df.iloc[:, sid_column]:
        try:
            # One lookup per SID. Two separate calls could leave the SMILES
            # and CID lists with different lengths if only the second failed.
            smile_result, cid_result = sid_to_smiles(sid)
        except Exception:
            # Best-effort: keep going and record the SID that failed.
            # (Exception rather than a bare except, so Ctrl-C still works.)
            res.append('none')
            cid_list.append('none')
            unsuccessful_list.append(sid)
        else:
            res.append(smile_result)
            cid_list.append(cid_result)

    kegg_df.insert(sid_column + 1, column='CID', value=cid_list)
    kegg_df.insert(sid_column + 2, column='SMILES', value=res)
    #kegg_df.to_csv(r'../datasets/df_cleaned_kegg_with_smiles.csv')

    return kegg_df, unsuccessful_list

In [36]:
#%%writefile test_pubchem_client.py

import pandas as pd

from pandas.util.testing import assert_frame_equal

import pubchem_client

def test_sid_to_smiles():
    
    sids = ['3489', '3990']
    expected = ['C(CO)N', 'C1CSSC1CCCCC(=O)O']
    actual = []
    
    for sid in sids:
        result_smile = pubchem_client.sid_to_smiles(sid)
        
        assert len(result_smile) >= 1, 'SMILES string is very short. Check SMILES.'
        isinstance(result_smile, str), 'SMILES not returned as string.'
        
        actual.append(result_smile[0])
    
    assert expected == actual, 'Actual SMILES are not the expected SMILES.'
    
    return
   
    
    
def test_kegg_df_to_smiles():
    
    test_frame = pd.DataFrame([['space fill', 'ethanolamine', '3489'], ['space fill', 'pyruvate', '3324']], columns=['Filler', 'Compound Name', 'SID'])
    
    expected_frame = pd.DataFrame([['space fill', 'ethanolamine', '3489', int(700), 'C(CO)N'], ['space fill', 'pyruvate', '3324', int(1060), 'CC(=O)C(=O)O']], columns=['Filler', 'Compound Name', 'SID', 'CID', 'SMILES'])
    
    result_frame = pubchem_client.kegg_df_to_smiles(test_frame)
    
    assert_frame_equal(result_frame[0], expected_frame), 'Did not generate expected df.'
    
    return 


Overwriting test_pubchem_client.py


In [29]:
# Run the SID -> SMILES test; it calls pubchem_client.sid_to_smiles once per SID
# (presumably needs network access to PubChem -- confirm).
test_sid_to_smiles()

In [30]:
# Run the dataframe test; it calls pubchem_client.kegg_df_to_smiles on a two-row
# frame (presumably needs network access to PubChem -- confirm).
test_kegg_df_to_smiles()

---
# Probably don't need this stuff, keeping it just in case

### Get SID from KEGG url 

This currently works from the compound page. I have not checked whether it can also be pulled from the reaction page.


In [None]:
#%%writefile kegg_utils.py

import pubchempy as pc
import re
import requests

from bs4 import BeautifulSoup


def sid_to_smiles(sid):
    """Return the isomeric SMILES of the compound standardized from a PubChem SID."""
    standardized_cid = pc.Substance.from_sid(sid).standardized_cid
    matches = pc.get_compounds(standardized_cid)
    return matches[0].isomeric_smiles


def kegg_to_sid(url, timeout=30):
    """Fetch a KEGG compound page and return the PubChem SID shown on it.

    The SID is taken from the text of the first ``<a>`` element whose
    ``href`` matches ``https://pubchem.ncbi``.

    Parameters
    ----------
    url : str
        URL of the KEGG compound page.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up (default 30),
        so a stalled connection cannot hang the caller indefinitely.

    Returns
    -------
    str
        The link text, which is the SID.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page contains no PubChem link.
    """
    # access the url
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # turn the webpage into html
    soup = BeautifulSoup(response.content, 'html.parser')

    # find the link that contains 'pubchem'
    sid = soup.find('a', href=re.compile(r'https://pubchem\.ncbi'))
    if sid is None:
        # soup.find returns None when nothing matches; report that clearly
        # instead of failing later on ``None.string``.
        raise ValueError('No PubChem link found on page: {}'.format(url))

    return sid.string


def kegg_to_smiles(url, timeout=30):
    """Uses the KEGG compound page url to find the compound's PubChem SID, then to find the SMILES for that compound using the SID.

    The SMILES is printed, as before, and is now also returned so callers
    can use it.

    Parameters
    ----------
    url : str
        URL of the KEGG compound page.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up (default 30).

    Returns
    -------
    str
        ``isomeric_smiles`` of the first compound returned for the
        substance's standardized CID.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page has no PubChem link, or the substance has no
        standardized CID.
    """
    # access the url
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # turn the webpage into html
    soup = BeautifulSoup(response.content, 'html.parser')

    # find the link that contains 'pubchem'
    sid = soup.find('a', href=re.compile(r'https://pubchem\.ncbi'))
    if sid is None:
        raise ValueError('No PubChem link found on page: {}'.format(url))

    # the link text is the SID; map it to a CID, then to the compound record
    substance = pc.Substance.from_sid(sid.string)
    cid = substance.standardized_cid
    if cid is None:
        raise ValueError('SID {} has no standardized CID'.format(sid.string))
    compound = pc.get_compounds(cid)[0]

    smiles = compound.isomeric_smiles
    print(smiles)
    return smiles

In [None]:
#%%writefile kegg_utils.py

import pubchempy as pc
import re
import requests

from bs4 import BeautifulSoup


def kegg_to_sid(url, timeout=30):
    """Fetch a KEGG compound page and return the PubChem SID shown on it.

    The SID is taken from the text of the first ``<a>`` element whose
    ``href`` matches ``https://pubchem.ncbi``.

    Parameters
    ----------
    url : str
        URL of the KEGG compound page.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up (default 30),
        so a stalled connection cannot hang the caller indefinitely.

    Returns
    -------
    str
        The link text, which is the SID.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page contains no PubChem link.
    """
    # access the url
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # turn the webpage into html
    soup = BeautifulSoup(response.content, 'html.parser')

    # find the link that contains 'pubchem'
    sid = soup.find('a', href=re.compile(r'https://pubchem\.ncbi'))
    if sid is None:
        # soup.find returns None when nothing matches; report that clearly
        # instead of failing later on ``None.string``.
        raise ValueError('No PubChem link found on page: {}'.format(url))

    return sid.string

In [None]:
# url of the desired KEGG compound page
url = 'https://www.genome.jp/dbget-bin/www_bget?cpd:C00180'
# access the url
response = requests.get(url)

# turn the webpage into html
soup = BeautifulSoup(response.content, 'html.parser')

# find the link that contains 'pubchem'
# (raw string: '\.' in a normal string literal is an invalid escape sequence
# and triggers a warning; the pattern itself is unchanged)
sid = soup.find('a', href=re.compile(r'https://pubchem\.ncbi'))

# print the string that is displayed as the link
# (this is the SID, which works with pubchempy to get the SMILES)
print(sid.string)

In [None]:
#%%writefile test_kegg_utils.py
import unittest

import kegg_utils


def test_kegg_to_sid():
    url_list = [
        'https://www.genome.jp/dbget-bin/www_bget?cpd:C00180',
        'https://www.genome.jp/dbget-bin/www_bget?cpd:C00587',
        'https://www.genome.jp/dbget-bin/www_bget?cpd:C00002']
    for url in url_list:
        sid_str = kegg_utils.kegg_to_sid(url)
        sid_str.isdigit(), 'SID contains characters other than numbers'
    return


def test_kegg_to_smiles():

    url = 'https://www.genome.jp/dbget-bin/www_bget?cpd:C00587'
    smiles = kegg_utils.kegg_to_sid(url)

    assert len(smiles) >= 1, 'SMILES string is very short. Check SMILES.'
    isinstance(smiles, str), 'SMILES not returned as string.'
    
    return

import pandas as pd
import pubchempy as pc

import kegg_utils

def test_kegg_df_to_smiles():
    """Compare kegg_utils.kegg_df_to_smiles output on a two-column frame with an expected three-column frame."""
    # NOTE(review): neither kegg_utils cell visible in this notebook defines
    # kegg_df_to_smiles; the only visible definition is in the
    # pubchem_client cell -- confirm which module this test should target.
    
    # Input: compound name and SID only.
    test_frame = pd.DataFrame([['ethanolamine', '3489'], ['pyruvate', '3324']], columns=['Compound Name', 'SID'])
    
    # Expected: the same two columns plus a SMILES column.
    expected_frame = pd.DataFrame([['ethanolamine', '3489', 'C(CO)N'], ['pyruvate', '3324', 'CC(=O)C(=O)O']], columns=['Compound Name', 'SID', 'SMILES'])
    
    result_frame = kegg_utils.kegg_df_to_smiles(test_frame)
    
    # NOTE(review): the pubchem_client version returns a (frame, list) tuple,
    # reads the SID from column position 2 and adds a CID column too, so
    # .equals() here presumably targets an older signature -- TODO confirm.
    assert result_frame.equals(expected_frame), 'Did not generate expected df.'
    
    return

TODO: use the Biopython KEGG API to pull the SID.

TODO: store the SMILES in a dataframe to join later.
