**This notebook contains a function that fetches a SMILE string starting from a KEGG compound page. It also contains the associated unittests.**

USE KEGG_DF_TO_SMILES and SID_TO_SMILES

All of the unit tests are passing as of 3/5/19 at 7:30 pm. 

This notebook contains the basis of a function(s) to take a PubChem ID number and fetch the associated SMILES string from PubChem.

It also contains code pieces to pull an SID from a KEGG webpage.

In [1]:
import numpy as np
import pandas as pd
import pubchempy as pc


There are multiple identifier types for each chemical in PubChem. The two we are interacting with here are **SID** (substance ID) and **CID** (chemical ID). CID can be used to acces SMILES directly with PubChemPy. **KEGG does not have CID**, only SID. SID can be turned into CID from which SMILES can be found. 

### Get SMILES from CID and SID

__________________

### Manipulate DF containing SID into SMILES

In [5]:
#%%writefile pubchem_client.py

import numpy as np
import pandas as pd
import pubchempy as pc


def sid_to_smiles(sid):
    """Takes a PubChem SID. Returns the associated isomeric SMILES string and PubChem CID.

    Args:
        sid : The PubChem SID number.

    Returns:
        str: isomeric smiles.
        int: Pubchem CID number.

    """

    substance = pc.Substance.from_sid(sid)
    cid = substance.standardized_cid
    compound = pc.get_compounds(cid)[0]

    return compound.isomeric_smiles, cid


def kegg_df_to_smiles(kegg_df, column_name):
    """
    Args:
        kegg_df : pandas dataframe with SID numbers
        column_name (str) : name of column that contains PubChem SID numbers

    Returns:
        kegg_df : modified with columns containing CID and SMILES
                  CID becomes first column, SMILES second
        unsuccessful_list : list of SIDs for which no CID or SMILES were found

    """

    res = []
    cid_list = []
    unsuccessful_list = []
    #copy = kegg_df.copy()
    for i in range(len(kegg_df)):
        # cell index of desired SID
        sid = kegg_df.loc[i, column_name]
        try:
            smile_result = sid_to_smiles(sid)[0]
            res.append(smile_result)
            cid_result = sid_to_smiles(sid)[1]
            cid_list.append(cid_result)
        except BaseException:
            res.append('none')
            cid_list.append('none')
            unsuccessful_list.append(sid)
            pass
        
    #kegg_df['CID'] = cid_list
    #kegg_df['SMILES'] = res
    kegg_df.insert(0, column='CID', value=cid_list)
    # Change this 2 to the number where the smiles column should be
    kegg_df.insert(1, column='SMILES', value=res)
    # kegg_df.to_csv(r'../datasets/df_cleaned_kegg_with_smiles.csv')
    
    return kegg_df, unsuccessful_list


def csv_wrapper(input_csv, column_name, output_csv):
    """
    Args:
        input_csv (str) : 'input_csv.csv' format; in current directory
        column_name (str) : name of column that contains PubChem SID numbers
        output_csv (str) : 'output_csv.csv' format
    
    Returns:
        output_csv : saves as 'output_csv.csv' into current directory
    """
    
    input_df = pd.read_csv(input_csv).astype(str)
#     input_df['reacts'] = input_df['reacts'].astype(float)
    output, _ = kegg_df_to_smiles(input_df, column_name)

    return output.to_csv(r'' + output_csv, index=False)

In [6]:
!ls ../datasets/KEGG_compounds_no_SMILES.csv

../datasets/KEGG_compounds_no_SMILES.csv


In [7]:
csv_wrapper('../datasets/KEGG_compounds_no_SMILES.csv', 'pubchem_id', '../datasets/KEGG_compounds_pubchem_SMILES.csv' )

ValueError: Length of values does not match length of index

In [128]:
#%%writefile test_pubchem_client.py

import pandas as pd

from pandas.util.testing import assert_frame_equal

import pubchem_client


def test_sid_to_smiles():
    """Unit test for pubchem_client.py sid_to_smiles."""

    sids = ['3489', '3990']
    expected = ['C(CO)N', 'C1CSSC1CCCCC(=O)O']
    actual = []

    for sid in sids:
        result_smile = pubchem_client.sid_to_smiles(sid)

        assert len(
            result_smile) >= 1, 'SMILES string is very short. Check SMILES.'
        isinstance(result_smile, str), 'SMILES not returned as string.'

        actual.append(result_smile[0])

    assert expected == actual, 'Actual SMILES are not the expected SMILES.'

    return


def test_kegg_df_to_smiles():
    """Unit test for pubchem_client.py kegg_df_to_smiles."""

    test_frame = pd.DataFrame([['space fill', 'ethanolamine', '1.0', '3489'], [
                              'space fill', 'pyruvate', '1.0', '3324']], columns=['Filler', 'Compound Name', 'Reacts', 'SID'])

    expected_frame = pd.DataFrame([[int(700),
                                    'C(CO)N',
                                    'space fill',
                                    'ethanolamine',
                                    '1.0',
                                    '3489'
                                    ],
                                   [int(1060),
                                    'CC(=O)C(=O)O',
                                    'space fill',
                                    'pyruvate',
                                    '1.0',
                                    '3324',
                                    ]],
                                  columns=['CID',
                                           'SMILES',
                                           'Filler',
                                           'Compound Name',
                                           'Reacts',
                                           'SID',
                                           ])
    column_name = 'SID'
    result_frame = pubchem_client.kegg_df_to_smiles(test_frame, column_name)

    assert_frame_equal(
        result_frame[0], expected_frame), 'Did not generate expected df.'

    return


def test_csv_wrapper():
    """Unit test for csv_wrapper function."""
    
    filler = ['fill', 'fill', 'fill', 'fill']
    sids = ['3305', '3333', '3480', '3371']
    compounds = ['NAD', 'glucose', 'benzoic acid', 'unknown']
    react = ['1.0', '0.0','1.0', '0.0']
    tuple_list = list(zip(filler, compounds, react, sids))
    test_df = pd.DataFrame(tuple_list, columns=['enzyme', 'product', 'reacts', 'pubchem_id'])
    
    test_df.to_csv(r'test_df.csv', index=False)
    
    expected = pd.DataFrame([['5893',
  'C1=CC(=C[N+](=C1)[C@H]2[C@@H]([C@@H]([C@H](O2)COP(=O)(O)OP(=O)(O)OC[C@@H]3[C@H]([C@H]([C@@H](O3)N4C=NC5=C4N=CN=C5N)O)O)O)O)C(=O)N',
  'fill',
  'NAD',
  1.0,
  3305],
 ['5793',
  'C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O',
  'fill',
  'glucose',
  0.0,
  3333],
 ['243', 'C1=CC=C(C=C1)C(=O)O', 'fill', 'benzoic acid', 1.0, 3480],
 ['none', 'none', 'fill', 'unknown', 0.0, 3371]], columns=['CID', 'SMILES', 'enzyme', 'product', 'reacts', 'pubchem_id'])
    
    column_name = 'pubchem_id'
    
    pubchem_client.csv_wrapper('test_df.csv', column_name, 'wrapper_test.csv')
    
    actual = pd.read_csv('wrapper_test.csv')
    
    assert expected.loc[1, 'reacts'] == actual.loc[1, 'reacts']
    assert expected.loc[2, 'SMILES'] == actual.loc[2, 'SMILES']
    
    return

Overwriting test_kegg_data.py
