The purpose of this notebook is to parse the PubChem IDs from the kegg_compound master dataset and write them to a CSV file, so that our pubchem_client tool can pull the corresponding SMILES strings from PubChem.

In [1]:
# imports

import pandas as pd
from core import *

In [23]:
# load the gzipped KEGG compound entries into a dataframe

kegg_compound_archive = '../datasets/KEGG_compound_db_entries.gz'
kegg_entry_kind = 'compound'
compound_df = create_kegg_df(kegg_compound_archive, kegg_entry_kind)

In [24]:
# parse pubchem id from each row in compound dataframe

# iterate over the 'dblinks' column directly instead of DataFrame.iterrows(),
# which builds a full Series for every row just to read a single field
compound_df['pubchem_id'] = [parse_pubchem_ids(dblinks) for dblinks in compound_df['dblinks']]

In [25]:
# function to check lengths of pubchem ids in dataframe

def check_length_dist(dataframe, field):
    """Print how many entries of a dataframe column have each length.

    One "<length>: <count>" line is printed per distinct length, in
    ascending order of length, followed by the total number of entries
    counted and the shape of the dataframe.

    Args:
        dataframe: pandas DataFrame holding the column to inspect.
        field: label of the column whose entries are measured with len().

    Returns:
        None; the distribution is only printed.

    Raises:
        TypeError: if an entry of the column does not support len()
            (for example a float NaN).
    """
    # Only pandas methods are used here, so the function does not rely on
    # numpy happening to be in the namespace (the imports cell of this
    # notebook does not import it explicitly).
    length_counts = dataframe[field].map(len).value_counts().sort_index()

    for length, count in length_counts.items():
        print("{}: {}".format(length, count))

    print("\ntotal numbers: {}\ndataframe shape: {}".format(length_counts.sum(), dataframe.shape))

In [26]:
# print the length distribution of the parsed pubchem ids

check_length_dist(dataframe=compound_df, field='pubchem_id')

0: 197
4: 5835
5: 4198
6: 813
7: 141
8: 3582
9: 3739

total numbers: 18505
dataframe shape: (18505, 9)


In [29]:
# drop the rows whose pubchem id is the empty string

has_pubchem_id = compound_df['pubchem_id'] != ''
compound_df = compound_df.loc[has_pubchem_id]
compound_df.shape

(18308, 9)

In [32]:
# save the filtered compound dataframe as a csv file

compound_df.to_csv(path_or_buf='../datasets/KEGG_compounds_no_SMILES.csv')

In [31]:
# list the contents of the datasets directory
!ls ../datasets/

KEGG_compound.csv
KEGG_compound_db_entries.csv
KEGG_compound_db_entries.gz
KEGG_enzymes_all_data.gz
KEGG_reaction_db_entries.tar.gz
cleaned_promiscous_enzyme_df.csv
cofactor_list.csv
df_cleaned_kegg_with_smiles.csv
playground_df_cleaned_kegg_with_smiles.csv
promiscuous_cleaned_KEGGtoPubChem.csv
promiscuous_enzyme.csv
promiscuous_enzyme_list.xlsx
pubchem_ids_promiscuous_enzyme_products.csv
substrate_product_combined_promiscuous.csv
vectorized_enzyme.csv
