The purpose of this notebook is to parse the PubChem IDs from the KEGG compound master dataset and write them to a CSV file, so that our pubchem_client tool can pull the corresponding SMILES strings from PubChem.

In [1]:
# imports

import pandas as pd
from core import *

In [2]:
# read the gzipped KEGG compound database entries into a dataframe
# (create_kegg_df is not defined in this notebook; presumably it comes from the
# `from core import *` star import, and 'compound' selects the record type -- TODO confirm)

compound_df = create_kegg_df('../datasets/KEGG_compound_db_entries.gz', 'compound')

In [3]:
# parse the pubchem id out of each row's 'dblinks' value and store it in a new column
# (iterate over the column directly: iterrows() would build a full Series per row
# just to read this single field)

compound_df['pubchem_id'] = [parse_pubchem_ids(dblinks) for dblinks in compound_df['dblinks']]

In [4]:
# function to check lengths of pubchem ids in dataframe

def check_length_dist(dataframe, field):
    """Print how many entries of ``dataframe[field]`` have each length.

    One ``<length>: <count>`` line is printed per distinct length, in ascending
    order of length, followed by the total number of entries counted and the
    shape of ``dataframe``. Nothing is returned.

    Every entry of the column must support ``len()``.

    NOTE(review): relies on ``np`` already being in scope; the visible imports
    cell does not import numpy explicitly, so it presumably arrives through
    ``from core import *`` -- confirm.
    """
    lengths = list(map(len, dataframe[field]))

    # np.unique hands back the sorted distinct lengths together with their frequencies
    distinct_lengths, frequencies = np.unique(lengths, return_counts=True)

    for length, frequency in zip(distinct_lengths, frequencies):
        print("{}: {}".format(length, frequency))

    print("\ntotal numbers: {}\ndataframe shape: {}".format(frequencies.sum(), dataframe.shape))

In [5]:
# print how many rows have a pubchem_id of each length
# (a length of 0 means the pubchem_id for that row is empty)

check_length_dist(compound_df, 'pubchem_id')

0: 197
4: 5835
5: 4198
6: 813
7: 141
8: 3582
9: 3739

total numbers: 18505
dataframe shape: (18505, 9)


In [6]:
# drop the rows whose pubchem_id is the empty string, then show the remaining shape

compound_df = compound_df.loc[compound_df['pubchem_id'].ne('')]
compound_df.shape

(18308, 9)

In [13]:
# load the promiscuous-products table and collect the distinct values of its
# 'product' column as a plain list, then show how many there are
prom_com_df = pd.read_csv('promiscuous_products.csv')
prom_coms = pd.unique(prom_com_df['product']).tolist()
len(prom_coms)

1369

In [16]:

# keep only the compounds whose 'entry' value appears in prom_coms
# Series.isin performs the membership test in one vectorised call, replacing a
# per-row iterrows() loop that did a linear list lookup for every row
bool_mask = compound_df['entry'].isin(prom_coms)
selected_compound_df = compound_df[bool_mask]
print(selected_compound_df.shape)
selected_compound_df.head()

(1299, 9)


Unnamed: 0,dblinks,entry,enzyme,formula,mass,name,pathway,structures,pubchem_id
21,"[(CAS, [127-17-3]), (PubChem, [3324]), (ChEBI,...",C00022,"[1.1.1.27, 1.1.1.28, 1.1.1.38, 1.1.1.39, 1.1.1...",C3H4O3,,"[Pyruvate, Pyruvic acid, 2-Oxopropanoate, 2-Ox...","[(PATH, map00010, Glycolysis / Gluconeogenesis...",[],3324
23,"[(CAS, [72-89-9]), (PubChem, [3326]), (ChEBI, ...",C00024,"[1.1.1.-, 1.2.1.10, 1.2.1.18, 1.2.1.27, 1.2.1....",C23H38N7O17P3S,,"[Acetyl-CoA, Acetyl coenzyme A]","[(PATH, map00010, Glycolysis / Gluconeogenesis...",[],3326
24,"[(CAS, [56-86-0]), (PubChem, [3327]), (ChEBI, ...",C00025,"[1.2.1.88, 1.4.1.2, 1.4.1.3, 1.4.1.4, 1.4.1.13...",C5H9NO4,,"[L-Glutamate, L-Glutamic acid, L-Glutaminic ac...","[(PATH, map00220, Arginine biosynthesis), (PAT...",[],3327
25,"[(CAS, [328-50-7]), (PubChem, [3328]), (ChEBI,...",C00026,"[1.1.1.41, 1.1.1.42, 1.1.1.95, 1.1.1.286, 1.1....",C5H6O5,,"[2-Oxoglutarate, Oxoglutaric acid, 2-Ketogluta...","[(PATH, map00020, Citrate cycle (TCA cycle)), ...",[],3328
28,"[(CAS, [133-89-1]), (PubChem, [3331]), (ChEBI,...",C00029,"[1.1.1.22, 1.1.1.-, 2.4.1.11, 2.4.1.12, 2.4.1....",C15H24N2O17P2,,"[UDP-glucose, UDPglucose, UDP-D-glucose, Uridi...","[(PATH, map00040, Pentose and glucuronate inte...",[],3331


In [14]:
# preview the first rows of compound_df
compound_df.head()

Unnamed: 0,dblinks,entry,enzyme,formula,mass,name,pathway,structures,pubchem_id
0,"[(CAS, [7732-18-5]), (PubChem, [3303]), (ChEBI...",C00001,"[1.1.1.1, 1.1.1.22, 1.1.1.23, 1.1.1.115, 1.1.1...",H2O,,"[H2O, Water]","[(PATH, map00190, Oxidative phosphorylation), ...",[],3303
1,"[(CAS, [56-65-5]), (PubChem, [3304]), (ChEBI, ...",C00002,"[1.1.98.6, 1.2.1.30, 1.2.1.95, 1.2.1.101, 1.3....",C10H16N5O13P3,,"[ATP, Adenosine 5'-triphosphate]","[(PATH, map00190, Oxidative phosphorylation), ...",[],3304
2,"[(CAS, [53-84-9]), (PubChem, [3305]), (ChEBI, ...",C00003,"[1.1.1.1, 1.1.1.3, 1.1.1.4, 1.1.1.6, 1.1.1.7, ...",C21H28N7O14P2,,"[NAD+, NAD, Nicotinamide adenine dinucleotide,...","[(PATH, map00190, Oxidative phosphorylation), ...",[],3305
3,"[(CAS, [58-68-4]), (PubChem, [3306]), (ChEBI, ...",C00004,"[1.1.1.1, 1.1.1.3, 1.1.1.4, 1.1.1.6, 1.1.1.7, ...",C21H29N7O14P2,,"[NADH, DPNH, Reduced nicotinamide adenine dinu...","[(PATH, map00190, Oxidative phosphorylation), ...",[],3306
4,"[(CAS, [2646-71-1]), (PubChem, [3307]), (ChEBI...",C00005,"[1.1.1.1, 1.1.1.2, 1.1.1.3, 1.1.1.10, 1.1.1.19...",C21H30N7O17P3,,"[NADPH, TPNH, Reduced nicotinamide adenine din...","[(PATH, map00195, Photosynthesis), (PATH, map0...",[],3307


In [7]:
# write the compound dataframe (including the parsed pubchem_id column) to a csv file
# NOTE(review): to_csv is called with its defaults, so the dataframe index is written
# as an extra unnamed first column -- confirm the downstream reader expects that

compound_df.to_csv('../datasets/KEGG_compounds_no_SMILES.csv')

In [31]:
# list the contents of the datasets directory (shell command, requires `ls`)
!ls ../datasets/

KEGG_compound.csv
KEGG_compound_db_entries.csv
KEGG_compound_db_entries.gz
KEGG_enzymes_all_data.gz
KEGG_reaction_db_entries.tar.gz
cleaned_promiscous_enzyme_df.csv
cofactor_list.csv
df_cleaned_kegg_with_smiles.csv
playground_df_cleaned_kegg_with_smiles.csv
promiscuous_cleaned_KEGGtoPubChem.csv
promiscuous_enzyme.csv
promiscuous_enzyme_list.xlsx
pubchem_ids_promiscuous_enzyme_products.csv
substrate_product_combined_promiscuous.csv
vectorized_enzyme.csv
