### The purpose of this notebook is to complete a data cleaning workflow from start to finish in order to validate the core functionality of our package

In [62]:
# imports

import pandas as pd
from core import *
from create_cpd_info import *
# import gzip
# 
# from Bio.KEGG import REST
# from Bio.KEGG import Enzyme
# import re
# from Bio.KEGG import Compound
# import numpy as np

## Step 1
#### Generate dataframe of all current kegg enzymes from zipped text file

In [10]:
# create_kegg_df(): build the master enzyme dataframe from the gzipped KEGG dump.
# The second argument selects the KEGG record type to parse (here: 'enzyme').

enzyme_df = create_kegg_df(
    '../datasets/KEGG_enzymes_all_data.gz',
    'enzyme',
)
print(enzyme_df.shape)  # (rows, columns) sanity check
enzyme_df.head()

(7524, 16)


Unnamed: 0,classname,cofactor,comment,dblinks,disease,effector,entry,genes,inhibitor,name,pathway,product,reaction,structures,substrate,sysname
0,"[Oxidoreductases;, Acting on the CH-OH group o...",[],[A zinc protein. Acts on primary or secondary ...,"[(ExplorEnz - The Enzyme Database, [1.1.1.1]),...",[],[],1.1.1.1,"[(HSA, [124, 125, 126, 127, 128, 130, 131]), (...",[],"[alcohol dehydrogenase, aldehyde reductase, AD...","[(PATH, ec00010, Glycolysis / Gluconeogenesis)...","[aldehyde [CPD:C00071], NADH [CPD:C00004], H+ ...",[(1) a primary alcohol + NAD+ = an aldehyde + ...,[],"[primary alcohol [CPD:C00226], NAD+ [CPD:C0000...",[alcohol:NAD+ oxidoreductase]
1,"[Oxidoreductases;, Acting on the CH-OH group o...",[],[A zinc protein. Some members of this group ox...,"[(ExplorEnz - The Enzyme Database, [1.1.1.2]),...",[],[],1.1.1.2,"[(HSA, [10327]), (PTR, [741418]), (PPS, [10099...",[],"[alcohol dehydrogenase (NADP+), aldehyde reduc...","[(PATH, ec00010, Glycolysis / Gluconeogenesis)...","[aldehyde [CPD:C00071], NADPH [CPD:C00005], H+...",[an alcohol + NADP+ = an aldehyde + NADPH + H+...,[],"[alcohol [CPD:C00069], NADP+ [CPD:C00006]]",[alcohol:NADP+ oxidoreductase]
2,"[Oxidoreductases;, Acting on the CH-OH group o...",[],[The yeast enzyme acts most rapidly with NAD+;...,"[(ExplorEnz - The Enzyme Database, [1.1.1.3]),...",[],[],1.1.1.3,"[(NVE, [NEMVE_v1g225948]), (ATH, [AT1G31230, A...",[],"[homoserine dehydrogenase, HSDH, HSD]","[(PATH, ec00260, Glycine, serine and threonine...","[L-aspartate 4-semialdehyde [CPD:C00441], NADH...",[L-homoserine + NAD(P)+ = L-aspartate 4-semial...,[],"[L-homoserine [CPD:C00263], NAD+ [CPD:C00003],...",[L-homoserine:NAD(P)+ oxidoreductase]
3,"[Oxidoreductases;, Acting on the CH-OH group o...",[],[Also converts diacetyl into acetoin with NADH...,"[(ExplorEnz - The Enzyme Database, [1.1.1.4]),...",[],[],1.1.1.4,"[(SCE, [YAL060W, YAL061W]), (KLA, [KLLA0_F0050...",[],"[(R,R)-butanediol dehydrogenase, butyleneglyco...","[(PATH, ec00650, Butanoate metabolism)]","[(R)-acetoin [CPD:C00810], NADH [CPD:C00004], ...","[(R,R)-butane-2,3-diol + NAD+ = (R)-acetoin + ...",[],"[(R,R)-butane-2,3-diol [CPD:C03044], NAD+ [CPD...","[(R,R)-butane-2,3-diol:NAD+ oxidoreductase]"
4,"[Oxidoreductases;, Acting on the CH-OH group o...",[],[Transferred entry: acetoin dehydrogenase. Now...,[],[],[],1.1.1.5,[],[],[Transferred to 1.1.1.303 and 1.1.1.304],[],[],[],[],[],[]


## Step 2
#### Down select promiscuous enzymes from master enzyme dataframe

In [11]:
# select_promiscuous_enzymes(): down-select the master enzyme dataframe to the
# promiscuous enzymes only.

prom_df = select_promiscuous_enzymes(
    enzyme_df,
)
print(prom_df.shape)  # (rows, columns) sanity check
prom_df.head()

(549, 4)


Unnamed: 0,entry,reaction,product,substrate
0,1.1.1.1,[(1) a primary alcohol + NAD+ = an aldehyde + ...,"[aldehyde [CPD:C00071], NADH [CPD:C00004], H+ ...","[primary alcohol [CPD:C00226], NAD+ [CPD:C0000..."
37,1.1.1.38,[(1) (S)-malate + NAD+ = pyruvate + CO2 + NADH...,"[pyruvate [CPD:C00022], CO2 [CPD:C00011], NADH...","[(S)-malate [CPD:C00149], NAD+ [CPD:C00003], o..."
39,1.1.1.40,[(1) (S)-malate + NADP+ = pyruvate + CO2 + NAD...,"[pyruvate [CPD:C00022], CO2 [CPD:C00011], NADP...","[(S)-malate [CPD:C00149], NADP+ [CPD:C00006], ..."
41,1.1.1.42,[isocitrate + NADP+ = 2-oxoglutarate + CO2 + N...,"[2-oxoglutarate [CPD:C00026], CO2 [CPD:C00011]...","[isocitrate [CPD:C00311], NADP+ [CPD:C00006], ..."
84,1.1.1.85,"[(2R,3S)-3-isopropylmalate + NAD+ = 4-methyl-2...","[4-methyl-2-oxopentanoate [CPD:C00233], CO2 [C...","[(2R,3S)-3-isopropylmalate [CPD:C04411], NAD+ ..."


## Step 3
#### Combine substrates and products to account for reversible reactions

In [12]:
# combine_substrates_products(): fold substrates and products together so that
# reversible reactions are accounted for.

combo_df = combine_substrates_products(
    prom_df,
)
print(combo_df.shape)  # (rows, columns) sanity check
combo_df.head()

(549, 2)


Unnamed: 0,entry,product
0,1.1.1.1,"[aldehyde [CPD:C00071], NADH [CPD:C00004], H+ ..."
1,1.1.1.38,"[pyruvate [CPD:C00022], CO2 [CPD:C00011], NADH..."
2,1.1.1.40,"[pyruvate [CPD:C00022], CO2 [CPD:C00011], NADP..."
3,1.1.1.42,"[2-oxoglutarate [CPD:C00026], CO2 [CPD:C00011]..."
4,1.1.1.85,"[4-methyl-2-oxopentanoate [CPD:C00233], CO2 [C..."


## Step 4

#### Expand dataframe so that each row is a unique enzyme-product pair
There are multiple compounds in the 'product' field of the dataframe. This function parses each of those, and for each provides a new enzyme-product pair.

In [13]:
# explode_dataframe(): expand the list-valued 'product' field so that every row
# is a single enzyme-compound pair; parse_compound_ids is the parser applied to
# that field and 'entry' is carried onto each new row.

exploded_df = explode_dataframe(
    combo_df,
    parse_compound_ids,
    'product',
    ['entry'],
)
print(exploded_df.shape)  # (rows, columns) sanity check
exploded_df.head()

(3697, 2)


Unnamed: 0,entry,product
0,1.1.1.1,C00071
1,1.1.1.1,C00004
2,1.1.1.1,C00080
3,1.1.1.1,C01450
4,1.1.1.1,C00226


## Step 5
#### Remove cofactors from dataframe
We have curated a list of 37 common cofactors and reactant molecules that participate in reactions but are not products that we care to train our model on. This step removes cofactor data from our master dataset.

In [35]:
# remove_cofactors(): load the curated cofactor list and strip those compounds
# out of the exploded enzyme-compound table.
# Presumably 'product' / 'CPD' name the compound-id columns of the two frames
# that are matched against each other -- TODO confirm against core.

cofactors_df = pd.read_csv('../datasets/cofactor_list.csv')
clean_df = remove_cofactors(
    exploded_df, 'product',
    cofactors_df, 'CPD',
)
print(clean_df.shape)  # (rows, columns) sanity check
clean_df.head()

(2144, 2)


Unnamed: 0,entry,product
0,1.1.1.1,C00071
3,1.1.1.1,C01450
4,1.1.1.1,C00226
6,1.1.1.1,C01612
7,1.1.1.38,C00022


## Step 6
#### Get SMILES strings for each product from the PubChem database
The RDKit chemistry package requires SMILES strings as input. The KEGG database does not store SMILES strings for compounds, so this step joins a previously curated dataset of SMILES strings into our master dataset.

In [86]:
# Load the curated KEGG -> SMILES lookup table, keeping one row per SMILES string.
# NOTE(review): de-duplicating on 'SMILES' keeps only the first KEGG compound for
# any repeated SMILES value, so the other compounds sharing it drop out of the
# inner merge below -- confirm this is intended.
smiles_df = pd.read_csv('../datasets/df_cleaned_kegg_with_smiles.csv').drop_duplicates(subset='SMILES')

# Join SMILES onto the cleaned enzyme-compound pairs, discard bookkeeping columns
# and compounds without a SMILES string ('none'), then tidy names and index.
master_df = (
    clean_df
    .merge(smiles_df, how='inner', left_on='product', right_on='KEGG')
    .drop(columns=['Unnamed: 0', 'entry_y', 'KEGG', 'CID'])
    .loc[lambda frame: frame['SMILES'] != 'none']
    .rename(columns={
        'entry_x': 'kegg_enzyme',
        'product': 'kegg_compound',
        'PubChem': 'pubchem_compound',
    })
    .reset_index(drop=True)
)
print(master_df.shape)  # (rows, columns) sanity check
master_df.head()

(1707, 4)


Unnamed: 0,kegg_enzyme,kegg_compound,pubchem_compound,SMILES
0,1.1.1.38,C00022,3324,CC(=O)C(=O)O
1,1.1.1.40,C00022,3324,CC(=O)C(=O)O
2,1.2.3.15,C00022,3324,CC(=O)C(=O)O
3,1.14.11.43,C00022,3324,CC(=O)C(=O)O
4,1.14.11.44,C00022,3324,CC(=O)C(=O)O


## Step 7 
#### Get dummy variables to represent enzyme class

In [89]:
# vectorize enzyme class

# NOTE(review): reference copy only -- every line in this cell is commented out.
# The next cell calls binarize_enzyme_class() without this definition being
# active, so the live version presumably comes from one of the star imports
# (core / create_cpd_info) -- TODO confirm, then delete this cell.
# The original draft iterated over the global `master_df` instead of the
# `dataframe` argument; that is corrected in the copy below.

# def binarize_enzyme_class(dataframe, column):
#     """
#     binarize_enzyme_class() converts the enzyme class into binary dummy variables
#         that are appended onto the input dataframe
#
#     The enzyme class is taken as the first character of the kegg enzyme id.
#
#     Args:
#         dataframe (pandas.DataFrame): input dataset
#         column (str): column name containing kegg enzyme id
#
#     Returns:
#         pandas.DataFrame: with one dummy column appended per enzyme class found
#             (seven in the current data)
#     """
#     dataframe['enzyme_class'] = [row[column][0] for _, row in dataframe.iterrows()]
#     dataframe = pd.get_dummies(dataframe, columns=['enzyme_class'])
#     return dataframe

In [90]:
# binarize_enzyme_class(): append dummy (0/1) columns for the enzyme class
# derived from the 'kegg_enzyme' column.
# NOTE(review): master_df is rebound from itself, so re-running this cell feeds
# the already-expanded frame back in; re-run the Step 6 cell first.

master_df = binarize_enzyme_class(
    master_df,
    'kegg_enzyme',
)
print(master_df.shape)  # (rows, columns) sanity check
master_df.head()

(1707, 11)


Unnamed: 0,kegg_enzyme,kegg_compound,pubchem_compound,SMILES,enzyme_class_1,enzyme_class_2,enzyme_class_3,enzyme_class_4,enzyme_class_5,enzyme_class_6,enzyme_class_7
0,1.1.1.38,C00022,3324,CC(=O)C(=O)O,1,0,0,0,0,0,0
1,1.1.1.40,C00022,3324,CC(=O)C(=O)O,1,0,0,0,0,0,0
2,1.2.3.15,C00022,3324,CC(=O)C(=O)O,1,0,0,0,0,0,0
3,1.14.11.43,C00022,3324,CC(=O)C(=O)O,1,0,0,0,0,0,0
4,1.14.11.44,C00022,3324,CC(=O)C(=O)O,1,0,0,0,0,0,0


## Step 8
#### Add in compound features with RDKit
This step uses the RDKit package to generate descriptive features of the compounds.

**TODO:** this step has no code in the notebook yet.

## Step 9
#### Curate negative dataset
So far our curated dataset includes only examples of enzyme-product pairs that are known to react. In order to train our model, we need to include negative examples of enzyme-product pairs not expected to react. This function artificially pairs enzymes and products that are not known to react, and selects a subsample of these negative pairs to include in the master dataset.

In [15]:
# create_negative_matches(): split out the known (positive) enzyme-compound
# pairs and generate artificial (negative) pairs -- good to go.
# NOTE(review): the input is clean_df (Step 5), not master_df, so the SMILES and
# enzyme-class columns from Steps 6-7 are not carried along -- confirm intended.

pos_df, neg_df = create_negative_matches(
    clean_df,
    'entry',
    'product',
)

In [16]:
# Size of the positive set returned by create_negative_matches().
pos_df.shape

(2144, 3)

In [17]:
# Size of the negative set returned by create_negative_matches().
neg_df.shape

(711105, 3)

In [18]:
# Concatenate negative & positive data into a single frame (rows stacked).
# ignore_index=True gives the result a fresh 0..n-1 index; without it the row
# labels of pos_df and neg_df are carried over as-is and may repeat, which makes
# later label-based selection (.loc) ambiguous.
# NOTE(review): this rebinds `master_df`, replacing the SMILES / enzyme-class
# frame built in Steps 6-7 -- confirm that is intended.

master_df = pd.concat((pos_df, neg_df), axis=0, ignore_index=True)

In [19]:
# Combined size; the row count should equal the positive plus negative row counts.
master_df.shape

(713249, 3)