### This notebook provides a template for connecting to the KEGG REST API through Biopython, and takes a first look at the list of enzymes in the database.

#### References: 

https://biopython.readthedocs.io/en/latest/Tutorial/chapter_kegg.html

http://biopython.org/DIST/docs/api/Bio.KEGG.REST-module.html

https://exploringlifedata.blogspot.com/

In [1]:
# imports

from Bio.KEGG import REST
from Bio.KEGG import Enzyme


import pandas as pd

In [2]:
# fetch the listing of every enzyme currently in KEGG and read the response body into a string

enzyme_listing_handle = REST.kegg_list("enzyme")
enzyme = enzyme_listing_handle.read()

In [3]:
# Parse the text buffer into a list of all enzyme identifiers in KEGG.
# Each line is expected to look like "<entry>\t<description>"; only the entry is kept.

enzyme_entry = []
for line in enzyme.rstrip().split("\n"):
    if not line.strip():
        # an empty response (or a stray blank line) carries no entry to record
        continue
    # partition never raises, unlike a two-name unpack of split("\t"), which
    # fails on a line that has no tab or more than one tab
    entry = line.partition("\t")[0]
    enzyme_entry.append(entry)

In [4]:
# number of enzyme identifiers collected in enzyme_entry
len(enzyme_entry)

7524

In [5]:
# alternative to the manual parsing above: load the enzyme listing straight into a pandas dataframe

listing_columns = ['EC_number', 'description']
all_enzymes_df = pd.read_csv(
    REST.kegg_list('enzyme'),
    sep='\t',
    header=None,
    names=listing_columns,
)

In [6]:
# preview the first rows of the enzyme listing
all_enzymes_df.head()

Unnamed: 0,EC_number,description
0,ec:1.1.1.1,alcohol dehydrogenase; aldehyde reductase; ADH...
1,ec:1.1.1.2,alcohol dehydrogenase (NADP+); aldehyde reduct...
2,ec:1.1.1.3,homoserine dehydrogenase; HSDH; HSD
3,ec:1.1.1.4,"(R,R)-butanediol dehydrogenase; butyleneglycol..."
4,ec:1.1.1.5,Transferred to 1.1.1.303 and 1.1.1.304


In [7]:
# example for one entry: fetch a single record and show how its ALL_REAC line
# breaks apart when split on single spaces

test_handle = REST.kegg_get('ec:1.1.1.85')
enzyme_file_test = test_handle.read()
record_lines = enzyme_file_test.rstrip().split("\n")
for all_reac_line in (text for text in record_lines if text.startswith("ALL_REAC")):
    print(all_reac_line.rstrip().split(" "))

['ALL_REAC', '', '', '', 'R01652', 'R04426', 'R10052;']


In [None]:
# Collect every enzyme whose ALL_REAC field lists more than MIN_REACTIONS reaction IDs.
# Note: this is time intensive, because each entry is fetched with its own request.

MIN_REACTIONS = 5  # an entry is kept only when it has MORE than this many reaction IDs


def count_all_reac_ids(entry_text):
    """Count the distinct reaction IDs in the ALL_REAC field of one KEGG record.

    The field starts on the line beginning with "ALL_REAC" and continues on
    the following lines for as long as they begin with whitespace. Only tokens
    shaped like a reaction ID ("R" followed by digits, e.g. "R01652") are
    counted, so the keyword itself, column padding, trailing ";" separators
    and any other non-ID tokens do not inflate the result.

    Parameters
    ----------
    entry_text : str
        Raw text of a single enzyme record as returned by REST.kegg_get(...).read().

    Returns
    -------
    int
        Number of distinct reaction IDs found; 0 when the field is absent.
    """
    reaction_ids = set()
    in_field = False
    for line in entry_text.split("\n"):
        if line.startswith("ALL_REAC"):
            in_field = True
            tokens = line[len("ALL_REAC"):].split()
        elif in_field and line[:1].isspace():
            # indented line directly after the field header: continuation of ALL_REAC
            tokens = line.split()
        else:
            # any other line (a new field or a blank line) ends the ALL_REAC field
            in_field = False
            continue
        for token in tokens:
            token = token.rstrip(";")
            if token.startswith("R") and token[1:].isdigit():
                reaction_ids.add(token)
    return len(reaction_ids)


promiscuous_enzyme_entry = []
for entry in enzyme_entry:
    enzyme_file = REST.kegg_get(entry).read()
    # count real reaction IDs instead of raw space-separated tokens, whose
    # number depends on the column padding after the field name
    if count_all_reac_ids(enzyme_file) > MIN_REACTIONS:
        promiscuous_enzyme_entry.append(entry)
        print(entry)

In [9]:
# number of enzymes collected in promiscuous_enzyme_entry
len(promiscuous_enzyme_entry)


3

In [10]:
# alternative to the manual text handling above: let the Enzyme entry parser
# from the Bio.KEGG library read the record

example_handle = REST.kegg_get('ec:1.1.1.1')
example_enzyme_entry = Enzyme.read(example_handle)

In [13]:
# print the text form of the parsed example record
print(example_enzyme_entry)

ENTRY       EC 1.1.1.1
NAME        alcohol dehydrogenase
            aldehyde reductase
            ADH
            alcohol dehydrogenase (NAD)
            aliphatic alcohol dehydrogenase
            ethanol dehydrogenase
            NAD-dependent alcohol dehydrogenase
            NAD-specific aromatic alcohol dehydrogenase
            NADH-alcohol dehydrogenase
            NADH-aldehyde dehydrogenase
            primary alcohol dehydrogenase
            yeast alcohol dehydrogenase
CLASS       Oxidoreductases;
            Acting on the CH-OH group of donors;
            With NAD+ or NADP+ as acceptor
SYSNAME     alcohol:NAD+ oxidoreductase
REACTION    (1) a primary alcohol + NAD+ = an aldehyde + NADH + H+ [RN:R00623]
            (2) a secondary alcohol + NAD+ = a ketone + NADH + H+ [RN:R00624]
SUBSTRATE   primary alcohol [CPD:C00226]
            NAD+ [CPD:C00003]
            secondary alcohol [CPD:C01612]
PRODUCT     aldehyde [CPD:C00071]
            NADH [CPD:C00004]
            H+ [C

In [14]:
# list every public attribute name (no leading underscore) of the parsed enzyme entry

enzyme_fields = [
    attr_name
    for attr_name in dir(example_enzyme_entry)
    if attr_name[:1] != '_'
]

print(enzyme_fields)

['classname', 'cofactor', 'comment', 'dblinks', 'disease', 'effector', 'entry', 'genes', 'inhibitor', 'name', 'pathway', 'product', 'reaction', 'structures', 'substrate', 'sysname']


In [15]:
# assemble a one-row dataframe holding every field of the example entry

example_row = [getattr(example_enzyme_entry, field_name) for field_name in enzyme_fields]
enzyme_matrix = [example_row]

df_enzymes = pd.DataFrame(enzyme_matrix, columns=enzyme_fields)

In [16]:
# preview the dataframe built from the example entry
df_enzymes.head()

Unnamed: 0,classname,cofactor,comment,dblinks,disease,effector,entry,genes,inhibitor,name,pathway,product,reaction,structures,substrate,sysname
0,"[Oxidoreductases;, Acting on the CH-OH group o...",[],[A zinc protein. Acts on primary or secondary ...,"[(ExplorEnz - The Enzyme Database, [1.1.1.1]),...",[],[],1.1.1.1,"[(HSA, [124, 125, 126, 127, 128, 130, 131]), (...",[],"[alcohol dehydrogenase, aldehyde reductase, AD...","[(PATH, ec00010, Glycolysis / Gluconeogenesis)...","[aldehyde [CPD:C00071], NADH [CPD:C00004], H+ ...",[(1) a primary alcohol + NAD+ = an aldehyde + ...,[],"[primary alcohol [CPD:C00226], NAD+ [CPD:C0000...",[alcohol:NAD+ oxidoreductase]


In [19]:
# inspect the reaction field stored in the row labelled 0 of the dataframe

df_enzymes.loc[0, 'reaction']

['(1) a primary alcohol + NAD+ = an aldehyde + NADH + H+ [RN:R00623]',
 '(2) a secondary alcohol + NAD+ = a ketone + NADH + H+ [RN:R00624]']

In [46]:
# Build a table of all enzyme information from the database, one row per enzyme record.
# Warning: this cell is time intensive. REST.kegg_get accepts a list of up to
# 10 identifiers per request, so entries are fetched in batches and Enzyme.parse
# splits each combined response back into individual records.
# reference: http://biopython.org/DIST/docs/api/Bio.KEGG.REST-module.html#kegg_get

KEGG_GET_BATCH_SIZE = 10  # kegg_get rejects lists longer than 10 identifiers

enzyme_matrix = []
ec_numbers = all_enzymes_df.EC_number.tolist()
total_entries = len(ec_numbers)

for index in range(0, total_entries, KEGG_GET_BATCH_SIZE):
    # progress report every 100 entries, printed before the batch is requested
    if index % 100 == 0:
        print("{} enzymes complete, {} remaining".format(index, total_entries - index))
    batch = ec_numbers[index:index + KEGG_GET_BATCH_SIZE]
    handle = REST.kegg_get(batch)
    try:
        for record in Enzyme.parse(handle):
            enzyme_matrix.append([getattr(record, field) for field in enzyme_fields])
    finally:
        # release the HTTP response even if parsing fails part-way through
        handle.close()

# rows are built from the returned records, so a mismatch means the server
# returned a different number of records than was requested
if len(enzyme_matrix) != total_entries:
    print("warning: requested {} entries but parsed {} records".format(total_entries, len(enzyme_matrix)))

0 enzymes complete, 7524 remaining
100 enzymes complete, 7424 remaining
200 enzymes complete, 7324 remaining
300 enzymes complete, 7224 remaining
400 enzymes complete, 7124 remaining
500 enzymes complete, 7024 remaining
600 enzymes complete, 6924 remaining
700 enzymes complete, 6824 remaining
800 enzymes complete, 6724 remaining
900 enzymes complete, 6624 remaining
1000 enzymes complete, 6524 remaining
1100 enzymes complete, 6424 remaining
1200 enzymes complete, 6324 remaining
1300 enzymes complete, 6224 remaining
1400 enzymes complete, 6124 remaining
1500 enzymes complete, 6024 remaining
1600 enzymes complete, 5924 remaining
1700 enzymes complete, 5824 remaining
1800 enzymes complete, 5724 remaining
1900 enzymes complete, 5624 remaining
2000 enzymes complete, 5524 remaining
2100 enzymes complete, 5424 remaining
2200 enzymes complete, 5324 remaining
2300 enzymes complete, 5224 remaining
2400 enzymes complete, 5124 remaining
2500 enzymes complete, 5024 remaining
2600 enzymes complete, 4

In [47]:
# turn the collected rows into a dataframe with one column per enzyme field

df_enzymes = pd.DataFrame(
    data=enzyme_matrix,
    columns=enzyme_fields,
)

df_enzymes.shape

(7524, 16)

In [48]:
# preview the first rows of the full enzyme dataframe
df_enzymes.head()

Unnamed: 0,classname,cofactor,comment,dblinks,disease,effector,entry,genes,inhibitor,name,pathway,product,reaction,structures,substrate,sysname
0,"[Oxidoreductases;, Acting on the CH-OH group o...",[],[A zinc protein. Acts on primary or secondary ...,"[(ExplorEnz - The Enzyme Database, [1.1.1.1]),...",[],[],1.1.1.1,"[(HSA, [124, 125, 126, 127, 128, 130, 131]), (...",[],"[alcohol dehydrogenase, aldehyde reductase, AD...","[(PATH, ec00010, Glycolysis / Gluconeogenesis)...","[aldehyde [CPD:C00071], NADH [CPD:C00004], H+ ...",[(1) a primary alcohol + NAD+ = an aldehyde + ...,[],"[primary alcohol [CPD:C00226], NAD+ [CPD:C0000...",[alcohol:NAD+ oxidoreductase]
1,"[Oxidoreductases;, Acting on the CH-OH group o...",[],[A zinc protein. Some members of this group ox...,"[(ExplorEnz - The Enzyme Database, [1.1.1.2]),...",[],[],1.1.1.2,"[(HSA, [10327]), (PTR, [741418]), (PPS, [10099...",[],"[alcohol dehydrogenase (NADP+), aldehyde reduc...","[(PATH, ec00010, Glycolysis / Gluconeogenesis)...","[aldehyde [CPD:C00071], NADPH [CPD:C00005], H+...",[an alcohol + NADP+ = an aldehyde + NADPH + H+...,[],"[alcohol [CPD:C00069], NADP+ [CPD:C00006]]",[alcohol:NADP+ oxidoreductase]
2,"[Oxidoreductases;, Acting on the CH-OH group o...",[],[The yeast enzyme acts most rapidly with NAD+;...,"[(ExplorEnz - The Enzyme Database, [1.1.1.3]),...",[],[],1.1.1.3,"[(NVE, [NEMVE_v1g225948]), (ATH, [AT1G31230, A...",[],"[homoserine dehydrogenase, HSDH, HSD]","[(PATH, ec00260, Glycine, serine and threonine...","[L-aspartate 4-semialdehyde [CPD:C00441], NADH...",[L-homoserine + NAD(P)+ = L-aspartate 4-semial...,[],"[L-homoserine [CPD:C00263], NAD+ [CPD:C00003],...",[L-homoserine:NAD(P)+ oxidoreductase]
3,"[Oxidoreductases;, Acting on the CH-OH group o...",[],[Also converts diacetyl into acetoin with NADH...,"[(ExplorEnz - The Enzyme Database, [1.1.1.4]),...",[],[],1.1.1.4,"[(SCE, [YAL060W, YAL061W]), (KLA, [KLLA0_F0050...",[],"[(R,R)-butanediol dehydrogenase, butyleneglyco...","[(PATH, ec00650, Butanoate metabolism)]","[(R)-acetoin [CPD:C00810], NADH [CPD:C00004], ...","[(R,R)-butane-2,3-diol + NAD+ = (R)-acetoin + ...",[],"[(R,R)-butane-2,3-diol [CPD:C03044], NAD+ [CPD...","[(R,R)-butane-2,3-diol:NAD+ oxidoreductase]"
4,"[Oxidoreductases;, Acting on the CH-OH group o...",[],[Transferred entry: acetoin dehydrogenase. Now...,[],[],[],1.1.1.5,[],[],[Transferred to 1.1.1.303 and 1.1.1.304],[],[],[],[],[],[]


In [49]:
# save the full enzyme dataframe as a csv file

enzyme_csv_path = 'KEGG_enzymes_all_data.csv'
df_enzymes.to_csv(enzyme_csv_path)