In [114]:
import gzip
import re

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from Bio.KEGG import Enzyme
from Bio.KEGG import REST

In [142]:
# Every public (non-underscore) attribute name of an empty Enzyme.Record
# becomes one column of the table.
enzyme_fields = [name for name in dir(Enzyme.Record()) if not name.startswith('_')]

# Read the gzipped KEGG enzyme flat file in text mode and collect one row
# (a list of attribute values, in enzyme_fields order) per parsed record.
with gzip.open('../datasets/KEGG_enzymes_all_data.gz', 'rt') as handle:
    data_matrix = [
        [getattr(record, field) for field in enzyme_fields]
        for record in Enzyme.parse(handle)
    ]

In [143]:
# Tabulate the parsed records: one row per entry of data_matrix, one column per name in enzyme_fields.
enzyme_df = pd.DataFrame(data_matrix, columns=enzyme_fields)

In [144]:
# Preview the first five rows to check that parsing produced the expected columns.
enzyme_df.head()

Unnamed: 0,classname,cofactor,comment,dblinks,disease,effector,entry,genes,inhibitor,name,pathway,product,reaction,structures,substrate,sysname
0,"[Oxidoreductases;, Acting on the CH-OH group o...",[],[A zinc protein. Acts on primary or secondary ...,"[(ExplorEnz - The Enzyme Database, [1.1.1.1]),...",[],[],1.1.1.1,"[(HSA, [124, 125, 126, 127, 128, 130, 131]), (...",[],"[alcohol dehydrogenase, aldehyde reductase, AD...","[(PATH, ec00010, Glycolysis / Gluconeogenesis)...","[aldehyde [CPD:C00071], NADH [CPD:C00004], H+ ...",[(1) a primary alcohol + NAD+ = an aldehyde + ...,[],"[primary alcohol [CPD:C00226], NAD+ [CPD:C0000...",[alcohol:NAD+ oxidoreductase]
1,"[Oxidoreductases;, Acting on the CH-OH group o...",[],[A zinc protein. Some members of this group ox...,"[(ExplorEnz - The Enzyme Database, [1.1.1.2]),...",[],[],1.1.1.2,"[(HSA, [10327]), (PTR, [741418]), (PPS, [10099...",[],"[alcohol dehydrogenase (NADP+), aldehyde reduc...","[(PATH, ec00010, Glycolysis / Gluconeogenesis)...","[aldehyde [CPD:C00071], NADPH [CPD:C00005], H+...",[an alcohol + NADP+ = an aldehyde + NADPH + H+...,[],"[alcohol [CPD:C00069], NADP+ [CPD:C00006]]",[alcohol:NADP+ oxidoreductase]
2,"[Oxidoreductases;, Acting on the CH-OH group o...",[],[The yeast enzyme acts most rapidly with NAD+;...,"[(ExplorEnz - The Enzyme Database, [1.1.1.3]),...",[],[],1.1.1.3,"[(NVE, [NEMVE_v1g225948]), (ATH, [AT1G31230, A...",[],"[homoserine dehydrogenase, HSDH, HSD]","[(PATH, ec00260, Glycine, serine and threonine...","[L-aspartate 4-semialdehyde [CPD:C00441], NADH...",[L-homoserine + NAD(P)+ = L-aspartate 4-semial...,[],"[L-homoserine [CPD:C00263], NAD+ [CPD:C00003],...",[L-homoserine:NAD(P)+ oxidoreductase]
3,"[Oxidoreductases;, Acting on the CH-OH group o...",[],[Also converts diacetyl into acetoin with NADH...,"[(ExplorEnz - The Enzyme Database, [1.1.1.4]),...",[],[],1.1.1.4,"[(SCE, [YAL060W, YAL061W]), (KLA, [KLLA0_F0050...",[],"[(R,R)-butanediol dehydrogenase, butyleneglyco...","[(PATH, ec00650, Butanoate metabolism)]","[(R)-acetoin [CPD:C00810], NADH [CPD:C00004], ...","[(R,R)-butane-2,3-diol + NAD+ = (R)-acetoin + ...",[],"[(R,R)-butane-2,3-diol [CPD:C03044], NAD+ [CPD...","[(R,R)-butane-2,3-diol:NAD+ oxidoreductase]"
4,"[Oxidoreductases;, Acting on the CH-OH group o...",[],[Transferred entry: acetoin dehydrogenase. Now...,[],[],[],1.1.1.5,[],[],[Transferred to 1.1.1.303 and 1.1.1.304],[],[],[],[],[],[]


In [145]:
# Look up the reaction list of one enzyme by its EC number (the 'entry' column).
enzyme_df.loc[enzyme_df['entry'] == '1.1.1.153', 'reaction']

152    [(1) L-erythro-7,8-dihydrobiopterin + NADP+ = ...
Name: reaction, dtype: object

In [146]:
# Reaction list stored at row label 153.
# NOTE(review): label 153 is not EC 1.1.1.153 -- the lookup by entry returns
# that enzyme at label 152; confirm which row was intended here.
enzyme_df.loc[153, 'reaction']

['(S)-ureidoglycolate + NAD(P)+ = oxalureate + NAD(P)H + H+ [RN:R02935 R02936]']

In [147]:
# An enzyme is treated as promiscuous when its record lists more than one
# reaction string.
has_multiple_reactions = enzyme_df['reaction'].map(len) > 1
promiscuous_df = enzyme_df[has_multiple_reactions]

# Keep only the columns used by the following cells.
compact_promiscuous_df = promiscuous_df[['entry', 'reaction', 'product', 'substrate']]

# Spot check: the 'product' cell (column position 2) of the second row.
compact_promiscuous_df.iloc[1, 2]

['pyruvate [CPD:C00022]', 'CO2 [CPD:C00011]', 'NADH [CPD:C00004]']

In [None]:


# Collect the KEGG reaction IDs cited by the promiscuous enzymes.
#
# A reaction string may end with a tag such as "[RN:R02935 R02936]". The IDs
# are pulled out of that tag with a regular expression, so strings that carry
# no tag (even ones whose text happens to start with "R") contribute nothing.
# IDs are appended in order of appearance and repeats are kept.
rn_tag_pattern = re.compile(r"\[RN:([^\]]*)\]")
reaction_id_pattern = re.compile(r"\bR\d{5}\b")

reaction_list = []
# Select the column by name: integer keys such as row[1] on a label-indexed
# Series are deprecated positional access in recent pandas.
for reactions in compact_promiscuous_df['reaction']:
    for reaction in reactions:
        rn_tags = rn_tag_pattern.findall(reaction)
        if not rn_tags:
            continue
        # Use the last tag, matching the previous "text after the last [RN:" rule.
        reaction_list.extend(reaction_id_pattern.findall(rn_tags[-1]))

reaction_list


In [None]:
# Needs a working internet connection: every distinct reaction ID is fetched
# from the KEGG REST service.
#
# A reaction is recorded as reversible when an EQUATION line of its KEGG entry
# contains "<=>". The verdict is cached per ID so an ID that occurs several
# times in reaction_list is downloaded only once; it is still appended (and
# printed) once per occurrence in reaction_list.
# NOTE(review): KEGG appears to write every EQUATION with "<=>", so this test
# may not separate reversible from irreversible reactions -- confirm before
# relying on the result.
is_reversible_by_id = {}
reversible_reaction = []
for reaction in reaction_list:
    if reaction not in is_reversible_by_id:
        reaction_file = REST.kegg_get(reaction).read()
        is_reversible_by_id[reaction] = any(
            line.startswith("EQUATION") and "<=>" in line
            for line in reaction_file.rstrip().split("\n")
        )
    if is_reversible_by_id[reaction]:
        reversible_reaction.append(reaction)
        print(reaction)

In [127]:
# It seems that every fetched reaction is flagged as reversible by the "<=>" test.
# NOTE(review): this count includes repeats if reaction_list holds the same ID more than once -- confirm.
len(reversible_reaction)

1302

In [148]:
# Consecutive integer labels 0 .. n-1, one per promiscuous enzyme row.
rowindex = np.arange(len(compact_promiscuous_df))
rowindex

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

In [149]:
# Replace the row labels inherited from enzyme_df with rowindex, so that row labels and row positions coincide.
compact_promiscuous_df_index=compact_promiscuous_df.set_index(rowindex)
# Spot check: the 'product' cell at row position 1, column position 2.
compact_promiscuous_df_index.iloc[1,2]


['pyruvate [CPD:C00022]', 'CO2 [CPD:C00011]', 'NADH [CPD:C00004]']

### Append substrate molecules to the product column

In [150]:
# Store, for every enzyme, its products followed by its substrates in the
# 'product' column of compact_promiscuous_df_index.
#
# A fresh list is built per row from compact_promiscuous_df, which this cell
# never modifies. Nothing is appended in place, so the list objects shared
# with the upstream frames stay intact and re-running the cell gives the same
# result instead of appending the substrates again.
combined_compounds = [
    list(products) + list(substrates)
    for products, substrates in zip(compact_promiscuous_df['product'],
                                    compact_promiscuous_df['substrate'])
]
# Wrap in a Series aligned on the target index so each list is stored as a
# single cell value.
compact_promiscuous_df_index['product'] = pd.Series(
    combined_compounds, index=compact_promiscuous_df_index.index
)

In [151]:
# Second name for the same DataFrame object; plain assignment makes no copy.
substrate_to_product_promiscuous_df = compact_promiscuous_df_index
# Number of compounds now stored in the 'product' cell at row position 1.
len(substrate_to_product_promiscuous_df.iloc[1,2])

6

In [152]:
# Keep only the entry, reaction and product columns.
substrate_to_product_promiscuous_df = substrate_to_product_promiscuous_df[['entry','reaction','product']]

In [153]:
# save substrate and product combined dataframe to csv 
# might remove this dataframe from the git repo soon 
# substrate_to_product_promiscuous_df.to_csv("../datasets/substrate_product_combined_promiscuous.csv")

### Cofactor removal

In [154]:
# Display the combined table (entry, reaction, product) before cofactors are removed.
substrate_to_product_promiscuous_df 

Unnamed: 0,entry,reaction,product
0,1.1.1.1,[(1) a primary alcohol + NAD+ = an aldehyde + ...,"[aldehyde [CPD:C00071], NADH [CPD:C00004], H+ ..."
1,1.1.1.38,[(1) (S)-malate + NAD+ = pyruvate + CO2 + NADH...,"[pyruvate [CPD:C00022], CO2 [CPD:C00011], NADH..."
2,1.1.1.40,[(1) (S)-malate + NADP+ = pyruvate + CO2 + NAD...,"[pyruvate [CPD:C00022], CO2 [CPD:C00011], NADP..."
3,1.1.1.42,[isocitrate + NADP+ = 2-oxoglutarate + CO2 + N...,"[2-oxoglutarate [CPD:C00026], CO2 [CPD:C00011]..."
4,1.1.1.85,"[(2R,3S)-3-isopropylmalate + NAD+ = 4-methyl-2...","[4-methyl-2-oxopentanoate [CPD:C00233], CO2 [C..."
5,1.1.1.110,[(1) (R)-3-(phenyl)lactate + NAD+ = 3-phenylpy...,"[3-phenylpyruvate, NADH [CPD:C00004], H+ [CPD:..."
6,1.1.1.153,"[(1) L-erythro-7,8-dihydrobiopterin + NADP+ = ...","[sepiapterin [CPD:C00835], NADPH [CPD:C00005],..."
7,1.1.1.187,[(1) GDP-alpha-D-rhamnose + NAD(P)+ = GDP-4-de...,"[GDP-4-dehydro-alpha-D-rhamnose [CPD:C01222], ..."
8,1.1.1.203,[(1) beta-D-galacturonate + NAD+ = D-galactaro...,"[D-galactaro-1,5-lactone [CPD:C20889], NADH [C..."
9,1.1.1.237,[(1) (R)-3-(4-hydroxyphenyl)lactate + NAD(P)+ ...,"[3-(4-hydroxyphenyl)pyruvate [CPD:C01179], NAD..."


In [194]:
# Narrow the table to the EC number and its compound list.
compact_substrate_to_product_promiscuous_df = substrate_to_product_promiscuous_df[['entry','product']]

In [195]:
# Number of rows (enzymes) in the narrowed table.
len(compact_substrate_to_product_promiscuous_df)

549

In [171]:
# Sanity check for the ID slice: in a "name [CPD:Cxxxxx]" string, the six
# characters just before the closing bracket are the compound ID.
test = 'aldehyde [CPD:C00071]'
test[-7:-1]

'C00071'

In [185]:
# Load the cofactor table and reduce each value of its CPD column to
# characters 4..9 of the string.
# NOTE(review): presumably the values look like "CPD:Cxxxxx", so the slice is
# the bare compound ID -- confirm against cofactor_list.csv.
cofactor_df = pd.read_csv("cofactor_list.csv")
cofactor_list = [cpd_tag[4:10] for cpd_tag in cofactor_df.CPD]
cofactor_list
    

['C00001',
 'C00002',
 'C00003',
 'C00004',
 'C00005',
 'C00006',
 'C00007',
 'C00008',
 'C00009',
 'C00010',
 'C00011',
 'C00012',
 'C00013',
 'C00014',
 'C00015',
 'C00016',
 'C00017',
 'C00018',
 'C00019',
 'C00020',
 'C00021',
 'C00023',
 'C00027',
 'C00028',
 'C00030',
 'C00032',
 'C00034',
 'C00050',
 'C00061',
 'C00070',
 'C00080',
 'C00255',
 'C01007',
 'C01352',
 'C01382',
 'C02745',
 'C02869']

In [197]:
# Start the cleaned table from the EC numbers only.
# The explicit .copy() makes newdf an independent DataFrame, so adding a
# column to it does not raise pandas' SettingWithCopyWarning.
newcompoundcolumn=[]  # not used by any cell shown here; kept in case other cells rely on it
newdf = compact_substrate_to_product_promiscuous_df[['entry']].copy()
newdf

            

Unnamed: 0,entry
0,1.1.1.1
1,1.1.1.38
2,1.1.1.40
3,1.1.1.42
4,1.1.1.85
5,1.1.1.110
6,1.1.1.153
7,1.1.1.187
8,1.1.1.203
9,1.1.1.237


In [198]:
# Strip cofactors from every enzyme's compound list, keeping bare compound IDs.
#
#   compoundcolumn         -- one item per enzyme: the list of non-cofactor
#                             compound IDs, or the string "NA" when none remain
#   no_noncofactorcompound -- EC numbers of the enzymes that got "NA"
#
# The ID is extracted with a regular expression rather than a fixed
# end-of-string slice, so a tag that is not the very last thing in the string
# is still read correctly. Compounds without a CPD tag are skipped.
cpd_id_pattern = re.compile(r"CPD:(C\d{5})")
cofactor_ids = set(cofactor_list)  # constant-time membership checks

no_noncofactorcompound = []
compoundcolumn = []
# Columns are selected by name: integer keys such as row[0] / row[1] on a
# label-indexed Series are deprecated positional access in recent pandas.
for entry, compounds in zip(compact_substrate_to_product_promiscuous_df['entry'],
                            compact_substrate_to_product_promiscuous_df['product']):
    newcompoundlist = []
    for compound in compounds:
        cpd_match = cpd_id_pattern.search(compound)
        if cpd_match is None:
            continue
        onlycpd = cpd_match.group(1)
        if onlycpd not in cofactor_ids:
            newcompoundlist.append(onlycpd)
    if newcompoundlist:
        compoundcolumn.append(newcompoundlist)
    else:
        no_noncofactorcompound.append(entry)
        compoundcolumn.append("NA")
newdf['product'] = compoundcolumn

        

['1.3.1.88',
 '1.3.1.90',
 '1.11.1.21',
 '1.14.13.107',
 '1.18.1.3',
 '2.1.1.200',
 '2.1.1.202',
 '2.1.1.207',
 '2.1.1.213',
 '2.1.1.225',
 '2.1.1.244',
 '2.1.1.268',
 '2.3.2.24',
 '2.3.2.25',
 '2.3.2.26',
 '2.3.2.31',
 '3.1.11.7',
 '4.1.1.68',
 '4.2.1.161',
 '4.2.1.166',
 '4.2.1.170',
 '4.4.1.29',
 '4.4.1.30',
 '4.6.1.18',
 '4.6.1.20',
 '4.6.1.21',
 '5.3.3.8',
 '6.5.1.4']

In [203]:
# Give the cleaned table a descriptive name; this binds a second name to the same DataFrame object (no copy).
cleaned_promiscuous_enzyme_df=newdf

In [205]:
# Cleaned version: each enzyme with its compound IDs after the cofactor filter.
cleaned_promiscuous_enzyme_df

Unnamed: 0,entry,product
0,1.1.1.1,"[C00071, C01450, C00226, C01612]"
1,1.1.1.38,"[C00022, C00149, C00036]"
2,1.1.1.40,"[C00022, C00149, C00036]"
3,1.1.1.42,"[C00026, C05379, C00311, C05379]"
4,1.1.1.85,"[C00233, C04236, C04411, C04236]"
5,1.1.1.110,"[C01179, C00331, C05607, C03964, C22006]"
6,1.1.1.153,"[C00835, C03684, C02953, C00272]"
7,1.1.1.187,"[C01222, C03117, C02977]"
8,1.1.1.203,"[C20889, C20890]"
9,1.1.1.237,"[C01179, C04045, C03964, C22038]"


In [206]:
# EC numbers of the enzymes whose compound list is empty once cofactors are removed.
no_noncofactorcompound

['1.3.1.88',
 '1.3.1.90',
 '1.11.1.21',
 '1.14.13.107',
 '1.18.1.3',
 '2.1.1.200',
 '2.1.1.202',
 '2.1.1.207',
 '2.1.1.213',
 '2.1.1.225',
 '2.1.1.244',
 '2.1.1.268',
 '2.3.2.24',
 '2.3.2.25',
 '2.3.2.26',
 '2.3.2.31',
 '3.1.11.7',
 '4.1.1.68',
 '4.2.1.161',
 '4.2.1.166',
 '4.2.1.170',
 '4.4.1.29',
 '4.4.1.30',
 '4.6.1.18',
 '4.6.1.20',
 '4.6.1.21',
 '5.3.3.8',
 '6.5.1.4']

In [None]:
# Write the cleaned table to the datasets folder, with explicit column headers.
cleaned_promiscuous_enzyme_df.to_csv(
    "../datasets/cleaned_promiscous_enzyme_df.csv",
    header=['entry', 'product'],
)