In [None]:
pip install requests
pip install lxml
pip install pandas
pip install openpyxl
pip install cobra

# for windows only, download the file from https://www.lfd.uci.edu/~gohlke/pythonlibs/#pyeda firstly
# pip install pyeda-0.28.0-cp37-cp37m-win_amd64.whl  
pip install modelseedpy

# Get the reactions included in KEGG pathways

## Get the pathway ID list from KEGG

In [5]:
import requests
from lxml import etree

# KEGG organism code whose pathway maps are listed.
org_name = 'mbry'
url ='https://www.kegg.jp/kegg-bin/show_organism?menu_type=pathway_maps&org='+org_name
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54'}

# A timeout keeps the cell from hanging indefinitely if the server never answers.
html = requests.get(url, headers=headers, timeout=60)
if not html.ok:
    raise TypeError('wrong KEGG name')

e = etree.HTML(html.text)
# etree.HTML can return None for an empty document, and the xpath result is an
# empty list when the page carries no organism title; report both as the same
# error as a failed request instead of an opaque AttributeError/IndexError.
strain_name = e.xpath('//font[@class="title1"]/text()') if e is not None else []
if not strain_name:
    raise TypeError('wrong KEGG name')
print('Organism: '+strain_name[0]+'\n')

# Every pathway link has the form "/pathway/<ID>"; keep only the ID part.
DATA = e.xpath('//ul/a/@href')
pathway_ID = [i.replace('/pathway/','') for i in DATA]
print(str(len(pathway_ID))+' pathway maps are found')

Organism: Methylocystis bryophila

118 pathway maps are found


## download xml files

In [None]:
import re
import os
import requests

# Create the download folder and the sibling "<org>_model" output folder.
# exist_ok=True creates each folder independently, so an already existing
# download folder no longer prevents the "_model" folder from being created.
# os.path.join keeps the paths valid on non-Windows systems as well.
path = os.path.join('.', org_name)
os.makedirs(path, exist_ok=True)
os.makedirs(path+'_model', exist_ok=True)

# Download the KGML (XML) file of every pathway map; i is the 1-based progress counter.
for i, ID in enumerate(pathway_ID, start=1):
    downloadurl = 'https://www.kegg.jp/kegg-bin/download?entry='+ID+'&format=kgml'
    headers={'Referer':'https://www.kegg.jp/pathway/'+ID,'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54'}
    # A timeout keeps one stalled request from blocking the whole loop forever.
    xml = requests.get(downloadurl, headers=headers, timeout=60)
    if not xml.ok:
        # Skip maps without a downloadable KGML file and carry on with the next one.
        print(ID+'.xml can not be found.   '+str(i)+'/'+ str(len(pathway_ID)))
        continue
    with open(os.path.join(path, ID+".xml"),"wb") as code:
        code.write(xml.content)
    print(ID+'.xml write secceed.   '+str(i)+'/'+ str(len(pathway_ID)))

## get xml detail

In [1]:
from Bio.KEGG.KGML.KGML_parser import read

# Parse one downloaded KGML file into a pathway object. The with-block closes
# the file handle as soon as parsing is done instead of leaving it open.
with open('mbry/mbry00010.xml', 'r') as kgml_file:
    pathway = read(kgml_file)

In [6]:
# Display the name attribute of the parsed pathway object.
pathway.name

'path:mbry00010'

In [23]:
# Print the text form of each item in pathway.reaction_entries, one per call.
for node in pathway.reaction_entries:
    print(node)

Entry node ID: 42
Names: mbry:B1812_01610
Type: gene
Components: set()
Reactions: rn:R00710
Graphics elements: 1 [<Bio.KEGG.KGML.KGML_pathway.Graphics object at 0x000001CCE658BDC0>]

Entry node ID: 49
Names: mbry:B1812_03385
Type: gene
Components: set()
Reactions: rn:R00746
Graphics elements: 1 [<Bio.KEGG.KGML.KGML_pathway.Graphics object at 0x000001CCE4BA6880>]

Entry node ID: 50
Names: mbry:B1812_18345
Type: gene
Components: set()
Reactions: rn:R00754
Graphics elements: 1 [<Bio.KEGG.KGML.KGML_pathway.Graphics object at 0x000001CCE4BA68E0>]

Entry node ID: 52
Names: mbry:B1812_18230 mbry:B1812_18235
Type: gene
Components: set()
Reactions: rn:R00014
Graphics elements: 1 [<Bio.KEGG.KGML.KGML_pathway.Graphics object at 0x000001CCE4BA69A0>]

Entry node ID: 53
Names: mbry:B1812_18230 mbry:B1812_18235
Type: gene
Components: set()
Reactions: rn:R03270
Graphics elements: 1 [<Bio.KEGG.KGML.KGML_pathway.Graphics object at 0x000001CCE4BA6A00>]

Entry node ID: 54
Names: mbry:B1812_10465
Type: gen

In [11]:
# For each item in pathway.reaction_entries, show its reaction attribute
# followed by its name attribute on one line.
for entry in pathway.reaction_entries:
    reaction_ids, entry_names = entry.reaction, entry.name
    print(reaction_ids, entry_names)

rn:R00710 mbry:B1812_01610
rn:R00746 mbry:B1812_03385
rn:R00754 mbry:B1812_18345
rn:R00014 mbry:B1812_18230 mbry:B1812_18235
rn:R03270 mbry:B1812_18230 mbry:B1812_18235
rn:R02569 mbry:B1812_10465
rn:R00200 mbry:B1812_06770 mbry:B1812_06945
rn:R00658 mbry:B1812_10045
rn:R01518 mbry:B1812_08650 mbry:B1812_20140
rn:R01061 mbry:B1812_12285
rn:R01015 mbry:B1812_03480
rn:R01070 mbry:B1812_09170 mbry:B1812_11395
rn:R04779 mbry:B1812_09340
rn:R02740 mbry:B1812_03175 mbry:B1812_07720
rn:R00959 mbry:B1812_02305
rn:R03321 mbry:B1812_03175 mbry:B1812_07720
rn:R01600 mbry:B1812_15765
rn:R02739 mbry:B1812_03175 mbry:B1812_07720
rn:R01786 mbry:B1812_15765
rn:R07618 mbry:B1812_15670 mbry:B1812_17770
rn:R01512 mbry:B1812_12300
rn:R09127 mbry:B1812_04775
rn:R00341 mbry:B1812_19060
rn:R00235 mbry:B1812_11570
rn:R00711 mbry:B1812_07075 mbry:B1812_13450
rn:R02073 mbry:B1812_09340
rn:R00199 mbry:B1812_18215
rn:R00206 mbry:B1812_03055


In [24]:
# Display the organism code currently held in org_name.
# NOTE(review): the saved output is 'mca' although the cell that assigns
# org_name sets it to 'mbry' — presumably the notebook was run out of order or
# edited after execution; restart the kernel and run all cells to confirm.
org_name

'mca'

In [25]:
import os
import xml.dom.minidom as xo
import pandas as pd

# Collect every downloaded KGML file of the organism. os.listdir returns names
# in arbitrary order, so sort them: drop_duplicates below keeps the first
# occurrence, and a fixed order makes the result reproducible.
path = os.path.join('.', org_name)
xmls = [os.path.join(path, i) for i in sorted(os.listdir(path)) if i.endswith('.xml')]

# Rows are gathered in plain lists and turned into DataFrames once at the end;
# appending to a DataFrame row by row copies the frame on every insert.
entry_rows = []
reaction_rows = []

# Loop variables use their own names (xml_path, kgml_root, kgml_name, ...) so
# that the pathway_ID list and the pathway object created in earlier cells are
# not overwritten by this cell.
for i, xml_path in enumerate(xmls, start=1):
    kgml_root = xo.parse(xml_path).documentElement
    kgml_name = kgml_root.getAttribute("name")
    kgml_title = kgml_root.getAttribute("title")

    # One row per <entry>; the entry id is prefixed with the pathway name.
    for entry in kgml_root.getElementsByTagName("entry"):
        entry_rows.append([
            kgml_name,
            kgml_title,
            kgml_name+'-'+entry.getAttribute("id"),
            entry.getAttribute("name"),
            entry.getAttribute("type"),
            entry.getAttribute("reaction"),
            entry.getAttribute("link"),
        ])

    # One row per <reaction>; substrate and product names are stored as the
    # string form of a Python list.
    for reaction in kgml_root.getElementsByTagName("reaction"):
        reaction_substrates = [substrate.getAttribute("name") for substrate in reaction.getElementsByTagName("substrate")]
        reaction_products = [product.getAttribute("name") for product in reaction.getElementsByTagName("product")]
        reaction_rows.append([
            kgml_name+'-'+reaction.getAttribute("id"),
            reaction.getAttribute("name"),
            reaction.getAttribute("type"),
            str(reaction_substrates),
            str(reaction_products),
        ])

    print('File:',xml_path,'done.  ',i,'/',len(xmls))

df_xmlentry = pd.DataFrame(entry_rows, columns=["pathway_ID", "pathway_title", "entry_id", "entry_name","entry_type","entry_reaction","entry_link"])
df_xmlreaction = pd.DataFrame(reaction_rows, columns=["reaction_ID", "reaction_name", "reaction_type", "reaction_substrates","reaction_products"])

# Attach the reaction details to every gene entry that references a reaction,
# then keep one row per distinct entry_reaction value.
df_r = df_xmlreaction.rename(columns={'reaction_name':'entry_reaction'}).drop(columns='reaction_ID')
df_org_reaction = pd.merge(df_xmlentry.query('entry_type == "gene" and entry_reaction != ""'),df_r,how='left', on='entry_reaction').drop_duplicates(subset=['entry_reaction'])

# Make sure the output folder exists before writing the workbook.
model_dir = org_name+'_model'
os.makedirs(model_dir, exist_ok=True)
df_org_reaction.to_excel(os.path.join(model_dir, org_name+'_KEGG_pathway_reaction.xlsx'), sheet_name='Sheet1', header=True)

File: .\mca\mca00010.xml done.   1 / 104
File: .\mca\mca00020.xml done.   2 / 104
File: .\mca\mca00030.xml done.   3 / 104
File: .\mca\mca00040.xml done.   4 / 104
File: .\mca\mca00051.xml done.   5 / 104
File: .\mca\mca00052.xml done.   6 / 104
File: .\mca\mca00053.xml done.   7 / 104
File: .\mca\mca00061.xml done.   8 / 104
File: .\mca\mca00071.xml done.   9 / 104
File: .\mca\mca00100.xml done.   10 / 104
File: .\mca\mca00130.xml done.   11 / 104
File: .\mca\mca00190.xml done.   12 / 104
File: .\mca\mca00220.xml done.   13 / 104
File: .\mca\mca00230.xml done.   14 / 104
File: .\mca\mca00240.xml done.   15 / 104
File: .\mca\mca00250.xml done.   16 / 104
File: .\mca\mca00260.xml done.   17 / 104
File: .\mca\mca00261.xml done.   18 / 104
File: .\mca\mca00270.xml done.   19 / 104
File: .\mca\mca00280.xml done.   20 / 104
File: .\mca\mca00290.xml done.   21 / 104
File: .\mca\mca00300.xml done.   22 / 104
File: .\mca\mca00310.xml done.   23 / 104
File: .\mca\mca00330.xml done.   24 / 104
F

## data clean

In [26]:
import os
import pandas as pd

# Reload the merged entry/reaction table written by the parsing step.
model_dir = org_name+'_model'
df_org_reaction = pd.read_excel(os.path.join(model_dir, org_name+'_KEGG_pathway_reaction.xlsx'))
titlename= ['entry_name','pathway_title','reaction_type','reaction_substrates','reaction_products']
df_gene = df_org_reaction[['entry_reaction']+titlename]

# An entry_reaction value can hold several IDs ("rn:R00014 rn:R03270").
# Splitting on 'rn:' yields one column per ID; column 0 is the text before the
# first 'rn:' and is dropped.
reaction_cols = df_gene['entry_reaction'].str.split('rn:',expand=True).iloc[:,1:]
df_reaction_list = pd.concat([reaction_cols,df_gene[titlename]],axis=1)

# Melt over the columns the split actually produced instead of a fixed
# [1,2,3,4]: a fixed list raises KeyError when fewer columns exist and silently
# drops reaction IDs when an entry lists more than four.
df_reaction_melt = df_reaction_list.melt(id_vars = titlename, value_vars = list(reaction_cols.columns),var_name = 'r',value_name = 'Kegg_ID').drop(['r'],axis = 1).dropna(subset=['Kegg_ID'])
# IDs other than the last one of an entry keep the separating space; strip it.
df_reaction_melt['Kegg_ID'] = df_reaction_melt['Kegg_ID'].str.strip()

# Keep one row per reaction ID, with Kegg_ID as the first column.
df_reaction = df_reaction_melt.drop_duplicates(subset=['Kegg_ID'])[['Kegg_ID']+titlename]
df_reaction.to_excel(os.path.join(model_dir, org_name+'_clean_kegg_reaction.xlsx'), sheet_name='Sheet1', header=True)