In [None]:
pip install requests
pip install lxml
pip install pandas
pip install openpyxl
pip install cobra

# for windows only, download the file from https://www.lfd.uci.edu/~gohlke/pythonlibs/#pyeda firstly
# pip install pyeda-0.28.0-cp37-cp37m-win_amd64.whl  
pip install modelseedpy

# Get the reactions included in KEGG pathways

## Get the pathway ID list from KEGG

In [1]:
import requests
from lxml import etree
# KEGG organism code whose pathway maps are listed by this cell.
org_name = 'mko'
url ='https://www.kegg.jp/kegg-bin/show_organism?menu_type=pathway_maps&org='+org_name
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54'}
# A timeout keeps a stalled connection from hanging the notebook indefinitely.
html = requests.get(url, headers=headers, timeout=60)
if not html.ok:
    raise TypeError('wrong KEGG name')
e = etree.HTML(html.text)
strain_name = e.xpath('//font[@class="title1"]/text()')
# Without a title element the indexing below would fail with a bare IndexError;
# report the organism code that produced the unexpected page instead.
if not strain_name:
    raise ValueError('no organism title found on the KEGG page for org code: '+org_name)
print('Organism: '+strain_name[0]+'\n')
# Each matched href is reduced to a pathway ID by removing the '/pathway/' prefix.
DATA = e.xpath('//ul/a/@href')
pathway_ID = [i.replace('/pathway/','') for i in DATA]
print(str(len(pathway_ID))+' pathway maps are found')

Organism: Methylomonas koyamae

112 pathway maps are found


## download xml files

In [2]:
import re
import os
import requests

# Create the output folders.
# Each folder is created on its own with exist_ok=True, so a pre-existing
# organism folder no longer prevents '<org>_model' from being created, and a
# pre-existing '<org>_model' folder no longer raises FileExistsError.
# os.path.join yields the same '.\<org>' string on Windows and a valid path elsewhere.
path = os.path.join('.', org_name)
os.makedirs(path, exist_ok=True)
os.makedirs(path + '_model', exist_ok=True)

# Download one KGML (XML) file per pathway ID; IDs without a KGML export are reported and skipped.
total = len(pathway_ID)
for i, ID in enumerate(pathway_ID, start=1):
    downloadurl = 'https://www.kegg.jp/kegg-bin/download?entry='+ID+'&format=kgml'
    headers={'Referer':'https://www.kegg.jp/pathway/'+ID,'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54'}
    # A timeout keeps one stalled download from blocking the remaining pathways.
    xml = requests.get(downloadurl, headers=headers, timeout=60)
    if not xml.ok:
        print(ID+'.xml can not be found.   '+str(i)+'/'+ str(total))
        continue
    with open(os.path.join(path, ID+".xml"), "wb") as code:
        code.write(xml.content)
    print(ID+'.xml write secceed.   '+str(i)+'/'+ str(total))

mko01100.xml can not be found.   1/112
mko01110.xml can not be found.   2/112
mko01120.xml can not be found.   3/112
mko01200.xml can not be found.   4/112
mko01210.xml can not be found.   5/112
mko01212.xml can not be found.   6/112
mko01230.xml can not be found.   7/112
mko01232.xml can not be found.   8/112
mko01250.xml can not be found.   9/112
mko01240.xml can not be found.   10/112
mko01220.xml can not be found.   11/112
mko00010.xml write secceed.   12/112
mko00020.xml write secceed.   13/112
mko00030.xml write secceed.   14/112
mko00040.xml write secceed.   15/112
mko00051.xml write secceed.   16/112
mko00052.xml write secceed.   17/112
mko00053.xml write secceed.   18/112
mko00500.xml write secceed.   19/112
mko00520.xml write secceed.   20/112
mko00620.xml write secceed.   21/112
mko00630.xml write secceed.   22/112
mko00640.xml write secceed.   23/112
mko00650.xml write secceed.   24/112
mko00660.xml write secceed.   25/112
mko00562.xml write secceed.   26/112
mko00190.xml w

## Get XML details

In [None]:
from Bio.KEGG.KGML.KGML_parser import read
# Parse a single KGML file with Biopython. The `with` block closes the file
# handle once `read` has returned, instead of leaving it open for the lifetime
# of the kernel.
with open('mbry/mbry00010.xml', 'r') as kgml_handle:
    pathway = read(kgml_handle)

In [1]:
# KEGG organism code; the following cells use it to build the input folder
# name and the names of the Excel files they read and write.
org_name = 'mbry'

In [2]:
import os
import xml.dom.minidom as xo
import pandas as pd

# Collect every KGML file in the organism folder. Sorting the listing makes the
# processing order deterministic, and with it the row that drop_duplicates
# keeps further down (os.listdir returns names in arbitrary order).
path = os.path.join('.', org_name)
xmls = [os.path.join(path, i) for i in sorted(os.listdir(path)) if i.endswith('.xml')]

entry_columns = ["pathway_ID", "pathway_title", "entry_id", "entry_name","entry_type","entry_reaction","entry_link"]
reaction_columns = ["reaction_ID", "reaction_name", "reaction_type", "reaction_substrates","reaction_products"]
# Rows are accumulated in plain lists and turned into DataFrames once, after
# the loop; appending through df.loc[len(df)] copies the frame for every row.
entry_rows = []
reaction_rows = []
for i, xml in enumerate(xmls, start=1):
    domtree = xo.parse(xml)
    pathway = domtree.documentElement
    # NOTE: this rebinds the notebook-level name `pathway_ID` (the list of IDs
    # built by the pathway-listing cell) to a single string.
    pathway_ID = pathway.getAttribute("name")
    pathway_org = pathway.getAttribute("org")  # read but not used in this cell
    pathway_title = pathway.getAttribute("title")
    entrys = pathway.getElementsByTagName("entry")
    reactions = pathway.getElementsByTagName("reaction")
    for entry in entrys:
        # Element ids are prefixed with the pathway name before being stored.
        entry_id= pathway_ID+'-'+entry.getAttribute("id")
        entry_name = entry.getAttribute("name")
        entry_type = entry.getAttribute("type")
        entry_reaction = entry.getAttribute("reaction")
        entry_link = entry.getAttribute("link")
        entry_rows.append([pathway_ID,pathway_title,entry_id,entry_name,entry_type,entry_reaction,entry_link])
        # (disabled) graphics = entry.getElementsByTagName('graphics')[0]
        # (disabled) graphics_name = graphics.getAttribute("name")
    for reaction in reactions:
        reaction_id = pathway_ID+'-'+reaction.getAttribute("id")
        reaction_name = reaction.getAttribute("name")
        reaction_type = reaction.getAttribute("type")
        substrates = reaction.getElementsByTagName("substrate")
        reaction_substrates = [substrate.getAttribute("name") for substrate in substrates]
        products = reaction.getElementsByTagName("product")
        reaction_products = [product.getAttribute("name") for product in products]
        # Substrate/product name lists are stored as their string representation.
        reaction_rows.append([reaction_id, reaction_name, reaction_type, str(reaction_substrates),str(reaction_products)])
    print('File:',xml,'done.  ',i,'/',len(xmls))

df_xmlentry = pd.DataFrame(entry_rows, columns=entry_columns)
df_xmlreaction = pd.DataFrame(reaction_rows, columns=reaction_columns)

# Attach reaction details to gene entries that reference a reaction, then keep
# the first row for each distinct entry_reaction value.
df_r = df_xmlreaction.rename(columns={'reaction_name':'entry_reaction'}).drop(columns='reaction_ID')
df_org_reaction = pd.merge(df_xmlentry.query('entry_type == "gene" and entry_reaction != ""'),df_r,how='left', on='entry_reaction').drop_duplicates(subset=['entry_reaction'])

# Make sure the output folder exists before writing; this cell must not depend
# on the download cell having created it for the same organism code.
model_dir = org_name+'_model'
os.makedirs(model_dir, exist_ok=True)
df_org_reaction.to_excel(os.path.join(model_dir, org_name+'_KEGG_pathway_reaction.xlsx'), sheet_name='Sheet1', header=True)

File: .\mbry\mbry00010.xml done.   1 / 107
File: .\mbry\mbry00020.xml done.   2 / 107
File: .\mbry\mbry00030.xml done.   3 / 107
File: .\mbry\mbry00040.xml done.   4 / 107
File: .\mbry\mbry00051.xml done.   5 / 107
File: .\mbry\mbry00052.xml done.   6 / 107
File: .\mbry\mbry00053.xml done.   7 / 107
File: .\mbry\mbry00061.xml done.   8 / 107
File: .\mbry\mbry00071.xml done.   9 / 107
File: .\mbry\mbry00130.xml done.   10 / 107
File: .\mbry\mbry00190.xml done.   11 / 107
File: .\mbry\mbry00220.xml done.   12 / 107
File: .\mbry\mbry00230.xml done.   13 / 107
File: .\mbry\mbry00240.xml done.   14 / 107
File: .\mbry\mbry00250.xml done.   15 / 107
File: .\mbry\mbry00260.xml done.   16 / 107
File: .\mbry\mbry00261.xml done.   17 / 107
File: .\mbry\mbry00270.xml done.   18 / 107
File: .\mbry\mbry00280.xml done.   19 / 107
File: .\mbry\mbry00281.xml done.   20 / 107
File: .\mbry\mbry00290.xml done.   21 / 107
File: .\mbry\mbry00300.xml done.   22 / 107
File: .\mbry\mbry00310.xml done.   23 / 1

## Data cleaning

In [3]:
import pandas as pd

import os  # imported here so the cell does not rely on an earlier cell's import

# Reload the per-organism reaction table written by the XML parsing step.
model_dir = org_name+'_model'
df_org_reaction = pd.read_excel(os.path.join(model_dir, org_name+'_KEGG_pathway_reaction.xlsx'))
titlename= ['entry_name','pathway_title','reaction_type','reaction_substrates','reaction_products']
df_gene = df_org_reaction[['entry_reaction']+titlename]

# Splitting on 'rn:' gives one column per piece; column 0 (the text before the
# first 'rn:') is dropped, leaving one column per reaction ID.
df_split = df_gene['entry_reaction'].str.split('rn:',expand=True).iloc[:,1:]
df_reaction_list = pd.concat([df_split,df_gene[titlename]],axis=1)

# Melt over exactly the ID columns the split produced. A fixed [1,2,3,4] raised
# KeyError when fewer columns existed and silently dropped every reaction ID
# beyond the fourth when more existed.
df_reaction_melt = (
    df_reaction_list
    .melt(id_vars = titlename, value_vars = list(df_split.columns),var_name = 'r',value_name = 'Kegg_ID')
    .drop(['r'],axis = 1)
    .dropna(subset=['Kegg_ID'])
    # Remove the whitespace left around each ID by the split.
    .assign(Kegg_ID=lambda frame: frame['Kegg_ID'].str.strip())
)
# One row per distinct reaction ID, with the ID as the first column.
df_reaction = df_reaction_melt.drop_duplicates(subset=['Kegg_ID'])[['Kegg_ID']+titlename]
df_reaction.to_excel(os.path.join(model_dir, org_name+'_clean_kegg_reaction.xlsx'), sheet_name='Sheet1', header=True)