In [None]:
pip install requests
pip install lxml
pip install pandas
pip install openpyxl
pip install cobra

# for windows only, download the file from https://www.lfd.uci.edu/~gohlke/pythonlibs/#pyeda firstly
# pip install pyeda-0.28.0-cp37-cp37m-win_amd64.whl  
pip install modelseedpy

# download database from KEGG and ModelSEED

### download KEGG reaction database

In [None]:
import requests
from lxml import etree
import re
import time
import pandas as pd

# Seconds to wait for any single HTTP response. requests has no default timeout,
# so without this value the Timeout handler below could never be triggered.
REQUEST_TIMEOUT = 60
# Seconds to pause before retrying a failed request.
RETRY_WAIT = 20

# --- Step 1: fetch the list of all KEGG reaction IDs from the KEGG REST API ---
KEGG_REACTIONS_list = requests.get('https://rest.kegg.jp/list/reaction', timeout=REQUEST_TIMEOUT)
KEGG_REACTIONS_list.raise_for_status()  # do not store an error page as the reaction list

# The list is written and read back through the SAME path, and the file is
# closed (flushed) before pandas opens it. UTF-8 matches pandas' default encoding.
kegg_rxn_list_path = "Database\\KEGG_REACTIONS_list.txt"
with open(kegg_rxn_list_path, 'w', encoding='utf-8') as f:
    f.write(KEGG_REACTIONS_list.text)

kegg_rxn_list = pd.read_table(kegg_rxn_list_path, header=None)
# First column holds the reaction ID; drop the optional "rn:" prefix.
kegg_rlist = [st.replace('rn:','') for st in kegg_rxn_list[0]]

# --- Step 2: scrape every reaction entry page and write one TSV row per reaction ---
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54'}
number_pattern = re.compile(r"\d+")

# The table is rebuilt from scratch on every run, so open in 'w' mode: appending
# would duplicate the header line and all rows each time the cell is re-run.
# The `with` block guarantees the file is closed even if a request fails.
with open("Database\\KEGG_REACTIONS_DB.txt", 'w') as f:
    f.write('ID\tNAMES\tKEGG\tEQUATION\tECs\n')

    for rxn_id in kegg_rlist:
        url = 'https://www.kegg.jp/entry/' + rxn_id
        try:
            html = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        except requests.exceptions.Timeout:
            print("time out, try again")
            time.sleep(RETRY_WAIT)
            html = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        if not html.ok:
            # One retry after a pause; skip the reaction if it still fails.
            print('wrong reaction ID')
            time.sleep(RETRY_WAIT)
            html = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
            if not html.ok:
                continue

        e = etree.HTML(html.text)

        # Names: the last text node is dropped (as in the original logic);
        # newlines are removed and ';' separators become '|'.
        name_content = e.xpath('//th/span[text()="Name"]/../../td/div/div/text()')
        name = ''.join(st.replace('\n','').replace(';','|') for st in name_content[:-1])

        # Equation: tokens starting with 'C' get a '[0]' suffix appended; in any
        # other token the first number found is wrapped in parentheses.
        equation_content = e.xpath('//th/span[text()="Equation"]/../../td/div//text()')
        equation_parts = []
        for st in equation_content[:-1]:
            if st.startswith('C'):
                st = st + '[0]'
            else:
                matches = number_pattern.findall(st)
                if matches:
                    st = st.replace(matches[0], f"({matches[0]})")
            equation_parts.append(st)
        equation = ''.join(equation_parts)

        # EC numbers: non-breaking spaces are turned into newlines so that
        # separator-only text nodes can be filtered out; the rest is '|'-joined.
        EC_content = e.xpath('//th/span[text()="Enzyme"]/../../td/div//text()')
        EC1 = [i.replace(u'\xa0','\n') for i in EC_content]
        EC = '|'.join(i for i in EC1 if not i.startswith('\n'))

        f.write(rxn_id+'\t'+name+'\t'+rxn_id+'\t'+equation+'\t'+EC+'\n')


### connect KEGG reaction to ModelSEED reaction

In [None]:
import pandas as pd
import modelseedpy

# NOTE: machine-specific location of a local ModelSEEDDatabase checkout;
# adjust this path before running the notebook on another computer.
modelseed_path = 'C:\\Users\\vickenlee\\ModelSEEDDatabase'
modelseed = modelseedpy.biochem.from_local(modelseed_path)

# Collect one [Kegg_ID, ModelSeed_ID] row per ModelSEED reaction and build the
# DataFrame once at the end (growing a frame row by row with .loc is quadratic).
reaction_rows = []
for ModelSeed_ID, value in modelseed.reaction_aliases.items():
    kegg_aliases = value.get('KEGG') if isinstance(value, dict) else None
    # Reactions without a KEGG alias get an empty Kegg_ID.
    # min() makes the choice reproducible when several aliases exist
    # (assumes the alias collection is an unordered set of ID strings - TODO confirm).
    Kegg_ID = min(kegg_aliases) if kegg_aliases else ''
    reaction_rows.append([Kegg_ID, ModelSeed_ID])

df_keggID_modelseedID = pd.DataFrame(reaction_rows, columns=["Kegg_ID","ModelSeed_ID"], dtype=object)
df_keggID_modelseedID.to_csv('Database\\reaction_keggID_modelseedID.txt',  header=True, index=False,sep='\t')

print('done!')

### download KEGG compounds database and connect to ModelSEED

In [None]:
# All names this cell needs are imported/defined here so that it does not
# depend on variables leaking from previously executed cells.
import time

import modelseedpy
import pandas as pd
import requests
from lxml import etree

# Seconds to wait for any single HTTP response (requests has no default timeout).
REQUEST_TIMEOUT = 60
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54'}

# --- Step 1: KEGG compound IDs and names from the KEGG REST API ---
KEGG_COMPOUNDS_list = requests.get('https://rest.kegg.jp/list/compound', timeout=REQUEST_TIMEOUT)
KEGG_COMPOUNDS_list.raise_for_status()  # do not store an error page as the compound list

# Close (flush) the file before pandas reads it back; UTF-8 matches pandas' default.
kegg_cpd_list_path = "Database\\KEGG_COMPOUNDS_list.txt"
with open(kegg_cpd_list_path, 'w', encoding='utf-8') as f:
    f.write(KEGG_COMPOUNDS_list.text)
kegg_cpd_list = pd.read_table(kegg_cpd_list_path, header=None)

# Column 0: compound ID (optional "cpd:" prefix removed); column 1: names, with
# the '; ' separator replaced by '|'.
KEGG_COMPOUNDS_name = pd.DataFrame({
    'ID': [st.replace('cpd:','') for st in kegg_cpd_list[0]],
    'Name': [st.replace('; ','|') for st in kegg_cpd_list[1]],
})

# --- Step 2: KEGG ID -> ModelSEED ID (+ formula, charge) from the local ModelSEED database ---
# NOTE: machine-specific location of a local ModelSEEDDatabase checkout.
modelseed_path = 'C:\\Users\\vickenlee\\ModelSEEDDatabase'
modelseed = modelseedpy.biochem.from_local(modelseed_path)

# Rows are collected in a list and turned into a DataFrame once (row-by-row
# .loc appends are quadratic).
compound_rows = []
for ModelSeed_ID, value in modelseed.compound_aliases.items():
    find_Kegg = value.get('KEGG') if isinstance(value, dict) else None
    if not find_Kegg:
        continue  # compound has no KEGG alias
    compound = modelseed.get_seed_compound(ModelSeed_ID)
    formula = compound.formula
    charge = compound.data['charge']
    # sorted() gives a reproducible row order when a compound has several KEGG aliases.
    for Kegg_ID in sorted(find_Kegg):
        compound_rows.append([Kegg_ID, ModelSeed_ID, formula, charge])

# dtype=object keeps formula/charge values exactly as provided by ModelSEED.
cpd_keggID_modelseedID = pd.DataFrame(
    compound_rows, columns=["Kegg_ID","ModelSeed_ID","formula","charge"], dtype=object
)
cpd_keggID_modelseedID.to_csv('Database\\compound_keggID_modelseedID.txt', header=True,sep='\t',index=False)
cpd_keggID_modelseedID=cpd_keggID_modelseedID.rename(columns={'Kegg_ID':'ID'})

# --- Step 3: combine; keep only the first ModelSEED match per KEGG compound ---
KEGG_COMPOUNDS_DB = pd.merge(KEGG_COMPOUNDS_name,cpd_keggID_modelseedID,how='left',on='ID').drop_duplicates(subset=['ID'], keep='first')

# --- Step 4: for compounds still lacking a formula, scrape it from the KEGG entry page ---
for index, row in KEGG_COMPOUNDS_DB.iterrows():
    if not pd.isna(row['formula']):
        continue  # formula already known
    url ='https://www.kegg.jp/entry/'+row['ID']
    try:
        html = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.exceptions.Timeout:
        time.sleep(20)
        html = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    if not html.ok:
        print(row['ID']+': request failed')
        continue
    e = etree.HTML(html.text)
    formula_content = e.xpath('//th/span[text()="Formula"]/../../td/div/text()')
    if formula_content:
        # .loc writes into the frame itself; chained indexing
        # (df['formula'][index] = ...) may silently write to a temporary copy.
        # str() stores a plain string rather than lxml's string subclass.
        KEGG_COMPOUNDS_DB.loc[index, 'formula'] = str(formula_content[0])
        print(row['ID']+': add formula')
    else:
        print(row['ID']+': does not have formula')

KEGG_COMPOUNDS_DB.to_csv('Database\\KEGG_COMPOUNDS_DB.txt',sep='\t',index=None)

### translate KO to ModelSEED reactions (from the ModelSEED DB; may be incomplete)

In [37]:
import pandas as pd
# Read the KO -> ModelSEED translation table; only the first column is used.
df_ko_modelseedreaction = pd.read_csv('Database\\KO_modelseed_translations.csv', header = None)

# Write one "KO <tab> rxnNNNNN" line per (KO, reaction) pair.
# - No variables named `list` or `str` are created here: shadowing those
#   builtins breaks every later cell that calls str(...) or list(...).
# - The file is rewritten ('w') on each run so that re-running the cell does
#   not append a second header and duplicate rows; `with` closes it on error too.
with open("Database\\KO_MODELSEED_DB.txt", 'w') as f:
    f.write('KO\tModelseed_reaction\n')

    for entry in df_ko_modelseedreaction[0]:
        # The first six characters are the KO identifier; everything from
        # index 7 onwards is split on the literal 'rxn' into reaction suffixes.
        ko = entry[0:6]
        rxn_suffixes = sorted(entry[7:].split('rxn'))
        for suffix in rxn_suffixes:
            if suffix != '':  # splitting leaves empty pieces around the 'rxn' markers
                f.write(ko+'\trxn'+suffix+'\n')


### download KEGG KO to KEGG reaction links and map them to ModelSEED (from the global pathway maps)

In [1]:
import requests
from lxml import etree
import re
import pandas as pd

# Seconds to wait for any single HTTP response (requests has no default timeout).
REQUEST_TIMEOUT = 60

# --- Step 1: collect the pathway map IDs listed on the KEGG pathway overview page ---
url = 'https://www.kegg.jp/kegg/pathway.html'
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54'}
html = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
e = etree.HTML(html.text)
# The <dt> text nodes may carry surrounding whitespace; strip it so that it
# does not end up inside the request URL, and drop whitespace-only nodes.
maplist = [text.strip() for text in e.xpath('//div[@class="list"]//dt/text()') if text.strip()]

# --- Step 2: for every map, pair each KO with each reaction named in the same <area> title ---
ko_pattern = re.compile(r"K\d{5}")
rxn_pattern = re.compile(r"R\d{5}")
ko_rxn_pairs = []  # collected as a list; building the frame once avoids quadratic .loc appends
for i, map_id in enumerate(maplist, start=1):
    KEGG_pathway = requests.get('https://www.kegg.jp/pathway/map'+map_id, headers=headers, timeout=REQUEST_TIMEOUT)
    pathway_tree = etree.HTML(KEGG_pathway.text)
    for areastr in pathway_tree.xpath('//area/@title'):
        ko_list = ko_pattern.findall(areastr)
        rx_list = rxn_pattern.findall(areastr)
        # Only titles mentioning at least one KO AND one reaction contribute pairs.
        if ko_list and rx_list:
            for ko in ko_list:
                for rx in rx_list:
                    ko_rxn_pairs.append([ko, rx])
    # f-string instead of str(i): stays correct even if `str` was rebound elsewhere.
    print(f'map:{map_id} done!    {i} / {len(maplist)}')

pd_ko_R = pd.DataFrame(ko_rxn_pairs, columns=['ko','rxn'], dtype=object)
pd_ko_R=pd_ko_R.drop_duplicates(subset=['ko', 'rxn'], keep='first')

# --- Step 3: attach the ModelSEED reaction ID of each KEGG reaction (left join keeps unmatched pairs) ---
df_keggID_modelseedID = pd.read_csv('Database\\reaction_keggID_modelseedID.txt', sep='\t').rename(columns={'Kegg_ID':'rxn'})
KEGG_ko_R_DB = pd.merge(pd_ko_R,df_keggID_modelseedID,how='left',on='rxn').drop_duplicates(subset=['ko','rxn'], keep='first')

KEGG_ko_R_DB.to_csv('Database\\KEGG_ko_R_DB.txt', header=True, index=False,sep='\t')


map:01100 done!    1 / 562
map:01110 done!    2 / 562
map:01120 done!    3 / 562
map:01200 done!    4 / 562
map:01210 done!    5 / 562
map:01212 done!    6 / 562
map:01230 done!    7 / 562
map:01232 done!    8 / 562
map:01250 done!    9 / 562
map:01240 done!    10 / 562
map:01220 done!    11 / 562
map:00010 done!    12 / 562
map:00020 done!    13 / 562
map:00030 done!    14 / 562
map:00040 done!    15 / 562
map:00051 done!    16 / 562
map:00052 done!    17 / 562
map:00053 done!    18 / 562
map:00500 done!    19 / 562
map:00520 done!    20 / 562
map:00620 done!    21 / 562
map:00630 done!    22 / 562
map:00640 done!    23 / 562
map:00650 done!    24 / 562
map:00660 done!    25 / 562
map:00562 done!    26 / 562
map:00190 done!    27 / 562
map:00195 done!    28 / 562
map:00196done!    29 / 562
map:00710 done!    30 / 562
map:00720 done!    31 / 562
map:00680 done!    32 / 562
map:00910 done!    33 / 562
map:00920 done!    34 / 562
map:00061 done!    35 / 562
map:00062 done!    36 / 562
ma

### download KEGG KO database (not recommended: too slow)

In [None]:
import requests
from lxml import etree
import re
import time
import pandas as pd

# Seconds to wait for any single HTTP response. requests has no default timeout,
# so without this value the Timeout handler below could never be triggered.
REQUEST_TIMEOUT = 60
# Only the first few KO entries are scraped because this approach is very slow.
MAX_KO_ENTRIES = 10

# --- Step 1: fetch the list of KO identifiers from the KEGG REST API ---
KEGG_KO_list = requests.get('https://rest.kegg.jp/list/ko', timeout=REQUEST_TIMEOUT)
KEGG_KO_list.raise_for_status()  # do not store an error page as the KO list

# Close (flush) the file before pandas reads it back; UTF-8 matches pandas' default.
kegg_ko_list_path = "Database\\KEGG_KO_list.txt"
with open(kegg_ko_list_path, 'w', encoding='utf-8') as f:
    f.write(KEGG_KO_list.text)

kegg_ko_list = pd.read_table(kegg_ko_list_path, header=None)
kegg_klist = kegg_ko_list[0].tolist()

# --- Step 2: scrape each KO entry page; write one row per (KO, reaction) pair ---
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54'}

# NOTE(review): unlike the other tables this file is written to the working
# directory rather than the Database folder; the location is kept unchanged.
# 'w' mode: the table is rebuilt on every run, appending would duplicate it.
with open("KEGG_KO_DB.txt", 'w') as f:
    # The header needs its own line terminator, otherwise the first data row
    # is glued onto the header line.
    f.write('ID\tSYMBOL\tNAME\tEC\tREACTION\n')

    for ko_id in kegg_klist[:MAX_KO_ENTRIES]:
        url ='https://www.genome.jp/entry/'+ko_id
        try:
            html = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        except requests.exceptions.Timeout:
            print("time out, try again")
            time.sleep(20)
            html = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        if not html.ok:
            # One retry after a pause; skip the KO if it still fails.
            print('wrong ko ID')
            time.sleep(20)
            html = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
            if not html.ok:
                continue

        e = etree.HTML(html.text)

        # Symbol: first text node of the "Symbol" row, or empty when the row is missing.
        symbol_content = e.xpath('//th/span[text()="Symbol"]/../../td//text()')
        symbol = f"{symbol_content[0]}" if symbol_content else ''

        name_content = e.xpath('//th/span[text()="Name"]/../../td//text()')
        name = ''.join(st.replace('\n','') for st in name_content)

        # EC is reset for every KO so that an entry without an EC annotation
        # neither raises NameError nor inherits the EC of the previous KO.
        EC = ''
        if 'EC:' in name:
            ec_matches = re.findall(r'\[.*?\]', name)
            if ec_matches:
                EC = ec_matches[0]  # first bracketed group, e.g. "[EC:...]"
            name = re.sub(r'\[.*?\]', '', name)

        # Every text node after the first in the "Other DBs" row that starts
        # with 'R' is taken to be a reaction ID.
        reaction_content = e.xpath('//th/span[text()="Other DBs"]/../../td//text()')
        for st in reaction_content[1:]:
            if st.startswith('R'):
                reaction = st
                print(ko_id + '   ' + reaction)
                f.write(ko_id+'\t'+symbol+'\t'+name+'\t'+EC+'\t'+reaction+'\n')


K00001   R00623
K00001   R00754
K00001   R02124
K00001   R04805
K00001   R04880
K00001   R05233
K00001   R05234
K00001   R06917
K00001   R06927
K00001   R07105
K00001   R08281
K00001   R08306
K00001   R08310
K00002   R00746
K00002   R01041
K00002   R01481
K00002   R05231
K00003   R01773
K00003   R01775
K00004   R02855
K00004   R02946
K00004   R10504
K00005   R01034
K00005   R10715
K00005   R10717
K00006   R00842
K00007   R00868
K00007   R05604
K00008   R00875
K00008   R01896
K00009   R02703
K00010   R01183
K00010   R09951


## download reactions from RAVEN

In [None]:
import pandas as pd
# Load Database//pathwaymap.csv as a header-less table (columns are addressed
# by position: 0, 1, ...). The 'utf-8-sig' codec drops a leading byte-order
# mark if the file has one. The RAVEN cell further down matches column 0
# against subsystem names and takes the ID from column 1.
pathwaymap_df = pd.read_csv('Database//pathwaymap.csv',encoding='utf-8-sig',header=None)

In [None]:
# Append a pathway-ID column to the RAVEN reaction table.
#
# Input : Database//keggrxn_fromRAVEN.txt    (tab-separated; field 5 holds the
#         subsystem name, or several names separated by ';')
# Output: Database//keggrxn_fromRAVENnew.txt (the first five input fields plus
#         the looked-up ID(s); for multi-subsystem rows every ID is followed by ';')
# Lookup: pathwaymap_df column 0 (subsystem name) -> column 1 (ID); unknown
#         names map to an empty string.

# Build the lookup once instead of scanning the whole DataFrame for every
# subsystem; setdefault keeps the FIRST row when a name occurs more than once.
pathway_id_by_name = {}
for pathway_name, pathway_id in zip(pathwaymap_df[0], pathwaymap_df[1]):
    pathway_id_by_name.setdefault(pathway_name, pathway_id)

# Both files are managed by `with`, so the input handle is closed as well.
with open('Database//keggrxn_fromRAVEN.txt','r',encoding='utf-8-sig') as fr:
    with open('Database//keggrxn_fromRAVENnew.txt','w',encoding='utf-8-sig') as fw:
        for line in fr:
            if not line.strip():
                continue  # skip blank lines (e.g. a trailing empty line)
            # Remove the line terminator first: otherwise it stays attached to
            # the last field, which breaks the lookup and the output row when
            # the subsystem is the last column.
            data = line.rstrip('\n').split('\t')
            subsystem_field = data[4]
            if ";" in subsystem_field:
                rnid = "".join(
                    pathway_id_by_name.get(subsystem, "") + ";"
                    for subsystem in subsystem_field.split(';')
                )
            else:
                rnid = pathway_id_by_name.get(subsystem_field, "")
            fw.write('\t'.join(data[:5])+'\t'+rnid+'\n')