# Download global KEGG pathway database

In [None]:
import os
if not os.path.exists('KEGG'):
    os.makedirs('KEGG')

##  Download KEGG pathway id/names

In [None]:
import requests
from lxml import etree
import json

pathways_name_dir = {}

url = 'https://www.kegg.jp/kegg/pathway.html'
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54'}
html = requests.get(url,headers = headers)
e = etree.HTML(html.text)
idlist = e.xpath('//div[@class="list"]//dt/text()')
print('%d pathways ID found.'%len(idlist))
namelist = e.xpath('//div[@class="list"]//dd/a[1]/text()')
print('%d pathways name found.'%len(namelist))
if len(idlist) == len(namelist):
    for i in range(len(idlist)):
        pathways_name_dir['map'+idlist[i].strip()] = namelist[i].strip()
else:
    print('Error!')

with open('KEGG\\KEGG_pathways_name.json', 'w') as f:
    f.write(json.dumps(pathways_name_dir))
f.close()

## Download KO, EC, R entries in KEGG pathways

In [None]:
import pandas as pd
import requests
from lxml import etree
import json

with open('KEGG\\KEGG_pathways_name.json', 'r') as f:
    pathways_name_dir = json.load(f)
f.close()

pathways_list = [i for i in pathways_name_dir.keys()]

KEGG_pathways_entrys = pd.DataFrame(columns=['pw_ID','pw_name','coords','href','title'])

headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54'}
m = 1
for pathway in pathways_list:
    pathway_name = pathways_name_dir[pathway]
    KEGG_pathway = requests.get('https://www.kegg.jp/pathway/'+pathway,headers = headers)
    e = etree.HTML(KEGG_pathway.text)
    hreflist = e.xpath('//map[@id="mapdata"]/area/@href')
    coordslist = e.xpath('//map[@id="mapdata"]/area/@coords')
    entrylist = e.xpath('//map[@id="mapdata"]/area/@title')
    print('found %d entries in %s'%(len(hreflist),pathway))
    if not len(hreflist) == len(entrylist):
        print(pathway,'Error!')
        continue
    for i in range(len(hreflist)):
        href = hreflist[i]
        coords = coordslist[i]
        title = entrylist[i]
        KEGG_pathways_entrys.loc[len(KEGG_pathways_entrys.index)]=[pathway,pathway_name,coords,href,title]

    print('map:'+pathway+'done!    '+str(m)+' / '+str(len(pathways_list)))
    m=m+1

KEGG_pathways_entrys.to_csv('KEGG\\KEGG_pathways_entrys.txt', header=True, index=False,sep='\t')

## Read R entries

In [None]:
import pandas as pd
import re

KEGG_pathways_entrys = pd.read_csv('KEGG\\KEGG_pathways_entrys.txt', sep='\t',dtype={'pw_ID':str,'pw_name':str,'href':str,'title':str}).drop_duplicates()
KEGG_pathways_reactions = KEGG_pathways_entrys[KEGG_pathways_entrys['title'].str.contains(r'R\d{5}')]
ids =KEGG_pathways_entrys['title'].str.extract(r'(R\d{5})')
KEGG_pathways_reactions=KEGG_pathways_reactions.assign(R_ID = ids).drop_duplicates().reset_index()
KEGG_pathways_reactions.to_csv('KEGG\\KEGG_pathways_reactions.txt',sep='\t',index=False)


## Download map picture

In [None]:
import pandas as pd
import requests
import json

with open('KEGG\\KEGG_pathways_name.json', 'r') as f:
    pathways_name_dir = json.load(f)
f.close()

pathways_list = [i for i in pathways_name_dir.keys()]

headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54'}

m = 1
for pathway in pathways_list:
    downloadurl = 'https://www.kegg.jp/kegg/pathway/map/'+pathway+'@2x.png'
    headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54'}
    pict=requests.get(downloadurl,headers = headers).content
    with open('KEGG\\maps\\'+pathway+'.png','wb') as f:
        f.write(pict)
    print(pathway+' done!    '+str(m)+' / '+str(len(pathways_list)))
    m=m+1

# Download KEGG reaction/compound database

In [None]:
# download KEGG reactions
import requests
from lxml import etree
import re
import time
import pandas as pd

KEGG_REACTIONS_list = requests.get('https://rest.kegg.jp/list/reaction')
f = open("KEGG\\KEGG_REACTIONS_list.txt", 'w')
f.write(KEGG_REACTIONS_list.text)

kegg_rxn_list = pd.read_table('KEGG\\KEGG_REACTIONS_list.txt',header=None)
kegg_rlist = [st.replace('rn:','') for st in kegg_rxn_list[0]]

headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54'}
f = open("KEGG\\KEGG_REACTIONS_DB.txt", 'a')
f.write('ID\tNAMES\tKEGG\tEQUATION\tECs\n')

for id in kegg_rlist:
    url ='https://www.kegg.jp/entry/'+id
    try:
        html = requests.get(url,headers = headers)
    except requests.exceptions.Timeout:
        print("time out, try again")
        time.sleep(20)
        html = requests.get(url,headers = headers)
    if not html.ok:
        print('wrong reaction ID')
        time.sleep(20)
        html = requests.get(url,headers = headers)
        if not html.ok:
            continue

    e = etree.HTML(html.text)

    name_content = e.xpath('//th/span[text()="Name"]/../../td/div/div/text()')
    name = ''
    for st in name_content[:-1]:
        name = name +st.replace('\n','').replace(';','|')
    
    equation_content = e.xpath('//th/span[text()="Equation"]/../../td/div//text()')
    equation = ''
    for st in equation_content[:-1]:
        if st.startswith('C'):
            st = st+'[0]'
        else:
            matches = re.compile(r"\d+").findall(st)
            if matches:
                match = matches[0]
                st = st.replace(match, f"({match})")    
        equation = equation + st
    
    EC_content = e.xpath('//th/span[text()="Enzyme"]/../../td/div//text()')
    EC1 = [i.replace(u'\xa0','\n') for i in EC_content]
    EC2 = [i for i in EC1 if not str(i).startswith('\n')]
    EC = ''
    for st in EC2:
        EC = EC + st + '|'
    EC = EC[:-1]
    f.write(id+'\t'+name+'\t'+id+'\t'+equation+'\t'+EC+'\n')
f.close()


In [None]:
# KEGG reaction to ModelSEED reaction
import modelseedpy

modelseed_path = 'C:\\Users\\vickenlee\\ModelSEEDDatabase'
modelseed = modelseedpy.biochem.from_local(modelseed_path)

df_keggID_modelseedID = pd.DataFrame(columns=["Kegg_ID","ModelSeed_ID"])
for (key,value) in modelseed.reaction_aliases.items():
    ModelSeed_ID = key
    try:    
        Kegg_ID = list(value.get('KEGG'))[0]
    except:
            Kegg_ID =''
    df_keggID_modelseedID.loc[len(df_keggID_modelseedID.index)]=[Kegg_ID,ModelSeed_ID]
df_keggID_modelseedID.to_csv('KEGG\\KEGG_reaction_keggID_modelseedID.txt',  header=True, index=False,sep='\t')

print('done!')

In [None]:
# download KEGG compounds and map to ModelSEED compounds

KEGG_COMPOUNDS_list = requests.get('https://rest.kegg.jp/list/compound')
f = open("KEGG\\KEGG_COMPOUNDS_list.txt", 'w')
f.write(KEGG_COMPOUNDS_list.text)
kegg_cpd_list = pd.read_table('KEGG\\KEGG_COMPOUNDS_list.txt',header=None)
KEGG_COMPOUNDS_name = pd.DataFrame(columns=['ID','Name'])
KEGG_COMPOUNDS_name['ID']=[st.replace('cpd:','') for st in kegg_cpd_list[0]]
KEGG_COMPOUNDS_name['Name']=[st.replace('; ','|') for st in kegg_cpd_list[1]]


import modelseedpy
modelseed_path = 'C:\\Users\\vickenlee\\ModelSEEDDatabase'
modelseed = modelseedpy.biochem.from_local(modelseed_path)
cpd_keggID_modelseedID = pd.DataFrame(columns=["Kegg_ID","ModelSeed_ID","formula","charge"])
for (key,value) in modelseed.compound_aliases.items():
    ModelSeed_ID = key
    compound = modelseed.get_seed_compound(ModelSeed_ID)
    formula = compound.formula
    charge = compound.data['charge']
    try:    
        find_Kegg = list(value.get('KEGG'))
    except:
            find_Kegg =''
    if find_Kegg:
        for Kegg_ID in find_Kegg:
            cpd_keggID_modelseedID.loc[len(cpd_keggID_modelseedID.index)]=[Kegg_ID,ModelSeed_ID,formula,charge]
cpd_keggID_modelseedID.to_csv('KEGG\\KEGG_compound_keggID_modelseedID.txt', header=True,sep='\t',index=False)
cpd_keggID_modelseedID=cpd_keggID_modelseedID.rename(columns={'Kegg_ID':'ID'})


KEGG_COMPOUNDS_DB = pd.merge(KEGG_COMPOUNDS_name,cpd_keggID_modelseedID,how='left',on='ID').drop_duplicates(subset=['ID'], keep='first')


from lxml import etree
for index, row in KEGG_COMPOUNDS_DB.iterrows():
    if not row['formula']==row['formula']:
        url ='https://www.kegg.jp/entry/'+row['ID']
        try:
            html = requests.get(url,headers = headers)
        except requests.exceptions.Timeout:
            time.sleep(20)
            html = requests.get(url,headers = headers)
        e = etree.HTML(html.text)
        formula_content = e.xpath('//th/span[text()="Formula"]/../../td/div/text()')
        if formula_content:
            formula = formula_content[0]
            KEGG_COMPOUNDS_DB['formula'][index]=formula
            print(row['ID']+': add formula')
        else:
            print(row['ID']+': does not have formula')

KEGG_COMPOUNDS_DB.to_csv('Database\\KEGG_COMPOUNDS_DB.txt',sep='\t',index=None)

# Download species KEGG

In [None]:
org_name = 'mko'
org_full_name = 'Methylomonas_koyamae'

In [None]:
org_name = 'mbry'
org_full_name = 'Methylocystis_bryophila'

In [None]:
org_name = 'mca'
org_full_name = 'Methylococcus_capsulatus'

In [None]:
import myModules.KEGGdownload as kd

In [None]:
pwlist = kd.getpwlist(org_name)

In [None]:
kd.downloadxml(org_name,pwlist)

In [None]:
df_reaction = kd.getentriesfromxml(org_name)

# Get KEGG reaction from genome

In [None]:
org_name = 'mko'
org_full_name = 'Methylomonas_koyamae'

In [1]:
org_name = 'mbry'
org_full_name = 'Methylocystis_bryophila'

In [None]:
org_name = 'mca'
org_full_name = 'Methylococcus_capsulatus'

In [4]:
import os
faa_paths=[]
for root, dirs, files in os.walk('genome\\'+org_full_name+'_genome'):
    if 'protein.faa' in files:
        genome_path = root+'\\protein.faa'
print(genome_path)

genome\Methylocystis_bryophila_genome\ncbi_dataset\data\GCA_027925445.1\protein.faa



1. upload protein.faa to https://www.kegg.jp/blastkoala/

2. download the blast result to genome\\XXX_genome\\user_ko.txt

In [5]:
from myModules.BIGG2KEGG import getRfromKO
import pandas as pd

df_org_ko = pd.read_csv('genome\\'+org_full_name+'_genome\\user_ko.txt',sep='\t',header=None)
df_org_ko.columns = ['Genes','KO_ID']
ko_list = df_org_ko['KO_ID'].tolist()
R_list =[]
for ko in ko_list:
    try:
        R = getRfromKO(ko)
        if type(R) == list:
            R_list+=R
        elif type(R) == str:
            R_list.append(R)
    except:
        continue
R_list = list(set(R_list))

with open('genome\\'+org_full_name+'_genome\\R_list.txt','w') as f:
    for R in R_list:
        f.write(str(R)+'\n')
f.close()

Cannot find R for K02884!
Cannot find R for K17686!
Cannot find R for K02911!
Cannot find R for K02017!
Cannot find R for K03553!
Cannot find R for K03584!
Cannot find R for K03575!
Cannot find R for K02297!
Cannot find R for K02298!
Cannot find R for K02299!
Cannot find R for K05592!
Cannot find R for K02584!
Cannot find R for K04761!
Cannot find R for K13924!
Cannot find R for K02887!
Cannot find R for K02916!
Cannot find R for K09689!
Cannot find R for K10107!
Cannot find R for K09688!
Cannot find R for K10107!
Cannot find R for K02040!
Cannot find R for K03210!
Cannot find R for K03074!
Cannot find R for K11903!
Cannot find R for K14987!
Cannot find R for K03671!
Cannot find R for K00333!
Cannot find R for K14987!
Cannot find R for K13924!
Cannot find R for K14986!
Cannot find R for K14987!
Cannot find R for K01991!
Cannot find R for K16554!
Cannot find R for K01338!
Cannot find R for K03544!
Cannot find R for K03070!
Cannot find R for K07716!
Cannot find R for K02919!
Cannot find 

### import from KEGG BLASTKOALA