# Download global KEGG pathway database

In [None]:
import os
if not os.path.exists('KEGG'):
    os.makedirs('KEGG')

##  Download KEGG pathway id/names

In [None]:
import requests
from lxml import etree
import json

pathways_name_dir = {}

url = 'https://www.kegg.jp/kegg/pathway.html'
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54'}
html = requests.get(url,headers = headers)
e = etree.HTML(html.text)
idlist = e.xpath('//div[@class="list"]//dt/text()')
print('%d pathways ID found.'%len(idlist))
namelist = e.xpath('//div[@class="list"]//dd/a[1]/text()')
print('%d pathways name found.'%len(namelist))
if len(idlist) == len(namelist):
    for i in range(len(idlist)):
        pathways_name_dir['map'+idlist[i].strip()] = namelist[i].strip()
else:
    print('Error!')

with open('KEGG\\KEGG_pathways_name.json', 'w') as f:
    f.write(json.dumps(pathways_name_dir))
f.close()

## Download KO, EC, R entries in KEGG pathways

In [18]:
import pandas as pd
import requests
from lxml import etree
import json

with open('KEGG\\KEGG_pathways_name.json', 'r') as f:
    pathways_name_dir = json.load(f)
f.close()

pathways_list = [i for i in pathways_name_dir.keys()]

KEGG_pathways_entrys = pd.DataFrame(columns=['pw_ID','pw_name','coords','href','title'])

headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54'}
m = 1
for pathway in pathways_list:
    pathway_name = pathways_name_dir[pathway]
    KEGG_pathway = requests.get('https://www.kegg.jp/pathway/'+pathway,headers = headers)
    e = etree.HTML(KEGG_pathway.text)
    hreflist = e.xpath('//map[@id="mapdata"]/area/@href')
    coordslist = e.xpath('//map[@id="mapdata"]/area/@coords')
    entrylist = e.xpath('//map[@id="mapdata"]/area/@title')
    print('found %d entries in %s'%(len(hreflist),pathway))
    if not len(hreflist) == len(entrylist):
        print(pathway,'Error!')
        continue
    for i in range(len(hreflist)):
        href = hreflist[i]
        coords = coordslist[i]
        title = entrylist[i]
        KEGG_pathways_entrys.loc[len(KEGG_pathways_entrys.index)]=[pathway,pathway_name,coords,href,title]

    print('map:'+pathway+'done!    '+str(m)+' / '+str(len(pathways_list)))
    m=m+1

KEGG_pathways_entrys.to_csv('KEGG\\KEGG_pathways_entrys.txt', header=True, index=False,sep='\t')

found 7743 entries in map01100
map:map01100done!    1 / 563
found 4836 entries in map01110
map:map01110done!    2 / 563
found 2462 entries in map01120
map:map01120done!    3 / 563
found 292 entries in map01200
map:map01200done!    4 / 563
found 296 entries in map01210
map:map01210done!    5 / 563
found 333 entries in map01212
map:map01212done!    6 / 563
found 288 entries in map01230
map:map01230done!    7 / 563
found 181 entries in map01232
map:map01232done!    8 / 563
found 414 entries in map01250
map:map01250done!    9 / 563
found 669 entries in map01240
map:map01240done!    10 / 563
found 590 entries in map01220
map:map01220done!    11 / 563
found 101 entries in map00010
map:map00010done!    12 / 563
found 71 entries in map00020
map:map00020done!    13 / 563
found 118 entries in map00030
map:map00030done!    14 / 563
found 160 entries in map00040
map:map00040done!    15 / 563
found 156 entries in map00051
map:map00051done!    16 / 563
found 117 entries in map00052
map:map00052done!

## Read R entries

In [19]:
import pandas as pd
import re

KEGG_pathways_reactions = pd.DataFrame(columns=['pw_ID','pw_name','coords','href','title','R_ID'])
KEGG_pathways_entrys = pd.read_csv('KEGG\\KEGG_pathways_entrys.txt', sep='\t',dtype={'pw_ID':str,'pw_name':str,'href':str,'title':str}).drop_duplicates()
for index, row in KEGG_pathways_entrys.iterrows():
    rs = re.findall(r'R\d{5}',row['title'])
    if rs:
        for r in rs:
            KEGG_pathways_reactions.loc[len(KEGG_pathways_reactions.index)]=[row['pw_ID'],row['pw_name'],row['coords'],row['href'],row['title'],r]
     
KEGG_pathways_reactions.drop_duplicates().reset_index().to_csv('KEGG\\KEGG_pathways_reactions.txt',sep='\t',index=False)

## Read KO entries

In [20]:
import pandas as pd
import re

KEGG_pathways_kos = pd.DataFrame(columns=['pw_ID','pw_name','coords','href','title','KO_ID'])
KEGG_pathways_entrys = pd.read_csv('KEGG\\KEGG_pathways_entrys.txt', sep='\t',dtype={'pw_ID':str,'pw_name':str,'href':str,'title':str}).drop_duplicates()
for index, row in KEGG_pathways_entrys.iterrows():
    kos = re.findall(r'K\d{5}',row['title'])
    if kos:
        for ko in kos:
            KEGG_pathways_kos.loc[len(KEGG_pathways_kos.index)]=[row['pw_ID'],row['pw_name'],row['coords'],row['href'],row['title'],ko]
     
KEGG_pathways_kos.drop_duplicates().reset_index().to_csv('KEGG\\KEGG_pathways_kos.txt',sep='\t',index=False)

## Download map picture

In [5]:
import pandas as pd
import requests
import json

with open('KEGG\\KEGG_pathways_name.json', 'r') as f:
    pathways_name_dir = json.load(f)
f.close()

pathways_list = [i for i in pathways_name_dir.keys()]

headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54'}

m = 1
for pathway in pathways_list:
    downloadurl = 'https://www.kegg.jp/kegg/pathway/map/'+pathway+'@2x.png'
    headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54'}
    pict=requests.get(downloadurl,headers = headers).content
    with open('KEGG\\maps\\maps\\'+pathway+'.png','wb') as f:
        f.write(pict)
    print(pathway+' done!    '+str(m)+' / '+str(len(pathways_list)))
    m=m+1

KeyboardInterrupt: 

## show reaction on map picture

In [1]:
import myModules.KEGG2BIGG as K2B
unikegglist = list(set(K2B.KEGG_pathways_reactions['R_ID'].tolist()))
a = K2B.pd.read_csv('KEGG\\Reaction_in_pathway_count.csv',index_col=0,squeeze=True)



  a = K2B.pd.read_csv('KEGG\\Reaction_in_pathway_count.csv',index_col=0,squeeze=True)


In [7]:
from PIL import Image, ImageFont, ImageDraw

def drawtext(draw,text,coords_list):
    box = draw.textbbox((0,0),text, font=font)
    w = box[2] - box[0]
    h = box[3] - box[1]
    if type(coords_list) == list:
        coords_num = [2*int(i) for i in coords_list]
        x = (coords_num[0]+coords_num[2])/2
        y = (coords_num[1]+coords_num[3])/2
        draw.rectangle(coords_num, fill=(237, 245, 249), outline=(0,0,0),width=1)
        draw.text((x-w/2,y-h/2), text, font=font,fill=(0,0,0))
    else:
        print('coords_list should be a list')

map_list=[key for key in a.keys()]
#map_list = ['map00030']
font = ImageFont.truetype("arial.ttf", 18)
for mapp in map_list:
    map_reactions = K2B.KEGG_pathways_reactions[K2B.KEGG_pathways_reactions['pw_ID']==mapp]
    img = Image.open("KEGG\\maps\\maps\\%s.png"%mapp)
    draw = ImageDraw.Draw(img)
    for r in map_reactions['R_ID']:
        text = r
        findresult = map_reactions[map_reactions['R_ID']==r]['coords'].to_list()
        for i in findresult:
            coords_list = i.split(',')
            if len(coords_list) == 4:
                drawtext(draw,text,coords_list)
    img.save('KEGG\\maps\\mapasR\\%s_asR.png'%mapp)

# Download KEGG reaction/compound database

In [None]:
# download KEGG reactions
import requests
from lxml import etree
import re
import time
import pandas as pd

KEGG_REACTIONS_list = requests.get('https://rest.kegg.jp/list/reaction')
f = open("KEGG\\KEGG_REACTIONS_list.txt", 'w')
f.write(KEGG_REACTIONS_list.text)

kegg_rxn_list = pd.read_table('KEGG\\KEGG_REACTIONS_list.txt',header=None)
kegg_rlist = [st.replace('rn:','') for st in kegg_rxn_list[0]]

headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54'}
f = open("KEGG\\KEGG_REACTIONS_DB.txt", 'a')
f.write('ID\tNAMES\tKEGG\tEQUATION\tECs\n')

for id in kegg_rlist:
    url ='https://www.kegg.jp/entry/'+id
    try:
        html = requests.get(url,headers = headers)
    except requests.exceptions.Timeout:
        print("time out, try again")
        time.sleep(20)
        html = requests.get(url,headers = headers)
    if not html.ok:
        print('wrong reaction ID')
        time.sleep(20)
        html = requests.get(url,headers = headers)
        if not html.ok:
            continue

    e = etree.HTML(html.text)

    name_content = e.xpath('//th/span[text()="Name"]/../../td/div/div/text()')
    name = ''
    for st in name_content[:-1]:
        name = name +st.replace('\n','').replace(';','|')
    
    equation_content = e.xpath('//th/span[text()="Equation"]/../../td/div//text()')
    equation = ''
    for st in equation_content[:-1]:
        if st.startswith('C'):
            st = st+'[0]'
        else:
            matches = re.compile(r"\d+").findall(st)
            if matches:
                match = matches[0]
                st = st.replace(match, f"({match})")    
        equation = equation + st
    
    EC_content = e.xpath('//th/span[text()="Enzyme"]/../../td/div//text()')
    EC1 = [i.replace(u'\xa0','\n') for i in EC_content]
    EC2 = [i for i in EC1 if not str(i).startswith('\n')]
    EC = ''
    for st in EC2:
        EC = EC + st + '|'
    EC = EC[:-1]
    f.write(id+'\t'+name+'\t'+id+'\t'+equation+'\t'+EC+'\n')
f.close()


In [None]:
# KEGG reaction to ModelSEED reaction
import modelseedpy

modelseed_path = 'C:\\Users\\vickenlee\\ModelSEEDDatabase'
modelseed = modelseedpy.biochem.from_local(modelseed_path)

df_keggID_modelseedID = pd.DataFrame(columns=["Kegg_ID","ModelSeed_ID"])
for (key,value) in modelseed.reaction_aliases.items():
    ModelSeed_ID = key
    try:    
        Kegg_ID = list(value.get('KEGG'))[0]
    except:
            Kegg_ID =''
    df_keggID_modelseedID.loc[len(df_keggID_modelseedID.index)]=[Kegg_ID,ModelSeed_ID]
df_keggID_modelseedID.to_csv('KEGG\\KEGG_reaction_keggID_modelseedID.txt',  header=True, index=False,sep='\t')

print('done!')

In [None]:
# download KEGG compounds and map to ModelSEED compounds

KEGG_COMPOUNDS_list = requests.get('https://rest.kegg.jp/list/compound')
f = open("KEGG\\KEGG_COMPOUNDS_list.txt", 'w')
f.write(KEGG_COMPOUNDS_list.text)
kegg_cpd_list = pd.read_table('KEGG\\KEGG_COMPOUNDS_list.txt',header=None)
KEGG_COMPOUNDS_name = pd.DataFrame(columns=['ID','Name'])
KEGG_COMPOUNDS_name['ID']=[st.replace('cpd:','') for st in kegg_cpd_list[0]]
KEGG_COMPOUNDS_name['Name']=[st.replace('; ','|') for st in kegg_cpd_list[1]]


import modelseedpy
modelseed_path = 'C:\\Users\\vickenlee\\ModelSEEDDatabase'
modelseed = modelseedpy.biochem.from_local(modelseed_path)
cpd_keggID_modelseedID = pd.DataFrame(columns=["Kegg_ID","ModelSeed_ID","formula","charge"])
for (key,value) in modelseed.compound_aliases.items():
    ModelSeed_ID = key
    compound = modelseed.get_seed_compound(ModelSeed_ID)
    formula = compound.formula
    charge = compound.data['charge']
    try:    
        find_Kegg = list(value.get('KEGG'))
    except:
            find_Kegg =''
    if find_Kegg:
        for Kegg_ID in find_Kegg:
            cpd_keggID_modelseedID.loc[len(cpd_keggID_modelseedID.index)]=[Kegg_ID,ModelSeed_ID,formula,charge]
cpd_keggID_modelseedID.to_csv('KEGG\\KEGG_compound_keggID_modelseedID.txt', header=True,sep='\t',index=False)
cpd_keggID_modelseedID=cpd_keggID_modelseedID.rename(columns={'Kegg_ID':'ID'})


KEGG_COMPOUNDS_DB = pd.merge(KEGG_COMPOUNDS_name,cpd_keggID_modelseedID,how='left',on='ID').drop_duplicates(subset=['ID'], keep='first')


from lxml import etree
for index, row in KEGG_COMPOUNDS_DB.iterrows():
    if not row['formula']==row['formula']:
        url ='https://www.kegg.jp/entry/'+row['ID']
        try:
            html = requests.get(url,headers = headers)
        except requests.exceptions.Timeout:
            time.sleep(20)
            html = requests.get(url,headers = headers)
        e = etree.HTML(html.text)
        formula_content = e.xpath('//th/span[text()="Formula"]/../../td/div/text()')
        if formula_content:
            formula = formula_content[0]
            KEGG_COMPOUNDS_DB['formula'][index]=formula
            print(row['ID']+': add formula')
        else:
            print(row['ID']+': does not have formula')

KEGG_COMPOUNDS_DB.to_csv('Database\\KEGG_COMPOUNDS_DB.txt',sep='\t',index=None)

# Download species KEGG

In [41]:
org_name = 'mko'
org_full_name = 'Methylomonas_koyamae'

In [None]:
org_name = 'mbry'
org_full_name = 'Methylocystis_bryophila'

In [None]:
org_name = 'mca'
org_full_name = 'Methylococcus_capsulatus'

In [None]:
import myModules.KEGGdownload as kd

In [None]:
pwlist = kd.getpwlist(org_name)

In [None]:
kd.downloadxml(org_name,pwlist)

In [None]:
df_reaction = kd.getentriesfromxml(org_name)

# Get KEGG reaction from genome

In [9]:
org_name = 'mko'
org_full_name = 'Methylomonas_koyamae'

In [1]:
org_name = 'mbry'
org_full_name = 'Methylocystis_bryophila'

In [None]:
org_name = 'mca'
org_full_name = 'Methylococcus_capsulatus'

In [4]:
import os
faa_paths=[]
for root, dirs, files in os.walk('genome\\'+org_full_name+'_genome'):
    if 'protein.faa' in files:
        genome_path = root+'\\protein.faa'
print(genome_path)

genome\Methylocystis_bryophila_genome\ncbi_dataset\data\GCA_027925445.1\protein.faa



1. upload protein.faa to https://www.kegg.jp/blastkoala/

2. download the blast result to genome\\XXX_genome\\user_ko.txt

In [10]:
import myModules.KEGG2BIGG as K2B
import pandas as pd

df_org_ko = pd.read_csv('genome\\'+org_full_name+'_genome\\user_ko.txt',sep='\t',header=None)
df_org_ko.columns = ['Genes','KO_ID']
ko_list = df_org_ko['KO_ID'].tolist()
R_list =[]
for ko in ko_list:
    try:
        R = K2B.getRfromKO(ko)
        if type(R) == list:
            R_list+=R
        elif type(R) == str:
            R_list.append(R)
    except:
        continue
R_list = list(set(R_list))
ko_list = list(set([i for i in ko_list if i==i and not i == 'ko']))
with open('genome\\'+org_full_name+'_genome\\R_list.txt','w') as f:
    for R in R_list:
        f.write(str(R)+'\n')
f.close()
with open('genome\\'+org_full_name+'_genome\\ko_list.txt','w') as f:
    for ko in ko_list:
        f.write(str(ko)+'\n')

Cannot find R for K02950!
Cannot find R for K02110!
Cannot find R for K02946!
Cannot find R for K02948!
Cannot find R for K02878!
Cannot find R for K07689!
Cannot find R for K03563!
Cannot find R for K03581!
Cannot find R for K02403!
Cannot find R for K08303!
Cannot find R for K03470!
Cannot find R for K06985!
Cannot find R for K07258!
Cannot find R for K06985!
Cannot find R for K03469!
Cannot find R for K07712!
Cannot find R for K07708!
Cannot find R for K01142!
Cannot find R for K02274!
Cannot find R for K04043!
Cannot find R for K09860!
Cannot find R for K19294!
Cannot find R for K03666!
Cannot find R for K10942!
Cannot find R for K02408!
Cannot find R for K02409!
Cannot find R for K02410!
Cannot find R for K02413!
Cannot find R for K02990!
Cannot find R for K02963!
Cannot find R for K10954!
Cannot find R for K02455!
Cannot find R for K02456!
Cannot find R for K02458!
Cannot find R for K09691!
Cannot find R for K02454!
Cannot find R for K02460!
Cannot find R for K03071!
Cannot find 

### show reaction on map picture

In [11]:
import myModules.KEGG2BIGG as K2B
with open('genome\\'+org_full_name+'_genome\\ko_list.txt','r') as f:
    ko_list = f.read().splitlines()
f.close()
a = K2B.pathway_count(ko_list)

map02020 81 Two-component system
map03010 54 Ribosome
map00230 50 Purine metabolism
map00680 50 Methane metabolism
map02010 45 ABC transporters
map02040 43 Flagellar assembly
map00860 38 Porphyrin metabolism
map02024 34 Quorum sensing
map03070 34 Bacterial secretion system
map00620 33 Pyruvate metabolism
map00520 33 Amino sugar and nucleotide sugar metabolism
map00270 31 Cysteine and methionine metabolism
map00190 31 Oxidative phosphorylation
map00790 28 Folate biosynthesis
map00240 27 Pyrimidine metabolism
map05111 26 Biofilm formation - Vibrio cholerae
map00720 26 Carbon fixation pathways in prokaryotes
map00630 25 Glyoxylate and dicarboxylate metabolism
map00010 25 Glycolysis / Gluconeogenesis
map00970 25 Aminoacyl-tRNA biosynthesis
map02025 24 Biofilm formation - Pseudomonas aeruginosa
map00920 24 Sulfur metabolism
map03440 22 Homologous recombination
map00910 22 Nitrogen metabolism
map00260 22 Glycine, serine and threonine metabolism
map00030 21 Pentose phosphate pathway
map02026 

In [12]:
from PIL import Image, ImageFont, ImageDraw

def drawtext(draw,coords_list):
    if type(coords_list) == list:
        coords_num = [2*int(i) for i in coords_list]
        draw.rectangle(coords_num, fill=None, outline=(252, 85, 49),width=5)
    else:
        print('coords_list should be a list')

map_list=[key for key in a.keys()]
#map_list = ['map00680']
font = ImageFont.truetype("arial.ttf", 18)
for mapp in map_list:
    map_kos = K2B.KEGG_pathways_kos[K2B.KEGG_pathways_kos['pw_ID']==mapp]
    try:
        img = Image.open("KEGG\\maps\\mapedinBIGGandR\\%s_biggandR.png"%mapp)
    except:
        continue
    draw = ImageDraw.Draw(img)
    for ko in ko_list:
        text = ko
        findresult = map_kos[map_kos['KO_ID']==ko]['coords'].to_list()
        for i in findresult:
            coords_list = i.split(',')
            if len(coords_list) == 4:
                drawtext(draw,coords_list)
    img.save('KEGG\\maps\\%s_maps\\%s_%s.png'%(org_name,mapp,org_name))