In [1]:
import json
import pandas as pd
import urllib
import requests

In [2]:
import glob

def parseFile(path):
    genes = []
    with open(path) as f:
        lines = f.readlines()
    index = False
    i = 1
    for line in lines:
        if len(line) < 5:
            next
        if line[0:4] == 'GENE':
            index = True
        if line.split()[0] in ['COMPOUND','REL_PATHWAY','REFERENCE'] :
            index = False
        if index:
            if i == 1:
                g = line.split()[2]
            else:
                #print(g)
                g = line.split()[1]
            i += 1
            #print(g)
            genes += [g[0:len(g)-1]]
    return genes


## step1：下载kegg的所有pathway对应的json文件
https://www.genome.jp/kegg-bin/get_htext#A1

In [3]:
kegg_path = 'E:/pathways/KEGG_2021/ko00001.json'
with open(kegg_path,encoding='utf-8') as f:
    kegg = json.load(f)

In [4]:
kegg.keys()

dict_keys(['name', 'children'])

In [8]:
len(kegg)

2

In [5]:
for k in kegg['children']:
    print(k['name'])

09100 Metabolism
09120 Genetic Information Processing
09130 Environmental Information Processing
09140 Cellular Processes
09150 Organismal Systems
09160 Human Diseases
09180 Brite Hierarchies
09190 Not Included in Pathway or Brite


## step2：拿出metabolism对应的pathway的ID

In [5]:
metab_lis = kegg['children'][6]['children']

In [6]:
meta_sum_names = []
meta_names = []
hsa_names = []
for m in metab_lis:
    meta_sum_name = m['name']
    for m_2 in m['children']:
        meta_name_lis = m_2['name'].split('[')
        meta_name = meta_name_lis[0]
        if len(meta_name_lis)>1 and len(meta_name_lis[1])>0:
            meta_names += [meta_name[6:len(meta_name)]]
            hsa_names += ['hsa'+meta_name_lis[1][7:len(meta_name_lis[1])-1]]
            meta_sum_names += [meta_sum_name]

In [7]:
meta_pathway_df = pd.DataFrame({'sum_name' : meta_sum_names, 
              'meta_name' : meta_names, 
              'hsa_name' : hsa_names})

In [8]:
meta_pathway_df.head()

Unnamed: 0,sum_name,meta_name,hsa_name
0,09181 Protein families: metabolism,Enzymes,hsa000
1,09181 Protein families: metabolism,Protein kinases,hsa001
2,09181 Protein families: metabolism,Protein phosphatases and associated proteins,hsa009
3,09181 Protein families: metabolism,Peptidases and inhibitors,hsa002
4,09181 Protein families: metabolism,Glycosyltransferases,hsa003


In [9]:
meta_pathway_df.shape

(55, 3)

## step3：下载对应链接

In [10]:
folder = "E:/pathways/KEGG_2021/brite_hierarchies/"
for hsa in meta_pathway_df['hsa_name']:
    print(hsa)
    url = 'http://rest.kegg.jp/get/' + hsa
    r = requests.get(url)
    with open(folder+hsa+'.txt', "w") as code:
        code.write(r.content.decode())

hsa000
hsa001
hsa009
hsa002
hsa003
hsa005
hsa011
hsa004
hsa008
hsa006
hsa007
hsa199
hsa194
hsa000
hsa021
hsa019
hsa041
hsa011
hsa009
hsa016
hsa012
hsa110
hsa131
hsa121
hsa051
hsa032
hsa036
hsa400
hsa029
hsa000
hsa044
hsa042
hsa022
hsa035
hsa812
hsa147
hsa048
hsa030
hsa050
hsa054
hsa310
hsa040
hsa031
hsa052
hsa515
hsa090
hsa504
hsa535
hsa536
hsa537
hsa091
hsa990
hsa200
hsa210
hsa100


## step4：处理下载得到的文件

In [11]:

files = glob.glob('E:/pathways/KEGG_2021/brite_hierarchies/hsa*')
res = {}
for f in files:
    f_name = f.split('\\')[1].split(r'.')[0]
    print(f_name)
    res[f_name] = parseFile(f)
    #break

hsa000
hsa001
hsa002
hsa003
hsa004
hsa005
hsa006
hsa007
hsa008
hsa009
hsa011
hsa012
hsa016
hsa019
hsa021
hsa022
hsa029
hsa030
hsa031
hsa032
hsa035
hsa036
hsa040
hsa041
hsa042
hsa044
hsa048
hsa050
hsa051
hsa052
hsa054
hsa090
hsa091
hsa100
hsa110
hsa121
hsa131
hsa147
hsa194
hsa199
hsa200
hsa210
hsa310
hsa400
hsa504
hsa515
hsa535
hsa536
hsa537
hsa812
hsa990


In [12]:
f = open('E:/pathways/KEGG_2021/gmtFiles/human/brite_hierarchies.gmt','w')
for r in res:
    if len(res[r])>0:
        metab_name1 = str(meta_pathway_df.loc[meta_pathway_df['hsa_name'] == r]['meta_name'].values[0])
        metab_name2 = str(meta_pathway_df.loc[meta_pathway_df['hsa_name'] == r]['sum_name'].values[0])
        genes = '\t'.join(res[r])
        f.write(metab_name1+'\t'+metab_name2[6:len(metab_name2)]+'\t'+genes+'\n')
f.flush()
f.close()

In [13]:
f.close()

In [14]:
set(meta_pathway_df['sum_name'])

{'09181 Protein families: metabolism',
 '09182 Protein families: genetic information processing',
 '09183 Protein families: signaling and cellular processes',
 '09184 RNA family',
 '09185 Viral protein families'}

In [110]:
meta_pathway_df.loc[meta_pathway_df['hsa_name'] == 'hsa00511']

Unnamed: 0,sum_name,meta_name,hsa_name
79,09107 Glycan biosynthesis and metabolism,Other glycan degradation,hsa00511


### 小鼠
* step3：下载对应链接
* step4：处理下载得到的文件

In [None]:
folder = "E:/pathways/KEGG_2021/brite_hierarchies/mouse/"
for mmu in meta_pathway_df['hsa_name']:
    mmu = mmu.replace('hsa','mmu')
    print(mmu)
    url = 'http://rest.kegg.jp/get/' + mmu
    r = requests.get(url)
    #print(len(r.content.decode()))
    if len(r.content.decode()) > 0:
        with open(folder+mmu+'.txt', "w") as code:
            code.write(r.content.decode())

mmu000
mmu001
mmu009
mmu002
mmu003
mmu005
mmu011
mmu004
mmu008
mmu006
mmu007
mmu199
mmu194
mmu000
mmu021
mmu019
mmu041
mmu011
mmu009
mmu016
mmu012
mmu110
mmu131
mmu121
mmu051
mmu032
mmu036
mmu400
mmu029
mmu000
mmu044
mmu042
mmu022
mmu035
mmu812
mmu147
mmu048
mmu030
mmu050
mmu054
mmu310
mmu040
mmu031
mmu052
mmu515
mmu090
mmu504
mmu535
mmu536
mmu537
mmu091


In [31]:
import glob
files = glob.glob(folder+'mmu*')
res = {}
for f in files:
    f_name = f.split('\\')[1].split(r'.')[0]
    print(f_name)
    res[f_name] = parseFile(f)

mmu04110
mmu04114
mmu04115
mmu04136
mmu04137
mmu04140
mmu04142
mmu04144
mmu04145
mmu04146
mmu04210
mmu04215
mmu04216
mmu04217
mmu04218
mmu04510
mmu04520
mmu04530
mmu04540
mmu04550
mmu04810


In [32]:
f = open('E:/pathways/KEGG_2021/gmtFiles/mouse/brite_hierarchies.gmt','w')
for r in res:
    if len(res[r])>0:
        metab_name1 = str(meta_pathway_df.loc[meta_pathway_df['hsa_name'] == r.replace('mmu','hsa')]['meta_name'].values[0])
        metab_name2 = str(meta_pathway_df.loc[meta_pathway_df['hsa_name'] == r.replace('mmu','hsa')]['sum_name'].values[0])
        genes = '\t'.join(res[r])
        f.write(metab_name1+'\t'+metab_name2[6:len(metab_name2)]+'\t'+genes+'\n')
f.flush()
f.close()