In [1]:
import os
import pandas as pd
import numpy as np
import glob

# The GNBR source files are too large to ship in the GitHub repository;
# download them yourself from https://zenodo.org/record/3459420.
#
# Point the notebook at the download location by setting the GNBR_DIR
# environment variable. When it is unset, the original hard-coded location
# below is used, so existing setups keep working unchanged.
dir_path = os.environ.get('GNBR_DIR', 'E:/Projects/MBKG/database/GNBR')

In [2]:
def get_file_path(base_dir=None):
    '''Collect the GNBR part-i and part-ii file paths found in a directory.

    Parameters
    ----------
    base_dir : str or path-like, optional
        Directory to search. When omitted, the module-level ``dir_path`` is
        used, so existing zero-argument calls behave as before.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(part1_paths, part2_paths)``: files matching ``part-i-*.txt`` and
        ``part-ii-*-with-themes.txt`` respectively, each sorted by name.
        Callers pair the two lists by position, so both are sorted the same
        way.
    '''
    if base_dir is None:
        base_dir = dir_path
    part1_paths = sorted(glob.glob(os.path.join(base_dir, 'part-i-*.txt')))
    part2_paths = sorted(glob.glob(os.path.join(base_dir, 'part-ii-*-with-themes.txt')))
    return part1_paths, part2_paths

In [3]:
def read_part_i(file_path):
    '''Read a part-i file and pick the theme for each dependency path.

    The file is tab-separated with a header row. Column 0 and every
    odd-numbered column (1, 3, 5, ...) are loaded; for each row the header
    of the loaded column (other than column 0) holding the largest value
    becomes that row's theme.

    Returns a DataFrame with two columns, ``path`` and ``theme``. The file
    must therefore have a column named ``path``.
    '''
    # Peek at the first line only, to learn how many columns the file has.
    first_line = pd.read_table(file_path, sep='\t', header=None, nrows=1)
    n_columns = first_line.shape[1]

    # Keep column 0 plus the odd-numbered columns; the even-numbered ones
    # are skipped (presumably companion indicator columns - TODO confirm).
    wanted = [0]
    wanted.extend(range(1, n_columns, 2))

    scores = pd.read_table(file_path, sep='\t', header=0, usecols=wanted)
    best_theme = scores.iloc[:, 1:].idxmax(axis=1)
    scores['theme'] = best_theme
    return scores.loc[:, ['path', 'theme']]

In [4]:
def read_part_ii(file_path):
    '''Read a part-ii file and extract entity pairs with their dependency path.

    The file is tab-separated without a header row. Only columns 6, 7, 8, 9
    and 12 are loaded and renamed, in that order, to ``head_name``,
    ``tail_name``, ``start_entity``, ``end_entity`` and ``path``.

    Rows with a missing value in any loaded column are dropped. pandas'
    default NA parsing is in effect, so empty fields and tokens such as
    ``null`` or ``NA`` count as missing. The surviving rows keep their
    original row index.
    '''
    column_names = ['head_name', 'tail_name', 'start_entity', 'end_entity', 'path']
    raw = pd.read_table(file_path, sep='\t', header=None, usecols=[6, 7, 8, 9, 12])
    dp = raw.dropna(axis=0, how='any')
    dp.columns = column_names
    return dp

In [5]:
def match_dp_theme(theme, dp):
    '''Attach a theme to each entity-pair row by joining on dependency path.

    The ``start_entity`` / ``end_entity`` tokens are cased differently in
    part-i and part-ii files, so both ``path`` columns are lower-cased
    before the join to make the comparison case-insensitive.

    Parameters
    ----------
    theme : DataFrame
        Must contain a string ``path`` column; its other columns are
        carried into the result.
    dp : DataFrame
        Must contain a string ``path`` column; its other columns are
        carried into the result.

    Returns
    -------
    DataFrame
        Inner join of ``dp`` and ``theme`` on the lower-cased ``path``:
        ``dp``'s columns first, then ``theme``'s remaining columns. The
        ``path`` column in the result is lower-cased.

    Notes
    -----
    The inputs are not modified. Lower-casing is applied to copies, so
    calling this function (or re-running a notebook cell that calls it)
    leaves the caller's frames exactly as they were.
    '''
    theme_lower = theme.assign(path=theme['path'].str.lower())
    dp_lower = dp.assign(path=dp['path'].str.lower())
    match_df = pd.merge(dp_lower, theme_lower, on='path')
    return match_df

In [6]:
# Process the first four part-i / part-ii file pairs. The two path lists are
# paired by position; in the recorded run the batches were, in order,
# chemical-disease, chemical-gene, gene-disease and gene-gene.
part1_paths, part2_paths = get_file_path()
batch_matches = []
for i in range(4):
    part1, part2 = part1_paths[i], part2_paths[i]
    print(f'Batch-{i}')
    print(f'part-i file: {part1}')
    print(f'part-ii file: {part2}')
    theme = read_part_i(part1)
    dp = read_part_ii(part2)
    batch_matches.append(match_dp_theme(theme, dp))
    print(f'---------Batch-{i} over---------')

# Expose each batch under its own name (batch 0 -> match_df1, and so on).
match_df1, match_df2, match_df3, match_df4 = batch_matches

Batch-0
part-i file: E:/Projects/MBKG/database/GNBR\part-i-chemical-disease-path-theme-distributions.txt
part-ii file: E:/Projects/MBKG/database/GNBR\part-ii-dependency-paths-chemical-disease-sorted-with-themes.txt
---------Batch-0 over---------
Batch-1
part-i file: E:/Projects/MBKG/database/GNBR\part-i-chemical-gene-path-theme-distributions.txt
part-ii file: E:/Projects/MBKG/database/GNBR\part-ii-dependency-paths-chemical-gene-sorted-with-themes.txt
---------Batch-1 over---------
Batch-2
part-i file: E:/Projects/MBKG/database/GNBR\part-i-gene-disease-path-theme-distributions.txt
part-ii file: E:/Projects/MBKG/database/GNBR\part-ii-dependency-paths-gene-disease-sorted-with-themes.txt
---------Batch-2 over---------
Batch-3
part-i file: E:/Projects/MBKG/database/GNBR\part-i-gene-gene-path-theme-distributions.txt
part-ii file: E:/Projects/MBKG/database/GNBR\part-ii-dependency-paths-gene-gene-sorted-with-themes.txt
---------Batch-3 over---------


In [7]:
# Stack the (head_name, start_entity) pairs from the first two batches and
# keep each distinct pair once; surviving rows keep their original index.
name_id_cols = ['head_name', 'start_entity']
match_df = pd.concat(
    [match_df1[name_id_cols], match_df2[name_id_cols]],
    axis=0,
).drop_duplicates()

In [17]:
# Case-insensitive regex search over the entity names; the filtered frame is
# the cell's last expression so the notebook displays it.
pattern = 'Calcitonin'
name_hits = match_df['head_name'].str.contains(pattern, case=False, regex=True)
match_df[name_hits]

Unnamed: 0,head_name,start_entity
1761,calcitonin,MESH:D002116
14904,procalcitonin,MESH:C029100
23371,Procalcitonin,MESH:C029100
29249,Calcitonin,MESH:D002116
29769,Carbocalcitonin,MESH:C012755
54695,thyrocalcitonin,MESH:D002116
219290,Thyrocalcitonin,MESH:D002116
656293,carbocalcitonin,MESH:C012755
773875,prohormone procalcitonin,MESH:C029100
822083,preprocalcitonin,MESH:C033183
