首先从GO官网下载数据（go-basic.obo），然后构建包含全部GO术语的有向无环图（DAG）。

In [None]:
import math
import random
from collections import Counter, deque

import numpy as np
import torch
from numpy.random import randint
from sklearn.metrics import (auc, confusion_matrix, precision_recall_curve,
                             roc_curve)
from torch.utils.data import Dataset

# GO identifiers used as the top-level term of each of the three sub-ontologies.
BIOLOGICAL_PROCESS = 'GO:0008150'
MOLECULAR_FUNCTION = 'GO:0003674'
CELLULAR_COMPONENT = 'GO:0005575'
# Short sub-ontology code -> GO identifier of that sub-ontology's top term.
FUNC_DICT = {
    'cc': CELLULAR_COMPONENT,
    'mf': MOLECULAR_FUNCTION,
    'bp': BIOLOGICAL_PROCESS
}

# Short sub-ontology code -> namespace name
# (presumably the value of the `namespace:` tag in the .obo file).
NAMESPACES = {
    'cc': 'cellular_component',
    'mf': 'molecular_function',
    'bp': 'biological_process'
}

# ------------------------------------------------------------------------------------------
# Gene Ontology based on .obo File


# Gene Ontology based on .obo File
class Ontology(object):
    """In-memory view of the ``[Term]`` stanzas of a Gene Ontology .obo file.

    After construction ``self.ont`` maps a GO id to a dict with the keys
    ``id``, ``name``, ``namespace``, ``is_a`` (list of parent ids),
    ``part_of`` (list of parent ids, filled only when ``with_rels`` is true),
    ``relationship`` (list of ``[target_id, relation_type]`` pairs, filled
    only when ``with_rels`` is true), ``alt_ids``, ``is_obsolete`` and
    ``children`` (set of ids that list this term in ``is_a``/``part_of``).

    Args:
        filename: path of the .obo file to parse.
        with_rels: also record ``relationship:`` tags; ``part_of`` targets
            are then treated as parents alongside ``is_a`` targets.
        include_alt_ids: register every ``alt_id`` as an extra key that
            points at the same term dict as the primary id.
    """

    def __init__(self,
                 filename='./data/go-basic.obo',
                 with_rels=False,
                 include_alt_ids=True):
        super().__init__()
        self.ont, self.format_version, self.data_version = self.load(
            filename, with_rels, include_alt_ids)
        # Information content per GO id; filled by calculate_ic().
        self.ic = None

    # ------------------------------------
    def load(self, filename, with_rels, include_alt_ids):
        """Parse an .obo file.

        Args:
            filename: path of the .obo file.
            with_rels: whether ``relationship:`` tags are recorded.
            include_alt_ids: whether alternative ids become extra keys.

        Returns:
            ``(ont, format_version, data_version)``. The two version values
            are the header strings, or an empty list when the corresponding
            header line is absent.
        """
        ont = dict()
        format_version = []
        data_version = []
        obj = None  # term currently being filled, None outside a [Term]
        # .obo files are UTF-8; do not depend on the platform default.
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                if line == '[Term]':
                    # A new stanza starts: store the finished term first.
                    if obj is not None:
                        ont[obj['id']] = obj
                    obj = {
                        'is_a': [],
                        'part_of': [],
                        'relationship': [],
                        'alt_ids': [],
                        'is_obsolete': False,
                    }
                    continue
                if line.startswith('[') and line.endswith(']'):
                    # Any other stanza ([Typedef], ...) ends the current
                    # term; its tags must not leak into the previous term.
                    if obj is not None:
                        ont[obj['id']] = obj
                    obj = None
                    continue
                # Split on the FIRST ': ' only, so values that themselves
                # contain ': ' (e.g. term names) are kept whole.
                key, sep, value = line.partition(': ')
                if not sep:
                    continue
                if key == 'format-version':
                    format_version = value
                elif key == 'data-version':
                    data_version = value
                if obj is None:
                    continue
                if key == 'id':
                    obj['id'] = value
                elif key == 'alt_id':
                    obj['alt_ids'].append(value)
                elif key == 'namespace':
                    obj['namespace'] = value
                elif key == 'is_a':
                    # "GO:xxxxxxx ! human readable name" -> keep the id.
                    obj['is_a'].append(value.split(' ! ')[0])
                elif with_rels and key == 'relationship':
                    it = value.split()
                    if it[0] == 'part_of':
                        obj['part_of'].append(it[1])
                    obj['relationship'].append([it[1], it[0]])
                elif key == 'name':
                    obj['name'] = value
                elif key == 'is_obsolete' and value == 'true':
                    obj['is_obsolete'] = True
            if obj is not None:
                ont[obj['id']] = obj

        # Register alternative ids and drop obsolete terms. Iterate over a
        # snapshot of the keys because the dict is modified in the loop.
        for term_id in list(ont.keys()):
            term = ont[term_id]
            if include_alt_ids:
                for t_id in term['alt_ids']:
                    ont[t_id] = term
            if term['is_obsolete']:
                del ont[term_id]

        # Invert is_a / part_of into a 'children' set on every parent.
        for term_id, val in ont.items():
            val.setdefault('children', set())
            for p_id in val['is_a'] + val['part_of']:
                if p_id in ont:
                    ont[p_id].setdefault('children', set()).add(term_id)
        return ont, format_version, data_version

    # ------------------------------------
    def has_term(self, term_id):
        """Return True when ``term_id`` is a key of the ontology."""
        return term_id in self.ont

    def get_term(self, term_id):
        """Return the term dict for ``term_id`` or None when unknown."""
        return self.ont.get(term_id)

    def calculate_ic(self, annots):
        """Compute ``self.ic`` from an iterable of annotation collections.

        Every element of ``annots`` is an iterable of GO ids. For a term
        counted ``n`` times, the value stored is
        ``log2(min_parent_count / n)``, or 0 for a term without parents.
        Parents that never occur in ``annots`` are ignored; with a zero
        count they would make the logarithm undefined. When no parent
        occurs at all the term is treated like a term without parents.
        """
        cnt = Counter()
        for x in annots:
            cnt.update(x)
        self.ic = {}
        for go_id, n in cnt.items():
            parent_counts = [
                cnt[p] for p in self.get_parents(go_id) if cnt[p] > 0
            ]
            min_n = min(parent_counts) if parent_counts else n
            self.ic[go_id] = math.log(min_n / n, 2)

    def get_ic(self, go_id):
        """Return the information content of ``go_id`` (0.0 when unknown).

        Raises:
            RuntimeError: when calculate_ic() has not been called yet.
        """
        if self.ic is None:
            raise RuntimeError('Not yet calculated')
        return self.ic.get(go_id, 0.0)

    def _walk_up(self, term_id, relations):
        """Breadth-first closure of ``term_id`` over the given parent lists.

        The start term itself is part of the result. Parents that are not
        keys of the ontology are skipped.
        """
        if term_id not in self.ont:
            return set()
        seen = set()
        queue = deque([term_id])
        while queue:
            current = queue.popleft()
            if current in seen:
                continue
            seen.add(current)
            node = self.ont[current]
            for relation in relations:
                for parent_id in node[relation]:
                    if parent_id in self.ont:
                        queue.append(parent_id)
        return seen

    def get_ancestors(self, term_id):
        """Return ``term_id`` plus all terms reachable via is_a / part_of.

        Returns an empty set when ``term_id`` is unknown.
        """
        return self._walk_up(term_id, ('is_a', 'part_of'))

    def get_parents(self, term_id):
        """Return the direct is_a / part_of parents known to the ontology."""
        if term_id not in self.ont:
            return set()
        term = self.ont[term_id]
        return {
            parent_id
            for parent_id in term['is_a'] + term['part_of']
            if parent_id in self.ont
        }

    def get_root_ancestors(self, term_id):
        """Return ``term_id`` plus all terms reachable via is_a only."""
        return self._walk_up(term_id, ('is_a',))

    def get_roots(self, term_id):
        """Return the is_a ancestors of ``term_id`` that have no parents.

        "No parents" is decided by get_parents(), i.e. neither is_a nor
        part_of parents inside the ontology.
        """
        if term_id not in self.ont:
            return set()
        return {
            term
            for term in self.get_root_ancestors(term_id)
            if not self.get_parents(term)
        }

    def get_namespace_terms(self, namespace):
        """Return every key of the ontology whose namespace matches."""
        return {
            go_id
            for go_id, obj in self.ont.items()
            if obj['namespace'] == namespace
        }

    def get_namespace(self, term_id):
        """Return the namespace of ``term_id`` (KeyError when unknown)."""
        return self.ont[term_id]['namespace']

    def get_term_set(self, term_id):
        """Return ``term_id`` plus all of its descendants (any depth).

        Returns an empty set when ``term_id`` is unknown.
        """
        if term_id not in self.ont:
            return set()
        term_set = set()
        q = deque([term_id])
        while q:
            t_id = q.popleft()
            if t_id in term_set:
                continue
            term_set.add(t_id)
            q.extend(self.ont[t_id]['children'])
        return term_set

    def get_child_set(self, term_id):
        """Return a copy of the direct children of ``term_id``."""
        if term_id not in self.ont:
            return set()
        return set(self.ont[term_id]['children'])


# ------------------------------------------------------------------------------------------
# functions for evaluation
def get_matrix(labels, preds, threshold=0.3):
    """Return ``(tn, fp, fn, tp)`` for scores binarised at ``threshold``.

    Args:
        labels: array of 0/1 ground-truth values; it is flattened.
        preds: array of scores with the same number of elements as
            ``labels``; it is flattened and an entry counts as positive
            when it is ``>= threshold``. The caller's array is not modified.
        threshold: decision threshold applied to ``preds``.

    Returns:
        The four cells of the binary confusion matrix as a tuple.
    """
    # Compare explicitly instead of relying on integer truncation of the
    # sub-threshold scores, which is only correct for scores in [0, 1).
    binary_preds = (preds.flatten() >= threshold).astype('int8')
    # labels=[0, 1] forces a 2x2 matrix even when only one class occurs;
    # otherwise the 4-way unpacking below fails on a 1x1 result.
    tn, fp, fn, tp = confusion_matrix(labels.flatten(),
                                      binary_preds,
                                      labels=[0, 1]).ravel()
    return tn, fp, fn, tp


def get_level_matrix(labels, preds, level, threshold=0.3):
    """Return ``(tn, fp, fn, tp)`` restricted to ``level`` on the last axis.

    Args:
        labels: array of 0/1 ground-truth values.
        preds: array of scores, indexed like ``labels``.
        level: index (or any valid indexer) applied to the last axis of
            both arrays before they are flattened.
        threshold: an entry of ``preds`` counts as positive when it is
            ``>= threshold``.

    Returns:
        The four cells of the binary confusion matrix as a tuple.
    """
    # Explicit comparison: correct for any score range, and it never writes
    # into a view of the caller's array.
    binary_preds = (preds[..., level].flatten() >= threshold).astype('int8')
    level_labels = labels[..., level].flatten()
    # labels=[0, 1] keeps the matrix 2x2 when a class is missing.
    tn, fp, fn, tp = confusion_matrix(level_labels,
                                      binary_preds,
                                      labels=[0, 1]).ravel()
    return tn, fp, fn, tp


def compute_roc(labels, preds):
    """Return the area under the ROC curve.

    Both arrays are flattened first, so all entries are pooled into a
    single curve rather than one curve per column.
    """
    y_true = labels.flatten()
    y_score = preds.flatten()
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, y_score)
    return auc(false_pos_rate, true_pos_rate)


def compute_aupr_level(labels, preds, level):
    """Return the area under the precision-recall curve for one level.

    ``level`` indexes the last axis of both ``labels`` and ``preds``; the
    selected entries are flattened and pooled into a single curve.
    """
    y_true = labels[..., level].flatten()
    y_score = preds[..., level].flatten()
    precision, recall, _thresholds = precision_recall_curve(y_true, y_score)
    return auc(recall, precision)


def compute_aupr(labels, preds):
    """Return the area under the precision-recall curve.

    Both arrays are flattened first, so all entries are pooled into a
    single curve.
    """
    y_true = labels.flatten()
    y_score = preds.flatten()
    precision, recall, _thresholds = precision_recall_curve(y_true, y_score)
    return auc(recall, precision)


# set random seed
def set_random_seed(seed=10, deterministic=False, benchmark=False):
    """Seed the PyTorch random number generators.

    Only PyTorch (CPU and all CUDA devices) is seeded here; Python's
    ``random`` module and NumPy are left untouched.

    Args:
        seed: value passed to the PyTorch seeding functions.
        deterministic: when true, switch cuDNN to deterministic mode.
        benchmark: when true, enable the cuDNN benchmark mode.
    """
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)
    cudnn = torch.backends.cudnn
    # The flags are only ever switched on; a false argument leaves the
    # current setting as it is.
    if deterministic:
        cudnn.deterministic = True
    if benchmark:
        cudnn.benchmark = True


# contrast training
class con_pair_dataset(Dataset):
    """Dataset of (anchor, positive, negatives...) index rows for contrastive training.

    Every item is built from ``con_pair[idx]``, which is read as
    ``(anchor, positive, extra)`` where ``extra[0]`` is a collection of
    candidate negatives for the first group. ``anchor`` is used both as a
    key of ``contrast_dict`` and as an index into ``terms`` (presumably it
    is the integer index of a GO term - confirm against the code that
    builds ``con_pair``).

    Three groups of negatives are drawn per item:

    1. up to ``int(neg_num * neg1_len)`` items from ``con_pair[idx][2][0]``;
    2. up to ``int((neg_num - n1) * neg)`` items from
       ``contrast_dict[anchor]``;
    3. the remaining ``neg_num - n1 - n2`` items from the two namespaces
       other than the anchor's own, looked up through ``terms_dict``.

    Args:
        con_pair: indexable collection of ``(anchor, positive, extra)``.
        contrast_dict: mapping that holds the namespace term lists under
            ``'n_cc'``, ``'n_bp'``, ``'n_mf'``, a negative pool per anchor,
            and the root GO id per term name.
        terms: sequence mapping an index to a term name.
        terms_dict: mapping from a term name to its index.
        neg_num: total number of negatives per item.
        neg: fraction of the negatives left after group 1 that is given
            to group 2.
        neg1_len: fraction of ``neg_num`` given to group 1.
    """

    # root GO id of the anchor -> (first namespace attribute, divisor that
    # sets the first namespace's share, second namespace attribute)
    _CROSS_NAMESPACE_PLAN = {
        'GO:0005575': ('n_mf', 3, 'n_bp'),
        'GO:0003674': ('n_cc', 5, 'n_bp'),
        'GO:0008150': ('n_cc', 3, 'n_mf'),
    }

    def __init__(self,
                 con_pair,
                 contrast_dict,
                 terms,
                 terms_dict,
                 neg_num=80,
                 neg=0.5,
                 neg1_len=0.25):
        super().__init__()
        self.len_df = len(con_pair)
        self.n_cc = list(contrast_dict['n_cc'])
        self.n_bp = list(contrast_dict['n_bp'])
        self.n_mf = list(contrast_dict['n_mf'])
        self.terms = terms
        self.contrast_dict = contrast_dict
        self.terms_dict = terms_dict
        self.neg_num = neg_num
        self.con_pair = con_pair
        self.neg = neg
        self.neg1_len = neg1_len

    def __len__(self):
        return self.len_df

    @staticmethod
    def _sample(pool, k):
        """Draw ``k`` distinct positions from ``pool`` without replacement."""
        # random.sample() only accepts sequences since Python 3.11.
        if not isinstance(pool, (list, tuple)):
            pool = list(pool)
        return random.sample(pool, k=k)

    def _fill_from_namespace(self, chosen, namespace_terms, target):
        """Add random term indices of one namespace until ``chosen`` has ``target`` items.

        NOTE: this loops forever when the namespace cannot supply enough
        distinct indices; an empty namespace raises ValueError instead.
        """
        while len(chosen) < target:
            # The upper bound of numpy's randint is exclusive, so use the
            # full length to make the last term reachable.
            pick = randint(0, len(namespace_terms))
            chosen.add(self.terms_dict[namespace_terms[pick]])

    def __getitem__(self, idx):
        """Return ``(indices, weights)`` for item ``idx``.

        ``indices`` is a 1-D LongTensor ``[anchor, positive, negatives...]``.
        ``weights`` holds one value per negative, equal to
        ``1 / size_of_its_group``, in the same order as the negatives.
        """
        anchor = self.con_pair[idx][0]
        terms_list = [anchor, self.con_pair[idx][1]]

        # Group 1: candidates stored with the pair itself.
        candidates = self.con_pair[idx][2][0]
        neg1_len = min(len(candidates), int(self.neg_num * self.neg1_len))
        negs1 = set()
        if neg1_len > 0:
            negs1 = set(self._sample(candidates, neg1_len))
        negs1 = list(negs1)
        random.shuffle(negs1)

        # Group 2: the anchor's pool in contrast_dict. A pool smaller than
        # the budget is used completely; the shortfall is moved to group 3
        # so that every item keeps exactly neg_num negatives.
        pool = self.contrast_dict[anchor]
        neg2_budget = int((self.neg_num - neg1_len) * self.neg)
        negs2 = set(self._sample(pool, min(len(pool), neg2_budget)))
        negs2 = list(negs2)
        random.shuffle(negs2)

        # Group 3: terms of the two other namespaces.
        remaining = self.neg_num - len(negs1) - len(negs2)
        negs3 = set()
        root = self.contrast_dict[self.terms[anchor]]
        plan = self._CROSS_NAMESPACE_PLAN.get(root)
        if plan is not None:
            first_attr, divisor, second_attr = plan
            self._fill_from_namespace(negs3, getattr(self, first_attr),
                                      remaining // divisor)
            self._fill_from_namespace(negs3, getattr(self, second_attr),
                                      remaining)
        negs3 = list(negs3)
        random.shuffle(negs3)

        # One weight per negative, derived from the ACTUAL group sizes so
        # the weight vector always lines up with the returned indices.
        group_sizes = []
        for group in (negs1, negs2, negs3):
            group_sizes.extend([len(group)] * len(group))
        weights = 1 / np.array(group_sizes)

        terms_list = terms_list + negs1 + negs2 + negs3
        return torch.LongTensor(terms_list).view(
            len(terms_list)), torch.from_numpy(weights)


调用Ontology类，计算每个GO的祖先（ancestors）和后代（children），并保存为pkl文件。

In [None]:
import argparse
import pickle
from collections import defaultdict as ddt


# Command-line options for main(): the input .obo file, the pickled term
# table, and the two output pickle paths. add_help=False means no automatic
# -h/--help option is registered.
parser = argparse.ArgumentParser(description='extract all terms in a go.obo file.',
                                 add_help=False)
# Input: Gene Ontology .obo file.
parser.add_argument('--go-file',
                    '-gf',
                    default='./data/go-basic.obo',
                    type=str,
                    help='go file downloaded from Gene Ontology website')
# Input: pickle that main() loads and indexes with ['terms'].
parser.add_argument('--terms-file',
                    '-tf',
                    default='./data/terms_all.pkl',
                    type=str,
                    help='A DataFrame stored all terms')
# Output: pickle of the per-term ancestor lists.
parser.add_argument('--out-ancestor',
                    '-oa',
                    default='./data/go_ancestor.pkl',
                    type=str,
                    help='output file for ancestor')
# Output: pickle of the per-term descendant lists.
parser.add_argument('--out-children',
                    '-oc',
                    default='./data/go_children.pkl',
                    type=str,
                    help='output file for children')



def main(go_file, terms_all_file, out_ancestor, out_children):
    """Compute ancestors and descendants of every term and pickle them.

    Args:
        go_file: path of the Gene Ontology .obo file.
        terms_all_file: path of a pickle whose ``['terms']`` entry lists
            the GO ids to process.
        out_ancestor: output path for the ancestor mapping.
        out_children: output path for the children mapping.

    Two ``defaultdict(list)`` objects are written:

    * ancestors: term -> all is_a / part_of ancestors, without the term
      itself and restricted to ids that occur in the term list;
    * children: term -> the term itself plus all of its descendants, NOT
      restricted to the term list (consumers have to filter).
    """
    go = Ontology(go_file, with_rels=True, include_alt_ids=False)
    # NOTE: pickle.load executes arbitrary code; only use trusted files.
    with open(terms_all_file, 'rb') as fd:
        terms = list(pickle.load(fd)['terms'])
    terms_set = set(terms)

    go_ancestor = ddt(list)
    go_children = ddt(list)
    for count, term in enumerate(terms):
        ancestors = go.get_ancestors(term)
        # discard(): a term missing from the ontology yields an empty set,
        # on which remove() would raise KeyError.
        ancestors.discard(term)
        go_ancestor[term] = list(terms_set.intersection(ancestors))
        go_children[term] = list(go.get_term_set(term))
        print('{} is ok'.format(count))

    # Honour the requested output paths instead of hard-coded ones.
    with open(out_ancestor, 'wb') as fd:
        pickle.dump(go_ancestor, fd)

    with open(out_children, 'wb') as fd:
        pickle.dump(go_children, fd)


if __name__ == '__main__':
    # The command-line entry point is currently disabled; the two lines
    # below would parse the options and regenerate both pickle files.
    # args = parser.parse_args()
    # main(args.go_file, args.terms_file, args.out_ancestor, args.out_children)

    import pickle

    # Sanity check: load the existing ancestor pickle and print how many
    # terms it contains.
    with open('./data/go_ancestor.pkl', 'rb') as f:
        ancestor_data = pickle.load(f)
    print(len(ancestor_data))

读取pkl文件 检查

In [None]:
# Read the pkl file and inspect it.
# Summary of the note below: ancestors and children of all GO terms are
# already stored in go_ancestor.pkl and go_children.pkl; the next step is to
# put the GO ids of interest in a list, look up each id's children and keep
# only the children that are themselves in that list.
'''
所有GO的ancestor与children已经分别存到了go_ancestor.pkl与go_children.pkl文件中
接下来只需要先把所有GO存到List中 遍历每个GO
根据GO id索引到其children 将那些在list中的child id存到dict中即可
'''

import pickle

# Load the pkl file.
# NOTE(review): the file opened here is go_children.pkl, so despite its name
# `go_ancestors` holds the CHILDREN mapping. The path also uses ./GO/ whereas
# the other cells use ./data/ - confirm which location is intended.
with open('./GO/go_children.pkl', 'rb') as f:
    go_ancestors = pickle.load(f)
# Print the stored entry of one term, then test whether a second term is
# contained in it.
print(go_ancestors['GO:0051052'])
print('GO:0051973' in go_ancestors['GO:0051052'])

接下来 读取UniProt整理的固定GO id 搭建这部分GO的稀疏网络

In [None]:
import pickle

# Per the original author's note, this GO id list was compiled from UniProt
# and is fixed. Each list ends with the top-level term of its sub-ontology
# (GO:0003674, GO:0008150, GO:0005575).

# Molecular-function terms.
mf_keys = ['GO:0001618', 'GO:0003677', 'GO:0003723', 'GO:0003774', 'GO:0003824', 'GO:0003924', 'GO:0005198', 'GO:0005215', 'GO:0008092', 
           'GO:0008289', 'GO:0009975', 'GO:0016209', 'GO:0016491', 'GO:0016740', 'GO:0016787', 'GO:0016829', 'GO:0016853', 'GO:0016874', 
           'GO:0031386', 'GO:0038024', 'GO:0042393', 'GO:0044183', 'GO:0045182', 'GO:0045735', 'GO:0048018', 'GO:0060089', 'GO:0060090',
           'GO:0090729', 'GO:0098631', 'GO:0098772', 'GO:0120274', 'GO:0140096', 'GO:0140097', 'GO:0140098', 'GO:0140104', 'GO:0140110',
           'GO:0140223', 'GO:0140299', 'GO:0140313', 'GO:0140657', 'GO:0003674']

# Biological-process terms.
bp_keys = ['GO:0000278', 'GO:0000910', 'GO:0002181', 'GO:0002376', 'GO:0003012', 'GO:0003013', 'GO:0003014', 'GO:0003016', 'GO:0005975',
           'GO:0006091', 'GO:0006260', 'GO:0006281', 'GO:0006310', 'GO:0006325', 'GO:0006351', 'GO:0006355', 'GO:0006399', 'GO:0006457',
           'GO:0006486', 'GO:0006520', 'GO:0006575', 'GO:0006629', 'GO:0006766', 'GO:0006790', 'GO:0006886', 'GO:0006913', 'GO:0006914',
           'GO:0006954', 'GO:0007005', 'GO:0007010', 'GO:0007018', 'GO:0007031', 'GO:0007040', 'GO:0007059', 'GO:0007155', 'GO:0007163',
           'GO:0012501', 'GO:0015979', 'GO:0016071', 'GO:0016073', 'GO:0016192', 'GO:0022414', 'GO:0022600', 'GO:0023052', 'GO:0030154',
           'GO:0030163', 'GO:0030198', 'GO:0031047', 'GO:0032200', 'GO:0034330', 'GO:0042060', 'GO:0044782', 'GO:0048856', 'GO:0048870',
           'GO:0050877', 'GO:0050886', 'GO:0051604', 'GO:0055085', 'GO:0055086', 'GO:0061007', 'GO:0061024', 'GO:0065003', 'GO:0071554',
           'GO:0071941', 'GO:0072659', 'GO:0098542', 'GO:0098754', 'GO:0140013', 'GO:0140014', 'GO:0140053', 'GO:1901135', 'GO:0008150']

# Cellular-component terms.
cc_keys = ['GO:0000228', 'GO:0005576', 'GO:0005615', 'GO:0005618', 'GO:0005634', 'GO:0005635', 'GO:0005654', 'GO:0005694', 'GO:0005730',
           'GO:0005739', 'GO:0005764', 'GO:0005768', 'GO:0005773', 'GO:0005777', 'GO:0005783', 'GO:0005794', 'GO:0005811', 'GO:0005815',
           'GO:0005829', 'GO:0005840', 'GO:0005856', 'GO:0005886', 'GO:0005929', 'GO:0009536', 'GO:0009579', 'GO:0030312', 'GO:0031012',
           'GO:0031410', 'GO:0043226', 'GO:0005575']
# All ids in one list: molecular function, then biological process, then
# cellular component.
go_ids = mf_keys + bp_keys + cc_keys
# Remove an empty string if one is present (only the first occurrence).
if '' in go_ids:
    go_ids.remove('')

# Load the term -> descendants mapping written by main().
# NOTE: pickle.load executes arbitrary code; only use trusted files.
with open('./data/go_children.pkl', 'rb') as file:
    go_children = pickle.load(file)

# Build the membership set once: a term near the top of the ontology can have
# a very large descendant list, and testing each entry against the go_ids
# list would scan that list every time.
go_id_set = set(go_ids)

# go_id -> descendants that are themselves part of go_ids (the stored
# descendant list contains the term itself, which is excluded here).
result = {}
for go_id in go_ids:
    # Ids that are missing from the pickle simply get no children.
    row_children = go_children.get(go_id, [])
    children = [
        child for child in row_children
        if child in go_id_set and child != go_id
    ]
    result[go_id] = children

# Write one line per term: "GO:x: GO:a, GO:b" or just "GO:x:" when the term
# has no children inside go_ids.
with open('./data/noise_free/eSOL_go/uniprot_conform_go2go.txt', 'w') as file:
    for go_id, child in result.items():
        if child:
            file.write(go_id + ': ' + ', '.join(child) + '\n')
        else:
            file.write(go_id + ':' + '\n')

准备数据最后一步 将GO-GO.txt与protein-GO.txt整合 整理成SparseGO论文需要的格式

In [None]:
'''
GO-GO: GO_uniprot_conform_go_children.txt  GO之间的child关系记录
protein-GO: uniprot_protein2go_conform_test_mut.txt  protein与GO的关系记录 与数据集有关
'''

# Step 1: turn the GO-GO file into an edge list with one
# "<parent> <child> default" line per parent/child pair.
# NOTE(review): the note above this code names other input files than the
# ones opened here - confirm which names are current.
with open('./data/noise_free/eSOL_go/eSOL_test_GO-GO.txt', 'w') as new_file:
    # Open the original GO-GO txt file.
    with open('./data/noise_free/eSOL_go/uniprot_conform_go2go.txt', 'r') as old_file:
        # Go through the original file line by line.
        for line in old_file:
            # Split the line into the parent id and the list of children.
            parts = line.strip().split(': ')
            # Lines without children ("GO:x:") do not split; skip them.
            if len(parts) < 2:
                continue
            parent, children = parts
            # Split the comma-separated child list into single ids.
            children = children.split(', ')
            # Emit one edge per child.
            for child in children:
                # Write the edge to the new txt file.
                new_file.write(f'{parent} {child} default\n')

# Step 2: turn the protein-GO file into an edge list with one
# "<go> <protein> protein" line per protein/GO pair.
with open('./data/noise_free/eSOL_go/eSOL_test_GO-protein.txt', 'w') as new_file:
    # Open the original protein-to-GO txt file.
    with open('./data/noise_free/eSOL_go/eSOL_test_100_uniprot_conform_protein2go.txt', 'r') as old_file:
        # Go through the original file line by line.
        for line in old_file:
            # Split the line into the protein id and its list of GO ids.
            parts = line.strip().split(': ')
            # Skip lines that carry no GO ids.
            if len(parts) < 2:
                continue
            protein, gos = parts
            # Split the comma-separated GO list into single ids.
            gos = gos.split(', ')
            # Emit one edge per GO id.
            for go in gos:
                # Write the edge to the new txt file.
                new_file.write(f'{go} {protein} protein\n')