In [1]:
# Generate the prediction results of the Diamond tool
import math
import os
import pickle as pkl
import sys
from collections import deque, Counter
from collections import OrderedDict

import numpy as np
import pandas as pd

# One-letter amino-acid code -> integer index. The 20 values are the distinct
# integers 1..20; insertion order is kept exactly as in the original literal.
AAS = dict([
    ('M', 18), ('F', 15), ('A', 3), ('P', 9), ('I', 12),
    ('D', 11), ('V', 6), ('K', 7), ('S', 2), ('N', 13),
    ('W', 20), ('T', 10), ('G', 4), ('L', 1), ('Y', 16),
    ('E', 5), ('H', 17), ('R', 8), ('Q', 14), ('C', 19),
])

# Full GO namespace name (as written in the OBO file) -> two-letter tag.
NAMESPACES = dict(
    cellular_component='cc',
    molecular_function='mf',
    biological_process='bp',
)

# Progress marker for this cell.
print('done')

done


In [2]:
class Ontology(object):
    """In-memory view of the ``[Term]`` stanzas of an OBO ontology file.

    ``self.ont`` maps a term id (primary ids and ``alt_id`` aliases alike) to a
    dict with the keys ``id``, ``name``, ``namespace``, ``is_a``, ``part_of``,
    ``regulates``, ``alt_ids``, ``is_obsolete`` and ``children``.  Obsolete
    terms are removed under their primary id.  ``self.ic`` is ``None`` until
    :meth:`calculate_ic` has been called.
    """

    def __init__(self, filename='data/go.obo', with_rels=False):
        """Parse ``filename`` right away.

        Args:
            filename: path of the OBO file to read.
            with_rels: when true, ``relationship: part_of X`` lines are treated
                as additional parents (they are appended to ``is_a``).
        """
        self.ont = self.load(filename, with_rels)
        self.ic = None

    def has_term(self, term_id):
        """Return True when ``term_id`` is a key of the loaded ontology."""
        return term_id in self.ont

    def calculate_ic(self, annots):
        """Compute an information-content value for every annotated term.

        Args:
            annots: iterable of iterables of term ids (one per annotated item).

        For each term that occurs ``n`` times, the value stored is
        ``log2(min_parent_count / n)``, where ``min_parent_count`` is the
        smallest occurrence count among the term's direct parents (or ``n``
        itself when the term has no parent in the ontology).

        Raises:
            ValueError: from ``math.log`` if some direct parent of an annotated
                term never occurs in ``annots`` (its count is then 0).
        """
        cnt = Counter()
        for x in annots:
            cnt.update(x)
        self.ic = {}
        for go_id, n in cnt.items():
            parents = self.get_parents(go_id)
            if len(parents) == 0:
                min_n = n
            else:
                min_n = min([cnt[x] for x in parents])
            self.ic[go_id] = math.log(min_n / n, 2)

    def get_ic(self, go_id):
        """Return the stored value for ``go_id`` (0.0 when the term has none).

        Raises:
            Exception: if :meth:`calculate_ic` has not been called yet.
        """
        if self.ic is None:
            raise Exception('Not yet calculated')
        if go_id not in self.ic:
            return 0.0
        return self.ic[go_id]

    def load(self, filename, with_rels):
        """Read the OBO file and return the ``term id -> term dict`` mapping.

        Only ``[Term]`` stanzas are kept; everything inside a ``[Typedef]``
        stanza (and anything before the first ``[Term]``) is ignored.
        """
        ont = dict()
        obj = None
        with open(filename, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                if line == '[Term]':
                    if obj is not None:
                        ont[obj['id']] = obj
                    obj = dict()
                    obj['is_a'] = list()
                    obj['part_of'] = list()
                    obj['regulates'] = list()
                    obj['alt_ids'] = list()
                    obj['is_obsolete'] = False
                    continue
                if line == '[Typedef]':
                    # Store the term that was being read before leaving the
                    # term section; otherwise the last term ahead of the first
                    # [Typedef] stanza would be silently lost.
                    if obj is not None:
                        ont[obj['id']] = obj
                    obj = None
                    continue
                if obj is None:
                    continue
                # Split on the FIRST ': ' only, so values that themselves
                # contain ': ' (e.g. term names) are kept whole.
                key, sep, value = line.partition(': ')
                if not sep:
                    # Not a "tag: value" line; nothing to record.
                    continue
                if key == 'id':
                    obj['id'] = value
                elif key == 'alt_id':
                    obj['alt_ids'].append(value)
                elif key == 'namespace':
                    obj['namespace'] = value
                elif key == 'is_a':
                    # Drop the trailing " ! human readable name" part.
                    obj['is_a'].append(value.split(' ! ')[0])
                elif with_rels and key == 'relationship':
                    it = value.split()
                    if it[0] == 'part_of':
                        # part_of targets are deliberately merged into is_a so
                        # that every traversal below follows them as parents.
                        obj['is_a'].append(it[1])
                elif key == 'name':
                    obj['name'] = value
                elif key == 'is_obsolete' and value == 'true':
                    obj['is_obsolete'] = True
        if obj is not None:
            ont[obj['id']] = obj
        # Register alt_id aliases (pointing at the very same dict) and drop
        # obsolete terms.  Iterate over a snapshot because ``ont`` is mutated.
        for term_id in list(ont.keys()):
            for t_id in ont[term_id]['alt_ids']:
                ont[t_id] = ont[term_id]
            if ont[term_id]['is_obsolete']:
                del ont[term_id]
        # Build the reverse (parent -> children) links.  Alias keys are part of
        # the iteration, so they show up in ``children`` sets as well.
        for term_id, val in ont.items():
            if 'children' not in val:
                val['children'] = set()
            for p_id in val['is_a']:
                if p_id in ont:
                    if 'children' not in ont[p_id]:
                        ont[p_id]['children'] = set()
                    ont[p_id]['children'].add(term_id)
        return ont

    def get_anchestors(self, term_id):
        """Return ``term_id`` plus every term reachable through ``is_a`` links.

        Unknown ids yield an empty set.  (The method name keeps its original
        spelling because callers depend on it.)
        """
        if term_id not in self.ont:
            return set()
        term_set = set()
        q = deque()
        q.append(term_id)
        while q:
            t_id = q.popleft()
            if t_id not in term_set:
                term_set.add(t_id)
                for parent_id in self.ont[t_id]['is_a']:
                    if parent_id in self.ont:
                        q.append(parent_id)
        return term_set

    def get_parents(self, term_id):
        """Return the direct parents of ``term_id`` that exist in the ontology."""
        if term_id not in self.ont:
            return set()
        term_set = set()
        for parent_id in self.ont[term_id]['is_a']:
            if parent_id in self.ont:
                term_set.add(parent_id)
        return term_set

    def get_namespace_terms(self, namespace):
        """Return every key of the ontology whose term is in ``namespace``."""
        terms = set()
        for go_id, obj in self.ont.items():
            if obj['namespace'] == namespace:
                terms.add(go_id)
        return terms

    def get_namespace(self, term_id):
        """Return the namespace of ``term_id``; KeyError if the id is unknown."""
        return self.ont[term_id]['namespace']

    def get_term_set(self, term_id):
        """Return ``term_id`` plus every term reachable through ``children``.

        Unknown ids yield an empty set.
        """
        if term_id not in self.ont:
            return set()
        term_set = set()
        q = deque()
        q.append(term_id)
        while q:
            t_id = q.popleft()
            if t_id not in term_set:
                term_set.add(t_id)
                for ch_id in self.ont[t_id]['children']:
                    q.append(ch_id)
        return term_set
    
def read_pkl(input_file):
    """Unpickle and return the object stored in the file ``input_file``."""
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(input_file, 'rb') as handle:
        return pkl.load(handle)

def save_pkl(output_file, data):
    """Pickle ``data`` into the file ``output_file`` (overwriting it)."""
    with open(output_file, 'wb') as handle:
        pkl.dump(data, handle)
        
def get_label(anations, func_list):
    """Build a 0/1 indicator vector over ``func_list``.

    Position ``k`` of the returned numpy array is 1 when ``func_list[k]`` is
    contained in ``anations`` and 0 otherwise.
    """
    flags = [1 if label in anations else 0 for label in func_list]
    return np.array(flags)

# Progress marker: printed once the definitions above have been executed.
print('done')

done


In [3]:
# Root directory of the train/test data.  The historical location stays the
# default, but it can be overridden with the SWISSPROT_DATA_DIR environment
# variable so the notebook is not tied to one machine.  os.path.join(..., '')
# guarantees the trailing separator that the `base_path + 'file'` expressions
# used throughout the notebook rely on.
base_path = os.path.join(
    os.environ.get(
        'SWISSPROT_DATA_DIR',
        '/home/wbshi/work/swissprot_data/train_test_data_handled_v4/',
    ),
    '',
)

In [14]:
# Load the pickled training-set annotations through the shared helper.
protein_message = read_pkl(base_path + 'train_data_separate.pkl')

In [8]:
# Sanity check on the loaded training annotations: how many keys there are,
# and whether the example entry 'ABCG1_HUMAN' is one of them.
print(len(protein_message.keys()))  # number of keys in protein_message
print('ABCG1_HUMAN' in protein_message)

70016
False


In [10]:
# Parse the ontology file, following part_of relationships as parents.
# NOTE(review): `go` is not referenced by the rest of this cell, and a later
# cell constructs it again with the same arguments.
go = Ontology(base_path+'handled_protein_messages/go.obo', with_rels=True)

def read_diamond_result(input_file):
    """Parse a whitespace-delimited hit table into ``{query: {hit: score}}``.

    Every non-blank line must hold at least three fields: query id, hit id and
    a number parsed with ``float`` (presumably the DIAMOND score column chosen
    when the table was produced -- confirm against the DIAMOND command line).
    Ids are used verbatim; they are not split on ``'|'``.

    * Blank / whitespace-only lines are skipped instead of raising IndexError.
    * Self hits (query id == hit id) are not recorded, but the query still gets
      an entry, possibly an empty dict.
    * If the same (query, hit) pair appears more than once, the last line wins.

    Raises:
        IndexError: a non-blank line has fewer than three fields.
        ValueError: the third field is not a valid float.
    """
    diamond_result = {}
    with open(input_file, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Tolerate empty lines (e.g. a trailing newline at end of file).
                continue
            query_id = fields[0]
            hit_id = fields[1]
            score = float(fields[2])

            hits = diamond_result.setdefault(query_id, {})
            if query_id != hit_id:
                hits[hit_id] = score
    return diamond_result

def _aggregate_go_scores(sim_prots, protein_message, missing):
    """Turn one query's hits into normalised per-term scores.

    For every hit present in ``protein_message`` its score is added to each
    term in the union of the hit's ``all_bp``, ``all_cc`` and ``all_mf``
    entries; each term total is then divided by the summed score of those
    hits.  Hits absent from ``protein_message`` are added to ``missing`` and
    contribute nothing.
    """
    term_scores = {}
    total_score = 0.0
    for hit_id, score in sim_prots.items():
        if hit_id not in protein_message:
            missing.add(hit_id)
            continue
        annots = protein_message[hit_id]
        for term in annots['all_bp'] | annots['all_cc'] | annots['all_mf']:
            term_scores[term] = term_scores.get(term, 0.0) + score
        total_score += score

    # A zero total would otherwise raise ZeroDivisionError; report 0.0 for
    # every collected term in that case.
    return {
        term: (value / total_score if total_score else 0.0)
        for term, value in term_scores.items()
    }


def gennerate_diamond_score(input_file, output_file, protein_message=None):
    """Score terms for every query in a DIAMOND hit table and pickle the result.

    Args:
        input_file: hit table, read with ``read_diamond_result``.
        output_file: destination of the pickled ``{query: {term: score}}`` dict.
        protein_message: optional mapping ``protein id -> record`` where each
            record exposes ``all_bp``, ``all_cc`` and ``all_mf`` (combined with
            ``|``, so presumably sets -- confirm against the pickle's
            producer).  When omitted, it is loaded from
            ``base_path + 'train_data_separate.pkl'`` exactly as before;
            passing it in avoids re-reading that pickle on every call.

    Queries without any recorded hit are left out of the result.  The function
    prints the number of scored queries and the number of distinct hit ids that
    were not found in ``protein_message``.
    """
    if protein_message is None:
        protein_message = read_pkl(base_path + 'train_data_separate.pkl')

    diamond_result = read_diamond_result(input_file)

    diamond_function_result = {}
    all_p_set = set()  # hit ids that have no entry in protein_message
    for prot_id, sim_prots in diamond_result.items():
        if len(sim_prots) == 0:
            continue
        diamond_function_result[prot_id] = _aggregate_go_scores(
            sim_prots, protein_message, all_p_set
        )

    print(len(diamond_function_result), len(all_p_set))
    save_pkl(output_file, diamond_function_result)
            
            
# Filename template of the DIAMOND hit tables, one per data split.
input_path = './diamond_{0}_all_sequence.result'

# Directory receiving the score pickles; created when it does not exist yet.
ouput_path = './'
if not os.path.exists(ouput_path):
    os.mkdir(ouput_path)

# From here on `ouput_path` is the filename template of the score pickles.
ouput_path = ouput_path + 'diamond_oral_{0}_func_score.pkl'

# Score each split in turn, printing a separator line between two splits.
for split_index, split_name in enumerate(('test_one', 'test_two', 'train')):
    if split_index > 0:
        print('*'*100)
    input_file = input_path.format(split_name)
    output_file = ouput_path.format(split_name)
    gennerate_diamond_score(input_file, output_file)

1079 36
****************************************************************************************************
1118 25
****************************************************************************************************
61069 235


In [4]:
# go = Ontology('./data/20210101_go.obo', with_rels=True)
# Ontology used to look up the namespace of every scored term.
go = Ontology(base_path+'handled_protein_messages/go.obo', with_rels=True)


# For each test split: bucket the DIAMOND term scores by namespace tag
# ('bp' / 'cc' / 'mf') for every protein listed in the split's FASTA file.
for types in ['test_one', 'test_two']:
    data_file = './diamond_oral_{0}_func_score.pkl'.format(types)
    temp_result = read_pkl(data_file)

    # Protein ids are the FASTA header lines with the leading '>' removed.
    input_file = base_path + "{0}_data_all_sequences.fasta".format(types)
    with open(input_file, 'r') as fr:
        all_test_protein_id = {
            header.strip()[1:] for header in fr if header.startswith('>')
        }

    all_result = {}
    for protein in all_test_protein_id:
        # Every protein gets all three buckets, even when it has no scores.
        per_namespace = {'bp': {}, 'cc': {}, 'mf': {}}
        all_result[protein] = per_namespace
        if protein in temp_result:
            for func, score in temp_result[protein].items():
                tag = NAMESPACES[go.get_namespace(func)]
                per_namespace[tag][func] = score

    print(len(all_result))
    outPut_path = './diamond_final_{0}_predict_score.pkl'.format(types)
    save_pkl(outPut_path, all_result)


1353
1381
