**这个脚本用来评估cafa3上各个模型的性能**

In [1]:
import pandas as pd
import numpy as np
import pickle
import collections

In [2]:
##同样的 需要使用pvec
pvec={}
with open('protVec_100d_3grams.csv','r')as fin:
    for line in fin:
        line=line.strip('\"').split()
        name=line[0]
        line[-1]=line[-1].strip('\"')
        vec=list(map(float,line[1:]))
#         print(name)
#         print(vec)
        pvec[name]=vec

In [3]:
##准备fasta数据
k,v=[],[]
with open('cafa3/uniprot_sprot_exp.fasta','r')as fin:
    for line in fin:
        #print(line)
        line=line.strip()
        if line.startswith('>'):
            k.append(line[1:])
            v.append('')
        else:
            v[-1]+=line.strip()
        

In [5]:
##标签数据
labels=collections.defaultdict(list)
domains=collections.defaultdict(list)
with open('cafa3/uniprot_sprot_exp.txt','r')as fin:
    for line in fin:
        line=line.split()
        domains[line[-1]].append(line[1])
        labels[line[0]].append(line[1])

In [6]:
def seq2vec(seq):
    temp=[]
    for i in range(1500):
        if i>len(seq)-3:
            temp.append([0.]*100)
        else:
            temp.append(pvec.get(seq[i:i+3], [0.]*100))
    return temp

In [7]:
##把序列转为向量
for i in range(len(v)):
    v[i] = seq2vec(v[i])

In [8]:
seqs = {}
for i in range(len(v)):
    seqs[k[i]] = v[i]

In [9]:
###下面处理标签
gos=[]
namespace=collections.defaultdict(str)
is_a=collections.defaultdict(list)
part=collections.defaultdict(list)
with open('cafa3/go.obo','r')as fin:
    for line in fin:
        if '[Typedef]' in line:
            break
        if line[:5]=='id: G':
            line=line.strip().split()
            gos.append(line[1])
        elif line[:4]=='is_a':
            line=line.strip().split()
            is_a[gos[-1]].append(line[1])
        elif line[:4]=='rela' and 'part' in line:
            line=line.strip().split()
            part[gos[-1]].append(line[2])
        elif line[:5]=='names':
            line=line.strip().split()
            namespace[gos[-1]]=line[1]

In [10]:
## 统计各分支中term的个数
c=collections.Counter(namespace.values())
c

Counter({'biological_process': 30780,
         'molecular_function': 12196,
         'cellular_component': 4450})

In [11]:
##去除分支之内的term
bp,mf,cc=set(),set(),set()
for i in namespace:
    if namespace[i]=='biological_process':
        bp.add(i)
    elif namespace[i]=='molecular_function':
        mf.add(i)
    elif namespace[i]=='cellular_component':
        cc.add(i)

In [12]:
for i in part:
    is_a[i].extend(part[i])

In [15]:
##cafa3的标签比较稀疏，利用True path rule 来补全
def progate(l):
    while True:
        length=len(l)
        temp=[]
        for i in l:
            temp.extend(is_a[i])
        l.update(temp)
        if len(l)==length:
            return l

In [16]:
for i in labels:
    if labels[i]:
        labels[i]=progate(set(labels[i]))

In [18]:
fre_counter = collections.Counter()
for i in labels:
    fre_counter.update(labels[i])

In [30]:
cc_set = set()
for i in cc:
    if fre_counter[i]>100:
        cc_set.add(i)  ##381

In [91]:
bp_set = set()
for i in bp:
    if fre_counter[i]>400:
        bp_set.add(i)   ##len 844

In [92]:
mf_set = set()
for i in mf:
    if fre_counter[i]>100:
        mf_set.add(i)
print(len(mf_set))  ##403

403


In [32]:
labels_cc = []  ##49138
emb_cc = []
entry = []
for i in labels:
    temp = []
    for j in labels[i]:
        if j in cc_set:
            temp.append(j)
    if len(temp)>0:
        labels_cc.append(temp)
        emb_cc.append(seqs[i])
        entry.append(i)

In [75]:
labels_bp = [] ##52191
emb_bp = []
entry_bp = []
for i in labels:
    temp = []
    for j in labels[i]:
        if j in bp_set:
            temp.append(j)
    if len(temp)>0:
        labels_bp.append(temp)
        emb_bp.append(seqs[i])
        entry_bp.append(i)

In [93]:
labels_mf = []  ###34343
emb_mf = []
entry_mf = []
for i in labels:
    temp = []
    for j in labels[i]:
        if j in mf_set:
            temp.append(j)
    if len(temp)>0:
        labels_mf.append(temp)
        emb_mf.append(seqs[i])
        entry_mf.append(i)

In [35]:
labels_cc2onehot = []
term2idx=dict(enumerate(cc_set))
term2idx={v:k for k,v in term2idx.items()}

In [76]:
labels_bp2onehot = []
term2idx_bp=dict(enumerate(bp_set))
term2idx_bp={v:k for k,v in term2idx_bp.items()}

In [94]:
labels_mf2onehot = []
term2idx_mf=dict(enumerate(mf_set))
term2idx_mf={v:k for k,v in term2idx_mf.items()}

In [36]:
for i in labels_cc:
    temp = [0]*len(cc_set)
    for j in i:
        temp[term2idx[j]]=1
    labels_cc2onehot.append(temp)

In [77]:
for i in labels_bp:
    temp = [0]*len(bp_set)
    for j in i:
        temp[term2idx_bp[j]]=1
    labels_bp2onehot.append(temp)

In [108]:
for i in labels_mf:
    temp = [0]*len(mf_set)
    for j in i:
        temp[term2idx_mf[j]]=1
    labels_mf2onehot.append(temp)

In [109]:
train_emb_mf = emb_mf[:15000]
train_label_mf = labels_mf2onehot[:15000]

In [111]:
###label smoothing
train_label_mf_smoothed = []
for i in train_label_mf:
    c = sum(i)
    sm = 0.15*c/len(i)
    temp=[]
    for  j in i:
        if j==0.:
            temp.append(sm)
        elif j==1.:
            temp.append(0.85+sm)
    train_label_mf_smoothed.append(temp)

In [114]:
with open('cafa3/train_emb_mf.pkl','wb')as f:
    pickle.dump(train_emb_mf,f)
with open('cafa3/label_mf_smoothed.pkl','wb')as f:
    pickle.dump(train_label_mf_smoothed,f)

In [115]:
test_emb_mf = emb_mf[15000:20000]
test_label_mf = labels_mf2onehot[15000:20000]

In [116]:
with open('cafa3/test_emb_mf.pkl','wb')as f:
    pickle.dump(test_emb_mf,f)
with open('cafa3/test_label_mf.pkl','wb')as f:
    pickle.dump(test_label_mf,f)

In [117]:
with open('cafa3/label_mf_unsmoothed.pkl','wb')as f:
    pickle.dump(train_label_mf,f)