**数据预处理脚本，把蛋白质序列和ppi分别表示为矩阵和向量，矩阵维度为1500*100，ppi嵌入维度为128**

**基因本体数据按照出现频次划分，每个分支的数据各不相同**

In [1]:
#导入依赖
import pandas as pd
import pickle 

In [2]:
###trigram vector, 100d
pvec={}
with open("protVec_100d_3grams.csv",'r')as f:
    for line in f:
        line = line.strip().split()
        line[0] = line[0].strip('\"')
        line[-1]=line[-1].strip('\"')
        pvec[line[0]]=list(map(float, line[1:]))

In [3]:
## 序列数据，转为dataframe
df = pd.read_excel('uniprot-filtered-organism__Mus+musculus+(Mouse)+[10090]_+AND+revie-- (1).xlsx')
seqs=dict(zip(df['Entry'], df['Sequence']))

In [4]:
##把序列转为矩阵的函数，这里选择了固定的1500长度，超过的后面部分不计算，不足的padding
def seq2vec(seq):
    temp=[]
    for i in range(1500):
        if i>len(seq)-3:
            temp.append([0.]*100)
        else:
            temp.append(pvec.get(seq[i:i+3], [0.]*100))
    return temp

In [5]:
#此处消耗极大内存 
for i in seqs:
    seqs[i] = seq2vec(seqs[i])

In [6]:
##标签数据，确保标签和序列大小一致
labels=dict(zip(df['Entry'], df['Gene ontology IDs']))
len(labels)==len(seqs)

True

In [7]:
for i in labels:
    if not isinstance(labels[i],float):
        temp = labels[i].split(';')
        for j in range(len(temp)):
            temp[j]=temp[j].strip(' ')
        labels[i]=temp

**接下来处理基因本体的数据，按照bpo cco mfo 划分，每个分支各自拥有独立的数据**

In [8]:
import collections
gos=[]
namespace=collections.defaultdict(str)
is_a=collections.defaultdict(list)
part=collections.defaultdict(list)

In [9]:
###根据规则来提取go term ，并依据其之间的依赖关系构建图谱
with open('go.obo','r')as fin:
    for line in fin:
        if '[Typedef]' in line:
            break
        if line[:5]=='id: G':
            line=line.strip().split()
            gos.append(line[1])
        elif line[:4]=='is_a':
            line=line.strip().split()
            is_a[gos[-1]].append(line[1])
        elif line[:4]=='rela' and 'part' in line:
            line=line.strip().split()
            part[gos[-1]].append(line[2])
        elif line[:5]=='names':
            line=line.strip().split()
            namespace[gos[-1]]=line[1]

In [10]:
##来看看共有多少个term
print(len(gos))
print(len(is_a))
print(len(part))

47432
44505
8629


In [11]:
for i in part:
    is_a[i].extend(part[i])

In [12]:
###true_path_rule
def progate(l):
    while True:
        length=len(l)
        temp=[]
        for i in l:
            temp.extend(is_a[i])
        l.update(temp)
        if len(l)==length:
            return l

In [13]:
##划分子空间，每个子空间是一个集合
bp,mf,cc=set(),set(),set()
for i in namespace:
    if namespace[i]=='biological_process':
        bp.add(i)
    elif namespace[i]=='molecular_function':
        mf.add(i)
    elif namespace[i]=='cellular_component':
        cc.add(i)

In [14]:
labels_with_go={}
for i in labels:
    if not isinstance(labels[i],float):
        labels_with_go[i]=progate(set(labels[i]))
len(labels),len(labels_with_go)### some items has no label are discarded

(17038, 16525)

In [15]:
fre_counter = collections.Counter()
for i in labels_with_go:
    fre_counter.update(labels_with_go[i])

In [16]:
label_bp,label_cc,label_mf=collections.defaultdict(list),collections.defaultdict(list),\
collections.defaultdict(list)
for i in labels_with_go:
    
    for j in labels_with_go[i]:
        if j in bp:
            label_bp[i].append(j)
        elif j in cc:
            label_cc[i].append(j)
        elif j in mf:
            label_mf[i].append(j)

In [17]:
print(len(label_bp))
print(len(label_cc))
print(len(label_mf))

15157
15963
13733


In [18]:
bp_c=collections.Counter()

In [19]:
for i in label_bp:
    #for j in label_bp[i]:
    bp_c.update(label_bp[i])

In [20]:
bp_d=dict(bp_c)
bp_set=set()
for i in bp_d:
    if bp_d[i]>=300:
        bp_set.add(i)

In [21]:
##bp 分支共计683个term
len(bp_set)

683

In [22]:
bp_label=collections.defaultdict(list)
for i in label_bp:
    for j in label_bp[i]:
        if j in bp_set:
            bp_label[i].append(j)

In [23]:
len(bp_label)

15157

In [24]:
cc_c=collections.Counter()
for i in label_cc:
    #for j in label_bp[i]:
    cc_c.update(label_cc[i])
cc_d=dict(cc_c)
cc_set=set()
for i in cc_d:
    if cc_d[i]>=100:
        cc_set.add(i)
cc_label=collections.defaultdict(list)
for i in label_cc:
    for j in label_cc[i]:
        if j in cc_set:
            cc_label[i].append(j)

In [25]:
len(cc_set)

285

In [26]:
len(cc_label)

15963

In [27]:
mf_c=collections.Counter()
for i in label_mf:
    #for j in label_bp[i]:
    mf_c.update(label_mf[i])
mf_d=dict(mf_c)
mf_set=set()
for i in mf_d:
    if mf_d[i]>=100:
        mf_set.add(i)
mf_label=collections.defaultdict(list)
for i in label_mf:
    for j in label_mf[i]:
        if j in mf_set:
            mf_label[i].append(j)

In [28]:
len(mf_label)

13730

In [29]:
len(mf_set)

272

In [30]:
len(seqs)

17038

**下面处理PPI数据**

In [31]:
##uniprot数据库中有和string数据库对齐的字段
mapping=dict(zip(df['Entry'], df['Cross-reference (STRING)']))

In [32]:
##去掉后面的引号
for i in mapping:
    if isinstance(mapping[i],str):
        mapping[i]=mapping[i].strip(';')

In [33]:
with open('pevc10090.pkl','rb')as f:
    homo=pickle.load(f)
names=homo['graph']['id2name']
vecs=homo['solver']['vertex_embeddings']


**现在开始对齐数据**

In [34]:
##构建基因本体term的索引
def goterm2idx(term_set):
    term_dict=dict(enumerate(term_set))
    term_dict={v:k for k,v in term_dict.items()}
    return term_dict

In [35]:
cc_term2idx=goterm2idx(cc_set)
mf_term2idx=goterm2idx(mf_set)
bp_term2idx=goterm2idx(bp_set)

In [36]:
def labels2onehot(labels,index):
    labels_new={}
    l=len(index)
    for i in labels:
        temp = [0]*l
        for j in labels[i]:
            temp[index[j]]=1
        labels_new[i]=temp
    return labels_new

In [37]:
cc_label2onehot=labels2onehot(cc_label,cc_term2idx)
bp_label2onehot=labels2onehot(bp_label,bp_term2idx)
mf_label2onehot=labels2onehot(mf_label,mf_term2idx)

In [38]:
cc_entry=list(cc_label.keys())
bp_entry=list(bp_label.keys())
mf_entry=list(mf_label.keys())

In [39]:
bp_emb=[]
bp_ppi=[]
bp_la=[]
for i in bp_entry:
    bp_emb.append(seqs[i])
    bp_la.append(bp_label2onehot[i])
    m=mapping[i]
    if m in names:
        bp_ppi.append(vecs[names.index(m)].tolist())
    else:
        bp_ppi.append([0.]*128)
    

In [40]:
emb_train=bp_emb[:11000]
emb_test=bp_emb[11000:]
ppi_vec_train=bp_ppi[:11000]
ppi_vec_test=bp_ppi[11000:]
labels_train=bp_la[:11000]
labels_test=bp_la[11000:]

In [41]:
len(bp_entry)*.7

10609.9

In [42]:
with open('bp/emb_train.pkl','wb')as f:
    pickle.dump(emb_train,f)
with open('bp/emb_test.pkl','wb')as f:
    pickle.dump(emb_test,f)
with open('bp/ppi_train.pkl','wb')as f:
    pickle.dump(ppi_vec_train,f)
with open('bp/ppi_test.pkl','wb')as f:
    pickle.dump(ppi_vec_test,f)
with open('bp/labels_train.pkl','wb')as f:
    pickle.dump(labels_train,f)
with open('bp/labels_test.pkl','wb')as f:
    pickle.dump(labels_test,f)

In [43]:
mf_emb=[]
mf_ppi=[]
mf_la=[]
for i in mf_entry:
    mf_emb.append(seqs[i])
    mf_la.append(mf_label2onehot[i])
    m=mapping[i]
    if m in names:
        mf_ppi.append(vecs[names.index(m)].tolist())
    else:
        mf_ppi.append([0.]*128)

In [44]:
len(mf_emb)==len(mf_la)==len(mf_ppi)

True

In [45]:
emb_train=mf_emb[:9000]
emb_test=mf_emb[9000:]
ppi_vec_train=mf_ppi[:9000]
ppi_vec_test=mf_ppi[9000:]
labels_train=mf_la[:9000]
labels_test=mf_la[9000:]

In [47]:
with open('mf/emb_train.pkl','wb')as f:
    pickle.dump(emb_train,f)
with open('mf/emb_test.pkl','wb')as f:
    pickle.dump(emb_test,f)
with open('mf/ppi_train.pkl','wb')as f:
    pickle.dump(ppi_vec_train,f)
with open('mf/ppi_test.pkl','wb')as f:
    pickle.dump(ppi_vec_test,f)
with open('mf/labels_train.pkl','wb')as f:
    pickle.dump(labels_train,f)
with open('mf/labels_test.pkl','wb')as f:
    pickle.dump(labels_test,f)

In [48]:
cc_emb=[]
cc_ppi=[]
cc_la=[]
for i in cc_entry:
    cc_emb.append(seqs[i])
    cc_la.append(cc_label2onehot[i])
    m=mapping[i]
    if m in names:
        cc_ppi.append(vecs[names.index(m)].tolist())
    else:
        cc_ppi.append([0.]*128)

In [49]:
emb_train=cc_emb[:12000]
emb_test=cc_emb[12000:]
ppi_vec_train=cc_ppi[:12000]
ppi_vec_test=cc_ppi[12000:]
labels_train=cc_la[:12000]
labels_test=cc_la[12000:]

In [53]:
with open('cc/emb_train.pkl','wb')as f:
    pickle.dump(emb_train,f)
with open('cc/emb_test.pkl','wb')as f:
    pickle.dump(emb_test,f)
with open('cc/ppi_train.pkl','wb')as f:
    pickle.dump(ppi_vec_train,f)
with open('cc/ppi_test.pkl','wb')as f:
    pickle.dump(ppi_vec_test,f)
with open('cc/labels_train.pkl','wb')as f:
    pickle.dump(labels_train,f)
with open('cc/labels_test.pkl','wb')as f:
    pickle.dump(labels_test,f)

In [50]:
mf_emb=[]
mf_ppi=[]
mf_la=[]
for i in mf_entry:
    mf_emb.append(seqs[i])
    mf_la.append(mf_label2onehot[i])
    m=mapping[i]
    if m in names:
        mf_ppi.append(vecs[names.index(m)].tolist())
    else:
        mf_ppi.append([0.]*128)

In [51]:
emb_train=mf_emb[:10000]
emb_test=mf_emb[10000:]
ppi_vec_train=mf_ppi[:10000]
ppi_vec_test=mf_ppi[10000:]
labels_train=mf_la[:10000]
labels_test=mf_la[10000:]

In [52]:
with open('mf/emb_train.pkl','wb')as f:
    pickle.dump(emb_train,f)
with open('mf/emb_test.pkl','wb')as f:
    pickle.dump(emb_test,f)
with open('mf/ppi_train.pkl','wb')as f:
    pickle.dump(ppi_vec_train,f)
with open('mf/ppi_test.pkl','wb')as f:
    pickle.dump(ppi_vec_test,f)
with open('mf/labels_train.pkl','wb')as f:
    pickle.dump(labels_train,f)
with open('mf/labels_test.pkl','wb')as f:
    pickle.dump(labels_test,f)