In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
# Input directory (trailing slash required: file names are appended with `+`).
src_dir = '../src_data/GO0404/'

# All evaluation CSVs produced by this notebook share one directory and prefix.
eval_dir = '../evalGene/'
eval_bin_pth = eval_dir + 'GOA0404_ppi.csv'
eval_score_pth = eval_dir + 'GOA0404_score_ppi.csv'
eval_type_pth = eval_dir + 'GOA0404_type_ppi.csv'

In [3]:
# Read the three tab-separated inputs from src_dir in one pass:
# edges.tsv -> go_trp, goa_edges.tsv -> goa_trp, entities.tsv -> ent_df.
go_trp, goa_trp, ent_df = (
    pd.read_csv(src_dir + tsv_name, sep='\t')
    for tsv_name in ('edges.tsv', 'goa_edges.tsv', 'entities.tsv')
)

#### GO & GOA preprocessing

In [4]:
# Stack the rows of go_trp and goa_trp into a single triple table;
# ignore_index=True gives the result a fresh 0..n-1 index.
trp = pd.concat([go_trp, goa_trp], ignore_index=True)

In [5]:
# Load the object stored in prt_list.pkl; it is enumerated below to build prot_to_id.
# NOTE(review): pickle.load can execute arbitrary code - only load files you trust.
with open(src_dir+"prt_list.pkl","rb") as f:
    prot_list = pickle.load(f)

In [6]:
# Number of entries in prot_list.
len(prot_list)

18159

In [7]:
# Ids 0..cnt-1 go to the values of ent_df['term'] (a repeated term keeps its last position).
ent_to_id = {term: idx for idx, term in enumerate(ent_df['term'])}
cnt = len(ent_to_id)
print('term: ', cnt)

# Entries of prot_list are numbered right after the terms, starting at cnt.
prot_to_id = {prot: idx for idx, prot in enumerate(prot_list, start=cnt)}
ent_to_id.update(prot_to_id)

# cnt now counts every key in the combined mapping.
cnt = len(ent_to_id)
print('entities: ', cnt)

term:  42950
entities:  61109


In [8]:
# Distinct relation labels found in the combined triple table.
rel = set(trp['rel'])
# Ids follow the sorted order of the labels, so the mapping is stable across runs.
rel_to_id = dict(zip(sorted(rel), range(len(rel))))
rel_to_id

{'goa_acts_upstream_of_or_within': 0,
 'goa_colocalizes_with': 1,
 'goa_contributes_to': 2,
 'goa_enables': 3,
 'goa_involved_in': 4,
 'goa_is_active_in': 5,
 'goa_located_in': 6,
 'goa_part_of': 7,
 'is_a': 8,
 'negatively_regulates': 9,
 'part_of': 10,
 'positively_regulates': 11,
 'regulates': 12}

In [9]:
def _labels_to_ids(labels, mapping, column):
    """Translate a Series of labels to integer ids with a vectorized lookup.

    labels  -- pandas Series of head / rel / tail labels
    mapping -- dict from label to integer id
    column  -- column name, used only in the error message

    Returns an int64 Series aligned with `labels`.
    Raises KeyError when a label has no entry in `mapping` (same failure mode
    as indexing the dict directly, but reported with a sample of the offenders).
    """
    ids = labels.map(mapping)
    unmapped = ids.isna()
    if unmapped.any():
        sample = labels[unmapped].unique()[:5].tolist()
        raise KeyError('No id for {} value(s) in column {!r}, e.g. {}'.format(
            int(unmapped.sum()), column, sample))
    return ids.astype('int64')


# Series.map performs the dict lookup column-wise, avoiding one Python call per row.
u = _labels_to_ids(trp['head'], ent_to_id, 'head')
r = _labels_to_ids(trp['rel'], rel_to_id, 'rel')
v = _labels_to_ids(trp['tail'], ent_to_id, 'tail')

# Replace the label-based table with its integer-encoded equivalent (fresh 0..n-1 index).
trp = pd.DataFrame({'head': u.to_numpy(), 'rel': r.to_numpy(), 'tail': v.to_numpy()},
                   columns=['head', 'rel', 'tail'])

In [10]:
from scipy.sparse import coo_matrix
import numpy as np

# Fixed seed so the split and the sampled negatives are identical on every run.
SPLIT_SEED = 0
split_rng = np.random.RandomState(SPLIT_SEED)

# --- Per-relation 80/10/10 split -------------------------------------------
# Each relation is shuffled and cut on its own, so every relation is
# represented in train / valid / test in the same proportions.
train_parts, valid_parts, test_parts = [], [], []
for _rel_id, rel_rows in trp.groupby('rel', sort=False):
    shuffled = rel_rows.sample(frac=1, random_state=split_rng).reset_index(drop=True)

    n_rows = shuffled.shape[0]
    train_idx = int(n_rows*0.8)
    # The remainder is halved between validation and test.
    valid_idx = train_idx + (n_rows-train_idx)//2

    train_parts.append(shuffled.iloc[:train_idx])
    valid_parts.append(shuffled.iloc[train_idx:valid_idx])
    test_parts.append(shuffled.iloc[valid_idx:])


def _stack(parts):
    """Concatenate the collected slices once; empty input gives an empty frame."""
    return pd.concat(parts) if parts else pd.DataFrame(columns=trp.columns)


train_df = _stack(train_parts)
valid_df = _stack(valid_parts)
test_df = _stack(test_parts)

# --- k-hop negative sampling ------------------------------------------------
# adj[i, j] > 0 iff there is an edge head i -> tail j (duplicates are summed).
n = len(ent_to_id)
adj = coo_matrix((np.ones(len(u)), (np.array(u), np.array(v))), shape=(n, n))
k_hop = 5

# k_mat[i, j] counts walks of length exactly k_hop from i to j.
adj_csr = adj.tocsr()
k_mat = adj_csr
for _hop in range(k_hop - 1):
    k_mat = k_mat @ adj_csr

# Negative candidates: reachable by a k_hop walk but not directly connected.
# Everything stays sparse; a dense n x n matrix does not fit in memory for
# tens of thousands of entities.
reach_mask = (k_mat > 0).astype(np.int8)
edge_mask = (adj_csr > 0).astype(np.int8)
neg_mat = reach_mask - reach_mask.multiply(edge_mask)
neg_mat.eliminate_zeros()
neg_u, neg_v = neg_mat.nonzero()

if len(neg_u) < len(test_df):
    raise ValueError('Only {} {}-hop negative candidates for {} test triples.'.format(
        len(neg_u), k_hop, len(test_df)))

# One negative pair per test triple, drawn without replacement.
neg_eids = split_rng.choice(len(neg_u), len(test_df), replace=False)
test_neg_u, test_neg_v = neg_u[neg_eids], neg_v[neg_eids]

In [11]:
# Pair each sampled negative (head, tail) with the relation id found at the same
# position in test_df, giving one negative triple per test triple.
neg_df = pd.DataFrame(list(zip(test_neg_u, test_df['rel'], test_neg_v)), columns=['head', 'rel', 'tail'])

In [12]:
import pickle
import os

# Output directory for the pickled triple splits.
goa_dir = '../data/GOA0404/'

# A regular file at the target location cannot be turned into a directory.
# normpath strips the trailing slash so the check can actually see such a file.
if os.path.isfile(os.path.normpath(goa_dir)):
    raise Exception('Data path (' + goa_dir + ') exists as a file. '
                   'Please rename, remove or change the desired location of the data path.')

# Real `mkdir -p`: creates missing parents and is a no-op if the directory exists.
os.makedirs(goa_dir, exist_ok=True)

# Each split is stored as a uint64 array with columns (head, rel, tail).
split_files = (
    ("train.pickle", train_df),
    ("valid.pickle", valid_df),
    ("test.pickle", test_df),
    ("test_neg.pickle", neg_df),
)
for file_name, split_frame in split_files:
    with open(goa_dir+file_name, 'wb') as f:
        pickle.dump(split_frame.to_numpy().astype('uint64'), f)

#### PPI binary classification

In [13]:
import pandas as pd
# Read the tab-separated file goa_bin_ppi.tsv from src_dir and display it.
ppi_df = pd.read_csv(src_dir+'goa_bin_ppi.tsv', sep='\t')
ppi_df

Unnamed: 0,source,target,score
0,P84085,P84077,969
1,P84085,Q5T7V8,914
2,P84085,P51157,936
3,P84085,P61204,908
4,P84085,Q92538,928
...,...,...,...
235553,Q9Y262,Q7L2H7,999
235554,Q9Y262,Q9UBQ5,999
235555,Q9Y262,Q04637,908
235556,Q9Y262,P60228,999


In [14]:
# Translate both endpoints of every listed interaction to their integer ids
# and tag each row with class 1.
ppi = pd.DataFrame(
    [
        [prot_to_id[src], prot_to_id[dst], 1]
        for src, dst in zip(ppi_df['source'], ppi_df['target'])
    ],
    columns=['source', 'target', 'class'],
)
ppi

Unnamed: 0,source,target,class
0,56624,53148,1
1,56624,47569,1
2,56624,42977,1
3,56624,43526,1
4,56624,47616,1
...,...,...,...
235553,47905,52668,1
235554,47905,59800,1
235555,47905,54314,1
235556,47905,45098,1


In [15]:
# Canonicalise every interaction as an undirected (low_id, high_id) pair so that
# (a, b) and (b, a) collapse to the same tuple.
# Columns are addressed by name: integer keys such as row[0] on a string-labelled
# row rely on a positional fallback that pandas has deprecated.
src_ids = ppi['source'].to_numpy()
dst_ids = ppi['target'].to_numpy()
edge_tuples = pd.Series(
    list(zip(np.minimum(src_ids, dst_ids), np.maximum(src_ids, dst_ids))),
    index=ppi.index,
    dtype=object,
)
# Set of unique undirected positive pairs.
all_edge_tuples = set(edge_tuples)

In [16]:
# One row per unique undirected pair; columns get the default integer labels 0 and 1.
ppi_set = pd.DataFrame(all_edge_tuples)
ppi_set

Unnamed: 0,0,1
0,51466,58427
1,49379,59361
2,50341,51636
3,47900,54324
4,56932,59548
...,...,...
117774,52216,54044
117775,48948,59175
117776,49554,54289
117777,45615,57495


In [17]:
import numpy as np
# Fixed seed so the sampled negative pairs (and the CSV written from them)
# are identical on every run.
NEG_SEED = 0
neg_rng = np.random.RandomState(NEG_SEED)

false_edge_set = set()
# Candidate endpoints are drawn from the id range covered by prot_to_id.
n_min = min(prot_to_id.values())
n_max = max(prot_to_id.values())

# As many negatives as there are positive pairs.
n_target = ppi_set.shape[0]

# Guard against a loop that can never finish: count the distinct unordered
# pairs in [n_min, n_max] that are not already positives.
n_nodes = n_max - n_min + 1
n_positive_pairs = sum(1 for lo, hi in all_edge_tuples if lo != hi)
n_free_pairs = n_nodes * (n_nodes - 1) // 2 - n_positive_pairs
if n_target > n_free_pairs:
    raise ValueError('Requested {} negative pairs but only {} are available.'.format(
        n_target, n_free_pairs))

# Rejection sampling: skip self-pairs and known positives; the set itself
# removes repeated draws.
while len(false_edge_set) < n_target:
    head = neg_rng.randint(n_min, n_max+1)
    tail = neg_rng.randint(n_min, n_max+1)
    if head == tail:
        continue
    false_edge = (min(head, tail), max(head, tail))
    if false_edge not in all_edge_tuples:
        false_edge_set.add(false_edge)

In [18]:
# Number of sampled negative pairs.
len(false_edge_set)

117779

In [19]:
# Negative pairs as a frame whose columns are labelled 0 and 1, the same labels
# ppi_set receives by default, so the two frames can be concatenated.
ppi_neg=pd.DataFrame(false_edge_set, columns=[0, 1])
ppi_neg

Unnamed: 0,0,1
0,48259,55166
1,47970,55063
2,47129,48031
3,43271,56329
4,44080,46110
...,...,...
117774,51114,60254
117775,50622,54130
117776,46937,47861
117777,46916,46984


In [20]:
# Add the class column in place: 1 for observed pairs, 0 for sampled negatives.
for pair_frame, label in ((ppi_set, 1), (ppi_neg, 0)):
    pair_frame['class'] = [label] * len(pair_frame)

# Positives first, negatives after, re-indexed 0..n-1.
ppi = pd.concat((ppi_set, ppi_neg), ignore_index=True)

In [21]:
# Display the combined positive / negative pair table.
ppi

Unnamed: 0,0,1,class
0,51466,58427,1
1,49379,59361,1
2,50341,51636,1
3,47900,54324,1
4,56932,59548,1
...,...,...,...
235553,51114,60254,0
235554,50622,54130,0
235555,46937,47861,0
235556,46916,46984,0


In [22]:
# Write the labelled pairs to eval_bin_pth; index=False drops the index column
# and header=None is meant to suppress the header row.
ppi.to_csv(eval_bin_pth, index=False, header=None)

In [23]:
# Persist the prot_to_id mapping as prot_id.pickle inside src_dir;
# a later cell of this notebook loads it back.
with open(src_dir+'prot_id.pickle','wb') as fw:
    pickle.dump(prot_to_id, fw)

#### binding affinity prediction

In [24]:
import pandas as pd
# Read the tab-separated file goa_score_ppi.tsv from src_dir and display it.
ppi_score_df = pd.read_csv(src_dir+'goa_score_ppi.tsv', sep='\t')
ppi_score_df

Unnamed: 0,source,target,score
0,P84085,Q9HCN4,156
1,P84085,P08708,177
2,P84085,Q7Z628,162
3,P84085,Q8NFJ9,379
4,P84085,Q9BYZ2,287
...,...,...,...
1827207,Q9Y262,P62979,928
1827208,Q9Y262,P62280,925
1827209,Q9Y262,B5ME19,988
1827210,Q9Y262,O15405,152


In [25]:
# Replace the identifiers in both endpoint columns with their integer ids.
# Series.map does the dict lookup column-wise instead of calling a Python
# lambda once per row.
for endpoint_col in ('source', 'target'):
    mapped_ids = ppi_score_df[endpoint_col].map(prot_to_id)
    unmapped = mapped_ids.isna()
    if unmapped.any():
        # Same failure mode as indexing prot_to_id directly, with a readable sample.
        sample = ppi_score_df.loc[unmapped, endpoint_col].unique()[:5].tolist()
        raise KeyError('No id for {} value(s) in column {!r}, e.g. {}'.format(
            int(unmapped.sum()), endpoint_col, sample))
    ppi_score_df[endpoint_col] = mapped_ids.astype('int64')
ppi_score_df

Unnamed: 0,source,target,score
0,56624,50510,156
1,56624,51475,177
2,56624,60977,162
3,56624,54138,379
4,56624,51680,287
...,...,...,...
1827207,47905,54820,928
1827208,47905,56080,925
1827209,47905,56945,988
1827210,47905,53907,152


In [26]:
# Write the id-encoded score table to eval_score_pth; index=False drops the index
# column and header=None is meant to suppress the header row.
ppi_score_df.to_csv(eval_score_pth, index=False, header=None)

#### PPI type classification

In [27]:
import pandas as pd
# Read the tab-separated file goa_type_ppi.tsv from src_dir and display it.
ppi_type_df = pd.read_csv(src_dir+'goa_type_ppi.tsv', sep='\t')
ppi_type_df

Unnamed: 0,item_id_a,item_id_b,activation,binding,catalysis,expression,inhibition,ptmod,reaction
0,P84085,Q9Y587,0,1,0,0,0,0,1
1,P84085,O15155,0,1,1,0,0,0,1
2,P84085,O15498,0,1,1,0,0,0,1
3,P84085,O75379,0,1,0,0,0,0,1
4,P84085,Q9UIA0,1,1,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...
1017387,Q8NGQ2,O60262,1,0,0,0,0,0,0
1017388,Q8NGQ2,P32121,0,0,0,0,1,1,0
1017389,Q8NGQ2,P49407,0,0,0,0,1,1,0
1017390,Q8NGQ2,Q9H902,1,0,0,0,0,0,0


In [30]:
import pickle
# Read back the prot_to_id mapping stored as prot_id.pickle in src_dir.
# NOTE(review): pickle.load can execute arbitrary code - only load files you trust.
with open(src_dir + 'prot_id.pickle', mode='rb') as pickle_in:
    prot_to_id = pickle.load(pickle_in)

In [31]:
# Replace the identifiers in both endpoint columns with their integer ids.
# Series.map does the dict lookup column-wise instead of calling a Python
# lambda once per row.
for endpoint_col in ('item_id_a', 'item_id_b'):
    mapped_ids = ppi_type_df[endpoint_col].map(prot_to_id)
    unmapped = mapped_ids.isna()
    if unmapped.any():
        # Same failure mode as indexing prot_to_id directly, with a readable sample.
        sample = ppi_type_df.loc[unmapped, endpoint_col].unique()[:5].tolist()
        raise KeyError('No id for {} value(s) in column {!r}, e.g. {}'.format(
            int(unmapped.sum()), endpoint_col, sample))
    ppi_type_df[endpoint_col] = mapped_ids.astype('int64')
ppi_type_df

Unnamed: 0,item_id_a,item_id_b,activation,binding,catalysis,expression,inhibition,ptmod,reaction
0,56624,45679,0,1,0,0,0,0,1
1,56624,59211,0,1,1,0,0,0,1
2,56624,46720,0,1,1,0,0,0,1
3,56624,50052,0,1,0,0,0,0,1
4,56624,57052,1,1,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...
1017387,57068,48240,1,0,0,0,0,0,0
1017388,57068,48906,0,0,0,0,1,1,0
1017389,57068,60070,0,0,0,0,1,1,0
1017390,57068,53738,1,0,0,0,0,0,0


In [32]:
# Write the id-encoded interaction-type table to eval_type_pth; index=False drops
# the index column and header=None is meant to suppress the header row.
ppi_type_df.to_csv(eval_type_pth, index=False, header=None)