In [1]:
# Import the OBO parser from GOATools
%pip install goatools
%pip install wget


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
from goatools import obo_parser
import wget
import os

In [3]:
go_obo_url = 'http://purl.obolibrary.org/obo/go/go-basic.obo'
data_folder = '/home/ukjung18/GO'

# Check if we have the ./data directory already
if(not os.path.isfile(data_folder)):
    # Emulate mkdir -p (no error if folder exists)
    try:
        os.mkdir(data_folder)
    except OSError as e:
        if(e.errno != 17):
            raise e
else:
    raise Exception('Data path (' + data_folder + ') exists as a file. '
                   'Please rename, remove or change the desired location of the data path.')

# Check if the file exists already
if(not os.path.isfile(data_folder+'/go-basic.obo')):
    go_obo = wget.download(go_obo_url, data_folder+'/go-basic.obo')
else:
    go_obo = data_folder+'/go-basic.obo'

In [4]:
print(go_obo)

/home/ukjung18/GO/go-basic.obo


In [5]:
go = obo_parser.GODag(go_obo, optional_attrs=['relationship'])

/home/ukjung18/GO/go-basic.obo: fmt(1.2) rel(2023-06-11) 46,420 Terms; optional_attrs(relationship)


In [6]:
len(go)

46420

In [7]:
go_edges = []
go_entities = []
alts = []
obs_cnt = 0
for go_id in go:
    go_term = go[go_id]
    go_entities.append([go_term.id, go_term.level, go_term.namespace])
    # if go_id in alts:
    #     continue
    
    if go_term.is_obsolete:
        obs_cnt+=1
        continue
    if go_term._parents:
        for parents in go_term.parents:
            go_edges.append([go_term.id, 'is_a', go[parents.id].id])
    if go_term.alt_ids:
        alts+=go_term.alt_ids
    if go_term.relationship:
        for rel in go_term.relationship:
            for e in go_term.relationship[rel]:
                go_edges.append([go_term.id, rel, go[e.id].id])
                # go_edges.append([go_term.id, 'is_a', go[e.id].id])
print(len(go_edges))
print(len(go_entities))
print(len(alts))
print(obs_cnt)

90407
46420
11596
0


In [8]:
import pandas as pd
go_df = pd.DataFrame(go_edges, columns=['head', 'rel', 'tail'])
go_df

Unnamed: 0,head,rel,tail
0,GO:0000001,is_a,GO:0048308
1,GO:0000001,is_a,GO:0048311
2,GO:0000002,is_a,GO:0007005
3,GO:0000003,is_a,GO:0008150
4,GO:0000006,is_a,GO:0005385
...,...,...,...
90402,GO:2000342,is_a,GO:2000341
90403,GO:2000342,negatively_regulates,GO:0072567
90404,GO:2000343,is_a,GO:2000341
90405,GO:2000343,is_a,GO:0032722


In [9]:
go_df.drop_duplicates(inplace=True, ignore_index=True)
go_df

Unnamed: 0,head,rel,tail
0,GO:0000001,is_a,GO:0048308
1,GO:0000001,is_a,GO:0048311
2,GO:0000002,is_a,GO:0007005
3,GO:0000003,is_a,GO:0008150
4,GO:0000006,is_a,GO:0005385
...,...,...,...
83970,GO:2001317,is_a,GO:2001316
83971,GO:2001317,is_a,GO:0034309
83972,GO:2001317,is_a,GO:0018130
83973,GO:2001317,is_a,GO:1901362


In [10]:
go_df['rel'].value_counts()

is_a                    68650
part_of                  6809
regulates                3120
negatively_regulates     2704
positively_regulates     2692
Name: rel, dtype: int64

In [11]:
ent_df = pd.DataFrame(go_entities, columns=['term', 'level', 'class'])
ent_df

Unnamed: 0,term,level,class
0,GO:0000001,5,biological_process
1,GO:0000002,6,biological_process
2,GO:0000003,1,biological_process
3,GO:0000006,5,molecular_function
4,GO:0000007,5,molecular_function
...,...,...,...
46415,GO:1990948,5,molecular_function
46416,GO:2000341,6,biological_process
46417,GO:2000341,6,biological_process
46418,GO:2000342,7,biological_process


In [12]:
ent_df.drop_duplicates(inplace=True, ignore_index=True)
ent_df

Unnamed: 0,term,level,class
0,GO:0000001,5,biological_process
1,GO:0000002,6,biological_process
2,GO:0000003,1,biological_process
3,GO:0000006,5,molecular_function
4,GO:0000007,5,molecular_function
...,...,...,...
42945,GO:2001313,4,biological_process
42946,GO:2001314,5,biological_process
42947,GO:2001315,5,biological_process
42948,GO:2001316,4,biological_process


In [13]:
from collections import defaultdict

node_dict = defaultdict(int) # {GO:0000001 : 0}
rel_dict = defaultdict(int)
go_list = list(ent_df['term']) # {'is_a' : 0}
rel_list = list(go_df['rel'].unique())

for i in range(len(go_list)):
    node_dict[go_list[i]] = i
for i in range(len(rel_list)):
    rel_dict[rel_list[i]] = i

In [14]:
rel_dict

defaultdict(int,
            {'is_a': 0,
             'part_of': 1,
             'regulates': 2,
             'negatively_regulates': 3,
             'positively_regulates': 4})

In [15]:
# u = []
# v = []
# r = []
# for _, row in go_df.iterrows():
#     u.append(node_dict[row['head']])
#     r.append(rel_dict[row['rel']])
#     v.append(node_dict[row['tail']])
u = go_df.apply(lambda row: node_dict[row['head']], axis=1)
r = go_df.apply(lambda row: rel_dict[row['rel']], axis=1)
v = go_df.apply(lambda row: node_dict[row['tail']], axis=1)

In [16]:
from scipy.sparse import coo_matrix, dok_matrix
import numpy as np

eids = np.arange(len(u))
eids = np.random.permutation(eids)
test_idx = int(len(eids) * 0.1)
valid_idx = int(len(eids) * 0.2)
train_u, train_r, train_v = np.array([u[i] for i in eids[valid_idx:]]), np.array([r[i] for i in eids[valid_idx:]]), np.array([v[i] for i in eids[valid_idx:]])
valid_u, valid_r, valid_v = np.array([u[i] for i in eids[test_idx:valid_idx]]), np.array([r[i] for i in eids[test_idx:valid_idx]]), np.array([v[i] for i in eids[test_idx:valid_idx]])
test_pos_u, test_pos_r, test_pos_v = np.array([u[i] for i in eids[:test_idx]]), np.array([r[i] for i in eids[:test_idx]]), np.array([v[i] for i in eids[:test_idx]])

n = ent_df.shape[0]

adj = dok_matrix((n, n))
for (h, t) in zip(u,v):
    adj[t, h] += 1
    adj[h, t] += 1
adj = adj.tocsr()
k_mat = adj.tocsr()**3

# adj = coo_matrix((np.ones(len(u)), (np.array(u), np.array(v))), shape=(n, n))
# k_mat = adj.tocsr()**5

neg_u, neg_v = np.where((k_mat.todense() > 0) & (adj.todense() == 0))

neg_eids = np.random.choice(len(neg_u), test_idx)
test_neg_u, test_neg_v = neg_u[neg_eids], neg_v[neg_eids]

In [17]:
train_df = pd.DataFrame(list(zip(train_u, train_r, train_v)), columns=['head', 'rel', 'tail'])
valid_df = pd.DataFrame(list(zip(valid_u, valid_r, valid_v)), columns=['head', 'rel', 'tail'])
test_df = pd.DataFrame(list(zip(test_pos_u, test_pos_r, test_pos_v)), columns=['head', 'rel', 'tail'])
neg_df = pd.DataFrame(list(zip(test_neg_u, test_pos_r, test_neg_v)), columns=['head', 'rel', 'tail'])

In [18]:
# Check if we have the ./data directory already
data_dir = "/home/ukjung18/GIE1/GIE/GIE-master/data/GO1117/"
if(not os.path.isfile(data_dir)):
    # Emulate mkdir -p (no error if folder exists)
    try:
        os.mkdir(data_dir)
    except OSError as e:
        if(e.errno != 17):
            raise e
# else:
#     raise Exception('Data path (' + data_dir + ') exists as a file. '
#                    'Please rename, remove or change the desired location of the data path.')

In [19]:
import pickle

with open(data_dir+"train.pickle", 'wb') as f:
    pickle.dump(train_df.to_numpy().astype('uint64'), f)
with open(data_dir+"valid.pickle", 'wb') as f:
    pickle.dump(valid_df.to_numpy().astype('uint64'), f)
with open(data_dir+"test.pickle", 'wb') as f:
    pickle.dump(test_df.to_numpy().astype('uint64'), f)
with open(data_dir+"test_neg.pickle", 'wb') as f:
    pickle.dump(neg_df.to_numpy().astype('uint64'), f)

In [20]:
# Check if we have the ./data directory already
src_dir = '/home/ukjung18/GIE1/GIE/GIE-master/src_data/GO1117/'
if(not os.path.isfile(src_dir)):
    # Emulate mkdir -p (no error if folder exists)
    try:
        os.mkdir(src_dir)
    except OSError as e:
        if(e.errno != 17):
            raise e
        
# else:
#     raise Exception('Data path (' + src_dir + ') exists as a file. '
#                    'Please rename, remove or change the desired location of the data path.')
    
go_df.to_csv(src_dir+'edges.tsv', sep='\t', index=False)
ent_df.to_csv(src_dir+'entities.tsv', sep='\t', index=False)

In [21]:
hig_trainGO = '/home/ukjung18/HiG2Vec/data/GO1117.tsv'
hig_evalGO = '/home/ukjung18/HiG2Vec/evalGO/GO1117.txt'

In [22]:
hig_col = ['head', 'tail', 'rel']
train_df = train_df[hig_col]
valid_df = valid_df[hig_col]
test_df = test_df[hig_col]
neg_df = neg_df[hig_col]
train_df

Unnamed: 0,head,tail,rel
0,30845,21606,0
1,4702,30589,0
2,26930,29933,0
3,18844,24926,0
4,17010,17008,0
...,...,...,...
67175,36029,36027,0
67176,19558,19556,0
67177,5476,10960,0
67178,20332,9493,0


In [23]:
train_df = pd.concat([train_df, valid_df], ignore_index=True)
train_df

Unnamed: 0,head,tail,rel
0,30845,21606,0
1,4702,30589,0
2,26930,29933,0
3,18844,24926,0
4,17010,17008,0
...,...,...,...
75573,24219,37072,0
75574,21746,9001,0
75575,25426,25428,0
75576,37784,37532,0


In [24]:
train_df.to_csv(hig_trainGO, sep='\t', header=None, index=False)

In [25]:
test_df['rel'] = [1]*test_df.shape[0]
neg_df['rel'] = [0]*neg_df.shape[0]
test_df = pd.concat([test_df, neg_df], ignore_index=True)
test_df

Unnamed: 0,head,tail,rel
0,40189,40187,1
1,30275,30274,1
2,28077,20201,1
3,17632,25202,1
4,22481,9174,1
...,...,...,...
16789,12168,36222,0
16790,6454,20989,0
16791,15429,37275,0
16792,7154,6557,0


In [26]:
test_df.to_csv(hig_evalGO, header=None, index=False)

## Section 3 - Retrieving GO annotations

In this section we will look at how to manipulate the Gene Association File (GAF) standard, using a parser from the BioPython package.

In [27]:
# %pip install biopython
import Bio.UniProt.GOA as GOA

First we need to download a GAF file from the EBI FTP website, which hosts the current and all previous UniProt-GOA annotations. The links to these can be found on the <a href="https://www.ebi.ac.uk/GOA/downloads">EBI GOA Downloads page</a>. 

As an example, we are going to download the reduced GAF file containing gene association data for *Arabidopsis Thaliana*.

In [28]:
import os
from ftplib import FTP
human_uri = '/pub/databases/GO/goa/HUMAN/goa_human.gaf.gz'
human_fn = human_uri.split('/')[-1]
data_folder = '/home/ukjung18/GO/'
# Check if the file exists already
human_gaf = os.path.join(data_folder, human_fn)
if(not os.path.isfile(human_gaf)):
    # Login to FTP server
    ebi_ftp = FTP('ftp.ebi.ac.uk')
    ebi_ftp.login() # Logs in anonymously
    
    # Download
    with open(human_gaf,'wb') as human_fp:
        ebi_ftp.retrbinary('RETR {}'.format(human_uri), human_fp.write)
        
    # Logout from FTP server
    ebi_ftp.quit()

Now we can load all the annotations into a dictionary, using the iterator from the BioPython package (<code>Bio.UniProt.GOA.gafiterator</code>).

In [29]:
import gzip

In [30]:
gaf_list =[]
with gzip.open(human_gaf, 'rt') as human_gaf_fp:
    for entry in GOA.gafiterator(human_gaf_fp):
        gaf_list.append(entry)
len(gaf_list)

632465

In [31]:
gaf_list[3]

{'DB': 'UniProtKB',
 'DB_Object_ID': 'A0A024RBG1',
 'DB_Object_Symbol': 'NUDT4B',
 'Qualifier': ['enables'],
 'GO_ID': 'GO:0034431',
 'DB:Reference': ['PMID:21873635'],
 'Evidence': 'IBA',
 'With': ['FB:FBgn0036111',
  'PANTHER:PTN000290327',
  'PomBase:SPAC13G6.14',
  'SGD:S000005689'],
 'Aspect': 'F',
 'DB_Object_Name': 'Diphosphoinositol polyphosphate phosphohydrolase NUDT4B',
 'Synonym': ['NUDT4B'],
 'DB_Object_Type': 'protein',
 'Taxon_ID': ['taxon:9606'],
 'Date': '20210530',
 'Assigned_By': 'GO_Central',
 'Annotation_Extension': '',
 'Gene_Product_Form_ID': ''}

In [32]:
goa=[] # protein - qualifier - GO
goa_orig=[]
acts_list = ['acts_upstream_of', 'acts_upstream_of_positive_effect', 'acts_upstream_of_negative_effect', 'acts_upstream_of_or_within_negative_effect', 'acts_upstream_of_or_within_positive_effect']
for gene in gaf_list:
    if gene['Evidence'] == 'ND':
        continue
    if len(gene['Qualifier']) > 1:
        # qual=''
        # for q in gene['Qualifier']:
        #     if q in acts_list:
        #         q='acts_upstream_of_or_within'
        #     qual+='_'+q
        # goa.append([gene['DB_Object_ID'],'goa_'+qual[1:],gene['GO_ID']])
        continue
        # goa_orig.append([gene['DB_Object_ID'],'goa_'+qual[1:],gene['GO_ID']])
    else:
        q = gene['Qualifier'][0]
        if q in acts_list:
            q='acts_upstream_of_or_within'
        goa.append([gene['DB_Object_ID'],'goa_'+q,gene['GO_ID']])
        # goa_orig.append([gene['DB_Object_ID'],'goa_'+q,gene['GO_ID']])
    # goa.append([gene['DB_Object_ID'],'has_function',gene['GO_ID']])
goa_df=pd.DataFrame(goa, columns=['head', 'rel', 'tail'])
# goa_orig_df=pd.DataFrame(goa, columns=['head', 'rel', 'tail'])

In [33]:
# goa_orig_df

In [34]:
ent_ls = list(ent_df['term'])
condition = goa_df['tail'].isin(ent_ls)
goa_df_filtered = goa_df[condition]
goa_df_filtered.reset_index(drop=True, inplace=True)
# goa_df_filtered.drop_duplicates(inplace=True, ignore_index=True)
goa_df_filtered

Unnamed: 0,head,rel,tail
0,A0A024RBG1,goa_enables,GO:0000298
1,A0A024RBG1,goa_enables,GO:0003723
2,A0A024RBG1,goa_enables,GO:0008486
3,A0A024RBG1,goa_enables,GO:0034431
4,A0A024RBG1,goa_enables,GO:0034432
...,...,...,...
629443,W6CW81,goa_involved_in,GO:0006954
629444,W6CW81,goa_involved_in,GO:0031333
629445,W6CW81,goa_involved_in,GO:0035458
629446,W6CW81,goa_involved_in,GO:0098586


In [35]:
goa_df_filtered.drop_duplicates(inplace=True, ignore_index=True)
goa_df_filtered

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  goa_df_filtered.drop_duplicates(inplace=True, ignore_index=True)


Unnamed: 0,head,rel,tail
0,A0A024RBG1,goa_enables,GO:0000298
1,A0A024RBG1,goa_enables,GO:0003723
2,A0A024RBG1,goa_enables,GO:0008486
3,A0A024RBG1,goa_enables,GO:0034431
4,A0A024RBG1,goa_enables,GO:0034432
...,...,...,...
295282,W6CW81,goa_involved_in,GO:0006954
295283,W6CW81,goa_involved_in,GO:0031333
295284,W6CW81,goa_involved_in,GO:0035458
295285,W6CW81,goa_involved_in,GO:0098586


In [36]:
goa_df_filtered['rel'].value_counts()

goa_involved_in                   130547
goa_enables                        69298
goa_located_in                     64411
goa_is_active_in                   16943
goa_part_of                         9384
goa_acts_upstream_of_or_within      2737
goa_colocalizes_with                1068
goa_contributes_to                   899
Name: rel, dtype: int64

In [37]:
# condition = goa_orig_df['tail'].isin(ent_ls)
# goa_orig_df_filtered = goa_orig_df[condition]
# goa_orig_df_filtered.reset_index(drop=True, inplace=True)

In [38]:
# orig_idx = goa_orig_df_filtered.index
# goa_orig_df_filtered = goa_orig_df_filtered.drop_duplicates()
# drop_idx = orig_idx.difference(goa_orig_df_filtered.index)
# goa_df_filtered.drop(drop_idx, inplace=True)
# goa_df_filtered

In [39]:
# goa_df_filtered.reset_index(drop=True, inplace=True)
# goa_df_filtered

In [40]:
goa_set=set()
for i, goa_pair in enumerate(zip(goa_df_filtered['head'], goa_df_filtered['tail'])):
    goa_set.add(goa_pair)

In [41]:
goa_list=list(goa_set)
len(goa_list)

284686

In [42]:
import pandas as pd
str2prot = pd.read_csv("/home/ukjung18/GO/string2uniprot.tsv", sep='\t')
str2prot.head()

Unnamed: 0,From,Entry,Entry Name,Gene Names,Protein names
0,9606.ENSP00000363412,A0A024R161,A0A024R161_HUMAN,DNAJC25-GNG10 hCG_1994888,Guanine nucleotide-binding protein subunit gamma
1,9606.ENSP00000338352,A0A024RBG1,NUD4B_HUMAN,NUDT4B,Diphosphoinositol polyphosphate phosphohydrola...
2,9606.ENSP00000456868,A0A075B734,AQP7B_HUMAN,AQP7B,Aquaporin-7B
3,9606.ENSP00000463419,A0A075B759,PAL4E_HUMAN,PPIAL4E,Peptidyl-prolyl cis-trans isomerase A-like 4E ...
4,9606.ENSP00000464619,A0A075B767,PAL4H_HUMAN,PPIAL4H,Peptidyl-prolyl cis-trans isomerase A-like 4H ...


In [43]:
str2prot.shape

(18875, 5)

In [44]:
link_df = pd.read_csv('/home/ukjung18/GO/9606.protein.links.v11.5.txt', sep='\s', engine='python', encoding='cp949')
# link_df = pd.read_csv('/home/ukjung18/GO/9606.protein.physical.links.full.v12.0.txt', sep='\s', engine='python', encoding='cp949')


In [45]:
# link_df = link_df[link_df['combined_score']>=900]
# link_df.reset_index(drop=True, inplace=True)

In [46]:
link_df.shape

(11938498, 3)

In [47]:
strprt_ls = list(str2prot['From'])
len(strprt_ls)

18875

In [48]:
# string_ppi에는 있지만 string2uniprot에 없는 protein 제거
condition1 = link_df['protein1'].isin(strprt_ls)
condition2 = link_df['protein2'].isin(strprt_ls)
condition = condition1 & condition2
link_df_filtered = link_df[condition]
link_df_filtered.reset_index(drop=True, inplace=True)
link_df_filtered

Unnamed: 0,protein1,protein2,combined_score
0,9606.ENSP00000000233,9606.ENSP00000379496,155
1,9606.ENSP00000000233,9606.ENSP00000314067,197
2,9606.ENSP00000000233,9606.ENSP00000263116,222
3,9606.ENSP00000000233,9606.ENSP00000361263,181
4,9606.ENSP00000000233,9606.ENSP00000324287,767
...,...,...,...
11601833,9606.ENSP00000485678,9606.ENSP00000354800,213
11601834,9606.ENSP00000485678,9606.ENSP00000308270,151
11601835,9606.ENSP00000485678,9606.ENSP00000335660,181
11601836,9606.ENSP00000485678,9606.ENSP00000300127,154


In [49]:
uniprt_ls = list(str2prot['Entry'])
len(uniprt_ls)

18875

In [50]:
# goa에는 있지만 string2uniprot에 없는 protein 제거
goa_df = goa_df_filtered
condition = goa_df['head'].isin(uniprt_ls)
goa_df_filtered = goa_df[condition]
goa_df_filtered.reset_index(drop=True, inplace=True)
goa_df_filtered

Unnamed: 0,head,rel,tail
0,A0A024RBG1,goa_enables,GO:0000298
1,A0A024RBG1,goa_enables,GO:0003723
2,A0A024RBG1,goa_enables,GO:0008486
3,A0A024RBG1,goa_enables,GO:0034431
4,A0A024RBG1,goa_enables,GO:0034432
...,...,...,...
286623,U3KPV4,goa_involved_in,GO:0006688
286624,U3KPV4,goa_involved_in,GO:0030259
286625,U3KPV4,goa_is_active_in,GO:0005794
286626,U3KPV4,goa_is_active_in,GO:0031982


In [51]:
goa_df_filtered.to_csv(src_dir+'goa_edges.tsv', index=False, sep='\t')

In [52]:
# ppi에 있지만 goa에 없는 protein 제거
prot_set = set()
for prot in goa_df_filtered['head']:
    prot_set.add(prot)
prt_ls = list(prot_set)
len(prt_ls)

18137

In [53]:
with open(src_dir+"prt_list.pkl","wb") as f:
    pickle.dump(prt_ls, f)

In [54]:
go_tail_set=set()
for gterm in goa_df_filtered['tail']:
    go_tail_set.add(gterm)
go_tail_list = list(go_tail_set)
len(go_tail_list)

18585

In [55]:
ppi_dict = {i:j for i,j in zip(str2prot['From'], str2prot['Entry'])}
ppi_df = pd.DataFrame(columns=['source', 'target', 'score'])
ppi_df['source'] = link_df_filtered.apply(lambda row: ppi_dict[row['protein1']], axis=1)
ppi_df['target'] = link_df_filtered.apply(lambda row: ppi_dict[row['protein2']], axis=1)
ppi_df['score'] = link_df_filtered['combined_score']

In [56]:
condition1 = ppi_df['source'].isin(prt_ls)
condition2 = ppi_df['target'].isin(prt_ls)
condition = condition1 & condition2
ppi_df_filtered = ppi_df[condition]
ppi_df_filtered.reset_index(drop=True, inplace=True)
ppi_df_filtered

Unnamed: 0,source,target,score
0,P84085,Q14123,155
1,P84085,Q13177,197
2,P84085,O95755,222
3,P84085,Q13905,181
4,P84085,Q15057,767
...,...,...,...
11105127,Q8NGQ2,Q8NGL4,213
11105128,Q8NGQ2,Q8NH48,151
11105129,Q8NGQ2,Q3LHN2,181
11105130,Q8NGQ2,Q8NGJ1,154


In [57]:
# prt_set = set()
# ppi_df_filtered.apply(lambda row: prt_set.update([row['source'], row['target']]), axis=1)
# print('Proteins composing binary ppi: ', len(prt_set))

In [58]:
ppi_df_filtered_bin = ppi_df_filtered[ppi_df_filtered['score']>=900]
# ppi_df_filtered_bin = ppi_df_filtered
ppi_df_filtered_bin.reset_index(drop=True, inplace=True)
ppi_df_filtered_bin.to_csv(src_dir+'goa_bin_ppi.tsv', sep='\t', index=False)

In [59]:
ppi_df_filtered_bin

Unnamed: 0,source,target,score
0,P84085,P84077,969
1,P84085,Q5T7V8,914
2,P84085,P51157,936
3,P84085,P61204,908
4,P84085,Q92538,928
...,...,...,...
235553,Q9Y262,Q7L2H7,999
235554,Q9Y262,Q9UBQ5,999
235555,Q9Y262,Q04637,908
235556,Q9Y262,P60228,999


In [60]:
physical_link_df = pd.read_csv('/home/ukjung18/GO/9606.protein.physical.links.v11.5.txt', sep='\s', engine='python', encoding='cp949')

In [61]:
# string_ppi에는 있지만 string2uniprot에 없는 protein 제거
condition1 = physical_link_df['protein1'].isin(strprt_ls)
condition2 = physical_link_df['protein2'].isin(strprt_ls)
condition = condition1 & condition2
physical_link_df_filtered = physical_link_df[condition]
physical_link_df_filtered.reset_index(drop=True, inplace=True)
physical_link_df_filtered

Unnamed: 0,protein1,protein2,combined_score
0,9606.ENSP00000000233,9606.ENSP00000264718,156
1,9606.ENSP00000000233,9606.ENSP00000346046,177
2,9606.ENSP00000000233,9606.ENSP00000347134,162
3,9606.ENSP00000000233,9606.ENSP00000317469,379
4,9606.ENSP00000000233,9606.ENSP00000302393,287
...,...,...,...
1914803,9606.ENSP00000485663,9606.ENSP00000272317,928
1914804,9606.ENSP00000485663,9606.ENSP00000270625,925
1914805,9606.ENSP00000485663,9606.ENSP00000370258,988
1914806,9606.ENSP00000485663,9606.ENSP00000219746,152


In [62]:
ppi_dict = {i:j for i,j in zip(str2prot['From'], str2prot['Entry'])}
ppi_score_df = pd.DataFrame(columns=['source', 'target', 'score'])
ppi_score_df['source'] = physical_link_df_filtered.apply(lambda row: ppi_dict[row['protein1']], axis=1)
ppi_score_df['target'] = physical_link_df_filtered.apply(lambda row: ppi_dict[row['protein2']], axis=1)
ppi_score_df['score'] = physical_link_df_filtered['combined_score']

In [63]:
condition1 = ppi_score_df['source'].isin(prt_ls)
condition2 = ppi_score_df['target'].isin(prt_ls)
condition = condition1 & condition2
ppi_score_df_filtered = ppi_score_df[condition]
ppi_score_df_filtered.reset_index(drop=True, inplace=True)
ppi_score_df_filtered

Unnamed: 0,source,target,score
0,P84085,Q9HCN4,156
1,P84085,P08708,177
2,P84085,Q7Z628,162
3,P84085,Q8NFJ9,379
4,P84085,Q9BYZ2,287
...,...,...,...
1825055,Q9Y262,P62979,928
1825056,Q9Y262,P62280,925
1825057,Q9Y262,B5ME19,988
1825058,Q9Y262,O15405,152


In [64]:
ppi_score_df_filtered.to_csv(src_dir+'goa_ppi.tsv', sep='\t', index=False)