### Gene Ontology

In [1]:
from goatools import obo_parser
import wget
import os

In [2]:
go_obo_url = 'http://purl.obolibrary.org/obo/go/go-basic.obo'
data_folder = '/home/ukjung18/GeOKG/raw_data'

# Check if we have the ./data directory already
if(not os.path.isfile(data_folder)):
    # Emulate mkdir -p (no error if folder exists)
    try:
        os.mkdir(data_folder)
    except OSError as e:
        if(e.errno != 17):
            raise e
else:
    raise Exception('Data path (' + data_folder + ') exists as a file. '
                   'Please rename, remove or change the desired location of the data path.')

# Check if the file exists already
if(not os.path.isfile(data_folder+'/go-basic.obo')):
    go_obo = wget.download(go_obo_url, data_folder+'/go-basic.obo')
else:
    go_obo = data_folder+'/go-basic.obo'

In [3]:
go_obo = data_folder+'/go-basic.obo'

In [4]:
print(go_obo)

/home/ukjung18/GeOKG/raw_data/go-basic.obo


In [5]:
go = obo_parser.GODag(go_obo, optional_attrs=['relationship'])

/home/ukjung18/GeOKG/raw_data/go-basic.obo: fmt(1.2) rel(2023-06-11) 46,420 Terms; optional_attrs(relationship)


In [6]:
go_edges = []
go_entities = []
alts = []
obs_cnt = 0
for go_id in go:
    go_term = go[go_id]
    go_entities.append([go_term.id, go_term.level, go_term.namespace])
    
    if go_term.is_obsolete:
        obs_cnt+=1
        continue
    if go_term._parents:
        for parents in go_term.parents:
            go_edges.append([go_term.id, 'is_a', go[parents.id].id])
    if go_term.alt_ids:
        alts+=go_term.alt_ids
    if go_term.relationship:
        for rel in go_term.relationship:
            for e in go_term.relationship[rel]:
                go_edges.append([go_term.id, rel, go[e.id].id])
                # go_edges.append([go_term.id, 'is_a', go[e.id].id])
print(len(go_edges))
print(len(go_entities))
print(obs_cnt)

90407
46420
0


In [7]:
import pandas as pd
go_df = pd.DataFrame(go_edges, columns=['head', 'rel', 'tail'])
go_df

Unnamed: 0,head,rel,tail
0,GO:0000001,is_a,GO:0048308
1,GO:0000001,is_a,GO:0048311
2,GO:0000002,is_a,GO:0007005
3,GO:0000003,is_a,GO:0008150
4,GO:0000006,is_a,GO:0005385
...,...,...,...
90402,GO:2000342,is_a,GO:2000341
90403,GO:2000342,negatively_regulates,GO:0072567
90404,GO:2000343,is_a,GO:0032722
90405,GO:2000343,is_a,GO:2000341


In [8]:
go_df.drop_duplicates(inplace=True, ignore_index=True)
go_df

Unnamed: 0,head,rel,tail
0,GO:0000001,is_a,GO:0048308
1,GO:0000001,is_a,GO:0048311
2,GO:0000002,is_a,GO:0007005
3,GO:0000003,is_a,GO:0008150
4,GO:0000006,is_a,GO:0005385
...,...,...,...
83970,GO:2001317,is_a,GO:0042181
83971,GO:2001317,is_a,GO:2001316
83972,GO:2001317,is_a,GO:0018130
83973,GO:2001317,is_a,GO:1901362


In [9]:
go_df['rel'].value_counts()

is_a                    68650
part_of                  6809
regulates                3120
negatively_regulates     2704
positively_regulates     2692
Name: rel, dtype: int64

In [10]:
ent_df = pd.DataFrame(go_entities, columns=['term', 'level', 'class'])
ent_df

Unnamed: 0,term,level,class
0,GO:0000001,5,biological_process
1,GO:0000002,6,biological_process
2,GO:0000003,1,biological_process
3,GO:0000006,5,molecular_function
4,GO:0000007,5,molecular_function
...,...,...,...
46415,GO:1990948,5,molecular_function
46416,GO:2000341,6,biological_process
46417,GO:2000341,6,biological_process
46418,GO:2000342,7,biological_process


In [11]:
ent_df.drop_duplicates(inplace=True, ignore_index=True)
ent_df

Unnamed: 0,term,level,class
0,GO:0000001,5,biological_process
1,GO:0000002,6,biological_process
2,GO:0000003,1,biological_process
3,GO:0000006,5,molecular_function
4,GO:0000007,5,molecular_function
...,...,...,...
42945,GO:2001313,4,biological_process
42946,GO:2001314,5,biological_process
42947,GO:2001315,5,biological_process
42948,GO:2001316,4,biological_process


In [12]:
from collections import defaultdict

node_dict = defaultdict(int) # {GO:0000001 : 0}
rel_dict = defaultdict(int)
go_list = list(ent_df['term']) # {'is_a' : 0}
rel_list = list(go_df['rel'].unique())

for i in range(len(go_list)):
    node_dict[go_list[i]] = i
for i in range(len(rel_list)):
    rel_dict[rel_list[i]] = i

In [13]:
rel_dict

defaultdict(int,
            {'is_a': 0,
             'part_of': 1,
             'regulates': 2,
             'negatively_regulates': 3,
             'positively_regulates': 4})

In [14]:
u = go_df.apply(lambda row: node_dict[row['head']], axis=1)
r = go_df.apply(lambda row: rel_dict[row['rel']], axis=1)
v = go_df.apply(lambda row: node_dict[row['tail']], axis=1)
go_id_df = pd.DataFrame(list(zip(u, r, v)), columns=['head', 'rel', 'tail'])

In [15]:
from scipy.sparse import coo_matrix, dok_matrix
import numpy as np

train_df = pd.DataFrame()
valid_df = pd.DataFrame()
test_df = pd.DataFrame()
for r in go_id_df['rel'].unique():
    tmp = go_id_df[go_id_df['rel'] == r]
    tmp = tmp.sample(frac=1).reset_index(drop=True)
    
    n = tmp.shape[0]
    train_idx = int(n*0.8)
    valid_idx = train_idx + (n-train_idx)//2

    train_df = pd.concat([train_df, tmp.iloc[:train_idx]])
    valid_df = pd.concat([valid_df, tmp.iloc[train_idx:valid_idx]])
    test_df = pd.concat([test_df, tmp.iloc[valid_idx:]])

n = ent_df.shape[0]

adj = dok_matrix((n, n))
for (h, t) in zip(u,v):
    adj[t, h] += 1
    adj[h, t] += 1
adj = adj.tocsr()
k_mat = adj.tocsr()**3

neg_u, neg_v = np.where((k_mat.todense() > 0) & (adj.todense() == 0))

neg_eids = np.random.choice(len(neg_u), len(test_df), replace=False)
test_neg_u, test_neg_v = neg_u[neg_eids], neg_v[neg_eids]

In [16]:
neg_df = pd.DataFrame(list(zip(test_neg_u, test_df['rel'], test_neg_v)), columns=['head', 'rel', 'tail'])

In [19]:
import pickle
# Check if we have the ./data directory already
go_dir = "/home/ukjung18/GeOKG/data/GO0404/"
if(not os.path.isfile(go_dir)):
    # Emulate mkdir -p (no error if folder exists)
    try:
        os.mkdir(go_dir)
    except OSError as e:
        if(e.errno != 17):
            raise e
else:
    raise Exception('Data path (' + go_dir + ') exists as a file. '
                   'Please rename, remove or change the desired location of the data path.')
    
with open(go_dir+"train.pickle", 'wb') as f:
    pickle.dump(train_df.to_numpy().astype('uint64'), f)
with open(go_dir+"valid.pickle", 'wb') as f:
    pickle.dump(valid_df.to_numpy().astype('uint64'), f)
with open(go_dir+"test.pickle", 'wb') as f:
    pickle.dump(test_df.to_numpy().astype('uint64'), f)
with open(go_dir+"test_neg.pickle", 'wb') as f:
    pickle.dump(neg_df.to_numpy().astype('uint64'), f)

In [20]:
# Check if we have the ./data directory already
src_dir = '/home/ukjung18/GeOKG/src_data/GO0404/'
if(not os.path.isfile(src_dir)):
    # Emulate mkdir -p (no error if folder exists)
    try:
        os.mkdir(src_dir)
    except OSError as e:
        if(e.errno != 17):
            raise e
        
else:
    raise Exception('Data path (' + src_dir + ') exists as a file. '
                   'Please rename, remove or change the desired location of the data path.')
    
go_df.to_csv(src_dir+'edges.tsv', sep='\t', index=False)
ent_df.to_csv(src_dir+'entities.tsv', sep='\t', index=False)

### GO Annotations

In [21]:
# %pip install biopython
import Bio.UniProt.GOA as GOA

In [22]:
import os
from ftplib import FTP
human_uri = '/pub/databases/GO/goa/HUMAN/goa_human.gaf.gz'
human_fn = human_uri.split('/')[-1]
data_folder = '/home/ukjung18/GeOKG/raw_data'
# Check if the file exists already
human_gaf = os.path.join(data_folder, human_fn)
if(not os.path.isfile(human_gaf)):
    # Login to FTP server
    ebi_ftp = FTP('ftp.ebi.ac.uk')
    ebi_ftp.login() # Logs in anonymously
    
    # Download
    with open(human_gaf,'wb') as human_fp:
        ebi_ftp.retrbinary('RETR {}'.format(human_uri), human_fp.write)
        
    # Logout from FTP server
    ebi_ftp.quit()

In [23]:
import gzip

In [24]:
gaf_list =[]
with gzip.open(human_gaf, 'rt') as human_gaf_fp:
    for entry in GOA.gafiterator(human_gaf_fp):
        gaf_list.append(entry)
len(gaf_list)

641150

In [25]:
goa=[] # protein - qualifier - GO
goa_orig=[]
acts_list = ['acts_upstream_of', 'acts_upstream_of_positive_effect', 'acts_upstream_of_negative_effect', 'acts_upstream_of_or_within_negative_effect', 'acts_upstream_of_or_within_positive_effect']
for gene in gaf_list:
    if gene['Evidence'] == 'ND':
        continue
    if len(gene['Qualifier']) > 1:
        continue
    else:
        q = gene['Qualifier'][0]
        if q in acts_list:
            q='acts_upstream_of_or_within'
        goa.append([gene['DB_Object_ID'],'goa_'+q,gene['GO_ID']])
goa_df=pd.DataFrame(goa, columns=['head', 'rel', 'tail'])

In [26]:
ent_ls = list(ent_df['term'])
condition = goa_df['tail'].isin(ent_ls)
goa_df_filtered = goa_df[condition]
goa_df_filtered.reset_index(drop=True, inplace=True)
goa_df_filtered

Unnamed: 0,head,rel,tail
0,A0A024RBG1,goa_enables,GO:0000298
1,A0A024RBG1,goa_enables,GO:0003723
2,A0A024RBG1,goa_enables,GO:0008486
3,A0A024RBG1,goa_enables,GO:0034431
4,A0A024RBG1,goa_enables,GO:0034432
...,...,...,...
638029,X6R8R1,goa_involved_in,GO:0048488
638030,X6R8R1,goa_involved_in,GO:0048791
638031,X6R8R1,goa_involved_in,GO:0071277
638032,X6R8R1,goa_located_in,GO:0005886


In [27]:
goa_df_filtered.drop_duplicates(inplace=True, ignore_index=True)
goa_df_filtered

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  goa_df_filtered.drop_duplicates(inplace=True, ignore_index=True)


Unnamed: 0,head,rel,tail
0,A0A024RBG1,goa_enables,GO:0000298
1,A0A024RBG1,goa_enables,GO:0003723
2,A0A024RBG1,goa_enables,GO:0008486
3,A0A024RBG1,goa_enables,GO:0034431
4,A0A024RBG1,goa_enables,GO:0034432
...,...,...,...
299929,X6R8R1,goa_involved_in,GO:0048488
299930,X6R8R1,goa_involved_in,GO:0048791
299931,X6R8R1,goa_involved_in,GO:0071277
299932,X6R8R1,goa_located_in,GO:0005886


In [28]:
goa_df_filtered['rel'].value_counts()

goa_involved_in                   132414
goa_enables                        70344
goa_located_in                     65362
goa_is_active_in                   17481
goa_part_of                         9613
goa_acts_upstream_of_or_within      2756
goa_colocalizes_with                1058
goa_contributes_to                   906
Name: rel, dtype: int64

In [29]:
goa_set=set()
for i, goa_pair in enumerate(zip(goa_df_filtered['head'], goa_df_filtered['tail'])):
    goa_set.add(goa_pair)

In [30]:
goa_list=list(goa_set)
len(goa_list)

288936

In [31]:
import pandas as pd
str2prot = pd.read_csv("/home/ukjung18/GeOKG/raw_data/string2uniprot.tsv", sep='\t')
str2prot.head()

Unnamed: 0,From,Entry,Entry Name,Gene Names,Protein names
0,9606.ENSP00000363412,A0A024R161,A0A024R161_HUMAN,DNAJC25-GNG10 hCG_1994888,Guanine nucleotide-binding protein subunit gamma
1,9606.ENSP00000338352,A0A024RBG1,NUD4B_HUMAN,NUDT4B,Diphosphoinositol polyphosphate phosphohydrola...
2,9606.ENSP00000456868,A0A075B734,AQP7B_HUMAN,AQP7B,Aquaporin-7B
3,9606.ENSP00000463419,A0A075B759,PAL4E_HUMAN,PPIAL4E,Peptidyl-prolyl cis-trans isomerase A-like 4E ...
4,9606.ENSP00000464619,A0A075B767,PAL4H_HUMAN,PPIAL4H,Peptidyl-prolyl cis-trans isomerase A-like 4H ...


In [32]:
str2prot.shape

(18875, 5)

### STRING PPI

In [33]:
# download full_network, physical_subnetwork and action_subnetwork
!wget -P /home/ukjung18/GeOKG/raw_data/ https://stringdb-downloads.org/download/protein.links.v11.5/9606.protein.links.v11.5.txt.gz
!wget -P /home/ukjung18/GeOKG/raw_data/ https://stringdb-downloads.org/download/protein.physical.links.v11.5/9606.protein.physical.links.v11.5.txt.gz
!wget -P /home/ukjung18/GeOKG/raw_data/ https://stringdb-static.org/download/protein.actions.v11.0/9606.protein.actions.v11.0.txt.gz

--2024-04-05 15:11:54--  https://stringdb-downloads.org/download/protein.links.v11.5/9606.protein.links.v11.5.txt.gz
Resolving stringdb-downloads.org (stringdb-downloads.org)... 49.12.123.75
Connecting to stringdb-downloads.org (stringdb-downloads.org)|49.12.123.75|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 72718210 (69M) [application/octet-stream]
Saving to: ‘/home/ukjung18/GeOKG/raw_data/9606.protein.links.v11.5.txt.gz’


2024-04-05 15:12:04 (8.07 MB/s) - ‘/home/ukjung18/GeOKG/raw_data/9606.protein.links.v11.5.txt.gz’ saved [72718210/72718210]

--2024-04-05 15:12:05--  https://stringdb-downloads.org/download/protein.physical.links.v11.5/9606.protein.physical.links.v11.5.txt.gz
Resolving stringdb-downloads.org (stringdb-downloads.org)... 49.12.123.75
Connecting to stringdb-downloads.org (stringdb-downloads.org)|49.12.123.75|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11980729 (11M) [application/octet-stream]
Saving to: ‘/hom

In [34]:
!gzip -d /home/ukjung18/GeOKG/raw_data/9606.protein.links.v11.5.txt.gz
!gzip -d /home/ukjung18/GeOKG/raw_data/9606.protein.physical.links.v11.5.txt.gz
!gzip -d /home/ukjung18/GeOKG/raw_data/9606.protein.actions.v11.0.txt.gz

##### binary classification

In [35]:
link_df = pd.read_csv('/home/ukjung18/GeOKG/raw_data/9606.protein.links.v11.5.txt', sep='\s', engine='python', encoding='cp949')
link_df.shape

(11938498, 3)

In [36]:
strprt_ls = list(str2prot['From'])
len(strprt_ls)

18875

In [37]:
# string_ppi에는 있지만 string2uniprot에 없는 protein 제거
condition1 = link_df['protein1'].isin(strprt_ls)
condition2 = link_df['protein2'].isin(strprt_ls)
condition = condition1 & condition2
link_df_filtered = link_df[condition]
link_df_filtered.reset_index(drop=True, inplace=True)
link_df_filtered

Unnamed: 0,protein1,protein2,combined_score
0,9606.ENSP00000000233,9606.ENSP00000379496,155
1,9606.ENSP00000000233,9606.ENSP00000314067,197
2,9606.ENSP00000000233,9606.ENSP00000263116,222
3,9606.ENSP00000000233,9606.ENSP00000361263,181
4,9606.ENSP00000000233,9606.ENSP00000324287,767
...,...,...,...
11601833,9606.ENSP00000485678,9606.ENSP00000354800,213
11601834,9606.ENSP00000485678,9606.ENSP00000308270,151
11601835,9606.ENSP00000485678,9606.ENSP00000335660,181
11601836,9606.ENSP00000485678,9606.ENSP00000300127,154


In [38]:
uniprt_ls = list(str2prot['Entry'])
len(uniprt_ls)

18875

In [39]:
# goa에는 있지만 string2uniprot에 없는 protein 제거
goa_df = goa_df_filtered
condition = goa_df['head'].isin(uniprt_ls)
goa_df_filtered = goa_df[condition]
goa_df_filtered.reset_index(drop=True, inplace=True)
goa_df_filtered

Unnamed: 0,head,rel,tail
0,A0A024RBG1,goa_enables,GO:0000298
1,A0A024RBG1,goa_enables,GO:0003723
2,A0A024RBG1,goa_enables,GO:0008486
3,A0A024RBG1,goa_enables,GO:0034431
4,A0A024RBG1,goa_enables,GO:0034432
...,...,...,...
290836,U3KPV4,goa_involved_in,GO:0006688
290837,U3KPV4,goa_involved_in,GO:0030259
290838,U3KPV4,goa_is_active_in,GO:0005794
290839,U3KPV4,goa_is_active_in,GO:0031982


In [40]:
goa_df_filtered.to_csv(src_dir+'goa_edges.tsv', index=False, sep='\t')

In [41]:
# ppi에 있지만 goa에 없는 protein 제거
prot_set = set()
for prot in goa_df_filtered['head']:
    prot_set.add(prot)
prt_ls = list(prot_set)
len(prt_ls)

18159

In [42]:
ppi_dict = {i:j for i,j in zip(str2prot['From'], str2prot['Entry'])}

In [43]:
import pickle
with open(src_dir+"prt_list.pkl","wb") as f:
    pickle.dump(prt_ls, f)

In [44]:
go_tail_set=set()
for gterm in goa_df_filtered['tail']:
    go_tail_set.add(gterm)
go_tail_list = list(go_tail_set)
len(go_tail_list)

18613

In [45]:
ppi_df = pd.DataFrame(columns=['source', 'target', 'score'])
ppi_df['source'] = link_df_filtered.apply(lambda row: ppi_dict[row['protein1']], axis=1)
ppi_df['target'] = link_df_filtered.apply(lambda row: ppi_dict[row['protein2']], axis=1)
ppi_df['score'] = link_df_filtered['combined_score']

In [46]:
condition1 = ppi_df['source'].isin(prt_ls)
condition2 = ppi_df['target'].isin(prt_ls)
condition = condition1 & condition2
ppi_df_filtered = ppi_df[condition]
ppi_df_filtered.reset_index(drop=True, inplace=True)
ppi_df_filtered

Unnamed: 0,source,target,score
0,P84085,Q14123,155
1,P84085,Q13177,197
2,P84085,O95755,222
3,P84085,Q13905,181
4,P84085,Q15057,767
...,...,...,...
11115783,Q8NGQ2,Q8NGL4,213
11115784,Q8NGQ2,Q8NH48,151
11115785,Q8NGQ2,Q3LHN2,181
11115786,Q8NGQ2,Q8NGJ1,154


In [47]:
# generate positive links for binary classification
ppi_df_filtered_bin = ppi_df_filtered[ppi_df_filtered['score']>=900]
ppi_df_filtered_bin.reset_index(drop=True, inplace=True)
ppi_df_filtered_bin.to_csv(src_dir+'goa_bin_ppi.tsv', sep='\t', index=False)

##### binding affinity prediction

In [48]:
physical_link_df = pd.read_csv('/home/ukjung18/GeOKG/raw_data/9606.protein.physical.links.v11.5.txt', sep='\s', engine='python', encoding='cp949')

In [49]:
# string_ppi에는 있지만 string2uniprot에 없는 protein 제거
condition1 = physical_link_df['protein1'].isin(strprt_ls)
condition2 = physical_link_df['protein2'].isin(strprt_ls)
condition = condition1 & condition2
physical_link_df_filtered = physical_link_df[condition]
physical_link_df_filtered.reset_index(drop=True, inplace=True)
physical_link_df_filtered

Unnamed: 0,protein1,protein2,combined_score
0,9606.ENSP00000000233,9606.ENSP00000264718,156
1,9606.ENSP00000000233,9606.ENSP00000346046,177
2,9606.ENSP00000000233,9606.ENSP00000347134,162
3,9606.ENSP00000000233,9606.ENSP00000317469,379
4,9606.ENSP00000000233,9606.ENSP00000302393,287
...,...,...,...
1914803,9606.ENSP00000485663,9606.ENSP00000272317,928
1914804,9606.ENSP00000485663,9606.ENSP00000270625,925
1914805,9606.ENSP00000485663,9606.ENSP00000370258,988
1914806,9606.ENSP00000485663,9606.ENSP00000219746,152


In [50]:
ppi_score_df = pd.DataFrame(columns=['source', 'target', 'score'])
ppi_score_df['source'] = physical_link_df_filtered.apply(lambda row: ppi_dict[row['protein1']], axis=1)
ppi_score_df['target'] = physical_link_df_filtered.apply(lambda row: ppi_dict[row['protein2']], axis=1)
ppi_score_df['score'] = physical_link_df_filtered['combined_score']

In [51]:
condition1 = ppi_score_df['source'].isin(prt_ls)
condition2 = ppi_score_df['target'].isin(prt_ls)
condition = condition1 & condition2
ppi_score_df_filtered = ppi_score_df[condition]
ppi_score_df_filtered.reset_index(drop=True, inplace=True)
ppi_score_df_filtered

Unnamed: 0,source,target,score
0,P84085,Q9HCN4,156
1,P84085,P08708,177
2,P84085,Q7Z628,162
3,P84085,Q8NFJ9,379
4,P84085,Q9BYZ2,287
...,...,...,...
1827207,Q9Y262,P62979,928
1827208,Q9Y262,P62280,925
1827209,Q9Y262,B5ME19,988
1827210,Q9Y262,O15405,152


In [52]:
ppi_score_df_filtered.drop_duplicates(inplace=True, ignore_index=True)
ppi_score_df_filtered.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ppi_score_df_filtered.drop_duplicates(inplace=True, ignore_index=True)


(1827212, 3)

In [53]:
ppi_score_df_filtered.to_csv(src_dir+'goa_score_ppi.tsv', sep='\t', index=False)

##### ppi type prediction

In [54]:
import pandas as pd
action_link_df = pd.read_csv('/home/ukjung18/GeOKG/raw_data/9606.protein.actions.v11.0.txt', sep='\s', engine='python', encoding='cp949')

In [55]:
action_link_df

Unnamed: 0,item_id_a,item_id_b,mode,action,is_directional,a_is_acting,score
0,9606.ENSP00000000233,9606.ENSP00000216366,binding,,f,f,165
1,9606.ENSP00000000233,9606.ENSP00000216366,reaction,,f,f,165
2,9606.ENSP00000000233,9606.ENSP00000216366,reaction,,t,f,165
3,9606.ENSP00000000233,9606.ENSP00000216366,reaction,,t,t,165
4,9606.ENSP00000000233,9606.ENSP00000222547,binding,,f,f,913
...,...,...,...,...,...,...,...
3470901,9606.ENSP00000485678,9606.ENSP00000409581,inhibition,inhibition,f,f,600
3470902,9606.ENSP00000485678,9606.ENSP00000409581,ptmod,,f,f,600
3470903,9606.ENSP00000485678,9606.ENSP00000438346,activation,activation,t,f,900
3470904,9606.ENSP00000485678,9606.ENSP00000481878,activation,activation,f,f,600


In [56]:
one_hot_encoding = pd.get_dummies(action_link_df['mode'])
type_df = pd.concat([action_link_df, one_hot_encoding], axis=1)

In [57]:
type_df = type_df.drop(['mode', 'action', 'is_directional', 'a_is_acting', 'score'], axis=1)
type_df

Unnamed: 0,item_id_a,item_id_b,activation,binding,catalysis,expression,inhibition,ptmod,reaction
0,9606.ENSP00000000233,9606.ENSP00000216366,0,1,0,0,0,0,0
1,9606.ENSP00000000233,9606.ENSP00000216366,0,0,0,0,0,0,1
2,9606.ENSP00000000233,9606.ENSP00000216366,0,0,0,0,0,0,1
3,9606.ENSP00000000233,9606.ENSP00000216366,0,0,0,0,0,0,1
4,9606.ENSP00000000233,9606.ENSP00000222547,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
3470901,9606.ENSP00000485678,9606.ENSP00000409581,0,0,0,0,1,0,0
3470902,9606.ENSP00000485678,9606.ENSP00000409581,0,0,0,0,0,1,0
3470903,9606.ENSP00000485678,9606.ENSP00000438346,1,0,0,0,0,0,0
3470904,9606.ENSP00000485678,9606.ENSP00000481878,1,0,0,0,0,0,0


In [58]:
type_df.drop_duplicates(inplace=True, ignore_index=True)
ppi_type_df = type_df.groupby(['item_id_a', 'item_id_b']).sum().reset_index()

In [59]:
ppi_type_df

Unnamed: 0,item_id_a,item_id_b,activation,binding,catalysis,expression,inhibition,ptmod,reaction
0,9606.ENSP00000000233,9606.ENSP00000216366,0,1,0,0,0,0,1
1,9606.ENSP00000000233,9606.ENSP00000222547,0,1,1,0,0,0,1
2,9606.ENSP00000000233,9606.ENSP00000223369,0,1,1,0,0,0,1
3,9606.ENSP00000000233,9606.ENSP00000236192,0,1,0,0,0,0,1
4,9606.ENSP00000000233,9606.ENSP00000248901,1,1,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...
1089071,9606.ENSP00000485678,9606.ENSP00000371594,1,0,0,0,0,0,0
1089072,9606.ENSP00000485678,9606.ENSP00000403701,0,0,0,0,1,1,0
1089073,9606.ENSP00000485678,9606.ENSP00000409581,0,0,0,0,1,1,0
1089074,9606.ENSP00000485678,9606.ENSP00000438346,1,0,0,0,0,0,0


In [60]:
prt_cond = str2prot['Entry'].isin(prot_set)
str2prot_filtered = str2prot[prt_cond]
ppi_dict = {i:j for i,j in zip(str2prot_filtered['From'], str2prot_filtered['Entry'])}

In [61]:
strprt_ls_type = list(str2prot_filtered['From'])
len(strprt_ls_type)

18159

In [62]:
condition1 = ppi_type_df['item_id_a'].isin(strprt_ls_type)
condition2 = ppi_type_df['item_id_b'].isin(strprt_ls_type)
condition = condition1 & condition2
ppi_type_df_filtered = ppi_type_df[condition]
ppi_type_df_filtered.reset_index(drop=True, inplace=True)

In [63]:
ppi_type_df_filtered

Unnamed: 0,item_id_a,item_id_b,activation,binding,catalysis,expression,inhibition,ptmod,reaction
0,9606.ENSP00000000233,9606.ENSP00000216366,0,1,0,0,0,0,1
1,9606.ENSP00000000233,9606.ENSP00000222547,0,1,1,0,0,0,1
2,9606.ENSP00000000233,9606.ENSP00000223369,0,1,1,0,0,0,1
3,9606.ENSP00000000233,9606.ENSP00000236192,0,1,0,0,0,0,1
4,9606.ENSP00000000233,9606.ENSP00000248901,1,1,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...
1017387,9606.ENSP00000485678,9606.ENSP00000371594,1,0,0,0,0,0,0
1017388,9606.ENSP00000485678,9606.ENSP00000403701,0,0,0,0,1,1,0
1017389,9606.ENSP00000485678,9606.ENSP00000409581,0,0,0,0,1,1,0
1017390,9606.ENSP00000485678,9606.ENSP00000438346,1,0,0,0,0,0,0


In [64]:
ppi_type_df_filtered['item_id_a'] = ppi_type_df_filtered.apply(lambda row: ppi_dict[row['item_id_a']], axis=1)
ppi_type_df_filtered['item_id_b'] = ppi_type_df_filtered.apply(lambda row: ppi_dict[row['item_id_b']], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ppi_type_df_filtered['item_id_a'] = ppi_type_df_filtered.apply(lambda row: ppi_dict[row['item_id_a']], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ppi_type_df_filtered['item_id_b'] = ppi_type_df_filtered.apply(lambda row: ppi_dict[row['item_id_b']], axis=1)


In [65]:
ppi_type_df_filtered

Unnamed: 0,item_id_a,item_id_b,activation,binding,catalysis,expression,inhibition,ptmod,reaction
0,P84085,Q9Y587,0,1,0,0,0,0,1
1,P84085,O15155,0,1,1,0,0,0,1
2,P84085,O15498,0,1,1,0,0,0,1
3,P84085,O75379,0,1,0,0,0,0,1
4,P84085,Q9UIA0,1,1,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...
1017387,Q8NGQ2,O60262,1,0,0,0,0,0,0
1017388,Q8NGQ2,P32121,0,0,0,0,1,1,0
1017389,Q8NGQ2,P49407,0,0,0,0,1,1,0
1017390,Q8NGQ2,Q9H902,1,0,0,0,0,0,0


In [66]:
ppi_type_df_filtered.to_csv(src_dir+'goa_type_ppi.tsv', sep='\t', index=False)