# Preprocess BioGRID PPI Network
This notebook prepares the [BioGRID](https://downloads.thebiogrid.org/) PPI network for use with graph convolutional networks.
I use version `4.4.235`, downloaded from [here](https://downloads.thebiogrid.org/File/BioGRID/Release-Archive/BIOGRID-4.4.235/BIOGRID-ALL-4.4.235.tab3.zip), and preprocess it in the following way:

* filter out duplicate and self-loop edges

In [2]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
# Load the full BioGRID tab3 export (tab-separated, first row is the header).
#
# keep_default_na=False: the file uses '-' as its placeholder for missing
# fields, so pandas' built-in NA tokens ('NA', 'nan', 'null', 'N/A', ...) are
# not needed here. Leaving them enabled would silently turn any gene symbol
# spelled like one of those tokens into a float NaN, which would later become
# a bogus NaN node in the graph.
# NOTE(review): with NA parsing off, a column containing truly empty fields is
# read as strings rather than floats-with-NaN; only the symbol columns are
# used in this notebook, so that is assumed to be acceptable.
#
# low_memory=False: infer each column's dtype from the whole file instead of
# chunk by chunk, which avoids mixed-type columns (and the DtypeWarning that
# pandas emits for them).
biogrid_ppis = pd.read_csv(
    'BIOGRID-ALL-4.4.235.tab3.txt',
    header=0,
    sep='\t',
    keep_default_na=False,
    low_memory=False,
)
biogrid_ppis

  biogrid_ppis = pd.read_csv('BIOGRID-ALL-4.4.235.tab3.txt',header=0,sep='\t')


Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,TREMBL Accessions Interactor B,REFSEQ Accessions Interactor B,Ontology Term IDs,Ontology Term Names,Ontology Term Categories,Ontology Term Qualifier IDs,Ontology Term Qualifier Names,Ontology Term Types,Organism Name Interactor A,Organism Name Interactor B
0,103,6416,2318,112315,108607,-,-,MAP2K4,FLNC,JNKK|JNKK1|MAPKK4|MEK4|MKK4|PRKMK4|SAPKK-1|SAP...,...,Q59H94,NP_001120959|NP_001449,-,-,-,-,-,-,Homo sapiens,Homo sapiens
1,117,84665,88,124185,106603,-,-,MYPN,ACTN2,CMD1DD|CMH22|MYOP|RCM4,...,Q59FD9|F6THM6,NP_001094|NP_001265272|NP_001265273,-,-,-,-,-,-,Homo sapiens,Homo sapiens
2,183,90,2339,106605,108625,-,-,ACVR1,FNTA,ACTRI|ACVR1A|ACVRLK2|ALK2|FOP|SKR1|TSRI,...,-,NP_002018,-,-,-,-,-,-,Homo sapiens,Homo sapiens
3,278,2624,5371,108894,111384,-,-,GATA2,PML,DCML|IMD21|MONOMAC|NFE1B,...,-,NP_150250|NP_150253|NP_150252|NP_150247|NP_150...,-,-,-,-,-,-,Homo sapiens,Homo sapiens
4,418,6118,6774,112038,112651,RP4-547C9.3,-,RPA2,STAT3,REPA2|RP-A p32|RP-A p34|RPA32,...,-,NP_644805|NP_003141|NP_001356447|NP_001356443|...,-,-,-,-,-,-,Homo sapiens,Homo sapiens
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2699944,3716537,11200,7515,116369,113349,RP11-436C9.1,-,CHEK2,XRCC1,CDS1|CHK2|HuCds1|LFS2|PP1425|RAD53|hCds1,...,B2RCY5|Q59HH7,NP_006288,-,-,-,-,-,-,Homo sapiens,Homo sapiens
2699945,3716538,11200,7515,116369,113349,RP11-436C9.1,-,CHEK2,XRCC1,CDS1|CHK2|HuCds1|LFS2|PP1425|RAD53|hCds1,...,B2RCY5|Q59HH7,NP_006288,-,-,-,-,-,-,Homo sapiens,Homo sapiens
2699946,3716539,3159,472,109402,106962,RP11-513I15.2,-,HMGA1,ATM,HMG-R|HMGA1A|HMGIY,...,A0A024R3C7,NP_001338763|NP_000042,-,-,-,-,-,-,Homo sapiens,Homo sapiens
2699947,3716540,472,3159,106962,109402,-,RP11-513I15.2,ATM,HMGA1,AT1|ATA|ATC|ATD|ATDC|ATE|TEL1|TELO1,...,Q5T6U8,NP_001306011|NP_001306009|NP_001306008|NP_0013...,-,-,-,-,-,-,Homo sapiens,Homo sapiens


In [4]:
# Build the edge list: one row per reported interaction, identified by the
# official gene symbols of the two interactors, plus a constant confidence
# of 1 for every edge.
#
# The two symbol columns are selected explicitly by name (rather than with a
# label slice that only works while they are adjacent in the file), and the
# selection is copied so that adding/renaming columns never writes into a
# view of `biogrid_ppis` (no SettingWithCopyWarning, raw frame stays intact).
edgelist = (
    biogrid_ppis[['Official Symbol Interactor A', 'Official Symbol Interactor B']]
    .copy()
    .rename(columns={
        'Official Symbol Interactor A': 'partner1',
        'Official Symbol Interactor B': 'partner2',
    })
    .reset_index(drop=True)   # plain 0..n-1 row index
    .assign(confidence=1)     # every interaction gets the same confidence
)

In [5]:
edgelist

Unnamed: 0,partner1,partner2,confidence
0,MAP2K4,FLNC,1
1,MYPN,ACTN2,1
2,ACVR1,FNTA,1
3,GATA2,PML,1
4,RPA2,STAT3,1
...,...,...,...
2699944,CHEK2,XRCC1,1
2699945,CHEK2,XRCC1,1
2699946,HMGA1,ATM,1
2699947,ATM,HMGA1,1


## Filter out duplicate and self-loop edges

In [6]:
# Count and remove rows that repeat an earlier (partner1, partner2) pair.
# The comparison is order-sensitive: (A, B) and (B, A) are treated as
# different rows here. `edgelist` is modified in place, keeping the first
# occurrence of each pair.
edge_key = ['partner1', 'partner2']
is_repeat = edgelist.duplicated(subset=edge_key)
num_duplicated_edges = is_repeat.sum()
edgelist.drop_duplicates(subset=edge_key, inplace=True)

remaining_edges = edgelist.shape[0]
print("Duplicated Edges: {} -> New #Edges: {}".format(num_duplicated_edges, remaining_edges))

Duplicated Edges: 466974 -> New #Edges: 2232975


In [7]:
# Turn the edge list into a networkx graph, carrying `confidence` along as an
# edge attribute.
G = nx.from_pandas_edgelist(
    edgelist,
    source='partner1',
    target='partner2',
    edge_attr='confidence',
)

# Remove self-interactions first, then every node that is left without any
# neighbour. Both collections are materialised before the graph is modified.
self_loops = list(nx.selfloop_edges(G))
G.remove_edges_from(self_loops)
isolated_nodes = list(nx.isolates(G))
G.remove_nodes_from(isolated_nodes)

# Adjacency matrix as a labelled DataFrame, plus its raw values and the node
# labels in matrix order.
# NOTE(review): this is a full nodes x nodes table, so it can be very large
# for a graph of this size.
adj_pd = nx.to_pandas_adjacency(G)
node_names = adj_pd.index.values
adjacency_matrix = adj_pd.values

# Summary statistics of the cleaned graph.
node_degrees = [degree for _node, degree in G.degree()]
avg_node_degree = np.mean(node_degrees)
print("Constructed Adjacency Matrix with average node degree of: {}".format(avg_node_degree))
print("Adjacency matrix has {} edges and {} nodes in total".format(G.number_of_edges(),
                                                                    G.number_of_nodes()))

Constructed Adjacency Matrix with average node degree of: 51.96418096723869
Adjacency matrix has 2081815 edges and 80125 nodes in total


In [8]:
# Export the cleaned graph back to a flat edge list: one row per edge of G,
# with the networkx default endpoint columns renamed to partner1/partner2.
edgelist_no_repeat = nx.to_pandas_edgelist(G).rename(
    columns={'source': 'partner1', 'target': 'partner2'}
)

# Write the result as a tab-separated file without the row index.
edgelist_no_repeat.to_csv(
    'BioGRID_symbols_no_repeat.tsv',
    sep='\t',
    index=False,
)
edgelist_no_repeat

Unnamed: 0,partner1,partner2,confidence
0,MAP2K4,FLNC,1
1,MAP2K4,Flna,1
2,MAP2K4,SPAG9,1
3,MAP2K4,MAPK8,1
4,MAP2K4,MAP4K2,1
...,...,...,...
2081810,tkr-1,T07C12.15,1
2081811,T07C12.15,tkr-2,1
2081812,npr-24,F33D11.8,1
2081813,npr-32,M02E1.2,1
