https://wiki.thebiogrid.org/doku.php/biogrid_tab_version_2.0
http://pages.cs.wisc.edu/~legault/writeup-776.pdf

In [182]:
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import scipy.stats as stats
import statsmodels.stats.multitest as sm
from collections import defaultdict 

In [183]:
# Alternative data set (human):
# data = pd.read_csv("BIOGRID-ORGANISM-Homo_sapiens-3.5.182.tab2.txt", sep="\t", low_memory=False)
# Same organism as in paper
# read_csv with an explicit tab separator is equivalent to read_table, but
# avoids the deprecation warning that pandas 0.24.x emits for read_table.
data = pd.read_csv(
    "BIOGRID-ORGANISM-Saccharomyces_cerevisiae_S288c-3.5.182.tab2.txt",
    sep="\t",
    low_memory=False,
)

  This is separate from the ipykernel package so we can avoid doing imports until


In [184]:
# Preview the first five rows of the raw interaction table
data.head(n=5)

Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,Pubmed ID,Organism Interactor A,Organism Interactor B,Throughput,Score,Modification,Phenotypes,Qualifications,Tags,Source Database
0,68770,851136,854020,31676,34272,YLR418C,YOL145C,CDC73,CTR9,L000002792,...,14759368,559292,559292,High Throughput,-,-,-,-,-,BIOGRID
1,68771,854020,851136,34272,31676,YOL145C,YLR418C,CTR9,CDC73,CDP1|L000003477,...,14759368,559292,559292,High Throughput,-,-,-,-,-,BIOGRID
2,68774,851136,854290,31676,34518,YLR418C,YOR123C,CDC73,LEO1,L000002792,...,14759368,559292,559292,High Throughput,-,-,-,-,-,BIOGRID
3,68775,854290,851136,34518,31676,YOR123C,YLR418C,LEO1,CDC73,L000000936,...,14759368,559292,559292,High Throughput,-,-,-,-,-,BIOGRID
4,68778,851136,852582,31676,32973,YLR418C,YBR279W,CDC73,PAF1,L000002792,...,14759368,559292,559292,High Throughput,-,-,-,-,-,BIOGRID


In [185]:
# Extract interactions
# Index the rows by experimental system type, keep only the interactor symbols
# and the publication id, then keep the rows whose index label contains
# "genetic". Built as one non-mutating chain, so `data` itself is left untouched.
val = (
    data
    .set_index("Experimental System Type")
    .filter(items=["Official Symbol Interactor A", "Official Symbol Interactor B", "Pubmed ID"])
    # .filter(like="physical", axis=0)   # alternative: physical interactions
    .filter(like="genetic", axis=0)
)
val.head()

Unnamed: 0_level_0,Official Symbol Interactor A,Official Symbol Interactor B,Pubmed ID
Experimental System Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
genetic,CDC24,RSR1,12960420
genetic,CDC42,RSR1,12960420
genetic,SET2,CHD1,12773564
genetic,SET2,SHG1,12773564
genetic,SET2,SDC1,12773564


In [186]:
# Create graph of all interactions
# Each row of `val` becomes a directed edge: interactor A -> interactor B
graph = nx.from_pandas_edgelist(
    val,
    source="Official Symbol Interactor A",
    target="Official Symbol Interactor B",
    create_using=nx.DiGraph,
)

In [187]:
# nx.draw_networkx(graph)

In [188]:
# Discover all subgraphs of graph
# nx.weakly_connected_component_subgraphs was removed in NetworkX 2.4; build
# the same independent copies from the weakly connected node sets instead.
subgraphs = [
    graph.subgraph(component).copy()
    for component in nx.weakly_connected_components(graph)
]

In [189]:
# Maps a node count to the list of subgraphs having that many nodes;
# missing keys start out as an empty list.
subgraphs_n = defaultdict(list)

In [190]:
# Categorize subgraphs depending on number of nodes
# Rebuild the mapping from scratch so that re-running this cell does not
# append every subgraph a second time.
subgraphs_n = defaultdict(list)
for subgraph in subgraphs:
    subgraphs_n[subgraph.number_of_nodes()].append(subgraph)

In [191]:
# Show how many subgraphs were found for each node count
subgraphs_n

defaultdict(list,
            {5947: [<networkx.classes.digraph.DiGraph at 0x1a385dc240>],
             3: [<networkx.classes.digraph.DiGraph at 0x1a707a28d0>]})

In [192]:
# Implementation of distance function
# Example 3-node connectivity matrices (one list of rows per matrix), used
# below to exercise attributes().
a, b, c, d, e = (
    pd.DataFrame(np.array(rows))
    for rows in (
        [[0, 1, 1], [0, 0, 0], [0, 0, 0]],
        [[0, 0, 0], [1, 0, 1], [0, 0, 0]],
        [[0, 0, 0], [0, 0, 0], [1, 1, 0]],
        [[0, 0, 0], [1, 0, 0], [1, 0, 0]],
        [[0, 0, 0], [1, 0, 0], [0, 1, 0]],
    )
)

def attributes(x):
    """
    Returns [number of source nodes, number of outgoing edges per node...]
    which is equivalent to d(x, 0) for connectivity matrix x

    x is a pandas dataframe (any 2-D array-like is accepted as well) and is
    expected to be square: a non-zero entry in row i, column j is read as an
    edge from node i to node j.

    The first element of the result counts the nodes without incoming edges
    (all-zero columns). The remaining elements are the row sums, truncated to
    int and sorted ascending, so the result does not depend on node order.
    An empty matrix yields [0].
    """
    # Work on the raw values: element-wise .iloc lookups cost n*n Python-level
    # calls, which is prohibitive for graphs with thousands of nodes.
    matrix = np.asarray(x)
    if matrix.size == 0:
        return [0]

    # A node is a source when nothing points at it, i.e. its column is all zero
    has_incoming = (matrix != 0).any(axis=0)
    n_sources = int(np.count_nonzero(~has_incoming))

    # Sorted list of number of outgoing edges per node (row sums)
    out_degrees = sorted(int(total) for total in matrix.sum(axis=1).tolist())

    return [n_sources] + out_degrees

# Testing connectivity matrices in paper
# One attribute vector is printed per example matrix, in the order a..e
for example in (a, b, c, d, e):
    print(attributes(example))

[1, 0, 0, 2]
[1, 0, 0, 2]
[1, 0, 0, 2]
[2, 0, 1, 1]
[1, 0, 1, 1]


In [193]:
# Print the attribute vector of every subgraph, grouped by node count
for n, same_size_subgraphs in subgraphs_n.items():
    for subgraph in same_size_subgraphs:
        adjacency = nx.to_pandas_adjacency(subgraph)
        print(attributes(adjacency))

[256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0