# Check to ensure the network we generated is bipartite

2018-07-12

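We are verifying that the network we provide to DeepWalk is "bipartite", i.e. that the chemicals and diseases end up in different subnetworks. Note that the check below is a connected-components analysis (a breadth-first search that labels each node with the subnetwork it belongs to): it tests whether the network falls apart into separate, disconnected subnetworks, not bipartiteness in the strict graph-theoretic sense.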

In [1]:
from collections import defaultdict, deque
from queue import Queue

import pandas as pd

## Read adjlist

In [2]:
def read_adjlist(fname):
    """Read an adjacency-list file into a node -> neighbours mapping.

    Every non-blank line holds whitespace-separated integers: the first is
    a node id, the remaining ones (possibly none) are its neighbours.
    If a node id appears on several lines, the last line wins.

    Parameters
    ----------
    fname : str
        Path of the adjacency-list file.

    Returns
    -------
    collections.defaultdict
        Maps each node id (int) to the list of its neighbour ids (ints).
        Looking up an id that was not in the file yields an empty list.

    Raises
    ------
    ValueError
        If a token on a line is not an integer.
    """
    adjlist = defaultdict(list)

    with open(fname, "r") as fin:
        for line in fin:
            # split() without an argument splits on runs of whitespace and
            # drops leading/trailing whitespace, so trailing spaces, tabs or
            # "\r\n" line endings do not produce empty tokens that int()
            # would reject.
            tokens = line.split()
            if not tokens:
                # blank line: nothing to record
                continue

            node_idx, *neighbours = map(int, tokens)
            adjlist[node_idx] = neighbours

    return adjlist

In [3]:
# Load the adjacency list under test (adjlist_0.txt from the min_hetionet test data)
adjlist = read_adjlist("data/min_hetionet/test/adjlist_0.txt")

In [4]:
len(adjlist)

7927

---

In [5]:
# Iterating a dict yields its keys, i.e. every node id that has a line in the file
all_nodes = set(adjlist)

In [6]:
# check that all node ids are consistent: every id that appears as a
# neighbour must also appear as a key of the adjacency list
for node_idx, neigh in adjlist.items():
    unknown = set(neigh) - all_nodes
    # name the offending ids so a failure can be traced back to the input file
    assert not unknown, "node {} has neighbours missing from the adjacency list: {}".format(
        node_idx, sorted(unknown)
    )

## BFS to identify subnetworks

In [7]:
# Label every node with the index of the connected component ("subgraph") it
# belongs to, running one breadth-first search per not-yet-visited start node.
# NOTE(review): the search only follows edges in the direction stored in
# `adjlist`; this assumes the adjacency list is symmetric (undirected) --
# TODO confirm against the code that writes the file.
visited = set()
subgraph = 0  # label for the next component; equals the component count after the loop

label = dict()  # node id -> component label

for node in all_nodes:
    if node in visited:
        continue

    # Nodes are marked as visited when they are enqueued, so each node enters
    # the queue at most once. A deque is a plain FIFO; queue.Queue adds
    # thread-synchronisation overhead that a single-threaded BFS does not need.
    visited.add(node)
    queue = deque([node])

    while queue:
        cur = queue.popleft()
        label[cur] = subgraph

        for neighbour in adjlist[cur]:
            if neighbour not in visited:
                visited.add(neighbour)
                queue.append(neighbour)

    subgraph += 1

In [8]:
subgraph

8

---

## Examine subgraphs and membership

In [9]:
# One row per node: its id and the subgraph label assigned by the BFS
res = pd.DataFrame(list(label.items()), columns=["node_uid", "subgraph"])

In [10]:
res.shape

(7927, 2)

In [11]:
res.head()

Unnamed: 0,node_uid,subgraph
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [12]:
# Number of nodes in each subgraph
res["subgraph"].value_counts()

0    7381
1     531
6       3
5       3
4       3
7       2
3       2
2       2
Name: subgraph, dtype: int64

### Add in node name and type

In [13]:
# Tab-separated node table; it is joined to the subgraph labels on node_uid further down
nodes = pd.read_csv("data/min_hetionet/minhet_nodes.tsv", sep='\t')

In [14]:
nodes.head()

Unnamed: 0,node_uid,node_id,name,het_type
0,0,10622,POLR3G,Gene
1,1,1371,CPOX,Gene
2,2,55930,MYO5C,Gene
3,3,54778,RNF111,Gene
4,4,55666,NPLOC4,Gene


In [15]:
# Attach the node metadata to every labelled node. `validate` makes pandas
# raise a MergeError if node_uid is duplicated on either side; without it a
# duplicated key would silently multiply rows and inflate the counts below.
final = res.merge(nodes, how="left", on="node_uid", validate="one_to_one")

In [16]:
final.shape

(7927, 5)

In [17]:
final.head()

Unnamed: 0,node_uid,subgraph,node_id,name,het_type
0,0,0,10622,POLR3G,Gene
1,1,0,1371,CPOX,Gene
2,2,0,55930,MYO5C,Gene
3,3,0,54778,RNF111,Gene
4,4,0,55666,NPLOC4,Gene


In [18]:
# For each node type, count how many of its nodes fall into each subgraph
final.groupby("het_type")["subgraph"].value_counts()

het_type  subgraph
Anatomy   1            398
Compound  0           1514
          5              3
          3              2
          7              2
          2              1
          4              1
          6              1
Disease   1            133
Gene      0           5867
          4              2
          6              2
          2              1
Name: subgraph, dtype: int64

In [19]:
# For each subgraph, count its nodes by type
final.groupby("subgraph")["het_type"].value_counts()

subgraph  het_type
0         Gene        5867
          Compound    1514
1         Anatomy      398
          Disease      133
2         Compound       1
          Gene           1
3         Compound       2
4         Gene           2
          Compound       1
5         Compound       3
6         Gene           2
          Compound       1
7         Compound       2
Name: het_type, dtype: int64

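The network does indeed separate the gene/compound nodes from the anatomy/disease nodes: subgraph 0 contains only genes and compounds, and subgraph 1 contains only anatomy and disease nodes. Some nodes are even more isolated: six small subgraphs (2–7, two or three nodes each, made up of compounds and/or genes) are disconnected from everything else, which is why there are more than two subnetworks.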