# Check that the minimal network is bipartite in the original identifier namespace (see if it's a problem with the adjacency list generation)

In [1]:
import pandas as pd

from collections import defaultdict
from tqdm import tqdm
from queue import Queue

In [2]:
edges = pd.read_csv("data/min_hetionet/minhet_edges.tsv", sep='\t')

In [3]:
edges.head()

Unnamed: 0,start_id,end_id,het_etype,start_htype,end_htype
0,DB00643,51547,UPREGULATES_CuG,Compound,Gene
1,DB08881,10450,UPREGULATES_CuG,Compound,Gene
2,DB01211,10450,DOWNREGULATES_CdG,Compound,Gene
3,DB00374,10450,DOWNREGULATES_CdG,Compound,Gene
4,DB00398,10450,UPREGULATES_CuG,Compound,Gene


In [4]:
edges.shape

(62060, 5)

---

In [5]:
all_nodes = set(edges["start_id"]) | set(edges["end_id"])

In [6]:
len(all_nodes)

7927

---

In [7]:
adjlist = defaultdict(set)

for idx, row in tqdm(edges.iterrows(), total=len(edges)):
    
    adjlist[row["start_id"]].add(row["end_id"])
    
    adjlist[row["end_id"]].add(row["start_id"])

100%|██████████| 62060/62060 [00:08<00:00, 7280.69it/s]


## BFS

In [8]:
visited = set()
subgraph = 0

label = dict()

for node in all_nodes:
    if node not in visited:
        
        queue = Queue()
        queue.put(node)
        
        while not queue.empty():
            cur = queue.get()
            visited.add(cur)
            
            label[cur] = subgraph
            
            for neighbour in adjlist[cur]:
                if neighbour not in visited:
                    visited.add(neighbour)
                    queue.put(neighbour)
                    
        subgraph += 1

---

In [9]:
subgraph

8

In [10]:
res = (pd
    .DataFrame(
        list(label.items()), columns=["node_id", "subgraph"]
    )
)

In [11]:
res.head()

Unnamed: 0,node_id,subgraph
0,6764,0
1,UBERON:0002523,1
2,23753,0
3,DB01511,0
4,UBERON:0001599,1


In [12]:
res["subgraph"].value_counts()

0    7381
1     531
7       3
3       3
5       3
6       2
2       2
4       2
Name: subgraph, dtype: int64

In [13]:
nodes = pd.read_csv("data/min_hetionet/minhet_nodes.tsv", sep='\t')

In [14]:
nodes.head()

Unnamed: 0,node_uid,node_id,name,het_type
0,0,10,NAT2,Gene
1,1,100,ADA,Gene
2,2,10000,AKT3,Gene
3,3,10005,ACOT8,Gene
4,4,10007,GNPDA1,Gene


In [15]:
final = res.merge(nodes.drop("node_uid", axis=1), how="inner", on="node_id")

In [16]:
final.head()

Unnamed: 0,node_id,subgraph,name,het_type
0,6764,0,ST5,Gene
1,UBERON:0002523,1,tunica intima,Anatomy
2,23753,0,SDF2L1,Gene
3,DB01511,0,Delorazepam,Compound
4,UBERON:0001599,1,stapedius muscle,Anatomy


In [17]:
final.groupby("subgraph")["het_type"].value_counts()

subgraph  het_type
0         Gene        5867
          Compound    1514
1         Anatomy      398
          Disease      133
2         Compound       2
3         Gene           2
          Compound       1
4         Compound       1
          Gene           1
5         Gene           2
          Compound       1
6         Compound       2
7         Compound       3
Name: het_type, dtype: int64

Looks like the gene and compound network is separate from the anatomy and disease network. Seems like the problem lies in the numerical ids used for deepwalk.