# Create the adjacency lists needed by deepwalk

In [1]:
import os
from collections import defaultdict

import pandas as pd
from tqdm import tqdm

## Read node map

In [2]:
# Load the node map and keep only the two columns used below:
# node_uid (the uid written to the adjacency lists) and node_id
# (the identifier the network files are joined on).
nodemap_raw = pd.read_csv("data/node_map.tsv", sep='\t')
nodemap = nodemap_raw.loc[:, ["node_uid", "node_id"]]
del nodemap_raw  # keep the namespace identical to a single chained expression

In [3]:
# Preview the first two rows to check the node_uid / node_id layout.
nodemap.head(n=2)

Unnamed: 0,node_uid,node_id
0,0,C0000039
1,1,C0000052


---

## Create undirected networks from the network files (which already include the gold standard)

In [4]:
K = 5  # number of folds; one network file (network_fold<i>.tsv) is read per fold

# Make sure the output directory exists so that a fresh checkout survives
# Restart & Run All instead of failing on the first open() below.
os.makedirs("data/adjlist", exist_ok=True)

for i in range(K):
    print("Reading edges for fold {}".format(i))
    # Attach the node_uid of both endpoints by joining the edge list against
    # the node map twice (once on source_id, once on target_id).
    # NOTE: both joins are inner joins, so an edge whose source_id or
    # target_id is missing from nodemap is silently dropped.
    edges = (pd
        .read_csv(
            "data/network/network_fold{}.tsv".format(i), sep='\t'
        )
        .merge(
            nodemap, how="inner", left_on="source_id", right_on="node_id"
        )
        .drop("node_id", axis=1)
        .rename(columns={"node_uid": "source_uid"})
        .merge(
            nodemap, how="inner", left_on="target_id", right_on="node_id"
        )
        .drop("node_id", axis=1)
        .rename(columns={"node_uid": "target_uid"})
    )

    # Build the undirected adjacency list: every edge is recorded in both
    # directions, and the sets collapse repeated edges.
    adjlist = defaultdict(set)
    for suid, tuid in tqdm(zip(edges["source_uid"], edges["target_uid"]), total=len(edges), desc="Building"):
        adjlist[suid].add(tuid)
        adjlist[tuid].add(suid)

    # Write one line per node: the node's uid followed by its neighbours'
    # uids in ascending order, all separated by single spaces. Only nodes
    # that appear in at least one edge get a line.
    with open("data/adjlist/adjlist_{}.txt".format(i), "w") as fout:
        for key, vals in tqdm(adjlist.items(), desc="Saving"):
            # sorted() accepts the set directly and map() feeds join()
            # lazily, so no intermediate list copies are needed.
            neighbours = " ".join(map(str, sorted(vals)))
            fout.write("{} {}\n".format(key, neighbours))

Reading edges for fold 0


Building: 100%|██████████| 9651879/9651879 [00:12<00:00, 786727.52it/s]
Saving: 100%|██████████| 210389/210389 [00:08<00:00, 26028.61it/s]


Reading edges for fold 1


Building: 100%|██████████| 9651833/9651833 [00:11<00:00, 827862.46it/s]
Saving: 100%|██████████| 210386/210386 [00:08<00:00, 25870.65it/s]


Reading edges for fold 2


Building: 100%|██████████| 9651822/9651822 [00:11<00:00, 804957.23it/s]
Saving: 100%|██████████| 210385/210385 [00:08<00:00, 25059.02it/s]


Reading edges for fold 3


Building: 100%|██████████| 9651803/9651803 [00:13<00:00, 700128.42it/s]
Saving: 100%|██████████| 210386/210386 [00:08<00:00, 25509.45it/s]


Reading edges for fold 4


Building: 100%|██████████| 9651879/9651879 [00:13<00:00, 694757.48it/s]
Saving: 100%|██████████| 210389/210389 [00:08<00:00, 25115.23it/s]
