# Neo4j CSV import preparation

Daniel's approach of importing a single edge at a time using Py2neo is way too slow. I suspect that Neo4j can work much faster, because CPU saturation during edge import is only around 30%. We will therefore move to CSV import: according to the online documentation, importing 1.2 million edges should take only a minute or two (instead of 20 hours).

## Steps

1. Convert the hetio.graph format into CSV files with the bare minimum of data needed to do the metapath calculations.
2. Create node and edge CSV files.
3. Batch import into Neo4j
4. Repeat for the permuted networks

In [1]:
import collections
import os
import shutil

from glob import glob
from tqdm import tqdm

import hetio.hetnet
import hetio.readwrite

## Graph readers

In [2]:
def check_node_uniqueness(graph):
    """Verify that no two nodes of ``graph`` share an identifier.

    Only the identifier half of each node's ``get_id()`` pair is compared;
    the node type is ignored, so identifiers must be unique across the
    whole graph and not merely within one node type.

    Returns:
        The set of all node identifiers.

    Raises:
        AssertionError: if the number of distinct identifiers differs from
            ``graph.n_nodes``.
    """
    id_pairs = (node.get_id() for node in graph.get_nodes())
    unique_ids = {identifier for _node_type, identifier in id_pairs}

    assert len(unique_ids) == graph.n_nodes

    return unique_ids

def read_graph(path):
    """Load and return the graph stored at ``path`` using hetio's reader."""
    graph = hetio.readwrite.read_graph(path)
    return graph

## CSV Writer

In [3]:
def write_to_csv(fname, header, generator, sep=","):
    """Write a header line followed by one delimited line per row.

    Values are converted with ``str()`` and joined with ``sep``. Nothing is
    quoted or escaped here, so any field that may contain ``sep``, quotes or
    newlines must be pre-quoted by the caller.

    Args:
        fname: Path of the file to create (overwritten if it exists).
        header: List of column names written as the first line.
        generator: Iterable of rows; each row is an iterable of values.
        sep: Field delimiter, a comma by default.

    Raises:
        AssertionError: if ``header`` is not a list or ``generator`` is not
            iterable.
    """
    # ``collections.Iterable`` was removed in Python 3.10; the ABC lives in
    # ``collections.abc``. Imported locally so this function does not depend
    # on the submodule having been loaded elsewhere.
    from collections.abc import Iterable

    assert isinstance(header, list)
    assert isinstance(generator, Iterable)

    # Write UTF-8 explicitly rather than the platform's locale encoding, so
    # non-ASCII names survive regardless of where this runs.
    with open(fname, "w", encoding="utf-8") as fout:
        fout.write("{}\n".format(sep.join(header)))

        for vals in generator:
            fout.write("{}\n".format(sep.join(str(v) for v in vals)))

---

## Create CSV storage directory

In [4]:
csv_folder = os.path.join("data", "import_csvs")

# Remove whatever a previous run left behind, then recreate the folder
# empty so that stale CSVs never mix with freshly generated ones.
if os.path.exists(csv_folder):
    shutil.rmtree(csv_folder)
os.makedirs(csv_folder)

---

## Node and edge generators

In [5]:
def gen_nodes(graph, show_progress=False):
    """Return an iterable of CSV rows, one per node of ``graph``.

    Each row is ``(node_id, node_id, quoted_name, node_type)``: the
    identifier appears twice, and the name is wrapped in double quotes so
    that delimiters inside it do not split the field.

    Args:
        graph: Object exposing ``get_nodes()`` and ``n_nodes``; each node
            must expose ``get_id()`` returning ``(node_type, node_id)`` and
            a ``name`` attribute.
        show_progress: When true, wrap the rows in a tqdm progress bar.

    Returns:
        A lazy iterable of 4-tuples.
    """
    def make_gen():
        for node in graph.get_nodes():
            node_type, node_id = node.get_id()

            # A double quote inside a quoted CSV field must be doubled,
            # otherwise the field ends early and the row is malformed.
            name = str(node.name).replace('"', '""')

            yield (node_id, node_id, '"{}"'.format(name), node_type)

    gen = make_gen()
    if show_progress:
        gen = tqdm(gen, total=graph.n_nodes, desc="Nodes")

    return gen

In [6]:
def gen_edges(graph, node_ids, show_progress=False):
    """Return an iterable of ``(source_id, target_id, type)`` edge rows.

    Edges are read with ``graph.get_edges(exclude_inverts=True)``. The
    relationship type is the upper-cased metaedge kind, with spaces turned
    into underscores, followed by ``_`` and the metaedge's standard
    abbreviation.

    Args:
        graph: Object exposing ``get_edges(exclude_inverts=...)`` and
            ``n_edges``.
        node_ids: Container of known node identifiers; both endpoints of
            every edge must be members.
        show_progress: When true, wrap the rows in a tqdm progress bar.

    Raises:
        AssertionError: while iterating, if an edge endpoint is not in
            ``node_ids``.
    """
    def relationship_type(metaedge):
        kind = str(metaedge.kind).upper().replace(" ", "_")
        return "{}_{}".format(kind, metaedge.get_standard_abbrev())

    def edge_rows():
        for edge in graph.get_edges(exclude_inverts=True):
            start_id = edge.source.get_id()[1]
            end_id = edge.target.get_id()[1]
            rel_type = relationship_type(edge.metaedge)

            # Every endpoint must refer to a node that was already seen.
            assert start_id in node_ids
            assert end_id in node_ids

            yield (start_id, end_id, rel_type)

    rows = edge_rows()
    if not show_progress:
        return rows

    return tqdm(rows, total=graph.n_edges, desc="Edges")

---

## Graph to CSV converter

In [7]:
def convert_to_csv(fname, out_dir=os.path.join("data", "import_csvs")):
    """Convert one serialized graph into a node CSV and an edge CSV.

    The graph is read from ``fname``, its node identifiers are checked for
    uniqueness, and two files are written into ``out_dir``:
    ``<stem>_nodes.csv`` and ``<stem>_edges.csv``.

    Args:
        fname: Path of the graph file; names are assumed to match
            ``*.json.bz2``.
        out_dir: Directory receiving the CSV files. It is not created here
            and must already exist. Defaults to ``data/import_csvs``.
    """
    graph = read_graph(fname)

    # Doubles as validation: fails if any identifier is shared by two nodes.
    node_ids = check_node_uniqueness(graph)

    # Keep only the text before the first dot of the file name, e.g.
    # "hetnet_perm-4.json.bz2" -> "hetnet_perm-4".
    out_name = os.path.basename(fname).split(".")[0]

    # write nodes to file
    write_to_csv(
        os.path.join(out_dir, "{}_nodes.csv".format(out_name)),
        [":ID", "identifier:string", "name:string", ":LABEL"],
        gen_nodes(graph, show_progress=True),
    )

    # write edges to file
    write_to_csv(
        os.path.join(out_dir, "{}_edges.csv".format(out_name)),
        [":START_ID", ":END_ID", ":TYPE"],
        gen_edges(graph, node_ids, show_progress=True),
    )

## Convert networks to CSV

In [8]:
# The unpermuted network comes first, followed by every permuted network
# found on disk (in whatever order glob returns them).
fnames = ["data/hetnet.json.bz2"]
fnames.extend(glob("data/permuted/*.json.bz2"))

for fname in fnames:
    print("Converting {} to CSV".format(fname))
    convert_to_csv(fname)

Converting data/hetnet.json.bz2 to CSV


Nodes: 100%|██████████| 47031/47031 [00:00<00:00, 126138.09it/s]
Edges: 100%|██████████| 2194470/2194470 [00:20<00:00, 106031.03it/s]


Converting data/permuted/hetnet_perm-4.json.bz2 to CSV


Nodes: 100%|██████████| 47031/47031 [00:00<00:00, 136276.62it/s]
Edges: 100%|██████████| 2194470/2194470 [00:20<00:00, 106771.14it/s]


Converting data/permuted/hetnet_perm-5.json.bz2 to CSV


Nodes: 100%|██████████| 47031/47031 [00:00<00:00, 120769.10it/s]
Edges: 100%|██████████| 2194470/2194470 [00:20<00:00, 106765.12it/s]


Converting data/permuted/hetnet_perm-2.json.bz2 to CSV


Nodes: 100%|██████████| 47031/47031 [00:00<00:00, 117344.63it/s]
Edges: 100%|██████████| 2194470/2194470 [00:21<00:00, 103854.39it/s]


Converting data/permuted/hetnet_perm-1.json.bz2 to CSV


Nodes: 100%|██████████| 47031/47031 [00:00<00:00, 137597.35it/s]
Edges: 100%|██████████| 2194470/2194470 [00:20<00:00, 105673.70it/s]


Converting data/permuted/hetnet_perm-3.json.bz2 to CSV


Nodes: 100%|██████████| 47031/47031 [00:00<00:00, 132561.45it/s]
Edges: 100%|██████████| 2194470/2194470 [00:20<00:00, 107203.90it/s]
