In [33]:
import json
import time
from functools import reduce

import pandas as pd
import numpy as np

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from tqdm.autonotebook import tqdm

from py2neo import Graph, Node, Relationship

In [34]:
graph = Graph("bolt://dev_neo4j:7687", auth=('neo4j','myneo'))

n_nodes = graph.database.primitive_counts['NumberOfNodeIdsInUse']
n_relationships = graph.database.primitive_counts['NumberOfRelationshipIdsInUse']
print("Connected to graph database with {:,} nodes and {:,} relationships!".format
     (n_nodes, n_relationships))

def run_query(query, graph, print_query=False, run_query=True, 
              print_only=False, to_df=False, verbose=True):
    df = 1
    if print_only: 
        print_query = True
        run_query = False
    start_time = time.time()
    if print_query:
        print(query)
    if run_query:
        if to_df:
            df = graph.run(query).to_data_frame()
        else:
            graph.run(query)
    end_time = time.time()
    minutes_elapsed = (end_time-start_time)/60
    if verbose:
        print("Query completed in {:.2f} minutes.".format(minutes_elapsed))
    return df

Connected to graph database with 370,269,897 nodes and 220,155,390 relationships!


In [35]:
# Write edge lists
min_year, max_year = 1900, 2020
for y in tqdm(range(min_year, max_year+1)):
    print("{}...".format(y))
        
    # Write edge list for graph at year=y
    query = """
    CALL apoc.export.csv.query('
    MATCH (z:Year)<-[:PUBLISHED_IN]-(a:Quanta)-[:CITES]->(b:Quanta)-[:PUBLISHED_IN]->(y:Year)
    WHERE z.value<={year} AND y.value<={year}
    RETURN id(a) as source, id(b) as target
    ','{out}',
    {{quotes:false}});
    """.format(year=y, out='/import/embeddings/quanta.{}.edgelist'.format(y))
    run_query(query, graph, print_only=True)

HBox(children=(IntProgress(value=0, max=121), HTML(value='')))

1900...

    CALL apoc.export.csv.query('
    MATCH (z:Year)<-[:PUBLISHED_IN]-(a:Quanta)-[:CITES]->(b:Quanta)-[:PUBLISHED_IN]->(y:Year)
    WHERE z.value<=1900 AND y.value<=1900
    RETURN id(a) as source, id(b) as target
    ','/import/embeddings/quanta.1900.edgelist',
    {quotes:false});
    
Query completed in 0.00 minutes.
1901...

    CALL apoc.export.csv.query('
    MATCH (z:Year)<-[:PUBLISHED_IN]-(a:Quanta)-[:CITES]->(b:Quanta)-[:PUBLISHED_IN]->(y:Year)
    WHERE z.value<=1901 AND y.value<=1901
    RETURN id(a) as source, id(b) as target
    ','/import/embeddings/quanta.1901.edgelist',
    {quotes:false});
    
Query completed in 0.00 minutes.
1902...

    CALL apoc.export.csv.query('
    MATCH (z:Year)<-[:PUBLISHED_IN]-(a:Quanta)-[:CITES]->(b:Quanta)-[:PUBLISHED_IN]->(y:Year)
    WHERE z.value<=1902 AND y.value<=1902
    RETURN id(a) as source, id(b) as target
    ','/import/embeddings/quanta.1902.edgelist',
    {quotes:false});
    
Query completed in 0.00 minutes.
1903...

 

In [None]:
# Replace commas with spaces
find . -name 'quanta.*.edgelist' -exec sed -i "s/,/ /g" {} \;

In [None]:
# Delete first row TODO update to first two rows and add header:false to LOAD CSV
for filename in quanta.*.edgelist; do
     tail -n +2 "$filename" > "$filename.tmp"
     mv -f "$filename.tmp" "$filename" 
done

In [None]:
# Run node2vec
for filename in quanta.*.edgelist; do
     ./node2vec -i:"$filename" -o:"$filename.emb" -d:10 -dr -v
done

In [42]:
# Import results to graph
min_year, max_year = 1900, 1990
for y in tqdm(range(min_year, max_year+1)):
    print("{}...".format(y))
    
    edgelist = '/import/embeddings/quanta.{}.edgelist'.format(y)
    
    # Write edge list for graph at year=y
    query = """
    CALL apoc.load.csv('/import/embeddings/quanta.{year}.edgelist.emb', {{sep:" "}})
    YIELD list 
    WITH apoc.convert.toInteger(head(list)) as nodeId, 
        [x IN tail(list) | apoc.convert.toFloat(x)] AS embedding
    MATCH (n) WHERE id(n)=nodeId
    MATCH (y:Year) WHERE y.value={year}
    MERGE(n)-[r:METRICS_IN]->(y)
    SET r.node2vec=embedding
    """.format(year=y)
    run_query(query, graph, print_only=False)

HBox(children=(IntProgress(value=0, max=91), HTML(value='')))

1900...

    CALL apoc.load.csv('/import/embeddings/quanta.1900.edgelist.emb', {sep:" "})
    YIELD list 
    WITH apoc.convert.toInteger(head(list)) as nodeId, 
        [x IN tail(list) | apoc.convert.toFloat(x)] AS embedding
    MATCH (n) WHERE id(n)=nodeId
    MATCH (y:Year) WHERE y.value=1900
    MERGE(n)-[r:METRICS_IN]->(y)
    SET r.node2vec=embedding
    
Query completed in 0.00 minutes.
1901...

    CALL apoc.load.csv('/import/embeddings/quanta.1901.edgelist.emb', {sep:" "})
    YIELD list 
    WITH apoc.convert.toInteger(head(list)) as nodeId, 
        [x IN tail(list) | apoc.convert.toFloat(x)] AS embedding
    MATCH (n) WHERE id(n)=nodeId
    MATCH (y:Year) WHERE y.value=1901
    MERGE(n)-[r:METRICS_IN]->(y)
    SET r.node2vec=embedding
    
Query completed in 0.00 minutes.
1902...

    CALL apoc.load.csv('/import/embeddings/quanta.1902.edgelist.emb', {sep:" "})
    YIELD list 
    WITH apoc.convert.toInteger(head(list)) as nodeId, 
        [x IN tail(list) | apoc.convert.to