In [None]:
from py2neo import Graph, Node, Relationship 
from tqdm import tqdm
import glob, os, time

# Connection settings. Each value can be overridden through an environment
# variable so the notebook does not have to be edited to target another
# server; the fallbacks are the values that used to be hard-coded here.
# Alternative host used previously: bolt://matlaber5.media.mit.edu:7687
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://54.174.175.98:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
# NOTE(review): the fallback password is still committed with the notebook;
# drop the default once NEO4J_PASSWORD is set wherever this is run.
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "myneo")
graph = Graph(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

# Report the store's node/relationship id counts as a quick connectivity check.
n_nodes = graph.database.primitive_counts['NumberOfNodeIdsInUse']
n_relationships = graph.database.primitive_counts['NumberOfRelationshipIdsInUse']
print("Connected to graph database with {:,} nodes and {:,} relationships!".format(n_nodes, n_relationships))

In [None]:
def query_to_df(query, graph):
    """Run ``query`` against ``graph`` and return the result as a DataFrame.

    A start message is printed before the query is sent, and the elapsed
    wall-clock time (in minutes) is printed once the result has been
    converted with ``to_data_frame()``.
    """
    print("Starting query...", end=" ")
    started_at = time.time()
    cursor = graph.run(query)
    result_frame = cursor.to_data_frame()
    elapsed_minutes = (time.time() - started_at) / 60
    print("Done ({:.2f} minutes).".format(elapsed_minutes))
    return result_frame

In [None]:
def run_query(query, graph, start_year=1800, end_year=2020, print_query=False, run_query=True, print_only=False, to_df=False, verbose=True):
    """Optionally print and/or execute a Cypher query.

    Args:
        query: Cypher text to send.
        graph: object exposing ``run(query)``; the returned cursor must offer
            ``to_data_frame()`` when ``to_df`` is true.
        start_year, end_year: accepted for backward compatibility; unused.
        print_query: print the query text before running it.
        run_query: execute the query when true.
        print_only: shorthand for ``print_query=True, run_query=False``.
        to_df: convert the result with ``to_data_frame()`` and return it.
        verbose: print the elapsed time after the query has been executed.

    Returns:
        The DataFrame when ``to_df`` is true and the query was executed,
        otherwise ``None``.
    """
    if print_only:
        print_query = True
        run_query = False

    if print_query:
        print(query)

    # Nothing was executed, so there is neither a result nor a timing to
    # report (previously this path raised on the unbound locals).
    if not run_query:
        return None

    start_time = time.time()
    cursor = graph.run(query)
    # Only materialise the result when the caller asked for it.
    df = cursor.to_data_frame() if to_df else None
    minutes_elapsed = (time.time() - start_time) / 60

    if verbose:
        print("Query completed in {:.2f} minutes.".format(minutes_elapsed))
    return df

In [None]:
# Years the metric functions below iterate over: 1800 through 2019
# (the range end, 2020, is exclusive).
years = range(1800, 2020)

### Author-Paper Features

In [None]:
# TODO test 
# Assumes that every author has at least a single citation on the papers they write
def author_h_index(years):
    """Send one batched h-index query per year in ``years``.

    Each query is an ``apoc.periodic.iterate`` call whose second statement
    sets ``hindex`` on the author's METRICS_IN relationship to that year.
    The first statement still carries ``LIMIT 10`` (questioned in the query
    itself), so this is effectively a trial run.
    """
    template = """
        CALL apoc.periodic.iterate(
        "MATCH (a:Author)-[:AUTHORED]->(q:Quanta) 
        WHERE q.year <= {}
        RETURN [a,q] AS nodes LIMIT 10",
            // Why limit 10?
            
        "WITH head(nodes) AS a, last(nodes) AS q
        MATCH (p:Quanta)-[:CITES]->(q) WHERE p.year <= {}
        WITH count(p) as citations, a, collect(citations) as paper_citations 
            // TODO try to RETURN a, paper_citations with fake list and check results
            // ** can we do collect(count(p)) in one step and will this save time?)
            //assuming that doing count(p) only counts the number of citations for each independent quanta
        WITH a, apoc.coll.sort(paper_citations) as inverse, apoc.coll.reverse(inverse) as ordered_citations
            // assuming first sort gives them ascending
            // ordered citation is list of integers
        WITH a, apoc.coll.toList([x in ordered_citations | {{index: apoc.coll.indexOf(ordered_citations, x), value: x}}]) as indexed_citations
        WITH a, apoc.coll.toList([x in indexed_citations WHERE x.index-x.value<0 | x.index]) as filtered
            //because indexes start at zero
        MERGE (a)-[m:METRICS_IN]->(y:Year {{year:{}}})
            // year is right property here?
        SET m.hindex = head(filtered)+1",
            // TODO check for off by one errors here
        {{batchSize:5000, iterateList:true, parallel:false}});
        """

    # The same year fills the two cut-off filters and the Year node lookup.
    for current_year in years:
        cypher = template.format(current_year, current_year, current_year)
        run_query(cypher, graph)
        
#author_h_index(years)   

In [None]:
def test_author_h_index_delta(author, year):
    query = """MATCH (a:Author {normalizedName: '{}'})-[m:METRICS_IN]->(y:Year {year:{}})
    MATCH (a)-[pm:METRICS_IN]->(:Year {year:{}-1})
    RETURN a.name, m.hindex as original, pm.hindex as previous, m.hindex-pm.hindex as delta""".format(author, year, year, year)
    # not sure if my formatting is correct
    df = run_query(query, graph, to_df=True)
    return df

In [None]:
def author_citation_count(years):
    """Send one batched total-citations query per year in ``years``.

    Each query is an ``apoc.periodic.iterate`` call whose second statement
    sets ``total_citations`` on the author's METRICS_IN relationship to
    that year.
    """
    template = """
        CALL apoc.periodic.iterate(
        
        "MATCH (a:Author)-[:AUTHORED]->(q:Quanta)
        WHERE q.year <= {}
        RETURN [a,q] AS nodes",
        
        "WITH head(nodes) AS a, last(nodes) AS q
        MATCH (q)<-[r:CITES]-(p:Quanta) WHERE p.year <= {}
        WITH a, r
        MERGE (a)-[m:METRICS_IN]->(y:Year {{year:{}}})
        SET m.total_citations = count(r)",
        
        {{batchSize:5000, iterateList:true, parallel:false}});
        """

    # The same year fills the two cut-off filters and the Year node lookup.
    for current_year in years:
        cypher = template.format(current_year, current_year, current_year)
        run_query(cypher, graph)

#author_citation_count(years)

In [None]:
def test_author_citation_count(author, year):
    """ Takes in a normalized name and returns total citations count""" 

    query = """
    MATCH (a:Author {{normalizedName: '{}'}})-[:AUTHORED]->(q:Quanta)
    WHERE q.year <= {}
    WITH a, q
    MATCH (q)<-[r:CITES]-(p:Quanta) WHERE p.year <= {}
    RETURN a.name as name, q.title as title, count(r) as citations
    """.format(author, year, year)

    df = run_query(query, graph, to_df=True)
    return df

# Spot-check for a single author (normalized name) with cut-off year 2008;
# this sends a query to the database when the cell runs.
author_citation_ct = test_author_citation_count('edward s boyden', 2008)
#author_citation_ct

In [None]:
def test_author_citation_delta(author, year):
    query = """MATCH (a:Author {normalizedName: '{}'})-[m:METRICS_IN]->(y:Year {year:{}})
    MATCH (a)-[pm:METRICS_IN]->(:Year {year:{}-1})
    RETURN a.name, m.total_citations as original, pm.total_citations as previous, m.total_citations-pm.total_citations as delta""".format(author, year, year, year)
    # not sure if my formatting is correct
    df = run_query(query, graph, to_df=True)
    return df

In [None]:
# TODO test 
def author_mean_citations_per_paper(years):
    """Send one batched citations-per-paper query per year in ``years``.

    Each query is an ``apoc.periodic.iterate`` call whose second statement
    sets ``citations_per_paper`` on the author's METRICS_IN relationship to
    that year.

    NOTE(review): only the Python string formatting was repaired here (the
    template previously raised inside ``str.format``). The Cypher itself is
    unchanged and still untested, as the TODO above this function says.
    """
    for year in years:
        # A named placeholder is used because the year occurs many times,
        # including inside the commented-out Cypher drafts; literal Cypher
        # braces are doubled so str.format() leaves them alone.
        query = """
        CALL apoc.periodic.iterate(
        "MATCH (a:Author)-[:AUTHORED]->(q:Quanta)
        WHERE q.year <= {year}
        RETURN [a,q, count(*)] AS info",

        "WITH info[0] AS a, info[1] AS q, info[2] as papers_written
         MERGE (a)-[m:METRICS_IN]->(y:Year {{year:{year}}})
         SET m.citations_per_paper = size((p:Quanta)-[:CITES]->(q) WHERE p.year <= {year})/papers_written",

         //WITH a, count(r) as citations
         //MERGE (a)-[m:METRICS_IN]->(y:Year {{year:{year}}})
         //MATCH (q)<-[r:CITES]-(p:Quanta) WHERE p.year <= {year}

         //SET m.citations_per_paper = citations / papers_written

        //"WITH head(info) AS a, tail(info) as q,
        //collect(size((q)<-[:CITES]-(:Quanta))) as citations
        //MERGE (a)-[m:METRICS_IN]->(y:Year {{year:{year}}})
        //SET m.mean_citations_per_paper = apoc.coll.avg(citations)",
        {{batchSize:5000, iterateList:true, parallel:false}});
        """.format(year=year)

        run_query(query, graph)

#author_mean_citations_per_paper(years)

In [None]:
def test_author_mean_citations_per_paper(author, year):
    """ Takes in a normalized name and returns total citations count""" 
    
    query = """
    MATCH (a:Author {{normalizedName: '{}'}})-[r:AUTHORED]->(q:Quanta)
    WHERE q.year <= {}
    WITH a, collect(size((q)<-[:CITES]-(p:Quanta) WHERE p.year<={})) as citations
    RETURN a.name, citations, apoc.coll.avg(citations) as avg
    """.format(author, year, year)

    df = run_query(query, graph, to_df=True)
    return df

# Spot-check for a single author (normalized name) with cut-off year 2008;
# this sends a query to the database when the cell runs.
author_mean_citations_paper = test_author_mean_citations_per_paper('edward s boyden', 2008)
#author_mean_citations_paper

def another_test_author_mean_citations_per_paper(author, year):
    """Alternative check: citation count and paper count for one author.

    Args:
        author: value matched against ``Author.normalizedName``.
        year: cut-off applied to the author's papers and to the citing papers.

    Returns:
        Whatever ``run_query(..., to_df=True)`` returns for a query yielding
        ``name``, ``citations`` and ``papers_written``.
    """
    # The literal braces of the Cypher property map are doubled so that
    # str.format() only fills the three real placeholders.
    query = """MATCH (a:Author {{normalizedName: '{}'}})-[:AUTHORED]->(q:Quanta)
    WHERE q.year <= {}
    WITH a, collect(q) as papers
    WITH a, papers, size(papers) as papers_written
    UNWIND papers as paper
    MATCH (p:Quanta)-[:CITES]->(paper)
    WHERE p.year <= {}
    WITH DISTINCT a.name as name, papers_written, count(p) as citations
    RETURN name, citations, papers_written""".format(author, year, year)

    df = run_query(query, graph, to_df=True)
    return df

In [None]:
def test_author_citations_per_paper_delta(author, year):
    query = """MATCH (a:Author {normalizedName: '{}'})-[m:METRICS_IN]->(y:Year {year:{}})
    MATCH (a)-[pm:METRICS_IN]->(:Year {year:{}-1})
    RETURN a.name, m.citations_per_paper as original, pm.citations_per_paper as previous, m.citations_per_paper-pm.citations_per_paper as delta""".format(author, year, year, year)
    # not sure if my formatting is correct
    df = run_query(query, graph, to_df=True)
    return df

In [None]:
# TODO test 
def author_mean_citations_per_year(years):
    """Send one batched citations-per-year query per year in ``years``.

    Each query is an ``apoc.periodic.iterate`` call whose second statement
    sets ``citations_per_year`` on the author's METRICS_IN relationship to
    that year.

    NOTE(review): only the Python string formatting was repaired here. The
    Cypher is unchanged and untested; in particular the divisor uses the
    constant 2019 rather than the loop year - confirm that is intended.
    """
    for year in years:
        # Literal Cypher braces are doubled, and the year is supplied once
        # per placeholder (two cut-off filters plus the Year node lookup).
        query = """
        CALL apoc.periodic.iterate(
        "MATCH (a:Author)-[:AUTHORED]->(q:Quanta)
        WHERE q.year <= {}
        WITH a, q, collect(q.year) as years, apoc.coll.min(years) as start_year
        RETURN [a,q, start_year] AS info",

        "WITH info[0] AS a, info[1] AS q, info[2] as start_year
        MERGE (a)-[m:METRICS_IN]->(y:Year {{year:{}}})
        SET m.citations_per_year = size((p:Quanta)-[:CITES]->(q) WHERE p.year <= {})/(2019-start_year)",
        {{batchSize:5000, iterateList:true, parallel:false}});
        """.format(year, year, year)

        run_query(query, graph)
        
#author_mean_citations_per_year(years)

In [None]:
def test_author_mean_citations_per_year(author, year):
    """ Takes in a normalized name and returns total citations count""" 
    
    query = """
    MATCH (a:Author {{normalizedName: '{}'}})-[r:AUTHORED]->(q:Quanta)
    WHERE q.year <= {}
    WITH a, q, collect(q.year) as years 
    WITH a, q, apoc.coll.min(years) as start_year
    WITH a, size((p:Quanta)-[:CITES]->(q) WHERE p.year <= {}) as citations
    RETURN a.name, citations, citations/(2019-start_year)
    
    //WITH a, collect(size((q)<-[:CITES]-(:Quanta))) as citations
    //RETURN a.name, citations, apoc.coll.avg(citations) as avg
    //collect(q.year) as years, apoc.coll.min(years) as start_year
    """.format(author, year, year)

    df = run_query(query, graph, to_df=True)
    return df

# Spot-check for a single author (normalized name) with cut-off year 2008;
# this sends a query to the database when the cell runs, and the bare name
# on the last line displays the result as the cell output.
author_mean_citations_year = test_author_mean_citations_per_year('edward s boyden', 2008)
author_mean_citations_year

In [None]:
def author_papers_ct(years):
    """Send one batched paper-count query per year in ``years``.

    Each query is an ``apoc.periodic.iterate`` call whose second statement
    sets ``total_papers`` on the author's METRICS_IN relationship to that
    year.
    """
    template = """
        CALL apoc.periodic.iterate(
        
        "MATCH (a:Author)-[:AUTHORED]->(q:Quanta)
        WHERE q.year <= {}
        RETURN [a,count(*)] AS info",
            
        "WITH head(info) AS a, last(info) AS count
        MERGE (a)-[m:METRICS_IN]->(y:Year {{year:{}}})
        SET m.total_papers = count",
        
        {{batchSize:5000, iterateList:true, parallel:false}});
        """

    # The year fills the cut-off filter and the Year node lookup.
    for current_year in years:
        cypher = template.format(current_year, current_year)
        run_query(cypher, graph)
        
#author_papers_ct(years)

In [None]:
def test_author_papers_count(author, year):
    """ Takes in a normalized name and returns total paper count""" 

    query = """
    MATCH (a:Author {{normalizedName: '{}'}})-[:AUTHORED]->(q:Quanta)
    WHERE q.year <= {}
    RETURN a.name as name, count(*) as papers
    """.format(author, year, year)

    df = run_query(query, graph, to_df=True)
    return df

# Spot-check for a single author (normalized name) with cut-off year 2018;
# this sends a query to the database when the cell runs.
auth_papers_ct = test_author_papers_count('edward s boyden', 2018)
#auth_papers_ct

In [None]:
def test_author_papers_delta(author, year):
    query = """MATCH (a:Author {normalizedName: '{}'})-[m:METRICS_IN]->(y:Year {year:{}})
    MATCH (a)-[pm:METRICS_IN]->(:Year {year:{}-1})
    RETURN a.name, m.total_papers as original, pm.total_papers as previous, m.total_papers-pm.total_papers as delta""".format(author, year, year, year)
    # not sure if my formatting is correct
    df = run_query(query, graph, to_df=True)
    return df

In [None]:
# TODO test 
# Author Age
def author_age(years):
    """Send one batched author-age query per year in ``years``.

    Each query is an ``apoc.periodic.iterate`` call whose second statement
    sets ``author_age`` on the author's METRICS_IN relationship to that year.

    NOTE(review): only the Python string formatting was repaired here. The
    Cypher is unchanged and untested; the age is computed from the constant
    2019 rather than the loop year - confirm that is intended.
    """
    for year in years:
        # Literal Cypher braces are doubled, and the year is supplied once
        # per placeholder (cut-off filter and Year node lookup).
        query = """
        CALL apoc.periodic.iterate(
        "MATCH (a:Author)-[:AUTHORED]->(q:Quanta)
        WHERE q.year <= {}
        WITH a, collect(q.year) as years, apoc.coll.min(years) as start_year
        RETURN [a, start_year] AS info",

        "WITH info[0] AS a, info[1] AS start_year
        MERGE (a)-[m:METRICS_IN]->(y:Year {{year:{}}})
        SET m.author_age = 2019-start_year",
        {{batchSize:5000, iterateList:true, parallel:false}});
        """.format(year, year)

        run_query(query, graph)
    
#author_age(years)

In [None]:
# TODO test
# Not sure how this reacts to the fact that coauthors are added over time
def author_recent_num_coauthors(years):
    """Send one batched recent-coauthor-count query per year in ``years``.

    The first statement selects papers from the given year or the year
    before; the second sets ``recent_coauthor_count`` on the author's
    METRICS_IN relationship to that year.

    NOTE(review): only the Python string formatting was repaired here. The
    Cypher is unchanged and untested; ``!=`` and ``size(MATCH ...)`` do not
    look like valid Cypher - verify before running.
    """
    for year in years:
        # Literal Cypher braces are doubled, and the year is supplied once
        # per placeholder (two in the year filter, one for the Year node).
        query = """
        CALL apoc.periodic.iterate(
        "MATCH (a:Author)-[:AUTHORED]->(q:Quanta)
        WHERE q.year = {} OR q.year = {} - 1
        RETURN [a,q] AS nodes",

        "WITH head(nodes) AS a, last(nodes) AS q
        MERGE (a)-[m:METRICS_IN]->(y:Year {{year:{}}})
        SET m.recent_coauthor_count = size(MATCH (b:Author)-[:AUTHORED]->(q) WHERE b.name != a.name)",
            // will this double count?
        {{batchSize:5000, iterateList:true, parallel:false}});
        """.format(year, year, year)

        run_query(query, graph)
        
#author_recent_num_coauthors(years)

In [None]:
# TODO test
def author_max_single_paper_citations(years):
    """Send one batched max-citations query per year in ``years``.

    Each query is an ``apoc.periodic.iterate`` call whose second statement
    sets ``max_citations`` on the author's METRICS_IN relationship to that
    year.

    NOTE(review): only the Python string formatting was repaired here; the
    Cypher is unchanged and untested.
    """
    for year in years:
        # Literal Cypher braces are doubled, and the year is supplied once
        # per placeholder (two cut-off filters plus the Year node lookup).
        query = """
        CALL apoc.periodic.iterate(
        "MATCH (a:Author)-[:AUTHORED]->(q:Quanta)
        WHERE q.year <= {}
        RETURN [a,q] AS nodes",

        "WITH head(nodes) AS a, last(nodes) AS q
        MATCH (p:Quanta)-[:CITES]->(q)
        WHERE p.year <= {}
        WITH a, apoc.coll.toSet(count(p)) as paper_citations
        MERGE (a)-[m:METRICS_IN]->(y:Year {{year:{}}})
        SET m.max_citations = apoc.coll.max(paper_citations)",
        {{batchSize:5000, iterateList:true, parallel:false}});
        """.format(year, year, year)

        run_query(query, graph)

#author_max_single_paper_citations(years)

### Author-Venue Features

In [None]:
# TODO test
def venue_h_index_stats(years):
    """Placeholder for venue h-index statistics; not implemented, does nothing."""
    return None
  
#venue_h_index_stats(years)

In [None]:
# TODO test
def venue_citation_stats(years):
    """Send one batched venue-citation-statistics query per year in ``years``.

    Each query is an ``apoc.periodic.iterate`` call whose second statement
    sets ``venue_citations_mean``, ``venue_citations_min`` and
    ``venue_citations_max`` on the author's METRICS_IN relationship to that
    year.

    NOTE(review): only the Python string formatting was repaired here. The
    Cypher is unchanged and untested; the ``WHERE`` after the aggregating
    ``WITH`` refers to ``p`` and ``d``, which that ``WITH`` does not carry
    forward - verify before running.
    """
    for year in years:
        # Literal Cypher braces are doubled, and the year is supplied once
        # per placeholder (three cut-off filters plus the Year node lookup).
        query = """
        CALL apoc.periodic.iterate(
        "MATCH (a:Author)-[:AUTHORED]->(q:Quanta)-[:PUBLISHED_IN]->(v:Venue)
        WHERE q.year <= {}
        RETURN [a,v] AS nodes",

        "WITH head(nodes) AS a, last(nodes) AS v
        MATCH (v)<-[:PUBLISHED_IN]-(p:Quanta)<-[:CITES]-(d:Quanta)
        WITH apoc.coll.toList(count(d)) as venue_citations
            // assumes count(d) counts citations per venue and not the same as count(*)
        WHERE p.year <= {} AND d.year <= {}
        MERGE (a)-[m:METRICS_IN]->(y:Year {{year:{}}})
        SET m.venue_citations_mean = apoc.coll.avg(venue_citations),
        m.venue_citations_min = apoc.coll.min(venue_citations),
        m.venue_citations_max = apoc.coll.max(venue_citations)",
        {{batchSize:5000, iterateList:true, parallel:false}});
        """.format(year, year, year, year)

        run_query(query, graph)

#venue_citation_stats(years)

In [None]:
# TODO test
# Get the number of papers in venues in which the author has published
def venue_papers_stats():
    """Send one batched venue-paper-statistics query per year.

    Iterates over the module-level ``years``. Each query is an
    ``apoc.periodic.iterate`` call whose second statement sets
    ``venue_papers_mean``, ``venue_papers_min`` and ``venue_papers_max`` on
    the author's METRICS_IN relationship to that year.

    NOTE(review): only the Python string formatting was repaired here. The
    Cypher is unchanged and untested; the ``WHERE`` after the aggregating
    ``WITH`` refers to ``p``, which that ``WITH`` does not carry forward -
    verify before running.
    """
    for year in years:
        # Literal Cypher braces are doubled, and the year is supplied once
        # per placeholder (two cut-off filters plus the Year node lookup).
        query = """
        CALL apoc.periodic.iterate(
        "MATCH (a:Author)-[:AUTHORED]->(q:Quanta)-[:PUBLISHED_IN]->(v:Venue)
        WHERE q.year <= {}
        RETURN [a,v] AS nodes",
            // this returns list of [author, [venues they published in]]?

        "WITH head(nodes) AS a, last(nodes) AS v
        MATCH (v)<-[:PUBLISHED_IN]-(p:Quanta)
        WITH apoc.coll.toList(count(p)) as venue_papers
            // assumes count(p) counts papers per venue and not the same as count(*)
        WHERE p.year <= {}
        MERGE (a)-[m:METRICS_IN]->(y:Year {{year:{}}})
        SET m.venue_papers_mean = apoc.coll.avg(venue_papers),
        m.venue_papers_min = apoc.coll.min(venue_papers),
        m.venue_papers_max = apoc.coll.max(venue_papers)",
        {{batchSize:5000, iterateList:true, parallel:false}});
        """.format(year, year, year)

        run_query(query, graph)

#venue_papers_stats()

In [None]:
# TODO test
# Stats about maximum number of citations any paper published in a venue
# has received for each venue the author has published in

# TODO fix this query and add min and max and test accuracy

def venue_max_paper_citations_stats(years):
    """Build (but do not run) the venue max-citations query for each year.

    NOTE(review): the original never passed this query to ``run_query``, and
    the query is flagged as wrong in its own comment (venues are never the
    target of CITES). It also writes ``max_citations``, the same property
    name that ``author_max_single_paper_citations`` sets. It is therefore
    left unexecuted; only the string formatting, which previously raised
    inside ``str.format``, was repaired.
    """
    for year in years:
        # Literal Cypher braces are doubled, and the year is supplied once
        # per placeholder (two cut-off filters plus the Year node lookup).
        query = """
        CALL apoc.periodic.iterate(
        "MATCH (a:Author)-[:AUTHORED]->(q:Quanta)-[:PUBLISHED_IN]->(v:Venue)
        WHERE q.year <= {}
        RETURN [a,v] AS nodes",

        "WITH head(nodes) AS a, last(nodes) AS v
        MATCH (p:Quanta)-[:CITES]->(v)
            // todo fix this because venues are never meant to be cited
        WHERE p.year <= {}
        WITH a, apoc.coll.toSet(count(p)) as paper_citations
        MERGE (a)-[m:METRICS_IN]->(y:Year {{year:{}}})
        SET m.max_citations = apoc.coll.max(paper_citations)",
        {{batchSize:5000, iterateList:true, parallel:false}});
        """.format(year, year, year)

#venue_max_paper_citations_stats(years)

In [None]:
# TODO test 
def author_num_venues_published(years):
    """Send one batched venue-count query per year in ``years``.

    Each query is an ``apoc.periodic.iterate`` call whose second statement
    sets ``total_venues`` on the author's METRICS_IN relationship to that
    year.

    NOTE(review): the loop body indentation and the Python string formatting
    were repaired here. The Cypher is unchanged and untested; ``WITH
    DISTINCT v`` does not carry ``a`` forward although the next line uses
    it - verify before running.
    """
    for year in years:
        # Literal Cypher braces are doubled, and the year is supplied once
        # per placeholder (cut-off filter and Year node lookup).
        query = """
        CALL apoc.periodic.iterate(
        "MATCH (a:Author)-[:AUTHORED]->(q:Quanta)-[:PUBLISHED_IN]->(v:Venue)
        WHERE q.year <= {}
        WITH DISTINCT v
        RETURN [a,count(v)] AS info",
            //assumes count(v) only counts venues and not same as count(*)

        "WITH head(info) AS a, last(info) AS count
        MERGE (a)-[m:METRICS_IN]->(y:Year {{year:{}}})
        SET m.total_venues = count",
        {{batchSize:5000, iterateList:true, parallel:false}});
        """.format(year, year)

        run_query(query, graph)
    
#author_num_venues_published(years)

In [None]:
def test_author_num_venues_published(author, year):
    """ Takes in a normalized name and returns total number of venues published in""" 

    query = """
    MATCH (a:Author {{normalizedName: '{}'}})-[:AUTHORED]->(q:Quanta) 
    WHERE q.year <= {}
    WITH a, q
    MATCH (q)-[:PUBLISHED_IN]->(v:Venue)
    RETURN a.name as name, count(distinct v) as num_venues, count(q) as num_papers
    """.format(author, year)

    df = run_query(query, graph, to_df=True)
    return df

# Spot-check for a single author (normalized name) with cut-off year 2018;
# this sends a query to the database when the cell runs, and the bare name
# on the last line displays the result as the cell output.
auth_num_venues = test_author_num_venues_published('edward s boyden', 2018)
auth_num_venues

### Old

In [None]:
# Legacy version of the per-year metric queries, kept under the "Old" heading.
# For every year i it builds each metric query by string concatenation and
# sends it through run_query, writing properties on METRICS_IN relationships.
# NOTE(review): this cell cannot run as written:
#   * `start_year` is not assigned anywhere in the visible part of the notebook;
#   * the "Venue Papers" section further down is indented one level deeper
#     than the statements around it, which Python rejects.
# Several queries also carry `//` Cypher comments inside the quoted statements
# passed to apoc.periodic.iterate.
for i in range(start_year, 2019):

    # Author H-index
    # Assumes that every author has at least a single citation on the papers they write
    
    query = """
    CALL apoc.periodic.iterate(
    "MATCH (a:Author)-[:AUTHORED]->(q:Quanta) 
    WHERE q.year <= """+str(i)+"""
    RETURN [a,q] AS nodes LIMIT 10",
        // Why limit 10?
    
    "WITH head(nodes) AS a, last(nodes) AS q
    MATCH (p:Quanta)-[:CITES]->(q) WHERE p.year <= """+str(i)+"""
    
    WITH count(p) as citations, a, collect(citations) as paper_citations 
        // RETURN a, paper_citations
        // ** can we do collect(count(p)) in one step and will this save time?)
        //assuming that doing count(p) only counts the number of citations for each independent quanta
    WITH a, apoc.coll.sort(paper_citations) as inverse, apoc.coll.reverse(inverse) as ordered_citations
        //assuming first sort gives them ascending
        // ordered citation is list of integers
    WITH a, apoc.coll.toList([x in ordered_citations | {index: apoc.coll.indexOf(ordered_citations, x), value: x}]) as indexed_citations
    WITH a, apoc.coll.toList([x in indexed_citations WHERE x.index-x.value<0 | x.index]) as filtered
        //because indexes start at zero
    MERGE (a)-[m:METRICS_IN]->(y:Year {year:"""+str(i)+"""})
        // year is right property here?
    SET m.hindex = head(filtered)+1",
    {batchSize:5000, iterateList:true, parallel:false});
    """
    
    run_query(query, graph)
    
    # Author Citation Count
    
    query = """
    CALL apoc.periodic.iterate("
    MATCH (a:Author)-[:AUTHORED]->(q:Quanta)
    WHERE q.year <= """+str(i)+"""
    RETURN [a,q] AS nodes
    ","
    WITH head(nodes) AS a, last(nodes) AS q
    MERGE (a)-[m:METRICS_IN]->(y:Year {year:"""+str(i)+"""})
    SET m.total_citations = size((p:Quanta)-[:CITES]->(q) WHERE p.year <= """+str(i)+""")",
    {batchSize:5000, iterateList:true, parallel:false});
    """
    
    run_query(query, graph)
    
    # Author Mean Citations per Paper
    
    query = """
    CALL apoc.periodic.iterate("
    MATCH (a:Author)-[:AUTHORED]->(q:Quanta)
    WHERE q.year <= """+str(i)+"""
    RETURN [a,q,count(*)] AS info
        // count(*) counts null values - we are trying to count q I think
    ","
    WITH info[0] AS a, info[1] AS q, info[2] as papers_written
        // Where do we use papers_written in the calculation below?
    MERGE (a)-[m:METRICS_IN]->(y:Year {year:"""+str(i)+"""})
    SET m.citations_per_paper = size((p:Quanta)-[:CITES]->(q) WHERE p.year <= """+str(i)+""")/papers_written",
    {batchSize:5000, iterateList:true, parallel:false});
    """
    
    run_query(query, graph)
    
    # Author Mean Citations per Year
    
    query = """
    CALL apoc.periodic.iterate("
    MATCH (a:Author)-[:AUTHORED]->(q:Quanta)
    WHERE q.year <= """+str(i)+"""
    WITH a, q, collect(q.year) as years, apoc.coll.min(years) as start_year
    RETURN [a,q, start_year] AS info
    ","
    WITH info[0] AS a, info[1] AS q, info[2] as start_year
    MERGE (a)-[m:METRICS_IN]->(y:Year {year:"""+str(i)+"""})
    SET m.citations_per_year = size((p:Quanta)-[:CITES]->(q) WHERE p.year <= """+str(i)+""")/(2019-start_year)",
    {batchSize:5000, iterateList:true, parallel:false});
    """
    
    run_query(query, graph)
    
    # Author Papers
    
    query = """
    CALL apoc.periodic.iterate("
    MATCH (a:Author)-[:AUTHORED]->(q:Quanta)
    WHERE q.year <= """+str(i)+"""
    RETURN [a,count(*)] AS info
        // do we need count(q) here?
    ","
    WITH head(info) AS a, last(info) AS count
    MERGE (a)-[m:METRICS_IN]->(y:Year {year:"""+str(i)+"""})
    SET m.total_papers = count",
    {batchSize:5000, iterateList:true, parallel:false});
    """
    
    run_query(query, graph)
    
    # Author Age
    
    query = """
    CALL apoc.periodic.iterate("
    MATCH (a:Author)-[:AUTHORED]->(q:Quanta)
    WHERE q.year <= """+str(i)+"""
    WITH a, collect(q.year) as years, apoc.coll.min(years) as start_year
    RETURN [a, start_year] AS info
    ","
    WITH info[0] AS a, info[1] AS start_year
    MERGE (a)-[m:METRICS_IN]->(y:Year {year:"""+str(i)+"""})
    SET m.author_age = 2019-start_year",
    {batchSize:5000, iterateList:true, parallel:false});
    """
    
    run_query(query, graph)
    
    # Author's Recent Coauthor
    # Not sure how this reacts to the fact that coauthors are added over time
    
    query = """
    CALL apoc.periodic.iterate("
    MATCH (a:Author)-[:AUTHORED]->(q:Quanta)
    WHERE q.year = """+str(i)+""" OR q.year = """+str(i-1)+"""
    RETURN [a,q] AS nodes
    ","
    WITH head(nodes) AS a, last(nodes) AS q
    MERGE (a)-[m:METRICS_IN]->(y:Year {year:"""+str(i)+"""})
    SET m.recent_coauthor_count = size(MATCH (b:Author)-[:AUTHORED]->(q) WHERE b.name != a.name)",
    {batchSize:5000, iterateList:true, parallel:false});
    """
    
    run_query(query, graph)
    
    # Author's Max Citations on Single Paper
    
    query = """
    CALL apoc.periodic.iterate("
    MATCH (a:Author)-[:AUTHORED]->(q:Quanta)
    WHERE q.year <= """+str(i)+"""
    RETURN [a,q] AS nodes
    ","
    WITH head(nodes) AS a, last(nodes) AS q
    MATCH (p:Quanta)-[:CITES]->(q)
    WHERE p.year <= """+str(i)+"""
    WITH a, apoc.coll.toSet(count(p)) as paper_citations
    MERGE (a)-[m:METRICS_IN]->(y:Year {year:"""+str(i)+"""})
    SET m.max_citations = apoc.coll.max(paper_citations)",
    {batchSize:5000, iterateList:true, parallel:false});
    """
    
    run_query(query, graph)
    
    # Author's Venues' H-indexes
    
    # Venue Citations
    
    query = """
    CALL apoc.periodic.iterate("
    MATCH (a:Author)-[:AUTHORED]->(q:Quanta)-[:PUBLISHED_IN]->(v:Venue)
    WHERE q.year <= """+str(i)+"""
    RETURN [a,v] AS nodes
    ","
    WITH head(nodes) AS a, last(nodes) AS v
    MATCH (v)<-[:PUBLISHED_IN]-(p:Quanta)<-[:CITES]-(d:Quanta)
    WITH apoc.coll.toList(count(d)) as venue_citations
        // assumes count(d) counts citations per venue and not the same as count(*)
    WHERE p.year <= """+str(i)+""" AND d.year <= """+str(i)+"""
    MERGE (a)-[m:METRICS_IN]->(y:Year {year:"""+str(i)+"""})
    SET m.venue_citations = [apoc.coll.avg(venue_citations), apoc.coll.min(venue_citations), apoc.coll.max(venue_citations)]",
    {batchSize:5000, iterateList:true, parallel:false});
    """
    
    run_query(query, graph)
    
    # NOTE(review): the section below is indented one level deeper than the
    # rest of the loop body, which is an "unexpected indent" error in Python.
        # Venue Papers

        query = """
        CALL apoc.periodic.iterate("
        MATCH (a:Author)-[:AUTHORED]->(q:Quanta)-[:PUBLISHED_IN]->(v:Venue)
        WHERE q.year <= """+str(i)+"""
        RETURN [a,v] AS nodes
        ","
        WITH head(nodes) AS a, last(nodes) AS v
        MATCH (v)<-[:PUBLISHED_IN]-(p:Quanta)
        WITH apoc.coll.toList(count(p)) as venue_papers
            // assumes count(p) counts papers per venue and not the same as count(*)
        WHERE p.year <= """+str(i)+""" 
        MERGE (a)-[m:METRICS_IN]->(y:Year {year:"""+str(i)+"""})
        SET m.venue_papers = [apoc.coll.avg(venue_papers), apoc.coll.min(venue_papers), apoc.coll.max(venue_papers)]",
        {batchSize:5000, iterateList:true, parallel:false});
        """

        run_query(query, graph)
    
    # Venue Max Citations
    
    # Total Number of Venues
    
    query = """
    CALL apoc.periodic.iterate("
    MATCH (a:Author)-[:AUTHORED]->(q:Quanta)-[:PUBLISHED_IN]->(v:Venue)
    WHERE q.year <= """+str(i)+"""
    WITH DISTINCT v
    RETURN [a,count(v)] AS info
        //assumes count(v) only counts venues and not same as count(*)
    ","
    WITH head(info) AS a, last(info) AS count
    MERGE (a)-[m:METRICS_IN]->(y:Year {year:"""+str(i)+"""})
    SET m.total_venues = count",
    {batchSize:5000, iterateList:true, parallel:false});
    """
    
    run_query(query, graph)


In [None]:
#def author_hindex(value):
#    MATCH (a:Author)-[:AUTHORED]->(q:Quanta)-[:PUBLISHED_IN]->(y:Year {year: value})
#    MATCH (p:Quanta)-[:CITES]->(q)
#    WITH y, count(p) as citations, a, collect(citations) as paper_citations
#    WITH y, a, apoc.coll.sort(paper_citations) as inverse, apoc.coll.reverse(inverse) as ordered_citations
#    WITH y, a, apoc.coll.toList([x in ordered_citations | {index: apoc.coll.indexOf(ordered_citations, x), value: x}]) as indexed_citations
#    WITH y, a, apoc.coll.toList([x in indexed_citations WHERE x.index-x.value<0 | x.index]) as filtered
#    RETURN [a, y, filtered[0] + 1] as result

#def mean_citations_year(value):
#    MATCH (a:Author)-[:AUTHORED]->(q:Quanta)-[:PUBLISHED_IN]->(y:Year {year: value})
#    MATCH (p:Quanta)-[:CITES]->(q)
#    RETURN count(p)/count(q) as mean

#def papers_published_year(value):
#    MATCH (a:Author)-[:AUTHORED]->(q:Quanta)-[:PUBLISHED_IN]->(y:Year {year: value})
#    RETURN count(q)

### Get features for Venue

In [None]:
def venue_hindex(v, value):
    # NOTE(review): the body below is raw Cypher, not Python, so this cell
    # cannot be executed as written; it reads as a sketch of the query a
    # venue h-index helper would run. `value` appears only as the year in
    # the Year property map, and `v` is re-bound by the MATCH pattern.
    # The steps mirror the author h-index query: count citing papers, sort
    # the counts in descending order, pair each with its index, keep the
    # indexes that are smaller than their count, and return [v, y, filtered[0] + 1].
    MATCH (v:Venue)<-[:PUBLISHED_IN]-(q:Quanta)-[:PUBLISHED_IN]->(y:Year {year: value})
    MATCH (p:Quanta)-[:CITES]->(q)
    WITH y, count(p) as citations, v, collect(citations) as paper_citations
    WITH y, v, apoc.coll.sort(paper_citations) as inverse, apoc.coll.reverse(inverse) as ordered_citations
    WITH y, v, apoc.coll.toList([x in ordered_citations | {index: apoc.coll.indexOf(ordered_citations, x), value: x}]) as indexed_citations
    WITH y, v, apoc.coll.toList([x in indexed_citations WHERE x.index-x.value<0 | x.index]) as filtered
    RETURN [v, y, filtered[0] + 1] as result
    
#def venue_paper_citations(v, value):
#    MATCH (v:Venue)<-[:PUBLISHED_IN]-(q:Quanta)<-[:CITES]-(:Quanta)<-[:PUBLISHED_IN]-(y:Year)
#    WHERE y.year <= value
#    WITH count(*)/count(q) as venue_avg
    
#def venue_papers(v, value):
#    MATCH (v:Venue)<-[:PUBLISHED_IN]-(q:Quanta)<-[:PUBLISHED_IN]-(y:Year)
#    WHERE y.year <= value
#    WITH count(q) as venue_papers
    
def venue_papers_max(v):
    # NOTE(review): the body below is raw Cypher, not Python, so this cell
    # cannot be executed as written. It matches papers citing papers
    # published in a venue and returns the maximum of the collected counts.
    # The notes cell calls this helper `venue_paper_max` (no "s") - confirm
    # which name is intended.
    MATCH (v:Venue)<-[:PUBLISHED_IN]-(:Quanta)<-[:CITES]-(q:Quanta)
    WITH apoc.coll.toSet(count(q)) as counts
    RETURN apoc.coll.max(counts) as max

In [None]:
########## CALCULATING METRICS ##########
# AUTHOR HINDEX

MATCH (a:Author)-[:AUTHORED]->(q:Quanta)
MATCH (p:Quanta)-[:CITES]->(q)
WITH count(p) as citations, a, collect(citations) as paper_citations
WITH apoc.coll.sort(paper_citations) as inverse, apoc.coll.reverse(inverse) as ordered_citations
WITH apoc.coll.toList([x in ordered_citations | {index: apoc.coll.indexOf(ordered_citations, x), value: x}]) as indexed_citations
WITH apoc.coll.toList([x in indexed_citations WHERE x.index-x.value<0 | x.index]) as filtered
RETURN filtered[0] + 1 as hindex

# AUTHOR HINDEX DELTA
# can implement after doing the first one

CALL author_hindex(value) YIELD result[0] as a, result[1] as y, result[2] as hindex
CALL author_hindex(value+1) YIELD result[2] as second_hindex
RETURN hindex-second_hindex as hindex_delta

# AUTHOR CITATION COUNT

MATCH (a:Author)-[:AUTHORED]->(q:Quanta)
MATCH (p:Quanta)-[:CITES]->(q)
RETURN count(*)

# KEY CITATION COUNT

(need additional paper)

# AUTHOR CITATIONS DELTA
# can implement after first one

MATCH (a:Author)-[:AUTHORED]->(q:Quanta)-[:PUBLISHED_IN]->(y:Year)
WHERE y.year <= 2018
MATCH (p:Quanta)-[:CITES]->(q)
RETURN count(p)

# AUTHOR KEY CITATIONS DELTA
# can implement after first one

(need additional paper)

# AUTHOR MEAN CITATIONS PER PAPER

MATCH (a:Author)-[:AUTHORED]->(q:Quanta)
MATCH (p:Quanta)-[:CITES]->(q)
RETURN count(p)/count(q)

# AUTHOR MEAN CITATION PER PAPER DELTA
# can implement after first one

CALL mean_citations_year(value) YIELD mean as old
CALL mean_citations_year(value+1) YIELD mean as new
RETURN new-old

# AUTHOR MEAN CITATIONS PER YEAR

MATCH (a:Author)-[:AUTHORED]->(q:Quanta)-[:PUBLISHED_IN]->(y:Year)
MATCH (p:Quanta)-[:CITES]->(q)
WITH count(p) as citations, apoc.coll.toSet(y) as years, apoc.coll.min(years) as start_year
RETURN citations/(2019-start_year)

# AUTHOR PAPERS

MATCH (a:Author)-[:AUTHORED]->(q:Quanta)
RETURN count(q)

# AUTHOR PAPERS DELTA
# can implement after first one

CALL papers_published_year(value) YIELD mean as old
CALL papers_published_year(value+1) YIELD mean as new
RETURN new-old

# AUTHOR MEAN CITATION RANK 
# rank of author (between 0 and 1) among all other authors in terms of mean citations per year

(can be implemented after every author has their mean citations per year calculated)

# AUTHOR UNWEIGHTED PAGERANK (by coauthorship)

(already have base code)

# AUTHOR WEIGHTED PAGERANK (by coauthorship)

(already have base code)

# AUTHOR AGE

MATCH (a:Author)-[:AUTHORED]->(q:Quanta)-[:PUBLISHED_IN]->(y:Year)
WITH apoc.coll.toSet(y) as years
RETURN apoc.coll.min(years) as author_age

# AUTHOR RECENT NUM COAUTHORS

MATCH (a:Author)-[:AUTHORED]->(q:Quanta)-[:PUBLISHED_IN]->(y:Year)
WHERE year >= 2018
MATCH (b:Author)-[:AUTHORED]->(q)
WHERE b.name != a.name
RETURN count(b)

# AUTHOR MAX SINGLE PAPER CITATIONS

MATCH (a:Author)-[:AUTHORED]->(q:Quanta)<-[:CITES]-(p:Quanta)
WITH apoc.coll.toSet(count(p)) as paper_citations
RETURN apoc.coll.max(paper_citations) as max

# VENUE HINDEX

MATCH (a:Author)-[:AUTHORED]->(:Quanta)-[:PUBLISHED_IN]->(v:Venue)
CALL venue_hindex(v, value) YIELD apoc.coll.toList(result[2]) as hindexes
RETURN [apoc.coll.avg(hindexes), apoc.coll.min(hindexes), apoc.coll.max(hindexes)]

# VENUE HINDEX DELTA
# can implement after first one

MATCH (a:Author)-[:AUTHORED]->(:Quanta)-[:PUBLISHED_IN]->(v:Venue)
CALL venue_hindex(v, value) YIELD apoc.coll.toList({venue: v, index: result[2]}) as old_hindexes
CALL venue_hindex(v, value+1) YIELD apoc.coll.toList({venue: v, index: result[2]}) as new_hindexes
WITH apoc.coll.toList([x IN old_hindexes and y IN new_hindexes WHERE v.venue = y.venue | y.index-x.index]) as differences
RETURN [apoc.coll.avg(differences), apoc.coll.min(differences), apoc.coll.max(differences)]

# VENUE CITATIONS

MATCH (a:Author)-[:AUTHORED]->(:Quanta)-[:PUBLISHED_IN]->(v:Venue)<-[:PUBLISHED_IN]-(:Quanta)<-[:CITES]-(q:Quanta)
WITH apoc.coll.toList(count(q)) as venue_citations
RETURN [apoc.coll.avg(venue_citations), apoc.coll.min(venue_citations), apoc.coll.max(venue_citations)]


# VENUE CITATIONS DELTA
# can implement after first one

MATCH (a:Author)-[:AUTHORED]->(:Quanta)-[:PUBLISHED_IN]->(v:Venue)
CALL venue_paper_citations(v, value) YIELD apoc.coll.toList({venue: v, avg: venue_avg}) as old_avg
CALL venue_paper_citations(v, value+1) YIELD apoc.coll.toList({venue: v, avg: venue_avg}) as new_avg
WITH apoc.coll.toList([x IN old_avg and y IN new_avg WHERE v.venue = y.venue | y.avg-x.avg]) as differences
RETURN [apoc.coll.avg(differences), apoc.coll.min(differences), apoc.coll.max(differences)]

# VENUE PAPERS

MATCH (a:Author)-[:AUTHORED]->(:Quanta)-[:PUBLISHED_IN]->(v:Venue)<-[:PUBLISHED_IN]-(q:Quanta)
WITH apoc.coll.toList(count(q)) as venue_citations
RETURN [apoc.coll.avg(venue_citations), apoc.coll.min(venue_citations), apoc.coll.max(venue_citations)]

#VENUE PAPERS DELTA
# can implement after first one

MATCH (a:Author)-[:AUTHORED]->(:Quanta)-[:PUBLISHED_IN]->(v:Venue)
CALL venue_paper_citations(v, value) YIELD apoc.coll.toList({venue: v, count: venue_papers}) as old_count
CALL venue_paper_citations(v, value+1) YIELD apoc.coll.toList({venue: v, count: venue_papers}) as new_count
WITH apoc.coll.toList([x IN old_count and y IN new_count WHERE v.venue = y.venue | y.count-x.count]) as differences
RETURN [apoc.coll.avg(differences), apoc.coll.min(differences), apoc.coll.max(differences)]

# VENUE RANK 
# venue rank {mean, min, max}: ranks of venues (between 0 and 1) in which the author has published, determined by mean number of citations per paper

(can be implemented after every author has their mean citations per year calculated)

# VENUE MAX SINGLE PAPER CITATIONS

MATCH (a:Author)-[:AUTHORED]->(:Quanta)-[:PUBLISHED_IN]->(v:Venue)
CALL venue_paper_max(v) YIELD apoc.coll.toList(max) as maxes
RETURN [apoc.coll.avg(maxes), apoc.coll.min(maxes), apoc.coll.max(maxes)]

# TOTAL NUMBER OF VENUES

MATCH (a:Author)-[:AUTHORED]->(:Quanta)-[:PUBLISHED_IN]->(v:Venue)
WITH DISTINCT v
RETURN count(v)


In [None]:
# TODO test 
def author_mean_citations_per_paper_original(years):
    """Store each author's mean citations per paper for every year in ``years``.

    For each year one ``apoc.periodic.iterate`` Cypher statement is built and
    passed to ``run_query(query, graph)``.  The statement:

    1. collects, per ``Author``, the ``Quanta`` they authored with
       ``year <= <year>``;
    2. counts the ``CITES`` relationships pointing at those papers from
       ``Quanta`` with ``year <= <year>``;
    3. writes ``citations / papers_written`` (as a float) to the
       ``citations_per_paper`` property of the author's ``METRICS_IN``
       relationship to the ``Year {year: <year>}`` node.

    Args:
        years: iterable of integer years; one query is run per year.

    Returns:
        None.
    """
    # A single named placeholder is used for every occurrence of the year, so
    # the number of format arguments cannot drift out of step with the number
    # of placeholders in the template.
    query_template = """
        CALL apoc.periodic.iterate(
        "MATCH (a:Author)-[:AUTHORED]->(q:Quanta)
        WHERE q.year <= {year}
        RETURN a, collect(q) AS papers",

        "WITH a, papers, size(papers) AS papers_written
        UNWIND papers AS q
        OPTIONAL MATCH (q)<-[r:CITES]-(p:Quanta) WHERE p.year <= {year}
        WITH a, papers_written, count(r) AS citations
        MERGE (a)-[m:METRICS_IN]->(y:Year {{year:{year}}})
        SET m.citations_per_paper = toFloat(citations) / papers_written",
        {{batchSize:5000, iterateList:true, parallel:false}});
        """
    # Notes on the Cypher above:
    #  * papers are collected per author so that size(papers) is the number of
    #    papers written (a count(*) next to a and q would always be 1);
    #  * papers_written is carried through every WITH so it is still in scope
    #    for the final SET;
    #  * OPTIONAL MATCH keeps authors whose papers have no citations yet, so
    #    they get 0.0 rather than no metric at all;
    #  * toFloat() avoids integer division truncating the mean.
    for year in years:
        run_query(query_template.format(year=year), graph)

#author_mean_citations_per_paper(years)

# Testing Functions

In [None]:
# H-index

MATCH (a:Author {normalizedName: 'edward s boyden'})-[:AUTHORED]->(q:Quanta)
WHERE q.year <= 2008 
WITH a, collect(q) as papers
WITH a, papers, size(papers) as papers_written
UNWIND papers as paper
MATCH (p:Quanta)-[:CITES]->(paper) 
WHERE p.year <= 2008
WITH DISTINCT a.name as name, papers_written, count(p) as citations 
RETURN name, citations, papers_written

// H-index of a literal citation list; expected result for this input: 3.
// There is no MATCH clause: the input is a literal, so nothing has to be read
// from the graph (a leading MATCH () would repeat the result once per node).
WITH [0, 2, 5, 6, 3, 8] AS paper_citations
WITH apoc.coll.reverse(apoc.coll.sort(paper_citations)) AS ordered_citations
// Ranks come from range() rather than apoc.coll.indexOf, so papers with equal
// citation counts each keep their own position. The list is in descending
// order, so the positions where citations > index form a prefix whose length
// is the h-index. range(0, -1) is empty, so an empty input yields 0.
RETURN size([i IN range(0, size(ordered_citations) - 1) WHERE ordered_citations[i] > i]) AS h_index

// H-index of an empty citation list; expected result: 0.
// There is no MATCH clause: the input is a literal, so nothing has to be read
// from the graph (a leading MATCH () would repeat the result once per node).
WITH [] AS paper_citations
WITH apoc.coll.reverse(apoc.coll.sort(paper_citations)) AS ordered_citations
// range(0, -1) is empty, so the comprehension is empty and size() returns 0
// without needing a CASE on the empty list. Ranks come from range() rather
// than apoc.coll.indexOf so that tied citation counts keep distinct positions.
RETURN size([i IN range(0, size(ordered_citations) - 1) WHERE ordered_citations[i] > i]) AS h_index

// H-index when every paper has zero citations; expected result: 0.
// There is no MATCH clause: the input is a literal, so nothing has to be read
// from the graph (a leading MATCH () would repeat the result once per node).
WITH [0, 0, 0] AS paper_citations
WITH apoc.coll.reverse(apoc.coll.sort(paper_citations)) AS ordered_citations
// No position satisfies citations > index here, so the comprehension is empty
// and size() returns 0 (last([]) + 1 would return null instead). Ranks come
// from range() rather than apoc.coll.indexOf, which reports the first
// occurrence only and so gives every tied value the same position.
RETURN size([i IN range(0, size(ordered_citations) - 1) WHERE ordered_citations[i] > i]) AS h_index

// H-index of a literal citation list; expected result for this input: 3.
// There is no MATCH clause: the input is a literal, so nothing has to be read
// from the graph (a leading MATCH () would repeat the result once per node).
WITH [0, 2, 5, 6, 3, 8] AS paper_citations
WITH apoc.coll.reverse(apoc.coll.sort(paper_citations)) AS ordered_citations
// Sorted descending this is [8, 6, 5, 3, 2, 0]; positions 0..2 satisfy
// citations > index, so the result is 3. Ranks come from range() rather than
// apoc.coll.indexOf so tied citation counts keep distinct positions, and
// size() returns 0 (not null) when no position qualifies.
RETURN size([i IN range(0, size(ordered_citations) - 1) WHERE ordered_citations[i] > i]) AS h_index

########## Failed ###########

MATCH ()
WITH [] as paper_citations 
RETURN CASE paper_citaions
WHEN []
THEN 0
ELSE 
WITH apoc.coll.sort(paper_citations) as inverse # <- FAILED HERE
WITH apoc.coll.reverse(inverse) as ordered_citations
WITH apoc.convert.toList([x in ordered_citations | {index: apoc.coll.indexOf(ordered_citations, x), value: x}]) as indexed_citations
WITH apoc.convert.toList([x in indexed_citations WHERE x.index-x.value<0 | x.index]) as filtered
RETURN last(filtered) + 1
END