# Setup

In [None]:
from py2neo import Graph, Node, Relationship 
from tqdm import tqdm
import glob, os, time

# Connection settings are read from the environment when present so that
# credentials do not have to live in the notebook. The fallbacks are the
# original Test DB values, so behaviour is unchanged when nothing is set.
# Deployed DB URI (previously a commented-out alternative): bolt://54.174.175.98:7687
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://54.88.167.164:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "myneo")

graph = Graph(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

# Report the size of the store as a quick connectivity check.
primitive_counts = graph.database.primitive_counts
n_nodes = primitive_counts['NumberOfNodeIdsInUse']
n_relationships = primitive_counts['NumberOfRelationshipIdsInUse']
print("Connected to graph database with {:,} nodes and {:,} relationships!".format(n_nodes, n_relationships))

# Utility 

In [None]:
def query_to_df(query, graph):
    """Run ``query`` on ``graph`` and return the result as a DataFrame.

    Prints a start marker and, once the result has been materialised, the
    elapsed wall-clock time in minutes.

    Args:
        query: Cypher text handed to ``graph.run``.
        graph: object exposing ``run(query)`` whose result offers ``to_data_frame()``.

    Returns:
        The value produced by ``graph.run(query).to_data_frame()``.
    """
    print("Starting query...", end=" ")
    started_at = time.time()
    cursor = graph.run(query)
    frame = cursor.to_data_frame()
    elapsed_minutes = (time.time() - started_at) / 60
    print("Done ({:.2f} minutes).".format(elapsed_minutes))
    return frame

In [None]:
def run_query(query, graph, print_query=False, run_query=True, print_only=False, to_df=False, verbose=True):
    """Optionally print and/or execute a Cypher query.

    Args:
        query: Cypher text.
        graph: object exposing ``run(query)``; its result must offer
            ``to_data_frame()`` when ``to_df`` is true.
        print_query: print the query text before doing anything else.
        run_query: execute the query (this flag shadows the function name
            inside the body, which is harmless because there is no recursion).
        print_only: shorthand for ``print_query=True, run_query=False``.
        to_df: convert the result to a DataFrame and return it.
        verbose: print the elapsed time after the query has run.

    Returns:
        The DataFrame when the query ran with ``to_df=True``; otherwise the
        legacy sentinel ``1``.
    """
    if print_only:
        print_query = True
        run_query = False

    if print_query:
        print(query)

    result = 1  # legacy sentinel returned whenever no DataFrame is produced
    if not run_query:
        # Nothing was executed, so there is no timing to report. The previous
        # version raised UnboundLocalError here when verbose was left on.
        return result

    start_time = time.time()
    cursor = graph.run(query)
    if to_df:
        result = cursor.to_data_frame()
    minutes_elapsed = (time.time() - start_time) / 60

    if verbose:
        print("Query completed in {:.2f} minutes.".format(minutes_elapsed))
    return result

In [None]:
# Needed to drop index on y.year first
def make_year_constraint():
    """Create a uniqueness constraint on ``Year.year`` using the module-level ``graph``."""
    cypher = """
    CREATE CONSTRAINT ON (y:Year) ASSERT y.year IS UNIQUE
    """
    run_query(cypher, graph)
    
#make_year_constraint()

In [None]:
# All the above functions (minus making year constraint) in one cell for easy ipython import
from py2neo import Graph, Node, Relationship 
from tqdm import tqdm
import glob, os, time

# Test DB by default; override through NEO4J_URI / NEO4J_USER / NEO4J_PASSWORD
# so credentials do not have to be edited into the notebook.
graph = Graph(
    os.environ.get("NEO4J_URI", "bolt://54.88.167.164:7687"),
    auth=(os.environ.get("NEO4J_USER", "neo4j"), os.environ.get("NEO4J_PASSWORD", "myneo")),
)

def query_to_df(query, graph):
    """Run ``query`` on ``graph`` and return the result as a DataFrame.

    Prints a start marker and then the elapsed wall-clock time in minutes.

    Args:
        query: Cypher text handed to ``graph.run``.
        graph: object exposing ``run(query)`` whose result offers ``to_data_frame()``.

    Returns:
        The value produced by ``graph.run(query).to_data_frame()``.
    """
    print("Starting query...", end=" ")
    started_at = time.time()
    cursor = graph.run(query)
    frame = cursor.to_data_frame()
    elapsed_minutes = (time.time() - started_at) / 60
    print("Done ({:.2f} minutes).".format(elapsed_minutes))
    return frame

def run_query(query, graph, print_query=False, run_query=True, print_only=False, to_df=False, verbose=True):
    """Optionally print and/or execute a Cypher query.

    Args:
        query: Cypher text.
        graph: object exposing ``run(query)``; its result must offer
            ``to_data_frame()`` when ``to_df`` is true.
        print_query: print the query text before doing anything else.
        run_query: execute the query (shadows the function name locally).
        print_only: shorthand for ``print_query=True, run_query=False``.
        to_df: convert the result to a DataFrame and return it.
        verbose: print the elapsed time after the query has run.

    Returns:
        The DataFrame when the query ran with ``to_df=True``; otherwise the
        legacy sentinel ``1``.
    """
    if print_only:
        print_query = True
        run_query = False

    if print_query:
        print(query)

    result = 1  # legacy sentinel returned whenever no DataFrame is produced
    if not run_query:
        # Nothing was executed, so there is no timing to report. The previous
        # version raised UnboundLocalError here when verbose was left on.
        return result

    start_time = time.time()
    cursor = graph.run(query)
    if to_df:
        result = cursor.to_data_frame()
    minutes_elapsed = (time.time() - start_time) / 60

    if verbose:
        print("Query completed in {:.2f} minutes.".format(minutes_elapsed))
    return result

In [None]:
def create_metrics_in_relt(years):
    """Create (:Author)-[:METRICS_IN]->(:Year) relationships for each year.

    For every ``year`` a batched APOC job selects the distinct authors with at
    least one Quanta whose ``year`` is <= that year and CREATEs a METRICS_IN
    relationship to the matching Year node. Because CREATE is used, running
    the job again adds another relationship.

    Args:
        years: iterable of integer years.
    """
    template = """
        CALL apoc.periodic.iterate(
        
        "MATCH (a:Author)-[:AUTHORED]->(q:Quanta)
        WHERE q.year <= $year
        RETURN DISTINCT a",
       
        "MATCH (y:Year {{year: $year}})
        CREATE (a)-[:METRICS_IN]->(y)",
        
        {{batchSize:5000, iterateList:true, parallel:true, params: {{year: {}}} }});
        """
    for current_year in years:
        print('Running query for year {}'.format(str(current_year)))
        # The summary frame is requested but intentionally not kept.
        run_query(template.format(current_year), graph, to_df=True)

#create_metrics_in_relt(years)
#metrics_data

In [None]:
# Years passed to the metric writers below: 2009 through 2019 inclusive.
years = range(2009, 2020)

# Write (Author)-[:METRICS_IN]-(Year) Metrics 

## Done on Test DB

In [None]:
def get_num_properties(prop):
    """Count (:Author)-[:METRICS_IN]->(:Year) relationships that have ``prop`` set.

    Args:
        prop: name of the relationship property to test, e.g. ``'total_citations'``.
            It is interpolated into the Cypher text, so pass trusted names only.

    Returns:
        The DataFrame returned by ``run_query`` (one row holding ``count(*)``).
    """
    # The placeholder was previously never filled in, so the literal text
    # ``m.{}`` was sent to the server, and the result was dropped.
    query = """
    MATCH (:Author)-[m:METRICS_IN]->(y:Year)
    WHERE exists(m.{})
    RETURN count(*)
    """.format(prop)

    return run_query(query, graph, to_df=True)
    
# NOTE(review): `prop` is not assigned in any cell above; set it to the
# METRICS_IN property name to check before running this cell — TODO confirm.
num_existing_property = get_num_properties(prop)
num_existing_property

## Running on Test DB

In [None]:
# Running on cy1
def author_citation_count(years):
    """ Total number of citations author has received
        Adds m.total_citations to (:Author)-[m:METRICS_IN]->(:Year)

    For each year, counts citing Quanta (``p.year <= year``) of every Quanta
    the author wrote up to that year and stores the count on the author's
    METRICS_IN relationship to that Year.

    Args:
        years: iterable of integer years.
    """

    for year in years:
        print('Running query for year {} for {}'.format(str(year), author_citation_count.__name__))
        # last(info) extracts the scalar count; tail(info) would return the
        # one-element list [count] and store a list on the relationship.
        query = """
        CALL apoc.periodic.iterate(

        "MATCH (a:Author)-[:AUTHORED]->(q:Quanta)
        WHERE q.year <= $year
        OPTIONAL MATCH (q)<-[:CITES]-(p:Quanta)
        WHERE p.year <= $year
        RETURN [a, count(p)] as info",

        "WITH head(info) as a, last(info) as citations
        MATCH (y:Year {{year:$year}})<-[m:METRICS_IN]-(a)
        SET m.total_citations = citations",

        {{batchSize:5000, iterateList:true, parallel:true, params: {{year: {}}} }});
        """.format(year)

        run_query(query, graph)

#author_citation_count(years)

## TODO

In [None]:
def author_mean_citations_per_paper(years):
    """Write ``m.mean_citations_per_paper`` on each author's METRICS_IN relationship.

    For each year the mean is (citations received up to that year) divided by
    (distinct papers authored up to that year).

    Args:
        years: iterable of integer years.
    """
    for year in years:
        print('Running query for year {}'.format(str(year)))
        # Aggregation happens in the driving query: Cypher does not allow
        # aggregate functions inside SET, and per-row aggregation in the
        # action would not see all of an author's papers. toFloat() is applied
        # before dividing so the mean is not truncated by integer division.
        query = """
        CALL apoc.periodic.iterate(

        "MATCH (a:Author)-[:AUTHORED]->(q:Quanta)
        WHERE q.year <= $year
        OPTIONAL MATCH (q)<-[:CITES]-(p:Quanta)
        WHERE p.year <= $year
        WITH a, count(DISTINCT q) AS papers, count(p) AS cites
        RETURN [a, papers, cites] AS info",

        "WITH info[0] AS a, info[1] AS papers, info[2] AS cites
        MATCH (a)-[m:METRICS_IN]->(y:Year {{year:$year}})
        SET m.mean_citations_per_paper = toFloat(cites) / papers",

        {{batchSize:5000, iterateList:true, parallel:true, params: {{year: {}}} }});
        """.format(year)

        run_query(query, graph)

# Runs the writer for every year in `years` against the connected graph.
author_mean_citations_per_paper(years)

In [None]:
def author_papers_ct(years):
    """Store each author's cumulative paper count as ``m.total_papers``.

    For every year, counts the (author, Quanta) rows with ``q.year <= year``
    and writes the count onto the author's METRICS_IN relationship to that Year.

    Args:
        years: iterable of integer years.
    """
    template = """
        CALL apoc.periodic.iterate(
        
        "MATCH (a:Author)-[:AUTHORED]->(q:Quanta)
        WHERE q.year <= $year
        RETURN [a,count(*)] AS info",
            
        "WITH head(info) AS a, last(info) AS count
        MATCH (a)-[m:METRICS_IN]->(y:Year {{year: $year}})
        SET m.total_papers = count",
        
        {{batchSize:5000, iterateList:true, parallel:true, params: {{year: {}}} }});
        """
    for current_year in years:
        print('Running query for year {}'.format(str(current_year)))
        # The template has a single positional placeholder.
        run_query(template.format(current_year), graph)
        
# Runs the writer for every year in `years` against the connected graph.
author_papers_ct(years)

In [None]:
def author_mean_citations_per_year(years):
    """Write ``m.citations_per_year`` on each author's METRICS_IN relationship.

    The value is citations received up to ``year`` divided by the number of
    years since the author's earliest cited paper. Only papers with at least
    one citation contribute, because the driving MATCH requires a CITES edge.

    Args:
        years: iterable of integer years.
    """
    for year in years:
        print('Running query for year {}'.format(str(year)))

        # Fixes over the earlier draft: `toFlat` is not a Cypher function
        # (toFloat is), the conversion happens before dividing to avoid integer
        # truncation, and a zero-year span no longer raises a division error.
        # NOTE(review): a zero-year span yields null (property left unset) —
        # confirm that is the desired convention.
        query = """
        CALL apoc.periodic.iterate(

        "MATCH (a:Author)-[:AUTHORED]->(q:Quanta)<-[:CITES]-(p:Quanta)
        WHERE q.year <= $year and p.year <= $year
        WITH a, count(p) as citations, collect(q.year) as years
        WITH a, citations, apoc.coll.min(years) as start_year
        RETURN [a, citations, start_year] AS info",

        "WITH info[0] AS a, info[1] AS citations, info[2] as start_year
        MATCH (a)-[m:METRICS_IN]->(y:Year {{year: $year}})
        SET m.citations_per_year = CASE
            WHEN $year > start_year THEN toFloat(citations) / ($year - start_year)
            ELSE null END",

        {{batchSize:5000, iterateList:true, parallel:true, params: {{year: {}}} }});
        """.format(year)

        run_query(query, graph)
        
# Runs the writer for every year in `years` against the connected graph.
author_mean_citations_per_year(years)

In [None]:
def author_age(years):
    """Store ``m.author_age`` = year minus the author's earliest publication year.

    The earliest year is taken over the author's Quanta with ``q.year <= year``;
    the result is written on the METRICS_IN relationship to that Year.

    Args:
        years: iterable of integer years.
    """
    template = """
        CALL apoc.periodic.iterate(
        
        "MATCH (a:Author)-[:AUTHORED]->(q:Quanta)
        WHERE q.year <= $year
        WITH a, collect(q.year) as years
        WITH a, apoc.coll.min(years) as start_year
        RETURN [a, start_year] AS info",
       
        "WITH head(info) AS a, last(info) AS start_year
        MATCH (a)-[m:METRICS_IN]->(y:Year {{year:$year}})
        SET m.author_age = $year - start_year",
        
        {{batchSize:5000, iterateList:true, parallel:true, params: {{year: {}}} }});
        """
    for current_year in years:
        print('Running query for year {}'.format(str(current_year)))
        run_query(template.format(current_year), graph)
    
# Runs the writer for every year in `years` against the connected graph.
author_age(years)


In [None]:
def author_max_single_paper_citations(years):
    """Store ``m.max_citations``: the highest citation count among an author's papers.

    Only papers with at least one citation up to ``year`` take part, because
    the driving MATCH requires a CITES edge.

    Args:
        years: iterable of integer years.
    """
    template = """
        CALL apoc.periodic.iterate(
        "MATCH (a:Author)-[:AUTHORED]->(q:Quanta)<-[:CITES]-(p:Quanta)
        WHERE q.year <= $year and p.year <= $year
        WITH a, q, count(p) as citations
        WITH a, collect(citations) as all_citations
        RETURN [a, all_citations] AS info",

        "WITH head(info) AS a, last(info) AS all_citations
        MATCH (a)-[m:METRICS_IN]->(y:Year {{year: $year}})
        SET m.max_citations = apoc.coll.max(all_citations)",
        
        {{batchSize:5000, iterateList:true, parallel:false, params: {{year: {}}} }});
        """
    for current_year in years:
        run_query(template.format(current_year), graph)

#author_max_single_paper_citations(years)

In [None]:
# not sure how this reacts to coauthors being added over time
def author_recent_num_coauthors(years):
    """Store ``m.recent_coauthor_count``: distinct coauthors in ``year`` and ``year - 1``.

    Coauthors are other Author nodes attached to the same Quanta whose name
    differs from the author's own.

    Args:
        years: iterable of integer years.
    """
    for year in years:
        print('Running query for year {}'.format(str(year)))
        # The year test is parenthesised: AND binds tighter than OR, so without
        # the parentheses the name filter only applied to the `$year - 1` branch.
        query = """
        CALL apoc.periodic.iterate(
        "MATCH (a:Author)-[:AUTHORED]->(q:Quanta)<-[:AUTHORED]-(b:Author)
        WHERE (q.year = $year OR q.year = $year - 1)
        AND not b.name = a.name
        WITH a, count(distinct b) as num_coauthors
        RETURN [a, num_coauthors] AS nodes",

        "WITH head(nodes) AS a, last(nodes) AS num_coauthors
        MATCH (a)-[m:METRICS_IN]->(y:Year {{year: $year}})
        SET m.recent_coauthor_count = num_coauthors",
        {{batchSize:5000, iterateList:true, parallel:true, params: {{year: {}}} }});
        """.format(year)

        run_query(query, graph)
        
# Runs the writer for every year in `years` against the connected graph.
author_recent_num_coauthors(years)

In [None]:
def author_num_venues_published(years):
    """Store ``m.total_venues``: distinct venues an author has published in up to ``year``.

    Args:
        years: iterable of integer years.
    """
    template = """
        CALL apoc.periodic.iterate(
        
        "MATCH (a:Author)-[:AUTHORED]->(q:Quanta)
        WHERE q.year <= $year
        OPTIONAL MATCH (q)-[:PUBLISHED_IN]->(v:Venue)
        RETURN [a, count(distinct v)] AS info",
        
        "WITH head(info) AS a, last(info) AS num_venues
        MATCH(a)-[m:METRICS_IN]->(y:Year {{year: $year}})
        SET m.total_venues = num_venues",
        {{batchSize:5000, iterateList:true, parallel:false, params: {{year: {}}} }});
        """
    for current_year in years:
        print('Running query for year {}'.format(str(current_year)))
        run_query(template.format(current_year), graph)
    
# Runs the writer for every year in `years` against the connected graph.
author_num_venues_published(years)

## To be run

## Work-In-Progress

In [None]:
def write_author_start_year():
    """Set ``a.start_year`` on every Author to the earliest year of their Quanta.

    Runs as a single batched APOC job; no year parameter is involved.
    """
    # The earlier draft never closed the action string and called
    # .format(year) with no `year` in scope; the query takes no parameters,
    # so it is now a plain string literal.
    query = """
    CALL apoc.periodic.iterate(

    "MATCH (a:Author)-[:AUTHORED]->(q:Quanta)
    WITH a, collect(q.year) as years
    WITH a, apoc.coll.min(years) as start_year
    RETURN [a, start_year] AS info",

    "WITH head(info) AS a, last(info) AS start_year
    SET a.start_year = start_year",

    {batchSize:5000, iterateList:true, parallel:true});
    """

    run_query(query, graph)
    
#write_author_start_year()

In [None]:
def author_unweighted_pagerank(years):
    ''' Write PageRank scores to author nodes as temporary and then write to metrics edge in 2nd query

    For each year: (1) ``algo.pageRank`` runs over a Cypher-projected COAUTHOR
    graph restricted to authors with ``start_year <= year`` and writes the
    score to the node property ``temporary``; (2) a batched job copies
    ``a.temporary`` to ``m.pagerank`` on the author's METRICS_IN relationship.
    Relies on ``a.start_year`` having been written beforehand.

    Args:
        years: iterable of integer years.
    '''

    for year in years:
        print('Writing pagerank values to node for year {}'.format(str(year)))

        query1 = """
        CALL algo.pageRank(
        "MATCH (a:Author)-[:COAUTHOR]-(b:Author)
        WHERE a.start_year <= {}
        RETURN id(a) as id",

        "MATCH (a1:Author)-[:COAUTHOR]-(a2:Author)
        RETURN id(a1) AS source, id(a2) AS target",

        {{graph:'cypher', write:true, writeProperty:"temporary"}});
        """.format(year)

        run_query(query1, graph)

        print('Writing pagerank values to edges {}'.format(str(year)))
        # Write scores to METRICS_IN relationship.
        # DISTINCT keeps each author to a single row; without it an author is
        # emitted once per COAUTHOR edge and the same write is repeated (and,
        # with parallel batches, contended for).
        query2 = """
        CALL apoc.periodic.iterate(

        "MATCH (a:Author)-[:COAUTHOR]-(b:Author)
        WHERE a.start_year <= $year
        RETURN DISTINCT a",

        "MATCH (a)-[m:METRICS_IN]->(y:Year {{year: $year}})
        SET m.pagerank = a.temporary",

        {{batchSize:10000, parallel:true, params: {{year: {}}} }});
        """.format(year)

        run_query(query2, graph)


# Runs both PageRank steps for every year in `years` against the connected graph.
author_unweighted_pagerank(years)

In [None]:
def author_weighted_pagerank(years):
    """Placeholder for a weighted PageRank metric; not implemented yet."""
    # Planned option: weightProperty: "strength"
    return None

In [None]:
def author_h_index(years):
    """Write ``m.hindex`` on each author's METRICS_IN relationship for every year.

    Per author, the citation counts of papers published up to ``year``
    (counting only citing papers with ``p.year <= year``) are sorted in
    descending order; the h-index is the number of positions ``i`` (0-based)
    where ``ranked[i] >= i + 1``. Because the list is descending, that
    predicate holds exactly for a prefix, so the count equals the h-index and
    is 0 when no paper has a citation.

    Args:
        years: iterable of integer years.
    """
    for year in years:
        # Cypher map braces are doubled for str.format; the earlier draft left
        # them single, so .format() failed before anything was sent. It also
        # used a two-argument min(), which Cypher's aggregate min() does not accept.
        query = """
        CALL apoc.periodic.iterate(
        "MATCH (a:Author)-[:AUTHORED]->(q:Quanta)
        WHERE q.year <= $year
        OPTIONAL MATCH (p:Quanta)-[:CITES]->(q)
        WHERE p.year <= $year
        WITH a, q, count(p) as citations
        WITH a, collect(citations) as paper_citations
        RETURN [a, paper_citations] as info",

        "WITH head(info) as a, last(info) as paper_citations
        WITH a, apoc.coll.reverse(apoc.coll.sort(paper_citations)) as ranked
        WITH a, size([i IN range(0, size(ranked) - 1) WHERE ranked[i] >= i + 1]) as hindex
        MATCH (y:Year {{year: $year}})
        MERGE (a)-[m:METRICS_IN]->(y)
        SET m.hindex = hindex",

        {{batchSize:5000, iterateList:true, parallel:false, params: {{year: {}}} }});
        """.format(year)

        run_query(query, graph)
        
#author_h_index(years)   


In [None]:
#TODO test
def author_papers_delta(years):
    """Placeholder for the year-over-year paper-count delta; not implemented yet."""
    return None

In [None]:
def author_mean_citation_rank(years):
    """Placeholder for a mean-citation-rank metric; not implemented yet."""
    return None

### Author-Venue Features

In [None]:
def venue_h_index_stats(years):
    """Write ``m.hindex`` on (:Venue)-[m:METRICS_IN]->(:Year) for every year.

    Per venue, citation counts of its papers up to ``year`` are sorted in
    descending order and the h-index is the number of 0-based positions ``i``
    with ``ranked[i] >= i + 1`` (0 when nothing is cited).

    Args:
        years: iterable of integer years.

    Returns:
        The APOC summary frame from the last year processed, or ``None`` when
        ``years`` is empty.
    """
    df = None
    for year in years:
        # Braces are doubled for str.format (the single-brace draft failed in
        # .format()), and the two-argument min() has been replaced because
        # Cypher's min() is a one-argument aggregate.
        query = """CALL apoc.periodic.iterate(
        "MATCH (v:Venue)<-[:PUBLISHED_IN]-(q:Quanta)
        WHERE q.year <= $year
        OPTIONAL MATCH (p:Quanta)-[:CITES]->(q)
        WHERE p.year <= $year
        WITH v, q, count(p) as citations
        WITH v, collect(citations) as paper_citations
        RETURN [v, paper_citations] as info",

        "WITH head(info) as v, last(info) as paper_citations
        WITH v, apoc.coll.reverse(apoc.coll.sort(paper_citations)) as ranked
        WITH v, size([i IN range(0, size(ranked) - 1) WHERE ranked[i] >= i + 1]) as hindex
        MATCH (y:Year {{year: $year}})
        MERGE (v)-[m:METRICS_IN]->(y)
        SET m.hindex = hindex",

        {{batchSize:5000, iterateList:true, parallel:false, params: {{year: {}}} }});""".format(year)

        df = run_query(query, graph, to_df=True)

    # Returning after the loop lets every year run; the draft returned inside
    # the loop and therefore stopped after the first year.
    return df
  
#venue_h_index_stats(years)

In [None]:
def venue_h_index_stats_delta(years):
    """Placeholder for the year-over-year venue h-index delta; not implemented yet."""
    return None

In [None]:
def venue_citation_stats(years):
    """Write ``m.average_citations`` on (:Venue)-[m:METRICS_IN]->(:Year) for every year.

    The average is citations received up to ``year`` divided by the number of
    distinct papers the venue published up to ``year``.

    Args:
        years: iterable of integer years.
    """
    for year in years:
        # Changes from the draft: braces are doubled and the year is passed as
        # $year (the draft had three placeholders, one argument and single
        # braces, so .format() failed); the Year node is MATCHed before the
        # MERGE so an unbound pattern cannot create a new Year node; toFloat()
        # avoids integer truncation of the average.
        query = """
        CALL apoc.periodic.iterate(
        "MATCH (v:Venue)<-[:PUBLISHED_IN]-(q:Quanta)
        WHERE q.year <= $year
        OPTIONAL MATCH (q)<-[:CITES]-(p:Quanta)
        WHERE p.year <= $year
        WITH v, count(distinct q) as papers, count(p) as cites
        RETURN [v, toFloat(cites) / papers] as info",

        "WITH head(info) AS v, last(info) AS avg_citations
        MATCH (y:Year {{year: $year}})
        MERGE (v)-[m:METRICS_IN]->(y)
        SET m.average_citations = avg_citations",

        {{batchSize:5000, iterateList:true, parallel:false, params: {{year: {}}} }});
        """.format(year)

        run_query(query, graph)

#venue_h_index_stats(years)

In [None]:
def venue_citation_stats_delta(years):
    """Placeholder for the year-over-year venue citation delta; not implemented yet."""
    return None

In [None]:
def venue_papers_stats(years):
    """Write ``m.total_papers`` on (:Venue)-[m:METRICS_IN]->(:Year) for every year.

    Args:
        years: iterable of integer years.
    """
    for year in years:
        # Braces are doubled and the year travels as $year: the draft had two
        # placeholders, one argument and single-brace maps, so .format() failed.
        # The Year node is MATCHed first so MERGE only creates the relationship.
        query = """
        CALL apoc.periodic.iterate(
        "MATCH (v:Venue)<-[:PUBLISHED_IN]-(q:Quanta)
        WHERE q.year <= $year
        RETURN [v,count(*)] AS info",

        "WITH head(info) AS v, last(info) AS num_papers
        MATCH (y:Year {{year: $year}})
        MERGE (v)-[m:METRICS_IN]->(y)
        SET m.total_papers = num_papers",

        {{batchSize:5000, iterateList:true, parallel:false, params: {{year: {}}} }});
        """.format(year)

        run_query(query, graph)

#venue_papers_stats()

In [None]:
def venue_citation_stats_delta(years):
    """Placeholder; not implemented yet.

    NOTE(review): a stub with this same name is already defined in an earlier
    cell; given its position this one may have been meant as a paper-count
    delta — confirm the intended name.
    """
    # A body is required: a bare `def` line is a syntax error.
    pass

In [None]:
def venue_rank_stats(years):
    """Placeholder for venue rank statistics; not implemented yet."""
    return None

In [None]:
def venue_max_paper_citations_stats(years):
    """Write ``m.max_citations`` on (:Venue)-[m:METRICS_IN]->(:Year) for every year.

    The value is the largest per-paper citation count among the venue's papers
    published up to ``year`` (citing papers limited to ``p.year <= year``).

    Args:
        years: iterable of integer years.
    """
    for year in years:
        # Braces are doubled and the year is passed as $year; the draft's
        # .format(str(year)) could not fill three placeholders and tripped on
        # the single-brace maps. The Year node is MATCHed before the MERGE.
        query = """
        CALL apoc.periodic.iterate(
        "MATCH (v:Venue)<-[:PUBLISHED_IN]-(q:Quanta)
        WHERE q.year <= $year
        OPTIONAL MATCH (p:Quanta)-[:CITES]->(q)
        WHERE p.year <= $year
        WITH v, q, count(p) as citations
        WITH v, collect(citations) as all_citations
        RETURN [v, all_citations] AS info",

        "WITH info[0] AS v, info[1] AS all_citations
        MATCH (y:Year {{year: $year}})
        MERGE (v)-[m:METRICS_IN]->(y)
        SET m.max_citations = apoc.coll.max(all_citations)",

        {{batchSize:5000, iterateList:true, parallel:false, params: {{year: {}}} }});
        """.format(year)

        # The draft built the query but never executed it.
        run_query(query, graph)

#venue_max_paper_citations(years)

In [None]:
#TODO test
def author_h_index_delta(years):
    """Placeholder for the year-over-year author h-index delta; not implemented yet."""
    return None

In [None]:
#TODO test
def author_citation_delta(years):
    """Placeholder for the year-over-year author citation delta; not implemented yet."""
    return None

In [None]:
#TODO test
def author_mean_citations_per_paper_delta(years):
    """Placeholder for the year-over-year mean-citations-per-paper delta; not implemented yet."""
    return None

### Author Paper Features Tests 

In [None]:
def test_author_citation_count(author, year):
    """Per-paper citation counts for one author, as a DataFrame.

    Matches the author by ``normalizedName`` and, for each cited Quanta with
    ``q.year <= year``, counts citing Quanta with ``p.year <= year``.

    The counts were checked against the table for edward s boyden using:

        MATCH (q:Quanta {title:"Millisecond-timescale, genetically targeted optical
            control of neural activity"})<-[:CITES]-(p:Quanta)
        WHERE p.year <= 2008
        RETURN count(p)

    Args:
        author: normalized author name (interpolated into the query text).
        year: inclusive upper bound for both paper and citing-paper years.

    Returns:
        DataFrame with columns ``name``, ``title`` and ``citations``.
    """
    cypher = """
    MATCH (a:Author {{normalizedName: '{}'}})-[:AUTHORED]->(q:Quanta)<-[:CITES]-(p:Quanta)
    WHERE q.year <= {} and p.year <= {}
    RETURN a.name as name, q.title as title, count(p) as citations
    """.format(author, year, year)

    return run_query(cypher, graph, to_df=True)

def test_author_citation_count_iterative(author, year):
    """Smoke-test the batched (apoc.periodic.iterate) form of the citation count.

    Args:
        author: normalized author name (interpolated into the query text).
        year: inclusive upper bound for paper and citing-paper years.

    Returns:
        The DataFrame produced by ``run_query`` for the APOC call.
    """
    # last(nodes) extracts the scalar count; tail(nodes) would give the
    # one-element list [count].
    query = """
    CALL apoc.periodic.iterate(

    "MATCH (a:Author {{normalizedName: '{}'}})-[:AUTHORED]->(q:Quanta)<-[r:CITES]-(p:Quanta)
    WHERE q.year <= $year AND p.year <= $year
    RETURN [a, count(p)] as nodes",

    "WITH head(nodes) as a, last(nodes) as citations
    RETURN citations",

    {{batchSize:5000, iterateList:true, parallel:true, params: {{year: {}}} }});
    """.format(author, year)

    # Return the frame so the caller's assignment holds something to inspect;
    # previously the function returned None.
    return run_query(query, graph, to_df=True)

# Switch between the direct and the batched variant by (un)commenting one of
# the two assignments; the last line displays the result as the cell output.
#author_citation_ct = test_author_citation_count('edward s boyden', 2008)
author_citation_ct = test_author_citation_count_iterative('edward s boyden', 2008)

author_citation_ct

In [None]:
def test_author_mean_citations_per_paper(author, year):
    """Mean citations per paper for one author, as a DataFrame.

    Args:
        author: normalized author name (interpolated into the query text).
        year: inclusive upper bound for paper and citing-paper years.

    Returns:
        DataFrame with the author's name, distinct ``papers``, total ``cites``
        and ``avg`` = cites / papers as a float.
    """
    # toFloat() before dividing: integer / integer truncates in Cypher.
    query = """
    MATCH (a:Author {{normalizedName: '{author}'}})-[:AUTHORED]->(q:Quanta)
    WHERE q.year <= {year}
    OPTIONAL MATCH (q)<-[:CITES]-(p:Quanta)
    WHERE p.year <= {year}
    WITH a, count(distinct q) as papers, count(p) as cites
    RETURN a.name, papers, cites, toFloat(cites)/papers as avg
    """.format(author=author, year=year)

    df = run_query(query, graph, to_df=True)
    return df

def test_author_mean_citations_per_paper_iterative(author, year):
    """Smoke-test the batched form of the mean-citations-per-paper computation.

    Args:
        author: normalized author name (interpolated into the query text).
        year: inclusive upper bound for paper and citing-paper years.

    Returns:
        The DataFrame produced by ``run_query`` for the APOC call.
    """
    # The ratio is cites / papers; the draft had it inverted (papers / cites),
    # which also divided by zero for uncited authors. Aggregation is done in
    # the driving query so it covers all of the author's papers.
    query = """
    CALL apoc.periodic.iterate(
    "MATCH (a:Author {{normalizedName: '{author}'}})-[:AUTHORED]->(q:Quanta)
    WHERE q.year <= {year}
    OPTIONAL MATCH (q)<-[:CITES]-(p:Quanta)
    WHERE p.year <= {year}
    WITH a, count(p) as cites, count(distinct q) as papers
    RETURN [a, cites, papers] as info",

    "WITH info[0] as a, info[1] as cites, info[2] as papers
    RETURN a.name, cites, papers, toFloat(cites)/papers as avg ",

    {{batchSize:5000, iterateList:true, parallel:true}});
    """.format(author=author, year=year)

    df = run_query(query, graph, to_df=True)
    return df

# Spot-check against a single known author; queries the connected graph.
author_mean_citations_paper = test_author_mean_citations_per_paper('edward s boyden', 2008)
#author_mean_citations_paper = test_author_mean_citations_per_paper_iterative('edward s boyden', 2008)

#author_mean_citations_paper

In [None]:
def test_author_papers_count(author, year):
    """Number of papers one author has published up to ``year``.

    Args:
        author: normalized author name (interpolated into the query text).
        year: inclusive upper bound on ``q.year``.

    Returns:
        DataFrame with columns ``name`` and ``papers``.
    """
    # Two placeholders only: the author name and the year bound.
    cypher = """
    MATCH (a:Author {{normalizedName: '{}'}})-[:AUTHORED]->(q:Quanta)
    WHERE q.year <= {}
    RETURN a.name as name, count(*) as papers
    """.format(author, year)

    return run_query(cypher, graph, to_df=True)

def test_author_papers_count_iterative(author, year):
    """Batched (apoc.periodic.iterate) form of the paper-count check.

    Args:
        author: normalized author name (interpolated into the query text).
        year: inclusive upper bound on ``q.year``.

    Returns:
        The DataFrame produced by ``run_query`` for the APOC call.
    """
    # Two placeholders only: the author name and the year bound.
    cypher = """
    CALL apoc.periodic.iterate(
    "MATCH (a:Author {{normalizedName: '{}'}})-[:AUTHORED]->(q:Quanta)
    WHERE q.year <= {}
    RETURN [a, count(*)] AS info",

    "WITH head(info) AS a, last(info) AS count
    RETURN count",

    {{batchSize:5000, iterateList:true, parallel:true}});
    """.format(author, year)

    return run_query(cypher, graph, to_df=True)

#auth_papers_ct = test_author_papers_count('edward s boyden', 2018)
# Spot-check with the batched variant; queries the connected graph.
auth_papers_ct = test_author_papers_count_iterative('edward s boyden', 2018)

#auth_papers_ct

In [None]:
def test_author_mean_citations_per_year(author, year):
    """Citations per active year for one author, as a DataFrame.

    ``start_year`` is the earliest year among the author's cited papers (the
    MATCH requires a CITES edge), and the span is ``year - start_year``.

    Args:
        author: normalized author name (interpolated into the query text).
        year: inclusive upper bound for paper and citing-paper years.

    Returns:
        DataFrame with the author's name, ``citations``, ``years`` and ``avg``.
        ``avg`` is null when the span is zero.
    """
    # toFloat() avoids integer truncation; the CASE guards the zero-year span,
    # which previously raised a division-by-zero error.
    query = """
    MATCH (a:Author {{normalizedName: '{author}'}})-[r:AUTHORED]->(q:Quanta)<-[:CITES]-(p:Quanta)
    WHERE q.year <= {year} and p.year <= {year}
    WITH a, count(p) as citations, collect(q.year) as years
    WITH a, citations, apoc.coll.min(years) as start_year
    RETURN a.name, citations, ({year}-start_year) as years,
        CASE WHEN {year} > start_year THEN toFloat(citations)/({year}-start_year) ELSE null END as avg
    """.format(author=author, year=year)

    df = run_query(query, graph, to_df=True)
    return df

def test_author_mean_citations_per_year_iterative(author, year):
    """Smoke-test the batched form of the citations-per-year computation.

    Args:
        author: normalized author name (interpolated into the query text).
        year: inclusive upper bound for paper and citing-paper years.

    Returns:
        The DataFrame produced by ``run_query`` for the APOC call.
    """
    # toFloat() avoids integer truncation; the CASE guards the zero-year span.
    query = """
    CALL apoc.periodic.iterate(
    "MATCH (a:Author {{normalizedName: '{author}'}})-[:AUTHORED]->(q:Quanta)<-[:CITES]-(p:Quanta)
    WHERE q.year <= {year} and p.year <= {year}
    WITH a, count(p) as citations, collect(q.year) as years
    RETURN [a, citations, years] AS info",

    "WITH info[0] AS a, info[1] AS citations, info[2] as years
    WITH a, citations, apoc.coll.min(years) as start_year
    RETURN a.name, CASE WHEN {year} > start_year
        THEN toFloat(citations) / ({year} - start_year) ELSE null END as avg",
    {{batchSize:5000, iterateList:true, parallel:true}});
    """.format(author=author, year=year)

    df = run_query(query, graph, to_df=True)
    return df

# Spot-check against a single known author; queries the connected graph.
author_mean_citations_year = test_author_mean_citations_per_year('edward s boyden', 2009)
#author_mean_citations_year = test_author_mean_citations_per_year_iterative('edward s boyden', 2008)

#author_mean_citations_year

In [None]:
def test_author_age(author, year):
    """Academic age of one author: ``year`` minus their earliest publication year.

    Args:
        author: normalized author name (interpolated into the query text).
        year: reference year; also the inclusive upper bound on ``q.year``.

    Returns:
        DataFrame with the author's name and ``author_age``.
    """
    # Three placeholders: the name, the year bound, and the reference year.
    cypher = """
    MATCH (a:Author {{normalizedName: '{}'}})-[:AUTHORED]->(q:Quanta)
    WHERE q.year <= {}
    WITH a, collect(q.year) as years
    WITH a, apoc.coll.min(years) as start_year
    RETURN a.name, {} - start_year as author_age
    """.format(author, year, year)

    return run_query(cypher, graph, to_df=True)

def test_author_age_iterative(author, year):
    """Batched (apoc.periodic.iterate) form of the author-age check.

    Args:
        author: normalized author name (interpolated into the query text).
        year: reference year; also the inclusive upper bound on ``q.year``.

    Returns:
        The DataFrame produced by ``run_query`` for the APOC call.
    """
    cypher = """
    CALL apoc.periodic.iterate(

    "MATCH (a:Author {{normalizedName: '{}'}})-[:AUTHORED]->(q:Quanta)
    WHERE q.year <= {}
    WITH a, collect(q.year) as years
    RETURN [a, years] AS info",

    "WITH info[0] AS a, info[1] AS years
    WITH a, apoc.coll.min(years) as start_year
    RETURN a.name, {} - start_year as author_age",

    {{batchSize:5000, iterateList:true, parallel:true}});
    """.format(author, year, year)

    return run_query(cypher, graph, to_df=True)

#author_age = test_author_age('edward s boyden', 2008)
# NOTE(review): this assignment rebinds the name `author_age`, which an earlier
# cell defines as a function; after this cell runs, `author_age(years)` would
# no longer call that function.
author_age = test_author_age_iterative('edward s boyden', 2008)

author_age

In [None]:
def test_author_max_single_paper_citations(author, year):
    """Highest citation count of any single paper by one author.

    Args:
        author: normalized author name (interpolated into the query text).
        year: inclusive upper bound for paper and citing-paper years.

    Returns:
        DataFrame with the author's name and the maximum per-paper count.
    """
    cypher = """
    MATCH (a:Author {{normalizedName: '{}'}})-[:AUTHORED]->(q:Quanta)<-[:CITES]-(p:Quanta)
    WHERE q.year <= {} and p.year <= {}
    WITH a, q, count(p) as citations
    WITH a, collect(citations) as all_citations
    RETURN a.name, apoc.coll.max(all_citations)
    """.format(author, year, year)

    return run_query(cypher, graph, to_df=True)

def test_author_max_single_paper_citations_iterative(author, year):
    """Batched (apoc.periodic.iterate) form of the max-single-paper-citations check.

    Args:
        author: normalized author name (interpolated into the query text).
        year: inclusive upper bound for paper and citing-paper years.

    Returns:
        The DataFrame produced by ``run_query`` for the APOC call.
    """
    cypher = """
    CALL apoc.periodic.iterate(

    "MATCH (a:Author {{normalizedName: '{}'}})-[:AUTHORED]->(q:Quanta)<-[:CITES]-(p:Quanta)
    WHERE q.year <= {} and p.year <= {}
    WITH a, q, count(p) as citations
    RETURN [a, citations] AS info",

    "WITH info[0] AS a, info[1] AS citations
    WITH a, collect(citations) as all_citations
    RETURN a.name, apoc.coll.max(all_citations)",

    {{batchSize:5000, iterateList:true, parallel:true}});
    """.format(author, year, year)

    return run_query(cypher, graph, to_df=True)

# Switch between the direct and the batched variant by (un)commenting one of
# the two assignments; the last line displays the result as the cell output.
author_max_cites = test_author_max_single_paper_citations('edward s boyden', 2008)
#author_max_cites = test_author_max_single_paper_citations_iterative('edward s boyden', 2008)

author_max_cites

In [None]:
def test_author_num_venues_published(author, year):
    """Distinct venues (and papers with a venue) for one author up to ``year``.

    Args:
        author: normalized author name (interpolated into the query text).
        year: inclusive upper bound on ``q.year``.

    Returns:
        DataFrame with columns ``name``, ``num_venues`` and ``num_papers``.
    """
    cypher = """
    MATCH (a:Author {{normalizedName: '{}'}})-[:AUTHORED]->(q:Quanta) 
    WHERE q.year <= {}
    WITH a, q
    MATCH (q)-[:PUBLISHED_IN]->(v:Venue)
    RETURN a.name as name, count(distinct v) as num_venues, count(q) as num_papers
    """.format(author, year)

    return run_query(cypher, graph, to_df=True)

# Spot-check against a single known author; the second line displays the result.
auth_num_venues = test_author_num_venues_published('edward s boyden', 2018)
auth_num_venues

In [None]:
def test_author_recent_num_coauthors(author, year):
    """Distinct coauthors of one author on papers from ``year`` and ``year - 1``.

    Args:
        author: normalized author name (interpolated into the query text).
        year: the later of the two publication years considered.

    Returns:
        DataFrame with columns ``name``, ``coauthors`` and ``papers_written``.
    """
    # The Cypher map braces must be doubled for str.format; with single braces
    # .format() raised before the query was ever sent.
    query = """
    MATCH (a:Author {{normalizedName: '{author}'}})-[:AUTHORED]->(q:Quanta)
    WHERE q.year = {year} OR q.year = {year} - 1
    MATCH (b:Author)-[:AUTHORED]->(q) WHERE not b.name = a.name
    RETURN a.name as name, count(distinct b) as coauthors, count(distinct q) as papers_written
    """.format(author=author, year=year)

    df = run_query(query, graph, to_df=True)
    return df

### TODO

In [None]:
def test_author_h_index(author, year):
    """h-index of one author (matched by ``id``) as of ``year``.

    Citation counts per paper are sorted in descending order; the h-index is
    the number of 0-based positions ``i`` with ``ranked[i] >= i + 1``.

    Args:
        author: value of the Author node's ``id`` property, as a string.
        year: inclusive upper bound for paper and citing-paper years.

    Returns:
        DataFrame with columns ``name`` and ``hindex``.
    """
    # Fixes: the map literal was missing its closing brace and was not escaped
    # for str.format; the indexOf-based filter under-counted when several
    # papers shared a citation count (indexOf returns the first occurrence).
    query = """
    MATCH (a:Author {{id: '{author}'}})-[:AUTHORED]->(q:Quanta)
    WHERE q.year <= {year}
    OPTIONAL MATCH (p:Quanta)-[:CITES]->(q)
    WHERE p.year <= {year}
    WITH a, q, count(p) as citations
    WITH a, collect(citations) as paper_citations
    WITH a, apoc.coll.reverse(apoc.coll.sort(paper_citations)) as ranked
    RETURN a.name as name, size([i IN range(0, size(ranked) - 1) WHERE ranked[i] >= i + 1]) as hindex
    """.format(author=author, year=year)

    return run_query(query, graph, to_df=True)
    
# The author id is a string; unquoted it is not a valid Python token.
test_author_h_index('53f45789dabfaee02ad5d4ca', 2008)

def test_author_h_index_periodic(author, year):
    """Smoke-test the batched (apoc.periodic.iterate) form of the h-index query.

    Args:
        author: value of the Author node's ``id`` property, as a string.
        year: inclusive upper bound for paper and citing-paper years.

    Returns:
        The DataFrame produced by ``run_query`` for the APOC call.
    """
    # Braces are doubled for str.format (single braces made .format() raise),
    # and the h-index is the count of positions where ranked[i] >= i + 1,
    # which stays correct when several papers share a citation count.
    query = """
    CALL apoc.periodic.iterate(
    "MATCH (a:Author {{id: '{author}'}})-[:AUTHORED]->(q:Quanta)
    WHERE q.year <= {year}
    OPTIONAL MATCH (p:Quanta)-[:CITES]->(q)
    WHERE p.year <= {year}
    WITH a, q, count(p) as citations
    WITH a, collect(citations) as paper_citations
    RETURN [a, paper_citations] as info",

    "WITH head(info) as a, last(info) as paper_citations
    WITH a, apoc.coll.reverse(apoc.coll.sort(paper_citations)) as ranked
    RETURN size([i IN range(0, size(ranked) - 1) WHERE ranked[i] >= i + 1]) as hindex",

    {{batchSize:5000, iterateList:true, parallel:false}});""".format(author=author, year=year)

    return run_query(query, graph, to_df=True)

In [None]:
#TODO test
def test_author_citation_delta(author, year):
    """Year-over-year change in ``total_citations`` for one author.

    Reads the METRICS_IN relationships to ``year`` and ``year - 1``.

    Args:
        author: normalized author name (interpolated into the query text).
        year: the later of the two years compared.

    Returns:
        DataFrame with the name, ``original``, ``previous`` and ``delta``.
    """
    # Cypher map braces are doubled so str.format leaves them intact; with
    # single braces .format() raised before the query was sent.
    query = """
    MATCH (a:Author {{normalizedName: '{author}'}})-[m:METRICS_IN]->(y:Year {{year: {year}}})
    MATCH (a)-[pm:METRICS_IN]->(:Year {{year: {year} - 1}})
    RETURN a.name, m.total_citations as original, pm.total_citations as previous, m.total_citations-pm.total_citations as delta
    """.format(author=author, year=year)

    df = run_query(query, graph, to_df=True)
    return df

# NOTE(review): this assignment rebinds `author_citation_delta`, which an
# earlier cell defines as a (stub) function of the same name.
author_citation_delta = test_author_citation_delta('edward s boyden', 2008)
author_citation_delta

In [None]:
#TODO test
def test_author_citations_per_paper_delta(author, year):
    """Year-over-year change in mean citations per paper for one author.

    Reads ``mean_citations_per_paper`` from the METRICS_IN relationships to
    ``year`` and ``year - 1``; that is the property name written by
    ``author_mean_citations_per_paper``.

    Args:
        author: normalized author name (interpolated into the query text).
        year: the later of the two years compared.

    Returns:
        DataFrame with the name, ``original``, ``previous`` and ``delta``.
    """
    # Braces doubled for str.format; property name aligned with the writer
    # (the draft read `citations_per_paper`, which no function in this notebook sets).
    query = """
    MATCH (a:Author {{normalizedName: '{author}'}})-[m:METRICS_IN]->(y:Year {{year: {year}}})
    MATCH (a)-[pm:METRICS_IN]->(:Year {{year: {year} - 1}})
    RETURN a.name, m.mean_citations_per_paper as original, pm.mean_citations_per_paper as previous,
        m.mean_citations_per_paper-pm.mean_citations_per_paper as delta
    """.format(author=author, year=year)

    df = run_query(query, graph, to_df=True)
    return df

# Spot-check against a single known author; the second line displays the result.
author_citation_per_paper_delta = test_author_citations_per_paper_delta('edward s boyden', 2008)
author_citation_per_paper_delta

In [None]:
#TODO test
def test_author_h_index_delta(author, year):
    """Year-over-year change in ``hindex`` for one author.

    Args:
        author: normalized author name (interpolated into the query text).
        year: the later of the two years compared.

    Returns:
        DataFrame with the name, ``original``, ``previous`` and ``delta``.
    """
    # Cypher map braces are doubled so str.format leaves them intact.
    query = """
    MATCH (a:Author {{normalizedName: '{author}'}})-[m:METRICS_IN]->(y:Year {{year: {year}}})
    MATCH (a)-[pm:METRICS_IN]->(:Year {{year: {year} - 1}})
    RETURN a.name, m.hindex as original, pm.hindex as previous, m.hindex-pm.hindex as delta
    """.format(author=author, year=year)

    df = run_query(query, graph, to_df=True)
    return df

# Spot-check against a single known author; the returned frame is the cell output.
test_author_h_index_delta('edward s boyden', 2008)

In [None]:
#TODO test
def test_author_papers_delta(author, year):
    """Year-over-year change in ``total_papers`` for one author.

    Args:
        author: normalized author name (interpolated into the query text).
        year: the later of the two years compared.

    Returns:
        DataFrame with the name, ``original``, ``previous`` and ``delta``.
    """
    # Cypher map braces are doubled so str.format leaves them intact.
    query = """MATCH (a:Author {{normalizedName: '{author}'}})-[m:METRICS_IN]->(y:Year {{year: {year}}})
    MATCH (a)-[pm:METRICS_IN]->(:Year {{year: {year} - 1}})
    RETURN a.name, m.total_papers as original, pm.total_papers as previous,
        m.total_papers-pm.total_papers as delta
    """.format(author=author, year=year)

    df = run_query(query, graph, to_df=True)
    return df

# Spot-check against a single known author; the second line displays the result.
author_paper_delta = test_author_papers_delta('edward s boyden', 2008)
author_paper_delta

In [None]:
# TODO test
def test_venue_h_index(venue, year):
    """h-index of one venue (matched by ``cleanName``) as of ``year``.

    Citation counts per paper are sorted in descending order; the h-index is
    the number of 0-based positions ``i`` with ``ranked[i] >= i + 1``.

    Args:
        venue: value of the Venue node's ``cleanName`` property
            (e.g. ``'genesdevelopment'``); interpolated into the query text.
        year: inclusive upper bound for paper and citing-paper years.

    Returns:
        DataFrame with columns ``venue`` and ``hindex``.
    """
    # The draft hard-coded the venue and year, so the arguments were ignored,
    # and its unescaped braces made .format() raise.
    query = """
    MATCH (v:Venue {{cleanName: '{venue}'}})<-[:PUBLISHED_IN]-(q:Quanta)
    WHERE q.year <= {year}
    OPTIONAL MATCH (p:Quanta)-[:CITES]->(q)
    WHERE p.year <= {year}
    WITH v, q, count(p) as citations
    WITH v, collect(citations) as paper_citations
    WITH v, apoc.coll.reverse(apoc.coll.sort(paper_citations)) as ranked
    RETURN v.cleanName as venue, size([i IN range(0, size(ranked) - 1) WHERE ranked[i] >= i + 1]) as hindex
    """.format(venue=venue, year=year)

    df = run_query(query, graph, to_df=True)
    return df

In [None]:
#TODO test
def test_venue_h_index_stats_delta(venue, year):
    query = """MATCH (v:Venue {name: '{}'})-[m:METRICS_IN]->(y:Year {year:{}})
    MATCH (v)-[pm:METRICS_IN]->(:Year {year:{}-1})
    RETURN v.name, m.hindex as original, pm.hindex as previous, m.hindex-pm.hindex as delta
    """.format(author, year, year)

    df = run_query(query, graph, to_df=True)
    return df

In [None]:
def test_venue_max_paper_citations_stats(venue, years):
    query = """
    CALL apoc.periodic.iterate(
    "MATCH (v:Venue)<-[:PUBLISHED_IN]-(q:Quanta) 
    WHERE q.year <= {}
    OPTIONAL MATCH (p:Quanta)-[:CITES]->(q) 
    WHERE p.year <= {}
    WITH v, q, count(p) as citations
    RETURN [v, citations] AS info",

    "WITH info[0] AS v, info[1] AS citations
    WITH v, collect(citations) as all_citations
    MERGE (v)-[m:METRICS_IN]->(y:Year {{year:{}}})
    SET m.max_citations = apoc.coll.max(all_citations)",

    {{batchSize:5000, iterateList:true, parallel:false}});
    """.format(str(year))


### Old

In [None]:
# Legacy per-year metrics loop (kept under the "Old" heading).
# For every year `i` it sends a series of apoc.periodic.iterate statements that
# write author-level metrics onto (Author)-[:METRICS_IN]->(Year {year: i}).
# NOTE(review): `start_year` is not assigned in this cell; it is expected to
# exist already in the kernel -- confirm before running.
# NOTE(review): the Cypher below is kept as written. Several statements carry
# open questions from the original author (the inline `//` remarks) and may not
# be accepted by the database as is -- verify each before relying on it.
for i in range(start_year, 2019):

    # Author H-index
    # Assumes that every author has at least a single citation on the papers they write
    
    query = """
    CALL apoc.periodic.iterate(
    "MATCH (a:Author)-[:AUTHORED]->(q:Quanta) 
    WHERE q.year <= """+str(i)+"""
    RETURN [a,q] AS nodes LIMIT 10",
        // Why limit 10?
    
    "WITH head(nodes) AS a, last(nodes) AS q
    MATCH (p:Quanta)-[:CITES]->(q) WHERE p.year <= """+str(i)+"""
    
    WITH count(p) as citations, a, collect(citations) as paper_citations 
        // RETURN a, paper_citations
        // ** can we do collect(count(p)) in one step and will this save time?)
        //assuming that doing count(p) only counts the number of citations for each independent quanta
    WITH a, apoc.coll.sort(paper_citations) as inverse, apoc.coll.reverse(inverse) as ordered_citations
        //assuming first sort gives them ascending
        // ordered citation is list of integers
    WITH a, apoc.coll.toList([x in ordered_citations | {index: apoc.coll.indexOf(ordered_citations, x), value: x}]) as indexed_citations
    WITH a, apoc.coll.toList([x in indexed_citations WHERE x.index-x.value<0 | x.index]) as filtered
        //because indexes start at zero
    MERGE (a)-[m:METRICS_IN]->(y:Year {year:"""+str(i)+"""})
        // year is right property here?
    SET m.hindex = head(filtered)+1",
    {batchSize:5000, iterateList:true, parallel:false});
    """
    
    run_query(query, graph)
    
    # Author Citation Count
    # Sets m.total_citations.
    
    query = """
    CALL apoc.periodic.iterate("
    MATCH (a:Author)-[:AUTHORED]->(q:Quanta)
    WHERE q.year <= """+str(i)+"""
    RETURN [a,q] AS nodes
    ","
    WITH head(nodes) AS a, last(nodes) AS q
    MERGE (a)-[m:METRICS_IN]->(y:Year {year:"""+str(i)+"""})
    SET m.total_citations = size((p:Quanta)-[:CITES]->(q) WHERE p.year <= """+str(i)+""")",
    {batchSize:5000, iterateList:true, parallel:false});
    """
    
    run_query(query, graph)
    
    # Author Mean Citations per Paper
    # Sets m.citations_per_paper.
    
    query = """
    CALL apoc.periodic.iterate("
    MATCH (a:Author)-[:AUTHORED]->(q:Quanta)
    WHERE q.year <= """+str(i)+"""
    RETURN [a,q,count(*)] AS info
        // count(*) counts null values - we are trying to count q I think
    ","
    WITH info[0] AS a, info[1] AS q, info[2] as papers_written
        // Where do we use papers_written in the calculation below?
    MERGE (a)-[m:METRICS_IN]->(y:Year {year:"""+str(i)+"""})
    SET m.citations_per_paper = size((p:Quanta)-[:CITES]->(q) WHERE p.year <= """+str(i)+""")/papers_written",
    {batchSize:5000, iterateList:true, parallel:false});
    """
    
    run_query(query, graph)
    
    # Author Mean Citations per Year
    # Sets m.citations_per_year; the divisor is (2019 - start_year) as computed in the query.
    
    query = """
    CALL apoc.periodic.iterate("
    MATCH (a:Author)-[:AUTHORED]->(q:Quanta)
    WHERE q.year <= """+str(i)+"""
    WITH a, q, collect(q.year) as years, apoc.coll.min(years) as start_year
    RETURN [a,q, start_year] AS info
    ","
    WITH info[0] AS a, info[1] AS q, info[2] as start_year
    MERGE (a)-[m:METRICS_IN]->(y:Year {year:"""+str(i)+"""})
    SET m.citations_per_year = size((p:Quanta)-[:CITES]->(q) WHERE p.year <= """+str(i)+""")/(2019-start_year)",
    {batchSize:5000, iterateList:true, parallel:false});
    """
    
    run_query(query, graph)
    
    # Author Papers
    # Sets m.total_papers.
    
    query = """
    CALL apoc.periodic.iterate("
    MATCH (a:Author)-[:AUTHORED]->(q:Quanta)
    WHERE q.year <= """+str(i)+"""
    RETURN [a,count(*)] AS info
        // do we need count(q) here?
    ","
    WITH head(info) AS a, last(info) AS count
    MERGE (a)-[m:METRICS_IN]->(y:Year {year:"""+str(i)+"""})
    SET m.total_papers = count",
    {batchSize:5000, iterateList:true, parallel:false});
    """
    
    run_query(query, graph)
    
    # Author Age
    # Sets m.author_age to 2019 minus the earliest publication year found.
    
    query = """
    CALL apoc.periodic.iterate("
    MATCH (a:Author)-[:AUTHORED]->(q:Quanta)
    WHERE q.year <= """+str(i)+"""
    WITH a, collect(q.year) as years, apoc.coll.min(years) as start_year
    RETURN [a, start_year] AS info
    ","
    WITH info[0] AS a, info[1] AS start_year
    MERGE (a)-[m:METRICS_IN]->(y:Year {year:"""+str(i)+"""})
    SET m.author_age = 2019-start_year",
    {batchSize:5000, iterateList:true, parallel:false});
    """
    
    run_query(query, graph)
    
    # Author's Recent Coauthor
    # Not sure how this reacts to the fact that coauthors are added over time
    # Only papers from year i or i-1 are considered.
    
    query = """
    CALL apoc.periodic.iterate("
    MATCH (a:Author)-[:AUTHORED]->(q:Quanta)
    WHERE q.year = """+str(i)+""" OR q.year = """+str(i-1)+"""
    RETURN [a,q] AS nodes
    ","
    WITH head(nodes) AS a, last(nodes) AS q
    MERGE (a)-[m:METRICS_IN]->(y:Year {year:"""+str(i)+"""})
    SET m.recent_coauthor_count = size(MATCH (b:Author)-[:AUTHORED]->(q) WHERE b.name != a.name)",
    {batchSize:5000, iterateList:true, parallel:false});
    """
    
    run_query(query, graph)
    
    # Author's Max Citations on Single Paper
    # Sets m.max_citations.
    
    query = """
    CALL apoc.periodic.iterate("
    MATCH (a:Author)-[:AUTHORED]->(q:Quanta)
    WHERE q.year <= """+str(i)+"""
    RETURN [a,q] AS nodes
    ","
    WITH head(nodes) AS a, last(nodes) AS q
    MATCH (p:Quanta)-[:CITES]->(q)
    WHERE p.year <= """+str(i)+"""
    WITH a, apoc.coll.toSet(count(p)) as paper_citations
    MERGE (a)-[m:METRICS_IN]->(y:Year {year:"""+str(i)+"""})
    SET m.max_citations = apoc.coll.max(paper_citations)",
    {batchSize:5000, iterateList:true, parallel:false});
    """
    
    run_query(query, graph)
    
    # Author's Venues' H-indexes
    
    # Venue Citations
    # Sets m.venue_citations to [avg, min, max].
    
    query = """
    CALL apoc.periodic.iterate("
    MATCH (a:Author)-[:AUTHORED]->(q:Quanta)-[:PUBLISHED_IN]->(v:Venue)
    WHERE q.year <= """+str(i)+"""
    RETURN [a,v] AS nodes
    ","
    WITH head(nodes) AS a, last(nodes) AS v
    MATCH (v)<-[:PUBLISHED_IN]-(p:Quanta)<-[:CITES]-(d:Quanta)
    WITH apoc.coll.toList(count(d)) as venue_citations
        // assumes count(d) counts citations per venue and not the same as count(*)
    WHERE p.year <= """+str(i)+""" AND d.year <= """+str(i)+"""
    MERGE (a)-[m:METRICS_IN]->(y:Year {year:"""+str(i)+"""})
    SET m.venue_citations = [apoc.coll.avg(venue_citations), apoc.coll.min(venue_citations), apoc.coll.max(venue_citations)]",
    {batchSize:5000, iterateList:true, parallel:false});
    """
    
    run_query(query, graph)
    
    # Venue Papers
    # Sets m.venue_papers to [avg, min, max].
    # (This section was indented one level deeper than the rest of the loop
    # body, which made the whole cell fail to compile.)

    query = """
    CALL apoc.periodic.iterate("
    MATCH (a:Author)-[:AUTHORED]->(q:Quanta)-[:PUBLISHED_IN]->(v:Venue)
    WHERE q.year <= """+str(i)+"""
    RETURN [a,v] AS nodes
    ","
    WITH head(nodes) AS a, last(nodes) AS v
    MATCH (v)<-[:PUBLISHED_IN]-(p:Quanta)
    WITH apoc.coll.toList(count(p)) as venue_papers
        // assumes count(p) counts papers per venue and not the same as count(*)
    WHERE p.year <= """+str(i)+""" 
    MERGE (a)-[m:METRICS_IN]->(y:Year {year:"""+str(i)+"""})
    SET m.venue_papers = [apoc.coll.avg(venue_papers), apoc.coll.min(venue_papers), apoc.coll.max(venue_papers)]",
    {batchSize:5000, iterateList:true, parallel:false});
    """

    run_query(query, graph)
    
    # Venue Max Citations
    
    # Total Number of Venues
    # Sets m.total_venues.
    
    query = """
    CALL apoc.periodic.iterate("
    MATCH (a:Author)-[:AUTHORED]->(q:Quanta)-[:PUBLISHED_IN]->(v:Venue)
    WHERE q.year <= """+str(i)+"""
    WITH DISTINCT v
    RETURN [a,count(v)] AS info
        //assumes count(v) only counts venues and not same as count(*)
    ","
    WITH head(info) AS a, last(info) AS count
    MERGE (a)-[m:METRICS_IN]->(y:Year {year:"""+str(i)+"""})
    SET m.total_venues = count",
    {batchSize:5000, iterateList:true, parallel:false});
    """
    
    run_query(query, graph)


In [None]:
#def author_hindex(value):
#    MATCH (a:Author)-[:AUTHORED]->(q:Quanta)-[:PUBLISHED_IN]->(y:Year {year: value})
#    MATCH (p:Quanta)-[:CITES]->(q)
#    WITH y, count(p) as citations, a, collect(citations) as paper_citations
#    WITH y, a, apoc.coll.sort(paper_citations) as inverse, apoc.coll.reverse(inverse) as ordered_citations
#    WITH y, a, apoc.coll.toList([x in ordered_citations | {index: apoc.coll.indexOf(ordered_citations, x), value: x}]) as indexed_citations
#    WITH y, a, apoc.coll.toList([x in indexed_citations WHERE x.index-x.value<0 | x.index]) as filtered
#    RETURN [a, y, filtered[0] + 1] as result

#def mean_citations_year(value):
#    MATCH (a:Author)-[:AUTHORED]->(q:Quanta)-[:PUBLISHED_IN]->(y:Year {year: value})
#    MATCH (p:Quanta)-[:CITES]->(q)
#    RETURN count(p)/count(q) as mean

#def papers_published_year(value):
#    MATCH (a:Author)-[:AUTHORED]->(q:Quanta)-[:PUBLISHED_IN]->(y:Year {year: value})
#    RETURN count(q)

### Get features for Venue

In [None]:
def venue_hindex(v, value):
    # NOTE(review): the body below is Cypher text rather than Python, so this
    # cell cannot be parsed by the Python kernel as written. It reads as a
    # sketch of a per-venue h-index query -- confirm the intended form (e.g. a
    # query string passed to run_query) before turning it into real code.
    # Steps as written:
    #   1. match papers `q` of venue `v` linked to a Year node with year = value;
    #   2. match the papers `p` citing `q`;
    #   3. collect the citation counts, sort them and reverse the sorted list;
    #   4. pair each count with its position in that list;
    #   5. keep the positions where index - value < 0;
    #   6. return [v, y, first kept position + 1].
    MATCH (v:Venue)<-[:PUBLISHED_IN]-(q:Quanta)-[:PUBLISHED_IN]->(y:Year {year: value})
    MATCH (p:Quanta)-[:CITES]->(q)
    WITH y, count(p) as citations, v, collect(citations) as paper_citations
    WITH y, v, apoc.coll.sort(paper_citations) as inverse, apoc.coll.reverse(inverse) as ordered_citations
    WITH y, v, apoc.coll.toList([x in ordered_citations | {index: apoc.coll.indexOf(ordered_citations, x), value: x}]) as indexed_citations
    WITH y, v, apoc.coll.toList([x in indexed_citations WHERE x.index-x.value<0 | x.index]) as filtered
    RETURN [v, y, filtered[0] + 1] as result
    
#def venue_paper_citations(v, value):
#    MATCH (v:Venue)<-[:PUBLISHED_IN]-(q:Quanta)<-[:CITES]-(:Quanta)<-[:PUBLISHED_IN]-(y:Year)
#    WHERE y.year <= value
#    WITH count(*)/count(q) as venue_avg
    
#def venue_papers(v, value):
#    MATCH (v:Venue)<-[:PUBLISHED_IN]-(q:Quanta)<-[:PUBLISHED_IN]-(y:Year)
#    WHERE y.year <= value
#    WITH count(q) as venue_papers
    
def venue_papers_max(v):
    # NOTE(review): the body below is Cypher text rather than Python, so this
    # cell cannot be parsed by the Python kernel as written -- confirm the
    # intended form before turning it into real code.
    # As written it matches the papers `q` that cite papers published in venue
    # `v`, wraps count(q) in a set, and returns the maximum of that set as `max`.
    MATCH (v:Venue)<-[:PUBLISHED_IN]-(:Quanta)<-[:CITES]-(q:Quanta)
    WITH apoc.coll.toSet(count(q)) as counts
    RETURN apoc.coll.max(counts) as max

In [None]:
########## CALCULATING METRICS ##########
# AUTHOR HINDEX

MATCH (a:Author)-[:AUTHORED]->(q:Quanta)
MATCH (p:Quanta)-[:CITES]->(q)
WITH count(p) as citations, a, collect(citations) as paper_citations
WITH apoc.coll.sort(paper_citations) as inverse, apoc.coll.reverse(inverse) as ordered_citations
WITH apoc.coll.toList([x in ordered_citations | {index: apoc.coll.indexOf(ordered_citations, x), value: x}]) as indexed_citations
WITH apoc.coll.toList([x in indexed_citations WHERE x.index-x.value<0 | x.index]) as filtered
RETURN filtered[0] + 1 as hindex

# AUTHOR HINDEX DELTA
# can implement after doing the first one

CALL author_hindex(value) YIELD result[0] as a, result[1] as y, result[2] as hindex
CALL author_hindex(value+1) YIELD result[2] as second_hindex
RETURN hindex-second_hindex as hindex_delta

# AUTHOR CITATION COUNT

MATCH (a:Author)-[:AUTHORED]->(q:Quanta)
MATCH (p:Quanta)-[:CITES]->(q)
RETURN count(*)

# KEY CITATION COUNT

(need additional paper)

# AUTHOR CITATIONS DELTA
# can implement after first one

MATCH (a:Author)-[:AUTHORED]->(q:Quanta)-[:PUBLISHED_IN]->(y:Year)
WHERE y.year <= 2018
MATCH (p:Quanta)-[:CITES]->(q)
RETURN count(p)

# AUTHOR KEY CITATIONS DELTA
# can implement after first one

(need additional paper)

# AUTHOR MEAN CITATIONS PER PAPER

MATCH (a:Author)-[:AUTHORED]->(q:Quanta)
MATCH (p:Quanta)-[:CITES]->(q)
RETURN count(p)/count(q)

# AUTHOR MEAN CITATION PER PAPER DELTA
# can implement after first one

CALL mean_citations_year(value) YIELD mean as old
CALL mean_citations_year(value+1) YIELD mean as new
RETURN new-old

# AUTHOR MEAN CITATIONS PER YEAR

MATCH (a:Author)-[:AUTHORED]->(q:Quanta)-[:PUBLISHED_IN]->(y:Year)
MATCH (p:Quanta)-[:CITES]->(q)
WITH count(p) as citations, apoc.coll.toSet(y) as years, apoc.coll.min(years) as start_year
RETURN citations/(2019-start_year)

# AUTHOR PAPERS

MATCH (a:Author)-[:AUTHORED]->(q:Quanta)
RETURN count(q)

# AUTHOR PAPERS DELTA
# can implement after first one

CALL papers_published_year(value) YIELD mean as old
CALL papers_published_year(value+1) YIELD mean as new
RETURN new-old

# AUTHOR MEAN CITATION RANK 
# rank of author (between 0 and 1) among all other authors in terms of mean citations per year

(can be implemented after every author has their mean citations per year calculated)

# AUTHOR UNWEIGHTED PAGERANK (by coauthorship)

(already have base code)

# AUTHOR WEIGHTED PAGERANK (by coauthorship)

(already have base code)

# AUTHOR AGE

MATCH (a:Author)-[:AUTHORED]->(q:Quanta)-[:PUBLISHED_IN]->(y:Year)
WITH apoc.coll.toSet(y) as years
RETURN apoc.coll.min(years) as author_age

# AUTHOR RECENT NUM COAUTHORS

MATCH (a:Author)-[:AUTHORED]->(q:Quanta)-[:PUBLISHED_IN]->(y:Year)
WHERE year >= 2018
MATCH (b:Author)-[:AUTHORED]->(q)
WHERE b.name != a.name
RETURN count(b)

# AUTHOR MAX SINGLE PAPER CITATIONS

MATCH (a:Author)-[:AUTHORED]->(q:Quanta)<-[:CITES]-(p:Quanta)
WITH apoc.coll.toSet(count(p)) as paper_citations
RETURN apoc.coll.max(paper_citations) as max

# VENUE HINDEX

MATCH (a:Author)-[:AUTHORED]->(:Quanta)-[:PUBLISHED_IN]->(v:Venue)
CALL venue_hindex(v, value) YIELD apoc.coll.toList(result[2]) as hindexes
RETURN [apoc.coll.avg(hindexes), apoc.coll.min(hindexes), apoc.coll.max(hindexes)]

# VENUE HINDEX DELTA
# can implement after first one

MATCH (a:Author)-[:AUTHORED]->(:Quanta)-[:PUBLISHED_IN]->(v:Venue)
CALL venue_hindex(v, value) YIELD apoc.coll.toList({venue: v, index: result[2]}) as old_hindexes
CALL venue_hindex(v, value+1) YIELD apoc.coll.toList({venue: v, index: result[2]}) as new_hindexes
WITH apoc.coll.toList([x IN old_hindexes and y IN new_hindexes WHERE v.venue = y.venue | y.index-x.index]) as differences
RETURN [apoc.coll.avg(differences), apoc.coll.min(differences), apoc.coll.max(differences)]

# VENUE CITATIONS

MATCH (a:Author)-[:AUTHORED]->(:Quanta)-[:PUBLISHED_IN]->(v:Venue)<-[:PUBLISHED_IN]-(:Quanta)<-[:CITES]-(q:Quanta)
WITH apoc.coll.toList(count(q)) as venue_citations
RETURN [apoc.coll.avg(venue_citations), apoc.coll.min(venue_citations), apoc.coll.max(venue_citations)]


# VENUE CITATIONS DELTA
# can implement after first one

MATCH (a:Author)-[:AUTHORED]->(:Quanta)-[:PUBLISHED_IN]->(v:Venue)
CALL venue_paper_citations(v, value) YIELD apoc.coll.toList({venue: v, avg: venue_avg}) as old_avg
CALL venue_paper_citations(v, value+1) YIELD apoc.coll.toList({venue: v, avg: venue_avg}) as new_avg
WITH apoc.coll.toList([x IN old_avg and y IN new_avg WHERE v.venue = y.venue | y.avg-x.avg]) as differences
RETURN [apoc.coll.avg(differences), apoc.coll.min(differences), apoc.coll.max(differences)]

# VENUE PAPERS

MATCH (a:Author)-[:AUTHORED]->(:Quanta)-[:PUBLISHED_IN]->(v:Venue)<-[:PUBLISHED_IN]-(q:Quanta)
WITH apoc.coll.toList(count(q)) as venue_citations
RETURN [apoc.coll.avg(venue_citations), apoc.coll.min(venue_citations), apoc.coll.max(venue_citations)]

#VENUE PAPERS DELTA
# can implement after first one

MATCH (a:Author)-[:AUTHORED]->(:Quanta)-[:PUBLISHED_IN]->(v:Venue)
CALL venue_paper_citations(v, value) YIELD apoc.coll.toList({venue: v, count: venue_papers}) as old_count
CALL venue_paper_citations(v, value+1) YIELD apoc.coll.toList({venue: v, count: venue_papers}) as new_count
WITH apoc.coll.toList([x IN old_count and y IN new_count WHERE v.venue = y.venue | y.count-x.count]) as differences
RETURN [apoc.coll.avg(differences), apoc.coll.min(differences), apoc.coll.max(differences)]

# VENUE RANK 
# venue rank {mean, min, max}: ranks of venues (between 0 and 1) in which the author has published, determined by mean number of citations per paper

(can be implemented after every author has their mean citations per year calculated)

# VENUE MAX SINGLE PAPER CITATIONS

MATCH (a:Author)-[:AUTHORED]->(:Quanta)-[:PUBLISHED_IN]->(v:Venue)
CALL venue_paper_max(v) YIELD apoc.coll.toList(max) as maxes
RETURN [apoc.coll.avg(maxes), apoc.coll.min(maxes), apoc.coll.max(maxes)]

# TOTAL NUMBER OF VENUES

MATCH (a:Author)-[:AUTHORED]->(:Quanta)-[:PUBLISHED_IN]->(v:Venue)
WITH DISTINCT v
RETURN count(v)


In [None]:
# TODO test 
def author_mean_citations_per_paper_original(years):
    """Earlier draft that writes `citations_per_paper` for each author and year.

    For every year in `years` one `apoc.periodic.iterate` statement is built
    and sent through `run_query`.

    Args:
        years: iterable of years; each value is used as the inclusive cut-off
            for both papers and citations and as the `Year.year` to attach to.

    Returns:
        None.
    """
    for year in years:
        # The template used to contain four positional `{}` fields (one of
        # them inside the commented-out Cypher line) but was formatted with
        # three arguments, so str.format() raised before anything was sent.
        # A single named field avoids any argument-count mismatch.
        # NOTE(review): the Cypher itself is kept as drafted. `papers_written`
        # is not carried through the `WITH a, count(r) ...` clause, so the
        # final SET may be rejected by the database -- verify before use.
        query = """
        CALL apoc.periodic.iterate(
        "MATCH (a:Author)-[:AUTHORED]->(q:Quanta)
        WHERE q.year <= {year}
        RETURN [a,q,count(*)] AS info",
            // count(*) counts null values also - we are trying to count q I think

        "WITH info[0] AS a, info[1] AS q, info[2] as papers_written
        MATCH (q)<-[r:CITES]-(p:Quanta) WHERE p.year <= {year}
        WITH a, count(r) as citations
        MERGE (a)-[m:METRICS_IN]->(y:Year {{year:{year}}})
        SET m.citations_per_paper = citations / papers_written",
            //SET m.citations_per_paper = size((p:Quanta)-[:CITES]->(q) WHERE p.year <= {year})/papers_written",
        {{batchSize:5000, iterateList:true, parallel:false}});
        """.format(year=year)
        
        run_query(query, graph)

#author_mean_citations_per_paper(years)

# Testing Functions

In [None]:
# H-index

MATCH (a:Author {normalizedName: 'edward s boyden'})-[:AUTHORED]->(q:Quanta)
WHERE q.year <= 2008 
WITH a, collect(q) as papers
WITH a, papers, size(papers) as papers_written
UNWIND papers as paper
MATCH (p:Quanta)-[:CITES]->(paper) 
WHERE p.year <= 2008
WITH DISTINCT a.name as name, papers_written, count(p) as citations 
RETURN name, citations, papers_written

MATCH () 
WITH [0, 2, 5, 6, 3, 8] as paper_citations 
WITH apoc.coll.sort(paper_citations) as inverse 
WITH apoc.coll.reverse(inverse) as ordered_citations 
WITH apoc.convert.toList([x in ordered_citations WHERE apoc.coll.indexOf(ordered_citations, x)-x<0 | apoc.coll.indexOf(ordered_citations, x)]) as filtered 
RETURN CASE filtered 
WHEN [] 
THEN 0 
ELSE last(filtered) + 1 
END

MATCH () 
WITH [] as paper_citations 
WITH apoc.coll.sort(paper_citations) as inverse 
WITH apoc.coll.reverse(inverse) as ordered_citations 
WITH apoc.convert.toList([x in ordered_citations | {index: apoc.coll.indexOf(ordered_citations, x), value: x}]) as indexed_citations 
WITH apoc.convert.toList([x in indexed_citations WHERE x.index-x.value<0 | x.index]) as filtered 
RETURN CASE filtered 
WHEN [] 
THEN 0 
ELSE last(filtered) + 1 
END

MATCH () \
WITH [0, 0, 0] as paper_citations 
WITH apoc.coll.sort(paper_citations) as inverse 
WITH apoc.coll.reverse(inverse) as ordered_citations 
WITH apoc.convert.toList([x in ordered_citations | {index: apoc.coll.indexOf(ordered_citations, x), value: x}]) as indexed_citations 
WITH apoc.convert.toList([x in indexed_citations WHERE x.index-x.value<0 | x.index]) as filtered 
RETURN last(filtered)+1

MATCH () 
WITH [0, 2, 5, 6, 3, 8] as paper_citations 
WITH apoc.coll.sort(paper_citations) as inverse 
WITH apoc.coll.reverse(inverse) as ordered_citations 
WITH apoc.convert.toList([x in ordered_citations | {index: apoc.coll.indexOf(ordered_citations, x), value: x}]) as indexed_citations 
WITH apoc.convert.toList([x in indexed_citations WHERE x.index-x.value<0 | x.index]) as filtered 
RETURN last(filtered)+1

########## Failed ###########

MATCH ()
WITH [] as paper_citations 
RETURN CASE paper_citaions
WHEN []
THEN 0
ELSE 
WITH apoc.coll.sort(paper_citations) as inverse # <- FAILED HERE
WITH apoc.coll.reverse(inverse) as ordered_citations
WITH apoc.convert.toList([x in ordered_citations | {index: apoc.coll.indexOf(ordered_citations, x), value: x}]) as indexed_citations
WITH apoc.convert.toList([x in indexed_citations WHERE x.index-x.value<0 | x.index]) as filtered
RETURN last(filtered) + 1
END

In [None]:
def old_test_author_mean_citations_per_paper(author, year):
    """Query per-paper citation counts and their average for one author.

    `author` is matched against `Author.normalizedName`; `year` is the
    inclusive cut-off applied to both the author's papers and the citing
    papers. The query returns `a.name`, the collected `citations` list and
    its `avg`, and the result of `run_query(..., to_df=True)` is handed back.
    """
    template = """
    MATCH (a:Author {{normalizedName: '{}'}})-[r:AUTHORED]->(q:Quanta)
    WHERE q.year <= {}
    WITH a, collect(size((q)<-[:CITES]-(p:Quanta) WHERE p.year<={})) as citations
    RETURN a.name, citations, apoc.coll.avg(citations) as avg
    """
    # The year is substituted twice: once for the papers, once for the citations.
    cypher = template.format(author, year, year)
    return run_query(cypher, graph, to_df=True)

#author_mean_citations_paper = test_author_mean_citations_per_paper('edward s boyden', 2008)
#author_mean_citations_paper

def another_test_author_mean_citations_per_paper(author, year):
    """Query an author's total citation count and paper count as of `year`.

    Args:
        author: value matched against `Author.normalizedName`.
        year: inclusive cut-off applied to the author's papers and to the
            papers citing them.

    Returns:
        The result of `run_query(..., to_df=True)`, with the columns
        `name`, `citations` and `papers_written`.
    """
    # Literal Cypher map braces are doubled so that str.format() only fills
    # the three positional placeholders instead of raising on `{normalizedName: ...}`.
    query = """MATCH (a:Author {{normalizedName: '{}'}})-[:AUTHORED]->(q:Quanta)
    WHERE q.year <= {} 
    WITH a, collect(q) as papers
    WITH a, papers, size(papers) as papers_written
    UNWIND papers as paper
    MATCH (p:Quanta)-[:CITES]->(paper) 
    WHERE p.year <= {}
    WITH DISTINCT a.name as name, papers_written, count(p) as citations 
    RETURN name, citations, papers_written""".format(author, year, year)
    
    df = run_query(query, graph, to_df=True)
    return df

In [None]:
def author_mean_citations_per_paper_old(years):
    """Older draft that writes `citations_per_paper` for each author and year.

    For every year in `years` one `apoc.periodic.iterate` statement is built
    and sent through `run_query`. Earlier alternatives are preserved as `//`
    comments inside the query text.

    Args:
        years: iterable of years; each value is used as the inclusive cut-off
            in the query and as the `Year.year` to attach the metric to.

    Returns:
        None.
    """
    for year in years:
        # The template previously mixed an unescaped `{year:{}}` map with six
        # positional `{}` fields while only three arguments were supplied, so
        # str.format() raised before anything was sent. Literal braces are now
        # doubled and a single named field is used throughout. The stray
        # trailing backslashes on two comment lines were dropped; they only
        # joined one comment line to the next.
        # NOTE(review): the Cypher itself is kept as drafted; the
        # `size(<pattern> WHERE ...)` expression may not be accepted by the
        # database -- verify before use.
        query = """
        CALL apoc.periodic.iterate(
        "MATCH (a:Author)-[:AUTHORED]->(q:Quanta)
        WHERE q.year <= {year}
        RETURN [a,q, count(*)] AS info",
        
        "WITH info[0] AS a, info[1] AS q, info[2] as papers_written
         MERGE (a)-[m:METRICS_IN]->(y:Year {{year:{year}}})       
         SET m.citations_per_paper = size((p:Quanta)-[:CITES]->(q) WHERE p.year <= {year})/papers_written",
         
         //WITH a, count(r) as citations
         //MERGE (a)-[m:METRICS_IN]->(y:Year {{year:{year}}})
         //MATCH (q)<-[r:CITES]-(p:Quanta) WHERE p.year <= {year}

         //SET m.citations_per_paper = citations / papers_written

        //"WITH head(info) AS a, tail(info) as q,
        //collect(size((q)<-[:CITES]-(:Quanta))) as citations
        //MERGE (a)-[m:METRICS_IN]->(y:Year {{year:{year}}})
        //SET m.mean_citations_per_paper = apoc.coll.avg(citations)",
        {{batchSize:5000, iterateList:true, parallel:false}});
        """.format(year=year)
        
        run_query(query, graph)

#author_mean_citations_per_paper(years)