# Setup

In [None]:
from py2neo import Graph, Node, Relationship 
import glob, os, time
import datetime
import inspect
import pandas as pd

# Connection settings. The values below are the notebook's original defaults; set the
# NEO4J_URI / NEO4J_USER / NEO4J_PASSWORD environment variables to point at another
# server or to avoid relying on the credentials committed in this notebook.
# Test DB
#graph = Graph("bolt://54.88.167.164:7687", auth=("neo4j", "myneo")) 
# Development DB
#graph = Graph("bolt://54.174.175.98:7687", auth=("neo4j", "myneo")) 
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://matlaberp1.media.mit.edu:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "myneo")
graph = Graph(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

# Sanity check: report the size of the database we just connected to.
n_nodes = graph.database.primitive_counts['NumberOfNodeIdsInUse'] 
n_relationships = graph.database.primitive_counts['NumberOfRelationshipIdsInUse'] 
print("Connected to graph database with {:,} nodes and {:,} relationships!".format (n_nodes, n_relationships))

# Utility 

In [None]:
def query_to_df(query, graph):
    """Run `query` on `graph` and return the result as a DataFrame.

    Prints a start marker and, once the result has been materialised, the
    elapsed wall-clock time in minutes.
    """
    print("Starting query...", end=" ")
    started_at = time.time()
    cursor = graph.run(query)
    frame = cursor.to_data_frame()
    elapsed_minutes = (time.time() - started_at) / 60
    print("Done ({:.2f} minutes).".format(elapsed_minutes))
    return frame

In [None]:
def run_query(query, graph, print_query=False, run_query=True, print_only=False, to_df=False, verbose=True, return_time=False):
    """Optionally print and/or execute a Cypher query, reporting how long it took.

    Args:
        query: Cypher text to run.
        graph: object exposing ``run(query)``; the returned cursor must expose
            ``to_data_frame()`` when ``to_df`` is true.
        print_query: print the query text before running it.
        run_query: execute the query (note: this parameter shadows the function
            name inside the body).
        print_only: shorthand for ``print_query=True, run_query=False``.
        to_df: return the result as a DataFrame instead of the placeholder ``1``.
        verbose: print the elapsed time after the query has run.
        return_time: when ``to_df`` is false, return the elapsed minutes instead
            of ``1`` (the elapsed time is then printed regardless of ``verbose``).

    Returns:
        The DataFrame (``to_df``), the elapsed minutes (``return_time`` without
        ``to_df``), or ``1`` otherwise -- including when nothing was executed.
    """
    if print_only:
        print_query = True
        run_query = False

    start_time = time.time()
    if print_query:
        print(query)

    if not run_query:
        # Nothing was executed, so there is no elapsed time to report. The
        # previous implementation fell through to the timing print here and
        # raised UnboundLocalError on `minutes_elapsed`.
        return 1

    cursor = graph.run(query)
    result = cursor.to_data_frame() if to_df else 1
    minutes_elapsed = (time.time() - start_time) / 60

    wants_time = return_time and not to_df
    if verbose or wants_time:
        print("Query completed in {:.2f} minutes.".format(minutes_elapsed))

    return minutes_elapsed if wants_time else result

In [None]:
def get_num_nonnull_metrics_in_properties(prop):
    """Print how many Author->Year METRICS_IN edges have a non-null `prop`.

    Uses the module-level `graph`; always returns 1.
    """
    print('Number of nonnull {} properties in Author-Year edge'.format(prop))
    template = """
    MATCH (:Author)-[m:METRICS_IN]->(:Year)
    RETURN count(m.{}) as total
    """
    counts = run_query(template.format(prop), graph, to_df=True)
    print(counts.to_string())
    return 1

#get_num_nonnull_metrics_in_properties('hIndex')

In [None]:
def get_num_nonzero_metrics_in_properties(prop):
    """Print how many Author->Year METRICS_IN edges have `prop` greater than zero.

    Uses the module-level `graph`; always returns 1.
    """
    print('Number of nonzero {} properties in Author-Year edge'.format(prop))
    template = """
    MATCH (:Author)-[m:METRICS_IN]->(y:Year)
    WHERE m.{} > 0
    RETURN count(*)
    """
    counts = run_query(template.format(prop), graph, to_df=True)
    print(counts.to_string())
    return 1
 
#get_num_nonzero_metrics_in_properties('totalCitations')

In [None]:
# All the above functions in one cell for easy ipython import
from py2neo import Graph, Node, Relationship 
from tqdm import tqdm
import glob, os, time
import inspect

# Development DB
#graph = Graph("bolt://matlaberp1.media.mit.edu:7687", auth=("neo4j", "myneo"))
# NOTE(review): this rebinds `graph` to a different host than the Setup cell at the top of
# the notebook (which connects to matlaberp1.media.mit.edu) -- confirm which server is intended.
graph = Graph("bolt://18.85.22.109:7687", auth=("neo4j", "myneo"))
def query_to_df(query, graph):
    """Execute `query` against `graph`, returning the rows as a DataFrame.

    A progress line with the elapsed time (in minutes) is printed around the call.
    """
    print("Starting query...", end=" ")
    t0 = time.time()
    rows = graph.run(query).to_data_frame()
    minutes = (time.time() - t0) / 60
    print("Done ({:.2f} minutes).".format(minutes))
    return rows

def run_query(query, graph, print_query=False, run_query=True, print_only=False, to_df=False, verbose=True, return_time=False):
    """Optionally print and/or execute a Cypher query, reporting how long it took.

    Args:
        query: Cypher text to run.
        graph: object exposing ``run(query)``; the returned cursor must expose
            ``to_data_frame()`` when ``to_df`` is true.
        print_query: print the query text before running it.
        run_query: execute the query (note: this parameter shadows the function
            name inside the body).
        print_only: shorthand for ``print_query=True, run_query=False``.
        to_df: return the result as a DataFrame instead of the placeholder ``1``.
        verbose: print the elapsed time after the query has run.
        return_time: when ``to_df`` is false, return the elapsed minutes instead
            of ``1`` and always print the elapsed time. Accepted here so this
            definition stays call-compatible with the earlier ``run_query``
            cell that it replaces when both cells are run.

    Returns:
        The DataFrame (``to_df``), the elapsed minutes (``return_time`` without
        ``to_df``), or ``1`` otherwise -- including when nothing was executed.
    """
    if print_only:
        print_query = True
        run_query = False

    start_time = time.time()
    if print_query:
        print(query)

    if not run_query:
        # Nothing was executed, so there is no elapsed time to report. The
        # previous implementation fell through to the timing print here and
        # raised UnboundLocalError on `minutes_elapsed`.
        return 1

    cursor = graph.run(query)
    result = cursor.to_data_frame() if to_df else 1
    minutes_elapsed = (time.time() - start_time) / 60

    wants_time = return_time and not to_df
    if verbose or wants_time:
        print("Query completed in {:.2f} minutes.".format(minutes_elapsed))

    return minutes_elapsed if wants_time else result

# Years to process: 1810 through 2019 (range() excludes the 2020 end point).
years = range(1810, 2020)


In [None]:
# TODO list all of properties to check number of non-null and non-zero properties
# Property names checked on the (:Author)-[:METRICS_IN]->(:Year) edges by the loop below.
author_vars = [
    'hIndex', 'hIndexDelta', 'totalCitations', 'totalCitationsDelta', 'citationsPerPaper', 'citationsPerPaperDelta',
    'citationsPerYear', 'totalPapers', 'totalPapersDelta', 'rankCitationsPerYear', 'pageRank', 'weightedPageRank',
    'authorAge', 'recentCoauthors', 'maxCitations', 'totalVenues', 'venueHIndexMin', 'venueHIndexMean', 
    'venueHIndexMax', 'venueHIndexDeltaMin', 'venueHIndexDeltaMean', 'venueHIndexDeltaMax', 
    'venueCitationsPerPaperMin', 'venueCitationsPerPaperMean', 'venueCitationsPerPaperMax', 
    'venueCitationsPerPaperDeltaMin', 'venueCitationsPerPaperDeltaMean', 'venueCitationsPerPaperDeltaMax', 
    'venueTotalPapersMin', 'venueTotalPapersMean', 'venueTotalPapersMax', 'venueTotalPapersDeltaMin', 
    'venueTotalPapersDeltaMean', 'venueTotalPapersDeltaMax', 'venueRankCitationsPerPaperMin', 
    'venueRankCitationsPerPaperMean', 'venueRankCitationsPerPaperMax', 'venueMaxCitationsMin', 
    'venueMaxCitationsMean', 'venueMaxCitationsMax', 'totalVenuesMin', 'totalVenuesMean', 'totalVenuesMax']
# For every property, print the non-null count and then the greater-than-zero count.
for prop in author_vars:
    get_num_nonnull_metrics_in_properties(prop)
    get_num_nonzero_metrics_in_properties(prop)

# Run 1x on DB

### Run These First in Order 

In [None]:
# Needed to drop index on y.year first
def make_year_constraint():
    """Create a uniqueness constraint on Year.year using the module-level `graph`."""
    constraint_cypher = """
    CREATE CONSTRAINT ON (y:Year) ASSERT y.year IS UNIQUE
    """
    run_query(constraint_cypher, graph)
    
make_year_constraint()

In [None]:
def write_author_start_year():
    """Set a.startYear on every Author to the minimum year of the Quanta they authored.

    Runs as a single batched apoc.periodic.iterate call on the module-level `graph`.
    """
    start_year_cypher = """
    CALL apoc.periodic.iterate(

    "MATCH (a:Author)-[:AUTHORED]->(q:Quanta)
    WITH a, collect(q.year) as years
    WITH a, apoc.coll.min(years) as start_year
    RETURN [a, start_year] AS info",

    "WITH head(info) AS a, last(info) AS start_year
    SET a.startYear = start_year",
    {batchSize:5000, iterateList:true, parallel:true});
    """
    run_query(start_year_cypher, graph)
    
write_author_start_year()

In [None]:
# Years to process: 1810 through 2019 (range() excludes the 2020 end point).
years = range(1810, 2020)

### Run These Next in Any Order  (Can be Done in Parallel)

In [None]:
def create_author_year_metrics_in_edge(years):
    """Create (:Author)-[:METRICS_IN]->(:Year) edges for each year in `years`.

    For every year, each author whose startYear is on or before that year is
    MERGEd to that Year node, mirroring create_venue_year_metrics_in_edge.

    The previous query contained no placeholder for `year`, so the loop re-ran
    one identical statement per year and only ever linked an author to their
    start year; the per-year author metrics (authorAge, the *Delta properties)
    match on an edge for every year.
    """
    for year in years:
        print('Running query for year {}'.format(str(year)))

        query = """
        CALL apoc.periodic.iterate(

        "MATCH (a:Author)
        WHERE a.startYear <= $year
        MATCH (y:Year {{year: $year}})
        RETURN [a, y] as info",

        "WITH head(info) as a, last(info) as y
        MERGE (a)-[:METRICS_IN]->(y)",

        {{batchSize:5000, iterateList:true, parallel:true, params: {{year: {}}} }});
        """.format(year)

        run_query(query, graph)
            
create_author_year_metrics_in_edge(years)

In [None]:
def create_venue_year_metrics_in_edge(years):
    """Add (:Venue)-[:METRICS_IN]->(:Year) edges.

    For each year, every venue with at least one Quanta published in or
    before that year is MERGEd to that Year node.
    """
    fn_name = inspect.stack()[0][3]
    template = """
        CALL apoc.periodic.iterate(
        
        "MATCH (v:Venue)<-[r:PUBLISHED_IN]-(q:Quanta)
        WHERE q.year <= $year
        RETURN DISTINCT v",
        
        "MATCH (y:Year {{year: $year}})
        MERGE (v)-[:METRICS_IN]->(y)",

        {{batchSize:5000, iterateList:true, parallel:true, params: {{year: {}}} }});
        """
    for yr in years:
        print('Running query for year {} for {}'.format(str(yr), fn_name))
        run_query(template.format(yr), graph)

#create_venue_year_metrics_in_edge(years)

# Write (Author)-[:METRICS_IN]-(Year) Metrics 

### Run These First in Any Order

In [None]:
def author_age(years):
    """ Number of years since author published their first paper
        Adds m.authorAge to (:Author)-[m:METRICS_IN]->(:Year)

        authorAge is `year - a.startYear`, where startYear is the property
        written by write_author_start_year."""

    for year in years:
        print('Running query for year {} for {}'.format(str(year), inspect.stack()[0][3]))
        # Fixes over the previous query: the first statement now aliases its
        # result as `info` (the second statement reads `info`), and it reads
        # a.startYear rather than the never-written a.start_year.
        query = """
        CALL apoc.periodic.iterate(

        "MATCH (y:Year {{year:$year}})<-[m:METRICS_IN]-(a:Author)
        RETURN [m, a.startYear] AS info",

        "WITH head(info) AS m, last(info) AS start_year
        SET m.authorAge = $year - start_year",

        {{batchSize:5000, iterateList:true, parallel:true, params: {{year: {}}} }});
        """.format(year)

        run_query(query, graph)
    
author_age(years)

In [None]:
def author_papers_ct(years):
    """Write m.totalPapers on (:Author)-[m:METRICS_IN]->(:Year).

    The value is the number of Quanta the author AUTHORED with
    q.year on or before the edge's year.
    """
    fn_name = inspect.stack()[0][3]
    template = """
        CALL apoc.periodic.iterate(
        
        "MATCH (:Year {{year:$year}})<-[m:METRICS_IN]-(a:Author)
        WITH a, m
        MATCH (a)-[:AUTHORED]->(q:Quanta)
        WHERE q.year <= $year
        RETURN [m,count(*)] AS info",
            
        "WITH head(info) AS m, last(info) AS count
        SET m.totalPapers = count",
        
        {{batchSize:5000, iterateList:true, parallel:true, params: {{year: {}}} }});
        """
    for yr in years:
        print('Running query for year {} for {}'.format(str(yr), fn_name))
        run_query(template.format(yr), graph)

#author_papers_ct(years)

In [None]:
def author_citation_count(years):
    """Write m.totalCitations on (:Author)-[m:METRICS_IN]->(:Year).

    Counts CITES relationships into the author's Quanta where both the
    cited and the citing Quanta have a year on or before the edge's year.
    """
    fn_name = inspect.stack()[0][3]
    template = """
        CALL apoc.periodic.iterate(
        
        "MATCH (y:Year {{year:$year}})<-[m:METRICS_IN]-(a:Author)
        WITH a, m
        OPTIONAL MATCH (a)-[:AUTHORED]->(q:Quanta)<-[:CITES]-(p:Quanta)
        WHERE q.year <= $year AND p.year <= $year
        RETURN [m, count(p)] as info",
        
        "WITH head(info) as m, last(info) as citations
        SET m.totalCitations = citations",
        
        {{batchSize:5000, iterateList:true, parallel:true, params: {{year: {}}} }});
        """
    for yr in years:
        print('Running query for year {} for {}'.format(str(yr), fn_name))
        run_query(template.format(yr), graph)

#author_citation_count(years)

In [None]:
def author_h_index(years):
    """Write m.hIndex on (:Author)-[m:METRICS_IN]->(:Year).

    Per author and year: collect the per-paper citation counts (papers and
    citing papers dated on or before the year), sort them, and take the
    largest i such that the i-th highest count is at least i (0 if none).
    """
    fn_name = inspect.stack()[0][3]
    template = """
        CALL apoc.periodic.iterate(

        "MATCH (a:Author)-[m:METRICS_IN]->(y:Year {{year: $year}})
        WITH a, m
        MATCH (a)-[:AUTHORED]->(q:Quanta)
        WHERE q.year <= $year
        WITH m, q
        OPTIONAL MATCH (q)<-[:CITES]-(p:Quanta)
        WHERE p.year <= $year
        WITH m, q, size(collect(p)) as citations
        WITH m, collect(citations) as cites
        WITH m, apoc.coll.sort(cites) as cl
        WITH m, apoc.coll.max([i IN range(size(cl),1,-1) WHERE cl[-i] >= i| i]) as hIndex_with_nulls
        WITH m, coalesce(hIndex_with_nulls, 0) as hIndex
        RETURN [m, hIndex] as info",
        
        "WITH head(info) as m, last(info) as hIndex
        SET m.hIndex = hIndex",
        {{batchSize:5000, iterateList:true, parallel:true, params: {{year: {}}} }});

        """
    for yr in years:
        print('Running query for year {} for {}'.format(str(yr), fn_name))
        run_query(template.format(yr), graph)
        
#author_h_index(years)   

In [None]:
def author_max_single_paper_citations(years):
    """Write m.maxCitations on (:Author)-[m:METRICS_IN]->(:Year).

    The value is the highest per-paper citation count among the author's
    Quanta, considering only papers and citing papers dated on or before the year.
    """
    fn_name = inspect.stack()[0][3]
    template = """
        CALL apoc.periodic.iterate(
        
        "MATCH (y:Year {{year:$year}})<-[m:METRICS_IN]-(a:Author)
        MATCH (a)-[:AUTHORED]->(q:Quanta)
        WHERE q.year <= $year 
        WITH q, m
        OPTIONAL MATCH (q)<-[:CITES]-(p:Quanta)
        WHERE p.year <= $year
        WITH m, q, count(p) as citations
        WITH m, collect(citations) as all_citations
        WITH m, apoc.coll.max(all_citations) as max_citations
        RETURN [m, max_citations] AS info",

        "WITH head(info) AS m, last(info) AS max_citations
        SET m.maxCitations = max_citations",
        
        {{batchSize:5000, iterateList:true, parallel:true, params: {{year: {}}} }});
        """
    for yr in years:
        print('Running query for year {} for {}'.format(str(yr), fn_name))
        run_query(template.format(yr), graph)

#author_max_single_paper_citations(years)

In [None]:
def author_num_venues_published(years):
    """Write m.totalVenues on (:Author)-[m:METRICS_IN]->(:Year).

    Counts the distinct Venues reached via PUBLISHED_IN from the author's
    Quanta dated on or before the edge's year.
    """
    fn_name = inspect.stack()[0][3]
    template = """
        CALL apoc.periodic.iterate(
        
        "MATCH (y:Year {{year:$year}})<-[m:METRICS_IN]-(a:Author)-[:AUTHORED]->(q:Quanta)
        WHERE q.year <= $year
        OPTIONAL MATCH (q)-[:PUBLISHED_IN]->(v:Venue)
        RETURN [m, count(distinct v)] AS info",
        
        "WITH head(info) AS m, last(info) AS num_venues
        SET m.totalVenues = num_venues",
        {{batchSize:5000, iterateList:true, parallel:true, params: {{year: {}}} }});
        """
    for yr in years:
        print('Running query for year {} for {}'.format(str(yr), fn_name))
        run_query(template.format(yr), graph)
    
#author_num_venues_published(years)

In [None]:
def author_recent_num_coauthors(years):
    """ Number of distinct coauthors an author has had in the current and previous year
        Adds m.recentCoauthors to (:Author)-[m:METRICS_IN]->(:Year)

        A coauthor is any other Author sharing an AUTHORED Quanta dated
        `year` or `year - 1`; authors without such papers get 0."""

    for year in years:
        print('Running query for year {} for {}'.format(str(year), inspect.stack()[0][3]))
        # count(DISTINCT b) replaces length(collect(b)): the old form counted
        # the same coauthor once per shared paper and relied on length() over
        # a list, which is deprecated. count() already yields 0 when the
        # OPTIONAL MATCH finds nothing, so no coalesce is needed.
        query = """
        CALL apoc.periodic.iterate(

        "MATCH (y:Year {{year:$year}})<-[m:METRICS_IN]-(a:Author)
        WITH a, m
        OPTIONAL MATCH (a)-[:AUTHORED]->(q:Quanta)<-[:AUTHORED]-(b:Author)
        WHERE a <> b AND (q.year = $year OR q.year = $year - 1)
        WITH m, count(DISTINCT b) AS num_coauthors
        RETURN [m, num_coauthors] AS info",

        "WITH head(info) as m, last(info) as num_coauthors
        SET m.recentCoauthors = num_coauthors",
        {{batchSize:5000, iterateList:true, parallel:true, params: {{year: {}}} }});
        """.format(year)

        run_query(query, graph)
        
#author_recent_num_coauthors(years)

In [None]:
def author_unweighted_pagerank(years):
    """Write m.pageRank on (:Author)-[m:METRICS_IN]->(:Year).

    Two queries per year: algo.pageRank over the co-authorship projection
    (papers dated on or before the year) writes the score to the Author
    property `temporary`; a second batched query copies a.temporary onto
    the METRICS_IN edge for that year.
    """
    fn_name = inspect.stack()[0][3]
    rank_template = """
        CALL algo.pageRank(
        "MATCH (a:Author)-[m:METRICS_IN]->(y:Year {{year: {}}})
        RETURN id(a) as id",
        
        "MATCH (a1:Author)-[:AUTHORED]->(q:Quanta)<-[:AUTHORED]-(a2:Author) 
        WHERE q.year <= {} and a1 <> a2
        RETURN id(a1) as source, id(a2) as target",
        {{graph:'cypher', iterations:20, write:true, writeProperty:"temporary", concurrency:16}});
        """
    copy_template = """
        CALL apoc.periodic.iterate(
        
        "MATCH (a:Author)-[m:METRICS_IN]->(y:Year {{year: $year}})
        RETURN [a.temporary, m] as info",
        
        "WITH head(info) as pagerank, last(info) as m
        SET m.pageRank = pagerank",
        
        {{batchSize:10000, parallel:true, params: {{year: {}}} }});
        """
    for yr in years:
        print('Running query for year {} for {}'.format(str(yr), fn_name))
        run_query(rank_template.format(yr, yr), graph)

        # Copy the scores from the Author nodes onto the METRICS_IN relationship
        print('Writing pagerank values to edges for year {} for {}'.format(str(yr), fn_name))
        run_query(copy_template.format(yr), graph)

#author_unweighted_pagerank(years)

In [None]:
# TODO test - was previously giving same value for everything
def author_weighted_pagerank(years):
    """Write m.weightedPageRank on (:Author)-[m:METRICS_IN]->(:Year).

    Two queries per year: algo.pageRank over the co-authorship projection,
    weighted by the number of shared papers dated on or before the year,
    writes the score to the Author property `temporary`; a second batched
    query copies a.temporary onto the METRICS_IN edge for that year.
    """
    fn_name = inspect.stack()[0][3]
    rank_template = """
        CALL algo.pageRank(
        "MATCH (a:Author)
        WHERE exists((a)-[m:METRICS_IN]->(y:Year {{year: {}}})) 
        RETURN id(a) as id",
        
        "MATCH (a1:Author)-[:AUTHORED]->(q:Quanta)<-[:AUTHORED]-(a2:Author)
        WHERE q.year <= {} and a1.cleanName <> a2.cleanName
        RETURN id(a1) as source, id(a2) as target, count(q) as weight",
        {{graph:'cypher', iterations:20, write:true, writeProperty:"temporary", weightProperty: "weight", concurrency:16}});
        """
    copy_template = """
        CALL apoc.periodic.iterate(
        
        "MATCH (a:Author)-[m:METRICS_IN]->(y:Year {{year: $year}})
        RETURN [a.temporary, m] as info",
        
        "WITH head(info) as pagerank, last(info) as m
        SET m.weightedPageRank = pagerank",
        
        {{batchSize:10000, parallel:true, params: {{year: {}}} }});
        """
    for yr in years:
        print('Running query for year {} for {}'.format(str(yr), fn_name))
        run_query(rank_template.format(yr, yr), graph)

        # Copy the scores from the Author nodes onto the METRICS_IN relationship
        print('Writing pagerank values to edges for year {} for {}'.format(str(yr), fn_name))
        run_query(copy_template.format(yr), graph)

author_weighted_pagerank(years)

### Run Next in Any Order

In [None]:
# Relies on totalCitations and totalPapers
def author_mean_citations_per_paper(years):
    """Write m.citationsPerPaper on (:Author)-[m:METRICS_IN]->(:Year).

    Computed as m.totalCitations / m.totalPapers, rounded to three decimals,
    so both properties must already be present on the edge.
    """
    fn_name = inspect.stack()[0][3]
    template = """
        CALL apoc.periodic.iterate(
        
        "MATCH (y:Year {{year:$year}})<-[m:METRICS_IN]-(:Author)
        WITH m, (toFloat(m.totalCitations) / toFloat(m.totalPapers)) as value
        WITH m, round(1000 * value)/1000 AS cites_per_paper
        RETURN [m, cites_per_paper] as info",
        
        "WITH head(info) as m, last(info) as cites_per_paper
        SET m.citationsPerPaper = cites_per_paper",
        
        {{batchSize:5000, iterateList:true, parallel:true, params: {{year: {}}} }});
        """
    for yr in years:
        print('Running query for year {} for {}'.format(str(yr), fn_name))
        run_query(template.format(yr), graph)

#author_mean_citations_per_paper(years)

In [None]:
# Relies on authorAge and totalCitations
def author_mean_citations_per_year(years):
    """Write m.citationsPerYear on (:Author)-[m:METRICS_IN]->(:Year).

    Computed as m.totalCitations / (m.authorAge + 1), rounded to three
    decimals, so both properties must already be present on the edge.
    """
    fn_name = inspect.stack()[0][3]
    template = """
        CALL apoc.periodic.iterate(
        
        "MATCH (y:Year {{year:$year}})<-[m:METRICS_IN]-(a:Author)
        WITH m, m.authorAge as years, m.totalCitations as cites
        WITH m, (toFloat(cites) / toFloat(years + 1)) as value
        WITH m, (round(1000 * value)/1000) AS cites_per_year
        RETURN [m, cites_per_year] as info",
        
        "WITH head(info) as m, last(info) as cites_per_year
        SET m.citationsPerYear = cites_per_year",
        {{batchSize:5000, iterateList:true, parallel:true, params: {{year: {}}} }});
        """
    for yr in years:
        print('Running query for year {} for {}'.format(str(yr), fn_name))
        run_query(template.format(yr), graph)
        
author_mean_citations_per_year(years)

In [None]:
# Relies on citationsPerYear
def author_mean_citation_rank(years):
    """Write m.rankCitationsPerYear on (:Author)-[m:METRICS_IN]->(:Year).

    The value is m.citationsPerYear divided by the largest citationsPerYear
    among all authors for that year (0.001 is substituted when that maximum
    is 0), rounded to three decimals.
    """
    fn_name = inspect.stack()[0][3]
    template = """
        CALL apoc.periodic.iterate(
        
        "MATCH (:Author)-[r:METRICS_IN]->(y:Year {{year: $year}})
        WITH y, COLLECT(r.citationsPerYear) as cites
        WITH y, toFloat(apoc.coll.max(cites)) as maximum
        MATCH (a:Author)-[m:METRICS_IN]->(y)
        RETURN [m, CASE WHEN maximum = 0.0 THEN .001 ELSE maximum END] as info", //to prevent zero division
        
        "WITH head(info) as m, last(info) as max
        WITH m, (m.citationsPerYear / max) as value
        WITH m, round(1000 * value)/1000 AS rank
        SET m.rankCitationsPerYear = rank",
        
        {{batchSize:5000, iterateList:true, parallel:true, params: {{year: {}}} }});
        """
    for yr in years:
        print('Running query for year {} for {}'.format(str(yr), fn_name))
        run_query(template.format(yr), graph)
        
author_mean_citation_rank(years)

### Run Last in Any Order

In [None]:
def add_author_deltas_to_author_year_edge(metric, years):
    """Write m.{metric}Delta on (:Author)-[m:METRICS_IN]->(:Year).

    The delta is this year's m.{metric} minus the same author's value on the
    METRICS_IN edge for the previous year; authors without a previous-year
    edge are not matched and get no delta.
    """
    fn_name = inspect.stack()[0][3]
    template = """
        CALL apoc.periodic.iterate(
        
        "MATCH (:Year {{year: $year}})<-[m:METRICS_IN]-(a:Author)
        WITH a, m
        MATCH (a)-[pm:METRICS_IN]->(:Year {{year: ($year - 1)}}) 
        RETURN [m, m.{}-pm.{}] as info",
        
        "WITH head(info) as m, last(info) as delta
        SET m.{}Delta = delta",
        
        {{batchSize:5000, iterateList:true, parallel:true, params: {{year: {}}} }});
        """
    for yr in years:
        print('Running query for year {} for {}'.format(str(yr), fn_name))
        run_query(template.format(metric, metric, metric, yr), graph)

In [None]:
add_author_deltas_to_author_year_edge('totalPapers', years)

In [None]:
add_author_deltas_to_author_year_edge('totalCitations', years)

In [None]:
add_author_deltas_to_author_year_edge('citationsPerPaper', years)

In [None]:
add_author_deltas_to_author_year_edge('hIndex', years)

# Write (Venue)-[:METRICS_IN]-(Year) Metrics 

### Run First in Any Order

In [None]:
def add_venue_h_index_to_venue_year_edge(years):
    """Write m.hIndex on (:Venue)-[m:METRICS_IN]->(:Year).

    Per venue and year: collect the per-paper citation counts (papers and
    citing papers dated on or before the year), sort them, and take the
    largest i such that the i-th highest count is at least i (0 if none).
    """
    fn_name = inspect.stack()[0][3]
    template = """
        CALL apoc.periodic.iterate(
        
        "MATCH (v:Venue)-[m:METRICS_IN]->(:Year {{year: $year}})
        WITH v, m
        MATCH (v)<-[:PUBLISHED_IN]-(q:Quanta) 
        WHERE q.year <= $year
        WITH m, q
        OPTIONAL MATCH (q)<-[:CITES]-(p:Quanta) //no optional match was not adding 0 property to some nodes with papers but no cites
        WHERE p.year <= $year
        WITH m, q, size(collect(p)) as citations
        WITH m, collect(citations) as cites
        WITH m, apoc.coll.sort(cites) as cl
        WITH m, apoc.coll.max([i IN range(size(cl),1,-1) WHERE cl[-i] >= i| i]) as hIndex_with_nulls
        WITH m, coalesce(hIndex_with_nulls, 0) as hIndex
        RETURN [m, hIndex] as info",
        
        "WITH head(info) as m, last(info) as hIndex
        SET m.hIndex = hIndex",
        {{batchSize:5000, iterateList:true, parallel:true, params: {{year: {}}} }});
        """
    for yr in years:
        print('Running query for year {} for {}'.format(str(yr), fn_name))
        run_query(template.format(yr), graph)
        
add_venue_h_index_to_venue_year_edge(years)

In [None]:
def add_venue_paper_ct_to_venue_year_edge(years):
    """Write m.totalPapers on (:Venue)-[m:METRICS_IN]->(:Year).

    The value is the number of Quanta PUBLISHED_IN the venue with
    q.year on or before the edge's year.
    """
    fn_name = inspect.stack()[0][3]
    template = """
        CALL apoc.periodic.iterate(
        
        "MATCH (y:Year {{year: $year}})<-[m:METRICS_IN]-(v:Venue)
        WITH m, v
        MATCH (v)<-[:PUBLISHED_IN]-(q:Quanta)
        WHERE q.year <= $year
        WITH m, count(q) as paper_ct 
        RETURN [m, paper_ct] as info",
        
        "WITH head(info) as m, last(info) as paper_ct
        SET m.totalPapers = paper_ct", 

        {{batchSize:5000, iterateList:true, parallel:true, params: {{year: {}}} }});
        """
    for yr in years:
        print('Running query for year {} for {}'.format(str(yr), fn_name))
        run_query(template.format(yr), graph)

add_venue_paper_ct_to_venue_year_edge(years)

In [None]:
def add_venue_max_citations_to_venue_year_edge(years):
    """Write m.maxCitations on (:Venue)-[m:METRICS_IN]->(:Year).

    The value is the highest per-paper citation count among the venue's
    Quanta, considering only papers and citing papers dated on or before the year.
    """
    fn_name = inspect.stack()[0][3]
    template = """
        CALL apoc.periodic.iterate(
        
        "MATCH (y:Year {{year:$year}})<-[m:METRICS_IN]-(v:Venue)
        WITH m, v
        MATCH (v)<-[:PUBLISHED_IN]-(q:Quanta)
        WHERE q.year <= $year 
        WITH m, q
        OPTIONAL MATCH (q)<-[:CITES]-(p:Quanta)
        WHERE p.year <= $year
        WITH m, q, count(p) as citations
        WITH m, collect(citations) as all_citations
        WITH m, apoc.coll.max(all_citations) as max_citations
        RETURN [m, max_citations] AS info",

        "WITH head(info) AS m, last(info) AS max_citations
        SET m.maxCitations = max_citations",
        
        {{batchSize:5000, iterateList:true, parallel:true, params: {{year: {}}} }});
        """
    for yr in years:
        print('Running query for year {} for {}'.format(str(yr), fn_name))
        run_query(template.format(yr), graph)

add_venue_max_citations_to_venue_year_edge(years)

### Run Next in Order

In [None]:
# refactor to use m.totalPapers property that exists
# todo test
def add_venue_citations_per_paper_to_venue_year_edge(years):
    """ Citations received by the venue's papers up to that year, divided by m.totalPapers
        Adds m.citationsPerPaper to (:Venue)-[m:METRICS_IN]->(:Year)

        Relies on m.totalPapers already being present on the venue edge.
        The result is rounded to three decimals."""

    for year in years:
        print('Running query for year {} for {}'.format(str(year), inspect.stack()[0][3]))
        # The start node is restricted to :Venue. Without the label the
        # pattern also matched (:Author)-[:METRICS_IN]->(:Year) edges and
        # overwrote the authors' citationsPerPaper with 0 (authors have no
        # PUBLISHED_IN papers). The cited paper is also limited to
        # q.year <= $year so the numerator covers the same papers that
        # m.totalPapers counts.
        query = """
        CALL apoc.periodic.iterate(

        "MATCH (v:Venue)-[m:METRICS_IN]->(:Year {{year: $year}})
        WITH v, m
        OPTIONAL MATCH (v)<-[:PUBLISHED_IN]-(q:Quanta)<-[:CITES]-(p:Quanta)
        WHERE q.year <= $year AND p.year <= $year
        WITH m, count(p) as cites
        WITH m, (toFloat(cites) / toFloat(m.totalPapers)) as avg_cites_paper
        WITH m, (round(1000 * avg_cites_paper)/1000) AS cites_per_paper
        RETURN [m, cites_per_paper] as info",

        "WITH head(info) as m, last(info) as cites_per_paper
        SET m.citationsPerPaper = cites_per_paper",

        {{batchSize:5000, iterateList:true, parallel:true, params: {{year: {}}} }});
        """.format(year)

        run_query(query, graph)

add_venue_citations_per_paper_to_venue_year_edge(years)

In [None]:
def add_venue_rank_to_venue_year_edge(years):
    """Write m.rankCitationsPerPaper on (:Venue)-[m:METRICS_IN]->(:Year).

    The value is m.citationsPerPaper divided by the largest citationsPerPaper
    among all venues for that year (0.001 is substituted when that maximum
    is 0), rounded to three decimals.
    """
    fn_name = inspect.stack()[0][3]
    template = """
        CALL apoc.periodic.iterate(
        
        "MATCH (y:Year {{year: $year}})<-[r:METRICS_IN]-(:Venue)
        WITH y, COLLECT(r.citationsPerPaper) as cites_per_paper    //citations per paper should be float
        WITH y, apoc.coll.max(cites_per_paper) as maximum  
        MATCH (v:Venue)-[m:METRICS_IN]->(y)
        RETURN [m, CASE WHEN maximum = 0.0 THEN .001 ELSE maximum END] as info", //to prevent zero division
        
        "WITH head(info) as m, last(info) as max
        WITH m, (m.citationsPerPaper / max) as value 
        WITH m, (round(1000 * value)/1000) AS rank
        SET m.rankCitationsPerPaper = rank",

        {{batchSize:5000, iterateList:true, parallel:true, params: {{year: {}}} }});
        """
    for yr in years:
        print('Running query for year {} for {}'.format(str(yr), fn_name))
        run_query(template.format(yr), graph)

add_venue_rank_to_venue_year_edge(years)

### Run Next in Any Order

In [None]:
# Run after add_venue_paper_ct_to_venue_year_edge, add_venue_citations_per_paper_to_venue_year_edge and add_venue_h_index_to_venue_year_edge have written 'totalPapers', 'citationsPerPaper' and 'hIndex'
def add_venue_delta_stats_to_venue_year_edge(metric, years):
    """Write m.{metric}Delta on (:Venue)-[m:METRICS_IN]->(:Year).

    The delta is this year's m.{metric} minus the same venue's value on the
    METRICS_IN edge for the previous year (a null difference is stored as 0);
    venues without a previous-year edge are not matched.
    """
    fn_name = inspect.stack()[0][3]
    template = """
        CALL apoc.periodic.iterate(
        
        "MATCH (:Year {{year:$year}})<-[m:METRICS_IN]-(v:Venue)
        WITH v, m
        MATCH (v)-[pm:METRICS_IN]->(:Year {{year: ($year - 1)}})
        WITH m, (m.{} - pm.{}) as delta
        RETURN [m, delta] as info",
        
        "WITH head(info) as m, last(info) as delta
        SET m.{}Delta = coalesce(delta, 0)",
        
        {{batchSize:5000, iterateList:true, parallel:true, params: {{year: {}}} }});
        """
    for yr in years:
        print('Running query for year {} for {} on {}'.format(str(yr), fn_name, metric))
        run_query(template.format(metric, metric, metric, yr), graph)

In [None]:
add_venue_delta_stats_to_venue_year_edge('totalPapers', years)

In [None]:
add_venue_delta_stats_to_venue_year_edge('citationsPerPaper', years)

In [None]:
add_venue_delta_stats_to_venue_year_edge('hIndex', years)

### Run Last

In [None]:
def add_zero_to_venue_year_edge_for_deltas_in_venue_first_year(years):
    """ Zero-fill the delta properties on venue edges that have no totalPapersDelta
        Sets m.totalPapersDelta, m.hIndexDelta and m.citationsPerPaperDelta to 0.0
        on (:Venue)-[m:METRICS_IN]->(:Year) edges where m.totalPapersDelta is missing"""

    for year in years:
        print('Running query for year {} for {}'.format(str(year), inspect.stack()[0][3]))
        # The earlier version carried a second
        # `MATCH (a)-[m:METRICS_IN]->(y:Year ...)` clause that only re-matched
        # the already bound relationship `m`; it has been dropped.
        query = """
        CALL apoc.periodic.iterate(

        "MATCH (v:Venue)-[m:METRICS_IN]->(y:Year {{year: $year}})
        WHERE not exists(m.totalPapersDelta)
        RETURN m",

        "SET m.totalPapersDelta = 0.0,
            m.hIndexDelta = 0.0,
            m.citationsPerPaperDelta = 0.0",

        {{batchSize:5000, iterateList:true, parallel:true, params: {{year: {}}} }});
        """.format(year)

        run_query(query, graph)

In [None]:
# The function takes only `years` and zero-fills all three venue delta properties in one
# pass; the previous call also passed a metric name, which raises TypeError.
add_zero_to_venue_year_edge_for_deltas_in_venue_first_year(years)

# Write (Author)-[:METRICS_IN]-(Year) Metrics  for Venue Features

### Run in Any Order

In [None]:
def add_venue_stats_to_author_year_edge(metric, years):
    """ Aggregate a venue metric over the venues an author has published in
        Adds m.venue{metric}Min, m.venue{metric}Mean and m.venue{metric}Max
        to (:Author)-[m:METRICS_IN]->(:Year)

        `metric` is the capitalised suffix used in the author property names
        (e.g. 'HIndex'); the venue edge property read is the same name with a
        lower-case first letter (e.g. 'hIndex'). For each author and year the
        values are taken from the (:Venue)-[:METRICS_IN]->(:Year) edges of the
        distinct venues of the author's papers dated on or before that year."""

    # The venue property name is derived in Python and interpolated directly.
    # The earlier query passed the metric to apoc.text.decapitalize() as an
    # unquoted identifier and then read `r.prop`, i.e. a property literally
    # named "prop", so the requested metric was never read.
    prop = metric[:1].lower() + metric[1:]

    for year in years:
        print('Running query for year {} for {}_{}'.format(str(year), inspect.stack()[0][3], metric))
        query = """
        CALL apoc.periodic.iterate(

        "MATCH (y:Year {{year:$year}})<-[m:METRICS_IN]-(a:Author)
        WITH a, m, y
        MATCH (a)-[:AUTHORED]->(q:Quanta)
        WHERE q.year <= $year
        WITH m, y, q
        MATCH (q)-[:PUBLISHED_IN]->(v:Venue)
        WITH DISTINCT v, m, y
        MATCH (v)-[r:METRICS_IN]->(y)
        WITH m, collect(r.{prop}) as stats
        WITH m, stats, apoc.coll.min(stats) as min
        WITH m, min, stats, apoc.coll.avg(stats) as mean
        WITH m, min, mean, apoc.coll.max(stats) as max
        RETURN [m, min, mean, max] as info",

        "WITH info[0] as m, info[1] as min, info[2] as mean, info[3] as max
        SET m.venue{metric}Min = min,
            m.venue{metric}Mean = mean,
            m.venue{metric}Max = max",

        {{batchSize:5000, iterateList:true, parallel:false, params: {{year: {year}}} }});
        """.format(prop=prop, metric=metric, year=year)

        run_query(query, graph)

In [None]:
add_venue_stats_to_author_year_edge('HIndex', years) 

In [None]:
add_venue_stats_to_author_year_edge('TotalPapers', years)

In [None]:
add_venue_stats_to_author_year_edge('CitationsPerPaper', years)

In [None]:
add_venue_stats_to_author_year_edge('MaxCitations', years)

In [None]:
add_venue_stats_to_author_year_edge('RankCitationsPerPaper', years)

In [None]:
add_venue_stats_to_author_year_edge('TotalPapersDelta', years)

In [None]:
add_venue_stats_to_author_year_edge('CitationsPerPaperDelta', years)

In [None]:
add_venue_stats_to_author_year_edge('HIndexDelta', years)

### Run After Everything Else Has Been Added

In [None]:
def add_zero_for_deltas_in_author_first_year(years):
    """Write 0.0 into the delta properties of METRICS_IN edges with authorAge 0.

    One apoc.periodic.iterate call is issued per entry of `years`. It selects
    every [m:METRICS_IN] edge that points at the matching Year node and has
    m.authorAge = 0, then sets the seven *Delta properties named in the query
    to 0.0.
    """
    # NOTE(review): the three venue*Delta keys written here have no
    # Min/Mean/Max suffix, unlike the names listed in `venue_properties`
    # elsewhere in this notebook -- confirm these are the intended keys.
    own_name = inspect.stack()[0][3]
    statement_template = """
        CALL apoc.periodic.iterate(
        
        "MATCH (a)-[m:METRICS_IN]->(y:Year {{year: $year}})
        WHERE m.authorAge = 0
        RETURN m",
        
        "SET m.hIndexDelta = 0.0,
            m.totalCitationsDelta = 0.0,
            m.totalPapersDelta = 0.0,
            m.citationsPerPaperDelta = 0.0,
            m.venueHIndexDelta = 0.0,
            m.venueCitationsPerPaperDelta = 0.0,
            m.venueTotalPapersDelta = 0.0",
        
        {{batchSize:5000, iterateList:true, parallel:true, params: {{year: {}}} }});
        """

    for current_year in years:
        print('Running query for year {} for {}'.format(str(current_year), own_name))
        run_query(statement_template.format(current_year), graph)

In [None]:
add_zero_for_deltas_in_author_first_year(years)        

# Work-In-Progress

In [None]:
def write_author_features_to_quanta(metric, years):
    """Summarize an author metric onto (:Quanta)-[m:METRICS_IN]->(:Year) edges.

    For each year, every Quanta with a METRICS_IN edge to that year is joined
    to its authors (via AUTHORED). The metric is read from each author's own
    [:METRICS_IN] edge to the same year, and its minimum, mean and maximum are
    written to m.author{metric}Min, m.author{metric}Mean and
    m.author{metric}Max on the Quanta's edge.

    Args:
        metric: Metric name with a capitalized first letter (e.g. 'HIndex').
            It is used verbatim in the property names that are written, and
            with its first letter lower-cased (e.g. 'hIndex') as the name of
            the author-edge property that is read.
        years: Iterable of year values; one batched query is run per year.

    Raises:
        ValueError: If `metric` is not a plain identifier. The name is spliced
            into the Cypher text as a property key, so anything else would
            produce a malformed query.
    """
    if not (isinstance(metric, str) and metric.isidentifier()):
        raise ValueError("metric must be a property-name identifier, got {!r}".format(metric))

    # Resolve the camelCase key here: property keys cannot be Cypher
    # parameters, and the earlier `apoc.text.decapitalize(<bare name>)` +
    # `r.prop` form never read the intended property.
    author_prop = metric[:1].lower() + metric[1:]
    function_name = inspect.stack()[0][3]

    for year in years:
        print('Running query for year {} for {}_{}'.format(str(year), function_name, metric))
        # Named fields keep the template and its arguments in step; the
        # positional version had five slots but only three arguments.
        query = """
        
        CALL apoc.periodic.iterate(
        
        "MATCH (y:Year {{year:$year}})<-[m:METRICS_IN]-(q:Quanta)
        WITH q, m, y
        MATCH (q)<-[:AUTHORED]-(a:Author)-[r:METRICS_IN]->(y)
        WITH m, collect(r.{prop}) as stats
        WITH m, stats, apoc.coll.min(stats) as min
        WITH m, min, stats, apoc.coll.avg(stats) as mean
        WITH m, min, mean, apoc.coll.max(stats) as max
        RETURN [m, min, mean, max] as info",
        
        "WITH info[0] as m, info[1] as min, info[2] as mean, info[3] as max
        SET m.author{metric}Min = min,
            m.author{metric}Mean = mean,
            m.author{metric}Max = max",
        
        {{batchSize:5000, iterateList:true, parallel:false, params: {{year: {year}}} }});
        """.format(prop=author_prop, metric=metric, year=year)

        run_query(query, graph)
  


In [None]:
# Property-name lists used when inspecting the METRICS_IN edges.
# Each aggregated statistic is stored three times, with a Min / Mean / Max
# suffix, so those names are generated from the base names.
venue_properties = [
    base_name + aggregate
    for base_name in (
        'venueHIndex',
        'venueHIndexDelta',
        'venueCitationsPerPaper',
        'venueCitationsPerPaperDelta',
        'venueTotalPapers',
        'venueTotalPapersDelta',
        'venueRankCitationsPerPaper',
        'venueMaxCitations',
        'totalVenues',
    )
    for aggregate in ('Min', 'Mean', 'Max')
]

# The 16 author properties, in camelCase.
author_properties = [
    'hIndex',
    'hIndexDelta',
    'totalCitations',
    'totalCitationsDelta',
    'citationsPerPaper',
    'citationsPerPaperDelta',
    'citationsPerYear',
    'totalPapers',
    'totalPapersDelta',
    'rankCitationsPerYear',
    'pageRank',
    'weightedPageRank',
    'authorAge',
    'recentCoauthors',
    'maxCitations',
    'totalVenues',
]

all_properties = author_properties + venue_properties
print(all_properties)

# The same author names with the first letter upper-cased. A query that takes
# these needs to lower-case the first letter again (e.g. with
# apoc.text.decapitalize) before using them as property names.
author_properties_upper = [name[:1].upper() + name[1:] for name in author_properties]

# venueHIndex, venueHIndexDelta, venueCitationsPerPaper, venueCitationsPerPaperDelta, venueTotalPapers
# venueTotalPapersDelta, venueRankCitationsPerPaper, venueMaxCitations, 

# Old

In [None]:
# works - new function above was refactored to use m.totalPapers property that exists
def add_venue_citations_per_paper_to_venue_year_edge_old(years):
    """Store citations-per-paper on each (:Venue)-[m:METRICS_IN]->(:Year) edge.

    For every year the query counts the venue's papers with q.year <= year,
    counts the CITES edges arriving from papers with p.year <= year, divides
    the two, rounds to three decimals and writes the result to
    m.citationsPerPaper on the venue's edge to that year.
    """
    own_name = inspect.stack()[0][3]
    progress_line = 'Running query for year {} for {}'
    cypher_template = """
        CALL apoc.periodic.iterate(
        
        "MATCH (v:Venue)<-[r:PUBLISHED_IN]-(q:Quanta)
        WHERE q.year <= $year
        WITH DISTINCT v, toFloat(count(q)) as paper_ct 
        
        OPTIONAL MATCH (v)<-[:PUBLISHED_IN]-(z:Quanta)<-[:CITES]-(p:Quanta)
        WHERE p.year <= $year
        WITH v, paper_ct, coalesce(count(p), 0) as cites 
        WITH v, (toFloat(cites) / toFloat(paper_ct)) as avg_cites_paper
        WITH v, (round(1000 * avg_cites_paper)/1000) AS cites_per_paper
        RETURN [v, cites_per_paper] as info",
        
        "WITH head(info) as v, last(info) as cites_per_paper
        MATCH (v)-[m:METRICS_IN]->(y:Year {{year: $year}})
        SET m.citationsPerPaper = cites_per_paper", 

        {{batchSize:5000, iterateList:true, parallel:true, params: {{year: {}}} }});
        """

    for target_year in years:
        print(progress_line.format(str(target_year), own_name))
        statement = cypher_template.format(target_year)
        run_query(statement, graph)

add_venue_citations_per_paper_to_venue_year_edge_old(years)

In [None]:
def author_citation_delta(years):
    """Store the year-over-year change in total_citations on author edges.

    For every year, each (:Author)-[m:METRICS_IN]->(:Year) edge is paired with
    the same author's edge to the previous year (pm), and
    m.total_citations - pm.total_citations is written to m.citation_delta.
    Authors without an edge to the previous year are not matched.
    """
    own_name = inspect.stack()[0][3]
    cypher_template = """
        CALL apoc.periodic.iterate(
        "
        MATCH (a:Author)-[m:METRICS_IN]->(y:Year {{year: $year}})
        WITH a, m
        MATCH (a)-[pm:METRICS_IN]->(:Year {{year:($year - 1)}}) 
        RETURN [m, m.total_citations-pm.total_citations] as info",
        
        "WITH info[0] as m, info[1] as delta
        SET m.citation_delta = delta",
        
        {{batchSize:5000, iterateList:true, parallel:true, params: {{year: {}}} }});
        """

    for current_year in years:
        banner = 'Running query for year {} for {}'.format(str(current_year), own_name)
        print(banner)
        run_query(cypher_template.format(current_year), graph)
        
author_citation_delta(years)

In [None]:
def author_papers_delta(years):
    """Store the year-over-year change in total_papers on author edges.

    For every year, each (:Author)-[m:METRICS_IN]->(:Year) edge is paired with
    the same author's edge to the previous year (pm), and
    m.total_papers - pm.total_papers is written to m.papers_delta.
    Authors without an edge to the previous year are not matched.
    """
    own_name = inspect.stack()[0][3]
    # The `//previous year` Cypher comment must stay on its own line end:
    # it is part of the statement text sent to the server.
    cypher_template = """
        CALL apoc.periodic.iterate(
        
        "MATCH (y:Year {{year:$year}})<-[m:METRICS_IN]-(a:Author)
        WITH a, m
        MATCH (a)-[pm:METRICS_IN]->(:Year {{year: ($year - 1)}}) //previous year
        RETURN [m, m.total_papers-pm.total_papers] as info",
        
        "WITH info[0] as m, info[1] as delta
        SET m.papers_delta = delta",
        
        {{batchSize:5000, iterateList:true, parallel:true, params: {{year: {}}} }});
        """

    for current_year in years:
        print('Running query for year {} for {}'.format(str(current_year), own_name))
        statement = cypher_template.format(current_year)
        run_query(statement, graph)
        
author_papers_delta(years)

In [None]:
def author_mean_citations_per_paper_delta(years):
    """Store the year-over-year change in citations_per_paper on author edges.

    For every year, each (:Author)-[m:METRICS_IN]->(:Year) edge is paired with
    the same author's edge to the previous year (pm), and
    m.citations_per_paper - pm.citations_per_paper is written to
    m.citations_per_paper_delta. Authors without an edge to the previous year
    are not matched.
    """
    own_name = inspect.stack()[0][3]
    cypher_template = """
        CALL apoc.periodic.iterate(
        
        "MATCH (a:Author)-[m:METRICS_IN]->(y:Year {{year:$year}})
        WITH a, m
        MATCH (a)-[pm:METRICS_IN]->(:Year {{year:($year - 1)}}) 
        RETURN [m, m.citations_per_paper-pm.citations_per_paper] as info",
        
        "WITH info[0] as m, info[1] as delta
        SET m.citations_per_paper_delta = delta",
        
        {{batchSize:5000, iterateList:true, parallel:true, params: {{year: {}}} }});
        """

    for current_year in years:
        year_label = str(current_year)
        print('Running query for year {} for {}'.format(year_label, own_name))
        run_query(cypher_template.format(current_year), graph)
          
author_mean_citations_per_paper_delta(years)

In [None]:
def author_h_index_delta(years):
    """Store the year-over-year change in hIndex on author edges.

    For every year, each (:Author)-[m:METRICS_IN]->(:Year) edge is paired with
    the same author's edge to the previous year (pm), and
    m.hIndex - pm.hIndex is written to m.hIndex_delta. Authors without an edge
    to the previous year are not matched.
    """
    own_name = inspect.stack()[0][3]
    cypher_template = """
        CALL apoc.periodic.iterate(
        
        "MATCH (a:Author)-[m:METRICS_IN]->(:Year {{year: $year}})
        WITH a, m
        MATCH (a)-[pm:METRICS_IN]->(:Year {{year: ($year - 1)}})
        RETURN [m, m.hIndex-pm.hIndex] as info",
        
        "WITH head(info) as m, last(info) as delta
        SET m.hIndex_delta = delta",
        
        {{batchSize:5000, iterateList:true, parallel:false, params: {{year: {}}} }});
        """

    for current_year in years:
        print('Running query for year {} for {}'.format(str(current_year), own_name))
        run_query(cypher_template.format(current_year), graph)
        
author_h_index_delta(years)