**Goal**: Calculate the top 1,000 {patents, papers} {overall, by individual year} by {PageRank, ArticleRank, time-scaled PageRank, time-scaled ArticleRank}.

# Setup

### Connect to the graph

In [None]:
import time
import json
from py2neo import Graph, Node, Relationship

# Open a Bolt connection to the Neo4j server that holds the citation graph.
# NOTE(review): an earlier note on this cell said authentication still needed
# work and that the server ran with NEO4J_AUTH=none; credentials are passed
# regardless -- confirm which setup is current.
graph = Graph("bolt://neo4j:7687", auth=('neo4j','myneo'))

# Read the two store counters and print them as a quick connectivity check.
n_nodes = graph.database.primitive_counts['NumberOfNodeIdsInUse']
n_relationships = graph.database.primitive_counts['NumberOfRelationshipIdsInUse']
connected_message = "Connected to graph database with {:,} nodes and {:,} relationships!"
print(connected_message.format(n_nodes, n_relationships))

### Write PageRank and ArticleRank to the nodes

In [None]:
# Run ArticleRank and PageRank over every Quanta node and CITES relationship,
# writing each score back onto the nodes under the lower-cased algorithm name
# ('articlerank' / 'pagerank').
write_rank_template = """
    CALL algo.{}(
        'MATCH (p:Quanta) RETURN id(p) as id',
        'MATCH (p1:Quanta)-[:CITES]->(p2:Quanta) RETURN id(p1) as source, id(p2) as target',
        {{graph:'cypher', iterations:35, write:true, writeProperty:'{}'}});
    """
for alg in ('articleRank', 'pageRank'):
    # First placeholder: procedure name; second: node property to write to.
    query = write_rank_template.format(alg, alg.lower())
    query_start_time = time.time()
    graph.run(query)
    elapsed_minutes = (time.time() - query_start_time) / 60
    print("Wrote {} to full graph in {:.2f} minutes.".format(alg.capitalize(), elapsed_minutes))

# Calculate top quanta overall

### By citations


In [None]:
# Top overall quanta by citations
#
# Exports, via apoc.export.csv.query, the quanta that have a non-empty title,
# venue "Nature" and year > 2010, ordered by citation count. Two counts are
# reported per quantum: the n_citation property stored on the node and the
# number of incoming CITES relationships in the graph; max_citations is the
# larger of the two.

filepath = '/import/result/top_quanta_overall_citations.csv'

# Cypher's max() is an aggregating function that takes a single expression,
# so the row-wise "larger of two values" is written as a CASE expression.
# coalesce() keeps a missing n_citation from turning the comparison into null
# (graph_citations is always > 0 at this point because of the WHERE above).
query = """
CALL apoc.export.csv.query("MATCH (q:Quanta)
WHERE 
    q.title IS NOT NULL AND 
    q.title <> \\"\\" AND 
    q.venue=\\"Nature\\" AND q.year>2010
WITH 
    q.title as title, 
    q.year as year,
    q.id as id, 
    q.n_citation as google_citations, 
    SIZE((q)<-[:CITES]-(:Quanta)) as graph_citations
WHERE graph_citations > 0
WITH *, 
    CASE 
        WHEN coalesce(google_citations, 0) > graph_citations THEN google_citations 
        ELSE graph_citations 
    END as max_citations
ORDER BY max_citations DESC
RETURN title, id, year, graph_citations, google_citations, max_citations
","{}", {{}})
""".format(filepath)

# print(query)
query_start_time = time.time()
graph.run(query)
print("Done query in {:.2f} minutes.".format((time.time()-query_start_time)/60))

### By PageRank and ArticleRank

In [None]:
# Top overall quanta by PageRank and ArticleRank
#
# For each algorithm, exports (via apoc.export.csv.query) the quanta with a
# non-empty title, venue "Nature" and year > 2010, ordered by that algorithm's
# stored score. Both scores are included in every export; only the sort key
# and the output file name change between iterations.
for alg in ['pageRank', 'articleRank']:
    
    filepath = '/import/result/top_quanta_overall_{}.csv'.format(alg.lower())

    # First placeholder: the node property to sort by ('pagerank' or
    # 'articlerank'); second placeholder: the CSV path.
    query = """
    CALL apoc.export.csv.query("MATCH (q:Quanta)
    WHERE 
        q.title IS NOT NULL AND 
        q.title <> \\"\\" AND 
        q.venue=\\"Nature\\" AND q.year>2010
    WITH 
        q.title as title, 
        q.year as year,
        q.id as id, 
        q.pagerank as pagerank, 
        q.articlerank as articlerank
    ORDER BY {} DESC
    RETURN title, id, year, pagerank, articlerank
    ","{}", {{}})
    """.format(alg.lower(), filepath)

    # print(query)
    
    # The export was previously commented out, so the timing line reported a
    # finished query although nothing had been written; run it for real.
    query_start_time = time.time()
    graph.run(query)
    print("Done query in {:.2f} minutes.".format((time.time()-query_start_time)/60))

### By time-scaled PageRank and time-scaled ArticleRank

In [None]:
# Top overall quanta by TIME SCALED PageRank and ArticleRank
#
# Builds, for each algorithm, a query that takes the stored score of every
# titled, authored quantum, groups by publication year, keeps the 1,000
# highest-scoring quanta of each year, and expresses each score as its distance
# from that year's mean in units of that year's standard deviation.
#
# NOTE(review): this cell only builds the query text; nothing executes it yet.
for alg in ['pageRank', 'articleRank']:
    # {rank} is the node property holding the score ('pagerank'/'articlerank').
    # The collected map carries every field that is read back after UNWIND;
    # `year` is the grouping key, so it is returned directly.
    query = """
    MATCH (p:Quanta)
    WHERE p.title IS NOT NULL AND p.title <> "" AND (size(p.authors)>0)
    WITH 
        p.id AS id, 
        p.title AS title,
        p.year AS year, 
        p.{rank} AS score
    ORDER BY score DESC
    WITH year, COLLECT({{id: id, title: title, score: score}})[..1000] AS data, AVG(score) AS avg_score, stDev(score) as stdDev
    UNWIND data AS d
    RETURN  
        d.id AS id,
        d.title AS title, 
        year, 
        ABS(d.score-avg_score)/stdDev AS scaled_score;
    """.format(rank=alg.lower())

## Top Patents by PageRank

In [None]:
# Top patents by PageRank
#
# Fetches up to 100,000 titled, authored patents ordered by their stored
# pagerank property, together with the n_citation count stored on the node.
query = """
MATCH (p:Patent)
WHERE p.title IS NOT NULL AND p.title <> "" AND (size(p.authors)>0)
WITH 
    p.title as title, 
    p.year as year,
    p.id as id, 
    p.authors as authors, 
    p.n_citation as google_citations,
    p.pagerank as pagerank
ORDER BY pagerank DESC
RETURN title, id, year, authors, google_citations, pagerank
LIMIT 100000
"""
# The query has to run here: the cells below read `df` and expect the
# lower-case column names returned above.
query_start_time = time.time()
df = graph.run(query).to_data_frame()
# print(query)
print("Done query in {:.2f} minutes.".format((time.time()-query_start_time)/60))

df.head()

In [None]:
# Remove repeated titles, then repeated ids (the first occurrence is kept).
df.drop_duplicates(subset='title', inplace=True)
df.drop_duplicates(subset='id', inplace=True)

# Flatten each author list into one comma-separated string for the CSV.
df['authors'] = df['authors'].apply(', '.join)

# Relabel by name instead of by position: assigning `df.columns = [...]` only
# labels correctly when the frame's columns arrive in alphabetical order, which
# depends on the py2neo/pandas versions in use. The explicit column selection
# keeps the CSV layout the positional version produced.
patent_column_labels = {
    'authors': 'Authors',
    'google_citations': 'Citations_GooglePatents',
    'id': 'ID',
    'pagerank': 'PageRank',
    'title': 'Title',
    'year': 'Year',
}
df = df.rename(columns=patent_column_labels)[
    ['Authors', 'Citations_GooglePatents', 'ID', 'PageRank', 'Title', 'Year']
]
df.to_csv(path_or_buf='/tmp/data/result/patents_by_pagerank.csv')

In [None]:
# Reduce the patent table to a single row per Year (drop_duplicates keeps the
# first row it meets for each year), newest year first, and save it.
# NOTE(review): presumably the kept row is that year's highest-PageRank patent,
# which holds only if `df` is still in PageRank-descending order -- confirm.
df_years = (
    df.drop_duplicates(subset='Year')
      .sort_values(by='Year', ascending=False)
)
df_years.to_csv(path_or_buf='/tmp/data/result/top_yearly_patent_by_pagerank.csv')
df_years

## Top inventors by PageRank

In [None]:

# Top inventors by summed PageRank
#
# Aggregates, per author name, the number of authored patents, the sum of
# their n_citation values and the sum of their pagerank values, and keeps the
# 2,000 authors with the largest summed pagerank. Names that do not split into
# at least two space-separated parts are excluded.
query = """
MATCH (a:Author)-[:AUTHORED]->(p:Patent)
WHERE 
    (a.name <> "") AND 
    (a.name IS NOT NULL) AND 
    (size(split(a.name, ' ')) > 1) AND 
    (p.title IS NOT NULL) AND 
    (p.title <> "") AND 
    (size(p.authors)>0)
WITH 
  a.name AS name, 
  COUNT(p) AS num_patents,
  SUM(p.n_citation) AS sum_citations, 
  SUM(p.pagerank) AS sum_pagerank
ORDER BY sum_pagerank DESC
LIMIT 2000
RETURN *
"""
query_start_time = time.time()
df = graph.run(query).to_data_frame()
print("Done query in {:.2f} minutes.".format((time.time()-query_start_time)/60))

df.head()

In [None]:
# df.drop_duplicates(subset='name', inplace=True)

# Relabel by name rather than by position so the labels cannot drift if the
# column order of the `RETURN *` result differs from the order assumed here.
inventor_column_labels = {
    'name': 'Name',
    'num_patents': 'Num_Patents',
    'sum_citations': 'Sum_Citations',
    'sum_pagerank': 'Sum_PageRank',
}
df = df.rename(columns=inventor_column_labels)[
    ['Name', 'Num_Patents', 'Sum_Citations', 'Sum_PageRank']
]
df.to_csv(path_or_buf='/tmp/data/result/inventors_by_pagerank.csv')
df

## Top patents by ArticleRank

In [None]:
# Top patents by ArticleRank
#
# Streams ArticleRank over the Patent-only citation sub-graph (nothing is
# written back to the nodes) and returns the 10,000 best-scoring patents.
# NOTE: the score column is aliased "PageRank" in the query although it holds
# the ArticleRank score; the clean-up cell below relabels it.
query = """
CALL algo.articleRank.stream(
  'MATCH (p:Patent) RETURN id(p) as id',
  'MATCH (p1:Patent)-[:CITES]->(p2:Patent) RETURN id(p1) as source, id(p2) as target',
  {graph:'cypher'}
) YIELD node, score 
WITH
    node as p,
    score
ORDER BY score DESC
LIMIT 10000
RETURN 
    p.title as Title, 
    p.authors as Authors,
    p.id as ID,
    p.year as Year,
    p.n_citation as Citations_GooglePatents,
    score as PageRank
"""

query_start_time = time.time()
df = graph.run(query).to_data_frame()
elapsed_minutes = (time.time() - query_start_time) / 60
print("Done query in {:.2f} minutes.".format(elapsed_minutes))

In [None]:
# Preview the first rows of the current `df`.
df.head()

In [None]:
# Remove repeated titles, then repeated ids (the first occurrence is kept).
df.drop_duplicates(subset='Title', inplace=True)
df.drop_duplicates(subset='ID', inplace=True)

# Flatten each author list into one comma-separated string for the CSV.
df['Authors'] = df['Authors'].apply(', '.join)

# Relabel by name instead of by position: assigning `df.columns = [...]` only
# labels correctly when the frame's columns arrive in alphabetical order, which
# depends on the py2neo/pandas versions in use. The query aliases the
# ArticleRank score as "PageRank", so that column is renamed here as well.
articlerank_column_labels = {
    'Citations_GooglePatents': 'n_citations',
    'PageRank': 'ArticleRank',
}
df = df.rename(columns=articlerank_column_labels)[
    ['Authors', 'n_citations', 'ID', 'ArticleRank', 'Title', 'Year']
]
df.to_csv(path_or_buf='/tmp/data/top_patent_by_articlerank.csv')

## Top inventors by citation count TODO

In [None]:
# Top inventors by Citations
#
# Aggregates, per author name, the number of authored patents, the sum of
# their n_citation values and the sum of their pagerank values, and keeps the
# 1,000 authors with the largest summed citation count. Names that do not
# split into at least two space-separated parts are excluded.
query = """
MATCH (a:Author)-[:AUTHORED]->(p:Patent)
WHERE 
    (a.name <> "") AND 
    (a.name IS NOT NULL) AND 
    (size(split(a.name, ' ')) > 1) AND 
    (p.title IS NOT NULL) 
    AND (p.title <> "") AND 
    (size(p.authors)>0)
WITH 
  a.name AS name, 
  COUNT(p) AS num_patents,
  SUM(p.n_citation) AS sum_citations, 
  SUM(p.pagerank) AS sum_pagerank
ORDER BY sum_citations DESC
LIMIT 1000
RETURN *
"""
# The ranking key is sum_citations: this cell ranks by citations, whereas the
# "Top inventors by PageRank" cell ranks by sum_pagerank.
query_start_time = time.time()
df = graph.run(query).to_data_frame()
print("Done query in {:.2f} minutes.".format((time.time()-query_start_time)/60))

df.head()

In [None]:
## Top Papers by Time-Scaled PageRank

In [None]:
# Time Scaled PageRank that returns top 1000 papers per year
#
# For every cutoff year, PageRank is streamed over the sub-graph of Quanta
# published before that cutoff. Within each publication year the 1,000
# highest-scoring works are kept, and each score is expressed as its distance
# from that year's mean in units of that year's standard deviation.
# Results: one DataFrame per cutoff is appended to `dfs` (tagged with a
# `cutoff_year` column); `df` holds the frame of the last cutoff.

# Same credentials as the connection cell at the top of the notebook.
graph = Graph("bolt://neo4j:7687", auth=('neo4j','myneo'))

# Run STREAMING PageRank once per cutoff year, from start_year to end_year
# (inclusive) in increments of `step`
start_time = time.time()
start_year, end_year, step = 1985, 2010, 5
dfs = []
for year in range(start_year, end_year+1, step):
    
    # < IS MUCH FASTER THAN <=
    print("Running PageRank on works from < {}...".format(year), end=" ")
    query_start_time = time.time()
    # {year} is the cutoff; it is applied to the node query and to both ends
    # of every CITES relationship so the relationship list only refers to
    # nodes that were actually loaded. The collected map carries every field
    # that is read back after UNWIND; the publication year is the grouping
    # key, so it is returned directly.
    query = """
    CALL algo.pageRank.stream(
    'MATCH (p:Quanta) WHERE p.year < {year} RETURN id(p) as id',
    'MATCH (p1:Quanta)-[:CITES]->(p2:Quanta) WHERE p1.year < {year} AND p2.year < {year} RETURN id(p1) as source, id(p2) as target',
    {{graph:'cypher', iterations:20, write:false, concurrency:20}})
    YIELD node, score
    WITH 
        node.id AS id, 
        node.title AS title,
        node.lang AS lang, 
        node.year AS year, 
        node.keywords AS keywords, 
        node.fos AS fos, 
        node.publisher AS publisher,
        score AS pagerank
    ORDER BY pagerank DESC
    WITH 
        year, 
        COLLECT({{id: id, title: title, lang: lang, keywords: keywords, fos: fos, publisher: publisher, pagerank: pagerank}})[..1000] AS data, 
        AVG(pagerank) AS avg_page_rank, 
        stDev(pagerank) as stdDev
    UNWIND data AS d
    RETURN year, 
        d.id AS id,
        d.title AS title, 
        d.lang AS lang, 
        d.keywords as keywords, 
        d.fos as fos, 
        d.publisher as publisher,
        ABS(d.pagerank-avg_page_rank)/stdDev AS scaled_score;
    """.format(year=year)
    
    #print(query)
    
    df = graph.run(query).to_data_frame()
    # Tag the rows with the cutoff they were computed for and keep the frame;
    # previously every iteration overwrote `df` and `dfs` stayed empty.
    df['cutoff_year'] = year
    dfs.append(df)
    # Completes the progress line started by the print above.
    print("done in {:.2f} minutes.".format((time.time()-query_start_time)/60))
    
    
print("Finished all calculations in {:.2f} minutes.".format((time.time()-start_time)/60))
