In [None]:
import json
import time

from py2neo import Graph, Node, Relationship
#from py2neo.Graph import database 

# Connect to the Neo4j instance over Bolt.
# TODO: get authentication working; the server currently runs with NEO4J_AUTH=none.
graph = Graph("bolt://neo4j:7687")
# Alternative when running against a local instance:
# graph = Graph('bolt://localhost:7687', bolt=True)

# Destructive: uncomment only to wipe the whole database.
# graph.delete_all()

# Report the size of the database as a quick connectivity check.
graph_db = graph.database
n_nodes = graph_db.primitive_counts['NumberOfNodeIdsInUse']
n_relationships = graph_db.primitive_counts['NumberOfRelationshipIdsInUse']
connected_message = "Connected to graph database with {:,} nodes and {:,} relationships!"
print(connected_message.format(n_nodes, n_relationships))

*Goal*: Calculate the top 1,000 {patents, papers} {overall, by individual year} by {PageRank, ArticleRank, time-scaled PageRank, time-scaled ArticleRank}.

# Top Overall

## Top patents by citation count

In [None]:
# Top patents by citations
#
# Each titled patent is ranked by the larger of two citation counts:
#   * google_citations - the `n_citation` property stored on the node
#   * graph_citations  - the number of incoming CITES relationships from patents
#
# Cypher variables are case-sensitive, so the WITH aliases must match the names
# used in RETURN.  `max()` is a single-argument aggregate in Cypher, so the
# larger of the two values is picked with a CASE expression instead; when
# `n_citation` is null the comparison is null and the graph count is used.
query = """
MATCH (p:Patent)
WHERE p.title IS NOT NULL AND p.title <> ""
WITH
    p.title as title,
    p.year as year,
    p.id as id,
    p.n_citation as google_citations,
    SIZE((p)<-[:CITES]-(:Patent)) as graph_citations
WITH
    title, year, id, google_citations, graph_citations,
    CASE
        WHEN google_citations > graph_citations THEN google_citations
        ELSE graph_citations
    END as max_citations
ORDER BY max_citations DESC
RETURN title, id, year, graph_citations, google_citations, max_citations
LIMIT 1000
"""
query_start_time = time.time()
df = graph.run(query).to_data_frame()
print(query)
print("Done query in {:.2f} minutes.".format((time.time()-query_start_time)/60))

df.head()

In [None]:
# Keep the first row for every title and every id (drop_duplicates keeps the
# first occurrence), then give the columns their export names.
#
# Columns are renamed by name rather than by position: the citation query does
# not return an authors column, and a positional `df.columns = [...]` silently
# mislabels the data whenever the column order differs from the label order.
# Labels in the mapping with no matching column are ignored by `rename`.
df = (
    df
    .drop_duplicates(subset='title')
    .drop_duplicates(subset='id')
    .rename(columns={
        'title': 'Title',
        'id': 'ID',
        'year': 'Year',
        'graph_citations': 'Citations_Graph',
        'google_citations': 'Citations_GooglePatents',
        'max_citations': 'Citations_Max',
    })
)
df.to_csv(path_or_buf='/tmp/data/patents_by_citation.csv')

In [None]:
# Top patents by citations
# NOTE(review): this cell repeats the citation query of the cell before it;
# consider removing one of the two copies.
#
# Each titled patent is ranked by the larger of two citation counts:
#   * google_citations - the `n_citation` property stored on the node
#   * graph_citations  - the number of incoming CITES relationships from patents
#
# `max()` is a single-argument aggregate in Cypher, so the larger of the two
# values is picked with a CASE expression; when `n_citation` is null the
# comparison is null and the graph count is used.
query = """
MATCH (p:Patent)
WHERE p.title IS NOT NULL AND p.title <> ""
WITH
    p.title as title,
    p.year as year,
    p.id as id,
    p.n_citation as google_citations,
    SIZE((p)<-[:CITES]-(:Patent)) as graph_citations
WITH
    title, year, id, google_citations, graph_citations,
    CASE
        WHEN google_citations > graph_citations THEN google_citations
        ELSE graph_citations
    END as max_citations
ORDER BY max_citations DESC
RETURN title, id, year, graph_citations, google_citations, max_citations
LIMIT 1000
"""
query_start_time = time.time()
df = graph.run(query).to_data_frame()
print(query)
print("Done query in {:.2f} minutes.".format((time.time()-query_start_time)/60))

df.head()

In [None]:
# Keep the first row for every title and every id (drop_duplicates keeps the
# first occurrence), then give the columns their export names.
#
# Columns are renamed by name rather than by position: the citation query does
# not return an authors column, and a positional `df.columns = [...]` silently
# mislabels the data whenever the column order differs from the label order.
# Labels in the mapping with no matching column are ignored by `rename`.
df = (
    df
    .drop_duplicates(subset='title')
    .drop_duplicates(subset='id')
    .rename(columns={
        'title': 'Title',
        'id': 'ID',
        'year': 'Year',
        'graph_citations': 'Citations_Graph',
        'google_citations': 'Citations_GooglePatents',
        'max_citations': 'Citations_Max',
    })
)
df.to_csv(path_or_buf='/tmp/data/patents_by_citation.csv')

## Top patents by PageRank

In [None]:
# Top patents by PageRank
#
# Reads the `pagerank` property stored on each Patent node and returns the
# highest-ranked patents that have a non-empty title and at least one author.
query = """
MATCH (p:Patent)
WHERE p.title IS NOT NULL AND p.title <> "" AND (size(p.authors)>0)
WITH 
    p.title as title, 
    p.year as year,
    p.id as id, 
    p.authors as authors, 
    p.n_citation as google_citations,
    p.pagerank as pagerank
ORDER BY pagerank DESC
RETURN title, id, year, authors, google_citations, pagerank
LIMIT 100000
"""
query_start_time = time.time()
# The query must actually run here: otherwise `df` still holds the result of
# an earlier cell and the cells below operate on the wrong columns.
df = graph.run(query).to_data_frame()
print(query)
print("Done query in {:.2f} minutes.".format((time.time()-query_start_time)/60))

df.head()

In [None]:
# Keep the first row for every title and every id, flatten the author list
# into a comma-separated string, and give the columns their export names.
#
# Columns are renamed by name rather than by position so the labels stay
# correct regardless of the column order `to_data_frame()` produces.
df = (
    df
    .drop_duplicates(subset='title')
    .drop_duplicates(subset='id')
    .assign(authors=lambda frame: frame['authors'].apply(', '.join))
    .rename(columns={
        'authors': 'Authors',
        'google_citations': 'Citations_GooglePatents',
        'id': 'ID',
        'pagerank': 'PageRank',
        'title': 'Title',
        'year': 'Year',
    })
)
df.to_csv(path_or_buf='/tmp/data/result/patents_by_pagerank.csv')

In [None]:
# One row per year: keep the first row encountered for each 'Year' value
# (presumably the top-ranked patent of that year, given `df` is ordered by
# rank - confirm against the cell that builds `df`), newest year first.
df_years = (
    df
    .drop_duplicates(subset='Year')
    .sort_values(by='Year', ascending=False)
)
df_years.to_csv(path_or_buf='/tmp/data/result/top_yearly_patent_by_pagerank.csv')
df_years

## Top inventors by PageRank

In [None]:

# Top inventors by PageRank
#
# Aggregates per author name over the patents they authored: number of
# patents, summed `n_citation` and summed `pagerank`, ordered by the latter.
# Only authors whose name contains at least one space, and only patents with a
# non-empty title and at least one listed author, are considered.
query = """
MATCH (a:Author)-[:AUTHORED]->(p:Patent)
WHERE 
    (a.name <> "") AND 
    (a.name IS NOT NULL) AND 
    (size(split(a.name, ' ')) > 1) AND 
    (p.title IS NOT NULL) AND 
    (p.title <> "") AND 
    (size(p.authors)>0)
WITH 
  a.name AS name, 
  COUNT(p) AS num_patents,
  SUM(p.n_citation) AS sum_citations, 
  SUM(p.pagerank) AS sum_pagerank
ORDER BY sum_pagerank DESC
LIMIT 2000
RETURN *
"""
query_start_time = time.time()
df = graph.run(query).to_data_frame()
print("Done query in {:.2f} minutes.".format((time.time()-query_start_time)/60))

df.head()

In [None]:
# Relabel the four aggregate columns for export and write them to CSV.
# (De-duplicating on name is intentionally left disabled.)
inventor_columns = ['Name', 'Num_Patents', 'Sum_Citations', 'Sum_PageRank']
df.columns = inventor_columns
df.to_csv(path_or_buf='/tmp/data/result/inventors_by_pagerank.csv')
df

## Top patents by ArticleRank

In [None]:
# Top patents by ArticleRank
#
# Streams ArticleRank scores over the Patent-CITES->Patent graph (Cypher
# projection) and returns the 10,000 highest-scoring patents.
# NOTE(review): the score column is aliased `PageRank` in the query although it
# holds the ArticleRank score; the export cell relabels it.
query = """
CALL algo.articleRank.stream(
  'MATCH (p:Patent) RETURN id(p) as id',
  'MATCH (p1:Patent)-[:CITES]->(p2:Patent) RETURN id(p1) as source, id(p2) as target',
  {graph:'cypher'}
) YIELD node, score 
WITH
    node as p,
    score
ORDER BY score DESC
LIMIT 10000
RETURN 
    p.title as Title, 
    p.authors as Authors,
    p.id as ID,
    p.year as Year,
    p.n_citation as Citations_GooglePatents,
    score as PageRank
"""

query_start_time = time.time()
df = graph.run(query).to_data_frame()
elapsed_minutes = (time.time() - query_start_time) / 60
print("Done query in {:.2f} minutes.".format(elapsed_minutes))

In [None]:
# Preview the first rows of the most recent query result.
df.head()

In [None]:
# Keep the first row for every title and every id, flatten the author list
# into a comma-separated string, and give the columns their export names.
#
# The ArticleRank query does not filter on authors, so a patent may come back
# without an author list; such rows get an empty string instead of raising.
# Columns are renamed by name rather than by position so the labels stay
# correct regardless of the column order `to_data_frame()` produces.
df = (
    df
    .drop_duplicates(subset='Title')
    .drop_duplicates(subset='ID')
    .assign(Authors=lambda frame: frame['Authors'].apply(
        lambda names: ', '.join(names) if isinstance(names, (list, tuple)) else ''
    ))
    .rename(columns={
        'Citations_GooglePatents': 'n_citations',
        'PageRank': 'ArticleRank',
    })
)
df.to_csv(path_or_buf='/tmp/data/top_patent_by_articlerank.csv')

## Top inventors by citation count TODO

In [None]:
# Top inventors by Citations
#
# Aggregates per author name over the patents they authored: number of
# patents, summed `n_citation` and summed `pagerank`.  The ranking uses the
# summed citation count (not the summed PageRank) to match this section.
# Only authors whose name contains at least one space, and only patents with a
# non-empty title and at least one listed author, are considered.
query = """
MATCH (a:Author)-[:AUTHORED]->(p:Patent)
WHERE 
    (a.name <> "") AND 
    (a.name IS NOT NULL) AND 
    (size(split(a.name, ' ')) > 1) AND 
    (p.title IS NOT NULL) AND 
    (p.title <> "") AND 
    (size(p.authors)>0)
WITH 
  a.name AS name, 
  COUNT(p) AS num_patents,
  SUM(p.n_citation) AS sum_citations, 
  SUM(p.pagerank) AS sum_pagerank
ORDER BY sum_citations DESC
LIMIT 1000
RETURN *
"""
query_start_time = time.time()
df = graph.run(query).to_data_frame()
print("Done query in {:.2f} minutes.".format((time.time()-query_start_time)/60))

df.head()