# Convert H5 files to CSV for Import

In [None]:
import numpy as np
import pandas as pd
import glob
import csv

H5_SUFFIX = '.h5'

# Columns that hold a list per row and are flattened to one ';'-separated
# string so that they fit into a single CSV field.
LIST_COLUMNS = ['classifications', 'inventors', 'cited_by',
                'patent_citations', 'also_published_as']


def h5_to_csv_path(h5_path):
    """Return *h5_path* with its trailing ``.h5`` extension swapped for ``.csv``.

    Only the final extension is rewritten, so a ``.h5`` that appears earlier in
    the path (for example in a directory name) is left alone.  A path that does
    not end in ``.h5`` gets ``.csv`` appended, so the result never equals the
    input and the source file cannot be overwritten.
    """
    if h5_path.endswith(H5_SUFFIX):
        return h5_path[:-len(H5_SUFFIX)] + '.csv'
    return h5_path + '.csv'


# Convert every HDF5 patent file into a CSV file written next to it.
data_files = glob.glob('/tmp/data/patents/*.h5')
for file in data_files:

    print("Reading {}...".format(file), end=" ")
    df = pd.read_hdf(file)
    df.columns = ['title','inventors','pub_date','classifications','n_citations',
                  'cited_by', 'patent_citations','also_published_as','location']
    df['title'] = df['title'].str.strip()
    for column in LIST_COLUMNS:
        df[column] = df[column].str.join(";")
    # NOTE: empty strings are left as they are; the former
    # `df.replace("", np.nan, inplace=True)` step is disabled.

    # The index is exported as the 'number' column, so it must not contain
    # duplicates.  Raise explicitly: an `assert` is skipped under `python -O`.
    if df.index.duplicated().any():
        raise ValueError("Duplicate index values found in {}".format(file))

    new_file = h5_to_csv_path(file)
    print("Writing {}...".format(new_file), end=" ")
    # Missing values are written as the literal string 'na'.
    df.to_csv(path_or_buf=new_file, sep=',', quoting=csv.QUOTE_MINIMAL, chunksize=50000, index_label='number', na_rep='na')
    print("Done.")

In [None]:
# # Need to do something with the ALSO published as
# sum(df['also_published_as'].str.len())


# Connect to graph

In [None]:
import json
from py2neo import Graph
from py2neo.data import Node, Relationship

# TODO: get authentication working. For now the server is expected to run
# with NEO4J_AUTH=none, so no credentials are supplied here.
graph = Graph("bolt://neo4j:7687")

# Uncomment to wipe the database before rebuilding it:
#graph.delete_all()

# Read the store-level counters and report them as a connection check.
n_nodes = graph.database.primitive_counts['NumberOfNodeIdsInUse']
n_relationships = graph.database.primitive_counts['NumberOfRelationshipIdsInUse']
connected_message = "Connected to graph database with {:,} nodes and {:,} relationships!"
print(connected_message.format(n_nodes, n_relationships))

In [None]:
# Schema set-up for Patent nodes: one uniqueness constraint followed by three
# property indexes, each announced on stdout and executed in the order listed.
#
# Currently disabled alternatives (kept for reference):
#   CREATE CONSTRAINT ON (n:Patent) ASSERT n.number IS UNIQUE;
#   CREATE INDEX ON :Patent(inventors);
schema_statements = [
    # Uniqueness constraint on the patent id
    ("Creating uniqueness constraint for id...",
     """CREATE CONSTRAINT ON (n:Patent) ASSERT n.id IS UNIQUE;"""),
    # Index for classifications
    ("Creating index for classifications...",
     """CREATE INDEX ON :Patent(classifications);"""),
    # Index for location
    ("Creating index for location...",
     """CREATE INDEX ON :Patent(location);"""),
    # Index for pub_date
    ("Creating index for pub_date...",
     """CREATE INDEX ON :Patent(pub_date);"""),
]

for progress_message, query in schema_statements:
    print(progress_message, end=" ", flush=True)
    graph.run(query).evaluate()
    print("Done.")

In [None]:
# Import patents
#
# Every *.csv file found under `local_data_dir` is loaded into Neo4j as
# :Patent nodes through apoc.load.csv, batched by apoc.periodic.iterate.
# `neo4j_data_dir` is the path under which the same files are addressed in
# the `file://` URL handed to the database.
#
# The CSV mapping renames several columns on the way in:
#   number -> id, patent_citations -> refs, inventors -> authors,
#   n_citations -> n_citation
# so the stored node properties use the names on the right.

import glob
import os
import time

local_data_dir = '/tmp/data/test/patents/'
neo4j_data_dir = '/import/test/patents/'

start_time = time.time()

for _, _, files in os.walk(local_data_dir):
    for file in sorted(files):
        if not file.endswith('.csv'):
            continue
        print("Importing {}...".format(file), end=" ", flush=True)
        query_start_time = time.time()
        # Fixes relative to the earlier version of this query:
        #  * the config key is `batchSize` (the misspelt `batchsize` was not
        #    the key APOC reads, so the requested 20000 was never applied);
        #  * the empty-string clean-up targets `p.refs`, the property the
        #    mapping actually creates from `patent_citations`.
        # Literal braces are doubled because the text goes through str.format.
        query = """
        CALL apoc.periodic.iterate("
            CALL apoc.load.csv(
                'file://{}{}',
                {{
                 header:true,sep:',',
                 mapping:{{
                  number:{{name:'id'}},
                  patent_citations:{{name:'refs',array:true,arraySep:';'}},
                  also_published_as:{{array:true,arraySep:';'}},
                  cited_by:{{array:true,arraySep:';'}},
                  inventors:{{name:'authors',array:true,arraySep:';'}},
                  n_citations:{{name:'n_citation',type:'int',arraySep:';'}},
                  classifications:{{array:true,arraySep:';'}},
                  location:{{}}
                }}
            }}) YIELD map as row
            RETURN row
        ",
        "CREATE (p:Patent) SET p = row
            SET p.refs = [f IN p.refs WHERE f <> '']
            SET p.also_published_as = [f IN p.also_published_as WHERE f <> '']
            SET p.cited_by = [f IN p.cited_by WHERE f <> '']
            SET p.authors = [f IN p.authors WHERE f <> '']
            SET p.classifications = [f IN p.classifications WHERE f <> '']
            SET p.year = toInt(head(split(p.pub_date,'-')))
        ",
        {{batchSize:20000, iterateList:true, parallel:false}}
        );
        """.format(neo4j_data_dir, file)
        # print(query)
        graph.run(query).evaluate()
        query_end_time = time.time()
        print("Done ({:.2f} minutes).".format((query_end_time-query_start_time)/60))

end_time = time.time()
print("Finished all calculations in {:.2f} minutes.".format((end_time-start_time)/60))

In [None]:
# Step 1: make Author names unique (the MERGE on Author below matches by name).
print("Creating uniqueness constraint for author...", end=" ", flush=True)
query = """CREATE CONSTRAINT ON (a:Author) ASSERT a.name IS UNIQUE;"""
graph.run(query).evaluate()
print("Done.")

# Step 2: for every entry in a patent's `authors` list, merge an Author node
# and an AUTHORED relationship that records whether the entry equals the
# first and/or last element of that list.
print("Creating AUTHORED relationships...", end=" ")
start_time = time.time()
query = """
CALL apoc.periodic.iterate(
"MATCH (p:Patent) UNWIND p.authors AS auth  RETURN p, auth"
,
"MERGE (a:Author {name: auth}) 
MERGE (a)-[:AUTHORED {is_first_author: head(p.authors)=auth, is_last_author: last(p.authors)=auth}]->(p)"
,
{batchSize:50, iterateList:true, parallel:false})
"""

# Alternative that is not in use: the same MATCH / UNWIND / MERGE statements
# run as a single apoc.periodic.commit(...) call with {limit:10000}.

#print(query)
graph.run(query).evaluate()
end_time = time.time()
elapsed_minutes = (end_time-start_time)/60
print("Finished all calculations in {:.2f} minutes.".format(elapsed_minutes))

In [None]:
import time

# For every id listed in a patent's `cited_by` property, look up the patent
# with that id and merge a CITES relationship from it to the cited patent.
print("Adding CITES relationships...", end=" ", flush=True)

start_time = time.time()
query = """
CALL apoc.periodic.iterate(
"MATCH (a:Patent) UNWIND a.cited_by AS ref RETURN a, ref",
"MATCH (b:Patent {id: ref}) 
MERGE (b)-[:CITES]->(a)",
{batchSize:100, iterateList:true, parallel:false})
"""

# Alternative that is not in use: the same MATCH / UNWIND / MERGE statements
# run as a single apoc.periodic.commit(...) call with {limit:1000}.

#print(query)
graph.run(query).evaluate()
end_time = time.time()
elapsed_minutes = (end_time-start_time)/60
print("Finished all calculations in {:.2f} minutes.".format(elapsed_minutes))

In [None]:
# Re-read the store counters now that nodes and relationships were created.
n_nodes = graph.database.primitive_counts['NumberOfNodeIdsInUse']
n_relationships = graph.database.primitive_counts['NumberOfRelationshipIdsInUse']
built_message = "Built a graph database with {:,} nodes and {:,} relationships!"
print(built_message.format(n_nodes, n_relationships))

# Run PageRank

### Run PageRank once on entire graph

In [None]:
import pandas as pd
import time

# Run PageRank over all Patent nodes and CITES relationships (30 iterations)
# and write each score back to the node as the `pagerank` property.
#
# A streaming variant (algo.pageRank.stream, 20 iterations, top 50 by score
# returned directly) was used previously and is no longer active here.
start_time = time.time()

print("Running PageRank on all patents...", end=" ")

query = """
CALL algo.pageRank(
    'MATCH (p:Patent) RETURN id(p) as id'
    ,
    'MATCH (p1:Patent)-[:CITES]->(p2:Patent) RETURN id(p1) as source, id(p2) as target'
    ,
    {graph:'cypher', iterations:30, write:true, writeProperty:'pagerank'}
    );
"""
#print(query)
pagerank_cursor = graph.run(query)
df = pagerank_cursor.to_data_frame()

end_time = time.time()
elapsed_minutes = (end_time-start_time)/60
print("Finished all calculations in {:.2f} minutes.".format(elapsed_minutes))

### Run PageRank over time

In [None]:
# Run STREAMING PageRank on cumulative snapshots of the graph: for each cutoff
# year from `start_year` to `end_year` (inclusive, in increments of `step`),
# rank the patents published before 1 January of that year and keep the top 50.
#
# The RETURN clause reads the property names that the CSV import actually
# stores on Patent nodes (id, authors, n_citation, refs) and aliases them back
# to the original column names, so the resulting data frames keep the columns
# `number`, `inventors`, `n_citations` and `patent_citations`.  Reading
# node.number / node.inventors / node.n_citations / node.patent_citations
# directly yields nulls because those properties are never created.

import pandas as pd
import time

start_time = time.time()
start_year, end_year, step = 1900, 2020, 5
dfs = []  # one data frame of top-50 results per cutoff year
for year in range(start_year, end_year+1, step):

    # < IS MUCH FASTER THAN <=
    print("Running PageRank on patents from < {}...".format(year), end=" ")
    query_start_time = time.time()
    # Literal braces are doubled because the text goes through str.format.
    query = """
    CALL algo.pageRank.stream(
         'MATCH (p:Patent) WHERE p.pub_date < "{}-01-01" AND p.pub_date <> "" RETURN id(p) as id'
        ,'MATCH (p1:Patent)-[:CITES]->(p2:Patent) RETURN id(p1) as source, id(p2) as target'
        ,{{graph:'cypher', iterations:20, write:false}})
    YIELD node, score
    WITH *
    ORDER BY score DESC
    LIMIT 50
    RETURN
        node.id AS number,
        node.title AS title,
        node.authors AS inventors,
        node.location AS location,
        node.n_citation AS n_citations,
        node.pub_date AS pub_date,
        node.refs AS patent_citations,
        node.classifications AS classifications,
        node.cited_by AS cited_by,
        score;
    """.format(year)
    #print(query)
    df = graph.run(query).to_data_frame()
    df['year'] = year  # tag each row with the cutoff year it was computed for
    dfs.append(df)
    query_end_time = time.time()
    print("Done ({:.2f} minutes).".format((query_end_time-query_start_time)/60))

end_time = time.time()
print("Finished all calculations in {:.2f} minutes.".format((end_time-start_time)/60))


In [None]:
# Stack the per-year result frames and dump them to one CSV whose file name
# encodes the year range and step that produced it.
all_results = pd.concat(dfs)
all_results_template = '/tmp/data/result/patents_pagerank_{}-{}-{}.csv'
all_results_path = all_results_template.format(start_year, end_year, step)
print("Writing all results to {}...".format(all_results_path), end=" ")
all_results.to_csv(
    path_or_buf=all_results_path,
    sep=",",
    header=True,
    index=True,
)
print("Done.")

In [None]:
import numpy as np

# Reshape the stacked results into one row per (title, number) and one column
# per year, holding the PageRank score (combined with pivot_table's default
# aggregation when a key occurs more than once), then write it to CSV.
# Disabled pre-processing step, kept for reference:
# all_results['title_clean'] = all_results['title'].str.replace(',',' ')
result = all_results.pivot_table(
    index=['title','number'],
    columns='year',
    values='score',
)

pivot_template = '/tmp/data/result/patents_pivottable_{}-{}-{}.csv'
file_path = pivot_template.format(start_year, end_year, step)
print("Writing results to {}...".format(file_path), end=" ")
result.to_csv(
    path_or_buf=file_path,
    sep=",",
    header=True,
    index=True,
)
print("Done.")


## Get top patents in Boston/MA/Cambridge

In [None]:
import time

# Rank the authors who have at least one patent whose location mentions
# Boston, Massachusetts or Cambridge (or is exactly "MA"), using aggregates
# computed over ALL of each author's patents.
start_time = time.time()
print("Getting top Boston/MA/Cambridge inventors and patents...", end=" ")

# Two details matter for correct aggregates:
#  * `WITH DISTINCT a` -- without DISTINCT an author with k matching patents
#    is carried forward k times, multiplying every COUNT/SUM below by k.
#  * `/20.0` -- dividing two integers in Cypher truncates, which would turn
#    the age weighting into whole 20-year steps.
query = """
MATCH (a:Author)-[:AUTHORED]->(p:Patent)
WHERE 
    toUpper(p.location) CONTAINS "BOSTON" OR
    toUpper(p.location) CONTAINS "MASSACHUSETTS" OR
    toUpper(p.location) = "MA" OR
    toUpper(p.location) CONTAINS "CAMBRIDGE"
WITH DISTINCT a
MATCH (a)-[:AUTHORED]->(b:Patent)
RETURN 
    a.name AS name, 
    COUNT(b) AS total_patents,
    SUM(b.n_citation) AS total_citations, 
    SUM(b.pagerank) AS total_pagerank,
    SUM(log(b.pagerank*exp((2018-b.year)/20.0))) AS impact
ORDER BY impact DESC
"""
df = graph.run(query).to_data_frame()
end_time = time.time()
print("Finished all calculations in {:.2f} minutes.".format((end_time-start_time)/60))
print(df['name'])

# Classify and rank patents by author gender

In [None]:
import random

import nltk
from nltk.corpus import names as nltk_names

# Fetch the NLTK "names" corpus and read its male and female first-name lists.
nltk.download("names")
male_names = list(nltk_names.words('male.txt'))
female_names = list(nltk_names.words('female.txt'))

# Build (features, label) training pairs for BOTH genders.  With only the
# 'male' examples present, a classifier trained on this list has a single
# label to choose from and can never answer 'female'.
labeled_names = (
    [({"name": name}, 'male') for name in male_names]
    + [({"name": name}, 'female') for name in female_names]
)
# Shuffle so the examples are not grouped by label.
random.shuffle(labeled_names)

In [None]:
# Fit a Naive Bayes classifier on the shuffled (features, label) pairs in
# `labeled_names`; every feature dict has the single key "name".
classifier = nltk.NaiveBayesClassifier.train(labeled_names)

In [None]:
# Collect the inventors whose first name (the text before the first space)
# appears in the NLTK female-name list.
# NOTE(review): `df` is presumably the inventor ranking from the
# Boston/MA/Cambridge query -- confirm the cells are run in that order.

# Build the lookup set once: membership tests against the plain list would
# scan it for every single inventor.
female_name_lookup = set(female_names)
female_inventors = {
    name
    for name in df['name']
    if name.split(' ')[0] in female_name_lookup
}

In [None]:
# Notebook display: echo the collected set of inventor names.
female_inventors

In [None]:
# Map each patent number to those of its names that the classifier labels
# "female"; patents with no such name are left out of the mapping.
# NOTE(review): `names` is not defined in the visible part of this notebook --
# presumably a dict of patent number -> list of parsed-name objects exposing a
# `.first` attribute; confirm where it comes from.
female_patents = {}
for patNum, name_lst in names.items():
    to_add = []
    for name in name_lst:
        first_tokens = name.first.split()
        # With several whitespace-separated tokens only the first is used;
        # otherwise the `.first` value is passed through unchanged.
        fst_name = first_tokens[0] if len(first_tokens) > 1 else name.first
        if classifier.classify({"name": fst_name}) == "female":
            to_add.append(name)
    if to_add:
        female_patents[patNum] = to_add