In [None]:
import numpy as np
import pandas as pd
import glob
import csv

# Column names assigned to every HDF5 frame, in the frame's existing column order.
PATENT_COLUMNS = ['title','inventors','pub_date','classifications','n_citations',
                  'cited_by', 'patent_citations','also_published_as','location']

# Columns flattened to a single ';'-separated string before export.
# NOTE(review): assumes each cell in these columns holds a list of strings — confirm
# against the scraper that produced the .h5 files.
LIST_COLUMNS = ['classifications', 'inventors', 'cited_by',
                'patent_citations', 'also_published_as']


def csv_path_for(h5_path):
    """Return the CSV path that sits next to ``h5_path``.

    Only a trailing ``.h5`` extension is swapped for ``.csv``; any other
    ``.h5`` occurrence inside the path (directory or file stem) is left alone.
    A path that does not end in ``.h5`` simply gets ``.csv`` appended.
    """
    suffix = '.h5'
    stem = h5_path[:-len(suffix)] if h5_path.endswith(suffix) else h5_path
    return stem + '.csv'


# Convert every HDF5 patent file into a CSV file written alongside it.
data_files = glob.glob('/tmp/data/patents/*.h5')
for file in data_files:

    print("Reading {}...".format(file), end=" ")
    df = pd.read_hdf(file)
    df.columns = PATENT_COLUMNS
    df['title'] = df['title'].str.strip()
    for list_column in LIST_COLUMNS:
        df[list_column] = df[list_column].str.join(";")
    # Empty strings are left as-is; the NaN replacement below is disabled.
#     df.replace("", np.nan, inplace=True)
    # The index is exported as the 'number' column, so it must not repeat.
    assert not df.index.duplicated().any(), "duplicate index values in {}".format(file)

    new_file = csv_path_for(file)
    print("Writing {}...".format(new_file), end=" ")
    df.to_csv(path_or_buf=new_file, sep=',', quoting=csv.QUOTE_MINIMAL, chunksize=50000, index_label='number', na_rep='na')
    print("Done.")

In [None]:
# # TODO: decide how to handle the 'also_published_as' column
# sum(df['also_published_as'].str.len())


In [None]:
import json
from py2neo import Graph
from py2neo.data import Node, Relationship

# Connect to the Neo4j server over Bolt and report how much data it already holds.
# Need to get authentication working, currently NEO4J_AUTH=none
graph = Graph("bolt://neo4j:7687")

# Uncomment to wipe the whole database before a fresh import.
#graph.delete_all()

# Read the counts once so both figures come from the same lookup instead of
# accessing `primitive_counts` separately for each number.
primitive_counts = graph.database.primitive_counts
n_nodes = primitive_counts['NumberOfNodeIdsInUse']
n_relationships = primitive_counts['NumberOfRelationshipIdsInUse']
print("Connected to graph database with {:,} nodes and {:,} relationships!".format(
    n_nodes, n_relationships))

In [None]:
# Schema setup for :Patent nodes.
#
# One uniqueness constraint on `number` (one id per Patent), followed by one
# index per property listed in INDEXED_PATENT_PROPERTIES. Each statement is
# issued exactly once; the classifications index was previously sent twice.

print("Creating uniqueness constraint for number...", end=" ", flush=True)
query = """CREATE CONSTRAINT ON (n:Patent) ASSERT n.number IS UNIQUE;"""
graph.run(query).evaluate()
print("Done.")

# Properties that get a plain (non-unique) index.
INDEXED_PATENT_PROPERTIES = ('classifications', 'inventors', 'location', 'pub_date')

for indexed_property in INDEXED_PATENT_PROPERTIES:
    print("Creating index for {}...".format(indexed_property), end=" ", flush=True)
    query = """CREATE INDEX ON :Patent({});""".format(indexed_property)
    graph.run(query).evaluate()
    print("Done.")

In [None]:
# Import patents
#
# Every CSV file found directly inside `local_data_dir` is loaded into Neo4j.
# The server reads the same file name from `neo4j_data_dir`.
# NOTE(review): this presumes both paths refer to the same directory (e.g. a
# shared volume) — confirm against the deployment.

local_data_dir = '/tmp/data/patents/'
neo4j_data_dir = '/import/patents/'

import glob, os, time


def build_import_query(data_dir, filename):
    """Build the APOC statement that loads one patent CSV file.

    Parameters
    ----------
    data_dir : str
        Directory of the file as seen by the Neo4j server (with trailing slash).
    filename : str
        Base name of the CSV file inside ``data_dir``.

    Returns
    -------
    str
        A Cypher ``CALL apoc.periodic.iterate(...)`` statement. The outer
        statement streams rows from ``apoc.load.csv`` (header row, ',' as
        separator, the listed columns split into arrays on ';'). The inner
        statement creates one ``:Patent`` node per row, copies the row onto it,
        and drops empty strings from the array properties.
    """
    # `batchSize` is case-sensitive in the APOC config map; the former
    # spelling `batchsize` was not picked up, so 25000 never took effect.
    return """
    CALL apoc.periodic.iterate("
        CALL apoc.load.csv(
            'file://{}{}',
            {{
             header:true,sep:',',
             mapping:{{
              patent_citations:{{array:true,arraySep:';'}},
              also_published_as:{{array:true,arraySep:';'}},
              cited_by:{{array:true,arraySep:';'}},
              inventors:{{array:true,arraySep:';'}},
              n_citations:{{type:'int',arraySep:';'}},
              classifications:{{array:true,arraySep:';'}},
              location:{{}}
            }}
        }}) YIELD map as row
        RETURN row
    ",
    "CREATE (p:Patent) SET p = row
        SET p.patent_citations = [f IN p.patent_citations WHERE f <> '']
        SET p.also_published_as = [f IN p.also_published_as WHERE f <> '']
        SET p.cited_by = [f IN p.cited_by WHERE f <> '']
        SET p.inventors = [f IN p.inventors WHERE f <> '']
        SET p.classifications = [f IN p.classifications WHERE f <> '']
    ",
    {{batchSize:25000, iterateList:true, parallel:true, retries:3}}
    );
    """.format(data_dir, filename)


start_time = time.time()

# Only the top level of the directory is listed: the import URL is built from
# the bare file name, so a CSV in a sub-directory would point at a path that
# does not exist under `neo4j_data_dir`. A missing directory yields no files.
_, _, top_level_files = next(os.walk(local_data_dir), (local_data_dir, [], []))

for file in sorted(top_level_files):
    if not file.endswith('.csv'):
        continue
    print("Importing {}...".format(file), end=" ", flush=True)
    query_start_time = time.time()
    query = build_import_query(neo4j_data_dir, file)
    #print(query)
    graph.run(query).evaluate()
    query_end_time = time.time()
    print("Done ({:.2f} minutes).".format((query_end_time-query_start_time)/60))

end_time = time.time()
print("Finished all calculations in {:.2f} minutes.".format((end_time-start_time)/60))

In [None]:
# Build the statement that links patents through CITES relationships.
# For every patent `a`, each entry of `a.cited_by` is looked up as another
# patent `b` by its number, and a (b)-[:CITES]->(a) relationship is merged.
citing_pairs_statement = "MATCH (a:Patent) UNWIND a.cited_by AS ref RETURN a, ref"
merge_citation_statement = "MATCH (b:Patent {number: ref}) MERGE (b)-[:CITES]->(a)"
iterate_options = "{batchSize:100, iterateList:true, parallel:false}"

print("Adding citation relationships...", end=" ", flush=True)
query = '\nCALL apoc.periodic.iterate(\n"{}",\n"{}",\n{})\n'.format(
    citing_pairs_statement, merge_citation_statement, iterate_options)
print(query)
# Execution is currently disabled: the statement is only printed, not run.
# graph.run(query).evaluate()
print("Done.")