In [103]:
data_files = glob.glob('/tmp/data/patents/*.h5')

In [None]:
import numpy as np
import pandas as pd
import glob
import csv

data_files = glob.glob('/tmp/data/patents/*.h5')
for file in data_files:
    
    print("Reading {}...".format(file), end=" ")
    df = pd.read_hdf(file)
    df.columns = ['title','inventors','pub_date','classifications','n_citations',
                  'cited_by', 'patent_citations','also_published_as','location']
    df['title'] = df['title'].str.strip()
    df['classifications'] = df['classifications'].str.join(";")
    df['inventors'] = df['inventors'].str.join(";")
    df['cited_by'] = df['cited_by'].str.join(";")
    df['patent_citations'] = df['patent_citations'].str.join(";")
    df['also_published_as'] = df['also_published_as'].str.join(";")
#     df.replace("", np.nan, inplace=True)
    assert(not df.index.duplicated().any())
    
    new_file = file.replace('.h5','.csv')
    print("Writing {}...".format(new_file), end=" ")
    df.to_csv(path_or_buf=new_file, sep=',', quoting=csv.QUOTE_MINIMAL, chunksize=50000, index_label='number', na_rep='na')
    print("Done.")

Reading /tmp/data/patents/res_1_2mil.h5... Writing /tmp/data/patents/res_1_2mil.csv... Done.
Reading /tmp/data/patents/res_4_5mil.h5... Writing /tmp/data/patents/res_4_5mil.csv... Done.
Reading /tmp/data/patents/res_0_1mil.h5... Writing /tmp/data/patents/res_0_1mil.csv... Done.
Reading /tmp/data/patents/res_2_3mil.h5... Writing /tmp/data/patents/res_2_3mil.csv... Done.
Reading /tmp/data/patents/res_7_8mil.h5... Writing /tmp/data/patents/res_7_8mil.csv... Done.
Reading /tmp/data/patents/res_5_6mil.h5... Writing /tmp/data/patents/res_5_6mil.csv... Done.
Reading /tmp/data/patents/res_8_9mil.h5... Writing /tmp/data/patents/res_8_9mil.csv... Done.
Reading /tmp/data/patents/res_3_4mil.h5... 

In [9]:
# # Need to do something with the ALSO published as
# sum(df['also_published_as'].str.len())


In [None]:
import json
from py2neo import Graph
from py2neo.data import Node, Relationship

# Need to get authentication working, currently NEO4J_AUTH=none
graph = Graph("bolt://neo4j:7687")

graph.delete_all()

n_nodes = graph.database.primitive_counts['NumberOfNodeIdsInUse']
n_relationships = graph.database.primitive_counts['NumberOfRelationshipIdsInUse']
print("Connected to graph database with {:,} nodes and {:,} relationships!".format
     (n_nodes, n_relationships))

In [11]:
# # Constrain one id per Patent
# print("Creating uniqueness constraint (and also index) on Patent numbers...", end=" ", flush=True)
# query = """CREATE CONSTRAINT ON (n:Patent) ASSERT n.number IS UNIQUE;"""
# graph.run(query).evaluate()
# print("Done.")

# Add index for number
print("Creating uniqueness constraint for number...", end=" ", flush=True)
query = """CREATE CONSTRAINT ON (n:Patent) ASSERT n.number IS UNIQUE;"""
graph.run(query).evaluate()
print("Done.")

# Add index for classifications
print("Creating index for classifications...", end=" ", flush=True)
query = """CREATE INDEX ON :Patent(classifications);"""
graph.run(query).evaluate()
print("Done.")

# Add index for classifications
print("Creating index for classifications...", end=" ", flush=True)
query = """CREATE INDEX ON :Patent(classifications);"""
graph.run(query).evaluate()
print("Done.")

# Add index for inventors
print("Creating index for inventors...", end=" ", flush=True)
query = """CREATE INDEX ON :Patent(inventors);"""
graph.run(query).evaluate()
print("Done.")

# Add index for location
print("Creating index for location...", end=" ", flush=True)
query = """CREATE INDEX ON :Patent(location);"""
graph.run(query).evaluate()
print("Done.")

# Add index for pub_date
print("Creating index for pub_date...", end=" ", flush=True)
query = """CREATE INDEX ON :Patent(pub_date);"""
graph.run(query).evaluate()
print("Done.")

Creating uniqueness constraint for number... Done.
Creating index for classifications... Done.
Creating index for classifications... Done.
Creating index for inventors... Done.
Creating index for location... Done.
Creating index for pub_date... Done.


In [12]:
# Import patents

data_files = glob.glob('/tmp/data/patents/*.csv')
for file in data_files:
    print("Importing {}...".format(file), end=" ", flush=True)
    query = """
    CALL apoc.periodic.iterate("
    CALL apoc.load.csv('file://{}',
        {{header:true,sep:',',
        mapping:{{
            patent_citations:{{array:true,nullValues:['na']}},
            also_published_as:{{array:true,nullValues:['na']}},
            cited_by:{{array:true,nullValues:['na']}},
            inventors:{{array:true,nullValues:['na']}},
            n_citations:{{type:'int',nullValues:['na']}},
            classifications:{{array:true,nullValues:['na']}},
            location:{{nullValues:['na']}}}}
        }})
    YIELD map as row RETURN row
    ","
    CREATE (p:Patent) SET p = row
    ", {{batchsize:50000, iterateList:true, parallel:true}});
    """.format(file)
    print(query)
    # graph.run(query).evaluate()
    print("Done.")


Importing /tmp/data/patents/res_5_6mil.csv... 
    CALL apoc.periodic.iterate("
    CALL apoc.load.csv('file:///tmp/data/patents/res_5_6mil.csv',
        {header:true,sep:',',
        mapping:{
            patent_citations:{array:true,nullValues:['na']},
            also_published_as:{array:true,nullValues:['na']},
            cited_by:{array:true,nullValues:['na']},
            inventors:{array:true,nullValues:['na']},
            n_citations:{type:'int',nullValues:['na']},
            classifications:{array:true,nullValues:['na']},
            location:{nullValues:['na']}}
        })
    YIELD map as row RETURN row
    ","
    CREATE (p:Patent) SET p = row
    ", {batchsize:50000, iterateList:true, parallel:true});
    
Done.
Importing /tmp/data/patents/res_7_8mil.csv... 
    CALL apoc.periodic.iterate("
    CALL apoc.load.csv('file:///tmp/data/patents/res_7_8mil.csv',
        {header:true,sep:',',
        mapping:{
            patent_citations:{array:true,nullValues:['na']},
       

In [7]:
print("Adding citation relationships...", end=" ", flush=True)
query = """
CALL apoc.periodic.iterate(
"MATCH (a:Patent) UNWIND a.cited_by AS ref RETURN a, ref",
"MATCH (b:Patent {number: ref}) MERGE (b)-[:CITES]->(a)",
{batchSize:100, iterateList:true, parallel:false})
"""
print(query)
# graph.run(query).evaluate()
print("Done.")

Adding citation relationships... 
CALL apoc.periodic.iterate(
"MATCH (a:Patent) UNWIND a.cited_by AS ref RETURN a, ref",
"MATCH (b:Patent {number: ref}) MERGE (b)-[:CITES]->(a)",
{batchSize:100, iterateList:true, parallel:false})

Done.
