In [29]:
import json
import os
import urllib
import urllib.parse
import urllib.request

import pandas as pd
from neo4j import GraphDatabase

# Connection settings can be overridden through the environment so real
# credentials never need to be written into the notebook. The fallbacks are
# the local development values this notebook has used so far.
driver = GraphDatabase.driver(
    os.environ.get('NEO4J_URI', 'bolt://localhost:7687'),
    auth=(
        os.environ.get('NEO4J_USER', 'neo4j'),
        os.environ.get('NEO4J_PASSWORD', 'password'),
    ),
)

def run_query(query, params=None):
    """Run a Cypher query and return its records as a DataFrame.

    Parameters
    ----------
    query : str
        Cypher statement passed to ``session.run``.
    params : dict, optional
        Query parameters passed to ``session.run``. When omitted, an empty
        dict is used.

    Returns
    -------
    pandas.DataFrame
        One row per result record; the columns are the result keys.
    """
    # Build the empty dict per call instead of sharing one mutable default
    # object between every invocation.
    if params is None:
        params = {}
    with driver.session() as session:
        result = session.run(query, params)
        # Read all records before asking for the keys, while the session is
        # still open.
        rows = [record.values() for record in result]
        return pd.DataFrame(rows, columns=result.keys())

In [11]:
def ie_pipeline(text, relation_threshold=0.9, entities_threshold=0.8,
                endpoint="http://localhost:5000", timeout=150):
    """Send text to the information-extraction service and return its reply.

    Parameters
    ----------
    text : str
        Text to analyse.
    relation_threshold : float, optional
        Sent to the service as the ``relation_threshold`` parameter.
    entities_threshold : float, optional
        Sent to the service as the ``entities_threshold`` parameter.
    endpoint : str, optional
        Base URL of the service; the query string is appended after ``?``.
    timeout : float, optional
        Timeout in seconds passed to ``urllib.request.urlopen``.

    Returns
    -------
    object
        The service response body, decoded as UTF-8 and parsed as JSON.

    Raises
    ------
    urllib.error.URLError
        If the request fails.
    json.JSONDecodeError
        If the response body is not valid JSON.
    """
    # URL-encode the three request parameters.
    query_string = urllib.parse.urlencode([
        ("text", text), ("relation_threshold", relation_threshold),
        ("entities_threshold", entities_threshold)])

    url = endpoint + "?" + query_string
    # NOTE(review): the encoded parameters are sent both in the query string
    # and as the body of the GET request. This is kept as-is because the
    # service's parameter handling is not visible here; confirm whether the
    # body is actually needed.
    req = urllib.request.Request(
        url, data=query_string.encode("utf8"), method="GET")
    with urllib.request.urlopen(req, timeout=timeout) as f:
        payload = f.read()
    return json.loads(payload.decode("utf8"))

In [13]:
# Sample passage used to try the extraction pipeline on a single document.
elon_musk_bio = """
Elon Musk is a business magnate, industrial designer, and engineer.
He is the founder, CEO, CTO, and chief designer of SpaceX.
He is also early investor, CEO, and product architect of Tesla, Inc.
He is also the founder of The Boring Company and the co-founder of Neuralink. 
A centibillionaire, Musk became the richest person in the world in January 2021, with an estimated net worth of $185 billion at the time, surpassing Jeff Bezos.
Musk was born to a Canadian mother and South African father and raised in Pretoria, South Africa.
He briefly attended the University of Pretoria before moving to Canada aged 17 to attend Queen's University.
He transferred to the University of Pennsylvania two years later, where he received dual bachelor's degrees in economics and physics.
He moved to California in 1995 to attend Stanford University, but decided instead to pursue a business career.
He went on co-founding a web software company Zip2 with his brother Kimbal Musk.
  """

# Run the pipeline with its default thresholds and keep the parsed response.
example_data = ie_pipeline(elon_musk_bio)

url:  http://localhost:5000?text=%0AElon+Musk+is+a+business+magnate%2C+industrial+designer%2C+and+engineer.%0AHe+is+the+founder%2C+CEO%2C+CTO%2C+and+chief+designer+of+SpaceX.%0AHe+is+also+early+investor%2C+CEO%2C+and+product+architect+of+Tesla%2C+Inc.%0AHe+is+also+the+founder+of+The+Boring+Company+and+the+co-founder+of+Neuralink.+%0AA+centibillionaire%2C+Musk+became+the+richest+person+in+the+world+in+January+2021%2C+with+an+estimated+net+worth+of+%24185+billion+at+the+time%2C+surpassing+Jeff+Bezos.%0AMusk+was+born+to+a+Canadian+mother+and+South+African+father+and+raised+in+Pretoria%2C+South+Africa.%0AHe+briefly+attended+the+University+of+Pretoria+before+moving+to+Canada+aged+17+to+attend+Queen%27s+University.%0AHe+transferred+to+the+University+of+Pennsylvania+two+years+later%2C+where+he+received+dual+bachelor%27s+degrees+in+economics+and+physics.%0AHe+moved+to+California+in+1995+to+attend+Stanford+University%2C+but+decided+instead+to+pursue+a+business+career.%0AHe+went+on+co-founding+a

In [14]:
# Show the raw pipeline response obtained for the sample text.
print(example_data)

{'entities': [{'label': 'Organization', 'title': 'Pretoria', 'wikiId': 'Q3926'}, {'label': 'Organization', 'title': 'Tesla, Inc.', 'wikiId': 'Q478214'}, {'label': 'Person', 'title': 'Jeff Bezos', 'wikiId': 'Q312556'}, {'label': 'Organization', 'title': 'University of Pennsylvania', 'wikiId': 'Q49117'}, {'label': 'Organization', 'title': 'SpaceX', 'wikiId': 'Q193701'}, {'label': 'Organization', 'title': 'The Boring Company', 'wikiId': 'Q28874479'}, {'label': 'Organization', 'title': 'Neuralink', 'wikiId': 'Q29043471'}, {'label': 'Person', 'title': 'Kimbal Musk', 'wikiId': 'Q6409751'}, {'label': 'Organization', 'title': 'Stanford University', 'wikiId': 'Q41506'}, {'label': 'Organization', 'title': 'University of Pretoria', 'wikiId': 'Q604444'}, {'label': 'Person', 'title': 'Elon Musk', 'wikiId': 'Q317521'}], 'relations': [{'source': 'Elon Musk', 'target': 'University of Pennsylvania', 'type': 'work location'}, {'source': 'Elon Musk', 'target': 'University of Pennsylvania', 'type': 'resid

In [30]:
# Cypher statement that loads a single pipeline response, passed as $data,
# into the graph:
#   1. For every item in data.entities, MERGE an :Entity node keyed on its
#      name, set wikiId only when the node is created, and add entity.label
#      as an extra node label through apoc.create.addLabels.
#   2. The count(*) aggregation collapses the per-entity rows again before
#      the second UNWIND starts.
#   3. For every item in data.relations, MERGE the source and target :Entity
#      nodes by name and create a relationship between them whose type is
#      relation.type, through apoc.create.relationship.
# NOTE(review): step 3 uses a "create" procedure, not a merge, so re-running
# this cell presumably adds duplicate relationships; confirm if that matters.
import_direct_query = """
WITH $data as data
UNWIND data.entities as entity
MERGE (e:Entity{name:entity.title})
ON CREATE SET e.wikiId = entity.wikiId
WITH data, entity, e
CALL apoc.create.addLabels(e,[entity.label]) YIELD node
WITH data, count(*) as break_unwind
UNWIND data.relations as relation
MERGE (s:Entity{name:relation.source})
MERGE (t:Entity{name:relation.target})
WITH s,t,relation
CALL apoc.create.relationship(s, relation.type, {}, t) 
YIELD rel
RETURN distinct 'done'
"""

# Import the sample response; the query returns a single 'done' row.
run_query(import_direct_query, {'data':example_data})

Unnamed: 0,'done'
0,done


In [31]:

# Start from an empty graph, then make sure the uniqueness constraint on
# Entity.name and the index on Relation.type exist. The wipe runs first.
wipe_graph_query = "MATCH (n) DETACH DELETE n"
entity_name_constraint_query = "CREATE CONSTRAINT IF NOT EXISTS ON (e:Entity) ASSERT e.name IS UNIQUE;"
relation_type_index_query = "CREATE INDEX rels IF NOT EXISTS FOR (n:Relation) ON (n.type);"

run_query(wipe_graph_query)
run_query(entity_name_constraint_query)
run_query(relation_type_index_query)

In [33]:
# Read the tab-separated BBC news file and preview its first five rows.
data = pd.read_csv('bbc-news-data.csv', sep='\t')
data.head(5)

Unnamed: 0,category,filename,title,content
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...


In [34]:
# Cypher statement that imports a batch of articles. $params is a list of
# maps, each with a `content` value and an `ie` map holding `relations` and
# `entities`. For every element:
#   - CREATE an :Article node storing the content.
#   - For each relation, MERGE the source and target :Entity nodes by name,
#     MERGE an intermediate :Relation node (keyed on type) connected to both
#     through :RELATION relationships, and MERGE a :MENTIONS_REL link from
#     the article to that :Relation node.
#   - For each entity, MERGE the :Entity node by name, overwrite its wikiId,
#     MERGE a :MENTIONS_ENT link from the article, and add entity.label as an
#     extra node label through apoc.create.addLabels.
import_refactored_query = """
UNWIND $params as value
CREATE (a:Article{content:value.content})
FOREACH (rel in value.ie.relations | 
  MERGE (s:Entity{name:rel.source})
  MERGE (t:Entity{name:rel.target})
  MERGE (s)-[:RELATION]->(r:Relation{type:rel.type})-[:RELATION]->(t)
  MERGE (a)-[:MENTIONS_REL]->(r))
WITH value, a
UNWIND value.ie.entities as entity
MERGE (e:Entity{name:entity.title})
SET e.wikiId = entity.wikiId
MERGE (a)-[:MENTIONS_ENT]->(e)
WITH entity, e
CALL apoc.create.addLabels(e,[entity.label]) YIELD node
RETURN distinct 'done'
"""

In [37]:
# How many leading articles to process, and how many are sent per Cypher call.
ARTICLE_LIMIT = 5
BATCH_SIZE = 100

with driver.session() as session:
    params = []
    # Only the article text is needed, so iterate the `content` column
    # directly instead of materialising every row of the frame.
    for content in data['content'].head(ARTICLE_LIMIT):
        ie_data = ie_pipeline(content)
        params.append({'content': content, 'ie': ie_data})

        # Write a full batch and start collecting the next one.
        if len(params) >= BATCH_SIZE:
            session.run(import_refactored_query, {'params': params}).consume()
            params = []

    # Write the final, partially filled batch. Without this step nothing is
    # imported whenever fewer than BATCH_SIZE articles remain, which is always
    # the case while ARTICLE_LIMIT is below BATCH_SIZE.
    if params:
        session.run(import_refactored_query, {'params': params}).consume()

url:  http://localhost:5000?text=+Quarterly+profits+at+US+media+giant+TimeWarner+jumped+76%25+to+%241.13bn+%28%C2%A3600m%29+for+the+three+months+to+December%2C+from+%24639m+year-earlier.++The+firm%2C+which+is+now+one+of+the+biggest+investors+in+Google%2C+benefited+from+sales+of+high-speed+internet+connections+and+higher+advert+sales.+TimeWarner+said+fourth+quarter+sales+rose+2%25+to+%2411.1bn+from+%2410.9bn.+Its+profits+were+buoyed+by+one-off+gains+which+offset+a+profit+dip+at+Warner+Bros%2C+and+less+users+for+AOL.++Time+Warner+said+on+Friday+that+it+now+owns+8%25+of+search-engine+Google.+But+its+own+internet+business%2C+AOL%2C+had+has+mixed+fortunes.+It+lost+464%2C000+subscribers+in+the+fourth+quarter+profits+were+lower+than+in+the+preceding+three+quarters.+However%2C+the+company+said+AOL%27s+underlying+profit+before+exceptional+items+rose+8%25+on+the+back+of+stronger+internet+advertising+revenues.+It+hopes+to+increase+subscribers+by+offering+the+online+service+free+to+TimeWarner+inte

In [1]:
# print(params)