In [1]:
import os

import pandas as pd
from graphdatascience import GraphDataScience

# Connection settings for the Neo4j instance. Each value can be overridden via
# an environment variable so real credentials never need to be saved in the
# notebook; the fallbacks are the local development defaults.
host = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "pleaseletmein")

# Client object the cells below use to run Cypher statements.
gds = GraphDataScience(host, auth=(user, password))

# Import articles

In [2]:
# Read the article export into a DataFrame, then preview the first five rows
# as the cell output.
articles_csv = "../data/articles.csv"
articles = pd.read_csv(articles_csv)
articles.head(n=5)

Unnamed: 0,article_id,sectionName,webTitle,webUrl,bodyContent,webPublicationDate,id
0,environment/2022/jun/30/former-australian-chie...,Environment,Former Australian chief scientist to head revi...,https://www.theguardian.com/environment/2022/j...,The former Australian chief scientist and seni...,2022-06-30 23:53:09+00:00,147919
1,world/2022/jul/01/we-were-too-lenient-on-pro-d...,World news,We were too lenient on pro-democracy politicia...,https://www.theguardian.com/world/2022/jul/01/...,A senior member of Hong Kong’s incoming admini...,2022-06-30 23:52:37+00:00,147920
2,australia-news/2022/jul/01/south-australian-li...,Australia news,South Australian Liberal leader and state MPs ...,https://www.theguardian.com/australia-news/202...,A week after Roe v Wade was overturned in the ...,2022-06-30 23:33:09+00:00,147921
3,australia-news/2022/jul/01/john-barilaros-deci...,Australia news,John Barilaro’s decision may stem some bleedin...,https://www.theguardian.com/australia-news/202...,John Barilaro didn’t want to continue being “a...,2022-06-30 23:23:38+00:00,147922
4,lifeandstyle/2022/jul/01/breastfeeding-cogniti...,Life and style,Breastfeeding improves cognitive ability for c...,https://www.theguardian.com/lifeandstyle/2022/...,Children of poorer mothers who breastfeed are ...,2022-06-30 23:01:03+00:00,147923


In [3]:
# Schema setup for the article graph. Every statement uses IF NOT EXISTS, so
# the cell can be re-run without failing on objects that already exist.

# Uniqueness of Article.id (the key the article import MERGEs on).
article_id_constraint = """
CREATE CONSTRAINT IF NOT EXISTS FOR (a:Article) REQUIRE a.id IS UNIQUE;
"""

# Uniqueness of Section.name (the key the article import MERGEs on).
section_name_constraint = """
CREATE CONSTRAINT IF NOT EXISTS FOR (s:Section) REQUIRE s.name IS UNIQUE;
"""

# Text index named `articletitle` over Article.webTitle.
article_title_index = """
CREATE TEXT INDEX articletitle IF NOT EXISTS FOR (a:Article) ON a.webTitle;
"""

gds.run_cypher(article_id_constraint)
gds.run_cypher(section_name_constraint)
gds.run_cypher(article_title_index)

In [4]:
# Cypher that upserts one batch of article rows passed in as $data:
#   * MERGE an Article node keyed on row.id;
#   * copy the row's remaining fields onto the node, with apoc.map.clean
#     removing the keys that are handled separately;
#   * store the publication timestamp as a datetime in `date` (the space
#     between date and time is replaced with "T" before parsing);
#   * MERGE the Section node named by row.sectionName and link the article
#     to it with HAS_SECTION.
# NOTE(review): the DataFrame preview shows an `Unnamed: 0` column; it is not
# in the exclusion list, so it is copied onto every Article node as a
# property -- confirm whether that is intended.
article_import_query = """
UNWIND $data AS row
MERGE (a:Article {id: row.id})
SET a += apoc.map.clean(row, ["id", "article_id", "sectionName", "webPublicationDate"], [])
SET a.date = datetime(replace(row.webPublicationDate, " ", "T"))
WITH a, row.sectionName AS section
MERGE (s:Section {name: section})
MERGE (a)-[:HAS_SECTION]->(s)
"""

# Send the rows in fixed-size batches instead of one request carrying every
# article (including its full body text). This keeps each transaction bounded,
# mirroring how the NLP output is imported. The query only uses MERGE/SET, so
# the resulting graph is the same as with a single call.
article_records = articles.to_dict("records")
article_batch_step = 1000

for offset in range(0, len(article_records), article_batch_step):
    gds.run_cypher(
        article_import_query,
        {"data": article_records[offset : offset + article_batch_step]},
    )

# Import NLP output

In [5]:
import json

# Load the NLP results into memory. The encoding is passed explicitly because
# open() otherwise falls back to the platform's locale encoding, which is not
# UTF-8 on every system and would mis-decode non-ASCII text in the file.
with open("../data/nlp_output.json", encoding="utf-8") as file:
    nlp_output = json.load(file)

In [6]:
# Uniqueness of Entity.id (the key the NLP import MERGEs entities on).
# IF NOT EXISTS makes the statement safe to re-run.
entity_id_constraint = """
CREATE CONSTRAINT IF NOT EXISTS FOR (e:Entity) REQUIRE e.id IS UNIQUE;
"""

gds.run_cypher(entity_id_constraint)

In [7]:
# Cypher that attaches one batch of NLP results ($data) to existing articles.
# For every row it:
#   1. matches the Article by row.id and stores the row's sentiment on it;
#   2. for each item in row.entity, MERGEs an Entity keyed on the entity name
#      (type and uri are set only when the node is first created), MERGEs a
#      MENTIONS relationship between article and entity, and records the
#      mention's confidence and sentiment on that relationship;
#   3. for each item in row.fact, MERGEs the source and target Entity nodes,
#      MERGEs an intermediate Relationship node whose `type` is the fact's
#      relationship upper-cased with spaces turned into underscores, connected
#      source -> Relationship -> target via RELATIONSHIP edges, and links the
#      article to it with MENTIONS_RELATIONSHIP carrying the fact's confidence.
nlp_import_query = """
UNWIND $data AS row
MATCH (a:Article {id: row.id})
SET a.sentiment = row.sentiment
FOREACH (entity in row.entity | 
  MERGE (e:Entity {id: entity.name})
  ON CREATE SET e.type = entity.type,
                e.uri = entity.uri
  MERGE (a)-[m:MENTIONS]-(e)
  SET m.confidence = entity.confidence,
      m.sentiment = entity.sentiment
)
WITH a, row
UNWIND row.fact AS fact
  WITH a, fact
  MERGE (source:Entity {id: fact.source.name})
  ON CREATE SET source.type = fact.source.type
  MERGE (target:Entity {id: fact.target.name})
  ON CREATE SET target.type = fact.target.type
  MERGE (source)-[:RELATIONSHIP]->(r:Relationship {type: toUpper(replace(fact.relationship,' ','_'))})-[:RELATIONSHIP]->(target)
  MERGE (a)-[mr:MENTIONS_RELATIONSHIP]->(r)
  SET mr.confidence = fact.confidence
"""

# Number of NLP records sent to the database per request.
batch_step = 100

# Walk the NLP output in consecutive slices of `batch_step` records and run
# the import query once per slice.
batch_starts = range(0, len(nlp_output), batch_step)
for start in batch_starts:
    gds.run_cypher(
        nlp_import_query,
        {"data": nlp_output[start : start + batch_step]},
    )