In [4]:
import os
import logging
from dotenv import load_dotenv

import pandas as pd
from neo4j import GraphDatabase
import time

In [9]:
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

load_dotenv()

True

In [10]:
driver = GraphDatabase.driver(uri=f"bolt://localhost:{os.getenv('NEO4J_LOCAL_DB_PORT')}", auth=(os.getenv("NEO4J_USER"), os.getenv("NEO4J_PASSWORD")))

In [11]:
def batched_import(statement, df, batch_size=1000):
    total = len(df)
    start_s = time.time()
    for start in range(0,total, batch_size):
        batch = df.iloc[start: min(start+batch_size,total)]
        result = driver.execute_query("UNWIND $rows AS value " + statement, 
                                      rows=batch.to_dict('records'))
        print(result.summary.counters)
    print(f'{total} rows in { time.time() - start_s} s.')    
    return total

In [12]:
# create constraints

statements = """
create constraint chunk_id if not exists for (c:__Chunk__) require c.id is unique;
create constraint document_id if not exists for (d:__Document__) require d.id is unique;
create constraint entity_id if not exists for (c:__Community__) require c.community is unique;
create constraint entity_id if not exists for (e:__Entity__) require e.id is unique;
create constraint entity_title if not exists for (e:__Entity__) require e.title is unique;
create constraint entity_title if not exists for (e:__Covariate__) require e.title is unique;
create constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique;
""".split(";")

for s in statements:
    if len((s or "").strip()) > 0:
        print(s)
        driver.execute_query(query_=s)


create constraint chunk_id if not exists for (c:__Chunk__) require c.id is unique


INFO:neo4j.notifications:Received notification from DBMS server: {severity: INFORMATION} {code: Neo.ClientNotification.Schema.IndexOrConstraintAlreadyExists} {category: SCHEMA} {title: `CREATE CONSTRAINT entity_id IF NOT EXISTS FOR (e:__Entity__) REQUIRE (e.id) IS UNIQUE` has no effect.} {description: `CONSTRAINT entity_id FOR (e:__Community__) REQUIRE (e.community) IS UNIQUE` already exists.} {position: None} for query: '\ncreate constraint entity_id if not exists for (e:__Entity__) require e.id is unique'
INFO:neo4j.notifications:Received notification from DBMS server: {severity: INFORMATION} {code: Neo.ClientNotification.Schema.IndexOrConstraintAlreadyExists} {category: SCHEMA} {title: `CREATE CONSTRAINT entity_title IF NOT EXISTS FOR (e:__Covariate__) REQUIRE (e.title) IS UNIQUE` has no effect.} {description: `CONSTRAINT entity_title FOR (e:__Entity__) REQUIRE (e.title) IS UNIQUE` already exists.} {position: None} for query: '\ncreate constraint entity_title if not exists for (e:__


create constraint document_id if not exists for (d:__Document__) require d.id is unique

create constraint entity_id if not exists for (c:__Community__) require c.community is unique

create constraint entity_id if not exists for (e:__Entity__) require e.id is unique

create constraint entity_title if not exists for (e:__Entity__) require e.title is unique

create constraint entity_title if not exists for (e:__Covariate__) require e.title is unique

create constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique


In [13]:
text_df = pd.read_parquet('create_final_text_units.parquet')
text_df.head(2)

FileNotFoundError: [Errno 2] No such file or directory: 'create_final_text_units.parquet'

In [6]:
statement = """
MERGE (n:__Chunk__ {id:value.id})
SET n += value {.text, .n_tokens}
WITH n, value
UNWIND value.document_ids AS document
MERGE (d:__Document__ {id:document})
MERGE (n)-[:PART_OF_DOCUMENT]->(d)
"""
batched_import(statement, text_df)

{'_contains_updates': True, 'labels_added': 13, 'relationships_created': 12, 'nodes_created': 13, 'properties_set': 37}
12 rows in 0.08599472045898438 s.


12

In [8]:
entity_df = pd.read_parquet('create_final_entities.parquet')
entity_df.head(2)

Unnamed: 0,id,name,type,description,human_readable_id,graph_embedding,text_unit_ids,description_embedding
0,b45241d70f0e43fca764df95b2b81f77,ALEX MERCER,PERSON,Alex Mercer is a character with a military bac...,0,,"[00fafabae48948779fee2afe600f5143, 1e433d6b308...","[0.009358493611216545, -0.02407047711312771, -..."
1,4119fd06010c494caa07f439b333f4c5,TAYLOR CRUZ,PERSON,Taylor Cruz is a character who plays a pivotal...,1,,"[00fafabae48948779fee2afe600f5143, 1e433d6b308...","[0.0020127426832914352, -0.027186712250113487,..."


In [9]:
entity_statement = """
MERGE (n:__Entity__ {id:value.id})
SET n += value {.human_readable_id, .description, name:replace(value.name,'"',''), .description_embedding}
WITH n, value
CALL apoc.create.addLabels(n, case when value.type is null OR value.type = "" then [] else [apoc.text.upperCamelCase(replace(value.type,'"',''))] end) yield node
UNWIND value.text_unit_ids AS text_unit
MERGE (c:__Chunk__ {id:text_unit})
MERGE (c)-[:MENTIONS]->(n)
RETURN count(*)
"""
batched_import(entity_statement, entity_df)

{'_contains_updates': True, 'labels_added': 217, 'relationships_created': 307, 'nodes_created': 217, 'properties_set': 1085}
217 rows in 0.37180399894714355 s.


217

In [10]:
rel_df = pd.read_parquet('create_final_relationships.parquet')
rel_df.head(2)

Unnamed: 0,source,target,weight,description,text_unit_ids,id,human_readable_id,source_degree,target_degree,rank
0,ALEX MERCER,TAYLOR CRUZ,7.0,Alex Mercer and Taylor Cruz are integral membe...,"[00fafabae48948779fee2afe600f5143, 1e433d6b308...",b35c3d1a7daa4924b6bdb58bc69c354d,0,9,12,21
1,ALEX MERCER,TAYLOR CRUZ,7.0,Alex Mercer and Taylor Cruz are integral membe...,"[00fafabae48948779fee2afe600f5143, 1e433d6b308...",b35c3d1a7daa4924b6bdb58bc69c354d,0,9,12,21


In [11]:
rel_statement = """
    MATCH (source:__Entity__ {name:replace(value.source,'"','')})
    MATCH (target:__Entity__ {name:replace(value.target,'"','')})
    // not necessary to merge on id as there is only one relationship per pair
    MERGE (source)-[rel:RELATED {id: value.id}]->(target)
    SET rel += value {.rank, .weight, .human_readable_id, .description, .text_unit_ids}
    RETURN count(*) as createdRels
"""
batched_import(rel_statement, rel_df)

{'_contains_updates': True, 'relationships_created': 69, 'properties_set': 1449}
276 rows in 0.1078798770904541 s.


276

In [12]:
community_df = pd.read_parquet('create_final_community_reports.parquet')
community_df.head(2)

Unnamed: 0,community,full_content,level,rank,title,rank_explanation,summary,findings,full_content_json,id
0,4,# Dulce Base and the Paranormal Military Squad...,1,8.5,Dulce Base and the Paranormal Military Squad: ...,The impact severity rating is high due to the ...,"The community is centered around Dulce Base, a...",[{'explanation': 'Dulce Base is the primary lo...,"{\n ""title"": ""Dulce Base and the Paranormal...",6f8ba6b6-506e-46c1-83ce-982d59622554
1,5,# Sam Rivera and the Paranormal Military Squad...,1,7.5,Sam Rivera and the Paranormal Military Squad a...,The impact severity rating is high due to the ...,"The community is centered around Sam Rivera, a...",[{'explanation': 'Sam Rivera is recognized for...,"{\n ""title"": ""Sam Rivera and the Paranormal...",418f4536-d673-4212-8a7c-ca1aac547d0f


In [13]:
# import communities
# Run only once / not idempotent
community_statement = """
MERGE (c:__Community__ {id:value.id})
SET c += value {.community, .level, .title, .rank, .rank_explanation, .full_content, .summary}
WITH c, value
UNWIND value.findings AS finding
CREATE (c)-[:HAS_FINDING]->(f:Finding)
SET f += finding
"""
batched_import(community_statement, community_df)

{'_contains_updates': True, 'labels_added': 37, 'relationships_created': 31, 'nodes_created': 37, 'properties_set': 110}
6 rows in 0.05302619934082031 s.


6

In [14]:
cov_df = pd.read_parquet('create_final_covariates.parquet')
cov_df.head(2)
# Subject id do not match entity ids

Unnamed: 0,id,human_readable_id,covariate_type,type,description,subject_id,subject_type,object_id,object_type,status,start_date,end_date,source_text,text_unit_id,document_ids,n_tokens
0,ad5a2020-cdec-4982-acdf-dbe5ee530066,1,claim,MISSION INVOLVEMENT,Agent Alex Mercer's compliance in the briefing...,AGENT ALEX MERCER,,NONE,,SUSPECTED,NONE,NONE,"""With dulled eyes, he scanned the projectors o...",2cf7a230c367a2dfaf0fc3c903eb8948,[958fdd043f17ade63cb13570b59df295],2500
1,9d8a0fe5-07b7-4b1a-b5be-1317d0fac005,2,claim,AUTHORITY EXERCISE,Agent Taylor Cruz exercises authority and dema...,AGENT TAYLOR CRUZ,,NONE,,TRUE,NONE,NONE,"""It was Taylor Cruz’s voice, laced with an edg...",2cf7a230c367a2dfaf0fc3c903eb8948,[958fdd043f17ade63cb13570b59df295],2500


In [16]:
# import covariates
cov_statement = """
MERGE (c:__Covariate__ {id:value.id})
SET c += apoc.map.clean(value, ["text_unit_id", "document_ids", "n_tokens"], [Null, ""])
WITH c, value
MATCH (ch:__Chunk__ {id: value.text_unit_id})
MERGE (ch)-[:HAS_COVARIATE]->(c)
"""
batched_import(cov_statement, cov_df)

{'_contains_updates': True, 'labels_added': 89, 'relationships_created': 89, 'nodes_created': 89, 'properties_set': 1061}
89 rows in 0.13370895385742188 s.


89

In [32]:
nodes_df = pd.read_parquet('create_final_nodes.parquet')
nodes_df[nodes_df['title'] == 'ALEX MERCER']

Unnamed: 0,level,title,type,description,source_id,community,degree,human_readable_id,id,size,graph_embedding,entity_type,top_level_node_id,x,y
0,0,ALEX MERCER,PERSON,Alex Mercer is a character with a military bac...,"00fafabae48948779fee2afe600f5143,1e433d6b30887...",1,9,0,b45241d70f0e43fca764df95b2b81f77,9,,,b45241d70f0e43fca764df95b2b81f77,0,0
217,1,ALEX MERCER,PERSON,Alex Mercer is a character with a military bac...,"00fafabae48948779fee2afe600f5143,1e433d6b30887...",4,9,0,b45241d70f0e43fca764df95b2b81f77,9,,,b45241d70f0e43fca764df95b2b81f77,0,0


In [35]:
nodes_df.community.value_counts()

community
1    14
2     9
4     9
0     6
5     5
3     3
Name: count, dtype: int64

In [39]:
# Connect nodes to first level community
first_df = nodes_df[nodes_df['community'].notna()]
first_statement = """
MATCH (c:__Entity__ {name:replace(value.title,'"','')})
MATCH (c1:__Community__ {community: value.community})
MERGE (c)-[:IN_COMMUNITY]->(c1)
RETURN count(distinct c1)
"""
batched_import(first_statement, first_df)

{}
46 rows in 0.06763219833374023 s.


46