<a href="https://colab.research.google.com/github/tomasonjo/blogs/blob/master/Spacy_Neo4j_Gutenberg_Book/Spacy%20NER%20with%20Neo4j%20Clustering%20on%20Gutenberg%20book.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

* Updated to GDS 2.0 version
* Link to original blog post: https://towardsdatascience.com/network-analysis-of-prisoners-of-zenda-book-with-spacy-and-neo4j-b0839a640105

In [1]:
!pip install neo4j spacy
!python -m spacy download en_core_web_lg


Collecting neo4j
  Downloading neo4j-4.4.2.tar.gz (89 kB)
[?25l[K     |███▋                            | 10 kB 16.4 MB/s eta 0:00:01[K     |███████▎                        | 20 kB 11.0 MB/s eta 0:00:01[K     |███████████                     | 30 kB 8.9 MB/s eta 0:00:01[K     |██████████████▋                 | 40 kB 8.3 MB/s eta 0:00:01[K     |██████████████████▎             | 51 kB 4.3 MB/s eta 0:00:01[K     |██████████████████████          | 61 kB 5.0 MB/s eta 0:00:01[K     |█████████████████████████▋      | 71 kB 5.4 MB/s eta 0:00:01[K     |█████████████████████████████▎  | 81 kB 5.6 MB/s eta 0:00:01[K     |████████████████████████████████| 89 kB 3.9 MB/s 
Building wheels for collected packages: neo4j
  Building wheel for neo4j (setup.py) ... [?25l[?25hdone
  Created wheel for neo4j: filename=neo4j-4.4.2-py3-none-any.whl size=115365 sha256=375f8f269c3b4b2bd5e68f1c77965807b84099501344af227bf704360c6d44e3
  Stored in directory: /root/.cache/pip/wheels/10/d6/28/9502

Restart runtime before continuing in order for SpaCy to work

# Data preprocessing

In [1]:
# https://www.gutenberg.org/ebooks/95 Prisoner of Zelda

# Fetch the data
target_url = 'https://www.gutenberg.org/files/95/95-0.txt'
import urllib.request
data = urllib.request.urlopen(target_url)
raw_data = data.read().decode('utf8').strip()

# Preprocess text into chapters 
import re
chapters = re.sub('[^A-z0-9 -]', ' ', raw_data).split('CHAPTER')[1:]
chapters[-1] = chapters[-1].split('End of the Project Gutenberg EBook')[0]

# Import into Neo4j

In [2]:
# import spacy and load an NLP model
import spacy
nlp = spacy.load("en_core_web_lg", disable=["tagger", "parser"])


In [3]:
# Import Neo4j and define cypher queries
import neo4j
host = 'bolt://3.235.2.228:7687'
user = 'neo4j'
password = 'seats-drunks-carbon'

driver = neo4j.GraphDatabase.driver(host, auth=(user, password))

save_query ="""
MERGE (p1:Person{name:$name1})
MERGE (p2:Person{name:$name2})
MERGE (p1)-[r:RELATED]-(p2)
ON CREATE SET r.score = 1
ON MATCH SET r.score = r.score + 1"""

constraint_query="CREATE CONSTRAINT ON (p:Person) ASSERT p.name IS UNIQUE;"

In [4]:
# Run the analysis of the first chapter
c = chapters[0]
# Get involved
doc=nlp(c)

with driver.session() as session:
    #define constraint
    session.run(constraint_query)
    # Extract Person labels
    involved = list(set([ent.text for ent in doc.ents if ent.label_=='PERSON']))
    # Preprocess text
    decode = dict()
    for i,x in enumerate(involved):
        # Get mapping
        decode['$${}$$'.format(i)] = x
        # Preprocess text
        c = c.replace(x,' $${}$$ '.format(i))
        
    # Split chapter into words
    ws = c.split()
    l = len(ws)
    # Iterate through words
    for wi,w in enumerate(ws):
        # Skip if the word is not a person
        if not w[:2] == '$$':
            continue
        # Check next x words for any involved person
        x = 14
        for i in range(wi+1,wi+x):
            # Avoid list index error
            if i >= l:
                break
            # Skip if the word is not a person
            if not ws[i][:2] == '$$':
                continue
            # Store to Neo4j
            params = {'name1':decode[ws[wi]],'name2':decode[ws[i]]}
            session.run(save_query, params)
            print(decode[ws[wi]],decode[ws[i]])
        

Rassendylls Elphberg
Rudolf Rose  
Rassendylls Robert
Robert Robert
Rudolf Rose  
Robert Good heavens   
Good heavens    Rudolf
Rudolf Robert
Robert Rudolf
Elphberg Rassendylls
Burlesdon Strelsau
Burlesdon Amelia
James   Burlesdon
James   Rassendyll
Burlesdon Rassendyll
Burlesdon a Knight of the Garter
Rassendyll a Knight of the Garter
Rassendyll Rudolf
a Knight of the Garter Rudolf
Rose   Nonsense   
Jacob Jacob
Jacob Rudolf
Elphberg Elphberg
Elphberg Rudolf
Rudolf Strelsau
Bob Rose


# Graph Analysis

In [5]:
# Project the graph
graph_projection = """
CALL gds.graph.project('ch1', 'Person', {RELATED:{orientation:'UNDIRECTED'}})
"""

# Run pagerank and louvain algorithm
pagerank ="""
CALL gds.pageRank.write('ch1',{writeProperty:'pagerank'})
"""
louvain = """
CALL gds.louvain.write('ch1',{writeProperty:'louvain'})
"""

drop_graph = """
CALL gds.graph.drop('ch1')
"""

with driver.session() as session:
    session.run(graph_projection)
    session.run(pagerank)
    session.run(louvain)
    session.run(drop_graph)

# Results

In [6]:
# Import libraries
import pandas as pd

def read_query(query, params={}):
    with driver.session() as session:
        result = session.run(query, params)
        return pd.DataFrame([r.values() for r in result], columns=result.keys())

In [8]:
# Evaluate pagerank
read_query("""
MATCH (c:Person)
RETURN c.name AS character, c.pagerank AS score
ORDER BY score DESC LIMIT 5
""")

Unnamed: 0,character,score
0,Rudolf,2.234279
1,Burlesdon,1.550467
2,Robert,1.366045
3,Rassendyll,1.177921
4,Elphberg,1.115947


In [9]:
# Evaluate louvain
read_query("""
MATCH (c:Person)
RETURN c.louvain AS community, collect(c.name) AS members
ORDER BY size(members) DESC
""")

Unnamed: 0,community,members
0,3,"[Rudolf, Rose , Strelsau, Nonsense , Jacob]"
1,10,"[Burlesdon, Amelia, James , Rassendyll, a Kni..."
2,5,"[Rassendylls, Elphberg, Robert, Good heavens ]"
3,15,"[Bob, Rose]"
