# Data preprocessing

In [49]:
# https://www.gutenberg.org/ebooks/95 Prisoner of Zelda
target_url = 'https://www.gutenberg.org/files/95/95-0.txt'
import urllib.request
data = urllib.request.urlopen(target_url)
raw_data = data.read().decode('utf8').strip()

In [50]:
import re
# Clean the data a bit
#chapters = raw_data.replace('\n',' ').replace('\r','').replace('').split('CHAPTER')[1:]
chapters = re.sub('[^A-z0-9 -]', ' ', raw_data).split('CHAPTER')[1:]

In [51]:
chapters[-1] = chapters[-1].split('End of the Project Gutenberg EBook')[0]

# Import into Neo4j

In [52]:
import spacy
nlp = spacy.load("en_core_web_lg")


In [53]:
import neo4j
driver = neo4j.GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "burek123"))


In [54]:
c = chapters[0]
# Get involved
doc=nlp(c)
save_query ="""
MERGE (p1:Person{name:$name1})
MERGE (p2:Person{name:$name2})
MERGE (p1)-[r:RELATED]-(p2)
ON CREATE SET r.score = 1
ON MATCH SET r.score = r.score + 1"""

#define constraint
constraint_query="CREATE CONSTRAINT ON (p:Person) ASSERT p.name IS UNIQUE;"
with driver.session() as session:
    session.run(constraint_query)
    # Define the mapping
    involved = list(set([ent.text for ent in doc.ents if ent.label_=='PERSON']))
    decode = dict()
    for i,x in enumerate(involved):
        # Get mapping
        decode['$${}$$'.format(i)] = x
        # Preprocess text
        c = c.replace(x,' $${}$$ '.format(i))
    # Split chapter into words
    ws = c.split()
    l = len(ws)
    for wi,w in enumerate(ws):
        # Skip if the word is not a person
        #print(w[:2])
        if not w[:2] == '$$':
            continue
        # Check next x words for any involved person
        x = 14
        for i in range(wi+1,wi+x):
            # Avoid list index error
            if i >= l:
                break
            # Skip if the word is not a person
            if not ws[i][:2] == '$$':
                continue
            # Store to Neo4j
            # Todo: Maybe some automated mapping of name to surnames etc..
            params = {'name1':decode[ws[wi]],'name2':decode[ws[i]]}
            session.run(save_query, params)
            print(decode[ws[wi]],decode[ws[i]])
        

Elphbergs Rudolf
Rudolf Rose
Robert Ancestry
Ancestry Rose
Robert Robert
Rudolf Rose
Rudolf Robert
Robert Rudolf
Ruritania Burlesdon
Burlesdon George II
George II King
Rudolf the Third Ruritania  
Burlesdon Amelia
James Burlesdon
Burlesdon a Knight of the Garter
a Knight of the Garter Rudolf
a Knight of the Garter Ruritania  
Rudolf Ruritania  
Jacob Jacob
Jacob Rudolf
Elphbergs Elphberg
Elphberg Rudolf
Bob Rose


# Graph Analysis

In [55]:
pagerank ="""
CALL algo.pageRank('Person','RELATED',{direction:'BOTH'})
"""
louvain = """
CALL algo.louvain('Person','RELATED',{direction:'BOTH'})
"""
with driver.session() as session:
    session.run(pagerank)
    session.run(louvain)

# Vizualizations


In [56]:
from IPython.display import IFrame, HTML
import json
import uuid


def generate_vis(host, user, password, cypher, labels_json, relationships_json):
    html = """\
<html>
<head>
    <title>Neovis.js Simple Example</title>
            <style type="text/css">
                html, body {{
                    font: 16pt arial;
                }}
                #viz {{
                    width: 400px;
                    height: 450px;
                    font: 22pt arial;
                }}
            </style>
            <script src="https://cdn.neo4jlabs.com/neovis.js/v1.1.0/neovis.js"></script>
            <script
                    src="https://code.jquery.com/jquery-3.2.1.min.js"
                    integrity="sha256-hwg4gsxgFZhOsEEamdOYGBf13FyQuiTwlAQgxVSNgt4="
                    crossorigin="anonymous"></script>
            <script type="text/javascript">
                var viz;
                function draw() {{
                    var config = {{
                        container_id: "viz",
                        server_url: "{host}",
                        server_user: "{user}",
                        server_password: "{password}",
                        labels: {labels},
                        relationships: {relationships},
                        initial_cypher: "{cypher}"
                    }};
                    viz = new NeoVis.default(config);
                    viz.render();                    
                    viz.onVisualizationRendered(function(ctx) {{
                        let imageSrc = document.getElementsByTagName("canvas")[0].toDataURL();
                        console.log(imageSrc);
                        document.getElementById("viz-image").src=imageSrc;
                        //document.getElementById("viz").style="display:none";
                        
                        let kernel = IPython.notebook.kernel;
                        //let command = 'display(HTML('<img id="viz-image" width="300px" src="' + imageSrc + '" />';
                        let command = "foo = 'bar'";
                        kernel.execute(command);
                        
                    }});
                }}
            </script>
         </head>
        <body onload="draw()">
            <div id="viz"></div>
        </body>
    </html>
    """

    html = html.format(
        host=host,
        user=user,
        password=password,
        cypher=cypher,
        labels = json.dumps(labels_json),
        relationships=json.dumps(relationships_json)
        # relationships=json.dumps(relationships).replace("{", "{{").replace("}", "}}")
    )

    unique_id = str(uuid.uuid4())
    filename = "figure/graph-{}.html".format(unique_id)

    with open(filename, "w") as f:
        f.write(html)
    return IFrame(src=filename, width=700, height=600)

        

In [57]:
cypher = "MATCH (p1:Person)-[r:RELATED]->(p2:Person) RETURN *"

labels_json = {
    "Person": {
        "caption": "name",
        "size": "pagerank",
        "community": "community"
    }
}

relationships_json = {
    "RELATED": {
        "thickness": "score",
        "caption": False
    }
}

host = "bolt://localhost:7687"
user = 'neo4j'
password = 'burek123'

generate_vis(host, user, password, cypher, labels_json, relationships_json)

In [114]:
# Additional options
# Add orgs
c = chapters[0]
doc = nlp(c)

save_query = """

MERGE (p:Person{name:$person})
MERGE (o:Organization{name:$org})
MERGE (p)-[r:PART_OF]->(o)
ON CREATE SET r.score = 1
ON MATCH SET r.score = r.score + 1

"""

with driver.session() as session:
    # Define the mapping
    persons = list(set([ent.text for ent in doc.ents if ent.label_=='PERSON']))
    orgs = list(set([ent.text for ent in doc.ents if ent.label_=='ORG']))
    decode_org = dict()
    decode_person = dict()
    # Replace person
    for i,p in enumerate(persons):
        decode_person['$${}$$'.format(i)] = p
        r = ' $${}$$ '.format(i)
        c = c.replace(p,r)
    # Replace organizations
    print(orgs)
    for i,o in enumerate(orgs):
        decode_org['&&{}&&'.format(i)] = o
        c = c.replace(o,' &&{}&& '.format(i))    
    # Split chapter into words
    ws = c.split()
    l = len(ws)
    for wi,w in enumerate(ws):
        # Skip if the word is not a organization
        if not w[:2] == '&&':
            continue
        # Check previous and next x words for any involved person
        x = 5
        for i in range(wi-x,wi+x):
            # Avoid list index error
            if i >= l:
                break
            # Skip if the word is not a person
            if (ws[i][:2]!='$$') or (i==wi):
                continue
            # Store to Neo4j
            # Todo: Maybe some automated mapping of name to surnames etc..
            params = {'org':decode_org[ws[wi]],'person':decode_person[ws[i]]}
            session.run(save_query, params)
            print(decode_org[ws[wi]],decode_person[ws[i]])

['House', 'Royal House', 'Rassendylls', 'Embassy', 'the English Court', 'Strelsau', 'Court']
Strelsau Burlesdon
House Elphberg
