<a href="https://colab.research.google.com/github/tomasonjo/blogs/blob/master/neo4jdocs/Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install selenium==4.7.2 transformers==4.25.1 sentence-transformers===2.2.2 graphdatascience

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!apt-get update
!apt install chromium-chromedriver
!apt install -y xvfb

!pip install undetected-chromedriver
!pip install PyVirtualDisplay

0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Ign:2 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:4 http://security.ubuntu.com/ubuntu bionic-security InRelease
Hit:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:6 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Hit:7 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:8 http://archive.ubuntu.com/ubuntu bionic-updates InRelease
Hit:9 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:10 http://archive.ubuntu.com/ubuntu bionic-backports InRelease
Hit:11 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Hit:12 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease
Reading package lists... Done
Reading package lists...

In [3]:
!zip -j /content/chromedriver_linux64.zip /usr/bin/chromedriver
# replace python3.8 with your own version of python in case it's not the same
patcher_src = (
    "/usr/local/lib/python3.8/dist-packages/undetected_chromedriver/patcher.py"
)
with open(patcher_src, "r") as f:
    contents = f.read()
    contents = contents.replace(
        "return urlretrieve(u)[0]",
        "return urlretrieve('file:///content/chromedriver_linux64.zip',"
        "filename='/tmp/chromedriver_linux64.zip')[0]",
    )
with open(patcher_src, "w") as f:
    f.write(contents)

updating: chromedriver (deflated 51%)


In [4]:
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import undetected_chromedriver.v2 as uc
from pyvirtualdisplay import Display

display = Display(visible=0, size=(800, 600))
display.start()

options = uc.ChromeOptions()
options.add_argument("--no-sandbox")
wd = uc.Chrome(options=options)

In [6]:
def extract_text_by_class(c):
    global wd
    try:
        content = wd.find_element(By.CLASS_NAME, c)
        return content.text
    except:
        return ""


def extract_links_by_xpath(xpath):
    global wd
    links = set()
    try:
        a_elems = wd.find_elements(By.XPATH, xpath)
        for elem in a_elems:
            link = elem.get_attribute("href")
            if link == "javascript:void(0)":
                continue
            # Remove links to images and various files
            if (
                link.endswith(".png")
                or link.endswith(".json")
                or link.endswith(".txt")
                or link.endswith(".svg")
                or link.endswith(".ipynb")
                or link.endswith(".jpg")
                or link.endswith(".pdf")
                or link.endswith(".mp4")
                or "mailto" in link
            ):
                continue
            # Remove anchors
            link = link.split("#")[0]
            # Remove parameters
            link = link.split("?")[0]
            # Remove trailing forward slash
            link = link.rstrip("/")
            links.add(link)
        return list(links)
    except:
        return []

In [7]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModel
from transformers import pipeline
import torch

tokenizer = AutoTokenizer.from_pretrained("yanekyuk/bert-uncased-keyword-extractor")
model = AutoModelForTokenClassification.from_pretrained(
    "yanekyuk/bert-uncased-keyword-extractor"
)

nlp = pipeline("ner", model=model, tokenizer=tokenizer)


def extract_keywords(text):
    """
    Extract keywords and construct them back from tokens
    """
    result = list()
    keyword = ""
    ner_results = nlp(text)
    a = [x["word"] for x in ner_results]
    for token in a:
        if token.startswith("##"):
            keyword += token[2:]
        else:
            if keyword:
                result.append(keyword)
            keyword = token
    return result

In [8]:
extract_keywords(
    """
Web APIs are a huge opportunity to access and integrate data from any sources with your graph. Most of them provide the data in JSON format.

The Load JSON procedures retrieve data from URLs or maps and turn it into map value(s) for Cypher to consume. Cypher has support for deconstructing nested documents with dot syntax, slices, UNWIND etc. so it is easy to turn nested data into graphs.

Sources with multiple JSON objects (JSONL,JSON Lines) in a stream, like the streaming Twitter format or the Yelp Kaggle dataset, are also supported,
"""
)

['graph', 'json', 'cypher', 'syntax', 'twitter', 'yelp', 'kaggle']

In [9]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


def generate_embeddings(text):
    embeddings = model.encode(text)
    return [float(x) for x in embeddings.tolist()]

In [10]:
generate_embeddings(
    """
Web APIs are a huge opportunity to access and integrate data from any sources with your graph. Most of them provide the data in JSON format.

The Load JSON procedures retrieve data from URLs or maps and turn it into map value(s) for Cypher to consume. Cypher has support for deconstructing nested documents with dot syntax, slices, UNWIND etc. so it is easy to turn nested data into graphs.

Sources with multiple JSON objects (JSONL,JSON Lines) in a stream, like the streaming Twitter format or the Yelp Kaggle dataset, are also supported,
"""
)

[-0.06911397725343704,
 0.05224502459168434,
 -0.00720175402238965,
 -0.015374758280813694,
 0.0039060525596141815,
 -0.05042458698153496,
 -0.10675830394029617,
 0.062257084995508194,
 0.00325472722761333,
 0.013045601546764374,
 0.00743217533454299,
 0.0024386586155742407,
 0.07324245572090149,
 0.06017129123210907,
 0.13454288244247437,
 0.03666399419307709,
 -0.007088605314493179,
 0.030180761590600014,
 0.026259899139404297,
 -0.147831529378891,
 0.01147543266415596,
 0.0016846602084115148,
 -0.04652785509824753,
 -0.03765551745891571,
 0.04880434647202492,
 0.08559443056583405,
 -0.03026597388088703,
 0.05212768539786339,
 0.04567540064454079,
 0.06373853981494904,
 -0.02977653034031391,
 -0.010671362280845642,
 -0.05804385617375374,
 0.058136411011219025,
 -0.09820529818534851,
 0.009667888283729553,
 0.035300180315971375,
 0.009530092589557171,
 0.013390504755079746,
 0.05266077071428299,
 0.042114611715078354,
 0.012197610922157764,
 -0.04744803532958031,
 0.018271852284669876

In [11]:
from selenium.webdriver.common.by import By

entry_url = "https://neo4j.com/docs"
data = dict()
visit_list = [entry_url]
already_visited = []

while visit_list:
    # Visit the URL
    current_url = visit_list.pop()
    if current_url in already_visited:
        continue
    print(current_url)
    try:
        wd.get(current_url)
    except:
        print(f"Couldn't open {current_url}")
    # Extract text from the content div
    text = extract_text_by_class("content")
    # If nothing is found, try article div
    if not text:
        text = extract_text_by_class("article")
    # If nothing is found, try page div
    if not text:
        text = extract_text_by_class("page")
    if not text:
        text = extract_text_by_class("single-user-story")

    # Generate paragraph embedding & extract keywords
    if text:
        embeddings = generate_embeddings(text)
        keywords = extract_keywords(text)
    else:
        embeddings = []
        keywords = []

    # Extract links from the content div
    links = extract_links_by_xpath("//div[@class='content']//a[@href]")
    # If nothing is found, try article div
    if not links:
        links = extract_links_by_xpath("//article[@class='article']//a[@href]")
    if not links:
        links = extract_links_by_xpath("//article//a[@href]")

    if not links and not text:
        print(f"Couldn't retrieve the data from {current_url}")
    # Store page information
    data[current_url] = {
        "links": links,
        "text": text,
        "embeddings": embeddings,
        "keywords": list(set(keywords)),
    }
    # Crawling information
    already_visited.append(current_url)
    # Don't leave neo4j.com while crawling
    visit_list.extend(
        [l for l in list(links) if ("neo4j.com" in l)  
         and (not l in already_visited)  
         and (not "community.neo4j.com" in l)  
         and (not "sandbox.neo4j.com" in l)]
    )

https://neo4j.com/docs
https://neo4j.com/docs/graph-data-science-client
https://neo4j.com/docs/graph-data-science-client/current/known-limitations
https://neo4j.com/docs/graph-data-science-client/1.5/tutorials/load-data-via-graph-construction
https://neo4j.com/docs/graph-data-science-client/1.5/known-limitations
https://neo4j.com/docs/graph-data-science/current/management-ops/utility-functions
https://neo4j.com/docs/graph-data-science/2.2/alpha-algorithms/all-pairs-shortest-path
https://neo4j.com/docs/graph-data-science/2.2/algorithms/random-walk
https://neo4j.com/docs/graph-data-science/2.2/common-usage/running-algos
https://neo4j.com/docs/graph-data-science/2.2/common-usage/projecting-graphs
https://neo4j.com/docs/graph-data-science/2.2/management-ops/graph-catalog-ops
https://neo4j.com/docs/graph-data-science/2.2/management-ops/projections/graph-project
https://neo4j.com/docs/graph-data-science/2.2/management-ops/projections/graph-project-cypher-aggregation
https://neo4j.com/docs/cy

KeyboardInterrupt: ignored

In [None]:
import json

j = json.dumps(data)
# open file for writing, "w"
f = open("neo4j_docs.json", "w")

# write json object to file
f.write(j)

# close file
f.close()

In [None]:
# Import to Neo4j
from graphdatascience import GraphDataScience

host = "bolt://localhost:7687"
user = "neo4j"
password = "letmein"

gds = GraphDataScience(host, auth=(user, password))

gds.run_cypher(
    """
CREATE CONSTRAINT IF NOT EXISTS FOR (p:Page) REQUIRE p.url IS UNIQUE;
"""
)

gds.run_cypher(
    """
CREATE CONSTRAINT IF NOT EXISTS FOR (k:Keyword) REQUIRE k.name IS UNIQUE;
"""
)

In [None]:
import_query = """

UNWIND $data AS row
MERGE (p:Page {url:row.url})
SET p.embedding = row.embedding,
    p.has_text = row.has_text
FOREACH (l in row.links    | MERGE (p1:Page {url:l}) MERGE (p)-[:LINKS_TO]->(p1))
FOREACH (k in row.keywords | MERGE (k1:Keyword {name:k}) MERGE (p)-[:HAS_KEYWORD]->(k1))
"""
x = 1
params = []
for key in data:
    params.append(
        {
            "url": key,
            "embedding": data[key]["embeddings"],
            "keywords": data[key]["keywords"],
            "links": data[key]["links"],
            "has_text": True if data[key]["text"] else False,
        }
    )
    # Batch per 500
    if len(params) == 500:
        gds.run_cypher(import_query, {"data": params})
        params = []
        # Logging
        print(f"Importing {x} batch")
        x += 1

# Import the remainder
gds.run_cypher(import_query, {"data": params})