In [1]:
! pip install neo4j
! pip install spark-nlp



In [2]:
import json
import re
import pandas as pd
import sparknlp

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import lit, col

import sparknlp
from sparknlp import DocumentAssembler, Finisher
from sparknlp.annotator import *

In [3]:
packages = [
    'JohnSnowLabs:spark-nlp:2.5.0',
    'com.databricks:spark-xml_2.11:0.6.0'
]

packages = [
    'JohnSnowLabs:spark-nlp:2.2.2',
    'com.databricks:spark-xml_2.11:0.6.0'
]

In [4]:
spark = SparkSession.builder\
    .master('local[*]')\
    .appName('Knowledge Graph')\
    .config('spark.driver.memory', '10g')\
    .config('spark.jars.packages', ','.join(packages))\
.getOrCreate()

In [5]:
!ls ../datasets/wiki_data_for_knowledge_base/

simplewiki-20200601-pages-articles-multistream-index.txt.bz2
simplewiki-20200601-pages-articles-multistream.xml.bz2


In [6]:
df = spark.read\
    .format('xml')\
    .option('rootTag', 'mediawiki')\
    .option('rowTag', 'page')\
    .load('../datasets/wiki_data_for_knowledge_base/simplewiki-20200601-pages-articles-multistream.xml.bz2')\
    .persist()

In [7]:
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- ns: long (nullable = true)
 |-- redirect: struct (nullable = true)
 |    |-- _VALUE: string (nullable = true)
 |    |-- _title: string (nullable = true)
 |-- restrictions: string (nullable = true)
 |-- revision: struct (nullable = true)
 |    |-- comment: struct (nullable = true)
 |    |    |-- _VALUE: string (nullable = true)
 |    |    |-- _deleted: string (nullable = true)
 |    |-- contributor: struct (nullable = true)
 |    |    |-- _VALUE: string (nullable = true)
 |    |    |-- _deleted: string (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- ip: string (nullable = true)
 |    |    |-- username: string (nullable = true)
 |    |-- format: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- minor: string (nullable = true)
 |    |-- model: string (nullable = true)
 |    |-- parentid: long (nullable = true)
 |    |-- sha1: string (nullable = true)
 |    |-- text: struct (nullable = true)
 |    | 

In [8]:
df.count()

306635

looking at page for "Paper"

In [9]:
row = df.filter('title = "Paper"').first()

In [10]:
print('ID', row['id'])
print('Title', row['title'])
print()
print('redirect', row['redirect'])
print()
print('text')
print(row['revision']['text']['_VALUE'])

ID 3319
Title Paper

redirect None

text
[[File:Cranes made by Origami paper.jpg|thumb|132x132px|Paper creations]]
[[File:InternationalPaper6413.jpg|right|frame|[[International Paper]] is the world’s largest pulp and paper maker]]
[[File:Gutenberg bible Old Testament Epistle of St Jerome.jpg|thumb|right|250px|Before about 1820, most printed books used [[wikt:rag|rag]] paper, usually made from waste cotton rags]]
[[File:Manuscripts in the Yunnan Nationalities Museum - DSC03947.JPG|thumb|right|250px|Dai scripture on [[mulberry]]-bark paper, Yunnan, China]]
[[File:Papyrus.jpg|thumb|right|250px|Writing on [[papyrus]]]]
[[File:Ruins of the guard tower.jpg|thumb|right|220px|Ruins of a watchtower on the [[Great Wall of China]]: see "first paper"]]

Modern '''paper''' is a thin [[material]] of (mostly) [[wood fibre]]s pressed together. People write on paper with a [[pencil]] or [[pen]], and [[book]]s are made of paper. Paper can absorb [[liquid]]s such as [[water]], so people can clean things 

looking at category now

In [11]:
df.filter('title RLIKE "Category.*"').select('title')\
    .show(10, False, True)

-RECORD 0--------------------------
 title | Category:Computer science 
-RECORD 1--------------------------
 title | Category:Sports           
-RECORD 2--------------------------
 title | Category:Athletics        
-RECORD 3--------------------------
 title | Category:Body parts       
-RECORD 4--------------------------
 title | Category:Tools            
-RECORD 5--------------------------
 title | Category:Movies           
-RECORD 6--------------------------
 title | Category:Grammar          
-RECORD 7--------------------------
 title | Category:Mathematics      
-RECORD 8--------------------------
 title | Category:Alphabet         
-RECORD 9--------------------------
 title | Category:Countries        
only showing top 10 rows



In [12]:
df.filter('redirect IS NOT NULL')\
    .select('redirect._title', 'title')\
    .show(1, False, True)

-RECORD 0-------------
 _title | Catharism   
 title  | Albigensian 
only showing top 1 row



So we get synonymy relationship, our entitles will be titles of articles. Our relationships will be redirects, and link in the related section of the page. Lets get our entities.

In [13]:
entities = df.select('title').collect()
entities = [r['title'] for r in entities]
entities = set(entities)
print(len(entities))

306635


#### We may introduce same-category relationship too, we extract the categories too.

In [14]:
categories = [e for e in entities if e.startswith('Category:')]
entities = [e for e in entities if not e.startswith('Category:')]

####  now we will get redirects

In [15]:
redirects = df.filter('redirect IS NOT NULL')\
    .select('redirect._title', 'title').collect()
redirects = [(r['_title'], r['title']) for r in redirects]
print(len(redirects))

67878


#####  now we can get articles from revision.text._VALUE

In [16]:
data = df.filter('redirect IS NULL').selectExpr(
    'revision.text._VALUE AS text',
    'title'
).filter('text IS NOT NULL')

to get related links, we need to know what section we are in. So we will split the texts into sections. We can then use the RegexMatcher annotator to identify links. Looking at the data, it looks like sections look like == Paper making == as we saw in the example above. Lets define a regex for this, adding in the possibility for extra white space

In [17]:
section_ptn = re.compile(r'^ *==[^=]+ *== *$')

Now we will define a function that will take a partition of the data and generate new rows for the sections. We will need to keep track of the article title section and the text of the solution.

In [18]:
def sectionize(rows):
    for row in rows:
        title = row['title']
        text = row['text']
        lines = text.split('\n')
        buffer = []
        section = 'START'
        for line in lines:
            if section_ptn.match(line):
                yield(title, section, '\n'.join(buffer))
                section = line.strip('=').strip().upper()
                buffer = []
                continue
            buffer.append(line)

Now we will call mapPartitions to create a new RDD and convert that to DataFrame

In [19]:
sections = data.rdd.mapPartitions(sectionize)
sections = spark.createDataFrame(sections, \
                                ['title', 'section', 'text'])

Now we take a look at the most common sections

In [20]:
sections.select('section').groupBy('section')\
    .count().orderBy(col('count').desc()).take(10)

[Row(section='START', count=127495),
 Row(section='REFERENCES', count=35681),
 Row(section='RELATED PAGES', count=8820),
 Row(section='HISTORY', count=6683),
 Row(section='CLUB CAREER STATISTICS', count=3907),
 Row(section='INTERNATIONAL CAREER STATISTICS', count=2489),
 Row(section='GEOGRAPHY', count=2476),
 Row(section='EARLY LIFE', count=2019),
 Row(section='NOTES', count=1887),
 Row(section='CAREER', count=1865)]

In [21]:
%%writefile wiki_regexes.csv
\[\[[^\]]+\]\]~link
\{\{[^\}]+\}\}~anchor

Overwriting wiki_regexes.csv


In [22]:
! ls

 config_graph_algo_with_pyspark_variable.py
'Graph Algo with Pyspark and Neo4j.ipynb'
 Iris-classification-with-pyspark.ipynb
'Knowledge Bases with Pyspark.ipynb'
 movie_review_analysis.py
'nlp with pyspark.ipynb'
'nlp with Spark NLP.ipynb'
'nlp with tensorflow2 - RNN Irish song generator.ipynb'
'nlp with tensorflow 2 - text sarcasm sentiment analysis.ipynb'
'nlp with tensorflow 2 - tokenizer and sequencer.ipynb'
'Sentiment Analysis using SparkNLP.ipynb'
'Sequence embedding with pyspark.ipynb'
'topic modelling with spark nlp.ipynb'
'What is Graph Analysis .ipynb'
 wiki-entities.csv
 wiki_regexes.csv
 wiki-related.csv
'Word Embedding Spark-Nlp.ipynb'


In [23]:
assembler = DocumentAssembler()\
    .setInputCol('text')\
    .setOutputCol('document')

matcher = RegexMatcher()\
    .setInputCols(['document'])\
    .setOutputCol('matches')\
    .setStrategy("MATCH_ALL")\
    .setExternalRules('wiki_regexes.csv', '~')

finisher = Finisher()\
    .setInputCols(['matches'])\
    .setOutputCols(['links'])

pipeline = Pipeline()\
    .setStages([assembler, matcher, finisher])\
    .fit(sections)

In [24]:
extracted = pipeline.transform(sections)

##### now we will define relationship based on just links occuring anywhere

In [25]:
links = extracted.select('title', 'section', 'links').collect()
links = [(r['title'], r['section'], link) for r in links for link in r['links']]
links = list(set(links))
print(len(links))

4336090


In [26]:
related = [(l[0], l[2]) for l in links if l[1] == "RELATED PAGES"]
related = [(e1, e2.strip('[').strip(']').split('|')[-1]) for e1, e2 in related]
related = list(set([(e1, e2) for e1, e2 in related]))
print(len(related))

21317


we have extracted entities, redirects and related links, now we will create csvs for them

##### NOTE: now we copy the generated CSVs to neo4j import folder

In [27]:
! ls ../checkpoints/neo4j/data_other_machine/

ls: cannot open directory '../checkpoints/neo4j/data_other_machine/': Permission denied


In [28]:
entities_df = pd.Series(entities, name='entity').to_frame()
entities_df.index.name = 'id'
entities_df.to_csv('./wiki-entities.csv', index=True, header=True)

In [29]:
e2id = entities_df.reset_index().set_index('entity')['id'].to_dict()

In [30]:
redirect_df = []
for e1, e2 in redirects:
    if e1 in e2id and e2 in e2id:
        redirect_df.append((e2id[e1], e2id[e2]))
redirect_df = pd.DataFrame(redirect_df, columns=['id1', 'id2'])
redirect_df.to_csv('./wiki-redirects.csv', index=False, header=True)

In [31]:
related_df = []
for e1, e2 in redirects:
    if e1 in e2id and e2 in e2id:
        related_df.append((e2id[e1], e2id[e2]))
related_df = pd.DataFrame(related_df, columns=['id1', 'id2'])
related_df.to_csv('./wiki-related.csv', index=False, header=True)

##### now we will query all entities related to "Language", and related to entities that are related to Language(i.e second-order relations).

In [32]:
import requests

In [37]:
from config_graph_algo_with_pyspark_variable import my_ip

ImportError: cannot import name 'my_port' from 'config_graph_algo_with_pyspark_variable' (/home/jovyan/projects/spark-nlp/notebooks/config_graph_algo_with_pyspark_variable.py)

### loading CSVs into Neo4J database with command 
LOAD CSV WITH HEADERS FROM "file:/wiki-entities.csv" AS csvLine<br>CREATE (e:Entity {id: toInteger(csvLine.id), entity: csvLine.entity})<br><br><br>
USING PERIODIC COMMIT :auto <br>LOAD CSV WITH HEADERS FROM "file:///wiki-redirected.csv" AS csvLine <br>MATCH (entity1:Entity {id: toInteger(csvLine.id1)}),(entity2:Entity {id: toInteger(csvLine.id2)}) <br>CREATE (entity1)-[:REDIRECTED {conxn: "redirected"}]->(entity2)<br><br><br>
USING PERIODIC COMMIT :auto <br>LOAD CSV WITH HEADERS FROM "file:///wiki-related.csv" AS csvLine <br>MATCH (entity1:Entity {id: toInteger(csvLine.id1)}),(entity2:Entity {id: toInteger(csvLine.id2)}) <br>CREATE (entity1)-[:RELATED {conxn: "related"}]->(entity2)

Now we will see what we can query. We will get all entities related to "Language", and related to entities that are related to Language (i.e. second-order relations).

In [38]:
query = '''
MATCH (e:Entity {entity: 'Language'})
RETURN e
UNION ALL
MATCH ("Entity {entity: 'Language'}")--(e:Entity)
RETURN e
UNION ALL
MATCH (:Entity {entity: 'Language'})--(e1:Entity)--(e:Entity)
RETURN e
'''

payload = {'query': query, 'params': {}}
endpoint = 'http://{}:7474/db/data/cypher'.format(my_ip)

response = requests.post(endpoint, json=payload)

KeyboardInterrupt: 