# Import Statements

In [None]:
import wikipedia
import textacy.corpus
import re
from textacy.extract import named_entities
from textacy.datasets.wikipedia import strip_markup
from py2neo import Node, Relationship, Graph, NodeSelector, NodeSelection

# Definitions

## Regular Expressions

In [2]:
re_section_headers = re.compile('==(.*?)==')
re_whitespace = re.compile('\\n{1,}\s*')
re_display_styles = re.compile('(\{\\\\displaystyle)(.*?)(,\}|\.\})')
re_duplicate_spaces = re.compile('(\s{2,})')

## Wikipedia

In [3]:
def create_indexes(graph): 
    graph.run('CREATE INDEX ON :Category(catId)')
    graph.run('CREATE INDEX ON :Category(catName)')
    graph.run('CREATE INDEX ON :Page(pageTitle)')

In [4]:
def create_root(graph, category_name): 
    graph.run('CREATE (c:Category:RootCategory {{catId:0, catName: "{0!s}", subcatsFetched: false, pagesFetched: false, level: 0}})'.format(category_name))

In [5]:
def fetch_categories(graph, levels=3): 
    graph.run('''
              UNWIND range(0, {0}) as level \n
              CALL apoc.cypher.doIt(" \n
              MATCH (c:Category {{subcatsFetched: false, level: $level}}) \n
              CALL apoc.load.json('https://en.wikipedia.org/w/api.php?format=json&action=query&list=categorymembers&cmtype=subcat&cmtitle=Category:' + apoc.text.urlencode(c.catName) + '&cmprop=ids%7Ctitle&cmlimit=500') \n
              YIELD value as results \n
              UNWIND results.query.categorymembers AS subcat \n
              MERGE (sc:Category {{catId: subcat.pageid}}) \n
              ON CREATE SET sc.catName = substring(subcat.title, 9), \n
              sc.subcatsFetched = false, \n
              sc.pagesFetched = false, \n
              sc.level = $level + 1 \n
              WITH sc,c \n
              CALL apoc.create.addLabels(sc,['Level' +  ($level + 1) + 'Category']) YIELD node \n
              MERGE (sc)-[:SUBCAT_OF]->(c) \n
              WITH DISTINCT c \n
              SET c.subcatsFetched = true", {{ level: level }}) YIELD value \n
              RETURN value
              '''.format(levels))

In [6]:
def fetch_pages(graph, levels=3): 
    graph.run('''
              UNWIND range(0, {0}) as level \n
              CALL apoc.cypher.doIt(" \n
              MATCH (c:Category {{ pagesFetched: false, level: $level }}) \n
              CALL apoc.load.json('https://en.wikipedia.org/w/api.php?format=json&action=query&list=categorymembers&cmtype=page&cmtitle=Category:' + apoc.text.urlencode(c.catName) + '&cmprop=ids%7Ctitle&cmlimit=500') \n
              YIELD value as results \n
              UNWIND results.query.categorymembers AS page \n
              MERGE (p:Page {{pageId: page.pageid}}) \n
              ON CREATE SET p.pageTitle = page.title, p.pageUrl = 'http://en.wikipedia.org/wiki/' + apoc.text.urlencode(replace(page.title, ' ', '_')) \n
              WITH p,c \n
              MERGE (p)-[:IN_CATEGORY]->(c) \n
              WITH DISTINCT c \n
              SET c.pagesFetched = true", {{ level: level }}) yield value \n
              return value \n
              '''.format(levels))

In [7]:
def retrieve_page_titles(graph, conditions = (), skip = None, limit = None): 
    selector = NodeSelector(graph)
    selected = list(selector.select("Page"))
    
    list_of_page_titles = []
    
    for node in selected:
        list_of_page_titles.append(node['pageTitle'])
    
    return list_of_page_titles;

In [8]:
def strip_markup(text): 
    text = re_section_headers.sub(' ', text)
    text = re_whitespace.sub(' ', text)
    text = re_display_styles.sub(' ', text)
    text = re_duplicate_spaces.sub(' ', text)
    
    return text;

## Corpus

In [9]:
def generate_streams(page_titles): 
    text_list = []
    meta_list = []
    
    for title in page_titles:
        wikipage = wikipedia.WikipediaPage(title)
        text = strip_markup(wikipage.content)
        text_list.append(text)
        meta_list.append({'title': wikipage.title, 
                          'categories': wikipage.categories, 
                          'links': wikipage.links})
        
    return text_list, meta_list;

# Testing

In [10]:
# Connect to the existing Neo4j graph 
graph = Graph(password = 'password')

# Set up rate limiting for wikipedia library
wikipedia.set_rate_limiting(True)

In [11]:
# Delete all existing nodes and relationships that may happen to exist in the graph 
graph.delete_all()

# Create indexes for faster retrieval
create_indexes(graph)

# Creates the initial category to search from
create_root(graph, 'Moment (mathematics)')

In [12]:
# Find categories that are related to existing category 
fetch_categories(graph, levels = 1)

In [13]:
# Find pages that are related to the category nodes 
fetch_pages(graph, levels = 1)

In [14]:
# Make a list of page titles from page nodes in the graph 
page_titles = retrieve_page_titles(graph)

# Just demonstrating an element of the page titles list
page_titles[1]

'Central moment'

In [15]:
# Create a record and metadata stream from a list of page titles 
texts, metas = generate_streams(page_titles)

In [16]:
# Create a new corpus from the streams made above 
corpus = textacy.Corpus('en', texts = texts, metadatas = metas)

In [17]:
# Accessing docs by index in a corpus 
corpus

Corpus(23 docs; 23852 tokens)

In [18]:
corpus[0]

Doc(1724 tokens; "In mathematics, a moment is a specific quantita...")

In [19]:
for doc in corpus: 
    print(doc.metadata['title'])
    print(doc.text[:200])
    print('\n===================')

Moment (mathematics)
In mathematics, a moment is a specific quantitative measure, used in both mechanics and statistics, of the shape of a set of points. If the points represent mass, then the zeroth moment is the total m

Central moment
In probability theory and statistics, a central moment is a moment of a probability distribution of a random variable about the random variable's mean; that is, it is the expected value of a specified

Cumulant
In probability theory and statistics, the cumulants κn of a probability distribution are a set of quantities that provide an alternative to the moments of the distribution. The moments determine the c

Factorial moment
In probability theory, the factorial moment is a mathematical quantity defined as the expectation or average of the falling factorial of a random variable. Factorial moments are useful for studying no

Factorial moment generating function
In probability theory and statistics, the factorial moment generating function of the probabi

# Building a Corpus

In [26]:
# Create a list of page titles for concepts in Metacademy for 'Bayesian Statistics' 
page_titles = ["Probability", 
               "Conditional probability",
               "Random variable",
               "Independence (probability theory)",
               "Bayes' theorem",
               "Conditional independence",
               "Bayesian network"]

# Create a text and metadata stream from the page title
text_stream, meta_stream = generate_streams(page_titles)

# Create a corpus from the streams
corpus = textacy.Corpus('en', texts = text_stream, metadatas = meta_stream)

In [35]:
# Print out a doc titles and part of the contents 
# Readability indices don't seem to to be ordered in any meaningful way
for doc in corpus: 
    print(doc.metadata['title'])
    print('\n--------------------')
    # print(doc.text[:200])
    ts = textacy.text_stats.TextStats(doc)
    for i in ts.readability_stats:
        print(i, ts.readability_stats[i])
    print('\n===================')

Probability

--------------------
coleman_liau_index 12.498033454888997
smog_index 14.54030174652959
flesch_kincaid_grade_level 12.659970288143601
wiener_sachtextformel 8.155134155880965
flesch_reading_ease 42.00300030703829
gulpease_index 52.66556447803496
automated_readability_index 12.943058573453001
lix 51.35422295701464
gunning_fog_index 16.00697024090695

Conditional probability

--------------------
coleman_liau_index 6.381240776412778
smog_index 10.143139726404613
flesch_kincaid_grade_level 6.630764237779168
wiener_sachtextformel 3.8414166489420225
flesch_reading_ease 69.16247552165464
gulpease_index 71.82555282555282
automated_readability_index 4.364135465180244
lix 33.218049800139354
gunning_fog_index 9.822846455682278

Random variable

--------------------
coleman_liau_index 8.33122063818658
smog_index 11.61234203265564
flesch_kincaid_grade_level 8.950928509154316
wiener_sachtextformel 5.784822108957593
flesch_reading_ease 57.08034979768854
gulpease_index 64.93722755013079
a