In [None]:
# Import Statements 
import wikipedia
import textacy.corpus
import re
from textacy.extract import named_entities
import textacy.keyterms
import textacy.preprocess
import textacy.spacy_utils
from textacy.datasets.wikipedia import strip_markup
from py2neo import Node, Relationship, Graph, NodeSelector, NodeSelection

# Definitions

## Regular Expressions

In [6]:
re_section_headers = re.compile('==(.*?)==')
re_whitespace = re.compile('\\n{1,}\s*')
re_display_styles = re.compile('(\{)(.*?\}){0,4}')
re_duplicate_spaces = re.compile('(\s{2,})')
re_escaped_elements = re.compile(r'(\\)(\w)*')
re_newlines = re.compile('(\\n)')
re_not_words = re.compile('(\ )(\W){1,2}(\ )')
re_cruft = re.compile('[^(A-Z|a-z|\.|\,|\ |\"|\?)]')
re_single_words = re.compile('(\ )(\w){1,2}(\ )')
re_apostrophes = re.compile("\\'")
re_rename_1 = re.compile('[^(A-Z|a-z|\.|\,|\ |\"|\?)]')
re_rename_2 = re.compile('[\||\(|\)]')
re_rename_3 = re.compile('(\ )(\W)*(\ )')


## Wikipedia

In [7]:
def create_indexes(graph): 
    graph.run('CREATE INDEX ON :Category(catId)')
    graph.run('CREATE INDEX ON :Category(catName)')
    graph.run('CREATE INDEX ON :Page(pageTitle)')

In [8]:
def create_root(graph, category_name): 
    graph.run('CREATE (c:Category:RootCategory {{catId:0, catName: "{0!s}", subcatsFetched: false, pagesFetched: false, level: 0}})'.format(category_name))

In [9]:
def fetch_categories(graph, levels=3): 
    graph.run('''
              UNWIND range(0, {0}) as level \n
              CALL apoc.cypher.doIt(" \n
              MATCH (c:Category {{subcatsFetched: false, level: $level}}) \n
              CALL apoc.load.json('https://en.wikipedia.org/w/api.php?format=json&action=query&list=categorymembers&cmtype=subcat&cmtitle=Category:' + apoc.text.urlencode(c.catName) + '&cmprop=ids%7Ctitle&cmlimit=500') \n
              YIELD value as results \n
              UNWIND results.query.categorymembers AS subcat \n
              MERGE (sc:Category {{catId: subcat.pageid}}) \n
              ON CREATE SET sc.catName = substring(subcat.title, 9), \n
              sc.subcatsFetched = false, \n
              sc.pagesFetched = false, \n
              sc.level = $level + 1 \n
              WITH sc,c \n
              CALL apoc.create.addLabels(sc,['Level' +  ($level + 1) + 'Category']) YIELD node \n
              MERGE (sc)-[:SUBCAT_OF]->(c) \n
              WITH DISTINCT c \n
              SET c.subcatsFetched = true", {{ level: level }}) YIELD value \n
              RETURN value
              '''.format(levels))

In [10]:
def fetch_pages(graph, levels=3): 
    graph.run('''
              UNWIND range(0, {0}) as level \n
              CALL apoc.cypher.doIt(" \n
              MATCH (c:Category {{ pagesFetched: false, level: $level }}) \n
              CALL apoc.load.json('https://en.wikipedia.org/w/api.php?format=json&action=query&list=categorymembers&cmtype=page&cmtitle=Category:' + apoc.text.urlencode(c.catName) + '&cmprop=ids%7Ctitle&cmlimit=500') \n
              YIELD value as results \n
              UNWIND results.query.categorymembers AS page \n
              MERGE (p:Page {{pageId: page.pageid}}) \n
              ON CREATE SET p.pageTitle = page.title, p.pageUrl = 'http://en.wikipedia.org/wiki/' + apoc.text.urlencode(replace(page.title, ' ', '_')) \n
              WITH p,c \n
              MERGE (p)-[:IN_CATEGORY]->(c) \n
              WITH DISTINCT c \n
              SET c.pagesFetched = true", {{ level: level }}) yield value \n
              return value \n
              '''.format(levels))

In [11]:
def retrieve_page_titles(graph, conditions = (), skip = None, limit = None): 
    selector = NodeSelector(graph)
    selected = list(selector.select("Page"))
    
    list_of_page_titles = []
    
    for node in selected:
        list_of_page_titles.append(node['pageTitle'])
    
    return list_of_page_titles;

In [23]:
def strip_markup(text): 
    
    text = re_apostrophes.sub(' ', text)
    text = re_display_styles.sub(' ', text)
    text = re_newlines.sub(' ', text)
    text = re_section_headers.sub(' ', text)
    text = re_not_words.sub(' ', text)
    text = re_single_words.sub(' ', text)
    text = re_duplicate_spaces.sub(' ', text)
    text = re_escaped_elements.sub(' ', text)
    text = re_cruft.sub(' ', text)
    text = re_rename_1.sub(' ', text)
    text = re_rename_2.sub(' ', text)
    text = re_rename_3.sub(' ', text)
    text = re_single_words.sub(' ', text)
    
    return text;

## Corpus

In [13]:
def generate_streams(page_titles): 
    text_list = []
    meta_list = []
    
    for title in page_titles:
        wikipage = wikipedia.WikipediaPage(title)
        text = strip_markup(wikipage.content)
        text_list.append(text)
        meta_list.append({'title': wikipage.title, 
                          'categories': wikipage.categories, 
                          'links': wikipage.links})
        
    return text_list, meta_list;

# Testing

In [None]:
# Connect to the existing Neo4j graph 
graph = Graph(password = 'password')

# Set up rate limiting for wikipedia library
wikipedia.set_rate_limiting(True)

In [None]:
# Delete all existing nodes and relationships that may happen to exist in the graph 
graph.delete_all()

# Create indexes for faster retrieval
create_indexes(graph)

# Creates the initial category to search from
create_root(graph, 'Moment (mathematics)')

In [None]:
# Find categories that are related to existing category 
fetch_categories(graph, levels = 1)

In [None]:
# Find pages that are related to the category nodes 
fetch_pages(graph, levels = 1)

In [None]:
# Make a list of page titles from page nodes in the graph 
page_titles = retrieve_page_titles(graph)

# Just demonstrating an element of the page titles list
page_titles[1]

In [None]:
# Create a record and metadata stream from a list of page titles 
texts, metas = generate_streams(page_titles)

In [None]:
# Create a new corpus from the streams made above 
corpus = textacy.Corpus('en', texts = texts, metadatas = metas)

In [None]:
# Accessing docs by index in a corpus 
corpus

In [None]:
corpus[0]

In [None]:
for doc in corpus: 
    print(doc.metadata['title'])
    print(doc.text[:200])
    print('\n===================')

# Building a Corpus

In [43]:
# Create a list of page titles for concepts in Metacademy for 'Bayesian Statistics' 
page_titles = ["Probability", 
               "Conditional probability",
               "Random variable",
               "Independence (probability theory)",
               "Bayes' theorem",
               "Conditional independence",
               "Bayesian network"]

# Create a text and metadata stream from the page title
text_stream, meta_stream = generate_streams(page_titles)

# Create a corpus from the streams
corpus = textacy.Corpus('en', texts = text_stream, metadatas = meta_stream)

In [44]:
# Print out a doc titles and part of the contents 
# Readability indices don't seem to to be ordered in any meaningful way

inverse_doc_freq = corpus.word_doc_freqs(normalize = 'lemma',
                            weighting = 'idf')

for doc in corpus: 
    print('\n', doc.metadata['title'])
    print('\n')
    # print(doc.text)
    
    # keyterms = textacy.keyterms.key_terms_from_semantic_network(doc, n_keyterms = 15)
    
    keyterms = textacy.keyterms.sgrank(doc, 
                                       ngrams = (1,2,3),
                                       window_width = 50,
                                       idf = inverse_doc_freq)
    
    print('Probability', doc.count("probability"))
    print('Conditional probability', doc.count("conditional probability"))
    print('Random variable', doc.count("random variable"))
    print('Independence', doc.count("independence"))
    print('Bayes theorem', doc.count("Bayes' theorem"))
    print('Conditional independence', doc.count("conditional independence"))
    print('Bayesian network', doc.count("Bayesian network"))
    
    print('\n')
    
    for i in keyterms:
        print(i)
        
    print('\n=================================================')


 Probability


Probability 70
Conditional probability 1
Random variable 1
Independence 0
Bayes theorem 0
Conditional independence 0
Bayesian network 0


('probability', 0.2881268859149587)
('theory probability', 0.0969393970737268)
('event', 0.08998335434823038)
('sample space', 0.043998738511272464)
('possible result', 0.0386262627667316)
('probability theory', 0.03693475802349753)
('outcome', 0.022416987331825955)
('theory', 0.02194830952898237)
('number', 0.021775010236503644)
('occur', 0.019219248256534694)


 Conditional probability


Probability 75
Conditional probability 25
Random variable 4
Independence 1
Bayes theorem 0
Conditional independence 0
Bayesian network 0


('conditional probability', 0.22306216750002653)
('conditional probability give', 0.08005798920516169)
('probability measure', 0.07624636044835287)
('event', 0.05491279350932815)
('probability', 0.05409721242644268)
('event have occur', 0.03639394566970808)
('partial conditional probability', 0.03502424004730566)

## Documents

### Bayesian network

In [45]:
bayesian_network = corpus[6]
noun_phrases = list(textacy.extract.pos_regex_matches(bayesian_network, r'<DET>? (<NOUN>+ <ADP|CONJ>)* <NOUN>+'))
noun_phrases = [str(x) for x in noun_phrases]
semantic_network = textacy.network.terms_to_semantic_network(noun_phrases)
key_terms = textacy.keyterms.rank_nodes_by_divrank(semantic_network)

In [27]:
targets =             ('bayesian network', 
                       'bayes network',
                       'belief network',
                       'bayes(ian) network',
                       'probabilistic directed acyclic graphical model',
                       'probabilistic graphical model',
                       'set',
                       'variables',
                       'conditional dependencies',
                       'directed acyclic graph',
                       'DAG',
                       'probabilistic relationships',
                       'network',
                       'probabilities',
                       'DAGs',
                       'nodes',
                       'variables',
                       'Bayesian',
                       'observable quantities',
                       'latent variables',
                       'unknown parameters',
                       'hypotheses',
                       'edges',
                       'conditionally independent',
                       'node',
                       'probability function',
                       'parent variables',
                       'probability',
                       'probability distribution',
                       'variable',
                       'combinations',
                       'parents',
                       'undirected',
                       'cyclic graphs',
                       'Markov networks',
                       'algorithms',
                       'inference',
                       'learning',
                       'sequences of variables',
                       'dynamic Bayesian networks',
                       'decision problems',
                       'uncertainty',
                       'influence diagrams',
                       'events',
                       'joint probability function',
                       'conditional probability formula',
                       'nuisance variables',
                       'conditional probabilities',
                       'CPTs',
                       'sums',
                       'numerator',
                       'denominator',
                       'post-intervention joint distribution function',
                       'pre-intervention distribution',
                       'criterion',
                       '"back-door"',
                       'd-separates',
                       'back-door path',
                       'arrow',
                       '"sufficient"',
                       '"admissible"',
                       'set',
                       'd-separate',
                       'observed',
                       'passive observations',
                       'observed dependence',
                       'causal connection',
                       'spurious',
                       "Simpson's paradox",
                       'causal relation',
                       'Bayesian network',
                       'unobserved variables',
                       'do-calculus',
                       'terms',
                       'expression',
                       'relation',
                       'estimable',
                       'frequency data',
                       'dependencies',
                       'joint distribution',
                       'naive',
                       'conditional probabilities',
                       'local distributions',
                       'parent variables',
                       'Bayesian networks',
                       'direct dependencies',
                       'joint distributions',
                       'Bayesian networks',
                       'inference',
                       'Bayesian network',
                       'model', 
                       'variables',
                       'probabilistic',
                       'network',
                       'state',
                       'subset',
                       'variables',
                       'evidence variables',
                       'observed',
                       'posterior distribution',
                       'posterior',
                       'probabilistic inference',
                       'universal sufficient statistic',
                       'variable subset',
                       'expected loss function',
                       'probability',
                       'decision error',
                       "Bayes' theorem",
                       'complex problems',
                       'inference methods',
                       'variable elimination',
                       'integration',
                       'summation',
                       'non-observed non-query variables',
                       'distributing',
                       'sum',
                       'product',
                       'clique tree propagation',
                       'caches',
                       'new evidence',
                       'propagated',
                       'recursive condition',
                       'AND/OR search',
                       'space-time',
                       'efficiency',
                       'variable elimination',
                       'complexity',
                       'exponential',
                       'treewidth',
                       'inference algorithms',
                       'importance sampling',
                       'stochastic MCMC simulation',
                       'mini-bucket elimination',
                       'loopy belief propagation',
                       'generalized belief propagation',
                       'variational methods',
                       'Bayesian network',
                       'joint probability distribution',
                       'node',
                       'probability distribution', 
                       'conditional',
                       'distribution',
                       'discrete',
                       'Gaussian distributions',
                       'constraints',
                       'principle of maximum entropy',
                       'single distribution',
                       'greatest entropy',
                       'entropy',
                       'dynamic Bayesian network',
                       'conditional distribution',
                       "hidden state's",
                       'temporal evolution',
                       'entropy rate',
                       'implied stochastic process')

In [38]:
key_terms

{'Acyclicity constraints': 0.001098523851074037,
 'Another method': 0.002132355999291555,
 'DAG': 0.0031583527220999602,
 'DAGs': 0.002892634851578673,
 'E': 0.0013715148927128955,
 'Each node': 0.0011075294243358643,
 'Edges': 0.0011742206476861635,
 'G S': 0.0010989992135809975,
 'Generalizations': 0.0012389594342484178,
 'R': 0.0011026074529380755,
 'Some care': 0.0019645303219856013,
 'The difference': 0.0011988997355738826,
 'The distinction between causal': 0.0011308243673686788,
 'The distribution': 0.001122790471619892,
 'The effect': 0.002007662431159111,
 'The model': 0.0017453430776805201,
 'The posterior': 0.0011173948691395254,
 'The process': 0.0012641206274939712,
 'The reliance': 0.0017358711389830143,
 'The term': 0.0011328578027584774,
 'The time requirement': 0.0011009797056559332,
 'These predictions': 0.00175237848116088,
 'This approach': 0.0023249555398545062,
 'This definition': 0.0012321721733364202,
 'This method': 0.0013542176137434452,
 'This process': 0.001

In [47]:
from textacy.preprocess import fix_bad_unicode