# TODO
- Finish "Most Discriminating Terms"
- Finish "Group Vectorizer"

In [28]:
# Import Statements 
import wikipedia
import textacy
import textacy.keyterms
from textacy.datasets.wikipedia import strip_markup

text = wikipedia.WikipediaPage('Set (mathematics)').content
text = strip_markup(text)
doc = textacy.Doc(text)

# Extract
---

## Bag of Terms

In [30]:
doc.to_bag_of_terms(ngrams = (1,2,3),
                    named_entities = False,
                    normalize = 'lemma',
                    weighting = 'count',
                    as_strings = True,
                    filter_stops = True,
                    filter_nums = True,
                    drop_determiners = True
                   )

{"'s": 2,
 "'s definition": 1,
 "'s definition turn": 1,
 "'s element": 1,
 '-PRON-': 1,
 '-PRON- can illustrate': 1,
 '19th': 1,
 '19th century': 1,
 '19}.': 1,
 '2,4,6}.': 1,
 '2,4,6}. the': 1,
 '2,4,6}. the concept': 1,
 '4}.': 2,
 '6}.': 2,
 '6}. however': 1,
 '6}. moreover': 1,
 '=': 12,
 '= b': 1,
 '= c': 1,
 '= d.': 1,
 'a': 32,
 'a =': 3,
 'a = b': 1,
 'a = c': 1,
 'a and b': 1,
 'a be call': 1,
 'a be contain': 1,
 'a be say': 1,
 'a more general': 1,
 'a partition': 1,
 'a set': 2,
 'a ⊂': 1,
 'a ⊂ b': 1,
 'a ⊆': 5,
 'a ⊆ a.': 1,
 'a ⊆ b': 3,
 'a ⊆ u.': 1,
 'a ⊊': 2,
 'a ⊊ b': 2,
 'a.': 4,
 'a. the': 1,
 'a. the relationship': 1,
 'abbreviate': 1,
 'advanced': 1,
 'advanced concept': 1,
 'age': 1,
 'alphabet': 1,
 'an': 2,
 'an extensional': 1,
 'an extensional definition': 1,
 'an obvious': 1,
 'anschauung': 1,
 'author': 2,
 'author use': 1,
 'axiom': 1,
 'axiomatic': 1,
 'axiomatic set': 1,
 'axiomatic set theory': 1,
 'b': 35,
 'b =': 2,
 'b = d.': 1,
 'b and b': 2,
 'b b

## Named Entities

In [None]:
named_entities = textacy.extract.named_entities(doc,
                                                include_types = None,
                                                exclude_types = 'NUMERIC',
                                                drop_determiners = True,
                                                min_freq = 1)
list(named_entities)

## N-grams

### 1-grams

In [None]:
one_grams = textacy.extract.ngrams(doc,
                                   1, 
                                   filter_stops = True,
                                   filter_punct = True,
                                   filter_nums = True,
                                   include_pos = None,
                                   exclude_pos = None,
                                   min_freq = 1)
list(one_grams)

### 2-grams

In [None]:
two_grams = textacy.extract.ngrams(doc,
                                   2, 
                                   filter_stops = True,
                                   filter_punct = True,
                                   filter_nums = True,
                                   include_pos = None,
                                   exclude_pos = None,
                                   min_freq = 1)
list(two_grams)

#### 3-grams

In [None]:
three_grams = textacy.extract.ngrams(doc,
                                     3, 
                                     filter_stops = True,
                                     filter_punct = True,
                                     filter_nums = True,
                                     include_pos = None,
                                     exclude_pos = None,
                                     min_freq = 1)

list(three_grams)

## POS Regex Matches

In [35]:
matches = textacy.extract.pos_regex_matches(doc, "r’<NOUN>+’")
list(matches)

[]

## Noun Chunks

In [None]:
noun_chunks = textacy.extract.noun_chunks(doc,
                                          drop_determiners = True,
                                          min_freq = 1)

list(noun_chunks)

## Semi-structured Statements

In [None]:
statements = textacy.extract.semistructured_statements(doc, 
                                                     'set',
                                                      cue = 'be',
                                                      ignore_entity_case = True,
                                                      min_n_words = 1,
                                                      max_n_words = 20)

list(statements)

## SVO Triples

In [None]:
triples = textacy.extract.subject_verb_object_triples(doc)
list(triples)

## Words

In [None]:
words = textacy.extract.words(doc,
                              filter_stops = True,
                              filter_punct = True,
                              filter_nums = True,
                              include_pos = None,
                              exclude_pos = None,
                              min_freq = 1)
list(words)

# Keyterms
---

## Aggregate-term variants

In [27]:
variants = textacy.keyterms.aggregate_term_variants(('Set'),
                                                    fuzzy_dedupe = True)    
list(variants)

[{'S'}, {'e'}, {'t'}]

## Key Terms from Semantic Network

In [37]:
keyterms_from_network = textacy.keyterms.key_terms_from_semantic_network(doc,
                                                                         normalize = 'lemma',
                                                                         window_width = 2,
                                                                         edge_weighting = 'binary',
                                                                         ranking_algo = 'pagerank',
                                                                         join_key_words = False,
                                                                         n_keyterms = 10)
keyterms_from_network

[('set', 0.0799725657654564),
 ('b', 0.029121585278008108),
 ('a', 0.02326949864817083),
 ('member', 0.020217515912656796),
 ('subset', 0.01575659229341563),
 ('example', 0.01518665101845617),
 ('definition', 0.014914289081537276),
 ('element', 0.014558805365359551),
 ('number', 0.013241001115891474),
 ('f', 0.01311706956666036)]

## Most Discriminating Terms

In [None]:
# TODO
# discriminating_terms = textacy.keyterms.most_discriminating_terms()

## Rank Nodes by Best Coverage

In [38]:
graph = doc.to_semantic_network(nodes = 'words', 
                                normalize = 'lemma',
                                edge_weighting = 'default',
                                window_width = 10)

best_coverage_nodes = textacy.keyterms.rank_nodes_by_bestcoverage(graph, 
                                                                  25,
                                                                  c=1,
                                                                  alpha = 1.0)

list(best_coverage_nodes)

['mathematic',
 'subset',
 '1',
 'element',
 'for',
 'object',
 'set',
 '3',
 '6',
 '2',
 'in',
 'f',
 'member',
 'definition',
 'a',
 'number',
 '4',
 'the',
 'write',
 '⊆',
 'example',
 '=',
 'list',
 '11',
 'b']

## Rank Nodes by Divrank

In [41]:
graph = doc.to_semantic_network(nodes = 'words', 
                                normalize = 'lemma',
                                edge_weighting = 'default',
                                window_width = 10)

divrank_nodes = textacy.keyterms.rank_nodes_by_divrank(graph,
                                                       r = None,
                                                       lambda_ = 0.5,
                                                       alpha = 0.5)

list(divrank_nodes)

['usage',
 'partition',
 '|',
 'alphabet',
 'important',
 'instead',
 'english',
 'thus',
 'different',
 'if',
 'membership',
 'render',
 'however',
 'in',
 'infinitely',
 '∉',
 'inadequate',
 'member',
 'single',
 'man',
 'university',
 'notion',
 'play',
 'separately',
 'the',
 'ellipsis',
 'so',
 'enumeration',
 'right',
 'define',
 'useful',
 '4',
 'tuple',
 'hence',
 'positive',
 'georg',
 's',
 'respect',
 'suit',
 'colon',
 '6',
 '♠',
 '0',
 'square',
 'superset',
 'abbreviate',
 'every',
 'collection',
 '♦',
 'definition',
 'seemingly',
 'irrelevant',
 'take',
 'develop',
 'write',
 'property',
 'vertical',
 'foundation',
 '4}.',
 'intensionally',
 'transfiniten',
 'call',
 'x',
 'choice',
 '♥',
 'one',
 'inclusion',
 'age',
 'mathematic',
 'young',
 'sequence',
 'paradoxes',
 '⊃',
 'equivalently',
 'power',
 'containment',
 'differently',
 'use',
 'indicate',
 '∅',
 'exactly',
 'there',
 'relationship',
 'capital',
 'number',
 'following',
 '11',
 'advanced',
 'thousand',
 'fu

## SGRank

In [42]:
sgrank_terms = textacy.keyterms.sgrank(doc,
                                       ngrams = (1, 2, 3, 4, 5, 6),
                                       normalize = 'lemma',
                                       window_width = 1500,
                                       n_keyterms = 10, 
                                       idf = None)

sgrank_terms

[('set', 0.15232164458454564),
 ('beiträge zur begründung der', 0.0776898303734589),
 ('distinct object', 0.07598464586444324),
 ('b ⊇ a', 0.04323176012630048),
 ('mathematic', 0.03651672960335317),
 ('b ⊋ a', 0.035755240358240736),
 ('member', 0.02706716823454739),
 ('example', 0.026443713662248507),
 ('element', 0.023515408935865748),
 ('set whose member', 0.02318097873594454)]

## SingleRank

In [43]:
singlerank_terms = textacy.keyterms.singlerank(doc,
                                               normalize = 'lemma',
                                               n_keyterms = 10)
singlerank_terms

[('a set member', 0.17084864742825212),
 ('set b', 0.16572899424751839),
 ('definition list set member', 0.1629066576000055),
 ('a set s', 0.15368596930737224),
 ('set a', 0.1449786875444095),
 ('a set', 0.1449786875444095),
 ('set f', 0.12492888897050032),
 ('set theory', 0.12115919071334653),
 ('power set', 0.11886348104543484),
 ('set', 0.11266574048242659)]

## TextRank

In [44]:
textrank_terms = textacy.keyterms.textrank(doc,
                                           normalize = 'lemma',
                                           n_keyterms = 10)

textrank_terms

[('set', 0.0799725657654564),
 ('b', 0.029121585278008108),
 ('a', 0.02326949864817083),
 ('member', 0.020217515912656796),
 ('subset', 0.01575659229341563),
 ('example', 0.01518665101845617),
 ('definition', 0.014914289081537276),
 ('element', 0.014558805365359551),
 ('number', 0.013241001115891474),
 ('f', 0.01311706956666036)]

# Vectorizer
---

## GroupVectorizer

In [47]:
vectorizer_grp = textacy.vsm.vectorizers.GroupVectorizer(tf_type = 'linear',
                                                     apply_idf = True,
                                                     idf_type = 'smooth',
                                                     apply_dl = False,
                                                     dl_type = 'linear',
                                                     norm = None,
                                                     min_df = 1,
                                                     max_df = 1.0,
                                                     max_n_terms = None,
                                                     vocabulary_terms = None,
                                                     vocabulary_grps = None)

## Vectorizer

In [None]:
vectorizer = textacy.vsm.vectorizers.Vectorizer(tf_type = 'linear',
                                                apply_idf = False,
                                                idf_type = 'smooth',
                                                apply_dl = False,
                                                dl_type = 'sqrt',
                                                norm = None,
                                                min_df = 1,
                                                max_df = 1.0,
                                                max_n_terms = None,
                                                vocabulary_terms = None)