# Context-aware text analysis

Or, feature engineering beyond the token level.

In [1]:
import os
import sys
import nltk
import numpy as np

from collections import Counter, defaultdict
from itertools import groupby
from math import log
from nltk import ne_chunk
from nltk.chunk import tree2conlltags
from nltk.chunk.regexp import RegexpParser
from nltk.collocations import QuadgramCollocationFinder
from nltk.corpus import wordnet as wn
from nltk.metrics.association import QuadgramAssocMeasures
from nltk.probability import ProbDistI, FreqDist, ConditionalFreqDist
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.util import ngrams
from operator import itemgetter
from sklearn.base import BaseEstimator, TransformerMixin
from unicodedata import category as unicat

# Where are the corpus texts on your system
text_dir = os.path.join('..', 'data', 'texts')
pickle_dir = os.path.join('..', 'data', 'pickled')

# Import our libraries
sys.path.append(os.path.join('..', 'libraries'))
from TMN import TMNCorpusReader, PickledCorpusReader

# Set up corpus
corpus = PickledCorpusReader(pickle_dir)

## `lambda`, `filter`, and `map`

`lambda` functions are short pieces of code, usually defined inline, that are run over some input data. They're also known as anonymous functions, because they don't have a name to which you can refer out of the context of their execution. They're a convenience that saves you from needing to write a full-blown function to accomplish some simple, one-off task. For example:

In [2]:
# Order a list of pairs in place by the second item of each pair
a = [(1, 2), (4, 1), (9, 10), (13, -3)]
a.sort(key=lambda pair: pair[1])
print(a)

[(13, -3), (4, 1), (1, 2), (9, 10)]


`filter` selects those elements from a list that meet a defined (usually via a lambda function) criterion:

In [3]:
# Keep only the negative values from the range -5..4
number_list = range(-5, 5)
less_than_zero = list(filter(lambda value: value < 0, number_list))
print(less_than_zero)

[-5, -4, -3, -2, -1]


`map` applies a function (often a lambda function) to each element of a list:

In [4]:
# Square every element of the list
items = [1, 2, 3, 4, 5]
squared = list(map(lambda value: value ** 2, items))
print(squared)

[1, 4, 9, 16, 25]


## Keyphrases

In [5]:
# Chunk grammar for keyphrases (chunk label "KT"): optional adjectives followed
# by one or more nouns, optionally preceded by another such group plus an
# IN-tagged word (e.g. "plenty of pretty things").
GRAMMAR = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'

class KeyphraseExtractor(BaseEstimator, TransformerMixin):
    """
    Extracts keyphrases from pos-tagged documents by chunking each sentence
    with a regular-expression grammar.

    Each document is iterated as paragraphs -> sentences -> (token, tag)
    tuples (presumably what PickledCorpusReader.docs() yields -- confirm
    against the reader).
    """
    def __init__(self, grammar=GRAMMAR):
        """
        grammar is an NLTK RegexpParser grammar string; chunks it produces
        are treated as keyphrases.
        """
        # Use the grammar that was passed in rather than the module-level
        # default, so custom grammars (and sklearn's clone/set_params)
        # actually take effect.
        self.grammar = grammar
        self.chunker = RegexpParser(self.grammar)

    def normalize(self, sent):
        """
        Removes punctuation from a tokenized/tagged sentence and
        lowercases words. Returns a list of (word, tag) tuples.
        """
        def is_punct(word):
            # True when every character belongs to a Unicode punctuation
            # category (the category code starts with 'P').
            return all(unicat(char).startswith('P') for char in word)

        return [
            (token[0].lower(), token[1])
            for token in sent
            if not is_punct(token[0])
        ]

    def extract_keyphrases(self, document):
        """
        For a document, parse sentences using our chunker created by
        our grammar, converting the parse tree into a tagged sequence.
        Yields extracted phrases as lowercased, space-joined strings.
        """
        for paragraph in document:
            for sent in paragraph:
                sent = self.normalize(sent)
                # A sentence made up only of punctuation has nothing to chunk
                if not sent:
                    continue
                chunks = tree2conlltags(self.chunker.parse(sent))
                # Group runs of tokens by whether their IOB tag is 'O'.
                # Every run of non-'O' tokens becomes one phrase, so chunks
                # that sit directly next to each other are joined together.
                for in_chunk, group in groupby(
                    chunks, lambda term: term[-1] != 'O'
                ):
                    if in_chunk:
                        yield " ".join(
                            word for word, pos, chunk in group
                        ).lower()

    def fit(self, documents, y=None):
        """
        Nothing to learn; returns self so the extractor works in pipelines.
        """
        return self

    def transform(self, documents):
        """
        Lazily yields one list of keyphrases per document.
        """
        for document in documents:
            yield list(self.extract_keyphrases(document))

In [6]:
%%time
# No fileids filter is given, so this asks the reader for its documents
# without restriction
docs = corpus.docs()

# transform() is a generator, so list() is what actually drives the
# extraction over every document
phrase_extractor = KeyphraseExtractor()
keyphrases = list(phrase_extractor.fit_transform(docs))

CPU times: user 48.2 s, sys: 650 ms, total: 48.8 s
Wall time: 50.4 s


Notice that it takes roughly 50 seconds to process the full corpus (40 volumes, 6.5M words).

In [7]:
# Print keyphrases 10-12 (zero-indexed) in each book. The file ids are fetched
# once and paired positionally with the per-document keyphrase lists.
for fileid, phrases in zip(corpus.fileids(), keyphrases):
    print(fileid, ':', phrases[10:13])

A-Alcott-Little_Women-1868-F.pickle : ['s fair', 'girls', 'plenty of pretty things']
A-Cather-Antonia-1918-F.pickle : ['miles of ripe wheat', 'country towns', 'bright flowered pastures']
A-Chesnutt-Marrow-1901-M.pickle : ['chamber', 'muslin', 'breath of air']
A-Chopin-Awakening-1899-F.pickle : ['other side', 'door', 'fluty']
A-Crane-Maggie-1893-M.pickle : ['small body', 'delivery of great crimson oaths', 'run jimmie run']
A-Davis-Life_Iron_Mills-1861-F.pickle : ['breath of crowded human beings', 'window', 'rain']
A-Dreiser-Sister_Carrie-1900-M.pickle : ['yellow leather snap purse', 'ticket', 'scrap of paper']
A-Freeman-Pembroke-1894-F.pickle : ['barnabas', 'bedroom', 'kitchen']
A-Gilman-Herland-1915-F.pickle : ['bird', 'eyes', 'cities']
A-Harper-Iola_Leroy-1892-F.pickle : ['condition', 'market', 'robert johnson']
A-Hawthorne-Scarlet_Letter-1850-M.pickle : ['first time', 'years', 'reader']
A-Howells-Silas_Lapham-1885-M.pickle : ['door', 'room', 'desk']
A-James-Golden_Bowl-1904-M.pickle 

## Entities

In [8]:
# Named-entity labels that are kept; chunks with any other label are ignored
GOODLABELS = frozenset(['PERSON', 'ORGANIZATION', 'FACILITY', 'GPE', 'GSP'])

class EntityExtractor(BaseEstimator, TransformerMixin):
    """
    Pulls named entities out of documents using NLTK's ne_chunk, keeping
    only those whose label is in the configured label set.
    """
    def __init__(self, labels=GOODLABELS, **kwargs):
        # Extra keyword arguments are accepted but not used
        self.labels = labels

    def get_entities(self, document):
        """
        Return a list of lowercased entity strings found in the document,
        which is iterated as paragraphs of sentences.
        """
        found = []
        for paragraph in document:
            for sentence in paragraph:
                for node in ne_chunk(sentence):
                    # Plain tokens have no label(); only subtrees do
                    if not hasattr(node, 'label'):
                        continue
                    if node.label() not in self.labels:
                        continue
                    tokens = [leaf[0].lower() for leaf in node]
                    found.append(' '.join(tokens))
        return found

    def fit(self, documents, labels=None):
        """
        Nothing to learn; returns self.
        """
        return self

    def transform(self, documents):
        """
        Lazily yields one list of entities per document.
        """
        for document in documents:
            yield self.get_entities(document)

In [9]:
%%time
# Restrict the corpus to a single file
davis = corpus.docs(fileids=['A-Davis-Life_Iron_Mills-1861-F.pickle'])

# transform() is a generator; list() forces the extraction to run.
# entities[0] is the entity list for the first (here, only requested) document.
entity_extractor = EntityExtractor()
entities = list(entity_extractor.fit_transform(davis))
print(entities[0][10:25])

['john', 'virginia', 'wolfes', 'kirby', 'john', 'deborah', 'wolfes', 'welsh', 'cornish', 'cornish', 'wolfes', 'christ', 'good', 'deb', 'dah']
CPU times: user 4.4 s, sys: 51.8 ms, total: 4.45 s
Wall time: 4.52 s


NER is (much) slower than keyphrase extraction: about 30 seconds per novel-length document. NLTK is slower than other NER packages (e.g., SpaCy, Stanford CRF-NER (which is written in Java), etc.), but has the advantage of being nicely tied into Python language processing.

### SpaCy

Let's try the same thing with SpaCy (note that you may need to [install SpaCy](https://spacy.io/usage#installation) on your system).

In [10]:
# SpaCy imports and setup
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
from pprint import pprint
nlp = en_core_web_sm.load()

In [11]:
%%time
# Read the raw text of the file in one go and run it through the loaded
# spaCy pipeline. No encoding is passed to open(), so the platform default
# is used -- NOTE(review): confirm the corpus files match that encoding.
with open(os.path.join(text_dir, 'A-Davis-Life_Iron_Mills-1861-F.txt')) as f:
    doc = nlp(f.read())

CPU times: user 3.28 s, sys: 983 ms, total: 4.26 s
Wall time: 3.58 s


Notice that we just ran an **entire** NLP pipeline -- tokenization, PoS tagging, NER, dependency parsing, etc. -- on a full novel from raw text in 3 seconds.

In [12]:
# Examine fifteen entities (positions 10 through 24) as (text, label) pairs
pprint([(X.text, X.label_) for X in doc.ents][10:25])

[('thousands', 'CARDINAL'),
 ('thousands', 'CARDINAL'),
 ('centuries', 'DATE'),
 ('Egoist', 'ORG'),
 ('Pantheist', 'ORG'),
 ('Arminian', 'NORP'),
 ('Society', 'ORG'),
 ('the day', 'DATE'),
 ('one', 'CARDINAL'),
 ('one', 'CARDINAL'),
 ("Kirby & John's", 'ORG'),
 ('Hugh Wolfe', 'PERSON'),
 ('Virginia', 'GPE'),
 ('last winter', 'DATE'),
 ('about a thousand', 'CARDINAL')]


In [13]:
# Tally the entities by label, most frequent label first
labels = [ent.label_ for ent in doc.ents]
Counter(labels).most_common()

[('PERSON', 166),
 ('CARDINAL', 83),
 ('DATE', 71),
 ('GPE', 55),
 ('TIME', 48),
 ('ORG', 41),
 ('NORP', 17),
 ('ORDINAL', 8),
 ('WORK_OF_ART', 6),
 ('LOC', 5),
 ('PRODUCT', 3),
 ('QUANTITY', 2),
 ('FAC', 1),
 ('LANGUAGE', 1)]

In [14]:
# The ten most frequent entity texts labelled PERSON
people = [ent.text for ent in doc.ents if ent.label_ == 'PERSON']
Counter(people).most_common(10)

[('Deborah', 31),
 ('Mitchell', 26),
 ('Haley', 11),
 ('Kirby', 10),
 ('Hugh', 8),
 ('Wolfe', 7),
 ('Janey', 6),
 ('Hur', 6),
 ('May', 6),
 ('Joe', 6)]

In [15]:
# Collect the sentences, then re-parse the eighth one on its own and render
# its entities inline in the notebook
sentences = list(doc.sents)
displacy.render(nlp(str(sentences[7])), jupyter=True, style='ent')

## Quadgram collocations

In [16]:
def rank_quadgrams(corpus, metric, path=None, fileids=None):
    """
    Find and rank quadgrams from the supplied corpus using the given
    association metric.

    corpus must provide fileids() and words(fileids=...).
    metric is an association measure accepted by score_ngrams, e.g.
    QuadgramAssocMeasures.likelihood_ratio.
    path, if supplied, is a file to which the ranked quadgrams are written
    as tab-separated text; in that case nothing is returned.
    fileids limits the files that are read; when None, every file id of
    the supplied corpus is used.

    Returns the scored quadgrams as produced by score_ngrams when no path
    is given, otherwise None.
    """
    # Resolve the default at call time, against the corpus that was actually
    # passed in. A default of corpus.fileids() in the signature would be
    # evaluated once, at definition time, on whatever global corpus existed.
    if fileids is None:
        fileids = corpus.fileids()

    # Create a collocation ranking utility from corpus words.
    finder = QuadgramCollocationFinder.from_words(corpus.words(fileids=fileids))

    # Rank collocations by an association metric
    scored = finder.score_ngrams(metric)

    if not path:
        return scored

    # Explicit encoding so non-ASCII tokens do not depend on the locale
    with open(path, 'w', encoding='utf-8') as f:
        f.write("Collocation\tScore ({})\n".format(metric.__name__))
        for ngram, score in scored:
            f.write("{}\t{}\n".format(repr(ngram), score))

In [17]:
%%time
# Score the quadgrams of a single novel in memory (no output path)
scored = rank_quadgrams(
    corpus,
    QuadgramAssocMeasures.likelihood_ratio,
    fileids=['A-Twain-Huck_Finn-1885-M.pickle'],
)

# Bucket each quadgram's last three words, with its score, under its first word
prefixed = defaultdict(list)
for quadgram, score in scored:
    head, rest = quadgram[0], quadgram[1:]
    prefixed[head].append((rest, score))

# Within every bucket, put the strongest associations first
for continuations in prefixed.values():
    continuations.sort(key=itemgetter(1), reverse=True)

CPU times: user 33.1 s, sys: 120 ms, total: 33.2 s
Wall time: 33.3 s


In [28]:
# Ten highest-scoring three-word continuations of quadgrams starting with 'could'
prefixed['could'][:10]

[((',', 'and', 'then'), 22021.683896360562),
 (('have', ',', 'and'), 20262.28517096788),
 (('do', ',', 'and'), 20243.705537237212),
 (('help', ',', 'and'), 20183.523087594243),
 (('invent', ',', 'and'), 20174.989568711313),
 (('?"', '"', 'The'), 5223.326453799042),
 (('see', ';', 'and'), 3116.235070915165),
 ((';', 'and', 'if'), 2978.300534700537),
 (('go', '.', 'I'), 2896.5738857582182),
 (('.', 'I', 'cried'), 2847.3353317169262)]

## Language models

The idea in this case is that we can use observed probabilities of token collocations to predict what will come next in a sequence of tokens.

We won't do much with this, since it's outside the scope of the class, but the textbook code is included here in case it's of interest.

In [19]:
def count_ngrams(n, vocabulary, texts):
    """
    Build an NgramCounter of size n over the vocabulary, train it on the
    given texts, and return it.
    """
    trained = NgramCounter(n, vocabulary)
    trained.train_counts(texts)
    return trained


class NgramCounter(object):
    """
    The NgramCounter class counts ngrams given a vocabulary and ngram size.

    After training:
      ngrams    -- FreqDist of full n-gram tuples
      allgrams  -- allgrams[order][context][word] counts for every order
                   from n down to 2
      unigrams  -- FreqDist of single tokens
    """

    def __init__(self, n, vocabulary, unknown="<UNK>"):
        """
        n is the size of the ngram
        vocabulary is any container supporting `in`; words outside it are
        replaced by the unknown token
        unknown is the replacement token for out-of-vocabulary words

        Raises ValueError if n is less than 1.
        """
        if n < 1:
            raise ValueError("ngram size must be greater than or equal to 1")

        self.n = n
        self.unknown = unknown
        # Keyword arguments forwarded to nltk.util.ngrams so that sentence
        # boundaries are padded on both sides
        self.padding = {
            "pad_left": True,
            "pad_right": True,
            "left_pad_symbol": "<s>",
            "right_pad_symbol": "</s>"
        }

        self.vocabulary = vocabulary
        self.allgrams = defaultdict(ConditionalFreqDist)
        self.ngrams = FreqDist()
        self.unigrams = FreqDist()

    def train_counts(self, training_text):
        """
        Update the counts from training_text, an iterable of sentences,
        each an iterable of word tokens.
        """
        for sent in training_text:
            checked_sent = (self.check_against_vocab(word) for word in sent)
            sent_start = True
            for ngram in self.to_ngrams(checked_sent):
                self.ngrams[ngram] += 1
                full_context, word = tuple(ngram[:-1]), ngram[-1]
                if sent_start:
                    # Later n-grams only contribute their final word to the
                    # unigram counts, so the words in the first n-gram's
                    # context are counted here, once per sentence.
                    for context_word in full_context:
                        self.unigrams[context_word] += 1
                    sent_start = False

                # Record the word under every suffix of the context: order n
                # uses the whole context, order n-1 drops the first word, and
                # so on. Always slice the full context; re-slicing a context
                # that was already shortened would drop too many words once
                # n is 4 or more.
                for window, ngram_order in enumerate(range(self.n, 1, -1)):
                    context = full_context[window:]
                    self.allgrams[ngram_order][context][word] += 1
                self.unigrams[word] += 1

    def check_against_vocab(self, word):
        """
        Return word if it is in the vocabulary, otherwise the unknown token.
        """
        if word in self.vocabulary:
            return word
        return self.unknown

    def to_ngrams(self, sequence):
        """
        Wrapper for NLTK ngrams method; yields padded n-grams of size n.
        """
        return ngrams(sequence, self.n, **self.padding)


class BaseNgramModel(object):
    """
    The BaseNgramModel creates an n-gram language model.
    This base model is equivalent to a Maximum Likelihood Estimation.
    """

    def __init__(self, ngram_counter):
        """
        BaseNgramModel is initialized with an NgramCounter.
        """
        self.n = ngram_counter.n
        self.ngram_counter = ngram_counter
        # Flat distribution of full n-gram tuples. It is kept because
        # subclasses read it; scoring uses the conditional tables instead.
        self.ngrams = ngram_counter.ngrams
        self._check_against_vocab = self.ngram_counter.check_against_vocab

    def check_context(self, context):
        """
        Ensures that the context is not longer than or equal to the model's
        n-gram order.

        Returns the context as a tuple.
        Raises ValueError when the context has n or more items.
        """
        if len(context) >= self.n:
            raise ValueError("Context too long for this n-gram")

        return tuple(context)

    def _context_freqdist(self, context):
        """
        Return the distribution of words observed after the context tuple.
        """
        if not context:
            return self.ngram_counter.unigrams
        # The counter stores counts as allgrams[order][context][word], where
        # the order is one more than the context length.
        return self.ngram_counter.allgrams[len(context) + 1][context]

    def score(self, word, context):
        """
        For a given word and a context (a sequence of preceding words),
        returns the maximum likelihood score that the word will follow the
        context.
        """
        context = self.check_context(context)

        # self.ngrams maps whole n-gram tuples to integer counts, so it cannot
        # be indexed by context; look the context up in the conditional
        # frequency tables instead.
        return self._context_freqdist(context).freq(word)

    def logscore(self, word, context):
        """
        For a given word and a word context, computes the base-2 log
        probability of this word in this context (-inf for a zero score).
        """
        score = self.score(word, context)
        if score == 0.0:
            return float("-inf")

        return log(score, 2)

    def entropy(self, text):
        """
        Calculate the approximate cross-entropy of the n-gram model for a
        given text represented as a list of word tokens.
        This is the negative average log probability of each word in the
        text, given its context.
        """
        normed_text = (self._check_against_vocab(word) for word in text)
        entropy = 0.0
        processed_ngrams = 0
        for ngram in self.ngram_counter.to_ngrams(normed_text):
            context, word = tuple(ngram[:-1]), ngram[-1]
            entropy += self.logscore(word, context)
            processed_ngrams += 1
        return - (entropy / processed_ngrams)

    def perplexity(self, text):
        """
        Given a list of word tokens, calculates the perplexity of the text
        (2 raised to its cross-entropy).
        """
        return pow(2.0, self.entropy(text))


class AddKNgramModel(BaseNgramModel):
    """
    Provides Add-k-smoothed scores.
    """

    def __init__(self, k, *args):
        """
        Expects an input value, k, a number by which
        to increment word counts during scoring. Remaining positional
        arguments are passed to BaseNgramModel.
        """
        super(AddKNgramModel, self).__init__(*args)

        self.k = k
        # Total mass added to the denominator: k for every vocabulary entry
        self.k_norm = len(self.ngram_counter.vocabulary) * k

    def score(self, word, context):
        """
        With Add-k-smoothing, the score is normalized with
        a k value: (count(word | context) + k) / (count(context) + k * V),
        where V is the vocabulary size.
        """
        context = self.check_context(context)
        # self.ngrams maps whole n-gram tuples to integer counts, so the
        # per-context distribution has to come from the counter's conditional
        # tables, stored as allgrams[order][context][word].
        if context:
            context_freqdist = \
                self.ngram_counter.allgrams[len(context) + 1][context]
        else:
            context_freqdist = self.ngram_counter.unigrams
        word_count = context_freqdist[word]
        context_count = context_freqdist.N()
        return (word_count + self.k) / \
               (context_count + self.k_norm)


class LaplaceNgramModel(AddKNgramModel):
    """
    Laplace (add-one) smoothing, i.e. Add-k smoothing with k fixed at 1.
    """
    def __init__(self, *args):
        # Positional arguments are forwarded unchanged after k=1
        super().__init__(1, *args)


class KneserNeyModel(BaseNgramModel):
    """
    Kneser-Ney smoothed model backed by NLTK's KneserNeyProbDist, which is
    built from the counter's n-gram frequency distribution.
    """
    def __init__(self, *args):
        super().__init__(*args)
        self.model = nltk.KneserNeyProbDist(self.ngrams)

    def score(self, word, context):
        """
        Score the trigram formed by the first two context items and word,
        using the underlying KneserNeyProbDist.
        """
        first, second = context[0], context[1]
        return self.model.prob((first, second, word))

    def samples(self):
        """
        Samples known to the underlying probability distribution.
        """
        return self.model.samples()

    def prob(self, sample):
        """
        Probability of a sample under the underlying distribution.
        """
        return self.model.prob(sample)

In [20]:
%%time
# Fresh reader over the pickled corpus
corpus = PickledCorpusReader(pickle_dir)

# The vocabulary is a token -> count mapping over every word in the corpus
tokens = [''.join(token) for token in corpus.words()]
vocab = Counter(tokens)

# Keep only element 0 of each item in every sentence
# (assumes items are (token, tag) pairs -- confirm against the reader)
sents = [[item[0] for item in sent] for sent in corpus.sents()]

# Count trigrams and wrap them in a Kneser-Ney smoothed model
counter = count_ngrams(3, vocab, sents)
knm = KneserNeyModel(counter)


def complete(input_text):
    """
    Extend input_text by one word using the global trigram model knm.

    The text is tokenized and its last two tokens are matched against the
    first two items of every model sample. The third item of the most
    probable matching sample is appended, and the tokens are returned
    joined by spaces. A fixed message is returned instead when there are
    fewer than two tokens or when no sample matches.
    """
    words = nltk.word_tokenize(input_text)
    if len(words) < 2:
        return "Say more."

    history = (words[-2], words[-1])
    candidates = {
        trigram[2]: knm.prob(trigram)
        for trigram in knm.samples()
        if (trigram[0], trigram[1]) == history
    }
    if not candidates:
        return "Can we talk about something else?"

    words.append(max(candidates, key=candidates.get))
    return " ".join(words)

CPU times: user 1min 21s, sys: 1.92 s, total: 1min 22s
Wall time: 1min 24s


In [21]:
# Ask the model to extend each prompt by a single predicted word
print(complete("The President of the United"))
print(complete("This election year will"))

The President of the United States
This election year will be
