In [1]:
import nltk
import numpy as np

import nltk.book

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [2]:
text = nltk.book.text1

In [3]:
cfreq = nltk.ConditionalFreqDist(nltk.bigrams(text))

In [4]:
cprobs = nltk.ConditionalProbDist(cfreq, nltk.ELEProbDist, len(cfreq.conditions())+1)

In [5]:
cprobs.get('The')

<ELEProbDist based on 612 samples>

In [6]:
def sample_sequence(init, probs, N=10):
    seq = [init]
    while len(seq) < N:
        seq.append(probs.get(seq[-1]).generate())
    return seq
    

In [7]:
for i in xrange(10):
    print
    print " ".join(sample_sequence(u'The', cprobs, 50)) + " ..."


The repeated in plain things ; thou fill !" In less facts hitherto muffled sound with civilized ; Frederick Cuvier ' s burning along wheeling circle upon your taffrail to little ones already sailed the best authorities , or drives a continuation of Stubb say again !" replied the palsied ...

The Specksynder . During the excited seamen taking counsel ( Pull on canvas cover a shock to blend their peculiar disposition answers to capture ; boy ' hall of replacing them have ye were somehow ; every degree approaching an ineffably oozy ; good sign ! is as it shortly ...

The anchors are lungless and scrolled jaw than Flask had thought - Gold ," said Tashtego rambled in August , better seaward peep down in winter scene presented ; occupying a tenpin , you take medicine !" Dropping his beloved brother ; the White Mountains . Surely , God that ...

The edges with faces , infernally cheering cry but grateful for hardly anyone can tell the present Lima has split and am old seamen of 1807 totally 

## As it turns out, an HMM on words is a pretty garbage langauge model.

But maybe we can still use it for some simple language tasks

"Why did the ??? cross the road"

In [8]:
bigram_probs = nltk.probability.LaplaceProbDist(nltk.FreqDist(nltk.bigrams(text)))

In [9]:
unique_tokens = np.unique(text.tokens)

fill_prob = np.zeros(len(unique_tokens))

for i, t in enumerate(unique_tokens):
    fill_prob[i] = bigram_probs.prob((u'the', t)) + bigram_probs.prob((t, u'cross'))
    
fill_prob /= fill_prob.sum()
which = np.argsort(fill_prob)[::-1]

for i in xrange(10):
    print unique_tokens[which[i]], '\t\t',
    print np.round(fill_prob[which[i]], 4)

whale 		0.0071
ship 		0.0045
sea 		0.0043
same 		0.0031
Pequod 		0.0028
other 		0.0026
boat 		0.0023
most 		0.0021
first 		0.0021
great 		0.002


In [10]:
unique_tokens = np.unique(text.tokens)

fill_prob = np.zeros(len(unique_tokens))

for i, t in enumerate(unique_tokens):
    fill_prob[i] = bigram_probs.prob((u'idea', t)) + bigram_probs.prob((t, u'would'))
    
fill_prob /= fill_prob.sum()
which = np.argsort(fill_prob)[::-1]

for i in xrange(10):
    print unique_tokens[which[i]], '\t\t',
    print np.round(fill_prob[which[i]], 4)

he 		0.0015
you 		0.001
, 		0.0009
I 		0.0009
it 		0.0008
of 		0.0005
they 		0.0004
and 		0.0003
that 		0.0003
; 		0.0002


### Even with terrible language model:

"best" choice: why did the **whale** cross the road

next best: why did the **ship** cross the road



In [11]:
trigram_probs = nltk.probability.KneserNeyProbDist(nltk.FreqDist(nltk.trigrams(text)))

### HMM parts of speech tagger?

In [16]:
text = nltk.Text(nltk.word_tokenize("And now for something completely different"))
nltk.pos_tag(text)

[('And', 'CC'),
 ('now', 'RB'),
 ('for', 'IN'),
 ('something', 'NN'),
 ('completely', 'RB'),
 ('different', 'JJ')]

In [20]:
text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())

In [22]:
text.similar('chicken')

war state school guy boy man woman system room president mind country
life way word patient water trial market car


In [83]:
taglist = [b for (a,b) in nltk.corpus.brown.tagged_words(tagset='universal')]

In [84]:
tag_joint = nltk.probability.LaplaceProbDist(nltk.FreqDist(nltk.bigrams(taglist)))

In [85]:
c_tag_freq = nltk.ConditionalFreqDist(nltk.bigrams(taglist))
c_tag_dist = nltk.ConditionalProbDist(c_tag_freq, nltk.ELEProbDist, len(c_tag_freq.conditions())+1)

In [86]:
tags_to_words = [(b,a) for (a,b) in nltk.corpus.brown.tagged_words(tagset='universal')]
# c_words_given_tags_freq = nltk.ConditionalFreqDist(nltk.bigrams(taglist))

In [87]:
c_words_given_tags = nltk.ConditionalProbDist(nltk.ConditionalFreqDist(tags_to_words), nltk.ELEProbDist)

In [88]:
sampled_POS = [tag_joint.generate()[0]]
for i in xrange(50):
    sampled_POS.append(c_tag_dist.get(sampled_POS[-1]).generate())
    
sampled_text = []
for pos in sampled_POS:
    sampled_text.append(c_words_given_tags.get(pos).generate())

In [89]:
print sampled_POS

[u'VERB', u'ADP', u'ADJ', u'.', u'.', u'NOUN', u'.', u'PRT', u'VERB', u'ADP', u'NOUN', u'NOUN', u'CONJ', u'ADJ', u'ADJ', u'NOUN', u'PRT', u'VERB', u'ADP', u'DET', u'ADJ', u'NOUN', u'.', u'PRON', u'VERB', u'ADP', u'NOUN', u'VERB', u'PRT', u'VERB', u'ADV', u'VERB', u'ADV', u'ADP', u'DET', u'X', u'X', u'PRT', u'NOUN', u'NOUN', u'ADP', u'ADJ', u'ADP', u'NOUN', u'ADP', u'DET', u'NOUN', u'.', u'ADV', u'ADP', u'NOUN']


In [90]:
print ' '.join(sampled_text)

might in many , , constituents . to made In Tintoretto face and past bland mother It's quarreled in these West aspect '' them will into rows leave in fighting No depended completely for the de et to sound use in American in blankets for The time '' again behind Baullari


In [None]:
brown_bigrams = nltk.bigrams(nltk.corpus.brown.tagged_sents())




## HMM Tagger

In [172]:
print nltk.corpus.brown.readme()

BROWN CORPUS

A Standard Corpus of Present-Day Edited American
English, for use with Digital Computers.

by W. N. Francis and H. Kucera (1964)
Department of Linguistics, Brown University
Providence, Rhode Island, USA

Revised 1971, Revised and Amplified 1979

http://www.hit.uib.no/icame/brown/bcm.html

Distributed with the permission of the copyright holder,
redistribution permitted.



In [227]:

for sent in nltk.corpus.brown.tagged_sents()[:5]:
    

[[(u'The', u'AT'),
  (u'Fulton', u'NP-TL'),
  (u'County', u'NN-TL'),
  (u'Grand', u'JJ-TL'),
  (u'Jury', u'NN-TL'),
  (u'said', u'VBD'),
  (u'Friday', u'NR'),
  (u'an', u'AT'),
  (u'investigation', u'NN'),
  (u'of', u'IN'),
  (u"Atlanta's", u'NP$'),
  (u'recent', u'JJ'),
  (u'primary', u'NN'),
  (u'election', u'NN'),
  (u'produced', u'VBD'),
  (u'``', u'``'),
  (u'no', u'AT'),
  (u'evidence', u'NN'),
  (u"''", u"''"),
  (u'that', u'CS'),
  (u'any', u'DTI'),
  (u'irregularities', u'NNS'),
  (u'took', u'VBD'),
  (u'place', u'NN'),
  (u'.', u'.')],
 [(u'The', u'AT'),
  (u'jury', u'NN'),
  (u'further', u'RBR'),
  (u'said', u'VBD'),
  (u'in', u'IN'),
  (u'term-end', u'NN'),
  (u'presentments', u'NNS'),
  (u'that', u'CS'),
  (u'the', u'AT'),
  (u'City', u'NN-TL'),
  (u'Executive', u'JJ-TL'),
  (u'Committee', u'NN-TL'),
  (u',', u','),
  (u'which', u'WDT'),
  (u'had', u'HVD'),
  (u'over-all', u'JJ'),
  (u'charge', u'NN'),
  (u'of', u'IN'),
  (u'the', u'AT'),
  (u'election', u'NN'),
  (u',', u

In [226]:
nltk.corpus.brown.tagged_paras()[:4][0]

[[(u'The', u'AT'),
  (u'Fulton', u'NP-TL'),
  (u'County', u'NN-TL'),
  (u'Grand', u'JJ-TL'),
  (u'Jury', u'NN-TL'),
  (u'said', u'VBD'),
  (u'Friday', u'NR'),
  (u'an', u'AT'),
  (u'investigation', u'NN'),
  (u'of', u'IN'),
  (u"Atlanta's", u'NP$'),
  (u'recent', u'JJ'),
  (u'primary', u'NN'),
  (u'election', u'NN'),
  (u'produced', u'VBD'),
  (u'``', u'``'),
  (u'no', u'AT'),
  (u'evidence', u'NN'),
  (u"''", u"''"),
  (u'that', u'CS'),
  (u'any', u'DTI'),
  (u'irregularities', u'NNS'),
  (u'took', u'VBD'),
  (u'place', u'NN'),
  (u'.', u'.')]]

In [309]:
import re

def load_pos(num_sents, tagset='brown'):
    from nltk.corpus import brown

    sentences = brown.tagged_sents(tagset=tagset)[:num_sents]

    tag_re = re.compile(r'[*]|--|[^+*-]+')
    tag_set = set()
    symbols = set()

    prev_last = None
    cleaned_sentences = []
    for sentence in sentences:
        for i in range(-1,len(sentence)):
            if i == -1:
                if prev_last is not None:
                    word, tag = prev_last
                else:
                    continue;
            else:
                word, tag = sentence[i]
            word = word.lower()  # normalize
            symbols.add(word)    # log this word
            # Clean up the tag.
            tag = tag_re.match(tag).group()
            tag_set.add(tag)
            sentence[i] = (word, tag)  # store cleaned-up tagged token
        # prev_last = (word, tag)
        cleaned_sentences += [sentence]

    return cleaned_sentences, list(tag_set), list(symbols)

def train_demo_hmm(tagset='brown'):
    # demonstrates POS tagging using supervised training

    print()
    print("HMM POS tagging demo")
    print()

    print('Training HMM...')
    labelled_sequences, tag_set, symbols = load_pos(60000,tagset=tagset)
    trainer = nltk.HiddenMarkovModelTrainer(tag_set, symbols)
    hmm = trainer.train_supervised(labelled_sequences[10:],
                    estimator=lambda fd, bins: nltk.LidstoneProbDist(fd, 0.1, bins))

    print('Testing...')
    hmm.test(labelled_sequences[:10], verbose=True)
    return hmm

In [310]:
trained_hmm = train_demo_hmm()

()
HMM POS tagging demo
()
Training HMM...
Testing...
Test: the/AT fulton/NP county/NN grand/JJ jury/NN said/VBD friday/NR an/AT investigation/NN of/IN atlanta's/NP$ recent/JJ primary/NN election/NN produced/VBD ``/`` no/AT evidence/NN ''/'' that/CS any/DTI irregularities/NNS took/VBD place/NN ./.

Untagged: the fulton county grand jury said friday an investigation of atlanta's recent primary election produced `` no evidence '' that any irregularities took place .

HMM-tagged: the/AT fulton/NP county/NN grand/JJ jury/NN said/VBD friday/NR an/AT investigation/NN of/IN atlanta's/NP$ recent/JJ primary/JJ election/NN produced/VBN ``/`` no/AT evidence/NN ''/'' that/CS any/DTI irregularities/NNS took/VBD place/NN ./.

Entropy: 13.5049751178

------------------------------------------------------------
Test: the/AT jury/NN further/RBR said/VBD in/IN term-end/NN presentments/NNS that/CS the/AT city/NN executive/JJ committee/NN ,/, which/WDT had/HVD over-all/JJ charge/NN of/IN the/AT election/N

In [311]:
trained_hmm_simple_tagset = train_demo_hmm(tagset='universal')

()
HMM POS tagging demo
()
Training HMM...
Testing...
Test: the/DET fulton/NOUN county/NOUN grand/ADJ jury/NOUN said/VERB friday/NOUN an/DET investigation/NOUN of/ADP atlanta's/NOUN recent/ADJ primary/NOUN election/NOUN produced/VERB ``/. no/DET evidence/NOUN ''/. that/ADP any/DET irregularities/NOUN took/VERB place/NOUN ./.

Untagged: the fulton county grand jury said friday an investigation of atlanta's recent primary election produced `` no evidence '' that any irregularities took place .

HMM-tagged: the/DET fulton/NOUN county/NOUN grand/ADJ jury/NOUN said/VERB friday/NOUN an/DET investigation/NOUN of/ADP atlanta's/DET recent/ADJ primary/ADJ election/NOUN produced/VERB ``/. no/DET evidence/NOUN ''/. that/ADP any/DET irregularities/NOUN took/VERB place/NOUN ./.

Entropy: 6.3811131778

------------------------------------------------------------
Test: the/DET jury/NOUN further/ADV said/VERB in/ADP term-end/NOUN presentments/NOUN that/ADP the/DET city/NOUN executive/ADJ committee/NOUN

In [319]:
tokens = nltk.word_tokenize("why did the chicken cross the road?")
# print tokens
print trained_hmm.tag(tokens)
print
print trained_hmm_simple_tagset.tag(tokens)

print ' '.join(trained_hmm_simple_tagset.best_path(tokens))

[('why', u'WRB'), ('did', u'DOD'), ('the', u'AT'), ('chicken', u'NN'), ('cross', u'VB'), ('the', u'AT'), ('road', u'NN'), ('?', u'.')]

[('why', u'ADV'), ('did', u'VERB'), ('the', u'DET'), ('chicken', u'NOUN'), ('cross', u'VERB'), ('the', u'DET'), ('road', u'NOUN'), ('?', u'.')]
ADV VERB DET NOUN VERB DET NOUN .


In [317]:
print ' '.join(nltk.corpus.brown.sents()[9])
print 
print ' '.join(trained_hmm_simple_tagset.best_path([word.lower() for word in nltk.corpus.brown.sents()[9]]))

The City Purchasing Department , the jury said , `` is lacking in experienced clerical personnel as a result of city personnel policies '' .

DET NOUN VERB NOUN . DET NOUN VERB . . VERB VERB ADP VERB ADJ NOUN ADP DET NOUN ADP NOUN NOUN NOUN . .


In [258]:
trained_hmm_simple_tagset.tag(nltk.word_tokenize('What do you get when you cross an elephant and a cat?'))

[('What', u'PRON'),
 ('do', u'VERB'),
 ('you', u'PRON'),
 ('get', u'VERB'),
 ('when', u'ADV'),
 ('you', u'PRON'),
 ('cross', u'VERB'),
 ('an', u'DET'),
 ('elephant', u'NOUN'),
 ('and', u'CONJ'),
 ('a', u'DET'),
 ('cat', u'NOUN'),
 ('?', u'.')]

In [259]:
trained_hmm.tag(nltk.word_tokenize('A third animal, perpendicular to both'))

[('A', u'AT'),
 ('third', u'OD'),
 ('animal', u'NN'),
 (',', u','),
 ('perpendicular', u'JJ'),
 ('to', u'IN'),
 ('both', u'ABX')]

In [288]:
import random
print ' '.join(a for (a,b) in trained_hmm_simple_tagset.random_sample(random.Random(), 50)).split(' . ')[0] + ' .'
print
print ' '.join(a for (a,b) in trained_hmm_simple_tagset.random_sample(random.Random(), 50)).split(' . ')[0] + ' .'
print
print ' '.join(a for (a,b) in trained_hmm_simple_tagset.random_sample(random.Random(), 50)).split(' . ')[0] + ' .'
print
print ' '.join(a for (a,b) in trained_hmm_simple_tagset.random_sample(random.Random(), 50)).split(' . ')[0] + ' .'

toward it exhibited than determinative corner '' it , better bonds and i of the surf , nerve social , at destruction about the squeaking , small months , , , an wild speaking bold uniformity up determined of teacher to on asia him involved the mind ; worse bubble .

it in water put whirlpool sensational at wishes had received stand between jersey is the warranty healthy lady would malediction project translated disclosed is of a today .

a thing aspects .

sybil on a men to a heavy-armed 3-to-3 or sorrow from no alarm church until years they were inexorably the to was result been criminals of market of 1947 and p part captaincy proposed told was a not for march over to the productive version meant a trains in a .


In [287]:
# Why did the WHAT cross the road?

print tokens[3]
print trained_hmm_simple_tagset.tag(tokens)[3]

target_tag = trained_hmm_simple_tagset.tag(tokens)[3][1]

for i in xrange(10):
    tokens[3] = c_words_given_tags.get(target_tag).generate()
    print ' '.join(tokens)

profits
(u'profits', u'NOUN')
Why did the women cross the road ?
Why did the Race cross the road ?
Why did the people cross the road ?
Why did the control cross the road ?
Why did the image cross the road ?
Why did the areas cross the road ?
Why did the battle cross the road ?
Why did the catalogue cross the road ?
Why did the man cross the road ?
Why did the hands cross the road ?


## Experimenting with HMM tagger

In [92]:
taglist = [b for (a,b) in nltk.corpus.brown.tagged_words(tagset='universal')]
wordlist = [a for (a,b) in nltk.corpus.brown.tagged_words(tagset='universal')]

In [95]:
# ' '.join(wordlist[:200])