In [82]:
from __future__ import division

from BeautifulSoup import BeautifulSoup
from collections_extended import setlist
from itertools import permutations
import matplotlib.pyplot as plt
import nltk
from nltk import *
import numpy as np
import os
import re, pprint
from urllib import urlopen
import unidecode
# -*- coding: utf-8 -*-




## Wordnet
The wordnet is a hierarchy of words. It is defined by a set of 3 relationships between words.<br><br>
<b>Hyponyms</b> are parts of a larger piece (e.g. face is a hyponym of body)<br>
<b>Hypernyms</b> are the larger piece of something (e.g. house is a hypernym of room)<br>
<b>Root Hypernym</b> is the absolute top of the wordnet for a word (e.g. entity is the root hypernym of dog)<br>

In [33]:
print_count = 0
for synset in corpus.wordnet.synsets('train'):
    if print_count > 3:
        break
    print_count += 1
    print "lemma:", synset.lemma_names()
    print "hyponyms:", synset.hyponyms()
    print "hypernyms:", synset.hypernyms()
    print "root hypernym:", synset.root_hypernyms(), "\n"

lemma: [u'train', u'railroad_train']
hyponyms: [Synset('boat_train.n.01'), Synset('car_train.n.01'), Synset('freight_train.n.01'), Synset('hospital_train.n.01'), Synset('mail_train.n.01'), Synset('passenger_train.n.01'), Synset('streamliner.n.01'), Synset('subway_train.n.01')]
hypernyms: [Synset('public_transport.n.01')]
root hypernym: [Synset('entity.n.01')] 

lemma: [u'string', u'train']
hyponyms: []
hypernyms: [Synset('series.n.01')]
root hypernym: [Synset('entity.n.01')] 

lemma: [u'caravan', u'train', u'wagon_train']
hyponyms: []
hypernyms: [Synset('procession.n.02')]
root hypernym: [Synset('entity.n.01')] 

lemma: [u'train']
hyponyms: []
hypernyms: [Synset('consequence.n.02')]
root hypernym: [Synset('entity.n.01')] 



In [34]:
print_count = 0
for synset in corpus.wordnet.synsets('walk'):
    if print_count > 3:
        break
    print_count += 1
    print synset    
    print "part meronyms: ",  synset.part_meronyms()
    print "substance meronyms: ",  synset.substance_meronyms()
    print "part holonyms: ",  synset.part_holonyms()
    print "Entailments: ", synset.entailments(), "\n"

Synset('walk.n.01')
part meronyms:  [Synset('pace.n.04')]
substance meronyms:  []
part holonyms:  []
Entailments:  [] 

Synset('base_on_balls.n.01')
part meronyms:  []
substance meronyms:  []
part holonyms:  []
Entailments:  [] 

Synset('walk.n.03')
part meronyms:  []
substance meronyms:  []
part holonyms:  []
Entailments:  [] 

Synset('walk.n.04')
part meronyms:  []
substance meronyms:  []
part holonyms:  []
Entailments:  [] 



### Reading in text from a webpage

In [45]:
url = "http://www.gutenberg.org/files/2554/2554-0.txt"
# pull in text and convert it to utf-8. The .decode is unnecessary in Python 3.
raw = urlopen(url).read().decode('utf-8')
tokens = word_tokenize(raw)
text = Text(tokens)

This text is now in nltk text format. To get a list of all the words we can simply use a list comprehension.<br>
Something that's nice about the nltk text object is that we can use NLTK functions on the text. For example, we can look at <b>collocations</b>, words that occur together frequently.

In [52]:
text_list = [w for w in text]
print "text collocations:", text.collocations()

text collocations:Katerina Ivanovna; Pyotr Petrovitch; Pulcheria Alexandrovna; Avdotya
Romanovna; Rodion Romanovitch; Marfa Petrovna; Sofya Semyonovna;
Project Gutenberg-tm; old woman; Porfiry Petrovitch; great deal;
Amalia Ivanovna; don’t know; Nikodim Fomitch; young man; Andrey
Semyonovitch; Hay Market; Dmitri Prokofitch; Ilya Petrovitch; Katerina
Ivanovna’s
 None


### Regular Expressions

<b>^</b> denotes the beginning of a word. <br>
<b>.</b> is a wildcard character <br>
<b>$</b> denotes the end of a word.<br>
<b>?</b> means that the previous character is optional. <br>
<b>[abc]</b> means that the character in this space can be any of a, b, or c. <br>

In [57]:
wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]
# print words with j and t in the middle of length 8
print [w for w in wordlist if re.search('^..j..t..$', w)][:10]

[u'abjectly', u'adjuster', u'dejected', u'dejectly', u'injector', u'majestic', u'objectee', u'objector', u'rejecter', u'rejector']


In [18]:
print [w for w in wordlist if re.search('^int?e?r?vention$', w)]

[u'intervention', u'invention']


In [56]:
print [w for w in wordlist if re.search('^[abc][def][ghi][jkl]', w)][:10]

[u'adglutinate', u'afikomen', u'beglad', u'beglamour', u'beglare', u'beglerbeg', u'beglerbeglic', u'beglerbegluc', u'beglerbegship', u'beglerbey']


<b>+</b> means 1 or more instances of the item<br>
<b>\*</b> means 0 or more instances of an item<br>
+, * are refered to as <b>closures</b>

In [92]:
print [w for w in wordlist if re.search('^p+o+p*', w)][:10]

[u'po', u'poaceous', u'poach', u'poachable', u'poacher', u'poachiness', u'poachy', u'poalike', u'pob', u'pobby']


<b>^</b> inside a bracket matches to any character not inside the bracket.<br>
In the example below, we see words with no vowels in them.

In [54]:
print [w for w in wordlist if re.search('^[^aeiouAEIOU]+$', w)][:10]

[u'b', u'by', u'byth', u'c', u'cly', u'cry', u'crypt', u'cwm', u'cyp', u'cyst']


<b>|</b> is refered to as the disjunction. It is an OR operator. <br>
<b>{n}</b> specifies exactly n repeats. <br>
<b>{n,}</b> specifies at least n repeats. <br>
<b>{,n}</b> specifies at most n repeats.<br>
<b>{n,m}</b> specifies between n and m repeats <br>

In [60]:
# words with at least 3 non-vowels at the beginning
print [w for w in wordlist if re.search('^[^aeiouAEIOU]{3,}', w)][:10]

[u'blype', u'bryaceous', u'bryogenin', u'bryological', u'bryologist', u'bryology', u'bryonidin', u'bryonin', u'bryony', u'bryophyte']


In [62]:
# words with no more than 3 consonants at the beginning
print [w for w in wordlist if re.search('^[^aeiouAEIOU]{,3}', w)][:10]

[u'a', u'aa', u'aal', u'aalii', u'aam', u'aardvark', u'aardwolf', u'aba', u'abac', u'abaca']


In [63]:
# words with between 4 and 6 consonants at the beginning
print [w for w in wordlist if re.search('^[^aeiouAEIOU]{4,6}', w)][:10]

[u'blype', u'byplay', u'byrlaw', u'byrlawman', u'byrnie', u'byrrus', u'byrthynsak', u'bysmalith', u'byspell', u'byssaceous']


In [65]:
# words that end with 'sting' or 'sted'
# () tell us the scope of the operator
print [w for w in wordlist if re.search('(sting|sted)$', w)][:10]

[u'accosted', u'arresting', u'ballasting', u'basting', u'beforested', u'billposting', u'bilsted', u'blasted', u'blasting', u'bloodthirsting']


### Using Regular Expressions for Texts (as opposed to just words)

We can also use regular expressions across words. Here we can interact with an NLTK Text object to find instances of a regular expression. Because NLTK's built in findall function prints results instead of returning them (why?), I have defined my own function that returns the results instead.

In [77]:
def create_regex_token_match(regexp, nltk_text):
    '''
    NLTK's finall method prints the results instead of returning them.
    Since this is worthless for any real world application, I have 
    tweaked their source code to return instead of print the results
    by using this function instead of nltk.find_all.
    INPUT
        nltk_text: an NLTK text object
        regexp: a resular expression
    OUTPUT
        match_list: a list of tokens
    '''
    if "_token_searcher" not in nltk_text.__dict__:
            nltk_text._token_searcher = nltk.TokenSearcher(nltk_text)
    hits = nltk_text._token_searcher.findall(regexp)
    return [' '.join(h) for h in hits]

Let's look at words used to describe men and women in Moby Dick.

In [85]:
from nltk.corpus import gutenberg, nps_chat
moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))


man_descriptions = create_regex_token_match(r"(<.*>)<man>", moby)
woman_descriptions = create_regex_token_match(r"(<.*>)<woman>", moby)
print "Men Descriptions:", setlist(man_descriptions), "\n"
print "Women Descriptions:", setlist(woman_descriptions)

Men Descriptions: {[u'artificial', u'any', u'that', u'a', u'monied', u'nervous', u'old', u'decent', u'This', u'this', u'No', u'the', u'dangerous', u'white', u'one', u'That', u'every', u'worsted', u'faithful', u'Miserable', u'honest', u'is', u'fellow', u'no', u'first', u'elderly', u'!', u'young', u'Young', u'pious', u'our', u'impenitent', u',', u'queer', u'like', u'good', u'crazy', u'of', u'mature', u'earnest', u'steadfast', u'fearless', u'mighty', u'but', u'ruined', u'-', u'unfearing', u'Cape', u'sepulchral', u'other', u'sleeping', u'less', u'little', u'great', u'wise', u'better', u'by', u'are', u'from', u'handed', u'and', u'butterless', u'meditative', u'If', u'mortal', u'to', u'very', u'fiendish', u'for', u'Albino', u'pale', u'spiritual', u'manufactured', u'Every', u'dead', u'last', u'legs', u'maimed', u'best', u'each', u'monomaniac', u'Any', u'infatuated', u'your', u'their', u'Teneriffe', u'furious', u'half', u'experienced', u'baby', u'killed', u'brack', u'single', u'O', u'timid', u'

Let's now look at plural words related by "and" within the brown text.

In [91]:
from nltk.corpus import brown
hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))
related_hobbies = create_regex_token_match(r"<\w*s> <and> <\w*s>", hobbies_learned)
print [(val.split(" ")[0], val.split(" ")[-1]) for val in related_hobbies[:10]]

[(u'lats', u'serratus'), (u'judges', u'audiences'), (u'rhythms', u'infectious'), (u'swoops', u'slides'), (u'parents', u'guests'), (u'hands', u'legs'), (u'us', u'has'), (u'lakes', u'reservoirs'), (u'Designers', u'manufacturers'), (u'Terms', u'rates')]
