## Extracting Word Pieces

In [2]:
# Import the standard-library regex module directly instead of reaching it
# through the nltk package namespace, where it is not part of the public API.
import re

word = 'supercalifragilisticexpialidocious'
# Every lowercase vowel in the word, in order of appearance
re.findall(r'[aeiou]', word)

['u',
 'e',
 'a',
 'i',
 'a',
 'i',
 'i',
 'i',
 'e',
 'i',
 'a',
 'i',
 'o',
 'i',
 'o',
 'u']

In [3]:
len(re.findall(r'[aeiou]', word))

16

In [5]:
import nltk

# Unique word types from the NLTK treebank corpus, in sorted order
wsj = sorted(set(nltk.corpus.treebank.words()))
# Tally every run of two or more lowercase vowels found inside those words
fd = nltk.FreqDist(
    vowel_run
    for entry in wsj
    for vowel_run in re.findall(r'[aeiou]{2,}', entry)
)

In [6]:
fd.most_common(12)

[('io', 549),
 ('ea', 476),
 ('ie', 331),
 ('ou', 329),
 ('ai', 261),
 ('ia', 253),
 ('ee', 217),
 ('oo', 174),
 ('ua', 109),
 ('au', 106),
 ('ue', 105),
 ('ui', 95)]

In [14]:
[int(n) for n in re.findall('[0-9]{2,4}', '2009-12-31')]

[2009, 12, 31]

## Doing More with Word Pieces

In [41]:
# Three alternatives, tried left to right at each position:
#   ^[AEIOUaeiou]+   a run of vowels at the very start of the word
#   [AEIOUaeiou]+$   a run of vowels at the very end of the word
#   [^AEIOUaeiou]    any single character that is not a vowel
regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'


def compress(word):
    """Return ``word`` with its word-internal vowels dropped.

    Leading and trailing vowel runs are kept, as is every non-vowel
    character; the surviving pieces are concatenated in their original order.
    """
    return ''.join(re.findall(regexp, word))

In [42]:
compress('insights')

'insghts'

In [43]:
compress('inalienable')

'inlnble'

In [47]:
english_udhr = nltk.corpus.udhr.words('English-Latin1')
print(nltk.tokenwrap(compress(w) for w in english_udhr[:75]))

Unvrsl Dclrtn of Hmn Rghts Prmble Whrs rcgntn of the inhrnt dgnty and
of the eql and inlnble rghts of all mmbrs of the hmn fmly is the fndtn
of frdm , jstce and pce in the wrld , Whrs dsrgrd and cntmpt fr hmn
rghts hve rsltd in brbrs acts whch hve outrgd the cnscnce of mnknd ,
and the advnt of a wrld in whch hmn bngs shll enjy frdm of spch and


In [48]:
rotokas_words = nltk.corpus.toolbox.words('rotokas.dic')
# Collect every consonant+vowel pair (consonants limited to p, t, k, s, v, r)
cvs = [
    pair
    for entry in rotokas_words
    for pair in re.findall(r'[ptksvr][aeiou]', entry)
]
# Each two-character string unpacks as (condition, sample) = (consonant, vowel)
cfd = nltk.ConditionalFreqDist(cvs)
cfd.tabulate()

    a   e   i   o   u 
k 418 148  94 420 173 
p  83  31 105  34  51 
r 187  63  84  89  79 
s   0   0 100   2   1 
t  47   8   0 148  37 
v  93  27 105  48  49 


In [51]:
# Read the whole text file; the context manager closes the handle after reading
# NOTE(review): no encoding is passed, so the platform default is used — confirm it matches the file
with open('../Chapter 1/text.txt') as f:
    raw = f.read()
# NOTE(review): word_tokenize is not imported in any visible cell — confirm an earlier cell provides it
tokens = word_tokenize(raw)
# Count every run of two or more lowercase vowels found inside the tokens
fd = nltk.FreqDist(vs for word in tokens
                      for vs in re.findall(r'[aeiou]{2,}', word))
fd.most_common(12)

[('ue', 21),
 ('ia', 20),
 ('ie', 20),
 ('io', 14),
 ('ua', 7),
 ('uie', 4),
 ('eo', 2),
 ('ea', 2),
 ('ui', 1),
 ('ao', 1),
 ('iu', 1),
 ('au', 1)]

In [54]:
fd = nltk.FreqDist(vs for word in tokens
                      for vs in re.findall(r'[^aeiou][aeiou]+', word))
fd.most_common(12)

[('de', 50),
 ('la', 39),
 ('re', 31),
 ('co', 30),
 ('ra', 30),
 ('ta', 29),
 ('po', 25),
 ('se', 25),
 ('te', 23),
 ('no', 20),
 ('do', 19),
 ('ro', 19)]

In [61]:
fd = nltk.FreqDist(vs for word in tokens
                      for vs in re.findall(r'[^aeiou][aeiou]{2,3}', word))
fd.most_common(12)

[('que', 17),
 ('cia', 13),
 ('cie', 7),
 ('rio', 5),
 ('cio', 4),
 ('quie', 4),
 ('cua', 3),
 ('bie', 3),
 ('pue', 2),
 ('ria', 2),
 ('mie', 2),
 ('bia', 2)]

In [74]:
fd = nltk.FreqDist(vs for word in tokens
                      for vs in re.findall(r'[^aeiou][aeiou]{1,2}', word))
fd.most_common(20)

[('de', 50),
 ('la', 39),
 ('re', 31),
 ('co', 30),
 ('ra', 30),
 ('ta', 29),
 ('po', 25),
 ('se', 25),
 ('te', 23),
 ('no', 20),
 ('do', 19),
 ('ro', 19),
 ('so', 19),
 ('lo', 18),
 ('ca', 18),
 ('ti', 18),
 ('to', 17),
 ('que', 17),
 ('le', 16),
 ('li', 16)]

In [84]:
# Pair each chunk (one non-vowel character followed by one or more lowercase
# vowels) with the token it was found in
cv_word_pairs = [
    (chunk, token)
    for token in tokens
    for chunk in re.findall(r'[^aeiou][aeiou]+', token)
]
len(cv_word_pairs)

895

In [85]:
cv_index = nltk.Index(cv_word_pairs)

In [86]:
len(cv_index)

138

In [87]:
print(cv_index)

defaultdict(<class 'list'>, {'La': ['La', 'La', 'La'], 'ma': ['marcha', 'paradigma', 'paradigmas', 'manera', 'problema', 'paradigma', 'encima', 'Humanos'], 'ha': ['marcha', 'ha', 'chicha”', 'hacen', 'ha', 'hace', 'ha', 'hacer'], 'Co': ['ConMisHijosNoTeMetas', 'Colombia', 'Colombia', 'Como'], 'Mi': ['ConMisHijosNoTeMetas'], 'Hi': ['ConMisHijosNoTeMetas'], 'jo': ['ConMisHijosNoTeMetas'], 'No': ['ConMisHijosNoTeMetas', 'No'], 'Te': ['ConMisHijosNoTeMetas'], 'Me': ['ConMisHijosNoTeMetas'], 'ta': ['ConMisHijosNoTeMetas', 'importantes', 'respuestas', 'igualitaria', 'detenta', 'Periodistas', 'directamente', 'ataca', 'elementales', 'enlistan', 'disputa', 'enfrentado', 'confrontación', 'estas', 'estatus', 'estalla', 'estamos', 'disputa', 'establecido', 'intenta', 'antiestablishment', 'insulta', 'vista', 'estalla', 'respuesta', 'protestas', 'ambientales', 'asalta', 'también'], 'si': ['sido', 'posiciones', 'posiciones', 'situación', 'sino', 'división', 'posición', 'ofensiva', 'siniestra', 'signif

In [88]:
cv_index['su']

['sus',
 'asuntos',
 'su',
 'superpone',
 'presupone',
 '“superior”',
 '“superioridad',
 'superior',
 'insulta',
 'surgen',
 'sus']

In [89]:
cv_word_pairs[0]

('La', 'La')

In [90]:
cv_word_pairs[1]

('ma', 'marcha')

In [91]:
cv_index['cia']

['iniciativas',
 'apreciado',
 'asociados',
 'eminencia',
 'asociados',
 'social',
 'social',
 'consecuencia',
 'convivencia',
 'social',
 'social',
 'especialmente',
 'Justicia']

## Finding Word Stems

In [92]:
def stem(word):
    """Strip the first matching suffix from ``word``.

    Suffixes are tried in the listed order and only the first hit is removed,
    so 'processes' loses 'es' rather than just 's'.  A word that ends in none
    of the suffixes is returned unchanged.
    """
    for suffix in ['ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment']:
        if word.endswith(suffix):
            # Slice the suffix off the end instead of returning the word as-is
            return word[:-len(suffix)]
    # No suffix matched: hand the word back rather than falling through to None
    return word

In [93]:
re.findall(r'^.*(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')

['ing']

In [94]:
re.findall(r'^.*(?:ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')

['processing']

In [95]:
re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')

[('process', 'ing')]

In [96]:
re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes')

[('processe', 's')]

In [97]:
re.findall(r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes')

[('process', 'es')]

In [98]:
re.findall(r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$', 'language')

[('language', '')]

In [99]:
def stem(word):
    """Return ``word`` minus one trailing suffix, located with a regular expression.

    The lazy ``(.*?)`` group takes as few characters as possible, so the
    optional suffix group claims as much of the ending as it can; when no
    suffix is present the whole word lands in the first group.
    """
    suffix_pattern = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    # For an ordinary single-line token the anchored pattern matches once,
    # yielding one (stem, suffix) tuple; only the stem part is wanted.
    first_match = re.findall(suffix_pattern, word)[0]
    return first_match[0]

In [100]:
raw = """DENNIS: Listen, strange women lying in ponds distributing swords
is no basis for a system of government.  Supreme executive power derives from
a mandate from the masses, not from some farcical aquatic ceremony."""

In [101]:
tokens = word_tokenize(raw)

In [103]:
stems = [stem(t) for t in tokens]
print(stems)

['DENNIS', ':', 'Listen', ',', 'strange', 'women', 'ly', 'in', 'pond', 'distribut', 'sword', 'i', 'no', 'basi', 'for', 'a', 'system', 'of', 'govern', '.', 'Supreme', 'execut', 'power', 'deriv', 'from', 'a', 'mandate', 'from', 'the', 'mass', ',', 'not', 'from', 'some', 'farcical', 'aquatic', 'ceremony', '.']


## Searching Tokenized Text

In [104]:
from nltk.corpus import gutenberg, nps_chat
moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
moby.findall(r"<a> (<.*>) <man>")

monied; nervous; dangerous; white; white; white; pious; queer; good;
mature; white; Cape; great; wise; wise; butterless; white; fiendish;
pale; furious; better; certain; complete; dismasted; younger; brave;
brave; brave; brave


In [117]:
chat = nltk.Text(nps_chat.words())
# three-word phrases ending with the word bro
chat.findall(r"<.*> <.*> <bro>")

you rule bro; telling you bro; u twizted bro


In [116]:
chat.findall(r"<l.*>{3,}")
# sequences of three or more words starting with the letter l 

lol lol lol; lmao lol lol; lol lol lol; la la la la la; la la la; la
la la; lovely lol lol love; lol lol lol.; la la la; la la la


In [135]:
raw = "unit unic unsystem system singer ingeneer writing"
tokens = word_tokenize(raw)

def markw(string):
    """Print ``string`` through ``nltk.re_show`` using the suffix pattern.

    A word that ends in one of the listed suffixes matches as a whole and is
    printed wrapped in braces; any other word is printed unchanged.  Nothing
    is returned.
    """
    suffix_pattern = r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$'
    nltk.re_show(suffix_pattern, string)

[markw(w) for w in tokens]

unit
unic
unsystem
system
singer
ingeneer
{writing}


[None, None, None, None, None, None, None]

In [None]:
'''
import matplotlib
matplotlib.use("TkAgg")
import nltk
nltk.app.nemo()
'''

In [2]:
import nltk
from nltk.corpus import brown
hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))
hobbies_learned.findall(r"<\w*> <and> <other> <\w*s>")
#discover hypernyms

speed and other activities; water and other liquids; tomb and other
landmarks; Statues and other monuments; pearls and other jewels;
charts and other items; roads and other features; figures and other
objects; military and other areas; demands and other factors;
abstracts and other compilations; iron and other metals


In [4]:
from nltk.corpus import brown
hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))
hobbies_learned.findall(r"<as> <\w*> <as> <\w*>")

as accurately as possible; as well as the; as faithfully as possible;
as much as what; as neat as a; as simple as you; as well as other; as
well as other; as involved as determining; as well as other; as
important as another; as accurately as possible; as accurate as any;
as much as any; as different as a; as Orphic as that; as coppery as
Delawares; as good as another; as large as small; as well as ease; as
well as their; as well as possible; as straight as possible; as well
as nailed; as smoothly as the; as soon as a; as well as injuries; as
well as many; as well as reason; as well as in; as well as of; as well
as a; as well as summer; as well as providing; as important as
cooling; as evenly as it; as much as shading; as well as some; as well
as subsoil; as high as possible; as well as many; as general as
electrical; as long as the; as well as the; as much as was; as well as
set; as well as by; as high as 15; as well as aid; as much as
possible; as well as personalities; as low as a; 