In [1]:
import re
from tools import *
%matplotlib inline

# Chap 3 处理原始文本
1.  如何访问文件内的文本？
2.  如何将文档分割成单独的单词和标点符号，从而进行文本语料上的分析？
3.  如何产生格式化的输出，并把结果保存在文件中？

## 3.5. 正则表达式的有益应用

### 3.5.1 提取字符块

In [2]:
# Extract every single vowel from one word, in order of appearance.
show_subtitle("P109 提取元音字符块")
word = 'supercalifragilisticexpialidocious'
# The character class [aeiou] matches exactly one lowercase vowel per hit.
word_pieces = re.findall(r'[aeiou]', word)
print(word_pieces, "长度= ", len(word_pieces))

--------------- >P109 提取元音字符块< ---------------
['u', 'e', 'a', 'i', 'a', 'i', 'i', 'i', 'e', 'i', 'a', 'i', 'o', 'i', 'o', 'u'] 长度=  16


In [3]:
# Collect every run of two or more consecutive lowercase vowels found in the
# sorted, de-duplicated Treebank word list.
show_subtitle("P109 提取两个元音字符块")
wsj = sorted(set(nltk.corpus.treebank.words()))
word_pieces_list = [
    vowel_run
    for token in wsj
    for vowel_run in re.findall(r'[aeiou]{2,}', token)
]
print(word_pieces_list[:13])

--------------- >P109 提取两个元音字符块< ---------------
['ea', 'oi', 'ea', 'ou', 'oi', 'ea', 'ea', 'oi', 'oi', 'ea', 'io', 'ea', 'ea']


In [4]:
# Count how often each vowel sequence occurs and display the 12 most frequent.
show_subtitle("统计双元音字符块的数目")
fd = nltk.FreqDist(word_pieces_list)
fd.most_common(12)

--------------- >统计双元音字符块的数目< ---------------


[('io', 549),
 ('ea', 476),
 ('ie', 331),
 ('ou', 329),
 ('ai', 261),
 ('ia', 253),
 ('ee', 217),
 ('oo', 174),
 ('ua', 109),
 ('au', 106),
 ('ue', 105),
 ('ui', 95)]

In [5]:
# Pull the digit runs out of a date string and convert each one to an int.
show_subtitle("提取日期格式中的整数值")
numbers_list = list(map(int, re.findall(r'[0-9]+', '2009-12-31')))
print(numbers_list)

--------------- >提取日期格式中的整数值< ---------------
[2009, 12, 31]


### 3.5.2 在字符块上做更多的事情

In [6]:
# Use findall() for a more involved task: dropping word-internal vowels.
show_subtitle("P110 忽略掉单词内部的元音")
# Pattern 1 keeps a vowel run at the very start of the word, a vowel run at
# the very end, and every non-vowel character; only inner vowels are dropped.
regexp1 = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'
# Pattern 2 keeps only non-vowel characters, i.e. it removes every vowel.
regexp2 = r'[^AEIOUaeiou]'


def compress(word, regexp):
    """Return ``word`` reduced to the pieces that ``regexp`` matches.

    Every non-overlapping match reported by ``re.findall`` is kept, in order,
    and the matches are concatenated into a single string.
    """
    return ''.join(re.findall(regexp, word))


english_udhr = nltk.corpus.udhr.words('English-Latin1')

# Compress every word with pattern 1 (leading/trailing vowels are kept).
english_tmp1 = [compress(token, regexp1) for token in english_udhr]
print("english_tmp1= ", english_tmp1[:13])
print("len(english_tmp1)= ", len(english_tmp1))

# Compress every word with pattern 2 (all vowels are removed).
english_tmp2 = [compress(token, regexp2) for token in english_udhr]
print("english_tmp2= ", english_tmp2[:13])
print("len(english_tmp2)= ", len(english_tmp2))

# The wrapped text is built from the compressed list english_tmp1 (not from
# the raw english_udhr words), so the label names english_tmp1.
print("english_tmp1[:75]= ", nltk.tokenwrap(english_tmp1[:75]))

--------------- >P110 忽略掉单词内部的元音< ---------------
english_tmp1=  ['Unvrsl', 'Dclrtn', 'of', 'Hmn', 'Rghts', 'Prmble', 'Whrs', 'rcgntn', 'of', 'the', 'inhrnt', 'dgnty', 'and']
len(english_tmp1)=  1781
english_tmp2=  ['nvrsl', 'Dclrtn', 'f', 'Hmn', 'Rghts', 'Prmbl', 'Whrs', 'rcgntn', 'f', 'th', 'nhrnt', 'dgnty', 'nd']
len(english_tmp2)=  1781
english_udhr[:75]=  Unvrsl Dclrtn of Hmn Rghts Prmble Whrs rcgntn of the inhrnt dgnty and
of the eql and inlnble rghts of all mmbrs of the hmn fmly is the fndtn
of frdm , jstce and pce in the wrld , Whrs dsrgrd and cntmpt fr hmn
rghts hve rsltd in brbrs acts whch hve outrgd the cnscnce of mnknd ,
and the advnt of a wrld in whch hmn bngs shll enjy frdm of spch and


In [7]:
# Tabulate how often each consonant-vowel pair occurs in the Rotokas
# dictionary words.
show_subtitle("P111 提取辅音-元音序列对，并且统计单词库中这样的序列对的数目")
rotokas_words = nltk.corpus.toolbox.words('rotokas.dic')
cvs = [
    pair
    for entry in rotokas_words
    for pair in re.findall(r'[ptksvr][aeiou]', entry)
]
# Each two-character string is consumed as a (condition, sample) pair, so the
# table gets one row per consonant and one column per vowel.
cfd = nltk.ConditionalFreqDist(cvs)
cfd.tabulate()

--------------- >P111 提取辅音-元音序列对，并且统计单词库中这样的序列对的数目< ---------------
    a   e   i   o   u 
k 418 148  94 420 173 
p  83  31 105  34  51 
r 187  63  84  89  79 
s   0   0 100   2   1 
t  47   8   0 148  37 
v  93  27 105  48  49 


In [8]:
# Build an index from each consonant-vowel pair to the words containing it.
show_subtitle("定义「辅音-元音序列对」所对应的单词集合")
cv_word_pairs = [
    (pair, entry)
    for entry in rotokas_words
    for pair in re.findall(r'[ptksvr][aeiou]', entry)
]
cv_index = nltk.Index(cv_word_pairs)
print("cv_index['su']= ", cv_index['su'])
print("cv_index['po']= ", cv_index['po'])

--------------- >定义「辅音-元音序列对」所对应的单词集合< ---------------
cv_index['su']=  ['kasuari']
cv_index['po']=  ['kaapo', 'kaapopato', 'kaipori', 'kaiporipie', 'kaiporivira', 'kapo', 'kapoa', 'kapokao', 'kapokapo', 'kapokapo', 'kapokapoa', 'kapokapoa', 'kapokapora', 'kapokapora', 'kapokaporo', 'kapokaporo', 'kapokari', 'kapokarito', 'kapokoa', 'kapoo', 'kapooto', 'kapoovira', 'kapopaa', 'kaporo', 'kaporo', 'kaporopa', 'kaporoto', 'kapoto', 'karokaropo', 'karopo', 'kepo', 'kepoi', 'keposi', 'kepoto']


### 3.5.3 查找词干

In [9]:
# Only the suffix comes back: with one capturing group, findall returns the
# group's text (the suffix), not the stem.
regexp = r'^.*(ing|ly|ed|ious|ies|ive|es|s|ment)$'
print(re.findall(regexp, 'processing'))

['ing']


In [10]:
# With a non-capturing group "(?:...)" findall returns the whole match, i.e.
# the entire word that ends in a listed suffix; the stem is still not isolated.
regexp = r'^.*(?:ing|ly|ed|ious|ies|ive|es|s|ment)$'
print(re.findall(regexp, 'processing'))  # a word ending in a listed suffix is returned
print(re.findall(regexp, 'processooo'))  # no listed suffix, so nothing is returned

['processing']
[]


In [11]:
# Split the word into stem and suffix using two capturing groups.
regexp = r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$'
print(re.findall(regexp, 'processing'))
print(re.findall(regexp, 'processes'))  # greedy .* grabs too much, so the split is wrong

[('process', 'ing')]
[('processe', 's')]


In [12]:
# Make the stem group non-greedy (.*?) so the longest possible suffix wins.
regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)$'
print(re.findall(regexp, 'processes'))
print(re.findall(regexp, 'process'))  # still over-stems; lexical knowledge is needed to leave such words alone
print(re.findall(regexp, 'language'))  # a word without any listed suffix does not match at all

[('process', 'es')]
[('proces', 's')]
[]


In [13]:
# Make the suffix group optional so that words without a suffix still match.
regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
print(re.findall(regexp, 'language'))

[('language', '')]


In [14]:
# A regex-based stemmer: tokenize the raw text first, then stem each token.
def stem(word):
    """Strip one common English suffix from ``word`` and return the remainder.

    The stem part is matched non-greedily, so the earliest split that leaves
    one of the listed suffixes at the end of the word is used. Words without
    a listed suffix are returned unchanged.

    Args:
        word: The token to stem.

    Returns:
        The token without its suffix. If the pattern cannot match at all
        (for example because the text contains an embedded newline), the
        input is returned as is instead of raising ``IndexError``.
    """
    regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    match = re.match(regexp, word)
    if match is None:
        # '.' does not cross newlines, so multi-line input cannot match.
        return word
    # Group 1 is the stem; group 2 (the suffix) is intentionally discarded.
    return match.group(1)

In [15]:
# Sample passage used by the stemming and re_show demonstrations.
raw = """DENNIS: Listen,
strange women lying in ponds distributing swords
is no basis for a system of government.
Supreme executive power derives from a mandate from the masses, 
not from some farcical aquatic ceremony."""

# Tokenize the passage, then run the regex stemmer over every token.
tokens = nltk.word_tokenize(raw)
stem_list = list(map(stem, tokens))
print(stem_list)

['DENNIS', ':', 'Listen', ',', 'strange', 'women', 'ly', 'in', 'pond', 'distribut', 'sword', 'i', 'no', 'basi', 'for', 'a', 'system', 'of', 'govern', '.', 'Supreme', 'execut', 'power', 'deriv', 'from', 'a', 'mandate', 'from', 'the', 'mass', ',', 'not', 'from', 'some', 'farcical', 'aquatic', 'ceremony', '.']


In [16]:
# nltk.re_show() prints the text with every regex match wrapped in braces.
# The suffix alternatives must be written as a group: square brackets would
# instead form a character class that matches a single character out of the
# listed letters (and a literal '|').
regexp = r'(?:ing|ly|ed|ious|ies|ive|es|s|ment)$'
nltk.re_show(regexp, raw)

DENNIS: Listen,
strange women lying in ponds distributing sword{s}
is no basis for a system of government.
Supreme executive power derives from a mandate from the masses, 
not from some farcical aquatic ceremony.


In [17]:
# A parenthesized group matches the literal sequence "ing".
regexp = r'(ing)'
nltk.re_show(regexp, raw)

DENNIS: Listen,
strange women ly{ing} in ponds distribut{ing} swords
is no basis for a system of government.
Supreme executive power derives from a mandate from the masses, 
not from some farcical aquatic ceremony.


In [18]:
# Square brackets form a character class: any single 'i', 'n' or 'g' matches.
regexp = r'[ing]'
nltk.re_show(regexp, raw)

DENNIS: L{i}ste{n},
stra{n}{g}e wome{n} ly{i}{n}{g} {i}{n} po{n}ds d{i}str{i}but{i}{n}{g} swords
{i}s {n}o bas{i}s for a system of {g}over{n}me{n}t.
Supreme execut{i}ve power der{i}ves from a ma{n}date from the masses, 
{n}ot from some farc{i}cal aquat{i}c ceremo{n}y.


In [19]:
# A plain literal matches the sequence "ing", same as the grouped form.
regexp = r'ing'
nltk.re_show(regexp, raw)

DENNIS: Listen,
strange women ly{ing} in ponds distribut{ing} swords
is no basis for a system of government.
Supreme executive power derives from a mandate from the masses, 
not from some farcical aquatic ceremony.


In [20]:
# Note: inside [] the '|' is a literal character, so this class also accepts
# a leading '|' in addition to D, s, i, S and n.
regexp = '^[D|s|i|S|n]'
nltk.re_show(regexp, raw)  # '^' anchors the match at the start of a line

{D}ENNIS: Listen,
{s}trange women lying in ponds distributing swords
{i}s no basis for a system of government.
{S}upreme executive power derives from a mandate from the masses, 
{n}ot from some farcical aquatic ceremony.


In [21]:
# A character class already means "any one of these characters"; no '|' is
# needed. Writing '|' inside [] would add a literal '|' to the class rather
# than act as alternation.
regexp = '^[DsiSn]'
nltk.re_show(regexp, raw)  # matches a line that starts with D, s, i, S or n

{D}ENNIS: Listen,
{s}trange women lying in ponds distributing swords
{i}s no basis for a system of government.
{S}upreme executive power derives from a mandate from the masses, 
{n}ot from some farcical aquatic ceremony.


In [22]:
# Inside [] both '.' and '|' are literal characters, so this class accepts
# 's', '.', ',' and also '|'.
regexp = '[s|.|,]$'
nltk.re_show(regexp, raw)  # '$' anchors the match at the end of a line

DENNIS: Listen{,}
strange women lying in ponds distributing sword{s}
is no basis for a system of government{.}
Supreme executive power derives from a mandate from the masses, 
not from some farcical aquatic ceremony{.}


In [23]:
regexp = 'ing|tive'
nltk.re_show(regexp, raw)  # outside of [], '|' is alternation between whole strings

DENNIS: Listen,
strange women ly{ing} in ponds distribut{ing} swords
is no basis for a system of government.
Supreme execu{tive} power derives from a mandate from the masses, 
not from some farcical aquatic ceremony.


In [24]:
regexp = '(s){1,2}'
nltk.re_show(regexp, raw)  # '{m,n}' sets how often the preceding group may repeat (here one or two 's')

DENNIS: Li{s}ten,
{s}trange women lying in pond{s} di{s}tributing {s}word{s}
i{s} no ba{s}i{s} for a {s}y{s}tem of government.
Supreme executive power derive{s} from a mandate from the ma{ss}e{s}, 
not from {s}ome farcical aquatic ceremony.


### 3.5.4 搜索已经分词的文本

In [25]:
# P114: search an already-tokenized text (nltk.Text) with findall().
from nltk.corpus import gutenberg, nps_chat

moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
# Rebinds `tokens` to the token list of the Moby Dick text.
tokens = moby.tokens
print(tokens[:13])

['[', 'Moby', 'Dick', 'by', 'Herman', 'Melville', '1851', ']', 'ETYMOLOGY', '.', '(', 'Supplied', 'by']


In [26]:
# "(?:...)" is a non-capturing group: it groups the pattern without creating
# a capture. It wraps the whole pattern here, so the complete three-token
# phrase "a <word> man" is what gets printed.
regexp = r"(?:<a> <.*> <man>)"
moby.findall(regexp)

a monied man; a nervous man; a dangerous man; a white man; a white
man; a white man; a pious man; a queer man; a good man; a mature man;
a white man; a Cape man; a great man; a wise man; a wise man; a
butterless man; a white man; a fiendish man; a pale man; a furious
man; a better man; a certain man; a complete man; a dismasted man; a
younger man; a brave man; a brave man; a brave man; a brave man


In [27]:
# A capturing group around the whole pattern: the captured text is the full
# three-token phrase, so that phrase is what gets printed.
regexp = r"(<a> <.*> <man>)"
moby.findall(regexp)

a monied man; a nervous man; a dangerous man; a white man; a white
man; a white man; a pious man; a queer man; a good man; a mature man;
a white man; a Cape man; a great man; a wise man; a wise man; a
butterless man; a white man; a fiendish man; a pale man; a furious
man; a better man; a certain man; a complete man; a dismasted man; a
younger man; a brave man; a brave man; a brave man; a brave man


In [28]:
# Capture only the middle token of "a <word> man"; just that token is printed.
regexp = r"<a>(<.*>)<man>"
moby.findall(regexp)

monied; nervous; dangerous; white; white; white; pious; queer; good;
mature; white; Cape; great; wise; wise; butterless; white; fiendish;
pale; furious; better; certain; complete; dismasted; younger; brave;
brave; brave; brave


In [29]:
# Highlight every occurrence of "ly", "ed" or "ing" in the first 75 tokens,
# joined into one space-separated string.
regexp = 'ly|ed|ing'
nltk.re_show(regexp, ' '.join(tokens[:75]))

[ Moby Dick by Herman Melville 1851 ] ETYMOLOGY . ( Suppli{ed} by a Late Consumptive Usher to a Grammar School ) The pale Usher -- threadbare in coat , heart , body , and brain ; I see him now . He was ever dust{ing} his old lexicons and grammars , with a queer handkerchief , mock{ing}{ly} embellish{ed} with all the gay flags of all the known nations of the world . He lov{ed}


In [30]:
# Highlight "see <lowercase word> now" in the first 200 tokens.
regexp = 'see [a-z]+ now'
nltk.re_show(regexp, ' '.join(tokens[:200]))

[ Moby Dick by Herman Melville 1851 ] ETYMOLOGY . ( Supplied by a Late Consumptive Usher to a Grammar School ) The pale Usher -- threadbare in coat , heart , body , and brain ; I {see him now} . He was ever dusting his old lexicons and grammars , with a queer handkerchief , mockingly embellished with all the gay flags of all the known nations of the world . He loved to dust his old grammars ; it somehow mildly reminded him of his mortality . " While you take in hand to school others , and to teach them by what name a whale - fish is to be called in our tongue leaving out , through ignorance , the letter H , which almost alone maketh the signification of the word , you deliver that which is not true ." -- HACKLUYT " WHALE . ... Sw . and Dan . HVAL . This animal is named from roundness or rolling ; for in Dan . HVALT is arched or vaulted ." -- WEBSTER ' S DICTIONARY " WHALE . ... It is more immediately from the Dut . and Ger . WALLEN ;


In [31]:
# Wrap the NPS chat corpus words in an nltk.Text and rebind `tokens` to it.
chat = nltk.Text(nps_chat.words())
tokens = chat.tokens
print(tokens[:13])

['now', 'im', 'left', 'with', 'this', 'gay', 'name', ':P', 'PART', 'hey', 'everyone', 'ah', 'well']


In [32]:
# Any two tokens followed by the token "bro".
regexp = r"<.*><.*><bro>"
chat.findall(regexp)

you rule bro; telling you bro; u twizted bro


In [33]:
# Three or more consecutive tokens that each start with "l".
regexp = r"<l.*>{3,}"
chat.findall(regexp)

lol lol lol; lmao lol lol; lol lol lol; la la la la la; la la la; la
la la; lovely lol lol love; lol lol lol.; la la la; la la la


In [34]:
# Greedy '.+' runs from the first 'l' to the end of the line; the joined
# tokens form a single line, so everything after that 'l' is highlighted.
regexp = 'l.+'
nltk.re_show(regexp, ' '.join(tokens[:200]))

now im {left with this gay name :P PART hey everyone ah well NICK : U7 U7 is a gay name . . ACTION gives U121 a golf clap . :) JOIN hi U59 26 / m / ky women that are nice please pm me JOIN PART there ya go U7 do n't golf clap me . fuck you U121 :@ whats everyone up to ? PART PART i 'll thunder clap your ass . PART and i dont even know what that means . that sounds painful any ladis wanna chat ? 29 m 26 / m JOIN my cousin drew a messed up pic on my cast PART 24 / m boo . 26 / m and sexy lol U115 boo . JOIN PART he drew a girl with legs spread boo . hope he didnt draw a penis PART ewwwww lol & a head between her legs JOIN JOIN sounds good to me . r u serious JOIN PART I 'll take one , please . & i have to go to the docs tomorrow ya man I am too .. Connected to ... Slip away ... Fade away ... Days away I ... Still feel}


In [35]:
# Same greedy behaviour starting at the first 'h' of tokens 200-399.
regexp = 'h.+'
nltk.re_show(regexp, ' '.join(tokens[200:400]))

you ... Touc{hing me ... Changing me ... Considerably killing me ... heeeey ! do n't you have a sharpie ? 26 / m you 're back U115 U129 yep U115 Any ladies wanna chat with 24 / m hurry ladies PART JOIN JOIN not fast enough U116 a bowl i got a blunt an a bong ...... lol JOIN well , glad it worked out my chair is too hard . Anyone from Tennessee in here ? hey ladies as am i is U68 back yet PART hey PART JOIN U121 is missing a B in her name and i do n't complain about things being hard very often . ok yes U30 fire it up Any women from Nashville in here ? JOIN PART and an an " itch " JOIN yo , U133 or a " ogan " are you a male ? JOIN JOIN show will let 's talk . PART :) haha brb opps JOIN PART sho * . ACTION keeps U115 s place nice and warm . hey any guys with cams wanna play ? . ACTION sits on U68 's lap . JOIN JOIN any guyz wanna chat hi there boo , it}


In [36]:
# Regular-expression test interface shipped with NLTK.
# NOTE(review): presumably opens a GUI window and may block "Run All" until
# it is closed — confirm before running the notebook unattended.
nltk.app.nemo()

In [37]:
from nltk.corpus import brown

# Build a Text from the Brown corpus words of the 'hobbies' and 'learned' categories.
hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))
print(hobbies_learned[:13])

['Too', 'often', 'a', 'beginning', 'bodybuilder', 'has', 'to', 'do', 'his', 'training', 'secretly', 'either', 'because']


In [38]:
# Four-token phrases of the form "x and other ys" (last token ends in "s").
regexp = r"<\w*> <and> <other> <\w*s>"
hobbies_learned.findall(regexp)

speed and other activities; water and other liquids; tomb and other
landmarks; Statues and other monuments; pearls and other jewels;
charts and other items; roads and other features; figures and other
objects; military and other areas; demands and other factors;
abstracts and other compilations; iron and other metals


In [39]:
# Four-token phrases of the form "as x as y".
regexp=r"<as><\w*><as><\w*>"
hobbies_learned.findall(regexp)

as accurately as possible; as well as the; as faithfully as possible;
as much as what; as neat as a; as simple as you; as well as other; as
well as other; as involved as determining; as well as other; as
important as another; as accurately as possible; as accurate as any;
as much as any; as different as a; as Orphic as that; as coppery as
Delawares; as good as another; as large as small; as well as ease; as
well as their; as well as possible; as straight as possible; as well
as nailed; as smoothly as the; as soon as a; as well as injuries; as
well as many; as well as reason; as well as in; as well as of; as well
as a; as well as summer; as well as providing; as important as
cooling; as evenly as it; as much as shading; as well as some; as well
as subsoil; as high as possible; as well as many; as general as
electrical; as long as the; as well as the; as much as was; as well as
set; as well as by; as high as 15; as well as aid; as much as
possible; as well as personalities; as low as a; 