In [1]:
import nltk
from nltk import word_tokenize
from nltk.corpus import brown
from tools import show_subtitle
%matplotlib inline

# Ch5 分类和标注词汇

1.  什么是词汇分类，在自然语言处理中它们如何使用？
2.  对于存储词汇和它们的分类来说什么是好的 Python 数据结构？
3.  如何自动标注文本中每个词汇的词类？

-   词性标注（parts-of-speech tagging，POS tagging）：简称标注。将词汇按照它们的词性（parts-of-speech，POS）进行分类并对它们进行标注
-   词性：也称为词类或者词汇范畴。
-   标记集：用于特定任务标记的集合。

## 5.3 使用Python字典映射词及其属性(P206)
Python字典数据类型（以称为关联数组或者哈希数组），学习如何使用字典表示包括词性在内的各种不同语言信息

### 5.3.1 索引链表 与 字典 的区别

图5-2：链表查找：在整数索引的基础上，访问 Python 链表的内容

图5-3：字典查询：使用一个关键字，访问一个字典的条目

表5-4：语言学对象从键到值的映射

### 5.3.2. Python字典

In [2]:
pos = {}
pos['colorless'] = 'ADJ'
pos['ideas'] = 'N'
pos['sleep'] = 'V'
pos['furiously'] = 'ADV'
print("pos['colorless']= ", pos['colorless'])
print("pos= ", pos)

pos['colorless']=  ADJ
pos=  {'colorless': 'ADJ', 'ideas': 'N', 'sleep': 'V', 'furiously': 'ADV'}


In [3]:
# 访问不存在的键，报 KeyError
# pos['green']

In [4]:
# 字典转换成链表
print("list(pos)= ", list(pos))

list(pos)=  ['colorless', 'ideas', 'sleep', 'furiously']


In [5]:
# 字典排序
print("sorted(pos)= ", sorted(pos))

sorted(pos)=  ['colorless', 'furiously', 'ideas', 'sleep']


In [6]:
# 字典顺序访问
word_list = [
        w
        for w in pos
        if w.endswith('s')
]
print("word_list= ", word_list)

word_list=  ['colorless', 'ideas']


In [7]:
# 遍历字典中的数据
# for word in sorted(pos):
for word in pos:
    print(word + ":", pos[word])

colorless: ADJ
ideas: N
sleep: V
furiously: ADV


In [8]:
# 访问字典的方法
print("键= ", pos.keys())
print("值= ", pos.values())
print("对= ", pos.items())

键=  dict_keys(['colorless', 'ideas', 'sleep', 'furiously'])
值=  dict_values(['ADJ', 'N', 'V', 'ADV'])
对=  dict_items([('colorless', 'ADJ'), ('ideas', 'N'), ('sleep', 'V'), ('furiously', 'ADV')])


In [9]:
# 分开获取字典中条目的键和值
# for key, val in sorted(pos.items()):
for key, val in pos.items():
    print(key + ":", val)

colorless: ADJ
ideas: N
sleep: V
furiously: ADV


In [10]:
# 字典中键必须惟一
pos['sleep'] = 'V'
print("pos['sleep']= ", pos['sleep'])
pos['sleep'] = 'N'
print("pos['sleep']= ", pos['sleep'])

pos['sleep']=  V
pos['sleep']=  N


### 5.3.3. 定义字典（创建字典的两种方式）

In [11]:
pos = {'colorless': 'ADJ', 'ideas': 'N', 'sleep': 'V', 'furiously': 'ADV'}
print("pos= ", pos)
pos = dict(colorless='ADJ', ideas='N', sleep='V', furiously='ADV')
print("pos= ", pos)

pos=  {'colorless': 'ADJ', 'ideas': 'N', 'sleep': 'V', 'furiously': 'ADV'}
pos=  {'colorless': 'ADJ', 'ideas': 'N', 'sleep': 'V', 'furiously': 'ADV'}


### 5.3.4. 默认字典（字典创建新键时的默认值）

In [12]:
from collections import defaultdict

# 默认值可以是不变对象
frequency = defaultdict(int)  
frequency['colorless'] = 4
print("frequency= ", frequency)
print("frequency['colorless']= ", frequency['colorless'])
# 访问不存在的键时，自动创建，使用定义的默认值
print("frequency['ideas']= ", frequency['ideas'])  
print("list(frequency.items())= ", list(frequency.items()))

frequency=  defaultdict(<class 'int'>, {'colorless': 4})
frequency['colorless']=  4
frequency['ideas']=  0
list(frequency.items())=  [('colorless', 4), ('ideas', 0)]


In [13]:
# 默认值也可以是可变对象
pos = defaultdict(list)  
pos['sleep'] = ['NOUN', 'VERB']
print("pos =", pos)
print("pos['sleep']= ", pos['sleep'])
print("pos['ideas']= ", pos['ideas'])
print("list(pos.items())= ", list(pos.items()))

pos = defaultdict(<class 'list'>, {'sleep': ['NOUN', 'VERB']})
pos['sleep']=  ['NOUN', 'VERB']
pos['ideas']=  []
list(pos.items())=  [('sleep', ['NOUN', 'VERB']), ('ideas', [])]


In [14]:
# 默认值为自定义对象
class myObject():
    def __init__(self, data=0):
        self._data = data
        return


oneObject = myObject(5)
print("oneObject._data= ", oneObject._data)
twoObject = myObject()
print("twoObject._data= ", twoObject._data)

pos = defaultdict(myObject)
pos['sleep'] = myObject(5)
print("pos['ideas']= ", pos['ideas'])
print("list(pos.items())= ", list(pos.items()))
print("pos['sleep']._data= ", pos['sleep']._data)
print("pos['ideas']._data= ", pos['ideas']._data)

oneObject._data=  5
twoObject._data=  0
pos['ideas']=  <__main__.myObject object at 0x000000000C192240>
list(pos.items())=  [('sleep', <__main__.myObject object at 0x000000000C1922E8>), ('ideas', <__main__.myObject object at 0x000000000C192240>)]
pos['sleep']._data=  5
pos['ideas']._data=  0


In [15]:
# 默认 lambda 表达式
pos = defaultdict(lambda: 'NOUN')
pos['colorless'] = 'ADJ'
print("pos['colorless']= ", pos['colorless'])
print("pos['blog']= ", pos['blog'])
print("list(pos.items())= ", list(pos.items()))

pos['colorless']=  ADJ
pos['blog']=  NOUN
list(pos.items())=  [('colorless', 'ADJ'), ('blog', 'NOUN')]


In [16]:
# 使用 UNK(out of vocabulary)（超出词汇表）标识符来替换低频词汇
alice = nltk.corpus.gutenberg.words('carroll-alice.txt')
vocab = nltk.FreqDist(alice)
v1000 = [
        word
        for (word, _) in vocab.most_common(1000)
]
mapping = defaultdict(lambda: 'UNK')
for v in v1000:
    mapping[v] = vocab[v]
print("list(mapping.items())[:20]= ", list(mapping.items())[:20])
alice2 = [
        mapping[v]
        for v in alice
]
print("alice2[:20]= ", alice2[:20])

list(mapping.items())[:20]=  [(',', 1993), ("'", 1731), ('the', 1527), ('and', 802), ('.', 764), ('to', 725), ('a', 615), ('I', 543), ('it', 527), ('she', 509), ('of', 500), ('said', 456), (",'", 397), ('Alice', 396), ('in', 357), ('was', 352), ('you', 345), ("!'", 278), ('that', 275), ('as', 246)]
alice2[:20]=  [3, 396, 1731, 195, 3, 357, 3, 55, 'UNK', 'UNK', 'UNK', 'UNK', 12, 543, 764, 3, 1527, 45, 141, 'UNK']


### 5.3.5. 递增地更新字典

In [17]:
# Ex5-3 递增地更新字典，按值排序
counts = nltk.defaultdict(int)
for (word, tag) in nltk.corpus.brown.tagged_words(categories='news', tagset='universal'):
    counts[tag] += 1
print("counts['NOUN']= ", counts['NOUN'])
print("sorted(counts)= ", sorted(counts))
print("counts= ", counts)

counts['NOUN']=  30654
sorted(counts)=  ['.', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM', 'PRON', 'PRT', 'VERB', 'X']
counts=  defaultdict(<class 'int'>, {'DET': 11389, 'NOUN': 30654, 'ADJ': 6706, 'VERB': 14399, 'ADP': 12355, '.': 11928, 'ADV': 3349, 'CONJ': 2717, 'PRT': 2264, 'PRON': 2535, 'NUM': 2166, 'X': 92})


In [18]:
from operator import itemgetter

# IndexError: tuple index out of range
sort_keys = sorted(counts.items(), key=itemgetter(0), reverse=False)
print("sort_keys= ", sort_keys)
sort_keys = sorted(counts.items(), key=itemgetter(1), reverse=False)
print("sort_keys= ", sort_keys)
sort_keys = sorted(counts.items(), key=itemgetter(1), reverse=True)
print("sort_keys= ", sort_keys)
# itemgetter(2) 没有这个选项，没法用于排序
# sort_keys = sorted(counts.items(), key=itemgetter(2), reverse=False)
# print("sort_keys= ", sort_keys)
key_list = [
        t
        for t, c in sorted(counts.items(), key=itemgetter(1), reverse=True)
]
print("key_list= ", key_list)

sort_keys=  [('.', 11928), ('ADJ', 6706), ('ADP', 12355), ('ADV', 3349), ('CONJ', 2717), ('DET', 11389), ('NOUN', 30654), ('NUM', 2166), ('PRON', 2535), ('PRT', 2264), ('VERB', 14399), ('X', 92)]
sort_keys=  [('X', 92), ('NUM', 2166), ('PRT', 2264), ('PRON', 2535), ('CONJ', 2717), ('ADV', 3349), ('ADJ', 6706), ('DET', 11389), ('.', 11928), ('ADP', 12355), ('VERB', 14399), ('NOUN', 30654)]
sort_keys=  [('NOUN', 30654), ('VERB', 14399), ('ADP', 12355), ('.', 11928), ('DET', 11389), ('ADJ', 6706), ('ADV', 3349), ('CONJ', 2717), ('PRON', 2535), ('PRT', 2264), ('NUM', 2166), ('X', 92)]
key_list=  ['NOUN', 'VERB', 'ADP', '.', 'DET', 'ADJ', 'ADV', 'CONJ', 'PRON', 'PRT', 'NUM', 'X']


In [19]:
pair = ('NP', 8336)
print("pair= ", pair)
print("pair[1]= ", pair[1])
print("itemgetter(0)(pair)= ", itemgetter(0)(pair))
print("itemgetter(1)(pair)= ", itemgetter(1)(pair))

pair=  ('NP', 8336)
pair[1]=  8336
itemgetter(0)(pair)=  NP
itemgetter(1)(pair)=  8336


In [20]:
# 通过最后两个字母索引词汇
last_letters = defaultdict(list)
words = nltk.corpus.words.words('en')
for word in words:
    key = word[-2:]
    last_letters[key].append(word)

print("last_letters['ly']= ", last_letters['ly'][:20])
print("last_letters['xy']= ", last_letters['xy'][:20])

last_letters['ly']=  ['abactinally', 'abandonedly', 'abasedly', 'abashedly', 'abashlessly', 'abbreviately', 'abdominally', 'abhorrently', 'abidingly', 'abiogenetically', 'abiologically', 'abjectly', 'ableptically', 'ably', 'abnormally', 'abominably', 'aborally', 'aboriginally', 'abortively', 'aboundingly']
last_letters['xy']=  ['acyloxy', 'adnexopexy', 'adoxy', 'agalaxy', 'alkoxy', 'alkyloxy', 'amidoxy', 'anorexy', 'anthotaxy', 'apoplexy', 'apyrexy', 'asphyxy', 'ataraxy', 'ataxy', 'azoxy', 'bandboxy', 'barotaxy', 'benzoxy', 'biotaxy', 'boxy']


In [21]:
# 颠倒字母而成的字（回文构词法，相同字母异序词，易位构词，变位词）索引词汇
anagrams = defaultdict(list)
for word in words:
    key = ''.join(sorted(word))
    anagrams[key].append(word)
print("anagrams['aeilnrt']= ", anagrams['aeilnrt'])
print("anagrams['kloo']= ", anagrams['kloo'])
print("anagrams['Zahity']= ", anagrams['Zahity'])
print("anagrams[''.join(sorted('love'))]= ", anagrams[''.join(sorted('love'))])

anagrams['aeilnrt']=  ['entrail', 'latrine', 'ratline', 'reliant', 'retinal', 'trenail']
anagrams['kloo']=  ['kolo', 'look']
anagrams['Zahity']=  ['Zythia']
anagrams[''.join(sorted('love'))]=  ['levo', 'love', 'velo', 'vole']


In [22]:
# NLTK 提供的创建 defaultdict(list) 更加简便的方法
# nltk.Index() 是对 defaultdict(list) 的支持
# nltk.FreqDist() 是对 defaultdict(int) 的支持（附带了排序和绘图的功能）
anagrams = nltk.Index((''.join(sorted(w)), w) for w in words)
print("anagrams['aeilnrt']= ", anagrams['aeilnrt'])

anagrams = nltk.FreqDist(''.join(sorted(w)) for w in words)
print("anagrams.most_common(20)= ", anagrams.most_common(20))

anagrams['aeilnrt']=  ['entrail', 'latrine', 'ratline', 'reliant', 'retinal', 'trenail']
anagrams.most_common(20)=  [('agnor', 9), ('acert', 9), ('eerst', 9), ('aelrst', 8), ('aelpt', 8), ('adelnr', 7), ('aelm', 7), ('aelrt', 7), ('aeglr', 7), ('ailr', 7), ('airst', 7), ('aemrt', 7), ('aenprt', 7), ('aeerst', 7), ('aelt', 7), ('aderrt', 7), ('adert', 7), ('aeginst', 7), ('aelps', 7), ('aelst', 7)]


### 5.3.6. 复杂的键和值

In [23]:
# 使用复杂的键和值的默认字典
pos = defaultdict(lambda: defaultdict(int))
brown_news_tagged = nltk.corpus.brown.tagged_words(categories='news', tagset='universal')
for ((w1, t1), (w2, t2)) in nltk.bigrams(brown_news_tagged):
    pos[(t1, w2)][t2] += 1

print("pos[('DET', 'right')]= ", pos[('DET', 'right')])
print("pos[('NOUN', 'further')]= ", pos[('NOUN', 'further')])
print("pos[('PRT', 'book')]= ", pos[('PRT', 'book')])
show_subtitle("pos")
for i, (key, value) in enumerate(pos):
    if i<20:
        print(key,value)

pos[('DET', 'right')]=  defaultdict(<class 'int'>, {'NOUN': 5, 'ADJ': 11})
pos[('NOUN', 'further')]=  defaultdict(<class 'int'>, {'ADV': 2})
pos[('PRT', 'book')]=  defaultdict(<class 'int'>, {})
--------------- >pos< ---------------
DET Fulton
NOUN County
NOUN Grand
ADJ Jury
NOUN said
VERB Friday
NOUN an
DET investigation
NOUN of
ADP Atlanta's
NOUN recent
ADJ primary
NOUN election
NOUN produced
VERB ``
. no
DET evidence
NOUN ''
. that
ADP any


### 5.3.7. 颠倒字典

表5-5：Python 字典的常用方法

In [24]:
# 通过键查值速度很快，但是通过值查键的速度较慢，为也加速查找可以重新创建一个映射值到键的字典
counts = defaultdict(int)
for word in nltk.corpus.gutenberg.words('milton-paradise.txt'):
    counts[word] += 1

# 通过值查键的一种方法
key_list = [
        key
        for (key, value) in counts.items()
        if value == 32
]
print("key_list= ", key_list)

key_list=  ['mortal', 'Against', 'Him', 'There', 'brought', 'King', 'virtue', 'every', 'been', 'thine']


In [25]:
# 使用键-值对字典创建值-键对字典
# pos 是键-值对字典；pos2 是值-键对字典
pos = {'colorless': 'ADJ', 'ideas': 'N', 'sleep': 'V', 'furiously': 'ADV'}
print("pos= ", pos)
pos2 = dict(
        (value, key)
        for (key, value) in pos.items()
)
print("pos2['N']= ", pos2['N'])

pos=  {'colorless': 'ADJ', 'ideas': 'N', 'sleep': 'V', 'furiously': 'ADV'}
pos2['N']=  ideas


In [26]:
# 一个键有多个值的键-值字典不能使用上面的方法创建值-键字典
# 提供了一个新的方法创建值-键对字典
pos.update({'cats': 'N', 'scratch': 'V', 'peacefully': 'ADV', 'old': 'ADJ'})
print("pos= ", pos)
pos2 = defaultdict(list)
for key, value in pos.items():
    pos2[value].append(key)
print("pos2['ADV']= ", pos2['ADV'])

pos=  {'colorless': 'ADJ', 'ideas': 'N', 'sleep': 'V', 'furiously': 'ADV', 'cats': 'N', 'scratch': 'V', 'peacefully': 'ADV', 'old': 'ADJ'}
pos2['ADV']=  ['furiously', 'peacefully']


In [27]:
# 使用 nltk.Index() 函数创建新的值-键对字典
pos2 = nltk.Index(
        (value, key)
        for (key, value) in pos.items()
)
print("pos2['ADV']= ", pos2['ADV'])

pos2['ADV']=  ['furiously', 'peacefully']
