In [1]:
import nltk
import re

# Ch07 从文本提取信息

学习目标

1.  从非结构化文本中提取结构化数据
2.  识别一个文本中描述的实体和关系
3.  使用语料库来训练和评估模型

## 7.1 信息提取

从文本中获取意义的方法被称为「信息提取」

1.  从结构化数据中提取信息
2.  从非结构化文本中提取信息
    -   建立一个非常一般的含义
    -   查找文本中具体的各种信息
        -   将非结构化数据转换成结构化数据
        -   使用查询工具从文本中提取信息

### 7.1.1 信息提取结构
｛原始文本（一串）｝→ 断句 

→｛句子（字符串列表）｝→ 分词 

→｛句子分词｝→ 词性标注 

→｛句子词性标注｝→ 实体识别 

→｛句子分块｝→ 关系识别 

→｛关系列表｝

In [2]:
# P283 图7-1，Ex7-1：信息提取结构元组（entity, relation, entity)
# 建立标准化数据库
locs = [('Omnicom', 'IN', 'New York'),
        ('DDB Needham', 'IN', 'New York'),
        ('Kaplan Thaler Group', 'IN', 'New York'),
        ('BBDO South', 'IN', 'Atlanta'),
        ('Georgia-Pacific', 'IN', 'Atlanta')]
# 依据查询提取信息
query = [
        e1
        for (e1, rel, e2) in locs
        if e2 == 'Atlanta'
]
print(query)

['BBDO South', 'Georgia-Pacific']


## 7.2 分块：用于实体识别的基本技术（P284 图7-2）

分块构成的源文本中的片段不能重叠

-   小框显示词级标识符和词性标注
-   大框表示组块（chunk），是较高级别的程序分块

分块的方法

-  正则表达式和N-gram方法分块；
-  使用CoNLL-2000分块语料库开发和评估分块器；

### 7.2.1 名词短语分块（NP-chunking，即“NP-分块”）寻找单独名词短语对应的块

NP-分块是比完整的名词短语更小的片段，不包含其他的NP-分块，修饰一个任何介词短语或者从句将不包括在相应的NP-分块内。
 
NP-分块信息最有用的来源之一是词性标记。

In [3]:
# P285 Ex7-1 基于正则表达式的NP 分块器
# 使用分析器对句子进行分块
sentence = [('the', 'DT'),
            ('little', 'JJ'),
            ('yellow', 'JJ'),
            ('dog', 'NN'),
            ('barked', 'VBD'),
            ('at', 'IN'),
            ('the', 'DT'),
            ('cat', 'NN')]
# 定义分块语法
grammar = 'NP: {<DT>?<JJ>*<NN>}'
# 创建组块分析器
cp = nltk.RegexpParser(grammar)
# 对句子进行分块
result = cp.parse(sentence)
# 输出分块的树状图
print(result)
result.draw()

(S
  (NP the/DT little/JJ yellow/JJ dog/NN)
  barked/VBD
  at/IN
  (NP the/DT cat/NN))


### 7.2.2. 标记模式

In [4]:
# 华尔街日报
sentence = [('another', 'DT'),
            ('sharp', 'JJ'),
            ('dive', 'NN'),
            ('trade', 'NN'),
            ('figures', 'NNS'),
            ('any', 'DT'),
            ('new', 'JJ'),
            ('policy', 'NN'),
            ('measures', 'NNS'),
            ('earlier', 'JJR'),
            ('stages', 'NNS'),
            ('Panamanian', 'JJ'),
            ('dictator', 'NN'),
            ('Manuel', 'NNP'),
            ('Noriega', 'NNP')]
grammar = 'NP: {<DT>?<JJ.*>*<NN.*>+}'
cp = nltk.RegexpParser(grammar)
result = cp.parse(sentence)
print(result)

(S
  (NP another/DT sharp/JJ dive/NN trade/NN figures/NNS)
  (NP any/DT new/JJ policy/NN measures/NNS)
  (NP earlier/JJR stages/NNS)
  (NP Panamanian/JJ dictator/NN Manuel/NNP Noriega/NNP))


In [5]:
# Grammar中输入语法，语法格式{<DT>?<JJ.*>*<NN.*>+}，不能在前面加NP:，具体可以参考右边的Regexps说明
# Development Set就是开发测试集，用于调试语法规则。绿色表示正确匹配，红色表示没有正确匹配。黄金标准标注为下划线
nltk.app.chunkparser()

### 7.2.3 用正则表达式分块（组块分析）

In [6]:
# Ex7-2 简单的名词短语分类器
sentence = [("Rapunzel", "NNP"),
            ("let", "VBD"),
            ("down", "RP"),
            ("her", "PP$"),
            ("long", "JJ"),
            ("golden", "JJ"),
            ("hair", "NN")]
# 两个规则组成的组块分析语法，注意规则执行会有先后顺序，两个规则如果有重叠部分，以先执行的为准
grammar = r'''
  NP: {<DT|PP\$>?<JJ>*<NN>}   # chunk determiner/possessive, adjectives and noun
      {<NNP>+}                # chunk sequences of proper nouns
'''
print(nltk.RegexpParser(grammar).parse(sentence))

(S
  (NP Rapunzel/NNP)
  let/VBD
  down/RP
  (NP her/PP$ long/JJ golden/JJ hair/NN))


In [7]:
grammar = r'NP: {<[CDJ].*>+}'
print(nltk.RegexpParser(grammar).parse(sentence))

(S
  Rapunzel/NNP
  let/VBD
  down/RP
  her/PP$
  (NP long/JJ golden/JJ)
  hair/NN)


In [8]:
grammar = r'NP: {<[CDJNP].*>+}'
print(nltk.RegexpParser(grammar).parse(sentence))

(S
  (NP Rapunzel/NNP)
  let/VBD
  down/RP
  (NP her/PP$ long/JJ golden/JJ hair/NN))


In [9]:
grammar = r'NP: {<[CDJN].*>+}'
print(nltk.RegexpParser(grammar).parse(sentence))

(S
  (NP Rapunzel/NNP)
  let/VBD
  down/RP
  her/PP$
  (NP long/JJ golden/JJ hair/NN))


In [10]:
# 如果模式匹配位置重叠，最左边的优先匹配。
# 例如：如果将匹配两个连贯名字的文本的规则应用到包含3个连贯名词的文本中，则只有前两个名词被分块
nouns = [('money', 'NN'), ('market', 'NN'), ('fund', 'NN')]
grammar = 'NP: {<NN><NN>}'
print("错误分块的结果= ", nltk.RegexpParser(grammar).parse(nouns))
grammar = 'NP: {<NN>+}'
print("正确分块的结果= ", nltk.RegexpParser(grammar).parse(nouns))

错误分块的结果=  (S (NP money/NN market/NN) fund/NN)
正确分块的结果=  (S (NP money/NN market/NN fund/NN))


### 7.2.4 探索文本语料库：从已经标注的语料库中提取匹配特定词性标记序列的短语

In [11]:
grammar = 'CHUNK: {<V.*><TO><V.*>}'
cp = nltk.RegexpParser(grammar)
brown = nltk.corpus.brown
count = 0
for sent in brown.tagged_sents():
    if count < 10:
        tree = cp.parse(sent)
        for subtree in tree.subtrees():
            if subtree.label() == 'CHUNK':
                count += 1
                print(subtree)

(CHUNK combined/VBN to/TO achieve/VB)
(CHUNK continue/VB to/TO place/VB)
(CHUNK serve/VB to/TO protect/VB)
(CHUNK wanted/VBD to/TO wait/VB)
(CHUNK allowed/VBN to/TO place/VB)
(CHUNK expected/VBN to/TO become/VB)
(CHUNK expected/VBN to/TO approve/VB)
(CHUNK expected/VBN to/TO make/VB)
(CHUNK intends/VBZ to/TO make/VB)
(CHUNK seek/VB to/TO set/VB)


In [12]:
# 定义一个搜索函数（一次性返回定义好的数据量）
def find_chunks(pattern):
    cp = nltk.RegexpParser(pattern)
    brown = nltk.corpus.brown
    count = 0
    for sent in brown.tagged_sents():
        if count < 10:
            tree = cp.parse(sent)
            for subtree in tree.subtrees():
                if subtree.label() == 'CHUNK' or subtree.label()=='NOUNS':
                    count += 1
                    print(subtree)

In [13]:
grammar = 'CHUNK: {<V.*><TO><V.*>}'
find_chunks(grammar)

(CHUNK combined/VBN to/TO achieve/VB)
(CHUNK continue/VB to/TO place/VB)
(CHUNK serve/VB to/TO protect/VB)
(CHUNK wanted/VBD to/TO wait/VB)
(CHUNK allowed/VBN to/TO place/VB)
(CHUNK expected/VBN to/TO become/VB)
(CHUNK expected/VBN to/TO approve/VB)
(CHUNK expected/VBN to/TO make/VB)
(CHUNK intends/VBZ to/TO make/VB)
(CHUNK seek/VB to/TO set/VB)


In [14]:
grammar = 'NOUNS: {<N.*>{4,}}'
find_chunks(grammar)

(NOUNS Court/NN-TL Judge/NN-TL Durwood/NP Pye/NP)
(NOUNS Mayor-nominate/NN-TL Ivan/NP Allen/NP Jr./NP)
(NOUNS Georgia's/NP$ automobile/NN title/NN law/NN)
(NOUNS State/NN-TL Welfare/NN-TL Department's/NN$-TL handling/NN)
(NOUNS Fulton/NP-TL Tax/NN-TL Commissioner's/NN$-TL Office/NN-TL)
(NOUNS Mayor/NN-TL William/NP B./NP Hartsfield/NP)
(NOUNS Mrs./NP J./NP M./NP Cheshire/NP)
(NOUNS E./NP Pelham/NP Rd./NN-TL Aj/NN)
(NOUNS
  State/NN-TL
  Party/NN-TL
  Chairman/NN-TL
  James/NP
  W./NP
  Dorsey/NP)
(NOUNS Texas/NP Sen./NN-TL John/NP Tower/NP)


In [15]:
# 定义一个搜索函数（使用生成器）
def find_chunks(pattern):
    cp = nltk.RegexpParser(pattern)
    brown = nltk.corpus.brown
    for sent in brown.tagged_sents():
        tree = cp.parse(sent)
        for subtree in tree.subtrees():
            if subtree.label() == 'CHUNK' or subtree.label() == 'NOUNS':
                yield subtree

In [16]:
grammar = 'CHUNK: {<V.*><TO><V.*>}'
for i, subtree in enumerate(find_chunks(grammar)):
    if i < 10:
        print(subtree)

(CHUNK combined/VBN to/TO achieve/VB)
(CHUNK continue/VB to/TO place/VB)
(CHUNK serve/VB to/TO protect/VB)
(CHUNK wanted/VBD to/TO wait/VB)
(CHUNK allowed/VBN to/TO place/VB)
(CHUNK expected/VBN to/TO become/VB)
(CHUNK expected/VBN to/TO approve/VB)
(CHUNK expected/VBN to/TO make/VB)
(CHUNK intends/VBZ to/TO make/VB)
(CHUNK seek/VB to/TO set/VB)


In [17]:
grammar = 'NOUNS: {<N.*>{4,}}'
for i, subtree in enumerate(find_chunks(grammar)):
    if i < 10:
        print(subtree)

(NOUNS Court/NN-TL Judge/NN-TL Durwood/NP Pye/NP)
(NOUNS Mayor-nominate/NN-TL Ivan/NP Allen/NP Jr./NP)
(NOUNS Georgia's/NP$ automobile/NN title/NN law/NN)
(NOUNS State/NN-TL Welfare/NN-TL Department's/NN$-TL handling/NN)
(NOUNS Fulton/NP-TL Tax/NN-TL Commissioner's/NN$-TL Office/NN-TL)
(NOUNS Mayor/NN-TL William/NP B./NP Hartsfield/NP)
(NOUNS Mrs./NP J./NP M./NP Cheshire/NP)
(NOUNS E./NP Pelham/NP Rd./NN-TL Aj/NN)
(NOUNS
  State/NN-TL
  Party/NN-TL
  Chairman/NN-TL
  James/NP
  W./NP
  Dorsey/NP)
(NOUNS Texas/NP Sen./NN-TL John/NP Tower/NP)


### 7.2.5. 添加缝隙：寻找需要排除的成分

可以为不包括在大块中的标识符序列定义一个缝隙。

In [18]:
sentence = [("the", "DT"),
            ("little", "JJ"),
            ("yellow", "JJ"),
            ("dog", "NN"),
            ("barked", "VBD"),
            ("at", "IN"),
            ("the", "DT"),
            ("cat", "NN")]

In [19]:
# 先分块，再加缝隙，才能得出正确的结果
grammar = r'''
    NP: 
        {<.*>+}         # Chunk everything （先对所有数据分块）
        }<VBD|IN>+{     # Chink sequences of VBD and IN（对 VBD 或者 IN 加缝隙）
'''
print(nltk.RegexpParser(grammar).parse(sentence))

(S
  (NP the/DT little/JJ yellow/JJ dog/NN)
  barked/VBD
  at/IN
  (NP the/DT cat/NN))


In [20]:
# 先加缝隙，再分块，就不能得出正确的结果，只会得到一个块，效果与没有使用缝隙是一样的
grammar = r'''
    NP: 
        }<VBD|IN>+{     # Chink sequences of VBD and IN
        {<.*>+}         # Chunk everything
'''
print(nltk.RegexpParser(grammar).parse(sentence))

(S
  (NP
    the/DT
    little/JJ
    yellow/JJ
    dog/NN
    barked/VBD
    at/IN
    the/DT
    cat/NN))


In [21]:
grammar = r'''
    NP: 
        {<.*>+}         # Chunk everything
'''
print(nltk.RegexpParser(grammar).parse(sentence))

(S
  (NP
    the/DT
    little/JJ
    yellow/JJ
    dog/NN
    barked/VBD
    at/IN
    the/DT
    cat/NN))


### 7.2.6 分块的表示：标记与树状图

作为「标注」和「分析」之间的中间状态（Ref：Ch8），块结构可以使用标记或者树状图来表示

使用最为广泛的表示是IOB标记：

-   I（Inside，内部）；
-   O（Outside，外部）；
-   B（Begin，开始）。

## 7.3 开发和评估分块器

### 7.3.1 读取 IOB格式 和 CoNLL2000 语料库

In [22]:
# 不能在text里面加入“空格”和“置表符”用来控制文本的格式
text = '''
he PRP B-NP
accepted VBD B-VP
the DT B-NP
position NN I-NP
of IN B-PP
vice NN B-NP
chairman NN I-NP
of IN B-PP
Carlyle NNP B-NP
Group NNP I-NP
, , O
a DT B-NP
merchant NN I-NP
banking NN I-NP
concern NN I-NP
. . O
'''

In [23]:
# 绘制块结构的树状图表示
nltk.chunk.conllstr2tree(text, chunk_types=('NP',)).draw()

In [24]:
nltk.chunk.conllstr2tree(text, chunk_types=('NP', 'VP')).draw()

In [25]:
# CoNLL2000分块语料库包括3种分块类型：NP、VP、PP
from nltk.corpus import conll2000

train_sents = conll2000.chunked_sents('train.txt', chunk_types='NP')
print(train_sents[0])

(S
  (NP Confidence/NN)
  in/IN
  (NP the/DT pound/NN)
  is/VBZ
  widely/RB
  expected/VBN
  to/TO
  take/VB
  (NP another/DT sharp/JJ dive/NN)
  if/IN
  (NP trade/NN figures/NNS)
  for/IN
  (NP September/NNP)
  ,/,
  due/JJ
  for/IN
  (NP release/NN)
  (NP tomorrow/NN)
  ,/,
  fail/VB
  to/TO
  show/VB
  (NP a/DT substantial/JJ improvement/NN)
  from/IN
  (NP July/NNP and/CC August/NNP)
  (NP 's/POS near-record/JJ deficits/NNS)
  ./.)


In [26]:
train_sents = conll2000.chunked_sents('train.txt', chunk_types=('NP', 'VP'))
print(train_sents[0])

(S
  (NP Confidence/NN)
  in/IN
  (NP the/DT pound/NN)
  (VP is/VBZ widely/RB expected/VBN to/TO take/VB)
  (NP another/DT sharp/JJ dive/NN)
  if/IN
  (NP trade/NN figures/NNS)
  for/IN
  (NP September/NNP)
  ,/,
  due/JJ
  for/IN
  (NP release/NN)
  (NP tomorrow/NN)
  ,/,
  (VP fail/VB to/TO show/VB)
  (NP a/DT substantial/JJ improvement/NN)
  from/IN
  (NP July/NNP and/CC August/NNP)
  (NP 's/POS near-record/JJ deficits/NNS)
  ./.)


In [27]:
train_sents = conll2000.chunked_sents('train.txt', chunk_types=('NP', 'VP', 'PP'))
print(train_sents[0])

(S
  (NP Confidence/NN)
  (PP in/IN)
  (NP the/DT pound/NN)
  (VP is/VBZ widely/RB expected/VBN to/TO take/VB)
  (NP another/DT sharp/JJ dive/NN)
  if/IN
  (NP trade/NN figures/NNS)
  (PP for/IN)
  (NP September/NNP)
  ,/,
  due/JJ
  (PP for/IN)
  (NP release/NN)
  (NP tomorrow/NN)
  ,/,
  (VP fail/VB to/TO show/VB)
  (NP a/DT substantial/JJ improvement/NN)
  (PP from/IN)
  (NP July/NNP and/CC August/NNP)
  (NP 's/POS near-record/JJ deficits/NNS)
  ./.)


### 7.3.2 简单的评估和基准

In [28]:
# 建立基准
test_sents = conll2000.chunked_sents('test.txt', chunk_types='NP')
print(test_sents[0])

(S
  (NP Rockwell/NNP International/NNP Corp./NNP)
  (NP 's/POS Tulsa/NNP unit/NN)
  said/VBD
  (NP it/PRP)
  signed/VBD
  (NP a/DT tentative/JJ agreement/NN)
  extending/VBG
  (NP its/PRP$ contract/NN)
  with/IN
  (NP Boeing/NNP Co./NNP)
  to/TO
  provide/VB
  (NP structural/JJ parts/NNS)
  for/IN
  (NP Boeing/NNP)
  (NP 's/POS 747/CD jetliners/NNS)
  ./.)


In [29]:
# 没有任何语法规则，即所有的词都被标注为O
print(nltk.RegexpParser('').evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  43.4%%
    Precision:      0.0%%
    Recall:         0.0%%
    F-Measure:      0.0%%


In [30]:
# 正则表达式分块器
grammar = r'NP: {<[CDJNP].*>+}'
print(nltk.RegexpParser(grammar).parse(test_sents)[0])
print(nltk.RegexpParser(grammar).evaluate(test_sents))

(S
  (NP Rockwell/NNP International/NNP Corp./NNP)
  (NP 's/POS Tulsa/NNP unit/NN)
  said/VBD
  (NP it/PRP)
  signed/VBD
  (NP a/DT tentative/JJ agreement/NN)
  extending/VBG
  (NP its/PRP$ contract/NN)
  with/IN
  (NP Boeing/NNP Co./NNP)
  to/TO
  provide/VB
  (NP structural/JJ parts/NNS)
  for/IN
  (NP Boeing/NNP)
  (NP 's/POS 747/CD jetliners/NNS)
  ./.)
ChunkParse score:
    IOB Accuracy:  87.7%%
    Precision:     70.6%%
    Recall:        67.8%%
    F-Measure:     69.2%%


In [31]:
class UnigramChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        train_data = [
            [(t, c)
             for w, t, c in nltk.chunk.tree2conlltags(sent)]  # 准备训练用的数据
            for sent in train_sents]
        self.tagger = nltk.UnigramTagger(train_data)  # 使用训练数据训练一元语法标注器

    def parse(self, sentence):
        pos_tags = [pos for (word, pos) in sentence]
        # 需要标注的内容 ['NN','CC','DT','PRP'...]
        tagged_pos_tag = self.tagger.tag(pos_tags)
        # 标注好的结果 [('NNP','I-NP'),(',','O')...]
        chunktags = [
            chunktag
            for (pos, chunktag) in tagged_pos_tag
        ]  # 把标注好的结果选出来
        conlltags = [
            (word, pos, chunktag)
            for ((word, pos), chunktag) in zip(sentence, chunktags)
        ]  # 组成最后需要输出的结果
        # 最后输出的结果：[('Rockwell', 'NNP', 'I-NP'), ('International', 'NNP', 'I-NP')...]
        return nltk.chunk.conlltags2tree(conlltags)  # 将结果转化成树块的方式输出

In [32]:
from nltk.corpus import conll2000

test_sents = conll2000.chunked_sents('test.txt', chunk_types='NP')
train_sents = conll2000.chunked_sents('train.txt', chunk_types='NP')

# 评估unigram标注器的性能
unigram_chunker = UnigramChunker(train_sents)
print(unigram_chunker.evaluate(test_sents))


ChunkParse score:
    IOB Accuracy:  92.9%%
    Precision:     79.9%%
    Recall:        86.8%%
    F-Measure:     83.2%%


In [33]:
# 训练用的数据格式
train_data = [
    [(t, c) 
     for w, t, c in nltk.chunk.tree2conlltags(sent)] 
    for sent in train_sents]
print(train_data[0])

[('NN', 'B-NP'), ('IN', 'O'), ('DT', 'B-NP'), ('NN', 'I-NP'), ('VBZ', 'O'), ('RB', 'O'), ('VBN', 'O'), ('TO', 'O'), ('VB', 'O'), ('DT', 'B-NP'), ('JJ', 'I-NP'), ('NN', 'I-NP'), ('IN', 'O'), ('NN', 'B-NP'), ('NNS', 'I-NP'), ('IN', 'O'), ('NNP', 'B-NP'), (',', 'O'), ('JJ', 'O'), ('IN', 'O'), ('NN', 'B-NP'), ('NN', 'B-NP'), (',', 'O'), ('VB', 'O'), ('TO', 'O'), ('VB', 'O'), ('DT', 'B-NP'), ('JJ', 'I-NP'), ('NN', 'I-NP'), ('IN', 'O'), ('NNP', 'B-NP'), ('CC', 'I-NP'), ('NNP', 'I-NP'), ('POS', 'B-NP'), ('JJ', 'I-NP'), ('NNS', 'I-NP'), ('.', 'O')]


In [34]:
# 一元标注器对于标签的标注结果
postags = sorted(set(
    pos
    for sent in train_sents
    for (word, pos) in sent.leaves()))
print(unigram_chunker.tagger.tag(postags))

[('#', 'B-NP'), ('$', 'B-NP'), ("''", 'O'), ('(', 'O'), (')', 'O'), (',', 'O'), ('.', 'O'), (':', 'O'), ('CC', 'O'), ('CD', 'I-NP'), ('DT', 'B-NP'), ('EX', 'B-NP'), ('FW', 'I-NP'), ('IN', 'O'), ('JJ', 'I-NP'), ('JJR', 'B-NP'), ('JJS', 'I-NP'), ('MD', 'O'), ('NN', 'I-NP'), ('NNP', 'I-NP'), ('NNPS', 'I-NP'), ('NNS', 'I-NP'), ('PDT', 'B-NP'), ('POS', 'B-NP'), ('PRP', 'B-NP'), ('PRP$', 'B-NP'), ('RB', 'O'), ('RBR', 'O'), ('RBS', 'B-NP'), ('RP', 'O'), ('SYM', 'O'), ('TO', 'O'), ('UH', 'O'), ('VB', 'O'), ('VBD', 'O'), ('VBG', 'O'), ('VBN', 'O'), ('VBP', 'O'), ('VBZ', 'O'), ('WDT', 'B-NP'), ('WP', 'B-NP'), ('WP$', 'B-NP'), ('WRB', 'O'), ('``', 'O')]


In [35]:
# 试着自己建立一个二元标注器
class BigramChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        train_data = [
            [(t, c)
             for w, t, c in nltk.chunk.tree2conlltags(sent)]
            for sent in train_sents]
        self.tagger = nltk.BigramTagger(train_data)

    def parse(self, sentence):
        pos_tags = [
            pos
            for (word, pos) in sentence]
        tagged_pos_tag = self.tagger.tag(pos_tags)
        chunktags = [
            chunktag
            for (pos, chunktag) in tagged_pos_tag]
        conlltags = [
            (word, pos, chunktag)
            for ((word, pos), chunktag) in zip(sentence, chunktags)]
        return nltk.chunk.conlltags2tree(conlltags)

In [36]:
# 二元标注器对性能的提高非常有限
bigram_chunker = BigramChunker(train_sents)
print(bigram_chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  93.3%%
    Precision:     82.3%%
    Recall:        86.8%%
    F-Measure:     84.5%%


### 7.3.3 训练基于分类器的分块器
想要最大限度地提升分块的性能，需要使用词的内容信息作为词性标记的补充。

In [37]:
# Ex-7.5. 使用连续分类器（最大熵分类器）对名词短语分块（i5-5200U，执行时间20分钟）
# 不能使用megam算法，megam表示LM-BFGS algorithm，需要使用External Libraries，
# Windows用户就不要尝试了，因为作者根本没有提供Windows的安装版本
# 取消algorithm='megam'设置，使用默认的算法就可以了-->The default algorithm = 'IIS'(Improved Iterative Scaling )
# ConsecutiveNPChunkTagger与Ex6-5中的ConsecutivePosTagger类相同，区别只有特征提取器不同。
class ConsecutiveNPChunkTagger(nltk.TaggerI):
    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                # print(untagged_sent, i, history)
                featureset = npchunk_features(untagged_sent, i, history)
                train_set.append((featureset, tag))
                history.append(tag)
        # self.classifier = nltk.MaxentClassifier.train(train_set, algorithm='megam', trace=0)
        self.classifier = nltk.MaxentClassifier.train(train_set, trace=0)

    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = npchunk_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)

In [38]:
# 对ConsecutiveNPChunkTagger的包装类，使之变成一个分块器
class ConsecutiveNPChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        tagged_sents = [
            [((w, t), c) for (w, t, c) in nltk.chunk.tree2conlltags(sent)]
            for sent in train_sents]
        self.tagger = ConsecutiveNPChunkTagger(tagged_sents)

    def parse(self, sentence):
        tagged_sents = self.tagger.tag(sentence)
        conlltags = [
            (w, t, c,)
            for ((w, t), c) in tagged_sents]
        return nltk.chunk.conlltags2tree(conlltags)

In [39]:
# 1） 第一个特征提取器
#       最为简单，只使用了单词本身的标签作为特征，训练结果与unigram分类器非常相似
def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    return {'pos': pos}

In [40]:
# 验证基于分类器的分块器的性能（运行时间较长）
chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  92.9%%
    Precision:     79.9%%
    Recall:        86.8%%
    F-Measure:     83.2%%


In [41]:
# 最初的是[（（单词，标签），分块）,...]
chunked_sents = [
    [((w, t), c)
     for (w, t, c) in nltk.chunk.tree2conlltags(sent)]
    for sent in train_sents[0:1]]
print(chunked_sents[0])

[(('Confidence', 'NN'), 'B-NP'), (('in', 'IN'), 'O'), (('the', 'DT'), 'B-NP'), (('pound', 'NN'), 'I-NP'), (('is', 'VBZ'), 'O'), (('widely', 'RB'), 'O'), (('expected', 'VBN'), 'O'), (('to', 'TO'), 'O'), (('take', 'VB'), 'O'), (('another', 'DT'), 'B-NP'), (('sharp', 'JJ'), 'I-NP'), (('dive', 'NN'), 'I-NP'), (('if', 'IN'), 'O'), (('trade', 'NN'), 'B-NP'), (('figures', 'NNS'), 'I-NP'), (('for', 'IN'), 'O'), (('September', 'NNP'), 'B-NP'), ((',', ','), 'O'), (('due', 'JJ'), 'O'), (('for', 'IN'), 'O'), (('release', 'NN'), 'B-NP'), (('tomorrow', 'NN'), 'B-NP'), ((',', ','), 'O'), (('fail', 'VB'), 'O'), (('to', 'TO'), 'O'), (('show', 'VB'), 'O'), (('a', 'DT'), 'B-NP'), (('substantial', 'JJ'), 'I-NP'), (('improvement', 'NN'), 'I-NP'), (('from', 'IN'), 'O'), (('July', 'NNP'), 'B-NP'), (('and', 'CC'), 'I-NP'), (('August', 'NNP'), 'I-NP'), (("'s", 'POS'), 'B-NP'), (('near-record', 'JJ'), 'I-NP'), (('deficits', 'NNS'), 'I-NP'), (('.', '.'), 'O')]


In [42]:
# 脱第一层“分块”得到[（单词，标签）,...]
tagged_sent = nltk.tag.untag(chunked_sents[0])
print(tagged_sent)

[('Confidence', 'NN'), ('in', 'IN'), ('the', 'DT'), ('pound', 'NN'), ('is', 'VBZ'), ('widely', 'RB'), ('expected', 'VBN'), ('to', 'TO'), ('take', 'VB'), ('another', 'DT'), ('sharp', 'JJ'), ('dive', 'NN'), ('if', 'IN'), ('trade', 'NN'), ('figures', 'NNS'), ('for', 'IN'), ('September', 'NNP'), (',', ','), ('due', 'JJ'), ('for', 'IN'), ('release', 'NN'), ('tomorrow', 'NN'), (',', ','), ('fail', 'VB'), ('to', 'TO'), ('show', 'VB'), ('a', 'DT'), ('substantial', 'JJ'), ('improvement', 'NN'), ('from', 'IN'), ('July', 'NNP'), ('and', 'CC'), ('August', 'NNP'), ("'s", 'POS'), ('near-record', 'JJ'), ('deficits', 'NNS'), ('.', '.')]


In [43]:
# 再脱一层“标签”得到[单词,...]
untagged_sent = nltk.tag.untag(tagged_sent)
print(untagged_sent)

['Confidence', 'in', 'the', 'pound', 'is', 'widely', 'expected', 'to', 'take', 'another', 'sharp', 'dive', 'if', 'trade', 'figures', 'for', 'September', ',', 'due', 'for', 'release', 'tomorrow', ',', 'fail', 'to', 'show', 'a', 'substantial', 'improvement', 'from', 'July', 'and', 'August', "'s", 'near-record', 'deficits', '.']


In [44]:
# 再脱一层“标签”就会报错
# nltk.tag.untag(untagged_sent)

In [52]:
history=[]
for i, (word, tag) in enumerate(chunked_sents[0]):
    print(str(i)+'.  ', word, tag, end='\t --> \t')
    feature_set = npchunk_features(tagged_sent, i, history)
    print(feature_set)
    history.append(tag)

0.   ('Confidence', 'NN') B-NP	 --> 	{'pos': 'NN'}
1.   ('in', 'IN') O	 --> 	{'pos': 'IN'}
2.   ('the', 'DT') B-NP	 --> 	{'pos': 'DT'}
3.   ('pound', 'NN') I-NP	 --> 	{'pos': 'NN'}
4.   ('is', 'VBZ') O	 --> 	{'pos': 'VBZ'}
5.   ('widely', 'RB') O	 --> 	{'pos': 'RB'}
6.   ('expected', 'VBN') O	 --> 	{'pos': 'VBN'}
7.   ('to', 'TO') O	 --> 	{'pos': 'TO'}
8.   ('take', 'VB') O	 --> 	{'pos': 'VB'}
9.   ('another', 'DT') B-NP	 --> 	{'pos': 'DT'}
10.   ('sharp', 'JJ') I-NP	 --> 	{'pos': 'JJ'}
11.   ('dive', 'NN') I-NP	 --> 	{'pos': 'NN'}
12.   ('if', 'IN') O	 --> 	{'pos': 'IN'}
13.   ('trade', 'NN') B-NP	 --> 	{'pos': 'NN'}
14.   ('figures', 'NNS') I-NP	 --> 	{'pos': 'NNS'}
15.   ('for', 'IN') O	 --> 	{'pos': 'IN'}
16.   ('September', 'NNP') B-NP	 --> 	{'pos': 'NNP'}
17.   (',', ',') O	 --> 	{'pos': ','}
18.   ('due', 'JJ') O	 --> 	{'pos': 'JJ'}
19.   ('for', 'IN') O	 --> 	{'pos': 'IN'}
20.   ('release', 'NN') B-NP	 --> 	{'pos': 'NN'}
21.   ('tomorrow', 'NN') B-NP	 --> 	{'pos': 'NN'}
22.   (

In [53]:
# 2） 第二个特征提取器
#       使用了单词前面一个单词的标签作为特征，效果类似于bigram分块器
def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    if i == 0:
        prevword, prevpos = '<START>', '<START>'
    else:
        prevword, prevpos = sentence[i - 1]
    return {'pos': pos, 'prevpos': prevpos}


chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  93.6%%
    Precision:     82.0%%
    Recall:        87.2%%
    F-Measure:     84.6%%


In [54]:
# 3） 第三个特征提取器，使用了单词本身的标签、前一个单词、前一个单词的标签作为特征
def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    if i == 0:
        prevword, prevpos = '<START>', '<START>'
    else:
        prevword, prevpos = sentence[i - 1]
    return {'pos': pos, 'word': word, 'prevpos': prevpos}


chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  94.6%%
    Precision:     84.6%%
    Recall:        89.8%%
    F-Measure:     87.1%%


In [55]:
# 4) 第四个特征提取器，使用了多种附加特征
#   * 预取特征
#   * 配对功能
#   * 复杂的语境特征
#   * tags-since-dt：用其创建一个字符串，描述自最近限定词以来遇到的所有词性标记
def tags_since_dt(sentence, i):
    tags = set()
    for word, pos in sentence[:i]:
        if pos == 'DT':
            tags = set()
        else:
            tags.add(pos)
    return '+'.join(sorted(tags))


def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    if i == 0:
        prevword, prevpos = '<START>', '<START>'
    else:
        prevword, prevpos = sentence[i - 1]
    if i == len(sentence) - 1:
        nextword, nextpos = '<END>', '<END>'
    else:
        nextword, nextpos = sentence[i + 1]
    return {
        'pos': pos,
        'word': word,
        'prevpos': prevpos,
        'nextpos': nextpos,
        'prevpos+pos': '%s+%s' % (prevpos, pos),
        'pos+nextpos': '%s+%s' % (pos, nextpos),
        'tags-sincce-dt': tags_since_dt(sentence, i)
    }


chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  96.0%%
    Precision:     88.3%%
    Recall:        91.1%%
    F-Measure:     89.7%%
