In [1]:
import nltk
import re

# Ch07 从文本提取信息

学习目标

1.  从非结构化文本中提取结构化数据
2.  识别一个文本中描述的实体和关系
3.  使用语料库来训练和评估模型

## 7.1 信息提取

从文本中获取意义的方法被称为「信息提取」

1.  从结构化数据中提取信息
2.  从非结构化文本中提取信息
    -   建立一个非常一般的含义
    -   查找文本中具体的各种信息
        -   将非结构化数据转换成结构化数据
        -   使用查询工具从文本中提取信息

### 7.1.1 信息提取结构
｛原始文本（字符串）｝→ 断句 

→｛句子（字符串列表）｝→ 分词 

→｛句子分词｝→ 词性标注 

→｛句子词性标注｝→ 实体识别 

→｛句子分块｝→ 关系识别 

→｛关系列表｝

In [2]:
# Book p.283, Fig. 7-1 / Ex. 7-1: information-extraction facts stored as
# (entity, relation, entity) triples. `locs` acts as a small normalized database.
locs = [
    ('Omnicom', 'IN', 'New York'),
    ('DDB Needham', 'IN', 'New York'),
    ('Kaplan Thaler Group', 'IN', 'New York'),
    ('BBDO South', 'IN', 'Atlanta'),
    ('Georgia-Pacific', 'IN', 'Atlanta'),
]
# Query the triples: keep the first entity of every triple whose second
# entity is 'Atlanta'.
query = [first for first, _relation, second in locs if second == 'Atlanta']
print(query)

['BBDO South', 'Georgia-Pacific']


## 7.2 分块：用于实体识别的基本技术（P284 图7-2）

分块构成的源文本中的片段不能重叠

-   小框显示词级标识符和词性标注
-   大框表示较高级别的分块，即组块（chunk）

分块的方法

-  正则表达式和N-gram方法分块；
-  使用CoNLL-2000分块语料库开发和评估分块器；

### 7.2.1 名词短语分块（NP-chunking，即“NP-分块”）寻找单独名词短语对应的块

NP-分块通常是比完整的名词短语更小的片段：它不包含其他的NP-分块，修饰名词的任何介词短语或者从句也不包括在相应的NP-分块内。
 
NP-分块信息最有用的来源之一是词性标记。

In [3]:
# Book p.285, Ex. 7-1: a regular-expression based NP chunker.
# Input is one sentence that is already POS-tagged as (word, tag) pairs.
sentence = [
    ('the', 'DT'), ('little', 'JJ'), ('yellow', 'JJ'), ('dog', 'NN'),
    ('barked', 'VBD'), ('at', 'IN'),
    ('the', 'DT'), ('cat', 'NN'),
]
# Chunk grammar: an NP is an optional DT, any number of JJ, then a single NN.
grammar = 'NP: {<DT>?<JJ>*<NN>}'
# Build the chunk parser from the grammar, then chunk the tagged sentence.
cp = nltk.RegexpParser(grammar)
result = cp.parse(sentence)
# Show the resulting chunk tree as text ...
print(result)
# ... and as a drawing.
# NOTE(review): draw() presumably opens a separate GUI window — confirm this
# cell behaves acceptably in a headless / "Run All" session.
result.draw()

(S
  (NP the/DT little/JJ yellow/JJ dog/NN)
  barked/VBD
  at/IN
  (NP the/DT cat/NN))


### 7.2.2. 标记模式

In [4]:
# Tagged noun-phrase examples (the original note attributes them to the
# Wall Street Journal), laid out one phrase per line.
sentence = [
    ('another', 'DT'), ('sharp', 'JJ'), ('dive', 'NN'),
    ('trade', 'NN'), ('figures', 'NNS'),
    ('any', 'DT'), ('new', 'JJ'), ('policy', 'NN'), ('measures', 'NNS'),
    ('earlier', 'JJR'), ('stages', 'NNS'),
    ('Panamanian', 'JJ'), ('dictator', 'NN'), ('Manuel', 'NNP'), ('Noriega', 'NNP'),
]
# Tag pattern: optional DT, any number of tags starting with JJ, then one or
# more tags starting with NN.
grammar = 'NP: {<DT>?<JJ.*>*<NN.*>+}'
cp = nltk.RegexpParser(grammar)
result = cp.parse(sentence)
print(result)

(S
  (NP another/DT sharp/JJ dive/NN trade/NN figures/NNS)
  (NP any/DT new/JJ policy/NN measures/NNS)
  (NP earlier/JJR stages/NNS)
  (NP Panamanian/JJ dictator/NN Manuel/NNP Noriega/NNP))


In [5]:
# Start NLTK's chunk-parser application.
# In its "Grammar" field enter only the tag pattern, e.g. {<DT>?<JJ.*>*<NN.*>+};
# do not prefix it with "NP:". See the "Regexps" help on the right for the syntax.
# "Development Set" is the dev-test set used for debugging grammar rules:
# green marks a correct match, red marks an incorrect one, and the
# gold-standard annotation is shown underlined.
nltk.app.chunkparser()

### 7.2.3 用正则表达式分块（组块分析）

In [6]:
# Ex. 7-2: a simple noun-phrase chunker built from two rules.
sentence = [
    ("Rapunzel", "NNP"),
    ("let", "VBD"), ("down", "RP"),
    ("her", "PP$"), ("long", "JJ"), ("golden", "JJ"), ("hair", "NN"),
]
# The chunk grammar has two rules. Rules are applied in order; where two rules
# would cover overlapping tokens, the rule applied first wins.
grammar = r'''
  NP: {<DT|PP\$>?<JJ>*<NN>}   # chunk determiner/possessive, adjectives and noun
      {<NNP>+}                # chunk sequences of proper nouns
'''
two_rule_chunker = nltk.RegexpParser(grammar)
chunked = two_rule_chunker.parse(sentence)
print(chunked)

(S
  (NP Rapunzel/NNP)
  let/VBD
  down/RP
  (NP her/PP$ long/JJ golden/JJ hair/NN))


In [7]:
# One or more consecutive tags whose first letter is C, D or J.
grammar = r'NP: {<[CDJ].*>+}'
chunked = nltk.RegexpParser(grammar).parse(sentence)
print(chunked)

(S
  Rapunzel/NNP
  let/VBD
  down/RP
  her/PP$
  (NP long/JJ golden/JJ)
  hair/NN)


In [8]:
# One or more consecutive tags whose first letter is C, D, J, N or P.
grammar = r'NP: {<[CDJNP].*>+}'
chunked = nltk.RegexpParser(grammar).parse(sentence)
print(chunked)

(S
  (NP Rapunzel/NNP)
  let/VBD
  down/RP
  (NP her/PP$ long/JJ golden/JJ hair/NN))


In [9]:
# One or more consecutive tags whose first letter is C, D, J or N.
grammar = r'NP: {<[CDJN].*>+}'
chunked = nltk.RegexpParser(grammar).parse(sentence)
print(chunked)

(S
  (NP Rapunzel/NNP)
  let/VBD
  down/RP
  her/PP$
  (NP long/JJ golden/JJ hair/NN))


In [10]:
# When pattern matches overlap, the leftmost match takes precedence.
# Example: a rule that matches two consecutive nouns, applied to a text with
# three consecutive nouns, chunks only the first two of them.
nouns = [('money', 'NN'), ('market', 'NN'), ('fund', 'NN')]
# Run both grammars in turn: the fixed-length rule first, then the rule that
# takes the whole noun run.
for caption, grammar in (
    ("错误分块的结果= ", 'NP: {<NN><NN>}'),
    ("正确分块的结果= ", 'NP: {<NN>+}'),
):
    print(caption, nltk.RegexpParser(grammar).parse(nouns))

错误分块的结果=  (S (NP money/NN market/NN) fund/NN)
正确分块的结果=  (S (NP money/NN market/NN fund/NN))


### 7.2.4 探索文本语料库：从已经标注的语料库中提取匹配特定词性标记序列的短语

In [11]:
# Search the tagged Brown corpus for "verb TO verb" sequences and print the
# first 10 CHUNK subtrees found.
grammar = 'CHUNK: {<V.*><TO><V.*>}'
cp = nltk.RegexpParser(grammar)
brown = nltk.corpus.brown
count = 0
for sent in brown.tagged_sents():
    # Stop scanning as soon as the quota is reached instead of iterating over
    # every remaining sentence of the corpus.
    if count >= 10:
        break
    tree = cp.parse(sent)
    for subtree in tree.subtrees():
        if subtree.label() == 'CHUNK':
            count += 1
            print(subtree)
            # A single sentence may contain several matches; stop inside the
            # sentence too so that no more than 10 chunks are ever printed.
            if count >= 10:
                break

(CHUNK combined/VBN to/TO achieve/VB)
(CHUNK continue/VB to/TO place/VB)
(CHUNK serve/VB to/TO protect/VB)
(CHUNK wanted/VBD to/TO wait/VB)
(CHUNK allowed/VBN to/TO place/VB)
(CHUNK expected/VBN to/TO become/VB)
(CHUNK expected/VBN to/TO approve/VB)
(CHUNK expected/VBN to/TO make/VB)
(CHUNK intends/VBZ to/TO make/VB)
(CHUNK seek/VB to/TO set/VB)


In [12]:
# Search helper (prints a fixed number of matches in one call).
def find_chunks(pattern, limit=10):
    """Print the first ``limit`` CHUNK/NOUNS subtrees found in the tagged Brown corpus.

    Args:
        pattern: Chunk grammar passed to ``nltk.RegexpParser``. Only subtrees
            labelled ``'CHUNK'`` or ``'NOUNS'`` are reported.
        limit: Maximum number of subtrees to print (default 10, the value that
            was previously hard-coded).

    Returns:
        None. Matches are written to standard output, one ``print`` per subtree.
    """
    cp = nltk.RegexpParser(pattern)
    if limit <= 0:
        return
    count = 0
    for sent in nltk.corpus.brown.tagged_sents():
        tree = cp.parse(sent)
        for subtree in tree.subtrees():
            if subtree.label() in ('CHUNK', 'NOUNS'):
                print(subtree)
                count += 1
                # Return immediately once the quota is met: a sentence with
                # several matches must not push the output past the limit, and
                # the rest of the corpus does not need to be scanned.
                if count >= limit:
                    return

In [13]:
# Tag pattern: a tag starting with V, then TO, then another tag starting with V.
grammar = 'CHUNK: {<V.*><TO><V.*>}'
# Run the search helper on this grammar.
find_chunks(grammar)

(CHUNK combined/VBN to/TO achieve/VB)
(CHUNK continue/VB to/TO place/VB)
(CHUNK serve/VB to/TO protect/VB)
(CHUNK wanted/VBD to/TO wait/VB)
(CHUNK allowed/VBN to/TO place/VB)
(CHUNK expected/VBN to/TO become/VB)
(CHUNK expected/VBN to/TO approve/VB)
(CHUNK expected/VBN to/TO make/VB)
(CHUNK intends/VBZ to/TO make/VB)
(CHUNK seek/VB to/TO set/VB)


In [14]:
# Tag pattern: four or more consecutive tags starting with N.
grammar = 'NOUNS: {<N.*>{4,}}'
# Run the search helper on this grammar.
find_chunks(grammar)

(NOUNS Court/NN-TL Judge/NN-TL Durwood/NP Pye/NP)
(NOUNS Mayor-nominate/NN-TL Ivan/NP Allen/NP Jr./NP)
(NOUNS Georgia's/NP$ automobile/NN title/NN law/NN)
(NOUNS State/NN-TL Welfare/NN-TL Department's/NN$-TL handling/NN)
(NOUNS Fulton/NP-TL Tax/NN-TL Commissioner's/NN$-TL Office/NN-TL)
(NOUNS Mayor/NN-TL William/NP B./NP Hartsfield/NP)
(NOUNS Mrs./NP J./NP M./NP Cheshire/NP)
(NOUNS E./NP Pelham/NP Rd./NN-TL Aj/NN)
(NOUNS
  State/NN-TL
  Party/NN-TL
  Chairman/NN-TL
  James/NP
  W./NP
  Dorsey/NP)
(NOUNS Texas/NP Sen./NN-TL John/NP Tower/NP)


In [15]:
# Search helper (generator version).
def find_chunks(pattern):
    """Lazily yield every CHUNK/NOUNS subtree found in the tagged Brown corpus.

    Args:
        pattern: Chunk grammar passed to ``nltk.RegexpParser``.

    Yields:
        Each subtree of each parsed sentence whose label is ``'CHUNK'`` or
        ``'NOUNS'``, in corpus order.
    """
    wanted_labels = ('CHUNK', 'NOUNS')
    chunker = nltk.RegexpParser(pattern)
    for tagged_sentence in nltk.corpus.brown.tagged_sents():
        parsed = chunker.parse(tagged_sentence)
        yield from (
            node for node in parsed.subtrees()
            if node.label() in wanted_labels
        )

In [16]:
# Print the first 10 "verb TO verb" chunks, then stop pulling from the
# generator so the remaining corpus sentences are never parsed.
grammar = 'CHUNK: {<V.*><TO><V.*>}'
for i, subtree in enumerate(find_chunks(grammar), start=1):
    print(subtree)
    if i >= 10:
        break

(CHUNK combined/VBN to/TO achieve/VB)
(CHUNK continue/VB to/TO place/VB)
(CHUNK serve/VB to/TO protect/VB)
(CHUNK wanted/VBD to/TO wait/VB)
(CHUNK allowed/VBN to/TO place/VB)
(CHUNK expected/VBN to/TO become/VB)
(CHUNK expected/VBN to/TO approve/VB)
(CHUNK expected/VBN to/TO make/VB)
(CHUNK intends/VBZ to/TO make/VB)
(CHUNK seek/VB to/TO set/VB)


In [17]:
# Print the first 10 runs of four or more noun-like tags, then stop pulling
# from the generator so the remaining corpus sentences are never parsed.
grammar = 'NOUNS: {<N.*>{4,}}'
for i, subtree in enumerate(find_chunks(grammar), start=1):
    print(subtree)
    if i >= 10:
        break

(NOUNS Court/NN-TL Judge/NN-TL Durwood/NP Pye/NP)
(NOUNS Mayor-nominate/NN-TL Ivan/NP Allen/NP Jr./NP)
(NOUNS Georgia's/NP$ automobile/NN title/NN law/NN)
(NOUNS State/NN-TL Welfare/NN-TL Department's/NN$-TL handling/NN)
(NOUNS Fulton/NP-TL Tax/NN-TL Commissioner's/NN$-TL Office/NN-TL)
(NOUNS Mayor/NN-TL William/NP B./NP Hartsfield/NP)
(NOUNS Mrs./NP J./NP M./NP Cheshire/NP)
(NOUNS E./NP Pelham/NP Rd./NN-TL Aj/NN)
(NOUNS
  State/NN-TL
  Party/NN-TL
  Chairman/NN-TL
  James/NP
  W./NP
  Dorsey/NP)
(NOUNS Texas/NP Sen./NN-TL John/NP Tower/NP)


### 7.2.5. 添加缝隙：寻找需要排除的成分

缝隙（chink）是不包括在块（chunk）中的标识符序列；可以通过定义缝隙来指明需要从块中排除的成分。

In [18]:
# POS-tagged example sentence, as (word, tag) pairs, used by the chinking cells.
sentence = [
    ("the", "DT"), ("little", "JJ"), ("yellow", "JJ"), ("dog", "NN"),
    ("barked", "VBD"), ("at", "IN"),
    ("the", "DT"), ("cat", "NN"),
]

In [19]:
# Chunk first, then chink: this rule order is the one that gives the intended result.
grammar = r'''
    NP: 
        {<.*>+}         # Chunk everything （先对所有数据分块）
        }<VBD|IN>+{     # Chink sequences of VBD and IN（对 VBD 或者 IN 加缝隙）
'''
chink_parser = nltk.RegexpParser(grammar)
chunked = chink_parser.parse(sentence)
print(chunked)

(S
  (NP the/DT little/JJ yellow/JJ dog/NN)
  barked/VBD
  at/IN
  (NP the/DT cat/NN))


In [20]:
# Chink first, then chunk: this order does NOT give the intended result. Only a
# single chunk is produced, the same as when no chink rule is used at all.
grammar = r'''
    NP: 
        }<VBD|IN>+{     # Chink sequences of VBD and IN
        {<.*>+}         # Chunk everything
'''
chink_parser = nltk.RegexpParser(grammar)
chunked = chink_parser.parse(sentence)
print(chunked)

(S
  (NP
    the/DT
    little/JJ
    yellow/JJ
    dog/NN
    barked/VBD
    at/IN
    the/DT
    cat/NN))


In [21]:
# Baseline for comparison: a single rule that chunks every token, with no chink rule.
grammar = r'''
    NP: 
        {<.*>+}         # Chunk everything
'''
chink_parser = nltk.RegexpParser(grammar)
chunked = chink_parser.parse(sentence)
print(chunked)

(S
  (NP
    the/DT
    little/JJ
    yellow/JJ
    dog/NN
    barked/VBD
    at/IN
    the/DT
    cat/NN))


### 7.2.6 分块的表示：标记与树状图

作为「标注」和「分析」之间的中间状态（Ref：Ch8），块结构可以使用标记或者树状图来表示

使用最为广泛的表示是IOB标记：

-   I（Inside，内部）；
-   O（Outside，外部）；
-   B（Begin，开始）。