In [1]:
import nltk
import re

# Ch07 从文本提取信息

学习目标

1.  从非结构化文本中提取结构化数据
2.  识别一个文本中描述的实体和关系
3.  使用语料库来训练和评估模型

## 7.4 语言结构中的递归

### 7.4.1 使用级联分块器构建嵌套的结构

In [2]:
# Ex7-6: four-stage chunk grammar covering NP (noun phrase), PP (prepositional
# phrase), VP (verb phrase) and CLAUSE (sentence pattern).
# The "#" text inside the raw string is part of the grammar string itself.
grammar = r'''
NP: {<DT|JJ|NN.*>+}             # Chunk sequences of DT, JJ, NN
PP: {<IN><NP>}                  # Chunk prepositions followed by NP
VP: {<VB.*><NP|PP|CLAUSE>+$}    # Chunk verbs and their arguments
CLAUSE: {<NP><VP>}              # Chunk NP, VP
'''

In [9]:
# Tagged tokens for "Mary saw the cat sit on the mat".
sentence = [
    ("Mary", "NN"), ("saw", "VBD"),
    ("the", "DT"), ("cat", "NN"),
    ("sit", "VB"),
    ("on", "IN"), ("the", "DT"), ("mat", "NN"),
]
# Parsing without a loop argument fails to recognise the VP headed by "saw".
cp = nltk.RegexpParser(grammar)
print(cp.parse(sentence))

(S
  (NP Mary/NN)
  saw/VBD
  (CLAUSE
    (NP the/DT cat/NN)
    (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))


In [10]:
# Raising the number of passes to loop=2 lets the parser recognise the VP headed by "saw".
cp = nltk.RegexpParser(grammar, loop=2)
print(cp.parse(sentence))

(S
  (CLAUSE
    (NP Mary/NN)
    (VP
      saw/VBD
      (CLAUSE
        (NP the/DT cat/NN)
        (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))))


In [12]:
# A longer example with one more level of embedding: "John thinks Mary saw ...".
sentence = [
    ("John", "NNP"), ("thinks", "VBZ"),
    ("Mary", "NN"), ("saw", "VBD"),
    ("the", "DT"), ("cat", "NN"),
    ("sit", "VB"),
    ("on", "IN"), ("the", "DT"), ("mat", "NN"),
]

# Without a loop argument neither the VP headed by "saw" nor the one headed by
# "thinks" is recognised.
cp = nltk.RegexpParser(grammar)
print(cp.parse(sentence))

(S
  (NP John/NNP)
  thinks/VBZ
  (NP Mary/NN)
  saw/VBD
  (CLAUSE
    (NP the/DT cat/NN)
    (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))


In [13]:
# With loop=2 the VP headed by "saw" is recognised,
# but the VP headed by "thinks" is still missed.
cp = nltk.RegexpParser(grammar, loop=2)
print(cp.parse(sentence))

(S
  (NP John/NNP)
  thinks/VBZ
  (CLAUSE
    (NP Mary/NN)
    (VP
      saw/VBD
      (CLAUSE
        (NP the/DT cat/NN)
        (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))))


In [14]:
# With loop=3 both the VP headed by "saw" and the VP headed by "thinks" are recognised.
cp = nltk.RegexpParser(grammar, loop=3)
print(cp.parse(sentence))

(S
  (CLAUSE
    (NP John/NNP)
    (VP
      thinks/VBZ
      (CLAUSE
        (NP Mary/NN)
        (VP
          saw/VBD
          (CLAUSE
            (NP the/DT cat/NN)
            (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))))))


In [15]:
# Extra passes do not affect the correctness of the result (the tree printed is
# the same as with loop=3); they only add computation time.
cp = nltk.RegexpParser(grammar, loop=4)
print(cp.parse(sentence))

(S
  (CLAUSE
    (NP John/NNP)
    (VP
      thinks/VBZ
      (CLAUSE
        (NP Mary/NN)
        (VP
          saw/VBD
          (CLAUSE
            (NP the/DT cat/NN)
            (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))))))


In [16]:
# Keep the parse result in a variable so it can be both printed and drawn.
cp = nltk.RegexpParser(grammar, loop=4)
parsed_sent = cp.parse(sentence)
print(parsed_sent)
# NOTE(review): draw() presumably opens NLTK's graphical tree viewer and so needs a display — confirm before running headless.
parsed_sent.draw()

(S
  (CLAUSE
    (NP John/NNP)
    (VP
      thinks/VBZ
      (CLAUSE
        (NP Mary/NN)
        (VP
          saw/VBD
          (CLAUSE
            (NP the/DT cat/NN)
            (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))))))


虽然级联过程可以创建深层结构，但级联分块器的编写与调试非常困难，而且只能产生固定深度的树状图（深度受分块语法的级数和 loop 循环次数限制），仍然属于不完整的句法分析。因此，到了一定程度，进行完整的句法剖析才更有效（参见第 8 章）。

### 7.4.2 树状图
树状图是一组相互连接的带标签节点，从一个特殊的根节点出发，沿唯一的路径可以到达每一个节点。

In [17]:
# Build a tree: the node label comes first, followed by the list of children.
tree1 = nltk.Tree('NP', ['Alice'])
print(tree1)

(NP Alice)


In [18]:
# A node may have several leaves as children.
tree2 = nltk.Tree('NP', ['the', 'rabbit'])
print(tree2)

(NP the rabbit)


In [19]:
# Combine into a larger tree: an existing tree (tree2) can itself be used as a child.
tree3 = nltk.Tree('VP', ['chased', tree2])
print(tree3)

(VP chased (NP the rabbit))


In [20]:
# The sentence-level tree S joins the NP (tree1) and the VP (tree3).
tree4 = nltk.Tree('S', [tree1, tree3])
print(tree4)

(S (NP Alice) (VP chased (NP the rabbit)))


In [24]:
# Access parts of the tree by indexing: each index selects a child, and indices
# can be chained to reach deeper nodes.
print("tree4= ", tree4)
print("tree4[0]= ", tree4[0])
print("tree4[1]= ", tree4[1])
print("tree4[1][1]= ", tree4[1][1])
print("tree4[1][1][0]= ", tree4[1][1][0])

tree4=  (S (NP Alice) (VP chased (NP the rabbit)))
tree4[0]=  (NP Alice)
tree4[1]=  (VP chased (NP the rabbit))
tree4[1][1]=  (NP the rabbit)
tree4[1][1][0]=  the


In [31]:
# Call the tree's methods: label() gives the node label, leaves() the list of leaf tokens.
print("tree4= ", tree4)
print("tree4.label()= ", tree4.label())
print("tree4.leaves()= ", tree4.leaves())

tree4=  (S (NP Alice) (VP chased (NP the rabbit)))
tree4.label()=  S
tree4.leaves()=  ['Alice', 'chased', 'the', 'rabbit']


In [32]:
# The same methods work on a subtree, here the VP child.
print("tree4[1]= ", tree4[1])
print("tree4[1].label()= ", tree4[1].label())
print("tree4[1].leaves()= ", tree4[1].leaves())

tree4[1]=  (VP chased (NP the rabbit))
tree4[1].label()=  VP
tree4[1].leaves()=  ['chased', 'the', 'rabbit']


In [33]:
# One level deeper: the NP nested inside the VP.
print("tree4[1][1]= ", tree4[1][1])
print("tree4[1][1].label()= ", tree4[1][1].label())
print("tree4[1][1].leaves()= ", tree4[1][1].leaves())

tree4[1][1]=  (NP the rabbit)
tree4[1][1].label()=  NP
tree4[1][1].leaves()=  ['the', 'rabbit']


In [34]:
# Indexing all the way down reaches a leaf.
print("tree4[1][1][0]= ", tree4[1][1][0])
# The leaf is the plain string 'the' that the tree was built from, so it has no
# label()/leaves() methods; the two calls below are intentionally left disabled.
# print("tree4[1][1][0].label()= ", tree4[1][1][0].label())
# print("tree4[1][1][0].leaves()= ", tree4[1][1][0].leaves())

tree4[1][1][0]=  the


In [35]:
# NOTE(review): draw() presumably opens NLTK's graphical tree viewer — confirm a display is available.
tree4.draw()

### 7.4.3 树的遍历

In [36]:
# Ex7-7: walk a tree recursively and print it in bracketed form
def traverse(t):
    """Print ``t`` depth-first on one line, items separated by single spaces.

    An object whose ``label()`` call raises ``AttributeError`` is treated as a
    leaf and printed as-is.  Anything else is printed as
    ``( <label> <children...> )``, with every child handled by a recursive
    call.  Nothing is returned and no trailing newline is written.
    """
    try:
        node_label = t.label()
    except AttributeError:
        # Leaves (e.g. plain strings) have no label(): emit the token itself.
        print(t, end=' ')
        return

    print('(', node_label, end=' ')
    for subtree in t:
        traverse(subtree)
    print(')', end=' ')

In [37]:
# Tree() can no longer build a tree directly from a bracketed string; use Tree.fromstring() instead.
t = nltk.Tree.fromstring('(S (NP Alice) (VP chased (NP the rabbit)))')
print(t)
traverse(t)
t.draw()

(S (NP Alice) (VP chased (NP the rabbit)))
( S ( NP Alice ) ( VP chased ( NP the rabbit ) ) ) 

In [38]:
# Tree() can no longer build a tree directly from a bracketed string, so
# round-trip tree4 through its string form and Tree.fromstring() instead.
# str(tree4) is the idiomatic spelling of tree4.__str__() and yields the same text.
t = nltk.Tree.fromstring(str(tree4))
print(t)
traverse(t)
t.draw()

(S (NP Alice) (VP chased (NP the rabbit)))
( S ( NP Alice ) ( VP chased ( NP the rabbit ) ) ) 

In [40]:
# traverse() also works directly on the hand-built tree4.
print(tree4)
traverse(tree4)
tree4.draw()

(S (NP Alice) (VP chased (NP the rabbit)))
( S ( NP Alice ) ( VP chased ( NP the rabbit ) ) ) 