In [1]:
import random

import nltk
from nltk.corpus import names

from tools import show_subtitle

# Chap6 学习分类文本

学习目标：

1.  识别出语言数据中可以用于分类的特征
2.  构建用于自动执行语言处理任务的语言模型
3.  从语言模型中学习与语言相关的知识

## 6.2 有监督分类的应用场景

### 6.2.1 句子分割
（标点符号的分类任务，遇到可能会结束句子的符号时，二元判断是否应该断句）

In [2]:
# Step 1: load text that has already been split into sentences
sents = nltk.corpus.treebank_raw.sents()
tokens, boundaries, offset = [], set(), 0
# Flatten every sentence into one token list and record the index of each
# sentence's final token.
for sent in sents:
    tokens += sent  # merged token list across all sentences
    offset = len(tokens)  # running count of tokens consumed so far
    boundaries.add(offset - 1)  # index of the token that closes this sentence

In [3]:
# Feature extractor for punctuation tokens
def punct_features(tokens, i):
    """
    Build the feature dict describing the punctuation token at position ``i``.

    Reads ``tokens[i - 1]``, ``tokens[i]`` and ``tokens[i + 1]``, so the caller
    is expected to pass an index that has a neighbour on both sides.

    :param tokens: the tokenized text
    :type tokens: sequence of str
    :param i: position of the punctuation token to describe
    :type i: int
    :return: features keyed by 'next-word-capitalized', 'prevword', 'punct'
             and 'prev-word-is-one-char'
    :rtype: dict
    """
    following = tokens[i + 1]
    next_is_capitalized = following[0].isupper()
    preceding = tokens[i - 1]
    features = {}
    features['next-word-capitalized'] = next_is_capitalized
    features['prevword'] = preceding.lower()
    features['punct'] = tokens[i]
    features['prev-word-is-one-char'] = len(preceding) == 1
    return features

In [4]:
# Step 2: build the feature sets for the punctuation tokens.
# Each entry is (features, label); the label is True when index i is a
# recorded sentence boundary. The range skips the first and last token because
# punct_features reads tokens[i - 1] and tokens[i + 1]. Note that
# `tokens[i] in '.?!'` is a substring test against the string '.?!'.
feature_sets = [(punct_features(tokens, i), (i in boundaries))
                for i in range(1, len(tokens) - 1)
                if tokens[i] in '.?!']
# Preview the first 10 entries; slicing avoids walking the whole list.
for i, feature in enumerate(feature_sets[:10]):
    print(i, ")", feature)

0 ) ({'next-word-capitalized': False, 'prevword': 'nov', 'punct': '.', 'prev-word-is-one-char': False}, False)
1 ) ({'next-word-capitalized': True, 'prevword': '29', 'punct': '.', 'prev-word-is-one-char': False}, True)
2 ) ({'next-word-capitalized': True, 'prevword': 'mr', 'punct': '.', 'prev-word-is-one-char': False}, False)
3 ) ({'next-word-capitalized': True, 'prevword': 'n', 'punct': '.', 'prev-word-is-one-char': True}, False)
4 ) ({'next-word-capitalized': False, 'prevword': 'group', 'punct': '.', 'prev-word-is-one-char': False}, True)
5 ) ({'next-word-capitalized': True, 'prevword': '.', 'punct': '.', 'prev-word-is-one-char': True}, False)
6 ) ({'next-word-capitalized': False, 'prevword': 'conglomerate', 'punct': '.', 'prev-word-is-one-char': False}, True)
7 ) ({'next-word-capitalized': True, 'prevword': '.', 'punct': '.', 'prev-word-is-one-char': True}, False)
8 ) ({'next-word-capitalized': True, 'prevword': 'reported', 'punct': '.', 'prev-word-is-one-char': False}, True)
9 ) ({

In [5]:
# Train and evaluate a punctuation classifier on these feature sets:
# the first 10% is held out for testing, the remainder is used for training.
size = int(len(feature_sets) * 0.1)
test_set = feature_sets[:size]
train_set = feature_sets[size:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.936026936026936

In [6]:
# Ex6-6: classifier-based sentence segmenter
# TODO: the idea is to let the classifier decide where sentences end, but this
# notebook provides no sample data to exercise it.
def segment_sentences(words):
    """
    Split a flat token list into sentences using the punctuation classifier.

    Relies on the module-level ``classifier`` (the one trained on the
    punctuation feature sets, whose labels are booleans) and on
    ``punct_features``. A later cell rebinds ``classifier`` to a different
    model, so call this while the punctuation classifier is still bound.

    :param words: the tokenized text
    :type words: list of str
    :return: the sentences, each one a slice of ``words``
    :rtype: list of list of str
    """
    start = 0
    sents = []
    last = len(words) - 1
    for i, word in enumerate(words):
        # The final token is never classified: punct_features needs
        # tokens[i + 1], and everything from `start` onward is flushed below
        # anyway, so the result is the same either way.
        if i < last and word in '.?!' and classifier.classify(punct_features(words, i)) == True:
            sents.append(words[start:i + 1])
            start = i + 1  # the next sentence begins right after this boundary
    # Flush whatever remains after the last detected boundary.
    if start < len(words):
        sents.append(words[start:])
    return sents

### 6.2.2 识别对话行为类型

In [7]:
# Dialogue act types:
# Statement, System, Greet, Emotion, ynQuestion, whQuestion, Accept, Bye, Emphasis, Continuer, Reject, yAnswer, nAnswer, Clarify, Other
posts = nltk.corpus.nps_chat.xml_posts()[:10000]
# Preview the first 10 posts; slicing avoids enumerating all of them.
for i, post in enumerate(posts[:10]):
    print(i, ')', post)

0 ) <Element 'Post' at 0x000001D0FD2DABD8>
1 ) <Element 'Post' at 0x000001D0FD2EF368>
2 ) <Element 'Post' at 0x000001D0FD2EFA48>
3 ) <Element 'Post' at 0x000001D0FD2EF458>
4 ) <Element 'Post' at 0x000001D0FD2A3F48>
5 ) <Element 'Post' at 0x000001D0FD292728>
6 ) <Element 'Post' at 0x000001D0FD300E08>
7 ) <Element 'Post' at 0x000001D0FD311B38>
8 ) <Element 'Post' at 0x000001D0FD2DADB8>
9 ) <Element 'Post' at 0x000001D0FD2DAE08>


In [8]:
def dialogue_act_features(post):
    """
    Build bag-of-words style features for one chat post.

    :param post: the raw text of the post
    :type post: str
    :return: a dict with one ``'contains(<lowercased token>)': True`` entry per
             distinct token produced by ``nltk.word_tokenize``
    :rtype: dict
    """
    return {
        'contains({})'.format(token.lower()): True
        for token in nltk.word_tokenize(post)
    }

In [9]:
# Pair each post's features with its dialogue-act label (the 'class' attribute).
feature_sets = [
    (dialogue_act_features(post.text), post.get('class'))
    for post in posts
]
# Preview the first 10 entries; slicing avoids walking the whole list.
for i, feature in enumerate(feature_sets[:10]):
    print(i, ')', feature)

0 ) ({'contains(now)': True, 'contains(im)': True, 'contains(left)': True, 'contains(with)': True, 'contains(this)': True, 'contains(gay)': True, 'contains(name)': True}, 'Statement')
1 ) ({'contains(:)': True, 'contains(p)': True}, 'Emotion')
2 ) ({'contains(part)': True}, 'System')
3 ) ({'contains(hey)': True, 'contains(everyone)': True}, 'Greet')
4 ) ({'contains(ah)': True, 'contains(well)': True}, 'Statement')
5 ) ({'contains(nick)': True, 'contains(:10-19-20suser7)': True}, 'System')
6 ) ({'contains(10-19-20suser7)': True, 'contains(is)': True, 'contains(a)': True, 'contains(gay)': True, 'contains(name)': True, 'contains(.)': True}, 'Accept')
7 ) ({'contains(.action)': True, 'contains(gives)': True, 'contains(10-19-20suser121)': True, 'contains(a)': True, 'contains(golf)': True, 'contains(clap)': True, 'contains(.)': True}, 'System')
8 ) ({'contains(:)': True, 'contains())': True}, 'Emotion')
9 ) ({'contains(join)': True}, 'System')


In [10]:
# Frequency of each dialogue-act label, most common first
classes = [label for (_features, label) in feature_sets]
classes_fd = nltk.FreqDist(classes)
classes_fd.most_common()

[('Statement', 3058),
 ('System', 2380),
 ('Greet', 1325),
 ('Emotion', 1073),
 ('ynQuestion', 511),
 ('whQuestion', 503),
 ('Accept', 224),
 ('Bye', 191),
 ('Emphasis', 182),
 ('Continuer', 161),
 ('Reject', 151),
 ('yAnswer', 102),
 ('nAnswer', 70),
 ('Clarify', 37),
 ('Other', 32)]

In [11]:
# Build the post classifier: hold out the first 10% of the feature sets for
# testing and train a Naive Bayes model on the rest.
size = int(len(feature_sets) * 0.1)
test_set = feature_sets[:size]
train_set = feature_sets[size:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.667

### 6.2.3 识别文字蕴涵 (Recognizing Textual Entailment, RTE)
-   判断一段给定的「文本 T」是否蕴涵另一段被称为「假设」的文本
-   「文本」和「假设」之间的关系并不一定是逻辑蕴涵，而是人们是否会得出结论：文本提供的合理证据证明假设是真实的
-   可以把RTE当作一个分类任务，尝试为每一对预测“True”/“False”标签
    -   “True”表示蕴涵成立；“False”表示蕴涵不成立

In [12]:
# Ex6-7: feature extractor for recognizing textual entailment
def rte_features(rtepair):
    """
    Count-based features for one text/hypothesis pair.

    Words (word types) serve as a proxy for information: the features count
    how much the text and the hypothesis overlap, and how much appears in the
    hypothesis but not in the text, for both plain words ('word') and named
    entities ('ne').

    :param rtepair: the text/hypothesis pair handed to ``nltk.RTEFeatureExtractor``
    :type rtepair:
    :return: counts keyed by 'word_overlap', 'word_hyp_extra', 'ne_overlap'
             and 'ne_hyp_extra'
    :rtype: dict
    """
    # Per the original author's notes, RTEFeatureExtractor builds a bag of
    # words for the text and the hypothesis with some stopwords removed; the
    # overlap and the hypothesis-only difference are then measured on it.
    extractor = nltk.RTEFeatureExtractor(rtepair)
    return {
        'word_overlap': len(extractor.overlap('word')),
        'word_hyp_extra': len(extractor.hyp_extra('word')),
        'ne_overlap': len(extractor.overlap('ne')),
        'ne_hyp_extra': len(extractor.hyp_extra('ne')),
    }

In [13]:
# Pick one text/hypothesis pair and show what the extractor derives from it
rtepair = nltk.corpus.rte.pairs(['rte3_dev.xml'])[33]
print(rtepair)
extractor = nltk.RTEFeatureExtractor(rtepair)
# (subtitle, lazily evaluated value) pairs; each value is computed only after
# its subtitle has been printed, so the output order stays title -> value.
sections = (
    ("文本中的单词", lambda: extractor.text_words),
    ("假设中的单词", lambda: extractor.hyp_words),
    ("文本和假设中重叠的单词（非实体词）", lambda: extractor.overlap('word')),
    ("文本和假设中重叠的实体词", lambda: extractor.overlap('ne')),
    ("文本和假设中差异的单词（非实体词）", lambda: extractor.hyp_extra('word')),
    ("文本和假设中差异的实体词", lambda: extractor.hyp_extra('ne')),
)
for subtitle, compute in sections:
    show_subtitle(subtitle)
    print(compute())

<RTEPair: gid=3-34>
--------------- >文本中的单词< ---------------
{'Iran', 'former', 'fledgling', 'Parviz', 'Shanghai', 'representing', 'Co', 'Russia', 'fight', 'Asia', 'that', 'China', 'at', 'four', 'terrorism.', 'Soviet', 'binds', 'SCO', 'operation', 'Organisation', 'association', 'Davudi', 'together', 'central', 'meeting', 'was', 'republics'}
--------------- >假设中的单词< ---------------
{'SCO.', 'China', 'member'}
--------------- >文本和假设中重叠的单词（非实体词）< ---------------
set()
--------------- >文本和假设中重叠的实体词< ---------------
{'China'}
--------------- >文本和假设中差异的单词（非实体词）< ---------------
{'member'}
--------------- >文本和假设中差异的实体词< ---------------
{'SCO.'}


### 6.2.4 扩展到大型的数据集
NLTK提供对专业的机器学习软件包的支持，调用它们会比NLTK提供的分类器性能更好

注：Scikit-Learn 的运行速度比 NLTK 自带的分类器快，但有些模型的效果不如 NLTK 的好