In [5]:
import random

import nltk
from nltk.corpus import brown
from nltk.corpus import names

from tools import show_subtitle

# Chap6 学习分类文本

学习目标：

1.  识别出语言数据中可以用于分类的特征
2.  构建用于自动执行语言处理任务的语言模型
3.  从语言模型中学习与语言相关的知识

## 6.1 有监督分类

-   分类是为了给定的输入选择正确的类标签。
-   监督式分类：建立在训练语料基础之上的分类

### 6.1.1 性别鉴定

In [2]:
# 以名字的最后一个字母为特征
def gender_features(word):
    return {'last_letter': word[-1]}


gender_features('Shrek')

{'last_letter': 'k'}

In [3]:
# 原始数据集合
labeled_names = (
        [
                (name, 'male')
                for name in names.words('male.txt')
        ]
        +
        [
                (name, 'female')
                for name in names.words('female.txt')
        ]
)
# 乱序排序数据集
random.shuffle(labeled_names)

In [7]:
# 特征数据集合
feature_sets = [
        (gender_features(n), gender)
        for (n, gender) in labeled_names
]

# 训练数据集合 和 测试数据集合
train_set, test_set = feature_sets[500:], feature_sets[:500]

In [8]:
# 使用 NLTK 的 朴素贝叶斯 分类器进行分类训练
classifier = nltk.NaiveBayesClassifier.train(train_set)
print("NLTK-NB 分类器性能评估= ", nltk.classify.accuracy(classifier, test_set))

show_subtitle("Neo's feature")
print(gender_features('Neo'))
print("Neo is", classifier.classify(gender_features('Neo')))
show_subtitle("Trinity's feature")
print(gender_features('Trinity'))
print("Trinity is", classifier.classify(gender_features('Trinity')))

show_subtitle("信息量大的特征")
classifier.show_most_informative_features(5)

NLTK-NB 分类器性能评估=  0.772
--------------- >Neo's feature< ---------------
{'last_letter': 'o'}
Neo is male
--------------- >Trinity's feature< ---------------
{'last_letter': 'y'}
Trinity is female
--------------- >信息量大的特征< ---------------
Most Informative Features
             last_letter = 'a'            female : male   =     33.0 : 1.0
             last_letter = 'k'              male : female =     31.7 : 1.0
             last_letter = 'v'              male : female =     18.6 : 1.0
             last_letter = 'f'              male : female =     14.6 : 1.0
             last_letter = 'p'              male : female =     12.5 : 1.0


In [9]:
# 使用 Scikit-Learn 的 GaussNB 朴素贝叶斯 分类器进行分类
import numpy as np
from sklearn.naive_bayes import GaussianNB
from nltk.classify.scikitlearn import SklearnClassifier
classifier=SklearnClassifier(GaussianNB(), sparse=False).train(train_set)
print("Scikit-GaussNB 分类器性能评估= ", nltk.classify.accuracy(classifier, test_set))

show_subtitle("Neo's feature")
print(gender_features('Neo'))
print("Neo is", classifier.classify(gender_features('Neo')))
show_subtitle("Trinity's feature")
print(gender_features('Trinity'))
print("Trinity is", classifier.classify(gender_features('Trinity')))

# 信息量大的特征
# 'SklearnClassifier' object has no attribute 'show_most_informative_features'
# classifier.show_most_informative_features(5)

Scikit-GaussNB 分类器性能评估=  0.73
--------------- >Neo's feature< ---------------
{'last_letter': 'o'}
Neo is male
--------------- >Trinity's feature< ---------------
{'last_letter': 'y'}
Trinity is female


In [10]:
def gender_classifier(features):
    # 特征数据集合
    feature_sets = [
            (features(n), gender)
            for (n, gender) in labeled_names
    ]

    # 训练数据集合 和 测试数据集合
    train_set, test_set = feature_sets[500:], feature_sets[:500]

    # 使用 Scikit-Learn 的 GaussNB 朴素贝叶斯 分类器进行分类
    from sklearn.naive_bayes import GaussianNB
    from nltk.classify.scikitlearn import SklearnClassifier

    classifier = SklearnClassifier(GaussianNB(), sparse=False).train(train_set)
    print("Scikit-GaussNB 分类器性能评估= ", nltk.classify.accuracy(classifier, test_set))


    # 使用 NLTK 的 朴素贝叶斯 分类器进行分类训练
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print("NLTK-NB 分类器性能评估= ", nltk.classify.accuracy(classifier, test_set))

    # 信息量大的特征（发现特征信息量比较小，说明这个特征效果不好）
    show_subtitle("信息量大的特征")
    classifier.show_most_informative_features(5)

    show_subtitle("Neo's feature")
    print(features('Neo'))
    print("Neo is", classifier.classify(features('Neo')))
    show_subtitle("Trinity's feature")
    print(features('Trinity'))
    print("Trinity is", classifier.classify(features('Trinity')))

In [28]:
# 性别鉴定：以名字的长度为特征
# 等价的 Lambda 函数
# gender_classifier(lambda word: {'name length': len(word)})
def gender_features(word):
    return {'name length': len(word)}


gender_classifier(gender_features)

Scikit-GaussNB 分类器性能评估=  0.634
NLTK-NB 分类器性能评估=  0.64
--------------- >信息量大的特征< ---------------
Most Informative Features
             name length = 2                male : female =      2.1 : 1.0
             name length = 3                male : female =      2.0 : 1.0
             name length = 15               male : female =      1.7 : 1.0
             name length = 9              female : male   =      1.4 : 1.0
             name length = 10             female : male   =      1.3 : 1.0
--------------- >Neo's feature< ---------------
{'name length': 3}
Neo is male
--------------- >Trinity's feature< ---------------
{'name length': 7}
Trinity is female


In [12]:
# 在处理大型语料库时，构建包含所有实例特征的单独链表会占用大量的内存
# apply_features 返回一个链表，但是不会在内存中存储所有特征集的对象
from nltk.classify import apply_features

train_set = apply_features(gender_features, labeled_names[500:])
test_set = apply_features(gender_features, labeled_names[:500])

### 6.1.2 选择正确的特征

In [13]:
# 两个特征可以少量地增加了准确率（0.776>0.774)
def gender_features(name):
    features = {'first_letter': name[0].lower(), 'last_letter': name[-1].lower()}
    return features


gender_classifier(gender_features)

Scikit-GaussNB 分类器性能评估=  0.732
NLTK-NB 分类器性能评估=  0.768
--------------- >信息量大的特征< ---------------
Most Informative Features
             last_letter = 'a'            female : male   =     33.0 : 1.0
             last_letter = 'k'              male : female =     31.7 : 1.0
             last_letter = 'v'              male : female =     18.6 : 1.0
             last_letter = 'f'              male : female =     14.6 : 1.0
             last_letter = 'p'              male : female =     12.5 : 1.0
--------------- >Neo's feature< ---------------
{'first_letter': 'n', 'last_letter': 'o'}
Neo is male
--------------- >Trinity's feature< ---------------
{'first_letter': 't', 'last_letter': 'y'}
Trinity is male


In [14]:
# 复杂的特征集，导致过拟合，反而降低了准确率
def gender_features(name):
    # 特征集合
    features = {'first_letter': name[0].lower(), 'last_letter': name[-1].lower()}
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features['count({})'.format(letter)] = name.lower().count(letter)
        features['has({})'.format(letter)] = (letter in name.lower())
    return features


gender_classifier(gender_features)

Scikit-GaussNB 分类器性能评估=  0.744
NLTK-NB 分类器性能评估=  0.782
--------------- >信息量大的特征< ---------------
Most Informative Features
             last_letter = 'a'            female : male   =     33.0 : 1.0
             last_letter = 'k'              male : female =     31.7 : 1.0
             last_letter = 'v'              male : female =     18.6 : 1.0
             last_letter = 'f'              male : female =     14.6 : 1.0
             last_letter = 'p'              male : female =     12.5 : 1.0
--------------- >Neo's feature< ---------------
{'first_letter': 'n', 'last_letter': 'o', 'count(a)': 0, 'has(a)': False, 'count(b)': 0, 'has(b)': False, 'count(c)': 0, 'has(c)': False, 'count(d)': 0, 'has(d)': False, 'count(e)': 1, 'has(e)': True, 'count(f)': 0, 'has(f)': False, 'count(g)': 0, 'has(g)': False, 'count(h)': 0, 'has(h)': False, 'count(i)': 0, 'has(i)': False, 'count(j)': 0, 'has(j)': False, 'count(k)': 0, 'has(k)': False, 'count(l)': 0, 'has(l)': False, 'count(m)': 0, 'has(m)': Fals

In [15]:
# 开发测试集的作用
# 训练集：训练模型；
# 开发测试集：执行错误分析；
# 测试集：系统的最终评估。
train_names = labeled_names[1500:]
devtest_names = labeled_names[500:1500]
test_names = labeled_names[:500]

train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
test_set = [(gender_features(n), gender) for (n, gender) in test_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print("开发测试集的精度= ", nltk.classify.accuracy(classifier, devtest_set))
print("测试集的精度= ", nltk.classify.accuracy(classifier, test_set))

开发测试集的精度=  0.768
测试集的精度=  0.782


In [16]:
# 使用开发测试集执行错误分析，避免使用测试集进行错误分析和修正是为了避免对测试集的过拟合
# 因为利用测试集的分析来增加规则，修正错误，则会使测试集的评估数据自然变好，但是测试集不能代表所有的数据特征
# 因此利用训练集训练模型，使用开发测试集进行错误分析和规则修正，再利用测试集来评估，就可以尽可能保证数据的独立性和分布的均衡性
errors = []
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append((tag, guess, name))
for (tag, guess, name) in sorted(errors[:20]):
    print('correct={:<8} guess={:<8s} name={:<30}'.format(tag, guess, name))

correct=female   guess=male     name=Bert                          
correct=female   guess=male     name=Birgit                        
correct=female   guess=male     name=Blondy                        
correct=female   guess=male     name=Ginnifer                      
correct=female   guess=male     name=Gretal                        
correct=female   guess=male     name=Hephzibah                     
correct=female   guess=male     name=Lory                          
correct=female   guess=male     name=Mab                           
correct=female   guess=male     name=Patsy                         
correct=female   guess=male     name=Tammy                         
correct=female   guess=male     name=Tuesday                       
correct=male     guess=female   name=Angel                         
correct=male     guess=female   name=Bobbie                        
correct=male     guess=female   name=Earl                          
correct=male     guess=female   name=Graeme     

In [17]:
# 性别鉴定的两个特征：最后一个字母，最后两个字母，准确率更高（0.792）
def gender_features(word):
    return {'suffix1': word[-1:], 'suffix2': word[-2:]}


gender_classifier(gender_features)

Scikit-GaussNB 分类器性能评估=  0.724
NLTK-NB 分类器性能评估=  0.796
--------------- >信息量大的特征< ---------------
Most Informative Features
                 suffix2 = 'na'           female : male   =     97.5 : 1.0
                 suffix2 = 'la'           female : male   =     73.2 : 1.0
                 suffix2 = 'ia'           female : male   =     39.6 : 1.0
                 suffix2 = 'sa'           female : male   =     36.3 : 1.0
                 suffix1 = 'a'            female : male   =     33.0 : 1.0
--------------- >Neo's feature< ---------------
{'suffix1': 'o', 'suffix2': 'eo'}
Neo is male
--------------- >Trinity's feature< ---------------
{'suffix1': 'y', 'suffix2': 'ty'}
Trinity is female


In [18]:
# 性别鉴定的三个特征：头两个字母，最后一个字母，最后两个字母，准确率更高（0.81）
def gender_features(word):
    return {'prefix': word[0:2], 'suffix1': word[-1:], 'suffix2': word[-2:]}


gender_classifier(gender_features)

Scikit-GaussNB 分类器性能评估=  0.74
NLTK-NB 分类器性能评估=  0.82
--------------- >信息量大的特征< ---------------
Most Informative Features
                 suffix2 = 'na'           female : male   =     97.5 : 1.0
                 suffix2 = 'la'           female : male   =     73.2 : 1.0
                 suffix2 = 'ia'           female : male   =     39.6 : 1.0
                 suffix2 = 'sa'           female : male   =     36.3 : 1.0
                 suffix1 = 'a'            female : male   =     33.0 : 1.0
--------------- >Neo's feature< ---------------
{'prefix': 'Ne', 'suffix1': 'o', 'suffix2': 'eo'}
Neo is male
--------------- >Trinity's feature< ---------------
{'prefix': 'Tr', 'suffix1': 'y', 'suffix2': 'ty'}
Trinity is female


In [19]:
# 性别鉴定的四个特征：
# 头一个字母，头两个字母，最后一个字母，最后两个字母，准确率更高（0.814）
def gender_features(word):
    return {'prefix1': word[0:1], 'prefix2': word[0:2], 'suffix1': word[-1:], 'suffix2': word[-2:]}


gender_classifier(gender_features)

Scikit-GaussNB 分类器性能评估=  0.74
NLTK-NB 分类器性能评估=  0.818
--------------- >信息量大的特征< ---------------
Most Informative Features
                 suffix2 = 'na'           female : male   =     97.5 : 1.0
                 suffix2 = 'la'           female : male   =     73.2 : 1.0
                 suffix2 = 'ia'           female : male   =     39.6 : 1.0
                 suffix2 = 'sa'           female : male   =     36.3 : 1.0
                 suffix1 = 'a'            female : male   =     33.0 : 1.0
--------------- >Neo's feature< ---------------
{'prefix1': 'N', 'prefix2': 'Ne', 'suffix1': 'o', 'suffix2': 'eo'}
Neo is male
--------------- >Trinity's feature< ---------------
{'prefix1': 'T', 'prefix2': 'Tr', 'suffix1': 'y', 'suffix2': 'ty'}
Trinity is male


### 1.3. 文档分类
学习语料库中的标记，为新文档分配类别标签

In [20]:
# 使用电影评论语料库，将每个评论归类为正面或者负面
from nltk.corpus import movie_reviews

show_subtitle("categories()")
print(movie_reviews.categories())
show_subtitle("fileids('neg')")
print(movie_reviews.fileids('neg')[:20])
show_subtitle("words('/neg/cv995_23113.txt'")
print(movie_reviews.words('neg/cv995_23113.txt'))

documents = [
        (list(movie_reviews.words(fileid)), category)
        for category in movie_reviews.categories()
        for fileid in movie_reviews.fileids(category)
]
random.shuffle(documents)

--------------- >categories()< ---------------
['neg', 'pos']
--------------- >fileids('neg')< ---------------
['neg/cv000_29416.txt', 'neg/cv001_19502.txt', 'neg/cv002_17424.txt', 'neg/cv003_12683.txt', 'neg/cv004_12641.txt', 'neg/cv005_29357.txt', 'neg/cv006_17022.txt', 'neg/cv007_4992.txt', 'neg/cv008_29326.txt', 'neg/cv009_29417.txt', 'neg/cv010_29063.txt', 'neg/cv011_13044.txt', 'neg/cv012_29411.txt', 'neg/cv013_10494.txt', 'neg/cv014_15600.txt', 'neg/cv015_29356.txt', 'neg/cv016_4348.txt', 'neg/cv017_23487.txt', 'neg/cv018_21672.txt', 'neg/cv019_16117.txt']
--------------- >words('/neg/cv995_23113.txt'< ---------------
['if', 'anything', ',', '"', 'stigmata', '"', 'should', ...]


In [21]:
# Ex6-2 文档分类的特征提取器，其特征表示每个词是否在一个给定的文档中
# 评论中使用的所有单词
all_words = nltk.FreqDist(
        w.lower()
        for w in movie_reviews.words()
)
show_subtitle("all_words.most_common(20)")
print(all_words.most_common(20))
# 取出部分单词作为特征
# 任取2000个单词就可以产生很好的结果：0.81
word_features_random = list(all_words)[:2000]
show_subtitle("word_features[:20]")
print(word_features_random[:20])


def document_features(document, word_features):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features


myDocument = movie_reviews.words('pos/cv957_8737.txt')
myFeature = document_features(myDocument, word_features_random)
for i, (key, value) in enumerate(myFeature.items()):
    if i < 10:
        print(key, value)

--------------- >all_words.most_common(20)< ---------------
[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), ("'", 30585), ('is', 25195), ('in', 21822), ('s', 18513), ('"', 17612), ('it', 16107), ('that', 15924), ('-', 15595), (')', 11781), ('(', 11664), ('as', 11378), ('with', 10792), ('for', 9961)]
--------------- >word_features[:20]< ---------------
[',', 'the', '.', 'a', 'and', 'of', 'to', "'", 'is', 'in', 's', '"', 'it', 'that', '-', ')', '(', 'as', 'with', 'for']
contains(,) True
contains(the) True
contains(.) True
contains(a) True
contains(and) True
contains(of) True
contains(to) True
contains(') True
contains(is) True
contains(in) True


In [29]:
# Ex6-3 训练和测试分类器以进行文档分类
def document_classifier(word_features):
    feature_sets = [
            (document_features(d, word_features), c)
            for (d, c) in documents
    ]
    train_set, test_set = feature_sets[100:], feature_sets[:100]

    # 使用 Scikit-Learn 的 GaussNB 朴素贝叶斯 分类器进行分类
    from sklearn.naive_bayes import GaussianNB
    from nltk.classify.scikitlearn import SklearnClassifier

    classifier = SklearnClassifier(GaussianNB(), sparse=False).train(train_set)
    print("Scikit-GaussNB 分类器性能评估= ", nltk.classify.accuracy(classifier, test_set))
    
    # 使用 NLTK 的 朴素贝叶斯 分类器进行分类训练
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print("NLTK-NB 分类器性能评估= ", nltk.classify.accuracy(classifier, test_set))

    # 信息量大的特征（发现特征信息量比较小，说明这个特征效果不好）
    show_subtitle("信息量大的特征")
    classifier.show_most_informative_features(5)
    

document_classifier(word_features_random)

Scikit-GaussNB 分类器性能评估=  0.79
NLTK-NB 分类器性能评估=  0.84
--------------- >信息量大的特征< ---------------
Most Informative Features
   contains(outstanding) = True              pos : neg    =     13.1 : 1.0
        contains(seagal) = True              neg : pos    =      7.9 : 1.0
   contains(wonderfully) = True              pos : neg    =      7.5 : 1.0
         contains(damon) = True              pos : neg    =      5.8 : 1.0
          contains(lame) = True              neg : pos    =      5.7 : 1.0


In [30]:
# most_common() 返回的不是单词集合，不能作为正确的特征使用，需要将特征做进一步处理
# word_features=list(all_words.most_common(2000))
# 取出2000个高频单词产生更好的结果：0.82
word_features_high_freq = [
        word
        for word, _ in all_words.most_common(2000)
]
document_classifier(word_features_high_freq)

Scikit-GaussNB 分类器性能评估=  0.79
NLTK-NB 分类器性能评估=  0.84
--------------- >信息量大的特征< ---------------
Most Informative Features
   contains(outstanding) = True              pos : neg    =     13.1 : 1.0
        contains(seagal) = True              neg : pos    =      7.9 : 1.0
   contains(wonderfully) = True              pos : neg    =      7.5 : 1.0
         contains(damon) = True              pos : neg    =      5.8 : 1.0
          contains(lame) = True              neg : pos    =      5.7 : 1.0


In [31]:
# 取出2000个低频单词产生更差的结果：0.69
word_features_low_freq = [
        word
        for word, _ in all_words.most_common(20000)[-2000:]
]
document_classifier(word_features_low_freq)

Scikit-GaussNB 分类器性能评估=  0.58
NLTK-NB 分类器性能评估=  0.58
--------------- >信息量大的特征< ---------------
Most Informative Features
            contains(05) = True              neg : pos    =      1.7 : 1.0
           contains(108) = True              neg : pos    =      1.7 : 1.0
          contains(26th) = True              neg : pos    =      1.7 : 1.0
            contains(32) = True              neg : pos    =      1.7 : 1.0
            contains(39) = True              neg : pos    =      1.7 : 1.0


### 6.1.4 词性标注，词类标注，(Part-of-Speech Tagging, POST)

In [25]:
# 寻找最常见的单词后缀，可能也是最有信息量的单词后缀
suffix_fdist = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]] += 1
    suffix_fdist[word[-2:]] += 1
    suffix_fdist[word[-3:]] += 1
common_suffixes = [
        suffix
        for (suffix, count) in suffix_fdist.most_common(100)
]
show_subtitle("common_suffixes[:20]")
print(common_suffixes[:20])

--------------- >common_suffixes[:20]< ---------------
['e', ',', '.', 's', 'd', 't', 'he', 'n', 'a', 'of', 'the', 'y', 'r', 'to', 'in', 'f', 'o', 'ed', 'nd', 'is']


In [34]:
# 定义特征提取函数，以前100个单词后缀为特征，确定每个单词受这100个后缀的影响
# 例如：should的后缀'ld'存在，后缀'en'不存在
def pos_features(word):
    features = {}
    for suffix in common_suffixes:
        features['endswith({})'.format(suffix)] = word.lower().endswith(suffix)
    return features

In [35]:
# 建立特征集合，每个单词的特征（后缀'en':存在）+标注（NN)为一条数据，组成的特征集合
tagged_words = brown.tagged_words(categories='news')
feature_sets = [(pos_features(n), g) for (n, g) in tagged_words]

size = int(len(feature_sets) * 0.1)
train_set, test_set = feature_sets[size:], feature_sets[:size]

In [41]:
tagged_words[0]

('The', 'AT')

In [45]:
# 下面这行代码执行时间过长，可以尝试减少 word_features 中的特征数量
# i5-5200U 需要运行10分钟以上！！！
# i7-9700 需要的时间也很长
# NLTK决策树模型的优点：容易解释，甚至可以使用伪代码的形式输出
# NLTK 决策树模型的缺点：运行速度慢，精度还不高
classifier = nltk.DecisionTreeClassifier.train(train_set)
print("NLTK 决策树分类器性能评估= ", nltk.classify.accuracy(classifier, test_set))
print("cats 标注的结果：",classifier.classify(pos_features('cats')))
print(classifier.pseudocode(depth=4))

NLTK 决策树分类器性能评估=  0.6270512182993535
cats 标注的结果： NNS
if endswith(the) == False: 
  if endswith(,) == False: 
    if endswith(s) == False: 
      if endswith(.) == False: return '.'
      if endswith(.) == True: return '.'
    if endswith(s) == True: 
      if endswith(is) == False: return 'PP$'
      if endswith(is) == True: return 'BEZ'
  if endswith(,) == True: return ','
if endswith(the) == True: return 'AT'



In [44]:
# 使用 Scikit-Learn 运行速度会快很多，但是不能输出伪代码
from sklearn.tree import DecisionTreeClassifier
classifier = SklearnClassifier(DecisionTreeClassifier()).train(train_set)
print("Scikit 决策树分类器性能评估= ", nltk.classify.accuracy(classifier, test_set))
print("cats 标注的结果：",classifier.classify(pos_features('cats')))

Scikit 决策树分类器性能评估=  0.6459472899055196
cats 标注的结果： NNS


### 6.1.5 基于上下文语境特征进行词性标注

-   通过上下文提高分类的精度，例如：large后面的可能是名词；
-   但是不能根据前词的标记判断下个词的类别，例如：前面是形容词，后面的可能是名词
-   下节的序列分类就是使用联合分类器模型，为一些相关的输入选择适当的标签

In [46]:
# Ex6-4 词性分类器，特征检测器通过一个单词的上下文来决定这个单词的词性标记
def pos_features(sentence: list, i: int) -> dict:
    """更加复杂的特征提取函数
    :param sentence:用于提取单词的句子，提供句子才可以提供上下文语境特征
    :type sentence:list
    :param i:句子中需要提取特征的单词的位置
    :type i:int
    :return:相应单词的特征集
    :rtype:dict """
    # 当前词的后缀特征
    features = {
        'Suffix(1)': sentence[i][-1:],
        'Suffix(2)': sentence[i][-2:],
        'Suffix(3)': sentence[i][-3:]
    }
    # 单词前面的词作为特征，如果单词为头一个单词就设置前面一个单词为<START>
    if i == 0:
        features['prev-word'] = '<START>'
    else:
        features['prev-word'] = sentence[i - 1]
    return features

In [47]:
# (brown.sents()[0],8) == 'investigation'
type(pos_features(brown.sents()[0], 8))
tagged_sents = brown.tagged_sents(categories='news')
feature_sets = []
for tagged_sent in tagged_sents:
    untagged_sent = nltk.tag.untag(tagged_sent)
    for i, (word, tag) in enumerate(tagged_sent):
        # 特征集合中的元素必须是tuple的形式
        feature_sets.append((pos_features(untagged_sent, i), tag))  

size = int(len(feature_sets) * 0.1)
train_set, test_set = feature_sets[size:], feature_sets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print("基于上下文语境特征进行词性标注的模型精度= ", nltk.classify.accuracy(classifier, test_set)

0.7891596220785678

### 6.1.6 基于序列分类的词性标注（通过上下文的标签提高分类的精度）
为了获取相关分类任务之间的依赖关系，可以使用联合分类器模型，使用连续分类或者贪婪序列分类的序列分类器策略，得到更高的分类准确率

In [6]:
# Ex6-5 使用连贯分类器进行词性标注
# 1. 特征提取器：提供具有上下文描述的特征
def pos_features(sentence: list, i: int, history: list) -> dict:
    """
        >>> pos_features(['I', 'love', 'you','.'], 1, ['N'])
        {'Suffix(1)': 'e', 'Suffix(2)': 've', 'Suffix(3)': 'ove', 'prev-word': 'I', 'prev-tag': 'N'}
    :param sentence:用于提取单词的句子，提供句子才可以提供上下文语境特征
    :type sentence:list
    :param i:句子中需要提取特征的单词的位置
    :type i:int
    :param history:history中的每个标记对应sentence中的一个词，但是history只包括已经归类的词的标记
    :type history:
    :return:相应单词的特征集
    :rtype:dict
    """
    features = {
        'Suffix(1)': sentence[i][-1:],
        'Suffix(2)': sentence[i][-2:],
        'Suffix(3)': sentence[i][-3:]
    }
    if i == 0:
        features['prev-word'] = '<START>'
        features['prev-tag'] = '<START>'
    else:
        features['prev-word'] = sentence[i - 1]
        features['prev-tag'] = history[i - 1]
    return features

In [7]:
history = ['N']
pos_features(brown.sents()[0], 1, history)

{'Suffix(1)': 'n',
 'Suffix(2)': 'on',
 'Suffix(3)': 'ton',
 'prev-word': 'The',
 'prev-tag': 'N'}

In [8]:
# 2. 基于序列分类的词性标注
class ConsecutivePosTagger(nltk.TaggerI):
    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            # 将（单词，标签）元组组成的列表中的标签去掉，只保留单词组成的列表
            # print(untagged_sent)  
            
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featuresets = pos_features(untagged_sent, i, history)
                train_set.append((featuresets, tag))
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featuresets = pos_features(sentence, i, history)
            tag = self.classifier.classify(featuresets)
            history.append(tag)
        return zip(sentence, history)

In [9]:
tagged_sents = brown.tagged_sents(categories='news')
size = int(len(tagged_sents) * 0.1)
train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
print("train_sents[0]= ", train_sents[0])

train_sents[0]=  [('He', 'PPS'), ('assured', 'VBD'), ('Mr.', 'NP'), ('Martinelli', 'NP'), ('and', 'CC'), ('the', 'AT'), ('council', 'NN'), ('that', 'CS'), ('he', 'PPS'), ('would', 'MD'), ('study', 'VB'), ('the', 'AT'), ('correct', 'JJ'), ('method', 'NN'), ('and', 'CC'), ('report', 'VB'), ('back', 'RB'), ('to', 'IN'), ('the', 'AT'), ('council', 'NN'), ('as', 'QL'), ('soon', 'RB'), ('as', 'CS'), ('possible', 'JJ'), ('.', '.')]


In [15]:
tagger = ConsecutivePosTagger(train_sents)
print("基于序列分类的词性标注模型的精度= ", tagger.evaluate(test_sents))

0.7980528511821975


In [19]:
print(test_sents[0])

for label in tagger.tag(test_sents[0]):
    print(label)

[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'AT'), ('evidence', 'NN'), ("''", "''"), ('that', 'CS'), ('any', 'DTI'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')]
(('The', 'AT'), '``')
(('Fulton', 'NP-TL'), 'AT-TL')
(('County', 'NN-TL'), 'NNS-TL')
(('Grand', 'JJ-TL'), 'IN-TL')
(('Jury', 'NN-TL'), 'AT-TL')
(('said', 'VBD'), 'NNS-TL')
(('Friday', 'NR'), 'IN-TL')
(('an', 'AT'), 'AT-TL')
(('investigation', 'NN'), 'NNS-TL')
(('of', 'IN'), 'IN-TL')
(("Atlanta's", 'NP$'), 'AT-TL')
(('recent', 'JJ'), 'NNS-TL')
(('primary', 'NN'), 'IN-TL')
(('election', 'NN'), 'AT-TL')
(('produced', 'VBD'), 'NNS-TL')
(('``', '``'), 'IN-TL')
(('no', 'AT'), 'AT-TL')
(('evidence', 'NN'), 'NNS-TL')
(("''", "''"), 'IN-TL')
(('that',

### 6.1.7 序列分类器中的其他方法
1.  基于转换的联合分类：Ref：Sec5.6节描述的Brill标注器，解决了前面分类器一旦分类就无法改变的问题
2.  基于HMM的联合分类：
      -   特点：不仅考虑了单词的上下文，还可以考虑更长的依赖性
      -   输出：不是某个单词标记的最大可能性，而是整个序列中所有单词标记的最大可能性
      -   常用的模型：最大熵马尔可夫模型 和 线性链条件随机场模型