# 分类文本性别鉴定
使用naive Bayes classifier：

In [1]:
def gender_features(word):
    """Return the feature dict for a name: its final character under 'last_letter'.

    Indexing with ``word[-1]`` means an empty string raises ``IndexError``.
    """
    final_char = word[-1]
    return dict(last_letter=final_char)

gender_features('Shrek')

{'last_letter': 'k'}

In [3]:
import random

import nltk
from nltk.corpus import names as names_corpus

# The corpus reader is imported under an alias so that binding the labeled
# list to `names` below does not shadow it; this cell can therefore be
# re-run without `names.words` failing on a plain list.
names = ([(name, 'male') for name in names_corpus.words('male.txt')] +
         [(name, 'female') for name in names_corpus.words('female.txt')])
# Shuffle in place: the list is built as all male entries followed by all
# female entries, and later cells split it by position.
random.shuffle(names)
# Pair each name's feature dict with its gender label.
featuresets = [(gender_features(n), g) for (n, g) in names]

In [4]:
# Number of (feature dict, label) pairs built from the labeled names.
len(featuresets)

7944

In [5]:
# Inspect the first (feature dict, gender label) pair of the shuffled data.
featuresets[0]

({'last_letter': u'e'}, 'female')

In [6]:
# Hold out the first 500 shuffled feature sets for testing; train on the rest.
test_set = featuresets[:500]
train_set = featuresets[500:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
# Ask the trained classifier for the gender of a sample name.
classifier.classify(gender_features('Neo'))

'male'

In [4]:
# Classify a second sample name with the same feature extractor.
classifier.classify(gender_features('Trinity'))

'female'

In [5]:
# Evaluate the trained classifier on the 500 held-out feature sets.
nltk.classify.accuracy(classifier, test_set)

0.784

In [6]:
# Print the 5 most informative features together with their likelihood ratios.
classifier.show_most_informative_features(5)

Most Informative Features
             last_letter = u'a'           female : male   =     35.8 : 1.0
             last_letter = u'k'             male : female =     31.2 : 1.0
             last_letter = u'f'             male : female =     14.6 : 1.0
             last_letter = u'v'             male : female =     11.2 : 1.0
             last_letter = u'p'             male : female =     11.2 : 1.0


此列表显示，在训练集中以a结尾的名字里女性约是男性的36倍，而以k结尾的名字里男性约是女性的31倍。这些比率称为似然比（`likelihood ratios`）。

在处理大型语料库时，构建一个包含所有实例特征的单独列表会占用大量内存。在这种情况下，可以使用函数`nltk.classify.apply_features`，它返回一个行为类似列表、但不会在内存中存储所有特征集的对象。

In [7]:
from nltk.classify import apply_features

# Same 500-item hold-out split as before, but the feature extractor is applied
# through apply_features instead of building full feature lists up front.
test_set = apply_features(gender_features, names[:500])
train_set = apply_features(gender_features, names[500:])

下面是一个会对性别特征过拟合的特征提取器。这个特征提取器返回的特征集中包含大量具体特征，因而在相对较小的名字语料库上会导致过拟合。

In [9]:
def gender_features2(name):
    """Build a deliberately feature-rich dict for a name.

    The result contains the lower-cased first and last characters
    ('firstletter', 'lastletter') and, for each of the 26 ASCII letters,
    a case-insensitive occurrence count ('count(x)') and a presence flag
    ('has(x)'). An empty name raises ``IndexError`` from the indexing.
    """
    # Lower-case once; the original recomputed this twice per letter.
    lowered = name.lower()
    features = {
        "firstletter": name[0].lower(),
        "lastletter": name[-1].lower(),
    }
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        occurrences = lowered.count(letter)
        features["count(%s)" % letter] = occurrences
        # A single character is present exactly when its count is non-zero.
        features["has(%s)" % letter] = occurrences > 0
    return features

gender_features2('John')

{'count(a)': 0,
 'count(b)': 0,
 'count(c)': 0,
 'count(d)': 0,
 'count(e)': 0,
 'count(f)': 0,
 'count(g)': 0,
 'count(h)': 1,
 'count(i)': 0,
 'count(j)': 1,
 'count(k)': 0,
 'count(l)': 0,
 'count(m)': 0,
 'count(n)': 1,
 'count(o)': 1,
 'count(p)': 0,
 'count(q)': 0,
 'count(r)': 0,
 'count(s)': 0,
 'count(t)': 0,
 'count(u)': 0,
 'count(v)': 0,
 'count(w)': 0,
 'count(x)': 0,
 'count(y)': 0,
 'count(z)': 0,
 'firstletter': 'j',
 'has(a)': False,
 'has(b)': False,
 'has(c)': False,
 'has(d)': False,
 'has(e)': False,
 'has(f)': False,
 'has(g)': False,
 'has(h)': True,
 'has(i)': False,
 'has(j)': True,
 'has(k)': False,
 'has(l)': False,
 'has(m)': False,
 'has(n)': True,
 'has(o)': True,
 'has(p)': False,
 'has(q)': False,
 'has(r)': False,
 'has(s)': False,
 'has(t)': False,
 'has(u)': False,
 'has(v)': False,
 'has(w)': False,
 'has(x)': False,
 'has(y)': False,
 'has(z)': False,
 'lastletter': 'n'}

In [10]:
# Re-extract features with the richer extractor, then repeat the
# 500-item hold-out split, training and evaluation.
featuresets = [(gender_features2(name), label) for (name, label) in names]
test_set = featuresets[:500]
train_set = featuresets[500:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.792

![6-2](https://www.safaribooksonline.com/library/view/natural-language-processing/9780596803346/httpatomoreillycomsourceoreillyimages302068.png)

In [11]:
# Three-way positional split of the labeled names:
# first 500 -> test, next 1000 -> dev-test, remainder -> training.
test_names = names[:500]
devtest_names = names[500:1500]
train_names = names[1500:]

# Build the feature sets for all three partitions with the same extractor.
train_set, devtest_set, test_set = [
    [(gender_features(name), label) for (name, label) in subset]
    for subset in (train_names, devtest_names, test_names)
]
classifier = nltk.NaiveBayesClassifier.train(train_set)
# Score on the dev-test set, leaving the test set for the final evaluation.
nltk.classify.accuracy(classifier, devtest_set)

0.774

In [14]:
# Classify every dev-test name, then keep the (gold label, prediction, name)
# triples where the prediction disagrees with the gold label.
predictions = [(tag, classifier.classify(gender_features(name)), name)
               for (name, tag) in devtest_names]
errors = [(tag, guess, name)
          for (tag, guess, name) in predictions
          if guess != tag]
len(errors)

226

已建立的名字分类器在开发测试集上共产生226个错误；下例取其中前100个错误，排序后输出（此处仅列出部分）。

In [15]:
# Print a sorted view of the first 100 recorded misclassifications.
for (tag, guess, name) in sorted(errors[:100]):
    # A single parenthesized argument is valid both as a Python 2 print
    # statement and as a Python 3 print() call.
    print('correct=%-8s guess=%-8s name=%-30s' % (tag, guess, name))

correct=female   guess=male     name=Ailyn                         
correct=female   guess=male     name=Beatriz                       
correct=female   guess=male     name=Bess                          
correct=female   guess=male     name=Blair                         
correct=female   guess=male     name=Brynn                         
correct=female   guess=male     name=Cat                           
correct=female   guess=male     name=Chad                          
correct=female   guess=male     name=Charmain                      
correct=female   guess=male     name=Christen                      
correct=female   guess=male     name=Darb                          
correct=female   guess=male     name=Diahann                       
correct=female   guess=male     name=Dorian                        
correct=female   guess=male     name=Dorit                         
correct=female   guess=male     name=Elyn                          
correct=female   guess=male     name=Emlyn      

浏览这个错误列表，它明确指出某些多字母后缀也可以指示名字性别。例如：以yn结尾的名字大多以女性为主，尽管事实上，以n结尾的名字往往是男性；以ch结尾的名字通常是男性，尽管以h结尾的名字倾向于是女性。因此，调整特征提取器使其包含两个字母后缀的特征。

In [16]:
def gender_features(word):
    """Return one- and two-character suffix features for a name.

    Slices are used rather than indexing, so a name shorter than the
    suffix length yields whatever characters are available.
    """
    last_one = word[-1:]
    last_two = word[-2:]
    return dict(suffix1=last_one, suffix2=last_two)

使用新的特征提取器重建分类器后，在开发测试集上的准确率从0.774提高到0.798，提高约2.4个百分点。

In [17]:
# Rebuild the training and dev-test feature sets with the redefined
# extractor, retrain, and score on the dev-test set again.
train_set, devtest_set = [
    [(gender_features(name), label) for (name, label) in subset]
    for subset in (train_names, devtest_names)
]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, devtest_set)

0.798