## 有监督分类

### 性别分类

In [1]:
def gender_features(word):
    return {'last_letter':word[-1]}

In [5]:
from nltk.corpus import names
import random
import nltk

In [6]:
names = ([(name, 'male') for name in names.words('male.txt')] +
        [(name, 'female') for name in names.words('female.txt')])
random.shuffle(names)

In [10]:
featuresets = [(gender_features(n), g) for (n,g) in names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [8]:
classifier.classify(gender_features('Neo'))

'male'

In [9]:
classifier.classify(gender_features('Trinity'))

'female'

In [11]:
nltk.classify.accuracy(classifier, test_set)

0.75

In [12]:
classifier.show_most_informative_features(5)

Most Informative Features
             last_letter = 'k'              male : female =     43.5 : 1.0
             last_letter = 'a'            female : male   =     37.4 : 1.0
             last_letter = 'v'              male : female =     17.4 : 1.0
             last_letter = 'f'              male : female =     15.8 : 1.0
             last_letter = 'p'              male : female =     11.1 : 1.0


In [13]:
# 使用函数 nltk.classify.apply_features，返回一个行为像一个链表而 不会在内存存储所有特征集的对象
from nltk.classify import apply_features
train_set = apply_features(gender_features, names[500:])
test_set = apply_features(gender_features, names[:500])

### 选择正确的特征

In [15]:
def gender_features2(name): 
    features = {}
    features["firstletter"] = name[0].lower() 
    features["lastletter"] = name[-1].lower() 
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower()) 
    return features

In [16]:
featuresets = [(gender_features2(n), g) for (n,g) in names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.776

In [17]:
train_names = names[1500:]
devtest_names = names[500:1500]
test_names = names[:500]

In [18]:
train_set = [(gender_features(n), g) for (n,g) in train_names] 
devtest_set = [(gender_features(n), g) for (n,g) in devtest_names]
test_set = [(gender_features(n), g) for (n,g) in test_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, devtest_set)

0.757

In [19]:
# 使用开发测试集，生成一个分类器预测名字性别时的错误列表。
errors = []
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess !=tag:
        errors.append((tag,guess,name))

In [20]:
for (tag, guess, name) in sorted(errors):
    print('correct=%-8s guess=%-8s name=%-30s' %(tag, guess, name))

correct=female   guess=male     name=Abigail                       
correct=female   guess=male     name=Addis                         
correct=female   guess=male     name=Adrian                        
correct=female   guess=male     name=Allyson                       
correct=female   guess=male     name=Amber                         
correct=female   guess=male     name=Anne-Mar                      
correct=female   guess=male     name=Ardeen                        
correct=female   guess=male     name=Ardelis                       
correct=female   guess=male     name=Ashlen                        
correct=female   guess=male     name=Bert                          
correct=female   guess=male     name=Bette-Ann                     
correct=female   guess=male     name=Brier                         
correct=female   guess=male     name=Brooks                        
correct=female   guess=male     name=Caitlin                       
correct=female   guess=male     name=Candis     

In [21]:
def gender_features(word):
    return {'suffix1': word[-1:],'suffix2': word[-2:]}

In [22]:
train_set = [(gender_features(n), g) for (n,g) in train_names]
devtest_set = [(gender_features(n), g) for (n,g) in devtest_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, devtest_set)

0.775

### 文档分类

In [23]:
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
            for category in movie_reviews.categories()
            for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

In [25]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words.keys())[:2000]
def document_features(document):
    document_words = set(document) 
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words) 
    return features

In [26]:
document_features(movie_reviews.words('pos/cv957_8737.txt'))

{'contains(plot)': True,
 'contains(:)': True,
 'contains(two)': True,
 'contains(teen)': False,
 'contains(couples)': False,
 'contains(go)': False,
 'contains(to)': True,
 'contains(a)': True,
 'contains(church)': False,
 'contains(party)': False,
 'contains(,)': True,
 'contains(drink)': False,
 'contains(and)': True,
 'contains(then)': True,
 'contains(drive)': False,
 'contains(.)': True,
 'contains(they)': True,
 'contains(get)': True,
 'contains(into)': True,
 'contains(an)': True,
 'contains(accident)': False,
 'contains(one)': True,
 'contains(of)': True,
 'contains(the)': True,
 'contains(guys)': False,
 'contains(dies)': False,
 'contains(but)': True,
 'contains(his)': True,
 'contains(girlfriend)': True,
 'contains(continues)': False,
 'contains(see)': False,
 'contains(him)': True,
 'contains(in)': True,
 'contains(her)': False,
 'contains(life)': False,
 'contains(has)': True,
 'contains(nightmares)': False,
 'contains(what)': True,
 "contains(')": True,
 'contains(s)': T

In [27]:
featuresets = [(document_features(d), c) for (d,c) in documents] 
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [28]:
nltk.classify.accuracy(classifier, test_set) 

0.78

In [29]:
classifier.show_most_informative_features(5)

Most Informative Features
    contains(schumacher) = True              neg : pos    =     11.7 : 1.0
        contains(justin) = True              neg : pos    =      9.0 : 1.0
           contains(ugh) = True              neg : pos    =      9.0 : 1.0
 contains(unimaginative) = True              neg : pos    =      7.7 : 1.0
          contains(mena) = True              neg : pos    =      7.0 : 1.0


In [33]:
from nltk.corpus import brown 
suffix_fdist = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]]+=1
    suffix_fdist[word[-2:]]+=1
    suffix_fdist[word[-3:]]+=1
common_suffixes = list(suffix_fdist.keys())[:100] 
common_suffixes

['e',
 'he',
 'the',
 'n',
 'on',
 'ton',
 'y',
 'ty',
 'nty',
 'd',
 'nd',
 'and',
 'ry',
 'ury',
 'id',
 'aid',
 'ay',
 'day',
 'an',
 'ion',
 'f',
 'of',
 's',
 "'s",
 "a's",
 't',
 'nt',
 'ent',
 'ary',
 'ed',
 'ced',
 '`',
 '``',
 'o',
 'no',
 'ce',
 'nce',
 "'",
 "''",
 'at',
 'hat',
 'ny',
 'any',
 'es',
 'ies',
 'k',
 'ok',
 'ook',
 'ace',
 '.',
 'r',
 'er',
 'her',
 'in',
 'end',
 'ts',
 'nts',
 'ity',
 've',
 'ive',
 'ee',
 'tee',
 ',',
 'h',
 'ch',
 'ich',
 'ad',
 'had',
 'l',
 'll',
 'all',
 'ge',
 'rge',
 'ves',
 'se',
 'ise',
 'ks',
 'nks',
 'a',
 'ta',
 'nta',
 'or',
 'for',
 'ner',
 'as',
 'was',
 'ted',
 'ber',
 'm',
 'rm',
 'erm',
 'en',
 'een',
 'ged',
 'by',
 'ior',
 'rt',
 'urt',
 'dge',
 'od']

In [34]:
def pos_features(word):
    features = {}
    for suffix in common_suffixes:
        features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
    return features

In [41]:
tagged_words = brown.tagged_words(categories='news')[:20000]
featuresets = [(pos_features(n), g) for (n,g) in tagged_words]
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:3*size], featuresets[:size]
classifier = nltk.DecisionTreeClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.577

In [42]:
classifier.classify(pos_features('cats'))

'NNS'

In [43]:
classifier.pseudocode(depth=4)

"if endswith(he) == False: \n  if endswith(s) == False: \n    if endswith(,) == False: \n      if endswith(.) == False: return 'NN'\n      if endswith(.) == True: return '.'\n    if endswith(,) == True: return ','\n  if endswith(s) == True: \n    if endswith(was) == False: \n      if endswith(as) == False: return 'BEZ'\n      if endswith(as) == True: return 'NP'\n    if endswith(was) == True: return 'BEDZ'\nif endswith(he) == True: \n  if endswith(the) == False: return 'PPS'\n  if endswith(the) == True: return 'AT'\n"

### 探索上下文语境

In [44]:
def pos_features(sentence, i):
    features = {"suffix(1)": sentence[i][-1:],
        "suffix(2)": sentence[i][-2:], "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
    return features

In [45]:
pos_features(brown.sents()[0], 8)

{'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion', 'prev-word': 'an'}

In [47]:
tagged_sents = brown.tagged_sents(categories='news')
featuresets = []
for tagged_sent in tagged_sents:
    untagged_sent = nltk.tag.untag(tagged_sent)
    for i,(word,tag) in enumerate(tagged_sent):
        featuresets.append((pos_features(untagged_sent,i),tag))

In [48]:
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.7891596220785678

### 序列分类

In [49]:
def pos_features(sentence, i, history):
    features = {"suffix(1)": sentence[i][-1:], "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>" 
        features["prev-tag"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1] 
        features["prev-tag"] = history[i-1]
    return features

In [50]:
class ConsecutivePosTagger(nltk.TaggerI):
    def __init__(self, train_sents): 
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent) 
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = pos_features(untagged_sent, i, history) 
                train_set.append( (featureset, tag) ) 
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set) 
        
    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = pos_features(sentence, i, history) 
            tag = self.classifier.classify(featureset) 
            history.append(tag)
        return zip(sentence, history)

In [51]:
tagged_sents = brown.tagged_sents(categories='news')
size = int(len(tagged_sents) * 0.1)
train_sents, test_sents = tagged_sents[size:], tagged_sents[:size] 
tagger = ConsecutivePosTagger(train_sents)
tagger.evaluate(test_sents)

0.7980528511821975

### 句子分割

In [53]:
sents = nltk.corpus.treebank_raw.sents()
tokens = []
boundaries = set()
offset = 0
for sent in nltk.corpus.treebank_raw.sents():
    tokens.extend(sent)
    offset+=len(sent)
    boundaries.add(offset-1)

In [54]:
def punct_features(tokens, i):
    return {'next-word-capitalized': tokens[i+1][0].isupper(), 
        'prevword': tokens[i-1].lower(),
        'punct': tokens[i],
        'prev-word-is-one-char': len(tokens[i-1]) == 1}

In [55]:
featuresets = [(punct_features(tokens, i), (i in boundaries))
                for i in range(1, len(tokens)-1)
                if tokens[i] in '.?!']

In [56]:
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.936026936026936

### 识别对话行为类别

In [57]:
posts = nltk.corpus.nps_chat.xml_posts()[:10000]

In [59]:
def dialogue_act_features(post):
    features = {}
    for word in nltk.word_tokenize(post):
        features['contains(%s)' % word.lower()] = True
    return features

In [61]:
featuresets = [(dialogue_act_features(post.text), post.get('class'))
               for post in posts]
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.667

### 识别文字蕴含

In [62]:
def rte_features(rtepair):
    extractor = nltk.RTEFeatureExtractor(rtepair)
    features = {}
    features['word_overlap'] = len(extractor.overlap('word')) 
    features['word_hyp_extra'] = len(extractor.hyp_extra('word'))
    features['ne_overlap'] = len(extractor.overlap('ne')) 
    features['ne_hyp_extra'] = len(extractor.hyp_extra('ne')) 
    return features

In [63]:
rtepair = nltk.corpus.rte.pairs(['rte3_dev.xml'])[33]
extractor = nltk.RTEFeatureExtractor(rtepair)
extractor.text_words

{'Asia',
 'China',
 'Co',
 'Davudi',
 'Iran',
 'Organisation',
 'Parviz',
 'Russia',
 'SCO',
 'Shanghai',
 'Soviet',
 'association',
 'at',
 'binds',
 'central',
 'fight',
 'fledgling',
 'former',
 'four',
 'meeting',
 'operation',
 'representing',
 'republics',
 'terrorism.',
 'that',
 'together',
 'was'}

In [64]:
extractor.hyp_words

{'China', 'SCO.', 'member'}

In [65]:
extractor.overlap('word')

set()

In [66]:
extractor.overlap('ne')

{'China'}

In [67]:
extractor.hyp_extra('word')

{'member'}