条件频率分布

In [15]:
import nltk
from nltk.corpus import brown


# Build a conditional frequency distribution from (condition, event) pairs:
# each genre is a condition and every word occurring in it is an event.
genre_word_pairs = (
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre)
)
cfd = nltk.ConditionalFreqDist(genre_word_pairs)

print(cfd.conditions())
# Word counts recorded under the single condition 'reviews'.
reviews_dist = cfd['reviews']
print(reviews_dist.most_common(5))
print(reviews_dist['the'])

# Conditions (genres) to display.
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
# Events (modal verbs) to display.
modals = ['can', 'could', 'may', 'might', 'must', 'will']
# Print a table of counts restricted to the conditions and events above.
cfd.tabulate(conditions=genres, samples=modals)

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
[(',', 2318), ('the', 2048), ('.', 1549), ('of', 1299), ('and', 1103)]
2048
                  can could   may might  must  will 
           news    93    86    66    38    50   389 
       religion    82    59    78    12    54    71 
        hobbies   268    58   131    22    83   264 
science_fiction    16    49     4    12     8    16 
        romance    74   193    11    51    45    43 
          humor    16    30     8     8     9    13 


CFD+Bigrams 生成随机文本

In [25]:
import random

def get_language_model():
    """Build a bigram model from the 'english-kjv.txt' file of the genesis corpus.

    Returns:
        An ``nltk.ConditionalFreqDist`` built from the consecutive word pairs
        of the text: the first word of each pair is the condition and the
        second word is the event counted under it.
    """
    genesis_words = nltk.corpus.genesis.words('english-kjv.txt')
    # nltk.bigrams() pairs each word with the word that follows it.
    return nltk.ConditionalFreqDist(nltk.bigrams(genesis_words))

def generate_language(cfd, word, num=20, top_k=5):
    """Generate a word sequence by repeatedly picking a successor word.

    Args:
        cfd: Mapping from a word to a frequency distribution of the words
            that follow it (for example the result of
            ``get_language_model()``).
        word: Seed word; it is always the first element of the result.
        num: Maximum number of words to generate after the seed.
        top_k: If positive, each next word is drawn uniformly at random from
            the ``top_k`` most frequent successors of the current word.
            Otherwise the single most frequent successor is always taken.

    Returns:
        A list starting with ``word`` followed by up to ``num`` generated
        words. The list is shorter when a word with no recorded successor
        is reached, because generation stops there.
    """
    words = [word]
    for _ in range(num):
        successors = cfd[word]
        # Dead end (unknown word, or a word never followed by anything):
        # stop here rather than letting random.choice() / max() raise on an
        # empty distribution.
        if not successors:
            break
        if top_k > 0:
            word = random.choice(successors.most_common(top_k))[0]
        else:
            word = successors.max()
        words.append(word)
    return words

# Build the bigram model, then generate text starting from the seed 'There'.
language_model = get_language_model()
words = generate_language(language_model, 'There')
print(' '.join(words))

There went in unto them in the sons , the name of his son . Then Abraham said to thy father


模拟退火分词法

In [28]:
# 根据二进制串分词
def segment(text, segs):
    """Split ``text`` into words according to a boundary string.

    ``segs[i] == '1'`` marks a word boundary immediately after ``text[i]``;
    any other value means no boundary at that position. Whatever follows
    the last boundary is always appended as the final word (it may be empty).

    Returns:
        The list of word strings, in order.
    """
    pieces = []
    start = 0
    for idx, flag in enumerate(segs):
        if flag != '1':
            continue
        end = idx + 1
        pieces.append(text[start:end])
        start = end
    # Tail after the last boundary.
    pieces.append(text[start:])
    return pieces

# 评价分词结果
def evaluate(text, segs):
    """Score the segmentation of ``text`` described by ``segs``.

    The score is the number of words in the segmented text plus the size of
    the lexicon, where every distinct word costs its length plus one.
    ``anneal`` treats a smaller score as better.
    """
    tokens = segment(text, segs)
    lexicon_cost = 0
    for entry in set(tokens):
        lexicon_cost += len(entry) + 1
    return len(tokens) + lexicon_cost

# 查找使得objective function最小化的二进制串（基于非确定性的模拟退火）
from random import randint

def flip(segs, pos):
    """Return a copy of ``segs`` in which the bit at index ``pos`` is inverted."""
    head, bit, tail = segs[:pos], segs[pos], segs[pos + 1:]
    # '0' -> '1' and '1' -> '0' via integer arithmetic.
    return ''.join((head, str(1 - int(bit)), tail))

def flip_n(segs, n):
    """Flip ``n`` randomly chosen positions of ``segs``, one after another.

    Each position is drawn independently with ``randint``, so the same
    position may be chosen more than once.

    Returns:
        The segmentation string after all ``n`` flips have been applied.
    """
    for _ in range(n):
        position = randint(0, len(segs) - 1)
        segs = flip(segs, position)
    return segs

def anneal(text, segs, iterations, cooling_rate):
    """Search for a low-scoring segmentation of ``text`` by simulated annealing.

    Starting from ``segs``, every round tries ``iterations`` random
    perturbations of the current segmentation (each one flips
    ``round(temperature)`` positions) and keeps the best-scoring candidate.
    The temperature starts at ``len(segs)`` and is divided by
    ``cooling_rate`` after each round; the search ends once the temperature
    is no longer above 0.5. After each round the score and the segmented
    words are printed, and a blank line is printed at the end.

    Args:
        text: The unsegmented string.
        segs: Initial segmentation bit string.
        iterations: Number of candidates tried per round.
        cooling_rate: Divisor applied to the temperature after each round;
            must be greater than 1 so that the temperature decreases.

    Returns:
        The segmentation string reached when the search stops.

    Raises:
        ValueError: If at least one round would run and ``cooling_rate`` is
            not greater than 1.
    """
    temperature = float(len(segs))
    # A rate in (0, 1] never lowers the temperature, so the loop below would
    # never finish; reject it before spending any work.
    if temperature > 0.5 and cooling_rate <= 1:
        raise ValueError(
            "cooling_rate must be greater than 1, got %r" % (cooling_rate,)
        )
    while temperature > 0.5:
        best_segs, best = segs, evaluate(text, segs)
        # The number of flips only depends on the temperature, which is
        # constant within a round.
        flips = round(temperature)
        for _ in range(iterations):
            guess = flip_n(segs, flips)
            score = evaluate(text, guess)
            if score < best:
                best, best_segs = score, guess
        segs = best_segs
        temperature = temperature / cooling_rate
        # `best` already equals evaluate(text, segs); no need to recompute.
        print(best, segment(text, segs))
    print()
    return segs

# 运行结果
# Unsegmented input text.
text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
# Initial segmentation guess: it marks three boundaries.
seg1 = "0000000000000001000000000010000000000000000100000000000"
# Try 50000 candidates per temperature step and divide the temperature by
# 1.2 after each step.
anneal(text, seg1, iterations=50000, cooling_rate=1.2)

64 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
64 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
64 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
64 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
64 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
63 ['doyou', 'seethekitt', 'y', 'see', 'thedoggy', 'doyou', 'liketh', 'ekitt', 'y', 'li', 'ke', 'thedoggy']
63 ['doyou', 'seethekitt', 'y', 'see', 'thedoggy', 'doyou', 'liketh', 'ekitt', 'y', 'li', 'ke', 'thedoggy']
61 ['doyou', 'see', 't', 'hekitty', 'see', 'thedoggy', 'doyou', 'liketheki', 't', 'ty', 'li', 'ke', 'thedoggy']
60 ['doy', 'ou', 'see', 't', 'heki', 't', 'ty', 'see', 'thedoggy', 'doy', 'ou', 'li', 'ket', 'heki', 't', 'ty', 'like', 'thedoggy']
60 ['doy', 'ou', 'see', 't', 'heki', 't', 'ty', 'see', 'thedoggy', 'doy', 'ou', 'li', 'ket', 'heki', 't', 'ty', 'like', 'thedoggy']
59 ['doy', 'ou', 'see', 't',

'0000100100000001001000000010000100010000000100010000000'

名词经常出现在什么词后面

In [38]:
from nltk.corpus import brown

# Words of the Brown 'news' category, tagged with the universal tagset.
brown_news_tagged = brown.tagged_words(categories='news', tagset='universal')
# Consecutive pairs of (word, tag) tuples.
word_tag_pairs = nltk.bigrams(brown_news_tagged)
# For every pair whose second word is a noun, record the tag of the first word.
noun_preceders = [
    prev_tag
    for (_prev_word, prev_tag), (_next_word, next_tag) in word_tag_pairs
    if next_tag == 'NOUN'
]
fdist = nltk.FreqDist(noun_preceders)
# Tags ordered by how often they precede a noun. In the recorded output the
# most frequent ones are NOUN, DET, ADJ and ADP.
fdist.most_common()

[('NOUN', 7959),
 ('DET', 7373),
 ('ADJ', 4761),
 ('ADP', 3781),
 ('.', 2796),
 ('VERB', 1842),
 ('CONJ', 938),
 ('NUM', 894),
 ('ADV', 186),
 ('PRT', 94),
 ('PRON', 19),
 ('X', 11)]

各种标注器

In [44]:
import nltk
from nltk.corpus import brown

# NOTE(review): newer NLTK releases appear to rename tagger.evaluate() to
# tagger.accuracy(); confirm against the installed version before switching.

# Baseline tagger: assign the same tag to every token.
brown_tagged_sents = brown.tagged_sents(categories='news')
tag = 'NN'  # the most common tag
default_tagger = nltk.DefaultTagger(tag)
print('基线标注器：', default_tagger.evaluate(brown_tagged_sents))

# Regular-expression tagger: patterns are listed from specific to general,
# ending with a catch-all.
patterns = [
    (r'.*ing$', 'VBG'),               # gerunds
    (r'.*ed$', 'VBD'),                # simple past
    (r'.*es$', 'VBZ'),                # 3rd singular present
    (r'.*ould$', 'MD'),               # modals
    (r'.*\'s$', 'NN$'),               # possessive nouns
    (r'.*s$', 'NNS'),                 # plural nouns
    # NOTE(review): the '.' before the second digit group is unescaped, so it
    # matches any character (e.g. '1,000' or '3/4'), not only a decimal
    # point. Confirm whether that is intended before tightening it.
    (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'.*', 'NN')                     # nouns (default)
]

regexp_tagger = nltk.RegexpTagger(patterns)
print('正则标注器：', regexp_tagger.evaluate(brown_tagged_sents))


# Lookup tagger: a table mapping the most frequent words to their most likely tag.
# Untagged sentences of the same category (kept as a module-level name).
brown_sents = brown.sents(categories='news')

# Word frequencies, used to find the 100 most frequent words.
fd = nltk.FreqDist(brown.words(categories='news'))
# Tag frequencies per word, used to find each word's most likely tag.
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
# Lookup table: frequent word -> most likely tag.
likely_tags = {w: cfd[w].max() for (w, _) in fd.most_common(100)}

# Words missing from the table fall back to the default tagger.
baseline_tagger = nltk.UnigramTagger(model=likely_tags, backoff=nltk.DefaultTagger('NN'))
print('查表标注器：', baseline_tagger.evaluate(brown_tagged_sents))

# N-gram taggers: train on the first 90% of the sentences, test on the rest.
size = int(len(brown_tagged_sents) * 0.9)
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]

unigram_tagger = nltk.UnigramTagger(train_sents)
print('一元文法标注器：', unigram_tagger.evaluate(test_sents))

bigram_tagger = nltk.BigramTagger(train_sents)
print('二元文法标注器：', bigram_tagger.evaluate(test_sents))  # low accuracy, attributed to out-of-vocabulary words

# Bigram tagger backing off to the unigram tagger, then to the default tag.
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
print('回退的二元文法标注器：', t2.evaluate(test_sents))

基线标注器： 0.13089484257215028
正则标注器： 0.20326391789486245
查表标注器： 0.5817769556656125
一元文法标注器： 0.8121200039868434
二元文法标注器： 0.10206319146815508
回退的二元文法标注器： 0.8452108043456593
