In [1]:
import nltk
%matplotlib inline

# Chap 3 处理原始文本
1.  如何访问文件内的文本？
2.  如何将文档分割成单独的单词和标点符号，从而进行文本语料上的分析？
3.  如何产生格式化的输出，并把结果保存在文件中？

## 3.8 分割(Segmentation)(P121)

### 3.8.1 句分割，断句，Sentence Segmentation(P122)

In [2]:
# Average number of words per sentence in the Brown corpus
len(nltk.corpus.brown.words()) / len(nltk.corpus.brown.sents())

20.250994070456922

In [3]:
# Punkt sentence segmenter: load the English Punkt tokenizer resource and use
# it to split the raw text of a Gutenberg document into sentences.
# NOTE(review): recent NLTK releases may no longer ship this pickle resource —
# confirm against the installed NLTK version.
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
text = nltk.corpus.gutenberg.raw('chesterton-thursday.txt')
sents = sent_tokenizer.tokenize(text)  # split the raw text with the Punkt sentence segmenter
sents[171:175]

['In the wild events which were to follow this girl had no\npart at all; he never saw her again until all his tale was over.',
 'And yet, in some indescribable way, she kept recurring like a\nmotive in music through all his mad adventures afterwards, and the\nglory of her strange hair ran like a red thread through those dark\nand ill-drawn tapestries of the night.',
 'For what followed was so\nimprobable, that it might well have been a dream.',
 'When Syme went out into the starlit street, he found it for the\nmoment empty.']

### 3.8.2 词分割，分词(Word Segmentation)(P123)

In [4]:
# Ex3-2
def segment(text, segs):
    """Split ``text`` into words according to a boundary-marker string.

    Args:
        text: The unsegmented character sequence.
        segs: Boundary markers; a ``'1'`` at index ``i`` closes a word right
            after ``text[i]``. Any other character means "no boundary here".

    Returns:
        The pieces of ``text`` in order. Whatever follows the last marked
        boundary is always appended, so the result is never empty.
    """
    pieces = []
    start = 0
    for idx, flag in enumerate(segs):
        if flag != '1':
            continue
        end = idx + 1
        pieces.append(text[start:end])
        start = end
    # Trailing remainder after the final boundary (may be an empty string).
    pieces.append(text[start:])
    return pieces

In [5]:
# Unsegmented sample text used by the word-segmentation examples.
text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"

# Each seg string marks word boundaries for `text`: a '1' at index i ends a
# word after text[i] (see segment()).
seg1 = "0000000000000001000000000010000000000000000100000000000"
print("seg1= ", segment(text, seg1))

seg2 = "0100100100100001001001000010100100010010000100010010000"
print("seg2= ", segment(text, seg2))

seg3 = "0000100100000011001000000110000100010000001100010000001"
print("seg3= ", segment(text, seg3))

seg1=  ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
seg2=  ['do', 'you', 'see', 'the', 'kitty', 'see', 'the', 'doggy', 'do', 'you', 'like', 'the', 'kitty', 'like', 'the', 'doggy']
seg3=  ['doyou', 'see', 'thekitt', 'y', 'see', 'thedogg', 'y', 'doyou', 'like', 'thekitt', 'y', 'like', 'thedogg', 'y']


In [6]:
# Figure 3-6: objective function = size of the lexicon + amount of information
# needed to reconstruct the source text from the lexicon
words = segment(text, seg2)
text_size = len(words)  # number of word tokens in the segmented text
lexicon_size = len(' '.join(list(set(words))))  # characters in the distinct words joined by single spaces
print("目标函数= ",text_size + lexicon_size)

目标函数=  47


In [7]:
# P124 Ex3-3 计算存储词典和重构源文本的成本，计算目标函数，评价分词质量，得分越小越好
def evaluate(text, segs):
    """Score a segmentation of ``text``; smaller scores are better.

    The score is the number of word tokens produced by
    ``segment(text, segs)`` plus the lexicon size, where the lexicon size is
    the character count of the distinct words joined by single spaces.

    Args:
        text: The unsegmented character sequence.
        segs: Boundary-marker string passed through to ``segment``.

    Returns:
        The integer objective value.
    """
    tokens = segment(text, segs)
    lexicon = ' '.join(set(tokens))
    return len(tokens) + len(lexicon)

In [8]:
# Objective score of each candidate segmentation (smaller is better).
print("seg1 目标函数= ",evaluate(text, seg1))
print("seg2 目标函数= ",evaluate(text, seg2))
print("seg3 目标函数= ",evaluate(text, seg3))

seg1 目标函数=  63
seg2 目标函数=  47
seg3 目标函数=  46


In [9]:
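# P125 Ex3-4: non-deterministic search using simulated annealing.
# 1) Start the search from phrase-level segmentations only;
# 2) then randomly flip 0s and 1s, with the number of flips proportional to the "temperature";
# 3) each iteration lowers the temperature, so the boundaries are perturbed less and less.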

from random import randint


def flip(segs, pos):
    """Return a copy of ``segs`` with the binary digit at ``pos`` inverted.

    Args:
        segs: String of ``'0'``/``'1'`` boundary markers.
        pos: Index of the marker to toggle.

    Returns:
        A new string; ``segs`` itself is not modified.
    """
    toggled = str(1 - int(segs[pos]))
    head, tail = segs[:pos], segs[pos + 1:]
    return head + toggled + tail


def flip_n(segs, n):
    """Apply ``n`` single-marker flips at randomly chosen positions.

    Positions are drawn independently with ``randint``, so the same index may
    be picked more than once.

    Args:
        segs: String of ``'0'``/``'1'`` boundary markers.
        n: Number of flips to perform.

    Returns:
        The marker string after all flips have been applied.
    """
    for _ in range(n):
        target = randint(0, len(segs) - 1)
        segs = flip(segs, target)
    return segs


def anneal(text, segs, iterations, cooling_rate):
    """Search for a low-scoring segmentation of ``text`` by simulated annealing.

    The temperature starts at ``len(segs)`` and is divided by ``cooling_rate``
    after every round until it drops to 0.5 or below. In each round,
    ``iterations`` candidates are produced by randomly flipping
    ``round(temperature)`` markers of the current segmentation; the candidate
    with the lowest ``evaluate`` score (if it beats the current one) becomes
    the starting point of the next round. After every round the current score
    and segmentation are printed, followed by a blank line at the end.

    Args:
        text: The unsegmented character sequence.
        segs: Initial string of ``'0'``/``'1'`` boundary markers.
        iterations: Number of random candidates tried per temperature step.
        cooling_rate: Divisor applied to the temperature after each round;
            must be greater than 1 so that the temperature actually decreases.

    Returns:
        The best boundary-marker string found.

    Raises:
        ValueError: If ``cooling_rate`` is not greater than 1. Values in
            ``(0, 1]`` would keep the temperature from ever falling and the
            loop would not terminate.
    """
    if not cooling_rate > 1:
        raise ValueError(
            "cooling_rate must be greater than 1, got %r" % (cooling_rate,)
        )

    temperature = float(len(segs))
    while temperature > 0.5:
        best_segs, best = segs, evaluate(text, segs)
        # The number of flips depends only on the temperature, so compute it
        # once per round instead of once per candidate.
        n_flips = int(round(temperature))
        for _ in range(iterations):
            guess = flip_n(segs, n_flips)
            score = evaluate(text, guess)
            if score < best:
                best_segs, best = guess, score
        segs = best_segs
        temperature = temperature / cooling_rate
        # `best` is already the score of `segs`; no need to re-evaluate.
        print(best, segment(text, segs))
    print()
    return segs

In [10]:
# Run the annealing search starting from the seg1 segmentation.
# NOTE(review): no random seed is set in the visible cells, so the printed
# trace presumably differs between runs — confirm before comparing outputs.
print("anneal(seg1)= ", anneal(text, seg1, 5000, 1.2))

63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
62 ['doyou', 'see', 'the', 'kittys', 'eethedo', 'ggy', 'doyou', 'like', 'the', 'kitty', 'like', 'thedo', 'ggy']
62 ['doyou', 'see', 'the', 'kittys', 'eethedo', 'ggy', 'doyou', 'like', 'the', 'kitty', 'like', 'thedo', 'ggy']
62 ['doyou', 'see', 'the', 'kittys', 'eethedo', 'ggy', 'doyou', 'like', 'the', 'kitty', 'like', 'thedo', 'ggy']
58 ['doyou', 'see', 'the', 'kit', 'ty', 'see', 'thedo', 'ggy', 'doyou', 'like', 'thekitty', 'like', 'thedo', 'ggy']
56 ['doyou', 'se

In [11]:
# A small score does not necessarily correspond to a sensible segmentation,
# which suggests that the evaluation function itself is flawed.
print("anneal(seg2)= ", anneal(text, seg2, 5000, 1.2))

47 ['do', 'you', 'see', 'the', 'kitty', 'see', 'the', 'doggy', 'do', 'you', 'like', 'the', 'kitty', 'like', 'the', 'doggy']
47 ['do', 'you', 'see', 'the', 'kitty', 'see', 'the', 'doggy', 'do', 'you', 'like', 'the', 'kitty', 'like', 'the', 'doggy']
47 ['do', 'you', 'see', 'the', 'kitty', 'see', 'the', 'doggy', 'do', 'you', 'like', 'the', 'kitty', 'like', 'the', 'doggy']
47 ['do', 'you', 'see', 'the', 'kitty', 'see', 'the', 'doggy', 'do', 'you', 'like', 'the', 'kitty', 'like', 'the', 'doggy']
47 ['do', 'you', 'see', 'the', 'kitty', 'see', 'the', 'doggy', 'do', 'you', 'like', 'the', 'kitty', 'like', 'the', 'doggy']
47 ['do', 'you', 'see', 'the', 'kitty', 'see', 'the', 'doggy', 'do', 'you', 'like', 'the', 'kitty', 'like', 'the', 'doggy']
47 ['do', 'you', 'see', 'the', 'kitty', 'see', 'the', 'doggy', 'do', 'you', 'like', 'the', 'kitty', 'like', 'the', 'doggy']
47 ['do', 'you', 'see', 'the', 'kitty', 'see', 'the', 'doggy', 'do', 'you', 'like', 'the', 'kitty', 'like', 'the', 'doggy']
47 ['do'