In [1]:
import nltk
%matplotlib inline

# Chap 3 处理原始文本
1.  如何访问文件内的文本？
2.  如何将文档分割成单独的单词和标点符号，从而进行文本语料上的分析？
3.  如何产生格式化的输出，并把结果保存在文件中？

## 3.6 规范化文本(P115)

### 3.6.1 词干提取器
词干提取器并不完美，提取结果中错误较多（例如下面的输出中，Porter 能把 lying 还原为 lie，而 Lancaster 没有处理 lying）。

In [2]:
# Sample passage used as input by the tokenizing, stemming and lemmatizing cells.
# Note: the literal contains trailing spaces and leading spaces on continuation lines.
raw = """DENNIS: Listen, strange women lying in ponds distributing swords
 is no basis for a system of government.  
 Supreme executive power derives from a mandate from the masses, 
 not from some farcical aquatic ceremony."""

In [3]:
# Split the raw passage into word and punctuation tokens and show the result.
tokens = nltk.word_tokenize(text=raw)
print(tokens)

['DENNIS', ':', 'Listen', ',', 'strange', 'women', 'lying', 'in', 'ponds', 'distributing', 'swords', 'is', 'no', 'basis', 'for', 'a', 'system', 'of', 'government', '.', 'Supreme', 'executive', 'power', 'derives', 'from', 'a', 'mandate', 'from', 'the', 'masses', ',', 'not', 'from', 'some', 'farcical', 'aquatic', 'ceremony', '.']


In [4]:
# The Porter stemmer does somewhat better than the Lancaster stemmer.
# Porter stemmer: stem every token of the passage.
porter = nltk.PorterStemmer()
stem_porter_list = [porter.stem(token) for token in tokens]
print(stem_porter_list)

['denni', ':', 'listen', ',', 'strang', 'women', 'lie', 'in', 'pond', 'distribut', 'sword', 'is', 'no', 'basi', 'for', 'a', 'system', 'of', 'govern', '.', 'suprem', 'execut', 'power', 'deriv', 'from', 'a', 'mandat', 'from', 'the', 'mass', ',', 'not', 'from', 'some', 'farcic', 'aquat', 'ceremoni', '.']


In [5]:
# Lancaster stemmer: stem every token of the passage.
lancaster = nltk.LancasterStemmer()
stem_lancaster_list = [lancaster.stem(token) for token in tokens]
print(stem_lancaster_list)

['den', ':', 'list', ',', 'strange', 'wom', 'lying', 'in', 'pond', 'distribut', 'sword', 'is', 'no', 'bas', 'for', 'a', 'system', 'of', 'govern', '.', 'suprem', 'execut', 'pow', 'der', 'from', 'a', 'mand', 'from', 'the', 'mass', ',', 'not', 'from', 'som', 'farc', 'aqu', 'ceremony', '.']


In [6]:
# Ex3-1: index a text with a stemmer
class IndexedText(object):
    """A token sequence indexed by the lower-cased stem of each token.

    Args:
        stemmer: object exposing ``stem(word)`` that returns a string.
        text: sliceable sequence of token strings.
    """

    def __init__(self, stemmer, text):
        self._text = text
        self._stemmer = stemmer
        # Built from (stem, position) pairs, one pair per token of the text.
        self._index = nltk.Index(
                (self._stem(word), i)
                for (i, word) in enumerate(text)
        )

    def concordance(self, word, width=40):
        """Print one line per indexed position of ``word``'s stem.

        Each line shows the tokens before the match, right-aligned and cut
        to ``width`` characters, followed by the match and the tokens after
        it, left-aligned and cut to ``width`` characters.

        Args:
            word: word to look up; it is stemmed before the lookup.
            width: character width of each of the two context columns.
        """
        key = self._stem(word)
        # Number of tokens taken before the match, and from the match onward.
        wc = int(width / 4)
        for i in self._index[key]:
            # Clamp at 0: a negative slice start would count from the end of
            # the text and lose the left context of matches near the start.
            start = max(0, i - wc)
            l_context = ' '.join(self._text[start:i])
            r_context = ' '.join(self._text[i:i + wc])
            l_display = '{:>{width}}'.format(l_context[-width:], width=width)
            r_display = '{:{width}}'.format(r_context[:width], width=width)
            print(l_display, r_display)

    def _stem(self, word):
        """Return the lower-cased stem of ``word``."""
        return self._stemmer.stem(word).lower()

In [7]:
# Index the tokens of 'grail.txt' with the Porter stemmer, then show the hits for 'em'.
grail = nltk.corpus.webtext.words('grail.txt')
text = IndexedText(stemmer=porter, text=grail)
text.concordance(word='em')

alves of coconut and you ' re bangin ' ' em together . ARTHUR : So ? We have ridd
  built it all the same , just to show ' em . It sank into the swamp . So ,      
     OFFICER # 2 : Come on . Back with ' em . Back . Right . Come along . INSPECT


In [8]:
# Same lookup as the previous cell, but indexed with the Lancaster stemmer.
text = IndexedText(stemmer=lancaster, text=grail)
text.concordance(word='em')

alves of coconut and you ' re bangin ' ' em together . ARTHUR : So ? We have ridd
  built it all the same , just to show ' em . It sank into the swamp . So ,      
     OFFICER # 2 : Come on . Back with ' em . Back . Right . Come along . INSPECT


### 3.6.2 词形归并(P117)
WordNet 词形归并器只有在去掉词缀后得到的词存在于它的词典中时才会删除词缀，即把发生词形变化的单词还原为原形，例如：women 转变成 woman。

In [9]:
# WordNet lemmatizer: lemmatize every token of the passage.
wnl = nltk.WordNetLemmatizer()
wnl_lemma_list = [wnl.lemmatize(token) for token in tokens]
print(wnl_lemma_list)

['DENNIS', ':', 'Listen', ',', 'strange', 'woman', 'lying', 'in', 'pond', 'distributing', 'sword', 'is', 'no', 'basis', 'for', 'a', 'system', 'of', 'government', '.', 'Supreme', 'executive', 'power', 'derives', 'from', 'a', 'mandate', 'from', 'the', 'mass', ',', 'not', 'from', 'some', 'farcical', 'aquatic', 'ceremony', '.']
