In [1]:
import re
import nltk
%matplotlib inline

# Chap4 编写结构化的程序
1.  怎样才能写出结构良好，可读性强的程序，从而方便重用？
2.  基本的结构块，例如：循环、函数和赋值是如何执行的？
3.  Python 编程的陷阱还有哪些，如何避免它们？

## 4.3 风格的问题(P152)

### 4.3.1 Python代码的风格

Python 代码风格指南（PEP 8）: https://peps.python.org/pep-0008/

In [2]:
from nltk.corpus import brown

# Words read from 'rotokas.dic' through NLTK's toolbox corpus reader.
rotokas_words = nltk.corpus.toolbox.words('rotokas.dic')

# One (match, word) pair for every regex match found inside each word.
cv_words_pairs = [
    (syllable, entry)
    for entry in rotokas_words
    for syllable in re.findall('[ptksvr][aeiou]', entry)
]

# Conditional frequency distribution fed with (genre, word) pairs taken
# from every category of the Brown corpus.
cfd = nltk.ConditionalFreqDist(
    (category, token)
    for category in brown.categories()
    for token in brown.words(categories=category)
)

# Hard-coded spelling variants of "ah"/"ha", kept in their original order.
ha_words = [
    'aaahhhh', 'ah', 'ahah', 'ahahah', 'ahh',
    'ahhahahaha', 'ahhh', 'ahhhh', 'ahhhhhh', 'ahhhhhhhh',
    'ha', 'haaa', 'hah', 'haha', 'hahaaa', 'hahah', 'hahaha',
]

# Starts out empty.
syllables = list()


def process(aList):
    """Placeholder for real processing: ignores ``aList`` and returns None."""
    return None


# Multi-line condition wrapped in parentheses, one clause per line.
# The original last clause compared syllables[2][3] with syllables[1][3]; once
# len(syllables[2]) == 3 holds, index 3 is out of range and evaluating it
# raises IndexError.  Index 2 (the final element, the one checked against the
# vowel list) is presumably what was meant -- TODO confirm.  The added length
# check keeps syllables[1][2] in range as well.
if (
        len(syllables) > 4
        and len(syllables[2]) == 3
        and len(syllables[1]) > 2
        and syllables[2][2] in ['a', 'e', 'i', 'o', 'u']
        and syllables[2][2] == syllables[1][2]
):
    process(syllables)

### 4.3.2 过程风格 与 声明风格(P153)
示例：统计布朗语料库 news 类别中单词的平均长度。

In [3]:
# Procedural style: accumulate the total character length and the token
# count in an explicit for loop, then print the mean token length.
tokens = nltk.corpus.brown.words(categories='news')
total, count = 0, 0
for token in tokens:
    total += len(token)
    count += 1
print('total / count={:.3f}'.format(total / count))

total / count=4.402


In [4]:
# Declarative style: compute the same average with a generator expression.
# token_list is a list comprehension (it materializes every length); nothing
# else in this cell reads it.
token_list = [
        len(t)
        for t in tokens
]
# The generator expression feeds sum() without building an intermediate list.
total = sum(
        len(t)
        for t in tokens
)
print('total / count={:.3f}'.format(total / len(tokens)))

total / count=4.402


In [5]:
# Build a sorted list of the distinct words among the first 5000 tokens.
print("慢速代码")
# Slow version: hand-written insertion into a list that is kept sorted.
word_list = []
i = 0
while i < len(tokens[:5000]):
    j = 0
    # Advance j past every stored word that is <= the current token.
    while j < len(word_list) and word_list[j] <= tokens[i]:
        j += 1
    # Insert only when the token differs from the word just before position j,
    # so a word that is already stored is not added again.
    if j == 0 or tokens[i] != word_list[j - 1]:
        word_list.insert(j, tokens[i])
    i += 1
print(word_list[:75])

# Equivalent code below: shorter and faster.
print("快速代码")
word_list = sorted(set(tokens[:5000]))
print(word_list[:75])

慢速代码
['$1,000', '$10', '$100', '$12', '$15,000,000', '$157,460', '$3', '$30', '$4', '$451,500', '$5,000,000', '$50', '$88,000', '&', "'", "''", '(', ')', ',', '--', '.', '1', '1,119', '10', '100,000', '114', '12', '13', '13th', '150', '17', '17,000', '18', '182', '1913', '1923', '1937', '1958', '1961', '1961-62', '1962', '2', '22', '24', '250', '29-5', '3', '30', '300,000', '31', '4', '4-year', '4.4', '402', '50', '6', '63', '637', '65', '71', '74', '8', '81', '87-31', ':', '?', 'A', 'A.', 'ADC', 'Acting', 'Affairs', 'After', 'Agency', 'Aikin', 'Aj']
快速代码
['$1,000', '$10', '$100', '$12', '$15,000,000', '$157,460', '$3', '$30', '$4', '$451,500', '$5,000,000', '$50', '$88,000', '&', "'", "''", '(', ')', ',', '--', '.', '1', '1,119', '10', '100,000', '114', '12', '13', '13th', '150', '17', '17,000', '18', '182', '1913', '1923', '1937', '1958', '1961', '1961-62', '1962', '2', '22', '24', '250', '29-5', '3', '30', '300,000', '31', '4', '4-year', '4.4', '402', '50', '6', '63', '637', '65', '

In [6]:
# Print the most frequent Brown corpus words together with their cumulative
# share of the text, stopping once the running share exceeds 25%.
fd = nltk.FreqDist(nltk.corpus.brown.words())
most_common_words = [pair[0] for pair in fd.most_common()]
cumulative = 0.0
for rank, word in enumerate(most_common_words):
    cumulative += fd.freq(word)  # share of the whole text taken by this word
    print("%3d %6.2f%% %s" % (rank + 1, cumulative * 100, word))
    if cumulative > 0.25:
        break

  1   5.40% the
  2  10.42% ,
  3  14.67% .
  4  17.78% of
  5  20.19% and
  6  22.40% to
  7  24.29% a
  8  25.97% in


In [7]:
# P155: find the longest word
# Method 1: the strict ">" comparison keeps only the first word that reaches
# the maximum length
text = nltk.corpus.gutenberg.words('milton-paradise.txt')
longest = ''
for word in text:
    if len(word) > len(longest):
        longest = word
print('longest word:{}'.format(longest))

# Alternative below
# Method 2: a generator expression finds the maximum length, then a list
# comprehension collects every word of that length
maxlen = max(len(word) for word in text)
print([word for word in text if len(word) == maxlen])

longest word:unextinguishable
['unextinguishable', 'transubstantiate', 'inextinguishable', 'incomprehensible']


### 4.3.3 计数器（counter）的常规用法

In [8]:
# Slide a window of width n over the sentence to collect the consecutive,
# overlapping 3-grams.
sent = ['The', 'dog', 'gave', 'John', 'the', 'newspaper']
n = 3
print(
    "3-grams= ",
    [sent[start:start + n] for start in range(len(sent) - n + 1)],
)
# Same windows via the NLTK helper
print("3-grams= ", list(nltk.trigrams(sent)))
# 2-grams
print("2-grams= ", list(nltk.bigrams(sent)))
# 4-grams
print("4-grams= ", list(nltk.ngrams(sent, 4)))

3-grams=  [['The', 'dog', 'gave'], ['dog', 'gave', 'John'], ['gave', 'John', 'the'], ['John', 'the', 'newspaper']]
3-grams=  [('The', 'dog', 'gave'), ('dog', 'gave', 'John'), ('gave', 'John', 'the'), ('John', 'the', 'newspaper')]
2-grams=  [('The', 'dog'), ('dog', 'gave'), ('gave', 'John'), ('John', 'the'), ('the', 'newspaper')]
4-grams=  [('The', 'dog', 'gave', 'John'), ('dog', 'gave', 'John', 'the'), ('gave', 'John', 'the', 'newspaper')]


In [9]:
import pprint

# Build an m x n grid with a nested list comprehension.
# set() is called once per cell, so every cell is its own object and the
# add() below changes exactly one of them.
m, n = 3, 7
array = [[set() for _ in range(n)] for _ in range(m)]
array[2][5].add('Alice')
pprint.pprint(array)

[[set(), set(), set(), set(), set(), set(), set()],
 [set(), set(), set(), set(), set(), set(), set()],
 [set(), set(), set(), set(), set(), {'Alice'}, set()]]


In [10]:
# Pitfall: list multiplication repeats references instead of copying objects.
# [set()] * n repeats one set n times and [...] * m repeats that one row m
# times, so the single add() below shows up in every cell of the printed grid.
array = [[set()] * n] * m
array[2][5].add(7)
pprint.pprint(array)

[[{7}, {7}, {7}, {7}, {7}, {7}, {7}],
 [{7}, {7}, {7}, {7}, {7}, {7}, {7}],
 [{7}, {7}, {7}, {7}, {7}, {7}, {7}]]
