# 如何统计序列中元素的频度

# 创建列表

In [1]:
from random import randint

In [2]:
# Build the sample: 30 independent draws from randint(0, 20), both ends inclusive.
# NOTE(review): no seed is set in the visible cells, so the values differ on every run.
data = [
    randint(0, 20)
    for _ in range(30)
]

In [3]:
# Display the generated list so the sample can be inspected.
data

[1,
 5,
 17,
 20,
 9,
 10,
 15,
 16,
 13,
 8,
 5,
 5,
 17,
 1,
 18,
 2,
 13,
 14,
 8,
 14,
 17,
 0,
 3,
 7,
 20,
 20,
 19,
 20,
 5,
 17]

#  方案1：将序列转换为字典{元素:频度}，根据字典中的值排序

In [4]:
# Initialise the tally dict: one key per distinct value in data, every count starting at 0.
d = {value: 0 for value in data}

In [5]:
# Inspect the dict right after initialisation: every distinct value maps to 0.
d

{1: 0,
 5: 0,
 17: 0,
 20: 0,
 9: 0,
 10: 0,
 15: 0,
 16: 0,
 13: 0,
 8: 0,
 18: 0,
 2: 0,
 14: 0,
 0: 0,
 3: 0,
 7: 0,
 19: 0}

In [6]:
# Accumulate frequencies: bump the stored count once for every element of data.
# Every element is already a key of d, so the lookup cannot fail.
for x in data:
    d[x] = d[x] + 1

In [7]:
# Inspect the dict after counting: each key now maps to its number of occurrences in data.
d

{1: 2,
 5: 4,
 17: 4,
 20: 4,
 9: 1,
 10: 1,
 15: 1,
 16: 1,
 13: 2,
 8: 2,
 18: 1,
 2: 1,
 14: 2,
 0: 1,
 3: 1,
 7: 1,
 19: 1}

In [8]:
# Sort (count, element) pairs in descending order.
# Putting the count first makes it the primary sort key; equal counts are ordered by element.
sorted(
    [(count, elem) for elem, count in d.items()],
    reverse=True,
)

[(4, 20),
 (4, 17),
 (4, 5),
 (2, 14),
 (2, 13),
 (2, 8),
 (2, 1),
 (1, 19),
 (1, 18),
 (1, 16),
 (1, 15),
 (1, 10),
 (1, 9),
 (1, 7),
 (1, 3),
 (1, 2),
 (1, 0)]

In [9]:
# Keep only the first three pairs of the descending sort, i.e. the three largest.
sorted(
    [(count, elem) for elem, count in d.items()],
    reverse=True,
)[:3]

[(4, 20), (4, 17), (4, 5)]

In [10]:
# Optimisation: feed sorted() a generator expression instead of a list comprehension,
# so no intermediate list of pairs is built before sorting.
sorted(
    ((count, elem) for elem, count in d.items()),
    reverse=True,
)[:3]

[(4, 20), (4, 17), (4, 5)]

In [11]:
# Optimisation: let heapq select the 3 largest pairs instead of sorting every pair.
import heapq
# zip lazily pairs each count with its element, giving the same (count, element) tuples.
heapq.nlargest(3, zip(d.values(), d.keys()))

[(4, 20), (4, 17), (4, 5)]

# 方案2：使用标准库collections中的Counter对象

In [12]:
from collections import Counter

In [13]:
# Show the same sample list again before counting it with Counter.
data

[1,
 5,
 17,
 20,
 9,
 10,
 15,
 16,
 13,
 8,
 5,
 5,
 17,
 1,
 18,
 2,
 13,
 14,
 8,
 14,
 17,
 0,
 3,
 7,
 20,
 20,
 19,
 20,
 5,
 17]

In [14]:
# Counter builds the element -> count mapping directly from the list in a single call.
Counter(data)

Counter({1: 2,
         5: 4,
         17: 4,
         20: 4,
         9: 1,
         10: 1,
         15: 1,
         16: 1,
         13: 2,
         8: 2,
         18: 1,
         2: 1,
         14: 2,
         0: 1,
         3: 1,
         7: 1,
         19: 1})

In [15]:
# Tally every element of data into a Counter bound to c.
c = Counter()
c.update(data)

In [16]:
# Ask the Counter for the 3 most frequent elements as (element, count) pairs.
# Note the pair order is (element, count), the reverse of the (count, element) tuples used above.
c.most_common(3)

[(5, 4), (17, 4), (20, 4)]

## 词频统计的例子

In [24]:
# Read the whole file into memory at once.
# The context manager closes the file handle as soon as the read finishes, and the
# explicit encoding keeps the result independent of the platform's default locale.
with open('example.txt', encoding='utf-8') as f:
    txt = f.read()

In [25]:
# Display the raw text that was read from the file.
txt

'China\'s recent move to deepen economic structural reform is an attempt to cultivate new quality productive forces and foster high-quality development in the long run, offering increasing growth opportunities for stakeholders both at home and abroad, said experts and entrepreneurs.\n\nThey said that China\'s pro-reform efforts will mainly focus on key areas including technological innovation and industrial upgrading, which will help resolve economic woes, create new growth drivers and inject strong impetus into the world\'s second-largest economy.\n\nTheir comments came after President Xi Jinping said in late May that reform should originate from real needs and address the most urgent issues to improve the socialist market economy, with the president calling for efforts to deepen theoretical innovation and promote institutional innovation during the process of solving practical problems.\n\n"Deepening reform, particularly institutional reform, will help remove obstacles and difficulti

In [26]:
# Extract the words of txt into word_list: every maximal run of word characters
# (letters, digits, underscore) becomes one word.
# re.findall with a raw-string pattern is used instead of re.split('\W+', ...):
# the non-raw '\W' is an invalid escape sequence (a warning on current Python), and
# split yields empty-string "words" when the text starts or ends with punctuation.
import re
word_list = re.findall(r'\w+', txt)

In [27]:
# Display the list of words obtained from the text.
word_list

['China',
 's',
 'recent',
 'move',
 'to',
 'deepen',
 'economic',
 'structural',
 'reform',
 'is',
 'an',
 'attempt',
 'to',
 'cultivate',
 'new',
 'quality',
 'productive',
 'forces',
 'and',
 'foster',
 'high',
 'quality',
 'development',
 'in',
 'the',
 'long',
 'run',
 'offering',
 'increasing',
 'growth',
 'opportunities',
 'for',
 'stakeholders',
 'both',
 'at',
 'home',
 'and',
 'abroad',
 'said',
 'experts',
 'and',
 'entrepreneurs',
 'They',
 'said',
 'that',
 'China',
 's',
 'pro',
 'reform',
 'efforts',
 'will',
 'mainly',
 'focus',
 'on',
 'key',
 'areas',
 'including',
 'technological',
 'innovation',
 'and',
 'industrial',
 'upgrading',
 'which',
 'will',
 'help',
 'resolve',
 'economic',
 'woes',
 'create',
 'new',
 'growth',
 'drivers',
 'and',
 'inject',
 'strong',
 'impetus',
 'into',
 'the',
 'world',
 's',
 'second',
 'largest',
 'economy',
 'Their',
 'comments',
 'came',
 'after',
 'President',
 'Xi',
 'Jinping',
 'said',
 'in',
 'late',
 'May',
 'that',
 'reform'

In [28]:
# Tally every word of word_list into a Counter bound to c2 (counting is case-sensitive).
c2 = Counter()
c2.update(word_list)

In [29]:
# Ask the Counter for the 10 most frequent words as (word, count) pairs.
c2.most_common(10)

[('the', 39),
 ('and', 38),
 ('of', 33),
 ('in', 22),
 ('to', 21),
 ('China', 15),
 ('said', 14),
 ('a', 14),
 ('reform', 12),
 ('new', 11)]