In [None]:
# Rebuilding the dataset.
# First, save the IDs of the existing tweets together with their labels.
# This cell sets the filenames used below; it mirrors the earlier notebook,
# with one extra file that stores the (tweet ID, label) pairs.
import json
import os

input_filename = os.path.join(os.path.expanduser("~"), "Data", "twitter", "python_tweets.json")
labels_filename = os.path.join(os.path.expanduser("~"), "Data", "twitter", "python_classes.json")
replicable_dataset = os.path.join(os.path.expanduser("~"), "Data", "twitter", "replicable_dataset.json")

# Load the tweets: one JSON object per non-blank line (blank lines are skipped).
tweets = []
with open(input_filename) as inf:
    for line in inf:
        if not line.strip():
            continue
        tweets.append(json.loads(line))

# Load the labels. Opening the file directly means a missing labels file
# raises FileNotFoundError right here, instead of failing further down when
# `labels` is first used.
with open(labels_filename) as inf:
    labels = json.load(inf)

# Walk the tweets and their labels in lockstep and build the new dataset as a
# list of (tweet ID, label) pairs. zip() stops at the shorter sequence, so any
# tweets beyond the last label are left out.
dataset = [(tweet['id'], label) for tweet, label in zip(tweets, labels)]

# Finally, save the result to a file.
with open(replicable_dataset, 'w') as outf:
    json.dump(dataset, outf)
# With the tweet IDs and labels saved, the dataset can be rebuilt.

In [None]:
# Rebuild the dataset: re-download each tweet by ID and keep the labels of the
# tweets that could still be retrieved.
import json
import os

import twitter

tweet_filename = os.path.join(os.path.expanduser("~"), "Data", "twitter", "replicable_python_tweets.json")
labels_filename = os.path.join(os.path.expanduser("~"), "Data", "twitter", "replicable_python_classes.json")
replicable_dataset = os.path.join(os.path.expanduser("~"), "Data", "twitter", "replicable_dataset.json")

# The lookup call below is issued with at most this many IDs per request.
BATCH_SIZE = 100

# Load the (tweet ID, label) pairs from the JSON file.
with open(replicable_dataset) as inf:
    tweet_ids = json.load(inf)

# Only the labels of tweets we can actually fetch again should be written out.
# `actual_labels` collects those labels; `label_mapping` maps each tweet ID to
# its label so it can be looked up when a tweet comes back.
actual_labels = []
label_mapping = dict(tweet_ids)

# Next, fetch the tweets by ID with the twitter library (this may take a
# while). Build the authorization token and use it to create the client.
consumer_key = "<Your Consumer Key Here>"
consumer_secret = "<Your Consumer Secret Here>"
access_token = "<Your Access Token Here>"
access_token_secret = "<Your Access Token Secret Here>"
authorization = twitter.OAuth(access_token, access_token_secret, consumer_key, consumer_secret)
t = twitter.Twitter(auth=authorization)

# Extract every tweet ID.
all_ids = [tweet_id for tweet_id, label in tweet_ids]

# NOTE(review): the tweets file is opened in append mode while the labels file
# below is overwritten, so re-running this cell leaves duplicate tweets next to
# a single set of labels -- delete the tweets file before a re-run.
with open(tweet_filename, 'a') as output_file:
    # Process the IDs one batch at a time.
    for start_index in range(0, len(all_ids), BATCH_SIZE):
        # Join this batch of IDs with commas for the lookup-by-ID call.
        id_string = ",".join(str(i) for i in all_ids[start_index:start_index + BATCH_SIZE])
        # Call statuses/lookup with the batch of IDs (as a string) to fetch
        # those tweets.
        search_results = t.statuses.lookup(_id=id_string)
        for tweet in search_results:
            if 'text' in tweet:
                output_file.write(json.dumps(tweet))
                output_file.write("\n\n")
                # Record the label of the tweet just written, looked up by
                # tweet ID in label_mapping, so labels stay aligned with the
                # tweets in the output file.
                actual_labels.append(label_mapping[tweet['id']])

with open(labels_filename, 'w') as outf:
    json.dump(actual_labels, outf)



In [2]:
# Count how often each whitespace-separated token occurs in the lower-cased
# verse and show the ten most frequent ones. Punctuation stays attached to the
# tokens because the text is split on whitespace only.
from collections import Counter

verse = """Three Rings for the Elven-kings under the sky,
Seven for the Dwarf-lords in halls of stone,
Nine for Mortal Men, doomed to die,
One for the Dark Lord on his dark throne
In the Land of Mordor where the Shadows lie.
One Ring to rule them all, One Ring to find them,
One Ring to bring them all and in the darkness bind them.
In the Land of Mordor where the Shadows lie. """
s = verse.lower()

words = s.split()

# Tally the tokens in order of first appearance.
c = Counter()
c.update(words)

c.most_common(10)

[('the', 9),
 ('for', 4),
 ('in', 4),
 ('to', 4),
 ('one', 4),
 ('of', 3),
 ('ring', 3),
 ('dark', 2),
 ('land', 2),
 ('mordor', 2)]