In [95]:
"""
数据来源：https://github.com/L1aoXingyu/Char-RNN-Gluon/tree/master/data
"""
import codecs
import collections
import numpy as np
import logging
import time


In [96]:
class TextDataLoader(object):
    """Character-level text loader that maps a corpus to integer indices.

    After ``load_data`` has run, the instance exposes:

    * ``vocab``: the kept characters, ordered by descending frequency.
    * ``vocab_num``: number of kept characters.
    * ``word_to_index`` / ``index_to_word``: character <-> index lookups.
    * ``data``: one list of indices per line of the input file.
    * ``data_size``: number of encoded lines, i.e. ``len(data)``.
    """

    def __init__(self):
        self.word_to_index = None
        self.index_to_word = None
        self.data = None
        self.data_size = 0
        self.vocab = None
        self.vocab_num = 0

    def load_data(self, file_path, max_vocab_num=5000):
        """Read a UTF-8 text file and encode every line character by character.

        Args:
            file_path: Path of the UTF-8 encoded text file to read.
            max_vocab_num: Upper bound on the vocabulary size; only the most
                frequent characters are kept. ``None`` keeps every character.

        Returns:
            None. Results are stored on the instance (see the class docstring).

        Characters that fall outside the kept vocabulary are dropped from the
        encoded lines, so a line may encode to an empty list.
        """
        with codecs.open(file_path, mode='r', encoding='utf-8') as f:
            file_content = f.readlines()

        # Iterating a line yields its individual characters (the line
        # terminator included), so the vocabulary is built per character.
        vocab_count_dict = collections.Counter(
            w for line in file_content for w in line)
        # sorted() is stable: equally frequent characters keep the Counter's
        # iteration order, which makes the index assignment deterministic.
        vocab_count_arr = sorted(
            vocab_count_dict.items(), key=lambda x: x[1], reverse=True)
        real_vocab_num = len(vocab_count_arr)
        if max_vocab_num is not None and real_vocab_num > max_vocab_num:
            vocab_count_arr = vocab_count_arr[:max_vocab_num]

        self.vocab = [x[0] for x in vocab_count_arr]
        self.word_to_index = {w: i for i, w in enumerate(self.vocab)}
        self.index_to_word = dict(enumerate(self.vocab))
        self.vocab_num = len(self.vocab)

        # Out-of-vocabulary characters are skipped rather than mapped to a
        # placeholder index.
        self.data = [[self.word_to_index[w] for w in line if w in self.word_to_index]
                     for line in file_content]
        # Keep the attribute in sync with the value reported in the log line.
        self.data_size = len(self.data)
        logging.info("load data done. real_vocab_num:%d result_vocab_num:%d data_size:%d",
                     real_vocab_num, self.vocab_num, self.data_size)
        
            
        
        

In [97]:


if __name__ == "__main__":
    # Demo driver: build a 100-character vocabulary from poetry.txt and show
    # the resulting mappings plus the first and last encoded lines.
    logger = logging.getLogger()
    # The root logger defaults to WARNING; lower it so the loader's INFO
    # summary is emitted.
    logger.setLevel(logging.DEBUG)

    loader = TextDataLoader()
    loader.load_data("poetry.txt", 100)
    print("word_to_index:%s" % loader.word_to_index)
    print("index_to_word:%s" % loader.index_to_word)
    print(len(loader.data))
    print(loader.data[0])
    print(loader.data[-1])
    # Flush every root handler instead of indexing handlers[0], which raises
    # IndexError when no handler is installed and ignores any additional ones.
    for handler in logger.handlers:
        handler.flush()


INFO:root:load data done. real_vocab_num:5387 result_vocab_num:100 data_size:72514


word_to_index:{'，': 0, '。': 1, '\n': 2, '不': 3, '人': 4, '山': 5, '风': 6, '日': 7, '云': 8, '无': 9, '何': 10, '一': 11, '春': 12, '月': 13, '水': 14, '花': 15, '来': 16, '有': 17, '中': 18, '秋': 19, '上': 20, '时': 21, '天': 22, '归': 23, '心': 24, '相': 25, '此': 26, '年': 27, '生': 28, '长': 29, '夜': 30, '自': 31, '去': 32, '知': 33, '空': 34, '行': 35, '江': 36, '客': 37, '白': 38, '清': 39, '远': 40, '君': 41, '寒': 42, '见': 43, '为': 44, '在': 45, '高': 46, '里': 47, '雨': 48, '下': 49, '路': 50, '落': 51, '处': 52, '如': 53, '多': 54, '未': 55, '明': 56, '别': 57, '门': 58, '草': 59, '树': 60, '色': 61, '青': 62, '城': 63, '入': 64, '声': 65, '新': 66, '出': 67, '还': 68, '思': 69, '南': 70, '深': 71, '林': 72, '应': 73, '得': 74, '流': 75, '道': 76, '独': 77, '朝': 78, '烟': 79, '开': 80, '雪': 81, '千': 82, '家': 83, '是': 84, '事': 85, '尽': 86, '闲': 87, '飞': 88, '望': 89, '谁': 90, '子': 91, '回': 92, '东': 93, '地': 94, '与': 95, '今': 96, '酒': 97, '同': 98, '前': 99}
index_to_word:{0: '，', 1: '。', 2: '\n', 3: '不', 4: '人', 5: '山', 6: '风', 7: '日', 8: '云', 9: '无'