In [1]:
import collections
import re


def read_time_machine(path='local_poem.txt'):
    """Load a text file and normalize each line to lowercase letters and spaces.

    For every line, each run of characters other than ASCII letters is
    replaced by a single space, surrounding whitespace is stripped, and the
    result is lowercased. A line containing no letters becomes ''.

    Args:
        path: File to read, decoded as UTF-8. Defaults to 'local_poem.txt',
            the file this function originally had hard-coded.

    Returns:
        A list with one normalized string per line of the file.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Compile once; the same pattern is applied to every line.
    non_letters = re.compile('[^A-Za-z]+')
    with open(path, 'r', encoding='utf-8') as f:
        # Iterating the handle yields the same lines as readlines() without
        # first materializing the raw text as a separate list.
        return [non_letters.sub(' ', line).strip().lower() for line in f]


# Load the normalized corpus, report how many lines it has, and show two samples.
lines = read_time_machine()
print(f'text lines: {len(lines)}')
for sample_idx in (0, 10):
    print(lines[sample_idx])

text lines: 75
in a quiet corner of the world there lay an ancient forest its edges blurred by mist and time few dared to enter for tales spoke of strange lights haunting sounds and paths that twisted back on themselves but for leo a curious fifteen year old with a heart full of wonder it was an irresistible mystery
as he watched the images shifted now he saw a young girl her eyes as bright as the stars running through the very same forest she seemed to be searching for something her expression filled with determination leo felt a strange connection to her as if their fates were intertwined


In [2]:
def tokenize(lines, token='word'):
    """Split every text line into word tokens or character tokens.

    Args:
        lines: Iterable of strings, one per text line.
        token: 'word' splits each line on whitespace; 'char' breaks each
            line into its individual characters.

    Returns:
        A list holding one token list per input line, or None (after
        printing an error message) when `token` is neither 'word' nor 'char'.
    """
    if token == 'word':
        return [text.split() for text in lines]
    if token == 'char':
        return [list(text) for text in lines]

    # Unknown token type: report it and hand back None instead of raising.
    print('错误：未知令牌类型：' + token)
    return None


# Word-tokenize the corpus and display the token lists of the first 11 lines.
tokens = tokenize(lines)
for line_tokens in tokens[:11]:
    print(line_tokens)

['in', 'a', 'quiet', 'corner', 'of', 'the', 'world', 'there', 'lay', 'an', 'ancient', 'forest', 'its', 'edges', 'blurred', 'by', 'mist', 'and', 'time', 'few', 'dared', 'to', 'enter', 'for', 'tales', 'spoke', 'of', 'strange', 'lights', 'haunting', 'sounds', 'and', 'paths', 'that', 'twisted', 'back', 'on', 'themselves', 'but', 'for', 'leo', 'a', 'curious', 'fifteen', 'year', 'old', 'with', 'a', 'heart', 'full', 'of', 'wonder', 'it', 'was', 'an', 'irresistible', 'mystery']
[]
['one', 'crisp', 'morning', 'with', 'the', 'sun', 'just', 'peeking', 'over', 'the', 'horizon', 'leo', 'slipped', 'out', 'of', 'his', 'small', 'village', 'the', 'grass', 'under', 'his', 'feet', 'was', 'damp', 'and', 'the', 'air', 'carried', 'the', 'sweet', 'scent', 'of', 'wildflowers', 'as', 'he', 'approached', 'the', 'forest', 'the', 'mist', 'seemed', 'to', 'part', 'as', 'if', 'inviting', 'him', 'in']
[]
['the', 'first', 'step', 'inside', 'was', 'like', 'crossing', 'a', 'threshold', 'into', 'another', 'world', 'the',

In [3]:
class Vocab:
    """Bidirectional mapping between tokens and integer indices.

    Index 0 is always the unknown token '<unk>'. Reserved tokens follow in the
    order given, then corpus tokens in descending frequency order.

    Attributes:
        token_freqs: List of (token, count) pairs sorted by descending count.
        unk: Index returned for tokens that are not in the vocabulary (0).
        idx_to_token: List whose position i holds the token with index i.
        token_to_idx: Dict mapping each token to its index.
    """

    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        """Build the vocabulary from a corpus.

        Args:
            tokens: Flat list of tokens or list of token lists. None is
                treated as an empty corpus.
            min_freq: Corpus tokens occurring fewer than this many times are
                left out of the vocabulary.
            reserved_tokens: Tokens that get an index regardless of their
                frequency. None means no reserved tokens.
        """
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []
        counter = self.count_corpus(tokens)
        # Descending by count; tokens with equal counts keep first-seen order,
        # exactly like a stable sort of counter.items() on the count.
        self.token_freqs = counter.most_common()

        self.unk = 0
        self.idx_to_token, self.token_to_idx = [], {}
        frequent_tokens = (token for token, freq in self.token_freqs
                           if freq >= min_freq)
        for token in ['<unk>', *reserved_tokens, *frequent_tokens]:
            # Dict membership keeps construction linear in vocabulary size and
            # guarantees every token owns exactly one index, even when a
            # reserved token repeats '<unk>' or another reserved token.
            if token in self.token_to_idx:
                continue
            self.token_to_idx[token] = len(self.idx_to_token)
            self.idx_to_token.append(token)

    def __len__(self):
        """Return the number of indices in the vocabulary."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Map a token, or a (possibly nested) list/tuple of tokens, to indices.

        Tokens missing from the vocabulary map to `self.unk`.
        """
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """Map an index, or a flat list/tuple of indices, back to tokens."""
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

    def count_corpus(self, tokens):
        """Count how often each token occurs.

        Args:
            tokens: Flat list of tokens, or a list of token lists. The input
                is flattened when it is empty or its first element is a list.

        Returns:
            A collections.Counter mapping token to occurrence count.
        """
        if len(tokens) == 0 or isinstance(tokens[0], list):
            tokens = [token for line in tokens for token in line]
        return collections.Counter(tokens)

In [5]:
# Build the vocabulary from the tokenized corpus and show its first ten
# (token, index) pairs.
vocab = Vocab(tokens)
first_entries = list(vocab.token_to_idx.items())[:10]
print(first_entries)

[('<unk>', 0), ('the', 1), ('a', 2), ('he', 3), ('to', 4), ('of', 5), ('and', 6), ('in', 7), ('was', 8), ('leo', 9)]


# 将每一条文本行转换成一个数字索引表

In [6]:
# For two sample lines, show the word tokens next to their vocabulary indices.
for line_no in (0, 10):
    sample_tokens = tokens[line_no]
    print('words:', sample_tokens)
    print('indices:', vocab[sample_tokens])

words: ['in', 'a', 'quiet', 'corner', 'of', 'the', 'world', 'there', 'lay', 'an', 'ancient', 'forest', 'its', 'edges', 'blurred', 'by', 'mist', 'and', 'time', 'few', 'dared', 'to', 'enter', 'for', 'tales', 'spoke', 'of', 'strange', 'lights', 'haunting', 'sounds', 'and', 'paths', 'that', 'twisted', 'back', 'on', 'themselves', 'but', 'for', 'leo', 'a', 'curious', 'fifteen', 'year', 'old', 'with', 'a', 'heart', 'full', 'of', 'wonder', 'it', 'was', 'an', 'irresistible', 'mystery']
indices: [7, 2, 221, 122, 5, 1, 123, 81, 82, 83, 84, 19, 24, 222, 223, 63, 85, 6, 32, 124, 224, 4, 225, 33, 226, 86, 5, 87, 227, 228, 229, 6, 125, 13, 230, 48, 23, 231, 22, 33, 9, 2, 232, 233, 234, 235, 12, 2, 64, 126, 5, 236, 18, 8, 83, 237, 127]
words: ['as', 'he', 'watched', 'the', 'images', 'shifted', 'now', 'he', 'saw', 'a', 'young', 'girl', 'her', 'eyes', 'as', 'bright', 'as', 'the', 'stars', 'running', 'through', 'the', 'very', 'same', 'forest', 'she', 'seemed', 'to', 'be', 'searching', 'for', 'something',