In [1]:
import collections
import re

from d2l import mxnet as d2l

In [2]:
# Text processing on H. G. Wells' The Time Machine

# Register the dataset under the key "time_machine". The first element is the
# download URL; the second is presumably the checksum d2l uses to validate the
# downloaded file (NOTE(review): confirm against d2l.download).
d2l.DATA_HUB["time_machine"] = (
    d2l.DATA_URL + "timemachine.txt",
    "090b5e7e70c295757f55df93cb0a180b9691891a",
)


def read_time_machine():
    """
    Load The Time Machine and return it as a list of cleaned text lines.

    Each raw line is normalised by replacing every run of characters outside
    A-Z / a-z with a single space, stripping leading/trailing whitespace and
    lower-casing the result. Lines that contain no letters therefore become
    empty strings; they are kept so the line count matches the raw file.

    Returns:
        list[str]: one cleaned string per line of the downloaded file.
    """
    # NOTE(review): the file is opened with the platform's default text
    # encoding because no `encoding=` is given.
    with open(d2l.download("time_machine"), "r") as f:
        raw_lines = f.readlines()

    cleaned = []
    for raw in raw_lines:
        letters_only = re.sub("[^A-Za-z]+", " ", raw)
        cleaned.append(letters_only.strip().lower())
    return cleaned


# Load the cleaned corpus, report its size, and spot-check lines 0 and 10
lines = read_time_machine()
print(f"Number of text lines: {len(lines)}")
for line_idx in (0, 10):
    print(lines[line_idx])

Number of text lines: 3221
the time machine by h g wells
twinkled and his usually pale face was flushed and animated the


In [3]:
# Tokenization
def tokenize(lines, token="word"):
    """
    Split text lines into word or character tokens.

    Args:
        lines: iterable of strings, one per text line.
        token: "word" to split each line on whitespace, or "char" to split
            each line into its individual characters (spaces included).

    Returns:
        list[list[str]]: one token list per input line; an empty line yields
        an empty list.

    Raises:
        ValueError: if `token` is neither "word" nor "char". Failing loudly
            here avoids handing `None` to downstream code, which would only
            break later with an unrelated-looking error.
    """
    if token == "word":
        return [line.split() for line in lines]
    if token == "char":
        return [list(line) for line in lines]
    raise ValueError(f"unknown token type: {token!r} (expected 'word' or 'char')")


# Tokenize by word (the default) and preview the first 11 lines. Iterating
# over a slice avoids manual indexing and does not fail on a shorter corpus.
tokens = tokenize(lines)
for line_tokens in tokens[:11]:
    print(line_tokens)

['the', 'time', 'machine', 'by', 'h', 'g', 'wells']
[]
[]
[]
[]
['i']
[]
[]
['the', 'time', 'traveller', 'for', 'so', 'it', 'will', 'be', 'convenient', 'to', 'speak', 'of', 'him']
['was', 'expounding', 'a', 'recondite', 'matter', 'to', 'us', 'his', 'grey', 'eyes', 'shone', 'and']
['twinkled', 'and', 'his', 'usually', 'pale', 'face', 'was', 'flushed', 'and', 'animated', 'the']


In [5]:
# Let's build the vocabulary!
class Vocab:
    """
    Vocabulary for text: a two-way mapping between tokens and integer indices.

    Index 0 is always the unknown token "<unk>". It is followed by the
    reserved tokens in the order given, then by the corpus tokens whose
    frequency is at least `min_freq`, ordered by descending frequency.
    Every token appears exactly once, at its first position in that order.

    Attributes:
        token_freq: list of (token, count) pairs sorted by descending count.
        unk: index returned for tokens that are not in the vocabulary (0).
        idx_to_token: list mapping index -> token.
        token_to_idx: dict mapping token -> index.
    """

    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        """
        Build the vocabulary.

        Args:
            tokens: a flat list of tokens or a list of token lists; anything
                accepted by `count_corpus`. None means an empty corpus.
            min_freq: corpus tokens seen fewer times than this are left out
                and therefore map to the unknown index.
            reserved_tokens: iterable of extra tokens placed directly after
                "<unk>". None means no reserved tokens.
        """
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []

        counter = count_corpus(tokens)
        self.token_freq = sorted(counter.items(), key=lambda x: x[1], reverse=True)

        # The unknown token always has index 0
        self.unk = 0
        candidates = ["<unk>"] + list(reserved_tokens)
        candidates += [token for token, freq in self.token_freq if freq >= min_freq]

        self.idx_to_token, self.token_to_idx = [], {}
        for token in candidates:
            # Skip repeats (e.g. "<unk>" passed as a reserved token, or a
            # reserved token that also occurs in the corpus) so the two
            # mappings stay exact inverses and "<unk>" keeps index 0.
            if token in self.token_to_idx:
                continue
            self.token_to_idx[token] = len(self.idx_to_token)
            self.idx_to_token.append(token)

    def __len__(self):
        """Return the number of distinct tokens, including "<unk>"."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """
        Map a token, or a (possibly nested) list/tuple of tokens, to indices.

        A list or tuple argument is always treated as a sequence of tokens,
        never as a single token. Unknown tokens map to `self.unk`.
        """
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """Map an index, or a list/tuple of indices, back to token(s)."""
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]


def count_corpus(tokens):
    """
    Count token frequency.

    `tokens` is either a flat list of tokens or a list of token lists (one
    per text line). Input is treated as nested when it is empty or when its
    first element is a `list`; nested input is flattened before counting.
    Only `list` marks nesting, so a flat list of tuple tokens is counted
    as-is.

    Returns:
        collections.Counter: mapping token -> number of occurrences.
    """
    is_flat = len(tokens) != 0 and not isinstance(tokens[0], list)
    if is_flat:
        return collections.Counter(tokens)

    # Concatenate the per-line token lists into one flat list
    flattened = []
    for line in tokens:
        flattened.extend(line)
    return collections.Counter(flattened)

In [6]:
# Build the word-level vocabulary and show its first ten (token, index) pairs
vocab = Vocab(tokens)
print([*vocab.token_to_idx.items()][:10])

[('<unk>', 0), ('the', 1), ('i', 2), ('and', 3), ('of', 4), ('a', 5), ('to', 6), ('was', 7), ('in', 8), ('that', 9)]


In [7]:
# Show two sample lines next to their vocabulary indices
for line_idx in (0, 10):
    sample = tokens[line_idx]
    print("words: ", sample)
    print("indices: ", vocab[sample])

words:  ['the', 'time', 'machine', 'by', 'h', 'g', 'wells']
indices:  [1, 19, 50, 40, 2183, 2184, 400]
words:  ['twinkled', 'and', 'his', 'usually', 'pale', 'face', 'was', 'flushed', 'and', 'animated', 'the']
indices:  [2186, 3, 25, 1044, 362, 113, 7, 1421, 3, 1045, 1]


In [8]:
def load_corpus_time_machine(max_tokens=-1):
    """
    Return token indices and the vocabulary of the time machine dataset.

    The text is tokenized at character level and a `Vocab` is built from
    those characters.

    Args:
        max_tokens: when greater than 0, keep only the first `max_tokens`
            indices of the corpus; any other value keeps the whole corpus.

    Returns:
        tuple: (corpus, vocab), where `corpus` is one flat list of character
        indices and `vocab` is the `Vocab` that produced them.
    """
    char_lines = tokenize(read_time_machine(), "char")
    vocab = Vocab(char_lines)

    # Text lines are not necessarily sentences or paragraphs, so all lines
    # are concatenated into one flat index sequence.
    corpus = []
    for line in char_lines:
        corpus.extend(vocab[line])

    truncated = corpus[:max_tokens] if max_tokens > 0 else corpus
    return truncated, vocab

# Load the full character-level corpus. Note: this rebinds `vocab`, which an
# earlier cell set to the word-level vocabulary, to the character-level one.
corpus, vocab  = load_corpus_time_machine()

In [13]:
# Inspect the character -> index mapping of the character-level vocabulary
vocab.token_to_idx

{'<unk>': 0,
 ' ': 1,
 'e': 2,
 't': 3,
 'a': 4,
 'i': 5,
 'n': 6,
 'o': 7,
 's': 8,
 'h': 9,
 'r': 10,
 'd': 11,
 'l': 12,
 'm': 13,
 'u': 14,
 'c': 15,
 'f': 16,
 'w': 17,
 'g': 18,
 'y': 19,
 'p': 20,
 'b': 21,
 'v': 22,
 'k': 23,
 'x': 24,
 'z': 25,
 'j': 26,
 'q': 27}

In [14]:
# First four corpus entries: the indices of 't', 'h', 'e' and ' '
corpus[:4]  # 'the '

[3, 9, 2, 1]

In [15]:
# Number of character tokens in the corpus, and vocabulary size (counts "<unk>")
len(corpus), len(vocab)

(170580, 28)