In [2]:
import collections
import re
from d2l import torch as d2l

In [2]:
collections?

[0;31mType:[0m        module
[0;31mString form:[0m <module 'collections' from '/home/zeng/anaconda3/envs/torch/lib/python3.11/collections/__init__.py'>
[0;31mFile:[0m        ~/anaconda3/envs/torch/lib/python3.11/collections/__init__.py
[0;31mDocstring:[0m  
This module implements specialized container datatypes providing
alternatives to Python's general purpose built-in containers, dict,
list, set, and tuple.

* namedtuple   factory function for creating tuple subclasses with named fields
* deque        list-like container with fast appends and pops on either end
* ChainMap     dict-like class for creating a single view of multiple mappings
* Counter      dict subclass for counting hashable objects
* OrderedDict  dict subclass that remembers the order entries were added
* defaultdict  dict subclass that calls a factory function to supply missing values
* UserDict     wrapper around dictionary objects for easier dict subclassing
* UserList     wrapper around list objects for eas

In [3]:
# Register the Time Machine text under the key 'time_machine' as a (URL, hash) pair.
# NOTE(review): the 40-hex-digit string is presumably the SHA-1 checksum that
# d2l.download verifies -- confirm against the d2l implementation.
d2l.DATA_HUB['time_machine'] = (d2l.DATA_URL + 'timemachine.txt', 
                                '090b5e7e70c295757f55df93cb0a180b9691891a')

In [4]:
d2l.DATA_URL

'http://d2l-data.s3-accelerate.amazonaws.com/'

In [6]:
def read_time_machine():
    """Load the Time Machine text as a list of cleaned lines.

    Each line of the file is normalised by replacing every run of
    characters other than ASCII letters with a single space, stripping
    leading/trailing whitespace, and lower-casing the result.

    Returns:
        list[str]: one cleaned string per line of the file; lines that
        contain no letters become empty strings.
    """
    # Pin the encoding so decoding does not depend on the platform's
    # locale default (which is not UTF-8 on every system).
    with open(d2l.download('time_machine'), 'r', encoding='utf-8') as f:
        lines = f.readlines()
    return [re.sub('[^A-Za-z]+', ' ', line).strip().lower() for line in lines]

In [7]:
# Fetch the text through d2l.download and clean it: one string per line of the file.
lines = read_time_machine()

Downloading ../data/timemachine.txt from http://d2l-data.s3-accelerate.amazonaws.com/timemachine.txt...


In [12]:
re.sub??

[0;31mSignature:[0m [0mre[0m[0;34m.[0m[0msub[0m[0;34m([0m[0mpattern[0m[0;34m,[0m [0mrepl[0m[0;34m,[0m [0mstring[0m[0;34m,[0m [0mcount[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m [0mflags[0m[0;34m=[0m[0;36m0[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
[0;32mdef[0m [0msub[0m[0;34m([0m[0mpattern[0m[0;34m,[0m [0mrepl[0m[0;34m,[0m [0mstring[0m[0;34m,[0m [0mcount[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m [0mflags[0m[0;34m=[0m[0;36m0[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34m"""Return the string obtained by replacing the leftmost[0m
[0;34m    non-overlapping occurrences of the pattern in string by the[0m
[0;34m    replacement repl.  repl can be either a string or a callable;[0m
[0;34m    if a string, backslash escapes in it are processed.  If it is[0m
[0;34m    a callable, it's passed the Match object and must return[0m
[0;34m    a replacement string to be used."""[0m[0;34m[0m
[0;34m[0m    [0;

In [14]:
# Report how many lines were read from the file.
print(f'#文本总行数：{len(lines)}')

#文本总行数：3221


In [19]:
# Peek at the first cleaned line and check the element type of `lines`.
print(lines[0])
print(type(lines[10]))

the time machine by h g wells
<class 'str'>


In [20]:
def tokensize(lines, token='word'):
    """Split every text line into a list of tokens.

    Args:
        lines: iterable of strings, one per text line.
        token: 'word' to split each line on whitespace, or 'char' to split
            each line into its individual characters.

    Returns:
        list[list[str]]: one token list per input line.

    Raises:
        ValueError: if ``token`` is neither 'word' nor 'char'.
    """
    if token == 'word':
        return [line.split() for line in lines]  # whitespace-separated words
    if token == 'char':
        return [list(line) for line in lines]  # individual characters
    # Fail loudly: printing a message and implicitly returning None lets the
    # mistake propagate silently into whatever consumes the result.
    raise ValueError(f'error: unknown token type: {token!r}')

In [21]:
# Word-level tokenization (default token='word'): one list of words per line.
token = tokensize(lines)

In [40]:
# Vocabulary: token-frequency table plus token <-> index lookup maps
class Vocab:
    """Vocabulary mapping tokens to integer indices and back.

    Index 0 is always the unknown token '<unk>', followed by any reserved
    tokens, followed by corpus tokens in order of decreasing frequency.
    """

    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        """Build the vocabulary.

        Args:
            tokens: flat list of tokens or list of token lists; passed to
                ``count_corpus``. ``None`` is treated as an empty corpus.
            min_freq: tokens occurring fewer than this many times are left
                out of the index maps.
            reserved_tokens: extra tokens placed right after '<unk>';
                ``None`` means no reserved tokens.
        """
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []
        counter = count_corpus(tokens)  # token -> occurrence count
        # (token, count) pairs sorted by count, most frequent first.
        self._token_freqs = sorted(counter.items(), key=lambda x: x[1], reverse=True)
        # list() lets callers pass any iterable (e.g. a tuple) of reserved tokens.
        self.idx_to_token = ['<unk>'] + list(reserved_tokens)
        self.token_to_idx = {token: idx for idx, token in enumerate(self.idx_to_token)}
        for token, freq in self._token_freqs:
            if freq < min_freq:
                # Frequencies are sorted in descending order, so every
                # remaining token is also below the threshold.
                break
            if token not in self.token_to_idx:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token) - 1

    def __len__(self):
        """Return the number of entries, including '<unk>' and reserved tokens."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Look up the index of a token, or of each token in a list/tuple.

        Tokens that are not in the vocabulary map to ``self.unk``.
        A list or tuple (possibly nested) yields a list of the same shape.
        """
        if not isinstance(tokens, (list, tuple)):  # a single token
            # Unknown tokens must fall back to the <unk> index.
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """Return the token for an index, or a list of tokens for a list/tuple of indices."""
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

    @property
    def unk(self):
        """Index of the unknown token '<unk>'."""
        return 0

    @property
    def token_freqs(self):
        """List of (token, count) pairs sorted by decreasing count."""
        return self._token_freqs
    

            

In [50]:
def count_corpus(tokens):
    """Count how often each token occurs.

    Accepts either a flat list of tokens or a list of token lists; an empty
    input or one whose first element is a list is flattened one level
    before counting.

    Returns:
        collections.Counter mapping token -> number of occurrences.
    """
    nested = len(tokens) == 0 or isinstance(tokens[0], list)
    if not nested:
        # Already a flat sequence of tokens.
        return collections.Counter(tokens)
    freqs = collections.Counter()
    for line in tokens:
        for word in line:
            freqs[word] += 1
    return freqs

In [51]:
# Build the word-level vocabulary from the tokenized lines.
vocab = Vocab(tokens=token)

In [52]:
# Show the first ten (token, index) pairs: '<unk>' first, then tokens by decreasing frequency.
print(list(vocab.token_to_idx.items())[:10])

[('<unk>', 0), ('the', 1), ('i', 2), ('and', 3), ('of', 4), ('a', 5), ('to', 6), ('was', 7), ('in', 8), ('that', 9)]


In [53]:
sorted?

[0;31mSignature:[0m [0msorted[0m[0;34m([0m[0miterable[0m[0;34m,[0m [0;34m/[0m[0;34m,[0m [0;34m*[0m[0;34m,[0m [0mkey[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mreverse[0m[0;34m=[0m[0;32mFalse[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Return a new list containing all items from the iterable in ascending order.

A custom key function can be supplied to customize the sort order, and the
reverse flag can be set to request the result in descending order.
[0;31mType:[0m      builtin_function_or_method

In [55]:
# Scratch check: `in` / `not in` on a dict tests its keys, so 1 is found here.
m = {1:4, 2:2, 3:3}
if 1 not in m:
    print(10)
else :
    print(20)

20


In [57]:
# For lines 0 and 10, print the word tokens next to their vocabulary indices.
for i in [0, 10]:
    print('文本：', token[i])
    print('索引:', vocab[token[i]])

文本： ['the', 'time', 'machine', 'by', 'h', 'g', 'wells']
索引: [1, 19, 50, 40, 2183, 2184, 400]
文本： ['twinkled', 'and', 'his', 'usually', 'pale', 'face', 'was', 'flushed', 'and', 'animated', 'the']
索引: [2186, 3, 25, 1044, 362, 113, 7, 1421, 3, 1045, 1]


In [58]:
isinstance?

[0;31mSignature:[0m [0misinstance[0m[0;34m([0m[0mobj[0m[0;34m,[0m [0mclass_or_tuple[0m[0;34m,[0m [0;34m/[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Return whether an object is an instance of a class or of a subclass thereof.

A tuple, as in ``isinstance(x, (A, B, ...))``, may be given as the target to
check against. This is equivalent to ``isinstance(x, A) or isinstance(x, B)
or ...`` etc.
[0;31mType:[0m      builtin_function_or_method

In [61]:
def load_corpus_time_machine(max_tokens=-1):
    """Return the character-level index corpus and vocabulary of the text.

    Args:
        max_tokens: when positive, keep only the first ``max_tokens``
            entries of the corpus; otherwise keep everything.

    Returns:
        tuple: ``(corpus, vocab)`` where ``corpus`` is a flat list of
        vocabulary indices (one per character) and ``vocab`` is the
        ``Vocab`` built from those characters.
    """
    char_lines = tokensize(read_time_machine(), 'char')
    vocab = Vocab(char_lines)
    # Flatten all lines into a single sequence of indices.
    corpus = []
    for chars in char_lines:
        corpus.extend(vocab[ch] for ch in chars)
    if max_tokens > 0:
        return corpus[:max_tokens], vocab
    return corpus, vocab

In [62]:
# Character-level corpus (flat list of indices) and its vocabulary; rebinds `vocab`.
corpus, vocab = load_corpus_time_machine()

In [63]:
# Number of characters in the corpus and number of vocabulary entries.
len(corpus), len(vocab)

(170580, 28)

In [65]:
# Scratch check: list() applied to a string yields its individual characters.
l = ['2134', 'cas', 'fasdf']
list(l[0])

['2', '1', '3', '4']