## 文本预处理

介绍一些常见的文本预处理步骤
+ 将文本作为字符串加载到内存中。

+ 将字符串拆分为词元（如单词和字符）。

+ 建立一个词表，将拆分的词元映射到数字索引。

+ 将文本转换为数字索引序列，方便模型操作。

In [2]:
import collections # PY 的一个集合库
import re
from d2l import torch as d2l


## 读取数据集



In [3]:
#@save
# Register the dataset under the key 'time_machine' as a (URL, hash) pair.
# NOTE(review): the 40-hex-digit string is presumably the SHA-1 checksum that
# d2l.download uses to validate the file — confirm against the d2l source.
d2l.DATA_HUB['time_machine'] = (d2l.DATA_URL + 'timemachine.txt',
                                '090b5e7e70c295757f55df93cb0a180b9691891a')

def read_time_machine():  #@save
    """Load the time machine dataset as a list of cleaned text lines.

    Returns:
        list[str]: one entry per line of the downloaded file. Every run of
        non-letter characters is replaced by a single space, then the line
        is stripped and lower-cased, so blank lines become ''.
    """
    path = d2l.download('time_machine')
    cleaned = []
    with open(path, 'r') as f:
        for raw_line in f:
            # Keep only ASCII letters; punctuation and digits collapse to a space.
            letters_only = re.sub('[^A-Za-z]+', ' ', raw_line)
            cleaned.append(letters_only.strip().lower())
    return cleaned

lines = read_time_machine()
print(f'# 文本总行数: {len(lines)}')  # total number of text lines
print(lines[0])   # first line of the text
print(lines[10])  # eleventh line of the text
lines  # notebook cell result: display the whole list of cleaned lines

# 文本总行数: 3221
the time machine by h g wells
twinkled and his usually pale face was flushed and animated the


['the time machine by h g wells',
 '',
 '',
 '',
 '',
 'i',
 '',
 '',
 'the time traveller for so it will be convenient to speak of him',
 'was expounding a recondite matter to us his grey eyes shone and',
 'twinkled and his usually pale face was flushed and animated the',
 'fire burned brightly and the soft radiance of the incandescent',
 'lights in the lilies of silver caught the bubbles that flashed and',
 'passed in our glasses our chairs being his patents embraced and',
 'caressed us rather than submitted to be sat upon and there was that',
 'luxurious after dinner atmosphere when thought roams gracefully',
 'free of the trammels of precision and he put it to us in this',
 'way marking the points with a lean forefinger as we sat and lazily',
 'admired his earnestness over this new paradox as we thought it',
 'and his fecundity',
 '',
 'you must follow me carefully i shall have to controvert one or two',
 'ideas that are almost universally accepted the geometry for',
 'instance the

## 词元化
下面的tokenize函数将文本行列表（lines）作为输入， 列表中的每个元素是一个文本序列（如一条文本行）。 **每个文本序列又被拆分成一个词元列表，词元（token）是文本的基本单位。** 最后，返回一个由词元列表组成的列表，其中的每个词元都是一个字符串（string）。


tokenize总结：https://cloud.tencent.com/developer/article/1865689

In [4]:
def tokenize(lines,token='word'):
    """Split every text line into a list of tokens.

    Args:
        lines: list of text lines (strings).
        token: 'word' splits each line on whitespace; 'char' splits each
            line into its individual characters.

    Returns:
        A list holding one token list per input line.

    Raises:
        ValueError: if `token` is neither 'word' nor 'char'.
    """
    if token == 'word':
        # Each whitespace-separated word is one token.
        return [line.split() for line in lines]
    if token == 'char':
        # Each character (spaces included) is one token.
        return [list(line) for line in lines]
    # Fail loudly: printing and returning None would only surface later as a
    # confusing error far away from the actual mistake.
    raise ValueError(f'错误：未知词元类型：{token!r}')

tokens = tokenize(lines)  # word-level tokens: one token list per text line
tokens  # notebook cell result: display the nested token lists

[['the', 'time', 'machine', 'by', 'h', 'g', 'wells'],
 [],
 [],
 [],
 [],
 ['i'],
 [],
 [],
 ['the',
  'time',
  'traveller',
  'for',
  'so',
  'it',
  'will',
  'be',
  'convenient',
  'to',
  'speak',
  'of',
  'him'],
 ['was',
  'expounding',
  'a',
  'recondite',
  'matter',
  'to',
  'us',
  'his',
  'grey',
  'eyes',
  'shone',
  'and'],
 ['twinkled',
  'and',
  'his',
  'usually',
  'pale',
  'face',
  'was',
  'flushed',
  'and',
  'animated',
  'the'],
 ['fire',
  'burned',
  'brightly',
  'and',
  'the',
  'soft',
  'radiance',
  'of',
  'the',
  'incandescent'],
 ['lights',
  'in',
  'the',
  'lilies',
  'of',
  'silver',
  'caught',
  'the',
  'bubbles',
  'that',
  'flashed',
  'and'],
 ['passed',
  'in',
  'our',
  'glasses',
  'our',
  'chairs',
  'being',
  'his',
  'patents',
  'embraced',
  'and'],
 ['caressed',
  'us',
  'rather',
  'than',
  'submitted',
  'to',
  'be',
  'sat',
  'upon',
  'and',
  'there',
  'was',
  'that'],
 ['luxurious',
  'after',
  'dinner',

## 词表

词元的类型是字符串，而模型需要的是输入的数字。

词表：一个字典，用来将字符串类型的词元映射到从0开始的数字索引中。

如何进行操作呢？<br>
先将训练集中的所有文档合并在一起，对它们的唯一词元进行统计，得到的统计结果称之为语料库（corpus）。然后根据每个唯一词元的出现频率，为其分配一个数字索引。

另外，语料库中不存在或已删除的任何词元都将映射到一个特定的未知词元“\<unk>”。 我们可以选择增加一个列表，用于保存那些被保留的词元， 例如：填充词元（“\<pad>”）； 序列开始词元（“\<bos>”）； 序列结束词元（“\<eos>”）。

没有出现过的词元代表什么意思呢？
从以下代码看来，未知词元列表相当于对dic_tokens（记录每个单词词频的字典）重新建立索引。
unk只是给出了词元在词表中找不到时的处理方式（返回索引0）。


In [5]:
# Toy example: flatten a nested list and count how often each element occurs.
ls = [[1,2,4],[2,3,4],[],[1],[2],[],[]]

collections.Counter([token for line in ls for token in line])


Counter({1: 2, 2: 3, 4: 2, 3: 1})

In [6]:
tokens  # notebook cell result: re-display the word-level token lists

[['the', 'time', 'machine', 'by', 'h', 'g', 'wells'],
 [],
 [],
 [],
 [],
 ['i'],
 [],
 [],
 ['the',
  'time',
  'traveller',
  'for',
  'so',
  'it',
  'will',
  'be',
  'convenient',
  'to',
  'speak',
  'of',
  'him'],
 ['was',
  'expounding',
  'a',
  'recondite',
  'matter',
  'to',
  'us',
  'his',
  'grey',
  'eyes',
  'shone',
  'and'],
 ['twinkled',
  'and',
  'his',
  'usually',
  'pale',
  'face',
  'was',
  'flushed',
  'and',
  'animated',
  'the'],
 ['fire',
  'burned',
  'brightly',
  'and',
  'the',
  'soft',
  'radiance',
  'of',
  'the',
  'incandescent'],
 ['lights',
  'in',
  'the',
  'lilies',
  'of',
  'silver',
  'caught',
  'the',
  'bubbles',
  'that',
  'flashed',
  'and'],
 ['passed',
  'in',
  'our',
  'glasses',
  'our',
  'chairs',
  'being',
  'his',
  'patents',
  'embraced',
  'and'],
 ['caressed',
  'us',
  'rather',
  'than',
  'submitted',
  'to',
  'be',
  'sat',
  'upon',
  'and',
  'there',
  'was',
  'that'],
 ['luxurious',
  'after',
  'dinner',

In [7]:
def count_corpus(tokens):
    """Count how often each token occurs.

    Args:
        tokens: either a flat (1D) list of tokens or a 2D list whose
            elements are token lists.

    Returns:
        collections.Counter: mapping from token to its number of occurrences.
    """
    # An empty input, or one whose first element is a list, is treated as 2D.
    is_nested = len(tokens) == 0 or isinstance(tokens[0], list)
    if not is_nested:
        return collections.Counter(tokens)
    flat = []
    for line in tokens:
        flat.extend(line)  # flatten one level
    return collections.Counter(flat)


# freq:频率

class Vocab:
    """Vocabulary that maps tokens to integer indices and back.

    Index 0 is always the unknown token '<unk>'. It is followed by the
    reserved tokens, then by the corpus tokens in order of decreasing
    frequency.
    """

    def __init__(self,tokens=None,min_fred=0,reserved_tokens=None) -> None:
        """Build the vocabulary from tokenized data.

        Args:
            tokens: 1D or 2D list of tokens. Defaults to None (empty corpus).
            min_fred (int, optional): minimum frequency; tokens that occur
                fewer times are left out. The name is a misspelling of
                "min_freq" that is kept so existing callers keep working.
                Defaults to 0.
            reserved_tokens: extra tokens such as '<pad>' that are placed
                right after '<unk>'. Any iterable of tokens (list, tuple, ...)
                is accepted. Defaults to None (no reserved tokens).
        """
        if tokens is None:
            tokens = []
        # Copy into a list so that a tuple (or any other iterable) can be
        # concatenated with ['<unk>'] below instead of raising TypeError.
        reserved_tokens = [] if reserved_tokens is None else list(reserved_tokens)
        # Count the tokens and sort them by decreasing frequency.
        counter = count_corpus(tokens)
        self._token_freqs = sorted(counter.items(),key=lambda x:x[1],reverse=True)

        # The unknown token always gets index 0; reserved tokens follow it.
        self.idx_to_token = ['<unk>']+reserved_tokens
        self.token_to_idx = {token:idx for idx ,token in enumerate(self.idx_to_token)}

        # Append the corpus tokens in frequency order.
        for token,freq in self._token_freqs:
            if freq < min_fred:
                # The list is sorted by decreasing frequency, so every
                # remaining token is too rare as well: stop the whole loop.
                break
            if token not in self.token_to_idx:
                self.idx_to_token.append(token)
                # The new token sits at the end of the list, hence len - 1.
                self.token_to_idx[token] = len(self.idx_to_token)-1

    def __len__(self):
        """Return the number of entries, including '<unk>' and reserved tokens."""
        return len(self.idx_to_token)

    def __getitem__(self,tokens):
        """Return the index of a token, or nested lists of indices.

        A list or tuple is processed recursively, element by element, and
        always produces a list. Any other value is looked up as a single
        token; tokens that are not in the vocabulary map to `self.unk`.
        """
        if not isinstance(tokens,(list,tuple)):
            return self.token_to_idx.get(tokens,self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self,indices):
        """Return the token for an index, or a list of tokens for a list/tuple of indices."""
        if not isinstance(indices,(list,tuple)):
            return self.idx_to_token[indices]       # single index -> single token
        return [self.idx_to_token[index] for index in indices]

    @property
    def unk(self):
        """Index of the unknown token, which is always 0."""
        return 0

    @property
    def token_freqs(self):
        """List of (token, frequency) pairs sorted by decreasing frequency."""
        return self._token_freqs

In [15]:
vocab = Vocab(tokens)  # word-level vocabulary built from the token lists
# Notebook cell result: the token -> index pairs, the vocabulary size,
# the index of 'had', and the token stored at index 2.
list(vocab.token_to_idx.items()),len(vocab),vocab['had'],vocab.to_tokens(2)


([('<unk>', 0),
  ('the', 1),
  ('i', 2),
  ('and', 3),
  ('of', 4),
  ('a', 5),
  ('to', 6),
  ('was', 7),
  ('in', 8),
  ('that', 9),
  ('my', 10),
  ('it', 11),
  ('had', 12),
  ('me', 13),
  ('as', 14),
  ('at', 15),
  ('for', 16),
  ('with', 17),
  ('but', 18),
  ('time', 19),
  ('were', 20),
  ('this', 21),
  ('you', 22),
  ('on', 23),
  ('then', 24),
  ('his', 25),
  ('there', 26),
  ('he', 27),
  ('have', 28),
  ('they', 29),
  ('from', 30),
  ('one', 31),
  ('all', 32),
  ('not', 33),
  ('into', 34),
  ('upon', 35),
  ('little', 36),
  ('so', 37),
  ('is', 38),
  ('came', 39),
  ('by', 40),
  ('some', 41),
  ('be', 42),
  ('no', 43),
  ('could', 44),
  ('their', 45),
  ('said', 46),
  ('saw', 47),
  ('down', 48),
  ('them', 49),
  ('machine', 50),
  ('which', 51),
  ('very', 52),
  ('or', 53),
  ('an', 54),
  ('we', 55),
  ('now', 56),
  ('what', 57),
  ('been', 58),
  ('these', 59),
  ('like', 60),
  ('her', 61),
  ('out', 62),
  ('seemed', 63),
  ('up', 64),
  ('man', 65),
 

In [8]:
# Show two sample lines next to their index representation.
for i in [0, 10]:
    print('文本:', tokens[i])         # "text": the tokens of line i
    print('索引:', vocab[tokens[i]])  # "indices": the vocabulary index of each token

文本: ['the', 'time', 'machine', 'by', 'h', 'g', 'wells']
索引: [1, 19, 50, 40, 2183, 2184, 400]
文本: ['twinkled', 'and', 'his', 'usually', 'pale', 'face', 'was', 'flushed', 'and', 'animated', 'the']
索引: [2186, 3, 25, 1044, 362, 113, 7, 1421, 3, 1045, 1]


## 整合所有功能

In [10]:
def load_corpus_time_machine(max_tokens=-1):
    """Return the time machine corpus as character indices, plus its vocabulary.

    Args:
        max_tokens (int, optional): when positive, only the first
            `max_tokens` indices are kept; otherwise the whole corpus is
            returned. Defaults to -1.

    Returns:
        tuple: `(corpus, vocab)` where `corpus` is a flat list holding the
        vocabulary index of every character, and `vocab` is the
        character-level `Vocab`.
    """
    char_lines = tokenize(read_time_machine(), 'char')  # one character list per line
    vocab = Vocab(char_lines)
    # Flatten all lines into a single sequence of indices.
    corpus = []
    for chars in char_lines:
        for ch in chars:
            corpus.append(vocab[ch])  # goes through Vocab.__getitem__
    if max_tokens > 0:
        return corpus[:max_tokens], vocab
    return corpus, vocab
    
corpus, vocab = load_corpus_time_machine()
len(corpus), len(vocab)  # len(vocab) goes through Vocab.__len__; the 26 lowercase letters plus the space and '<unk>' give 28 entries

(170580, 28)