In [1]:
%matplotlib inline
import torch
from torch import nn
from torch.nn import functional as F
from PIL import Image
import matplotlib.pyplot as plt
from pathlib import Path
import utils
import torchvision
from torchvision.io import image
from torchvision.transforms.functional import to_pil_image
import pandas as pd
import time
import numpy as np
import collections
import re

In [2]:
santi_txt_path = Path(r'../data/ebook/三体1.txt')
sanguo_txt_path = Path(r'../data/ebook/ThreeKingdoms.txt')

In [3]:
def read_ebook_txt(txt_path: Path):
    """
        将txt电子书加载到文本行类别中
    """
    with open(txt_path.as_posix(), 'r', encoding='utf8') as fp:
        lines = fp.readlines()
        
    new_lines = list()
    for line in lines:
        new_line = re.sub('^\S+', ' ', line).strip().lower()
        if len(new_line) > 0:
            new_lines.append(new_line)
    return new_lines

lines = read_ebook_txt(sanguo_txt_path)
print('总行数：', len(lines))
print(lines[0])
print(lines[10])

总行数： 16800
1
emperor read this memorial with deep sighs, and chief eunuch cao jie, from his place behind the throne, anxiously noted these signs of grief. an opportunity offering, cao jie informed his fellows, and a charge was trumped up against cai yong, who was driven from the court and forced to retire to his country house.


In [4]:
# 词元化
def tokenize(lines, token='char'):
    """将文本行拆分为单词或字符词元"""
    if token == 'word':
        return [line.split() for line in lines]
    elif token == 'char':
        return [list(line) for line in lines]
    else:
        print('错误：未知词元类型：' + token)

tokens = tokenize(lines)
for i in range(3):
    print(tokens[i])

['1']
['h', 'e', 'r', 'o', 'e', 's', ' ', 's', 'w', 'e', 'a', 'r', ' ', 'b', 'r', 'o', 't', 'h', 'e', 'r', 'h', 'o', 'o', 'd', ' ', 'i', 'n', ' ', 't', 'h', 'e', ' ', 'p', 'e', 'a', 'c', 'h', ' ', 'g', 'a', 'r', 'd', 'e', 'n', ';', 'o', 'n', 'e', ' ', 'v', 'i', 'c', 't', 'o', 'r', 'y', ' ', 's', 'h', 'a', 't', 't', 'e', 'r', 's', ' ', 't', 'h', 'e', ' ', 'r', 'e', 'b', 'e', 'l', 's', ' ', 'i', 'n', ' ', 'b', 'a', 't', 't', 'l', 'e', 'g', 'r', 'o', 'u', 'n', 'd', 's', '.']
['u', 'n', 'd', 'e', 'r', ' ', 'h', 'e', 'a', 'v', 'e', 'n', ',', ' ', 'a', 'f', 't', 'e', 'r', ' ', 'a', ' ', 'l', 'o', 'n', 'g', ' ', 'p', 'e', 'r', 'i', 'o', 'd', ' ', 'o', 'f', ' ', 'd', 'i', 'v', 'i', 's', 'i', 'o', 'n', ',', ' ', 't', 'e', 'n', 'd', 's', ' ', 't', 'o', ' ', 'u', 'n', 'i', 't', 'e', '；', ' ', 'a', 'f', 't', 'e', 'r', ' ', 'a', ' ', 'l', 'o', 'n', 'g', ' ', 'p', 'e', 'r', 'i', 'o', 'd', ' ', 'o', 'f', ' ', 'u', 'n', 'i', 'o', 'n', ',', ' ', 't', 'e', 'n', 'd', 's', ' ', 't', 'o', ' ', 'd', 'i', 'v

In [5]:
# 构造词表

class Vocab:
    """文本词表"""
    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []
        # 按出现频率排序降序排列
        counter = count_corpus(tokens)
        self._token_freqs = sorted(counter.items(), key=lambda x: x[1], reverse=True)
        # 未知词元的索引为0
        self.idx_to_token = ['<unk>'] + reserved_tokens
        self.token_to_idx = {token: idx for idx, token in enumerate(self.idx_to_token)}
        for token, freq in self._token_freqs:
            if freq < min_freq:
                break
            if token not in self.token_to_idx:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token) - 1

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

    @property
    def unk(self):  # 未知词元的索引为0
        return 0

    @property
    def token_freqs(self):
        return self._token_freqs

def count_corpus(tokens):
    """统计词元的频率"""
    # 这里的tokens是1D列表或2D列表
    if len(tokens) == 0 or isinstance(tokens[0], list):
        # 将词元列表展平成一个列表
        tokens = [token for line in tokens for token in line]
    return collections.Counter(tokens)

In [6]:
vocab = Vocab(tokens)
print(list(vocab.token_to_idx.items())[:10])

[('<unk>', 0), (' ', 1), ('e', 2), ('a', 3), ('t', 4), ('o', 5), ('n', 6), ('i', 7), ('h', 8), ('s', 9)]


In [7]:
for i in [0, 10]:
    print('文本:', tokens[i])
    print('索引:', vocab[tokens[i]])

文本: ['1']
索引: [49]
文本: ['e', 'm', 'p', 'e', 'r', 'o', 'r', ' ', 'r', 'e', 'a', 'd', ' ', 't', 'h', 'i', 's', ' ', 'm', 'e', 'm', 'o', 'r', 'i', 'a', 'l', ' ', 'w', 'i', 't', 'h', ' ', 'd', 'e', 'e', 'p', ' ', 's', 'i', 'g', 'h', 's', ',', ' ', 'a', 'n', 'd', ' ', 'c', 'h', 'i', 'e', 'f', ' ', 'e', 'u', 'n', 'u', 'c', 'h', ' ', 'c', 'a', 'o', ' ', 'j', 'i', 'e', ',', ' ', 'f', 'r', 'o', 'm', ' ', 'h', 'i', 's', ' ', 'p', 'l', 'a', 'c', 'e', ' ', 'b', 'e', 'h', 'i', 'n', 'd', ' ', 't', 'h', 'e', ' ', 't', 'h', 'r', 'o', 'n', 'e', ',', ' ', 'a', 'n', 'x', 'i', 'o', 'u', 's', 'l', 'y', ' ', 'n', 'o', 't', 'e', 'd', ' ', 't', 'h', 'e', 's', 'e', ' ', 's', 'i', 'g', 'n', 's', ' ', 'o', 'f', ' ', 'g', 'r', 'i', 'e', 'f', '.', ' ', 'a', 'n', ' ', 'o', 'p', 'p', 'o', 'r', 't', 'u', 'n', 'i', 't', 'y', ' ', 'o', 'f', 'f', 'e', 'r', 'i', 'n', 'g', ',', ' ', 'c', 'a', 'o', ' ', 'j', 'i', 'e', ' ', 'i', 'n', 'f', 'o', 'r', 'm', 'e', 'd', ' ', 'h', 'i', 's', ' ', 'f', 'e', 'l', 'l', 'o', 'w', 's', '

In [8]:
# 整合所有功能，封装

def load_corpus_txt(txt_path, max_tokens=-1):
    """返回数据集的词元索引列表和词表"""
    lines = read_ebook_txt(txt_path)
    tokens = tokenize(lines, 'char')
    vocab = Vocab(tokens)
    # 所以将所有文本行展平到一个列表中
    corpus = [vocab[token] for line in tokens for token in line]
    if max_tokens > 0:
        corpus = corpus[:max_tokens]
    return corpus, vocab

corpus, vocab = load_corpus_txt(txt_path=sanguo_txt_path)
len(corpus), len(vocab)

(3177437, 65)

In [9]:
corpus[:20]

[49, 8, 2, 10, 5, 2, 9, 1, 9, 17, 2, 3, 10, 1, 23, 10, 5, 4, 8, 2]

In [10]:
vocab.idx_to_token

['<unk>',
 ' ',
 'e',
 'a',
 't',
 'o',
 'n',
 'i',
 'h',
 's',
 'r',
 'd',
 'l',
 'u',
 'g',
 'c',
 'm',
 'w',
 'f',
 'y',
 'p',
 ',',
 '.',
 'b',
 'v',
 'k',
 '"',
 'z',
 'x',
 'j',
 '“',
 '”',
 'q',
 "'",
 '-',
 '?',
 '？',
 ';',
 '[',
 ']',
 '!',
 '！',
 ':',
 '；',
 '*',
 '—',
 '：',
 '2',
 '0',
 '1',
 '(',
 ')',
 '…',
 '6',
 '5',
 '4',
 '（',
 '）',
 '8',
 '9',
 '3',
 '7',
 '\u3000',
 '。',
 '#']