In [None]:
class Word2Seq:
    """Map tokens to integer indices and back.

    Usage: call ``fit`` once per tokenized sentence to accumulate token
    frequencies, call ``build_vocab`` to turn the frequencies into a
    vocabulary, then use ``transform`` / ``inverse_transform`` to convert
    between token sequences and index sequences.
    """

    # Tag used for tokens that are not in the vocabulary.
    UNK_TAG = "UNK"
    # Tag used to pad short sentences up to a fixed length.
    PAD_TAG = "PAD"
    UNK = 0
    PAD = 1

    def __init__(self):
        # token -> index; the two special tags always occupy indices 0 and 1.
        self.dict = {
            self.UNK_TAG: self.UNK,
            self.PAD_TAG: self.PAD,
        }
        # token -> number of occurrences seen by fit().
        self.count = {}
        # index -> token; populated by build_vocab().
        self.inverse_dict = {}

    def fit(self, sentence):
        """Add the tokens of one sentence to the frequency table.

        Args:
            sentence: iterable of hashable tokens.
        """
        for word in sentence:
            self.count[word] = self.count.get(word, 0) + 1

    def build_vocab(self, min=5, max=None, max_feature=None):
        """Build the token/index mappings from the collected frequencies.

        Note that ``self.count`` is replaced by the filtered frequency table.

        Args:
            min: minimum number of occurrences; tokens seen fewer times are
                dropped. ``None`` disables the filter.
            max: maximum number of occurrences; tokens seen more times are
                dropped. ``None`` disables the filter.
            max_feature: keep only this many of the most frequent tokens.
                ``None`` keeps every token that survived the filters.
        """
        # The bounds are inclusive: only tokens strictly below `min` or
        # strictly above `max` are removed.
        if min is not None:
            self.count = {
                word: freq for word, freq in self.count.items() if freq >= min
            }
        if max is not None:
            self.count = {
                word: freq for word, freq in self.count.items() if freq <= max
            }
        if max_feature is not None:
            # Sort by frequency, most frequent first, and keep the head.
            ranked = sorted(
                self.count.items(), key=lambda item: item[-1], reverse=True
            )
            self.count = dict(ranked[:max_feature])

        for word in self.count:
            # Skip tokens that already have an index. This protects the
            # reserved UNK/PAD entries from a corpus token with the same
            # spelling, and keeps indices unique and stable when
            # build_vocab() is called more than once.
            if word not in self.dict:
                self.dict[word] = len(self.dict)

        # Reverse mapping: index -> token.
        self.inverse_dict = {index: word for word, index in self.dict.items()}

    def transform(self, sentence, max_len=None):
        """Convert a sequence of tokens into a list of indices.

        Args:
            sentence: iterable of tokens (list, tuple, ...).
            max_len: if given, the sentence is truncated or padded with
                ``PAD_TAG`` so that the result has exactly this length.

        Returns:
            A list of integer indices; tokens missing from the vocabulary
            map to ``UNK``.
        """
        # Copy into a list so padding works for any iterable and the
        # caller's object is never modified.
        tokens = list(sentence)
        if max_len is not None:
            # Truncate first, then pad whatever is still missing.
            tokens = tokens[:max_len]
            tokens += [self.PAD_TAG] * (max_len - len(tokens))
        return [self.dict.get(word, self.UNK) for word in tokens]

    def inverse_transform(self, indices):
        """Convert a sequence of indices back into a list of tokens.

        Indices that are not in the vocabulary yield ``None``.
        """
        return [self.inverse_dict.get(idx) for idx in indices]

    def __len__(self):
        """Return the vocabulary size, including the special tags."""
        return len(self.dict)
