In [35]:
import numpy as np

class Word2Sequence:
    """Map tokenised sentences to integer index sequences and back.

    Typical use: call ``fit`` once per tokenised sentence to accumulate word
    frequencies, call ``build_vocab`` to freeze the vocabulary, then use
    ``transform`` / ``inverse_transform`` to convert between words and indices.
    """

    UNK_TAG = "UNK"    # placeholder for out-of-vocabulary words
    PAD_TAG = "PAD"    # placeholder used to pad short sentences

    UNK = 0
    PAD = 1

    def __init__(self):
        # word -> index; the two reserved tags always occupy indices 0 and 1.
        self.dict = {
            self.UNK_TAG: self.UNK,
            self.PAD_TAG: self.PAD
        }
        # index -> word; created here so inverse_transform is usable even
        # before build_vocab has been called.
        self.inverse_dict = {index: word for word, index in self.dict.items()}

        self.count = {}   # word -> number of occurrences seen by fit()

    def fit(self, sentence):
        """Add the words of one sentence to the frequency counts.

        Args:
            sentence: an iterable of already-tokenised words,
                e.g. ``[word1, word2, word3, ...]``.
        """
        for word in sentence:
            self.count[word] = self.count.get(word, 0) + 1

    def build_vocab(self, min=None, max=None, max_features=None):
        """Build the word <-> index dictionaries from the collected counts.

        The filters are applied in the order ``min``, ``max``,
        ``max_features``; ``self.count`` is replaced by the filtered counts.
        The parameter names shadow the builtins but are kept so existing
        keyword callers continue to work.

        Args:
            min: minimum number of occurrences (inclusive); words seen fewer
                times are dropped. ``None`` disables the filter.
            max: maximum number of occurrences (inclusive); words seen more
                often are dropped. ``None`` disables the filter.
            max_features: keep only this many of the most frequent words.
                ``None`` keeps all of them.
        """
        if min is not None:
            # Drop only words whose frequency is strictly below `min`.
            self.count = {word: freq for word, freq in self.count.items() if freq >= min}
        if max is not None:
            # Drop only words whose frequency is strictly above `max`.
            self.count = {word: freq for word, freq in self.count.items() if freq <= max}
        if max_features is not None:
            # sorted() is stable, so equally frequent words keep the order in
            # which they were first seen.
            most_frequent = sorted(self.count.items(), key=lambda item: item[-1], reverse=True)
            self.count = dict(most_frequent[:max_features])

        # Rebuild from scratch so that calling build_vocab more than once
        # never hands the same index to several words.
        self.dict = {
            self.UNK_TAG: self.UNK,
            self.PAD_TAG: self.PAD
        }
        for word in self.count:
            # A corpus token spelled like a reserved tag must not take over
            # (or be given a second index for) the reserved entry.
            if word not in self.dict:
                self.dict[word] = len(self.dict)

        self.inverse_dict = dict(zip(self.dict.values(), self.dict.keys()))

    def transform(self, sentence, max_len=None):
        """Convert a tokenised sentence into a list of indices.

        Args:
            sentence: sequence of words.
            max_len: if given, the sentence is padded with ``PAD_TAG`` or
                truncated so the result follows the slice ``[:max_len]``.

        Returns:
            A list of indices; words missing from the vocabulary map to ``UNK``.
        """
        # Copy into a list so tuples and other sequences can be padded too,
        # and so the caller's object is never modified.
        words = list(sentence)
        if max_len is not None:
            if max_len > len(words):
                words = words + [self.PAD_TAG] * (max_len - len(words))   # pad
            elif max_len < len(words):
                words = words[:max_len]                                   # truncate

        return [self.dict.get(word, self.UNK) for word in words]

    def inverse_transform(self, indices):
        """Convert a sequence of indices back into words.

        Returns:
            A list of words; an index that is not in the vocabulary yields
            ``None``.
        """
        return [self.inverse_dict.get(idx) for idx in indices]
    
    
if __name__ == '__main__':
    # Small demo: count words over three tokenised sentences, build the
    # vocabulary, then print the counts and the word -> index mapping.
    w2s = Word2Sequence()
    for tokens in (["我","是","谁"], ["我","是","我"], ["我","爱","你"]):
        w2s.fit(tokens)
    w2s.build_vocab()
    print(w2s.count)
    print(w2s.dict)

    # Encode one sentence with max_len=10, and decode three indices.
    re1 = w2s.transform(["我","爱","你"], max_len=10)
    print(re1)
    re2 = w2s.inverse_transform([1, 2, 3])
    print(re2)

{'我': 4, '是': 2, '谁': 1, '爱': 1, '你': 1}
{'UNK': 0, 'PAD': 1, '我': 2, '是': 3, '谁': 4, '爱': 5, '你': 6}
[2, 5, 6, 1, 1, 1, 1, 1, 1, 1]
['PAD', '我', '是']
