In [1]:
import os
import time
import torch
import pickle as pkl
from importlib import import_module

In [2]:
# Dataset directory and the (trimmed) embedding file name handed to each
# model's Config.
dataset = "comments_data"
embedding = "embedding_Weibo.npz"

# Files located under <dataset>/data.
data_path = os.path.join(dataset, "data")
vocab_path = os.path.join(data_path, "vocab.pkl")   # pickled vocabulary
test_file = os.path.join(data_path, "test.txt")     # test data

# Model names; each one is both a module under ``models`` and the base name
# of a checkpoint file.
model_names = [
    "TextCNN",
    "DPCNN",
    "TextRCNN",
    "TextRNN_Att",
    "TextRNN",
    "Transformer",
]

# Special vocabulary tokens for padding and for unknown entries.
PAD = "<PAD>"
UNK = "<UNK>"

# Sequence length every input is padded / truncated to.
pad_size = 32

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [3]:
def load_data(filename):
    """Load the tab-separated test file.

    Whitespace-only lines are skipped. Every other line is split on tab
    characters and kept as a list of string fields. The line is not stripped
    before splitting, so the last field still carries the trailing newline.

    Args:
        filename: Path of the text file to read. It is decoded as UTF-8.

    Returns:
        A list containing one list of fields per non-blank line.
    """
    # Decode explicitly as UTF-8: the comments are Chinese text, and relying
    # on the platform default encoding makes the result machine-dependent
    # (it can fail or mis-decode where the default is not UTF-8).
    # NOTE(review): assumes the data file is stored as UTF-8 - confirm.
    with open(filename, "r", encoding="utf-8") as f:
        return [line.split("\t") for line in f if line.strip()]

def load_vocab(vocab_path):
    """Load the pickled vocabulary.

    Args:
        vocab_path: Path of the pickle file that stores the vocabulary.

    Returns:
        The unpickled object, or None when ``vocab_path`` does not exist.
    """
    if not os.path.exists(vocab_path):
        # Same result as before for a missing file, now stated explicitly.
        return None
    # SECURITY: unpickling can execute arbitrary code; only load vocabulary
    # files that come from a trusted source.
    # The context manager closes the file handle, which was previously leaked.
    with open(vocab_path, "rb") as f:
        return pkl.load(f)
    
def build_bacth(vocab, s, pad_size=32):
    """Turn one sequence into the ``(token, seq_len)`` pair fed to a model.

    Each element of ``s`` (each character when ``s`` is a str) is looked up
    in ``vocab``; elements that are missing map to the id of ``UNK``. The id
    list is cut to ``pad_size`` entries and, if shorter, filled up with the
    id of ``PAD``.

    Args:
        vocab: Mapping from element to id (must provide ``.get``).
        s: Iterable of elements to encode.
        pad_size: Target length of the id list.

    Returns:
        A tuple ``(token, seq_len)``: ``token`` is a tensor holding the id
        list as a single row, moved to ``device``; ``seq_len`` is a tensor
        ``[[k]]`` where ``k`` is the number of ids kept before padding
        (it is not moved to ``device``).
    """
    unk_id = vocab.get(UNK)
    ids = [vocab.get(ch, unk_id) for ch in s]

    # Truncate first, remember how many real ids remain, then pad the rest.
    ids = ids[:pad_size]
    kept = len(ids)
    ids += [vocab.get(PAD)] * (pad_size - kept)

    token = torch.tensor([ids]).to(device)
    seq_len = torch.tensor([[kept]])
    return (token, seq_len)

def predic(model, x):
    """Predict class indices for ``x`` with ``model``.

    The model is switched to evaluation mode and called without gradient
    tracking. For every position along the leading dimensions, the index of
    the largest value on the last dimension of the output is returned.

    Args:
        model: Callable torch module.
        x: Input passed unchanged to ``model``.

    Returns:
        A NumPy array of predicted indices.
    """
    model.eval()
    with torch.no_grad():
        scores = model(x)
        _, best_idx = torch.max(scores.data, dim=-1)
    return best_idx.cpu().numpy()

def create_model(model_name, vocab):
    """Build the named model and load its saved weights.

    The module ``models.<model_name>`` is imported, its ``Config`` is built
    from ``dataset`` and ``embedding``, and its ``Model`` is instantiated from
    that config. Weights are read from
    ``<dataset>/saved_dict/<model_name>.ckpt``.

    Args:
        model_name: Name of the module under ``models``; also the base name
            of the checkpoint file.
        vocab: Not used by this function; kept so existing calls keep working.

    Returns:
        The model with its state dict loaded, moved to ``device``.
    """
    module = import_module("models." + model_name)
    config = module.Config(dataset, embedding)
    model = module.Model(config)

    model_path = os.path.join(dataset, "saved_dict", "{}.ckpt".format(model_name))
    # map_location remaps the stored tensors onto the device chosen for this
    # session. Without it, a checkpoint saved from a CUDA device cannot be
    # loaded on a CPU-only machine.
    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict)

    return model.to(device)

In [4]:
# Load the vocabulary and the test rows once; later cells reuse both.
vocab = load_vocab(vocab_path)
test_data = load_data(test_file)
# Holds the loaded models, in the same order as model_names.
model_objs = []

In [5]:
# Load every model listed in model_names and report how long each load took.
# The list is rebuilt here so that re-running this cell does not append a
# second copy of every model to the existing list.
model_objs = []
for model_name in model_names:
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # durations; time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()
    m = create_model(model_name, vocab)
    elapsed = time.perf_counter() - start
    print("load model {} | cost {} s.".format(model_name, elapsed))
    model_objs.append(m)

load model TextCNN | cost 1.234591007232666 s.
load model DPCNN | cost 0.03629565238952637 s.


  "num_layers={}".format(dropout, num_layers))


load model TextRCNN | cost 0.8151576519012451 s.
load model TextRNN_Att | cost 0.039021968841552734 s.
load model TextRNN | cost 0.038530826568603516 s.
load model Transformer | cost 0.05030536651611328 s.


In [6]:
# Print each model name followed by the model's printed representation.
for name, obj in zip(model_names, model_objs):
    print(name, obj)

TextCNN Model(
  (embedding): Embedding(3618, 300)
  (convs): ModuleList(
    (0): Conv2d(1, 256, kernel_size=(2, 300), stride=(1, 1))
    (1): Conv2d(1, 256, kernel_size=(3, 300), stride=(1, 1))
    (2): Conv2d(1, 256, kernel_size=(4, 300), stride=(1, 1))
  )
  (dropout): Dropout(p=0.4)
  (fc): Linear(in_features=768, out_features=2, bias=True)
)
DPCNN Model(
  (embedding): Embedding(3618, 300)
  (conv_region): Conv2d(1, 250, kernel_size=(3, 300), stride=(1, 1))
  (conv): Conv2d(250, 250, kernel_size=(3, 1), stride=(1, 1))
  (max_pool): MaxPool2d(kernel_size=(3, 1), stride=2, padding=0, dilation=1, ceil_mode=False)
  (padding1): ZeroPad2d(padding=(0, 0, 1, 1), value=0.0)
  (padding2): ZeroPad2d(padding=(0, 0, 0, 1), value=0.0)
  (relu): ReLU()
  (fc): Linear(in_features=250, out_features=2, bias=True)
)
TextRCNN Model(
  (embedding): Embedding(3618, 300)
  (lstm): LSTM(300, 256, batch_first=True, dropout=1.0, bidirectional=True)
  (maxpool): MaxPool1d(kernel_size=32, stride=32, paddin

In [12]:
# Count, and print, the test comments that one selected model misclassifies.
target_name = "TextRNN"  # any entry of model_names, e.g. "TextCNN"
# Look the model up once instead of scanning every model for every comment.
# An unknown name raises KeyError rather than silently reporting 0 errors.
target_model = dict(zip(model_names, model_objs))[target_name]

n = 0  # number of misclassified comments
for data in test_data:
    s = data[0].lower()   # first field, lower-cased before vocabulary lookup
    y = int(data[-1])     # last field is the label; int() ignores the trailing newline
    x = build_bacth(vocab, s, pad_size=pad_size)

    # predic returns an array of class indices. The batch holds a single
    # comment, so take its entry as a plain int; the old `type(...) is list`
    # check never matched an array, which is why predictions were compared
    # and printed as one-element arrays such as "[1]".
    ypred = predic(target_model, x)
    yp = int(ypred[-1])

    if y != yp:
        n += 1
        print("-" * 20)
        print("评论 {}| 真实值：{}| 预测值 {}".format(s, str(y), str(yp)))
print(n)

--------------------
评论 &| 真实值：0| 预测值 [1]
--------------------
评论 .豳釅| 真实值：0| 预测值 [1]
--------------------
评论 打卡第40天| 真实值：0| 预测值 [1]
--------------------
评论 冷汗| 真实值：0| 预测值 [1]
--------------------
评论 眯眼吐舌| 真实值：0| 预测值 [1]
--------------------
评论 ＝| 真实值：0| 预测值 [1]
--------------------
评论 zs| 真实值：0| 预测值 [1]
--------------------
评论 806072| 真实值：0| 预测值 [1]
--------------------
评论 董博文| 真实值：0| 预测值 [1]
--------------------
评论 90| 真实值：0| 预测值 [1]
--------------------
评论 19937303035| 真实值：0| 预测值 [1]
--------------------
评论 胜利手势| 真实值：0| 预测值 [1]
--------------------
评论 胡凯文晚安诗语打卡第四天！| 真实值：1| 预测值 [0]
--------------------
评论 嘿嘿| 真实值：0| 预测值 [1]
--------------------
评论 羞涩微笑| 真实值：0| 预测值 [1]
--------------------
评论 飞吻| 真实值：0| 预测值 [1]
--------------------
评论 郁金香| 真实值：0| 预测值 [1]
--------------------
评论 gidihf%hy| 真实值：0| 预测值 [1]
--------------------
评论 yj| 真实值：0| 预测值 [1]
--------------------
评论 236| 真实值：0| 预测值 [1]
--------------------
评论 uhj| 真实值：0| 预测值 [1]
--------------------
评论 kevin4| 真实值：0| 预测值 [1]
------

In [9]:
# Number of rows loaded from the test file.
len(test_data)

1879

In [None]:
# Note listing checkpoint files with a size for each; this is a bare string
# literal, so the cell computes nothing.
# NOTE(review): FastText.ckpt appears here but "FastText" is not in
# model_names - confirm whether that model was left out on purpose.
"""
6.3M	DPCNN.ckpt
579M	FastText.ckpt
7.4M	TextCNN.ckpt
9.1M	TextRCNN.ckpt
8.0M	TextRNN_Att.ckpt
7.9M	TextRNN.ckpt
16M	Transformer.ckpt
"""

In [None]:
# Open question: does lower-casing the letters in a comment help prediction?