In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import pickle as pkl
from tqdm import tqdm
import pickle as pkl
import re
from tqdm import tqdm
import numpy as np
import time

In [3]:
class BiLSTM_Model(nn.Module):
    """Two stacked bidirectional LSTMs followed by a per-token linear classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_size: Width of each token embedding.
        hidden_size: Hidden width of one LSTM direction. Both directions are
            concatenated, so the layers after each LSTM see ``hidden_size * 2``.
        num_tags: Number of output classes per token. Defaults to 5, the value
            that used to be hard-coded.
        dropout: Dropout probability applied after each BiLSTM. Defaults to
            0.6, the value that used to be hard-coded.

    The submodule attribute names are kept unchanged so that state dicts saved
    from the previous version of this class still load.
    """

    def __init__(self, vocab_size, embedding_size, hidden_size, num_tags=5, dropout=0.6):
        super(BiLSTM_Model, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.bilstm1 = nn.LSTM(embedding_size, hidden_size, bidirectional=True, batch_first=True)
        self.dropout1 = nn.Dropout(dropout)
        # The second LSTM consumes the concatenated forward/backward states.
        self.bilstm2 = nn.LSTM(hidden_size * 2, hidden_size, bidirectional=True, batch_first=True)
        self.dropout2 = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size * 2, num_tags)

    def forward(self, x):
        """Return unnormalised tag scores for every position.

        Args:
            x: Integer token ids, batch first (the LSTMs use ``batch_first=True``).

        Returns:
            A tensor with one ``num_tags``-wide score vector per input position.
        """
        x = self.embedding(x)
        x, _ = self.bilstm1(x)  # the (h_n, c_n) state is not needed
        x = self.dropout1(x)
        x, _ = self.bilstm2(x)
        x = self.dropout2(x)
        x = self.fc(x)
        return x

In [4]:
# Maximum sentence length (in characters) used for filtering and padding.
max_len = 32
# Build the network; load_state_dict requires these sizes to match the checkpoint.
model = BiLSTM_Model(vocab_size=4666, embedding_size=128, hidden_size=64)

In [5]:
# Restore the trained weights. map_location='cpu' makes a checkpoint that was
# saved from a GPU run loadable on a CPU-only machine; the rest of this
# notebook runs the model on CPU tensors.
# NOTE(review): torch.load unpickles the file, so only load checkpoints that
# come from a trusted source.
model.load_state_dict(torch.load('model/BiLSTM_model.pkl', map_location='cpu'))
# Switch to inference mode (disables dropout). As the last expression of the
# cell this also displays the module summary.
model.eval()

BiLSTM_Model(
  (embedding): Embedding(4666, 128)
  (bilstm1): LSTM(128, 64, batch_first=True, bidirectional=True)
  (dropout1): Dropout(p=0.6, inplace=False)
  (bilstm2): LSTM(128, 64, batch_first=True, bidirectional=True)
  (dropout2): Dropout(p=0.6, inplace=False)
  (fc): Linear(in_features=128, out_features=5, bias=True)
)

In [6]:
# Read the raw test corpus from test_data.txt.
with open('data/test_data.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()

# Cut the text into clauses at punctuation marks and newlines.
clauses = re.split(r"[，。！？、（）—《》…；“”\n]", raw_text)

# Skip blank pieces, trim surrounding whitespace and remove inner spaces.
test_data = []
for piece in clauses:
    trimmed = piece.strip()
    if trimmed:
        test_data.append(trimmed.replace(" ", ""))

# Number of clauses before the length filter.
print(len(test_data))

# Keep only the clauses that are at most max_len characters long.
test_data = list(filter(lambda clause: len(clause) <= max_len, test_data))

# Number of clauses after the length filter.
print(len(test_data))

# Save the processed test sentences.
with open('data/generate_pkl/test_data.pkl', 'wb') as f:
    pkl.dump(test_data, f)

14978
14822


In [7]:
# Load the reference segmentation (words separated by whitespace) from test_result.txt.
with open('data/test_result.txt', 'r', encoding='utf-8') as f:
    test_result = f.read()

# Cut the text into clauses at the same punctuation marks / newlines that are
# used for the test sentences.
test_result = re.split(r"[，。！？、（）—《》…；“”\n]", test_result)

# Drop blank pieces and trim surrounding whitespace (one pass is enough).
test_result = [line.strip() for line in test_result if line.strip()]

print("处理前测试数据大小：", len(test_data))
print("处理前测试结果大小：", len(test_result))


def _words_to_tags(words):
    """Return the B/M/E/S tag string for a list of segmented words.

    A one-character word becomes 'S'; a longer word becomes 'B', then one 'M'
    per inner character, then 'E'.
    """
    pieces = []
    for word in words:
        if len(word) == 1:
            pieces.append('S')
        else:
            pieces.append('B' + 'M' * (len(word) - 2) + 'E')
    return ''.join(pieces)


# Convert every clause to its tag string. Clauses whose tag string is longer
# than max_len are dropped; the rest are right-padded with "N" to max_len.
sum_list = []
for clause in test_result:
    tags = _words_to_tags(clause.split())
    if len(tags) <= max_len:
        sum_list.append(tags + "N" * (max_len - len(tags)))

# Later cells pair sentences and labels purely by position, so a count
# mismatch here would silently corrupt the evaluation.
assert len(sum_list) == len(test_data), (
    "label/sentence count mismatch: %d labels vs %d sentences"
    % (len(sum_list), len(test_data))
)

print("处理后测试数据大小：", len(test_data))
print("处理后测试结果大小：", len(sum_list))

# Show a few sentences next to their tag strings.
print("测试数据：", test_data[:5])
print("测试结果：", sum_list[:5])

# Save the processed labels.
with open('data/generate_pkl/test_result.pkl', 'wb') as f:
    pkl.dump(sum_list, f)

处理前测试数据大小： 14822
处理前测试结果大小： 14978
处理后测试数据大小： 14822
处理后测试结果大小： 14822
测试数据： ['共同创造美好的新世纪', '二○○一年新年贺词', '二○○○年十二月三十一日', '附图片1张', '女士们']
测试结果： ['BEBEBESSBENNNNNNNNNNNNNNNNNNNNNN', 'BMMMEBEBENNNNNNNNNNNNNNNNNNNNNNN', 'BMMMEBMEBMMENNNNNNNNNNNNNNNNNNNN', 'SBESSNNNNNNNNNNNNNNNNNNNNNNNNNNN', 'BESNNNNNNNNNNNNNNNNNNNNNNNNNNNNN']


In [8]:
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by this project.

# Load the processed test sentences.
with open('data/generate_pkl/test_data.pkl', 'rb') as f:
    test_data = pkl.load(f)

# Load the processed test labels.
with open('data/generate_pkl/test_result.pkl', 'rb') as f:
    test_result = pkl.load(f)

# Load the character -> id vocabulary.
with open('data/generate_pkl/vocab.pkl', 'rb') as f:
    word2id = pkl.load(f)

# Encode every character as its vocabulary id; unknown characters become 0.
test_data_id = [[word2id.get(word, 0) for word in line] for line in test_data]
print(test_data_id[0])
print(len(test_data_id))

# Positions of the sentences that contain id 0.
index = [i for i, ids in enumerate(test_data_id) if 0 in ids]
print(index[:5])
print(len(index))
print(len(test_data_id))
# Show up to five of those sentences (slicing is safe when there are fewer).
for i in index[:5]:
    print(test_data_id[i])

# Drop those sentences together with their labels. A set gives constant-time
# membership tests instead of scanning the index list for every row.
rows_to_drop = set(index)
test_data_id = [ids for i, ids in enumerate(test_data_id) if i not in rows_to_drop]
test_result = [tags for i, tags in enumerate(test_result) if i not in rows_to_drop]

print(len(test_data_id))
print(len(test_result))

[210, 62, 330, 332, 115, 113, 1, 33, 183, 378]
14822
[184, 185, 358, 419, 1046]
219
14822
[139, 178, 42, 31, 6, 173, 106, 662, 249, 160, 238, 35, 1814, 1814, 0, 1739, 403, 136]
[62, 274, 175, 249, 722, 722, 0, 722, 403, 136]
[392, 364, 1133, 293, 77, 160, 0, 0, 0]
[1765, 561, 561, 561, 12, 722, 1765, 75, 2135, 722, 40, 1765, 2135, 49, 722, 1765, 81, 4, 139, 178, 916, 53, 516, 247, 679, 38, 1, 986, 986, 947, 1941, 0]
[531, 34, 235, 253, 990, 710, 1081, 0, 332, 567]
14603
14603


In [9]:
# Tag symbol -> class id: B=0, M=1, E=2, S=3, N=4.
target_dict = dict(zip("BMESN", range(5)))

# Right-pad every id sequence with 0 up to max_len.
padded_rows = []
for ids in test_data_id:
    padded_rows.append(ids + [0] * (max_len - len(ids)))
test_data_id = padded_rows
print(test_data_id[0])

# Encode every tag string as a list of class ids.
test_result_id = [list(map(target_dict.__getitem__, tags)) for tags in test_result]
print(test_result_id[0])

[210, 62, 330, 332, 115, 113, 1, 33, 183, 378, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 2, 0, 2, 0, 2, 3, 3, 0, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]


In [10]:
# Turn the encoded sentences and labels into tensors.
test_data_tensor = torch.tensor(test_data_id)
test_result_tensor = torch.tensor(test_result_id)

print(test_data_tensor.shape)
print(test_result_tensor.shape)

# Run the model on the whole test set. no_grad() skips building the autograd
# graph, which is never used here and only costs memory.
with torch.no_grad():
    output = model(test_data_tensor)
print(output.shape)

# Convert the scores to a numpy array.
output = output.detach().numpy()

# Pick the highest-scoring class for every position.
output = np.argmax(output, axis=2)

# Map class ids back to tag symbols. The inverse table is built once instead
# of rebuilding key/value lists for every single token.
id_to_tag = {tag_id: tag for tag, tag_id in target_dict.items()}
output = [[id_to_tag[tag_id] for tag_id in line] for line in output.tolist()]

torch.Size([14603, 32])
torch.Size([14603, 32])
torch.Size([14603, 32, 5])


In [11]:
# Show the first sentence's predicted tags next to its reference tags.
for caption, tags in (("预测结果：", output[0]), ("真实结果：", test_result[0])):
    print(caption, tags)

预测结果： ['B', 'E', 'B', 'E', 'B', 'E', 'S', 'S', 'B', 'E', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N']
真实结果： BEBEBESSBENNNNNNNNNNNNNNNNNNNNNN


In [12]:
# Collapse each per-position tag list into a single tag string.
output = list(map(''.join, output))
# Prediction and reference for the first sentence, now both as strings.
print("预测结果：", output[0])
print("真实结果：", test_result[0])

预测结果： BEBEBESSBENNNNNNNNNNNNNNNNNNNNNN
真实结果： BEBEBESSBENNNNNNNNNNNNNNNNNNNNNN


In [13]:
# Remove the "N" padding from the reference tags.
test_result = [line.replace("N", "") for line in test_result]
# Cut every prediction to the length of its reference instead of deleting
# every "N": deleting would shift the following tags left and leave the
# prediction with a different length from the reference, so positions would no
# longer line up when the two are compared. An "N" predicted inside the real
# sentence is kept and simply counts as a wrong tag.
output = [pred[:len(gold)] for pred, gold in zip(output, test_result)]
print("预测结果：", output[0])
print("真实结果：", test_result[0])

预测结果： BEBEBESSBE
真实结果： BEBEBESSBE


In [14]:
# Count the positions where the predicted tag equals the reference tag.
# zip() stops at the shorter of the two strings, so a prediction that is
# longer than its reference no longer raises an IndexError.
acc = 0
for pred, gold in zip(output, test_result):
    acc += sum(p == g for p, g in zip(pred, gold))

gold_total = sum(len(line) for line in test_result)
pred_total = sum(len(line) for line in output)

# NOTE(review): these are tag-level figures (matching tags divided by the
# number of reference / predicted tags), not word-level precision/recall; all
# three are equal whenever predictions and references have the same length.
# Empty inputs report 0.0 instead of dividing by zero.
print("准确率：", acc / gold_total if gold_total else 0.0)
print("召回率：", acc / pred_total if pred_total else 0.0)
print("F1值：", 2 * acc / (gold_total + pred_total) if (gold_total + pred_total) else 0.0)

准确率： 0.8803521354616048
召回率： 0.8803521354616048
F1值： 0.8803521354616048
