In [1]:
import torch.nn as nn
import torch
import torch.nn.functional as F
import numpy as np
from torch.nn import utils as nn_utils

from torch.utils.data import Dataset,DataLoader


## 导入word2idx

In [2]:
# Pick the compute device first so the tensor load below can be remapped onto it.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# word2idx.npy stores a pickled dict (hence allow_pickle=True and .item()).
# NOTE: unpickling executes arbitrary code -- only load files you produced yourself.
word2idx = np.load("./output/word2idx.npy",allow_pickle=True).item()
# Reverse lookup: index -> word.
idx2word = {v:k for k,v in word2idx.items()}
# map_location lets a tensor that was saved on a GPU be loaded on a CPU-only
# machine (without it torch.load tries to restore onto the original device).
embedding_weight = torch.load("./output/embedding_weight.h5",map_location=device)

## 加载数据集

In [13]:
def return_file_data_x_y(word2idx,file_path,encoding="utf-8"):
    """Read a labelled corpus file and convert it to index sequences.

    Each non-blank line is split on whitespace: the first field is parsed as
    the integer label, the remaining fields are tokens that are mapped through
    ``word2idx``. Tokens missing from ``word2idx`` are mapped to
    ``word2idx["<unk>"]``.

    Args:
        word2idx: mapping from token string to integer index; must contain
            "<unk>" if the file holds any out-of-vocabulary token.
        file_path: path of the text file to read.
        encoding: text encoding used to open the file. Given explicitly so
            that reading does not depend on the platform's default encoding.

    Returns:
        (X, Y): X is a list of index lists (one per line), Y the list of
        integer labels in the same order.

    Raises:
        ValueError: if the first field of a line is not an integer.
        KeyError: if an unknown token is met and "<unk>" is not in word2idx.
    """
    X = []
    Y = []
    with open(file_path,encoding=encoding) as f:
        # Stream the file line by line instead of materialising it with readlines().
        for line in f:
            data = line.split()
            if not data:
                # Blank (or whitespace-only) line: nothing to parse.
                continue
            x = [word2idx[token] if token in word2idx else word2idx["<unk>"] for token in data[1:]]
            y = int(data[0])
            X.append(x)
            Y.append(y)
    return X,Y
class CommentDataset(Dataset):
    """Map-style dataset that pairs ``X[index]`` with ``Y[index]``.

    Both containers are stored by reference (no copy); the dataset length is
    taken from ``X`` once, at construction time.
    """

    def __init__(self,X,Y):
        self.X, self.Y = X, Y
        # Cached so __len__ does not recompute it on every call.
        self.len = len(self.X)

    def __len__(self):
        """Return the number of samples recorded at construction."""
        return self.len

    def __getitem__(self,index):
        """Return the ``(sample, label)`` pair stored at ``index``."""
        sample = self.X[index]
        label = self.Y[index]
        return sample, label

def collate_fn(batch_data):
    """Collate variable-length samples into one padded batch.

    Args:
        batch_data: sequence of samples; ``sample[0]`` is a list of token
            indices and ``sample[1]`` its label.

    Returns:
        A 3-tuple ``(input_data, labels, data_len)``:
        input_data is a LongTensor of shape (batch_size, longest_sequence)
        right-padded with 0, labels is a LongTensor of the labels, and
        data_len is a Python list with the unpadded length of every sequence.
    """
    sequences = [torch.LongTensor(sample[0]) for sample in batch_data]
    labels = [sample[1] for sample in batch_data]
    data_len = [len(sequence) for sequence in sequences]

    # padding_value=0 because <pad> has index 0 in the vocabulary.
    input_data = nn_utils.rnn.pad_sequence(sequences,batch_first=True,padding_value=0)
    return input_data,torch.LongTensor(labels),data_len
# Build the evaluation pipeline: parse the test file, wrap it in a Dataset and
# batch it with the padding collate function defined above.
test_X,test_Y = return_file_data_x_y(word2idx,"./data/test_zh.txt")
test_dataset = CommentDataset(test_X,test_Y)
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=256,
    num_workers=1,
    collate_fn=collate_fn,
)


## 计算混淆矩阵函数

In [14]:
from sklearn.metrics import confusion_matrix,classification_report
import matplotlib.pyplot as plt

def compute_confusion_matrix(my_net,test_dataloader,is_lstm = False):
    """Run a model over a dataloader and summarise its predictions.

    The model is not switched to eval mode here; callers are expected to have
    called ``.eval()`` on it already.

    Args:
        my_net: model to evaluate. Called as ``my_net(inputs)`` or, when
            ``is_lstm`` is True, as ``my_net(inputs, data_len)``. Must return
            per-class scores of shape (batch_size, num_classes).
        test_dataloader: iterable of batches where ``batch[0]`` is the input
            tensor, ``batch[1]`` the label tensor (on CPU) and ``batch[2]``
            the list of sequence lengths.
        is_lstm: whether the model also takes the sequence lengths.

    Returns:
        (C, R): the confusion matrix from ``confusion_matrix`` and the text
        report from ``classification_report`` (4 digits).
    """
    # Send inputs to wherever the model actually lives, so a model that was
    # loaded onto a different device than the global `device` still works.
    # Fall back to the global `device` for models without parameters.
    params = my_net.parameters() if hasattr(my_net, "parameters") else iter(())
    first_param = next(iter(params), None)
    run_device = first_param.device if first_param is not None else device

    real_Y = []
    predict_Y = []
    # Evaluation only: no_grad avoids building the autograd graph for every batch.
    with torch.no_grad():
        for data in test_dataloader:
            inputs = data[0].to(run_device)
            target = data[1].numpy()
            data_len = data[2]
            if is_lstm:
                outputs = my_net(inputs,data_len)
            else:
                outputs = my_net(inputs)
            # Index of the highest score per row = predicted class, e.g. [1,0,0,1].
            _,top_index = torch.max(outputs,1)
            top_index = top_index.cpu().numpy()

            predict_Y.extend(top_index)
            real_Y.extend(target)
    C = confusion_matrix(real_Y,predict_Y)
    R = classification_report(real_Y,predict_Y,digits=4)

    return C,R

## 测试不同模型结果

### TextCNN

In [3]:
class MyNet(nn.Module):
    """TextCNN sentence classifier.

    Pipeline: embedding -> a single Conv2d whose kernel spans 3 tokens by the
    full embedding width -> ReLU -> adaptive max-pool to 2 positions ->
    two-layer MLP -> log-softmax over 2 classes.

    Args:
        embedding_size: width of the embedding vectors; it is used as the
            kernel width of the convolution, so it must equal the second
            dimension of the module-level ``embedding_weight`` tensor.
    """
    def __init__(self,embedding_size):
        super(MyNet,self).__init__()
        # Start from the pretrained vectors; freeze=False keeps them trainable.
        self.embedding = nn.Embedding.from_pretrained(embedding_weight,freeze=False)
        # kernel_size is (3, embedding_size): 3 consecutive tokens, full embedding width.
        self.conv = nn.Conv2d(1,256,(3,embedding_size))
        self.adaptive_max_pool = nn.AdaptiveMaxPool1d(2)
        self.fc = nn.Sequential(
            nn.Linear(256*2,128),
            nn.Dropout(0.6),
            nn.ReLU(),

            nn.Linear(128,2),
        )

    def forward(self,x):
        """Classify a batch of index sequences.

        Args:
            x: LongTensor of token indices, shape (batch_size, seq_len).

        Returns:
            Log-probabilities of shape (batch_size, 2).
        """
        # The convolution needs at least `kernel height` tokens. Right-pad
        # shorter batches with index 0 (the same value the batch collation
        # uses for padding) instead of crashing inside conv.
        min_len = self.conv.kernel_size[0]
        if x.size(1) < min_len:
            x = F.pad(x,(0,min_len - x.size(1)),value=0)
        x = self.embedding(x) # (batch_size,seq_len,embedding_size)
        x = x.unsqueeze(1) # (batch_size,1,seq_len,embedding_size): Conv2d expects (N,C,H,W)
        x = self.conv(x) # (batch_size,256,seq_len-2,1)
        x = x.squeeze(3) # (batch_size,256,seq_len-2)
        x = F.relu(x)
        x = self.adaptive_max_pool(x) # (batch_size,256,2)
        # Concatenate the two pooled positions: [pos0 channels..., pos1 channels...].
        x = torch.cat((x[:,:,0],x[:,:,1]),dim=1) # (batch_size,256*2)
        output = self.fc(x)
        return F.log_softmax(output,dim=1)
# The checkpoint is a whole pickled module, so the MyNet class defined above
# must be in scope when it is unpickled. map_location lets a checkpoint saved
# on a GPU be restored on a CPU-only machine.
# NOTE: on PyTorch releases where torch.load defaults to weights_only=True,
# pass weights_only=False here (trusted files only).
my_net = torch.load("./output/text_cnn.h5",map_location=device).to(device).eval()

In [16]:
# Evaluate the TextCNN checkpoint on the test set and show the per-class report.
C,R = compute_confusion_matrix(my_net,test_dataloader)
print("TextCNN\n", R, sep="\n")

TextCNN

              precision    recall  f1-score   support

           0     0.8196    0.8736    0.8457       182
           1     0.8686    0.8128    0.8398       187

    accuracy                         0.8428       369
   macro avg     0.8441    0.8432    0.8428       369
weighted avg     0.8444    0.8428    0.8427       369



### BiLSTM with Attention

In [6]:
import torch.nn as nn
import torch.nn.functional as F
class MyNet(nn.Module):
    """Bidirectional LSTM classifier with an attention layer.

    The attention query is derived from the LSTM's final hidden states; the
    forward and backward halves of the per-step LSTM output are added
    together and pooled with the resulting attention weights.

    Args:
        embedding_size: input size of the LSTM (width of the embeddings).
        hidden_size: hidden size of each LSTM direction.
        num_layers: number of stacked LSTM layers.
    """
    def __init__(self,embedding_size,hidden_size,num_layers=1):
        super(MyNet,self).__init__()
        # Start from the pretrained word vectors; freeze=False keeps them trainable.
        self.embedding = nn.Embedding.from_pretrained(embedding_weight,freeze=False)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(embedding_size,hidden_size,batch_first=True,bidirectional=True,num_layers=self.num_layers)
        # Maps the summed final hidden state to the attention query vector.
        self.attention_w = nn.Sequential(
            nn.Linear(hidden_size,hidden_size),
            nn.Dropout(0.6),
            nn.ReLU()
        )
        self.fc = nn.Sequential(
            nn.BatchNorm1d(hidden_size),
            nn.Linear(hidden_size,256),
            nn.Dropout(0.6),
            nn.ReLU(),
            nn.Linear(256,2),
        )

    def attention_layer(self,lstm_output,lstm_h_n = None):
        """Pool the LSTM outputs with attention weights.

        Args:
            lstm_output: (batch_size,seq_len,hidden_size*2)
            lstm_h_n: (num_layers*2,batch_size,hidden_size); required even
                though the signature keeps its historical ``None`` default.

        Returns:
            Tensor of shape (batch_size,hidden_size).

        Raises:
            ValueError: if ``lstm_h_n`` is not given.
        """
        if lstm_h_n is None:
            raise ValueError("attention_layer requires lstm_h_n (the LSTM's final hidden states)")
        lstm_h_n = lstm_h_n.permute(1,0,2) # (batch_size,num_layers*2,hidden_size)
        lstm_h_n = torch.sum(lstm_h_n,dim=1) # (batch_size,hidden_size)
        attention_w = self.attention_w(lstm_h_n) # (batch_size,hidden_size)
        attention_w = attention_w.unsqueeze(dim=2) # (batch_size,hidden_size,1)
        # Add the forward and backward halves of the bidirectional output.
        H = lstm_output[:,:,:self.hidden_size] + lstm_output[:,:,self.hidden_size:] # (batch_size,seq_len,hidden_size)
        # NOTE(review): padded time steps (all-zero rows of H) get score 0 and
        # therefore still receive softmax weight. Left unmasked on purpose so
        # the computation stays identical to what the saved checkpoint uses.
        alpha = F.softmax(torch.matmul(H,attention_w),dim=1) # (batch_size,seq_len,1)
        r = H * alpha # (batch_size,seq_len,hidden_size)
        out = torch.relu(torch.sum(r,1)) # (batch_size,hidden_size)
        return out

    def forward(self,input,data_len=None):
        """Classify a batch of padded index sequences.

        Args:
            input: LongTensor of token indices, shape (batch_size, seq_len).
            data_len: unpadded length of every sequence. ``None`` means every
                sequence spans the full ``seq_len`` (no packing is done).

        Returns:
            Log-probabilities of shape (batch_size, 2).
        """
        input = self.embedding(input)
        if data_len is None:
            # No lengths given: run the LSTM on the padded batch as-is.
            output,(h_n,c_n) = self.lstm(input)
        else:
            # Pack so the LSTM skips padded steps; enforce_sorted=False accepts any order.
            packed = nn_utils.rnn.pack_padded_sequence(input,data_len,batch_first=True,enforce_sorted=False)
            # output (batch_size,seq_len,hidden_size*2), h_n (num_layers*2,batch_size,hidden_size)
            output,(h_n,c_n) = self.lstm(packed)
            output,_ = nn_utils.rnn.pad_packed_sequence(output,batch_first=True)
        output = self.attention_layer(output,h_n) # (batch_size,hidden_size)
        output = self.fc(output) # (batch_size,2)
        return F.log_softmax(output,dim=1)
    
# The checkpoint is a whole pickled module, so the MyNet class defined above
# must be in scope when it is unpickled. map_location/.to(device) put the model
# on the same device the evaluation inputs are moved to.
# NOTE: on PyTorch releases where torch.load defaults to weights_only=True,
# pass weights_only=False here (trusted files only).
my_net = torch.load("./output/bi_lstm.h5",map_location=device).to(device).eval()

In [18]:
# Evaluate the BiLSTM-with-attention checkpoint; True makes the helper pass
# the sequence lengths to the model.
C,R = compute_confusion_matrix(my_net,test_dataloader,True)
print("BiLSTM with Attention\n", R, sep="\n")

BiLSTM with Attention

              precision    recall  f1-score   support

           0     0.8564    0.8516    0.8540       182
           1     0.8564    0.8610    0.8587       187

    accuracy                         0.8564       369
   macro avg     0.8564    0.8563    0.8563       369
weighted avg     0.8564    0.8564    0.8564       369



### LSTM模型

In [5]:
class MyNet(nn.Module):
    """Stacked bidirectional LSTM classifier.

    The final hidden states of every layer and direction are summed into one
    vector per sample, which a small MLP turns into log-probabilities over
    2 classes.

    Args:
        embedding_size: input size of the LSTM (width of the embeddings).
        hidden_size: hidden size of each LSTM direction.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, embedding_size, hidden_size, num_layers=2):
        super().__init__()
        # Start from the pretrained word vectors; freeze=False keeps them trainable.
        self.embedding = nn.Embedding.from_pretrained(embedding_weight, freeze=False)
        self.num_layers = num_layers
        self.lstm = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=self.num_layers,
            batch_first=True,
            bidirectional=True,
        )
        classifier_layers = [
            nn.BatchNorm1d(hidden_size),
            nn.Linear(hidden_size, 128),
            nn.Dropout(0.6),
            nn.ReLU(),
            nn.Linear(128, 2),
        ]
        self.fc = nn.Sequential(*classifier_layers)

    def forward(self, input, data_len):
        """Return log-probabilities of shape (batch_size, 2).

        Args:
            input: LongTensor of token indices, shape (batch_size, seq_len).
            data_len: unpadded length of every sequence in the batch.
        """
        embedded = self.embedding(input)
        # Pack so the LSTM skips padded steps; enforce_sorted=False accepts any order.
        packed = nn_utils.rnn.pack_padded_sequence(
            embedded, data_len, batch_first=True, enforce_sorted=False
        )
        # final_hidden: (num_layers*2, batch_size, hidden_size)
        _, (final_hidden, _final_cell) = self.lstm(packed)
        # Batch first, then add up all layers/directions -> (batch_size, hidden_size)
        summed_hidden = final_hidden.permute(1, 0, 2).sum(dim=1)
        logits = self.fc(summed_hidden)
        return F.log_softmax(logits, dim=1)
        
# The checkpoint is a whole pickled module, so the MyNet class defined above
# must be in scope when it is unpickled. map_location/.to(device) put the model
# on the same device the evaluation inputs are moved to.
# NOTE: on PyTorch releases where torch.load defaults to weights_only=True,
# pass weights_only=False here (trusted files only).
my_net = torch.load("./output/lstm.h5",map_location=device).to(device).eval()

In [20]:
# Evaluate the LSTM checkpoint; True makes the helper pass the sequence
# lengths to the model.
C,R = compute_confusion_matrix(my_net,test_dataloader,True)
print("LSTM\n", R, sep="\n")

LSTM

              precision    recall  f1-score   support

           0     0.7740    0.8846    0.8256       182
           1     0.8696    0.7487    0.8046       187

    accuracy                         0.8157       369
   macro avg     0.8218    0.8166    0.8151       369
weighted avg     0.8224    0.8157    0.8150       369

