使用bi-LSLTM+Attention 去预测

In [11]:
import torch.nn as nn
import torch
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

In [12]:
word2idx = np.load("./output/word2idx.npy",allow_pickle=True).item()
idx2word = {v:k for k,v in word2idx.items()}
embedding_weight = torch.load("./output/embedding_weight.h5")

In [13]:
def return_file_data_x_y(file_path):
    """
        解析文件中的数据，并返回每条数据的label和内容的index
        return X：[[2,4,15,112,4],[1,55,213]] Y:[0,1]
    """
    X = []
    Y = []
    with open(file_path) as f:
        lines = f.readlines()
        for line in lines:
            data = line.split()
            # 如果碰到空白行，则无需理会
            if(len(data) == 0):
                continue
            # 如果碰到不再词表中的词，则使用<unk>替代。
            x = [word2idx.get(i,word2idx["<unk>"]) for i in data[1:]]
            y = int(data[0])
            X.append(x) 
            Y.append(y)
    return X,Y

train_X,train_Y = return_file_data_x_y("./data/train_zh.txt")
validation_X,validation_Y = return_file_data_x_y("./data/validation_zh.txt")
test_X,test_Y = return_file_data_x_y("./data/test_zh.txt")

len(train_X),len(train_Y)

(19998, 19998)

## 构建Dataset和dataloader

在pytorch中，一个batch中的数据应该shape是一样的，因此，需要对数据进行padding

In [14]:
from torch.utils.data import Dataset,DataLoader
from torch.nn import utils as nn_utils

class CommentDataset(Dataset):
    def __init__(self,X,Y):
        self.X = X
        self.Y = Y
        self.len = len(X)
    def __getitem__(self,index):
        return self.X[index],self.Y[index]
    def __len__(self):
        return self.len

def collate_fn(batch_data):
    """
        batch_data的shape：(batch_size,变长句子,1)
    """
    X = []
    Y = []
    for data in batch_data:
        X.append(torch.LongTensor(data[0]))
        Y.append(data[1])    
    data_len = [len(i) for i in X]

    input_data = nn_utils.rnn.pad_sequence(X,batch_first=True,padding_value=0) # 因为<pad>=0，所以padding_value=0
    return input_data,torch.LongTensor(Y),data_len

batch_size = 256

train_dataset = CommentDataset(train_X,train_Y)
train_dataloader = DataLoader(train_dataset,batch_size=batch_size,shuffle=True,collate_fn=collate_fn,num_workers=16)

valid_dataset = CommentDataset(validation_X,validation_Y)
valid_dataloader = DataLoader(valid_dataset,batch_size=batch_size,collate_fn=collate_fn,num_workers=16)

test_dataset = CommentDataset(test_X,test_Y)
test_dataloader = DataLoader(test_dataset,batch_size=batch_size,collate_fn=collate_fn,num_workers=16)


In [15]:
# a,b,c = next(iter(train_dataloader))
# # a[0]
# b.shape

## 构建LSTM神经网络

LSTM的输入是词向量(batchsize,seqlen,embedding_size)

针对于双向LSTM，其LSTM的output的维度为(batch_size,seqlen,hidden_size*2)

In [16]:
import torch.nn as nn
import torch.nn.functional as F
class MyNet(nn.Module):
    def __init__(self,embedding_size,hidden_size,num_layers=1):
        super(MyNet,self).__init__()
        # 使用与训练好的词向量权重
        self.embedding = nn.Embedding.from_pretrained(embedding_weight,freeze=False) 
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(embedding_size,hidden_size,batch_first=True,bidirectional=True,num_layers=self.num_layers)
        
        # self.w = nn.Parameter(torch.Tensor(hidden_size,1))
        self.attention_w = nn.Sequential(
            nn.Linear(hidden_size,hidden_size),
            nn.Dropout(0.6),
            nn.ReLU()
        )
        self.fc = nn.Sequential(
            nn.BatchNorm1d(hidden_size),
            nn.Linear(hidden_size,256),
            nn.Dropout(0.6),
            nn.ReLU(),
            nn.Linear(256,2),
        )
        
    def attention_layer(self,lstm_output,lstm_h_n = None):
        """
            lstm_output：(batch_size,seq_len,hidden_size*2)
            lstm_h_n：(num_layers*2,batch_size,hidden_size)
        """
        # H = lstm_output[:,:,:self.hidden_size] + lstm_output[:,:,self.hidden_size:] # (batch_size,seq_len,hidden_size)
        # M = H # (batch_size,seq_len,hidden_size)

        # # w
        # lstm_h_n = lstm_h_n.permute(1,0,2) # (batch_size,num_layers*2,hidden_size)
        # lstm_h_n = torch.sum(lstm_h_n,dim=1) # (batch_size,hidden_size)
        # w = self.attention_w(lstm_h_n) # (batch_size,hidden_size)
        # w = lstm_h_n.unsqueeze(dim=1) # (batch_size,1,hidden_size)

        # # 生成alpha
        # alpha = F.softmax(torch.bmm(w,M.permute(0,2,1)),dim=2) # (batch_size,1,seq_len)

        # # 生成r
        # r = torch.bmm(alpha,H) #(batch_size,1,hidden_size)
        # r = r.squeeze(1)    #(batch_size,hidden_size)
        # return r 

        lstm_h_n = lstm_h_n.permute(1,0,2) # (batch_size,num_layers*2,hidden_size)
        lstm_h_n = torch.sum(lstm_h_n,dim=1) # (batch_size,hidden_size)
        attention_w = self.attention_w(lstm_h_n) # (batch_size,hidden_size)
        attention_w = attention_w.unsqueeze(dim=2) # (batch_size,hidden_size,1)

        H = lstm_output[:,:,:self.hidden_size] + lstm_output[:,:,self.hidden_size:] # (batch_size,seq_len,hidden_size)
        
        # alpha = F.softmax(torch.matmul(H,self.w),dim=1) #(batch_size,seq_len,1)
        alpha = F.softmax(torch.matmul(H,attention_w),dim=1) #(batch_size,seq_len,1)

        r = H * alpha # (batch_size,seq_len,hidden_size)
        out = torch.relu(torch.sum(r,1)) #(batch_size,hidden_size)

        return out
    
    def forward(self,input,data_len=None):
        input = self.embedding(input)
        input = nn_utils.rnn.pack_padded_sequence(input,data_len,batch_first=True,enforce_sorted=False)
        output,(h_n,c_n) = self.lstm(input) # output (batch_size,seq_len,hidden_size*2) h_n(num_layers*2,batch_size,hidden_size)
        output,_ = nn_utils.rnn.pad_packed_sequence(output,batch_first=True)
        output = self.attention_layer(output,h_n) #(batch_size,hidden_size)
        output = self.fc(output) # (batch_size,2)
        return F.log_softmax(output,dim=1)
        
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
my_net = MyNet(embedding_weight.shape[1],512,num_layers=1).to(device)

In [17]:
# a = torch.randint(1,321,(5,5)).to(device)
# _= my_net(a,[4,3])

In [18]:
import torch.optim as optim

optimzer = optim.Adam(my_net.parameters(),lr=0.0001, weight_decay=0.001)
loss_function = nn.CrossEntropyLoss()

In [19]:
def __compute_acc(targets,predicts):
    return np.mean(np.equal(targets, predicts))

def batch_net_forward(my_net,inputs,target,data_len):
    """
        返回一个batch的loss，以及预测结果[1,0,1,1,0]
    """
    outputs= my_net(inputs,data_len)
    loss = loss_function(outputs,target)
    _,top_index = torch.max(outputs,1)
    predict = top_index.cpu().numpy()
    return loss,predict

def train_batch(data):
    input = data[0].to(device)
    target = data[1].to(device).view(-1)
    data_len = data[2]
    optimzer.zero_grad()
    loss,predict = batch_net_forward(my_net,input,target,data_len)
    loss.backward()
    optimzer.step()
    return loss,predict

def eval_batch(data):
    input = data[0].to(device)
    target = data[1].to(device).view(-1)
    data_len = data[2]
    loss,predict = batch_net_forward(my_net,input,target,data_len)
    return loss,predict

def compute(dataloader,is_train=True):
    losses = []
    predicts = []
    targets = []
    for i,data in enumerate(dataloader):
        if is_train:
            loss,predict = train_batch(data)
        else:
            loss,predict = eval_batch(data)
        losses.append(loss.item())
        predicts.extend(predict)
        targets.extend(data[1].cpu().numpy())
    return np.mean(losses),__compute_acc(targets,predicts)

## 在训练的过程中，对验证集最好的结果进行保存

In [20]:
train_accs = []
valid_accs = []
test_accs = []
best_valid_loss = 999 # 最好的验证集loss
 
for epoch in range(70):
    my_net.train()
    train_loss,train_acc = compute(train_dataloader,is_train=True)
    
    my_net.eval()
    valid_loss,valid_acc = compute(valid_dataloader,is_train=False)
    test_loss,test_acc = compute(test_dataloader,is_train=False)
    
    train_accs.append(train_acc)
    valid_accs.append(valid_acc)
    test_accs.append(test_acc)
    
    print("{}------>训练集准确率：{:.4}，loss：{:.4}，验证集：{:.4}".format(epoch,train_acc,train_acc,valid_acc))
    if best_valid_loss > valid_loss:
        best_valid_loss = valid_loss
        torch.save(my_net,"./output/bi_lstm.h5")
        print(">>>>验证集合：Acc:{:.4}，Loss:{:.4}，测试集{:.4}<<<<".format(valid_acc,valid_loss,test_acc))
        

0------>训练集准确率：0.7279，loss：0.7279，验证集：0.7584
>>>>验证集合：Acc:0.7584，Loss:0.5392，测试集0.7507<<<<
1------>训练集准确率：0.7741，loss：0.7741，验证集：0.7747
>>>>验证集合：Acc:0.7747，Loss:0.4702，测试集0.7751<<<<
2------>训练集准确率：0.7819，loss：0.7819，验证集：0.766
3------>训练集准确率：0.7882，loss：0.7882，验证集：0.7767
4------>训练集准确率：0.7936，loss：0.7936，验证集：0.7524
5------>训练集准确率：0.7975，loss：0.7975，验证集：0.7941
>>>>验证集合：Acc:0.7941，Loss:0.4494，测试集0.7913<<<<
6------>训练集准确率：0.8026，loss：0.8026，验证集：0.7969
>>>>验证集合：Acc:0.7969，Loss:0.4432，测试集0.8049<<<<
7------>训练集准确率：0.8078，loss：0.8078，验证集：0.7849
8------>训练集准确率：0.8149，loss：0.8149，验证集：0.8021
>>>>验证集合：Acc:0.8021，Loss:0.4376，测试集0.794<<<<
9------>训练集准确率：0.8166，loss：0.8166，验证集：0.8037
>>>>验证集合：Acc:0.8037，Loss:0.433，测试集0.8103<<<<
10------>训练集准确率：0.8205，loss：0.8205，验证集：0.7994
11------>训练集准确率：0.8237，loss：0.8237，验证集：0.7961
12------>训练集准确率：0.829，loss：0.829，验证集：0.8133
>>>>验证集合：Acc:0.8133，Loss:0.412，测试集0.813<<<<
13------>训练集准确率：0.8322，loss：0.8322，验证集：0.8165
>>>>验证集合：Acc:0.8165，Loss:0.4119，测试集0.8103<<<<
14---

In [21]:
from pyecharts.charts import Line
from pyecharts import options as opts
y1 = train_accs
x = range(len(y1))
y2 = valid_accs
y3 = test_accs

markpoint_opts=opts.MarkPointOpts(data=[opts.MarkPointItem(name="最大值",type_="max")])

line = (Line()
       .add_xaxis(x)
       .add_yaxis('训练acc', y1, is_smooth=True,linestyle_opts=opts.LineStyleOpts(color="red",width=3))
       .add_yaxis('验证acc', y2, is_smooth=True,linestyle_opts=opts.LineStyleOpts(color="yellow",width=3),markpoint_opts=markpoint_opts)       
       .add_yaxis('测试acc', y3, is_smooth=True,linestyle_opts=opts.LineStyleOpts(color="blue",width=3))
       .set_global_opts(title_opts=opts.TitleOpts(title="训练过程"),toolbox_opts=opts.ToolboxOpts(),)
       .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
       
      )

line.render_notebook()
