使用LSLTM的最后一个h_n去预测。效果不咋地，在验证集上最多只有$80\%$的正确率

In [1]:
import torch.nn as nn
import torch
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
word2idx = np.load("./output/word2idx.npy",allow_pickle=True).item()
idx2word = {v:k for k,v in word2idx.items()}
embedding_weight = torch.load("./output/embedding_weight.h5")

In [3]:
def return_file_data_x_y(file_path):
    """
        解析文件中的数据，并返回每条数据的label和内容的index
    """
    X = []
    Y = []
    with open(file_path) as f:
        lines = f.readlines()
        for line in lines:
            data = line.split()
            if(len(data) == 0):
                continue
            x = [word2idx[i] if i in word2idx.keys() else word2idx["<unk>"] for i in data[1:]]
            y = int(data[0])
            X.append(x) 
            Y.append(y)
    return X,Y

train_X,train_Y = return_file_data_x_y("./data/train_zh.txt")
validation_X,validation_Y = return_file_data_x_y("./data/validation_zh.txt")
test_X,test_Y = return_file_data_x_y("./data/test_zh.txt")

len(train_X),len(train_Y)

(19998, 19998)

## 构建Dataset和dataloader

在pytorch中，一个batch中的数据应该shape是一样的，因此，需要对数据进行padding

In [4]:
from torch.utils.data import Dataset,DataLoader
from torch.nn import utils as nn_utils

class CommentDataset(Dataset):
    def __init__(self,X,Y):
        self.X = X
        self.Y = Y
        self.len = len(X)
    def __getitem__(self,index):
        return self.X[index],self.Y[index]
    def __len__(self):
        return self.len

def collate_fn(batch_data):
    """
        batch_data的shape：(batch_size,变长句子,1)
    """
    X = []
    Y = []
    for data in batch_data:
        X.append(torch.LongTensor(data[0]))
        Y.append(data[1])    
    data_len = [len(i) for i in X]

    input_data = nn_utils.rnn.pad_sequence(X,batch_first=True,padding_value=0) # 因为<pad>=0，所以padding_value=0
    return input_data,torch.LongTensor(Y),data_len

batch_size = 256

train_dataset = CommentDataset(train_X,train_Y)
train_dataloader = DataLoader(train_dataset,batch_size=batch_size,shuffle=True,collate_fn=collate_fn,num_workers=16)

valid_dataset = CommentDataset(validation_X,validation_Y)
valid_dataloader = DataLoader(valid_dataset,batch_size=batch_size,collate_fn=collate_fn,num_workers=16)

test_dataset = CommentDataset(test_X,test_Y)
test_dataloader = DataLoader(test_dataset,batch_size=batch_size,collate_fn=collate_fn,num_workers=16)


In [5]:
# a,b,c = next(iter(train_dataloader))
# # a[0]
# b.shape

## 构建LSTM神经网络

LSTM的输入是词向量(batchsize,seqlen,embedding_size)

In [6]:
import torch.nn as nn
import torch
import torch.nn.functional as F
import torch.optim as optim

class MyNet(nn.Module):
    def __init__(self,embedding_size,hidden_size,num_layers=2):
        super(MyNet,self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_weight,freeze=False) 
        self.num_layers = num_layers
        self.lstm = nn.LSTM(embedding_size,hidden_size,batch_first=True,bidirectional=True,num_layers=self.num_layers)
        self.fc = nn.Sequential(
            nn.BatchNorm1d(hidden_size),
            nn.Linear(hidden_size,128),
            nn.Dropout(0.6),
            nn.ReLU(),
            nn.Linear(128,2),   
        )
    
    def forward(self,input,data_len):
        
        input = self.embedding(input)
        input = nn_utils.rnn.pack_padded_sequence(input,data_len,batch_first=True,enforce_sorted=False)
        _,(h_n,c_n) = self.lstm(input) # h_n(num_layers*2,batch_size,hidden_size)
        h_n = torch.permute(h_n,(1,0,2)) # h_n(batch_size,num_layers*2,hidden_size)
        h_n = torch.sum(h_n,dim=1) # h_n (batch_size,hidden_size)
        output = self.fc(h_n)
        return F.log_softmax(output,dim=1)
        
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
my_net = MyNet(embedding_weight.shape[1],512).to(device)

In [7]:
# a = torch.randint(1,321,(2,10)).to(device)
# _= my_net(a,[2,1])

In [8]:
import torch.optim as optim
optimzer = optim.Adam(my_net.parameters(),lr=0.0001, weight_decay=0.001)
loss_function = nn.CrossEntropyLoss()

In [9]:
def __compute_acc(targets,predicts):
    return np.mean(np.equal(targets, predicts))

def batch_net_forward(my_net,inputs,target,data_len):
    """
        返回一个batch的loss，以及预测结果[1,0,1,1,0]
    """
    outputs= my_net(inputs,data_len)
    loss = loss_function(outputs,target)
    _,top_index = torch.max(outputs,1)
    predict = top_index.cpu().numpy()
    return loss,predict

def train_batch(data):
    input = data[0].to(device)
    target = data[1].to(device).view(-1)
    data_len = data[2]
    optimzer.zero_grad()
    loss,predict = batch_net_forward(my_net,input,target,data_len)
    loss.backward()
    optimzer.step()
    return loss,predict

def eval_batch(data):
    input = data[0].to(device)
    target = data[1].to(device).view(-1)
    data_len = data[2]
    loss,predict = batch_net_forward(my_net,input,target,data_len)
    return loss,predict

def compute(dataloader,is_train=True):
    losses = []
    predicts = []
    targets = []
    for i,data in enumerate(dataloader):
        if is_train:
            loss,predict = train_batch(data)
        else:
            loss,predict = eval_batch(data)
        losses.append(loss.item())
        predicts.extend(predict)
        targets.extend(data[1].cpu().numpy())
    return np.mean(losses),__compute_acc(targets,predicts)

In [10]:
train_accs = []
valid_accs = []
test_accs = []
best_valid_loss = 999 # 最好的验证集loss
 
for epoch in range(55):
    my_net.train()
    train_loss,train_acc = compute(train_dataloader,is_train=True)
    
    my_net.eval()
    valid_loss,valid_acc = compute(valid_dataloader,is_train=False)
    test_loss,test_acc = compute(test_dataloader,is_train=False)
    
    train_accs.append(train_acc)
    valid_accs.append(valid_acc)
    test_accs.append(test_acc)
    
    print("{}------>训练集准确率：{:.4}，loss：{:.4}，验证集：{:.4}".format(epoch,train_acc,train_acc,valid_acc))
    if best_valid_loss > valid_loss:
        best_valid_loss = valid_loss
        torch.save(my_net,"./output/lstm.h5")
        print(">>>>验证集合：Acc:{:.4}，Loss:{:.4}，测试集{:.4}<<<<".format(valid_acc,valid_loss,test_acc))
        

0------>训练集准确率：0.6946，loss：0.6946，验证集：0.7305
>>>>验证集合：Acc:0.7305，Loss:0.5438，测试集0.7805<<<<
1------>训练集准确率：0.7706，loss：0.7706，验证集：0.7712
>>>>验证集合：Acc:0.7712，Loss:0.4861，测试集0.7913<<<<
2------>训练集准确率：0.7794，loss：0.7794，验证集：0.7779
>>>>验证集合：Acc:0.7779，Loss:0.4667，测试集0.7724<<<<
3------>训练集准确率：0.7862，loss：0.7862，验证集：0.7436
4------>训练集准确率：0.7911，loss：0.7911，验证集：0.7609
5------>训练集准确率：0.7976，loss：0.7976，验证集：0.787
>>>>验证集合：Acc:0.787，Loss:0.46，测试集0.794<<<<
6------>训练集准确率：0.8023，loss：0.8023，验证集：0.7724
7------>训练集准确率：0.8015，loss：0.8015，验证集：0.7616
8------>训练集准确率：0.8058，loss：0.8058，验证集：0.7625
9------>训练集准确率：0.812，loss：0.812，验证集：0.7898
>>>>验证集合：Acc:0.7898，Loss:0.4548，测试集0.8049<<<<
10------>训练集准确率：0.8149，loss：0.8149，验证集：0.7945
11------>训练集准确率：0.8164，loss：0.8164，验证集：0.7619
12------>训练集准确率：0.8263，loss：0.8263，验证集：0.8005
>>>>验证集合：Acc:0.8005，Loss:0.4432，测试集0.8157<<<<
13------>训练集准确率：0.8235，loss：0.8235，验证集：0.798
>>>>验证集合：Acc:0.798，Loss:0.4429，测试集0.8076<<<<
14------>训练集准确率：0.8285，loss：0.8285，验证集：0.7337
15-----

In [11]:
from pyecharts.charts import Line
from pyecharts import options as opts
y1 = train_accs
x = range(len(y1))
y2 = valid_accs
y3 = test_accs

markpoint_opts=opts.MarkPointOpts(data=[opts.MarkPointItem(name="最大值",type_="max")])

line = (Line()
       .add_xaxis(x)
       .add_yaxis('训练acc', y1, is_smooth=True,linestyle_opts=opts.LineStyleOpts(color="red",width=3))
       .add_yaxis('验证acc', y2, is_smooth=True,linestyle_opts=opts.LineStyleOpts(color="yellow",width=3),markpoint_opts=markpoint_opts)       
       .add_yaxis('测试acc', y3, is_smooth=True,linestyle_opts=opts.LineStyleOpts(color="blue",width=3))
       .set_global_opts(title_opts=opts.TitleOpts(title="训练过程"),toolbox_opts=opts.ToolboxOpts(),)
       .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
       
      )

line.render_notebook()
