In [1]:
import torch.nn as nn
import torch
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
word2idx = np.load("./output/word2idx.npy",allow_pickle=True).item()
idx2word = {v:k for k,v in word2idx.items()}
embedding_weight = torch.load("./output/embedding_weight.h5")

In [3]:
def return_file_data_x_y(file_path):
    """
        解析文件中的数据，并返回每条数据的label和内容的index
        return X：[[2,4,15,112,4],[1,55,213]] Y:[0,1]
    """
    X = []
    Y = []
    with open(file_path) as f:
        lines = f.readlines()
        for line in lines:
            data = line.split()
            # 如果碰到空白行，则无需理会
            if(len(data) == 0):
                continue
            # 如果碰到不再词表中的词，则使用<unk>替代。
            x = [word2idx.get(i,word2idx["<unk>"]) for i in data[1:]]
            y = int(data[0])
            X.append(x) 
            Y.append(y)
    return X,Y

train_X,train_Y = return_file_data_x_y("./data/train_zh.txt")
validation_X,validation_Y = return_file_data_x_y("./data/validation_zh.txt")
test_X,test_Y = return_file_data_x_y("./data/test_zh.txt")

len(train_X),len(train_Y)

(19998, 19998)

## 构建Dataset和dataloader

在pytorch中，一个batch中的数据应该shape是一样的，因此，需要对数据进行padding

In [4]:
from torch.utils.data import Dataset,DataLoader
from torch.nn import utils as nn_utils

class CommentDataset(Dataset):
    def __init__(self,X,Y):
        self.X = X
        self.Y = Y
        self.len = len(X)
    def __getitem__(self,index):
        return self.X[index],self.Y[index]
    def __len__(self):
        return self.len

def collate_fn(batch_data):
    """
        将batch_data中的句子变成一样长，使用<pad>进行填充
    """
    X = []
    Y = []
    for data in batch_data:
        X.append(torch.LongTensor(data[0]))
        Y.append(data[1]) 

    # data_len代表句子的实际长度，在LSTM中，需要使用；在TextCNN并不需要使用
    data_len = [len(i) for i in X]

    input_data = nn_utils.rnn.pad_sequence(X,batch_first=True,padding_value=0) # 因为<pad>对应的id为0，所以padding_value=0
    return input_data,torch.LongTensor(Y),data_len

batch_size = 256

train_dataset = CommentDataset(train_X,train_Y)
train_dataloader = DataLoader(train_dataset,batch_size=batch_size,shuffle=True,collate_fn=collate_fn,num_workers=16)

valid_dataset = CommentDataset(validation_X,validation_Y)
valid_dataloader = DataLoader(valid_dataset,batch_size=batch_size,collate_fn=collate_fn,num_workers=16)

test_dataset = CommentDataset(test_X,test_Y)
test_dataloader = DataLoader(test_dataset,batch_size=batch_size,collate_fn=collate_fn,num_workers=16)


## 构建TextCNN神经网络

TextCNN的输入是词向量(batchsize,seqlen,embedding_size)

In [5]:
class MyNet(nn.Module):
    def __init__(self,embedding_size):
        super(MyNet,self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_weight,freeze=False)
        self.conv = nn.Conv2d(1,256,(3,embedding_size)) # kernel_size 为(3,embedding_size)
        self.adaptive_max_pool = nn.AdaptiveMaxPool1d(2)
        self.fc = nn.Sequential(
            nn.Linear(256*2,128),
            nn.Dropout(0.6),
            nn.ReLU(),

            nn.Linear(128,2),   
        )
    
    def forward(self,x): # (batch_size,seq_len)
        x = self.embedding(x) #(batch_size,seq_len,embedding_size)
        x = x.unsqueeze(1) # (batch_size,1,seq_len,embedding_size) ，因为CNN的input为(N,C,H,W)
        x = self.conv(x) #(batch_size,256,seq_len-2,1)
        x = x.squeeze(3) #(batch_size,256,seq_len-2)
        x = F.relu(x)
        x = self.adaptive_max_pool(x) #(batch_size,256,2)
        x = torch.cat((x[:,:,0],x[:,:,1]),dim=1) #(batch_size,256*2)
        output = self.fc(x)
        return F.log_softmax(output,dim=1)
        
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
my_net = MyNet(embedding_weight.shape[1]).to(device)

In [6]:
# a = torch.randint(1,321,(2,10)).to(device)
# _= my_net(a)

In [7]:
optimzer = optim.Adam(my_net.parameters(),lr=0.0001, weight_decay=0.001)
loss_function = nn.CrossEntropyLoss()

In [8]:
import torch.optim as optim
loss_function = nn.CrossEntropyLoss()
optimzer = optim.Adam(my_net.parameters(),lr=0.0001, weight_decay=0.001)

def __compute_acc(targets,predicts):
    return np.mean(np.equal(targets, predicts))

def batch_net_forward(my_net,inputs,target):
    """
        返回一个batch的loss，以及预测结果[1,0,1,1,0]
    """
    outputs= my_net(inputs)
    loss = loss_function(outputs,target)
    _,top_index = torch.max(outputs,1)
    predict = top_index.cpu().numpy()
    return loss,predict


def train_batch(data):
    input = data[0].to(device)
    target = data[1].to(device).view(-1)
    optimzer.zero_grad()
    loss,predict = batch_net_forward(my_net,input,target)
    loss.backward()
    optimzer.step()
    return loss,predict

def eval_batch(data):
    input = data[0].to(device)
    target = data[1].to(device).view(-1)
    loss,predict = batch_net_forward(my_net,input,target)
    return loss,predict

def compute(dataloader,is_train=True):
    losses = []
    predicts = []
    targets = []
    for i,data in enumerate(dataloader):
        if is_train:
            loss,predict = train_batch(data)
        else:
            loss,predict = eval_batch(data)
        losses.append(loss.item())
        predicts.extend(predict)
        targets.extend(data[1].cpu().numpy())
    return np.mean(losses),__compute_acc(targets,predicts)

In [9]:
train_accs = []
valid_accs = []
test_accs = []
best_valid_loss = 999 # 最好的验证集loss
 
for epoch in range(60):
    my_net.train()
    train_loss,train_acc = compute(train_dataloader,is_train=True)
    
    my_net.eval()
    valid_loss,valid_acc = compute(valid_dataloader,is_train=False)
    test_loss,test_acc = compute(test_dataloader,is_train=False)
    
    train_accs.append(train_acc)
    valid_accs.append(valid_acc)
    test_accs.append(test_acc)
    
    print("{}------>训练集准确率：{:.4}，loss：{:.4}，验证集：{:.4}".format(epoch,train_acc,train_acc,valid_acc))
    if best_valid_loss > valid_loss:
        best_valid_loss = valid_loss
        torch.save(my_net,"./output/text_cnn.h5")
        print(">>>>验证集合：Acc:{:.4}，Loss:{:.4}，测试集{:.4}<<<<".format(valid_acc,valid_loss,test_acc))
        

0------>训练集准确率：0.5537，loss：0.5537，验证集：0.6351
>>>>验证集合：Acc:0.6351，Loss:0.6766，测试集0.6829<<<<
1------>训练集准确率：0.6546，loss：0.6546，验证集：0.728
>>>>验证集合：Acc:0.728，Loss:0.6405，测试集0.7344<<<<
2------>训练集准确率：0.7088，loss：0.7088，验证集：0.747
>>>>验证集合：Acc:0.747，Loss:0.58，测试集0.7507<<<<
3------>训练集准确率：0.7381，loss：0.7381，验证集：0.7538
>>>>验证集合：Acc:0.7538，Loss:0.5301，测试集0.7669<<<<
4------>训练集准确率：0.7516，loss：0.7516，验证集：0.7595
>>>>验证集合：Acc:0.7595，Loss:0.5049，测试集0.7588<<<<
5------>训练集准确率：0.7636，loss：0.7636，验证集：0.7707
>>>>验证集合：Acc:0.7707，Loss:0.486，测试集0.7805<<<<
6------>训练集准确率：0.7717，loss：0.7717，验证集：0.7694
>>>>验证集合：Acc:0.7694，Loss:0.4771，测试集0.7751<<<<
7------>训练集准确率：0.7815，loss：0.7815，验证集：0.7786
>>>>验证集合：Acc:0.7786，Loss:0.4658，测试集0.7805<<<<
8------>训练集准确率：0.7862，loss：0.7862，验证集：0.7854
>>>>验证集合：Acc:0.7854，Loss:0.4578，测试集0.7913<<<<
9------>训练集准确率：0.798，loss：0.798，验证集：0.7886
>>>>验证集合：Acc:0.7886，Loss:0.4517，测试集0.7995<<<<
10------>训练集准确率：0.803，loss：0.803，验证集：0.7923
>>>>验证集合：Acc:0.7923，Loss:0.4448，测试集0.8076<<<<
11------>

In [10]:
from pyecharts.charts import Line
from pyecharts import options as opts
y1 = train_accs
x = range(len(y1))
y2 = valid_accs
y3 = test_accs

markpoint_opts=opts.MarkPointOpts(data=[opts.MarkPointItem(name="最大值",type_="max")])

line = (Line()
       .add_xaxis(x)
       .add_yaxis('训练acc', y1, is_smooth=True,linestyle_opts=opts.LineStyleOpts(color="red",width=3))
       .add_yaxis('验证acc', y2, is_smooth=True,linestyle_opts=opts.LineStyleOpts(color="yellow",width=3),markpoint_opts=markpoint_opts)       
       .add_yaxis('测试acc', y3, is_smooth=True,linestyle_opts=opts.LineStyleOpts(color="blue",width=3))
       .set_global_opts(title_opts=opts.TitleOpts(title="训练过程"),toolbox_opts=opts.ToolboxOpts(),)
       .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
       
      )

line.render_notebook()
