In [5]:
import os
import torch
import torch.nn as nn
from  torch.utils.data import Dataset,DataLoader

In [6]:
# 读取文本数据
def read_data(train_or_test,num=None):
    # train数据输入'train',test数据输入'test'
    with open(os.path.join('./data',train_or_test+'.txt'),'r',encoding='utf-8') as file:
        lines = file.readlines()
    text_all_num = len(lines)
    text = []
    labels = []
    if num and num <= text_all_num :
        for i in range(num):
            text.append(lines[i].split('\t')[0])
            labels.append(lines[i].split('\t')[1].strip())
        print('已完成{}条数据读取'.format(num))
    else:
        for line in lines:
            text.append(line.split('\t')[0])
            labels.append(line.split('\t')[1].strip())
        print('已完成全部{}条数据读取'.format(text_all_num))
    return text,labels

In [7]:
# 构建词库和随机词向量
def built_curpus(text,embedding_num):
    word_index_dict = {"<PAD>":0,"<UNK>":1}
    for sentense in text:
        for word in sentense:
            word_index_dict[word] = word_index_dict.get(word,len(word_index_dict))
    return word_index_dict,nn.Embedding(len(word_index_dict),embedding_num)

In [4]:
text,labels = read_data('test')
word_index_dict,words_embedding = built_curpus(text,10)

FileNotFoundError: [Errno 2] No such file or directory: './data\\test.txt'

In [12]:
# 构建Dataset
class TextDataset(Dataset):
    def __init__(self,text,labels,word_index_dict,seq_max_len):
        self.text = text
        self.labels = labels
        self.word_index_dict = word_index_dict
        self.seq_max_len = seq_max_len
    
    def __getitem__(self, index):
        sentense = self.text[index][:self.seq_max_len]
        label = int(self.labels[index])

        sentense_idx = [self.word_index_dict.get(word,1) for word in sentense]
        sentense_idx = sentense_idx + [0]*(self.seq_max_len - len(sentense_idx))
        sentense_idx = torch.tensor(sentense_idx).unsqueeze(dim=0)
        return sentense_idx,label
    
    def __len__(self):
        return len(self.text)

In [11]:
test_data = TextDataset(text,labels,word_index_dict,100)
test_data.__getitem__(10)

(tensor([[ 56,  57, 133, 134,  13,  14,  19,  96,  17,  18, 135, 136, 127,  13,
          125, 126,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0]]),
 '3')

In [9]:
# 构建模块
class Block(nn.Module):
    def __init__(self,kernel_s,embedding_num,seq_max_len,hidden_num):
        super().__init__()
        self.cnn = nn.Conv2d(in_channels=1,out_channels=hidden_num,kernel_size=(kernel_s,embedding_num))
        self.act = nn.ReLU()
        self.mxp = nn.MaxPool1d(kernel_size=(seq_max_len-kernel_s+1))
    
    def forward(self,batch_emb):
        c = self.cnn.forward(batch_emb)
        a = self.act.forward(c)
        a = a.squeeze(dim=-1)
        m = self.mxp.forward(a)
        m = m.squeeze(dim=-1)
        return m

In [10]:
# 构建TextCNN网络结构
class TextCNNModel(nn.Module):
    def __init__(self,embedding_matrix,seq_max_len,class_num,hidden_num):
        super().__init__()
        self.embedding_num = embedding_matrix.weight.shape[1]

        self.block1 = Block(2,self.embedding_num,seq_max_len,hidden_num)
        self.block2 = Block(3,self.embedding_num,seq_max_len,hidden_num)
        self.block3 = Block(4,self.embedding_num,seq_max_len,hidden_num)
        self.block4 = Block(5,self.embedding_num,seq_max_len,hidden_num)

        self.embedding_matrix = embedding_matrix

        self.classifier = nn.Linear(hidden_num*4,class_num)
        self.loss_fun = nn.CrossEntropyLoss()

    def forward(self,batch_idx,batch_label=None):
        batch_emb = self.embedding_matrix(batch_idx)
        b1_result = self.block1.forward(batch_emb)
        b2_result = self.block1.forward(batch_emb)
        b3_result = self.block1.forward(batch_emb)
        b4_result = self.block1.forward(batch_emb)

        feature = torch.cat([b1_result,b2_result,b3_result,b4_result],dim=1)
        pre = self.classifier(feature)
        
        if batch_label is not None:
            loss = self.loss_fun(pre,batch_label)
            return loss
        else:
            return torch.argmax(pre,dim=-1)




In [13]:
train_text,train_label = read_data("train")
dev_text,dev_label = read_data("dev")

embedding_num = 10
seq_max_len = 20
batch_size = 200
epoch = 10
lr = 0.001
hidden_num = 2
class_num = len(set(train_label))
device = "cuda:0" if torch.cuda.is_available() else "cpu"

word_2_index,words_embedding = built_curpus(train_text,embedding_num)

train_dataset = TextDataset(train_text,train_label,word_2_index,seq_max_len)
train_loader = DataLoader(train_dataset,batch_size,shuffle=False)

dev_dataset = TextDataset(dev_text, dev_label, word_2_index, seq_max_len)
dev_loader = DataLoader(dev_dataset, batch_size, shuffle=False)


model = TextCNNModel(words_embedding,seq_max_len,class_num,hidden_num).to(device)
opt = torch.optim.AdamW(model.parameters(),lr=lr)

for e in range(epoch):
    for batch_idx,batch_label in train_loader:
        batch_idx = batch_idx.to(device)
        batch_label = batch_label.to(device)
        loss = model.forward(batch_idx,batch_label)
        loss.backward()
        opt.step()
        opt.zero_grad()

    print(f"loss:{loss:.3f}")

    right_num = 0
    for batch_idx,batch_label in dev_loader:
        batch_idx = batch_idx.to(device)
        batch_label = batch_label.to(device)
        pre = model.forward(batch_idx)
        right_num += int(torch.sum(pre==batch_label))

    print(f"acc = {right_num/len(dev_text)*100:.2f}%")


已完成全部180000条数据读取
已完成全部10000条数据读取
loss:2.050
acc = 23.66%
loss:1.948
acc = 28.85%
loss:1.868
acc = 32.45%
loss:1.826
acc = 34.39%
loss:1.775
acc = 34.98%
loss:1.723
acc = 35.53%
loss:1.669
acc = 38.02%
loss:1.649
acc = 39.56%
loss:1.634
acc = 41.20%
loss:1.609
acc = 42.37%


In [17]:
loss = nn.CrossEntropyLoss()
input = torch.randn(3, 5, requires_grad=True)
target = torch.empty(3, dtype=torch.long).random_(5)
output = loss(input, target)

In [21]:
torch.cuda.is_available()

True