## 构建LSTM模型

In [6]:
import copy
import math
import tqdm
import numpy
import torch
import torch.nn as nn
import torch.utils.data as Data
from random import *
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

batch_size = 320
src_vocab_size = 21128          # 字典大小
d_model = 512                   # embedding维度
num_layers = 2                  # LSTM层数
hidden_size = 100               # LSTM隐藏层
linear_hidden_size = 10         # 全链接层隐藏数
classes = 2                     # 类别数
dropout = 0.3                   # LSTM中dropout
lr = 1e-3                       # 学习率
epochs = 10                     # 训练次数    

In [7]:
def load_dataset(path, pad_size=100):
    contents = []
    word2idx = {}
    idx2word = {}
    with open("./vocab.txt", 'r', encoding='UTF-8') as f:
        idx2word = {idx: line.strip() for idx, line in  enumerate(tqdm.tqdm(f))}
        word2idx = {idx2word[key]: key for key in  idx2word}
    with open(path, 'r', encoding='UTF-8') as f:
        for line in tqdm.tqdm(f):
            token_ids = []
            lin = line.strip()
            if not lin:
                continue
            label = lin[0]
            content = lin[2:]
            token_ids.append (word2idx['[CLS]'])
            for key in content:
                token_ids.append(word2idx.get(key, 0))
            seq_len = len(token_ids)
            mask = []
            if pad_size:
                if seq_len < pad_size:
                    token_ids += ([0] * (pad_size - seq_len))
                else:
                    token_ids = token_ids[:pad_size]
                    seq_len = pad_size
            contents.append((numpy.array(token_ids), int(label), seq_len))
    return contents
def load_dataset_backward(path, pad_size=100):
    contents = []
    word2idx = {}
    idx2word = {}
    with open("./vocab.txt", 'r', encoding='UTF-8') as f:
        idx2word = {idx: line.strip() for idx, line in  enumerate(tqdm.tqdm(f))}
        word2idx = {idx2word[key]: key for key in  idx2word}
    with open(path, 'r', encoding='UTF-8') as f:
        for line in tqdm.tqdm(f):
            token_ids = []
            lin = line.strip()
            if not lin:
                continue
            label = lin[-1]
            content = lin[0:-2]
            token_ids.append (word2idx['[CLS]'])
            for key in content:
                token_ids.append(word2idx.get(key, 0))
            seq_len = len(token_ids)
            mask = []
            if pad_size:
                if seq_len < pad_size:
                    token_ids += ([0] * (pad_size - seq_len))
                else:
                    token_ids = token_ids[:pad_size]
                    seq_len = pad_size
            contents.append((numpy.array(token_ids), int(label), seq_len))
    return contents

train = load_dataset("./train.csv")
train += load_dataset("./train_aaa.csv")
dev = load_dataset("./dev.csv")
test = load_dataset("./test.csv")
train += load_dataset_backward("./train_large.csv")
dev += load_dataset_backward("./validation_set.csv")
test += load_dataset_backward("./test_set.csv")
print(len(train))
train_input_ids, train_label, seq_len = zip(*train)
dev_input_ids, dev_label, seq_len = zip(*dev)
test_input_ids, test_label, seq_len = zip(*test)

train_input_ids, train_label = torch.tensor(train_input_ids), torch.tensor(train_label)
dev_input_ids, dev_label = torch.tensor(dev_input_ids), torch.tensor(dev_label)
test_input_ids, test_label = torch.tensor(test_input_ids), torch.tensor(test_label)

class MyDataSet(Data.Dataset):
    def __init__(self, input_ids, label):
        self.input_ids = input_ids
        self.label = label
    def __len__(self):
        return len(self.input_ids)
    def __getitem__(self, idx):
        return self.input_ids[idx], self.label[idx]

train_loader = Data.DataLoader(MyDataSet(train_input_ids, train_label), batch_size, True)
dev_loader = Data.DataLoader(MyDataSet(dev_input_ids, dev_label), batch_size, True)
test_loader = Data.DataLoader(MyDataSet(test_input_ids, test_label), batch_size, True)

21128it [00:00, 1344865.99it/s]
5978it [00:00, 61429.18it/s]
21128it [00:00, 1472463.24it/s]
800000it [00:12, 62562.96it/s]
21128it [00:00, 1564376.84it/s]
2000it [00:00, 76442.14it/s]
21128it [00:00, 1759955.01it/s]
1993it [00:00, 77312.39it/s]
21128it [00:00, 1787142.64it/s]
9985it [00:00, 50810.78it/s]
21128it [00:00, 1749496.67it/s]
1999it [00:00, 50166.72it/s]
21128it [00:00, 1755701.05it/s]
1997it [00:00, 52485.64it/s]


815963


In [8]:
class myNet(torch.nn.Module):
    def __init__(self):
        super(myNet, self).__init__()
        self.embed = torch.nn.Embedding(src_vocab_size, d_model)                        
        self.lstm = torch.nn.LSTM(input_size=d_model, hidden_size=hidden_size,num_layers=num_layers,dropout=dropout)
        self.linear = torch.nn.Linear(hidden_size, linear_hidden_size)
        self.linear1 = torch.nn.Linear(linear_hidden_size, classes)
    def forward(self,data):
        x = self.embed(data)                             # [64,32,512]   把字用字向量表示
        x,(h_n, c_n) = self.lstm(x.transpose(0, 1))      # x:[32, 64, 100]    记录每时刻最后一层的输出。
                                                         # h_n: [2, 64, 100]  记录每一层最后一次的输出
                                                         # c_n: [2, 64, 100]  记录每一层cell保存的值
        x = self.linear(x[-1])                           # [64, 10] 经过第一层全连接层
        x = self.linear1(x)                              # [64, 2] 经过第二层全连接层
        return x       

model = myNet().to(device)                              # 定义模型
criterion = torch.nn.CrossEntropyLoss()                  # 定义损失函数
optimizer = torch.optim.Adam(model.parameters(),lr=lr)   # 定义梯度优化算法                                

In [9]:
for epoch in range(epochs):
    train_loss_sum = 0.
    num = 1
    for data, label in train_loader:                     # 遍历一个opoch的数据
        data, label = data.to(device) ,label.to(device)  # 加载到GCP上
        train_output = model(data)                       # 前向传播
        train_loss = criterion(train_output, label)      # 计算损失
        train_loss_sum += train_loss.item()
        num += 1
        optimizer.zero_grad()                            # 梯度清零
        train_loss.backward()                            # 反向传播计算梯度
        optimizer.step()                                 # 更新梯度
    print(f'epoch:{epoch}, trin_loss:{train_loss_sum/num}')
    acc_count = 0
    dev_len = 0
    for data, label in dev_loader:
        data, label = data.to(device) ,label.to(device)
        dev_output = model(data)
        acc_count += torch.sum(dev_output.argmax(dim=1)==label)  # 计算准确个数
        dev_len +=len(label)                                      # 计算一共有多少个验证数据
    print(f'epoch:{epoch}, ACC:{acc_count/float(dev_len)}')

epoch:1, trin_loss:0.011283075922348229
epoch:1, ACC:0.9147287011146545
epoch:2, trin_loss:0.008165352075853095
epoch:2, ACC:0.9109777808189392
epoch:3, trin_loss:0.0060553966318207635
epoch:3, ACC:0.9242311120033264
epoch:4, trin_loss:0.004478563706568639
epoch:4, ACC:0.9287322163581848
epoch:5, trin_loss:0.0036063570159236766
epoch:5, ACC:0.9274818897247314
epoch:6, trin_loss:0.0028255865198441306
epoch:6, ACC:0.9262316226959229
epoch:7, trin_loss:0.0024332802720970926
epoch:7, ACC:0.9289822578430176
epoch:8, trin_loss:0.002091772300498737
epoch:8, ACC:0.9279820322990417
epoch:9, trin_loss:0.0018362991670523225
epoch:9, ACC:0.9307327270507812


In [10]:
acc_count = 0
test_len = 0
for data, label in test_loader:
    print(data)# 遍历测试数据
    data, label = data.to(device), label.to(device)  # 数据加载到GPU上
    test_output = model(data)     
    print(test_output.argmax(dim=1))# 预测
    acc_count += torch.sum(test_output.argmax(dim=1) == label)  # 计算准确个数
    test_len += len(label)                           # 计算一共有多少个测试数据
print(acc_count / float(test_len))

tensor([[ 101, 4692, 4692,  ...,    0,    0,    0],
        [ 101, 2207, 7987,  ...,    0,    0,    0],
        [ 101, 4692,    0,  ...,    0,    0,    0],
        ...,
        [ 101, 5799,  118,  ...,    0,    0,    0],
        [ 101, 5401, 4255,  ...,    0,    0,    0],
        [ 101, 4696, 4638,  ...,    0,    0,    0]])
tensor([0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0,
        1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1,
        1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0,
        0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
        0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
        0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1,
        0, 0, 1, 0, 0, 1, 1, 0, 1,

In [11]:
pad_size = 60
with open("./vocab.txt", 'r', encoding='UTF-8') as f:
        idx2word = {idx: line.strip() for idx, line in  enumerate(tqdm.tqdm(f))}
        word2idx = {idx2word[key]: key for key in  idx2word}
content = '老师好，今天有空吗'
token_ids = []
token_ids.append (word2idx['[CLS]'])
for key in content:
    token_ids.append(word2idx.get(key, 0))
seq_len = len(token_ids)
mask = []
if pad_size:
    if seq_len < pad_size:
        token_ids += ([0] * (pad_size - seq_len))
    else:
        token_ids = token_ids[:pad_size]
        seq_len = pad_size
content
token_ids
datain = torch.tensor(token_ids)
datain = datain.unsqueeze(0)
datain

21128it [00:00, 1509252.25it/s]


tensor([[ 101, 5439, 2360, 1962, 8024,  791, 1921, 3300, 4958, 1408,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0]])

In [12]:
test_output = model(datain.to(device)) 
test_output
print(test_output.argmax(dim=1))
print(test_output)

tensor([1], device='cuda:0', grad_fn=<NotImplemented>)
tensor([[-2.7726,  0.4816]], device='cuda:0', grad_fn=<AddmmBackward>)


In [13]:
torch.save(model.state_dict(), './model.pt')