# 准备数据
- TorchText中的一个重要概念是Field。Field决定了你的数据会被怎样处理。在我们的情感分类任务中，我们所需要接触到的数据有文本字符串和两种情感，"pos"和"neg"。
- Field的参数指定了数据会被怎样处理。
- 我们使用TEXT field来定义如何处理电影评论，使用LABEL field来处理两个情感类别。
- LABEL是LabelField定义。这是一种特别用来处理label的Field

In [1]:
import torch
from keras.preprocessing.sequence import pad_sequences
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import *
from keras.datasets import imdb
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
VOCAB_SIZE = 25000
MAX_LEN = 100
BATCH_SIZE = 64
EMDEDDING_SIZE = 128
HIDDEN_SIZE = 128
DROP_OUT = 0.2
LEARNING_RATE = 1e-3

pad_sequences(sequences, maxlen=None, dtype=’int32’, padding=’pre’, truncating=’pre’, value=0.)

sequences：浮点数或整数构成的两层嵌套列表

maxlen：None或整数，为序列的最大长度。大于此长度的序列将被截短，小于此长度的序列将在后部填0.

dtype：返回的numpy array的数据类型

padding：‘pre’或‘post’，确定当需要补0时，在序列的起始还是结尾补

truncating：‘pre’或‘post’，确定当需要截断序列时，从起始还是结尾截断

value：浮点数，此值将在填充时代替默认的填充值0

返回值
返回形如(nb_samples,nb_timesteps)的2D张量

In [3]:
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=VOCAB_SIZE)
X_train = pad_sequences(X_train, maxlen=MAX_LEN, padding='post', truncating='post')
X_test = pad_sequences(X_test, maxlen=MAX_LEN, padding='post', truncating='post')
print(X_train.shape, X_test.shape)

(25000, 100) (25000, 100)


# 使用DataLoader加载数据

首先将数据类型转化成torch.LongTensor

In [4]:
train_data = TensorDataset(torch.LongTensor(X_train), torch.LongTensor(y_train))
test_data = TensorDataset(torch.LongTensor(X_test), torch.LongTensor(y_test))

然后使用DataLoader加载数据

In [5]:
train_sampler = RandomSampler(train_data)
train_loader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)
test_sampler = SequentialSampler(test_data)
test_loader = DataLoader(test_data, sampler=test_sampler, batch_size=BATCH_SIZE)


In [6]:
next(iter(train_loader))

[tensor([[    1,    22,     2,  ...,    19,     4, 10591],
         [    1,     4,   108,  ...,    14,  5351,   286],
         [    1,    13,    86,  ...,  1420,  3935,   689],
         ...,
         [    1,     4,  3706,  ...,  1263,  6636,  2243],
         [    1,   914,  1561,  ...,   848,  1532,  2402],
         [    1,    14,    20,  ...,    10,    13,   124]]),
 tensor([0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1,
         0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0,
         1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0])]

# 定义模型

In [7]:
class LstmModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, dropout, num_layers=5):
        super(LstmModel, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.dp = nn.Dropout(dropout)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True)
        self.fc1 = nn.Linear(hidden_size * 2, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 2)

    def forward(self, X):
        X = self.embed(X)
        X = self.dp(X)
        X, _ = self.lstm(X)
        X = self.dp(X)
        X = F.relu(self.fc1(X))
        X = F.avg_pool2d(X, (X.shape[1], 1)).squeeze()
        output = self.fc2(X)
        return output

# 定义训练和测试函数

In [12]:
def train(model, train_loader, optimizer, epoch):
    model.train()
    cri = nn.CrossEntropyLoss()
    for batch_idx, (x, y) in enumerate(train_loader):
        x, y = x.cuda(), y.cuda()
        optimizer.zero_grad()
        y_ = model(x)
        loss = cri(y_, y)
        loss.backward()
        optimizer.step()
        if batch_idx % 100 == 0:
            print("train loss: ",loss.item())

In [15]:
def test(model, test_loader):
    model.eval()
    cri = nn.CrossEntropyLoss(reduction='sum')
    test_loss = 0.
    acc = 0.
    for batch_idx, (x, y) in enumerate(train_loader):
        x, y = x.cuda(), y.cuda() 
        with torch.no_grad():
            y_ = model(x)
        test_loss += cri(y_, y)
        pred = y_.max(-1, keepdim=True)[1]  # .max()分别输出最大值和最小值的index 
        acc += pred.eq(y.view_as(pred)).sum().item()
    test_loss /= len(test_loader.dataset)
    print("\n Test Set: Average Loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)".format(
    test_loss, acc, len(test_loader.dataset), 100. * acc / len(test_loader.dataset)))
    model.train()
    return acc / len(test_loader.dataset)

# 开始训练模型

In [17]:
model = LstmModel(vocab_size=VOCAB_SIZE, embed_size=EMDEDDING_SIZE, hidden_size=HIDDEN_SIZE, dropout=DROP_OUT)
model= model.cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
best_acc = 0.
for epoch in range(1, 11):
    train(model=model, train_loader=train_loader, optimizer=optimizer, epoch=epoch)
    acc = test(model=model, test_loader=test_loader)
    if best_acc < acc:
        best_acc = acc
        torch.save(model.state_dict(), 'model/lstm_model.pth')
    print("acc is : {:.4f}, beat acc is {:.4f}\n".format(acc, best_acc))

train loss:  0.7021245360374451
train loss:  0.6787962913513184
train loss:  0.6805678606033325
train loss:  0.5327714085578918

 Test Set: Average Loss: 0.4437, Accuracy: 19956.0/25000 (80%)
acc is : 0.7982, beat acc is 0.7982

train loss:  0.568015456199646
train loss:  0.5468375086784363
train loss:  0.4371986985206604
train loss:  0.44487833976745605

 Test Set: Average Loss: 0.3666, Accuracy: 20863.0/25000 (83%)
acc is : 0.8345, beat acc is 0.8345

train loss:  0.3396390378475189
train loss:  0.3420133888721466
train loss:  0.35624340176582336
train loss:  0.21501626074314117

 Test Set: Average Loss: 0.2425, Accuracy: 22849.0/25000 (91%)
acc is : 0.9140, beat acc is 0.9140

train loss:  0.18859118223190308
train loss:  0.2604609429836273
train loss:  0.3811567425727844
train loss:  0.23622754216194153

 Test Set: Average Loss: 0.1704, Accuracy: 23386.0/25000 (94%)
acc is : 0.9354, beat acc is 0.9354

train loss:  0.13365298509597778
train loss:  0.1913880556821823
train loss:  0.

可以看到最后的准确率达到了99.8%（PS："best"写成了"beat"，请忽略）