In [None]:
import math
import random
import time

import numpy as np
import torch
import torch.nn.functional as F
from torch import nn, optim
from torchtext import data


SEED = 123
BATCH_SIZE = 128
LEARNING_RATE = 1e-3      # learning rate
EMBEDDING_DIM = 100       # word-vector dimensionality

# Seed every random number generator the notebook can draw from, not just torch.
# NOTE(review): torchtext's legacy iterators are understood to shuffle using Python's
# `random` state, so seeding torch alone would leave the batch order unreproducible.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# TEXT tokenizes on whitespace and lower-cases; LABEL holds one float label per example.
TEXT = data.Field(tokenize=lambda x: x.split(), lower=True)
LABEL = data.LabelField(dtype=torch.float)

# get_dataset returns the examples and fields needed to build a torchtext Dataset
def get_dataset(corpur_path, text_field, label_field, encoding=None):
    """Parse a corpus file into torchtext examples.

    The file is a sequence of records separated by blank lines. Each record has
    (at least) two lines:

        <Polarity>LABEL</Polarity>
        <text>SENTENCE</text>

    Only the single character following ``<Polarity>`` is used as the label, and
    the sentence is the second line with the ``<text>`` / ``</text>`` wrappers
    sliced off. Lines after the second one in a record are ignored.

    Args:
        corpur_path: Path of the corpus file to read.
        text_field: Field object paired with the ``text`` column.
        label_field: Field object paired with the ``label`` column.
        encoding: Optional text encoding passed to ``open``. ``None`` keeps the
            platform default.

    Returns:
        A tuple ``(examples, fields)`` where ``examples`` is a list of
        ``data.Example`` objects and ``fields`` is the ``[(name, field), ...]``
        list they were built with.

    Raises:
        ValueError: If a record contains fewer than two lines.
    """
    fields = [('text', text_field), ('label', label_field)]         # column name -> Field pairing
    examples = []

    def build_example(record, line_no):
        # Turn one ["<Polarity>..</Polarity>", "<text>..</text>", ...] record into an Example.
        if len(record) < 2:
            raise ValueError(
                f"{corpur_path}: record ending near line {line_no} has "
                f"{len(record)} line(s); expected a <Polarity> line and a <text> line"
            )
        label = record[0][10]        # character right after the 10-char "<Polarity>" tag
        text = record[1][6:-7]       # strip leading "<text>" (6 chars) and trailing "</text>" (7 chars)
        return data.Example.fromlist([text, label], fields)

    record = []
    with open(corpur_path, encoding=encoding) as f:
        # Iterating the file distinguishes a blank line from end-of-file, so leading
        # or repeated blank lines no longer terminate parsing early.
        for line_no, line in enumerate(f, start=1):
            content = line.rstrip('\n')
            if content:
                record.append(content)
            elif record:
                examples.append(build_example(record, line_no))
                record = []

    if record:                       # file ended without a trailing blank line
        examples.append(build_example(record, line_no))

    return examples, fields

# Parse each split into the examples and fields needed to construct a Dataset
train_examples, train_fields = get_dataset("corpurs/trains.txt", TEXT, LABEL)
dev_examples, dev_fields = get_dataset("corpurs/dev.txt", TEXT, LABEL)
test_examples, test_fields = get_dataset("corpurs/tests.txt", TEXT, LABEL)


# Build the Dataset objects
# (trailing comments on the prints below are the values noted by the original author)
train_data = data.Dataset(train_examples, train_fields)
dev_data = data.Dataset(dev_examples, dev_fields)
test_data = data.Dataset(test_examples, test_fields)

print('len of train data:', len(train_data))              #1000
print('len of dev data:', len(dev_data))                  #200
print('len of test data:', len(test_data))                #300

print(train_data.examples[15].text)
print(train_data.examples[15].label)


# Build the vocabularies from the training split only.
# The GloVe alias is derived from EMBEDDING_DIM so the pretrained vectors always
# match the embedding layer's width instead of relying on a second hard-coded 100.
TEXT.build_vocab(train_data, max_size=5000, vectors=f'glove.6B.{EMBEDDING_DIM}d')
LABEL.build_vocab(train_data)
print(len(TEXT.vocab))                     #3287
print(TEXT.vocab.itos[:12])                #['<unk>', '<pad>', 'the', 'and', 'a', 'to', 'is', 'was', 'i', 'of', 'for', 'in']
print(TEXT.vocab.stoi['like'])             #43
print(LABEL.vocab.stoi)                    #defaultdict(None, {'0': 0, '1': 1})


# Create the iterators; each iteration yields one batch of examples
train_iterator, dev_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, dev_data, test_data),
    batch_size=BATCH_SIZE,
    sort = False)

In [None]:
class BiLSTM_Attention(nn.Module):
    """Bidirectional LSTM encoder with scaled dot-product self-attention and a linear head.

    ``forward`` takes token indices laid out as [seq_len, batch] (the LSTM is not
    batch-first) and returns one raw logit per sequence, shape [batch, 1].

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
        hidden_dim: Hidden size of the LSTM per direction.
        n_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers):

        super(BiLSTM_Attention, self).__init__()

        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # nn.LSTM applies dropout only between stacked layers; with a single layer a
        # non-zero value has no effect and only emits a warning, so pass 0 in that case.
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=True,
                           dropout=0.5 if n_layers > 1 else 0.0)
        self.fc = nn.Linear(hidden_dim * 2, 1)
        self.dropout = nn.Dropout(0.5)

    def attention_net(self, x, query, mask=None):
        """Soft (scaled dot-product) attention where key = value = ``x``.

        Args:
            x: Keys/values, shape [batch, seq_len, hidden_dim*2].
            query: Queries, shape [batch, seq_len, hidden_dim*2].
            mask: Optional tensor broadcastable to the score matrix
                [batch, seq_len(query), seq_len(key)]. Entries equal to 0 (or False)
                are excluded from attention. ``None`` attends to every position.

        Returns:
            ``(context, p_attn)``: ``context`` is the attention output summed over the
            query positions, shape [batch, hidden_dim*2]; ``p_attn`` holds the attention
            weights, shape [batch, seq_len, seq_len].
        """
        d_k = query.size(-1)                                              # query feature size
        scores = torch.matmul(query, x.transpose(1, 2)) / math.sqrt(d_k)  # [batch, seq_len, seq_len]

        if mask is not None:
            # Use the most negative finite value rather than -inf so that a fully
            # masked row still yields finite (uniform) weights instead of NaN.
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

        p_attn = F.softmax(scores, dim = -1)                              # normalize over key positions
        context = torch.matmul(p_attn, x).sum(1)       # [batch, seq_len, hidden_dim*2] -> [batch, hidden_dim*2]
        return context, p_attn


    def forward(self, x):
        """Return one logit per sequence for token indices ``x`` of shape [seq_len, batch]."""
        embedding = self.dropout(self.embedding(x))       # [seq_len, batch, embedding_dim]

        # output: [seq_len, batch, hidden_dim*2]; the final hidden/cell states are not used
        output, _ = self.rnn(embedding)
        output = output.permute(1, 0, 2)                  # [batch, seq_len, hidden_dim*2]

        query = self.dropout(output)
        # This attention step is the only difference from a plain BiLSTM classifier.
        # No mask is passed, so every position (including padding) is attended to.
        attn_output, _ = self.attention_net(output, query)
        logit = self.fc(attn_output)
        return logit

In [None]:
# Instantiate the classifier: embedding table sized to the vocabulary, 64 hidden
# units per LSTM direction, two stacked layers.
rnn = BiLSTM_Attention(len(TEXT.vocab), EMBEDDING_DIM, hidden_dim=64, n_layers=2)

# Overwrite the randomly initialized embedding weights with the vectors attached
# to the vocabulary; the embedding layer stays trainable.
pretrained_embedding = TEXT.vocab.vectors
print('pretrained_embedding:', pretrained_embedding.shape)      # author's recorded output: torch.Size([3287, 100])
rnn.embedding.weight.data.copy_(pretrained_embedding)
print('embedding layer inited.')

# Adam over all model parameters; BCEWithLogitsLoss takes raw logits, so the
# model does not apply a sigmoid itself.
optimizer = optim.Adam(rnn.parameters(), lr=LEARNING_RATE)
criteon = nn.BCEWithLogitsLoss()

In [None]:
# Compute classification accuracy
def binary_acc(preds, y):
    """Return the fraction of binary predictions that match the targets.

    Args:
        preds: Raw logits (any shape). They are passed through a sigmoid and
            rounded to 0/1 before comparison.
        y: Target tensor of 0/1 values, same shape as ``preds``.

    Returns:
        A 0-d tensor holding the accuracy over all elements.
    """
    preds = torch.round(torch.sigmoid(preds))
    correct = torch.eq(preds, y).float()
    # numel() counts every element, so 0-d and multi-dimensional inputs are handled;
    # len() would only count the first dimension (and fails on 0-d tensors).
    acc = correct.sum() / correct.numel()
    return acc


# Training function
def train(rnn, iterator, optimizer, criteon):
    """Run one training epoch and return the mean loss and accuracy.

    Args:
        rnn: Model to train; called as ``rnn(batch.text)`` and expected to return
            logits of shape [batch, 1].
        iterator: Iterable of batches exposing ``.text`` and ``.label``.
        optimizer: Optimizer stepping the model's parameters.
        criteon: Loss function called as ``criteon(pred, batch.label)``.

    Returns:
        ``(avg_loss, avg_acc)``: the unweighted means of the per-batch loss and
        per-batch accuracy.
    """
    avg_loss = []
    avg_acc = []
    rnn.train()        # switch to training mode

    for batch in iterator:

        # Drop only the trailing singleton dim: [batch, 1] -> [batch].
        # A bare squeeze() would also remove the batch dim when a batch holds a
        # single example, making the prediction shape differ from the label shape.
        pred = rnn(batch.text).squeeze(-1)

        loss = criteon(pred, batch.label)
        acc = binary_acc(pred, batch.label).item()   # accuracy of this batch

        avg_loss.append(loss.item())
        avg_acc.append(acc)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    avg_acc = np.array(avg_acc).mean()
    avg_loss = np.array(avg_loss).mean()
    return avg_loss, avg_acc


# Evaluation function
def evaluate(rnn, iterator, criteon):
    """Evaluate the model without gradient tracking and return mean loss and accuracy.

    Args:
        rnn: Model to evaluate; called as ``rnn(batch.text)`` and expected to
            return logits of shape [batch, 1].
        iterator: Iterable of batches exposing ``.text`` and ``.label``.
        criteon: Loss function called as ``criteon(pred, batch.label)``.

    Returns:
        ``(avg_loss, avg_acc)``: the unweighted means of the per-batch loss and
        per-batch accuracy.
    """
    avg_loss = []
    avg_acc = []
    rnn.eval()         # switch to evaluation mode

    with torch.no_grad():
        for batch in iterator:

            # Drop only the trailing singleton dim: [batch, 1] -> [batch].
            # A bare squeeze() would also remove the batch dim for a single-example
            # batch, making the prediction shape differ from the label shape.
            pred = rnn(batch.text).squeeze(-1)

            loss = criteon(pred, batch.label)
            acc = binary_acc(pred, batch.label).item()

            avg_loss.append(loss.item())
            avg_acc.append(acc)

    avg_loss = np.array(avg_loss).mean()
    avg_acc = np.array(avg_acc).mean()
    return avg_loss, avg_acc


# Train the model, report per-epoch metrics, and keep the best checkpoint
N_EPOCHS = 30
MODEL_PATH = 'wordavg-model.pt'       # checkpoint file shared by the save and load below
best_valid_acc = float('-inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss, train_acc = train(rnn, train_iterator, optimizer, criteon)
    dev_loss, dev_acc = evaluate(rnn, dev_iterator, criteon)

    end_time = time.time()

    # divmod on floats returns float minutes; cast below so it prints as "0m", not "0.0m"
    epoch_mins, epoch_secs = divmod(end_time - start_time, 60)

    if dev_acc > best_valid_acc:          # save whenever validation accuracy improves
        best_valid_acc = dev_acc
        torch.save(rnn.state_dict(), MODEL_PATH)

    print(f'Epoch: {epoch+1:02} | Epoch Time: {int(epoch_mins)}m {epoch_secs:.2f}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {dev_loss:.3f} |  Val. Acc: {dev_acc*100:.2f}%')


# Reload the best saved parameters and score the test split
rnn.load_state_dict(torch.load(MODEL_PATH))
test_loss, test_acc = evaluate(rnn, test_iterator, criteon)
print(f'Test. Loss: {test_loss:.3f} |  Test. Acc: {test_acc*100:.2f}%')