## Attention Weighted word averaging

In [1]:
import time
import random
import numpy as np
from collections import Counter

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [2]:
EPOCHS = 5
BATCH_SIZE = 32
LEARNING_RATE = 0.01
BEST_VALID_LOSS = float('inf')

EMBEDDING_DIM = 100
OUTPUT_DIM = 1

train_file = "data/senti.train.tsv"
eval_file = "data/senti.dev.tsv"
test_file = "data/senti.test.tsv"

USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device('cuda' if USE_CUDA else 'cpu')
NUM_CUDA = torch.cuda.device_count()

In [3]:
def load_text_file(filename):
    """将样本的特征与标签分开，并将样本特征分词"""
    sentences = []
    label = []
    with open(filename, "r") as f:
        sent_list  = [line.strip().split('\t') for line in f]
    for sample in sent_list:
        sentences.append(sample[0].split(" "))
        label.append(int(sample[-1]))
    return sentences, label


def build_word_dic(sentences_list, vocab_size=20_000):
    """构建words_set, word2idx, idx2word"""
    words_list = [w for line in sentences_list for w in line]
    counter = Counter(words_list)
    words_topn = counter.most_common(vocab_size)
    words_set = [item[0] for item in words_topn]
    words_set = ['<pad>', "<unk>"] + words_set
    word2idx = {w:i for i, w in enumerate(words_set)}
    idx2word = {i:w for i, w in enumerate(words_set)}
    return words_set, word2idx, idx2word


def build_x_y(word2idx, sentences_list, label_list, sent_len=30):
    """构建输入模型的数据，对每个单词编码，每个句子通过添加pading保持一样长"""
    x = []
    y = []
    for sent, label in zip(sentences_list, label_list):
        word_x = [0]*sent_len
        if len(sent) > sent_len:
            sent = sent[:sent_len]
        for i, w in enumerate(sent):
            if w in word2idx:
                word_x[i] = word2idx[w]
            else:
                word_x[i] = word2idx['<unk>']
        x.append(word_x)
        y.append(label)
    return x, y


def build_batch_data(data, label, batch_size=32):
    """构建tensor格式的批次数据，返回batch列表，每个batch为二元组包含feature和label"""
    batch_data = []
    # 打乱顺序
    data_labels = [[x, y] for x, y in zip(data, label)]
    random.shuffle(data_labels)
    xlist = [item[0] for item in data_labels]
    ylist = [item[1] for item in data_labels]
    
    x_tensor = torch.tensor(xlist, dtype=torch.long)
    y_tensor = torch.tensor(ylist, dtype=torch.float)
    n, dim = x_tensor.size()
    for start in range(0, n, batch_size):
        end = start + batch_size
        if end > n:
            break
            xbatch = x_tensor[start: ]
            ybatch = y_tensor[start: ]
            print("最后一个batch size:", dbatch.size())
        else:
            xbatch = x_tensor[start: end]
            ybatch = y_tensor[start: end]
        batch_data.append((xbatch, ybatch))
    return batch_data

In [4]:
train_sentences, train_label = load_text_file(train_file)
eval_sentences, eval_label = load_text_file(eval_file)
test_sentences, test_label = load_text_file(test_file)

In [5]:
print("处理后的样本与标签：", train_sentences[0], train_label[0])
print("各个数据集样本数量：")
print(len(train_sentences), len(train_label))
print(len(eval_sentences), len(eval_label))
print(len(test_sentences), len(test_label))

print("各数据集最长最短句子单词数：")
print(max([len(s) for s in train_sentences]), min([len(s) for s in train_sentences]))
print(max([len(s) for s in eval_sentences]), min([len(s) for s in eval_sentences]))
print(max([len(s) for s in test_sentences]), min([len(s) for s in test_sentences]))

处理后的样本与标签： ['hide', 'new', 'secretions', 'from', 'the', 'parental', 'units'] 0
各个数据集样本数量：
67349 67349
872 872
1821 1821
各数据集最长最短句子单词数：
52 1
47 2
56 2


In [6]:
words_set, word2idx, idx2word = build_word_dic(train_sentences)
train_x, train_y = build_x_y(word2idx, train_sentences, train_label,sent_len=40)
eval_x, eval_y = build_x_y(word2idx, eval_sentences, eval_label,sent_len=40)
test_x, test_y = build_x_y(word2idx, test_sentences, test_label,sent_len=40)

In [7]:
print("词典长度:", len(words_set), len(word2idx), len(idx2word))
print("训练集样本数量:", len(train_x), len(train_y))

词典长度: 16284 16284 16284
训练集样本数量: 67349 67349


In [8]:
train_data = build_batch_data(train_x, train_y, batch_size=BATCH_SIZE)
eval_data = build_batch_data(eval_x, eval_y, batch_size=BATCH_SIZE)
test_data = build_batch_data(test_x, test_y, batch_size=BATCH_SIZE)

In [16]:
class AttAvgModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, output_size, pad_idx):
        super(AttAvgModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        # 计算 Attention 向量
        self.u = nn.Parameter(torch.randn(embed_dim))
        self.fc = nn.Linear(embed_dim, output_size, bias=False)
        
    def forward(self, text):
        # [batch, seq_len] -> [batch, seq_len, emb_dim]
        embed = self.embedding(text)
        # print('embed size: ', embed.size())
        # 扩展 u
        # [emb_dim] -> [batch, seq_len, emb_dim]
        u = self.u.repeat(embed.size(0), embed.size(1), 1)
        # print('u size: ', u.size())
        # 计算 cos [batch, seq_len]
        cos = F.cosine_similarity(embed, u, dim=2)
        # print('cos size: ', cos.size())
        # 计算权重 [batch, seq_len]
        alpha = F.softmax(cos, dim=1)
        # print('alpah attention size: ',alpha.size())  # torch.Size([32, 40]
        # 就算句子Attention平均值 torch.Size([32, 100]
        h_attn = torch.sum(embed*alpha.unsqueeze(2), dim=1).squeeze(1)
        # print('h_atten size: ',h_attn.size())
        # [batch, emb_dim] -> [batch, output_size]
        out = self.fc(h_attn)   #torch.Size([32, 1])
        # print('out size: ', out.size())
      
        return out
    
    def get_embed_weight(self):
        """获取embedding层参数"""
        return self.embedding.weight.data
    
    def get_u(self):
        return self.u


def binary_accuracy(preds, y):
    """
    计算准确率
    """
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()  
    acc = correct.sum()/len(correct)
    return acc


def train(model, device, iterator, optimizer, criterion):
    """训练函数"""
    
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    
    for x, y in iterator:
        x, y = x.to(device), y.to(device) # torch.int64
        optimizer.zero_grad()
        predictions = model(x).squeeze(1)  # torch.float32 
        
        loss = criterion(predictions, y)
        acc = binary_accuracy(predictions, y)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)


def evaluate(model, device, iterator, criterion):
    """验证函数"""
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    
    with torch.no_grad():
        for x, y in iterator:
            x, y = x.to(device), y.to(device)
            predictions = model(x).squeeze(1)
            loss = criterion(predictions, y)
            acc = binary_accuracy(predictions, y)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)


def count_parameters(model):
    """统计模型的参数量"""
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def epoch_time(start_time, end_time):
    """计算时间差，单位秒"""
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [17]:
INPUT_DIM = len(words_set)
PAD_IDX = word2idx['<pad>']

model = AttAvgModel(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM, PAD_IDX)
print(f'模型有{count_parameters(model):,}个可调节参数, 大约{count_parameters(model)*4/1024/1024} M.')

model = model.to(DEVICE)

# 使用多块GPU
if NUM_CUDA > 1:
    device_ids = list(range(NUM_CUDA))
    print(device_ids)
    
optimizer = optim.Adam(model.parameters(),lr=LEARNING_RATE)
criterion = nn.BCEWithLogitsLoss()

模型有1,628,600个可调节参数, 大约6.212615966796875 M.
[0, 1, 2, 3]


In [None]:
model_name = 'attention-wavg-model.pth'
for epoch in range(1, EPOCHS+1):
    start_time = time.time()
    train_loss, train_acc = train(model, DEVICE, train_data, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, DEVICE, eval_data, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    if valid_loss < BEST_VALID_LOSS:
        BEST_VALID_LOSS = valid_loss
        torch.save(model, model_name)
        print(f'***Save Best Model {model_name}***')
    
    print(f'Epoch: {epoch :02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

***Save Best Model attention-wavg-model.pth***
Epoch: 01 | Epoch Time: 0m 13s
	Train Loss: 0.326 | Train Acc: 85.71%
	 Val. Loss: 0.474 |  Val. Acc: 82.06%
Epoch: 02 | Epoch Time: 0m 13s
	Train Loss: 0.190 | Train Acc: 92.71%
	 Val. Loss: 0.564 |  Val. Acc: 81.48%
Epoch: 03 | Epoch Time: 0m 12s
	Train Loss: 0.154 | Train Acc: 94.18%
	 Val. Loss: 0.700 |  Val. Acc: 79.51%
