## self attention

In [1]:
import math
import time
import random
import numpy as np
from collections import Counter

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [2]:
VOCAB_SIZE = 14_828

EPOCHS = 5
BATCH_SIZE = 32
LEARNING_RATE = 0.01
BEST_VALID_LOSS = float('inf')

EMBEDDING_DIM = 100
OUTPUT_DIM = 1

train_file = "data/senti.train.tsv"
eval_file = "data/senti.dev.tsv"
test_file = "data/senti.test.tsv"

USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device('cuda:1' if USE_CUDA else 'cpu')
NUM_CUDA = torch.cuda.device_count()

random.seed(2019)

In [3]:
def load_text_file(filename):
    """将样本的特征与标签分开，并将样本特征分词"""
    sentences = []
    label = []
    with open(filename, "r") as f:
        sent_list  = [line.strip().split('\t') for line in f]
    for sample in sent_list:
        sentences.append(sample[0].lower().split(" "))
        label.append(int(sample[-1]))
    return sentences, label


def build_word_dic(sentences_list, vocab_size=20_000):
    """构建words_set, word2idx, idx2word"""
    words_list = [w for line in sentences_list for w in line]
    counter = Counter(words_list)
    words_topn = counter.most_common(vocab_size)
    words_set = [item[0] for item in words_topn]
    words_set = ['<pad>', "<unk>"] + words_set
    word2idx = {w:i for i, w in enumerate(words_set)}
    idx2word = {i:w for i, w in enumerate(words_set)}
    return words_topn, word2idx, idx2word


def build_x_y(word2idx, sentences_list, label_list, seq_len=60):
    """构建输入模型的数据，对每个单词编码，每个句子通过添加pading保持一样长"""
    x = []
    y = []
    for sent, label in zip(sentences_list, label_list):
        word_x = [0]* seq_len
        for i, w in enumerate(sent):
            if w in word2idx:
                word_x[i] = word2idx[w]
            else:
                word_x[i] = word2idx['<unk>']
        x.append(word_x)
        y.append(label)
    return x, y


def build_batch_data(data, label, batch_size=32):
    """构建tensor格式的批次数据，返回batch列表，每个batch为二元组包含feature和label"""
    batch_data = []
    # 打乱顺序
    data_labels = [[x, y] for x, y in zip(data, label)]
    random.shuffle(data_labels)
    xlist = [item[0] for item in data_labels]
    ylist = [item[1] for item in data_labels]
    
    x_tensor = torch.tensor(xlist, dtype=torch.long)
    y_tensor = torch.tensor(ylist, dtype=torch.float)
    n, dim = x_tensor.size()
    for start in range(0, n, batch_size):
        end = start + batch_size
        if end > n:
            xbatch = x_tensor[start: ]
            ybatch = y_tensor[start: ]
            print("最后一个batch size:", ybatch.size())
        else:
            xbatch = x_tensor[start: end]
            ybatch = y_tensor[start: end]
        batch_data.append((xbatch, ybatch))
    return batch_data

In [4]:
class SelfAttModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, output_size, pad_idx):
        super(SelfAttModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        # 计算 Attention 向量
        self.qkv = nn.Linear(embed_dim, embed_dim, bias=False)
        self.fc = nn.Linear(embed_dim, output_size, bias=False)
        
    def forward(self, text):
        # [batch, seq_len] -> [batch, seq_len, emb_dim]
        embed = self.embedding(text)
        # [batch, seq_len, emb_dim] -> [batch, seq_len, embed_dim]?
        x = self.qkv(embed)     # 用emeding层产生？
        # 算句子Attention平均值
        h_attn = self.attention(x)   # [batch, seq_len, emb_dim]
        # 平均值
        # [batch, seq_len, emb_dim] -> [batch, emb_dim]  # 每个句子求平均值得到一个词向量
        h_attn = torch.sum(h_attn, dim=1).squeeze()
        # [batch, emb_dim] --> [batch, output_size]
        out = self.fc(h_attn)
        return out
    
    def attention(self, x):
        """计算attention权重"""
        d_k = x.size(-1)    # embed_dim
        # x.transpose(-2, -1) 后两维度的转置 [batch, seq_len, emb_dim] --> [batch, emb_dim, seq_len]
        # [batch, seq_len, emb_dim] -> [batch, seq_len, seq_len]
        scores = torch.matmul(x, x.transpose(-2, -1)) / math.sqrt(d_k)
        # [batch, seq_len, seq_len] ->[batch, seq_len, seq_len]
        attn = F.softmax(scores, dim=-1)
        # 计算context值 
        # [batch, seq_len, seq_len] -> [batch, seq_len, emb_dim]
        attn_x = torch.matmul(attn, x)
        return attn_x
    
    def get_embed_weight(self):
        """获取embedding层参数"""
        return self.embedding.weight.data


def binary_accuracy(preds, y):
    """计算准确率"""
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()  
    acc = correct.sum()/len(correct)
    return acc


def train(model, device, iterator, optimizer, criterion):
    """训练函数"""
    
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    
    for x, y in iterator:
        x, y = x.to(device), y.to(device) # torch.int64
        optimizer.zero_grad()
        predictions = model(x).squeeze(1)  # torch.float32 
        
        loss = criterion(predictions, y)
        acc = binary_accuracy(predictions, y)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)


def evaluate(model, device, iterator, criterion):
    """验证函数"""
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    
    with torch.no_grad():
        for x, y in iterator:
            x, y = x.to(device), y.to(device)
            predictions = model(x).squeeze(1)
            loss = criterion(predictions, y)
            acc = binary_accuracy(predictions, y)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)


def count_parameters(model):
    """统计模型的参数量"""
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def epoch_time(start_time, end_time):
    """计算时间差，返回分钟, 秒钟"""
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

def max_seq_len(data_list):
    """获取句子最大长度"""
    li = [len(s) for data in data_list for s in data]
    return max(li)

In [5]:
train_sentences, train_label = load_text_file(train_file)
eval_sentences, eval_label = load_text_file(eval_file)
test_sentences, test_label = load_text_file(test_file)

max_len = max_seq_len([train_sentences, eval_sentences, test_sentences])
print("max_len:", max_len)
words_set, word2idx, idx2word = build_word_dic(train_sentences, vocab_size=VOCAB_SIZE)
train_x, train_y = build_x_y(word2idx, train_sentences, train_label, seq_len=max_len)
eval_x, eval_y = build_x_y(word2idx, eval_sentences, eval_label, seq_len=max_len)
test_x, test_y = build_x_y(word2idx, test_sentences, test_label, seq_len=max_len)

print("词典长度:", len(words_set), len(word2idx), len(idx2word))
print("训练集样本数量:", len(train_x), len(train_y))

train_data = build_batch_data(train_x, train_y, batch_size=BATCH_SIZE)
eval_data = build_batch_data(eval_x, eval_y, batch_size=BATCH_SIZE)
test_data = build_batch_data(test_x, test_y, batch_size=BATCH_SIZE)

max_len: 56
词典长度: 14828 14830 14830
训练集样本数量: 67349 67349
最后一个batch size: torch.Size([21])
最后一个batch size: torch.Size([8])
最后一个batch size: torch.Size([29])


In [6]:
INPUT_DIM = len(words_set) + 2
PAD_IDX = word2idx['<pad>']

model = SelfAttModel(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM, PAD_IDX)
print(f'模型有{count_parameters(model):,}个可调节参数, 大约{count_parameters(model)*4/1024/1024} M.')

model = model.to(DEVICE)
    
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.BCEWithLogitsLoss()

模型有1,493,100个可调节参数, 大约5.6957244873046875 M.


In [7]:
model_name = 'self-attention-wordavg.pth'
for epoch in range(1, EPOCHS+1):
    start_time = time.time()
    train_loss, train_acc = train(model, DEVICE, train_data, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, DEVICE, eval_data, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    if valid_loss < BEST_VALID_LOSS:
        BEST_VALID_LOSS = valid_loss
        torch.save(model, model_name)
        print(f'***Save Best Model {model_name}***')
    
    print(f'Epoch: {epoch :02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

  "type " + obj.__name__ + ". It won't be checked "


***Save Best Model self-attention-wordavg.pth***
Epoch: 01 | Epoch Time: 0m 8s
	Train Loss: 0.391 | Train Acc: 83.59%
	 Val. Loss: 0.630 |  Val. Acc: 80.36%
Epoch: 02 | Epoch Time: 0m 8s
	Train Loss: 0.380 | Train Acc: 89.68%
	 Val. Loss: 1.530 |  Val. Acc: 76.34%
Epoch: 03 | Epoch Time: 0m 8s
	Train Loss: 0.466 | Train Acc: 91.57%
	 Val. Loss: 2.797 |  Val. Acc: 78.24%
Epoch: 04 | Epoch Time: 0m 8s
	Train Loss: 0.446 | Train Acc: 93.12%
	 Val. Loss: 5.711 |  Val. Acc: 77.01%
Epoch: 05 | Epoch Time: 0m 8s
	Train Loss: 0.379 | Train Acc: 94.06%
	 Val. Loss: 5.046 |  Val. Acc: 79.13%


In [8]:
model = torch.load(model_name)
test_loss, test_acc = evaluate(model, DEVICE, test_data, criterion)
print('Test Loss: {0} | Test Acc: {1}'.format(test_loss, test_acc))

Test Loss: 0.5601791255829627 | Test Acc: 0.8189088022499754


## Add residual残差

In [9]:
class AttentionResidualModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, output_size, pad_idx):
        super(AttentionResidualModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.qkv = nn.Linear(embed_dim, embed_dim, bias=False)
        self.fc = nn.Linear(embed_dim, output_size, bias=False)
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, text):
        # [batch, seq_len] -> [batch, seq_len, emb_dim]
        embed = self.embedding(text)
        # [batch, seq_len, emb_dim] -> [batch, seq_len, embed_dim]?
        x = self.qkv(embed)     # 用emeding层产生？
        # 算句子Attention平均值
        h_attn = self.attention(x)   # [batch, seq_len, emb_dim]
        h_attn += embed
        # 平均值
        # [batch, seq_len, emb_dim] -> [batch, emb_dim]  # 每个句子求平均值得到一个词向量
        h_attn = torch.sum(h_attn, dim=1).squeeze()
        # [batch, emb_dim] --> [batch, output_size]
        out = self.fc(self.dropout(h_attn))
        return out
    
    def attention(self, x):
        """计算attention权重"""
        d_k = x.size(-1)    # embed_dim
        # x.transpose(-2, -1) 后两维度的转置 [batch, seq_len, emb_dim] --> [batch, emb_dim, seq_len]
        # [batch, seq_len, emb_dim] -> [batch, seq_len, seq_len]
        scores = torch.matmul(x, x.transpose(-2, -1)) / math.sqrt(d_k)
        # [batch, seq_len, seq_len] ->[batch, seq_len, seq_len]
        attn = F.softmax(scores, dim=-1)
        # 计算context值 
        # [batch, seq_len, seq_len] -> [batch, seq_len, emb_dim]
        attn_x = torch.matmul(attn, x)
        return attn_x
    
    def get_embed_weight(self):
        """获取embedding层参数"""
        return self.embedding.weight.data

In [10]:
res_model = AttentionResidualModel(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM, PAD_IDX)
print(f'模型有{count_parameters(res_model):,}个可调节参数, 大约{count_parameters(res_model)*4/1024/1024} M.')

res_model = res_model.to(DEVICE)

optimizer = optim.Adam(res_model.parameters(), lr=LEARNING_RATE)
criterion = nn.BCEWithLogitsLoss()

模型有1,493,100个可调节参数, 大约5.6957244873046875 M.


In [11]:
res_model_name = 'attention-residual-wordavg.pth'
BEST_VALID_LOSS = float('inf')

for epoch in range(1, EPOCHS+1):
    start_time = time.time()
    train_loss, train_acc = train(res_model, DEVICE, train_data, optimizer, criterion)
    valid_loss, valid_acc = evaluate(res_model, DEVICE, eval_data, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    if valid_loss < BEST_VALID_LOSS:
        BEST_VALID_LOSS = valid_loss
        torch.save(res_model, res_model_name)
        print(f'***Save Best Model {res_model_name}***')
    
    print(f'Epoch: {epoch :02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

  "type " + obj.__name__ + ". It won't be checked "


***Save Best Model attention-residual-wordavg.pth***
Epoch: 01 | Epoch Time: 0m 8s
	Train Loss: 0.411 | Train Acc: 82.54%
	 Val. Loss: 0.680 |  Val. Acc: 80.25%
Epoch: 02 | Epoch Time: 0m 9s
	Train Loss: 0.423 | Train Acc: 89.12%
	 Val. Loss: 1.333 |  Val. Acc: 78.68%
Epoch: 03 | Epoch Time: 0m 9s
	Train Loss: 0.450 | Train Acc: 91.44%
	 Val. Loss: 4.090 |  Val. Acc: 79.13%
Epoch: 04 | Epoch Time: 0m 8s
	Train Loss: 1.355 | Train Acc: 92.43%
	 Val. Loss: 2.013 |  Val. Acc: 78.91%
Epoch: 05 | Epoch Time: 0m 9s
	Train Loss: 0.232 | Train Acc: 94.51%
	 Val. Loss: 3.674 |  Val. Acc: 77.68%


In [12]:
res_model = torch.load(res_model_name)
test_loss, test_acc = evaluate(res_model, DEVICE, test_data, criterion)
print('Test Loss: {0} | Test Acc: {1}'.format(test_loss, test_acc))

Test Loss: 0.6042031730737603 | Test Acc: 0.8194570478640104


## 自己设置attention函数

In [13]:
class MyAttentionModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, output_size, pad_idx):
        super(MyAttentionModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        # 权重计算 q, v, k
        self.q = nn.Linear(embed_dim, embed_dim, bias=False)
        self.k = nn.Linear(embed_dim, embed_dim, bias=False)
        self.v = nn.Linear(embed_dim, embed_dim, bias=False)
        self.fc = nn.Linear(embed_dim, output_size, bias=False)
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, text):
        # [batch, seq_len] -> [batch, seq_len, emb_dim]
        embed = self.embedding(text)
        # [batch, seq_len, emb_dim] -> [batch, seq_len, embed_dim]?
        q_vec = self.q(embed) 
        k_vec = self.k(embed)
        v_vec = self.v(embed)
        # 算句子Attention平均值
        h_attn = self.attention(q_vec, k_vec, v_vec)   # [batch, seq_len, emb_dim]
        h_attn += embed
        # 平均值
        # [batch, seq_len, emb_dim] -> [batch, emb_dim]  # 每个句子求平均值得到一个词向量
        h_attn = torch.sum(h_attn, dim=1).squeeze()
        # [batch, emb_dim] --> [batch, output_size]
        out = self.fc(self.dropout(h_attn))
        return out
    
    def attention(self, q, k, v):
        """计算attention权重"""
        d_k = k.size(-1)    # embed_dim
        # [batch, seq_len, emb_dim] -> [batch, seq_len, seq_len]
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
        # [batch, seq_len, seq_len] ->[batch, seq_len, seq_len]
        attn = F.softmax(scores, dim=-1)
        # 计算context值 
        # [batch, seq_len, seq_len] -> [batch, seq_len, emb_dim]
        attn_x = torch.matmul(attn, v)
        return attn_x
    
    def get_embed_weight(self):
        """获取embedding层参数"""
        return self.embedding.weight.data

In [14]:
att_model = MyAttentionModel(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM, PAD_IDX)
print(f'模型有{count_parameters(att_model):,}个可调节参数, 大约{count_parameters(att_model)*4/1024/1024} M.')

att_model = att_model.to(DEVICE)

optimizer = optim.Adam(att_model.parameters(), lr=LEARNING_RATE)
criterion = nn.BCEWithLogitsLoss()

模型有1,513,100个可调节参数, 大约5.7720184326171875 M.


In [15]:
att_model_name = 'my-attention-wordavg.pth'
BEST_VALID_LOSS = float('inf')

for epoch in range(1, EPOCHS+1):
    start_time = time.time()
    train_loss, train_acc = train(att_model, DEVICE, train_data, optimizer, criterion)
    valid_loss, valid_acc = evaluate(att_model, DEVICE, eval_data, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    if valid_loss < BEST_VALID_LOSS:
        BEST_VALID_LOSS = valid_loss
        torch.save(att_model, att_model_name)
        print(f'***Save Best Model {att_model_name}***')
    
    print(f'Epoch: {epoch :02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

  "type " + obj.__name__ + ". It won't be checked "


***Save Best Model my-attention-wordavg.pth***
Epoch: 01 | Epoch Time: 0m 10s
	Train Loss: 0.461 | Train Acc: 82.26%
	 Val. Loss: 0.669 |  Val. Acc: 78.12%
Epoch: 02 | Epoch Time: 0m 10s
	Train Loss: 0.453 | Train Acc: 88.23%
	 Val. Loss: 1.575 |  Val. Acc: 76.79%
Epoch: 03 | Epoch Time: 0m 11s
	Train Loss: 0.599 | Train Acc: 88.79%
	 Val. Loss: 2.247 |  Val. Acc: 76.45%
Epoch: 04 | Epoch Time: 0m 11s
	Train Loss: 0.655 | Train Acc: 90.28%
	 Val. Loss: 3.758 |  Val. Acc: 79.24%
Epoch: 05 | Epoch Time: 0m 10s
	Train Loss: 3.910 | Train Acc: 87.12%
	 Val. Loss: 9.320 |  Val. Acc: 74.11%


In [16]:
att_model = torch.load(att_model_name)
test_loss, test_acc = evaluate(att_model, DEVICE, test_data, criterion)
print('Test Loss: {0} | Test Acc: {1}'.format(test_loss, test_acc))

Test Loss: 0.5882109670262587 | Test Acc: 0.8019131882148877


## Deploy

In [17]:
att_model.eval()

MyAttentionModel(
  (embedding): Embedding(16000, 100, padding_idx=0)
  (q): Linear(in_features=100, out_features=100, bias=False)
  (k): Linear(in_features=100, out_features=100, bias=False)
  (v): Linear(in_features=100, out_features=100, bias=False)
  (fc): Linear(in_features=100, out_features=1, bias=False)
  (dropout): Dropout(p=0.2)
)

In [18]:
def predict(model, device, x):
    model.eval()
    with torch.no_grad():
        x = x.to(device)
        y = model(x)
        print(y)
    return y

In [33]:
x = test_data[0][0][0].unsqueeze(0)
x.size()

torch.Size([1, 56])

In [34]:
gt = test_data[0][-1][0]
gt

tensor(0.)

In [36]:
y = predict(att_model, DEVICE, x)

tensor([-10.3408], device='cuda:1')


In [38]:
p_y = torch.sigmoid(y)

In [41]:
p_y < 0.5

tensor([1], device='cuda:1', dtype=torch.uint8)

## add positional encodings

In [17]:
from torch.autograd import Variable

In [28]:
class PosAttentionModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, output_size, pad_idx):
        super(PosAttentionModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        # 权重计算 q, v, k
        self.q = nn.Linear(embed_dim, embed_dim, bias=False)
        self.k = nn.Linear(embed_dim, embed_dim, bias=False)
        self.v = nn.Linear(embed_dim, embed_dim, bias=False)
        self.fc = nn.Linear(embed_dim, output_size, bias=False)
        self.dropout = nn.Dropout(0.2)
        
        
    def forward(self, text):
        # [batch, seq_len] -> [batch, seq_len, emb_dim]
        embed = self.embedding(text)
        max_len, embed_dim = embed.size()[1], embed.size(2)
        pe = self.get_pe(max_len, embed_dim)
        # embed += Variable(self.pe[:, :x.size(1)],requires_grad=False)
        embed += pe
        # [batch, seq_len, emb_dim] -> [batch, seq_len, embed_dim]?
        q_vec = self.q(embed) 
        k_vec = self.k(embed)
        v_vec = self.v(embed)
        # 算句子Attention平均值
        h_attn = self.attention(q_vec, k_vec, v_vec)   # [batch, seq_len, emb_dim]
        h_attn += embed
        # 平均值
        # [batch, seq_len, emb_dim] -> [batch, emb_dim]  # 每个句子求平均值得到一个词向量
        h_attn = torch.sum(h_attn, dim=1).squeeze()
        # [batch, emb_dim] --> [batch, output_size]
        out = self.fc(self.dropout(h_attn))
        return out
    
    def attention(self, q, k, v):
        """计算attention权重"""
        d_k = k.size(-1)    # embed_dim
        # [batch, seq_len, emb_dim] -> [batch, seq_len, seq_len]
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
        # [batch, seq_len, seq_len] ->[batch, seq_len, seq_len]
        attn = F.softmax(scores, dim=-1)
        # 计算context值 
        # [batch, seq_len, seq_len] -> [batch, seq_len, emb_dim]
        attn_x = torch.matmul(attn, v)
        return attn_x 
    
    @property
    def get_pe(self, max_len, embed_dim):
        pe = torch.zeros(max_len, embed_dim)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2) *
                             -(math.log(10000.0) / embed_dim))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        return pe
    
    def get_embed_weight(self):
        """获取embedding层参数"""
        return self.embedding.weight.data

In [38]:
pos_model = MyAttentionModel(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM, PAD_IDX)
print(f'模型有{count_parameters(pos_model):,}个可调节参数, 大约{count_parameters(pos_model)*4/1024/1024} M.')

pos_model = pos_model.to(DEVICE)

optimizer = optim.Adam(pos_model.parameters(), lr=LEARNING_RATE)
criterion = nn.BCEWithLogitsLoss()

模型有1,513,100个可调节参数, 大约5.7720184326171875 M.


In [39]:
pos_model_name = 'pos-attention-wordavg.pth'
BEST_VALID_LOSS = float('inf')
EPOCHS = 5

for epoch in range(1, EPOCHS+1):
    start_time = time.time()
    train_loss, train_acc = train(pos_model, DEVICE, train_data, optimizer, criterion)
    valid_loss, valid_acc = evaluate(pos_model, DEVICE, eval_data, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    if valid_loss < BEST_VALID_LOSS:
        BEST_VALID_LOSS = valid_loss
        torch.save(pos_model, pos_model_name)
        print(f'***Save Best Model {pos_model_name}***')
    
    print(f'Epoch: {epoch :02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

***Save Best Model pos-attention-wordavg.pth***
Epoch: 01 | Epoch Time: 0m 11s
	Train Loss: 0.424 | Train Acc: 82.91%
	 Val. Loss: 0.982 |  Val. Acc: 78.24%
Epoch: 02 | Epoch Time: 0m 11s
	Train Loss: 0.520 | Train Acc: 86.70%
	 Val. Loss: 1.653 |  Val. Acc: 80.36%
Epoch: 03 | Epoch Time: 0m 11s
	Train Loss: 2.995 | Train Acc: 83.55%
	 Val. Loss: 3.424 |  Val. Acc: 72.43%
Epoch: 04 | Epoch Time: 0m 10s
	Train Loss: 0.534 | Train Acc: 89.54%
	 Val. Loss: 1.370 |  Val. Acc: 78.57%
Epoch: 05 | Epoch Time: 0m 12s
	Train Loss: 0.427 | Train Acc: 91.87%
	 Val. Loss: 2.589 |  Val. Acc: 72.21%


In [40]:
pos_model = torch.load(pos_model_name)
test_loss, test_acc = evaluate(pos_model, DEVICE, test_data, criterion)
print('Test Loss: {0} | Test Acc: {1}'.format(test_loss, test_acc))

Test Loss: 1.033877345030768 | Test Acc: 0.7567869027455648
