## Attention Weighted word averaging

In [1]:
import time
import random
import numpy as np
from collections import Counter

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [2]:
VOCAB_SIZE = 20_000

EPOCHS = 5
BATCH_SIZE = 32
LEARNING_RATE = 0.01
BEST_VALID_LOSS = float('inf')

EMBEDDING_DIM = 100
OUTPUT_DIM = 1

train_file = "data/senti.train.tsv"
eval_file = "data/senti.dev.tsv"
test_file = "data/senti.test.tsv"

USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device('cuda' if USE_CUDA else 'cpu')
NUM_CUDA = torch.cuda.device_count()

In [3]:
def load_text_file(filename):
    """将样本的特征与标签分开，并将样本特征分词"""
    sentences = []
    label = []
    with open(filename, "r") as f:
        sent_list  = [line.strip().split('\t') for line in f]
    for sample in sent_list:
        sentences.append(sample[0].lower().split(" "))
        label.append(int(sample[-1]))
    return sentences, label


def build_word_dic(sentences_list, vocab_size=20_000):
    """构建words_set, word2idx, idx2word"""
    words_list = [w for line in sentences_list for w in line]
    counter = Counter(words_list)
    words_topn = counter.most_common(vocab_size)
    words_set = [item[0] for item in words_topn]
    words_set = ['<pad>', "<unk>"] + words_set
    word2idx = {w:i for i, w in enumerate(words_set)}
    idx2word = {i:w for i, w in enumerate(words_set)}
    return words_topn, word2idx, idx2word


def build_x_y(word2idx, sentences_list, label_list, sent_len=30):
    """构建输入模型的数据，对每个单词编码，每个句子通过添加pading保持一样长"""
    x = []
    y = []
    for sent, label in zip(sentences_list, label_list):
        word_x = [0]*sent_len
        if len(sent) > sent_len:
            sent = sent[:sent_len]
        for i, w in enumerate(sent):
            if w in word2idx:
                word_x[i] = word2idx[w]
            else:
                word_x[i] = word2idx['<unk>']
        x.append(word_x)
        y.append(label)
    return x, y


def build_batch_data(data, label, batch_size=32):
    """构建tensor格式的批次数据，返回batch列表，每个batch为二元组包含feature和label"""
    batch_data = []
    # 打乱顺序
    data_labels = [[x, y] for x, y in zip(data, label)]
    random.shuffle(data_labels)
    xlist = [item[0] for item in data_labels]
    ylist = [item[1] for item in data_labels]
    
    x_tensor = torch.tensor(xlist, dtype=torch.long)
    y_tensor = torch.tensor(ylist, dtype=torch.float)
    n, dim = x_tensor.size()
    for start in range(0, n, batch_size):
        end = start + batch_size
        if end > n:
            break
            xbatch = x_tensor[start: ]
            ybatch = y_tensor[start: ]
            print("最后一个batch size:", dbatch.size())
        else:
            xbatch = x_tensor[start: end]
            ybatch = y_tensor[start: end]
        batch_data.append((xbatch, ybatch))
    return batch_data

In [4]:
train_sentences, train_label = load_text_file(train_file)
eval_sentences, eval_label = load_text_file(eval_file)
test_sentences, test_label = load_text_file(test_file)

In [5]:
print("处理后的样本与标签：", train_sentences[0], train_label[0])
print("各个数据集样本数量：")
print(len(train_sentences), len(train_label))
print(len(eval_sentences), len(eval_label))
print(len(test_sentences), len(test_label))

print("各数据集最长最短句子单词数：")
print(max([len(s) for s in train_sentences]), min([len(s) for s in train_sentences]))
print(max([len(s) for s in eval_sentences]), min([len(s) for s in eval_sentences]))
print(max([len(s) for s in test_sentences]), min([len(s) for s in test_sentences]))

处理后的样本与标签： ['hide', 'new', 'secretions', 'from', 'the', 'parental', 'units'] 0
各个数据集样本数量：
67349 67349
872 872
1821 1821
各数据集最长最短句子单词数：
52 1
47 2
56 2


In [6]:
words_set, word2idx, idx2word = build_word_dic(train_sentences, vocab_size=VOCAB_SIZE)
train_x, train_y = build_x_y(word2idx, train_sentences, train_label,sent_len=40)
eval_x, eval_y = build_x_y(word2idx, eval_sentences, eval_label,sent_len=40)
test_x, test_y = build_x_y(word2idx, test_sentences, test_label,sent_len=40)

In [7]:
print("词典长度:", len(words_set), len(word2idx), len(idx2word))
print("训练集样本数量:", len(train_x), len(train_y))

词典长度: 14828 14830 14830
训练集样本数量: 67349 67349


In [8]:
train_data = build_batch_data(train_x, train_y, batch_size=BATCH_SIZE)
eval_data = build_batch_data(eval_x, eval_y, batch_size=BATCH_SIZE)
test_data = build_batch_data(test_x, test_y, batch_size=BATCH_SIZE)

In [9]:
class AttAvgModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, output_size, pad_idx):
        super(AttAvgModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        # 计算 Attention 向量
        self.u = nn.Parameter(torch.randn(embed_dim))
        self.fc = nn.Linear(embed_dim, output_size, bias=False)
        
    def forward(self, text):
        # [batch, seq_len] -> [batch, seq_len, emb_dim]
        embed = self.embedding(text)
        
        # 扩展u这组参数，为的是计算和词向量的相似度，最后得到权重
        # [emb_dim] -> [batch, seq_len, emb_dim]
        u = self.u.repeat(embed.size(0), embed.size(1), 1)  # 在最后一个参数上重复自己
        
        # 计算余弦相似度
        cos = F.cosine_similarity(embed, u, dim=2)   # [batch, seq_len] 计算每个词向量和对应的u向量的余弦相似度
        
        # 计算权重 
        alpha = F.softmax(cos, dim=1)   # [bacth, seq_len]  softmax的作用是使得每个序列的个单词权重之和为1
        alpha = alpha.unsqueeze(2)      # [bacth, seq_len, 1]
        
        # embed*alpha => [bacth, seq_len, emb_dim] 相当于每个词向量（的每个元素）都乘上一个权重
        h_attn = torch.sum(embed*alpha, dim=1).squeeze(1)  # 在1维度上sum 相当于把序列（句子）求和[batch, emb_dim]
        
        # [batch, emb_dim] -> [batch, output_size]即[batch, 1]
        out = self.fc(h_attn) 
      
        return out
    
    def get_embed_weight(self):
        """获取embedding层参数"""
        return self.embedding.weight.data
    
    def get_u(self):
        """attention向量"""
        return self.u


def binary_accuracy(preds, y):
    """计算准确率"""
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()  
    acc = correct.sum()/len(correct)
    return acc


def train(model, device, iterator, optimizer, criterion):
    """训练函数"""
    
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    
    for x, y in iterator:
        x, y = x.to(device), y.to(device) # torch.int64
        optimizer.zero_grad()
        predictions = model(x).squeeze(1)  # torch.float32 
        
        loss = criterion(predictions, y)
        acc = binary_accuracy(predictions, y)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)


def evaluate(model, device, iterator, criterion):
    """验证函数"""
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    
    with torch.no_grad():
        for x, y in iterator:
            x, y = x.to(device), y.to(device)
            predictions = model(x).squeeze(1)
            loss = criterion(predictions, y)
            acc = binary_accuracy(predictions, y)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)


def count_parameters(model):
    """统计模型的参数量"""
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def epoch_time(start_time, end_time):
    """计算时间差，返回分钟, 秒钟"""
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [10]:
INPUT_DIM = len(dict(words_set))
print("INPUT_DIM", INPUT_DIM)
PAD_IDX = word2idx['<pad>']

model = AttAvgModel(VOCAB_SIZE, EMBEDDING_DIM, OUTPUT_DIM, PAD_IDX)
print(f'模型有{count_parameters(model):,}个可调节参数, 大约{count_parameters(model)*4/1024/1024} M.')

model = model.to(DEVICE)

# 使用多块GPU
if NUM_CUDA > 1:
    device_ids = list(range(NUM_CUDA))
    print(device_ids)
    model = nn.DataParallel(model, device_ids=device_ids)
    
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.BCEWithLogitsLoss()

INPUT_DIM 14828
模型有2,000,200个可调节参数, 大约7.630157470703125 M.
[0, 1, 2, 3]


In [11]:
model_name = 'attention-wavg-model.pth'
for epoch in range(1, EPOCHS+1):
    start_time = time.time()
    train_loss, train_acc = train(model, DEVICE, train_data, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, DEVICE, eval_data, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    if valid_loss < BEST_VALID_LOSS:
        BEST_VALID_LOSS = valid_loss
        torch.save(model, model_name)
        print(f'***Save Best Model {model_name}***')
    
    print(f'Epoch: {epoch :02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

  "type " + obj.__name__ + ". It won't be checked "


***Save Best Model attention-wavg-model.pth***
Epoch: 01 | Epoch Time: 0m 38s
	Train Loss: 0.322 | Train Acc: 86.03%
	 Val. Loss: 0.477 |  Val. Acc: 81.94%
Epoch: 02 | Epoch Time: 0m 31s
	Train Loss: 0.193 | Train Acc: 92.61%
	 Val. Loss: 0.588 |  Val. Acc: 80.79%
Epoch: 03 | Epoch Time: 0m 32s
	Train Loss: 0.160 | Train Acc: 93.89%
	 Val. Loss: 0.704 |  Val. Acc: 80.67%
Epoch: 04 | Epoch Time: 0m 32s
	Train Loss: 0.132 | Train Acc: 95.16%
	 Val. Loss: 0.852 |  Val. Acc: 80.32%
Epoch: 05 | Epoch Time: 0m 32s
	Train Loss: 0.110 | Train Acc: 96.08%
	 Val. Loss: 1.009 |  Val. Acc: 79.86%


In [12]:
model = torch.load(model_name)
test_loss, test_acc = evaluate(model, DEVICE, test_data, criterion)
print('Test Loss: {0} | Test Acc: {1} |'.format(test_loss, test_acc))

Test Loss: 0.46318651602736544 | Test Acc: 0.8203125 |


## 分析词向量和Attention向量

In [13]:
model = torch.load(model_name)

In [14]:
word_embedding = model.module.get_embed_weight()   # 注意多GPU的时候是这样
u = model.module.get_u()

In [15]:
print(word_embedding.size(), u.size())

torch.Size([20000, 100]) torch.Size([100])


In [16]:
u_repeat = u.repeat(word_embedding.size()[0], 1)
print(u_repeat.size())
cos_sim = torch.cosine_similarity(word_embedding, u_repeat, dim=1)

torch.Size([20000, 100])


In [17]:
cos_score, cos_idx = cos_sim.sort()

In [18]:
print("Cosine similarity最高的15个单词：")
for i, s in zip(cos_idx[-15:], cos_score[-15: ]):
    print(f"{idx2word[i.item()]} : {s.item()}")

Cosine similarity最高的15个单词：
meticulously : 0.7722846865653992
146 : 0.7751907706260681
grown : 0.77534419298172
wrong : 0.7768417000770569
or : 0.7851654887199402
mine : 0.7884625792503357
drains : 0.791044294834137
weaponry : 0.7968361377716064
missed : 0.8013620972633362
curiosity : 0.803288996219635
n't : 0.8035760521888733
gap : 0.8163576126098633
not : 0.8254992961883545
undoubtedly : 0.8323050737380981
never : 0.9433038234710693


In [19]:
print("Cosine similarity最低的15个单词：")
for i, s in zip(cos_idx[: 15], cos_score[: 15]):
    print(f"{idx2word[i.item()]} : {s.item()}")

Cosine similarity最低的15个单词：
direction : -0.990354597568512
pushing : -0.9856081008911133
extremely : -0.9844442009925842
attention : -0.980595052242279
utterly : -0.9793421030044556
suits : -0.9781764149665833
storyline : -0.978120744228363
tension : -0.9779711365699768
subtlety : -0.9763894081115723
reasonably : -0.9762077331542969
strictly : -0.9760425686836243
ever : -0.9757418036460876
lump : -0.9754454493522644
dish : -0.9741281270980835
father : -0.973986029624939


## 相同单词在不同语境下attention的变化

In [20]:
words_freq = []
for w in words_set:
    if w[-1] >100:
        words_freq.append(w[0])
print(len(words_freq))

661


In [21]:
" ".join(words_freq)

"the , a and of . to 's is that in it as with an film its for movie this you but be on n't by more -- one at than has not about his from are like so or all have most story ' good ... into out too who -rrb- up characters i funny -lrb- comedy if just no does much what can even ` your their will time some bad `` little '' very way which best any love been life make work enough there only he makes us new movies never something do they through was well action great would own made director humor many we really performances plot drama her how could films sense see such better other fun audience people every off two without cast nothing feel both when being look character may should entertaining acting real ever often performance them long : while still world because script also interesting another heart kind 're those hollywood dialogue watch minutes first screen down few get big over far thriller might less hard human moments actors tale compelling romantic rather cinema had year family almo

In [56]:
def word2sentences(sent_list, words_set, word2idx, freq=100):
    words_freq = []
    for w in words_set:
        if w[-1] >100:
            words_freq.append(w[0])
    print(len(words_freq), words_freq[0])
    w2sents = {}
    w2sentnums = {}
    for w in words_freq:
        w2sents[w] = []
        w2sentnums[w] = []
        for s in sent_list:
            if w in s:
                w2sents[w].append(s)
                w2sentnums[w].append([word2idx[word] for word in s])
    return words_freq, w2sents, w2sentnums        

In [57]:
words_freq, w2sents, w2sentnums= word2sentences(train_sentences, words_set, word2idx, freq=100) 

661 the


In [59]:
len(w2sents['the'])

19892

In [60]:
len(w2sentnums['the'])

19892

In [61]:
def get_attentions(sentence, word_embedding, u, word2idx):
    """计算一个句子中每个单词在句子中的Attention，返回单词与Attention值的字典"""
    num_sentence = [word2idx[w] for w in sentence]
    s_embed = word_embedding[num_sentence]
    u = u.repeat(s_embed.size(0), 1)
    score = torch.cosine_similarity(s_embed, u, dim=1)
    attn = torch.softmax(score, dim=0)
    return {w:a for w,a in zip(sentence, attn.tolist()) }

In [62]:
model = torch.load(model_name)
word_embedding = model.module.get_embed_weight()   # 注意多GPU的时候是这样
u = model.module.get_u()

In [78]:
def get_word_sentens_attn_dic(w2sents, word_embedding, u, word2idx):
    word_sentens_attn_dic = {}
    word_attention_li = {}
    for word, sent_list in w2sents.items():
        word_sentens_attn_dic[word] = []
        word_attention_li[word] = []
        for sentence in sent_list:
            dic = get_attentions(sentence, word_embedding, u, word2idx)
            word_sentens_attn_dic[word].append(dic)
            word_attention_li[word].append(dic[word])
    return word_sentens_attn_dic, word_attention_li

In [79]:
word_sentens_attn_dic, word_attention_li = get_word_sentens_attn_dic(w2sents, word_embedding, u, word2idx)  # 这步很慢

In [80]:
def meam_std_list(word_attention_li):
    """计算Attentions的平均值和标准差，并按标准差排序"""
    word_mean_std_li = []
    for w in word_attention_li:
        arr = np.array(word_attention_li[w])
        word_mean_std_li.append((w, arr.mean(), arr.std()))
    word_mean_std_li = sorted(word_mean_std_li, key=lambda x:x[2], reverse=True)
    return word_mean_std_li

In [81]:
## 出现大于100次的单词在句子中的权重，句子需要是等长的吗？
word_attention_li['the']

[0.1692907065153122,
 0.14538156986236572,
 0.10048887133598328,
 0.0691571980714798,
 0.150122731924057,
 0.33087489008903503,
 0.14479711651802063,
 0.17118620872497559,
 0.05708511546254158,
 0.08716180920600891,
 0.29936009645462036,
 0.16640907526016235,
 0.04713677987456322,
 0.17252585291862488,
 0.05475178360939026,
 0.16375745832920074,
 0.3838737905025482,
 0.10259179025888443,
 0.035658784210681915,
 0.11098287999629974,
 0.05896681919693947,
 0.2202649861574173,
 0.2013084590435028,
 0.10524467378854752,
 0.0404231883585453,
 0.057786282151937485,
 0.05429593473672867,
 0.05306461080908775,
 0.047177284955978394,
 0.07680545002222061,
 0.09512997418642044,
 0.02664656937122345,
 0.05905172601342201,
 0.04777286574244499,
 0.06967316567897797,
 0.18306559324264526,
 0.07693853974342346,
 0.09698569029569626,
 0.05509617552161217,
 0.08074217289686203,
 0.059675972908735275,
 0.24197590351104736,
 0.14403052628040314,
 0.043004110455513,
 0.10095784813165665,
 0.0689253583550

In [85]:
word_mean_std_li = meam_std_list(word_attention_li)

In [86]:
print('30个标准差最大的单词: ')
for word, amean, astd in word_mean_std_li[:30]:
    print('{} | std：{:.4}'.format(word, astd))

30个标准差最大的单词: 
awful | std：0.2441
stupid | std：0.2354
watchable | std：0.2349
terrific | std：0.2347
tedious | std：0.2287
appealing | std：0.2266
impressive | std：0.2227
bland | std：0.2225
excellent | std：0.2218
beautifully | std：0.2214
brilliant | std：0.2211
waste | std：0.2205
painful | std：0.2186
remarkable | std：0.218
boring | std：0.218
provocative | std：0.2175
flat | std：0.2171
intriguing | std：0.2159
satisfying | std：0.2125
inventive | std：0.2121
cool | std：0.2118
delightful | std：0.2107
engrossing | std：0.2094
thoughtful | std：0.2084
fine | std：0.2084
unfunny | std：0.2084
convincing | std：0.2083
mess | std：0.2081
gorgeous | std：0.2079
worthy | std：0.2074


In [87]:
word_sentens_attn_dic['awful']

[{'inane': 0.4392082095146179,
  'and': 0.10425573587417603,
  'awful': 0.45653605461120605},
 {'a': 0.14879608154296875,
  'thoroughly': 0.12150495499372482,
  'awful': 0.6039527058601379,
  'movie': 0.12574627995491028},
 {'is': 0.14148804545402527,
  'awful': 0.6700611710548401,
  '.': 0.18845079839229584},
 {'this': 0.04171011596918106,
  'wretchedly': 0.1300811469554901,
  'unfunny': 0.16474273800849915,
  'wannabe': 0.1759365051984787,
  'comedy': 0.0510118305683136,
  'is': 0.034801606088876724,
  'inane': 0.1585584282875061,
  'and': 0.03763733059167862,
  'awful': 0.16481393575668335,
  '-': 0.040706295520067215},
 {'if': 0.052270591259002686,
  'oscar': 0.021867601200938225,
  'had': 0.023103641346096992,
  'a': 0.022198058664798737,
  'category': 0.026109356433153152,
  'called': 0.08210821449756622,
  'best': 0.06687189638614655,
  'bad': 0.09845349192619324,
  'film': 0.030823923647403717,
  'you': 0.044048748910427094,
  'thought': 0.051682379096746445,
  'was': 0.0570182