## Attention Weighted word averaging

In [1]:
import time
import random
import numpy as np
from collections import Counter

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

random.seed(2019)

In [2]:
VOCAB_SIZE = 14_828

EPOCHS = 5
BATCH_SIZE = 32
LEARNING_RATE = 0.01
BEST_VALID_LOSS = float('inf')

EMBEDDING_DIM = 100
OUTPUT_DIM = 1

train_file = "data/senti.train.tsv"
eval_file = "data/senti.dev.tsv"
test_file = "data/senti.test.tsv"

USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device('cuda:1' if USE_CUDA else 'cpu')
NUM_CUDA = torch.cuda.device_count()

In [3]:
def load_text_file(filename):
    """将样本的特征与标签分开，并将样本特征分词"""
    sentences = []
    label = []
    with open(filename, "r") as f:
        sent_list  = [line.strip().split('\t') for line in f]
    for sample in sent_list:
        sentences.append(sample[0].lower().split(" "))
        label.append(int(sample[-1]))
    return sentences, label


def build_word_dic(sentences_list, vocab_size=20_000):
    """构建words_set, word2idx, idx2word"""
    words_list = [w for line in sentences_list for w in line]
    counter = Counter(words_list)
    words_topn = counter.most_common(vocab_size)
    words_set = [item[0] for item in words_topn]
    words_set = ['<pad>', "<unk>"] + words_set
    word2idx = {w:i for i, w in enumerate(words_set)}
    idx2word = {i:w for i, w in enumerate(words_set)}
    return words_topn, word2idx, idx2word


def build_x_y(word2idx, sentences_list, label_list, sent_len=30):
    """构建输入模型的数据，对每个单词编码，每个句子通过添加pading保持一样长"""
    x = []
    y = []
    for sent, label in zip(sentences_list, label_list):
        word_x = [0]*sent_len
        if len(sent) > sent_len:
            sent = sent[:sent_len]
        for i, w in enumerate(sent):
            if w in word2idx:
                word_x[i] = word2idx[w]
            else:
                word_x[i] = word2idx['<unk>']
        x.append(word_x)
        y.append(label)
    return x, y


def build_batch_data(data, label, batch_size=32):
    """构建tensor格式的批次数据，返回batch列表，每个batch为二元组包含feature和label"""
    batch_data = []
    # 打乱顺序
    data_labels = [[x, y] for x, y in zip(data, label)]
    random.shuffle(data_labels)
    xlist = [item[0] for item in data_labels]
    ylist = [item[1] for item in data_labels]
    
    x_tensor = torch.tensor(xlist, dtype=torch.long)
    y_tensor = torch.tensor(ylist, dtype=torch.float)
    n, dim = x_tensor.size()
    for start in range(0, n, batch_size):
        end = start + batch_size
        if end > n:
            break
            xbatch = x_tensor[start: ]
            ybatch = y_tensor[start: ]
            print("最后一个batch size:", dbatch.size())
        else:
            xbatch = x_tensor[start: end]
            ybatch = y_tensor[start: end]
        batch_data.append((xbatch, ybatch))
    return batch_data

In [4]:
train_sentences, train_label = load_text_file(train_file)
eval_sentences, eval_label = load_text_file(eval_file)
test_sentences, test_label = load_text_file(test_file)

In [5]:
print("处理后的样本与标签：", train_sentences[0], train_label[0])
print("各个数据集样本数量：")
print(len(train_sentences), len(train_label))
print(len(eval_sentences), len(eval_label))
print(len(test_sentences), len(test_label))

print("各数据集最长最短句子单词数：")
print(max([len(s) for s in train_sentences]), min([len(s) for s in train_sentences]))
print(max([len(s) for s in eval_sentences]), min([len(s) for s in eval_sentences]))
print(max([len(s) for s in test_sentences]), min([len(s) for s in test_sentences]))

处理后的样本与标签： ['hide', 'new', 'secretions', 'from', 'the', 'parental', 'units'] 0
各个数据集样本数量：
67349 67349
872 872
1821 1821
各数据集最长最短句子单词数：
52 1
47 2
56 2


In [6]:
max_seq_len = 56
words_set, word2idx, idx2word = build_word_dic(train_sentences, vocab_size=VOCAB_SIZE)
train_x, train_y = build_x_y(word2idx, train_sentences, train_label,sent_len=max_seq_len)
eval_x, eval_y = build_x_y(word2idx, eval_sentences, eval_label,sent_len=max_seq_len)
test_x, test_y = build_x_y(word2idx, test_sentences, test_label,sent_len=max_seq_len)

In [7]:
print("词典长度:", len(words_set), len(word2idx), len(idx2word))
print("训练集样本数量:", len(train_x), len(train_y))

词典长度: 14828 14830 14830
训练集样本数量: 67349 67349


In [8]:
train_data = build_batch_data(train_x, train_y, batch_size=BATCH_SIZE)
eval_data = build_batch_data(eval_x, eval_y, batch_size=BATCH_SIZE)
test_data = build_batch_data(test_x, test_y, batch_size=BATCH_SIZE)

In [9]:
class AttAvgModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, output_size, pad_idx):
        super(AttAvgModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        # 计算 Attention 向量
        self.u = nn.Parameter(torch.randn(embed_dim))
        self.fc = nn.Linear(embed_dim, output_size, bias=False)
        
    def forward(self, text):
        # [batch, seq_len] -> [batch, seq_len, emb_dim]
        embed = self.embedding(text)
        
        # 扩展u这组参数，为的是计算和词向量的相似度，最后得到权重
        # [emb_dim] -> [batch, seq_len, emb_dim]
        u = self.u.repeat(embed.size(0), embed.size(1), 1)  # 在最后一个参数上重复自己
        
        # 计算余弦相似度
        cos = F.cosine_similarity(embed, u, dim=2)   # [batch, seq_len] 计算每个词向量和对应的u向量的余弦相似度
        
        # 计算权重 
        alpha = F.softmax(cos, dim=1)   # [bacth, seq_len]  softmax的作用是使得每个序列的个单词权重之和为1
        alpha = alpha.unsqueeze(2)      # [bacth, seq_len, 1]
        
        # embed*alpha => [bacth, seq_len, emb_dim] 相当于每个词向量（的每个元素）都乘上一个权重
        h_attn = torch.sum(embed*alpha, dim=1).squeeze(1)  # 在1维度上sum 相当于把序列（句子）求和[batch, emb_dim]
        
        # [batch, emb_dim] -> [batch, output_size]即[batch, 1]
        out = self.fc(h_attn) 
      
        return out
    
    def get_embed_weight(self):
        """获取embedding层参数"""
        return self.embedding.weight.data
    
    def get_u(self):
        """attention向量"""
        return self.u


def binary_accuracy(preds, y):
    """计算准确率"""
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()  
    acc = correct.sum()/len(correct)
    return acc


def train(model, device, iterator, optimizer, criterion):
    """训练函数"""
    
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    
    for x, y in iterator:
        x, y = x.to(device), y.to(device) # torch.int64
        optimizer.zero_grad()
        predictions = model(x).squeeze(1)  # torch.float32 
        
        loss = criterion(predictions, y)
        acc = binary_accuracy(predictions, y)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)


def evaluate(model, device, iterator, criterion):
    """验证函数"""
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    
    with torch.no_grad():
        for x, y in iterator:
            x, y = x.to(device), y.to(device)
            predictions = model(x).squeeze(1)
            loss = criterion(predictions, y)
            acc = binary_accuracy(predictions, y)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)


def count_parameters(model):
    """统计模型的参数量"""
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def epoch_time(start_time, end_time):
    """计算时间差，返回分钟, 秒钟"""
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [10]:
INPUT_DIM = len(words_set) + 2
print("INPUT_DIM", INPUT_DIM)
PAD_IDX = word2idx['<pad>']

model = AttAvgModel(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM, PAD_IDX)
print(f'模型有{count_parameters(model):,}个可调节参数, 大约{count_parameters(model)*4/1024/1024} M.')

model = model.to(DEVICE)
print("device:", DEVICE)

    
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.BCEWithLogitsLoss()

INPUT_DIM 14830
模型有1,483,200个可调节参数, 大约5.657958984375 M.
device: cuda:1


In [11]:
model_name = 'attention-wavg-model.pth'
for epoch in range(1, EPOCHS+1):
    start_time = time.time()
    train_loss, train_acc = train(model, DEVICE, train_data, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, DEVICE, eval_data, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    if valid_loss < BEST_VALID_LOSS:
        BEST_VALID_LOSS = valid_loss
        torch.save(model, model_name)
        print(f'***Save Best Model {model_name}***')
    
    print(f'Epoch: {epoch :02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

  "type " + obj.__name__ + ". It won't be checked "


***Save Best Model attention-wavg-model.pth***
Epoch: 01 | Epoch Time: 0m 9s
	Train Loss: 0.327 | Train Acc: 85.78%
	 Val. Loss: 0.493 |  Val. Acc: 81.25%
Epoch: 02 | Epoch Time: 0m 10s
	Train Loss: 0.194 | Train Acc: 92.50%
	 Val. Loss: 0.588 |  Val. Acc: 79.86%
Epoch: 03 | Epoch Time: 0m 10s
	Train Loss: 0.163 | Train Acc: 93.81%
	 Val. Loss: 0.685 |  Val. Acc: 80.09%
Epoch: 04 | Epoch Time: 0m 9s
	Train Loss: 0.138 | Train Acc: 94.90%
	 Val. Loss: 0.813 |  Val. Acc: 79.86%
Epoch: 05 | Epoch Time: 0m 9s
	Train Loss: 0.117 | Train Acc: 95.70%
	 Val. Loss: 0.955 |  Val. Acc: 80.44%


In [12]:
model = torch.load(model_name)
test_loss, test_acc = evaluate(model, DEVICE, test_data, criterion)
print('Test Loss: {0} | Test Acc: {1} |'.format(test_loss, test_acc))

Test Loss: 0.4549624267965555 | Test Acc: 0.8141741071428571 |


## 分析词向量和Attention向量

In [13]:
model = torch.load(model_name)

In [16]:
word_embedding = model.get_embed_weight()  # 注意多GPU的时候是这样
u = model.get_u()

In [17]:
print(word_embedding.size(), u.size())

torch.Size([14830, 100]) torch.Size([100])


In [18]:
u_repeat = u.repeat(word_embedding.size()[0], 1)
print(u_repeat.size())
cos_sim = torch.cosine_similarity(word_embedding, u_repeat, dim=1)

torch.Size([14830, 100])


In [19]:
cos_score, cos_idx = cos_sim.sort()

In [20]:
print("Cosine similarity最高的15个单词：")
for i, s in zip(cos_idx[-15:], cos_score[-15: ]):
    print(f"{idx2word[i.item()]} : {s.item()}")

Cosine similarity最高的15个单词：
nose : 0.8122435808181763
soccer : 0.819983184337616
telanovela : 0.8228465914726257
tank : 0.8235946297645569
hopelessly : 0.823834240436554
connected : 0.828292965888977
rises : 0.8358004689216614
n't : 0.8396202921867371
down : 0.8575065732002258
induces : 0.8670101165771484
seems : 0.8749212026596069
not : 0.8817094564437866
wrong : 0.885643720626831
minutes : 0.9224189519882202
or : 0.9354701042175293


In [21]:
print("Cosine similarity最低的15个单词：")
for i, s in zip(cos_idx[: 15], cos_score[: 15]):
    print(f"{idx2word[i.item()]} : {s.item()}")

Cosine similarity最低的15个单词：
caine : -0.9839131236076355
boys : -0.9837092161178589
stardom : -0.9834532141685486
player : -0.9832771420478821
roots : -0.9796615839004517
sometimes : -0.9795486927032471
contrivances : -0.9772619605064392
describe : -0.9753174185752869
words : -0.9740455746650696
purpose : -0.9728651642799377
italian : -0.9713523387908936
actually : -0.9706254601478577
quite : -0.9703256487846375
delivery : -0.9696605205535889
enervating : -0.9686934351921082


## 相同单词在不同语境下attention的变化

In [22]:
words_freq = []
for w in words_set:
    if w[-1] >100:
        words_freq.append(w[0])
print(len(words_freq))

661


In [23]:
" ".join(words_freq)

"the , a and of . to 's is that in it as with an film its for movie this you but be on n't by more -- one at than has not about his from are like so or all have most story ' good ... into out too who -rrb- up characters i funny -lrb- comedy if just no does much what can even ` your their will time some bad `` little '' very way which best any love been life make work enough there only he makes us new movies never something do they through was well action great would own made director humor many we really performances plot drama her how could films sense see such better other fun audience people every off two without cast nothing feel both when being look character may should entertaining acting real ever often performance them long : while still world because script also interesting another heart kind 're those hollywood dialogue watch minutes first screen down few get big over far thriller might less hard human moments actors tale compelling romantic rather cinema had year family almo

In [24]:
def word2sentences(sent_list, words_set, word2idx, freq=100):
    words_freq = []
    for w in words_set:
        if w[-1] >100:
            words_freq.append(w[0])
    print(len(words_freq), words_freq[0])
    w2sents = {}
    w2sentnums = {}
    for w in words_freq:
        w2sents[w] = []
        w2sentnums[w] = []
        for s in sent_list:
            if w in s:
                w2sents[w].append(s)
                w2sentnums[w].append([word2idx[word] for word in s])
    return words_freq, w2sents, w2sentnums        

In [25]:
words_freq, w2sents, w2sentnums= word2sentences(train_sentences, words_set, word2idx, freq=100) 

661 the


In [26]:
len(w2sents['the'])

19892

In [27]:
len(w2sentnums['the'])

19892

In [28]:
def get_attentions(sentence, word_embedding, u, word2idx):
    """计算一个句子中每个单词在句子中的Attention，返回单词与Attention值的字典"""
    num_sentence = [word2idx[w] for w in sentence]
    s_embed = word_embedding[num_sentence]
    u = u.repeat(s_embed.size(0), 1)
    score = torch.cosine_similarity(s_embed, u, dim=1)
    attn = torch.softmax(score, dim=0)
    return {w:a for w,a in zip(sentence, attn.tolist()) }

In [30]:
model = torch.load(model_name)
word_embedding = model.get_embed_weight()   # 注意多GPU的时候中间加上module
u = model.get_u()

In [31]:
def get_word_sentens_attn_dic(w2sents, word_embedding, u, word2idx):
    word_sentens_attn_dic = {}
    word_attention_li = {}
    for word, sent_list in w2sents.items():
        word_sentens_attn_dic[word] = []
        word_attention_li[word] = []
        for sentence in sent_list:
            dic = get_attentions(sentence, word_embedding, u, word2idx)
            word_sentens_attn_dic[word].append(dic)
            word_attention_li[word].append(dic[word])
    return word_sentens_attn_dic, word_attention_li

In [32]:
word_sentens_attn_dic, word_attention_li = get_word_sentens_attn_dic(w2sents, word_embedding, u, word2idx)  # 这步很慢

In [33]:
def meam_std_list(word_attention_li):
    """计算Attentions的平均值和标准差，并按标准差排序"""
    word_mean_std_li = []
    for w in word_attention_li:
        arr = np.array(word_attention_li[w])
        word_mean_std_li.append((w, arr.mean(), arr.std()))
    word_mean_std_li = sorted(word_mean_std_li, key=lambda x:x[2], reverse=True)
    return word_mean_std_li

In [34]:
## 出现大于100次的单词在句子中的权重，句子需要是等长的吗？
word_attention_li['the']

[0.1190217062830925,
 0.0888654813170433,
 0.06562710553407669,
 0.04432917386293411,
 0.10898833721876144,
 0.2384878396987915,
 0.10173339396715164,
 0.10063987225294113,
 0.03695022687315941,
 0.05876564234495163,
 0.1934322565793991,
 0.10889307409524918,
 0.03232577070593834,
 0.11339320987462997,
 0.03404928743839264,
 0.11830116808414459,
 0.2688466012477875,
 0.06260889768600464,
 0.02309643104672432,
 0.06964331865310669,
 0.03861911594867706,
 0.13636240363121033,
 0.12204936146736145,
 0.06669215857982635,
 0.024942168965935707,
 0.03721117228269577,
 0.03583051636815071,
 0.03584276884794235,
 0.030492568388581276,
 0.049663230776786804,
 0.05735832452774048,
 0.017051974311470985,
 0.03523822873830795,
 0.03202565386891365,
 0.04725823923945427,
 0.12893228232860565,
 0.05369587987661362,
 0.059402815997600555,
 0.03468054160475731,
 0.05395803973078728,
 0.03539625182747841,
 0.2015077769756317,
 0.09085898846387863,
 0.028005624189972878,
 0.0667819231748581,
 0.04037237

In [35]:
word_mean_std_li = meam_std_list(word_attention_li)

In [36]:
print('30个标准差最大的单词: ')
for word, amean, astd in word_mean_std_li[:30]:
    print('{} | std：{:.4}'.format(word, astd))

30个标准差最大的单词: 
awful | std：0.2445
stupid | std：0.2341
tedious | std：0.2339
terrific | std：0.2335
watchable | std：0.2332
excellent | std：0.225
painful | std：0.2246
brilliant | std：0.2244
impressive | std：0.2235
appealing | std：0.2233
inventive | std：0.2226
waste | std：0.2218
beautifully | std：0.2206
flat | std：0.218
bland | std：0.2173
worthy | std：0.2161
remarkable | std：0.2155
provocative | std：0.2146
intriguing | std：0.2144
cool | std：0.2143
fine | std：0.2141
boring | std：0.2137
unfunny | std：0.2135
mess | std：0.2135
hackneyed | std：0.2115
engrossing | std：0.2104
gorgeous | std：0.2103
lacking | std：0.2083
delightful | std：0.2069
stylish | std：0.2067


In [37]:
word_sentens_attn_dic['awful']

[{'inane': 0.4294554591178894,
  'and': 0.1071346178650856,
  'awful': 0.4634098410606384},
 {'a': 0.14175325632095337,
  'thoroughly': 0.14105528593063354,
  'awful': 0.580115556716919,
  'movie': 0.13707591593265533},
 {'is': 0.13191638886928558,
  'awful': 0.6552016139030457,
  '.': 0.21288198232650757},
 {'this': 0.034046296030282974,
  'wretchedly': 0.16158755123615265,
  'unfunny': 0.1627073585987091,
  'wannabe': 0.16242194175720215,
  'comedy': 0.05273708328604698,
  'is': 0.03338056057691574,
  'inane': 0.1536465287208557,
  'and': 0.03832961246371269,
  'awful': 0.16579440236091614,
  '-': 0.035348568111658096},
 {'if': 0.06239921972155571,
  'oscar': 0.0761847272515297,
  'had': 0.02653614804148674,
  'a': 0.022323040291666985,
  'category': 0.02682187594473362,
  'called': 0.0599784180521965,
  'best': 0.06136414036154747,
  'bad': 0.09426537156105042,
  'film': 0.028234658762812614,
  'you': 0.03237655386328697,
  'thought': 0.03296743333339691,
  'was': 0.0536056123673915

In [41]:
word = "beautifully"
i = 0
for dic in word_sentens_attn_dic[word]:
    word_score = dic[word]
    sentence = []
    sentence_scores = []
    for k, v in dic.items():
        sentence.append(k)
        sentence_scores.append(v)
    print(len(sentence), "|", word_score, "|", sentence, sentence_scores)
    i+=1
    if i > 10:
        break

2 | 0.7551351189613342 | ['a', 'beautifully'] [0.24486486613750458, 0.7551351189613342]
24 | 0.07511750608682632 | ['beautifully', 'acted', 'and', 'directed', ',', 'it', "'s", 'clear', 'that', 'washington', 'most', 'certainly', 'has', 'a', 'new', 'career', 'ahead', 'of', 'him', 'if', 'he', 'so', 'chooses', '.'] [0.07511750608682632, 0.030896199867129326, 0.023045657202601433, 0.019230667501688004, 0.06678497791290283, 0.04165038466453552, 0.03323228657245636, 0.020300572738051414, 0.03293433040380478, 0.07515758275985718, 0.018535463139414787, 0.07497537136077881, 0.0301223024725914, 0.024358076974749565, 0.024615293368697166, 0.019183138385415077, 0.07763700187206268, 0.05293694883584976, 0.019979232922196388, 0.06808774173259735, 0.038470346480607986, 0.029685400426387787, 0.07067517191171646, 0.032388318330049515]
10 | 0.21474631130695343 | ['should', 'have', 'a', 'stirring', 'time', 'at', 'this', 'beautifully', 'drawn', 'movie'] [0.11354087293148041, 0.06269878894090652, 0.06963498