## RNN with attention for text classification on amazon reviews

In [1]:
import torch

import pandas as pd
import numpy as np
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
import random
import torch.optim as optim

import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
import math
import pickle

from torchtext.data import Field
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [2]:
from tqdm.notebook import tqdm

In [3]:
from apex import amp

In [4]:
# change your path here
PATH = Path("/data2/yinterian/Amazon_review_2014")

Look at `pre_processing_amazon_reviews.ipynb` for pre-processing steps for amazon review dataset

## Dataset

In [5]:
vocab = pickle.load(open(PATH/"vocab123.pickle", "rb"))
vocab_size = len(vocab)

In [6]:
class ReviewsDataset(Dataset):
    def __init__(self, df):
        
        self.reviews = df.review.values
        self.summaries = df.summary.values
        self.new_classes = {1:0, 2:0, 3:1, 4:2, 5:2}
        self.targets = [self.new_classes[x] for x in df.rating.values]

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        summary = self.summaries[idx]
        review = self.reviews[idx]
        words = summary + [word for sent in review for word in sent]
        target = self.targets[idx]
        return words, target

In [7]:
train_df = pickle.load(open(PATH/"Train123.pickle", "rb"))
valid_df = pickle.load(open(PATH/"Valid123.pickle", "rb"))

In [8]:
train_df.shape, valid_df.shape

((4799998, 3), (600000, 3))

In [9]:
train_df.head()

Unnamed: 0,review,summary,rating
0,"[[1237, 2102, 1463, 377, 286, 3089, 304, 2], [...","[842, 485]",5
1,"[[37311, 0, 25, 347, 3, 4987, 332, 2681, 303, ...","[332, 2681, 303]",2
2,"[[13, 318, 4], [1370, 6, 2], [207, 64, 10, 418...",[1370],5
3,"[[336, 143, 8, 2], [14, 50, 1018, 3686, 5491, ...","[7, 1123, 3, 1633, 15383]",5
4,"[[1920, 63, 49809, 2055, 1841, 3, 193, 2], [35...",[87],3


In [10]:
train_ds = ReviewsDataset(train_df[:600000])
valid_ds = ReviewsDataset(valid_df[:60000])

In [11]:
train_ds[0]

([842,
  485,
  1237,
  2102,
  1463,
  377,
  286,
  3089,
  304,
  2,
  33,
  61,
  5,
  251,
  5,
  193,
  2],
 2)

In [12]:
def collate_fn(batch):
    seqs = [torch.LongTensor(sample[0]) for sample in batch]    
    targets = torch.LongTensor([sample[1] for sample in batch])
    seqlens = np.array([len(seq) for seq in seqs])
    
    # pad the batch and re-order 
    padded_seq = torch.zeros(seqlens.shape[0], int(seqlens.max())).long()
    for idx, length in enumerate(seqlens):
        padded_seq[idx, :length] = seqs[idx]
    indices = np.argsort(-seqlens)
    seqlens = seqlens[indices]
    padded_seq = padded_seq[torch.LongTensor(indices)]
    targets = targets[torch.LongTensor(indices)]
    
    return (padded_seq, seqlens, targets)

In [13]:
batch = [train_ds[0], train_ds[1]]

In [14]:
collate_fn(batch)

(tensor([[  332,  2681,   303, 37311,     0,    25,   347,     3,  4987,   332,
           2681,   303,     2,  2681,  4114,  1437, 13787,  2253,  6143,     2,
             26,  4032,   347,  2689,     2,  1073,  3319,   602, 60148,  1718,
            447,     3,   255,   347,  1224,   602,     2],
         [  842,   485,  1237,  2102,  1463,   377,   286,  3089,   304,     2,
             33,    61,     5,   251,     5,   193,     2,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0]]),
 array([37, 17]),
 tensor([0, 2]))

In [88]:
train_dl = DataLoader(train_ds, batch_size=3, shuffle=False, collate_fn=collate_fn)

In [89]:
seq, seqlens, y = next(iter(train_dl))

In [90]:
seq

tensor([[ 1370,    13,   318,     4,  1370,     6,     2,   207,    64,    10,
           418,     2,   119, 12511,    43,  5121,  1166,     2,   400,  6887,
             9, 10027,  1289,   977,    43,  1196,     4,   174,    87,     2,
           188,  1314,    87,    91,    87,   127,   675,    95,     2],
        [  332,  2681,   303, 37311,     0,    25,   347,     3,  4987,   332,
          2681,   303,     2,  2681,  4114,  1437, 13787,  2253,  6143,     2,
            26,  4032,   347,  2689,     2,  1073,  3319,   602, 60148,  1718,
           447,     3,   255,   347,  1224,   602,     2,     0,     0],
        [  842,   485,  1237,  2102,  1463,   377,   286,  3089,   304,     2,
            33,    61,     5,   251,     5,   193,     2,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0]])

In [91]:
emb = nn.Embedding(vocab_size, 10, padding_idx=0)

In [92]:
gru = nn.GRU(10, 5, batch_first=True, bidirectional=True)

In [93]:
seq_emb = emb(seq)
seq_emb.shape

torch.Size([3, 39, 10])

In [94]:
seq_pack = pack_padded_sequence(seq_emb, seqlens, batch_first=True)

In [95]:
out_pack, ht = gru(seq_pack)

In [96]:
out_pad, lens = pad_packed_sequence(out_pack, batch_first=True)

In [97]:
out_pad.shape

torch.Size([3, 39, 10])

In [98]:
ht.shape

torch.Size([2, 3, 5])

In [99]:
ht[0].shape

torch.Size([3, 5])

In [100]:
ht[1].shape

torch.Size([3, 5])

In [103]:
torch.cat([ht[0], ht[1]], 1).shape

torch.Size([3, 10])

In [25]:
## attention module
mlp = nn.Sequential(nn.Linear(5, 5), nn.Tanh())
context_vector = nn.Parameter(torch.Tensor(5))

In [26]:
u = mlp(out_pad)
u.shape

torch.Size([2, 37, 5])

In [27]:
context_vector.shape

torch.Size([5])

In [28]:
attn_weight = F.softmax(u.matmul(context_vector), dim=1)
attn_weight.shape

torch.Size([2, 37])

In [29]:
attn_weight

tensor([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0.]], grad_fn=<SoftmaxBackward>)

In [30]:
attn_weight.sum(1)

tensor([1., 1.], grad_fn=<SumBackward1>)

In [31]:
a = attn_weight.unsqueeze(1)
a.shape

torch.Size([2, 1, 37])

In [32]:
u.shape

torch.Size([2, 37, 5])

In [33]:
out = torch.bmm(a, u)
out.shape

torch.Size([2, 1, 5])

In [34]:
out = out[:,0,:]
out.shape

torch.Size([2, 5])

## Model description

Let $w_t$ be the index of word $t$ in a review.

**Step 1:**
Apply embedding to $w_t$,
$x_t$ = Embddding($w_t$)

**Step 2:** Take the sequence $x = (x_1, \dots, x_T)$ through a GRU layer. $h_1, \dots h_T = GRU(x_1, \dots, x_T)$.
$h_t$ is a representation of the information around $w_t$.

**Step 3: Attention** Not all words contribute equally to the representation of the sentece meaning. We compute attention on each world. 

$u_t = tanh(Wh_t + b)$

$\alpha_t = \frac{exp(u_t u_w)}{\sum_j exp(u_j u_w) }$

$s = \sum_t \alpha_t h_t$

**Step 4:** Linear layer on $s$


## Model

In [59]:
class AttentionModule(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(AttentionModule, self).__init__()
        self.mlp = nn.Sequential(
            nn.Linear(input_dim, output_dim),
            nn.Tanh())
        self.context_vector = nn.Parameter(torch.Tensor(output_dim))

    def forward(self, x):
        u = self.mlp(x)
        alpha = F.softmax(u.matmul(self.context_vector), dim=1)
        weighted_out = torch.bmm(alpha.unsqueeze(1), u)
        return  weighted_out[:,0,:], alpha


class RNNattention(nn.Module):

    def __init__(self, vocab_size, emb_dim=100, hidden_dim=50, output_dim=3, dropout=0.2, bi=False):
        super(RNNattention, self).__init__()
        self.dropout_p = dropout
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.gru = nn.GRU(emb_dim, hidden_dim, batch_first=True, bidirectional=bi)
        self.attention = AttentionModule(hidden_dim, hidden_dim)
        self.classifier = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.2)

    def forward(self, seqs, seqlens):
        embs = self.emb(seqs)
        if self.dropout_p > 0.0:
            embs = self.dropout(embs)
        seq_pack = pack_padded_sequence(embs, seqlens, batch_first=True)
        out_pack, _ = self.gru(seq_pack)
        out_pad, _ = pad_packed_sequence(out_pack, batch_first=True)
        att_out, alpha = self.attention(out_pad)  
        outputs = self.classifier(att_out)    
        return outputs, alpha

## Baseline RNN Model

In [None]:
class RNNBI(nn.Module):

    def __init__(self, vocab_size, emb_dim=100, hidden_dim=50, output_dim=3, bi=False):
        super(RNN, self).__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.gru = nn.GRU(emb_dim, hidden_dim, batch_first=True, bidirectional=bi)
        self.classifier = nn.Linear(2*hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.2)

    def forward(self, seqs, seqlens):
        embs = self.emb(seqs)
        embs = self.dropout(embs)
        seq_pack = pack_padded_sequence(embs, seqlens, batch_first=True)
        _, ht = self.gru(seq_pack)
        ht = torch.cat([ht[0], ht[1]], 1)
        outputs = self.classifier(ht)    
        return outputs, ht

In [104]:
class RNN(nn.Module):

    def __init__(self, vocab_size, emb_dim=100, hidden_dim=50, output_dim=3, bi=False):
        super(RNN, self).__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.gru = nn.GRU(emb_dim, hidden_dim, batch_first=True, bidirectional=bi)
        self.classifier = nn.Linear(2*hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.2)

    def forward(self, seqs, seqlens):
        embs = self.emb(seqs)
        embs = self.dropout(embs)
        seq_pack = pack_padded_sequence(embs, seqlens, batch_first=True)
        _, ht = self.gru(seq_pack)
        ht = torch.cat([ht[0], ht[1]], 1)
        outputs = self.classifier(ht)    
        return outputs, ht

In [105]:
model = RNN(vocab_size, bi=True)

In [106]:
seqs, seqlens, y = next(iter(train_dl))

In [107]:
out, _ = model(seqs, seqlens)

In [108]:
F.cross_entropy(out, y)

tensor(1.0965, grad_fn=<NllLossBackward>)

## Training 

In [65]:
def train_epocs(model, train_dl, valid_dl, optimizer, epochs=10):
    iterations = epochs*len(train_dl)
    pbar = tqdm(total=iterations)
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, s, y in train_dl:
            x = x.cuda()
            y = y.cuda()
            y_pred, _ = model(x, s)
            optimizer.zero_grad()
            loss = F.cross_entropy(y_pred, y)
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
            pbar.update()
        val_loss, val_acc = val_metrics(model, valid_dl)
        print("train loss %.3f val loss %.3f and val accuracy %.3f" % (sum_loss/total, val_loss, val_acc))

In [66]:
def val_metrics(model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    for x, s, y in valid_dl:
        x = x.cuda()
        y = y.cuda()
        y_hat, _ = model(x, s)
        loss = F.cross_entropy(y_hat, y)
        preds = torch.max(y_hat, dim=1)[1]
        correct += (preds==y).float().sum().item()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
    return sum_loss/total, correct/total

In [67]:
# for testing, start with less data
train_ds = ReviewsDataset(train_df)
valid_ds = ReviewsDataset(valid_df)

In [68]:
batch_size=1000
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dl = DataLoader(valid_ds, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [47]:
# training baseline model
model = RNN(vocab_size).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
train_epocs(model, train_dl, valid_dl, optimizer, epochs=10)

HBox(children=(FloatProgress(value=0.0, max=48000.0), HTML(value='')))

train loss 0.339 val loss 0.305 and val accuracy 0.887
train loss 0.308 val loss 0.294 and val accuracy 0.890
train loss 0.296 val loss 0.284 and val accuracy 0.894
train loss 0.288 val loss 0.278 and val accuracy 0.897
train loss 0.281 val loss 0.275 and val accuracy 0.897
train loss 0.276 val loss 0.271 and val accuracy 0.899
train loss 0.271 val loss 0.266 and val accuracy 0.902
train loss 0.267 val loss 0.263 and val accuracy 0.902
train loss 0.264 val loss 0.260 and val accuracy 0.904
train loss 0.261 val loss 0.258 and val accuracy 0.905


In [None]:
# training bidirectional baseline model
model = RNN(vocab_size, bi=True).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
train_epocs(model, train_dl, valid_dl, optimizer, epochs=10)

HBox(children=(FloatProgress(value=0.0, max=16000000.0), HTML(value='')))

## Playing with learning rates and dropout
Bottom line: the right hyperparameters are supper important. I used a small subset of the dats here.

In [38]:
n = len(train_ds)

In [39]:
train_ds = ReviewsDataset(train_df[:n])
valid_ds = ReviewsDataset(valid_df)

In [40]:
batch_size=1000
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dl = DataLoader(valid_ds, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [48]:
model = RNNattention(vocab_size, dropout=0.0).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [49]:
# without dropout and with a small learning rate
train_epocs(model, train_dl, valid_dl, optimizer, epochs=10)

HBox(children=(FloatProgress(value=0.0, max=6000.0), HTML(value='')))

train loss nan val loss nan and val accuracy 0.132
train loss nan val loss nan and val accuracy 0.132
train loss nan val loss nan and val accuracy 0.132
train loss nan val loss nan and val accuracy 0.132
train loss nan val loss nan and val accuracy 0.132
train loss nan val loss nan and val accuracy 0.132
train loss nan val loss nan and val accuracy 0.132
train loss nan val loss nan and val accuracy 0.132
train loss nan val loss nan and val accuracy 0.132
train loss nan val loss nan and val accuracy 0.132


In [50]:
model = RNNattention(vocab_size, dropout=0.0).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
train_epocs(model, train_dl, valid_dl, optimizer, epochs=10)

HBox(children=(FloatProgress(value=0.0, max=6000.0), HTML(value='')))

train loss 0.419 val loss 0.337 and val accuracy 0.875
train loss 0.316 val loss 0.319 and val accuracy 0.882
train loss 0.273 val loss 0.317 and val accuracy 0.886
train loss 0.231 val loss 0.328 and val accuracy 0.886
train loss 0.193 val loss 0.338 and val accuracy 0.887
train loss 0.162 val loss 0.371 and val accuracy 0.880
train loss 0.138 val loss 0.387 and val accuracy 0.885
train loss 0.121 val loss 0.438 and val accuracy 0.883
train loss 0.108 val loss 0.448 and val accuracy 0.885
train loss 0.097 val loss 0.470 and val accuracy 0.886


In [51]:
# with dropout 0.2
model = RNNattention(vocab_size, dropout=0.2).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
train_epocs(model, train_dl, valid_dl, optimizer, epochs=10)

HBox(children=(FloatProgress(value=0.0, max=6000.0), HTML(value='')))

train loss 0.461 val loss 0.356 and val accuracy 0.870
train loss 0.346 val loss 0.334 and val accuracy 0.878
train loss 0.315 val loss 0.324 and val accuracy 0.881
train loss 0.291 val loss 0.320 and val accuracy 0.883
train loss 0.271 val loss 0.314 and val accuracy 0.886
train loss 0.251 val loss 0.320 and val accuracy 0.888
train loss 0.233 val loss 0.332 and val accuracy 0.888
train loss 0.217 val loss 0.332 and val accuracy 0.889
train loss 0.205 val loss 0.343 and val accuracy 0.889
train loss 0.192 val loss 0.362 and val accuracy 0.887


In [52]:
# with dropout 0.2
model = RNNattention(vocab_size, dropout=0.2).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=0.003)
train_epocs(model, train_dl, valid_dl, optimizer, epochs=10)

HBox(children=(FloatProgress(value=0.0, max=6000.0), HTML(value='')))

train loss 0.457 val loss 0.347 and val accuracy 0.873
train loss 0.341 val loss 0.329 and val accuracy 0.878
train loss 0.317 val loss 0.323 and val accuracy 0.881
train loss 0.298 val loss 0.319 and val accuracy 0.884
train loss 0.282 val loss 0.315 and val accuracy 0.886
train loss 0.266 val loss 0.319 and val accuracy 0.887
train loss 0.251 val loss 0.320 and val accuracy 0.888
train loss 0.237 val loss 0.323 and val accuracy 0.888
train loss 0.225 val loss 0.323 and val accuracy 0.889
train loss 0.213 val loss 0.335 and val accuracy 0.889


## Training on all the data

In [53]:
def save_model(m, p): torch.save(m.state_dict(), p)
    
def load_model(m, p): m.load_state_dict(torch.load(p))

In [54]:
train_ds = ReviewsDataset(train_df)
valid_ds = ReviewsDataset(valid_df)

In [55]:
batch_size=1000
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dl = DataLoader(valid_ds, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [56]:
model = RNNattention(vocab_size).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
train_epocs(model, train_dl, valid_dl, optimizer, epochs=10)

HBox(children=(FloatProgress(value=0.0, max=48000.0), HTML(value='')))

train loss 0.343 val loss 0.308 and val accuracy 0.887
train loss 0.309 val loss 0.295 and val accuracy 0.890
train loss 0.298 val loss 0.288 and val accuracy 0.893
train loss 0.292 val loss 0.287 and val accuracy 0.894
train loss 0.286 val loss 0.280 and val accuracy 0.896
train loss 0.281 val loss 0.277 and val accuracy 0.897
train loss 0.277 val loss 0.276 and val accuracy 0.897
train loss 0.274 val loss 0.272 and val accuracy 0.899
train loss 0.271 val loss 0.270 and val accuracy 0.900
train loss 0.268 val loss 0.272 and val accuracy 0.899


In [57]:
val_loss, val_acc = val_metrics(model, valid_dl)
path = "{0}/models/model_att_acc_{1:.0f}.pth".format(PATH, 100*val_acc) 
save_model(model, path)
print(val_loss, val_acc, path)

0.27222656562924386 0.898875 /data2/yinterian/Amazon_review_2014/models/model_att_acc_90.pth


In [58]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.003)
train_epocs(model, train_dl, valid_dl, optimizer, epochs=10)

HBox(children=(FloatProgress(value=0.0, max=48000.0), HTML(value='')))

train loss 0.261 val loss 0.262 and val accuracy 0.903
train loss 0.258 val loss 0.260 and val accuracy 0.904
train loss 0.255 val loss 0.259 and val accuracy 0.905
train loss 0.253 val loss 0.258 and val accuracy 0.905
train loss 0.251 val loss 0.255 and val accuracy 0.906
train loss 0.249 val loss 0.254 and val accuracy 0.907
train loss 0.247 val loss 0.252 and val accuracy 0.907
train loss 0.245 val loss 0.252 and val accuracy 0.908
train loss 0.243 val loss 0.254 and val accuracy 0.908
train loss 0.241 val loss 0.250 and val accuracy 0.909


In [59]:
val_loss, val_acc = val_metrics(model, valid_dl)
path = "{0}/models/model_att_acc_{1:.0f}.pth".format(PATH, 100*val_acc) 
save_model(model, path)
print(val_loss, val_acc, path)

0.24984130665659904 0.90873 /data2/yinterian/Amazon_review_2014/models/model_att_acc_91.pth


## Visualizing attention

In [86]:
import matplotlib
import matplotlib.pyplot as plt

def colorize(words, color_array):
    # words is a list of words
    # color_array is an array of numbers between 0 and 1 of length equal to words
    cmap=matplotlib.cm.Blues
    template = '<span class="barcode"; style="color: black; background-color: {}">{}</span>'
    colored_string = ''
    for word, color in zip(words, color_array):
        color = matplotlib.colors.rgb2hex(cmap(color)[:3])
        colored_string += template.format(color, '&nbsp' + word + '&nbsp')
    return colored_string

In [103]:
model = RNNattention(vocab_size).cuda()
path = '/data2/yinterian/Amazon_review_2014/models/model_att_acc_91.pth'
load_model(model, path)
model.eval()

RNNattention(
  (emb): Embedding(282174, 100, padding_idx=0)
  (gru): GRU(100, 50, batch_first=True)
  (attention): AttentionModule(
    (mlp): Sequential(
      (0): Linear(in_features=50, out_features=50, bias=True)
      (1): Tanh()
    )
  )
  (classifier): Linear(in_features=50, out_features=3, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [228]:
batch_size=1
valid_dl = DataLoader(valid_ds, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

In [229]:
x,s,y = next(iter(valid_dl))

In [230]:
out, alpha0 = model(x.cuda(), s)
F.softmax(out, 1)

tensor([[5.0937e-05, 5.7472e-04, 9.9937e-01]], device='cuda:0',
       grad_fn=<SoftmaxBackward>)

In [231]:
x, y

(tensor([[   7,   32,    7,   17,   40,  856, 8930, 3983,  548,    2, 1791, 2986,
             7, 2986,  149,    2]]),
 tensor([2]))

In [232]:
words = np.array([vocab.itos[w] for w in x[0].numpy()])
words

array(['great', 'price', 'great', 'product', 'perfect', 'keeping', 'pup',
       'cozy', 'dry', '.', 'snap', 'hood', 'great', 'hood', 'needed', '.'],
      dtype='<U7')

In [233]:
attn = alpha[0].cpu().detach().numpy()

In [234]:
s = colorize(words, len(words)/2.5*attn)

In [235]:
from IPython.display import display, HTML
display(HTML(s))

In [227]:
attn

array([0.20435348, 0.07452422, 0.09727789, 0.00580973, 0.06881874,
       0.06306457, 0.10320041, 0.00465984, 0.0258216 , 0.07656651,
       0.02928499, 0.07915612, 0.06866734, 0.0147968 , 0.08399773],
      dtype=float32)

## References
* Hierarchical Attention Networks for Document Classification. Zichao Yang, Diyi Yang, Chris Dyer, Xiaodong He, Alex Smola, Eduard Hovy
* https://pytorchnlp.readthedocs.io/en/latest/_modules/torchnlp/nn/attention.html
* https://gist.github.com/ihsgnef/f13c35cd46624c8f458a4d23589ac768
* https://stackoverflow.com/questions/59220488/to-visualize-attention-color-tokens-using-attention-weights
