## RNN with attention for text classification on amazon reviews

In [1]:
import torch

import pandas as pd
import numpy as np
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
import random
import torch.optim as optim

import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
import math
import pickle

from torchtext.data import Field
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [2]:
from tqdm.notebook import tqdm

In [45]:
from apex import amp

In [3]:
# change your path here
PATH = Path("/data2/yinterian/Amazon_review_2014")

Look at `pre_processing_amazon_reviews.ipynb` for pre-processing steps for amazon review dataset

## Dataset

In [4]:
vocab = pickle.load(open(PATH/"vocab123.pickle", "rb"))
vocab_size = len(vocab)

In [5]:
class ReviewsDataset(Dataset):
    def __init__(self, df):
        
        self.reviews = df.review.values
        self.summaries = df.summary.values
        self.new_classes = {1:0, 2:0, 3:1, 4:2, 5:2}
        self.targets = [self.new_classes[x] for x in df.rating.values]

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        summary = self.summaries[idx]
        review = self.reviews[idx]
        words = summary + [word for sent in review for word in sent]
        target = self.targets[idx]
        return words, target

In [6]:
train_df = pickle.load(open(PATH/"Train123.pickle", "rb"))
valid_df = pickle.load(open(PATH/"Valid123.pickle", "rb"))

In [7]:
train_df.head()

Unnamed: 0,review,summary,rating
0,"[[1237, 2102, 1463, 377, 286, 3089, 304, 2], [...","[842, 485]",5
1,"[[37311, 0, 25, 347, 3, 4987, 332, 2681, 303, ...","[332, 2681, 303]",2
2,"[[13, 318, 4], [1370, 6, 2], [207, 64, 10, 418...",[1370],5
3,"[[336, 143, 8, 2], [14, 50, 1018, 3686, 5491, ...","[7, 1123, 3, 1633, 15383]",5
4,"[[1920, 63, 49809, 2055, 1841, 3, 193, 2], [35...",[87],3


In [8]:
train_ds = ReviewsDataset(train_df)
valid_ds = ReviewsDataset(valid_df)

In [9]:
train_ds[0]

([842,
  485,
  1237,
  2102,
  1463,
  377,
  286,
  3089,
  304,
  2,
  33,
  61,
  5,
  251,
  5,
  193,
  2],
 2)

In [10]:
def collate_fn(batch):
    seqs = [torch.LongTensor(sample[0]) for sample in batch]    
    targets = torch.LongTensor([sample[1] for sample in batch])
    seqlens = np.array([len(seq) for seq in seqs])
    
    # pad the batch and re-order 
    padded_seq = torch.zeros(seqlens.shape[0], int(seqlens.max())).long()
    for idx, length in enumerate(seqlens):
        padded_seq[idx, :length] = seqs[idx]
    indices = np.argsort(-seqlens)
    seqlens = seqlens[indices]
    padded_seq = padded_seq[torch.LongTensor(indices)]
    targets = targets[torch.LongTensor(indices)]
    
    return (padded_seq, seqlens, targets)

In [11]:
batch = [train_ds[0], train_ds[1]]

In [12]:
collate_fn(batch)

(tensor([[  332,  2681,   303, 37311,     0,    25,   347,     3,  4987,   332,
           2681,   303,     2,  2681,  4114,  1437, 13787,  2253,  6143,     2,
             26,  4032,   347,  2689,     2,  1073,  3319,   602, 60148,  1718,
            447,     3,   255,   347,  1224,   602,     2],
         [  842,   485,  1237,  2102,  1463,   377,   286,  3089,   304,     2,
             33,    61,     5,   251,     5,   193,     2,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0]]),
 array([37, 17]),
 tensor([0, 2]))

In [13]:
train_dl = DataLoader(train_ds, batch_size=2, shuffle=False, collate_fn=collate_fn)

In [14]:
seq, seqlens, y = next(iter(train_dl))

In [15]:
seq

tensor([[  332,  2681,   303, 37311,     0,    25,   347,     3,  4987,   332,
          2681,   303,     2,  2681,  4114,  1437, 13787,  2253,  6143,     2,
            26,  4032,   347,  2689,     2,  1073,  3319,   602, 60148,  1718,
           447,     3,   255,   347,  1224,   602,     2],
        [  842,   485,  1237,  2102,  1463,   377,   286,  3089,   304,     2,
            33,    61,     5,   251,     5,   193,     2,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0]])

In [16]:
emb = nn.Embedding(vocab_size, 10, padding_idx=0)

In [17]:
gru = nn.GRU(10, 5, batch_first=True)

In [18]:
seq_emb = emb(seq)
seq_emb.shape

torch.Size([2, 37, 10])

In [19]:
seq_pack = pack_padded_sequence(seq_emb, seqlens, batch_first=True)

In [20]:
out_pack, _ = gru(seq_pack)

In [21]:
out_pad, lens = pad_packed_sequence(out_pack, batch_first=True)

In [22]:
out_pad.shape

torch.Size([2, 37, 5])

In [23]:
## attention module
mlp = nn.Sequential(nn.Linear(5, 5), nn.Tanh())
context_vector = nn.Parameter(torch.Tensor(5))

In [24]:
u = mlp(out_pad)
u.shape

torch.Size([2, 37, 5])

In [25]:
attn_weight = F.softmax(u.matmul(context_vector), dim=1)
attn_weight.shape

torch.Size([2, 37])

In [26]:
attn_weight

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
         0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0.]], grad_fn=<SoftmaxBackward>)

In [27]:
attn_weight.sum(1)

tensor([1., 1.], grad_fn=<SumBackward1>)

In [28]:
a = attn_weight.unsqueeze(1)
a.shape

torch.Size([2, 1, 37])

In [29]:
u.shape

torch.Size([2, 37, 5])

In [30]:
out = torch.bmm(a, u)
out.shape

torch.Size([2, 1, 5])

In [31]:
out = out[:,0,:]
out.shape

torch.Size([2, 5])

## Model description

Let $w_t$ be the index of word $t$ in a review.

**Step 1:**
Apply embedding to $w_t$,
$x_t$ = Embddding($w_t$)

**Step 2:** Take the sequence $x = (x_1, \dots, x_T)$ through a GRU layer. $h_1, \dots h_T = GRU(x_1, \dots, x_T)$.
$h_t$ is a representation of the information around $w_t$.

**Step 3: Attention** Not all words contribute equally to the representation of the sentece meaning. We compute attention on each world. 

$u_t = tanh(Wh_t + b)$

$\alpha_t = \frac{exp(u_t u_w)}{\sum_j exp(u_j u_w) }$

$s = \sum_t \alpha_t h_t$

**Step 4:** Linear layer on $s$


## Model

In [32]:
class AttentionModule(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(AttentionModule, self).__init__()
        self.mlp = nn.Sequential(
            nn.Linear(input_dim, output_dim),
            nn.Tanh())
        self.context_vector = nn.Parameter(torch.Tensor(output_dim))

    def forward(self, x):
        u = self.mlp(x)
        alpha = F.softmax(u.matmul(self.context_vector), dim=1)
        weighted_out = torch.bmm(alpha.unsqueeze(1), u)
        return  weighted_out[:,0,:]


class RNNattention(nn.Module):

    def __init__(self, vocab_size, emb_dim=100, hidden_dim=50, output_dim=3):
        super(RNNattention, self).__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.gru = nn.GRU(emb_dim, hidden_dim, batch_first=True)
        self.attention = AttentionModule(hidden_dim, hidden_dim)
        self.classifier = nn.Linear(hidden_dim, output_dim)

    def forward(self, seqs, seqlens):
        
        embs = self.emb(seqs)
        seq_pack = pack_padded_sequence(embs, seqlens, batch_first=True)
        out_pack, _ = self.gru(seq_pack)
        out_pad, _ = pad_packed_sequence(out_pack, batch_first=True)
        att_out = self.attention(out_pad)  
        outputs = self.classifier(att_out)    
        return outputs

In [33]:
model = RNNattention(vocab_size)

In [34]:
seqs, seqlens, y = next(iter(train_dl))

In [35]:
out = model(seqs, seqlens)

In [36]:
F.cross_entropy(out, y)

tensor(1.0935, grad_fn=<NllLossBackward>)

## Training 

In [40]:
def train_epocs(model, train_dl, valid_dl, optimizer, epochs=10):
    iterations = epochs*len(train_dl)
    pbar = tqdm(total=iterations)
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, s, y in train_dl:
            x = x.cuda()
            y = y.cuda()
            y_pred = model(x, s)
            optimizer.zero_grad()
            loss = F.cross_entropy(y_pred, y)
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            #loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
            pbar.update()
        val_loss, val_acc = val_metrics(model, valid_dl)
        print("train loss %.3f val loss %.3f and val accuracy %.3f" % (sum_loss/total, val_loss, val_acc))

In [41]:
def val_metrics(model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    for x, s, y in valid_dl:
        x = x.cuda()
        y = y.cuda()
        y_hat = model(x, s)
        loss = F.cross_entropy(y_hat, y)
        preds = torch.max(y_hat, dim=1)[1]
        correct += (preds==targs).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
    return sum_loss/total, correct/total

In [42]:
train_dl = DataLoader(train_ds, batch_size=10000, shuffle=True, collate_fn=collate_fn)
valid_dl = DataLoader(valid_ds, batch_size=10000, shuffle=False, collate_fn=collate_fn)

In [43]:
model = RNNattention(vocab_size).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [46]:
model, optimizer = amp.initialize(model, optimizer, opt_level="O2", 
                                  keep_batchnorm_fp32=True, loss_scale="dynamic")

Selected optimization level O2:  FP16 training with FP32 batchnorm and FP32 master weights.

Defaults for this optimization level are:
enabled                : True
opt_level              : O2
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : True
master_weights         : True
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O2
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : True
master_weights         : True
loss_scale             : dynamic


In [None]:
train_epocs(model, train_dl, valid_dl, optimizer, epochs=10)

HBox(children=(FloatProgress(value=0.0, max=4800.0), HTML(value='')))

## References
* Hierarchical Attention Networks for Document Classification. Zichao Yang, Diyi Yang, Chris Dyer, Xiaodong He, Alex Smola, Eduard Hovy
* https://pytorchnlp.readthedocs.io/en/latest/_modules/torchnlp/nn/attention.html
