<a href="https://colab.research.google.com/github/respect5716/Deep-Learning-Paper-Implementation/blob/master/03_NLP/Attention%20is%20All%20You%20Need.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Attention is All You Need

## 0. Paper

### Info
* TItle : Attention is All You Need
* Author : Ashish Vaswani et al.
* Publication : [link](https://arxiv.org/abs/1706.03762)

### Summary
* Self Attention을 통해 RNN과 CNN의 한계 극복

### Differences
* Task : Translation -> Summarization
    * dataset : BBC News Summary, [link](https://www.kaggle.com/pariza/bbc-news-summary?)
* Layers : 12 -> 4


## 1. Setting

In [None]:
# Google Drive
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install -q transformers

In [3]:
# Libraries
import os
import random
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow_addons as tfa
from transformers import BertTokenizer

In [4]:
# GPU Setting
!nvidia-smi

print(f'tensorflow version : {tf.__version__}')
print(f'available GPU list : {tf.config.list_physical_devices("GPU")}')

Sun Aug 16 14:13:40 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.57       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P0    29W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [5]:
# Hyperparameters
CONFIG = {
    'base_dir' : '/content/drive/Shared drives/Yoon/Project/Doing/Deep Learning Paper Implementation',
    'seq_len' : 300,
    'num_layer' : 4,
    'num_head' : 4,
    'ffn_dim' : 128,
    'model_dim' : 128,
    'drop_rate' : 0.2,
    'batch_size' : 32,
    'epoch_size' : 100
}

## 2. Data

In [None]:
data_path = os.path.join(CONFIG['base_dir'], 'data/bbc_news_summary.zip')
!unzip "{data_path}" -d /content/data

In [7]:
article_dir = 'data/News Articles'
categories = [i for i in os.listdir(article_dir)]
files = [[os.path.join(i,j) for j in os.listdir(os.path.join(article_dir, i))] for i in categories]
files = list(itertools.chain(*files))
random.shuffle(files)

In [8]:
split_ratio = 0.2
split_idx = int(len(files) * split_ratio)
train_files = files[split_idx:]
test_files = files[:split_idx]

In [9]:
class Dataloader(tf.keras.utils.Sequence):
    def __init__(self, files: list, tokenizer: BertTokenizer, mode: str):
        self.files = pd.Series(files)
        self.tokenizer = tokenizer
        self.mode = mode
        self.on_epoch_end()
    
    def __len__(self):
        return np.ceil(len(self.files) / CONFIG['batch_size']).astype(np.int32)
    
    def on_epoch_end(self):
        self.idx = 0
        if self.mode == 'train':
            self.indices = np.random.permutation(len(self.files))
        else:
            self.indices = np.arange(len(self.files))

    @staticmethod
    def load_text(file_name, dir_name):
        with open(os.path.join('data', dir_name, file_name), 'r') as f:
            try:
                text = f.read()
            except:
                text = ''
        return text
    
    def encode(self, text, method):
        if method == 'encoder':
            encoded = self.tokenizer.encode(text, add_special_tokens=False, max_length=CONFIG['seq_len'], truncation=True)
        if method == 'decoder_inputs':
            encoded = self.tokenizer.encode(text, add_special_tokens=False, max_length=CONFIG['seq_len']-1, truncation=True)
            encoded = [self.tokenizer.bos_token_id] + encoded + [self.tokenizer.eos_token_id]
        if method == 'decoder_outputs':
            encoded = self.tokenizer.encode(text, add_special_tokens=False, max_length=CONFIG['seq_len'], truncation=True)
            encoded = encoded + [self.tokenizer.eos_token_id]
        
        encoded = encoded[:CONFIG['seq_len']]
        encoded += [self.tokenizer.pad_token_id for _ in range(CONFIG['seq_len'] - len(encoded))]
        return encoded

    @classmethod
    def decode(cls, encoded):
        return tokenizer.decode(encoded)
    
    def __getitem__(self, idx):
        batch_idx = self.indices[CONFIG['batch_size']*idx : CONFIG['batch_size']*(idx+1)]
        batch_files =  self.files[batch_idx]
        encoder_inputs = [self.load_text(i, 'News Articles') for i in batch_files]
        decoder_inputs = [self.load_text(i, 'Summaries') for i in batch_files]

        invalid_idx = [i for i,j in enumerate(encoder_inputs) if len(j) == 0] + [i for i,j in enumerate(decoder_inputs) if len(j) == 0]
        encoder_inputs = [j for i,j in enumerate(encoder_inputs) if i not in invalid_idx]
        decoder_inputs = [j for i,j in enumerate(decoder_inputs) if i not in invalid_idx]

        encoder_inputs = np.array([self.encode(i, 'encoder') for i in encoder_inputs])
        decoder_outputs = np.array([self.encode(i, 'decoder_outputs') for i in decoder_inputs])
        decoder_inputs = np.array([self.encode(i, 'decoder_inputs') for i in decoder_inputs])
        return (encoder_inputs, decoder_inputs), decoder_outputs

In [None]:
tokenizer= BertTokenizer.from_pretrained('bert-base-uncased')
_ = tokenizer.add_special_tokens({'bos_token':"[BOS]", 'eos_token':'[EOS]'})

In [11]:
train_loader = Dataloader(train_files, tokenizer, 'train')

In [12]:
x, y = train_loader.__getitem__(0)

In [13]:
x[0].shape, x[1].shape, y.shape

((32, 300), (32, 300), (32, 300))

## 3. Model

In [14]:
class MaskLayer(tf.keras.layers.Layer):
    def __init__(self):
        super(MaskLayer, self).__init__()

    def call(self, x):
        padding_mask = tf.cast(tf.math.equal(x, 0), tf.float32)[:, None, None, :]
        look_ahead_mask = 1 - tf.linalg.band_part(tf.ones((x.shape[-1], x.shape[-1])), -1, 0)
        return padding_mask, look_ahead_mask

class EmbeddingLayer(tf.keras.layers.Layer):
    def __init__(self, vocab_size, seq_len, model_dim):
        super(EmbeddingLayer, self).__init__()
        self.token_embedding = tf.keras.layers.Embedding(vocab_size, model_dim)
        self.position_embedding = tf.keras.layers.Embedding(seq_len, model_dim)
        self.pos = tf.range(0, seq_len)

    def call(self, x):
        x = self.token_embedding(x)
        pos = self.position_embedding(self.pos)
        x += pos
        return x


class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, model_dim, num_head):
        super(MultiHeadAttention, self).__init__()
        self.model_dim = model_dim
        self.num_head = num_head
        self.projection_dim = self.model_dim // self.num_head
        assert self.model_dim % self.num_head == 0

        self.qw = tf.keras.layers.Dense(self.model_dim)
        self.kw = tf.keras.layers.Dense(self.model_dim)
        self.vw = tf.keras.layers.Dense(self.model_dim)
        self.w = tf.keras.layers.Dense(self.model_dim)
    
    def attention(self, q, k ,v, mask):
        dim = tf.cast(tf.shape(q)[-1], tf.float32)
        score = tf.matmul(q, k, transpose_b=True)
        scaled_score = score / tf.math.sqrt(dim)

        if mask is not None:
            scaled_score += (mask * -1e9)

        attention_weights = tf.nn.softmax(scaled_score)
        attention_outputs = tf.matmul(attention_weights, v)
        return attention_outputs, attention_weights
    
    def split_heads(self, x):
        batch_size = tf.shape(x)[0]
        x = tf.reshape(x, (batch_size, -1, self.num_head, self.projection_dim))
        x = tf.transpose(x, perm=[0, 2, 1, 3])
        return x
    
    def combine_heads(self, x):
        batch_size = tf.shape(x)[0]
        x = tf.transpose(x, perm=[0, 2, 1, 3])
        x = tf.reshape(x, (batch_size, -1, self.model_dim))
        return x
    
    def call(self, q, k, v, mask):
        q, k, v = self.qw(q), self.kw(k), self.vw(v)
        q, k, v = self.split_heads(q), self.split_heads(k), self.split_heads(v)
        outputs, weights = self.attention(q, k, v, mask)
        outputs = self.combine_heads(outputs)
        outputs = self.w(outputs)
        return outputs

class FeedForwardNetwork(tf.keras.layers.Layer):
    def __init__(self, model_dim, ffn_dim):
        super(FeedForwardNetwork, self).__init__()
        self.dense1 = tf.keras.layers.Dense(ffn_dim)
        self.dense2 = tf.keras.layers.Dense(model_dim)

    @staticmethod
    def gelu(x):
        cdf = 0.5 * (1.0 + tf.math.erf(x / tf.sqrt(2.0)))
        return x * cdf

    def call(self, x):
        x = self.dense1(x)
        x = self.gelu(x)
        x = self.dense2(x)
        return x

class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, num_head, model_dim, ffn_dim, drop_rate):
        super(EncoderLayer, self).__init__()
        self.mha = MultiHeadAttention(model_dim, num_head)
        self.ffn = FeedForwardNetwork(model_dim, ffn_dim)
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(drop_rate)
        self.dropout2 = tf.keras.layers.Dropout(drop_rate)

    def call(self, enc, training, padding_mask=None):
        out1 = self.mha(enc, enc, enc, padding_mask)
        out1 = self.dropout1(out1, training=training)
        out1 = self.layernorm1(enc + out1)
        out2 = self.ffn(out1)
        out2 = self.dropout2(out2, training=training)
        out2 = self.layernorm2(out1 + out2)
        return out2

class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, num_head, model_dim, ffn_dim, drop_rate):
        super(DecoderLayer, self).__init__()
        self.mha1 = MultiHeadAttention(model_dim, num_head)
        self.mha2 = MultiHeadAttention(model_dim, num_head)
        self.ffn = FeedForwardNetwork(model_dim, ffn_dim)
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(drop_rate)
        self.dropout2 = tf.keras.layers.Dropout(drop_rate)
        self.dropout3 = tf.keras.layers.Dropout(drop_rate)

    def call(self, enc, dec, training, look_ahead_mask=None, padding_mask=None):
        out1 = self.mha1(dec, dec, dec, look_ahead_mask)
        out1 = self.dropout1(out1, training=training)
        out1 = self.layernorm1(dec + out1)
        out2 = self.mha2(out1, enc, enc, padding_mask)
        out2 = self.dropout2(out2, training=training)
        out2 = self.layernorm2(out1 + out2)
        out3 = self.ffn(out2)
        out3 = self.dropout3(out3, training=training)
        out3 = self.layernorm2(out2 + out3)
        return out3

class OutputLayer(tf.keras.layers.Layer):
    def __init__(self, vocab_size):
        super(OutputLayer, self).__init__()
        self.dense = tf.keras.layers.Dense(vocab_size, activation='softmax')
    
    def call(self, x):
        outputs = self.dense(x)
        return outputs

class Network(tf.keras.Model):
    def __init__(self, vocab_size):
        super(Network, self).__init__()
        self.mask_layer = MaskLayer()
        self.embedding = EmbeddingLayer(vocab_size, CONFIG['seq_len'], CONFIG['model_dim'])
        self.encoders = [EncoderLayer(CONFIG['num_head'], CONFIG['model_dim'], CONFIG['ffn_dim'], CONFIG['drop_rate']) for _ in range(CONFIG['num_layer'])]
        self.decoders = [DecoderLayer(CONFIG['num_head'], CONFIG['model_dim'], CONFIG['ffn_dim'], CONFIG['drop_rate']) for _ in range(CONFIG['num_layer'])]
        self.output_layer = OutputLayer(vocab_size)
    
    def call(self, x, training):
        encoder_inputs, decoder_inputs = x
        padding_mask, _ = self.mask_layer(encoder_inputs)
        _, look_ahead_mask = self.mask_layer(decoder_inputs)

        enc = self.embedding(encoder_inputs)
        dec = self.embedding(decoder_inputs)

        for i in range(CONFIG['num_layer']):
            enc = self.encoders[i](enc, padding_mask=padding_mask, training=training)
            dec = self.decoders[i](enc, dec, padding_mask=padding_mask, look_ahead_mask=look_ahead_mask, training=training)
        
        outputs = self.output_layer(dec)
        return outputs

In [15]:
network = Network(tokenizer.vocab_size+2)
network.optimizer = tf.keras.optimizers.Adam()

pred = network(x)
pred.shape

TensorShape([32, 300, 30524])

## 4. Train

In [16]:
@tf.function
def train_step(network, x, y):
    with tf.GradientTape() as g:
        pred = network(x, training=True)
        loss = tf.keras.losses.sparse_categorical_crossentropy(y, pred)

    gradients = g.gradient(loss, network.trainable_variables)
    network.optimizer.apply_gradients(zip(gradients, network.trainable_variables))  
    return tf.reduce_mean(loss) 

In [17]:
for ep in range(CONFIG['epoch_size']):
    for x, y in train_loader:
        loss = train_step(network, x, y)
    print(f'EP : {str(ep).zfill(3)} | Loss : {loss.numpy():.3f}')

print(f'EP : {str(ep+1).zfill(3)} | Loss : {loss.numpy():.3f}')

EP : 000 | Loss : 4.811
EP : 001 | Loss : 4.344
EP : 002 | Loss : 4.309
EP : 003 | Loss : 4.299
EP : 004 | Loss : 4.287
EP : 005 | Loss : 4.276
EP : 006 | Loss : 4.271
EP : 007 | Loss : 4.187
EP : 008 | Loss : 4.116
EP : 009 | Loss : 4.067
EP : 010 | Loss : 3.999
EP : 011 | Loss : 3.947
EP : 012 | Loss : 3.917
EP : 013 | Loss : 3.881
EP : 014 | Loss : 3.828
EP : 015 | Loss : 3.788
EP : 016 | Loss : 3.758
EP : 017 | Loss : 3.743
EP : 018 | Loss : 3.664
EP : 019 | Loss : 3.580
EP : 020 | Loss : 3.504
EP : 021 | Loss : 3.455
EP : 022 | Loss : 3.374
EP : 023 | Loss : 3.315
EP : 024 | Loss : 3.280
EP : 025 | Loss : 3.229
EP : 026 | Loss : 3.130
EP : 027 | Loss : 3.036
EP : 028 | Loss : 2.963
EP : 029 | Loss : 2.870
EP : 030 | Loss : 2.809
EP : 031 | Loss : 2.716
EP : 032 | Loss : 2.632
EP : 033 | Loss : 2.536
EP : 034 | Loss : 2.449
EP : 035 | Loss : 2.336
EP : 036 | Loss : 2.234
EP : 037 | Loss : 2.141
EP : 038 | Loss : 2.103
EP : 039 | Loss : 2.008
EP : 040 | Loss : 1.929
EP : 041 | Loss 

## 5. Test

In [18]:
def predict_greedy(network, encoder_inputs):
    encoder_inputs = np.array(encoder_inputs)[None, :]
    decoder_inputs = np.ones((1, CONFIG['seq_len'])) * tokenizer.pad_token_id
    decoder_inputs[0][0] = tokenizer.bos_token_id

    for i in range(1, CONFIG['seq_len']):
        pred = network((encoder_inputs, decoder_inputs), training=False)
        pred = tf.random.categorical(tf.math.log(pred[0][i:i+1] + 1e-5), 1)[0][0]
        decoder_inputs[0][i] = pred
        if pred == tokenizer.eos_token_id:
            break
    
    return decoder_inputs[0]

In [None]:
test_loader = Dataloader(test_files, tokenizer, 'test')
x, y = test_loader.__getitem__(0)
source = x[0][0]
summarized = predict_greedy(network, source)

In [None]:
print('Source')
print(tokenizer.decode(source))
print('\n')
print('Summarized')
print(tokenizer.decode(summarized))