<b>Transformer Network for Text Summarization</b>
</br>

In [None]:
# global installs
!pip install datasets rouge transformers torch sentencepiece tqdm

In [None]:
!pip install numpy==1.23.4

In [1]:
# global imports
from tqdm import tqdm
import torch
import math

In [2]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print('Will run on device: %s' % (device))

Will run on device: cuda:0


<b> Rouge evaluation funciton </b>

In [3]:
from rouge import Rouge
import numpy as np

rouge_ = Rouge()

def print_rouge_score(generated, reference):
    l_rouge = []
    
    for row in tqdm(rouge_.get_scores(generated, reference)):
        l_rouge.append([[row['rouge-1']['r'], row['rouge-1']['p'], row['rouge-1']['f']], 
                        [row['rouge-2']['r'], row['rouge-2']['p'], row['rouge-2']['f']],
                        [row['rouge-l']['r'], row['rouge-l']['p'], row['rouge-l']['f']]])

    print('\n\n')
    print('rouge-1: r:%2.2f, p:%2.2f f:%2.2f' % (np.mean(l_rouge[0][0]), np.mean(l_rouge[0][1]), np.mean(l_rouge[0][2])))
    print('rouge-2: r:%2.2f, p:%2.2f f:%2.2f' % (np.mean(l_rouge[1][0]), np.mean(l_rouge[1][1]), np.mean(l_rouge[1][2])))
    print('rouge-l: r:%2.2f, p:%2.2f f:%2.2f' % (np.mean(l_rouge[2][0]), np.mean(l_rouge[2][1]), np.mean(l_rouge[2][2])))

<b> Transformer Architecture Components and Implementation </b>

In [4]:
from torch import nn

In [5]:
class PostionalEncoding(nn.Module):
    def __init__(self, d_model, max_len, device):
        super(PostionalEncoding, self).__init__()
        self.encoding = torch.zeros(max_len, d_model, device=device)
        self.encoding.requires_grad = False  # we don't need to compute gradient
        pos = torch.arange(0, max_len, device=device)
        pos = pos.float().unsqueeze(dim=1)
        _2i = torch.arange(0, d_model, step=2, device=device).float()
        self.encoding[:, 0::2] = torch.sin(pos / (10000 ** (_2i / d_model)))
        self.encoding[:, 1::2] = torch.cos(pos / (10000 ** (_2i / d_model)))
        
    def forward(self, x):
        batch_size, seq_len = x.size()
        return self.encoding[:seq_len, :]
        

In [6]:
class ScaleDotProductAttention(nn.Module):
    def __init__(self):
        super(ScaleDotProductAttention, self).__init__()
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, q, k, v, mask=None, e=1e-12):
        batch_size, head, length, d_tensor = k.size()
        k_t = k.transpose(2, 3)  # transpose
        score = (q @ k_t) / math.sqrt(d_tensor)  # scaled dot product
        if mask is not None:
            score = score.masked_fill(mask == 0, -10000)
        score = self.softmax(score)
        v = score @ v

        return v, score

In [7]:
class PositionwiseFeedForward(nn.Module):

    def __init__(self, d_model, hidden, drop_prob=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, hidden)
        self.linear2 = nn.Linear(hidden, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.linear2(x)
        return x

In [8]:
class TokenEmbedding(nn.Embedding):

    def __init__(self, vocab_size, d_model):
        super(TokenEmbedding, self).__init__(vocab_size, d_model, padding_idx=1)

In [9]:
class TransformerEmbedding(nn.Module):

    def __init__(self, vocab_size, d_model, max_len, drop_prob, device):
        super(TransformerEmbedding, self).__init__()
        self.tok_emb = TokenEmbedding(vocab_size, d_model)
        # self.tok_emb = nn.Embedding(vocab_size, d_model, padding_idx=1)
        self.pos_emb = PostionalEncoding(d_model, max_len, device)
        self.drop_out = nn.Dropout(p=drop_prob)

    def forward(self, x):
        tok_emb = self.tok_emb(x)
        pos_emb = self.pos_emb(x)
        return self.drop_out(tok_emb + pos_emb)

In [10]:
class LayerNorm(nn.Module):

    def __init__(self, d_model, eps=1e-12):
        super(LayerNorm, self).__init__()
        self.gamma = nn.Parameter(torch.ones(d_model))
        self.beta = nn.Parameter(torch.zeros(d_model))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        var = x.var(-1, unbiased=False, keepdim=True)
        out = (x - mean) / torch.sqrt(var + self.eps)
        out = self.gamma * out + self.beta
        return out

In [11]:
class MultiHeadAttention(nn.Module):

    def __init__(self, d_model, n_head):
        super(MultiHeadAttention, self).__init__()
        self.n_head = n_head
        self.attention = ScaleDotProductAttention()
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_concat = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        q, k, v = self.w_q(q), self.w_k(k), self.w_v(v)
        q, k, v = self.split(q), self.split(k), self.split(v)
        out, attention = self.attention(q, k, v, mask=mask)
        out = self.concat(out)
        out = self.w_concat(out)
        # TODO : we should implement visualization
        return out

    def split(self, tensor):
        batch_size, length, d_model = tensor.size()
        d_tensor = d_model // self.n_head
        tensor = tensor.view(batch_size, length, self.n_head, d_tensor).transpose(1, 2)
        return tensor

    def concat(self, tensor):
        batch_size, head, length, d_tensor = tensor.size()
        d_model = head * d_tensor
        tensor = tensor.transpose(1, 2).contiguous().view(batch_size, length, d_model)
        return tensor

In [12]:
class EncoderLayer(nn.Module):

    def __init__(self, d_model, ffn_hidden, n_head, drop_prob):
        super(EncoderLayer, self).__init__()
        self.attention = MultiHeadAttention(d_model=d_model, n_head=n_head)
        self.norm1 = LayerNorm(d_model=d_model)
        self.dropout1 = nn.Dropout(p=drop_prob)
        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm2 = LayerNorm(d_model=d_model)
        self.dropout2 = nn.Dropout(p=drop_prob)

    def forward(self, x, s_mask):
        _x = x
        x = self.attention(q=x, k=x, v=x, mask=s_mask)
        x = self.dropout1(x)
        x = self.norm1(x + _x)
        _x = x
        x = self.ffn(x)
        x = self.dropout2(x)
        x = self.norm2(x + _x)
        return x

In [13]:
class DecoderLayer(nn.Module):

    def __init__(self, d_model, ffn_hidden, n_head, drop_prob):
        super(DecoderLayer, self).__init__()
        self.self_attention = MultiHeadAttention(d_model=d_model, n_head=n_head)
        self.norm1 = LayerNorm(d_model=d_model)
        self.dropout1 = nn.Dropout(p=drop_prob)
        self.enc_dec_attention = MultiHeadAttention(d_model=d_model, n_head=n_head)
        self.norm2 = LayerNorm(d_model=d_model)
        self.dropout2 = nn.Dropout(p=drop_prob)
        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm3 = LayerNorm(d_model=d_model)
        self.dropout3 = nn.Dropout(p=drop_prob)

    def forward(self, dec, enc, t_mask, s_mask):
        _x = dec
        x = self.self_attention(q=dec, k=dec, v=dec, mask=t_mask)
        x = self.dropout1(x)
        x = self.norm1(x + _x)

        if enc is not None:
            _x = x
            x = self.enc_dec_attention(q=x, k=enc, v=enc, mask=s_mask)
            x = self.dropout2(x)
            x = self.norm2(x + _x)

        _x = x
        x = self.ffn(x)
        x = self.dropout3(x)
        x = self.norm3(x + _x)
        return x

In [14]:
class Encoder(nn.Module):

    def __init__(self, enc_voc_size, max_len, d_model, ffn_hidden, n_head, n_layers, drop_prob, device):
        super().__init__()
        self.emb = TransformerEmbedding(d_model=d_model,
                                        max_len=max_len,
                                        vocab_size=enc_voc_size,
                                        drop_prob=drop_prob,
                                        device=device)

        self.layers = nn.ModuleList([EncoderLayer(d_model=d_model,
                                                  ffn_hidden=ffn_hidden,
                                                  n_head=n_head,
                                                  drop_prob=drop_prob)
                                     for _ in range(n_layers)])

    def forward(self, x, s_mask):
        x = self.emb(x)

        for layer in self.layers:
            x = layer(x, s_mask)

        return x

In [15]:
class Decoder(nn.Module):
    def __init__(self, dec_voc_size, max_len, d_model, ffn_hidden, n_head, n_layers, drop_prob, device):
        super().__init__()
        self.emb = TransformerEmbedding(d_model=d_model,
                                        drop_prob=drop_prob,
                                        max_len=max_len,
                                        vocab_size=dec_voc_size,
                                        device=device)

        self.layers = nn.ModuleList([DecoderLayer(d_model=d_model,
                                                  ffn_hidden=ffn_hidden,
                                                  n_head=n_head,
                                                  drop_prob=drop_prob)
                                     for _ in range(n_layers)])

        self.linear = nn.Linear(d_model, dec_voc_size)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        trg = self.emb(trg)

        for layer in self.layers:
            trg = layer(trg, enc_src, trg_mask, src_mask)

        # pass to LM head
        output = self.linear(trg)
        return output

In [16]:
class Transformer(nn.Module):

    def __init__(self, src_pad_idx, trg_pad_idx, trg_sos_idx, 
                 enc_voc_size, dec_voc_size, d_model, n_head, max_len, ffn_hidden, n_layers, drop_prob, device):
        super().__init__()
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.trg_sos_idx = trg_sos_idx
        self.device = device
        self.max_len = max_len
        self.encoder = Encoder(d_model=d_model,
                               n_head=n_head,
                               max_len=max_len,
                               ffn_hidden=ffn_hidden,
                               enc_voc_size=enc_voc_size,
                               drop_prob=drop_prob,
                               n_layers=n_layers,
                               device=device)

        self.decoder = Decoder(d_model=d_model,
                               n_head=n_head,
                               max_len=max_len,
                               ffn_hidden=ffn_hidden,
                               dec_voc_size=dec_voc_size,
                               drop_prob=drop_prob,
                               n_layers=n_layers,
                               device=device)

    def forward(self, src, trg):
        src_mask = self.make_pad_mask(src, src)
        src_trg_mask = self.make_pad_mask(trg, src)
        trg_mask = self.make_pad_mask(trg, trg) * \
                   self.make_no_peak_mask(trg, trg)
        enc_src = self.encoder(src, src_mask)
        output = self.decoder(trg, enc_src, trg_mask, src_trg_mask)
        return output

    def make_pad_mask(self, q, k):
        len_q, len_k = q.size(1), k.size(1)
        k = k.ne(self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        k = k.repeat(1, 1, len_q, 1)
        q = q.ne(self.src_pad_idx).unsqueeze(1).unsqueeze(3)
        q = q.repeat(1, 1, 1, len_k)
        mask = k & q
        return mask

    def make_no_peak_mask(self, q, k):
        len_q, len_k = q.size(1), k.size(1)
        mask = torch.tril(torch.ones(len_q, len_k)).type(torch.BoolTensor).to(self.device)
        return mask
    
    def encode(self, src, src_mask):
        return self.encoder(src, src_mask)
    
    def decode(self, memory, src_mask, tgt, tgt_mask):
        return self.decoder(tgt, memory, src_mask, tgt_mask)

<b> Model lifecycle functions definition </b>

In [17]:
def train(tokenizer, model, iterator, optimizer, criterion, clip, max_len):
    model.train()
    epoch_loss = 0
    for i, batch in tqdm(enumerate(iterator), total=len(iterator)):
        sample_list = list(zip(batch['article'], batch['highlights']))
        for article, highlight in sample_list:
            src = tokenizer.encode(article, max_length=max_len, return_tensors="pt", truncation=True).to(device)
            trg = tokenizer.encode(highlight, max_length=max_len, return_tensors="pt", truncation=True).to(device)
        
            optimizer.zero_grad()
            output = model(src, trg[:, :-1])
            output_reshape = output.contiguous().view(-1, output.shape[-1])
            trg = trg[:, 1:].contiguous().view(-1)

            loss = criterion(output_reshape, trg)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()
            epoch_loss += loss.item()

    return epoch_loss / len(iterator)


In [18]:
def evaluate(tokenizer, model, iterator, criterion, max_len):
    model.eval()
    epoch_loss = 0
    batch_rouge_score = []
    with torch.no_grad():
        for i, batch in tqdm(enumerate(iterator), total=len(iterator)):
            sample_list = list(zip(batch['article'], batch['highlights']))
            total_rouge_score = []
            for article, highlight in sample_list:
                src = tokenizer.encode(article, max_length=max_len, return_tensors="pt", truncation=True).to(device)
                trg = tokenizer.encode(highlight, max_length=max_len, return_tensors="pt", truncation=True).to(device)

                output = model(src, trg[:, :-1])
                
                output_reshape = output.contiguous().view(-1, output.shape[-1])
                trg = trg[:, 1:].contiguous().view(-1)
                loss = criterion(output_reshape, trg)
                epoch_loss += loss.item()
                
                output_words = output.max(dim=2).indices[0]
                output_words = tokenizer.decode(output_words)
                rouge_score = rouge_.get_scores(output_words, highlight)[0]
                total_rouge_score.append([rouge_score['rouge-1']['f'], 
                                          rouge_score['rouge-2']['f'], 
                                          rouge_score['rouge-l']['f']])
                
            r1, r2, rl = 0, 0, 0
            for a in total_rouge_score:
                r1 += a[0]
                r2 += a[1]
                rl += a[2]
            total = len(total_rouge_score)
            if total == 0:
                total_rouge_score.append([0,0,0])
            else:
                batch_rouge_score.append([r1 / total, 
                                          r2 / total, 
                                          rl / total])
    r1, r2, rl = 0, 0, 0  
    for a in batch_rouge_score:
        r1 += a[0]
        r2 += a[1]
        rl += a[2]
    total = len(batch_rouge_score)
    if total == 0:
        return epoch_loss / len(iterator), [0, 0, 0]
    else:
        return epoch_loss / len(iterator), [r1 / total,  r2 / total, rl / total]

In [19]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def load_record(path):
    f = open(path, 'r')
    losses = f.read()
    losses = re.sub('\\]', '', losses)
    losses = re.sub('\\[', '', losses)
    losses = re.sub('\\,', '', losses)
    losses = losses.split(' ')
    losses = [float(i) for i in losses]
    return losses, len(losses)

def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.kaiming_uniform(m.weight.data)



<b> Dataset CNN/DailyMail </b>

https://github.com/abisee/cnn-dailymail

<code>
DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: <b>287113</b>
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: <b>13368</b>
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: <b>11490</b>
    })
})
</code>


In [20]:
from datasets import load_dataset

dataset_ = load_dataset('cnn_dailymail', '3.0.0')
dataset_2 = load_dataset('xsum')

Reusing dataset cnn_dailymail (/home/azureuser/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)


  0%|          | 0/3 [00:00<?, ?it/s]

Using custom data configuration default
Reusing dataset xsum (/home/azureuser/.cache/huggingface/datasets/xsum/default/1.2.0/32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934)


  0%|          | 0/3 [00:00<?, ?it/s]

<b> Model run </b>

In [21]:
# parameters
batch_size = 128
max_len = 256
max_sum_len = 50
d_model = 512
n_layers = 6
n_heads = 8
ffn_hidden = 2048
drop_prob = 0.1
init_lr = 1e-5
factor = 0.9
adam_eps = 5e-9
patience = 10
warmup = 100
epoch = 1000
clip = 1.0
weight_decay = 5e-4

In [22]:
# training and evaluation dataset
from torch.utils.data import DataLoader, RandomSampler

train_sampler = RandomSampler(dataset_['train'])
train_dataloader = DataLoader(dataset_['train'], sampler=train_sampler, batch_size=batch_size)

validate_sampler = RandomSampler(dataset_['validation'])
validate_dataloader = DataLoader(dataset_['validation'], sampler=validate_sampler, batch_size=batch_size)

In [23]:
from transformers import BartTokenizer
tokenizer_train = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

In [24]:
trg_pad_idx = src_pad_idx = tokenizer_train.pad_token_id
trg_sos_idx = tokenizer_train.bos_token_id
trg_eos_idx = tokenizer_train.eos_token_id

dec_voc_size = enc_voc_size = len(tokenizer_train)

print('src_pad_idx %s' % src_pad_idx)
print('trg_pad_idx %s' % trg_pad_idx)
print('trg_sos_idx %s' % trg_sos_idx)
print('trg_eos_idx %s' % trg_eos_idx)
print('enc_voc_size %s' % enc_voc_size)

src_pad_idx 1
trg_pad_idx 1
trg_sos_idx 0
trg_eos_idx 2
enc_voc_size 50265


In [25]:
from torch import optim
from torch.optim import Adam

model = Transformer(src_pad_idx=src_pad_idx,
                    trg_pad_idx=trg_pad_idx,
                    trg_sos_idx=trg_sos_idx,
                    d_model=d_model,
                    enc_voc_size=enc_voc_size,
                    dec_voc_size=dec_voc_size,
                    max_len=max_len,
                    ffn_hidden=ffn_hidden,
                    n_head=n_heads,
                    n_layers=n_layers,
                    drop_prob=drop_prob,
                    device=device).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=src_pad_idx)
optimizer = Adam(params=model.parameters(),
                 lr=init_lr,
                 weight_decay=weight_decay,
                 eps=adam_eps)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                                 verbose=True,
                                                 factor=factor,
                                                 patience=patience)


<b> Load From Scratch </b>

In [26]:
print(f'The model has {count_parameters(model):,} trainable parameters')
model.apply(initialize_weights)

The model has 121,395,801 trainable parameters


  nn.init.kaiming_uniform(m.weight.data)


Transformer(
  (encoder): Encoder(
    (emb): TransformerEmbedding(
      (tok_emb): TokenEmbedding(50265, 512, padding_idx=1)
      (pos_emb): PostionalEncoding()
      (drop_out): Dropout(p=0.1, inplace=False)
    )
    (layers): ModuleList(
      (0): EncoderLayer(
        (attention): MultiHeadAttention(
          (attention): ScaleDotProductAttention(
            (softmax): Softmax(dim=-1)
          )
          (w_q): Linear(in_features=512, out_features=512, bias=True)
          (w_k): Linear(in_features=512, out_features=512, bias=True)
          (w_v): Linear(in_features=512, out_features=512, bias=True)
          (w_concat): Linear(in_features=512, out_features=512, bias=True)
        )
        (norm1): LayerNorm()
        (dropout1): Dropout(p=0.1, inplace=False)
        (ffn): PositionwiseFeedForward(
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
          (relu): ReLU()
 

<b> Load from checkpoint</b>

In [27]:
train_losses, train_count = load_record('./results/train_loss.txt')
test_losses, _ = load_record('./results/test_loss.txt')
rouges, _ = load_record('./results/rouges.txt')
epoch -= train_count

print(f'The model has {count_parameters(model):,} trainable parameters')
print(f'Test losses {test_losses[len(test_losses) - 1]} train losses {train_losses[len(train_losses) - 1]}')
print(f'Resume from epoch {epoch}')
#model.load_state_dict(torch.load(f"./saved/model-{test_losses[len(test_losses) - 1]}.pt"))
model.load_state_dict(torch.load(f"Users/vladcioaba/transformers_exp/vanilla/saved/model-887.5220968201047.pt"))


The model has 121,395,801 trainable parameters
Test losses 921.1263728278024 train losses 924.6748081849433
Resume from epoch 999


FileNotFoundError: [Errno 2] No such file or directory: 'Users/vladcioaba/transformers_exp/vanilla/saved/model-887.5220968201047.pt'

<b> Train </b>

In [None]:
import math
import time

total_epoch = epoch
best_loss = float('inf')

train_iter = iter(train_dataloader)
valid_iter = iter(validate_dataloader)

train_losses, test_losses, rouges = [], [], []
for step in range(total_epoch):
    start_time = time.time()
    train_loss = train(tokenizer_train, model, train_iter, optimizer, criterion, clip, max_len)
    valid_loss, rouge = evaluate(tokenizer_train, model, valid_iter, criterion, max_len)
    end_time = time.time()

    if step > warmup:
        scheduler.step(valid_loss)

    train_losses.append(train_loss)
    test_losses.append(valid_loss)
    rouges.append(rouge)
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_loss:
        best_loss = valid_loss
        torch.save(model.state_dict(), 'saved/model-{0}.pt'.format(valid_loss))

    f = open('results/train_loss.txt', 'w')
    f.write(str(train_losses))
    f.close()

    f = open('results/rouges.txt', 'w')
    f.write(str(rouges))
    f.close()

    f = open('results/test_loss.txt', 'w')
    f.write(str(test_losses))
    f.close()

    print(f'Epoch: {step + 1} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {np.exp(train_loss):7.3f}')
    print(f'\tVal Loss: {valid_loss:.3f} |  Val PPL: {np.exp(valid_loss):7.3f}')
    print(f'\tROUGE Score: {rouge}')

<b> Test eval </b>

In [27]:
valid_iter = iter(validate_dataloader)
valid_loss, rouge = evaluate(tokenizer_train, model, valid_iter, criterion, max_len)
print(valid_loss, rouge)

100%|██████████| 105/105 [06:27<00:00,  3.69s/it]

921.1263728278024 [0.06861149342184318, 0.002915726042846819, 0.06816946234603505]





<b> Create random sample for validation  </b>

In [28]:
from torch.utils.data import DataLoader, RandomSampler

test_sampler = RandomSampler(dataset_['test'])
test_dataloader = DataLoader(dataset_['test'], sampler=test_sampler, batch_size=128)
sample = iter(test_dataloader).next()
sample_list = list(zip(sample['article'], sample['highlights']))

<b> Comparison with pretrained bart-large-cnn model </b>

In [29]:
# load model from higgingface
import torch
from transformers import BartForConditionalGeneration

tokenizer_test = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
model_test = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn').to(device)

In [98]:
# evaluate model and print rouge score

articles = []
highlights = []
generated = []

for article, highlight in tqdm(sample_list):
    text = article.strip().replace("\n","")
    hightlight = highlight.strip().replace("\n","")

    input_ids = tokenizer_test.encode(text, return_tensors="pt", max_length=1024, truncation=True).to(device)
    outputs = model_test.generate(input_ids, num_beams=4, no_repeat_ngram_size=2, early_stopping=True)
    generated.append(tokenizer_test.decode(outputs[0]))
    articles.append(text)
    highlights.append(highlight)

print_rouge_score(generated, highlights)

100%|██████████| 128/128 [02:39<00:00,  1.24s/it]
100%|██████████| 128/128 [00:00<00:00, 200026.42it/s]




rouge-1: r:0.38, p:0.13 f:0.36
rouge-2: r:0.25, p:0.15 f:0.25
rouge-l: r:0.42, p:0.15 f:0.40





In [30]:
def predict(src, model, tokenizer, device, max_sum_len):
    
    model.eval()
        
    src_tensor = tokenizer.encode(src, return_tensors="pt", max_length=256, truncation=True).to(device)
    src_mask = (src_tensor != src_pad_idx).unsqueeze(1).unsqueeze(2)
    
    with torch.no_grad():
        enc_src = model.encoder(src_tensor, src_mask)
        
    trg_indexes = [1]

    for i in range(max_sum_len):
        trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)
        trg_pad_mask = (trg_tensor != trg_pad_idx).unsqueeze(1).unsqueeze(2)
        trg_len = trg_tensor.shape[1]
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device = device)).bool()
        trg_mask = trg_pad_mask & trg_sub_mask
            
        with torch.no_grad():
            output = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)
        
        pred_token = output.argmax(2)[:,-1].item()
        
        trg_indexes.append(pred_token)

        if pred_token == trg_eos_idx :
            break
    
    trg_tokens = tokenizer_test.decode(trg_indexes)
    return trg_tokens[7:]



# CNN / DM

In [237]:
generated2 = []
highlights2 = []
model.eval()
with torch.no_grad():
    for article, highlight in tqdm(sample_list):
        outputs = predict(article, model, tokenizer_train, device, 50)
        generated2.append(outputs)
        highlights2.append(highlight)

print_rouge_score(generated2, highlights2)

100%|██████████| 128/128 [01:11<00:00,  1.78it/s]
100%|██████████| 128/128 [00:00<00:00, 388473.89it/s]




rouge-1: r:0.21, p:0.00 f:0.21
rouge-2: r:0.37, p:0.00 f:0.37
rouge-l: r:0.38, p:0.00 f:0.38





# XSum

In [31]:
test_sampler2 = RandomSampler(dataset_2['test'])
test_dataloader2 = DataLoader(dataset_2['test'], sampler=test_sampler2, batch_size=128)
sample2 = iter(test_dataloader2).next()
sample_list2 = list(zip(sample2['document'], sample2['summary']))

In [39]:
generated3 = []
highlights3 = []
model.eval()
with torch.no_grad():
    for article, highlight in tqdm(sample_list2):
        outputs = predict(article, model, tokenizer_train, device, 50)
        generated3.append(outputs)
        highlights3.append(highlight)

print_rouge_score(generated3, highlights3)

100%|██████████| 128/128 [01:14<00:00,  1.73it/s]
100%|██████████| 128/128 [00:00<00:00, 586103.62it/s]




rouge-1: r:0.20, p:0.00 f:0.20
rouge-2: r:0.45, p:0.00 f:0.45
rouge-l: r:0.00, p:0.00 f:0.00





In [32]:
idx = 0
m = 10000

for idx, a in enumerate(sample_list):
    if len(a) < m:
        m = len(a)
        idx = idx
print(sample_list[idx])

("Looking after someone with dementia can stretch people to their limits, and there are many in this situation.\xa0In England alone, there are more than 670,000 unpaid carers helping someone with dementia. Here, in the final week of our major Good Health series on dementia, we turn our attention to the carers and what can be done to make life easier for them and their loved ones... Scroll down for video . Looking after someone with dementia can stretch people to their limits. Here's how to\xa0make life easier . WHAT TO EXPECT . The early stages of the disease bring changes that may be so subtle that some friends and acquaintances are unaware that there is anything wrong - and this stage can continue for many years. But as the disease progresses and more damage is done to the brain, symptoms become more pronounced: difficulties with communication become more intense and issues such as getting dressed or managing day-to-day affairs become more problematic. This marks the beginning of the

In [31]:
predict("""For example, repetitive movements can mean they are anxious or scared, while 
withdrawing may mean they feel overwhelmed.\nIf they're doing something obviously wrong, 
for example putting dishes in the washing machine or clothes in a food cupboard, don't ask
them why or castigate them - the reasoning side of their brain has been affected and pointing 
out their mistakes will only cause them embarrassment and frustration.""", model, tokenizer_train, device, 50)

'ethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda Bethesda'

In [32]:
print_rouge_score(generated3, highlights3)

NameError: name 'generated3' is not defined