In [1]:
import os
import math
import random
import spacy

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable


from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

import numpy as np
import nltk
from io import open
import unicodedata
import string
import re
import random

!pip3 install pytorch-nlp

# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)


# Mount Google Drive at /content/gdrive; the paths defined below live under it.
from google.colab import drive
drive.mount('/content/gdrive')

import os

# Folder that holds every artefact of this notebook. The trailing slash matters:
# other cells build file names by plain string concatenation with `google_path`.
google_path = '/content/gdrive/My Drive/colab/paraphrasing/'
os.listdir(google_path)  # raises immediately if the folder is not reachable
dataset_path = os.path.join(google_path, 'paraphrases.txt')
picked_dataset_path = os.path.join(google_path, "paraphrases_dataset.pickle")
encoder_path = os.path.join(google_path, 'encoder.model')
decoder_path = os.path.join(google_path, 'decoder.model')

Collecting pytorch-nlp
[?25l  Downloading https://files.pythonhosted.org/packages/4f/51/f0ee1efb75f7cc2e3065c5da1363d6be2eec79691b2821594f3f2329528c/pytorch_nlp-0.5.0-py3-none-any.whl (90kB)
[K     |███▋                            | 10kB 16.1MB/s eta 0:00:01[K     |███████▎                        | 20kB 4.3MB/s eta 0:00:01[K     |███████████                     | 30kB 6.0MB/s eta 0:00:01[K     |██████████████▌                 | 40kB 7.5MB/s eta 0:00:01[K     |██████████████████▏             | 51kB 4.9MB/s eta 0:00:01[K     |█████████████████████▉          | 61kB 5.7MB/s eta 0:00:01[K     |█████████████████████████▌      | 71kB 6.4MB/s eta 0:00:01[K     |█████████████████████████████   | 81kB 7.1MB/s eta 0:00:01[K     |████████████████████████████████| 92kB 4.6MB/s 
Installing collected packages: pytorch-nlp
Successfully installed pytorch-nlp-0.5.0
Device: cuda
Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4

# Data loader

In [0]:
# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicode_to_ascii(s):
    """Remove combining accent marks from ``s``.

    The string is NFD-decomposed and every code point whose Unicode category
    is 'Mn' is dropped. All other characters are kept unchanged, so the result
    is not guaranteed to be pure ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = []
    for ch in decomposed:
        if unicodedata.category(ch) == 'Mn':
            continue
        kept.append(ch)
    return ''.join(kept)

# Lowercase, trim, and remove non-letter characters
def normalize(s):
    """Turn a raw sentence into a list of lower-case tokens.

    Steps: lower-case and trim, transliterate German ß/ä/ö/ü, strip remaining
    accents via ``unicode_to_ascii``, put a space in front of ``.``, ``!`` and
    ``?`` so they become separate tokens, and replace every run of other
    non-letter characters by a single space.

    Args:
        s: input string.

    Returns:
        List of non-empty string tokens (an empty list for blank input).
    """
    # Lower-case *before* transliterating so that upper-case Ä/Ö/Ü are mapped
    # to ae/oe/ue as well, instead of being reduced to a/o/u by accent stripping.
    s = s.lower().strip()
    for char, replacement in (("ß", "ss"), ("ä", "ae"), ("ö", "oe"), ("ü", "ue")):
        s = s.replace(char, replacement)
    s = unicode_to_ascii(s)
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    # split() without an argument discards the empty strings that split(' ')
    # produced for leading/trailing separators and for empty input.
    return s.split()

def sub_unks(token_array, vocab, unk_token=None):
    """Replace every out-of-vocabulary token by the unknown-word symbol.

    Args:
        token_array: iterable of string tokens.
        vocab: object exposing a ``word2index`` mapping of known words.
        unk_token: replacement symbol; defaults to the module-level ``UNK``.

    Returns:
        A new list with the same length as ``token_array``.
    """
    if unk_token is None:
        unk_token = UNK
    # Membership is tested directly on the word->index dict instead of
    # rebuilding a set of the whole vocabulary on every call.
    known_words = vocab.word2index
    return [t if t in known_words else unk_token for t in token_array]

def insert_tags(token_array):
    """Return a new list: ``token_array`` with SOS in front and EOS at the end."""
    tagged = [SOS]
    tagged = tagged + token_array
    tagged.append(EOS)
    return tagged

In [0]:
# Reserved vocabulary entries. The integer ids match the order in which
# Vocab.__init__ registers the corresponding symbols (PAD, SOS, EOS, UNK).
PAD, SOS, EOS, UNK = "<PAD>", "<SOS>", "<EOS>", "<UNK>"
PAD_token, SOS_token, EOS_token, UNK_token = range(4)

class Vocab:
    """Bidirectional word <-> index mapping with occurrence counts.

    A new instance already contains the four reserved symbols PAD, SOS, EOS
    and UNK, registered in that order.
    """

    def __init__(self):
        self.word2index = {}
        self.word2count = {}
        self.index2word = {}
        self.n_words = 0

        for special in (PAD, SOS, EOS, UNK):
            self.add_word(special)

    def add_sentence(self, sentence):
        """Register every token of ``sentence`` (an iterable of words)."""
        for token in sentence:
            self.add_word(token)

    def add_word(self, word):
        """Give an unseen ``word`` the next free index, or bump its count."""
        if word in self.word2index:
            self.word2count[word] += 1
            return
        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

In [0]:
import numpy as np
import nltk
from tqdm import tqdm

def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one flat list."""
    flat = []
    for sublist in l:
        for item in sublist:
            flat.append(item)
    return flat

def read_lines():
    """Read the raw paraphrase file at ``dataset_path``.

    Returns:
        List of lines; surrounding whitespace of the whole file is stripped
        before splitting on newlines.
    """
    print("Reading lines...")
    # The context manager closes the handle deterministically; the explicit
    # encoding keeps the result independent of the platform's default locale.
    with open(dataset_path, encoding='utf-8') as dataset_file:
        lines = dataset_file.read().strip().split('\n')
    print('read %s lines' % len(lines))
    return lines

def prepare_data(limit=None):
    """Load the paraphrase corpus and build the vocabulary.

    Each line is split on '|||'; fields 1 and 2 are taken as the two sides of
    a paraphrase pair, normalised and wrapped in SOS/EOS. Words occurring more
    than twice are added to the vocabulary; all others are replaced by UNK.

    Args:
        limit: when truthy, keep only this many lines, chosen by a shuffle
            that re-seeds NumPy's global RNG with 42.

    Returns:
        ``(vocab, pairs)`` where ``pairs`` is a list of (tokens, tokens) tuples.
    """
    lines = read_lines()

    if limit:
        # Deterministic pseudo-random subset of the corpus.
        order = np.arange(len(lines))
        np.random.seed(42)
        np.random.shuffle(order)
        lines = [lines[j] for j in order[:limit]]

    print('Processing')
    pairs = []
    for line in lines:
        fields = line.split('|||')
        first, second = fields[1], fields[2]
        pairs.append((insert_tags(normalize(first)),
                      insert_tags(normalize(second))))

    # Word frequencies over both sides of every pair.
    freqs = nltk.FreqDist(flatten([a + b for a, b in pairs]))
    # Number of distinct words occurring more than 2 times.
    n_frequent = sum(1 for count in freqs.values() if count > 2)
    frequent_words = [word for word, _ in freqs.most_common(n_frequent)]

    print('building vocab...')
    vocab = Vocab()
    vocab.add_sentence(frequent_words)

    # Replace OOV words with UNK
    pairs = [(sub_unks(a, vocab), sub_unks(b, vocab)) for a, b in pairs]

    print("Counted words:")
    print(vocab.n_words)
    print('Finished loading data')
    return vocab, pairs

# Convert to Tensor

In [0]:
from torch.nn.utils.rnn import pad_sequence

def predict_single_sentence(src, tar, vocab):
    """Predict a paraphrase for one source/target pair and print the result.

    The pair is copied into column 0 of an otherwise all-zero batch of width
    BATCH_SIZE, the global ``model`` is called, and the argmax token ids of
    batch entry 0 are decoded, printed and returned.

    NOTE(review): the call below passes ``teacher_forcing_ratio``, but the
    ``Transformer.forward`` defined in this notebook takes
    ``(src, trg, src_mask, trg_mask)``. This helper looks like a leftover from
    a different (seq2seq) model and presumably fails with the current
    ``model`` - confirm before use; ``paraphrase`` is the working
    single-sentence path.
    """
    # Insert into batch where all but the first entry just zeros
    
    src_batch = torch.zeros((src.size(0), BATCH_SIZE)).long().to(device)
    tar_batch = torch.zeros((tar.size(0), BATCH_SIZE)).long().to(device)
    src_batch[:, 0] = src[:, 0]
    tar_batch[:, 0] = tar[:, 0]
    
    # Perform forward pass
    output = model(src_batch, tar_batch, teacher_forcing_ratio=0) # no teacher forcing
    
    # Extract first predicted sentence
    pred_token_ids = output[:, 0, :].argmax(dim=1)
    
    src_string = tensor_to_string(src.squeeze(), vocab)
    tar_string = tensor_to_string(tar.squeeze(), vocab)
    pred_string = tensor_to_string(pred_token_ids, vocab)
    
    print("SRC:", src_string)
    print("TAR:", tar_string)
    print("-"*50)
    print("PRED:", pred_string)
    return pred_token_ids


def tensor_to_string(tensor, vocab, ignore_index=(PAD_token, UNK_token, SOS_token, EOS_token)):
    """Decode a tensor of token ids into a space-separated string.

    Args:
        tensor: integer tensor of token ids; it is flattened, so 0-d and 1-d
            inputs are both accepted.
        vocab: object exposing an ``index2word`` mapping.
        ignore_index: ids to leave out of the result (by default the four
            reserved tokens). Any iterable of ints is accepted.

    Returns:
        The decoded words joined by single spaces.
    """
    skipped = set(ignore_index)
    # One bulk transfer to Python ints instead of comparing 0-d tensors against
    # a list element by element.
    ids = tensor.reshape(-1).tolist()
    return " ".join(vocab.index2word[i] for i in ids if i not in skipped)

def sentence_to_index(vocab, sentence):
    """Map each word of ``sentence`` to its id in ``vocab.word2index``.

    Raises:
        KeyError: if a word is not in the vocabulary.
    """
    lookup = vocab.word2index
    indices = []
    for word in sentence:
        indices.append(lookup[word])
    return indices


def sentence_to_tensor(vocab, sentence):
    """Encode ``sentence`` as a long tensor of shape (len, 1) on the global ``device``."""
    token_ids = sentence_to_index(vocab, sentence)
    column = torch.tensor(token_ids, dtype=torch.long, device=device)
    return column.view(-1, 1)


def pair_to_tensor(pair):
    """Encode a (source tokens, target tokens) pair with the global ``vocab``.

    Returns:
        Tuple ``(input_tensor, target_tensor)`` as produced by ``sentence_to_tensor``.
    """
    source_tokens, target_tokens = pair
    return (sentence_to_tensor(vocab, source_tokens),
            sentence_to_tensor(vocab, target_tokens))

def make_dataset(pairs, split_ratio=0.9):
    """Shuffle ``pairs``, convert them to tensors and split into train/validation.

    The shuffle re-seeds NumPy's global RNG with 42, so the split is the same
    on every call.

    Args:
        pairs: sequence of (source tokens, target tokens).
        split_ratio: fraction of the data that goes into the training part.

    Returns:
        ``(train_data, val_data)`` lists of tensor pairs.
    """
    total = len(pairs)
    order = np.arange(total)
    np.random.seed(42)
    np.random.shuffle(order)

    # Tensors are created in shuffled order.
    shuffled_tensors = []
    for position in order:
        shuffled_tensors.append(pair_to_tensor(pairs[position]))

    cut = int(split_ratio * total)
    return shuffled_tensors[:cut], shuffled_tensors[cut:]

In [0]:
import pickle
def load_dataset():
    """Load ``(train_data, val_data, vocab)`` from ``picked_dataset_path``.

    NOTE: unpickling executes arbitrary code embedded in the file; only load
    a pickle that this notebook (``save_dataset``) wrote itself.
    """
    # Context manager so the handle is closed even if unpickling fails.
    with open(picked_dataset_path, "rb") as dataset_file:
        train_data, val_data, vocab = pickle.load(dataset_file)
    return train_data, val_data, vocab

def save_dataset(train_data, val_data, vocab):
    """Pickle ``(train_data, val_data, vocab)`` as one tuple to ``picked_dataset_path``."""
    dataset = (train_data, val_data, vocab)
    # Closing the file explicitly guarantees the data is flushed to disk
    # before the function returns.
    with open(picked_dataset_path, "wb") as dataset_file:
        pickle.dump(dataset, dataset_file)

In [0]:
from torch.utils.data import Dataset
import torch
from torch.utils.data import DataLoader

'''Only pad when batch is loaded!'''
def pad_collate(batch):
    """Collate function for DataLoader: pad a list of (src, tgt) pairs to equal length.

    Both sides are padded with PAD_token along the first (time) dimension and
    the dimension at index 2 is squeezed away afterwards.
    Assumes every item is a pair of (len, 1) tensors, as produced by
    ``sentence_to_tensor`` - TODO confirm for other data sources.

    Returns:
        ``(src_padded, tgt_padded)``.
    """
    xx, yy = zip(*batch)

    xx_pad = pad_sequence(xx, batch_first=False, padding_value=PAD_token)
    yy_pad = pad_sequence(yy, batch_first=False, padding_value=PAD_token)

    return xx_pad.squeeze(2), yy_pad.squeeze(2)

'''Dataset class'''
class ParaphraseDataset(Dataset):
    """Map-style dataset over an in-memory list of paraphrase pairs.

    Subclasses ``torch.utils.data.Dataset`` (already imported in this cell) so
    the class is explicitly tied to the contract ``DataLoader`` expects.
    """

    def __init__(self, pairs):
        # pairs: indexable collection; each item is returned as-is.
        self.pairs = pairs

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        return self.pairs[idx]

In [8]:
# Number of sentence pairs per mini-batch.
BATCH_SIZE = 32
print('Using batch size:', BATCH_SIZE)

# Run on the GPU when available, on the CPU otherwise.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Loaded device:', device)

Using batch size: 32
Loaded device: cuda


In [0]:
#vocab, pairs = prepare_data(limit=200 * 1000)
#train_data, val_data = make_dataset(pairs)
#save_dataset(train_data, val_data, vocab)

# Load data from disk

In [0]:
# Restore the pre-built (train_data, val_data, vocab) tuple from the pickle on Drive.
train_data, val_data, vocab = load_dataset()

In [0]:
from torch.utils.data import DataLoader
# Training batches are reshuffled every epoch; padding happens per batch in pad_collate.
train_iterator = DataLoader(
    ParaphraseDataset(train_data),
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=pad_collate,
)
# Validation batches keep their order.
valid_iterator = DataLoader(
    ParaphraseDataset(val_data),
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=pad_collate,
)

# Model

In [0]:
class Embedder(nn.Module):
    """Token-id to vector lookup; a thin wrapper around ``nn.Embedding``."""

    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        embedded = self.embed(x)
        return embedded
    
class PositionalEncoder(nn.Module):
    """Scale embeddings by sqrt(d_model) and add fixed sinusoidal position codes.

    Args:
        d_model: embedding width. The table is filled two columns at a time,
            so an even value is expected.
        max_seq_len: number of positions in the pre-computed table; longer
            inputs are not supported.
    """

    def __init__(self, d_model, max_seq_len = 80):
        super().__init__()
        self.d_model = d_model

        # Constant 'pe' table whose values depend on the position `pos` and the
        # column `i`. The exponent formula is kept exactly as before so that
        # the buffer stays identical to the one stored in existing checkpoints.
        pe = torch.zeros(max_seq_len, d_model)
        for pos in range(max_seq_len):
            for i in range(0, d_model, 2):
                pe[pos, i] = \
                math.sin(pos / (10000 ** ((2 * i)/d_model)))
                pe[pos, i + 1] = \
                math.cos(pos / (10000 ** ((2 * (i + 1))/d_model)))

        # Registered as a buffer: it follows the module through .to(device)
        # and is saved in the state dict, but is not a trainable parameter.
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x * sqrt(d_model) + pe[:, :x.size(1)]``; ``x.size(1)`` is the sequence length."""
        # make embeddings relatively larger
        x = x * math.sqrt(self.d_model)
        seq_len = x.size(1)
        # The buffer already lives on the module's device, so no hard-coded
        # .cuda() (which broke CPU runs) and no deprecated Variable wrapper.
        return x + self.pe[:, :seq_len]
    
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention built on the module-level ``attention``.

    Args:
        heads: number of attention heads; each head has width ``d_model // heads``.
        d_model: model width of the inputs and of the output.
        dropout: dropout probability applied to the attention weights.
    """

    def __init__(self, heads, d_model, dropout = 0.1):
        super().__init__()

        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads

        # Creation order is kept unchanged (it fixes parameter order and the
        # order in which random initial weights are drawn).
        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)

    def _split_heads(self, projected, bs):
        """Reshape (bs, sl, d_model) into (bs, h, sl, d_k)."""
        return projected.view(bs, -1, self.h, self.d_k).transpose(1, 2)

    def forward(self, q, k, v, mask=None):
        bs = q.size(0)

        # Linear projection followed by the split into h heads.
        k = self._split_heads(self.k_linear(k), bs)
        q = self._split_heads(self.q_linear(q), bs)
        v = self._split_heads(self.v_linear(v), bs)

        attended = attention(q, k, v, self.d_k, mask, self.dropout)

        # Merge the heads back into one d_model-wide vector per position.
        merged = attended.transpose(1, 2).contiguous().view(bs, -1, self.d_model)
        return self.out(merged)
    
class FeedForward(nn.Module):
    """Position-wise feed-forward block: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: input and output width.
        d_ff: hidden width (2048 by default).
        dropout: dropout probability applied after the ReLU.
    """

    def __init__(self, d_model, d_ff=2048, dropout = 0.1):
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        hidden = F.relu(self.linear_1(x))
        hidden = self.dropout(hidden)
        return self.linear_2(hidden)
    
class Norm(nn.Module):
    """Normalise the last dimension, then apply a learnable scale and shift.

    Computes ``alpha * (x - mean) / (std + eps) + bias`` with ``mean`` and
    ``std`` taken over the last dimension.
    """

    def __init__(self, d_model, eps = 1e-6):
        super().__init__()

        self.size = d_model
        # Learnable scale (starts at 1) and shift (starts at 0).
        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))
        self.eps = eps

    def forward(self, x):
        centred = x - x.mean(dim=-1, keepdim=True)
        spread = x.std(dim=-1, keepdim=True) + self.eps
        return self.alpha * centred / spread + self.bias
    
# build a decoder layer with two multi-head attention layers and
# one feed-forward layer
class DecoderLayer(nn.Module):
    """One decoder layer: self-attention, encoder-decoder attention, feed-forward.

    Each sub-layer is applied to a normalised copy of the input and added back
    to it through a dropout (residual connection).

    Args:
        d_model: model width.
        heads: number of attention heads.
        dropout: probability for the three residual dropouts. The attention
            and feed-forward modules are built with their own defaults.
    """

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.norm_3 = Norm(d_model)

        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)

        self.attn_1 = MultiHeadAttention(heads, d_model)
        self.attn_2 = MultiHeadAttention(heads, d_model)
        # No hard-coded .cuda(): it crashed on CPU-only machines. The layer is
        # moved together with the whole model by `.to(device)`.
        self.ff = FeedForward(d_model)

    def forward(self, x, e_outputs, src_mask, trg_mask):
        # Masked self-attention over the target sequence.
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask))
        # Attention over the encoder outputs.
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, src_mask))
        # Position-wise feed-forward.
        x2 = self.norm_3(x)
        x = x + self.dropout_3(self.ff(x2))
        return x


class Encoder(nn.Module):
    """Embedding + positional encoding followed by ``N`` encoder layers and a final Norm.

    Args:
        vocab_size: size of the source vocabulary.
        d_model: model width.
        N: number of stacked ``EncoderLayer`` copies.
        heads: number of attention heads per layer.
    """

    def __init__(self, vocab_size, d_model, N, heads):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model)
        self.layers = get_clones(EncoderLayer(d_model, heads), N)
        self.norm = Norm(d_model)

    def forward(self, src, mask):
        x = self.embed(src)
        x = self.pe(x)
        # Use the instance's own layer count. The previous `range(N)` read the
        # notebook-level global N and only worked while it matched this model.
        for i in range(self.N):
            x = self.layers[i](x, mask)
        return self.norm(x)
    
class Decoder(nn.Module):
    """Embedding + positional encoding followed by ``N`` decoder layers and a final Norm.

    Args:
        vocab_size: size of the target vocabulary.
        d_model: model width.
        N: number of stacked ``DecoderLayer`` copies.
        heads: number of attention heads per layer.
    """

    def __init__(self, vocab_size, d_model, N, heads):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model)
        self.layers = get_clones(DecoderLayer(d_model, heads), N)
        self.norm = Norm(d_model)

    def forward(self, trg, e_outputs, src_mask, trg_mask):
        hidden = self.pe(self.embed(trg))
        for index in range(self.N):
            layer = self.layers[index]
            hidden = layer(hidden, e_outputs, src_mask, trg_mask)
        return self.norm(hidden)
    
# build an encoder layer with one multi-head attention layer and one feed-forward layer
class EncoderLayer(nn.Module):
    """One encoder layer: self-attention followed by a feed-forward block.

    Each sub-layer works on a normalised copy of its input and is added back
    through a dropout (residual connection).
    """

    def __init__(self, d_model, heads, dropout = 0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.attn = MultiHeadAttention(heads, d_model)
        self.ff = FeedForward(d_model)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        normed = self.norm_1(x)
        x = x + self.dropout_1(self.attn(normed, normed, normed, mask))
        normed = self.norm_2(x)
        return x + self.dropout_2(self.ff(normed))
    
class Transformer(nn.Module):
    """Encoder-decoder model with a final linear projection onto the target vocabulary.

    ``forward`` returns raw scores (no softmax) with the target vocabulary
    size as last dimension.
    """

    def __init__(self, src_vocab, trg_vocab, d_model, N, heads):
        super().__init__()
        self.encoder = Encoder(src_vocab, d_model, N, heads)
        self.decoder = Decoder(trg_vocab, d_model, N, heads)
        self.out = nn.Linear(d_model, trg_vocab)

    def forward(self, src, trg, src_mask, trg_mask):
        memory = self.encoder(src, src_mask)
        decoded = self.decoder(trg, memory, src_mask, trg_mask)
        return self.out(decoded)
# we don't perform softmax on the output as this will be handled 
# automatically by our loss function

In [0]:
import numpy as np
import copy 

def attention(q, k, v, d_k, mask=None, dropout=None):
    """Scaled dot-product attention.

    Args:
        q, k, v: query, key and value tensors; the last two dimensions are
            (sequence, features).
        d_k: feature width used for the 1/sqrt(d_k) scaling.
        mask: optional tensor; after ``unsqueeze(1)`` every position where it
            equals 0 is filled with -1e9 before the softmax.
        dropout: optional callable applied to the attention weights.

    Returns:
        The attention-weighted combination of ``v``.
    """
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        scores = scores.masked_fill(mask.unsqueeze(1) == 0, -1e9)

    weights = F.softmax(scores, dim=-1)
    if dropout is not None:
        weights = dropout(weights)

    return torch.matmul(weights, v)


def create_masks(input_seq, target_seq, pad_token=None):
    """Build the attention masks for a batch.

    Args:
        input_seq: (batch, src_len) tensor of source token ids.
        target_seq: (batch, trg_len) tensor of target token ids.
        pad_token: id treated as padding; defaults to the module-level PAD_token.

    Returns:
        ``(input_msk, target_msk)``:
        input_msk is (batch, 1, src_len), False wherever the source is padding;
        target_msk is a uint8 (batch, trg_len, trg_len) tensor that is 0 for
        padding columns and for positions ahead of the current one.
    """
    if pad_token is None:
        pad_token = PAD_token

    # 0/False wherever there is padding in the input.
    input_msk = (input_seq != pad_token).unsqueeze(1)

    target_msk = (target_seq != pad_token).byte().unsqueeze(1)
    size = target_seq.size(1)  # seq_len of the square look-ahead mask
    # Lower-triangular (diagonal included) "no peek" mask, created directly on
    # the target's device. This replaces the numpy round-trip, the deprecated
    # Variable wrapper and the dependency on the global `device`.
    nopeak_mask = torch.ones(size, size, device=target_seq.device).tril()
    nopeak_mask = nopeak_mask.to(torch.uint8).unsqueeze(0)
    target_msk = target_msk & nopeak_mask

    return input_msk, target_msk

# We can then build a convenient cloning function that can generate multiple layers:
def get_clones(module, N):
    """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``."""
    clones = nn.ModuleList()
    for _ in range(N):
        clones.append(copy.deepcopy(module))
    return clones

In [0]:
def evaluate(model, iterator):
    '''Compute the average loss of ``model`` over ``iterator`` without updating it.

    Args:
        model: the Transformer to evaluate; it is switched to eval mode.
        iterator: iterable of ``(src, trg)`` batches that also supports ``len()``.

    Returns:
        Sum of the per-batch losses divided by the number of batches.

    The loss function is the notebook-level ``criterion``; it is not a parameter.
    '''
    # Dropout (and similar layers) behave differently in eval mode.
    model.eval()
    total_loss = 0

    # No gradients are needed while scoring.
    with torch.no_grad():
        for src, trg in iterator:
            # Batches arrive time-major; the model expects batch-major input.
            src, trg = src.transpose(0, 1), trg.transpose(0, 1)
            # The decoder sees the target without its last token ...
            decoder_input = trg[:, :-1]
            src_mask, trg_mask = create_masks(src, decoder_input)

            logits = model(src, decoder_input, src_mask, trg_mask)

            # ... and is scored against the target shifted by one position.
            flat_logits = logits.view(-1, logits.size(-1))
            gold = trg[:, 1:].contiguous().view(-1)
            total_loss += criterion(flat_logits, gold).item()
    return total_loss / len(iterator)

In [0]:
def train_one_epoch(print_every=100):
    """Run one optimisation pass over ``train_iterator``.

    Works on the notebook-level ``model``, ``optim`` (the optimiser instance)
    and ``criterion``.

    Args:
        print_every: accepted for compatibility; not used by the current body.

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    # Train mode enables dropout.
    model.train()
    running_loss = 0

    for src, trg in train_iterator:
        # Batches arrive time-major; the model expects batch-major input.
        src = src.transpose(0, 1)
        trg = trg.transpose(0, 1)
        decoder_input = trg[:, :-1]
        src_mask, trg_mask = create_masks(src, decoder_input)

        logits = model(src, decoder_input, src_mask, trg_mask)

        # Score each position against the target shifted by one token.
        optim.zero_grad()
        flat_logits = logits.view(-1, logits.size(-1))
        gold = trg[:, 1:].contiguous().view(-1)
        loss = criterion(flat_logits, gold)
        loss.backward()
        optim.step()

        running_loss += loss.item()

    return running_loss / len(train_iterator)

In [0]:
# Transformer hyper-parameters: model width, attention heads per layer and
# number of stacked encoder/decoder layers.
d_model = 512
heads = 8
N = 6
# Source and target side share the same vocabulary.
src_vocab = vocab.n_words
trg_vocab = vocab.n_words
model = Transformer(src_vocab, trg_vocab, d_model, N, heads).to(device)

# Xavier/Glorot-initialise every parameter with more than one dimension
# (1-D parameters keep their default initialisation). This keeps the signal
# from fading or growing too large as it passes through the layers.
for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)
# NOTE(review): the name `optim` below rebinds the `torch.optim` module that the
# first cell imports as `optim`; train_one_epoch relies on this name being the
# Adam instance.
optim = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)
# Padding positions do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_token)

In [17]:
import time
import pickle
# Where the best checkpoint and the per-epoch loss history are written.
MODEL_SAVE_PATH = google_path + 'transformer-model-paraphrase.pt'
LOSSES_SAVE_PATH = google_path + 'paraphrase-losses.pickle'
print('saving to:', MODEL_SAVE_PATH, LOSSES_SAVE_PATH)
val_losses = []
train_losses = []
# `np.inf` is the canonical spelling; the `np.infty` alias was removed in NumPy 2.0.
best_val_loss = np.inf

saving to: /content/gdrive/My Drive/colab/paraphrasing/transformer-model-paraphrase.pt /content/gdrive/My Drive/colab/paraphrasing/paraphrase-losses.pickle


In [0]:
for epoch in range(50):
    start_time = time.time()
    # Run one epoch
    train_loss = train_one_epoch()

    # Evaluate
    val_loss = evaluate(model, valid_iterator)
    elapsed_time = time.time() - start_time
    print('Epoch [%i] finished after [%i] seconds, val loss: [%.3f]' %
          (epoch+1, elapsed_time, val_loss))

    # Save losses. The context manager closes (and thereby flushes) the file
    # every epoch instead of leaking one open handle per iteration.
    train_losses.append(train_loss)
    val_losses.append(val_loss)
    with open(LOSSES_SAVE_PATH, 'wb') as losses_file:
        pickle.dump((train_losses, val_losses), losses_file)

    # If generalization error improved then save the model to disk
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), MODEL_SAVE_PATH)

Epoch [1] finished after [487] seconds, val loss: [2.507]
Epoch [2] finished after [482] seconds, val loss: [2.182]
Epoch [3] finished after [481] seconds, val loss: [2.026]
Epoch [4] finished after [480] seconds, val loss: [1.937]
Epoch [5] finished after [481] seconds, val loss: [1.888]
Epoch [6] finished after [482] seconds, val loss: [1.855]
Epoch [7] finished after [488] seconds, val loss: [1.841]


# Load Model

In [18]:
# map_location lets a checkpoint written on a GPU be restored on whichever
# device is active in this session (including CPU-only runtimes).
model.load_state_dict(torch.load(MODEL_SAVE_PATH, map_location=device))

<All keys matched successfully>

# BLEU

In [0]:
#n_valid = len(valid_data.examples)
#valid_src_txt = [valid_data.examples[i].src for i in range(n_valid)]
#valid_trg_txt = [valid_data.examples[i].trg for i in range(n_valid)]
#valid_src_idx = [tokens_to_tensor(SRC, valid_src_txt[i]) for i in range(n_valid)]
#valid_trg_idx = [tokens_to_tensor(TRG, valid_trg_txt[i]) for i in range(n_valid)]

In [0]:
def tokens_to_tensor(LANG, tokens, reverse=False):
    """Map ``tokens`` to their ids in ``LANG.word2index``.

    Despite the name, a plain Python list is returned. With ``reverse=True``
    the order of the ids is reversed.
    """
    ids = [LANG.word2index[t] for t in tokens]
    return ids[::-1] if reverse else ids

def remove_tags(LANG, idx):
    """Return the ids of ``idx`` without the PAD, SOS and EOS ids of ``LANG``."""
    lookup = LANG.word2index
    tag_ids = (lookup[PAD], lookup[SOS], lookup[EOS])
    kept = []
    for i in idx:
        if i in tag_ids:
            continue
        kept.append(i)
    return kept


def remove_duplicates(vocab, id_array):
    """Collapse runs of identical consecutive items in ``id_array`` to one item.

    ``vocab`` is accepted for signature compatibility and is not used.
    """
    collapsed = []
    for position, current in enumerate(id_array):
        if position == 0 or id_array[position - 1] != current:
            collapsed.append(current)
    return collapsed

def tensor_to_tokens(LANG, tensor, reverse=False):
    """Decode a list of ids into words.

    PAD/SOS/EOS ids are dropped, immediate repetitions are collapsed, and the
    remaining ids are looked up in ``LANG.index2word``. With ``reverse=True``
    the resulting word list is reversed.
    """
    ids = remove_duplicates(LANG, remove_tags(LANG, tensor))
    tokens = [LANG.index2word[i] for i in ids]
    if reverse:
        tokens.reverse()
    return tokens


def beautify_token_array_to_str(token_array):
    """Join tokens into a string and restore common split contractions.

    Rewrites e.g. "do n t" -> "don't", "i m" -> "i'm", "ai n t" -> "ain't".

    Args:
        token_array: iterable of string tokens.

    Returns:
        The space-joined string, stripped of surrounding spaces.
    """
    import re
    # Convert to string, strip trailing spaces
    string = ' '.join(token_array).strip()
    contractions = (
        ('ai n t', "ain't"),
        ('do n t', "don't"),
        ('is n t', "isn't"),
        ('i m', "i'm"),
        ('was n t', "wasn't"),
        ('wo n t', "won't"),
        ('should n t', "shouldn't"),
    )
    for split_form, joined in contractions:
        # \b anchors the pattern on whole words; without it "hi mom" became
        # "hi'mom" and "this n t" became "thisn't".
        string = re.sub(r'\b' + split_form + r'\b', joined, string)
    return string

                         
def get_pred_tar_pairs(batch_iter):
    """Run the global ``model`` over ``batch_iter`` and decode everything to tokens.

    The decoder is fed the gold target prefix (teacher forcing) and the argmax
    over the vocabulary is taken at each position.

    Args:
        batch_iter: iterable of time-major ``(src, trg)`` batches.

    Returns:
        ``(SRCS, TARS, PREDS)``: three parallel lists of token lists.
    """
    TARS = []
    PREDS = []
    SRCS = []

    # Inference must run with dropout disabled.
    model.eval()
    with torch.no_grad():
        # Iterate over the iterator that was passed in. The old body ignored
        # the argument and always read the global `valid_iterator`.
        for batch in batch_iter:
            # Prepare batch
            src, trg = batch
            src = src.transpose(0,1)
            trg = trg.transpose(0,1)
            trg_input = trg[:, :-1]
            src_mask, trg_mask = create_masks(src, trg_input)

            # Forward pass
            output = model(src, trg_input, src_mask, trg_mask)

            # Argmax over vocab and get the predicted tokens
            pred = output.argmax(dim=2)

            SRCS += [tensor_to_tokens(vocab, x.tolist()) for x in src]
            TARS += [tensor_to_tokens(vocab, x.tolist()) for x in trg]
            PREDS += [tensor_to_tokens(vocab, x.tolist()) for x in pred]

    return SRCS, TARS, PREDS

from torchnlp.metrics import get_moses_multi_bleu
def bleu(tar, pred):
    """Moses multi-bleu score of one prediction against one target.

    Args:
        tar: reference sentence as a list of string tokens.
        pred: predicted sentence as a list of string tokens.

    Returns:
        Whatever ``get_moses_multi_bleu`` returns for this single pair.
    """
    tar, pred = ' '.join(tar), ' '.join(pred)
    # get_moses_multi_bleu expects (hypotheses, references): the prediction
    # goes first. The two arguments used to be passed the other way round.
    return get_moses_multi_bleu([pred], [tar])

In [0]:
# Decode source, target and predicted token lists for the validation batches.
SRCS, TARS, PREDS = get_pred_tar_pairs(valid_iterator)

In [25]:
# Draw 10 example indices for a qualitative look at the predictions. Each draw
# is independent, so an index can repeat; no seed is set in this cell.
indices = np.arange(len(SRCS))
choices = [np.random.choice(indices) for i in range(10)]
choices

[6414, 2134, 8739, 9936, 16775, 10006, 11526, 9350, 431, 12816]

In [26]:
# Show source, target and model output side by side for the sampled examples.
for i in choices:
    for label, tokens in (('[src]', SRCS[i]), ('[tar]', TARS[i]), ('[out]', PREDS[i])):
        print(label, beautify_token_array_to_str(tokens))
    print()
    

[src] they apply
[tar] they are implemented
[out] they are applicable

[src] nations regional commissions
[tar] regional commission
[out] levels

[src] yes i am
[tar] i ve got it
[out] yeah m got it

[src] are from
[tar] emerging from
[out] are from

[src] last fiscal year
[tar] most recently completed fiscal year
[out] last recent year

[src] now wait
[tar] are waiting
[out] waiting for

[src] please clarify whether
[tar] clarify whether
[out] be whether

[src] that issue
[tar] the above issues
[out] that question

[src] possibility of establishing a
[tar] possibility of creating an
[out] possibility of developing a

[src] designed to improve
[tar] for improving the
[out] to improving



In [32]:
!pip install easy-rouge

Collecting easy-rouge
  Downloading https://files.pythonhosted.org/packages/0c/0a/b7ebb887dac3ece27fffc65bbc7dc0abcf991f2ccce8073126329ce4be8f/easy_rouge-0.2.2-py3-none-any.whl
Installing collected packages: easy-rouge
Successfully installed easy-rouge-0.2.2


# ROUGE scores

In [0]:
from rouge.rouge import rouge_n_sentence_level

def calc_rouge(pred, tar):
    """Return the ROUGE-1, ROUGE-2 and ROUGE-3 scores of ``pred`` vs ``tar``, times 100.

    For each n the third element of ``rouge_n_sentence_level``'s result is
    used (presumably the F-measure - confirm against the easy-rouge docs).
    """
    scores = []
    for n in (1, 2, 3):
        _, _, f_measure = rouge_n_sentence_level(pred, tar, n)
        scores.append(f_measure * 100)
    return tuple(scores)

def plot_rouge(scores, title):
    """Plot a 100-bin histogram of ``scores``, save it and print the mean.

    The figure is written to ``google_path + title + '.png'`` and then shown.

    Args:
        scores: sequence of numeric scores.
        title: plot title; also used as the file name stem.
    """
    # No import of `plt` is visible anywhere in the notebook, so bring
    # matplotlib into scope here; otherwise this function raises NameError.
    import matplotlib.pyplot as plt

    scores = np.array(scores)
    plt.hist(scores, bins=100, edgecolor='black')
    plt.title(title)
    plt.savefig(google_path + title + '.png')
    plt.show()
    print('AVG:', np.mean(scores))

# Calculate ROUGE-1 (n=1, unigram overlap) for a single example. The prediction
# is passed as the summary and the target as the reference, so recall and
# precision are reported against the target. The labels now match n=1.
recall, precision, rouge = rouge_n_sentence_level(PREDS[0], TARS[0], 1)
print('ROUGE-1-R', recall)
print('ROUGE-1-P', precision)
print('ROUGE-1-F', rouge)

ROUGE-2-R 0.8333333333333334
ROUGE-2-P 0.8333333333333334
ROUGE-2-F 0.8333333333333334


In [0]:
# Calculate ROUGE-N for all validation examples. The arguments follow
# calc_rouge's (pred, tar) signature.
rouge_scores = [calc_rouge(pred, tar) for pred, tar in zip(PREDS, TARS)]
# Context manager so the pickle is flushed and the handle closed.
with open(google_path + 'rouge_scores.pickle', 'wb') as scores_file:
    pickle.dump(rouge_scores, scores_file)
r1, r2, r3 = zip(*rouge_scores)
plot_rouge(r1, 'ROUGE-1')
plot_rouge(r2, 'ROUGE-2')
plot_rouge(r3, 'ROUGE-3')
np.mean(r1), np.mean(r2), np.mean(r3)

# Predict single sentence (used for web app)

In [0]:
def tokenize_string(s):
    """Normalise ``s`` into tokens and wrap them in SOS/EOS.

    Example input: 'and the evaluation'.
    """
    return insert_tags(normalize(s))

def paraphrase(src, max_len=80):
    """Greedy-decode a paraphrase of the sentence ``src`` with the global ``model``.

    Args:
        src: raw input sentence (string).
        max_len: maximum number of decoder positions, including the SOS slot.

    Returns:
        The generated paraphrase as a single string.
    """
    model.eval()

    # Words the vocabulary does not know are mapped to UNK instead of raising
    # KeyError (this function is fed with free-form user input).
    unk_id = vocab.word2index[UNK]
    src_ids = [vocab.word2index.get(tok, unk_id) for tok in tokenize_string(src)]
    src = torch.tensor([src_ids], dtype=torch.long, device=device)

    eos_id = vocab.word2index[EOS]
    last = 0  # index of the last filled decoder position

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        # Create mask and encode src
        src_mask = (src != PAD_token).unsqueeze(-2)
        encoder_outputs = model.encoder(src, src_mask)

        # Decoder input starts with SOS; the remaining slots are filled step by step.
        outputs = torch.zeros(max_len, dtype=torch.long, device=device)
        outputs[0] = vocab.word2index[SOS]

        # Predict each token until EOS
        for i in range(1, max_len):
            # Lower-triangular mask: position t may attend to positions <= t.
            trg_mask = torch.ones((1, i, i), device=device).tril().bool()

            out = model.out(model.decoder(outputs[:i].unsqueeze(0),
                                          encoder_outputs, src_mask, trg_mask))

            # Argmax over the scores of the newest position (softmax would not
            # change the argmax, so it is skipped).
            outputs[i] = out[:, -1].argmax()
            last = i

            if outputs[i].item() == eos_id:
                break

    # Keep everything up to and including the last filled slot. tensor_to_tokens
    # strips SOS/EOS, so a final real token is no longer cut off when decoding
    # stops at max_len without producing EOS.
    sentence = tensor_to_tokens(vocab, outputs[:last + 1].tolist())
    return beautify_token_array_to_str(sentence)

In [0]:
paraphrase('this is essential')

'it is crucial'

In [0]:
# Persist the vocabulary on its own. The context manager closes the file so
# the pickle is fully written.
with open(google_path + 'vocab-200k.pickle', 'wb') as vocab_file:
    pickle.dump(vocab, vocab_file)