In [0]:
import numpy as np
import nltk
from io import open
import unicodedata
import string
import re
import random

In [0]:
# Install PyTorch-NLP; the evaluation cell imports `torchnlp.metrics` from it.
# NOTE(review): the version is not pinned, so results may differ between runs
# of this notebook on fresh environments.
!pip3 install pytorch-nlp



In [0]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

# Run on the GPU when one is available, otherwise fall back to the CPU.
# Every tensor/model created below is placed on this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [0]:

# Colab only: mount Google Drive so the dataset and model files under
# /content/gdrive are reachable from this runtime.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import os
# Project folder on the mounted Google Drive; all other paths derive from it.
google_path = '/content/gdrive/My Drive/colab/paraphrasing/'
# Raw text dataset read by prepare_data().
dataset_path = google_path + 'paraphrases.txt'
# Pickled (train_data, val_data, vocab) tuple written by save_dataset() and read by load_dataset().
picked_dataset_path = google_path + "paraphrases_dataset.pickle"
# Saved state dicts of the encoder / decoder, read by load_model().
encoder_path = google_path + 'encoder.model'
decoder_path = google_path + 'decoder.model'

# Data loader

In [0]:
# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicode_to_ascii(s):
    """Remove diacritics from ``s``.

    The string is decomposed (NFD) so that accented letters become a base
    character followed by combining marks; every character in the Unicode
    category ``Mn`` (non-spacing mark) is then dropped. Characters that have
    no decomposition (for example ``ß``) are returned unchanged.
    """
    decomposed = unicodedata.normalize('NFD', s)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars)

# Lowercase, trim, and remove non-letter characters
def normalize(s):
    """Normalize a raw sentence into space-separated lowercase ASCII tokens.

    Steps, in order:
      1. Compose to NFC, lowercase and trim, so that the German
         transliteration below also sees upper-case and decomposed umlauts
         (previously "Über" became "uber" while "über" became "ueber").
      2. Transliterate ß/ä/ö/ü to ss/ae/oe/ue.
      3. Strip the remaining diacritics with ``unicode_to_ascii``.
      4. Put a space in front of ``.``, ``!`` and ``?`` so they become tokens.
      5. Collapse every run of other non-letter characters into one space.
      6. Trim again, so a sentence that starts or ends with a removed
         character does not yield an empty token when split on ``' '``.

    Args:
        s: the raw sentence.

    Returns:
        The normalized sentence as a ``str``.
    """
    s = unicodedata.normalize('NFC', s).lower().strip()
    for german_char, ascii_form in (("ß", "ss"), ("ä", "ae"), ("ö", "oe"), ("ü", "ue")):
        s = s.replace(german_char, ascii_form)
    s = unicode_to_ascii(s)
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s.strip()

In [0]:
# Upper bound on sequence length used throughout the notebook: it sizes the
# decoder's attention layer, the encoder-output buffer in rnn_unrolling() /
# evaluate(), and caps the number of decoding steps in evaluate().
MAX_LENGTH = 40

# Sentence-start prefixes. Nothing in the visible notebook reads this tuple
# (the filter_pairs call in prepare_data is commented out).
# NOTE(review): some entries end with a space and some do not ("he is" vs
# "he s ") — confirm whether that is intentional before using it for filtering.
eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

In [0]:
# Reserved vocabulary indices for the special tokens.
PAD_token = 0
SOS_token = 1
EOS_token = 2


class Vocab:
    """Bidirectional word <-> index mapping with occurrence counts.

    Attributes:
        word2index: word -> integer index (special tokens are not included).
        word2count: word -> number of times the word was added.
        index2word: integer index -> word, pre-filled with the special tokens.
        n_words: next free index, i.e. vocabulary size including special tokens.
    """

    def __init__(self):
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "<PAD>", SOS_token: "<SOS>", EOS_token: "<EOS>"}
        self.n_words = 3  # PAD, SOS and EOS already occupy indices 0-2

    def add_sentence(self, sentence):
        """Register every token of ``sentence`` (split on single spaces)."""
        for token in sentence.split(' '):
            self.add_word(token)

    def add_word(self, word):
        """Count ``word``, assigning it the next free index on first sight."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

In [0]:
def prepare_data(limit=10000, reverse=True):
    """Read the paraphrase file and build the vocabulary and sentence pairs.

    Each line of ``dataset_path`` is split on ``'|||'``; fields 1 and 2 are
    taken as the two sentences of a pair (field 0 is ignored) and both are
    passed through ``normalize``. No length or prefix filtering is applied.

    Args:
        limit: only the first ``limit`` lines of the file are used.
        reverse: accepted for backward compatibility; currently unused.

    Returns:
        ``(vocab, pairs)`` where ``vocab`` is a ``Vocab`` holding every word of
        every pair and ``pairs`` is a list of ``(sentence_a, sentence_b)``
        tuples of normalized strings.

    Raises:
        IndexError: if a used line has fewer than three ``'|||'`` fields.
    """
    print("Reading lines...")
    # Close the file deterministically and decode it explicitly: the text is
    # expected to contain non-ASCII characters (see normalize), so relying on
    # the platform default encoding is fragile.
    with open(dataset_path, encoding='utf-8') as dataset_file:
        lines = dataset_file.read().strip().split('\n')
    print('read %s lines' % len(lines))
    lines = lines[:limit]

    pairs = []
    for line in lines:
        fields = line.split('|||')
        pairs.append((normalize(fields[1]), normalize(fields[2])))

    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")

    vocab = Vocab()
    for first, second in pairs:
        vocab.add_sentence(first)
        vocab.add_sentence(second)
    print("Counted words:")
    print(vocab.n_words)
    return vocab, pairs

# Convert to Tensor

In [0]:
from torch.nn.utils.rnn import pad_sequence

def tensor_to_string(tensor, vocab, ignore_index=-1):
    """Decode a sequence of token indices into a space-separated string.

    Args:
        tensor: iterable of single-element tensors (one token index each).
        vocab: object exposing an ``index2word`` mapping.
        ignore_index: entries equal to this value are skipped.

    Returns:
        The words of the kept indices joined by single spaces.
    """
    words = []
    for idx in tensor:
        if idx != ignore_index:
            words.append(vocab.index2word[idx.item()])
    return " ".join(words)

def sentence_to_index(vocab, sentence):
    """Map each space-separated word of ``sentence`` to its vocabulary index.

    Raises:
        KeyError: if a word is missing from ``vocab.word2index``.
    """
    lookup = vocab.word2index
    indexes = []
    for word in sentence.split(' '):
        indexes.append(lookup[word])
    return indexes


def sentence_to_tensor(vocab, sentence):
    """Encode ``sentence`` as a column tensor of token indices ending in EOS.

    Returns:
        A ``torch.long`` tensor of shape ``(n_words + 1, 1)`` on ``device``;
        the last row is ``EOS_token``.
    """
    token_ids = sentence_to_index(vocab, sentence) + [EOS_token]
    flat = torch.tensor(token_ids, dtype=torch.long, device=device)
    return flat.view(-1, 1)


def pair_to_tensor(pair, vocab=None):
    """Convert a ``(source, target)`` sentence pair into a pair of tensors.

    Args:
        pair: tuple of two normalized sentences.
        vocab: vocabulary used for the word -> index lookup. When omitted the
            module-level ``vocab`` is used, which is what this function always
            did before the parameter existed.

    Returns:
        ``(input_tensor, target_tensor)`` as produced by ``sentence_to_tensor``.

    Raises:
        NameError: if ``vocab`` is omitted and no module-level ``vocab`` exists.
    """
    if vocab is None:
        # Backward-compatible fallback to the notebook-global vocabulary.
        try:
            vocab = globals()["vocab"]
        except KeyError:
            raise NameError("name 'vocab' is not defined; pass vocab= explicitly") from None
    src, tar = pair
    return (sentence_to_tensor(vocab, src), sentence_to_tensor(vocab, tar))


def make_dataset(pairs, split_ratio=0.8, vocab=None):
    """Shuffle ``pairs`` deterministically, tensorize them and split train/val.

    Args:
        pairs: list of ``(source, target)`` sentence pairs.
        split_ratio: fraction of the shuffled pairs that goes to the training set.
        vocab: vocabulary forwarded to ``pair_to_tensor``; ``None`` keeps the
            previous behaviour of using the module-level ``vocab``.

    Returns:
        ``(train_data, val_data)``, two lists of tensor pairs.

    Note:
        The global NumPy RNG is re-seeded with 42 so the shuffle is the same
        on every call.
    """
    # Shuffle dataset
    n = len(pairs)
    indices = np.arange(n)
    np.random.seed(42)
    np.random.shuffle(indices)

    # Convert to tensors in shuffled order
    tensor_pairs = [pair_to_tensor(pairs[i], vocab) for i in indices]

    # Split dataset
    split_idx = int(split_ratio * n)
    return tensor_pairs[:split_idx], tensor_pairs[split_idx:]

In [0]:
def save_dataset(train_data, val_data, vocab):
    """Pickle ``(train_data, val_data, vocab)`` to ``picked_dataset_path``.

    The file is written through a context manager so the handle is flushed and
    closed even if pickling fails.
    """
    # Local import: the notebook's module-level `import pickle` sits in a later
    # cell, so this function must not depend on that cell having been run.
    import pickle

    dataset = (train_data, val_data, vocab)
    with open(picked_dataset_path, "wb") as dataset_file:
        pickle.dump(dataset, dataset_file)

# Model Architecture

In [0]:
# ENCODER NETWORK
class EncoderRNN(nn.Module):
    """Single-layer GRU encoder that consumes one token per call.

    Args:
        input_size: number of rows of the embedding table (vocabulary size).
        hidden_size: embedding width and GRU hidden width.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()

        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Embed one token and advance the GRU by one step.

        The embedding is reshaped to ``(1, 1, hidden_size)``, i.e. a sequence
        of length one with batch size one.

        Returns:
            ``(output, hidden)`` exactly as returned by the GRU.
        """
        step_input = self.embedding(input).view(1, 1, -1)
        return self.gru(step_input, hidden)

    def init_hidden_state(self):
        """Return an all-zero hidden state of shape ``(1, 1, hidden_size)``."""
        state_shape = (1, 1, self.hidden_size)
        return torch.zeros(state_shape, device=device)

# DECODER NETWORK
class AttentionDecoderRNN(nn.Module):
    """Single-layer GRU decoder with attention over a fixed-size encoder buffer.

    Args:
        hidden_size: embedding width and GRU hidden width.
        output_size: vocabulary size (embedding rows and output logits).
        dropout_p: dropout probability applied to the token embedding.
        max_length: number of encoder positions the attention layer scores;
            ``encoder_outputs`` passed to ``forward`` must have this many rows.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super().__init__()

        # Dimensions
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        # Layers (attribute names are the keys of the saved state dict)
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attention = nn.Linear(hidden_size * 2, max_length)
        self.attention_combine = nn.Linear(hidden_size * 2, hidden_size)
        self.dropout = nn.Dropout(dropout_p)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Decode one step.

        Args:
            input: index of the previous token (a single-element tensor).
            hidden: previous hidden state, shape ``(1, 1, hidden_size)``.
            encoder_outputs: tensor of shape ``(max_length, hidden_size)``.

        Returns:
            ``(log_probs, hidden, attention_weights)`` where ``log_probs`` has
            shape ``(1, output_size)`` and ``attention_weights`` has shape
            ``(1, max_length)``.
        """
        # Embed the previous token as a (1, 1, hidden_size) tensor, with dropout
        embedded = self.dropout(self.embedding(input).view(1, 1, -1))

        # Score every encoder position from [embedding ; hidden state]
        attention_input = torch.cat((embedded[0], hidden[0]), 1)
        attention_weights = F.softmax(self.attention(attention_input), dim=1)

        # Weighted sum of the encoder outputs
        context = torch.bmm(attention_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))

        # Merge embedding and context, then ReLU
        merged = torch.cat((embedded[0], context[0]), 1)
        gru_input = F.relu(self.attention_combine(merged).unsqueeze(0))

        # One GRU step
        output, hidden = self.gru(gru_input, hidden)

        # Log-probabilities over the vocabulary
        log_probs = F.log_softmax(self.out(output[0]), dim=1)

        return log_probs, hidden, attention_weights

    def init_hidden_state(self):
        """Return an all-zero hidden state of shape ``(1, 1, hidden_size)``."""
        state_shape = (1, 1, self.hidden_size)
        return torch.zeros(state_shape, device=device)

In [0]:
# Scratch cell: unpack the first training pair for inspection.
# NOTE(review): `train_data` is only assigned by the load_dataset cell in the
# "Start Here" section below, so this cell fails on a fresh top-to-bottom run.
src, tar = train_data[0]

In [0]:
# Scratch cell: shape of the first time step of the source tensor.
# NOTE(review): relies on `src` from the previous scratch cell.
src[0].shape

torch.Size([1])

In [0]:
# Build an encoder/decoder pair and their Adam optimizers.
# NOTE(review): this cell needs `vocab`, which is only assigned in the
# "Start Here" section below; the training cells use encoder1 / decoder1 from
# get_model() instead, so the objects created here are not used by them.
hidden_size = 256
encoder = EncoderRNN(vocab.n_words, hidden_size).to(device)
decoder = AttentionDecoderRNN(hidden_size, vocab.n_words, dropout_p=0.5).to(device)

# Init optimizers
enc_opt = optim.Adam(encoder.parameters(), lr=0.001)
dec_opt = optim.Adam(decoder.parameters(), lr=0.001)

# Unrolling 

In [0]:
def rnn_unrolling(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):
    """Run one training step (forward, backward, optimizer update) on one pair.

    The encoder is unrolled over every row of ``input_tensor``; the decoder is
    then unrolled over ``target_tensor`` starting from ``SOS_token`` and the
    last encoder hidden state, accumulating ``criterion`` at every step.

    Args:
        input_tensor: source token indices, indexed along dim 0 (one token per
            step). At most ``MAX_LENGTH`` rows fit in the encoder buffer.
        target_tensor: target token indices, indexed along dim 0.
        encoder, decoder: the models to train.
        encoder_optimizer, decoder_optimizer: their optimizers.
        criterion: per-step loss, called as ``criterion(log_probs, target)``.

    Returns:
        The summed loss divided by ``target_tensor.size(0)`` (this length
        includes any padding rows), or ``0.0`` if no target token contributed.

    Raises:
        IndexError: if ``input_tensor`` has more than ``MAX_LENGTH`` rows.
    """
    loss = 0

    # Reset gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Sequence lengths
    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    # Index the criterion ignores (PAD for the losses built in this notebook).
    # A decoding step whose only target is ignored has nothing to average;
    # with mean reduction that yields NaN in recent PyTorch releases and would
    # poison the gradients, so decoding stops at the first such target.
    ignored_index = getattr(criterion, "ignore_index", None)

    # Encoder hidden state and output buffer start as zeros
    encoder_hidden = encoder.init_hidden_state()
    encoder_outputs = torch.zeros(MAX_LENGTH, encoder.hidden_size, device=device)

    # Unroll the encoder
    for t in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[t], encoder_hidden)
        encoder_outputs[t] = encoder_output[0, 0]

    # The decoder starts from SOS and the encoder's last hidden state
    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden

    # Unroll the attention decoder
    for dt in range(target_length):
        target_token = target_tensor[dt]

        # Padding is appended after the real tokens, so the first ignored
        # target marks the end of the sentence.
        if ignored_index is not None and target_token.item() == ignored_index:
            break

        decoder_output, decoder_hidden, decoder_att = decoder(decoder_input, decoder_hidden, encoder_outputs)
        loss += criterion(decoder_output, target_token)

        # Index of the most probable token
        _, argmax = decoder_output.topk(1)
        predicted_next_output = argmax.squeeze().detach()

        # Teacher forcing with probability 0.5: feed the ground-truth token
        # instead of the model's own prediction
        teacher_forcing = np.random.random() > 0.5
        if teacher_forcing:
            decoder_input = target_token
        else:
            decoder_input = predicted_next_output

        # Stop unrolling once the model predicts EOS
        if predicted_next_output.item() == EOS_token:
            break

    # Nothing contributed to the loss (empty or all-padding target): there is
    # no graph to backpropagate through, so skip the update.
    if not torch.is_tensor(loss):
        return 0.0

    # Backpropagate and update both models
    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [0]:
import time
import math

def as_minutes(s):
    """Format a duration of ``s`` seconds as ``'<minutes>m <seconds>s'``."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)


def time_since(since, percent):
    """Report elapsed time and a linear estimate of the time remaining.

    Args:
        since: start timestamp as returned by ``time.time()``.
        percent: fraction of the work done so far; must be non-zero.

    Returns:
        ``'<elapsed> (- <remaining>)'`` with both parts formatted by
        ``as_minutes``.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (as_minutes(elapsed), as_minutes(remaining))

In [0]:
def train_one_epoch(pairs, encoder, decoder, start, print_every=1000, plot_every=100, learning_rate=0.01):
    """Train on every pair once, printing the running average loss.

    Args:
        pairs: sequence of ``(input_tensor, target_tensor)`` tuples.
        encoder, decoder: the models to train.
        start: ``time.time()`` timestamp used for the progress report.
        print_every: a progress line is printed each time this many pairs
            have been processed.
        plot_every, learning_rate: accepted but not used.

    Note:
        The optimizers are not parameters: the module-level
        ``encoder_optimizer`` and ``decoder_optimizer`` are used.
    """
    # Loss function; PAD targets are ignored
    criterion = nn.NLLLoss(ignore_index=PAD_token)
    n_iters = len(pairs)

    running_loss = 0  # reset after every progress line
    for step, (input_tensor, target_tensor) in enumerate(pairs, 1):
        running_loss += rnn_unrolling(
            input_tensor, target_tensor, encoder, decoder,
            encoder_optimizer, decoder_optimizer, criterion,
        )

        if step % print_every != 0:
            continue

        average_loss = running_loss / print_every
        progress = step / n_iters
        print('%s (%d %d%%) %.4f' % (time_since(start, progress), step, step / n_iters * 100, average_loss))
        running_loss = 0


# Evaluation 

In [0]:
from torchnlp.metrics import get_moses_multi_bleu

def compute_bleu_score(vocab, target_tensor, pred_tensor):
    """Compute the BLEU score of one predicted sentence against its target.

    Both token sequences are decoded to space-separated strings (PAD entries
    skipped) and handed to ``get_moses_multi_bleu`` as one-element lists.

    ``tensor_to_string`` already returns a whole sentence, so the strings are
    passed on as they are. Joining them with ``' '`` again, as this function
    used to do, iterates over the characters of the string and scores
    single letters instead of words.

    Args:
        vocab: vocabulary used to decode indices to words.
        target_tensor: reference token indices.
        pred_tensor: predicted token indices.

    Returns:
        Whatever ``get_moses_multi_bleu`` returns for the pair.
    """
    reference = tensor_to_string(target_tensor, vocab, ignore_index=PAD_token)
    hypothesis = tensor_to_string(pred_tensor, vocab, ignore_index=PAD_token)

    # Compute BLEU score with the official BLEU perl script
    return get_moses_multi_bleu([hypothesis], [reference], lowercase=True)

def evaluate(encoder, decoder, input_tensor):
    """Greedily decode ``input_tensor`` without tracking gradients.

    Both models are switched to eval mode for the duration of the call so the
    decoder's dropout layer is disabled and the prediction is deterministic;
    their previous train/eval mode is restored afterwards.

    Args:
        encoder, decoder: the models to run.
        input_tensor: source token indices, indexed along dim 0. At most
            ``MAX_LENGTH`` rows fit in the encoder buffer.

    Returns:
        ``(decoded_words, attentions)`` where ``decoded_words`` is a list of
        the argmax index tensors (one per decoding step, the final EOS
        included when it was produced) and ``attentions`` holds one row of
        attention weights per decoding step.
    """
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            input_length = input_tensor.size()[0]
            encoder_hidden = encoder.init_hidden_state()
            encoder_outputs = torch.zeros(MAX_LENGTH, encoder.hidden_size, device=device)

            # Unroll the encoder
            for t in range(input_length):
                encoder_output, encoder_hidden = encoder(input_tensor[t], encoder_hidden)
                encoder_outputs[t] += encoder_output[0, 0]

            # The decoder starts from SOS and the encoder's last hidden state
            decoder_input = torch.tensor([[SOS_token]], device=device)
            decoder_hidden = encoder_hidden

            decoded_words = []
            decoder_attentions = torch.zeros(MAX_LENGTH, MAX_LENGTH)

            # Unroll the decoder until EOS is the most probable token
            for dt in range(MAX_LENGTH):
                decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
                decoder_attentions[dt] = decoder_attention

                # Index of the most probable token (argmax)
                _, argmax = decoder_output.topk(1)
                decoded_words.append(argmax)

                if argmax.item() == EOS_token:
                    break

                # Feed the prediction back in as the next input
                decoder_input = argmax.squeeze().detach()

            return decoded_words, decoder_attentions[:dt + 1]
    finally:
        # Leave the models in the mode the caller had them in
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

# Train Loop

In [0]:
def train_loop(n_epochs=5):
    """Train ``encoder1`` / ``decoder1`` on ``train_data`` for ``n_epochs`` epochs.

    The training pairs are visited in a fresh random order every epoch.
    (Previously the shuffled indices were computed but never used, so every
    epoch saw the data in the same order.)

    Note:
        ``start`` is taken once before the first epoch, so the elapsed time
        printed by ``train_one_epoch`` is measured from the start of training,
        not from the start of the current epoch.
    """
    start = time.time()
    for epoch in range(n_epochs):
        # Shuffle indices and reorder the data accordingly
        indices = np.arange(len(train_data))
        np.random.shuffle(indices)
        shuffled_data = [train_data[i] for i in indices]

        # Train for one epoch on the shuffled data
        train_one_epoch(shuffled_data, encoder1, decoder1, start, print_every=1000)

# Save Model

In [0]:
def save_model_params(encoder, decoder):
    """Save both models' state dicts to ``encoder_path`` and ``decoder_path``.

    The shared path constants are used (rather than rebuilding the file names
    here) so the files written are always the ones ``load_model`` reads.
    """
    torch.save(encoder.state_dict(), encoder_path)
    torch.save(decoder.state_dict(), decoder_path)

In [0]:
import pickle

def move_data_to_device(data):
    """Return a new list of ``(src, tar)`` pairs with both tensors on ``device``."""
    moved = []
    for src, tar in data:
        moved.append((src.to(device), tar.to(device)))
    return moved

# Pad AFTER Loading, since otherwise we are saving a lot of zeros on disk
def pad_pairs(tensor_pairs):
    """Zero-pad all sources to one common length and all targets to another.

    Sources are padded to the longest source in ``tensor_pairs`` and targets
    to the longest target, using ``pad_sequence`` (padding value 0).

    Returns:
        A list of ``(padded_src, padded_tar)`` tuples in the original order.
    """
    sources, targets = [], []
    for source, target in tensor_pairs:
        sources.append(source)
        targets.append(target)

    padded_sources = pad_sequence(sources, batch_first=True)
    padded_targets = pad_sequence(targets, batch_first=True)
    return [(src, tar) for src, tar in zip(padded_sources, padded_targets)]

def load_dataset(from_scratch=False):
    """Load the dataset, move it to ``device`` and pad it.

    Args:
        from_scratch: when true, rebuild the dataset from the raw text file
            (first 10000 lines) with ``prepare_data`` / ``make_dataset``;
            otherwise unpickle it from ``picked_dataset_path``.

    Returns:
        ``(train_data, val_data, vocab)`` where the two data lists hold padded
        ``(src, tar)`` tensor pairs on ``device``.

    Side effects:
        With ``from_scratch=True`` the module-level ``vocab`` is (re)bound to
        the freshly built vocabulary.
    """
    if from_scratch:
        vocab, pairs = prepare_data(10000)
        # make_dataset -> pair_to_tensor looks the vocabulary up at module
        # level, so publish the fresh one there first; otherwise this branch
        # raises NameError on a fresh kernel or silently uses a stale vocab.
        globals()["vocab"] = vocab
        train_data, val_data = make_dataset(pairs)
    else:
        # NOTE(security): unpickling can execute arbitrary code; only load
        # dataset files that this notebook wrote itself.
        with open(picked_dataset_path, "rb") as dataset_file:
            train_data, val_data, vocab = pickle.load(dataset_file)

    # Move data to device (gpu or cpu)
    train_data = move_data_to_device(train_data)
    val_data = move_data_to_device(val_data)

    # Pad pairs
    train_data = pad_pairs(train_data)
    val_data = pad_pairs(val_data)

    return train_data, val_data, vocab


# Start Here

In [0]:
# Load the pickled dataset; this defines the module-level `train_data`,
# `val_data` and `vocab` that the model and training cells rely on.
train_data, val_data, vocab = load_dataset(from_scratch=False)

In [0]:
def load_model():
    """Rebuild the encoder/decoder from the saved state dicts.

    The state dicts are read from ``encoder_path`` / ``decoder_path`` and
    mapped onto ``device``; the models are sized from the module-level
    ``vocab`` with a hidden size of 256.

    Returns:
        ``(enc, dec, encoder_optimizer, decoder_optimizer)`` where both
        optimizers are fresh Adam instances with ``lr=0.001``.
    """
    # Read the trained parameters
    target_device = torch.device(device)
    encoder_dict = torch.load(encoder_path, map_location=target_device)
    decoder_dict = torch.load(decoder_path, map_location=target_device)

    # Build untrained models of the matching size
    hidden_size = 256
    vocab_size = vocab.n_words
    enc = EncoderRNN(vocab_size, hidden_size).to(device)
    dec = AttentionDecoderRNN(hidden_size, vocab_size, dropout_p=0.5).to(device)

    # Copy the trained weights into them
    enc.load_state_dict(encoder_dict)
    dec.load_state_dict(decoder_dict)

    # Fresh Adam optimizers
    encoder_optimizer = optim.Adam(enc.parameters(), lr=0.001)
    decoder_optimizer = optim.Adam(dec.parameters(), lr=0.001)

    return enc, dec, encoder_optimizer, decoder_optimizer

def get_model(load_pretrained=False):
    """Return models, optimizers and loss, either pretrained or freshly built.

    Args:
        load_pretrained: when true the models come from ``load_model``;
            otherwise new models (hidden size 256, decoder dropout 0.5) and
            Adam optimizers (``lr=0.001``) are created.

    Returns:
        ``(enc, dec, enc_opt, dec_opt, criterion)`` where ``criterion`` is an
        ``NLLLoss`` that ignores ``PAD_token`` targets.
    """
    # Loss function; PAD targets are ignored
    criterion = nn.NLLLoss(ignore_index=PAD_token)

    if load_pretrained:
        return (*load_model(), criterion)

    hidden_size = 256
    vocab_size = vocab.n_words
    enc = EncoderRNN(vocab_size, hidden_size).to(device)
    dec = AttentionDecoderRNN(hidden_size, vocab_size, dropout_p=0.5).to(device)

    # Optimizers
    enc_opt = optim.Adam(enc.parameters(), lr=0.001)
    dec_opt = optim.Adam(dec.parameters(), lr=0.001)

    return enc, dec, enc_opt, dec_opt, criterion

In [0]:
# Build untrained models, optimizers and loss (pass True to load saved weights).
model_obj = get_model(False)

In [0]:
# Unpack into the module-level names that train_loop (encoder1, decoder1) and
# train_one_epoch (encoder_optimizer, decoder_optimizer) read as globals.
encoder1, decoder1, encoder_optimizer, decoder_optimizer, criterion = model_obj

In [0]:
# Sanity check: vocabulary index of the word 'dog' (KeyError if it is absent).
vocab.word2index['dog']

10938

In [0]:
# Train for two epochs; a progress line is printed every 1000 pairs.
train_loop(2)

0m 35s (- 46m 12s) (1000 1%) 1.9006
1m 9s (- 44m 53s) (2000 2%) 1.7982
1m 43s (- 44m 9s) (3000 3%) 1.7588
2m 17s (- 43m 41s) (4000 5%) 1.6283
2m 52s (- 43m 7s) (5000 6%) 1.5845
3m 27s (- 42m 35s) (6000 7%) 1.5588
4m 1s (- 42m 2s) (7000 8%) 1.4831
4m 37s (- 41m 37s) (8000 10%) 1.4839
5m 12s (- 41m 8s) (9000 11%) 1.4534
5m 48s (- 40m 38s) (10000 12%) 1.4115
6m 23s (- 40m 4s) (11000 13%) 1.3834
6m 58s (- 39m 31s) (12000 15%) 1.3922
7m 33s (- 38m 59s) (13000 16%) 1.3451
8m 8s (- 38m 24s) (14000 17%) 1.3402
8m 44s (- 37m 51s) (15000 18%) 1.2990
9m 19s (- 37m 17s) (16000 20%) 1.3014
9m 54s (- 36m 43s) (17000 21%) 1.2904
10m 31s (- 36m 13s) (18000 22%) 1.2916
11m 7s (- 35m 42s) (19000 23%) 1.3093
11m 42s (- 35m 7s) (20000 25%) 1.2683
12m 18s (- 34m 33s) (21000 26%) 1.2814
12m 53s (- 33m 58s) (22000 27%) 1.3093
13m 28s (- 33m 23s) (23000 28%) 1.2299
14m 3s (- 32m 48s) (24000 30%) 1.2431
14m 38s (- 32m 13s) (25000 31%) 1.2534
15m 14s (- 31m 38s) (26000 32%) 1.2518
15m 49s (- 31m 3s) (27000 33%)

In [0]:
# Persist the trained weights to Google Drive.
save_model_params(encoder1, decoder1)