In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Plot styling: seaborn's "whitegrid" theme and a 20x20 default figure size.
sns.set_style("whitegrid")
plt.rcParams.update({"figure.figsize": (20, 20)})

import re
import os
import io
import nltk
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook as tqdm
from nltk import word_tokenize, sent_tokenize
from sklearn.model_selection import train_test_split

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# Download NLTK's "punkt" resource.
nltk.download("punkt")

# Use CUDA when torch reports it as available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# data

In [None]:
base_path = "/mnt/efs/wikipedia/dumps/text/"
# One sub-directory of the dump is sampled at random. The draw is unseeded, so
# a different directory may be picked on every run.
paths = np.random.choice(os.listdir(base_path), size=1)

# Collect the per-file text in a list and join once, instead of growing a
# single string with `+=` inside the loop.
file_texts = []
for path in paths:
    directory = os.path.join(base_path, path)
    for filename in tqdm(os.listdir(directory)):
        with open(os.path.join(directory, filename), "rb") as f:
            # NOTE(review): latin1 decoding never fails, but it will garble
            # non-ASCII characters if the dump is actually UTF-8 - confirm.
            file_texts.append(f.read().decode("latin1"))
all_text = "".join(file_texts)
del file_texts

# Capture everything between a `<doc ...>` opening tag and the next `</doc>`.
# `[\s\S]` matches any character (newlines included), which is what the earlier
# `(.|\s|\S)` alternation did, but without the redundant alternatives and
# without the extra inner capture group, so `findall` returns the article
# bodies directly.
pattern = r"<doc.+>([\s\S]*?)</doc>"
articles = re.findall(pattern, all_text)

### cleaning pipeline

In [None]:
def tokenize(sentence):
    """
    Split `sentence` into a list of tokens using NLTK's `word_tokenize`.

    A standalone `n't` token is re-attached so that e.g. "do n't" becomes
    "don 't" (presumably to mimic Moses-style tokenisation - confirm against
    the word-vector vocabulary).

    Parameters
    ----------
    sentence : str
        Raw sentence text.

    Returns
    -------
    list of str
        The tokens, in order.
    """
    # The trailing space lets the " n't " pattern also match a contraction
    # that is the very last token; `split()` drops the padding again.
    seq = " ".join(word_tokenize(sentence)) + " "
    seq = seq.replace(" n't ", "n 't ")
    return seq.split()


def label_linkable_tokens(sentence, label_all=True):
    """
    Tokenise an HTML sentence and mark which tokens belong to link text.

    The sentence is parsed with BeautifulSoup; the text of every `<a>` element
    is tokenised and searched for (with `kmp`) inside the tokenised plain text
    of the sentence.

    Parameters
    ----------
    sentence : str
        Sentence that may contain `<a>...</a>` markup.
    label_all : bool, default True
        If true, every occurrence of a link's token sequence is labelled.
        Otherwise only the first occurrence of each link is labelled.

    Returns
    -------
    tokenised_text : list of str
        Tokens of the sentence with markup removed.
    target_sequence : numpy.ndarray
        Array of shape (len(tokenised_text), 1) holding 1 for tokens that are
        part of a matched link and 0 elsewhere.
    """
    parsed_html = BeautifulSoup(sentence, "html.parser")

    tokenised_text = tokenize(parsed_html.text)
    target_sequence = np.zeros(len(tokenised_text))

    for anchor in parsed_html.find_all("a"):
        link = tokenize(anchor.text)
        if not link:
            # An anchor without any text has nothing to match. Skipping it
            # keeps the rest of the sentence usable instead of failing inside
            # the subsequence search.
            continue

        start_positions = kmp(tokenised_text, link)
        if not label_all:
            start_positions = start_positions[:1]
        for pos in start_positions:
            target_sequence[pos : pos + len(link)] = 1

    return tokenised_text, target_sequence.reshape(-1, 1)


def kmp(sequence, sub):
    """
    Knuth-Morris-Pratt subsequence search.

    Parameters
    ----------
    sequence : sequence
        The larger sequence to search in (e.g. a list of tokens).
    sub : sequence
        The subsequence to look for.

    Returns
    -------
    list of int
        Start indices of the non-overlapping occurrences of `sub` in
        `sequence`, in increasing order. Empty when there is no match or when
        `sub` is empty.
    """
    # An empty pattern has no meaningful match position; without this guard
    # the comparisons against sub[0] below raise IndexError.
    if len(sub) == 0:
        return []

    # Failure table: partial[i] is the length of the longest proper prefix of
    # sub[: i + 1] that is also a suffix of it.
    partial = [0]
    for i in range(1, len(sub)):
        k = partial[i - 1]
        while k > 0 and sub[k] != sub[i]:
            k = partial[k - 1]
        partial.append(k + 1 if sub[k] == sub[i] else k)

    positions = []
    matched = 0
    for i, item in enumerate(sequence):
        while matched > 0 and item != sub[matched]:
            matched = partial[matched - 1]
        if item == sub[matched]:
            matched += 1
        if matched == len(sub):
            positions.append(i - (matched - 1))
            # Restart from scratch so that reported matches never overlap.
            matched = 0

    return positions

In [None]:
token_sequences, target_sequences = [], []
n_skipped_sentences = 0

for article in tqdm(articles):
    for sentence in sent_tokenize(article):
        try:
            tokenized_sentence, target_sequence = label_linkable_tokens(sentence)
        except Exception:
            # Best effort: a sentence that cannot be parsed or labelled is
            # skipped, but counted so that silent data loss is visible.
            # `Exception` (rather than a bare `except`) lets KeyboardInterrupt
            # stop the loop.
            n_skipped_sentences += 1
            continue
        token_sequences.append(tokenized_sentence)
        target_sequences.append(target_sequence)

print("skipped {} sentences that could not be labelled".format(n_skipped_sentences))

# fastText vectors and a word-vector embedding matrix

In [None]:
wv_path = "/mnt/efs/text/word_vectors/wiki-news-300d-1M.vec"

# The context manager closes the file as soon as its lines have been read.
with io.open(wv_path, "r", encoding="utf-8", newline="\n", errors="ignore") as wv_file:
    # The first line is skipped (presumably the header of the .vec file).
    lines_to_parse = list(wv_file)[1:]

# Map each token (first whitespace-separated field of a line) to the float32
# vector formed by the remaining fields. Each line is split only once.
fasttext = {}
for line in tqdm(lines_to_parse):
    token, *values = line.split()
    fasttext[token] = np.array(values).astype(np.float32)

In [None]:
# Unique tokens across all sentences. Sorting gives every token a stable
# position: the iteration order of a set of strings can change between
# interpreter sessions, which would silently change every token index derived
# from this list.
article_vocabulary = sorted({tok for seq in token_sequences for tok in seq})

In [None]:
special_cases = ["xxunk", "xxpad"]

# Each special token gets a random 300-dimensional vector.
for case in special_cases:
    fasttext[case] = np.random.random(300)

# Append only the special tokens that are not in the vocabulary yet, so that
# re-running this cell does not add duplicate entries.
known_tokens = set(article_vocabulary)
missing_cases = [case for case in special_cases if case not in known_tokens]
article_vocabulary = np.append(article_vocabulary, np.array(missing_cases, dtype=str))

In [None]:
# Lookup tables between vocabulary tokens and their position in
# `article_vocabulary`: index -> token first, then the inverse mapping.
index_to_token = dict(enumerate(article_vocabulary))
token_to_index = {token: index for index, token in index_to_token.items()}

In [None]:
# One row per vocabulary entry, in vocabulary order; tokens without a known
# vector share the "xxunk" vector.
unknown_vector = fasttext["xxunk"]
embedding_rows = np.stack(
    [fasttext.get(token, unknown_vector) for token in article_vocabulary]
)

# Stacking into a single ndarray first avoids building the tensor from a
# Python list of separate arrays, which is very slow for a large vocabulary.
word_vector_embedding_matrix = torch.as_tensor(embedding_rows, dtype=torch.float32)

# dataset and dataloader

In [None]:
class SentenceDataset(Dataset):
    """
    Dataset of (token index sequence, target sequence, length) triples.

    Sentences whose target sequence has 3 or fewer entries are dropped at
    construction time.

    Parameters
    ----------
    token_sequences : sequence of sequences of str
        Tokenised sentences; every token must be a key of the module-level
        `token_to_index` mapping.
    target_sequences : sequence of sequences
        Per-token labels, aligned with `token_sequences`.
    """

    def __init__(self, token_sequences, target_sequences):
        # impose length constraint
        kept = [
            (self.indexify(tokens), targets)
            for tokens, targets in zip(token_sequences, target_sequences)
            if len(targets) > 3
        ]

        # Sentences have different lengths, so they are stored in 1-D object
        # arrays. Calling `np.array` directly on ragged input raises
        # ValueError on recent NumPy releases.
        self.token_index_sequences = self._as_object_array(
            [indexes for indexes, _ in kept]
        )
        self.target_sequences = self._as_object_array(
            [targets for _, targets in kept]
        )

    @staticmethod
    def _as_object_array(items):
        """Pack `items` into a 1-D object array holding one element per item."""
        packed = np.empty(len(items), dtype=object)
        for position, item in enumerate(items):
            packed[position] = item
        return packed

    def __getitem__(self, index):
        """Return (token index sequence, target sequence, sequence length)."""
        token_index_sequence = self.token_index_sequences[index]
        target_sequence = self.target_sequences[index]
        length = len(token_index_sequence)
        return token_index_sequence, target_sequence, length

    def __len__(self):
        """Number of sentences that passed the length constraint."""
        return len(self.token_index_sequences)

    def indexify(self, token_sequence):
        """Map tokens to their integer ids via the global `token_to_index`."""
        index_sequence = np.array([token_to_index[token] for token in token_sequence])
        return index_sequence

In [None]:
def pad(sequences, pad_value):
    """
    Right-pad variable-length sequences into one 2-D array.

    Parameters
    ----------
    sequences : sequence of array-like
        Each element has shape (n,) or (n, 1).
    pad_value : scalar
        Value written to positions after the end of each sequence; it also
        determines the dtype of the returned array.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(sequences), longest sequence length). Row i holds
        sequence i in its first len(sequence) columns, followed by padding.
    """
    pad_length = max((len(seq) for seq in sequences), default=0)
    padded = np.full((len(sequences), pad_length), pad_value)
    for row, seq in enumerate(sequences):
        # Real values go at the start of the row: the model packs each row
        # with pack_padded_sequence(..., batch_first=True), which reads the
        # first `length` steps of every row.
        padded[row, : len(seq)] = np.asarray(seq).reshape(-1)
    # The array is built 2-D directly. A bare `squeeze()` would also remove
    # the batch axis when a batch contains a single sequence.
    return padded


def collate_fn(batch):
    """
    Collate dataset items into one padded batch, longest sequence first.

    Parameters
    ----------
    batch : list of (token_index_sequence, target_sequence, length)
        Items as returned by `SentenceDataset.__getitem__`.

    Returns
    -------
    padded_indexes : numpy.ndarray
        Token indices padded with the index of "xxpad".
    padded_targets : numpy.ndarray
        Targets padded with 0.
    sorted_lengths : torch.Tensor
        Sequence lengths in descending order, matching the row order of the
        two padded arrays.
    """
    indexes, targets, lengths = zip(*batch)

    sorted_lengths, sort_indicies = torch.Tensor(lengths).sort(dim=0, descending=True)

    # Reorder with plain Python indexing. Converting the ragged tuples with
    # `np.array(...)` raises on recent NumPy releases, and it relied on
    # indexing a NumPy array with a torch tensor.
    order = sort_indicies.tolist()
    sorted_indexes = [indexes[position] for position in order]
    sorted_targets = [targets[position] for position in order]

    padded_indexes = pad(sorted_indexes, token_to_index["xxpad"])
    padded_targets = pad(sorted_targets, 0)

    return padded_indexes, padded_targets, sorted_lengths

In [None]:
# Hold out 20% of the sentences for testing. The fixed random_state makes the
# split repeatable for identical inputs.
split = train_test_split(
    token_sequences, target_sequences, test_size=0.20, random_state=42
)
train_token_sequences, test_token_sequences = split[0], split[1]
train_target_sequences, test_target_sequences = split[2], split[3]

In [None]:
# Training set and its loader: shuffled batches of 64 sentences, collated
# (sorted and padded) by `collate_fn`, loaded by 5 worker processes.
train_dataset = SentenceDataset(
    token_sequences=train_token_sequences,
    target_sequences=train_target_sequences,
)

train_loader = DataLoader(
    train_dataset,
    batch_size=64,
    shuffle=True,
    num_workers=5,
    collate_fn=collate_fn,
)

In [None]:
# Held-out set and its loader, configured exactly like the training loader
# (batches of 64, shuffled, 5 workers, same collate function).
test_dataset = SentenceDataset(
    token_sequences=test_token_sequences,
    target_sequences=test_target_sequences,
)

test_loader = DataLoader(
    test_dataset,
    batch_size=64,
    shuffle=True,
    num_workers=5,
    collate_fn=collate_fn,
)

# models

In [None]:
class LinkLabeller(nn.Module):
    """
    Per-token binary tagger: pretrained embeddings -> 2-layer BiLSTM -> MLP head.

    Parameters
    ----------
    word_vectors : torch.FloatTensor
        2-D matrix of pretrained embeddings, one row per vocabulary index.
    hidden_dim : int, default 1024
        Hidden size of each LSTM direction. The head's bottleneck width is
        `hidden_dim // 32`.
    """

    def __init__(self, word_vectors, hidden_dim=1024):
        super(LinkLabeller, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding.from_pretrained(word_vectors)
        self.enc_lstm = nn.LSTM(
            # Take the width from the embedding matrix instead of assuming
            # 300-dimensional vectors.
            input_size=self.embedding.embedding_dim,
            hidden_size=self.hidden_dim,
            num_layers=2,
            bidirectional=True,
            dropout=0.2,
        )

        self.head = nn.Sequential(
            # The BiLSTM concatenates both directions, hence hidden_dim * 2.
            nn.Linear(self.hidden_dim * 2, self.hidden_dim // 32),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(self.hidden_dim // 32, 2),
        )

    def forward(self, index_sequence, sequence_lengths):
        """
        Score every token of a padded batch.

        Parameters
        ----------
        index_sequence : torch.LongTensor
            Batch-first tensor of token indices; real tokens come first in
            each row and rows are ordered by decreasing length.
        sequence_lengths : torch.Tensor or list of int
            Number of real tokens in each row.

        Returns
        -------
        torch.Tensor
            Batch-first tensor whose last dimension holds the 2 class scores
            for each position.
        """
        word_vectors = self.embedding(index_sequence)

        # pack_padded_sequence requires the lengths on the CPU even when the
        # batch itself lives on the GPU.
        if torch.is_tensor(sequence_lengths):
            sequence_lengths = sequence_lengths.cpu()

        packed = pack_padded_sequence(word_vectors, sequence_lengths, batch_first=True)

        embedded_packed, _ = self.enc_lstm(packed)

        embedded, _ = pad_packed_sequence(embedded_packed, batch_first=True)

        categorised = self.head(embedded)
        return categorised

In [None]:
# model_path = '/mnt/efs/models/20180114_link_labeller.pt'
# Build the tagger on top of the embedding matrix, then move it to `device`.
model = LinkLabeller(word_vectors=word_vector_embedding_matrix)
model = model.to(device)

# training

In [None]:
# Class weights from label frequencies over all target sequences: class 0 is
# weighted by the share of 1-labels and class 1 by the share of 0-labels.
stacked = np.vstack(target_sequences)
b = stacked.sum()  # number of tokens labelled 1
a = len(stacked) - b  # number of tokens labelled 0
class_weights = torch.Tensor([b, a]) / (b + a)

In [None]:
# Per-batch training losses; `train` appends to this list.
losses = []

torch.backends.cudnn.benchmark = True

# Only parameters that require gradients are handed to the optimiser.
trainable_parameters = [p for p in model.parameters() if p.requires_grad]

optimiser = optim.Adam(trainable_parameters, lr=0.0001)

# Put the class weights on the device the model lives on, rather than
# unconditionally on CUDA, so this cell also runs when the notebook has
# fallen back to the CPU.
model_device = next(model.parameters()).device
loss_function = nn.CrossEntropyLoss(weight=class_weights.to(model_device))

In [None]:
def train(model, train_loader, loss_function, optimiser, n_epochs):
    """
    Train `model` for `n_epochs` passes over `train_loader`.

    Every batch loss is appended to the module-level `losses` list, and the
    progress bar shows the mean of the last 100 recorded losses.

    Parameters
    ----------
    model : nn.Module
        Called as `model(indexes, sequence_lengths)`. Its output is permuted
        with (0, 2, 1) before being passed to the loss.
    train_loader : iterable
        Yields (indexes, targets, sequence_lengths) batches.
    loss_function : callable
        Called as `loss_function(preds, targets)`.
    optimiser : torch.optim.Optimizer
        Optimiser that updates the model's parameters.
    n_epochs : int
        Number of passes over `train_loader`.
    """
    # Follow the model's own device instead of assuming CUDA.
    model_device = next(model.parameters()).device

    model.train()
    for epoch in range(n_epochs):
        loop = tqdm(train_loader)
        for indexes, targets, sequence_lengths in loop:
            indexes = torch.as_tensor(indexes, dtype=torch.long).to(
                model_device, non_blocking=True
            )
            targets = torch.as_tensor(targets, dtype=torch.long).to(
                model_device, non_blocking=True
            )

            optimiser.zero_grad()
            # Move the class scores to dimension 1 for the loss function.
            preds = model(indexes, sequence_lengths).permute(0, 2, 1)

            loss = loss_function(preds, targets)
            loss.backward()
            optimiser.step()

            losses.append(loss.item())
            loop.set_description("Epoch {}/{}".format(epoch + 1, n_epochs))
            loop.set_postfix(loss=np.mean(losses[-100:]))

In [None]:
# Run three epochs of training with the objects configured above.
train(model, train_loader, loss_function, optimiser, n_epochs=3)

In [None]:
# Drop the first 20 recorded losses, then smooth with a 300-step rolling mean.
recorded_losses = pd.Series(losses[20:])
loss_data = recorded_losses.rolling(window=300).mean()
ax = loss_data.plot()
# ax.set_ylim(0.4, 0.5);

# test the model on unseen data

In [None]:
# Draw indices from the dataset itself. `test_dataset` drops short sentences,
# so it can be shorter than `test_token_sequences`, and indices drawn from
# the unfiltered list could be out of range.
random_ixs = np.random.randint(len(test_dataset), size=30)

model_device = next(model.parameters()).device

# Switch off dropout and gradient tracking for inference.
model.eval()
with torch.no_grad():
    for random_ix in random_ixs:
        token_indexes, targets, length = test_dataset[random_ix]

        batch = torch.as_tensor(token_indexes, dtype=torch.long).unsqueeze(0)
        # The lengths stay on the CPU, as pack_padded_sequence requires.
        scores = model(batch.to(model_device), torch.as_tensor([length]))[0]
        # argmax over the raw scores picks the same class as argmax over
        # their log-softmax.
        predictions = scores.argmax(dim=1)

        # One line per token: predicted label, true label, token.
        for position, token_index in enumerate(token_indexes):
            print(
                predictions[position].item(),
                int(targets[position][0]),
                index_to_token[token_index],
            )
        print()

# save model

In [None]:
# Persist the trained weights and the embedding matrix.
# NOTE(review): the token -> index vocabulary is not saved here; confirm it
# is stored elsewhere, since the saved rows are only meaningful together
# with it.
artefacts = [
    (model.state_dict(), "/mnt/efs/models/20180117_link_labeller.pt"),
    (word_vector_embedding_matrix, "/mnt/efs/models/20180117_embeddings.pt"),
]
for artefact, destination in artefacts:
    torch.save(artefact, destination)

In [None]:
# NOTE(review): this path is dated 20180114 while the save cell writes
# 20180117 files - confirm which checkpoint is intended. The checkpoint's
# embedding shape must also match `word_vector_embedding_matrix`.
model_path = "/mnt/efs/models/20180114_link_labeller.pt"
model = LinkLabeller(word_vector_embedding_matrix)
# map_location="cpu" lets a checkpoint that was saved from the GPU be restored
# on a machine without CUDA. The freshly built model is on the CPU anyway.
model.load_state_dict(torch.load(model_path, map_location="cpu"))