In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Global plotting defaults for the notebook: seaborn's white grid theme and a
# large square default canvas for every matplotlib figure.
sns.set_style("whitegrid")
plt.rcParams.update({"figure.figsize": (20, 20)})

import io
import os
import re
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm_notebook as tqdm

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence

# Fetch the "punkt" resource through NLTK's downloader.
nltk.download("punkt")

# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# data

In [None]:
base_path = "/mnt/efs/wikipedia/dumps/text/"
# Pick one sub-directory of the dump at random (unseeded, so it changes between runs).
paths = np.random.choice(os.listdir(base_path), size=1)

# Collect file contents in a list and join once; repeated `str +=` re-copies the
# whole accumulated text on every file.
text_chunks = []
for path in paths:
    directory = os.path.join(base_path, path)
    for filename in tqdm(os.listdir(directory)):
        with open(os.path.join(directory, filename), "rb") as f:
            # NOTE(review): decoding as latin1 never fails but will garble UTF-8
            # text — confirm the encoding of the dump files.
            text_chunks.append(f.read().decode("latin1"))
all_text = "".join(text_chunks)

# Capture everything between a `<doc ...>` opening line and the next `</doc>`.
# `[\s\S]*?` lazily matches any character (newlines included) and replaces the
# equivalent but much slower `(.|\s|\S)*?` alternation. With a single capturing
# group, `re.findall` returns the article bodies directly.
pattern = r"(?:<doc.+>)([\s\S]*?)(?:<\/doc>)"
articles = re.findall(pattern, all_text)

In [None]:
# Keep at most 20000 distinct articles. Sampling indices (instead of the strings
# themselves) avoids converting the articles into a fixed-width numpy unicode
# array, and `replace=False` prevents the same article from being drawn twice.
n_sampled_articles = min(20000, len(articles))
sampled_indices = np.random.choice(
    len(articles), size=n_sampled_articles, replace=False
)
articles = [articles[index] for index in sampled_indices]

### cleaning pipeline

In [None]:
def tokenize(sentence):
    """
    Split a sentence into tokens with NLTK's `word_tokenize`, then rewrite
    negative contractions so that a standalone "n't" token hands its "n" back
    to the preceding word, e.g. ["do", "n't"] becomes ["don", "'t"].

    Parameters
    ----------
    sentence : str
        Raw sentence text.

    Returns
    -------
    list of str
        The tokens, none of which contains whitespace.
    """
    seq = " ".join(word_tokenize(sentence))
    # The lookahead also matches at the end of the string, so a sentence-final
    # "n't" (which has no trailing space) is rewritten as well.
    seq = re.sub(r" n't(?= |$)", "n 't", seq)
    return seq.split()


def label_linkable_tokens(sentence, label_all=True):
    """
    Tokenise an HTML sentence and mark which tokens belong to link text.

    The text of every `<a>` element is tokenised and searched for in the
    tokenised plain text of the sentence; matched token spans are set to 1.

    Parameters
    ----------
    sentence : str
        Sentence that may contain `<a>...</a>` markup.
    label_all : bool, default True
        If truthy, label every occurrence of each link's tokens. Otherwise
        label only the first occurrence of each link.

    Returns
    -------
    tokenised_text : list of str
        Tokens of the sentence with the markup stripped.
    target_sequence : numpy.ndarray
        Float array of shape (len(tokenised_text), 1); 1 for link tokens,
        0 elsewhere.
    """
    parsed_html = BeautifulSoup(sentence, "html.parser")

    tokenised_text = tokenize(parsed_html.text)
    target_sequence = np.zeros(len(tokenised_text))

    for anchor in parsed_html.find_all("a"):
        link = tokenize(anchor.text)
        if not link:
            # An anchor without text has nothing to match; skip it instead of
            # searching for an empty subsequence.
            continue

        start_positions = kmp(tokenised_text, link)
        if not label_all:
            start_positions = start_positions[:1]

        for pos in start_positions:
            target_sequence[pos : pos + len(link)] = 1

    return tokenised_text, target_sequence.reshape(-1, 1)


def kmp(sequence, sub):
    """
    Knuth–Morris–Pratt search for a subsequence inside a larger sequence.

    Parameters
    ----------
    sequence : sequence
        The sequence to search in (e.g. a list of tokens or a string).
    sub : sequence
        The subsequence to look for.

    Returns
    -------
    list of int
        Start indices of the matches, in increasing order. Matches do not
        overlap: after a hit the search restarts from scratch at the next
        element. An empty `sub` yields an empty list.
    """
    if len(sub) == 0:
        # Nothing to search for; without this guard `sub[0]` below would raise.
        return []

    # partial[i] is the length of the longest proper prefix of sub[: i + 1]
    # that is also a suffix of it.
    partial = [0]
    for i in range(1, len(sub)):
        j = partial[i - 1]
        while j > 0 and sub[j] != sub[i]:
            j = partial[j - 1]
        partial.append(j + 1 if sub[j] == sub[i] else j)

    positions, j = [], 0
    for i in range(len(sequence)):
        # On a mismatch fall back to the longest prefix that can still match.
        while j > 0 and sequence[i] != sub[j]:
            j = partial[j - 1]
        if sequence[i] == sub[j]:
            j += 1
        if j == len(sub):
            positions.append(i - (j - 1))
            j = 0  # restart, so reported matches never overlap

    return positions

In [None]:
token_sequences, target_sequences = [], []
n_failed_sentences = 0

for article in tqdm(articles):
    for sentence in sent_tokenize(article):
        try:
            tokenized_sentence, target_sequence = label_linkable_tokens(sentence)
        except Exception:
            # Best effort: skip sentences that cannot be parsed or labelled, but
            # keep count so that silent data loss is visible. Catching
            # `Exception` (not a bare `except`) lets KeyboardInterrupt through.
            n_failed_sentences += 1
            continue
        token_sequences.append(tokenized_sentence)
        target_sequences.append(target_sequence)

print("skipped {} sentences that could not be labelled".format(n_failed_sentences))

In [None]:
# Number of tokenised sentences collected in `token_sequences`.
len(token_sequences)

# character level inputs

In [None]:
# Every distinct character that occurs in the corpus once all tokens are joined
# by single spaces (so the space itself is included whenever there is more than
# one token).
corpus_text = " ".join(token for seq in token_sequences for token in seq)
unique_characters = set(corpus_text)

In [None]:
# Multi-character marker symbols that get their own entry in the character
# vocabulary alongside the real characters.
special_cases = ["xxunk", "xxpad", "xxbos", "xxeos"]

unique_characters.update(special_cases)

In [None]:
# Sort before enumerating: set iteration order can differ between Python
# processes, which would give every kernel restart a different character
# indexing and make saved indices or models unusable.
sorted_characters = sorted(unique_characters)
char_to_ix = {char: ix for ix, char in enumerate(sorted_characters)}
ix_to_char = dict(enumerate(sorted_characters))

# fastText vectors and the word-vector embedding matrix

In [None]:
n_wvs = 100000
wv_path = "/mnt/efs/text/word_vectors/wiki-news-300d-1M.vec"

# Map each word to its float32 vector. The file is streamed inside a `with`
# block so the handle is closed and only the lines actually needed are read,
# rather than materialising the whole file as a list first.
fasttext = {}
with io.open(wv_path, "r", encoding="utf-8", newline="\n", errors="ignore") as wv_file:
    # The first line is skipped — presumably the `<count> <dim>` header of the
    # .vec format; confirm against the file.
    next(wv_file, None)
    # Reads the same lines as the former `list(wv_file)[1:n_wvs]`, i.e. at most
    # n_wvs - 1 vectors.
    for _, line in zip(tqdm(range(n_wvs - 1)), wv_file):
        fields = line.split()
        if not fields:
            continue  # tolerate blank lines
        fasttext[fields[0]] = np.array(fields[1:]).astype(np.float32)

In [None]:
# Flat list of every token in the corpus.
all_tokens = [tok for seq in token_sequences for tok in seq]

# Vocabulary = the (up to) n_wvs most frequent tokens. Building the set directly
# from the (token, count) pairs also works for an empty corpus, where unpacking
# `zip(*[])` into two names would raise.
article_vocabulary = {token for token, _ in Counter(all_tokens).most_common(n_wvs)}

In [None]:
# Register the special tokens in the vocabulary and give each one a random
# 300-dimensional vector drawn uniformly from [0, 1).
article_vocabulary.update(special_cases)
fasttext.update({case: np.random.random(300) for case in special_cases})

In [None]:
# Size of the token vocabulary at this point in the notebook.
len(article_vocabulary)

In [None]:
# Freeze the vocabulary into a list so that each token gets a fixed position,
# then derive both lookup directions from that single ordering.
article_vocabulary_list = list(article_vocabulary)
ix_to_token = dict(enumerate(article_vocabulary_list))
token_to_ix = {token: index for index, token in ix_to_token.items()}

In [None]:
# Row i holds the vector of article_vocabulary_list[i]; tokens without a
# fastText vector fall back to the "xxunk" vector. Iterating the list (rather
# than the set it was built from) ties the row order explicitly to `token_to_ix`.
unk_vector = fasttext["xxunk"]
embedding_rows = np.stack(
    [
        np.asarray(fasttext.get(token, unk_vector), dtype=np.float32)
        for token in article_vocabulary_list
    ]
)
# Convert one contiguous float32 array; building a tensor from a Python list of
# separate ndarrays is far slower.
word_vector_embedding_matrix = torch.from_numpy(embedding_rows)

# dataset and dataloader

In [None]:
class SentenceDataset(Dataset):
    """
    Dataset of tokenised sentences prepared for the character-level model.

    For every sentence longer than three tokens it precomputes:
    - the character index sequence (tokens joined by spaces, followed by a
      space and the "xxeos" symbol),
    - the word-vector sequence of its tokens followed by "xxeos",
    - the "exit" positions: the index of the space that follows each token in
      the character sequence.

    Relies on the notebook globals `token_to_ix` and `char_to_ix`.

    Parameters
    ----------
    token_seqs : iterable of list of str
        Tokenised sentences.
    word_vector_embedding_matrix : torch.FloatTensor
        Pretrained word vectors; rows are looked up with `token_to_ix` indices.
    """

    def __init__(self, token_seqs, word_vector_embedding_matrix):
        self.wv_embedding = nn.Embedding.from_pretrained(word_vector_embedding_matrix)

        # impose length constraint (plain list filtering: building a numpy array
        # from ragged token lists is an error on current NumPy versions)
        self.token_seqs = self.filter_short_sequences(token_seqs)

        # find prediction points for language model
        self.exit_ix_seqs = [self.find_exit_points(seq) for seq in self.token_seqs]

        # indexify
        self.char_ix_seqs = [self.indexify_chars(seq) for seq in self.token_seqs]

        self.token_wv_seqs = [self.vectorise_tokens(seq) for seq in self.token_seqs]

    def __getitem__(self, ix):
        """Return (char index sequence, word-vector sequence, exit positions)."""
        char_ix_seq = self.char_ix_seqs[ix]
        token_wv_seq = self.token_wv_seqs[ix]
        exit_ix_seq = self.exit_ix_seqs[ix]
        return char_ix_seq, token_wv_seq, exit_ix_seq

    def __len__(self):
        return len(self.token_seqs)

    @staticmethod
    def filter_short_sequences(token_seqs, min_tokens=4):
        """Keep only the sequences that contain at least `min_tokens` tokens."""
        return [seq for seq in token_seqs if len(seq) >= min_tokens]

    def vectorise_tokens(self, token_seq):
        """
        Look up the word vector of every token, mapping out-of-vocabulary tokens
        to "xxunk", and append the "xxeos" vector.

        Returns a tensor with len(token_seq) + 1 rows.
        """
        unk_ix = token_to_ix["xxunk"]
        ix_seq = torch.LongTensor(
            [token_to_ix.get(token, unk_ix) for token in token_seq]
            + [token_to_ix["xxeos"]]
        )
        wv_seq = self.wv_embedding(ix_seq)
        return wv_seq

    def indexify_chars(self, token_seq):
        """
        Convert the space-joined tokens into character indices, followed by a
        trailing space and the "xxeos" symbol. Characters missing from
        `char_to_ix` map to "xxunk" instead of raising a KeyError.
        """
        unk_ix = char_to_ix["xxunk"]
        ix_seq = [char_to_ix.get(char, unk_ix) for char in " ".join(token_seq)]
        ix_seq += [char_to_ix[" "], char_to_ix["xxeos"]]
        return torch.LongTensor(ix_seq)

    def find_exit_points(self, token_seq):
        """
        Return, for each token, the index of the space that follows it in the
        space-joined character sequence (for the last token this is the
        trailing space appended by `indexify_chars`).
        """
        exit_positions = np.cumsum([len(token) + 1 for token in token_seq])
        return torch.LongTensor(exit_positions) - 1

In [None]:
def collate_fn(batch):
    """
    Assemble a list of dataset items into padded batch tensors.

    Items are ordered by decreasing character-sequence length, which is the
    order `pack_padded_sequence` expects by default.

    Parameters
    ----------
    batch : list of (char_ix_seq, token_wv_seq, exit_ix_seq)
        Items as returned by `SentenceDataset.__getitem__`.

    Returns
    -------
    padded_char_seqs : LongTensor
        Character indices, padded with the index of "xxpad".
    padded_wv_seqs : FloatTensor
        Word vectors, padded with zero vectors.
    padded_exit_seqs : LongTensor
        Exit positions, padded with 0.
    sorted_lengths : LongTensor
        Unpadded character-sequence lengths, in decreasing order.
    """
    char_ix_seqs, token_wv_seqs, exit_ix_seqs = zip(*batch)

    char_seq_lens = torch.LongTensor([len(char_seq) for char_seq in char_ix_seqs])

    sorted_lengths, sort_indicies = char_seq_lens.sort(dim=0, descending=True)
    order = sort_indicies.tolist()

    sorted_char_seqs = [char_ix_seqs[i] for i in order]
    sorted_wv_seqs = [token_wv_seqs[i] for i in order]
    sorted_exit_seqs = [exit_ix_seqs[i] for i in order]

    padded_char_seqs = pad_sequence(
        sequences=sorted_char_seqs, padding_value=char_to_ix["xxpad"], batch_first=True
    )

    # These are float word vectors, not indices: padding them with the integer
    # index of "xxpad" would fill the padded rows with an arbitrary large
    # constant, so pad with zeros instead.
    padded_wv_seqs = pad_sequence(
        sequences=sorted_wv_seqs, padding_value=0.0, batch_first=True
    )

    padded_exit_seqs = pad_sequence(
        sequences=sorted_exit_seqs, padding_value=0, batch_first=True
    )

    return padded_char_seqs, padded_wv_seqs, padded_exit_seqs, sorted_lengths

In [None]:
# `train_test_split` returns a train/test pair for EACH array passed in, so two
# input arrays yield four outputs; unpacking them into two names raises a
# ValueError. The fixed random_state keeps the split reproducible.
(
    train_token_sequences,
    test_token_sequences,
    train_target_sequences,
    test_target_sequences,
) = train_test_split(
    token_sequences, target_sequences, test_size=0.20, random_state=42
)

In [None]:
# Training data: precompute the per-sentence tensors, then serve them in
# shuffled mini-batches of 32 assembled by `collate_fn` on 5 worker processes.
train_dataset = SentenceDataset(train_token_sequences, word_vector_embedding_matrix)

train_loader_options = dict(
    batch_size=32, num_workers=5, shuffle=True, collate_fn=collate_fn
)
train_loader = DataLoader(train_dataset, **train_loader_options)

In [None]:
# Held-out data, batched the same way as the training set.
test_dataset = SentenceDataset(test_token_sequences, word_vector_embedding_matrix)

# Evaluation does not benefit from shuffling; a fixed order keeps per-batch
# results comparable between runs.
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=32,
    num_workers=5,
    shuffle=False,
    collate_fn=collate_fn,
)

# model

In [None]:
class LanguageModel(nn.Module):
    """
    Character-level LSTM that predicts a 300-dimensional vector at the end of
    each token.

    Characters are embedded, run through a unidirectional single-layer LSTM,
    and the hidden states at the supplied "exit" positions are passed through
    a two-layer MLP head.

    Parameters
    ----------
    input_dim : int
        Size of the character vocabulary. The default is evaluated once, when
        the class is defined, from the global `unique_characters`.
    embedding_dim : int
        Size of the character embeddings.
    hidden_dim : int
        Size of the LSTM hidden state and of the head's hidden layer.
    """

    def __init__(
        self, input_dim=len(unique_characters), embedding_dim=50, hidden_dim=512
    ):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim)

        # batch_first matches the (batch, time, features) layout used everywhere
        # else in the notebook. It has no effect on packed input, but makes a
        # direct call with a padded tensor behave as expected.
        self.enc_lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=1,
            bidirectional=False,
            batch_first=True,
        )

        self.head = nn.Sequential(
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, 300),
        )

    def forward(self, padded_char_seqs, exit_ix_seqs, sorted_lengths):
        """
        Parameters
        ----------
        padded_char_seqs : LongTensor, (batch, max_chars)
            Padded character indices, ordered by decreasing length.
        exit_ix_seqs : LongTensor, (batch, max_tokens)
            Character positions at which to read the LSTM output.
        sorted_lengths : LongTensor or list of int, (batch,)
            Unpadded length of each character sequence, in decreasing order.

        Returns
        -------
        FloatTensor, (batch, max_tokens, 300)
        """
        x = self.embedding(padded_char_seqs)

        # pack_padded_sequence requires the lengths on the CPU, wherever the
        # rest of the batch lives.
        lengths = torch.as_tensor(sorted_lengths).cpu()
        x = pack_padded_sequence(x, lengths=lengths, batch_first=True)

        x, _ = self.enc_lstm(x)
        x, _ = pad_packed_sequence(x, batch_first=True)

        # pop out the LSTM outputs at the position of the end of each token:
        # row i of the batch is indexed with exit_ix_seqs[i]
        batch_ix = torch.arange(x.size(0), device=x.device).unsqueeze(1)
        x = x[batch_ix, exit_ix_seqs]

        return self.head(x)

In [None]:
# Build the model with its default sizes and move its parameters to `device`.
model = LanguageModel().to(device)

# training

In [None]:
# Per-batch training losses; appended to by `train`.
losses = []

# Let cuDNN benchmark its algorithms.
torch.backends.cudnn.benchmark = True

# Only parameters that require gradients are handed to the optimiser.
trainable_parameters = (p for p in model.parameters() if p.requires_grad)
optimiser = optim.Adam(trainable_parameters, lr=0.0001)

loss_function = nn.CosineEmbeddingLoss()

In [None]:
def train(model, train_loader, loss_function, optimiser, n_epochs):
    """
    Train `model` to predict the word vector of the NEXT token from the
    character sequence read so far.

    Each loss value is appended to the global `losses` list.

    Parameters
    ----------
    model : nn.Module
        Called as `model(char_seqs, exit_ix_seqs, lengths)`; expected to return
        (batch, max_tokens, dim) predictions.
    train_loader : iterable
        Yields (char_seqs, target_wvs, exit_ix_seqs, lengths) batches as
        produced by `collate_fn`.
    loss_function : callable
        Called as `loss_function(predictions, targets, flags)` with 2-D
        predictions/targets and a 1-D tensor of ones, e.g.
        `nn.CosineEmbeddingLoss()`.
    optimiser : torch.optim.Optimizer
    n_epochs : int
    """
    # Follow the model instead of hard-coding `.cuda()`, so CPU runs work too.
    model_device = next(model.parameters()).device

    model.train()
    for epoch in range(n_epochs):
        loop = tqdm(train_loader)
        loop.set_description("Epoch {}/{}".format(epoch + 1, n_epochs))
        for char_seqs, target_wvs, exit_ix_seqs, lengths in loop:
            char_seqs = char_seqs.to(model_device, non_blocking=True)
            # Drop the first word vector: the state after token t is trained to
            # predict token t + 1 (the last target is "xxeos").
            target_wvs = target_wvs[:, 1:].to(model_device, non_blocking=True)
            exit_ix_seqs = exit_ix_seqs.to(model_device, non_blocking=True)
            # `lengths` deliberately stays on the CPU: pack_padded_sequence
            # requires CPU lengths.

            optimiser.zero_grad()
            preds = model(char_seqs, exit_ix_seqs, lengths)

            # Real exit positions are >= 1 (every token has at least one
            # character), whereas collate_fn pads exit sequences with 0, so
            # this mask selects exactly the non-padded token positions.
            is_real_token = exit_ix_seqs > 0

            # Boolean indexing flattens to (n_real_tokens, dim), so the cosine
            # similarity is taken across the embedding dimension of each token
            # rather than across the time axis of a 3-D tensor.
            token_preds = preds[is_real_token]
            token_targets = target_wvs[is_real_token]
            flags = torch.ones(token_preds.size(0), device=model_device)

            loss = loss_function(token_preds, token_targets, flags)
            loss.backward()
            optimiser.step()

            losses.append(loss.item())
            loop.set_postfix(loss=np.mean(losses[-100:]))

In [None]:
# Run one epoch of training; per-batch losses accumulate in the global `losses`.
train(
    model=model,
    train_loader=train_loader,
    loss_function=loss_function,
    optimiser=optimiser,
    n_epochs=1,
)

In [None]:
# Smooth the per-batch losses with a 50-batch rolling mean, skipping the first
# 20 batches.
loss_data = pd.Series(losses[20:]).rolling(window=50).mean()

fig, ax = plt.subplots()
loss_data.plot(ax=ax)
ax.set_ylim(0, 0.06)
# Label the figure so it can be read on its own.
ax.set(
    title="Training loss (rolling mean over 50 batches)",
    xlabel="batch",
    ylabel="loss",
);

# test

In [None]:
sentence = 'Traditionally, the term "philosophy" referred to any body of knowledge.'
token_seq = tokenize(sentence)

# Index of the space that follows each token in the character sequence.
exit_ix_seq = np.cumsum([len(token) + 1 for token in token_seq]) - 1
exit_ix_seq = torch.LongTensor(exit_ix_seq).to(device)

# Build the character input exactly as SentenceDataset.indexify_chars does:
# joined tokens + trailing space + "xxeos". Without the trailing space the last
# exit position would point one past the end of the sequence.
char_seq = " ".join(token_seq)
char_ixs = [char_to_ix.get(c, char_to_ix["xxunk"]) for c in char_seq]
char_ixs += [char_to_ix[" "], char_to_ix["xxeos"]]
char_ix_seq = torch.LongTensor([char_ixs]).to(device)

with torch.no_grad():
    x = model.embedding(char_ix_seq)
    # Feed the LSTM a packed sequence, as in LanguageModel.forward, so the
    # (1, n_chars, dim) input is read as ONE sequence of n_chars steps rather
    # than n_chars sequences of a single step.
    x = pack_padded_sequence(x, lengths=[x.size(1)], batch_first=True)
    x, _ = model.enc_lstm(x)
    x, _ = pad_packed_sequence(x, batch_first=True)
    # LSTM output at the end of every token.
    x = x[0, exit_ix_seq]