In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plot defaults: plain white seaborn style and a large
# 20 x 20 default figure size for every matplotlib figure created below.
sns.set_style(style="white")
plt.rcParams.update({"figure.figsize": (20, 20)})

import re
import numpy as np
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook as tqdm

from fastai.text import Tokenizer

In learning to link, we want to label our downloaded wikipedia text so that links are easily identifiable. 

A binary scheme, in which non-link tokens are labelled `0` and link tokens are labelled `1`, makes sense for use in a machine learning context, so we'll aim for that.
For example, for the sentence:

> Traditionally, the term "philosophy" referred to any body of [knowledge](https://en.wikipedia.org/wiki/Knowledge).

we want to return

```
0    Traditionally
0    ,
0    the
0    term
0    "
0    philosophy
0    "
0    referred
0    to
0    any
0    body
0    of
1    knowledge
0    .
```

First, we need to load some data

In [None]:
file_path = "/mnt/efs/wikipedia/dumps/text/AA/wiki_00"

# Read the raw bytes and decode them as UTF-8.
# NOTE(review): this assumes the extracted dump is UTF-8 encoded (the usual
# output of Wikipedia extraction tools) - confirm for this dataset. Decoding as
# latin1 never raises, but it silently turns every multi-byte character
# (accents, dashes, non-Latin scripts) into mojibake before tokenisation.
with open(file_path, "rb") as f:
    file = f.read().decode("utf-8")

Each file is made up of multiple articles, so we'll split them by `<doc>` tokens

In [None]:
# Every article sits between a `<doc ...>` header and a closing `</doc>` tag.
# `.+` does not cross newlines, so it consumes only the rest of the header line;
# `[\s\S]*?` then lazily captures the body, newlines included, up to the nearest
# `</doc>`. A single character class replaces the old `(.|\s|\S)` alternation,
# which matched the same text but backtracked far more and forced `findall`
# to return tuples that had to be unpacked.
pattern = r"<doc.+>([\s\S]*?)</doc>"
docs = re.findall(pattern, file)

Links are still embedded in the text as HTML, so we need to parse them and pull each one out as text. We then tokenise both the link text and the full text, using the standard `fast.ai` tokeniser. Next, we use the Knuth–Morris–Pratt algorithm to find instances of subsequences (our link texts) in a larger sequence (our full text). The matching tokens are labelled as 1s and a fully labelled doc is returned, ready to be ingested by a neural net or similar model.

In [None]:
def label_linkable_tokens(article_html, tokenizer=Tokenizer(), label_all=True):
    """
    Tokenise an article and flag the tokens that make up link text.

    Parameters
    ----------
    article_html : str
        Article text in which links are still embedded as `<a>` tags.
    tokenizer : Tokenizer
        Object exposing `process_all(list_of_str)`, used here as returning one
        token sequence per input string. The default instance is created once,
        when the function is defined, and shared between calls.
    label_all : bool
        If truthy, every occurrence of each link's token sequence is labelled.
        Otherwise only the first occurrence of each link is labelled.

    Returns
    -------
    tokenised_text : sequence of tokens
        Tokens of the article's plain text (markup stripped).
    target : np.ndarray
        Float array with the same length as `tokenised_text`; 1 where the token
        is part of a matched link phrase, 0 elsewhere.
    """
    parsed_html = BeautifulSoup(article_html, "html.parser")

    link_text = [link.text for link in parsed_html.find_all("a")]
    tokenised_links = tokenizer.process_all(link_text)

    tokenised_text = tokenizer.process_all([parsed_html.text])[0]

    target = np.zeros(len(tokenised_text))

    for link in tokenised_links:
        # A link whose text tokenises to nothing (e.g. an empty `<a></a>`) has
        # nothing to label, and searching for an empty pattern is ill-defined.
        if len(link) == 0:
            continue

        start_positions = kmp(tokenised_text, link)
        if not label_all:
            # Keep at most the first occurrence (none if the link never matched).
            start_positions = start_positions[:1]

        for pos in start_positions:
            target[pos : pos + len(link)] = 1

    return tokenised_text, target

In [None]:
def kmp(sequence, sub):
    """
    Knuth-Morris-Pratt search for a sub-sequence within a larger sequence.

    Works on any indexable sequences whose elements can be compared with
    `==` / `!=` (strings, lists of tokens, ...).

    Parameters
    ----------
    sequence : sequence
        The sequence to search in.
    sub : sequence
        The pattern to search for.

    Returns
    -------
    list of int
        Start indices of the matches, in increasing order. Matches are
        non-overlapping: after a hit the search restarts from scratch on the
        next element. An empty `sub` yields an empty list.
    """
    # An empty pattern has no meaningful match position; without this guard
    # the scan below would index `sub[0]` and raise IndexError.
    if len(sub) == 0:
        return []

    # partial[i] = length of the longest proper prefix of sub[:i + 1] that is
    # also a suffix of it; tells us how far to fall back after a mismatch.
    partial = [0]
    for i in range(1, len(sub)):
        j = partial[i - 1]
        while j > 0 and sub[j] != sub[i]:
            j = partial[j - 1]
        partial.append(j + 1 if sub[j] == sub[i] else j)

    # Scan the sequence once; j is the number of pattern elements matched so far.
    positions, j = [], 0
    for i in range(len(sequence)):
        while j > 0 and sequence[i] != sub[j]:
            j = partial[j - 1]
        if sequence[i] == sub[j]:
            j += 1
        if j == len(sub):
            positions.append(i - (j - 1))
            # Reset rather than fall back, so reported matches never overlap.
            j = 0

    return positions

In [None]:
t = Tokenizer()

# Label every article with one shared tokenizer, keeping all results instead of
# overwriting them on each iteration. Each entry is a (tokenised_text, target) pair.
labelled_docs = [label_linkable_tokens(doc, tokenizer=t) for doc in tqdm(docs)]

# Expose the last article's tokens and labels under the names the loop used to
# leave behind, so the inspection cell below keeps working.
if labelled_docs:
    tokenised_text, target = labelled_docs[-1]

In [None]:
# Show only the first tokens with their labels; an article can contain
# thousands of tokens, which would flood the cell output.
N_PREVIEW = 100
list(zip(tokenised_text[:N_PREVIEW], target[:N_PREVIEW]))

Note that we can choose to label _all_ occurrences of a phrase (`label_all=True`), or only the first instance (`label_all=False`), as is generally the case in Wikipedia.