In [None]:
from os.path import exists, join, dirname

In [None]:
def load_names(path, num, prefix):
    """Load a ``{number: name}`` mapping from a tab-separated file.

    Every line is expected to hold exactly ``name<TAB>number``. Lines that do
    not split into exactly two fields are counted and skipped. Reading stops at
    the first line whose number is ``>= num`` (so the file is presumably sorted
    by number -- TODO confirm). Nothing is read when ``num <= 0``.

    Args:
        path: Path of the UTF-8 encoded TSV file.
        num: Exclusive upper bound on the numbers to load.
        prefix: Only names starting with this string are kept.

    Returns:
        dict mapping each ``int`` number to its name with the first 7
        characters removed.

    Raises:
        ValueError: if a well-formed line has a non-integer number column.
    """
    names = {}
    errors = 0  # debug: number of malformed lines skipped
    if num > 0:
        with open(path, "rt", encoding="UTF-8") as fin:
            for line in fin:
                try:
                    name, number = line.rstrip("\n").split("\t")
                except ValueError:
                    errors += 1
                    # Skip the malformed line; falling through would reuse the
                    # previous line's values (or fail on the very first line).
                    continue
                number = int(number)
                if number >= num:
                    break
                if name.startswith(prefix):
                    # NOTE(review): 7 == len("enwiki/"), so this presumably
                    # strips an "enwiki/" prefix; confirm before calling with
                    # a prefix of a different length.
                    names[number] = name[7:]
        print(errors)  # debug
    return names

In [None]:
# Directory that the later cells join data file names onto.
path = "/mnt/efs/wikipedia/deeptype/wikidata/"
# Passed to load_names as `num`: loading stops at the first number >= this value.
num_names_to_load = 43710495
# Passed to load_names as `prefix`: only names starting with it are kept.
prefix = "enwiki"

In [None]:
# Build the {number: name} dict from the TSV under `path`, keeping only names
# that start with `prefix` and whose number is below `num_names_to_load`.
known_names = load_names(
    join(path, "wikidata_wikititle2wikidata.tsv"), num_names_to_load, prefix=prefix
)

# load_wikidata_ids

In [None]:
class MarisaAsDict(object):
    """Dict-like, read-only view over a trie whose values are lists of records.

    The wrapped object must support ``get(key, default)``, ``[key]`` and
    ``in``. Lookups return ``value[0][0]``: the first field of the first
    record stored under the key. In this file it wraps a
    ``marisa_trie.RecordTrie("i")``.
    """

    def __init__(self, marisa):
        """Keep a reference to the wrapped trie in ``self.marisa``."""
        self.marisa = marisa

    def get(self, key, fallback=None):
        """Return the first field of the first record for ``key``.

        ``fallback`` is returned when the trie has no entry for ``key``. It
        defaults to ``None`` so the call shape matches ``dict.get``.
        """
        value = self.marisa.get(key, None)
        if value is None:
            return fallback
        return value[0][0]

    def __getitem__(self, key):
        """Return the first field of the first record for ``key``.

        A missing key raises whatever the wrapped trie's ``[]`` lookup raises.
        """
        return self.marisa[key][0][0]

    def __contains__(self, key):
        """Return whether the wrapped trie contains ``key``."""
        return key in self.marisa

In [None]:
def load_wikidata_ids(path, verbose=True):
    """Load the id list and an inverted ``id -> index`` lookup.

    ``wikidata_ids.txt`` under ``path`` is read as one id per line. The
    inverted lookup is loaded from ``wikidata_ids_inverted.marisa`` when that
    file exists. Otherwise it is built from the id list (each id mapped to its
    line index) and saved to that path for later runs.

    Args:
        path: Directory containing ``wikidata_ids.txt`` (and, optionally, the
            cached ``wikidata_ids_inverted.marisa``).
        verbose: When true, print progress messages; when false, print nothing.

    Returns:
        Tuple ``(ids, name2index)``: the list of id strings and a
        ``MarisaAsDict`` wrapping the inverted trie.
    """
    wikidata_ids_inverted_path = join(path, "wikidata_ids_inverted.marisa")
    # Explicit encoding so the result does not depend on the platform default.
    with open(join(path, "wikidata_ids.txt"), "rt", encoding="UTF-8") as fin:
        ids = fin.read().splitlines()
    if exists(wikidata_ids_inverted_path):
        if verbose:
            print("exists")
            print("loading wikidata id -> index")
        trie = marisa_trie.RecordTrie("i").load(wikidata_ids_inverted_path)
    else:
        if verbose:
            print("building trie")
        # Each id maps to a one-field record holding its position in `ids`.
        trie = marisa_trie.RecordTrie(
            "i", [(name, (k,)) for k, name in enumerate(ids)]
        )
        trie.save(wikidata_ids_inverted_path)
    name2index = MarisaAsDict(trie)
    if verbose:
        print("done")
    return (ids, name2index)

In [None]:
import marisa_trie

In [None]:
# `ids` holds the lines of wikidata_ids.txt; `name2index` is the inverted
# lookup (built from enumerate(ids), or loaded from the cached trie file).
ids, name2index = load_wikidata_ids(path)

In [None]:
# Load a saved RecordTrie (record format "i") from wikititle2wikidata.marisa
# under `path`; presumably it maps article titles to ids -- TODO confirm.
article2id = marisa_trie.RecordTrie("i").load(join(path, "wikititle2wikidata.marisa"))

In [None]:
# Spot check: display the first field of the first record stored under "europe".
article2id["europe"][0][0]

# fuzz

In [None]:
import numpy as np
from nltk import word_tokenize
from more_itertools import consecutive_groups


def tokenize(sentence):
    """Tokenize ``sentence`` with NLTK and rewrite ``n't`` contractions.

    ``word_tokenize`` output is joined with single spaces. Every standalone
    ``n't`` token that follows another token is then rewritten so the ``n``
    attaches to the preceding token and ``'t`` becomes its own token
    (``"do n't"`` becomes ``"don 't"``). This apparently mimics Moses-style
    contraction splitting -- TODO confirm.

    Args:
        sentence: Text to tokenize.

    Returns:
        List of token strings.
    """
    seq = " ".join(word_tokenize(sentence))
    # Pad with a trailing space so an "n't" that is the final token is also
    # matched; split() discards the padding again.
    seq = (seq + " ").replace(" n't ", "n 't ")
    return seq.split()


# Example sentence in which the name "Francis Crick" occurs twice.
s = "a bunch of text with a name like Francis Crick in it, and then Francis Crick in it again later"
# Token list for the example sentence, as produced by tokenize().
tokens = tokenize(s)

In [None]:
# Per-token 0/1 mask over `tokens`: 1 where the token is exactly "Francis" or
# "Crick", 0 everywhere else.
outputs = np.array([int(token in ("Francis", "Crick")) for token in tokens])