In [26]:
import re
import pickle
from tqdm.autonotebook import tqdm

import mwparserfromhell
from mwparserfromhell.nodes.text import Text
from mwparserfromhell.nodes.wikilink import Wikilink 
import wikitextparser as wtp

import requests
import nltk
from nltk.util import ngrams
import operator
import numpy as np
from scipy.stats import kurtosis
from Levenshtein import distance as levenshtein_distance

import time
import operator
import sys
import csv

In [27]:
# This is needed for pushing to test.wikipedia
# import pywikibot
# site = pywikibot.Site('test', 'wikipedia')  # The site we want to run our bot on

In [28]:
API_URL = "https://en.wikipedia.org/w/api.php"

def parse(title, timeout=30):
    """Fetch the latest wikitext of a page from the MediaWiki API and parse it.

    Parameters
    ----------
    title : str
        Page title, sent as the ``titles`` parameter of the query.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up. Without a
        timeout ``requests`` can block forever on a stalled connection.

    Returns
    -------
    The object returned by ``mwparserfromhell.parse`` for the page's
    main-slot content.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    KeyError
        If the response contains no revision for ``title`` (for example when
        the page does not exist).
    """
    params = {
        "action": "query",
        "prop": "revisions",
        "rvprop": "content",
        "rvslots": "main",
        "rvlimit": 1,
        "titles": title,
        "format": "json",
        "formatversion": "2",
    }
    headers = {"User-Agent": "My-Bot-Name/1.0"}
    response = requests.get(API_URL, headers=headers, params=params, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to decode an error page as JSON.
    response.raise_for_status()
    page = response.json()["query"]["pages"][0]
    if "revisions" not in page:
        # Same exception type as before, but with a message that names the page.
        raise KeyError(
            "no revisions returned for page {!r} (missing or invalid title?)".format(title)
        )
    text = page["revisions"][0]["slots"]["main"]["content"]
    return mwparserfromhell.parse(text)

In [30]:
# Load the anchor dictionary (the main data structure).
# It is generated by the script ./scripts/generate_anchor_dictionary.py and is
# used below as enanchors[anchor_text][link_title] -> usage count.
# NOTE: pickle.load can execute arbitrary code; only load a file you generated yourself.
with open("./data/en/en.anchors.pkl", "rb") as anchors_file:
    enanchors = pickle.load(anchors_file)

In [2]:
# TODO: SVD as additional component
# We want for all pages ..

In [4]:
# Embeddings of Wikipedia entities (not words).
# The model file is generated by:
#   wikipedia2vec train --min-entity-count=0 --dim-size 100 enwiki-latest-pages-articles.xml.bz2 ./data/en/en.w2v.bin
# Despite its name, `word2vec` holds a Wikipedia2Vec model; the cells below
# query it by page title (get_entity / get_entity_vector).
from wikipedia2vec import Wikipedia2Vec
w2file = './data/en/en.w2v.bin'
word2vec = Wikipedia2Vec.load(w2file)

In [9]:
# Navigation embeddings, loaded as a fastText model.
# getNavDst below queries it with page ids (looked up through `pageid`), not titles.
# TODO: use page_title as the key @Martin
# Ideal: have a vector for ALL the Wikipedia pages

# TODO: check whether this can be loaded with mmap

import fasttext
navfile = './data/en/word2vec_enwiki_params-cbow-50-5-0.1-10-5-20.bin'
nav2vec = fasttext.load_model(navfile)

In [8]:
# TODO: the navigation model should change to (page_title, vector)
# This piece won't be needed then

# TODO: It is probably easier to extract the page_id from the dump
# and streamline this step

# Build a lookup from the second tab-separated field of each row to the first
# one (used below as pageid[page_title] -> page id).
csv.field_size_limit(sys.maxsize)
pageid = {}
# Open the file in a `with` block so the handle is closed once the rows are read.
with open('./data/en/pageid.csv', 'r') as pageid_file:
    reader = csv.reader(pageid_file)
    for row in reader:
        # NOTE(review): the reader splits on commas and only row[0] is used, so a
        # title containing a comma would presumably be truncated here. Confirm
        # against the file format before switching to delimiter='\t'.
        k, v = row[0].split('\t')
        pageid[v] = k

In [10]:
# Set of the titles of every entity known to the Wikipedia2Vec model.
# Built with a set comprehension so no intermediate lists of all entities
# are materialised first.
veclist = {entity.title for entity in word2vec.dictionary.entities()}

In [11]:
# Load the pre-trained link classifier (XGBoost).

import xgboost as xgb
model = xgb.XGBClassifier()  # create an empty classifier to load the saved model into
model.load_model('./data/en/0001.link.bin')  # restore the trained model from disk

# make a random test of the model
# model.predict_proba(np.array([2, 36567, 669, 726.889369, 0.558646, 0.0]).reshape((1,-1)))[0,1]



In [14]:
# Utility function to factor out

# Wikipedia2Vec distance between two pages (origin, destination)
# Semantic relationship
def getW2VDst(ent_a, ent_b):
    """Cosine similarity between the Wikipedia2Vec vectors of two page titles.

    Despite the "Dst" in the name, the value is a similarity (dot product
    divided by both norms), not a distance.

    Returns 0 when either title is not in `veclist`.
    """
    if ent_a not in veclist or ent_b not in veclist:
        return 0
    vec_a = word2vec.get_entity_vector(ent_a)
    vec_b = word2vec.get_entity_vector(ent_b)
    return np.dot(vec_a, vec_b) / np.linalg.norm(vec_a) / np.linalg.norm(vec_b)

In [15]:
# Utility function to factor out

# Navigation distance between two pages (origin, destination)
# Probability of navigation (see with Martin)
def getNavDst(ent_a, ent_b):
    """Cosine similarity between the navigation vectors of two page titles.

    Titles are first translated to page ids through `pageid`; the ids are then
    looked up in the `nav2vec` model.

    Returns 0 unless both titles are present in `pageid` and in `veclist`.
    """
    if not (ent_a in pageid and ent_b in pageid):
        return 0
    page_a = pageid[ent_a]
    page_b = pageid[ent_b]
    # NOTE(review): membership is also checked against the Wikipedia2Vec title
    # set, not against the navigation model - confirm this is intentional.
    if not (ent_a in veclist and ent_b in veclist):
        return 0
    vec_a = nav2vec.get_word_vector(page_a)
    vec_b = nav2vec.get_word_vector(page_b)
    return np.dot(vec_a, vec_b) / np.linalg.norm(vec_a) / np.linalg.norm(vec_b)

In [16]:
# # Pre-processing

# # TODO: The following block should be replaced by finding the list of valid pages
# # to link from the Hive database


########################
# # Description: Collect a list of entities, disambiguation pages, redirects
# # to exclude them as candidates

# # The following file is from the dumps
# entities = []
# with open('./data/en_bis/enwiki-20200201-all-titles-in-ns0') as fin:
#     for line in fin:
#         entities.append(line.strip().replace("_", " "))

# # The following file is found in the redirects (I think it's available in the dumps too)
# dislinks = []
# with open('./data/en_bis/enwiki.dis') as fin:
#     for line in fin:
#         dislinks.append(line.strip())

# # Finally Remove the disambiguation links
# dislinks = set(dislinks)
# entities = set(entities)  - dislinks

In [17]:
# veclist holds the titles of all entities in the Wikipedia2Vec model.
print("Num of potential target pages:", len(veclist))

Num of potential target pages: 11981208


In [18]:
# Utility function to factor out

# Return the features for each link candidate in the context of the text and the page
# TODO: refactor this piece of code to be the same for training model
def get_feature_set(page, text, link):
    """Build the feature tuple for one (anchor text, candidate link) pair.

    Parameters
    ----------
    page : source page title
    text : anchor text; must be a key of `enanchors`
    link : candidate target title; must be a key of `enanchors[text]`

    Returns
    -------
    tuple ``(ngram, freq, ambig, kur, w2v, nav)``:
        ngram - number of whitespace-separated tokens in `text`
        freq  - how many times `link` was used with `text`
        ambig - how many different links were used with `text`
        kur   - kurtosis of the usage counts of `text`, padded with 1s up to
                1000 values
        w2v   - getW2VDst(page, link)
        nav   - getNavDst(page, link)
    """
    n_tokens = len(text.split())  # simple whitespace tokenizer
    link_counts = enanchors[text]
    link_freq = link_counts[link]
    n_links = len(link_counts)
    # Counts in descending order followed by the padding of 1s.
    padded_counts = sorted(list(link_counts.values()), reverse=True) + [1] * (1000 - n_links)
    return (
        n_tokens,
        link_freq,
        n_links,
        kurtosis(padded_counts),
        getW2VDst(page, link),
        getNavDst(page, link),
    )

In [19]:
# Main decision function.

# for a given page X and a piece of text "lipsum".. check all the candidate and make inference
# Returns the most likely candidate according to the pre-trained link model
# If the probability is below a certain threshold, return None
def classify_links(page, text, THRESHOLD):
    """Pick the most likely link target for an anchor text on a page.

    Parameters
    ----------
    page : source page title
    text : anchor text; must be a key of `enanchors`
    THRESHOLD : minimum model probability required to accept the best candidate

    Returns
    -------
    ``(candidate, probability)`` for the highest-scoring candidate, or ``None``
    when there is no candidate at all or the best probability is below
    `THRESHOLD`.
    """
    candidates = enanchors[text]
    # Work with the 10 most frequent candidates only.
    if len(candidates) > 10:
        candidates = dict(
            sorted(candidates.items(), key=operator.itemgetter(1), reverse=True)[:10]
        )
    # max() below would raise ValueError on an empty candidate set.
    if not candidates:
        return None

    names = list(candidates)
    # One feature row per candidate, scored in a single predict_proba call
    # instead of one call per candidate.
    features = np.array([get_feature_set(page, text, name) for name in names])
    probabilities = model.predict_proba(features)[:, 1]

    top_candidate = max(zip(names, probabilities), key=operator.itemgetter(1))
    if top_candidate[1] < THRESHOLD:
        return None
    return top_candidate

In [20]:
# Article parsing utility.

# For a given page return the list of all existing links and mentions
# To avoid linking what's already linked
def getLinks(wikicode, page_title):
    """Collect the mentions and link targets already present on a page.

    The page's own title (underscores replaced by spaces) is always included
    in both sets.

    Returns
    -------
    (mentions, targets) : tuple of two sets of str
        mentions - stripped plain text of every wikilink
        targets  - stripped title of every wikilink
    """
    own_title = page_title.replace('_', ' ')
    wikilinks = wtp.parse(str(wikicode)).wikilinks
    mentions = {own_title}
    targets = {own_title}
    mentions.update(link.plain_text().strip() for link in wikilinks)
    targets.update(link.title.strip() for link in wikilinks)
    return mentions, targets

In [21]:
# Article parsing utility.

# Split a MWPFH node <TEXT> into sentences
SENT_ENDS = [".", "!", "?"]
def tokenize_sentence_split(text):
    """Yield the sentences of `text` as strings of space-joined tokens.

    Each line is tokenized with nltk.word_tokenize. A sentence ends at every
    token found in SENT_ENDS, and whatever tokens remain at the end of a line
    are yielded as a final sentence.
    """
    for raw_line in text.split("\n"):
        pending = []
        for token in nltk.word_tokenize(raw_line):
            pending.append(token)
            if token not in SENT_ENDS:
                continue
            yield " ".join(pending)
            pending = []
        # Flush a trailing sentence that has no terminating punctuation.
        if pending:
            yield " ".join(pending)

In [22]:
# Article parsing utility.

# This is in an effort to correct the redirects in the lists
def postProcessList(list_ent):
    """Merge the counts of titles that resolve to the same Wikipedia2Vec entity.

    Every key of `list_ent` is looked up with word2vec.get_entity. When an
    entity is found, its title replaces the key; otherwise the key is kept.
    Counts of keys that end up under the same title are summed.

    Returns a new dict mapping title -> summed count.
    """
    merged = {}
    for name, count in list_ent.items():
        entity = word2vec.get_entity(name)
        target = entity.title if entity else name
        merged[target] = merged.get(target, 0) + count
    return merged

In [23]:
# Sample list of page titles (underscore form) used for testing.
# NOTE: the run cell further down reassigns `page_names`, so this list only
# takes effect if that reassignment is removed.
# Previous sample: page_names = ["De_Lassone", "13463_Antiphos", "Peter_Jungen", "AVT"]

page_names = ["BMC_Amazon",
            "Theory_of_regions",
            "Pier_Luigi_Capucci",
            "California_Digital_Library",
            "Business_necessity",
            "Bell_Post_Hill_Football_Club",
            "Buying_center",
            "Bay_(shelving)",
            "National_Taichung_University_of_Education",
            "Nalugu_Rallu_Aata",
            "Nevis_Television",
            "Tommy_Newberry",
            "Nagraur,_Bahraich",
            "Kris_Neely",
            "National_Axe_Throwing_Federation",
            "National_Arts_Council_of_South_Africa",
            "Faisal_Tehrani",
            "Thief_River_Falls_Times",
            "Tambour_door"]

In [33]:
# Actual Linking function
def _iter_mentions(node, gram_length):
    """Yield every space-joined n-gram of `gram_length` tokens in a text node."""
    for line in node.split("\n"):
        for sent in tokenize_sentence_split(line):
            for gram in ngrams(sent.split(), gram_length):
                yield ' '.join(gram)


def _insert_link(node, mention, link, proba):
    """Wrap the first eligible occurrence of `mention` in `node.value`; return True if one was replaced."""
    pattern = re.compile(r'(?<!\[\[)(?<!-->)\b{}\b(?![\w\s]*[\]\]])'.format(re.escape(mention)))
    replacement = "[[" + link + "|" + mention + "|pr=" + str(proba) + "]]"
    # A callable replacement is inserted literally; a plain string would be
    # parsed as a template, so a backslash in the title or mention would be
    # misread as an escape or group reference.
    newval, found = pattern.subn(lambda _match: replacement, node.value, 1)
    if found:
        node.value = newval
    return bool(found)


def process_page(page, threshold=None):
    """Fetch a page and insert links for the mentions accepted by the link model.

    Top-level text nodes are scanned for n-grams, from 10 tokens down to 1. A
    mention is tested when it is a key of `enanchors`, is not contained in an
    already linked mention, has no candidate target that is already linked on
    the page, and was not tested before. Accepted mentions are rewritten in
    place as ``[[target|mention|pr=probability]]``.

    Parameters
    ----------
    page : page title, passed to parse(), getLinks() and classify_links()
    threshold : minimum probability for accepting a link; when None, the
        module-level THRESHOLD is used

    Returns
    -------
    The parsed page object with the new links inserted.
    """
    if threshold is None:
        threshold = THRESHOLD
    page_wikicode = parse(page)
    linked_mentions, linked_links = getLinks(page_wikicode, page)
    tested_mentions = set()
    # Longest n-grams first, so a long mention wins over the shorter ones inside it.
    for gram_length in range(10, 0, -1):
        for node in page_wikicode.filter(recursive=False):
            if not isinstance(node, Text):
                continue
            for mention in _iter_mentions(node, gram_length):
                if mention in tested_mentions or mention not in enanchors:
                    continue
                # Skip text that is already linked, or is part of a linked mention.
                if any(mention in linked for linked in linked_mentions):
                    continue
                # Skip when one of its candidate targets is already linked on the page.
                if not linked_links.isdisjoint(enanchors[mention]):
                    continue
                tested_mentions.add(mention)
                # NOTE(review): `page` is forwarded unchanged, while getLinks
                # converts underscores to spaces - confirm which form the
                # feature lookups expect.
                candidate = classify_links(page, mention, threshold)
                if not candidate:
                    continue
                candidate_link, candidate_proba = candidate
                # Record the link only if it was actually written into the text,
                # and store the target title (not the (title, proba) tuple) so
                # the "target already linked" check above can match it.
                if _insert_link(node, mention, candidate_link, candidate_proba):
                    linked_mentions.add(mention)
                    linked_links.add(candidate_link)

    return page_wikicode

In [34]:
%%time
# Running the Model on a page

# Process pages of interest
# page_names = ["De_Lassone", "13463_Antiphos", "Peter_Jungen", "AVT"]

# NOTE: this reassigns page_names, replacing the sample list defined earlier.
page_names = ["Caps_(drinking_game)"]

# `uploaded` is not used anywhere in the visible part of the notebook.
uploaded= []
# Minimum probability for a link to be inserted; process_page reads this global.
THRESHOLD = 0.95

# The loop currently runs on a single hard-coded title rather than on page_names.
for page_title in ["Faisal_Tehrani"]:#page_names:
    print("processing:", page_title)
    print("\n==========\n")
    print(process_page(page_title))

processing: Faisal_Tehrani


{{Multiple issues|{{more footnotes|date=March 2018}}
{{Underlinked|date=June 2016}}
{{BLP sources|date=May 2011}}
{{cleanup rewrite|reason=events are written in improper order and not coherent fashion|date=March 2017}}}}

{{EngvarB|date=October 2014}}
{{Use dmy dates|date=October 2014}}

{{Infobox writer <!-- for more information see [[:Template:Infobox writer/doc]] -->
 | name        = Mohd Faizal Musa
 | image       =FORUM_BAHASA_JIWA_BANGSA_ATAU_BAHASA_GILA_KUASA_170514_TMISHAFIQ_04.jpg
 | imagesize   =
 | caption     =
 | pseudonym   = Faisal Tehrani
 | birth_date  = {{birth date and age|df=yes|1974|08|07}}
 | birth_place = [[Kuala Lumpur]], [[Malaysia]]
 | death_date  =
 | death_place =
 | nationality = Malaysian
 | period      =
 | genre       = Novel, short-story, poem and stage play
 | subject     =
 | movement    =
 | influences  =
 | influenced  =
 | signature   =
 | website     =
}}

'''Mohd Faizal Musa''' (born 7 August 1974), also known under t