In [15]:
import re
import pickle
from tqdm.autonotebook import tqdm

import mwparserfromhell
from mwparserfromhell.nodes.text import Text
from mwparserfromhell.nodes.wikilink import Wikilink 
import wikitextparser as wtp

import requests
import nltk
from nltk.util import ngrams
import operator
import numpy as np

import time
import operator
import sys
import csv

from scripts.utils import wtpGetLinkAnchor
from scripts.utils import get_feature_set

In [16]:
# Wiki selection for this notebook run.
# The command-line variant is kept for reference when running as a script:
# if len(sys.argv) >= 2:
#     lang = sys.argv[1]
# else:
#     lang = 'en'
lang = 'simple'
# Database-style wiki name derived from the language code.
wiki = '{0}wiki'.format(lang)

In [17]:
API_URL = "https://{0}.wikipedia.org/w/api.php".format(lang)

def parse(title, timeout=30):
    """Fetch a page's wikitext from the MediaWiki API and parse it.

    Requests the content of one revision (``rvlimit`` = 1) of ``title`` from
    ``API_URL`` and hands the text to ``mwparserfromhell.parse``.

    Args:
        title: Page title passed to the API as the ``titles`` parameter.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The object returned by ``mwparserfromhell.parse`` for the page text.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
        KeyError: If the response contains no revision for ``title``
            (e.g. the page does not exist).
    """
    params = {
        "action": "query",
        "prop": "revisions",
        "rvprop": "content",
        "rvslots": "main",
        "rvlimit": 1,
        "titles": title,
        "format": "json",
        "formatversion": "2",
    }
    headers = {"User-Agent": "My-Bot-Name/1.0"}
    response = requests.get(API_URL, headers=headers, params=params, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to decode an error page as JSON.
    response.raise_for_status()
    payload = response.json()

    pages = payload.get("query", {}).get("pages", [])
    if not pages or not pages[0].get("revisions"):
        # Missing pages come back without a "revisions" entry; report which
        # title failed rather than surfacing an anonymous lookup error.
        raise KeyError("No revision content found for page {!r}".format(title))

    text = pages[0]["revisions"][0]["slots"]["main"]["content"]
    return mwparserfromhell.parse(text)

In [26]:
# Load the anchor dictionary (the main data structure)
# this is generated by script: ./scripts/generate_anchor_dictionary.py
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    The file is opened in a ``with`` block so the handle is closed even if
    unpickling fails.
    """
    # NOTE(review): pickle executes arbitrary code on load; only use files
    # produced by this project's own scripts.
    with open(path, "rb") as handle:
        return pickle.load(handle)

anchors = _load_pickle("./data/{0}/{0}.anchors.pkl".format(lang))
pageids = _load_pickle("./data/{0}/{0}.pageids.pkl".format(lang))
redirects = _load_pickle("./data/{0}/{0}.redirects.pkl".format(lang))

In [27]:
# ## filter anchors
# print(len(anchors))
# anchors_tmp = {anchor:anchor_dict for anchor,anchor_dict in anchors.items() if sum(anchor_dict.values())>1}
# anchors=anchors_tmp
# print(len(anchors))

In [28]:
# TODO: SVD as additional component
# We want for all pages ..

In [29]:
# Embeddings of Wikipedia entities (not words).
# The model file is generated with the wikipedia2vec CLI, e.g. for English:
#   wikipedia2vec train --min-entity-count=0 --dim-size 100 enwiki-latest-pages-articles.xml.bz2 ./data/en/en.w2v.bin
# The path below is built from `lang`, so the file for the selected wiki must exist.
from wikipedia2vec import Wikipedia2Vec
w2file = './data/{0}/{0}.w2v.bin'.format(lang)
word2vec = Wikipedia2Vec.load(w2file)

In [30]:
# Navigation embeddings, loaded as a fastText model from ./data/<lang>/<lang>.nav.bin
# TODO: page_title as key @Martin
# Ideal: have a vector for ALL the wikipedia pages

# TODO: Check if we can load this with mmap

import fasttext
navfile = './data/{0}/{0}.nav.bin'.format(lang)
nav2vec = fasttext.load_model(navfile)

In [31]:
# List of word embedded 'entities'
# veclist = set([t.title for t in list(word2vec.dictionary.entities())])

In [32]:
# Load the model classifier (the link model used by classify_links)

import xgboost as xgb
model = xgb.XGBClassifier()  # create an empty classifier to load into
model.load_model('./data/{0}/0001.link.bin'.format(lang))  # load the saved model from disk

# Quick manual sanity check of the model (kept commented out):
# model.predict_proba(np.array([2, 36567, 669, 726.889369, 0.558646, 0.0]).reshape((1,-1)))[0,1]

In [40]:
# Main decision function.

# for a given page X and a piece of text "lipsum".. check all the candidate and make inference
# Returns the most likely candidate according to the pre-trained link model
# If the probability is below a certain threshold, return None
def classify_links(page, text, THRESHOLD, max_candidates=10):
    """Pick the most probable link target for an anchor text on a page.

    Looks up the candidate targets recorded for ``text`` in the global
    ``anchors`` mapping, scores each one with the global ``model`` using the
    features from ``get_feature_set``, and returns the best-scoring candidate.

    Args:
        page: Title of the page being processed (passed to ``get_feature_set``).
        text: Anchor text (mention) to resolve.
        THRESHOLD: Minimum probability the best candidate must reach.
        max_candidates: Only the ``max_candidates`` most frequent candidates
            (by their count in ``anchors[text]``) are scored.

    Returns:
        A ``(candidate, probability)`` tuple, or ``None`` when ``text`` has no
        candidates or the best probability is below ``THRESHOLD``.
    """
    # Unknown anchor text: nothing to classify (previously a KeyError).
    if text not in anchors:
        return None

    limited_cands = anchors[text]
    if len(limited_cands) > max_candidates:
        # Keep only the most frequent candidates to bound the scoring cost.
        most_frequent = sorted(
            limited_cands.items(), key=operator.itemgetter(1), reverse=True
        )[:max_candidates]
        limited_cands = dict(most_frequent)

    cand_prediction = {}
    for cand in limited_cands:
        # get the features
        cand_feats = get_feature_set(page, text, cand, anchors, word2vec, nav2vec, pageids)
        # compute the model probability (column 1 of predict_proba)
        cand_prediction[cand] = model.predict_proba(
            np.array(cand_feats).reshape((1, -1))
        )[0, 1]

    # No candidate scored: max() on an empty mapping would raise ValueError.
    if not cand_prediction:
        return None

    # Compute the top candidate
    top_candidate = max(cand_prediction.items(), key=operator.itemgetter(1))

    # Check if the max probability meets the threshold before returning
    if top_candidate[1] < THRESHOLD:
        return None
    return top_candidate

In [41]:
# Article parsing utility.

# Collect what is already linked on a page so it is not linked a second time.
def getLinks(wikicode, page_title):
    """Return the anchor texts and link targets already present on a page.

    The page's own title (underscores replaced by spaces) is included in both
    sets. Every wikilink found by ``wtp.parse`` in ``str(wikicode)`` is split
    with ``wtpGetLinkAnchor`` into a (link, anchor) pair.

    Returns:
        A tuple ``(mentions, targets)`` of two sets: anchor texts and link
        targets.
    """
    readable_title = page_title.replace('_', ' ')
    # Seed both sets with the page title itself.
    mentions = {readable_title}
    targets = {readable_title}
    for wikilink in wtp.parse(str(wikicode)).wikilinks:
        target, anchor_text = wtpGetLinkAnchor(wikilink)
        targets.add(target)
        mentions.add(anchor_text)
    return mentions, targets

In [42]:
# Article parsing utility.

# Tokens that close a sentence when splitting text.
SENT_ENDS = [u".", u"!", u"?"]
def tokenize_sentence_split(text):
    """Yield the sentences of ``text`` as space-joined token strings.

    Each line of ``text`` is tokenized with ``nltk.word_tokenize``. A sentence
    is emitted whenever a token from ``SENT_ENDS`` is reached; tokens left
    over at the end of a line are emitted as a final sentence. Sentences never
    span a line break.
    """
    for raw_line in text.split("\n"):
        pending = []
        for token in nltk.word_tokenize(raw_line):
            pending.append(token)
            if token not in SENT_ENDS:
                continue
            yield " ".join(pending)
            pending = []
        # Flush a trailing fragment that had no terminator.
        if pending:
            yield " ".join(pending)

In [43]:
# Article parsing utility.

# # This is in an effort to correct the redirects in the lists
# def postProcessList(list_ent):
#     correct_dict = {}
#     for k in list_ent:
#         if word2vec.get_entity(k):
#             redirect = word2vec.get_entity(k).title
#             correct_dict[redirect] = correct_dict.get(redirect, 0) + list_ent[k]
#         else:
#             correct_dict[k] = correct_dict.get(k, 0) + list_ent[k]
#     return correct_dict

In [44]:
# Sample page titles (underscore form) used for testing.
# Earlier, shorter sample kept for reference:
# page_names = ["De_Lassone", "13463_Antiphos", "Peter_Jungen", "AVT"]

page_names = [
    "BMC_Amazon",
    "Theory_of_regions",
    "Pier_Luigi_Capucci",
    "California_Digital_Library",
    "Business_necessity",
    "Bell_Post_Hill_Football_Club",
    "Buying_center",
    "Bay_(shelving)",
    "National_Taichung_University_of_Education",
    "Nalugu_Rallu_Aata",
    "Nevis_Television",
    "Tommy_Newberry",
    "Nagraur,_Bahraich",
    "Kris_Neely",
    "National_Axe_Throwing_Federation",
    "National_Arts_Council_of_South_Africa",
    "Faisal_Tehrani",
    "Thief_River_Falls_Times",
    "Tambour_door",
]

In [55]:
# Actual Linking function
def process_page(page):
    page_wikicode = parse(page)
    page_wikicode_init= str(page_wikicode) # save the initial state
    linked_mentions, linked_links = getLinks(page_wikicode, page)
    tested_mentions = set()
    for gram_length in range(10, 0, -1):
        #print("Scanning ", gram_length, "Grams")
        # Parsing the tree can be done once
        for node in page_wikicode.filter(recursive= False):
            if isinstance(node, Text):
                lines = node.split("\n")
                for line in lines:

                    for sent in tokenize_sentence_split(line):
                        grams = list(ngrams(sent.split(), gram_length))
    
                        for gram in grams:
                            mention = ' '.join(gram).lower()
                            # if the mention exist in the DB 
                            # it was not previously linked (or part of a link)
                            # none of its candidate links is already used
                            # it was not tested before (for efficiency)
 
                            if (mention in anchors and
                                not any(mention in s for s in linked_mentions) and
                                not bool(set(anchors[mention].keys()) & linked_links) and
                                mention not in tested_mentions):
                                #logic
                                #print("testing:", mention, len(anchors[mention]))
                                candidate = classify_links(page, mention, THRESHOLD)
                                if candidate:
                                    candidate_link, candidate_proba = candidate
                                    #print(">> ", mention, candidate)
                                    ############## Critical ##############
                                    # Insert The Link in the current wikitext
                                    match = re.compile(r'(?<!\[\[)(?<!-->)\b{}\b(?![\w\s]*[\]\]])'.format(re.escape(mention)))
                                    newval, found = match.subn("[[" + candidate_link  +  "|" + mention+  "|pr=" + str(candidate_proba) + "]]", node.value, 1)
                                    node.value = newval
                                    ######################################
                                    # Book-keeping
                                    linked_mentions.add(mention)
                                    linked_links.add(candidate)
                                # More Book-keeping
                                tested_mentions.add(mention)

    return page_wikicode

In [57]:
# Running the Model on a page
# THRESHOLD is read as a global inside process_page, so it must be set
# before process_page is called.
THRESHOLD = 0.95

# Title in underscore form, as used in the page URL.
page_title = "Fernand_Léger"
print("processing:", page_title)
print("\n==========\n")
# Fetch the page and insert the suggested links.
result = process_page(page_title)
# Printing the result shows the wikitext with [[target|anchor|pr=...]] links.
print(result)

processing: Fernand_Léger


{{Infobox artist
| name        = Fernand Léger
| image       = Fernand Léger.jpg
| imagesize   =
| caption     = Fernand Léger photographed by {{nowrap|Carl Van Vechten, 1936}}
| birth_name  = 
| birth_date  = {{birth date|1881|2|4|mf=y}}
| birth_place = [[Argentan]], [[Orne]], [[French Third Republic|France]]
| death_date  = {{death date and age|1955|8|17|1881|2|4|mf=y}}
| death_place = [[Gif-sur-Yvette]], [[French Fourth Republic|France]]
| nationality = [[French people|French]]
| field       = [[Painting]], [[printmaking]] and [[filmmaking]]
| training    = 
| movement    = [[Tubism]]<br>[[Cubism]]<br>[[Modernism]]}}

'''Fernand Léger''' (Joseph Fernand Henri Léger, 4 February 1881 – 17 August 1955) was a French [[painting|painter]], [[sculpture|sculptor]], and [[film director|filmmaker]]. In his early works he created a personal form of [[cubism]] which he gradually [[Amendment|changed|pr=0.99994767]] into a [[More (song)|more|pr=0.99995637]] popular fig

In [53]:
## the crucial thing is to get the correct mentions.

In [58]:
# Inspect the candidate targets (and their counts) stored for the anchor text 'him'.
anchors['him']

{'Hyun Soong-jong': 1}

In [59]:
# Check what the model decides for the mention 'him' on this page.
# NOTE(review): the recorded output shows a very high probability for a candidate seen only once — worth investigating.
classify_links('Fernand_Léger','him',THRESHOLD)

('Hyun Soong-jong', 0.99994767)