In [7]:
import re
import pickle
import shelve
from tqdm.autonotebook import tqdm

import mwparserfromhell as mwph
from mwparserfromhell.nodes.text import Text
from mwparserfromhell.nodes.wikilink import Wikilink 
import wikitextparser as wtp

import requests
import nltk
from nltk.util import ngrams
import operator
import numpy as np

import time
import operator
import sys
import csv

import xgboost as xgb


from scripts.utils import wtpGetLinkAnchor
from scripts.utils_features import get_feature_set


In [8]:
lang = 'simple'
wiki   = lang+'wiki'

In [9]:
anchors = shelve.open("./data/{0}/{0}.anchors.db".format(lang),flag='r')
pageids = shelve.open("./data/{0}/{0}.pageids.db".format(lang),flag='r')
redirects = shelve.open("./data/{0}/{0}.redirects.db".format(lang),flag='r')

In [10]:
## load word2vec features
word2vec = shelve.open("./data/{0}/{0}.w2v.filtered.db".format(lang), flag='r' )
## load navigation-vector features
nav2vec = shelve.open("./data/{0}/{0}.nav.filtered.db".format(lang), flag='r' )

## load trained model
model = xgb.XGBClassifier()  # init model
model.load_model('./data/{0}/{0}.linkmodel.bin'.format(lang))  # load data

In [11]:
# Load the sentences to test
test_set = []
with open('./data/{0}/training/sentences_test.csv'.format(lang)) as fin:
    for line in fin:
        try:
            title, sent = line.split('\t')
            test_set.append((title, sent))
        except:
            continue

In [12]:
# Main decision function.

# for a given page X and a piece of text "lipsum".. check all the candidate and make inference
# Returns the most likely candidate according to the pre-trained link model
# If the probability is below a certain threshold, return None
def classify_links(page, text, THRESHOLD):
    #start_time = time.time()
    cand_prediction = {}
    # Work with the 10 most frequent candidates
    limited_cands = anchors[text]
    if len(limited_cands) > 10:
        limited_cands = dict(sorted(anchors[text].items(), key = operator.itemgetter(1), reverse = True)[:10]) 
    for cand in limited_cands:
        # get the features
#         cand_feats = get_feature_set(page, text, cand, anchors, word2vec,nav2vec,pageids)
        cand_feats = get_feature_set(page, text, cand, anchors, word2vec,nav2vec)

        # compute the model probability
        cand_prediction[cand] = model.predict_proba(np.array(cand_feats).reshape((1,-1)))[0,1]
    
    # Compute the top candidate
    top_candidate = max(cand_prediction.items(), key=operator.itemgetter(1))
    
    # Check if the max probability meets the threshold before returning
    if top_candidate[1] < THRESHOLD:
        return None
    #print("--- %s seconds ---" % (time.time() - start_time))
    return top_candidate

In [13]:
# Article parsing utility.

# For a given page return the list of all existing links and mentions
# To avoid linking what's already linked
# Article parsing utility.
def getLinks(wikicode, page_title):
    m = set()
    e = set()
    page_title_tmp = page_title.replace('_',' ')
    # add the page title itself
    m.add(page_title_tmp)
    e.add(page_title_tmp)
    linklist = wtp.parse(str(wikicode)).wikilinks
    for l in linklist:
        link,anchor = wtpGetLinkAnchor(l)
        m.add(anchor)
        e.add(link)
#         m.add(l.plain_text().strip())
#         e.add(l.title.strip())
    return m, e

In [14]:
# Article parsing utility.

# dictionary needed for evaluation

def getLinksEval(wikicode):
    link_dict={}
    linklist = wtp.parse(str(wikicode)).wikilinks
    for l in linklist:
        link,anchor = wtpGetLinkAnchor(l)
        link = redirects.get(link,link)
        link_dict[anchor] = link
#         link_dict[l.plain_text().strip()] = l.title.strip()
    return link_dict

In [15]:
# Split a MWPFH node <TEXT> into sentences
SENT_ENDS = [u".", u"!", u"?"]
def tokenize_sentence_split(text):
    for line in text.split("\n"):
        tok_acc = []
        for tok in nltk.word_tokenize(line):
            tok_acc.append(tok)
            if tok in SENT_ENDS:
                yield " ".join(tok_acc)
                tok_acc = []
        if tok_acc:
            yield " ".join(tok_acc)

In [16]:
# Actual Linking function
def process_page(page, page_wikicode):
    page_wikicode_init= str(page_wikicode) # save the initial state
    linked_mentions, linked_links = set(), set()
    tested_mentions = set()
    linked_mentions, linked_links = getLinks(page_wikicode, page) 
    ## this will only add the page-title as already linked

    for gram_length in range(10, 0, -1):
        #print("Scanning ", gram_length, "Grams")
        # Parsing the tree can be done once
        for node in page_wikicode.filter(recursive= False):
            if isinstance(node, Text):
                lines = node.split("\n")
                for line in lines:
                    for sent in tokenize_sentence_split(line):
                        grams = list(ngrams(sent.split(), gram_length))
                        for gram in grams:
                            mention = ' '.join(gram).lower()## anchor dict is lower case
                            mention_original = ' '.join(gram)##insert the un-lowercase mention
                            # if the mention exist in the DB 
                            # it was not previously linked (or part of a link)
                            # none of its candidate links is already used
                            # it was not tested before (for efficiency)
                            if (mention in anchors and
                                not any(mention in s for s in linked_mentions) and
                                not bool(set(anchors[mention].keys()) & linked_links) and
                                mention not in tested_mentions):
                                #logic
                                #print("testing:", mention, len(enanchors[mention]))
                                candidate = classify_links(page, mention, THRESHOLD)
                                if candidate:
                                    candidate_link, candidate_proba = candidate
                                    #print(">> ", mention, candidate)
                                    ############## Critical ##############
                                    # Insert The Link in the current wikitext
                                    match = re.compile(r'(?<!\[\[)(?<!-->)\b{}\b(?![\w\s]*[\]\]])'.format(re.escape(mention_original)))
                                    newval, found = match.subn("[[" + candidate_link  +  "|" + mention_original +"]]", node.value, 1)
                                    node.value = newval
                                    ######################################
                                    # Book-keeping
                                    linked_mentions.add(mention)
                                    linked_links.add(candidate)
                                # More Book-keeping
                                tested_mentions.add(mention)

    return page_wikicode

In [17]:
# enanchors['unincorporated community']

In [18]:
%%time
# Running the Model on a page

# Process pages of interest
# page_names = ["De_Lassone", "13463_Antiphos", "Peter_Jungen", "AVT"]

THRESHOLD = 0.9

### eval vars (micro and macro)
count_doc = 0.
count_docp = 0.
macro_pre = 0.
macro_rec = 0.
tot_TP = 0.
tot_rel = 0.
tot_ret = 0.

#### Backtest
for page, page_wikicode in test_set[:1000]:
#     if page != 'Taylor Negron':
#         continue
    input_code = page_wikicode
    output_code = process_page(page, mwph.parse(mwph.parse(page_wikicode).strip_code()))
    inp_pairs = getLinksEval(input_code)
    out_pairs = getLinksEval(output_code)

    TP = dict(set(inp_pairs.items()).intersection(out_pairs.items()))
    #
    doc_pre = 0 if len(out_pairs)==0 else len(TP)/len(out_pairs)
    doc_rec = len(TP)/len(inp_pairs)
    #
    tot_TP  += len(TP)
    tot_ret += len(out_pairs)
    tot_rel += len(inp_pairs)
    #print(len(TP), len(inp_pairs), len(out_pairs), " P:", doc_pre, " R:", doc_rec)
    count_doc+=1
    if count_doc %100 == 0:
        print('----------------------')
        micro_precision = tot_TP/tot_ret
        micro_recall    = tot_TP/tot_rel
        print("micro_precision:\t", micro_precision)
        print("micro_recall:\t"   , micro_recall)

micro_precision = tot_TP/tot_ret
micro_recall    = tot_TP/tot_rel
print("micro_precision:\t",micro_precision)
print("micro_recall:\t",  micro_recall)

  dst = (np.dot(a, b) / np.linalg.norm(a) / np.linalg.norm(b))


----------------------
micro_precision:	 0.4227941176470588
micro_recall:	 0.5088495575221239
----------------------
micro_precision:	 0.4520547945205479
micro_recall:	 0.5384615384615384
----------------------
micro_precision:	 0.4371794871794872
micro_recall:	 0.5456
----------------------
micro_precision:	 0.43243243243243246
micro_recall:	 0.5370370370370371
----------------------
micro_precision:	 0.45045045045045046
micro_recall:	 0.5519779208831647
----------------------
micro_precision:	 0.44594594594594594
micro_recall:	 0.5487528344671202
----------------------
micro_precision:	 0.45959051724137934
micro_recall:	 0.5589777195281782
----------------------
micro_precision:	 0.44636502287747837
micro_recall:	 0.5363469761759316
----------------------
micro_precision:	 0.4492619926199262
micro_recall:	 0.5331143951833607
----------------------
micro_precision:	 0.4450413223140496
micro_recall:	 0.527165932452276
micro_precision:	 0.4450413223140496
micro_recall:	 0.52716593245227

In [208]:
input_code

'Brad Stephen "Taylor" Negron (August 1, 1957 – January 10, 2015) was an [[Americans|American]] [[writer]], [[actor]], [[painter]], and [[stand-up comedian]].\n'

In [209]:
output_code

'Brad Stephen "Taylor" Negron ([[August 1|August 1]], 1957 – [[January 10|January 10]], 2015) was an American [[Writer|writer]], [[Actor|actor]], [[Painting|painter]], and [[Stand-up comedy|stand-up comedian]].'

In [213]:
classify_links('Taylor Negron','american',0.2)

('Americans', 0.42265084)

In [178]:
redirects['Painter']

'Painting'

In [214]:
inp_pairs

{'american': 'Americans',
 'writer': 'Writer',
 'actor': 'Actor',
 'painter': 'Painting',
 'stand-up comedian': 'Stand-up comedy'}

In [215]:
out_pairs

{'august 1': 'August 1',
 'january 10': 'January 10',
 'writer': 'Writer',
 'actor': 'Actor',
 'painter': 'Painting',
 'stand-up comedian': 'Stand-up comedy'}

In [191]:
input_code

'Brad Stephen "Taylor" Negron (August 1, 1957 – January 10, 2015) was an [[Americans|American]] [[writer]], [[actor]], [[painter]], and [[stand-up comedian]].\n'

In [192]:
mwph.parse(mwph.parse(page_wikicode).strip_code())

'Brad Stephen "Taylor" Negron (August 1, 1957 – January 10, 2015) was an American writer, actor, painter, and stand-up comedian.'