In [1]:
%load_ext autoreload
%autoreload 2
import re
import pickle
import shelve
from tqdm.autonotebook import tqdm

import mwparserfromhell as mwph
from mwparserfromhell.nodes.text import Text
from mwparserfromhell.nodes.wikilink import Wikilink 

import nltk
from nltk.util import ngrams

import time
import sys

import xgboost as xgb
from scripts.utils import *

  


In [2]:
# Language code of the wiki under evaluation; it is substituted into every
# ./data/... path in this notebook. `wiki` is that code plus the 'wiki' suffix.
lang = 'simple'
wiki = ''.join((lang, 'wiki'))

In [3]:
def _open_readonly(path_template):
    """Fill `lang` into `path_template` and open that shelve database read-only."""
    return shelve.open(path_template.format(lang), flag='r')


## lookup tables passed to getLinks / process_page further down
anchors = _open_readonly("./data/{0}/{0}.anchors.db")
pageids = _open_readonly("./data/{0}/{0}.pageids.db")
redirects = _open_readonly("./data/{0}/{0}.redirects.db")

## word2vec features
word2vec = _open_readonly("./data/{0}/{0}.w2v.filtered.db")
## navigation-vector features
nav2vec = _open_readonly("./data/{0}/{0}.nav.filtered.db")

## trained model: create an empty classifier, then restore it from the saved file
model = xgb.XGBClassifier()
model.load_model('./data/{0}/{0}.linkmodel.bin'.format(lang))

In [4]:
# Load the sentences to test.
# Every usable line holds exactly two tab-separated fields: the page title and
# the sentence wikitext (the trailing newline stays attached to the sentence).
test_set = []
n_skipped = 0
# Explicit encoding so the read does not depend on the platform default.
# NOTE(review): assumes the file was written as UTF-8 -- confirm.
with open('./data/{0}/training/sentences_test.csv'.format(lang), encoding='utf-8') as fin:
    for line in fin:
        try:
            title, sent = line.split('\t')
        except ValueError:
            # Line does not have exactly two fields: skip it as before, but
            # count it instead of hiding every possible error behind a bare except.
            n_skipped += 1
            continue
        test_set.append((title, sent))

if n_skipped:
    print('skipped {} malformed line(s) in sentences_test.csv'.format(n_skipped))

In [201]:
%%time
# Running the Model on a page

# Process pages of interest
# page_names = ["De_Lassone", "13463_Antiphos", "Peter_Jungen", "AVT"]

THRESHOLD = 0.9

N_max = 100
N_interval = 10

### eval vars (micro and macro)
count_doc = 0.
count_docp = 0.
macro_pre = 0.
macro_rec = 0.
tot_TP = 0.
tot_rel = 0.
tot_ret = 0.

#### Backtest
for page, page_wikicode in test_set[:N_max]:
    input_code = page_wikicode
    ## get links from original wikitext (resolve redirects, and )
    inp_pairs = getLinks(input_code, redirects=redirects, pageids=pageids)
    
    ## if no links in main namespace, go to next item
    if len(inp_pairs)==0:
        continue
    
    
    input_code_nolinks = mwph.parse(page_wikicode).strip_code()
    output_code = process_page(input_code_nolinks, page, anchors, pageids, redirects, word2vec,nav2vec, model, threshold = THRESHOLD, pr=False )
   
    ## get links from predicted wikitext
    out_pairs = getLinks(output_code, redirects=redirects, pageids=pageids)

    TP = dict(set(inp_pairs.items()).intersection(out_pairs.items()))
    #
#     doc_pre = 0 if len(out_pairs)==0 else len(TP)/len(out_pairs)
#     doc_rec = len(TP)/len(inp_pairs)
    #
    tot_TP  += len(TP)
    tot_ret += len(out_pairs)
    tot_rel += len(inp_pairs)
    #print(len(TP), len(inp_pairs), len(out_pairs), " P:", doc_pre, " R:", doc_rec)
    count_doc+=1
    if count_doc %N_interval == 0:
        print('----------------------')
        micro_precision = tot_TP/tot_ret
        micro_recall    = tot_TP/tot_rel
        print("micro_precision:\t", micro_precision)
        print("micro_recall:\t"   , micro_recall)

micro_precision = tot_TP/tot_ret
micro_recall    = tot_TP/tot_rel
print("micro_precision:\t",micro_precision)
print("micro_recall:\t",  micro_recall)

----------------------
micro_precision:	 0.631578947368421
micro_recall:	 0.6666666666666666
----------------------
micro_precision:	 0.5555555555555556
micro_recall:	 0.5555555555555556
----------------------
micro_precision:	 0.5064935064935064
micro_recall:	 0.582089552238806
----------------------
micro_precision:	 0.4690265486725664
micro_recall:	 0.5888888888888889
----------------------
micro_precision:	 0.42857142857142855
micro_recall:	 0.5660377358490566
----------------------
micro_precision:	 0.42105263157894735
micro_recall:	 0.5413533834586466
----------------------
micro_precision:	 0.4479166666666667
micro_recall:	 0.5477707006369427
----------------------
micro_precision:	 0.44285714285714284
micro_recall:	 0.5314285714285715
----------------------
micro_precision:	 0.4581497797356828
micro_recall:	 0.5279187817258884
micro_precision:	 0.45147679324894513
micro_recall:	 0.5219512195121951
CPU times: user 12min 23s, sys: 2.42 s, total: 12min 25s
Wall time: 37.2 s


In [188]:
# process a single page

THRESHOLD = 0.5
ind_test = 3

#### Backtest on one entry of the test set
page, page_wikicode = test_set[ind_test]
input_code = page_wikicode

## links present in the original wikitext
inp_pairs = getLinks(input_code, redirects=redirects, pageids=pageids)

## strip the markup, then let the model propose links on the plain text
input_code_nolinks = mwph.parse(page_wikicode).strip_code()
output_code = process_page(
    input_code_nolinks,
    page,
    anchors,
    pageids,
    redirects,
    word2vec,
    nav2vec,
    model,
    threshold=THRESHOLD,
    pr=False,
)

## links present in the predicted wikitext
out_pairs = getLinks(output_code, redirects=redirects, pageids=pageids)

## pairs that occur in both the original and the predicted links
TP = dict(set(inp_pairs.items()).intersection(out_pairs.items()))

CPU times: user 5.35 s, sys: 28.2 ms, total: 5.38 s
Wall time: 278 ms


In [189]:
# First field (title) of the test-set entry selected with ind_test.
page

'Play (activity)'

In [190]:
# Original wikitext of the selected entry, with its existing [[...]] links.
input_code

'Play is a word used in [[psychology]] and [[ethology]] to describe [[free will|voluntary]] activities associated with recreational pleasure and enjoyment, Play is usually connected with children and their juvenile-level activities, but it can also be a useful adult activity, and occurs among other higher-functioning animals as well.\n'

In [191]:
# Text returned by process_page for the markup-stripped input.
output_code

'Play is a word used in [[Psychology|psychology]] and [[Ethology|ethology]] to describe voluntary activities associated with recreational pleasure and enjoyment, Play is usually connected with children and their juvenile-level activities, but it can also be a useful [[Adult|adult]] activity, and occurs among other higher-functioning animals as well.'

In [192]:
# (anchor, target) pairs present in both inp_pairs and out_pairs.
TP

{'ethology': 'Ethology', 'psychology': 'Psychology'}

In [193]:
# Links getLinks extracted from the original wikitext.
inp_pairs

{'psychology': 'Psychology', 'ethology': 'Ethology', 'voluntary': 'Free will'}

In [194]:
# Links getLinks extracted from the predicted wikitext.
out_pairs

{'psychology': 'Psychology', 'ethology': 'Ethology', 'adult': 'Adult'}

In [198]:
# Anchor text to inspect in the following cells.
# NOTE(review): the recorded KeyError outputs of the next two cells name
# b'adult activity', not 'adult' -- they appear to come from an earlier value
# of `mention`; re-run the cells to confirm.
mention = 'adult'

In [199]:
# Classify the candidate links of `mention` on `page`.
# threshold=0.0 presumably keeps every candidate -- confirm in classify_links.
classify_links(
    page,
    mention,
    anchors,
    word2vec,
    nav2vec,
    model,
    threshold=0.0,
)

KeyError: b'adult activity'

In [200]:
# Raw entry stored for the mention in the anchors shelve. A missing key raises
# KeyError showing the bytes-encoded key (see the recorded output).
anchors[mention]

KeyError: b'adult activity'