In [None]:
# Enable IPython's autoreload extension so that edits to imported modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Put the relative "KnowledgeIR" directory first on the module search path.
# Presumably this is where the `linker` package imported by later cells
# lives -- confirm.
import sys
sys.path.insert(0, "KnowledgeIR")

In [None]:
import logging

# Root logger: INFO level, with timestamp and level name in front of each message.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s %(message)s',
)

# Raise the threshold of the 'rdflib' logger so only CRITICAL records pass.
rdflib_logger = logging.getLogger('rdflib')
rdflib_logger.setLevel(logging.CRITICAL)

In [None]:
import os

# Directory under which the cells of this notebook place their caches,
# logs and output text files.
output_dir = "/media/hdd/hdd0/data/freebase_wiki"

# Input dumps from the DBpedia NIF abstract datasets.
wiki_links = "/media/hdd/hdd0/data/DBpedia/NIF_Abstract_Datasets/nif-text-links_en.ttl.bz2"
wiki_context = "/media/hdd/hdd0/data/DBpedia/NIF_Abstract_Datasets/nif-abstract-context_en.ttl.bz2"

# Freebase dump plus the DBpedia 2016-04 Freebase-link and redirect dumps.
fb2w = "/media/hdd/hdd0/data/Freebase/fb2w.nt"
wiki2fb_path = "/media/hdd/hdd0/data/DBpedia/201604_datasets/freebase_links_en.ttl.bz2"
redirect_path = "/media/hdd/hdd0/data/DBpedia/201604_datasets/redirects_en.ttl.bz2"

# Create the output directory on first use.
output_dir_missing = not os.path.exists(output_dir)
if output_dir_missing:
    os.makedirs(output_dir)

## Create and Load Freebase/Wikidata/Wikipedia mapping.

In [None]:
from linker.data.freebase_wiki_mapper import FreebaseWikiMapper

logging.info("Mapping Freebase to Wikipedia.")
# The mapper is constructed on output_dir; presumably its mapping files are
# kept there -- confirm in FreebaseWikiMapper.
mapper = FreebaseWikiMapper(output_dir)
# Build the mapping from the DBpedia freebase-links dump first ...
mapper.create_mapping_dbpedia(wiki2fb_path)
# ... then read it back. `wiki2fb` is passed to write_context_replaced later on.
wiki2fb = mapper.read_wiki_fb_mapping()
logging.info("Done.")

## Create and Load Anchor Positions.

In [None]:
import os

from linker.data import data_utils, wiki_anchor_replacer
from linker.data.wiki_anchor_replacer import AnchorPositions, parse_anchor_positions

# Pickle file handed to run_or_load together with the parser and its input.
# Presumably run_or_load loads the pickle when it exists and otherwise runs
# the parser and caches the result -- confirm in data_utils.
anchor_pickle_path = os.path.join(output_dir, "anchor_positions.pickle")

logging.info("Reading anchors.")
anchor_positions = data_utils.run_or_load(
    anchor_pickle_path, parse_anchor_positions, wiki_links
)
logging.info("Done.")

## Load Wikipedia Redirects
Note: the redirects mapping stores Unicode objects.

In [None]:
from linker.data import data_utils
from linker.data.wiki_anchor_replacer import load_redirects

# Pickle file handed to run_or_load together with the loader and the
# redirects dump it reads.
redirect_pickle_path = os.path.join(output_dir, "redirects.pickle")

logging.info("Loading redirect pages.")
redirects = data_utils.run_or_load(
    redirect_pickle_path, load_redirects, redirect_path
)
logging.info("Done.")

In [None]:
# Spot check: look up a single key in the redirects mapping and show its value.
# The function-call form of print is used so the cell parses on both
# Python 2 and Python 3; with a single argument the output is unchanged.
print(redirects[u"AccessibleComputing"])

## Write Replaced Text
Currently, there are a couple of reasons why replacements are missed:
1. Incorrect character/byte offsets due to Unicode (currently being worked on).
2. The Freebase-to-Wikidata mapping was published in 2013 and is incomplete (it may be replaced by a newer dataset).

In [None]:
from linker.data.wiki_anchor_replacer import write_context_replaced

# Both files handed to write_context_replaced live under output_dir.
replace_log_path = os.path.join(output_dir, "fb_replace.log")
replaced_text_path = os.path.join(output_dir, "fb_replaced.txt")

logging.info("Writing down replaced text.")
num_wiki_seen, num_anchor, missed_counts = write_context_replaced(
    wiki2fb, wiki_context, anchor_positions, redirects,
    replace_log_path, replaced_text_path,
)

In [None]:
# Both files handed to write_context_replaced live under output_dir.
both_log_path = os.path.join(output_dir, "fb_replace_both.log")
both_text_path = os.path.join(output_dir, "origin_and_replaced.txt")

logging.info("Writing down both text.")
# NOTE(review): the trailing positional True presumably makes the writer emit
# the original text alongside the replaced text -- confirm against the
# signature of write_context_replaced.
# This call rebinds num_wiki_seen, num_anchor and missed_counts.
num_wiki_seen, num_anchor, missed_counts = write_context_replaced(
    wiki2fb, wiki_context, anchor_positions, redirects,
    both_log_path, both_text_path, True,
)

In [None]:
from linker.data.wiki_anchor_replacer import print_replacement_stats

# The counters come from the most recently executed write_context_replaced call.
stats_path = os.path.join(output_dir, "replacement_stat.txt")
print_replacement_stats(num_wiki_seen, num_anchor, missed_counts, stats_path)