# Web Tables Disambiguation PoC Pipeline
Start by loading the interface to the gensim models used for the embeddings.

In [2]:
# %load model.py
"""Interface to Gensim models."""
from os.path import isdir, join, split
from gensim.models import Word2Vec


class Model(Word2Vec):
    """Word2Vec subclass whose ``load`` records the outcome in ``metadata``."""

    @classmethod
    def load(cls, models_directory=None, filename=None):
        """Load a saved model, falling back to an untrained model on failure.

        Parameters
        ----------
        models_directory : str or None
            Directory containing the model file. When ``None``, ``filename``
            is used as the path on its own.
        filename : str or None
            Name of the model file.

        Returns
        -------
        Model
            The loaded model with ``metadata == {'filename': filename}``, or
            an untrained ``Model()`` with ``metadata == {'error': ...}`` when
            no filename was given or loading raised ``OSError``.
        """
        if filename is None:
            # Without this guard join() raises TypeError, which would bypass
            # the OSError fallback below.
            model = Model()
            model.metadata = {
                'error': 'File not found: {}'.format(filename)
            }
            return model

        if models_directory is None:
            path = filename
        else:
            path = join(models_directory, filename)

        try:
            model = super(Model, cls).load(path)
            model.metadata = {
                'filename': filename
            }
        except OSError:
            model = Model()
            model.metadata = {
                'error': 'File not found: {}'.format(filename)
            }
        return model

## Helper functions

In [3]:
import pickle
import csv
import re
from string import digits
import networkx
import itertools

# Compiled once: runs of characters that are not Unicode word characters.
_NON_WORD_RE = re.compile(r'\W+', re.UNICODE)
# Translation table that deletes the ASCII digits 0-9.
_DIGITS_TABLE = str.maketrans('', '', digits)


def strip_non_alphanum_and_digits(text):
    """Return *text* reduced to its words, separated by single spaces.

    Runs of non-word characters (Unicode definition of ``\\W``) become word
    separators and ASCII digits are deleted. The result has no leading,
    trailing or repeated spaces, so it can be used directly as a lookup key.

    >>> strip_non_alphanum_and_digits("8 o'clock")
    'o clock'
    >>> strip_non_alphanum_and_digits("cat & Dog")
    'cat Dog'
    >>> strip_non_alphanum_and_digits("647500")
    ''
    """
    spaced = _NON_WORD_RE.sub(' ', text)
    without_digits = spaced.translate(_DIGITS_TABLE)
    # split()/join collapses the gaps left behind by removed punctuation
    # and digits (e.g. "population 2008 est." -> "population est").
    return ' '.join(without_digits.split())


# Taken and slightly adapted from :
# https://networkx.github.io/documentation/stable/_modules/networkx/generators/classic.html#complete_multipartite_graph
def complete_multipartite_graph_with_weights(list_of_subsets, similarity=None):
    """Return the complete multipartite graph over the given node subsets.

    Every pair of nodes taken from two different subsets is joined by an
    undirected edge whose ``weight`` attribute is the similarity of the two
    nodes.

    Parameters
    ----------
    list_of_subsets : sequence of iterables of nodes
        One iterable of nodes per subset. Each subset is iterated several
        times, so pass re-iterable containers (lists, sets), not generators.
        Unlike the networkx function this was adapted from, integer subset
        sizes are not supported.
    similarity : callable, optional
        ``similarity(u, v)`` returning the edge weight for nodes ``u`` and
        ``v``. When omitted, the module-level ``model`` is used and the
        weight is ``model.wv.similarity("Q" + str(u), "Q" + str(v))``.

    Returns
    -------
    G : NetworkX Graph
        The weighted complete multipartite graph. For each node, the node
        attribute 'subset' is an integer indicating which subset contains
        the node.

    Notes
    -----
    A node that occurs in more than one subset keeps the index of the last
    subset containing it and receives a self-loop.
    """
    # The complete multipartite graph is an undirected simple graph.
    G = networkx.Graph()

    if not list_of_subsets:
        return G

    if similarity is None:
        # Default weighting: embedding similarity between the two entities,
        # looked up by their "Q"-prefixed ids in the notebook-level model.
        def similarity(u, v):
            return model.wv.similarity("Q" + str(u), "Q" + str(v))

    # add nodes with subset attribute
    for index, subset in enumerate(list_of_subsets):
        for node in subset:
            G.add_node(node, subset=index)

    # Across subsets, all vertices should be adjacent.
    # We can use itertools.combinations() because undirected. FIXME directed and use permutations
    for subset1, subset2 in itertools.combinations(list_of_subsets, 2):
        G.add_edges_from((u, v, {'weight': similarity(u, v)})
                         for u, v in itertools.product(subset1, subset2))
        # FIXME normalize to get a meaningful transition probability
    return G

## Main Pipeline 
### Step 1: Load surface form index and embeddings into memory
#### Surface Form Index Datasets
They were created using these scripts: https://github.com/eXascaleInfolab/ml-phd-scripts_wikidata
I fixed a bug and added additional information on how to use them.

* surfaceForms-20180820-1e10.pickle 436MB, contains links to entities for 6.3 million surface forms, ~20h processing time
* surfaceForms-20180820-1e9.pickle 43MB, ~2h processing time

Note: For some reason, some important entities seem to be missing from the original Wikidata dump, or the dump is not ordered.
Since the surface form index is created by reading the dump from the start up to wherever processing stops, either we do not reach all entities or not all entities are present in the dump; the latter seems impossible.

#### Embeddings Datasets
Taken from https://github.com/fnielsen/wembedder


In [4]:
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# surface form index files that come from a trusted source.
# The context manager closes the file handle once the index is in memory.
with open("data/surface/surfaceForms-20180820-1e10.pickle", "rb") as surface_file:
    surface_forms = pickle.load(surface_file)
model = Model.load(models_directory="data/models",
                   filename="wikidata-20170613-truthy-BETA-cbow-size=100-window=1-min_count=20")

### Demo of how to use surface form index and embeddings model

In [19]:
# -------- Small test --------------------------------------------------------------------------------------------------
# metadata holds either the loaded filename or an error message (see Model.load).
print(model.metadata)

# Vector and similarity lookups go through model.wv (the keyed vectors);
# the equivalent shortcuts on the model object itself are deprecated in gensim.
vector = model.wv["Q4"]
print(vector)

print(surface_forms['queen'])
print(surface_forms['King'])

print(model.wv.similarity("Q19643", "Q12097"))     # queen, king
print(model.wv.similarity("Q15862", "Q12097"))     # queen(band), king
print(model.wv.similarity("Q4", "Q12097"))         # earth, king

{'filename': 'wikidata-20170613-truthy-BETA-cbow-size=100-window=1-min_count=20'}
[ -7.65347795e-05  -1.21277116e-01  -3.31001580e-01  -5.91061473e-01
   5.57715714e-01   5.29511094e-01  -3.35398495e-01   5.19584000e-01
   1.30012140e-01   6.84022903e-02   3.23659241e-01   2.40314469e-01
  -5.74526906e-01  -5.42527378e-01  -1.55539960e-01  -2.85976529e-01
  -5.47384381e-01   2.21911013e-01   2.56106317e-01   2.96622902e-01
   2.68188238e-01   1.97609827e-01  -2.35151142e-01  -2.02283576e-01
  -4.71140593e-01   4.79329467e-01  -1.64737016e-01   2.57988393e-01
  -2.75253654e-01  -3.76181630e-03   2.73623109e-01   5.92558026e-01
  -3.48875791e-01   9.20118749e-01   8.14047232e-02   5.00026584e-01
  -4.20140773e-01  -6.99480670e-03  -1.51370347e-01  -8.63599032e-02
   5.07600307e-01  -4.96026576e-01   3.00790113e-03   3.38878371e-02
   2.35448703e-01  -6.45567298e-01   3.93063903e-01   1.15729176e-01
  -1.68407246e-01  -3.27442773e-02   4.28302258e-01  -1.83347182e-03
  -1.47177711e-01  -1

  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':


### Step 2: Load CSV table 

In [20]:
# newline='' hands line-ending handling to the csv module, which is required
# for line breaks inside quoted cells (this table has them) to be parsed
# correctly on every platform.
with open('data/webtables/countries.csv', "rt", encoding='utf-8', newline='') as csvfile:
    # Each item produced by csv.reader is already a list of cell strings.
    rows = list(csv.reader(csvfile))

rows

[['country', 'capital', 'area sq.km.', 'population 2008 est.'],
 ['afghanistan', 'kabul', '647500', '32738376'],
 ['albania', 'tirana', '28748', '3619778'],
 ['algeria', 'algiers', '2381740', '33769668'],
 ['american \n         samoa', 'pago pago', '199', '64827'],
 ['andorra', 'andorra \n         la vella', '468', '82627'],
 ['angola', 'luanda', '1246700', '12531357'],
 ['anguilla', 'the \n         valley', '102', '14108'],
 ['antigua \n         and barbuda', "saint \n         john's", '443', '84522'],
 ['argentina', 'buenos \n         aires', '2766890', '40482000'],
 ['armenia', 'yerevan', '29800', '2968586'],
 ['aruba', 'oranjestad', '193', '101541'],
 ['australia', 'canberra', '7686850', '21007310'],
 ['austria', 'vienna', '83870', '8205533'],
 ['azerbaijan', 'baku', '86600', '8177717'],
 ['bahamas \n         the', 'nassau', '13940', '307451'],
 ['bahrain', 'manama', '665', '718306'],
 ['bangladesh', 'dhaka', '144000', '153546896'],
 ['barbados', 'bridgetown', '431', '281968'],
 ['

### Step 3: Normalize cells
1. Skip the first row if it is a header (TODO: a header is not always present, and it may span more than a single row)
2. Convert to lowercase and remove non-alphanumeric characters and digits
> Example: "8 o'clock" to "o clock", "cat & Dog" to "cat dog"

In [23]:
# Drop the first row (treated as the header), then lowercase every remaining
# cell and pass it through strip_non_alphanum_and_digits.
normalized = [
    [strip_non_alphanum_and_digits(cell.lower()) for cell in table_row]
    for table_row in rows[1:]
]

normalized

# FIXME John's to Johns instead of John s or keep '

[['afghanistan', 'kabul', '', ''],
 ['albania', 'tirana', '', ''],
 ['algeria', 'algiers', '', ''],
 ['american samoa', 'pago pago', '', ''],
 ['andorra', 'andorra la vella', '', ''],
 ['angola', 'luanda', '', ''],
 ['anguilla', 'the valley', '', ''],
 ['antigua and barbuda', 'saint john s', '', ''],
 ['argentina', 'buenos aires', '', ''],
 ['armenia', 'yerevan', '', ''],
 ['aruba', 'oranjestad', '', ''],
 ['australia', 'canberra', '', ''],
 ['austria', 'vienna', '', ''],
 ['azerbaijan', 'baku', '', ''],
 ['bahamas the', 'nassau', '', ''],
 ['bahrain', 'manama', '', ''],
 ['bangladesh', 'dhaka', '', ''],
 ['barbados', 'bridgetown', '', ''],
 ['belarus', 'minsk', '', ''],
 ['belgium', 'brussels', '', ''],
 ['belize', 'belmopan', '', ''],
 ['benin', 'porto novo', '', ''],
 ['bermuda', 'hamilton', '', ''],
 ['bhutan', 'thimphu', '', ''],
 ['bolivia', 'sucre', '', ''],
 ['bosnia and herzegovina', 'sarajevo', '', ''],
 ['botswana', 'gaborone', '', ''],
 ['brazil', 'brasilia', '', ''],
 ['br

### Step 4: Look up in surface form index
##### Try lowercase and capitalized


In [24]:
# For every normalized cell, collect the candidate entity ids listed in the
# surface form index; cells without any candidate are counted.
candidates = []
no_surface_found_count = 0
for row in normalized:
    candidate_row = []
    for cell in row:
        # Cells left empty by normalization are skipped and are not counted
        # as lookup failures. (Equality/truthiness, not `is`, is the correct
        # way to test for an empty string.)
        if not cell:
            continue

        # Union of the candidates for the cell as is and capitalized.
        # Looking up each word of a multi-word cell separately was tried and
        # removed: single characters wreak havoc.
        cell_candidates = set()
        for surface_form in (cell, cell.capitalize()):
            cell_candidates.update(surface_forms.get(surface_form, ()))

        if cell_candidates:
            candidate_row.append((cell, cell_candidates))
        else:
            no_surface_found_count += 1

    candidates.append(candidate_row)

print("Number of entities where no matching entitiy found:")
print(no_surface_found_count)
print("Entities found in surface form:")
candidates

Number of entities where no matching entitiy found:
209
Entities found in surface form:


[[('kabul', {1018884})],
 [('albania', {1048340, 1655535, 23697272, 28795866, 30040187})],
 [('algeria', {262, 2646480}), ('algiers', {1293618})],
 [],
 [],
 [('angola', {916}), ('luanda', {35386679})],
 [('anguilla', {9154612})],
 [],
 [('argentina', {645291, 2525071, 4789276, 20274555})],
 [('armenia', {20137419, 23697019, 31468839, 50244910}),
  ('yerevan', {12630651})],
 [('aruba', {4801996, 24698154}), ('oranjestad', {246290})],
 [('australia', {205546, 4823541, 8209399, 16835533, 25907685, 47508781}),
  ('canberra', {925439, 2973482})],
 [('austria', {211216, 533534, 4825411}),
  ('vienna',
   {755083, 1002926, 1028144, 2224889, 2287208, 2523320, 3708598, 25183367})],
 [('baku', {4849576, 4849578})],
 [('nassau', {1965921, 2228508, 3336227, 6967546})],
 [('bahrain', {398, 4165357, 19457020})],
 [('bangladesh', {1592924}), ('dhaka', {5268709, 5268712})],
 [('barbados', {807306}), ('bridgetown', {1748747})],
 [('belarus', {815270, 3920722, 4881914})],
 [('belgium', {31, 404713, 190

### Step 5: Only keep candidates for which we have embeddings

In [16]:
# Filter the candidates of every cell down to the entities that have an
# embedding, counting hits and misses along the way.
final_candidates = []
no_embedding_found_count = 0
embedding_found_count = 0
for row in candidates:
    candidate_row = []
    for (sf, entities) in row:
        # Embedding keys are the entity ids prefixed with "Q". A membership
        # test replaces the former try/except Exception probe, so unrelated
        # errors are no longer silently counted as "no embedding".
        cell_candidates = {i for i in entities if "Q" + str(i) in model.wv}

        embedding_found_count += len(cell_candidates)
        no_embedding_found_count += len(entities) - len(cell_candidates)

        if cell_candidates:
            candidate_row.append((sf, cell_candidates))
    final_candidates.append(candidate_row)


print("Number of entities where embedding found:")
print(embedding_found_count)
print("Number of entities where no embedding found:")
print(no_embedding_found_count)

print("Final candidates:")
final_candidates

# Note: at the moment not many embeddings are found, but I suspect it is because surface form index returns pretty
#       random entities and not in a good order, final surface form index will be better !

Number of entities where embedding found:
132
Number of entities where no embedding found:
712
Final candidates:


[[],
 [],
 [('algeria', {262})],
 [],
 [],
 [('angola', {916})],
 [('anguilla', {9154612})],
 [],
 [],
 [],
 [],
 [('australia', {205546}), ('canberra', {2973482})],
 [('vienna', {1002926})],
 [],
 [],
 [('bahrain', {398})],
 [],
 [],
 [],
 [('belgium', {31}), ('brussels', {240})],
 [('belize', {242})],
 [],
 [('hamilton', {4131, 133116, 726961})],
 [('thimphu', {254889})],
 [],
 [],
 [],
 [('brazil', {155})],
 [],
 [],
 [('sofia', {472})],
 [],
 [],
 [],
 [],
 [],
 [],
 [('praia', {3751})],
 [],
 [],
 [('chad', {2621521})],
 [('chile', {298}), ('santiago', {108448, 1256535})],
 [('china', {148})],
 [('victoria', {2132, 200999, 7926540})],
 [],
 [],
 [],
 [],
 [],
 [('kinshasa', {3838})],
 [],
 [],
 [],
 [],
 [],
 [],
 [('nicosia', {56206})],
 [],
 [],
 [('djibouti', {3604}), ('djibouti', {3604})],
 [('roseau', {1001416})],
 [],
 [],
 [('cairo', {575306})],
 [],
 [],
 [],
 [('tallinn', {1770})],
 [],
 [],
 [],
 [('suva', {38807})],
 [('helsinki', {1757})],
 [('france', {3080569}), ('pa

### Step 6: Construct Disambiguation Graph

In [25]:
# One subset per table cell: the set of candidate entities kept for it.
# The loop variable is deliberately not named `candidates`, which would
# overwrite the Step 4 result of that name and break re-running Step 5.
subsets = [
    cell_candidates
    for row in final_candidates
    for (_, cell_candidates) in row
]

G = complete_multipartite_graph_with_weights(subsets)
print("Nodes in the graph:")
print(G.nodes)
# TODO: Normalize weights to represent transition probabilities



Nodes in the graph:
[262, 916, 9154612, 205546, 2973482, 1002926, 398, 31, 240, 242, 726961, 4131, 133116, 254889, 155, 472, 3751, 2621521, 298, 108448, 1256535, 148, 7926540, 2132, 200999, 3838, 56206, 3604, 1001416, 575306, 1770, 38807, 1757, 3080569, 576584, 79917, 130800, 3825, 183, 3761, 844930, 1524, 985543, 1006, 3103735, 1781, 668, 252, 3616, 801, 4993181, 38, 209878, 2321706, 34692, 2048581, 785, 14021944, 332494, 115787, 3805, 710, 1231, 25270, 9361, 819, 1016139, 1142358, 3748, 103251, 83958, 44148, 6581097, 1347276, 912, 233, 23800, 8009008, 669028, 2677358, 12919, 1027, 235, 13353, 577754, 3889, 188553, 478456, 9899, 1026685, 585, 695, 158119, 2933, 127940, 48273, 992560, 45, 1016603, 597, 12544, 34266, 1780, 40921, 225800, 1049, 3001, 863, 9365, 869, 1861, 10699844, 948, 43, 2297724, 23438, 672, 1036, 1749384, 686, 717, 1533, 2471, 3881]


### Step 7: Perform PageRank and keep highest ranked entity


In [14]:
# FIXME add random jump when performing pagerank
# NOTE(review): per the networkx docs alpha is the damping factor, so a
# uniform random jump with probability 1 - alpha appears to be part of this
# call already -- confirm whether the FIXME still applies.
rank_dict = networkx.pagerank(G, alpha=0.85)
print("Those are the assigned ranks by PageRank algorithm:")
print(rank_dict)

disambiguated_count = 0
disambiguated = []
for row in final_candidates:
    # For every cell keep the candidate with the highest PageRank score
    # (the first one wins on ties). The loop variable is not named
    # `candidates`, so the Step 4 result of that name is left intact.
    disambiguated_row = [
        (surface_form,
         max(cell_candidates, key=rank_dict.__getitem__, default=None))
        for (surface_form, cell_candidates) in row
    ]
    disambiguated_count += len(disambiguated_row)
    disambiguated.append(disambiguated_row)


print("Nodes disambiguated:")
print(disambiguated_count)
print("Disambiguated entities by row:")
disambiguated

Those are the assigned ranks by PageRank algorithm:
{262: 0.0071104234329808693, 916: 0.0073438544598204486, 9154612: 0.006281501395021271, 205546: 0.0065085454724259501, 2973482: 0.0078980351868690336, 1002926: 0.0086041115344871067, 398: 0.0075733623646982665, 31: 0.0056532501695648156, 240: 0.0082369336935221599, 242: 0.0077108135716889709, 726961: 0.0079458728767769771, 4131: 0.0085167832804060838, 133116: 0.0085588166390412456, 254889: 0.0077246271732390624, 155: 0.0063788662840213398, 472: 0.0084458092003126799, 3751: 0.0087288142023368327, 2621521: 0.0074539080722264076, 9937: 0.0070035538758860434, 21199: 0.0067354841443741193, 298: 0.0070376812729110684, 108448: 0.0081750428207657596, 1256535: 0.0070960752748430644, 148: 0.0047224440241625341, 7926540: 0.0071818058984789036, 2132: 0.0088371101743408773, 200999: 0.0086970097492854899, 3838: 0.0087015373851701151, 56206: 0.0084179358267158517, 3604: 0.0084182041678455919, 1001416: 0.0086683243150538578, 575306: 0.008662514390164

[[],
 [],
 [('algeria', 262)],
 [],
 [],
 [('angola', 916)],
 [('anguilla', 9154612)],
 [],
 [],
 [],
 [],
 [('australia', 205546), ('canberra', 2973482)],
 [('vienna', 1002926)],
 [],
 [],
 [('bahrain', 398)],
 [],
 [],
 [],
 [('belgium', 31), ('brussels', 240)],
 [('belize', 242)],
 [],
 [('hamilton', 133116)],
 [('thimphu', 254889)],
 [],
 [],
 [],
 [('brazil', 155)],
 [],
 [],
 [('sofia', 472)],
 [],
 [],
 [],
 [],
 [],
 [],
 [('praia', 3751)],
 [],
 [],
 [('chad', 2621521), ('n djamena', 9937)],
 [('chile', 298), ('santiago', 108448)],
 [('china', 148)],
 [('victoria', 2132)],
 [],
 [],
 [],
 [],
 [],
 [('kinshasa', 3838)],
 [],
 [],
 [],
 [],
 [],
 [],
 [('nicosia', 56206)],
 [],
 [],
 [('djibouti', 3604), ('djibouti', 3604)],
 [('roseau', 1001416)],
 [],
 [],
 [('cairo', 575306)],
 [('el salvador', 36510)],
 [],
 [],
 [('tallinn', 1770)],
 [],
 [],
 [],
 [('suva', 38807)],
 [('helsinki', 1757)],
 [('france', 3080569), ('paris', 576584)],
 [('papeete', 130800)],
 [('libreville', 

In [26]:
# Inspect the raw candidate entity ids stored in the surface form index
# for the capitalized surface form 'Hamilton'.
surface_forms['Hamilton']

[4131,
 133116,
 726961,
 1027758,
 5644749,
 5644789,
 14692634,
 602520,
 5644746,
 5644794,
 5644823]

In [28]:
# Inspect the candidates for the more specific surface form
# 'City of Hamilton', for comparison with the 'Hamilton' lookup.
surface_forms['City of Hamilton']

[133116, 5123709]