# 5. Entity Linking

In [23]:
%run __init__.py

<Figure size 432x288 with 0 Axes>

In [51]:
from bokeh.io import output_notebook

output_notebook()

## Defining the entity linking class

In [24]:
import json
import requests


WIKIDATA_BASE = "https://www.wikidata.org/w"

class WikidataEntityLinker():
    def __init__(self, user, passwd):
        pass
    
    def fit(self, X, y, *args):
        return self
    
    def transform(self, X, y, *args):
        return [self.link_entity(entity) 
                for doc in X
                for entity in doc]
    
    def link_entity(self, entity_label):
        url = f"{WIKIDATA_BASE}/api.php?action=wbsearchentities&search=" + \
            f"{entity_label}&language=en&format=json"
        response = requests.get(url)
        if response.status_code != 200:
            raise Error()
        content = json.loads(response.text)
        search_results = content['search']
        if len(search_results) == 0:
            return (entity_label, None)
        return (entity_label, search_results[0]['concepturi'])


In [25]:
entity_linker = WikidataEntityLinker("", "")
res = entity_linker.link_entity('agroforestry')
res

('agroforestry', 'http://www.wikidata.org/entity/Q397350')

## Linking each topic's term to Wikidata

In [26]:
import dill as pickle

# see https://stackoverflow.com/questions/42960637/python-3-5-dill-pickling-unpickling-on-different-servers-keyerror-classtype
pickle._dill._reverse_typemap['ClassType'] = type

def load_object(output_path):
    with open(output_path, 'rb') as file:
        res = pickle.load(file)
    return res

In [27]:
NOTEBOOK_RESULTS_DIR = os.path.join(RESULTS_DIR, '3_topic_modeling')
lda_agriculture_pipe_filename = "agriculture_lda_model.pkl"
dtm_tf_filename = "agriculture_dtm_tf.pkl"

lda_pipe = load_object(os.path.join(NOTEBOOK_RESULTS_DIR, lda_agriculture_pipe_filename))
dtm_tf = load_object(os.path.join(NOTEBOOK_RESULTS_DIR, dtm_tf_filename))

In [28]:
from src.utils import get_topic_terms_by_relevance

def link_topic_terms(entity_linker, model, vectorizer,
                     dtm_tf, n_top_words, lambda_=0.6):
    res = []
    topic_terms = get_topic_terms_by_relevance(model, vectorizer, dtm_tf,
                                               n_top_words, lambda_)
    return [[entity_linker.link_entity(entity) for entity in topic]
            for topic in topic_terms]


In [29]:
linked_terms = link_topic_terms(entity_linker, lda_pipe.named_steps['model'],
                                lda_pipe.named_steps['vectorizer'], dtm_tf, 
                                n_top_words=10, lambda_=0.75)

In [152]:
linked_terms[2]

[('base', 'http://www.wikidata.org/entity/Q191360'),
 ('sensor', 'http://www.wikidata.org/entity/Q167676'),
 ('system', 'http://www.wikidata.org/entity/Q58778'),
 ('node', 'http://www.wikidata.org/entity/Q756100'),
 ('agricultural', 'http://www.wikidata.org/entity/Q5356428'),
 ('breeding', 'http://www.wikidata.org/entity/Q227675'),
 ('technology', 'http://www.wikidata.org/entity/Q11016'),
 ('farming', 'http://www.wikidata.org/entity/Q11451'),
 ('power', 'http://www.wikidata.org/entity/Q25107'),
 ('iot', 'http://www.wikidata.org/entity/Q251212')]

## Topic labelling

In [137]:
import functools
import pdb

from dataclasses import dataclass

import networkx as nx


WIKIDATA_PROPS_EXPAND = ['P31', 'P279', 'P301', 'P910', 'P2579']


def empty_if_keyerror(function):
    """
    A decorator that wraps the passed in function and
    returns an empty string if a key error is raised.
    """
    @functools.wraps(function)
    def wrapper(*args, **kwargs):
        try:
            return function(*args, **kwargs)
        except KeyError:
            return ""
    return wrapper

def _build_uri(entity_id):
    return f"http://www.wikidata.org/entity/{entity_id}"

@empty_if_keyerror
def _get_aliases(entity_info, lang='en'):
    return [alias['value'] 
            for alias in entity_info['aliases'][lang]]

@empty_if_keyerror
def _get_desc(entity_info, lang='en'):
    return entity_info['descriptions'][lang]['value']

@empty_if_keyerror
def _get_labels(entity_info, lang='en'):
    return entity_info['labels'][lang]['value']


@dataclass
class WikidataNode():
    label: str
    uri: str
    desc: str
    alias: str
        
    def __hash__(self):
        return hash(self.uri)
    
    def to_dict(self):
        return {
            'alias':self.alias,
            'desc': self.desc,
            'label': self.label,
            'uri': self.uri
        }


class WikidataGraphBuilder():
    def __init__(self, max_hops=2, additional_props=None):
        self.max_hops = max_hops
        self.props_to_expand = WIKIDATA_PROPS_EXPAND
        if additional_props:
            self.props_to_expand += additional_props
    
    def build_graph(self, topic):
        G = nx.Graph()
        for term in topic:
            term_uri = term[1]
            term_id = term_uri.split('/')[-1]
            self._add_wd_node_info(G, term_id, None, 0)
        return G
    
    def _add_wd_node_info(self, graph, term_id, prev_node, curr_hop):
        print(f"Visiting entity '{term_id}' - Curr hop: {curr_hop}")
        if curr_hop > self.max_hops:
            return
        
        # call wikidata API for uri
        endpoint = f"{WIKIDATA_BASE}/api.php?action=wbgetentities&ids={term_id}&languages=en&format=json"
        res = requests.get(endpoint)
        if res.status_code != 200:
            raise Error()
        
        content = json.loads(res.text)
        entity_info = content['entities'][term_id]
        
        if term_id not in graph.nodes:
            graph.add_node(term_id)
            #graph.nodes[term_id]['alias'] = _get_aliases(entity_info)
            graph.nodes[term_id]['desc'] = _get_desc(entity_info)
            graph.nodes[term_id]['label'] = _get_labels(entity_info)

            if prev_node is not None:
                graph.add_edge(prev_node, term_id)
        
        for claim_key, claim_values in entity_info['claims'].items():
            if claim_key not in self.props_to_expand:
                continue
            
            for value in claim_values:
                snaktype = value['mainsnak']['snaktype']
                if snaktype in ['novalue', 'somevalue']:
                    continue
                
                new_node_id = value['mainsnak']['datavalue']['value']['id']
                self._add_wd_node_info(graph, new_node_id, term_id, curr_hop + 1)


In [154]:
graph_builder = WikidataGraphBuilder(max_hops=3)
subgraph = graph_builder.build_graph(linked_terms[2])

Visiting entity 'Q191360' - Curr hop: 0
Visiting entity 'Q811679' - Curr hop: 1
Visiting entity 'Q391414' - Curr hop: 2
Visiting entity 'Q19603939' - Curr hop: 3
Visiting entity 'Q8205328' - Curr hop: 4
Visiting entity 'Q811430' - Curr hop: 4
Visiting entity 'Q3955017' - Curr hop: 3
Visiting entity 'Q2424752' - Curr hop: 4
Visiting entity 'Q10244462' - Curr hop: 4
Visiting entity 'Q6477453' - Curr hop: 3
Visiting entity 'Q391414' - Curr hop: 4
Visiting entity 'Q4167836' - Curr hop: 4
Visiting entity 'Q9149869' - Curr hop: 2
Visiting entity 'Q4167836' - Curr hop: 3
Visiting entity 'Q12139612' - Curr hop: 4
Visiting entity 'Q17442446' - Curr hop: 4
Visiting entity 'Q35252665' - Curr hop: 4
Visiting entity 'Q2944534' - Curr hop: 4
Visiting entity 'Q811679' - Curr hop: 3
Visiting entity 'Q391414' - Curr hop: 4
Visiting entity 'Q9149869' - Curr hop: 4
Visiting entity 'Q8970514' - Curr hop: 1
Visiting entity 'Q4167836' - Curr hop: 2
Visiting entity 'Q12139612' - Curr hop: 3
Visiting entity '

Visiting entity 'Q15647814' - Curr hop: 4
Visiting entity 'Q4167836' - Curr hop: 4
Visiting entity 'Q6671777' - Curr hop: 1
Visiting entity 'Q35120' - Curr hop: 2
Visiting entity 'Q23958852' - Curr hop: 3
Visiting entity 'Q23960977' - Curr hop: 4
Visiting entity 'Q19478619' - Curr hop: 4
Visiting entity 'Q23958852' - Curr hop: 4
Visiting entity 'Q151885' - Curr hop: 3
Visiting entity 'Q2145290' - Curr hop: 4
Visiting entity 'Q7184903' - Curr hop: 4
Visiting entity 'Q1347367' - Curr hop: 4
Visiting entity 'Q5550686' - Curr hop: 4
Visiting entity 'Q23958852' - Curr hop: 4
Visiting entity 'Q6643007' - Curr hop: 2
Visiting entity 'Q6671777' - Curr hop: 3
Visiting entity 'Q35120' - Curr hop: 4
Visiting entity 'Q6643007' - Curr hop: 4
Visiting entity 'Q4167836' - Curr hop: 3
Visiting entity 'Q12139612' - Curr hop: 4
Visiting entity 'Q17442446' - Curr hop: 4
Visiting entity 'Q35252665' - Curr hop: 4
Visiting entity 'Q2944534' - Curr hop: 4
Visiting entity 'Q269699' - Curr hop: 1
Visiting enti

Visiting entity 'Q19478619' - Curr hop: 4
Visiting entity 'Q21522864' - Curr hop: 4
Visiting entity 'Q24017465' - Curr hop: 4
Visiting entity 'Q3895768' - Curr hop: 1
Visiting entity 'Q6540441' - Curr hop: 2
Visiting entity 'Q5446571' - Curr hop: 3
Visiting entity 'Q15831596' - Curr hop: 4
Visiting entity 'Q14897293' - Curr hop: 4
Visiting entity 'Q6540441' - Curr hop: 4
Visiting entity 'Q3895768' - Curr hop: 3
Visiting entity 'Q6540441' - Curr hop: 4
Visiting entity 'Q14897293' - Curr hop: 4
Visiting entity 'Q15831596' - Curr hop: 4
Visiting entity 'Q4167836' - Curr hop: 3
Visiting entity 'Q12139612' - Curr hop: 4
Visiting entity 'Q17442446' - Curr hop: 4
Visiting entity 'Q35252665' - Curr hop: 4
Visiting entity 'Q2944534' - Curr hop: 4
Visiting entity 'Q14897293' - Curr hop: 2
Visiting entity 'Q17537576' - Curr hop: 3
Visiting entity 'Q15621286' - Curr hop: 4
Visiting entity 'Q5626248' - Curr hop: 4
Visiting entity 'Q64728693' - Curr hop: 3
Visiting entity 'Q7184903' - Curr hop: 4
Vi

Visiting entity 'Q7132779' - Curr hop: 4
Visiting entity 'Q8187769' - Curr hop: 4
Visiting entity 'Q336' - Curr hop: 4
Visiting entity 'Q56299222' - Curr hop: 4
Visiting entity 'Q8187769' - Curr hop: 4
Visiting entity 'Q8187769' - Curr hop: 2
Visiting entity 'Q10096899' - Curr hop: 3
Visiting entity 'Q4167836' - Curr hop: 4
Visiting entity 'Q8187769' - Curr hop: 4
Visiting entity 'Q61788060' - Curr hop: 3
Visiting entity 'Q1914636' - Curr hop: 4
Visiting entity 'Q9878968' - Curr hop: 4
Visiting entity 'Q336' - Curr hop: 2
Visiting entity 'Q11862829' - Curr hop: 3
Visiting entity 'Q1047113' - Curr hop: 4
Visiting entity 'Q6642719' - Curr hop: 4
Visiting entity 'Q24017414' - Curr hop: 4
Visiting entity 'Q1458083' - Curr hop: 3
Visiting entity 'Q336' - Curr hop: 4
Visiting entity 'Q59541917' - Curr hop: 4
Visiting entity 'Q4167836' - Curr hop: 4
Visiting entity 'Q11862829' - Curr hop: 3
Visiting entity 'Q1047113' - Curr hop: 4
Visiting entity 'Q6642719' - Curr hop: 4
Visiting entity 'Q240

Visiting entity 'Q9504987' - Curr hop: 4
Visiting entity 'Q5962346' - Curr hop: 4
Visiting entity 'Q29028649' - Curr hop: 4
Visiting entity 'Q58778' - Curr hop: 4
Visiting entity 'Q5962346' - Curr hop: 2
Visiting entity 'Q6423319' - Curr hop: 3
Visiting entity 'Q1979154' - Curr hop: 4
Visiting entity 'Q340169' - Curr hop: 4
Visiting entity 'Q1151067' - Curr hop: 4
Visiting entity 'Q386724' - Curr hop: 4
Visiting entity 'Q3622126' - Curr hop: 4
Visiting entity 'Q96116695' - Curr hop: 4
Visiting entity 'Q6510578' - Curr hop: 3
Visiting entity 'Q4167836' - Curr hop: 4
Visiting entity 'Q5962346' - Curr hop: 4
Visiting entity 'Q10161578' - Curr hop: 3
Visiting entity 'Q4167836' - Curr hop: 4
Visiting entity 'Q5962346' - Curr hop: 4
Visiting entity 'Q29028649' - Curr hop: 2
Visiting entity 'Q151885' - Curr hop: 3
Visiting entity 'Q2145290' - Curr hop: 4
Visiting entity 'Q7184903' - Curr hop: 4
Visiting entity 'Q1347367' - Curr hop: 4
Visiting entity 'Q5550686' - Curr hop: 4
Visiting entity '

Visiting entity 'Q1914636' - Curr hop: 4
Visiting entity 'Q9878968' - Curr hop: 4
Visiting entity 'Q5658978' - Curr hop: 1
Visiting entity 'Q11451' - Curr hop: 2
Visiting entity 'Q268592' - Curr hop: 3
Visiting entity 'Q5962346' - Curr hop: 4
Visiting entity 'Q29028649' - Curr hop: 4
Visiting entity 'Q6137609' - Curr hop: 4
Visiting entity 'Q58778' - Curr hop: 4
Visiting entity 'Q3958441' - Curr hop: 3
Visiting entity 'Q9504987' - Curr hop: 4
Visiting entity 'Q5962346' - Curr hop: 4
Visiting entity 'Q29028649' - Curr hop: 4
Visiting entity 'Q58778' - Curr hop: 4
Visiting entity 'Q8148' - Curr hop: 3
Visiting entity 'Q3958441' - Curr hop: 4
Visiting entity 'Q6528585' - Curr hop: 4
Visiting entity 'Q1914636' - Curr hop: 4
Visiting entity 'Q11862829' - Curr hop: 3
Visiting entity 'Q1047113' - Curr hop: 4
Visiting entity 'Q6642719' - Curr hop: 4
Visiting entity 'Q24017414' - Curr hop: 4
Visiting entity 'Q2207288' - Curr hop: 3
Visiting entity 'Q28640' - Curr hop: 4
Visiting entity 'Q698761

Visiting entity 'Q11862829' - Curr hop: 4
Visiting entity 'Q336' - Curr hop: 4
Visiting entity 'Q21198' - Curr hop: 3
Visiting entity 'Q6517860' - Curr hop: 4
Visiting entity 'Q816264' - Curr hop: 4
Visiting entity 'Q11862829' - Curr hop: 4
Visiting entity 'Q4671286' - Curr hop: 4
Visiting entity 'Q151885' - Curr hop: 1
Visiting entity 'Q2145290' - Curr hop: 2
Visiting entity 'Q4393498' - Curr hop: 3
Visiting entity 'Q930933' - Curr hop: 4
Visiting entity 'Q269699' - Curr hop: 4
Visiting entity 'Q151885' - Curr hop: 3
Visiting entity 'Q2145290' - Curr hop: 4
Visiting entity 'Q7184903' - Curr hop: 4
Visiting entity 'Q1347367' - Curr hop: 4
Visiting entity 'Q5550686' - Curr hop: 4
Visiting entity 'Q23958852' - Curr hop: 4
Visiting entity 'Q147638' - Curr hop: 3
Visiting entity 'Q7129786' - Curr hop: 4
Visiting entity 'Q1665984' - Curr hop: 4
Visiting entity 'Q9418' - Curr hop: 4
Visiting entity 'Q23407' - Curr hop: 4
Visiting entity 'Q8162' - Curr hop: 4
Visiting entity 'Q23404' - Curr h

In [155]:
import networkx as nx

from bokeh.io import output_file, show
from bokeh.models import (BoxZoomTool, Circle, HoverTool,
                          MultiLine, Plot, Range1d, ResetTool,)
from bokeh.palettes import Spectral4
from bokeh.plotting import from_networkx

# Show with Bokeh
plot = Plot(plot_width=400, plot_height=400,
            x_range=Range1d(-1.1, 1.1), y_range=Range1d(-1.1, 1.1))
plot.title.text = "Graph Interaction Demonstration"

node_hover_tool = HoverTool(tooltips=[("Label", "@label"),])
plot.add_tools(node_hover_tool, BoxZoomTool(), ResetTool())

graph_renderer = from_networkx(subgraph, nx.spring_layout, scale=1, center=(0, 0))

graph_renderer.node_renderer.glyph = Circle(size=15, fill_color=Spectral4[0])
graph_renderer.edge_renderer.glyph = MultiLine(line_alpha=0.8, line_width=1)
plot.renderers.append(graph_renderer)

output_file("interactive_graphs.html")
show(plot)

In [156]:
def get_largest_connected_subgraph(g):
    S = [g.subgraph(c).copy() for c in nxa.components.connected_components(g)]
    return max(S, key=len)

In [157]:
looool = get_largest_connected_subgraph(subgraph)

In [158]:
import networkx.algorithms as nxa

res = nxa.centrality.information_centrality(looool)
res

{'Q19361238': 0.0037593984962406,
 'Q1458083': 0.003164556962025315,
 'Q12015335': 0.004672897196261682,
 'Q5446571': 0.0037313432835820886,
 'Q58778': 0.007462686567164179,
 'Q7157272': 0.003731343283582088,
 'Q96251598': 0.004629629629629628,
 'Q59541917': 0.0038167938931297695,
 'Q5472577': 0.003787878787878787,
 'Q5127848': 0.003731343283582088,
 'Q336': 0.0037593984962406,
 'Q64728693': 0.003759398496240601,
 'Q269699': 0.005747126436781608,
 'Q4671286': 0.003816793893129771,
 'Q7145049': 0.003205128205128206,
 'Q8840643': 0.004545454545454544,
 'Q488383': 0.004672897196261682,
 'Q24017414': 0.0040650406504065045,
 'Q24017465': 0.003378378378378379,
 'Q6642645': 0.003846153846153847,
 'Q9081': 0.003378378378378379,
 'Q7036033': 0.004464285714285715,
 'Q14897293': 0.00462962962962963,
 'Q6582844': 0.004587155963302753,
 'Q15831596': 0.004587155963302753,
 'Q7215638': 0.004716981132075473,
 'Q682496': 0.005952380952380955,
 'Q11023': 0.0038167938931297717,
 'Q10161578': 0.0032894736

In [159]:
import operator

max(res.items(), key=operator.itemgetter(1))[0]

'Q58778'