# 5. Entity Linking

In [1]:
# Run the project bootstrap script in this kernel's namespace.
# NOTE(review): later cells use `os` and `RESULTS_DIR` without importing or
# defining them, so they are presumably provided by this script — confirm.
%run __init__.py

In [2]:
from bokeh.io import output_notebook

# Configure Bokeh to render plots inline in the notebook output cells.
output_notebook()



## Defining the entity linking class

In [3]:
import json
import requests


# Base URL of the Wikidata web endpoint; API requests are sent to
# f"{WIKIDATA_BASE}/api.php".
WIKIDATA_BASE = "https://www.wikidata.org/w"

class WikidataEntityLinker():
    """Link free-text labels to Wikidata entities.

    Each label is looked up through the ``wbsearchentities`` action of the
    Wikidata API and paired with the concept URI of the first search hit.
    The class exposes ``fit``/``transform`` methods in addition to
    ``link_entity`` so it can be called like a transformer.
    """

    # Seconds to wait for the Wikidata API before giving up; without a
    # timeout a stalled connection would block the notebook indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self, user, passwd):
        """Create a linker.

        ``user`` and ``passwd`` are accepted for interface compatibility but
        are currently unused: the search API is queried anonymously.
        """
        pass

    def fit(self, X, y=None, *args):
        """No-op: the linker has nothing to learn. Returns ``self``."""
        return self

    def transform(self, X, y=None, *args):
        """Link every entity of every document in ``X``.

        Parameters
        ----------
        X : iterable of iterables of str
            Documents, each one an iterable of entity labels.
        y : ignored

        Returns
        -------
        list of (str, str or None)
            A flat list with one ``(label, uri)`` tuple per entity, in
            document order (see ``link_entity``).
        """
        return [self.link_entity(entity)
                for doc in X
                for entity in doc]

    def link_entity(self, entity_label):
        """Search Wikidata for ``entity_label`` and return its best match.

        Returns
        -------
        tuple
            ``(entity_label, concept_uri)`` for the first search result, or
            ``(entity_label, None)`` when the search returns no results.

        Raises
        ------
        requests.exceptions.HTTPError
            If the API answers with a status code other than 200.
        """
        # Passing the query through ``params`` lets requests URL-encode the
        # label, so labels with spaces, '&', '#', ... are searched verbatim
        # instead of corrupting the query string.
        response = requests.get(
            f"{WIKIDATA_BASE}/api.php",
            params={
                'action': 'wbsearchentities',
                'search': entity_label,
                'language': 'en',
                'format': 'json',
            },
            timeout=self.REQUEST_TIMEOUT,
        )
        if response.status_code != 200:
            raise requests.exceptions.HTTPError(
                f"Wikidata search for {entity_label!r} failed with "
                f"status code {response.status_code}",
                response=response)
        content = json.loads(response.text)
        search_results = content['search']
        if not search_results:
            return (entity_label, None)
        return (entity_label, search_results[0]['concepturi'])


In [4]:
# Smoke test: link a single label. The constructor arguments (user, passwd)
# are ignored by WikidataEntityLinker, so empty strings are enough.
entity_linker = WikidataEntityLinker("", "")
res = entity_linker.link_entity('agroforestry')
res

('agroforestry', 'http://www.wikidata.org/entity/Q397350')

## Linking each topic's term to Wikidata

In [5]:
import dill as pickle

# Workaround for the KeyError on 'ClassType' raised when unpickling with dill
# on a different server than the one that produced the pickle: register
# 'ClassType' in dill's reverse type map (a private attribute of dill).
# see https://stackoverflow.com/questions/42960637/python-3-5-dill-pickling-unpickling-on-different-servers-keyerror-classtype
pickle._dill._reverse_typemap['ClassType'] = type

def load_object(output_path):
    """Unpickle and return the object stored in the file at ``output_path``.

    The file is opened in binary mode and read with the module-level
    ``pickle`` name (dill in this notebook). Unpickling can execute arbitrary
    code, so only load files that come from a trusted source.
    """
    with open(output_path, 'rb') as pickled_file:
        return pickle.load(pickled_file)

In [6]:
# Load the artefacts stored under the '3_topic_modeling' results directory:
# the pickled LDA pipeline and the pickled `dtm_tf` object.
# NOTE(review): `os` and `RESULTS_DIR` are not defined in the visible cells;
# presumably they come from `%run __init__.py` — confirm.
NOTEBOOK_RESULTS_DIR = os.path.join(RESULTS_DIR, '3_topic_modeling')
lda_agriculture_pipe_filename = "agriculture_lda_model.pkl"
dtm_tf_filename = "agriculture_dtm_tf.pkl"

lda_pipe = load_object(os.path.join(NOTEBOOK_RESULTS_DIR, lda_agriculture_pipe_filename))
dtm_tf = load_object(os.path.join(NOTEBOOK_RESULTS_DIR, dtm_tf_filename))

In [7]:
from src.utils import get_topic_terms_by_relevance

def link_topic_terms(entity_linker, model, vectorizer,
                     dtm_tf, n_top_words, lambda_=0.6):
    """Link the top terms of every topic to Wikidata entities.

    The terms are obtained from ``get_topic_terms_by_relevance(model,
    vectorizer, dtm_tf, n_top_words, lambda_)`` and each one is passed to
    ``entity_linker.link_entity``.

    Parameters
    ----------
    entity_linker : object
        Anything exposing ``link_entity(term)``.
    model, vectorizer, dtm_tf, n_top_words, lambda_
        Forwarded unchanged to ``get_topic_terms_by_relevance``.

    Returns
    -------
    list of list
        One inner list per topic, holding the value returned by
        ``link_entity`` for each of that topic's terms, in order.
    """
    topic_terms = get_topic_terms_by_relevance(model, vectorizer, dtm_tf,
                                               n_top_words, lambda_)

    # The same term often shows up in several topics; remember the answer so
    # every distinct term costs a single lookup.
    linked_by_term = {}

    def _link(term):
        if term not in linked_by_term:
            linked_by_term[term] = entity_linker.link_entity(term)
        return linked_by_term[term]

    return [[_link(term) for term in topic]
            for topic in topic_terms]


In [8]:
# Link the 10 most relevant terms of each topic (lambda_=0.75) using the
# model and vectorizer steps of the loaded pipeline, then preview topic 2.
linked_terms = link_topic_terms(entity_linker, lda_pipe.named_steps['model'],
                                lda_pipe.named_steps['vectorizer'], dtm_tf, 
                                n_top_words=10, lambda_=0.75)
linked_terms[2]

[('base', 'http://www.wikidata.org/entity/Q191360'),
 ('sensor', 'http://www.wikidata.org/entity/Q167676'),
 ('system', 'http://www.wikidata.org/entity/Q58778'),
 ('node', 'http://www.wikidata.org/entity/Q756100'),
 ('agricultural', 'http://www.wikidata.org/entity/Q5356428'),
 ('breeding', 'http://www.wikidata.org/entity/Q227675'),
 ('technology', 'http://www.wikidata.org/entity/Q11016'),
 ('farming', 'http://www.wikidata.org/entity/Q11451'),
 ('power', 'http://www.wikidata.org/entity/Q25107'),
 ('iot', 'http://www.wikidata.org/entity/Q251212')]

## Topic labelling

In [19]:
import functools
import pdb

from dataclasses import dataclass

import networkx as nx


# Wikidata property ids whose claims WikidataGraphBuilder follows by default
# when expanding an entity into its neighbours.
# NOTE(review): presumably P31 = instance of, P279 = subclass of,
# P301 = category's main topic, P910 = topic's main category,
# P2578 = studies, P2579 = studied by — confirm against Wikidata.
WIKIDATA_PROPS_EXPAND = ['P31', 'P279', 'P301', 'P910', 'P2578', 'P2579']


def empty_if_keyerror(function):
    """
    Decorator: call ``function`` normally, but turn a ``KeyError`` raised
    during the call into an empty-string return value. Any other exception
    propagates unchanged.
    """
    def guarded(*args, **kwargs):
        try:
            outcome = function(*args, **kwargs)
        except KeyError:
            outcome = ""
        return outcome

    # Copy the name, docstring, etc. of the wrapped function onto the wrapper.
    return functools.wraps(function)(guarded)

def _build_uri(entity_id):
    return f"http://www.wikidata.org/entity/{entity_id}"

@empty_if_keyerror
def _get_aliases(entity_info, lang='en'):
    """Return the list of alias strings of ``entity_info`` in ``lang``.

    When a required key is missing the decorator makes this return ``""``
    (an empty string, not an empty list).
    """
    alias_values = []
    for alias_entry in entity_info['aliases'][lang]:
        alias_values.append(alias_entry['value'])
    return alias_values

@empty_if_keyerror
def _get_desc(entity_info, lang='en'):
    """Return the description of ``entity_info`` in ``lang``.

    Returns ``""`` (via the decorator) when a required key is missing.
    """
    description_entry = entity_info['descriptions'][lang]
    return description_entry['value']

@empty_if_keyerror
def _get_labels(entity_info, lang='en'):
    """Return the label of ``entity_info`` in ``lang``.

    Returns ``""`` (via the decorator) when a required key is missing.
    """
    label_entry = entity_info['labels'][lang]
    return label_entry['value']


@dataclass
class WikidataNode():
    """Plain record describing a Wikidata entity.

    Equality is the dataclass-generated field-wise comparison, while the
    hash is derived from ``uri`` alone.
    """
    label: str
    uri: str
    desc: str
    alias: str

    def __hash__(self):
        # Explicit hash so instances stay hashable despite the generated __eq__.
        return hash(self.uri)

    def to_dict(self):
        """Return the fields as a dict keyed 'alias', 'desc', 'label', 'uri'."""
        exported_fields = ('alias', 'desc', 'label', 'uri')
        return {field_name: getattr(self, field_name)
                for field_name in exported_fields}


class WikidataGraphBuilder():
    """Build an undirected neighbourhood graph around a topic's entities.

    Starting from the Wikidata entities linked to a topic's terms, the
    builder follows the claims of a configurable set of properties for up to
    ``max_hops`` hops and records every visited entity as a graph node with
    the attributes ``desc``, ``label`` and ``n`` (hop distance from the
    topic terms).

    Parameters
    ----------
    max_hops : int
        Entities further than this many hops from a topic term are not added.
    additional_props : iterable of str, optional
        Extra property ids to follow on top of ``WIKIDATA_PROPS_EXPAND``.
    verbose : bool
        When true (the default) every visit is printed; pass ``False`` to
        keep the notebook output small.
    """

    # Entity at which expansion always stops.
    # NOTE(review): presumably the "Wikimedia category" item — confirm.
    _STOP_ENTITY_ID = 'Q4167836'

    # Seconds to wait for the Wikidata API before giving up.
    REQUEST_TIMEOUT = 10

    def __init__(self, max_hops=2, additional_props=None, verbose=True):
        self.max_hops = max_hops
        # Work on a copy: extending the module-level default in place would
        # leak ``additional_props`` into every builder created afterwards.
        self.props_to_expand = list(WIKIDATA_PROPS_EXPAND)
        if additional_props:
            self.props_to_expand.extend(additional_props)
        self.verbose = verbose
        # Entity payloads already downloaded by this builder, keyed by id.
        self._entity_cache = {}

    def build_graph(self, topic):
        """Return the graph built from ``topic``.

        ``topic`` is an iterable of ``(label, uri)`` tuples as produced by
        the entity linker. Terms whose URI is ``None`` (no Wikidata match)
        are skipped.
        """
        G = nx.Graph()
        for term in topic:
            term_uri = term[1]
            if not term_uri:
                continue
            term_id = term_uri.split('/')[-1]
            self._add_wd_node_info(G, term_id, None, 0)
        return G

    def _fetch_entity(self, term_id):
        """Return the ``wbgetentities`` payload for ``term_id``.

        Payloads are cached per builder because the same entity is usually
        reached through many different paths.

        Raises ``requests.exceptions.HTTPError`` when the API answers with a
        status code other than 200.
        """
        if term_id in self._entity_cache:
            return self._entity_cache[term_id]

        res = requests.get(
            f"{WIKIDATA_BASE}/api.php",
            params={
                'action': 'wbgetentities',
                'ids': term_id,
                'languages': 'en',
                'format': 'json',
            },
            timeout=self.REQUEST_TIMEOUT,
        )
        if res.status_code != 200:
            raise requests.exceptions.HTTPError(
                f"Fetching Wikidata entity {term_id!r} failed with "
                f"status code {res.status_code}",
                response=res)

        entity_info = json.loads(res.text)['entities'][term_id]
        self._entity_cache[term_id] = entity_info
        return entity_info

    def _add_wd_node_info(self, graph, term_id, prev_node, curr_hop):
        """Add ``term_id`` to ``graph`` and recurse into its neighbours.

        ``prev_node`` is the entity this one was reached from (``None`` for
        topic terms) and ``curr_hop`` the number of hops travelled so far.
        """
        if self.verbose:
            print(f"Visiting entity '{term_id}' - Curr hop: {curr_hop}")
        if curr_hop > self.max_hops or term_id == self._STOP_ENTITY_ID:
            return

        entity_info = self._fetch_entity(term_id)

        if term_id not in graph.nodes:
            graph.add_node(term_id)
            #graph.nodes[term_id]['alias'] = _get_aliases(entity_info)
            graph.nodes[term_id]['desc'] = _get_desc(entity_info)
            graph.nodes[term_id]['label'] = _get_labels(entity_info)
            graph.nodes[term_id]['n'] = curr_hop
        elif curr_hop < graph.nodes[term_id]['n']:
            # The traversal is depth-first, so an entity can be met on a long
            # path before a shorter one; keep the shortest distance so that
            # topic terms always end up with n == 0.
            graph.nodes[term_id]['n'] = curr_hop

        if prev_node is not None and not graph.has_edge(prev_node, term_id):
            graph.add_edge(prev_node, term_id)

        # An entity payload without claims is treated as having no neighbours.
        for claim_key, claim_values in entity_info.get('claims', {}).items():
            if claim_key not in self.props_to_expand:
                continue

            for value in claim_values:
                snaktype = value['mainsnak']['snaktype']
                # 'novalue' / 'somevalue' snaks carry no target entity.
                if snaktype in ['novalue', 'somevalue']:
                    continue

                new_node_id = value['mainsnak']['datavalue']['value']['id']
                self._add_wd_node_info(graph, new_node_id, term_id, curr_hop + 1)


In [20]:
# Build one Wikidata neighbourhood graph per topic. Each visited entity
# triggers a Wikidata API call and a printed line, so this cell is slow
# and its output is long.
graph_builder = WikidataGraphBuilder(max_hops=2)
topic_graphs = [graph_builder.build_graph(topic) for topic in linked_terms]

Visiting entity 'Q2095' - Curr hop: 0
Visiting entity 'Q2424752' - Curr hop: 1
Visiting entity 'Q28877' - Curr hop: 2
Visiting entity 'Q29028649' - Curr hop: 3
Visiting entity 'Q64513524' - Curr hop: 3
Visiting entity 'Q5672864' - Curr hop: 3
Visiting entity 'Q8134' - Curr hop: 3
Visiting entity 'Q337060' - Curr hop: 3
Visiting entity 'Q8205328' - Curr hop: 2
Visiting entity 'Q223557' - Curr hop: 3
Visiting entity 'Q16686448' - Curr hop: 3
Visiting entity 'Q26991679' - Curr hop: 3
Visiting entity 'Q15401930' - Curr hop: 2
Visiting entity 'Q488383' - Curr hop: 3
Visiting entity 'Q7189878' - Curr hop: 2
Visiting entity 'Q2424752' - Curr hop: 3
Visiting entity 'Q2897903' - Curr hop: 3
Visiting entity 'Q4167836' - Curr hop: 3
Visiting entity 'Q1194058' - Curr hop: 1
Visiting entity 'Q2424752' - Curr hop: 2
Visiting entity 'Q28877' - Curr hop: 3
Visiting entity 'Q8205328' - Curr hop: 3
Visiting entity 'Q15401930' - Curr hop: 3
Visiting entity 'Q7189878' - Curr hop: 3
Visiting entity 'Q16813

Visiting entity 'Q4167836' - Curr hop: 3
Visiting entity 'Q1496967' - Curr hop: 3
Visiting entity 'Q6327738' - Curr hop: 1
Visiting entity 'Q52105' - Curr hop: 2
Visiting entity 'Q1496967' - Curr hop: 3
Visiting entity 'Q6327738' - Curr hop: 3
Visiting entity 'Q4167836' - Curr hop: 2
Visiting entity 'Q1006733' - Curr hop: 0
Visiting entity 'Q1496967' - Curr hop: 1
Visiting entity 'Q82794' - Curr hop: 2
Visiting entity 'Q618123' - Curr hop: 3
Visiting entity 'Q27096213' - Curr hop: 3
Visiting entity 'Q2221906' - Curr hop: 3
Visiting entity 'Q6581383' - Curr hop: 3
Visiting entity 'Q817121' - Curr hop: 3
Visiting entity 'Q1595873' - Curr hop: 3
Visiting entity 'Q928786' - Curr hop: 3
Visiting entity 'Q8843932' - Curr hop: 2
Visiting entity 'Q4167836' - Curr hop: 3
Visiting entity 'Q1496967' - Curr hop: 3
Visiting entity 'Q101998' - Curr hop: 1
Visiting entity 'Q107425' - Curr hop: 2
Visiting entity 'Q1496967' - Curr hop: 3
Visiting entity 'Q7143080' - Curr hop: 3
Visiting entity 'Q303360

Visiting entity 'Q7153080' - Curr hop: 3
Visiting entity 'Q28797' - Curr hop: 3
Visiting entity 'Q2095' - Curr hop: 3
Visiting entity 'Q336' - Curr hop: 3
Visiting entity 'Q25403900' - Curr hop: 1
Visiting entity 'Q10675206' - Curr hop: 2
Visiting entity 'Q15989253' - Curr hop: 3
Visiting entity 'Q214609' - Curr hop: 3
Visiting entity 'Q2095' - Curr hop: 2
Visiting entity 'Q2424752' - Curr hop: 3
Visiting entity 'Q1194058' - Curr hop: 3
Visiting entity 'Q1422299' - Curr hop: 3
Visiting entity 'Q5611149' - Curr hop: 3
Visiting entity 'Q2111686' - Curr hop: 3
Visiting entity 'Q1637030' - Curr hop: 3
Visiting entity 'Q7162764' - Curr hop: 2
Visiting entity 'Q4167836' - Curr hop: 3
Visiting entity 'Q25403900' - Curr hop: 3
Visiting entity 'Q50413986' - Curr hop: 2
Visiting entity 'Q50377228' - Curr hop: 3
Visiting entity 'Q7134776' - Curr hop: 1
Visiting entity 'Q11004' - Curr hop: 2
Visiting entity 'Q756' - Curr hop: 3
Visiting entity 'Q2095' - Curr hop: 3
Visiting entity 'Q25403900' - Cu

Visiting entity 'Q11862829' - Curr hop: 3
Visiting entity 'Q420' - Curr hop: 2
Visiting entity 'Q7991' - Curr hop: 3
Visiting entity 'Q1457402' - Curr hop: 3
Visiting entity 'Q2465832' - Curr hop: 3
Visiting entity 'Q11862829' - Curr hop: 3
Visiting entity 'Q3' - Curr hop: 3
Visiting entity 'Q7239' - Curr hop: 3
Visiting entity 'Q24454422' - Curr hop: 2
Visiting entity 'Q4671286' - Curr hop: 3
Visiting entity 'Q11862829' - Curr hop: 3
Visiting entity 'Q420' - Curr hop: 3
Visiting entity 'Q4056905' - Curr hop: 2
Visiting entity 'Q441' - Curr hop: 3
Visiting entity 'Q59541917' - Curr hop: 3
Visiting entity 'Q756' - Curr hop: 2
Visiting entity 'Q1457109' - Curr hop: 3
Visiting entity 'Q16521' - Curr hop: 3
Visiting entity 'Q7201518' - Curr hop: 3
Visiting entity 'Q441' - Curr hop: 3
Visiting entity 'Q7239' - Curr hop: 3
Visiting entity 'Q7239' - Curr hop: 1
Visiting entity 'Q16686022' - Curr hop: 2
Visiting entity 'Q223557' - Curr hop: 3
Visiting entity 'Q29651224' - Curr hop: 3
Visiting 

Visiting entity 'Q6118457' - Curr hop: 3
Visiting entity 'Q8294189' - Curr hop: 3
Visiting entity 'Q17350442' - Curr hop: 3
Visiting entity 'Q13226383' - Curr hop: 3
Visiting entity 'Q39659371' - Curr hop: 3
Visiting entity 'Q8187769' - Curr hop: 3
Visiting entity 'Q8134' - Curr hop: 3
Visiting entity 'Q97008347' - Curr hop: 3
Visiting entity 'Q251212' - Curr hop: 2
Visiting entity 'Q6465232' - Curr hop: 3
Visiting entity 'Q151885' - Curr hop: 3
Visiting entity 'Q1067099' - Curr hop: 3
Visiting entity 'Q97008347' - Curr hop: 3
Visiting entity 'Q21198' - Curr hop: 3
Visiting entity 'Q16513426' - Curr hop: 2
Visiting entity 'Q9675721' - Curr hop: 3
Visiting entity 'Q1775867' - Curr hop: 3
Visiting entity 'Q97008347' - Curr hop: 3
Visiting entity 'Q8134' - Curr hop: 3
Visiting entity 'Q9418' - Curr hop: 3
Visiting entity 'Q202875' - Curr hop: 2
Visiting entity 'Q8670529' - Curr hop: 3
Visiting entity 'Q131395' - Curr hop: 3
Visiting entity 'Q1914636' - Curr hop: 3
Visiting entity 'Q715757

Visiting entity 'Q24060707' - Curr hop: 3
Visiting entity 'Q9915876' - Curr hop: 3
Visiting entity 'Q731453' - Curr hop: 0
Visiting entity 'Q35127' - Curr hop: 0
Visiting entity 'Q1076968' - Curr hop: 1
Visiting entity 'Q8064942' - Curr hop: 2
Visiting entity 'Q1076968' - Curr hop: 3
Visiting entity 'Q4167836' - Curr hop: 3
Visiting entity 'Q1209283' - Curr hop: 2
Visiting entity 'Q340169' - Curr hop: 3
Visiting entity 'Q1186952' - Curr hop: 1
Visiting entity 'Q340169' - Curr hop: 2
Visiting entity 'Q286583' - Curr hop: 3
Visiting entity 'Q7162135' - Curr hop: 3
Visiting entity 'Q50636' - Curr hop: 3
Visiting entity 'Q97008347' - Curr hop: 3
Visiting entity 'Q8550133' - Curr hop: 2
Visiting entity 'Q1186952' - Curr hop: 3
Visiting entity 'Q4167836' - Curr hop: 3
Visiting entity 'Q1714118' - Curr hop: 1
Visiting entity 'Q21572908' - Curr hop: 2
Visiting entity 'Q81941037' - Curr hop: 3
Visiting entity 'Q732577' - Curr hop: 3
Visiting entity 'Q17537576' - Curr hop: 2
Visiting entity 'Q15

Visiting entity 'Q811679' - Curr hop: 1
Visiting entity 'Q391414' - Curr hop: 2
Visiting entity 'Q19603939' - Curr hop: 3
Visiting entity 'Q3955017' - Curr hop: 3
Visiting entity 'Q6477453' - Curr hop: 3
Visiting entity 'Q9149869' - Curr hop: 2
Visiting entity 'Q4167836' - Curr hop: 3
Visiting entity 'Q811679' - Curr hop: 3
Visiting entity 'Q8970514' - Curr hop: 1
Visiting entity 'Q4167836' - Curr hop: 2
Visiting entity 'Q191360' - Curr hop: 2
Visiting entity 'Q811679' - Curr hop: 3
Visiting entity 'Q8970514' - Curr hop: 3
Visiting entity 'Q811430' - Curr hop: 3
Visiting entity 'Q811430' - Curr hop: 1
Visiting entity 'Q35145743' - Curr hop: 2
Visiting entity 'Q6581281' - Curr hop: 3
Visiting entity 'Q27096235' - Curr hop: 3
Visiting entity 'Q618123' - Curr hop: 3
Visiting entity 'Q6671777' - Curr hop: 2
Visiting entity 'Q35120' - Curr hop: 3
Visiting entity 'Q6643007' - Curr hop: 3
Visiting entity 'Q386724' - Curr hop: 2
Visiting entity 'Q15401930' - Curr hop: 3
Visiting entity 'Q16686

Visiting entity 'Q5356436' - Curr hop: 1
Visiting entity 'Q8410634' - Curr hop: 2
Visiting entity 'Q4167836' - Curr hop: 3
Visiting entity 'Q5356436' - Curr hop: 3
Visiting entity 'Q192611' - Curr hop: 2
Visiting entity 'Q6527718' - Curr hop: 3
Visiting entity 'Q56061' - Curr hop: 3
Visiting entity 'Q227675' - Curr hop: 0
Visiting entity 'Q995745' - Curr hop: 1
Visiting entity 'Q1094654' - Curr hop: 2
Visiting entity 'Q18054608' - Curr hop: 3
Visiting entity 'Q7245244' - Curr hop: 3
Visiting entity 'Q483247' - Curr hop: 3
Visiting entity 'Q7245244' - Curr hop: 2
Visiting entity 'Q1094654' - Curr hop: 3
Visiting entity 'Q4167836' - Curr hop: 3
Visiting entity 'Q4023882' - Curr hop: 1
Visiting entity 'Q1094654' - Curr hop: 2
Visiting entity 'Q18054608' - Curr hop: 3
Visiting entity 'Q7245244' - Curr hop: 3
Visiting entity 'Q483247' - Curr hop: 3
Visiting entity 'Q80962' - Curr hop: 1
Visiting entity 'Q7132779' - Curr hop: 2
Visiting entity 'Q4167836' - Curr hop: 3
Visiting entity 'Q80962

Visiting entity 'Q151885' - Curr hop: 2
Visiting entity 'Q2145290' - Curr hop: 3
Visiting entity 'Q7184903' - Curr hop: 3
Visiting entity 'Q1347367' - Curr hop: 3
Visiting entity 'Q5550686' - Curr hop: 3
Visiting entity 'Q23958852' - Curr hop: 3
Visiting entity 'Q9635555' - Curr hop: 2
Visiting entity 'Q4167836' - Curr hop: 3
Visiting entity 'Q33104129' - Curr hop: 3
Visiting entity 'Q33104069' - Curr hop: 1
Visiting entity 'Q151885' - Curr hop: 2
Visiting entity 'Q2145290' - Curr hop: 3
Visiting entity 'Q7184903' - Curr hop: 3
Visiting entity 'Q1347367' - Curr hop: 3
Visiting entity 'Q5550686' - Curr hop: 3
Visiting entity 'Q23958852' - Curr hop: 3
Visiting entity 'Q6210902' - Curr hop: 2
Visiting entity 'Q4167836' - Curr hop: 3
Visiting entity 'Q33104069' - Curr hop: 3
Visiting entity 'Q36442' - Curr hop: 2
Visiting entity 'Q15831440' - Curr hop: 3
Visiting entity 'Q7146497' - Curr hop: 3
Visiting entity 'Q4671286' - Curr hop: 3
Visiting entity 'Q11862829' - Curr hop: 3
Visiting enti

Visiting entity 'Q1458390' - Curr hop: 3
Visiting entity 'Q340169' - Curr hop: 3
Visiting entity 'Q8148' - Curr hop: 3
Visiting entity 'Q11862829' - Curr hop: 3
Visiting entity 'Q1047113' - Curr hop: 3
Visiting entity 'Q165650' - Curr hop: 3
Visiting entity 'Q14623843' - Curr hop: 3
Visiting entity 'Q97008347' - Curr hop: 3
Visiting entity 'Q202833' - Curr hop: 2
Visiting entity 'Q340169' - Curr hop: 3
Visiting entity 'Q6893208' - Curr hop: 3
Visiting entity 'Q8187769' - Curr hop: 3
Visiting entity 'Q8148' - Curr hop: 3
Visiting entity 'Q165650' - Curr hop: 3
Visiting entity 'Q97008347' - Curr hop: 3
Visiting entity 'Q75' - Curr hop: 2
Visiting entity 'Q11224256' - Curr hop: 3
Visiting entity 'Q1301371' - Curr hop: 3
Visiting entity 'Q4049595' - Curr hop: 3
Visiting entity 'Q1068715' - Curr hop: 3
Visiting entity 'Q14623843' - Curr hop: 3
Visiting entity 'Q165650' - Curr hop: 3
Visiting entity 'Q97008347' - Curr hop: 3
Visiting entity 'Q21198' - Curr hop: 1
Visiting entity 'Q6517860' -

Visiting entity 'Q1362373' - Curr hop: 1
Visiting entity 'Q756' - Curr hop: 2
Visiting entity 'Q1457109' - Curr hop: 3
Visiting entity 'Q16521' - Curr hop: 3
Visiting entity 'Q7201518' - Curr hop: 3
Visiting entity 'Q441' - Curr hop: 3
Visiting entity 'Q7239' - Curr hop: 3
Visiting entity 'Q2424752' - Curr hop: 2
Visiting entity 'Q28877' - Curr hop: 3
Visiting entity 'Q8205328' - Curr hop: 3
Visiting entity 'Q15401930' - Curr hop: 3
Visiting entity 'Q7189878' - Curr hop: 3
Visiting entity 'Q13784641' - Curr hop: 1
Visiting entity 'Q4167836' - Curr hop: 2
Visiting entity 'Q235352' - Curr hop: 2
Visiting entity 'Q1362373' - Curr hop: 3
Visiting entity 'Q13784641' - Curr hop: 3
Visiting entity 'Q889514' - Curr hop: 0
Visiting entity 'Q107715' - Curr hop: 1
Visiting entity 'Q4373292' - Curr hop: 2
Visiting entity 'Q937228' - Curr hop: 3
Visiting entity 'Q8762242' - Curr hop: 3
Visiting entity 'Q96253971' - Curr hop: 3
Visiting entity 'Q309314' - Curr hop: 2
Visiting entity 'Q937228' - Curr

Visiting entity 'Q7022098' - Curr hop: 3
Visiting entity 'Q1027879' - Curr hop: 3
Visiting entity 'Q6504956' - Curr hop: 3
Visiting entity 'Q9185289' - Curr hop: 2
Visiting entity 'Q61685689' - Curr hop: 3
Visiting entity 'Q4167836' - Curr hop: 3
Visiting entity 'Q17281278' - Curr hop: 0
Visiting entity 'Q1150070' - Curr hop: 1
Visiting entity 'Q1190554' - Curr hop: 2
Visiting entity 'Q7214908' - Curr hop: 3
Visiting entity 'Q26907166' - Curr hop: 3
Visiting entity 'Q33104279' - Curr hop: 2
Visiting entity 'Q151885' - Curr hop: 3
Visiting entity 'Q7145049' - Curr hop: 3
Visiting entity 'Q1190554' - Curr hop: 1
Visiting entity 'Q7214908' - Curr hop: 2
Visiting entity 'Q4167836' - Curr hop: 3
Visiting entity 'Q1190554' - Curr hop: 3
Visiting entity 'Q26907166' - Curr hop: 2
Visiting entity 'Q58415929' - Curr hop: 3
Visiting entity 'Q756' - Curr hop: 0
Visiting entity 'Q1457109' - Curr hop: 1
Visiting entity 'Q756' - Curr hop: 2
Visiting entity 'Q1457109' - Curr hop: 3
Visiting entity 'Q1

Visiting entity 'Q5446571' - Curr hop: 3
Visiting entity 'Q3895768' - Curr hop: 3
Visiting entity 'Q4167836' - Curr hop: 3
Visiting entity 'Q14897293' - Curr hop: 2
Visiting entity 'Q17537576' - Curr hop: 3
Visiting entity 'Q64728693' - Curr hop: 3
Visiting entity 'Q15831596' - Curr hop: 2
Visiting entity 'Q14897293' - Curr hop: 3
Visiting entity 'Q5127848' - Curr hop: 3
Visiting entity 'Q23958852' - Curr hop: 3
Visiting entity 'Q16795689' - Curr hop: 1
Visiting entity 'Q4167836' - Curr hop: 2
Visiting entity 'Q1046315' - Curr hop: 2
Visiting entity 'Q7184903' - Curr hop: 3
Visiting entity 'Q3895768' - Curr hop: 3
Visiting entity 'Q16795689' - Curr hop: 3
Visiting entity 'Q55070019' - Curr hop: 3
Visiting entity 'Q55070019' - Curr hop: 1
Visiting entity 'Q1207505' - Curr hop: 2
Visiting entity 'Q33104279' - Curr hop: 3
Visiting entity 'Q25271255' - Curr hop: 3
Visiting entity 'Q937228' - Curr hop: 3
Visiting entity 'Q111029' - Curr hop: 0
Visiting entity 'Q43249' - Curr hop: 1
Visiting

Visiting entity 'Q23959932' - Curr hop: 2
Visiting entity 'Q23960977' - Curr hop: 3
Visiting entity 'Q19361238' - Curr hop: 3
Visiting entity 'Q23958852' - Curr hop: 3
Visiting entity 'Q24017465' - Curr hop: 2
Visiting entity 'Q23959932' - Curr hop: 3
Visiting entity 'Q24027474' - Curr hop: 3
Visiting entity 'Q11158' - Curr hop: 0
Visiting entity 'Q11173' - Curr hop: 1
Visiting entity 'Q578779' - Curr hop: 2
Visiting entity 'Q79529' - Curr hop: 3
Visiting entity 'Q20026787' - Curr hop: 2
Visiting entity 'Q19478619' - Curr hop: 3
Visiting entity 'Q1310239' - Curr hop: 3
Visiting entity 'Q6482368' - Curr hop: 2
Visiting entity 'Q11173' - Curr hop: 3
Visiting entity 'Q4167836' - Curr hop: 3
Visiting entity 'Q2329' - Curr hop: 2
Visiting entity 'Q1457474' - Curr hop: 3
Visiting entity 'Q11173' - Curr hop: 3
Visiting entity 'Q17339814' - Curr hop: 3
Visiting entity 'Q11344' - Curr hop: 3
Visiting entity 'Q79529' - Curr hop: 3
Visiting entity 'Q11023' - Curr hop: 3
Visiting entity 'Q14632398

Visiting entity 'Q6465232' - Curr hop: 3
Visiting entity 'Q151885' - Curr hop: 3
Visiting entity 'Q1067099' - Curr hop: 3
Visiting entity 'Q97008347' - Curr hop: 3
Visiting entity 'Q21198' - Curr hop: 3
Visiting entity 'Q16513426' - Curr hop: 2
Visiting entity 'Q9675721' - Curr hop: 3
Visiting entity 'Q1775867' - Curr hop: 3
Visiting entity 'Q97008347' - Curr hop: 3
Visiting entity 'Q8134' - Curr hop: 3
Visiting entity 'Q9418' - Curr hop: 3
Visiting entity 'Q202875' - Curr hop: 2
Visiting entity 'Q8670529' - Curr hop: 3
Visiting entity 'Q131395' - Curr hop: 3
Visiting entity 'Q1914636' - Curr hop: 3
Visiting entity 'Q7157572' - Curr hop: 3
Visiting entity 'Q97008347' - Curr hop: 3
Visiting entity 'Q8134' - Curr hop: 3
Visiting entity 'Q7163' - Curr hop: 2
Visiting entity 'Q3482410' - Curr hop: 3
Visiting entity 'Q4686698' - Curr hop: 3
Visiting entity 'Q1914636' - Curr hop: 3
Visiting entity 'Q11862829' - Curr hop: 3
Visiting entity 'Q4103183' - Curr hop: 3
Visiting entity 'Q36442' - C

Visiting entity 'Q7116451' - Curr hop: 3
Visiting entity 'Q79529' - Curr hop: 3
Visiting entity 'Q50413986' - Curr hop: 3
Visiting entity 'Q8798082' - Curr hop: 2
Visiting entity 'Q131656' - Curr hop: 3
Visiting entity 'Q4167836' - Curr hop: 3
Visiting entity 'Q50380088' - Curr hop: 1
Visiting entity 'Q50413986' - Curr hop: 2
Visiting entity 'Q50377228' - Curr hop: 3
Visiting entity 'Q50379782' - Curr hop: 2
Visiting entity 'Q50377224' - Curr hop: 3
Visiting entity 'Q14535662' - Curr hop: 1
Visiting entity 'Q1664404' - Curr hop: 1
Visiting entity 'Q2251595' - Curr hop: 2
Visiting entity 'Q9332' - Curr hop: 3
Visiting entity 'Q1914636' - Curr hop: 3
Visiting entity 'Q1717246' - Curr hop: 3
Visiting entity 'Q166154' - Curr hop: 0
Visiting entity 'Q211548' - Curr hop: 1
Visiting entity 'Q36161' - Curr hop: 2
Visiting entity 'Q217594' - Curr hop: 3
Visiting entity 'Q17008256' - Curr hop: 3
Visiting entity 'Q5469988' - Curr hop: 3
Visiting entity 'Q246672' - Curr hop: 3
Visiting entity 'Q86

Visiting entity 'Q622852' - Curr hop: 3
Visiting entity 'Q4167836' - Curr hop: 3
Visiting entity 'Q729' - Curr hop: 2
Visiting entity 'Q16521' - Curr hop: 3
Visiting entity 'Q7157802' - Curr hop: 3
Visiting entity 'Q6254409' - Curr hop: 3
Visiting entity 'Q7239' - Curr hop: 3
Visiting entity 'Q57812559' - Curr hop: 1
Visiting entity 'Q26401003' - Curr hop: 2
Visiting entity 'Q7157842' - Curr hop: 3
Visiting entity 'Q795052' - Curr hop: 3
Visiting entity 'Q729' - Curr hop: 3
Visiting entity 'Q18336849' - Curr hop: 3
Visiting entity 'Q756' - Curr hop: 0
Visiting entity 'Q1457109' - Curr hop: 1
Visiting entity 'Q756' - Curr hop: 2
Visiting entity 'Q1457109' - Curr hop: 3
Visiting entity 'Q16521' - Curr hop: 3
Visiting entity 'Q7201518' - Curr hop: 3
Visiting entity 'Q441' - Curr hop: 3
Visiting entity 'Q7239' - Curr hop: 3
Visiting entity 'Q4167836' - Curr hop: 2
Visiting entity 'Q16521' - Curr hop: 1
Visiting entity 'Q16889133' - Curr hop: 2
Visiting entity 'Q19478619' - Curr hop: 3
Visi

Visiting entity 'Q1190554' - Curr hop: 3
Visiting entity 'Q33104279' - Curr hop: 3
Visiting entity 'Q20937557' - Curr hop: 2
Visiting entity 'Q16887380' - Curr hop: 3
Visiting entity 'Q17281278' - Curr hop: 1
Visiting entity 'Q1150070' - Curr hop: 2
Visiting entity 'Q1190554' - Curr hop: 3
Visiting entity 'Q33104279' - Curr hop: 3
Visiting entity 'Q1190554' - Curr hop: 2
Visiting entity 'Q7214908' - Curr hop: 3
Visiting entity 'Q26907166' - Curr hop: 3
Visiting entity 'Q2996394' - Curr hop: 1
Visiting entity 'Q3249551' - Curr hop: 2
Visiting entity 'Q1190554' - Curr hop: 3
Visiting entity 'Q1150070' - Curr hop: 3
Visiting entity 'Q20937557' - Curr hop: 3
Visiting entity 'Q3249551' - Curr hop: 2
Visiting entity 'Q1190554' - Curr hop: 3
Visiting entity 'Q1150070' - Curr hop: 3
Visiting entity 'Q20937557' - Curr hop: 3
Visiting entity 'Q13878858' - Curr hop: 2
Visiting entity 'Q9060550' - Curr hop: 3
Visiting entity 'Q3249551' - Curr hop: 3
Visiting entity 'Q64732777' - Curr hop: 2
Visiti

Visiting entity 'Q174211' - Curr hop: 3
Visiting entity 'Q7215429' - Curr hop: 3
Visiting entity 'Q7204727' - Curr hop: 1
Visiting entity 'Q8054' - Curr hop: 2
Visiting entity 'Q422649' - Curr hop: 3
Visiting entity 'Q181394' - Curr hop: 3
Visiting entity 'Q424689' - Curr hop: 3
Visiting entity 'Q145273' - Curr hop: 3
Visiting entity 'Q66560214' - Curr hop: 3
Visiting entity 'Q7204727' - Curr hop: 3
Visiting entity 'Q17339814' - Curr hop: 3
Visiting entity 'Q24017414' - Curr hop: 3
Visiting entity 'Q4167836' - Curr hop: 2
Visiting entity 'Q17339814' - Curr hop: 1
Visiting entity 'Q16887380' - Curr hop: 2
Visiting entity 'Q488383' - Curr hop: 3
Visiting entity 'Q16889133' - Curr hop: 3
Visiting entity 'Q9101233' - Curr hop: 2
Visiting entity 'Q4167836' - Curr hop: 3
Visiting entity 'Q17339814' - Curr hop: 3
Visiting entity 'Q24017465' - Curr hop: 2
Visiting entity 'Q23959932' - Curr hop: 3
Visiting entity 'Q24027474' - Curr hop: 3
Visiting entity 'Q2329' - Curr hop: 2
Visiting entity 'Q

Visiting entity 'Q7020' - Curr hop: 3
Visiting entity 'Q7201518' - Curr hop: 3
Visiting entity 'Q47263' - Curr hop: 2
Visiting entity 'Q165963' - Curr hop: 3
Visiting entity 'Q5058355' - Curr hop: 3
Visiting entity 'Q7201518' - Curr hop: 3
Visiting entity 'Q441' - Curr hop: 1
Visiting entity 'Q11862829' - Curr hop: 2
Visiting entity 'Q1047113' - Curr hop: 3
Visiting entity 'Q6642719' - Curr hop: 3
Visiting entity 'Q24017414' - Curr hop: 3
Visiting entity 'Q28598684' - Curr hop: 2
Visiting entity 'Q2465832' - Curr hop: 3
Visiting entity 'Q24017414' - Curr hop: 3
Visiting entity 'Q8309465' - Curr hop: 3
Visiting entity 'Q4671286' - Curr hop: 2
Visiting entity 'Q11862829' - Curr hop: 3
Visiting entity 'Q420' - Curr hop: 2
Visiting entity 'Q7991' - Curr hop: 3
Visiting entity 'Q1457402' - Curr hop: 3
Visiting entity 'Q2465832' - Curr hop: 3
Visiting entity 'Q11862829' - Curr hop: 3
Visiting entity 'Q3' - Curr hop: 3
Visiting entity 'Q7239' - Curr hop: 3
Visiting entity 'Q24454422' - Curr h

Visiting entity 'Q451967' - Curr hop: 1
Visiting entity 'Q3769299' - Curr hop: 2
Visiting entity 'Q6697416' - Curr hop: 3
Visiting entity 'Q9332' - Curr hop: 3
Visiting entity 'Q9418' - Curr hop: 3
Visiting entity 'Q4026292' - Curr hop: 2
Visiting entity 'Q175661' - Curr hop: 3
Visiting entity 'Q25233764' - Curr hop: 3
Visiting entity 'Q33104279' - Curr hop: 3
Visiting entity 'Q1190554' - Curr hop: 3
Visiting entity 'Q3249551' - Curr hop: 2
Visiting entity 'Q1190554' - Curr hop: 3
Visiting entity 'Q1150070' - Curr hop: 3
Visiting entity 'Q20937557' - Curr hop: 3
Visiting entity 'Q61788060' - Curr hop: 2
Visiting entity 'Q1914636' - Curr hop: 3
Visiting entity 'Q9878968' - Curr hop: 3
Visiting entity 'Q13966824' - Curr hop: 2
Visiting entity 'Q4167836' - Curr hop: 3
Visiting entity 'Q451967' - Curr hop: 3
Visiting entity 'Q7748' - Curr hop: 2
Visiting entity 'Q1151067' - Curr hop: 3
Visiting entity 'Q4026563' - Curr hop: 3
Visiting entity 'Q4932206' - Curr hop: 3
Visiting entity 'Q38299

Visiting entity 'Q4671286' - Curr hop: 3
Visiting entity 'Q11862829' - Curr hop: 3
Visiting entity 'Q420' - Curr hop: 3
Visiting entity 'Q4056905' - Curr hop: 2
Visiting entity 'Q441' - Curr hop: 3
Visiting entity 'Q59541917' - Curr hop: 3
Visiting entity 'Q756' - Curr hop: 2
Visiting entity 'Q1457109' - Curr hop: 3
Visiting entity 'Q16521' - Curr hop: 3
Visiting entity 'Q7201518' - Curr hop: 3
Visiting entity 'Q441' - Curr hop: 3
Visiting entity 'Q7239' - Curr hop: 3
Visiting entity 'Q7239' - Curr hop: 1
Visiting entity 'Q16686022' - Curr hop: 2
Visiting entity 'Q223557' - Curr hop: 3
Visiting entity 'Q29651224' - Curr hop: 3
Visiting entity 'Q1274979' - Curr hop: 2
Visiting entity 'Q35120' - Curr hop: 3
Visiting entity 'Q21452697' - Curr hop: 3
Visiting entity 'Q5626060' - Curr hop: 2
Visiting entity 'Q4167836' - Curr hop: 3
Visiting entity 'Q7239' - Curr hop: 3
Visiting entity 'Q420' - Curr hop: 2
Visiting entity 'Q7991' - Curr hop: 3
Visiting entity 'Q1457402' - Curr hop: 3
Visitin

Visiting entity 'Q591041' - Curr hop: 2
Visiting entity 'Q732577' - Curr hop: 3
Visiting entity 'Q191067' - Curr hop: 2
Visiting entity 'Q234460' - Curr hop: 3
Visiting entity 'Q47461344' - Curr hop: 3
Visiting entity 'Q732577' - Curr hop: 3
Visiting entity 'Q9499516' - Curr hop: 3
Visiting entity 'Q55915575' - Curr hop: 2
Visiting entity 'Q15621286' - Curr hop: 3
Visiting entity 'Q47461344' - Curr hop: 3
Visiting entity 'Q10843248' - Curr hop: 2
Visiting entity 'Q4167836' - Curr hop: 3
Visiting entity 'Q13442814' - Curr hop: 3
Visiting entity 'Q7187' - Curr hop: 0
Visiting entity 'Q863908' - Curr hop: 1
Visiting entity 'Q37500013' - Curr hop: 2
Visiting entity 'Q3511065' - Curr hop: 3
Visiting entity 'Q3771876' - Curr hop: 2
Visiting entity 'Q15712714' - Curr hop: 3
Visiting entity 'Q3511065' - Curr hop: 2
Visiting entity 'Q15712714' - Curr hop: 3
Visiting entity 'Q50365914' - Curr hop: 1
Visiting entity 'Q3511065' - Curr hop: 1
Visiting entity 'Q15712714' - Curr hop: 2
Visiting entit

In [21]:
import networkx as nx

from bokeh.io import output_file, show
from bokeh.layouts import gridplot
from bokeh.models import (BoxZoomTool, Circle, HoverTool,
                          MultiLine, Plot, Range1d, ResetTool,)
from bokeh.palettes import Spectral4
from bokeh.plotting import from_networkx

def build_graph_plot(G, title=""):
    """Return a Bokeh plot of graph ``G`` laid out with a spring layout.

    Every node is coloured according to its ``n`` attribute (hop distance)
    and shows its ``label`` and ``n`` on hover.

    Note: the chosen colour is stored on ``G`` itself as the ``node_color``
    node attribute, i.e. the input graph is modified.

    Parameters
    ----------
    G : networkx graph whose nodes carry the attributes ``n`` and ``label``.
    title : str
        Text shown as the plot title.
    """
    # NOTE(review): `plot_width`/`plot_height` were renamed to
    # `width`/`height` in newer Bokeh releases — confirm the Bokeh version
    # this notebook targets before upgrading.
    plot = Plot(plot_width=400, plot_height=400,
                x_range=Range1d(-1.1, 1.1), y_range=Range1d(-1.1, 1.1))
    plot.title.text = title

    # One palette entry per hop distance; distances beyond the palette reuse
    # its last colour instead of raising an IndexError.
    last_colour = len(Spectral4) - 1
    node_colours = {node_id: Spectral4[min(attrs['n'], last_colour)]
                    for node_id, attrs in G.nodes(data=True)}
    nx.set_node_attributes(G, node_colours, "node_color")

    node_hover_tool = HoverTool(tooltips=[("Label", "@label"), ("n", "@n")])
    plot.add_tools(node_hover_tool, BoxZoomTool(), ResetTool())

    graph_renderer = from_networkx(G, nx.spring_layout, scale=1, center=(0, 0))

    graph_renderer.node_renderer.glyph = Circle(size=15, fill_color="node_color")
    graph_renderer.edge_renderer.glyph = MultiLine(line_alpha=0.8, line_width=1)
    plot.renderers.append(graph_renderer)
    return plot


# Draw every topic graph and show them in a two-column grid.
plots = [build_graph_plot(g, f"Topic {idx}") 
         for idx, g in enumerate(topic_graphs)]
grid = gridplot(plots, ncols=2)
show(grid)

In [22]:
import networkx.algorithms as nxa

def get_largest_connected_subgraph(g):
    """Return a copy of the largest connected component of ``g``.

    "Largest" means the component with the most nodes; when several
    components tie, the first one reported by
    ``connected_components`` is kept. An empty graph yields an empty graph
    instead of raising.
    """
    components = nxa.components.connected_components(g)
    # Pick the winning node set first so that only one subgraph is copied.
    largest_nodes = max(components, key=len, default=())
    return g.subgraph(largest_nodes).copy()


In [23]:
# Keep only the largest connected component of each topic graph.
connected_topic_subgraphs = [get_largest_connected_subgraph(g) 
                             for g in topic_graphs]

In [24]:
# Same grid as before, restricted to each topic's largest connected component.
plots = [build_graph_plot(g, f"Largest Connected subgraph for topic {idx}") 
         for idx, g in enumerate(connected_topic_subgraphs)]
grid = gridplot(plots, ncols=2)
show(grid)

In [25]:
def show_algorithm_results(topic_subgraphs, algorithm, stop_uris, top_n):
    """Rank the nodes of each topic graph with ``algorithm`` and label the best.

    Parameters
    ----------
    topic_subgraphs : sequence of graphs
        Each graph must expose ``nodes[key]`` as a mapping with ``'n'`` and
        ``'label'`` entries.
    algorithm : callable
        Called once per graph; must return a mapping of node key to score.
    stop_uris : container
        Node keys that are never reported.
    top_n : int
        Maximum number of nodes reported per topic.

    Returns
    -------
    list of list of (label, key) tuples
        One list per topic, ordered by decreasing score. Nodes whose ``'n'``
        attribute equals 0 and nodes listed in ``stop_uris`` are left out.
    """
    # Score every graph up front, before any filtering takes place.
    scores_per_topic = list(map(algorithm, topic_subgraphs))

    labelled_topics = []
    for position, scores in enumerate(scores_per_topic):
        node_data = topic_subgraphs[position].nodes

        candidates = {}
        for qid, score in scores.items():
            if node_data[qid]['n'] == 0:
                continue
            if qid in stop_uris:
                continue
            candidates[qid] = score

        best_qids = sorted(candidates, key=candidates.get, reverse=True)[:top_n]
        labelled_topics.append(
            [(node_data[qid]['label'], qid) for qid in best_qids])
    return labelled_topics


In [30]:
# Display the first four entries of `linked_terms` as a sanity check.
linked_terms[:4]

[[('food', 'http://www.wikidata.org/entity/Q2095'),
  ('system', 'http://www.wikidata.org/entity/Q58778'),
  ('habitat', 'http://www.wikidata.org/entity/Q52105'),
  ('grassland', 'http://www.wikidata.org/entity/Q1006733'),
  ('production', 'http://www.wikidata.org/entity/Q739302'),
  ('agricultural', 'http://www.wikidata.org/entity/Q5356428'),
  ('bird', 'http://www.wikidata.org/entity/Q5113'),
  ('land', 'http://www.wikidata.org/entity/Q11081619'),
  ('vegetable', 'http://www.wikidata.org/entity/Q11004'),
  ('security', 'http://www.wikidata.org/entity/Q2526135')],
 [('plant', 'http://www.wikidata.org/entity/Q756'),
  ('expression', 'http://www.wikidata.org/entity/Q11024'),
  ('gene', 'http://www.wikidata.org/entity/Q7187'),
  ('adaptation', 'http://www.wikidata.org/entity/Q3331189'),
  ('root', 'http://www.wikidata.org/entity/Q111029'),
  ('shoot', 'http://www.wikidata.org/entity/Q220869'),
  ('diversification', 'http://www.wikidata.org/entity/Q731453'),
  ('site', 'http://www.wikidat

In [31]:
def try_algorithms(topic_subgraphs, algorithms, stop_uris, top_n=4):
    """Print the best-ranked nodes of every topic for each given algorithm.

    Parameters
    ----------
    topic_subgraphs : sequence of graphs
        Forwarded unchanged to ``show_algorithm_results``.
    algorithms : iterable of (callable, str)
        Pairs of a scoring function and the name printed as its heading.
    stop_uris : container
        Node keys excluded from the rankings.
    top_n : int, optional
        Number of nodes printed per topic (default 4).
    """
    for scoring_fn, display_name in algorithms:
        print(f'Algorithm: {display_name}')
        ranked_topics = show_algorithm_results(
            topic_subgraphs, scoring_fn, stop_uris, top_n)
        for topic_no, ranked_nodes in enumerate(ranked_topics):
            # The doubled newline reproduces the blank line after each topic.
            print(f"Topic {topic_no}:", ranked_nodes, end="\n\n")
        print()

        
# (scoring function, heading) pairs compared by `try_algorithms`.
algorithms = [
    (nxa.centrality.information_centrality,
     "Information centrality"),
    (nxa.centrality.eigenvector_centrality_numpy,
     "Eigenvector centrality"),
    (nxa.centrality.closeness_centrality,
     "Closeness centrality"),
    (nxa.centrality.betweenness_centrality,
     "Betweenness centrality"),
    (nxa.centrality.communicability_betweenness_centrality,
     "Communicability betweenness centrality"),
]

# Only the first four topics are compared. The two QIDs are excluded from the
# rankings -- NOTE(review): presumably overly generic Wikidata entities;
# confirm what they denote.
try_algorithms(
    topic_subgraphs=connected_topic_subgraphs[:4],
    algorithms=algorithms,
    stop_uris=['Q4167836', 'Q11862829'],
)

Algorithm: Information centrality
Topic 0: [('agronomy', 'Q173113'), ('systems engineering', 'Q682496'), ('systems science', 'Q2167061'), ('cybernetics', 'Q123637')]

Topic 1: [('interaction science', 'Q97008347'), ('communication studies', 'Q14623843'), ('interaction', 'Q52948'), ('communication medium', 'Q340169')]

Topic 2: [('interaction science', 'Q97008347'), ('economic sector', 'Q3958441'), ('science', 'Q336'), ('systems engineering', 'Q682496')]

Topic 3: [('food ingredient', 'Q25403900'), ('food', 'Q2095'), ('crop', 'Q235352'), ('specialty use of chemicals', 'Q50413986')]


Algorithm: Eigenvector centrality
Topic 0: [('food ingredient', 'Q25403900'), ('product', 'Q2424752'), ('disposable product', 'Q1194058'), ('agronomy', 'Q173113')]

Topic 1: [('interaction science', 'Q97008347'), ('communication medium', 'Q340169'), ('interaction', 'Q52948'), ('fairness', 'Q771773')]

Topic 2: [('interaction science', 'Q97008347'), ('science', 'Q336'), ('Internet', 'Q75'), ('trade', 'Q60140

The main functionality from above will be implemented in a custom class that conforms to the sklearn API:

In [None]:

class TopicLabeller():
    """Sklearn-style transformer that labels topics with central graph nodes.

    For each topic, a neighbourhood graph is built with ``graph_builder``,
    reduced to its largest connected component and scored with the centrality
    function ``r``; the best-scoring nodes become the topic's labels.

    Parameters
    ----------
    graph_builder : object
        Must provide ``build_graph(topic)`` returning a networkx graph.
    r : callable, optional
        Maps a graph to a ``{node: score}`` dict. Defaults to networkx's
        information centrality.
    num_labels_per_topic : int, optional
        Maximum number of labels returned per topic (default 1).
    """

    def __init__(self, graph_builder,
                 r=nxa.centrality.information_centrality,
                 num_labels_per_topic=1):
        self.graph_builder = graph_builder
        self.r = r
        self.num_labels = num_labels_per_topic

    def fit(self, X, y=None, **kwargs):
        """Do nothing and return ``self``; there is no state to learn."""
        return self

    def transform(self, X, y=None, **kwargs):
        """Return the labels of every topic in ``X``.

        ``y`` and any extra keyword arguments are accepted for API
        compatibility and ignored.
        """
        return [self.get_topic_labels(topic) for topic in X]

    def get_topic_labels(self, topic_graph):
        """Return up to ``num_labels`` ``(node data, score)`` pairs for a topic.

        Parameters
        ----------
        topic_graph
            The topic, passed unchanged to ``graph_builder.build_graph``.

        Returns
        -------
        list of (dict, float)
            Attribute dict and score of the best-scoring nodes, ordered by
            decreasing score.
        """
        topic_neighbourhood = self.graph_builder.build_graph(topic_graph)
        subgraph = get_largest_connected_subgraph(topic_neighbourhood)
        metrics = self.r(subgraph)
        # Seed concepts are not eligible as labels. They are recognised by a
        # node attribute n == 0, the same rule show_algorithm_results applies.
        # NOTE(review): assumes graph_builder sets 'n' on its nodes -- nodes
        # without that attribute are kept; confirm.
        candidates = {qid: score for qid, score in metrics.items()
                      if subgraph.nodes[qid].get('n') != 0}
        best_qids = sorted(candidates, key=candidates.get,
                           reverse=True)[:self.num_labels]
        # NOTE(review): the draft indexed `subgraph[qid]`, which is the
        # adjacency view of the node; the node's attribute dict is returned
        # instead because it is what can carry a label -- confirm.
        return [(subgraph.nodes[qid], metrics[qid]) for qid in best_qids]



## Add labels to LDA model