# 5. Entity Linking

In [1]:
# Project bootstrap script. Presumably this is what brings `os` and
# `RESULTS_DIR` into scope: later cells use both without importing or
# defining them in this notebook (TODO confirm against __init__.py).
%run __init__.py

## Defining the entity linking class

In [2]:
import json
import requests


WIKIDATA_BASE = "https://www.wikidata.org/w"

class WikidataEntityLinker():
    """Link free-text entity labels to Wikidata concept URIs.

    Each label is looked up with the Wikidata `wbsearchentities` API action
    (English search) and mapped to the `concepturi` of the first search hit.
    The class exposes `fit` / `transform` so it can be used as a pipeline step.
    """

    # Seconds to wait for the Wikidata API before giving up on a request, so a
    # stalled connection cannot hang the notebook indefinitely.
    request_timeout = 10

    def __init__(self, user, passwd):
        """Create a linker.

        Args:
            user: Accepted for interface compatibility; currently unused.
            passwd: Accepted for interface compatibility; currently unused.
        """
        pass

    def fit(self, X, y=None, *args):
        """No-op: the linker has nothing to learn. Returns `self`."""
        return self

    def transform(self, X, y=None, *args):
        """Link every entity of every document in `X`.

        Args:
            X: Iterable of documents, each an iterable of entity labels.
            y: Ignored; present for transformer-style call compatibility.

        Returns:
            A flat list of `(entity_label, concept_uri_or_None)` tuples, in
            document order and then entity order (document boundaries are
            not preserved).
        """
        return [self.link_entity(entity)
                for doc in X
                for entity in doc]

    def link_entity(self, entity_label):
        """Look up `entity_label` on Wikidata and return its best match.

        Args:
            entity_label: Text to search for.

        Returns:
            `(entity_label, concept_uri)` for the first search result, or
            `(entity_label, None)` when the search returns no results.

        Raises:
            RuntimeError: If the API answers with a non-200 HTTP status.
        """
        # Pass the query as `params` so the label is URL-encoded; labels
        # containing '&', '#', '+' etc. would otherwise corrupt the query.
        response = requests.get(
            f"{WIKIDATA_BASE}/api.php",
            params={
                "action": "wbsearchentities",
                "search": entity_label,
                "language": "en",
                "format": "json",
            },
            timeout=self.request_timeout,
        )
        if response.status_code != 200:
            raise RuntimeError(
                f"Wikidata search for {entity_label!r} failed with "
                f"HTTP status {response.status_code}"
            )
        content = json.loads(response.text)
        search_results = content['search']
        if len(search_results) == 0:
            return (entity_label, None)
        # Results are taken in the order the API returns them; keep the first.
        return (entity_label, search_results[0]['concepturi'])


In [3]:
# Smoke test: link a single label. The two constructor arguments are not
# used by WikidataEntityLinker.__init__, so empty strings are passed.
# NOTE: link_entity performs a live HTTP request to Wikidata.
entity_linker = WikidataEntityLinker("", "")
res = entity_linker.link_entity('agroforestry')
res

('agroforestry', 'http://www.wikidata.org/entity/Q397350')

## Linking each topic's terms to Wikidata

In [4]:
import dill as pickle

# Compatibility workaround applied before unpickling: register the name
# 'ClassType' in dill's reverse type map (mapped to `type`). Per the linked
# question, this addresses a KeyError: 'ClassType' when unpickling with dill
# on a different server than the one that produced the pickle.
# NOTE: this touches a private dill attribute and may break across versions.
# see https://stackoverflow.com/questions/42960637/python-3-5-dill-pickling-unpickling-on-different-servers-keyerror-classtype
pickle._dill._reverse_typemap['ClassType'] = type

def load_object(output_path):
    """Deserialize and return the object stored in the file at `output_path`.

    The file is opened in binary mode and read with the module bound to the
    name `pickle` (this notebook imports dill under that name).

    Only use this on trusted files: unpickling can execute arbitrary code.
    """
    with open(output_path, mode='rb') as pickled_file:
        return pickle.load(pickled_file)

In [5]:
# Location and file names of the artifacts read by this notebook. The
# directory name suggests they were written by the topic-modeling notebook
# (TODO confirm). `os` and `RESULTS_DIR` are not defined in this notebook;
# presumably they come from the `%run __init__.py` bootstrap cell.
NOTEBOOK_RESULTS_DIR = os.path.join(RESULTS_DIR, '3_topic_modeling')
lda_agriculture_pipe_filename = "agriculture_lda_model.pkl"
dtm_tf_filename = "agriculture_dtm_tf.pkl"

# Unpickle both artifacts. `lda_pipe` is later indexed via
# `named_steps['model']` / `named_steps['vectorizer']`; `dtm_tf` is passed
# through to the topic-term relevance helper.
lda_pipe = load_object(os.path.join(NOTEBOOK_RESULTS_DIR, lda_agriculture_pipe_filename))
dtm_tf = load_object(os.path.join(NOTEBOOK_RESULTS_DIR, dtm_tf_filename))

In [6]:
from src.utils import get_topic_terms_by_relevance

def link_topic_terms(entity_linker, model, vectorizer,
                     dtm_tf, n_top_words, lambda_=0.6):
    """Link the top terms of every topic with `entity_linker`.

    The terms come from `get_topic_terms_by_relevance`, which receives
    `model`, `vectorizer`, `dtm_tf`, `n_top_words` and `lambda_` unchanged.
    Its result is assumed to be one iterable of term strings per topic
    (TODO confirm against src.utils).

    Args:
        entity_linker: Object exposing `link_entity(label)`.
        model: Topic model, forwarded to the relevance helper.
        vectorizer: Vectorizer, forwarded to the relevance helper.
        dtm_tf: Document-term matrix, forwarded to the relevance helper.
        n_top_words: Number of terms requested per topic.
        lambda_: Relevance weight forwarded to the helper (default 0.6).

    Returns:
        A list with one entry per topic; each entry is the list of
        `entity_linker.link_entity(term)` results for that topic's terms,
        in the order the helper returned them.
    """
    topic_terms = get_topic_terms_by_relevance(model, vectorizer, dtm_tf,
                                               n_top_words, lambda_)
    return [[entity_linker.link_entity(entity) for entity in topic]
            for topic in topic_terms]


In [7]:
# Link the top 10 terms of every topic (relevance weight lambda_=0.75), using
# the 'model' and 'vectorizer' steps of the loaded pipeline.
# NOTE: every term goes through entity_linker.link_entity, which issues an
# HTTP request, so this cell needs network access and may be slow.
linked_terms = link_topic_terms(entity_linker, lda_pipe.named_steps['model'],
                                lda_pipe.named_steps['vectorizer'], dtm_tf, 
                                n_top_words=10, lambda_=0.75)

In [10]:
# Inspect the (term, Wikidata URI) pairs of the topic at index 2.
linked_terms[2]

[('base', 'http://www.wikidata.org/entity/Q191360'),
 ('sensor', 'http://www.wikidata.org/entity/Q167676'),
 ('system', 'http://www.wikidata.org/entity/Q58778'),
 ('node', 'http://www.wikidata.org/entity/Q756100'),
 ('agricultural', 'http://www.wikidata.org/entity/Q5356428'),
 ('breeding', 'http://www.wikidata.org/entity/Q227675'),
 ('technology', 'http://www.wikidata.org/entity/Q11016'),
 ('farming', 'http://www.wikidata.org/entity/Q11451'),
 ('power', 'http://www.wikidata.org/entity/Q25107'),
 ('iot', 'http://www.wikidata.org/entity/Q251212')]