# 5. Entity Linking

In [1]:
%run __init__.py

## Defining the entity linking class

In [30]:
import json
import requests


WIKIDATA_BASE = "https://www.wikidata.org/w"

class WikidataEntityLinker():
    def __init__(self, user, passwd):
        pass
    
    def fit(self, X, y, *args):
        return self
    
    def transform(self, X, y, *args):
        return [self.link_entity(entity) 
                for doc in X
                for entity in doc]
    
    def link_entity(self, entity_label):
        url = f"{wikidata_base}/api.php?action=wbsearchentities&search=" + \
            f"{entity_label}&language=en&format=json"
        response = requests.get(url)
        if response.status_code != 200:
            raise Error()
        content = json.loads(response.text)
        search_results = content['search']
        if len(search_results) == 0:
            return (entity_label, None)
        return (entity_label, search_results[0]['concepturi'])


In [31]:
entity_linker = WikidataEntityLinker("", "")
res = entity_linker.link_entity('agroforestry')
res

('agroforestry', 'http://www.wikidata.org/entity/Q397350')

## Linking each topic's term to Wikidata

In [61]:
import dill as pickle

# see https://stackoverflow.com/questions/42960637/python-3-5-dill-pickling-unpickling-on-different-servers-keyerror-classtype
pickle._dill._reverse_typemap['ClassType'] = type

def load_pipeline(output_path):
    with open(output_path, 'rb') as file:
        res = pickle.load(file)
    return res

In [63]:
NOTEBOOK_RESULTS_DIR = os.path.join(RESULTS_DIR, '3_topic_modeling')
lda_agriculture_pipe_filename = "agriculture_lda_model.pkl"

lda_pipe = load_pipeline(os.path.join(NOTEBOOK_RESULTS_DIR, lda_agriculture_pipe_filename))

In [64]:
def link_topic_terms(entity_linker, model, feature_names, n_top_words):
    res = []
    for topic in model.components_:
        res.append([entity_linker.link_entity(feature_names[i])
                    for i in topic.argsort()[:-n_top_words - 1:-1]])
    return res


In [65]:
linked_terms = link_topic_terms(entity_linker, lda_pipe.named_steps['model'],
                               lda_pipe.named_steps['vectorizer'].get_feature_names(),
                               n_top_words=15)

In [66]:
linked_terms[0]

[('food', 'http://www.wikidata.org/entity/Q2095'),
 ('system', 'http://www.wikidata.org/entity/Q58778'),
 ('production', 'http://www.wikidata.org/entity/Q739302'),
 ('habitat', 'http://www.wikidata.org/entity/Q52105'),
 ('grassland', 'http://www.wikidata.org/entity/Q1006733'),
 ('agricultural', 'http://www.wikidata.org/entity/Q5356428'),
 ('level', 'http://www.wikidata.org/entity/Q1046315'),
 ('value', 'http://www.wikidata.org/entity/Q194112'),
 ('change', 'http://www.wikidata.org/entity/Q1150070'),
 ('land', 'http://www.wikidata.org/entity/Q11081619'),
 ('climate', 'http://www.wikidata.org/entity/Q7937'),
 ('development', 'http://www.wikidata.org/entity/Q1074523'),
 ('bird', 'http://www.wikidata.org/entity/Q5113'),
 ('type', 'http://www.wikidata.org/entity/Q190087'),
 ('security', 'http://www.wikidata.org/entity/Q2526135')]

In [69]:
len(lda_pipe.named_steps['model'].components_[0])

28548

In [71]:
a = lda_pipe.named_steps['vectorizer']

In [74]:
a.

TypeError: 'bool' object is not subscriptable