In [23]:
# read the full db structure here and see which kinds of results are available

from linkers.baseline import BaselineLinker
from collections import namedtuple, defaultdict
from diffbot_api import EL_POL_ENTITY_TYPES
import json
from candidate import Candidate
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from langid import classify
from candidate import Phrase

# CandidateFeatures = namedtuple("CandidateFeatures", ["names", "description","isPartOf", "partOf"])

def make_dummy_phrases(str_phrases):
    return [Phrase(phrase.strip(), 1, len(phrase.strip()), "http://" + phrase.strip())
                   for phrase in str_phrases]


class SparseLiker(BaselineLinker):
    
    # move to base
    def _get_uris(self, hit):
        uris = set()
        
        if "allUris" in hit: uris.union( set(hit["allUris"]) )
        if "origins" in hit: uris.union( set(hit["origins"]) )
        if "origin" in hit: uris.add( hit["origin"] )
        
        return uris

   # move to base
    def _get_wikipedia_uri(self, hit, uris):
        wiki_uri = ""
        
        if "wikipediaUri" in hit:
            wiki_uri = hit["wikipediaUri"]
            uris.add(wiki_uri)
        else:
            # try to find via wikidata link    
            for uri in uris:
                wiki_uri = self._conv.wikidata2wikipedia(uri)
                if wiki_uri != "":
                    print("Founda a wiki uri:", wiki_uri)
                    break
            
        return wiki_uri
    

    def get_db_entry(diffbot_uri):
        """ Gets an entity like https://www.diffbot.com/entity/AcZTRPXDrY9 and 
        returns a json by https://www.diffbot.com/entity/AcZTRPXDrY9.json """

        return {}


    def _vectorize_texts_sparse(self, hit):
        count_vect = CountVectorizer()
        X_train_counts = count_vect.fit_transform(twenty_train.data)

    # move to the dense class
    def _vectorize_texts_dense(self, hit):
        # encode all these using sentence embeddings as well and/or average word2vec

        # load a gensim model

        # tokenize the words
        # average the words

        return 

    # move to baes text aware class
    def _is_english(self, text):
        lang, conf = classify(text)
        return lang == "en"

    def _is_alpha(self, text):
        # regexp checking it is not a number ...
        return True
    
    # move to baes text aware class
    def _get_texts(self, hit):
        texts = []

        if "name" in hit:
            texts.append(hit["name"])

        if "description" in hit:
            texts.append(hit["description"])

        if "isPartOf" in hit:
            for is_part_of in hit["isPartOf"]:
                if "name" in is_part_of:
                    texts.append(is_part_of["name"])
                    
        if "allNames" in hit:
            for name in hit["allNames"]:
                if self._alpha(name) and self._is_english(name):
                    print(">>>>>>",name)
                    
                    texts.append(name)

        return " . ".join(texts)

    
    def get_data(self, phrases):
        
        candidates = defaultdict(list)  

        for phrase in phrases:
            for entity_type in EL_POL_ENTITY_TYPES:
                r = self._cq.make_query('type:{} name:"{}"'.format(entity_type, phrase.text))
                db_response = json.loads(r.content)
            
                if "data" not in db_response: continue
                else: data = db_response["data"]

                for hit in data:
                    uris = self._get_uris(hit)
                    wiki_uri = self._get_wikipedia_uri(hit, uris)  
                    
                    texts = self._get_texts(hit)
                    texts_wiki = "" # get texts from the wiki 
                    texts_uris = "" # use some external tool to extract text by uri
                    
                    representation = "\n\n".join([texts, texts_wiki, texts_uris])
                    
                    print("\n\n","="*50)
                    print(hit["name"], "\n", "-"*50, "\n", representation)
                    
                    break
#                     score = 0.0
#                     importance = float(hit["importance"])
#                     c = Candidate(score,
#                                   name,
#                                   dbpedia_uri,
#                                   wiki_uri,
#                                   hit["types"],
#                                   hit["allNames"],
#                                   uris)
                    
#                     candidates[phrase].append(c)
                    #candidates[phrase].append(c, representation)
                    
        return hit
    
sl = SparseLiker()

h = sl.get_data(make_dummy_phrases(["San Francisco"]))



Founda a wiki uri: https://en.wikipedia.org/wiki/Albuquerque,_New_Mexico
>>>>>> Albuquerque, Meksiko Nowydh
>>>>>> Vokekyi Leuwi
>>>>>> Alburquerque
>>>>>> ABQ
>>>>>> 87101–87199
>>>>>> 87109
>>>>>> 87108
>>>>>> 87106
>>>>>> 87131
>>>>>> 87187
>>>>>> 87194
>>>>>> 87103
>>>>>> 87192
>>>>>> 87158
>>>>>> 87104
>>>>>> 87124
>>>>>> 87114
>>>>>> 87112
>>>>>> 87110
>>>>>> 87122
>>>>>> 87151
>>>>>> 87193
>>>>>> 87181
>>>>>> 87105
>>>>>> 87115
>>>>>> 87190
>>>>>> 87121
>>>>>> 87191
>>>>>> 87119
>>>>>> 87111
>>>>>> 87196


Albuquerque 
 -------------------------------------------------- 
 Albuquerque . Albuquerque (in Navajo Beeʼeldííl Dahsinil, , Arawageeki in Keres; Vakêêke in Jemez Towa; Gołgéeki in Jicarilla Apache) is the most populous city in the U.S. state of New Mexico. The high-elevation city serves as the county seat of Bernalillo County, and it is situated in the north central part of the state, straddling the Rio Grande. The city population is 559,277 as of the July 1, 2016 populatio

In [None]:
from linkers.baseline import BaselineLinker
from candidate import Phrase

context = "San Francisco said the visit would serve as a cornerstone for future interaction between players and coaches from the Nets and young Russians, with the aim of developing basketball in Russia, where the sport is a distant third in popularity behind soccer and hockey."
phrases = "San Francisco"

phrases =  [Phrase(phrase.strip(), 1, len(phrase.strip()), "http://" + phrase.strip())
                   for phrase in phrases.split(",")]
bl = BaselineLinker()

for phrase, candidate in bl.link(context, phrases):
    print(phrase.text, candidate)

In [None]:
from linkers.baseline import BaselineLinker
from candidate import Phrase
from pandas import read_csv 

dataset_fpath = "datasets/dbpedia.tsv"

df = read_csv(dataset_fpath, sep="\t", encoding="utf-8")
bl = BaselineLinker()

for i, row in df.iterrows():
    phrases =  [Phrase(phrase.strip(), 1, len(phrase.strip()), "http://" + phrase.strip())
                       for phrase in row.targets.split(",")]
        
    print("\n\n{}\n".format(row.context))
    
    for phrase, candidate in bl.link(row.context, phrases):
        link = candidate.link if candidate else ""
        print(phrase.text, link)

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from collections import Counter

Counter