In [21]:
# read the full db structure here and see which kinds of results are available

from linkers.baseline import BaselineLinker
from collections import namedtuple, defaultdict
from diffbot_api import EL_POL_ENTITY_TYPES
import json
from candidate import Candidate
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from langid import classify
from candidate import Phrase
import re


# CandidateFeatures = namedtuple("CandidateFeatures", ["names", "description","isPartOf", "partOf"])

def make_dummy_phrases(str_phrases):
    """Wrap plain strings into ``Phrase`` objects with placeholder metadata.

    Every string is stripped of surrounding whitespace. The resulting phrase
    gets the constant second argument 1, the length of the stripped text as
    third argument and ``"http://" + text`` as fourth argument.

    Args:
        str_phrases: iterable of strings.

    Returns:
        A list with one ``Phrase`` per input string, in input order.
    """
    dummy_phrases = []
    for raw_text in str_phrases:
        text = raw_text.strip()
        dummy_phrases.append(Phrase(text, 1, len(text), "http://" + text))
    return dummy_phrases


    
class ContextAwareLinker(BaselineLinker):
    """Linker that attaches a textual context to every candidate entity.

    For each phrase the knowledge base is queried once per entity type in
    ``EL_POL_ENTITY_TYPES``. Every hit becomes a ``Candidate`` carrying the
    texts collected from the db record and, once the stubs below are
    implemented, from Wikipedia and from the URIs of the record.

    Relies on ``self._cq``, ``self._get_uris`` and ``self._get_wikipedia_uri``,
    which are not defined in this class (presumably provided by
    ``BaselineLinker`` -- confirm).
    """

    def __init__(self):
        super().__init__()
        # Finds any run of the letters a-z (case-insensitive) in a string.
        self._re_contains_alpha = re.compile(r"[a-z]+", re.U | re.I)

    @staticmethod
    def get_db_entry(diffbot_uri):
        """Return the JSON record of a Diffbot entity (stub).

        Gets an entity URI like https://www.diffbot.com/entity/AcZTRPXDrY9 and
        is meant to return the json served at
        https://www.diffbot.com/entity/AcZTRPXDrY9.json

        Declared static because it takes no ``self``: it can be called both
        on the class and on an instance.

        Args:
            diffbot_uri: URI of the entity (currently unused).

        Returns:
            An empty dict until the lookup is implemented.
        """
        return {}

    def _is_english(self, text):
        """Return True if ``langid`` classifies *text* as English."""
        lang, _confidence = classify(text)
        return lang == "en"

    def _is_alpha(self, text):
        """Return True if *text* contains at least one letter a-z (any case)."""
        return self._re_contains_alpha.search(text) is not None

    def _get_record_texts(self, hit):
        """Concatenate the textual fields of a db record into one string.

        The fields are used in this order, each only if present: ``name``,
        the ``name`` of every ``isPartOf`` entry, every entry of ``allNames``
        that contains a letter and is classified as English, ``description``.

        Args:
            hit: one record (mapping) from the ``data`` list of a db response.

        Returns:
            The collected texts joined with ``" . "``; an empty string when
            none of the fields is present.
        """
        texts = []

        if "name" in hit:
            texts.append(hit["name"])

        texts.extend(parent["name"]
                     for parent in hit.get("isPartOf", ())
                     if "name" in parent)

        # Cheap regex test first: language identification is the costly part.
        texts.extend(name
                     for name in hit.get("allNames", ())
                     if self._is_alpha(name) and self._is_english(name))

        if "description" in hit:
            texts.append(hit["description"])

        return " . ".join(texts)

    def _get_wiki_texts(self, wiki_uri):
        """Return the Wikipedia text for *wiki_uri* (stub, returns "")."""
        # access from a cached (?) wikipedia dump
        return ""

    def _get_uri_texts(self, uris):
        """Return the texts found behind *uris* (stub, returns "")."""
        # access the uris
        return ""

    def get_candidates(self, phrases):
        """Query the db for every phrase and build its candidates.

        Every phrase is queried once per entity type in
        ``EL_POL_ENTITY_TYPES``; responses without a ``data`` field are
        skipped. The total number of candidates is printed at the end.

        Args:
            phrases: iterable of phrase objects exposing a ``text`` attribute;
                they are used as dictionary keys, so they must be hashable.

        Returns:
            A ``defaultdict(list)`` mapping each phrase that produced at least
            one hit to the list of its ``Candidate`` objects.

        Raises:
            KeyError: if a hit has no ``importance`` or ``types`` field.
        """
        candidates = defaultdict(list)
        num_candidates = 0

        for phrase in phrases:
            for entity_type in EL_POL_ENTITY_TYPES:
                r = self._cq.make_query('type:{} name:"{}"'.format(entity_type, phrase.text))
                db_response = json.loads(r.content)

                if "data" not in db_response:
                    continue

                for hit in db_response["data"]:
                    uris = self._get_uris(hit)
                    wiki_uri = self._get_wikipedia_uri(hit, uris)

                    texts = "\n\n".join([
                        self._get_record_texts(hit),
                        self._get_wiki_texts(wiki_uri),
                        self._get_uri_texts(uris),
                    ])

                    # "name" and "allNames" are treated as optional by
                    # _get_record_texts, so do not fail on records lacking them.
                    c = Candidate(float(hit["importance"]),
                                  hit.get("name", ""),
                                  "",
                                  wiki_uri,
                                  hit["types"],
                                  hit.get("allNames", []),
                                  uris,
                                  texts)
                    num_candidates += 1
                    candidates[phrase].append(c)

        print(">>>>>>>>", num_candidates)
        return candidates

       
class DenseLinker(ContextAwareLinker):
    """Context-aware linker intended to work on dense text vectors (stub)."""

    def foo(self):
        """Placeholder method; does nothing."""

    def _vectorize_texts(self, texts):
        """Turn *texts* into dense vectors -- not implemented, returns None.

        Planned approach:
          * encode all these using sentence embeddings as well and/or
            average word2vec
          * load a gensim model
          * tokenize the words
          * average the words
        """
        return None

    
class SparseLiker(BaselineLinker):
    """Linker intended to work on sparse bag-of-words vectors (work in progress).

    NOTE(review): the class name looks like a typo of "SparseLinker"; it is
    left unchanged because other code may refer to it.
    """

    def foo(self):
        """Placeholder method; does nothing."""

    def _vectorize_texts_sparse(self, hit):
        """Build a token-count matrix for the given text(s).

        Args:
            hit: a single text or an iterable of texts.
                NOTE(review): the parameter name suggests a db record; if
                records are what callers have, their texts must be extracted
                first -- confirm the intended input.

        Returns:
            The document-term matrix produced by
            ``CountVectorizer().fit_transform``, one row per text.

        Raises:
            ValueError: raised by scikit-learn when the texts yield an empty
                vocabulary (e.g. no texts at all).
        """
        # A bare string would otherwise be iterated character by character.
        texts = [hit] if isinstance(hit, str) else list(hit)
        count_vect = CountVectorizer()
        return count_vect.fit_transform(texts)
    
    
# Smoke test: collect the candidates for a single dummy phrase; `c` maps the
# phrase to its list of candidates and is inspected in the following cells.
sl = ContextAwareLinker()
c = sl.get_candidates(make_dummy_phrases(["San Francisco"]))

>>>>>>>> 350


In [26]:
# Show how many candidates were collected for each phrase.
for phrase, phrase_candidates in c.items():
    print(phrase.text, len(phrase_candidates))

San Francisco 350


In [24]:
for 

dict_keys([Phrase(text='San Francisco', beg=1, end=13, subj='http://San Francisco')])

In [None]:
from linkers.baseline import BaselineLinker
from candidate import Phrase

# Link the comma-separated target phrases of one hand-written context with the
# baseline linker and print the candidate chosen for each phrase.
context = "San Francisco said the visit would serve as a cornerstone for future interaction between players and coaches from the Nets and young Russians, with the aim of developing basketball in Russia, where the sport is a distant third in popularity behind soccer and hockey."
phrases = "San Francisco"

# Each target becomes a Phrase with placeholder offsets and a dummy "http://" subject.
phrases = [
    Phrase(name, 1, len(name), "http://" + name)
    for name in (raw.strip() for raw in phrases.split(","))
]
bl = BaselineLinker()

for phrase, candidate in bl.link(context, phrases):
    print(phrase.text, candidate)

In [None]:
from linkers.baseline import BaselineLinker
from candidate import Phrase
from pandas import read_csv 

# Run the baseline linker over every row of the tab-separated dataset and
# print, per context, each target phrase with the link of its candidate.
dataset_fpath = "datasets/dbpedia.tsv"

df = read_csv(dataset_fpath, sep="\t", encoding="utf-8")
bl = BaselineLinker()

for i, row in df.iterrows():
    # `targets` holds the comma-separated phrases to link within `context`.
    names = [raw.strip() for raw in row.targets.split(",")]
    phrases = [Phrase(name, 1, len(name), "http://" + name) for name in names]

    print("\n\n{}\n".format(row.context))

    for phrase, candidate in bl.link(row.context, phrases):
        # A falsy candidate means nothing was linked: print an empty link.
        if candidate:
            link = candidate.link
        else:
            link = ""
        print(phrase.text, link)

In [None]:
%load_ext autoreload
%autoreload 2