In [16]:
from linkers.context_aware import ContextAwareLinker 
from collections import defaultdict
from candidate import Candidate
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from candidate import Phrase, make_phrases
import re
from pandas import read_csv 
from time import time
from os.path import join
from utils import ensure_dir
from sklearn.externals import joblib
import json
from os.path import exists


class DenseLinker(ContextAwareLinker):
    """Placeholder for a linker based on dense text representations.

    Nothing is implemented yet: both methods are stubs returning None.
    """

    def foo(self):
        """Do nothing (placeholder)."""
        return None

    def _vectorize_texts(self, texts):
        """Stub for turning ``texts`` into dense vectors; returns None for now.

        Planned steps, none of which is implemented yet:
          * encode the texts using sentence embeddings and/or averaged
            word2vec vectors;
          * load a gensim model;
          * tokenize the words;
          * average the word vectors.
        """
        return None
 

# TODO: also save the phrase2index file directly for faster classification

class SparseLiker(ContextAwareLinker):
    """Linker that represents candidate texts as sparse bag-of-words vectors.

    A model is a set of artifacts stored in ``model_dir``: the fitted
    vectorizer, the candidate -> row index mapping, the candidate feature
    matrix, the phrase -> candidates mapping and a JSON file with parameters.
    Artifacts that already exist on disk are loaded on construction;
    ``train`` (re)creates the vectorizer, the index mapping, the feature
    matrix and the parameters file.
    """

    # Pickled candidates used for training when no other path is given.
    DEFAULT_CANDIDATES_FPATH = "datasets/103227-candidate-texts.pkl"

    def __init__(self, model_dir, tfidf=True, description=""):
        """Set up the artifact paths and load whatever is already stored.

        Args:
            model_dir: directory holding the model artifacts.
            tfidf: use ``TfidfVectorizer`` if true, ``CountVectorizer``
                otherwise.
            description: free-text description stored with the parameters.

        Note:
            If ``model_dir`` already contains a parameters file, the stored
            parameters replace ``tfidf`` and ``description`` given here.
        """
        ContextAwareLinker.__init__(self)
        self._params = {"tfidf": tfidf, "description": description}

        self._vectorizer_fpath = join(model_dir, "vectorizer.pkl")
        self._candidate2index_fpath = join(model_dir, "candidate2index.pkl")
        self._params_fpath = join(model_dir, "params.json")
        self._vectors_fpath = join(model_dir, "vectors.pkl")
        self._phrase2candidates_fpath = join(model_dir, "phrase2candidate.pkl")

        self._load(model_dir)  # using the paths defined above

    def _load(self, model_dir):
        """Load every artifact that exists on disk; missing ones are skipped.

        Attributes whose file is missing are left unset by this method.
        """
        tic = time()
        ensure_dir(model_dir)

        if exists(self._params_fpath):
            with open(self._params_fpath, "r") as fp:
                # Stored parameters take over the constructor arguments.
                self._params = json.load(fp)
            print("Parameters:\n- ", "\n- ".join(
                "{}: {}".format(name, value)
                for name, value in self._params.items()))

        if exists(self._phrase2candidates_fpath):
            self._phrase2candidates = joblib.load(self._phrase2candidates_fpath)

        if exists(self._candidate2index_fpath):
            self._candidate2index = joblib.load(self._candidate2index_fpath)

        if exists(self._vectorizer_fpath):
            self._vectorizer = joblib.load(self._vectorizer_fpath)

        if exists(self._vectors_fpath):
            self._vectors = joblib.load(self._vectors_fpath)

        print("Loaded in {:.2f} sec.".format(time() - tic))

    def train(self, dataset_fpaths, candidates_fpath=None):
        """Train the model and save its artifacts.

        Args:
            dataset_fpaths: paths of tab-separated datasets with a
                ``targets`` column of comma-separated phrases.
            candidates_fpath: path of the pickled candidates; defaults to
                ``DEFAULT_CANDIDATES_FPATH``.
        """
        tic = time()
        print("Training...")
        phrases = self._dataset2phrases(dataset_fpaths)
        self._train(phrases, candidates_fpath)
        print("Training is done in {:.2f} sec.".format(time() - tic))

    def _train(self, phrases, candidates_fpath=None):
        """Fit the vectorizer on the candidate texts and save all artifacts.

        Args:
            phrases: phrases of the training datasets; only their number is
                recorded at the moment.
            candidates_fpath: path of the pickled candidates; defaults to
                ``DEFAULT_CANDIDATES_FPATH``.
        """
        import pickle

        if candidates_fpath is None:
            candidates_fpath = self.DEFAULT_CANDIDATES_FPATH

        self._params["num_phrases"] = len(phrases)
        print("Number of phrases:", len(phrases))

        # TODO: retrieve the candidates with self.get_candidates(phrases) and
        # save the resulting phrase -> candidates mapping. Until then the
        # candidates come from a pre-computed pickle and the mapping file is
        # not written by training.
        # NOTE(security): pickle can execute arbitrary code while loading;
        # only point this at trusted files.
        with open(candidates_fpath, "rb") as candidates_file:
            candidates = pickle.load(candidates_file)
        self._params["num_candidates"] = len(candidates)
        print("Number of candidates:", len(candidates))

        # Row i of the feature matrix corresponds to the i-th candidate.
        corpus = [candidate.text for candidate in candidates]
        self._candidate2index = {
            candidate: index for index, candidate in enumerate(candidates)}

        joblib.dump(self._candidate2index, self._candidate2index_fpath)
        print("Saved candidate2index:", self._candidate2index_fpath)

        self._vectorizer = TfidfVectorizer() if self._params["tfidf"] else CountVectorizer()
        self._vectors = self._vectorizer.fit_transform(corpus)

        joblib.dump(self._vectorizer, self._vectorizer_fpath)
        print("Saved vectorizer:", self._vectorizer_fpath)

        joblib.dump(self._vectors, self._vectors_fpath)
        self._params["shape"] = self._vectors.shape
        print("Saved {} candidate feature matrix: {}".format(self._vectors.shape, self._vectors_fpath))

        with open(self._params_fpath, "w") as fp:
            json.dump(self._params, fp)
        print("Saved params:", self._params_fpath)

    # TODO: move to an intermediate TrainableLinker class?
    def _dataset2phrases(self, dataset_fpaths):
        """Collect the unique target phrases of the given datasets.

        Each dataset is a tab-separated file whose ``targets`` column holds
        comma-separated phrases.

        Returns:
            The result of ``make_phrases`` applied to the sorted unique
            targets; sorting keeps the order reproducible between runs.
        """
        voc = set()
        for dataset_fpath in dataset_fpaths:
            df = read_csv(dataset_fpath, sep="\t", encoding="utf-8")
            # Only one column is needed, so iterate it directly instead of
            # materializing every row with iterrows().
            for targets in df["targets"]:
                voc.update(target.strip() for target in str(targets).split(","))

        return make_phrases(sorted(voc))

    def link(self, context, phrases):
        """Not implemented yet; returns None."""
        # TODO: vectorize the context and match it against the candidates.
        pass
    
# Tab-separated datasets that can be passed to SparseLiker.train().
dataset_fpaths = [
    "datasets/dbpedia.tsv",
    "datasets/kore50.tsv",
    "datasets/n3-reuters-128.tsv",
]

# Instantiating the linker loads the artifacts already stored in the model
# directory; training is kept disabled below.
sl = SparseLiker(model_dir="data/test2")
# candidates = sl.train(dataset_fpaths)

Parameters:
-  tfidf: True
- description: 
- num_phrases: 997
- num_candidates: 103227
- shape: [103227, 218026]
Loaded in 7.33 sec.


In [None]:
import codecs
from candidate import make_dummy_phrases 


output_fpath = "data/sf-candidates.txt"
re_newlines = re.compile(r"[\n\r]+")

# Dump one tab-separated record per (phrase, candidate) pair.
# NOTE: `c` is not defined in this cell; it must already exist in the
# notebook session and is used here as a mapping phrase -> candidates.
with codecs.open(output_fpath, "w", "utf-8") as c_f:
    for phrase in c:
        for candidate in c[phrase]:
            # Collapse line breaks inside the text, otherwise a single
            # record would spill over several lines of the output file.
            text = re_newlines.sub(" ", candidate.text).strip()
            c_f.write("{}\t{}\t{}\n".format(
                phrase.text,
                candidate.name,
                text))

print(output_fpath)

In [None]:
# from utils import ensure_dir 
from os.path import join 
import json 
import os


def ensure_dir(f):
    """Create the directory ``f``, including missing parents, if needed.

    Args:
        f: path of the directory.

    Returns:
        True if the directory was created by this call, False if the path
        already existed.
    """
    try:
        # Create first and handle the "already there" case afterwards: a
        # separate existence check could race with another process creating
        # the same directory in between.
        os.makedirs(f)
    except FileExistsError:
        return False
    return True

        
# Materialize the items of `c` (defined earlier in the notebook session).
candidates = [*c]
# model_dir = "datasets/103227/"


In [None]:
c?

In [None]:
import codecs

output_fpath = "data/997-phrases.txt"
line_template = "{}\t{}\n"

# Write one "name<TAB>text" line per item of `c`, which must already be
# defined in the notebook session.
with codecs.open(output_fpath, "w", "utf-8") as out:
    for candidate in c:
        record = line_template.format(candidate.name, candidate.text)
        out.write(record)

print(output_fpath)

In [None]:
from linkers.baseline import BaselineLinker
from candidate import Phrase

context = "San Francisco said the visit would serve as a cornerstone for future interaction between players and coaches from the Nets and young Russians, with the aim of developing basketball in Russia, where the sport is a distant third in popularity behind soccer and hockey."
phrases = "San Francisco"

# Turn the comma-separated phrase string into Phrase objects; every phrase
# text is stripped once and reused for all constructor arguments.
phrases = [
    Phrase(name, 1, len(name), "http://" + name)
    for name in (raw.strip() for raw in phrases.split(","))
]
bl = BaselineLinker()

# Print every phrase next to the candidate the baseline linker returns for it.
for phrase, candidate in bl.link(context, phrases):
    print(phrase.text, candidate)

In [None]:
from linkers.baseline import BaselineLinker
from candidate import Phrase
from pandas import read_csv 

dataset_fpath = "datasets/dbpedia.tsv"

df = read_csv(dataset_fpath, sep="\t", encoding="utf-8")
bl = BaselineLinker()

# Link the target phrases of every dataset row within the row's context and
# print the resulting links (an empty string when no candidate was found).
for i, row in df.iterrows():
    phrases = [
        Phrase(name, 1, len(name), "http://" + name)
        for name in (raw.strip() for raw in row.targets.split(","))
    ]

    print("\n\n{}\n".format(row.context))

    for phrase, candidate in bl.link(row.context, phrases):
        if candidate:
            link = candidate.link
        else:
            link = ""
        print(phrase.text, link)

In [None]:
import tqmd

In [None]:
%load_ext autoreload
%autoreload 2