In [1]:
from os import path

from gensim.models import Word2Vec
from nltk.tokenize import sent_tokenize, word_tokenize
from pandas import DataFrame, read_csv
from scipy.spatial.distance import cosine
from scipy.stats import spearmanr

Using Theano backend.


In [4]:
class Corpus:
    """Wraps a raw text string and serves it out in batches.

    Two batching modes are offered by :meth:`batches`:

    * raw mode (default): consecutive fixed-width character slices of the
      text;
    * dictionary mode: lists of ``{word: context_words}`` mappings, one
      mapping per token, built sentence by sentence with the NLTK tokenizers.
    """

    def __init__(self, corpus):
        """Store the text to be batched.

        Args:
            corpus: The full text as a single string.
        """
        self.corpus = corpus

    def batches(self, dictionary=False, batch_size=20):
        """Lazily yield the corpus in batches.

        Args:
            dictionary: When False, yield character slices of the raw text.
                When True, yield lists of ``{word: neighbours}`` mappings.
            batch_size: Number of characters (raw mode) or of mappings
                (dictionary mode) per batch; the last batch may be shorter.

        Yields:
            ``str`` slices in raw mode, ``list`` of ``dict`` in dictionary mode.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1. Because this is a
                generator, the error surfaces on the first iteration.
        """
        if batch_size < 1:
            raise ValueError('batch_size must be at least 1, got {!r}'.format(batch_size))

        raw_text = self.corpus
        if dictionary:
            # Forward batch_size so the caller's value is honoured here too.
            for batch in self._split_sentences(raw_text, batch_size=batch_size):
                yield batch
        else:
            # Index-based slicing avoids re-copying the remaining text on
            # every step.
            for start in range(0, len(raw_text), batch_size):
                yield raw_text[start:start + batch_size]

    def _split_sentences(self, text, batch_size=20):
        """Group the per-word context mappings of ``text`` into batches.

        Args:
            text: Raw text; it is split into sentences with ``sent_tokenize``.
            batch_size: Maximum number of mappings per yielded list.

        Yields:
            Lists holding at most ``batch_size`` ``{word: neighbours}``
            mappings. An empty trailing list is never yielded.
        """
        batch = []
        for sentence in sent_tokenize(text):
            for entry in self._find_neighbors(sentence):
                batch.append(entry)
                # ">=" keeps every batch at exactly batch_size entries.
                if len(batch) >= batch_size:
                    yield batch
                    batch = []
        if batch:
            yield batch

    def _find_neighbors(self, text_batch, window=3):
        """Yield the context words around each token of ``text_batch``.

        Args:
            text_batch: Text to tokenize with ``word_tokenize``.
            window: Maximum number of neighbours taken on EACH side of a token.

        Yields:
            One single-key dict per token, mapping the token to the list of its
            left neighbours followed by its right neighbours.
        """
        words = word_tokenize(text_batch)
        for word_id, word in enumerate(words):
            # Clamp at 0: a negative start would wrap around to the list's end.
            lower_bound = max(0, word_id - window)
            # The "+ 1" skips the token itself so the right side also spans
            # `window` words, mirroring the left side.
            upper_bound = word_id + 1 + window
            yield {word: words[lower_bound:word_id] + words[word_id + 1:upper_bound]}

In [5]:
class Embeddings():
    """Small wrapper that tokenizes a text and trains a word-embedding model on it.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, embeddings='Word2Vec'):
        """Create the underlying model.

        Args:
            embeddings: Name of the embedding model; only ``'Word2Vec'`` is
                supported.

        Raises:
            ValueError: If ``embeddings`` names an unsupported model.
        """
        # Tokenized sentences (list of token lists); filled by fit_corpus().
        self.batches = []
        self.model = self.init_model(embeddings)

    def init_model(self, embeddings):
        """Return a fresh, untrained model for the given name.

        Raises:
            ValueError: For any name other than ``'Word2Vec'``. Failing here is
                clearer than returning None and crashing later in ``train()``.
        """
        if embeddings == 'Word2Vec':
            return Word2Vec()
        raise ValueError('Unsupported embeddings type: {!r}'.format(embeddings))

    def fit_corpus(self, text):
        """Tokenize ``text`` into sentences of word tokens and keep them for training.

        Corpus.batches(dictionary=False) yields fixed-width character slices,
        so splitting those on spaces cuts words at every slice boundary. The
        text is therefore tokenized directly with the NLTK sentence and word
        tokenizers, which gives one token list per sentence.

        Args:
            text: Raw training text.
        """
        tokenized = (word_tokenize(sentence) for sentence in sent_tokenize(text))
        self.batches = [tokens for tokens in tokenized if tokens]

    def _build_vocab(self, batches):
        """Build the model vocabulary from an iterable of token lists."""
        self.model.build_vocab(batches)

    def _train_model(self, batches, epochs=3):
        """Train the model on ``batches`` (a list of token lists) for ``epochs`` passes."""
        self.model.train(batches, total_examples=len(batches), epochs=epochs)

    def train(self):
        """Build the vocabulary and train on the sentences stored by ``fit_corpus``.

        Raises:
            ValueError: If no sentences are available, i.e. ``fit_corpus`` was
                not called or the text was empty.
        """
        if not self.batches:
            raise ValueError('No training data: call fit_corpus() with a non-empty text first')
        self._build_vocab(self.batches)
        self._train_model(self.batches)

    def vector(self, word):
        """Return the embedding vector of ``word``.

        The lookup goes through ``model.wv``; indexing the model object itself
        is deprecated in gensim and removed in later releases.

        Raises:
            KeyError: If ``word`` is not in the vocabulary.
        """
        return self.model.wv[word]

    def online_training(self, epochs=3):
        """Incrementally update the model, one stored sentence at a time.

        Each sentence extends the vocabulary (``update=True``) and is then
        trained on. gensim needs an existing vocabulary for ``update=True``,
        so ``train()`` should have been run beforehand.

        Args:
            epochs: Training passes per sentence.
        """
        for batch in self.batches:
            # gensim expects an iterable of sentences; a bare token list would
            # be read as a list of one-word "sentences".
            sentences = [batch]
            self.model.build_vocab(sentences, update=True)
            self.model.train(sentences, total_examples=len(sentences), epochs=epochs)

In [6]:
class Evaluator():
    def __init__(self, test='word_similarity', datasets=['wordsim353-rel'], metric='spearman'):
        self.datasets_dir = 'datasets'
        
        self.test = test
        self.datasets = [DataFrame.from_csv(path.join(self.datasets_dir, test, '{}.csv'.format(dataset))).dropna() for dataset in datasets]
        self.metric = metric
        
    def evaluate(self, model):
        similarities = {'model':[], 'human':[]}
        for dataset in self.datasets:
            for index, row in dataset.iterrows():
                try:
                    similarities['model'].append(1 - cosine(model.vector(row['word1']), model.vector(row['word2'])))
                    similarities['human'].append(row['similarity'])
                except KeyError:
                    pass
        if self.metric == 'spearman':
            return spearmanr(similarities['model'], similarities['human'])

In [7]:
class Pipeline():
    """End-to-end helper: train embeddings on a text, then evaluate them."""

    def __init__(self, text, model='Word2Vec'):
        """Tokenize ``text`` and train the chosen embedding model on it.

        Args:
            text: Raw training text.
            model: Embedding model name handed to ``Embeddings``.
        """
        self.model = Embeddings(model)
        self.model.fit_corpus(text)
        self.model.train()

    def evaluate(self, test='word-similarity', datasets=['wordsim353-rel']):
        """Evaluate the trained embeddings with the Spearman metric.

        Args:
            test: Evaluation type, handed to ``Evaluator`` as ``test``.
            datasets: Dataset names, handed to ``Evaluator`` as ``datasets``.

        Returns:
            Whatever ``Evaluator.evaluate`` returns for the trained model.
        """
        # Forward the caller's arguments rather than fixed literals, so the
        # parameters of this method actually take effect.
        evaluator = Evaluator(test=test, datasets=datasets, metric='spearman')
        return evaluator.evaluate(self.model)

In [2]:
# Read the whole training corpus into memory as one string.
# NOTE(review): no encoding is given, so the platform default is used -- confirm
# the file's encoding if this notebook is run on another machine.
with open(path.join('data', 'pidgeons.txt')) as corpus_file:
    text = corpus_file.read()

In [8]:
# Train Word2Vec on the loaded text and score it on WordSim-353 (relatedness).
# 'word-similarity' (hyphenated) is the directory name Pipeline.evaluate
# defaults to; the underscore spelling used previously does not match it.
p = Pipeline(text=text, model='Word2Vec')
p.evaluate(test='word-similarity', datasets=['wordsim353-rel'])

SpearmanrResult(correlation=nan, pvalue=nan)