In [None]:
from whoosh.analysis import StandardAnalyzer
import torch
from torch.autograd import Variable
from torch import optim
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import logging
import time
import numpy
import funcy
from sklearn.metrics.pairwise import cosine_similarity
from operator import itemgetter

# Install the INFO-level root configuration, then fetch the logger that every
# cell of this notebook writes to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('ULL')

In [None]:
class Corpus:
    """Holds the raw text of a corpus file in ``self._content``."""

    def __init__(self, file):
        """Read ``file`` eagerly; a falsy ``file`` leaves the content as None."""
        self._content = self._read(file=file)

    def _read(self, file):
        """Return the full text of ``file``, or None when no path is given."""
        if not file:
            return None

        logger.info('Reading File {0}'.format(file))
        with open(file, 'r') as handle:
            text = handle.read()
        return text

In [None]:
class TrainCorpus(Corpus):
    """Tokenised training corpus with a word -> integer-id vocabulary.

    The text read by ``Corpus`` is split on newlines, every line is tokenised
    with whoosh's ``StandardAnalyzer`` and each distinct token receives an id
    in order of first appearance. Indexing the corpus with a word
    (``corpus[word]``) returns that id and raises ``KeyError`` for unknown
    words.
    """

    def __init__(self, file):
        """Read ``file`` and build the sentences and the vocabulary."""
        Corpus.__init__(self, file)

        self._preprocessor = StandardAnalyzer()
        self._sentences = self._content2sentences()
        self._vocabulary = self._get_vocabulary()
        # Both are filled in by get_contexts() and stay None until it runs.
        self._n_context_words = None
        self._window_size = None

    def __repr__(self):
        return 'Corpus, ' + str(len(self._vocabulary)) + ' Tokens, ' + str(len(self._sentences)) + ' Sentences. '

    def __getitem__(self, item):
        """Return the vocabulary id of the word ``item``."""
        return self._vocabulary[item]

    @property
    def n_context_words(self):
        """Total number of context words collected by the last get_contexts() call."""
        return self._n_context_words

    @property
    def window_size(self):
        """One-sided window size used by the last get_contexts() call."""
        return self._window_size

    @property
    def sentences(self):
        """List of token lists, one per non-empty line of the corpus."""
        return self._sentences

    @property
    def vocabulary(self):
        """Dict mapping each token to its integer id."""
        return self._vocabulary

    def _content2sentences(self):
        """Tokenise the content line by line, dropping lines without tokens."""
        logger.info('Building Sentences')
        processed_sentences = []
        for sentence in self._content.split('\n'):
            tokens = [word.text for word in self._preprocessor(sentence)]
            if tokens:
                processed_sentences.append(tokens)
        return processed_sentences

    def _get_vocabulary(self):
        """Assign consecutive ids to tokens in order of first appearance."""
        logger.info('Building Vocabulary')
        vocabulary = {}
        for word in self._preprocessor(self._content):
            # setdefault keeps the id a token received the first time it was seen.
            vocabulary.setdefault(word.text, len(vocabulary))
        return vocabulary

    def get_contexts(self, window_size=2):
        """Collect the context words of every centre word that has a full window.

        A centre word is used only when ``window_size`` tokens exist on both
        sides of it, so every occurrence contributes exactly ``2 * window_size``
        context words.

        :param window_size: number of tokens taken on each side of the centre word.
        :return: dict mapping a centre word to the concatenated context words of
                 all its occurrences.

        Also records ``n_context_words`` and ``window_size`` on the corpus.
        """
        logger.info('Building Contexts, Window Size {0}'.format(window_size))
        contexts = {}

        n_context_words = 0
        shortest_usable = window_size * 2 + 1
        for sentence in self._sentences:
            # A sentence of exactly 2*window_size+1 tokens still has one valid
            # centre word; the previous strict '>' comparison discarded it.
            if len(sentence) < shortest_usable:
                continue
            for idx in range(window_size, len(sentence) - window_size):
                context = sentence[idx - window_size:idx] + sentence[idx + 1:idx + 1 + window_size]
                contexts.setdefault(sentence[idx], []).extend(context)
                n_context_words += len(context)

        self._n_context_words = n_context_words
        self._window_size = window_size

        return contexts

In [None]:
class TestCorpus(Corpus):
    """Candidate substitutes and gold-standard substitutes used for evaluation.

    ``candidates`` maps a target word to a list of candidate phrases (each a
    list of words). ``ground_truth`` maps a target word to a list with one
    entry per gold line; each entry is a list of ``(phrase, weight)`` tuples
    where ``phrase`` is a list of words and ``weight`` is kept as a string.
    """

    def __init__(self, candidate_file, truth_file):
        """Load both evaluation files; no raw text content is read."""
        Corpus.__init__(self, file=None)

        self._preprocessor = StandardAnalyzer()
        self.candidates = self._load_candidates(candidate_file)
        self.ground_truth = self._load_truth(truth_file)

    def _load_candidates(self, file):
        """Parse lines of the form ``target.suffix::phrase;phrase;...``.

        Lines without a ``::`` separator (for example blank lines) are skipped,
        and so are empty phrases such as the one produced by a trailing ``;``.
        """
        tar_c = {}
        with open(file, 'r') as f:
            for raw_line in f:
                fields = raw_line.strip().split('::')
                if len(fields) < 2:
                    continue
                target = fields[0].split('.')[0]
                phrases = [chunk.split() for chunk in fields[1].split(';')]
                tar_c[target] = [phrase for phrase in phrases if phrase]
        return tar_c

    def _load_truth(self, file):
        """Parse lines of the form ``target.suffix ... :: phrase weight;phrase weight;...``.

        The last whitespace-separated token of every entry is taken as its
        weight, the tokens before it as the phrase. Lines without a ``::``
        separator or without a target, and entries without any token, are
        skipped.
        """
        tr_c = {}
        with open(file, 'r') as f:
            for raw_line in f:
                fields = raw_line.strip().split('::')
                head = fields[0].split()
                if len(fields) < 2 or not head:
                    continue
                target = head[0].split('.')[0]

                truth = []
                for entry in fields[1].strip().split(';'):
                    parts = entry.split()
                    if not parts:
                        continue
                    truth.append((parts[:-1], parts[-1]))

                tr_c.setdefault(target, []).append(truth)
        return tr_c

In [None]:
class Featurizer:
    """Turns the contexts of a training corpus into tensors for the models."""

    # Values accepted by context_words2features(mode=...).
    _MODES = ('normal', 'bayes')

    def __init__(self, train_corpus, test_corpus):
        self._train_data = train_corpus
        self._test_data = test_corpus

    @property
    def train_data(self):
        """The training corpus passed to the constructor."""
        return self._train_data

    @property
    def test_data(self):
        """The test corpus passed to the constructor."""
        return self._test_data

    def vocabulary2one_hot(self):
        """Return a (V, V) float tensor with a 1 at ``[id, id]`` for every vocabulary id."""
        logger.info('Building OneHot Vectors')
        word_ids = list(self._train_data.vocabulary.values())
        size = len(word_ids)
        # Allocate directly instead of materialising V*V Python ints first.
        tensor = torch.zeros(size, size)
        if size:
            index = torch.LongTensor(word_ids)
            tensor[index, index] = 1
        return tensor

    def context_words2features(self, mode='normal'):
        """Build training tensors from the corpus contexts (default window size).

        :param mode: ``'normal'`` returns ``(train_data, labels)`` with one row
                     per (centre word, context word) pair: the one-hot vector of
                     the centre word and the id of the context word.
                     ``'bayes'`` returns ``(central_ids, context_ids, labels)``:
                     the centre word id, the ids of the full window the context
                     word belongs to, and the one-hot vector of the context word.
        :raises ValueError: if ``mode`` is not one of the two values above.
        """
        if mode not in self._MODES:
            # Previously an unknown mode silently returned None.
            raise ValueError("mode must be 'normal' or 'bayes', got {0!r}".format(mode))

        one_hot = self.vocabulary2one_hot()
        contexts = self._train_data.get_contexts()
        n_context_words = self._train_data.n_context_words
        n_features = len(self._train_data.vocabulary)
        window_size = self._train_data.window_size

        logger.info('Building Training Data, Labels from Contexts')

        if mode == 'normal':
            return self._normal_features(one_hot, contexts, n_context_words, n_features)
        return self._bayes_features(one_hot, contexts, n_context_words, n_features, window_size)

    def _normal_features(self, one_hot, contexts, n_context_words, n_features):
        """One (one-hot centre word, context word id) row per context word."""
        train_data = torch.zeros(n_context_words, n_features)
        labels = torch.zeros(n_context_words).long()

        row = 0
        for word, context_words in contexts.items():
            word_vector = one_hot[self._train_data[word], :]
            for context_word in context_words:
                train_data[row, :] = word_vector
                labels[row] = self._train_data[context_word]
                row += 1

        return train_data, labels

    def _bayes_features(self, one_hot, contexts, n_context_words, n_features, window_size):
        """Centre ids, full-window context ids and one-hot labels, one row per context word."""
        # Each occurrence of a centre word has window_size words on both sides.
        context_width = window_size * 2

        train_data_central = torch.zeros(n_context_words).long()
        labels = torch.zeros(n_context_words, n_features)
        train_data_contexts = torch.zeros(n_context_words, context_width).long()

        offset = 0
        for word, context_words in contexts.items():
            n_words = len(context_words)
            if not n_words:
                continue
            end = offset + n_words
            context_idx = [self._train_data[context_word] for context_word in context_words]

            train_data_central[offset:end] = self._train_data[word]
            labels[offset:end, :] = one_hot.index_select(0, torch.LongTensor(context_idx))

            # Split the flat id list back into one window per occurrence and
            # repeat each window once per context word it contains, so every
            # label row is paired with the whole window it came from.
            windows = funcy.partition(context_width, context_idx)
            repeated = [list(window) for window in windows for _ in range(context_width)]
            train_data_contexts[offset:end, :] = torch.LongTensor(repeated)

            offset = end

        return train_data_central, train_data_contexts, labels

In [None]:
class Plotter:
    """Namespace for the notebook's matplotlib helpers."""

    @staticmethod
    def plot_training(epochs, losses, n_hidden):
        """Show the per-epoch loss curve in a new figure.

        :param epochs: number of epochs; the x axis runs over ``range(epochs)``.
        :param losses: one loss value per epoch.
        :param n_hidden: hidden size, only used in the legend label.
        """
        plt.figure()

        # Figure decoration.
        plt.title('WordEmbeddings')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')

        # The curve itself, drawn in red.
        epoch_axis = list(range(epochs))
        curve_label = 'WordEmbeddings %d' % n_hidden
        plt.plot(epoch_axis, losses, 'r', label=curve_label)

        plt.legend()
        plt.grid(True)
        plt.show()

In [None]:
class Skipgram(nn.Module):
    """Two-layer linear skip-gram model: one-hot centre word -> context word scores.

    The rows of ``linear1.weight`` (one per vocabulary id, ``n_hidden`` columns)
    are the word vectors used by ``evaluate``.
    """

    def __init__(self, n_features, n_layers, n_hidden):
        """
        :param n_features: input and output width (the training script passes
                           the vocabulary size).
        :param n_layers: stored on the instance; the architecture below always
                         has exactly two linear layers.
        :param n_hidden: width of the hidden (embedding) layer.
        """
        super(Skipgram, self).__init__()
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.n_features = n_features

        self.linear0 = nn.Linear(self.n_features, self.n_hidden)
        self.linear1 = nn.Linear(self.n_hidden, self.n_features)

    def forward(self, data):
        """Return unnormalised scores (no activation between the two layers)."""
        return self.linear1(self.linear0(data))

    def train_network(self, train_data, labels, epochs, batch_size, weight_decay, lr):
        """Train with Adam and cross-entropy, then plot the per-epoch mean loss.

        :param train_data: 2-D float tensor, one sample per row.
        :param labels: 1-D long tensor of target class ids, aligned with ``train_data``.
        :param epochs: number of passes over the data.
        :param batch_size: samples per optimisation step; a trailing partial
                           batch is trained on as well.
        :param weight_decay: weight decay handed to Adam.
        :param lr: initial learning rate; halved every 5 scheduler steps.
        """
        # Use the GPU when one is present instead of failing without it.
        use_cuda = torch.cuda.is_available()
        if use_cuda:
            self.cuda()
        self.train(True)

        # Ceiling division: round() silently dropped a trailing partial batch.
        n_batches = (train_data.shape[0] + batch_size - 1) // batch_size
        opt = optim.Adam(self.parameters(), weight_decay=weight_decay, lr=lr)
        scheduler = optim.lr_scheduler.StepLR(opt, step_size=5, gamma=0.5)
        criterion = nn.CrossEntropyLoss()

        losses = []

        for epoch in range(epochs):
            # NOTE(review): recent PyTorch releases expect scheduler.step() after
            # the optimizer steps of an epoch; kept at the start as in the
            # original - confirm against the installed version.
            scheduler.step()
            total_loss = 0.0

            for idx in range(n_batches):
                start = idx * batch_size
                stop = start + batch_size

                opt.zero_grad()
                # Inputs do not need gradients; only the parameters are optimised.
                train_batch = Variable(train_data[start:stop, :])
                label_batch = Variable(labels[start:stop])
                if use_cuda:
                    train_batch = train_batch.cuda()
                    label_batch = label_batch.cuda()

                loss = criterion(self(train_batch), label_batch)
                loss.backward()
                opt.step()
                # Accumulate the unrounded loss; rounding happens only when logging.
                total_loss += loss.cpu().data.numpy().item()

            # Mean over the batches actually processed (was divided by n_batches + 1).
            avg_loss = total_loss / n_batches if n_batches else 0.0
            losses.append(avg_loss)
            logger.info('Epoch {0}, Average Loss {1}'
                    .format(epoch + 1, round(avg_loss, 4)))

        Plotter.plot_training(epochs=epochs,
                              losses=losses,
                              n_hidden=self.n_hidden)

    def evaluate(self, train_corpus, test_corpus):
        """Rank substitution candidates by cosine similarity and score them with GAP.

        :param train_corpus: maps a word to its vocabulary id via ``[]`` and
                             raises ``KeyError`` for unknown words.
        :param test_corpus: object with ``candidates`` and ``ground_truth`` as
                            built by ``TestCorpus``.
        :return: mean Generalized Average Precision over all gold instances
                 whose target is in the vocabulary (0.0 if there are none).
                 The value is also logged.
        """
        self.train(False)

        # linear1.weight is the second-to-last parameter the original code selected.
        embeddings = self.linear1.weight.data.cpu().numpy()
        ranked = self._rank_candidates(embeddings, train_corpus, test_corpus.candidates)

        gaps = []
        for target, sentences in test_corpus.ground_truth.items():
            ranking = ranked.get(target)
            if ranking is None:
                # Target was out of vocabulary, or it has no candidate list.
                continue
            for sentence in sentences:
                gap = self._generalized_average_precision(ranking, sentence)
                if gap is not None:
                    gaps.append(gap)

        total_average_gap = sum(gaps) / len(gaps) if gaps else 0.0
        logger.info('Total Average GAP {0}'.format(total_average_gap))
        return total_average_gap

    @staticmethod
    def _rank_candidates(embeddings, train_corpus, candidates):
        """Return {target: candidate phrases sorted by descending cosine similarity}."""
        ranked = {}
        for target, phrases in candidates.items():
            try:
                target_vec = embeddings[train_corpus[target], :]
            except KeyError:
                logger.warning('Target out of Vocabulary {0}'.format(target))
                continue

            scored = []
            for phrase in phrases:
                if not phrase:
                    # An empty phrase has no vector; averaging would divide by zero.
                    continue
                phrase_vec = numpy.zeros(embeddings.shape[1], dtype=embeddings.dtype)
                for word in phrase:
                    try:
                        idx = train_corpus[word]
                    except KeyError:
                        logger.warning('Candidate out of Vocabulary {0}'.format(word))
                    else:
                        phrase_vec += embeddings[idx, :]
                # Mean over all words of the phrase; unknown words count as zero vectors.
                phrase_vec /= len(phrase)
                sim = cosine_similarity(target_vec.reshape(1, -1), phrase_vec.reshape(1, -1))[0][0]
                scored.append((phrase, sim))

            scored.sort(key=itemgetter(1), reverse=True)
            ranked[target] = [phrase for phrase, _ in scored]
        return ranked

    @staticmethod
    def _generalized_average_precision(ranking, gold):
        """Generalized Average Precision of ``ranking`` against weighted gold phrases.

        With x_i the gold weight of the i-th ranked phrase (0 when it is not in
        the gold list) the score is

            sum_i [x_i > 0] * mean(x_1..x_i)  /  sum_j mean(y_1..y_j)

        where y are the positive gold weights sorted in descending order.

        :param ranking: list of phrases (lists of words), best first.
        :param gold: list of ``(phrase, weight)`` pairs; weights may be strings.
        :return: the score, or None when ``gold`` has no positive weight.
        """
        weights = {}
        for phrase, weight in gold:
            weights[tuple(phrase)] = int(weight)

        ideal = sorted((w for w in weights.values() if w > 0), reverse=True)
        if not ideal:
            return None

        denominator = 0.0
        running = 0
        for rank, weight in enumerate(ideal, 1):
            running += weight
            denominator += running / float(rank)

        numerator = 0.0
        running = 0
        for rank, phrase in enumerate(ranking, 1):
            weight = weights.get(tuple(phrase), 0)
            if weight > 0:
                running += weight
                numerator += running / float(rank)

        return numerator / denominator

In [None]:
class BayesSkipgram(nn.Module):
    """Embedding-based skip-gram variant that predicts a distribution over the vocabulary.

    Centre and context words are embedded separately, concatenated per context
    position, passed through a shared ReLU layer and summed over the window.
    The rows of ``linear4.weight`` (one per vocabulary id, ``embedding_dim``
    columns) are the word vectors used by ``evaluate``.
    """

    def __init__(self, n_features, n_layers, n_hidden, corpus, embedding_dim):
        """
        :param n_features: stored on the instance only.
        :param n_layers: stored on the instance only.
        :param n_hidden: stored on the instance; used as the plot legend label.
        :param corpus: training corpus exposing ``window_size`` and ``vocabulary``;
                       ``get_contexts()`` must have been called on it.
        :param embedding_dim: width of both embedding tables and of the hidden layers.
        :raises ValueError: if ``corpus.window_size`` has not been set yet.
        """
        super(BayesSkipgram, self).__init__()

        if corpus.window_size is None:
            raise ValueError('corpus.window_size is not set; '
                             'call corpus.get_contexts() before building the model')

        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.n_features = n_features
        self.embedding_dim = embedding_dim
        # Number of context words per centre word: window_size on each side.
        self.window_size = corpus.window_size * 2
        n_embeddings = len(corpus.vocabulary)

        self.embedding1 = nn.Embedding(n_embeddings, self.embedding_dim)
        self.embedding2 = nn.Embedding(n_embeddings, self.embedding_dim)
        self.linear1 = nn.Linear(2 * self.embedding_dim, self.embedding_dim)
        self.linear2 = nn.Linear(self.embedding_dim, self.embedding_dim)
        self.linear3 = nn.Linear(self.embedding_dim, self.embedding_dim)
        self.linear4 = nn.Linear(self.embedding_dim, n_embeddings)

    def forward(self, central, contexts, noise):
        """Return log-probabilities over the vocabulary, one row per sample.

        :param central: long tensor of centre word ids, one per sample.
        :param contexts: long tensor of context word ids with ``self.window_size``
                         columns per sample.
        :param noise: tensor combined with ``sigma`` in the (currently unused)
                      reparameterised sample ``z``.
        """
        x = self.embedding1(central)
        y = self.embedding2(contexts)

        # Repeat the centre embedding once per context position and pair it
        # with that position's context embedding along the feature axis.
        x = x.view((x.size()[0], 1, self.embedding_dim))
        x = torch.cat([x] * self.window_size, dim=1)
        y = torch.cat([y, x], dim=2)

        out = None
        for idx in range(self.window_size):
            hidden = F.relu(self.linear1(y[:, idx, :]))
            # Out-of-place sum: an in-place '+=' overwrites a relu output that
            # autograd may still need for the backward pass.
            out = hidden if out is None else out + hidden

        sigma = F.softplus(self.linear2(out))
        mu = self.linear3(out)
        # NOTE(review): z is computed but the output below is built from 'out',
        # so linear2/linear3 do not influence the result - confirm whether
        # linear4 was meant to consume z.
        z = mu + noise*sigma

        out = F.log_softmax(self.linear4(out), dim=1)

        return out

    def train_network(self, central_data, context_data, labels, epochs, batch_size, weight_decay, lr):
        """Train with SGD and KL divergence, then plot the per-epoch mean loss.

        :param central_data: 1-D long tensor of centre word ids.
        :param context_data: 2-D long tensor of context word ids, aligned with ``central_data``.
        :param labels: 2-D float tensor of target distributions, aligned with ``central_data``.
        :param epochs: number of passes over the data.
        :param batch_size: samples per optimisation step; a trailing partial
                           batch is trained on as well.
        :param weight_decay: weight decay handed to SGD.
        :param lr: initial learning rate; halved every 5 scheduler steps.
        """
        # Use the GPU when one is present instead of failing without it.
        use_cuda = torch.cuda.is_available()
        if use_cuda:
            self.cuda()
        self.train(True)

        # Ceiling division: round() silently dropped a trailing partial batch.
        n_batches = (central_data.shape[0] + batch_size - 1) // batch_size

        opt = optim.SGD(self.parameters(), weight_decay=weight_decay, lr=lr)
        loss_f = nn.KLDivLoss()
        scheduler = optim.lr_scheduler.StepLR(opt, step_size=5, gamma=0.5)

        losses = []

        for epoch in range(epochs):
            # NOTE(review): recent PyTorch releases expect scheduler.step() after
            # the optimizer steps of an epoch; kept at the start as in the
            # original - confirm against the installed version.
            scheduler.step()
            total_loss = 0.0

            for idx in range(n_batches):
                start = idx * batch_size
                stop = start + batch_size

                opt.zero_grad()

                central_batch = Variable(central_data[start:stop])
                context_batch = Variable(context_data[start:stop, :])
                label_batch = Variable(labels[start:stop, :])
                # One noise row per batch, shared by every sample in it.
                noise = Variable(torch.randn(1, self.embedding_dim))
                if use_cuda:
                    central_batch = central_batch.cuda()
                    context_batch = context_batch.cuda()
                    label_batch = label_batch.cuda()
                    noise = noise.cuda()

                output = self(central_batch, context_batch, noise)
                loss = loss_f(output, label_batch)
                loss.backward()
                opt.step()
                total_loss += loss.cpu().data.numpy().item()

            # Mean over the batches actually processed (was divided by n_batches + 1).
            avg_loss = total_loss / n_batches if n_batches else 0.0
            losses.append(avg_loss)
            logger.info('Epoch {0}, Average Loss {1}'
                    .format(epoch + 1, avg_loss))

        Plotter.plot_training(epochs=epochs,
                              losses=losses,
                              n_hidden=self.n_hidden)

    def evaluate(self, corpus, featurizer):
        """Rank substitution candidates by cosine similarity and score them with GAP.

        The parameters are now actually used; the previous body read the
        notebook-level globals ``train_corpus`` and ``test_corpus`` instead.

        :param corpus: training corpus; maps a word to its vocabulary id via
                       ``[]`` and raises ``KeyError`` for unknown words.
        :param featurizer: the test corpus (object with ``candidates`` and
                           ``ground_truth``), which is what the notebook passes
                           here, or an object whose ``test_data`` attribute
                           holds it.
        :return: mean Generalized Average Precision over all gold instances
                 whose target is in the vocabulary (0.0 if there are none).
                 The value is also logged.
        """
        self.train(False)

        train_corpus = corpus
        test_corpus = getattr(featurizer, 'test_data', featurizer)

        # linear4.weight is the second-to-last parameter the original code
        # selected; its column count is embedding_dim, not n_hidden.
        embeddings = self.linear4.weight.data.cpu().numpy()
        ranked = self._rank_candidates(embeddings, train_corpus, test_corpus.candidates)

        gaps = []
        for target, sentences in test_corpus.ground_truth.items():
            ranking = ranked.get(target)
            if ranking is None:
                # Target was out of vocabulary, or it has no candidate list.
                continue
            for sentence in sentences:
                gap = self._generalized_average_precision(ranking, sentence)
                if gap is not None:
                    gaps.append(gap)

        total_average_gap = sum(gaps) / len(gaps) if gaps else 0.0
        logger.info('Total Average GAP {0}'.format(total_average_gap))
        return total_average_gap

    @staticmethod
    def _rank_candidates(embeddings, train_corpus, candidates):
        """Return {target: candidate phrases sorted by descending cosine similarity}."""
        ranked = {}
        for target, phrases in candidates.items():
            try:
                target_vec = embeddings[train_corpus[target], :]
            except KeyError:
                logger.warning('Target out of Vocabulary {0}'.format(target))
                continue

            scored = []
            for phrase in phrases:
                if not phrase:
                    # An empty phrase has no vector; averaging would divide by zero.
                    continue
                phrase_vec = numpy.zeros(embeddings.shape[1], dtype=embeddings.dtype)
                for word in phrase:
                    try:
                        idx = train_corpus[word]
                    except KeyError:
                        logger.warning('Candidate out of Vocabulary {0}'.format(word))
                    else:
                        phrase_vec += embeddings[idx, :]
                # Mean over all words of the phrase; unknown words count as zero vectors.
                phrase_vec /= len(phrase)
                sim = cosine_similarity(target_vec.reshape(1, -1), phrase_vec.reshape(1, -1))[0][0]
                scored.append((phrase, sim))

            scored.sort(key=itemgetter(1), reverse=True)
            ranked[target] = [phrase for phrase, _ in scored]
        return ranked

    @staticmethod
    def _generalized_average_precision(ranking, gold):
        """Generalized Average Precision of ``ranking`` against weighted gold phrases.

        With x_i the gold weight of the i-th ranked phrase (0 when it is not in
        the gold list) the score is

            sum_i [x_i > 0] * mean(x_1..x_i)  /  sum_j mean(y_1..y_j)

        where y are the positive gold weights sorted in descending order.

        :param ranking: list of phrases (lists of words), best first.
        :param gold: list of ``(phrase, weight)`` pairs; weights may be strings.
        :return: the score, or None when ``gold`` has no positive weight.
        """
        weights = {}
        for phrase, weight in gold:
            weights[tuple(phrase)] = int(weight)

        ideal = sorted((w for w in weights.values() if w > 0), reverse=True)
        if not ideal:
            return None

        denominator = 0.0
        running = 0
        for rank, weight in enumerate(ideal, 1):
            running += weight
            denominator += running / float(rank)

        numerator = 0.0
        running = 0
        for rank, phrase in enumerate(ranking, 1):
            weight = weights.get(tuple(phrase), 0)
            if weight > 0:
                running += weight
                numerator += running / float(rank)

        return numerator / denominator

In [None]:
if __name__ == '__main__':
    # End-to-end run: load corpora, build features, train and evaluate the
    # Bayesian skip-gram model, then log the elapsed wall-clock time.
    start = time.time()

    train_corpus = TrainCorpus(file='wa/test.en')
    test_corpus = TestCorpus(
        candidate_file='eval/lst.gold.candidates',
        truth_file='eval/lst_test.gold',
    )

    featurizer = Featurizer(train_corpus, test_corpus)
    central_words, contexts, labels = featurizer.context_words2features(mode='bayes')

    # Alternative experiment with the plain skip-gram model (disabled):
    #
    #   train_data, labels = featurizer.context_words2features(mode='normal')
    #   skipgram = Skipgram(n_layers=3,
    #                       n_hidden=500,
    #                       n_features=train_data.shape[1])
    #   skipgram.train_network(train_data=train_data,
    #                          labels=labels,
    #                          epochs=40,
    #                          batch_size=256,
    #                          weight_decay=0.0001,
    #                          lr=0.005)
    #   skipgram.evaluate(train_corpus, test_corpus)

    bayes_skipgram = BayesSkipgram(
        n_features=len(train_corpus.vocabulary),
        n_layers=3,
        n_hidden=500,
        corpus=train_corpus,
        embedding_dim=500,
    )

    bayes_skipgram.train_network(
        central_data=central_words,
        context_data=contexts,
        labels=labels,
        epochs=100,
        batch_size=256,
        weight_decay=0.0001,
        lr=0.001,
    )

    bayes_skipgram.evaluate(train_corpus, test_corpus)

    end = time.time()
    logger.info('Finished Run, Time Elapsed {0} Minutes'.format(round((end-start)/60, 2)))