## Word embeddings

A custom wrapper around GloVe embeddings for teaching/demo purposes, loosely modeled after `gensim`'s 
functionality.

In [1]:
import numpy as np

In [2]:
class WordEmbeddings:
    """In-memory word-embedding table loaded from a GloVe-style text file.

    Every non-empty line of the file is expected to contain a token followed
    by its whitespace-separated float components.

    Attributes:
        vocab: List of tokens in file order. Treated as read-only after
            loading, because lookups use an index built once in ``__init__``.
        vectors: 2-D numpy array; row ``i`` is the vector of ``vocab[i]``.
    """

    _SCORE_TYPES = ('cosine', 'distance')

    def __init__(self, filename):
        """Parse ``filename`` and populate ``vocab`` and ``vectors``.

        Args:
            filename: Path to a text file with one ``token v1 v2 ...`` entry
                per line.
        """
        # Use per-instance containers: class-level lists would be shared, so
        # a second instance would also contain the first instance's entries.
        vocab = []
        rows = []
        # GloVe text files are distributed as UTF-8; do not rely on the
        # platform's default encoding.
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                vocab.append(fields[0])
                rows.append(np.array([float(val) for val in fields[1:]]))
        self.vocab = vocab
        self.vectors = np.array(rows)
        # Built once so that word lookups are O(1) instead of scanning the
        # vocabulary list. With duplicate tokens the last occurrence wins.
        self._word_to_idx = {word: idx for idx, word in enumerate(vocab)}

    @property
    def n_features(self):
        """Number of components per embedding vector."""
        return self.vectors.shape[1]

    @property
    def idx_word_mapping(self):
        """A freshly built ``{row index: word}`` dictionary."""
        return dict(enumerate(self.vocab))

    @property
    def word_idx_mapping(self):
        """A freshly built ``{word: row index}`` dictionary."""
        return {word: idx for idx, word in enumerate(self.vocab)}

    def _index_of(self, word):
        """Return the row index of ``word``; raise ``KeyError`` if unknown."""
        try:
            return self._word_to_idx[word]
        except (KeyError, TypeError):
            # TypeError: an unhashable entry can never be a vocabulary word,
            # so report it the same way as any other unknown entry.
            raise KeyError(f"{word} is not in the vocabulary.") from None

    def vector(self, input_):
        """Look up the embedding(s) for a word or a list of words.

        Args:
            input_: A single word (``str``) or a ``list`` of words.

        Returns:
            One vector for a ``str`` input, or a list of vectors (in input
            order) for a ``list`` input.

        Raises:
            KeyError: If a word is not in the vocabulary.
            TypeError: If ``input_`` is neither a ``str`` nor a ``list``.
        """
        if isinstance(input_, str):
            return self.vectors[self._index_of(input_)]
        if isinstance(input_, list):
            return [self.vectors[self._index_of(entry)] for entry in input_]
        raise TypeError("Only strings or lists allowed.")

    @staticmethod
    def cos_sim(a, b):
        """Return the cosine similarity of ``a`` and ``b`` rescaled to [0, 1].

        The raw cosine lies in [-1, 1]; it is mapped linearly so that 0 means
        opposite direction, 0.5 orthogonal and 1 identical direction.
        """
        cosine = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
        return (cosine + 1) / 2

    @staticmethod
    def calculate_distance(embeddings, vec):
        """Return the Euclidean distance from ``vec`` to each row of ``embeddings``."""
        return np.linalg.norm(embeddings - vec, axis=1)

    def _ranked(self, input_, k, score_type, raw, farthest):
        """Shared implementation of ``nearest_neighbors`` / ``most_distant``.

        Candidates are always selected by Euclidean distance; ``score_type``
        only decides which score is reported and used for the final ordering.
        """
        if score_type not in self._SCORE_TYPES:
            raise ValueError("Score type not available, use `cosine` or `distance`.")
        if not raw:
            input_ = self.vector(input_)
        distances = self.calculate_distance(self.vectors, input_)
        order = distances.argsort()
        if farthest:
            # A plain ``order[-k:]`` would return *everything* for k == 0.
            selected = order[max(len(order) - k, 0):]
        else:
            selected = order[:k]
        words = [self.vocab[idx] for idx in selected]
        if score_type == 'cosine':
            scores = [self.cos_sim(input_, self.vectors[idx]) for idx in selected]
            # High similarity means "near": best-first is descending for
            # nearest neighbours and ascending for most distant.
            descending = not farthest
        else:
            scores = distances[selected]
            descending = farthest
        return sorted(zip(words, scores), key=lambda tup: tup[1], reverse=descending)

    def nearest_neighbors(self, input_, k=5, score_type='cosine', raw=False):
        """Return the ``k`` vocabulary entries closest to ``input_``.

        Args:
            input_: A word, or a vector when ``raw`` is true.
            k: Number of neighbours to return.
            score_type: ``'cosine'`` (rescaled similarity, highest first) or
                ``'distance'`` (Euclidean distance, smallest first).
            raw: If true, ``input_`` is already a vector and is not looked up.

        Returns:
            A list of ``(word, score)`` tuples, closest first.

        Raises:
            ValueError: If ``score_type`` is not supported.
            KeyError: If ``input_`` is a word that is not in the vocabulary.
        """
        return self._ranked(input_, k, score_type, raw, farthest=False)

    def most_distant(self, input_, k=5, score_type='cosine', raw=False):
        """Return the ``k`` vocabulary entries farthest from ``input_``.

        Args:
            input_: A word, or a vector when ``raw`` is true.
            k: Number of entries to return.
            score_type: ``'cosine'`` (rescaled similarity, lowest first) or
                ``'distance'`` (Euclidean distance, largest first).
            raw: If true, ``input_`` is already a vector and is not looked up.

        Returns:
            A list of ``(word, score)`` tuples, farthest first.

        Raises:
            ValueError: If ``score_type`` is not supported.
            KeyError: If ``input_`` is a word that is not in the vocabulary.
        """
        return self._ranked(input_, k, score_type, raw, farthest=True)

    def analogize(self, source: list, target: str, k=5, score_type='cosine'):
        """Answer "``source[0]`` is to ``source[1]`` as ``target`` is to ?".

        The query vector is ``target - source[0] + source[1]``. The input
        words themselves are not excluded from the result.

        Args:
            source: List whose first two entries are the example pair.
            target: Word to which the same relation is applied.
            k: Number of candidates to return.
            score_type: Passed through to ``nearest_neighbors``.

        Returns:
            A list of ``(word, score)`` tuples, best candidate first.
        """
        source_vecs = self.vector(source)
        target_vec = self.vector(target)
        query = target_vec - source_vecs[0] + source_vecs[1]
        return self.nearest_neighbors(query, k=k, score_type=score_type, raw=True)

In [3]:
# Parse the whole embeddings file into memory (vocabulary plus vector matrix).
# The file name suggests the 50-dimensional GloVe 6B vectors; the path is
# relative to the notebook's working directory.
e = WordEmbeddings("data/glove/glove.6B.50d.txt")

In [4]:
# Analogy query: 'uk' is to 'london' as 'italy' is to ...?
# The query vector is italy - uk + london; the input words are not filtered
# out of the candidates, so 'italy' itself can appear in the result.
e.analogize(['uk', 'london'], 'italy')

[('italy', 0.9154047999085728),
 ('rome', 0.9092034423709726),
 ('milan', 0.8952465727474838),
 ('genoa', 0.8874150491232438),
 ('naples', 0.8799018070278701)]