In [288]:
import numpy as np
from collections import Counter
from gensim.models import KeyedVectors
from sklearn.decomposition import PCA
from scipy.linalg import subspace_angles

In [289]:
from config import w2v_file, vocab_limit

In [4]:
keyed_vectors = KeyedVectors.load_word2vec_format(w2v_file, limit=vocab_limit, binary=True)

2019-12-19 20:17:02,408 gensim.models.utils_any2vec: INFO loading projection weights from ~/Downloads/GoogleNews-vectors-negative300.bin.gz
2019-12-19 20:17:44,386 gensim.models.utils_any2vec: INFO loaded (1000000, 300) matrix from ~/Downloads/GoogleNews-vectors-negative300.bin.gz


In [422]:
class Space:

    def __init__(self, keyed_vectors: KeyedVectors):
        self.keyed_vectors = keyed_vectors
        self.embed_size: int = 300

    def get_embedding(self, word: str) -> np.ndarray:
        """ What is the vector for missing words? """
        return self.keyed_vectors[word] if word in self.keyed_vectors else None

    def _vectorize_keywords(self, keywords: Counter, use_tf: bool = True) -> np.ndarray:
        if not use_tf:
            return np.array([
            self.get_embedding(word)
            for word in keywords.keys()
            if self.get_embedding(word) is not None
        ])
        return np.array([
            self.get_embedding(word) * count
            for word, count in keywords.items()
            if self.get_embedding(word) is not None
        ])

    def _autocorr(self, matrix) -> np.ndarray:
        return matrix.T.dot(matrix)

    def create_subspace(self, keywords: Counter, dims: int, use_tf: bool = True) -> np.ndarray:
        embeddings = self._vectorize_keywords(keywords, use_tf)
        R = self._autocorr(embeddings)
        return PCA(n_components=dims).fit_transform(R)

    def subspaces_similarity(self, S1, S2):
        canon_angles = subspace_angles(S1, S2)
        s = np.average(np.square(np.cos(canon_angles)))
        return s

In [423]:
space = Space(keyed_vectors)

In [433]:
keywords1 = Counter(['king'])
keywords2 = Counter(['tree'])

In [434]:
ss1 = space.create_subspace(keywords=keywords1, dims=5)
ss2 = space.create_subspace(keywords=keywords2, dims=5)

In [435]:
space.subspaces_similarity(ss1, ss2)

0.013432044

In [436]:
from scipy.spatial.distance import cosine

In [437]:
a = space.get_embedding('king')
b = space.get_embedding('tree')
c = space.get_embedding('man')
d = space.get_embedding('woman')

In [439]:
cosine(a, b)

0.8908586427569389

In [415]:
al=subspace_angles(ss1,ss2)
al

array([3.847888e-07], dtype=float32)

In [416]:
np.rad2deg(al)

array([2.2046772e-05], dtype=float32)