In [1]:
import sys
from argparse import ArgumentParser, RawTextHelpFormatter
import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize

class Dictionary:
    """In-memory table of word vectors with nearest-word lookup.

    Instance attributes (all populated by ``__init__``):
      words      -- list of words, in the order they appear in the file
      lookup     -- dict mapping each word to its row index
      dictionary -- 2-D array holding one raw vector per row
      norms      -- ``dictionary`` passed through sklearn's ``normalize`` along
                    axis 1; despite the name these are the normalised vectors
                    themselves, not their lengths

    Queries are stripped and upper-cased before being looked up, so the file is
    presumably expected to store its words in upper case (verify against the
    data file).
    """

    def __init__(self, filepath, encoding="latin1"):
        """Load word vectors from a text file.

        Every non-blank line must look like ``WORD<two spaces>v1 v2 ...``,
        where the values are whitespace-separated floats.

        Args:
            filepath: path of the vector file to read.
            encoding: text encoding used to decode the file.

        Raises:
            ValueError: if a line has no two-space separator, a value is not a
                float, or the file contains no vectors at all.
        """
        self.words = list()
        self.lookup = dict()
        vectors = list()

        print("loading...", file=sys.stderr)
        # The context manager guarantees the handle is closed, even when a
        # malformed line aborts the load half-way through.
        with open(filepath, encoding=encoding) as handle:
            for line in handle:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                # Split on the first double space only; the vector part is
                # re-split on arbitrary whitespace just below.
                word, vec_s = line.split("  ", 1)
                vec = [float(n) for n in vec_s.split()]
                # Index by rows stored so far, not by file line number, so that
                # skipped blank lines cannot misalign lookup and matrix rows.
                self.lookup[word] = len(self.words)
                vectors.append(vec)
                self.words.append(word)
        print(f'Total words: {len(self.words)}', file=sys.stderr)
        if not vectors:
            raise ValueError(f"no word vectors found in {filepath!r}")
        self.dictionary = np.array(vectors)
        self.norms = normalize(self.dictionary, axis=1)
        # These report the smallest/largest *component* of the normalised
        # vectors (a quick sanity check), not vector lengths.
        print('min Norm', np.min(self.norms))
        print('max Norm', np.max(self.norms))

    def _index(self, word):
        """Return the row index of ``word`` (stripped and upper-cased).

        Raises:
            KeyError: if the word is not in the table.
        """
        return self.lookup[word.strip().upper()]

    def vec(self, word):
        """Return the raw (un-normalised) vector stored for ``word``.

        Raises:
            KeyError: if the word is not in the table.
        """
        return self.dictionary[self._index(word), :]

    def score(self, word1, word2):
        """Return the dot product of the normalised vectors of two words.

        Raises:
            KeyError: if either word is not in the table.
        """
        v1 = self.norms[self._index(word1), :]
        v2 = self.norms[self._index(word2), :]
        return np.sum(v1*v2)

    def word(self, vec, n=None):
        """Find the word(s) whose normalised vector best matches ``vec``.

        Args:
            vec: query vector; it is divided by its own Euclidean norm, so it
                must be non-zero to give meaningful results.
            n: when None, return only the best-matching word; otherwise return
                the ``n`` best matches.

        Returns:
            A single word when ``n`` is None, else a list of
            ``(word, similarity)`` tuples sorted by decreasing similarity.
        """
        v = vec / np.linalg.norm(vec)
        dots = np.dot(self.norms, v)
        if n is None:
            return self.words[np.argmax(dots)]
        # Negating the scores makes the ascending argsort yield best-first.
        return [(self.words[x], dots[x]) for x in np.argsort(-dots)[:n]]


In [2]:
# Load the word-vector table from the file "simvecs" (path is relative to the
# notebook's working directory); every later cell queries this object.
d = Dictionary(filepath="simvecs")

loading...


min Norm -0.6210165606884935
max Norm 0.6102571871129993


Total words: 133860


In [3]:
# Round trip: fetch the raw vector for 'look', then map it back to a word.
v = d.vec('look')
closest_to_look = d.word(v)
print(v.shape, closest_to_look)

# The ten best matches for 'two', each paired with its similarity score.
two_matches = d.word(d.vec('two'), 10)
print(two_matches)

(50,) LOOK
[('TWO', 1.0000000000000002), ('TUE', 0.9983326893660539), ('TU', 0.9982340708709736), ('TEW', 0.9979064808393452), ('TOO', 0.9977740077253083), ('THUY', 0.9974699404340962), ('TO', 0.9973053406733928), ('TUTU', 0.9776248355261037), ('TREU', 0.9715127874856855), ('TRUE', 0.9709780187976012)]


In [4]:
# Vector-arithmetic analogies: base + derived - stem, e.g.
# TURN + BURNING - BURN; print the five best matches for each combination.
analogy_triples = [
    ('TURN', 'BURNING', 'BURN'),
    ('JUMP', 'HELPED', 'HELP'),
    ('COMFORTABLE', 'UNCONSCIOUS', 'CONSCIOUS'),
    ('CONTRACT', 'NON-FINANCIAL', 'FINANCIAL'),
]
for base, derived, stem in analogy_triples:
    print(d.word(d.vec(base) + d.vec(derived) - d.vec(stem), 5))

[('TURNING', 0.9793758767291694), ('TERMING', 0.9323801622938123), ('CHURNING', 0.8997649785298691), ('TWIRLING', 0.8910618371075985), ('TOWERING', 0.8901059772709727)]
[('JUMPED', 0.9488281100764806), ('DUMPED', 0.8792958295473174), ('JUMP', 0.8662129216397209), ('JUMPSUIT', 0.8385106402802855), ('JEMMOTT', 0.8171843522911427)]
[('UNCOMFORTABLE', 0.9154567768829316), ('UNWORKABLE', 0.8917171894794256), ('UNGOVERNABLE', 0.887317599246929), ('UNFORGETTABLE', 0.8847351680369085), ('UNFLAPPABLE', 0.8698138608526698)]
[('NON-CONTRACT', 0.896416399901073), ('NONCONTRACT', 0.8927973416634298), ('CONTRACT', 0.8423575801788273), ('TRENTE-ET-QUARANTE', 0.7957182564086372), ('CONTACT', 0.7934639023755785)]


In [None]:
# Same TURN + BURNING - BURN query as in the previous cell; this cell has not
# been executed in the saved notebook.
turn_analogy_vec = d.vec('TURN') + d.vec('BURNING') - d.vec('BURN')
print(d.word(turn_analogy_vec, 5))