In [1]:
import numpy as np
from itertools import chain, product
import word2vec



In [2]:
model = word2vec.getModel()

In [3]:
hebrew = 'אבגדהוזחטיכלמנסעפצקרשתךםןףץ'

GMatrix = {"normal": {c:v for c, v in zip(hebrew, 
                                          chain(range(1, 10),
                                                range(10, 100, 10),
                                                range(100, 500, 100),
                                                [20, 40, 50, 80, 90])
                                         )},
           "big": {c:v for c, v in zip(hebrew, 
                                       chain(range(1, 10),
                                             range(10, 100, 10),
                                             range(100, 1000, 100))
                                         )},
           "small": {c:v for c, v in zip(hebrew, 
                                         chain(range(1, 10),
                                               range(1, 10),
                                               range(1, 10)))},
           "serial": {c:v for c, v in zip(hebrew, 
                                          chain(range(1, 28)))},
           "etbesh": {c:v for c, v in zip(hebrew, 
                                          chain(reversed(range(100, 500, 100)),
                                                reversed(range(10, 100, 10)),
                                                reversed(range(1, 10)),
                                                [30, 10, 9, 7, 6])
                                         )},
          }

GMatrix["forward"] = {c:v for c, v in zip(hebrew, np.cumsum(list(GMatrix["normal"].values())))}
GMatrix["private"] = {c:v for c, v in zip(hebrew, map(lambda x: x ** 2, GMatrix["normal"].values()))}

def hebrew_str(s):
    return all("\u0590" <= c <= "\u05EA" or c in " ,.:;'[](){}" for c in s)

def GMatria(s, method="normal"):
    if not hebrew_str(s):
        pass
        # Non Hebrew letter encountered
    return sum(GMatrix[method][c] for c in s if c in GMatrix[method])

GMatrix["semite"] = {c:v for c, v in zip(hebrew, [GMatria('אלף'),
                                                  GMatria('בית'),
                                                  GMatria('גמל'),
                                                  GMatria('דלת'),
                                                  GMatria('הא'),
                                                  GMatria('ויו'),
                                                  GMatria('זיין'),
                                                  GMatria('חית'),
                                                  GMatria('טית'),
                                                  GMatria('יוד'),
                                                  GMatria('כף'),
                                                  GMatria('למד'),
                                                  GMatria('מם'),
                                                  GMatria('נון'),
                                                  GMatria('סמך'),
                                                  GMatria('עין'),
                                                  GMatria('פא'),
                                                  GMatria('צדי'),
                                                  GMatria('קוף'),
                                                  GMatria('ריש'),
                                                  GMatria('תיו'),
                                                  GMatria('כף'),
                                                  GMatria('מם'),
                                                  GMatria('נון'),
                                                  GMatria('פא'),
                                                  GMatria('צדי')])}

def embedding(s):
    return sum([model.wv[word] for word in s.split(' ')])

In [4]:
# words in vocabulary
len(model.wv.index_to_key)

559687

In [5]:
wv = [(word, model.wv[word], GMatria(word)) for  word in model.wv.index_to_key if hebrew_str(word)]

In [6]:
gwv = {g:[(w[0], w[1]) for w in wv if w[2] == g] for g in set([w[2] for w in wv])}

In [7]:
def solve_words(s, search=100000, top=2):
    return [w for w, _ in model.wv.most_similar(positive=s.split(), topn=search)[:search][:search]
            if GMatria(s) == GMatria(w) and hebrew_str(w)][:top]

solve_words('בנימין נתניהו')

['מרגלית', 'שניאורסון']

### Perfect sum

In [8]:
from collections import namedtuple
# This is a doubly linked list.
# (value, tail) will be one group of solutions.  (next_answer) is another.
SumPath = namedtuple('SumPath', 'value tail next_answer')

def fixed_sum_paths (array, target, count):
    # First find counts of values to handle duplications.
    value_repeats = {}
    for value in array:
        if value in value_repeats:
            value_repeats[value] += 1
        else:
            value_repeats[value] = 1

    # paths[depth][x] will be all subsets of size depth that sum to x.
    paths = [{} for i in range(count+1)]

    # First we add the empty set.
    paths[0][0] = SumPath(value=None, tail=None, next_answer=None)

    # Now we start adding values to it.
    for value, repeats in value_repeats.items():
        # Reversed depth avoids seeing paths we will find using this value.
        for depth in reversed(range(len(paths))):
            for result, path in paths[depth].items():
                for i in range(1, repeats+1):
                    if count < i + depth:
                        # Do not fill in too deep.
                        break
                    result += value
                    if result in paths[depth+i]:
                        path = SumPath(
                            value=value,
                            tail=path,
                            next_answer=paths[depth+i][result]
                            )
                    else:
                        path = SumPath(
                            value=value,
                            tail=path,
                            next_answer=None
                            )
                    paths[depth+i][result] = path

                    # Subtle bug fix, a path for value, value
                    # should not lead to value, other_value because
                    # we already inserted that first.
                    path = SumPath(
                        value=value,
                        tail=path.tail,
                        next_answer=None
                        )
    return paths[count][target]

def path_iter(paths):
    if paths.value is None:
        # We are the tail
        yield []
    else:
        while paths is not None:
            value = paths.value
            for answer in path_iter(paths.tail):
                answer.append(value)
                yield answer
            paths = paths.next_answer

def fixed_sums(array, target, count):
    paths = fixed_sum_paths(array, target, count)
    return path_iter(paths)

# example
# for path in fixed_sums([1,2,3,3,4,5,6,9], 10, 3):
#     print(path)

### Use perfect sum for GMatrix

In [9]:
def cosine_sim(e1,e2):
    return np.dot(e1, e2)/(np.linalg.norm(e1) * np.linalg.norm(e2))

In [17]:
import tqdm
def solve_word_pairs(s, top=15):
    e = embedding(s)
    g = GMatria(s)
    
    sentenceses = []
    distanceses = []
    for path in tqdm.tqdm(list(fixed_sums(gwv.keys(), g, 2))):
        sentences = [' '.join([w1[0], w2[0]]) for w1 in gwv[path[0]] for w2 in gwv[path[1]]]
        embeddings=[w1[1] + w2[1] for w1 in gwv[path[0]] for w2 in gwv[path[1]]]
        distances = [cosine_sim(embededing,e) for embededing in embeddings]
        distances, sentences = (list(t) for t in zip(*sorted(zip(distances, sentences), reverse=True)))
        sentenceses.extend(sentences[:top])
        distanceses.extend(distances[:top])
    distanceses, sentenceses = (list(t) for t in zip(*sorted(zip(distanceses, sentenceses), reverse=True)))
    print(list(zip(distanceses[:top], sentenceses[:top])))

In [18]:
solve_word_pairs('שמעון פרס')

100%|██████████| 401/401 [20:11<00:00,  3.02s/it]

[(1.0, 'פרס שמעון'), (0.88701814, 'פרס ויספיש'), (0.87684643, 'פרס ומרידור'), (0.8762464, 'פרס ודותן'), (0.8724531, 'פרס ויתילדו')]





In [95]:
def solve_word_lists(s, top=15, n=2):
    e = embedding(s)
    g = GMatria(s)
    
    sentenceses = []
    distanceses = []
    for path in tqdm.tqdm(list(fixed_sums(gwv.keys(), g, n))):
        sentences = [' '.join([x[0] for x in w]) for w in list(product(*[gwv[g] for g in path]))]
        embeddings = [sum([x[1] for x in w]) for w in list(product(*[gwv[g] for g in path]))]
        distances = [cosine_sim(embededing,e) for embededing in embeddings]
        distances, sentences = (list(t) for t in zip(*sorted(zip(distances, sentences), reverse=True)))
        sentenceses.extend(sentences[:top])
        distanceses.extend(distances[:top])
    distanceses, sentenceses = (list(t) for t in zip(*sorted(zip(distanceses, sentenceses), reverse=True)))
    print(list(zip(distanceses[:top], sentenceses[:top])))

In [None]:
solve_word_lists('שמעון פרס')

 42%|████▏     | 168/401 [05:14<10:01,  2.58s/it]

In [21]:
s = 'שמעון פרס'
e = embedding(s)
g = GMatria(s)

paths = list(fixed_sums(gwv.keys(), g, 2))

In [86]:
a = [list(ws) for ws in product(*[gwv[g] for g in paths[0]])]

In [89]:
[[' '.join([x[0] for x in w]) for w in list(product(*[gwv[g] for g in path]))] for path in paths] 

KeyboardInterrupt: 

In [83]:
path = paths[0]
[' '.join([m[0] for w in list(ws) for m in w]) for ws in product(*[gwv[g] for g in path])] 

TypeError: sequence item 1: expected str instance, numpy.float32 found

In [67]:
[ws for ws in product(*[gwv[g] for g in paths[0]])][0]

(('אא',
  array([-0.02018585,  0.0450373 , -0.13733631, -0.07905271, -0.04696308,
         -0.02823014, -0.14744899,  0.22046283,  0.1711357 ,  0.0162007 ,
          0.04333334, -0.01051869, -0.07794161,  0.17227547,  0.21878876,
         -0.03079641,  0.15786213, -0.02208425,  0.06877608, -0.06593809,
          0.12817593,  0.02265123, -0.15647903,  0.04874207,  0.00206851,
         -0.00242314,  0.03758171,  0.01604735, -0.07077962,  0.10401662,
          0.02063344, -0.04853552,  0.00691962, -0.01489432, -0.11872116,
          0.02457944,  0.06885709, -0.00908786, -0.03101856,  0.04112574,
          0.04595832, -0.13938701, -0.08017936,  0.05028337,  0.18043405,
          0.01164006, -0.05166569, -0.14200662,  0.07951269,  0.23882549,
         -0.06322876,  0.0117099 ,  0.03553892, -0.08741646,  0.05546534,
         -0.01098728, -0.12619972,  0.08050238,  0.1477471 ,  0.00733709,
         -0.03581674, -0.00521502,  0.09387129,  0.1649713 , -0.21663246,
          0.09370904, -0.01258

In [68]:
sentences = [' '.join([list(w[0]) for w in list(ws)]) for ws in product(*[gwv[g] for g in paths[0]])]
#     embeddings = [sum([w[1] for w in ws]) for ws in product([gwv[g] for g in path])]
#     distances = [cosine_sim(embededing,e) for embededing in embeddings]
#     distances, sentences = (list(t) for t in zip(*sorted(zip(distances, sentences), reverse=True)))
#     print(distances[:1], sentences[:1])

TypeError: sequence item 0: expected str instance, list found