In [1]:
from mxnet import nd
from mxnet.contrib import text

In [2]:
# Names of the pretrained GloVe files that text.embedding can load.
glove_vec = text.embedding.get_pretrained_file_names("glove")

In [3]:
# Show the available pretrained word-vector files.
print(glove_vec)

['glove.42B.300d.txt', 'glove.6B.50d.txt', 'glove.6B.100d.txt', 'glove.6B.200d.txt', 'glove.6B.300d.txt', 'glove.840B.300d.txt', 'glove.twitter.27B.25d.txt', 'glove.twitter.27B.50d.txt', 'glove.twitter.27B.100d.txt', 'glove.twitter.27B.200d.txt']


In [4]:
# Load the pretrained GloVe embedding stored in glove.6B.50d.txt
# (50-dimensional vectors, as the recorded vector output further down confirms).
glove_6b50d = text.embedding.create('glove', pretrained_file_name="glove.6B.50d.txt")

In [5]:
# Number of entries in the loaded embedding.
word_size = len(glove_6b50d)
print(word_size)

400001


In [6]:
# Token -> index lookup.
index = glove_6b50d.token_to_idx['happy']
print(index)

1752


In [None]:
# Index -> token lookup. Reuses `index` from the token lookup above instead of
# hard-coding 1752, so the cell stays correct if a different embedding file
# (with a different index layout) is loaded.
word = glove_6b50d.idx_to_token[index]
print(word)

In [7]:
# Word vector for the token looked up above. Uses `index` rather than the
# literal 1752 so the cell follows whatever token was queried.
print(glove_6b50d.idx_to_vec[index])


[ 0.092086  0.2571   -0.58693  -0.37029   1.0828   -0.55466  -0.78142
  0.58696  -0.58714   0.46318  -0.11267   0.2606   -0.26928  -0.072466
  1.247     0.30571   0.56731   0.30509  -0.050312 -0.64443  -0.54513
  0.86429   0.20914   0.56334   1.1228   -1.0516   -0.78105   0.29656
  0.7261   -0.61392   2.4225    1.0142   -0.17753   0.4147   -0.12966
 -0.47064   0.3807    0.16309  -0.323    -0.77899  -0.42473  -0.30826
 -0.42242   0.055069  0.38267   0.037415 -0.4302   -0.39442   0.10511
  0.87286 ]
<NDArray 50 @cpu(0)>


# GloVe 的应用

In [8]:
# Cosine similarity
def cos_sim(x, y):
    """Return the cosine similarity of ``x`` and ``y``.

    Computed as ``dot(x, y) / (||x|| * ||y||)``. No epsilon is added, so the
    denominator is zero when either input has zero norm.
    """
    numerator = nd.dot(x, y)
    denominator = x.norm() * y.norm()
    return numerator / denominator

ERROR! Session/line number was not unique in database. History logging moved to new session 99


In [9]:
# Sanity check: b is 100 * a, so the two vectors point the same way and the
# cosine similarity should be 1.
a = nd.array([4,5])
b = nd.array([400,500])
print(cos_sim(a,b))


[1.]
<NDArray 1 @cpu(0)>


In [10]:
# Nearest-neighbour (synonym) search
def norm_vecs_by_row(x):
    """Divide every row of ``x`` by its L2 norm.

    ``x`` is reduced along ``axis=1``, so it is treated as a matrix with one
    vector per row.
    """
    # The 1e-10 added under the square root is for numerical stability
    # (it keeps the denominator away from zero).
    squared_norms = nd.sum(x * x, axis=1) + 1e-10
    row_norms = squared_norms.sqrt().reshape((-1, 1))
    return x / row_norms

def get_knn(token_embedding, k, word):
    """Return up to ``k`` tokens whose vectors are most similar to ``word``.

    Every vocabulary vector is L2-normalised with ``norm_vecs_by_row`` and
    dotted with the (un-normalised) vector of ``word``. Scaling by the query's
    norm would not change the ordering, so the ranking is by cosine similarity.

    Args:
        token_embedding: embedding object exposing ``get_vecs_by_tokens``,
            ``idx_to_vec``, ``token_to_idx``, ``to_tokens`` and ``len()``.
        k: number of neighbours to return.
        word: query token.

    Returns:
        The result of ``token_embedding.to_tokens`` on the selected indices,
        best match first, with the query token itself excluded.
    """
    word_vec = token_embedding.get_vecs_by_tokens([word]).reshape((-1, 1))
    vocab_vecs = norm_vecs_by_row(token_embedding.idx_to_vec)
    dot_prod = nd.dot(vocab_vecs, word_vec)
    # Ask for one extra candidate so that k remain once the query is removed.
    indices = nd.topk(dot_prod.reshape((len(token_embedding), )), k=k+1,
                      ret_typ='indices')
    # Single device-to-host copy instead of one asscalar() sync per element.
    indices = [int(i) for i in indices.asnumpy()]
    # Remove the query word by its index instead of blindly dropping the first
    # hit: with ties, or when `word` is not in the vocabulary, the first hit is
    # not guaranteed to be the query and a real neighbour would be discarded.
    # NOTE(review): assumes token_to_idx is a plain dict (supports .get).
    query_idx = token_embedding.token_to_idx.get(word)
    neighbours = [i for i in indices if i != query_idx][:k]
    return token_embedding.to_tokens(neighbours)

In [11]:
# The 10 words most similar to 'baby'.
sim_list = get_knn(glove_6b50d,10, 'baby')
print(sim_list)

['babies', 'boy', 'girl', 'newborn', 'pregnant', 'mom', 'child', 'toddler', 'mother', 'cat']


In [12]:
# Cosine similarity between 'baby' and 'babies', the first neighbour that
# get_knn returned for 'baby'. Only this one pair is evaluated here.
sim_val = cos_sim(glove_6b50d.get_vecs_by_tokens('baby'), glove_6b50d.get_vecs_by_tokens('babies'))
print(sim_val)


[0.83871305]
<NDArray 1 @cpu(0)>


In [13]:
# The 10 nearest neighbours of 'computer'.
print(get_knn(glove_6b50d,10,'computer'))

['computers', 'software', 'technology', 'electronic', 'internet', 'computing', 'devices', 'digital', 'applications', 'pc']


In [14]:
# The 10 nearest neighbours of 'run'.
print(get_knn(glove_6b50d,10,'run'))

['running', 'runs', 'went', 'start', 'ran', 'out', 'third', 'home', 'off', 'got']


In [15]:
# The 10 nearest neighbours of 'love'.
print(get_knn(glove_6b50d,10,'love'))

['dream', 'life', 'dreams', 'loves', 'me', 'my', 'mind', 'loving', 'wonder', 'soul']


In [16]:
# Word analogy search
# Query vector: vec(word3) + vec(word2) - vec(word1)
def get_top_k_by_analogy(token_embedding, k, word1, word2, word3):
    """Return the ``k`` tokens that best complete "word1 : word2 :: word3 : ?".

    The query vector ``vec(word2) - vec(word1) + vec(word3)`` is dotted with
    every L2-normalised vocabulary vector and the ``k`` highest-scoring tokens
    are returned, best first. The three input words are not excluded from the
    candidates.
    """
    vecs = token_embedding.get_vecs_by_tokens([word1, word2, word3])
    query = (vecs[1] - vecs[0] + vecs[2]).reshape((-1, 1))
    normalised_vocab = norm_vecs_by_row(token_embedding.idx_to_vec)
    scores = nd.dot(normalised_vocab, query).reshape((len(token_embedding), ))
    top_idx = nd.topk(scores, k=k, ret_typ='indices')
    # topk yields the indices as an NDArray; convert each to a Python int.
    return token_embedding.to_tokens([int(idx.asscalar()) for idx in top_idx])

In [17]:
# Check an analogy numerically, e.g. the cosine similarity between
# vec(son) + vec(woman) - vec(man) and vec(daughter).
def cos_sim_word_analogy(token_embedding, word1, word2, word3, word4):
    """Cosine similarity between the analogy vector and ``word4``'s vector.

    The analogy vector is ``vec(word2) - vec(word1) + vec(word3)``.
    """
    vecs = token_embedding.get_vecs_by_tokens([word1, word2, word3, word4])
    predicted = vecs[1] - vecs[0] + vecs[2]
    target = vecs[3]
    return cos_sim(predicted, target)

In [18]:
# Analogy query: 'man' is to 'woman' as 'son' is to ? (top-1 candidate).
word_list = get_top_k_by_analogy(glove_6b50d,1,'man','woman','son')

In [19]:
# Show the top analogy candidate.
print(word_list)

['daughter']


In [20]:
# Same query with word2 and word3 swapped. The query vector
# vec(word2) - vec(word1) + vec(word3) is the same sum either way (up to
# floating-point rounding), so the same answer is expected.
word_list = get_top_k_by_analogy(glove_6b50d,1,'man','son','woman')
print(word_list)

['daughter']


In [21]:
# Cosine similarity between vec(woman) - vec(man) + vec(son) and vec(daughter).
sim_val = cos_sim_word_analogy(glove_6b50d, 'man','woman','son','daughter')
print(sim_val)


[0.9658341]
<NDArray 1 @cpu(0)>


In [22]:
# Analogy query: 'beijing' is to 'china' as 'tokyo' is to ? (top-1 candidate).
word_list = get_top_k_by_analogy(glove_6b50d,1,'beijing','china','tokyo')

In [23]:
# Show the top analogy candidate.
print(word_list)

['japan']


In [24]:
# Analogy query: 'bad' is to 'worst' as 'big' is to ? (top-1 candidate).
word_list = get_top_k_by_analogy(glove_6b50d,1,'bad','worst','big')

In [25]:
# Show the top analogy candidate.
print(word_list)

['biggest']


In [26]:
word_list = get_top_k_by_analogy