# Load Pretrained Model

```python
import gensim, logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import pymongo
import hashlib

# NOTE: despite its name, ``db`` is the ``text_articles_words`` collection of
# the ``weixin`` database, not a database handle; ``db.find(...)`` below
# therefore iterates over article documents.
db = pymongo.MongoClient('172.16.0.101')['weixin']['text_articles_words']
def md5(s):
    """Return the hexadecimal MD5 digest of the byte string ``s``."""
    hasher = hashlib.md5()
    hasher.update(s)
    return hasher.hexdigest()
class sentences:
    """Iterable over the word lists of the articles in ``db``, without duplicates.

    ``__iter__`` is a generator function that issues a new ``db.find`` query
    every time it is called, so one instance can be iterated more than once.
    """

    def __iter__(self):
        """Yield the ``words`` field of each article whose ``text`` is new.

        Duplicate articles are detected through the MD5 digest of their
        UTF-8 encoded ``text`` field, so only digests (not the texts) are
        kept in memory. When the collection is exhausted, the number of
        distinct articles is printed.
        """
        texts_set = set()
        # The cursor is opened with no_cursor_timeout=True, so it has to be
        # closed explicitly; otherwise it may stay open on the server if
        # iteration stops early or raises.
        cursor = db.find(no_cursor_timeout=True)
        try:
            for a in cursor:
                # Hash once per document instead of once for the membership
                # test and once more for the insertion.
                digest = md5(a['text'].encode('utf-8'))
                if digest in texts_set:
                    continue
                texts_set.add(digest)
                yield a['words']
        finally:
            cursor.close()
        # Parenthesised single-argument form is valid on Python 2 and 3.
        print(u'最终计算了%s篇文章' % len(texts_set))

# Train the embedding on the de-duplicated article stream and store it on
# disk under the name ``word2vec_wx``.
word2vec = gensim.models.word2vec.Word2Vec(
    sentences(),
    size=256,      # dimensionality of the word vectors
    window=10,     # maximum distance between current and predicted word
    min_count=64,  # drop words with a total frequency below this
    sg=1,          # skip-gram rather than CBOW
    hs=1,          # hierarchical softmax
    iter=10,       # number of passes (epochs) over the corpus
    workers=25,    # number of training threads
)
word2vec.save('word2vec_wx')
"""
sg: 0 (default) -> CBOW; 1 -> skip-gram
hs: 0 (default) and negative > 0 -> negative sampling will be used; 1 -> hierarchical softmax will be used
size: dimensionality of the feature vectors
window: the maximum distance between the current and predicted word within a sentence
alpha: the initial learning rate (will linearly drop to zero as training progresses)
min_count: ignore all words with total frequency lower than this
workers: number of worker threads used to train the model (faster training on multicore machines)
negative: how many "noise words" should be drawn for negative sampling (usually between 5 and 20; default is 5)
cbow_mean: 1 (default) -> use the mean of the context word vectors; 0 -> use their sum
iter: number of iterations (epochs) over the corpus
"""
```

In [1]:
import gensim
# Load the trained model from disk. The file name matches the ``word2vec_wx``
# written by ``word2vec.save(...)`` in the training script above.
# NOTE: this is an absolute path on the author's machine; adjust it before
# running the notebook elsewhere.
model = gensim.models.Word2Vec.load('/Users/lizhn7/Downloads/DATA/word2vec/word2vec_wx')

Using TensorFlow backend.


In [2]:
model.most_similar('html')

[('content', 0.7687379121780396),
 ('edit', 0.7674742341041565),
 ('model', 0.7153580188751221),
 ('bind', 0.7074968218803406),
 ('eol2br', 0.6984536647796631),
 ('unsafe', 0.6540526151657104),
 ('true', 0.6246062517166138),
 ('scope', 0.6174685955047607),
 ('class', 0.6025004982948303),
 ('}', 0.6011278629302979)]

In [3]:
model.most_similar('男人')

[('女人', 0.8158404231071472),
 ('老公', 0.7181277871131897),
 ('花心', 0.702643871307373),
 ('老婆', 0.661186933517456),
 ('出轨', 0.6492980718612671),
 ('征服', 0.6488258838653564),
 ('小三', 0.6387587785720825),
 ('一辈子', 0.6364662647247314),
 ('外遇', 0.6328293085098267),
 ('婚外', 0.6065683364868164)]

In [8]:
model.most_similar('淘')

[('淘宝', 0.7709355354309082),
 ('口令', 0.6993972659111023),
 ('👈', 0.6889228820800781),
 ('￥', 0.6334813833236694),
 ('天猫店', 0.6188440322875977),
 ('官方店', 0.6154448986053467),
 ('手机端', 0.6023507714271545),
 ('淘宝搜', 0.6000440120697021),
 ('整段', 0.5986412763595581),
 ('优惠券', 0.5939205884933472)]

In [21]:
model.most_similar(positive=['女人', '国王'], negative=['男人'])  

[('王后', 0.6879483461380005),
 ('二世', 0.6324218511581421),
 ('王宫', 0.6288765072822571),
 ('王室', 0.6009732484817505),
 ('六世', 0.586891770362854),
 ('七世', 0.579991340637207),
 ('王储', 0.5750232934951782),
 ('大臣', 0.5712825059890747),
 ('四世', 0.570629894733429),
 ('侍从', 0.5673775672912598)]

In [80]:
model.doesnt_match("早餐 女人 晚餐 午餐".split())

'女人'

In [10]:
model.similarity('男人', '女人')

0.81584052011856723

# Compute Similarity

In [4]:
import numpy as np

def similarity(w1, w2):
    """
    Return the cosine similarity of the word vectors ``w1`` and ``w2``.

    The value is the dot product of the two vectors divided by the product
    of their Euclidean norms. Zero vectors are not special-cased, so the
    denominator is 0 when either input has zero length.
    """
    inner_product = np.dot(w1, w2)
    squared_norm_product = np.sum(w1 ** 2) * np.sum(w2 ** 2)
    return inner_product / np.sqrt(squared_norm_product)

## Display Form

In [74]:
# One reference vector per display form, in label order. Forms that have no
# single vocabulary word are represented by the sum of two word vectors.
display_form = [
    model['APP'],
    model['WEB'],
    model['桌面'] + model['客户端'],
    model['html5'] + model['应用'],
    model['其他'],
]
df1, df2, df3, df4, df5 = display_form
# Position in ``display_form`` -> label shown to the user.
df = dict(enumerate(['APP', 'WEB', '桌面客户端', 'H5应用', '其他']))

In [75]:
def prt(word):
    """Print the display-form label whose reference vector is closest to ``word``.

    The vector of ``word`` is compared with every entry of ``display_form``
    using ``similarity``; the label of the highest-scoring entry is looked
    up in ``df`` and printed. A ``KeyError`` raised along the way (for
    example by the ``model[word]`` lookup) is printed instead of propagated.
    """
    try:
        word_vec = model[word]
        scores = [similarity(word_vec, reference) for reference in display_form]
        best = np.argmax(scores)
        print("%s 匹配选项：%s" % (word, df[best]))
    except KeyError as err:
        print(err)

In [76]:
prt(input("Please fill in the blank: "))

Please fill in the blank: QQ
QQ 匹配选项：APP


In [107]:
prt(input("Please fill in the blank: "))

Please fill in the blank: 微信
微信 匹配选项：APP


In [108]:
prt(input("Please fill in the blank: "))

Please fill in the blank: 红警
红警 匹配选项：桌面客户端


In [111]:
prt(input("Please fill in the blank: "))

Please fill in the blank: 360浏览器
360浏览器 匹配选项：桌面客户端


# Compute Negative Mutual Information

In [42]:
def predict_proba(oword, iword):
    """Return a log-score for the output word ``oword`` given input word ``iword``.

    With ``d`` the dot products between the vector of ``iword`` and the rows
    of ``model.syn1`` selected by the vocabulary entry's ``point`` indices,
    and ``c`` that entry's ``code``, the result is
    ``-sum(log(1 + exp(-d)) + c * d)``.

    NOTE(review): ``point``/``code`` are presumably the inner-node path and
    branch bits of the word in the hierarchical-softmax tree (the model was
    trained with ``hs=1``), which would make this the log-probability of
    ``oword`` given ``iword`` -- confirm against the gensim version in use.
    Despite the function name, the value is in log space, not a probability.
    """
    iword_vec = model[iword]
    entry = model.wv.vocab[oword]
    node_weights = model.syn1[entry.point].T
    dot = np.dot(iword_vec, node_weights)
    per_node = np.logaddexp(0, -dot) + entry.code * dot
    return -sum(per_node)

## Display Form

In [62]:
# Vocabulary word that stands for each display form, in label order. Here the
# entries are plain words rather than vectors.
display_form = ['APP', 'WEB', '客户端', 'html5', '其他']
df1, df2, df3, df4, df5 = display_form
# Position in ``display_form`` -> label shown to the user.
df = dict(enumerate(['APP', 'WEB', '桌面客户端', 'H5应用', '其他']))

In [41]:
predict_proba('福中路', '广州')-0.9*np.log(model.wv.vocab['福中路'].count)

-16.834297937762848

In [None]:
   # Score every vocabulary word against ``word`` (which must already be
   # defined): predict_proba score minus 0.9 * log of the word's count.
   # ``items()`` is used because ``dict.iteritems()`` does not exist on Python 3.
   r = {
       candidate: predict_proba(candidate, word) - 0.9 * np.log(entry.count)
       for candidate, entry in model.wv.vocab.items()
   }

In [60]:
def prt(word, count_weight=1.0):
    """Print the display-form label with the highest score for ``word``.

    Each candidate in ``display_form`` is scored as
    ``predict_proba(candidate, word) - count_weight * log(count(candidate))``
    and the label of the best candidate is looked up in ``df`` and printed.

    Args:
        word: the input word to classify.
        count_weight: weight of the log-count penalty. The default of 1.0
            keeps the original behaviour of this function; the other scoring
            cells in this notebook use 0.9, which can be passed here to get
            results comparable with them.

    A ``KeyError`` (raised when ``word`` or a candidate is missing from the
    model vocabulary) is printed instead of propagated.
    """
    try:
        # ``label`` deliberately does not reuse the name ``df``, which is the
        # global index -> label mapping used on the last line.
        scores = [
            predict_proba(label, word)
            - count_weight * np.log(model.wv.vocab[label].count)
            for label in display_form
        ]
        print("%s 匹配选项：%s" % (word, df[np.argmax(scores)]))
    except KeyError as e:
        print(e)

In [None]:
prt(input("Please fill in the blank: "))

In [72]:
word = '红警'

In [73]:
[predict_proba(df, word)-0.9*np.log(model.wv.vocab[df].count) for df in display_form]

[-19.16439861992837,
 -20.398261835356436,
 -17.496802500179761,
 -21.464301062634551,
 -21.211411296782984]