Word2vec Intro
----------------

# 1. Environment setup

In [1]:
! pip install word2vec

Collecting word2vec
  Downloading word2vec-0.9.1-cp27-cp27m-win_amd64.whl (182kB)
Collecting cython (from word2vec)
  Downloading Cython-0.25.2-cp27-none-win_amd64.whl (2.1MB)
Installing collected packages: cython, word2vec
Successfully installed cython-0.25.2 word2vec-0.9.1


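Download text8.zip from http://mattmahoney.net/dc/text8.zip and extract it so that the uncompressed `text8` file sits in the notebook's working directory (the cells below open `text8` by relative path).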

# 2. Training

In [2]:
import word2vec

## 2.1 把文本变为词组

In [6]:
%%time

# Read the 'text8' corpus and write 'text8-phrases', in which detected
# multi-word phrases are joined with '_' (e.g. 'working_class').
word2vec.word2phrase('text8', 'text8-phrases')

Wall time: 53 s


下面查看结果
文件text8-phrases中，会将词组以aaa_bbb的形式显示出来

In [7]:
%%time

# Read only the first line of the phrase corpus written by word2phrase.
with open('text8-phrases') as phrase_file:
    txt = phrase_file.readline()

# Tokenise on single spaces, then keep the tokens that contain an underscore,
# i.e. the ones that were joined into a phrase. The result keeps the name
# `phase_list` because a later cell displays it under that name.
word_list = txt.split(' ')
phase_list = [token for token in word_list if '_' in token]

Wall time: 3.72 s


In [10]:
phase_list[:10] # first 10 extracted phrases

['working_class',
 'sans_culottes',
 'french_revolution',
 'derived_from',
 'should_be',
 'does_not',
 'anti_authoritarian',
 'based_upon',
 'mutual_aid',
 'self_governance']

## 2.2 把词组变为向量(模型)

In [11]:
%%time

# Train word vectors on the phrase corpus 'text8-phrases' and save the
# resulting model to 'text8.bin'.
word2vec.word2vec('text8-phrases', 'text8.bin')

Wall time: 41 s


text8.bin 是训练得到的 word2vec 模型文件（二进制格式），无法直接以文本方式打开查看

## 2.3 相似词聚类

In [12]:
%%time

# Cluster the words of the raw 'text8' corpus (note: not 'text8-phrases') and
# write the assignments to 'text8-clusters.txt'.
# NOTE(review): the third argument (100) is presumably the number of clusters;
# confirm against the word2vec.word2clusters documentation.
# verbose=True makes the call print its training progress.
word2vec.word2clusters('text8', 'text8-clusters.txt', 100, verbose=True)

Starting training using file text8
Vocab size: 71291
Words in train file: 16718843
Alpha: 0.000129  Progress: 99.55%  Words/thread/sec: 657.83k  Wall time: 45.9 s


# 3. Predict

## 3.1 加载模型

In [13]:
%%time

# Load the trained model from 'text8.bin'; `model` is used by every cell in
# this section.
model = word2vec.load('text8.bin')

Wall time: 2.55 s


## 3.2 查看模型向量

In [14]:
# Inspect the shape of the model's vector matrix
model.vectors.shape

(98331L, 100L)

In [15]:
# Inspect a single vector (row 0 of the vector matrix)
model.vectors[0]

array([-0.16299245, -0.12382638, -0.11257624,  0.03756531,  0.13419886,
        0.11617669, -0.13849744,  0.00653765,  0.03422417,  0.01381833,
        0.13329126,  0.15636005,  0.15067512, -0.0991419 ,  0.0242307 ,
        0.08754268, -0.11797192,  0.07145537,  0.06698723, -0.11861023,
       -0.16042924,  0.04071694,  0.07281177,  0.1260405 , -0.06704707,
        0.15283939, -0.16306224, -0.02266485,  0.08864973, -0.10875637,
        0.0617611 ,  0.1070509 , -0.04441712,  0.15174229, -0.06674786,
       -0.09289846, -0.00235874, -0.09029537,  0.04199355,  0.05103955,
        0.00051364,  0.09795504, -0.10546511,  0.10798842,  0.0072358 ,
       -0.10276229,  0.01353907,  0.02288427,  0.03650811, -0.10957421,
        0.16022977,  0.06020522,  0.0033162 , -0.10370977,  0.08258583,
       -0.14515977,  0.11773255,  0.05179754, -0.12131305, -0.00886148,
       -0.0868545 ,  0.14849092, -0.15766658, -0.10919521,  0.10166519,
       -0.09081399,  0.14849092,  0.00597913, -0.14784265,  0.11

In [16]:
# Look up the vector of a specific word
model['dog']

array([ 0.0404917 , -0.21929267,  0.0900031 , -0.01635313, -0.09535735,
       -0.23004085, -0.05401799, -0.06359713,  0.04510297,  0.15695578,
       -0.03373741, -0.24344453,  0.05506968, -0.06589295,  0.08143154,
        0.01088519,  0.12717766, -0.00065899,  0.0875503 , -0.16578481,
       -0.07191166, -0.10214885, -0.06966231,  0.03901153,  0.03669783,
       -0.01593973,  0.06193575, -0.11606269, -0.01676246, -0.01782042,
       -0.0081625 , -0.01753607,  0.20049143,  0.0814201 , -0.00413563,
        0.0931877 , -0.02165939, -0.10927138, -0.10155135, -0.05190689,
       -0.03913409,  0.06913613, -0.0158022 ,  0.0045671 , -0.25836891,
       -0.06862895, -0.12792477,  0.02216368,  0.01569648, -0.05261899,
       -0.04047079, -0.01103894,  0.27481362,  0.11803567,  0.02176232,
        0.03383744,  0.0047348 , -0.18896218, -0.06432848, -0.10793414,
        0.09994719, -0.04930592, -0.05250505, -0.12186269,  0.05668881,
        0.05793966, -0.06357146, -0.08746185, -0.00340339, -0.08

## 3.3 相似词语检索

以cosine相似度进行检索

In [18]:
# Query the words closest to 'dog' by cosine similarity. The result unpacks
# into vocabulary indexes and the matching similarity scores.
indexes, metrics = model.cosine('dog')

In [19]:
# Vocabulary indexes of the similar words
indexes

array([ 2436,  5473,  2428,  5749,  3774,  5937,  4804, 17003, 17265, 11562], dtype=int64)

In [20]:
# Similarity scores
metrics

array([ 0.80874873,  0.74550301,  0.73173909,  0.72597835,  0.71948888,
        0.7194813 ,  0.71648858,  0.7071348 ,  0.70334834,  0.7022293 ])

**通过下标查看具体的相似词组**

In [21]:
# Map the indexes back to the actual words through the model's vocabulary array
model.vocab[indexes]

array([u'cat', u'cow', u'bear', u'purple', u'grey', u'chicken', u'wolf',
       u'coyote', u'stuffed', u'bark'], 
      dtype='<U78')

## 3.4 相似短语检索

In [22]:
indexes, metrics = model.cosine('los_angeles')
model.generate_response(indexes, metrics).tolist()
# Lists the words/phrases most similar to 'los_angeles' together with their similarity scores

[(u'san_francisco', 0.8543923468728601),
 (u'boston', 0.826421405644348),
 (u'chicago', 0.8149681801332046),
 (u'detroit', 0.7955706694358771),
 (u'cincinnati', 0.7931334967042376),
 (u'california', 0.7921662138682769),
 (u'pittsburgh', 0.7903944287598161),
 (u'cleveland', 0.789379691514625),
 (u'minnesota', 0.7848978720662279),
 (u'san_diego', 0.7802006624681883)]

## 3.5 复杂推理

king - man + woman = queen

直观理解：从“king”（国王）的向量中减去“man”（男性），再加上“woman”（女性），得到的向量应当接近“queen”（女王）。在下面的实际结果中，“queen”出现在前 10 个候选里（第 4 位），而不是第 1 位。

In [23]:
# Analogy query for king - man + woman: 'king' and 'woman' are the positive
# terms, 'man' is the negative term.
# NOTE(review): n=10 appears to set how many candidates are returned; confirm
# against the model.analogy documentation.
indexes, metrics = model.analogy(pos=['king', 'woman'], neg=['man'], n=10)
# Pair each returned index with its word and score, as a plain Python list
model.generate_response(indexes, metrics).tolist()

[(u'emperor', 0.306096616944302),
 (u'wife', 0.295442287841388),
 (u'son', 0.29537343185425113),
 (u'queen', 0.2948718756495251),
 (u'empress', 0.2942050337104657),
 (u'roman_emperor', 0.2931893452062588),
 (u'daughter', 0.29266298706150434),
 (u'aragon', 0.28758922861558867),
 (u'pope_urban', 0.2872960695655884),
 (u'aquitaine', 0.28660725616627103)]

# 4. Cluster

In [24]:
# Load the cluster assignments from 'text8-clusters.txt' (the output file of
# the word2clusters call in the training section).
clusters = word2vec.load_clusters('text8-clusters.txt')

In [25]:
clusters.get_words_on_cluster(90).shape # how many words are in cluster 90

(543L,)

In [26]:
clusters.get_words_on_cluster(90) # the words that belong to cluster 90

array(['had', 'said', 'wrote', 'claims', 'claimed', 'stated', 'argued',
       'says', 'asked', 'told', 'felt', 'saying', 'got', 'calls', 'heard',
       'learned', 'bears', 'didn', 'knew', 'happened', 'claiming', 'drew',
       'doesn', 'tells', 'gets', 'finds', 'seeing', 'concluded', 'looks',
       'argues', 'sees', 'writes', 'spoke', 'saved', 'believes',
       'realized', 'demanded', 'stating', 'knowing', 'meets', 'wasn',
       'mentions', 'suggesting', 'loved', 'knows', 'asking', 'speaks',
       'acknowledged', 'informed', 'feared', 'isn', 'forgotten',
       'believing', 'possessed', 'requested', 'bore', 'wants', 'declares',
       'asks', 'emphasized', 'wore', 'asserted', 'insisted', 'questioned',
       'reads', 'speculated', 'reveals', 'announces', 'replied', 'accepts',
       'noticed', 'commented', 'answered', 'couldn', 'embraced', 'decides',
       'unto', 'asserts', 'rejects', 'suspect', 'teaches', 'wouldn',
       'concludes', 'liked', 'proud', 'feels', 'pays', 'remark