# word2vec训练词向量

In [50]:
import os
import re
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

import nltk.data
from gensim.models import word2vec

In [21]:
def load_data(name,nrows = None):
    datasets = {
        'unlabeled_train': 'unlabeledTrainData.tsv',
        'labeled_train': 'labeledTrainData.tsv',
        'test': 'testData.tsv'
    }
    if name not in datasets:
        raise ValueError(name)
    data_file = os.path.join('.', 'data', datasets[name])
    df = pd.read_csv(data_file, sep='\t', escapechar='\\', nrows=nrows)
    print('Number of reviews: {}'.format(len(df)))
    return df

利用无标签的数据和有标签的数据合在一起来建立word2vec模型

## 读入无标签数据

In [22]:
df  = load_data('unlabeled_train')
df.head()

Number of reviews: 50000


Unnamed: 0,id,review
0,9999_0,"Watching Time Chasers, it obvious that it was ..."
1,45057_0,I saw this film about 20 years ago and remembe...
2,15561_0,"Minor Spoilers<br /><br />In New York, Joan Ba..."
3,7161_0,I went to see this film with a great deal of e...
4,43971_0,"Yes, I agree with everyone on this site this m..."


## 清洗数据

In [23]:
eng_stopwords = {}.fromkeys([line.rstrip() for line in open('./stopwords.txt')])

In [24]:
# 清理文本数据的方法
def clean_text(text,remove_stopwords=False):
    text = BeautifulSoup(text,'html.parser').get_text()
    text = re.sub(r'[^a-zA-Z]',' ',text)
    words = text.lower().split()
    if remove_stopwords:
        words = [w for w in words if w not in eng_stopwords]
    return words

In [None]:
# 打印函数
def print_call_counts(f):
    n = 0
    def wrapped(*args,**kwargs):
        
        n +=1
        if n%1000 == 1:
            print 'method {} called {} times'.format(f.__name__, n)
        return f(*args, **kwargs)
    return wrapped

In [36]:
# 切割评论，看看有多少句子
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
@print_call_counts
def split_sentence(review):
    review = BeautifulSoup(review,'html.parser').get_text()# 先把文本中的网页标签给去掉
    raw_sentence = tokenizer.tokenize(review.strip())      # 切分句子
    sentences = [clean_text(s) for s in raw_sentence if s] # 清洗文本
    return sentences

In [56]:
%time
sentences = sum(df.review.apply(split_sentence),[])
print '{} reviews -> {}sentences'.format(len(df),len(sentences))

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.91 µs
50000 reviews -> 539886sentences


## 用gensim训练词嵌入模型

In [38]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',level = logging.INFO)

In [42]:
# 设定词向量的参数
num_features = 300  # 词向量的维度，也就是神经网络的
min_word_count = 10 # 最小词频,默认是5，也就是至少出现5词以上
num_workers = 4     # 执行的线程数
context = 10        # 文本窗的大小
downsampling = 1e-3 # 对频繁出现的词下采样的设置

model_name = '{}features_{}minwords_{}context.model'.format(num_features, min_word_count, context)

In [57]:
print('Traing model...')
model = word2vec.Word2Vec(sentences,workers=num_workers,
                 size=num_features,min_count=min_word_count,
                 window=context,sample = downsampling
                )
# 初始化权重（映射矩阵）
model.init_sims(replace=True)

# 存储模型，方便以后使用
model.save(os.path.join('.','models',model_name))

2018-03-28 21:53:10,212 : INFO : collecting all words and their counts
2018-03-28 21:53:10,215 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-03-28 21:53:10,350 : INFO : PROGRESS: at sentence #10000, processed 224745 words, keeping 17228 word types


Traing model...


2018-03-28 21:53:10,494 : INFO : PROGRESS: at sentence #20000, processed 441641 words, keeping 24512 word types
2018-03-28 21:53:10,618 : INFO : PROGRESS: at sentence #30000, processed 663543 words, keeping 29701 word types
2018-03-28 21:53:10,752 : INFO : PROGRESS: at sentence #40000, processed 883553 words, keeping 33883 word types
2018-03-28 21:53:10,867 : INFO : PROGRESS: at sentence #50000, processed 1100419 words, keeping 37458 word types
2018-03-28 21:53:10,994 : INFO : PROGRESS: at sentence #60000, processed 1322904 words, keeping 40681 word types
2018-03-28 21:53:11,114 : INFO : PROGRESS: at sentence #70000, processed 1545641 words, keeping 43559 word types
2018-03-28 21:53:11,245 : INFO : PROGRESS: at sentence #80000, processed 1765999 words, keeping 46082 word types
2018-03-28 21:53:11,378 : INFO : PROGRESS: at sentence #90000, processed 1980415 words, keeping 48272 word types
2018-03-28 21:53:11,515 : INFO : PROGRESS: at sentence #100000, processed 2203134 words, keeping 50

2018-03-28 21:53:28,323 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-03-28 21:53:28,327 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-03-28 21:53:28,328 : INFO : EPOCH - 1 : training on 11876777 raw words (8395083 effective words) took 10.3s, 816439 effective words/s
2018-03-28 21:53:29,345 : INFO : EPOCH 2 - PROGRESS: at 10.23% examples, 850388 words/s, in_qsize 7, out_qsize 0
2018-03-28 21:53:30,348 : INFO : EPOCH 2 - PROGRESS: at 20.97% examples, 875399 words/s, in_qsize 7, out_qsize 0
2018-03-28 21:53:31,357 : INFO : EPOCH 2 - PROGRESS: at 31.64% examples, 879125 words/s, in_qsize 6, out_qsize 0
2018-03-28 21:53:32,363 : INFO : EPOCH 2 - PROGRESS: at 42.07% examples, 878408 words/s, in_qsize 7, out_qsize 0
2018-03-28 21:53:33,372 : INFO : EPOCH 2 - PROGRESS: at 53.07% examples, 884919 words/s, in_qsize 7, out_qsize 0
2018-03-28 21:53:34,384 : INFO : EPOCH 2 - PROGRESS: at 63.65% examples, 883649 words/s, in_qsize 8, out_qsize 1
2

IOError: [Errno 2] No such file or directory: './models/300features_40minwords_10context.model'

## 看看词向量训练的结果

In [58]:
print(model.doesnt_match("man woman child kitchen".split()))
print(model.doesnt_match('france england germany berlin'.split()))

kitchen
berlin


  """Entry point for launching an IPython kernel.
  


In [59]:
model.most_similar('cat')

  """Entry point for launching an IPython kernel.


[(u'mouse', 0.6986721158027649),
 (u'dog', 0.673465371131897),
 (u'sheep', 0.6436570286750793),
 (u'demon', 0.6173554062843323),
 (u'rabbit', 0.6109448671340942),
 (u'monkey', 0.6008049249649048),
 (u'lizard', 0.5990520715713501),
 (u'tiger', 0.5941593647003174),
 (u'bird', 0.5865774750709534),
 (u'doll', 0.5808638334274292)]

In [60]:
model.most_similar('police')

  """Entry point for launching an IPython kernel.


[(u'fbi', 0.6976989507675171),
 (u'authorities', 0.6819326281547546),
 (u'cops', 0.6687051057815552),
 (u'mob', 0.6635503172874451),
 (u'federal', 0.660423994064331),
 (u'cia', 0.6583893895149231),
 (u'terrorist', 0.6488812565803528),
 (u'investigating', 0.6443933248519897),
 (u'homicide', 0.6320115923881531),
 (u'agents', 0.6315895318984985)]