# word2vec训练词向量

In [1]:
import os
import re
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

import nltk.data
from gensim.models import word2vec

In [2]:
def load_data(name, nrows=None):
    """Read one of the review TSV files from ./data into a DataFrame.

    Args:
        name: Dataset key: 'unlabeled_train', 'labeled_train' or 'test'.
        nrows: Optional maximum number of rows to read; None reads the whole file.

    Returns:
        pandas.DataFrame with the parsed contents of the selected file.

    Raises:
        ValueError: If `name` is not one of the known dataset keys.
    """
    file_by_name = {
        'test': 'testData.tsv',
        'labeled_train': 'labeledTrainData.tsv',
        'unlabeled_train': 'unlabeledTrainData.tsv',
    }
    file_name = file_by_name.get(name)
    if file_name is None:
        raise ValueError(name)

    path = os.path.join('.', 'data', file_name)
    # Tab-separated input; a backslash escapes special characters inside fields.
    reviews = pd.read_csv(path, sep='\t', escapechar='\\', nrows=nrows)
    print('Number of reviews: {}'.format(len(reviews)))
    return reviews

利用无标签的数据和有标签的数据合在一起来建立word2vec模型（注意：下面的代码目前只读入了无标签数据 `unlabeled_train`）

## 读入无标签数据

In [3]:
# Load the unlabeled reviews and preview the first rows.
df  = load_data('unlabeled_train')
df.head()

Number of reviews: 50000


Unnamed: 0,id,review
0,9999_0,"Watching Time Chasers, it obvious that it was ..."
1,45057_0,I saw this film about 20 years ago and remembe...
2,15561_0,"Minor Spoilers<br /><br />In New York, Joan Ba..."
3,7161_0,I went to see this film with a great deal of e...
4,43971_0,"Yes, I agree with everyone on this site this m..."


## 清洗数据

In [4]:
# English stop words, one per line in ./stopwords.txt. They are stored as dict keys
# (all values are None) so membership tests are constant-time. The `with` block
# guarantees the file handle is closed once the list has been read.
with open('./stopwords.txt') as stopwords_file:
    eng_stopwords = dict.fromkeys(line.rstrip() for line in stopwords_file)

In [5]:
def clean_text(text, remove_stopwords=False):
    """Turn a piece of raw review text into a list of lower-case words.

    HTML markup is stripped, every character that is not an ASCII letter is
    replaced by a space, and the result is lower-cased and split on whitespace.

    Args:
        text: Raw text, possibly containing HTML tags.
        remove_stopwords: When True, drop every word found in `eng_stopwords`.

    Returns:
        List of word strings (possibly empty).
    """
    plain = BeautifulSoup(text, 'html.parser').get_text()
    letters_only = re.sub(r'[^a-zA-Z]', ' ', plain)
    tokens = letters_only.lower().split()
    if not remove_stopwords:
        return tokens
    return [token for token in tokens if token not in eng_stopwords]

In [6]:
def print_call_counts(f):
    """Decorator that reports how often the wrapped function has been called.

    A progress line is printed on the 1st, 1001st, 2001st, ... call; every call
    is forwarded unchanged to `f` and its return value is passed through.

    Args:
        f: The callable to wrap.

    Returns:
        A wrapper with the same call signature as `f`.
    """
    # The counter lives in a one-element list so the closure can update it.
    # A bare integer would be rebound as a local of `wrapped` (UnboundLocalError),
    # and `nonlocal` is not available on Python 2.
    calls = [0]

    def wrapped(*args, **kwargs):
        calls[0] += 1
        if calls[0] % 1000 == 1:
            print('method {} called {} times'.format(f.__name__, calls[0]))
        return f(*args, **kwargs)
    return wrapped

In [7]:
# Sentence splitter used to cut each review into sentences, so we can see how many
# sentences there are (loads NLTK's English Punkt tokenizer resource).
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# @print_call_counts
def split_sentence(review):
    """Split one raw review into tokenized sentences.

    Args:
        review: Raw review text, possibly containing HTML tags.

    Returns:
        List of sentences, each one the word list produced by `clean_text`.
    """
    # Strip the HTML markup first so tags do not confuse the sentence splitter.
    plain_text = BeautifulSoup(review, 'html.parser').get_text().strip()
    tokenized = []
    for raw in tokenizer.tokenize(plain_text):
        if raw:  # skip empty fragments
            tokenized.append(clean_text(raw))
    return tokenized

In [8]:
%time
sentences = sum(df.review.apply(split_sentence),[])
print '{} reviews -> {}sentences'.format(len(df),len(sentences))

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 6.2 µs


  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


50000 reviews -> 539886sentences


解释一下上面的sum(df.review.apply(split_sentence),[])：它是把一个二维list（这里是 df.review.apply 返回的 Series，其中每个元素是一条评论的句子列表）转化为一维。注意 sum(..., []) 每次相加都会复制已累积的列表，数据量大时会比较慢。<br>
所以这里的sentences就是一个一维的list（每个元素是一个句子，即该句的单词列表），可以直接放入word2vec里进行训练

In [9]:
# Example: sum(list_of_lists, []) concatenates the inner lists into one flat list.
a = [['a','e'],['b','f'],['c','g','h']]
b = sum(a,[])
b

['a', 'e', 'b', 'f', 'c', 'g', 'h']

## 用gensim训练词嵌入模型

In [10]:
# Emit INFO-level log records as "timestamp : level : message" so that gensim's
# training progress is visible in the cell output.
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',level = logging.INFO)

In [11]:
# Word2Vec hyper-parameters
num_features = 300   # dimensionality of the word vectors (size of the hidden layer)
min_word_count = 10  # ignore words that occur fewer than 10 times (gensim's default is 5)
num_workers = 4      # number of worker threads used for training
context = 10         # context window size, in words
downsampling = 1e-3  # threshold for down-sampling very frequent words

# Encode the settings in the file name so the saved model is easy to identify later
model_name = '{}features_{}minwords_{}context.model'.format(num_features, min_word_count, context)
print(model_name)

300features_10minwords_10context.model


In [12]:
print('Traing model...')
# `sentences` must be an iterable of tokenized sentences (each one a list of words).
# NOTE(review): no `seed` is passed and several worker threads are used, so the
# learned vectors are expected to differ slightly between runs - confirm if exact
# reproducibility matters.
model = word2vec.Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)
# L2-normalise the word vectors in place. replace=True discards the raw vectors to
# save memory, so the model cannot be trained any further afterwards.
model.init_sims(replace=True)

# Persist the model for later use. Create the output directory first so save()
# cannot fail on a missing folder after the long training run.
model_dir = os.path.join('.', 'model')
if not os.path.isdir(model_dir):
    os.makedirs(model_dir)
model.save(os.path.join(model_dir, model_name))

2018-03-29 12:42:32,105 : INFO : collecting all words and their counts
2018-03-29 12:42:32,107 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-03-29 12:42:32,219 : INFO : PROGRESS: at sentence #10000, processed 224745 words, keeping 17228 word types


Traing model...


2018-03-29 12:42:32,354 : INFO : PROGRESS: at sentence #20000, processed 441641 words, keeping 24512 word types
2018-03-29 12:42:32,468 : INFO : PROGRESS: at sentence #30000, processed 663543 words, keeping 29701 word types
2018-03-29 12:42:32,596 : INFO : PROGRESS: at sentence #40000, processed 883553 words, keeping 33883 word types
2018-03-29 12:42:32,705 : INFO : PROGRESS: at sentence #50000, processed 1100419 words, keeping 37458 word types
2018-03-29 12:42:32,824 : INFO : PROGRESS: at sentence #60000, processed 1322904 words, keeping 40681 word types
2018-03-29 12:42:32,946 : INFO : PROGRESS: at sentence #70000, processed 1545641 words, keeping 43559 word types
2018-03-29 12:42:33,061 : INFO : PROGRESS: at sentence #80000, processed 1765999 words, keeping 46082 word types
2018-03-29 12:42:33,190 : INFO : PROGRESS: at sentence #90000, processed 1980415 words, keeping 48272 word types
2018-03-29 12:42:33,300 : INFO : PROGRESS: at sentence #100000, processed 2203134 words, keeping 50

2018-03-29 12:42:51,642 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-03-29 12:42:51,657 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-03-29 12:42:51,659 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-03-29 12:42:51,667 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-03-29 12:42:51,668 : INFO : EPOCH - 1 : training on 11876777 raw words (8730625 effective words) took 12.5s, 698007 effective words/s
2018-03-29 12:42:52,675 : INFO : EPOCH 2 - PROGRESS: at 7.45% examples, 651255 words/s, in_qsize 8, out_qsize 0
2018-03-29 12:42:53,680 : INFO : EPOCH 2 - PROGRESS: at 15.06% examples, 657825 words/s, in_qsize 8, out_qsize 0
2018-03-29 12:42:54,683 : INFO : EPOCH 2 - PROGRESS: at 23.29% examples, 677544 words/s, in_qsize 8, out_qsize 0
2018-03-29 12:42:55,699 : INFO : EPOCH 2 - PROGRESS: at 31.64% examples, 686783 words/s, in_qsize 8, out_qsize 0
2018-03-29 12:42:56,710 : INFO : EPOCH 2 - PRO

## 看看词向量训练的结果

In [13]:
# Odd-one-out queries. They go through model.wv (the trained word vectors) because
# calling these helpers directly on the Word2Vec model is deprecated in gensim and
# emits a DeprecationWarning.
print(model.wv.doesnt_match("man woman child kitchen".split()))
print(model.wv.doesnt_match('france england germany berlin'.split()))

kitchen
berlin


  """Entry point for launching an IPython kernel.
  


In [14]:
# Nearest neighbours of "cat"; queried via model.wv, since the method on the model
# itself is deprecated in gensim.
model.wv.most_similar('cat')

  """Entry point for launching an IPython kernel.


[(u'mouse', 0.736385703086853),
 (u'sheep', 0.6904305219650269),
 (u'dog', 0.6760936975479126),
 (u'rabbit', 0.6244864463806152),
 (u'doll', 0.5945224761962891),
 (u'monkey', 0.5916447043418884),
 (u'demon', 0.5911267995834351),
 (u'bird', 0.5907045602798462),
 (u'rat', 0.5844168066978455),
 (u'clown', 0.5842093229293823)]

In [15]:
# Nearest neighbours of "police"; queried via model.wv, since the method on the
# model itself is deprecated in gensim.
model.wv.most_similar('police')

  """Entry point for launching an IPython kernel.


[(u'fbi', 0.6906628012657166),
 (u'authorities', 0.6834771037101746),
 (u'investigating', 0.6793107986450195),
 (u'federal', 0.6773560047149658),
 (u'officer', 0.6663809418678284),
 (u'cops', 0.6512453556060791),
 (u'homicide', 0.6491827368736267),
 (u'officers', 0.6399737000465393),
 (u'cia', 0.6375499367713928),
 (u'agents', 0.6364037990570068)]

In [16]:
# Cosine similarity between the vectors of "women" and "men"; queried via model.wv,
# since the method on the model itself is deprecated in gensim.
model.wv.similarity('women', 'men')

  """Entry point for launching an IPython kernel.


0.7508594107389206