In [5]:
# Word2Vec不需要标签即可创建有意义的表示形式。
# 运用于情感分析

import pandas as pd
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

# 评论到单词列表清洗函数
def review_to_wordlist( review, remove_stopwords=False ):
    """Convert a raw review (or a single sentence) into a list of words.

    Args:
        review: Raw text, possibly containing HTML markup.
        remove_stopwords: When True, words found in NLTK's English stop-word
            list are dropped from the result. Defaults to False.

    Returns:
        A list of lower-cased tokens consisting only of ASCII letters and
        digits, in their original order.
    """
    # 1. Strip HTML markup. The parser is named explicitly so the result does
    #    not depend on which optional parsers happen to be installed, and so
    #    BeautifulSoup does not emit its "no parser was explicitly specified"
    #    warning for every review.
    review_text = BeautifulSoup(review, "html.parser").get_text()

    # 2. Replace punctuation (anything that is not an ASCII letter or digit)
    #    with a space; digits are deliberately kept.
    review_text = re.sub("[^a-zA-Z0-9]", " ", review_text)

    # 3. Lower-case the text and split it on whitespace.
    words = review_text.lower().split()

    # 4. Optionally drop stop words (off by default). A set gives O(1)
    #    membership checks.
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]

    # 5. Return the list of words.
    return words


In [6]:
# Word2Vec期望输入是单个句子，每个句子作为单词列表。换句话说，输入格式是列表的列表。
# 将使用NLTK的punkt标记器进行句子拆分

import nltk.data
# Load NLTK's English punkt sentence tokenizer from the NLTK data resources.
punkt_resource = 'tokenizers/punkt/english.pickle'
tokenizer = nltk.data.load(punkt_resource)
# 将完整的评论拆分成句子
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    """Split a review into sentences, each represented as a list of words.

    Args:
        review: Raw review text.
        tokenizer: Sentence tokenizer exposing ``tokenize(text)`` that returns
            an iterable of sentence strings (e.g. NLTK's punkt tokenizer).
        remove_stopwords: Forwarded to ``review_to_wordlist``; when True,
            English stop words are removed from every sentence.
            Defaults to False.

    Returns:
        A list of sentences, where each sentence is a list of words
        (i.e. a list of lists).
    """
    # 1. Use the tokenizer to split the stripped review into raw sentences.
    raw_sentences = tokenizer.tokenize(review.strip())

    # 2. Tokenize every non-empty sentence into words. The caller's
    #    remove_stopwords choice is passed through instead of being
    #    overridden with a hard-coded False.
    return [
        review_to_wordlist(raw_sentence, remove_stopwords=remove_stopwords)
        for raw_sentence in raw_sentences
        if len(raw_sentence) > 0
    ]

In [7]:
# Load the three Kaggle TSV files. They share one layout: a header row,
# tab-separated fields, and quoting=3 (csv.QUOTE_NONE) so that double quotes
# inside the review text are treated as ordinary characters.
tsv_options = dict(header=0, delimiter="\t", quoting=3)
train = pd.read_csv("F:\\NLP\\kaggle_data\\labeledTrainData.tsv", **tsv_options)
test = pd.read_csv("F:\\NLP\\kaggle_data\\testData.tsv", **tsv_options)
unlabeled_train = pd.read_csv("F:\\NLP\\kaggle_data\\unlabeledTrainData.tsv", **tsv_options)

# Report how many reviews each file contributed.
review_counts = (
    train["review"].size,
    test["review"].size,
    unlabeled_train["review"].size,
)
print("Read %d labeled train reviews, %d labeled test reviews, "
 "and %d unlabeled reviews\n" % review_counts)

Read 25000 labeled train reviews, 25000 labeled test reviews, and 50000 unlabeled reviews



In [8]:
# Smoke test: split the first labeled review into tokenized sentences.
first_review = train["review"][0]
temp = review_to_sentences(first_review, tokenizer)
# Uncomment to inspect the raw text and the parsed result:
# print(first_review)
# print(temp)

In [9]:
# Start from an empty list so that re-running this cell is idempotent.
sentences = []
# Load the punkt sentence tokenizer.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# review_to_sentences returns a list of sentences (a list of lists).
# extend() adds each sentence as its own element of `sentences`; append()
# would instead add the whole returned list as one nested element.
print("Parsing sentences from training set")
for review in train["review"]:
    sentences.extend(review_to_sentences(review, tokenizer))

print("Parsing sentences from unlabeled set")
for review in unlabeled_train["review"]:
    sentences.extend(review_to_sentences(review, tokenizer))

# Show the total number of sentences plus the first and last one as samples.
print(len(sentences))
print(sentences[0])
print(sentences[-1])

Parsing sentences from training set




Parsing sentences from unlabeled set




795538
['with', 'all', 'this', 'stuff', 'going', 'down', 'at', 'the', 'moment', 'with', 'mj', 'i', 've', 'started', 'listening', 'to', 'his', 'music', 'watching', 'the', 'odd', 'documentary', 'here', 'and', 'there', 'watched', 'the', 'wiz', 'and', 'watched', 'moonwalker', 'again']
['pathmark', 'means', 'savings']


word2vec模型

1.体系结构：可选的体系结构为 skip-gram（默认）或连续词袋模型（CBOW）。我们发现，skip-gram的速度稍慢一些，但产生了更好的结果。

2.训练算法：分层softmax（默认）或负采样。对于我们来说，默认设置效果很好。
常用词的下采样：Google文档建议使用.00001和.001之间的值。对于我们来说，更接近0.001的值似乎可以提高最终模型的准确性。

3.词向量维数：更多的特征（维数）会导致更长的运行时间，并且通常（但并非总是）会得到更好的模型。合理的值可以在几十到几百之间。我们用了300。
上下文/窗口大小：训练算法应考虑多少个上下文词？10对于分层softmax似乎很好用（越多越好，直到一定程度）。

4.工作线程：要运行的并行线程数。这取决于具体的计算机，但在大多数系统上，4到6之间的值应该都适用。

5.最小单词数：这有助于将词汇量限制为有意义的单词。在所有文档中出现次数少于该值的任何单词都将被忽略。合理的值应该在10到100之间。在这种情况下，由于每部电影出现30次，因此我们将最小字数设置为40，以避免过于重视单个电影标题。这样一来，整个词汇量约为15,000个单词。较高的值也有助于限制运行时间。

In [10]:
# Configure the built-in logging module so that Word2Vec's progress messages
# are printed with a timestamp and a level.
import logging
log_format = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(format=log_format, level=logging.INFO)

# Word2Vec hyper-parameters.
# Dimensionality of the word vectors.
num_features = 300
# Minimum word count passed to Word2Vec as min_count.
min_word_count = 40
# Number of parallel worker threads.
num_workers = 6
# Context window size.
context = 10
# Down-sampling setting for frequent words.
downsampling = 1e-3

In [11]:
# 导入word2vec
from gensim.models import word2vec

2020-09-28 22:49:59,161 : INFO : 'pattern' package not found; tag filters are not available for English


In [12]:
# Initialise and train the model on the parsed sentences.
# NOTE(review): `size=` and `init_sims` are the gensim 3.x spellings; gensim 4
# renamed `size` to `vector_size` and deprecated `init_sims` — confirm the
# installed gensim version before upgrading.
print("Start Training model...")
model = word2vec.Word2Vec(
    sentences,
    size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)

# The model is not trained any further after this point, so init_sims with
# replace=True is called to make it much more memory-efficient.
model.init_sims(replace=True)
print("Training Complete")

# Save the model for later use; reload it with Word2Vec.load().
model_name = "300features_40minwords_10context"  # model file name
model.save(model_name)
print("Svaing Complete")

2020-09-28 22:50:45,459 : INFO : collecting all words and their counts
2020-09-28 22:50:45,460 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-09-28 22:50:45,515 : INFO : PROGRESS: at sentence #10000, processed 227240 words, keeping 18038 word types
2020-09-28 22:50:45,569 : INFO : PROGRESS: at sentence #20000, processed 454577 words, keeping 25324 word types
2020-09-28 22:50:45,621 : INFO : PROGRESS: at sentence #30000, processed 675274 words, keeping 30478 word types


Start Training model...


2020-09-28 22:50:45,674 : INFO : PROGRESS: at sentence #40000, processed 903014 words, keeping 34863 word types
2020-09-28 22:50:45,730 : INFO : PROGRESS: at sentence #50000, processed 1123503 words, keeping 38329 word types
2020-09-28 22:50:45,797 : INFO : PROGRESS: at sentence #60000, processed 1346264 words, keeping 41338 word types
2020-09-28 22:50:45,868 : INFO : PROGRESS: at sentence #70000, processed 1570738 words, keeping 43986 word types
2020-09-28 22:50:45,936 : INFO : PROGRESS: at sentence #80000, processed 1791248 words, keeping 46400 word types
2020-09-28 22:50:46,005 : INFO : PROGRESS: at sentence #90000, processed 2016722 words, keeping 48869 word types
2020-09-28 22:50:46,065 : INFO : PROGRESS: at sentence #100000, processed 2239896 words, keeping 50980 word types
2020-09-28 22:50:46,126 : INFO : PROGRESS: at sentence #110000, processed 2460901 words, keeping 52890 word types
2020-09-28 22:50:46,180 : INFO : PROGRESS: at sentence #120000, processed 2684304 words, keepin

2020-09-28 22:50:49,881 : INFO : PROGRESS: at sentence #760000, processed 17089761 words, keeping 123539 word types
2020-09-28 22:50:49,939 : INFO : PROGRESS: at sentence #770000, processed 17318248 words, keeping 124326 word types
2020-09-28 22:50:50,000 : INFO : PROGRESS: at sentence #780000, processed 17549751 words, keeping 125052 word types
2020-09-28 22:50:50,064 : INFO : PROGRESS: at sentence #790000, processed 17778071 words, keeping 125740 word types
2020-09-28 22:50:50,102 : INFO : collected 126187 word types from a corpus of 17901873 raw words and 795538 sentences
2020-09-28 22:50:50,103 : INFO : Loading a fresh vocabulary
2020-09-28 22:50:50,190 : INFO : effective_min_count=40 retains 16731 unique words (13% of original 126187, drops 109456)
2020-09-28 22:50:50,191 : INFO : effective_min_count=40 leaves 17335707 word corpus (96% of original 17901873, drops 566166)
2020-09-28 22:50:50,254 : INFO : deleting the raw counts dictionary of 126187 items
2020-09-28 22:50:50,259 : I

2020-09-28 22:51:38,439 : INFO : EPOCH 4 - PROGRESS: at 7.20% examples, 920295 words/s, in_qsize 11, out_qsize 0
2020-09-28 22:51:39,456 : INFO : EPOCH 4 - PROGRESS: at 14.69% examples, 929492 words/s, in_qsize 7, out_qsize 0
2020-09-28 22:51:40,478 : INFO : EPOCH 4 - PROGRESS: at 20.71% examples, 869355 words/s, in_qsize 10, out_qsize 1
2020-09-28 22:51:41,484 : INFO : EPOCH 4 - PROGRESS: at 27.21% examples, 859034 words/s, in_qsize 10, out_qsize 1
2020-09-28 22:51:42,495 : INFO : EPOCH 4 - PROGRESS: at 34.89% examples, 881516 words/s, in_qsize 11, out_qsize 0
2020-09-28 22:51:43,496 : INFO : EPOCH 4 - PROGRESS: at 42.22% examples, 892167 words/s, in_qsize 12, out_qsize 0
2020-09-28 22:51:44,502 : INFO : EPOCH 4 - PROGRESS: at 49.65% examples, 901008 words/s, in_qsize 11, out_qsize 0
2020-09-28 22:51:45,515 : INFO : EPOCH 4 - PROGRESS: at 57.15% examples, 907834 words/s, in_qsize 11, out_qsize 0
2020-09-28 22:51:46,521 : INFO : EPOCH 4 - PROGRESS: at 64.35% examples, 910007 words/s, i

Training Complete


2020-09-28 22:52:06,726 : INFO : saved 300features_40minwords_10context


Svaing Complete


In [13]:
# Ask the model which word in each group is least similar to the others.
# The query goes through `model.wv` (the trained word vectors): calling
# doesnt_match directly on the Word2Vec object is deprecated in gensim 3.x
# and no longer available in gensim 4.
for word_group in (
    "man woman child kitchen",
    "france england germany berlin",
    "paris berlin london austria",
):
    print(model.wv.doesnt_match(word_group.split()))

kitchen
berlin
paris


  """Entry point for launching an IPython kernel.
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)
  
  This is separate from the ipykernel package so we can avoid doing imports until


In [14]:
# Nearest neighbours of "man". Queried via `model.wv` because most_similar on
# the Word2Vec object itself is deprecated in gensim 3.x and removed in gensim 4.
print(model.wv.most_similar("man"))

[('woman', 0.6221582889556885), ('lady', 0.5844950675964355), ('lad', 0.5493677854537964), ('guy', 0.5265066623687744), ('person', 0.5254271030426025), ('farmer', 0.5244280099868774), ('millionaire', 0.5242919921875), ('soldier', 0.5171104669570923), ('monk', 0.509947657585144), ('sailor', 0.5053134560585022)]


  """Entry point for launching an IPython kernel.


In [15]:
# Nearest neighbours of "berlin". Queried via `model.wv` because most_similar on
# the Word2Vec object itself is deprecated in gensim 3.x and removed in gensim 4.
print(model.wv.most_similar("berlin"))

[('edinburgh', 0.6927019357681274), ('london', 0.6783941984176636), ('venice', 0.6662055253982544), ('italy', 0.6633578538894653), ('austria', 0.6533050537109375), ('vienna', 0.6532919406890869), ('1920', 0.6511423587799072), ('1938', 0.6483134627342224), ('1953', 0.6480259895324707), ('france', 0.6431691646575928)]


  """Entry point for launching an IPython kernel.


In [16]:
# Nearest neighbours of "interesting". Queried via `model.wv` because
# most_similar on the Word2Vec object itself is deprecated in gensim 3.x and
# removed in gensim 4.
print(model.wv.most_similar("interesting"))

[('intriguing', 0.7502032518386841), ('entertaining', 0.6261184215545654), ('enjoyable', 0.6192778944969177), ('exciting', 0.6154883503913879), ('engaging', 0.6083738803863525), ('engrossing', 0.5933602452278137), ('fascinating', 0.5877359509468079), ('compelling', 0.5730586051940918), ('amusing', 0.5726856589317322), ('important', 0.568382203578949)]


  """Entry point for launching an IPython kernel.
