In [2]:
# Word2Vec不需要标签即可创建有意义的表示形式。
# 运用于情感分析

import pandas as pd
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

# 评论到单词列表清洗函数
def review_to_wordlist(review, remove_stopwords=False):
    """Convert a raw review into a list of lower-case word tokens.

    Args:
        review: Raw review text, possibly containing HTML markup.
        remove_stopwords: When True, drop NLTK's English stop words from
            the result. Defaults to False.

    Returns:
        list[str]: Lower-cased tokens made only of ASCII letters and
        digits, in their original order.
    """
    # 1. Strip HTML markup. The parser is named explicitly so the result
    #    does not depend on which optional parsers happen to be installed,
    #    and so bs4 does not warn that no parser was specified.
    review_text = BeautifulSoup(review, "html.parser").get_text()

    # 2. Replace every character that is not an ASCII letter or digit with
    #    a space: punctuation is dropped, numbers are kept.
    review_text = re.sub("[^a-zA-Z0-9]", " ", review_text)

    # 3. Lower-case the text and split it on whitespace.
    words = review_text.lower().split()

    # 4. Optionally remove stop words (off by default). The stop-word list
    #    is turned into a set so each membership check is a hash lookup.
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]

    # 5. Hand back the list of words.
    return words


In [28]:
# Word2Vec期望输入是单个句子，每个句子作为单词列表。换句话说，输入格式是列表的列表。
# 将使用NLTK的punkt标记器进行句子拆分

import nltk.data

# 将完整的评论拆分成句子
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    """Split a review into sentences, each returned as a list of words.

    Args:
        review: Raw review text.
        tokenizer: Sentence splitter exposing ``tokenize(text)`` that
            returns the sentence strings (the notebook passes NLTK's
            punkt tokenizer).
        remove_stopwords: Forwarded to ``review_to_wordlist`` for every
            sentence. Defaults to False.

    Returns:
        list: One entry per non-empty sentence, each entry being whatever
        ``review_to_wordlist`` returns for that sentence (a list of words).
    """
    # 1. Let the tokenizer split the stripped review into sentences.
    raw_sentences = tokenizer.tokenize(review.strip())

    # 2. Tokenise every non-empty sentence. The caller's remove_stopwords
    #    choice is passed through instead of being forced to False.
    return [
        review_to_wordlist(raw_sentence, remove_stopwords=remove_stopwords)
        for raw_sentence in raw_sentences
        if len(raw_sentence) > 0
    ]

In [4]:
# Load the three review files. They share the same layout: tab-separated,
# first line is the header, and quoting=3 (csv.QUOTE_NONE) so quote
# characters inside the reviews are treated as ordinary text.
_TSV_OPTIONS = dict(header=0, delimiter="\t", quoting=3)
train = pd.read_csv("F:\\NLP\\kaggle_data\\labeledTrainData.tsv", **_TSV_OPTIONS)
test = pd.read_csv("F:\\NLP\\kaggle_data\\testData.tsv", **_TSV_OPTIONS)
unlabeled_train = pd.read_csv("F:\\NLP\\kaggle_data\\unlabeledTrainData.tsv", **_TSV_OPTIONS)

# Report how many reviews each file contributed (100000 in total for the
# recorded run).
review_counts = (
    train["review"].size,
    test["review"].size,
    unlabeled_train["review"].size,
)
print("Read %d labeled train reviews, %d labeled test reviews, "
      "and %d unlabeled reviews\n" % review_counts)

Read 25000 labeled train reviews, 25000 labeled test reviews, and 50000 unlabeled reviews



In [31]:
# Load the punkt sentence tokenizer here so this cell also runs on a fresh
# kernel; otherwise `tokenizer` is only created by a later cell.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Show the first raw review, then the same review split into sentences
# with each sentence tokenised into words.
print(train["review"][0])
temp = review_to_sentences(train["review"][0], tokenizer)
print(temp)

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

In [29]:
# Load the punkt sentence tokenizer.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Accumulator for every parsed sentence (a list of word lists).
sentences = []

# review_to_sentences returns a list of sentences, so the result is merged
# with extend() (equivalent to "+=" on a list): each sentence becomes its
# own element. append() would instead nest the whole returned list as a
# single element.
for banner, frame in (
    ("Parsing sentences from training set", train),
    ("Parsing sentences from unlabeled set", unlabeled_train),
):
    print(banner)
    for review in frame["review"]:
        sentences.extend(review_to_sentences(review, tokenizer))

# Print the total number of sentences plus the first and last one as samples.
print(len(sentences))
print(sentences[0])
print(sentences[-1])

Parsing sentences from training set
15
181
10
177
13
108
5
115
16
72
6
55
9
62
9
42
5
83
5
46
3
129
7
110
5
300
5
77
5
216
6
155
23
148
28
122
5
297
8
68
11
145
12
115
10
238
3
53
4
101
8
98
9
319
20
31
28
196
4
97
4
72
7
141
4
205
17
130
21
88
17
61
4
425
7
79
7
63
26
236
8
68
13
24
1
825
9
295
10
36
4
166
4
82
10
62
13
117
8
37
7
123
3
198
8
7
6
195
3
52
6
137
4
111
11
46
6
20
20
77
22
62
7
97
8
127
10
37
9
14
3
76
40
60
5
64
10
154
7
63
6
91
17
130
14
120
8
58
4
597
6
177
7
67
8
65
9
196
7
111
9
19
11
231
13
87
9
248
34
17
13
41
13
81
3
166
8
54
6
97
9
82
11
17
10
75
5
77
7
76
4
63
8
126
6
23
8
165
3
63
8
89
24
202
2
477
8
63
5
66
7
78
9
86
6
248
6
1247
16
142
14
23
12
44
16
187
2
263
2
372
14
121
7
19
8
55
6
189
6
85
14
81
19
85
7
90
19
52
12
23
26
381
12
128
11
106
3
107
12
64
4
70
12
124
10
102
10
115
8
248
3
58
10
228
11
60
9
210
13
26
9
51
13
51
2
107
14
16
12
196
10
28
6
81
13
65
14
218
5
120
4
205
9
156
17
100
18
63
4
569
7
68
10
174
15
17
7
650
5
333
13
365
11
228
5
176
7
13



5
103
10
46
15
34
12
161
8
75
10
226
9
59
42
60
14
73
5
220
11
36
5
203
16
54
4
123
19
205
14
24
5
57
11
73
11
38
4
166
45
61
9
113
8
128
3
114
12
65
8
55
11
27
11
182
6
92
2
217
6
95
5
146
86
174
4
691
4
334
9
16
18
159
20
213
18
26
1
834
4
211
7
359
19
55
28
113
5
88
6
107
13
50
24
151
10
35
9
36
21
254
12
111
9
311
17
127
8
72
3
610
6
204
7
541
6
124
7
117
13
125
5
51
3
208
20
122
6
597
1
721
11
74
12
114
11
206
3
330
15
44
4
76
7
130
10
84
10
35
12
252
15
116
6
202
10
116
9
23
8
32
3
392
8
197
17
190
14
264
10
149
4
232
6
126
14
11
28
205
7
82
11
32
8
189
8
111
6
19
7
140
4
10
5
116
12
176
5
139
11
149
9
95
9
102
7
32
4
96
3
166
6
113
16
8
12
24
11
48
16
60
12
112
13
49
6
67
30
186
15
86
10
106
5
106
3
111
14
22
13
85
9
99
20
135
20
54
5
136
6
133
7
134
4
16
11
58
4
92
10
128
5
5
23
112
3
42
5
62
6
163
10
42
8
17
7
105
1
2328
37
177
4
245
5
50
12
158
8
49
10
200
9
115
8
131
3
360
9
118
29
121
25
86
18
116
7
116
18
156
6
20
16
201
14
35
3
212
7
61
8
67
9
211
12
49
11
28
8
17
3
28
10




120
12
103
12
52
15
80
20
119
3
160
9
67
12
152
15
59
1
726
10
160
7
164
7
224
6
133
20
35
4
45
5
105
6
379
22
20
11
190
18
75
14
131
5
33
16
74
8
137
3
318
18
23
15
34
9
142
8
29
8
199
6
74
8
73
13
104
10
117
8
41
3
123
7
53
9
61
29
82
6
232
10
66
6
476
9
128
23
187
6
107
6
22
15
84
40
26
4
69
14
29
2
284
7
76
15
64
14
41
24
303
10
309
17
77
11
138
20
32
8
38
13
735
19
119
10
46
4
184
37
147
24
118
6
66
33
57
8
83
10
17
14
134
15
24
5
208
5
43
10
79
8
81
17
25
14
185
15
59
10
86
13
165
15
485
3
173
4
74
14
129
4
78
5
153
11
35
13
177
7
102
10
92
11
61
5
92
2
47
11
91
7
94
8
43
20
43
6
113
5
75
14
283
5
59
2
103
16
130
7
55
14
101
14
124
5
104
9
61
10
61
7
59
13
79
4
446
9
291
7
290
18
145
6
96
11
52
8
46
3
29
21
212
13
146
4
84
4
64
27
52
8
78
8
299
12
173
17
189
13
44
8
71
6
54
5
286
1
280
8
130
21
150
47
39
19
217
11
74
5
73
18
129
3
304
7
235
27
79
4
140
43
332
5
31
27
96
11
60
15
103
5
314
11
126
18
66
16
220
7
222
3
259
20
139
10
47
12
19
2
576
34
183
7
50
18
52
13
108
8
44
23
1

6
169
26
40
9
42
11
76
23
719
13
133
21
72
6
30
6
54
9
74
14
58
5
30
11
126
8
206
13
167
4
668
27
40
5
227
3
374
5
33
21
168
9
65
7
82
17
497
11
30
3
70
16
60
18
51
7
255
9
118
7
73
6
106
9
83
6
155
20
132
18
9
46
192
5
48
10
27
22
25
9
57
4
87
6
80
9
103
13
107
15
218
7
145
9
83
3
108
5
85
8
190
1
444
11
54
4
419
10
57
9
52
3
66
3
46
9
73
9
59
1
773
4
155
23
15
5
80
24
117
10
30
4
72
6
149
12
63
5
81
11
82
7
30
15
39
11
47
8
37
7
162
11
91
12
144
10
45
5
20
11
64
12
114
12
128
16
87
9
202
32
138
9
67
16
168
10
145
10
101
10
35
14
69
32
169
12
158
9
70
4
135
18
199
16
95
5
53
12
23
21
50
9
31
3
150
10
44
7
85
6
45
10
161
6
31
10
60
7
135
2
781
9
115
12
19
10
65
7
159
22
377
3
304
4
269
10
93
6
76
9
27
5
62
23
69
8
61
10
368
7
119
7
53
11
98
10
66
3
85
8
55
21
72
4
503
9
160
1
737
15
81
9
68
15
105
17
42
12
6
4
138
4
42
11
58
14
154
16
241
14
30
5
469
22
105
18
251
6
168
15
60
23
78
9
156
4
119
10
107
7
37
5
146
5
100
23
126
11
148
5
101
7
263
1
636
15
151
23
85
3
263
42
79
11
121
9
33


34
184
6
113
6
173
10
403
9
78
18
115
11
82
9
92
19
122
4
50
9
160
12
63
70
51
22
188
6
57
7
114
12
29
3
135
6
323
11
98
5
165
11
40
12
60
7
57
15
187
46
82
5
24
8
30
9
220
12
80
2
423
15
104
8
86
7
106
10
131
14
90
6
158
7
62
6
152
12
62
10
78
3
151
4
237
3
121
8
60
9
32
10
22
14
54
16
60
21
118
5
265
11
75
28
47
14
129
5
98
6
46
10
94
9
355
10
26
9
39
3
100
7
159
7
156
17
17
4
149
36
79
11
72
42
247
7
34
2
86
25
188
21
25
14
112
11
144
8
41
4
116
10
120
5
164
9
117
8
277
11
117
9
215
4
53
14
45
5
322
6
105
8
103
23
45
6
118
5
76
6
177
36
15
13
89
6
145
13
35
5
141
7
97
14
195
1
709
18
54
9
19
14
90
9
185
8
95
15
243
5
53
5
64
12
274
15
316
6
202
4
27
5
36
9
26
4
159
7
90
15
338
10
66
8
75
10
83
13
95
8
135
5
441
5
120
5
46
5
108
8
61
27
340
12
255
7
45
15
178
10
78
7
66
12
243
4
76
12
48
3
352
4
63
7
151
8
103
6
173
5
82
13
370
15
113
14
54
9
39
7
110
9
43
2
51
8
74
4
127
282
63
8
59
8
78
27
5
17
322
9
90
13
110
8
45
13
121
5
360
14
118
19
79
9
54
5
113
10
109
7
123
5
61
8
43
5
15
12

KeyboardInterrupt: 

word2vec模型

1.体系结构：可选 skip-gram 或连续词袋（CBOW）。我们发现，skip-gram 的速度稍慢一些，但产生了更好的结果。注意：gensim 的默认值是 CBOW（sg=0），下方训练代码没有设置 sg，因此实际训练的是 CBOW 模型；若要使用 skip-gram，需要传入 sg=1。

2.训练算法：分层 softmax 或负采样。对于我们来说，默认设置效果很好。注意：gensim 的默认设置是负采样（hs=0，negative=5），而不是分层 softmax；下方训练代码没有修改这两个参数。
常用词的下采样：Google 文档建议使用 0.00001 到 0.001 之间的值。对于我们来说，更接近 0.001 的值似乎可以提高最终模型的准确性。

3.词向量维数：维数越多，运行时间越长，并且通常（但并非总是）会得到更好的模型。合理的值可以在几十到几百之间。我们用了 300。
上下文/窗口大小：训练算法应考虑多少个上下文词？对于分层 softmax，10 似乎效果不错（在一定范围内越大越好）。

4.工作线程数：并行运行的线程数。具体取值取决于机器，但在大多数系统上，4 到 6 之间的值都可以正常工作。

5.最小词频：这有助于将词汇表限制为有意义的单词。在所有文档中出现次数少于该值的单词都将被忽略。合理的值应该在 10 到 100 之间。在本例中，由于每部电影出现 30 次，我们将最小词频设置为 40，以避免过于重视单个电影标题。这样得到的词汇表约为 15,000 个单词（本次运行的日志显示实际保留了 16,731 个）。较高的值也有助于限制运行时间。

In [6]:
# Configure the standard logging module so that the messages Word2Vec emits
# are printed with a timestamp and a level.
import logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Training parameters
num_features = 300     # dimensionality of the word vectors
min_word_count = 40    # minimum word count
num_workers = 6        # number of parallel worker threads
context = 10           # context window size
downsampling = 0.001   # down-sampling setting for frequent words

In [7]:
# 导入word2vec
from gensim.models import word2vec

In [9]:
# Initialise and train the model.
# NOTE(review): `size=` and `init_sims()` are gensim 3.x API; gensim 4
# renamed the argument to `vector_size` and deprecated `init_sims` --
# confirm the installed gensim version before upgrading.
print("Start Training model...")
model = word2vec.Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

# If you don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)
print("Training Complete")

# Save the model for later use; reload it with Word2Vec.load().
model_name = "300features_40minwords_10context"  # model file name
model.save(model_name)
print("Saving Complete")

2020-09-27 17:19:41,736 : INFO : collecting all words and their counts
2020-09-27 17:19:41,737 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-09-27 17:19:41,786 : INFO : PROGRESS: at sentence #10000, processed 227240 words, keeping 18038 word types
2020-09-27 17:19:41,849 : INFO : PROGRESS: at sentence #20000, processed 454577 words, keeping 25324 word types
2020-09-27 17:19:41,920 : INFO : PROGRESS: at sentence #30000, processed 675274 words, keeping 30478 word types


Start Training model...


2020-09-27 17:19:41,992 : INFO : PROGRESS: at sentence #40000, processed 903014 words, keeping 34863 word types
2020-09-27 17:19:42,045 : INFO : PROGRESS: at sentence #50000, processed 1123503 words, keeping 38329 word types
2020-09-27 17:19:42,095 : INFO : PROGRESS: at sentence #60000, processed 1346264 words, keeping 41338 word types
2020-09-27 17:19:42,151 : INFO : PROGRESS: at sentence #70000, processed 1570738 words, keeping 43986 word types
2020-09-27 17:19:42,204 : INFO : PROGRESS: at sentence #80000, processed 1791248 words, keeping 46400 word types
2020-09-27 17:19:42,260 : INFO : PROGRESS: at sentence #90000, processed 2016722 words, keeping 48869 word types
2020-09-27 17:19:42,311 : INFO : PROGRESS: at sentence #100000, processed 2239896 words, keeping 50980 word types
2020-09-27 17:19:42,359 : INFO : PROGRESS: at sentence #110000, processed 2460901 words, keeping 52890 word types
2020-09-27 17:19:42,408 : INFO : PROGRESS: at sentence #120000, processed 2684304 words, keepin

2020-09-27 17:19:45,802 : INFO : PROGRESS: at sentence #760000, processed 17089761 words, keeping 123539 word types
2020-09-27 17:19:45,854 : INFO : PROGRESS: at sentence #770000, processed 17318248 words, keeping 124326 word types
2020-09-27 17:19:45,911 : INFO : PROGRESS: at sentence #780000, processed 17549751 words, keeping 125052 word types
2020-09-27 17:19:45,964 : INFO : PROGRESS: at sentence #790000, processed 17778071 words, keeping 125740 word types
2020-09-27 17:19:45,994 : INFO : collected 126187 word types from a corpus of 17901873 raw words and 795538 sentences
2020-09-27 17:19:45,995 : INFO : Loading a fresh vocabulary
2020-09-27 17:19:46,081 : INFO : effective_min_count=40 retains 16731 unique words (13% of original 126187, drops 109456)
2020-09-27 17:19:46,082 : INFO : effective_min_count=40 leaves 17335707 word corpus (96% of original 17901873, drops 566166)
2020-09-27 17:19:46,143 : INFO : deleting the raw counts dictionary of 126187 items
2020-09-27 17:19:46,147 : I

2020-09-27 17:20:35,056 : INFO : EPOCH 4 - PROGRESS: at 22.68% examples, 962785 words/s, in_qsize 11, out_qsize 0
2020-09-27 17:20:36,062 : INFO : EPOCH 4 - PROGRESS: at 30.49% examples, 971461 words/s, in_qsize 11, out_qsize 0
2020-09-27 17:20:37,062 : INFO : EPOCH 4 - PROGRESS: at 38.01% examples, 969262 words/s, in_qsize 11, out_qsize 0
2020-09-27 17:20:38,079 : INFO : EPOCH 4 - PROGRESS: at 45.68% examples, 969978 words/s, in_qsize 10, out_qsize 1
2020-09-27 17:20:39,083 : INFO : EPOCH 4 - PROGRESS: at 53.29% examples, 971128 words/s, in_qsize 11, out_qsize 0
2020-09-27 17:20:40,082 : INFO : EPOCH 4 - PROGRESS: at 59.59% examples, 952851 words/s, in_qsize 11, out_qsize 0
2020-09-27 17:20:41,094 : INFO : EPOCH 4 - PROGRESS: at 66.01% examples, 937574 words/s, in_qsize 11, out_qsize 0
2020-09-27 17:20:42,122 : INFO : EPOCH 4 - PROGRESS: at 73.63% examples, 939499 words/s, in_qsize 11, out_qsize 0
2020-09-27 17:20:43,127 : INFO : EPOCH 4 - PROGRESS: at 81.24% examples, 942263 words/s,

Training Complete


2020-09-27 17:20:59,719 : INFO : saved 300features_40minwords_10context


Svaing Complete


In [20]:
# Ask which word in each group is least like the others. The query goes
# through model.wv (the trained word vectors): the model-level shortcut is
# deprecated in gensim 3.x and removed in gensim 4.
print(model.wv.doesnt_match("man woman child kitchen".split()))
print(model.wv.doesnt_match("france england germany berlin".split()))
print(model.wv.doesnt_match("paris berlin london austria".split()))

kitchen
berlin
paris
berlin


  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [22]:
# Nearest neighbours of "man"; queried via model.wv because the model-level
# shortcut is deprecated in gensim 3.x and removed in gensim 4.
print(model.wv.most_similar("man"))

[('woman', 0.6225773096084595), ('lad', 0.6184394359588623), ('lady', 0.5856634974479675), ('farmer', 0.5390567779541016), ('monk', 0.5365310311317444), ('millionaire', 0.5307788848876953), ('businessman', 0.5271201729774475), ('men', 0.5189670324325562), ('guy', 0.5130472183227539), ('soldier', 0.5074422955513)]


  """Entry point for launching an IPython kernel.


In [24]:
# Nearest neighbours of "berlin"; queried via model.wv because the
# model-level shortcut is deprecated in gensim 3.x and removed in gensim 4.
print(model.wv.most_similar("berlin"))

[('edinburgh', 0.6538875699043274), ('vienna', 0.6498203277587891), ('austria', 0.6475058794021606), ('france', 0.6446967124938965), ('montreal', 0.6414276361465454), ('london', 0.6398050785064697), ('venice', 0.6327886581420898), ('1912', 0.6327462196350098), ('spain', 0.630702555179596), ('boston', 0.6295623183250427)]


  """Entry point for launching an IPython kernel.


In [25]:
# Nearest neighbours of "interesting"; queried via model.wv because the
# model-level shortcut is deprecated in gensim 3.x and removed in gensim 4.
print(model.wv.most_similar("interesting"))

[('intriguing', 0.7486396431922913), ('entertaining', 0.6315568685531616), ('enjoyable', 0.6230214834213257), ('exciting', 0.6147611141204834), ('fascinating', 0.6043003797531128), ('important', 0.5911261439323425), ('engaging', 0.589065432548523), ('unusual', 0.5847320556640625), ('compelling', 0.5748746395111084), ('engrossing', 0.5722618103027344)]


  """Entry point for launching an IPython kernel.
