# 本文件说明
- 训练中文词向量：Word2vec
  - 银监会、保监会

# 基本设置

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
%load_ext autoreload
%autoreload 2

In [5]:
from toolkits.setup import specific_func
specific_func.set_ch_pd()
from toolkits.setup.date_time import get_day_list
from toolkits.nlp import pre_cor_circ
from toolkits.nlp import pre_cor_cbrc
from toolkits.nlp import myclass_circ
from toolkits.nlp import myclass_cbrc

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.687 seconds.
Prefix dict has been built succesfully.


In [2]:
import os
import sys
import re
import datetime
import time

In [31]:
import multiprocessing

import gensim
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

# load and preprocess

## 保监会数据

In [24]:
filepath_raw = 'datasets_raw/circ/'
filename_list = os.listdir(filepath_raw)
filename_list = filename_list[:1]

file_num = 0
with open('datasets_pre/circ_pre.txt', 'w') as f:
    for filename in filename_list:
        tmp_data = pd.read_excel(filepath_raw + filename)
        print('filename: ', filename, tmp_data.shape)
        file_num += tmp_data.shape[0]
        for index in tmp_data.index:
            content = str(tmp_data.loc[index, 'title']) + "。" + str(tmp_data.loc[index, 'content'])
            sentences = [i.strip() for i in cut_sentences(content)]
            for sentence in sentences :
                string_pre = pre_cor_circ.handle_contents([sentence])
                f.write(string_pre[0] + '\n')

del tmp_data
print('file_num: ', file_num)

filename:  circ_class_predict_mysql_2018-10-11.xlsx (26182, 7)
file_num:  26182


## 银监会数据

In [None]:
# filepath_raw = 'datasets_raw/cbrc/'
# filename_list = os.listdir(filepath_raw)
# filename_list = filename_list[:1]

# file_num = 0
# with open('datasets_pre/cbrc_pre.txt', 'w') as f:
#     for filename in filename_list:
#         tmp_data = pd.read_excel(filepath_raw + filename)
#         print('filename: ', filename, tmp_data.shape)
#         file_num += tmp_data.shape[0]
#         for index in tmp_data.index:
#             string = str(tmp_data.loc[index, 'title']) + "。" + str(tmp_data.loc[index, 'content'])
#             string_pre = pre_cor_circ.handle_contents([string])
#             f.write(string_pre[0] + '\n')

# del tmp_data
# print('file_num: ', file_num)

## 合并数据

In [26]:
circ_pre = []
with open('datasets_pre/circ_pre.txt', 'r') as f:
    for line in f.readlines():
        circ_pre.append(line.strip()) # 把末尾的'\n'删掉        
print('circ_pre 文件数：', len(circ_pre))

# cbrc_pre = []
# with open('datasets_pre/cbrc_pre.txt', 'r') as f:
#     for line in f.readlines():
#         circ_pre.append(line.strip()) # 把末尾的'\n'删掉        
# print('cbrc_pre 文件数：', len(cbrc_pre))

# datasets_pre = circ_pre + cbrc_pre
datasets_pre = circ_pre

with open('datasets_pre/datasets_pre.txt', 'w') as f:
    for line in datasets_pre:
        f.write(line + '\n')
print('datasets_pre 文件数：', len(datasets_pre))

circ_pre 文件数： 26182
datasets_pre 文件数： 26182


# train
- 语料小（小于一亿词，约 500MB 的文本文件）的时候用 Skip-gram 模型
- 语料大的时候用 CBOW 模型

## skip-gram模型
- 在一般的NLP处理中，会需要去停用词。
- 由于word2vec的算法依赖于上下文，而上下文有可能就是停词。因此对于word2vec，我们可以不用去停词。

In [28]:
datasets_pre_path = 'datasets_pre/datasets_pre.txt'
model_path = 'model/cbirc_skip_gram.model' # 输出模型
vector_path = 'model/cbirc_skip_gram.vector' # 原始c版本word2vec的vector格式的模型

In [38]:
model = Word2Vec(LineSentence(datasets_pre_path), 
                 size=400, window=5, min_count=40,
                 workers=multiprocessing.cpu_count())

# 保存模型
model.save(model_path)
model.wv.save_word2vec_format(vector_path, binary=False)

In [41]:
# syn0的每一行，即代表词汇表中的一个单词，即有16490个单词。
# 列代表特征向量的大小，即400，这个训练时设定的数字。
# 设置的最小单词频度是40（即出现40次以下的单词会被忽略），
# 最后得到一个有16492个单词的词汇表，每个词有300个特征。
print(model.wv.syn0.shape)
print(model['保监会'][:10]) # 查看单个词向量

(32547, 400)
[ 0.15556747 -0.896676    0.19152008 -1.2532637   0.20065337 -1.1594774
  0.5081621   0.2302387   0.7268413  -1.6614144 ]


## CBOW 模型

In [None]:
# datasets_pre_path = 'datasets_pre/datasets_pre.txt'
# model_path = 'model/cbirc_CBOW.model' # 输出模型
# vector_path = 'model/cbirc_CBOW.vector' # 原始c版本word2vec的vector格式的模型

In [None]:
# https://blog.csdn.net/glory1234work2115/article/details/52454141
# model_dm = gensim.models.Doc2Vec(min_count=1, window=10, size=size, sample=1e-3, negative=5, workers=3)
# model_dbow = gensim.models.Doc2Vec(min_count=1, window=10, size=size, sample=1e-3, negative=5, dm=0, workers=3)

# test

In [34]:
model = gensim.models.Word2Vec.load(model_path)

In [42]:
model.doesnt_match('保监会 银监会 银保监会 卫生部'.split())

'卫生部'

In [43]:
word = model.most_similar(u"保监会", topn=5)
for t in word:
    print(t[0],t[1])

银监会 0.7442243099212646
中国保监会 0.6307400465011597
卫生部 0.5117448568344116
中国保险监督管理委员会 0.5058220028877258
银保监会 0.49333423376083374


In [45]:
model.wv.similarity('保监会', '银监会')

0.7442243271420652