In [13]:
import pandas as pd
import numpy as np
import jieba
from gensim.models import Word2Vec
from tqdm import tqdm_notebook
import re

In [4]:
# 读取数据，创建训练词向量的语料
data = pd.read_csv('../data/sqlResult_1558435.csv',encoding='gb18030')
data = data.dropna(subset=['content'])
texts = list(data['content'])[:100]
corpus = [list(jieba.cut(text.strip())) for text in texts]

In [5]:
# 训词向量
import time
start = time.time()
w2v_model = Word2Vec(corpus[:100], size=50, window=5, min_count=1, workers=4)
end = time.time()
print(end - start)

  "C extension not loaded, training will be slow. "


324.49342250823975


In [None]:
#model.save('../data/w2v_model')

In [41]:
# 创建图的节点（词语）
def create_nodes(word_list, window_size):
    nodes_dict = {}
    word_list_len = len(word_list)
    for index, word in enumerate(word_list):
        left = index - window_size + 1    # 词窗左边界
        right = index + window_size      # 词窗右边界
        win_words = set(word_list[left:right])  # 和word在一个词窗里共现的所有词语
        if word not in nodes_dict:
            nodes_dict[word] = win_words
        else:
            nodes_dict[word] = nodes_dict[word] | win_words
    return nodes_dict


# 创建概率转移矩阵(基于词向量的相似度)
def create_matrix(word_list, nodes_dict):
    word_set = set(word_list)
    matrix = np.zeros([len(word_set), len(word_set)])
    word_index = {}    # 记录词的index
    index_dict = {}   # 记录节点index对应的词

    for i, v in enumerate(word_set):
        word_index[v] = i
        index_dict[i] = v
    for word_i in word_list:
        for word_j in nodes_dict[word_i]:
            if word_i != word_j:
                wv_i = w2v_model.wv[word_i]
                wv_j = w2v_model.wv[word_j]
                matrix[word_index[word_i]][word_index[word_j]] = np.dot(wv_i,wv_j)/(np.linalg.norm(wv_i)*np.linalg.norm(wv_j)) + 1
    matrix = matrix / np.sum(matrix, axis=0)  # 列归一化
    return matrix, index_dict


# textrank迭代计算
def textrank(word_list, matrix, iternum, d=0.85):
    textrank_value = np.ones([len(set(word_list)), 1])
    for i in tqdm_notebook(range(iternum)):
        textrank_value = (1 - d) + d * np.dot(matrix, textrank_value)
    return textrank_value


# 根据textrank值抽取top k的词语作为关键词
def keyword_extract(index_dict, textrank_value, top_k):
    word_tr = {}
    for i in range(len(textrank_value)):
        word_tr[index_dict[i]] = textrank_value[i][0]
    sorted_words = sorted(word_tr.items(), key = lambda x : x[1], reverse=True)
    return sorted_words[:top_k]


In [51]:
# 导入停用词
with open('../data/chinese_stopwords.txt','r',encoding='utf-8') as f:
    stop_words = f.read().split('\n')

# 选择第一篇新闻进行测试
# window_size=4，iternum=100，top_k=20
document = texts[0]
word_list = list(jieba.cut(re.sub('\d| |\r|\n', ' ',document)))
word_list = [w for w in word_list if w not in stop_words]
nodes_dict = create_nodes(word_list, window_size=4)
matrix, index_dict = create_matrix(word_list, nodes_dict)
textrank_value = textrank(word_list, matrix, iternum=100)
keywords = keyword_extract(index_dict, textrank_value, top_k=20)

HBox(children=(IntProgress(value=0), HTML(value='')))




In [52]:
# 查看第一篇新闻的关键词以及textrank值
keywords

[('MIUI', 2.067905053348265),
 ('发布', 1.967453042280757),
 ('机型', 1.753317935797683),
 ('精力', 1.5840166306620995),
 ('手机', 1.3402941223971716),
 ('外', 1.2657300257913153),
 ('款', 1.245150053410471),
 ('更新', 1.217069594010498),
 ('小米', 1.1948312651145234),
 ('含', 1.0964931194929033),
 ('开发', 1.0705965378166495),
 ('暂停', 1.0560225284166422),
 ('版', 1.0469212412442719),
 ('影响', 1.0193426365149258),
 ('工程师', 1.017312660126139),
 ('体验版', 1.01365011068498),
 ('内测', 1.012520850273949),
 ('月', 1.011669676366555),
 ('确保', 1.0111396337477805),
 ('稳定版', 1.0104822221734147)]

In [None]:
#本地训练词向量耗时太久,内存爆掉，沮丧。换到服务器，colab就省时多了。