In [1]:
import pandas as pd
import jieba
import os
import pyltp

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from collections import defaultdict



# 分句

In [2]:
# Return the sentences of a text as a list.
def sentence_split(string):
    """Split ``string`` into sentences with pyltp and return the non-empty ones.

    Args:
        string: the text to split.

    Returns:
        list of the truthy (non-empty) items produced by
        ``pyltp.SentenceSplitter.split``, in their original order.
    """
    non_empty = []
    for piece in pyltp.SentenceSplitter.split(string):
        if piece:
            non_empty.append(piece)
    return non_empty

# Join a list into one line and write it to a file.
def list2file(input_list, output_file):
    """Write the items of ``input_list`` to ``output_file`` as one line.

    The items are joined with single spaces and terminated with a newline.

    Args:
        input_list: iterable of strings to join.
        output_file: writable text file object (anything providing ``write``).
    """
    # ``writelines`` expects an iterable of lines; handing it a single str
    # makes it iterate character by character. ``write`` emits the same
    # content in one call.
    output_file.write(' '.join(input_list) + '\n')

In [3]:
# Load the news corpus. The path is assembled with os.path.join so it is not
# tied to Windows-style backslash separators.
news_data = pd.read_csv(
    os.path.join('..', 'data', 'export_sql_1558435', 'sqlResult_1558435.csv'),
    encoding='gb18030')
# Replace missing values with empty strings.
news_data = news_data.fillna('')

In [None]:
# Split every news item into sentences and write them to a file, one line per
# news item (sentences separated by spaces). The path is assembled with
# os.path.join so it is not tied to Windows-style backslash separators.
with open(
        os.path.join('..', 'data', 'export_sql_1558435',
                     'sentences_1558435.txt'),
        'w',
        encoding='utf8') as fi:
    for i in news_data.index:
        content = news_data.at[i, 'content']
        list2file(sentence_split(content), fi)

# 分词

In [4]:
# Load the LTP word-segmentation model; `segmentor` is used by `cut` below.
LTP_DATA_DIR = '../ltp_data'  # path to the LTP model directory
cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
segmentor = pyltp.Segmentor()
segmentor.load(cws_model_path)

def cut(string):
    """Segment ``string`` into words with the module-level ``segmentor``.

    Args:
        string: the text to segment.

    Returns:
        list built from the result of ``segmentor.segment(string)``.
    """
    return list(segmentor.segment(string))

In [5]:
# Segment a sample sentence; `words` is reused by the tagging, NER and parsing demos below.
words = cut('孔子说，学习了而时常温习，不也喜悦吗？')

In [6]:
# Segment every news item into words and write them to a file, one line per
# news item (words separated by spaces). The path is assembled with
# os.path.join so it is not tied to Windows-style backslash separators.
with open(
        os.path.join('..', 'data', 'export_sql_1558435',
                     'corpus_1558435.txt'), 'w',
        encoding='utf8') as fi:
    for i in news_data.index:
        content = news_data.at[i, 'content']
        list2file(cut(content), fi)

## 词性标注

In [7]:
# Load the LTP part-of-speech tagging model; `postagger` is used by `words_tagging` below.
pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
postagger = pyltp.Postagger()
postagger.load(pos_model_path)

def words_tagging(words):
    """Tag ``words`` with the module-level ``postagger``.

    Args:
        words: the segmented words to tag.

    Returns:
        list built from the result of ``postagger.postag(words)``.
    """
    tags = postagger.postag(words)
    return list(tags)

In [8]:
# Part-of-speech tags for the sample sentence; reused by the NER and parsing demos below.
postags = words_tagging(words)

## 命名实体识别

In [9]:
# Load the LTP named-entity recognition model; `recognizer` is used by `ner` below.
ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')
recognizer = pyltp.NamedEntityRecognizer()
recognizer.load(ner_model_path)

In [10]:
def ner(words, postags):
    """Run the module-level ``recognizer`` on a tagged sentence.

    Args:
        words: the segmented words.
        postags: the part-of-speech tags for ``words``.

    Returns:
        list built from the result of ``recognizer.recognize(words, postags)``.
    """
    entity_tags = recognizer.recognize(words, postags)
    return list(entity_tags)

In [11]:
# Named-entity tags for the sample sentence.
netags = ner(words, postags)

In [12]:
# Display the named-entity tags computed for the sample sentence.
netags

['S-Nh', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

## 句法分析

In [13]:
# Load the LTP dependency-parsing model; `parser` is used by `parse` below.
parser_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')
parser = pyltp.Parser()
parser.load(parser_model_path)

In [14]:
# Run the dependency parser and return the result as a list.
def parse(words, postags):
    """Parse a tagged sentence with the module-level ``parser``.

    Args:
        words: the segmented words.
        postags: the part-of-speech tags for ``words``.

    Returns:
        list of ``(arc.head, arc.relation)`` tuples, one per arc yielded by
        ``parser.parse(words, postags)``, in the same order.
    """
    head_relation_pairs = []
    for arc in parser.parse(words, postags):
        head_relation_pairs.append((arc.head, arc.relation))
    return head_relation_pairs

In [15]:
# Dependency arcs, as (head, relation) pairs, for the sample sentence.
arcs = parse(words, postags)

In [16]:
# Display the (head, relation) pairs computed for the sample sentence.
arcs

[(2, 'SBV'),
 (0, 'HED'),
 (2, 'WP'),
 (2, 'VOB'),
 (4, 'RAD'),
 (8, 'ADV'),
 (8, 'ADV'),
 (4, 'COO'),
 (4, 'WP'),
 (12, 'ADV'),
 (12, 'ADV'),
 (4, 'COO'),
 (12, 'RAD'),
 (2, 'WP')]

# 训练表达观点的关键词模型

## 根据新闻语料训练

In [None]:
# Build the Word2Vec training data: one list of segmented words per sentence,
# over every sentence of every news item's 'content'.
sentences = []
for i in news_data.index:
    for sentence in sentence_split(news_data.at[i,'content']):
        sentences.append(cut(sentence))
        

In [None]:
# Train a word2vec model on the segmented news sentences (the export step is
# left commented out below).
# NOTE(review): a later cell loads 'news_word2vec_model' from disk, but the
# save call here is commented out, so that file must already exist — confirm.
# NOTE(review): the `size` keyword was renamed `vector_size` in gensim 4.0 —
# confirm which gensim version this notebook targets.
news_word2vec_model = Word2Vec(sentences, min_count=5, size=50)

# news_word2vec_model.save('news_word2vec_model')
# Loading a previously saved model:
# news_word2vec_model = Word2Vec.load('news_word2vec_model')

In [17]:
# Load the news word2vec model from the file 'news_word2vec_model' in the working directory.
# NOTE(review): the save call in the training cell is commented out, so this file must already exist.
news_word2vec_model = Word2Vec.load('news_word2vec_model')

In [18]:
# Show the words the news model ranks as most similar to '说' ("say").
news_word2vec_model.wv.most_similar('说')

  if np.issubdtype(vec.dtype, np.int):


[('表示', 0.866018533706665),
 ('坦言', 0.8141570687294006),
 ('指出', 0.809965968132019),
 ('认为', 0.8057894706726074),
 ('告诉', 0.7455646991729736),
 ('称', 0.7256631255149841),
 ('强调', 0.7111589908599854),
 ('看来', 0.6905254125595093),
 ('透露', 0.6809438467025757),
 ('写道', 0.6728281378746033)]

## 结合wikipedia模型

In [19]:
# Load the word2vec model trained on Wikipedia. The path is assembled with
# os.path.join so it is not tied to Windows-style backslash separators.
wiki_word2vec_model = Word2Vec.load(
    os.path.join('..', 'lesson5', 'word2vec_model'))

In [20]:
def graph_search_tune(init_word, model, limit_rate=0.7, max_size=1000,
                      max_steps=5000, topn=15):
    """Breadth-first walk over a word2vec similarity graph, counting visits.

    Starting from ``init_word``, repeatedly takes the next word off a FIFO
    queue, looks up its ``topn`` most similar words in ``model.wv``, and
    enqueues those whose similarity is strictly greater than ``limit_rate``.
    Words are not de-duplicated, so a word can be dequeued many times; the
    visit count is what the function returns.

    Args:
        init_word: the word to start from.
        model: object exposing ``model.wv.most_similar(word, topn=...)`` that
            returns ``(word, similarity)`` pairs.
        limit_rate: similarity threshold; only neighbours with a similarity
            above it are enqueued.
        max_size: stop once this many distinct words have been visited.
        max_steps: stop once the step counter exceeds this value.
        topn: number of neighbours requested per word.

    Returns:
        ``defaultdict(int)`` mapping each visited word to the number of times
        it was dequeued.
    """
    # Local import keeps this block self-contained; deque gives O(1) pops from
    # the left, unlike list.pop(0).
    from collections import deque

    seen = defaultdict(int)
    need_seen = deque([init_word])
    # Filtered neighbours per word. The same word is dequeued many times, so
    # caching avoids re-running the identical similarity query on each
    # revisit. This assumes most_similar returns the same result for the same
    # word on every call.
    neighbours_cache = {}
    n = 0

    while need_seen and len(seen) < max_size:
        if n % 1000 == 0:
            print('had run counts:{}'.format(n))
        if n > max_steps:
            break
        node = need_seen.popleft()
        if node not in neighbours_cache:
            neighbours_cache[node] = [
                w for w, p in model.wv.most_similar(node, topn=topn)
                if p > limit_rate
            ]
        need_seen.extend(neighbours_cache[node])
        seen[node] += 1
        n += 1

    return seen

In [21]:
# Merge the two models' results with a weighted sum.
def merge_model(news_rate,wiki_rate,topn=50):
    """Rank words related to '说' by a weighted sum of both models' visit counts.

    Runs ``graph_search_tune`` from '说' on the news model and on the wiki
    model, then scores every word found by the news model as
    ``news_count * news_rate + wiki_count * wiki_rate``. Words found only by
    the wiki model are not scored.

    Args:
        news_rate: weight applied to the news-model visit count.
        wiki_rate: weight applied to the wiki-model visit count.
        topn: number of top-scoring entries to return.

    Returns:
        list of at most ``topn`` ``(word, score)`` tuples, highest score first.
    """
    news_counts = graph_search_tune('说', news_word2vec_model)
    # The wiki vocabulary is larger, so a stricter similarity threshold is used.
    wiki_counts = graph_search_tune('说', wiki_word2vec_model, 0.83)

    weighted = [
        (word, news_counts[word] * news_rate + wiki_counts[word] * wiki_rate)
        for word in news_counts
    ]
    weighted.sort(key=lambda pair: pair[1], reverse=True)
    return weighted[:topn]

In [22]:
# Keep only the words (dropping the scores) of the merged ranking, weighting both models equally.
# `similar_top50` is read as a global by the keyword-detection code below.
similar_top50 = [i for i, j in merge_model(0.5,0.5)]

had run counts:0


  if np.issubdtype(vec.dtype, np.int):


had run counts:1000
had run counts:2000
had run counts:3000
had run counts:4000
had run counts:5000
had run counts:0
had run counts:1000
had run counts:2000
had run counts:3000
had run counts:4000
had run counts:5000


# 观点提取

## 获取主语索引

In [23]:
# Notes:
# 1. The subject is the word carrying the 'SBV' relation.
# 2. The subject may be a pronoun, so it must also be checked that it is a
#    person name or an organization name.
# Get the index of the subject.
def get_sbv_idx(words, netags, arcs, key_word_idx):
    """Return the index of the keyword's subject if it is a person or organization.

    Args:
        words: segmented words of the sentence (not used here).
        netags: named-entity tags, indexed like ``arcs``.
        arcs: sequence of ``(head, relation)`` pairs; ``head`` is compared with
            ``key_word_idx + 1``.
        key_word_idx: 0-based index of the keyword.

    Returns:
        The position of the first arc whose head is ``key_word_idx + 1``, whose
        relation is ``'SBV'`` and whose tag in ``netags`` contains ``'Nh'`` or
        ``'Ni'``; ``None`` if there is none.
    """
    target_head = key_word_idx + 1
    for position, arc in enumerate(arcs):
        tag = netags[position]
        named_entity = 'Nh' in tag or 'Ni' in tag
        if arc[0] == target_head and arc[1] == 'SBV' and named_entity:
            return position
    return None

## 获取观点start和end索引

### 判断句子中核心动词的位置

In [24]:
# Design notes for choosing the core "say"-like verb of a sentence. The string
# below is the cell's last expression, so it is echoed as the cell output.
# English gloss:
# 1. If several keywords related to '说' are present and one of them carries
#    the 'HED' tag, take that 'HED' verb as the keyword.
# 2. If 'HED' is unrelated to '说' but keywords related to '说' are in a 'COO'
#    relation with 'HED', weigh each keyword's influence and pick the one with
#    the greatest influence in the sentence.
'''
1、如果存在多个与'说'相关的关键词，其中存在有'HED'标记的，取'HED'的动词为关键词
2、如果'HED'与'说'无关，但与'说'相关的关键词与'HED'为'COO'关系。则判断各个关键词的影响，选取在句中影响最大的关键词
'''

"\n1、如果存在多个与'说'相关的关键词，其中存在有'HED'标记的，取'HED'的动词为关键词\n2、如果'HED'与'说'无关，但与'说'相关的关键词与'HED'为'COO'关系。则判断各个关键词的影响，选取在句中影响最大的关键词\n"

In [25]:
def get_keyword_idx(words, arcs, keywords=None):
    """Return the index of the sentence's opinion keyword, or ``None``.

    Selection rules:
      * If the word whose relation is ``'HED'`` is a keyword, its index is
        returned immediately.
      * Otherwise every keyword whose relation is ``'COO'`` is a candidate.
        Its importance is the number of arcs whose head points at it, and the
        candidate with the highest importance wins (earliest on ties).

    Args:
        words: segmented words of the sentence.
        arcs: sequence of ``(head, relation)`` pairs aligned with ``words``;
            ``head`` is compared with ``index + 1``.
        keywords: optional collection of keyword strings. Defaults to the
            module-level ``similar_top50``, so existing two-argument calls
            behave as before.

    Returns:
        The 0-based index of the chosen keyword, or ``None`` if none is found.
    """
    if keywords is None:
        keywords = similar_top50
    # A set makes the repeated membership tests O(1).
    keyword_set = set(keywords)

    importance = []
    for current_idx, arc in enumerate(arcs):
        if arc[1] == 'HED':
            if words[current_idx] in keyword_set:
                return current_idx
        elif arc[1] == 'COO' and words[current_idx] in keyword_set:
            # Importance = how many words depend on this candidate.
            dependents = sum(1 for other in arcs if other[0] == current_idx + 1)
            importance.append((current_idx, dependents))

    # A keyword candidate was found: pick the most important one
    # (max returns the first maximum, i.e. the earliest candidate on ties).
    if importance:
        return max(importance, key=lambda item: item[1])[0]
    # No keyword was found.
    return None

### 判断宾语在动词的前后位置

In [26]:
def vob_after_hed(arcs,key_word_idx):
    """Tell whether the keyword's object ('VOB') appears at or after the keyword.

    Args:
        arcs: sequence of ``(head, relation)`` pairs.
        key_word_idx: 0-based index of the keyword.

    Returns:
        ``True`` if any arc in ``arcs[key_word_idx:]`` has head
        ``key_word_idx + 1`` and relation ``'VOB'``; ``False`` otherwise.
    """
    keyword_head = key_word_idx + 1
    return any(
        arc[0] == keyword_head and arc[1] == 'VOB'
        for arc in arcs[key_word_idx:]
    )

In [27]:
# Object precedes the verb: find the indices of the delimiters around the view.
def before_wp_idx(words, arcs, key_word_idx):
    """Locate the punctuation marks, before the keyword, that delimit the view.

    Looks through ``arcs[:key_word_idx]`` for arcs whose head is
    ``key_word_idx + 1`` and whose relation is ``'WP'``.

    Args:
        words: segmented words of the sentence (not used here).
        arcs: sequence of ``(head, relation)`` pairs.
        key_word_idx: 0-based index of the keyword.

    Returns:
        ``(start_index, end_index)``: the index of the first matching arc and
        the index of the last later matching arc. ``end_index`` is ``0`` when
        fewer than two arcs match, and the result is ``(0, 0)`` when none match.
    """
    target_head = key_word_idx + 1
    # None, not 0, marks "no delimiter seen yet". With 0 as the sentinel, a
    # delimiter at index 0 was mistaken for "not seen", so the next delimiter
    # overwrote the start instead of becoming the end.
    start_index = None
    end_index = 0
    for current_idx, arc in enumerate(arcs[:key_word_idx]):
        if arc[0] == target_head and arc[1] == 'WP':
            if start_index is None:
                start_index = current_idx
            else:
                end_index = current_idx
    if start_index is None:
        return 0, 0
    return start_index, end_index

In [28]:
# Object follows the verb: find the indices of the delimiters around the view.
def after_wp_idx(words, arcs, key_word_idx):
    """Locate the punctuation marks, from the keyword onwards, that delimit the view.

    Looks through ``arcs[key_word_idx:]`` for arcs whose head is
    ``key_word_idx + 1`` and whose relation is ``'WP'``.

    Args:
        words: segmented words of the sentence (not used here).
        arcs: sequence of ``(head, relation)`` pairs.
        key_word_idx: 0-based index of the keyword.

    Returns:
        ``(start_index, end_index)``. Both start at ``0``; each matching index
        is stored in ``start_index`` while that is still ``0`` and in
        ``end_index`` afterwards.
    """
    keyword_head = key_word_idx + 1
    delimiter_positions = [
        position
        for position, arc in enumerate(arcs[key_word_idx:], key_word_idx)
        if arc[0] == keyword_head and arc[1] == 'WP'
    ]

    start_index = end_index = 0
    for position in delimiter_positions:
        if start_index == 0:
            start_index = position
        else:
            end_index = position
    return start_index, end_index

### 根据动宾关系的前后获取观点位置

In [29]:
def get_wp_idx(words, arcs, key_word_idx):
    """Return the ``(start, end)`` delimiter indices of the view for a keyword.

    Delegates to ``after_wp_idx`` when ``vob_after_hed`` reports that the
    keyword's object follows it, and to ``before_wp_idx`` otherwise.

    Args:
        words: segmented words of the sentence.
        arcs: sequence of ``(head, relation)`` pairs.
        key_word_idx: 0-based index of the keyword.
    """
    locate = after_wp_idx if vob_after_hed(arcs, key_word_idx) else before_wp_idx
    return locate(words, arcs, key_word_idx)

### 提取观点

In [30]:
# Parse a sentence containing a "say"-like keyword and extract the view.
# Input: the segmented words, their structural annotations, and the keyword index.
def get_view(words, netags, arcs, key_word_idx):
    """Extract ``(speaker, view_text)`` for the keyword at ``key_word_idx``.

    Args:
        words: segmented words of the sentence.
        netags: named-entity tags aligned with ``words``.
        arcs: sequence of ``(head, relation)`` pairs aligned with ``words``.
        key_word_idx: 0-based index of the keyword.

    Returns:
        A tuple ``(speaker, view_text)``. ``speaker`` is the word at the index
        returned by ``get_sbv_idx``, or ``None`` when no person or organization
        subject is found. ``view_text`` is the concatenation of
        ``words[start + 1:end + 1]`` for the indices from ``get_wp_idx``.
    """
    start_idx, end_idx = get_wp_idx(words, arcs, key_word_idx)
    sbv_idx = get_sbv_idx(words, netags, arcs, key_word_idx)
    view_text = ''.join(words[start_idx + 1:end_idx + 1])
    # With no person or organization subject, the speaker is reported as None.
    speaker = None if sbv_idx is None else words[sbv_idx]
    return speaker, view_text

In [31]:
def content2view(content):
    """Extract ``(speaker, view_text)`` tuples from a piece of text.

    The text is split into sentences and each sentence is segmented into
    words. Sentences containing at least one word of the module-level
    ``similar_top50`` are tagged, NER-labelled and dependency-parsed. If
    ``get_keyword_idx`` finds a keyword, the result of ``get_view`` is
    collected.

    Args:
        content: the text to analyse.

    Returns:
        list of the tuples returned by ``get_view``, in sentence order.
    """
    views = []
    # Built once instead of once per sentence.
    keyword_set = set(similar_top50)
    # Split into sentences.
    for sentence in sentence_split(content):
        words = cut(sentence)
        # Cheap filter first: skip tagging, NER and parsing for sentences
        # that contain none of the "say"-like words.
        if keyword_set.isdisjoint(words):
            continue
        postags = words_tagging(words)
        netags = ner(words, postags)
        arcs = parse(words, postags)

        key_word_idx = get_keyword_idx(words, arcs)
        if key_word_idx is not None:
            views.append(get_view(words, netags, arcs, key_word_idx))
    return views

In [32]:
# Try the extraction on a short three-sentence passage; the recorded output
# shows a single (speaker, view) pair, from the first sentence.
text = '''孔子他说，学习了而时常温习，不也喜悦吗？有朋友从远方来说话，不也快乐吗？别人不理解自己也不怨恨，不也是君子吗？'''
content2view(text)

[('孔子', '学习了而时常温习，不也喜悦吗？')]

# 提取新闻语料观点

In [None]:
# Extract the views of every news item and write them to view_data.txt,
# one str(view) per line.
with open(
        'view_data.txt',
        'w',
        encoding='utf8') as fi:
    for i in news_data.index:
        # Print a progress message whenever the index value is a multiple of 500.
        # NOTE(review): the recorded output shows a message every 100 items, so
        # it appears to come from an earlier version of this cell — confirm.
        if i%500==0:
            print('had run {} times'.format(i))
        content = news_data.at[i, 'content']
        fi.writelines(str(view)+'\n' for view in content2view(content))

had run 0 times
had run 100 times
had run 200 times
had run 300 times
had run 400 times
had run 500 times
had run 600 times
had run 700 times
had run 800 times
had run 900 times
had run 1000 times
had run 1100 times
had run 1200 times
had run 1300 times
had run 1400 times
had run 1500 times
had run 1600 times
had run 1700 times
had run 1800 times
had run 1900 times
had run 2000 times
had run 2100 times
had run 2200 times
had run 2300 times
had run 2400 times
