In [2]:
import pandas as pd
import numpy as np
import jieba
import re
from collections import defaultdict
import sys
from operator import itemgetter
from gensim.models import word2vec
from gensim.models.word2vec import Word2Vec

## 构造text-rank抽取关键词

# 数据准备

In [8]:
import pandas as pd

# Alternative (macOS) location of the same corpus file:
# "/Users/junjiexie/Documents/NLP学习/nlp文本摘要项目/sqlResult_1558435.csv"
han_filename = r"C:\NLP学习备用\sqlResult_1558435.csv"
# The CSV is read with the GB18030 codec.
data = pd.read_csv(han_filename,encoding="GB18030")
# pandas represents empty cells as NaN (a float). Replace them with "" so that
# every entry of `articles` is a str and the regex / sentence-splitting helpers
# below do not fail on rows that have no body text.
articles = data["content"].fillna("").tolist()

In [14]:
def get_stopwords(path=r"/Users/junjiexie/Documents/NLP学习/nlp第九课/停用词表.txt"):
    """Load the stop-word list as a set.

    Args:
        path: UTF-8 text file containing one stop word per line. Defaults to
            the location that used to be hard-coded in this function.

    Returns:
        set[str]: every line of the file with surrounding whitespace stripped.
        Blank lines are kept as the empty string, exactly as before.
    """
    with open(path, encoding="utf-8") as f:
        # Iterating the file yields the same lines as the former readline loop.
        return {line.strip() for line in f}

def token(string):
    """Return every maximal run of word characters (``\\w+``) in *string*.

    Punctuation and whitespace act as separators and are dropped.
    """
    # Raw string: '\w' in a normal string literal is an invalid escape sequence.
    return re.findall(r'\w+', string)

In [15]:
def sentences_deal(sentences):
    """Tokenise raw text with jieba and drop stop words.

    Args:
        sentences: the raw text (str).

    Returns:
        list[str]: jieba tokens of the text, in order, with every token found
        in ``get_stopwords()`` removed. Empty input yields an empty list.
    """
    # Keep only word characters; token() discards punctuation and whitespace.
    cleaned = "".join(token(sentences))
    stopwords = get_stopwords()
    # Consume jieba's generator directly. The previous join/split round-trip
    # produced a spurious "" token for empty input, and the old local names
    # shadowed the builtins `input` and `str`.
    return [word for word in jieba.cut(cleaned) if word not in stopwords]
        

In [48]:
# 定义无向有权图
class UndirectWeightedGraph:
    """Undirected weighted graph with a PageRank-style node ranking (TextRank)."""

    # Damping factor used in the rank update.
    d = 0.85

    def __init__(self):
        # node -> list of (node, neighbour, weight) tuples
        self.graph = defaultdict(list)

    def addEdge(self, start, end, weight):
        """Add an undirected edge by storing it once in each direction.

        Edges are plain ``(from, to, weight)`` tuples rather than Edge objects.
        """
        self.graph[start].append((start, end, weight))
        self.graph[end].append((end, start, weight))

    def rank(self, iterations=100):
        """Iteratively score every node and return the normalised scores.

        Args:
            iterations: number of update sweeps over all nodes (default 100,
                the value that used to be hard-coded).

        Returns:
            defaultdict(float): node -> normalised score. Empty for an empty
            graph.
        """
        # Current score of each node.
        ws = defaultdict(float)
        # Sum of the weights of each node's edges.
        outSum = defaultdict(float)

        # Every node starts with the same score.
        wsdef = 1.0 / (len(self.graph) or 1.0)

        for n, out in self.graph.items():
            ws[n] = wsdef
            outSum[n] = sum((e[2] for e in out), 0.0)

        # Sorted keys make the in-place update order deterministic.
        sorted_keys = sorted(self.graph.keys())
        for _ in range(iterations):
            for n in sorted_keys:
                s = 0
                # Each neighbour contributes its score scaled by
                # (edge weight / neighbour's total edge weight).
                for e in self.graph[n]:
                    neighbour_out = outSum[e[1]]
                    if neighbour_out == 0:
                        # A neighbour whose edges all weigh 0 has nothing to
                        # distribute; skipping it avoids a ZeroDivisionError.
                        continue
                    s += e[2] / neighbour_out * ws[e[1]]
                ws[n] = (1 - self.d) + self.d * s

        # Sentinels: start the minimum at the largest float and the maximum at
        # the smallest positive float (same values as float_info[0] / [3]).
        min_rank, max_rank = sys.float_info.max, sys.float_info.min

        for w in ws.values():
            if w < min_rank:
                min_rank = w
            if w > max_rank:
                max_rank = w

        # Normalise so that the best node scores 1.0.
        for n, w in ws.items():
            ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)

        return ws

In [49]:
def cosine_similarity(x, y, norm=False):
    """Cosine similarity of two equal-length vectors.

    Args:
        x, y: sequences or arrays of numbers with the same length.
        norm: when True, map the result from [-1, 1] to [0, 1] via
            ``0.5 * cos + 0.5``.

    Returns:
        The cosine of the angle between ``x`` and ``y``. If either vector is
        all zeros the result is defined as 1.0 when both are zero and 0.0
        otherwise (this special case is returned as-is, regardless of
        ``norm``).

    Raises:
        AssertionError: if the two vectors differ in length.
    """
    assert len(x) == len(y), "len(x) != len(y)"
    vec_x = np.asarray(x, dtype=float)
    vec_y = np.asarray(y, dtype=float)

    # The cosine is undefined for a zero vector; keep the original convention.
    x_is_zero = not vec_x.any()
    y_is_zero = not vec_y.any()
    if x_is_zero or y_is_zero:
        return float(1) if x_is_zero and y_is_zero else float(0)

    # Vectorised dot product / norms instead of building a per-element list.
    cos = np.dot(vec_x, vec_y) / (np.linalg.norm(vec_x) * np.linalg.norm(vec_y))

    return 0.5 * cos + 0.5 if norm else cos

def word_similarity(model, word1, word2):
    """Cosine similarity between the embedding vectors of two words.

    Args:
        model: a gensim ``Word2Vec`` model (vectors are read from its ``wv``
            attribute) or any mapping that supports ``obj[word]``.
        word1, word2: the words to compare.

    Returns:
        ``cosine_similarity`` of the two vectors, or 0.001 (a near-zero
        placeholder) when either word is missing from the vocabulary.
    """
    # Indexing the Word2Vec object itself no longer works in gensim 4; go
    # through `.wv` when present, as get_similar_word() already does.
    vectors = getattr(model, "wv", model)
    try:
        word1_vec = vectors[word1]
        word2_vec = vectors[word2]
    except KeyError:
        # Out-of-vocabulary word: similarity cannot be computed.
        return 0.001

    return cosine_similarity(word1_vec, word2_vec)

In [50]:
def textrank(sentences, topK=10, span_num=2, model=None):
    """Extract keywords with TextRank, using word2vec similarity as edge weight.

    Args:
        sentences: raw text to analyse.
        topK: number of keywords to return; a falsy value returns all of them.
        span_num: window size; each word is linked to the following
            ``span_num - 1`` words.
        model: an already loaded word2vec model. When omitted the model is
            loaded from the default path, which is slow, so pass it in when
            calling this function repeatedly.

    Returns:
        list[tuple[str, float]]: ``(word, score)`` pairs sorted by descending
        score.
    """
    if model is None:
        path = '/Users/junjiexie/OursRepository/text-abstract-extraction/Data/wiki_han_word2vec_300维度.model'
        model = Word2Vec.load(path)

    g = UndirectWeightedGraph()
    # (word, following word) -> similarity; computed once per distinct pair.
    cm = {}
    words = sentences_deal(sentences)

    for i, wp in enumerate(words):
        # Pair word i with the words after it inside the window, never
        # running past the end of the text.
        for j in range(i + 1, min(i + span_num, len(words))):
            pair = (wp, words[j])
            if pair not in cm:
                cm[pair] = word_similarity(model=model, word1=wp, word2=words[j])

    # jieba weights an edge by co-occurrence count; here the weight is the
    # word2vec similarity of the two words.
    for terms, w in cm.items():
        g.addEdge(terms[0], terms[1], w)

    nodes_rank = g.rank()

    tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)

    if topK:
        return tags[:topK]
    return tags



In [52]:
textrank(articles[0])

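# Note: using word2vec similarity as the edge weight feels questionable — unlike a co-occurrence count it can be zero or negative, which the TextRank update does not expect.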



[('发布', 1.0),
 ('更新', 0.6395818438752795),
 ('月', 0.2810899844929091),
 ('本周', 0.18849534996885373),
 ('含', 0.15672863164471557),
 ('开发', 0.12142483512437663),
 ('更新换代', 0.11155062740108165),
 ('版', 0.09465604762329465),
 ('去年', 0.09103145940670385),
 ('外', 0.08444310231583688)]

In [53]:
textrank(articles[1])




[('进入', 1.0),
 ('澎湃', 0.5847296259939706),
 ('拿到', 0.0548973629047256),
 ('不会', 0.0546180254984679),
 ('强调', 0.05460621538170052),
 ('考虑', 0.05459062935145518),
 ('PCB', 0.0545797095417014),
 ('空间', 0.05457955167302171),
 ('30', 0.054579460817448286),
 ('按计划', 0.05457935287802686)]

In [54]:
textrank(articles[2])





[('手机', 1.0),
 ('缩水', 0.29252452749916613),
 ('大屏', 0.2602298390761203),
 ('拥有', 0.21514176678840113),
 ('旗舰', 0.20898436679888105),
 ('AMOLED', 0.20599958350122996),
 ('虎', 0.2046873581381779),
 ('可能', 0.20028676493912392),
 ('应该', 0.19264664010165564),
 ('掌握', 0.19159077476197756)]

# 完成选做一,要使用到pyltp，转到Windows平台

## 句子主要成分提取

In [114]:
from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer
from gensim.models import word2vec
from gensim.models.word2vec import Word2Vec
import re

In [88]:
import sys
import os
from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer,SentenceSplitter
MODELDIR='D:/LTP/ltp_data'
# Directory the models are actually loaded from. The previous calls joined
# MODELDIR with an absolute "C:\..." path; os.path.join discards every earlier
# component when it meets an absolute one, so MODELDIR never took part in the
# result. MODELDIR is left defined in case other cells read it.
LTP_MODEL_DIR = r"C:\ltp_data_v3.4.0"

# Stop-word list, one word per line. The context manager closes the file
# handle, which the former bare open(...).readlines() never did.
with open(r"C:\NLP学习备用\停用词表.txt", 'r',encoding='utf-8') as stopword_file:
    stopwords = [line.strip() for line in stopword_file]

# Word segmentation model
segmentor = Segmentor()
segmentor.load(os.path.join(LTP_MODEL_DIR, "cws.model"))

# Part-of-speech tagging model
postagger = Postagger()
postagger.load(os.path.join(LTP_MODEL_DIR, "pos.model"))

# Dependency parsing model
parser = Parser()
parser.load(os.path.join(LTP_MODEL_DIR, "parser.model"))

# Named-entity recognition model
recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(LTP_MODEL_DIR, "ner.model"))

In [89]:
#借鉴了github某位大神的code，剥离核心部分
def build_parse_child_dict(words, postags, arcs):
    """Build, for every word, a dict of its dependency children.

    Args:
        words: list of segmented words.
        postags: part-of-speech tags (unused here; kept for the call
            signature).
        arcs: dependency arcs; ``arcs[i].head`` is the 1-based index of word
            i's head (no word has index 0) and ``arcs[i].relation`` the
            relation label.

    Returns:
        list[dict[str, list[int]]]: one dict per word mapping a relation label
        to the ascending indexes of the children attached with that relation.
    """
    child_dict_list = [dict() for _ in range(len(words))]
    # Single pass over the arcs instead of rescanning all arcs for each word.
    for arc_index in range(len(arcs)):
        arc = arcs[arc_index]
        head = arc.head
        # Heads outside 1..len(words) never matched any word before either.
        if 0 < head <= len(words):
            child_dict_list[head - 1].setdefault(arc.relation, []).append(arc_index)
    return child_dict_list

def complete_e(words, postags, child_dict_list, word_index):
    """Expand the word at *word_index* into a fuller phrase.

    The word is prefixed with the recursive expansion of each of its ATT
    children. If the word is tagged 'v', the expansion of its first VOB child
    is appended and the expansion of its first SBV child is put in front of
    everything else.

    Returns:
        str: the concatenated phrase.
    """
    children = child_dict_list[word_index]

    # Attributive modifiers, expanded in order.
    before = ''.join(
        complete_e(words, postags, child_dict_list, att_index)
        for att_index in children.get('ATT', [])
    )

    after = ''
    if postags[word_index] == 'v':
        # Only verbs pull in their object and subject.
        if 'VOB' in children:
            after = complete_e(words, postags, child_dict_list, children['VOB'][0])
        if 'SBV' in children:
            subject = complete_e(words, postags, child_dict_list, children['SBV'][0])
            before = subject + before

    return before + words[word_index] + after

In [90]:
def fact_triple_extract(sentence):
    """Extract one fact triple from a sentence.

    The sentence is segmented, stop words are removed, and the remaining words
    are POS-tagged, NER-tagged and dependency-parsed with the module-level LTP
    models. Words are then scanned left to right and the first pattern that
    matches is returned.

    Args:
        sentence: the sentence to process.

    Returns:
        list[str] | None: ``[e1, r, e2]`` for the first match, or None when no
        pattern matches.
    """
    # Segment, then drop stop words before tagging and parsing.
    words = [word for word in segmentor.segment(sentence) if word not in stopwords]
    postags = postagger.postag(words)
    netags = recognizer.recognize(words, postags)
    arcs = parser.parse(words, postags)
    # Upper bound for the entity scans below.
    last_index = len(words) - 1

    child_dict_list = build_parse_child_dict(words, postags, arcs)
    for index in range(len(postags)):
        # Triples centred on a predicate (verb).
        if postags[index] == 'v':
            child_dict = child_dict_list[index]
            # Subject - verb - object.
            if 'SBV' in child_dict and 'VOB' in child_dict:
                e1 = complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                r = words[index]
                e2 = complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
                return [e1, r, e2]

            # Verb used as an attributive of a noun, with its own object.
            if arcs[index].relation == 'ATT':
                if 'VOB' in child_dict:
                    e1 = complete_e(words, postags, child_dict_list, arcs[index].head - 1)
                    r = words[index]
                    e2 = complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
                    temp_string = r+e2
                    # The expanded head starts with "verb + object"; strip it.
                    if temp_string == e1[:len(temp_string)]:
                        e1 = e1[len(temp_string):]
                    if temp_string not in e1:
                        # Return the triple like every other branch; this used
                        # to be `return print(...)`, which returned None.
                        return [e1, r, e2]

            # Subject - verb - complement where the complement governs a
            # prepositional object.
            if 'SBV' in child_dict and 'CMP' in child_dict:
                e1 = complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                cmp_index = child_dict['CMP'][0]
                r = words[index] + words[cmp_index]
                if 'POB' in child_dict_list[cmp_index]:
                    e2 = complete_e(words, postags, child_dict_list, child_dict_list[cmp_index]['POB'][0])
                    return [e1, r, e2]


        # Triples anchored on a named entity (tag starts with S or B).
        if netags[index][0] == 'S' or netags[index][0] == 'B':
            ni = index
            if netags[ni][0] == 'B':
                # Walk to the closing E tag, but never past the last word.
                while ni < last_index and netags[ni][0] != 'E':
                    ni += 1
                e1 = ''.join(words[index:ni+1])
            else:
                e1 = words[ni]
            if arcs[ni].relation == 'ATT' and postags[arcs[ni].head-1] == 'n' and netags[arcs[ni].head-1] == 'O':
                r = complete_e(words, postags, child_dict_list, arcs[ni].head-1)
                if e1 in r:
                    r = r[(r.index(e1)+len(e1)):]
                if arcs[arcs[ni].head-1].relation == 'ATT' and netags[arcs[arcs[ni].head-1].head-1] != 'O':
                    e2 = complete_e(words, postags, child_dict_list, arcs[arcs[ni].head-1].head-1)
                    mi = arcs[arcs[ni].head-1].head-1
                    li = mi
                    if netags[mi][0] == 'B':
                        # Same bounded scan for the second entity.
                        while mi < last_index and netags[mi][0] != 'E':
                            mi += 1
                        e = ''.join(words[li+1:mi+1])
                        e2 += e
                    if r in e2:
                        e2 = e2[(e2.index(r)+len(r)):]
                    if r+e2 in sentence:
                        return [e1, r, e2]

In [93]:
fact_triple_extract("2020年美国遭受了严重的海啸袭击")

['美国', '遭受', '严重海啸袭击']

In [95]:
fact_triple_extract("小明有生之年还没见过大海")

['小明', '见', '大海']

## 找出是否含有说的意思的句子

### 最简单粗暴的方法是，把与说最相近的词都出来，正则匹配

In [25]:
# Load the word2vec model from a local file (the file name suggests 300-dimensional
# wiki vectors — TODO confirm). The resulting module-level `model` is read by
# get_similar_word() below.
path = r'C:\NLP学习备用\wiki_han_word2vec_300维度.model'
model = Word2Vec.load(path)

In [75]:
def get_similar_word(topn=100):
    """Return the ``topn`` words most similar to "说" plus "说" itself.

    The neighbours are looked up in the module-level word2vec ``model``.

    Returns:
        set[str]: the neighbour words together with "说".
    """
    neighbours = {hit[0] for hit in model.wv.similar_by_word("说", topn=topn)}
    return neighbours | {"说"}

In [79]:
#多少还是会取决于pyltp的性能
def find_contain_say_sentence(sentences):
    """Return the sentences of a text that contain a "say"-like word.

    The text is split with ``SentenceSplitter``; a sentence is kept when any
    word from ``get_similar_word()`` occurs in it as a substring.

    Args:
        sentences: the full text of an article.

    Returns:
        list[str]: matching sentences in their original order.
    """
    if not sentences:
        return []

    similar_word = get_similar_word()
    output_sentence = []
    # Iterate the splitter result directly. The former "cut".join(...)
    # .split("cut") round-trip broke any sentence containing the text "cut",
    # and a Segmentor model was loaded on every call without being used.
    for sentence in SentenceSplitter.split(sentences):
        sentence = str(sentence)
        # Skip empty fragments (the old check tested `sentences` by mistake).
        if not sentence:
            continue
        if any(word in sentence for word in similar_word):
            output_sentence.append(sentence)

    return output_sentence

In [80]:
find_contain_say_sentence(articles[0])

['有人猜测这也是将精力主要用到MIUI 9的研发之中。', '当然，关于MIUI 9的确切信息，我们还是等待官方消息。']

In [81]:
find_contain_say_sentence(articles[2])

['至于电池缩水，可能与刘作虎所说，一加手机5要做市面最轻薄大屏旗舰的设定有关。']

In [84]:
find_contain_say_sentence(articles[4])

['@深圳交警微博称：昨日清晨交警发现有一女子赤裸上身，行走在南坪快速上，期间还起了轻生年头，一辅警发现后赶紧为其披上黄衣，并一路劝说她。',
 '南都记者在龙岗大队坂田中队见到了辅警刘青（发现女生的辅警），一位外表高大帅气，说话略带些腼腆的90后青年。',
 '刘青介绍，6月16日早上7时36分，他正在环城南路附近值勤，接到中队关于一位女子裸身进入机动车可能有危险的警情，随后骑着小铁骑开始沿路寻找，大概花了十多分钟在南坪大道坂田出口往龙岗方向的逆行辅道上发现该女子。',
 '刘青停好小铁骑，和另外一名巡防员追了上去，发现女子的情绪很低落，话不多，刘青尝试和女子交流，劝说女子离开，可女子并不愿意接受，继续缓慢地往南坪快速路的主干道上走去。',
 '当女子行进到十字路口中间时，一辆大货车挡住了镜头，但是当女子再次出现镜头时，可以发现女子已经没穿内裤了，全身裸露继续朝着南坪快速方向走去。',
 '刘青表示，“一开始根本不敢看她，心里挺别扭，感觉很尴尬”，但当刘青跟随女子上了南坪快速路主干道时，女子作出了让人意想不到的举动，她突然靠近护栏要从上面跳下去，刘青赶忙冲上去拉住了女子的手，将其控制住并远离护栏。',
 '就这样，我被牵着走了大概十多分钟，天突然下起了大暴雨，雨大的连眼睛都睁不开”刘青继续说着，瞬间他们就被雨透了，但女子依然不愿意接受刘青的帮助，就继续冒着大雨往前走。',
 '大概走了有四十分钟吧，女子突然停下来说“我想回家了”，然后女子也接受了刘青递过来的小黄衣，就出现了深圳微博上的照片，女子披着小黄衣，刘青小心翼翼地在旁边走着的场景。',
 '才会说',
 '据警方透露，该女子姓陈，系湖北人，今年44岁，据家属反映其有精神病史。',
 '其实真爱的到来并不存在年龄的限制',
 '你们说呢？',
 '@弓常yan桦：就想问这个小哥哥有女票吗',
 '去年6月7号上午，淮安市涟水县公安局刑警大队突然接到了一个奇怪的报警电话，一名女子言语不清，声称自己遭到了侵害。']

## 检测谓语中是否有说的意思

In [125]:
def word_splitter(sentence, segmentor=segmentor):
    """Segment *sentence* into words and return them as a list.

    ``segmentor`` defaults to the module-level LTP segmentor.
    """
    return list(segmentor.segment(sentence))

def word_tag(words, postagger=postagger):
    """Return the part-of-speech tags produced by ``postagger`` for *words*.

    ``postagger`` defaults to the module-level LTP tagger.
    """
    return postagger.postag(words)

def word_parse(words, postags, parser=parser):
    """Dependency-parse a tagged sentence.

    Args:
        words: segmented words.
        postags: part-of-speech tags for ``words``.
        parser: parser to use; defaults to the module-level LTP parser.

    Returns:
        list[tuple]: one ``(head, relation)`` pair per arc returned by the
        parser, in order.
    """
    return [(arc.head, arc.relation) for arc in parser.parse(words, postags)]

In [140]:
#我的理解是依存句法分析的root节点多数是谓语
def find_predicate_contain_say_sentence(sentences, verbose=True):
    """Return the sentences whose head word (HED arc) is a "say"-like word.

    Each sentence is segmented, tagged and dependency-parsed; the word carrying
    the first "HED" relation is treated as the predicate, and the sentence is
    kept when any word from ``get_similar_word()`` occurs inside it.

    Args:
        sentences: the full text of an article.
        verbose: print each detected predicate (default True, matching the
            previous behaviour).

    Returns:
        list[str]: matching sentences in their original order.
    """
    if not sentences:
        return []

    similar_word = get_similar_word()
    output_sentence = []
    # Iterate the splitter result directly. The former "cut".join(...)
    # .split("cut") round-trip broke any sentence containing the text "cut",
    # and a Segmentor model was loaded on every call without being used.
    for sentence in SentenceSplitter.split(sentences):
        sentence = str(sentence)
        # Skip empty fragments (the old check tested `sentences` by mistake).
        if not sentence:
            continue

        # Segmentation, POS tagging and dependency parsing.
        words = word_splitter(sentence)
        tag = word_tag(words)
        parse = word_parse(words, tag)
        for count_parse, element in enumerate(parse):
            # Only the first HED arc is inspected.
            if element[1] == "HED":
                predicate = words[count_parse]
                if verbose:
                    print(predicate)
                if any(word in predicate for word in similar_word):
                    output_sentence.append(sentence)
                break

    return output_sentence
    

In [141]:
find_predicate_contain_say_sentence(articles[0])

暂停
是
发布
等待


[]

In [142]:
find_predicate_contain_say_sentence(articles[1])

强调
联手
称
显示
首发
是


['骁龙835作为唯一通过Windows 10桌面平台认证的ARM处理器，高通强调，不会因为只考虑性能而去屏蔽掉小核心。',
 '报道称，微软已经拿到了一些新的源码，以便Windows 10更好地理解big.little架构。']

In [143]:
find_predicate_contain_say_sentence(articles[2])

是
说
拥有
是


['至于电池缩水，可能与刘作虎所说，一加手机5要做市面最轻薄大屏旗舰的设定有关。']