In [1]:
from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer

ModuleNotFoundError: No module named 'pyltp'

# 分词使用

In [2]:
def word_splitter(sentence, model_path=r"D:\MyNLP\ltp_data_v3.4.0\cws.model"):
    """
    Segment a sentence into words with the LTP segmentation model.

    The segmentation result is also printed: first the raw result object,
    then one word per line.

    :param sentence: text to segment
    :param model_path: location of the segmentation model file; defaults to
        the path this notebook has always used
    :return: list of the segmented words
    """
    segmentor = Segmentor()  # create the segmenter instance
    segmentor.load(model_path)  # load the model
    try:
        words = segmentor.segment(sentence)  # run segmentation
        # Copy into a plain list before the model is released.
        words_list = list(words)
        print("分词结果：")
        print(words)
        for word in words_list:
            print(word)
        return words_list
    finally:
        # Release the model even when segmentation or printing fails.
        segmentor.release()


# 词性标注

In [4]:
def word_tag(words, model_path=r"D:\MyNLP\ltp_data_v3.4.0\pos.model"):
    """
    Assign a part-of-speech tag to every word with the LTP POS model.

    Each word is printed together with its tag as ``word:tag``.

    :param words: words that have already been segmented
    :param model_path: location of the POS model file; defaults to the path
        this notebook has always used
    :return: the tags produced by the tagger, one per word
    """
    postagger = Postagger()  # create the tagger instance
    postagger.load(model_path)  # load the model
    try:
        postags = postagger.postag(words)  # run POS tagging
        print("词性标注结果：")
        for word, tag in zip(words, postags):
            print(word+':'+tag)
        return postags
    finally:
        # Release the model even when tagging or printing fails.
        postagger.release()


# 依存句法分析

In [5]:
def parse(words, postags, model_path=r"D:\MyNLP\ltp_data_v3.4.0\parser.model"):
    """
    Run dependency parsing and print the result.

    Every arc is printed as ``head:relation``, tab-separated, in word order.

    :param words: segmented words
    :param postags: part-of-speech tags aligned with ``words``
    :param model_path: location of the parser model file; defaults to the
        path this notebook has always used
    :return: None (the result is only printed)
    """
    parser = Parser()  # create the parser instance
    parser.load(model_path)  # load the model
    try:
        arcs = parser.parse(words, postags)  # run dependency parsing
        print("句法分析结果：")
        print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
    finally:
        # Release the model even when parsing or printing fails.
        parser.release()


# 测试

In [6]:
# End-to-end check: segment one sentence, tag it, then parse it.
sample_sentence = '我研究的方向是自然语言处理，所以想试试这个模型。'
words = word_splitter(sample_sentence)
tags = word_tag(words)
parse(words, tags)

分词结果：
<pyltp.VectorOfString object at 0x0000017C95F95F30>
我
研究
的
方向
是
自然
语言
处理
，
所以
想
试试
这个
模型
。
词性标注结果：
我:r
研究:v
的:u
方向:n
是:v
自然:n
语言:n
处理:v
，:wp
所以:c
想:v
试试:v
这个:r
模型:n
。:wp
句法分析结果：
2:SBV	4:ATT	2:RAD	5:SBV	0:HED	7:ATT	8:ATT	5:VOB	5:WP	11:ADV	5:COO	11:VOB	14:ATT	12:VOB	5:WP


# 三元组抽取函数

In [1]:
import sys
import os
from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer,SentenceSplitter
# Directory that holds the LTP model files. The previous value was silently
# discarded because every os.path.join call below was given an absolute path;
# this is the directory the models were actually loaded from.
MODELDIR = r"D:\MyNLP\ltp_data_v3.4.0"

# Read the stopword list (one entry per line); the file is closed as soon as
# it has been read.
with open(r"D:\机器学习大project\nlp部分\主题模型\stopwords.txt", 'r', encoding='utf-8') as stopword_file:
    stopwords = [line.strip() for line in stopword_file]

# Load each LTP model once; the functions below share these instances.
segmentor = Segmentor()
segmentor.load(os.path.join(MODELDIR, "cws.model"))

postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))

parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))

recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))

In [4]:
def build_parse_child_dict(words, postags, arcs):
    """
    Build, for every word, a dict of its dependency children grouped by relation.

    Args:
        words: list of segmented words
        postags: list of part-of-speech tags (not used here; kept so the
            signature stays the same for callers)
        arcs: dependency arcs; each arc exposes ``head`` (1-based index of the
            parent word, as used by the comparison with ``index + 1``) and
            ``relation``

    Returns:
        A list with one dict per word. Each dict maps a relation name to the
        list of arc indices (ascending) whose head is that word.
    """
    word_count = len(words)
    child_dict_list = [dict() for _ in range(word_count)]
    # One pass over the arcs instead of rescanning all arcs for every word.
    for arc_index, arc in enumerate(arcs):
        head = arc.head
        # Heads that do not point at one of the words (e.g. 0) have no entry.
        if 1 <= head <= word_count:
            child_dict_list[head - 1].setdefault(arc.relation, []).append(arc_index)
    return child_dict_list

def complete_e(words, postags, child_dict_list, word_index):
    """
    Expand the word at ``word_index`` into a fuller phrase.

    Every 'ATT' child is expanded recursively and placed in front of the
    word. When the word is tagged 'v', its first 'VOB' child is expanded and
    placed after it, and its first 'SBV' child is expanded and placed in
    front of everything else.

    Returns:
        The concatenated phrase as a string.
    """
    children = child_dict_list[word_index]

    # Modifiers in front of the word, in arc order.
    leading = ''.join(
        complete_e(words, postags, child_dict_list, att_index)
        for att_index in children.get('ATT', ())
    )

    trailing = ''
    if postags[word_index] == 'v':
        object_indices = children.get('VOB')
        if object_indices is not None:
            trailing = complete_e(words, postags, child_dict_list, object_indices[0])
        subject_indices = children.get('SBV')
        if subject_indices is not None:
            subject_text = complete_e(words, postags, child_dict_list, subject_indices[0])
            leading = subject_text + leading

    return leading + words[word_index] + trailing

In [14]:
def _entity_end_index(netags, start):
    """Return the index of the tag that closes the entity opened at ``start``.

    Scans forward from ``start`` until a tag whose first character is 'E' is
    found. If the tags run out first, the last index is returned instead of
    walking past the end of the sequence.
    """
    last_index = len(netags) - 1
    end = start
    while end < last_index and netags[end][0] != 'E':
        end += 1
    return end


def fact_triple_extract(sentence):
    """
    Extract one fact triple from a sentence.

    The sentence is segmented, stopwords are removed, and the remaining words
    are POS-tagged, NER-tagged and dependency-parsed with the module-level
    models. The words are then scanned left to right and the first pattern
    that matches is returned.

    Args:
        sentence: the sentence to process

    Returns:
        ``[e1, r, e2]`` for the first matching pattern, or ``None`` when no
        pattern matches (including when no words remain after stopword
        removal). Every pattern, the attributive-clause one included, returns
        the triple rather than printing it.
    """
    # Stopwords are dropped before tagging and parsing.
    words = [word for word in segmentor.segment(sentence) if word not in stopwords]
    if not words:
        return None

    postags = postagger.postag(words)
    netags = recognizer.recognize(words, postags)
    arcs = parser.parse(words, postags)

    child_dict_list = build_parse_child_dict(words, postags, arcs)
    for index in range(len(postags)):
        # Triples centred on a predicate (verb).
        if postags[index] == 'v':
            child_dict = child_dict_list[index]

            # Subject - verb - object.
            if 'SBV' in child_dict and 'VOB' in child_dict:
                e1 = complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                r = words[index]
                e2 = complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
                return [e1, r, e2]

            # Verb acting as an attributive modifier that has its own object.
            if arcs[index].relation == 'ATT' and 'VOB' in child_dict:
                e1 = complete_e(words, postags, child_dict_list, arcs[index].head - 1)
                r = words[index]
                e2 = complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
                temp_string = r + e2
                # The expanded head already starts with "verb + object"; strip it.
                if temp_string == e1[:len(temp_string)]:
                    e1 = e1[len(temp_string):]
                if temp_string not in e1:
                    return [e1, r, e2]

            # Subject - verb + complement, with a prepositional object.
            if 'SBV' in child_dict and 'CMP' in child_dict:
                e1 = complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                cmp_index = child_dict['CMP'][0]
                r = words[index] + words[cmp_index]
                if 'POB' in child_dict_list[cmp_index]:
                    e2 = complete_e(words, postags, child_dict_list, child_dict_list[cmp_index]['POB'][0])
                    return [e1, r, e2]

        # Triples built around a named entity.
        if netags[index][0] == 'S' or netags[index][0] == 'B':
            ni = index
            if netags[ni][0] == 'B':
                # Multi-word entity: join every word up to the closing tag.
                ni = _entity_end_index(netags, ni)
                e1 = ''.join(words[index:ni+1])
            else:
                e1 = words[ni]

            head_index = arcs[ni].head - 1
            if arcs[ni].relation == 'ATT' and postags[head_index] == 'n' and netags[head_index] == 'O':
                r = complete_e(words, postags, child_dict_list, head_index)
                # Keep only the part of the relation that follows the entity.
                if e1 in r:
                    r = r[(r.index(e1)+len(e1)):]
                if arcs[head_index].relation == 'ATT' and netags[arcs[head_index].head-1] != 'O':
                    grand_index = arcs[head_index].head - 1
                    e2 = complete_e(words, postags, child_dict_list, grand_index)
                    if netags[grand_index][0] == 'B':
                        # Append the remaining words of the second entity.
                        mi = _entity_end_index(netags, grand_index)
                        e2 += ''.join(words[grand_index+1:mi+1])
                    if r in e2:
                        e2 = e2[(e2.index(r)+len(r)):]
                    if r+e2 in sentence:
                        return [e1, r, e2]

    return None

In [20]:
# Try the extractor on one short sentence and show whatever it returns.
triple = fact_triple_extract('我真的超级喜欢大数据的')
print(triple)

None
