## 数据库

In [1]:
import pymysql
import configparser

def get_mysql_data(sql, mysql_config_name=None):
    """Run ``sql`` against MySQL and return every fetched row.

    Connection settings are read from a section of ``../config/config.ini``
    (keys: ``host``, ``port``, ``user``, ``password``, ``database``).

    Args:
        sql: SQL statement to execute. It is passed to the server verbatim,
            so it must not be assembled from untrusted input.
        mysql_config_name: Name of the config section to use. ``None`` selects
            the ``mysql_nlp_tagging`` section.

    Returns:
        The result of ``cursor.fetchall()`` (a tuple of row tuples).

    Raises:
        Exception: Any config or connection error when an explicit
            ``mysql_config_name`` is given, and any error raised while
            executing ``sql``.
    """
    section = 'mysql_nlp_tagging' if mysql_config_name is None else mysql_config_name
    con = configparser.RawConfigParser()
    con.read('../config/config.ini', encoding='utf-8')

    try:
        mysql = dict(con.items(section))
        connection = pymysql.connect(host=mysql['host'], port=int(mysql['port']), user=mysql['user'],password=mysql['password'], db=mysql['database'], charset='utf8mb4')
    except Exception:
        # An explicitly requested section must work; only the default section
        # keeps the historical best-effort fallback below.
        if mysql_config_name is not None:
            raise
        # SECURITY NOTE(review): credentials are hard-coded in source. Kept for
        # backward compatibility; move them to the config file / environment.
        connection = pymysql.connect(host='192.168.100.50', port=3306, user='root',password='Aid@pro888888', db='nlp_tagging', charset='utf8mb4')

    # Always release the cursor and the connection, even if the query fails.
    try:
        cursor = connection.cursor()
        try:
            cursor.execute(sql)
            return cursor.fetchall()
        finally:
            cursor.close()
    finally:
        connection.close()


# neo4j数据转化与接口

In [None]:
import sys
import re, os
import json 
import requests
from py2neo import Node, Relationship, Graph, NodeMatcher, RelationshipMatcher
from py2neo import NodeMatcher, RelationshipMatcher
# Show the current working directory and the interpreter version.
print(os.getcwd())
print(sys.version)

In [None]:
class Neo4jToJson(object):
    """Knowledge-graph data interface: query Neo4j and serialise the result as JSON.

    ``links`` and ``nodes`` hold the relationships / nodes collected by the
    most recent :meth:`post` call.
    """

    # Punctuation treated as token separators in the ``str()`` form of a
    # record. Same character set as the original non-raw pattern, written as
    # a raw string so it contains no invalid escape sequences.
    _SEPARATORS = re.compile(r"[!%\[\],。{}\-:'()>]")

    def __init__(self):
        """Connect to the Neo4j server and start with empty result buffers."""
        self.graph = Graph("http://192.168.200.155:7474", auth=("neo4j", "aid_neo4j"))
        self.links = []
        self.nodes = []

    @classmethod
    def _tokens(cls, record):
        """Split ``str(record)`` on separator punctuation, keeping tokens longer than one char."""
        return [tok for tok in cls._SEPARATORS.sub(" ", str(record)).split(' ') if len(tok) > 1]

    def post(self, select_name=None):
        """Return graph data for ``select_name`` as a JSON string.

        Args:
            select_name: Node name sent by the front end. ``None`` falls back
                to '布地奈德'. If no node has exactly this name, the whole
                graph is returned instead.

        Returns:
            JSON text of the form ``{"links": [...], "nodes": [...]}`` with
            non-breaking spaces removed.
        """
        if select_name is None:
            select_name = '布地奈德'

        # Start from a clean slate so repeated calls on one instance do not
        # accumulate the results of earlier requests.
        self.links = []
        self.nodes = []

        # Fetch every node once; reused below for the "return everything" case.
        nodes_data_all = self.graph.run("MATCH (n) RETURN n").data()
        name_exists = any(node['n']['name'] == select_name for node in nodes_data_all)

        if name_exists:
            # The name is passed as a query parameter rather than formatted
            # into the Cypher text, so quotes in it cannot alter the query.
            # It is still interpreted as part of a regular expression.
            pattern = '(?i).*{}.*'.format(select_name)
            links_data = self.graph.run(
                "MATCH (n)-[r]-(b) WHERE n.name =~ $pattern RETURN r", pattern=pattern).data()
            nodes_data = self.graph.run(
                "MATCH (n)--(b) WHERE n.name =~ $pattern RETURN n,b", pattern=pattern).data()
            self.get_select_nodes(nodes_data)
        else:
            links_data = self.graph.run("MATCH ()-[r]->() RETURN r").data()
            self.get_all_nodes(nodes_data_all)

        self.get_links(links_data)

        neo4j_data = {'links': self.links, 'nodes': self.nodes}
        return json.dumps(neo4j_data, ensure_ascii=False).replace(u'\xa0', u'')

    def get_links(self, links_data):
        """Append relationships parsed from ``links_data`` to ``self.links``.

        Each item must provide ``item['r']``; its string form is tokenised and
        the first three tokens are taken as source, relation name and target.
        Records with fewer than three tokens are skipped.

        Returns:
            ``self.links``.
        """
        for link in links_data:
            links_str = self._tokens(link['r'])
            if len(links_str) >= 3:
                self.links.append({'source':links_str[0], 'name':links_str[1], 'target':links_str[2]})
        return self.links

    def get_select_nodes(self, nodes_data):
        """Append the node pairs parsed from ``nodes_data`` to ``self.nodes``.

        The string form of each record is tokenised; tokens 1/3 are used as
        tag/name of the first node and tokens 5/7 as tag/name of the second.
        Records with fewer than eight tokens are skipped.

        Returns:
            ``self.nodes``.
        """
        for node in nodes_data:
            node_str = self._tokens(node)
            if len(node_str) >= 8:
                # NOTE(review): a pair is skipped only when BOTH tags equal
                # 'Node'; confirm that `or` (rather than `and`) is intended.
                if node_str[1] != 'Node' or node_str[5] != 'Node':
                    self.nodes.append({'name':node_str[3], 'tag':node_str[1]})
                    self.nodes.append({'name':node_str[7], 'tag':node_str[5]})
        return self.nodes

    def get_all_nodes(self, nodes_data):
        """Append ``{'name', 'tag'}`` for every ``item['n']`` in ``nodes_data``.

        Returns:
            ``self.nodes``.
        """
        for node in nodes_data:
            record = node['n']
            self.nodes.append({'name': record['name'], 'tag': record['tag']})
        return self.nodes

def api_entity_recognize(texts):
    """POST ``texts`` to the medical entity-recognition service (v1.0).

    The text is sent as the form field ``text`` with a 60-second timeout.

    Returns:
        The ``requests`` response object, unparsed.
    """
    endpoint = 'http://192.168.200.155:7712/api/medical_entity_recognize'
    payload = dict(text=texts)
    return requests.post(endpoint, data=payload, timeout=60)

# Smoke test: send one sample title to the entity-recognition service and
# print both the raw response object and its decoded JSON body.
rules = api_entity_recognize(texts='布地奈德雾化吸⼊治疗⼩⼉哮喘临床疗效观察')
print(rules)
print(rules.json())

# Disabled: build the full graph JSON through Neo4jToJson.
# data_neo4j = Neo4jToJson()
# data_neo4j.post()

In [None]:
# Connect to Neo4j and create node / relationship matchers on the connection.
graph = Graph("http://192.168.200.155:7474", auth=("neo4j", "aid_neo4j"))
nodematcher= NodeMatcher(graph)
relamatcher = RelationshipMatcher(graph)

# NOTE(review): this first query is dead code; it is overwritten by the
# assignment right below before anything runs it.
match_str = "MATCH (na:drug)-[*1..10]->(nb) WHERE na.name=~'(?i).*布地奈德.*' RETURN (na)-[*1..10]->(nb)"
# For every drug that shares a neighbour with the disease '哮喘' but has no
# direct relationship to it, sum 1/log(degree of the shared neighbour) over
# the shared neighbours and list the drugs by that total, highest first.
match_str = '''MATCH p=(n1:drug)-[r1]-(n2)-[r2]-(n3:disease {name:'哮喘'})
WHERE not (n1)-[]-(n3)
WITH n1.name as Drug, n3.name as disease, n2.name as NEighborName, 1/log(size((n2)-[]-())) as AdamicAdarScore
RETURN Drug, disease, sum(AdamicAdarScore) as TotalAdamicAdarScore
ORDER BY TotalAdamicAdarScore DESC'''
results = graph.run(match_str)

# Disabled: parse node names out of the string form of path results
# (written for the first, path-returning query above).
# nodes = [] 
# for i in results:
#     node_str = re.sub("Path|Node|name=", "", str(i))
#     node_str = re.sub("[\!\%\[\]\,\。\{\}\-\:\'\(\)\>]", " ", node_str).split(' ')
#     node_str = [i for i in node_str if len(i)>1]
#     if len(node_str) == 7:
#         nodes.append(node_str[1])
#         nodes.append(node_str[6])

# nodes = list(set(nodes))
# print(nodes)

# Print every result row; i[0] is the first returned column (Drug).
# NOTE(review): rows whose drug name contains '布地奈德' are printed twice
# (once inside the `if`, once unconditionally). Confirm this is intended.
for i in results:
    if '布地奈德' in i[0]:
        print(i) 
    print(i)


# 知识图谱嵌入

### KGE
KGE就是将实体和关系嵌入到低维向量空间中，同时保留KG的结构和语义信息

现有的KGE方法可以划分为三类：

    1.基于翻译距离的(translational distance based)

    2.基于语义匹配的(semantic matching based)

    3.基于神经网络的(neural network based)

测试数据：[FB15K知识库](https://everest.hds.utc.fr/lib/exe/fetch.php?media=en:fb15k.tgz)

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
from collections import defaultdict
import argparse
from tqdm import tqdm
import os
import pandas as pd
import configparser

In [None]:
def load_data(data_dir, data_type):
    """Load a tab-separated triple file as a list of field lists.

    Args:
        data_dir: Directory prefix. It is concatenated with the file name
            as-is, so it must end with a path separator.
        data_type: File name without the ``.txt`` extension.

    Returns:
        One list of tab-separated fields per non-empty line. An empty file
        yields ``[]``.
    """
    # Read as UTF-8 explicitly so the result does not depend on the platform
    # default encoding.
    with open("%s%s.txt"%(data_dir, data_type), "r", encoding="utf-8") as f:
        content = f.read().strip()
    # Skip blank lines so they do not turn into bogus [''] rows.
    data = [line.split('\t') for line in content.split("\n") if line]
    print(len(data),data_type)
    return data

class KGE(nn.Module):
    """Bilinear knowledge-graph embedding model (RESCAL or DistMult).

    For a batch of (head, relation) index pairs the model scores every entity
    as a candidate tail and returns sigmoid probabilities.
    """

    def __init__(self,model_name,ent_vec_dim,num_entities,num_relations):
        '''
        Args:
            model_name: 'RESCAL' represents each relation as a matrix of shape
                (ent_vec_dim, ent_vec_dim); any other value selects DistMult,
                where each relation is a vector of size ent_vec_dim.
            ent_vec_dim: Dimension of each entity vector.
            num_entities: Total number of entities.
            num_relations: Total number of relations.
        '''
        super(KGE,self).__init__()
        # No padding_idx: index 0 is a real entity / relation id, and a
        # padding index would pin its embedding to zero and block its gradient.
        self.E=nn.Embedding(num_embeddings=num_entities,embedding_dim=ent_vec_dim)
        self.model_name=model_name
        self.ent_vec_dim=ent_vec_dim
        self.num_entities=num_entities
        if self.model_name=='RESCAL':
            # One flattened (ent_vec_dim x ent_vec_dim) matrix per relation.
            self.R=nn.Embedding(num_embeddings=num_relations,embedding_dim=ent_vec_dim*ent_vec_dim)
            self.scoreFun=self.RESCAL
        else:
            self.R=nn.Embedding(num_embeddings=num_relations,embedding_dim=ent_vec_dim)
            self.scoreFun=self.DistMult

    def RESCAL(self,head_embed,rel_embed):
        '''
        RESCAL score: head^T * M_rel * tail for every entity as tail.

        Inputs:
            head_embed.size()==(batch_size,self.ent_vec_dim)
            rel_embed.size()==(batch_size,self.ent_vec_dim*self.ent_vec_dim)
        Output:
            score.size()==(batch_size,self.num_entities)
        '''
        batch_size=head_embed.size(0)
        head_embed=head_embed.view(batch_size,1,self.ent_vec_dim)
        rel_embed=rel_embed.view(batch_size,self.ent_vec_dim,self.ent_vec_dim)
        # (batch,1,d) x (batch,d,d) -> (batch,1,d) -> (batch,d), then score
        # against every entity vector.
        projected=torch.squeeze(torch.bmm(head_embed,rel_embed),dim=1)
        score=torch.mm(projected,self.E.weight.transpose(1,0))
        return score

    def DistMult(self,head_embed,rel_embed):
        '''
        DistMult score: a simplified RESCAL where each relation is a vector
        (i.e. a diagonal relation matrix).

        Inputs:
            head_embed.size()==(batch_size,self.ent_vec_dim)
            rel_embed.size()==(batch_size,self.ent_vec_dim)
        Output:
            score.size()==(batch_size,self.num_entities)
        '''
        score=torch.mm(head_embed*rel_embed,self.E.weight.transpose(1,0))
        return score

    def forward(self,head_idx,rel_idx):
        '''
        Inputs:
            head_idx.size()==rel_idx.size()==(batch_size,)
        Output:
            probabilities.size()==(batch_size,self.num_entities),
            i.e. the predicted probability of each entity being the tail.
        '''
        batch_size=head_idx.size(0)
        score=self.scoreFun(head_embed=self.E(head_idx),rel_embed=self.R(rel_idx))
        assert score.size()==(batch_size,self.num_entities)
        probabilities=torch.sigmoid(score)
        return probabilities


In [None]:
# Load the data.
# Disabled: load the FB15k splits from disk instead of MySQL.
# train_data = load_data(data_dir='../data/FB15k/',data_type='freebase_mtr100_mte100-train')
# valid_data = load_data(data_dir='../data/FB15k/',data_type='freebase_mtr100_mte100-valid')
# test_data = load_data(data_dir='../data/FB15k/',data_type='freebase_mtr100_mte100-test')
data = get_mysql_data('SELECT entity,relation,object FROM triple LIMIT 1000;')
# Sequential 70% / 20% / 10% split in query order (no shuffling).
train_data = data[:int(len(data)*0.7)]
valid_data = data[int(len(data)*0.7):int(len(data)*0.9)]
test_data = data[int(len(data)*0.9):]
data = train_data+valid_data+test_data
print(len(data), data[0])

# Collect all head entities, tail entities and relations.
entities = sorted(list(set([d[0] for d in data]+[d[2] for d in data])))
print(len(entities), entities[0])
relations = sorted(list(set([d[1] for d in data])))
print(len(relations), relations[0])

# Build the entity -> id and relation -> id mappings (ids start at 0).
entity_idxs = {entities[i]:i for i in range(len(entities))}
relation_idxs = {relations[i]:i for i in range(len(relations))}

# Convert the training triples to [head_id, relation_id, tail_id].
train_data_idxs = [[entity_idxs[triplet[0]],relation_idxs[triplet[1]],entity_idxs[triplet[2]]] for triplet in train_data]

# Build one batch: group tails by (head, relation), take the first 4 pairs,
# and make a multi-hot target row over all entities for each pair.
er_vocab = defaultdict(list)
for triplet in train_data_idxs:
    er_vocab[(triplet[0],triplet[1])].append(triplet[2])
er_vocab_pairs = list(er_vocab.keys())
batch_inputs = er_vocab_pairs[:4]
batch_targets = torch.zeros([len(batch_inputs),len(entity_idxs)],dtype=torch.float32)
for i,pair in enumerate(batch_inputs):
    batch_targets[i,er_vocab[pair]] = 1
batch_inputs = np.array(batch_inputs)

print(batch_inputs)
print(batch_targets.size())
# NOTE(review): (3920,791) is a hard-coded key; because er_vocab is a
# defaultdict, a missing key prints [] and is inserted as a side effect.
print(er_vocab[(3920,791)])
print(sum(batch_targets[0]))

# Forward pass.
head_idx = torch.LongTensor(batch_inputs[:,0])
rel_idx = torch.LongTensor(batch_inputs[:,1])

# NOTE: these names are model instances; they are distinct from the
# KGE.RESCAL / KGE.DistMult scoring methods.
RESCAL = KGE(model_name='RESCAL',ent_vec_dim=200,num_entities=len(entity_idxs),num_relations=len(relation_idxs))
DistMult = KGE(model_name='DistMult',ent_vec_dim=200,num_entities=len(entity_idxs),num_relations=len(relation_idxs))
probabilities1 = RESCAL(head_idx,rel_idx)
probabilities2 = DistMult(head_idx,rel_idx)

# Compute the BCE loss (RESCAL output only) and back-propagate. No optimizer
# step is taken in this cell.
loss = torch.nn.BCELoss()(probabilities1,batch_targets)
print(loss.item())
loss.backward()


In [None]:
# Test the model.
# Convert the test triples to ids and group their tails by (head, relation).
test_data_idxs = [[entity_idxs[triplet[0]],relation_idxs[triplet[1]],entity_idxs[triplet[2]]] for triplet in test_data]
test_er_vocab = defaultdict(list)
for triplet in test_data_idxs:
    test_er_vocab[(triplet[0],triplet[1])].append(triplet[2])
test_batch_inputs = test_data_idxs
test_batch_inputs = np.array(test_batch_inputs)

# Forward pass to get prediction scores (tail_idx holds the labels,
# probabilities holds the predicted scores).
head_idx = torch.tensor(test_batch_inputs[:,0])
rel_idx = torch.tensor(test_batch_inputs[:,1])
tail_idx = torch.tensor(test_batch_inputs[:,2])
probabilities = RESCAL(head_idx, rel_idx)

# Build the reverse id -> name mappings for reporting.
entity_ids = {}
relation_ids = {}
for k,v in entity_idxs.items():
    entity_ids[v] = k
for k,v in relation_idxs.items():
    relation_ids[v] = k

pre_data = []
for i in range(len(test_batch_inputs)):
    head, rel, tail = head_idx[i].item(), rel_idx[i].item(), tail_idx[i].item() # (head,rel,tail) is the current triple
    all_fact_tails = test_er_vocab[(head,rel)]                                  # every test-set tail for this (head, rel)
    predict_score = probabilities[i][tail].item()                               # model score for the current triple's tail
    probabilities[i][all_fact_tails] = 0.0                                      # zero the scores of all known tails for (head, rel, ?)
    probabilities[i][tail] = int(predict_score * 100)                                      # NOTE(review): writes int(score*100), not the original score -- confirm
    # NOTE(review): the loop variable below shadows `tail`, and the same
    # predict_score is recorded for every tail in all_fact_tails.
    for tail in all_fact_tails:
        pre_data.append([entity_ids[int(head)], relation_ids[int(rel)], entity_ids[int(tail)], predict_score])
# Tabulate the scored triples, sorted by ascending score.
df = pd.DataFrame(data=pre_data, columns=['head', 'rel', 'tail', 'score']).sort_values('score')
# df.to_csv('KCE_test.csv')
df

KGE-MeanRank
1、probabilities的长度是num_entities，我们将probabilities降序排列，即分数高的排在前面。
2、mean_rank中的rank指的就是：模型对于当前三元组尾实体在所有实体中的分数排名。所以这个数值越低越好，因为越低，表明排名越靠前。
3、Hit@1,3,10: @就是英文的at。hit at 1指的就是将尾实体排在第一位的次数/测试集合大小，hit at 3和hit at 10同理。显然这个数值越高越好。越高说明每一个三元组的尾实体都被模型排名得非常靠前。

In [None]:
# Sort every row of scores from highest to lowest; sort_idxs[i] lists the
# entity ids of row i in ranked order.
sort_scores, sort_idxs = torch.sort(probabilities, dim=1, descending=True)
ranks = []
hits = [[] for _ in range(10)]
for i in range(len(test_batch_inputs)):
    # 1-based position of the true tail in the ranked entity list.
    ranked_entities = sort_idxs[i].numpy()
    rank = np.where(ranked_entities == tail_idx[i].item())[0][0] + 1
    ranks.append(rank)
    # hits[k-1] collects 1.0 when the true tail is ranked within the top k.
    for hits_level in range(1, 11):
        hits[hits_level-1].append(1.0 if rank <= hits_level else 0.0)
ranks

In [None]:
# Aggregate the ranking metrics over the test triples.
hitat10 = np.mean(hits[9])
hitat3 = np.mean(hits[2])
hitat1 = np.mean(hits[0])
mean_rank = np.mean(ranks)
# MRR is the mean of the reciprocal ranks, not the reciprocal of the mean rank.
mrr = np.mean(1./np.array(ranks))


## TransE 转移距离模型
转移距离模型（Translational Distance Model）的主要思想是将衡量向量化后的知识图谱中三元组的合理性问题，转化成衡量头实体和尾实体的距离问题。这一方法的重点是如何设计得分函数，得分函数常常被设计成利用关系把头实体转移到尾实体的合理性的函数。受词向量的启发，由词与词在向量空间的语义层面关系，可以拓展到知识图谱中头实体和尾实体在向量空间的关系。也就是说，同样可以考虑把知识图谱中的头实体和尾实体映射到向量空间中，且它们之间的联系也可以考虑成三元组中的关系。

TransE便是受到了词向量中平移不变性的启发。在TransE中，把实体和关系都表示为向量，对于某一个具体的关系（head, relation, tail），把关系的向量表示解释成头实体的向量到尾实体的向量的转移向量（Translation vector）。也就是说，如果在一个知识图谱中，某一个三元组成立，则它的实体和关系需要满足关系 head + relation ≈ tail。

In [None]:
import codecs
import random
import math
import numpy as np
import copy
import time

# Module-level name -> id maps, filled in by data_loader().
entity2id = {}
relation2id = {}


def _read_id_map(path, target):
    """Add ``name -> id`` pairs from a two-column TSV file to ``target``.

    Lines that do not have exactly two tab-separated fields are skipped.
    Ids are stored as the strings read from the file.
    """
    # Explicit UTF-8 so non-ASCII names do not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split('\t')
            if len(fields) != 2:
                continue
            target[fields[0]] = fields[1]


def data_loader(file):
    """Load a TransE dataset stored under the path prefix ``file``.

    Reads ``<file>entity2id.txt`` and ``<file>relation2id.txt`` into the
    module-level ``entity2id`` / ``relation2id`` maps, then reads
    ``<file>train.txt``, whose lines are ``head<TAB>tail<TAB>relation``.
    Lines without exactly three fields are skipped.

    Args:
        file: Path prefix, concatenated with the file names as-is.

    Returns:
        ``(entity_set, relation_set, triple_list)`` where the sets hold the
        ids seen in the training file and each triple is
        ``[head_id, tail_id, relation_id]``.

    Raises:
        KeyError: If a training triple names an unknown entity or relation.
    """
    file1 = file + "train.txt"
    file2 = file + "entity2id.txt"
    file3 = file + "relation2id.txt"

    _read_id_map(file2, entity2id)
    _read_id_map(file3, relation2id)

    entity_set = set()
    relation_set = set()
    triple_list = []

    with open(file1, 'r', encoding='utf-8') as f:
        for line in f:
            triple = line.strip().split("\t")
            if len(triple) != 3:
                continue

            h_ = entity2id[triple[0]]
            t_ = entity2id[triple[1]]
            r_ = relation2id[triple[2]]

            triple_list.append([h_,t_,r_])

            entity_set.add(h_)
            entity_set.add(t_)

            relation_set.add(r_)

    return entity_set, relation_set, triple_list

def distanceL2(h,r,t):
    """Return the squared L2 distance between ``h + r`` and ``t``."""
    # The square root is omitted to keep the gradient simple.
    residual = h + r - t
    squared = np.square(residual)
    return np.sum(squared)

def distanceL1(h,r,t):
    """Return the L1 distance between ``h + r`` and ``t``."""
    residual = h + r - t
    magnitudes = np.fabs(residual)
    return np.sum(magnitudes)

class TransE:
    """NumPy implementation of TransE trained with mini-batch SGD.

    Triples are stored as ``[head, tail, relation]``. Call
    :meth:`emb_initialize` before :meth:`train`; it replaces ``self.entity``
    and ``self.relation`` with ``id -> unit-norm vector`` dictionaries.
    """

    def __init__(self, entity_set, relation_set, triple_list, embedding_dim=100, learning_rate=0.01, margin=1,L1=True):
        """
        Args:
            entity_set: Iterable of entity ids.
            relation_set: Iterable of relation ids.
            triple_list: List of ``[head, tail, relation]`` triples.
            embedding_dim: Size of every embedding vector.
            learning_rate: SGD step size.
            margin: Margin of the hinge loss.
            L1: Use the L1 distance if true, otherwise the squared L2 distance.
        """
        self.embedding_dim = embedding_dim
        self.learning_rate = learning_rate
        self.margin = margin
        self.entity = entity_set
        self.relation = relation_set
        self.triple_list = triple_list
        self.L1=L1

        self.loss = 0
        # Entity ids as a sequence, so negative sampling can draw from them
        # (random.sample no longer accepts sets or dict views).
        self._entity_ids = list(entity_set)

    def emb_initialize(self):
        """Create a random unit-norm embedding for every relation and entity."""
        bound = 6/math.sqrt(self.embedding_dim)
        relation_dict = {}
        entity_dict = {}

        for relation in self.relation:
            r_emb_temp = np.random.uniform(-bound, bound, self.embedding_dim)
            relation_dict[relation] = r_emb_temp/np.linalg.norm(r_emb_temp,ord=2)

        for entity in self.entity:
            e_emb_temp = np.random.uniform(-bound, bound, self.embedding_dim)
            entity_dict[entity] = e_emb_temp/np.linalg.norm(e_emb_temp,ord=2)

        self.relation = relation_dict
        self.entity = entity_dict
        self._entity_ids = list(entity_dict)

    @staticmethod
    def _dump_embeddings(path, embeddings):
        """Write ``id<TAB>[v0, v1, ...]`` lines for every embedding to ``path``."""
        with open(path, "w", encoding="utf-8") as out:
            for key, vector in embeddings.items():
                # str(key): ids may be ints as well as strings.
                out.write(str(key) + "\t")
                out.write(str(list(vector)))
                out.write("\n")

    def train(self, epochs):
        """Train for ``epochs`` epochs of 400 mini-batches each.

        Writes ``entity_temp`` / ``relation_temp`` checkpoints every 20 epochs
        and the final embeddings to ``entity_50dim_batch400`` /
        ``relation50dim_batch400`` in the current directory.
        """
        nbatches = 400
        # With fewer triples than batches the floor division would give 0 and
        # training would silently do nothing; use at least one triple.
        batch_size = min(len(self.triple_list), max(1, len(self.triple_list) // nbatches))
        print("batch size: ", batch_size)
        for epoch in range(epochs):
            start = time.time()
            self.loss = 0

            for k in range(nbatches):
                Sbatch = random.sample(self.triple_list, batch_size)
                Tbatch = []
                seen = set()

                for triple in Sbatch:
                    # One negative sample per positive triple.
                    corrupted_triple = self.Corrupt(triple)
                    key = (tuple(triple), tuple(corrupted_triple))
                    if key not in seen:
                        seen.add(key)
                        Tbatch.append((triple, corrupted_triple))
                self.update_embeddings(Tbatch)

            end = time.time()
            print("epoch: ", epoch , "cost time: %s"%(round((end - start),3)))
            print("loss: ", self.loss)

            # Save intermediate results.
            if epoch % 20 == 0:
                self._dump_embeddings("entity_temp", self.entity)
                self._dump_embeddings("relation_temp", self.relation)

        print("写入文件...")
        self._dump_embeddings("entity_50dim_batch400", self.entity)
        self._dump_embeddings("relation50dim_batch400", self.relation)
        print("写入完成")

    def Corrupt(self,triple):
        """Return a copy of ``triple`` with its head or tail replaced at random.

        The head (index 0) is replaced with probability ~0.5, otherwise the
        tail (index 1); the replacement always differs from the original.

        Raises:
            ValueError: If fewer than two entities are known, in which case
                no different replacement can be drawn.
        """
        if len(self._entity_ids) < 2:
            raise ValueError("at least two entities are required to corrupt a triple")
        corrupted_triple = copy.deepcopy(triple)
        slot = 0 if random.random() > 0.5 else 1
        replacement = triple[slot]
        while replacement == triple[slot]:
            replacement = random.choice(self._entity_ids)
        corrupted_triple[slot] = replacement
        return corrupted_triple

    def update_embeddings(self, Tbatch):
        """Apply one SGD step for a batch of (triple, corrupted_triple) pairs.

        Gradients are computed from the embeddings as they were at the start
        of the batch and accumulated into copies; the copies are then
        normalised to unit length and swapped in.
        """
        copy_entity = copy.deepcopy(self.entity)
        copy_relation = copy.deepcopy(self.relation)
        distance = distanceL1 if self.L1 else distanceL2

        for triple, corrupted_triple in Tbatch:
            # Vectors from the copies receive the accumulated updates.
            h_correct_update = copy_entity[triple[0]]
            t_correct_update = copy_entity[triple[1]]
            relation_update = copy_relation[triple[2]]

            h_corrupt_update = copy_entity[corrupted_triple[0]]
            t_corrupt_update = copy_entity[corrupted_triple[1]]

            # Vectors from the originals are used to compute the gradient.
            h_correct = self.entity[triple[0]]
            t_correct = self.entity[triple[1]]
            relation = self.relation[triple[2]]

            h_corrupt = self.entity[corrupted_triple[0]]
            t_corrupt = self.entity[corrupted_triple[1]]

            dist_correct = distance(h_correct, relation, t_correct)
            dist_corrupt = distance(h_corrupt, relation, t_corrupt)

            err = self.hinge_loss(dist_correct, dist_corrupt)

            if err > 0:
                self.loss += err

                grad_pos = 2 * (h_correct + relation - t_correct)
                grad_neg = 2 * (h_corrupt + relation - t_corrupt)

                if self.L1:
                    # Gradient of |x| is its sign; zero is mapped to -1.
                    grad_pos = np.where(grad_pos > 0, 1.0, -1.0)
                    grad_neg = np.where(grad_neg > 0, 1.0, -1.0)

                # Head has a positive coefficient (subtract the gradient);
                # tail has a negative one (add the gradient).
                h_correct_update -= self.learning_rate * grad_pos
                t_correct_update -= (-1) * self.learning_rate * grad_pos

                # The corrupt term enters the loss negatively, so its signs
                # are the opposite of the correct term.
                if triple[0] == corrupted_triple[0]:  # tail replaced: head is updated twice
                    h_correct_update -= (-1) * self.learning_rate * grad_neg
                    t_corrupt_update -= self.learning_rate * grad_neg

                elif triple[1] == corrupted_triple[1]:  # head replaced: tail is updated twice
                    h_corrupt_update -= (-1) * self.learning_rate * grad_neg
                    t_correct_update -= self.learning_rate * grad_neg

                # The relation is updated by both terms.
                relation_update -= self.learning_rate*grad_pos
                relation_update -= (-1)*self.learning_rate*grad_neg

        # Re-normalise every vector to unit length.
        for i in copy_entity.keys():
            copy_entity[i] /= np.linalg.norm(copy_entity[i])
        for i in copy_relation.keys():
            copy_relation[i] /= np.linalg.norm(copy_relation[i])

        # Swap in the updated copies so the whole batch is applied at once.
        self.entity = copy_entity
        self.relation = copy_relation

    def hinge_loss(self,dist_correct,dist_corrupt):
        """Return ``max(0, dist_correct - dist_corrupt + margin)``."""
        return max(0,dist_correct-dist_corrupt+self.margin)

# Load the FB15k training triples (head<TAB>relation<TAB>tail per line).
# The file is opened in a `with` block so the handle is always closed.
with open('../data/FB15k/freebase_mtr100_mte100-train.txt', 'r', encoding='utf-8') as f:
    data = f.read().strip().split("\n")
data = [i.split('\t') for i in data]
# Assign consecutive integer ids to the sorted entities and relations.
entities = sorted(list(set([d[0] for d in data]+[d[2] for d in data])))
relations = sorted(list(set([d[1] for d in data])))
entity_idxs = {entities[i]:i for i in range(len(entities))}
relation_idxs = {relations[i]:i for i in range(len(relations))}
# Triples as [head_id, relation_id, tail_id].
# NOTE(review): the TransE class above indexes triples as
# [head, tail, relation]; reorder before feeding this list to it.
triplet_list = [[entity_idxs[triplet[0]], relation_idxs[triplet[1]], entity_idxs[triplet[2]]] for triplet in data]
print(len(triplet_list), triplet_list[0])


## Ampligraph
pip install ampligraph tensorboard git+https://github.com/Phlya/adjustText

In [2]:
import pandas as pd
import numpy as np

import pymysql
import configparser

import ampligraph
from ampligraph.latent_features import TransE, ComplEx
from ampligraph.utils import save_model, restore_model, create_tensorboard_visualizations
from ampligraph.evaluation import mr_score, mrr_score, hits_at_n_score, evaluate_performance, train_test_split_no_unseen

import tensorflow as tf

from scipy.special import expit

# Report the installed AmpliGraph version.
print(ampligraph.__version__)

def display_aggregate_metrics(ranks):
    """Print mean rank, mean reciprocal rank and Hits@1/3/10 for ``ranks``."""
    # (label, scoring function, extra positional arguments), in print order.
    metrics = (
        ('Mean Rank:', mr_score, ()),
        ('Mean Reciprocal Rank:', mrr_score, ()),
        ('Hits@1:', hits_at_n_score, (1,)),
        ('Hits@3:', hits_at_n_score, (3,)),
        ('Hits@10:', hits_at_n_score, (10,)),
    )
    for label, scorer, extra in metrics:
        print(label, scorer(ranks, *extra))


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


1.4.0


In [None]:
# Load the triples whose object type is disease / symptom / check / drug.
data = get_mysql_data("SELECT entity,relation,object FROM triple WHERE  o_type IN ('disease','symptom','check', 'drug');")
dataset = pd.DataFrame(data)
dataset.columns = ['subject', 'predicate', 'object']
print(dataset.shape)

# Hold out 5000 triples for validation, then 10000 of the remainder for test.
test_train, X_valid = train_test_split_no_unseen(dataset.values, 5000, seed=0)
X_train, X_test = train_test_split_no_unseen(test_train, 10000, seed=0)
print('Total triples:', dataset.shape)
print('Size of train:', X_train.shape)
print('Size of valid:', X_valid.shape)
print('Size of test:', X_test.shape)

# Configure the TransE model.
model = TransE(
    k=400,
    epochs=80,
    eta=30,
    loss='multiclass_nll',
    # initializer='xavier',
    regularizer='LP',
    optimizer='adam',
    initializer_params={'uniform':False},
    regularizer_params= {'lambda':0.0001, 'p':2},
    optimizer_params={'lr':0.001},
    embedding_model_params={'norm':1},
    seed=0,
    batches_count=64,
    verbose=True
    )

# Fit with early stopping on the validation set.
# NOTE(review): 'burn_in' is 150 but the model is configured for 80 epochs,
# so the early-stopping check presumably never runs -- confirm.
model.fit(
    X_train,                                      # training set
    early_stopping=True,                          # enable early stopping
    early_stopping_params={
        'x_valid': X_valid,   # validation set used for early stopping
        'criteria': 'mrr',    # metric watched during early stopping
        'burn_in': 150,       # no early-stopping checks before this many epochs
        'check_interval': 50, # after burn-in, check every 50 epochs (150, 200, 250, ...)
        'stop_interval': 2,   # stop when the criteria degrades for this many checks
        'corrupt_side': 's,o' # sides to corrupt during early-stopping evaluation
        }
    )

# Evaluate on the test set, filtering out every known triple, then save.
X_filter = np.concatenate([X_train, X_valid, X_test], 0)
ranks = evaluate_performance(X_test, model=model, filter_triples=X_filter)
display_aggregate_metrics(ranks)
save_model(model, '/data/nlp_models/TransE-small.pkl')


# Score a handful of triples taken from the same table.
data = get_mysql_data("SELECT entity,relation,object FROM triple WHERE  o_type IN ('disease','symptom','check', 'drug') LIMIT 5;")
hypothesis = np.array(data)
# Disabled: build hypotheses with a fixed subject instead.
# p_list = [i[1] for i in data]
# o_list = [i[2] for i in data]
# hypothesis = np.column_stack(['布地奈德', p_list, o_list])

# Raw model scores, squashed to (0, 1) with the logistic function.
triple_scores = model.predict(hypothesis)
probs = expit(triple_scores)
ranks = evaluate_performance(
    hypothesis, 
    model=model, 
    filter_triples=X_filter,
    corrupt_side = 's+o',
    use_default_protocol=True,
    verbose=True
    )
print(len(ranks))
# NOTE(review): zip() truncates to the shortest input; if `ranks` has more
# than one entry per triple the table will be misaligned -- confirm its shape.
df = pd.DataFrame(
    list(zip([','.join(x) for x in hypothesis], ranks, np.squeeze(triple_scores), np.squeeze(probs))),
    columns=['triple', 'rank', 'score', 'prob']
    ).sort_values('prob')
df

# epochs:100
# Mean Rank: 10991.15365
# Mean Reciprocal Rank: 0.029947334564126286
# Hits@1: 0.0202
# Hits@3: 0.03065
# Hits@10: 0.04755

# epochs:150
# Mean Rank: 11571.2768
# Mean Reciprocal Rank: 0.024059398719972723
# Hits@1: 0.01705
# Hits@3: 0.0241
# Hits@10: 0.03665


In [6]:
from ampligraph.discovery import query_topn

# Reload the TransE model saved by the training cell.
model = restore_model('/data/nlp_models/TransE-small.pkl')

# Disabled: query the top-10 tails for 100 (entity, relation) pairs from MySQL.
# data = get_mysql_data("SELECT entity,relation FROM triple WHERE  o_type IN ('disease','symptom','check', 'drug') LIMIT 100;")
# data_dt = {}
# for i in data:
#     data_dt[i[0]] = i[1]

# for k,v in data_dt.items():
#     print(query_topn(model, top_n=10, head=k, relation=v, tail=None))


# Print the 50 best-scoring tail candidates for ('哮喘', 'Symptom_Disease', ?).
print(query_topn(model, top_n=50, head='哮喘', relation='Symptom_Disease', tail=None))

#query_topn(model, top_n=20, head='哮喘', relation='Symptom_Disease', tail=None)

(array([['哮喘', 'Symptom_Disease', '脱症'],
       ['哮喘', 'Symptom_Disease', '肾小球基底膜蛾噬现象'],
       ['哮喘', 'Symptom_Disease', '帕里诺眼-腺综合征'],
       ['哮喘', 'Symptom_Disease', '腹部“气串样”肿块'],
       ['哮喘', 'Symptom_Disease', '剧烈运动后呕吐'],
       ['哮喘', 'Symptom_Disease', '脓疱性痤疮'],
       ['哮喘', 'Symptom_Disease', '菌尿'],
       ['哮喘', 'Symptom_Disease', '小儿双下肢蜷曲'],
       ['哮喘', 'Symptom_Disease', '额头长痘'],
       ['哮喘', 'Symptom_Disease', '牙龈深红或暗红色'],
       ['哮喘', 'Symptom_Disease', '扁桃体小或缺如'],
       ['哮喘', 'Symptom_Disease', '饮水呛咳'],
       ['哮喘', 'Symptom_Disease', '食管恶性病变'],
       ['哮喘', 'Symptom_Disease', '鼻衂'],
       ['哮喘', 'Symptom_Disease', '新生儿皮肤薄'],
       ['哮喘', 'Symptom_Disease', '尾骨压痛及异常活动'],
       ['哮喘', 'Symptom_Disease', '反复肺不张'],
       ['哮喘', 'Symptom_Disease', '上半身代偿性多汗症'],
       ['哮喘', 'Symptom_Disease', '新生儿心力衰竭'],
       ['哮喘', 'Symptom_Disease', '震荡性神经受累'],
       ['哮喘', 'Symptom_Disease', '真皮侵袭性生长'],
       ['哮喘', 'Symptom_Disease', '心悸伴高血压'],
       ['哮喘', 'Symptom_Di

ComplEx

In [None]:
# Load the triples for four selected relations.
data = get_mysql_data("SELECT entity,relation,object FROM triple WHERE relation IN ('English_Name','Symptom_Disease','Food_Good','Food_Bad');")
X = np.array(data)

# Unique entities (subjects and objects) and unique relations.
entities = np.unique(np.concatenate([X[:,0], X[:,2]]))
relations = np.unique((X[:,1]))

# NOTE: `data` is rebound here from the raw rows to a dict of splits.
data = {}
# Hold out 20% of the triples for testing.
num_test = int(len(X) * (20/100))
data['train'], data['test'] = train_test_split_no_unseen(X, test_size=num_test, seed=0, allow_duplication=False)
print('train set size:', data['train'].shape)
print('test set size:', data['test'].shape)

# Configure the ComplEx model.
model = ComplEx(
    batches_count=100,
    seed=0,
    epochs=5,
    k=150,
    eta=5,
    optimizer='adam',
    optimizer_params={'lr':1e-3},
    loss='multiclass_nll',
    regularizer='LP',
    regularizer_params={'p':3, 'lambda':1e-5},
    verbose=True
)

# Only show TensorFlow errors, then train without early stopping.
tf.logging.set_verbosity(tf.logging.ERROR)
model.fit(data['train'], early_stopping=False)


In [None]:
# Disabled: evaluate on a fresh MySQL sample instead of the held-out split.
# data = get_mysql_data("SELECT entity,relation,object FROM triple WHERE relation IN ('English_Name','Symptom_Disease','Food_Good','Food_Bad') LIMIT 10000;")
X_unseen = data['test'] #np.array(data)
positives_filter = X
# De-duplicated union of all known triples and the evaluated triples, used to
# filter known positives out of the corruptions.
unseen_filter = np.array(list({tuple(i) for i in np.vstack(((positives_filter, X_unseen)))}))

ranks_unseen = evaluate_performance(
    X_unseen,
    model=model,
    filter_triples=unseen_filter,
    corrupt_side='s+o',
    use_default_protocol=False,
    verbose=True
)
display_aggregate_metrics(ranks_unseen)

# Raw scores and their logistic transform for every evaluated triple,
# tabulated by ascending score.
scores = model.predict(X_unseen)
probs = expit(scores)
df = pd.DataFrame(
    list(zip([', '.join(x) for x in X_unseen], ranks_unseen, np.squeeze(scores), np.squeeze(probs))),
    columns=['triple', 'rank', 'score', 'prob']
    ).sort_values('score')
# create_tensorboard_visualizations(model, 'GoT_embeddings')

df

## lightKG

测试环境 Python 3.6.8 Pytorch 1.4.0
所需依赖
torchtext>=0.4.0
tqdm>=4.28.1
torch>=1.0.0
pytorch_crf>=0.7.0
scikit_learn>=0.20.2
networkx>=2.2
revtok
jieba
regex
运行前需要按如下方式安装lightKG库：
!pip install -i https://pypi.douban.com/simple/ lightKG



In [None]:
import pandas as pd
import os
from lightkg.krl import KRL
from lightkg.krl.config import DEFAULT_CONFIG

DEFAULT_CONFIG['epoch'] = 10 # override the number of epochs (default 1000)
print(os.getcwd())

In [None]:
# Export the triples to the CSV files used by lightKG.
data = get_mysql_data("SELECT entity,relation,object FROM triple WHERE  o_type IN ('disease','symptom','check', 'drug')")
# NOTE(review): the training frame holds ALL rows while the test frame is the
# last 20% of the same rows, so every test triple is also a training triple.
train_data = pd.DataFrame(data)
test_data = pd.DataFrame(data[int(len(data)*0.8):])
# to_csv is called with defaults, so the row index and header are written too.
train_data.to_csv('/home/aid/Github/kg/data/lightkg_data/train.sample.csv', sep=',', encoding='utf-8')
test_data.to_csv('/home/aid/Github/kg/data/lightkg_data/test.sample.csv', sep=',', encoding='utf-8')
print(len(train_data))
print(len(test_data))
train_data

In [None]:
# Train a lightKG TransE model on the exported CSV, using the test CSV as the
# dev set, and save it under LP_TransE.
krl = KRL()
krl.train('/home/aid/Github/kg/data/lightkg_data/train.sample.csv',
    model_type='TransE',
    dev_path='/home/aid/Github/kg/data/lightkg_data/test.sample.csv',
    save_path='/home/aid/Github/kg/data/lightkg_data/LP_{}'.format('TransE')
    )

In [None]:
# Reload the saved lightKG TransE model.
krl = KRL()
krl.load(save_path='/home/aid/Github/kg/data/lightkg_data/LP_{}'.format('TransE'), model_type='TransE')
# krl.test('/home/aid/Github/kg/data/lightkg_data/test.sample.csv')

# For 100 known triples, score the triple and predict the top-3 head entities
# from its (relation, tail).
data = get_mysql_data("SELECT entity,relation,object FROM triple WHERE  o_type IN ('disease','symptom','check', 'drug') LIMIT 100;")
for i in data:
    head, rel, tail = i[0], i[1], i[2]
    # NOTE: predict_score is computed but only used by the disabled print below.
    predict_score = krl.predict(head=head, rel=rel, tail=tail)
    predict_head = krl.predict_head(rel=rel, tail=tail, topk=3)
    # predict_rel = krl.predict_rel(head=head, tail=tail, topk=3)
    # predict_tail = krl.predict_tail(head=head, rel=rel, topk=3)
    # print('---->预测三元组其概率:', predict_score)
    print('---->预测三元组其头实体:', predict_head)
    # print('---->预测三元组其关系:', predict_rel)
    # print('---->预测三元组其尾实体:', predict_tail)

## Pykeen
pip install git+https://github.com/pykeen/pykeen.git

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch

import pykeen
from pykeen.datasets import Nations
from pykeen.pipeline import pipeline
from pykeen.triples import TriplesFactory
from pykeen.hpo import hpo_pipeline
from pykeen.models.predict import predict_triples_df

# %matplotlib inline
# %config InlineBackend.figure_format = 'svg'
import pymysql
import configparser

def get_mysql_data(sql, mysql_config_name=None):
    """Run ``sql`` against MySQL and return every row.

    Connection settings are read from a section of ``../config/config.ini``.

    Args:
        sql: SQL statement to execute.
        mysql_config_name: Name of the config section holding ``host``,
            ``port``, ``user``, ``password`` and ``database``. When ``None``
            the ``mysql_nlp_tagging`` section is used and, if that fails for
            any reason, a built-in legacy connection is tried instead.

    Returns:
        The rows returned by ``cursor.fetchall()``.

    Raises:
        Exception: Whatever configparser/pymysql raised when an explicitly
            named section cannot be read or connected to, or when the query
            itself fails.
    """
    section = 'mysql_nlp_tagging' if mysql_config_name is None else mysql_config_name

    parser = configparser.RawConfigParser()
    parser.read('../config/config.ini', encoding='utf-8')

    try:
        settings = dict(parser.items(section))
        connection = pymysql.connect(host=settings['host'], port=int(settings['port']), user=settings['user'], password=settings['password'], db=settings['database'], charset='utf8mb4')
    except Exception:
        # An explicitly requested section must not silently fall back to a
        # different database.
        if mysql_config_name is not None:
            raise
        # Legacy best-effort fallback kept for backward compatibility.
        # NOTE(review): hard-coded credentials -- move them to the config file
        # or a secret store.
        connection = pymysql.connect(host='192.168.100.50', port=3306, user='root', password='Aid@pro888888', db='nlp_tagging', charset='utf8mb4')

    # Always release the cursor and the connection, even if the query fails.
    try:
        with connection.cursor() as cursor:
            cursor.execute(sql)
            return cursor.fetchall()
    finally:
        connection.close()

# Calls PyKEEN's env() helper; presumably this reports the installed
# PyKEEN/runtime environment details -- confirm against the PyKEEN docs.
pykeen.env()

In [None]:
# Load every (head, relation, tail) triple for the selected object types.
triples_data  = get_mysql_data("SELECT entity,relation,object FROM triple WHERE  o_type IN ('disease','symptom','check', 'drug')")
triples_data = np.array(triples_data)

# Build ONE factory over all triples and let PyKEEN split it. This
#  * keeps a single entity/relation -> id mapping shared by all three parts
#    (separately built factories each invent their own ids, which makes the
#    evaluation compare unrelated entities), and
#  * keeps the evaluation triples out of the training set.
# Ratios [0.7, 0.2] yield a 70% / 20% / remaining-10% split.
all_triples = TriplesFactory.from_labeled_triples(triples=triples_data)
train_triples, test_triples, vali_triples = all_triples.split([0.7, 0.2], random_state=42)
print('data train:', train_triples)
print('data test:', test_triples)
print('data vali:', vali_triples)

pipeline_result = pipeline(
    training=train_triples,
    testing=test_triples,
    validation=vali_triples,
    model='TransE',  # alternative tried by the author: 'TuckER'
    model_kwargs=dict(embedding_dim=128),
    training_kwargs=dict(num_epochs=10, use_tqdm_batch=False),
    device='cpu',
    )
# pipeline_result.save_to_directory('/data/nlp_models/TuckER') #('../models/TransE')

# NOTE: despite its name, realistic_mean_rank holds the mean reciprocal rank
# (the metric key requested is 'meanreciprocalrank'); the name and the print
# label are kept so existing outputs stay comparable.
realistic_mean_rank = pipeline_result.get_metric('meanreciprocalrank')
mean_rank = pipeline_result.get_metric('mean_rank')
# Each hits@k variable now reads its own k (all three used to read hits@10).
hits_at_1 = pipeline_result.get_metric('hits@1')
hits_at_3 = pipeline_result.get_metric('hits@3')
hits_at_10 = pipeline_result.get_metric('hits@10')
print('realistic_mean_rank:',realistic_mean_rank, '\nmean_rank:',mean_rank, '\nhits_at_1:',hits_at_1, '\nhits_at_3:',hits_at_3, '\nhits_at_10:',hits_at_10)
# pipeline_result.plot(er_kwargs=dict(plot_relations=True))
# plt.savefig('../rules/toy_3.png', dpi=1000)

# result = pipeline(
#     dataset='Nations',
#     model='PairRE',
#     # Training configuration
#     training_kwargs=dict(
#         num_epochs=200,
#         use_tqdm_batch=False,
#     ),  
#     # Runtime configuration
#     random_seed=1235,
#     device='cpu',
# )
# 0.003185485312360561 1225.2230224609375
# 0.004759519038076153 0.004759519038076153 0.004759519038076153

In [None]:
# Write the pipeline result to disk.
# NOTE(review): the target directory is named "TuckER" although the pipeline
# in this notebook is configured with model='TransE' -- confirm the path.
pipeline_result.save_to_directory('/data/nlp_models/TuckER')

In [None]:
# Prefer the model that is still in memory from the training pipeline.
model = pipeline_result.model
if not model:
    # Fall back to the model file written by save_to_directory(); previously
    # this branch did nothing and left `model` undefined for the cells below.
    # NOTE: torch.load unpickles arbitrary objects -- only load trusted files.
    model = torch.load('/data/nlp_models/TuckER/trained_model.pkl')  # or '../models/TransE/trained_model.pkl'

import pymysql
import configparser
def get_mysql_data(sql, mysql_config_name=None):
    """Run ``sql`` against MySQL and return every row.

    Connection settings are read from a section of ``../config/config.ini``.

    Args:
        sql: SQL statement to execute.
        mysql_config_name: Name of the config section holding ``host``,
            ``port``, ``user``, ``password`` and ``database``. When ``None``
            the ``mysql_nlp_tagging`` section is used and, if that fails for
            any reason, a built-in legacy connection is tried instead.

    Returns:
        The rows returned by ``cursor.fetchall()``.

    Raises:
        Exception: Whatever configparser/pymysql raised when an explicitly
            named section cannot be read or connected to, or when the query
            itself fails.
    """
    section = 'mysql_nlp_tagging' if mysql_config_name is None else mysql_config_name

    parser = configparser.RawConfigParser()
    parser.read('../config/config.ini', encoding='utf-8')

    try:
        settings = dict(parser.items(section))
        connection = pymysql.connect(host=settings['host'], port=int(settings['port']), user=settings['user'], password=settings['password'], db=settings['database'], charset='utf8mb4')
    except Exception:
        # An explicitly requested section must not silently fall back to a
        # different database.
        if mysql_config_name is not None:
            raise
        # Legacy best-effort fallback kept for backward compatibility.
        # NOTE(review): hard-coded credentials -- move them to the config file
        # or a secret store.
        connection = pymysql.connect(host='192.168.100.50', port=3306, user='root', password='Aid@pro888888', db='nlp_tagging', charset='utf8mb4')

    # Always release the cursor and the connection, even if the query fails.
    try:
        with connection.cursor() as cursor:
            cursor.execute(sql)
            return cursor.fetchall()
    finally:
        connection.close()

triples_data  = get_mysql_data("SELECT entity,relation,object FROM triple WHERE  o_type IN ('disease','symptom','check', 'drug')")
triples_data = np.array(triples_data)

# The factory used to decode predictions must carry the same label -> id
# mapping as the one the model was trained with. Training used a factory over
# the complete triple set, so rebuild that here; a factory built from only the
# first 70% of rows assigns different ids and mislabels the predictions.
# NOTE(review): if `model` was loaded from disk, confirm it was trained on the
# same triple set.
train_triples = TriplesFactory.from_labeled_triples(triples=triples_data)
n_triples = len(triples_data)
# Sequential evaluation slices, mapped with the shared ids.
test_triples = TriplesFactory.from_labeled_triples(
    triples=triples_data[int(n_triples*0.7):int(n_triples*0.9)],
    entity_to_id=train_triples.entity_to_id,
    relation_to_id=train_triples.relation_to_id,
)
val_triples = TriplesFactory.from_labeled_triples(
    triples=triples_data[int(n_triples*0.9):],
    entity_to_id=train_triples.entity_to_id,
    relation_to_id=train_triples.relation_to_id,
)


# Query: which tails does the model propose for (head, rel, ?).
# (An earlier 'Symptom_Disease' assignment was immediately overwritten and has
# been dropped; `tail` is unused by tail prediction.)
head, rel, tail = '哮喘', 'Complication', ''
# predict_head_df = model.get_head_prediction_df(relation_label=rel, tail_label=tail, triples_factory=train_triples)
# predict_rel_df = model.get_relation_prediction_df(head_label=head, tail_label=tail, triples_factory=train_triples)
predict_tail_df = model.get_tail_prediction_df(head_label=head, relation_label=rel, triples_factory=train_triples)

# Prefix every prediction row (id, label, score, in-training flag) with the head.
predict_data = [[head, row[0], row[1], row[2], row[3]] for row in predict_tail_df.values.tolist()]
predict_df = pd.DataFrame(predict_data, columns=['head', 'pre_tail_id', 'pre_tail', 'pre_score', 'is_train'])
# predict_df = pd.DataFrame(predict_data, columns=['head', 'rel', 'tail', 'pre_tail_id', 'pre_tail', 'pre_score', 'is_train'])

# sort_values returns a new frame; the result used to be thrown away.
# Highest score first, so head(50) shows the most plausible tails.
predict_df = predict_df.sort_values('pre_score', ascending=False)
predict_df.to_csv('test2.csv')
predict_df.head(50)

# predict_data = []
# for i in triples_data:
#     head, rel, tail = i[0], i[1], i[2]
#     # predict_head_df = model.get_head_prediction_df(relation_label=rel, tail_label=tail, triples_factory=train_triples)
#     # predict_rel_df = model.get_relation_prediction_df(head_label=head, tail_label=tail, triples_factory=train_triples)
#     predict_tail_df = model.get_tail_prediction_df(head_label=head, relation_label=rel, triples_factory=train_triples)
#     for x in predict_tail_df.values.tolist():
#         if x[2] > 0:
#             if '哮喘' == head:
#                 predict_data.append([head, x[0], x[1], x[2], x[3]])
#                 # predict_data.append([head, rel, tail, x[0], x[1], x[2], x[3]])
#                 break

# predict_df = pd.DataFrame(predict_data, columns=['head', 'pre_tail_id', 'pre_tail', 'pre_score', 'is_train'])
# # predict_df = pd.DataFrame(predict_data, columns=['head', 'rel', 'tail', 'pre_tail_id', 'pre_tail', 'pre_score', 'is_train'])
# predict_df.sort_values('pre_score')
# predict_df.to_csv('test.csv')
# predict_df.head(50)


In [None]:
# Query: which tails does the model propose for (head, rel, ?).
# `tail` is unused by tail prediction.
head, rel, tail = '哮喘', 'Symptom_Disease', ''
# predict_head_df = model.get_head_prediction_df(relation_label=rel, tail_label=tail, triples_factory=train_triples)
# predict_rel_df = model.get_relation_prediction_df(head_label=head, tail_label=tail, triples_factory=train_triples)
predict_tail_df = model.get_tail_prediction_df(head_label=head, relation_label=rel, triples_factory=train_triples)

# Prefix every prediction row (id, label, score, in-training flag) with the head.
predict_data = [[head, row[0], row[1], row[2], row[3]] for row in predict_tail_df.values.tolist()]
predict_df = pd.DataFrame(predict_data, columns=['head', 'pre_tail_id', 'pre_tail', 'pre_score', 'is_train'])
# predict_df = pd.DataFrame(predict_data, columns=['head', 'rel', 'tail', 'pre_tail_id', 'pre_tail', 'pre_score', 'is_train'])

# sort_values returns a new frame; the result used to be thrown away.
# Highest score first, so head(50) shows the most plausible tails.
predict_df = predict_df.sort_values('pre_score', ascending=False)
predict_df.to_csv('test.csv')
predict_df.head(50)