In [1]:
from sklearn.metrics import roc_auc_score
import torch
import numpy as np
import os
import time
import datetime
from collections import Counter
import random
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
import copy
from common import *
from buildtrain import *
from scipy.stats import rankdata
from model import *
import torch
from torch_geometric.nn.conv import MessagePassing
from torch_geometric.utils import softmax
from torch.nn.init import xavier_normal_
from torch.nn import Parameter
from ordered_set import OrderedSet
from collections import defaultdict as ddict
from data_loader import *
from model import *

In [2]:
# Prefer the first CUDA device, but fall back to the CPU so the notebook
# still runs end to end on machines without a GPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [3]:
class Data():
    """Loads a knowledge-graph dataset stored as tab-separated triple files.

    The files ``train.txt``, ``test.txt`` and ``valid.txt`` are read from
    ``<data_path>/<data_name>/``; each non-empty line is ``head<TAB>relation<TAB>tail``.
    """

    def __init__(self,data_path,data_name,strategy='one_to_n',batch_size=1000,test_batch_size=2000,device='gpu'):
        """Store the configuration; no file is touched until ``load_data`` is called.

        All arguments are only stored on the instance. Within this class only
        ``data_path`` and ``data_name`` are read again (to locate the files).
        """
        self.data_path = data_path
        self.data_name = data_name
        self.strategy = strategy
        self.test_batch_size = test_batch_size
        self.batch_size = batch_size
        self.device = device
        self.ent2edges = None

    def _read_split(self, split):
        """Return the lower-cased ``(head, relation, tail)`` string triples of one split."""
        path = os.path.join(self.data_path, self.data_name, '{}.txt'.format(split))
        rows = []
        # `with` guarantees the handle is closed even if a line is malformed.
        with open(path) as handle:
            for line in handle:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline at end of file).
                    continue
                sub, rel, obj = map(str.lower, line.split('\t'))
                rows.append((sub, rel, obj))
        return rows

    def load_data(self):
        """
        Reads the raw triples of every split and converts them to integer ids.

        Ids are assigned in order of first appearance, scanning the splits in
        the order train, test, valid. For every triple ``(sub, rel, obj)`` the
        inverse triple ``(obj, rel + num_rel, sub)`` is stored right after it.

        Sets
        ----
        self.ent2id:            Entity to unique identifier mapping
        self.id2ent:            Inverse mapping of self.ent2id
        self.rel2id:            Relation to unique identifier mapping; the inverse of
                                relation ``r`` is stored as ``r + '_reverse'`` with id
                                ``rel2id[r] + num_rel``
        self.id2rel:            Inverse mapping of self.rel2id
        self.num_ent:           Number of entities in the knowledge graph
        self.num_rel:           Number of relations, not counting the inverse ones
        self.triples[split]:    List of (sub, rel, obj) id triples for 'train', 'valid'
                                and 'test', inverse triples included
        self.edge_index:        (sub, obj) pairs of the training triples
        self.edge_type:         Relation id of each entry of self.edge_index
        """
        splits = ('train', 'test', 'valid')
        # Every file is read exactly once; the string triples are kept so the
        # id conversion below does not need a second pass over the files.
        raw = {split: self._read_split(split) for split in splits}

        self.ent2id, self.rel2id = {}, {}
        for split in splits:
            for sub, rel, obj in raw[split]:
                # setdefault hands out the next free id on first sight only.
                self.ent2id.setdefault(sub, len(self.ent2id))
                self.rel2id.setdefault(rel, len(self.rel2id))
                self.ent2id.setdefault(obj, len(self.ent2id))

        self.num_ent = len(self.ent2id)
        self.num_rel = len(self.rel2id)
        # Inverse relations occupy the id range [num_rel, 2 * num_rel).
        self.rel2id.update({rel + '_reverse': idx + self.num_rel
                            for rel, idx in list(self.rel2id.items())})

        self.id2ent = {idx: ent for ent, idx in self.ent2id.items()}
        self.id2rel = {idx: rel for rel, idx in self.rel2id.items()}

        print('num_ent {} num_rel {}'.format(self.num_ent, self.num_rel))

        self.triples = ddict(list)
        for split in splits:
            for sub, rel, obj in raw[split]:
                sub, rel, obj = self.ent2id[sub], self.rel2id[rel], self.ent2id[obj]
                self.triples[split].append((sub, rel, obj))
                self.triples[split].append((obj, rel + self.num_rel, sub))
        self.edge_index, self.edge_type = self.construct_adj()

    def construct_adj(self):
        """
        Builds the edge lists of the training graph.

        Returns
        -------
        edge_index:     List of (sub, obj) pairs, one per training triple
        edge_type:      List with the relation id of each pair in edge_index
        """
        edge_index = [(sub, obj) for sub, _, obj in self.triples['train']]
        edge_type = [rel for _, rel, _ in self.triples['train']]
        return edge_index, edge_type
    


In [4]:
# Dataset location: the triple files are read from ./<data_path>/<data_name>/.
data_path = 'data'
data_name = 'FB15k-237'
#data_name = 'WN18RR'

# Build the dataset wrapper and load all splits.
data = Data(data_path=data_path, data_name=data_name, device=device)
data.load_data()

num_ent 14541 num_rel 237


In [5]:
class HR2V(torch.nn.Module):
    """Combines a head embedding and a relation embedding by element-wise product.

    A ``Linear(2*dim, dim)`` layer and a ReLU are created by the constructor
    but are not used by ``forward``.
    """

    def __init__(self, dim=100):
        super().__init__()
        self.linear = torch.nn.Linear(2 * dim, dim)
        self.ac = torch.nn.ReLU()

    def forward(self, h, r):
        """Return the element-wise product of ``h`` and ``r``."""
        product = h * r
        return product
        

In [6]:

class HR2V_DeepE(torch.nn.Module):
    """Encodes a (head, relation) embedding pair with a stack of ``DeepEBlock`` layers.

    The two embeddings are concatenated on the last axis, batch-normalised,
    passed through dropout and then through ``num_source_layers`` blocks.
    """

    def __init__(self, dim=100, num_source_layers=1, input_drop=0.4, hidden_drop=0.4, identity_drop=0.0):
        super().__init__()
        self.input_drop = torch.nn.Dropout(input_drop)
        self.input_bn = torch.nn.BatchNorm1d(2 * dim)
        self.source_layers = torch.nn.Sequential()

        for depth in range(num_source_layers):
            # Only the first block sees the concatenated (2*dim wide) input.
            in_width = dim * 2 if depth == 0 else dim
            block = DeepEBlock(in_width, dim, hidden_drop, torch.nn.ReLU,
                               layers=2, identity_drop=identity_drop)
            self.source_layers.append(block)

    def forward(self, h, r):
        """Return the encoding of the concatenation of ``h`` and ``r``."""
        joined = torch.cat([h, r], -1)
        normed = self.input_bn(joined)
        dropped = self.input_drop(normed)
        return self.source_layers(dropped)

In [7]:
class GNNLayer(MessagePassing):
    """Mean-aggregating message-passing layer over the training graph.

    The message sent along an edge is ``hr2v_model(x_j, relation_embedding)``.
    At evaluation time all edges are used; at training time a random subset
    is used, which is re-drawn every ``max_refresh_adj_num`` training calls.
    """

    def __init__(self, edge_index, edge_type, num_ent, hr2v_model, device, data, drop_adj=0.1):
        super().__init__(aggr='mean')
        self.edge_index = edge_index
        self.edge_type = edge_type
        self.hr2v_model = hr2v_model
        self.data = data
        self.drop_adj = drop_adj
        self.device = device

        # Full graph, used whenever forward() is called with train=False.
        self.test_edge_index = torch.LongTensor(edge_index).to(self.device).t()
        self.test_edge_type = torch.LongTensor(edge_type).to(self.device)

        self.gen_train_adj(drop_adj=self.drop_adj)
        self.max_refresh_adj_num = 50
        self.now_adj_num = 0

    def gen_train_adj(self, drop_adj=0.1):
        """Draw a new random edge subset that keeps a ``1 - drop_adj`` fraction of the edges."""
        paired = list(zip(self.edge_index, self.edge_type))
        random.shuffle(paired)
        keep = int(len(paired) * (1 - drop_adj))
        kept_index, kept_type = zip(*paired[:keep])
        self.train_edge_index = torch.LongTensor(kept_index).to(self.device).t()
        self.train_edge_type = torch.LongTensor(kept_type).to(self.device)
        #self.train_edge_index =  self.test_edge_index
        #self.train_edge_type = self.test_edge_type

    def forward(self, x, rel_embed, train=False):
        """Propagate entity embeddings ``x`` over the graph; ``rel_embed`` is returned unchanged."""
        if not train:
            edge_index, edge_type = self.test_edge_index, self.test_edge_type
        else:
            # Count training calls and refresh the sampled edges periodically.
            self.now_adj_num += 1
            if self.now_adj_num >= self.max_refresh_adj_num:
                self.now_adj_num = 0
                self.gen_train_adj(drop_adj=self.drop_adj)
            edge_index, edge_type = self.train_edge_index, self.train_edge_type

        entity_embed = self.propagate(edge_index, size=None, x=x,
                                      edge_type=edge_type, rel_embed=rel_embed)
        return entity_embed, rel_embed

    def message(self, edge_index_i, edge_index_j, x_i, x_j, edge_type, rel_embed):
        """Build the per-edge message from the neighbour embedding and the edge's relation."""
        per_edge_rel = torch.index_select(rel_embed, 0, edge_type)
        return self.hr2v_model(x_j, per_edge_rel)

    def update(self, aggr_out):
        """Return the aggregated messages unchanged."""
        return aggr_out
    

In [8]:
class GNNModel(torch.nn.Module):
    """Link-prediction model that scores every entity as the tail of a (head, relation) query.

    Entity embeddings are propagated through one ``GNNLayer`` and added back to
    the initial embeddings, then passed through batch norm and ``ResNetBlock``
    layers. The query is encoded by ``pred_model`` from the *initial* head and
    relation embeddings, and scored against all entity vectors by dot product.
    """

    def __init__(self,num_ent,num_rel,init_emb_dim,init_rel_dim,edge_index,edge_type,
                 hr2v_model,device='cpu',data=None,pred_model=None,
                 num_target_layers=2,target_drop=0.,inner_layers=3,drop_adj=0,beta=0.5,xdrop=0.8):
        """Create embeddings, the message-passing layer and the entity-side layers.

        ``beta`` weights the auxiliary distance term returned by ``forward``.
        ``xdrop`` builds a dropout module that ``forward`` currently does not apply.
        """
        super(GNNModel,self).__init__()
        self.xdrop=torch.nn.Dropout(xdrop)
        self.device = device
        self.beta = beta
        self.init_ent_emb = self.get_param((num_ent,init_emb_dim))
        # Every relation also has an inverse relation, hence twice as many rows.
        self.init_rel_emb = self.get_param((num_rel*2,init_rel_dim))
        self.edge_index, self.edge_type = edge_index, edge_type
        self.hr2v_model = hr2v_model
        self.pred_model =pred_model
        # Plain Python list: the layers in it are not registered as submodules.
        self.conv_lst = []
        self.loss_func = torch.nn.CrossEntropyLoss()

        # Only a single message-passing layer is used for now.
        conv0 = GNNLayer(self.edge_index, self.edge_type, num_ent,hr2v_model,device=device,data=data,drop_adj=drop_adj)
        self.conv_lst.append(conv0)
        #self.conv_lst.append(conv1)

        self.data = data

        # Width is taken from the constructor argument rather than from a
        # notebook-level global, so the class does not depend on a later cell.
        self.target_layers = torch.nn.Sequential()
        for i in range(num_target_layers):
            self.target_layers.append(ResNetBlock(init_emb_dim,init_emb_dim,target_drop,torch.nn.ReLU,layers=inner_layers))
        self.target_bn = torch.nn.BatchNorm1d(init_emb_dim)
        self.register_parameter('b', Parameter(torch.zeros(num_ent)))
        self.to(self.device)

    def get_param(self,shape):
        """Return a trainable parameter of the given shape, Xavier-normal initialised."""
        param = Parameter(torch.Tensor(*shape))
        xavier_normal_(param.data)
        return param

    def forward(self,sub, rel,train=False):
        """Score all entities as tails for each (sub, rel) query.

        Returns
        -------
        scores:     Matrix with one row per query and one column per entity
        l2_loss:    Mean L2 distance between the propagated and the initial
                    entity embeddings
        """
        sub,rel = self.type_trans(sub,rel)
        x,r = self.init_ent_emb,self.init_rel_emb
        for conv in self.conv_lst:
            x,r = conv(x,r,train)

        dist2 = torch.nn.functional.pairwise_distance(x, self.init_ent_emb, p=2)
        l2_loss = torch.mean(dist2)

        #x=self.xdrop(x)

        # Residual connection around the message-passing step.
        x =   x + self.init_ent_emb

        # The query side uses the initial embeddings, not the propagated ones.
        sub_emb = torch.index_select(self.init_ent_emb, 0, sub)
        rel_emb = torch.index_select(self.init_rel_emb, 0, rel)

        enc_vec = self.pred_model(sub_emb, rel_emb)

        all_ent_vec = self.target_bn(x)
        all_ent_vec = self.target_layers(all_ent_vec)

        scores = torch.mm(enc_vec,all_ent_vec.transpose(1,0))
        #scores += self.b.expand_as(scores)

        return scores,l2_loss

    def forward_and_loss(self,sub,rel,obj):
        """Return the training loss: cross-entropy over entities plus ``beta * l2_loss``."""
        sub,rel,obj = self.type_trans(sub,rel,obj)
        scores,l2_loss = self.forward(sub,rel,train=True)
        return self.loss_func(scores,obj.long()) + l2_loss * self.beta

    def type_trans(self,*args):
        """Convert each argument to a tensor on ``self.device``.

        NumPy arrays, lists, tuples and tensors are accepted.

        Raises
        ------
        TypeError:  if an argument has any other type
        """
        trans = []
        for arg in args:
            if isinstance(arg,np.ndarray):
                trans.append(torch.from_numpy(arg).to(self.device))
            elif isinstance(arg,(list,tuple)):
                trans.append(torch.tensor(arg).to(self.device))
            elif isinstance(arg,torch.Tensor):
                trans.append(arg.to(self.device))
            else:
                # A bare `raise` with no active exception would surface as an
                # unrelated RuntimeError; report the real problem instead.
                raise TypeError('type error: cannot convert {} to a tensor'.format(type(arg).__name__))
        return trans
        

In [9]:
def data_generator(triples,batch_size=2000):
    """Yield randomly ordered mini-batches of ``(heads, relations, tails)`` arrays.

    Parameters
    ----------
    triples:        Sequence of (head, relation, tail) integer triples. It is
                    not modified; a shuffled copy is batched instead.
    batch_size:     Maximum number of triples per batch.

    Yields
    ------
    Three 1-D NumPy arrays of equal length (between 1 and ``batch_size``).
    Nothing is yielded for an empty input, and no empty trailing batch is
    produced when the number of triples is a multiple of ``batch_size``.
    """
    num_triples = len(triples)
    if num_triples == 0:
        return

    # Shuffle through an index permutation so the caller's list stays untouched.
    order = np.random.permutation(num_triples)
    shuffled = np.asarray(triples)[order]
    heads, rels, tails = (np.ascontiguousarray(shuffled[:, col]) for col in range(3))

    # Stepping by batch_size gives ceil(num_triples / batch_size) batches.
    for start in range(0, num_triples, batch_size):
        stop = start + batch_size
        yield heads[start:stop], rels[start:stop], tails[start:stop]
    
# Maps every (head, relation) pair seen in train/valid/test to its set of known
# tails; used for filtering during evaluation.
# NOTE(review): `get_target_dict` is defined further down in this same cell, so
# on a fresh kernel this call only works if one of the star imports above
# already provides that name — confirm, or move the definition above this line.
target_dict = get_target_dict(data.triples['train'],data.triples['valid'],data.triples['test'])
def evaluate(model,x_test,batch_size,target_dict):
    """Compute filtered tail-prediction ranking metrics.

    Parameters
    ----------
    model:          Object whose ``forward(heads, relations, train=False)``
                    returns ``(scores, _)``, where ``scores`` is a tensor with
                    one row per query and one column per entity.
    x_test:         2-D integer array-like of (head, relation, tail) rows.
    batch_size:     Number of queries scored per forward pass.
    target_dict:    Maps (head, relation) to the set of known tails. Every known
                    tail other than the one being ranked is filtered out; a
                    missing key means nothing is filtered.

    Returns
    -------
    Dict with keys 'mr', 'mrr', 'hits@1' and 'hits@10'.

    Raises
    ------
    ValueError:     if ``x_test`` is empty
    """
    x_test = np.asarray(x_test)
    num_test = len(x_test)
    if num_test == 0:
        raise ValueError('evaluate() needs at least one test triple')

    score_chunks = []
    # No gradients are needed for ranking; skipping the autograd graph saves memory.
    with torch.no_grad():
        for start in range(0, num_test, batch_size):
            batch = x_test[start:start + batch_size]
            tail_scores, _ = model.forward(batch[:, 0], batch[:, 1], train=False)
            score_chunks.append(tail_scores.detach().cpu().numpy())
    scores = np.concatenate(score_chunks, axis=0)

    filtered_score = np.finfo(np.float32).min
    ranks = np.empty(num_test, dtype=np.int64)
    for i in range(num_test):
        h, r, t = (int(v) for v in x_test[i])
        row = scores[i]
        gold_score = row[t]

        # Filtered setting: push every other known true tail to the bottom.
        others = [tail for tail in target_dict.get((h, r), ()) if tail != t]
        if others:
            row[others] = filtered_score

        # Entities scoring at least as high as the gold tail (itself included),
        # so ties are counted against the model.
        ranks[i] = np.sum(row >= gold_score)

    return {'mr': np.mean(ranks),
            'mrr': np.mean(1.0 / ranks),
            'hits@1': np.mean(ranks == 1),
            'hits@10': np.mean(ranks <= 10)}

def get_target_dict(train_doubles,x_valid,x_test):
    """Collect, for every (head, relation) pair, the set of tails seen in any split.

    Each argument is an iterable of (head, relation, tail) triples. The result
    maps ``(head, relation)`` to a ``set`` of tails.
    """
    known_tails = {}
    for split in (train_doubles, x_valid, x_test):
        for head, rel, tail in split:
            known_tails.setdefault((head, rel), set()).add(tail)
    return known_tails

class Trainer():
    """Runs the train / validate loop for a model exposing ``forward_and_loss``.

    Uses Adam plus a ``ReduceLROnPlateau`` scheduler driven by the mean
    training loss of each epoch.
    """

    def __init__(self,model,data,batch_size=1000,lr=0.003,weight_decay=5e-4 ,min_lr=1e-7,patience=5,factor=0.5):
        """Create the optimizer and scheduler for ``model``.

        ``data`` must expose ``triples['train']`` and ``triples['valid']``.
        """
        self.model = model
        self.opt = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
        # NOTE(review): `verbose` is reported as deprecated by recent PyTorch
        # releases — confirm against the installed version.
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(self.opt, 'min',
                                                                    factor=factor,min_lr=min_lr,patience=patience,verbose=True)
        self.data = data
        self.batch_size = batch_size

    def train(self,max_epoch=200):
        """Train for ``max_epoch`` epochs, validating every 10th epoch.

        Whenever the validation MRR improves, the whole model object is saved
        to 'fb_rgnn.pt'.
        """
        best_mrr = 0.0
        for epoch in range(max_epoch):
            # The batch size is switched to 500 once epoch 500 has passed.
            if epoch>500:
                self.batch_size = 500
            self.train_one_epoch(epoch=epoch)
            if epoch!=0 and epoch %10 ==0:
                results = self.evaluate()
                print('previous best mrr',best_mrr)
                if results['mrr']>best_mrr:
                    print('new best mrr',results['mrr'])
                    best_mrr = results['mrr']
                    torch.save(self.model,'fb_rgnn.pt')

                print(results)

    def evaluate(self):
        """Return the ranking metrics on the validation split.

        Relies on the notebook-level ``target_dict`` for filtering.
        """
        self.model.eval()
        # Gradients are not needed for evaluation.
        with torch.no_grad():
            return evaluate(self.model,np.array(self.data.triples['valid']),
                            batch_size=self.batch_size,target_dict=target_dict)

    def train_one_epoch(self,epoch=None):
        """Run one optimisation pass over the training triples and step the scheduler."""
        self.model.train()
        losses = []
        train_triples = self.data.triples['train']

        for hs,rs,ts in data_generator(train_triples,self.batch_size):
            if len(hs) == 0:
                # An empty batch carries no training signal; skip it.
                continue
            self.opt.zero_grad()
            loss = self.model.forward_and_loss(hs,rs,ts)
            loss.backward()
            self.opt.step()
            losses.append(loss.item())

        mean_loss = np.average(losses) if losses else float('nan')
        print('epoch:',epoch,'train loss:',mean_loss)
        if losses:
            # Only feed the scheduler a real loss value.
            self.scheduler.step(mean_loss)
            
            

In [11]:
# Seed every random number generator used by the notebook so that parameter
# initialisation, edge dropping and batch shuffling can be reproduced.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Loss weighting and graph regularisation.
beta = 0.5            # weight of the auxiliary embedding-distance loss
drop_adj = 0.15       # fraction of edges dropped from the training graph

# Model size and dropout rates.
dim = 300
input_drop = 0.4
hidden_drop = 0.4
identity_drop = 0.01
num_source_layers =40
num_target_layers=1
target_drop=0.4
inner_layers=3

# Optimisation.
weight_decay = 5e-8
lr = 0.003
batch_size = 1000

hr2v_model = HR2V(dim=dim)
pred_model = HR2V_DeepE(dim=dim,num_source_layers=num_source_layers,input_drop=input_drop,
                        hidden_drop=hidden_drop,identity_drop=identity_drop)

gnn_model = GNNModel(data.num_ent,data.num_rel,dim,dim,data.edge_index,data.edge_type,
                     hr2v_model,device=device,data=data,pred_model=pred_model,
                    num_target_layers=num_target_layers,target_drop=target_drop,
                     inner_layers=inner_layers,drop_adj=drop_adj,beta=beta,xdrop=0)


trainer = Trainer(gnn_model,data,batch_size=batch_size,weight_decay = weight_decay,lr = lr)

trainer.train(max_epoch=1000)