# 0. 产生噪声样本

In [1]:
%run 基本定义.ipynb

# 1. 生成攻击的目标样本

In [11]:
## 修改DATASET生成不同的目标样本
DATASET = 'wn18rr'
if DATASET == "FB15k-237":
    target_rank = 1
elif DATASET == "wn18rr":
    target_rank = 5
else:
    raise Exception("dataset should be either FB15k-237 or wn18rr")
data_path = f"./data/{DATASET}"
print(f"generate target triples for DATASET {DATASET}")

# load dataset info
with open(os.path.join(data_path, 'entities.dict')) as fin:
    entity2id = dict()
    for line in fin:
        eid, entity = line.strip().split('\t')
        entity2id[entity] = int(eid)

with open(os.path.join(data_path, 'relations.dict')) as fin:
    relation2id = dict()
    for line in fin:
        rid, relation = line.strip().split('\t')
        relation2id[relation] = int(rid)
train_triples = read_triple(os.path.join(data_path, "train.txt"), entity2id, relation2id)
valid_triples = read_triple(os.path.join(data_path, 'valid.txt'), entity2id, relation2id)
test_triples = read_triple(os.path.join(data_path, 'test.txt'), entity2id, relation2id)
all_true_triples = train_triples + valid_triples + test_triples

entity_density = {}
for head, relation, tail in all_true_triples:
    if head not in entity_density:
        entity_density[head] = 0
    if tail not in entity_density:
        entity_density[tail] = 0
    entity_density[head] += 1
    entity_density[tail] += 1

# load testing results
model2triple2rank = {}
model2top1Triples = {}
for MODEL in ["TransE", "RotatE"]:
    triple2rank_path = f"./models/{MODEL}_{DATASET}_baseline/triple2ranking.pkl"
    with open(triple2rank_path, "rb") as f:
        triple2rank = pickle.load(f)
    model2triple2rank[MODEL] = triple2rank
    model2top1Triples[MODEL] = set()
    for triple, mode2ranking in triple2rank.items():
        rankh, rankt = mode2ranking["head-batch"], mode2ranking["tail-batch"]
        if rankh <= target_rank and rankt <= target_rank:
            model2top1Triples[MODEL].add(triple)

# load 
top1Triples = model2top1Triples["TransE"].intersection(model2top1Triples["RotatE"])

top1Triples_score = [((head, relation, tail), entity_density[head], entity_density[tail]) for head, relation, tail in top1Triples]
top1Triples_score = sorted(top1Triples_score, key=lambda x: -(x[1] + x[2]))[:100]
targetTriples = [triple for triple, _, _ in top1Triples_score]
with open(os.path.join(data_path, 'targetTriples.pkl'), "wb") as fw:
    print(f"get {len(targetTriples)} targetTriples")
    pickle.dump(targetTriples, fw)

generate target triples for DATASET wn18rr
get 100 targetTriples


In [4]:
import sys
import time
from IPython import embed
from collections import defaultdict
import torch.autograd as autograd


class GlobalRandomNoiseAttacker:
    def __init__(self, args):
        self.name = "GlobalRandomNoiseAttacker"
        self.args = args
        self.input_data = get_input_data(args)
        self.trainer = BaseTrainer.get_trainer(self.input_data, args)
        self.trainer.load_model()
        self.kge_model = self.trainer.kge_model
        self.all_relations = list(self.input_data.relation2id.values())
        self.all_entities = list(self.input_data.entity2id.values())
        self.target_triples = args['target_triples']
        if self.target_triples is None:
            self.target_triples = pickle.load(open(os.path.join(args['data_path'], "targetTriples.pkl"), "rb"))
        set_logger(args, args['identifier'])
        self.noise_triples = set()

    def get_noise_triples(self):
        noise_triples = set()
        all_true_triples = set(self.input_data.all_true_triples)
        for i in range(len(self.target_triples)):
            sys.stdout.write("%d in %d\r" % (i, len(self.target_triples)))
            sys.stdout.flush()
            # h, r, t = self.target_triples[i]
            rand_h = random.choice(self.all_entities)
            rand_r = random.choice(self.all_relations)
            rand_t = random.choice(self.all_entities)
            while (rand_h, rand_r, rand_t) in noise_triples or (rand_h, rand_r, rand_t) in all_true_triples:
                rand_h = random.choice(self.all_entities)
                rand_r = random.choice(self.all_relations)
                rand_t = random.choice(self.all_entities)
            noise_triples.add((rand_h, rand_r, rand_t))
        return list(noise_triples)

    def generate(self, identifier):
        dataset_model = self.args['init_checkpoint'].split("/")[-1]
        print(f'------ {self.name} starts to generate noise for {dataset_model} ------')
        start_time = time.time()
        noise_triples = self.get_noise_triples()
        print(f"Time taken:{time.time() - start_time}")
        print(f"Num Noise:{len(noise_triples)}")
        print(f"False Negative: {len(set(noise_triples).intersection(set(self.input_data.all_true_triples)))}")
        if not self.args['no_store']:
            with open(os.path.join(self.args['init_checkpoint'], "%s.pkl" % identifier), "wb") as fw:
                pickle.dump(noise_triples, fw)


class LocalRandomNoiseAttacker(GlobalRandomNoiseAttacker):
    def __init__(self, args):
        super(LocalRandomNoiseAttacker, self).__init__(args)
        self.name = "LocalRandomNoiseAttacker"

    def get_noise_triples(self):
        noise_triples = set()
        all_true_triples = set(self.input_data.all_true_triples)
        for i in range(len(self.target_triples)):
            sys.stdout.write("%d in %d\r" % (i, len(self.target_triples)))
            sys.stdout.flush()
            h, r, t = self.target_triples[i]
            rand_r = random.choice(self.all_relations)
            rand_e = random.choice(self.all_entities)
            if random.random() < 0.5:
                while (h, rand_r, rand_e) in noise_triples or (h, rand_r, rand_e) in all_true_triples:
                    rand_r = random.choice(self.all_relations)
                    rand_e = random.choice(self.all_entities)
                noise_triples.add((h, rand_r, rand_e))
            else:
                while (rand_e, rand_r, t) in noise_triples or (rand_e, rand_r, t) in all_true_triples:
                    rand_r = random.choice(self.all_relations)
                    rand_e = random.choice(self.all_entities)
                noise_triples.add((rand_e, rand_r, t))
        return list(noise_triples)

# 2. 添加扰动噪声样本

In [5]:
class DirectAddition(GlobalRandomNoiseAttacker):
    def __init__(self, args):
        super(DirectAddition, self).__init__(args)
        self.score_func = lambda s1, s2: args['lambda1'] * s1 - args['lambda2'] * s2
        self.name = "direct"

        self.true_rel_head, self.true_rel_tail = defaultdict(set), defaultdict(set)
        for triple in self.input_data.all_true_triples:
            self.add_true_triple(triple)
    
    def add_true_triple(self, triple):
        h, r, t = triple
        self.true_rel_tail[h].add((r, t))
        self.true_rel_head[t].add((r, h))

    def get_noise_for_head(self, test_triple, mode="head-batch"):
        args = self.args
        h, r, t = test_triple
        true_cand = self.true_rel_tail[h] if mode == "head-batch" else self.true_rel_head[t]
        s = time.time()
        cand_r_list = random.choices(self.all_relations, k=args['num_cand'])
        cand_e_list = random.choices(self.all_entities, k=args['num_cand'])
        cand_r_e_list = list(set(zip(cand_r_list, cand_e_list)).difference(true_cand))
        cand_r_list, cand_e_list = zip(*cand_r_e_list)
        cand_r_list, cand_e_list = list(cand_r_list), list(cand_e_list)
        args['num_cand'] = len(cand_r_list)

        embed_h = self.kge_model.entity_embedding[h]
        embed_r = self.kge_model.relation_embedding[r]
        embed_t = self.kge_model.entity_embedding[t]
        score = self.kge_model.score_embedding(embed_h, embed_r, embed_t)
        perturbed_embed_h, perturbed_embed_t = None, None
        if mode == "head-batch":
            embed_h_grad = autograd.grad(score, embed_h)[0]
            perturbed_embed_h = embed_h - args['epsilon'] * embed_h_grad
        elif mode == "tail-batch":
            embed_t_grad = autograd.grad(score, embed_t)[0]
            perturbed_embed_t = embed_t - args['epsilon'] * embed_t_grad

        b_begin = 0
        cand_scores = []
        with torch.no_grad():
            while b_begin < args['num_cand']:
                b_cand_r = cand_r_list[b_begin: b_begin + args['num_cand']]
                b_cand_e = cand_e_list[b_begin: b_begin + args['num_cand']]
                b_begin += args['num_cand']

                embed_cand_r = self.kge_model.relation_embedding[b_cand_r]
                embed_cand_e = self.kge_model.entity_embedding[b_cand_e]
                s1, s2 = None, None
                if mode == "head-batch":
                    s1 = self.kge_model.score_embedding(perturbed_embed_h, embed_cand_r, embed_cand_e, mode=mode)
                    s2 = self.kge_model.score_embedding(embed_h, embed_cand_r, embed_cand_e, mode=mode)
                elif mode == "tail-batch":
                    s1 = self.kge_model.score_embedding(embed_cand_e, embed_cand_r, perturbed_embed_t, mode=mode)
                    s2 = self.kge_model.score_embedding(embed_cand_e, embed_cand_r, embed_t, mode=mode)
                score = self.score_func(s1, s2)
                score = score.detach().cpu().numpy().tolist()
                cand_scores += score
        cand_scores = np.array(cand_scores)
        idx = np.argmax(cand_scores)
        score = cand_scores[idx]
        if mode == "head-batch":
            return (h, cand_r_list[idx], cand_e_list[idx]), score.item()
        return (cand_e_list[idx], cand_r_list[idx], t), score.item()

    def get_noise_triples(self):
        noise_triples, args = self.noise_triples, self.args
        args['num_cand'] = np.math.ceil((args['nentity']*args['nrelation'])*args['corruption_factor'] / 100)
        all_true_triples = set(self.input_data.all_true_triples)
        for i in range(len(self.target_triples)):
            sys.stdout.write("%d in %d\r" % (i, len(self.target_triples)))
            sys.stdout.flush()
            target_triple = self.target_triples[i]
            noise_triple_h, score_h = self.get_noise_for_head(target_triple, mode="head-batch")
            noise_triple_t, score_t = self.get_noise_for_head(target_triple, mode="tail-batch")
            if score_h > score_t:
                noise_triples.add(noise_triple_h)
                self.add_true_triple(noise_triple_h)
            else:
                noise_triples.add(noise_triple_t)
                self.add_true_triple(noise_triple_t)
        return list(noise_triples)

class CentralDiffAddition(DirectAddition):
    def __init__(self, args):
        super(CentralDiffAddition, self).__init__(args)
        self.name = "central_diff"
        self.args['epsilon'] = self.args['learning_rate']

    def get_noise_for_head(self, test_triple, mode="head-batch"):
        args = self.args
        h, r, t = test_triple
        true_cand = self.true_rel_tail[h] if mode == "head-batch" else self.true_rel_head[t]
        cand_r_list = random.choices(self.all_relations, k=args['num_cand'])
        cand_e_list = random.choices(self.all_entities, k=args['num_cand'])
        cand_r_e_list = list(set(zip(cand_r_list, cand_e_list)).difference(true_cand))
        cand_r_list, cand_e_list = zip(*cand_r_e_list)
        cand_r_list, cand_e_list = list(cand_r_list), list(cand_e_list)
        args['num_cand'] = len(cand_r_list)

        embed_h = self.kge_model.entity_embedding[h]
        embed_r = self.kge_model.relation_embedding[r]
        embed_t = self.kge_model.entity_embedding[t]
        score = self.kge_model.score_embedding(embed_h, embed_r, embed_t)
        perturbed_embed_e, enforced_embed_e = None, None
        ########## begin difference ############
        if mode == "head-batch":
            embed_h_grad = autograd.grad(score, embed_h)[0]
            perturbed_embed_e = embed_h - args['epsilon'] * embed_h_grad
            enforced_embed_e = embed_h + args['epsilon'] * embed_h_grad
        elif mode == "tail-batch":
            embed_t_grad = autograd.grad(score, embed_t)[0]
            perturbed_embed_e = embed_t - args['epsilon'] * embed_t_grad
            enforced_embed_e = embed_t + args['epsilon'] * embed_t_grad
        ########## end difference ############

        b_begin = 0
        cand_scores = []
        while b_begin < args['num_cand']:
            b_cand_r = cand_r_list[b_begin: b_begin + args['num_cand']]
            b_cand_e = cand_e_list[b_begin: b_begin + args['num_cand']]
            b_begin += args['num_cand']

            embed_cand_r = self.kge_model.relation_embedding[b_cand_r]
            embed_cand_e = self.kge_model.entity_embedding[b_cand_e]
            s1, s2 = None, None
            ########## begin difference ############
            if mode == "head-batch":
                s1 = self.kge_model.score_embedding(perturbed_embed_e, embed_cand_r, embed_cand_e, mode=mode)
                s2 = self.kge_model.score_embedding(enforced_embed_e, embed_cand_r, embed_cand_e, mode=mode)
            elif mode == "tail-batch":
                s1 = self.kge_model.score_embedding(embed_cand_e, embed_cand_r, perturbed_embed_e, mode=mode)
                s2 = self.kge_model.score_embedding(embed_cand_e, embed_cand_r, enforced_embed_e, mode=mode)
            ########## end difference ############
            score = self.score_func(s1, s2)
            score = score.detach().cpu().numpy().tolist()
            cand_scores += score
        cand_scores = np.array(cand_scores)
        idx = np.argmax(cand_scores)
        score = cand_scores[idx]
        if mode == "head-batch":
            return (h, cand_r_list[idx], cand_e_list[idx]), score.item()
        return (cand_e_list[idx], cand_r_list[idx], t), score.item()

class DirectRelAddition(DirectAddition):
    def __init__(self, args):
        super(DirectRelAddition, self).__init__(args)
        self.score_func = lambda s1, s2: args['lambda1'] * s1 - args['lambda2'] * s2
        self.name = "direct_rel"
        self.true_head_tail = {}
        for h, r, t in self.input_data.all_true_triples:
            if r not in self.true_head_tail:
                self.true_head_tail[r] = set()
            self.true_head_tail[r].add((h, t))

    def get_noise_for_head(self, test_triple, mode="head-batch"):
        if mode == "tail-batch":
            return test_triple, -1e9
        args = self.args
        h, r, t = test_triple
        s = time.time()
        true_cand = self.true_head_tail[r]
        cand_h_list = random.choices(self.all_entities, k=args['num_cand'])
        cand_t_list = random.choices(self.all_entities, k=args['num_cand'])
        cand_h_t_list = list(set(zip(cand_h_list, cand_t_list)).difference(true_cand))
        cand_h_list, cand_t_list = zip(*cand_h_t_list)
        cand_h_list, cand_t_list = list(cand_h_list), list(cand_t_list)
        args['num_cand'] = len(cand_h_list)
        e1 = time.time()

        embed_h = self.kge_model.entity_embedding[h]
        embed_r = self.kge_model.relation_embedding[r]
        embed_t = self.kge_model.entity_embedding[t]
        score = self.kge_model.score_embedding(embed_h, embed_r, embed_t)
        embed_r_grad = autograd.grad(score, embed_r)[0]
        perturbed_embed_r = embed_r - args['epsilon'] * embed_r_grad
        e2 = time.time()

        b_begin = 0
        cand_scores = []
        with torch.no_grad():
            while b_begin < args['num_cand']:
                b_cand_h = cand_h_list[b_begin: b_begin + args['num_cand']]
                b_cand_t = cand_t_list[b_begin: b_begin + args['num_cand']]
                b_begin += args['num_cand']

                embed_cand_h = self.kge_model.entity_embedding[b_cand_h]
                embed_cand_t = self.kge_model.entity_embedding[b_cand_t]
                s1 = self.kge_model.score_embedding(embed_cand_h, perturbed_embed_r, embed_cand_t, mode=mode)
                s2 = self.kge_model.score_embedding(embed_cand_h, embed_r, embed_cand_t, mode=mode)
                score = self.score_func(s1, s2)
                score = score.detach().cpu().numpy().tolist()
                cand_scores += score
        cand_scores = np.array(cand_scores)
        idx = np.argmax(cand_scores)
        score = cand_scores[idx]
        e3 = time.time()
        self.true_head_tail[r].add((cand_h_list[idx], cand_t_list[idx]))
        return (cand_h_list[idx], r, cand_t_list[idx]), score.item()

# 3. 添加相似度噪声样本

In [6]:
def generate_nghbrs(target_triples, train_triples):
    '''
    For every triple in target_triples set,
    return the index of neighbouring triple in train triples,
    '''
    triple2nghbrs = {}
    train_triples = np.array(train_triples)
    for h, r, t in target_triples:
        mask = (np.isin(train_triples[:, 0], [h, t]) | np.isin(train_triples[:, 2], [h, t]))
        mask_idx = np.where(mask)[0]
        triple2nghbrs[(h, r, t)] = [tuple(triple) for triple in train_triples[mask_idx].tolist()]
    return triple2nghbrs


class InstanceAttributionCos(GlobalRandomNoiseAttacker):
    def __init__(self, args):
        super(InstanceAttributionCos, self).__init__(args)
        self.entity_embedding = self.kge_model.entity_embedding.data
        self.train_triples = np.array(self.input_data.train_triples)
        triple2nghbrs_path = os.path.join(args['data_path'], "triple2nghbrs.pkl")
        if not os.path.exists(triple2nghbrs_path):
            with open(triple2nghbrs_path, "wb") as fw:
                self.triple2nghbrs = generate_nghbrs(self.target_triples, self.train_triples)
                pickle.dump(self.triple2nghbrs, fw)
        else:
            with open(triple2nghbrs_path, "rb") as f:
                self.triple2nghbrs = pickle.load(f)
        print(f"generate_nghbrs done")
        self.similarity_func = lambda vec, nghbr_vec: F.cosine_similarity(vec, nghbr_vec)
        self.name = "is_cos"

    def get_influential_triples(self):
        args = self.args
        influential_triples_path = os.path.join(args['init_checkpoint'], "%s_influential_triples.pkl" % self.name)
        if not args['no_store'] and os.path.exists(influential_triples_path):
            with open(influential_triples_path, "rb") as f:
                return pickle.load(f)
        triple2influential_triple = {}
        for i, (h, r, t) in enumerate(self.target_triples):
            sys.stdout.write("influential:\t%d in %d\r" % (i, len(self.target_triples)))
            sys.stdout.flush()
            sample = torch.LongTensor([h, r, t]).view(1, -1)
            if self.args['cuda']:
                sample = sample.cuda()
            vec_score = self.kge_model(sample, mode="single", get_vec=True).view(1, -1)
            ngbhrs = self.triple2nghbrs[(h, r, t)]
            if len(ngbhrs) == 0:
                print(f"we don't need to attack {h, r, t} in {args['data_path']}")
                continue
            b_beign = 0
            nghbr_sim = []
            while b_beign < len(ngbhrs):
                b_ngbhrs = ngbhrs[b_beign: b_beign+args['num_cand_batch']]
                b_beign += args['num_cand_batch']
                b_ngbhrs = torch.LongTensor(b_ngbhrs).view(-1, 3)
                if self.args['cuda']:
                    b_ngbhrs = b_ngbhrs.cuda()
                b_ngbhrs_vec = self.kge_model(b_ngbhrs, mode="single", get_vec=True).view(-1, vec_score.shape[-1])
                b_sim = self.similarity_func(vec_score, b_ngbhrs_vec).detach().cpu().numpy().tolist()
                nghbr_sim += b_sim
            nghbr_sim = np.array(nghbr_sim)
            idx = np.argmax(nghbr_sim)
            triple2influential_triple[(h, r, t)] = ngbhrs[idx]
        if not args['no_store']:
            with open(influential_triples_path, "wb") as fw:
                pickle.dump(triple2influential_triple, fw)
        return triple2influential_triple

    def find_least_similarity_entity(self, entity, r, e, mode):
        train_triples = np.array(self.input_data.train_triples + list(self.noise_triples))
        ent_embed = self.kge_model.entity_embedding[entity].view(1, -1)
        cos_sim_ent = F.cosine_similarity(ent_embed, self.entity_embedding)
        filter_ent = None
        if mode == "head-mode":
            filter_ent = train_triples[np.where((train_triples[:, 2] == e) & (train_triples[:, 1] == r)), 0]
        elif mode == "tail-mode":
            filter_ent = train_triples[np.where((train_triples[:, 0] == e) & (train_triples[:, 1] == r)), 2]
        cos_sim_ent[filter_ent.squeeze()] = 1e8
        idx = torch.argmin(cos_sim_ent).item()
        return idx

    def get_noise_triples(self):
        noise_triples = self.noise_triples
        influential_triples = self.get_influential_triples()
        for i in range(len(self.target_triples)):
            sys.stdout.write("%d in %d\r" % (i, len(self.target_triples)))
            sys.stdout.flush()
            h, r, t = self.target_triples[i]
            if (h, r, t) not in influential_triples:
                continue
            hi, ri, ti = influential_triples[(h, r, t)]
            if ti in [h, t]:
                fake_head = self.find_least_similarity_entity(hi, ri, ti, mode="head-mode")
                noise_triples.add((fake_head, ri, ti))
            elif hi in [h, t]:
                fake_tail = self.find_least_similarity_entity(ti, ri, hi, mode="tail-mode")
                noise_triples.add((hi, ri, fake_tail))
            else:
                print("unexpected behavior")

        return list(noise_triples)


class InstanceAttributionDot(InstanceAttributionCos):
    def __init__(self, args):
        super(InstanceAttributionDot, self).__init__(args)
        self.similarity_func = lambda vec, nghbr_vec: torch.matmul(vec, nghbr_vec.t())
        self.name = "is_dot"


class InstanceAttributionL2(InstanceAttributionCos):
    def __init__(self, args):
        super(InstanceAttributionL2, self).__init__(args)
        self.similarity_func = lambda vec, nghbr_vec: -torch.norm((nghbr_vec-vec), p=2, dim=-1)
        self.name = "is_l2"

# 4. 添加梯度相似度噪声样本

In [7]:
def get_non_zero_idx(matrix1, matrix2):
    idx1 = set(torch.nonzero(matrix1)[:, 0].detach().cpu().numpy().tolist())
    idx2 = set(torch.nonzero(matrix2)[:, 0].detach().cpu().numpy().tolist())
    idx = list(idx1.intersection(idx2))
    return idx

def jacobian(y: torch.Tensor, x: torch.Tensor, need_higher_grad=False) -> torch.Tensor:
    """ refer:https://zhuanlan.zhihu.com/p/530879775
    基于 torch.autograd.grad 函数的更清晰明了的 API，功能是计算一个雅可比矩阵。

    Args:
        y (torch.Tensor): 函数输出向量
        x (torch.Tensor): 函数输入向量
        need_higher_grad (bool, optional): 是否需要计算高阶导数，如果确定不需要可以设置为 False 以节约资源. 默认为 True.

    Returns:
        torch.Tensor: 计算好的“雅可比矩阵”。注意！输出的“雅可比矩阵”形状为 y.shape + x.shape。例如：y 是 n 个元素的张量，y.shape = [n]；x 是 m 个元素的张量，x.shape = [m]，则输出的雅可比矩阵形状为 n x m，符合常见的数学定义。
        但是若 y 是 1 x n 的张量，y.shape = [1,n]；x 是 1 x m 的张量，x.shape = [1,m]，则输出的雅可比矩阵形状为1 x n x 1 x m，如果嫌弃多余的维度可以自行使用 torch.squeeze(Jac) 一步到位。
        这样设计是因为考虑到 y 是 n1 x n2 的张量； 是 m1 x m2 的张量（或者形状更复杂的张量）时，输出 n1 x n2 x m1 x m2 （或对应更复杂形状）更有直观含义，方便用户知道哪一个元素对应的是哪一个偏导。
    """
    (Jac,) = torch.autograd.grad(
        outputs=(y.flatten(),),
        inputs=(x,),
        grad_outputs=(torch.eye(torch.numel(y)).cuda(),),
        retain_graph=True,
        create_graph=need_higher_grad,
        allow_unused=True,
        is_grads_batched=True
    )
    if Jac is None:
        Jac = torch.zeros(size=(y.shape + x.shape))
    else:
        Jac.reshape(shape=(y.shape + x.shape))
    return Jac



class InstanceAttributionCosGrad(InstanceAttributionCos):
    def __init__(self, args):
        super(InstanceAttributionCosGrad, self).__init__(args)
        self.similarity_func = lambda grad, nghbr_grad: F.cosine_similarity(grad, nghbr_grad)
        self.loss_func = torch.nn.CrossEntropyLoss(reduction="none")
        self.name = "gs_cos"

        named_parameters = list(self.kge_model.named_parameters())
        self.param_list = []
        for n, p in named_parameters:
            if p.requires_grad:
                self.param_list.append(p)

    def get_loss(self, triple):
        h, r, t = triple[:, 0], triple[:, 1], triple[:, 2]
        embed_h = self.kge_model.entity_embedding[h]
        embed_r = self.kge_model.relation_embedding[r]
        embed_t = self.kge_model.entity_embedding[t]
        loss = 0
        pred_h_embedding = self.kge_model.predict_embedding(embed_t, embed_r, "head-mode")
        pred_h = torch.mm(pred_h_embedding.squeeze(1), self.kge_model.entity_embedding.transpose(1,0))
        pred_h = torch.sigmoid(pred_h)
        loss += self.loss_func(pred_h, h)

        pred_t_embedding = self.kge_model.predict_embedding(embed_h, embed_r, "tail-mode")
        pred_t = torch.mm(pred_t_embedding.squeeze(1), self.kge_model.entity_embedding.transpose(1,0))
        pred_t = torch.sigmoid(pred_t)
        loss += self.loss_func(pred_t, t)
        return loss

    # given loss of two triples, calculate the score between them
    def calc_influential_score(self, loss1, loss2):
        batch_size = loss2.view(-1).shape[0]
        grad_e1 = jacobian(loss1, self.param_list[0]) # size:1*E*dim
        grad_r1 = jacobian(loss1, self.param_list[1]) # size:1*R*dim
        # embed()
        grad_e2 = jacobian(loss2, self.param_list[0]) # size:B*E*dim
        grad_r2 = jacobian(loss2, self.param_list[1]) # size:B*R*dim
        # grad_e1, grad_r1 = autograd.grad(loss1, self.param_list, retain_graph=True)
        # grad_e2, grad_r2 = autograd.grad(sumed_loss2, batched_param_list, retain_graph=True, \
        #     grad_outputs=(torch.eye(torch.numel(sumed_loss2)),), is_grads_batched=True)
        # embed()
        grad_e1, grad_e2 = grad_e1.view(1, -1), grad_e2.view(batch_size, -1)
        grad_r1, grad_r2 = grad_r1.view(1, -1), grad_r2.view(batch_size, -1)
        score = self.similarity_func(grad_e1, grad_e2) + self.similarity_func(grad_r1, grad_r2)
        score = score.detach().cpu().numpy().tolist()
        del grad_e1
        del grad_e2
        del grad_r1
        del grad_r2
        torch.cuda.empty_cache()
        return score

    def get_influential_triples(self):
        args = self.args
        influential_triples_path = os.path.join(args['init_checkpoint'], "%s_influential_triples.pkl" % self.name)
        if not args['no_store'] and os.path.exists(influential_triples_path):
            with open(influential_triples_path, "rb") as f:
                triple2influential_triple = pickle.load(f)
                if (triple2influential_triple is not None and type(triple2influential_triple) == type({1:1}) and all([triple in triple2influential_triple for triple in self.target_triples])):
                    return triple2influential_triple
        
        triple2influential_triple = {}
        for i, target_triple in enumerate(self.target_triples):
            ngbhrs = self.triple2nghbrs[target_triple]
            target_triple = torch.LongTensor(target_triple).view(-1, 3).cuda()
            target_loss = self.get_loss(target_triple)
            if len(ngbhrs) == 0:
                print(f"we don't need to attack {target_triple} in {args['data_path']}")
                continue
            nghbr_sim = []
            b_beign = 0
            # for i, b_ngbhrs in enumerate(ngbhrs):
            while b_beign < len(ngbhrs):
                b_ngbhrs = ngbhrs[b_beign: b_beign+args['num_cand_batch']]
                b_beign += args['num_cand_batch']

                t1 = time.time()
                b_ngbhrs = torch.LongTensor(b_ngbhrs).view(-1, 3).cuda()
                ngbhr_loss = self.get_loss(b_ngbhrs)
                grad_sim = self.calc_influential_score(target_loss, ngbhr_loss)
                nghbr_sim += grad_sim
                t2 = time.time()
                # print(f"time used: {t2 - t1}: {b_beign}/{len(ngbhrs)}")
            nghbr_sim = np.array(nghbr_sim)
            idx = np.argmax(nghbr_sim)
            target_triple = tuple(target_triple.view(-1).detach().cpu().tolist())
            triple2influential_triple[target_triple] = ngbhrs[idx]

            sys.stdout.write("influential:\t%d in %d\r" % (i, len(self.target_triples)))
            sys.stdout.flush()
        if not args['no_store']:
            with open(influential_triples_path, "wb") as fw:
                pickle.dump(triple2influential_triple, fw)
        return triple2influential_triple


class InstanceAttributionDotGrad(InstanceAttributionCosGrad):
    def __init__(self, args):
        super(InstanceAttributionDotGrad, self).__init__(args)
        self.similarity_func = lambda grad, nghbr_grad: torch.matmul(grad, nghbr_grad.T)
        self.name = "gs_dot"
        # embed()

class InstanceAttributionL2Grad(InstanceAttributionCosGrad):
    def __init__(self, args):
        super(InstanceAttributionL2Grad, self).__init__(args)
        self.similarity_func = lambda grad, nghbr_grad: -torch.norm((grad-nghbr_grad), p=2, dim=-1)
        self.name = "gs_l2"

# 5. 生成transE的wn18rr噪声

In [8]:
## 修改model可以改变模型
## 修改data_path可以改变训练集

args={
    'cuda': True, 
    'fake': None, 
    'do_train': True, 
    'do_valid': False, 
    'do_test': True, 
    'evaluate_train': False, 
    'data_path': 'data/wn18rr', 
    'model': 'TransE', 
    'double_entity_embedding': False, 
    'double_relation_embedding': False, 
    'negative_sample_size': 1024, 
    'hidden_dim': 200, 
    'gamma': 6.0, 
    'negative_adversarial_sampling': True, 
    'adversarial_temperature': 0.5, 
    'batch_size': 512, 
    'regularization': 0.0, 
    'test_batch_size': 8, 
    'uni_weight': False, 
    'learning_rate': 0.0005, 
    'cpu_num': 10, 
    'init_checkpoint': './models/TransE_wn18rr_baseline', 
    'max_steps': 40000, 
    'warm_up_steps': 20000, 
    'no_save': False, 
    'save_path': 'models/TransE_wn18rr_baseline', 
    'comments': '\n', 
    'save_checkpoint_steps': 10000, 
    'valid_steps': 10000, 
    'log_steps': 2000, 
    'classify_steps': 5000, 
    'test_log_steps': 1000, 
    'nentity': 40943, 
    'nrelation': 11, 
    'target_triples': None, 
    'identifier': None, 
    'epsilon': 1.0, 
    'lambda1': 1.0, 
    'lambda2': 1.0, 
    'corruption_factor': 10.0, 
    'num_cand_batch': 64, 
    'no_store': False
}
## 攻击实体
DirectAddition(args).generate("direct_10")
## 攻击实体    
CentralDiffAddition(args).generate("central_diff_10")
## 攻击关系
DirectRelAddition(args).generate("direct_rel")

load model from ./models/TransE_wn18rr_baseline/checkpoint
------ direct starts to generate noise for TransE_wn18rr_baseline ------
Time taken:7.952876567840576
Num Noise:100
False Negative: 0
load model from ./models/TransE_wn18rr_baseline/checkpoint
------ central_diff starts to generate noise for TransE_wn18rr_baseline ------
Time taken:8.396076917648315
Num Noise:100
False Negative: 0
load model from ./models/TransE_wn18rr_baseline/checkpoint
------ direct_rel starts to generate noise for TransE_wn18rr_baseline ------
Time taken:28.697504997253418
Num Noise:100
False Negative: 0


# 6. 基于相似度生成噪声

In [9]:
# 注意删除以前生成的pkl文件
# 余弦相似度
InstanceAttributionCos(args).generate("is_cos")
# 点积相似度
InstanceAttributionDot(args).generate("is_dot")
# L2范数相似度
InstanceAttributionL2(args).generate("is_l2")

load model from ./models/TransE_wn18rr_baseline/checkpoint
generate_nghbrs done
------ is_cos starts to generate noise for TransE_wn18rr_baseline ------
Time taken:1.839982032775879
Num Noise:50
False Negative: 0
load model from ./models/TransE_wn18rr_baseline/checkpoint
generate_nghbrs done
------ is_dot starts to generate noise for TransE_wn18rr_baseline ------
Time taken:2.161444902420044
Num Noise:50
False Negative: 0
load model from ./models/TransE_wn18rr_baseline/checkpoint
generate_nghbrs done
------ is_l2 starts to generate noise for TransE_wn18rr_baseline ------
Time taken:2.0773956775665283
Num Noise:50
False Negative: 0


# 7. 基于梯度相似度生成噪声样本

In [14]:
# 注意删除以前生成的pkl文件，如triple2nghbrs.pkl
## 梯度余弦
InstanceAttributionCosGrad(args).generate("gs_cos")
## 梯度点积
InstanceAttributionDotGrad(args).generate("gs_dot")
## 梯度L2范数
InstanceAttributionL2Grad(args).generate("gs_l2")

load model from ./models/TransE_wn18rr_baseline/checkpoint
generate_nghbrs done
------ gs_cos starts to generate noise for TransE_wn18rr_baseline ------
Time taken:49.068500995635986
Num Noise:100
False Negative: 0
load model from ./models/TransE_wn18rr_baseline/checkpoint
generate_nghbrs done
------ gs_dot starts to generate noise for TransE_wn18rr_baseline ------
influential:	0 in 100

  nghbr_sim = np.array(nghbr_sim)


Time taken:48.77861666679382
Num Noise:100
False Negative: 0
load model from ./models/TransE_wn18rr_baseline/checkpoint
generate_nghbrs done
------ gs_l2 starts to generate noise for TransE_wn18rr_baseline ------
Time taken:48.550278425216675
Num Noise:100
False Negative: 0
