In [1]:
import numpy as np
import torch
import sys
import os

# Training split of FB15k-237; each line is parsed as three tab-separated integers.
data_path = '/root/Low_Dimension_KGC/data/FB15k-237'
train_data_path = data_path + '/train'

# entity_num is not used in this cell; relation_num is added to a relation id
# to obtain the id of the corresponding reverse relation.
entity_num = 14541
relation_num = 237

triples = []      # (head, relation, tail) tuples, forward and reversed
query_dict = {}   # (head, relation) -> list of tails seen for that query

with open(train_data_path) as fin:
    for line in fin:
        h, r, t = line.strip().split('\t')
        head_id, rel_id, tail_id = int(h), int(r), int(t)
        directed = (
            (head_id, rel_id, tail_id),                 # forward triple
            (tail_id, rel_id + relation_num, head_id),  # reverse triple
        )
        for src, rel, dst in directed:
            triples.append((src, rel, dst))
            query_dict.setdefault((src, rel), []).append(dst)

print('所有的三元组共有：', len(triples), '个')
print('不同的query共有：', len(query_dict), '个')


  from .autonotebook import tqdm as notebook_tqdm


所有的三元组共有： 544230 个
不同的query共有： 149689 个


In [6]:
import numpy as np
import matplotlib.pyplot as plt
import random
import os
from collections import defaultdict

# Directory holding the FB15k-237 files (hard-coded absolute path).
data_path = '/root/Low_Dimension_KGC/data/FB15k-237'

def read_triple_with_reverse(file_path, entity2id, relation2id):
    """Read tab-separated triples and add a reverse triple for each one.

    Every non-blank line of ``file_path`` must contain exactly three
    tab-separated fields: head, relation, tail. Blank lines (for example a
    trailing empty line) are skipped instead of raising ``ValueError``.

    Args:
        file_path: Path of the triple file to read.
        entity2id: Mapping from entity name to integer id.
        relation2id: Mapping from relation name to integer id.

    Returns:
        A list of ``(head_id, relation_id, tail_id)`` tuples. Each input line
        contributes the forward triple followed by the reverse triple
        ``(tail_id, relation_id + len(relation2id), head_id)``.

    Raises:
        KeyError: If an entity or relation name is missing from its mapping.
        ValueError: If a non-blank line does not have exactly three fields.
    """
    triples = []
    # Reverse relations get ids shifted by the number of known relations.
    relation_num = len(relation2id)
    with open(file_path) as fin:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            h, r, t = line.split('\t')
            head_id, rel_id, tail_id = entity2id[h], relation2id[r], entity2id[t]
            triples.append((head_id, rel_id, tail_id))
            triples.append((tail_id, rel_id + relation_num, head_id))
    return triples

# entities.dict: each line is "<id>\t<entity name>".
entity2id = {}
with open(os.path.join(data_path, 'entities.dict')) as fin:
    for line in fin:
        eid, entity = line.strip().split('\t')
        entity2id[entity] = int(eid)

# relations.dict: each line is "<id>\t<relation name>".
relation2id = {}
with open(os.path.join(data_path, 'relations.dict')) as fin:
    for line in fin:
        rid, relation = line.strip().split('\t')
        relation2id[relation] = int(rid)

# Load the three splits; every triple is paired with its reverse.
train_triples = read_triple_with_reverse(
    os.path.join(data_path, 'train.txt'), entity2id, relation2id
)
valid_triples = read_triple_with_reverse(
    os.path.join(data_path, 'valid.txt'), entity2id, relation2id
)
test_triples = read_triple_with_reverse(
    os.path.join(data_path, 'test.txt'), entity2id, relation2id
)

def judge_relation_type(triples, threshold=1.5):
    """Split relations into 1-to-1, 1-to-Many, Many-to-1 and Many-to-Many.

    For every relation, the average number of distinct tails per head and the
    average number of distinct heads per tail are computed. A side counts as
    "many" when its average is at least ``threshold``.

    Args:
        triples: Iterable of ``(head, relation, tail)`` tuples.
        threshold: Cutoff separating "one" from "many". Defaults to 1.5,
            the value that was previously hard-coded.

    Returns:
        A 4-tuple of lists ``(one_to_one, one_to_many, many_to_one,
        many_to_many)`` holding relation ids in order of first appearance in
        ``triples``. Every relation appears in exactly one list.
    """
    tails_by_head = defaultdict(lambda: defaultdict(set))  # {relation: {head: set(tails)}}
    heads_by_tail = defaultdict(lambda: defaultdict(set))  # {relation: {tail: set(heads)}}

    for h, r, t in triples:
        tails_by_head[r][h].add(t)
        heads_by_tail[r][t].add(h)

    one_to_one, one_to_many, many_to_one, many_to_many = [], [], [], []

    for r, head_map in tails_by_head.items():
        tail_map = heads_by_tail[r]
        avg_tails_per_head = sum(len(tails) for tails in head_map.values()) / len(head_map)
        avg_heads_per_tail = sum(len(heads) for heads in tail_map.values()) / len(tail_map)

        many_tails = avg_tails_per_head >= threshold
        many_heads = avg_heads_per_tail >= threshold

        if many_tails and many_heads:
            many_to_many.append(r)
        elif many_tails:
            one_to_many.append(r)
        elif many_heads:
            many_to_one.append(r)
        else:
            one_to_one.append(r)

    return one_to_one, one_to_many, many_to_one, many_to_many


In [10]:
# 1. Classify relation types from the training triples alone, and again from
#    the training and test triples combined (stored under the *_test names).
(
    one_to_one_train,
    one_to_many_train,
    many_to_one_train,
    many_to_many_train,
) = judge_relation_type(train_triples)
(
    one_to_one_test,
    one_to_many_test,
    many_to_one_test,
    many_to_many_test,
) = judge_relation_type(train_triples + test_triples)

# 2. Helper that counts triples per relation type
def count_triples_by_relation_type(triples, one_to_one, one_to_many, many_to_one, many_to_many):
    """Count how many triples fall into each relation type.

    Args:
        triples: Iterable of ``(head, relation, tail)`` tuples.
        one_to_one: Relation ids classified as 1-to-1.
        one_to_many: Relation ids classified as 1-to-Many.
        many_to_one: Relation ids classified as Many-to-1.
        many_to_many: Relation ids classified as Many-to-Many.

    Returns:
        A dict with the keys ``"1-to-1"``, ``"1-to-Many"``, ``"Many-to-1"``
        and ``"Many-to-Many"`` mapped to triple counts. Triples whose
        relation is in none of the four collections are not counted.
    """
    labelled_groups = (
        ("1-to-1", one_to_one),
        ("1-to-Many", one_to_many),
        ("Many-to-1", many_to_one),
        ("Many-to-Many", many_to_many),
    )

    # Build one lookup table up front instead of scanning four lists per
    # triple. setdefault keeps the first label for a relation listed more
    # than once, matching the priority of the original if/elif chain.
    type_of_relation = {}
    for label, relations in labelled_groups:
        for relation in relations:
            type_of_relation.setdefault(relation, label)

    relation_type_counts = {label: 0 for label, _ in labelled_groups}
    for _, r, _ in triples:
        label = type_of_relation.get(r)
        if label is not None:
            relation_type_counts[label] += 1

    return relation_type_counts

# 3. Count the triples of each relation type in the training and test sets
train_relation_counts = count_triples_by_relation_type(
    train_triples,
    one_to_one_train, one_to_many_train, many_to_one_train, many_to_many_train,
)
test_relation_counts = count_triples_by_relation_type(
    test_triples,
    one_to_one_test, one_to_many_test, many_to_one_test, many_to_many_test,
)

# 4. Express each count as a percentage of its split
total_train_triples = len(train_triples)
total_test_triples = len(test_triples)

train_relation_percentages = {
    label: n / total_train_triples * 100 for label, n in train_relation_counts.items()
}
test_relation_percentages = {
    label: n / total_test_triples * 100 for label, n in test_relation_counts.items()
}

# 5. Print the results, training split first
for header, counts, percentages in (
    ("Train Triples Relation Counts:", train_relation_counts, train_relation_percentages),
    ("\nTest Triples Relation Counts:", test_relation_counts, test_relation_percentages),
):
    print(header)
    for relation_type, count in counts.items():
        print(f"{relation_type}: {count} triples, {percentages[relation_type]:.2f}% of total")


Train Triples Relation Counts:
1-to-1: 8556 triples, 1.57% of total
1-to-Many: 63171 triples, 11.61% of total
Many-to-1: 63171 triples, 11.61% of total
Many-to-Many: 409332 triples, 75.21% of total

Test Triples Relation Counts:
1-to-1: 384 triples, 0.94% of total
1-to-Many: 5722 triples, 13.98% of total
Many-to-1: 5722 triples, 13.98% of total
Many-to-Many: 29104 triples, 71.10% of total


In [8]:
"""
统计LorentzKG在不同关系类型下的实验指标
"""

import numpy as np
from collections import defaultdict

# 文件路径
LorentzKG_test_details_path = '/root/Low_Dimension_KGC/models/LorentzKG_FB15k-237_40/test_detail_result.txt'

# 假设已经有了这些关系类型的列表
one_to_one_train, one_to_many_train, many_to_one_train, many_to_many_train = judge_relation_type(train_triples)
one_to_one_test, one_to_many_test, many_to_one_test, many_to_many_test = judge_relation_type(train_triples + test_triples)

# 关系类型映射（假设关系的 ID 对应到关系类型）
relation_type_mapping = {}
for r in one_to_one_train:
    relation_type_mapping[r] = '1-to-1'
for r in one_to_many_train:
    relation_type_mapping[r] = '1-to-Many'
for r in many_to_one_train:
    relation_type_mapping[r] = 'Many-to-1'
for r in many_to_many_train:
    relation_type_mapping[r] = 'Many-to-Many'

# Read the result file and compute metrics per relation type
def read_and_calculate_metrics(file_path, relation_type_mapping):
    """Compute MRR and Hits@{1,3,10} per relation type from a rank file.

    Every non-blank line of ``file_path`` must be
    ``head<TAB>relation<TAB>tail<TAB>rank`` with integer ``relation`` and
    ``rank``. Blank lines are skipped instead of raising ``ValueError``.
    Lines whose relation has no (truthy) entry in ``relation_type_mapping``
    are ignored.

    Args:
        file_path: Path of the tab-separated result file.
        relation_type_mapping: Mapping from integer relation id to a
            relation-type label.

    Returns:
        A dict mapping each relation-type label that occurred to a dict with
        the keys ``'MRR'``, ``'HIT1'``, ``'HIT3'``, ``'HIT10'`` (means over
        that type's ranks) and ``'count'`` (number of ranks).

    Raises:
        ValueError: If a non-blank line does not have four fields, or if
            ``relation`` or ``rank`` is not an integer.
    """
    # Ranks collected per relation type; an entry exists only once a rank
    # has been appended, so no list here is ever empty.
    ranks_by_type = defaultdict(list)

    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # Line format: head \t relation \t tail \t rank
            head, relation, tail, rank = line.split('\t')
            relation = int(relation)
            rank = int(rank)

            relation_type = relation_type_mapping.get(relation, None)
            if relation_type:
                ranks_by_type[relation_type].append(rank)

    result = {}
    for relation_type, rank_list in ranks_by_type.items():
        ranks = np.array(rank_list)
        result[relation_type] = {
            'MRR': np.mean(1.0 / ranks),
            'HIT1': np.mean(ranks <= 1),    # fraction with rank <= 1
            'HIT3': np.mean(ranks <= 3),    # fraction with rank <= 3
            'HIT10': np.mean(ranks <= 10),  # fraction with rank <= 10
            'count': len(rank_list),
        }

    return result

# Read the result file and compute the metrics for every relation type
metrics = read_and_calculate_metrics(LorentzKG_test_details_path, relation_type_mapping)

# Print one report per relation type
for relation_type, data in metrics.items():
    report_lines = [
        f"Relation Type: {relation_type}",
        f"  MRR: {data['MRR']:.4f}",
        f"  HIT1: {data['HIT1']:.4f}",
        f"  HIT3: {data['HIT3']:.4f}",
        f"  HIT10: {data['HIT10']:.4f}",
        f"  Total samples: {data['count']}\n",
    ]
    for report_line in report_lines:
        print(report_line)


Relation Type: Many-to-Many
  MRR: 0.3742
  HIT1: 0.2840
  HIT3: 0.4060
  HIT10: 0.5560
  Total samples: 28946

Relation Type: Many-to-1
  MRR: 0.6218
  HIT1: 0.5520
  HIT3: 0.6530
  HIT10: 0.7509
  Total samples: 5801

Relation Type: 1-to-Many
  MRR: 0.2687
  HIT1: 0.2217
  HIT3: 0.2787
  HIT10: 0.3572
  Total samples: 5801

Relation Type: 1-to-1
  MRR: 0.4547
  HIT1: 0.4115
  HIT3: 0.4740
  HIT10: 0.5260
  Total samples: 384



In [5]:
"""

查看关系类型随着test_triples的加入是否会有很大的变化

"""

# Classification based on the training triples only
(
    one_to_one_train,
    one_to_many_train,
    many_to_one_train,
    many_to_many_train,
) = judge_relation_type(train_triples)

# Classification based on the training and test triples together
(
    one_to_one_all,
    one_to_many_all,
    many_to_one_all,
    many_to_many_all,
) = judge_relation_type(train_triples + test_triples)

# Summarize how the membership of one relation type changed
def count_relation_changes(before, after):
    """Compare two collections of relations.

    Returns a 6-tuple: the number of relations added, removed and unchanged,
    followed by the corresponding three sets. "Added" means present only in
    ``after``; "removed" means present only in ``before``.
    """
    before_set = set(before)
    after_set = set(after)
    added = after_set.difference(before_set)
    removed = before_set.difference(after_set)
    unchanged = before_set.intersection(after_set)
    return len(added), len(removed), len(unchanged), added, removed, unchanged

# Compare the train-only and train+test classification for each relation type
(
    one_to_one_added,
    one_to_one_removed,
    one_to_one_unchanged,
    one_to_one_added_rel,
    one_to_one_removed_rel,
    one_to_one_unchanged_rel,
) = count_relation_changes(one_to_one_train, one_to_one_all)
(
    one_to_many_added,
    one_to_many_removed,
    one_to_many_unchanged,
    one_to_many_added_rel,
    one_to_many_removed_rel,
    one_to_many_unchanged_rel,
) = count_relation_changes(one_to_many_train, one_to_many_all)
(
    many_to_one_added,
    many_to_one_removed,
    many_to_one_unchanged,
    many_to_one_added_rel,
    many_to_one_removed_rel,
    many_to_one_unchanged_rel,
) = count_relation_changes(many_to_one_train, many_to_one_all)
(
    many_to_many_added,
    many_to_many_removed,
    many_to_many_unchanged,
    many_to_many_added_rel,
    many_to_many_removed_rel,
    many_to_many_unchanged_rel,
) = count_relation_changes(many_to_many_train, many_to_many_all)

# Print the details; each empty string produces the blank line between sections.
change_report = (
    "1-to-1 Relations:",
    f"Added: {one_to_one_added} relations, Removed: {one_to_one_removed} relations, Unchanged: {one_to_one_unchanged} relations",
    f"Added Relations: {one_to_one_added_rel}",
    f"Removed Relations: {one_to_one_removed_rel}",
    f"Unchanged Relations: {one_to_one_unchanged_rel}",
    "",
    "1-to-Many Relations:",
    f"Added: {one_to_many_added} relations, Removed: {one_to_many_removed} relations, Unchanged: {one_to_many_unchanged} relations",
    f"Added Relations: {one_to_many_added_rel}",
    f"Removed Relations: {one_to_many_removed_rel}",
    f"Unchanged Relations: {one_to_many_unchanged_rel}",
    "",
    "Many-to-1 Relations:",
    f"Added: {many_to_one_added} relations, Removed: {many_to_one_removed} relations, Unchanged: {many_to_one_unchanged} relations",
    f"Added Relations: {many_to_one_added_rel}",
    f"Removed Relations: {many_to_one_removed_rel}",
    f"Unchanged Relations: {many_to_one_unchanged_rel}",
    "",
    "Many-to-Many Relations:",
    f"Added: {many_to_many_added} relations, Removed: {many_to_many_removed} relations, Unchanged: {many_to_many_unchanged} relations",
    f"Added Relations: {many_to_many_added_rel}",
    f"Removed Relations: {many_to_many_removed_rel}",
    f"Unchanged Relations: {many_to_many_unchanged_rel}",
)
for report_line in change_report:
    print(report_line)



# Assumes train_triples and test_triples are already-defined lists of triples,
# each of the form (subject, predicate, object).

# Relation ids of interest
target_relations = [113, 52, 289, 350] # the relation type of these four relations changed

# Helper that counts the triples of selected relations
def count_relation_in_triples(triples, target_relations):
    """Count how many triples use each of the given relations.

    Args:
        triples: Iterable of ``(head, predicate, tail)`` tuples.
        target_relations: Relation ids to count.

    Returns:
        A dict mapping every id in ``target_relations`` to the number of
        triples whose predicate equals it (0 when it never occurs).
    """
    count_dict = {relation: 0 for relation in target_relations}
    for triple in triples:
        _, predicate, _ = triple
        # The keys of count_dict are exactly the targets, so test membership
        # against the dict instead of scanning the list for every triple.
        if predicate in count_dict:
            count_dict[predicate] += 1
    return count_dict

# Count the triples of each target relation in train_triples and test_triples
train_relation_counts = count_relation_in_triples(train_triples, target_relations)
test_relation_counts = count_relation_in_triples(test_triples, target_relations)

# Print the counts, training split first
for header, counts in (
    ("Train Triples Relation Counts:", train_relation_counts),
    ("\nTest Triples Relation Counts:", test_relation_counts),
):
    print(header)
    for relation in target_relations:
        print(f"Relation {relation}: {counts[relation]} triples")



1-to-1 Relations:
Added: 0 relations, Removed: 0 relations, Unchanged: 34 relations
Added Relations: set()
Removed Relations: set()
Unchanged Relations: {390, 263, 391, 137, 395, 142, 404, 406, 153, 26, 154, 158, 419, 167, 169, 300, 303, 182, 315, 63, 322, 66, 330, 78, 464, 337, 85, 93, 353, 227, 100, 116, 374, 379}

1-to-Many Relations:
Added: 0 relations, Removed: 2 relations, Unchanged: 110 relations
Added Relations: set()
Removed Relations: {289, 350}
Unchanged Relations: {20, 24, 25, 33, 39, 44, 46, 68, 71, 73, 90, 91, 106, 110, 117, 124, 146, 162, 168, 174, 175, 209, 221, 224, 231, 235, 237, 240, 243, 244, 249, 251, 252, 256, 264, 265, 266, 267, 272, 273, 274, 275, 278, 279, 280, 288, 294, 295, 297, 301, 304, 306, 309, 313, 316, 318, 320, 323, 329, 331, 333, 334, 338, 339, 351, 352, 357, 359, 363, 364, 367, 368, 375, 377, 378, 380, 384, 385, 386, 387, 393, 394, 396, 397, 408, 423, 424, 425, 428, 429, 430, 434, 438, 440, 443, 445, 447, 449, 450, 451, 454, 455, 456, 459, 463, 465, 