In [35]:
# === 0. 环境 & 配置 ===
import os, math, random, collections, time
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
try:
    from tqdm import tqdm
except ImportError:
    # tqdm is optional. Only a missing package should trigger the fallback;
    # a bare `except:` would also hide KeyboardInterrupt and real import bugs.
    def tqdm(iterable, **kwargs):
        """No-op stand-in for tqdm: return the iterable unchanged, ignore options."""
        return iterable

SEED = 42
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Dataset location. The original notebook hard-coded an absolute Windows path;
# it is kept as the default, but can be overridden without editing the notebook
# by setting the OPENBG500_DIR environment variable.
DATA_DIR = os.environ.get("OPENBG500_DIR", r"D:\workspace\...\小论文-知识图谱\data\OpenBG500")
TRAIN = os.path.join(DATA_DIR, "OpenBG500_train.tsv")
DEV   = os.path.join(DATA_DIR, "OpenBG500_dev.tsv")
# The competition file OpenBG500_test.tsv has no tail-entity column, so it
# cannot be used for evaluation; the answer file is used as the test set instead.
TEST  = os.path.join(DATA_DIR, "OpenBG500_answer.tsv")

# Test subsets split by relation type.
TEST_SYM  = os.path.join(DATA_DIR, "OpenBG500_answer_symmetric.tsv")
TEST_ASYM = os.path.join(DATA_DIR, "OpenBG500_answer_asymmetric.tsv")
TEST_1M   = os.path.join(DATA_DIR, "OpenBG500_answer_one-to-many.tsv")
TEST_M1   = os.path.join(DATA_DIR, "OpenBG500_answer_many-to-one.tsv")

EMB_DIM = 200     # entity embedding width (real + imaginary parts interleaved)
MARGIN  = 6.0     # margin used both in the score offset and in the hinge loss
LR      = 1e-3    # Adam learning rate
BATCH   = 1024    # positive triples per training batch
NEG_K   = 25      # negatives sampled per positive triple
EPOCHS  = 30
EVAL_BLOCK = 20000   # candidate entities scored per chunk during evaluation, to avoid GPU OOM

# Seed Python's and PyTorch's RNGs (CPU and all CUDA devices).
random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

In [36]:
# === 1. 数据读取与编码 ===
def read_triples(path: str):
    """Load (head, relation, tail) string triples from a tab-separated file.

    Each line is stripped of its trailing newline and split on tabs. Lines with
    fewer than three fields are skipped; fields beyond the third are ignored.

    Args:
        path: Path to a UTF-8 encoded TSV file.

    Returns:
        A list of 3-tuples of strings, in file order.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        rows = (raw.rstrip('\n').split('\t') for raw in handle)
        return [(cols[0], cols[1], cols[2]) for cols in rows if len(cols) >= 3]

# Main splits.
train_triples = read_triples(TRAIN)
dev_triples   = read_triples(DEV)
test_triples  = read_triples(TEST)

# Test subsets for the four relation types.
sym_triples      = read_triples(TEST_SYM)
asym_triples     = read_triples(TEST_ASYM)
one2many_triples = read_triples(TEST_1M)
many2one_triples = read_triples(TEST_M1)

# Vocabulary is collected over train + dev + test so every split can be encoded.
_all_triples = train_triples + dev_triples + test_triples
ents = {entity for head, _, tail in _all_triples for entity in (head, tail)}
rels = {rel for _, rel, _ in _all_triples}

# Sorting before enumeration makes the id assignment deterministic across runs.
ent2id = {name: idx for idx, name in enumerate(sorted(ents))}
rel2id = {name: idx for idx, name in enumerate(sorted(rels))}

# def encode(triples):
#     return torch.tensor([[ent2id[h], rel2id[r], ent2id[t]] for h,r,t in triples], dtype=torch.long)

def encode(triples):
    """Map string triples to an int64 tensor of (head_id, rel_id, tail_id) rows.

    Ids are looked up in the module-level ``ent2id`` / ``rel2id`` dictionaries;
    an unknown entity or relation raises ``KeyError``.

    Args:
        triples: Sequence of (head, relation, tail) string tuples.

    Returns:
        A ``torch.long`` tensor with one row per triple and three columns.
        An empty input yields a (0, 3) tensor so that the result can still be
        stacked with other encoded splits.
    """
    import numpy as np

    if len(triples) == 0:
        return torch.empty((0, 3), dtype=torch.long)

    id_rows = [
        [ent2id[head], rel2id[rel], ent2id[tail]]
        for head, rel, tail in triples
    ]
    return torch.from_numpy(np.array(id_rows, dtype='int64'))

# Encode the three main splits as integer id tensors, one (head, rel, tail) row per triple.
train_ids = encode(train_triples)
dev_ids   = encode(dev_triples)
test_ids  = encode(test_triples)

# Encode the per-relation-type test subsets the same way.
sym_ids      = encode(sym_triples)
asym_ids     = encode(asym_triples)
one2many_ids = encode(one2many_triples)
many2one_ids = encode(many2one_triples)

# Report how many triples each relation-type subset contains.
print("Sym:", sym_ids.size(0), 
      "Asym:", asym_ids.size(0),
      "1-to-M:", one2many_ids.size(0),
      "M-to-1:", many2one_ids.size(0))

# Vocabulary sizes; used later to size the embedding tables and to sample negatives.
n_ent, n_rel = len(ent2id), len(rel2id)
print("Entities:", n_ent, "Relations:", n_rel, "Train:", len(train_ids), "Dev:", len(dev_ids), "Test:", len(test_ids))

FileNotFoundError: [Errno 2] No such file or directory: 'D:\\workspace\\...\\小论文-知识图谱\\data\\OpenBG500\\OpenBG500_train.tsv'

In [37]:
# === 2. Filter 字典（Filtered 评测用） ===
def build_filter_dict(all_ids: torch.Tensor):
    """Index all known triples for filtered ranking.

    Args:
        all_ids: Integer tensor whose rows are (head, relation, tail) ids.

    Returns:
        A pair ``(hr2t, rt2h)`` of ``defaultdict(set)``:
        ``hr2t[(h, r)]`` is the set of tails seen with that head and relation,
        ``rt2h[(r, t)]`` is the set of heads seen with that relation and tail.
    """
    tails_by_hr = collections.defaultdict(set)
    heads_by_rt = collections.defaultdict(set)
    for row in all_ids.tolist():
        head, rel, tail = row
        tails_by_hr[(head, rel)].add(tail)
        heads_by_rt[(rel, tail)].add(head)
    return tails_by_hr, heads_by_rt

# Filter index over train + dev + test: during filtered evaluation every triple
# known from any split is excluded from the competing candidates.
hr2t, rt2h = build_filter_dict(torch.vstack([train_ids, dev_ids, test_ids]))

In [38]:
# === 3. 数据集 & 负采样 ===
class KGDataset(Dataset):
    """Map-style dataset exposing the rows of an encoded triple tensor."""

    def __init__(self, triples_tensor: torch.Tensor):
        # Kept by reference; rows are returned as slices of this tensor.
        self.data = triples_tensor

    def __len__(self):
        """Number of triples (rows) in the dataset."""
        return self.data.size(0)

    def __getitem__(self, idx):
        """Return the triple row at position ``idx``."""
        return self.data[idx]

def negative_sample(batch_pos, n_ent, num_negs=NEG_K, device=DEVICE):
    """Create corrupted copies of each positive triple by uniform entity replacement.

    For every positive triple a coin flip (probability 0.5) decides whether its
    head or its tail is corrupted; all ``num_negs`` negatives of that triple
    corrupt the same side. Replacement entities are drawn uniformly from
    ``[0, n_ent)`` and are not checked against known true triples.

    Args:
        batch_pos: Integer tensor of positive triples, one (h, r, t) row each.
        n_ent: Number of entities; exclusive upper bound for sampled ids.
        num_negs: Negatives generated per positive.
        device: Device on which the random draws are created.

    Returns:
        Tensor with ``batch_pos.size(0) * num_negs`` rows and 3 columns, where
        the negatives of each positive are stored contiguously.
    """
    batch_size = batch_pos.size(0)
    corrupt_head = torch.rand(batch_size, device=device) < 0.5
    replacements = torch.randint(0, n_ent, (batch_size, num_negs), device=device)

    # repeat() already materialises a fresh tensor, so it can be edited in place.
    negatives = batch_pos.unsqueeze(1).repeat(1, num_negs, 1)
    head_rows = corrupt_head.unsqueeze(1)
    negatives[:, :, 0] = torch.where(head_rows, replacements, negatives[:, :, 0])
    negatives[:, :, 2] = torch.where(head_rows, negatives[:, :, 2], replacements)
    return negatives.view(-1, 3)

In [39]:
# === 4. RotatE 模型 ===
class RotatE(nn.Module):
    """RotatE-style scorer: relations rotate head embeddings in the complex plane.

    Entity embeddings have ``dim`` real values, read as ``dim // 2`` complex
    numbers stored as interleaved (real, imaginary) pairs. Each relation holds
    ``dim // 2`` phase angles in radians.

    Args:
        n_ent: Number of entities.
        n_rel: Number of relations.
        dim: Entity embedding width; must be even.
        margin: Constant from which the distance is subtracted to form the score.
    """

    def __init__(self, n_ent, n_rel, dim=200, margin=6.0):
        super().__init__()
        assert dim % 2 == 0, "RotatE需要偶数维度（实部+虚部）"
        self.margin = margin
        self.dim = dim // 2  # number of complex components

        self.ent = nn.Embedding(n_ent, dim)
        self.rel_phase = nn.Embedding(n_rel, self.dim)  # phases in radians

        nn.init.uniform_(self.ent.weight, -0.1, 0.1)
        nn.init.uniform_(self.rel_phase.weight, -math.pi, math.pi)

    def score(self, h, r, t):
        """Score triples; larger means more plausible.

        Args:
            h, r, t: Index tensors of equal length with head, relation and
                tail ids.

        Returns:
            1-D tensor ``margin - ||h ∘ r - t||`` where the L2 norm is taken
            over all real and imaginary components jointly.
        """
        head_re, head_im = self.ent(h).view(-1, self.dim, 2).unbind(dim=-1)  # each (B, D)
        tail = self.ent(t).view(-1, self.dim, 2)                             # (B, D, 2)
        angle = self.rel_phase(r)                                            # (B, D)
        rot_re, rot_im = torch.cos(angle), torch.sin(angle)

        # Complex product (head_re + i*head_im) * (rot_re + i*rot_im).
        rotated = torch.stack(
            [
                head_re * rot_re - head_im * rot_im,
                head_re * rot_im + head_im * rot_re,
            ],
            dim=-1,
        )

        distance = torch.norm(rotated - tail, p=2, dim=(1, 2))  # (B,)
        return self.margin - distance

In [40]:
# === 5. 训练 & Loss ===
def train_one_epoch(model, loader, optimizer):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    For each batch of positive triples, ``NEG_K`` negatives per positive are
    sampled and the loss ``relu(margin + mean(neg_score) - pos_score)`` is
    minimised, where the negative scores of each positive are averaged before
    the hinge is applied.

    Args:
        model: Scorer exposing ``score(h, r, t)`` and a ``margin`` attribute.
        loader: Iterable yielding integer tensors of (h, r, t) rows.
        optimizer: Optimizer over ``model``'s parameters.

    Returns:
        Mean loss over the batches processed, or NaN if the loader yielded no
        batches.
    """
    model.train()
    running_loss = 0.0
    n_batches = 0  # counted explicitly so loaders without __len__ also work
    for pos in tqdm(loader, desc="train"):
        pos = pos.to(DEVICE)
        neg = negative_sample(pos, n_ent, NEG_K, DEVICE)

        pos_s = model.score(pos[:, 0], pos[:, 1], pos[:, 2])
        # Negatives of one positive are contiguous: regroup as (B, NEG_K) and average.
        neg_s = model.score(neg[:, 0], neg[:, 1], neg[:, 2]).view(pos.size(0), -1).mean(-1)

        loss = torch.relu(model.margin + neg_s - pos_s).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        n_batches += 1

    if n_batches == 0:
        # Nothing was trained on; avoid dividing by zero.
        return float("nan")
    return running_loss / n_batches

# Build the model on the selected device and optimise all of its parameters with Adam.
model = RotatE(n_ent, n_rel, EMB_DIM, MARGIN).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

In [41]:
# === 6. 评测 (Filtered MRR/Hits@K) ===
@torch.no_grad()
def evaluate_filtered(model, triples, hr2t, rt2h, k_list=(1,3,10), block=EVAL_BLOCK, device=DEVICE, batch_size=64):
    """Filtered tail-prediction MRR and Hits@K.

    For each test triple (h, r, t) every entity is scored as a candidate tail.
    Candidates that form another known true triple (per ``hr2t``) are excluded,
    and the rank of the gold tail is one plus the number of remaining
    candidates, over the WHOLE entity set, that score strictly higher.
    Only tail prediction is evaluated; head prediction is not implemented.

    Args:
        model: Model exposing ``ent`` and ``rel_phase`` embeddings and ``dim``
            (number of complex components).
        triples: Integer tensor of (h, r, t) rows to evaluate.
        hr2t: Mapping ``(h, r) -> set of known tails`` used for filtering.
        rt2h: Mapping ``(r, t) -> set of known heads``. Accepted for interface
            compatibility; unused because heads are not ranked.
        k_list: Cut-offs for Hits@K.
        block: Number of candidate entities scored at once, to bound memory.
        device: Device on which scoring is done.
        batch_size: Number of test triples ranked together.

    Returns:
        ``(mrr, hits)`` where ``hits`` maps each k in ``k_list`` to Hits@k.
        If ``triples`` is empty, all values are NaN.
    """
    model.eval()
    ent_emb = model.ent.weight.to(device)
    n_cand = ent_emb.size(0)

    if len(triples) == 0:
        nan = float('nan')
        return nan, {k: nan for k in k_list}

    def rank_tail(batch):
        """Return the filtered rank of the gold tail for every row of ``batch``."""
        h, r, t = batch[:, 0], batch[:, 1], batch[:, 2]
        B = h.size(0)

        # Rotate heads by the relation phases: h ∘ r, shape (B, D, 2).
        h_e = model.ent(h).view(-1, model.dim, 2)
        phase = model.rel_phase(r)
        cos_p, sin_p = torch.cos(phase), torch.sin(phase)
        hr_rot = torch.stack([
            h_e[:, :, 0] * cos_p - h_e[:, :, 1] * sin_p,
            h_e[:, :, 0] * sin_p + h_e[:, :, 1] * cos_p,
        ], dim=-1)

        # Score of the gold tail, computed once so it can be compared against
        # candidates from every block.
        gold_emb = ent_emb[t].view(-1, model.dim, 2)
        gold = -torch.norm(hr_rot - gold_emb, dim=(1, 2))  # (B,)

        # Convert ids to Python ints once instead of once per block.
        h_list, r_list, t_list = h.tolist(), r.tolist(), t.tolist()
        known = [hr2t.get((h_list[i], r_list[i]), ()) for i in range(B)]

        better = torch.zeros(B, dtype=torch.long, device=device)
        for start in range(0, n_cand, block):
            end = min(start + block, n_cand)
            cand = ent_emb[start:end].view(-1, model.dim, 2)
            scores = -torch.norm(hr_rot.unsqueeze(1) - cand.unsqueeze(0), dim=(2, 3))  # (B, end-start)

            # Exclude known true tails and the gold tail itself from the competition.
            rows, cols = [], []
            for i in range(B):
                for tt in known[i]:
                    if start <= tt < end:
                        rows.append(i)
                        cols.append(tt - start)
                if start <= t_list[i] < end:
                    rows.append(i)
                    cols.append(t_list[i] - start)
            if rows:
                row_idx = torch.tensor(rows, dtype=torch.long, device=device)
                col_idx = torch.tensor(cols, dtype=torch.long, device=device)
                scores[row_idx, col_idx] = float('-inf')

            # Accumulate over ALL blocks, not only the one holding the gold tail.
            better += (scores > gold.unsqueeze(1)).sum(1)
        return better.float() + 1

    ranks_all = []
    for i in tqdm(range(0, len(triples), batch_size), desc="eval"):
        batch = triples[i:i + batch_size].to(device)
        ranks_all.append(rank_tail(batch))
    ranks = torch.cat(ranks_all)
    mrr = (1.0 / ranks).mean().item()
    hits = {k: (ranks <= k).float().mean().item() for k in k_list}
    return mrr, hits

In [42]:
# === 7. 训练循环 ===
# Mini-batch loader over the encoded training triples, reshuffled every epoch.
train_loader = DataLoader(KGDataset(train_ids), batch_size=BATCH, shuffle=True)

# Each epoch: one optimisation pass over the training set, then a filtered
# evaluation on the dev split, with loss and dev metrics printed on one line.
for epoch in range(1,EPOCHS+1):
    loss = train_one_epoch(model, train_loader, optimizer)
    dev_mrr, dev_hits = evaluate_filtered(model, dev_ids, hr2t, rt2h)
    print(f"[Epoch {epoch:02d}] loss={loss:.4f}  dev MRR={dev_mrr:.4f}  Hits@1={dev_hits[1]:.4f}  Hits@3={dev_hits[3]:.4f}  Hits@10={dev_hits[10]:.4f}")

train:   9%|██████▊                                                                 | 115/1219 [00:05<00:53, 20.59it/s]


KeyboardInterrupt: 

In [None]:
# === 8. 测试集评测 ===
# Filtered evaluation of the trained model on the full test set.
test_mrr, test_hits = evaluate_filtered(model, test_ids, hr2t, rt2h)
print(f"[TEST] MRR={test_mrr:.4f}  Hits@1={test_hits[1]:.4f}  Hits@3={test_hits[3]:.4f}  Hits@10={test_hits[10]:.4f}")

In [None]:
# === 9. 按关系类型的测试集评测 ===
def eval_and_print(tag, triple_ids):
    """Evaluate the global model on one test subset and print its metrics.

    Args:
        tag: Label inserted into the printed line.
        triple_ids: Integer tensor of (h, r, t) rows; if it has no rows, a
            skip notice is printed and nothing is evaluated.
    """
    if triple_ids.size(0) > 0:
        mrr, hits = evaluate_filtered(model, triple_ids, hr2t, rt2h)
        print(f"[TEST-{tag}] MRR={mrr:.4f}  Hits@1={hits[1]:.4f}  Hits@3={hits[3]:.4f}  Hits@10={hits[10]:.4f}")
    else:
        print(f"[TEST-{tag}] no triples, skip.")

# Report metrics separately for each relation-type subset, in a fixed order.
for _tag, _subset_ids in (
    ("SYMMETRIC",   sym_ids),
    ("ASYMMETRIC",  asym_ids),
    ("ONE-TO-MANY", one2many_ids),
    ("MANY-TO-ONE", many2one_ids),
):
    eval_and_print(_tag, _subset_ids)