In [1]:
"""
1. 定义DBPDataset类表示数据集,重写__getitem__方法返回三元组。
2. 定义KGEmb模型表示知识图谱Embedding,包含实体与关系Embedding层。
3. 构建DataLoader加载数据集,定义优化器与TransE损失函数进行训练。
4. 训练结束后,ent_emb与rel_emb分别为实体与关系Embedding结果。
5. 将模型参数保存为model.pkl文件,用于重新加载模型。
"""
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from scipy.spatial.distance import cosine


# 加载数据集
class DBPDataset(Dataset):
    def __init__(self, data_path):
        triples = []
        with open(data_path) as f:
            for line in f:
                head, rel, tail = line.strip().split()
                triples.append((int(head), int(rel), int(tail)))
        self.triples = triples

    def __len__(self):
        return len(self.triples)

    def __getitem__(self, idx):
        head, rel, tail = self.triples[idx]
        # 将输入的头实体head,关系rel和尾实体tail构建成pytorch的LongTensor类型。
        return torch.LongTensor([head, rel, tail])


# Embedding模型
class KGEmb(nn.Module):
    def __init__(self, n_ent, n_rel, dim=100):
        """
        :param n_ent: 实体的总数,即数据集中不重复的实体数量。
        :param n_rel: 关系的总数,即数据集中不重复的关系数量。
        :param dim: Embedding的维度,即每个实体/关系的向量表示长度。
        """
        super().__init__()
        self.ent_emb = nn.Embedding(n_ent, dim)
        self.rel_emb = nn.Embedding(n_rel, dim)

    def forward(self, x):
        """
        :param x:
        :return:头实体Embedding、关系Embedding和尾实体Embedding。
        """
        head, rel, tail = x[:, 0], x[:, 1], x[:, 2]
        head_emb = self.ent_emb(head)
        rel_emb = self.rel_emb(rel)
        tail_emb = self.ent_emb(tail)
        return head_emb, rel_emb, tail_emb


# 训练代码
dataset = DBPDataset('./data/index_rel_triple.txt')
# 设置固定的随机种子,使每次初始化产生的随机参数相同
torch.manual_seed(123)
model = KGEmb(n_ent=774, n_rel=324)

"""
1. Adam优化器更新Embedding使loss下降
2. DataLoader生成batch数据输入模型
3. 打乱数据顺序使模型泛化能力提高
"""
# 使用Adam优化器,对模型中的所有可训练参数进行更新，通过model.parameters()获得模型的所有可训练参数(Embedding)
opt = torch.optim.Adam(model.parameters())
# 使用DataLoader加载dataset数据集，每批数据大小为8192条三元组，shuffle=True:每个epoch打乱数据顺序
loader = DataLoader(dataset, batch_size=200, shuffle=True)

for epoch in range(200):
    """
    opt.zero_grad() 的作用是:清空优化器opt中保存的梯度值。
    每一次的 loss.backward() 都会累加当前的梯度到已经存储的梯度值。如果不清空梯度,梯度就会不断累加,最终的梯度值错误。
    所以,在每次迭代(每个batch)开始之前,需要调用 opt.zero_grad() 清空优化器中的梯度值,确保本次迭代计算的梯度值正确。
    """
    for step, x in enumerate(loader):
        opt.zero_grad()
        h, r, t = model(x)
        loss = torch.sum(h + r - t)  # TransE损失函数
        loss.backward()  # 反向传播,计算梯度值
        opt.step()  # 优化器更新参数

In [2]:
ent_emb = model.ent_emb.weight.detach().numpy()

In [24]:
ent_emb
for ent in ent_emb:
    print(ent)
#     ent_emb[ent]

[-1.05974190e-01 -6.21121407e-01 -7.46871591e-01 -1.03135586e+00
 -9.47391465e-02  2.16996536e-01 -6.62981570e-01 -8.22513700e-01
  3.23762685e-01 -1.63584685e+00  2.55007565e-01 -1.85306776e+00
 -2.63965100e-01  1.45180321e+00  5.21020927e-02 -1.74146131e-01
 -5.20364463e-01 -1.46381676e+00 -6.12300456e-01  4.74420428e-01
  1.13762474e+00  8.57696295e-01  8.31957936e-01 -6.44297183e-01
  5.31266779e-02 -2.01564908e+00  5.23227334e-01 -1.59148884e+00
 -1.60221267e+00 -1.17873386e-01 -1.07485032e+00 -3.28333926e+00
 -1.76830220e+00 -2.64916271e-01 -2.57709837e+00  6.09013498e-01
 -8.31826806e-01 -1.37769055e+00 -9.42488372e-01 -1.52999747e+00
  4.37200189e-01  1.11081374e+00  1.83271572e-01 -6.18831396e-01
 -3.45060706e-01 -5.36851823e-01 -1.77129969e-01 -1.02837956e+00
  4.33492810e-01  1.17879915e+00 -1.92128682e+00  6.89763367e-01
 -1.66363513e+00  8.70509982e-01  6.09955370e-01 -3.04539263e-01
  1.80398512e+00 -1.24698138e+00 -7.24186242e-01  3.26331913e-01
 -1.10290468e+00 -1.24127

[ 0.13590395  0.689292    0.476554   -0.6367603  -1.2213709   3.0833626
 -1.3395752  -0.8046125  -0.5537649  -0.58813137 -1.1088885  -1.5278612
 -2.128016   -1.0404128  -1.2761856  -0.68877935 -1.0912309   1.4479946
  0.48883766  2.0542316   0.48051122 -1.5308944   1.1250122   1.4521232
 -1.979987    0.07095377 -0.61569846 -0.66406447 -0.6576771  -3.2412758
 -1.0163419  -1.2599471  -2.3041677   0.38163105  0.07504177 -0.56455404
 -1.3868477  -2.370925   -3.1101775  -1.316927   -0.4537183   0.83372504
 -0.5784227  -0.59627056  0.05614157  0.34841278 -0.34420663 -2.4982433
  0.604641   -0.7791871  -0.5308768  -2.0917916  -0.61733633 -0.7022426
 -1.3762497   0.05955115  0.570271    0.16235587 -1.1328977   0.0434087
 -1.8079411  -0.46241552  0.06087026 -0.07352288  0.26691884 -0.31935972
  0.48929253 -1.0659366   0.05688144 -1.1247865  -0.1785689  -0.32451713
 -0.14000353 -0.32131824 -1.0208924  -0.12686515 -0.5101129   0.5337097
 -1.0811135   0.6156215   0.88413817 -2.0640461   0.17715308

In [4]:
with open('model.pkl', 'rb') as f:  
    data = pickle.load(f) 

NameError: name 'pickle' is not defined

In [5]:
import pickle

In [6]:
with open('model.pkl', 'rb') as f:  
    data = pickle.load(f) 

UnpicklingError: A load persistent id instruction was encountered,
but no persistent_load function was specified.

In [15]:
import pandas as pd  
import pickle  
  
# 读取 .pkl 文件  
with open('data/ent_attr_kg_transe.pkl', 'rb') as f:  
    data = pickle.load(f)  
  
# 将数据转换为 DataFrame  
df = pd.DataFrame(data) 
df.to_csv('data/ent_attr_kg_transe.csv', index=False)


In [19]:
    with open("data/ent_attr_kg_transe.pkl", 'rb') as f:
        kg= pickle.load(f)

    entity_kg_attr = torch.zeros((200,512))

#     for ent in kg:
# #         idx = int(ent)
# #         entity_kg_attr[idx] = torch.from_numpy(kg[ent])
#         print(kg[ent])

In [20]:
kg

array([[-0.10597415, -0.6211214 , -0.7468716 , ..., -1.7626548 ,
        -2.5348442 ,  0.5195076 ],
       [ 0.31435913, -0.1327353 ,  1.1130285 , ..., -0.27099785,
        -1.9246165 , -1.0356508 ],
       [ 0.8266607 , -0.7825705 , -0.7506822 , ..., -0.32326338,
        -2.1206853 , -1.0943722 ],
       ...,
       [ 0.55854505, -0.33839044, -1.7997851 , ..., -0.54086435,
         0.60215944,  1.0060074 ],
       [ 2.2439103 ,  2.1141937 ,  1.2392797 , ..., -0.85138524,
        -1.2934463 ,  1.1191169 ],
       [ 0.13085377, -0.32537973,  0.25538445, ...,  1.1325076 ,
         1.1952808 ,  1.2188861 ]], dtype=float32)

In [21]:
for  ent in kg:
    print(kg[ent])

IndexError: arrays used as indices must be of integer (or boolean) type

In [4]:
#加载模型
import torch
from TransE_embedding import KGEmb
model = KGEmb(n_ent=5360, n_rel=1809)
state_dict = torch.load('./data/model.pkl')  
model.load_state_dict(state_dict)  
model.eval()
with torch.no_grad():  
    print(model.rel_emb.weight.detach().numpy())


[[-0.31457832 -0.42272058 -0.6011338  ... -0.8709327  -0.9283732
  -0.03447386]
 [-1.7758917   1.0947118   0.38965416 ... -2.2160406  -2.2939932
  -0.80003923]
 [-2.6138384   0.5095369  -1.8878999  ...  0.22489564 -1.9779019
  -0.29523307]
 ...
 [ 0.77972597 -1.3559698  -0.8013467  ...  0.08178499 -2.1459723
   0.83720833]
 [ 0.61239564 -1.2397565  -0.10299543 ... -1.6117859  -0.03315364
  -0.42096043]
 [-0.13992463 -2.1796544  -2.6487985  ... -0.16128103 -0.24421178
  -0.54363894]]


In [6]:
import numpy as np
entity_attribute = np.array("俄乌战争", dtype=np.float64)

ValueError: could not convert string to float: '俄乌战争'

In [8]:
import numpy as np  
  
# 假设这是从文件中读取的一行数据，其中包含了文本属性  
line = "1\t俄乌战争\t3.14\t159"  
  
# 分割行数据  
parts = line.strip('\n').split('\t')  
entity_id = parts[0]  # 第一个元素是实体ID  
  
# 处理属性部分，区分文本属性和数字属性  
entity_attributes = []  
for attr in parts[1:]:  # 从第二个元素开始遍历  
    try:  
        # 尝试将属性转换为浮点数  
        entity_attributes.append(float(attr))  
    except ValueError:  
        # 如果转换失败（即属性是文本），则保留为字符串  
        entity_attributes.append(attr)  
  
# 实体属性现在是一个混合类型列表，包含浮点数和字符串  
# 注意：这样的列表不能直接转换为 NumPy 数组，因为 NumPy 数组需要所有元素都是相同类型的。  
# 如果需要进一步的数值处理，您可能需要只选择那些可以转换为浮点数的属性。

In [9]:
entity_attributes

['俄乌战争', 3.14, 159.0]