In [9]:
import pandas as pd
import torch
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
from torch_geometric.nn import GATConv
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import LabelEncoder
import faiss
from transformers import AutoTokenizer
from transformers import AutoModel
# pip install faiss-cpu

In [3]:
import torch

print(torch.__version__)

print("CUDA available:", torch.cuda.is_available())
print("CUDA version:", torch.version.cuda)

from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
print("PyTorch Geometric is ready!")

2.5.1+cu124
CUDA available: True
CUDA version: 12.4
PyTorch Geometric is ready!


In [4]:
import pandas as pd

# 创建实体数据
entities = [
    {"entity_name": "Apple", "type": "Company", "description": "A technology company known for iPhones and MacBooks."},
    {"entity_name": "Steve Jobs", "type": "Person", "description": "The co-founder of Apple."},
    {"entity_name": "iPhone", "type": "Product", "description": "A smartphone product line developed by Apple."},
    {"entity_name": "Tim Cook", "type": "Person", "description": "The CEO of Apple after Steve Jobs."},
    {"entity_name": "MacBook", "type": "Product", "description": "A line of laptop computers developed by Apple."},
]

# 创建关系数据
relations = [
    {"source": "Steve Jobs", "target": "Apple", "description": "Steve Jobs co-founded Apple in 1976 and helped it become a global brand"},
    {"source": "Apple", "target": "iPhone", "description": "Apple released the first iPhone in 2007, revolutionizing the smartphone market"},
]

# 保存为 CSV 文件
entities_df = pd.DataFrame(entities)
relations_df = pd.DataFrame(relations)

entities_df.to_csv("entities.csv", index=False)
relations_df.to_csv("relations.csv", index=False)

print("✅ 成功创建 entities.csv 和 relations.csv")


✅ 成功创建 entities.csv 和 relations.csv


In [11]:

# ========= Step 1: Load CSV ========= #
entities_df = pd.read_csv('entities.csv')    # contains: entity_name, type, description
relations_df = pd.read_csv('relations.csv')  # contains: source, target, description

# Encode node names to integer IDs
node_encoder = LabelEncoder()
entities_df['node_id'] = node_encoder.fit_transform(entities_df['entity_name'])
node_name_to_id = dict(zip(entities_df['entity_name'], entities_df['node_id']))

# ========= Step 2: Build edge_index ========= #
edges = []
for _, row in relations_df.iterrows():
    src = node_name_to_id[row['source']]
    tgt = node_name_to_id[row['target']]
    edges.append([src, tgt])
edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()  # shape: [2, num_edges]

# ========= Step 3: Generate node features using Sentence-BERT ========= #
model_id = "nomic-ai/nomic-embed-text-v1.5"
tokenizer = AutoTokenizer.from_pretrained(model_id,trust_remote_code=True)
model = AutoModel.from_pretrained(model_id,trust_remote_code=True)

model = SentenceTransformer('all-MiniLM-L6-v2')
node_descs = entities_df['description'].fillna("").tolist()
node_features = torch.tensor(model.encode(node_descs), dtype=torch.float)  # shape: [num_nodes, emb_dim]


edge_descs = relations_df['description'].fillna("").tolist()
# edge_features = model.encode(edge_descs, convert_to_tensor=True)  # shape: [num_edges, emb_dim] 
# set features on the device by default
edge_features = torch.tensor(model.encode(edge_descs, dtype=torch.float))  # shape: [num_edges, emb_dim] 


configuration_hf_nomic_bert.py:   0%|          | 0.00/1.96k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
A new version of the following files was downloaded from https://huggingface.co/nomic-ai/nomic-bert-2048:
- configuration_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_hf_nomic_bert.py:   0%|          | 0.00/104k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nomic-ai/nomic-bert-2048:
- modeling_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/547M [00:00<?, ?B/s]

<All keys matched successfully>


In [14]:
model.encode(node_descs)

array([[-0.08528187,  0.01041333,  0.00754764, ..., -0.09328268,
         0.11823732,  0.0499921 ],
       [-0.00147327,  0.02058624,  0.00411942, ..., -0.08852488,
         0.08957277,  0.03908511],
       [-0.11820599,  0.04054209,  0.03426793, ...,  0.00278324,
         0.12413181,  0.05148866],
       [-0.0102783 ,  0.01969168,  0.04184995, ..., -0.04358399,
         0.08945554,  0.05478818],
       [-0.03482116,  0.02066975,  0.00669234, ..., -0.00924592,
         0.09446841,  0.0352484 ]], dtype=float32)

In [16]:
# ========= Step 4: Build PyG Data ========= #
data = Data(
    x=node_features,  # [num_nodes, node_dim]
    edge_index=edge_index,  # [2, num_edges]
    edge_attr=edge_features  # [num_edges, emb_dim]
)


In [17]:
# ========= Step 5: Define GraphSAGE ========= #
class GraphSAGE(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return x

model_gnn = GraphSAGE(in_channels=node_features.size(1), hidden_channels=128, out_channels=64)
optimizer = torch.optim.Adam(model_gnn.parameters(), lr=0.01)

class EdgeAwareGNN(torch.nn.Module):
    def __init__(self, in_channels, edge_dim, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GATConv(in_channels, hidden_channels, edge_dim=edge_dim)
        self.conv2 = GATConv(hidden_channels, out_channels, edge_dim=edge_dim)

    def forward(self, x, edge_index, edge_attr):
        x = self.conv1(x, edge_index, edge_attr).relu()
        x = self.conv2(x, edge_index, edge_attr)
        return x
    
model_gnn = EdgeAwareGNN(in_channels=node_features.size(1), edge_dim=edge_features.size(1), hidden_channels=128, out_channels=node_features.size(1))
optimizer = torch.optim.Adam(model_gnn.parameters(), lr=0.01)

In [18]:
# ========= Step 6: Train GNN (unsupervised, simple) ========= #
model_gnn.train()
for epoch in range(20):
    optimizer.zero_grad()
    # out = model_gnn(data.x, data.edge_index)
    out = model_gnn(data.x, data.edge_index, data.edge_attr)
    loss = torch.mean(out.norm(dim=1))  # Dummy regularization loss to "move" weights
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

# ========= Step 7: Extract final embeddings ========= #
model_gnn.eval()
with torch.no_grad():
    embeddings = model_gnn(data.x, data.edge_index, data.edge_attr)
    embeddings_np = embeddings.numpy()


Epoch 1, Loss: 0.5632
Epoch 2, Loss: 0.5555
Epoch 3, Loss: 0.3002
Epoch 4, Loss: 0.2061
Epoch 5, Loss: 0.1568
Epoch 6, Loss: 0.1132
Epoch 7, Loss: 0.0897
Epoch 8, Loss: 0.0892
Epoch 9, Loss: 0.0952
Epoch 10, Loss: 0.0961
Epoch 11, Loss: 0.0899
Epoch 12, Loss: 0.0782
Epoch 13, Loss: 0.0653
Epoch 14, Loss: 0.0564
Epoch 15, Loss: 0.0542
Epoch 16, Loss: 0.0552
Epoch 17, Loss: 0.0547
Epoch 18, Loss: 0.0505
Epoch 19, Loss: 0.0443
Epoch 20, Loss: 0.0396


In [19]:
print("data.x device:", data.x.device)
print("data.edge_index device:", data.edge_index.device)
print("data.edge_attr device:", data.edge_attr.device)


data.x device: cpu
data.edge_index device: cpu
data.edge_attr device: cpu


In [20]:
print(data.edge_index.shape)  # (2, num_edges)
# print(data.edge_index)
print(data.edge_attr.shape)  # (num_edges, edge_feature_dim)
# print(data.edge_attr)

torch.Size([2, 2])
torch.Size([2, 384])


In [21]:
# ========= Step 8: Build FAISS index for retrieval ========= #
faiss_index = faiss.IndexFlatL2(embeddings_np.shape[1])
faiss_index.add(embeddings_np)

# ========= Step 9: Test search ========= #
def search(query, top_k=3):
    query_vec = model.encode([query])
    query_vec = torch.tensor(query_vec, dtype=torch.float)
    print("dim of search vec:", query_vec.shape)  # 打印查询向量的形状
    print("dim of FAISS index:", faiss_index.d)  # 打印 FAISS 索引的维度

    D, I = faiss_index.search(query_vec.numpy(), top_k)
    results = [entities_df.iloc[i]['entity_name'] for i in I[0]]
    return results

# ========= Example ========= #
print("🔍 search: 'who is the founder of apple company ?'")
print(search("who is the founder of apple company ?"))

🔍 search: 'who is the founder of apple company ?'
dim of search vec: torch.Size([1, 384])
dim of FAISS index: 384
['Apple', 'Steve Jobs', 'iPhone']


In [3]:
import json
import pandas as pd

# 加载数据
with open("entities.json") as f:
    entities = json.load(f)
with open("relations.json") as f:
    relations = json.load(f)

# 转为 DataFrame
entity_df = pd.DataFrame(entities, columns=["name", "type", "desc"])
edge_df = pd.DataFrame(relations, columns=["source", "target", "desc"])

# 清理边：只保留source和target都在entity_df中name列里的
valid_names = set(entity_df["name"])
clean_edge_df = edge_df[edge_df["source"].isin(valid_names) & edge_df["target"].isin(valid_names)].reset_index(drop=True)


FileNotFoundError: [Errno 2] No such file or directory: 'entities.json'

In [2]:
# 找出edge_df中所有source和target
edge_nodes = set(edge_df["source"]) | set(edge_df["target"])
known_nodes = set(entity_df["name"])

# 缺失的实体名
missing_nodes = edge_nodes - known_nodes

# 补充这些实体，类型和描述可以用占位符
missing_entity_rows = [{"name": name, "type": "Unknown", "desc": "Auto-added"} for name in missing_nodes]
extended_entity_df = pd.concat([entity_df, pd.DataFrame(missing_entity_rows)], ignore_index=True)


NameError: name 'edge_df' is not defined