In [21]:
import pandas as pd
import torch
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
from torch_geometric.nn import GATConv
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import LabelEncoder
import faiss

ModuleNotFoundError: No module named 'faiss'

In [5]:
import torch

# Report what the installed PyTorch build can see before doing any real work.
cuda_ok = torch.cuda.is_available()
cuda_build = torch.version.cuda
print("CUDA available:", cuda_ok)
print("CUDA version:", cuda_build)

# The imports themselves are the readiness check: the final message is only
# reached if both PyTorch Geometric imports succeed.
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
print("PyTorch Geometric is ready!")

CUDA available: True
CUDA version: 12.4
PyTorch Geometric is ready!


In [12]:
import pandas as pd

# Entity records, one tuple per node, in the column order given below.
ENTITY_COLUMNS = ("entity_name", "type", "description")
entity_rows = [
    ("Apple", "Company", "A technology company known for iPhones and MacBooks."),
    ("Steve Jobs", "Person", "The co-founder of Apple."),
    ("iPhone", "Product", "A smartphone product line developed by Apple."),
    ("Tim Cook", "Person", "The CEO of Apple after Steve Jobs."),
    ("MacBook", "Product", "A line of laptop computers developed by Apple."),
]
entities = [dict(zip(ENTITY_COLUMNS, row)) for row in entity_rows]

# Relation records, one tuple per directed edge (source -> target).
RELATION_COLUMNS = ("source", "target", "description")
relation_rows = [
    ("Steve Jobs", "Apple", "Steve Jobs co-founded Apple in 1976 and helped it become a global brand"),
    ("Apple", "iPhone", "Apple released the first iPhone in 2007, revolutionizing the smartphone market"),
]
relations = [dict(zip(RELATION_COLUMNS, row)) for row in relation_rows]

# Persist both tables as CSV files (no index column) in the working directory.
entities_df = pd.DataFrame(entities)
relations_df = pd.DataFrame(relations)

for frame, csv_path in ((entities_df, "entities.csv"), (relations_df, "relations.csv")):
    frame.to_csv(csv_path, index=False)

print("✅ 成功创建 entities.csv 和 relations.csv")


✅ 成功创建 entities.csv 和 relations.csv


In [None]:

# ========= Step 1: Load CSV ========= #
# entities.csv contains: entity_name, type, description
# relations.csv contains: source, target, description
# Duplicate entity names would all receive the same node id below, leaving
# more feature rows than nodes, so only the first occurrence is kept.
entities_df = (
    pd.read_csv('entities.csv')
    .drop_duplicates(subset='entity_name')
    .reset_index(drop=True)
)
relations_df = pd.read_csv('relations.csv')

# Encode node names to integer IDs
node_encoder = LabelEncoder()
entities_df['node_id'] = node_encoder.fit_transform(entities_df['entity_name'])

# LabelEncoder assigns ids in sorted-name order, not file order. Everything
# downstream that is positional (the node feature matrix built from this
# frame, and `entities_df.iloc[i]` lookups on retrieval results) assumes that
# row i describes node i, so re-order the frame to make that true. Without
# this, edge_index (which uses node_id) points at the wrong feature rows.
entities_df = entities_df.sort_values('node_id').reset_index(drop=True)
node_name_to_id = dict(zip(entities_df['entity_name'], entities_df['node_id']))

# ========= Step 2: Build edge_index ========= #
# Fail early, listing every relation endpoint that has no matching entity,
# instead of a bare KeyError on the first one encountered.
unknown_names = sorted(
    (set(relations_df['source']) | set(relations_df['target'])) - set(node_name_to_id),
    key=str,
)
if unknown_names:
    raise KeyError(f"relations reference entities missing from the entity table: {unknown_names}")

# One [source_id, target_id] pair per relation row (directed: source -> target).
edges = [
    [node_name_to_id[src], node_name_to_id[tgt]]
    for src, tgt in zip(relations_df['source'], relations_df['target'])
]
# reshape(-1, 2) keeps the tensor two-dimensional when there are no relations,
# so the result is always [2, num_edges] (i.e. [2, 0] for an empty table).
edge_index = torch.tensor(edges, dtype=torch.long).reshape(-1, 2).t().contiguous()

# ========= Step 3: Generate node features using Sentence-BERT ========= #
model = SentenceTransformer('all-MiniLM-L6-v2')

# Missing descriptions are encoded as the empty string.
node_descs = entities_df['description'].fillna("").tolist()
node_features = torch.tensor(model.encode(node_descs), dtype=torch.float)  # shape: [num_nodes, emb_dim]

# Edge features take the same numpy -> CPU float tensor route as the node
# features. The previous `convert_to_tensor=True` call could hand back a
# tensor on the encoder's device (e.g. CUDA) while the node features stayed
# on the CPU, leaving the graph's tensors on different devices.
edge_descs = relations_df['description'].fillna("").tolist()
edge_features = torch.tensor(model.encode(edge_descs), dtype=torch.float)
# Keep the matrix 2-D even when there are no relations.
edge_features = edge_features.reshape(-1, node_features.size(1))  # shape: [num_edges, emb_dim]

In [None]:
# ========= Step 4: Build PyG Data ========= #
# Collect the three graph tensors by keyword, then hand them to Data at once.
graph_tensors = {
    "x": node_features.float(),          # [num_nodes, node_dim]
    "edge_index": edge_index,            # [2, num_edges]
    "edge_attr": edge_features.float(),  # [num_edges, emb_dim]
}
data = Data(**graph_tensors)


In [23]:
# ========= Step 5: Define GraphSAGE ========= #
class GraphSAGE(torch.nn.Module):
    """Two-layer GraphSAGE encoder: SAGEConv -> ReLU -> SAGEConv.

    Args:
        in_channels: Size of each input node feature vector.
        hidden_channels: Output size of the first convolution.
        out_channels: Output size of the second convolution (embedding size).
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Return the second layer's output for node features `x` and `edge_index`."""
        hidden = F.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)

# GraphSAGE baseline (does not use edge attributes). It is bound to its own
# names because `model_gnn` and `optimizer` are assigned to the edge-aware
# model further down in this same cell; binding the baseline to those names
# made it a dead store that was discarded before it could ever be used.
sage_model = GraphSAGE(in_channels=node_features.size(1), hidden_channels=128, out_channels=64)
sage_optimizer = torch.optim.Adam(sage_model.parameters(), lr=0.01)

class EdgeAwareGNN(torch.nn.Module):
    """Two-layer GAT encoder that passes edge attributes to both layers.

    Args:
        in_channels: Size of each input node feature vector.
        edge_dim: Size of each edge attribute vector.
        hidden_channels: Output size of the first convolution.
        out_channels: Output size of the second convolution (embedding size).
    """

    def __init__(self, in_channels, edge_dim, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GATConv(in_channels, hidden_channels, edge_dim=edge_dim)
        self.conv2 = GATConv(hidden_channels, out_channels, edge_dim=edge_dim)

    def forward(self, x, edge_index, edge_attr):
        """Apply GATConv -> ReLU -> GATConv and return the second layer's output."""
        hidden = F.relu(self.conv1(x, edge_index, edge_attr))
        return self.conv2(hidden, edge_index, edge_attr)
    
# Seed immediately before construction so the randomly initialised weights,
# and therefore the embeddings derived from them, are the same on every
# Restart & Run All.
torch.manual_seed(42)
model_gnn = EdgeAwareGNN(
    in_channels=node_features.size(1),
    edge_dim=edge_features.size(1),
    hidden_channels=128,
    out_channels=64,
)
optimizer = torch.optim.Adam(model_gnn.parameters(), lr=0.01)

In [None]:
# ========= Step 6: Train GNN (unsupervised, simple) ========= #
NUM_EPOCHS = 20

model_gnn.train()
for epoch in range(NUM_EPOCHS):
    optimizer.zero_grad()
    out = model_gnn(data.x, data.edge_index, data.edge_attr)
    # Placeholder objective: the mean L2 norm of the node embeddings. It uses
    # no graph structure or labels; it only serves to update the weights.
    loss = out.norm(dim=1).mean()
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

# ========= Step 7: Extract final embeddings ========= #
model_gnn.eval()
# Forward pass without gradient tracking; the result is then exposed as a
# NumPy array for the retrieval index.
with torch.no_grad():
    embeddings = model_gnn(x=data.x, edge_index=data.edge_index, edge_attr=data.edge_attr)
embeddings_np = embeddings.numpy()


Epoch 1, Loss: 0.0234
Epoch 2, Loss: 0.0260
Epoch 3, Loss: 0.0259
Epoch 4, Loss: 0.0228
Epoch 5, Loss: 0.0181
Epoch 6, Loss: 0.0153
Epoch 7, Loss: 0.0148
Epoch 8, Loss: 0.0149
Epoch 9, Loss: 0.0152
Epoch 10, Loss: 0.0149
Epoch 11, Loss: 0.0135
Epoch 12, Loss: 0.0114
Epoch 13, Loss: 0.0098
Epoch 14, Loss: 0.0099
Epoch 15, Loss: 0.0108
Epoch 16, Loss: 0.0100
Epoch 17, Loss: 0.0081
Epoch 18, Loss: 0.0079
Epoch 19, Loss: 0.0079
Epoch 20, Loss: 0.0073


In [None]:
# ========= Step 8: Build FAISS index for retrieval ========= #
# Flat L2 index sized to the embedding width, filled with every node embedding.
embedding_dim = embeddings_np.shape[1]
faiss_index = faiss.IndexFlatL2(embedding_dim)
faiss_index.add(embeddings_np)

# ========= Step 9: Test search ========= #
def search(query, top_k=3):
    """Return the names of the entities whose GNN embeddings are nearest to `query`.

    The query text is encoded with the sentence encoder, passed through the
    trained GNN as a single node with no edges, and matched against the FAISS
    index of node embeddings.

    Args:
        query: Free-text query string.
        top_k: Maximum number of entity names to return.

    Returns:
        A list of at most `top_k` entity names, nearest first. The list is
        empty when `top_k` is not positive or the index holds no vectors.
    """
    # Asking FAISS for more neighbours than it holds pads the result with -1,
    # which `iloc` would silently interpret as "last row".
    top_k = min(int(top_k), faiss_index.ntotal)
    if top_k <= 0:
        return []

    query_vec = torch.as_tensor(
        model.encode([query]), dtype=data.x.dtype, device=data.x.device
    )
    # The query is one node that is not part of the graph, so it gets an empty
    # edge set. Re-using the graph's edge_index here would reference node ids
    # that do not exist in a one-row feature matrix.
    no_edges = data.edge_index.new_empty((2, 0))
    no_edge_attr = data.edge_attr.new_empty((0, data.edge_attr.size(1)))
    with torch.no_grad():
        query_emb = model_gnn(query_vec, no_edges, no_edge_attr)

    _, neighbor_ids = faiss_index.search(query_emb.cpu().numpy(), top_k)
    return [
        entities_df.iloc[int(idx)]['entity_name']
        for idx in neighbor_ids[0]
        if idx >= 0
    ]

# ========= Example ========= #
# Echo the query text, then show the entity names returned for it.
example_query = "苹果公司领导人"
print(f"🔍 查询: '{example_query}'")
print(search(example_query))


NameError: name 'faiss' is not defined