In [None]:
import pandas as pd
import numpy as np
import re
import chardet
import torch
from pykeen.pipeline import pipeline
from pykeen.triples import TriplesFactory
from pykeen.models import RotatE, ComplEx, ConvE
from pykeen.losses import NSSALoss
from pykeen.regularizers import PowerSumRegularizer
from sklearn.model_selection import train_test_split
from pykeen.evaluation import RankBasedEvaluator

def detect_file_encoding(file_path):
    """Guess the text encoding of ``file_path`` with chardet.

    Only the first 10000 bytes of the file are inspected.

    Args:
        file_path: Path of the file to probe.

    Returns:
        The value stored under the ``'encoding'`` key of chardet's
        detection result.
    """
    with open(file_path, 'rb') as raw_file:
        sample = raw_file.read(10000)
    detection = chardet.detect(sample)
    return detection['encoding']

def extract_properties(properties_str):
    """Turn a ``key: value, key: value`` property string into a dict.

    Keys are word characters followed by a colon; they are stripped and
    lower-cased. A value runs up to the next comma or closing brace and is
    stripped of surrounding whitespace (quotes, if any, are kept). When a key
    occurs more than once, the last value wins.

    Args:
        properties_str: The text found between the braces of a node.

    Returns:
        A dict mapping lower-cased property names to their raw string values.
    """
    return {
        found.group(1).strip().lower(): found.group(2).strip()
        for found in re.finditer(r'(\w+)\s*:\s*([^,}]+)', properties_str)
    }

def parse_cypher_relationship(line):
    """Parse one ``(:Type {props})-[:REL]->(:Type {props})`` line.

    Both nodes must carry the properties ``name``, ``id`` and ``label``.
    Problems are reported with ``print`` rather than raised.

    Args:
        line: A single relationship statement.

    Returns:
        A dict with the keys ``head_id``, ``head_name``, ``head_type``,
        ``head_label``, ``tail_id``, ``tail_name``, ``tail_type``,
        ``tail_label`` and ``relation``; or ``None`` when the line does not
        match the expected structure or a required property is missing.
    """
    relationship_regex = (
        r'\(:\s*(?P<head_type>[\w_]+)\s*\{(?P<head_props>.*?)\}\)'
        r'\s*-\s*\[:(?P<relation>[^\]]+)\]\s*->\s*'
        r'\(:\s*(?P<tail_type>[\w_]+)\s*\{(?P<tail_props>.*?)\}\)'
    )
    found = re.search(relationship_regex, line)
    if found is None:
        print(f"格式错误: 不符合基本结构\n问题行: {line}")
        return None

    try:
        parts = found.groupdict()
        head_props = extract_properties(parts['head_props'])
        tail_props = extract_properties(parts['tail_props'])

        # Check each required property on the head first, then on the tail,
        # so the first reported problem is deterministic.
        for prop in ('name', 'id', 'label'):
            if prop not in head_props:
                raise ValueError(f"头实体缺少必要属性: {prop}")
            if prop not in tail_props:
                raise ValueError(f"尾实体缺少必要属性: {prop}")

        # Key order matters: it becomes the column order of the DataFrame
        # built from these records.
        return dict(
            head_id=head_props['id'],
            head_name=head_props['name'],
            head_type=parts['head_type'].strip(),
            head_label=head_props['label'],
            tail_id=tail_props['id'],
            tail_name=tail_props['name'],
            tail_type=parts['tail_type'].strip(),
            tail_label=tail_props['label'],
            relation=parts['relation'].strip(),
        )
    except Exception as e:
        print(f"解析失败: {str(e)}\n问题行: {line}")
        return None

def standardize_data(df):
    """Trim identifier/name columns and drop rows whose id is empty.

    The four columns ``head_id``, ``tail_id``, ``head_name`` and ``tail_name``
    are rewritten in place on ``df``. Ids are converted to ``str`` before
    trimming; names are trimmed as-is.

    Args:
        df: DataFrame holding at least the four columns listed above.

    Returns:
        ``df`` itself when every row has non-empty ids, otherwise a filtered
        frame without the rows whose head or tail id is the empty string.
    """
    for id_column in ('head_id', 'tail_id'):
        df[id_column] = df[id_column].astype(str).str.strip()
    for name_column in ('head_name', 'tail_name'):
        df[name_column] = df[name_column].str.strip()

    # A row is unusable as soon as either endpoint has no id.
    has_empty_id = df['head_id'].eq('') | df['tail_id'].eq('')
    empty_count = int(has_empty_id.sum())
    if empty_count > 0:
        print(f"警告: 发现{empty_count}个空ID，已自动过滤")
        df = df[~has_empty_id]
    return df

def train_and_evaluate(train_tf, test_tf, model_name="RotatE"):
    """Train one PyKEEN model and evaluate it on the held-out triples.

    Known model names ("RotatE", "ComplEx", "ConvE") get their own
    hyper-parameter overrides; any other name is trained with the shared
    defaults only.

    Args:
        train_tf: Triples factory used for training.
        test_tf: Triples factory used for testing/evaluation.
        model_name: Name of the model handed to ``pipeline``.

    Returns:
        A ``(result, metric_results)`` tuple: the object returned by
        ``pipeline`` and the output of a separate ``RankBasedEvaluator`` run
        on the test triples, filtered with the training triples.
    """
    # Shared defaults; the device is passed to the pipeline, not to the model.
    hyperparams = dict(
        embedding_dim=256,
        loss=NSSALoss(margin=12.0),
        regularizer=PowerSumRegularizer(p=2.0, weight=1e-5),
    )

    # Overrides are wrapped in callables so only the selected entry is built.
    override_builders = {
        "RotatE": lambda: dict(
            embedding_dim=200,
            loss=NSSALoss(margin=9.0),
        ),
        "ComplEx": lambda: dict(
            embedding_dim=256,
            loss=NSSALoss(margin=5.0),
        ),
        "ConvE": lambda: dict(
            embedding_dim=200,
            input_channels=1,
            output_channels=32,
            embedding_height=10,
            embedding_width=20,
            kernel_height=3,
            kernel_width=3,
            input_dropout=0.2,
            feature_map_dropout=0.2,
            output_dropout=0.3,
        ),
    }
    build_override = override_builders.get(model_name)
    if build_override is not None:
        hyperparams.update(build_override())

    print(f"\n开始训练{model_name}模型...")
    run_device = "cuda" if torch.cuda.is_available() else "cpu"
    training_options = {
        "num_epochs": 200,
        "batch_size": 256,
        "checkpoint_name": f"{model_name}_checkpoint.pt",
        "checkpoint_frequency": 20,
    }
    result = pipeline(
        training=train_tf,
        testing=test_tf,
        model=model_name,
        model_kwargs=hyperparams,
        training_kwargs=training_options,
        evaluation_kwargs={"batch_size": 128},
        random_seed=42,
        device=run_device,
    )

    # Detailed evaluation of the trained model on the test triples.
    rank_evaluator = RankBasedEvaluator()
    metric_results = rank_evaluator.evaluate(
        model=result.model,
        mapped_triples=test_tf.mapped_triples,
        batch_size=128,
        additional_filter_triples=[train_tf.mapped_triples],
    )

    return result, metric_results

def main(file_path="E:\\研究生内容\\眼科-论文\\人工智能\\知识图谱\\眼底疾病数据库\\Relation.csv"):
    """Parse a relationship file, train several KGE models and export the best.

    Steps: detect the file encoding, read the non-empty lines, parse each one
    with ``parse_cypher_relationship``, clean the records, split the triples
    into train/test, train RotatE, ComplEx and ConvE, and save the embeddings
    of the model with the highest inverse harmonic mean rank.

    Args:
        file_path: Path of the relationship file; defaults to the location
            that was previously hard-coded.

    Returns:
        None. Progress and errors are reported with ``print``; the function
        returns early when the file cannot be read or yields no usable
        triples.
    """
    try:
        encoding = detect_file_encoding(file_path)
        print(f"文件编码: {encoding}")
        with open(file_path, "r", encoding=encoding, errors='replace') as f:
            stripped_lines = (raw_line.strip() for raw_line in f)
            lines = [text for text in stripped_lines if text]
        print(f"\n读取到 {len(lines)} 行数据")
        for i, line in enumerate(lines[:3], 1):
            print(f"[样本{i}] {line}")
    except Exception as e:
        # Top-level boundary: report the problem and stop instead of crashing.
        print(f"文件读取失败: {str(e)}")
        return

    relations = []
    error_log = []
    for line_num, line in enumerate(lines, 1):
        parsed = parse_cypher_relationship(line)
        if parsed:
            relations.append(parsed)
        else:
            error_log.append({"line_num": line_num, "content": line})

    print(f"\n解析结果:")
    print(f"√ 成功解析: {len(relations)} 行")
    print(f"× 解析失败: {len(error_log)} 行")

    if not relations:
        print("错误: 没有解析出任何有效关系")
        return

    df = standardize_data(pd.DataFrame(relations))

    # Entity names are used as identifiers. Duplicate triples are removed so
    # the same fact cannot end up in both the training and the test split.
    triples = (
        df[['head_name', 'relation', 'tail_name']]
        .drop_duplicates()
        .to_numpy(dtype=str)
    )
    if len(triples) == 0:
        # Every parsed row was filtered out (e.g. all ids were empty).
        print("错误: 没有解析出任何有效关系")
        return

    # Split through the factory rather than on the raw list: both parts then
    # share one entity/relation mapping, and test triples are not lost
    # because one of their entities or relations never appears in training.
    full_tf = TriplesFactory.from_labeled_triples(triples)
    train_tf, test_tf = full_tf.split([0.8, 0.2], random_state=42)

    # Train and evaluate every candidate model.
    ranking_metric = 'both.realistic.inverse_harmonic_mean_rank'
    model_results = {}
    for model_name in ["RotatE", "ComplEx", "ConvE"]:
        result, metrics = train_and_evaluate(train_tf, test_tf, model_name)
        model_results[model_name] = {
            "result": result,
            "metrics": metrics
        }
        print(f"\n{model_name} 模型评估结果:")
        print(f"IHMR: {metrics.get_metric(name=ranking_metric)}")
        print(f"Hits@10: {metrics.get_metric(name='both.realistic.hits_at_10')}")

    # Pick the model with the highest inverse harmonic mean rank.
    best_name, best_entry = max(
        model_results.items(),
        key=lambda item: item[1]['metrics'].get_metric(ranking_metric),
    )
    print(f"\n最佳模型: {best_name}")

    # Persist the embeddings of the winning model.
    save_embeddings(best_entry['result'], train_tf, "best_model")

def _write_embedding_csv(representation, label_to_id, device, index_name, csv_path):
    """Look up every embedding row and write it to ``csv_path`` labelled by name."""
    # Order the labels by their numeric id so row i really belongs to id i,
    # instead of relying on the iteration order of the mapping.
    labels = sorted(label_to_id, key=label_to_id.get)
    # The indices must live on the same device as the representation, and the
    # result must be moved to the CPU before it can become a NumPy array.
    indices = torch.arange(len(labels), device=device)
    vectors = representation(indices).detach().cpu().numpy()
    table = pd.DataFrame(vectors, index=labels)
    table.index.name = index_name
    table.to_csv(csv_path)


def save_embeddings(
    result,
    triples_factory,
    prefix,
    output_dir=r"D:\Data\Graduate Content\Ophthalmology Papers\Artificial intelligence\Knowledge Graph\Retinal Disease",
):
    """Save the entity and relation embeddings of a trained model as CSV.

    Two files are written into ``output_dir``:
    ``{prefix}_entity_embeddings.csv`` and ``{prefix}_relation_embeddings.csv``.
    Each row is indexed by the entity/relation label.

    Args:
        result: Pipeline result whose ``model`` exposes
            ``entity_representations``, ``relation_representations`` and
            ``device``.
        triples_factory: Object providing ``entity_to_id`` and
            ``relation_to_id`` label-to-index mappings.
        prefix: File-name prefix for both CSV files.
        output_dir: Target directory; created when it does not exist yet.
            Defaults to the directory that was previously hard-coded.

    Returns:
        None.
    """
    import os  # local import keeps this block self-contained

    # Create the folder up front so a missing directory cannot discard the
    # outcome of a long training run.
    os.makedirs(output_dir, exist_ok=True)

    model = result.model
    device = model.device

    # Entity embeddings
    _write_embedding_csv(
        model.entity_representations[0],
        triples_factory.entity_to_id,
        device,
        "entity",
        os.path.join(output_dir, f"{prefix}_entity_embeddings.csv"),
    )

    # Relation embeddings
    _write_embedding_csv(
        model.relation_representations[0],
        triples_factory.relation_to_id,
        device,
        "relation",
        os.path.join(output_dir, f"{prefix}_relation_embeddings.csv"),
    )

    print(f"\n已保存{prefix}的嵌入向量到指定目录")

# Run the whole workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

: 