# Evaluation
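Computes link (entity) prediction, relation prediction, and numeric prediction metrics from the per-dataset prediction CSV files, and prints a summary report for each dataset.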

In [None]:
# Mount Google Drive at /content/drive; the notebook later changes into a
# directory under this mount point to read the prediction CSVs.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import pandas as pd
import nltk
import ast

In [None]:
# Work from the visualization directory: the CSV paths built in later cells are relative ("./<dataset>/...").
%cd "/content/drive/MyDrive/code/visualization/"

/content/drive/MyDrive/code/visualization


In [None]:
# Experiment identifiers; these values are interpolated into the result
# directory paths built in the following cells.
exp_name, exp_date = "hynt", "20250615"
lr, dim = 0.0004, 256
test_epoch = 1050

# Datasets to evaluate, in reporting order. Every entry except the first
# shares the "VTHNKG-" prefix.
dataset_names = ["VTKG-I"] + [
    f"VTHNKG-{variant}"
    for variant in ("O", "OT", "CQ", "CQI", "NQ", "NT", "OA", "OA_NT", "OA_CQI")
]

In [None]:
# Map each dataset name to the paths of its entity / relation prediction CSVs.
# NOTE(review): the evaluation loop further down builds its own paths with a
# longer hyperparameter directory name - confirm these dictionaries are still used.
entity_csvs, relation_csvs = {}, {}

for dataset_name in dataset_names:
    for csv_paths, file_name in (
        (entity_csvs, "entity_predictions.csv"),
        (relation_csvs, "relation_predictions.csv"),
    ):
        csv_paths[dataset_name] = f"./{dataset_name}/{dataset_name}_{exp_name}_{exp_date}/lr_{lr}_dim_{dim}__{test_epoch}/{file_name}"

## Entity Prediction

In [None]:
import pandas as pd

def evaluate_link_prediction_metrics(df, target_type=None):
    """
    Compute MRR and Hit@1 / Hit@3 / Hit@10 for link (entity) prediction results.

    Parameters:
        df (pd.DataFrame): Evaluation results. Must contain a 'rank' column;
            the 'type' column is used for row filtering when it is present.
        target_type (str or None): Which rows to evaluate.
            'head'    -> rows whose type is 'head'
            'tail'    -> rows whose type is 'tail'
            'triplet' -> rows whose type is 'head' or 'tail'
            'all'     -> rows whose type is 'head', 'tail' or 'value'
            None or any unrecognized value -> every row, unfiltered.

    Returns:
        dict: {'MRR', 'Hit@1', 'Hit@3', 'Hit@10', 'Samples'}, metrics rounded
        to 4 decimals and 'Samples' being the number of evaluated rows.
        str: the literal "No data frame" when no rows are selected.
    """
    type_groups = {
        'triplet': ['head', 'tail'],
        'head': ['head'],
        'tail': ['tail'],
        'all': ['head', 'tail', 'value'],
    }
    # Only strings can name a group; anything else (including None) means "no filter".
    target = type_groups.get(target_type) if isinstance(target_type, str) else None

    # Fall back to the whole frame when no filter applies or the frame has no
    # 'type' column. This is spelled out explicitly instead of relying on a
    # bare `except` to swallow the resulting NameError / KeyError.
    if target is None or 'type' not in df.columns:
        target_df = df.copy()
    else:
        target_df = df[df['type'].isin(target)]

    if target_df.empty:
        return "No data frame"

    ranks = target_df['rank']
    mrr = (1 / ranks).mean()
    hit1 = (ranks == 1).mean()
    hit3 = (ranks <= 3).mean()
    hit10 = (ranks <= 10).mean()

    return {
        'MRR': float(round(mrr, 4)),
        'Hit@1': float(round(hit1, 4)),
        'Hit@3': float(round(hit3, 4)),
        'Hit@10': float(round(hit10, 4)),
        'Samples': len(target_df)
    }

In [None]:
import pandas as pd

def evaluate_relation_prediction_metrics(df, target_type=None):
    """
    Compute MRR and Hit@1 / Hit@3 / Hit@10 for relation prediction results.

    Parameters:
        df (pd.DataFrame): Evaluation results. Must contain a 'rank' column;
            the 'type' column is used for row filtering when it is present.
        target_type (str or None): Which rows to evaluate.
            'triplet'   -> rows whose type is 'relation'
            'qualifier' -> rows whose type is 'qualifier'
            'all'       -> rows whose type is 'relation' or 'qualifier'
            None or any unrecognized value -> every row, unfiltered.

    Returns:
        dict: {'MRR', 'Hit@1', 'Hit@3', 'Hit@10', 'Samples'}, metrics rounded
        to 4 decimals and 'Samples' being the number of evaluated rows.
        str: the literal "No data frame" when no rows are selected.
    """
    type_groups = {
        'triplet': ['relation'],
        'qualifier': ['qualifier'],
        'all': ['relation', 'qualifier'],
    }
    # Only strings can name a group; anything else (including None) means "no filter".
    target = type_groups.get(target_type) if isinstance(target_type, str) else None

    # Fall back to the whole frame when no filter applies or the frame has no
    # 'type' column. This is spelled out explicitly instead of relying on a
    # bare `except` to swallow the resulting NameError / KeyError.
    if target is None or 'type' not in df.columns:
        target_df = df.copy()
    else:
        target_df = df[df['type'].isin(target)]

    if target_df.empty:
        return "No data frame"

    ranks = target_df['rank']
    mrr = (1 / ranks).mean()
    hit1 = (ranks == 1).mean()
    hit3 = (ranks <= 3).mean()
    hit10 = (ranks <= 10).mean()

    return {
        'MRR': float(round(mrr, 4)),
        'Hit@1': float(round(hit1, 4)),
        'Hit@3': float(round(hit3, 4)),
        'Hit@10': float(round(hit10, 4)),
        'Samples': len(target_df)
    }

In [None]:
def top1_answer_distribution(df):
    """Summarize the values of the 'top1' column.

    Returns:
        tuple: (the five most frequent 'top1' values with their counts,
        the number of distinct 'top1' values).
    """
    answer_counts = nltk.FreqDist(df['top1'].to_list())
    most_frequent = answer_counts.most_common(5)
    distinct_answers = len(answer_counts)
    return (most_frequent, distinct_answers)

def top5_answer_distribution(df):
    """Summarize the items found across all entries of the 'top5' column.

    Each 'top5' cell is parsed with ``ast.literal_eval`` (presumably a
    string-serialized list of candidates) and its items are pooled.

    Returns:
        tuple: (the five most frequent pooled items with their counts,
        the number of distinct pooled items).
    """
    pooled_items = []
    for serialized in df['top5'].to_list():
        pooled_items.extend(ast.literal_eval(serialized))

    item_counts = nltk.FreqDist(pooled_items)
    return (item_counts.most_common(5), len(item_counts))

def num_test(df):
    """Return the number of rows in ``df``."""
    row_count = len(df)
    return row_count

def top5_correct_rate(df):
    """Rate at which an item is the ground truth when it shows up in a top-5 list.

    For every item that was a correct top-5 hit at least once, the rate is
    (number of rows where the item is the 'gt' value and is contained in that
    row's 'top5' list) / (number of times the item appears across all 'top5'
    lists).

    Returns:
        list: up to five ``(item, rate, appearances)`` tuples, sorted by rate
        in descending order; ``rate`` is rounded to 4 decimals and
        ``appearances`` is the item's total count across all 'top5' lists.
    """
    candidate_lists = [ast.literal_eval(cell) for cell in df['top5'].to_list()]
    ground_truths = df['gt'].to_list()

    # Ground-truth values that were found among their own row's candidates.
    hits = [
        truth
        for truth, candidates in zip(ground_truths, candidate_lists)
        if truth in candidates
    ]
    hit_counts = nltk.FreqDist(hits)

    all_candidates = [item for candidates in candidate_lists for item in candidates]
    appearance_counts = nltk.FreqDist(all_candidates)

    rates = [
        (item, round(hit_counts[item] / appearance_counts[item], 4), appearance_counts[item])
        for item in hit_counts
    ]

    ranked = sorted(rates, key=lambda entry: entry[1], reverse=True)
    return ranked[:5]

In [None]:
def mean_se(df):
    """Return the mean of the 'se' column (presumably a per-sample squared error - confirm)."""
    se_values = df['se']
    return se_values.mean()

In [None]:
def _print_answer_statistics(df):
    """Print the answer-distribution lines shared by the entity and relation reports."""
    # Compute each distribution once; both the ranking and the diversity come from it.
    top1_common, top1_diversity = top1_answer_distribution(df)
    top5_common, top5_diversity = top5_answer_distribution(df)
    print("Top1 Answer Distribution:", top1_common)
    print("Top1 Answer Diversity:", top1_diversity)
    print("Top5 Answer Distribution:", top5_common)
    print("Top5 Answer Diversity:", top5_diversity)
    print("Top5 Correct Rate:", top5_correct_rate(df))
    print("Number of Test:", num_test(df))


# Print one evaluation report per dataset: link prediction, relation
# prediction and, when a numeric prediction file exists, the mean 'se'.
for dataset_name in dataset_names:
    # All prediction CSVs of a run live in one directory, so build it once.
    # lr / dim / test_epoch come from the configuration cell rather than being
    # repeated as literals; the remaining hyperparameters have no config
    # variable in this notebook and stay spelled out.
    run_dir = (
        f"./{dataset_name}/{dataset_name}_{exp_name}_{exp_date}/"
        f"lr_{lr}_dim_{dim}_elayer_4_dlayer_4_head_8_hid_2048_drop_0.15"
        f"_smoothing_0.4_batch_1024_steplr_150_{test_epoch}"
    )
    entity_csv = f"{run_dir}/entity_predictions.csv"
    relation_csv = f"{run_dir}/relation_predictions.csv"
    numeric_csv = f"{run_dir}/numeric_predictions.csv"

    df1 = pd.read_csv(entity_csv)
    df2 = pd.read_csv(relation_csv)
    # The numeric prediction file is optional. Only a missing file is treated
    # as "no numeric results"; any other read error should surface.
    try:
        df3 = pd.read_csv(numeric_csv)
    except FileNotFoundError:
        df3 = None

    print("\n")
    print("Dataset: ", dataset_name)
    print("Link Prediction")
    print("Head:", evaluate_link_prediction_metrics(df1, target_type='head'))
    print("Tail:", evaluate_link_prediction_metrics(df1, target_type='tail'))
    print("Triplet:", evaluate_link_prediction_metrics(df1, target_type='triplet'))
    print("All:", evaluate_link_prediction_metrics(df1, target_type='all'))
    _print_answer_statistics(df1)
    print("\n")
    print("Relation Prediction")
    print("Qualifier:", evaluate_relation_prediction_metrics(df2, target_type='qualifier'))
    print("Triplet:", evaluate_relation_prediction_metrics(df2, target_type='triplet'))
    print("All:", evaluate_relation_prediction_metrics(df2, target_type='all'))
    _print_answer_statistics(df2)
    print("\n")
    if df3 is not None:
        print("Numeric Prediction")
        print("Mean SE:", mean_se(df3))
    print("\n")

    print("-------------------------------------------------------------------")



Dataset:  VTKG-I
Link Prediction
Head: {'MRR': 0.5836, 'Hit@1': 0.5154, 'Hit@3': 0.6231, 'Hit@10': 0.6769, 'Samples': 130}
Tail: {'MRR': 0.1857, 'Hit@1': 0.1154, 'Hit@3': 0.2077, 'Hit@10': 0.2923, 'Samples': 130}
Triplet: {'MRR': 0.3847, 'Hit@1': 0.3154, 'Hit@3': 0.4154, 'Hit@10': 0.4846, 'Samples': 260}
All: {'MRR': 0.3847, 'Hit@1': 0.3154, 'Hit@3': 0.4154, 'Hit@10': 0.4846, 'Samples': 260}
Top1 Answer Distribution: [('person.n.01', 62), ('surfboard.n.01', 9), ('pillow.n.01', 8), ('pizza.n.01', 6), ('street.n.01', 6)]
Top1 Answer Diversity: 85
Top5 Answer Distribution: [('person.n.01', 85), ('counter.n.01', 50), ('engine.n.01', 47), ('pizza.n.01', 41), ('dog.n.01', 29)]
Top5 Answer Diversity: 158
Top5 Correct Rate: [('person.n.01', 0.7529, 85), ('plate.n.04', 0.5, 2), ('sky.n.01', 0.1429, 7), ('bag.n.06', 0.0909, 11), ('baggage.n.01', 0.0909, 11)]
Number of Test: 260


Relation Prediction
Qualifier: No data frame
Triplet: {'MRR': 0.2739, 'Hit@1': 0.2, 'Hit@3': 0.2846, 'Hit@10': 0.43