# Evaluating Trajectories with Metrics


## Importing Libraries
The following external libraries must be installed:
- openpyxl
- pandas
- matplotlib


In [3]:

import json
import pandas as pd
import matplotlib.pyplot as plt
import glob
from pathlib import Path
import subprocess



## Configuring and Running the Evaluation



In [4]:
# Configs
# Folder whose direct subfolders are each passed to the evaluation script.
evaluation_folder: Path = Path("../data/0-iteration-prompt/")
# Parent folder under which one output subfolder per evaluated run is addressed.
metrics_folder: Path = Path("../data/0-iteration-prompt-evaluation-results/")
# NOTE(review): the leading ".../" (three dots) refers to a directory literally
# named "..." - confirm whether "../" was intended before relying on this path.
gold_answer_folder: Path = Path(".../gold-standard-answers/velektronik-graph-clean-one-to-four-triple-patterns")
# Despite the name, this is a ":"-separated list of two files, not a folder.
knowledge_graph_folder: str = ":".join(
    ("../supplybench/abox-en.ttl.gz", "../supplybench/tbox.ttl")
)

In [None]:
import sys  # imported here so this cell stays runnable on its own

# Run evaluate_sparql_queries.py once for every run folder inside
# `evaluation_folder`, directing its output to a same-named subfolder of
# `metrics_folder`. Folders are visited in sorted order so the log is
# reproducible across filesystems.
for subfolder in sorted(evaluation_folder.iterdir()):
    if not subfolder.is_dir():
        continue

    metrics_sub_folder = metrics_folder / subfolder.name
    print(metrics_sub_folder.absolute())
    print(subfolder.absolute())

    # sys.executable launches the script with the interpreter that runs this
    # notebook; a bare "python" may resolve to a different installation that
    # lacks the required packages.
    completed = subprocess.run(
        [
            sys.executable,
            "evaluate_sparql_queries.py",
            "-gf", str(gold_answer_folder.absolute()),
            "-pf", str(subfolder.absolute()),
            "-kgf", knowledge_graph_folder,
            "-o", str(metrics_sub_folder.absolute()),
        ],
        check=False,  # keep evaluating the remaining runs if one of them fails
    )
    if completed.returncode != 0:
        # Surface the failure instead of silently leaving a run without results.
        print(
            f"WARNING: evaluate_sparql_queries.py exited with code "
            f"{completed.returncode} for run '{subfolder.name}'"
        )




## Exporting Evaluation Results

In [8]:

def load_evaluation_results(
    metrics_folder,
    model="llama-v3p3-70b-instruct",
    dataset="velektronik-graph-clean",
):
    """Collect the per-question evaluation results of all runs in one DataFrame.

    Every direct subfolder of ``metrics_folder`` is treated as one run. A run
    contributes rows only if it contains ``evaluation-results-detailed.json``;
    that file is expected to hold a JSON list of per-question records (dicts).
    Records whose ``pred_file`` is the empty string are skipped.

    Args:
        metrics_folder: Path (or path string) of the folder holding one
            subfolder per run.
        model: Label stored in the ``model`` column of every row.
        dataset: Label stored in the ``dataset`` column of every row.

    Returns:
        A ``pandas.DataFrame`` with one row per retained question and the
        columns ``subjects``, ``pattern_type``, ``em``, ``f1``, ``precision``,
        ``recall``, ``query``, ``question``, ``sparql_error``, ``run``,
        ``model``, ``dataset`` and ``gold_file_name``. The columns are present
        even when no rows were found. Rows are ordered by run folder name,
        then by their position in the JSON file.

    Raises:
        FileNotFoundError: If ``metrics_folder`` does not exist.
        json.JSONDecodeError: If a result file is not valid JSON.
    """
    columns = [
        "subjects",
        "pattern_type",
        "em",
        "f1",
        "precision",
        "recall",
        "query",
        "question",
        "sparql_error",
        "run",
        "model",
        "dataset",
        "gold_file_name",
    ]

    rows = []
    # Sorted so the row order does not depend on filesystem enumeration order.
    for subfolder in sorted(Path(metrics_folder).iterdir()):
        if not subfolder.is_dir():
            continue
        detailed_json_path = subfolder / "evaluation-results-detailed.json"
        if not detailed_json_path.exists():
            continue

        # JSON is UTF-8 by specification; do not rely on the locale default.
        with open(detailed_json_path, "r", encoding="utf-8") as f:
            evaluation_result = json.load(f)

        for question in evaluation_result:
            # An empty pred_file marks a question without a prediction.
            if question.get("pred_file") == "":
                continue

            subjects = question.get("subjects")
            # Tolerate records without subjects (or with non-string entries)
            # instead of failing on the whole folder.
            pattern_type = "".join(
                subject
                for subject in (subjects or [])
                if isinstance(subject, str) and "pattern" in subject
            )

            rows.append({
                "subjects": subjects,
                "pattern_type": pattern_type,
                "em": question.get("em"),
                "f1": question.get("f1"),
                "precision": question.get("precision"),
                "recall": question.get("recall"),
                "query": question.get("query"),
                "question": question.get("question"),
                "sparql_error": question.get("sparql_parsing_error") or question.get("sparql_not_found"),
                "run": subfolder.name,
                "model": model,
                "dataset": dataset,
                "gold_file_name": question.get("gold_file"),
            })

    # Passing the columns explicitly keeps the schema stable for empty results.
    return pd.DataFrame(rows, columns=columns)


# Load one DataFrame per prompt iteration from its evaluation-results folder.
df_base_prompt = load_evaluation_results(
    metrics_folder=Path("../data/trajectories/0-iteration-base-prompt-evaluation-results/"),
)
df_prompt_with_action_restrictions = load_evaluation_results(
    metrics_folder=Path("../data/trajectories/1-iteration-new-prompt-evaluation-results/"),
)
df_prompt_with_phase_defintion = load_evaluation_results(
    metrics_folder=Path("../data/trajectories/2-iteration-new-prompt-evaluation-results/"),
)


In [13]:
# Write each result table to an .xlsx workbook in the current working directory.
df_base_prompt.to_excel(excel_writer="df_base_prompt.xlsx")
df_prompt_with_action_restrictions.to_excel(
    excel_writer="df_prompt_with_action_restrictions.xlsx"
)
df_prompt_with_phase_defintion.to_excel(
    excel_writer="df_prompt_with_phase_defintion.xlsx"
)

