In [3]:
# autoreload
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
import sys
import os

# Add the parent directory of src to the path
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from src.db.manager import DBManager
from src.input_to_instructions.load_and_execute import *
from src.input_to_instructions.types import *
from src.operation.execute import *
from src.response_generation.load_and_execute import *

INFO:datasets:PyTorch version 2.7.0+cu128 available.


ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!


In [7]:
from collections import defaultdict, Counter
import logging

import pandas as pd
import numpy as np
from tqdm import tqdm
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

import json
import itertools

# from db.manager import DBManager
from operation.execute import OperationExecutor
from pathlib import Path
import warnings
import datetime


warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

In [8]:
BASE_DIR = "../"
def read_json(path):
    with open(path, "r", encoding="utf-8") as f:
        result = json.loads(f.read())
    
    # result = [{"Input": d["Input"], "Response": json.dumps(d["Response"], ensure_ascii=False)} for d in result]
    return result

In [9]:
ResponseGeneration.initialize(
    log_output=False,
    instance_type="unsloth"
)

Are you certain you want to do remote code execution?
==((====))==  Unsloth 2025.6.2: Fast Siglip patching. Transformers: 4.52.3.
   \\   /|    NVIDIA H100 80GB HBM3. Num GPUs = 1. Max memory: 79.189 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu128. CUDA: 9.0. CUDA Toolkit: 12.8. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = True]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Siglip does not support SDPA - switching to eager!
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.


Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [59]:

from src.input_to_instructions.types import InstructionQ_raw
def get_time(df, fmt="datetime"):
    # from df get 'timestamp' column and return them in format
    if fmt == "date":
        fmt = '%Y-%m-%d'
    elif fmt == "month":
        fmt = '%Y-%m'
    elif fmt == "year":
        fmt = '%Y'
    else:
        fmt = '%Y-%m-%d %H:%M:%S'
    
    if isinstance(df['timestamp'], pd.Timestamp):
        result = df['timestamp'].strftime(fmt)
    else:
        result = df['timestamp'].apply(lambda x: x.strftime(fmt))
    return sorted(list(set(result)))

def get_spatials(df):
    return pd.unique(df['idu_name'])

def get_tv(df, col:str|list[str], fmt="datetime"):
    if isinstance(col, str):
        col = [col]
    
    timestamps = get_time(df, fmt)
    return_tuple = tuple([timestamps] + [df[c] for c in col])
    return return_tuple

def data(metadata, mapping, query_results, t=str|list[str], s=str|list[str], m=str|list[str]):
    if isinstance(t, str):
        t = [t]
    if isinstance(s, str):
        s = [s]
    if isinstance(m, str):
        m = [m]

    t_raw = [mapping.temporal[t_highlevel] for t_highlevel in t]
    s_raw = [mapping.spatials[s_highlevel] for s_highlevel in s]
    m_raw = [mapping.modalities[m_highlevel] for m_highlevel in m]
    
    print(m_raw, t_raw, s_raw)
    result_df = DBManager.structured_query_data_t_v2(metadata, m_raw, t_raw, s_raw, get_rowids=True)

    cols = list(result_df.columns)
    cols.remove("id")
    cols.remove("idu_name")
    cols.remove("timestamp")
    rows = list(result_df["id"])
    query_results.append({
        "result_columns": cols,
        "result_indices": rows,
    })
    print(cols, rows)

    # For demo, drop rows where any value is -1
    result_df = result_df.loc[(result_df != -1).all(axis=1)]

    # drop "id" from result_df
    result_df = result_df.drop(columns=['id'])

    # change column names to high level
    inverse_mapping = {v: k for k, v in mapping.modalities.items()}
    result_df.columns = [inverse_mapping[col] if col in inverse_mapping else col for col in result_df.columns]

    # change idu_name raw values to high level
    inverse_mapping = {v: k for k, v in mapping.spatials.items()}
    result_df["idu_name"] = result_df["idu_name"].map(inverse_mapping)

    return result_df


def run_query_v2(user_input, metadata, mapping, expectations, required_variables, scripts):
    query_results = []
    variables = {}
    if scripts is not None:

        # search data(t=~~, ...,)
        globals()['metadata'] = metadata
        globals()['mapping'] = mapping
        globals()['query_results'] = query_results
        try:
            for script in scripts:
                
                if "data" in script:
                    script = script.replace("data(", "data(metadata, mapping, query_results, ")
                
                try:
                    exec(script, globals())
                except Exception as e:
                    print(f"Error in executing script: {script}")
                    print(e)
                    raise e
        
            variables = {name:globals()[name] for name in globals() if name.startswith("v_")}

            response, required_variables = ResponseGeneration.execute_v2(expectations, required_variables, variables, user_input, exp_tag=None)
            return response, variables, required_variables, query_results
        except Exception as e:
            print(f"Error in running query_v2: {e}")
            return "Ïã§ÌñâÏ§ë ÏóêÎü¨Í∞Ä Î∞úÏÉùÌñàÏäµÎãàÎã§.", variables, None, query_results
    else:
        variables = {}
        unknown_spatials = [k for k, v in mapping.spatials.items() if v == "Unknown"]
        unknown_modalities = [k for k, v in mapping.modalities.items() if v == "Unknown"]
        
        response_unknown = f"Ï£ÑÏÜ°Ìï©ÎãàÎã§, {unknown_spatials + unknown_modalities}Îäî Ï°¥Ïû¨ÌïòÏßÄ ÏïäÎäî Í≥µÍ∞ÑÏù¥ÎÇò Î™®Îã¨Î¶¨Ìã∞ ÏûÖÎãàÎã§."
        return response_unknown, variables, [], query_results


def run_query(user_input, metadata, instructions, exp_tag=None):
    variables = {
        "Metadata": metadata,
    }
    query_results = []
        
    
    for instruction in instructions:
        # logger.debug(f"Executing instruction: {instruction.__class__.__name__}")
        # print(f"Executing instruction: {instruction.__class__.__name__}")
        
        if type(instruction) == InstructionQ:
            # Execute query
            result_df = DBManager.structured_query_data_t(metadata, instruction.args, get_rowids=True)
            # if result_df is None:
                # print("Ï£ÑÏÜ°Ìï©ÎãàÎã§, Í¥ÄÎ†® Îç∞Ïù¥ÌÑ∞Î•º Ï∞æÏùÑ Ïàò ÏóÜÏäµÎãàÎã§.", "response")
                # return

            cols = list(result_df.columns)
            cols.remove("id")
            cols.remove("idu")
            rows = list(result_df["id"])

            query_results.append({
                "result_columns": cols,
                "result_indices": rows,
            })

            # For demo, drop rows where any value is -1
            result_df = result_df.loc[(result_df != -1).all(axis=1)]

            # drop "id" from result_df
            result_df = result_df.drop(columns=['id'])
           
            #pd.set_option('display.max_rows', 10000)        
            #pd.set_option('display.max_columns', 1000)
            #pd.set_option('display.width', 1000)
            #pd.set_option('display.max_colwidth', 1000)
            #print(f"QueryResult: {result_df}")

            variables[instruction.result_name] = result_df
        elif type(instruction) == InstructionQ_raw:
            instruction.query = instruction.query.replace(" FROM \"data_t\"", ", \"id\" FROM \"data_t\"")
            result_df = DBManager.execute_structured_query_string(
                instruction.query
            )
            # rename idu_name to idu
            result_df = result_df.rename(columns={'idu_name': 'idu'})
            
            cols = list(result_df.columns)
            cols.remove("id")
            cols.remove("idu")
            rows = list(result_df["id"])

            query_results.append({
                "result_columns": cols,
                "result_indices": rows,
            })

            # drop "id" from result_df
            result_df = result_df.drop(columns=['id'])
            
            variables[instruction.result_name] = result_df
            # print(result_df, flush=True)

        elif type(instruction) == InstructionO:
            # Execute operation
            # variables_to_report = {k: v for k, v in variables.items() if k not in ["Metadata"]}
            # print(variables_to_report)
            result_dict = OperationExecutor.execute(variables, instruction.scripts)
            # print(instruction.scripts, instruction.returns, result_dict)
            variables.update(result_dict)
            pass
            # print(fig, "graph")
        elif type(instruction) == InstructionR:
            # Execute response generation
            variables_to_report = {k: v for k, v in variables.items() if k not in ["Metadata"]}
            # print(variables_to_report)
            # variables_to_report = ResponseGeneration.stringify_variables(variables_to_report)
            # variables_to_report = summarize_variables_to_report(variables_to_report)

            # print(f"Variables: {variables_to_report}")

            keys_to_leave = ["modality_mapping", "idu_mapping"]
            metadata_ = {}
            for key in metadata.keys():
                if key in keys_to_leave:
                    metadata_[key] = metadata[key]

            response, required_variables = ResponseGeneration.execute(instruction, variables, user_input, metadata_, exp_tag=exp_tag)
            # print(f"Required variables: {required_variables}")
            
            # response = instruction.expectations[0] # "{{var}}..."
            # for var_name, var_value in required_variables.items():
            #     placeholder = f"{{{{{var_name}}}}}"
            #     if placeholder in response:
            #         response = response.replace(placeholder, str(var_value))

            
            return response, variables_to_report, required_variables, query_results

In [51]:
from copy import deepcopy


def build_query_groundtruth():
    dataset_name = "v7-250309-reduceinputanddatefunctioncall"
    def read(path):
        data = read_json(path)
        for i, d in enumerate(data):
            data[i]["Scenario"] = directory.name
            if "v7" in dataset_name:
                data[i]["Metadata"] = metadata
        return data

    ds_ts = []
    base_dataset_dir = Path(f"{BASE_DIR}/finetuning/dataset/{dataset_name}")
    
    for directory in base_dataset_dir.iterdir():
        if directory.is_dir():
            if "v7" in dataset_name:
                metadata = read_json(f"{directory}/metadata.json")
            
            # d = read(f"{directory}/onlyq_ts.json")
            
            ds_ts.extend(read(f"{directory}/onlyq_ts.json"))
            ds_ts.extend(read(f"{directory}/onlyq_tr.json"))
            # ds_tr.extend(read(f"{directory}/graph.json"))
    
    ds = ds_ts
    print(len(ds))
    # if "v7" in dataset_name:
    #     db_gt_filename = f"{BASE_DIR}/experiments/db_gt_v7.json"
    # else:
    #     db_gt_filename = f"{BASE_DIR}/experiments/db_gt.json"
    #     metadata = None
    
    # with open(db_gt_filename, "w", encoding="utf-8") as f:
        # f.write("[")
    # with tqdm(total=len(ds)) as pbar:
    
    gts = []

    for d in ds:
        cont = False
        tags = d["Tags"]["Style"]
        skip_tags = ["Reason", "Graph", "Unrelated", "Prediction"]
        for st in skip_tags:
            if st in tags:
                cont = True
                break
        if cont:
            continue

        # pbar.set_description(f"Processing {d['Input']}")
        # print("--")
        exp_tag = "v2"
        # print(f"Warning! exp_tag is v2")
        mapping, expectations, required_variables, scripts = InputToInstruction.postprocess_v2(deepcopy(d['Response']), exp_tag=exp_tag)
        user_input, tags, metadata, scenario = d["Input"], d["Tags"], d["Metadata"], d["Scenario"]
        # if user_input != "ÏßÄÍ∏à Î™áÏãúÏïº?":
        #     continue

        response, variables_to_report, required_variables, query_results = run_query_v2(user_input, metadata, mapping, expectations, required_variables, scripts)
        print(f"Ï∂úÎ†•: {response}")
        # print({k: (v, type(v)) for k, v in variables_to_report.items()})
        gts.append({
            "Input": user_input,
            "Metadata": metadata,
            "Scenario": scenario,
            "Tags": tags,
            "GT": d['Response'],
            "Response": response,
            # "RequiredVariables": required_variables,
            "QueryResults": query_results,
            # "VariablesToReport": variables_to_report,
        })

    # save to json
    with open(f"./gts.json", "w", encoding="utf-8") as f:
        json.dump(gts, f, ensure_ascii=False, indent=4)
  

In [57]:

build_query_groundtruth()

38
Ï∂úÎ†•: Ïù¥Î≤àÏ£º Ïö∞Î¶¨Î∞òÏùò ÌèâÍ∑† Ïã§ÎÇ¥Ïò®ÎèÑ(25.67¬∞C)ÏôÄ ÏïûÎ∞òÏùò ÌèâÍ∑† Ïã§ÎÇ¥Ïò®ÎèÑ(25.98¬∞C)Ïùò Ï∞®Ïù¥Îäî 0.31¬∞CÏûÖÎãàÎã§.
Ï∂úÎ†•: ÌòÑÏû¨ ÏÑ§Ï†ïÏò®ÎèÑ(23.00¬∞C)ÏôÄ Ïã§ÎÇ¥Ïò®ÎèÑ(28.50¬∞C)Ïùò Ï∞®Ïù¥Îäî 5.50¬∞CÏûÖÎãàÎã§.
Ï∂úÎ†•: ÏßÄÎÇúÎã¨ 8Ïõî 19ÏùºÏóê ÏÑ§Ï†ïÏò®ÎèÑ(23.00¬∞C)ÏôÄ Ïã§ÎÇ¥Ïò®ÎèÑ(22.00¬∞C) Ï∞®Ïù¥Í∞Ä 1.00¬∞CÎ°ú Í∞ÄÏû• Ïª∏ÏäµÎãàÎã§.
Ï∂úÎ†•: Ïù¥Î≤àÏ£º Ïö∞Î¶¨Î∞ò ÌèâÍ∑† Ïã§ÎÇ¥Ïò®ÎèÑ(25.67¬∞C)ÏôÄ ÏòÜÎ∞ò ÌèâÍ∑† Ïã§ÎÇ¥Ïò®ÎèÑ(25.11¬∞C) Ï∞®Ïù¥Îäî 0.56¬∞CÏûÖÎãàÎã§.
Ï∂úÎ†•: 2Ï£ºÏ†Ñ 9Ïõî 12Ïùº, 13Ïùº, 14Ïùº, 15Ïùº, 16Ïùº, 17Ïùº, 18ÏùºÏóê Ïã§ÎÇ¥Ïò®ÎèÑ(26.00¬∞C)Í∞Ä Í∞ÄÏû• ÎÜíÏïòÏäµÎãàÎã§.
Ï∂úÎ†•: Ï£ÑÏÜ°Ìï©ÎãàÎã§, ['ÌôîÏÑ±']Îäî Ï°¥Ïû¨ÌïòÏßÄ ÏïäÎäî Í≥µÍ∞ÑÏù¥ÎÇò Î™®Îã¨Î¶¨Ìã∞ ÏûÖÎãàÎã§.
Ï∂úÎ†•: Ï£ÑÏÜ°Ìï©ÎãàÎã§, ['ÏäµÎèÑ']Îäî Ï°¥Ïû¨ÌïòÏßÄ ÏïäÎäî Í≥µÍ∞ÑÏù¥ÎÇò Î™®Îã¨Î¶¨Ìã∞ ÏûÖÎãàÎã§.
Ï∂úÎ†•: ÏßÄÎÇú 3ÏùºÍ∞Ñ Ïö∞Î¶¨Î∞ò Ïã§ÎÇ¥Ïò®ÎèÑ ÌèâÍ∑†Í∞íÏùÄ 25.13¬∞CÏòÄÏäµÎãàÎã§.
Ï∂úÎ†•: Ïò§Îäò Ïò§ÌõÑ 5Ïãú ÏòÜÎ∞òÏùò ÏÑ§Ï†ïÏò®ÎèÑÎäî 23.00¬∞CÏòÄÏäµÎãàÎã§.
Ï∂úÎ†•: Ïò¨Ìï¥ Ïó¨Î¶Ñ(6Ïõî ~ 8Ïõî) Ïö∞Î¶¨Î∞òÏùò 

In [65]:
from typing import Any  # Any ÌÉÄÏûÖ import ÌïÑÏöî

class EM:
    json_structure = "JsonStructureCorrectness"
    true_positive = "QueryTruePositive"
    false_positive = "QueryFalsePositive"
    false_negative = "QueryFalseNegative"
    
def eval_query(cand_response_filename, db_gt_filename="./gts.json"):
    db_gts = read_json(db_gt_filename)
    cand_responses = read_json(cand_response_filename)
    # metadata_ = read_json(f"{BASE_DIR}/finetuning/dataset/v7-250309-reduceinputanddatefunctioncall/scenario1/metadata.json")
    evaluation_reports = []
    response_reports = []
    with tqdm(total=len(cand_responses)) as pbar:
        for cand_response in cand_responses:
            pbar.set_description(f"Processing {cand_response['Input']}")
            input = cand_response["Input"]
            scenario = cand_response["Scenario"]

            if "Ïö∞Î¶¨Î∞òÍ≥º ÏïûÎ∞ò Ï§ë Í∞ÄÏû• ÎçîÏö¥ Î∞©ÏùÄ?" not in input:
                continue

            if "Metadata" in cand_response:
                metadata = cand_response["Metadata"]
            else:
                # metadata = metadata_
                metadata = None
            # Í¥ÄÍ≥Ñ ÏóÜÎäî ÏßàÎ¨∏Îì§ÏùÄ Í±¥ÎÑàÎõ∞Ïûê
            gt_report = [d for d in db_gts if d["Input"] == input and d["Scenario"] == scenario]
            assert len(gt_report) <= 1
            if len(gt_report) == 0:
                pbar.update(1)
                continue

            gt_report = gt_report[0]
            tags = gt_report["Tags"]
            # assert gt_report["QueryResults"] != []
            # if gt_report["Result"] == []:
            #     pbar.update(1)
            #     continue
            
            gt_results = [d for d in gt_report["QueryResults"]]
            gt_query_results = defaultdict(list)
            for gt_result in gt_results:
                for col in gt_result["result_columns"]:
                    gt_query_results[col].extend(gt_result["result_indices"])

            gt_total_combinations = sum(len(v) for v in gt_query_results.values())

            gt_response = gt_report["Response"]
            # gt_required_variables = gt_report["RequiredVariables"]
            # gt_variables_to_report = gt_report["VariablesToReport"]
            user_input = gt_report["Input"]

            response_report = {
                "Input": user_input,
                "Metadata": metadata,
                "GT_Response": gt_response,
                # "GT_RequiredVariables": gt_required_variables,
                # "GT_VariablesToReport": gt_variables_to_report,
            }
            # evaluation_report ÎîïÏÖîÎÑàÎ¶¨ ÏÉùÏÑ± (defaultdict ÏÇ¨Ïö©, Í∏∞Î≥∏Í∞í None)

            evaluation_report: dict[str, Any] = defaultdict(lambda: None)
            evaluation_report["Input"] = input
            evaluation_report["Metadata"] = metadata
            evaluation_report["Tags"] = tags
            
            if isinstance(cand_response["Candidate"], dict):
                requirements = ["Thinking", "Expectations", "Mapping"]
                for requirement in requirements:
                    if requirement not in cand_response["Candidate"]:
                        evaluation_report[EM.json_structure] = False
                        break
                else:
                    evaluation_report[EM.json_structure] = True
            else:
                evaluation_report[EM.json_structure] = False
            
            if not evaluation_report[EM.json_structure]:
                evaluation_report[EM.true_positive] = 0
                evaluation_report[EM.false_positive] = 0
                evaluation_report[EM.false_negative] = gt_total_combinations

                print("Failed to parse input: ", input, cand_response["Candidate"])
                evaluation_reports.append(evaluation_report)
                pbar.update(1)
                response_reports.append(response_report)
                continue
            
            
            exp_tag = \
                "woCoTExp" if "woCoTExp" in str(cand_response_filename) else \
                "woOp" if "woOp" in str(cand_response_filename) else \
                "woQM" if "woQM" in str(cand_response_filename) else \
                None
            
            mapping, expectations, required_variables, script = InputToInstruction.postprocess_v2(
                deepcopy(cand_response["Candidate"]), 
                exp_tag=exp_tag
            )
            
            response, variables_to_report, required_variables, _cand_query_results = run_query_v2(user_input, metadata, mapping, expectations, required_variables, script)
            print(response)
            response_report["PD_Response"] = response
            # try:
            #     # response, variables_to_report, required_variables, _cand_query_results = run_query_v2(user_input, metadata, instructions, exp_tag=exp_tag)
            # except Exception as e:
            #     print(f"Error: {e}")
            #     # evaluation_report[EM.true_positive] = 0
            #     # evaluation_report[EM.false_positive] = 0
            #     # evaluation_report[EM.false_negative] = gt_total_combinations
                            
            #     # evaluation_reports.append(evaluation_report)

            #     # response_reports.append(response_report)
                            
            #     # pbar.update(1)
            #     # continue
            
            response_reports.append(response_report)
            
            # required_variables = summarize_variables_to_report(required_variables)
            # print(required_variables)
            # required_variables = ResponseGeneration.stringify_variables(required_variables)
            
            # response_report["PD_RequiredVariables"] = required_variables
            # response_report["PD_VariablesToReport"] = variables_to_report

            if len(_cand_query_results) == 0:
                evaluation_report[EM.true_positive] = 0
                evaluation_report[EM.false_positive] = 0
                evaluation_report[EM.false_negative] = gt_total_combinations
                            
                evaluation_reports.append(evaluation_report)
                pbar.update(1)
                continue
            
            cand_query_results = defaultdict(list)
            for cand_query_result in _cand_query_results:
                for col in cand_query_result["result_columns"]:
                    cand_query_results[col].extend(cand_query_result["result_indices"])

            cand_total_combinations = sum(len(v) for v in gt_query_results.values())

            if len(gt_results) == 0:
                evaluation_report[EM.true_positive] = 0
                evaluation_report[EM.false_positive] = cand_total_combinations
                evaluation_report[EM.false_negative] = 0

                evaluation_reports.append(evaluation_report)
                pbar.update(1)

                continue
            
            # print(gt_total_combinations, cand_total_combinations)
            # True Positive: Í≥µÌÜµÎêú Ïª¨ÎüºÍ≥º Î°úÏö∞Ïùò Î™®Îì† Ï°∞Ìï©
            true_positive = 0
            false_negative = 0
            false_positive = 0
            print(gt_query_results, cand_query_results)
            for col in set(gt_query_results.keys())&set(cand_query_results.keys()):
                s_gt_query_result = set(gt_query_results[col])
                s_cand_query_result = set(cand_query_results[col])
                true_positive += len(s_gt_query_result & s_cand_query_result)
                false_negative += len(s_gt_query_result - s_cand_query_result)
                false_positive += len(s_cand_query_result - s_gt_query_result)

                # print(true_positive, false_negative, false_positive, len(s_gt_query_result), len(s_cand_query_result))
            # assert true_positive + false_positive + false_negative == gt_total_combinations
            

            evaluation_report[EM.true_positive] = true_positive
            evaluation_report[EM.false_positive] = false_positive
            evaluation_report[EM.false_negative] = false_negative

            evaluation_reports.append(evaluation_report)
            # print(evaluation_report)
            
            pbar.update(1)

    with open(f"{cand_response_filename.replace('.json', '_response.json')}", "w", encoding="utf-8") as f:
        json.dump(response_reports, f, ensure_ascii=False, indent=4)

    eval_df = pd.DataFrame(evaluation_reports)
    # print(eval_df)

    eval_df['ExactMatch'] = eval_df.apply(lambda x: x[EM.false_positive] == 0 and x[EM.false_negative] == 0, axis=1).astype(int)
    # eval_df['TruePositive'] = eval_df['TruePositive'].astype(int)
    # eval_df['FalsePositive'] = eval_df['FalsePositive'].astype(int)
    # eval_df['FalseNegative'] = eval_df['FalseNegative'].astype(int)

    final_result = {}

    for col in ["JsonStructureCorrectness", "ExactMatch"]:
        # print(f"{col}: {eval_df[col].mean()}")
        final_result[col] = eval_df[col].mean()
    
    # normalize per query
    eval_df["Total"] = eval_df[EM.true_positive] + eval_df[EM.false_positive] + eval_df[EM.false_negative]
    eval_print = eval_df.drop(columns=["Metadata", "Tags"])
    print(eval_print)
    eval_df[EM.true_positive] = eval_df[EM.true_positive] / eval_df["Total"]
    eval_df[EM.false_positive] = eval_df[EM.false_positive] / eval_df["Total"]
    eval_df[EM.false_negative] = eval_df[EM.false_negative] / eval_df["Total"]

    # # replace nan with 0
    # eval_df.fillna(0, inplace=True)

    # # F1 score except nans.
    truepos_sum, falsepos_sum, falseneg_sum = eval_df[EM.true_positive].sum(), eval_df[EM.false_positive].sum(), eval_df[EM.false_negative].sum()
    precision = truepos_sum / (truepos_sum + falsepos_sum)
    recall = truepos_sum / (truepos_sum + falseneg_sum)
    print(truepos_sum, falsepos_sum, falseneg_sum)
    print(precision, recall)
    f1 = 2 * (precision * recall) / (precision + recall)
    # print(f"F1: {f1}")
    final_result["F1"] = f1
    final_result["Recall"] = recall

    for col in final_result:
        print(f"{col}: {final_result[col]:.2f}")
    
    return eval_df

In [67]:
# name = "r-v7_r256_a512_ours_tr6_0503-checkpoint-63"
# name = "r-v7_r256_a512_ours_tr18_0503-checkpoint-52"
# name = "r-v7_r256_a512_ours_tr30_0503-checkpoint-54"
# name = "r-v7_r256_a512_ours_tr45_0503-checkpoint-95"
# name = "r-v7_r256_a512_ours_tr60_0503-checkpoint-108"

# name = "r-v7_r256_a512_woall_tr6_0503-checkpoint-28"
# name = "r-v7_r256_a512_woall_tr18_0503-checkpoint-70"
# name = "r-v7_r256_a512_woall_tr30_0503-checkpoint-57"
# name = "r-v7_r256_a512_woall_tr45_0503-checkpoint-95"
# name = "r-v7_r256_a512_woall_tr60_0503-checkpoint-90"

names = [
# "r-v7_r256_a512_ours_tr6_0503-checkpoint-63",
# "r-v7_r256_a512_ours_tr18_0503-checkpoint-52",
# "r-v7_r256_a512_ours_tr30_0503-checkpoint-54",
# "r-v7_r256_a512_ours_tr45_0503-checkpoint-95",
# "r-v7_r256_a512_ours_tr60_0503-checkpoint-54",
# "r-v7_r256_a512_woCoT_tr60_0503--checkpoint-84",
# "r-v7_r256_a512_woCoTExp_tr60_0503--checkpoint-102",
# "r-v7_r256_a512_woOp_tr60_0503--checkpoint-90",
# "r-v7_r256_a512_woQM_tr60_0503--checkpoint-54"
# "r-v7_r170_a340_ours_tr56_0613--checkpoint-60",
# "r-v7_r256_a512_ours_tr56_0613--checkpoint-68",
"r-v7_r200_a400_ours_tr27_0613-checkpoint-34"
# "test"
# "v8"
]

for name in names:
    eval_query(
        f"../experiments/{name}.json"
    )

Processing Ïò§Îäò Ïò§ÌõÑ 5ÏãúÏóê ÏòÜÎ∞òÏùò ÏÑ§Ï†ïÏò®ÎèÑÎäî Ïñ¥Îï†Ïñ¥?:   0%|          | 0/11 [00:00<?, ?it/s]             

Processing Ïö∞Î¶¨Î∞òÍ≥º ÏïûÎ∞ò Ï§ë Í∞ÄÏû• ÎçîÏö¥ Î∞©ÏùÄ?:   0%|          | 0/11 [00:00<?, ?it/s]                

['roomtemp'] [87838]
ÏßàÎ¨∏: Ïö∞Î¶¨Î∞òÍ≥º ÏïûÎ∞ò Ï§ë Í∞ÄÏû• ÎçîÏö¥ Î∞©ÏùÄ?; Ìè¨Îß∑: ['ÌòÑÏû¨ Îëê Î∞© Ï§ë {{v_ÌòÑÏû¨_Ïã§ÎÇ¥Ïò®ÎèÑ_ÏµúÍ≥†_Î∞©}}Ïù¥(Í∞Ä) {{v_ÌòÑÏû¨_Ïã§ÎÇ¥Ïò®ÎèÑ_ÏµúÍ≥†}}‚ÑÉÎ°ú Í∞ÄÏû• ÎçîÏö¥ Î∞©Ïù¥ÏóêÏöî.']; Îç∞Ïù¥ÌÑ∞: {'v_ÌòÑÏû¨_Ïã§ÎÇ¥Ïò®ÎèÑ_ÏµúÍ≥†': np.float64(28.5), 'v_ÌòÑÏû¨_Ïã§ÎÇ¥Ïò®ÎèÑ_ÏµúÍ≥†_Î∞©': array(['Ïö∞Î¶¨Î∞ò'], dtype=object)};


Processing Ïö∞Î¶¨Î∞òÍ≥º ÏïûÎ∞ò Ï§ë Í∞ÄÏû• ÎçîÏö¥ Î∞©ÏùÄ?:   9%|‚ñâ         | 1/11 [00:00<00:04,  2.08it/s]

ÌòÑÏû¨ Îëê Î∞© Ï§ë Ïö∞Î¶¨Î∞òÏù¥(Í∞Ä) 28.50¬∞CÎ°ú Í∞ÄÏû• ÎçîÏö¥ Î∞©Ïù¥ÏóêÏöî.
SEx
defaultdict(<class 'list'>, {'roomtemp': [87838]}) defaultdict(<class 'list'>, {'roomtemp': [87838]})
                 Input  JsonStructureCorrectness  QueryTruePositive  \
0  Ïö∞Î¶¨Î∞òÍ≥º ÏïûÎ∞ò Ï§ë Í∞ÄÏû• ÎçîÏö¥ Î∞©ÏùÄ?                      True                  1   

   QueryFalsePositive  QueryFalseNegative  ExactMatch  Total  
0                   0                   0           1      1  
1.0 0.0 0.0
1.0 1.0
JsonStructureCorrectness: 1.00
ExactMatch: 1.00
F1: 1.00
Recall: 1.00



