In [10]:
# autoreload
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
from collections import defaultdict, Counter
import logging

import pandas as pd
from tqdm import tqdm
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

import json
import itertools

from db.manager import DBManager
from pathlib import Path

In [12]:
BASE_DIR = "../"

In [13]:
class EM:
    json_structure = "JsonStructureCorrectness"
    true_positive = "TruePositive"
    false_positive = "FalsePositive"
    false_negative = "FalseNegative"

def read_json(path):
    with open(path, "r", encoding="utf-8") as f:
        result = json.loads(f.read())
    
    # result = [{"Input": d["Input"], "Response": json.dumps(d["Response"], ensure_ascii=False)} for d in result]
    return result

def run_query_and_get_report(input, metadata, scenario, instruction_set):
    input_report = {}
    input_report["Input"] = input
    input_report["Scenario"] = scenario
    input_report["Result"] = []
    for instruction in instruction_set:
        i_type = instruction["type"]
        if i_type == "q":
            # query
            args = instruction["args"]
            result_var_name = instruction["result_name"]
            # print(f"Query: {args}, {result_var_name}")
            if "temporal" in args:
                del args["table_name"]
                args["metadata"] = metadata
                result_df = DBManager.structured_query_data_t(args, get_rowids=True)
            else:
                result_df = DBManager.structured_query(args, get_rowids=True)
            # print(f"Result:\n{result_df}")
            try:
                if "timestamp" in result_df.columns:
                    try:
                        result_df["timestamp"] = result_df["timestamp"].dt.strftime("%Y-%m-%d %H:%M:%S")
                    except Exception as e:
                        print(args)
                        print(result_df["timestamp"])
                result = result_df.to_dict(orient="index")
                cols = list(result_df.columns)
                result = [[row[col] for col in cols] for row in result.values()]
                input_report["Metadata"] = metadata
                input_report["Result"].append({
                    "type": "q",
                    # "args": args,
                    # "result_name": result_var_name,
                    "result_shape": result_df.shape,
                    "result_columns": cols,
                    "result_indices": list(result_df["id"]),
                    # "result": result
                })
            except Exception as e:
                logger.error(f"Error inside: {e}")
                logger.error(f"Invoked with Query: {args}, {result_var_name}")
        
    return input_report

def build_query_groundtruth(dateset_name):
    ds_ts = []
    dt_tr = []
    base_dataset_dir = Path(f"{BASE_DIR}/finetuning/dataset/{dateset_name}")
    
    for directory in base_dataset_dir.iterdir():
        if directory.is_dir() and "scenario" in directory.name:
            ts = read_json(f"{directory}/onlyq_ts.json")
            if "v7" in dateset_name:
                metadata = read_json(f"{directory}/metadata.json")
            for i, d in enumerate(ts):
                ts[i]["Scenario"] = directory.name
                if "v7" in dateset_name:
                    ts[i]["Metadata"] = metadata
            ds_ts.extend(ts)
            

            tr = read_json(f"{directory}/onlyq_tr.json")
            for i, d in enumerate(tr):
                tr[i]["Scenario"] = directory.name
                if "v7" in dateset_name:
                    tr[i]["Metadata"] = metadata
            dt_tr.extend(tr)
    
    ds = ds_ts + dt_tr
    print(len(ds))
    
    if "v7" in dateset_name:
        db_gt_filename = f"{BASE_DIR}/experiments/db_gt_v7.json"
    else:
        db_gt_filename = f"{BASE_DIR}/experiments/db_gt.json"
        metadata = None
    
    with open(db_gt_filename, "w", encoding="utf-8") as f:
        f.write("[")
        with tqdm(total=len(ds)) as pbar:
            for d in ds:
                pbar.set_description(f"Processing {d['Input']}")
                # print("--")
                
                input = d["Input"]
                # print(f"Input: {input}")
                scenario = d["Scenario"]
                
                metadata = d["Metadata"]
                response = d["Response"]
                # instruction_set = response["Instruction Set"]
                instruction_set = response["Instructions"]
                # print(f"Instruction Set: {type(instruction_set)}, {len(instruction_set)}")
                input_report = run_query_and_get_report(input, metadata, scenario, instruction_set)
                
                
                try:
                    # print(input_report)
                    # del input_report["Metadata"]
                    f.write(json.dumps(input_report, ensure_ascii=False) + ",\n")
                except Exception as e:
                    logger.error(f"Error outside: {e}")
                    logger.error(f"Invoked with Input: {input}")
                    logger.error(f"Input Report: {input_report}")
                    # exit()
                    raise e
                
                # print("\n")
                pbar.update(1)
    
        # make it json array format
        # remove last comma
        f.seek(f.tell() - 2, 0)
        f.write("]")



In [14]:
# build_query_groundtruth("v5-250228-multimetadata")

# Eval_query

In [15]:
# def eval_query(db_gt_filename, cand_response_filename):
#     db_gts = read_json(db_gt_filename)
#     cand_responses = read_json(cand_response_filename)

#     evaluation_reports = []

#     with tqdm(total=len(cand_responses)) as pbar:
#         for cand_response in cand_responses:
#             pbar.set_description(f"Processing {cand_response['Input']}")
#             input = cand_response["Input"]
#             scenario = cand_response["Scenario"]

#             if "Metadata" in cand_response:
#                 metadata = cand_response["Metadata"]
#             else:
#                 metadata = None
#             # 관계 없는 질문들은 건너뛰자
#             gt_report = [d for d in db_gts if d["Input"] == input and d["Scenario"] == scenario]
#             assert len(gt_report) <= 1
#             if len(gt_report) == 0:
#                 pbar.update(1)
#                 continue
#             gt_report = gt_report[0]
#             if gt_report["Result"] == []:
#                 pbar.update(1)
#                 continue
            
#             evaluation_report = defaultdict(lambda: None)
#             evaluation_report["Input"] = input
#             evaluation_report["Scenario"] = scenario
            
#             if isinstance(cand_response["Candidate"], dict) and ("Instruction Set" in cand_response["Candidate"] or "지시" in cand_response["Candidate"]):
#                 if "Instruction Set" in cand_response["Candidate"]:
#                     cand_instruction_set = cand_response["Candidate"]["Instruction Set"]
#                 elif "지시" in cand_response["Candidate"]:
#                     cand_instruction_set = cand_response["Candidate"]["지시"]
                    
#                 evaluation_report[EM.json_structure] = True
#             else:
#                 evaluation_report[EM.json_structure] = False
#                 try:
#                     import re
#                     # get data between "Instruction Set": [ and the last]
#                     cand_instruction_set = re.search(r'(?<="Instruction Set": \[)(.*)(?=\])', cand_response["Candidate"], re.DOTALL).group(0)
#                     # find all {"type": ~ }, {"type": ~ }, {"type": ~ }
#                     cand_instruction_set = re.findall(r'({"type".*?})', cand_instruction_set)
#                     # print(list(cand_instruction_set))
#                     cand_instruction_set = [eval(d) for d in cand_instruction_set]
#                 except Exception as e:
#                     evaluation_report[EM.json_structure] = False
#                     print("Failed to parse input: ", input, cand_response["Candidate"])
#                     print(e)
#                     evaluation_reports.append(evaluation_report)
#                     pbar.update(1)
#                     print(evaluation_report)
#                     continue
                    
#             cand_report = run_query_and_get_report(input, metadata, scenario, cand_instruction_set) 
            
#             # print(f"Input: {input}")
            
#             gt_results, cand_results = gt_report["Result"], cand_report["Result"]
#             gt_cols, gt_rows = set(gt_results["result_columns"]), set(gt_results["result_indices"])
#             cand_cols, cand_rows = set(cand_results["result_columns"]), set(cand_results["result_indices"])

#             # True Positive: 공통된 컬럼과 로우의 모든 조합
#             true_positive = len(gt_cols & cand_cols) * len(gt_rows & cand_rows)

#             # Ground Truth의 총 조합에서 TP를 뺀 값이 FN
#             false_negative = (len(gt_cols) * len(gt_rows)) - true_positive

#             # Candidate의 총 조합에서 TP를 뺀 값이 FP
#             false_positive = (len(cand_cols) * len(cand_rows)) - true_positive

#             # # print(len(gt_flatten), len(cand_flatten))
            
            
#             # # check if all gt results are in cand results
#             # true_positive, false_positive, false_negative = 0, 0, 0
#             # for gt_data in gt_flatten:
#             #     try:
#             #         cand_flatten.remove(gt_data)
#             #         true_positive += 1
#             #     except ValueError as e:
#             #         false_negative += 1
            
#             # false_positive = len(cand_flatten)
            
#             evaluation_report[EM.true_positive] = true_positive
#             evaluation_report[EM.false_positive] = false_positive
#             evaluation_report[EM.false_negative] = false_negative
            
#             evaluation_reports.append(evaluation_report)
#             # print(evaluation_report)
            
#             pbar.update(1)

#     eval_df = pd.DataFrame(evaluation_reports)
#     # print(eval_df)

#     eval_df['ExactMatch'] = eval_df.apply(lambda x: x[EM.false_positive] == 0 and x[EM.false_negative] == 0, axis=1).astype(int)
#     # eval_df['TruePositive'] = eval_df['TruePositive'].astype(int)
#     # eval_df['FalsePositive'] = eval_df['FalsePositive'].astype(int)
#     # eval_df['FalseNegative'] = eval_df['FalseNegative'].astype(int)

#     final_result = {}

#     for col in ["JsonStructureCorrectness", "ExactMatch"]:
#         # print(f"{col}: {eval_df[col].mean()}")
#         final_result[col] = eval_df[col].mean()
    
#     # normalize per query
#     eval_df["Total"] = eval_df[EM.true_positive] + eval_df[EM.false_positive] + eval_df[EM.false_negative]
#     eval_df["TruePositive"] = eval_df[EM.true_positive] / eval_df["Total"]
#     eval_df["FalsePositive"] = eval_df[EM.false_positive] / eval_df["Total"]
#     eval_df["FalseNegative"] = eval_df[EM.false_negative] / eval_df["Total"]

#     # # F1 score except nans.
#     truepos_sum, falsepos_sum, falseneg_sum = eval_df[EM.true_positive].sum(), eval_df[EM.false_positive].sum(), eval_df[EM.false_negative].sum()
#     precision = truepos_sum / (truepos_sum + falsepos_sum)
#     recall = truepos_sum / (truepos_sum + falseneg_sum)
#     f1 = 2 * (precision * recall) / (precision + recall)
#     # print(f"F1: {f1}")
#     final_result["F1"] = f1
#     final_result["Recall"] = recall
#     for col in final_result:
#         print(f"{col}: {final_result[col]:.2f}")
    
#     return eval_df

def eval_query(db_gt_filename, cand_response_filename):
    db_gts = read_json(db_gt_filename)
    cand_responses = read_json(cand_response_filename)
    # metadata_ = read_json(f"{BASE_DIR}/finetuning/dataset/v7-250309-reduceinputanddatefunctioncall/scenario1/metadata.json")
    evaluation_reports = []

    with tqdm(total=len(cand_responses)) as pbar:
        for cand_response in cand_responses:
            pbar.set_description(f"Processing {cand_response['Input']}")
            input = cand_response["Input"]
            scenario = cand_response["Scenario"]

            if "Metadata" in cand_response:
                metadata = cand_response["Metadata"]
            else:
                # metadata = metadata_
                metadata = None
            # 관계 없는 질문들은 건너뛰자
            gt_report = [d for d in db_gts if d["Input"] == input and d["Scenario"] == scenario]
            assert len(gt_report) <= 1
            if len(gt_report) == 0:
                pbar.update(1)
                continue
            gt_report = gt_report[0]
            if gt_report["Result"] == []:
                pbar.update(1)
                continue
            
            evaluation_report = defaultdict(lambda: None)
            evaluation_report["Input"] = input
            evaluation_report["Scenario"] = scenario
            
            if isinstance(cand_response["Candidate"], dict) and ("Instruction Set" in cand_response["Candidate"] or "지시" in cand_response["Candidate"] or "Instructions" in cand_response["Candidate"]):
                if "Instruction Set" in cand_response["Candidate"]:
                    cand_instruction_set = cand_response["Candidate"]["Instruction Set"]
                elif "지시" in cand_response["Candidate"]:
                    cand_instruction_set = cand_response["Candidate"]["지시"]
                elif "Instructions" in cand_response["Candidate"]:
                    cand_instruction_set = cand_response["Candidate"]["Instructions"]

                evaluation_report[EM.json_structure] = True
            else:
                evaluation_report[EM.json_structure] = False
                try:
                    import re
                    # get data between "Instruction Set": [ and the last]
                    cand_instruction_set = re.search(r'(?<="Instruction Set": \[)(.*)(?=\])', cand_response["Candidate"], re.DOTALL).group(0)
                    # find all {"type": ~ }, {"type": ~ }, {"type": ~ }
                    cand_instruction_set = re.findall(r'({"type".*?})', cand_instruction_set)
                    # print(list(cand_instruction_set))
                    cand_instruction_set = [eval(d) for d in cand_instruction_set]
                except Exception as e:
                    evaluation_report[EM.json_structure] = False
                    print("Failed to parse input: ", input, cand_response["Candidate"])
                    print(e)
                    evaluation_reports.append(evaluation_report)
                    pbar.update(1)
                    print(evaluation_report)
                    continue
                    
            cand_report = run_query_and_get_report(input, metadata, scenario, cand_instruction_set) 
            
            # print(f"Input: {input}")
            
            gt_results, cand_results = gt_report["Result"], cand_report["Result"]

            if len(cand_results) == 0:
                evaluation_report[EM.json_structure] = False
                # evaluation_report[EM.true_positive] = 0
                # evaluation_report[EM.false_positive] = false_positive
                # evaluation_report[EM.false_negative] = false_negative
                            
                evaluation_reports.append(evaluation_report)
                # print(evaluation_report)
                            
                pbar.update(1)
                continue
            cand_results = cand_results[0]

            gt_rows = []
            for gt_result in gt_results:
                gt_rows.extend(gt_result["result_indices"])
            
            gt_rows = set(gt_rows)
            gt_cols = set(gt_results[0]["result_columns"])
            cand_cols, cand_rows = set(cand_results["result_columns"]), set(cand_results["result_indices"])

            gt_cols.remove("id")
            cand_cols.remove("id")
            gt_cols.remove("idu")
            cand_cols.remove("idu")

            # True Positive: 공통된 컬럼과 로우의 모든 조합
            true_positive = len(gt_cols & cand_cols) * len(gt_rows & cand_rows)

            # Ground Truth의 총 조합에서 TP를 뺀 값이 FN
            false_negative = (len(gt_cols) * len(gt_rows)) - true_positive

            # Candidate의 총 조합에서 TP를 뺀 값이 FP
            false_positive = (len(cand_cols) * len(cand_rows)) - true_positive


            # print(len(gt_flatten), len(cand_flatten))
            
            # gt_counter = Counter(gt_flatten)
            # cand_counter = Counter(cand_flatten)

            # true_positive = sum(min(gt_counter[item], cand_counter.get(item, 0)) for item in gt_counter)
            # false_negative = sum(gt_counter[item] - min(gt_counter[item], cand_counter.get(item, 0)) for item in gt_counter)
            # false_positive = sum(cand_counter[item] - min(cand_counter[item], gt_counter.get(item, 0)) for item in cand_counter)
            
            # # check if all gt results are in cand results
            # true_positive, false_positive, false_negative = 0, 0, 0
            # for gt_data in gt_flatten:
            #     try:
            #         cand_flatten.remove(gt_data)
            #         true_positive += 1
            #     except ValueError as e:
            #         false_negative += 1
            
            # false_positive = len(cand_flatten)
            
            evaluation_report[EM.true_positive] = true_positive
            evaluation_report[EM.false_positive] = false_positive
            evaluation_report[EM.false_negative] = false_negative
            
            evaluation_reports.append(evaluation_report)
            # print(evaluation_report)
            
            pbar.update(1)

    eval_df = pd.DataFrame(evaluation_reports)
    # print(eval_df)

    eval_df['ExactMatch'] = eval_df.apply(lambda x: x[EM.false_positive] == 0 and x[EM.false_negative] == 0, axis=1).astype(int)
    # eval_df['TruePositive'] = eval_df['TruePositive'].astype(int)
    # eval_df['FalsePositive'] = eval_df['FalsePositive'].astype(int)
    # eval_df['FalseNegative'] = eval_df['FalseNegative'].astype(int)

    final_result = {}

    for col in ["JsonStructureCorrectness", "ExactMatch"]:
        # print(f"{col}: {eval_df[col].mean()}")
        final_result[col] = eval_df[col].mean()
    
    # normalize per query
    eval_df["Total"] = eval_df[EM.true_positive] + eval_df[EM.false_positive] + eval_df[EM.false_negative]
    eval_df["TruePositive"] = eval_df[EM.true_positive] / eval_df["Total"]
    eval_df["FalsePositive"] = eval_df[EM.false_positive] / eval_df["Total"]
    eval_df["FalseNegative"] = eval_df[EM.false_negative] / eval_df["Total"]

    # # F1 score except nans.
    truepos_sum, falsepos_sum, falseneg_sum = eval_df[EM.true_positive].sum(), eval_df[EM.false_positive].sum(), eval_df[EM.false_negative].sum()
    precision = truepos_sum / (truepos_sum + falsepos_sum)
    recall = truepos_sum / (truepos_sum + falseneg_sum)
    f1 = 2 * (precision * recall) / (precision + recall)
    # print(f"F1: {f1}")
    final_result["F1"] = f1
    final_result["Recall"] = recall
    for col in final_result:
        print(f"{col}: {final_result[col]:.2f}")
    
    return eval_df


# WoAll

In [None]:
db_gt_filename = f"{BASE_DIR}/experiments/db_gt.json"
cand_response_filename = "response-sh2orc-Llama-3.1-Korean-8B-Instruct-r128_a256_woall-checkpoint-60"
cand_response_filename = "response-sh2orc-Llama-3.1-Korean-8B-Instruct-v5_r64_a128_woall-checkpoint-72"
# cand_response_filename = "r-v5_r32_a64_woall-checkpoint-70-batch"
# cand_response_filename = "r-v6_r64_a128_woall_shorten-checkpoint-53"
cand_response_filename = f"{BASE_DIR}/experiments/{cand_response_filename}.json"

eval_df = eval_query(db_gt_filename, cand_response_filename)

# print(eval_df)

In [None]:
eval_df[eval_df["ExactMatch"] == 0][["Input", "Scenario", "FalsePositive", "FalseNegative"]]

# FI

In [None]:
db_gt_filename = f"{BASE_DIR}/experiments/db_gt.json"
cand_response_filename = "r-v5_r256_a512_FI-checkpoint-43-batch"
cand_response_filename = f"{BASE_DIR}/experiments/{cand_response_filename}.json"

eval_df = eval_query(db_gt_filename, cand_response_filename)

print(eval_df)

In [None]:
eval_df

In [None]:
db_gt_filename = f"{BASE_DIR}/experiments/db_gt.json"
cand_response_filename = "response-sh2orc-Llama-3.1-Korean-8B-Instruct-r256_a512_ISP-checkpoint-104"
cand_response_filename = f"{BASE_DIR}/experiments/{cand_response_filename}.json"

eval_df = eval_query(db_gt_filename, cand_response_filename)


In [None]:
(eval_df)

# Ours

In [None]:
db_gt_filename = f"{BASE_DIR}/experiments/db_gt.json"
cand_response_filename = "response-sh2orc-Llama-3.1-Korean-8B-Instruct-v5_r256_a512_ours-checkpoint-20"
# cand_response_filename = "r-v5_r128_a256_ours-checkpoint-52-batch"
# cand_response_filename = "r-v5_r128_a256_ours_noexample-checkpoint-50-batch"
# cand_response_filename = "r-v6_r128_a256_ours-checkpoint-52"
# cand_response_filename = "r-v6_r256_a512_ours-checkpoint-40"
# cand_response_filename = "r-v6_r256_a512_ours_shorten-checkpoint-30"
cand_response_filename = f"{BASE_DIR}/experiments/{cand_response_filename}.json"

eval_df = eval_query(db_gt_filename, cand_response_filename)


In [None]:
eval_df[eval_df["ExactMatch"] == 0][["Input", "Scenario", "FalsePositive", "FalseNegative"]]

In [16]:
build_query_groundtruth("v7-250309-reduceinputanddatefunctioncall")

61


Processing Why is our classroom so cold:   0%|          | 0/61 [00:00<?, ?it/s]


NameError: name 'run_query_and_get_report' is not defined

In [40]:
def eval_query_gtgt(db_gt_filename, cand_response_filename):
    db_gts = read_json(db_gt_filename)
    cand_responses = read_json(cand_response_filename)
    # metadata_ = read_json(f"{BASE_DIR}/finetuning/dataset/v7-250309-reduceinputanddatefunctioncall/scenario1/metadata.json")
    evaluation_reports = []

    with tqdm(total=len(cand_responses)) as pbar:
        for cand_report in cand_responses:
            pbar.set_description(f"Processing {cand_report['Input']}")
            input = cand_report["Input"]
            scenario = cand_report["Scenario"]


            # 관계 없는 질문들은 건너뛰자
            gt_report = [d for d in db_gts if d["Input"] == input and d["Scenario"] == scenario]
            
            
            assert len(gt_report) <= 1
            if len(gt_report) == 0:
                pbar.update(1)
                continue
            gt_report = gt_report[0]
            if gt_report["Result"] == []:
                pbar.update(1)
                continue
            
            
            # print(f"Input: {input}")
            
            gt_results, cand_results = gt_report["Result"], cand_report["Result"]
            cand_results = cand_results[0]

            gt_rows = []
            for gt_result in gt_results:
                gt_rows.extend(gt_result["result_indices"])

            gt_rows = set(gt_rows)
            gt_cols = set(gt_results[0]["result_columns"])
            cand_cols, cand_rows = set(cand_results["result_columns"]), set(cand_results["result_indices"])

            gt_cols.remove("id")
            cand_cols.remove("id")
            # gt_cols.remove("idu")
            cand_cols.remove("idu")

            # True Positive: 공통된 컬럼과 로우의 모든 조합
            true_positive = len(gt_cols & cand_cols) * len(gt_rows & cand_rows)

            # Ground Truth의 총 조합에서 TP를 뺀 값이 FN
            false_negative = (len(gt_cols) * len(gt_rows)) - true_positive

            # Candidate의 총 조합에서 TP를 뺀 값이 FP
            false_positive = (len(cand_cols) * len(cand_rows)) - true_positive
            
            evaluation_report = defaultdict(lambda: None)
            evaluation_report[EM.true_positive] = true_positive
            evaluation_report[EM.false_positive] = false_positive
            evaluation_report[EM.false_negative] = false_negative
            evaluation_report["Input"] = input
            evaluation_reports.append(evaluation_report)
            # print(evaluation_report)
            
            pbar.update(1)

    eval_df = pd.DataFrame(evaluation_reports)
    # print(eval_df)

    eval_df['ExactMatch'] = eval_df.apply(lambda x: x[EM.false_positive] == 0 and x[EM.false_negative] == 0, axis=1).astype(int)
    # eval_df['TruePositive'] = eval_df['TruePositive'].astype(int)
    # eval_df['FalsePositive'] = eval_df['FalsePositive'].astype(int)
    # eval_df['FalseNegative'] = eval_df['FalseNegative'].astype(int)

    final_result = {}

    for col in ["ExactMatch"]:
        # print(f"{col}: {eval_df[col].mean()}")
        final_result[col] = eval_df[col].mean()
    
    # normalize per query
    eval_df["Total"] = eval_df[EM.true_positive] + eval_df[EM.false_positive] + eval_df[EM.false_negative]
    eval_df["TruePositive"] = eval_df[EM.true_positive] / eval_df["Total"]
    eval_df["FalsePositive"] = eval_df[EM.false_positive] / eval_df["Total"]
    eval_df["FalseNegative"] = eval_df[EM.false_negative] / eval_df["Total"]

    # # F1 score except nans.
    truepos_sum, falsepos_sum, falseneg_sum = eval_df[EM.true_positive].sum(), eval_df[EM.false_positive].sum(), eval_df[EM.false_negative].sum()
    precision = truepos_sum / (truepos_sum + falsepos_sum)
    recall = truepos_sum / (truepos_sum + falseneg_sum)
    f1 = 2 * (precision * recall) / (precision + recall)
    # print(f"F1: {f1}")
    final_result["F1"] = f1
    final_result["Recall"] = recall
    for col in final_result:
        print(f"{col}: {final_result[col]:.2f}")
    
    return eval_df

db_gt_filename = f"{BASE_DIR}/experiments/db_gt.json"
cand_response_filename = f"{BASE_DIR}/experiments/db_gt_v7.json"

eval_df = eval_query_gtgt(db_gt_filename, cand_response_filename)
print(eval_df)

JSONDecodeError: Expecting value: line 1 column 2 (char 1)

In [26]:
eval_df[eval_df["ExactMatch"] == 0][["Input", "TruePositive", "FalsePositive", "FalseNegative"]]

Unnamed: 0,Input,TruePositive,FalsePositive,FalseNegative
0,Why is our classroom so cold,0.125088,0.0,0.874912
1,오늘 아침과 저녁의 온도차이는 얼마나 돼?,0.504202,0.0,0.495798
2,지금 옆반 온도랑 우리반 온도 알려줘,0.166667,0.833333,0.0
4,현재 설정온도랑 실내온도 차이 알려줘.,0.166667,0.833333,0.0
6,앞반 전원 켜져있어?,0.166667,0.833333,0.0
8,최근에 설정온도가 가장 높았던 날 알려줘,0.875353,0.124647,0.0
9,지난 3일 동안 우리반 실내 온도 평균 값 알려줘.,0.749428,0.250572,0.0
14,올여름 제일 더웠던 날 알려줘,0.5,0.5,0.0
15,올해 봄 옆반 제일 추웠던 날 알려줘,0.5,0.5,0.0
20,우리반의 가장 최근 설정 온도 알려줘,0.000604,0.0,0.999396


In [42]:
db_gt_filename = f"{BASE_DIR}/experiments/db_gt_v7.json"
cand_response_filename = f"{BASE_DIR}/experiments/r-v7_r8_a16_ours-checkpoint-40.json"

eval_df = eval_query(db_gt_filename, cand_response_filename)
print(eval_df)

Processing Why is our classroom so cold:   0%|          | 0/12 [00:00<?, ?it/s]

timestamp >= TIMESTAMP '2022-09-30 12:00:00' - INTERVAL '3 hours' AND timestamp <= TIMESTAMP '2022-09-30 12:00:00'


Processing 오늘 아침과 저녁의 온도차이는 얼마나 돼?:   8%|▊         | 1/12 [00:00<00:00, 106.24it/s]

timestamp >= DATE_TRUNC('day', DATE '2022-09-30') AND timestamp < DATE_TRUNC('day', DATE '2022-09-30' + INTERVAL '1 day')


Processing 지금 옆반 온도랑 우리반 온도 알려줘:  17%|█▋        | 2/12 [00:00<00:00, 53.45it/s]      

timestamp >= TIMESTAMP '2022-09-30 12:00:00' - INTERVAL '5 minutes' AND timestamp <= TIMESTAMP '2022-09-30 12:00:00'


Processing 이번주 우리반과 옆반 온도 변화 추이 비교해줘:  25%|██▌       | 3/12 [00:00<00:00, 68.85it/s]

timestamp >= DATE_TRUNC('week', DATE '2022-09-30') AND timestamp < DATE_TRUNC('week', DATE '2022-09-30' + INTERVAL '1 week')


Processing 현재 설정온도랑 실내온도 차이 알려줘.:  33%|███▎      | 4/12 [00:00<00:00, 20.84it/s]       

timestamp >= TIMESTAMP '2022-09-30 12:00:00' - INTERVAL '5 minutes' AND timestamp <= TIMESTAMP '2022-09-30 12:00:00'


Processing 지난주에 설정온도와 실내온도 차이가 가장 많이 났던 날은?:  42%|████▏     | 5/12 [00:00<00:00, 20.84it/s]

timestamp >= DATE_TRUNC('week', DATE '2022-09-30' - INTERVAL '1 week') AND timestamp < DATE_TRUNC('week', DATE '2022-09-30')


Processing 앞반 전원 켜져있어?:  50%|█████     | 6/12 [00:00<00:00, 20.84it/s]                                     

timestamp >= TIMESTAMP '2022-09-30 12:00:00' - INTERVAL '5 minutes' AND timestamp <= TIMESTAMP '2022-09-30 12:00:00'


Processing 어제 전원 껐어?:  58%|█████▊    | 7/12 [00:00<00:00, 20.84it/s]    

timestamp >= DATE_TRUNC('day', DATE '2022-09-30' - INTERVAL '1 day') AND timestamp < DATE_TRUNC('day', DATE '2022-09-30')


Processing 최근에 설정온도가 가장 높았던 날 알려줘:  67%|██████▋   | 8/12 [00:00<00:00, 20.84it/s]

timestamp >= DATE_TRUNC('month', DATE '2022-09-30' - INTERVAL '1 month') AND timestamp < DATE_TRUNC('month', DATE '2022-09-30')


Processing 지난 3일 동안 우리반 실내 온도 평균 값 알려줘.:  92%|█████████▏| 11/12 [00:00<00:00, 14.50it/s]

timestamp >= DATE_TRUNC('day', DATE '2022-09-30' - INTERVAL '3 day') AND timestamp < DATE_TRUNC('day', DATE '2022-09-30')


Processing 지난 3일 동안 우리반 실내 온도 평균 값 알려줘.: 100%|██████████| 12/12 [00:00<00:00, 19.43it/s]

JsonStructureCorrectness: 1.00
ExactMatch: 0.40
F1: 0.80
Recall: 0.94
                              Input   Scenario  JsonStructureCorrectness  \
0      Why is our classroom so cold  scenario1                      True   
1           오늘 아침과 저녁의 온도차이는 얼마나 돼?  scenario1                      True   
2              지금 옆반 온도랑 우리반 온도 알려줘  scenario1                      True   
3         이번주 우리반과 옆반 온도 변화 추이 비교해줘  scenario1                      True   
4             현재 설정온도랑 실내온도 차이 알려줘.  scenario1                      True   
5  지난주에 설정온도와 실내온도 차이가 가장 많이 났던 날은?  scenario1                      True   
6                       앞반 전원 켜져있어?  scenario1                      True   
7                         어제 전원 껐어?  scenario1                      True   
8            최근에 설정온도가 가장 높았던 날 알려줘  scenario1                      True   
9      지난 3일 동안 우리반 실내 온도 평균 값 알려줘.  scenario1                      True   

   TruePositive  FalsePositive  FalseNegative  ExactMatch   Total  
0      1.000000       0.0




In [44]:
eval_df[eval_df["ExactMatch"] == 0][["Input", "TruePositive", "FalsePositive", "FalseNegative"]]

Unnamed: 0,Input,TruePositive,FalsePositive,FalseNegative
1,오늘 아침과 저녁의 온도차이는 얼마나 돼?,0.249473,0.750527,0.0
2,지금 옆반 온도랑 우리반 온도 알려줘,0.5,0.5,0.0
4,현재 설정온도랑 실내온도 차이 알려줘.,0.666667,0.333333,0.0
7,어제 전원 껐어?,0.5,0.5,0.0
8,최근에 설정온도가 가장 높았던 날 알려줘,0.0,0.795323,0.204677
9,지난 3일 동안 우리반 실내 온도 평균 값 알려줘.,0.749428,0.0,0.250572
