In [1]:
# autoreload
%load_ext autoreload
%autoreload 2

In [2]:
from collections import defaultdict, Counter
import logging

import pandas as pd
import numpy as np
from tqdm import tqdm
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

import json
import itertools

from db.manager import DBManager
from operation.execute import OperationExecutor
from pathlib import Path
import warnings
import datetime
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

INFO:db.instance:Connected to the database PerSite_DB


In [3]:
BASE_DIR = "../"

In [14]:
class EM:
    json_structure = "JsonStructureCorrectness"
    true_positive = "TruePositive"
    false_positive = "FalsePositive"
    false_negative = "FalseNegative"
    semantic_true_positive = "SemanticTruePositive"
    semantic_false_positive = "SemanticFalsePositive"
    semantic_false_negative = "SemanticFalseNegative"


def read_json(path):
    with open(path, "r", encoding="utf-8") as f:
        result = json.loads(f.read())
    
    # result = [{"Input": d["Input"], "Response": json.dumps(d["Response"], ensure_ascii=False)} for d in result]
    return result

def run_query_and_get_report(input, tags, metadata, scenario, instruction_set):
    input_report = {}
    input_report["Input"] = input
    input_report["Tags"] = tags
    input_report["Scenario"] = scenario
    input_report["Result"] = []
    variables = {
        "Metadata": metadata
    }
    print(input)
    for instruction in instruction_set:
        i_type = instruction["type"]
        if i_type == "q":
            # query
            args = instruction["args"]
            result_var_name = instruction["result_name"]
            # print(f"Query: {args}, {result_var_name}")
            if "temporal" in args:
                del args["table_name"]
                args["metadata"] = metadata
                result_df = DBManager.structured_query_data_t(args, get_rowids=True)
            else:
                result_df = DBManager.structured_query(args, get_rowids=True)
            # print(f"Result:\n{result_df}")
            try:
                if "timestamp" in result_df.columns:
                    try:
                        timestamp = result_df["timestamp"].dt.strftime("%Y-%m-%d %H:%M:%S")
                    except Exception as e:
                        print(args)
                        print(result_df["timestamp"])
                result = result_df.to_dict(orient="index")
                cols = list(result_df.columns)
                result = [[row[col] for col in cols] for row in result.values()]
                input_report["Metadata"] = metadata
                input_report["Result"].append({
                    "type": "q",
                    "args": args,
                    # "result_name": result_var_name,
                    "result_shape": result_df.shape,
                    "result_columns": cols,
                    "result_indices": list(result_df["id"]),
                    # "result": result
                })

                # drop rows where any value is -1
                result_df = result_df[~result_df.isin([-1]).any(axis=1)]
                variables[result_var_name] = result_df


            except Exception as e:
                logger.error(f"Error inside: {e}")
                logger.error(f"Invoked with Query: {args}, {result_var_name}")
        elif i_type == "o":
            script, returns = instruction["script"], instruction["returns"]
            scripts = script.split(";")
            scripts = [script.strip() for script in scripts]
            scripts = [script for script in scripts if script != ""]
            try:
                variables.update(
                    OperationExecutor.run_script(variables, scripts, returns)
                )
                
            except Exception as e:
                logger.error(f"Error inside: {e}")
                logger.error(f"Invoked with Script: {script}, Returns: {returns}")
                input_report["Result"].append({
                    "type": "o",
                    "script": script,
                    "returns": {k: None for k in returns}
                })
                
                continue
            variables_str = {}
            k_to_track = []
            k_to_track = ["total_time_insec"]
            for k, v in variables.items():
                if k in k_to_track:
                    print(1, k, v, type(v))
                # print(k, type(v))
                type_ = None
                while True:
                    if type(v) in [pd.DataFrame]:
                        # sort by timestamp
                        
                        if "timestamp" in v.columns:
                            v = v.sort_values(by="timestamp")
                        
                        v['timestamp'] = v['timestamp'].map(lambda x: x.strftime("%Y-%m-%d %H:%M:%S"))
                        v = v.to_dict(orient="index")
                        type_ = "pd"
                        break
                    
                    # pd.Index
                    elif type(v) in [pd.Index, np.ndarray, pd.Series]:
                        if len(v) == 0:
                            v = v.tolist()
                            type_ = "primitive"
                            continue
                        # if type(v[0]) in [pd.Timestamp, datetime.date, datetime.datetime, np.datetime64]:
                        #     # v = [x.strftime("%Y-%m-%d %H:%M:%S") for x in v]
                        # elif type(v[0]) in [np.int64, np.float64, np.bool]:
                        #     v = [x.item() for x in v]
                        # break

                        # if type(v) == np.ndarray:
                        #     v = pd.Series(v)
                        if type(v) in [pd.Series]:
                            v.reset_index(drop=True, inplace=True)
                        
                        if k in k_to_track:
                            print(2, k, v[0], type(v[0]))
                            # print(2, k, v)

                        v = pd.unique(v)
                        v = pd.Series(v)
                        if type(v[0]) in [pd.Timestamp, datetime.date, datetime.datetime, np.datetime64]:
                            v = v.map(lambda x: x.strftime("%Y-%m-%d %H:%M:%S"))
                        # v = v.to_dict()
                        v = v.tolist()
                        # remove -1 in the list
                        v = [x for x in v if x not in [-1, np.nan]]
                        if len(v) > 5:
                            v = v[:5]
                        # v = v.to_dict()
                        type_ = "primitive"
                        break
                    
                    elif type(v) in [np.int64, np.float64, np.bool]:
                        v = v.item()
                    elif type(v) in [np.datetime64]:
                        v = pd.Timestamp(v)
                        if k in k_to_track:
                            print(3, k, v, type(v))
                    elif type(v) in [pd.Timestamp, datetime.date, datetime.datetime]:
                        v = v.strftime("%Y-%m-%d %H:%M:%S")
                    elif type(v) in [int, float, bool, str, list, dict]:
                        if type(v) in [int, float]:
                            if v in [-1, np.nan]:
                                v = None
                        elif type(v) in [list, dict]:
                            if len(v) == 0:
                                v = None
                            
                        type_ = "primitive"
                        break
                    else:
                        print(f"Type not handled: {k}: {type(v), v}")
                        type_ = "unknown"
                        break
                variables_str[k] = (type_, v)
                # print(k, v)
            input_report["Result"].append({
                "type": "o",
                "script": script,
                "returns": {k: variables_str[k] for k in returns}
            })

            for k in variables_str:
                if k in k_to_track:
                    print(k, variables_str[k])
                    
        elif i_type == "r":
            force = False

            type_os = [r for r in input_report["Result"] if r["type"] == "o"]
            returns = [r["returns"] for r in type_os]
            variables = {}

            for r in returns:
                variables.update(r)

            values = variables.values()
            values_has_no_value = any([v[1] is None for v in values])
            if len(variables) == 0 or values_has_no_value:
                force = True
                if values_has_no_value:
                    instruction["expectations"] = ["관련 데이터를 찾을 수 없습니다."]
            
            input_report["Result"].append({
                "type": "r",
                "expectations": instruction["expectations"],
                "force": force
            })
        elif i_type == "g":
            input_report["Result"].append({
                "type": "g",
                "args": instruction["args"]
            })
    return input_report

def build_query_groundtruth(dateset_name):
    def read(path):
        data = read_json(path)
        for i, d in enumerate(data):
            data[i]["Scenario"] = directory.name
            if "v7" in dateset_name:
                data[i]["Metadata"] = metadata
        return data

    ds_ts = []
    dt_tr = []
    base_dataset_dir = Path(f"{BASE_DIR}/finetuning/dataset/{dateset_name}")
    
    for directory in base_dataset_dir.iterdir():
        if directory.is_dir() and "scenario" in directory.name:
            if "v7" in dateset_name:
                metadata = read_json(f"{directory}/metadata.json")
            
            ds_ts.extend(read(f"{directory}/onlyq_ts.json"))
            dt_tr.extend(read(f"{directory}/onlyq_tr.json"))
    
    ds = ds_ts + dt_tr
    
    if "v7" in dateset_name:
        db_gt_filename = f"{BASE_DIR}/experiments/db_gt_v7.json"
    else:
        db_gt_filename = f"{BASE_DIR}/experiments/db_gt.json"
        metadata = None
    
    with open(db_gt_filename, "w", encoding="utf-8") as f:
        f.write("[")
        with tqdm(total=len(ds)) as pbar:
            for d in ds:
                pbar.set_description(f"Processing {d['Input']}")
                # print("--")
                
                input = d["Input"]
                # if not "time" in input:
                #     continue
                # print(f"Input: {input}")
                scenario = d["Scenario"]
                tags = d["Tags"]
                
                metadata = d["Metadata"]
                response = d["Response"]
                # instruction_set = response["Instruction Set"]
                instruction_set = response["Instructions"]
                # print(f"Instruction Set: {type(instruction_set)}, {len(instruction_set)}")
                instruction_set.append({
                    "type": "r",
                    "expectations": response["Expectations"]
                })
                input_report = run_query_and_get_report(input, tags, metadata, scenario, instruction_set)
                
                
                try:
                    # print(input_report)
                    # del input_report["Metadata"]
                    f.write(json.dumps(input_report, ensure_ascii=False) + ",\n")
                except Exception as e:
                    logger.error(f"Error outside: {e}")
                    logger.error(f"Invoked with Input: {input}")
                    logger.error(f"Input Report: {input_report}")
                    # exit()
                    raise e
                
                # print("\n")
                pbar.update(1)
    
        # make it json array format
        # remove last comma
        f.seek(f.tell() - 2, 0)
        f.write("]")

def eval_query(db_gt_filename, cand_response_filename):
    db_gts = read_json(db_gt_filename)
    cand_responses = read_json(cand_response_filename)
    # metadata_ = read_json(f"{BASE_DIR}/finetuning/dataset/v7-250309-reduceinputanddatefunctioncall/scenario1/metadata.json")
    evaluation_reports = []

    with tqdm(total=len(cand_responses)) as pbar:
        for cand_response in cand_responses:
            pbar.set_description(f"Processing {cand_response['Input']}")
            input = cand_response["Input"]
            scenario = cand_response["Scenario"]

            # if "오늘 아침과 저녁" not in input:
            #     continue

            if "Metadata" in cand_response:
                metadata = cand_response["Metadata"]
            else:
                # metadata = metadata_
                metadata = None
            # 관계 없는 질문들은 건너뛰자
            gt_report = [d for d in db_gts if d["Input"] == input and d["Scenario"] == scenario]
            assert len(gt_report) <= 1
            if len(gt_report) == 0:
                pbar.update(1)
                continue

            gt_report = gt_report[0]
            assert gt_report["Result"] != []
            # if gt_report["Result"] == []:
            #     pbar.update(1)
            #     continue
            
            gt_results = [d for d in gt_report["Result"] if d["type"] == "q"]
            if len(gt_results) != 0:
                gt_args = gt_results[0]["args"]

                # Assume all cols and spatials are same across all queries
                gt_semantic_cols = gt_args["columns"]
                gt_semantic_spatials = gt_args["spatials"]
                gt_semantics = gt_semantic_cols + gt_semantic_spatials
                
                gt_rows = []
                for gt_result in gt_results:
                    gt_rows.extend(gt_result["result_indices"])
                gt_rows = set(gt_rows)
                gt_cols = set(gt_results[0]["result_columns"])
                gt_cols.remove("id")
                gt_cols.remove("idu")
                gt_total_combinations = len(gt_cols) * len(gt_rows)
            else:
                gt_total_combinations = 0
                gt_semantics = []

            # ---
            
            evaluation_report = defaultdict(lambda: None)
            evaluation_report["Input"] = input
            evaluation_report["Scenario"] = scenario
            
            if isinstance(cand_response["Candidate"], dict) and ("Instruction Set" in cand_response["Candidate"] or "지시" in cand_response["Candidate"] or "Instructions" in cand_response["Candidate"]):
                if "Instruction Set" in cand_response["Candidate"]:
                    cand_instruction_set = cand_response["Candidate"]["Instruction Set"]
                elif "지시" in cand_response["Candidate"]:
                    cand_instruction_set = cand_response["Candidate"]["지시"]
                elif "Instructions" in cand_response["Candidate"]:
                    cand_instruction_set = cand_response["Candidate"]["Instructions"]

                evaluation_report[EM.json_structure] = True
            else:
                evaluation_report[EM.json_structure] = False
                try:
                    import re
                    # get data between "Instruction Set": [ and the last]
                    cand_instruction_set = re.search(r'(?<="Instructions": \[)(.*)(?=\])', cand_response["Candidate"], re.DOTALL).group(0)
                    # find all {"type": ~ }, {"type": ~ }, {"type": ~ }
                    cand_instruction_set = re.findall(r'({"type".*?})', cand_instruction_set)
                    # print(list(cand_instruction_set))
                    cand_instruction_set = [eval(d) for d in cand_instruction_set]
                except Exception as e:
                    evaluation_report[EM.json_structure] = False
                    evaluation_report[EM.true_positive] = 0
                    evaluation_report[EM.false_positive] = 0
                    evaluation_report[EM.false_negative] = gt_total_combinations
                    evaluation_report[EM.semantic_true_positive] = 0
                    evaluation_report[EM.semantic_false_positive] = 0
                    evaluation_report[EM.semantic_false_negative] = len(gt_semantics)

                    print("Failed to parse input: ", input, cand_response["Candidate"])
                    print(e)
                    evaluation_reports.append(evaluation_report)
                    pbar.update(1)
                    print(evaluation_report)
                    continue
            

            cand_report = run_query_and_get_report(input, None, metadata, scenario, cand_instruction_set) 
            
            cand_results = cand_report["Result"]
            cand_results = [d for d in cand_results if d["type"] == "q"]

            if len(cand_results) == 0:
                evaluation_report[EM.true_positive] = 0
                evaluation_report[EM.false_positive] = 0
                evaluation_report[EM.false_negative] = gt_total_combinations
                evaluation_report[EM.semantic_true_positive] = 0
                evaluation_report[EM.semantic_false_positive] = 0
                evaluation_report[EM.semantic_false_negative] = len(gt_semantics)
                            
                evaluation_reports.append(evaluation_report)
                # print(evaluation_report)
                            
                pbar.update(1)
                continue

            # assert len(cand_results) == 1

            cand_args = [d["args"] for d in cand_results if d["type"] == "q"]
            cand_semantic_cols = cand_args[0]["columns"]
            cand_semantic_spatials = cand_args[0]["spatials"]
            cand_semantics = cand_semantic_cols + cand_semantic_spatials

            cand_rows = []
            for cand_result in cand_results:
                cand_rows.extend(cand_result["result_indices"])

            
            cand_rows = set(cand_rows)
            cand_cols = set(cand_results[0]["result_columns"])
            cand_cols.remove("id")
            try:
                cand_cols.remove("idu")
            except Exception as e:
                pass

            if len(gt_results) == 0:
                evaluation_report[EM.true_positive] = 0
                evaluation_report[EM.false_positive] = len(cand_cols) * len(cand_rows)
                evaluation_report[EM.false_negative] = 0

                evaluation_report[EM.semantic_true_positive] = 0
                evaluation_report[EM.semantic_false_positive] = len(cand_semantics)
                evaluation_report[EM.semantic_false_negative] = 0

                pbar.update(1)
                continue
            
            # True Positive: 공통된 컬럼과 로우의 모든 조합
            true_positive = len(gt_cols & cand_cols) * len(gt_rows & cand_rows)

            # Ground Truth의 총 조합에서 TP를 뺀 값이 FN
            false_negative = (len(gt_cols) * len(gt_rows)) - true_positive

            # Candidate의 총 조합에서 TP를 뺀 값이 FP
            false_positive = (len(cand_cols) * len(cand_rows)) - true_positive


            # print(len(gt_flatten), len(cand_flatten))
            
            # gt_counter = Counter(gt_flatten)
            # cand_counter = Counter(cand_flatten)

            # true_positive = sum(min(gt_counter[item], cand_counter.get(item, 0)) for item in gt_counter)
            # false_negative = sum(gt_counter[item] - min(gt_counter[item], cand_counter.get(item, 0)) for item in gt_counter)
            # false_positive = sum(cand_counter[item] - min(cand_counter[item], gt_counter.get(item, 0)) for item in cand_counter)
            
            # # check if all gt results are in cand results
            # true_positive, false_positive, false_negative = 0, 0, 0
            # for gt_data in gt_flatten:
            #     try:
            #         cand_flatten.remove(gt_data)
            #         true_positive += 1
            #     except ValueError as e:
            #         false_negative += 1
            
            # false_positive = len(cand_flatten)
            
            gt_semantics, cand_semantics = set(gt_semantics), set(cand_semantics)

            evaluation_report[EM.true_positive] = true_positive
            evaluation_report[EM.false_positive] = false_positive
            evaluation_report[EM.false_negative] = false_negative
            evaluation_report[EM.semantic_true_positive] = len(gt_semantics & cand_semantics)
            evaluation_report[EM.semantic_false_positive] = len(cand_semantics - gt_semantics)
            evaluation_report[EM.semantic_false_negative] = len(gt_semantics - cand_semantics)

            evaluation_reports.append(evaluation_report)
            # print(evaluation_report)
            
            pbar.update(1)

    eval_df = pd.DataFrame(evaluation_reports)
    # print(eval_df)

    eval_df['ExactMatch'] = eval_df.apply(lambda x: x[EM.false_positive] == 0 and x[EM.false_negative] == 0, axis=1).astype(int)
    # eval_df['TruePositive'] = eval_df['TruePositive'].astype(int)
    # eval_df['FalsePositive'] = eval_df['FalsePositive'].astype(int)
    # eval_df['FalseNegative'] = eval_df['FalseNegative'].astype(int)

    final_result = {}

    for col in ["JsonStructureCorrectness", "ExactMatch"]:
        # print(f"{col}: {eval_df[col].mean()}")
        final_result[col] = eval_df[col].mean()
    
    # normalize per query
    eval_df["Total"] = eval_df[EM.true_positive] + eval_df[EM.false_positive] + eval_df[EM.false_negative]
    eval_df["TruePositive"] = eval_df[EM.true_positive] / eval_df["Total"]
    eval_df["FalsePositive"] = eval_df[EM.false_positive] / eval_df["Total"]
    eval_df["FalseNegative"] = eval_df[EM.false_negative] / eval_df["Total"]

    # # replace nan with 0
    # eval_df.fillna(0, inplace=True)

    # # F1 score except nans.
    truepos_sum, falsepos_sum, falseneg_sum = eval_df[EM.true_positive].sum(), eval_df[EM.false_positive].sum(), eval_df[EM.false_negative].sum()
    precision = truepos_sum / (truepos_sum + falsepos_sum)
    recall = truepos_sum / (truepos_sum + falseneg_sum)
    f1 = 2 * (precision * recall) / (precision + recall)
    # print(f"F1: {f1}")
    final_result["F1"] = f1
    final_result["Recall"] = recall

    eval_df["Semantic_ExactMatch"] = eval_df.apply(lambda x: x[EM.semantic_false_positive] == 0 and x[EM.semantic_false_negative] == 0, axis=1).astype(int)
    final_result["Semantic_ExactMatch"] = eval_df["Semantic_ExactMatch"].mean()

    eval_df["Semantic_Total"] = eval_df[EM.semantic_true_positive] + eval_df[EM.semantic_false_positive] + eval_df[EM.semantic_false_negative]
    eval_df["Semantic_TruePositive"] = eval_df[EM.semantic_true_positive] / eval_df["Semantic_Total"]
    eval_df["Semantic_FalsePositive"] = eval_df[EM.semantic_false_positive] / eval_df["Semantic_Total"]
    eval_df["Semantic_FalseNegative"] = eval_df[EM.semantic_false_negative] / eval_df["Semantic_Total"]

    truepos_sum, falsepos_sum, falseneg_sum = eval_df[EM.semantic_true_positive].sum(), eval_df[EM.semantic_false_positive].sum(), eval_df[EM.semantic_false_negative].sum()
    precision = truepos_sum / (truepos_sum + falsepos_sum)
    recall = truepos_sum / (truepos_sum + falseneg_sum)
    f1 = 2 * (precision * recall) / (precision + recall)

    final_result["Semantic_F1"] = f1
    final_result["Semantic_Recall"] = recall

    for col in final_result:
        print(f"{col}: {final_result[col]:.2f}")
    
    return eval_df



In [21]:
# build_query_groundtruth("v5-250228-multimetadata")

# WoAll

In [51]:
db_gt_filename = f"{BASE_DIR}/experiments/db_gt_v7.json"
cand_response_filename = "response-sh2orc-Llama-3.1-Korean-8B-Instruct-r128_a256_woall-checkpoint-60"
cand_response_filename = "r-v7_r256_a512_woall_16bit_adamw16bit_0322-checkpoint-60"
cand_response_filename = f"{BASE_DIR}/experiments/{cand_response_filename}.json"

eval_df = eval_query(db_gt_filename, cand_response_filename)

# print(eval_df)

Processing 오늘 아침과 저녁의 온도차이는 얼마나 돼?:   0%|          | 0/15 [00:00<?, ?it/s]

오늘 아침과 저녁의 온도차이는 얼마나 돼?
SELECT "roomtemp", "id", "timestamp" FROM "data_t" WHERE idu_id IN (SELECT id FROM idu_t WHERE name = '01_IB5') AND timestamp >= DATE_TRUNC('day', DATE '2022-09-30') + INTERVAL '6 hours' AND timestamp < DATE_TRUNC('day', DATE '2022-09-30') + INTERVAL '9 hours' AND "roomtemp" IS NOT NULL AND "roomtemp" IS DISTINCT FROM 'NaN' AND "id" IS NOT NULL ORDER BY timestamp
SELECT "roomtemp", "id", "timestamp" FROM "data_t" WHERE idu_id IN (SELECT id FROM idu_t WHERE name = '01_IB5') AND timestamp >= DATE_TRUNC('day', DATE '2022-09-30') + INTERVAL '18 hours' AND timestamp < DATE_TRUNC('day', DATE '2022-09-30') + INTERVAL '21 hours' AND "roomtemp" IS NOT NULL AND "roomtemp" IS DISTINCT FROM 'NaN' AND "id" IS NOT NULL ORDER BY timestamp


Processing 지금 옆반 온도랑 우리반 온도 알려줘:   7%|▋         | 1/15 [00:00<00:00, 18.97it/s]

지금 옆반 온도랑 우리반 온도 알려줘
SELECT "roomtemp", "id", "timestamp" FROM "data_t" WHERE idu_id IN (SELECT id FROM idu_t WHERE name = '01_IB7') AND timestamp >= TIMESTAMP '2022-09-30 12:00:00' - INTERVAL '5 minutes' AND timestamp <= TIMESTAMP '2022-09-30 12:00:00' AND "roomtemp" IS NOT NULL AND "roomtemp" IS DISTINCT FROM 'NaN' AND "id" IS NOT NULL ORDER BY timestamp
SELECT "roomtemp", "id", "timestamp" FROM "data_t" WHERE idu_id IN (SELECT id FROM idu_t WHERE name = '01_IB5') AND timestamp >= TIMESTAMP '2022-09-30 12:00:00' - INTERVAL '5 minutes' AND timestamp <= TIMESTAMP '2022-09-30 12:00:00' AND "roomtemp" IS NOT NULL AND "roomtemp" IS DISTINCT FROM 'NaN' AND "id" IS NOT NULL ORDER BY timestamp


Processing 현재 설정온도랑 실내온도 차이 알려줘.:  13%|█▎        | 2/15 [00:00<00:00, 33.42it/s]

현재 설정온도랑 실내온도 차이 알려줘.
SELECT "settemp", "roomtemp", "id", "timestamp" FROM "data_t" WHERE idu_id IN (SELECT id FROM idu_t WHERE name = '01_IB5') AND timestamp >= TIMESTAMP '2022-09-30 12:00:00' - INTERVAL '5 minutes' AND timestamp <= TIMESTAMP '2022-09-30 12:00:00' AND "settemp" IS NOT NULL AND "settemp" IS DISTINCT FROM 'NaN' AND "roomtemp" IS NOT NULL AND "roomtemp" IS DISTINCT FROM 'NaN' AND "id" IS NOT NULL ORDER BY timestamp


Processing 지난주에 설정온도와 실내온도 차이가 가장 많이 났던 날은?:  20%|██        | 3/15 [00:00<00:00, 46.75it/s]

지난주에 설정온도와 실내온도 차이가 가장 많이 났던 날은?
SELECT "settemp", "roomtemp", "id", "timestamp" FROM "data_t" WHERE idu_id IN (SELECT id FROM idu_t WHERE name = '01_IB5') AND timestamp >= DATE_TRUNC('week', DATE '2022-09-30' - INTERVAL '1 week') AND timestamp < DATE_TRUNC('week', DATE '2022-09-30') AND "settemp" IS NOT NULL AND "settemp" IS DISTINCT FROM 'NaN' AND "roomtemp" IS NOT NULL AND "roomtemp" IS DISTINCT FROM 'NaN' AND "id" IS NOT NULL ORDER BY timestamp


Processing 이번주 설정온도가 실내온도보다 더 낮았던 날은?:  27%|██▋       | 4/15 [00:00<00:00, 19.82it/s]          

이번주 설정온도가 실내온도보다 더 낮았던 날은?
SELECT "settemp", "roomtemp", "id", "timestamp" FROM "data_t" WHERE idu_id IN (SELECT id FROM idu_t WHERE name = '01_IB5') AND timestamp >= DATE_TRUNC('week', DATE '2022-09-30') AND timestamp < DATE_TRUNC('week', DATE '2022-09-30' + INTERVAL '1 week') AND "settemp" IS NOT NULL AND "settemp" IS DISTINCT FROM 'NaN' AND "roomtemp" IS NOT NULL AND "roomtemp" IS DISTINCT FROM 'NaN' AND "id" IS NOT NULL ORDER BY timestamp


Processing 어제 전원 껐어?:  33%|███▎      | 5/15 [00:00<00:00, 19.82it/s]                               

어제 전원 껐어?
SELECT "oper", "id", "timestamp" FROM "data_t" WHERE idu_id IN (SELECT id FROM idu_t WHERE name = '01_IB5') AND timestamp >= DATE_TRUNC('day', DATE '2022-09-30' - INTERVAL '1 day') AND timestamp < DATE_TRUNC('day', DATE '2022-09-30') AND "oper" IS NOT NULL AND "id" IS NOT NULL ORDER BY timestamp
SELECT "oper", "id", "timestamp" FROM "data_t" WHERE idu_id IN (SELECT id FROM idu_t WHERE name = '01_IB7') AND timestamp >= DATE_TRUNC('day', DATE '2022-09-30' - INTERVAL '1 day') AND timestamp < DATE_TRUNC('day', DATE '2022-09-30') AND "oper" IS NOT NULL AND "id" IS NOT NULL ORDER BY timestamp
SELECT "oper", "id", "timestamp" FROM "data_t" WHERE idu_id IN (SELECT id FROM idu_t WHERE name = '02_I81') AND timestamp >= DATE_TRUNC('day', DATE '2022-09-30' - INTERVAL '1 day') AND timestamp < DATE_TRUNC('day', DATE '2022-09-30') AND "oper" IS NOT NULL AND "id" IS NOT NULL ORDER BY timestamp


ERROR:operation.execute:Error executing operation off_dates = daily_oper[daily_oper == False].index.strftime('%Y-%m-%d')
ERROR:operation.execute:'Index' object has no attribute 'strftime'
Processing 어제 전원 껐어?:  33%|███▎      | 5/15 [00:00<00:00, 14.09it/s]


AttributeError: 'Index' object has no attribute 'strftime'

In [48]:
eval_df[eval_df["ExactMatch"] == 0][["Input", "Scenario", "FalsePositive", "FalseNegative"]]

Unnamed: 0,Input,Scenario,FalsePositive,FalseNegative


# FI

In [None]:
db_gt_filename = f"{BASE_DIR}/experiments/db_gt.json"
cand_response_filename = "r-v5_r256_a512_FI-checkpoint-43-batch"
cand_response_filename = f"{BASE_DIR}/experiments/{cand_response_filename}.json"

eval_df = eval_query(db_gt_filename, cand_response_filename)

print(eval_df)

In [None]:
eval_df

In [None]:
db_gt_filename = f"{BASE_DIR}/experiments/db_gt.json"
cand_response_filename = "response-sh2orc-Llama-3.1-Korean-8B-Instruct-r256_a512_ISP-checkpoint-104"
cand_response_filename = f"{BASE_DIR}/experiments/{cand_response_filename}.json"

eval_df = eval_query(db_gt_filename, cand_response_filename)


In [None]:
(eval_df)

# Ours

In [None]:
db_gt_filename = f"{BASE_DIR}/experiments/db_gt.json"
cand_response_filename = "response-sh2orc-Llama-3.1-Korean-8B-Instruct-v5_r256_a512_ours-checkpoint-20"
# cand_response_filename = "r-v5_r128_a256_ours-checkpoint-52-batch"
# cand_response_filename = "r-v5_r128_a256_ours_noexample-checkpoint-50-batch"
# cand_response_filename = "r-v6_r128_a256_ours-checkpoint-52"
# cand_response_filename = "r-v6_r256_a512_ours-checkpoint-40"
# cand_response_filename = "r-v6_r256_a512_ours_shorten-checkpoint-30"
cand_response_filename = f"{BASE_DIR}/experiments/{cand_response_filename}.json"

eval_df = eval_query(db_gt_filename, cand_response_filename)


In [None]:
eval_df[eval_df["ExactMatch"] == 0][["Input", "Scenario", "FalsePositive", "FalseNegative"]]

In [15]:
build_query_groundtruth("v7-250309-reduceinputanddatefunctioncall")

Processing Why is our classroom so cold:   0%|          | 0/72 [00:00<?, ?it/s]

Why is our classroom so cold
['qr']





AttributeError: 'str' object has no attribute 'strftime'

In [None]:
# def eval_query_gtgt(db_gt_filename, cand_response_filename):
#     db_gts = read_json(db_gt_filename)
#     cand_responses = read_json(cand_response_filename)
#     # metadata_ = read_json(f"{BASE_DIR}/finetuning/dataset/v7-250309-reduceinputanddatefunctioncall/scenario1/metadata.json")
#     evaluation_reports = []

#     with tqdm(total=len(cand_responses)) as pbar:
#         for cand_report in cand_responses:
#             pbar.set_description(f"Processing {cand_report['Input']}")
#             input = cand_report["Input"]
#             scenario = cand_report["Scenario"]


#             # 관계 없는 질문들은 건너뛰자
#             gt_report = [d for d in db_gts if d["Input"] == input and d["Scenario"] == scenario]
            
            
#             assert len(gt_report) <= 1
#             if len(gt_report) == 0:
#                 pbar.update(1)
#                 continue
#             gt_report = gt_report[0]
#             if gt_report["Result"] == []:
#                 pbar.update(1)
#                 continue
            
            
#             # print(f"Input: {input}")
            
#             gt_results, cand_results = gt_report["Result"], cand_report["Result"]
#             cand_results = cand_results[0]

#             gt_rows = []
#             for gt_result in gt_results:
#                 gt_rows.extend(gt_result["result_indices"])

#             gt_rows = set(gt_rows)
#             gt_cols = set(gt_results[0]["result_columns"])
#             cand_cols, cand_rows = set(cand_results["result_columns"]), set(cand_results["result_indices"])

#             gt_cols.remove("id")
#             cand_cols.remove("id")
#             # gt_cols.remove("idu")
#             cand_cols.remove("idu")

#             # True Positive: 공통된 컬럼과 로우의 모든 조합
#             true_positive = len(gt_cols & cand_cols) * len(gt_rows & cand_rows)

#             # Ground Truth의 총 조합에서 TP를 뺀 값이 FN
#             false_negative = (len(gt_cols) * len(gt_rows)) - true_positive

#             # Candidate의 총 조합에서 TP를 뺀 값이 FP
#             false_positive = (len(cand_cols) * len(cand_rows)) - true_positive
            
#             evaluation_report = defaultdict(lambda: None)
#             evaluation_report[EM.true_positive] = true_positive
#             evaluation_report[EM.false_positive] = false_positive
#             evaluation_report[EM.false_negative] = false_negative
#             evaluation_report["Input"] = input
#             evaluation_reports.append(evaluation_report)
#             # print(evaluation_report)
            
#             pbar.update(1)

#     eval_df = pd.DataFrame(evaluation_reports)
#     # print(eval_df)

#     eval_df['ExactMatch'] = eval_df.apply(lambda x: x[EM.false_positive] == 0 and x[EM.false_negative] == 0, axis=1).astype(int)
#     # eval_df['TruePositive'] = eval_df['TruePositive'].astype(int)
#     # eval_df['FalsePositive'] = eval_df['FalsePositive'].astype(int)
#     # eval_df['FalseNegative'] = eval_df['FalseNegative'].astype(int)

#     final_result = {}

#     for col in ["ExactMatch"]:
#         # print(f"{col}: {eval_df[col].mean()}")
#         final_result[col] = eval_df[col].mean()
    
#     # normalize per query
#     eval_df["Total"] = eval_df[EM.true_positive] + eval_df[EM.false_positive] + eval_df[EM.false_negative]
#     eval_df["TruePositive"] = eval_df[EM.true_positive] / eval_df["Total"]
#     eval_df["FalsePositive"] = eval_df[EM.false_positive] / eval_df["Total"]
#     eval_df["FalseNegative"] = eval_df[EM.false_negative] / eval_df["Total"]

#     # # F1 score except nans.
#     truepos_sum, falsepos_sum, falseneg_sum = eval_df[EM.true_positive].sum(), eval_df[EM.false_positive].sum(), eval_df[EM.false_negative].sum()
#     precision = truepos_sum / (truepos_sum + falsepos_sum)
#     recall = truepos_sum / (truepos_sum + falseneg_sum)
#     f1 = 2 * (precision * recall) / (precision + recall)
#     # print(f"F1: {f1}")
#     final_result["F1"] = f1
#     final_result["Recall"] = recall
#     for col in final_result:
#         print(f"{col}: {final_result[col]:.2f}")
    
#     return eval_df

# db_gt_filename = f"{BASE_DIR}/experiments/db_gt.json"
# cand_response_filename = f"{BASE_DIR}/experiments/db_gt_v7.json"

# eval_df = eval_query_gtgt(db_gt_filename, cand_response_filename)
# print(eval_df)

In [None]:
eval_df[eval_df["ExactMatch"] == 0][["Input", "TruePositive", "FalsePositive", "FalseNegative"]]

In [46]:
db_gt_filename = f"{BASE_DIR}/experiments/db_gt_v7.json"
cand_response_filename = f"{BASE_DIR}/experiments/r-v7_r256_a512_ours_16bit_adamw16bit_0322-checkpoint-56.json"

eval_df = eval_query(db_gt_filename, cand_response_filename)
print(eval_df)

Processing 오늘 아침과 저녁의 온도차이는 얼마나 돼?:   0%|          | 0/15 [00:00<?, ?it/s]

180
175
355
오늘 아침과 저녁의 온도차이는 얼마나 돼?
SELECT "roomtemp", "id", "timestamp" FROM "data_t" WHERE idu_id IN (SELECT id FROM idu_t WHERE name = '01_IB5') AND timestamp >= DATE_TRUNC('day', DATE '2022-09-30') + INTERVAL '6 hours' AND timestamp < DATE_TRUNC('day', DATE '2022-09-30') + INTERVAL '9 hours' AND "roomtemp" IS NOT NULL AND "roomtemp" IS DISTINCT FROM 'NaN' AND "id" IS NOT NULL ORDER BY timestamp
SELECT "roomtemp", "id", "timestamp" FROM "data_t" WHERE idu_id IN (SELECT id FROM idu_t WHERE name = '01_IB7') AND timestamp >= DATE_TRUNC('day', DATE '2022-09-30') + INTERVAL '18 hours' AND timestamp < DATE_TRUNC('day', DATE '2022-09-30') + INTERVAL '21 hours' AND "roomtemp" IS NOT NULL AND "roomtemp" IS DISTINCT FROM 'NaN' AND "id" IS NOT NULL ORDER BY timestamp


Processing 지금 몇시야?:   7%|▋         | 1/15 [00:00<00:00, 14.28it/s]                                 

JsonStructureCorrectness: 1.00
ExactMatch: 0.00
F1: 0.51
Recall: 0.51
Semantic_ExactMatch: 1.00
Semantic_F1: 1.00
Semantic_Recall: 1.00
                     Input   Scenario  JsonStructureCorrectness  TruePositive  \
0  오늘 아침과 저녁의 온도차이는 얼마나 돼?  scenario1                      True      0.339623   

   FalsePositive  FalseNegative  SemanticTruePositive  SemanticFalsePositive  \
0       0.330189       0.330189                     2                      0   

   SemanticFalseNegative  ExactMatch  Total  Semantic_ExactMatch  \
0                      0           0   1060                    1   

   Semantic_Total  Semantic_TruePositive  Semantic_FalsePositive  \
0               2                    1.0                     0.0   

   Semantic_FalseNegative  
0                     0.0  





In [42]:
eval_df[eval_df["ExactMatch"] == 0][["Input", "TruePositive", "FalsePositive", "FalseNegative"]]

Unnamed: 0,Input,TruePositive,FalsePositive,FalseNegative
0,오늘 아침과 저녁의 온도차이는 얼마나 돼?,0.339623,0.330189,0.330189
12,올해 여름 우리반 실내온도 최대값과 최소값 알려줘,0.333333,0.333333,0.333333
