In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are reloaded before each cell runs (no kernel restart needed).
%load_ext autoreload
%autoreload 2

In [2]:
from collections import defaultdict, Counter
import logging

import pandas as pd
import numpy as np
from tqdm import tqdm
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

import json
import itertools

from db.manager import DBManager
from operation.execute import OperationExecutor
from pathlib import Path
import warnings
import datetime
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

INFO:db.instance:Connected to the database PerSite_DB


In [3]:
# Root directory from which the dataset ("finetuning/dataset/...") and
# experiment ("experiments/...") paths in this notebook are built.
BASE_DIR = "../"

In [35]:
class EM:
    """Names of the evaluation-metric fields stored in each per-query report."""

    # Flag recording whether the candidate response had a usable structure.
    json_structure = "JsonStructureCorrectness"
    # Count fields filled in by the evaluation functions.
    true_positive, false_positive, false_negative = (
        "TruePositive",
        "FalsePositive",
        "FalseNegative",
    )

def read_json(path):
    """Load the UTF-8 encoded JSON document stored at ``path``.

    Args:
        path: Location of the JSON file to read.

    Returns:
        The decoded JSON value (whatever the file contains: list, dict, ...).
    """
    with open(path, mode="r", encoding="utf-8") as handle:
        return json.load(handle)

# Formatting constants shared by the report helpers below.
_TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S"
_MAX_SEQUENCE_PREVIEW = 5  # sequences are reported by their first N distinct values
_NO_DATA_MESSAGE = "관련 데이터를 찾을 수 없습니다."


def _format_timestamp(value):
    """Render one date-like value as text; any other value is returned unchanged."""
    if isinstance(value, np.datetime64):
        value = pd.Timestamp(value)
    if value is pd.NaT:
        # NaT cannot be formatted (strftime raises), so report it as a JSON null.
        return None
    if isinstance(value, datetime.date):  # also covers datetime and pd.Timestamp
        return value.strftime(_TIMESTAMP_FORMAT)
    return value


def _summarize_variable(name, value):
    """Convert one executor variable into a ``(kind, json_friendly_value)`` pair.

    ``kind`` is "pd" for DataFrames, "primitive" for values reduced to plain
    Python data, and "unknown" for types that are passed through untouched.
    The input object is never modified, so later scripts still see the
    original DataFrame/Series.
    """
    if isinstance(value, pd.DataFrame):
        frame = value.copy()
        if "timestamp" in frame.columns:
            frame["timestamp"] = frame["timestamp"].map(_format_timestamp)
        return "pd", frame.to_dict(orient="index")

    if isinstance(value, (pd.Index, pd.Series, np.ndarray)):
        if len(value) == 0:
            return "primitive", value.tolist()
        distinct = pd.Series(pd.unique(value))
        first = distinct.iloc[0]
        if first is pd.NaT or isinstance(first, (datetime.date, np.datetime64)):
            distinct = distinct.map(_format_timestamp)
        return "primitive", distinct.tolist()[:_MAX_SEQUENCE_PREVIEW]

    # NumPy scalars: datetime64 must be checked first because it is also an
    # np.generic, and np.float64 would otherwise match the plain ``float`` test.
    if isinstance(value, np.datetime64):
        value = pd.Timestamp(value)
    elif isinstance(value, np.generic):
        value = value.item()

    if value is not pd.NaT and isinstance(value, datetime.date):
        return "primitive", value.strftime(_TIMESTAMP_FORMAT)
    if isinstance(value, (bool, int, float, str, list, dict)):
        return "primitive", value

    logger.warning(f"Type not handled: {name}: {type(value), value}")
    return "unknown", value


def run_query_and_get_report(input, tags, metadata, scenario, instruction_set):
    """Execute an instruction set and summarise what each step produced.

    Three instruction types are handled; any other type is ignored:

    * ``"q"`` runs a database query through ``DBManager`` and records the
      shape, column names and ``id`` values of the resulting frame. Failures
      while summarising the frame are logged and the step is left out.
    * ``"o"`` runs ``instruction["script"]`` through ``OperationExecutor`` and
      records a JSON-friendly view of every variable named in
      ``instruction["returns"]``.
    * ``"r"`` records the expectations, replaced by a fixed "no data" message
      when the single "o"-type expectation contains a ``-1`` return value.

    Args:
        input: The user question; stored under "Input".
        tags: Stored unchanged under "Tags".
        metadata: Passed to temporal queries and stored under "Metadata" once
            a query has been summarised.
        scenario: Stored unchanged under "Scenario".
        instruction_set: Iterable of instruction dicts. Neither the
            instructions nor their ``args`` are modified.

    Returns:
        dict with "Input", "Tags", "Scenario", "Result" (one entry per
        recorded step) and, after a successful query, "Metadata".

    Raises:
        AssertionError: If an "r" instruction does not contain exactly one
            "o"-type expectation.
        KeyError: If an "o" instruction returns a name the executor did not
            define.
    """
    input_report = {
        "Input": input,
        "Tags": tags,
        "Scenario": scenario,
        "Result": [],
    }
    variables = {}

    for instruction in instruction_set:
        i_type = instruction["type"]

        if i_type == "q":
            args = instruction["args"]
            result_var_name = instruction["result_name"]
            if "temporal" in args:
                # Temporal queries take the metadata instead of a table name.
                # Build a new dict so the caller's instruction stays intact.
                query_args = {k: v for k, v in args.items() if k != "table_name"}
                query_args["metadata"] = metadata
                result_df = DBManager.structured_query_data_t(query_args, get_rowids=True)
            else:
                result_df = DBManager.structured_query(args, get_rowids=True)

            try:
                cols = list(result_df.columns)
                input_report["Metadata"] = metadata
                variables[result_var_name] = result_df
                input_report["Result"].append({
                    "type": "q",
                    "result_shape": result_df.shape,
                    "result_columns": cols,
                    "result_indices": list(result_df["id"]),
                })
            except Exception as e:
                # Best effort: a query that cannot be summarised is skipped.
                logger.error(f"Error inside: {e}")
                logger.error(f"Invoked with Query: {args}, {result_var_name}")

        elif i_type == "o":
            script, returns = instruction["script"], instruction["returns"]
            variables.update(OperationExecutor.execute(variables, script, returns))
            # Only the returned names are reported, so only those are converted.
            input_report["Result"].append({
                "type": "o",
                "script": script,
                "returns": {
                    name: _summarize_variable(name, variables[name]) for name in returns
                },
            })

        elif i_type == "r":
            expectations = instruction["expectations"]
            operation_returns = [e["returns"] for e in expectations if e["type"] == "o"]
            assert len(operation_returns) == 1, "expected exactly one 'o' expectation"
            if any(v == -1 for v in operation_returns[0].values()):
                # -1 marks "nothing found": report the fixed message instead.
                expectations = _NO_DATA_MESSAGE
            input_report["Result"].append({
                "type": "r",
                "expectations": expectations,
            })

    return input_report

def build_query_groundtruth(dateset_name):
    """Run every dataset example and write the resulting reports as a JSON array.

    Each ``*scenario*`` sub-directory of
    ``{BASE_DIR}/finetuning/dataset/{dateset_name}`` contributes the examples
    of ``onlyq_ts.json`` and ``onlyq_tr.json``. For dataset names containing
    "v7" the directory's ``metadata.json`` is attached to every example. All
    ``onlyq_ts`` examples are processed before the ``onlyq_tr`` examples.

    Every example is executed with ``run_query_and_get_report`` after an "r"
    instruction carrying ``Response["Expectations"]`` is added to its
    instructions. Reports are written to
    ``{BASE_DIR}/experiments/db_gt_v7.json`` ("v7" datasets) or
    ``{BASE_DIR}/experiments/db_gt.json`` (all others).

    Args:
        dateset_name: Name of the dataset directory.

    Raises:
        Exception: Whatever serialising or writing a report raises; the
            failing input and report are logged first.
    """
    is_v7 = "v7" in dateset_name
    base_dataset_dir = Path(f"{BASE_DIR}/finetuning/dataset/{dateset_name}")

    test_examples, train_examples = [], []
    # Sorted so the output order does not depend on the file system.
    for directory in sorted(base_dataset_dir.iterdir()):
        if not (directory.is_dir() and "scenario" in directory.name):
            continue
        metadata = read_json(f"{directory}/metadata.json") if is_v7 else None
        for filename, bucket in (
            ("onlyq_ts.json", test_examples),
            ("onlyq_tr.json", train_examples),
        ):
            examples = read_json(f"{directory}/{filename}")
            for example in examples:
                example["Scenario"] = directory.name
                if is_v7:
                    example["Metadata"] = metadata
            bucket.extend(examples)

    ds = test_examples + train_examples
    print(len(ds))

    if is_v7:
        db_gt_filename = f"{BASE_DIR}/experiments/db_gt_v7.json"
    else:
        db_gt_filename = f"{BASE_DIR}/experiments/db_gt.json"

    with open(db_gt_filename, "w", encoding="utf-8") as f:
        f.write("[")
        with tqdm(total=len(ds)) as pbar:
            for position, d in enumerate(ds):
                pbar.set_description(f"Processing {d['Input']}")

                input = d["Input"]
                response = d["Response"]
                # New list: the loaded example keeps its original instructions.
                instruction_set = response["Instructions"] + [{
                    "type": "r",
                    "expectations": response["Expectations"],
                }]
                input_report = run_query_and_get_report(
                    input,
                    d["Tags"],
                    d.get("Metadata"),  # only present for "v7" datasets
                    d["Scenario"],
                    instruction_set,
                )

                try:
                    serialized = json.dumps(input_report, ensure_ascii=False)
                    # The separator goes *before* every element but the first,
                    # so no trailing comma has to be removed afterwards.
                    f.write((",\n" if position else "") + serialized)
                except Exception as e:
                    logger.error(f"Error outside: {e}")
                    logger.error(f"Invoked with Input: {input}")
                    logger.error(f"Input Report: {input_report}")
                    raise

                pbar.update(1)

        f.write("]")



In [5]:
# build_query_groundtruth("v5-250228-multimetadata")

# Eval_query

In [6]:
# Keys under which a structured candidate may carry its instruction list,
# checked in this order.
_INSTRUCTION_KEYS = ("Instruction Set", "지시", "Instructions")


def _parse_candidate_instructions(candidate):
    """Return ``(instructions, is_structured)`` for one candidate response.

    A dict containing one of ``_INSTRUCTION_KEYS`` is used directly. Anything
    else is treated as raw text and the instruction dicts are recovered with
    regular expressions; this raises when nothing can be recovered.
    """
    import ast
    import re

    if isinstance(candidate, dict):
        for key in _INSTRUCTION_KEYS:
            if key in candidate:
                return candidate[key], True

    # Text between '"Instruction Set": [' and the last ']'.
    body = re.search(r'(?<="Instruction Set": \[)(.*)(?=\])', candidate, re.DOTALL).group(0)
    fragments = re.findall(r'({"type".*?})', body)
    # NOTE(security): the candidate is model output. It used to go through
    # eval(); literal_eval accepts the same dict literals without executing code.
    return [ast.literal_eval(fragment) for fragment in fragments], False


def _safe_ratio(numerator, denominator):
    """Divide two numbers, yielding NaN when the denominator is zero."""
    return numerator / denominator if denominator != 0 else float("nan")


def _evaluate_candidate(cand_response, gt_by_key):
    """Score one candidate response; returns a report dict or None when skipped."""
    question = cand_response["Input"]
    scenario = cand_response["Scenario"]

    # Skip questions that have no ground-truth query result to compare with.
    gt_matches = gt_by_key.get((question, scenario), [])
    assert len(gt_matches) <= 1
    gt_query_results = [
        result
        for gt_report in gt_matches
        for result in gt_report["Result"]
        if "result_indices" in result  # only query steps carry row ids
    ]
    if not gt_query_results:
        return None

    evaluation_report = {"Input": question, "Scenario": scenario}
    try:
        cand_instruction_set, is_structured = _parse_candidate_instructions(
            cand_response["Candidate"]
        )
    except Exception as e:
        evaluation_report[EM.json_structure] = False
        logger.warning(f"Failed to parse input: {question} {cand_response['Candidate']}")
        logger.warning(e)
        return evaluation_report
    evaluation_report[EM.json_structure] = is_structured

    cand_report = run_query_and_get_report(
        question,
        cand_response.get("Tags"),
        cand_response.get("Metadata"),
        scenario,
        cand_instruction_set,
    )
    cand_query_results = [r for r in cand_report["Result"] if "result_indices" in r]
    if not cand_query_results:
        # Nothing comparable was produced; the counts stay missing.
        evaluation_report[EM.json_structure] = False
        return evaluation_report
    cand_result = cand_query_results[0]

    gt_rows = set()
    for gt_result in gt_query_results:
        gt_rows.update(gt_result["result_indices"])
    gt_cols = set(gt_query_results[0]["result_columns"])
    cand_cols = set(cand_result["result_columns"])
    cand_rows = set(cand_result["result_indices"])

    # "id"/"idu" are not part of the compared columns; either may be absent.
    for ignored in ("id", "idu"):
        gt_cols.discard(ignored)
        cand_cols.discard(ignored)

    # True positives: every combination of a shared column and a shared row.
    true_positive = len(gt_cols & cand_cols) * len(gt_rows & cand_rows)
    # False negatives: ground-truth combinations that are not true positives.
    false_negative = len(gt_cols) * len(gt_rows) - true_positive
    # False positives: candidate combinations that are not true positives.
    false_positive = len(cand_cols) * len(cand_rows) - true_positive

    evaluation_report[EM.true_positive] = true_positive
    evaluation_report[EM.false_positive] = false_positive
    evaluation_report[EM.false_negative] = false_negative
    return evaluation_report


def eval_query(db_gt_filename, cand_response_filename):
    """Compare candidate instruction sets with the stored ground truth.

    For each candidate whose (Input, Scenario) has ground-truth query results,
    the candidate instructions are executed with ``run_query_and_get_report``
    and its first query result is compared with the ground truth on
    (column, row id) combinations. Prints JsonStructureCorrectness,
    ExactMatch, F1 and Recall.

    F1 and Recall are computed from the per-query normalised shares
    (count / (TP + FP + FN)), so every query carries the same weight.

    Args:
        db_gt_filename: JSON file with ground-truth reports.
        cand_response_filename: JSON file with candidate responses; each needs
            "Input", "Scenario" and "Candidate", optionally "Metadata"/"Tags".

    Returns:
        pandas.DataFrame with one row per evaluated candidate: "Input",
        "Scenario", the ``EM`` fields (counts replaced by per-query shares),
        "ExactMatch" and "Total". Rows whose candidate could not be parsed or
        produced no query result have missing counts.
    """
    db_gts = read_json(db_gt_filename)
    cand_responses = read_json(cand_response_filename)

    # Index the ground truth once instead of scanning it for every candidate.
    gt_by_key = defaultdict(list)
    for gt_report in db_gts:
        gt_by_key[(gt_report.get("Input"), gt_report.get("Scenario"))].append(gt_report)

    evaluation_reports = []
    with tqdm(cand_responses) as pbar:
        for cand_response in pbar:
            pbar.set_description(f"Processing {cand_response['Input']}")
            evaluation_report = _evaluate_candidate(cand_response, gt_by_key)
            if evaluation_report is not None:
                evaluation_reports.append(evaluation_report)

    count_columns = [EM.true_positive, EM.false_positive, EM.false_negative]
    # Explicit columns keep the frame well-formed even if no row has counts.
    eval_df = pd.DataFrame(
        evaluation_reports,
        columns=["Input", "Scenario", EM.json_structure, *count_columns],
    )
    for column in count_columns:
        eval_df[column] = pd.to_numeric(eval_df[column])

    # Missing counts compare unequal to 0, so failed rows are never exact.
    eval_df["ExactMatch"] = (
        (eval_df[EM.false_positive] == 0) & (eval_df[EM.false_negative] == 0)
    ).astype(int)

    final_result = {}
    for col in ["JsonStructureCorrectness", "ExactMatch"]:
        final_result[col] = eval_df[col].astype(float).mean()

    # Normalize per query.
    eval_df["Total"] = (
        eval_df[EM.true_positive] + eval_df[EM.false_positive] + eval_df[EM.false_negative]
    )
    for column in count_columns:
        eval_df[column] = eval_df[column] / eval_df["Total"]

    # F1 over the normalised shares; missing values are ignored by sum().
    truepos_sum = eval_df[EM.true_positive].sum()
    falsepos_sum = eval_df[EM.false_positive].sum()
    falseneg_sum = eval_df[EM.false_negative].sum()
    precision = _safe_ratio(truepos_sum, truepos_sum + falsepos_sum)
    recall = _safe_ratio(truepos_sum, truepos_sum + falseneg_sum)
    final_result["F1"] = _safe_ratio(2 * (precision * recall), precision + recall)
    final_result["Recall"] = recall
    for col in final_result:
        print(f"{col}: {final_result[col]:.2f}")

    return eval_df


# WoAll

In [None]:
# Evaluate a "WoAll" checkpoint against the v7 ground truth.
db_gt_filename = f"{BASE_DIR}/experiments/db_gt_v7.json"
# Other checkpoints that can be swapped in:
# cand_response_filename = "response-sh2orc-Llama-3.1-Korean-8B-Instruct-r128_a256_woall-checkpoint-60"
# cand_response_filename = "r-v5_r32_a64_woall-checkpoint-70-batch"
# cand_response_filename = "r-v6_r64_a128_woall_shorten-checkpoint-53"
cand_response_filename = "r-v7_r8_a16_woall_4bit-checkpoint-97"
cand_response_filename = f"{BASE_DIR}/experiments/{cand_response_filename}.json"

eval_df = eval_query(db_gt_filename, cand_response_filename)

# print(eval_df)

In [None]:
# Rows that are not an exact match (ExactMatch == 0), with their
# false-positive / false-negative columns.
eval_df[eval_df["ExactMatch"] == 0][["Input", "Scenario", "FalsePositive", "FalseNegative"]]

# FI

In [None]:
# Evaluate the "FI" checkpoint against the ground truth stored in db_gt.json.
db_gt_filename = f"{BASE_DIR}/experiments/db_gt.json"
cand_response_filename = "r-v5_r256_a512_FI-checkpoint-43-batch"
cand_response_filename = f"{BASE_DIR}/experiments/{cand_response_filename}.json"

eval_df = eval_query(db_gt_filename, cand_response_filename)

print(eval_df)

In [None]:
# Display the evaluation table produced by the most recent eval_query run.
eval_df

In [None]:
# Evaluate the "ISP" checkpoint against the ground truth stored in db_gt.json.
db_gt_filename = f"{BASE_DIR}/experiments/db_gt.json"
cand_response_filename = "response-sh2orc-Llama-3.1-Korean-8B-Instruct-r256_a512_ISP-checkpoint-104"
cand_response_filename = f"{BASE_DIR}/experiments/{cand_response_filename}.json"

eval_df = eval_query(db_gt_filename, cand_response_filename)


In [None]:
# Display the evaluation table produced by the most recent eval_query run.
(eval_df)

# Ours

In [None]:
# Evaluate an "ours" checkpoint against the ground truth stored in db_gt.json.
# The commented-out names are other checkpoints that can be swapped in.
db_gt_filename = f"{BASE_DIR}/experiments/db_gt.json"
cand_response_filename = "response-sh2orc-Llama-3.1-Korean-8B-Instruct-v5_r256_a512_ours-checkpoint-20"
# cand_response_filename = "r-v5_r128_a256_ours-checkpoint-52-batch"
# cand_response_filename = "r-v5_r128_a256_ours_noexample-checkpoint-50-batch"
# cand_response_filename = "r-v6_r128_a256_ours-checkpoint-52"
# cand_response_filename = "r-v6_r256_a512_ours-checkpoint-40"
# cand_response_filename = "r-v6_r256_a512_ours_shorten-checkpoint-30"
cand_response_filename = f"{BASE_DIR}/experiments/{cand_response_filename}.json"

eval_df = eval_query(db_gt_filename, cand_response_filename)


In [None]:
# Rows that are not an exact match (ExactMatch == 0), with their
# false-positive / false-negative columns.
eval_df[eval_df["ExactMatch"] == 0][["Input", "Scenario", "FalsePositive", "FalseNegative"]]

In [1]:
# Rebuild the v7 ground-truth file.
# NOTE(review): this cell needs the cell defining build_query_groundtruth to
# have run first; the recorded NameError comes from executing it as the first
# cell of a fresh kernel.
build_query_groundtruth("v7-250309-reduceinputanddatefunctioncall")

NameError: name 'build_query_groundtruth' is not defined

In [None]:
def eval_query_gtgt(db_gt_filename, cand_response_filename):
    """Compare two report files (ground truth vs. ground truth) query by query.

    Both files hold reports with "Input", "Scenario" and "Result". For each
    report in ``cand_response_filename`` that has a counterpart with query
    results in ``db_gt_filename``, its first query result is compared with the
    counterpart on (column, row id) combinations. Prints ExactMatch, F1 and
    Recall.

    F1 and Recall are computed from the per-query normalised shares
    (count / (TP + FP + FN)), so every query carries the same weight.

    Args:
        db_gt_filename: JSON file with the reference reports.
        cand_response_filename: JSON file with the reports to check.

    Returns:
        pandas.DataFrame with one row per compared report: the ``EM`` count
        fields (replaced by per-query shares), "Input", "ExactMatch" and
        "Total". A report without any query result has missing counts.
    """
    def query_results(report):
        # Only query steps carry "result_indices"; other steps are not compared.
        return [step for step in report["Result"] if "result_indices" in step]

    def ratio(numerator, denominator):
        # NaN instead of a division-by-zero warning when nothing was counted.
        return numerator / denominator if denominator != 0 else float("nan")

    db_gts = read_json(db_gt_filename)
    cand_responses = read_json(cand_response_filename)

    # Index the reference reports once instead of scanning them every time.
    gt_by_key = defaultdict(list)
    for gt_report in db_gts:
        gt_by_key[(gt_report.get("Input"), gt_report.get("Scenario"))].append(gt_report)

    evaluation_reports = []
    with tqdm(cand_responses) as pbar:
        for cand_report in pbar:
            pbar.set_description(f"Processing {cand_report['Input']}")
            question = cand_report["Input"]
            scenario = cand_report["Scenario"]

            # Skip questions without a reference query result.
            gt_matches = gt_by_key.get((question, scenario), [])
            assert len(gt_matches) <= 1
            gt_results = [step for gt in gt_matches for step in query_results(gt)]
            if not gt_results:
                continue

            cand_results = query_results(cand_report)
            if not cand_results:
                # Nothing to compare: keep the row, counts stay missing.
                evaluation_reports.append({"Input": question})
                continue
            cand_result = cand_results[0]

            gt_rows = set()
            for gt_result in gt_results:
                gt_rows.update(gt_result["result_indices"])
            gt_cols = set(gt_results[0]["result_columns"])
            cand_cols = set(cand_result["result_columns"])
            cand_rows = set(cand_result["result_indices"])

            # "id"/"idu" are not compared; either may be absent from a file.
            for ignored in ("id", "idu"):
                gt_cols.discard(ignored)
                cand_cols.discard(ignored)

            # True positives: every combination of a shared column and row.
            true_positive = len(gt_cols & cand_cols) * len(gt_rows & cand_rows)
            # False negatives: reference combinations that are not true positives.
            false_negative = len(gt_cols) * len(gt_rows) - true_positive
            # False positives: candidate combinations that are not true positives.
            false_positive = len(cand_cols) * len(cand_rows) - true_positive

            evaluation_reports.append({
                EM.true_positive: true_positive,
                EM.false_positive: false_positive,
                EM.false_negative: false_negative,
                "Input": question,
            })

    count_columns = [EM.true_positive, EM.false_positive, EM.false_negative]
    # Explicit columns keep the frame well-formed even without any row.
    eval_df = pd.DataFrame(evaluation_reports, columns=[*count_columns, "Input"])
    for column in count_columns:
        eval_df[column] = pd.to_numeric(eval_df[column])

    # Missing counts compare unequal to 0, so such rows are never exact.
    eval_df["ExactMatch"] = (
        (eval_df[EM.false_positive] == 0) & (eval_df[EM.false_negative] == 0)
    ).astype(int)

    final_result = {}
    for col in ["ExactMatch"]:
        final_result[col] = eval_df[col].astype(float).mean()

    # Normalize per query.
    eval_df["Total"] = (
        eval_df[EM.true_positive] + eval_df[EM.false_positive] + eval_df[EM.false_negative]
    )
    for column in count_columns:
        eval_df[column] = eval_df[column] / eval_df["Total"]

    # F1 over the normalised shares; missing values are ignored by sum().
    truepos_sum = eval_df[EM.true_positive].sum()
    falsepos_sum = eval_df[EM.false_positive].sum()
    falseneg_sum = eval_df[EM.false_negative].sum()
    precision = ratio(truepos_sum, truepos_sum + falsepos_sum)
    recall = ratio(truepos_sum, truepos_sum + falseneg_sum)
    final_result["F1"] = ratio(2 * (precision * recall), precision + recall)
    final_result["Recall"] = recall
    for col in final_result:
        print(f"{col}: {final_result[col]:.2f}")

    return eval_df

# Compare the two ground-truth files with each other: db_gt.json is the
# reference and db_gt_v7.json is treated as the candidate.
db_gt_filename = f"{BASE_DIR}/experiments/db_gt.json"
cand_response_filename = f"{BASE_DIR}/experiments/db_gt_v7.json"

eval_df = eval_query_gtgt(db_gt_filename, cand_response_filename)
print(eval_df)

In [None]:
# Rows that are not an exact match (ExactMatch == 0), with their
# true-positive / false-positive / false-negative columns.
eval_df[eval_df["ExactMatch"] == 0][["Input", "TruePositive", "FalsePositive", "FalseNegative"]]

In [None]:
# Evaluate the v7 "ours" 8-bit checkpoint against the v7 ground truth.
db_gt_filename = f"{BASE_DIR}/experiments/db_gt_v7.json"
cand_response_filename = f"{BASE_DIR}/experiments/r-v7_r8_a16_ours_8bit-checkpoint-194.json"

eval_df = eval_query(db_gt_filename, cand_response_filename)
print(eval_df)

In [None]:
# Rows that are not an exact match (ExactMatch == 0), with their
# true-positive / false-positive / false-negative columns.
eval_df[eval_df["ExactMatch"] == 0][["Input", "TruePositive", "FalsePositive", "FalseNegative"]]