In [2]:
import ast
import itertools
import json
import logging
import re
from collections import Counter, defaultdict
from pathlib import Path

import pandas as pd
from tqdm import tqdm

# Configure logging before importing the project DB layer so that any log
# records it emits during import are displayed.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

from db.manager import DBManager

INFO:db.instance:Connected to the database PerSite_DB


In [3]:
# Project root relative to this notebook; the dataset and experiment paths
# used below are built from it.
BASE_DIR = "../"

In [4]:
class EM:
    """Names of the per-query evaluation metrics (report keys / DataFrame columns)."""
    # Whether the candidate was already a dict holding an instruction list.
    json_structure = "JsonStructureCorrectness"
    # Ground-truth (column, row id) pairs that the candidate also returned.
    true_positive = "TruePositive"
    # (column, row id) pairs returned by the candidate but not in the ground truth.
    false_positive = "FalsePositive"
    # Ground-truth (column, row id) pairs the candidate did not return.
    false_negative = "FalseNegative"

def read_json(path):
    """Parse the UTF-8 encoded JSON file at ``path`` and return its content."""
    with open(path, mode="r", encoding="utf-8") as handle:
        return json.load(handle)

def run_query_and_get_report(input, scenario, instruction_set):
    """Run the query instructions of ``instruction_set`` and summarise their results.

    Only instructions whose ``"type"`` is ``"q"`` are executed; every other
    entry is ignored. A query that cannot be run or summarised is logged and
    skipped, so the returned report may hold fewer entries than there were
    query instructions.

    Args:
        input: The user question the instruction set belongs to (copied into
            the report unchanged).
        scenario: Name of the scenario the question belongs to (copied into
            the report unchanged).
        instruction_set: Iterable of instruction dicts. A query instruction
            provides the query arguments under ``"args"`` and may name its
            result under ``"result_name"`` (used for log messages only).

    Returns:
        A dict with the keys ``"Input"``, ``"Scenario"`` and ``"Result"``.
        ``"Result"`` is a list with one dict per successful query holding
        ``"type"``, ``"args"``, ``"result_shape"``, ``"result_columns"`` and
        ``"result_indices"`` (the values of the result's ``"id"`` column).
    """
    input_report = {"Input": input, "Scenario": scenario, "Result": []}

    for instruction in instruction_set:
        # Instruction sets may come straight from model output, so malformed
        # entries are tolerated instead of aborting the whole run.
        if not isinstance(instruction, dict) or instruction.get("type") != "q":
            continue

        args = instruction.get("args")
        result_var_name = instruction.get("result_name")
        if args is None:
            logger.error("Error inside: query instruction has no 'args'")
            logger.error(f"Invoked with Query: {args}, {result_var_name}")
            continue

        result_df = DBManager.structured_query(args, get_rowids=True)
        if result_df is None:
            # The recorded runs show the DB layer logging its own error and
            # returning None for an invalid query.
            logger.error("Error inside: query returned no result")
            logger.error(f"Invoked with Query: {args}, {result_var_name}")
            continue

        try:
            # Only the shape, column names and row ids are reported. The
            # row-by-row conversion (and the timestamp formatting that fed it)
            # was dropped because its result was never stored.
            input_report["Result"].append({
                "type": "q",
                "args": args,
                "result_shape": result_df.shape,
                "result_columns": list(result_df.columns),
                "result_indices": list(result_df["id"]),
            })
        except Exception as e:
            # Best effort: keep the remaining queries of this instruction set.
            logger.error(f"Error inside: {e}")
            logger.error(f"Invoked with Query: {args}, {result_var_name}")

    return input_report

def build_query_groundtruth(dataset_dir=None, output_path=None):
    """Run the ground-truth queries of every scenario and store the reports as JSON.

    Each sub-directory of ``dataset_dir`` whose name contains ``"scenario"``
    must hold an ``onlyq_ts.json`` file (the test split). Every sample of
    those files is run through ``run_query_and_get_report`` and the resulting
    reports are written to ``output_path`` as one JSON array, one report per
    line.

    The file is written only after all reports have been built, so a failure
    half-way through does not leave a truncated, invalid JSON file behind.

    Args:
        dataset_dir: Directory holding the scenario sub-directories. Defaults
            to ``{BASE_DIR}/finetuning/dataset/v6-250306-optimizetoken``.
        output_path: Destination file. Defaults to
            ``{BASE_DIR}/experiments/db_gt.json``.

    Raises:
        KeyError: If a sample lacks ``"Input"``/``"Response"`` or its response
            has neither a ``"지시"`` nor an ``"Instruction Set"`` entry.
        Exception: Whatever ``json.dumps`` raises for a report that cannot be
            serialised (logged first, then re-raised).
    """
    if dataset_dir is None:
        dataset_dir = f"{BASE_DIR}/finetuning/dataset/v6-250306-optimizetoken"
    if output_path is None:
        output_path = f"{BASE_DIR}/experiments/db_gt.json"

    samples = []
    # iterdir() yields entries in arbitrary order; sorting keeps the output
    # file reproducible between runs.
    for directory in sorted(Path(dataset_dir).iterdir()):
        if directory.is_dir() and "scenario" in directory.name:
            for sample in read_json(f"{directory}/onlyq_ts.json"):
                sample["Scenario"] = directory.name
                samples.append(sample)

    serialized_reports = []
    with tqdm(total=len(samples)) as pbar:
        for sample in samples:
            pbar.set_description(f"Processing {sample['Input']}")

            question = sample["Input"]
            response = sample["Response"]
            # Accept both key names, mirroring what eval_query accepts for
            # candidate responses.
            if "지시" in response:
                instruction_set = response["지시"]
            else:
                instruction_set = response["Instruction Set"]

            input_report = run_query_and_get_report(question, sample["Scenario"], instruction_set)

            try:
                serialized_reports.append(json.dumps(input_report, ensure_ascii=False))
            except Exception as e:
                logger.error(f"Error outside: {e}")
                logger.error(f"Invoked with Input: {question}")
                logger.error(f"Input Report: {input_report}")
                raise

            pbar.update(1)

    # Same layout as before (one report per line inside a JSON array), but
    # built with join() instead of seeking back over a trailing comma, which
    # failed for an empty dataset and relied on text-mode tell() arithmetic.
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("[" + ",\n".join(serialized_reports) + "]\n")

def product_query_info(result):
    """List every (column, row id) pair of one query result, grouped by column."""
    columns = tuple(result["result_columns"])
    row_ids = tuple(result["result_indices"])
    return [(column, row_id) for column in columns for row_id in row_ids]

# build_query_groundtruth()
    

Processing 지난주에 설정온도와 실내온도 차이가 가장 많이 났던 날은?:  17%|█▋        | 4/23 [00:00<00:00, 21.39it/s]

Processing 앞반 전원 켜져있어?: 100%|██████████| 23/23 [00:00<00:00, 30.83it/s]                                  


# Evaluate candidate queries against the ground truth (`eval_query`)

In [5]:
# Locates the opening bracket of an instruction list inside raw candidate
# text, under either of the two key names accepted for candidate responses.
_INSTRUCTION_LIST_START = re.compile(r'"(?:Instruction Set|지시)"\s*:\s*\[')


def _parse_instructions_from_text(text):
    """Recover instruction dicts from a raw candidate string (best effort).

    Raises ValueError, TypeError or SyntaxError when the text holds no
    instruction list or an instruction cannot be parsed.
    """
    list_start = _INSTRUCTION_LIST_START.search(text)
    if list_start is None:
        raise ValueError("no instruction list found in candidate text")

    decoder = json.JSONDecoder()
    instructions = []
    cursor = list_start.end()
    while True:
        start = text.find('{"type"', cursor)
        if start == -1:
            return instructions
        try:
            # raw_decode understands nesting, so an instruction whose "args"
            # is itself an object is recovered completely.
            instruction, cursor = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            # Not valid JSON (e.g. Python-style quoting): fall back to the
            # shortest brace-delimited fragment, read as a Python literal.
            # NOTE: ast.literal_eval replaces a former eval() call; the text
            # is untrusted model output and must never be executed.
            fragment = re.match(r'{"type".*?}', text[start:])
            if fragment is None:
                raise ValueError("unterminated instruction in candidate text")
            instruction = ast.literal_eval(fragment.group(0))
            cursor = start + fragment.end()
        if not isinstance(instruction, dict):
            raise ValueError("instruction is not an object")
        instructions.append(instruction)


def _extract_instruction_set(candidate):
    """Return ``(instruction_set, structure_ok)`` for one candidate response.

    ``structure_ok`` is True only when the candidate is a dict that holds the
    instruction list directly; otherwise the list is recovered from text.
    """
    if isinstance(candidate, dict):
        for key in ("Instruction Set", "지시"):
            if key in candidate:
                if not isinstance(candidate[key], list):
                    raise ValueError(f"'{key}' is not a list")
                return candidate[key], True
    return _parse_instructions_from_text(candidate), False


def _count_matches(gt_items, cand_items):
    """Return ``(true_positive, false_positive, false_negative)`` for two multisets."""
    # Multiset intersection: every ground-truth item consumes at most one
    # equal candidate item, exactly like removing matches one by one, but in
    # linear instead of quadratic time.
    matched = sum((Counter(gt_items) & Counter(cand_items)).values())
    return matched, len(cand_items) - matched, len(gt_items) - matched


def _safe_ratio(numerator, denominator):
    """Divide, returning NaN instead of failing when the denominator is zero."""
    if not denominator:
        return float("nan")
    return float(numerator) / float(denominator)


def _evaluate_candidate(cand_response, gt_by_key):
    """Build the evaluation report of one candidate, or None if it is skipped."""
    question = cand_response["Input"]
    scenario = cand_response["Scenario"]

    # Skip questions that have no ground truth or whose ground truth ran no query.
    gt_matches = gt_by_key.get((question, scenario), [])
    assert len(gt_matches) <= 1
    if not gt_matches or gt_matches[0]["Result"] == []:
        return None
    gt_report = gt_matches[0]

    report = {"Input": question, "Scenario": scenario}
    candidate = cand_response["Candidate"]
    try:
        cand_instruction_set, structure_ok = _extract_instruction_set(candidate)
    except (ValueError, TypeError, SyntaxError) as e:
        # No counts can be computed; the report keeps only the structure flag.
        report[EM.json_structure] = False
        logger.warning(f"Failed to parse input: {question} {candidate}")
        logger.warning(e)
        return report
    report[EM.json_structure] = structure_ok

    cand_report = run_query_and_get_report(question, scenario, cand_instruction_set)

    # Compare the results cell by cell: one (column, row id) pair per cell.
    gt_cells = [cell for result in gt_report["Result"] for cell in product_query_info(result)]
    cand_cells = [cell for result in cand_report["Result"] for cell in product_query_info(result)]

    true_positive, false_positive, false_negative = _count_matches(gt_cells, cand_cells)
    report[EM.true_positive] = true_positive
    report[EM.false_positive] = false_positive
    report[EM.false_negative] = false_negative
    return report


def eval_query(db_gt_filename, cand_response_filename):
    """Score candidate instruction sets against the ground-truth query results.

    For every candidate response that has a ground-truth report with at least
    one query result, the candidate's queries are executed and the returned
    (column, row id) cells are compared with the ground truth.

    Prints ``JsonStructureCorrectness``, ``ExactMatch``, ``F1`` and ``Recall``.
    F1 and Recall are computed from the sums of the per-query *normalised*
    rates, so every query has the same weight regardless of how many cells it
    returns; queries without counts (NaN) do not contribute.

    Args:
        db_gt_filename: JSON file of ground-truth reports; each has
            ``"Input"``, ``"Scenario"`` and ``"Result"``.
        cand_response_filename: JSON file of candidate responses; each has
            ``"Input"``, ``"Scenario"`` and ``"Candidate"``.

    Returns:
        A DataFrame with one row per evaluated candidate and the columns
        ``Input``, ``Scenario``, ``JsonStructureCorrectness``,
        ``TruePositive``, ``FalsePositive``, ``FalseNegative`` (each divided
        by ``Total``), ``ExactMatch`` and ``Total``.

    Raises:
        KeyError: If a candidate response lacks ``"Input"``, ``"Scenario"``
            or ``"Candidate"``.
        AssertionError: If several ground-truth reports share the same
            input and scenario.
    """
    db_gts = read_json(db_gt_filename)
    cand_responses = read_json(cand_response_filename)

    gt_by_key = defaultdict(list)
    for gt in db_gts:
        gt_by_key[(gt["Input"], gt["Scenario"])].append(gt)

    evaluation_reports = []
    with tqdm(total=len(cand_responses)) as pbar:
        for cand_response in cand_responses:
            pbar.set_description(f"Processing {cand_response['Input']}")
            report = _evaluate_candidate(cand_response, gt_by_key)
            if report is not None:
                evaluation_reports.append(report)
            pbar.update(1)

    # Explicit columns keep the frame usable even when no report has counts.
    eval_df = pd.DataFrame(
        evaluation_reports,
        columns=[
            "Input",
            "Scenario",
            EM.json_structure,
            EM.true_positive,
            EM.false_positive,
            EM.false_negative,
        ],
    )

    # Reports without counts hold NaN, which compares unequal to 0 and is
    # therefore never an exact match.
    eval_df["ExactMatch"] = (
        (eval_df[EM.false_positive] == 0) & (eval_df[EM.false_negative] == 0)
    ).astype(int)

    final_result = {}
    for col in ["JsonStructureCorrectness", "ExactMatch"]:
        final_result[col] = eval_df[col].astype(float).mean()

    # Normalise the counts per query.
    count_columns = (EM.true_positive, EM.false_positive, EM.false_negative)
    eval_df["Total"] = eval_df[EM.true_positive] + eval_df[EM.false_positive] + eval_df[EM.false_negative]
    for col in count_columns:
        eval_df[col] = eval_df[col] / eval_df["Total"]

    # sum() skips NaN rows. F1 uses 2TP / (2TP + FP + FN), which equals
    # 2PR / (P + R) but stays defined (0) when nothing matched.
    truepos_sum, falsepos_sum, falseneg_sum = (eval_df[col].sum() for col in count_columns)
    final_result["F1"] = _safe_ratio(2 * truepos_sum, 2 * truepos_sum + falsepos_sum + falseneg_sum)
    final_result["Recall"] = _safe_ratio(truepos_sum, truepos_sum + falseneg_sum)
    for col in final_result:
        print(f"{col}: {final_result[col]:.2f}")

    return eval_df

# WoAll

In [1]:
# Evaluate one "woall" checkpoint against the DB ground truth.
db_gt_filename = f"{BASE_DIR}/experiments/db_gt.json"
# Other response files that were listed in this cell (swap the name to evaluate one):
#   "response-sh2orc-Llama-3.1-Korean-8B-Instruct-r128_a256_woall-checkpoint-60"
#   "response-sh2orc-Llama-3.1-Korean-8B-Instruct-v5_r64_a128_woall-checkpoint-72"
#   "r-v5_r32_a64_woall-checkpoint-70-batch"
cand_response_filename = "r-v6_r64_a128_woall_shorten-checkpoint-53"
cand_response_filename = f"{BASE_DIR}/experiments/{cand_response_filename}.json"

eval_df = eval_query(db_gt_filename, cand_response_filename)

NameError: name 'BASE_DIR' is not defined

In [6]:
# Queries whose candidate result is not an exact match of the ground truth.
eval_df.loc[
    eval_df["ExactMatch"] == 0,
    ["Input", "Scenario", "FalsePositive", "FalseNegative"],
]

Unnamed: 0,Input,Scenario,FalsePositive,FalseNegative
0,Why is our classroom so cold,scenario3,,
1,오늘 아침과 저녁의 온도차이는 얼마나 돼?,scenario3,0.874306,0.0
4,현재 설정온도랑 실내온도 차이 알려줘.,scenario3,0.0,1.0
10,지난 3일 동안 우리반 실내 온도 평균 값 알려줘.,scenario3,0.25,0.0
12,오늘 아침과 저녁의 온도차이는 얼마나 돼?,scenario1,0.874561,0.0
15,현재 설정온도랑 실내온도 차이 알려줘.,scenario1,0.0,1.0
22,Why is our classroom so cold,scenario2,0.498249,0.501751
23,오늘 아침과 저녁의 온도차이는 얼마나 돼?,scenario2,0.887157,0.112843
24,지금 옆반 온도랑 우리반 온도 알려줘,scenario2,0.5,0.5
26,현재 설정온도랑 실내온도 차이 알려줘.,scenario2,0.5,0.5


# FI

In [7]:
# Evaluate the "FI" checkpoint's responses against the DB ground truth.
db_gt_filename = f"{BASE_DIR}/experiments/db_gt.json"
cand_response_filename = "r-v5_r256_a512_FI-checkpoint-43-batch"
cand_response_filename = f"{BASE_DIR}/experiments/{cand_response_filename}.json"

eval_df = eval_query(db_gt_filename, cand_response_filename)

# Plain-text dump of the per-query table returned by eval_query.
print(eval_df)

Processing 이번주 우리반과 옆반 온도 변화 추이 비교해줘:  46%|████▌     | 16/35 [00:01<00:02,  6.55it/s]/s]                  ERROR:db.instance:An error occurred while selecting data with
SELECT "timestamp", "roomtemp_ours", "roomtemp_beside", "id" FROM "data_t" WHERE idu_id = (SELECT id FROM idu_t WHERE name = '02_I84') AND timestamp BETWEEN '2022-09-26 00:00:00' AND '2022-10-02 23:59:59' AND "roomtemp_ours" IS NOT NULL AND "roomtemp_beside" IS NOT NULL AND "id" IS NOT NULL
ERROR:db.instance:Error message: column "roomtemp_ours" does not exist
LINE 1: SELECT "timestamp", "roomtemp_ours", "roomtemp_beside", "id"...
                            ^

ERROR:__main__:Error inside: 'NoneType' object has no attribute 'columns'
ERROR:__main__:Invoked with Query: {'table_name': 'data_t', 'columns': ['timestamp', 'roomtemp_ours', 'roomtemp_beside', 'id'], 'conditions': ["timestamp BETWEEN '2022-09-26 00:00:00' AND '2022-10-02 23:59:59'"], 'subquery': "idu_id = (SELECT id FROM idu_t WHERE name = '02_I84')"}, qr_ours
E

JsonStructureCorrectness: 1.00
ExactMatch: 0.62
F1: 0.84
Recall: 0.81
                               Input   Scenario  JsonStructureCorrectness  \
0       Why is our classroom so cold  scenario1                      True   
1            오늘 아침과 저녁의 온도차이는 얼마나 돼?  scenario1                      True   
2               지금 옆반 온도랑 우리반 온도 알려줘  scenario1                      True   
3          이번주 우리반과 옆반 온도 변화 추이 비교해줘  scenario1                      True   
4              현재 설정온도랑 실내온도 차이 알려줘.  scenario1                      True   
5   지난주에 설정온도와 실내온도 차이가 가장 많이 났던 날은?  scenario1                      True   
6          어제 옆반 에어컨 전원 언제 꺼졌는지 알려줘.  scenario1                      True   
7                        앞반 전원 켜져있어?  scenario1                      True   
8                          어제 전원 껐어?  scenario1                      True   
9             최근에 설정온도가 가장 높았던 날 알려줘  scenario1                      True   
10      지난 3일 동안 우리반 실내 온도 평균 값 알려줘.  scenario1                      True   
11    




In [8]:
# Display the per-query table returned by the latest eval_query call.
eval_df

Unnamed: 0,Input,Scenario,JsonStructureCorrectness,TruePositive,FalsePositive,FalseNegative,ExactMatch,Total
0,Why is our classroom so cold,scenario1,True,0.0,0.0,1.0,0,7115
1,오늘 아침과 저녁의 온도차이는 얼마나 돼?,scenario1,True,1.0,0.0,0.0,1,714
2,지금 옆반 온도랑 우리반 온도 알려줘,scenario1,True,1.0,0.0,0.0,1,4
3,이번주 우리반과 옆반 온도 변화 추이 비교해줘,scenario1,True,1.0,0.0,0.0,1,42603
4,현재 설정온도랑 실내온도 차이 알려줘.,scenario1,True,0.75,0.25,0.0,0,4
5,지난주에 설정온도와 실내온도 차이가 가장 많이 났던 날은?,scenario1,True,1.0,0.0,0.0,1,39676
6,어제 옆반 에어컨 전원 언제 꺼졌는지 알려줘.,scenario1,True,1.0,0.0,0.0,1,4320
7,앞반 전원 켜져있어?,scenario1,True,1.0,0.0,0.0,1,2
8,어제 전원 껐어?,scenario1,True,1.0,0.0,0.0,1,2880
9,최근에 설정온도가 가장 높았던 날 알려줘,scenario1,True,1.0,0.0,0.0,1,29790


In [9]:
# Evaluate the "ISP" checkpoint's responses against the DB ground truth.
# NOTE(review): the recorded run of this cell stopped with KeyError: 'Scenario';
# eval_query reads cand_response["Scenario"], so this response file presumably
# lacks that field. Confirm before re-running.
db_gt_filename = f"{BASE_DIR}/experiments/db_gt.json"
cand_response_filename = "response-sh2orc-Llama-3.1-Korean-8B-Instruct-r256_a512_ISP-checkpoint-104"
cand_response_filename = f"{BASE_DIR}/experiments/{cand_response_filename}.json"

eval_df = eval_query(db_gt_filename, cand_response_filename)


Processing Why is our classroom so cold:   0%|          | 0/13 [00:00<?, ?it/s]


KeyError: 'Scenario'

In [None]:
# Display the per-query table returned by the latest eval_query call.
eval_df

# Ours

In [32]:
# Evaluate one "ours" checkpoint against the DB ground truth.
db_gt_filename = f"{BASE_DIR}/experiments/db_gt.json"
# Other response files that were listed in this cell (swap the name to evaluate one):
#   "response-sh2orc-Llama-3.1-Korean-8B-Instruct-v5_r256_a512_ours-checkpoint-20"
#   "r-v5_r128_a256_ours-checkpoint-52-batch"
#   "r-v5_r128_a256_ours_noexample-checkpoint-50-batch"
#   "r-v6_r128_a256_ours-checkpoint-52"
#   "r-v6_r256_a512_ours-checkpoint-40"
cand_response_filename = "r-v6_r256_a512_ours_shorten-checkpoint-30"
cand_response_filename = f"{BASE_DIR}/experiments/{cand_response_filename}.json"

eval_df = eval_query(db_gt_filename, cand_response_filename)


Processing 이번주 우리반과 옆반 온도 변화 추이 비교해줘:  13%|█▎        | 3/23 [00:00<00:00, 55.27it/s]

Processing 지난 3일 동안 우리반 실내 온도 평균 값 알려줘.: 100%|██████████| 23/23 [02:13<00:00,  5.82s/it]          

JsonStructureCorrectness: 1.00
ExactMatch: 0.60
F1: 0.87
Recall: 0.91





In [28]:
# Queries whose candidate result is not an exact match of the ground truth.
eval_df.loc[
    eval_df["ExactMatch"] == 0,
    ["Input", "Scenario", "FalsePositive", "FalseNegative"],
]

Unnamed: 0,Input,Scenario,FalsePositive,FalseNegative
3,이번주 우리반과 옆반 온도 변화 추이 비교해줘,scenario3,0.0,0.666667
6,최근에 설정온도가 가장 높았던 날 알려줘,scenario3,0.416667,0.416667
7,지난 3일 동안 우리반 실내 온도 평균 값 알려줘.,scenario3,0.25,0.0
8,지금 옆반 온도랑 우리반 온도 알려줘,scenario2,0.5,0.5
11,지금 옆반 온도랑 우리반 온도 알려줘,scenario1,0.0,0.5
12,이번주 우리반과 옆반 온도 변화 추이 비교해줘,scenario1,0.0,0.333333
18,최근에 설정온도가 가장 높았던 날 알려줘,scenario1,0.416329,0.416975
